gentle-pi 0.5.0 → 0.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,16 @@
1
+ ---
2
+ name: jd-fix-agent
3
+ description: Judgment Day surgical fix agent for confirmed findings. Can edit code and run focused tests.
4
+ tools: read, grep, glob, edit, write, bash
5
+ ---
6
+
7
+ You are the Judgment Day fix agent for Gentle AI.
8
+
9
+ Apply surgical fixes for confirmed Judgment Day findings only. Preserve the original design intent, keep the patch focused, and avoid unrelated refactors.
10
+
11
+ Rules:
12
+
13
+ - Edit only the files needed to resolve confirmed findings.
14
+ - Add or update focused tests when the fix changes behavior.
15
+ - Run the relevant tests when practical and report exact results.
16
+ - Clearly list what was fixed, what was verified, and any remaining risks.
@@ -0,0 +1,16 @@
1
+ ---
2
+ name: jd-judge-a
3
+ description: Judgment Day blind adversarial reviewer A. Read-only; reports findings and does not fix code.
4
+ tools: read, grep, glob, bash
5
+ ---
6
+
7
+ You are Judgment Day judge A for Gentle AI.
8
+
9
+ Run an independent, blind adversarial review of the assigned change. Focus on correctness, regressions, missing tests, unsafe behavior, and mismatches with the user's request.
10
+
11
+ Rules:
12
+
13
+ - Stay read-only. Do not edit files or apply fixes.
14
+ - Do not coordinate with judge B before producing your review.
15
+ - Report concrete findings with file paths, evidence, severity, and suggested verification.
16
+ - If you find no confirmed issues, say so clearly.
@@ -0,0 +1,16 @@
1
+ ---
2
+ name: jd-judge-b
3
+ description: Judgment Day blind adversarial reviewer B. Read-only; independently reports findings and does not fix code.
4
+ tools: read, grep, glob, bash
5
+ ---
6
+
7
+ You are Judgment Day judge B for Gentle AI.
8
+
9
+ Run an independent, blind adversarial review of the assigned change. Challenge assumptions from a different angle than judge A, with special attention to edge cases, test gaps, integration risks, and user-visible regressions.
10
+
11
+ Rules:
12
+
13
+ - Stay read-only. Do not edit files or apply fixes.
14
+ - Work independently from judge A and do not rely on judge A's conclusions.
15
+ - Report concrete findings with file paths, evidence, severity, and suggested verification.
16
+ - If you find no confirmed issues, say so clearly.
@@ -0,0 +1,24 @@
1
+ ---
2
+ name: review-readability
3
+ description: R2 Readability reviewer — naming, complexity, intention, maintainability, review size, and context clarity.
4
+ tools: read, grep, glob, bash
5
+ ---
6
+
7
+ You are **R2 Readability**, a read-only reviewer. Find clarity problems; do not fix them.
8
+
9
+ Rule sources: ai-course-2 slides `05-code-smells.md`, `06-safe-refactoring.md`, `07-advanced-refactoring.md`, `08-tech-debt.md`, `22-docs-as-code.md`, `25-executive-summary.md`.
10
+
11
+ ## Review rules
12
+
13
+ - Flag magic numbers that should be named constants or business-rule objects.
14
+ - Flag long parameter lists that should be parameter objects.
15
+ - Flag duplicated logic across components/hooks/modules.
16
+ - Flag dead code: commented-out blocks, unused imports, unreachable branches, never-called functions.
17
+ - Flag naming that hides intent or needs comment-heavy explanation.
18
+ - Flag PR/context explanation that is too vague to review safely; require concrete intent and impact.
19
+ - Require evidence for "too complex" claims: cite exact function, branch, or repeated pattern.
20
+ - Do not flag a small helper or inline constant that is clear, local, and self-explanatory.
21
+
22
+ ## Output contract
23
+
24
+ Report findings only. Each finding must include `severity: BLOCKER | CRITICAL | WARNING | SUGGESTION`, affected files, evidence, and why it matters. If clean, say exactly: `No findings.`
@@ -0,0 +1,25 @@
1
+ ---
2
+ name: review-reliability
3
+ description: R3 Reliability reviewer — behavior-first tests, coverage value, edge cases, determinism, contracts, and regressions.
4
+ tools: read, grep, glob, bash
5
+ ---
6
+
7
+ You are **R3 Reliability**, a read-only reviewer. Find test and behavior risks; do not fix them.
8
+
9
+ Rule sources: ai-course-2 slides `01-testing-setup.md`, `02-tdd-implementation.md`, `03-integration-testing.md`, `04-e2e-testing.md`, `10-strategic-coverage.md`, `11-playwright-visibility.md`, `12-quality-gates-husky.md`, `23-apis-components.md`.
10
+
11
+ ## Review rules
12
+
13
+ - Block behavior changes without tests that assert externally visible contract.
14
+ - Flag tests that are implementation-centric instead of user/behavior-centric.
15
+ - Flag missing edge cases: boundaries, invalid inputs, empty states, retries, failure paths.
16
+ - Block when CI can pass with `test.only`; require `forbidOnly` or equivalent in CI configs.
17
+ - Flag misallocated test coverage: too much E2E where cheaper deterministic unit/integration tests should cover behavior.
18
+ - Require evidence of determinism: same input -> same output; external dependencies mocked or controlled.
19
+ - Flag weak selectors in UI tests; prefer semantic/user-visible queries.
20
+ - Do not flag tests that intentionally rely on the framework's built-in async waiting and trace visibility instead of custom polling or logging.
21
+ - Require evidence that new APIs/components have example usage or documented contract.
22
+
23
+ ## Output contract
24
+
25
+ Report findings only. Each finding must include `severity: BLOCKER | CRITICAL | WARNING | SUGGESTION`, affected files, evidence, and why it matters. If clean, say exactly: `No findings.`
@@ -0,0 +1,24 @@
1
+ ---
2
+ name: review-resilience
3
+ description: R4 Resilience reviewer — fallbacks, retry/backoff, graceful degradation, observability, load, rollback, and SLO risks.
4
+ tools: read, grep, glob, bash
5
+ ---
6
+
7
+ You are **R4 Resilience**, a read-only reviewer. Find operational failure risks; do not fix them.
8
+
9
+ Rule sources: ai-course-2 slides `09-essential-metrics.md`, `13-observability-strategy.md`, `14-sentry-implementation.md`, `15-sentry-errors.md`, `16-sentry-performance.md`, `17-sentry-alertas.md`, `29-performance-percibida.md`.
10
+
11
+ ## Review rules
12
+
13
+ - Flag failures with no fallback, retry, or graceful-degradation path.
14
+ - Block when production error-rate or build/test thresholds are ignored. Use these thresholds as anchors: test success rate < 95%; build success rate < 95%; production error rate > 1% = investigate, > 2% = emergency, > 5% = all hands.
15
+ - Flag releases that can regress without alerting/observability hooks.
16
+ - Require evidence for rollback/fix-forward readiness: a concrete recovery path must exist.
17
+ - Flag performance regressions that exceed user-visible budgets or lack measurement.
18
+ - Block when there is no production visibility for error/performance issues expected in the wild.
19
+ - Do not flag explicitly low-impact expected issues already isolated by alert grouping or silence rules.
20
+ - Require evidence of SLO/latency/load impact, not generic "might be slow" claims.
21
+
22
+ ## Output contract
23
+
24
+ Report findings only. Each finding must include `severity: BLOCKER | CRITICAL | WARNING | SUGGESTION`, affected files, evidence, and why it matters. If clean, say exactly: `No findings.`
@@ -0,0 +1,24 @@
1
+ ---
2
+ name: review-risk
3
+ description: R1 Risk reviewer — security, privilege boundaries, data exposure, dependency risks, and merge-blocking vulnerabilities.
4
+ tools: read, grep, glob, bash
5
+ ---
6
+
7
+ You are **R1 Risk**, a read-only reviewer. Find security risks; do not fix them.
8
+
9
+ Rule sources: ai-course-2 slides `18-env-secrets.md`, `19-web-security.md`, `20-auth-tokens.md`, `21-owasp-top10.md`.
10
+
11
+ ## Review rules
12
+
13
+ - Flag when secrets, tokens, API keys, JWT secrets, or DB URLs are hardcoded in code or committed examples.
14
+ - Block when authz is enforced only in the frontend; require backend verification on every request.
15
+ - Flag when user input reaches HTML/DOM sinks without escaping/sanitization.
16
+ - Block when SQL/NoSQL/command strings are built by concatenation instead of parameterization.
17
+ - Flag when cookies storing auth state miss `httpOnly`, `secure`, or `sameSite` protections.
18
+ - Require evidence that security-sensitive changes are covered by backend checks, not UI disabled states.
19
+ - Do not flag when React default escaping is used and no raw HTML sink exists.
20
+ - Require evidence for dependency/security findings: cite scan failure or vulnerable package, not just "looks risky".
21
+
22
+ ## Output contract
23
+
24
+ Report findings only. Each finding must include `severity: BLOCKER | CRITICAL | WARNING | SUGGESTION`, affected files, evidence, and why it matters. If clean, say exactly: `No findings.`
@@ -0,0 +1,39 @@
1
+ ---
2
+ name: 4r-review
3
+ description: Pre-PR 4R review fan-out — runs all four review lenses (risk, readability, reliability, resilience) in sequence and writes individual reports.
4
+ ---
5
+
6
+ ## review-risk
7
+
8
+ output: review-risk-report.md
9
+ outputMode: file-only
10
+ progress: true
11
+
12
+ Run R1 Risk review on the current diff. Report security, privilege boundary, data exposure, dependency, and merge-blocking vulnerability findings. If clean, say exactly: `No findings.`
13
+
14
+ ## review-readability
15
+
16
+ reads: review-risk-report.md
17
+ output: review-readability-report.md
18
+ outputMode: file-only
19
+ progress: true
20
+
21
+ Run R2 Readability review on the current diff. Report naming, complexity, intention, maintainability, review size, and context clarity findings. If clean, say exactly: `No findings.`
22
+
23
+ ## review-reliability
24
+
25
+ reads: review-risk-report.md+review-readability-report.md
26
+ output: review-reliability-report.md
27
+ outputMode: file-only
28
+ progress: true
29
+
30
+ Run R3 Reliability review on the current diff. Report behavior-first test coverage, edge case, determinism, contract, and regression findings. If clean, say exactly: `No findings.`
31
+
32
+ ## review-resilience
33
+
34
+ reads: review-risk-report.md+review-readability-report.md+review-reliability-report.md
35
+ output: review-resilience-report.md
36
+ outputMode: file-only
37
+ progress: true
38
+
39
+ Run R4 Resilience review on the current diff. Report fallback, retry/backoff, graceful degradation, observability, load, rollback, and SLO risk findings. If clean, say exactly: `No findings.`
@@ -384,3 +384,19 @@ Automatic mode does not override reviewer burnout protection.
384
384
  - Ask before destructive git operations, publishing, or irreversible file changes.
385
385
  - Keep writes single-threaded unless isolated worktrees are explicitly approved.
386
386
  - Preserve human control: user decisions beat agent momentum.
387
+
388
+ ## 4R Review Triggers
389
+
390
+ The extension (`extensions/gentle-ai.ts`) gates `bash` tool calls that look like git/gh workflow events. Gate semantics:
391
+
392
+ - **pre-commit** (`git commit`): advisory only. The extension notifies the user to consider running `review-readability` but does NOT block. No orchestrator action needed.
393
+ - **pre-push** (`git push`): advisory only. Same as pre-commit — notify, do not block.
394
+ - **pre-pr** (`gh pr create`): **strong gate**. The extension blocks when any of these hold:
395
+ - Changed paths match hot globs: `**/auth/**`, `**/update/**`, `**/security/**`, `**/payments/**`
396
+ - Diff exceeds 400 changed lines (added + deleted)
397
+ - When blocked, the reason names all four agents to run first.
398
+ - **post-sdd-phase** (design, apply): **strong gate** for `judgment-day`. Handled separately by SDD phase orchestration, not this diff-based hook.
399
+
400
+ When the extension blocks a `gh pr create` command, the orchestrator must launch the `4r-review` chain (or run the four agents individually) and wait for their reports before the user retries the PR command.
401
+
402
+ Prohibition: do NOT configure the full 4R fan-out on `pre-commit` or `pre-push` with `always: true`. Everyday events must use a single advisory lens to keep development-loop cost low (spec G token-budget rule). The `validateTriggerRuleSet` function in `lib/review-triggers.ts` enforces this at config load time.
@@ -1,3 +1,4 @@
1
+ import { execFileSync } from "node:child_process";
1
2
  import {
2
3
  existsSync,
3
4
  mkdirSync,
@@ -38,6 +39,12 @@ import {
38
39
  sddStatusSeverity,
39
40
  type SddPhase,
40
41
  } from "../lib/sdd-status.ts";
42
+ import {
43
+ evaluateEvent,
44
+ matchPathGlobs,
45
+ type ChangedDiff,
46
+ type TriggerEvent,
47
+ } from "../lib/review-triggers.ts";
41
48
 
42
49
  const PACKAGE_ROOT = dirname(dirname(fileURLToPath(import.meta.url)));
43
50
  const ASSETS_DIR = join(PACKAGE_ROOT, "assets");
@@ -431,7 +438,18 @@ const SDD_AGENT_NAMES = [
431
438
  ] as const;
432
439
  const SDD_AGENT_NAME_SET = new Set<string>(SDD_AGENT_NAMES);
433
440
 
434
- type SddAgentName = (typeof SDD_AGENT_NAMES)[number];
441
+ const JUDGMENT_DAY_AGENT_NAMES = [ // Judgment Day agents: the two judges, then the fix agent.
442
+ "jd-judge-a",
443
+ "jd-judge-b",
444
+ "jd-fix-agent",
445
+ ] as const;
446
+
447
+ const CORE_MODEL_AGENT_NAMES = [ // SDD agents first, then the Judgment Day agents; the order is significant.
448
+ ...SDD_AGENT_NAMES,
449
+ ...JUDGMENT_DAY_AGENT_NAMES,
450
+ ] as const;
451
+ const CORE_MODEL_AGENT_NAME_SET = new Set<string>(CORE_MODEL_AGENT_NAMES); // Same names as a Set, for membership checks.
452
+
435
453
  type ThinkingLevel = "off" | "minimal" | "low" | "medium" | "high" | "xhigh";
436
454
  interface AgentRoutingEntry {
437
455
  model?: string;
@@ -991,14 +1009,7 @@ function listDiscoverableAgents(cwd: string): AgentEntry[] {
991
1009
  ];
992
1010
  const byName = new Map<string, AgentEntry>();
993
1011
  for (const agent of agents) byName.set(agent.name, agent);
994
- const discovered = Array.from(byName.values());
995
- const sddFirst = SDD_AGENT_NAMES.map((name) =>
996
- discovered.find((agent) => agent.name === name),
997
- ).filter((agent): agent is AgentEntry => agent !== undefined);
998
- const rest = discovered
999
- .filter((agent) => !SDD_AGENT_NAMES.includes(agent.name as SddAgentName))
1000
- .sort((left, right) => left.name.localeCompare(right.name));
1001
- return [...sddFirst, ...rest];
1012
+ return orderDiscoverableAgents(Array.from(byName.values()));
1002
1013
  }
1003
1014
 
1004
1015
  async function listDiscoverableAgentsAsync(cwd: string): Promise<AgentEntry[]> {
@@ -1023,14 +1034,17 @@ async function listDiscoverableAgentsAsync(cwd: string): Promise<AgentEntry[]> {
1023
1034
  }
1024
1035
  const byName = new Map<string, AgentEntry>();
1025
1036
  for (const agent of agents) byName.set(agent.name, agent);
1026
- const discovered = Array.from(byName.values());
1027
- const sddFirst = SDD_AGENT_NAMES.map((name) =>
1028
- discovered.find((agent) => agent.name === name),
1037
+ return orderDiscoverableAgents(Array.from(byName.values()));
1038
+ }
1039
+
1040
+ function orderDiscoverableAgents(agents: AgentEntry[]): AgentEntry[] { // Core agents first, in CORE_MODEL_AGENT_NAMES order, then every other agent sorted by name.
1041
+ const coreFirst = CORE_MODEL_AGENT_NAMES.map((name) =>
1042
+ agents.find((agent) => agent.name === name), // First entry with this name; undefined when no such agent was passed in.
1029
1043
  ).filter((agent): agent is AgentEntry => agent !== undefined); // Drop core names that had no matching agent.
1030
- const rest = discovered
1031
- .filter((agent) => !SDD_AGENT_NAMES.includes(agent.name as SddAgentName))
1044
+ const rest = agents
1045
+ .filter((agent) => !CORE_MODEL_AGENT_NAME_SET.has(agent.name)) // filter() returns a new array, so the sort below does not reorder the caller's array.
1032
1046
  .sort((left, right) => left.name.localeCompare(right.name));
1033
- return [...sddFirst, ...rest];
1047
+ return [...coreFirst, ...rest];
1034
1048
  }
1035
1049
 
1036
1050
  function projectSettingsPath(cwd: string): string {
@@ -1760,13 +1774,150 @@ async function handlePersonaCommand(ctx: ExtensionContext): Promise<void> {
1760
1774
  );
1761
1775
  }
1762
1776
 
1777
+ // ---------------------------------------------------------------------------
1778
+ // Review gate helpers — pure, exported via __testing for unit tests
1779
+ // ---------------------------------------------------------------------------
1780
+
1781
/**
 * Matches `git [global options…] commit|push` at the start of a command and
 * captures the subcommand.
 *
 * Any number of global options may sit between `git` and the subcommand:
 * value-taking options with a separate argument (`-C <path>`, `-c <k=v>`,
 * `--git-dir <path>`, `--work-tree <path>`, `--namespace <name>`) and generic
 * `-x` / `--flag` / `--flag=value` options. Option values may be quoted.
 * The trailing lookahead rejects longer subcommands such as `commit-tree`.
 */
const GIT_REVIEW_COMMAND =
  /^git(?:\s+(?:(?:-[Cc]|--git-dir|--work-tree|--namespace)\s+(?:"[^"]*"|'[^']*'|\S+)|--?[A-Za-z][\w-]*(?:=(?:"[^"]*"|'[^']*'|\S+))?))*\s+(commit|push)(?![\w-])/;

/** Matches `gh pr create` at the start of a command (but not e.g. `create-foo`). */
const GH_PR_CREATE_COMMAND = /^gh\s+pr\s+create(?![\w-])/;

/**
 * Classifies a bash command string as a TriggerEvent for the review gate.
 *
 * Only the start of the (trimmed) command is inspected; a git/gh invocation
 * later in a compound command (`cd x && git commit`) is not recognized.
 *
 * @param command Raw bash command line.
 * @returns `"pre-pr"` for `gh pr create`, `"pre-commit"` for `git commit`,
 *   `"pre-push"` for `git push`, or `null` when the command is not a
 *   recognized git/gh workflow trigger.
 */
export function classifyReviewEvent(command: string): TriggerEvent | null {
  const trimmed = command.trim();

  if (GH_PR_CREATE_COMMAND.test(trimmed)) return "pre-pr";

  const subcommand = GIT_REVIEW_COMMAND.exec(trimmed)?.[1];
  if (subcommand === "commit") return "pre-commit";
  if (subcommand === "push") return "pre-push";
  return null;
}
1799
+
1800
/**
 * Expands one numstat path field into the concrete paths it refers to.
 * Rename rows are printed as `prefix/{old => new}/suffix` or `old => new`;
 * both the old and the new path are returned so path-based checks see them.
 */
function expandNumstatPath(rawPath: string): string[] {
  const braced = /^(.*)\{(.*?) => (.*?)\}(.*)$/.exec(rawPath);
  if (braced) {
    const [, prefix = "", from = "", to = "", suffix = ""] = braced;
    // An empty side (`{ => dir}`) leaves a doubled slash behind; collapse it.
    const join = (middle: string): string =>
      `${prefix}${middle}${suffix}`.replace(/\/{2,}/g, "/");
    return [join(from), join(to)];
  }
  const plain = /^(.+) => (.+)$/.exec(rawPath);
  if (plain && plain[1] !== undefined && plain[2] !== undefined) {
    return [plain[1], plain[2]];
  }
  return [rawPath];
}

/** Converts a numstat count column to a number; `-` (binary) and malformed values count as 0. */
function parseNumstatCount(column: string): number {
  return /^\d+$/.test(column) ? Number(column) : 0;
}

/**
 * Parses the output of `git diff --numstat` into a ChangedDiff.
 *
 * Each row has the form `<added>\t<deleted>\t<path>`. Blank rows and rows with
 * fewer than three tab-separated fields are skipped. Binary files show `-` in
 * both count columns and contribute 0 to `changedLines`. Rename rows are
 * expanded into their old and new paths.
 *
 * @param output Raw stdout of `git diff --numstat`.
 * @returns The changed paths in row order and the sum of added + deleted lines.
 */
export function parseNumstat(output: string): ChangedDiff {
  const changedPaths: string[] = [];
  let changedLines = 0;

  for (const line of output.split("\n")) {
    const trimmed = line.trim();
    if (!trimmed) continue;

    const [added, deleted, ...pathParts] = trimmed.split("\t");
    if (added === undefined || deleted === undefined || pathParts.length === 0) continue;

    // A path may itself contain tabs; everything after the second column is the path.
    const filePath = pathParts.join("\t");
    if (!filePath) continue;

    changedPaths.push(...expandNumstatPath(filePath));
    changedLines += parseNumstatCount(added) + parseNumstatCount(deleted);
  }

  return { changedPaths, changedLines };
}
1826
+
1827
/**
 * Builds the ChangedDiff for a review event by shelling out to `git diff --numstat`.
 *
 * - `pre-commit` inspects the staged changes.
 * - Any other event diffs `HEAD` against its merge-base with the first of
 *   `origin/HEAD`, `origin/main`, `main` that resolves; when none resolves it
 *   falls back to the staged changes.
 *
 * Fails open: any git error (including the per-call timeout) yields `null` so
 * the user's own git command is never broken by the gate.
 */
function computeDiffForEvent(event: TriggerEvent, cwd: string): ChangedDiff | null {
  const gitOpts = {
    cwd,
    encoding: "utf8" as const,
    stdio: ["pipe", "pipe", "pipe"] as const,
    // Each synchronous git call is capped so a slow or huge repo cannot hang the extension.
    timeout: 2000,
  };
  const git = (args: string[]): string => execFileSync("git", args, gitOpts);

  // Returns the first merge-base that git can resolve, or "" when none can.
  const resolveMergeBase = (): string => {
    for (const candidate of ["origin/HEAD", "origin/main", "main"]) {
      try {
        const sha = git(["merge-base", "HEAD", candidate]).trim();
        if (sha) return sha;
      } catch {
        // This ref is missing or unrelated; move on to the next candidate.
      }
    }
    return "";
  };

  try {
    const stagedArgs = ["diff", "--cached", "--numstat"];
    if (event === "pre-commit") {
      return parseNumstat(git(stagedArgs));
    }
    const mergeBase = resolveMergeBase();
    const diffArgs = mergeBase
      ? ["diff", "--numstat", `${mergeBase}...HEAD`]
      : stagedArgs;
    return parseNumstat(git(diffArgs));
  } catch {
    return null;
  }
}
1871
+
1872
/**
 * Runs the review gate for a bash command ahead of the regular
 * confirmCommand flow.
 *
 * @param command Bash command line about to be executed.
 * @param ctx Extension context; `cwd` locates the repository, `hasUI`/`ui`
 *   are used for advisory notifications.
 * @returns A blocking result when the evaluated rule is not advisory
 *   (strong mode). Returns `undefined` to fall through when the command is not
 *   a review trigger, when no diff could be computed, when no rule applies, or
 *   after an advisory notification has been shown.
 */
async function applyReviewGate(
  command: string,
  ctx: ExtensionContext,
): Promise<ToolCallEventResult | undefined> {
  const event = classifyReviewEvent(command);
  if (!event) return undefined;

  // A null diff means git failed; fail open rather than block on missing data.
  const diff = computeDiffForEvent(event, ctx.cwd);
  if (!diff) return undefined;

  const result = evaluateEvent(event, diff);
  if (!result) return undefined;

  const agents = result.run.join(", ");

  if (result.mode === "advisory") {
    if (ctx.hasUI) {
      // Name the action that is actually being gated; a pre-pr advisory must
      // not be described as a commit.
      const gatedAction =
        event === "pre-push"
          ? "this push"
          : event === "pre-pr"
            ? "opening this pull request"
            : "this commit";
      ctx.ui.notify(
        `Review suggestion: consider running agent "${agents}" before ${gatedAction}. ${result.reason}`,
        "info",
      );
    }
    return undefined;
  }

  // Strong mode: block the command and tell the caller which agents to run first.
  return {
    block: true,
    reason:
      `Gentle AI 4R review gate: run ${agents} before this command. ` +
      result.reason,
  };
}
1909
+
1763
1910
  /**
   * Helpers gathered into a single object so unit tests can reach them.
   * Not part of the extension's public surface.
   * @internal
   */
1764
1911
  export const __testing = {
1765
1912
  listAgentsFromDir,
1766
1913
  listAgentsFromDirAsync,
1914
+ listDiscoverableAgents,
1915
+ orderDiscoverableAgents,
1767
1916
  classifyGuardedCommand,
1768
1917
  loadRuntimeGuardrailsConfig,
1769
1918
  buildGentlePrompt,
1919
+ classifyReviewEvent,
1920
+ parseNumstat,
1770
1921
  };
1771
1922
 
1772
1923
  export default function gentleAi(pi: ExtensionAPI): void {
@@ -1850,6 +2001,8 @@ export default function gentleAi(pi: ExtensionAPI): void {
1850
2001
  if (event.toolName !== "bash") return undefined;
1851
2002
  if (!isRecord(event.input) || typeof event.input.command !== "string")
1852
2003
  return undefined;
2004
+ const reviewGateResult = await applyReviewGate(event.input.command, ctx);
2005
+ if (reviewGateResult) return reviewGateResult;
1853
2006
  return confirmCommand(event.input.command, ctx);
1854
2007
  });
1855
2008