@neikyun/ciel 6.11.0 → 6.11.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. package/assets/.claude/hooks/memory-engine.py +29 -4
  2. package/assets/commands/ciel-create-skill.md +2 -2
  3. package/assets/commands/ciel-status.md +1 -1
  4. package/assets/platforms/opencode/.opencode/agents/ciel-improver.md +2 -2
  5. package/assets/platforms/opencode/.opencode/commands/ciel-create-skill.md +2 -2
  6. package/assets/platforms/opencode/.opencode/commands/ciel-memory-bootstrap.md +195 -0
  7. package/assets/skills/workflow/adr-auto/SKILL.md +88 -0
  8. package/assets/skills/workflow/ai-failure-modes-detector/SKILL.md +180 -0
  9. package/assets/skills/workflow/ask-window/SKILL.md +119 -0
  10. package/assets/skills/workflow/avec-quoi-versioner/SKILL.md +111 -0
  11. package/assets/skills/workflow/ci-watcher/SKILL.md +194 -0
  12. package/assets/skills/workflow/critiquer-auditor/SKILL.md +135 -0
  13. package/assets/skills/workflow/critiquer-auditor/reference.md +134 -0
  14. package/assets/skills/workflow/debug-reasoning-rca/SKILL.md +174 -0
  15. package/assets/skills/workflow/depth-classifier/SKILL.md +118 -0
  16. package/assets/skills/workflow/diverge/SKILL.md +91 -0
  17. package/assets/skills/workflow/doc-validator-official/SKILL.md +196 -0
  18. package/assets/skills/workflow/evaluer-sizer/SKILL.md +112 -0
  19. package/assets/skills/workflow/faire-gatekeeper/SKILL.md +99 -0
  20. package/assets/skills/workflow/flux-narrator/SKILL.md +93 -0
  21. package/assets/skills/workflow/memoire/SKILL.md +198 -0
  22. package/assets/skills/workflow/memoire-consolidator/SKILL.md +91 -0
  23. package/assets/skills/workflow/meta-critiquer/SKILL.md +112 -0
  24. package/assets/skills/workflow/modern-patterns-checker/SKILL.md +166 -0
  25. package/assets/skills/workflow/pattern-fitness-check/SKILL.md +108 -0
  26. package/assets/skills/workflow/playwright-visual-critic/SKILL.md +98 -0
  27. package/assets/skills/workflow/pr-review-responder/SKILL.md +214 -0
  28. package/assets/skills/workflow/prouver-verifier/SKILL.md +184 -0
  29. package/assets/skills/workflow/prouver-verifier/reference.md +152 -0
  30. package/assets/skills/workflow/quoi-framer/SKILL.md +91 -0
  31. package/assets/skills/workflow/relire-critic/SKILL.md +99 -0
  32. package/assets/skills/workflow/security-regression-check/SKILL.md +86 -0
  33. package/assets/skills/workflow/self-consistency-verifier/SKILL.md +85 -0
  34. package/assets/skills/workflow/spike-mode/SKILL.md +101 -0
  35. package/assets/skills/workflow/stride-analyzer/SKILL.md +96 -0
  36. package/assets/skills/workflow/stride-analyzer/reference.md +144 -0
  37. package/assets/skills/workflow/test-strategy-vitest-playwright/SKILL.md +119 -0
  38. package/package.json +1 -1
@@ -851,6 +851,17 @@ def cmd_analyze(args):
851
851
  insights_json = base / 'insights.json'
852
852
  atomic_write_json(insights_json, insights)
853
853
 
854
+ # Cap human-readable INSIGHTS.md sections when the corpus is large.
855
+ # insights.json (machine consumer) keeps everything; INSIGHTS.md is read
856
+ # by humans + by ciel-audit narration so token cost matters at scale.
857
+ LARGE_CORPUS_THRESHOLD = 150
858
+ TOP_N = 10
859
+
860
+ def maybe_cap(items):
861
+ if total > LARGE_CORPUS_THRESHOLD and len(items) > TOP_N:
862
+ return items[:TOP_N], len(items) - TOP_N
863
+ return items, 0
864
+
854
865
  lines = [
855
866
  "# Memory insights",
856
867
  "",
@@ -866,50 +877,64 @@ def cmd_analyze(args):
866
877
  lines.append("")
867
878
 
868
879
  if promotion_candidates:
880
+ shown, omitted = maybe_cap(promotion_candidates)
869
881
  lines += [
870
882
  "## Promotion candidates",
871
883
  "",
872
884
  f"Episodes triggered >= {MIN_PROMOTION} times. Promote via skill `memoire-consolidator`.",
873
885
  "",
874
886
  ]
875
- for mid in promotion_candidates:
887
+ for mid in shown:
876
888
  m = episodes[mid]
877
889
  lines.append(f"- `{mid}` (trigger_count={m.get('trigger_count', 0)}) - {m.get('title', '?')}")
890
+ if omitted:
891
+ lines.append(f"- _+{omitted} more, see insights.json_")
878
892
  lines.append("")
879
893
 
880
894
  if dead_anchors:
895
+ shown, omitted = maybe_cap(dead_anchors)
881
896
  lines += [
882
897
  "## Dead anchors",
883
898
  "",
884
899
  "Memories whose every `path_patterns` entry resolves to no file. Triage in `.ciel/memory/review-queue.md`.",
885
900
  "",
886
901
  ]
887
- for mid in dead_anchors:
902
+ for mid in shown:
888
903
  m = memories[mid]
889
904
  patterns = ", ".join(m.get('path_patterns') or [])
890
905
  lines.append(f"- `{mid}` - {m.get('title', '?')} (patterns: {patterns})")
906
+ if omitted:
907
+ lines.append(f"- _+{omitted} more, see insights.json_")
891
908
  lines.append("")
892
909
 
893
910
  if intent_clusters:
911
+ ranked = sorted(intent_clusters.items(), key=lambda x: -len(x[1]))
912
+ shown, omitted = maybe_cap(ranked)
894
913
  lines += [
895
914
  "## Intent clusters",
896
915
  "",
897
916
  f"Intents shared by >= {MIN_SUPPORT} memories - recurring topics.",
898
917
  "",
899
918
  ]
900
- for intent, ids in sorted(intent_clusters.items(), key=lambda x: -len(x[1])):
919
+ for intent, ids in shown:
901
920
  lines.append(f"- `{intent}` ({len(ids)}): {', '.join(ids)}")
921
+ if omitted:
922
+ lines.append(f"- _+{omitted} more, see insights.json_")
902
923
  lines.append("")
903
924
 
904
925
  if path_clusters:
926
+ ranked = sorted(path_clusters.items(), key=lambda x: -len(x[1]))
927
+ shown, omitted = maybe_cap(ranked)
905
928
  lines += [
906
929
  "## Path clusters",
907
930
  "",
908
931
  f"Paths referenced by >= {MIN_SUPPORT} memories - high-traffic surface.",
909
932
  "",
910
933
  ]
911
- for path, ids in sorted(path_clusters.items(), key=lambda x: -len(x[1])):
934
+ for path, ids in shown:
912
935
  lines.append(f"- `{path}` ({len(ids)}): {', '.join(ids)}")
936
+ if omitted:
937
+ lines.append(f"- _+{omitted} more, see insights.json_")
913
938
  lines.append("")
914
939
 
915
940
  insights_md = base / 'INSIGHTS.md'
@@ -1,10 +1,10 @@
1
1
  ---
2
- description: Generates a valid Ciel SKILL.md scaffold following Anthropic Skills-first rules (kebab-case ≤64, YAML description ≤1024, body ≤500 lines).
2
+ description: Generates a valid Ciel SKILL.md scaffold following Anthropic Skills-first rules (kebab-case ≤64, YAML description ≤1536, body ≤500 lines).
3
3
  ---
4
4
 
5
5
  # /ciel-create-skill — Create a new Ciel skill
6
6
 
7
- *Generates a valid SKILL.md scaffold following Anthropic Skills-first rules (kebab-case name ≤64 chars, YAML frontmatter ≤1024-char description, ≤500-line body, progressive disclosure to one reference.md).*
7
+ *Generates a valid SKILL.md scaffold following Anthropic Skills-first rules (kebab-case name ≤64 chars, YAML frontmatter ≤1536-char description, ≤500-line body, progressive disclosure to one reference.md).*
8
8
 
9
9
  Usage: `/ciel-create-skill <name> <purpose>`
10
10
 
@@ -15,7 +15,7 @@ Usage: `/ciel-status [--check]`
15
15
  ```
16
16
  ## CIEL STATUS
17
17
 
18
- Version: v6.11.0
18
+ Version: v6.11.1
19
19
  Platform: Claude Code
20
20
  Config: .claude/settings.json — OK (4 hooks registered)
21
21
  Skills directory: skills/ — 43 skills loaded
@@ -265,7 +265,7 @@ Generates a valid SKILL.md scaffold following Ciel's conventions. Returns a diff
265
265
  - Third person: "Analyzes X" ✓ / "I analyze X" ✗
266
266
  - Front-load use case + trigger keywords
267
267
  - Include "Use when..." clause
268
- - ≤ 1024 chars, recommended 200-500
268
+ - ≤ 1536 chars, recommended 200-500
269
269
 
270
270
  ### 3. Scaffold SKILL.md
271
271
 
@@ -333,7 +333,7 @@ Problems: no trigger, no output, no specificity.
333
333
 
334
334
  - [ ] Name valid kebab-case, ≤ 64 chars, unique?
335
335
  - [ ] Category is one of the 5 valid categories?
336
- - [ ] Description: third person, ≤ 1024 chars, includes trigger?
336
+ - [ ] Description: third person, ≤ 1536 chars, includes trigger?
337
337
  - [ ] SKILL.md ≤ 300 lines?
338
338
  - [ ] No overlap with existing skills (grep checked)?
339
339
  - [ ] YAML frontmatter valid?
@@ -7,12 +7,12 @@ subtask: true
7
7
  > **OpenCode note**: This command requires `claude --print` headless mode for full functionality (binary evals, skill scaffold generation). On OpenCode it runs in degraded mode — the improver agent returns proposals only. For the full harness, use Claude Code.
8
8
 
9
9
  ---
10
- description: Generates a valid Ciel SKILL.md scaffold following Anthropic Skills-first rules (kebab-case ≤64, YAML description ≤1024, body ≤500 lines).
10
+ description: Generates a valid Ciel SKILL.md scaffold following Anthropic Skills-first rules (kebab-case ≤64, YAML description ≤1536, body ≤500 lines).
11
11
  ---
12
12
 
13
13
  # /ciel-create-skill — Create a new Ciel skill
14
14
 
15
- *Generates a valid SKILL.md scaffold following Anthropic Skills-first rules (kebab-case name ≤64 chars, YAML frontmatter ≤1024-char description, ≤500-line body, progressive disclosure to one reference.md).*
15
+ *Generates a valid SKILL.md scaffold following Anthropic Skills-first rules (kebab-case name ≤64 chars, YAML frontmatter ≤1536-char description, ≤500-line body, progressive disclosure to one reference.md).*
16
16
 
17
17
  Usage: `/ciel-create-skill <name> <purpose>`
18
18
 
@@ -0,0 +1,195 @@
1
+ ---
2
+ description: Scan project for ingestable tribal docs (lessons.md, ciel-overlay.md, .claude/rules/, Claude Code auto-memory at ~/.claude/projects/<slug>/memory/, etc.) and propose ingestion into the cued-recall memory under .ciel/memory/. Reports findings if no sources found. Always confirms each candidate with the user before writing.
3
+ ---
4
+
5
+ # /ciel-memory-bootstrap — Initialize Cued-Recall Memory
6
+
7
+ **Purpose:** First-run scan of an existing project to convert tribal knowledge already documented in `lessons.md`, `ciel-overlay.md`, `.claude/rules/`, Claude Code's per-project auto-memory (`~/.claude/projects/<slug>/memory/`), and similar files into the structured cued-recall memory at `.ciel/memory/`.
8
+
9
+ **Usage:** `/ciel-memory-bootstrap` (no args)
10
+
11
+ This is **deterministic**: no agent dispatch, no pipeline, no DIVERGE/EVALUER. Just scan, propose, write on user confirmation.
12
+
13
+ ---
14
+
15
+ ## Instructions
16
+
17
+ You are bootstrapping the cued-recall memory for this project. Follow these steps in order.
18
+
19
+ ### Step 1 — Scan
20
+
21
+ Run the bootstrap script in `scan` mode:
22
+
23
+ ```bash
24
+ # Try installed location first, fallback to dev location
25
+ script="$CLAUDE_PROJECT_DIR/.claude/hooks/memory-bootstrap.sh"
26
+ [ -f "$script" ] || script="$CLAUDE_PROJECT_DIR/hooks/memory-bootstrap.sh"
27
+ bash "$script" scan
28
+ ```
29
+
30
+ Or, if running on an installed Ciel: `bash "$HOME/.ciel/hooks/memory-bootstrap.sh" scan`.
31
+
32
+ Report the output verbatim to the user.
33
+
34
+ ### Step 2 — Decide path
35
+
36
+ Based on the scan output:
37
+
38
+ - **If 0 sources found** → tell the user clearly: "No tribal docs to bootstrap from. The cued-recall memory will populate organically as you intervene with me. Nothing more to do." End here.
39
+ - **If sources found** → proceed to Step 3.
40
+
41
+ ### Step 3 — Initialize structure
42
+
43
+ Run:
44
+
45
+ ```bash
46
+ script="$CLAUDE_PROJECT_DIR/.claude/hooks/memory-bootstrap.sh"
47
+ [ -f "$script" ] || script="$CLAUDE_PROJECT_DIR/hooks/memory-bootstrap.sh"
48
+ bash "$script" ingest
49
+ ```
50
+
51
+ This creates `.ciel/memory/{episodes,concepts,guards}/` and an empty `index.json`. It does NOT auto-write memories — auto-ingestion would create cargo-cult entries from possibly-stale docs (see ADR-0001).
52
+
53
+ ### Step 4 — Read each source
54
+
55
+ For each source found in Step 1, `Read` the file fully. Identify candidate memories:
56
+
57
+ | Source format | What becomes a memory |
58
+ |---|---|
59
+ | `[YYYY-MM-DD] MISTAKE: X → RULE: Y` lines (lessons.md style) | One memory per line. Title = the rule. |
60
+ | `## Heading\n\n- rule\n- rule` (rules.md style) | One memory per rule. |
61
+ | Numbered lessons in `ciel-overlay.md` "Key Lessons" | One memory per lesson. |
62
+ | `## section` in CLAUDE.md/AGENTS.md describing a non-obvious convention | One memory per section. |
63
+ | **Claude Code auto-memory** entries (`~/.claude/projects/<slug>/memory/*.md`, excluding `MEMORY.md`) | One memory per file. Title = frontmatter `description`. Cues derived per "Auto-memory mapping" below. |
64
+
65
+ #### Auto-memory mapping (special parser)
66
+
67
+ Claude Code auto-memory uses a different frontmatter than Ciel's cued-recall. Each source file looks like:
68
+
69
+ ```yaml
70
+ ---
71
+ name: feedback-okhttp-cookiejar-override
72
+ description: Neiyomi shared PersistentCookieJar overrides manual Cookie headers via OkHttp BridgeInterceptor
73
+ metadata:
74
+ type: feedback
75
+ ---
76
+
77
+ (body markdown — Context / Why / How to apply sections)
78
+ ```
79
+
80
+ When you encounter a file under `$AUTO_MEMORY_DIR` (Claude Code's per-project auto-memory directory, `~/.claude/projects/<slug>/memory/` by default), map it to a Ciel episode as follows:
81
+
82
+ | Auto-memory field | Ciel frontmatter field | Notes |
83
+ |---|---|---|
84
+ | `description:` | `title:` | one-line summary |
85
+ | `name:` | base of slug for filename | already kebab-case |
86
+ | `metadata.type:` (`user`/`feedback`/`project`/`reference`) | `intents:` `[<type>]` plus topic-specific intents inferred from body | e.g. `feedback` + `okhttp` + `cookie` |
87
+ | body markdown | Ciel episode body, verbatim | preserve Context/Why/How to apply structure |
88
+ | paths cited in body (e.g. `src/`, `*.kt`, `Caddyfile`) | `path_patterns:` | infer from grep — narrow patterns preferred |
89
+ | symbols cited in body (class/function/table names) | `symbols:` | infer from grep |
90
+ | language hint (file extensions in body) | `languages:` | `kotlin`/`typescript`/`python`/`sql`/etc. |
91
+ | _(no source field — NEW)_ | `captured_from: auto-memory-migration` | set by the migration; distinguishes these episodes from user-intervention captures |
92
+
93
+ **Skip `MEMORY.md`** — it's a table-of-contents index, not memory content. The scan already excludes it.
94
+
95
+ **Move, don't delete.** After successfully writing an episode file for an auto-memory entry, MOVE the source file to `$AUTO_MEMORY_DIR/.migrated-to-ciel/<filename>` (never delete it) so the user can audit the migration. The `MEMORY.md` index file itself stays in place — Claude Code may regenerate it on next session.
96
+
97
+ Skip:
98
+ - The pipeline / workflow descriptions (those belong in CLAUDE.md, not memory)
99
+ - General principles already in CLAUDE.md
100
+ - Anything that's just project description (READMEish)
101
+ - Code examples (those go in skills/, not memory)
102
+
103
+ ### Step 5 — Propose batch capture
104
+
105
+ Once you have N candidate memories from the sources, present them to the user **as a batch**, not one by one (avoid 50 confirmation prompts). Use a single `AskUserQuestion` with the structure:
106
+
107
+ > "Found N candidates from your tribal docs. I'll list them; you tell me which to capture, which to skip, or 'all'."
108
+
109
+ For each candidate, show:
110
+ - **Title** (one line)
111
+ - **Source** (file:line)
112
+ - **Suggested tags** (paths, symbols, intents, language inferred from the lesson content)
113
+
114
+ The user replies with: "all", "1,3,5,8" (specific indices), or "skip".
115
+
116
+ ### Step 6 — Write captured memories
117
+
118
+ For each captured candidate, create `.ciel/memory/episodes/<YYYY-MM-DD>-<slug>.md` with frontmatter:
119
+
120
+ ```yaml
121
+ ---
122
+ id: mem_<NNN>
123
+ title: <title>
124
+ languages: [<inferred>]
125
+ path_patterns:
126
+ - <pattern>
127
+ symbols: [<inferred>]
128
+ intents: [<inferred>]
129
+ captured_at: <ISO8601 now>
130
+ captured_from: bootstrap
131
+ source: <original-file:line>
132
+ trigger_count: 0
133
+ last_triggered: null
134
+ stale_after_days: 90
135
+ stale: false
136
+ ---
137
+
138
+ # <title>
139
+
140
+ <content from source, lightly cleaned>
141
+ ```
142
+
143
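+ ID strategy: read existing `index.json` for the max id and increment it. Slug = first 5 words of the title, kebab-cased (for auto-memory entries, use the source `name:` as the slug base instead, per the mapping table in Step 4).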
144
+
145
+ ### Step 7 — Rebuild index
146
+
147
+ After all writes, regenerate `.ciel/memory/index.json` by parsing every frontmatter under `.ciel/memory/{episodes,concepts,guards}/`:
148
+
149
+ ```python
150
+ # pseudo — use python3 -c '...' inline
151
+ for each *.md file:
152
+ parse frontmatter
153
+ add to memories dict by id
154
+ for each path_pattern, symbol, intent, language:
155
+ append id to corresponding by_* index
156
+ write back to index.json
157
+ ```
158
+
159
+ ### Step 8 — Confirm
160
+
161
+ Report:
162
+
163
+ - N memories captured
164
+ - Sources processed
165
+ - Index rebuilt with M total entries
166
+ - Suggest: "Cued-recall memory now active. Memories will auto-inject when their cues match in future tasks. Run `/ciel-memory-bootstrap` again anytime to re-scan for new tribal docs."
167
+
168
+ ---
169
+
170
+ ## Constraints
171
+
172
+ - **Never write a memory without user confirmation.** Even on bulk confirmation ("all"), display the list first.
173
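+ - **Do not delete the source files.** Bootstrap converts; the user keeps the originals as long as they want. The only relocation is for migrated auto-memory entries, which are moved (not deleted) to `.migrated-to-ciel/` as described in Step 4.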
174
+ - **Tag conservatively.** A memory tagged with `**/*` will fire on every task and pollute. If unsure, narrow the path pattern.
175
+ - **No agent dispatch.** This command is deterministic and runs inline.
176
+ - **Idempotent.** Re-running on an already-bootstrapped project should detect existing memories (by source field) and offer to skip duplicates.
177
+
178
+ ---
179
+
180
+ ## Failure modes
181
+
182
+ | Symptom | Cause | Fix |
183
+ |---|---|---|
184
+ | Script not found | `$CLAUDE_PROJECT_DIR` not set | Try `$HOME/.ciel/hooks/memory-bootstrap.sh` instead |
185
+ | Nothing scanned | No tribal docs in this project | Working as intended; report and end |
186
+ | Memories all tagged with broad paths | Source content didn't include path hints | Ask user to refine tags after listing |
187
+ | index.json malformed after rebuild | python3 parse error | Recreate empty index, re-run rebuild step |
188
+ | Auto-memory not detected | Slug derivation mismatch (cwd has unexpected characters) | Override via `CIEL_AUTO_MEMORY_DIR=<absolute-path> bash hooks/memory-bootstrap.sh scan` |
189
+ | Auto-memory file has `name:` but no `description:` | Older auto-memory format | Use first heading or filename as title; ask user to confirm before writing |
190
+
191
+ ## See also
192
+
193
+ - `docs/adrs/0001-cued-recall-memory.md` — full design rationale
194
+ - `skills/workflow/memoire/SKILL.md` — capture/recall flow
195
+ - `skills/workflow/memoire-consolidator/SKILL.md` — periodic maintenance
@@ -0,0 +1,88 @@
1
+ ---
2
+ name: adr-auto
3
+ description: How to document architectural decisions automatically in Ciel v5 (etape 12). After FAIRE but before RELIRE, if the task involved a significant architectural decision, write an ADR (Architecture Decision Record) to docs/adrs/. Prevents knowledge loss.
4
+ ---
5
+
6
+ # Automatic ADR — Document Decisions in Real Time (Ciel v5)
7
+
8
+ ## What this covers
9
+
10
+ How to document architectural decisions during the Ciel v5 pipeline (etape 12: ADR). After FAIRE but before RELIRE, if the task involved a significant architectural decision, write an ADR. The decision is documented while fresh, not months later.
11
+
12
+ ## Core principle
13
+
14
+ **If the decision was non-trivial, document WHY.** Code shows WHAT. ADRs show WHY. Without ADRs, future developers (or future you) will wonder why the code is the way it is.
15
+
16
+ ## When to write an ADR
17
+
18
+ Write an ADR when the task involves:
19
+ - Adding a new dependency/library
20
+ - Choosing between two technologies
21
+ - Changing a database schema
22
+ - Adopting a design pattern
23
+ - Making a performance trade-off
24
+ - Changing the build/deploy pipeline
25
+ - Any decision with long-term consequences
26
+
27
+ Do NOT write an ADR for:
28
+ - Bug fixes (tests document the fix)
29
+ - Refactoring without semantic change
30
+ - Renames/reorganizations
31
+ - Dependency upgrades (changelog suffices)
32
+
33
+ ## ADR format (based on Michael Nygard's template)
34
+
35
+ ```
36
+ # ADR-<NNN>: <Title>
37
+
38
+ ## Status
39
+
40
+ <proposed | accepted | deprecated | superseded by ADR-NNN>
41
+
42
+ ## Context
43
+
44
+ <What is the issue that we're seeing that is motivating this decision or change? 2-3 sentences.>
45
+
46
+ ## Decision
47
+
48
+ <What is the change that we're proposing and/or doing? 1-2 sentences.>
49
+
50
+ ## Consequences
51
+
52
+ <What becomes easier or harder to do because of this change? 2-3 items.>
53
+
54
+ ## References
55
+
56
+ <Link to relevant docs, tickets, or PRs>
57
+ ```
58
+
59
+ ## File naming
60
+
61
+ `docs/adrs/<NNN>-<kebab-case-title>.md`
62
+
63
+ Start at 001 and increment.
64
+
65
+ ## How to trigger (Ciel v5)
66
+
67
+ In the Ciel pipeline (etape 12), during ADR:
68
+ 1. Check if the task involved a significant decision (see list above)
69
+ 2. If yes -> write `docs/adrs/<NNN>-<title>.md`
70
+ 3. Update `.ciel/map.json` to reference the new ADR
71
+ 4. Reference the ADR in the RELIRE submission so the critic can check it
72
+
73
+ ## Common rationalizations
74
+
75
+ | Rationalization | Reality |
76
+ |---|---|
77
+ | "The code is self-documenting" | Code shows WHAT. ADRs show WHY. Six months from now, "why did we choose this" is not visible in the code. |
78
+ | "I'll add it later" | Later is when the decision is forgotten and the context is lost. Write it now or it never gets written. |
79
+ | "This decision is too small for an ADR" | If you had to think about it for more than 30 seconds, it's big enough for an ADR. |
80
+ | "Nobody reads ADRs anyway" | Nobody reads them until they need to undo a decision and can't figure out why it was made. Then they're invaluable. |
81
+
82
+ ## How to verify
83
+
84
+ - [ ] ADR written for every significant decision?
85
+ - [ ] No ADR written for trivial changes?
86
+ - [ ] ADR includes context, decision, consequences?
87
+ - [ ] Map updated with ADR reference?
88
+ - [ ] ADR committed with the code?
@@ -0,0 +1,180 @@
1
+ ---
2
+ name: ai-failure-modes-detector
3
+ description: Detects the six canonical failure modes of LLM-generated code — invented APIs, hallucinated dependencies, version drift, async/sync mismatch, confident-wrong logic, and extrinsic hallucination (plausible but unverifiable output). Runs self-consistency triple-generation checks, AST-based dependency audits, and uncertainty scoring. Triggers BEFORE merging agent-authored code, especially when the author is an LLM. Partners with doc-validator-official (API-level) and self-consistency-verifier (semantic-level).
4
+ allowed-tools: Read, Grep, Glob, Bash
5
+ ---
6
+
7
+ # ai-failure-modes-detector — Catch confident-wrong before it lands
8
+
9
+ LLM-generated code compiles more often than it's correct. Six failure modes account for >90% of post-merge incidents in agentic PRs (ISSTA 2025). This skill runs each check systematically.
10
+
11
+ ---
12
+
13
+ ## Inputs (infer before asking — see orchestrator's Autonomy protocol)
14
+
15
+ ```
16
+ CODE_UNDER_REVIEW: [file paths OR diff hunk]
17
+ AUTHOR: [human | LLM | mixed]
18
+ PROPOSED_DEPS: [new dependencies being added, if any]
19
+ TEST_COVERAGE: [files that have tests | files without]
20
+ ```
21
+
22
+ ### Auto-inference sources (exhaust BEFORE asking the user)
23
+
24
+ - **CODE_UNDER_REVIEW** → `git diff HEAD~1` (last commit) or `git diff main...HEAD` (branch diff) — usually the intent. If user said "this file", extract from prompt.
25
+ - **AUTHOR** → check the last commit's message / co-author trailer. `Co-Authored-By: Claude` or `Generated with Claude Code` → LLM. Otherwise human. If unsure, assume `mixed` (safer default).
26
+ - **PROPOSED_DEPS** → `git diff HEAD~1 -- package.json go.mod requirements.txt` → list added entries. Zero added → skip dep-hallucination check.
27
+ - **TEST_COVERAGE** → for each changed file in CODE_UNDER_REVIEW, check if a corresponding `*.test.*` / `*_test.go` / `test_*.py` exists next to it.
28
+
29
+ Never ask the user for AUTHOR — always inferable from git. Never ask for TEST_COVERAGE — always checkable via filesystem.
30
+
31
+ ---
32
+
33
+ ## The six failure modes
34
+
35
+ ### 1. Invented APIs
36
+
37
+ Function/class/method that doesn't exist in the library at the pinned version.
38
+
39
+ **Detection**:
40
+ - Grep every import and every method call on imported symbols
41
+ - Cross-reference with `node_modules/<pkg>/package.json` + type definitions
42
+ - For dynamic imports (`await import()`), inspect at runtime if possible
43
+
44
+ **Signal**: import resolves but `<symbol>` not in the `.d.ts` or `__init__.py`.
45
+
46
+ ### 2. Hallucinated dependencies
47
+
48
+ An npm or pip package that doesn't exist on the registry (or that is a typo-squat of a real package).
49
+
50
+ **Detection**:
51
+ - For each new dep in PROPOSED_DEPS: `npm view <pkg> --json` or `pip index versions <pkg>`
52
+ - Check publisher reputation (weekly downloads, last publish date, repo link present)
53
+ - Typo-squat check: Levenshtein distance ≤ 2 from a popular package name is SUSPICIOUS
54
+
55
+ **Signal**: registry returns 404, or package has < 100 downloads/week with no repo.
56
+
57
+ ### 3. Version drift
58
+
59
+ Code uses an API that exists but at a different version than pinned.
60
+
61
+ **Detection**:
62
+ - For each external API call, check "Added in vX.Y" / "Deprecated in vX.Y" metadata
63
+ - Compare against pinned version in lockfile
64
+
65
+ **Signal**: API exists in v2, code pins v1 — silently broken.
66
+
67
+ ### 4. Async/sync mismatch
68
+
69
+ Sync call in an async codebase or a Promise-returning function not awaited.
70
+
71
+ **Detection** (TS):
72
+ - `@typescript-eslint/no-floating-promises`
73
+ - Grep for `fs.readFileSync` and other `*Sync` calls (sync in async), and for `fetch(` or other `async` function calls that are not awaited
74
+ - Any `Promise<T>` returned from a function whose callers don't `await`
75
+
76
+ **Detection** (Python):
77
+ - Sync `requests.get()` inside an `async def`
78
+ - `asyncio.run()` called inside an event loop
79
+
80
+ **Signal**: type checker emits "Promise returned but not awaited" OR sync call blocks in async context.
81
+
82
+ ### 5. Confident-wrong logic
83
+
84
+ Code is syntactically and typing-wise valid, passes linting, but is semantically wrong:
85
+ - Off-by-one on pagination
86
+ - Wrong operator (`>=` where `>` needed)
87
+ - Negated boolean
88
+ - Swapped arguments of same type
89
+
90
+ **Detection**:
91
+ - Run existing tests (if present) — failing tests is the first signal
92
+ - Invariant check: can you state in 1 sentence what the code guarantees? Does it actually guarantee it?
93
+ - For any numerical boundary, ask: "off-by-one in either direction — which breaks?"
94
+
95
+ **Signal**: behavior divergence between stated goal and actual execution.
96
+
97
+ ### 6. Extrinsic hallucination
98
+
99
+ Output is plausible but references facts outside the code that cannot be verified:
100
+ - Cites a spec section that doesn't exist
101
+ - Comments claim "per RFC 7231 §5.3" when section 5.3 doesn't cover that
102
+ - Error codes invented (`ERR_USER_QUOTA_EXCEEDED` — is that really thrown?)
103
+
104
+ **Detection**:
105
+ - Every code comment with a source claim → spot-check
106
+ - Every user-facing string (error codes, log messages) → grep for prior use in the codebase
107
+
108
+ **Signal**: claim cannot be corroborated.
109
+
110
+ ---
111
+
112
+ ## Report format
113
+
114
+ ```
115
+ ## AI-FAILURE-MODES VERDICT
116
+
117
+ ### Author
118
+ LLM (auto-detected via commit message pattern | user-declared)
119
+
120
+ ### Findings by mode
121
+ 1. Invented APIs:
122
+ [BLOCK] src/auth.ts:42 — `jwt.verifyStrict()` not in jsonwebtoken@9.0.2 (use `verify()` with `algorithms` option)
123
+
124
+ 2. Hallucinated deps:
125
+ (none — all 3 new deps exist on npm, >10k weekly downloads)
126
+
127
+ 3. Version drift:
128
+ [WARN] src/db.ts:18 — `drizzle.innerJoin()` added in v0.30, pinned 0.29 — upgrade drizzle-orm
129
+
130
+ 4. Async/sync mismatch:
131
+ [BLOCK] src/upload.ts:55 — `fs.writeFileSync()` inside async handler — blocks event loop
132
+
133
+ 5. Confident-wrong:
134
+ [WARN] src/pagination.ts:22 — `offset = page * pageSize` — off-by-one if `page` is 1-indexed (page=1 skips the first `pageSize` rows); use `(page - 1) * pageSize`
135
+
136
+ 6. Extrinsic:
137
+ [INFO] src/rate-limit.ts:10 — comment cites "per RFC 6585 §4" — corroborated: 429 Too Many Requests is defined in Section 4 of RFC 6585; prefer the form "RFC 6585, Section 4" (comment is right, citation format wrong)
138
+
139
+ ### Summary
140
+ BLOCK: 2
141
+ WARN: 2
142
+ INFO: 1
143
+ ```
144
+
145
+ ---
146
+
147
+ ## Guardrails
148
+
149
+ - **BLOCK means don't merge** — invented APIs, hallucinated deps, and async/sync mismatches are production-breaking.
150
+ - **WARN means discuss in review** — not auto-blocking but requires human acknowledgment.
151
+ - **Run against diff, not whole repo** — old code isn't the subject; the new change is.
152
+ - **When tests are absent**, confidence in "confident-wrong" findings drops — request tests be added before clearing the review.
153
+ - **Don't false-positive on stubs** — intentional mocks in `__mocks__/` or `test-helpers/` may reference not-yet-implemented APIs; verify context.
154
+ - **Typo-squat false positives**: popular packages sometimes have close cousins (`request` vs `request-promise`) — check download count AND repo history before flagging.
155
+
156
+ ---
157
+
158
+ ## How to verify
159
+
160
+ - [ ] All 6 failure modes checked (invented APIs, hallucinated deps, version drift, async/sync, confident-wrong, extrinsic)?
161
+ - [ ] Each finding has evidence (file:line or URL)?
162
+ - [ ] VERDICT issued (CLEAN / FINDINGS)?
163
+ - [ ] Author identified (LLM vs human)?
164
+ - [ ] External API calls validated against official docs?
165
+
166
+ ## When triggered
167
+
168
+ - Post-write hook when AUTHOR=LLM and task is Standard/Critical
169
+ - Before any PR merge authored wholly or partially by an agent
170
+ - After `@ciel-explorer` completes CODEBASE review
171
+ - User command: "audit this code for AI mistakes"
172
+
173
+ ---
174
+
175
+ ## References
176
+
177
+ - ISSTA 2025 — "LLM Hallucinations in Practical Code Generation: Phenomena, Mechanism, and Mitigation"
178
+ - arxiv 2601.19106 — "Detecting and Correcting Hallucinations in LLM-Generated Code"
179
+ - arxiv 2404.00971 — "Beyond Functional Correctness"
180
+ - Anthropic 2604.08906 — agentic framework failure taxonomy