@tgoodington/intuition 8.1.3 → 9.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111) hide show
  1. package/docs/v9/decision-framework-direction.md +142 -0
  2. package/docs/v9/decision-framework-implementation.md +114 -0
  3. package/docs/v9/domain-adaptive-team-architecture.md +1016 -0
  4. package/docs/v9/test/SESSION_SUMMARY.md +117 -0
  5. package/docs/v9/test/TEST_PLAN.md +119 -0
  6. package/docs/v9/test/blueprints/legal-analyst.md +166 -0
  7. package/docs/v9/test/output/07_cover_letter.md +41 -0
  8. package/docs/v9/test/phase2/mock_plan.md +89 -0
  9. package/docs/v9/test/phase2/producers.json +32 -0
  10. package/docs/v9/test/phase2/specialists/database-architect.specialist.md +10 -0
  11. package/docs/v9/test/phase2/specialists/financial-analyst.specialist.md +10 -0
  12. package/docs/v9/test/phase2/specialists/legal-analyst.specialist.md +10 -0
  13. package/docs/v9/test/phase2/specialists/technical-writer.specialist.md +10 -0
  14. package/docs/v9/test/phase2/team_assignment.json +61 -0
  15. package/docs/v9/test/phase3/blueprints/legal-analyst.md +840 -0
  16. package/docs/v9/test/phase3/legal-analyst-full.specialist.md +111 -0
  17. package/docs/v9/test/phase3/project_context/nh_landlord_tenant_notes.md +35 -0
  18. package/docs/v9/test/phase3/project_context/property_facts.md +32 -0
  19. package/docs/v9/test/phase3b/blueprints/legal-analyst.md +1715 -0
  20. package/docs/v9/test/phase3b/legal-analyst.specialist.md +153 -0
  21. package/docs/v9/test/phase3b/scratch/legal-analyst-stage1.md +270 -0
  22. package/docs/v9/test/phase4/TEST_PLAN.md +32 -0
  23. package/docs/v9/test/phase4/blueprints/financial-analyst-T2.md +538 -0
  24. package/docs/v9/test/phase4/blueprints/legal-analyst-T4.md +253 -0
  25. package/docs/v9/test/phase4/cross-blueprint-check.md +280 -0
  26. package/docs/v9/test/phase4/scratch/financial-analyst-T2-stage1.md +67 -0
  27. package/docs/v9/test/phase4/scratch/legal-analyst-T4-stage1.md +54 -0
  28. package/docs/v9/test/phase4/specialists/financial-analyst.specialist.md +156 -0
  29. package/docs/v9/test/phase4/specialists/legal-analyst.specialist.md +153 -0
  30. package/docs/v9/test/phase5/TEST_PLAN.md +35 -0
  31. package/docs/v9/test/phase5/blueprints/code-architect-hw-vetter.md +375 -0
  32. package/docs/v9/test/phase5/output/04_compliance_checklist.md +149 -0
  33. package/docs/v9/test/phase5/output/hardware-vetter-SKILL-v2.md +561 -0
  34. package/docs/v9/test/phase5/output/hardware-vetter-SKILL.md +459 -0
  35. package/docs/v9/test/phase5/producers/code-writer.producer.md +49 -0
  36. package/docs/v9/test/phase5/producers/document-writer.producer.md +62 -0
  37. package/docs/v9/test/phase5/regression-comparison-v2.md +60 -0
  38. package/docs/v9/test/phase5/regression-comparison.md +197 -0
  39. package/docs/v9/test/phase5/review-5A-specialist.md +213 -0
  40. package/docs/v9/test/phase5/specialist-test/TEST_PLAN.md +60 -0
  41. package/docs/v9/test/phase5/specialist-test/blueprint-comparison.md +252 -0
  42. package/docs/v9/test/phase5/specialist-test/blueprints/code-architect-hw-vetter.md +916 -0
  43. package/docs/v9/test/phase5/specialist-test/scratch/code-architect-stage1.md +427 -0
  44. package/docs/v9/test/phase5/specialists/code-architect.specialist.md +168 -0
  45. package/docs/v9/test/phase5b/TEST_PLAN.md +219 -0
  46. package/docs/v9/test/phase5b/blueprints/5B-10-stage2-with-decisions.md +286 -0
  47. package/docs/v9/test/phase5b/decisions/5B-2-accept-all-decisions.json +68 -0
  48. package/docs/v9/test/phase5b/decisions/5B-3-promote-decisions.json +70 -0
  49. package/docs/v9/test/phase5b/decisions/5B-4-individual-decisions.json +68 -0
  50. package/docs/v9/test/phase5b/decisions/5B-5-triage-decisions.json +110 -0
  51. package/docs/v9/test/phase5b/decisions/5B-6-fallback-decisions.json +40 -0
  52. package/docs/v9/test/phase5b/decisions/5B-8-partial-decisions.json +46 -0
  53. package/docs/v9/test/phase5b/decisions/5B-9-complete-decisions.json +54 -0
  54. package/docs/v9/test/phase5b/scratch/code-architect-stage1.md +133 -0
  55. package/docs/v9/test/phase5b/specialists/code-architect.specialist.md +202 -0
  56. package/docs/v9/test/phase5b/stage1-many-decisions.md +139 -0
  57. package/docs/v9/test/phase5b/stage1-no-assumptions.md +70 -0
  58. package/docs/v9/test/phase5b/stage1-with-assumptions.md +86 -0
  59. package/docs/v9/test/phase5b/test-5B-1-results.md +157 -0
  60. package/docs/v9/test/phase5b/test-5B-10-results.md +130 -0
  61. package/docs/v9/test/phase5b/test-5B-2-results.md +75 -0
  62. package/docs/v9/test/phase5b/test-5B-3-results.md +104 -0
  63. package/docs/v9/test/phase5b/test-5B-4-results.md +114 -0
  64. package/docs/v9/test/phase5b/test-5B-5-results.md +126 -0
  65. package/docs/v9/test/phase5b/test-5B-6-results.md +60 -0
  66. package/docs/v9/test/phase5b/test-5B-7-results.md +141 -0
  67. package/docs/v9/test/phase5b/test-5B-8-results.md +115 -0
  68. package/docs/v9/test/phase5b/test-5B-9-results.md +76 -0
  69. package/docs/v9/test/producers/document-writer.producer.md +62 -0
  70. package/docs/v9/test/specialists/legal-analyst.specialist.md +58 -0
  71. package/package.json +4 -2
  72. package/producers/code-writer/code-writer.producer.md +86 -0
  73. package/producers/data-file-writer/data-file-writer.producer.md +116 -0
  74. package/producers/document-writer/document-writer.producer.md +117 -0
  75. package/producers/form-filler/form-filler.producer.md +99 -0
  76. package/producers/presentation-creator/presentation-creator.producer.md +109 -0
  77. package/producers/spreadsheet-builder/spreadsheet-builder.producer.md +107 -0
  78. package/scripts/install-skills.js +88 -7
  79. package/scripts/uninstall-skills.js +3 -0
  80. package/skills/intuition-agent-advisor/SKILL.md +107 -0
  81. package/skills/intuition-assemble/SKILL.md +261 -0
  82. package/skills/intuition-build/SKILL.md +211 -151
  83. package/skills/intuition-debugger/SKILL.md +4 -4
  84. package/skills/intuition-design/SKILL.md +7 -3
  85. package/skills/intuition-detail/SKILL.md +377 -0
  86. package/skills/intuition-engineer/SKILL.md +8 -4
  87. package/skills/intuition-handoff/SKILL.md +251 -213
  88. package/skills/intuition-handoff/references/handoff_core.md +16 -16
  89. package/skills/intuition-initialize/SKILL.md +20 -5
  90. package/skills/intuition-initialize/references/state_template.json +16 -1
  91. package/skills/intuition-plan/SKILL.md +139 -59
  92. package/skills/intuition-plan/references/magellan_core.md +8 -8
  93. package/skills/intuition-plan/references/templates/plan_template.md +5 -5
  94. package/skills/intuition-prompt/SKILL.md +89 -27
  95. package/skills/intuition-start/SKILL.md +42 -9
  96. package/skills/intuition-start/references/start_core.md +12 -12
  97. package/skills/intuition-test/SKILL.md +345 -0
  98. package/specialists/api-designer/api-designer.specialist.md +291 -0
  99. package/specialists/business-analyst/business-analyst.specialist.md +270 -0
  100. package/specialists/copywriter/copywriter.specialist.md +268 -0
  101. package/specialists/database-architect/database-architect.specialist.md +275 -0
  102. package/specialists/devops-infrastructure/devops-infrastructure.specialist.md +314 -0
  103. package/specialists/financial-analyst/financial-analyst.specialist.md +269 -0
  104. package/specialists/frontend-component/frontend-component.specialist.md +293 -0
  105. package/specialists/instructional-designer/instructional-designer.specialist.md +285 -0
  106. package/specialists/legal-analyst/legal-analyst.specialist.md +260 -0
  107. package/specialists/marketing-strategist/marketing-strategist.specialist.md +281 -0
  108. package/specialists/project-manager/project-manager.specialist.md +266 -0
  109. package/specialists/research-analyst/research-analyst.specialist.md +273 -0
  110. package/specialists/security-auditor/security-auditor.specialist.md +354 -0
  111. package/specialists/technical-writer/technical-writer.specialist.md +275 -0
@@ -0,0 +1,219 @@
1
+ # Phase 5B: User Gate Protocol Tests
2
+
3
+ ## Objective
4
+ Validate the refined user gate protocol (D19-D22) — assumptions/decisions separation, adaptive presentation, decisions.json, crash recovery, and fallback behavior.
5
+
6
+ ## Context
7
+ These tests exercise the **foreground detail skill's** gate logic, not the specialist subagents. The gate is the skill's core responsibility: parsing Stage 1 output, presenting to the user, collecting input, writing decisions.json.
8
+
9
+ Since the detail skill doesn't exist yet, these tests validate the **protocol design** by simulating the gate manually (human plays the skill role, subagents produce Stage 1 output). This proves the protocol works before we build the skill.
10
+
11
+ ## Results Summary
12
+
13
+ | Test | Description | Verdict |
14
+ |------|-------------|---------|
15
+ | 5B-1 | Assumptions/Decisions split format | PASS |
16
+ | 5B-2 | Accept all assumptions (fast path) | PASS |
17
+ | 5B-3 | Promote assumptions | PASS |
18
+ | 5B-4 | Individual decisions (1-7) | PASS |
19
+ | 5B-5 | Triage table (8+) | PASS |
20
+ | 5B-6 | Fallback — no assumptions section | PASS |
21
+ | 5B-7 | Incremental write validation | PASS |
22
+ | 5B-8 | Crash recovery — resume mid-gate | PASS |
23
+ | 5B-9 | Crash recovery — gate already complete | PASS |
24
+ | 5B-10 | Stage 2 honoring decisions.json | PASS |
25
+
26
+ **All 10 tests PASS.** Protocol design validated end-to-end.
27
+
28
+ ## Test Artifacts
29
+
30
+ All in `docs/v9/test/phase5b/`:
31
+
32
+ | Artifact | Purpose |
33
+ |----------|---------|
34
+ | `stage1-with-assumptions.md` | Stage 1 output with proper Assumptions + Decisions split (5A + 3D) |
35
+ | `stage1-no-assumptions.md` | Stage 1 output missing Assumptions section (fallback test) (4D) |
36
+ | `stage1-many-decisions.md` | Stage 1 output with 10+ decisions (8+ triage test) (3A + 10D) |
37
+ | `decisions/` | Output decisions.json files from gate tests |
38
+ | `test-5B-*-results.md` | Detailed results for each test |
39
+
40
+ ---
41
+
42
+ ## Test Cases
43
+
44
+ ### Test 5B-1: Assumptions/Decisions Separation in Stage 1 Output — PASS
45
+
46
+ **Question**: Can a specialist subagent reliably produce the assumptions/decisions split when instructed?
47
+
48
+ **Method**:
49
+ 1. Update the code-architect specialist profile to include the assumptions/decisions guidance and format compliance rules
50
+ 2. Run a Stage 1 subagent (opus) on the hardware vetter task with the updated profile
51
+ 3. Examine the output for proper structure
52
+
53
+ **Pass criteria**:
54
+ - `## Assumptions` section exists with `### A1:`, `### A2:` entries
55
+ - `## Key Decisions` section exists with `### D1:`, `### D2:` entries
56
+ - Each assumption has `**Default**:` and `**Rationale**:` fields
57
+ - Each decision has `**Options**:`, `**Recommendation**:`, `**Risk if wrong**:` fields
58
+ - Classification is reasonable (clear best practices as assumptions, genuine choices as decisions)
59
+ - No items that obviously belong in the other category
60
+
61
+ **Result**: PASS — 7 assumptions, 3 decisions, all correctly formatted and classified. See `test-5B-1-results.md`.
62
+
63
+ ### Test 5B-2: Gate Phase 1 — Accept All Assumptions — PASS
64
+
65
+ **Question**: Does the "accept all" fast path work correctly?
66
+
67
+ **Method**:
68
+ 1. Start with `stage1-with-assumptions.md` (5 assumptions + 3 decisions)
69
+ 2. Simulate gate Phase 1: present assumptions, user selects "Accept all"
70
+ 3. Write decisions.json with all assumptions marked `"status": "accepted"`
71
+ 4. Verify Phase 2 proceeds with only the 3 decisions
72
+
73
+ **Result**: PASS — All 5 assumptions accepted in one click, gate proceeds to 3 decisions. See `test-5B-2-results.md`, `decisions/5B-2-accept-all-decisions.json`.
74
+
75
+ ### Test 5B-3: Gate Phase 1 — Promote Assumptions — PASS
76
+
77
+ **Question**: Does assumption promotion work with the simplified pattern (no domain-specific option construction)?
78
+
79
+ **Method**:
80
+ 1. Start with same `stage1-with-assumptions.md`
81
+ 2. User selects "I want to review some"
82
+ 3. User promotes A3 (model selection) and A5 (naming convention)
83
+ 4. Gate offers: specialist's default vs "Something else — I'll describe what I want"
84
+ 5. User provides overrides via free text
85
+
86
+ **Result**: PASS — Promoted assumptions recorded with `"status": "promoted"` and user overrides. Gate does not construct domain-specific options. Non-promoted assumptions accepted. Phase 2 proceeds with 3 original decisions. See `test-5B-3-results.md`, `decisions/5B-3-promote-decisions.json`.
87
+
88
+ **Edge case noted**: If user promotes but picks the default, consider recording as `"accepted"` rather than `"promoted"` with null override (simplification for implementation).
89
+
90
+ ### Test 5B-4: Gate Phase 2 — Individual Decisions (1-7) — PASS
91
+
92
+ **Question**: Does individual decision presentation with AskUserQuestion options work correctly?
93
+
94
+ **Method**:
95
+ 1. 3 decisions from Phase 1 output
96
+ 2. D1: user picks recommended (A)
97
+ 3. D2: user picks non-recommended (B)
98
+ 4. D3: user picks "Other" with custom text
99
+ 5. Write each to decisions.json incrementally
100
+
101
+ **Result**: PASS — All three response types (recommended, non-recommended, Other) recorded correctly. Context field populated. Incremental writes after each response. See `test-5B-4-results.md`, `decisions/5B-4-individual-decisions.json`.
102
+
103
+ ### Test 5B-5: Gate Phase 2 — Triage Table (8+ Decisions) — PASS
104
+
105
+ **Question**: Does the 8+ triage path work with multiSelect?
106
+
107
+ **Method**:
108
+ 1. `stage1-many-decisions.md` (3 assumptions + 10 decisions)
109
+ 2. Summary table of all 10 decisions with recommendations
110
+ 3. multiSelect: user picks D2 and D9 for discussion
111
+ 4. 8 remaining auto-resolved with specialist's recommendation
112
+ 5. Selected decisions presented individually
113
+
114
+ **Result**: PASS — Summary table presented, multiSelect works, auto-resolved decisions use recommended options, selected decisions go through normal individual flow. All 10 decisions in final file. See `test-5B-5-results.md`, `decisions/5B-5-triage-decisions.json`.
115
+
116
+ **Implementation note**: AskUserQuestion 2-4 option limit means the multiSelect shows a representative subset, not all 10. User types "Other" to add beyond the subset.
117
+
118
+ ### Test 5B-6: Fallback — No Assumptions Section — PASS
119
+
120
+ **Question**: Does the gate handle a stage1.md that has no `## Assumptions` heading?
121
+
122
+ **Method**:
123
+ 1. `stage1-no-assumptions.md` (4 decisions, no assumptions section)
124
+ 2. Gate detects missing section, skips Phase 1
125
+ 3. All items presented as decisions in Phase 2
126
+
127
+ **Result**: PASS — Phase 1 skipped cleanly, `assumptions: []` in output, all 4 decisions presented normally. See `test-5B-6-results.md`, `decisions/5B-6-fallback-decisions.json`.
128
+
129
+ ### Test 5B-7: decisions.json Incremental Write — PASS
130
+
131
+ **Question**: Does the read-before-write pattern produce valid JSON after each step?
132
+
133
+ **Method**:
134
+ 1. Walk through 5 assumptions + 3 decisions gate
135
+ 2. Verify JSON validity after each of 5 writes (startup, assumptions batch, D1, D2, D3)
136
+ 3. Verify no data loss between writes
137
+
138
+ **Result**: PASS — Valid JSON at every step. No data loss. `gate_started` persists, `gate_completed` transitions from null to timestamp on final write. See `test-5B-7-results.md`.
139
+
140
+ **Implementation note**: Phase 1 "accept all" should be a single atomic Write call (all-or-nothing for the batch). Each Phase 2 decision is an individual read+write cycle.
141
+
142
+ ### Test 5B-8: Crash Recovery — Resume Mid-Gate — PASS
143
+
144
+ **Question**: Can the gate resume from a partially-completed decisions.json?
145
+
146
+ **Method**:
147
+ 1. Pre-written partial decisions.json: 3 assumptions accepted, 2 of 3 decisions resolved
148
+ 2. Skill starts fresh (after crash/clear), re-reads detail brief from disk
149
+ 3. Detects partial completion, identifies unresolved items
150
+ 4. Presents resume message, continues from first unresolved
151
+
152
+ **Result**: PASS — Partial completion detected, resolved items not re-asked, correct resume count, continues from right decision, final file complete. See `test-5B-8-results.md`, `decisions/5B-8-partial-decisions.json`.
153
+
154
+ **Edge cases noted**: (1) Crash during Phase 1 batch write → re-present Phase 1. (2) stage1.md changed between sessions → warn and offer restart.
155
+
156
+ ### Test 5B-9: Crash Recovery — Gate Already Complete — PASS
157
+
158
+ **Question**: Does the skill skip the gate when decisions.json is complete but no blueprint exists?
159
+
160
+ **Method**:
161
+ 1. Pre-written complete decisions.json with `gate_completed` timestamp
162
+ 2. No blueprint at expected path
163
+ 3. Skill starts fresh
164
+
165
+ **Result**: PASS — Completed gate detected, summary presented, gate bypassed entirely, proceeds to Stage 2. See `test-5B-9-results.md`, `decisions/5B-9-complete-decisions.json`.
166
+
167
+ **Recommendation**: Add a "restart gate" option in the summary message as an escape hatch (don't build granular "edit one decision" initially).
168
+
169
+ ### Test 5B-10: Stage 2 Honoring decisions.json — PASS
170
+
171
+ **Question**: Does Stage 2 correctly consume decisions.json and honor all user overrides, promotions, and custom inputs?
172
+
173
+ **Method**:
174
+ 1. Use `stage1-with-assumptions.md` + `decisions/5B-3-promote-decisions.json` (richest test case)
175
+ 2. Run Stage 2 subagent with both files injected
176
+ 3. Verify blueprint honors: 2 promoted assumptions, 1 "Other" decision, 1 non-recommended pick, 1 recommended pick, 3 accepted defaults
177
+
178
+ **Result**: PASS — All 8 items from decisions.json correctly reflected in the blueprint. Promoted assumptions (A3→opus, A5→new naming) appear in all relevant blueprint locations. "Other" decision (D2) correctly interpreted and implemented with pseudocode. Non-recommended option (D3) correctly overrode specialist's recommendation. Three ungrounded design choices properly surfaced in Open Items. See `test-5B-10-results.md`, `blueprints/5B-10-stage2-with-decisions.md`.
179
+
180
+ ---
181
+
182
+ ## Implementation Recommendations (from test findings)
183
+
184
+ 1. **Phase 1 batch write must be atomic** — single Write tool call for all assumptions (5B-7, 5B-8)
185
+ 2. **multiSelect subset selection** — for 8+ triage, show 3-4 highest-risk decisions as AskUserQuestion options, rest accessible via "Other" (5B-5)
186
+ 3. **Promote-then-accept simplification** — if user promotes an assumption but picks the default, record as `"accepted"` (5B-3)
187
+ 4. **ID consistency check on resume** — verify decisions.json IDs match stage1.md IDs; warn if mismatched (5B-8)
188
+ 5. **"Restart gate" escape hatch** — offer in the completed-gate summary message (5B-9)
189
+
190
+ ---
191
+
192
+ ## Execution Strategy
193
+
194
+ **Test 5B-1** ran an actual subagent — validated that the specialist can produce the right format.
195
+
196
+ **Tests 5B-2 through 5B-9** are protocol simulations — manually walked through the gate steps to validate the protocol design and decisions.json format. These become the acceptance criteria for the actual detail skill when we build it.
197
+
198
+ ### Execution Order (completed)
199
+ 1. **5B-1** (specialist output format) — PASS
200
+ 2. **5B-6** (fallback) — PASS
201
+ 3. **5B-2, 5B-3** (Phase 1 variations) — PASS
202
+ 4. **5B-4, 5B-5** (Phase 2 variations) — PASS
203
+ 5. **5B-7** (incremental writes) — PASS
204
+ 6. **5B-8, 5B-9** (crash recovery) — PASS
205
+ 7. **5B-10** (Stage 2 consumes decisions.json) — PASS
206
+
207
+ ### What We Can Test Now vs Later
208
+ - **Now (protocol validation)**: All 10 tests COMPLETE
209
+ - **Later (when detail skill exists)**: Full automated runs of 5B-2 through 5B-9
210
+
211
+ ---
212
+
213
+ ## Relationship to Phase 5 Specialist Test
214
+
215
+ The specialist-test (`specialist-test/TEST_PLAN.md`) validates Stage 1 and Stage 2 **subagent quality** — can the specialist produce good research and good blueprints?
216
+
217
+ Phase 5B validates the **gate protocol between them** — does the user gate correctly parse, present, collect, persist, and recover?
218
+
219
+ Together they cover the complete detail phase pipeline: Stage 1 → gate → Stage 2.
@@ -0,0 +1,286 @@
1
+ # Blueprint: Build the Model Recommendation Engine
2
+
3
+ ## 1. Task Reference
4
+
5
+ - **Plan Task**: Task 3 — Build the Model Recommendation Engine
6
+ - **Type**: Greenfield — no prior implementation exists
7
+ - **Acceptance Criteria**:
8
+ - AC1: Skill reads `models/catalog.json` and `config/hardware-profile.json` to produce recommendations
9
+ - AC2: Models scored using weighted percentage formula (RAM 40%, VRAM 40%, context 20%)
10
+ - AC3: Use-case filtering narrows candidates by tag match (plus "general" tag inclusion)
11
+ - AC4: Output is a Markdown report listing all models above the "acceptable_fit" threshold
12
+ - AC5: Report uses the existing 3-tier rating system (excellent_fit, acceptable_fit, poor_fit)
13
+ - AC6: Report filename follows `recommendation_[use-case-slug].md` convention
14
+ - **Dependencies**: None — greenfield task, catalog and hardware profile already exist
15
+
16
+ ## 2. Research Findings
17
+
18
+ Source: Stage 1 exploration of project context.
19
+
20
+ | Finding | Source | Blueprint Impact |
21
+ |---------|--------|-----------------|
22
+ | Catalog field is `ram_requirement_gb` (not `ram_gb`) | `models/catalog.json` inspection | Scoring logic must reference the correct field name |
23
+ | 47 model entries with fields: `ollama_id`, `name`, `parameter_count`, `quantization`, `context_length`, `ram_requirement_gb`, `gpu_vram_gb` | `models/catalog.json` schema | Defines the data contract for model iteration |
24
+ | Hardware profile fields: `ram_gb`, `gpu_model`, `gpu_vram_gb`, `storage_available_gb` | `config/hardware-profile.json` | Defines the data contract for hardware comparison |
25
+ | No CPU field in hardware profile | `config/hardware-profile.json` | CPU-based inference scoring is out of scope |
26
+ | Context length varies 2048–128000 | `models/catalog.json` range analysis | Context scoring needs normalization across wide range |
27
+ | Existing reports use 3-tier rating: excellent_fit, acceptable_fit, poor_fit | `reports/model_eval_2026-02-15_llama3.md` | Output format must use this established convention (confirmed by A1) |
28
+ | Use-case tags exist on models: chat, code, creative, reasoning | Stage 1 ECD analysis | Filtering logic requires tag-based candidate narrowing |
29
+ | No existing recommendation logic | Stage 1 survey | Entire scoring and filtering system is new |
30
+
31
+ ## 3. Approach
32
+
33
+ ### Strategy: Single SKILL.md implementing filter-then-score-then-report pipeline
34
+
35
+ The skill operates as a three-phase pipeline:
36
+
37
+ 1. **Filter phase** — Narrow the 47-model catalog to candidates matching the user's use-case tag, always including models tagged "general" regardless of query (per D2 user override).
38
+ 2. **Score phase** — Compute a weighted percentage fit score for each candidate against the user's hardware profile: RAM fit 40%, VRAM fit 40%, context fit 20% (per D1).
39
+ 3. **Report phase** — Classify each scored model into the 3-tier rating system, filter out "poor_fit" models, and write a Markdown report containing all models at or above "acceptable_fit" (per D3).
40
+
41
+ ### Rationale
42
+
43
+ - **Filter-then-score** avoids wasted computation on irrelevant models and produces a focused candidate set.
44
+ - **Weighted percentage** (D1 choice A) produces a continuous score enabling fine-grained ranking within tiers, and allows per-dimension tuning.
45
+ - **Including "general" models** (D2 user override) ensures versatile models are never excluded by narrow tag matching.
46
+ - **All above threshold** (D3 choice C) respects the user's preference to see every viable option rather than an arbitrary top-N cutoff.
47
+
48
+ ## 4. Decisions Made
49
+
50
+ ### From User Decisions (decisions.json)
51
+
52
+ | ID | Decision | Resolution | Source |
53
+ |----|----------|------------|--------|
54
+ | A1 | Output Format Consistency | Use existing 3-tier rating system (excellent_fit, acceptable_fit, poor_fit) | Accepted — user confirmed default |
55
+ | A2 | Single-File Skill Structure | Implement as a single SKILL.md file | Accepted — user confirmed default |
56
+ | A3 | Model Selection for Execution | **Use `opus`** (not sonnet) | **Promoted** — user override: "wants deeper reasoning for model comparison analysis" |
57
+ | A4 | Hardware Profile Path | Read from `config/hardware-profile.json` | Accepted — user confirmed default |
58
+ | A5 | Report Naming Convention | **Use `recommendation_[use-case-slug].md`** (no date prefix) | **Promoted** — user override: "prefers simpler naming without dates" |
59
+ | D1 | Scoring Formula Approach | Weighted percentage — RAM 40%, VRAM 40%, context 20% | Option A chosen — user confirmed recommendation |
60
+ | D2 | Use-Case Filtering Strategy | Strict tag match **plus always include models tagged "general"** | **User "other"** — custom override combining strict filtering with general-tag inclusion |
61
+ | D3 | Top-N Presentation Count | Show **all models above "acceptable_fit" threshold** | Option C chosen — user selected over recommended option A |
62
+
63
+ ### Architectural Decisions (derived from above)
64
+
65
+ | Decision | Rationale | Traced To |
66
+ |----------|-----------|-----------|
67
+ | Fit score is a float 0.0–1.0 | Weighted percentage formula naturally produces a 0–1 range | D1 |
68
+ | Tier thresholds: excellent_fit >= 0.85, acceptable_fit >= 0.55, poor_fit < 0.55 | Three tiers need two boundaries; 0.85 captures models with significant headroom, 0.55 captures models that fit but are tight | A1 + D1 (see Open Items — thresholds not specified by Stage 1 or user) |
69
+ | Report written to `reports/` directory | Existing evaluation reports live in `reports/`; consistency with project conventions | Stage 1 finding (report location) |
70
+ | Use-case slug derived from query by lowercasing and replacing spaces with hyphens | Standard slug convention; matches project naming patterns | A5 |
71
+
72
+ ## 5. Deliverable Specification
73
+
74
+ ### 5.1 File to Produce
75
+
76
+ **File**: `skills/model-recommender/SKILL.md`
77
+ **Type**: Claude Code skill definition (single Markdown file with all behavioral instructions)
78
+
79
+ ### 5.2 Skill Metadata Block
80
+
81
+ ```yaml
82
+ ---
83
+ model: opus
84
+ description: Recommends AI models from the catalog based on user hardware and use-case
85
+ ```
86
+
87
+ - Model is `opus` per A3 user override.
88
+
89
+ ### 5.3 Skill Invocation
90
+
91
+ The skill is invoked by the user with a use-case query. The SKILL.md must instruct Claude to:
92
+
93
+ 1. Ask the user: "What use case do you need a model for?" if not provided as an argument.
94
+ 2. Accept free-text input and map it to one of the known use-case tags: `chat`, `code`, `creative`, `reasoning`.
95
+ 3. If the input does not clearly map to a tag, ask the user to clarify by presenting the available tags.
96
+
97
+ ### 5.4 Data Contracts
98
+
99
+ #### Input: `models/catalog.json`
100
+
101
+ Expected structure (array of objects):
102
+
103
+ ```json
104
+ [
105
+ {
106
+ "ollama_id": "string",
107
+ "name": "string",
108
+ "parameter_count": "number (billions)",
109
+ "quantization": "string (e.g., Q4_K_M)",
110
+ "context_length": "number",
111
+ "ram_requirement_gb": "number",
112
+ "gpu_vram_gb": "number",
113
+ "use_case_tags": ["string"]
114
+ }
115
+ ]
116
+ ```
117
+
118
+ Note: The RAM field is `ram_requirement_gb` (not `ram_gb`). The producer MUST use this exact field name.
119
+
120
+ #### Input: `config/hardware-profile.json`
121
+
122
+ Expected structure:
123
+
124
+ ```json
125
+ {
126
+ "ram_gb": "number",
127
+ "gpu_model": "string",
128
+ "gpu_vram_gb": "number",
129
+ "storage_available_gb": "number"
130
+ }
131
+ ```
132
+
133
+ #### Output: `reports/recommendation_[use-case-slug].md`
134
+
135
+ Filename convention per A5 user override. The `[use-case-slug]` is the use-case tag in lowercase (e.g., `recommendation_code.md`, `recommendation_chat.md`).
136
+
137
+ ### 5.5 Filtering Logic (D2 — User Override)
138
+
139
+ ```
140
+ candidates = []
141
+ for each model in catalog:
142
+ if model.use_case_tags contains requested_tag:
143
+ add model to candidates
144
+ else if model.use_case_tags contains "general":
145
+ add model to candidates
146
+ ```
147
+
148
+ - Strict tag match: model must have the exact requested tag in its `use_case_tags` array.
149
+ - General inclusion: models tagged "general" are always included regardless of the requested use-case.
150
+ - If no models match after filtering (edge case): fall back to scoring ALL models in the catalog and note in the report that no models matched the use-case filter.
151
+
152
+ ### 5.6 Scoring Formula (D1 — Option A)
153
+
154
+ For each candidate model, compute a fit score as a weighted percentage:
155
+
156
+ ```
157
+ ram_ratio = min(hardware.ram_gb / model.ram_requirement_gb, 1.0)
158
+ vram_ratio = min(hardware.gpu_vram_gb / model.gpu_vram_gb, 1.0)
159
+ context_ratio = model.context_length / 128000
160
+
161
+ fit_score = (ram_ratio * 0.40) + (vram_ratio * 0.40) + (context_ratio * 0.20)
162
+ ```
163
+
164
+ **Dimension details:**
165
+
166
+ - **RAM ratio** (weight 0.40): `hardware.ram_gb / model.ram_requirement_gb`, capped at 1.0. A ratio of 1.0 means the hardware meets or exceeds the requirement. A ratio below 1.0 means the model may not run or will swap.
167
+ - **VRAM ratio** (weight 0.40): `hardware.gpu_vram_gb / model.gpu_vram_gb`, capped at 1.0. Same semantics as RAM ratio. Models with `gpu_vram_gb` of 0 or null get a VRAM ratio of 1.0 (CPU-only model, no VRAM needed).
168
+ - **Context ratio** (weight 0.20): `model.context_length / 128000`. Not capped — 128000 is the observed maximum in the catalog, so this normalizes to a 0.0–1.0 range. Rewards models with larger context windows.
169
+
170
+ **Fit score range**: 0.0 to 1.0.
171
+
172
+ ### 5.7 Tier Classification (A1)
173
+
174
+ | Tier | Score Range | Meaning |
175
+ |------|------------|---------|
176
+ | `excellent_fit` | >= 0.85 | Hardware comfortably exceeds model requirements with headroom |
177
+ | `acceptable_fit` | >= 0.55 and < 0.85 | Hardware meets model requirements but with limited headroom |
178
+ | `poor_fit` | < 0.55 | Hardware insufficient or severely constrained for this model |
179
+
180
+ ### 5.8 Report Generation (D3 — Option C)
181
+
182
+ The report includes **all models that score at or above the "acceptable_fit" threshold** (fit_score >= 0.55). Models classified as "poor_fit" are excluded from the report entirely.
183
+
184
+ #### Report Structure
185
+
186
+ ```markdown
187
+ # Model Recommendation: [Use Case]
188
+
189
+ **Hardware Profile**: [gpu_model] | [ram_gb] GB RAM | [gpu_vram_gb] GB VRAM
190
+ **Use Case**: [requested use-case tag]
191
+ **Models Evaluated**: [total candidates after filtering]
192
+ **Models Recommended**: [count of models at or above acceptable_fit]
193
+
194
+ ## Excellent Fit
195
+
196
+ | Model | Parameters | Quantization | Context | RAM Required | VRAM Required | Score |
197
+ |-------|-----------|--------------|---------|-------------|---------------|-------|
198
+ | [name] | [parameter_count]B | [quantization] | [context_length] | [ram_requirement_gb] GB | [gpu_vram_gb] GB | [fit_score as percentage] |
199
+
200
+ [For each excellent_fit model, a 1–2 sentence explanation of why it fits well.]
201
+
202
+ ## Acceptable Fit
203
+
204
+ | Model | Parameters | Quantization | Context | RAM Required | VRAM Required | Score |
205
+ |-------|-----------|--------------|---------|-------------|---------------|-------|
206
+ | [name] | [parameter_count]B | [quantization] | [context_length] | [ram_requirement_gb] GB | [gpu_vram_gb] GB | [fit_score as percentage] |
207
+
208
+ [For each acceptable_fit model, a 1–2 sentence explanation noting the constraints.]
209
+
210
+ ## Summary
211
+
212
+ [2–3 sentence summary: top recommendation, key trade-offs, any notable exclusions.]
213
+ ```
214
+
215
+ - Models within each tier are sorted by fit_score descending (highest score first).
216
+ - If a tier has zero models, omit that section entirely.
217
+ - If zero models meet the acceptable_fit threshold, the report states: "No models in the catalog meet the minimum fit threshold for your hardware. Consider upgrading RAM or VRAM, or reducing context length requirements."
218
+ - Fit scores displayed as percentages (e.g., 0.87 shown as "87%").
219
+
220
+ ### 5.9 Edge Cases
221
+
222
+ | Edge Case | Handling |
223
+ |-----------|----------|
224
+ | No models match use-case filter | Score ALL catalog models, include note in report header: "No models tagged for [use-case]. Showing general recommendations." |
225
+ | Model has `gpu_vram_gb` of 0 or null | Treat VRAM ratio as 1.0 (model does not require GPU) |
226
+ | Model has `ram_requirement_gb` of 0 or null | Skip model, log warning (invalid catalog entry) |
227
+ | Hardware profile missing or unreadable | Report error to user: "Cannot read hardware profile at config/hardware-profile.json. Run /intuition-initialize to set up your hardware profile." |
228
+ | Catalog missing or unreadable | Report error to user: "Cannot read model catalog at models/catalog.json." |
229
+ | Use-case query ambiguous | Present available tags and ask user to select one |
230
+
231
+ ### 5.10 SKILL.md Structure
232
+
233
+ The SKILL.md file must contain the following sections in this order:
234
+
235
+ 1. **YAML frontmatter** — model: opus, description
236
+ 2. **Critical rules block** — MUST/NEVER directives at the top (per MEMORY.md skill-writing rules)
237
+ 3. **Purpose** — one-sentence functional description
238
+ 4. **Invocation instructions** — how to obtain the use-case from the user
239
+ 5. **Data loading instructions** — read catalog and hardware profile, with error handling
240
+ 6. **Filtering logic** — strict tag match + general inclusion (full pseudocode from 5.5)
241
+ 7. **Scoring logic** — weighted percentage formula (full pseudocode from 5.6)
242
+ 8. **Tier classification** — thresholds table from 5.7
243
+ 9. **Report generation** — output template from 5.8, written to `reports/recommendation_[slug].md`
244
+ 10. **Edge case handling** — table from 5.9
245
+ 11. **Post-completion** — inform user of report location, offer to open it
246
+
247
+ All behavioral instructions written as imperative directives to Claude (second-person: "You MUST"), not user-facing documentation. Per A2, everything is in this single file.
248
+
249
+ ## 6. Acceptance Mapping
250
+
251
+ | Criterion | Blueprint Section | How Addressed |
252
+ |-----------|------------------|---------------|
253
+ | AC1: Reads catalog.json and hardware-profile.json | 5.4 Data Contracts | Exact file paths and field names specified; error handling for missing files |
254
+ | AC2: Weighted percentage scoring (40/40/20) | 5.6 Scoring Formula | Complete formula with per-dimension calculation, capping, and edge cases |
255
+ | AC3: Use-case filtering with general inclusion | 5.5 Filtering Logic | Pseudocode covers strict match + general tag; fallback for zero matches |
256
+ | AC4: Markdown report with all above-threshold models | 5.8 Report Generation | Full report template; D3 option C implemented (all above acceptable_fit) |
257
+ | AC5: 3-tier rating system | 5.7 Tier Classification | Thresholds mapped to established tier names from existing reports |
258
+ | AC6: Correct filename convention | 5.4 Output specification | `recommendation_[use-case-slug].md` per A5 user override |
259
+
260
+ ## 7. Integration Points
261
+
262
+ | Integration | Direction | Details |
263
+ |-------------|-----------|---------|
264
+ | `models/catalog.json` | Read | Skill reads but never writes; catalog managed externally |
265
+ | `config/hardware-profile.json` | Read | Skill reads but never writes; profile set by `/intuition-initialize` |
266
+ | `reports/` directory | Write | Skill writes recommendation report here; consistent with existing `model_eval_*` reports |
267
+ | Existing evaluation reports | Reference | Report format maintains consistency with `model_eval_2026-02-15_llama3.md` 3-tier system |
268
+ | `/intuition-initialize` | Dependency | Hardware profile must exist before this skill runs; error message references initialize if missing |
269
+
270
+ ## 8. Open Items
271
+
272
+ | Item | Context | Impact |
273
+ |------|---------|--------|
274
+ | Tier threshold values (0.85, 0.55) | Stage 1 established the 3-tier system but did not specify numeric boundaries. User decisions did not address thresholds. Values 0.85 and 0.55 are design choices made in this blueprint based on the scoring formula's 0–1 range, targeting roughly top-15% as excellent and top-60% as acceptable. | Low — thresholds can be tuned after initial deployment by editing SKILL.md. If real-world results show poor tier distribution, adjust boundaries. |
275
+ | "general" tag existence in catalog | D2 user override assumes models in the catalog have a "general" tag. Stage 1 identified tags (chat, code, creative, reasoning) but did not confirm "general" exists as an actual tag value. | Medium — if no models are tagged "general", the D2 override has no effect. Producer should verify catalog contains "general" tags; if not, flag to user. |
276
+ | `use_case_tags` field name | Stage 1 listed model fields but did not explicitly confirm the array field name for use-case tags. This blueprint assumes `use_case_tags` based on ECD analysis. | Medium — producer must verify exact field name in catalog.json. |
277
+
278
+ ## 9. Producer Handoff
279
+
280
+ - **Target Producer**: Code Writer
281
+ - **Execution Model**: `opus` (per A3 user override)
282
+ - **Output Format**: Single Markdown file (Claude Code SKILL.md)
283
+ - **Output Filename**: `SKILL.md`
284
+ - **Output Directory**: `skills/model-recommender/`
285
+ - **Full Output Path**: `skills/model-recommender/SKILL.md`
286
+ - **Instructions to Producer**: Implement the SKILL.md exactly as specified in Section 5. All behavioral instructions must be imperative directives to Claude. The YAML frontmatter must specify `model: opus`. Follow the SKILL.md writing rules from project conventions: critical rules at top, MUST/NEVER/ALWAYS for non-negotiable behaviors, under 500 lines. Do not add any scoring logic, filtering behavior, or report structure beyond what this blueprint specifies. The three open items in Section 8 require verification during implementation — read `models/catalog.json` to confirm the exact field name for use-case tags and whether "general" exists as a tag value.
@@ -0,0 +1,68 @@
1
+ {
2
+ "specialist": "code-architect",
3
+ "gate_started": "2026-02-27T16:10:00Z",
4
+ "gate_completed": "2026-02-27T16:14:00Z",
5
+ "assumptions": [
6
+ {
7
+ "id": "A1",
8
+ "title": "Output Format Consistency",
9
+ "default": "Use existing 3-tier rating system (excellent_fit, acceptable_fit, poor_fit)",
10
+ "status": "accepted",
11
+ "user_override": null
12
+ },
13
+ {
14
+ "id": "A2",
15
+ "title": "Single-File Skill Structure",
16
+ "default": "Implement as a single SKILL.md file",
17
+ "status": "accepted",
18
+ "user_override": null
19
+ },
20
+ {
21
+ "id": "A3",
22
+ "title": "Model Selection for Execution",
23
+ "default": "Use sonnet as the execution model",
24
+ "status": "accepted",
25
+ "user_override": null
26
+ },
27
+ {
28
+ "id": "A4",
29
+ "title": "Hardware Profile Path",
30
+ "default": "Read hardware profile from config/hardware-profile.json",
31
+ "status": "accepted",
32
+ "user_override": null
33
+ },
34
+ {
35
+ "id": "A5",
36
+ "title": "Report Naming Convention",
37
+ "default": "model_rec_YYYY-MM-DD_[use-case-slug].md",
38
+ "status": "accepted",
39
+ "user_override": null
40
+ }
41
+ ],
42
+ "decisions": [
43
+ {
44
+ "id": "D1",
45
+ "title": "Scoring Formula Approach",
46
+ "context": "Need to rank 47 models against user hardware. RAM, VRAM, and context length are the key dimensions.",
47
+ "options": ["A: Weighted percentage — RAM 40%, VRAM 40%, context 20% (recommended)", "B: Binary pass/fail per dimension, rank by headroom", "C: Single composite ratio averaged across dimensions"],
48
+ "chosen": "A",
49
+ "user_input": null
50
+ },
51
+ {
52
+ "id": "D2",
53
+ "title": "Use-Case Filtering Strategy",
54
+ "context": "Models have use-case tags (chat, code, creative, reasoning). User provides a query like 'I need a coding model'.",
55
+ "options": ["A: Strict tag match (recommended)", "B: Fuzzy match — tagged first, then 'might also work'"],
56
+ "chosen": "B",
57
+ "user_input": null
58
+ },
59
+ {
60
+ "id": "D3",
61
+ "title": "Top-N Presentation Count",
62
+ "context": "Need to decide how many models to show in the recommendation report.",
63
+ "options": ["A: Top 5 models (recommended)", "B: Top 3 models", "C: All models above acceptable_fit threshold"],
64
+ "chosen": "A",
65
+ "user_input": null
66
+ }
67
+ ]
68
+ }
@@ -0,0 +1,70 @@
1
+ {
2
+ "specialist": "code-architect",
3
+ "gate_started": "2026-02-27T16:20:00Z",
4
+ "gate_completed": "2026-02-27T16:28:00Z",
5
+ "assumptions": [
6
+ {
7
+ "id": "A1",
8
+ "title": "Output Format Consistency",
9
+ "default": "Use existing 3-tier rating system (excellent_fit, acceptable_fit, poor_fit)",
10
+ "status": "accepted",
11
+ "user_override": null
12
+ },
13
+ {
14
+ "id": "A2",
15
+ "title": "Single-File Skill Structure",
16
+ "default": "Implement as a single SKILL.md file",
17
+ "status": "accepted",
18
+ "user_override": null
19
+ },
20
+ {
21
+ "id": "A3",
22
+ "title": "Model Selection for Execution",
23
+ "default": "Use sonnet as the execution model",
24
+ "status": "promoted",
25
+ "user_override": "opus",
26
+ "rationale": "User wants deeper reasoning for model comparison analysis"
27
+ },
28
+ {
29
+ "id": "A4",
30
+ "title": "Hardware Profile Path",
31
+ "default": "Read hardware profile from config/hardware-profile.json",
32
+ "status": "accepted",
33
+ "user_override": null
34
+ },
35
+ {
36
+ "id": "A5",
37
+ "title": "Report Naming Convention",
38
+ "default": "model_rec_YYYY-MM-DD_[use-case-slug].md",
39
+ "status": "promoted",
40
+ "user_override": "recommendation_[use-case-slug].md",
41
+ "rationale": "User prefers simpler naming without dates"
42
+ }
43
+ ],
44
+ "decisions": [
45
+ {
46
+ "id": "D1",
47
+ "title": "Scoring Formula Approach",
48
+ "context": "Need to rank 47 models against user hardware. RAM, VRAM, and context length are the key dimensions.",
49
+ "options": ["A: Weighted percentage — RAM 40%, VRAM 40%, context 20% (recommended)", "B: Binary pass/fail per dimension, rank by headroom", "C: Single composite ratio averaged across dimensions"],
50
+ "chosen": "A",
51
+ "user_input": null
52
+ },
53
+ {
54
+ "id": "D2",
55
+ "title": "Use-Case Filtering Strategy",
56
+ "context": "Models have use-case tags (chat, code, creative, reasoning). User provides a query like 'I need a coding model'.",
57
+ "options": ["A: Strict tag match (recommended)", "B: Fuzzy match — tagged first, then 'might also work'"],
58
+ "chosen": "other",
59
+ "user_input": "Use strict tag match but also include models tagged as 'general' regardless of the query"
60
+ },
61
+ {
62
+ "id": "D3",
63
+ "title": "Top-N Presentation Count",
64
+ "context": "Need to decide how many models to show in the recommendation report.",
65
+ "options": ["A: Top 5 models (recommended)", "B: Top 3 models", "C: All models above acceptable_fit threshold"],
66
+ "chosen": "C",
67
+ "user_input": null
68
+ }
69
+ ]
70
+ }