openhermes 4.1.0 → 4.9.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99) hide show
  1. package/CONTEXT.md +9 -0
  2. package/ETHOS.md +6 -3
  3. package/LICENSE +21 -21
  4. package/README.md +120 -79
  5. package/bootstrap.ts +284 -41
  6. package/harness/agents/oh-browser.md +97 -0
  7. package/harness/agents/oh-builder.md +78 -0
  8. package/harness/agents/oh-facade.md +75 -0
  9. package/harness/agents/oh-fusion.md +45 -0
  10. package/harness/agents/oh-gauntlet.md +71 -0
  11. package/harness/agents/oh-grill.md +71 -0
  12. package/harness/agents/oh-investigate.md +60 -0
  13. package/harness/agents/oh-manifest.md +95 -0
  14. package/harness/agents/oh-plan-review.md +40 -0
  15. package/harness/agents/oh-planner.md +50 -0
  16. package/harness/agents/oh-refactor.md +37 -0
  17. package/harness/agents/oh-retro.md +46 -0
  18. package/harness/agents/oh-review.md +85 -0
  19. package/harness/agents/oh-security.md +83 -0
  20. package/harness/agents/oh-ship.md +76 -0
  21. package/harness/agents/oh-skill-craft.md +38 -0
  22. package/harness/agents/openhermes.md +106 -62
  23. package/harness/codex/AUTOPILOT.md +178 -0
  24. package/harness/codex/CHARTER.md +81 -0
  25. package/harness/commands/oh-doctor.md +193 -14
  26. package/harness/commands/oh-log.md +18 -0
  27. package/harness/instructions/SHELL.md +76 -0
  28. package/harness/skills/oh-ascii/DEEP.md +292 -0
  29. package/harness/skills/oh-ascii/SKILL.md +31 -0
  30. package/harness/skills/oh-ascii/scripts/check_ascii_alignment.py +596 -0
  31. package/harness/skills/oh-browser/DEEP.md +54 -0
  32. package/harness/skills/oh-browser/SKILL.md +30 -0
  33. package/harness/skills/oh-builder/DEEP.md +63 -0
  34. package/harness/skills/oh-builder/SKILL.md +16 -89
  35. package/harness/skills/oh-expert/DEEP.md +85 -0
  36. package/harness/skills/oh-expert/SKILL.md +19 -106
  37. package/harness/skills/oh-facade/DEEP.md +182 -0
  38. package/harness/skills/oh-facade/SKILL.md +34 -0
  39. package/harness/skills/oh-freeze/DEEP.md +18 -0
  40. package/harness/skills/oh-freeze/SKILL.md +15 -15
  41. package/harness/skills/oh-full-output/DEEP.md +25 -0
  42. package/harness/skills/oh-full-output/SKILL.md +28 -0
  43. package/harness/skills/oh-fusion/DEEP.md +120 -0
  44. package/harness/skills/oh-fusion/SKILL.md +36 -0
  45. package/harness/skills/oh-gauntlet/DEEP.md +77 -0
  46. package/harness/skills/oh-gauntlet/SKILL.md +17 -105
  47. package/harness/skills/oh-grill/DEEP.md +51 -0
  48. package/harness/skills/oh-grill/SKILL.md +16 -63
  49. package/harness/skills/oh-guard/DEEP.md +19 -0
  50. package/harness/skills/oh-guard/SKILL.md +15 -20
  51. package/harness/skills/oh-handoff/DEEP.md +48 -0
  52. package/harness/skills/oh-handoff/SKILL.md +18 -19
  53. package/harness/skills/oh-health/DEEP.md +74 -0
  54. package/harness/skills/oh-health/SKILL.md +17 -76
  55. package/harness/skills/oh-init/DEEP.md +85 -0
  56. package/harness/skills/oh-init/SKILL.md +17 -197
  57. package/harness/skills/oh-investigate/DEEP.md +171 -0
  58. package/harness/skills/oh-investigate/SKILL.md +18 -61
  59. package/harness/skills/oh-issue/DEEP.md +21 -0
  60. package/harness/skills/oh-issue/SKILL.md +16 -23
  61. package/harness/skills/oh-learn/DEEP.md +44 -0
  62. package/harness/skills/oh-learn/SKILL.md +17 -79
  63. package/harness/skills/oh-manifest/DEEP.md +92 -0
  64. package/harness/skills/oh-manifest/SKILL.md +15 -107
  65. package/harness/skills/oh-plan-review/DEEP.md +90 -0
  66. package/harness/skills/oh-plan-review/SKILL.md +19 -114
  67. package/harness/skills/oh-planner/DEEP.md +172 -0
  68. package/harness/skills/oh-planner/SKILL.md +16 -143
  69. package/harness/skills/oh-prd/DEEP.md +45 -0
  70. package/harness/skills/oh-prd/SKILL.md +15 -22
  71. package/harness/skills/oh-refactor/DEEP.md +122 -0
  72. package/harness/skills/oh-refactor/SKILL.md +33 -0
  73. package/harness/skills/oh-retro/DEEP.md +26 -0
  74. package/harness/skills/oh-retro/SKILL.md +17 -20
  75. package/harness/skills/oh-review/DEEP.md +87 -0
  76. package/harness/skills/oh-review/SKILL.md +17 -96
  77. package/harness/skills/oh-security/DEEP.md +83 -0
  78. package/harness/skills/oh-security/SKILL.md +18 -96
  79. package/harness/skills/oh-ship/DEEP.md +141 -0
  80. package/harness/skills/oh-ship/SKILL.md +18 -26
  81. package/harness/skills/oh-skill-craft/DEEP.md +369 -0
  82. package/harness/skills/oh-skill-craft/SKILL.md +20 -93
  83. package/harness/skills/oh-skills-link/DEEP.md +16 -0
  84. package/harness/skills/oh-skills-link/SKILL.md +15 -16
  85. package/harness/skills/oh-skills-list/DEEP.md +20 -0
  86. package/harness/skills/oh-skills-list/SKILL.md +14 -18
  87. package/harness/skills/oh-triage/DEEP.md +23 -0
  88. package/harness/skills/oh-triage/SKILL.md +15 -20
  89. package/harness/skills/oh-worktree/DEEP.md +169 -0
  90. package/harness/skills/oh-worktree/SKILL.md +32 -0
  91. package/lib/harness-resolver.ts +10 -12
  92. package/package.json +9 -4
  93. package/scripts/count-tokens.mjs +158 -0
  94. package/scripts/oh-doctor.ps1 +342 -0
  95. package/harness/codex/CONSTITUTION.md +0 -70
  96. package/harness/codex/ROUTING.md +0 -127
  97. package/harness/instructions/RUNTIME.md +0 -55
  98. package/harness/skills/oh-caveman/SKILL.md +0 -33
  99. package/lib/logger.ts +0 -69
@@ -0,0 +1,171 @@
1
+ # oh-investigate — Deep Reference
2
+
3
+ ## The Iron Law
4
+
5
+ > **NO FIXES WITHOUT ROOT CAUSE INVESTIGATION FIRST. Surface fixes compound into technical debt.**
6
+
7
+ If you haven't completed root cause investigation, you cannot propose fixes. Violating this process is violating the spirit of debugging.
8
+
9
+ ## Phase 0 — Build a feedback loop
10
+
11
+ **This is the actual skill. Everything else is mechanical.**
12
+
13
+ A fast, deterministic, agent-runnable pass/fail signal = you find the cause. Without one, staring at code won't save you.
14
+
15
+ ### Ways to construct a loop (try in order)
16
+ 1. Failing test at the bug's seam
17
+ 2. Curl/HTTP script against dev server
18
+ 3. CLI invocation + fixture, diff stdout
19
+ 4. Headless browser — assert on DOM/console/network
20
+ 5. Replay captured trace in isolation
21
+ 6. Throwaway harness — minimal subset exercising the bug path
22
+ 7. Property/fuzz loop — 1000 random inputs
23
+ 8. Bisection harness — `git bisect run`-able
24
+ 9. Differential loop — old vs new version output diff
25
+ 10. HITL script — drive human with structured loop
26
+
27
+ **Sharpen the loop:** Faster? Sharper signal (specific symptom, not "didn't crash")? More deterministic (pin time, seed RNG, isolate FS)? A 2s deterministic loop is a superpower.
28
+
29
+ **Non-deterministic:** Goal = higher reproduction rate. Loop 100×, parallelize, add stress. 50% flake is debuggable; 1% is not.
30
+
31
+ **Cannot build a loop?** Stop. Say so. List what you tried. Do NOT hypothesise without a loop.
32
+
33
+ ## Workflow
34
+
35
+ Complete each phase before proceeding. Each phase consumes the feedback loop built in Phase 0.
36
+
37
+ ### Phase 1 — Root Cause Investigation
38
+
39
+ **Before attempting ANY fix:**
40
+
41
+ 1. **Reproduce** — Loop confirms the described failure. Exact steps? Every time?
42
+ 2. **Read Error Messages** — Read stack traces completely. Note line numbers and error codes.
43
+ 3. **Check Recent Changes** — Git diff, recent commits, new dependencies, env differences.
44
+ 4. **Minimise** — Strip unrelated code. Remove noise until only the failure path remains.
45
+ 5. **Gather Evidence** — One probe per hypothesis. Change one variable. Use unique debug prefixes.
46
+ 6. **Trace Data Flow** — If error is deep in call stack, trace backward.
47
+
48
+ ### Phase 2 — Pattern Analysis
49
+
50
+ **Find the pattern before fixing:**
51
+
52
+ 1. **Find Working Examples** — Locate similar working code. What works that's analogous?
53
+ 2. **Compare Against References** — Read reference implementation completely. Don't skim.
54
+ 3. **Identify Differences** — List every difference between working and broken. Don't dismiss anything.
55
+ 4. **Understand Dependencies** — What components, config, or environment does this depend on?
56
+
57
+ ### Phase 3 — Hypothesis & Testing
58
+
59
+ **Scientific method:**
60
+
61
+ 1. **Form Single Hypothesis** — "I think X is root cause because Y." Be specific, not vague.
62
+ 2. **Test Minimally** — Smallest change to test hypothesis. One variable. Don't fix multiple things.
63
+ 3. **Verify** — Prediction held? → Phase 4. No → new hypothesis. DON'T stack more fixes.
64
+
65
+ ### Phase 4 — Implementation
66
+
67
+ **Fix root cause, not symptom:**
68
+
69
+ 1. **Create Failing Test** — Simplest reproduction. Automated if possible. Must fail before fix.
70
+ 2. **Implement Single Fix** — Address root cause. ONE change. No "while I'm here" improvements.
71
+ 3. **Verify Fix** — Failing test passes? No other tests broken? Phase 0 loop confirms resolution?
72
+ 4. **Regression Test** — Verify existing behavior. No regression seam = architecture gap (flag it).
73
+ 5. **Document** — Log root cause + fix. State which hypothesis was correct. What was the trigger?
74
+
75
+ ## Root Cause Tracing
76
+
77
+ Bugs manifest deep in call stacks. Fixing at the symptom treats the wrong layer. **Trace backward through the call chain to find the original trigger.**
78
+
79
+ 1. **Observe symptom** — Error at point of failure.
80
+ 2. **Find immediate cause** — What code directly produces this error?
81
+ 3. **What called this?** — Step one level up the call chain.
82
+ 4. **Keep tracing up** — What value was passed? Where from?
83
+ 5. **Find original trigger** — Root source of bad state. Fix here, not at symptom.
84
+
85
+ **Stack trace instrumentation:**
86
+ ```
87
+ const stack = new Error().stack;
88
+ console.error('DEBUG <component>:', { directory, cwd, stack });
89
+ ```
90
+ Use `console.error()` (logger may be suppressed in tests). Grep output. **Never fix just where the error appears** — trace back and add validation at each layer.
91
+
92
+ ## Multi-Component Diagnostics
93
+
94
+ **In multi-component systems (CI → build → signing, API → service → database), add instrumentation at each boundary BEFORE proposing fixes:**
95
+
96
+ - Log data entering and exiting each component
97
+ - Verify environment/config propagation across layers
98
+ - Check state at each layer
99
+
100
+ Run once to gather evidence, identify the failing component, THEN investigate it.
101
+
102
+ **Example (build pipeline):** Layer 1 (workflow → secrets?), Layer 2 (build → env vars?), Layer 3 (signing → keychain?), Layer 4 (actual signing). Reveals which layer fails in one pass.
103
+
104
+ ## Red Flags
105
+
106
+ **If you catch yourself thinking any of these, STOP. Return to Phase 1.**
107
+
108
+ - "Quick fix for now, investigate later"
109
+ - "Just try changing X and see if it works"
110
+ - "Add multiple changes, run tests"
111
+ - "Skip the test, I'll manually verify"
112
+ - "It's probably X, let me fix that"
113
+ - "I don't fully understand but this might work"
114
+ - "Pattern says X but I'll adapt it differently"
115
+ - Proposing solutions before tracing data flow
116
+ - "One more fix attempt" (when already tried 2+)
117
+ - Each fix reveals new problem in different place
118
+ - "Here are the main problems: [lists fixes without investigation]"
119
+ - "Issue is simple, don't need the full process"
120
+
121
+ **All of these mean: STOP. Return to investigation.**
122
+
123
+ ## 3+ Fix Failure Rule
124
+
125
+ **After 3 failed fix attempts, STOP and question the architecture.**
126
+
127
+ Three failed fixes signal an architectural problem:
128
+ - Each fix reveals new shared state/coupling in a different place
129
+ - Fixes require "massive refactoring" to implement
130
+ - Each fix creates new symptoms elsewhere
131
+
132
+ **Do not attempt Fix #4 without architectural discussion:**
133
+ - Is this pattern fundamentally sound?
134
+ - Are we sticking with it through inertia?
135
+ - Should we refactor architecture vs. continue fixing symptoms?
136
+
137
+ This is NOT a failed hypothesis — this is a wrong architecture.
138
+
139
+ ## Partner Signal Monitoring
140
+
141
+ **When your human partner says these, they mean you're guessing, not debugging:**
142
+
143
+ | Phrase | Meaning |
144
+ |--------|---------|
145
+ | "Is that happening?" | You assumed without verifying |
146
+ | "Will it show us...?" | You skipped evidence gathering |
147
+ | "Stop guessing" | You're proposing fixes without understanding |
148
+ | "We're stuck?" | Your approach isn't working |
149
+
150
+ **When you see any of these: STOP. Return to Phase 1.**
151
+
152
+ ## Common Rationalizations
153
+
154
+ | Excuse | Reality |
155
+ |--------|---------|
156
+ | "Issue is simple, don't need process" | Simple bugs have root causes too. Process is fast. |
157
+ | "Emergency, no time for process" | Systematic is FASTER than guess-and-check thrashing. |
158
+ | "Just try this first, then investigate" | First fix sets the pattern. Do it right from start. |
159
+ | "I'll write test after confirming fix works" | Untested fixes don't stick. Test first proves bug. |
160
+ | "Multiple fixes at once saves time" | Can't isolate what worked. Causes new bugs. |
161
+ | "Reference too long, I'll adapt the pattern" | Partial understanding guarantees bugs. Read fully. |
162
+ | "I see the problem, let me fix it" | Seeing symptoms ≠ understanding root cause. |
163
+ | "One more fix attempt" (after 2+ failures) | 3+ failures = architectural problem. Stop fixing. |
164
+
165
+ ## Anti-patterns
166
+
167
+ - Fixing symptoms (same bug reappears)
168
+ - Changing code without reproducing
169
+ - Shotgun debugging (multiple changes hoping one sticks)
170
+ - Not documenting root cause
171
+ - Hypothesizing without a feedback loop
@@ -1,74 +1,31 @@
1
1
  ---
2
2
  name: oh-investigate
3
- description: "Systematic bug diagnosis with root cause investigation"
3
+ description: "Use when debugging any bug, test failure, or unexpected behavior. Finds root cause systematically before attempting fixes."
4
+ tier: 2
5
+ route:
6
+ pass: oh-builder
7
+ fail: oh-expert
8
+ blocker: surface
4
9
  ---
5
10
 
6
11
  # oh-investigate
7
12
 
8
- ## When to Use
9
- When a bug is reported, a test fails, or unexpected behavior occurs. Use this before attempting any fix.
13
+ Systematic bug diagnosis: build a feedback loop, trace root cause, fix with evidence.
10
14
 
11
- ## Phase 0 — Build a feedback loop
15
+ ## Steps
12
16
 
13
- **This is the actual skill. Everything else is mechanical.**
14
-
15
- If you have a fast, deterministic, agent-runnable pass/fail signal for the bug, you will find the cause — bisection, hypothesis-testing, and instrumentation are just consuming that signal. If you don't have one, no amount of staring at code will save you.
16
-
17
- Spend disproportionate effort here. **Be aggressive. Be creative. Refuse to give up.**
18
-
19
- ### Ways to construct a feedback loop (try in this order)
20
-
21
- 1. **Failing test** at whatever seam reaches the bug.
22
- 2. **Curl / HTTP script** against a running dev server.
23
- 3. **CLI invocation** with a fixture input, diffing stdout against a known-good snapshot.
24
- 4. **Headless browser script** — drive the UI, assert on DOM/console/network.
25
- 5. **Replay a captured trace** — save a real payload/event log, replay it in isolation.
26
- 6. **Throwaway harness** — minimal subset of the system exercising the bug code path with a single call.
27
- 7. **Property / fuzz loop** — run 1000 random inputs, look for the failure mode.
28
- 8. **Bisection harness** — automate "boot at state X, check, repeat" so you can `git bisect run` it.
29
- 9. **Differential loop** — run same input through old-version vs new-version, diff outputs.
30
- 10. **HITL script** — last resort. Drive a human with a structured loop.
31
-
32
- ### Iterate on the loop itself
33
-
34
- - Can I make it faster? (Cache setup, skip unrelated init, narrow the scope.)
35
- - Can I make the signal sharper? (Assert on the specific symptom, not "didn't crash".)
36
- - Can I make it more deterministic? (Pin time, seed RNG, isolate filesystem.)
37
-
38
- A 30-second flaky loop is barely better than no loop. A 2-second deterministic loop is a debugging superpower.
39
-
40
- ### Non-deterministic bugs
41
-
42
- The goal is not a clean repro but a **higher reproduction rate**. Loop the trigger 100×, parallelise, add stress, narrow timing windows. A 50%-flake bug is debuggable; 1% is not.
43
-
44
- ### When you genuinely cannot build a loop
45
-
46
- Stop and say so explicitly. List what you tried. Do **not** proceed to hypothesise without a loop.
47
-
48
- ## Workflow (consumes the loop)
49
-
50
- 1. **Reproduce** — run the loop, confirm the bug appears. The loop must match the user's described failure, not a different nearby failure.
51
- 2. **Minimise** — strip away unrelated code until the minimal reproduction remains.
52
- 3. **Hypothesise** — generate 3–5 ranked falsifiable hypotheses before testing any. Each must state a prediction: "If X is the cause, then changing Y will make the bug disappear".
53
- 4. **Instrument** — one probe per hypothesis. Change one variable at a time. Tag every debug log with a unique prefix (e.g. `[DEBUG-a4f2]`) for easy cleanup.
54
- 5. **Fix** — write the regression test at a correct seam first. Watch it fail. Apply the smallest correct change. Watch it pass. Re-run the Phase 0 loop against the original scenario.
55
- 6. **Regression test** — verify fix doesn't break existing behavior. If no correct seam exists for a regression test, that itself is a finding — flag the architecture gap.
56
- 7. **Document** — log the root cause and fix in the handoff, issue, or relevant docs. State which hypothesis was correct so the next debugger learns.
57
-
58
- ## Iron Law
59
- No fixes without root cause. Surface-level fixes compound into technical debt.
60
-
61
- ## Anti-patterns
62
- - Fixing symptoms instead of causes (the same bug reappears next week)
63
- - Changing code without reproducing the bug first
64
- - "Shotgun" debugging — changing multiple things hoping one sticks
65
- - Not documenting root cause for future reference
66
- - Proceeding to hypothesise without a feedback loop
17
+ 1. Build a feedback loop — failing test, curl, CLI, headless browser, or throwaway harness. Must be fast, deterministic, and agent-runnable.
18
+ 2. Apply the Iron Law — NO FIXES WITHOUT ROOT CAUSE INVESTIGATION FIRST. Surface fixes compound into technical debt.
19
+ 3. Reproduce and trace — confirm failure, read stack traces, check recent changes, minimise to failure path, gather evidence (one probe per hypothesis).
20
+ 4. Trace backward — from symptom through call chain to original trigger. Instrument boundaries with debug logging.
21
+ 5. Find working pattern — locate similar working code, compare against references, list every difference.
22
+ 6. Form single hypothesis — "I think X is root cause because Y." Test with minimal change. One variable.
23
+ 7. Implement fix — create failing test first, apply one fix, verify resolution, regression test, document root cause.
67
24
 
68
25
  ## Routing
69
26
 
70
27
  | Outcome | Route |
71
28
  |---------|-------|
72
- | pass | → oh-builder (implement the fix) |
73
- | fail | → oh-expert (deepen diagnosis) |
74
- | blocker | → surface to user |
29
+ | pass | → oh-builder (fix) |
30
+ | fail | → oh-expert (deepen) |
31
+ | blocker | → surface |
@@ -0,0 +1,21 @@
1
+ # oh-issue — Deep Reference
2
+
3
+ ## When to Use
4
+
5
+ Plan/PRD needs breaking into actionable issues. Vertical tracer-bullet slices.
6
+
7
+ Triggers: break into issues, create issues from plan, issue breakdown.
8
+
9
+ ## Issue Structure
10
+
11
+ - **Title**: action-oriented ("Add user auth API")
12
+ - **AC**: concrete, testable ("User signs up with email + password")
13
+ - **Notes**: pointers for implementer
14
+ - **Deps**: what must come first
15
+ - **Labels**: type, priority, area
16
+
17
+ ## Anti-patterns
18
+
19
+ - Horizontal slicing (no one ships "DB layer" alone)
20
+ - Issues too large (3+ days) or too small (<1 hour)
21
+ - Missing acceptance criteria
@@ -1,36 +1,29 @@
1
1
  ---
2
2
  name: oh-issue
3
- description: "Break a plan, spec, or PRD into independently-grabbable GitHub issues"
3
+ description: "Break plans/PRDs into independently-grabbable GitHub issues"
4
+ tier: 2
5
+ route:
6
+ pass: done
7
+ fail: oh-planner
8
+ blocker: surface
4
9
  ---
5
10
 
6
11
  # oh-issue
7
12
 
8
- ## When to Use
9
- When a plan exists and needs to be broken into actionable issues. Uses tracer-bullet vertical slices for independent work items.
13
+ Break plans/PRDs into vertical-slice issues with acceptance criteria and dependencies.
10
14
 
11
- ## Workflow
12
- 1. Read the plan or PRD
13
- 2. Identify vertical slices — self-contained features that ship independently
14
- 3. Write each issue with: clear title, acceptance criteria, implementation notes, dependencies
15
- 4. Use `gh issue create` to publish each issue
16
- 5. Label and milestone each issue appropriately
15
+ ## Steps
17
16
 
18
- ## Issue Structure
19
- - **Title**: action-oriented ("Add user authentication API")
20
- - **Acceptance criteria**: concrete, testable ("User can sign up with email + password")
21
- - **Implementation notes**: pointers for the implementer
22
- - **Dependencies**: what must be done first
23
- - **Labels**: type, priority, area
24
-
25
- ## Anti-patterns
26
- - Horizontal slicing (DB layer / API layer / UI layer — no one ships a layer)
27
- - Issues too large (3+ days) or too small (< 1 hour)
28
- - Writing issues without acceptance criteria
17
+ 1. Read plan or PRD
18
+ 2. Identify vertical slices — self-contained, independently shippable
19
+ 3. Write each issue with title, acceptance criteria, implementation notes, and dependencies
20
+ 4. Publish issues via `gh issue create`
21
+ 5. Apply labels and milestone
29
22
 
30
23
  ## Routing
31
24
 
32
25
  | Outcome | Route |
33
26
  |---------|-------|
34
- | pass | → [done — issues published to tracker] |
35
- | fail | → oh-planner (re-spec unclear slices) |
36
- | blocker | → surface to user |
27
+ | pass | → done |
28
+ | fail | → oh-planner |
29
+ | blocker | → surface |
@@ -0,0 +1,44 @@
1
+ # oh-learn — Deep Reference
2
+
3
+ ## When to Use
4
+
5
+ Session learnings should be captured, reviewed, or promoted as reusable instincts for future work.
6
+
7
+ Triggers: learn from session, extract patterns, run oh-learn.
8
+
9
+ ## Instinct Data Model
10
+
11
+ JSONL at `~/.local/share/opencode/openhermes/plans/<project>-instincts.jsonl`:
12
+
13
+ ```json
14
+ {"trigger": "specific situation", "action": "recommended response", "confidence": 0.5, "applications": 1, "successes": 1, "category": "coding", "source": "oh-learn:extract", "ts": "2026-05-15T12:00:00Z"}
15
+ ```
16
+
17
+ **Trigger:** specific, matchable (not general advice). **Action:** executable (not belief). **Confidence:** starts 0.5, +0.05 per success, -0.02/day decay without use. **Category:** coding, testing, security, git, planning, orchestration, debugging, ux.
18
+
19
+ ## Workflows
20
+
21
+ ### Extract
22
+
23
+ Scan session for repeated decisions. For each: write instinct. Check existing file for near-duplicates. Merge (max confidence, increment applications) or append.
24
+
25
+ ### Evolve
26
+
27
+ Read all instincts. Group by category then topic. ≥5 instincts with avg confidence ≥ 0.7 → oh-skill-craft spec. 3-4 with confidence ≥ 0.8 → suggest update to existing skill.
28
+
29
+ ### Promote
30
+
31
+ Instincts with confidence ≥ 0.85 AND applications ≥ 10 → filter project-specific → append to global `%USERPROFILE%\.config\opencode\instincts.jsonl`. Tag promoted.
32
+
33
+ ### Review / Search / Prune / Export
34
+
35
+ Review: totals + distributions. Search: by topic, trigger, category, confidence. Prune: stale >30d with confidence < 0.3. Export: portable JSON.
36
+
37
+ ## Anti-patterns
38
+
39
+ - Hoarding every observation (most aren't learnings)
40
+ - Never pruning
41
+ - Storing what, not why
42
+ - Over-promoting to global
43
+ - Extracting without applying
44
+ - Ignoring confidence
@@ -1,92 +1,30 @@
1
1
  ---
2
2
  name: oh-learn
3
- description: "Extract, evolve, and promote session learnings as instincts. Review, search, prune, export."
3
+ description: "Capture, review, and promote session learnings as reusable instincts"
4
+ tier: 2
5
+ route:
6
+ pass: done
7
+ fail: surface
8
+ blocker: surface
4
9
  ---
5
10
 
6
11
  # oh-learn
7
12
 
8
- Learning engine for the harness. Distills patterns from sessions into **instincts** (trigger-action pairs with confidence), clusters them into skill candidates, and graduates high-signal patterns from project to global scope.
13
+ Distill session patterns into instincts, cluster into skill candidates, and promote high-signal patterns.
9
14
 
10
- ## Instinct Data Model
15
+ ## Steps
11
16
 
12
- Every learning stored as one JSONL line in `.opencode/instincts.jsonl`:
13
-
14
- ```json
15
- { "trigger": "situation pattern", "action": "recommended response", "confidence": 0.5, "applications": 1, "successes": 1, "category": "coding", "source": "oh-learn:extract", "ts": "2026-05-15T12:00:00Z" }
16
- ```
17
-
18
- **Rules:**
19
- - **Trigger** — specific, matchable situation. *Not* general advice.
20
- - **Action** — executable response. *Not* a belief.
21
- - **Confidence** — starts at 0.5, increments +0.05 per successful application, decays -0.02 per day without use.
22
- - **Category** — one of: `coding`, `testing`, `security`, `git`, `planning`, `orchestration`, `debugging`, `ux`.
23
-
24
- ## When to Use
25
-
26
- After completing a significant piece of work, at session handoff, or when you notice the same pattern repeat 2+ times in one session. Also on explicit user request.
27
-
28
- ## Workflows
29
-
30
- ### Extract
31
- Mine the current session for reusable patterns.
32
-
33
- 1. Scan recent conversation + code changes for repeated decision patterns
34
- 2. For each distinct pattern write an instinct: trigger, action, confidence=0.5, category
35
- 3. Read existing `.opencode/instincts.jsonl`, check for near-duplicate triggers
36
- 4. If duplicate found: merge — `confidence = max(existing, 0.8 × new)`, increment applications
37
- 5. If new: append line to file
38
-
39
- **Good instinct:** trigger=`"tsc --noEmit shows 10+ errors after batch edit"`, action=`"Fix errors one at a time, re-running tsc after each, rather than batch-fixing"`, category=`"debugging"`
40
-
41
- **Bad instinct:** `"Write clean code"` — too vague to trigger on.
42
-
43
- ### Evolve
44
- Cluster related instincts into skill/command/agent candidates.
45
-
46
- 1. Read all instincts from `.opencode/instincts.jsonl`
47
- 2. Group by `category`, then by trigger topic similarity
48
- 3. **If cluster ≥ 5 instincts AND avg confidence ≥ 0.7** → generate `oh-skill-craft` spec for a new skill
49
- 4. **If cluster 3-4 instincts with confidence ≥ 0.8** → suggest update to existing skill
50
- 5. Output candidate summary with trigger list and extracted core pattern
51
-
52
- ### Promote
53
- Graduate high-confidence instincts from project to global scope.
54
-
55
- 1. Scan `.opencode/instincts.jsonl` for instincts with `confidence >= 0.85 AND applications >= 10`
56
- 2. Filter out project-specific patterns (reference paths, local APIs, domain terms)
57
- 3. Append filtered candidates to `%USERPROFILE%\.config\opencode\instincts.jsonl` (global)
58
- 4. Tag promoted instincts with `"promoted": true` in project file
59
- 5. Report: "Promoted N instincts to global scope"
60
-
61
- ### Review
62
- Show instinct summary: total count, confidence distribution, category breakdown, recently promoted.
63
-
64
- ### Search
65
- Find instincts by topic, trigger fragment, category, or confidence range.
66
-
67
- ### Prune
68
- Remove instincts stale for 30+ days with confidence < 0.3, or superseded by a higher-confidence instinct covering the same trigger.
69
-
70
- ### Export
71
- Serialize instincts to portable JSON for sharing across projects or teams:
72
-
73
- ```json
74
- { "version": 1, "exported": "2026-05-15T12:00:00Z", "instincts": [...] }
75
- ```
76
-
77
- ## Anti-patterns
78
-
79
- - Hoarding every observation (most things aren't learnings)
80
- - Never pruning (stale knowledge is worse than no knowledge)
81
- - Storing what, not why (context-less facts are forgettable)
82
- - Over-promoting: not every pattern is globally useful
83
- - Extracting without applying: instincts that never trigger are noise
84
- - Ignoring confidence: treating all instincts as equally reliable
17
+ 1. Scan session for repeated decisions
18
+ 2. Write instinct for each pattern — trigger, action, confidence
19
+ 3. Check existing file for near-duplicates; merge or append
20
+ 4. Group instincts by category and topic
21
+ 5. Promote high-confidence instincts (≥0.85, ≥10 applications) to global scope
22
+ 6. Prune stale low-confidence instincts (<0.3, >30 days)
85
23
 
86
24
  ## Routing
87
25
 
88
26
  | Outcome | Route |
89
27
  |---------|-------|
90
- | pass | → [done — report summary] |
91
- | fail | → [surface gaps to user] |
92
- | blocker | → surface to user |
28
+ | pass | → done |
29
+ | fail | → surface |
30
+ | blocker | → surface |
@@ -0,0 +1,92 @@
1
+ # oh-manifest — Deep Reference
2
+
3
+ ## Phase 0: Pre-Flight
4
+
5
+ ALL must pass before any work:
6
+
7
+ - ☐ **Quality baseline** — existing tests pass. Capture before/after.
8
+ - ☐ **Rollback path** — clean `git stash` or committed state to return to.
9
+ - ☐ **Branch isolation** — working branch, not main/master.
10
+ - ☐ **Scope documented** — plan exists and is unambiguous.
11
+
12
+ Any check fails → STOP. Report which. Do not proceed until resolved.
13
+
14
+ **Continuous execution:** Execute all tasks without pausing for progress check-ins between them. Only stop for BLOCKED, genuine ambiguity, or all tasks complete.
15
+
16
+ ## Pipeline
17
+
18
+ ### Step 1: Plan
19
+ If plan exists, load. If not, run oh-planner. Auto-decide minor scope via decision principles. Surface only: premises needing human judgment, or plan/alternative conflicts.
20
+
21
+ ### Step 2: Build
22
+ Run oh-builder for each plan phase in dependency order. Parallelizable phases → sub-agents. Auto-decide implementation choices.
23
+
24
+ **Two-stage review (in order — never reverse):**
25
+ 1. **Spec compliance first** — Does the output match the plan/spec requirements? Quote the spec. No scope creep, no missing requirements.
26
+ 2. **Code quality second** — Only after spec compliance is ✅. Architecture, readability, test quality, edge cases.
27
+
28
+ **Implementer status protocol** — Implementers report one of:
29
+
30
+ | Status | Action |
31
+ |--------|--------|
32
+ | **DONE** | Proceed to spec review |
33
+ | **DONE_WITH_CONCERNS** | Read concerns before proceeding |
34
+ | **NEEDS_CONTEXT** | Provide context, re-dispatch |
35
+ | **BLOCKED** | Assess: context problem? capability gap? task too large? plan wrong? |
36
+
37
+ Never ignore BLOCKED or retry same approach without changes.
38
+
39
+ ### Step 3: Verify
40
+ Check each phase against verification criteria. Tests pass → mark complete. Fail → diagnose (oh-expert), fix, re-verify.
41
+
42
+ ### Step 4: Loop
43
+ All done → DONE. Phase fails → BLOCKER (surface). New work discovered → add to plan, continue.
44
+
45
+ ## Loop Patterns
46
+
47
+ | Pattern | Use | Behavior |
48
+ |---------|-----|----------|
49
+ | sequential | Normal features | One phase at a time, verify each |
50
+ | continuous-pr | Multi-step refactors | Per-phase PRs |
51
+ | infinite | Watch mode, CI repair | Continue until stop signal |
52
+ | rfc-dag | Complex deps | DAG resolution, parallelize independent branches |
53
+
54
+ Default: sequential.
55
+
56
+ ## Escalation Triggers
57
+
58
+ | Trigger | Condition | Action |
59
+ |---------|-----------|--------|
60
+ | Stall | 2 consecutive zero-progress checkpoints | Pause, report attempts |
61
+ | Retry storm | Same error 5+ times | Stop, surface with fixes tried |
62
+ | Cost drift | Cumulative changes exceed scope | Pause, show diff |
63
+ | Quality regression | Verify scores lower than baseline | Pause, report |
64
+
65
+ These are not optional. When triggered, loop **must** pause.
66
+
67
+ ## Decision Principles
68
+
69
+ Auto-resolve: completeness > cleverness, boil the lake, pragmatic > perfect, DRY at 3rd instance, explicit > implicit, bias toward action.
70
+
71
+ Surface only: premises, dead ends, cross-model disagreement.
72
+
73
+ **Model selection guidance:**
74
+ - Mechanical tasks (isolated, 1-2 files, clear spec) → fast cheap model
75
+ - Integration tasks (multi-file, coordination) → standard model
76
+ - Architecture/design/review tasks → most capable model
77
+
78
+ ## Blocker Protocol
79
+
80
+ `BLOCKER: <what> | Options: A, B, C` → wait for decision.
81
+
82
+ ## Anti-patterns
83
+ - Skipping pre-flight
84
+ - Auto-deciding premises
85
+ - Pushing through blockers without surfacing
86
+ - Skipping verification
87
+ - Parallelizing dependent phases
88
+ - Not updating plan file
89
+ - Ignoring escalation triggers
90
+ - Starting code quality review before spec compliance is ✅
91
+ - Ignoring implementer BLOCKED status and retrying with same approach
92
+ - Pausing between tasks for progress updates (breaks flow)