oh-my-customcodex 0.4.0 → 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29) hide show
  1. package/README.md +5 -5
  2. package/dist/cli/index.js +2 -9
  3. package/dist/index.js +1 -1
  4. package/package.json +1 -1
  5. package/templates/.claude/agents/mgr-creator.md +11 -0
  6. package/templates/.claude/output-styles/korean-engineer.md +24 -0
  7. package/templates/.claude/rules/MUST-agent-design.md +1 -0
  8. package/templates/.claude/rules/MUST-completion-verification.md +13 -0
  9. package/templates/.claude/rules/SHOULD-interaction.md +2 -0
  10. package/templates/.claude/skills/agent-eval-framework/SKILL.md +92 -0
  11. package/templates/.claude/skills/agora/SKILL.md +11 -0
  12. package/templates/.claude/skills/codex-exec/SKILL.md +12 -0
  13. package/templates/.claude/skills/evaluator-optimizer/SKILL.md +20 -0
  14. package/templates/.claude/skills/harness-eval/SKILL.md +13 -0
  15. package/templates/.claude/skills/roundtable-debate/SKILL.md +60 -0
  16. package/templates/.claude/skills/visual-ralph/SKILL.md +126 -0
  17. package/templates/.claude/skills/visual-verdict/SKILL.md +49 -0
  18. package/templates/AGENTS.md.en +6 -21
  19. package/templates/AGENTS.md.ko +6 -21
  20. package/templates/CLAUDE.md.en +3 -3
  21. package/templates/CLAUDE.md.ko +3 -3
  22. package/templates/guides/agent-eval/README.md +48 -0
  23. package/templates/guides/agent-eval/index.yaml +6 -0
  24. package/templates/guides/browser-automation/README.md +12 -0
  25. package/templates/guides/index.yaml +12 -0
  26. package/templates/guides/multi-agent-debate-patterns/README.md +26 -0
  27. package/templates/guides/multi-agent-debate-patterns/index.yaml +6 -0
  28. package/templates/manifest.json +4 -4
  29. package/templates/workflows/auto-dev.yaml +4 -4
package/README.md CHANGED
@@ -13,7 +13,7 @@
13
13
 
14
14
  **[한국어 문서 (Korean)](./README_ko.md)**
15
15
 
16
- 49 agents. 112 skills. 22 rules. One command.
16
+ 49 agents. 116 skills. 22 rules. One command.
17
17
 
18
18
  ```bash
19
19
  npm install -g oh-my-customcodex && cd your-project && omcustomcodex init
@@ -134,7 +134,7 @@ Each agent declares its tools, model, memory scope, and limitations in YAML fron
134
134
 
135
135
  ---
136
136
 
137
- ### Skills (112)
137
+ ### Skills (116)
138
138
 
139
139
  | Category | Count | Includes |
140
140
  |----------|-------|----------|
@@ -147,7 +147,7 @@ Each agent declares its tools, model, memory scope, and limitations in YAML fron
147
147
  | Package | 3 | npm-publish, npm-version, npm-audit |
148
148
  | Optimization | 3 | optimize-analyze, optimize-bundle, optimize-report |
149
149
  | Security | 3 | adversarial-review, cve-triage, jinja2-prompts |
150
- | Other | 10 | codex-exec, claude-native, vercel-deploy, skills-sh-search, result-aggregation, writing-clearly-and-concisely, and more |
150
+ | Other | 12 | codex-exec, claude-native, visual-ralph, visual-verdict, vercel-deploy, skills-sh-search, result-aggregation, writing-clearly-and-concisely, and more |
151
151
 
152
152
  Skills use a 3-tier scope system: `core` (universal), `harness` (agent/skill maintenance), `package` (project-specific).
153
153
 
@@ -227,7 +227,7 @@ Key rules: R010 (orchestrator never writes files), R009 (parallel execution mand
227
227
 
228
228
  ---
229
229
 
230
- ### Guides (40)
230
+ ### Guides (42)
231
231
 
232
232
  Reference documentation covering best practices, architecture decisions, and integration patterns. Located in `guides/` at project root, covering topics from agent design to CI/CD to observability.
233
233
 
@@ -286,7 +286,7 @@ your-project/
286
286
  │ ├── contexts/ # 4 shared context files
287
287
  │ └── ontology/ # Knowledge graph for RAG
288
288
  ├── .agents/
289
- │ └── skills/ # 112 installed skill modules
289
+ │ └── skills/ # 116 installed skill modules
290
290
  └── guides/ # 42 reference documents
291
291
  ```
292
292
 
package/dist/cli/index.js CHANGED
@@ -3091,7 +3091,7 @@ var init_package = __esm(() => {
3091
3091
  workspaces: [
3092
3092
  "packages/*"
3093
3093
  ],
3094
- version: "0.4.0",
3094
+ version: "0.4.2",
3095
3095
  description: "Batteries-included agent harness on top of GPT Codex + OMX",
3096
3096
  type: "module",
3097
3097
  bin: {
@@ -29925,14 +29925,7 @@ async function initCommand(options) {
29925
29925
  await registerProject(targetDir, package_default.version);
29926
29926
  } catch {}
29927
29927
  console.log("");
29928
- console.log("Required plugins (install manually):");
29929
- console.log(" /plugin marketplace add obra/superpowers-marketplace");
29930
- console.log(" /plugin install superpowers");
29931
- console.log(" /plugin install openai-docs");
29932
- console.log(" /plugin install elements-of-style");
29933
- console.log(" /plugin install context7");
29934
- console.log("");
29935
- console.log('See AGENTS.md "외부 의존성" section for details.');
29928
+ console.log("Codex setup complete. See AGENTS.md for Codex-native MCP and runtime guidance.");
29936
29929
  return {
29937
29930
  success: true,
29938
29931
  message: i18n.t("cli.init.success"),
package/dist/index.js CHANGED
@@ -2180,7 +2180,7 @@ var package_default = {
2180
2180
  workspaces: [
2181
2181
  "packages/*"
2182
2182
  ],
2183
- version: "0.4.0",
2183
+ version: "0.4.2",
2184
2184
  description: "Batteries-included agent harness on top of GPT Codex + OMX",
2185
2185
  type: "module",
2186
2186
  bin: {
package/package.json CHANGED
@@ -3,7 +3,7 @@
3
3
  "workspaces": [
4
4
  "packages/*"
5
5
  ],
6
- "version": "0.4.0",
6
+ "version": "0.4.2",
7
7
  "description": "Batteries-included agent harness on top of GPT Codex + OMX",
8
8
  "type": "module",
9
9
  "bin": {
@@ -7,6 +7,7 @@ memory: project
7
7
  effort: high
8
8
  skills:
9
9
  - create-agent
10
+ - agent-eval-framework
10
11
  tools:
11
12
  - Read
12
13
  - Write
@@ -36,6 +37,16 @@ Frontmatter (name, description, model, tools, skills, memory) + body (purpose, c
36
37
 
37
38
  No registry update needed - agents auto-discovered from `.claude/agents/*.md`.
38
39
 
40
+ ### Phase 4: Optional Quantitative Gate
41
+
42
+ For high-risk or reusable agents, use `agent-eval-framework` after creation:
43
+
44
+ 1. Define an ideal trajectory for the agent's first representative task.
45
+ 2. Run correctness checks before measuring efficiency.
46
+ 3. Record `step_ratio`, `tool_call_ratio`, and `latency_ratio` as advisory evidence.
47
+
48
+ Do not force this gate for every small helper agent. It is opt-in when the extra cost is justified by reuse, safety, or routing criticality.
49
+
39
50
  ## Rules Applied
40
51
 
41
52
  - R000: All files in English
@@ -0,0 +1,24 @@
1
+ ---
2
+ name: korean-engineer
3
+ description: Korean-first engineering responses with agent identity and evidence-focused completion
4
+ keep-coding-instructions: true
5
+ ---
6
+
7
+ # Korean Engineer Output Style
8
+
9
+ Use Korean for user-facing communication unless the user explicitly asks otherwise. Keep code, file contents, identifiers, and commit trailers in English when that is the repository convention.
10
+
11
+ Every response starts with the agent identity block required by the project guidance:
12
+
13
+ ```text
14
+ ┌─ Agent: {agent-name} / {model}
15
+ │ Skill: {active-skill-or-none}
16
+ └─ Status: {current action or result}
17
+ ```
18
+
19
+ Prefer concise, evidence-focused engineering reports:
20
+
21
+ - State the current action or outcome first.
22
+ - Cite concrete verification evidence before declaring completion.
23
+ - Do not claim release, deploy, or publish completion until the external surface has been checked.
24
+ - Keep uncertainty explicit and tied to the missing evidence.
@@ -254,6 +254,7 @@ Recommended practice:
254
254
  2. Keep allow rules only as defensive documentation; do not rely on them to suppress sensitive-path prompts.
255
255
  3. Do not run unattended Claude Code release automation that writes `templates/.claude/**` unless the workflow can handle interactive approval.
256
256
  4. In this Codex port, update `.codex/...` source files and their `templates/.claude/...` mirrors deliberately instead of bulk-copying with shell commands.
257
+ 5. For unattended Claude compatibility-template writes, use a reviewed temporary script wrapper and verify the resulting diff; direct Bash/Write/Edit targets under `templates/.claude/**` can all trigger the sensitive-path guard.
257
258
 
258
259
  ## Separation of Concerns
259
260
 
@@ -21,6 +21,19 @@ Before declaring any task `[Done]`, verify completion against task-type-specific
21
21
 
22
22
  Before [Done]: (1) Verify ACTUAL outcome not just attempt — "ran command" ≠ "succeeded". (2) Check task-type criteria above. (3) No unchecked items. (4) Would bet $100 it's complete.
23
23
 
24
+ ## Optional: Quantitative Evidence
25
+
26
+ For agent, skill, or workflow changes, completion evidence MAY include `agent-eval-framework` metrics:
27
+
28
+ | Metric | Meaning | Gate |
29
+ |--------|---------|------|
30
+ | `correctness` | Acceptance criteria satisfied | Required if included |
31
+ | `step_ratio` | Observed steps vs. ideal steps | Advisory |
32
+ | `tool_call_ratio` | Observed tool calls vs. ideal tool calls | Advisory |
33
+ | `latency_ratio` | Observed duration vs. ideal duration | Advisory |
34
+
35
+ These metrics strengthen a `[Done]` claim but do not replace task-specific verification. A failed correctness score blocks completion even if efficiency ratios are good.
36
+
24
37
  <!-- DETAIL: Self-Check box
25
38
  1. Did I verify ACTUAL outcome? "I ran the command" ≠ "the command succeeded" → YES: Continue / NO: Verify first
26
39
  2. Does task type have specific criteria? YES: Check each / NO: Apply general verification
@@ -35,6 +35,8 @@
35
35
 
36
36
  ## Output Styles
37
37
 
38
+ Session-level style enforcement belongs in runtime output-style mechanisms when the host supports them. In this Codex port, R003 remains the portable source of style-selection rules; packaged Claude compatibility may additionally provide `.claude/output-styles/` presets that reinforce the same constraints.
39
+
38
40
  | Style | Trigger | Behavior |
39
41
  |-------|---------|----------|
40
42
  | `concise` | effort: low, batch operations | Key result only, no preamble, no elaboration |
@@ -0,0 +1,92 @@
1
+ ---
2
+ name: agent-eval-framework
3
+ description: Quantitative agent evaluation using correctness, step ratio, tool-call ratio, and latency ratio
4
+ scope: harness
5
+ user-invocable: true
6
+ argument-hint: "<trace-or-task> [--ideal <path>] [--format markdown|json]"
7
+ effort: high
8
+ version: 1.0.0
9
+ ---
10
+
11
+ # Agent Eval Framework
12
+
13
+ ## Purpose
14
+
15
+ Evaluate agent runs with a two-phase quantitative gate:
16
+
17
+ 1. **Correctness first**: the task must meet its stated acceptance criteria.
18
+ 2. **Efficiency second**: only correctness-passing runs are compared by step, tool-call, and latency ratios.
19
+
20
+ This keeps eval pressure useful. A faster run that fails the task is not a better run.
21
+
22
+ ## Metric Framework
23
+
24
+ | Metric | Formula | Pass Signal |
25
+ |--------|---------|-------------|
26
+ | `correctness` | `passed_criteria / total_criteria` | `1.0` for release-quality evidence |
27
+ | `step_ratio` | `observed_steps / ideal_steps` | `<= 1.25` preferred |
28
+ | `tool_call_ratio` | `observed_tool_calls / ideal_tool_calls` | `<= 1.25` preferred |
29
+ | `latency_ratio` | `observed_ms / ideal_ms` | `<= 1.50` preferred |
30
+
31
+ Use ratios as advisory evidence unless a task explicitly opts into a stricter gate.
32
+
33
+ ## Ideal Trajectory Schema
34
+
35
+ ```yaml
36
+ task: "short task name"
37
+ capability: "file_operations | retrieval | tool_use | memory | conversation | summarization"
38
+ ideal:
39
+ steps: 4
40
+ tool_calls: 5
41
+ latency_ms: 120000
42
+ acceptance_criteria:
43
+ - "Criterion one"
44
+ - "Criterion two"
45
+ notes: "Why this ideal path is reasonable"
46
+ ```
47
+
48
+ ## Capability Taxonomy
49
+
50
+ | Capability | Typical Evidence |
51
+ |------------|------------------|
52
+ | `file_operations` | precise diffs, no unrelated churn, verification after writes |
53
+ | `retrieval` | targeted `rg`/file reads, source references, low duplicate search |
54
+ | `tool_use` | appropriate tool choice, no unnecessary escalation |
55
+ | `memory` | relevant memory used and cited, stale facts re-verified when needed |
56
+ | `conversation` | clear routing, no repeated clarification for known constraints |
57
+ | `summarization` | faithful compression, preserved blockers and evidence |
58
+
59
+ ## Workflow
60
+
61
+ 1. Define or load an ideal trajectory for the task.
62
+ 2. Collect observed run data from trace, transcript, hook output, or manual evidence.
63
+ 3. Score correctness against acceptance criteria.
64
+ 4. If correctness fails, stop and report failed criteria.
65
+ 5. If correctness passes, compute efficiency ratios.
66
+ 6. Attach the metric table to the completion evidence or improvement report.
67
+
68
+ ## Output Format
69
+
70
+ ```markdown
71
+ ## Agent Eval Result
72
+
73
+ | Metric | Observed | Ideal | Ratio | Verdict |
74
+ |--------|----------|-------|-------|---------|
75
+ | correctness | 4/4 | 4/4 | 1.00 | pass |
76
+ | steps | 5 | 4 | 1.25 | pass |
77
+ | tool calls | 7 | 5 | 1.40 | advisory |
78
+ | latency | 150s | 120s | 1.25 | pass |
79
+
80
+ Decision: correctness-pass, efficiency-advisory
81
+ ```
82
+
83
+ ## Integration Points
84
+
85
+ - `harness-eval`: use this framework to add trajectory efficiency evidence to benchmark runs.
86
+ - `evaluator-optimizer`: run correctness before efficiency comparisons.
87
+ - `mgr-creator`: opt in for high-risk new agents where quantitative validation is worth the extra cost.
88
+ - `omcustomcodex:improve-report`: include repeated ratio regressions as improvement suggestions.
89
+
90
+ ## Attribution
91
+
92
+ Adapted from LangChain Deep Agents eval methodology: correctness-first scoring, ideal trajectory annotation, and efficiency ratios for step, tool-call, and latency comparison.
@@ -43,6 +43,17 @@ source:
43
43
  Spawn 3 reviewers as Agent Team members:
44
44
 
45
45
  ```
46
+
47
+ ### Anti-Groupthink Mode
48
+
49
+ Use `--anti-groupthink` when consensus itself is a risk:
50
+
51
+ 1. Reviewers submit independent findings before seeing peer output.
52
+ 2. One reviewer is assigned as devil's advocate.
53
+ 3. Minority findings are preserved unless the synthesis explicitly rejects them with evidence.
54
+ 4. Debate is capped at two challenge rounds before the lead either decides or requests more facts.
55
+
56
+ For decisions where dissent preservation is the main goal, use `roundtable-debate` directly instead of `agora`.
46
57
  Agent(name: "claude-critic", model: opus, effort: max)
47
58
  → 20-point deep adversarial review
48
59
 
@@ -204,3 +204,15 @@ When routing skills detect a code generation task and codex is available:
204
204
  ```
205
205
  /codex-exec "Generate {description} following {framework} best practices" --effort high --full-auto
206
206
  ```
207
+
208
+ ## Browser Verify Workflow
209
+
210
+ For frontend or browser-visible changes, use a Build + Vision + Verify loop instead of stopping at a successful build:
211
+
212
+ 1. Build or start the local dev server.
213
+ 2. Open the target in the available browser automation surface.
214
+ 3. Capture screenshot evidence and console/network errors.
215
+ 4. If the visual state is wrong or the console/network output shows errors, run `codex-exec` with the concrete evidence and repeat.
216
+ 5. Stop only when build, browser render, and error checks all pass.
217
+
218
+ This pattern composes with the Codex App Browser Use plugin or any local browser MCP. Keep the loop evidence-driven: screenshot, console output, network status, and the exact command that produced the build.
@@ -104,6 +104,26 @@ When `conditional.enabled: true` and ANY `skip_when` condition is met, the evalu
104
104
  | Complex architecture, security-critical | High | Run with pre-negotiation |
105
105
  | Previously failed task retry | Any | Always run |
106
106
 
107
+ ### Quantitative Efficiency Metrics
108
+
109
+ When a task provides an ideal trajectory, the evaluator MAY attach `agent-eval-framework` metrics after the normal quality gate:
110
+
111
+ ```yaml
112
+ evaluator-optimizer:
113
+ quantitative_metrics:
114
+ enabled: true
115
+ ideal:
116
+ steps: 4
117
+ tool_calls: 5
118
+ latency_ms: 120000
119
+ advisory_thresholds:
120
+ step_ratio: 1.25
121
+ tool_call_ratio: 1.25
122
+ latency_ratio: 1.50
123
+ ```
124
+
125
+ Correctness remains the primary gate. Efficiency ratios are used to compare correctness-passing candidates or to create follow-up improvement suggestions.
126
+
107
127
  ### Parameter Details
108
128
 
109
129
  | Parameter | Required | Default | Description |
@@ -86,6 +86,19 @@ This skill provides preset rubrics for the evaluator-optimizer pipeline:
86
86
 
87
87
  The evaluator-optimizer skill's `pre_negotiation` phase accepts harness-eval rubric dimensions as sprint contract criteria.
88
88
 
89
+ ## Optional 4-Metric Trajectory Evidence
90
+
91
+ For agent or skill benchmarks, enrich the 0-100 quality score with the `agent-eval-framework` metrics:
92
+
93
+ | Metric | Source | Use |
94
+ |--------|--------|-----|
95
+ | `correctness` | benchmark assertions and acceptance criteria | Required before efficiency is considered |
96
+ | `step_ratio` | observed steps vs. ideal trajectory | Advisory signal for unnecessary loops |
97
+ | `tool_call_ratio` | observed tool calls vs. ideal trajectory | Advisory signal for noisy tool use |
98
+ | `latency_ratio` | observed duration vs. ideal trajectory | Advisory signal for runtime regression |
99
+
100
+ Evaluation order is fixed: correctness first, efficiency second. A benchmark run with failed correctness cannot be rescued by strong efficiency ratios.
101
+
89
102
  ## Output
90
103
 
91
104
  Results saved to `.codex/outputs/sessions/{YYYY-MM-DD}/harness-eval-{HHmmss}.md` with per-task scores and aggregate grade.
@@ -0,0 +1,60 @@
1
+ ---
2
+ name: roundtable-debate
3
+ description: Structured multi-agent debate that preserves dissent with a mandatory devil's advocate and two-round cap
4
+ scope: core
5
+ user-invocable: true
6
+ argument-hint: "<topic-or-document> [--rounds 1|2] [--decision required|advisory]"
7
+ effort: high
8
+ version: 1.0.0
9
+ ---
10
+
11
+ # Roundtable Debate
12
+
13
+ ## Purpose
14
+
15
+ Run a bounded debate when convergence would hide useful disagreement. Unlike `agora`, which drives toward consensus, this workflow preserves minority positions and requires explicit justification before dismissing them.
16
+
17
+ ## When To Use
18
+
19
+ - Architecture or product choices with multiple defensible paths.
20
+ - Review work where anchoring or groupthink is likely.
21
+ - Decisions where a minority risk could be more important than the majority preference.
22
+
23
+ ## Workflow
24
+
25
+ 1. **Independent-first analysis**: spawn 3-5 reviewers in parallel. Do not share intermediate opinions before each reviewer submits an initial view.
26
+ 2. **Mandatory devil's advocate**: one reviewer argues against the emerging default, even if they personally agree with it.
27
+ 3. **Round 1 synthesis**: group findings into majority positions, minority positions, and unresolved facts.
28
+ 4. **Round 2 challenge**: reviewers respond only to disputed claims and missing evidence.
29
+ 5. **Decision record**: keep the final recommendation and any protected dissent.
30
+
31
+ Hard cap: two debate rounds. If the decision still depends on missing facts, stop and gather evidence instead of debating longer.
32
+
33
+ ## Output
34
+
35
+ ```markdown
36
+ # Roundtable Debate Result
37
+
38
+ ## Topic
39
+ {topic}
40
+
41
+ ## Majority Recommendation
42
+ {recommendation}
43
+
44
+ ## Protected Dissent
45
+ | Position | Advocate | Why It Was Not Dismissed |
46
+ |----------|----------|--------------------------|
47
+ | {position} | devil's advocate | {evidence or risk} |
48
+
49
+ ## Decision
50
+ {adopt | defer | reject | gather-more-evidence}
51
+ ```
52
+
53
+ ## Relationship To Agora
54
+
55
+ | Workflow | Goal | Best For |
56
+ |----------|------|----------|
57
+ | `agora` | adversarial consensus | release gates, spec approval |
58
+ | `roundtable-debate` | dissent preservation | ambiguous strategy, architectural tradeoffs |
59
+
60
+ Use `agora --anti-groupthink` when you need consensus plus explicit dissent handling.
@@ -0,0 +1,126 @@
1
+ ---
2
+ name: visual-ralph
3
+ description: Visual Ralph orchestration for frontend UI using approved references, Ralph implementation, Visual Verdict scoring, and reproducible design-system evidence
4
+ ---
5
+
6
+ # Visual Ralph Skill
7
+
8
+ Use this skill when the user wants a frontend UI implemented or restyled through
9
+ a measured visual loop rather than subjective design feedback.
10
+
11
+ ## Purpose
12
+
13
+ Create a frontend delivery loop:
14
+
15
+ `user description or live URL -> approved visual reference -> $ralph implementation -> $visual-verdict + optional pixel diff -> reusable design system`.
16
+
17
+ For live URL cloning or recreation requests, Visual Ralph owns the flow. Preserve
18
+ the URL, viewport, content state, and interaction notes in the handoff instead of
19
+ routing new work to a standalone web-clone path.
20
+
21
+ ## Use When
22
+
23
+ - The user describes a web/app UI and wants implementation.
24
+ - The user provides a live URL and wants a measured visual implementation.
25
+ - A generated raster mockup or static reference image would clarify the target.
26
+ - The task needs screenshot-based pass/fail iteration.
27
+ - The final result should leave reusable tokens/components, not only a one-off visual match.
28
+
29
+ ## Do Not Use When
30
+
31
+ - The user only wants design critique or frontend advice.
32
+ - The task is non-visual backend/API work.
33
+ - The user supplied a final static reference and only needs a comparison/fix loop; hand directly to `$ralph` with `$visual-verdict`.
34
+ - The requested output is a deterministic SVG/vector/code-native asset.
35
+
36
+ ## Workflow
37
+
38
+ ### 1. Ground The Target Repo
39
+
40
+ Inspect local evidence before choosing stack-specific tactics:
41
+ - package manager and scripts,
42
+ - frontend framework and routing,
43
+ - styling system and design-token conventions,
44
+ - screenshot/test tooling,
45
+ - existing components that should be reused.
46
+
47
+ Do not assume React, Vue, Tailwind, Playwright, or another stack without repo evidence.
48
+
49
+ ### 2. Establish The Visual Reference
50
+
51
+ For live URL work, record:
52
+ - source URL and scope note,
53
+ - viewport(s), route/state, and seed/login assumptions,
54
+ - baseline screenshot path or capture command,
55
+ - interaction parity notes,
56
+ - known exclusions such as backend/auth/personalized data/third-party widget parity.
57
+
58
+ For generated UI concepts, use `$imagegen` to produce the reference. Prompt for a
59
+ `ui-mockup` with viewport/aspect ratio, intended surface, hierarchy, typography,
60
+ color direction, exact text, and a ban on unrequested logos/watermarks.
61
+
62
+ For project-bound implementation, save the approved reference in the workspace,
63
+ for example `.omx/artifacts/visual-ralph/<slug>/reference.png`.
64
+
65
+ ### 3. Require User Approval
66
+
67
+ Stop after reference generation or URL-derived reference capture and ask the user
68
+ to approve one reference image/state or request targeted changes. Do not start
69
+ frontend implementation before approval.
70
+
71
+ ### 4. Hand Off To Ralph
72
+
73
+ Invoke `$ralph` with:
74
+ - approved reference image path or URL-derived artifact,
75
+ - source URL and scope note when relevant,
76
+ - viewport/content state,
77
+ - detected repo/frontend context,
78
+ - exact screenshot command and output path,
79
+ - the completion checklist below.
80
+
81
+ ### 5. Use Visual Verdict Before Every Next Edit
82
+
83
+ For each visual iteration:
84
+ 1. Capture the current screenshot with recorded viewport/state.
85
+ 2. Run `$visual-verdict` against the approved reference.
86
+ 3. Treat the JSON verdict as authoritative.
87
+ 4. If `score < 90`, convert `differences[]` and `suggestions[]` into the next edit plan.
88
+ 5. After applying the edit, recapture the screenshot and rerun `$visual-verdict` before making the next edit.
89
+
90
+ Pixel diff is secondary evidence only. It helps translate mismatch hotspots into
91
+ concrete edits, but it does not replace `$visual-verdict`.
92
+
93
+ ### 6. Leave A Reproducible Design System
94
+
95
+ Encode the match in repo-native reusable artifacts: CSS variables, theme tokens,
96
+ Tailwind config, component variants, stories, docs, or the existing equivalent.
97
+ Capture applicable colors, spacing, typography, radii, shadows, and key states.
98
+
99
+ ## Completion Checklist
100
+
101
+ - Approved reference or URL-derived artifact is saved in the workspace.
102
+ - Screenshot reproduction command, viewport, route/state, and output paths are documented.
103
+ - Final `$visual-verdict` score is `>= 90`.
104
+ - Pixel diff or overlay evidence is recorded when useful.
105
+ - Design tokens/components are repo-native and reusable.
106
+ - Build/lint/test or the repo equivalent passes.
107
+ - Remaining visual differences are documented with rationale.
108
+
109
+ ## Handoff Template
110
+
111
+ ```text
112
+ $ralph "Implement the approved frontend reference.
113
+ Reference: <workspace-reference-image-or-url-derived-artifact>
114
+ Source URL: <url and permission/scope note, if relevant>
115
+ Viewport/content state: <viewport, route/state, seed/login assumptions>
116
+ Interaction parity notes: <visible controls and known exclusions>
117
+ Route/surface: <route or component>
118
+ Screenshot command: <command and viewport>
119
+ Use $visual-verdict before every next edit; pass threshold score >= 90.
120
+ Use pixel diff only as secondary debug evidence.
121
+ Extract reusable design tokens/components.
122
+ Run build/lint/test before completion.
123
+ Do not make major design pivots unless explicitly requested."
124
+ ```
125
+
126
+ Task: {{ARGUMENTS}}
@@ -0,0 +1,49 @@
1
+ ---
2
+ name: visual-verdict
3
+ description: Structured visual QA verdict for screenshot-to-reference comparisons
4
+ ---
5
+
6
+ # Visual Verdict Skill
7
+
8
+ Use this skill when a UI task needs a strict visual comparison between one or
9
+ more reference images and the current generated screenshot.
10
+
11
+ ## Inputs
12
+
13
+ - `reference_images[]`: one or more reference image paths
14
+ - `generated_screenshot`: current output screenshot path
15
+ - `category_hint`: optional UI category, such as `dashboard`, `landing-page`, or `editor`
16
+
17
+ ## Output Contract
18
+
19
+ Return JSON only:
20
+
21
+ ```json
22
+ {
23
+ "score": 0,
24
+ "verdict": "revise",
25
+ "category_match": false,
26
+ "differences": ["..."],
27
+ "suggestions": ["..."],
28
+ "reasoning": "short explanation"
29
+ }
30
+ ```
31
+
32
+ Rules:
33
+ - `score` is an integer from 0 to 100.
34
+ - `verdict` is `pass`, `revise`, or `fail`.
35
+ - `category_match` is true only when the screenshot matches the intended UI category.
36
+ - `differences[]` lists concrete visual mismatches.
37
+ - `suggestions[]` lists concrete next edits tied to the mismatches.
38
+ - `reasoning` is one or two short sentences.
39
+
40
+ ## Threshold And Loop
41
+
42
+ - Target pass threshold is `score >= 90`.
43
+ - If `score < 90`, continue editing and rerun `$visual-verdict` before the next edit.
44
+ - Persist useful verdict evidence in `.omx/state/{scope}/ralph-progress.json` when Ralph is active.
45
+
46
+ ## Debug Visualization
47
+
48
+ Pixel diff or pixelmatch overlays are secondary debugging aids. They help locate
49
+ hotspots, but `$visual-verdict` remains the authoritative pass/fail signal.
@@ -220,38 +220,23 @@ Task tool + routing skills remain the fallback for simple/cost-sensitive tasks.
220
220
 
221
221
  ## External Dependencies
222
222
 
223
- ### Required Plugins
224
-
225
- Install via `/plugin install <name>`:
226
-
227
- | Plugin | Source | Purpose |
228
- |--------|--------|---------|
229
- | superpowers | claude-plugins-official | TDD, debugging, collaboration patterns |
230
- | openai-docs | superpowers-marketplace | OpenAI and Codex development documentation |
231
- | elements-of-style | superpowers-marketplace | Writing clarity guidelines |
232
- | obsidian-skills | - | Obsidian markdown support |
233
- | context7 | claude-plugins-official | Library documentation lookup |
234
-
235
- ### Recommended MCP Servers
223
+ ### Recommended Codex MCP Servers
236
224
 
237
225
  | Server | Purpose |
238
226
  |--------|---------|
239
227
  | omx-memory | Session memory persistence (Chroma-based) |
228
+ | context7 | Library documentation lookup MCP server (configure when a project needs it) |
240
229
 
241
230
  ### Setup Commands
242
231
 
243
232
  ```bash
244
- # Add marketplace
245
- /plugin marketplace add obra/superpowers-marketplace
246
-
247
- # Install plugins
248
- /plugin install superpowers
249
- /plugin install openai-docs
250
- /plugin install elements-of-style
251
-
252
233
  # MCP setup (omx-memory)
253
234
  npm install -g omx-memory
254
235
  omx-memory setup
255
236
  ```
256
237
 
238
+ ### Claude Code Compatibility Note
239
+
240
+ Projects that run in the Claude Code plugin ecosystem may separately install plugins such as `superpowers`, `openai-docs`, `elements-of-style`, and `context7`. They are not required Codex init steps.
241
+
257
242
  <!-- omcodex:git-workflow -->
@@ -220,38 +220,23 @@ Codex CLI의 Agent Teams 기능이 활성화되어 있으면 (`OMCODEX_AGENT_TEA
220
220
 
221
221
  ## 외부 의존성
222
222
 
223
- ### 필수 플러그인
224
-
225
- `/plugin install <이름>`으로 설치:
226
-
227
- | 플러그인 | 소스 | 용도 |
228
- |----------|------|------|
229
- | superpowers | claude-plugins-official | TDD, 디버깅, 협업 패턴 |
230
- | openai-docs | superpowers-marketplace | Codex CLI 개발 문서 |
231
- | elements-of-style | superpowers-marketplace | 글쓰기 명확성 가이드라인 |
232
- | obsidian-skills | - | 옵시디언 마크다운 지원 |
233
- | context7 | claude-plugins-official | 라이브러리 문서 조회 |
234
-
235
- ### 권장 MCP 서버
223
+ ### Codex 권장 MCP 서버
236
224
 
237
225
  | 서버 | 용도 |
238
226
  |------|------|
239
227
  | omx-memory | 세션 메모리 영속성 (Chroma 기반) |
228
+ | context7 | 라이브러리 문서 조회용 MCP 서버 (프로젝트 필요 시 설정) |
240
229
 
241
230
  ### 설치 명령어
242
231
 
243
232
  ```bash
244
- # 마켓플레이스 추가
245
- /plugin marketplace add obra/superpowers-marketplace
246
-
247
- # 플러그인 설치
248
- /plugin install superpowers
249
- /plugin install openai-docs
250
- /plugin install elements-of-style
251
-
252
233
  # MCP 설정 (omx-memory)
253
234
  npm install -g omx-memory
254
235
  omx-memory setup
255
236
  ```
256
237
 
238
+ ### Claude Code 호환 참고
239
+
240
+ Claude Code 플러그인 생태계를 쓰는 프로젝트에서는 `superpowers`, `openai-docs`, `elements-of-style`, `context7` 같은 플러그인을 별도로 설치할 수 있습니다. Codex 초기화의 필수 단계는 아닙니다.
241
+
257
242
  <!-- omcodex:git-workflow -->
@@ -222,9 +222,9 @@ Task tool + routing skills remain the fallback for simple/cost-sensitive tasks.
222
222
 
223
223
  ## External Dependencies
224
224
 
225
- ### Required Plugins
225
+ ### Claude Code Plugins
226
226
 
227
- Install via `/plugin install <name>`:
227
+ Install in Claude Code via `/plugin install <name>`:
228
228
 
229
229
  | Plugin | Source | Purpose |
230
230
  |--------|--------|---------|
@@ -240,7 +240,7 @@ Install via `/plugin install <name>`:
240
240
  |--------|---------|
241
241
  | omx-memory | Session memory persistence |
242
242
 
243
- ### Setup Commands
243
+ ### Claude Code Setup Commands
244
244
 
245
245
  ```bash
246
246
  # Add marketplace
@@ -222,9 +222,9 @@ Codex CLI의 Agent Teams 기능이 활성화되어 있으면 (`OMCODEX_AGENT_TEA
222
222
 
223
223
  ## 외부 의존성
224
224
 
225
- ### 필수 플러그인
225
+ ### Claude Code 플러그인
226
226
 
227
- `/plugin install <이름>`으로 설치:
227
+ Claude Code 환경에서 `/plugin install <이름>`으로 설치:
228
228
 
229
229
  | 플러그인 | 소스 | 용도 |
230
230
  |----------|------|------|
@@ -240,7 +240,7 @@ Codex CLI의 Agent Teams 기능이 활성화되어 있으면 (`OMCODEX_AGENT_TEA
240
240
  |------|------|
241
241
  | omx-memory | 세션 메모리 영속성 |
242
242
 
243
- ### 설치 명령어
243
+ ### Claude Code 설치 명령어
244
244
 
245
245
  ```bash
246
246
  # 마켓플레이스 추가
@@ -0,0 +1,48 @@
1
+ # Agent Eval Guide
2
+
3
+ ## Evaluation Order
4
+
5
+ Agent evaluation uses two phases:
6
+
7
+ 1. **Correctness gate**: verify the task outcome against explicit acceptance criteria.
8
+ 2. **Efficiency review**: compare only correctness-passing runs against an ideal trajectory.
9
+
10
+ Do not optimize step count, tool calls, or latency before correctness is proven.
11
+
12
+ ## Four Metrics
13
+
14
+ | Metric | Definition | Typical Use |
15
+ |--------|------------|-------------|
16
+ | `correctness` | Passed criteria divided by total criteria | Release or completion gate |
17
+ | `step_ratio` | Observed steps divided by ideal steps | Detect avoidable loops |
18
+ | `tool_call_ratio` | Observed tool calls divided by ideal tool calls | Detect noisy retrieval or tool misuse |
19
+ | `latency_ratio` | Observed duration divided by ideal duration | Detect runtime regressions |
20
+
21
+ ## Ideal Trajectory
22
+
23
+ ```yaml
24
+ task: "create a small routing skill"
25
+ capability: "tool_use"
26
+ ideal:
27
+ steps: 5
28
+ tool_calls: 8
29
+ latency_ms: 180000
30
+ acceptance_criteria:
31
+ - "Skill frontmatter is valid"
32
+ - "Routing docs reference the skill"
33
+ - "Tests or static checks pass"
34
+ ```
35
+
36
+ ## Interpreting Ratios
37
+
38
+ - `1.00`: observed matched the ideal.
39
+ - `< 1.00`: faster or shorter than ideal; verify no evidence was skipped.
40
+ - `1.00-1.25`: slightly over the ideal; usually acceptable.
41
+ - `> 1.25`: advisory improvement candidate.
42
+ - `correctness` below `1.00`: fail regardless of efficiency.
43
+
44
+ ## Integration
45
+
46
+ - Use `agent-eval-framework` for task-level scoring.
47
+ - Use `harness-eval` when running repeatable benchmark suites.
48
+ - Use `omcustomcodex:improve-report` to turn repeated ratio regressions into improvement suggestions.
@@ -0,0 +1,6 @@
1
+ name: agent-eval
2
+ description: Quantitative agent evaluation with correctness-first 4-metric evidence
3
+ source:
4
+ type: internal
5
+ files:
6
+ - README.md
@@ -75,6 +75,18 @@ Capture at least one of:
75
75
 
76
76
  When summarizing evidence for the model, preserve reference tokens and URLs so follow-up steps can still target the right page elements.
77
77
 
78
+ ## Build + Vision + Verify Loop
79
+
80
+ For browser-visible changes, treat a successful build as the start of verification, not the end:
81
+
82
+ 1. Build or start the local app.
83
+ 2. Open the page in the available browser surface.
84
+ 3. Capture screenshot, console, and network evidence.
85
+ 4. Feed concrete failures back to the implementation agent.
86
+ 5. Repeat until build, render, and runtime evidence all pass.
87
+
88
+ This is the Codex Browser Use pattern in portable form. Prefer the in-app Browser Use plugin when available; otherwise use Playwright or the existing browser MCP surface.
89
+
78
90
  ## Design And Strategy Workflows
79
91
 
80
92
  ### Product strategy sessions
@@ -40,6 +40,18 @@ guides:
40
40
  source:
41
41
  type: internal
42
42
 
43
+ - name: agent-eval
44
+ description: Quantitative agent evaluation with correctness-first 4-metric evidence
45
+ path: ./agent-eval/
46
+ source:
47
+ type: internal
48
+
49
+ - name: multi-agent-debate-patterns
50
+ description: Anti-groupthink debate patterns for Agora and roundtable-debate workflows
51
+ path: ./multi-agent-debate-patterns/
52
+ source:
53
+ type: internal
54
+
43
55
  # Languages
44
56
  - name: golang
45
57
  description: Go language reference from Effective Go
@@ -0,0 +1,26 @@
1
+ # Multi-Agent Debate Patterns
2
+
3
+ ## Pattern Choice
4
+
5
+ | Pattern | Goal | Use When |
6
+ |---------|------|----------|
7
+ | `agora` | Reach adversarial consensus | Release gates, design approval, high-risk specs |
8
+ | `roundtable-debate` | Preserve dissent | Strategy choices, tradeoffs, ambiguous product or architecture decisions |
9
+
10
+ ## Failure Modes
11
+
12
+ - **Anchoring**: later reviewers inherit the first opinion.
13
+ - **Groupthink**: reviewers converge because convergence looks productive.
14
+ - **Degeneration of thought**: debate continues without adding new evidence.
15
+
16
+ ## Controls
17
+
18
+ 1. Start with independent parallel analysis.
19
+ 2. Assign a devil's advocate.
20
+ 3. Protect minority findings unless explicitly rejected with evidence.
21
+ 4. Cap debate at two rounds.
22
+ 5. Switch from debate to evidence gathering when facts are missing.
23
+
24
+ ## Decision Record
25
+
26
+ Keep the final recommendation, rejected alternatives, and protected dissent together. Future agents should be able to see not only what was chosen, but also which minority risks remain live.
@@ -0,0 +1,6 @@
1
+ name: multi-agent-debate-patterns
2
+ description: Anti-groupthink debate patterns for Agora and roundtable-debate workflows
3
+ source:
4
+ type: internal
5
+ files:
6
+ - README.md
@@ -1,6 +1,6 @@
1
1
  {
2
- "version": "0.4.0",
3
- "lastUpdated": "2026-04-24T14:35:00.000Z",
2
+ "version": "0.4.2",
3
+ "lastUpdated": "2026-04-27T02:00:00.000Z",
4
4
  "components": [
5
5
  {
6
6
  "name": "rules",
@@ -18,13 +18,13 @@
18
18
  "name": "skills",
19
19
  "path": ".agents/skills",
20
20
  "description": "Reusable skill modules (project-scoped repo skills)",
21
- "files": 112
21
+ "files": 116
22
22
  },
23
23
  {
24
24
  "name": "guides",
25
25
  "path": "guides",
26
26
  "description": "Reference documentation",
27
- "files": 40
27
+ "files": 42
28
28
  },
29
29
  {
30
30
  "name": "hooks",
@@ -1,5 +1,5 @@
1
1
  # /pipeline auto-dev — Full-auto release pipeline
2
- # Pre-triages open issues → triage verify-done → plan → implement → verify → PR/release → publish → followup
2
+ # Pre-triages open issues and release-monitor issues → triage verify-done → plan → implement → verify → PR/release → publish → followup
3
3
 
4
4
  name: auto-dev
5
5
  description: "Full-auto release pipeline: pre-triage → triage → plan → implement → verify → PR → publish → followup"
@@ -11,11 +11,11 @@ steps:
11
11
  parallel:
12
12
  - name: pre-triage
13
13
  skill: professor-triage
14
- description: Run professor-triage on open issues that lack verify-done label
15
- condition: "open issues without label:verify-done exist"
14
+ description: Run professor-triage on open issues that lack the verify-done label, including release-monitor issues labeled codex-release or oh-my-codex-release
15
+ condition: "open issues without label:verify-done exist OR open release-monitor issues with label:codex-release or label:oh-my-codex-release exist"
16
16
  - name: triage
17
17
  skill: professor-triage
18
- description: Analyze verify-done issues against current codebase and perform automated triage
18
+ description: Analyze verify-done and release-monitor issues against current codebase and perform automated triage
19
19
 
20
20
  - name: plan
21
21
  depends_on: issue-analysis