oh-my-customcodex 0.4.0 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -4
- package/dist/cli/index.js +2 -9
- package/dist/index.js +1 -1
- package/package.json +1 -1
- package/templates/.claude/agents/mgr-creator.md +11 -0
- package/templates/.claude/output-styles/korean-engineer.md +24 -0
- package/templates/.claude/rules/MUST-agent-design.md +1 -0
- package/templates/.claude/rules/MUST-completion-verification.md +13 -0
- package/templates/.claude/rules/SHOULD-interaction.md +2 -0
- package/templates/.claude/skills/agent-eval-framework/SKILL.md +92 -0
- package/templates/.claude/skills/agora/SKILL.md +11 -0
- package/templates/.claude/skills/codex-exec/SKILL.md +12 -0
- package/templates/.claude/skills/evaluator-optimizer/SKILL.md +20 -0
- package/templates/.claude/skills/harness-eval/SKILL.md +13 -0
- package/templates/.claude/skills/roundtable-debate/SKILL.md +60 -0
- package/templates/AGENTS.md.en +6 -21
- package/templates/AGENTS.md.ko +6 -21
- package/templates/CLAUDE.md.en +3 -3
- package/templates/CLAUDE.md.ko +3 -3
- package/templates/guides/agent-eval/README.md +48 -0
- package/templates/guides/agent-eval/index.yaml +6 -0
- package/templates/guides/browser-automation/README.md +12 -0
- package/templates/guides/index.yaml +12 -0
- package/templates/guides/multi-agent-debate-patterns/README.md +26 -0
- package/templates/guides/multi-agent-debate-patterns/index.yaml +6 -0
- package/templates/manifest.json +4 -4
package/README.md
CHANGED
|
@@ -13,7 +13,7 @@
|
|
|
13
13
|
|
|
14
14
|
**[한국어 문서 (Korean)](./README_ko.md)**
|
|
15
15
|
|
|
16
|
-
49 agents.
|
|
16
|
+
49 agents. 114 skills. 22 rules. One command.
|
|
17
17
|
|
|
18
18
|
```bash
|
|
19
19
|
npm install -g oh-my-customcodex && cd your-project && omcustomcodex init
|
|
@@ -134,7 +134,7 @@ Each agent declares its tools, model, memory scope, and limitations in YAML fron
|
|
|
134
134
|
|
|
135
135
|
---
|
|
136
136
|
|
|
137
|
-
### Skills (
|
|
137
|
+
### Skills (114)
|
|
138
138
|
|
|
139
139
|
| Category | Count | Includes |
|
|
140
140
|
|----------|-------|----------|
|
|
@@ -227,7 +227,7 @@ Key rules: R010 (orchestrator never writes files), R009 (parallel execution mand
|
|
|
227
227
|
|
|
228
228
|
---
|
|
229
229
|
|
|
230
|
-
### Guides (
|
|
230
|
+
### Guides (42)
|
|
231
231
|
|
|
232
232
|
Reference documentation covering best practices, architecture decisions, and integration patterns. The guides live in `guides/` at the project root and span topics from agent design to CI/CD to observability.
|
|
233
233
|
|
|
@@ -286,7 +286,7 @@ your-project/
|
|
|
286
286
|
│ ├── contexts/ # 4 shared context files
|
|
287
287
|
│ └── ontology/ # Knowledge graph for RAG
|
|
288
288
|
├── .agents/
|
|
289
|
-
│ └── skills/ #
|
|
289
|
+
│ └── skills/ # 114 installed skill modules
|
|
290
290
|
└── guides/ # 42 reference documents
|
|
291
291
|
```
|
|
292
292
|
|
package/dist/cli/index.js
CHANGED
|
@@ -3091,7 +3091,7 @@ var init_package = __esm(() => {
|
|
|
3091
3091
|
workspaces: [
|
|
3092
3092
|
"packages/*"
|
|
3093
3093
|
],
|
|
3094
|
-
version: "0.4.
|
|
3094
|
+
version: "0.4.1",
|
|
3095
3095
|
description: "Batteries-included agent harness on top of GPT Codex + OMX",
|
|
3096
3096
|
type: "module",
|
|
3097
3097
|
bin: {
|
|
@@ -29925,14 +29925,7 @@ async function initCommand(options) {
|
|
|
29925
29925
|
await registerProject(targetDir, package_default.version);
|
|
29926
29926
|
} catch {}
|
|
29927
29927
|
console.log("");
|
|
29928
|
-
console.log("
|
|
29929
|
-
console.log(" /plugin marketplace add obra/superpowers-marketplace");
|
|
29930
|
-
console.log(" /plugin install superpowers");
|
|
29931
|
-
console.log(" /plugin install openai-docs");
|
|
29932
|
-
console.log(" /plugin install elements-of-style");
|
|
29933
|
-
console.log(" /plugin install context7");
|
|
29934
|
-
console.log("");
|
|
29935
|
-
console.log('See AGENTS.md "외부 의존성" section for details.');
|
|
29928
|
+
console.log("Codex setup complete. See AGENTS.md for Codex-native MCP and runtime guidance.");
|
|
29936
29929
|
return {
|
|
29937
29930
|
success: true,
|
|
29938
29931
|
message: i18n.t("cli.init.success"),
|
package/dist/index.js
CHANGED
package/package.json
CHANGED
|
@@ -7,6 +7,7 @@ memory: project
|
|
|
7
7
|
effort: high
|
|
8
8
|
skills:
|
|
9
9
|
- create-agent
|
|
10
|
+
- agent-eval-framework
|
|
10
11
|
tools:
|
|
11
12
|
- Read
|
|
12
13
|
- Write
|
|
@@ -36,6 +37,16 @@ Frontmatter (name, description, model, tools, skills, memory) + body (purpose, c
|
|
|
36
37
|
|
|
37
38
|
No registry update needed - agents auto-discovered from `.claude/agents/*.md`.
|
|
38
39
|
|
|
40
|
+
### Phase 4: Optional Quantitative Gate
|
|
41
|
+
|
|
42
|
+
For high-risk or reusable agents, use `agent-eval-framework` after creation:
|
|
43
|
+
|
|
44
|
+
1. Define an ideal trajectory for the agent's first representative task.
|
|
45
|
+
2. Run correctness checks before measuring efficiency.
|
|
46
|
+
3. Record `step_ratio`, `tool_call_ratio`, and `latency_ratio` as advisory evidence.
|
|
47
|
+
|
|
48
|
+
Do not force this gate for every small helper agent. It is opt-in when the extra cost is justified by reuse, safety, or routing criticality.
|
|
49
|
+
|
|
39
50
|
## Rules Applied
|
|
40
51
|
|
|
41
52
|
- R000: All files in English
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: korean-engineer
|
|
3
|
+
description: Korean-first engineering responses with agent identity and evidence-focused completion
|
|
4
|
+
keep-coding-instructions: true
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
# Korean Engineer Output Style
|
|
8
|
+
|
|
9
|
+
Use Korean for user-facing communication unless the user explicitly asks otherwise. Keep code, file contents, identifiers, and commit trailers in English when that is the repository convention.
|
|
10
|
+
|
|
11
|
+
Every response starts with the agent identity block required by the project guidance:
|
|
12
|
+
|
|
13
|
+
```text
|
|
14
|
+
┌─ Agent: {agent-name} / {model}
|
|
15
|
+
│ Skill: {active-skill-or-none}
|
|
16
|
+
└─ Status: {current action or result}
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
Prefer concise, evidence-focused engineering reports:
|
|
20
|
+
|
|
21
|
+
- State the current action or outcome first.
|
|
22
|
+
- Cite concrete verification evidence before declaring completion.
|
|
23
|
+
- Do not claim release, deploy, or publish completion until the external surface has been checked.
|
|
24
|
+
- Keep uncertainty explicit and tied to the missing evidence.
|
|
@@ -254,6 +254,7 @@ Recommended practice:
|
|
|
254
254
|
2. Keep allow rules only as defensive documentation; do not rely on them to suppress sensitive-path prompts.
|
|
255
255
|
3. Do not run unattended Claude Code release automation that writes `templates/.claude/**` unless the workflow can handle interactive approval.
|
|
256
256
|
4. In this Codex port, update `.codex/...` source files and their `templates/.claude/...` mirrors deliberately instead of bulk-copying with shell commands.
|
|
257
|
+
5. For unattended writes to the Claude compatibility templates, use a reviewed temporary script wrapper and verify the resulting diff; direct Bash, Write, or Edit operations targeting `templates/.claude/**` can each trigger the sensitive-path guard.
|
|
257
258
|
|
|
258
259
|
## Separation of Concerns
|
|
259
260
|
|
|
@@ -21,6 +21,19 @@ Before declaring any task `[Done]`, verify completion against task-type-specific
|
|
|
21
21
|
|
|
22
22
|
Before [Done]: (1) Verify ACTUAL outcome not just attempt — "ran command" ≠ "succeeded". (2) Check task-type criteria above. (3) No unchecked items. (4) Would bet $100 it's complete.
|
|
23
23
|
|
|
24
|
+
## Optional: Quantitative Evidence
|
|
25
|
+
|
|
26
|
+
For agent, skill, or workflow changes, completion evidence MAY include `agent-eval-framework` metrics:
|
|
27
|
+
|
|
28
|
+
| Metric | Meaning | Gate |
|
|
29
|
+
|--------|---------|------|
|
|
30
|
+
| `correctness` | Acceptance criteria satisfied | Required if included |
|
|
31
|
+
| `step_ratio` | Observed steps vs. ideal steps | Advisory |
|
|
32
|
+
| `tool_call_ratio` | Observed tool calls vs. ideal tool calls | Advisory |
|
|
33
|
+
| `latency_ratio` | Observed duration vs. ideal duration | Advisory |
|
|
34
|
+
|
|
35
|
+
These metrics strengthen a `[Done]` claim but do not replace task-specific verification. A failed correctness score blocks completion even if efficiency ratios are good.
|
|
36
|
+
|
|
24
37
|
<!-- DETAIL: Self-Check box
|
|
25
38
|
1. Did I verify ACTUAL outcome? "I ran the command" ≠ "the command succeeded" → YES: Continue / NO: Verify first
|
|
26
39
|
2. Does task type have specific criteria? YES: Check each / NO: Apply general verification
|
|
@@ -35,6 +35,8 @@
|
|
|
35
35
|
|
|
36
36
|
## Output Styles
|
|
37
37
|
|
|
38
|
+
Session-level style enforcement belongs in the host's runtime output-style mechanism when the host supports one. In this Codex port, R003 remains the portable source of style-selection rules; the packaged Claude compatibility templates may additionally provide `.claude/output-styles/` presets that reinforce the same constraints.
|
|
39
|
+
|
|
38
40
|
| Style | Trigger | Behavior |
|
|
39
41
|
|-------|---------|----------|
|
|
40
42
|
| `concise` | effort: low, batch operations | Key result only, no preamble, no elaboration |
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: agent-eval-framework
|
|
3
|
+
description: Quantitative agent evaluation using correctness, step ratio, tool-call ratio, and latency ratio
|
|
4
|
+
scope: harness
|
|
5
|
+
user-invocable: true
|
|
6
|
+
argument-hint: "<trace-or-task> [--ideal <path>] [--format markdown|json]"
|
|
7
|
+
effort: high
|
|
8
|
+
version: 1.0.0
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
# Agent Eval Framework
|
|
12
|
+
|
|
13
|
+
## Purpose
|
|
14
|
+
|
|
15
|
+
Evaluate agent runs with a two-phase quantitative gate:
|
|
16
|
+
|
|
17
|
+
1. **Correctness first**: the task must meet its stated acceptance criteria.
|
|
18
|
+
2. **Efficiency second**: only correctness-passing runs are compared by step, tool-call, and latency ratios.
|
|
19
|
+
|
|
20
|
+
This keeps eval pressure useful. A faster run that fails the task is not a better run.
|
|
21
|
+
|
|
22
|
+
## Metric Framework
|
|
23
|
+
|
|
24
|
+
| Metric | Formula | Pass Signal |
|
|
25
|
+
|--------|---------|-------------|
|
|
26
|
+
| `correctness` | `passed_criteria / total_criteria` | `1.0` for release-quality evidence |
|
|
27
|
+
| `step_ratio` | `observed_steps / ideal_steps` | `<= 1.25` preferred |
|
|
28
|
+
| `tool_call_ratio` | `observed_tool_calls / ideal_tool_calls` | `<= 1.25` preferred |
|
|
29
|
+
| `latency_ratio` | `observed_ms / ideal_ms` | `<= 1.50` preferred |
|
|
30
|
+
|
|
31
|
+
Use ratios as advisory evidence unless a task explicitly opts into a stricter gate.
|
|
32
|
+
|
|
33
|
+
## Ideal Trajectory Schema
|
|
34
|
+
|
|
35
|
+
```yaml
|
|
36
|
+
task: "short task name"
|
|
37
|
+
capability: "file_operations | retrieval | tool_use | memory | conversation | summarization"
|
|
38
|
+
ideal:
|
|
39
|
+
steps: 4
|
|
40
|
+
tool_calls: 5
|
|
41
|
+
latency_ms: 120000
|
|
42
|
+
acceptance_criteria:
|
|
43
|
+
- "Criterion one"
|
|
44
|
+
- "Criterion two"
|
|
45
|
+
notes: "Why this ideal path is reasonable"
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
## Capability Taxonomy
|
|
49
|
+
|
|
50
|
+
| Capability | Typical Evidence |
|
|
51
|
+
|------------|------------------|
|
|
52
|
+
| `file_operations` | precise diffs, no unrelated churn, verification after writes |
|
|
53
|
+
| `retrieval` | targeted `rg`/file reads, source references, low duplicate search |
|
|
54
|
+
| `tool_use` | appropriate tool choice, no unnecessary escalation |
|
|
55
|
+
| `memory` | relevant memory used and cited, stale facts re-verified when needed |
|
|
56
|
+
| `conversation` | clear routing, no repeated clarification for known constraints |
|
|
57
|
+
| `summarization` | faithful compression, preserved blockers and evidence |
|
|
58
|
+
|
|
59
|
+
## Workflow
|
|
60
|
+
|
|
61
|
+
1. Define or load an ideal trajectory for the task.
|
|
62
|
+
2. Collect observed run data from trace, transcript, hook output, or manual evidence.
|
|
63
|
+
3. Score correctness against acceptance criteria.
|
|
64
|
+
4. If correctness fails, stop and report failed criteria.
|
|
65
|
+
5. If correctness passes, compute efficiency ratios.
|
|
66
|
+
6. Attach the metric table to the completion evidence or improvement report.
|
|
67
|
+
|
|
68
|
+
## Output Format
|
|
69
|
+
|
|
70
|
+
```markdown
|
|
71
|
+
## Agent Eval Result
|
|
72
|
+
|
|
73
|
+
| Metric | Observed | Ideal | Ratio | Verdict |
|
|
74
|
+
|--------|----------|-------|-------|---------|
|
|
75
|
+
| correctness | 4/4 | 4/4 | 1.00 | pass |
|
|
76
|
+
| steps | 5 | 4 | 1.25 | pass |
|
|
77
|
+
| tool calls | 7 | 5 | 1.40 | advisory |
|
|
78
|
+
| latency | 150s | 120s | 1.25 | pass |
|
|
79
|
+
|
|
80
|
+
Decision: correctness-pass, efficiency-advisory
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
## Integration Points
|
|
84
|
+
|
|
85
|
+
- `harness-eval`: use this framework to add trajectory efficiency evidence to benchmark runs.
|
|
86
|
+
- `evaluator-optimizer`: run correctness before efficiency comparisons.
|
|
87
|
+
- `mgr-creator`: opt in for high-risk new agents where quantitative validation is worth the extra cost.
|
|
88
|
+
- `omcustomcodex:improve-report`: include repeated ratio regressions as improvement suggestions.
|
|
89
|
+
|
|
90
|
+
## Attribution
|
|
91
|
+
|
|
92
|
+
Adapted from the LangChain Deep Agents eval methodology: correctness-first scoring, ideal trajectory annotation, and efficiency ratios for step, tool-call, and latency comparison.
|
|
@@ -43,6 +43,17 @@ source:
|
|
|
43
43
|
Spawn 3 reviewers as Agent Team members:
|
|
44
44
|
|
|
45
45
|
```
|
|
46
|
+
|
|
47
|
+
### Anti-Groupthink Mode
|
|
48
|
+
|
|
49
|
+
Use `--anti-groupthink` when consensus itself is a risk:
|
|
50
|
+
|
|
51
|
+
1. Reviewers submit independent findings before seeing peer output.
|
|
52
|
+
2. One reviewer is assigned as devil's advocate.
|
|
53
|
+
3. Minority findings are preserved unless the synthesis explicitly rejects them with evidence.
|
|
54
|
+
4. Debate is capped at two challenge rounds before the lead either decides or requests more facts.
|
|
55
|
+
|
|
56
|
+
For decisions where dissent preservation is the main goal, use `roundtable-debate` directly instead of `agora`.
|
|
46
57
|
Agent(name: "claude-critic", model: opus, effort: max)
|
|
47
58
|
→ 20-point deep adversarial review
|
|
48
59
|
|
|
@@ -204,3 +204,15 @@ When routing skills detect a code generation task and codex is available:
|
|
|
204
204
|
```
|
|
205
205
|
/codex-exec "Generate {description} following {framework} best practices" --effort high --full-auto
|
|
206
206
|
```
|
|
207
|
+
|
|
208
|
+
## Browser Verify Workflow
|
|
209
|
+
|
|
210
|
+
For frontend or browser-visible changes, use a Build + Vision + Verify loop instead of stopping at a successful build:
|
|
211
|
+
|
|
212
|
+
1. Build or start the local dev server.
|
|
213
|
+
2. Open the target in the available browser automation surface.
|
|
214
|
+
3. Capture screenshot evidence and console/network errors.
|
|
215
|
+
4. If the rendered state is wrong or the console/network output shows errors, run `codex-exec` with the concrete evidence, then repeat from step 1.
|
|
216
|
+
5. Stop only when build, browser render, and error checks all pass.
|
|
217
|
+
|
|
218
|
+
This pattern composes with the Codex App Browser Use plugin or any local browser MCP. Keep the loop evidence-driven: screenshot, console output, network status, and the exact command that produced the build.
|
|
@@ -104,6 +104,26 @@ When `conditional.enabled: true` and ANY `skip_when` condition is met, the evalu
|
|
|
104
104
|
| Complex architecture, security-critical | High | Run with pre-negotiation |
|
|
105
105
|
| Previously failed task retry | Any | Always run |
|
|
106
106
|
|
|
107
|
+
### Quantitative Efficiency Metrics
|
|
108
|
+
|
|
109
|
+
When a task provides an ideal trajectory, the evaluator MAY attach `agent-eval-framework` metrics after the normal quality gate:
|
|
110
|
+
|
|
111
|
+
```yaml
|
|
112
|
+
evaluator-optimizer:
|
|
113
|
+
quantitative_metrics:
|
|
114
|
+
enabled: true
|
|
115
|
+
ideal:
|
|
116
|
+
steps: 4
|
|
117
|
+
tool_calls: 5
|
|
118
|
+
latency_ms: 120000
|
|
119
|
+
advisory_thresholds:
|
|
120
|
+
step_ratio: 1.25
|
|
121
|
+
tool_call_ratio: 1.25
|
|
122
|
+
latency_ratio: 1.50
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
Correctness remains the primary gate. Efficiency ratios are used to compare correctness-passing candidates or to create follow-up improvement suggestions.
|
|
126
|
+
|
|
107
127
|
### Parameter Details
|
|
108
128
|
|
|
109
129
|
| Parameter | Required | Default | Description |
|
|
@@ -86,6 +86,19 @@ This skill provides preset rubrics for the evaluator-optimizer pipeline:
|
|
|
86
86
|
|
|
87
87
|
The evaluator-optimizer skill's `pre_negotiation` phase accepts harness-eval rubric dimensions as sprint contract criteria.
|
|
88
88
|
|
|
89
|
+
## Optional 4-Metric Trajectory Evidence
|
|
90
|
+
|
|
91
|
+
For agent or skill benchmarks, enrich the 0-100 quality score with the `agent-eval-framework` metrics:
|
|
92
|
+
|
|
93
|
+
| Metric | Source | Use |
|
|
94
|
+
|--------|--------|-----|
|
|
95
|
+
| `correctness` | benchmark assertions and acceptance criteria | Required before efficiency is considered |
|
|
96
|
+
| `step_ratio` | observed steps vs. ideal trajectory | Advisory signal for unnecessary loops |
|
|
97
|
+
| `tool_call_ratio` | observed tool calls vs. ideal trajectory | Advisory signal for noisy tool use |
|
|
98
|
+
| `latency_ratio` | observed duration vs. ideal trajectory | Advisory signal for runtime regression |
|
|
99
|
+
|
|
100
|
+
Evaluation order is fixed: correctness first, efficiency second. A benchmark run with failed correctness cannot be rescued by strong efficiency ratios.
|
|
101
|
+
|
|
89
102
|
## Output
|
|
90
103
|
|
|
91
104
|
Results saved to `.codex/outputs/sessions/{YYYY-MM-DD}/harness-eval-{HHmmss}.md` with per-task scores and aggregate grade.
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: roundtable-debate
|
|
3
|
+
description: Structured multi-agent debate that preserves dissent with a mandatory devil's advocate and two-round cap
|
|
4
|
+
scope: core
|
|
5
|
+
user-invocable: true
|
|
6
|
+
argument-hint: "<topic-or-document> [--rounds 1|2] [--decision required|advisory]"
|
|
7
|
+
effort: high
|
|
8
|
+
version: 1.0.0
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
# Roundtable Debate
|
|
12
|
+
|
|
13
|
+
## Purpose
|
|
14
|
+
|
|
15
|
+
Run a bounded debate when convergence would hide useful disagreement. Unlike `agora`, which drives toward consensus, this workflow preserves minority positions and requires explicit justification before dismissing them.
|
|
16
|
+
|
|
17
|
+
## When To Use
|
|
18
|
+
|
|
19
|
+
- Architecture or product choices with multiple defensible paths.
|
|
20
|
+
- Review work where anchoring or groupthink is likely.
|
|
21
|
+
- Decisions where a minority risk could be more important than the majority preference.
|
|
22
|
+
|
|
23
|
+
## Workflow
|
|
24
|
+
|
|
25
|
+
1. **Independent-first analysis**: spawn 3-5 reviewers in parallel. Do not share intermediate opinions before each reviewer submits an initial view.
|
|
26
|
+
2. **Mandatory devil's advocate**: one reviewer argues against the emerging default, even if they personally agree with it.
|
|
27
|
+
3. **Round 1 synthesis**: group findings into majority positions, minority positions, and unresolved facts.
|
|
28
|
+
4. **Round 2 challenge**: reviewers respond only to disputed claims and missing evidence.
|
|
29
|
+
5. **Decision record**: keep the final recommendation and any protected dissent.
|
|
30
|
+
|
|
31
|
+
Hard cap: two debate rounds. If the decision still depends on missing facts, stop and gather evidence instead of debating longer.
|
|
32
|
+
|
|
33
|
+
## Output
|
|
34
|
+
|
|
35
|
+
```markdown
|
|
36
|
+
# Roundtable Debate Result
|
|
37
|
+
|
|
38
|
+
## Topic
|
|
39
|
+
{topic}
|
|
40
|
+
|
|
41
|
+
## Majority Recommendation
|
|
42
|
+
{recommendation}
|
|
43
|
+
|
|
44
|
+
## Protected Dissent
|
|
45
|
+
| Position | Advocate | Why It Was Not Dismissed |
|
|
46
|
+
|----------|----------|--------------------------|
|
|
47
|
+
| {position} | devil's advocate | {evidence or risk} |
|
|
48
|
+
|
|
49
|
+
## Decision
|
|
50
|
+
{adopt | defer | reject | gather-more-evidence}
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
## Relationship To Agora
|
|
54
|
+
|
|
55
|
+
| Workflow | Goal | Best For |
|
|
56
|
+
|----------|------|----------|
|
|
57
|
+
| `agora` | adversarial consensus | release gates, spec approval |
|
|
58
|
+
| `roundtable-debate` | dissent preservation | ambiguous strategy, architectural tradeoffs |
|
|
59
|
+
|
|
60
|
+
Use `agora --anti-groupthink` when you need consensus plus explicit dissent handling.
|
package/templates/AGENTS.md.en
CHANGED
|
@@ -220,38 +220,23 @@ Task tool + routing skills remain the fallback for simple/cost-sensitive tasks.
|
|
|
220
220
|
|
|
221
221
|
## External Dependencies
|
|
222
222
|
|
|
223
|
-
###
|
|
224
|
-
|
|
225
|
-
Install via `/plugin install <name>`:
|
|
226
|
-
|
|
227
|
-
| Plugin | Source | Purpose |
|
|
228
|
-
|--------|--------|---------|
|
|
229
|
-
| superpowers | claude-plugins-official | TDD, debugging, collaboration patterns |
|
|
230
|
-
| openai-docs | superpowers-marketplace | OpenAI and Codex development documentation |
|
|
231
|
-
| elements-of-style | superpowers-marketplace | Writing clarity guidelines |
|
|
232
|
-
| obsidian-skills | - | Obsidian markdown support |
|
|
233
|
-
| context7 | claude-plugins-official | Library documentation lookup |
|
|
234
|
-
|
|
235
|
-
### Recommended MCP Servers
|
|
223
|
+
### Recommended Codex MCP Servers
|
|
236
224
|
|
|
237
225
|
| Server | Purpose |
|
|
238
226
|
|--------|---------|
|
|
239
227
|
| omx-memory | Session memory persistence (Chroma-based) |
|
|
228
|
+
| context7 | Library documentation lookup MCP server when a project needs it |
|
|
240
229
|
|
|
241
230
|
### Setup Commands
|
|
242
231
|
|
|
243
232
|
```bash
|
|
244
|
-
# Add marketplace
|
|
245
|
-
/plugin marketplace add obra/superpowers-marketplace
|
|
246
|
-
|
|
247
|
-
# Install plugins
|
|
248
|
-
/plugin install superpowers
|
|
249
|
-
/plugin install openai-docs
|
|
250
|
-
/plugin install elements-of-style
|
|
251
|
-
|
|
252
233
|
# MCP setup (omx-memory)
|
|
253
234
|
npm install -g omx-memory
|
|
254
235
|
omx-memory setup
|
|
255
236
|
```
|
|
256
237
|
|
|
238
|
+
### Claude Code Compatibility Note
|
|
239
|
+
|
|
240
|
+
Projects that run in the Claude Code plugin ecosystem may separately install plugins such as `superpowers`, `openai-docs`, `elements-of-style`, and `context7`. They are not required Codex init steps.
|
|
241
|
+
|
|
257
242
|
<!-- omcodex:git-workflow -->
|
package/templates/AGENTS.md.ko
CHANGED
|
@@ -220,38 +220,23 @@ Codex CLI의 Agent Teams 기능이 활성화되어 있으면 (`OMCODEX_AGENT_TEA
|
|
|
220
220
|
|
|
221
221
|
## 외부 의존성
|
|
222
222
|
|
|
223
|
-
###
|
|
224
|
-
|
|
225
|
-
`/plugin install <이름>`으로 설치:
|
|
226
|
-
|
|
227
|
-
| 플러그인 | 소스 | 용도 |
|
|
228
|
-
|----------|------|------|
|
|
229
|
-
| superpowers | claude-plugins-official | TDD, 디버깅, 협업 패턴 |
|
|
230
|
-
| openai-docs | superpowers-marketplace | Codex CLI 개발 문서 |
|
|
231
|
-
| elements-of-style | superpowers-marketplace | 글쓰기 명확성 가이드라인 |
|
|
232
|
-
| obsidian-skills | - | 옵시디언 마크다운 지원 |
|
|
233
|
-
| context7 | claude-plugins-official | 라이브러리 문서 조회 |
|
|
234
|
-
|
|
235
|
-
### 권장 MCP 서버
|
|
223
|
+
### Codex 권장 MCP 서버
|
|
236
224
|
|
|
237
225
|
| 서버 | 용도 |
|
|
238
226
|
|------|------|
|
|
239
227
|
| omx-memory | 세션 메모리 영속성 (Chroma 기반) |
|
|
228
|
+
| context7 | 라이브러리 문서 조회용 MCP 서버 (프로젝트 필요 시 설정) |
|
|
240
229
|
|
|
241
230
|
### 설치 명령어
|
|
242
231
|
|
|
243
232
|
```bash
|
|
244
|
-
# 마켓플레이스 추가
|
|
245
|
-
/plugin marketplace add obra/superpowers-marketplace
|
|
246
|
-
|
|
247
|
-
# 플러그인 설치
|
|
248
|
-
/plugin install superpowers
|
|
249
|
-
/plugin install openai-docs
|
|
250
|
-
/plugin install elements-of-style
|
|
251
|
-
|
|
252
233
|
# MCP 설정 (omx-memory)
|
|
253
234
|
npm install -g omx-memory
|
|
254
235
|
omx-memory setup
|
|
255
236
|
```
|
|
256
237
|
|
|
238
|
+
### Claude Code 호환 참고
|
|
239
|
+
|
|
240
|
+
Claude Code 플러그인 생태계를 쓰는 프로젝트에서는 `superpowers`, `openai-docs`, `elements-of-style`, `context7` 같은 플러그인을 별도로 설치할 수 있습니다. Codex 초기화의 필수 단계는 아닙니다.
|
|
241
|
+
|
|
257
242
|
<!-- omcodex:git-workflow -->
|
package/templates/CLAUDE.md.en
CHANGED
|
@@ -222,9 +222,9 @@ Task tool + routing skills remain the fallback for simple/cost-sensitive tasks.
|
|
|
222
222
|
|
|
223
223
|
## External Dependencies
|
|
224
224
|
|
|
225
|
-
###
|
|
225
|
+
### Claude Code Plugins
|
|
226
226
|
|
|
227
|
-
Install via `/plugin install <name>`:
|
|
227
|
+
Install in Claude Code via `/plugin install <name>`:
|
|
228
228
|
|
|
229
229
|
| Plugin | Source | Purpose |
|
|
230
230
|
|--------|--------|---------|
|
|
@@ -240,7 +240,7 @@ Install via `/plugin install <name>`:
|
|
|
240
240
|
|--------|---------|
|
|
241
241
|
| omx-memory | Session memory persistence |
|
|
242
242
|
|
|
243
|
-
### Setup Commands
|
|
243
|
+
### Claude Code Setup Commands
|
|
244
244
|
|
|
245
245
|
```bash
|
|
246
246
|
# Add marketplace
|
package/templates/CLAUDE.md.ko
CHANGED
|
@@ -222,9 +222,9 @@ Codex CLI의 Agent Teams 기능이 활성화되어 있으면 (`OMCODEX_AGENT_TEA
|
|
|
222
222
|
|
|
223
223
|
## 외부 의존성
|
|
224
224
|
|
|
225
|
-
###
|
|
225
|
+
### Claude Code 플러그인
|
|
226
226
|
|
|
227
|
-
`/plugin install <이름>`으로 설치:
|
|
227
|
+
Claude Code 환경에서 `/plugin install <이름>`으로 설치:
|
|
228
228
|
|
|
229
229
|
| 플러그인 | 소스 | 용도 |
|
|
230
230
|
|----------|------|------|
|
|
@@ -240,7 +240,7 @@ Codex CLI의 Agent Teams 기능이 활성화되어 있으면 (`OMCODEX_AGENT_TEA
|
|
|
240
240
|
|------|------|
|
|
241
241
|
| omx-memory | 세션 메모리 영속성 |
|
|
242
242
|
|
|
243
|
-
### 설치 명령어
|
|
243
|
+
### Claude Code 설치 명령어
|
|
244
244
|
|
|
245
245
|
```bash
|
|
246
246
|
# 마켓플레이스 추가
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
# Agent Eval Guide
|
|
2
|
+
|
|
3
|
+
## Evaluation Order
|
|
4
|
+
|
|
5
|
+
Agent evaluation uses two phases:
|
|
6
|
+
|
|
7
|
+
1. **Correctness gate**: verify the task outcome against explicit acceptance criteria.
|
|
8
|
+
2. **Efficiency review**: compare only correctness-passing runs against an ideal trajectory.
|
|
9
|
+
|
|
10
|
+
Do not optimize step count or latency before correctness is proven.
|
|
11
|
+
|
|
12
|
+
## Four Metrics
|
|
13
|
+
|
|
14
|
+
| Metric | Definition | Typical Use |
|
|
15
|
+
|--------|------------|-------------|
|
|
16
|
+
| `correctness` | Passed criteria divided by total criteria | Release or completion gate |
|
|
17
|
+
| `step_ratio` | Observed steps divided by ideal steps | Detect avoidable loops |
|
|
18
|
+
| `tool_call_ratio` | Observed tool calls divided by ideal tool calls | Detect noisy retrieval or tool misuse |
|
|
19
|
+
| `latency_ratio` | Observed duration divided by ideal duration | Detect runtime regressions |
|
|
20
|
+
|
|
21
|
+
## Ideal Trajectory
|
|
22
|
+
|
|
23
|
+
```yaml
|
|
24
|
+
task: "create a small routing skill"
|
|
25
|
+
capability: "tool_use"
|
|
26
|
+
ideal:
|
|
27
|
+
steps: 5
|
|
28
|
+
tool_calls: 8
|
|
29
|
+
latency_ms: 180000
|
|
30
|
+
acceptance_criteria:
|
|
31
|
+
- "Skill frontmatter is valid"
|
|
32
|
+
- "Routing docs reference the skill"
|
|
33
|
+
- "Tests or static checks pass"
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## Interpreting Ratios
|
|
37
|
+
|
|
38
|
+
- `1.00`: observed matched the ideal.
|
|
39
|
+
- `< 1.00`: faster or shorter than ideal; verify no evidence was skipped.
|
|
40
|
+
- `1.00-1.25`: usually acceptable (up to `1.50` for `latency_ratio`).
|
|
41
|
+
- `> 1.25` (`> 1.50` for `latency_ratio`): advisory improvement candidate.
|
|
42
|
+
- correctness below `1.00`: fail regardless of efficiency.
|
|
43
|
+
|
|
44
|
+
## Integration
|
|
45
|
+
|
|
46
|
+
- Use `agent-eval-framework` for task-level scoring.
|
|
47
|
+
- Use `harness-eval` when running repeatable benchmark suites.
|
|
48
|
+
- Use `omcustomcodex:improve-report` to turn repeated ratio regressions into improvement suggestions.
|
|
@@ -75,6 +75,18 @@ Capture at least one of:
|
|
|
75
75
|
|
|
76
76
|
When summarizing evidence for the model, preserve reference tokens and URLs so follow-up steps can still target the right page elements.
|
|
77
77
|
|
|
78
|
+
## Build + Vision + Verify Loop
|
|
79
|
+
|
|
80
|
+
For browser-visible changes, treat a successful build as the start of verification, not the end:
|
|
81
|
+
|
|
82
|
+
1. Build or start the local app.
|
|
83
|
+
2. Open the page in the available browser surface.
|
|
84
|
+
3. Capture screenshot, console, and network evidence.
|
|
85
|
+
4. Feed concrete failures back to the implementation agent.
|
|
86
|
+
5. Repeat until build, render, and runtime evidence all pass.
|
|
87
|
+
|
|
88
|
+
This is the Codex Browser Use pattern in portable form. Prefer the in-app Browser Use plugin when available; otherwise use Playwright or the existing browser MCP surface.
|
|
89
|
+
|
|
78
90
|
## Design And Strategy Workflows
|
|
79
91
|
|
|
80
92
|
### Product strategy sessions
|
|
@@ -40,6 +40,18 @@ guides:
|
|
|
40
40
|
source:
|
|
41
41
|
type: internal
|
|
42
42
|
|
|
43
|
+
- name: agent-eval
|
|
44
|
+
description: Quantitative agent evaluation with correctness-first 4-metric evidence
|
|
45
|
+
path: ./agent-eval/
|
|
46
|
+
source:
|
|
47
|
+
type: internal
|
|
48
|
+
|
|
49
|
+
- name: multi-agent-debate-patterns
|
|
50
|
+
description: Anti-groupthink debate patterns for Agora and roundtable-debate workflows
|
|
51
|
+
path: ./multi-agent-debate-patterns/
|
|
52
|
+
source:
|
|
53
|
+
type: internal
|
|
54
|
+
|
|
43
55
|
# Languages
|
|
44
56
|
- name: golang
|
|
45
57
|
description: Go language reference from Effective Go
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# Multi-Agent Debate Patterns
|
|
2
|
+
|
|
3
|
+
## Pattern Choice
|
|
4
|
+
|
|
5
|
+
| Pattern | Goal | Use When |
|
|
6
|
+
|---------|------|----------|
|
|
7
|
+
| `agora` | Reach adversarial consensus | Release gates, design approval, high-risk specs |
|
|
8
|
+
| `roundtable-debate` | Preserve dissent | Strategy choices, tradeoffs, ambiguous product or architecture decisions |
|
|
9
|
+
|
|
10
|
+
## Failure Modes
|
|
11
|
+
|
|
12
|
+
- **Anchoring**: later reviewers inherit the first opinion.
|
|
13
|
+
- **Groupthink**: reviewers converge because convergence looks productive.
|
|
14
|
+
- **Degeneration of thought**: debate continues without adding new evidence.
|
|
15
|
+
|
|
16
|
+
## Controls
|
|
17
|
+
|
|
18
|
+
1. Start with independent parallel analysis.
|
|
19
|
+
2. Assign a devil's advocate.
|
|
20
|
+
3. Protect minority findings unless explicitly rejected with evidence.
|
|
21
|
+
4. Cap debate at two rounds.
|
|
22
|
+
5. Switch from debate to evidence gathering when facts are missing.
|
|
23
|
+
|
|
24
|
+
## Decision Record
|
|
25
|
+
|
|
26
|
+
Keep the final recommendation, rejected alternatives, and protected dissent together. Future agents should be able to see not only what was chosen, but also which minority risks remain live.
|
package/templates/manifest.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
|
-
"version": "0.4.
|
|
3
|
-
"lastUpdated": "2026-04-
|
|
2
|
+
"version": "0.4.1",
|
|
3
|
+
"lastUpdated": "2026-04-27T01:00:00.000Z",
|
|
4
4
|
"components": [
|
|
5
5
|
{
|
|
6
6
|
"name": "rules",
|
|
@@ -18,13 +18,13 @@
|
|
|
18
18
|
"name": "skills",
|
|
19
19
|
"path": ".agents/skills",
|
|
20
20
|
"description": "Reusable skill modules (project-scoped repo skills)",
|
|
21
|
-
"files":
|
|
21
|
+
"files": 114
|
|
22
22
|
},
|
|
23
23
|
{
|
|
24
24
|
"name": "guides",
|
|
25
25
|
"path": "guides",
|
|
26
26
|
"description": "Reference documentation",
|
|
27
|
-
"files":
|
|
27
|
+
"files": 42
|
|
28
28
|
},
|
|
29
29
|
{
|
|
30
30
|
"name": "hooks",
|