@automagik/genie 4.260409.13 → 4.260409.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107) hide show
  1. package/.claude-plugin/marketplace.json +1 -1
  2. package/.genie/brainstorm.md +2 -0
  3. package/.genie/wishes/genie-wishes-backup/genie-metaverse/WISH.md +339 -0
  4. package/.genie/wishes/genie-wishes-backup/genie-simulations/WISH.md +488 -0
  5. package/.genie/wishes/genie-wishes-backup/unify-bridge-revamp-skills/WISH.md +271 -0
  6. package/.genie/wishes/unify-bridge-revamp-skills/WISH.md +270 -0
  7. package/dist/genie.js +632 -636
  8. package/knip.json +1 -0
  9. package/package.json +3 -1
  10. package/plugins/genie/.claude-plugin/plugin.json +1 -1
  11. package/plugins/genie/package.json +1 -1
  12. package/scripts/skills-audit.ts +116 -0
  13. package/scripts/skills-lint.ts +140 -0
  14. package/src/genie-commands/doctor.ts +40 -25
  15. package/src/lib/bridge-status.test.ts +114 -0
  16. package/src/lib/bridge-status.ts +162 -0
  17. package/src/services/__tests__/omni-bridge-pidfile.test.ts +136 -0
  18. package/src/services/omni-bridge.ts +159 -0
  19. package/src/term-commands/omni.ts +6 -133
  20. package/src/term-commands/serve.ts +12 -8
  21. /package/.genie/wishes/{agent-flexibility-guide → genie-wishes-backup/agent-flexibility-guide}/WISH.md +0 -0
  22. /package/.genie/wishes/{agent-stability-hardening → genie-wishes-backup/agent-stability-hardening}/WISH.md +0 -0
  23. /package/.genie/wishes/{auto-orchestrate → genie-wishes-backup/auto-orchestrate}/WISH.md +0 -0
  24. /package/.genie/wishes/{daily-metrics-agent → genie-wishes-backup/daily-metrics-agent}/WISH.md +0 -0
  25. /package/.genie/wishes/{docs-overhaul → genie-wishes-backup/docs-overhaul}/WISH.md +0 -0
  26. /package/.genie/wishes/{docs-readme-review → genie-wishes-backup/docs-readme-review}/AUDIT-REPORT.md +0 -0
  27. /package/.genie/wishes/{docs-readme-review → genie-wishes-backup/docs-readme-review}/WISH.md +0 -0
  28. /package/.genie/wishes/{dx-800-closeout → genie-wishes-backup/dx-800-closeout}/WISH.md +0 -0
  29. /package/.genie/wishes/{feature-matrix-page → genie-wishes-backup/feature-matrix-page}/WISH.md +0 -0
  30. /package/.genie/wishes/{fire-and-forget → genie-wishes-backup/fire-and-forget}/WISH.md +0 -0
  31. /package/.genie/wishes/{fix-agent-join-delay → genie-wishes-backup/fix-agent-join-delay}/WISH.md +0 -0
  32. /package/.genie/wishes/{fix-cli-polish → genie-wishes-backup/fix-cli-polish}/WISH.md +0 -0
  33. /package/.genie/wishes/{fix-depends-parser → genie-wishes-backup/fix-depends-parser}/WISH.md +0 -0
  34. /package/.genie/wishes/{fix-dispatch-initial-prompt → genie-wishes-backup/fix-dispatch-initial-prompt}/WISH.md +0 -0
  35. /package/.genie/wishes/{fix-first-run → genie-wishes-backup/fix-first-run}/WISH.md +0 -0
  36. /package/.genie/wishes/{fix-genie-v4-stability → genie-wishes-backup/fix-genie-v4-stability}/WISH.md +0 -0
  37. /package/.genie/wishes/{fix-metrics-agent → genie-wishes-backup/fix-metrics-agent}/WISH.md +0 -0
  38. /package/.genie/wishes/{fix-native-inbox → genie-wishes-backup/fix-native-inbox}/WISH.md +0 -0
  39. /package/.genie/wishes/{fix-omni-bridge-hardening → genie-wishes-backup/fix-omni-bridge-hardening}/WISH.md +0 -0
  40. /package/.genie/wishes/{fix-release-blockers → genie-wishes-backup/fix-release-blockers}/WISH.md +0 -0
  41. /package/.genie/wishes/{fix-session-continue-fallback → genie-wishes-backup/fix-session-continue-fallback}/WISH.md +0 -0
  42. /package/.genie/wishes/{fix-session-hook-channel → genie-wishes-backup/fix-session-hook-channel}/WISH.md +0 -0
  43. /package/.genie/wishes/{fix-session-uuid-resume → genie-wishes-backup/fix-session-uuid-resume}/WISH.md +0 -0
  44. /package/.genie/wishes/{fix-short-name-resolver → genie-wishes-backup/fix-short-name-resolver}/WISH.md +0 -0
  45. /package/.genie/wishes/{fix-task-project → genie-wishes-backup/fix-task-project}/WISH.md +0 -0
  46. /package/.genie/wishes/{fix-team-lead-exit → genie-wishes-backup/fix-team-lead-exit}/WISH.md +0 -0
  47. /package/.genie/wishes/{fix-team-lead-polling → genie-wishes-backup/fix-team-lead-polling}/WISH.md +0 -0
  48. /package/.genie/wishes/{fix-test-safety → genie-wishes-backup/fix-test-safety}/WISH.md +0 -0
  49. /package/.genie/wishes/{fix-tmux-dual-statusbar → genie-wishes-backup/fix-tmux-dual-statusbar}/WISH.md +0 -0
  50. /package/.genie/wishes/{fix-tmux-session-explosion → genie-wishes-backup/fix-tmux-session-explosion}/WISH.md +0 -0
  51. /package/.genie/wishes/{fix-trust-prompt → genie-wishes-backup/fix-trust-prompt}/WISH.md +0 -0
  52. /package/.genie/wishes/{genie-app-v2-ui → genie-wishes-backup/genie-app-v2-ui}/WISH.md +0 -0
  53. /package/.genie/wishes/{genie-base-skill → genie-wishes-backup/genie-base-skill}/WISH.md +0 -0
  54. /package/.genie/wishes/{genie-boards-workflows → genie-wishes-backup/genie-boards-workflows}/WISH.md +0 -0
  55. /package/.genie/wishes/{genie-dx-polish → genie-wishes-backup/genie-dx-polish}/WISH.md +0 -0
  56. /package/.genie/wishes/{genie-export-import → genie-wishes-backup/genie-export-import}/WISH.md +0 -0
  57. /package/.genie/wishes/{genie-final-polish → genie-wishes-backup/genie-final-polish}/WISH.md +0 -0
  58. /package/.genie/wishes/{genie-hacks-community-docs → genie-wishes-backup/genie-hacks-community-docs}/WISH.md +0 -0
  59. /package/.genie/wishes/{genie-hardening → genie-wishes-backup/genie-hardening}/WISH.md +0 -0
  60. /package/.genie/wishes/{genie-item-registry → genie-wishes-backup/genie-item-registry}/WISH.md +0 -0
  61. /package/.genie/wishes/{genie-observability → genie-wishes-backup/genie-observability}/WISH.md +0 -0
  62. /package/.genie/wishes/{genie-omni-marriage → genie-wishes-backup/genie-omni-marriage}/WISH.md +0 -0
  63. /package/.genie/wishes/{genie-orchestration-fix → genie-wishes-backup/genie-orchestration-fix}/WISH.md +0 -0
  64. /package/.genie/wishes/{genie-resume → genie-wishes-backup/genie-resume}/WISH.md +0 -0
  65. /package/.genie/wishes/{genie-scheduler → genie-wishes-backup/genie-scheduler}/WISH.md +0 -0
  66. /package/.genie/wishes/{genie-stats-command → genie-wishes-backup/genie-stats-command}/WISH.md +0 -0
  67. /package/.genie/wishes/{genie-task-tables → genie-wishes-backup/genie-task-tables}/WISH.md +0 -0
  68. /package/.genie/wishes/{genie-workflow-engine → genie-wishes-backup/genie-workflow-engine}/WISH.md +0 -0
  69. /package/.genie/wishes/{hook-only-first-install → genie-wishes-backup/hook-only-first-install}/WISH.md +0 -0
  70. /package/.genie/wishes/{inbox-driven-sessions → genie-wishes-backup/inbox-driven-sessions}/WISH.md +0 -0
  71. /package/.genie/wishes/{messaging-refresh → genie-wishes-backup/messaging-refresh}/WISH.md +0 -0
  72. /package/.genie/wishes/{multi-agent-session-isolation → genie-wishes-backup/multi-agent-session-isolation}/WISH.md +0 -0
  73. /package/.genie/wishes/{omni-session-isolation → genie-wishes-backup/omni-session-isolation}/WISH.md +0 -0
  74. /package/.genie/wishes/{parallel-execution → genie-wishes-backup/parallel-execution}/WISH.md +0 -0
  75. /package/.genie/wishes/{pg-state-migration → genie-wishes-backup/pg-state-migration}/WISH.md +0 -0
  76. /package/.genie/wishes/{pg-total-migration → genie-wishes-backup/pg-total-migration}/WISH.md +0 -0
  77. /package/.genie/wishes/{pgserve-embed → genie-wishes-backup/pgserve-embed}/WISH.md +0 -0
  78. /package/.genie/wishes/{qa-dev-to-main → genie-wishes-backup/qa-dev-to-main}/WISH.md +0 -0
  79. /package/.genie/wishes/{readme-v4-agent-first → genie-wishes-backup/readme-v4-agent-first}/WISH.md +0 -0
  80. /package/.genie/wishes/{resilient-messaging → genie-wishes-backup/resilient-messaging}/WISH.md +0 -0
  81. /package/.genie/wishes/{resilient-resume → genie-wishes-backup/resilient-resume}/WISH.md +0 -0
  82. /package/.genie/wishes/{session-auto-create → genie-wishes-backup/session-auto-create}/WISH.md +0 -0
  83. /package/.genie/wishes/{session-continue-by-name → genie-wishes-backup/session-continue-by-name}/WISH.md +0 -0
  84. /package/.genie/wishes/{skills-v4-upgrade → genie-wishes-backup/skills-v4-upgrade}/WISH.md +0 -0
  85. /package/.genie/wishes/{stable-release-fixes → genie-wishes-backup/stable-release-fixes}/WISH.md +0 -0
  86. /package/.genie/wishes/{task-auto-close-on-merge → genie-wishes-backup/task-auto-close-on-merge}/WISH.md +0 -0
  87. /package/.genie/wishes/{task-external-linking → genie-wishes-backup/task-external-linking}/WISH.md +0 -0
  88. /package/.genie/wishes/{task-lifecycle-foundation → genie-wishes-backup/task-lifecycle-foundation}/WISH.md +0 -0
  89. /package/.genie/wishes/{task-projects → genie-wishes-backup/task-projects}/WISH.md +0 -0
  90. /package/.genie/wishes/{team-lead-minimal → genie-wishes-backup/team-lead-minimal}/WISH.md +0 -0
  91. /package/.genie/wishes/{test-pg-ram-isolation → genie-wishes-backup/test-pg-ram-isolation}/TRACE.md +0 -0
  92. /package/.genie/wishes/{test-pg-ram-isolation → genie-wishes-backup/test-pg-ram-isolation}/WISH.md +0 -0
  93. /package/.genie/wishes/{test-schema-isolation → genie-wishes-backup/test-schema-isolation}/WISH.md +0 -0
  94. /package/.genie/wishes/{tmux-split-tabbar → genie-wishes-backup/tmux-split-tabbar}/WISH.md +0 -0
  95. /package/.genie/wishes/{tmux-tui → genie-wishes-backup/tmux-tui}/WISH.md +0 -0
  96. /package/.genie/wishes/{transcript-docs → genie-wishes-backup/transcript-docs}/WISH.md +0 -0
  97. /package/.genie/wishes/{unified-executor-layer → genie-wishes-backup/unified-executor-layer}/AUDIT.md +0 -0
  98. /package/.genie/wishes/{unified-executor-layer → genie-wishes-backup/unified-executor-layer}/WISH.md +0 -0
  99. /package/.genie/wishes/{unified-omni-bridge → genie-wishes-backup/unified-omni-bridge}/INTEGRATION-FROM-OMNI.md +0 -0
  100. /package/.genie/wishes/{unified-omni-bridge → genie-wishes-backup/unified-omni-bridge}/WISH.md +0 -0
  101. /package/.genie/wishes/{unique-leader-names → genie-wishes-backup/unique-leader-names}/WISH.md +0 -0
  102. /package/.genie/wishes/{v3-fixes-release → genie-wishes-backup/v3-fixes-release}/WISH.md +0 -0
  103. /package/.genie/wishes/{v4-critical-fixes → genie-wishes-backup/v4-critical-fixes}/WISH.md +0 -0
  104. /package/.genie/wishes/{voice-personality-pass → genie-wishes-backup/voice-personality-pass}/WISH.md +0 -0
  105. /package/.genie/wishes/{work-fire-forget → genie-wishes-backup/work-fire-forget}/WISH.md +0 -0
  106. /package/.genie/wishes/{workflow-engine-runtime → genie-wishes-backup/workflow-engine-runtime}/WISH.md +0 -0
  107. /package/.genie/wishes/{worktree-out-of-repo → genie-wishes-backup/worktree-out-of-repo}/WISH.md +0 -0
@@ -0,0 +1,488 @@
1
+ # Wish: Genie Simulations — Agent Evaluation via Real Multi-Turn Conversations
2
+
3
+ | Field | Value |
4
+ |-------|-------|
5
+ | **Status** | DRAFT |
6
+ | **Slug** | `genie-simulations` |
7
+ | **Date** | 2026-04-09 |
8
+ | **Design** | [DRAFT.md](../../brainstorms/genie-simulations/DRAFT.md) |
9
+
10
+ ## Summary
11
+
12
+ Build a simulation and evaluation pipeline for genie agents. Sim agents (full genie agents playing human personas) converse with a target agent through real Omni sessions. Each agent gets 100 curated scenarios; each scenario is scored 0.00-1.00 via compound LLM-as-judge + human annotation, and the suite aggregates to a 0-100 score. Scenarios are extracted from real WhatsApp conversations with PII anonymized. All data persists to PG for full transcript reconstruction in genie-app.
13
+
14
+ ## Scope
15
+
16
+ ### IN
17
+ - PG schema: simulation tables (runs, scenarios, turns, scores, annotations)
18
+ - Simulation service (`src/services/simulator/`) — orchestrator, scenario loader, scoring pipeline
19
+ - Sim agent convention: `.genie/simulations/<scenario>/AGENTS.md` with standard genie frontmatter
20
+ - Scenario extraction: `genie sim create` — pull real Omni conversation, anonymize PII, generate sim agent scaffold
21
+ - Simulation runner: `genie sim run <scenario>` (single) and `genie sim run --all` (full suite, wave-based concurrency)
22
+ - Real Omni session injection — sim agents and target agents communicate through a dedicated sim Omni instance
23
+ - Compound scoring: 9 dimensions (script adherence, goal completion, response quality, latency, tool usage, recovery, hallucination, instruction compliance, human eval)
24
+ - LLM-as-judge per dimension with reasoning + evidence stored in PG
25
+ - `genie sim results` — per-scenario and aggregate score display
26
+ - `genie sim done` — sim agent exit signal, triggers scoring and teardown
27
+ - `/simulate` skill for interactive simulation piloting
28
+ - `simulator` agent type in genie's built-in agent registry
29
+ - CLI namespace: `genie sim` with subcommands (create, run, results, annotate, list, done)
30
+ - Session lifecycle: same mechanics as omni-bridge (PG tracking, 100-turn hard cap, stale cleanup)
31
+ - Genie-app Simulations view: runs dashboard, scenario list, transcript replay with WhatsApp-like chat bubbles, per-dimension score breakdown, human annotation (thumbs up/down per turn)
32
+ - Backend NATS subjects + PG queries for simulation data (follows existing pg-bridge pattern)
33
+ - Manifest + component registration for Simulations nav item
34
+
35
+ ### OUT
36
+ - Fully automatic scenario discovery / auto-generation (v1 is manual curation only: a human selects each conversation and runs `genie sim create`)
37
+ - Real-time production monitoring (offline evaluation only)
38
+ - Eugenia-specific scenario content (this wish builds the framework; scenario authoring is per-agent work)
39
+ - `@langwatch/scenario` OTel integration (evaluated during brainstorm, not needed — we build our own scoring pipeline)
40
+
41
+ ## Decisions
42
+
43
+ | Decision | Rationale |
44
+ |----------|-----------|
45
+ | Sim agents are full genie agents (AGENTS.md, frontmatter) | Same spawn mechanics, no special-casing. A sim agent is just an agent playing a human persona |
46
+ | Sim agents live under `.genie/simulations/` not `.genie/agents/` | Keeps sim agents separate from production agents, all sim context co-located |
47
+ | Conversations flow through real Omni (not mocked) | Tests the full stack: NATS, bridge, executor, tools, latency, concurrency. Target agent is unaware |
48
+ | Dedicated sim Omni instance/channel for side effects | Handoffs, replies, tool side effects flow to sim channel — no per-agent customization needed |
49
+ | Manual scenario curation (not automated) | Quality over quantity. Real conversations are manually selected and PII-anonymized |
50
+ | 100-turn hard cap with goal-based exit | `genie sim done` as primary exit, turn cap as safety net. Loops score 0 — that's valid data |
51
+ | PG stores full transcript (every turn, tool call, latency) | Enables genie-app to reconstruct simulations for human annotation |
52
+ | LLM-as-judge with per-dimension reasoning | Brain repo pattern (answer-judge.ts). Reasoning stored for transparency and debugging |
53
+ | Human annotation is a weight in the score, not a gate | Annotations are post-hoc via genie-app, feed into aggregate score. Sims run without waiting for humans |
54
+ | Inside genie core (`src/services/simulator/`) not a plugin | Needs executor, PG, NATS internals. Same codebase as omni-bridge |
55
+ | Target agent spawned from specific git branch | `genie sim run --target agent@branch` — enables before/after comparison across branches |
56
+ | Genie-app view follows existing patterns | Same stack: React 19 + Vite + NATS req/reply + pg-bridge. Lazy-loaded view in manifest.ts/components.ts. Reuses shared components (ChatBubble, SearchBar, KpiCard, LoadingState) |
57
+ | Annotation UI in app, not CLI-only | Human annotation requires reviewing full transcripts with context — WhatsApp-like chat replay is far more usable than CLI for this |
58
+
59
+ ## Success Criteria
60
+
61
+ - [ ] `genie sim create <name> --source <omni-chat-id>` extracts a real conversation into an anonymized sim agent under `.genie/simulations/`
62
+ - [ ] `genie sim run <scenario> --target <agent>@<branch>` spawns both agents through Omni, produces a scored result in PG
63
+ - [ ] `genie sim run --all --target <agent>@<branch>` runs all scenarios with wave-based concurrency, produces aggregate 0-100 score
64
+ - [ ] `genie sim results [--run <id>]` shows per-scenario and aggregate scores from PG
65
+ - [ ] `genie sim list` lists available scenarios for an agent
66
+ - [ ] `genie sim annotate <scenario> --run <id>` allows human verdict (up/down) per turn or scenario, stored in PG
67
+ - [ ] Sim agent is a standard genie agent (AGENTS.md frontmatter, same spawn mechanics as any agent)
68
+ - [ ] Target agent runs from specified git branch, completely unaware it's being simulated
69
+ - [ ] All conversations flow through real Omni instance (dedicated sim channel captures side effects)
70
+ - [ ] LLM-as-judge scores each scenario across applicable dimensions with reasoning persisted
71
+ - [ ] Full transcript (turns, tool calls, latency) stored in PG, sufficient for UI reconstruction
72
+ - [ ] Scenarios that loop or error score 0, with status recorded (loop/error/done)
73
+ - [ ] `/simulate` skill registered for interactive simulation piloting
74
+ - [ ] `simulator` agent type registered in genie's built-in agent types
75
+ - [ ] PII anonymization enforced at scenario creation time — no raw customer data in repo
76
+ - [ ] Genie-app "Simulations" view shows runs list with aggregate scores, click-through to scenario breakdown
77
+ - [ ] Transcript replay renders as WhatsApp-like chat bubbles (reuses ChatBubble shared component)
78
+ - [ ] Per-dimension score breakdown visible per scenario with LLM judge reasoning
79
+ - [ ] Human can annotate (thumbs up/down) individual turns in the app, annotations persist to PG
80
+ - [ ] Real-time run progress updates via NATS subscription (scenarios completing as they finish)
81
+
82
+ ## Execution Strategy
83
+
84
+ ### Wave 1 (parallel — foundations)
85
+ | Group | Agent | Description |
86
+ |-------|-------|-------------|
87
+ | 1 | engineer | PG schema: migration for simulation tables |
88
+ | 2 | engineer | Sim agent convention: loader, validator, frontmatter spec |
89
+
90
+ ### Wave 2 (parallel — after Wave 1)
91
+ | Group | Agent | Description |
92
+ |-------|-------|-------------|
93
+ | 3 | engineer | Scenario extraction: `genie sim create` (applies the PII anonymizer from Group 2) |
94
+ | 4 | engineer | Simulation service: orchestrator + Omni session injection |
95
+
96
+ ### Wave 3 (sequential: Group 5 → Group 6 — after Wave 2)
97
+ | Group | Agent | Description |
98
+ |-------|-------|-------------|
99
+ | 5 | engineer | Scoring pipeline: LLM judge + dimension evaluators |
100
+ | 6 | engineer | CLI commands: `genie sim` namespace + `/simulate` skill |
101
+
102
+ ### Wave 4 (sequential: Group 7 → Group 8 — after Wave 3)
103
+ | Group | Agent | Description |
104
+ |-------|-------|-------------|
105
+ | 7 | engineer | Session lifecycle: `genie sim done`, turn cap, stale cleanup, annotation storage |
106
+ | 8 | engineer | Backend: NATS subjects + PG query handlers for simulation data |
107
+
108
+ ### Wave 5 (parallel — after Wave 4)
109
+ | Group | Agent | Description |
110
+ |-------|-------|-------------|
111
+ | 9 | engineer | App: Simulations view — runs dashboard, scenario list, score breakdown |
112
+ | 10 | engineer | App: Transcript replay + human annotation UI |
113
+
114
+ ### Wave 6 (after Wave 5)
115
+ | Group | Agent | Description |
116
+ |-------|-------|-------------|
117
+ | review | reviewer | Review all groups |
118
+
119
+ ## Execution Groups
120
+
121
+ ### Group 1: PG Schema — Simulation Tables
122
+ **Goal:** Create the database schema for storing simulation runs, scenarios, turns, scores, and annotations.
123
+
124
+ **Deliverables:**
125
+ 1. Migration file creating 5 tables: `genie_sim_runs`, `genie_sim_scenarios`, `genie_sim_turns`, `genie_sim_scores`, `genie_sim_annotations`
126
+ 2. Indexes for common queries: runs by agent, scenarios by run, turns by scenario, annotations by turn
127
+
128
+ **Schema:**
129
+ ```
130
+ genie_sim_runs
131
+ ├── id (uuid PK), target_agent, target_branch, executor_type
132
+ ├── sim_instance_id (Omni instance used), started_at, completed_at
133
+ ├── status (pending|running|done|error)
134
+ ├── total_scenarios, completed_scenarios
135
+ └── aggregate_score (numeric 0-100)
136
+
137
+ genie_sim_scenarios
138
+ ├── id (uuid PK), run_id (FK), sim_agent_name, scenario_slug
139
+ ├── status (pending|running|done|loop|error)
140
+ ├── score (numeric 0.00-1.00), turn_count
141
+ ├── started_at, completed_at
142
+ └── scoring_weights (jsonb)
143
+
144
+ genie_sim_turns
145
+ ├── id (uuid PK), scenario_id (FK), turn_number
146
+ ├── role (sim|target), content (text), message_type (text|audio|image)
147
+ ├── tool_calls (jsonb), latency_ms (int)
148
+ ├── timestamp, omni_message_id
149
+ └── INDEX (scenario_id, turn_number)
150
+
151
+ genie_sim_scores
152
+ ├── id (uuid PK), scenario_id (FK), dimension (text)
153
+ ├── score (numeric 0.00-1.00), weight (numeric)
154
+ ├── reasoning (text), evidence (jsonb)
155
+ └── INDEX (scenario_id, dimension)
156
+
157
+ genie_sim_annotations
158
+ ├── id (uuid PK), turn_id (FK nullable), scenario_id (FK)
159
+ ├── annotator (text), verdict (up|down)
160
+ ├── comment (text nullable), created_at
161
+ └── INDEX (scenario_id)
162
+ ```
163
+
164
+ **Acceptance Criteria:**
165
+ - [ ] Migration runs cleanly on a fresh DB
166
+ - [ ] All FK constraints and indexes are present
167
+ - [ ] Schema supports full transcript reconstruction (turns ordered by turn_number)
168
+
169
+ **Validation:**
170
+ ```bash
171
+ bun run migrate && bunx tsc --noEmit
172
+ ```
173
+
174
+ **depends-on:** none
175
+
176
+ ---
177
+
178
+ ### Group 2: Sim Agent Convention — Loader and Validator
179
+ **Goal:** Define and implement the sim agent format, loader, and validator so sim agents follow genie's standard agent conventions.
180
+
181
+ **Deliverables:**
182
+ 1. `src/services/simulator/sim-loader.ts` — loads `.genie/simulations/<scenario>/AGENTS.md`, validates frontmatter, returns structured scenario config
183
+ 2. Sim agent frontmatter spec: extends standard genie agent frontmatter with sim-specific fields (`scenario_slug`, `target_agent`, `scoring_dimensions`, `source_conversation`, `max_turns`)
184
+ 3. `src/services/simulator/pii-anonymizer.ts` — utility for replacing PII (names, phones, CPFs, addresses) with fictional equivalents during scenario creation
185
+
186
+ **Acceptance Criteria:**
187
+ - [ ] Sim agent AGENTS.md uses standard genie frontmatter (parseable by existing agent-sync)
188
+ - [ ] Loader validates required sim fields and returns typed ScenarioConfig
189
+ - [ ] PII anonymizer handles Brazilian PII patterns (CPF, phone +55, common names)
190
+
191
+ **Validation:**
192
+ ```bash
193
+ bunx biome check src/services/simulator/ && bunx tsc --noEmit
194
+ ```
195
+
196
+ **depends-on:** none
197
+
198
+ ---
199
+
200
+ ### Group 3: Scenario Extraction — `genie sim create`
201
+ **Goal:** CLI command that extracts a real Omni conversation into an anonymized sim agent scaffold.
202
+
203
+ **Deliverables:**
204
+ 1. `src/term-commands/sim/create.ts` — pulls conversation history from Omni API, anonymizes PII, generates `.genie/simulations/<name>/AGENTS.md` with persona + roleplay script + scoring config
205
+ 2. Conversation fetcher: uses Omni API (or SSH to remote server) to retrieve full chat history
206
+ 3. Output: ready-to-edit sim agent directory with AGENTS.md, source.md (anonymized source reference), scoring.md (applicable dimensions + weights)
207
+
208
+ **Acceptance Criteria:**
209
+ - [ ] `genie sim create <name> --source <chat-id> [--instance <id>] [--server <ssh-host>]` produces a complete sim agent directory
210
+ - [ ] All PII in output files is replaced with fictional data
211
+ - [ ] Generated AGENTS.md is a valid genie agent (parseable frontmatter, clear persona and roleplay script)
212
+ - [ ] source.md records which conversation inspired the scenario (anonymized)
213
+
214
+ **Validation:**
215
+ ```bash
216
+ bunx biome check src/term-commands/sim/ && bunx tsc --noEmit
217
+ ```
218
+
219
+ **depends-on:** Group 2
220
+
221
+ ---
222
+
223
+ ### Group 4: Simulation Service — Orchestrator + Omni Session Injection
224
+ **Goal:** Core simulation engine that spawns sim agents and target agents through real Omni sessions.
225
+
226
+ **Deliverables:**
227
+ 1. `src/services/simulator/orchestrator.ts` — loads scenarios, manages wave-based concurrency, tracks run state in PG, coordinates spawn/teardown
228
+ 2. `src/services/simulator/sim-session.ts` — creates Omni sessions for sim agent ↔ target agent pairs, injects messages through NATS, captures replies
229
+ 3. Integration with omni-bridge: sim sessions use the same executor interface (tmux or SDK) for the target agent
230
+ 4. Sim agent spawned as a genie agent with its AGENTS.md persona, connected to the sim Omni instance
231
+ 5. `genie sim run <scenario> --target <agent>@<branch>` single-scenario runner
232
+ 6. `genie sim run --all --target <agent>@<branch> [--concurrency <n>]` full-suite runner with wave-based concurrency
233
+
234
+ **Acceptance Criteria:**
235
+ - [ ] Single scenario: sim agent and target agent exchange messages through real Omni
236
+ - [ ] Target agent is unaware it's a simulation (receives standard Omni messages)
237
+ - [ ] Full suite: scenarios run in waves respecting concurrency limit
238
+ - [ ] Run state tracked in `genie_sim_runs` and `genie_sim_scenarios` tables
239
+ - [ ] Every turn persisted to `genie_sim_turns` with content, tool calls, latency, omni_message_id
240
+
241
+ **Validation:**
242
+ ```bash
243
+ bunx biome check src/services/simulator/ && bunx tsc --noEmit
244
+ ```
245
+
246
+ **depends-on:** Group 1, Group 2
247
+
248
+ ---
249
+
250
+ ### Group 5: Scoring Pipeline — LLM Judge + Dimension Evaluators
251
+ **Goal:** Compound scoring system that evaluates completed simulation transcripts across 9 dimensions.
252
+
253
+ **Deliverables:**
254
+ 1. `src/services/simulator/judge.ts` — LLM-as-judge (Gemini Flash or Claude Haiku for cost), evaluates a transcript against a single dimension, returns score + reasoning + evidence
255
+ 2. `src/services/simulator/dimensions/` — one evaluator per dimension:
256
+ - `script-adherence.ts` — checks agent followed expected conversation flow
257
+ - `goal-completion.ts` — checks if scenario objective was achieved
258
+ - `response-quality.ts` — per-turn quality assessment (accuracy, tone, helpfulness)
259
+ - `latency.ts` — threshold-based scoring from measured turn latencies
260
+ - `tool-usage.ts` — checks correct tools called at correct times
261
+ - `recovery.ts` — checks handling of off-script/adversarial input
262
+ - `hallucination.ts` — checks against agent's KB ground truth
263
+ - `instruction-compliance.ts` — checks against agent's rules (AGENTS.md, .claude/rules/*)
264
+ - `human-eval.ts` — aggregates human annotations into a score
265
+ 3. `src/services/simulator/scorer.ts` — runs applicable dimensions per scenario (from scoring.md weights), computes weighted average, persists to `genie_sim_scores`
266
+ 4. Aggregate score calculator: sum of per-scenario scores (each 0.00-1.00, over the 100-scenario suite) → 0-100
267
+
268
+ **Acceptance Criteria:**
269
+ - [ ] Each dimension produces a 0.00-1.00 score with reasoning text and evidence (turn references)
270
+ - [ ] Scorer respects per-scenario dimension selection and weights from scoring.md
271
+ - [ ] All scores + reasoning persisted to `genie_sim_scores` table
272
+ - [ ] Aggregate score computed and stored on `genie_sim_runs.aggregate_score`
273
+ - [ ] Instruction compliance evaluator loads agent's actual rules from its repo
274
+
275
+ **Validation:**
276
+ ```bash
277
+ bunx biome check src/services/simulator/ && bunx tsc --noEmit
278
+ ```
279
+
280
+ **depends-on:** Group 1, Group 4
281
+
282
+ ---
283
+
284
+ ### Group 6: CLI Commands + `/simulate` Skill
285
+ **Goal:** User-facing CLI namespace and skill for managing simulations.
286
+
287
+ **Deliverables:**
288
+ 1. `src/term-commands/sim/index.ts` — `genie sim` command group registration
289
+ 2. `src/term-commands/sim/run.ts` — `genie sim run` (single + --all)
290
+ 3. `src/term-commands/sim/results.ts` — `genie sim results [--run <id>]` with table output
291
+ 4. `src/term-commands/sim/list.ts` — `genie sim list` shows available scenarios
292
+ 5. `src/term-commands/sim/annotate.ts` — `genie sim annotate <scenario> --run <id>` for CLI-based human verdicts
293
+ 6. `skills/simulate.md` — `/simulate` skill prompt for interactive simulation piloting
294
+ 7. Register `simulator` agent type in genie's built-in agent registry
295
+
296
+ **Acceptance Criteria:**
297
+ - [ ] `genie sim` shows help with all subcommands
298
+ - [ ] `genie sim results` renders table with scenario scores and run aggregate
299
+ - [ ] `genie sim list` shows scenarios with status (has sim agent, last run score)
300
+ - [ ] `/simulate` skill loads and routes to appropriate sim commands
301
+ - [ ] `simulator` type appears in `genie agent directory`
302
+
303
+ **Validation:**
304
+ ```bash
305
+ bunx biome check src/term-commands/sim/ && test -f skills/simulate.md && bunx tsc --noEmit
306
+ ```
307
+
308
+ **depends-on:** Group 4, Group 5
309
+
310
+ ---
311
+
312
+ ### Group 7: Session Lifecycle — Exit Signals, Turn Cap, Cleanup, Annotations
313
+ **Goal:** Production-grade lifecycle management for sim sessions.
314
+
315
+ **Deliverables:**
316
+ 1. `genie sim done` command — sim agent calls this to signal scenario completion, triggers scoring pipeline, tears down both sessions
317
+ 2. Turn cap enforcement: orchestrator kills sessions at 100 turns (configurable), marks scenario status as `loop`, scores 0
318
+ 3. Stale session cleanup: reuse omni-bridge's pattern (PG tracking, orphan detection on restart)
319
+ 4. Annotation storage: `genie sim annotate` writes to `genie_sim_annotations`, human-eval dimension reads from it
320
+ 5. Error handling: if sim or target agent crashes, mark scenario as `error`, score 0, clean up sessions
321
+
322
+ **Acceptance Criteria:**
323
+ - [ ] `genie sim done` from inside a sim agent triggers scoring and teardown within 5s
324
+ - [ ] Sessions hitting 100-turn cap are killed and scored 0 with status `loop`
325
+ - [ ] Orphaned sim sessions are detected and cleaned up on restart
326
+ - [ ] Annotations persist to PG and are reflected in human-eval dimension score
327
+ - [ ] Crashed scenarios are marked `error` with cleanup, don't block the run
328
+
329
+ **Validation:**
330
+ ```bash
331
+ bunx biome check src/services/simulator/ src/term-commands/sim/ && bunx tsc --noEmit
332
+ ```
333
+
334
+ **depends-on:** Group 4, Group 5, Group 6
335
+
336
+ ---
337
+
338
+ ### Group 8: Backend — NATS Subjects + PG Query Handlers
339
+ **Goal:** Backend data layer for the genie-app Simulations view, following the existing pg-bridge + NATS req/reply pattern.
340
+
341
+ **Deliverables:**
342
+ 1. `packages/genie-app/lib/subjects.ts` — add `simulations` domain with subjects: `simulations.runs`, `simulations.scenarios`, `simulations.turns`, `simulations.scores`, `simulations.annotate`
343
+ 2. `packages/genie-app/src-backend/pg-bridge.ts` — add PG query handlers for:
344
+ - `simulations.runs` — list runs with aggregate scores, filter by agent/branch/status
345
+ - `simulations.scenarios` — list scenarios for a run with per-scenario scores
346
+ - `simulations.turns` — get full transcript for a scenario (ordered turns with tool calls, latency)
347
+ - `simulations.scores` — get per-dimension score breakdown with reasoning for a scenario
348
+ - `simulations.annotate` — write human annotation (turn-level or scenario-level verdict)
349
+ 3. NATS event bridge: add `sim_scenario_complete` and `sim_run_complete` PG LISTEN/NOTIFY channels for real-time UI updates
350
+
351
+ **Acceptance Criteria:**
352
+ - [ ] All 5 NATS subjects registered and handled in pg-bridge
353
+ - [ ] Queries return typed responses matching the PG schema from Group 1
354
+ - [ ] Annotation writes validate input (verdict must be `up` or `down`; at least one of `turn_id` or `scenario_id` is required)
355
+ - [ ] Real-time events bridged so the UI can subscribe to run progress
356
+
357
+ **Validation:**
358
+ ```bash
359
+ bunx biome check packages/genie-app/src-backend/ packages/genie-app/lib/subjects.ts && bunx tsc --noEmit
360
+ ```
361
+
362
+ **depends-on:** Group 1, Group 7
363
+
364
+ ---
365
+
366
+ ### Group 9: App — Simulations View (Runs Dashboard + Scenario List)
367
+ **Goal:** Main Simulations view in genie-app showing runs and scenario breakdowns.
368
+
369
+ **Deliverables:**
370
+ 1. `packages/genie-app/views/simulations/ui/SimulationsView.tsx` — main view with:
371
+ - **Runs list** — table showing all runs with: target agent, branch, date, status, aggregate score (0-100), scenario progress (completed/total)
372
+ - **Run detail** — click a run to see scenario breakdown table: scenario name, status (done/loop/error), score, turn count, duration
373
+ - **Score breakdown** — expandable per-scenario showing 9-dimension radar or bar chart with LLM judge reasoning
374
+ - **KPI cards** — top bar with: latest score, score trend (vs previous run), pass rate, avg latency
375
+ 2. `packages/genie-app/manifest.ts` — register Simulations view entry
376
+ 3. `packages/genie-app/components.ts` — add lazy import for SimulationsView
377
+ 4. `packages/genie-app/src/App.tsx` — add Simulations to NAV_ITEMS
378
+
379
+ **Acceptance Criteria:**
380
+ - [ ] "Simulations" appears in sidebar nav
381
+ - [ ] Runs list loads from NATS `simulations.runs` subject
382
+ - [ ] Click-through from run → scenario list → score breakdown works
383
+ - [ ] Real-time updates: new scenario completions appear without manual refresh
384
+ - [ ] Reuses shared components: SearchBar, LoadingState, ErrorState, EmptyState, KpiCard
385
+
386
+ **Validation:**
387
+ ```bash
388
+ bunx biome check packages/genie-app/views/simulations/ && bunx tsc --noEmit
389
+ ```
390
+
391
+ **depends-on:** Group 8
392
+
393
+ ---
394
+
395
+ ### Group 10: App — Transcript Replay + Human Annotation
396
+ **Goal:** WhatsApp-like transcript replay with per-turn annotation capability.
397
+
398
+ **Deliverables:**
399
+ 1. Transcript panel in SimulationsView (or sub-component):
400
+ - **Chat replay** — WhatsApp-style bubbles using shared `ChatBubble` component. Sim agent messages on left (as "customer"), target agent messages on right. Tool calls rendered inline via `ToolCallCard`
401
+ - **Per-turn metadata** — latency badge, timestamp, tool calls expandable
402
+ - **Annotation controls** — thumbs up/down button on each turn bubble. Click persists to PG via `simulations.annotate` NATS subject
403
+ - **Scenario-level annotation** — summary verdict (up/down + optional comment) at bottom of transcript
404
+ 2. Score overlay: alongside transcript, show per-dimension scores with reasoning expandable
405
+ 3. Annotation summary: visual indicator showing how many turns have been annotated vs total
406
+
407
+ **Acceptance Criteria:**
408
+ - [ ] Transcript loads from `simulations.turns` and renders as chat bubbles
409
+ - [ ] Sim agent (customer) messages on left, target agent on right — visually distinct
410
+ - [ ] Thumbs up/down on each turn persists to PG immediately
411
+ - [ ] Scenario-level annotation with optional comment supported
412
+ - [ ] Tool calls rendered inline (expandable) using existing ToolCallCard
413
+ - [ ] Annotation count shown (e.g. "12/18 turns annotated")
414
+
415
+ **Validation:**
416
+ ```bash
417
+ bunx biome check packages/genie-app/views/simulations/ && bunx tsc --noEmit
418
+ ```
419
+
420
+ **depends-on:** Group 8, Group 9
421
+
422
+ ---
423
+
424
+ ## QA Criteria
425
+
426
+ - [ ] Full simulation run (at least 3 scenarios) completes end-to-end with scores in PG
427
+ - [ ] Sim agent and target agent converse through real Omni with no awareness leak
428
+ - [ ] `genie sim results` displays correct aggregate and per-scenario scores
429
+ - [ ] Turn cap triggers correctly at configured limit
430
+ - [ ] `genie sim create` produces anonymized scenario with no real PII in output
431
+ - [ ] Scoring pipeline produces per-dimension breakdown with reasoning
432
+ - [ ] Genie-app Simulations view loads, displays runs, and navigates to scenario detail
433
+ - [ ] Transcript replay renders correctly with chat bubbles and tool calls
434
+ - [ ] Human annotation persists and reflects in human-eval dimension score
435
+ - [ ] `bun run check` passes (typecheck + lint + dead-code + test)
436
+
437
+ ## Assumptions / Risks
438
+
439
+ | Risk | Severity | Mitigation |
440
+ |------|----------|------------|
441
+ | Infinite loops between agents | Medium | 100-turn hard cap + `genie sim done` exit signal. Loops score 0 |
442
+ | Lost/orphaned sim sessions | Low | Same recovery pattern as omni-bridge (PG tracking, stale cleanup) |
443
+ | PII leakage in scenario files | High | Anonymization mandatory at extraction. Never store raw customer data in repo |
444
+ | Tool side effects during sim | Medium | Dedicated sim Omni instance/channel captures all side effects (handoffs, replies) |
445
+ | Cost (LLM calls for judge + 2 agents per scenario) | Medium | Budget per run. Use Haiku/Gemini Flash for judge. Wave concurrency limits blast radius |
446
+ | Sim agent recognized as AI by target | Low | Full persona roleplay with realistic WhatsApp patterns. Target has no detection mechanism |
447
+ | Omni instance availability | Medium | Sim requires running Omni with dedicated sim instance. `genie sim` checks prerequisites |
448
+ | Large PG data volume (100 scenarios × ~20 turns × multiple runs) | Low | Partition by run_id. Archive old runs. Turn content is plain text, so individual rows stay small |
449
+
450
+ ## Review Results
451
+
452
+ _Populated by `/review` after execution completes._
453
+
454
+ ## Files to Create/Modify
455
+
456
+ ```
457
+ # New files
458
+ src/services/simulator/orchestrator.ts — Core simulation engine
459
+ src/services/simulator/sim-session.ts — Omni session injection for sim pairs
460
+ src/services/simulator/sim-loader.ts — Scenario loader + validator
461
+ src/services/simulator/pii-anonymizer.ts — PII detection and replacement
462
+ src/services/simulator/judge.ts — LLM-as-judge core
463
+ src/services/simulator/scorer.ts — Weighted scoring aggregator
464
+ src/services/simulator/dimensions/ — Per-dimension evaluators (9 files)
465
+ src/term-commands/sim/index.ts — CLI command group
466
+ src/term-commands/sim/create.ts — genie sim create
467
+ src/term-commands/sim/run.ts — genie sim run
468
+ src/term-commands/sim/results.ts — genie sim results
469
+ src/term-commands/sim/list.ts — genie sim list
470
+ src/term-commands/sim/annotate.ts — genie sim annotate
471
+ src/term-commands/sim/done.ts — genie sim done
472
+ skills/simulate.md — /simulate skill prompt
473
+ migrations/XXXX_create_simulation_tables.sql — PG schema
474
+
475
+ # App — backend
476
+ packages/genie-app/lib/subjects.ts — Add simulations NATS subjects
477
+ packages/genie-app/src-backend/pg-bridge.ts — Add simulation PG query handlers
478
+
479
+ # App — frontend
480
+ packages/genie-app/views/simulations/ui/SimulationsView.tsx — Main view (runs, scenarios, scores)
481
+ packages/genie-app/manifest.ts — Register Simulations view
482
+ packages/genie-app/components.ts — Add lazy import
483
+ packages/genie-app/src/App.tsx — Add Simulations to NAV_ITEMS
484
+
485
+ # Modified files
486
+ src/genie.ts — Register sim command group
487
+ src/lib/agent-types.ts — Add simulator to built-in types
488
+ ```