vibeostheog 0.22.16 → 0.22.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +1 -1
- package/package.json +6 -2
- package/src/lib/trinity-tool.js +2 -1
- package/src/lib/hooks/tests/footer.test.js +0 -185
- package/src/lib/tests/api-client.test.js +0 -220
- package/src/lib/tests/pricing.test.js +0 -745
- package/src/lib/tests/state.test.js +0 -686
- package/src/tests/index.test.js +0 -91
- package/src/vibeOS-lib/flow-rules.json +0 -39
- package/src/vibeOS-lib/tests/experiment-data-export.json +0 -12743
- package/src/vibeOS-lib/tests/experiment-scenarios-progressive.json +0 -115
- package/src/vibeOS-lib/tests/experiment-scenarios-token-latency.json +0 -57
- package/src/vibeOS-lib/tests/experiment-scenarios.json +0 -94
- package/src/vibeOS-lib/tests/reports/mode-benchmark-final-2026-05-23T07-01-58Z.json +0 -129
- package/src/vibeOS-lib/tests/reports/mode-calibration-2026-05-23T07-07-08Z.json +0 -440
- package/src/vibeOS-lib/tests/reports/mode-signal-analysis-2026-05-23T06-59-30Z.json +0 -89
|
@@ -1,115 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"meta": {
|
|
3
|
-
"description": "Progressive benchmark scenarios with KNOWN CORRECT outputs. Tests context awareness across 5+ files in vibeOS codebase.",
|
|
4
|
-
"version": "3.0.0",
|
|
5
|
-
"hypothesis": "Brain tier outperforms cheap at 10+ file context requirements. Context degradation hits cheap tier harder after 10+ implicit references.",
|
|
6
|
-
"methodology": "Each scenario has a known correct diff. Evaluator compares output to ground truth. Tracks tokens in/out, latency, correctness."
|
|
7
|
-
},
|
|
8
|
-
"metrics": {
|
|
9
|
-
"correctness": "Exact output matches known correct answer (0 or 1)",
|
|
10
|
-
"partial_correctness": "Output functionally correct but syntactically different (0-1)",
|
|
11
|
-
"tokens_in": "Completion tokens used by the model",
|
|
12
|
-
"tokens_out": "Tokens generated",
|
|
13
|
-
"latency_ms": "Wall time in milliseconds",
|
|
14
|
-
"files_read": "Number of files opened to complete task",
|
|
15
|
-
"files_modified": "Number of files changed",
|
|
16
|
-
"regression": "Did the change break existing tests? (0 or 1)"
|
|
17
|
-
},
|
|
18
|
-
"scenarios": [
|
|
19
|
-
{
|
|
20
|
-
"id": "delete-record-saving",
|
|
21
|
-
"description": "Delete the `recordSaving` function and all calls to it. Known outcome: function removed from index-helpers.ts, no callers in index.ts.",
|
|
22
|
-
"knownCorrectDiff": [{"file": "src/lib/index-helpers.ts", "linesRemoved": 85}, {"file": "src/index.ts", "linesChanged": 0}],
|
|
23
|
-
"filesRequired": ["src/lib/index-helpers.ts", "src/index.ts"],
|
|
24
|
-
"verification": "grep -r 'recordSaving' src/ --include='*.ts' | wc -l == 0",
|
|
25
|
-
"dependencies": [],
|
|
26
|
-
"turnsRequired": 2
|
|
27
|
-
},
|
|
28
|
-
{
|
|
29
|
-
"id": "rename-sync-control-settings",
|
|
30
|
-
"description": "Rename `syncControlSettings` to `applyControlVector` everywhere in src/. Known outcome: all references updated, no broken imports.",
|
|
31
|
-
"knownCorrectDiff": [{"file": "src/lib/hooks/chat-transform.ts", "linesChanged": 3}, {"file": "src/lib/hooks/chat-transform.js", "linesChanged": 3}],
|
|
32
|
-
"filesRequired": ["src/lib/hooks/chat-transform.ts", "src/lib/hooks/chat-transform.js", "src/vibeOS-lib/blackbox/meta-controller.ts"],
|
|
33
|
-
"verification": "grep 'syncControlSettings' src/ --include='*.ts' | wc -l == 0 AND grep 'applyControlVector' src/ --include='*.ts' | wc -l >= 2",
|
|
34
|
-
"dependencies": [],
|
|
35
|
-
"turnsRequired": 2
|
|
36
|
-
},
|
|
37
|
-
{
|
|
38
|
-
"id": "fix-savings-field-mismatch",
|
|
39
|
-
"description": "Fix the savings tracking bug: `delegation_savings_usd` should be `total_savings_usd` in state.ts, index.ts, and footer.ts. Known outcome: 4 files changed, field name consistent across all.",
|
|
40
|
-
"knownCorrectDiff": [{"file": "src/lib/state.ts", "linesChanged": 5}, {"file": "src/index.ts", "linesChanged": 1}, {"file": "src/lib/hooks/footer.ts", "linesChanged": 2}, {"file": "src/vibeOS-mcp-server.ts", "linesChanged": 1}],
|
|
41
|
-
"filesRequired": ["src/lib/state.ts", "src/index.ts", "src/lib/hooks/footer.ts", "src/vibeOS-mcp-server.ts"],
|
|
42
|
-
"verification": "grep 'delegation_savings_usd' src/ --include='*.ts' | wc -l == 0",
|
|
43
|
-
"dependencies": [],
|
|
44
|
-
"turnsRequired": 5
|
|
45
|
-
},
|
|
46
|
-
{
|
|
47
|
-
"id": "add-footer-tool-count-field",
|
|
48
|
-
"description": "Add tool count to footer. Known outcome: readLifetimeSavings returns sesToolCount, _appendFooter displays '| tools: N'.",
|
|
49
|
-
"knownCorrectDiff": [{"file": "src/lib/hooks/footer.ts", "linesChanged": 3}],
|
|
50
|
-
"filesRequired": ["src/lib/hooks/footer.ts"],
|
|
51
|
-
"verification": "grep 'sesToolCount' src/lib/hooks/footer.ts | wc -l >= 2 AND grep 'tools:' src/lib/hooks/footer.ts | wc -l >= 2",
|
|
52
|
-
"dependencies": [],
|
|
53
|
-
"turnsRequired": 3
|
|
54
|
-
},
|
|
55
|
-
{
|
|
56
|
-
"id": "extract-selection-manager",
|
|
57
|
-
"description": "Extract loadSelection and writeSelection from state.ts into a new file selection-manager.ts. Known outcome: new file created, state.ts re-exports, all imports chain correctly.",
|
|
58
|
-
"knownCorrectDiff": [{"file": "src/lib/selection-manager.ts", "linesChanged": 65}, {"file": "src/lib/state.ts", "linesChanged": 10}],
|
|
59
|
-
"filesRequired": ["src/lib/state.ts", "src/index.ts"],
|
|
60
|
-
"verification": "node -e 'import(\"src/lib/selection-manager.ts\").then(m => console.log(Object.keys(m)))' 2>&1 | grep loadSelection",
|
|
61
|
-
"dependencies": [],
|
|
62
|
-
"turnsRequired": 5
|
|
63
|
-
},
|
|
64
|
-
{
|
|
65
|
-
"id": "add-auto-select-mode-test",
|
|
66
|
-
"description": "Add a test for autoSelectMode: when regime=CONVERGING, mode should be 'quality'. Write the test in src/lib/hooks/tests/. Known outcome: test file created, test passes.",
|
|
67
|
-
"knownCorrectDiff": [{"file": "src/lib/hooks/tests/auto-select-mode.test.ts", "linesChanged": 20}],
|
|
68
|
-
"filesRequired": ["src/vibeOS-lib/blackbox/meta-controller.ts", "src/lib/hooks/tests/"],
|
|
69
|
-
"verification": "node --test src/lib/hooks/tests/auto-select-mode.test.ts | grep 'pass 1'",
|
|
70
|
-
"dependencies": [],
|
|
71
|
-
"turnsRequired": 3
|
|
72
|
-
},
|
|
73
|
-
{
|
|
74
|
-
"id": "progressive-context-chain",
|
|
75
|
-
"description": "Multi-turn progressive task: Turn 1) read state.ts and understand the savings tracking. Turn 2) find where total_savings_usd is written. Turn 3) find where it's read. Turn 4) ensure consistency. Known outcome: identify all 5 touchpoints and confirm they use the same field name.",
|
|
76
|
-
"knownCorrectDiff": [{"file": "report.txt", "linesChanged": 10}],
|
|
77
|
-
"filesRequired": ["src/lib/index-helpers.ts", "src/lib/state.ts", "src/lib/hooks/footer.ts", "src/vibeOS-mcp-server.ts"],
|
|
78
|
-
"verification": "grep -c 'total_savings_usd\|delegation_savings_usd' report.txt > /dev/null",
|
|
79
|
-
"dependencies": [],
|
|
80
|
-
"turnsRequired": 4
|
|
81
|
-
},
|
|
82
|
-
{
|
|
83
|
-
"id": "cross-file-type-refactor",
|
|
84
|
-
"description": "Rename OptimizationMode type (currently string union) to an enum. Update meta-controller.ts and all consumers. Known outcome: type definition changed, all imports updated, typecheck passes.",
|
|
85
|
-
"knownCorrectDiff": [{"file": "src/vibeOS-lib/blackbox/meta-controller.ts", "linesChanged": 10}, {"file": "src/lib/hooks/chat-transform.ts", "linesChanged": 2}],
|
|
86
|
-
"filesRequired": ["src/vibeOS-lib/blackbox/meta-controller.ts", "src/vibeOS-lib/blackbox/index.ts", "src/lib/hooks/chat-transform.ts"],
|
|
87
|
-
"verification": "grep 'OptimizationMode' src/vibeOS-lib/blackbox/meta-controller.ts | head -1 | grep 'enum'",
|
|
88
|
-
"dependencies": [],
|
|
89
|
-
"turnsRequired": 6
|
|
90
|
-
},
|
|
91
|
-
{
|
|
92
|
-
"id": "introduce-bug-fix-cycle",
|
|
93
|
-
"description": "Known bug: the ledger writes saveEst but readers expect 'usd'. Fix: add 'usd' to the ledger entry. Known outcome: index-helpers.ts ledger entry now includes usd field.",
|
|
94
|
-
"knownCorrectDiff": [{"file": "src/lib/index-helpers.ts", "linesChanged": 1}],
|
|
95
|
-
"filesRequired": ["src/lib/index-helpers.ts", "src/lib/hooks/footer.ts"],
|
|
96
|
-
"verification": "grep 'usd:' src/lib/index-helpers.ts | grep -v 'saveEst' | grep -v 'total_\\|cache_\\|missed_' | wc -l >= 1",
|
|
97
|
-
"dependencies": [],
|
|
98
|
-
"turnsRequired": 3
|
|
99
|
-
},
|
|
100
|
-
{
|
|
101
|
-
"id": "complex-architectural-change",
|
|
102
|
-
"description": "Move the `syncControlSettings` call from chat-transform.ts line 247 to BEFORE the blackbox state is saved (line 199), so settings are written before the next turn's state is computed. Known outcome: the call moves from line 247 to between lines 198-199.",
|
|
103
|
-
"knownCorrectDiff": [{"file": "src/lib/hooks/chat-transform.ts", "linesChanged": 2}],
|
|
104
|
-
"filesRequired": ["src/lib/hooks/chat-transform.ts"],
|
|
105
|
-
"verification": "grep -n 'syncControlSettings' src/lib/hooks/chat-transform.ts | head -1",
|
|
106
|
-
"dependencies": [],
|
|
107
|
-
"turnsRequired": 5
|
|
108
|
-
}
|
|
109
|
-
],
|
|
110
|
-
"rotation": {
|
|
111
|
-
"scenariosPerNight": 10,
|
|
112
|
-
"maxRunsPerScenario": 50,
|
|
113
|
-
"cooldownNights": 0
|
|
114
|
-
}
|
|
115
|
-
}
|
|
@@ -1,57 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"meta": {
|
|
3
|
-
"description": "Token/latency benchmark. Measures actual token consumption, throughput, and wall-clock latency across 3 tiers.",
|
|
4
|
-
"version": "1.0.0",
|
|
5
|
-
"hypothesis": "MEDIUM tier (flash) has 3-5x lower latency than BRAIN. CHEAP has highest tok/sec. Token throughput scales inversely with capability.",
|
|
6
|
-
"methodology": "Each scenario is a standalone prompt. Runner measures wall-clock time, estimates token counts from response length (1 tok ~= 4 chars), and records pricing estimates."
|
|
7
|
-
},
|
|
8
|
-
"pricing": {
|
|
9
|
-
"deepseek-v4-pro": { "prompt": 1.25, "completion": 8 },
|
|
10
|
-
"deepseek-v4-flash": { "prompt": 0.25, "completion": 1 },
|
|
11
|
-
"deepseek-chat": { "prompt": 0.14, "completion": 0.56 }
|
|
12
|
-
},
|
|
13
|
-
"scenarios": [
|
|
14
|
-
{
|
|
15
|
-
"id": "short-qa",
|
|
16
|
-
"category": "short",
|
|
17
|
-
"prompt": "What is the difference between TCP and UDP? Answer in 2-3 sentences.",
|
|
18
|
-
"expected_length_chars": 300,
|
|
19
|
-
"turns_required": 1
|
|
20
|
-
},
|
|
21
|
-
{
|
|
22
|
-
"id": "medium-explain",
|
|
23
|
-
"category": "medium",
|
|
24
|
-
"prompt": "Explain how a distributed consensus algorithm like Raft works. Include leader election, log replication, and safety properties. Write ~500 words.",
|
|
25
|
-
"expected_length_chars": 2000,
|
|
26
|
-
"turns_required": 1
|
|
27
|
-
},
|
|
28
|
-
{
|
|
29
|
-
"id": "long-codegen",
|
|
30
|
-
"category": "long",
|
|
31
|
-
"prompt": "Implement a complete LRU cache in TypeScript with generics, O(1) get/put, expiration TTL, event emitter for evictions, and comprehensive error handling. Include JSDoc. Write ~200 lines.",
|
|
32
|
-
"expected_length_chars": 6000,
|
|
33
|
-
"turns_required": 1
|
|
34
|
-
},
|
|
35
|
-
{
|
|
36
|
-
"id": "short-math",
|
|
37
|
-
"category": "short",
|
|
38
|
-
"prompt": "Calculate the sum of all prime numbers between 1 and 100. Show your work in 2-3 lines.",
|
|
39
|
-
"expected_length_chars": 300,
|
|
40
|
-
"turns_required": 1
|
|
41
|
-
},
|
|
42
|
-
{
|
|
43
|
-
"id": "medium-architecture",
|
|
44
|
-
"category": "medium",
|
|
45
|
-
"prompt": "Design the architecture for a real-time collaborative document editor (like Google Docs). Cover: OT vs CRDT, WebSocket mesh, persistence layer, conflict resolution, cursor sync. ~500 words.",
|
|
46
|
-
"expected_length_chars": 2000,
|
|
47
|
-
"turns_required": 1
|
|
48
|
-
},
|
|
49
|
-
{
|
|
50
|
-
"id": "long-api-design",
|
|
51
|
-
"category": "long",
|
|
52
|
-
"prompt": "Design a complete REST API for a multi-tenant SaaS platform. Include: authentication via JWT, RBAC with 3 roles, CRUD for 4 entities, rate limiting, pagination, soft delete, audit logging, and webhook support. Write a complete OpenAPI 3.0 spec in YAML.",
|
|
53
|
-
"expected_length_chars": 8000,
|
|
54
|
-
"turns_required": 1
|
|
55
|
-
}
|
|
56
|
-
]
|
|
57
|
-
}
|
|
@@ -1,94 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"meta": {
|
|
3
|
-
"description": "vibeOS nightly experiment scenarios. Hard scenarios (>100 lines, multi-file, architectural) test whether quality/brain mode produces measurably better outcomes than budget/cheap mode.",
|
|
4
|
-
"version": "2.0.0",
|
|
5
|
-
"experiment_hypothesis": "For HARD scenarios (multi-file architecture, complex state, concurrent systems), quality mode (brain tier, full thinking) produces measurably better code than budget mode (cheap tier, off thinking). For MEDIUM scenarios, both tiers produce equivalent quality."
|
|
6
|
-
},
|
|
7
|
-
"metrics": {
|
|
8
|
-
"correctness": "Tests pass on first run (0-1)",
|
|
9
|
-
"completeness": "All requirements addressed — files complete, edge cases, architecture (0-1)",
|
|
10
|
-
"safety": "No vulnerabilities, race conditions, or dangerous patterns (0-1)",
|
|
11
|
-
"architecture_quality": "Module cohesion, interface design, SOLID adherence (0-1)",
|
|
12
|
-
"time_to_solve": "Wall time to completion (seconds)"
|
|
13
|
-
},
|
|
14
|
-
"domains": ["api", "arch", "systems", "refactor", "algorithm"],
|
|
15
|
-
"scenarios": [
|
|
16
|
-
{
|
|
17
|
-
"id": "api-authenticate",
|
|
18
|
-
"domain": "api",
|
|
19
|
-
"complexity": "medium",
|
|
20
|
-
"prompt": "Create an Express.js authentication middleware in TypeScript that validates JWT tokens from the Authorization header. Extract the user ID from the token payload and attach it to `req.user`. Return 401 if the token is missing, expired, or invalid. Include proper TypeScript types and error handling.",
|
|
21
|
-
"expectedFiles": ["middleware/auth.ts"],
|
|
22
|
-
"validation": { "type": "jest", "testFile": "middleware/auth.test.ts" },
|
|
23
|
-
"passCriteria": { "testsPass": 1, "hasTypeAnnotation": true, "hasErrorHandling": true }
|
|
24
|
-
},
|
|
25
|
-
{
|
|
26
|
-
"id": "refactor-extract-function",
|
|
27
|
-
"domain": "refactor",
|
|
28
|
-
"complexity": "medium",
|
|
29
|
-
"prompt": "Given this TypeScript function, refactor it by extracting the validation logic and the formatting logic into separate pure functions. Keep the same public API. Preserve all behavior:\n```ts\nfunction processOrder(order: { items: { price: number; qty: number }[]; tax: number; discount: number }): string {\n if (!order.items || !Array.isArray(order.items)) throw new Error('Invalid items');\n if (order.items.length === 0) throw new Error('No items');\n if (typeof order.tax !== 'number' || order.tax < 0) throw new Error('Invalid tax');\n let subtotal = 0;\n for (const item of order.items) {\n if (typeof item.price !== 'number' || item.price < 0) throw new Error('Invalid price');\n if (typeof item.qty !== 'number' || item.qty < 0) throw new Error('Invalid qty');\n subtotal += item.price * item.qty;\n }\n const total = subtotal + subtotal * order.tax - order.discount;\n return '$' + total.toFixed(2);\n}\n```",
|
|
30
|
-
"expectedFiles": ["order.ts"],
|
|
31
|
-
"validation": { "type": "jest", "testFile": "order.test.ts" },
|
|
32
|
-
"passCriteria": { "testsPass": 1, "extractedFunctions": 2, "behaviorPreserved": true }
|
|
33
|
-
},
|
|
34
|
-
{
|
|
35
|
-
"id": "api-rate-limiter",
|
|
36
|
-
"domain": "api",
|
|
37
|
-
"complexity": "hard",
|
|
38
|
-
"prompt": "Implement a sliding window rate limiter for Express.js in TypeScript. Requirements: 1) limit N requests per window per IP, 2) in-memory Map store, 3) return 429 with Retry-After header when exceeded, 4) automatic cleanup of expired entries via setInterval, 5) configurable window size and max requests, 6) export as middleware factory `createRateLimiter(windowMs, maxRequests)`. Write in `rateLimiter.ts`. Include comprehensive tests.",
|
|
39
|
-
"expectedFiles": ["rateLimiter.ts"],
|
|
40
|
-
"validation": { "type": "jest", "testFile": "rateLimiter.test.ts" },
|
|
41
|
-
"passCriteria": { "testsPass": 1, "rateLimitedCorrectly": true, "cleanupImplemented": true, "retryAfterHeader": true }
|
|
42
|
-
},
|
|
43
|
-
{
|
|
44
|
-
"id": "arch-multi-file-rest-api",
|
|
45
|
-
"domain": "arch",
|
|
46
|
-
"complexity": "hard",
|
|
47
|
-
"prompt": "Design and implement a multi-file REST API for a task manager in TypeScript using Express. Architecture:\n- `types.ts` — Task interface (id: string, title: string, description?: string, status: 'todo'|'in-progress'|'done', priority: 'low'|'medium'|'high', createdAt: Date, updatedAt: Date)\n- `store.ts` — In-memory data store class (TaskStore) with create, findById, findAll, update, delete, searchByStatus. Generate IDs with crypto.randomUUID(). All methods return Promises.\n- `middleware/validateTask.ts` — Express middleware that validates request body: title non-empty (3-200 chars), status valid, priority valid. Return 400 with specific error messages.\n- `routes.ts` — Express Router with: GET /tasks, GET /tasks/:id, POST /tasks, PATCH /tasks/:id, DELETE /tasks/:id. Use async error wrapper.\n- `app.ts` — Express app with JSON parsing, CORS, routes mounted at /api, 404 handler, global error handler middleware.\nReturn ALL 5 files. Each must be complete, compilable, and properly import from each other.",
|
|
48
|
-
"expectedFiles": ["types.ts", "store.ts", "middleware/validateTask.ts", "routes.ts", "app.ts"],
|
|
49
|
-
"validation": { "type": "jest", "testFile": "app.test.ts" },
|
|
50
|
-
"passCriteria": { "testsPass": 1, "filesComplete": 5, "hasValidation": true, "hasErrorHandler": true, "hasCRUD": true }
|
|
51
|
-
},
|
|
52
|
-
{
|
|
53
|
-
"id": "systems-concurrent-task-runner",
|
|
54
|
-
"domain": "systems",
|
|
55
|
-
"complexity": "hard",
|
|
56
|
-
"prompt": "Implement a concurrent task runner in TypeScript. Requirements:\n1) Accept an array of async tasks and a concurrency limit N\n2) Run at most N tasks simultaneously (not Promise.all — must use semaphore pattern)\n3) Collect results in order (result[i] corresponds to task[i])\n4) If any task rejects, cancel remaining tasks gracefully and reject with the error\n5) Support external cancellation via AbortSignal\n6) Include a TaskRunner class with methods: run(), cancel(), status() returning { running, pending, completed, failed }\n7) Use proper TypeScript generics: TaskRunner<T> where T is the task result type\n8) Handle edge cases: empty task array, concurrency > task count, single task.\nWrite tests covering all edge cases. Export as single file `taskRunner.ts`.",
|
|
57
|
-
"expectedFiles": ["taskRunner.ts"],
|
|
58
|
-
"validation": { "type": "jest", "testFile": "taskRunner.test.ts" },
|
|
59
|
-
"passCriteria": { "testsPass": 1, "concurrencyCorrect": true, "cancellationWorks": true, "orderPreserved": true, "errorPropagation": true }
|
|
60
|
-
},
|
|
61
|
-
{
|
|
62
|
-
"id": "refactor-monolith-split",
|
|
63
|
-
"domain": "refactor",
|
|
64
|
-
"complexity": "hard",
|
|
65
|
-
"prompt": "Refactor this monolithic TypeScript file into 4 properly separated modules:\n- `models/user.ts` — User interface and types\n- `utils/validation.ts` — Email, password, and username validation functions\n- `repositories/userRepository.ts` — Data access layer with async operations\n- `services/userService.ts` — Business logic layer\n\nThe monolith contains User model, 3 validation functions, 5 repository functions, and 4 service functions (register, login, updateProfile, deleteAccount). Preserve ALL behavior. Each module must have proper imports/exports. Return all 4 files.\n\n```ts\ninterface User { id: string; username: string; email: string; passwordHash: string; createdAt: Date }\nfunction isValidEmail(email: string): boolean { return /^[^\\s@]+@[^\\s@]+\\.[^\\s@]+$/.test(email) }\nfunction isStrongPassword(pw: string): boolean { return pw.length >= 8 && /[A-Z]/.test(pw) && /[0-9]/.test(pw) }\nfunction isValidUsername(u: string): boolean { return u.length >= 3 && u.length <= 30 && /^[a-zA-Z0-9_]+$/.test(u) }\nconst userDB: Map<string, User> = new Map()\nasync function findById(id: string): Promise<User | undefined> { return Promise.resolve(userDB.get(id)) }\nasync function findByEmail(email: string): Promise<User | undefined> { for (const u of userDB.values()) { if (u.email === email) return u } return undefined }\nasync function createUser(user: User): Promise<User> { userDB.set(user.id, user); return user }\nasync function updateUser(id: string, updates: Partial<User>): Promise<User | undefined> { const u = userDB.get(id); if (!u) return undefined; Object.assign(u, updates); return u }\nasync function deleteUser(id: string): Promise<boolean> { return userDB.delete(id) }\nasync function registerUser(username: string, email: string, password: string): Promise<User> { if (!isValidEmail(email)) throw new Error('Invalid email'); if (!isStrongPassword(password)) throw new Error('Weak password'); if (!isValidUsername(username)) throw new Error('Invalid username'); const existing = await findByEmail(email); if (existing) throw new Error('Email already registered'); const user: User = { id: Date.now().toString(36) + Math.random().toString(36).slice(2), username, email, passwordHash: 'hashed_' + password, createdAt: new Date() }; return createUser(user) }\nasync function loginUser(email: string, password: string): Promise<User> { const user = await findByEmail(email); if (!user) throw new Error('User not found'); if (user.passwordHash !== 'hashed_' + password) throw new Error('Invalid password'); return user }\nasync function updateProfile(id: string, data: { username?: string; email?: string }): Promise<User> { if (data.email && !isValidEmail(data.email)) throw new Error('Invalid email'); if (data.username && !isValidUsername(data.username)) throw new Error('Invalid username'); const updated = await updateUser(id, data); if (!updated) throw new Error('User not found'); return updated }\nasync function deleteAccount(id: string): Promise<void> { const deleted = await deleteUser(id); if (!deleted) throw new Error('User not found') }\n```",
|
|
66
|
-
"expectedFiles": ["models/user.ts", "utils/validation.ts", "repositories/userRepository.ts", "services/userService.ts"],
|
|
67
|
-
"validation": { "type": "jest", "testFile": "services/userService.test.ts" },
|
|
68
|
-
"passCriteria": { "testsPass": 1, "modulesExtracted": 4, "importsCorrect": true, "behaviorPreserved": true }
|
|
69
|
-
},
|
|
70
|
-
{
|
|
71
|
-
"id": "arch-state-machine",
|
|
72
|
-
"domain": "arch",
|
|
73
|
-
"complexity": "hard",
|
|
74
|
-
"prompt": "Implement a fully type-safe finite state machine in TypeScript.\nRequirements:\n1) Generic over state type S (extends string) and event type E (extends string)\n2) Accept a transitions map: Record<S, Partial<Record<E, S>>> at construction\n3) transition(event: E): void — validates event is allowed for current state, moves to target state\n4) can(event: E): boolean — check if transition is possible without executing\n5) Lifecycle callbacks: onEnter(state, callback), onExit(state, callback), onTransition(from, event, to, callback)\n6) getHistory(): Array<{ from: S; event: E; to: S; timestamp: number }>\n7) reset(): void — return to initial state, clear history\n8) getState(): S — current state\n9) TypeScript must catch invalid states/events at compile time (discriminated unions or template literal types).\n10) Handle edge cases: duplicate transitions, transition to same state, empty transitions map\nWrite comprehensive tests. Single file `stateMachine.ts`.",
|
|
75
|
-
"expectedFiles": ["stateMachine.ts"],
|
|
76
|
-
"validation": { "type": "jest", "testFile": "stateMachine.test.ts" },
|
|
77
|
-
"passCriteria": { "testsPass": 1, "genericTyped": true, "callbacksWork": true, "historyTracked": true, "typeSafe": true }
|
|
78
|
-
},
|
|
79
|
-
{
|
|
80
|
-
"id": "algorithm-djikstra-pathfinder",
|
|
81
|
-
"domain": "algorithm",
|
|
82
|
-
"complexity": "hard",
|
|
83
|
-
"prompt": "Implement Dijkstra's shortest path algorithm in TypeScript.\nRequirements:\n1) Accept a weighted directed graph as adjacency list: Map<string, Array<{ node: string; weight: number }>>\n2) Function: shortestPath(graph, start, end): { path: string[], distance: number } | null\n3) Use a min-priority queue (implement it, don't import). Use binary heap.\n4) Return the path as an array of node IDs and the total distance. Return null if no path exists.\n5) Handle edge cases: empty graph, start equals end, disconnected nodes, negative weights (throw error if negative), multiple equal-cost paths (return any), cycles.\n6) Write a BinaryHeap class with push, pop, isEmpty, decreaseKey.\n7) TypeScript generics: BinaryHeap<T>\n8) Include JSDoc for both BinaryHeap and shortestPath.\n9) Write comprehensive tests covering all edge cases.\nSingle file `dijkstra.ts`.",
|
|
84
|
-
"expectedFiles": ["dijkstra.ts"],
|
|
85
|
-
"validation": { "type": "jest", "testFile": "dijkstra.test.ts" },
|
|
86
|
-
"passCriteria": { "testsPass": 1, "binaryHeapWorks": true, "pathCorrect": true, "negativeRejected": true, "allEdgeCases": true }
|
|
87
|
-
}
|
|
88
|
-
],
|
|
89
|
-
"rotation": {
|
|
90
|
-
"scenariosPerNight": 2,
|
|
91
|
-
"maxRunsPerScenario": 5,
|
|
92
|
-
"cooldownNights": 1
|
|
93
|
-
}
|
|
94
|
-
}
|
|
@@ -1,129 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"meta": {
|
|
3
|
-
"generated_at": "2026-05-23T07:01:58.571Z",
|
|
4
|
-
"type": "vibeos-mode-benchmark-final",
|
|
5
|
-
"version": "1.0",
|
|
6
|
-
"total_cost_usd": 0.33294155999999997
|
|
7
|
-
},
|
|
8
|
-
"tier_summary": [
|
|
9
|
-
{
|
|
10
|
-
"tier": "brain",
|
|
11
|
-
"runs": 15,
|
|
12
|
-
"avg_latency_ms": 329,
|
|
13
|
-
"avg_tps": 7564,
|
|
14
|
-
"total_tok_out": 36892,
|
|
15
|
-
"total_cost_usd": 0.295821,
|
|
16
|
-
"scenarios": [
|
|
17
|
-
"short-qa",
|
|
18
|
-
"medium-explain",
|
|
19
|
-
"long-codegen",
|
|
20
|
-
"short-math",
|
|
21
|
-
"medium-architecture",
|
|
22
|
-
"long-api-design"
|
|
23
|
-
]
|
|
24
|
-
},
|
|
25
|
-
{
|
|
26
|
-
"tier": "medium",
|
|
27
|
-
"runs": 15,
|
|
28
|
-
"avg_latency_ms": 308,
|
|
29
|
-
"avg_tps": 6686,
|
|
30
|
-
"total_tok_out": 29542,
|
|
31
|
-
"total_cost_usd": 0.029679,
|
|
32
|
-
"scenarios": [
|
|
33
|
-
"short-qa",
|
|
34
|
-
"medium-explain",
|
|
35
|
-
"long-codegen",
|
|
36
|
-
"short-math",
|
|
37
|
-
"medium-architecture",
|
|
38
|
-
"long-api-design"
|
|
39
|
-
]
|
|
40
|
-
},
|
|
41
|
-
{
|
|
42
|
-
"tier": "cheap",
|
|
43
|
-
"runs": 8,
|
|
44
|
-
"avg_latency_ms": 391,
|
|
45
|
-
"avg_tps": 5344,
|
|
46
|
-
"total_tok_out": 13215,
|
|
47
|
-
"total_cost_usd": 0.007441560000000001,
|
|
48
|
-
"scenarios": [
|
|
49
|
-
"short-qa",
|
|
50
|
-
"medium-explain",
|
|
51
|
-
"long-codegen",
|
|
52
|
-
"short-math",
|
|
53
|
-
"medium-architecture",
|
|
54
|
-
"long-api-design"
|
|
55
|
-
]
|
|
56
|
-
}
|
|
57
|
-
],
|
|
58
|
-
"thinking_summary": [
|
|
59
|
-
{
|
|
60
|
-
"mode": "off",
|
|
61
|
-
"runs": 8,
|
|
62
|
-
"avg_latency_ms": 304,
|
|
63
|
-
"avg_tps": 5655
|
|
64
|
-
},
|
|
65
|
-
{
|
|
66
|
-
"mode": "brief",
|
|
67
|
-
"runs": 6,
|
|
68
|
-
"avg_latency_ms": 309,
|
|
69
|
-
"avg_tps": 6141
|
|
70
|
-
},
|
|
71
|
-
{
|
|
72
|
-
"mode": "full",
|
|
73
|
-
"runs": 6,
|
|
74
|
-
"avg_latency_ms": 342,
|
|
75
|
-
"avg_tps": 6631
|
|
76
|
-
}
|
|
77
|
-
],
|
|
78
|
-
"signals": [
|
|
79
|
-
{
|
|
80
|
-
"id": "TIER_PARITY",
|
|
81
|
-
"confidence": "HIGH",
|
|
82
|
-
"description": "Brain and medium have near-identical latency"
|
|
83
|
-
},
|
|
84
|
-
{
|
|
85
|
-
"id": "THINKING_OVERHEAD",
|
|
86
|
-
"confidence": "MEDIUM",
|
|
87
|
-
"description": "Full thinking adds ~15% latency"
|
|
88
|
-
},
|
|
89
|
-
{
|
|
90
|
-
"id": "CHEAP_GAP",
|
|
91
|
-
"confidence": "HIGH",
|
|
92
|
-
"description": "Cheap produces 60% shorter responses than medium"
|
|
93
|
-
},
|
|
94
|
-
{
|
|
95
|
-
"id": "CALIBRATION_GAP",
|
|
96
|
-
"confidence": "CRITICAL",
|
|
97
|
-
"description": "4/5 modes have zero real sessions"
|
|
98
|
-
},
|
|
99
|
-
{
|
|
100
|
-
"id": "HUMAN_FLOW_COST",
|
|
101
|
-
"confidence": "HIGH",
|
|
102
|
-
"description": "Real tasks cost 2-5x more than single-turn predictions"
|
|
103
|
-
},
|
|
104
|
-
{
|
|
105
|
-
"id": "AUTO_MODE_STUB",
|
|
106
|
-
"confidence": "HIGH",
|
|
107
|
-
"description": "autoSelectMode() always returns 'balanced' regardless of state"
|
|
108
|
-
}
|
|
109
|
-
],
|
|
110
|
-
"mode_gaps": [
|
|
111
|
-
{
|
|
112
|
-
"id": "FORENSIC",
|
|
113
|
-
"priority": "HIGH",
|
|
114
|
-
"description": "Debugging/sleuthing workflow — brain + full thinking + strict flow"
|
|
115
|
-
},
|
|
116
|
-
{
|
|
117
|
-
"id": "WEB_RESEARCH",
|
|
118
|
-
"priority": "HIGH",
|
|
119
|
-
"description": "Research/exploration — medium + full thinking + context7"
|
|
120
|
-
}
|
|
121
|
-
],
|
|
122
|
-
"recommendations": [
|
|
123
|
-
"Add FORENSIC mode with brain tier, full thinking, strict enforcement",
|
|
124
|
-
"Add WEB_RESEARCH mode with medium tier, full thinking, context7 required",
|
|
125
|
-
"Fix autoSelectMode() to use stress + regime for dynamic selection",
|
|
126
|
-
"Link calibration mode configs to computeControlVector()",
|
|
127
|
-
"Auto-record mode usage data to populate calibration for all modes"
|
|
128
|
-
]
|
|
129
|
-
}
|