opencode-swarm-plugin 0.20.0 → 0.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.beads/issues.jsonl +213 -0
- package/INTEGRATION_EXAMPLE.md +66 -0
- package/README.md +352 -522
- package/dist/index.js +2046 -984
- package/dist/plugin.js +2051 -1017
- package/docs/analysis/subagent-coordination-patterns.md +2 -0
- package/docs/semantic-memory-cli-syntax.md +123 -0
- package/docs/swarm-mail-architecture.md +1147 -0
- package/evals/README.md +116 -0
- package/evals/evalite.config.ts +15 -0
- package/evals/example.eval.ts +32 -0
- package/evals/fixtures/decomposition-cases.ts +105 -0
- package/evals/lib/data-loader.test.ts +288 -0
- package/evals/lib/data-loader.ts +111 -0
- package/evals/lib/llm.ts +115 -0
- package/evals/scorers/index.ts +200 -0
- package/evals/scorers/outcome-scorers.test.ts +27 -0
- package/evals/scorers/outcome-scorers.ts +349 -0
- package/evals/swarm-decomposition.eval.ts +112 -0
- package/package.json +8 -1
- package/scripts/cleanup-test-memories.ts +346 -0
- package/src/beads.ts +49 -0
- package/src/eval-capture.ts +487 -0
- package/src/index.ts +45 -3
- package/src/learning.integration.test.ts +19 -4
- package/src/output-guardrails.test.ts +438 -0
- package/src/output-guardrails.ts +381 -0
- package/src/schemas/index.ts +18 -0
- package/src/schemas/swarm-context.ts +115 -0
- package/src/storage.ts +117 -5
- package/src/streams/events.test.ts +296 -0
- package/src/streams/events.ts +93 -0
- package/src/streams/migrations.test.ts +24 -20
- package/src/streams/migrations.ts +51 -0
- package/src/streams/projections.ts +187 -0
- package/src/streams/store.ts +275 -0
- package/src/swarm-orchestrate.ts +771 -189
- package/src/swarm-prompts.ts +84 -12
- package/src/swarm.integration.test.ts +124 -0
- package/vitest.integration.config.ts +6 -0
- package/vitest.integration.setup.ts +48 -0
package/evals/README.md
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
# Evalite - Swarm Decomposition Evals
|
|
2
|
+
|
|
3
|
+
TypeScript-native evaluation framework for testing swarm task decomposition quality.
|
|
4
|
+
|
|
5
|
+
## Quick Start
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
# Watch mode for development
|
|
9
|
+
pnpm eval:dev
|
|
10
|
+
|
|
11
|
+
# Run all evals once
|
|
12
|
+
pnpm eval:run
|
|
13
|
+
|
|
14
|
+
# CI mode with 80% threshold
|
|
15
|
+
pnpm eval:ci
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
## Structure
|
|
19
|
+
|
|
20
|
+
```
|
|
21
|
+
evals/
|
|
22
|
+
├── evalite.config.ts # Evalite configuration
|
|
23
|
+
├── scorers/
|
|
24
|
+
│ └── index.ts # Custom scorers (independence, balance, coverage, clarity)
|
|
25
|
+
├── fixtures/
|
|
26
|
+
│ └── decomposition-cases.ts # Test cases with expected outcomes
|
|
27
|
+
└── *.eval.ts # Eval files (auto-discovered)
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
## Custom Scorers
|
|
31
|
+
|
|
32
|
+
### Subtask Independence (0-1)
|
|
33
|
+
|
|
34
|
+
Checks that no file appears in more than one subtask. Overlapping files cause merge conflicts and coordination overhead.
|
|
35
|
+
|
|
36
|
+
### Complexity Balance (0-1)
|
|
37
|
+
|
|
38
|
+
Measures the coefficient of variation (CV) of `estimated_complexity` across subtasks. A CV below 0.3 scores 1.0; above that, the score decreases linearly to 0 at CV = 1.0.
|
|
39
|
+
|
|
40
|
+
### Coverage Completeness (0-1)
|
|
41
|
+
|
|
42
|
+
If `expected.requiredFiles` is specified: the ratio of required files that are covered by the subtasks.
|
|
43
|
+
Otherwise: checks that the subtask count is within the `minSubtasks`/`maxSubtasks` range.
|
|
44
|
+
|
|
45
|
+
### Instruction Clarity (0-1)
|
|
46
|
+
|
|
47
|
+
Average quality score per subtask based on:
|
|
48
|
+
|
|
49
|
+
- Description length > 20 chars (+0.2)
|
|
50
|
+
- Files specified (+0.2)
|
|
51
|
+
- Non-generic title (+0.1)
|
|
52
|
+
|
|
53
|
+
## Writing Evals
|
|
54
|
+
|
|
55
|
+
```typescript
|
|
56
|
+
import { evalite } from "evalite";
|
|
57
|
+
import { subtaskIndependence, coverageCompleteness } from "./scorers/index.js";
|
|
58
|
+
|
|
59
|
+
evalite("My decomposition test", {
|
|
60
|
+
data: async () => {
|
|
61
|
+
return [
|
|
62
|
+
{
|
|
63
|
+
input: "Add OAuth authentication",
|
|
64
|
+
expected: {
|
|
65
|
+
minSubtasks: 3,
|
|
66
|
+
maxSubtasks: 6,
|
|
67
|
+
requiredFiles: ["src/auth/oauth.ts", "src/middleware/auth.ts"],
|
|
68
|
+
},
|
|
69
|
+
},
|
|
70
|
+
];
|
|
71
|
+
},
|
|
72
|
+
task: async (input) => {
|
|
73
|
+
// Call your decomposition logic here
|
|
74
|
+
// Should return BeadTree JSON as string
|
|
75
|
+
},
|
|
76
|
+
scorers: [subtaskIndependence, coverageCompleteness],
|
|
77
|
+
});
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
## BeadTree Format
|
|
81
|
+
|
|
82
|
+
Scorers expect output as JSON string matching:
|
|
83
|
+
|
|
84
|
+
```typescript
|
|
85
|
+
{
|
|
86
|
+
epic: {
|
|
87
|
+
title: string;
|
|
88
|
+
description: string;
|
|
89
|
+
}
|
|
90
|
+
subtasks: Array<{
|
|
91
|
+
title: string;
|
|
92
|
+
description?: string;
|
|
93
|
+
files?: string[];
|
|
94
|
+
estimated_complexity?: number; // 1-3
|
|
95
|
+
}>;
|
|
96
|
+
}
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
## Fixtures
|
|
100
|
+
|
|
101
|
+
See `fixtures/decomposition-cases.ts` for example test cases covering:
|
|
102
|
+
|
|
103
|
+
- OAuth implementation
|
|
104
|
+
- Rate limiting
|
|
105
|
+
- TypeScript migration
|
|
106
|
+
- Admin dashboard
|
|
107
|
+
- Memory leak debugging
|
|
108
|
+
- Feature flag system
|
|
109
|
+
|
|
110
|
+
## Notes
|
|
111
|
+
|
|
112
|
+
- Evalite v1.0.0-beta.15 installed
|
|
113
|
+
- Built on Vitest
|
|
114
|
+
- Runs locally, no API keys required
|
|
115
|
+
- Results cached in `node_modules/.evalite/`
|
|
116
|
+
- Clear cache if needed: `rm -rf node_modules/.evalite`
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
import { defineConfig } from "evalite/config";
|
|
2
|
+
|
|
3
|
+
/**
 * Evalite configuration for the swarm decomposition evals.
 *
 * The evals in this directory check that a task decomposition:
 * - keeps subtasks independent (no file conflicts),
 * - balances complexity across subtasks,
 * - covers the full task.
 */
export default defineConfig({
  // Intentionally empty base configuration:
  // - Evalite will auto-discover evals in this directory.
  // - Custom scorers are defined in scorers/index.ts.
  // - Test fixtures are in fixtures/decomposition-cases.ts.
});
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Example eval file to test Evalite setup
|
|
3
|
+
*
|
|
4
|
+
* This is a minimal test to verify:
|
|
5
|
+
* 1. Evalite CLI can discover .eval.ts files
|
|
6
|
+
* 2. createScorer works
|
|
7
|
+
* 3. evalite() function works
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
import { evalite } from "evalite";
|
|
11
|
+
import { subtaskIndependence } from "./scorers/index.js";
|
|
12
|
+
|
|
13
|
+
// Smoke test for the Evalite setup.
//
// The task is a passthrough: whatever `input` holds is returned unchanged and
// becomes the output handed to the scorer. The BeadTree JSON therefore has to
// be the `input` itself; storing it under a separate `output` key would leave
// the scorer with a plain, non-JSON task string.
evalite("Example: Basic scorer test", {
  data: async () => {
    // Two subtasks touching different files, serialized as a BeadTree.
    const beadTreeJson = JSON.stringify({
      epic: { title: "Test Epic", description: "Test" },
      subtasks: [
        { title: "Subtask 1", files: ["a.ts"], estimated_complexity: 1 },
        { title: "Subtask 2", files: ["b.ts"], estimated_complexity: 1 },
      ],
    });

    return [
      {
        input: beadTreeJson,
      },
    ];
  },
  task: async (input) => {
    return input; // passthrough for testing
  },
  scorers: [subtaskIndependence],
});
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Test cases for swarm task decomposition
|
|
3
|
+
*
|
|
4
|
+
* Each case includes:
|
|
5
|
+
* - input: task description and optional context
|
|
6
|
+
* - expected: validation criteria (min/max subtasks, required files)
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
/** One fixture for the swarm decomposition evals. */
export interface DecompositionTestCase {
  /** Task description and optional context given to the decomposer. */
  input: {
    task: string;
    context?: string;
  };
  /** Validation criteria: subtask count bounds and files that must be covered. */
  expected: {
    minSubtasks: number;
    maxSubtasks: number;
    requiredFiles?: string[];
  };
}

/**
 * Assembles a fixture from its parts so the table below stays compact.
 *
 * @param task - Task description
 * @param context - Context for the task
 * @param subtaskBounds - `[minSubtasks, maxSubtasks]`
 * @param requiredFiles - Files the decomposition is expected to cover
 * @returns The fixture in `DecompositionTestCase` shape
 */
function buildCase(
  task: string,
  context: string,
  subtaskBounds: readonly [number, number],
  requiredFiles: string[],
): DecompositionTestCase {
  const [minSubtasks, maxSubtasks] = subtaskBounds;
  return {
    input: { task, context },
    expected: { minSubtasks, maxSubtasks, requiredFiles },
  };
}

export const decompositionCases: DecompositionTestCase[] = [
  buildCase(
    "Add user authentication with OAuth",
    "Next.js App Router application with existing user model",
    [3, 6],
    [
      "src/auth/oauth.ts",
      "src/auth/middleware.ts",
      "app/api/auth/[...nextauth]/route.ts",
    ],
  ),
  buildCase(
    "Implement rate limiting for API endpoints",
    "Express.js API with Redis available",
    [2, 4],
    ["src/middleware/rate-limit.ts", "src/utils/redis-client.ts"],
  ),
  buildCase(
    "Add TypeScript strict mode to legacy JavaScript project",
    "Large codebase with 50+ JS files, currently untyped",
    [4, 8],
    ["tsconfig.json"],
  ),
  buildCase(
    "Create admin dashboard for user management",
    "React app with existing component library and API client",
    [4, 7],
    [
      "src/pages/admin/Dashboard.tsx",
      "src/components/admin/UserTable.tsx",
      "src/api/admin.ts",
    ],
  ),
  buildCase(
    "Fix memory leak in long-running background job",
    "Node.js worker that processes queue messages, memory grows over time",
    [2, 4],
    ["src/workers/queue-processor.ts"],
  ),
  buildCase(
    "Implement feature flag system with remote config",
    "Microservices architecture, need runtime toggles without deploys",
    [3, 6],
    [
      "src/feature-flags/client.ts",
      "src/feature-flags/middleware.ts",
      "src/feature-flags/types.ts",
    ],
  ),
];
|
|
@@ -0,0 +1,288 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Data Loader Tests
|
|
3
|
+
*
|
|
4
|
+
* Tests the PGlite-backed eval data loader functions.
|
|
5
|
+
* Uses a real in-memory PGlite database for accurate testing.
|
|
6
|
+
*/
|
|
7
|
+
import { describe, it, expect, beforeAll, afterAll } from "bun:test";
|
|
8
|
+
import {
|
|
9
|
+
loadEvalCases,
|
|
10
|
+
hasRealEvalData,
|
|
11
|
+
getEvalDataSummary,
|
|
12
|
+
} from "./data-loader.js";
|
|
13
|
+
import { appendEvent } from "../../src/streams/store.js";
|
|
14
|
+
import { getDatabase, closeDatabase } from "../../src/streams/index.js";
|
|
15
|
+
import type {
|
|
16
|
+
DecompositionGeneratedEvent,
|
|
17
|
+
SubtaskOutcomeEvent,
|
|
18
|
+
} from "../../src/streams/events.js";
|
|
19
|
+
import * as fs from "node:fs";
|
|
20
|
+
import * as path from "node:path";
|
|
21
|
+
import * as os from "node:os";
|
|
22
|
+
|
|
23
|
+
const TEST_PROJECT_KEY = "test-project-eval-loader";
|
|
24
|
+
|
|
25
|
+
// Create a unique temp directory for this test run
|
|
26
|
+
let testDir: string;
|
|
27
|
+
|
|
28
|
+
describe("Data Loader", () => {
|
|
29
|
+
beforeAll(async () => {
|
|
30
|
+
// Create temp directory for test database
|
|
31
|
+
testDir = fs.mkdtempSync(path.join(os.tmpdir(), "eval-loader-test-"));
|
|
32
|
+
|
|
33
|
+
// Initialize database by getting it (lazy init)
|
|
34
|
+
await getDatabase(testDir);
|
|
35
|
+
});
|
|
36
|
+
|
|
37
|
+
afterAll(async () => {
|
|
38
|
+
await closeDatabase(testDir);
|
|
39
|
+
// Clean up temp directory
|
|
40
|
+
fs.rmSync(testDir, { recursive: true, force: true });
|
|
41
|
+
});
|
|
42
|
+
|
|
43
|
+
describe("loadEvalCases", () => {
|
|
44
|
+
it("transforms eval records to EvalCase format", async () => {
|
|
45
|
+
// Insert a decomposition event
|
|
46
|
+
const decompositionEvent: DecompositionGeneratedEvent = {
|
|
47
|
+
type: "decomposition_generated",
|
|
48
|
+
timestamp: Date.now(),
|
|
49
|
+
project_key: TEST_PROJECT_KEY,
|
|
50
|
+
epic_id: "epic-load-1",
|
|
51
|
+
task: "Add authentication",
|
|
52
|
+
context: "Next.js app",
|
|
53
|
+
strategy: "feature-based",
|
|
54
|
+
epic_title: "Auth Epic",
|
|
55
|
+
subtasks: [
|
|
56
|
+
{ title: "OAuth setup", files: ["src/auth/oauth.ts"], priority: 1 },
|
|
57
|
+
{
|
|
58
|
+
title: "Session management",
|
|
59
|
+
files: ["src/auth/session.ts"],
|
|
60
|
+
priority: 2,
|
|
61
|
+
},
|
|
62
|
+
],
|
|
63
|
+
};
|
|
64
|
+
await appendEvent(decompositionEvent, testDir);
|
|
65
|
+
|
|
66
|
+
// Insert outcome events for both subtasks
|
|
67
|
+
const outcome1: SubtaskOutcomeEvent = {
|
|
68
|
+
type: "subtask_outcome",
|
|
69
|
+
timestamp: Date.now(),
|
|
70
|
+
project_key: TEST_PROJECT_KEY,
|
|
71
|
+
epic_id: "epic-load-1",
|
|
72
|
+
bead_id: "epic-load-1.1",
|
|
73
|
+
planned_files: ["src/auth/oauth.ts"],
|
|
74
|
+
actual_files: ["src/auth/oauth.ts"],
|
|
75
|
+
duration_ms: 5000,
|
|
76
|
+
error_count: 0,
|
|
77
|
+
retry_count: 0,
|
|
78
|
+
success: true,
|
|
79
|
+
};
|
|
80
|
+
await appendEvent(outcome1, testDir);
|
|
81
|
+
|
|
82
|
+
const outcome2: SubtaskOutcomeEvent = {
|
|
83
|
+
type: "subtask_outcome",
|
|
84
|
+
timestamp: Date.now(),
|
|
85
|
+
project_key: TEST_PROJECT_KEY,
|
|
86
|
+
epic_id: "epic-load-1",
|
|
87
|
+
bead_id: "epic-load-1.2",
|
|
88
|
+
planned_files: ["src/auth/session.ts"],
|
|
89
|
+
actual_files: ["src/auth/session.ts"],
|
|
90
|
+
duration_ms: 3000,
|
|
91
|
+
error_count: 0,
|
|
92
|
+
retry_count: 0,
|
|
93
|
+
success: true,
|
|
94
|
+
};
|
|
95
|
+
await appendEvent(outcome2, testDir);
|
|
96
|
+
|
|
97
|
+
const cases = await loadEvalCases(TEST_PROJECT_KEY, {
|
|
98
|
+
projectPath: testDir,
|
|
99
|
+
});
|
|
100
|
+
|
|
101
|
+
expect(cases.length).toBeGreaterThanOrEqual(1);
|
|
102
|
+
const authCase = cases.find((c) => c.input.task === "Add authentication");
|
|
103
|
+
expect(authCase).toBeDefined();
|
|
104
|
+
expect(authCase!.input.context).toBe("Next.js app");
|
|
105
|
+
expect(authCase!.expected.minSubtasks).toBe(2);
|
|
106
|
+
expect(authCase!.expected.maxSubtasks).toBe(2);
|
|
107
|
+
expect(authCase!.expected.requiredFiles).toContain("src/auth/oauth.ts");
|
|
108
|
+
expect(authCase!.expected.requiredFiles).toContain("src/auth/session.ts");
|
|
109
|
+
expect(authCase!.actual).toBeDefined();
|
|
110
|
+
});
|
|
111
|
+
|
|
112
|
+
it("filters by success when successOnly is true", async () => {
|
|
113
|
+
// Insert a successful decomposition
|
|
114
|
+
const successEvent: DecompositionGeneratedEvent = {
|
|
115
|
+
type: "decomposition_generated",
|
|
116
|
+
timestamp: Date.now(),
|
|
117
|
+
project_key: TEST_PROJECT_KEY,
|
|
118
|
+
epic_id: "epic-success-filter",
|
|
119
|
+
task: "Success task for filter",
|
|
120
|
+
strategy: "feature-based",
|
|
121
|
+
epic_title: "Success Epic",
|
|
122
|
+
subtasks: [{ title: "Sub", files: ["src/success.ts"], priority: 1 }],
|
|
123
|
+
};
|
|
124
|
+
await appendEvent(successEvent, testDir);
|
|
125
|
+
|
|
126
|
+
// Mark it successful
|
|
127
|
+
const successOutcome: SubtaskOutcomeEvent = {
|
|
128
|
+
type: "subtask_outcome",
|
|
129
|
+
timestamp: Date.now(),
|
|
130
|
+
project_key: TEST_PROJECT_KEY,
|
|
131
|
+
epic_id: "epic-success-filter",
|
|
132
|
+
bead_id: "epic-success-filter.1",
|
|
133
|
+
planned_files: ["src/success.ts"],
|
|
134
|
+
actual_files: ["src/success.ts"],
|
|
135
|
+
duration_ms: 1000,
|
|
136
|
+
error_count: 0,
|
|
137
|
+
retry_count: 0,
|
|
138
|
+
success: true,
|
|
139
|
+
};
|
|
140
|
+
await appendEvent(successOutcome, testDir);
|
|
141
|
+
|
|
142
|
+
// Insert a failed decomposition
|
|
143
|
+
const failEvent: DecompositionGeneratedEvent = {
|
|
144
|
+
type: "decomposition_generated",
|
|
145
|
+
timestamp: Date.now(),
|
|
146
|
+
project_key: TEST_PROJECT_KEY,
|
|
147
|
+
epic_id: "epic-fail-filter",
|
|
148
|
+
task: "Failed task for filter",
|
|
149
|
+
strategy: "feature-based",
|
|
150
|
+
epic_title: "Failed Epic",
|
|
151
|
+
subtasks: [{ title: "Sub", files: ["src/fail.ts"], priority: 1 }],
|
|
152
|
+
};
|
|
153
|
+
await appendEvent(failEvent, testDir);
|
|
154
|
+
|
|
155
|
+
// Mark it failed
|
|
156
|
+
const failOutcome: SubtaskOutcomeEvent = {
|
|
157
|
+
type: "subtask_outcome",
|
|
158
|
+
timestamp: Date.now(),
|
|
159
|
+
project_key: TEST_PROJECT_KEY,
|
|
160
|
+
epic_id: "epic-fail-filter",
|
|
161
|
+
bead_id: "epic-fail-filter.1",
|
|
162
|
+
planned_files: ["src/fail.ts"],
|
|
163
|
+
actual_files: [],
|
|
164
|
+
duration_ms: 500,
|
|
165
|
+
error_count: 3,
|
|
166
|
+
retry_count: 2,
|
|
167
|
+
success: false,
|
|
168
|
+
};
|
|
169
|
+
await appendEvent(failOutcome, testDir);
|
|
170
|
+
|
|
171
|
+
const successCases = await loadEvalCases(TEST_PROJECT_KEY, {
|
|
172
|
+
successOnly: true,
|
|
173
|
+
projectPath: testDir,
|
|
174
|
+
});
|
|
175
|
+
|
|
176
|
+
// Should only include successful cases
|
|
177
|
+
const failedCase = successCases.find(
|
|
178
|
+
(c) => c.input.task === "Failed task for filter",
|
|
179
|
+
);
|
|
180
|
+
expect(failedCase).toBeUndefined();
|
|
181
|
+
});
|
|
182
|
+
|
|
183
|
+
it("passes strategy filter to getEvalRecords", async () => {
|
|
184
|
+
// Insert file-based decomposition
|
|
185
|
+
const fileBasedEvent: DecompositionGeneratedEvent = {
|
|
186
|
+
type: "decomposition_generated",
|
|
187
|
+
timestamp: Date.now(),
|
|
188
|
+
project_key: TEST_PROJECT_KEY,
|
|
189
|
+
epic_id: "epic-file-based",
|
|
190
|
+
task: "File-based task",
|
|
191
|
+
strategy: "file-based",
|
|
192
|
+
epic_title: "File Epic",
|
|
193
|
+
subtasks: [{ title: "Sub", files: ["src/file.ts"], priority: 1 }],
|
|
194
|
+
};
|
|
195
|
+
await appendEvent(fileBasedEvent, testDir);
|
|
196
|
+
|
|
197
|
+
const fileBasedCases = await loadEvalCases(TEST_PROJECT_KEY, {
|
|
198
|
+
strategy: "file-based",
|
|
199
|
+
projectPath: testDir,
|
|
200
|
+
});
|
|
201
|
+
|
|
202
|
+
// All returned cases should be file-based
|
|
203
|
+
for (const c of fileBasedCases) {
|
|
204
|
+
expect(c.actual?.strategy).toBe("file-based");
|
|
205
|
+
}
|
|
206
|
+
});
|
|
207
|
+
|
|
208
|
+
it("passes limit to getEvalRecords", async () => {
|
|
209
|
+
const cases = await loadEvalCases(TEST_PROJECT_KEY, {
|
|
210
|
+
limit: 2,
|
|
211
|
+
projectPath: testDir,
|
|
212
|
+
});
|
|
213
|
+
|
|
214
|
+
expect(cases.length).toBeLessThanOrEqual(2);
|
|
215
|
+
});
|
|
216
|
+
|
|
217
|
+
it("handles records with no context", async () => {
|
|
218
|
+
const noContextEvent: DecompositionGeneratedEvent = {
|
|
219
|
+
type: "decomposition_generated",
|
|
220
|
+
timestamp: Date.now(),
|
|
221
|
+
project_key: TEST_PROJECT_KEY,
|
|
222
|
+
epic_id: "epic-no-context",
|
|
223
|
+
task: "Task without context",
|
|
224
|
+
// context is undefined
|
|
225
|
+
strategy: "feature-based",
|
|
226
|
+
epic_title: "No Context Epic",
|
|
227
|
+
subtasks: [{ title: "Sub", files: [], priority: 1 }],
|
|
228
|
+
};
|
|
229
|
+
await appendEvent(noContextEvent, testDir);
|
|
230
|
+
|
|
231
|
+
const cases = await loadEvalCases(TEST_PROJECT_KEY, {
|
|
232
|
+
projectPath: testDir,
|
|
233
|
+
});
|
|
234
|
+
const noContextCase = cases.find(
|
|
235
|
+
(c) => c.input.task === "Task without context",
|
|
236
|
+
);
|
|
237
|
+
|
|
238
|
+
expect(noContextCase).toBeDefined();
|
|
239
|
+
expect(noContextCase!.input.context).toBeUndefined();
|
|
240
|
+
});
|
|
241
|
+
});
|
|
242
|
+
|
|
243
|
+
describe("hasRealEvalData", () => {
|
|
244
|
+
it("returns true when enough records exist", async () => {
|
|
245
|
+
// We've inserted several records above, should have enough
|
|
246
|
+
const hasData = await hasRealEvalData(TEST_PROJECT_KEY, 1, testDir);
|
|
247
|
+
expect(hasData).toBe(true);
|
|
248
|
+
});
|
|
249
|
+
|
|
250
|
+
it("returns false when not enough records exist", async () => {
|
|
251
|
+
// Use a project key with no data
|
|
252
|
+
const hasData = await hasRealEvalData("nonexistent-project", 5, testDir);
|
|
253
|
+
expect(hasData).toBe(false);
|
|
254
|
+
});
|
|
255
|
+
|
|
256
|
+
it("uses custom minRecords threshold", async () => {
|
|
257
|
+
// Should have at least 1 record
|
|
258
|
+
const hasData = await hasRealEvalData(TEST_PROJECT_KEY, 1, testDir);
|
|
259
|
+
expect(hasData).toBe(true);
|
|
260
|
+
|
|
261
|
+
// Should not have 1000 records
|
|
262
|
+
const hasLotsOfData = await hasRealEvalData(
|
|
263
|
+
TEST_PROJECT_KEY,
|
|
264
|
+
1000,
|
|
265
|
+
testDir,
|
|
266
|
+
);
|
|
267
|
+
expect(hasLotsOfData).toBe(false);
|
|
268
|
+
});
|
|
269
|
+
});
|
|
270
|
+
|
|
271
|
+
describe("getEvalDataSummary", () => {
|
|
272
|
+
it("returns formatted summary with hasEnoughData flag", async () => {
|
|
273
|
+
const summary = await getEvalDataSummary(TEST_PROJECT_KEY, testDir);
|
|
274
|
+
|
|
275
|
+
expect(summary.totalRecords).toBeGreaterThanOrEqual(1);
|
|
276
|
+
expect(typeof summary.successRate).toBe("number");
|
|
277
|
+
expect(typeof summary.byStrategy).toBe("object");
|
|
278
|
+
expect(typeof summary.hasEnoughData).toBe("boolean");
|
|
279
|
+
});
|
|
280
|
+
|
|
281
|
+
it("sets hasEnoughData based on record count", async () => {
|
|
282
|
+
// Empty project should not have enough data
|
|
283
|
+
const emptySummary = await getEvalDataSummary("empty-project", testDir);
|
|
284
|
+
expect(emptySummary.hasEnoughData).toBe(false);
|
|
285
|
+
expect(emptySummary.totalRecords).toBe(0);
|
|
286
|
+
});
|
|
287
|
+
});
|
|
288
|
+
});
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* PGlite-backed eval data loader
|
|
3
|
+
*
|
|
4
|
+
* Loads real decomposition outcomes from the eval_records table
|
|
5
|
+
* for use in Evalite evals.
|
|
6
|
+
*/
|
|
7
|
+
import {
|
|
8
|
+
getEvalRecords,
|
|
9
|
+
getEvalStats,
|
|
10
|
+
type EvalRecord,
|
|
11
|
+
} from "../../src/streams/projections.js";
|
|
12
|
+
|
|
13
|
+
/** A single eval case in the shape produced by `loadEvalCases`. */
export interface EvalCase {
  /** Task description plus optional context. */
  input: {
    task: string;
    context?: string;
  };
  /** Criteria the decomposition is checked against. */
  expected: {
    minSubtasks: number;
    maxSubtasks: number;
    requiredFiles?: string[];
    overallSuccess?: boolean;
  };
  /** The stored eval record this case was built from, when there is one. */
  actual?: EvalRecord;
}
|
|
23
|
+
|
|
24
|
+
/**
 * Load eval cases from PGlite.
 *
 * Fetches eval records through `getEvalRecords`, optionally keeps only the
 * records whose `overall_success` is `true`, and maps each record to an
 * `EvalCase`.
 *
 * @param projectKey - Project key for filtering records
 * @param options - Filter options
 * @param options.limit - Forwarded to `getEvalRecords`
 * @param options.strategy - Forwarded to `getEvalRecords`
 * @param options.successOnly - When true, drop records that are not marked successful
 * @param options.projectPath - Optional project path for database lookup
 * @returns Array of eval cases ready for Evalite
 */
export async function loadEvalCases(
  projectKey: string,
  options?: {
    limit?: number;
    strategy?: "file-based" | "feature-based" | "risk-based";
    successOnly?: boolean;
    projectPath?: string;
  },
): Promise<EvalCase[]> {
  const { limit, strategy, successOnly, projectPath } = options ?? {};

  // Query eval records from PGlite
  const records = await getEvalRecords(
    projectKey,
    { limit, strategy },
    projectPath,
  );

  // NOTE(review): `limit` is forwarded to getEvalRecords and the success
  // filter runs afterwards, so with `successOnly` the result may hold fewer
  // than `limit` cases even when more successful records exist - confirm
  // whether that is intended.
  const filtered = successOnly
    ? records.filter((r) => r.overall_success === true)
    : records;

  // Transform to EvalCase format
  return filtered.map((record) => {
    const subtaskCount = record.subtasks.length;

    return {
      input: {
        task: record.task,
        context: record.context ?? undefined,
      },
      expected: {
        // Keep the default lower bound of 2, but never above the upper bound:
        // a record with fewer than two subtasks would otherwise yield
        // minSubtasks > maxSubtasks.
        minSubtasks: Math.min(2, subtaskCount),
        maxSubtasks: subtaskCount,
        // A file listed by several subtasks is still one required file.
        requiredFiles: [...new Set(record.subtasks.flatMap((s) => s.files))],
        overallSuccess: record.overall_success ?? undefined,
      },
      actual: record,
    };
  });
}
|
|
69
|
+
|
|
70
|
+
/**
 * Tell whether a project has accumulated enough real eval records to run
 * evals against them.
 *
 * @param projectKey - Project key to check
 * @param minRecords - Minimum number of records required (default: 5)
 * @param projectPath - Optional project path for database lookup
 * @returns True when the record count reaches `minRecords`
 */
export async function hasRealEvalData(
  projectKey: string,
  minRecords: number = 5,
  projectPath?: string,
): Promise<boolean> {
  const { totalRecords } = await getEvalStats(projectKey, projectPath);
  const meetsThreshold = totalRecords >= minRecords;
  return meetsThreshold;
}
|
|
86
|
+
|
|
87
|
+
/**
 * Get eval data stats for reporting.
 *
 * Reads the stats from `getEvalStats` and adds a `hasEnoughData` flag that is
 * true once the record count reaches `minRecords`.
 *
 * @param projectKey - Project key to query
 * @param projectPath - Optional project path for database lookup
 * @param minRecords - Record count at which `hasEnoughData` becomes true
 *   (default: 5, the same default `hasRealEvalData` uses)
 * @returns Summary of available eval data
 */
export async function getEvalDataSummary(
  projectKey: string,
  projectPath?: string,
  minRecords: number = 5,
): Promise<{
  totalRecords: number;
  successRate: number;
  byStrategy: Record<string, number>;
  hasEnoughData: boolean;
}> {
  const { totalRecords, successRate, byStrategy } = await getEvalStats(
    projectKey,
    projectPath,
  );

  return {
    totalRecords,
    successRate,
    byStrategy,
    hasEnoughData: totalRecords >= minRecords,
  };
}
|