agents-harness 0.3.0 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +1 -1
- package/dist/core/file-protocol.d.ts +4 -2
- package/dist/core/file-protocol.js +71 -1
- package/dist/core/file-protocol.js.map +1 -1
- package/dist/core/orchestrator.d.ts +1 -0
- package/dist/core/orchestrator.js +9 -2
- package/dist/core/orchestrator.js.map +1 -1
- package/dist/core/types.d.ts +20 -0
- package/dist/dashboard/static/index.html +518 -209
- package/dist/defaults/criteria.d.ts +3 -0
- package/dist/defaults/criteria.js +124 -0
- package/dist/defaults/criteria.js.map +1 -1
- package/dist/defaults/project-type.d.ts +2 -0
- package/dist/defaults/project-type.js +48 -0
- package/dist/defaults/project-type.js.map +1 -0
- package/dist/defaults/prompts.js +40 -18
- package/dist/defaults/prompts.js.map +1 -1
- package/dist/index.d.ts +3 -2
- package/dist/index.js +2 -1
- package/dist/index.js.map +1 -1
- package/package.json +2 -2
- package/dist/dashboard/static/static/.gitkeep +0 -0
- package/dist/dashboard/static/static/index.html +0 -622
|
@@ -1 +1,4 @@
|
|
|
1
|
+
import type { EvalDimension, ProjectType } from "../core/types.js";
|
|
1
2
|
/**
 * Default evaluation criteria as a Markdown document with four sections:
 * Correctness, Testing, Code Quality and Integration.
 */
export declare const DEFAULT_CRITERIA = "## Default Evaluation Criteria\n\n### Correctness\n- All features specified in the contract are implemented and functional\n- No placeholder, stubbed, or mocked implementations in production code\n- Code runs without runtime errors\n\n### Testing\n- New features have corresponding tests\n- All tests pass when the test suite is run\n- Tests cover the primary success path and key edge cases\n\n### Code Quality\n- Code follows project conventions (from CLAUDE.md if present)\n- No leftover TODO or FIXME comments\n- No debug logging or commented-out code left in place\n- Imports are clean \u2014 no unused imports\n\n### Integration\n- New code integrates with existing codebase without breaking existing functionality\n- Existing tests still pass after changes\n";
|
|
3
|
+
/**
 * Returns the scoring dimensions for a project type: the universal set,
 * plus the backend and/or frontend sets for "backend", "frontend" and
 * "fullstack" projects.
 *
 * @param projectType - Project classification to select dimensions for.
 * @returns A new array of dimensions (universal entries first).
 */
export declare function getDimensions(projectType: ProjectType): EvalDimension[];
|
|
4
|
+
/**
 * Renders dimensions as Markdown text. Each dimension produces a `###`
 * heading carrying its weight and minimum score, its description, a
 * `Rubric:` line, and a trailing blank line.
 *
 * @param dimensions - Dimensions to render, in output order.
 * @returns The newline-joined block; an empty string for an empty list.
 */
export declare function formatDimensionsBlock(dimensions: EvalDimension[]): string;
|
|
@@ -20,4 +20,128 @@ export const DEFAULT_CRITERIA = `## Default Evaluation Criteria
|
|
|
20
20
|
- New code integrates with existing codebase without breaking existing functionality
|
|
21
21
|
- Existing tests still pass after changes
|
|
22
22
|
`;
|
|
23
|
+
/**
 * Scoring dimensions included for every project type.
 *
 * `threshold` is rendered into the evaluator prompt as the minimum score
 * out of 10; `weight` is rendered next to it (how weights are aggregated
 * is decided elsewhere — not visible from this module).
 */
const UNIVERSAL_DIMENSIONS = [
    // Does the implementation do what the contract promised?
    {
        id: "correctness", name: "Correctness",
        description: "Features work as specified, no placeholders, no runtime errors",
        weight: 2, threshold: 6,
        rubric: "3=major features missing or broken; 5=features work with gaps; 7=solid with minor issues; 9=exceeds requirements",
    },
    // Test presence, pass rate and coverage.
    {
        id: "testing", name: "Testing",
        description: "Tests exist, pass, cover happy path and edge cases",
        weight: 1.5, threshold: 5,
        rubric: "3=no tests or most fail; 5=happy path covered; 7=good coverage with edge cases; 9=comprehensive with mocks and integration",
    },
    // Conventions, dead code, import hygiene.
    {
        id: "code-quality", name: "Code Quality",
        description: "Follows conventions, no TODOs/dead code, clean imports",
        weight: 1, threshold: 5,
        rubric: "3=inconsistent style, dead code; 5=acceptable, minor issues; 7=clean and consistent; 9=exemplary, idiomatic",
    },
    // Fit with the existing codebase.
    {
        id: "integration", name: "Integration",
        description: "Existing tests pass, follows existing patterns, no regressions",
        weight: 1.5, threshold: 6,
        rubric: "3=breaks existing tests; 5=works but diverges from patterns; 7=integrates cleanly; 9=enhances existing architecture",
    },
    // Structural design quality.
    {
        id: "design-principles", name: "Design Principles",
        description: "SOLID, DRY, separation of concerns, appropriate abstractions",
        weight: 1, threshold: 5,
        rubric: "3=tangled responsibilities, heavy duplication; 5=reasonable structure; 7=clean separation, minimal duplication; 9=elegant, well-abstracted",
    },
    // Failure paths and input validation.
    {
        id: "error-handling", name: "Error Handling",
        description: "Proper error propagation, edge cases handled, input validation",
        weight: 1, threshold: 5,
        rubric: "3=errors swallowed or crash; 5=basic error handling; 7=graceful handling with informative messages; 9=comprehensive with recovery strategies",
    },
];
|
|
73
|
+
/**
 * Additional scoring dimensions appended for "backend" and "fullstack"
 * projects. Same field layout as the universal dimensions.
 */
const BACKEND_DIMENSIONS = [
    // Shape and consistency of the exposed API.
    {
        id: "api-design", name: "API Design",
        description: "Consistent endpoints, status codes, input validation, error responses",
        weight: 1.5, threshold: 6,
        rubric: "3=inconsistent or broken endpoints; 5=functional but inconsistent; 7=clean REST/GraphQL with proper status codes; 9=well-documented, versioned, idiomatic",
    },
    // Protection against data loss and invalid data.
    {
        id: "data-integrity", name: "Data Integrity",
        description: "Transactions, data validation at boundaries, no data loss paths",
        weight: 1.5, threshold: 6,
        rubric: "3=data loss possible; 5=basic validation; 7=proper transactions and boundary checks; 9=bulletproof data handling",
    },
    // Behaviour under concurrent access.
    {
        id: "concurrency-safety", name: "Concurrency Safety",
        description: "No race conditions, shared state protected, timeout handling",
        weight: 1, threshold: 5,
        rubric: "3=race conditions present; 5=basic locking; 7=proper concurrency patterns; 9=lock-free or formally verified",
    },
];
|
|
99
|
+
/**
 * Additional scoring dimensions appended for "frontend" and "fullstack"
 * projects. Same field layout as the universal dimensions.
 */
const FRONTEND_DIMENSIONS = [
    // Visual polish and state coverage of the UI.
    {
        id: "ui-ux-quality", name: "UI/UX Quality",
        description: "Consistent visuals, responsive, loading/error states, intuitive flows",
        weight: 1.5, threshold: 5,
        rubric: "3=broken layout or missing states; 5=functional UI with gaps; 7=polished with loading/error states; 9=delightful, pixel-perfect",
    },
    // How components are split and how state flows.
    {
        id: "component-architecture", name: "Component Architecture",
        description: "Clear responsibilities, proper state management, clean props",
        weight: 1, threshold: 5,
        rubric: "3=monolithic components, prop drilling; 5=reasonable split; 7=clean composition with proper state; 9=reusable, well-encapsulated",
    },
    // Lowest minimum score (4) of all dimensions in this module.
    {
        id: "accessibility", name: "Accessibility",
        description: "Semantic HTML, keyboard navigation, ARIA labels",
        weight: 1, threshold: 4,
        rubric: "3=no semantic HTML; 5=basic semantics; 7=keyboard nav and ARIA; 9=WCAG AA compliant",
    },
];
|
|
125
|
+
/**
 * Select the scoring dimensions that apply to a project type.
 *
 * The universal dimensions always come first; backend dimensions precede
 * frontend dimensions for "fullstack" projects.
 *
 * The returned array is a fresh shallow copy, so callers may push to or
 * reorder it freely. The dimension objects themselves are shared with the
 * module-level tables and should not be mutated.
 *
 * @param projectType One of "frontend", "backend", "fullstack" or "universal".
 * @returns The ordered list of dimensions. Any unrecognised value yields the
 *          universal set.
 */
export function getDimensions(projectType) {
    switch (projectType) {
        case "frontend":
            return [...UNIVERSAL_DIMENSIONS, ...FRONTEND_DIMENSIONS];
        case "backend":
            return [...UNIVERSAL_DIMENSIONS, ...BACKEND_DIMENSIONS];
        case "fullstack":
            return [...UNIVERSAL_DIMENSIONS, ...BACKEND_DIMENSIONS, ...FRONTEND_DIMENSIONS];
        case "universal":
        default:
            // This function is part of the public export surface, so untyped
            // JavaScript callers can pass an unexpected value. Without a
            // default the switch fell through and returned undefined, which
            // made formatDimensionsBlock throw while iterating. Falling back
            // to the universal set keeps the evaluator prompt buildable.
            return [...UNIVERSAL_DIMENSIONS];
    }
}
|
|
137
|
+
/**
 * Render scoring dimensions as a Markdown block for the evaluator prompt.
 *
 * Each dimension contributes four lines: a `###` heading with its weight and
 * minimum score, its description, a `Rubric:` line, and an empty separator
 * line. Lines are joined with "\n", so a non-empty result ends with a single
 * newline and an empty input yields "".
 *
 * @param dimensions Array of dimensions to render, in output order.
 * @returns The formatted block.
 */
export function formatDimensionsBlock(dimensions) {
    return dimensions
        .flatMap(({ name, weight, threshold, description, rubric }) => [
            `### ${name} (weight: ${weight}, min: ${threshold}/10)`,
            description,
            `Rubric: ${rubric}`,
            "",
        ])
        .join("\n");
}
|
|
23
147
|
//# sourceMappingURL=criteria.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"criteria.js","sourceRoot":"","sources":["../../src/defaults/criteria.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"criteria.js","sourceRoot":"","sources":["../../src/defaults/criteria.ts"],"names":[],"mappings":"AAEA,MAAM,CAAC,MAAM,gBAAgB,GAAG;;;;;;;;;;;;;;;;;;;;;CAqB/B,CAAC;AAEF,MAAM,oBAAoB,GAAoB;IAC5C;QACE,EAAE,EAAE,aAAa;QACjB,IAAI,EAAE,aAAa;QACnB,WAAW,EAAE,gEAAgE;QAC7E,MAAM,EAAE,GAAG;QACX,SAAS,EAAE,CAAC;QACZ,MAAM,EAAE,kHAAkH;KAC3H;IACD;QACE,EAAE,EAAE,SAAS;QACb,IAAI,EAAE,SAAS;QACf,WAAW,EAAE,oDAAoD;QACjE,MAAM,EAAE,GAAG;QACX,SAAS,EAAE,CAAC;QACZ,MAAM,EAAE,4HAA4H;KACrI;IACD;QACE,EAAE,EAAE,cAAc;QAClB,IAAI,EAAE,cAAc;QACpB,WAAW,EAAE,wDAAwD;QACrE,MAAM,EAAE,GAAG;QACX,SAAS,EAAE,CAAC;QACZ,MAAM,EAAE,6GAA6G;KACtH;IACD;QACE,EAAE,EAAE,aAAa;QACjB,IAAI,EAAE,aAAa;QACnB,WAAW,EAAE,gEAAgE;QAC7E,MAAM,EAAE,GAAG;QACX,SAAS,EAAE,CAAC;QACZ,MAAM,EAAE,qHAAqH;KAC9H;IACD;QACE,EAAE,EAAE,mBAAmB;QACvB,IAAI,EAAE,mBAAmB;QACzB,WAAW,EAAE,8DAA8D;QAC3E,MAAM,EAAE,GAAG;QACX,SAAS,EAAE,CAAC;QACZ,MAAM,EAAE,4IAA4I;KACrJ;IACD;QACE,EAAE,EAAE,gBAAgB;QACpB,IAAI,EAAE,gBAAgB;QACtB,WAAW,EAAE,gEAAgE;QAC7E,MAAM,EAAE,GAAG;QACX,SAAS,EAAE,CAAC;QACZ,MAAM,EAAE,8IAA8I;KACvJ;CACF,CAAC;AAEF,MAAM,kBAAkB,GAAoB;IAC1C;QACE,EAAE,EAAE,YAAY;QAChB,IAAI,EAAE,YAAY;QAClB,WAAW,EAAE,uEAAuE;QACpF,MAAM,EAAE,GAAG;QACX,SAAS,EAAE,CAAC;QACZ,MAAM,EAAE,2JAA2J;KACpK;IACD;QACE,EAAE,EAAE,gBAAgB;QACpB,IAAI,EAAE,gBAAgB;QACtB,WAAW,EAAE,iEAAiE;QAC9E,MAAM,EAAE,GAAG;QACX,SAAS,EAAE,CAAC;QACZ,MAAM,EAAE,kHAAkH;KAC3H;IACD;QACE,EAAE,EAAE,oBAAoB;QACxB,IAAI,EAAE,oBAAoB;QAC1B,WAAW,EAAE,8DAA8D;QAC3E,MAAM,EAAE,GAAG;QACX,SAAS,EAAE,CAAC;QACZ,MAAM,EAAE,6GAA6G;KACtH;CACF,CAAC;AAEF,MAAM,mBAAmB,GAAoB;IAC3C;QACE,EAAE,EAAE,eAAe;QACnB,IAAI,EAAE,eAAe;QACrB,WAAW,EAAE,uEAAuE;QACpF,MAAM,EAAE,GAAG;QACX,SAAS,EAAE,CAAC;QACZ,MAAM,EAAE,iIAAiI;KAC1I;IACD;QACE,EAAE,EAAE,wBAAwB;QAC5B,IAAI,EAAE,wBAAwB;QAC9B,WAAW,EAAE,8DAA8D;QAC3E,MAAM,EAAE,GAAG;QACX,SAAS,EAAE,CAAC;QACZ,MAAM,EAAE,kIAAkI;KAC3I;IACD;QACE,EAAE,EAAE,eAAe;QACnB,IAAI,EAAE,eAAe;QACrB,WAAW,EAAE,iDAAiD;QAC9D,MAAM,EAAE,GAAG;QACX,SAAS,EAAE,CAAC;QACZ,MAAM,EAAE,qFAAqF;KAC9F;CACF,CAAC;AAEF,MAAM,UAAU,aAAa,CAAC,WAAwB;IACpD,QAAQ,
WAAW,EAAE,CAAC;QACpB,KAAK,UAAU;YACb,OAAO,CAAC,GAAG,oBAAoB,EAAE,GAAG,mBAAmB,CAAC,CAAC;QAC3D,KAAK,SAAS;YACZ,OAAO,CAAC,GAAG,oBAAoB,EAAE,GAAG,kBAAkB,CAAC,CAAC;QAC1D,KAAK,WAAW;YACd,OAAO,CAAC,GAAG,oBAAoB,EAAE,GAAG,kBAAkB,EAAE,GAAG,mBAAmB,CAAC,CAAC;QAClF,KAAK,WAAW;YACd,OAAO,CAAC,GAAG,oBAAoB,CAAC,CAAC;IACrC,CAAC;AACH,CAAC;AAED,MAAM,UAAU,qBAAqB,CAAC,UAA2B;IAC/D,MAAM,KAAK,GAAa,EAAE,CAAC;IAC3B,KAAK,MAAM,GAAG,IAAI,UAAU,EAAE,CAAC;QAC7B,KAAK,CAAC,IAAI,CAAC,OAAO,GAAG,CAAC,IAAI,aAAa,GAAG,CAAC,MAAM,UAAU,GAAG,CAAC,SAAS,MAAM,CAAC,CAAC;QAChF,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,WAAW,CAAC,CAAC;QAC5B,KAAK,CAAC,IAAI,CAAC,WAAW,GAAG,CAAC,MAAM,EAAE,CAAC,CAAC;QACpC,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IACjB,CAAC;IACD,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC1B,CAAC"}
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
// Lookup tables consulted by detectProjectType when classifying a workspace.

/** Framework identifiers that mark a workspace as frontend. */
const FRONTEND_FRAMEWORKS = new Set(["vite", "react", "vue", "svelte", "angular"]);

/** Framework identifiers that mark a workspace as backend. */
const BACKEND_FRAMEWORKS = new Set(["django", "fastapi", "express", "flask", "hono", "gin", "rails"]);

/** Framework identifiers that classify the whole project as fullstack. */
const FULLSTACK_FRAMEWORKS = new Set(["nextjs"]);

/** Languages treated as backend when a workspace reports no framework. */
const BACKEND_LANGUAGES = new Set(["python", "go", "rust", "java", "ruby"]);
|
|
19
|
+
/**
 * Classify a project as "frontend", "backend", "fullstack" or "universal"
 * from the stacks of its workspaces.
 *
 * Per workspace:
 *  - a fullstack framework ends the scan immediately with "fullstack";
 *  - a known frontend or backend framework marks that side as present, and
 *    an unrecognised framework contributes nothing;
 *  - with no framework, a backend language marks backend; otherwise a
 *    configured dev server marks frontend.
 *
 * @param ctx Project context; `ctx.workspaces` is iterated and each entry's
 *            `stack.framework`, `stack.language` and `stack.devServer` are read.
 * @returns "fullstack" when both sides were seen, the single side when only
 *          one was, and "universal" when neither was detected.
 */
export function detectProjectType(ctx) {
    let sawFrontend = false;
    let sawBackend = false;
    for (const workspace of ctx.workspaces) {
        const { framework, language, devServer } = workspace.stack;
        if (framework) {
            if (FULLSTACK_FRAMEWORKS.has(framework)) {
                return "fullstack";
            }
            if (FRONTEND_FRAMEWORKS.has(framework)) {
                sawFrontend = true;
            }
            else if (BACKEND_FRAMEWORKS.has(framework)) {
                sawBackend = true;
            }
            // A framework was reported, so the language/dev-server
            // heuristics below are deliberately not applied.
            continue;
        }
        if (BACKEND_LANGUAGES.has(language)) {
            sawBackend = true;
        }
        else if (devServer) {
            sawFrontend = true;
        }
    }
    if (sawFrontend) {
        return sawBackend ? "fullstack" : "frontend";
    }
    return sawBackend ? "backend" : "universal";
}
|
|
48
|
+
//# sourceMappingURL=project-type.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"project-type.js","sourceRoot":"","sources":["../../src/defaults/project-type.ts"],"names":[],"mappings":"AAEA,MAAM,mBAAmB,GAAG,IAAI,GAAG,CAAC;IAClC,MAAM;IACN,OAAO;IACP,KAAK;IACL,QAAQ;IACR,SAAS;CACV,CAAC,CAAC;AAEH,MAAM,kBAAkB,GAAG,IAAI,GAAG,CAAC;IACjC,QAAQ;IACR,SAAS;IACT,SAAS;IACT,OAAO;IACP,MAAM;IACN,KAAK;IACL,OAAO;CACR,CAAC,CAAC;AAEH,MAAM,oBAAoB,GAAG,IAAI,GAAG,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC;AAEjD,MAAM,iBAAiB,GAAG,IAAI,GAAG,CAAC,CAAC,QAAQ,EAAE,IAAI,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,CAAC,CAAC,CAAC;AAE5E,MAAM,UAAU,iBAAiB,CAAC,GAAmB;IACnD,IAAI,WAAW,GAAG,KAAK,CAAC;IACxB,IAAI,UAAU,GAAG,KAAK,CAAC;IAEvB,KAAK,MAAM,EAAE,IAAI,GAAG,CAAC,UAAU,EAAE,CAAC;QAChC,MAAM,EAAE,SAAS,EAAE,QAAQ,EAAE,SAAS,EAAE,GAAG,EAAE,CAAC,KAAK,CAAC;QAEpD,IAAI,SAAS,IAAI,oBAAoB,CAAC,GAAG,CAAC,SAAS,CAAC,EAAE,CAAC;YACrD,OAAO,WAAW,CAAC;QACrB,CAAC;QAED,IAAI,SAAS,IAAI,mBAAmB,CAAC,GAAG,CAAC,SAAS,CAAC,EAAE,CAAC;YACpD,WAAW,GAAG,IAAI,CAAC;QACrB,CAAC;aAAM,IAAI,SAAS,IAAI,kBAAkB,CAAC,GAAG,CAAC,SAAS,CAAC,EAAE,CAAC;YAC1D,UAAU,GAAG,IAAI,CAAC;QACpB,CAAC;aAAM,IAAI,CAAC,SAAS,IAAI,iBAAiB,CAAC,GAAG,CAAC,QAAQ,CAAC,EAAE,CAAC;YACzD,UAAU,GAAG,IAAI,CAAC;QACpB,CAAC;aAAM,IAAI,CAAC,SAAS,IAAI,SAAS,EAAE,CAAC;YACnC,WAAW,GAAG,IAAI,CAAC;QACrB,CAAC;IACH,CAAC;IAED,IAAI,WAAW,IAAI,UAAU;QAAE,OAAO,WAAW,CAAC;IAClD,IAAI,WAAW;QAAE,OAAO,UAAU,CAAC;IACnC,IAAI,UAAU;QAAE,OAAO,SAAS,CAAC;IACjC,OAAO,WAAW,CAAC;AACrB,CAAC"}
|
package/dist/defaults/prompts.js
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { getDimensions, formatDimensionsBlock } from "./criteria.js";
|
|
2
|
+
import { detectProjectType } from "./project-type.js";
|
|
2
3
|
const PLANNER_BASE = `You are a product planner. Your job is to convert user descriptions into comprehensive product specifications, break them into sprints, and write sprint contracts.
|
|
3
4
|
|
|
4
5
|
RULES:
|
|
@@ -21,29 +22,47 @@ RULES:
|
|
|
21
22
|
- Commit your work with conventional commit messages.
|
|
22
23
|
- Do NOT evaluate your own work. Do NOT say "this looks good" or "everything is working."
|
|
23
24
|
Your job is to implement, not judge. A separate evaluator will assess your work.`;
|
|
24
|
-
const EVALUATOR_BASE = `You are a
|
|
25
|
+
const EVALUATOR_BASE = `You are a code evaluator. Your job is to assess implementation quality across multiple dimensions.
|
|
25
26
|
|
|
26
27
|
MINDSET:
|
|
27
|
-
- Be
|
|
28
|
-
-
|
|
29
|
-
-
|
|
30
|
-
-
|
|
31
|
-
-
|
|
28
|
+
- Be fair and calibrated. Base scores on evidence.
|
|
29
|
+
- 5 = acceptable, meets minimum expectations
|
|
30
|
+
- 7 = good, solid with minor issues
|
|
31
|
+
- 9-10 = excellent, production-grade
|
|
32
|
+
- 3 or below = significant problems
|
|
33
|
+
- Run the test suite. Read the code. Verify behavior.
|
|
34
|
+
- Stubbed, mocked, or placeholder implementations in production code score low on Correctness.
|
|
35
|
+
|
|
36
|
+
CALIBRATION EXAMPLES:
|
|
37
|
+
- All features work but one edge case unhandled: Correctness = 7
|
|
38
|
+
- Tests exist but only happy path: Testing = 5
|
|
39
|
+
- API returns correct data but 500s on invalid input: Error Handling = 4
|
|
40
|
+
- Code works but duplicates logic across files: Design Principles = 4
|
|
41
|
+
- Clean code, follows all conventions: Code Quality = 8
|
|
32
42
|
|
|
33
43
|
PROCESS:
|
|
34
44
|
1. Read .harness/contract.md for what was promised
|
|
35
45
|
2. Read the actual code that was written (use Grep and Read)
|
|
36
46
|
3. Run the test suite
|
|
37
|
-
4.
|
|
47
|
+
4. Score each dimension below with evidence
|
|
38
48
|
5. Write your evaluation to .harness/evaluation.md
|
|
39
49
|
|
|
50
|
+
{{DIMENSIONS}}
|
|
51
|
+
|
|
40
52
|
YOUR OUTPUT FORMAT (write to .harness/evaluation.md):
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
53
|
+
Overall: PASS or FAIL
|
|
54
|
+
Score: X.X/10
|
|
55
|
+
|
|
56
|
+
## Dimensions
|
|
57
|
+
|
|
58
|
+
### [Dimension Name]
|
|
59
|
+
Score: N/10
|
|
60
|
+
Rationale: (1-2 sentences with specific evidence)
|
|
61
|
+
|
|
62
|
+
(repeat for each dimension)
|
|
63
|
+
|
|
64
|
+
## Critique
|
|
65
|
+
(actionable feedback for improvements — what's wrong and what needs to change)`;
|
|
47
66
|
const BASE_PROMPTS = {
|
|
48
67
|
planner: PLANNER_BASE,
|
|
49
68
|
generator: GENERATOR_BASE,
|
|
@@ -106,13 +125,16 @@ export function buildSystemPrompt(role, ctx, appendPrompt) {
|
|
|
106
125
|
sections.push(BASE_PROMPTS[role]);
|
|
107
126
|
// 2. Project context
|
|
108
127
|
sections.push(`\n\n## PROJECT CONTEXT\n\n${formatProjectContext(ctx)}`);
|
|
109
|
-
// 3. Evaluation
|
|
128
|
+
// 3. Evaluation dimensions (evaluator only)
|
|
110
129
|
if (role === "evaluator") {
|
|
111
|
-
|
|
130
|
+
const projectType = detectProjectType(ctx);
|
|
131
|
+
const dimensions = getDimensions(projectType);
|
|
132
|
+
const dimensionsBlock = formatDimensionsBlock(dimensions);
|
|
133
|
+
// Replace {{DIMENSIONS}} placeholder in base prompt
|
|
134
|
+
sections[0] = sections[0].replace("{{DIMENSIONS}}", `## SCORING DIMENSIONS\n\n${dimensionsBlock}`);
|
|
112
135
|
if (ctx.criteria) {
|
|
113
|
-
|
|
136
|
+
sections.push(`\n\n## CUSTOM CRITERIA\n\n${ctx.criteria}`);
|
|
114
137
|
}
|
|
115
|
-
sections.push(`\n\n## EVALUATION CRITERIA\n\n${criteriaBlock}`);
|
|
116
138
|
}
|
|
117
139
|
// 4. Additional instructions
|
|
118
140
|
if (appendPrompt) {
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"prompts.js","sourceRoot":"","sources":["../../src/defaults/prompts.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,
|
|
1
|
+
{"version":3,"file":"prompts.js","sourceRoot":"","sources":["../../src/defaults/prompts.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,aAAa,EAAE,qBAAqB,EAAE,MAAM,eAAe,CAAC;AACrE,OAAO,EAAE,iBAAiB,EAAE,MAAM,mBAAmB,CAAC;AAEtD,MAAM,YAAY,GAAG;;;;;;;;;8DASyC,CAAC;AAE/D,MAAM,cAAc,GAAG;;;;;;;;;;;mFAW4D,CAAC;AAEpF,MAAM,cAAc,GAAG;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;+EAwCwD,CAAC;AAEhF,MAAM,YAAY,GAA8B;IAC9C,OAAO,EAAE,YAAY;IACrB,SAAS,EAAE,cAAc;IACzB,SAAS,EAAE,cAAc;CAC1B,CAAC;AAEF;;GAEG;AACH,MAAM,UAAU,oBAAoB,CAAC,GAAmB;IACtD,MAAM,KAAK,GAAa,EAAE,CAAC;IAE3B,KAAK,CAAC,IAAI,CAAC,oBAAoB,GAAG,CAAC,QAAQ,EAAE,CAAC,CAAC;IAC/C,KAAK,CAAC,IAAI,CAAC,SAAS,GAAG,CAAC,IAAI,EAAE,CAAC,CAAC;IAEhC,IAAI,GAAG,CAAC,KAAK,IAAI,GAAG,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACtC,KAAK,CAAC,IAAI,CAAC,UAAU,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IAC/C,CAAC;IAED,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IACf,KAAK,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;IAC1B,KAAK,MAAM,EAAE,IAAI,GAAG,CAAC,UAAU,EAAE,CAAC;QAChC,KAAK,CAAC,IAAI,CAAC,OAAO,EAAE,CAAC,IAAI,EAAE,CAAC,CAAC;QAC7B,KAAK,CAAC,IAAI,CAAC,iBAAiB,EAAE,CAAC,KAAK,CAAC,QAAQ,EAAE,CAAC,CAAC;QACjD,IAAI,EAAE,CAAC,KAAK,CAAC,SAAS,EAAE,CAAC;YACvB,KAAK,CAAC,IAAI,CAAC,kBAAkB,EAAE,CAAC,KAAK,CAAC,SAAS,EAAE,CAAC,CAAC;QACrD,CAAC;QACD,IAAI,EAAE,CAAC,KAAK,CAAC,UAAU,EAAE,CAAC;YACxB,KAAK,CAAC,IAAI,CAAC,oBAAoB,EAAE,CAAC,KAAK,CAAC,UAAU,EAAE,CAAC,CAAC;QACxD,CAAC;QACD,KAAK,CAAC,IAAI,CAAC,qBAAqB,EAAE,CAAC,KAAK,CAAC,WAAW,EAAE,CAAC,CAAC;QACxD,IAAI,EAAE,CAAC,KAAK,CAAC,WAAW,EAAE,CAAC;YACzB,KAAK,CAAC,IAAI,CAAC,qBAAqB,EAAE,CAAC,KAAK,CAAC,WAAW,EAAE,CAAC,CAAC;QAC1D,CAAC;QACD,IAAI,EAAE,CAAC,KAAK,CAAC,YAAY,EAAE,CAAC;YAC1B,KAAK,CAAC,IAAI,CAAC,sBAAsB,EAAE,CAAC,KAAK,CAAC,YAAY,EAAE,CAAC,CAAC;QAC5D,CAAC;QACD,IAAI,EAAE,CAAC,KAAK,CAAC,SAAS,EAAE,CAAC;YACvB,KAAK,CAAC,IAAI,CAAC,mBAAmB,EAAE,CAAC,KAAK,CAAC,SAAS,EAAE,CAAC,CAAC;QACtD,CAAC;QACD,IAAI,EAAE,CAAC,QAAQ,EAAE,CAAC;YAChB,KAAK,CAAC,IAAI,CAAC,kBAAkB,EAAE,CAAC,QAAQ,EAAE,CAAC,CAAC;QAC9C,CAAC;IACH,CAAC;IAED,IAAI,GAAG,CAAC,YAAY,EAAE,CAAC;QACrB,KAAK,CAAC,IAAI,CAA
C,EAAE,CAAC,CAAC;QACf,KAAK,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;QACzB,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,YAAY,CAAC,CAAC;IAC/B,CAAC;IAED,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC1B,CAAC;AAED;;;;;;;;GAQG;AACH,MAAM,UAAU,iBAAiB,CAC/B,IAAe,EACf,GAAmB,EACnB,YAAqB;IAErB,MAAM,QAAQ,GAAa,EAAE,CAAC;IAE9B,iBAAiB;IACjB,QAAQ,CAAC,IAAI,CAAC,YAAY,CAAC,IAAI,CAAC,CAAC,CAAC;IAElC,qBAAqB;IACrB,QAAQ,CAAC,IAAI,CAAC,6BAA6B,oBAAoB,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;IAExE,4CAA4C;IAC5C,IAAI,IAAI,KAAK,WAAW,EAAE,CAAC;QACzB,MAAM,WAAW,GAAG,iBAAiB,CAAC,GAAG,CAAC,CAAC;QAC3C,MAAM,UAAU,GAAG,aAAa,CAAC,WAAW,CAAC,CAAC;QAC9C,MAAM,eAAe,GAAG,qBAAqB,CAAC,UAAU,CAAC,CAAC;QAE1D,oDAAoD;QACpD,QAAQ,CAAC,CAAC,CAAC,GAAG,QAAQ,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,gBAAgB,EAAE,4BAA4B,eAAe,EAAE,CAAC,CAAC;QAEnG,IAAI,GAAG,CAAC,QAAQ,EAAE,CAAC;YACjB,QAAQ,CAAC,IAAI,CAAC,6BAA6B,GAAG,CAAC,QAAQ,EAAE,CAAC,CAAC;QAC7D,CAAC;IACH,CAAC;IAED,6BAA6B;IAC7B,IAAI,YAAY,EAAE,CAAC;QACjB,QAAQ,CAAC,IAAI,CAAC,qCAAqC,YAAY,EAAE,CAAC,CAAC;IACrE,CAAC;IAED,OAAO,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;AAC3B,CAAC"}
|
package/dist/index.d.ts
CHANGED
|
@@ -7,6 +7,7 @@ export { buildProjectContext } from "./discovery/project-context.js";
|
|
|
7
7
|
export { detectStack, detectRepoType, discoverWorkspaces } from "./discovery/stack-detector.js";
|
|
8
8
|
export { loadConfig, loadCriteria } from "./discovery/config-loader.js";
|
|
9
9
|
export { buildSystemPrompt, formatProjectContext } from "./defaults/prompts.js";
|
|
10
|
-
export { DEFAULT_CRITERIA } from "./defaults/criteria.js";
|
|
10
|
+
export { DEFAULT_CRITERIA, getDimensions, formatDimensionsBlock } from "./defaults/criteria.js";
|
|
11
|
+
export { detectProjectType } from "./defaults/project-type.js";
|
|
11
12
|
export { DashboardServer } from "./dashboard/server.js";
|
|
12
|
-
export type { Stack, Workspace, ProjectContext, AgentConfig, WorkspaceConfig, HarnessConfig, SprintStatus, RunStatus, Phase, SprintProgress, Progress, EvalResult, AgentRole, AgentDefinition, PhaseStartEvent, AgentActivityEvent, EvaluationEvent, CostUpdateEvent, SprintCompleteEvent, RunCompleteEvent, HarnessEvent, } from "./core/types.js";
|
|
13
|
+
export type { Stack, Workspace, ProjectContext, AgentConfig, WorkspaceConfig, HarnessConfig, SprintStatus, RunStatus, Phase, SprintProgress, Progress, EvalResult, ProjectType, EvalDimension, DimensionScore, AgentRole, AgentDefinition, PhaseStartEvent, AgentActivityEvent, EvaluationEvent, CostUpdateEvent, SprintCompleteEvent, RunCompleteEvent, HarnessEvent, } from "./core/types.js";
|
package/dist/index.js
CHANGED
|
@@ -8,7 +8,8 @@ export { detectStack, detectRepoType, discoverWorkspaces } from "./discovery/sta
|
|
|
8
8
|
export { loadConfig, loadCriteria } from "./discovery/config-loader.js";
|
|
9
9
|
// Defaults
|
|
10
10
|
export { buildSystemPrompt, formatProjectContext } from "./defaults/prompts.js";
|
|
11
|
-
export { DEFAULT_CRITERIA } from "./defaults/criteria.js";
|
|
11
|
+
export { DEFAULT_CRITERIA, getDimensions, formatDimensionsBlock } from "./defaults/criteria.js";
|
|
12
|
+
export { detectProjectType } from "./defaults/project-type.js";
|
|
12
13
|
// Dashboard
|
|
13
14
|
export { DashboardServer } from "./dashboard/server.js";
|
|
14
15
|
//# sourceMappingURL=index.js.map
|
package/dist/index.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO;AACP,OAAO,EAAE,OAAO,EAAE,MAAM,wBAAwB,CAAC;AAEjD,OAAO,EAAE,cAAc,EAAE,MAAM,2BAA2B,CAAC;AAE3D,OAAO,EAAE,YAAY,EAAE,MAAM,yBAAyB,CAAC;AAEvD,YAAY;AACZ,OAAO,EAAE,mBAAmB,EAAE,MAAM,gCAAgC,CAAC;AACrE,OAAO,EAAE,WAAW,EAAE,cAAc,EAAE,kBAAkB,EAAE,MAAM,+BAA+B,CAAC;AAChG,OAAO,EAAE,UAAU,EAAE,YAAY,EAAE,MAAM,8BAA8B,CAAC;AAExE,WAAW;AACX,OAAO,EAAE,iBAAiB,EAAE,oBAAoB,EAAE,MAAM,uBAAuB,CAAC;AAChF,OAAO,EAAE,gBAAgB,EAAE,MAAM,wBAAwB,CAAC;
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO;AACP,OAAO,EAAE,OAAO,EAAE,MAAM,wBAAwB,CAAC;AAEjD,OAAO,EAAE,cAAc,EAAE,MAAM,2BAA2B,CAAC;AAE3D,OAAO,EAAE,YAAY,EAAE,MAAM,yBAAyB,CAAC;AAEvD,YAAY;AACZ,OAAO,EAAE,mBAAmB,EAAE,MAAM,gCAAgC,CAAC;AACrE,OAAO,EAAE,WAAW,EAAE,cAAc,EAAE,kBAAkB,EAAE,MAAM,+BAA+B,CAAC;AAChG,OAAO,EAAE,UAAU,EAAE,YAAY,EAAE,MAAM,8BAA8B,CAAC;AAExE,WAAW;AACX,OAAO,EAAE,iBAAiB,EAAE,oBAAoB,EAAE,MAAM,uBAAuB,CAAC;AAChF,OAAO,EAAE,gBAAgB,EAAE,aAAa,EAAE,qBAAqB,EAAE,MAAM,wBAAwB,CAAC;AAChG,OAAO,EAAE,iBAAiB,EAAE,MAAM,4BAA4B,CAAC;AAE/D,YAAY;AACZ,OAAO,EAAE,eAAe,EAAE,MAAM,uBAAuB,CAAC"}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "agents-harness",
|
|
3
|
-
"version": "0.3.
|
|
3
|
+
"version": "0.3.2",
|
|
4
4
|
"description": "Multi-agent orchestrator for autonomous software development",
|
|
5
5
|
"main": "dist/index.js",
|
|
6
6
|
"types": "dist/index.d.ts",
|
|
@@ -8,7 +8,7 @@
|
|
|
8
8
|
"agents-harness": "dist/cli.js"
|
|
9
9
|
},
|
|
10
10
|
"scripts": {
|
|
11
|
-
"build": "tsc && cp -r src/dashboard/static dist/dashboard/static",
|
|
11
|
+
"build": "tsc && rm -rf dist/dashboard/static && cp -r src/dashboard/static dist/dashboard/static",
|
|
12
12
|
"dev": "tsc --watch",
|
|
13
13
|
"test": "vitest run",
|
|
14
14
|
"test:watch": "vitest",
|
|
File without changes
|