agent-bober 0.5.1 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. package/README.md +13 -7
  2. package/agents/bober-evaluator.md +62 -54
  3. package/agents/bober-generator.md +4 -0
  4. package/dist/contracts/eval-result.d.ts +339 -0
  5. package/dist/contracts/eval-result.d.ts.map +1 -1
  6. package/dist/contracts/eval-result.js +36 -0
  7. package/dist/contracts/eval-result.js.map +1 -1
  8. package/dist/evaluators/builtin/playwright.d.ts.map +1 -1
  9. package/dist/evaluators/builtin/playwright.js +50 -15
  10. package/dist/evaluators/builtin/playwright.js.map +1 -1
  11. package/dist/index.d.ts +5 -1
  12. package/dist/index.d.ts.map +1 -1
  13. package/dist/index.js +4 -0
  14. package/dist/index.js.map +1 -1
  15. package/dist/orchestrator/agent-loader.d.ts +26 -0
  16. package/dist/orchestrator/agent-loader.d.ts.map +1 -0
  17. package/dist/orchestrator/agent-loader.js +125 -0
  18. package/dist/orchestrator/agent-loader.js.map +1 -0
  19. package/dist/orchestrator/agentic-loop.d.ts +53 -0
  20. package/dist/orchestrator/agentic-loop.d.ts.map +1 -0
  21. package/dist/orchestrator/agentic-loop.js +145 -0
  22. package/dist/orchestrator/agentic-loop.js.map +1 -0
  23. package/dist/orchestrator/evaluator-agent.d.ts +4 -1
  24. package/dist/orchestrator/evaluator-agent.d.ts.map +1 -1
  25. package/dist/orchestrator/evaluator-agent.js +107 -84
  26. package/dist/orchestrator/evaluator-agent.js.map +1 -1
  27. package/dist/orchestrator/generator-agent.d.ts +14 -2
  28. package/dist/orchestrator/generator-agent.d.ts.map +1 -1
  29. package/dist/orchestrator/generator-agent.js +96 -73
  30. package/dist/orchestrator/generator-agent.js.map +1 -1
  31. package/dist/orchestrator/model-resolver.d.ts +9 -0
  32. package/dist/orchestrator/model-resolver.d.ts.map +1 -0
  33. package/dist/orchestrator/model-resolver.js +21 -0
  34. package/dist/orchestrator/model-resolver.js.map +1 -0
  35. package/dist/orchestrator/pipeline.d.ts.map +1 -1
  36. package/dist/orchestrator/pipeline.js +21 -4
  37. package/dist/orchestrator/pipeline.js.map +1 -1
  38. package/dist/orchestrator/planner-agent.d.ts +3 -2
  39. package/dist/orchestrator/planner-agent.d.ts.map +1 -1
  40. package/dist/orchestrator/planner-agent.js +39 -75
  41. package/dist/orchestrator/planner-agent.js.map +1 -1
  42. package/dist/orchestrator/tools/handlers.d.ts +9 -0
  43. package/dist/orchestrator/tools/handlers.d.ts.map +1 -0
  44. package/dist/orchestrator/tools/handlers.js +279 -0
  45. package/dist/orchestrator/tools/handlers.js.map +1 -0
  46. package/dist/orchestrator/tools/index.d.ts +21 -0
  47. package/dist/orchestrator/tools/index.d.ts.map +1 -0
  48. package/dist/orchestrator/tools/index.js +33 -0
  49. package/dist/orchestrator/tools/index.js.map +1 -0
  50. package/dist/orchestrator/tools/schemas.d.ts +16 -0
  51. package/dist/orchestrator/tools/schemas.d.ts.map +1 -0
  52. package/dist/orchestrator/tools/schemas.js +138 -0
  53. package/dist/orchestrator/tools/schemas.js.map +1 -0
  54. package/package.json +1 -1
  55. package/templates/presets/nextjs/bober.config.json +1 -1
  56. package/templates/presets/react-vite/bober.config.json +1 -1
package/README.md CHANGED
@@ -451,15 +451,20 @@ To debug failing E2E tests:
451
451
 
452
452
  This architecture implements the patterns described in Anthropic's [**"Harness design for long-running application development"**](https://www.anthropic.com/engineering/harness-design-long-running-apps) by Prithvi Rajasekaran. The key insight from that research: separating code generation from code evaluation creates a feedback loop that catches errors early and dramatically improves output quality. In their tests, a solo agent produced broken output in 20 minutes, while the full harness produced a polished, working application — demonstrating that multi-agent orchestration with honest evaluation is worth the investment.
453
453
 
454
- - **Planner** (Claude Opus): High-reasoning model for decomposing complex features into clear, testable sprint contracts. Thinks about scope, dependencies, and risk.
455
- - **Generator** (Claude Sonnet): Fast, capable model for writing code. Works within the boundaries of a single sprint contract.
456
- - **Evaluator** (Claude Sonnet): Runs automated checks (typecheck, lint, build, tests) and provides structured feedback. If a sprint fails evaluation, the Generator gets specific rework instructions.
454
+ ### Agentic Tool-Use Architecture
455
+
456
+ Each agent runs as a **multi-turn agentic loop** with tool access via the Anthropic SDK. System prompts are loaded from the detailed agent definitions in `agents/bober-*.md` (300-600 lines of role-specific instructions, anti-leniency protocols, and evaluation criteria).
457
+
458
+ - **Planner** (Claude Opus): Explores the codebase via read-only tools (`read_file`, `glob`, `grep`), then produces sprint-decomposed plans. Thinks about scope, dependencies, and risk.
459
+ - **Generator** (Claude Sonnet): Full tool access (`bash`, `read_file`, `write_file`, `edit_file`, `glob`, `grep`). Reads existing code, writes implementation, runs tests, and commits — all autonomously within the sprint contract boundaries.
460
+ - **Evaluator** (Claude Sonnet): Read-only + bash tools (`bash`, `read_file`, `glob`, `grep` — deliberately NO write/edit). Independently verifies by running the dev server, taking Playwright screenshots, executing tests, and inspecting code. Cannot fix bugs — only report them with precise feedback.
457
461
 
458
462
  The separation ensures that:
459
- 1. The Generator cannot "mark its own homework" -- an independent evaluation step catches issues.
463
+ 1. The Generator cannot "mark its own homework" — an independent evaluation step with its own tool access catches issues through actual runtime verification, not just reading the generator's self-report.
460
464
  2. Sprint contracts provide clear scope boundaries, preventing feature creep.
461
- 3. Automated checks run after every sprint, not just at the end.
465
+ 3. Automated checks (programmatic evaluators) and agent-based qualitative evaluation run after every sprint.
462
466
  4. Context resets between sprints keep the Generator focused and prevent context degradation.
467
+ 5. The Evaluator's anti-leniency protocol ensures passing on the first iteration is rare for non-trivial work.
463
468
 
464
469
  ### State Management
465
470
 
@@ -522,10 +527,11 @@ agent-bober/
522
527
  config/ Config schema, loader, defaults
523
528
  contracts/ Sprint contract and eval result types
524
529
  evaluators/ Built-in evaluator plugins
525
- orchestrator/ Context handoff and agent coordination
530
+ orchestrator/ Agent runners, agentic loop, tool infrastructure
531
+ tools/ Tool schemas, sandboxed handlers, role-based sets
526
532
  state/ State management for .bober/ directory
527
533
  utils/ Shared utilities
528
- agents/ Agent system prompts (.md files)
534
+ agents/ Agent system prompts (.md files, loaded at runtime)
529
535
  skills/ Claude Code slash command definitions
530
536
  templates/ Project templates and scaffolds
531
537
  hooks/ Claude Code hooks
package/agents/bober-evaluator.md CHANGED
@@ -88,9 +88,59 @@ Read these documents in order:
88
88
 
89
89
  Build a checklist from the contract's `successCriteria` array. This is your evaluation framework. Every criterion gets tested independently.
90
90
 
91
- ### Step 2: Run Configured Evaluation Strategies
91
+ ### Step 2: Live Page Evaluation (for frontend/UI projects)
92
92
 
93
- Read `evaluator.strategies` from `bober.config.json`. Execute each configured strategy in order.
93
+ **Before running ANY automated strategy**, if this sprint involves UI/frontend changes, you MUST interact with the live page. This is NOT optional. This is the FIRST thing you do.
94
+
95
+ **2a. Start the dev server:**
96
+ ```bash
97
+ npm run dev &
98
+ DEV_PID=$!
99
+ sleep 8
100
+ ```
101
+
102
+ **2b. Screenshot and study the page:**
103
+ ```bash
104
+ npx playwright screenshot http://localhost:3000 /tmp/bober-eval-home.png --full-page 2>&1
105
+ ```
106
+ Screenshot additional routes relevant to this sprint. READ every screenshot — you are multimodal, you can see images.
107
+
108
+ **2c. Score against the four design criteria.**
109
+
110
+ Study each screenshot carefully, then score each criterion 0-100. Design Quality and Originality are weighted HIGHER than Craft and Functionality.
111
+
112
+ **Design Quality** (Weight: High) — Does the design feel like a coherent whole? Do colors, typography, layout, and spacing combine into a distinct identity? Or does it look like random parts assembled together?
113
+ - Failing: mismatched card styles, no visual hierarchy, arbitrary colors, assembled-from-parts feeling
114
+ - Passing: consistent visual language, clear mood, intentional color palette, unified system
115
+
116
+ **Originality** (Weight: High) — Are there deliberate creative choices? Or is this default templates and AI-generated patterns?
117
+ - Automatic fail: unmodified Tailwind/Bootstrap defaults, purple/blue gradients over white cards, generic centered hero + CTA, stock component layouts
118
+ - Passing: custom color choices, distinctive layout decisions, typography personality, visual elements a human designer would recognize as intentional
119
+
120
+ **Craft** (Weight: Medium) — Technical execution: type hierarchy (distinct h1/h2/h3/body sizes), spacing consistency (using a scale, not random pixels), color contrast (WCAG AA), visual consistency across components.
121
+
122
+ **Functionality** (Weight: Medium) — Can users find primary actions? Are interactive elements obvious? Are loading/error/empty states handled?
123
+
124
+ **Scoring:**
125
+ - Generic but functional: 40-55 (FAIL for UI-focused sprints)
126
+ - Has originality but minor issues: 65-80 (PASS with notes)
127
+ - Cohesive, original, well-crafted, functional: 80-95 (PASS)
128
+ - Reserve 95-100 for exceptional work — almost never award this
129
+
130
+ **If the combined weighted score is below 65, the sprint FAILS** with specific feedback on what to improve. Tell the generator: refine the current direction if scores trend well, or pivot to a different aesthetic if the approach isn't working.
131
+
132
+ **2d. Check for visual bugs:**
133
+ - Blank areas or broken layouts
134
+ - Text overflow or overlapping elements
135
+ - Missing images or broken SVGs
136
+ - Sections not matching success criteria descriptions
137
+ - Mobile responsiveness (if criteria require it, screenshot at 375px too)
138
+
139
+ **Do NOT kill the dev server** — Playwright tests need it in Step 3.
140
+
141
+ ### Step 3: Run Configured Evaluation Strategies
142
+
143
+ Read `evaluator.strategies` from `bober.config.json`. Execute each configured strategy in order. **The dev server should still be running from Step 2.**
94
144
 
95
145
  **For each strategy, record:**
96
146
  - Strategy type
@@ -99,6 +149,11 @@ Read `evaluator.strategies` from `bober.config.json`. Execute each configured st
99
149
  - Pass/fail determination
100
150
  - Whether this strategy is `required` (blocking) or optional
101
151
 
152
+ **After all strategies are done, kill the dev server:**
153
+ ```bash
154
+ kill $DEV_PID 2>/dev/null
155
+ ```
156
+
102
157
  **Strategy execution:**
103
158
 
104
159
  #### `typecheck`
@@ -187,7 +242,7 @@ This strategy requires careful execution:
187
242
  - Execute the custom command specified
188
243
  - Interpret output based on the strategy's config
189
244
 
190
- ### Step 3: Verify Success Criteria
245
+ ### Step 4: Verify Success Criteria
191
246
 
192
247
  Go through EVERY success criterion in the contract, one by one. For each:
193
248
 
@@ -202,7 +257,7 @@ Go through EVERY success criterion in the contract, one by one. For each:
202
257
  - A criterion with `required: false` is recorded but does not block the sprint
203
258
  - If a criterion's `verificationMethod` cannot be executed (e.g., Playwright not set up), mark it as `"skipped"` with a clear reason. If it was `required`, escalate this as a configuration issue.
204
259
 
205
- ### Step 4: Check Principles Adherence
260
+ ### Step 5: Check Principles Adherence
206
261
 
207
262
  If `.bober/principles.md` exists, verify the Generator's output adheres to the project principles:
208
263
 
@@ -212,7 +267,7 @@ If `.bober/principles.md` exists, verify the Generator's output adheres to the p
212
267
 
213
268
  Principle violations should be reported in the `generatorFeedback` array with `category: "quality"` and a reference to the specific principle that was violated.
214
269
 
215
- ### Step 5: Check for Regressions
270
+ ### Step 6: Check for Regressions
216
271
 
217
272
  Beyond the contract's criteria, check for regressions:
218
273
 
@@ -220,7 +275,7 @@ Beyond the contract's criteria, check for regressions:
220
275
  2. **Does the build still work?** Even if the contract is about backend code, verify the full build.
221
276
  3. **Were any existing files modified in unexpected ways?** Use `git diff` to review all changes. Flag any changes to files NOT mentioned in the contract's `estimatedFiles`.
222
277
 
223
- ### Step 6: Produce Structured EvalResult
278
+ ### Step 7: Produce Structured EvalResult
224
279
 
225
280
  Generate the following JSON structure:
226
281
 
@@ -282,7 +337,7 @@ Generate the following JSON structure:
282
337
  }
283
338
  ```
284
339
 
285
- ### Step 7: Save and Report
340
+ ### Step 8: Save and Report
286
341
 
287
342
  1. **Save the EvalResult** to `.bober/eval-results/<evalId>.json`
288
343
  - IMPORTANT: You do not have Write tools. Output the EvalResult JSON and the orchestrator will save it.
@@ -471,53 +526,6 @@ If `playwright` is in the configured evaluation strategies:
471
526
  ```
472
527
  New interactive elements without `data-testid` = quality failure with feedback to add them.
473
528
 
474
- ## Design & UI Evaluation Criteria
475
-
476
- When the sprint involves UI/frontend work, evaluate against these four criteria in addition to functional correctness. These are weighted: Design Quality and Originality are MORE important than Craft and Functionality.
477
-
478
- ### 1. Design Quality (Weight: High)
479
- Does the design feel like a coherent whole rather than a collection of parts? Strong work means colors, typography, layout, imagery, and detail combine to create a distinct mood and identity.
480
-
481
- **Failing signals:**
482
- - Multiple visual "languages" on the same page (mismatched card styles, inconsistent button treatments)
483
- - No clear visual hierarchy — everything competes for attention
484
- - Colors that feel arbitrary rather than curated
485
- - Layout that feels assembled from parts rather than designed as a system
486
-
487
- ### 2. Originality (Weight: High)
488
- Is there evidence of custom decisions, or is this template layouts, library defaults, and AI-generated patterns? A human designer should recognize deliberate creative choices.
489
-
490
- **Automatic failures:**
491
- - Unmodified Tailwind/Bootstrap/Material UI defaults with no customization
492
- - Purple/blue gradients over white cards (the #1 telltale AI pattern)
493
- - Generic hero sections with centered text and a CTA button
494
- - Stock component library layouts with only color changes
495
- - Any pattern you've seen five times before — if it's generic, it fails
496
-
497
- ### 3. Craft (Weight: Medium)
498
- Technical execution: typography hierarchy, spacing consistency, color harmony, contrast ratios. This is a competence check.
499
-
500
- **Check specifically:**
501
- - Is there a clear type scale (distinct sizes for h1/h2/h3/body/caption)?
502
- - Is spacing consistent (using a scale like 4/8/16/24/32/48, not random pixels)?
503
- - Do colors have sufficient contrast for accessibility (WCAG AA minimum)?
504
- - Are interactive elements visually consistent (all buttons look like they belong together)?
505
-
506
- ### 4. Functionality (Weight: Medium)
507
- Can users understand what the interface does, find primary actions, and complete tasks without guessing?
508
-
509
- **Check specifically:**
510
- - Are primary actions visually prominent?
511
- - Do interactive elements have clear hover/focus/active states?
512
- - Are loading, error, and empty states handled?
513
- - Is the layout responsive (or at least not broken) at common viewport widths?
514
-
515
- ### Scoring UI Work
516
- - A design that is technically correct but visually generic scores LOW (40-55)
517
- - A design with originality and craft but minor functional issues scores MEDIUM-HIGH (65-80)
518
- - A design that is cohesive, original, well-crafted, AND functional scores HIGH (80-95)
519
- - Reserve 95-100 for genuinely exceptional work — you should almost never award this
520
-
521
529
  ## Code Quality Evaluation
522
530
 
523
531
  Beyond functional correctness, evaluate code quality ruthlessly:
package/agents/bober-generator.md CHANGED
@@ -399,3 +399,7 @@ When implementing user interfaces, your work will be graded on four criteria. Yo
399
399
  4. **Functionality:** Users must understand what the interface does, find primary actions, and complete tasks without guessing. Interactive elements must have clear affordances. Loading states, error states, and empty states must all be handled.
400
400
 
401
401
  Do NOT produce "safe" designs that technically satisfy requirements but lack any personality. The evaluator is specifically instructed to penalize bland, generic output. Take aesthetic risks. Make deliberate choices about color, typography, layout, and motion.
402
+
403
+ **On rework iterations:** When you receive evaluator feedback on design, make a strategic decision:
404
+ - If design scores are trending upward (65+), **refine** the current direction — improve what's working
405
+ - If design scores are low or stagnant (<65), **pivot** to a fundamentally different aesthetic — new color palette, different layout approach, different visual personality. Don't polish something that isn't working.
package/dist/contracts/eval-result.d.ts CHANGED
@@ -24,6 +24,69 @@ export declare const EvalDetailSchema: z.ZodObject<{
24
24
  line?: number | undefined;
25
25
  }>;
26
26
  export type EvalDetail = z.infer<typeof EvalDetailSchema>;
27
+ export declare const CriterionResultSchema: z.ZodObject<{
28
+ criterionId: z.ZodString;
29
+ description: z.ZodString;
30
+ required: z.ZodBoolean;
31
+ result: z.ZodEnum<["pass", "fail", "skipped"]>;
32
+ evidence: z.ZodOptional<z.ZodString>;
33
+ feedback: z.ZodOptional<z.ZodString>;
34
+ }, "strip", z.ZodTypeAny, {
35
+ description: string;
36
+ required: boolean;
37
+ criterionId: string;
38
+ result: "pass" | "fail" | "skipped";
39
+ feedback?: string | undefined;
40
+ evidence?: string | undefined;
41
+ }, {
42
+ description: string;
43
+ required: boolean;
44
+ criterionId: string;
45
+ result: "pass" | "fail" | "skipped";
46
+ feedback?: string | undefined;
47
+ evidence?: string | undefined;
48
+ }>;
49
+ export type CriterionResult = z.infer<typeof CriterionResultSchema>;
50
+ export declare const RegressionSchema: z.ZodObject<{
51
+ description: z.ZodString;
52
+ evidence: z.ZodString;
53
+ severity: z.ZodEnum<["critical", "major", "minor"]>;
54
+ }, "strip", z.ZodTypeAny, {
55
+ description: string;
56
+ severity: "critical" | "major" | "minor";
57
+ evidence: string;
58
+ }, {
59
+ description: string;
60
+ severity: "critical" | "major" | "minor";
61
+ evidence: string;
62
+ }>;
63
+ export type Regression = z.infer<typeof RegressionSchema>;
64
+ export declare const GeneratorFeedbackItemSchema: z.ZodObject<{
65
+ priority: z.ZodEnum<["critical", "high", "medium", "low"]>;
66
+ category: z.ZodEnum<["bug", "missing-feature", "regression", "quality", "performance"]>;
67
+ file: z.ZodOptional<z.ZodString>;
68
+ line: z.ZodOptional<z.ZodNumber>;
69
+ description: z.ZodString;
70
+ expected: z.ZodOptional<z.ZodString>;
71
+ reproduction: z.ZodOptional<z.ZodString>;
72
+ }, "strip", z.ZodTypeAny, {
73
+ description: string;
74
+ priority: "medium" | "critical" | "high" | "low";
75
+ category: "bug" | "missing-feature" | "regression" | "quality" | "performance";
76
+ expected?: string | undefined;
77
+ file?: string | undefined;
78
+ line?: number | undefined;
79
+ reproduction?: string | undefined;
80
+ }, {
81
+ description: string;
82
+ priority: "medium" | "critical" | "high" | "low";
83
+ category: "bug" | "missing-feature" | "regression" | "quality" | "performance";
84
+ expected?: string | undefined;
85
+ file?: string | undefined;
86
+ line?: number | undefined;
87
+ reproduction?: string | undefined;
88
+ }>;
89
+ export type GeneratorFeedbackItem = z.infer<typeof GeneratorFeedbackItemSchema>;
27
90
  export declare const EvalResultSchema: z.ZodObject<{
28
91
  evaluator: z.ZodString;
29
92
  passed: z.ZodBoolean;
@@ -53,6 +116,69 @@ export declare const EvalResultSchema: z.ZodObject<{
53
116
  summary: z.ZodString;
54
117
  feedback: z.ZodString;
55
118
  timestamp: z.ZodString;
119
+ iteration: z.ZodOptional<z.ZodNumber>;
120
+ contractId: z.ZodOptional<z.ZodString>;
121
+ criteriaResults: z.ZodOptional<z.ZodArray<z.ZodObject<{
122
+ criterionId: z.ZodString;
123
+ description: z.ZodString;
124
+ required: z.ZodBoolean;
125
+ result: z.ZodEnum<["pass", "fail", "skipped"]>;
126
+ evidence: z.ZodOptional<z.ZodString>;
127
+ feedback: z.ZodOptional<z.ZodString>;
128
+ }, "strip", z.ZodTypeAny, {
129
+ description: string;
130
+ required: boolean;
131
+ criterionId: string;
132
+ result: "pass" | "fail" | "skipped";
133
+ feedback?: string | undefined;
134
+ evidence?: string | undefined;
135
+ }, {
136
+ description: string;
137
+ required: boolean;
138
+ criterionId: string;
139
+ result: "pass" | "fail" | "skipped";
140
+ feedback?: string | undefined;
141
+ evidence?: string | undefined;
142
+ }>, "many">>;
143
+ regressions: z.ZodOptional<z.ZodArray<z.ZodObject<{
144
+ description: z.ZodString;
145
+ evidence: z.ZodString;
146
+ severity: z.ZodEnum<["critical", "major", "minor"]>;
147
+ }, "strip", z.ZodTypeAny, {
148
+ description: string;
149
+ severity: "critical" | "major" | "minor";
150
+ evidence: string;
151
+ }, {
152
+ description: string;
153
+ severity: "critical" | "major" | "minor";
154
+ evidence: string;
155
+ }>, "many">>;
156
+ designScore: z.ZodOptional<z.ZodNumber>;
157
+ generatorFeedback: z.ZodOptional<z.ZodArray<z.ZodObject<{
158
+ priority: z.ZodEnum<["critical", "high", "medium", "low"]>;
159
+ category: z.ZodEnum<["bug", "missing-feature", "regression", "quality", "performance"]>;
160
+ file: z.ZodOptional<z.ZodString>;
161
+ line: z.ZodOptional<z.ZodNumber>;
162
+ description: z.ZodString;
163
+ expected: z.ZodOptional<z.ZodString>;
164
+ reproduction: z.ZodOptional<z.ZodString>;
165
+ }, "strip", z.ZodTypeAny, {
166
+ description: string;
167
+ priority: "medium" | "critical" | "high" | "low";
168
+ category: "bug" | "missing-feature" | "regression" | "quality" | "performance";
169
+ expected?: string | undefined;
170
+ file?: string | undefined;
171
+ line?: number | undefined;
172
+ reproduction?: string | undefined;
173
+ }, {
174
+ description: string;
175
+ priority: "medium" | "critical" | "high" | "low";
176
+ category: "bug" | "missing-feature" | "regression" | "quality" | "performance";
177
+ expected?: string | undefined;
178
+ file?: string | undefined;
179
+ line?: number | undefined;
180
+ reproduction?: string | undefined;
181
+ }>, "many">>;
56
182
  }, "strip", z.ZodTypeAny, {
57
183
  evaluator: string;
58
184
  passed: boolean;
@@ -68,6 +194,31 @@ export declare const EvalResultSchema: z.ZodObject<{
68
194
  summary: string;
69
195
  feedback: string;
70
196
  score?: number | undefined;
197
+ iteration?: number | undefined;
198
+ contractId?: string | undefined;
199
+ criteriaResults?: {
200
+ description: string;
201
+ required: boolean;
202
+ criterionId: string;
203
+ result: "pass" | "fail" | "skipped";
204
+ feedback?: string | undefined;
205
+ evidence?: string | undefined;
206
+ }[] | undefined;
207
+ regressions?: {
208
+ description: string;
209
+ severity: "critical" | "major" | "minor";
210
+ evidence: string;
211
+ }[] | undefined;
212
+ designScore?: number | undefined;
213
+ generatorFeedback?: {
214
+ description: string;
215
+ priority: "medium" | "critical" | "high" | "low";
216
+ category: "bug" | "missing-feature" | "regression" | "quality" | "performance";
217
+ expected?: string | undefined;
218
+ file?: string | undefined;
219
+ line?: number | undefined;
220
+ reproduction?: string | undefined;
221
+ }[] | undefined;
71
222
  }, {
72
223
  evaluator: string;
73
224
  passed: boolean;
@@ -83,6 +234,31 @@ export declare const EvalResultSchema: z.ZodObject<{
83
234
  summary: string;
84
235
  feedback: string;
85
236
  score?: number | undefined;
237
+ iteration?: number | undefined;
238
+ contractId?: string | undefined;
239
+ criteriaResults?: {
240
+ description: string;
241
+ required: boolean;
242
+ criterionId: string;
243
+ result: "pass" | "fail" | "skipped";
244
+ feedback?: string | undefined;
245
+ evidence?: string | undefined;
246
+ }[] | undefined;
247
+ regressions?: {
248
+ description: string;
249
+ severity: "critical" | "major" | "minor";
250
+ evidence: string;
251
+ }[] | undefined;
252
+ designScore?: number | undefined;
253
+ generatorFeedback?: {
254
+ description: string;
255
+ priority: "medium" | "critical" | "high" | "low";
256
+ category: "bug" | "missing-feature" | "regression" | "quality" | "performance";
257
+ expected?: string | undefined;
258
+ file?: string | undefined;
259
+ line?: number | undefined;
260
+ reproduction?: string | undefined;
261
+ }[] | undefined;
86
262
  }>;
87
263
  export type EvalResult = z.infer<typeof EvalResultSchema>;
88
264
  export declare const SprintEvaluationSchema: z.ZodObject<{
@@ -117,6 +293,69 @@ export declare const SprintEvaluationSchema: z.ZodObject<{
117
293
  summary: z.ZodString;
118
294
  feedback: z.ZodString;
119
295
  timestamp: z.ZodString;
296
+ iteration: z.ZodOptional<z.ZodNumber>;
297
+ contractId: z.ZodOptional<z.ZodString>;
298
+ criteriaResults: z.ZodOptional<z.ZodArray<z.ZodObject<{
299
+ criterionId: z.ZodString;
300
+ description: z.ZodString;
301
+ required: z.ZodBoolean;
302
+ result: z.ZodEnum<["pass", "fail", "skipped"]>;
303
+ evidence: z.ZodOptional<z.ZodString>;
304
+ feedback: z.ZodOptional<z.ZodString>;
305
+ }, "strip", z.ZodTypeAny, {
306
+ description: string;
307
+ required: boolean;
308
+ criterionId: string;
309
+ result: "pass" | "fail" | "skipped";
310
+ feedback?: string | undefined;
311
+ evidence?: string | undefined;
312
+ }, {
313
+ description: string;
314
+ required: boolean;
315
+ criterionId: string;
316
+ result: "pass" | "fail" | "skipped";
317
+ feedback?: string | undefined;
318
+ evidence?: string | undefined;
319
+ }>, "many">>;
320
+ regressions: z.ZodOptional<z.ZodArray<z.ZodObject<{
321
+ description: z.ZodString;
322
+ evidence: z.ZodString;
323
+ severity: z.ZodEnum<["critical", "major", "minor"]>;
324
+ }, "strip", z.ZodTypeAny, {
325
+ description: string;
326
+ severity: "critical" | "major" | "minor";
327
+ evidence: string;
328
+ }, {
329
+ description: string;
330
+ severity: "critical" | "major" | "minor";
331
+ evidence: string;
332
+ }>, "many">>;
333
+ designScore: z.ZodOptional<z.ZodNumber>;
334
+ generatorFeedback: z.ZodOptional<z.ZodArray<z.ZodObject<{
335
+ priority: z.ZodEnum<["critical", "high", "medium", "low"]>;
336
+ category: z.ZodEnum<["bug", "missing-feature", "regression", "quality", "performance"]>;
337
+ file: z.ZodOptional<z.ZodString>;
338
+ line: z.ZodOptional<z.ZodNumber>;
339
+ description: z.ZodString;
340
+ expected: z.ZodOptional<z.ZodString>;
341
+ reproduction: z.ZodOptional<z.ZodString>;
342
+ }, "strip", z.ZodTypeAny, {
343
+ description: string;
344
+ priority: "medium" | "critical" | "high" | "low";
345
+ category: "bug" | "missing-feature" | "regression" | "quality" | "performance";
346
+ expected?: string | undefined;
347
+ file?: string | undefined;
348
+ line?: number | undefined;
349
+ reproduction?: string | undefined;
350
+ }, {
351
+ description: string;
352
+ priority: "medium" | "critical" | "high" | "low";
353
+ category: "bug" | "missing-feature" | "regression" | "quality" | "performance";
354
+ expected?: string | undefined;
355
+ file?: string | undefined;
356
+ line?: number | undefined;
357
+ reproduction?: string | undefined;
358
+ }>, "many">>;
120
359
  }, "strip", z.ZodTypeAny, {
121
360
  evaluator: string;
122
361
  passed: boolean;
@@ -132,6 +371,31 @@ export declare const SprintEvaluationSchema: z.ZodObject<{
132
371
  summary: string;
133
372
  feedback: string;
134
373
  score?: number | undefined;
374
+ iteration?: number | undefined;
375
+ contractId?: string | undefined;
376
+ criteriaResults?: {
377
+ description: string;
378
+ required: boolean;
379
+ criterionId: string;
380
+ result: "pass" | "fail" | "skipped";
381
+ feedback?: string | undefined;
382
+ evidence?: string | undefined;
383
+ }[] | undefined;
384
+ regressions?: {
385
+ description: string;
386
+ severity: "critical" | "major" | "minor";
387
+ evidence: string;
388
+ }[] | undefined;
389
+ designScore?: number | undefined;
390
+ generatorFeedback?: {
391
+ description: string;
392
+ priority: "medium" | "critical" | "high" | "low";
393
+ category: "bug" | "missing-feature" | "regression" | "quality" | "performance";
394
+ expected?: string | undefined;
395
+ file?: string | undefined;
396
+ line?: number | undefined;
397
+ reproduction?: string | undefined;
398
+ }[] | undefined;
135
399
  }, {
136
400
  evaluator: string;
137
401
  passed: boolean;
@@ -147,6 +411,31 @@ export declare const SprintEvaluationSchema: z.ZodObject<{
147
411
  summary: string;
148
412
  feedback: string;
149
413
  score?: number | undefined;
414
+ iteration?: number | undefined;
415
+ contractId?: string | undefined;
416
+ criteriaResults?: {
417
+ description: string;
418
+ required: boolean;
419
+ criterionId: string;
420
+ result: "pass" | "fail" | "skipped";
421
+ feedback?: string | undefined;
422
+ evidence?: string | undefined;
423
+ }[] | undefined;
424
+ regressions?: {
425
+ description: string;
426
+ severity: "critical" | "major" | "minor";
427
+ evidence: string;
428
+ }[] | undefined;
429
+ designScore?: number | undefined;
430
+ generatorFeedback?: {
431
+ description: string;
432
+ priority: "medium" | "critical" | "high" | "low";
433
+ category: "bug" | "missing-feature" | "regression" | "quality" | "performance";
434
+ expected?: string | undefined;
435
+ file?: string | undefined;
436
+ line?: number | undefined;
437
+ reproduction?: string | undefined;
438
+ }[] | undefined;
150
439
  }>, "many">;
151
440
  overallPassed: z.ZodBoolean;
152
441
  aggregateFeedback: z.ZodString;
@@ -168,6 +457,31 @@ export declare const SprintEvaluationSchema: z.ZodObject<{
168
457
  summary: string;
169
458
  feedback: string;
170
459
  score?: number | undefined;
460
+ iteration?: number | undefined;
461
+ contractId?: string | undefined;
462
+ criteriaResults?: {
463
+ description: string;
464
+ required: boolean;
465
+ criterionId: string;
466
+ result: "pass" | "fail" | "skipped";
467
+ feedback?: string | undefined;
468
+ evidence?: string | undefined;
469
+ }[] | undefined;
470
+ regressions?: {
471
+ description: string;
472
+ severity: "critical" | "major" | "minor";
473
+ evidence: string;
474
+ }[] | undefined;
475
+ designScore?: number | undefined;
476
+ generatorFeedback?: {
477
+ description: string;
478
+ priority: "medium" | "critical" | "high" | "low";
479
+ category: "bug" | "missing-feature" | "regression" | "quality" | "performance";
480
+ expected?: string | undefined;
481
+ file?: string | undefined;
482
+ line?: number | undefined;
483
+ reproduction?: string | undefined;
484
+ }[] | undefined;
171
485
  }[];
172
486
  overallPassed: boolean;
173
487
  aggregateFeedback: string;
@@ -189,6 +503,31 @@ export declare const SprintEvaluationSchema: z.ZodObject<{
189
503
  summary: string;
190
504
  feedback: string;
191
505
  score?: number | undefined;
506
+ iteration?: number | undefined;
507
+ contractId?: string | undefined;
508
+ criteriaResults?: {
509
+ description: string;
510
+ required: boolean;
511
+ criterionId: string;
512
+ result: "pass" | "fail" | "skipped";
513
+ feedback?: string | undefined;
514
+ evidence?: string | undefined;
515
+ }[] | undefined;
516
+ regressions?: {
517
+ description: string;
518
+ severity: "critical" | "major" | "minor";
519
+ evidence: string;
520
+ }[] | undefined;
521
+ designScore?: number | undefined;
522
+ generatorFeedback?: {
523
+ description: string;
524
+ priority: "medium" | "critical" | "high" | "low";
525
+ category: "bug" | "missing-feature" | "regression" | "quality" | "performance";
526
+ expected?: string | undefined;
527
+ file?: string | undefined;
528
+ line?: number | undefined;
529
+ reproduction?: string | undefined;
530
+ }[] | undefined;
192
531
  }[];
193
532
  overallPassed: boolean;
194
533
  aggregateFeedback: string;
package/dist/contracts/eval-result.d.ts.map CHANGED
@@ -1 +1 @@
1
- {"version":3,"file":"eval-result.d.ts","sourceRoot":"","sources":["../../src/contracts/eval-result.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAIxB,eAAO,MAAM,cAAc,yCAAuC,CAAC;AACnE,MAAM,MAAM,QAAQ,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,cAAc,CAAC,CAAC;AAItD,eAAO,MAAM,gBAAgB;;;;;;;;;;;;;;;;;;;;;EAO3B,CAAC;AACH,MAAM,MAAM,UAAU,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,gBAAgB,CAAC,CAAC;AAI1D,eAAO,MAAM,gBAAgB;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;EAQ3B,CAAC;AACH,MAAM,MAAM,UAAU,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,gBAAgB,CAAC,CAAC;AAI1D,eAAO,MAAM,sBAAsB;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;EAMjC,CAAC;AACH,MAAM,MAAM,gBAAgB,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,sBAAsB,CAAC,CAAC;AAItE;;GAEG;AACH,wBAAgB,gBAAgB,CAC9B,QAAQ,EAAE,MAAM,EAChB,KAAK,EAAE,MAAM,EACb,OAAO,EAAE,UAAU,EAAE,GACpB,gBAAgB,CAsBlB;AAED;;GAEG;AACH,wBAAgB,cAAc,CAAC,UAAU,EAAE,gBAAgB,GAAG,MAAM,CA2CnE"}
1
+ {"version":3,"file":"eval-result.d.ts","sourceRoot":"","sources":["../../src/contracts/eval-result.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAIxB,eAAO,MAAM,cAAc,yCAAuC,CAAC;AACnE,MAAM,MAAM,QAAQ,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,cAAc,CAAC,CAAC;AAItD,eAAO,MAAM,gBAAgB;;;;;;;;;;;;;;;;;;;;;EAO3B,CAAC;AACH,MAAM,MAAM,UAAU,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,gBAAgB,CAAC,CAAC;AAI1D,eAAO,MAAM,qBAAqB;;;;;;;;;;;;;;;;;;;;;EAOhC,CAAC;AACH,MAAM,MAAM,eAAe,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,qBAAqB,CAAC,CAAC;AAEpE,eAAO,MAAM,gBAAgB;;;;;;;;;;;;EAI3B,CAAC;AACH,MAAM,MAAM,UAAU,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,gBAAgB,CAAC,CAAC;AAE1D,eAAO,MAAM,2BAA2B;;;;;;;;;;;;;;;;;;;;;;;;EActC,CAAC;AACH,MAAM,MAAM,qBAAqB,GAAG,CAAC,CAAC,KAAK,CACzC,OAAO,2BAA2B,CACnC,CAAC;AAIF,eAAO,MAAM,gBAAgB;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;EAe3B,CAAC;AACH,MAAM,MAAM,UAAU,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,gBAAgB,CAAC,CAAC;AAI1D,eAAO,MAAM,sBAAsB;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;EAMjC,CAAC;AACH,MAAM,MAAM,gBAAgB,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,sBAAsB,CAAC,CAAC;AAItE;;GAEG;AACH,wBAAgB,gBAAgB,CAC9B,QAAQ,EAAE,MAAM,EAChB,KAAK,EAAE,MAAM,EACb,OAAO,EAAE,UAAU,EAAE,GACpB,gBAAgB,CAsBlB;AAED;;GAEG;AACH,wBAAgB,cAAc,CAAC,UAAU,EAAE,gBAAgB,GAAG,MAAM,CA2CnE"}