@helmiq/crew 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (209) hide show
  1. package/defaults/personas/architect.persona.yaml +72 -0
  2. package/defaults/personas/engineer.persona.yaml +137 -0
  3. package/defaults/personas/persona-spec.schema.yaml +149 -0
  4. package/defaults/personas/reviewer.persona.yaml +47 -0
  5. package/defaults/rubrics/adr.rubric.yaml +48 -0
  6. package/defaults/rubrics/code-review.rubric.yaml +39 -0
  7. package/defaults/rubrics/pull-request.rubric.yaml +40 -0
  8. package/dist/actions/actions.test.d.ts +2 -0
  9. package/dist/actions/actions.test.d.ts.map +1 -0
  10. package/dist/actions/actions.test.js +158 -0
  11. package/dist/actions/direct-dispatcher.d.ts +10 -0
  12. package/dist/actions/direct-dispatcher.d.ts.map +1 -0
  13. package/dist/actions/direct-dispatcher.js +27 -0
  14. package/dist/actions/dispatcher.d.ts +11 -0
  15. package/dist/actions/dispatcher.d.ts.map +1 -0
  16. package/dist/actions/dispatcher.js +1 -0
  17. package/dist/actions/index.d.ts +7 -0
  18. package/dist/actions/index.d.ts.map +1 -0
  19. package/dist/actions/index.js +3 -0
  20. package/dist/actions/registry.d.ts +13 -0
  21. package/dist/actions/registry.d.ts.map +1 -0
  22. package/dist/actions/registry.js +40 -0
  23. package/dist/actions/resolver.d.ts +47 -0
  24. package/dist/actions/resolver.d.ts.map +1 -0
  25. package/dist/actions/resolver.js +43 -0
  26. package/dist/cli/cli.test.d.ts +2 -0
  27. package/dist/cli/cli.test.d.ts.map +1 -0
  28. package/dist/cli/cli.test.js +392 -0
  29. package/dist/cli/run.d.ts +45 -0
  30. package/dist/cli/run.d.ts.map +1 -0
  31. package/dist/cli/run.js +236 -0
  32. package/dist/common/errors.d.ts +76 -0
  33. package/dist/common/errors.d.ts.map +1 -0
  34. package/dist/common/errors.js +74 -0
  35. package/dist/config/config.test.d.ts +2 -0
  36. package/dist/config/config.test.d.ts.map +1 -0
  37. package/dist/config/config.test.js +691 -0
  38. package/dist/config/index.d.ts +7 -0
  39. package/dist/config/index.d.ts.map +1 -0
  40. package/dist/config/index.js +4 -0
  41. package/dist/config/loader.d.ts +16 -0
  42. package/dist/config/loader.d.ts.map +1 -0
  43. package/dist/config/loader.js +56 -0
  44. package/dist/config/model-resolver.d.ts +24 -0
  45. package/dist/config/model-resolver.d.ts.map +1 -0
  46. package/dist/config/model-resolver.js +39 -0
  47. package/dist/config/resolver.d.ts +22 -0
  48. package/dist/config/resolver.d.ts.map +1 -0
  49. package/dist/config/resolver.js +115 -0
  50. package/dist/config/schemas.d.ts +266 -0
  51. package/dist/config/schemas.d.ts.map +1 -0
  52. package/dist/config/schemas.js +115 -0
  53. package/dist/context/artifact-reader.d.ts +12 -0
  54. package/dist/context/artifact-reader.d.ts.map +1 -0
  55. package/dist/context/artifact-reader.js +92 -0
  56. package/dist/context/assembler.d.ts +22 -0
  57. package/dist/context/assembler.d.ts.map +1 -0
  58. package/dist/context/assembler.js +126 -0
  59. package/dist/context/code-reader.d.ts +14 -0
  60. package/dist/context/code-reader.d.ts.map +1 -0
  61. package/dist/context/code-reader.js +56 -0
  62. package/dist/context/context.test.d.ts +2 -0
  63. package/dist/context/context.test.d.ts.map +1 -0
  64. package/dist/context/context.test.js +260 -0
  65. package/dist/context/index.d.ts +9 -0
  66. package/dist/context/index.d.ts.map +1 -0
  67. package/dist/context/index.js +5 -0
  68. package/dist/context/section-extractor.d.ts +9 -0
  69. package/dist/context/section-extractor.d.ts.map +1 -0
  70. package/dist/context/section-extractor.js +32 -0
  71. package/dist/context/token-budget.d.ts +11 -0
  72. package/dist/context/token-budget.d.ts.map +1 -0
  73. package/dist/context/token-budget.js +22 -0
  74. package/dist/control/control.test.d.ts +2 -0
  75. package/dist/control/control.test.d.ts.map +1 -0
  76. package/dist/control/control.test.js +137 -0
  77. package/dist/control/id-generator.d.ts +12 -0
  78. package/dist/control/id-generator.d.ts.map +1 -0
  79. package/dist/control/id-generator.js +20 -0
  80. package/dist/control/index.d.ts +5 -0
  81. package/dist/control/index.d.ts.map +1 -0
  82. package/dist/control/index.js +3 -0
  83. package/dist/control/lock-manager.d.ts +13 -0
  84. package/dist/control/lock-manager.d.ts.map +1 -0
  85. package/dist/control/lock-manager.js +72 -0
  86. package/dist/control/run-state.d.ts +16 -0
  87. package/dist/control/run-state.d.ts.map +1 -0
  88. package/dist/control/run-state.js +55 -0
  89. package/dist/engine/composite.d.ts +34 -0
  90. package/dist/engine/composite.d.ts.map +1 -0
  91. package/dist/engine/composite.js +192 -0
  92. package/dist/engine/composite.test.d.ts +2 -0
  93. package/dist/engine/composite.test.d.ts.map +1 -0
  94. package/dist/engine/composite.test.js +1947 -0
  95. package/dist/engine/engine.test.d.ts +2 -0
  96. package/dist/engine/engine.test.d.ts.map +1 -0
  97. package/dist/engine/engine.test.js +334 -0
  98. package/dist/engine/index.d.ts +10 -0
  99. package/dist/engine/index.d.ts.map +1 -0
  100. package/dist/engine/index.js +5 -0
  101. package/dist/engine/llm-client.d.ts +27 -0
  102. package/dist/engine/llm-client.d.ts.map +1 -0
  103. package/dist/engine/llm-client.js +46 -0
  104. package/dist/engine/simple.d.ts +21 -0
  105. package/dist/engine/simple.d.ts.map +1 -0
  106. package/dist/engine/simple.js +59 -0
  107. package/dist/engine/tool-dispatch.d.ts +37 -0
  108. package/dist/engine/tool-dispatch.d.ts.map +1 -0
  109. package/dist/engine/tool-dispatch.js +146 -0
  110. package/dist/engine/tool-dispatch.test.d.ts +2 -0
  111. package/dist/engine/tool-dispatch.test.d.ts.map +1 -0
  112. package/dist/engine/tool-dispatch.test.js +348 -0
  113. package/dist/engine/tool-filter.d.ts +13 -0
  114. package/dist/engine/tool-filter.d.ts.map +1 -0
  115. package/dist/engine/tool-filter.js +25 -0
  116. package/dist/evaluation/evaluation.test.d.ts +2 -0
  117. package/dist/evaluation/evaluation.test.d.ts.map +1 -0
  118. package/dist/evaluation/evaluation.test.js +490 -0
  119. package/dist/evaluation/evaluator.d.ts +19 -0
  120. package/dist/evaluation/evaluator.d.ts.map +1 -0
  121. package/dist/evaluation/evaluator.js +78 -0
  122. package/dist/evaluation/index.d.ts +4 -0
  123. package/dist/evaluation/index.d.ts.map +1 -0
  124. package/dist/evaluation/index.js +2 -0
  125. package/dist/evaluation/scorer.d.ts +38 -0
  126. package/dist/evaluation/scorer.d.ts.map +1 -0
  127. package/dist/evaluation/scorer.js +94 -0
  128. package/dist/index.d.ts +47 -0
  129. package/dist/index.d.ts.map +1 -0
  130. package/dist/index.js +28 -0
  131. package/dist/providers/index.d.ts +2 -0
  132. package/dist/providers/index.d.ts.map +1 -0
  133. package/dist/providers/index.js +1 -0
  134. package/dist/providers/provider-factory.d.ts +11 -0
  135. package/dist/providers/provider-factory.d.ts.map +1 -0
  136. package/dist/providers/provider-factory.js +30 -0
  137. package/dist/publication/frontmatter.d.ts +21 -0
  138. package/dist/publication/frontmatter.d.ts.map +1 -0
  139. package/dist/publication/frontmatter.js +15 -0
  140. package/dist/publication/git-ops.d.ts +18 -0
  141. package/dist/publication/git-ops.d.ts.map +1 -0
  142. package/dist/publication/git-ops.js +74 -0
  143. package/dist/publication/index.d.ts +9 -0
  144. package/dist/publication/index.d.ts.map +1 -0
  145. package/dist/publication/index.js +5 -0
  146. package/dist/publication/provenance-writer.d.ts +27 -0
  147. package/dist/publication/provenance-writer.d.ts.map +1 -0
  148. package/dist/publication/provenance-writer.js +21 -0
  149. package/dist/publication/publication.test.d.ts +2 -0
  150. package/dist/publication/publication.test.d.ts.map +1 -0
  151. package/dist/publication/publication.test.js +235 -0
  152. package/dist/publication/publisher.d.ts +32 -0
  153. package/dist/publication/publisher.d.ts.map +1 -0
  154. package/dist/publication/publisher.js +113 -0
  155. package/dist/publication/secret-scanner.d.ts +6 -0
  156. package/dist/publication/secret-scanner.d.ts.map +1 -0
  157. package/dist/publication/secret-scanner.js +19 -0
  158. package/dist/tools/index.d.ts +4 -0
  159. package/dist/tools/index.d.ts.map +1 -0
  160. package/dist/tools/index.js +2 -0
  161. package/dist/tools/registry.d.ts +15 -0
  162. package/dist/tools/registry.d.ts.map +1 -0
  163. package/dist/tools/registry.js +288 -0
  164. package/dist/tools/registry.test.d.ts +2 -0
  165. package/dist/tools/registry.test.d.ts.map +1 -0
  166. package/dist/tools/registry.test.js +131 -0
  167. package/dist/tools/tool-groups.d.ts +20 -0
  168. package/dist/tools/tool-groups.d.ts.map +1 -0
  169. package/dist/tools/tool-groups.js +48 -0
  170. package/dist/tools/tool-groups.test.d.ts +2 -0
  171. package/dist/tools/tool-groups.test.d.ts.map +1 -0
  172. package/dist/tools/tool-groups.test.js +127 -0
  173. package/dist/types/artifact-store.d.ts +33 -0
  174. package/dist/types/artifact-store.d.ts.map +1 -0
  175. package/dist/types/artifact-store.js +9 -0
  176. package/dist/types/evaluation-rubric.d.ts +18 -0
  177. package/dist/types/evaluation-rubric.d.ts.map +1 -0
  178. package/dist/types/evaluation-rubric.js +1 -0
  179. package/dist/types/index.d.ts +10 -0
  180. package/dist/types/index.d.ts.map +1 -0
  181. package/dist/types/index.js +1 -0
  182. package/dist/types/llm-provider.d.ts +47 -0
  183. package/dist/types/llm-provider.d.ts.map +1 -0
  184. package/dist/types/llm-provider.js +8 -0
  185. package/dist/types/persona-spec.d.ts +79 -0
  186. package/dist/types/persona-spec.d.ts.map +1 -0
  187. package/dist/types/persona-spec.js +1 -0
  188. package/dist/types/project-config.d.ts +28 -0
  189. package/dist/types/project-config.d.ts.map +1 -0
  190. package/dist/types/project-config.js +1 -0
  191. package/dist/types/provenance.d.ts +67 -0
  192. package/dist/types/provenance.d.ts.map +1 -0
  193. package/dist/types/provenance.js +1 -0
  194. package/dist/types/run-state.d.ts +11 -0
  195. package/dist/types/run-state.d.ts.map +1 -0
  196. package/dist/types/run-state.js +1 -0
  197. package/dist/types/tool-runtime.d.ts +43 -0
  198. package/dist/types/tool-runtime.d.ts.map +1 -0
  199. package/dist/types/tool-runtime.js +30 -0
  200. package/dist/workspace/detect.d.ts +11 -0
  201. package/dist/workspace/detect.d.ts.map +1 -0
  202. package/dist/workspace/detect.js +28 -0
  203. package/dist/workspace/detect.test.d.ts +2 -0
  204. package/dist/workspace/detect.test.d.ts.map +1 -0
  205. package/dist/workspace/detect.test.js +53 -0
  206. package/dist/workspace/index.d.ts +2 -0
  207. package/dist/workspace/index.d.ts.map +1 -0
  208. package/dist/workspace/index.js +1 -0
  209. package/package.json +51 -0
@@ -0,0 +1,490 @@
1
+ import { describe, it, expect, vi, beforeEach } from 'vitest';
2
+ import { parseEvaluationResponse, computeAggregate, determinePassFail, formatFailureFeedback, } from './scorer.js';
3
+ import { evaluate } from './evaluator.js';
4
// Resolved-model stub handed to evaluate() in every test below.
const mockModel = {
    alias: 'claude-sonnet',
    provider: 'anthropic',
    concreteModel: 'claude-sonnet-4-20250514',
    apiKeyEnv: 'ANTHROPIC_API_KEY',
};
// Pull-request rubric fixture: two blocking criteria, one important, one
// advisory, scored on a 10-point scale with a pass threshold of 7.
const testRubric = {
    rubric: {
        artifact_type: 'pull-request',
        scoring_scale: 10,
        pass_threshold: 7,
        criteria: [
            // Only the first criterion carries a description.
            { name: 'Tests cover acceptance criteria', weight: 'blocking', description: 'Must have tests' },
            { name: 'Code follows patterns', weight: 'blocking' },
            { name: 'PR description links to story', weight: 'important' },
            { name: 'Readable code', weight: 'advisory' },
        ],
    },
};
27
/**
 * Serialize a list of per-criterion scores into the JSON text that the
 * tests feed back as the mocked LLM response.
 */
function makeScoringResponse(scores) {
    const serialized = JSON.stringify(scores);
    return serialized;
}
30
// Mocked LLM output in which both blocking criteria score at or above 7.
const passingScores = makeScoringResponse([
    ['Tests cover acceptance criteria', 8, 'Good coverage'],
    ['Code follows patterns', 9, 'Consistent'],
    ['PR description links to story', 7, 'Links present'],
    ['Readable code', 8, 'Clear'],
].map(([name, score, reasoning]) => ({ name, score, reasoning })));
// Mocked LLM output in which the first blocking criterion scores 5.
const failingScores = makeScoringResponse([
    ['Tests cover acceptance criteria', 5, 'Missing tests for edge cases'],
    ['Code follows patterns', 8, 'OK'],
    ['PR description links to story', 6, 'Missing design link'],
    ['Readable code', 7, 'Acceptable'],
].map(([name, score, reasoning]) => ({ name, score, reasoning })));
42
// Covers parseEvaluationResponse (parsing, extraction, fallback, name
// matching) and computeAggregate (weighted mean, empty input).
describe('T-01-005a: rubric loading and scoring', () => {
    // One scored criterion with empty reasoning, for the aggregate test.
    const scored = (name, weight, score) => ({ name, weight, score, reasoning: '' });
    it('parses a well-formed JSON evaluation response', () => {
        const parsed = parseEvaluationResponse(passingScores, testRubric);
        expect(parsed).toHaveLength(4);
        const first = parsed[0];
        expect(first.name).toBe('Tests cover acceptance criteria');
        expect(first.weight).toBe('blocking');
        expect(first.score).toBe(8);
        expect(first.reasoning).toBe('Good coverage');
    });
    it('extracts JSON array from surrounding text', () => {
        // Wrap the JSON array in chatter before and after it.
        const wrapped = ['Here is my evaluation:', passingScores, 'That is my assessment.'].join('\n');
        const parsed = parseEvaluationResponse(wrapped, testRubric);
        expect(parsed).toHaveLength(4);
        expect(parsed[0].score).toBe(8);
    });
    it('returns zero scores for unparseable responses', () => {
        const parsed = parseEvaluationResponse('This is not JSON at all.', testRubric);
        expect(parsed).toHaveLength(4);
        const allZero = parsed.every((entry) => entry.score === 0);
        expect(allZero).toBe(true);
    });
    it('matches criteria case-insensitively', () => {
        // Criterion names differ from the rubric only by letter case.
        const recased = makeScoringResponse([
            { name: 'tests cover acceptance criteria', score: 9, reasoning: 'Great' },
            { name: 'CODE FOLLOWS PATTERNS', score: 8, reasoning: 'Good' },
            { name: 'pr description links to story', score: 7, reasoning: 'OK' },
            { name: 'readable code', score: 6, reasoning: 'Fine' },
        ]);
        const parsed = parseEvaluationResponse(recased, testRubric);
        expect(parsed[0].score).toBe(9);
        expect(parsed[1].score).toBe(8);
    });
    it('computes weighted aggregate correctly', () => {
        const aggregate = computeAggregate([
            scored('A', 'blocking', 8),
            scored('B', 'blocking', 6),
            scored('C', 'important', 7),
            scored('D', 'advisory', 9),
        ]);
        // (8*3 + 6*3 + 7*2 + 9*1) / (3+3+2+1) = (24+18+14+9)/9 = 65/9 = 7.22
        expect(aggregate).toBeCloseTo(7.22, 1);
    });
    it('returns 0 aggregate for empty scores', () => {
        const aggregate = computeAggregate([]);
        expect(aggregate).toBe(0);
    });
});
88
// Covers determinePassFail: blocking criteria against pass_threshold, and
// the optional aggregate_threshold.
describe('T-01-005b: pass/fail determination', () => {
    // One scored criterion with empty reasoning.
    const scored = (name, weight, score) => ({ name, weight, score, reasoning: '' });
    // Copy of the shared rubric with an aggregate_threshold added.
    const withAggregateThreshold = (threshold) => ({
        rubric: {
            ...testRubric.rubric,
            aggregate_threshold: threshold,
        },
    });
    it('passes when all blocking criteria meet threshold', () => {
        const verdict = determinePassFail([
            scored('A', 'blocking', 8),
            scored('B', 'blocking', 7),
            scored('C', 'important', 5),
        ], testRubric);
        expect(verdict).toBe(true);
    });
    it('fails when any blocking criterion is below threshold', () => {
        const verdict = determinePassFail([
            scored('A', 'blocking', 6),
            scored('B', 'blocking', 9),
        ], testRubric);
        expect(verdict).toBe(false);
    });
    it('important and advisory below threshold do not cause failure', () => {
        const verdict = determinePassFail([
            scored('A', 'blocking', 7),
            scored('B', 'important', 3),
            scored('C', 'advisory', 2),
        ], testRubric);
        expect(verdict).toBe(true);
    });
    it('checks aggregate threshold when configured', () => {
        const rubricWithAgg = withAggregateThreshold(8);
        const rows = [scored('A', 'blocking', 7), scored('B', 'blocking', 7)];
        // aggregate = 7, threshold = 8 -> fail
        expect(determinePassFail(rows, rubricWithAgg)).toBe(false);
    });
    it('passes when aggregate meets threshold', () => {
        const rubricWithAgg = withAggregateThreshold(7);
        const rows = [scored('A', 'blocking', 8), scored('B', 'blocking', 7)];
        expect(determinePassFail(rows, rubricWithAgg)).toBe(true);
    });
});
140
// Covers the refinement loop of evaluate(): a failing first evaluation
// must trigger refineCallback and a second evaluation.
describe('T-01-005c: refinement iteration on failure', () => {
    // Provider stub whose first generateText call resolves with the failing
    // scores and every later call with the passing scores. A fresh reply
    // object is produced on each call.
    const failThenPassProvider = () => {
        let calls = 0;
        return {
            generateText: vi.fn().mockImplementation(() => {
                calls += 1;
                const text = calls === 1 ? failingScores : passingScores;
                return Promise.resolve({ text, toolCalls: [], tokensIn: 10, tokensOut: 20 });
            }),
        };
    };
    beforeEach(() => {
        vi.restoreAllMocks();
    });
    it('calls refineCallback when evaluation fails and re-evaluates', async () => {
        const provider = failThenPassProvider();
        const refineCallback = vi.fn().mockResolvedValue('Refined artifact');
        const outcome = await evaluate('Original artifact', testRubric, mockModel, provider, 3, refineCallback);
        expect(refineCallback).toHaveBeenCalledOnce();
        expect(outcome.evaluation.pass).toBe(true);
        expect(outcome.finalArtifact).toBe('Refined artifact');
    });
    it('passes feedback with failing criterion details to refineCallback', async () => {
        const provider = failThenPassProvider();
        const refineCallback = vi.fn().mockResolvedValue('Fixed');
        await evaluate('Artifact', testRubric, mockModel, provider, 2, refineCallback);
        // First argument of the first refineCallback invocation.
        const firstCallArgs = refineCallback.mock.calls[0];
        const feedback = firstCallArgs[0];
        expect(feedback).toContain('Tests cover acceptance criteria');
        expect(feedback).toContain('Missing tests');
    });
});
189
// Covers evaluate() when the artifact never passes: the result must report
// pass=false and still hand back the most recent artifact.
describe('T-01-005d: publish with quality flag on exhaustion', () => {
    // Provider stub that resolves with the failing scores on every call.
    const alwaysFailingProvider = () => ({
        generateText: vi.fn().mockResolvedValue({
            text: failingScores,
            toolCalls: [],
            tokensIn: 10,
            tokensOut: 20,
        }),
    });
    it('returns pass=false when max iterations exhausted', async () => {
        const provider = alwaysFailingProvider();
        const refineCallback = vi.fn().mockResolvedValue('Still failing');
        const outcome = await evaluate('Bad artifact', testRubric, mockModel, provider, 2, refineCallback);
        expect(outcome.evaluation.pass).toBe(false);
        expect(outcome.evaluation.maxIterations).toBe(2);
        expect(refineCallback).toHaveBeenCalledOnce();
    });
    it('returns the last refined artifact even on exhaustion', async () => {
        const provider = alwaysFailingProvider();
        const refineCallback = vi.fn().mockResolvedValue('Last attempt artifact');
        const outcome = await evaluate('Original', testRubric, mockModel, provider, 2, refineCallback);
        expect(outcome.finalArtifact).toBe('Last attempt artifact');
    });
    it('works without a refineCallback (single evaluation)', async () => {
        const provider = alwaysFailingProvider();
        // No callback supplied: the input artifact comes back unchanged.
        const outcome = await evaluate('Artifact', testRubric, mockModel, provider, 1);
        expect(outcome.evaluation.pass).toBe(false);
        expect(outcome.finalArtifact).toBe('Artifact');
    });
});
232
// Runs the evaluate() loop against an ADR-summary rubric: pass, fail,
// refinement, exhaustion, and the shape of the returned scores.
describe('T-FR-03 (CREW-05): ADR self-evaluation', () => {
    // Rubric with three blocking and two important criteria.
    const adrRubric = {
        rubric: {
            artifact_type: 'adr-summary',
            scoring_scale: 10,
            pass_threshold: 7,
            criteria: [
                { name: 'ADRs follow project template', weight: 'blocking', description: 'Template compliance' },
                { name: 'Options analysis is balanced', weight: 'blocking', description: 'Balanced options' },
                { name: 'Rationale is justified by analysis', weight: 'blocking', description: 'Justified rationale' },
                { name: 'Consequences are documented', weight: 'important', description: 'Documented consequences' },
                { name: 'No contradictions with existing ADRs', weight: 'important', description: 'No contradictions' },
            ],
        },
    };
    // Turns [name, score, reasoning] rows into the serialized LLM response.
    const scoringResponseFromRows = (rows) => makeScoringResponse(rows.map(([name, score, reasoning]) => ({ name, score, reasoning })));
    // Every criterion scores at least 7.
    const adrPassingScores = scoringResponseFromRows([
        ['ADRs follow project template', 8, 'All sections complete'],
        ['Options analysis is balanced', 7, 'Two viable options with substantive pros/cons'],
        ['Rationale is justified by analysis', 8, 'Decision follows from drivers'],
        ['Consequences are documented', 7, 'Benefits and trade-offs listed'],
        ['No contradictions with existing ADRs', 9, 'Consistent with ADR register'],
    ]);
    // Two blocking criteria (options, rationale) score below 7.
    const adrFailingScores = scoringResponseFromRows([
        ['ADRs follow project template', 8, 'Template OK'],
        ['Options analysis is balanced', 4, 'Only one viable option; second is a straw-man'],
        ['Rationale is justified by analysis', 5, 'Decision asserted without evidence'],
        ['Consequences are documented', 6, 'Only positive consequences listed'],
        ['No contradictions with existing ADRs', 8, 'OK'],
    ]);
    // Provider stub that resolves with the same score text on every call.
    const providerReturning = (text) => ({
        generateText: vi.fn().mockResolvedValue({ text, toolCalls: [], tokensIn: 10, tokensOut: 20 }),
    });
    // Provider stub: failing scores on the first call, passing afterwards.
    // A fresh reply object is produced on each call.
    const failThenPassProvider = () => {
        let calls = 0;
        return {
            generateText: vi.fn().mockImplementation(() => {
                calls += 1;
                const text = calls === 1 ? adrFailingScores : adrPassingScores;
                return Promise.resolve({ text, toolCalls: [], tokensIn: 10, tokensOut: 20 });
            }),
        };
    };
    beforeEach(() => {
        vi.restoreAllMocks();
    });
    it('T-FR-03-1: self-evaluation runs and passes for quality ADR summary', async () => {
        const provider = providerReturning(adrPassingScores);
        const outcome = await evaluate('# ADR Summary\n\nAll ADRs produced.', adrRubric, mockModel, provider, 2);
        expect(outcome.evaluation.pass).toBe(true);
        expect(outcome.evaluation.scores).toHaveLength(5);
        expect(outcome.evaluation.aggregate).toBeGreaterThan(7);
    });
    it('T-FR-03-1: self-evaluation fails when blocking criteria are below threshold', async () => {
        const provider = providerReturning(adrFailingScores);
        const outcome = await evaluate('# Weak ADR Summary', adrRubric, mockModel, provider, 1);
        expect(outcome.evaluation.pass).toBe(false);
        const failingCriteria = outcome.evaluation.scores.filter((entry) => entry.weight === 'blocking' && entry.score < 7);
        expect(failingCriteria.length).toBeGreaterThan(0);
    });
    it('T-FR-03-2: refinement is triggered when ADR evaluation fails', async () => {
        const provider = failThenPassProvider();
        const refineCallback = vi
            .fn()
            .mockResolvedValue('# Revised ADR Summary\n\nBalanced options added.');
        const outcome = await evaluate('# Weak ADR', adrRubric, mockModel, provider, 3, refineCallback);
        expect(refineCallback).toHaveBeenCalledOnce();
        expect(outcome.evaluation.pass).toBe(true);
        expect(outcome.finalArtifact).toContain('Revised ADR Summary');
    });
    it('T-FR-03-2: refinement feedback includes failing ADR criteria', async () => {
        const provider = failThenPassProvider();
        const refineCallback = vi.fn().mockResolvedValue('Refined');
        await evaluate('ADR', adrRubric, mockModel, provider, 2, refineCallback);
        // First argument of the first refineCallback invocation.
        const firstCallArgs = refineCallback.mock.calls[0];
        const feedback = firstCallArgs[0];
        expect(feedback).toContain('Options analysis is balanced');
        expect(feedback).toContain('straw-man');
        expect(feedback).toContain('Rationale is justified by analysis');
    });
    it('T-FR-03-3: quality flag set when self-evaluation is exhausted', async () => {
        const provider = providerReturning(adrFailingScores);
        const refineCallback = vi.fn().mockResolvedValue('Still failing');
        const outcome = await evaluate('Bad ADR', adrRubric, mockModel, provider, 2, refineCallback);
        expect(outcome.evaluation.pass).toBe(false);
        expect(outcome.evaluation.maxIterations).toBe(2);
        expect(refineCallback).toHaveBeenCalledOnce();
    });
    it('T-FR-03-4: evaluation scores are structured for provenance recording', async () => {
        const provider = providerReturning(adrPassingScores);
        const outcome = await evaluate('ADR Summary', adrRubric, mockModel, provider, 1);
        expect(outcome.evaluation.scores).toHaveLength(5);
        for (const entry of outcome.evaluation.scores) {
            for (const field of ['name', 'weight', 'score', 'reasoning']) {
                expect(entry).toHaveProperty(field);
            }
        }
        expect(typeof outcome.evaluation.aggregate).toBe('number');
        expect(typeof outcome.evaluation.iteration).toBe('number');
        expect(typeof outcome.evaluation.maxIterations).toBe('number');
    });
});
430
// Covers the data evaluate() returns for provenance (per-criterion scores,
// aggregate, iteration counters) and the formatFailureFeedback output.
describe('T-01-005e: scores in provenance', () => {
    // Provider stub that resolves with the passing scores on every call.
    const passingProvider = () => ({
        generateText: vi.fn().mockResolvedValue({
            text: passingScores,
            toolCalls: [],
            tokensIn: 10,
            tokensOut: 20,
        }),
    });
    it('returns per-criterion scores with name, weight, score, reasoning', async () => {
        const outcome = await evaluate('Good artifact', testRubric, mockModel, passingProvider(), 1);
        expect(outcome.evaluation.scores).toHaveLength(4);
        for (const entry of outcome.evaluation.scores) {
            for (const field of ['name', 'weight', 'score', 'reasoning']) {
                expect(entry).toHaveProperty(field);
            }
            expect(typeof entry.score).toBe('number');
        }
    });
    it('returns aggregate score for provenance recording', async () => {
        const outcome = await evaluate('Artifact', testRubric, mockModel, passingProvider(), 1);
        expect(outcome.evaluation.aggregate).toBeGreaterThan(0);
        expect(typeof outcome.evaluation.aggregate).toBe('number');
    });
    it('includes iteration count and maxIterations', async () => {
        const outcome = await evaluate('Artifact', testRubric, mockModel, passingProvider(), 3);
        // Passing on the first evaluation reports iteration 0.
        expect(outcome.evaluation.iteration).toBe(0);
        expect(outcome.evaluation.maxIterations).toBe(3);
    });
    it('formats failure feedback with criterion details', () => {
        const feedback = formatFailureFeedback([
            { name: 'Tests', weight: 'blocking', score: 5, reasoning: 'Missing edge cases' },
            { name: 'Patterns', weight: 'blocking', score: 8, reasoning: 'OK' },
            { name: 'Docs', weight: 'important', score: 4, reasoning: 'No links' },
        ]);
        for (const fragment of ['Tests', 'Missing edge cases', 'blocking', 'Docs', 'No links']) {
            expect(feedback).toContain(fragment);
        }
        // The criterion scored 8 must be left out of the feedback.
        expect(feedback).not.toContain('Patterns');
    });
});
@@ -0,0 +1,19 @@
1
+ import type { EvaluationRubric, LlmProvider } from '../types/index.js';
2
+ import type { ResolvedModel } from '../config/model-resolver.js';
3
+ import type { EvaluationResult } from './scorer.js';
4
/**
 * Evaluate an artifact against a quality rubric using an LLM.
 *
 * Implements the self-evaluation loop per ADR-0004:
 * 1. Score each criterion numerically (LLM output)
 * 2. Check blocking criteria against pass_threshold
 * 3. If failed, invoke refineCallback with feedback and re-evaluate
 * 4. On exhaustion, return the last evaluation with pass=false
 *
 * The caller (pipeline) decides whether to publish with a quality flag.
 *
 * @param artifact - Text of the artifact to score.
 * @param rubric - Rubric supplying the criteria, scoring scale and pass threshold.
 * @param model - Resolved model descriptor; presumably forwarded to the LLM call (not visible from this declaration; confirm in evaluator.ts).
 * @param provider - LLM provider whose response text is parsed into per-criterion scores.
 * @param maxIterations - Upper bound on evaluation rounds. When omitted, the implementation
 *   reads the CREW_MAX_REFINEMENT_ITERATIONS environment variable and falls back to 2 if it
 *   is unset or not a positive integer.
 * @param refineCallback - Optional. Receives the failure feedback text and resolves to the
 *   revised artifact, which is then re-evaluated. Without it the input artifact is returned as-is.
 * @returns `finalArtifact` (the original artifact, or the latest refined one) together with
 *   the last `evaluation`, whose `pass` flag is false when the iterations were exhausted.
 */
export declare function evaluate(artifact: string, rubric: EvaluationRubric, model: ResolvedModel, provider: LlmProvider, maxIterations?: number, refineCallback?: (feedback: string) => Promise<string>): Promise<{
    finalArtifact: string;
    evaluation: EvaluationResult;
}>;
19
+ //# sourceMappingURL=evaluator.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"evaluator.d.ts","sourceRoot":"","sources":["../../src/evaluation/evaluator.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,gBAAgB,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AACvE,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,6BAA6B,CAAC;AAQjE,OAAO,KAAK,EAAkB,gBAAgB,EAAE,MAAM,aAAa,CAAC;AA4BpE;;;;;;;;;;GAUG;AACH,wBAAsB,QAAQ,CAC5B,QAAQ,EAAE,MAAM,EAChB,MAAM,EAAE,gBAAgB,EACxB,KAAK,EAAE,aAAa,EACpB,QAAQ,EAAE,WAAW,EACrB,aAAa,CAAC,EAAE,MAAM,EACtB,cAAc,CAAC,EAAE,CAAC,QAAQ,EAAE,MAAM,KAAK,OAAO,CAAC,MAAM,CAAC,GACrD,OAAO,CAAC;IAAE,aAAa,EAAE,MAAM,CAAC;IAAC,UAAU,EAAE,gBAAgB,CAAA;CAAE,CAAC,CAkDlE"}
@@ -0,0 +1,78 @@
1
+ import { callLlm } from '../engine/llm-client.js';
2
+ import { parseEvaluationResponse, computeAggregate, determinePassFail, formatFailureFeedback, } from './scorer.js';
3
/** Iteration budget used when the environment does not supply a usable override. */
const DEFAULT_MAX_ITERATIONS = 2;

/**
 * Resolve the refinement-iteration budget from the environment.
 *
 * Reads `CREW_MAX_REFINEMENT_ITERATIONS` and parses it as a base-10 integer.
 * A value that parses to a positive number is returned; an unset, empty,
 * unparsable, zero, or negative value yields DEFAULT_MAX_ITERATIONS.
 *
 * @returns The number of evaluation rounds to allow.
 */
function getMaxIterations() {
    const raw = process.env['CREW_MAX_REFINEMENT_ITERATIONS'];
    if (!raw) {
        return DEFAULT_MAX_ITERATIONS;
    }
    const candidate = Number.parseInt(raw, 10);
    // `NaN > 0` is false, so this single comparison also rejects unparsable input.
    return candidate > 0 ? candidate : DEFAULT_MAX_ITERATIONS;
}
13
/**
 * Build the user prompt that asks the LLM to score an artifact.
 *
 * The prompt lists every rubric criterion (name, weight, and description when
 * present), states the scoring scale (`rubric.rubric.scoring_scale`, or 10 when
 * that is null/undefined), demands a JSON-array-only reply, and embeds the
 * artifact between delimiter lines.
 *
 * @param artifact - Text to be evaluated; interpolated verbatim.
 * @param rubric - Object whose `rubric.criteria` entries expose `name`, `weight`
 *   and optionally `description`.
 * @returns The complete prompt string.
 */
function buildEvaluationPrompt(artifact, rubric) {
    const { scoring_scale: configuredScale, criteria } = rubric.rubric;
    const upperBound = configuredScale ?? 10;
    const bullets = [];
    for (const criterion of criteria) {
        let bullet = `- "${criterion.name}" (weight: ${criterion.weight})`;
        if (criterion.description) {
            bullet += `: ${criterion.description}`;
        }
        bullets.push(bullet);
    }
    // Each entry is one output line; empty entries produce the blank separator lines.
    const promptLines = [
        `Evaluate the following artifact against each criterion on a scale of 1-${upperBound}.`,
        '',
        'Criteria:',
        bullets.join('\n'),
        '',
        'Respond with ONLY a JSON array of objects, one per criterion:',
        `[{ "name": "criterion name", "score": <number 1-${upperBound}>, "reasoning": "brief explanation" }]`,
        '',
        `--- Artifact to evaluate ---\n${artifact}\n--- End artifact ---`,
    ];
    return promptLines.join('\n');
}
24
+ /**
25
+ * Evaluate an artifact against a quality rubric using an LLM.
26
+ *
27
+ * Implements the self-evaluation loop per ADR-0004:
28
+ * 1. Score each criterion numerically (LLM output)
29
+ * 2. Check blocking criteria against pass_threshold
30
+ * 3. If failed, invoke refineCallback with feedback and re-evaluate
31
+ * 4. On exhaustion, return the last evaluation with pass=false
32
+ *
33
+ * The caller (pipeline) decides whether to publish with a quality flag.
34
+ */
35
+ export async function evaluate(artifact, rubric, model, provider, maxIterations, refineCallback) {
36
+ const maxIter = maxIterations ?? getMaxIterations();
37
+ let currentArtifact = artifact;
38
+ let lastScores = [];
39
+ let lastAggregate = 0;
40
+ for (let iteration = 0; iteration < maxIter; iteration++) {
41
+ const prompt = buildEvaluationPrompt(currentArtifact, rubric);
42
+ const result = await callLlm({
43
+ provider,
44
+ model,
45
+ system: 'You are a quality evaluator. Score the artifact against the provided criteria. Return ONLY a JSON array.',
46
+ messages: [{ role: 'user', content: prompt }],
47
+ maxRetries: 1,
48
+ });
49
+ lastScores = parseEvaluationResponse(result.text, rubric);
50
+ lastAggregate = computeAggregate(lastScores);
51
+ if (determinePassFail(lastScores, rubric)) {
52
+ return {
53
+ finalArtifact: currentArtifact,
54
+ evaluation: {
55
+ scores: lastScores,
56
+ aggregate: lastAggregate,
57
+ pass: true,
58
+ iteration,
59
+ maxIterations: maxIter,
60
+ },
61
+ };
62
+ }
63
+ if (iteration < maxIter - 1 && refineCallback) {
64
+ const feedback = formatFailureFeedback(lastScores);
65
+ currentArtifact = await refineCallback(feedback);
66
+ }
67
+ }
68
+ return {
69
+ finalArtifact: currentArtifact,
70
+ evaluation: {
71
+ scores: lastScores,
72
+ aggregate: lastAggregate,
73
+ pass: false,
74
+ iteration: maxIter - 1,
75
+ maxIterations: maxIter,
76
+ },
77
+ };
78
+ }
@@ -0,0 +1,4 @@
1
+ export { evaluate } from './evaluator.js';
2
+ export { computeAggregate, determinePassFail, parseEvaluationResponse, formatFailureFeedback, } from './scorer.js';
3
+ export type { CriterionScore, EvaluationResult } from './scorer.js';
4
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/evaluation/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,MAAM,gBAAgB,CAAC;AAC1C,OAAO,EACL,gBAAgB,EAChB,iBAAiB,EACjB,uBAAuB,EACvB,qBAAqB,GACtB,MAAM,aAAa,CAAC;AACrB,YAAY,EAAE,cAAc,EAAE,gBAAgB,EAAE,MAAM,aAAa,CAAC"}
@@ -0,0 +1,2 @@
1
+ export { evaluate } from './evaluator.js';
2
+ export { computeAggregate, determinePassFail, parseEvaluationResponse, formatFailureFeedback, } from './scorer.js';
@@ -0,0 +1,38 @@
1
+ import type { CriterionWeight, EvaluationRubric } from '../types/index.js';
2
/** Score assigned to a single rubric criterion during an evaluation. */
export interface CriterionScore {
    /** Criterion name. */
    name: string;
    /** Weight class of the criterion. */
    weight: CriterionWeight;
    /** Numeric score given to the criterion. */
    score: number;
    /** Explanation accompanying the score. */
    reasoning: string;
}
8
/** Outcome of evaluating an artifact against a rubric. */
export interface EvaluationResult {
    /** Per-criterion scores from the evaluation. */
    scores: CriterionScore[];
    /** Weighted aggregate of `scores`. */
    aggregate: number;
    /** Whether the artifact passed. */
    pass: boolean;
    /** Index of the evaluation round that produced this result. */
    iteration: number;
    /** Iteration budget that was in effect. */
    maxIterations: number;
}
15
+ /**
16
+ * Compute a weighted aggregate score from per-criterion scores.
17
+ * blocking=3, important=2, advisory=1
18
+ */
19
+ export declare function computeAggregate(scores: CriterionScore[]): number;
20
+ /**
21
+ * Determine pass/fail from per-criterion scores and rubric thresholds.
22
+ *
23
+ * Pass requires:
24
+ * 1. All blocking criteria score >= pass_threshold
25
+ * 2. If aggregate_threshold is set, the weighted aggregate must meet it
26
+ */
27
+ export declare function determinePassFail(scores: CriterionScore[], rubric: EvaluationRubric): boolean;
28
+ /**
29
+ * Parse the LLM's structured evaluation response into typed scores.
30
+ * Expects a JSON array of { name, score, reasoning } objects.
31
+ * Merges the weight from the rubric criteria definitions.
32
+ */
33
+ export declare function parseEvaluationResponse(text: string, rubric: EvaluationRubric): CriterionScore[];
34
+ /**
35
+ * Format failure feedback for the refinement callback.
36
+ */
37
+ export declare function formatFailureFeedback(scores: CriterionScore[]): string;
38
+ //# sourceMappingURL=scorer.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"scorer.d.ts","sourceRoot":"","sources":["../../src/evaluation/scorer.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,eAAe,EAAE,gBAAgB,EAAE,MAAM,mBAAmB,CAAC;AAE3E,MAAM,WAAW,cAAc;IAC7B,IAAI,EAAE,MAAM,CAAC;IACb,MAAM,EAAE,eAAe,CAAC;IACxB,KAAK,EAAE,MAAM,CAAC;IACd,SAAS,EAAE,MAAM,CAAC;CACnB;AAED,MAAM,WAAW,gBAAgB;IAC/B,MAAM,EAAE,cAAc,EAAE,CAAC;IACzB,SAAS,EAAE,MAAM,CAAC;IAClB,IAAI,EAAE,OAAO,CAAC;IACd,SAAS,EAAE,MAAM,CAAC;IAClB,aAAa,EAAE,MAAM,CAAC;CACvB;AAQD;;;GAGG;AACH,wBAAgB,gBAAgB,CAAC,MAAM,EAAE,cAAc,EAAE,GAAG,MAAM,CAcjE;AAED;;;;;;GAMG;AACH,wBAAgB,iBAAiB,CAAC,MAAM,EAAE,cAAc,EAAE,EAAE,MAAM,EAAE,gBAAgB,GAAG,OAAO,CAY7F;AAED;;;;GAIG;AACH,wBAAgB,uBAAuB,CAAC,IAAI,EAAE,MAAM,EAAE,MAAM,EAAE,gBAAgB,GAAG,cAAc,EAAE,CAuChG;AAED;;GAEG;AACH,wBAAgB,qBAAqB,CAAC,MAAM,EAAE,cAAc,EAAE,GAAG,MAAM,CAUtE"}