jfl 0.4.2 → 0.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,448 @@
name: JFL Eval Suite

on:
  pull_request:
    types: [opened, synchronize]
    branches: [main]

# Write access is required because this job commits eval entries back to the
# PR branch and can auto-merge improved PRs.
permissions:
  contents: write
  pull-requests: write
jobs:
  eval:
    name: Run JFL Eval
    runs-on: ubuntu-latest
    timeout-minutes: 15
    # Only run for agent branches (pp/*) or PRs explicitly labelled 'run-eval'.
    if: startsWith(github.head_ref, 'pp/') || contains(github.event.pull_request.labels.*.name, 'run-eval')

    env:
      JFL_HUB_URL: ${{ secrets.JFL_HUB_URL }}
      JFL_HUB_TOKEN: ${{ secrets.JFL_HUB_TOKEN }}
      JFL_TELEMETRY: 'off'
      NODE_OPTIONS: '--experimental-vm-modules'
25
+ steps:
26
+ - uses: actions/checkout@v4
27
+ with:
28
+ fetch-depth: 0
29
+
30
+ - uses: actions/setup-node@v4
31
+ with:
32
+ node-version: '22'
33
+ cache: 'npm'
34
+
35
+ - name: Install dependencies
36
+ run: npm ci
37
+
38
+ - name: Build
39
+ run: npx tsc --build --noEmit
40
+
41
+ - name: Run tests (baseline from main)
42
+ id: baseline
43
+ run: |
44
+ # Clean switch to main — remove PR-only files, restore main state
45
+ PR_SHA=$(git rev-parse HEAD)
46
+ git checkout origin/main --force
47
+ git clean -fd
48
+ npm ci --ignore-scripts 2>/dev/null || true
49
+
50
+ RESULT=$(npx jest --json --silent 2>/dev/null || echo '{}')
51
+ PASSING=$(echo "$RESULT" | node -e "
52
+ let d=''; process.stdin.on('data',c=>d+=c); process.stdin.on('end',()=>{
53
+ try { const j=JSON.parse(d); console.log(j.numPassedTests||0); }
54
+ catch(e) { console.log(0); }
55
+ })
56
+ ")
57
+ TOTAL=$(echo "$RESULT" | node -e "
58
+ let d=''; process.stdin.on('data',c=>d+=c); process.stdin.on('end',()=>{
59
+ try { const j=JSON.parse(d); console.log(j.numTotalTests||1); }
60
+ catch(e) { console.log(1); }
61
+ })
62
+ ")
63
+ SCORE=$(node -e "console.log(($TOTAL>0?$PASSING/$TOTAL:0).toFixed(4))")
64
+ echo "score=$SCORE" >> $GITHUB_OUTPUT
65
+ echo "passing=$PASSING" >> $GITHUB_OUTPUT
66
+ echo "total=$TOTAL" >> $GITHUB_OUTPUT
67
+ echo "Baseline: $SCORE ($PASSING/$TOTAL)"
68
+
69
+ # Switch back to PR branch
70
+ git checkout $PR_SHA --force
71
+ git clean -fd
72
+
73
+ - name: Run tests (PR branch)
74
+ id: pr_eval
75
+ run: |
76
+ npm ci --ignore-scripts 2>/dev/null || true
77
+
78
+ RESULT=$(npx jest --json --silent 2>/dev/null || echo '{}')
79
+ PASSING=$(echo "$RESULT" | node -e "
80
+ let d=''; process.stdin.on('data',c=>d+=c); process.stdin.on('end',()=>{
81
+ try { const j=JSON.parse(d); console.log(j.numPassedTests||0); }
82
+ catch(e) { console.log(0); }
83
+ })
84
+ ")
85
+ TOTAL=$(echo "$RESULT" | node -e "
86
+ let d=''; process.stdin.on('data',c=>d+=c); process.stdin.on('end',()=>{
87
+ try { const j=JSON.parse(d); console.log(j.numTotalTests||1); }
88
+ catch(e) { console.log(1); }
89
+ })
90
+ ")
91
+ SCORE=$(node -e "console.log(($TOTAL>0?$PASSING/$TOTAL:0).toFixed(4))")
92
+ echo "score=$SCORE" >> $GITHUB_OUTPUT
93
+ echo "passing=$PASSING" >> $GITHUB_OUTPUT
94
+ echo "total=$TOTAL" >> $GITHUB_OUTPUT
95
+ echo "PR score: $SCORE ($PASSING/$TOTAL)"
96
+
97
+ - name: Compute delta
98
+ id: delta
99
+ run: |
100
+ BASELINE=${{ steps.baseline.outputs.score }}
101
+ PR_SCORE=${{ steps.pr_eval.outputs.score }}
102
+ BASELINE_TOTAL=${{ steps.baseline.outputs.total }}
103
+ PR_TOTAL=${{ steps.pr_eval.outputs.total }}
104
+ BASELINE_PASSING=${{ steps.baseline.outputs.passing }}
105
+ PR_PASSING=${{ steps.pr_eval.outputs.passing }}
106
+
107
+ # Pass rate delta
108
+ DELTA=$(node -e "console.log(($PR_SCORE - $BASELINE).toFixed(4))")
109
+
110
+ # Test count delta
111
+ TESTS_ADDED=$(node -e "console.log($PR_TOTAL - $BASELINE_TOTAL)")
112
+ TESTS_PASSING_ADDED=$(node -e "console.log($PR_PASSING - $BASELINE_PASSING)")
113
+
114
+ # Improved = pass_rate up OR (pass_rate maintained at 1.0 AND more passing tests)
115
+ IMPROVED=$(node -e "
116
+ const prScore = $PR_SCORE;
117
+ const baseline = $BASELINE;
118
+ const testsAdded = $PR_PASSING - $BASELINE_PASSING;
119
+ const passRateUp = prScore > baseline;
120
+ const passRateMaintained = prScore >= 1.0 && baseline >= 1.0;
121
+ const moreTests = testsAdded > 0;
122
+ const noRegression = prScore >= baseline;
123
+ console.log(passRateUp || (passRateMaintained && moreTests) || (noRegression && moreTests));
124
+ ")
125
+
126
+ echo "delta=$DELTA" >> $GITHUB_OUTPUT
127
+ echo "improved=$IMPROVED" >> $GITHUB_OUTPUT
128
+ echo "tests_added=$TESTS_ADDED" >> $GITHUB_OUTPUT
129
+ echo "tests_passing_added=$TESTS_PASSING_ADDED" >> $GITHUB_OUTPUT
130
+ echo "Delta: $DELTA, tests_added=$TESTS_ADDED (improved=$IMPROVED)"
131
+
132
+ - name: AI quality assessment
133
+ id: ai_eval
134
+ env:
135
+ OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
136
+ OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
137
+ PR_TITLE: ${{ github.event.pull_request.title }}
138
+ run: |
139
+ # Get diff for AI assessment
140
+ git diff origin/main...HEAD -- '*.ts' '*.tsx' '*.js' '*.jsx' | head -3000 > /tmp/eval-diff.txt
141
+ DIFF_SIZE=$(wc -c < /tmp/eval-diff.txt | tr -d ' ')
142
+
143
+ if [ "$DIFF_SIZE" -lt 10 ]; then
144
+ echo "quality_score=0.5" >> $GITHUB_OUTPUT
145
+ echo "quality_reason=No code diff to assess" >> $GITHUB_OUTPUT
146
+ exit 0
147
+ fi
148
+
149
+ # Gather context (same as review workflow)
150
+ node -e "
151
+ const fs = require('fs');
152
+ const sections = [];
153
+ try { const c = JSON.parse(fs.readFileSync('.jfl/config.json','utf8')); sections.push('Project: ' + (c.name||'unknown') + ' - ' + (c.description||'')); } catch(e) {}
154
+ try { sections.push(fs.readFileSync('knowledge/ARCHITECTURE.md','utf8').substring(0,1500)); } catch(e) {}
155
+ try { sections.push(fs.readFileSync('knowledge/SERVICE_SPEC.md','utf8').substring(0,800)); } catch(e) {}
156
+ fs.writeFileSync('/tmp/eval-context.txt', sections.join('\n'));
157
+ "
158
+
159
+ # Build AI eval payload
160
+ node -e "
161
+ const fs = require('fs');
162
+ const diff = fs.readFileSync('/tmp/eval-diff.txt', 'utf8').substring(0, 8000);
163
+ const context = fs.readFileSync('/tmp/eval-context.txt', 'utf8').substring(0, 2000);
164
+ const title = process.env.PR_TITLE || '';
165
+
166
+ const testMetrics = {
167
+ baseline_pass_rate: ${{ steps.baseline.outputs.score }},
168
+ pr_pass_rate: ${{ steps.pr_eval.outputs.score }},
169
+ baseline_tests: ${{ steps.baseline.outputs.total }},
170
+ pr_tests: ${{ steps.pr_eval.outputs.total }},
171
+ tests_added: ${{ steps.delta.outputs.tests_added }},
172
+ };
173
+
174
+ const payload = {
175
+ model: 'gpt-4o-mini',
176
+ messages: [
177
+ {
178
+ role: 'system',
179
+ content: [
180
+ 'You are an AI eval scorer for a software project. Score this PR on overall quality improvement.',
181
+ '',
182
+ '--- PROJECT CONTEXT ---',
183
+ context,
184
+ '--- END CONTEXT ---',
185
+ '',
186
+ 'Test metrics: ' + JSON.stringify(testMetrics),
187
+ '',
188
+ 'Score the PR from 0.0 to 1.0 on these dimensions:',
189
+ '- correctness: Does the code work? Any bugs?',
190
+ '- coverage: Did it add meaningful tests?',
191
+ '- architecture: Does it follow project patterns?',
192
+ '- value: Does it add real value to the project?',
193
+ '',
194
+ 'Respond in this exact JSON format (no markdown):',
195
+ '{\"quality_score\": 0.0-1.0, \"dimensions\": {\"correctness\": 0.0-1.0, \"coverage\": 0.0-1.0, \"architecture\": 0.0-1.0, \"value\": 0.0-1.0}, \"reason\": \"1-2 sentence summary\"}',
196
+ ].join('\\n')
197
+ },
198
+ { role: 'user', content: 'PR: ' + title + '\\n\\n' + diff }
199
+ ],
200
+ max_tokens: 300,
201
+ temperature: 0.1
202
+ };
203
+
204
+ fs.writeFileSync('/tmp/eval-payload.json', JSON.stringify(payload));
205
+ "
206
+
207
+ # Try OpenAI
208
+ RESULT=""
209
+ if [ -n "$OPENAI_API_KEY" ]; then
210
+ RESPONSE=$(curl -s -X POST https://api.openai.com/v1/chat/completions \
211
+ -H "Content-Type: application/json" \
212
+ -H "Authorization: Bearer $OPENAI_API_KEY" \
213
+ -d @/tmp/eval-payload.json 2>/dev/null)
214
+ RESULT=$(echo "$RESPONSE" | node -e "
215
+ let d=''; process.stdin.on('data',c=>d+=c); process.stdin.on('end',()=>{
216
+ try { console.log(JSON.parse(d).choices[0].message.content); }
217
+ catch(e) { console.log(''); }
218
+ })
219
+ ")
220
+ fi
221
+
222
+ # Fallback to OpenRouter
223
+ if [ -z "$RESULT" ] && [ -n "$OPENROUTER_API_KEY" ]; then
224
+ node -e "
225
+ const fs = require('fs');
226
+ const p = JSON.parse(fs.readFileSync('/tmp/eval-payload.json', 'utf8'));
227
+ p.model = 'anthropic/claude-sonnet-4';
228
+ fs.writeFileSync('/tmp/eval-payload-or.json', JSON.stringify(p));
229
+ "
230
+ RESPONSE=$(curl -s -X POST https://openrouter.ai/api/v1/chat/completions \
231
+ -H "Content-Type: application/json" \
232
+ -H "Authorization: Bearer $OPENROUTER_API_KEY" \
233
+ -d @/tmp/eval-payload-or.json 2>/dev/null)
234
+ RESULT=$(echo "$RESPONSE" | node -e "
235
+ let d=''; process.stdin.on('data',c=>d+=c); process.stdin.on('end',()=>{
236
+ try { console.log(JSON.parse(d).choices[0].message.content); }
237
+ catch(e) { console.log(''); }
238
+ })
239
+ ")
240
+ fi
241
+
242
+ # Parse AI eval result
243
+ node -e "
244
+ const raw = \`$RESULT\` || '{}';
245
+ try {
246
+ const parsed = JSON.parse(raw.match(/\{[\s\S]*\}/)?.[0] || '{}');
247
+ const score = Math.max(0, Math.min(1, parsed.quality_score || 0.5));
248
+ const reason = (parsed.reason || 'AI eval unavailable').replace(/\n/g, ' ');
249
+ const dims = parsed.dimensions || {};
250
+ console.log('quality_score=' + score.toFixed(2));
251
+ console.log('quality_reason=' + reason);
252
+ console.log('dim_correctness=' + (dims.correctness || 0.5).toFixed(2));
253
+ console.log('dim_coverage=' + (dims.coverage || 0.5).toFixed(2));
254
+ console.log('dim_architecture=' + (dims.architecture || 0.5).toFixed(2));
255
+ console.log('dim_value=' + (dims.value || 0.5).toFixed(2));
256
+ } catch(e) {
257
+ console.log('quality_score=0.50');
258
+ console.log('quality_reason=AI eval parse error');
259
+ console.log('dim_correctness=0.50');
260
+ console.log('dim_coverage=0.50');
261
+ console.log('dim_architecture=0.50');
262
+ console.log('dim_value=0.50');
263
+ }
264
+ " >> $GITHUB_OUTPUT
265
+
266
+ - name: Commit eval entry to PR branch
267
+ run: |
268
+ git config user.name "github-actions[bot]"
269
+ git config user.email "github-actions[bot]@users.noreply.github.com"
270
+
271
+ # Checkout the actual PR branch (not detached HEAD)
272
+ git checkout ${{ github.head_ref }}
273
+
274
+ # Write eval entry
275
+ mkdir -p .jfl/eval
276
+ cat >> .jfl/eval/eval.jsonl << EOF
277
+ {"v":1,"ts":"$(date -u +%Y-%m-%dT%H:%M:%SZ)","agent":"peter-parker","dataset":"ci-eval","metrics":{"test_pass_rate":${{ steps.pr_eval.outputs.score }},"tests_passed":${{ steps.pr_eval.outputs.passing }},"tests_total":${{ steps.pr_eval.outputs.total }}},"composite":${{ steps.pr_eval.outputs.score }},"delta":{"composite":${{ steps.delta.outputs.delta }}},"model_version":"pp-run","improved":${{ steps.delta.outputs.improved }},"pr_number":${{ github.event.pull_request.number }},"branch":"${{ github.head_ref }}"}
278
+ EOF
279
+
280
+ # Write eval:scored event for hub file watcher
281
+ cat >> .jfl/service-events.jsonl << EOF
282
+ {"ts":"$(date -u +%Y-%m-%dT%H:%M:%S.000Z)","type":"eval:scored","source":"ci","data":{"agent":"peter-parker","composite":${{ steps.pr_eval.outputs.score }},"baseline":${{ steps.baseline.outputs.score }},"delta":${{ steps.delta.outputs.delta }},"improved":"${{ steps.delta.outputs.improved }}","pr_number":${{ github.event.pull_request.number }},"branch":"${{ github.head_ref }}","run_url":"${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}","quality_score":${{ steps.ai_eval.outputs.quality_score }},"dim_correctness":${{ steps.ai_eval.outputs.dim_correctness }},"dim_coverage":${{ steps.ai_eval.outputs.dim_coverage }},"dim_architecture":${{ steps.ai_eval.outputs.dim_architecture }},"dim_value":${{ steps.ai_eval.outputs.dim_value }}}}
283
+ EOF
284
+
285
+ git add .jfl/eval/eval.jsonl .jfl/service-events.jsonl
286
+ git commit -m "eval: scored PR #${{ github.event.pull_request.number }} (composite=${{ steps.pr_eval.outputs.score }}, delta=${{ steps.delta.outputs.delta }})"
287
+ git push origin ${{ github.head_ref }}
288
+
289
+ - name: Post eval:scored event to hub (best-effort)
290
+ if: env.JFL_HUB_URL != ''
291
+ continue-on-error: true
292
+ run: |
293
+ curl -sf -X POST "${JFL_HUB_URL}/api/events" \
294
+ -H "Content-Type: application/json" \
295
+ -H "Authorization: Bearer ${JFL_HUB_TOKEN}" \
296
+ -d '{
297
+ "type": "eval:scored",
298
+ "source": "github-actions",
299
+ "data": {
300
+ "pr_number": ${{ github.event.pull_request.number }},
301
+ "pr_url": "${{ github.event.pull_request.html_url }}",
302
+ "branch": "${{ github.head_ref }}",
303
+ "commit_sha": "${{ github.sha }}",
304
+ "agent": "peter-parker",
305
+ "metrics": {
306
+ "test_pass_rate": ${{ steps.pr_eval.outputs.score }},
307
+ "test_count": ${{ steps.pr_eval.outputs.total }},
308
+ "tests_passing": ${{ steps.pr_eval.outputs.passing }}
309
+ },
310
+ "baseline": ${{ steps.baseline.outputs.score }},
311
+ "baseline_total": ${{ steps.baseline.outputs.total }},
312
+ "baseline_passing": ${{ steps.baseline.outputs.passing }},
313
+ "composite": ${{ steps.pr_eval.outputs.score }},
314
+ "delta": ${{ steps.delta.outputs.delta }},
315
+ "tests_added": ${{ steps.delta.outputs.tests_added }},
316
+ "tests_passing_added": ${{ steps.delta.outputs.tests_passing_added }},
317
+ "improved": ${{ steps.delta.outputs.improved }},
318
+ "run_id": "${{ github.run_id }}",
319
+ "run_url": "${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
320
+ }
321
+ }' || echo "Warning: could not post to hub (hub may not be running)"
322
+
323
+ - name: Comment on PR
324
+ uses: actions/github-script@v7
325
+ with:
326
+ script: |
327
+ const delta = parseFloat('${{ steps.delta.outputs.delta }}');
328
+ const improved = '${{ steps.delta.outputs.improved }}' === 'true';
329
+ const baseline = '${{ steps.baseline.outputs.score }}';
330
+ const prScore = '${{ steps.pr_eval.outputs.score }}';
331
+ const baselineTotal = parseInt('${{ steps.baseline.outputs.total }}');
332
+ const prTotal = parseInt('${{ steps.pr_eval.outputs.total }}');
333
+ const baselinePassing = parseInt('${{ steps.baseline.outputs.passing }}');
334
+ const prPassing = parseInt('${{ steps.pr_eval.outputs.passing }}');
335
+ const testsAdded = parseInt('${{ steps.delta.outputs.tests_added }}');
336
+ const testsPassingAdded = parseInt('${{ steps.delta.outputs.tests_passing_added }}');
337
+ const runUrl = '${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}';
338
+
339
+ const emoji = improved ? ':green_circle:' : (delta < 0 ? ':red_circle:' : ':yellow_circle:');
340
+ let verdict = 'UNCHANGED';
341
+ if (delta > 0) verdict = 'IMPROVED';
342
+ else if (delta < 0) verdict = 'REGRESSED';
343
+ else if (testsPassingAdded > 0) verdict = 'IMPROVED (more tests)';
344
+
345
+ const testsDeltaStr = testsAdded > 0 ? `+${testsAdded}` : `${testsAdded}`;
346
+ const qualityScore = parseFloat('${{ steps.ai_eval.outputs.quality_score }}') || 0.5;
347
+ const qualityReason = '${{ steps.ai_eval.outputs.quality_reason }}' || 'N/A';
348
+ const dimCorrectness = '${{ steps.ai_eval.outputs.dim_correctness }}' || '—';
349
+ const dimCoverage = '${{ steps.ai_eval.outputs.dim_coverage }}' || '—';
350
+ const dimArchitecture = '${{ steps.ai_eval.outputs.dim_architecture }}' || '—';
351
+ const dimValue = '${{ steps.ai_eval.outputs.dim_value }}' || '—';
352
+
353
+ const body = [
354
+ `## JFL Eval Results ${emoji}`,
355
+ '',
356
+ '| Metric | Baseline (main) | PR | Delta |',
357
+ '|--------|-----------------|-----|-------|',
358
+ `| test_pass_rate | ${baseline} (${baselinePassing}/${baselineTotal}) | ${prScore} (${prPassing}/${prTotal}) | ${delta > 0 ? '+' : ''}${delta.toFixed(4)} |`,
359
+ `| test_count | ${baselineTotal} | ${prTotal} | ${testsDeltaStr} |`,
360
+ `| tests_passing | ${baselinePassing} | ${prPassing} | ${testsPassingAdded > 0 ? '+' : ''}${testsPassingAdded} |`,
361
+ '',
362
+ `### AI Quality Score: ${qualityScore.toFixed(2)}`,
363
+ '',
364
+ '| Dimension | Score |',
365
+ '|-----------|-------|',
366
+ `| Correctness | ${dimCorrectness} |`,
367
+ `| Coverage | ${dimCoverage} |`,
368
+ `| Architecture | ${dimArchitecture} |`,
369
+ `| Value | ${dimValue} |`,
370
+ '',
371
+ `> ${qualityReason}`,
372
+ '',
373
+ `**Verdict: ${verdict}**`,
374
+ '',
375
+ improved
376
+ ? 'Auto-merge eligible via JFL flow.'
377
+ : (delta < 0
378
+ ? 'Regression detected. Manual review required.'
379
+ : 'No improvement detected. Manual review recommended.'),
380
+ '',
381
+ '---',
382
+ `[View run](${runUrl}) | *Evaluated by JFL self-driving loop*`,
383
+ ].join('\n');
384
+
385
+ const { data: comments } = await github.rest.issues.listComments({
386
+ owner: context.repo.owner,
387
+ repo: context.repo.repo,
388
+ issue_number: context.issue.number,
389
+ });
390
+
391
+ const existing = comments.find(c =>
392
+ c.user.type === 'Bot' && c.body.includes('JFL Eval Results')
393
+ );
394
+
395
+ if (existing) {
396
+ await github.rest.issues.updateComment({
397
+ owner: context.repo.owner,
398
+ repo: context.repo.repo,
399
+ comment_id: existing.id,
400
+ body,
401
+ });
402
+ } else {
403
+ await github.rest.issues.createComment({
404
+ owner: context.repo.owner,
405
+ repo: context.repo.repo,
406
+ issue_number: context.issue.number,
407
+ body,
408
+ });
409
+ }
410
+
411
+ - name: Auto-merge or flag regression
412
+ if: always()
413
+ env:
414
+ GH_TOKEN: ${{ github.token }}
415
+ run: |
416
+ PR_NUMBER=${{ github.event.pull_request.number }}
417
+ IMPROVED="${{ steps.delta.outputs.improved }}"
418
+
419
+ if [ "$IMPROVED" = "true" ]; then
420
+ # Check if AI review requested changes
421
+ REVIEW_STATE=$(gh pr view $PR_NUMBER --json reviews --jq '[.reviews[] | select(.author.login == "github-actions")] | last | .state // "NONE"' 2>/dev/null || echo "NONE")
422
+
423
+ if [ "$REVIEW_STATE" = "CHANGES_REQUESTED" ]; then
424
+ echo "Eval passed but AI review has blockers — holding merge"
425
+ gh pr comment $PR_NUMBER --body "**JFL Eval passed** (test_pass_rate improved) but **AI review found blockers**. Address red findings, then eval will re-run and auto-merge."
426
+ else
427
+ echo "Eval passed, no review blockers — auto-merging PR #$PR_NUMBER"
428
+ gh pr merge $PR_NUMBER --merge --delete-branch \
429
+ --body "Auto-merged by JFL eval: test_pass_rate improved by ${{ steps.delta.outputs.delta }} (${{ steps.baseline.outputs.score }} → ${{ steps.pr_eval.outputs.score }})"
430
+ fi
431
+ else
432
+ echo "Eval regression — requesting changes on PR #$PR_NUMBER"
433
+ gh pr review $PR_NUMBER --request-changes \
434
+ --body "JFL eval regression: test_pass_rate dropped by ${{ steps.delta.outputs.delta }} (${{ steps.baseline.outputs.score }} → ${{ steps.pr_eval.outputs.score }}). Run: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
435
+ fi
436
+
437
+ - name: Eval Summary
438
+ if: always()
439
+ run: |
440
+ echo "### JFL Eval Results" >> $GITHUB_STEP_SUMMARY
441
+ echo "" >> $GITHUB_STEP_SUMMARY
442
+ echo "| Metric | Baseline | PR | Delta |" >> $GITHUB_STEP_SUMMARY
443
+ echo "|--------|----------|-----|-------|" >> $GITHUB_STEP_SUMMARY
444
+ echo "| test_pass_rate | ${{ steps.baseline.outputs.score }} | ${{ steps.pr_eval.outputs.score }} | ${{ steps.delta.outputs.delta }} |" >> $GITHUB_STEP_SUMMARY
445
+ echo "| test_count | ${{ steps.baseline.outputs.total }} | ${{ steps.pr_eval.outputs.total }} | ${{ steps.delta.outputs.tests_added }} |" >> $GITHUB_STEP_SUMMARY
446
+ echo "| tests_passing | ${{ steps.baseline.outputs.passing }} | ${{ steps.pr_eval.outputs.passing }} | ${{ steps.delta.outputs.tests_passing_added }} |" >> $GITHUB_STEP_SUMMARY
447
+ echo "" >> $GITHUB_STEP_SUMMARY
448
+ echo "Improved: ${{ steps.delta.outputs.improved }}" >> $GITHUB_STEP_SUMMARY