jfl 0.4.2 → 0.4.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +213 -34
- package/dist/commands/ci-setup.d.ts +5 -0
- package/dist/commands/ci-setup.d.ts.map +1 -0
- package/dist/commands/ci-setup.js +82 -0
- package/dist/commands/ci-setup.js.map +1 -0
- package/dist/commands/peter.d.ts +4 -1
- package/dist/commands/peter.d.ts.map +1 -1
- package/dist/commands/peter.js +491 -1
- package/dist/commands/peter.js.map +1 -1
- package/dist/commands/update.d.ts.map +1 -1
- package/dist/commands/update.js +58 -4
- package/dist/commands/update.js.map +1 -1
- package/dist/index.js +18 -2
- package/dist/index.js.map +1 -1
- package/dist/lib/flow-engine.d.ts +2 -0
- package/dist/lib/flow-engine.d.ts.map +1 -1
- package/dist/lib/flow-engine.js +40 -0
- package/dist/lib/flow-engine.js.map +1 -1
- package/dist/types/map.d.ts +1 -1
- package/dist/types/map.d.ts.map +1 -1
- package/dist/types/map.js.map +1 -1
- package/package.json +1 -1
- package/template/.github/workflows/jfl-eval.yml +540 -0
- package/template/.github/workflows/jfl-review.yml +371 -0
- package/template/.jfl/{flows-self-driving.yaml → flows/self-driving.yaml} +20 -0
package/dist/types/map.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"map.d.ts","sourceRoot":"","sources":["../../src/types/map.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAEH,MAAM,MAAM,YAAY,GACpB,iBAAiB,GAAG,eAAe,GACnC,cAAc,GAAG,gBAAgB,GAAG,aAAa,GACjD,iBAAiB,GAAG,mBAAmB,GACvC,eAAe,GAAG,qBAAqB,GAAG,sBAAsB,GAAG,oBAAoB,GACvF,cAAc,GACd,eAAe,GAAG,eAAe,GACjC,iBAAiB,GAAG,kBAAkB,GACtC,oBAAoB,GAAG,kBAAkB,GACzC,eAAe,GAAG,kBAAkB,GACpC,WAAW,GAAG,kBAAkB,GAChC,qBAAqB,GACrB,qBAAqB,GAAG,oBAAoB,GAC5C,gBAAgB,GAAG,gBAAgB,GACnC,eAAe,GAAG,eAAe,GAAG,aAAa,GAAG,cAAc,GAClE,uBAAuB,GAAG,0BAA0B,GAAG,gBAAgB,GACvE,gBAAgB,GAAG,aAAa,GAAG,eAAe,GAAG,YAAY,GACjE,oBAAoB,GAAG,0BAA0B,GACjD,mBAAmB,GAAG,wBAAwB,GAC9C,YAAY,GAAG,iBAAiB,GAChC,QAAQ,CAAA;AAEZ,MAAM,WAAW,WAAW;IAC1B,eAAe,EAAE,MAAM,CAAA;IACvB,UAAU,CAAC,EAAE,MAAM,CAAA;IACnB,SAAS,CAAC,EAAE,MAAM,CAAA;IAClB,UAAU,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAA;IACpC,WAAW,CAAC,EAAE,MAAM,CAAA;IACpB,UAAU,CAAC,EAAE,MAAM,EAAE,CAAA;IACrB,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAA;CACvB;AAED,MAAM,WAAW,QAAQ;IACvB,EAAE,EAAE,MAAM,CAAA;IACV,EAAE,EAAE,MAAM,CAAA;IACV,IAAI,EAAE,YAAY,CAAA;IAClB,MAAM,EAAE,MAAM,CAAA;IACd,MAAM,CAAC,EAAE,MAAM,CAAA;IACf,OAAO,CAAC,EAAE,MAAM,CAAA;IAChB,IAAI,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAA;IAC7B,GAAG,CAAC,EAAE,MAAM,CAAA;CACb;AAED,MAAM,WAAW,eAAe;IAC9B,EAAE,EAAE,MAAM,CAAA;IACV,QAAQ,EAAE,MAAM,CAAA;IAChB,QAAQ,EAAE,MAAM,EAAE,CAAA;IAClB,SAAS,EAAE,KAAK,GAAG,WAAW,GAAG,MAAM,CAAA;IACvC,SAAS,EAAE,MAAM,CAAA;IACjB,WAAW,CAAC,EAAE,MAAM,CAAA;CACrB;AAED,MAAM,MAAM,SAAS,GAAG,OAAO,GAAG,QAAQ,GAAG,MAAM,CAAA;AAEnD,MAAM,MAAM,SAAS,GAAG,OAAO,GAAG,SAAS,GAAG,SAAS,GAAG,UAAU,GAAG,QAAQ,CAAA;AAE/E,MAAM,MAAM,WAAW,GAAG,gBAAgB,GAAG,UAAU,GAAG,eAAe,CAAA;AAEzE,eAAO,MAAM,mBAAmB,EAAE,MAAM,CAAC,WAAW,EAAE,MAAM,CAAC,SAAS,EAAE,SAAS,CAAC,CAsBjF,CAAA;AAED,eAAO,MAAM,gBAAgB,EAAE,MAAM,CAAC,SAAS,EAAE,SAAS,CAMzD,CAAA"}
|
|
1
|
+
{"version":3,"file":"map.d.ts","sourceRoot":"","sources":["../../src/types/map.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAEH,MAAM,MAAM,YAAY,GACpB,iBAAiB,GAAG,eAAe,GACnC,cAAc,GAAG,gBAAgB,GAAG,aAAa,GACjD,iBAAiB,GAAG,mBAAmB,GACvC,eAAe,GAAG,qBAAqB,GAAG,sBAAsB,GAAG,oBAAoB,GACvF,cAAc,GACd,eAAe,GAAG,eAAe,GACjC,iBAAiB,GAAG,kBAAkB,GACtC,oBAAoB,GAAG,kBAAkB,GACzC,eAAe,GAAG,kBAAkB,GACpC,WAAW,GAAG,kBAAkB,GAChC,qBAAqB,GACrB,qBAAqB,GAAG,oBAAoB,GAC5C,gBAAgB,GAAG,gBAAgB,GACnC,eAAe,GAAG,eAAe,GAAG,aAAa,GAAG,cAAc,GAClE,uBAAuB,GAAG,0BAA0B,GAAG,gBAAgB,GACvE,gBAAgB,GAAG,aAAa,GAAG,eAAe,GAAG,YAAY,GACjE,oBAAoB,GAAG,0BAA0B,GACjD,mBAAmB,GAAG,wBAAwB,GAC9C,YAAY,GAAG,iBAAiB,GAChC,YAAY,GAAG,aAAa,GAAG,uBAAuB,GACtD,QAAQ,CAAA;AAEZ,MAAM,WAAW,WAAW;IAC1B,eAAe,EAAE,MAAM,CAAA;IACvB,UAAU,CAAC,EAAE,MAAM,CAAA;IACnB,SAAS,CAAC,EAAE,MAAM,CAAA;IAClB,UAAU,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAA;IACpC,WAAW,CAAC,EAAE,MAAM,CAAA;IACpB,UAAU,CAAC,EAAE,MAAM,EAAE,CAAA;IACrB,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAA;CACvB;AAED,MAAM,WAAW,QAAQ;IACvB,EAAE,EAAE,MAAM,CAAA;IACV,EAAE,EAAE,MAAM,CAAA;IACV,IAAI,EAAE,YAAY,CAAA;IAClB,MAAM,EAAE,MAAM,CAAA;IACd,MAAM,CAAC,EAAE,MAAM,CAAA;IACf,OAAO,CAAC,EAAE,MAAM,CAAA;IAChB,IAAI,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAA;IAC7B,GAAG,CAAC,EAAE,MAAM,CAAA;CACb;AAED,MAAM,WAAW,eAAe;IAC9B,EAAE,EAAE,MAAM,CAAA;IACV,QAAQ,EAAE,MAAM,CAAA;IAChB,QAAQ,EAAE,MAAM,EAAE,CAAA;IAClB,SAAS,EAAE,KAAK,GAAG,WAAW,GAAG,MAAM,CAAA;IACvC,SAAS,EAAE,MAAM,CAAA;IACjB,WAAW,CAAC,EAAE,MAAM,CAAA;CACrB;AAED,MAAM,MAAM,SAAS,GAAG,OAAO,GAAG,QAAQ,GAAG,MAAM,CAAA;AAEnD,MAAM,MAAM,SAAS,GAAG,OAAO,GAAG,SAAS,GAAG,SAAS,GAAG,UAAU,GAAG,QAAQ,CAAA;AAE/E,MAAM,MAAM,WAAW,GAAG,gBAAgB,GAAG,UAAU,GAAG,eAAe,CAAA;AAEzE,eAAO,MAAM,mBAAmB,EAAE,MAAM,CAAC,WAAW,EAAE,MAAM,CAAC,SAAS,EAAE,SAAS,CAAC,CAsBjF,CAAA;AAED,eAAO,MAAM,gBAAgB,EAAE,MAAM,CAAC,SAAS,EAAE,SAAS,CAMzD,CAAA"}
|
package/dist/types/map.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"map.js","sourceRoot":"","sources":["../../src/types/map.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;
|
|
1
|
+
{"version":3,"file":"map.js","sourceRoot":"","sources":["../../src/types/map.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AA6DH,MAAM,CAAC,MAAM,mBAAmB,GAAsD;IACpF,gBAAgB,EAAE;QAChB,KAAK,EAAE,OAAO;QACd,OAAO,EAAE,QAAQ;QACjB,OAAO,EAAE,QAAQ;QACjB,QAAQ,EAAE,QAAQ;QAClB,MAAM,EAAE,OAAO;KAChB;IACD,UAAU,EAAE;QACV,KAAK,EAAE,OAAO;QACd,OAAO,EAAE,QAAQ;QACjB,OAAO,EAAE,QAAQ;QACjB,QAAQ,EAAE,MAAM;QAChB,MAAM,EAAE,QAAQ;KACjB;IACD,eAAe,EAAE;QACf,KAAK,EAAE,QAAQ;QACf,OAAO,EAAE,MAAM;QACf,OAAO,EAAE,QAAQ;QACjB,QAAQ,EAAE,MAAM;QAChB,MAAM,EAAE,QAAQ;KACjB;CACF,CAAA;AAED,MAAM,CAAC,MAAM,gBAAgB,GAAiC;IAC5D,KAAK,EAAE,QAAQ;IACf,OAAO,EAAE,MAAM;IACf,OAAO,EAAE,MAAM;IACf,QAAQ,EAAE,MAAM;IAChB,MAAM,EAAE,QAAQ;CACjB,CAAA"}
|
package/package.json
CHANGED
|
@@ -0,0 +1,540 @@
|
|
|
1
|
+
name: JFL Eval Suite
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
pull_request:
|
|
5
|
+
types: [opened, synchronize]
|
|
6
|
+
branches: [main]
|
|
7
|
+
|
|
8
|
+
permissions:
|
|
9
|
+
contents: write
|
|
10
|
+
pull-requests: write
|
|
11
|
+
|
|
12
|
+
jobs:
|
|
13
|
+
eval:
|
|
14
|
+
name: Run JFL Eval
|
|
15
|
+
runs-on: ubuntu-latest
|
|
16
|
+
timeout-minutes: 15
|
|
17
|
+
if: startsWith(github.head_ref, 'pp/') || contains(github.event.pull_request.labels.*.name, 'run-eval')
|
|
18
|
+
|
|
19
|
+
env:
|
|
20
|
+
JFL_HUB_URL: ${{ secrets.JFL_HUB_URL }}
|
|
21
|
+
JFL_HUB_TOKEN: ${{ secrets.JFL_HUB_TOKEN }}
|
|
22
|
+
JFL_TELEMETRY: 'off'
|
|
23
|
+
NODE_OPTIONS: '--experimental-vm-modules'
|
|
24
|
+
|
|
25
|
+
steps:
|
|
26
|
+
- uses: actions/checkout@v4
|
|
27
|
+
with:
|
|
28
|
+
fetch-depth: 0
|
|
29
|
+
|
|
30
|
+
- uses: actions/setup-node@v4
|
|
31
|
+
with:
|
|
32
|
+
node-version: '22'
|
|
33
|
+
cache: 'npm'
|
|
34
|
+
|
|
35
|
+
- name: Install dependencies
|
|
36
|
+
run: npm ci
|
|
37
|
+
|
|
38
|
+
- name: Build
|
|
39
|
+
run: npx tsc --build --noEmit
|
|
40
|
+
|
|
41
|
+
- name: Predict eval delta (Stratus, optional)
|
|
42
|
+
id: predict
|
|
43
|
+
if: env.STRATUS_API_KEY != ''
|
|
44
|
+
continue-on-error: true
|
|
45
|
+
env:
|
|
46
|
+
STRATUS_API_KEY: ${{ secrets.STRATUS_API_KEY }}
|
|
47
|
+
PR_TITLE: ${{ github.event.pull_request.title }}
|
|
48
|
+
run: |
|
|
49
|
+
DIFF_SUMMARY=$(git diff origin/main...HEAD --stat | tail -5)
|
|
50
|
+
CURRENT_SCORE=$(cat .jfl/eval.jsonl 2>/dev/null | tail -1 | node -e "let d='';process.stdin.on('data',c=>d+=c);process.stdin.on('end',()=>{try{console.log(JSON.parse(d).composite||0)}catch{console.log(0)}})" 2>/dev/null || echo "0")
|
|
51
|
+
|
|
52
|
+
RESULT=$(DIFF_SUMMARY="$DIFF_SUMMARY" CURRENT_SCORE="$CURRENT_SCORE" node -e "
|
|
53
|
+
const { Predictor } = require('./dist/lib/predictor.js'); // all inputs below come from environment variables, not shell interpolation
|
|
54
|
+
const p = new Predictor();
|
|
55
|
+
p.predict({
|
|
56
|
+
proposal: { description: (process.env.PR_TITLE || '') + ' — ' + (process.env.DIFF_SUMMARY || ''), change_type: 'fix', scope: 'medium' }, // DIFF_SUMMARY is multi-line, which is illegal inside a quoted JS literal
|
|
57
|
+
current_score: Number(process.env.CURRENT_SCORE) || 0, // coerce so a non-numeric composite cannot inject code
|
|
58
|
+
goal: 'test_pass_rate >= 1.0',
|
|
59
|
+
recent_trajectory: [],
|
|
60
|
+
}).then(r => {
|
|
61
|
+
console.log(JSON.stringify({id: r.prediction_id, delta: r.predicted_delta, confidence: r.confidence, recommendation: r.recommendation}));
|
|
62
|
+
}).catch(e => {
|
|
63
|
+
console.error('Prediction failed:', e.message);
|
|
64
|
+
console.log(JSON.stringify({id: '', delta: 0, confidence: 0, recommendation: 'revise'}));
|
|
65
|
+
});
|
|
66
|
+
" 2>/dev/null || echo '{"id":"","delta":0,"confidence":0,"recommendation":"revise"}')
|
|
67
|
+
|
|
68
|
+
echo "prediction_id=$(echo "$RESULT" | node -e "let d='';process.stdin.on('data',c=>d+=c);process.stdin.on('end',()=>{try{console.log(JSON.parse(d).id)}catch{console.log('')}})")" >> $GITHUB_OUTPUT
|
|
69
|
+
echo "predicted_delta=$(echo "$RESULT" | node -e "let d='';process.stdin.on('data',c=>d+=c);process.stdin.on('end',()=>{try{console.log(JSON.parse(d).delta)}catch{console.log(0)}})")" >> $GITHUB_OUTPUT
|
|
70
|
+
echo "confidence=$(echo "$RESULT" | node -e "let d='';process.stdin.on('data',c=>d+=c);process.stdin.on('end',()=>{try{console.log(JSON.parse(d).confidence)}catch{console.log(0)}})")" >> $GITHUB_OUTPUT
|
|
71
|
+
echo "recommendation=$(echo "$RESULT" | node -e "let d='';process.stdin.on('data',c=>d+=c);process.stdin.on('end',()=>{try{console.log(JSON.parse(d).recommendation)}catch{console.log('revise')}})")" >> $GITHUB_OUTPUT
|
|
72
|
+
|
|
73
|
+
- name: Run tests (baseline from main)
|
|
74
|
+
id: baseline
|
|
75
|
+
run: |
|
|
76
|
+
# Clean switch to main — remove PR-only files, restore main state
|
|
77
|
+
PR_SHA=$(git rev-parse HEAD)
|
|
78
|
+
git checkout origin/main --force
|
|
79
|
+
git clean -fd
|
|
80
|
+
npm ci --ignore-scripts 2>/dev/null || true
|
|
81
|
+
|
|
82
|
+
RESULT=$(npx jest --json --silent 2>/dev/null || true)  # jest still prints its JSON report when tests fail (non-zero exit); appending '{}' would make it unparseable
|
|
83
|
+
PASSING=$(echo "$RESULT" | node -e "
|
|
84
|
+
let d=''; process.stdin.on('data',c=>d+=c); process.stdin.on('end',()=>{
|
|
85
|
+
try { const j=JSON.parse(d); console.log(j.numPassedTests||0); }
|
|
86
|
+
catch(e) { console.log(0); }
|
|
87
|
+
})
|
|
88
|
+
")
|
|
89
|
+
TOTAL=$(echo "$RESULT" | node -e "
|
|
90
|
+
let d=''; process.stdin.on('data',c=>d+=c); process.stdin.on('end',()=>{
|
|
91
|
+
try { const j=JSON.parse(d); console.log(j.numTotalTests||1); }
|
|
92
|
+
catch(e) { console.log(1); }
|
|
93
|
+
})
|
|
94
|
+
")
|
|
95
|
+
SCORE=$(node -e "console.log(($TOTAL>0?$PASSING/$TOTAL:0).toFixed(4))")
|
|
96
|
+
echo "score=$SCORE" >> $GITHUB_OUTPUT
|
|
97
|
+
echo "passing=$PASSING" >> $GITHUB_OUTPUT
|
|
98
|
+
echo "total=$TOTAL" >> $GITHUB_OUTPUT
|
|
99
|
+
echo "Baseline: $SCORE ($PASSING/$TOTAL)"
|
|
100
|
+
|
|
101
|
+
# Switch back to PR branch
|
|
102
|
+
git checkout $PR_SHA --force
|
|
103
|
+
git clean -fd
|
|
104
|
+
|
|
105
|
+
- name: Run tests (PR branch)
|
|
106
|
+
id: pr_eval
|
|
107
|
+
run: |
|
|
108
|
+
npm ci --ignore-scripts 2>/dev/null || true
|
|
109
|
+
|
|
110
|
+
RESULT=$(npx jest --json --silent 2>/dev/null || true)  # jest still prints its JSON report when tests fail (non-zero exit); appending '{}' would make it unparseable
|
|
111
|
+
PASSING=$(echo "$RESULT" | node -e "
|
|
112
|
+
let d=''; process.stdin.on('data',c=>d+=c); process.stdin.on('end',()=>{
|
|
113
|
+
try { const j=JSON.parse(d); console.log(j.numPassedTests||0); }
|
|
114
|
+
catch(e) { console.log(0); }
|
|
115
|
+
})
|
|
116
|
+
")
|
|
117
|
+
TOTAL=$(echo "$RESULT" | node -e "
|
|
118
|
+
let d=''; process.stdin.on('data',c=>d+=c); process.stdin.on('end',()=>{
|
|
119
|
+
try { const j=JSON.parse(d); console.log(j.numTotalTests||1); }
|
|
120
|
+
catch(e) { console.log(1); }
|
|
121
|
+
})
|
|
122
|
+
")
|
|
123
|
+
SCORE=$(node -e "console.log(($TOTAL>0?$PASSING/$TOTAL:0).toFixed(4))")
|
|
124
|
+
echo "score=$SCORE" >> $GITHUB_OUTPUT
|
|
125
|
+
echo "passing=$PASSING" >> $GITHUB_OUTPUT
|
|
126
|
+
echo "total=$TOTAL" >> $GITHUB_OUTPUT
|
|
127
|
+
echo "PR score: $SCORE ($PASSING/$TOTAL)"
|
|
128
|
+
|
|
129
|
+
- name: Compute delta
|
|
130
|
+
id: delta
|
|
131
|
+
run: |
|
|
132
|
+
BASELINE=${{ steps.baseline.outputs.score }}
|
|
133
|
+
PR_SCORE=${{ steps.pr_eval.outputs.score }}
|
|
134
|
+
BASELINE_TOTAL=${{ steps.baseline.outputs.total }}
|
|
135
|
+
PR_TOTAL=${{ steps.pr_eval.outputs.total }}
|
|
136
|
+
BASELINE_PASSING=${{ steps.baseline.outputs.passing }}
|
|
137
|
+
PR_PASSING=${{ steps.pr_eval.outputs.passing }}
|
|
138
|
+
|
|
139
|
+
# Pass rate delta
|
|
140
|
+
DELTA=$(node -e "console.log(($PR_SCORE - $BASELINE).toFixed(4))")
|
|
141
|
+
|
|
142
|
+
# Test count delta
|
|
143
|
+
TESTS_ADDED=$(node -e "console.log($PR_TOTAL - $BASELINE_TOTAL)")
|
|
144
|
+
TESTS_PASSING_ADDED=$(node -e "console.log($PR_PASSING - $BASELINE_PASSING)")
|
|
145
|
+
|
|
146
|
+
# Improved = pass_rate up OR (pass_rate not regressed AND more passing tests than baseline)
|
|
147
|
+
IMPROVED=$(node -e "
|
|
148
|
+
const prScore = $PR_SCORE;
|
|
149
|
+
const baseline = $BASELINE;
|
|
150
|
+
const testsAdded = $PR_PASSING - $BASELINE_PASSING;
|
|
151
|
+
const passRateUp = prScore > baseline;
|
|
152
|
+
const passRateMaintained = prScore >= 1.0 && baseline >= 1.0;
|
|
153
|
+
const moreTests = testsAdded > 0;
|
|
154
|
+
const noRegression = prScore >= baseline;
|
|
155
|
+
console.log(passRateUp || (passRateMaintained && moreTests) || (noRegression && moreTests));
|
|
156
|
+
")
|
|
157
|
+
|
|
158
|
+
echo "delta=$DELTA" >> $GITHUB_OUTPUT
|
|
159
|
+
echo "improved=$IMPROVED" >> $GITHUB_OUTPUT
|
|
160
|
+
echo "tests_added=$TESTS_ADDED" >> $GITHUB_OUTPUT
|
|
161
|
+
echo "tests_passing_added=$TESTS_PASSING_ADDED" >> $GITHUB_OUTPUT
|
|
162
|
+
echo "Delta: $DELTA, tests_added=$TESTS_ADDED (improved=$IMPROVED)"
|
|
163
|
+
|
|
164
|
+
- name: Resolve prediction (Stratus, optional)
|
|
165
|
+
if: steps.predict.outputs.prediction_id != ''
|
|
166
|
+
continue-on-error: true
|
|
167
|
+
env:
|
|
168
|
+
STRATUS_API_KEY: ${{ secrets.STRATUS_API_KEY }}
|
|
169
|
+
run: |
|
|
170
|
+
PRED_ID="${{ steps.predict.outputs.prediction_id }}"
|
|
171
|
+
DELTA=${{ steps.delta.outputs.delta }}
|
|
172
|
+
PR_SCORE=${{ steps.pr_eval.outputs.score }}
|
|
173
|
+
|
|
174
|
+
node -e "
|
|
175
|
+
const { Predictor } = require('./dist/lib/predictor.js');
|
|
176
|
+
const p = new Predictor();
|
|
177
|
+
p.resolve('$PRED_ID', $DELTA, $PR_SCORE, '${{ github.run_id }}')
|
|
178
|
+
.then(() => console.log('Prediction resolved: $PRED_ID'))
|
|
179
|
+
.catch(e => console.error('Resolve failed:', e.message));
|
|
180
|
+
" 2>/dev/null || echo "Warning: could not resolve prediction"
|
|
181
|
+
|
|
182
|
+
- name: AI quality assessment
|
|
183
|
+
id: ai_eval
|
|
184
|
+
env:
|
|
185
|
+
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
|
|
186
|
+
OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
|
|
187
|
+
PR_TITLE: ${{ github.event.pull_request.title }}
|
|
188
|
+
run: |
|
|
189
|
+
# Get diff for AI assessment
|
|
190
|
+
git diff origin/main...HEAD -- '*.ts' '*.tsx' '*.js' '*.jsx' | head -3000 > /tmp/eval-diff.txt
|
|
191
|
+
DIFF_SIZE=$(wc -c < /tmp/eval-diff.txt | tr -d ' ')
|
|
192
|
+
|
|
193
|
+
if [ "$DIFF_SIZE" -lt 10 ]; then
|
|
194
|
+
printf 'quality_score=0.5\ndim_correctness=0.50\ndim_coverage=0.50\ndim_architecture=0.50\ndim_value=0.50\n' >> $GITHUB_OUTPUT  # dim_* defaults keep the JSON built from these outputs valid
|
|
195
|
+
echo "quality_reason=No code diff to assess" >> $GITHUB_OUTPUT
|
|
196
|
+
exit 0
|
|
197
|
+
fi
|
|
198
|
+
|
|
199
|
+
# Gather context (same as review workflow)
|
|
200
|
+
node -e "
|
|
201
|
+
const fs = require('fs');
|
|
202
|
+
const sections = [];
|
|
203
|
+
try { const c = JSON.parse(fs.readFileSync('.jfl/config.json','utf8')); sections.push('Project: ' + (c.name||'unknown') + ' - ' + (c.description||'')); } catch(e) {}
|
|
204
|
+
try { sections.push(fs.readFileSync('knowledge/ARCHITECTURE.md','utf8').substring(0,1500)); } catch(e) {}
|
|
205
|
+
try { sections.push(fs.readFileSync('knowledge/SERVICE_SPEC.md','utf8').substring(0,800)); } catch(e) {}
|
|
206
|
+
fs.writeFileSync('/tmp/eval-context.txt', sections.join('\n'));
|
|
207
|
+
"
|
|
208
|
+
|
|
209
|
+
# Build AI eval payload
|
|
210
|
+
node -e "
|
|
211
|
+
const fs = require('fs');
|
|
212
|
+
const diff = fs.readFileSync('/tmp/eval-diff.txt', 'utf8').substring(0, 8000);
|
|
213
|
+
const context = fs.readFileSync('/tmp/eval-context.txt', 'utf8').substring(0, 2000);
|
|
214
|
+
const title = process.env.PR_TITLE || '';
|
|
215
|
+
|
|
216
|
+
const testMetrics = {
|
|
217
|
+
baseline_pass_rate: ${{ steps.baseline.outputs.score }},
|
|
218
|
+
pr_pass_rate: ${{ steps.pr_eval.outputs.score }},
|
|
219
|
+
baseline_tests: ${{ steps.baseline.outputs.total }},
|
|
220
|
+
pr_tests: ${{ steps.pr_eval.outputs.total }},
|
|
221
|
+
tests_added: ${{ steps.delta.outputs.tests_added }},
|
|
222
|
+
};
|
|
223
|
+
|
|
224
|
+
const payload = {
|
|
225
|
+
model: 'gpt-4o-mini',
|
|
226
|
+
messages: [
|
|
227
|
+
{
|
|
228
|
+
role: 'system',
|
|
229
|
+
content: [
|
|
230
|
+
'You are an AI eval scorer for a software project. Score this PR on overall quality improvement.',
|
|
231
|
+
'',
|
|
232
|
+
'--- PROJECT CONTEXT ---',
|
|
233
|
+
context,
|
|
234
|
+
'--- END CONTEXT ---',
|
|
235
|
+
'',
|
|
236
|
+
'Test metrics: ' + JSON.stringify(testMetrics),
|
|
237
|
+
'',
|
|
238
|
+
'Score the PR from 0.0 to 1.0 on these dimensions:',
|
|
239
|
+
'- correctness: Does the code work? Any bugs?',
|
|
240
|
+
'- coverage: Did it add meaningful tests?',
|
|
241
|
+
'- architecture: Does it follow project patterns?',
|
|
242
|
+
'- value: Does it add real value to the project?',
|
|
243
|
+
'',
|
|
244
|
+
'Respond in this exact JSON format (no markdown):',
|
|
245
|
+
'{\"quality_score\": 0.0-1.0, \"dimensions\": {\"correctness\": 0.0-1.0, \"coverage\": 0.0-1.0, \"architecture\": 0.0-1.0, \"value\": 0.0-1.0}, \"reason\": \"1-2 sentence summary\"}',
|
|
246
|
+
].join('\\n')
|
|
247
|
+
},
|
|
248
|
+
{ role: 'user', content: 'PR: ' + title + '\\n\\n' + diff }
|
|
249
|
+
],
|
|
250
|
+
max_tokens: 300,
|
|
251
|
+
temperature: 0.1
|
|
252
|
+
};
|
|
253
|
+
|
|
254
|
+
fs.writeFileSync('/tmp/eval-payload.json', JSON.stringify(payload));
|
|
255
|
+
"
|
|
256
|
+
|
|
257
|
+
# Try OpenAI
|
|
258
|
+
RESULT=""
|
|
259
|
+
if [ -n "$OPENAI_API_KEY" ]; then
|
|
260
|
+
RESPONSE=$(curl -s -X POST https://api.openai.com/v1/chat/completions \
|
|
261
|
+
-H "Content-Type: application/json" \
|
|
262
|
+
-H "Authorization: Bearer $OPENAI_API_KEY" \
|
|
263
|
+
-d @/tmp/eval-payload.json 2>/dev/null)
|
|
264
|
+
RESULT=$(echo "$RESPONSE" | node -e "
|
|
265
|
+
let d=''; process.stdin.on('data',c=>d+=c); process.stdin.on('end',()=>{
|
|
266
|
+
try { console.log(JSON.parse(d).choices[0].message.content); }
|
|
267
|
+
catch(e) { console.log(''); }
|
|
268
|
+
})
|
|
269
|
+
")
|
|
270
|
+
fi
|
|
271
|
+
|
|
272
|
+
# Fallback to OpenRouter
|
|
273
|
+
if [ -z "$RESULT" ] && [ -n "$OPENROUTER_API_KEY" ]; then
|
|
274
|
+
node -e "
|
|
275
|
+
const fs = require('fs');
|
|
276
|
+
const p = JSON.parse(fs.readFileSync('/tmp/eval-payload.json', 'utf8'));
|
|
277
|
+
p.model = 'anthropic/claude-sonnet-4';
|
|
278
|
+
fs.writeFileSync('/tmp/eval-payload-or.json', JSON.stringify(p));
|
|
279
|
+
"
|
|
280
|
+
RESPONSE=$(curl -s -X POST https://openrouter.ai/api/v1/chat/completions \
|
|
281
|
+
-H "Content-Type: application/json" \
|
|
282
|
+
-H "Authorization: Bearer $OPENROUTER_API_KEY" \
|
|
283
|
+
-d @/tmp/eval-payload-or.json 2>/dev/null)
|
|
284
|
+
RESULT=$(echo "$RESPONSE" | node -e "
|
|
285
|
+
let d=''; process.stdin.on('data',c=>d+=c); process.stdin.on('end',()=>{
|
|
286
|
+
try { console.log(JSON.parse(d).choices[0].message.content); }
|
|
287
|
+
catch(e) { console.log(''); }
|
|
288
|
+
})
|
|
289
|
+
")
|
|
290
|
+
fi
|
|
291
|
+
|
|
292
|
+
# Parse AI eval result
|
|
293
|
+
RESULT="$RESULT" node -e "
|
|
294
|
+
const raw = process.env.RESULT || '{}'; // read via env: the model reply may contain backticks, backslashes or template placeholders
|
|
295
|
+
try {
|
|
296
|
+
const parsed = JSON.parse(raw.match(/\{[\s\S]*\}/)?.[0] || '{}');
|
|
297
|
+
const score = Math.max(0, Math.min(1, Number(parsed.quality_score) || 0.5));
|
|
298
|
+
const reason = String(parsed.reason || 'AI eval unavailable').replace(/\n/g, ' ');
|
|
299
|
+
const dims = parsed.dimensions || {};
|
|
300
|
+
console.log('quality_score=' + score.toFixed(2));
|
|
301
|
+
console.log('quality_reason=' + reason);
|
|
302
|
+
console.log('dim_correctness=' + (Number(dims.correctness) || 0.5).toFixed(2));
|
|
303
|
+
console.log('dim_coverage=' + (Number(dims.coverage) || 0.5).toFixed(2));
|
|
304
|
+
console.log('dim_architecture=' + (Number(dims.architecture) || 0.5).toFixed(2));
|
|
305
|
+
console.log('dim_value=' + (Number(dims.value) || 0.5).toFixed(2));
|
|
306
|
+
} catch(e) { // unparseable reply: fall back to neutral scores
|
|
307
|
+
console.log('quality_score=0.50');
|
|
308
|
+
console.log('quality_reason=AI eval parse error');
|
|
309
|
+
console.log('dim_correctness=0.50');
|
|
310
|
+
console.log('dim_coverage=0.50');
|
|
311
|
+
console.log('dim_architecture=0.50');
|
|
312
|
+
console.log('dim_value=0.50');
|
|
313
|
+
}
|
|
314
|
+
" >> $GITHUB_OUTPUT
|
|
315
|
+
|
|
316
|
+
- name: Detect agent from branch
|
|
317
|
+
id: agent
|
|
318
|
+
run: |
|
|
319
|
+
BRANCH="$GITHUB_HEAD_REF"  # default runner env var; avoids pasting the contributor-controlled branch name into the script text
|
|
320
|
+
# Extract agent from branch prefix: pp/* → peter-parker, bot/* → bot, etc.
|
|
321
|
+
if [[ "$BRANCH" == pp/* ]]; then
|
|
322
|
+
echo "name=peter-parker" >> $GITHUB_OUTPUT
|
|
323
|
+
elif [[ "$BRANCH" == bot/* ]]; then
|
|
324
|
+
echo "name=bot" >> $GITHUB_OUTPUT
|
|
325
|
+
elif [[ "$BRANCH" == agent/* ]]; then
|
|
326
|
+
# agent/agent-name/description → agent-name
|
|
327
|
+
AGENT_NAME=$(echo "$BRANCH" | cut -d'/' -f2 | tr -cd 'A-Za-z0-9._-')  # keep only safe characters: this output is later pasted into shell and JSON
|
|
328
|
+
echo "name=$AGENT_NAME" >> $GITHUB_OUTPUT
|
|
329
|
+
else
|
|
330
|
+
echo "name=${{ github.event.pull_request.user.login }}" >> $GITHUB_OUTPUT
|
|
331
|
+
fi
|
|
332
|
+
|
|
333
|
+
- name: Commit eval entry to PR branch
|
|
334
|
+
run: |
|
|
335
|
+
git config user.name "github-actions[bot]"
|
|
336
|
+
git config user.email "github-actions[bot]@users.noreply.github.com"
|
|
337
|
+
|
|
338
|
+
# Checkout the actual PR branch (not detached HEAD)
|
|
339
|
+
git checkout "$GITHUB_HEAD_REF"  # quoted env var instead of an inline expression: branch names may contain shell metacharacters
|
|
340
|
+
|
|
341
|
+
AGENT="${{ steps.agent.outputs.name }}"
|
|
342
|
+
PRED_ID="${{ steps.predict.outputs.prediction_id }}"
|
|
343
|
+
|
|
344
|
+
# Write eval entry (matches readEvals path: .jfl/eval.jsonl)
|
|
345
|
+
mkdir -p .jfl
|
|
346
|
+
cat >> .jfl/eval.jsonl << EOF  # unquoted heredoc: the branch comes from the env var so its text is never parsed as shell
|
|
347
|
+
{"v":1,"ts":"$(date -u +%Y-%m-%dT%H:%M:%SZ)","agent":"$AGENT","dataset":"ci-eval","metrics":{"test_pass_rate":${{ steps.pr_eval.outputs.score }},"tests_passed":${{ steps.pr_eval.outputs.passing }},"tests_total":${{ steps.pr_eval.outputs.total }}},"composite":${{ steps.pr_eval.outputs.score }},"delta":{"composite":${{ steps.delta.outputs.delta }}},"model_version":"$AGENT-run","improved":${{ steps.delta.outputs.improved }},"pr_number":${{ github.event.pull_request.number }},"branch":"$GITHUB_HEAD_REF","quality_score":${{ steps.ai_eval.outputs.quality_score }},"dim_correctness":${{ steps.ai_eval.outputs.dim_correctness }},"dim_coverage":${{ steps.ai_eval.outputs.dim_coverage }},"dim_architecture":${{ steps.ai_eval.outputs.dim_architecture }},"dim_value":${{ steps.ai_eval.outputs.dim_value }},"prediction_id":"$PRED_ID"}
|
|
348
|
+
EOF
|
|
349
|
+
|
|
350
|
+
# Write eval:scored event for hub file watcher
|
|
351
|
+
cat >> .jfl/service-events.jsonl << EOF  # unquoted heredoc: the branch comes from the env var so its text is never parsed as shell
|
|
352
|
+
{"ts":"$(date -u +%Y-%m-%dT%H:%M:%S.000Z)","type":"eval:scored","source":"ci","data":{"agent":"$AGENT","composite":${{ steps.pr_eval.outputs.score }},"baseline":${{ steps.baseline.outputs.score }},"delta":${{ steps.delta.outputs.delta }},"improved":"${{ steps.delta.outputs.improved }}","pr_number":${{ github.event.pull_request.number }},"branch":"$GITHUB_HEAD_REF","run_url":"${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}","quality_score":${{ steps.ai_eval.outputs.quality_score }},"dim_correctness":${{ steps.ai_eval.outputs.dim_correctness }},"dim_coverage":${{ steps.ai_eval.outputs.dim_coverage }},"dim_architecture":${{ steps.ai_eval.outputs.dim_architecture }},"dim_value":${{ steps.ai_eval.outputs.dim_value }}}}
|
|
353
|
+
EOF
|
|
354
|
+
|
|
355
|
+
git add .jfl/eval.jsonl .jfl/service-events.jsonl
|
|
356
|
+
git add .jfl/predictions/ 2>/dev/null || true
|
|
357
|
+
git commit -m "eval: scored PR #${{ github.event.pull_request.number }} (composite=${{ steps.pr_eval.outputs.score }}, delta=${{ steps.delta.outputs.delta }})"
|
|
358
|
+
git push origin "$GITHUB_HEAD_REF"  # quoted env var instead of an inline expression: branch names may contain shell metacharacters
|
|
359
|
+
|
|
360
|
+
- name: Post eval:scored event to hub (best-effort)
|
|
361
|
+
if: env.JFL_HUB_URL != ''
|
|
362
|
+
continue-on-error: true
|
|
363
|
+
run: |
|
|
364
|
+
curl -sf -X POST "${JFL_HUB_URL}/api/events" \
|
|
365
|
+
-H "Content-Type: application/json" \
|
|
366
|
+
-H "Authorization: Bearer ${JFL_HUB_TOKEN}" \
|
|
367
|
+
-d '{
|
|
368
|
+
"type": "eval:scored",
|
|
369
|
+
"source": "github-actions",
|
|
370
|
+
"data": {
|
|
371
|
+
"pr_number": ${{ github.event.pull_request.number }},
|
|
372
|
+
"pr_url": "${{ github.event.pull_request.html_url }}",
|
|
373
|
+
"branch": "${{ github.head_ref }}",
|
|
374
|
+
"commit_sha": "${{ github.sha }}",
|
|
375
|
+
"agent": "${{ steps.agent.outputs.name }}",
|
|
376
|
+
"metrics": {
|
|
377
|
+
"test_pass_rate": ${{ steps.pr_eval.outputs.score }},
|
|
378
|
+
"test_count": ${{ steps.pr_eval.outputs.total }},
|
|
379
|
+
"tests_passing": ${{ steps.pr_eval.outputs.passing }}
|
|
380
|
+
},
|
|
381
|
+
"baseline": ${{ steps.baseline.outputs.score }},
|
|
382
|
+
"baseline_total": ${{ steps.baseline.outputs.total }},
|
|
383
|
+
"baseline_passing": ${{ steps.baseline.outputs.passing }},
|
|
384
|
+
"composite": ${{ steps.pr_eval.outputs.score }},
|
|
385
|
+
"delta": ${{ steps.delta.outputs.delta }},
|
|
386
|
+
"tests_added": ${{ steps.delta.outputs.tests_added }},
|
|
387
|
+
"tests_passing_added": ${{ steps.delta.outputs.tests_passing_added }},
|
|
388
|
+
"improved": ${{ steps.delta.outputs.improved }},
|
|
389
|
+
"run_id": "${{ github.run_id }}",
|
|
390
|
+
"run_url": "${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
|
|
391
|
+
}
|
|
392
|
+
}' || echo "Warning: could not post to hub (hub may not be running)"
|
|
393
|
+
|
|
394
|
+
- name: Comment on PR
|
|
395
|
+
uses: actions/github-script@v7
|
|
396
|
+
with:
|
|
397
|
+
script: |
|
|
398
|
+
const delta = parseFloat('${{ steps.delta.outputs.delta }}');
|
|
399
|
+
const improved = '${{ steps.delta.outputs.improved }}' === 'true';
|
|
400
|
+
const baseline = '${{ steps.baseline.outputs.score }}';
|
|
401
|
+
const prScore = '${{ steps.pr_eval.outputs.score }}';
|
|
402
|
+
const baselineTotal = parseInt('${{ steps.baseline.outputs.total }}');
|
|
403
|
+
const prTotal = parseInt('${{ steps.pr_eval.outputs.total }}');
|
|
404
|
+
const baselinePassing = parseInt('${{ steps.baseline.outputs.passing }}');
|
|
405
|
+
const prPassing = parseInt('${{ steps.pr_eval.outputs.passing }}');
|
|
406
|
+
const testsAdded = parseInt('${{ steps.delta.outputs.tests_added }}');
|
|
407
|
+
const testsPassingAdded = parseInt('${{ steps.delta.outputs.tests_passing_added }}');
|
|
408
|
+
const runUrl = '${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}';
|
|
409
|
+
|
|
410
|
+
const emoji = improved ? ':green_circle:' : (delta < 0 ? ':red_circle:' : ':yellow_circle:');
|
|
411
|
+
let verdict = 'UNCHANGED';
|
|
412
|
+
if (delta > 0) verdict = 'IMPROVED';
|
|
413
|
+
else if (delta < 0) verdict = 'REGRESSED';
|
|
414
|
+
else if (testsPassingAdded > 0) verdict = 'IMPROVED (more tests)';
|
|
415
|
+
|
|
416
|
+
const testsDeltaStr = testsAdded > 0 ? `+${testsAdded}` : `${testsAdded}`;
|
|
417
|
+
const qualityScore = parseFloat('${{ steps.ai_eval.outputs.quality_score }}') || 0.5;
|
|
418
|
+
const qualityReason = '${{ steps.ai_eval.outputs.quality_reason }}' || 'N/A';
|
|
419
|
+
const dimCorrectness = '${{ steps.ai_eval.outputs.dim_correctness }}' || '—';
|
|
420
|
+
const dimCoverage = '${{ steps.ai_eval.outputs.dim_coverage }}' || '—';
|
|
421
|
+
const dimArchitecture = '${{ steps.ai_eval.outputs.dim_architecture }}' || '—';
|
|
422
|
+
const dimValue = '${{ steps.ai_eval.outputs.dim_value }}' || '—';
|
|
423
|
+
|
|
424
|
+
// Prediction data (optional)
|
|
425
|
+
const predId = '${{ steps.predict.outputs.prediction_id }}' || '';
|
|
426
|
+
const predDelta = parseFloat('${{ steps.predict.outputs.predicted_delta }}') || 0;
|
|
427
|
+
const predConf = parseFloat('${{ steps.predict.outputs.confidence }}') || 0;
|
|
428
|
+
const predRec = '${{ steps.predict.outputs.recommendation }}' || '';
|
|
429
|
+
|
|
430
|
+
const predictionSection = predId ? [
|
|
431
|
+
'',
|
|
432
|
+
`### Stratus Prediction`,
|
|
433
|
+
'',
|
|
434
|
+
`| Metric | Value |`,
|
|
435
|
+
`|--------|-------|`,
|
|
436
|
+
`| Predicted delta | ${predDelta >= 0 ? '+' : ''}${predDelta.toFixed(4)} |`,
|
|
437
|
+
`| Actual delta | ${delta >= 0 ? '+' : ''}${delta.toFixed(4)} |`,
|
|
438
|
+
`| Prediction error | ${Math.abs(predDelta - delta).toFixed(4)} |`,
|
|
439
|
+
`| Confidence | ${Math.round(predConf * 100)}% |`,
|
|
440
|
+
`| Recommendation | ${predRec} |`,
|
|
441
|
+
`| Direction correct | ${(predDelta >= 0) === (delta >= 0) ? ':white_check_mark:' : ':x:'} |`,
|
|
442
|
+
'',
|
|
443
|
+
] : [];
|
|
444
|
+
|
|
445
|
+
const body = [
|
|
446
|
+
`## JFL Eval Results ${emoji}`,
|
|
447
|
+
'',
|
|
448
|
+
'| Metric | Baseline (main) | PR | Delta |',
|
|
449
|
+
'|--------|-----------------|-----|-------|',
|
|
450
|
+
`| test_pass_rate | ${baseline} (${baselinePassing}/${baselineTotal}) | ${prScore} (${prPassing}/${prTotal}) | ${delta > 0 ? '+' : ''}${delta.toFixed(4)} |`,
|
|
451
|
+
`| test_count | ${baselineTotal} | ${prTotal} | ${testsDeltaStr} |`,
|
|
452
|
+
`| tests_passing | ${baselinePassing} | ${prPassing} | ${testsPassingAdded > 0 ? '+' : ''}${testsPassingAdded} |`,
|
|
453
|
+
'',
|
|
454
|
+
`### AI Quality Score: ${qualityScore.toFixed(2)}`,
|
|
455
|
+
'',
|
|
456
|
+
'| Dimension | Score |',
|
|
457
|
+
'|-----------|-------|',
|
|
458
|
+
`| Correctness | ${dimCorrectness} |`,
|
|
459
|
+
`| Coverage | ${dimCoverage} |`,
|
|
460
|
+
`| Architecture | ${dimArchitecture} |`,
|
|
461
|
+
`| Value | ${dimValue} |`,
|
|
462
|
+
'',
|
|
463
|
+
`> ${qualityReason}`,
|
|
464
|
+
...predictionSection,
|
|
465
|
+
`**Verdict: ${verdict}**`,
|
|
466
|
+
'',
|
|
467
|
+
improved
|
|
468
|
+
? 'Auto-merge eligible via JFL flow.'
|
|
469
|
+
: (delta < 0
|
|
470
|
+
? 'Regression detected. Manual review required.'
|
|
471
|
+
: 'No improvement detected. Manual review recommended.'),
|
|
472
|
+
'',
|
|
473
|
+
'---',
|
|
474
|
+
`[View run](${runUrl}) | *Evaluated by JFL self-driving loop*`,
|
|
475
|
+
].join('\n');
|
|
476
|
+
|
|
477
|
+
const { data: comments } = await github.rest.issues.listComments({
|
|
478
|
+
owner: context.repo.owner,
|
|
479
|
+
repo: context.repo.repo,
|
|
480
|
+
issue_number: context.issue.number,
|
|
481
|
+
});
|
|
482
|
+
|
|
483
|
+
const existing = comments.find(c =>
|
|
484
|
+
c.user.type === 'Bot' && c.body.includes('JFL Eval Results')
|
|
485
|
+
);
|
|
486
|
+
|
|
487
|
+
if (existing) {
|
|
488
|
+
await github.rest.issues.updateComment({
|
|
489
|
+
owner: context.repo.owner,
|
|
490
|
+
repo: context.repo.repo,
|
|
491
|
+
comment_id: existing.id,
|
|
492
|
+
body,
|
|
493
|
+
});
|
|
494
|
+
} else {
|
|
495
|
+
await github.rest.issues.createComment({
|
|
496
|
+
owner: context.repo.owner,
|
|
497
|
+
repo: context.repo.repo,
|
|
498
|
+
issue_number: context.issue.number,
|
|
499
|
+
body,
|
|
500
|
+
});
|
|
501
|
+
}
|
|
502
|
+
|
|
503
|
+
- name: Auto-merge or flag regression
|
|
504
|
+
if: always()
|
|
505
|
+
env:
|
|
506
|
+
GH_TOKEN: ${{ github.token }}
|
|
507
|
+
run: |
|
|
508
|
+
PR_NUMBER=${{ github.event.pull_request.number }}
|
|
509
|
+
IMPROVED="${{ steps.delta.outputs.improved }}"
|
|
510
|
+
|
|
511
|
+
if [ "$IMPROVED" = "true" ]; then
|
|
512
|
+
# Check if AI review requested changes
|
|
513
|
+
REVIEW_STATE=$(gh pr view $PR_NUMBER --json reviews --jq '[.reviews[] | select(.author.login == "github-actions")] | last | .state // "NONE"' 2>/dev/null || echo "NONE")
|
|
514
|
+
|
|
515
|
+
if [ "$REVIEW_STATE" = "CHANGES_REQUESTED" ]; then
|
|
516
|
+
echo "Eval passed but AI review has blockers — holding merge"
|
|
517
|
+
gh pr comment $PR_NUMBER --body "**JFL Eval passed** (test_pass_rate improved) but **AI review found blockers**. Address red findings, then eval will re-run and auto-merge."
|
|
518
|
+
else
|
|
519
|
+
echo "Eval passed, no review blockers — auto-merging PR #$PR_NUMBER"
|
|
520
|
+
gh pr merge $PR_NUMBER --merge --delete-branch \
|
|
521
|
+
--body "Auto-merged by JFL eval: test_pass_rate improved by ${{ steps.delta.outputs.delta }} (${{ steps.baseline.outputs.score }} → ${{ steps.pr_eval.outputs.score }})"
|
|
522
|
+
fi
|
|
523
|
+
else
|
|
524
|
+
echo "Eval regression — requesting changes on PR #$PR_NUMBER"
|
|
525
|
+
gh pr review $PR_NUMBER --request-changes \
|
|
526
|
+
--body "JFL eval found no improvement: test_pass_rate delta ${{ steps.delta.outputs.delta }} (${{ steps.baseline.outputs.score }} → ${{ steps.pr_eval.outputs.score }}). Run: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
|
|
527
|
+
fi
|
|
528
|
+
|
|
529
|
+
- name: Eval Summary
|
|
530
|
+
if: always()
|
|
531
|
+
run: |
|
|
532
|
+
echo "### JFL Eval Results" >> $GITHUB_STEP_SUMMARY
|
|
533
|
+
echo "" >> $GITHUB_STEP_SUMMARY
|
|
534
|
+
echo "| Metric | Baseline | PR | Delta |" >> $GITHUB_STEP_SUMMARY
|
|
535
|
+
echo "|--------|----------|-----|-------|" >> $GITHUB_STEP_SUMMARY
|
|
536
|
+
echo "| test_pass_rate | ${{ steps.baseline.outputs.score }} | ${{ steps.pr_eval.outputs.score }} | ${{ steps.delta.outputs.delta }} |" >> $GITHUB_STEP_SUMMARY
|
|
537
|
+
echo "| test_count | ${{ steps.baseline.outputs.total }} | ${{ steps.pr_eval.outputs.total }} | ${{ steps.delta.outputs.tests_added }} |" >> $GITHUB_STEP_SUMMARY
|
|
538
|
+
echo "| tests_passing | ${{ steps.baseline.outputs.passing }} | ${{ steps.pr_eval.outputs.passing }} | ${{ steps.delta.outputs.tests_passing_added }} |" >> $GITHUB_STEP_SUMMARY
|
|
539
|
+
echo "" >> $GITHUB_STEP_SUMMARY
|
|
540
|
+
echo "Improved: ${{ steps.delta.outputs.improved }}" >> $GITHUB_STEP_SUMMARY
|