@xn-intenton-z2a/agentic-lib 7.4.43 → 7.4.44
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -1,100 +1,56 @@
|
|
|
1
1
|
---
|
|
2
|
-
description:
|
|
2
|
+
description: Investigate benchmark data via tools and produce structured findings with evidence
|
|
3
3
|
---
|
|
4
4
|
|
|
5
|
-
You are a benchmark analyst for an autonomous coding pipeline.
|
|
5
|
+
You are a benchmark analyst for an autonomous coding pipeline. Data has been gathered to files on disk. Your job is to **investigate** using tools, not just summarise — dig into source code, issues, PRs, and commits to produce findings like those in a professional benchmark report.
|
|
6
6
|
|
|
7
7
|
## Available Tools
|
|
8
8
|
|
|
9
|
-
- `read_file` — Read
|
|
10
|
-
- `
|
|
11
|
-
- `
|
|
12
|
-
- `
|
|
13
|
-
- `
|
|
14
|
-
- `
|
|
9
|
+
- `read_file` — Read files on disk. Key data files are in `/tmp/report-data/`:
|
|
10
|
+
- `mission.md` — The MISSION.md with acceptance criteria
|
|
11
|
+
- `config.toml` — Full agentic-lib.toml configuration
|
|
12
|
+
- `state.toml` — Persistent state (counters, budget, mission status)
|
|
13
|
+
- `workflow-runs.json` — All workflow runs with timing and outcome
|
|
14
|
+
- `commits.json` — All commits with messages and authors
|
|
15
|
+
- `issues.json` — Issues with labels and state
|
|
16
|
+
- `pull-requests.json` — PRs with branches and merge info
|
|
17
|
+
- `read_file` — Also read repository files directly: `src/lib/main.js`, `tests/unit/*.test.js`, `README.md`
|
|
18
|
+
- `list_files` — Browse directory structure
|
|
19
|
+
- `list_issues` / `get_issue` — Get full issue details including body and comments
|
|
20
|
+
- `list_prs` — Query PRs
|
|
21
|
+
- `git_diff` / `git_status` — View working tree state
|
|
22
|
+
- `report_analysis` — **Required.** Call exactly once with your structured analysis.
|
|
15
23
|
|
|
16
|
-
|
|
24
|
+
## How to Work
|
|
17
25
|
|
|
18
|
-
|
|
26
|
+
**Do NOT summarise or concatenate raw data.** Instead:
|
|
19
27
|
|
|
20
|
-
|
|
28
|
+
1. **Read the mission** (`/tmp/report-data/mission.md`) — extract each acceptance criterion
|
|
29
|
+
2. **Read the source code** (`src/lib/main.js`) — verify each criterion is implemented
|
|
30
|
+
3. **Read workflow-runs.json** — identify which runs produced transforms vs maintenance
|
|
31
|
+
4. **Cross-reference with pull-requests.json** — map transforms to merged PRs
|
|
32
|
+
5. **Read specific issues** (use `get_issue`) — understand what work was done
|
|
33
|
+
6. **Look for problems** — failing runs, issue churn, budget exhaustion, stuck loops
|
|
21
34
|
|
|
22
|
-
|
|
23
|
-
- **agentic-lib.toml** — full configuration snapshot (model, profile, budget, paths, tuning)
|
|
24
|
-
- **agentic-lib-state.toml** — full persistent state snapshot (counters, budget, status flags)
|
|
25
|
-
- **Workflow runs** — all runs in the period with name, conclusion, timing, duration, and URLs
|
|
26
|
-
- **Pull requests** — merged and open PRs with branch, title, additions/deletions, file count
|
|
27
|
-
- **Commits** — all commits with SHA, message, author, timestamp
|
|
28
|
-
- **Issues** — open and recently closed issues with labels, title, body excerpts
|
|
29
|
-
- **Source code** — full contents of all source files (src/lib/*.js), not just line counts
|
|
30
|
-
- **Test files** — full contents of all test files, not just filenames
|
|
31
|
-
- **Agent log excerpts** — narrative excerpts from the most recent agent log files
|
|
32
|
-
- **Website HTML** — text summary of the GitHub Pages website content
|
|
33
|
-
- **Screenshot** — whether SCREENSHOT_INDEX.png was captured (available as artifact)
|
|
34
|
-
- **README.md** — repository README content
|
|
35
|
-
- **Mission status** — whether MISSION_COMPLETE.md or MISSION_FAILED.md exist, with contents
|
|
35
|
+
## What Good Findings Look Like
|
|
36
36
|
|
|
37
|
-
|
|
37
|
+
From BENCHMARK_REPORT_016.md:
|
|
38
38
|
|
|
39
|
-
###
|
|
39
|
+
> ### FINDING-2: Autonomous dependency addition without lockfile update breaks CI (CRITICAL)
|
|
40
|
+
> PR #32 added `sharp` to `package.json` without regenerating the lockfile. The LLM knew the mission required a PNG dependency... But the transform mechanism can only edit files — it cannot run `npm install`. This is a **structural gap** in the autonomous pipeline.
|
|
40
41
|
|
|
41
|
-
|
|
42
|
-
- Use `read_file` to check source code for evidence of implementation
|
|
43
|
-
- Use `list_issues` / `get_issue` to find related issues that addressed it
|
|
44
|
-
- Mark each criterion as **PASS**, **FAIL**, or **NOT TESTED** with specific evidence (file path, line number, function name, or issue number)
|
|
45
|
-
- Don't trust issue titles — verify in the actual code
|
|
42
|
+
Notice: specific PR number, root cause analysis, structural insight, severity level.
|
|
46
43
|
|
|
47
|
-
|
|
44
|
+
Bad finding: "The pipeline ran 8 workflow runs and produced 4 transforms." — This is just restating numbers from the data.
|
|
48
45
|
|
|
49
|
-
|
|
50
|
-
- Was it an init run, a supervisor run, or a manual dispatch?
|
|
51
|
-
- Did it produce a transform? (Check: was a PR merged in the same time window?)
|
|
52
|
-
- What did the supervisor/director decide? (Check agent logs)
|
|
53
|
-
- Map runs to PRs to commits to understand the transformation chain
|
|
46
|
+
## What to Produce
|
|
54
47
|
|
|
55
|
-
|
|
48
|
+
Call `report_analysis` with:
|
|
56
49
|
|
|
57
|
-
|
|
50
|
+
- **summary**: 2-3 sentences. What was the mission? Did it complete? What's the headline?
|
|
51
|
+
- **iteration_narrative**: Prose timeline. "At 02:10, the first workflow run produced PR #9 which implemented the expression parser. At 03:36, a second transform added CSV loading via PR #11..." — map runs to PRs to actual changes.
|
|
52
|
+
- **acceptance_criteria**: For EACH criterion from MISSION.md, read the source code and mark PASS/FAIL/NOT TESTED with evidence like "fizzBuzz() at src/lib/main.js:12 returns correct array — tested in tests/unit/fizzbuzz.test.js"
|
|
53
|
+
- **findings**: Observations with severity. POSITIVE (what worked well), CONCERN (needs attention), CRITICAL (broken), REGRESSION (got worse), OBSERVATION (neutral insight). Every finding must cite specific evidence.
|
|
54
|
+
- **recommendations**: Actionable next steps
|
|
58
55
|
|
|
59
|
-
|
|
60
|
-
- Is the implementation correct and complete?
|
|
61
|
-
- Are the tests meaningful (testing real behaviour) or trivial (testing existence)?
|
|
62
|
-
- Are there TODO comments or incomplete implementations?
|
|
63
|
-
- Does the code structure match what the mission asked for?
|
|
64
|
-
|
|
65
|
-
### 4. Identify Findings
|
|
66
|
-
|
|
67
|
-
Each finding should be categorised as:
|
|
68
|
-
- **POSITIVE** — something that worked well
|
|
69
|
-
- **CONCERN** — something that needs attention
|
|
70
|
-
- **REGRESSION** — something that got worse compared to expected behaviour
|
|
71
|
-
|
|
72
|
-
Every finding must cite evidence (file path, issue number, commit SHA, or workflow run ID).
|
|
73
|
-
|
|
74
|
-
### 5. Produce Scenario Summary
|
|
75
|
-
|
|
76
|
-
Fill in the `scenario_summary` object:
|
|
77
|
-
- `total_iterations`: total workflow runs
|
|
78
|
-
- `transforms`: how many produced merged PRs with code changes
|
|
79
|
-
- `convergence_iteration`: which iteration reached mission-complete (0 if not)
|
|
80
|
-
- `final_source_lines`: line count of main source file
|
|
81
|
-
- `final_test_count`: number of test files
|
|
82
|
-
- `acceptance_pass_count`: e.g. "7/8 PASS"
|
|
83
|
-
- `total_tokens`: from state file counters
|
|
84
|
-
|
|
85
|
-
### 6. Make Recommendations
|
|
86
|
-
|
|
87
|
-
Actionable next steps for improving the pipeline, the mission, or the code. Be specific.
|
|
88
|
-
|
|
89
|
-
### 7. Call `report_analysis`
|
|
90
|
-
|
|
91
|
-
Record your complete analysis as a structured JSON object. This is mandatory — the report cannot be enriched without it.
|
|
92
|
-
|
|
93
|
-
## Report Quality Standards
|
|
94
|
-
|
|
95
|
-
- Every claim must cite evidence
|
|
96
|
-
- Acceptance criteria assessment must read the actual source code
|
|
97
|
-
- Compare state file counters with observed workflow runs for consistency
|
|
98
|
-
- Note any discrepancies between what the pipeline reports and what actually happened
|
|
99
|
-
- Be honest about failures — a clear failure report is more valuable than a vague success report
|
|
100
|
-
- Include the iteration narrative as prose, not just a table
|
|
56
|
+
**You MUST call report_analysis exactly once.**
|
package/package.json
CHANGED
|
@@ -1,18 +1,19 @@
|
|
|
1
1
|
// SPDX-License-Identifier: GPL-3.0-only
|
|
2
2
|
// Copyright (C) 2025-2026 Polycode Limited
|
|
3
|
-
// tasks/report.js — Benchmark report:
|
|
3
|
+
// tasks/report.js — Benchmark report: gather data to filesystem, LLM analyses via tools
|
|
4
4
|
//
|
|
5
|
-
//
|
|
6
|
-
//
|
|
7
|
-
//
|
|
8
|
-
// would write following ITERATION_BENCHMARKS_SIMPLE.md.
|
|
5
|
+
// Pattern: write mechanical data to files on disk, give the LLM a concise summary
|
|
6
|
+
// pointing it at those files + tools, LLM investigates and calls report_analysis
|
|
7
|
+
// with structured findings. The handler writes the final report markdown.
|
|
9
8
|
|
|
10
9
|
import * as core from "@actions/core";
|
|
11
|
-
import { existsSync, readFileSync,
|
|
12
|
-
import { readOptionalFile,
|
|
10
|
+
import { existsSync, readFileSync, writeFileSync, mkdirSync } from "fs";
|
|
11
|
+
import { readOptionalFile, NARRATIVE_INSTRUCTION } from "../copilot.js";
|
|
13
12
|
import { runCopilotSession } from "../../../copilot/copilot-session.js";
|
|
14
13
|
import { createGitHubTools, createGitTools } from "../../../copilot/github-tools.js";
|
|
15
14
|
|
|
15
|
+
const REPORT_DATA_DIR = "/tmp/report-data";
|
|
16
|
+
|
|
16
17
|
/**
|
|
17
18
|
* Discover the most recent init workflow run timestamp via GitHub API.
|
|
18
19
|
*/
|
|
@@ -32,440 +33,299 @@ async function findLatestInitRun(octokit, owner, repo) {
|
|
|
32
33
|
}
|
|
33
34
|
|
|
34
35
|
/**
|
|
35
|
-
*
|
|
36
|
+
* Gather all mechanical data and write it to files in REPORT_DATA_DIR.
|
|
37
|
+
* Returns a summary object with counts and key facts.
|
|
36
38
|
*/
|
|
37
|
-
async function
|
|
38
|
-
|
|
39
|
+
async function gatherAndWriteData(octokit, owner, repoName, periodStart, periodEnd, config) {
|
|
40
|
+
mkdirSync(REPORT_DATA_DIR, { recursive: true });
|
|
41
|
+
|
|
42
|
+
// 1. Config and state (already on disk, just copy references)
|
|
43
|
+
const configContent = readOptionalFile(config._configPath || "agentic-lib.toml") || "";
|
|
44
|
+
const stateContent = readOptionalFile("agentic-lib-state.toml") || "";
|
|
45
|
+
const missionContent = readOptionalFile("MISSION.md") || "";
|
|
46
|
+
writeFileSync(`${REPORT_DATA_DIR}/config.toml`, configContent);
|
|
47
|
+
writeFileSync(`${REPORT_DATA_DIR}/state.toml`, stateContent);
|
|
48
|
+
writeFileSync(`${REPORT_DATA_DIR}/mission.md`, missionContent);
|
|
49
|
+
|
|
50
|
+
// 2. Workflow runs
|
|
51
|
+
let workflowRuns = [];
|
|
39
52
|
try {
|
|
40
53
|
const { data } = await octokit.rest.actions.listWorkflowRunsForRepo({
|
|
41
|
-
owner, repo, per_page: 50, created: `${
|
|
54
|
+
owner, repo: repoName, per_page: 50, created: `${periodStart}..${periodEnd}`,
|
|
42
55
|
});
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
}
|
|
53
|
-
return runs;
|
|
54
|
-
}
|
|
55
|
-
|
|
56
|
-
/**
|
|
57
|
-
* List commits in the period.
|
|
58
|
-
*/
|
|
59
|
-
async function listCommits(octokit, owner, repo, since, until) {
|
|
60
|
-
const commits = [];
|
|
56
|
+
workflowRuns = data.workflow_runs.map(r => ({
|
|
57
|
+
id: r.id, name: r.name, status: r.status, conclusion: r.conclusion,
|
|
58
|
+
created_at: r.created_at, updated_at: r.updated_at, html_url: r.html_url,
|
|
59
|
+
}));
|
|
60
|
+
} catch (err) { core.warning(`Could not list runs: ${err.message}`); }
|
|
61
|
+
writeFileSync(`${REPORT_DATA_DIR}/workflow-runs.json`, JSON.stringify(workflowRuns, null, 2));
|
|
62
|
+
|
|
63
|
+
// 3. Commits
|
|
64
|
+
let commits = [];
|
|
61
65
|
try {
|
|
62
66
|
const { data } = await octokit.rest.repos.listCommits({
|
|
63
|
-
owner, repo, since, until, per_page: 100,
|
|
67
|
+
owner, repo: repoName, since: periodStart, until: periodEnd, per_page: 100,
|
|
64
68
|
});
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
core.warning(`Could not list commits: ${err.message}`);
|
|
75
|
-
}
|
|
76
|
-
return commits;
|
|
77
|
-
}
|
|
78
|
-
|
|
79
|
-
/**
|
|
80
|
-
* List issues (open + recently closed) with bodies for context.
|
|
81
|
-
*/
|
|
82
|
-
async function listIssues(octokit, owner, repo, since) {
|
|
83
|
-
const issues = [];
|
|
69
|
+
commits = data.map(c => ({
|
|
70
|
+
sha: c.sha.substring(0, 8), message: c.commit.message.split("\n")[0],
|
|
71
|
+
author: c.commit.author?.name || "unknown", date: c.commit.author?.date || "",
|
|
72
|
+
}));
|
|
73
|
+
} catch (err) { core.warning(`Could not list commits: ${err.message}`); }
|
|
74
|
+
writeFileSync(`${REPORT_DATA_DIR}/commits.json`, JSON.stringify(commits, null, 2));
|
|
75
|
+
|
|
76
|
+
// 4. Issues (open + recently closed)
|
|
77
|
+
let issues = [];
|
|
84
78
|
for (const state of ["open", "closed"]) {
|
|
85
79
|
try {
|
|
86
80
|
const { data } = await octokit.rest.issues.listForRepo({
|
|
87
|
-
owner, repo, state, since, per_page: 50,
|
|
81
|
+
owner, repo: repoName, state, since: periodStart, per_page: 50,
|
|
82
|
+
sort: "created", direction: "desc",
|
|
88
83
|
});
|
|
89
84
|
for (const i of data) {
|
|
90
85
|
if (i.pull_request) continue;
|
|
91
86
|
issues.push({
|
|
92
87
|
number: i.number, state: i.state, title: i.title,
|
|
93
|
-
labels: i.labels.map(l => l.name)
|
|
94
|
-
|
|
95
|
-
body: i.body ? i.body.substring(0, 500) : "",
|
|
88
|
+
labels: i.labels.map(l => l.name), created_at: i.created_at,
|
|
89
|
+
closed_at: i.closed_at,
|
|
96
90
|
});
|
|
97
91
|
}
|
|
98
92
|
} catch { /* ignore */ }
|
|
99
93
|
}
|
|
100
|
-
|
|
101
|
-
}
|
|
94
|
+
writeFileSync(`${REPORT_DATA_DIR}/issues.json`, JSON.stringify(issues, null, 2));
|
|
102
95
|
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
*/
|
|
106
|
-
async function listPullRequests(octokit, owner, repo, since) {
|
|
107
|
-
const prs = [];
|
|
96
|
+
// 5. PRs (merged + open)
|
|
97
|
+
let prs = [];
|
|
108
98
|
for (const state of ["closed", "open"]) {
|
|
109
99
|
try {
|
|
110
100
|
const { data } = await octokit.rest.pulls.list({
|
|
111
|
-
owner, repo, state, per_page: 30, sort: "created", direction: "desc",
|
|
101
|
+
owner, repo: repoName, state, per_page: 30, sort: "created", direction: "desc",
|
|
112
102
|
});
|
|
113
103
|
for (const p of data) {
|
|
114
104
|
if (state === "closed" && !p.merged_at) continue;
|
|
115
|
-
if (new Date(p.created_at) < new Date(
|
|
105
|
+
if (new Date(p.created_at) < new Date(periodStart)) continue;
|
|
116
106
|
prs.push({
|
|
117
107
|
number: p.number, title: p.title, state: p.state,
|
|
118
108
|
branch: p.head?.ref || "", merged_at: p.merged_at,
|
|
119
|
-
created_at: p.created_at,
|
|
120
109
|
additions: p.additions || 0, deletions: p.deletions || 0,
|
|
121
|
-
changed_files: p.changed_files || 0,
|
|
122
110
|
});
|
|
123
111
|
}
|
|
124
112
|
} catch { /* ignore */ }
|
|
125
113
|
}
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
const
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
const
|
|
140
|
-
if (
|
|
141
|
-
if (
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
const fp = `${srcDir}/${f}`;
|
|
145
|
-
if (fp === srcPath) continue;
|
|
146
|
-
if (f.endsWith(".js") || f.endsWith(".ts")) filePaths.push(fp);
|
|
147
|
-
}
|
|
148
|
-
} catch { /* ignore */ }
|
|
149
|
-
}
|
|
114
|
+
writeFileSync(`${REPORT_DATA_DIR}/pull-requests.json`, JSON.stringify(prs, null, 2));
|
|
115
|
+
|
|
116
|
+
// 6. Mission status
|
|
117
|
+
const missionComplete = existsSync("MISSION_COMPLETE.md");
|
|
118
|
+
const missionFailed = existsSync("MISSION_FAILED.md");
|
|
119
|
+
const completeContent = missionComplete ? readFileSync("MISSION_COMPLETE.md", "utf8") : "";
|
|
120
|
+
const failedContent = missionFailed ? readFileSync("MISSION_FAILED.md", "utf8") : "";
|
|
121
|
+
|
|
122
|
+
// Parse key state values
|
|
123
|
+
let budgetUsed = 0, budgetCap = 0, totalTokens = 0, transforms = 0;
|
|
124
|
+
const budgetUsedMatch = stateContent.match(/transformation-budget-used\s*=\s*(\d+)/);
|
|
125
|
+
const budgetCapMatch = stateContent.match(/transformation-budget-cap\s*=\s*(\d+)/);
|
|
126
|
+
const tokensMatch = stateContent.match(/total-tokens\s*=\s*(\d+)/);
|
|
127
|
+
const transformsMatch = stateContent.match(/cumulative-transforms\s*=\s*(\d+)/);
|
|
128
|
+
if (budgetUsedMatch) budgetUsed = parseInt(budgetUsedMatch[1]);
|
|
129
|
+
if (budgetCapMatch) budgetCap = parseInt(budgetCapMatch[1]);
|
|
130
|
+
if (tokensMatch) totalTokens = parseInt(tokensMatch[1]);
|
|
131
|
+
if (transformsMatch) transforms = parseInt(transformsMatch[1]);
|
|
150
132
|
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
}
|
|
165
|
-
|
|
166
|
-
/**
|
|
167
|
-
* Read test file contents (with size limit).
|
|
168
|
-
*/
|
|
169
|
-
function readTestFiles() {
|
|
170
|
-
const files = [];
|
|
171
|
-
const MAX_CHARS = 3000;
|
|
172
|
-
for (const dir of ["tests", "tests/unit", "__tests__"]) {
|
|
173
|
-
if (!existsSync(dir)) continue;
|
|
174
|
-
try {
|
|
175
|
-
for (const f of readdirSync(dir)) {
|
|
176
|
-
if (!f.endsWith(".test.js") && !f.endsWith(".test.ts") && !f.endsWith(".spec.js")) continue;
|
|
177
|
-
const fp = `${dir}/${f}`;
|
|
178
|
-
try {
|
|
179
|
-
const content = readFileSync(fp, "utf8");
|
|
180
|
-
files.push({
|
|
181
|
-
file: fp,
|
|
182
|
-
lines: content.split("\n").length,
|
|
183
|
-
content: content.length > MAX_CHARS
|
|
184
|
-
? content.substring(0, MAX_CHARS) + `\n... (truncated)`
|
|
185
|
-
: content,
|
|
186
|
-
});
|
|
187
|
-
} catch { /* ignore */ }
|
|
188
|
-
}
|
|
189
|
-
} catch { /* ignore */ }
|
|
190
|
-
}
|
|
191
|
-
return files;
|
|
192
|
-
}
|
|
193
|
-
|
|
194
|
-
/**
|
|
195
|
-
* Read agent log file contents (last N logs).
|
|
196
|
-
*/
|
|
197
|
-
function readAgentLogs(logPrefix, maxLogs = 10) {
|
|
198
|
-
const logDir = logPrefix.includes("/") ? logPrefix.substring(0, logPrefix.lastIndexOf("/")) : ".";
|
|
199
|
-
const logBase = logPrefix.includes("/") ? logPrefix.substring(logPrefix.lastIndexOf("/") + 1) : logPrefix;
|
|
200
|
-
const logs = [];
|
|
201
|
-
try {
|
|
202
|
-
const allLogs = readdirSync(logDir)
|
|
203
|
-
.filter(f => f.startsWith(logBase) && f.endsWith(".md"))
|
|
204
|
-
.sort();
|
|
205
|
-
// Take the most recent N logs
|
|
206
|
-
const recent = allLogs.slice(-maxLogs);
|
|
207
|
-
for (const f of recent) {
|
|
208
|
-
const fp = logDir === "." ? f : `${logDir}/${f}`;
|
|
209
|
-
try {
|
|
210
|
-
const content = readFileSync(fp, "utf8");
|
|
211
|
-
// Extract key info: first 80 lines
|
|
212
|
-
const lines = content.split("\n");
|
|
213
|
-
logs.push({
|
|
214
|
-
file: f,
|
|
215
|
-
excerpt: lines.slice(0, 80).join("\n"),
|
|
216
|
-
totalLines: lines.length,
|
|
217
|
-
});
|
|
218
|
-
} catch { /* ignore */ }
|
|
219
|
-
}
|
|
220
|
-
} catch { /* ignore */ }
|
|
221
|
-
return logs;
|
|
222
|
-
}
|
|
223
|
-
|
|
224
|
-
/**
|
|
225
|
-
* Extract acceptance criteria from MISSION.md.
|
|
226
|
-
* Looks for bullet points, numbered lists, or "Acceptance Criteria" sections.
|
|
227
|
-
*/
|
|
228
|
-
function extractAcceptanceCriteria(missionContent) {
|
|
229
|
-
if (!missionContent) return [];
|
|
230
|
-
const criteria = [];
|
|
231
|
-
const lines = missionContent.split("\n");
|
|
232
|
-
let inCriteria = false;
|
|
233
|
-
for (const line of lines) {
|
|
234
|
-
const lower = line.toLowerCase();
|
|
235
|
-
if (lower.includes("acceptance") || lower.includes("criteria") || lower.includes("requirements")) {
|
|
236
|
-
inCriteria = true;
|
|
237
|
-
continue;
|
|
238
|
-
}
|
|
239
|
-
if (inCriteria && /^#+\s/.test(line) && !lower.includes("criteria")) {
|
|
240
|
-
inCriteria = false;
|
|
241
|
-
}
|
|
242
|
-
if (inCriteria) {
|
|
243
|
-
const match = line.match(/^[\s]*[-*]\s+(.+)/) || line.match(/^[\s]*\d+\.\s+(.+)/);
|
|
244
|
-
if (match) criteria.push(match[1].trim());
|
|
245
|
-
}
|
|
246
|
-
}
|
|
247
|
-
// If no explicit criteria section found, extract all bullet points as potential criteria
|
|
248
|
-
if (criteria.length === 0) {
|
|
249
|
-
for (const line of lines) {
|
|
250
|
-
const match = line.match(/^[\s]*[-*]\s+(.+)/);
|
|
251
|
-
if (match && match[1].length > 10) criteria.push(match[1].trim());
|
|
252
|
-
}
|
|
253
|
-
}
|
|
254
|
-
return criteria;
|
|
133
|
+
return {
|
|
134
|
+
periodStart, periodEnd,
|
|
135
|
+
workflowRunCount: workflowRuns.length,
|
|
136
|
+
commitCount: commits.length,
|
|
137
|
+
issueCount: issues.length,
|
|
138
|
+
openIssueCount: issues.filter(i => i.state === "open").length,
|
|
139
|
+
closedIssueCount: issues.filter(i => i.state === "closed").length,
|
|
140
|
+
prCount: prs.length,
|
|
141
|
+
mergedPrCount: prs.filter(p => p.merged_at).length,
|
|
142
|
+
missionComplete, missionFailed,
|
|
143
|
+
completeContent, failedContent,
|
|
144
|
+
budgetUsed, budgetCap, totalTokens, transforms,
|
|
145
|
+
missionContent,
|
|
146
|
+
};
|
|
255
147
|
}
|
|
256
148
|
|
|
257
149
|
/**
|
|
258
|
-
*
|
|
150
|
+
* Build the concise prompt for the LLM (like supervisor's buildPrompt).
|
|
259
151
|
*/
|
|
260
|
-
function
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
}
|
|
276
|
-
|
|
277
|
-
|
|
152
|
+
function buildPrompt(summary, agentInstructions, repo) {
|
|
153
|
+
return [
|
|
154
|
+
"## Instructions",
|
|
155
|
+
agentInstructions,
|
|
156
|
+
"",
|
|
157
|
+
"## Report Period",
|
|
158
|
+
`Repository: ${repo.owner}/${repo.repo}`,
|
|
159
|
+
`Period: ${summary.periodStart} → ${summary.periodEnd}`,
|
|
160
|
+
"",
|
|
161
|
+
"## Summary of Gathered Data",
|
|
162
|
+
`Workflow runs: ${summary.workflowRunCount}`,
|
|
163
|
+
`Commits: ${summary.commitCount}`,
|
|
164
|
+
`Issues: ${summary.issueCount} (${summary.openIssueCount} open, ${summary.closedIssueCount} closed)`,
|
|
165
|
+
`Pull requests: ${summary.prCount} (${summary.mergedPrCount} merged)`,
|
|
166
|
+
`Cumulative transforms: ${summary.transforms}`,
|
|
167
|
+
`Budget: ${summary.budgetUsed}/${summary.budgetCap} used`,
|
|
168
|
+
`Total tokens consumed: ${summary.totalTokens}`,
|
|
169
|
+
`Mission complete: ${summary.missionComplete ? "YES" : "NO"}`,
|
|
170
|
+
`Mission failed: ${summary.missionFailed ? "YES" : "NO"}`,
|
|
171
|
+
"",
|
|
172
|
+
"## Data Files Available",
|
|
173
|
+
"The following files contain the full mechanical data. Use `read_file` to examine them:",
|
|
174
|
+
`- ${REPORT_DATA_DIR}/mission.md — MISSION.md with acceptance criteria`,
|
|
175
|
+
`- ${REPORT_DATA_DIR}/config.toml — Full agentic-lib.toml configuration`,
|
|
176
|
+
`- ${REPORT_DATA_DIR}/state.toml — Full agentic-lib-state.toml persistent state`,
|
|
177
|
+
`- ${REPORT_DATA_DIR}/workflow-runs.json — All ${summary.workflowRunCount} workflow runs with timing and URLs`,
|
|
178
|
+
`- ${REPORT_DATA_DIR}/commits.json — All ${summary.commitCount} commits with messages`,
|
|
179
|
+
`- ${REPORT_DATA_DIR}/issues.json — All ${summary.issueCount} issues with labels and state`,
|
|
180
|
+
`- ${REPORT_DATA_DIR}/pull-requests.json — All ${summary.prCount} PRs with branches and merge info`,
|
|
181
|
+
"",
|
|
182
|
+
"## Source Code and Tests",
|
|
183
|
+
"Use `read_file` and `list_files` to examine the actual source code and tests:",
|
|
184
|
+
"- `src/lib/main.js` — Main source file",
|
|
185
|
+
"- `tests/unit/` — Unit test directory",
|
|
186
|
+
"- `README.md` — Repository documentation",
|
|
187
|
+
"",
|
|
188
|
+
"## Your Task",
|
|
189
|
+
"1. Read the mission file to extract acceptance criteria",
|
|
190
|
+
"2. Read workflow-runs.json and pull-requests.json to build an iteration timeline",
|
|
191
|
+
"3. Read source code to verify each acceptance criterion (PASS/FAIL/NOT TESTED)",
|
|
192
|
+
"4. Read issues to understand what work was done and identify any churn",
|
|
193
|
+
"5. Investigate any failures or anomalies you find — use get_issue for details",
|
|
194
|
+
"6. Call `report_analysis` exactly once with your structured findings",
|
|
195
|
+
"",
|
|
196
|
+
"**You MUST call report_analysis exactly once.**",
|
|
197
|
+
].join("\n");
|
|
278
198
|
}
|
|
279
199
|
|
|
280
200
|
/**
|
|
281
|
-
* Build the
|
|
201
|
+
* Build the final report markdown from the LLM analysis.
|
|
282
202
|
*/
|
|
283
|
-
function
|
|
284
|
-
periodStart, periodEnd, config, stateContent, configContent,
|
|
285
|
-
workflowRuns, commits, issues, prs, sourceFiles, testFiles, agentLogs,
|
|
286
|
-
missionContent, acceptanceCriteria, websiteInfo, hasScreenshot, repo,
|
|
287
|
-
}) {
|
|
288
|
-
const sections = [];
|
|
203
|
+
function buildReportMarkdown(summary, analysis, repo, model) {
|
|
289
204
|
const now = new Date().toISOString().split("T")[0];
|
|
205
|
+
const sections = [];
|
|
290
206
|
|
|
291
207
|
sections.push(`# Benchmark Report`);
|
|
292
208
|
sections.push(``);
|
|
293
209
|
sections.push(`**Date**: ${now}`);
|
|
294
210
|
sections.push(`**Repository**: ${repo.owner}/${repo.repo}`);
|
|
295
|
-
sections.push(`**Period**: ${periodStart} → ${periodEnd}`);
|
|
296
|
-
sections.push(`**
|
|
211
|
+
sections.push(`**Period**: ${summary.periodStart} → ${summary.periodEnd}`);
|
|
212
|
+
sections.push(`**Model**: ${model}`);
|
|
297
213
|
sections.push(``);
|
|
298
214
|
sections.push(`---`);
|
|
299
215
|
|
|
300
|
-
//
|
|
301
|
-
sections.push(``);
|
|
302
|
-
sections.push(`## Mission`);
|
|
303
|
-
sections.push(``);
|
|
304
|
-
sections.push("```");
|
|
305
|
-
sections.push(missionContent || "(no MISSION.md found)");
|
|
306
|
-
sections.push("```");
|
|
307
|
-
|
|
308
|
-
if (acceptanceCriteria.length > 0) {
|
|
309
|
-
sections.push(``);
|
|
310
|
-
sections.push(`### Extracted Acceptance Criteria`);
|
|
311
|
-
sections.push(``);
|
|
312
|
-
for (let i = 0; i < acceptanceCriteria.length; i++) {
|
|
313
|
-
sections.push(`${i + 1}. ${acceptanceCriteria[i]}`);
|
|
314
|
-
}
|
|
315
|
-
}
|
|
316
|
-
|
|
317
|
-
// ── Configuration snapshot ──
|
|
216
|
+
// Summary
|
|
318
217
|
sections.push(``);
|
|
319
|
-
sections.push(`##
|
|
218
|
+
sections.push(`## Summary`);
|
|
320
219
|
sections.push(``);
|
|
321
|
-
sections.push("
|
|
322
|
-
sections.push(configContent || "(not found)");
|
|
323
|
-
sections.push("```");
|
|
220
|
+
sections.push(analysis.summary || "(no summary provided)");
|
|
324
221
|
|
|
325
|
-
//
|
|
222
|
+
// Configuration
|
|
326
223
|
sections.push(``);
|
|
327
|
-
sections.push(
|
|
328
|
-
sections.push(``);
|
|
329
|
-
sections.push("```toml");
|
|
330
|
-
sections.push(stateContent || "(not found)");
|
|
331
|
-
sections.push("```");
|
|
332
|
-
|
|
333
|
-
// ── Mission status ──
|
|
224
|
+
sections.push(`---`);
|
|
334
225
|
sections.push(``);
|
|
335
|
-
sections.push(`##
|
|
226
|
+
sections.push(`## Configuration`);
|
|
336
227
|
sections.push(``);
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
sections.push(`|
|
|
340
|
-
sections.push(
|
|
341
|
-
sections.push(`|
|
|
342
|
-
sections.push(`|
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
if (
|
|
228
|
+
sections.push(`| Parameter | Value |`);
|
|
229
|
+
sections.push(`|-----------|-------|`);
|
|
230
|
+
sections.push(`| Mission complete | ${summary.missionComplete ? "YES" : "NO"} |`);
|
|
231
|
+
sections.push(`| Mission failed | ${summary.missionFailed ? "YES" : "NO"} |`);
|
|
232
|
+
sections.push(`| Transforms | ${summary.transforms} |`);
|
|
233
|
+
sections.push(`| Budget | ${summary.budgetUsed}/${summary.budgetCap} |`);
|
|
234
|
+
sections.push(`| Total tokens | ${summary.totalTokens} |`);
|
|
235
|
+
sections.push(`| Workflow runs | ${summary.workflowRunCount} |`);
|
|
236
|
+
sections.push(`| Commits | ${summary.commitCount} |`);
|
|
237
|
+
sections.push(`| PRs merged | ${summary.mergedPrCount} |`);
|
|
238
|
+
sections.push(`| Issues (open/closed) | ${summary.openIssueCount}/${summary.closedIssueCount} |`);
|
|
239
|
+
|
|
240
|
+
// Iteration narrative
|
|
241
|
+
if (analysis.iteration_narrative) {
|
|
351
242
|
sections.push(``);
|
|
352
|
-
sections.push(
|
|
353
|
-
sections.push(readFileSync("MISSION_FAILED.md", "utf8").trim());
|
|
354
|
-
sections.push("```");
|
|
355
|
-
}
|
|
356
|
-
|
|
357
|
-
// ── Workflow runs (iteration timeline) ──
|
|
358
|
-
sections.push(``);
|
|
359
|
-
sections.push(`## Workflow Runs (${workflowRuns.length})`);
|
|
360
|
-
sections.push(``);
|
|
361
|
-
sections.push(`| # | Name | Conclusion | Started | Duration | URL |`);
|
|
362
|
-
sections.push(`|---|------|------------|---------|----------|-----|`);
|
|
363
|
-
for (let i = 0; i < workflowRuns.length; i++) {
|
|
364
|
-
const r = workflowRuns[i];
|
|
365
|
-
const startMs = new Date(r.created_at).getTime();
|
|
366
|
-
const endMs = new Date(r.updated_at).getTime();
|
|
367
|
-
const durationMin = Math.round((endMs - startMs) / 60000);
|
|
368
|
-
sections.push(`| ${i + 1} | ${r.name} | ${r.conclusion || r.status} | ${r.created_at} | ~${durationMin}min | [${r.id}](${r.html_url}) |`);
|
|
369
|
-
}
|
|
370
|
-
|
|
371
|
-
// ── Pull Requests (transformation evidence) ──
|
|
372
|
-
sections.push(``);
|
|
373
|
-
sections.push(`## Pull Requests (${prs.length})`);
|
|
374
|
-
sections.push(``);
|
|
375
|
-
sections.push(`| # | Branch | Title | Merged | +/- | Files |`);
|
|
376
|
-
sections.push(`|---|--------|-------|--------|-----|-------|`);
|
|
377
|
-
for (const p of prs) {
|
|
378
|
-
sections.push(`| #${p.number} | ${p.branch} | ${p.title} | ${p.merged_at || "open"} | +${p.additions}/-${p.deletions} | ${p.changed_files} |`);
|
|
379
|
-
}
|
|
380
|
-
|
|
381
|
-
// ── Commits timeline ──
|
|
382
|
-
sections.push(``);
|
|
383
|
-
sections.push(`## Commits Timeline (${commits.length})`);
|
|
384
|
-
sections.push(``);
|
|
385
|
-
sections.push(`| SHA | Date | Author | Message |`);
|
|
386
|
-
sections.push(`|-----|------|--------|---------|`);
|
|
387
|
-
for (const c of commits) {
|
|
388
|
-
sections.push(`| ${c.sha} | ${c.date} | ${c.author} | ${c.message} |`);
|
|
389
|
-
}
|
|
390
|
-
|
|
391
|
-
// ── Issues ──
|
|
392
|
-
sections.push(``);
|
|
393
|
-
sections.push(`## Issues (${issues.length})`);
|
|
394
|
-
sections.push(``);
|
|
395
|
-
sections.push(`| # | State | Labels | Title |`);
|
|
396
|
-
sections.push(`|---|-------|--------|-------|`);
|
|
397
|
-
for (const i of issues) {
|
|
398
|
-
sections.push(`| #${i.number} | ${i.state} | ${i.labels || "-"} | ${i.title} |`);
|
|
399
|
-
}
|
|
400
|
-
|
|
401
|
-
// ── Source Code (full contents) ──
|
|
402
|
-
sections.push(``);
|
|
403
|
-
sections.push(`## Source Code (${sourceFiles.length} files)`);
|
|
404
|
-
for (const s of sourceFiles) {
|
|
243
|
+
sections.push(`---`);
|
|
405
244
|
sections.push(``);
|
|
406
|
-
sections.push(
|
|
245
|
+
sections.push(`## Timeline`);
|
|
407
246
|
sections.push(``);
|
|
408
|
-
sections.push(
|
|
409
|
-
sections.push(s.content);
|
|
410
|
-
sections.push("```");
|
|
247
|
+
sections.push(analysis.iteration_narrative);
|
|
411
248
|
}
|
|
412
249
|
|
|
413
|
-
//
|
|
414
|
-
|
|
415
|
-
sections.push(`## Test Files (${testFiles.length} files)`);
|
|
416
|
-
for (const t of testFiles) {
|
|
250
|
+
// Acceptance criteria
|
|
251
|
+
if (analysis.acceptance_criteria?.length) {
|
|
417
252
|
sections.push(``);
|
|
418
|
-
sections.push(
|
|
253
|
+
sections.push(`---`);
|
|
419
254
|
sections.push(``);
|
|
420
|
-
sections.push(
|
|
421
|
-
sections.push(t.content);
|
|
422
|
-
sections.push("```");
|
|
423
|
-
}
|
|
424
|
-
|
|
425
|
-
// ── Website & Screenshot ──
|
|
426
|
-
sections.push(``);
|
|
427
|
-
sections.push(`## Website & Screenshot`);
|
|
428
|
-
sections.push(``);
|
|
429
|
-
sections.push(`**Screenshot**: ${hasScreenshot ? "SCREENSHOT_INDEX.png captured (see artifacts)" : "not available"}`);
|
|
430
|
-
sections.push(``);
|
|
431
|
-
if (websiteInfo) {
|
|
432
|
-
sections.push(`**Website** (${websiteInfo.rawLength} bytes, ${websiteInfo.hasContent ? "has content" : "minimal content"}):`);
|
|
255
|
+
sections.push(`## Acceptance Criteria`);
|
|
433
256
|
sections.push(``);
|
|
434
|
-
sections.push(
|
|
435
|
-
sections.push(
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
257
|
+
sections.push(`| Criterion | Status | Evidence |`);
|
|
258
|
+
sections.push(`|-----------|--------|----------|`);
|
|
259
|
+
for (const ac of analysis.acceptance_criteria) {
|
|
260
|
+
sections.push(`| ${ac.criterion} | ${ac.status} | ${ac.evidence} |`);
|
|
261
|
+
}
|
|
439
262
|
}
|
|
440
263
|
|
|
441
|
-
//
|
|
442
|
-
|
|
443
|
-
sections.push(`## Agent Logs (${agentLogs.length} files)`);
|
|
444
|
-
for (const log of agentLogs) {
|
|
264
|
+
// Findings
|
|
265
|
+
if (analysis.findings?.length) {
|
|
445
266
|
sections.push(``);
|
|
446
|
-
sections.push(
|
|
267
|
+
sections.push(`---`);
|
|
447
268
|
sections.push(``);
|
|
448
|
-
sections.push(
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
269
|
+
sections.push(`## Findings`);
|
|
270
|
+
for (const f of analysis.findings) {
|
|
271
|
+
sections.push(``);
|
|
272
|
+
sections.push(`### ${f.id}: ${f.title} (${f.severity})`);
|
|
273
|
+
sections.push(``);
|
|
274
|
+
sections.push(f.description);
|
|
275
|
+
}
|
|
452
276
|
}
|
|
453
277
|
|
|
454
|
-
//
|
|
455
|
-
|
|
456
|
-
if (readmeContent) {
|
|
278
|
+
// Recommendations
|
|
279
|
+
if (analysis.recommendations?.length) {
|
|
457
280
|
sections.push(``);
|
|
458
|
-
sections.push(
|
|
281
|
+
sections.push(`---`);
|
|
459
282
|
sections.push(``);
|
|
460
|
-
|
|
461
|
-
sections.push(
|
|
462
|
-
|
|
283
|
+
sections.push(`## Recommendations`);
|
|
284
|
+
sections.push(``);
|
|
285
|
+
for (let i = 0; i < analysis.recommendations.length; i++) {
|
|
286
|
+
sections.push(`${i + 1}. ${analysis.recommendations[i]}`);
|
|
287
|
+
}
|
|
463
288
|
}
|
|
464
289
|
|
|
465
290
|
sections.push(``);
|
|
466
291
|
return sections.join("\n");
|
|
467
292
|
}
|
|
468
293
|
|
|
294
|
+
/**
|
|
295
|
+
* Build a mechanical-only fallback report (no LLM).
|
|
296
|
+
*/
|
|
297
|
+
function buildFallbackReport(summary, repo) {
|
|
298
|
+
const now = new Date().toISOString().split("T")[0];
|
|
299
|
+
return [
|
|
300
|
+
`# Benchmark Report`,
|
|
301
|
+
``,
|
|
302
|
+
`**Date**: ${now}`,
|
|
303
|
+
`**Repository**: ${repo.owner}/${repo.repo}`,
|
|
304
|
+
`**Period**: ${summary.periodStart} → ${summary.periodEnd}`,
|
|
305
|
+
`**Generated by**: agentic-lib-report (mechanical — no LLM enrichment)`,
|
|
306
|
+
``,
|
|
307
|
+
`---`,
|
|
308
|
+
``,
|
|
309
|
+
`## Summary`,
|
|
310
|
+
``,
|
|
311
|
+
`| Metric | Value |`,
|
|
312
|
+
`|--------|-------|`,
|
|
313
|
+
`| Mission complete | ${summary.missionComplete ? "YES" : "NO"} |`,
|
|
314
|
+
`| Mission failed | ${summary.missionFailed ? "YES" : "NO"} |`,
|
|
315
|
+
`| Transforms | ${summary.transforms} |`,
|
|
316
|
+
`| Budget | ${summary.budgetUsed}/${summary.budgetCap} |`,
|
|
317
|
+
`| Total tokens | ${summary.totalTokens} |`,
|
|
318
|
+
`| Workflow runs | ${summary.workflowRunCount} |`,
|
|
319
|
+
`| Commits | ${summary.commitCount} |`,
|
|
320
|
+
`| PRs merged | ${summary.mergedPrCount} |`,
|
|
321
|
+
`| Issues (open/closed) | ${summary.openIssueCount}/${summary.closedIssueCount} |`,
|
|
322
|
+
``,
|
|
323
|
+
`> This report contains only mechanical data. LLM enrichment was not available.`,
|
|
324
|
+
`> For a full report with findings and acceptance criteria verification, ensure COPILOT_GITHUB_TOKEN is set.`,
|
|
325
|
+
``,
|
|
326
|
+
].join("\n");
|
|
327
|
+
}
|
|
328
|
+
|
|
469
329
|
/**
|
|
470
330
|
* Report task handler.
|
|
471
331
|
*/
|
|
@@ -483,202 +343,109 @@ export async function report(context) {
|
|
|
483
343
|
}
|
|
484
344
|
core.info(`Report period: ${periodStart} → ${periodEnd}`);
|
|
485
345
|
|
|
486
|
-
//
|
|
487
|
-
const
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
// Gather data from GitHub API
|
|
494
|
-
const workflowRuns = await listWorkflowRuns(octokit, owner, repoName, periodStart, periodEnd);
|
|
495
|
-
const commits = await listCommits(octokit, owner, repoName, periodStart, periodEnd);
|
|
496
|
-
const issues = await listIssues(octokit, owner, repoName, periodStart);
|
|
497
|
-
const prs = await listPullRequests(octokit, owner, repoName, periodStart);
|
|
498
|
-
|
|
499
|
-
// Gather local data: full file contents, not just stats
|
|
500
|
-
const sourceFiles = readSourceFiles(config);
|
|
501
|
-
const testFiles = readTestFiles();
|
|
502
|
-
const logPrefix = config.intentionBot?.logPrefix || "agent-log-";
|
|
503
|
-
const agentLogs = readAgentLogs(logPrefix);
|
|
504
|
-
const websiteInfo = readWebsiteHtml();
|
|
505
|
-
const hasScreenshot = existsSync("SCREENSHOT_INDEX.png");
|
|
506
|
-
|
|
507
|
-
core.info(`Gathered: ${workflowRuns.length} runs, ${commits.length} commits, ${issues.length} issues, ${prs.length} PRs, ${sourceFiles.length} source files, ${testFiles.length} test files, ${agentLogs.length} logs`);
|
|
508
|
-
|
|
509
|
-
// Build mechanical report
|
|
510
|
-
const mechanicalReport = buildMechanicalReport({
|
|
511
|
-
periodStart, periodEnd, config, stateContent, configContent,
|
|
512
|
-
workflowRuns, commits, issues, prs, sourceFiles, testFiles, agentLogs,
|
|
513
|
-
missionContent, acceptanceCriteria, websiteInfo, hasScreenshot, repo,
|
|
514
|
-
});
|
|
515
|
-
|
|
516
|
-
// Optional LLM enrichment (if Copilot token available)
|
|
517
|
-
let enrichedAnalysis = null;
|
|
346
|
+
// Gather all data and write to filesystem
|
|
347
|
+
const summary = await gatherAndWriteData(octokit, owner, repoName, periodStart, periodEnd, config);
|
|
348
|
+
core.info(`Gathered: ${summary.workflowRunCount} runs, ${summary.commitCount} commits, ${summary.issueCount} issues, ${summary.prCount} PRs`);
|
|
349
|
+
|
|
350
|
+
// LLM enrichment (required for a proper report)
|
|
351
|
+
let analysis = null;
|
|
518
352
|
let tokensUsed = 0;
|
|
519
353
|
let resultModel = model;
|
|
354
|
+
|
|
520
355
|
if (process.env.COPILOT_GITHUB_TOKEN) {
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
},
|
|
543
|
-
required: ["id", "title", "severity", "description"],
|
|
544
|
-
},
|
|
545
|
-
},
|
|
546
|
-
recommendations: { type: "array", items: { type: "string" } },
|
|
547
|
-
acceptance_criteria: {
|
|
548
|
-
type: "array",
|
|
549
|
-
items: {
|
|
550
|
-
type: "object",
|
|
551
|
-
properties: {
|
|
552
|
-
criterion: { type: "string" },
|
|
553
|
-
status: { type: "string", enum: ["PASS", "FAIL", "NOT TESTED"] },
|
|
554
|
-
evidence: { type: "string" },
|
|
555
|
-
},
|
|
556
|
-
required: ["criterion", "status", "evidence"],
|
|
356
|
+
const agentInstructions = context.instructions || "You are a benchmark analyst. Investigate the data and produce findings.";
|
|
357
|
+
|
|
358
|
+
const createTools = (defineTool, _wp, logger) => {
|
|
359
|
+
const ghTools = createGitHubTools(octokit, owner, repoName, defineTool, logger);
|
|
360
|
+
const gitTools = createGitTools(defineTool, logger);
|
|
361
|
+
|
|
362
|
+
const reportTool = defineTool("report_analysis", {
|
|
363
|
+
description: "Record your benchmark analysis. Call exactly once with structured findings, acceptance criteria verification, timeline narrative, and recommendations.",
|
|
364
|
+
parameters: {
|
|
365
|
+
type: "object",
|
|
366
|
+
properties: {
|
|
367
|
+
summary: { type: "string", description: "Executive summary — what happened in this benchmark period, key outcomes" },
|
|
368
|
+
iteration_narrative: { type: "string", description: "Prose timeline: for each significant event, what happened, what changed, which PRs were created/merged" },
|
|
369
|
+
acceptance_criteria: {
|
|
370
|
+
type: "array",
|
|
371
|
+
items: {
|
|
372
|
+
type: "object",
|
|
373
|
+
properties: {
|
|
374
|
+
criterion: { type: "string" },
|
|
375
|
+
status: { type: "string", enum: ["PASS", "FAIL", "NOT TESTED"] },
|
|
376
|
+
evidence: { type: "string", description: "Specific file:line, function name, issue number, or test name" },
|
|
557
377
|
},
|
|
378
|
+
required: ["criterion", "status", "evidence"],
|
|
558
379
|
},
|
|
559
|
-
|
|
560
|
-
|
|
380
|
+
description: "Each acceptance criterion from MISSION.md verified against actual source code",
|
|
381
|
+
},
|
|
382
|
+
findings: {
|
|
383
|
+
type: "array",
|
|
384
|
+
items: {
|
|
561
385
|
type: "object",
|
|
562
386
|
properties: {
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
final_test_count: { type: "number" },
|
|
568
|
-
acceptance_pass_count: { type: "string", description: "e.g. '7/8 PASS'" },
|
|
569
|
-
total_tokens: { type: "number" },
|
|
387
|
+
id: { type: "string", description: "FINDING-N" },
|
|
388
|
+
title: { type: "string" },
|
|
389
|
+
severity: { type: "string", enum: ["POSITIVE", "CONCERN", "CRITICAL", "REGRESSION", "OBSERVATION"] },
|
|
390
|
+
description: { type: "string", description: "Include specific evidence: run IDs, issue numbers, file paths" },
|
|
570
391
|
},
|
|
392
|
+
required: ["id", "title", "severity", "description"],
|
|
571
393
|
},
|
|
572
394
|
},
|
|
573
|
-
|
|
574
|
-
},
|
|
575
|
-
handler: async (args) => {
|
|
576
|
-
enrichedAnalysis = args;
|
|
577
|
-
return "Analysis recorded.";
|
|
395
|
+
recommendations: { type: "array", items: { type: "string" } },
|
|
578
396
|
},
|
|
397
|
+
required: ["summary", "findings", "recommendations"],
|
|
579
398
|
},
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
"3. Read issue bodies (use get_issue) to understand what work was requested and completed.",
|
|
589
|
-
"4. Produce a narrative timeline: for each iteration, what happened, what changed, what the agent decided.",
|
|
590
|
-
"5. Assess code quality by reading the source — is it clean, correct, well-tested?",
|
|
591
|
-
"",
|
|
592
|
-
"=== MECHANICAL DATA ===",
|
|
593
|
-
mechanicalReport,
|
|
594
|
-
].join("\n");
|
|
399
|
+
handler: async (args) => {
|
|
400
|
+
analysis = args;
|
|
401
|
+
return { textResultForLlm: "Analysis recorded. The report will be generated from your findings." };
|
|
402
|
+
},
|
|
403
|
+
});
|
|
404
|
+
|
|
405
|
+
return [...ghTools, ...gitTools, reportTool];
|
|
406
|
+
};
|
|
595
407
|
|
|
408
|
+
const prompt = buildPrompt(summary, agentInstructions, repo);
|
|
409
|
+
|
|
410
|
+
const systemPrompt =
|
|
411
|
+
"You are a benchmark analyst for an autonomous coding pipeline. " +
|
|
412
|
+
"Your job is to investigate the gathered data using tools, verify acceptance criteria by reading source code, " +
|
|
413
|
+
"trace the transformation timeline from workflow runs to PRs to commits, " +
|
|
414
|
+
"and produce structured findings with specific evidence. " +
|
|
415
|
+
"Use read_file to examine source code, tests, and data files. " +
|
|
416
|
+
"Use list_issues and get_issue to understand work done. " +
|
|
417
|
+
"Call report_analysis exactly once with your complete analysis." +
|
|
418
|
+
NARRATIVE_INSTRUCTION;
|
|
419
|
+
|
|
420
|
+
try {
|
|
596
421
|
const result = await runCopilotSession({
|
|
597
|
-
workspacePath:
|
|
422
|
+
workspacePath: process.cwd(),
|
|
598
423
|
model: model || config.model || "gpt-5-mini",
|
|
599
424
|
tuning: config.tuning || {},
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
425
|
+
agentPrompt: systemPrompt,
|
|
426
|
+
userPrompt: prompt,
|
|
427
|
+
writablePaths: [],
|
|
428
|
+
createTools,
|
|
429
|
+
excludedTools: ["write_file", "run_command", "run_tests"],
|
|
430
|
+
logger: { info: core.info, warning: core.warning, error: core.error, debug: core.debug },
|
|
604
431
|
});
|
|
605
|
-
tokensUsed = result.
|
|
432
|
+
tokensUsed = result.tokensIn + result.tokensOut;
|
|
606
433
|
resultModel = result.model || model;
|
|
434
|
+
core.info(`Report LLM session completed: ${tokensUsed} tokens`);
|
|
607
435
|
} catch (err) {
|
|
608
|
-
core.warning(`LLM enrichment failed
|
|
436
|
+
core.warning(`LLM enrichment failed: ${err.message}`);
|
|
609
437
|
}
|
|
610
438
|
}
|
|
611
439
|
|
|
612
|
-
//
|
|
613
|
-
let finalReport
|
|
614
|
-
if (
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
enrichedSections.push(``);
|
|
619
|
-
enrichedSections.push(`## Analysis (LLM-enriched)`);
|
|
620
|
-
enrichedSections.push(``);
|
|
621
|
-
enrichedSections.push(enrichedAnalysis.summary || "");
|
|
622
|
-
enrichedSections.push(``);
|
|
623
|
-
|
|
624
|
-
if (enrichedAnalysis.iteration_narrative) {
|
|
625
|
-
enrichedSections.push(`### Iteration Narrative`);
|
|
626
|
-
enrichedSections.push(``);
|
|
627
|
-
enrichedSections.push(enrichedAnalysis.iteration_narrative);
|
|
628
|
-
enrichedSections.push(``);
|
|
629
|
-
}
|
|
630
|
-
|
|
631
|
-
if (enrichedAnalysis.acceptance_criteria?.length) {
|
|
632
|
-
enrichedSections.push(`### Acceptance Criteria`);
|
|
633
|
-
enrichedSections.push(``);
|
|
634
|
-
enrichedSections.push(`| Criterion | Status | Evidence |`);
|
|
635
|
-
enrichedSections.push(`|-----------|--------|----------|`);
|
|
636
|
-
for (const ac of enrichedAnalysis.acceptance_criteria) {
|
|
637
|
-
enrichedSections.push(`| ${ac.criterion} | ${ac.status} | ${ac.evidence} |`);
|
|
638
|
-
}
|
|
639
|
-
enrichedSections.push(``);
|
|
640
|
-
}
|
|
641
|
-
|
|
642
|
-
if (enrichedAnalysis.scenario_summary) {
|
|
643
|
-
const s = enrichedAnalysis.scenario_summary;
|
|
644
|
-
enrichedSections.push(`### Scenario Summary`);
|
|
645
|
-
enrichedSections.push(``);
|
|
646
|
-
enrichedSections.push(`| Metric | Value |`);
|
|
647
|
-
enrichedSections.push(`|--------|-------|`);
|
|
648
|
-
if (s.total_iterations != null) enrichedSections.push(`| Total iterations | ${s.total_iterations} |`);
|
|
649
|
-
if (s.transforms != null) enrichedSections.push(`| Transforms | ${s.transforms} |`);
|
|
650
|
-
if (s.convergence_iteration) enrichedSections.push(`| Convergence | Iteration ${s.convergence_iteration} |`);
|
|
651
|
-
if (s.final_source_lines) enrichedSections.push(`| Final source lines | ${s.final_source_lines} |`);
|
|
652
|
-
if (s.final_test_count) enrichedSections.push(`| Final test count | ${s.final_test_count} |`);
|
|
653
|
-
if (s.acceptance_pass_count) enrichedSections.push(`| Acceptance criteria | ${s.acceptance_pass_count} |`);
|
|
654
|
-
if (s.total_tokens) enrichedSections.push(`| Total tokens | ${s.total_tokens} |`);
|
|
655
|
-
enrichedSections.push(``);
|
|
656
|
-
}
|
|
657
|
-
|
|
658
|
-
if (enrichedAnalysis.findings?.length) {
|
|
659
|
-
enrichedSections.push(`### Findings`);
|
|
660
|
-
enrichedSections.push(``);
|
|
661
|
-
for (const f of enrichedAnalysis.findings) {
|
|
662
|
-
enrichedSections.push(`#### ${f.id}: ${f.title} (${f.severity})`);
|
|
663
|
-
enrichedSections.push(``);
|
|
664
|
-
enrichedSections.push(f.description);
|
|
665
|
-
enrichedSections.push(``);
|
|
666
|
-
}
|
|
667
|
-
}
|
|
668
|
-
|
|
669
|
-
if (enrichedAnalysis.recommendations?.length) {
|
|
670
|
-
enrichedSections.push(`### Recommendations`);
|
|
671
|
-
enrichedSections.push(``);
|
|
672
|
-
for (let i = 0; i < enrichedAnalysis.recommendations.length; i++) {
|
|
673
|
-
enrichedSections.push(`${i + 1}. ${enrichedAnalysis.recommendations[i]}`);
|
|
674
|
-
}
|
|
675
|
-
enrichedSections.push(``);
|
|
676
|
-
}
|
|
677
|
-
|
|
678
|
-
finalReport += enrichedSections.join("\n");
|
|
440
|
+
// Build final report
|
|
441
|
+
let finalReport;
|
|
442
|
+
if (analysis) {
|
|
443
|
+
finalReport = buildReportMarkdown(summary, analysis, repo, resultModel);
|
|
444
|
+
} else {
|
|
445
|
+
finalReport = buildFallbackReport(summary, repo);
|
|
679
446
|
}
|
|
680
447
|
|
|
681
|
-
const narrative = `Generated benchmark report for ${repo.owner}/${repo.repo}
|
|
448
|
+
const narrative = `Generated benchmark report for ${repo.owner}/${repo.repo}: ${summary.workflowRunCount} runs, ${summary.transforms} transforms, mission ${summary.missionComplete ? "complete" : summary.missionFailed ? "failed" : "in progress"}`;
|
|
682
449
|
|
|
683
450
|
return {
|
|
684
451
|
outcome: "report-generated",
|