@minhpnq1807/contextos 0.6.0 → 0.6.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +10 -0
- package/README.md +31 -1
- package/bin/ctx.js +21 -0
- package/docs/demo/agents-lost-middle.gif +0 -0
- package/docs/demo/agents-lost-middle.txt +25 -0
- package/docs/demo/capture-live-demos.mjs +76 -0
- package/docs/demo/contextos-ready.gif +0 -0
- package/docs/demo/contextos-ready.txt +14 -0
- package/docs/demo/render-terminal-gif.mjs +1 -1
- package/docs/demo/same-prompt-different-context.gif +0 -0
- package/docs/demo/same-prompt-different-context.txt +51 -0
- package/docs/launch-demos.md +22 -0
- package/eval/hallucination/run-agent-leaderboard.js +233 -0
- package/eval/hallucination/run-leaderboard.js +183 -0
- package/package.json +6 -1
- package/plugins/ctx/.codex-plugin/plugin.json +1 -1
- package/scripts/sync-community-skills.mjs +40 -0
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,15 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## 0.6.2
|
|
4
|
+
|
|
5
|
+
- **Live agent leaderboard:** Added `ctx leaderboard --agents codex,gemini` and `npm run leaderboard:agents` to run the hallucination benchmark through installed Codex/Gemini CLIs with timeouts and skip/error reporting for missing or unauthenticated agents.
|
|
6
|
+
- **Live GIF capture:** Added `npm run demo:capture` to regenerate the three launch GIFs from real local `ctx` command output across ContextOS and skill-routing fixture repos.
|
|
7
|
+
- **Community skills sync:** Added `scripts/sync-community-skills.mjs`, `npm run sync:community-skills`, and a scheduled/manual GitHub Action that opens PRs from `khovan123/contextOS-skills` back into `community-skills/`.
|
|
8
|
+
|
|
9
|
+
## 0.6.1
|
|
10
|
+
|
|
11
|
+
- **Hallucination Leaderboard:** Added `ctx leaderboard --hallucination` and `npm run leaderboard:hallucination` to compare raw prompt-only skill guesses against ContextOS evidence-routed skill selection across 20 fixture tasks.
|
|
12
|
+
|
|
3
13
|
## 0.6.0
|
|
4
14
|
|
|
5
15
|
- **Launch demo framing:** Added Agent Hallucination Benchmark messaging, same-prompt/same-model/different-context copy, and `docs/launch-demos.md` with three short demo scripts: hallucination benchmark, AGENTS.md lost-in-the-middle, and repo-aware skills.
|
package/README.md
CHANGED
|
@@ -28,7 +28,7 @@ Published package: [`@minhpnq1807/contextos`](https://www.npmjs.com/package/@min
|
|
|
28
28
|
|
|
29
29
|
## Demo
|
|
30
30
|
|
|
31
|
-

|
|
32
32
|
|
|
33
33
|
Same prompt. Same model. Different context.
|
|
34
34
|
|
|
@@ -42,6 +42,19 @@ ctx skills doctor -- "fix deployed"
|
|
|
42
42
|
| `vercel.json`, `next`, GitHub workflow | `vercel-deployment`, `github-actions-ci-cd`, `env-secret-management` |
|
|
43
43
|
| ContextOS repo with no app deploy evidence | no deployment skill selected |
|
|
44
44
|
|
|
45
|
+
More 10-second demos:
|
|
46
|
+
|
|
47
|
+
| Demo | GIF |
|
|
48
|
+
| --- | --- |
|
|
49
|
+
| AGENTS.md Lost In The Middle | [docs/demo/agents-lost-middle.gif](docs/demo/agents-lost-middle.gif) |
|
|
50
|
+
| ContextOS Ready Gold | [docs/demo/contextos-ready.gif](docs/demo/contextos-ready.gif) |
|
|
51
|
+
|
|
52
|
+
Regenerate the GIFs from real local `ctx` command output:
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
npm run demo:capture
|
|
56
|
+
```
|
|
57
|
+
|
|
45
58
|
## Agent Hallucination Benchmark
|
|
46
59
|
|
|
47
60
|
Generic agents often guess deployment tooling from the prompt alone:
|
|
@@ -80,6 +93,19 @@ Skill Router internal fixture benchmark:
|
|
|
80
93
|
|
|
81
94
|
This is an internal fixture benchmark, not an external real-world benchmark. It is designed to prove the router behavior across controlled Expo/EAS, Next/Vercel, Docker, Railway/Render, Firebase, auth, database, testing, mobile, and adversarial negative-gate cases.
|
|
82
95
|
|
|
96
|
+
Hallucination leaderboard:
|
|
97
|
+
|
|
98
|
+
```bash
|
|
99
|
+
ctx leaderboard --hallucination
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
Current local result across 20 fixture tasks and 12 repo contexts:
|
|
103
|
+
|
|
104
|
+
| System | Correct Skill |
|
|
105
|
+
| --- | ---: |
|
|
106
|
+
| Raw Agent | 10.0% |
|
|
107
|
+
| ContextOS + Codex | 80.0% |
|
|
108
|
+
|
|
83
109
|
Example hook context injected before the agent works:
|
|
84
110
|
|
|
85
111
|
```text
|
|
@@ -246,6 +272,8 @@ The score checks project `AGENTS.md` rules, project skill packs under `.codex/sk
|
|
|
246
272
|
| `ctx stats` | Show workspace-level usage and effectiveness metrics. |
|
|
247
273
|
| `ctx benchmark -- "task"` | Compare raw AGENTS.md ordering vs ContextOS scheduling. |
|
|
248
274
|
| `ctx benchmark --skills` | Run the Skill Router eval benchmark. |
|
|
275
|
+
| `ctx leaderboard --hallucination` | Compare raw prompt-only guesses vs ContextOS routing. |
|
|
276
|
+
| `ctx leaderboard --agents codex,gemini` | Run the live CLI leaderboard when Codex/Gemini credentials are available. |
|
|
249
277
|
| `ctx sync --rules` | Sync AGENTS/Ruler/MCP config across agents. |
|
|
250
278
|
| `ctx sync --skills` | Sync skills across agents through skillshare. |
|
|
251
279
|
| `ctx sync --workflows` | Sync workflow markdown across Claude/Codex/Antigravity. |
|
|
@@ -570,6 +598,8 @@ This warning comes from a transitive dependency in the local embedding/WASM stac
|
|
|
570
598
|
| `ctx stats` | Shows aggregate runtime metrics for the current workspace. | You want to know whether ContextOS is active and useful over time. | Prints sectioned tables for prompt/report counts, injection rate, efficiency, rule outcomes, hook events, last prompt, and last report. |
|
|
571
599
|
| `ctx benchmark -- "task"` | Compares baseline AGENTS.md ordering with ContextOS task-aware scheduling. | You want a before/after signal for lost-in-the-middle risk. | Prints tables for parsed/actionable/filtered rules, baseline middle-risk, scheduled high/mid rules, recency reminder status, and top scored rules. |
|
|
572
600
|
| `ctx benchmark --skills` | Runs the Skill Router eval benchmark. | You want evidence for skill routing accuracy and negative gates. | Prints top-1 accuracy, top-3 recall, false positive rate, confidence calibration, and negative gate accuracy across `eval/skill-routing` fixtures. |
|
|
601
|
+
| `ctx leaderboard --hallucination` | Compares raw prompt-only skill guesses with ContextOS evidence routing. | You want launch evidence for the hallucination problem. | Runs 20 fixture tasks across 12 repo contexts and prints Raw Agent vs ContextOS correctness plus sample failures. |
|
|
602
|
+
| `ctx leaderboard --agents codex,gemini` | Runs the same benchmark shape through installed agent CLIs. | You want real agent output instead of the deterministic raw baseline. | Calls `codex exec` in read-only mode and the local Gemini CLI with timeouts; missing or unauthenticated CLIs are reported as skipped/errors instead of blocking. |
|
|
573
603
|
| `ctx sync --rules` | Syncs project rules and MCP servers through Ruler. | You want Codex, Claude Code, and Antigravity to share one project rule/MCP source of truth. | Ensures `.ruler/ruler.toml`, injects `ctx-mcp`, imports existing MCP servers from Codex and project `.mcp.json`, runs `ruler apply --agents codex,claude,antigravity`, mirrors MCP servers to Antigravity MCP configs, and verifies generated config. |
|
|
574
604
|
| `ctx sync --rules --agents <list>` | Syncs only selected agents through Ruler. | You want to update one or two agents without touching the others. | Accepts comma-separated values such as `codex`, `claude`, `agy`, `antigravity`, or `codex,claude,agy`; `agy` is normalized to Ruler's `antigravity`. |
|
|
575
605
|
| `ctx sync --rules --dry-run` | Previews Ruler sync without writing files or running apply. | You want to inspect behavior before changing project config. | Prints the same flow with dry-run status. |
|
package/bin/ctx.js
CHANGED
|
@@ -20,6 +20,8 @@ import { defaultDataRoot, workspaceDataDir, workspaceMarkerPath } from "../plugi
|
|
|
20
20
|
import { installMcpTelemetryProxies } from "../plugins/ctx/lib/mcp-proxy-install.js";
|
|
21
21
|
import { benchmarkWorkspace, formatBenchmark } from "../plugins/ctx/lib/benchmark.js";
|
|
22
22
|
import { formatSkillRoutingBenchmark, runSkillRoutingEval } from "../eval/skill-routing/run-eval.js";
|
|
23
|
+
import { formatHallucinationLeaderboard, runHallucinationLeaderboard } from "../eval/hallucination/run-leaderboard.js";
|
|
24
|
+
import { formatAgentLeaderboard, runAgentLeaderboard } from "../eval/hallucination/run-agent-leaderboard.js";
|
|
23
25
|
import { copyDir, copyPackageRoot, syncPackageRoot } from "../plugins/ctx/lib/package-install.js";
|
|
24
26
|
import { installClaudeHooks } from "../plugins/ctx/lib/claude-hooks.js";
|
|
25
27
|
import { installClaudeMcp } from "../plugins/ctx/lib/claude-mcp.js";
|
|
@@ -197,6 +199,8 @@ Usage:
|
|
|
197
199
|
ctx stats Show workspace statistics
|
|
198
200
|
ctx benchmark -- "task" Benchmark workspace for a task
|
|
199
201
|
ctx benchmark --skills Run skill routing eval benchmark
|
|
202
|
+
ctx leaderboard --hallucination Compare raw agent guesses vs ContextOS routing
|
|
203
|
+
ctx leaderboard --agents codex,gemini Run live CLI leaderboard for installed agents
|
|
200
204
|
ctx sync --rules Sync AGENTS.md rules to all agents
|
|
201
205
|
ctx sync --rules --agents <names> Sync rules to specific agents only
|
|
202
206
|
ctx sync --rules --dry-run Preview rule sync without writing
|
|
@@ -1034,6 +1038,23 @@ try {
|
|
|
1034
1038
|
if (!task.trim()) throw new Error('Usage: ctx benchmark -- "task"');
|
|
1035
1039
|
console.log(formatBenchmark(benchmarkWorkspace({ cwd: process.cwd(), task })));
|
|
1036
1040
|
}
|
|
1041
|
+
} else if (command === "leaderboard") {
|
|
1042
|
+
if (args.includes("--hallucination")) {
|
|
1043
|
+
console.log(formatHallucinationLeaderboard(await runHallucinationLeaderboard({ rootDir })));
|
|
1044
|
+
} else if (args.includes("--agents")) {
|
|
1045
|
+
const index = args.indexOf("--agents");
|
|
1046
|
+
const agents = String(args[index + 1] || "").split(",").map((agent) => agent.trim()).filter(Boolean);
|
|
1047
|
+
const limitIndex = args.indexOf("--limit");
|
|
1048
|
+
const timeoutIndex = args.indexOf("--timeout-ms");
|
|
1049
|
+
console.log(formatAgentLeaderboard(runAgentLeaderboard({
|
|
1050
|
+
rootDir,
|
|
1051
|
+
agents: agents.length ? agents : undefined,
|
|
1052
|
+
caseLimit: limitIndex >= 0 ? Number(args[limitIndex + 1]) : undefined,
|
|
1053
|
+
timeoutMs: timeoutIndex >= 0 ? Number(args[timeoutIndex + 1]) : undefined
|
|
1054
|
+
})));
|
|
1055
|
+
} else {
|
|
1056
|
+
throw new Error("Usage: ctx leaderboard --hallucination OR ctx leaderboard --agents codex,gemini");
|
|
1057
|
+
}
|
|
1037
1058
|
} else if (command === "skills") {
|
|
1038
1059
|
if (args[1] === "doctor") {
|
|
1039
1060
|
const marker = args.indexOf("--");
|
|
Binary file
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
$ ctx benchmark -- "fix failing test"
|
|
2
|
+
ContextOS benchmark
|
|
3
|
+
|
|
4
|
+
Summary
|
|
5
|
+
-------
|
|
6
|
+
Metric Value
|
|
7
|
+
-------------------- -------------------------
|
|
8
|
+
Task fix failing test
|
|
9
|
+
Rules parsed 7
|
|
10
|
+
Actionable rules 5
|
|
11
|
+
Filtered rules 2
|
|
12
|
+
Relevant rules 1
|
|
13
|
+
Baseline middle-risk 1/1 relevant rules (100%)
|
|
14
|
+
ContextOS scheduled 1 high, 0 mid
|
|
15
|
+
Recency reminder enabled
|
|
16
|
+
|
|
17
|
+
Top Rules
|
|
18
|
+
---------
|
|
19
|
+
Score Rule Reasons
|
|
20
|
+
----- ---------------------------------------------------------------------------------------- ----------
|
|
21
|
+
0.50 IMPORTANT: This project has a knowledge graph. ALWAYS use code-review-graph MCP tools... imperative
|
|
22
|
+
0.00 AGENTS.md
|
|
23
|
+
0.00 Centralised AI agent instructions. Add coding guidelines, style guides, and project c...
|
|
24
|
+
0.00 Ruler concatenates all .md files in this directory (and subdirectories), starting wit...
|
|
25
|
+
0.00 Use `query_graph` pattern="tests_for" to check coverage.
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import { execFileSync } from "node:child_process";
|
|
3
|
+
import fs from "node:fs";
|
|
4
|
+
import path from "node:path";
|
|
5
|
+
import { fileURLToPath } from "node:url";
|
|
6
|
+
|
|
7
|
+
import { benchmarkWorkspace, formatBenchmark } from "../../plugins/ctx/lib/benchmark.js";
|
|
8
|
+
import { formatContextOSReady, inspectContextOSReady } from "../../plugins/ctx/lib/certification.js";
|
|
9
|
+
import { formatHallucinationLeaderboard, runHallucinationLeaderboard } from "../../eval/hallucination/run-leaderboard.js";
|
|
10
|
+
import { runSkillRoutingEval } from "../../eval/skill-routing/run-eval.js";
|
|
11
|
+
|
|
12
|
+
// Anchor every path to this script's own directory so the capture does not
// depend on the directory it is launched from.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
const repoRoot = path.resolve(__dirname, "..", "..");
const render = path.join(__dirname, "render-terminal-gif.mjs");

// Both evals run once, in this order; their results feed the transcripts below.
const leaderboard = await runHallucinationLeaderboard({ rootDir: repoRoot });
const skillEval = await runSkillRoutingEval({ rootDir: repoRoot });

// A demo is a transcript file, the GIF rendered from it, and an ordered list
// of [command label, command output] steps.
const defineDemo = (baseName, steps) => ({
  log: `${baseName}.txt`,
  gif: `${baseName}.gif`,
  steps
});

const demos = [
  defineDemo("same-prompt-different-context", [
    ["ctx leaderboard --hallucination", formatHallucinationLeaderboard(leaderboard)],
    ['ctx skills doctor -- "fix deployed" # Expo fixture', routeSummary("expo-eas", "fix deployed")],
    ['ctx skills doctor -- "fix deployed" # Next/Vercel fixture', routeSummary("next-vercel", "fix deployed")]
  ]),
  defineDemo("agents-lost-middle", [
    ['ctx benchmark -- "fix failing test"', formatBenchmark(benchmarkWorkspace({ cwd: repoRoot, task: "fix failing test" }))]
  ]),
  defineDemo("contextos-ready", [
    ["ctx doctor", formatContextOSReady(inspectContextOSReady({ cwd: repoRoot }))]
  ])
];

// Write each transcript next to this script, then hand it to the renderer,
// which is run as a child Node process with inherited stdio.
for (const { log, gif, steps } of demos) {
  const logPath = path.join(__dirname, log);
  const gifPath = path.join(__dirname, gif);
  // Each step contributes a "$ command" line, its cleaned output, and a blank separator.
  const transcript = steps
    .flatMap(([label, output]) => [`$ ${label}`, cleanOutput(output), ""])
    .join("\n")
    .trimEnd();
  fs.writeFileSync(logPath, `${transcript}\n`, "utf8");
  execFileSync(process.execPath, [render, logPath, gifPath], { cwd: repoRoot, stdio: "inherit" });
}
|
|
56
|
+
|
|
57
|
+
/**
 * Normalizes captured command output for a transcript.
 *
 * Falsy input becomes an empty string. CRLF and LF line breaks are both
 * treated as line separators, trailing whitespace is removed from every line,
 * and leading/trailing whitespace is trimmed from the whole text.
 *
 * @param {unknown} output - Raw output; coerced with String().
 * @returns {string} The cleaned text joined with "\n".
 */
function cleanOutput(output) {
  const text = String(output || "");
  const cleaned = [];
  for (const line of text.split(/\r?\n/)) {
    cleaned.push(line.trimEnd());
  }
  return cleaned.join("\n").trim();
}
|
|
64
|
+
|
|
65
|
+
/**
 * Builds a short text summary for one skill-routing eval row.
 *
 * Looks up the first row in the module-level `skillEval.rows` whose fixture
 * and prompt both match, and lists its selected, expected, and forbidden
 * skill IDs (the latter under the "rejected" label).
 *
 * @param {string} fixture - Fixture name to match against `row.fixture`.
 * @param {string} prompt - Prompt text to match against `row.prompt`.
 * @returns {string} Multi-line summary, or "No route found." when no row matches.
 */
function routeSummary(fixture, prompt) {
  const match = skillEval.rows.find((item) => item.fixture === fixture && item.prompt === prompt);
  if (!match) {
    return "No route found.";
  }
  // An empty ID list is shown as "(none)" rather than an empty string.
  const list = (ids) => ids.join(", ") || "(none)";
  const summary = [];
  summary.push("ContextOS skill doctor");
  summary.push(`fixture: ${fixture}`);
  summary.push(`prompt: ${prompt}`);
  summary.push(`selected: ${list(match.selectedIds)}`);
  summary.push(`expected: ${list(match.expected)}`);
  summary.push(`rejected: ${list(match.forbidden)}`);
  return summary.join("\n");
}
|
|
Binary file
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
$ ctx doctor
|
|
2
|
+
Repository Score
|
|
3
|
+
|
|
4
|
+
Rules: 100
|
|
5
|
+
Skills: 100
|
|
6
|
+
Workflows: 100
|
|
7
|
+
|
|
8
|
+
Overall:
|
|
9
|
+
ContextOS Ready Gold
|
|
10
|
+
|
|
11
|
+
Evidence:
|
|
12
|
+
- Rules: 1 AGENTS.md source(s), 5 actionable rule(s)
|
|
13
|
+
- Skills: 3 skill(s), 3 metadata file(s)
|
|
14
|
+
- Workflows: 2 workflow(s), 2 with agent chain(s)
|
|
@@ -35,7 +35,7 @@ for (let count = 1; count <= displayLines.length; count += frameStep) {
|
|
|
35
35
|
frames.push(writeFrame({ tmpDir, index: frames.length, lines: displayLines }));
|
|
36
36
|
|
|
37
37
|
fs.mkdirSync(path.dirname(outputPath), { recursive: true });
|
|
38
|
-
execFileSync("convert", ["-delay", "12", "-loop", "0", ...frames, outputPath], { stdio: "inherit" });
|
|
38
|
+
execFileSync("convert", ["-limit", "time", "120", "-delay", "12", "-loop", "0", ...frames, outputPath], { stdio: "inherit" });
|
|
39
39
|
console.log(`Wrote ${outputPath}`);
|
|
40
40
|
|
|
41
41
|
function writeFrame({ tmpDir, index, lines }) {
|
|
Binary file
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
$ ctx leaderboard --hallucination
|
|
2
|
+
Hallucination Leaderboard
|
|
3
|
+
Repos: 12
|
|
4
|
+
Tasks: 20
|
|
5
|
+
|
|
6
|
+
System Correct Skill
|
|
7
|
+
------------------ -------------
|
|
8
|
+
Raw Agent 10.0%
|
|
9
|
+
ContextOS + Codex 80.0%
|
|
10
|
+
|
|
11
|
+
Sample failures:
|
|
12
|
+
- expo-eas: "fix deployed"
|
|
13
|
+
expected: eas, mobile-deployment, github-actions-ci-cd
|
|
14
|
+
raw: eas, env-secret-management, railway-render-deployment ✗
|
|
15
|
+
contextos: eas, github-actions-ci-cd, mobile-deployment ✓
|
|
16
|
+
- next-vercel: "fix deployed"
|
|
17
|
+
expected: vercel-deployment, github-actions-ci-cd, env-secret-management
|
|
18
|
+
raw: eas, env-secret-management, railway-render-deployment ✗
|
|
19
|
+
contextos: vercel-deployment, github-actions-ci-cd, env-secret-management ✓
|
|
20
|
+
- docker-node: "docker image build failed"
|
|
21
|
+
expected: docker, build-log-debugging
|
|
22
|
+
raw: docker, build-log-debugging, github-actions-ci-cd ✗
|
|
23
|
+
contextos: build-log-debugging, docker ✓
|
|
24
|
+
- railway-render: "Railway deploy health check failed"
|
|
25
|
+
expected: railway-render-deployment, build-log-debugging
|
|
26
|
+
raw: railway-render-deployment, build-log-debugging, firebase-hosting ✗
|
|
27
|
+
contextos: build-log-debugging, railway-render-deployment ✓
|
|
28
|
+
- firebase-hosting: "deploy firebase hosting"
|
|
29
|
+
expected: firebase-hosting
|
|
30
|
+
raw: firebase-hosting, flutter-firebase, railway-render-deployment ✗
|
|
31
|
+
contextos: firebase-hosting ✓
|
|
32
|
+
- nest-prisma: "optimize slow prisma queries"
|
|
33
|
+
expected: prisma, nestjs-module
|
|
34
|
+
raw: prisma, nestjs-module, android-signing ✗
|
|
35
|
+
contextos: nestjs-module, prisma ✓
|
|
36
|
+
|
|
37
|
+
$ ctx skills doctor -- "fix deployed" # Expo fixture
|
|
38
|
+
ContextOS skill doctor
|
|
39
|
+
fixture: expo-eas
|
|
40
|
+
prompt: fix deployed
|
|
41
|
+
selected: eas, github-actions-ci-cd, mobile-deployment
|
|
42
|
+
expected: eas, mobile-deployment, github-actions-ci-cd
|
|
43
|
+
rejected: vercel-deployment
|
|
44
|
+
|
|
45
|
+
$ ctx skills doctor -- "fix deployed" # Next/Vercel fixture
|
|
46
|
+
ContextOS skill doctor
|
|
47
|
+
fixture: next-vercel
|
|
48
|
+
prompt: fix deployed
|
|
49
|
+
selected: vercel-deployment, github-actions-ci-cd, env-secret-management
|
|
50
|
+
expected: vercel-deployment, github-actions-ci-cd, env-secret-management
|
|
51
|
+
rejected: eas
|
package/docs/launch-demos.md
CHANGED
|
@@ -4,6 +4,8 @@ These are demo scripts for explaining ContextOS quickly. They are intentionally
|
|
|
4
4
|
|
|
5
5
|
## 1. Agent Hallucination Benchmark
|
|
6
6
|
|
|
7
|
+
GIF: [`docs/demo/same-prompt-different-context.gif`](demo/same-prompt-different-context.gif)
|
|
8
|
+
|
|
7
9
|
Prompt:
|
|
8
10
|
|
|
9
11
|
```text
|
|
@@ -39,6 +41,8 @@ Same prompt. Same model. Different context.
|
|
|
39
41
|
|
|
40
42
|
## 2. AGENTS.md Lost In The Middle
|
|
41
43
|
|
|
44
|
+
GIF: [`docs/demo/agents-lost-middle.gif`](demo/agents-lost-middle.gif)
|
|
45
|
+
|
|
42
46
|
Setup:
|
|
43
47
|
|
|
44
48
|
```text
|
|
@@ -71,6 +75,8 @@ Important repo rules should not depend on where they appear in a long file.
|
|
|
71
75
|
|
|
72
76
|
## 3. Repo-Aware Skills
|
|
73
77
|
|
|
78
|
+
GIF: [`docs/demo/same-prompt-different-context.gif`](demo/same-prompt-different-context.gif)
|
|
79
|
+
|
|
74
80
|
Prompt:
|
|
75
81
|
|
|
76
82
|
```text
|
|
@@ -103,3 +109,19 @@ Message:
|
|
|
103
109
|
```text
|
|
104
110
|
Context is not extra text. It changes the correct answer.
|
|
105
111
|
```
|
|
112
|
+
|
|
113
|
+
## 4. ContextOS Ready
|
|
114
|
+
|
|
115
|
+
GIF: [`docs/demo/contextos-ready.gif`](demo/contextos-ready.gif)
|
|
116
|
+
|
|
117
|
+
Command:
|
|
118
|
+
|
|
119
|
+
```bash
|
|
120
|
+
ctx doctor
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
Message:
|
|
124
|
+
|
|
125
|
+
```text
|
|
126
|
+
Repos now have a target: AGENTS.md + skills + workflows + evidence.
|
|
127
|
+
```
|
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import { execFileSync, spawnSync } from "node:child_process";
|
|
3
|
+
import fs from "node:fs";
|
|
4
|
+
import os from "node:os";
|
|
5
|
+
import path from "node:path";
|
|
6
|
+
import { fileURLToPath } from "node:url";
|
|
7
|
+
|
|
8
|
+
import { parseEvalYaml } from "../skill-routing/run-eval.js";
|
|
9
|
+
|
|
10
|
+
// Directory containing this module; the paths below are resolved from it.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
// Two levels up from this module's directory.
const repoRoot = path.resolve(__dirname, "../..");
// Sibling "skill-routing" directory, used for cases.yaml and fixtures.
const skillEvalRoot = path.resolve(__dirname, "../skill-routing");
// Agents evaluated when the caller names none.
const DEFAULT_AGENTS = ["codex", "gemini"];
// Number of cases run per agent by default.
const DEFAULT_CASE_LIMIT = 5;
// Per-case spawn timeout in milliseconds (two minutes).
const DEFAULT_TIMEOUT_MS = 2 * 60 * 1000;
|
|
16
|
+
|
|
17
|
+
/**
 * Runs the eval cases through each requested agent CLI and summarizes the results.
 *
 * Cases are read from `casesPath`, restricted to those with a non-empty
 * `expected` list, and truncated to `caseLimit`. For every agent the binary is
 * located first; a missing binary yields a "skipped" system without running
 * anything. Otherwise each case is run synchronously, one after another.
 *
 * `correctRate` is computed over rows whose status is "ok" only. A system with
 * no "ok" rows is reported as "skipped" with the first row-level reason.
 *
 * @param {object} [options]
 * @param {string[]} [options.agents] - Agent names to evaluate.
 * @param {string} [options.casesPath] - Path to the YAML file with cases and skills.
 * @param {number} [options.caseLimit] - Maximum number of cases per agent. NaN or
 *   negative values fall back to the default; fractions are floored.
 * @param {number} [options.timeoutMs] - Per-case timeout passed to spawnSync. Anything
 *   other than a non-negative integer falls back to the default.
 * @param {string} [options.rootDir] - Working directory used for the "contextos" fixture.
 * @returns {{mode: string, caseCount: number, systems: object[]}} Leaderboard result.
 */
export function runAgentLeaderboard({
  agents = DEFAULT_AGENTS,
  casesPath = path.join(skillEvalRoot, "cases.yaml"),
  caseLimit = DEFAULT_CASE_LIMIT,
  timeoutMs = DEFAULT_TIMEOUT_MS,
  rootDir = repoRoot
} = {}) {
  // CLI callers pass Number(argv) straight through, so a missing or malformed
  // flag value arrives here as NaN. slice(0, NaN) would silently select zero
  // cases and spawnSync rejects a NaN/fractional timeout, so normalize both.
  const effectiveCaseLimit = sanitizeCaseLimit(caseLimit, DEFAULT_CASE_LIMIT);
  const effectiveTimeoutMs = sanitizeTimeoutMs(timeoutMs, DEFAULT_TIMEOUT_MS);

  const config = parseEvalYaml(fs.readFileSync(casesPath, "utf8"));
  const cases = config.cases
    .filter((row) => row.expected?.length)
    .slice(0, effectiveCaseLimit);
  const skillIds = config.skills.map((skill) => skill.id);
  const systems = [];

  for (const agent of agents) {
    const binary = findBinary(agent);
    if (!binary) {
      systems.push({ name: agent, status: "skipped", reason: "binary not found", rows: [], correctRate: 0 });
      continue;
    }
    const rows = cases.map((testCase) => runAgentCase({
      agent,
      binary,
      testCase,
      skillIds,
      timeoutMs: effectiveTimeoutMs,
      rootDir
    }));
    // Only runs that exited cleanly count towards the score.
    const completed = rows.filter((row) => row.status === "ok");
    const correct = completed.filter((row) => row.correct).length;
    systems.push({
      name: agent,
      status: completed.length ? "ok" : "skipped",
      reason: completed.length ? "" : firstReason(rows),
      rows,
      correctRate: completed.length ? correct / completed.length : 0
    });
  }

  return {
    mode: "live-agent-cli",
    caseCount: cases.length,
    systems
  };
}

// Returns a usable slice bound: NaN and negative limits become `fallback`,
// fractions are floored, and Infinity is kept (meaning "all cases").
function sanitizeCaseLimit(value, fallback) {
  const limit = Number(value);
  return Number.isNaN(limit) || limit < 0 ? fallback : Math.floor(limit);
}

// Returns a timeout spawnSync accepts (a non-negative integer), else `fallback`.
function sanitizeTimeoutMs(value, fallback) {
  const timeout = Number(value);
  return Number.isInteger(timeout) && timeout >= 0 ? timeout : fallback;
}
|
|
62
|
+
|
|
63
|
+
/**
 * Renders a live agent leaderboard result as plain text.
 *
 * Output has three parts: a header with mode and task count, one summary line
 * per system (the score column shows the percentage for "ok" systems and the
 * reason otherwise), and a "Cases:" section listing at most the first five
 * rows of every system.
 *
 * @param {{mode: string, caseCount: number, systems: object[]}} result - Leaderboard result.
 * @returns {string} Lines joined with "\n".
 */
export function formatAgentLeaderboard(result) {
  const header = [
    "Live Agent Leaderboard",
    `Mode: ${result.mode}`,
    `Tasks: ${result.caseCount}`,
    "",
    "System Status Correct Skill",
    "-------- -------- -------------"
  ];

  const summary = result.systems.map((system) => {
    const score = system.status === "ok" ? percent(system.correctRate) : system.reason;
    return `${system.name.padEnd(8)} ${system.status.padEnd(8)} ${score}`;
  });

  const details = result.systems.flatMap((system) => {
    const block = [`- ${system.name}`];
    for (const row of system.rows.slice(0, 5)) {
      const mark = row.correct ? "✓" : "✗";
      block.push(` - ${row.fixture}: "${row.prompt}"`);
      block.push(` selected: ${row.selectedIds.join(", ") || "(none)"} ${mark}`);
      // Failed runs get an extra line explaining why.
      if (row.status !== "ok") {
        block.push(` status: ${row.status}; ${row.reason}`);
      }
    }
    return block;
  });

  return [...header, ...summary, "", "Cases:", ...details].join("\n");
}
|
|
87
|
+
|
|
88
|
+
/**
 * Runs one eval case through one agent CLI and scores the answer.
 *
 * The agent is spawned synchronously in the case's fixture directory (or in
 * `rootDir` for the "contextos" fixture) with colors disabled and CI set.
 * At most the first three recognized skill IDs in the answer are scored.
 *
 * @param {object} options
 * @param {string} options.agent - Agent name, used to pick the CLI arguments.
 * @param {string} options.binary - Executable to spawn.
 * @param {object} options.testCase - Case with `fixture`, `prompt`, and optional
 *   `expected` / `allowed` / `forbidden` skill ID lists.
 * @param {string[]} options.skillIds - All skill IDs the agent may answer with.
 * @param {number} options.timeoutMs - Timeout handed to spawnSync.
 * @param {string} options.rootDir - Working directory for the "contextos" fixture.
 * @returns {object} Row with status ("ok" | "error"), reason, prompt, fixture,
 *   expected, selectedIds, correct, and durationMs.
 */
function runAgentCase({ agent, binary, testCase, skillIds, timeoutMs, rootDir }) {
  const cwd = testCase.fixture === "contextos"
    ? rootDir
    : path.join(skillEvalRoot, "fixtures", testCase.fixture);
  const prompt = buildPrompt({ task: testCase.prompt, skillIds });
  const startedAt = Date.now();
  const result = spawnSync(binary, agentArgs({ agent, cwd, prompt }), {
    cwd,
    encoding: "utf8",
    timeout: timeoutMs,
    maxBuffer: 1024 * 1024 * 4,
    env: {
      ...process.env,
      NO_COLOR: "1",
      CI: process.env.CI || "1"
    }
  });
  const durationMs = Date.now() - startedAt;

  // Score the answer from stdout. The prompt lists every allowed skill ID, so
  // when a CLI echoes the prompt or its progress on stderr, scanning stderr as
  // well pads the selection with IDs the agent never chose. stderr is only
  // consulted when stdout is blank.
  // NOTE(review): assumes the agent prints its final answer on stdout — confirm per CLI.
  const stdoutText = String(result.stdout || "").trim();
  const answer = stdoutText || String(result.stderr || "").trim();
  const selectedIds = parseSkillIds(answer, skillIds).slice(0, 3);
  const correct = isCorrect({
    selectedIds,
    expected: testCase.expected || [],
    allowed: testCase.allowed || [],
    forbidden: testCase.forbidden || []
  });

  // A spawn error (including a timeout) or a non-zero/absent exit status is a failure.
  const failed = Boolean(result.error) || result.status !== 0;
  let reason = "";
  if (result.error) {
    reason = result.error.message;
  } else if (result.status !== 0) {
    // status is null when the child was terminated by a signal.
    reason = result.signal ? `killed by ${result.signal}` : `exit ${result.status}`;
  }

  return {
    status: failed ? "error" : "ok",
    reason,
    prompt: testCase.prompt,
    fixture: testCase.fixture,
    expected: testCase.expected || [],
    selectedIds,
    correct,
    durationMs
  };
}
|
|
124
|
+
|
|
125
|
+
/**
 * Builds the command-line arguments for one agent invocation.
 *
 * - "codex": `exec` with the working directory, a read-only sandbox, and
 *   approvals set to never, followed by the prompt.
 * - "gemini": the expanded CONTEXTOS_GEMINI_CMD template when that environment
 *   variable is set, otherwise `-p <prompt>`.
 * - anything else: the prompt as the only argument.
 *
 * @param {object} options
 * @param {string} options.agent - Agent name.
 * @param {string} options.cwd - Directory the agent should work in.
 * @param {string} options.prompt - Prompt text.
 * @returns {string[]} Argument list for the agent binary.
 */
function agentArgs({ agent, cwd, prompt }) {
  switch (agent) {
    case "codex":
      // NOTE(review): confirm the installed `codex exec` accepts --ask-for-approval.
      return ["exec", "--cd", cwd, "--sandbox", "read-only", "--ask-for-approval", "never", prompt];
    case "gemini": {
      const template = process.env.CONTEXTOS_GEMINI_CMD;
      return template ? expandTemplate(template, { cwd, prompt }) : ["-p", prompt];
    }
    default:
      return [prompt];
  }
}
|
|
142
|
+
|
|
143
|
+
/**
 * Composes the prompt sent to an agent for one benchmark case.
 *
 * The prompt tells the agent not to edit files or run commands, states the
 * task, lists every allowed skill ID, and asks for a bare comma-separated
 * answer.
 *
 * @param {object} options
 * @param {string} options.task - Task description for the case.
 * @param {string[]} options.skillIds - Skill IDs the agent may choose from.
 * @returns {string} Five lines joined with "\n".
 */
function buildPrompt({ task, skillIds }) {
  const allowedList = skillIds.join(", ");
  const sections = [];
  sections.push("You are evaluating a repository for a coding-agent skill router benchmark.");
  sections.push("Do not edit files. Do not run commands.");
  sections.push(`Task: ${task}`);
  sections.push(`Allowed skill IDs: ${allowedList}`);
  sections.push("Return only the top skill IDs as comma-separated text. No explanations.");
  return sections.join("\n");
}
|
|
152
|
+
|
|
153
|
+
/**
 * Extracts known skill IDs from free-form agent output.
 *
 * The output is split on every character that is not a letter, digit, "_",
 * ".", "@", or "-". Each token is matched against the known IDs after both
 * sides pass through `normalize`, so casing and punctuation differences do
 * not prevent a match.
 *
 * @param {unknown} output - Agent output; falsy values are treated as empty.
 * @param {string[]} skillIds - Known skill IDs.
 * @returns {string[]} Matched IDs in their original spelling, in order of
 *   first appearance, without duplicates.
 */
function parseSkillIds(output, skillIds) {
  // Normalized form -> original ID; a later ID wins if two normalize alike.
  const idsByKey = new Map();
  for (const id of skillIds) {
    idsByKey.set(normalize(id), id);
  }

  // A Set keeps insertion order, which gives first-appearance ordering for free.
  const matches = new Set();
  String(output || "").split(/[^A-Za-z0-9_.@-]+/).forEach((token) => {
    const id = idsByKey.get(normalize(token));
    if (id) {
      matches.add(id);
    }
  });
  return [...matches];
}
|
|
162
|
+
|
|
163
|
+
/**
 * Decides whether a skill selection counts as correct.
 *
 * A selection is correct when all three hold:
 * every expected skill was selected, no forbidden skill was selected, and
 * every selected skill is either expected or allowed.
 *
 * @param {object} options
 * @param {string[]} options.selectedIds - Skills the system selected.
 * @param {string[]} options.expected - Skills that must be selected.
 * @param {string[]} options.allowed - Extra skills that may be selected.
 * @param {string[]} options.forbidden - Skills that must not be selected.
 * @returns {boolean} True when the selection is correct.
 */
function isCorrect({ selectedIds, expected, allowed, forbidden }) {
  const chosen = new Set(selectedIds);
  const permitted = new Set([...expected, ...allowed]);

  const missesExpected = expected.some((skill) => !chosen.has(skill));
  if (missesExpected) {
    return false;
  }
  const hitsForbidden = forbidden.some((skill) => chosen.has(skill));
  if (hitsForbidden) {
    return false;
  }
  return selectedIds.every((skill) => permitted.has(skill));
}
|
|
170
|
+
|
|
171
|
+
/**
 * Locates the executable for an agent CLI.
 *
 * Checks a fixed list of install locations first, then asks a bash login
 * shell (`command -v`), first as-is and then after sourcing ~/.profile and
 * ~/.bashrc. The name is stripped of shell-unsafe characters before use.
 *
 * @param {string} name - Agent/binary name, e.g. "codex".
 * @returns {string} Path of the first match, or "" when nothing is found or
 *   the name is empty after sanitizing.
 */
function findBinary(name) {
  // An empty name would make the candidate paths below point at the bin
  // directories themselves, which exist and would be returned as the "binary".
  const safeName = name ? shellQuote(name) : "";
  if (!safeName) return "";

  const home = os.homedir();
  const candidates = [
    path.join(home, ".local", "bin", safeName),
    path.join(home, ".npm-global", "bin", safeName),
    path.join(home, ".nvm", "current", "bin", safeName),
    // NOTE(review): these two paths are specific to a Windows user named
    // "admin" as seen from WSL; kept for compatibility.
    `/mnt/c/Users/admin/AppData/Roaming/npm/${safeName}`,
    `/mnt/c/Users/admin/AppData/Roaming/npm/${safeName}.cmd`
  ];
  for (const candidate of candidates) {
    if (isRegularFile(candidate)) return candidate;
  }

  for (const command of [
    `command -v ${safeName}`,
    `source ~/.profile >/dev/null 2>&1 || true; source ~/.bashrc >/dev/null 2>&1 || true; command -v ${safeName}`
  ]) {
    try {
      const found = execFileSync("bash", ["-lc", command], { encoding: "utf8" }).trim();
      if (found) return found;
    } catch {
      // bash is unavailable or the command is not on PATH; try the next lookup.
    }
  }
  return "";
}

// True when `candidate` exists and is a regular file (symlinks are followed).
function isRegularFile(candidate) {
  try {
    return fs.statSync(candidate).isFile();
  } catch {
    return false;
  }
}
|
|
196
|
+
|
|
197
|
+
/**
 * Returns the reason of the first row that has a non-empty one.
 *
 * @param {Array<{reason?: string}>} rows - Case rows for one agent.
 * @returns {string} That reason, or "no completed cases" when no row has one.
 */
function firstReason(rows) {
  for (const row of rows) {
    if (row.reason) {
      return row.reason;
    }
  }
  return "no completed cases";
}
|
|
200
|
+
|
|
201
|
+
/**
 * Reduces a value to a comparison key: lowercased, with everything except
 * ASCII letters and digits removed. Falsy values produce an empty string.
 *
 * @param {unknown} value - Value to normalize; coerced with String().
 * @returns {string} Lowercase alphanumeric key.
 */
function normalize(value) {
  const lowered = String(value || "").toLowerCase();
  // Splitting on runs of non-alphanumerics and re-joining drops those runs.
  return lowered.split(/[^a-z0-9]+/).join("");
}
|
|
204
|
+
|
|
205
|
+
/**
 * Makes a value safe to embed in a shell command by removing characters.
 *
 * Despite the name, nothing is quoted: every character other than ASCII
 * letters, digits, "_", ".", "/", and "-" is dropped.
 *
 * @param {unknown} value - Value to sanitize; coerced with String().
 * @returns {string} The value with all other characters removed.
 */
function shellQuote(value) {
  const keptRuns = String(value).match(/[A-Za-z0-9_./-]+/g);
  return keptRuns ? keptRuns.join("") : "";
}
|
|
208
|
+
|
|
209
|
+
/**
 * Expands a whitespace-separated command template into an argument list.
 *
 * The prompt is written to a file in the OS temp directory, then the template
 * is split into tokens and the `{cwd}` and `{prompt_file}` placeholders in
 * each token are replaced.
 *
 * The template is split BEFORE substitution so that a working directory or
 * temp path containing spaces stays a single argument. Replacement uses a
 * callback so `$`-sequences in a path are inserted literally.
 *
 * The prompt file is not removed here because the spawned command still has
 * to read it after this function returns.
 *
 * @param {string} template - Template such as "--dir {cwd} --file {prompt_file}".
 * @param {{cwd: string, prompt: string}} vars - Values for the placeholders.
 * @returns {string[]} Expanded arguments with empty tokens removed.
 */
function expandTemplate(template, vars) {
  const file = path.join(os.tmpdir(), `contextos-agent-prompt-${process.pid}-${Date.now()}.txt`);
  fs.writeFileSync(file, vars.prompt, "utf8");
  const values = { cwd: vars.cwd, prompt_file: file };
  return String(template)
    .split(/\s+/)
    .filter(Boolean)
    .map((token) => token.replace(/\{(cwd|prompt_file)\}/g, (_match, key) => values[key]));
}
|
|
218
|
+
|
|
219
|
+
/**
 * Formats a ratio as a percentage with one decimal place.
 *
 * @param {number} value - Ratio, e.g. 0.8 for 80%.
 * @returns {string} Text such as "80.0%".
 */
function percent(value) {
  const scaled = value * 100;
  return scaled.toFixed(1) + "%";
}
|
|
222
|
+
|
|
223
|
+
/**
 * Reports whether this module is the script Node was started with.
 *
 * Compares real filesystem paths instead of building a `file://` string by
 * hand: a hand-built URL does not match `import.meta.url` when the path
 * contains spaces or other percent-encoded characters, on Windows, or when
 * the script is launched through a symlink.
 *
 * @returns {boolean} True when run directly, false when imported.
 */
function isMainModule() {
  const entry = process.argv[1];
  if (!entry) return false;
  try {
    return fs.realpathSync(entry) === fs.realpathSync(fileURLToPath(import.meta.url));
  } catch {
    // The entry path cannot be resolved; treat the module as imported.
    return false;
  }
}

// CLI entry point: node run-agent-leaderboard.js [--agents=a,b] [--limit=N] [--timeout-ms=N]
if (isMainModule()) {
  // Value of the first "--flag=value" argument with this prefix, or undefined.
  const flagValue = (prefix) => process.argv.find((arg) => arg.startsWith(prefix))?.slice(prefix.length);
  // Numeric flag value; a missing, empty, or non-numeric value yields the fallback.
  const numberFlag = (prefix, fallback) => {
    const raw = flagValue(prefix);
    const parsed = Number(raw);
    return raw !== undefined && raw !== "" && Number.isFinite(parsed) ? parsed : fallback;
  };

  const requestedAgents = (flagValue("--agents=") ?? "")
    .split(",")
    .map((item) => item.trim())
    .filter(Boolean);

  const result = runAgentLeaderboard({
    // An empty --agents= list falls back to the defaults instead of running nothing.
    agents: requestedAgents.length ? requestedAgents : DEFAULT_AGENTS,
    caseLimit: numberFlag("--limit=", DEFAULT_CASE_LIMIT),
    timeoutMs: numberFlag("--timeout-ms=", DEFAULT_TIMEOUT_MS)
  });
  console.log(formatAgentLeaderboard(result));
}
|
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import fs from "node:fs";
|
|
3
|
+
import path from "node:path";
|
|
4
|
+
import { fileURLToPath } from "node:url";
|
|
5
|
+
|
|
6
|
+
import { parseEvalYaml, runSkillRoutingEval } from "../skill-routing/run-eval.js";
|
|
7
|
+
|
|
8
|
+
// Directory containing this module; the paths below are resolved from it.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
// Sibling "skill-routing" directory, which holds the default cases.yaml.
const evalRoot = path.resolve(__dirname, "../skill-routing");
// Number of cases included in the leaderboard by default.
const DEFAULT_CASE_LIMIT = 20;
|
|
11
|
+
|
|
12
|
+
/**
 * Compares prompt-only skill guesses against ContextOS routing on the same cases.
 *
 * Cases are read from `casesPath` and narrowed by `selectLeaderboardCases`.
 * The ContextOS side comes from `runSkillRoutingEval` (top 3, threshold 0.5),
 * keeping only rows that belong to the selected cases. The raw side comes
 * from `rawAgentSkills`, which receives only the prompt and the skill list.
 * Both sides are scored with `evaluateRow`.
 *
 * @param {object} [options]
 * @param {string} [options.rootDir] - Root passed to the skill-routing eval.
 * @param {string} [options.casesPath] - Path to the YAML file with cases and skills.
 * @param {number} [options.caseLimit] - Limit handed to `selectLeaderboardCases`.
 * @returns {Promise<object>} Result with caseCount, repoCount (distinct fixtures),
 *   systems (the "Raw Agent" and "ContextOS + Codex" summaries), and one row per
 *   selected case pairing its raw and contextos evaluations.
 */
export async function runHallucinationLeaderboard({
  rootDir = path.resolve(__dirname, "..", ".."),
  casesPath = path.join(evalRoot, "cases.yaml"),
  caseLimit = DEFAULT_CASE_LIMIT
} = {}) {
  const config = parseEvalYaml(fs.readFileSync(casesPath, "utf8"));
  const selectedCases = selectLeaderboardCases(config.cases, caseLimit);
  const wanted = new Set(selectedCases.map(caseId));

  // The routing eval runs over the whole cases file; filter it down afterwards.
  const contextos = await runSkillRoutingEval({ rootDir, casesPath, topK: 3, threshold: 0.5 });
  const contextRows = [];
  for (const row of contextos.rows) {
    if (!wanted.has(caseId(row))) continue;
    contextRows.push(evaluateRow({
      prompt: row.prompt,
      fixture: row.fixture,
      expected: row.expected,
      allowed: row.allowed,
      forbidden: row.forbidden,
      selectedIds: row.selectedIds
    }));
  }

  const rawRows = selectedCases.map((testCase) => {
    const selectedIds = rawAgentSkills({ prompt: testCase.prompt, skills: config.skills, topK: 3 });
    return evaluateRow({
      prompt: testCase.prompt,
      fixture: testCase.fixture,
      expected: testCase.expected || [],
      allowed: testCase.allowed || [],
      forbidden: testCase.forbidden || [],
      selectedIds
    });
  });

  // Matches an evaluated row to a case by prompt and fixture.
  const sameCase = (testCase) => (row) => row.prompt === testCase.prompt && row.fixture === testCase.fixture;
  const fixtures = new Set(selectedCases.map((row) => row.fixture));
  const systems = [
    summarizeSystem("Raw Agent", rawRows),
    summarizeSystem("ContextOS + Codex", contextRows)
  ];
  const rows = selectedCases.map((testCase) => ({
    prompt: testCase.prompt,
    fixture: testCase.fixture,
    expected: testCase.expected || [],
    raw: rawRows.find(sameCase(testCase)),
    contextos: contextRows.find(sameCase(testCase))
  }));

  return {
    caseCount: selectedCases.length,
    repoCount: fixtures.size,
    systems,
    rows
  };
}
|
|
56
|
+
|
|
57
|
+
/**
 * Renders a leaderboard result as plain text: a header, one line per system
 * with its correct-skill rate, and up to six sample failures.
 *
 * A row counts as a failure when either system's evaluation is incorrect or
 * missing; a missing evaluation is printed as "(no result)" instead of
 * throwing.
 *
 * @param {{repoCount: number, caseCount: number, systems: object[], rows: object[]}} result
 *   Value returned by `runHallucinationLeaderboard`.
 * @returns {string} Newline-joined report.
 */
export function formatHallucinationLeaderboard(result) {
  const lines = [
    "Hallucination Leaderboard",
    `Repos: ${result.repoCount}`,
    `Tasks: ${result.caseCount}`,
    "",
    // Padded to the same 18-character column as the data rows below.
    `${"System".padEnd(18)} Correct Skill`,
    "------------------ -------------"
  ];
  for (const system of result.systems) {
    lines.push(`${system.name.padEnd(18)} ${percent(system.correctRate)}`);
  }

  lines.push("", "Sample failures:");
  const failures = result.rows
    .filter((row) => !row.raw?.correct || !row.contextos?.correct)
    .slice(0, 6);
  if (!failures.length) {
    lines.push("- none");
    return lines.join("\n");
  }

  // One system's outcome for a row; tolerates a missing evaluation.
  const describe = (outcome) => (outcome
    ? `${outcome.selectedIds.join(", ") || "(none)"} ${outcome.correct ? "✓" : "✗"}`
    : "(no result) ✗");

  for (const row of failures) {
    lines.push(`- ${row.fixture}: "${row.prompt}"`);
    lines.push(` expected: ${row.expected.join(", ") || "(none)"}`);
    lines.push(` raw: ${describe(row.raw)}`);
    lines.push(` contextos: ${describe(row.contextos)}`);
  }
  return lines.join("\n");
}
|
|
85
|
+
|
|
86
|
+
/**
 * Picks up to `limit` distinct cases for the leaderboard.
 *
 * The first case of each preferred fixture is taken first, in the order
 * listed, then the remaining cases fill the rest in file order. Cases are
 * deduplicated by `caseId` (fixture + prompt).
 *
 * @param {object[]} cases - Case records with `fixture` and `prompt`.
 * @param {number} limit - Maximum number of cases; 0 or a negative value
 *   yields an empty selection.
 * @returns {object[]} The selected case records.
 */
function selectLeaderboardCases(cases, limit) {
  const wantedFixtures = [
    "expo-eas",
    "next-vercel",
    "docker-node",
    "railway-render",
    "firebase-hosting",
    "nest-prisma",
    "express-mongo-jwt",
    "oauth-google",
    "redis-cache",
    "contextos",
    "frontend-only-next",
    "static-docs"
  ];
  const selected = [];
  const seen = new Set();
  const isFull = () => selected.length >= limit;
  const take = (row) => {
    seen.add(caseId(row));
    selected.push(row);
  };

  // Check before the first push so a limit of 0 (or less) selects nothing.
  if (isFull()) return selected;

  for (const fixture of wantedFixtures) {
    const match = cases.find((row) => row.fixture === fixture && !seen.has(caseId(row)));
    if (match) take(match);
    if (isFull()) return selected;
  }
  for (const row of cases) {
    if (!seen.has(caseId(row))) take(row);
    if (isFull()) break;
  }
  return selected;
}
|
|
113
|
+
|
|
114
|
+
/**
 * Prompt-only baseline: ranks every skill by `rawPromptScore` and returns the
 * ids of the `topK` best. Ties are broken by `id.localeCompare`. No minimum
 * score is applied, so skills that score 0 remain eligible.
 *
 * @param {object} args
 * @param {string} args.prompt - Task prompt to score against.
 * @param {object[]} args.skills - Skill records with at least an `id`.
 * @param {number} args.topK - Number of ids to return.
 * @returns {string[]} Skill ids, best match first.
 */
function rawAgentSkills({ prompt, skills, topK }) {
  const ranked = [];
  for (const skill of skills) {
    ranked.push({ id: skill.id, score: rawPromptScore(prompt, skill) });
  }
  ranked.sort((left, right) => {
    const byScore = right.score - left.score;
    return byScore || left.id.localeCompare(right.id);
  });
  return ranked.slice(0, topK).map(({ id }) => id);
}
|
|
121
|
+
|
|
122
|
+
/**
 * Scores how strongly a prompt overlaps with a skill's own wording.
 *
 * The skill's id, description and positive-trigger prompts are tokenized.
 * Each resulting token that also occurs in the prompt adds 2 points when it
 * is longer than five characters and 1 point otherwise. Repeated trigger
 * tokens are counted each time they appear.
 *
 * @param {string} prompt - Task prompt.
 * @param {object} skill - Skill record (`id`, `description`, optional
 *   `positive_triggers.prompts`).
 * @returns {number} Overlap score; 0 when nothing matches.
 */
function rawPromptScore(prompt, skill) {
  const promptVocabulary = new Set(tokenize(prompt));
  const examplePrompts = skill.positive_triggers?.prompts || [];
  const triggerText = [skill.id, skill.description, ...examplePrompts].join(" ");
  return tokenize(triggerText).reduce((total, token) => {
    if (!promptVocabulary.has(token)) return total;
    return total + (token.length > 5 ? 2 : 1);
  }, 0);
}
|
|
135
|
+
|
|
136
|
+
/**
 * Judges one system's skill selection for a single case.
 *
 * A selection is correct when all three hold:
 *  - every expected skill was selected (or, when nothing is expected,
 *    nothing was selected);
 *  - no forbidden skill was selected;
 *  - every selected skill is either expected or allowed.
 *
 * Missing or null lists are treated as empty so a sparse row is scored
 * rather than throwing.
 *
 * @param {object} row
 * @param {string} row.prompt
 * @param {string} row.fixture
 * @param {string[]} [row.expected]
 * @param {string[]} [row.allowed]
 * @param {string[]} [row.forbidden]
 * @param {string[]} [row.selectedIds]
 * @returns {{prompt: string, fixture: string, expected: string[], allowed: string[],
 *   forbidden: string[], selectedIds: string[], correct: boolean}}
 */
function evaluateRow({ prompt, fixture, expected, allowed, forbidden, selectedIds }) {
  const expectedIds = expected ?? [];
  const allowedIds = allowed ?? [];
  const forbiddenIds = forbidden ?? [];
  const chosenIds = selectedIds ?? [];

  const selected = new Set(chosenIds);
  const accepted = new Set([...expectedIds, ...allowedIds]);
  const hasExpected = expectedIds.length
    ? expectedIds.every((skill) => selected.has(skill))
    : chosenIds.length === 0;
  const hasForbidden = forbiddenIds.some((skill) => selected.has(skill));
  const hasUnexpected = chosenIds.some((skill) => !accepted.has(skill));

  return {
    prompt,
    fixture,
    expected: expectedIds,
    allowed: allowedIds,
    forbidden: forbiddenIds,
    selectedIds: chosenIds,
    correct: hasExpected && !hasForbidden && !hasUnexpected
  };
}
|
|
154
|
+
|
|
155
|
+
/**
 * Aggregates evaluated rows into one leaderboard entry.
 *
 * @param {string} name - Display name of the system.
 * @param {{correct: boolean}[]} rows - Evaluated rows for that system.
 * @returns {{name: string, correct: number, total: number, correctRate: number}}
 *   `correctRate` is `correct / total`, or 0 when there are no rows.
 */
function summarizeSystem(name, rows) {
  const total = rows.length;
  let correct = 0;
  for (const row of rows) {
    if (row.correct) correct += 1;
  }
  const correctRate = total ? correct / total : 0;
  return { name, correct, total, correctRate };
}
|
|
164
|
+
|
|
165
|
+
/**
 * Splits text into lowercase tokens.
 *
 * Tokens are runs of `a-z`, `0-9`, `@`, `.` and `-`; everything else is a
 * separator. Tokens of two characters or fewer are dropped. Falsy input
 * yields an empty list.
 *
 * @param {*} value - Text to tokenize (coerced with `String`).
 * @returns {string[]} Tokens in order of appearance, duplicates preserved.
 */
function tokenize(value) {
  const text = String(value || "").toLowerCase();
  const tokens = [];
  for (const piece of text.split(/[^a-z0-9@.-]+/)) {
    if (piece.length > 2) tokens.push(piece);
  }
  return tokens;
}
|
|
171
|
+
|
|
172
|
+
/**
 * Builds the identity key of a case: its fixture and prompt joined by a NUL
 * character.
 *
 * @param {{fixture: string, prompt: string}} row
 * @returns {string}
 */
function caseId(row) {
  const { fixture, prompt } = row;
  return `${fixture}\0${prompt}`;
}
|
|
175
|
+
|
|
176
|
+
/**
 * Formats a ratio as a percentage with one decimal place.
 *
 * @param {number} value - Ratio, e.g. 0.5.
 * @returns {string} E.g. "50.0%".
 */
function percent(value) {
  const scaled = value * 100;
  return `${scaled.toFixed(1)}%`;
}
|
|
179
|
+
|
|
180
|
+
// Run the leaderboard only when this file is the script Node was started
// with, not when it is imported. Filesystem paths are compared rather than
// building a "file://" string by hand: import.meta.url is percent-encoded and
// uses a different shape on Windows, so a hand-built URL fails to match for
// paths containing spaces or drive letters.
if (process.argv[1] && fileURLToPath(import.meta.url) === path.resolve(process.argv[1])) {
  const result = await runHallucinationLeaderboard();
  console.log(formatHallucinationLeaderboard(result));
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@minhpnq1807/contextos",
|
|
3
|
-
"version": "0.6.
|
|
3
|
+
"version": "0.6.2",
|
|
4
4
|
"description": "Task-aware AGENTS.md context injection and compliance reporting for Codex, Claude Code, and Antigravity.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"bin": {
|
|
@@ -10,6 +10,7 @@
|
|
|
10
10
|
"files": [
|
|
11
11
|
"bin/",
|
|
12
12
|
"plugins/",
|
|
13
|
+
"scripts/",
|
|
13
14
|
".codex/skills/",
|
|
14
15
|
".codex/workflows/",
|
|
15
16
|
".agents/",
|
|
@@ -27,6 +28,10 @@
|
|
|
27
28
|
"build": "node bin/ctx.js --version",
|
|
28
29
|
"validate:plugin": "node test/validate-plugin.js",
|
|
29
30
|
"benchmark:skills": "node bin/ctx.js benchmark --skills",
|
|
31
|
+
"leaderboard:hallucination": "node eval/hallucination/run-leaderboard.js",
|
|
32
|
+
"leaderboard:agents": "node eval/hallucination/run-agent-leaderboard.js --agents=codex,gemini",
|
|
33
|
+
"demo:capture": "node docs/demo/capture-live-demos.mjs",
|
|
34
|
+
"sync:community-skills": "node scripts/sync-community-skills.mjs",
|
|
30
35
|
"test:mcp": "node test/mcp-protocol-smoke.js"
|
|
31
36
|
},
|
|
32
37
|
"engines": {
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import fs from "node:fs";
|
|
3
|
+
import path from "node:path";
|
|
4
|
+
|
|
5
|
+
// Usage: node scripts/sync-community-skills.mjs [source] [target]
// Mirrors the skill directories of `source` into `target`. The target is
// cleared first (its own README.md is kept unless the source ships one).
const [, , sourceArg = "external-skills", targetArg = "community-skills"] = process.argv;
const source = path.resolve(sourceArg);
const target = path.resolve(targetArg);
// Top-level entries of the source that are never mirrored.
const skip = new Set([".git", ".github", "scripts"]);

// All validation happens BEFORE the target is touched, because the sync
// below is destructive.
if (!fs.existsSync(source) || !fs.statSync(source).isDirectory()) {
  console.error(`Missing source skills directory: ${source}`);
  process.exit(1);
}

// Clearing a target that contains (or equals) the source would delete the
// source; copying into a target nested inside the source would recurse into
// its own output. Refuse both.
if (isSameOrNested(source, target) || isSameOrNested(target, source)) {
  console.error(`Refusing to sync overlapping directories: ${source} and ${target}`);
  process.exit(1);
}

fs.mkdirSync(target, { recursive: true });

// Clear the target: every directory, and every file except README.md.
for (const entry of fs.readdirSync(target, { withFileTypes: true })) {
  if (entry.isDirectory()) fs.rmSync(path.join(target, entry.name), { recursive: true, force: true });
  else if (entry.isFile() && entry.name !== "README.md") fs.rmSync(path.join(target, entry.name), { force: true });
}

// Copy every non-skipped directory, plus the top-level README.md if present.
for (const entry of fs.readdirSync(source, { withFileTypes: true })) {
  if (skip.has(entry.name)) continue;
  const from = path.join(source, entry.name);
  const to = path.join(target, entry.name);
  if (entry.isDirectory()) copyDir(from, to);
  else if (entry.isFile() && entry.name === "README.md") fs.copyFileSync(from, to);
}

console.log(`Synced ContextOS skills from ${source} to ${target}`);

/**
 * Reports whether `child` is the same path as `parent` or lies inside it.
 * Both arguments are expected to be absolute paths.
 */
function isSameOrNested(parent, child) {
  const relative = path.relative(parent, child);
  if (relative === "") return true;
  if (relative === ".." || relative.startsWith(`..${path.sep}`)) return false;
  // On Windows a path on another drive comes back absolute, i.e. not nested.
  return !path.isAbsolute(relative);
}
|
|
31
|
+
|
|
32
|
+
/**
 * Recursively copies the directory `from` into `to`, creating `to` if needed.
 * Only directories and regular files are copied; any other entry type is
 * skipped. Existing files at the destination are overwritten.
 *
 * @param {string} from - Source directory.
 * @param {string} to - Destination directory.
 */
function copyDir(from, to) {
  fs.mkdirSync(to, { recursive: true });
  const children = fs.readdirSync(from, { withFileTypes: true });
  children.forEach((child) => {
    const childSource = path.join(from, child.name);
    const childTarget = path.join(to, child.name);
    if (child.isDirectory()) {
      copyDir(childSource, childTarget);
      return;
    }
    if (child.isFile()) fs.copyFileSync(childSource, childTarget);
  });
}
|