@minhpnq1807/contextos 0.6.0 → 0.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,9 @@
1
1
  # Changelog
2
2
 
3
+ ## 0.6.1
4
+
5
+ - **Hallucination Leaderboard:** Added `ctx leaderboard --hallucination` and `npm run leaderboard:hallucination` to compare raw prompt-only skill guesses against ContextOS evidence-routed skill selection across 20 fixture tasks.
6
+
3
7
  ## 0.6.0
4
8
 
5
9
  - **Launch demo framing:** Added Agent Hallucination Benchmark messaging, same-prompt/same-model/different-context copy, and `docs/launch-demos.md` with three short demo scripts: hallucination benchmark, AGENTS.md lost-in-the-middle, and repo-aware skills.
package/README.md CHANGED
@@ -28,7 +28,7 @@ Published package: [`@minhpnq1807/contextos`](https://www.npmjs.com/package/@min
28
28
 
29
29
  ## Demo
30
30
 
31
- ![ContextOS demo: same prompt, different repo, correct skills](docs/demo/contextos-demo.gif)
31
+ ![ContextOS demo: same prompt, different repo, correct skills](docs/demo/same-prompt-different-context.gif)
32
32
 
33
33
  Same prompt. Same model. Different context.
34
34
 
@@ -42,6 +42,13 @@ ctx skills doctor -- "fix deployed"
42
42
  | `vercel.json`, `next`, GitHub workflow | `vercel-deployment`, `github-actions-ci-cd`, `env-secret-management` |
43
43
  | ContextOS repo with no app deploy evidence | no deployment skill selected |
44
44
 
45
+ More 10-second demos:
46
+
47
+ | Demo | GIF |
48
+ | --- | --- |
49
+ | AGENTS.md Lost In The Middle | [docs/demo/agents-lost-middle.gif](docs/demo/agents-lost-middle.gif) |
50
+ | ContextOS Ready Gold | [docs/demo/contextos-ready.gif](docs/demo/contextos-ready.gif) |
51
+
45
52
  ## Agent Hallucination Benchmark
46
53
 
47
54
  Generic agents often guess deployment tooling from the prompt alone:
@@ -80,6 +87,19 @@ Skill Router internal fixture benchmark:
80
87
 
81
88
  This is an internal fixture benchmark, not an external real-world benchmark. It is designed to prove the router behavior across controlled Expo/EAS, Next/Vercel, Docker, Railway/Render, Firebase, auth, database, testing, mobile, and adversarial negative-gate cases.
82
89
 
90
+ Hallucination leaderboard:
91
+
92
+ ```bash
93
+ ctx leaderboard --hallucination
94
+ ```
95
+
96
+ Current local result across 20 fixture tasks and 12 repo contexts:
97
+
98
+ | System | Correct Skill |
99
+ | --- | ---: |
100
+ | Raw Agent | 10.0% |
101
+ | ContextOS + Codex | 80.0% |
102
+
83
103
  Example hook context injected before the agent works:
84
104
 
85
105
  ```text
@@ -246,6 +266,7 @@ The score checks project `AGENTS.md` rules, project skill packs under `.codex/sk
246
266
  | `ctx stats` | Show workspace-level usage and effectiveness metrics. |
247
267
  | `ctx benchmark -- "task"` | Compare raw AGENTS.md ordering vs ContextOS scheduling. |
248
268
  | `ctx benchmark --skills` | Run the Skill Router eval benchmark. |
269
+ | `ctx leaderboard --hallucination` | Compare raw prompt-only guesses vs ContextOS routing. |
249
270
  | `ctx sync --rules` | Sync AGENTS/Ruler/MCP config across agents. |
250
271
  | `ctx sync --skills` | Sync skills across agents through skillshare. |
251
272
  | `ctx sync --workflows` | Sync workflow markdown across Claude/Codex/Antigravity. |
@@ -570,6 +591,7 @@ This warning comes from a transitive dependency in the local embedding/WASM stac
570
591
  | `ctx stats` | Shows aggregate runtime metrics for the current workspace. | You want to know whether ContextOS is active and useful over time. | Prints sectioned tables for prompt/report counts, injection rate, efficiency, rule outcomes, hook events, last prompt, and last report. |
571
592
  | `ctx benchmark -- "task"` | Compares baseline AGENTS.md ordering with ContextOS task-aware scheduling. | You want a before/after signal for lost-in-the-middle risk. | Prints tables for parsed/actionable/filtered rules, baseline middle-risk, scheduled high/mid rules, recency reminder status, and top scored rules. |
572
593
  | `ctx benchmark --skills` | Runs the Skill Router eval benchmark. | You want evidence for skill routing accuracy and negative gates. | Prints top-1 accuracy, top-3 recall, false positive rate, confidence calibration, and negative gate accuracy across `eval/skill-routing` fixtures. |
594
+ | `ctx leaderboard --hallucination` | Compares raw prompt-only skill guesses with ContextOS evidence routing. | You want launch evidence for the hallucination problem. | Runs 20 fixture tasks across 10+ repo contexts and prints Raw Agent vs ContextOS correctness plus sample failures. |
573
595
  | `ctx sync --rules` | Syncs project rules and MCP servers through Ruler. | You want Codex, Claude Code, and Antigravity to share one project rule/MCP source of truth. | Ensures `.ruler/ruler.toml`, injects `ctx-mcp`, imports existing MCP servers from Codex and project `.mcp.json`, runs `ruler apply --agents codex,claude,antigravity`, mirrors MCP servers to Antigravity MCP configs, and verifies generated config. |
574
596
  | `ctx sync --rules --agents <list>` | Syncs only selected agents through Ruler. | You want to update one or two agents without touching the others. | Accepts comma-separated values such as `codex`, `claude`, `agy`, `antigravity`, or `codex,claude,agy`; `agy` is normalized to Ruler's `antigravity`. |
575
597
  | `ctx sync --rules --dry-run` | Previews Ruler sync without writing files or running apply. | You want to inspect behavior before changing project config. | Prints the same flow with dry-run status. |
package/bin/ctx.js CHANGED
@@ -20,6 +20,7 @@ import { defaultDataRoot, workspaceDataDir, workspaceMarkerPath } from "../plugi
20
20
  import { installMcpTelemetryProxies } from "../plugins/ctx/lib/mcp-proxy-install.js";
21
21
  import { benchmarkWorkspace, formatBenchmark } from "../plugins/ctx/lib/benchmark.js";
22
22
  import { formatSkillRoutingBenchmark, runSkillRoutingEval } from "../eval/skill-routing/run-eval.js";
23
+ import { formatHallucinationLeaderboard, runHallucinationLeaderboard } from "../eval/hallucination/run-leaderboard.js";
23
24
  import { copyDir, copyPackageRoot, syncPackageRoot } from "../plugins/ctx/lib/package-install.js";
24
25
  import { installClaudeHooks } from "../plugins/ctx/lib/claude-hooks.js";
25
26
  import { installClaudeMcp } from "../plugins/ctx/lib/claude-mcp.js";
@@ -197,6 +198,7 @@ Usage:
197
198
  ctx stats Show workspace statistics
198
199
  ctx benchmark -- "task" Benchmark workspace for a task
199
200
  ctx benchmark --skills Run skill routing eval benchmark
201
+ ctx leaderboard --hallucination Compare raw agent guesses vs ContextOS routing
200
202
  ctx sync --rules Sync AGENTS.md rules to all agents
201
203
  ctx sync --rules --agents <names> Sync rules to specific agents only
202
204
  ctx sync --rules --dry-run Preview rule sync without writing
@@ -1034,6 +1036,12 @@ try {
1034
1036
  if (!task.trim()) throw new Error('Usage: ctx benchmark -- "task"');
1035
1037
  console.log(formatBenchmark(benchmarkWorkspace({ cwd: process.cwd(), task })));
1036
1038
  }
1039
+ } else if (command === "leaderboard") {
1040
+ if (args.includes("--hallucination")) {
1041
+ console.log(formatHallucinationLeaderboard(await runHallucinationLeaderboard({ rootDir })));
1042
+ } else {
1043
+ throw new Error("Usage: ctx leaderboard --hallucination");
1044
+ }
1037
1045
  } else if (command === "skills") {
1038
1046
  if (args[1] === "doctor") {
1039
1047
  const marker = args.indexOf("--");
Binary file
@@ -0,0 +1,28 @@
1
+ $ cat AGENTS.md
2
+ 1. General style
3
+ 2. Formatting
4
+ 3. Test names
5
+ ...
6
+ 37. IMPORTANT: Always use code-review-graph before grep.
7
+ ...
8
+ 52. Release notes
9
+
10
+ $ codex "fix failing test"
11
+ Raw agent starts with grep
12
+ Rule followed: no
13
+
14
+ $ ctx debug -- "fix failing test"
15
+ ContextOS debug
16
+ Critical ContextOS rules:
17
+ - IMPORTANT: Always use code-review-graph before grep.
18
+
19
+ Suggested files to check:
20
+ - test/score-context.test.js
21
+ - plugins/ctx/lib/score-context.js
22
+
23
+ $ codex + ContextOS
24
+ Rule followed: yes
25
+ Evidence: graph checked before file reads
26
+
27
+ AGENTS.md did not change.
28
+ The rule moved from buried context into runtime context.
Binary file
@@ -0,0 +1,20 @@
1
+ $ ctx doctor
2
+ Repository Score
3
+
4
+ Rules: 100
5
+ Skills: 100
6
+ Workflows: 100
7
+
8
+ Overall:
9
+ ContextOS Ready Gold
10
+
11
+ Evidence:
12
+ - Rules: 1 AGENTS.md source(s), 5 actionable rule(s)
13
+ - Skills: 3 skill(s), 3 metadata file(s)
14
+ - Workflows: 2 workflow(s), 2 with agent chain(s)
15
+
16
+ $ badge
17
+ [ContextOS Ready Gold]
18
+
19
+ Repos now have a target:
20
+ AGENTS.md + skills + workflows + evidence.
@@ -0,0 +1,26 @@
1
+ $ ctx leaderboard --hallucination
2
+ Hallucination Leaderboard
3
+ Repos: 12
4
+ Tasks: 20
5
+
6
+ System Correct Skill
7
+ ------------------ -------------
8
+ Raw Agent 10.0%
9
+ ContextOS + Codex 80.0%
10
+
11
+ $ ctx skills doctor -- "fix deployed" # Expo repo
12
+ ContextOS skill doctor
13
+ 1. eas high confidence
14
+ evidence: eas.json, app.json, expo dependency
15
+ 2. mobile-deployment high confidence
16
+ 3. github-actions-ci-cd medium confidence
17
+
18
+ $ ctx skills doctor -- "fix deployed" # Next.js repo
19
+ ContextOS skill doctor
20
+ 1. vercel-deployment high confidence
21
+ evidence: vercel.json, next dependency
22
+ 2. github-actions-ci-cd high confidence
23
+ 3. env-secret-management medium confidence
24
+
25
+ Same prompt. Same model. Different repo evidence.
26
+ ContextOS routes the right skill before the agent edits code.
@@ -4,6 +4,8 @@ These are demo scripts for explaining ContextOS quickly. They are intentionally
4
4
 
5
5
  ## 1. Agent Hallucination Benchmark
6
6
 
7
+ GIF: [`docs/demo/same-prompt-different-context.gif`](demo/same-prompt-different-context.gif)
8
+
7
9
  Prompt:
8
10
 
9
11
  ```text
@@ -39,6 +41,8 @@ Same prompt. Same model. Different context.
39
41
 
40
42
  ## 2. AGENTS.md Lost In The Middle
41
43
 
44
+ GIF: [`docs/demo/agents-lost-middle.gif`](demo/agents-lost-middle.gif)
45
+
42
46
  Setup:
43
47
 
44
48
  ```text
@@ -71,6 +75,8 @@ Important repo rules should not depend on where they appear in a long file.
71
75
 
72
76
  ## 3. Repo-Aware Skills
73
77
 
78
+ GIF: [`docs/demo/same-prompt-different-context.gif`](demo/same-prompt-different-context.gif)
79
+
74
80
  Prompt:
75
81
 
76
82
  ```text
@@ -103,3 +109,19 @@ Message:
103
109
  ```text
104
110
  Context is not extra text. It changes the correct answer.
105
111
  ```
112
+
113
+ ## 4. ContextOS Ready
114
+
115
+ GIF: [`docs/demo/contextos-ready.gif`](demo/contextos-ready.gif)
116
+
117
+ Command:
118
+
119
+ ```bash
120
+ ctx doctor
121
+ ```
122
+
123
+ Message:
124
+
125
+ ```text
126
+ Repos now have a target: AGENTS.md + skills + workflows + evidence.
127
+ ```
@@ -0,0 +1,183 @@
1
+ #!/usr/bin/env node
2
+ import fs from "node:fs";
3
+ import path from "node:path";
4
+ import { fileURLToPath } from "node:url";
5
+
6
+ import { parseEvalYaml, runSkillRoutingEval } from "../skill-routing/run-eval.js";
7
+
8
+ const __dirname = path.dirname(fileURLToPath(import.meta.url));
9
+ const evalRoot = path.resolve(__dirname, "..", "skill-routing");
10
+ const DEFAULT_CASE_LIMIT = 20;
11
+
12
/**
 * Score a prompt-only "raw agent" baseline and the ContextOS skill router on
 * the same selection of skill-routing eval cases.
 *
 * @param {object} [options]
 * @param {string} [options.rootDir] - Root directory forwarded to the skill-routing eval.
 * @param {string} [options.casesPath] - YAML file providing `cases` and `skills`.
 * @param {number} [options.caseLimit] - Maximum number of cases to score.
 * @returns {Promise<object>} `{ caseCount, repoCount, systems, rows }` where
 *   `systems` holds one summary per system and `rows` pairs the raw and
 *   ContextOS outcome for every selected case.
 * @throws {Error} When the skill-routing eval produced no row for a selected case.
 */
export async function runHallucinationLeaderboard({
  rootDir = path.resolve(__dirname, "..", ".."),
  casesPath = path.join(evalRoot, "cases.yaml"),
  caseLimit = DEFAULT_CASE_LIMIT
} = {}) {
  const config = parseEvalYaml(fs.readFileSync(casesPath, "utf8"));
  const selectedCases = selectLeaderboardCases(config.cases, caseLimit);
  const contextos = await runSkillRoutingEval({ rootDir, casesPath, topK: 3, threshold: 0.5 });

  // Index the eval rows once by case identity. The first row wins, so a
  // duplicated case cannot inflate the ContextOS total past `caseCount`.
  const routedById = new Map();
  for (const row of contextos.rows) {
    const id = caseId(row);
    if (!routedById.has(id)) routedById.set(id, row);
  }

  // Build both outcome lists in `selectedCases` order so they pair by index.
  const contextRows = selectedCases.map((testCase) => {
    const routed = routedById.get(caseId(testCase));
    if (!routed) {
      // Failing here is clearer than letting the formatter hit `undefined`.
      throw new Error(
        `Skill routing eval returned no row for fixture "${testCase.fixture}" and prompt "${testCase.prompt}"`
      );
    }
    return evaluateRow({
      prompt: routed.prompt,
      fixture: routed.fixture,
      expected: routed.expected || testCase.expected || [],
      allowed: routed.allowed || testCase.allowed || [],
      forbidden: routed.forbidden || testCase.forbidden || [],
      selectedIds: routed.selectedIds
    });
  });
  const rawRows = selectedCases.map((testCase) => evaluateRow({
    prompt: testCase.prompt,
    fixture: testCase.fixture,
    expected: testCase.expected || [],
    allowed: testCase.allowed || [],
    forbidden: testCase.forbidden || [],
    selectedIds: rawAgentSkills({ prompt: testCase.prompt, skills: config.skills, topK: 3 })
  }));

  return {
    caseCount: selectedCases.length,
    repoCount: new Set(selectedCases.map((row) => row.fixture)).size,
    systems: [
      summarizeSystem("Raw Agent", rawRows),
      summarizeSystem("ContextOS + Codex", contextRows)
    ],
    rows: selectedCases.map((testCase, index) => ({
      prompt: testCase.prompt,
      fixture: testCase.fixture,
      expected: testCase.expected || [],
      raw: rawRows[index],
      contextos: contextRows[index]
    }))
  };
}
56
+
57
/**
 * Render a leaderboard result as plain text: a summary table with one line per
 * system, followed by up to six cases that at least one system got wrong.
 *
 * @param {object} result - Value returned by `runHallucinationLeaderboard`.
 * @returns {string} Newline-joined report.
 */
export function formatHallucinationLeaderboard(result) {
  // Header, rule and data rows share one width so the columns line up.
  const nameWidth = 18;
  const scoreHeader = "Correct Skill";
  const lines = [
    "Hallucination Leaderboard",
    `Repos: ${result.repoCount}`,
    `Tasks: ${result.caseCount}`,
    "",
    `${"System".padEnd(nameWidth)} ${scoreHeader}`,
    `${"-".repeat(nameWidth)} ${"-".repeat(scoreHeader.length)}`
  ];
  for (const system of result.systems) {
    lines.push(`${system.name.padEnd(nameWidth)} ${percent(system.correctRate)}`);
  }

  // A missing outcome counts as a failure instead of crashing the report.
  const isCorrect = (outcome) => Boolean(outcome?.correct);
  const describe = (outcome) => (outcome
    ? `${outcome.selectedIds.join(", ") || "(none)"} ${outcome.correct ? "✓" : "✗"}`
    : "(missing) ✗");

  lines.push("", "Sample failures:");
  const failures = result.rows
    .filter((row) => !isCorrect(row.raw) || !isCorrect(row.contextos))
    .slice(0, 6);
  if (!failures.length) {
    lines.push("- none");
  } else {
    for (const row of failures) {
      lines.push(`- ${row.fixture}: "${row.prompt}"`);
      lines.push(` expected: ${row.expected.join(", ") || "(none)"}`);
      lines.push(` raw: ${describe(row.raw)}`);
      lines.push(` contextos: ${describe(row.contextos)}`);
    }
  }
  return lines.join("\n");
}
85
+
86
/**
 * Pick up to `limit` eval cases for the leaderboard.
 *
 * The first case of each preferred fixture is taken first, in the order listed
 * below. Remaining slots are filled with the other cases in file order,
 * skipping any case whose fixture/prompt pair was already picked.
 *
 * @param {Array<object>} cases - Eval cases carrying `fixture` and `prompt`.
 * @param {number} limit - Maximum number of cases to return.
 * @returns {Array<object>} Selected cases; empty when `limit` is zero or negative.
 */
function selectLeaderboardCases(cases, limit) {
  // The size check below runs after each push, so without this guard a limit
  // of zero or less would still return one case.
  if (limit <= 0) return [];

  const wantedFixtures = [
    "expo-eas",
    "next-vercel",
    "docker-node",
    "railway-render",
    "firebase-hosting",
    "nest-prisma",
    "express-mongo-jwt",
    "oauth-google",
    "redis-cache",
    "contextos",
    "frontend-only-next",
    "static-docs"
  ];
  const selected = [];
  const seenIds = new Set();
  const take = (row) => {
    selected.push(row);
    seenIds.add(caseId(row));
  };

  for (const fixture of wantedFixtures) {
    const match = cases.find((row) => row.fixture === fixture && !seenIds.has(caseId(row)));
    if (match) take(match);
    if (selected.length >= limit) return selected;
  }
  for (const row of cases) {
    if (!seenIds.has(caseId(row))) take(row);
    if (selected.length >= limit) break;
  }
  return selected;
}
113
+
114
/**
 * Simulate a prompt-only agent: rank every skill by its prompt overlap score
 * and return the ids of the `topK` best.
 *
 * Ties are broken by id. No minimum score is applied, so skills that score
 * zero can still be returned when fewer than `topK` skills match.
 *
 * @param {object} options
 * @param {string} options.prompt - Task prompt to match against.
 * @param {Array<object>} options.skills - Skill definitions carrying an `id`.
 * @param {number} options.topK - Number of skill ids to return.
 * @returns {string[]} Skill ids, best match first.
 */
function rawAgentSkills({ prompt, skills, topK }) {
  const ranked = [];
  for (const skill of skills) {
    ranked.push({ id: skill.id, score: rawPromptScore(prompt, skill) });
  }
  ranked.sort((left, right) => {
    if (left.score !== right.score) return right.score - left.score;
    return left.id.localeCompare(right.id);
  });
  const best = ranked.slice(0, topK);
  return best.map((entry) => entry.id);
}
121
+
122
/**
 * Score how strongly a prompt overlaps with a skill's own wording.
 *
 * Tokens come from the skill id, its description and its positive trigger
 * prompts. Each occurrence that also appears in the prompt adds 2 points when
 * the token is longer than five characters, otherwise 1 point.
 *
 * @param {string} prompt - Task prompt.
 * @param {object} skill - Skill with `id`, `description` and optional `positive_triggers.prompts`.
 * @returns {number} Overlap score; 0 when nothing matches.
 */
function rawPromptScore(prompt, skill) {
  const examplePrompts = skill.positive_triggers?.prompts || [];
  const skillText = [skill.id, skill.description, ...examplePrompts].join(" ");
  const promptVocabulary = new Set(tokenize(prompt));
  return tokenize(skillText).reduce((total, token) => {
    if (!promptVocabulary.has(token)) return total;
    return total + (token.length > 5 ? 2 : 1);
  }, 0);
}
135
+
136
/**
 * Judge one system's skill selection for a single case.
 *
 * The selection is correct when all three conditions hold: every expected
 * skill was selected (or nothing was selected when no skill is expected), no
 * forbidden skill was selected, and every selected skill is expected or allowed.
 *
 * @param {object} row
 * @param {string} row.prompt - Task prompt of the case.
 * @param {string} row.fixture - Fixture the case runs against.
 * @param {string[]} row.expected - Skills that must be selected.
 * @param {string[]} row.allowed - Extra skills that may be selected.
 * @param {string[]} row.forbidden - Skills that must not be selected.
 * @param {string[]} row.selectedIds - Skills the system selected.
 * @returns {object} The inputs plus a boolean `correct`.
 */
function evaluateRow({ prompt, fixture, expected, allowed, forbidden, selectedIds }) {
  const picked = new Set(selectedIds);
  const permitted = new Set(expected);
  for (const skill of allowed) permitted.add(skill);

  let coversExpected;
  if (expected.length) {
    coversExpected = expected.every((skill) => picked.has(skill));
  } else {
    coversExpected = selectedIds.length === 0;
  }
  const pickedForbidden = forbidden.some((skill) => picked.has(skill));
  const onlyPermitted = selectedIds.every((skill) => permitted.has(skill));

  const correct = coversExpected && !pickedForbidden && onlyPermitted;
  return { prompt, fixture, expected, allowed, forbidden, selectedIds, correct };
}
154
+
155
/**
 * Aggregate evaluated rows into one leaderboard entry.
 *
 * @param {string} name - Display name of the system.
 * @param {Array<object>} rows - Evaluated rows carrying a `correct` flag.
 * @returns {object} `{ name, correct, total, correctRate }`; the rate is 0 when
 *   there are no rows.
 */
function summarizeSystem(name, rows) {
  const total = rows.length;
  let correct = 0;
  for (const row of rows) {
    if (row.correct) correct += 1;
  }
  const correctRate = total ? correct / total : 0;
  return { name, correct, total, correctRate };
}
164
+
165
/**
 * Split text into lowercase tokens for prompt/skill overlap scoring.
 *
 * Letters, digits, `@`, `.` and `-` are token characters, so names such as
 * "next.js" or "code-review-graph" stay whole. Punctuation on a token's edge
 * is trimmed, and tokens of two characters or fewer are dropped.
 *
 * @param {*} value - Text to tokenize; falsy values yield no tokens.
 * @returns {string[]} Tokens in order of appearance, duplicates preserved.
 */
function tokenize(value) {
  return String(value || "")
    .toLowerCase()
    .split(/[^a-z0-9@.-]+/)
    // Sentence punctuation glued to a word ("vercel.") must not stop it from
    // matching the bare word; a leading `@` is kept for scoped names.
    .map((token) => token.replace(/^[.-]+|[.@-]+$/g, ""))
    .filter((token) => token.length > 2);
}
171
+
172
/**
 * Build the identity key of an eval case from its fixture and prompt.
 *
 * The two parts are joined with a NUL character.
 *
 * @param {object} row - Object carrying `fixture` and `prompt`.
 * @returns {string} Key that is equal for rows with the same fixture and prompt.
 */
function caseId({ fixture, prompt }) {
  const separator = "\0";
  return `${fixture}${separator}${prompt}`;
}
175
+
176
/**
 * Format a 0..1 ratio as a percentage with one decimal place.
 *
 * @param {number} value - Ratio to format.
 * @returns {string} For example "80.0%" for 0.8.
 */
function percent(value) {
  const oneDecimal = (value * 100).toFixed(1);
  return `${oneDecimal}%`;
}
179
+
180
// Run the leaderboard only when this file is the process entry point, not when
// it is imported. Filesystem paths are compared rather than hand-built
// `file://` strings, which never match on Windows or for paths that need URL
// escaping (such as spaces).
if (process.argv[1] && path.resolve(process.argv[1]) === fileURLToPath(import.meta.url)) {
  const result = await runHallucinationLeaderboard();
  console.log(formatHallucinationLeaderboard(result));
}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@minhpnq1807/contextos",
3
- "version": "0.6.0",
3
+ "version": "0.6.1",
4
4
  "description": "Task-aware AGENTS.md context injection and compliance reporting for Codex, Claude Code, and Antigravity.",
5
5
  "type": "module",
6
6
  "bin": {
@@ -27,6 +27,7 @@
27
27
  "build": "node bin/ctx.js --version",
28
28
  "validate:plugin": "node test/validate-plugin.js",
29
29
  "benchmark:skills": "node bin/ctx.js benchmark --skills",
30
+ "leaderboard:hallucination": "node eval/hallucination/run-leaderboard.js",
30
31
  "test:mcp": "node test/mcp-protocol-smoke.js"
31
32
  },
32
33
  "engines": {
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "ctx",
3
- "version": "0.6.0",
3
+ "version": "0.6.1",
4
4
  "description": "Inject task-relevant AGENTS.md rules into Codex through plugin hooks.",
5
5
  "author": {
6
6
  "name": "ContextOS"