@minhpnq1807/contextos 0.6.1 → 0.6.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +12 -0
- package/README.md +92 -61
- package/bin/ctx.js +40 -3
- package/docs/demo/agents-lost-middle.gif +0 -0
- package/docs/demo/agents-lost-middle.txt +23 -26
- package/docs/demo/capture-live-demos.mjs +76 -0
- package/docs/demo/contextos-ready.gif +0 -0
- package/docs/demo/contextos-ready.txt +0 -6
- package/docs/demo/render-terminal-gif.mjs +1 -1
- package/docs/demo/same-prompt-different-context.gif +0 -0
- package/docs/demo/same-prompt-different-context.txt +38 -13
- package/eval/hallucination/run-agent-leaderboard.js +246 -0
- package/eval/hallucination/run-leaderboard.js +5 -5
- package/package.json +5 -1
- package/plugins/ctx/.codex-plugin/plugin.json +1 -1
- package/scripts/sync-community-skills.mjs +40 -0
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,17 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## 0.6.3
|
|
4
|
+
|
|
5
|
+
- **Launch benchmark wording:** Clarified that `ctx leaderboard --hallucination` is an offline deterministic benchmark comparing a raw heuristic baseline with ContextOS evidence-based context selection, while live agent results remain pending external CLI environments.
|
|
6
|
+
- **Offline leaderboard labels:** Renamed offline leaderboard output from agent-like labels to `Raw heuristic baseline` and `ContextOS evidence benchmark` so the 10% to 80% result is not confused with a live Codex/Gemini comparison.
|
|
7
|
+
- **Live leaderboard alias:** Added `ctx leaderboard --hallucination --live --agent <name>` as a launch-friendly alias for running the hallucination benchmark through one installed agent CLI. Live benchmark output now reports `OK`/`SKIPPED`/`ERROR` style statuses and supports `CONTEXTOS_<AGENT>_CMD` command templates for external wrappers.
|
|
8
|
+
|
|
9
|
+
## 0.6.2
|
|
10
|
+
|
|
11
|
+
- **Live agent leaderboard:** Added `ctx leaderboard --agents codex,gemini` and `npm run leaderboard:agents` to run the hallucination benchmark through installed Codex/Gemini CLIs with timeouts and skip/error reporting for missing or unauthenticated agents.
|
|
12
|
+
- **Live GIF capture:** Added `npm run demo:capture` to regenerate the three launch GIFs from real local `ctx` command output across ContextOS and skill-routing fixture repos.
|
|
13
|
+
- **Community skills sync:** Added `scripts/sync-community-skills.mjs`, `npm run sync:community-skills`, and a scheduled/manual GitHub Action that opens PRs from `khovan123/contextOS-skills` back into `community-skills/`.
|
|
14
|
+
|
|
3
15
|
## 0.6.1
|
|
4
16
|
|
|
5
17
|
- **Hallucination Leaderboard:** Added `ctx leaderboard --hallucination` and `npm run leaderboard:hallucination` to compare raw prompt-only skill guesses against ContextOS evidence-routed skill selection across 20 fixture tasks.
|
package/README.md
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
# ContextOS
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
Stop coding agents from ignoring repo rules, guessing the wrong path, and reading random files.
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
ContextOS gives the agent the right rules, files, skills, workflows, and evidence before it writes code.
|
|
6
6
|
|
|
7
7
|
[](https://www.npmjs.com/package/@minhpnq1807/contextos)
|
|
8
8
|
[](https://github.com/khovan123/contextOS/actions/workflows/ci.yml)
|
|
@@ -10,19 +10,17 @@ Rules, files, skills, workflows, and evidence: injected before the agent writes
|
|
|
10
10
|
[](LICENSE)
|
|
11
11
|
|
|
12
12
|
```text
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
important rules drift into the middle
|
|
16
|
-
agent starts by grepping files and misses the repo contract
|
|
13
|
+
Problem: Agents ignore project rules.
|
|
14
|
+
Fix: ContextOS puts the relevant AGENTS.md rules in front of the agent for this task.
|
|
17
15
|
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
16
|
+
Problem: Agents choose the wrong deployment path.
|
|
17
|
+
Fix: ContextOS checks repo evidence before suggesting skills like EAS, Vercel, Docker, or CI/CD.
|
|
18
|
+
|
|
19
|
+
Problem: Agents grep random files.
|
|
20
|
+
Fix: ContextOS suggests the files and workflows to check first.
|
|
23
21
|
```
|
|
24
22
|
|
|
25
|
-
ContextOS is not another `AGENTS.md` loader. It is a
|
|
23
|
+
ContextOS is not another `AGENTS.md` loader. It is a pre-flight context layer for coding agents: it turns repo rules, project signals, skills, workflows, and evidence into a compact task brief before the agent starts editing.
|
|
26
24
|
|
|
27
25
|
Published package: [`@minhpnq1807/contextos`](https://www.npmjs.com/package/@minhpnq1807/contextos)
|
|
28
26
|
|
|
@@ -36,7 +34,7 @@ Same prompt. Same model. Different context.
|
|
|
36
34
|
ctx skills doctor -- "fix deployed"
|
|
37
35
|
```
|
|
38
36
|
|
|
39
|
-
| Repo evidence |
|
|
37
|
+
| Repo evidence | What ContextOS tells the agent |
|
|
40
38
|
| --- | --- |
|
|
41
39
|
| `eas.json`, `expo`, `react-native` | `eas`, `mobile-deployment`, `github-actions-ci-cd` |
|
|
42
40
|
| `vercel.json`, `next`, GitHub workflow | `vercel-deployment`, `github-actions-ci-cd`, `env-secret-management` |
|
|
@@ -49,7 +47,13 @@ More 10-second demos:
|
|
|
49
47
|
| AGENTS.md Lost In The Middle | [docs/demo/agents-lost-middle.gif](docs/demo/agents-lost-middle.gif) |
|
|
50
48
|
| ContextOS Ready Gold | [docs/demo/contextos-ready.gif](docs/demo/contextos-ready.gif) |
|
|
51
49
|
|
|
52
|
-
|
|
50
|
+
Regenerate the GIFs from real local `ctx` command output:
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
npm run demo:capture
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## Wrong Path Benchmark
|
|
53
57
|
|
|
54
58
|
Generic agents often guess deployment tooling from the prompt alone:
|
|
55
59
|
|
|
@@ -58,7 +62,7 @@ Prompt: Fix deployment
|
|
|
58
62
|
Raw agent guess: Vercel, Docker, Railway
|
|
59
63
|
```
|
|
60
64
|
|
|
61
|
-
ContextOS
|
|
65
|
+
ContextOS checks the repo first:
|
|
62
66
|
|
|
63
67
|
```text
|
|
64
68
|
Detected evidence:
|
|
@@ -72,9 +76,9 @@ Selected skills:
|
|
|
72
76
|
- github-actions-ci-cd
|
|
73
77
|
```
|
|
74
78
|
|
|
75
|
-
That is the core launch demo: same prompt, same model, different repo
|
|
79
|
+
That is the core launch demo: same prompt, same model, different repo, correct next step.
|
|
76
80
|
|
|
77
|
-
|
|
81
|
+
Internal fixture benchmark:
|
|
78
82
|
|
|
79
83
|
| Metric | Result |
|
|
80
84
|
| --- | ---: |
|
|
@@ -85,26 +89,44 @@ Skill Router internal fixture benchmark:
|
|
|
85
89
|
| Confidence Calibration | 100.0% |
|
|
86
90
|
| Negative Gate Accuracy | 100.0% |
|
|
87
91
|
|
|
88
|
-
This is an internal fixture benchmark, not an external real-world benchmark. It is designed to prove
|
|
92
|
+
This is an internal fixture benchmark, not an external real-world benchmark. It is designed to prove that ContextOS changes its suggestions from repo evidence across controlled Expo/EAS, Next/Vercel, Docker, Railway/Render, Firebase, auth, database, testing, mobile, and adversarial negative cases.
|
|
89
93
|
|
|
90
|
-
|
|
94
|
+
Offline hallucination leaderboard:
|
|
91
95
|
|
|
92
96
|
```bash
|
|
93
97
|
ctx leaderboard --hallucination
|
|
94
98
|
```
|
|
95
99
|
|
|
96
|
-
Current
|
|
100
|
+
Current deterministic result across 20 fixture tasks and 12 repo contexts:
|
|
97
101
|
|
|
98
|
-
| System | Correct
|
|
102
|
+
| System | Correct context choice |
|
|
99
103
|
| --- | ---: |
|
|
100
|
-
| Raw
|
|
101
|
-
| ContextOS
|
|
104
|
+
| Raw heuristic baseline | 10.0% |
|
|
105
|
+
| ContextOS evidence benchmark | 80.0% |
|
|
106
|
+
|
|
107
|
+
This means ContextOS improves deterministic context routing from 10% to 80% on the offline hallucination task set. It does not claim ContextOS beats Codex, Gemini, Claude Code, or Cursor in live runs.
|
|
108
|
+
|
|
109
|
+
Live agent benchmark support exists, but results are pending an external environment with working CLI auth/session access:
|
|
110
|
+
|
|
111
|
+
```bash
|
|
112
|
+
ctx leaderboard --hallucination --live --agent codex
|
|
113
|
+
ctx leaderboard --hallucination --live --agent gemini
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
If a CLI cannot run in the current environment, the command reports `SKIPPED` or an agent error instead of blocking launch.
|
|
117
|
+
|
|
118
|
+
Live benchmark tracking:
|
|
119
|
+
|
|
120
|
+
- [Run Codex live benchmark](https://github.com/khovan123/contextOS/issues/1)
|
|
121
|
+
- [Run Claude Code live benchmark](https://github.com/khovan123/contextOS/issues/3)
|
|
122
|
+
- [Run Gemini CLI live benchmark](https://github.com/khovan123/contextOS/issues/4)
|
|
123
|
+
- [Run Cursor live benchmark](https://github.com/khovan123/contextOS/issues/2)
|
|
102
124
|
|
|
103
125
|
Example hook context injected before the agent works:
|
|
104
126
|
|
|
105
127
|
```text
|
|
106
128
|
## Critical ContextOS rules
|
|
107
|
-
- IMPORTANT: This project has a knowledge graph.
|
|
129
|
+
- IMPORTANT: This project has a knowledge graph. Use it before broad file search.
|
|
108
130
|
- Use `query_graph` pattern="tests_for" to check coverage.
|
|
109
131
|
|
|
110
132
|
## Suggested files to check
|
|
@@ -123,7 +145,7 @@ ContextOS report
|
|
|
123
145
|
Efficiency: 100%
|
|
124
146
|
Injected rules: 8
|
|
125
147
|
Rule outcomes: 8 followed, 0 ignored, 0 unknown
|
|
126
|
-
Runtime
|
|
148
|
+
Runtime evidence: project graph was used before file search
|
|
127
149
|
```
|
|
128
150
|
|
|
129
151
|
## Quick Install
|
|
@@ -166,37 +188,44 @@ ctx install agy
|
|
|
166
188
|
|
|
167
189
|
Restart the agent after setup. Then use the agent normally.
|
|
168
190
|
|
|
169
|
-
## Why
|
|
191
|
+
## Why ContextOS Exists
|
|
170
192
|
|
|
171
193
|
Developers put real operating instructions in `AGENTS.md`: use this graph tool before reading files, run these tests, follow this architecture boundary, avoid this migration path.
|
|
172
194
|
|
|
173
|
-
The problem is not that agents cannot read `AGENTS.md`. The problem is that large context windows bury the important rule in the middle, where attention is weak.
|
|
195
|
+
The problem is not that agents cannot read `AGENTS.md`. The problem is that large context windows bury the important rule in the middle, where attention is weak.
|
|
196
|
+
|
|
197
|
+
The same thing happens with project structure:
|
|
198
|
+
|
|
199
|
+
- A deployment prompt says "fix deploy", and the agent guesses Vercel in an Expo repo.
|
|
200
|
+
- A backend error mentions Fastify, and the agent loads frontend skills.
|
|
201
|
+
- A feature request names one route, and the agent starts with broad grep instead of the files that matter.
|
|
202
|
+
|
|
203
|
+
ContextOS fixes those three failures before the agent starts work.
|
|
174
204
|
|
|
175
205
|
The next visible demo is not another feature. It is showing the pain in a few seconds:
|
|
176
206
|
|
|
177
207
|
```text
|
|
178
208
|
Raw agent: guesses from the prompt.
|
|
179
|
-
ContextOS:
|
|
209
|
+
ContextOS: checks repo evidence first.
|
|
180
210
|
```
|
|
181
211
|
|
|
182
212
|
## What ContextOS Does
|
|
183
213
|
|
|
184
|
-
|
|
|
214
|
+
| Agent failure | ContextOS behavior |
|
|
185
215
|
| --- | --- |
|
|
186
|
-
|
|
|
187
|
-
|
|
|
188
|
-
|
|
|
189
|
-
|
|
|
190
|
-
|
|
|
191
|
-
| Evidence | Stop hooks persist `followed`, `ignored`, `unknown`, and runtime telemetry for explicit reports. |
|
|
216
|
+
| Ignores project rules | Shows the relevant rules at the start of the task. |
|
|
217
|
+
| Picks the wrong tool or deployment path | Suggests skills only when the repo has supporting evidence. |
|
|
218
|
+
| Reads random files first | Suggests the likely files and workflows before exploration starts. |
|
|
219
|
+
| Claims compliance without proof | Reports which rules were followed, ignored, or unknown after the task. |
|
|
220
|
+
| Needs to work across agents | Supports Codex, Claude Code, and Antigravity with the same project context. |
|
|
192
221
|
|
|
193
222
|
## Comparison
|
|
194
223
|
|
|
195
224
|
| Approach | What it gives the agent | Main gap |
|
|
196
225
|
| --- | --- | --- |
|
|
197
226
|
| Plain `AGENTS.md` | Static repo instructions. | Important rules get buried or ignored when the task changes. |
|
|
198
|
-
| Generic RAG |
|
|
199
|
-
| ContextOS | Task-
|
|
227
|
+
| Generic RAG | Related files or snippets. | It usually does not choose skills/workflows or prove rule compliance. |
|
|
228
|
+
| ContextOS | Task-specific rules, files, skills, workflows, and evidence. | Requires local setup and prepared indexes for best results. |
|
|
200
229
|
|
|
201
230
|
## Safety Model
|
|
202
231
|
|
|
@@ -206,20 +235,20 @@ ContextOS is designed to be OSS-friendly and low-friction:
|
|
|
206
235
|
| --- | --- |
|
|
207
236
|
| Standalone by default | `ctx setup` works without `code-review-graph`, `codegraph`, or `agent-memory`. |
|
|
208
237
|
| Optional adapters | Graph and memory backends add signal when available; missing adapters contribute score `0`. |
|
|
209
|
-
| Fail-open hooks | Prompt hooks return local context or nothing instead of blocking the agent when
|
|
238
|
+
| Fail-open hooks | Prompt hooks return local context or nothing instead of blocking the agent when optional runtime pieces are unavailable. |
|
|
210
239
|
| Local-only telemetry | Reports, prompt history, evidence, and telemetry stay under `~/.ctx/contextos/`. |
|
|
211
|
-
| No hook network calls | Prompt and stop hooks do not call external services. Install/warm commands may
|
|
240
|
+
| No hook network calls | Prompt and stop hooks do not call external services. Install/warm commands may prepare local indexes when explicitly run. |
|
|
212
241
|
| No postinstall surprise | `npm install` only installs the CLI. Setup runs only when you call `ctx setup`. |
|
|
213
242
|
|
|
214
|
-
Positioning: ContextOS works standalone and gets smarter when graph or memory adapters are available.
|
|
243
|
+
Positioning: ContextOS works standalone and gets smarter when project graph or memory adapters are available.
|
|
215
244
|
|
|
216
245
|
## Roadmap
|
|
217
246
|
|
|
218
|
-
ContextOS is not heading toward a dashboard-first product. The next work is focused on making the existing local
|
|
247
|
+
ContextOS is not heading toward a dashboard-first product. The next work is focused on making the existing local behavior more visible and reusable:
|
|
219
248
|
|
|
220
249
|
| Next | Why |
|
|
221
250
|
| --- | --- |
|
|
222
|
-
| Hallucination Leaderboard | Compare raw agent guesses vs ContextOS evidence-
|
|
251
|
+
| Hallucination Leaderboard | Compare raw agent guesses vs ContextOS evidence-based recommendations across the same repos and tasks. |
|
|
223
252
|
| Agent Replay | Turn telemetry into a readable post-task narrative: prompt, selected skills, followed rules, suggested files, touched files, efficiency. |
|
|
224
253
|
| Community Skill Packs | Let contributors PR ContextOS-ready skills with triggers, evidence, negative gates, and workflows before building a larger hub. |
|
|
225
254
|
| ContextOS Ready | Define a repository readiness badge for AGENTS.md, skills, workflows, and evidence quality. |
|
|
@@ -231,11 +260,11 @@ See [docs/roadmap.md](docs/roadmap.md) for the current roadmap notes.
|
|
|
231
260
|
|
|
232
261
|
ContextOS starts the community loop with [`community-skills/`](community-skills/) instead of a hosted marketplace. The seed packs are `eas`, `vercel`, `prisma`, `redis`, `oauth-google`, and `jwt-auth`.
|
|
233
262
|
|
|
234
|
-
Each pack contains a model-visible `SKILL.md` plus `skill.yaml`
|
|
263
|
+
Each pack contains a model-visible `SKILL.md` plus `skill.yaml` metadata with prompt triggers, project evidence, negative triggers, and a short workflow. Contributors can PR new packs by copying [`community-skills/_template/`](community-skills/_template/).
|
|
235
264
|
|
|
236
265
|
## ContextOS Ready
|
|
237
266
|
|
|
238
|
-
`ctx doctor` scores whether a repository is ready for ContextOS-style agent
|
|
267
|
+
`ctx doctor` scores whether a repository is ready for ContextOS-style agent guidance:
|
|
239
268
|
|
|
240
269
|
```bash
|
|
241
270
|
ctx doctor
|
|
@@ -265,9 +294,11 @@ The score checks project `AGENTS.md` rules, project skill packs under `.codex/sk
|
|
|
265
294
|
| `ctx evidence` | Show why each rule was marked followed/ignored/unknown. |
|
|
266
295
|
| `ctx stats` | Show workspace-level usage and effectiveness metrics. |
|
|
267
296
|
| `ctx benchmark -- "task"` | Compare raw AGENTS.md ordering vs ContextOS scheduling. |
|
|
268
|
-
| `ctx benchmark --skills` | Run the
|
|
269
|
-
| `ctx leaderboard --hallucination` |
|
|
270
|
-
| `ctx
|
|
297
|
+
| `ctx benchmark --skills` | Run the skill selection eval benchmark. |
|
|
298
|
+
| `ctx leaderboard --hallucination` | Run the offline deterministic hallucination benchmark. |
|
|
299
|
+
| `ctx leaderboard --hallucination --live --agent codex` | Run the live CLI benchmark when agent auth/session is available. |
|
|
300
|
+
| `ctx leaderboard --agents codex,gemini` | Legacy live CLI leaderboard form. |
|
|
301
|
+
| `ctx sync --rules` | Sync project rules across agents. |
|
|
271
302
|
| `ctx sync --skills` | Sync skills across agents through skillshare. |
|
|
272
303
|
| `ctx sync --workflows` | Sync workflow markdown across Claude/Codex/Antigravity. |
|
|
273
304
|
|
|
@@ -276,7 +307,7 @@ The score checks project `AGENTS.md` rules, project skill packs under `.codex/sk
|
|
|
276
307
|
1. Start in a repo with an `AGENTS.md` that contains a rule like:
|
|
277
308
|
|
|
278
309
|
```text
|
|
279
|
-
Always use
|
|
310
|
+
Always use the project graph before reading files.
|
|
280
311
|
```
|
|
281
312
|
|
|
282
313
|
2. Install:
|
|
@@ -591,7 +622,9 @@ This warning comes from a transitive dependency in the local embedding/WASM stac
|
|
|
591
622
|
| `ctx stats` | Shows aggregate runtime metrics for the current workspace. | You want to know whether ContextOS is active and useful over time. | Prints sectioned tables for prompt/report counts, injection rate, efficiency, rule outcomes, hook events, last prompt, and last report. |
|
|
592
623
|
| `ctx benchmark -- "task"` | Compares baseline AGENTS.md ordering with ContextOS task-aware scheduling. | You want a before/after signal for lost-in-the-middle risk. | Prints tables for parsed/actionable/filtered rules, baseline middle-risk, scheduled high/mid rules, recency reminder status, and top scored rules. |
|
|
593
624
|
| `ctx benchmark --skills` | Runs the Skill Router eval benchmark. | You want evidence for skill routing accuracy and negative gates. | Prints top-1 accuracy, top-3 recall, false positive rate, confidence calibration, and negative gate accuracy across `eval/skill-routing` fixtures. |
|
|
594
|
-
| `ctx leaderboard --hallucination` |
|
|
625
|
+
| `ctx leaderboard --hallucination` | Runs the offline deterministic hallucination benchmark. | You want launch evidence for the wrong-context problem without depending on external agent auth. | Runs 20 fixture tasks across 12 repo contexts and prints Raw heuristic baseline vs ContextOS evidence benchmark plus sample failures. |
|
|
626
|
+
| `ctx leaderboard --hallucination --live --agent codex` | Runs the hallucination benchmark through one installed agent CLI. | You want real agent output and have CLI auth/session available. | Calls the selected CLI with timeouts; missing, blocked, or unauthenticated CLIs are reported as skipped/errors instead of blocking. |
|
|
627
|
+
| `ctx leaderboard --agents codex,gemini` | Legacy live CLI leaderboard form. | You want to run multiple live agents at once. | Runs the same live benchmark as `--hallucination --live`, once for each CLI in the comma-separated list. |
|
|
595
628
|
| `ctx sync --rules` | Syncs project rules and MCP servers through Ruler. | You want Codex, Claude Code, and Antigravity to share one project rule/MCP source of truth. | Ensures `.ruler/ruler.toml`, injects `ctx-mcp`, imports existing MCP servers from Codex and project `.mcp.json`, runs `ruler apply --agents codex,claude,antigravity`, mirrors MCP servers to Antigravity MCP configs, and verifies generated config. |
|
|
596
629
|
| `ctx sync --rules --agents <list>` | Syncs only selected agents through Ruler. | You want to update one or two agents without touching the others. | Accepts comma-separated values such as `codex`, `claude`, `agy`, `antigravity`, or `codex,claude,agy`; `agy` is normalized to Ruler's `antigravity`. |
|
|
597
630
|
| `ctx sync --rules --dry-run` | Previews Ruler sync without writing files or running apply. | You want to inspect behavior before changing project config. | Prints the same flow with dry-run status. |
|
|
@@ -656,7 +689,7 @@ These files are local telemetry only. Hooks do not make network calls.
|
|
|
656
689
|
|
|
657
690
|
## Project Understanding
|
|
658
691
|
|
|
659
|
-
ContextOS works standalone. The
|
|
692
|
+
ContextOS works standalone. The default path is local project rules, prepared file indexes, project skills, workflows, and evidence capture.
|
|
660
693
|
|
|
661
694
|
Project graph and memory backends are optional adapters:
|
|
662
695
|
|
|
@@ -668,26 +701,24 @@ Project graph and memory backends are optional adapters:
|
|
|
668
701
|
|
|
669
702
|
ContextOS does not require `code-review-graph`, `codegraph`, or `agent-memory` to install or run. It gets smarter when those backends are available; when they are missing, the adapter scores stay at zero and the hook continues with local context.
|
|
670
703
|
|
|
671
|
-
For file suggestions, ContextOS
|
|
704
|
+
For file suggestions, ContextOS uses prepared local indexes:
|
|
672
705
|
|
|
673
706
|
```text
|
|
674
707
|
prompt
|
|
675
|
-
->
|
|
676
|
-
->
|
|
677
|
-
->
|
|
678
|
-
->
|
|
679
|
-
->
|
|
680
|
-
-> merge and deduplicate semantic, import-graph, and optional graph matches
|
|
681
|
-
-> inject top suggested files with graph evidence reasons
|
|
708
|
+
-> read task-relevant AGENTS.md rules
|
|
709
|
+
-> suggest prepared file candidates
|
|
710
|
+
-> expand nearby imports
|
|
711
|
+
-> add optional project-graph matches when available
|
|
712
|
+
-> inject a compact list of files to check
|
|
682
713
|
```
|
|
683
714
|
|
|
684
|
-
This keeps the hook fast and local while still using graph
|
|
715
|
+
This keeps the hook fast and local while still using project graph signal when available. When no graph adapter is available, file suggestions still use local file indexes and import expansion.
|
|
685
716
|
|
|
686
|
-
Prompt
|
|
717
|
+
Prompt-time file suggestions do not walk the repository. `ctx install` and `ctx embeddings warm` rebuild the file index and one-hop import adjacency by walking source paths once; prompt hooks query those prepared indexes directly. Rules, files, skills, and workflows are resolved concurrently.
|
|
687
718
|
|
|
688
719
|
`ctx embeddings warm` automatically refreshes the active Codex marketplace payload before rebuilding indexes. Use `ctx refresh` when you want the same marketplace sync plus install-style file, skill, import, and code-review-graph embedding refresh in one command.
|
|
689
720
|
|
|
690
|
-
If a prompt has no usable context candidates, the hook fails open without emitting an empty `hook context` block, records `emptyContextReason` in the workspace runtime file, and starts a detached `autowarm` rebuild with a cooldown. That background rebuild refreshes
|
|
721
|
+
If a prompt has no usable context candidates, the hook fails open without emitting an empty `hook context` block, records `emptyContextReason` in the workspace runtime file, and starts a detached `autowarm` rebuild with a cooldown. That background rebuild refreshes prepared indexes for the next prompt while keeping repository walking out of the current prompt path.
|
|
691
722
|
|
|
692
723
|
Use `ctx --config` to choose which prompt sections ContextOS injects and how many suggestions each section may show. Interactive `ctx setup` includes the same section picker and limit prompts, while `ctx setup --yes` keeps the current saved config for automation. The panel supports multiple selection with `Space` and persists the global choice in `~/.ctx/contextos/output-config.json`. Defaults are five suggested files, five skills, and five workflows; caps are 20 files, 10 skills, and 5 workflows. Disabling rules hides both critical and additional relevant rule sections; compliance metadata remains available for reports.
|
|
693
724
|
|
package/bin/ctx.js
CHANGED
|
@@ -21,6 +21,7 @@ import { installMcpTelemetryProxies } from "../plugins/ctx/lib/mcp-proxy-install
|
|
|
21
21
|
import { benchmarkWorkspace, formatBenchmark } from "../plugins/ctx/lib/benchmark.js";
|
|
22
22
|
import { formatSkillRoutingBenchmark, runSkillRoutingEval } from "../eval/skill-routing/run-eval.js";
|
|
23
23
|
import { formatHallucinationLeaderboard, runHallucinationLeaderboard } from "../eval/hallucination/run-leaderboard.js";
|
|
24
|
+
import { formatAgentLeaderboard, runAgentLeaderboard } from "../eval/hallucination/run-agent-leaderboard.js";
|
|
24
25
|
import { copyDir, copyPackageRoot, syncPackageRoot } from "../plugins/ctx/lib/package-install.js";
|
|
25
26
|
import { installClaudeHooks } from "../plugins/ctx/lib/claude-hooks.js";
|
|
26
27
|
import { installClaudeMcp } from "../plugins/ctx/lib/claude-mcp.js";
|
|
@@ -198,7 +199,10 @@ Usage:
|
|
|
198
199
|
ctx stats Show workspace statistics
|
|
199
200
|
ctx benchmark -- "task" Benchmark workspace for a task
|
|
200
201
|
ctx benchmark --skills Run skill routing eval benchmark
|
|
201
|
-
ctx leaderboard --hallucination
|
|
202
|
+
ctx leaderboard --hallucination Run offline deterministic hallucination benchmark
|
|
203
|
+
ctx leaderboard --hallucination --live --agent codex
|
|
204
|
+
Run hallucination benchmark through one live CLI
|
|
205
|
+
ctx leaderboard --agents codex,gemini Run live CLI leaderboard for installed agents
|
|
202
206
|
ctx sync --rules Sync AGENTS.md rules to all agents
|
|
203
207
|
ctx sync --rules --agents <names> Sync rules to specific agents only
|
|
204
208
|
ctx sync --rules --dry-run Preview rule sync without writing
|
|
@@ -250,6 +254,18 @@ function normalizeInstallAgent(agent) {
|
|
|
250
254
|
if (normalized === "antigravity") return "agy";
|
|
251
255
|
return normalized;
|
|
252
256
|
}
|
|
257
|
+
|
|
258
|
+
/**
 * Extract the agent names requested for a live leaderboard run.
 *
 * Looks at `--agent <name>` first and `--agents <a,b,...>` second. The value
 * is split on commas, each entry is trimmed, and empty entries are dropped.
 *
 * A flag whose following token is missing, empty, or itself an option
 * (starts with "-") is treated as having no value, so a command line such as
 * `--agent --limit 3` no longer yields the bogus agent name "--limit". In
 * that case the other flag is still consulted before giving up.
 *
 * @param {string[]} args - Command-line arguments to scan.
 * @returns {string[]} Requested agent names, or an empty array when none
 *   were supplied.
 */
function leaderboardAgentsFromArgs(args) {
  for (const flag of ["--agent", "--agents"]) {
    const index = args.indexOf(flag);
    if (index < 0) continue;

    const value = String(args[index + 1] ?? "");
    // The next token is another option, so this flag was given without a value.
    if (value.startsWith("-")) continue;

    const agents = value
      .split(",")
      .map((agent) => agent.trim())
      .filter(Boolean);
    if (agents.length > 0) return agents;
  }
  return [];
}
|
|
268
|
+
|
|
253
269
|
/**
|
|
254
270
|
* Intercept console.log from an async fn,
|
|
255
271
|
* printing each line immediately with "│ " prefix for real-time feedback.
|
|
@@ -1037,10 +1053,31 @@ try {
|
|
|
1037
1053
|
console.log(formatBenchmark(benchmarkWorkspace({ cwd: process.cwd(), task })));
|
|
1038
1054
|
}
|
|
1039
1055
|
} else if (command === "leaderboard") {
|
|
1040
|
-
if (args.includes("--hallucination")) {
|
|
1056
|
+
if (args.includes("--hallucination") && args.includes("--live")) {
|
|
1057
|
+
const agents = leaderboardAgentsFromArgs(args);
|
|
1058
|
+
const limitIndex = args.indexOf("--limit");
|
|
1059
|
+
const timeoutIndex = args.indexOf("--timeout-ms");
|
|
1060
|
+
console.log(formatAgentLeaderboard(runAgentLeaderboard({
|
|
1061
|
+
rootDir,
|
|
1062
|
+
agents: agents.length ? agents : undefined,
|
|
1063
|
+
caseLimit: limitIndex >= 0 ? Number(args[limitIndex + 1]) : undefined,
|
|
1064
|
+
timeoutMs: timeoutIndex >= 0 ? Number(args[timeoutIndex + 1]) : undefined
|
|
1065
|
+
})));
|
|
1066
|
+
} else if (args.includes("--hallucination")) {
|
|
1041
1067
|
console.log(formatHallucinationLeaderboard(await runHallucinationLeaderboard({ rootDir })));
|
|
1068
|
+
} else if (args.includes("--agents")) {
|
|
1069
|
+
const index = args.indexOf("--agents");
|
|
1070
|
+
const agents = String(args[index + 1] || "").split(",").map((agent) => agent.trim()).filter(Boolean);
|
|
1071
|
+
const limitIndex = args.indexOf("--limit");
|
|
1072
|
+
const timeoutIndex = args.indexOf("--timeout-ms");
|
|
1073
|
+
console.log(formatAgentLeaderboard(runAgentLeaderboard({
|
|
1074
|
+
rootDir,
|
|
1075
|
+
agents: agents.length ? agents : undefined,
|
|
1076
|
+
caseLimit: limitIndex >= 0 ? Number(args[limitIndex + 1]) : undefined,
|
|
1077
|
+
timeoutMs: timeoutIndex >= 0 ? Number(args[timeoutIndex + 1]) : undefined
|
|
1078
|
+
})));
|
|
1042
1079
|
} else {
|
|
1043
|
-
throw new Error("Usage: ctx leaderboard --hallucination");
|
|
1080
|
+
throw new Error("Usage: ctx leaderboard --hallucination OR ctx leaderboard --hallucination --live --agent codex OR ctx leaderboard --agents codex,gemini");
|
|
1044
1081
|
}
|
|
1045
1082
|
} else if (command === "skills") {
|
|
1046
1083
|
if (args[1] === "doctor") {
|
|
Binary file
|
|
@@ -1,28 +1,25 @@
|
|
|
1
|
-
$
|
|
2
|
-
|
|
3
|
-
2. Formatting
|
|
4
|
-
3. Test names
|
|
5
|
-
...
|
|
6
|
-
37. IMPORTANT: Always use code-review-graph before grep.
|
|
7
|
-
...
|
|
8
|
-
52. Release notes
|
|
1
|
+
$ ctx benchmark -- "fix failing test"
|
|
2
|
+
ContextOS benchmark
|
|
9
3
|
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
4
|
+
Summary
|
|
5
|
+
-------
|
|
6
|
+
Metric Value
|
|
7
|
+
-------------------- -------------------------
|
|
8
|
+
Task fix failing test
|
|
9
|
+
Rules parsed 7
|
|
10
|
+
Actionable rules 5
|
|
11
|
+
Filtered rules 2
|
|
12
|
+
Relevant rules 1
|
|
13
|
+
Baseline middle-risk 1/1 relevant rules (100%)
|
|
14
|
+
ContextOS scheduled 1 high, 0 mid
|
|
15
|
+
Recency reminder enabled
|
|
13
16
|
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
$ codex + ContextOS
|
|
24
|
-
Rule followed: yes
|
|
25
|
-
Evidence: graph checked before file reads
|
|
26
|
-
|
|
27
|
-
AGENTS.md did not change.
|
|
28
|
-
The rule moved from buried context into runtime context.
|
|
17
|
+
Top Rules
|
|
18
|
+
---------
|
|
19
|
+
Score Rule Reasons
|
|
20
|
+
----- ---------------------------------------------------------------------------------------- ----------
|
|
21
|
+
0.50 IMPORTANT: This project has a knowledge graph. ALWAYS use code-review-graph MCP tools... imperative
|
|
22
|
+
0.00 AGENTS.md
|
|
23
|
+
0.00 Centralised AI agent instructions. Add coding guidelines, style guides, and project c...
|
|
24
|
+
0.00 Ruler concatenates all .md files in this directory (and subdirectories), starting wit...
|
|
25
|
+
0.00 Use `query_graph` pattern="tests_for" to check coverage.
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import { execFileSync } from "node:child_process";
|
|
3
|
+
import fs from "node:fs";
|
|
4
|
+
import path from "node:path";
|
|
5
|
+
import { fileURLToPath } from "node:url";
|
|
6
|
+
|
|
7
|
+
import { benchmarkWorkspace, formatBenchmark } from "../../plugins/ctx/lib/benchmark.js";
|
|
8
|
+
import { formatContextOSReady, inspectContextOSReady } from "../../plugins/ctx/lib/certification.js";
|
|
9
|
+
import { formatHallucinationLeaderboard, runHallucinationLeaderboard } from "../../eval/hallucination/run-leaderboard.js";
|
|
10
|
+
import { runSkillRoutingEval } from "../../eval/skill-routing/run-eval.js";
|
|
11
|
+
|
|
12
|
+
// Paths are resolved relative to this script: the repository root sits two
// directories up, and the GIF renderer lives next to this file.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
const repoRoot = path.resolve(__dirname, "..", "..");
const render = path.join(__dirname, "render-terminal-gif.mjs");

// Run both evaluations once up front; the demo steps below reuse the results.
const leaderboard = await runHallucinationLeaderboard({ rootDir: repoRoot });
const skillEval = await runSkillRoutingEval({ rootDir: repoRoot });

// Each demo lists its transcript file, its GIF file, and the
// [command label, captured output] pairs that make up the transcript.
const demos = [
  {
    log: "same-prompt-different-context.txt",
    gif: "same-prompt-different-context.gif",
    steps: [
      ["ctx leaderboard --hallucination", formatHallucinationLeaderboard(leaderboard)],
      ["ctx skills doctor -- \"fix deployed\" # Expo fixture", routeSummary("expo-eas", "fix deployed")],
      ["ctx skills doctor -- \"fix deployed\" # Next/Vercel fixture", routeSummary("next-vercel", "fix deployed")]
    ]
  },
  {
    log: "agents-lost-middle.txt",
    gif: "agents-lost-middle.gif",
    steps: [
      ["ctx benchmark -- \"fix failing test\"", formatBenchmark(benchmarkWorkspace({ cwd: repoRoot, task: "fix failing test" }))]
    ]
  },
  {
    log: "contextos-ready.txt",
    gif: "contextos-ready.gif",
    steps: [
      ["ctx doctor", formatContextOSReady(inspectContextOSReady({ cwd: repoRoot }))]
    ]
  }
];

// Write each demo's transcript, then hand the log and GIF paths to the renderer script.
for (const { log, gif, steps } of demos) {
  const logPath = path.join(__dirname, log);
  const gifPath = path.join(__dirname, gif);

  // Every step contributes a prompt line, its cleaned output, and a blank separator.
  const transcript = steps
    .flatMap(([label, output]) => [`$ ${label}`, cleanOutput(output), ""])
    .join("\n")
    .trimEnd();

  fs.writeFileSync(logPath, `${transcript}\n`, "utf8");
  execFileSync(process.execPath, [render, logPath, gifPath], { cwd: repoRoot, stdio: "inherit" });
}
|
|
56
|
+
|
|
57
|
+
/**
 * Normalizes captured command output for a demo transcript.
 *
 * Accepts LF or CRLF line endings, strips trailing whitespace from every line,
 * and trims leading/trailing whitespace from the text as a whole.
 *
 * @param {unknown} output - Captured output; falsy values are treated as empty text.
 * @returns {string} The cleaned text joined with "\n".
 */
function cleanOutput(output) {
  const text = String(output || "");
  const cleanedLines = [];
  for (const line of text.split(/\r?\n/)) {
    cleanedLines.push(line.trimEnd());
  }
  return cleanedLines.join("\n").trim();
}
|
|
64
|
+
|
|
65
|
+
/**
 * Builds the "skill doctor" transcript text for one fixture/prompt pair.
 *
 * Looks the pair up in the module-level `skillEval.rows` results.
 *
 * @param {string} fixture - Fixture name to match against `row.fixture`.
 * @param {string} prompt - Prompt text to match against `row.prompt`.
 * @returns {string} Multi-line summary, or "No route found." when no row matches.
 */
function routeSummary(fixture, prompt) {
  const match = skillEval.rows.find((item) => item.fixture === fixture && item.prompt === prompt);
  if (!match) {
    return "No route found.";
  }

  // An empty list is shown as "(none)" rather than a blank value.
  const selected = match.selectedIds.join(", ") || "(none)";
  const expected = match.expected.join(", ") || "(none)";
  const rejected = match.forbidden.join(", ") || "(none)";

  const summary = [];
  summary.push("ContextOS skill doctor");
  summary.push(`fixture: ${fixture}`);
  summary.push(`prompt: ${prompt}`);
  summary.push(`selected: ${selected}`);
  summary.push(`expected: ${expected}`);
  summary.push(`rejected: ${rejected}`);
  return summary.join("\n");
}
|
|
Binary file
|
|
@@ -12,9 +12,3 @@ Evidence:
|
|
|
12
12
|
- Rules: 1 AGENTS.md source(s), 5 actionable rule(s)
|
|
13
13
|
- Skills: 3 skill(s), 3 metadata file(s)
|
|
14
14
|
- Workflows: 2 workflow(s), 2 with agent chain(s)
|
|
15
|
-
|
|
16
|
-
$ badge
|
|
17
|
-
[ContextOS Ready Gold]
|
|
18
|
-
|
|
19
|
-
Repos now have a target:
|
|
20
|
-
AGENTS.md + skills + workflows + evidence.
|
|
@@ -35,7 +35,7 @@ for (let count = 1; count <= displayLines.length; count += frameStep) {
|
|
|
35
35
|
frames.push(writeFrame({ tmpDir, index: frames.length, lines: displayLines }));
|
|
36
36
|
|
|
37
37
|
fs.mkdirSync(path.dirname(outputPath), { recursive: true });
|
|
38
|
-
execFileSync("convert", ["-delay", "12", "-loop", "0", ...frames, outputPath], { stdio: "inherit" });
|
|
38
|
+
execFileSync("convert", ["-limit", "time", "120", "-delay", "12", "-loop", "0", ...frames, outputPath], { stdio: "inherit" });
|
|
39
39
|
console.log(`Wrote ${outputPath}`);
|
|
40
40
|
|
|
41
41
|
function writeFrame({ tmpDir, index, lines }) {
|
|
Binary file
|
|
@@ -8,19 +8,44 @@ System Correct Skill
|
|
|
8
8
|
Raw Agent 10.0%
|
|
9
9
|
ContextOS + Codex 80.0%
|
|
10
10
|
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
11
|
+
Sample failures:
|
|
12
|
+
- expo-eas: "fix deployed"
|
|
13
|
+
expected: eas, mobile-deployment, github-actions-ci-cd
|
|
14
|
+
raw: eas, env-secret-management, railway-render-deployment ✗
|
|
15
|
+
contextos: eas, github-actions-ci-cd, mobile-deployment ✓
|
|
16
|
+
- next-vercel: "fix deployed"
|
|
17
|
+
expected: vercel-deployment, github-actions-ci-cd, env-secret-management
|
|
18
|
+
raw: eas, env-secret-management, railway-render-deployment ✗
|
|
19
|
+
contextos: vercel-deployment, github-actions-ci-cd, env-secret-management ✓
|
|
20
|
+
- docker-node: "docker image build failed"
|
|
21
|
+
expected: docker, build-log-debugging
|
|
22
|
+
raw: docker, build-log-debugging, github-actions-ci-cd ✗
|
|
23
|
+
contextos: build-log-debugging, docker ✓
|
|
24
|
+
- railway-render: "Railway deploy health check failed"
|
|
25
|
+
expected: railway-render-deployment, build-log-debugging
|
|
26
|
+
raw: railway-render-deployment, build-log-debugging, firebase-hosting ✗
|
|
27
|
+
contextos: build-log-debugging, railway-render-deployment ✓
|
|
28
|
+
- firebase-hosting: "deploy firebase hosting"
|
|
29
|
+
expected: firebase-hosting
|
|
30
|
+
raw: firebase-hosting, flutter-firebase, railway-render-deployment ✗
|
|
31
|
+
contextos: firebase-hosting ✓
|
|
32
|
+
- nest-prisma: "optimize slow prisma queries"
|
|
33
|
+
expected: prisma, nestjs-module
|
|
34
|
+
raw: prisma, nestjs-module, android-signing ✗
|
|
35
|
+
contextos: nestjs-module, prisma ✓
|
|
17
36
|
|
|
18
|
-
$ ctx skills doctor -- "fix deployed" #
|
|
37
|
+
$ ctx skills doctor -- "fix deployed" # Expo fixture
|
|
19
38
|
ContextOS skill doctor
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
39
|
+
fixture: expo-eas
|
|
40
|
+
prompt: fix deployed
|
|
41
|
+
selected: eas, github-actions-ci-cd, mobile-deployment
|
|
42
|
+
expected: eas, mobile-deployment, github-actions-ci-cd
|
|
43
|
+
rejected: vercel-deployment
|
|
24
44
|
|
|
25
|
-
|
|
26
|
-
ContextOS
|
|
45
|
+
$ ctx skills doctor -- "fix deployed" # Next/Vercel fixture
|
|
46
|
+
ContextOS skill doctor
|
|
47
|
+
fixture: next-vercel
|
|
48
|
+
prompt: fix deployed
|
|
49
|
+
selected: vercel-deployment, github-actions-ci-cd, env-secret-management
|
|
50
|
+
expected: vercel-deployment, github-actions-ci-cd, env-secret-management
|
|
51
|
+
rejected: eas
|
|
@@ -0,0 +1,246 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import { execFileSync, spawnSync } from "node:child_process";
|
|
3
|
+
import fs from "node:fs";
|
|
4
|
+
import os from "node:os";
|
|
5
|
+
import path from "node:path";
|
|
6
|
+
import { fileURLToPath } from "node:url";
|
|
7
|
+
|
|
8
|
+
import { parseEvalYaml } from "../skill-routing/run-eval.js";
|
|
9
|
+
|
|
10
|
+
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
|
11
|
+
const repoRoot = path.resolve(__dirname, "..", "..");
|
|
12
|
+
const skillEvalRoot = path.resolve(__dirname, "..", "skill-routing");
|
|
13
|
+
const DEFAULT_AGENTS = ["codex", "gemini"];
|
|
14
|
+
const DEFAULT_CASE_LIMIT = 5;
|
|
15
|
+
const DEFAULT_TIMEOUT_MS = 120000;
|
|
16
|
+
|
|
17
|
+
/**
 * Runs the skill-routing benchmark cases through each requested agent CLI.
 *
 * For every agent the binary is taken from its `CONTEXTOS_<AGENT>_CMD` template
 * (first token) when that variable is set, otherwise located with `findBinary`.
 * Agents without a binary are reported as skipped. A system whose cases all
 * failed is also reported as "skipped", carrying the first failure reason.
 *
 * @param {object} [options]
 * @param {string[]} [options.agents] - Agent names to benchmark.
 * @param {string} [options.casesPath] - Path to the YAML file holding cases and skills.
 * @param {number} [options.caseLimit] - Maximum number of cases to run. Values that are
 *   not a non-negative number fall back to the default instead of selecting no cases.
 * @param {number} [options.timeoutMs] - Per-case timeout. Values that are not a
 *   non-negative integer fall back to the default, because `spawnSync` rejects them.
 * @param {string} [options.rootDir] - Repository root used for the "contextos" fixture.
 * @returns {{ mode: string, caseCount: number, systems: object[] }} Leaderboard result.
 */
export function runAgentLeaderboard({
  agents = DEFAULT_AGENTS,
  casesPath = path.join(skillEvalRoot, "cases.yaml"),
  caseLimit = DEFAULT_CASE_LIMIT,
  timeoutMs = DEFAULT_TIMEOUT_MS,
  rootDir = repoRoot
} = {}) {
  // Number("abc") from a CLI flag is NaN; slice(0, NaN) would silently run zero cases.
  const requestedLimit = Number(caseLimit);
  const effectiveLimit = requestedLimit >= 0 ? requestedLimit : DEFAULT_CASE_LIMIT;
  // spawnSync throws on a timeout that is not an unsigned integer, which would abort the whole run.
  const requestedTimeout = Number(timeoutMs);
  const effectiveTimeout = Number.isInteger(requestedTimeout) && requestedTimeout >= 0
    ? requestedTimeout
    : DEFAULT_TIMEOUT_MS;

  const config = parseEvalYaml(fs.readFileSync(casesPath, "utf8"));
  // Only cases that declare at least one expected skill can be scored.
  const cases = config.cases
    .filter((row) => row.expected?.length)
    .slice(0, effectiveLimit);
  const skillIds = config.skills.map((skill) => skill.id);
  const systems = [];

  for (const agent of agents) {
    const template = agentCommandTemplate(agent);
    const binary = template ? template.split(/\s+/).filter(Boolean)[0] : findBinary(agent);
    if (!binary) {
      systems.push({ name: agent, status: "skipped", reason: "binary not found", rows: [], correctRate: 0 });
      continue;
    }
    const rows = cases.map((testCase) => runAgentCase({
      agent,
      binary,
      testCase,
      skillIds,
      timeoutMs: effectiveTimeout,
      rootDir
    }));
    // The rate is computed over completed cases only, so failed runs do not count as wrong answers.
    const completed = rows.filter((row) => row.status === "ok");
    const correct = completed.filter((row) => row.correct).length;
    systems.push({
      name: agent,
      status: completed.length ? "ok" : "skipped",
      reason: completed.length ? "" : firstReason(rows),
      rows,
      correctRate: completed.length ? correct / completed.length : 0
    });
  }

  return {
    mode: "live-agent-cli",
    caseCount: cases.length,
    systems
  };
}
|
|
63
|
+
|
|
64
|
+
/**
 * Renders a live-agent leaderboard result as plain text.
 *
 * The summary table shows each system's status and either its correct-skill
 * percentage (status "ok") or the reason it was not scored. The "Cases" section
 * lists up to five rows per system.
 *
 * @param {{ mode: string, caseCount: number, systems: object[] }} result - Leaderboard result.
 * @returns {string} Multi-line report.
 */
export function formatAgentLeaderboard(result) {
  const header = [
    "Live Agent Leaderboard",
    `Mode: ${result.mode}`,
    `Tasks: ${result.caseCount}`,
    "",
    "System Status Correct Skill",
    "-------- -------- -------------"
  ];

  const summary = result.systems.map((system) => {
    const score = system.status !== "ok" ? system.reason : percent(system.correctRate);
    return `${system.name.padEnd(8)} ${system.status.toUpperCase().padEnd(8)} ${score}`;
  });

  const details = result.systems.flatMap((system) => {
    const caseLines = system.rows.slice(0, 5).flatMap((row) => {
      const mark = row.correct ? "✓" : "✗";
      const entry = [
        ` - ${row.fixture}: "${row.prompt}"`,
        ` selected: ${row.selectedIds.join(", ") || "(none)"} ${mark}`
      ];
      // Failed runs additionally show why they did not complete.
      if (row.status !== "ok") {
        entry.push(` status: ${row.status}; ${row.reason}`);
      }
      return entry;
    });
    return [`- ${system.name}`, ...caseLines];
  });

  return [...header, ...summary, "", "Cases:", ...details].join("\n");
}
|
|
88
|
+
|
|
89
|
+
/**
 * Runs a single benchmark case through an agent CLI and scores the skills it names.
 *
 * @param {object} options
 * @param {string} options.agent - Agent name, used to pick the argument layout.
 * @param {string} options.binary - Executable to spawn.
 * @param {object} options.testCase - Case with `fixture`, `prompt` and optional
 *   `expected` / `allowed` / `forbidden` skill ID lists.
 * @param {string[]} options.skillIds - All known skill IDs (offered to the agent and used for parsing).
 * @param {number} options.timeoutMs - Timeout handed to `spawnSync`.
 * @param {string} options.rootDir - Working directory for the "contextos" fixture.
 * @returns {object} Row with `status` ("ok" | "error"), `reason`, `prompt`, `fixture`,
 *   `expected`, `selectedIds` (at most three), `correct` and `durationMs`.
 */
function runAgentCase({ agent, binary, testCase, skillIds, timeoutMs, rootDir }) {
  // The "contextos" fixture is the repository itself; other fixtures live under the skill-routing fixtures directory.
  const cwd = testCase.fixture === "contextos"
    ? rootDir
    : path.join(skillEvalRoot, "fixtures", testCase.fixture);
  const prompt = buildPrompt({ task: testCase.prompt, skillIds });
  const startedAt = Date.now();
  const result = spawnSync(binary, agentArgs({ agent, cwd, prompt }), {
    cwd,
    encoding: "utf8",
    timeout: timeoutMs,
    maxBuffer: 1024 * 1024 * 4,
    env: {
      ...process.env,
      NO_COLOR: "1",
      CI: process.env.CI || "1"
    }
  });

  // Prefer stdout for the answer. stderr is only a fallback: diagnostics there may repeat
  // skill IDs (for example an echoed prompt), which would otherwise be counted as selections.
  // NOTE(review): assumes agent CLIs print their final answer on stdout — confirm per agent.
  const stdoutIds = parseSkillIds(result.stdout || "", skillIds);
  const answerIds = stdoutIds.length ? stdoutIds : parseSkillIds(result.stderr || "", skillIds);
  const selectedIds = answerIds.slice(0, 3);

  const expected = testCase.expected || [];
  const correct = isCorrect({
    selectedIds,
    expected,
    allowed: testCase.allowed || [],
    forbidden: testCase.forbidden || []
  });

  const succeeded = !result.error && result.status === 0;
  let reason = "";
  if (result.error) {
    // Covers spawn failures and timeouts reported by spawnSync.
    reason = result.error.message || "spawn failed";
  } else if (result.status !== 0) {
    // A process killed by a signal has a null exit status; name the signal instead of "exit null".
    reason = result.signal ? `signal ${result.signal}` : `exit ${result.status}`;
  }

  return {
    status: succeeded ? "ok" : "error",
    reason,
    prompt: testCase.prompt,
    fixture: testCase.fixture,
    expected,
    selectedIds,
    correct,
    durationMs: Date.now() - startedAt
  };
}
|
|
125
|
+
|
|
126
|
+
/**
 * Builds the argument list (without the binary itself) for one agent invocation.
 *
 * A `CONTEXTOS_<AGENT>_CMD` template, when set, takes precedence for every agent,
 * including gemini; its first token is the binary and is dropped here.
 * Otherwise built-in layouts are used for "codex" and "gemini", and any other
 * agent receives the prompt as its only argument.
 *
 * @param {object} options
 * @param {string} options.agent - Agent name.
 * @param {string} options.cwd - Working directory for the case.
 * @param {string} options.prompt - Prompt text.
 * @returns {string[]} Arguments for `spawnSync`.
 */
function agentArgs({ agent, cwd, prompt }) {
  const template = agentCommandTemplate(agent);
  if (template) return expandTemplate(template, { cwd, prompt }).slice(1);
  if (agent === "codex") {
    return [
      "exec",
      "--cd", cwd,
      "--sandbox", "read-only",
      "--ask-for-approval", "never",
      prompt
    ];
  }
  if (agent === "gemini") {
    // No separate CONTEXTOS_GEMINI_CMD check is needed: the template lookup above already handles it.
    return ["-p", prompt];
  }
  return [prompt];
}

/**
 * Reads the command template configured for an agent.
 *
 * The environment variable name is `CONTEXTOS_<AGENT>_CMD`, where the agent name
 * is upper-cased and every run of characters outside A-Z / 0-9 becomes "_".
 *
 * @param {string} agent - Agent name.
 * @returns {string} The template, or "" when the variable is unset or empty.
 */
function agentCommandTemplate(agent) {
  const envKey = `CONTEXTOS_${String(agent || "").toUpperCase().replace(/[^A-Z0-9]+/g, "_")}_CMD`;
  return process.env[envKey] || "";
}
|
|
150
|
+
|
|
151
|
+
/**
 * Composes the benchmark prompt sent to an agent.
 *
 * @param {object} options
 * @param {string} options.task - Task description from the test case.
 * @param {string[]} options.skillIds - Skill IDs the agent may choose from.
 * @returns {string} Five-line prompt asking for a comma-separated list of skill IDs.
 */
function buildPrompt({ task, skillIds }) {
  const role = "You are evaluating a repository for a coding-agent skill router benchmark.";
  const guardrails = "Do not edit files. Do not run commands.";
  const taskLine = `Task: ${task}`;
  const allowedLine = `Allowed skill IDs: ${skillIds.join(", ")}`;
  const answerFormat = "Return only the top skill IDs as comma-separated text. No explanations.";
  return `${role}\n${guardrails}\n${taskLine}\n${allowedLine}\n${answerFormat}`;
}
|
|
160
|
+
|
|
161
|
+
/**
 * Extracts known skill IDs from free-form agent output.
 *
 * The output is split on every character outside letters, digits and `_ . @ -`;
 * each token is compared to the known IDs after `normalize`, so case and
 * separators do not matter.
 *
 * @param {unknown} output - Raw agent output.
 * @param {string[]} skillIds - Known skill IDs.
 * @returns {string[]} Matched IDs in order of first appearance, without duplicates.
 */
function parseSkillIds(output, skillIds) {
  const byNormalizedId = new Map();
  for (const id of skillIds) {
    byNormalizedId.set(normalize(id), id);
  }

  // A Set keeps first-seen order while dropping repeats.
  const matches = new Set();
  const tokens = String(output || "").split(/[^A-Za-z0-9_.@-]+/);
  tokens.forEach((token) => {
    const id = byNormalizedId.get(normalize(token));
    if (id) matches.add(id);
  });
  return [...matches];
}
|
|
170
|
+
|
|
171
|
+
/**
 * Decides whether a selection of skills counts as a correct answer.
 *
 * A selection is correct when it contains every expected skill, contains no
 * forbidden skill, and every selected skill is either expected or allowed.
 *
 * @param {object} options
 * @param {string[]} options.selectedIds - Skills the agent picked.
 * @param {string[]} options.expected - Skills that must be present.
 * @param {string[]} options.allowed - Extra skills that are tolerated.
 * @param {string[]} options.forbidden - Skills that must be absent.
 * @returns {boolean} True when all three conditions hold.
 */
function isCorrect({ selectedIds, expected, allowed, forbidden }) {
  const chosen = new Set(selectedIds);
  const permitted = new Set([...expected, ...allowed]);

  const missesExpected = expected.some((skill) => !chosen.has(skill));
  if (missesExpected) return false;

  const picksForbidden = forbidden.some((skill) => chosen.has(skill));
  if (picksForbidden) return false;

  const picksUnlisted = selectedIds.some((skill) => !permitted.has(skill));
  return !picksUnlisted;
}
|
|
178
|
+
|
|
179
|
+
/**
 * Locates an agent CLI executable by name.
 *
 * Search order: a few well-known per-user install directories, every directory
 * on PATH, then `command -v` in a login bash shell (with and without sourcing
 * ~/.profile and ~/.bashrc). Only regular files are accepted, so a directory
 * that happens to share the name is never returned as a binary.
 *
 * @param {string} name - Binary name; characters outside `A-Za-z0-9_./-` are dropped.
 * @returns {string} Path to the executable, or "" when none is found.
 */
function findBinary(name) {
  const safeName = shellQuote(name);
  // With an empty name, path.join(dir, "") is the directory itself and would be reported as found.
  if (!safeName) return "";

  const isFile = (candidate) => {
    try {
      return fs.statSync(candidate).isFile();
    } catch {
      // Missing or unreadable path: not a usable binary.
      return false;
    }
  };

  const candidates = [
    path.join(os.homedir(), ".local", "bin", safeName),
    path.join(os.homedir(), ".npm-global", "bin", safeName),
    path.join(os.homedir(), ".nvm", "current", "bin", safeName),
    // NOTE(review): hard-coded Windows profile path (user "admin"); kept for compatibility.
    `/mnt/c/Users/admin/AppData/Roaming/npm/${safeName}`,
    `/mnt/c/Users/admin/AppData/Roaming/npm/${safeName}.cmd`
  ];
  for (const candidate of candidates) {
    if (isFile(candidate)) return candidate;
  }

  for (const dir of String(process.env.PATH || "").split(path.delimiter)) {
    if (!dir) continue;
    const candidate = path.join(dir, safeName);
    if (isFile(candidate)) return candidate;
  }

  for (const command of [
    `command -v ${safeName}`,
    `source ~/.profile >/dev/null 2>&1 || true; source ~/.bashrc >/dev/null 2>&1 || true; command -v ${safeName}`
  ]) {
    try {
      const stdout = execFileSync("bash", ["-lc", command], { encoding: "utf8" }).trim();
      // Login shells may print extra lines, and `command -v` can report an alias or builtin;
      // accept only a final line that names an existing file.
      const found = stdout.split(/\r?\n/).pop().trim();
      if (found && isFile(found)) return found;
    } catch {
      // bash is unavailable or the lookup failed: try the next strategy.
    }
  }
  return "";
}
|
|
209
|
+
|
|
210
|
+
/**
 * Returns the first non-empty failure reason among the rows.
 *
 * @param {Array<{ reason?: string }>} rows - Case rows.
 * @returns {string} The first truthy `reason`, or "no completed cases" when none has one.
 */
function firstReason(rows) {
  for (const row of rows) {
    if (row.reason) {
      return row.reason;
    }
  }
  return "no completed cases";
}
|
|
213
|
+
|
|
214
|
+
/**
 * Reduces a value to a comparison key: lower-cased with everything except
 * `a-z` and `0-9` removed. Falsy values yield "".
 *
 * @param {unknown} value - Value to normalize.
 * @returns {string} The comparison key.
 */
function normalize(value) {
  const text = value ? String(value) : "";
  const lowered = text.toLowerCase();
  return lowered.replace(/[^a-z0-9]+/g, "");
}
|
|
217
|
+
|
|
218
|
+
/**
 * Sanitizes a value for use in a shell command or path segment.
 *
 * Despite the name, nothing is quoted: every character outside
 * `A-Za-z0-9_./-` is removed.
 *
 * @param {unknown} value - Value to sanitize (stringified first).
 * @returns {string} The value with all other characters removed.
 */
function shellQuote(value) {
  const safeRuns = String(value).match(/[A-Za-z0-9_./-]+/g);
  return safeRuns === null ? "" : safeRuns.join("");
}
|
|
221
|
+
|
|
222
|
+
/**
 * Expands a whitespace-separated command template into an argument vector.
 *
 * Supported placeholders:
 *   {cwd}          - replaced with `vars.cwd`
 *   {prompt_file}  - replaced with the path of a temp file holding `vars.prompt`
 *
 * The template is split into tokens BEFORE substitution, so a working directory
 * or temp directory containing spaces stays a single argument. The prompt file
 * is only written when the template references it. The file is not removed
 * here because the spawned command still has to read it.
 *
 * @param {string} template - Command template, e.g. "wrapper --cd {cwd} {prompt_file}".
 * @param {{ cwd: string, prompt: string }} vars - Substitution values.
 * @returns {string[]} Tokens with placeholders replaced; the first token is the binary.
 */
function expandTemplate(template, vars) {
  let promptFile = "";
  const promptFilePath = () => {
    if (!promptFile) {
      promptFile = path.join(os.tmpdir(), `contextos-agent-prompt-${process.pid}-${Date.now()}.txt`);
      fs.writeFileSync(promptFile, vars.prompt, "utf8");
    }
    return promptFile;
  };

  return String(template)
    .split(/\s+/)
    .filter(Boolean)
    // Function replacers insert the value literally; a string replacement would
    // interpret sequences such as "$&" inside a path.
    .map((token) => token
      .replaceAll("{cwd}", () => vars.cwd)
      .replaceAll("{prompt_file}", () => promptFilePath()));
}
|
|
231
|
+
|
|
232
|
+
/**
 * Formats a ratio as a percentage with one decimal place.
 *
 * @param {number} value - Ratio, e.g. 0.8.
 * @returns {string} Percentage text, e.g. "80.0%".
 */
function percent(value) {
  const scaled = value * 100;
  return scaled.toFixed(1) + "%";
}
|
|
235
|
+
|
|
236
|
+
// CLI entry point: runs only when this file is executed directly, not when imported.
// Comparing file system paths (instead of building a "file://" string) also works for
// paths that need URL-encoding, such as ones containing spaces, and on Windows.
const invokedScript = process.argv[1] ? path.resolve(process.argv[1]) : "";
if (invokedScript && fileURLToPath(import.meta.url) === invokedScript) {
  // Returns the text after `prefix` for the first matching argument, or undefined.
  const readOption = (prefix) => {
    const arg = process.argv.find((item) => item.startsWith(prefix));
    return arg ? arg.slice(prefix.length) : undefined;
  };
  // Parses a non-negative integer option; missing or malformed values use the fallback
  // rather than passing NaN on to the benchmark.
  const readCount = (prefix, fallback) => {
    const raw = readOption(prefix);
    if (raw === undefined || raw.trim() === "") return fallback;
    const value = Number(raw);
    return Number.isInteger(value) && value >= 0 ? value : fallback;
  };

  const agentsValue = readOption("--agents=");
  const result = runAgentLeaderboard({
    agents: agentsValue !== undefined
      ? agentsValue.split(",").map((item) => item.trim()).filter(Boolean)
      : DEFAULT_AGENTS,
    caseLimit: readCount("--limit=", DEFAULT_CASE_LIMIT),
    timeoutMs: readCount("--timeout-ms=", DEFAULT_TIMEOUT_MS)
  });
  console.log(formatAgentLeaderboard(result));
}
|
|
@@ -41,8 +41,8 @@ export async function runHallucinationLeaderboard({
|
|
|
41
41
|
caseCount: selectedCases.length,
|
|
42
42
|
repoCount: new Set(selectedCases.map((row) => row.fixture)).size,
|
|
43
43
|
systems: [
|
|
44
|
-
summarizeSystem("Raw
|
|
45
|
-
summarizeSystem("ContextOS
|
|
44
|
+
summarizeSystem("Raw heuristic baseline", rawRows),
|
|
45
|
+
summarizeSystem("ContextOS evidence benchmark", contextRows)
|
|
46
46
|
],
|
|
47
47
|
rows: selectedCases.map((testCase) => ({
|
|
48
48
|
prompt: testCase.prompt,
|
|
@@ -60,11 +60,11 @@ export function formatHallucinationLeaderboard(result) {
|
|
|
60
60
|
`Repos: ${result.repoCount}`,
|
|
61
61
|
`Tasks: ${result.caseCount}`,
|
|
62
62
|
"",
|
|
63
|
-
"System
|
|
64
|
-
"
|
|
63
|
+
"System Correct Context",
|
|
64
|
+
"---------------------------- ---------------"
|
|
65
65
|
];
|
|
66
66
|
for (const system of result.systems) {
|
|
67
|
-
lines.push(`${system.name.padEnd(
|
|
67
|
+
lines.push(`${system.name.padEnd(28)} ${percent(system.correctRate)}`);
|
|
68
68
|
}
|
|
69
69
|
lines.push("", "Sample failures:");
|
|
70
70
|
const failures = result.rows
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@minhpnq1807/contextos",
|
|
3
|
-
"version": "0.6.
|
|
3
|
+
"version": "0.6.3",
|
|
4
4
|
"description": "Task-aware AGENTS.md context injection and compliance reporting for Codex, Claude Code, and Antigravity.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"bin": {
|
|
@@ -10,6 +10,7 @@
|
|
|
10
10
|
"files": [
|
|
11
11
|
"bin/",
|
|
12
12
|
"plugins/",
|
|
13
|
+
"scripts/",
|
|
13
14
|
".codex/skills/",
|
|
14
15
|
".codex/workflows/",
|
|
15
16
|
".agents/",
|
|
@@ -28,6 +29,9 @@
|
|
|
28
29
|
"validate:plugin": "node test/validate-plugin.js",
|
|
29
30
|
"benchmark:skills": "node bin/ctx.js benchmark --skills",
|
|
30
31
|
"leaderboard:hallucination": "node eval/hallucination/run-leaderboard.js",
|
|
32
|
+
"leaderboard:agents": "node eval/hallucination/run-agent-leaderboard.js --agents=codex,gemini",
|
|
33
|
+
"demo:capture": "node docs/demo/capture-live-demos.mjs",
|
|
34
|
+
"sync:community-skills": "node scripts/sync-community-skills.mjs",
|
|
31
35
|
"test:mcp": "node test/mcp-protocol-smoke.js"
|
|
32
36
|
},
|
|
33
37
|
"engines": {
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import fs from "node:fs";
|
|
3
|
+
import path from "node:path";
|
|
4
|
+
|
|
5
|
+
// Usage: node scripts/sync-community-skills.mjs [source] [target]
// Mirrors the skill directories (and README.md) from `source` into `target`.
const [, , sourceArg = "external-skills", targetArg = "community-skills"] = process.argv;
const source = path.resolve(sourceArg);
const target = path.resolve(targetArg);
// Top-level source entries that are never copied.
const skip = new Set([".git", ".github", "scripts"]);

// Validate everything BEFORE the target is wiped, so a bad argument cannot
// leave the target emptied with nothing copied back.
if (!fs.existsSync(source)) {
  console.error(`Missing source skills directory: ${source}`);
  process.exit(1);
}
if (!fs.statSync(source).isDirectory()) {
  console.error(`Source skills path is not a directory: ${source}`);
  process.exit(1);
}
// Overlapping paths are destructive: cleaning the target would delete the source,
// or copying would recurse into the directory being written.
// NOTE(review): paths are compared lexically; overlap through symlinks is not detected.
if (isSameOrInside(source, target) || isSameOrInside(target, source)) {
  console.error(`Source and target directories must not overlap: ${source} and ${target}`);
  process.exit(1);
}

fs.mkdirSync(target, { recursive: true });

// Clear previous contents, keeping only the target's own top-level README.md.
for (const entry of fs.readdirSync(target, { withFileTypes: true })) {
  if (entry.isDirectory()) fs.rmSync(path.join(target, entry.name), { recursive: true, force: true });
  else if (entry.isFile() && entry.name !== "README.md") fs.rmSync(path.join(target, entry.name), { force: true });
}

// Copy every directory; README.md is the only top-level file that is copied.
for (const entry of fs.readdirSync(source, { withFileTypes: true })) {
  if (skip.has(entry.name)) continue;
  const from = path.join(source, entry.name);
  const to = path.join(target, entry.name);
  if (entry.isDirectory()) copyDir(from, to);
  else if (entry.isFile() && entry.name === "README.md") fs.copyFileSync(from, to);
}

console.log(`Synced ContextOS skills from ${source} to ${target}`);

// Reports whether `candidate` is `parent` itself or lies somewhere beneath it.
function isSameOrInside(candidate, parent) {
  const relative = path.relative(parent, candidate);
  if (relative === "") return true;
  const escapesParent = relative === ".." || relative.startsWith(`..${path.sep}`);
  return !escapesParent && !path.isAbsolute(relative);
}
|
|
31
|
+
|
|
32
|
+
/**
 * Recursively copies a directory tree.
 *
 * Creates `to` (and any missing parents), then copies regular files and
 * descends into subdirectories. Entries that are neither a directory nor a
 * regular file are left out.
 *
 * @param {string} from - Existing source directory.
 * @param {string} to - Destination directory.
 * @returns {void}
 */
function copyDir(from, to) {
  fs.mkdirSync(to, { recursive: true });
  const entries = fs.readdirSync(from, { withFileTypes: true });
  entries.forEach((entry) => {
    const sourcePath = path.join(from, entry.name);
    const targetPath = path.join(to, entry.name);
    if (entry.isDirectory()) {
      copyDir(sourcePath, targetPath);
      return;
    }
    if (entry.isFile()) {
      fs.copyFileSync(sourcePath, targetPath);
    }
  });
}
|