@minhpnq1807/contextos 0.1.9 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,11 @@
1
1
  # Changelog
2
2
 
3
+ ## 0.2.0
4
+
5
+ - Adds `ctx benchmark -- "task"` to compare baseline AGENTS.md ordering with ContextOS scheduling and estimate lost-in-the-middle risk.
6
+ - Improves AGENTS.md rule filtering for generic headings and non-actionable sections.
7
+ - Splits Stop reports into `followed`, `ignored`, `unknown`, and `unmeasurable` so efficiency only reflects rules with evidence.
8
+
3
9
  ## 0.1.9
4
10
 
5
11
  - Proxies all configured MCP servers except ContextOS' own `ctx-mcp` server.
package/README.md CHANGED
@@ -190,6 +190,7 @@ This warning comes from a transitive dependency in the local embedding/WASM stac
190
190
  | `ctx report` | Shows the last Stop-hook compliance report for the current workspace. | A Codex task has finished and you want the summary again. | Reads `~/.ctx/contextos/workspaces/<workspace-id>/last-report.json`. |
191
191
  | `ctx evidence` | Shows detailed evidence behind the last report for the current workspace. | You want to inspect why a rule was marked `followed`, `ignored`, `unknown`, or `unmeasurable`. | Prints rule text, source file, score, status, and evidence reason. |
192
192
  | `ctx stats` | Shows aggregate runtime metrics for the current workspace. | You want to know whether ContextOS is active and useful over time. | Prints prompt count, report count, injected/quiet ratio, average prompt analysis time, efficiency, rule outcomes, hook events, and last suggested files for the current workspace only. |
193
+ | `ctx benchmark -- "task"` | Compares baseline AGENTS.md ordering with ContextOS task-aware scheduling. | You want a before/after signal for lost-in-the-middle risk. | Prints parsed/actionable/filtered rule counts, relevant rules in the middle of the original file, scheduled high/mid rules, and top scored rules. |
193
194
  | `ctx embeddings warm -- "task"` | Prepares local semantic embedding caches. | First install, CI smoke checks, or after changing AGENTS.md/project files. | Loads/downloads `Xenova/all-MiniLM-L6-v2` and writes vectors to `~/.ctx/contextos/embeddings.db`. |
194
195
  | `ctx --version` | Prints the installed ContextOS CLI version. | You want to confirm which npm version is being executed. | Prints the version from package metadata. |
195
196
 
@@ -286,6 +287,7 @@ ContextOS uses heuristic evidence collection from git diff/status plus local run
286
287
  followed = evidence in the diff suggests the rule was applied
287
288
  ignored = evidence in the diff suggests the rule was violated
288
289
  unknown = the rule was relevant, but the diff does not prove either way
290
+ unmeasurable = ContextOS lacks the required evidence source, such as git diff lines or runtime telemetry
289
291
  ```
290
292
 
291
293
  For runtime-only rules, ContextOS also checks `telemetry.jsonl` for hook-visible tool names, MCP server names, and command metadata. A rule like "use code-review-graph before reading files" can be marked `followed` when telemetry contains a matching `code-review-graph` signal.
package/bin/ctx.js CHANGED
@@ -15,6 +15,7 @@ import { warmFileEmbeddings } from "../plugins/ctx/lib/file-embedding-retriever.
15
15
  import { scoreContext } from "../plugins/ctx/lib/score-context.js";
16
16
  import { defaultDataRoot, workspaceDataDir, workspaceMarkerPath } from "../plugins/ctx/lib/workspace-data.js";
17
17
  import { installMcpTelemetryProxies } from "../plugins/ctx/lib/mcp-proxy-install.js";
18
+ import { benchmarkWorkspace, formatBenchmark } from "../plugins/ctx/lib/benchmark.js";
18
19
 
19
20
  const __dirname = path.dirname(fileURLToPath(import.meta.url));
20
21
  const rootDir = path.resolve(__dirname, "..");
@@ -32,6 +33,7 @@ Usage:
32
33
  ctx report
33
34
  ctx evidence
34
35
  ctx stats
36
+ ctx benchmark -- "task"
35
37
  ctx embeddings warm -- "task"
36
38
  ctx --version
37
39
  `;
@@ -272,6 +274,11 @@ try {
272
274
  console.log(formatEvidence(loadLastReport()));
273
275
  } else if (command === "stats") {
274
276
  console.log(formatStats(loadStats(contextOSWorkspaceDataDir())));
277
+ } else if (command === "benchmark") {
278
+ const marker = args.indexOf("--");
279
+ const task = marker >= 0 ? args.slice(marker + 1).join(" ") : args.slice(1).join(" ");
280
+ if (!task.trim()) throw new Error('Usage: ctx benchmark -- "task"');
281
+ console.log(formatBenchmark(benchmarkWorkspace({ cwd: process.cwd(), task })));
275
282
  } else {
276
283
  throw new Error(`Unknown command: ${command}\n\n${usage()}`);
277
284
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@minhpnq1807/contextos",
3
- "version": "0.1.9",
3
+ "version": "0.2.0",
4
4
  "description": "Task-aware AGENTS.md context injection and compliance reporting for Codex.",
5
5
  "type": "module",
6
6
  "bin": {
@@ -81,6 +81,11 @@ const TOOL_REFERENCE_TOKENS = new Set([
81
81
  "list_communities"
82
82
  ]);
83
83
 
84
// Tokens that mark a short line as an instruction. isGenericHeading looks
// tokens up in this set before it classifies a line as a generic heading.
const ACTION_TOKENS = new Set(
  (
    "add avoid call check derive ensure filter follow prefer run " +
    "use validate verify write never always must should do"
  ).split(" ")
);
88
+
84
89
  export function tokenize(value) {
85
90
  const normalized = String(value || "")
86
91
  .toLowerCase()
@@ -190,6 +195,7 @@ export function isDocumentationOnlyRule(rule) {
190
195
  if (/^<!--.*-->$/.test(normalized)) return true;
191
196
  if (DOCUMENTATION_HEADING_PATTERNS.some((pattern) => pattern.test(normalized))) return true;
192
197
  if (isMarkdownTableRule(normalized)) return true;
198
+ if (isGenericHeading(normalized)) return true;
193
199
  return false;
194
200
  }
195
201
 
@@ -209,6 +215,13 @@ function isMarkdownTableRule(content) {
209
215
  return /\btool\b/.test(lower) && /\buse\s+when\b/.test(lower) && toolReferenceCount >= 2;
210
216
  }
211
217
 
218
/**
 * Reports whether `content` reads like a short, non-actionable heading.
 *
 * A string is rejected (returns false) when it is longer than 80 characters,
 * contains a backtick, period, colon or semicolon, tokenizes to more than
 * four tokens, or contains any token listed in ACTION_TOKENS.
 *
 * @param {string} content - Rule text; the caller passes its normalized form.
 * @returns {boolean} True when none of the rejection conditions apply.
 */
function isGenericHeading(content) {
  const looksLikeSentence = content.length > 80 || /[`.:;]/.test(content);
  if (looksLikeSentence) return false;

  const words = tokenize(content);
  if (words.length > 4) return false;

  // A single action token is enough to treat the line as an instruction.
  for (const word of words) {
    if (ACTION_TOKENS.has(word)) return false;
  }
  return true;
}
224
+
212
225
  function dedupeRules(rules) {
213
226
  const seen = new Set();
214
227
  return rules.filter((rule) => {
@@ -0,0 +1,72 @@
1
+ import { parseRules, filterActionableRules, scoreRules } from "./analyzer.js";
2
+ import { readAgentsChain } from "./reader.js";
3
+ import { scheduleContext } from "./scheduler.js";
4
+
5
/**
 * Compares the original AGENTS.md rule order with ContextOS scheduling for one task.
 *
 * The markdown is parsed into rules, reduced to actionable rules, scored against
 * `task` and `openFiles`, and handed to the scheduler. A scored rule is "relevant"
 * when its score is at least 0.1. A relevant rule is counted as sitting in the
 * middle when its index in the actionable list lies in the half-open window
 * [floor(n * 0.25), ceil(n * 0.75)).
 *
 * @param {object} [options]
 * @param {string} [options.markdown] - Rule text passed to `parseRules`.
 * @param {Array} [options.sources=[]] - Echoed back unchanged in the result.
 * @param {string} [options.task=""] - Task description used for scoring.
 * @param {Array} [options.openFiles=[]] - Forwarded to `scoreRules`.
 * @param {number} [options.topK=8] - Maximum number of entries in `contextOS.topRules`.
 * @returns {object} Counts of parsed/actionable/filtered/relevant rules, the
 *   `baseline` middle-risk figures, and the `contextOS` scheduling summary.
 */
export function benchmarkContext({ markdown, sources = [], task = "", openFiles = [], topK = 8 } = {}) {
  const parsedRules = parseRules(markdown);
  const actionableRules = filterActionableRules(parsedRules);
  const scoredRules = scoreRules(actionableRules, task, openFiles);
  const relevantRules = scoredRules.filter((rule) => Number(rule.score || 0) >= 0.1);
  const scheduled = scheduleContext({ rules: scoredRules, relevantFiles: [] });

  // Position of each rule in the original (unscheduled) order, keyed by its text.
  const originalPositions = new Map(actionableRules.map((rule, index) => [rule.content, index]));
  const middleStart = Math.floor(actionableRules.length * 0.25);
  const middleEnd = Math.ceil(actionableRules.length * 0.75);
  const lostMiddle = relevantRules.filter((rule) => {
    const index = originalPositions.get(rule.content);
    // middleEnd is an exclusive bound: with 4 rules the window is indexes 1-2,
    // so the final rule (index 3) is not reported as being in the middle.
    return typeof index === "number" && index >= middleStart && index < middleEnd;
  });

  return {
    task,
    sources,
    rulesParsed: parsedRules.length,
    actionableRules: actionableRules.length,
    filteredRules: parsedRules.length - actionableRules.length,
    relevantRules: relevantRules.length,
    baseline: {
      relevantRulesInMiddle: lostMiddle.length,
      middleRiskPercent: relevantRules.length ? Math.round((lostMiddle.length / relevantRules.length) * 100) : 0
    },
    contextOS: {
      highRules: scheduled.highRules.length,
      midRules: scheduled.midRules.length,
      // NOTE(review): presumably scoreRules returns rules sorted by descending
      // score, so the first topK entries are the best matches; confirm in analyzer.js.
      topRules: scoredRules.slice(0, topK).map((rule) => ({
        score: rule.score,
        content: rule.content,
        reasons: rule.reasons || []
      })),
      repeatsHighRulesAtEnd: scheduled.highRules.length > 0
    }
  };
}
43
+
44
/**
 * Runs the benchmark against the AGENTS.md chain read for a working directory.
 *
 * @param {object} [options]
 * @param {string} [options.cwd=process.cwd()] - Directory passed to `readAgentsChain`.
 * @param {string} [options.task=""] - Task description forwarded to `benchmarkContext`.
 * @param {Array} [options.openFiles=[]] - Forwarded to `benchmarkContext`.
 * @param {number} [options.topK=8] - Forwarded to `benchmarkContext`.
 * @returns {object} Whatever `benchmarkContext` returns for the merged content.
 */
export function benchmarkWorkspace({ cwd = process.cwd(), task = "", openFiles = [], topK = 8 } = {}) {
  const { content: markdown, sources } = readAgentsChain({ cwd });
  return benchmarkContext({ markdown, sources, task, openFiles, topK });
}
54
+
55
/**
 * Renders a benchmark result as a newline-separated plain-text summary.
 *
 * @param {object} result - Object with `task`, the rule counts, `baseline`
 *   and `contextOS` fields as read below.
 * @returns {string} Header lines followed by an optional "Top rules:" list;
 *   the list is omitted when `contextOS.topRules` is empty.
 */
export function formatBenchmark(result) {
  const { baseline, contextOS } = result;
  const recency = contextOS.repeatsHighRulesAtEnd ? "enabled" : "not needed";

  const summary = [
    "ContextOS benchmark",
    `Task: ${result.task || "(empty)"}`,
    `Rules: ${result.rulesParsed} parsed, ${result.actionableRules} actionable, ${result.filteredRules} filtered`,
    `Relevant rules: ${result.relevantRules}`,
    `Baseline middle-risk: ${baseline.relevantRulesInMiddle}/${result.relevantRules} relevant rules (${baseline.middleRiskPercent}%)`,
    `ContextOS scheduled: ${contextOS.highRules} high, ${contextOS.midRules} mid`,
    `Recency reminder: ${recency}`
  ];

  if (contextOS.topRules.length) {
    summary.push("Top rules:");
    const ruleLines = contextOS.topRules.map((rule) => {
      // A missing or falsy score is printed as 0.00.
      const score = Number(rule.score || 0).toFixed(2);
      const suffix = rule.reasons?.length ? ` reasons:${rule.reasons.join(",")}` : "";
      return `- ${score} ${rule.content}${suffix}`;
    });
    summary.push(...ruleLines);
  }

  return summary.join("\n");
}
@@ -183,23 +183,31 @@ export function checkCompliance({ rules = [], addedLines = [], runtimeEvidence =
183
183
  if (!keywords.length || !addedLines.length) {
184
184
  results.push({
185
185
  rule,
186
- status: "unknown",
186
+ status: isRuntimeOnly || !addedLines.length || !keywords.length ? "unmeasurable" : "unknown",
187
187
  kind: isRuntimeOnly ? "runtime" : kind,
188
188
  keywords,
189
189
  evidence: isRuntimeOnly
190
- ? "requires runtime/tool-call telemetry; no matching runtime signal observed"
190
+ ? "requires runtime/tool-call telemetry; no runtime telemetry source observed"
191
191
  : (!addedLines.length ? "no added lines in git diff" : "no concrete compliance keywords found")
192
192
  });
193
193
  continue;
194
194
  }
195
195
 
196
196
  if (isRuntimeOnly) {
197
+ const hasRuntimeSource = Boolean(
198
+ runtimeEvidence.sources?.length ||
199
+ runtimeEvidence.signals?.length ||
200
+ runtimeEvidence.toolSignals?.length ||
201
+ runtimeEvidence.commandSignals?.length
202
+ );
197
203
  results.push({
198
204
  rule,
199
- status: "unknown",
205
+ status: hasRuntimeSource ? "unknown" : "unmeasurable",
200
206
  kind: "runtime",
201
207
  keywords,
202
- evidence: "requires runtime/tool-call telemetry; no matching runtime signal observed"
208
+ evidence: hasRuntimeSource
209
+ ? "requires runtime/tool-call telemetry; no matching runtime signal observed"
210
+ : "requires runtime/tool-call telemetry; no runtime telemetry source observed"
203
211
  });
204
212
  continue;
205
213
  }
@@ -5,6 +5,7 @@ export function buildReport({ cwd, prompt, relevantFiles, scheduled, gitSnapshot
5
5
  const followed = actionableCompliance.filter((item) => item.status === "followed");
6
6
  const ignored = actionableCompliance.filter((item) => item.status === "ignored");
7
7
  const unknown = actionableCompliance.filter((item) => item.status === "unknown");
8
+ const unmeasurable = actionableCompliance.filter((item) => item.status === "unmeasurable");
8
9
  const measured = followed.length + ignored.length;
9
10
  const efficiencyScore = measured ? Math.round((followed.length / measured) * 100) : null;
10
11
 
@@ -20,8 +21,10 @@ export function buildReport({ cwd, prompt, relevantFiles, scheduled, gitSnapshot
20
21
  followed,
21
22
  ignored,
22
23
  unknown,
24
+ unmeasurable,
23
25
  measuredRuleCount: measured,
24
26
  unknownRuleCount: unknown.length,
27
+ unmeasurableRuleCount: unmeasurable.length,
25
28
  efficiencyScore
26
29
  };
27
30
  }
@@ -32,7 +35,7 @@ export function formatReport(report) {
32
35
  lines.push("ContextOS report");
33
36
  lines.push(`Efficiency: ${report.efficiencyScore == null ? "unknown" : `${report.efficiencyScore}%`}`);
34
37
  lines.push(`Injected rules: ${report.injectedRuleCount || 0}`);
35
- lines.push(`Rule outcomes: ${report.followed?.length || 0} followed, ${report.ignored?.length || 0} ignored, ${report.unknown?.length || 0} unknown`);
38
+ lines.push(`Rule outcomes: ${report.followed?.length || 0} followed, ${report.ignored?.length || 0} ignored, ${report.unknown?.length || 0} unknown, ${report.unmeasurable?.length || 0} unmeasurable`);
36
39
  lines.push(`Measured rules: ${report.measuredRuleCount ?? ((report.followed?.length || 0) + (report.ignored?.length || 0))}`);
37
40
  lines.push(`Changed files: ${report.changedFiles?.length ? report.changedFiles.join(", ") : "none detected"}`);
38
41
 
@@ -48,6 +51,7 @@ export function formatReport(report) {
48
51
  appendBucket(lines, "Followed", report.followed);
49
52
  appendBucket(lines, "Ignored", report.ignored);
50
53
  appendBucket(lines, "Unknown", report.unknown);
54
+ appendBucket(lines, "Unmeasurable", report.unmeasurable);
51
55
 
52
56
  if (report.ignored?.length) {
53
57
  lines.push(`Suggestion: fix ignored rule evidence first: ${truncate(report.ignored[0].rule?.content || "", 70)}`);
@@ -71,7 +75,8 @@ export function formatEvidence(report) {
71
75
  const items = [
72
76
  ...(report.followed || []).map((item) => ({ ...item, status: "followed" })),
73
77
  ...(report.ignored || []).map((item) => ({ ...item, status: "ignored" })),
74
- ...(report.unknown || []).map((item) => ({ ...item, status: "unknown" }))
78
+ ...(report.unknown || []).map((item) => ({ ...item, status: "unknown" })),
79
+ ...(report.unmeasurable || []).map((item) => ({ ...item, status: "unmeasurable" }))
75
80
  ];
76
81
 
77
82
  if (!items.length) {
@@ -129,15 +134,18 @@ function sanitizeReport(report = {}) {
129
134
  const followed = (report.followed || []).filter((item) => !isSystemUserRule(item.rule));
130
135
  const ignored = (report.ignored || []).filter((item) => !isSystemUserRule(item.rule));
131
136
  const unknown = (report.unknown || []).filter((item) => !isSystemUserRule(item.rule));
137
+ const unmeasurable = (report.unmeasurable || []).filter((item) => !isSystemUserRule(item.rule));
132
138
  const measured = followed.length + ignored.length;
133
139
  return {
134
140
  ...report,
135
- injectedRuleCount: followed.length + ignored.length + unknown.length,
141
+ injectedRuleCount: followed.length + ignored.length + unknown.length + unmeasurable.length,
136
142
  followed,
137
143
  ignored,
138
144
  unknown,
145
+ unmeasurable,
139
146
  measuredRuleCount: measured,
140
147
  unknownRuleCount: unknown.length,
148
+ unmeasurableRuleCount: unmeasurable.length,
141
149
  efficiencyScore: measured ? Math.round((followed.length / measured) * 100) : null
142
150
  };
143
151
  }
@@ -57,6 +57,7 @@ export function loadStats(dataDir) {
57
57
  const followed = reports.reduce((sum, report) => sum + (report.followed?.length || 0), 0);
58
58
  const ignored = reports.reduce((sum, report) => sum + (report.ignored?.length || 0), 0);
59
59
  const unknown = reports.reduce((sum, report) => sum + (report.unknown?.length || 0), 0);
60
+ const unmeasurable = reports.reduce((sum, report) => sum + (report.unmeasurable?.length || 0), 0);
60
61
 
61
62
  return {
62
63
  dataDir,
@@ -71,6 +72,7 @@ export function loadStats(dataDir) {
71
72
  followed,
72
73
  ignored,
73
74
  unknown,
75
+ unmeasurable,
74
76
  lastPrompt: analyzedPrompts.at(-1) || null,
75
77
  lastReport: reports.at(-1) || null
76
78
  };
@@ -85,7 +87,7 @@ export function formatStats(stats) {
85
87
  lines.push(`Prompt mode: ${stats.injectedCount} injected, ${stats.quietCount} quiet (${stats.injectionRate}% injected)`);
86
88
  lines.push(`Average prompt analysis: ${stats.averagePromptMs == null ? "unknown" : `${stats.averagePromptMs}ms`}`);
87
89
  lines.push(`Average efficiency: ${formatAverageEfficiency(stats)}`);
88
- lines.push(`Rule outcomes: ${stats.followed} followed, ${stats.ignored} ignored, ${stats.unknown} unknown`);
90
+ lines.push(`Rule outcomes: ${stats.followed} followed, ${stats.ignored} ignored, ${stats.unknown} unknown, ${stats.unmeasurable || 0} unmeasurable`);
89
91
 
90
92
  const eventSummary = Object.entries(stats.events)
91
93
  .map(([event, count]) => `${event}:${count}`)
@@ -103,6 +105,7 @@ export function formatStats(stats) {
103
105
  lines.push(`Last report efficiency: ${stats.lastReport.efficiencyScore == null ? "unknown" : `${stats.lastReport.efficiencyScore}%`}`);
104
106
  lines.push(`Last report measured rules: ${stats.lastReport.measuredRuleCount ?? ((stats.lastReport.followed?.length || 0) + (stats.lastReport.ignored?.length || 0))}`);
105
107
  lines.push(`Last report unknown rules: ${stats.lastReport.unknownRuleCount ?? (stats.lastReport.unknown?.length || 0)}`);
108
+ lines.push(`Last report unmeasurable rules: ${stats.lastReport.unmeasurableRuleCount ?? (stats.lastReport.unmeasurable?.length || 0)}`);
106
109
  const changed = stats.lastReport.changedFiles?.join(", ");
107
110
  if (changed) lines.push(`Last changed files: ${changed}`);
108
111
  }