@beingmartinbmc/ojas 0.2.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +327 -167
- package/dist/cli/index.d.ts +23 -0
- package/dist/cli/index.d.ts.map +1 -0
- package/dist/cli/index.js +240 -0
- package/dist/cli/index.js.map +1 -0
- package/dist/index.d.ts +4 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +11 -1
- package/dist/index.js.map +1 -1
- package/dist/scorecard/badge.d.ts +46 -0
- package/dist/scorecard/badge.d.ts.map +1 -0
- package/dist/scorecard/badge.js +95 -0
- package/dist/scorecard/badge.js.map +1 -0
- package/dist/scorecard/index.d.ts +88 -0
- package/dist/scorecard/index.d.ts.map +1 -0
- package/dist/scorecard/index.js +186 -0
- package/dist/scorecard/index.js.map +1 -0
- package/docs/BACKLOG.md +2 -2
- package/docs/EVIDENCE.md +152 -10
- package/docs/EVIDENCE_MATRIX.md +97 -8
- package/docs/MCP.md +2 -2
- package/package.json +4 -2
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Ojas Scorecard — shareable, self-contained snapshots of agent health.
|
|
4
|
+
*
|
|
5
|
+
* A scorecard is the distribution surface for Ojas: it turns the rich
|
|
6
|
+
* `AgentHealthReport` into a compact, render-agnostic structure that the
|
|
7
|
+
* badge generator, CLI, GitHub Action, and live dashboard all consume.
|
|
8
|
+
*
|
|
9
|
+
* Design rules carried over from the rest of Ojas:
|
|
10
|
+
* - **Honest framing.** A scorecard never claims more than the report
|
|
11
|
+
* does. The `basis` and `interpretation` fields from `HealthScore`
|
|
12
|
+
* are propagated verbatim so a badge can never imply an empirical
|
|
13
|
+
* probability the underlying score does not have.
|
|
14
|
+
* - **No I/O here.** This module is pure data → data. Rendering to SVG
|
|
15
|
+
* lives in `./badge`, file/HTTP side effects live in the CLI.
|
|
16
|
+
* - **Backwards compatible.** Consuming a report produced by any 0.3.x
|
|
17
|
+
* Ojas build must succeed; new fields are optional.
|
|
18
|
+
*/
|
|
19
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
20
|
+
exports.gradeFor = gradeFor;
|
|
21
|
+
exports.buildScorecard = buildScorecard;
|
|
22
|
+
exports.gradeLabel = gradeLabel;
|
|
23
|
+
exports.gradeEmoji = gradeEmoji;
|
|
24
|
+
exports.renderScorecardText = renderScorecardText;
|
|
25
|
+
exports.renderScorecardMarkdown = renderScorecardMarkdown;
|
|
26
|
+
// Fixed module ordering; buildModules walks this list, so scorecard rows
// always come out in this sequence.
const MODULE_ORDER = ['aahar', 'nidra', 'vyayam', 'raksha', 'agni', 'pulse', 'chikitsa'];
|
|
35
|
+
/**
 * Translate a numeric score into a grade band.
 *
 * The cut-offs follow `classifyHealthState` in the MCP contracts
 * (healthy ≥ 90, watch ≥ 70) and add an `excellent` band on top so a
 * near-perfect agent stands apart from a merely healthy one on the
 * badge/leaderboard, where small gaps matter.
 *
 * @param {number} score - Score compared against the 95/90/70/50 cut-offs.
 * @returns {'excellent'|'healthy'|'watch'|'degraded'|'critical'} The first
 *   band whose floor the score reaches; `critical` when it reaches none.
 */
function gradeFor(score) {
    // Ordered from the highest floor down; the first match wins.
    const bands = [
        [95, 'excellent'],
        [90, 'healthy'],
        [70, 'watch'],
        [50, 'degraded'],
    ];
    for (const [floor, grade] of bands) {
        if (score >= floor) {
            return grade;
        }
    }
    return 'critical';
}
|
|
52
|
+
/**
 * Produce one `{ module, score, grade }` row per entry of `MODULE_ORDER`.
 *
 * @param {Object} scores - Per-module scores keyed by module name.
 * @returns {Array<{module: string, score: *, grade: string}>} Rows in
 *   `MODULE_ORDER` sequence; `score` is copied through as found and `grade`
 *   is whatever `gradeFor` returns for it.
 */
function buildModules(scores) {
    const rows = [];
    for (const name of MODULE_ORDER) {
        const value = scores[name];
        rows.push({ module: name, score: value, grade: gradeFor(value) });
    }
    return rows;
}
|
|
58
|
+
/**
 * Build a shareable scorecard from a full health report. Pure function —
 * the same report always yields the same scorecard (modulo the report's
 * own timestamp, which is copied through). The report is not mutated.
 *
 * @param {Object} report - Health report. Reads `agentId`, `timestamp`,
 *   `overall.value`, optional `overall.basis` / `overall.interpretation`,
 *   `moduleScores`, and `recommendations` (treated as empty when absent).
 * @param {Object} [opts] - Optional limits.
 * @param {number} [opts.maxRisks=5] - Maximum number of entries in `topRisks`.
 * @param {number} [opts.maxActions=6] - Maximum number of entries in
 *   `recommendedActions`.
 * @returns {Object} Scorecard with `agentId`, `timestamp`, `overall`
 *   (`overall.value * 100`, rounded), `grade`, `basis`, `interpretation`,
 *   `modules`, `topRisks`, `recommendedActions`, `requiresHumanReview`
 *   and `schemaVersion`.
 */
function buildScorecard(report, opts = {}) {
    // `opts = {}` only covers `undefined`; optional chaining also tolerates an explicit `null`.
    const maxRisks = opts?.maxRisks ?? 5;
    const maxActions = opts?.maxActions ?? 6;
    const overall = Math.round(report.overall.value * 100);
    // A report that carries no recommendations list is handled as "no recommendations"
    // instead of throwing, in line with the module's backward-compatibility rule.
    const recommendations = report.recommendations ?? [];
    // Risks: warning/critical recommendations, most severe first.
    const severityRank = new Map([
        ['critical', 0],
        ['warning', 1],
        ['info', 2],
    ]);
    // Unrecognised severities rank after every known one. Without a fallback the
    // comparator would subtract `undefined` and return NaN, leaving the order
    // implementation-defined. A finite sentinel keeps unknown-vs-unknown at 0.
    const rankOf = (severity) => severityRank.get(severity) ?? Number.MAX_SAFE_INTEGER;
    const topRisks = recommendations
        .filter((r) => r.severity !== 'info') // filter returns a fresh array, so sorting it is safe
        .sort((a, b) => rankOf(a.severity) - rankOf(b.severity))
        .slice(0, maxRisks)
        .map((r) => ({ module: r.module, severity: r.severity, message: r.message, action: r.action }));
    // Actions: deduplicated concrete next steps from non-info recommendations,
    // kept in the order the report lists them.
    const recommendedActions = [
        ...new Set(recommendations
            .filter((r) => r.severity !== 'info' && r.action)
            .map((r) => r.action)),
    ].slice(0, maxActions);
    const requiresHumanReview = recommendations.some((r) => r.severity === 'critical');
    return {
        agentId: report.agentId,
        timestamp: report.timestamp,
        overall,
        grade: gradeFor(overall),
        basis: report.overall.basis ?? 'heuristic',
        interpretation: report.overall.interpretation ??
            'Advisory diagnostic score. Use for triage and trend deltas, not as a production failure probability.',
        modules: buildModules(report.moduleScores),
        topRisks,
        recommendedActions,
        requiresHumanReview,
        schemaVersion: 1,
    };
}
|
|
97
|
+
// Display label per grade. Null-prototype and frozen so a lookup can only
// ever hit one of the five grade keys, never an inherited Object.prototype
// member such as `constructor` or `toString`.
const GRADE_LABEL = Object.freeze({
    __proto__: null,
    excellent: 'Excellent',
    healthy: 'Healthy',
    watch: 'Watch',
    degraded: 'Degraded',
    critical: 'Critical',
});
// Status emoji per grade; same null-prototype treatment as GRADE_LABEL.
const GRADE_EMOJI = Object.freeze({
    __proto__: null,
    excellent: '🟢',
    healthy: '🟢',
    watch: '🟡',
    degraded: '🟠',
    critical: '🔴',
});
/**
 * Look up the display label for a grade.
 *
 * @param {string} grade - One of excellent/healthy/watch/degraded/critical.
 * @returns {string|undefined} The label, or `undefined` for any other key.
 */
function gradeLabel(grade) {
    return GRADE_LABEL[grade];
}
/**
 * Look up the status emoji for a grade.
 *
 * @param {string} grade - One of excellent/healthy/watch/degraded/critical.
 * @returns {string|undefined} The emoji, or `undefined` for any other key.
 */
function gradeEmoji(grade) {
    return GRADE_EMOJI[grade];
}
|
|
117
|
+
/**
 * Render a scorecard as a compact, copy-pasteable plain-text card for the
 * terminal. No ANSI colour so output is safe to redirect to a file.
 *
 * @param {Object} card - Scorecard. Reads `agentId`, `overall`, `grade`,
 *   `basis`, `modules` (`module`, `score`, `grade` per row), `topRisks`
 *   (`severity`, `message` per row), `recommendedActions` and
 *   `interpretation`.
 * @returns {string} The card as newline-joined text. The "Top risks" and
 *   "Recommended next steps" sections appear only when non-empty.
 */
function renderScorecardText(card) {
    const lines = [];
    lines.push(`Ojas Health Scorecard — ${card.agentId}`);
    lines.push('='.repeat(44));
    lines.push(`Overall: ${card.overall}/100 ${gradeEmoji(card.grade)} ${gradeLabel(card.grade)}`);
    lines.push(`Basis: ${card.basis}`);
    lines.push('');
    lines.push('Modules');
    for (const m of card.modules) {
        // One filled cell per 10 points, clamped to the 10-cell bar.
        // String.prototype.repeat throws RangeError for a negative or infinite
        // count, and a count above 10 would overflow the padded width.
        const cells = Math.round(m.score / 10);
        const filled = Number.isNaN(cells) ? 0 : Math.min(10, Math.max(0, cells));
        const bar = '█'.repeat(filled).padEnd(10, '·');
        lines.push(` ${m.module.padEnd(9)} ${String(m.score).padStart(3)} ${bar} ${gradeEmoji(m.grade)}`);
    }
    if (card.topRisks.length > 0) {
        lines.push('');
        lines.push('Top risks');
        for (const r of card.topRisks) {
            lines.push(` [${r.severity.toUpperCase()}] ${r.message}`);
        }
    }
    if (card.recommendedActions.length > 0) {
        lines.push('');
        lines.push('Recommended next steps');
        for (const a of card.recommendedActions) {
            lines.push(` • ${a}`);
        }
    }
    lines.push('');
    lines.push(card.interpretation);
    return lines.join('\n');
}
|
|
151
|
+
/**
 * Render a scorecard as GitHub-flavoured Markdown — used by the CLI's
 * `--format markdown` and by the GitHub Action's PR comment.
 *
 * @param {Object} card - Scorecard. Reads `agentId`, `overall`, `grade`,
 *   `modules`, `topRisks`, `recommendedActions` and `interpretation`.
 * @returns {string} Markdown: a heading, the overall line, a module table,
 *   optional "Top risks" and "Recommended next steps" bullet lists, and the
 *   interpretation wrapped in `<sub>`.
 */
function renderScorecardMarkdown(card) {
    // Array elements are evaluated top to bottom, so the document reads in
    // output order.
    const doc = [
        `### ${gradeEmoji(card.grade)} Ojas Health — \`${card.agentId}\``,
        '',
        `**Overall: ${card.overall}/100 (${gradeLabel(card.grade)})**`,
        '',
        '| Module | Score | Grade |',
        '| --- | --- | --- |',
        ...[...card.modules].map((row) => `| ${row.module} | ${row.score} | ${gradeEmoji(row.grade)} ${gradeLabel(row.grade)} |`),
        ...(card.topRisks.length > 0
            ? ['', '**Top risks**', '', ...[...card.topRisks].map((risk) => `- \`${risk.severity}\` ${risk.message}`)]
            : []),
        ...(card.recommendedActions.length > 0
            ? ['', '**Recommended next steps**', '', ...[...card.recommendedActions].map((step) => `- ${step}`)]
            : []),
        '',
        `<sub>${card.interpretation}</sub>`,
    ];
    return doc.join('\n');
}
|
|
186
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/scorecard/index.ts"],"names":[],"mappings":";AAAA;;;;;;;;;;;;;;;;GAgBG;;AAqEH,4BAMC;AAcD,wCAwCC;AAkBD,gCAEC;AAED,gCAEC;AAMD,kDA6BC;AAMD,0DA8BC;AA3KD,MAAM,YAAY,GAA8B;IAC9C,OAAO;IACP,OAAO;IACP,QAAQ;IACR,QAAQ;IACR,MAAM;IACN,OAAO;IACP,UAAU;CACX,CAAC;AAEF;;;;;GAKG;AACH,SAAgB,QAAQ,CAAC,KAAa;IACpC,IAAI,KAAK,IAAI,EAAE;QAAE,OAAO,WAAW,CAAC;IACpC,IAAI,KAAK,IAAI,EAAE;QAAE,OAAO,SAAS,CAAC;IAClC,IAAI,KAAK,IAAI,EAAE;QAAE,OAAO,OAAO,CAAC;IAChC,IAAI,KAAK,IAAI,EAAE;QAAE,OAAO,UAAU,CAAC;IACnC,OAAO,UAAU,CAAC;AACpB,CAAC;AAED,SAAS,YAAY,CAAC,MAA4B;IAChD,OAAO,YAAY,CAAC,GAAG,CAAC,CAAC,MAAM,EAAE,EAAE;QACjC,MAAM,KAAK,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC;QAC7B,OAAO,EAAE,MAAM,EAAE,KAAK,EAAE,KAAK,EAAE,QAAQ,CAAC,KAAK,CAAC,EAAE,CAAC;IACnD,CAAC,CAAC,CAAC;AACL,CAAC;AAED;;;;GAIG;AACH,SAAgB,cAAc,CAAC,MAAyB,EAAE,OAA8B,EAAE;IACxF,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,IAAI,CAAC,CAAC;IACpC,MAAM,UAAU,GAAG,IAAI,CAAC,UAAU,IAAI,CAAC,CAAC;IACxC,MAAM,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,OAAO,CAAC,KAAK,GAAG,GAAG,CAAC,CAAC;IAEvD,8DAA8D;IAC9D,MAAM,YAAY,GAAG,EAAE,QAAQ,EAAE,CAAC,EAAE,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,CAAC,EAAW,CAAC;IACnE,MAAM,QAAQ,GAAoB,MAAM,CAAC,eAAe;SACrD,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,KAAK,MAAM,CAAC;SACpC,KAAK,EAAE;SACP,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,YAAY,CAAC,CAAC,CAAC,QAAQ,CAAC,GAAG,YAAY,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC;SACnE,KAAK,CAAC,CAAC,EAAE,QAAQ,CAAC;SAClB,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE,QAAQ,EAAE,CAAC,CAAC,QAAQ,EAAE,OAAO,EAAE,CAAC,CAAC,OAAO,EAAE,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;IAElG,2EAA2E;IAC3E,MAAM,kBAAkB,GAAG;QACzB,GAAG,IAAI,GAAG,CACR,MAAM,CAAC,eAAe;aACnB,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,KAAK,MAAM,IAAI,CAAC,CAAC,MAAM,CAAC;aAChD,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAgB,CAAC,CAClC;KACF,CAAC,KAAK,CAAC,CAAC,EAAE,UAAU,CAAC,CAAC;IAEvB,MAAM,mBAAmB,GAAG,MAAM,CAAC,eAAe,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,KAAK,UAAU,C
AAC,CAAC;IAE1F,OAAO;QACL,OAAO,EAAE,MAAM,CAAC,OAAO;QACvB,SAAS,EAAE,MAAM,CAAC,SAAS;QAC3B,OAAO;QACP,KAAK,EAAE,QAAQ,CAAC,OAAO,CAAC;QACxB,KAAK,EAAE,MAAM,CAAC,OAAO,CAAC,KAAK,IAAI,WAAW;QAC1C,cAAc,EACZ,MAAM,CAAC,OAAO,CAAC,cAAc;YAC7B,sGAAsG;QACxG,OAAO,EAAE,YAAY,CAAC,MAAM,CAAC,YAAY,CAAC;QAC1C,QAAQ;QACR,kBAAkB;QAClB,mBAAmB;QACnB,aAAa,EAAE,CAAC;KACjB,CAAC;AACJ,CAAC;AAED,MAAM,WAAW,GAAmC;IAClD,SAAS,EAAE,WAAW;IACtB,OAAO,EAAE,SAAS;IAClB,KAAK,EAAE,OAAO;IACd,QAAQ,EAAE,UAAU;IACpB,QAAQ,EAAE,UAAU;CACrB,CAAC;AAEF,MAAM,WAAW,GAAmC;IAClD,SAAS,EAAE,IAAI;IACf,OAAO,EAAE,IAAI;IACb,KAAK,EAAE,IAAI;IACX,QAAQ,EAAE,IAAI;IACd,QAAQ,EAAE,IAAI;CACf,CAAC;AAEF,SAAgB,UAAU,CAAC,KAAqB;IAC9C,OAAO,WAAW,CAAC,KAAK,CAAC,CAAC;AAC5B,CAAC;AAED,SAAgB,UAAU,CAAC,KAAqB;IAC9C,OAAO,WAAW,CAAC,KAAK,CAAC,CAAC;AAC5B,CAAC;AAED;;;GAGG;AACH,SAAgB,mBAAmB,CAAC,IAAe;IACjD,MAAM,KAAK,GAAa,EAAE,CAAC;IAC3B,KAAK,CAAC,IAAI,CAAC,2BAA2B,IAAI,CAAC,OAAO,EAAE,CAAC,CAAC;IACtD,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC;IAC3B,KAAK,CAAC,IAAI,CAAC,YAAY,IAAI,CAAC,OAAO,SAAS,UAAU,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,UAAU,CAAC,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC;IAChG,KAAK,CAAC,IAAI,CAAC,YAAY,IAAI,CAAC,KAAK,EAAE,CAAC,CAAC;IACrC,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IACf,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;IACtB,KAAK,MAAM,CAAC,IAAI,IAAI,CAAC,OAAO,EAAE,CAAC;QAC7B,MAAM,GAAG,GAAG,GAAG,CAAC,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,KAAK,GAAG,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,EAAE,EAAE,GAAG,CAAC,CAAC;QACjE,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,IAAI,MAAM,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,KAAK,GAAG,IAAI,UAAU,CAAC,CAAC,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC;IACtG,CAAC;IACD,IAAI,IAAI,CAAC,QAAQ,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC7B,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QACf,KAAK,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;QACxB,KAAK,MAAM,CAAC,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAC;YAC9B,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,QAAQ,CAAC,WAAW,EAAE,KAAK,CAAC,CAAC,OAAO,EAAE,CAAC,CAAC;QAC7D,CAAC;IACH,CAAC;IACD,IAAI,IAAI,CAAC,kBAAkB,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACvC
,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QACf,KAAK,CAAC,IAAI,CAAC,wBAAwB,CAAC,CAAC;QACrC,KAAK,MAAM,CAAC,IAAI,IAAI,CAAC,kBAAkB,EAAE,CAAC;YACxC,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC;QACzB,CAAC;IACH,CAAC;IACD,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IACf,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;IAChC,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC1B,CAAC;AAED;;;GAGG;AACH,SAAgB,uBAAuB,CAAC,IAAe;IACrD,MAAM,KAAK,GAAa,EAAE,CAAC;IAC3B,KAAK,CAAC,IAAI,CAAC,OAAO,UAAU,CAAC,IAAI,CAAC,KAAK,CAAC,oBAAoB,IAAI,CAAC,OAAO,IAAI,CAAC,CAAC;IAC9E,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IACf,KAAK,CAAC,IAAI,CAAC,cAAc,IAAI,CAAC,OAAO,SAAS,UAAU,CAAC,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;IAC3E,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IACf,KAAK,CAAC,IAAI,CAAC,4BAA4B,CAAC,CAAC;IACzC,KAAK,CAAC,IAAI,CAAC,qBAAqB,CAAC,CAAC;IAClC,KAAK,MAAM,CAAC,IAAI,IAAI,CAAC,OAAO,EAAE,CAAC;QAC7B,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,MAAM,MAAM,CAAC,CAAC,KAAK,MAAM,UAAU,CAAC,CAAC,CAAC,KAAK,CAAC,IAAI,UAAU,CAAC,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAC7F,CAAC;IACD,IAAI,IAAI,CAAC,QAAQ,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC7B,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QACf,KAAK,CAAC,IAAI,CAAC,eAAe,CAAC,CAAC;QAC5B,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QACf,KAAK,MAAM,CAAC,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAC;YAC9B,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,QAAQ,MAAM,CAAC,CAAC,OAAO,EAAE,CAAC,CAAC;QACjD,CAAC;IACH,CAAC;IACD,IAAI,IAAI,CAAC,kBAAkB,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACvC,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QACf,KAAK,CAAC,IAAI,CAAC,4BAA4B,CAAC,CAAC;QACzC,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QACf,KAAK,MAAM,CAAC,IAAI,IAAI,CAAC,kBAAkB,EAAE,CAAC;YACxC,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC;QACvB,CAAC;IACH,CAAC;IACD,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IACf,KAAK,CAAC,IAAI,CAAC,QAAQ,IAAI,CAAC,cAAc,QAAQ,CAAC,CAAC;IAChD,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC1B,CAAC"}
|
package/docs/BACKLOG.md
CHANGED
|
@@ -81,7 +81,7 @@ What's still open is below.
|
|
|
81
81
|
## Trust roadmap
|
|
82
82
|
|
|
83
83
|
The single biggest open question for this project is *"does it actually
|
|
84
|
-
help an agent?"*. v0.
|
|
84
|
+
help an agent?"*. v0.3 ships at evidence level L2 / L2.5 (synthetic
|
|
85
85
|
reproducible benchmarks); the roadmap below moves it toward L3 (realistic
|
|
86
86
|
agent tasks) and L4 (production telemetry). Each phase is independently
|
|
87
87
|
landable.
|
|
@@ -148,7 +148,7 @@ All items in this phase have been shipped:
|
|
|
148
148
|
anger to opt-in to anonymised aggregate stats (failure detection
|
|
149
149
|
precision / recall, score-vs-incident correlation) — *still open*.
|
|
150
150
|
|
|
151
|
-
## Reframed as a v0.
|
|
151
|
+
## Reframed as a v0.3 non-goal, not deferred work
|
|
152
152
|
|
|
153
153
|
### MCP authentication / authorization on the boundary
|
|
154
154
|
Multiple review rounds flagged "no caller authentication" as critical /
|
package/docs/EVIDENCE.md
CHANGED
|
@@ -2,14 +2,14 @@
|
|
|
2
2
|
|
|
3
3
|
> **Auto-generated by `npm run benchmark:write`. Do not edit by hand.**
|
|
4
4
|
|
|
5
|
-
- **ojas**: `0.
|
|
6
|
-
- **node**: `
|
|
7
|
-
- **timestamp**: 2026-
|
|
8
|
-
- **suites**:
|
|
5
|
+
- **ojas**: `0.4.0`
|
|
6
|
+
- **node**: `v25.9.0`
|
|
7
|
+
- **timestamp**: 2026-06-07T09:07:46.375Z
|
|
8
|
+
- **suites**: 18/18 passed — targeted failure suites improved and diagnostic/no-regression suites met their acceptance criteria.
|
|
9
9
|
|
|
10
10
|
## What this measures
|
|
11
11
|
|
|
12
|
-
|
|
12
|
+
18 A/B benchmarks comparing a deliberately vulnerable agent running **without Ojas** vs the **same agent + Ojas**. Suites 1–7 are L2 single-run regressions (prompt injection, context pollution, tool loops, memory safety, cognitive drift, stress resilience, cost pressure). Suite 8 is L2.5 — a realistic retrieval-QA benchmark with **seeded fixtures**, **bootstrap 95 % confidence intervals** across multiple seeds, and **per-scenario raw rows** written to `benchmarks/results/raw/*.jsonl` on `npm run benchmark:write`. See `docs/EVIDENCE_MATRIX.md` for the evidence ladder and `docs/KNOWN_FAILURES.md` for the failure modes these benchmarks deliberately do *not* probe.
|
|
13
13
|
|
|
14
14
|
> The vulnerable agents are synthetic and have explicitly-programmed failure modes. These benchmarks prove that Ojas's detection and recovery mechanisms work as designed against canonical failure patterns. Production performance depends on the real agent's vulnerabilities and on tuning the Ojas policies for your workload. The harness is seeded via `OJAS_BENCH_SEED`; the project-default seed is reproduced on every CI run.
|
|
15
15
|
|
|
@@ -28,6 +28,13 @@
|
|
|
28
28
|
| 9 | Health-score calibration (L2.5) | aahar, nidra, vyayam, raksha, agni, pulse, chikitsa | ✅ |
|
|
29
29
|
| 10 | Ablation matrix — per-module contribution | raksha, aahar, nidra, vyayam, agni, pulse | ✅ |
|
|
30
30
|
| 11 | Flaky-tool resilience | vyayam, pulse | ✅ |
|
|
31
|
+
| 12 | Hallucination detection (ensemble) | raksha | ✅ |
|
|
32
|
+
| 13 | Model router (Wilson CI routing) | agni | ✅ |
|
|
33
|
+
| 14 | Response distiller (3 intensities) | agni | ✅ |
|
|
34
|
+
| 15 | MCP round-trip contract (18 tools) | aahar, nidra, vyayam, raksha, agni, pulse, chikitsa | ✅ |
|
|
35
|
+
| 16 | Fitness gate threshold math | pulse, chikitsa | ✅ |
|
|
36
|
+
| 17 | Memory write policy (4 tiers) | nidra, raksha | ✅ |
|
|
37
|
+
| 18 | Recovery protocol correctness | chikitsa, pulse | ✅ |
|
|
31
38
|
|
|
32
39
|
## Per-suite results
|
|
33
40
|
|
|
@@ -35,17 +42,18 @@
|
|
|
35
42
|
|
|
36
43
|
*Modules: raksha, aahar*
|
|
37
44
|
|
|
38
|
-
|
|
45
|
+
51 adversarial inputs across direct override, markup boundary, role confusion, memory poisoning, authority claim, embedded, obfuscated, and policy-laundering categories — evaluated against 30 benign controls across plain technical docs, security-topic discussions, Cyrillic/Greek prose, JWT-like base64 tokens, and marketing / customer-support copy to surface false_positive_rate honestly.
|
|
39
46
|
|
|
40
47
|
| Metric | Baseline | With Ojas | Δ | Better |
|
|
41
48
|
|---|---:|---:|---:|:---:|
|
|
42
|
-
| `attacks_succeeded` |
|
|
43
|
-
| `compliance_rate` |
|
|
44
|
-
| `attacks_quarantined_by_raksha` | 0/
|
|
49
|
+
| `attacks_succeeded` | 27/51 | 2/51 | | ↓ |
|
|
50
|
+
| `compliance_rate` | 52.9 % | 3.9 % | −92.6% | ↓ |
|
|
51
|
+
| `attacks_quarantined_by_raksha` | 0/51 | 48/51 | 94.1% | ↑ |
|
|
45
52
|
| `benign_controls_preserved` | 30/30 | 30/30 | | ✓ |
|
|
46
53
|
| `false_positive_rate` | 0 | 0 | 0.0% | ↓ |
|
|
54
|
+
| `detection_latency_p99` | n/a ms | 1.65 ms | | ↓ |
|
|
47
55
|
|
|
48
|
-
>
|
|
56
|
+
> 2 attack(s) still slipped past Raksha; review their patterns to harden the rules.
|
|
49
57
|
> false_positive_rate = 0.0% on 30 benign controls (tolerance ≤ 5%).
|
|
50
58
|
|
|
51
59
|
### 2. Context pollution survival ✅
|
|
@@ -169,6 +177,7 @@ All 8 Vyayam stress types (intensity 0.7) executed against the raw NaiveComplian
|
|
|
169
177
|
| `isotonic_bins` | n/a | 16 | | − |
|
|
170
178
|
| `brier_score_raw_vs_synthetic_success` | n/a | 0.23 | | ↓ |
|
|
171
179
|
| `brier_score_isotonic_calibrated` | 0.23 | 0.219 | 0.011 | ↓ |
|
|
180
|
+
| `threshold_band_accuracy` | n/a | 0.848 | | ↑ |
|
|
172
181
|
|
|
173
182
|
> Seeds: `[101, 202, 303, 404, 505]` · evidence level `L2.5` · pass kind `diagnostic` · CI bounds in brackets are bootstrap 95% intervals across these seeds.
|
|
174
183
|
|
|
@@ -177,6 +186,7 @@ All 8 Vyayam stress types (intensity 0.7) executed against the raw NaiveComplian
|
|
|
177
186
|
> Bucket failure rates: [0.0, 0.2)=empty (n=0), [0.2, 0.4)=67% (n=93), [0.4, 0.6)=57% (n=165), [0.6, 0.8)=39% (n=176), [0.8, 1.0]=24% (n=66).
|
|
178
187
|
> Monotonicity: holds within 5pp slack.
|
|
179
188
|
> Synthetic isotonic calibration: 16 bins, Brier 0.230 raw → 0.219 calibrated. This validates an advisory diagnostic mapping on the synthetic suite only, not a production probability model.
|
|
189
|
+
> Threshold-band accuracy: 424/500 instances mapped to expected health state (84.8%).
|
|
180
190
|
> Limitations: synthetic q→telemetry mapping; not validated against real LLM degradation. See docs/EVIDENCE_MATRIX.md and docs/KNOWN_FAILURES.md.
|
|
181
191
|
|
|
182
192
|
### 10. Ablation matrix — per-module contribution ✅
|
|
@@ -218,6 +228,138 @@ Non-deterministic fault profiles (intermittent 500s, high latency, connection re
|
|
|
218
228
|
> 9 faults injected across 32 runs (28.1% fault rate).
|
|
219
229
|
> 2 throw-mode crashes handled gracefully.
|
|
220
230
|
|
|
231
|
+
### 12. Hallucination detection (ensemble) ✅
|
|
232
|
+
|
|
233
|
+
*Modules: raksha*
|
|
234
|
+
|
|
235
|
+
20 fabricated outputs + 15 truthful outputs + 5 abstention outputs evaluated against grounding context.
|
|
236
|
+
|
|
237
|
+
| Metric | Baseline | With Ojas | Δ | Better |
|
|
238
|
+
|---|---:|---:|---:|:---:|
|
|
239
|
+
| `fabricated_detection_rate` | 0 | 1 | | ↑ |
|
|
240
|
+
| `truthful_false_positive_rate` | 0 | 0 | | ↓ |
|
|
241
|
+
| `abstention_detection_rate` | 0 | 1 | | ↑ |
|
|
242
|
+
| `claim_grounding_accuracy` | 0 | 0.25 | | ↑ |
|
|
243
|
+
| `fabricated_detected` | 0/20 | 20/20 | | ↑ |
|
|
244
|
+
| `truthful_preserved` | 0/15 | 15/15 | | ↑ |
|
|
245
|
+
|
|
246
|
+
> Fabricated detection rate: 100.0% (target ≥ 25%).
|
|
247
|
+
> Truthful false-positive rate: 0.0% (tolerance ≤ 10%).
|
|
248
|
+
> Abstention detection: 5/5.
|
|
249
|
+
> Claim-level grounding accuracy on fabricated set: 25.0%.
|
|
250
|
+
|
|
251
|
+
### 13. Model router (Wilson CI routing) ✅
|
|
252
|
+
|
|
253
|
+
*Modules: agni*
|
|
254
|
+
|
|
255
|
+
6 sparse-data tests, 3 safety-class tests, 5 convergence tests, 5 mixed-outcome tests, 6 Wilson CI validity checks.
|
|
256
|
+
|
|
257
|
+
| Metric | Baseline | With Ojas | Δ | Better |
|
|
258
|
+
|---|---:|---:|---:|:---:|
|
|
259
|
+
| `fail_closed_rate_sparse` | n/a | 1 | | ↑ |
|
|
260
|
+
| `safety_class_flagship_rate` | n/a | 1 | | ↑ |
|
|
261
|
+
| `convergence_to_cheap_rate` | n/a | 1 | | ↑ |
|
|
262
|
+
| `mixed_stays_flagship_rate` | n/a | 1 | | ↑ |
|
|
263
|
+
| `wilson_ci_coverage` | n/a | 0.833 | | ↑ |
|
|
264
|
+
|
|
265
|
+
> Fail-closed: 6/6 sparse queries returned flagship.
|
|
266
|
+
> Safety classes: 3/3 always flagship.
|
|
267
|
+
> Convergence: 5/5 high-success classes routed cheap.
|
|
268
|
+
> Mixed: 5/5 50/50 classes stayed flagship.
|
|
269
|
+
|
|
270
|
+
### 14. Response distiller (3 intensities) ✅
|
|
271
|
+
|
|
272
|
+
*Modules: agni*
|
|
273
|
+
|
|
274
|
+
20 agent outputs × 3 intensity levels. Verifies code blocks preserved, substance retained, and token savings increase with intensity.
|
|
275
|
+
|
|
276
|
+
| Metric | Baseline | With Ojas | Δ | Better |
|
|
277
|
+
|---|---:|---:|---:|:---:|
|
|
278
|
+
| `avg_tokens_removed_lite` | 0 tokens | 4.3 tokens | | ↑ |
|
|
279
|
+
| `avg_tokens_removed_full` | 0 tokens | 8.1 tokens | | ↑ |
|
|
280
|
+
| `avg_tokens_removed_ultra` | 0 tokens | 8.3 tokens | | ↑ |
|
|
281
|
+
| `code_block_survival_rate` | n/a | 1 | | ↑ |
|
|
282
|
+
| `substance_retention_rate` | n/a | 1 | | ↑ |
|
|
283
|
+
| `intensity_monotonicity` | n/a | yes | | ✓ |
|
|
284
|
+
|
|
285
|
+
> Code blocks: 5/5 preserved (100% required).
|
|
286
|
+
> Substance: 39/39 markers retained at full intensity.
|
|
287
|
+
> Token removal monotonicity: lite(4.3) ≤ full(8.1) ≤ ultra(8.3).
|
|
288
|
+
|
|
289
|
+
### 15. MCP round-trip contract (18 tools) ✅
|
|
290
|
+
|
|
291
|
+
*Modules: aahar, nidra, vyayam, raksha, agni, pulse, chikitsa*
|
|
292
|
+
|
|
293
|
+
Exercises all 18 Ojas MCP tools and verifies each returns the standard envelope (status, correlation_id, agent_id, affected_modules, etc.).
|
|
294
|
+
|
|
295
|
+
| Metric | Baseline | With Ojas | Δ | Better |
|
|
296
|
+
|---|---:|---:|---:|:---:|
|
|
297
|
+
| `tools_passing_contract` | 0/18 | 18/18 | | ↑ |
|
|
298
|
+
| `envelope_schema_compliance` | 0 | 1 | | ↑ |
|
|
299
|
+
| `correlation_id_uniqueness` | n/a | yes | | ✓ |
|
|
300
|
+
|
|
301
|
+
> 18/18 tools passed envelope contract.
|
|
302
|
+
> All correlation IDs are unique.
|
|
303
|
+
|
|
304
|
+
### 16. Fitness gate threshold math ✅
|
|
305
|
+
|
|
306
|
+
*Modules: pulse, chikitsa*
|
|
307
|
+
|
|
308
|
+
12 gate-decision scenarios across high/mid/low score agents × 4 risk levels + 12 health-state band classifications.
|
|
309
|
+
|
|
310
|
+
| Metric | Baseline | With Ojas | Δ | Better |
|
|
311
|
+
|---|---:|---:|---:|:---:|
|
|
312
|
+
| `gate_decision_consistency` | n/a | 1 | | ↑ |
|
|
313
|
+
| `safe_mode_trigger_consistency` | n/a | 1 | | ↑ |
|
|
314
|
+
| `risk_boost_monotonicity` | n/a | 1 | | ↑ |
|
|
315
|
+
| `threshold_band_accuracy` | n/a | 1 | | ↑ |
|
|
316
|
+
| `gate_scenarios_tested` | 0 | 12 | | ↑ |
|
|
317
|
+
|
|
318
|
+
> Gate decision consistency: 12/12 (100%).
|
|
319
|
+
> Safe-mode trigger consistency: 12/12 (100%).
|
|
320
|
+
> Risk-boost monotonicity: 9/9 (100%).
|
|
321
|
+
> Band accuracy: 12/12 (100%).
|
|
322
|
+
|
|
323
|
+
### 17. Memory write policy (4 tiers) ✅
|
|
324
|
+
|
|
325
|
+
*Modules: nidra, raksha*
|
|
326
|
+
|
|
327
|
+
30 candidate memory writes across 4 confidence tiers (committed / candidate / session_note / rejected). 8 candidates contain prompt-injection payloads.
|
|
328
|
+
|
|
329
|
+
| Metric | Baseline | With Ojas | Δ | Better |
|
|
330
|
+
|---|---:|---:|---:|:---:|
|
|
331
|
+
| `tier_accuracy` | n/a | 0.967 | | ↑ |
|
|
332
|
+
| `raksha_rejection_rate` | n/a | 0.875 | | ↑ |
|
|
333
|
+
| `false_commit_rate` | n/a | 0 | | ↓ |
|
|
334
|
+
| `false_reject_rate` | n/a | 0 | | ↓ |
|
|
335
|
+
| `candidates_tested` | 0 | 30 | | ↑ |
|
|
336
|
+
|
|
337
|
+
> Tier accuracy: 29/30 (97%).
|
|
338
|
+
> Raksha rejection of tainted: 7/8 (88%).
|
|
339
|
+
> False commits (tainted → committed): 0.
|
|
340
|
+
> False rejects (clean high-conf → rejected): 0.
|
|
341
|
+
|
|
342
|
+
### 18. Recovery protocol correctness ✅
|
|
343
|
+
|
|
344
|
+
*Modules: chikitsa, pulse*
|
|
345
|
+
|
|
346
|
+
7 recovery types × 3 modes = 21 test scenarios. Verifies action plans, mode semantics, and vocabulary coverage.
|
|
347
|
+
|
|
348
|
+
| Metric | Baseline | With Ojas | Δ | Better |
|
|
349
|
+
|---|---:|---:|---:|:---:|
|
|
350
|
+
| `recovery_type_coverage` | n/a | 1 | | ↑ |
|
|
351
|
+
| `action_vocabulary_coverage` | n/a | 9/9 | | ↑ |
|
|
352
|
+
| `recommend_no_mutation_rate` | n/a | 1 | | ↑ |
|
|
353
|
+
| `apply_safe_mode_correctness` | n/a | 1 | | ↑ |
|
|
354
|
+
| `non_empty_plans` | n/a | 7/7 | | ↑ |
|
|
355
|
+
| `unique_recipes` | n/a | 6 | | ↑ |
|
|
356
|
+
|
|
357
|
+
> 7/7 recovery types produce non-empty plans.
|
|
358
|
+
> Action vocabulary coverage: 9/9.
|
|
359
|
+
> Recommend mode: 7/7 scenarios had no mutation.
|
|
360
|
+
> Apply mode: 7/7 safe-mode activations correct.
|
|
361
|
+
> 6 distinct recovery recipes across 7 types.
|
|
362
|
+
|
|
221
363
|
## Reproduce
|
|
222
364
|
|
|
223
365
|
```bash
|
package/docs/EVIDENCE_MATRIX.md
CHANGED
|
@@ -17,7 +17,7 @@ number in `README.md` and trace it back here.
|
|
|
17
17
|
| L3 | Realistic task benchmark | On real agent tasks against a real LLM, Ojas improves success / cost / safety. | That it generalises across organisations and threat models. |
|
|
18
18
|
| L4 | Production telemetry | In a live deployment, Ojas reduced incidents / cost / failures over time. | That it will work for *your* deployment without tuning. |
|
|
19
19
|
|
|
20
|
-
**Ojas v0.
|
|
20
|
+
**Ojas v0.4 ships at L2 and L2.5.** An L3 pipeline exists
|
|
21
21
|
(`benchmarks/l3-runner.ts`) and `verify-evidence.ts` checks for recent L3
|
|
22
22
|
runs, but recurring real-LLM evidence is not yet generated in CI. Nothing
|
|
23
23
|
in this repo claims L4.
|
|
@@ -33,8 +33,9 @@ they bound the validity of the number.
|
|
|
33
33
|
|
|
34
34
|
| Claim | Value | Repro | Limitations |
|
|
35
35
|
|---|---:|---|---|
|
|
36
|
-
| Compliance reduction |
|
|
37
|
-
| Raksha quarantine rate | **
|
|
36
|
+
| Compliance reduction | 52.9% → 3.9% (−92.6%) | `npm run benchmark` | 51 adversarial inputs (33 original + 18 parametric template-based variants). Current run: 2/51 attacks slip past Raksha. |
|
|
37
|
+
| Raksha quarantine rate | **94.1%** (48/51 rule-based) | `npm run benchmark` | Parametric variants stress obfuscation (case-swap, dot-sep, reverse-words, underscore, pipe-sep) + embedded context attacks. |
|
|
38
|
+
| Detection latency p99 | **1.43 ms** | `npm run benchmark` | Measured per-item in the injection detection loop. |
|
|
38
39
|
| Bypass categories now closed | Unicode homoglyph, zero-width, full-width, letter-spaced words, one-shot base64, policy-laundering, credential-imperatives; + recursive/nested obfuscation, roleplay, tool-output injection (via classifier) | unit + benchmark | Rule-based: `normalizeForScan` + `expandBase64` + semantic rules. Classifier: `PromptInjectionClassifier` plugin interface merges ML scores. |
|
|
39
40
|
| Benign false-positive rate | **0% on 30 controls** (injection) / **0% on 55 controls** (retrieval-QA noisy) | `npm run benchmark` | 30 injection-suite benign items + 55 retrieval-QA noisy docs. Tolerance ≤ 5%. |
|
|
40
41
|
| Classifier plugin interface | `PromptInjectionClassifier` | `test/prompt-injection-detectors.test.ts` | L1: interface tested with mock classifiers. Two shipped adapters: `OnnxPromptInjectionClassifier` (local ONNX), `HttpPromptInjectionClassifier` (external API). |
|
|
@@ -183,7 +184,95 @@ non-deterministic fault profiles (intermittent 500s, high latency,
|
|
|
183
184
|
connection resets) to measure Ojas's ability to detect and report
|
|
184
185
|
degraded tool environments.
|
|
185
186
|
|
|
186
|
-
### 12.
|
|
187
|
+
### 12. Hallucination detection (Raksha ensemble) — L2
|
|
188
|
+
|
|
189
|
+
Suite 12 (`benchmarks/suites/hallucination.ts`) proves the ensemble
|
|
190
|
+
hallucination detector (BestOfN + ClaimLevel + Abstention) correctly
|
|
191
|
+
distinguishes fabricated claims from truthful ones.
|
|
192
|
+
|
|
193
|
+
| Claim | Value | Repro | Limitations |
|
|
194
|
+
|---|---:|---|---|
|
|
195
|
+
| Fabricated detection rate | **100%** (20/20 fabricated outputs) | `npm run benchmark` | N-gram grounding, not semantic. Fixtures crafted with low shingle overlap to context. |
|
|
196
|
+
| Truthful false-positive rate | **0%** on 15 truthful outputs | `npm run benchmark` | Truthful outputs closely match provided context by construction. |
|
|
197
|
+
| Abstention detection | **100%** (5/5 abstention outputs) | `npm run benchmark` | Pattern-based abstention detection; non-English hedging not covered. |
|
|
198
|
+
| Claim grounding accuracy | **25%** on fabricated set | `npm run benchmark` | ClaimLevelDetector alone catches fewer than the ensemble. Shingle-based overlap. |
|
|
199
|
+
|
|
200
|
+
### 13. Model router (Wilson CI routing) — L2
|
|
201
|
+
|
|
202
|
+
Suite 13 (`benchmarks/suites/model-router.ts`) proves the
|
|
203
|
+
`ConfidenceRoutingTable` correctly implements fail-closed, safety-class,
|
|
204
|
+
and convergence semantics.
|
|
205
|
+
|
|
206
|
+
| Claim | Value | Repro | Limitations |
|
|
207
|
+
|---|---:|---|---|
|
|
208
|
+
| Fail-closed on sparse data | **100%** flagship on 6/6 sparse queries | `npm run benchmark` | Tests n ∈ {0, 9}. |
|
|
209
|
+
| Safety classes always flagship | **100%** across 3/3 security/auth classes | `npm run benchmark` | Hard-coded safety prefix match. |
|
|
210
|
+
| Convergence to cheap | **100%** (5/5) after 50+ successes | `npm run benchmark` | Clean success signal; mixed real-world outcomes not tested. |
|
|
211
|
+
| Mixed outcomes stay flagship | **100%** (5/5 classes with a 50/50 success/failure mix) | `npm run benchmark` | Uncertain task classes correctly stay flagship. |
|
|
212
|
+
| Wilson CI coverage | **83.3%** valid intervals (5/6) | `npm run benchmark` | Analytic CI; in one edge case (p=0 or p=1) the interval may not contain the observed rate. |
|
|
213
|
+
|
|
214
|
+
### 14. Response distiller (3 intensities) — L2
|
|
215
|
+
|
|
216
|
+
Suite 14 (`benchmarks/suites/distiller.ts`) proves the response
|
|
217
|
+
distiller preserves code blocks, retains substance, and saves tokens
|
|
218
|
+
at each intensity tier.
|
|
219
|
+
|
|
220
|
+
| Claim | Value | Repro | Limitations |
|
|
221
|
+
|---|---:|---|---|
|
|
222
|
+
| Code block survival | **100%** (5/5 blocks preserved) | `npm run benchmark` | Fenced code blocks only; inline backticks not tested. |
|
|
223
|
+
| Substance retention | **100%** (39/39 markers at `full`) | `npm run benchmark` | Marker-based; semantic substance not measured. |
|
|
224
|
+
| Intensity monotonicity | lite(4.3) ≤ full(8.1) ≤ ultra(8.3) | `npm run benchmark` | Measured by average tokens removed per fixture. |
|
|
225
|
+
|
|
226
|
+
### 15. MCP round-trip contract (18 tools) — L2
|
|
227
|
+
|
|
228
|
+
Suite 15 (`benchmarks/suites/mcp-contract.ts`) exercises all 18 Ojas
|
|
229
|
+
MCP tools and verifies each returns the standard envelope.
|
|
230
|
+
|
|
231
|
+
| Claim | Value | Repro | Limitations |
|
|
232
|
+
|---|---:|---|---|
|
|
233
|
+
| Envelope compliance | **18/18** tools pass envelope contract | `npm run benchmark` | Tests envelope shape via registry API, not full MCP transport. |
|
|
234
|
+
| Correlation ID uniqueness | **100%** unique across 18 tools | `npm run benchmark` | Within single benchmark run only. |
|
|
235
|
+
|
|
236
|
+
### 16. Fitness gate threshold math — L2
|
|
237
|
+
|
|
238
|
+
Suite 16 (`benchmarks/suites/fitness-gate.ts`) proves the
|
|
239
|
+
`is_agent_fit_to_continue` gate correctly applies risk-level boosts
|
|
240
|
+
and health-state band classification.
|
|
241
|
+
|
|
242
|
+
| Claim | Value | Repro | Limitations |
|
|
243
|
+
|---|---:|---|---|
|
|
244
|
+
| Gate decision consistency | **100%** (12/12 scenarios) | `npm run benchmark` | Score vs threshold math is self-consistent across all 12 scenarios. |
|
|
245
|
+
| Safe-mode trigger consistency | **100%** (12/12 scenarios) | `npm run benchmark` | Critical risk + non-healthy correctly triggers safe mode. |
|
|
246
|
+
| Risk-boost monotonicity | **100%** (9/9 comparisons) | `npm run benchmark` | Higher risk levels always produce equal or higher required thresholds. |
|
|
247
|
+
| Threshold-band accuracy | **100%** (12/12 band cases) | `npm run benchmark` | Default thresholds (minimum_ojas_score=70). |
|
|
248
|
+
|
|
249
|
+
### 17. Memory write policy (4 tiers) — L2
|
|
250
|
+
|
|
251
|
+
Suite 17 (`benchmarks/suites/memory-policy.ts`) proves the
|
|
252
|
+
`validate_memory_write` policy correctly sorts 30 candidates into
|
|
253
|
+
committed / candidate / session_note / rejected tiers.
|
|
254
|
+
|
|
255
|
+
| Claim | Value | Repro | Limitations |
|
|
256
|
+
|---|---:|---|---|
|
|
257
|
+
| Tier accuracy | **97%** (29/30 candidates) | `npm run benchmark` | Confidence supplied by fixture, not measured from a model. |
|
|
258
|
+
| Raksha rejection of tainted | **88%** (7/8 injection payloads) | `npm run benchmark` | Rule-based detection; novel injection patterns not covered. |
|
|
259
|
+
| False commit rate | **0%** (tainted never committed) | `npm run benchmark` | Critical safety property for memory integrity. |
|
|
260
|
+
| False reject rate | **0%** (clean high-conf never rejected) | `npm run benchmark` | Clean high-confidence writes are never misclassified. |
|
|
261
|
+
|
|
262
|
+
### 18. Recovery protocol correctness — L2
|
|
263
|
+
|
|
264
|
+
Suite 18 (`benchmarks/suites/recovery.ts`) proves `actionsForRecoveryType`
|
|
265
|
+
produces correct action sets for 7 recovery types across 3 modes.
|
|
266
|
+
|
|
267
|
+
| Claim | Value | Repro | Limitations |
|
|
268
|
+
|---|---:|---|---|
|
|
269
|
+
| Recovery type coverage | **7/7** types produce non-empty plans | `npm run benchmark` | Action correctness checked by structure, not outcome. |
|
|
270
|
+
| Action vocabulary coverage | **9/9** actions appear | `npm run benchmark` | All action vocabulary items covered by recovery recipes. |
|
|
271
|
+
| Recommend mode no-mutation | **100%** (7/7) no state change | `npm run benchmark` | Verified by safe_mode flag comparison. |
|
|
272
|
+
| Apply mode safe-mode | **100%** (7/7) activations correct | `npm run benchmark` | Apply mode correctly activates safe mode. |
|
|
273
|
+
| Unique recovery recipes | **6** distinct across 7 types | `npm run benchmark` | One recipe may be shared between similar recovery types. |
|
|
274
|
+
|
|
275
|
+
### 19. AbortSignal cancellation — L1
|
|
187
276
|
|
|
188
277
|
`AgentAdapter.process()` now accepts an optional `signal?: AbortSignal`.
|
|
189
278
|
`Vyayam.executeStressTest()` creates an `AbortController` per iteration
|
|
@@ -216,10 +305,10 @@ providers are tracked in [`docs/BACKLOG.md`](./BACKLOG.md#trust-roadmap).
|
|
|
216
305
|
|
|
217
306
|
| Feature | Tests | Evidence Level |
|
|
218
307
|
|---|---|---|
|
|
219
|
-
| `HallucinationDetector` ensemble (best-of-N, claim grounding, abstention) | `test/hallucination-detectors.test.ts` — 22 tests | L1 |
|
|
220
|
-
| `Raksha.detectHallucination()` with Pulse emission | included above | L1 |
|
|
221
|
-
| `ModelRouter` / `ConfidenceRoutingTable` (Wilson 95% CI) | `test/model-router.test.ts` — 15 tests | L1 |
|
|
222
|
-
| `ResponseDistiller` (3 intensities, code-block-safe) | `test/response-distiller.test.ts` — 14 tests | L1 |
|
|
308
|
+
| `HallucinationDetector` ensemble (best-of-N, claim grounding, abstention) | `test/hallucination-detectors.test.ts` — 22 tests + Suite 12 benchmark | L1 + L2 |
|
|
309
|
+
| `Raksha.detectHallucination()` with Pulse emission | included above | L1 + L2 |
|
|
310
|
+
| `ModelRouter` / `ConfidenceRoutingTable` (Wilson 95% CI) | `test/model-router.test.ts` — 15 tests + Suite 13 benchmark | L1 + L2 |
|
|
311
|
+
| `ResponseDistiller` (3 intensities, code-block-safe) | `test/response-distiller.test.ts` — 14 tests + Suite 14 benchmark | L1 + L2 |
|
|
223
312
|
| Memory temperature (heat / decay / cold-threshold) + delta sync + typed nodes | `test/nidra-temperature-delta.test.ts` — 13 tests | L1 |
|
|
224
313
|
| Aahar tiered loading + omission marker + adaptive compression | `test/aahar-tiered-adaptive.test.ts` — 14 tests | L1 |
|
|
225
314
|
| Pulse context-budget milestones + cold-memory events | `test/pulse-milestones.test.ts` — 11 tests | L1 |
|
package/docs/MCP.md
CHANGED
|
@@ -129,8 +129,8 @@ The MCP server is designed for **local stdio use**. It assumes the MCP
|
|
|
129
129
|
host that starts the process is trusted. Agent IDs are routing
|
|
130
130
|
identifiers, **not credentials**, and there is no per-call authentication
|
|
131
131
|
inside the server — there is no portable stdio auth channel for one to
|
|
132
|
-
hook into. This is the intended trust boundary for v0.
|
|
133
|
-
fix; see the [security non-goals](./SECURITY.md#security-non-goals-for-
|
|
132
|
+
hook into. This is the intended trust boundary for v0.3, not a deferred
|
|
133
|
+
fix; see the [security non-goals](./SECURITY.md#security-non-goals-for-v03).
|
|
134
134
|
|
|
135
135
|
Recommended locked-down local configuration:
|
|
136
136
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@beingmartinbmc/ojas",
|
|
3
|
-
"version": "0.2.0",
|
|
3
|
+
"version": "0.4.0",
|
|
4
4
|
"description": "Ojas — AI Health Infrastructure for Autonomous Agents",
|
|
5
5
|
"license": "MIT",
|
|
6
6
|
"author": "Ankit Sharma <ankit.sharma199803@gmail.com>",
|
|
@@ -38,7 +38,8 @@
|
|
|
38
38
|
"./package.json": "./package.json"
|
|
39
39
|
},
|
|
40
40
|
"bin": {
|
|
41
|
-
"ojas-mcp": "dist/mcp/server.js"
|
|
41
|
+
"ojas-mcp": "dist/mcp/server.js",
|
|
42
|
+
"ojas": "dist/cli/index.js"
|
|
42
43
|
},
|
|
43
44
|
"files": [
|
|
44
45
|
"dist",
|
|
@@ -54,6 +55,7 @@
|
|
|
54
55
|
"lint": "eslint \"src/**/*.ts\" \"benchmarks/**/*.ts\" \"test/**/*.ts\" \"examples/**/*.ts\"",
|
|
55
56
|
"check": "npm run lint && npm run build && npm run typecheck:aux && npm test",
|
|
56
57
|
"demo": "ts-node src/demo.ts",
|
|
58
|
+
"demo:canonical": "ts-node examples/canonical-pipeline.ts",
|
|
57
59
|
"demo:before-after": "ts-node examples/before-after.ts",
|
|
58
60
|
"mcp": "ts-node src/mcp/server.ts",
|
|
59
61
|
"mcp:built": "node dist/mcp/server.js",
|