@aikdna/kdna-studio-core 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +202 -0
- package/README.md +168 -0
- package/package.json +27 -0
- package/schemas/studio.project.schema.json +194 -0
- package/src/cards/feynman.js +105 -0
- package/src/cards/index.js +114 -0
- package/src/cli-bridge/index.js +2 -0
- package/src/compile/index.js +511 -0
- package/src/evidence/index.js +81 -0
- package/src/governance/index.js +140 -0
- package/src/i18n/index.js +145 -0
- package/src/index.js +59 -0
- package/src/judgment-fields.js +28 -0
- package/src/packaging/index.js +88 -0
- package/src/pipeline.js +101 -0
- package/src/project/index.js +359 -0
- package/src/provenance/index.js +44 -0
- package/src/quality/contradiction.js +183 -0
- package/src/quality/index.js +161 -0
- package/src/quality/validate-cards.js +164 -0
- package/src/testlab/delta.js +193 -0
- package/src/testlab/index.js +116 -0
- package/src/versioning/index.js +155 -0
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Enhanced Quality Gates — 4-grade readiness with integrated card validation.
|
|
3
|
+
*
|
|
4
|
+
* Grades:
|
|
5
|
+
* draft_grade — Core+Patterns exist, ≥3 human-reviewed cards
|
|
6
|
+
* human_controlled — All core axioms locked with boundaries, ≥50% have Feynman
|
|
7
|
+
* tested_grade — human_controlled plus ≥5 rated evals and ≥3 locked self-checks
|
|
8
|
+
* publishable_grade — ≥10 rated evals, ≥3 locked axioms, ≥5 locked self-checks, all axioms have Feynman, README 4 questions, no blocking
|
|
9
|
+
*
|
|
10
|
+
* v0.3.2: integrates validateAllCards, Feynman enforcement at publishable grade.
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
const contradiction = require('./contradiction');
|
|
14
|
+
const { validateAllCards } = require('./validate-cards');
|
|
15
|
+
const { validateGovernance } = require('../governance');
|
|
16
|
+
const { computeI18nCoverage } = require('../i18n');
|
|
17
|
+
|
|
18
|
+
/**
 * Compute the readiness grade of a Studio project.
 *
 * Gathers blocking issues and warnings from governance validation, i18n
 * coverage, per-card validation, structural checks on locked cards and
 * contradiction detection, then promotes the grade step by step
 * (draft_grade → human_controlled → tested_grade → publishable_grade) while
 * each grade's thresholds are met.
 *
 * @param {object} project - Studio project; reads `cards`, `tests`, `name` and `release`.
 * @returns {object} The result object produced by buildResult().
 */
function computeReadiness(project) {
  const cards = project.cards || [];
  const tests = project.tests || [];
  const locked = cards.filter(c => c.locked);
  const lockedAxioms = locked.filter(c => c.type === 'axiom');
  const lockedSelfChecks = locked.filter(c => c.type === 'self_check');
  const lockedMisunderstandings = locked.filter(c => c.type === 'misunderstanding');
  const ratedTests = tests.filter(t => t.result);

  const blocking = [];
  const warnings = [];

  // ── Governance check (v0.6.1) ───────────────────────────────────
  const govResult = validateGovernance(project);

  // ── I18N check (v1.2.0) ─────────────────────────────────────────
  const isOfficial = project.name?.startsWith('@aikdna/') || project.release?.official === true;
  const i18nCoverage = computeI18nCoverage(project);
  if (isOfficial && i18nCoverage.level === 'L0') {
    blocking.push('I18N: official domains require at least L1 (KDNA_CARD.json + README in locales/zh-CN/)');
  } else if (isOfficial && i18nCoverage.level === 'L1') {
    warnings.push('I18N: L1 achieved (card + readme). Recommended: L2 overlay for publishable grade.');
  }

  // Guarded the same way as the downgrade check further down, which already
  // tolerates a missing governance result.
  for (const issue of govResult?.issues ?? []) {
    (issue.severity === 'blocking' ? blocking : warnings).push(`Governance: ${issue.message}`);
  }

  // ── Card validation integration (v0.3.2) ─────────────────────────
  for (const { card_id, issues } of validateAllCards(project)) {
    for (const issue of issues) {
      if (issue.severity === 'blocking') blocking.push(`${card_id}: ${issue.message}`);
      else warnings.push(`${card_id}: ${issue.message}`);
    }
  }

  // ── Minimum Structure ──────────────────────────────────────────
  // The early exits still carry the governance and i18n results computed
  // above, so the returned stats report the real i18n level instead of
  // falling back to 'L0'.
  const earlyDetail = { governance: govResult, i18n: i18nCoverage };
  if (cards.length === 0) {
    blocking.push('Project has no cards');
    return buildResult('draft_grade', blocking, warnings, project, earlyDetail);
  }
  if (locked.length === 0) {
    blocking.push('No locked cards — nothing to compile');
    return buildResult('draft_grade', blocking, warnings, project, earlyDetail);
  }

  // ── Axiom Checks ──────────────────────────────────────────────
  for (const ax of lockedAxioms) {
    if (!ax.fields?.one_sentence || ax.fields.one_sentence.length < 10) blocking.push(`${ax.id}: one_sentence too short`);
    if (!ax.fields?.applies_when?.length) blocking.push(`${ax.id}: missing applies_when`);
    if (!ax.fields?.does_not_apply_when?.length) blocking.push(`${ax.id}: missing does_not_apply_when`);
    if (!ax.fields?.failure_risk) blocking.push(`${ax.id}: missing failure_risk`);
    if (!ax.human_lock) blocking.push(`${ax.id}: not locked`);
    if (!ax.feynman_restatement) warnings.push(`${ax.id}: missing Feynman restatement`);
  }

  // ── Misunderstanding Checks ────────────────────────────────────
  for (const ms of lockedMisunderstandings) {
    if (!ms.fields?.key_distinction || ms.fields.key_distinction.length < 20) blocking.push(`${ms.id}: key_distinction too short`);
  }

  // ── Self-check Checks ──────────────────────────────────────────
  for (const sc of lockedSelfChecks) {
    const q = sc.fields?.question || '';
    if (!q.endsWith('?')) blocking.push(`${sc.id}: self_check must end with ?`);
  }

  // ── Contradiction Check ────────────────────────────────────────
  for (const c of contradiction.detectContradictions(cards)) {
    (c.severity === 'blocking' ? blocking : warnings).push(c.message);
  }

  // ── Determine Grade ────────────────────────────────────────────
  const axiomsComplete = lockedAxioms.length >= 1 &&
    lockedAxioms.every(ax => ax.fields?.applies_when?.length && ax.fields?.does_not_apply_when?.length && ax.fields?.failure_risk && ax.human_lock);
  const feynmanRatio = lockedAxioms.length > 0
    ? lockedAxioms.filter(ax => ax.feynman_restatement).length / lockedAxioms.length
    : 0;
  // Every entry of lockedMisunderstandings is locked by construction, so only
  // the restatement itself needs checking.
  const allFeynman = lockedAxioms.every(ax => ax.feynman_restatement) &&
    lockedMisunderstandings.every(ms => ms.feynman_restatement);

  // Feynman quality threshold (v0.6.2): axioms need a total score ≥ 4,
  // misunderstandings ≥ 3. `every` on an empty list is true.
  const feynmanQuality = lockedAxioms.every(ax => {
    if (!ax.feynman_restatement?.score) return false;
    return ax.feynman_restatement.score.total >= 4;
  });
  const misunderstandingFeynmanQuality = lockedMisunderstandings.every(ms => {
    if (!ms.feynman_restatement?.score) return false;
    return ms.feynman_restatement.score.total >= 3;
  });
  if (allFeynman && !feynmanQuality) {
    warnings.push('Feynman: axiom restatements should score ≥4/5 for publishable grade');
  }

  // Compare test results requirements (v0.6.4)
  const withKdnaBetter = ratedTests.filter(t => t.result === 'with_kdna_better').length;
  const withoutKdnaBetter = ratedTests.filter(t => t.result === 'without_kdna_better').length;
  if (withoutKdnaBetter > 0) {
    warnings.push(`${withoutKdnaBetter} test(s) favored response WITHOUT KDNA — domain may not improve judgment`);
  }
  if (ratedTests.length >= 5 && withKdnaBetter < 3) {
    warnings.push(`Only ${withKdnaBetter} tests favor KDNA — recommend ≥3 for confidence`);
  }

  let grade = 'draft_grade';
  if (locked.length >= 3 && axiomsComplete && feynmanRatio >= 0.5) grade = 'human_controlled';
  if (grade === 'human_controlled' && ratedTests.length >= 5 && lockedSelfChecks.length >= 3) grade = 'tested_grade';
  if (grade === 'tested_grade' && ratedTests.length >= 10 && lockedAxioms.length >= 3 && lockedSelfChecks.length >= 5 && blocking.length === 0 && allFeynman && feynmanQuality && misunderstandingFeynmanQuality) {
    grade = 'publishable_grade';
  }

  // Downgrade if governance issues exist
  if (grade === 'publishable_grade' && govResult && !govResult.valid) {
    grade = 'tested_grade';
    warnings.push('Governance checks not passed — publishable downgraded to tested');
  }

  return buildResult(grade, blocking, warnings, project, { feynmanRatio, allFeynman, governance: govResult, i18n: i18nCoverage });
}
|
|
129
|
+
|
|
130
|
+
/**
 * Assemble the readiness result object returned to callers.
 *
 * @param {string} grade - One of the four readiness grades.
 * @param {string[]} blocking - Blocking issue messages.
 * @param {string[]} warnings - Warning messages.
 * @param {object} project - Project whose `cards` and `tests` feed the stats.
 * @param {object} [detail] - Optional extras: `feynmanRatio`, `governance`, `i18n`.
 * @returns {object} Grade, publishable flag, issues, score, stats and next step hint.
 */
function buildResult(grade, blocking, warnings, project, detail = {}) {
  const allCards = project.cards || [];
  const allTests = project.tests || [];
  const lockedCards = allCards.filter(card => card.locked);
  const countLockedOfType = kind => lockedCards.filter(card => card.type === kind).length;

  // Each blocking issue costs 15 points and each warning 3, floored at zero.
  const penalty = 15 * blocking.length + 3 * warnings.length;

  let feynmanRatioLabel = 'N/A';
  if (detail.feynmanRatio !== undefined) {
    feynmanRatioLabel = `${Math.round(detail.feynmanRatio * 100)}%`;
  }

  let nextStep;
  switch (grade) {
    case 'draft_grade':
      nextStep = 'Lock at least 3 axioms with boundaries and 50% Feynman.';
      break;
    case 'human_controlled':
      nextStep = 'Add 5+ rated evals and 3+ self-checks.';
      break;
    case 'tested_grade':
      nextStep = 'Add 10+ evals, complete Feynman on all axioms/misunderstandings, resolve all blocking issues.';
      break;
    default:
      nextStep = 'Ready for Studio compile/export. Publish the resulting .kdna with kdna publish <file.kdna>.';
  }

  return {
    grade,
    publishable: blocking.length === 0 && grade === 'publishable_grade',
    blocking,
    warnings,
    score: Math.max(0, 100 - penalty),
    governance: detail.governance || null,
    i18n: detail.i18n || null,
    stats: {
      total_cards: allCards.length,
      locked_cards: lockedCards.length,
      locked_axioms: countLockedOfType('axiom'),
      locked_self_checks: countLockedOfType('self_check'),
      total_tests: allTests.length,
      rated_tests: allTests.filter(test => test.result).length,
      feynman_ratio: feynmanRatioLabel,
      i18n_level: detail.i18n?.level || 'L0',
    },
    next_step: nextStep,
  };
}
|
|
158
|
+
|
|
159
|
+
/**
 * Convenience accessor: run the readiness computation and return only the
 * blocking issue messages.
 *
 * @param {object} project - Studio project passed through to computeReadiness().
 * @returns {string[]} Blocking issue messages.
 */
function getBlockingIssues(project) {
  const { blocking } = computeReadiness(project);
  return blocking;
}
|
|
160
|
+
|
|
161
|
+
module.exports = { computeReadiness, getBlockingIssues };
|
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Card Validator — Anti-vagueness, anti-SOP, anti-slogan, anti-straw-man checks.
|
|
3
|
+
*
|
|
4
|
+
* Ensures every card meets minimum quality before it can be locked.
|
|
5
|
+
* These checks mirror the kdna-cli publish --check rules.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
// Lower-case phrases that the validators below search for (by substring) in
// card text, grouped by card type and by the kind of weakness they signal.
const ANTI_PATTERNS = {
  axiom: {
    // Phrases typical of slogans rather than testable principles.
    slogans: [
      'is key',
      'is important',
      'matters',
      'is critical',
      'is essential',
      'should be',
      'must be',
    ],
    // Phrases typical of step-by-step procedures.
    sops: [
      'first, you should',
      'follow these steps',
      'always remember to',
      'the process is',
    ],
  },
  misunderstanding: {
    // Hedged attributions that suggest a straw-man belief.
    straw_men: [
      'some people say',
      'many believe',
      'it is commonly thought',
    ],
  },
  self_check: {
    // Questions that are not specific to any domain.
    generics: [
      'is this good',
      'is this correct',
      'is this helpful',
      'is this clear',
      'does this work',
      'is it right',
    ],
  },
};
|
|
20
|
+
|
|
21
|
+
/**
 * Validate a single card with the validator matching its `type`.
 * Cards of any other type produce no issues.
 *
 * @param {object} card - Card with a `type` and (optionally) `fields`.
 * @returns {object[]} Issues found; each has at least `type`, `severity`, `message`.
 */
function validateCard(card) {
  const validatorsByType = new Map([
    ['axiom', validateAxiom],
    ['misunderstanding', validateMisunderstanding],
    ['self_check', validateSelfCheck],
    ['ontology', validateOntology],
    ['boundary', validateBoundary],
  ]);

  const found = [];
  const validator = validatorsByType.get(card.type);
  if (validator !== undefined) {
    validator(card, found);
  }
  return found;
}
|
|
44
|
+
|
|
45
|
+
/**
 * Check an axiom card for slogan, procedure, vagueness and definition-style
 * wording. Findings are appended to `issues`.
 *
 * @param {object} card - Axiom card; reads `fields.one_sentence` and `fields.full_statement`.
 * @param {object[]} issues - Array that receives the findings (mutated).
 */
function validateAxiom(card, issues) {
  const summary = (card.fields?.one_sentence || '').toLowerCase();
  const statement = (card.fields?.full_statement || '').toLowerCase();
  const { slogans, sops } = ANTI_PATTERNS.axiom;

  // Anti-slogan: only short one-liners are suspected; at most one finding.
  const looksLikeSlogan = summary.length < 40 && slogans.some(phrase => summary.includes(phrase));
  if (looksLikeSlogan) {
    issues.push({
      type: 'slogan',
      severity: 'warning',
      message: `${card.id}: one_sentence may be a slogan — "${summary.slice(0, 60)}"`,
      fix: 'Axioms must be specific, testable judgment principles. Replace vague slogans with concrete decision rules.',
    });
  }

  // Anti-SOP: procedure phrasing in either the one-liner or the full statement.
  const readsLikeProcedure = sops.some(phrase => summary.includes(phrase) || statement.includes(phrase));
  if (readsLikeProcedure) {
    issues.push({
      type: 'sop',
      severity: 'warning',
      message: `${card.id}: axiom reads like a procedure, not a judgment principle`,
      fix: 'Axioms encode how to judge, not what steps to follow. Rephrase as a decision principle.',
    });
  }

  // Anti-vagueness: a very short one-liner is blocking.
  if (summary.length < 15) {
    issues.push({
      type: 'too_short',
      severity: 'blocking',
      message: `${card.id}: one_sentence too short (${summary.length} chars)`,
      fix: 'Make it a complete, specific judgment statement.',
    });
  }

  // Dictionary-definition style: a short sentence opening with "<word> is ".
  const opensLikeDefinition = /^\w+\s+is\s/.test(summary);
  if (opensLikeDefinition && summary.length < 50) {
    issues.push({
      type: 'definition_like',
      severity: 'warning',
      message: `${card.id}: one_sentence reads like a definition, not a judgment — rephrase as a principle`,
    });
  }
}
|
|
85
|
+
|
|
86
|
+
/**
 * Check a misunderstanding card for straw-man wording and a usable
 * key distinction. Findings are appended to `issues`.
 *
 * @param {object} card - Misunderstanding card; reads `fields.wrong` and `fields.key_distinction`.
 * @param {object[]} issues - Array that receives the findings (mutated).
 */
function validateMisunderstanding(card, issues) {
  // `fields.correct` is not inspected by any check, so it is no longer read
  // (the former unused lower-casing could throw on a non-string value).
  const wrong = (card.fields?.wrong || '').toLowerCase();
  const distinction = card.fields?.key_distinction || '';

  // Anti-straw-man: the wrong belief should be something real people believe
  if (wrong.length < 15) {
    issues.push({
      type: 'vague_wrong',
      severity: 'warning',
      message: `${card.id}: wrong belief too short — may describe a straw man no one believes`,
    });
  }

  // At most one straw-man finding per card.
  if (ANTI_PATTERNS.misunderstanding.straw_men.some(phrase => wrong.includes(phrase))) {
    issues.push({
      type: 'straw_man',
      severity: 'warning',
      message: `${card.id}: wrong belief uses straw-man phrasing — describe what people actually get wrong`,
    });
  }

  if (!distinction || distinction.length < 20) {
    issues.push({
      type: 'missing_distinction',
      severity: 'blocking',
      message: `${card.id}: key_distinction missing or too short (${distinction.length} chars)`,
    });
  }
}
|
|
106
|
+
|
|
107
|
+
/**
 * Check a self-check card: it must be phrased as a question and should be
 * specific to the domain. Findings are appended to `issues`.
 *
 * @param {object} card - Self-check card; reads `fields.question`.
 * @param {object[]} issues - Array that receives the findings (mutated).
 */
function validateSelfCheck(card, issues) {
  const question = card.fields?.question || '';

  // Accept an ASCII question mark, a full-width question mark (U+FF1F,
  // written as an escape so it survives Unicode normalisation of this file),
  // or one of the listed CJK characters at the end of the text. The previous
  // code tested the ASCII mark twice.
  const isQuestion =
    question.endsWith('?') ||
    question.endsWith('\uFF1F') ||
    /[吗是否]$/.test(question);
  if (!isQuestion) {
    issues.push({ type: 'not_question', severity: 'blocking', message: `${card.id}: must be a yes/no answerable question` });
  }

  if (question.length < 15) {
    issues.push({ type: 'vague', severity: 'warning', message: `${card.id}: question too short — make it domain-specific` });
  }

  // At most one "generic" finding per card.
  const lowered = question.toLowerCase();
  if (ANTI_PATTERNS.self_check.generics.some(phrase => lowered.includes(phrase))) {
    issues.push({ type: 'generic', severity: 'warning', message: `${card.id}: question is generic — should reference domain-specific criteria` });
  }
}
|
|
126
|
+
|
|
127
|
+
/**
 * Check an ontology card for minimum-length essence, boundary and trigger
 * signal. All findings are warnings and are appended to `issues`.
 *
 * @param {object} card - Ontology card; reads `fields.essence`, `fields.boundary`, `fields.trigger_signal`.
 * @param {object[]} issues - Array that receives the findings (mutated).
 */
function validateOntology(card, issues) {
  // [field name, minimum length, issue type, message tail]
  const lengthRules = [
    ['essence', 15, 'vague_essence', 'essence too short — explain operational meaning'],
    ['boundary', 10, 'missing_boundary', 'boundary missing — what is this concept NOT?'],
    ['trigger_signal', 10, 'missing_trigger', 'trigger_signal missing — how does the agent detect this concept?'],
  ];

  const texts = lengthRules.map(([field]) => card.fields?.[field] || '');

  lengthRules.forEach(([, minLength, type, tail], index) => {
    if (texts[index].length < minLength) {
      issues.push({ type, severity: 'warning', message: `${card.id}: ${tail}` });
    }
  });
}
|
|
142
|
+
|
|
143
|
+
/**
 * Check a boundary card: a short `scope` is a warning, a short or missing
 * `out_of_scope` is blocking. Findings are appended to `issues`.
 *
 * @param {object} card - Boundary card; reads `fields.scope` and `fields.out_of_scope`.
 * @param {object[]} issues - Array that receives the findings (mutated).
 */
function validateBoundary(card, issues) {
  const MIN_LENGTH = 10;
  const inScope = card.fields?.scope || '';
  const excluded = card.fields?.out_of_scope || '';

  const findings = [];
  if (!(inScope.length >= MIN_LENGTH)) {
    findings.push({ type: 'vague_scope', severity: 'warning', message: `${card.id}: scope too short` });
  }
  if (!(excluded.length >= MIN_LENGTH)) {
    findings.push({ type: 'vague_out_of_scope', severity: 'blocking', message: `${card.id}: out_of_scope missing or too short` });
  }
  issues.push(...findings);
}
|
|
154
|
+
|
|
155
|
+
/**
 * Validate every card of a project.
 *
 * @param {object} project - Project whose `cards` array is validated (missing → no cards).
 * @returns {Array<{card_id: *, issues: object[]}>} One entry per card, in card order,
 *   including cards without issues.
 */
function validateAllCards(project) {
  const cards = project.cards || [];
  return cards.map(card => ({
    card_id: card.id,
    issues: validateCard(card),
  }));
}
|
|
163
|
+
|
|
164
|
+
module.exports = { validateCard, validateAllCards, ANTI_PATTERNS };
|
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Judgment Delta — Structured comparison of agent response with vs without KDNA.
|
|
3
|
+
*
|
|
4
|
+
* Parses kdna compare output (text or JSON) into structured axes:
|
|
5
|
+
* 1. CLASSIFICATION — how the task was classified
|
|
6
|
+
* 2. DIAGNOSIS — root cause identified
|
|
7
|
+
* 3. ACTIONS — what the response suggests
|
|
8
|
+
* 4. BOUNDARY — scope awareness
|
|
9
|
+
* 5. TERMINOLOGY — domain-specific terms used
|
|
10
|
+
*
|
|
11
|
+
* Also supports scoring along the D1-D7 dimensions defined in the
|
|
12
|
+
* KDNA Compare Report specification.
|
|
13
|
+
*/
|
|
14
|
+
|
|
15
|
+
/**
 * Parse the text output of `kdna compare` into changed axes and a verdict.
 *
 * Two line formats are recognised:
 *   - numbered: `<digit>. <AXIS NAME>: <value>`
 *   - legacy:   `<axis>: <value>`
 * Axes whose value is "SAME" (any case) are omitted. The legacy format is
 * only tried when the text contains no numbered axis lines at all, so that
 * header lines such as `Domain: x` in a numbered report whose axes are all
 * SAME are not mistaken for changed axes.
 *
 * @param {string} diffText - Raw compare output; null/undefined is treated as empty.
 * @returns {{axes: Object<string, string>, verdict: string}} Changed axes keyed by
 *   lower-cased (snake_case) axis name, and the lower-cased verdict
 *   ('trajectory_unchanged' when no VERDICT line is present).
 */
function parseCompareOutput(diffText) {
  const text = typeof diffText === 'string' ? diffText : String(diffText ?? '');
  const axes = {};

  let sawNumberedAxis = false;
  for (const m of text.matchAll(/^(\d)\.\s*(\w+(?:\s+\w+)*):\s*(.+)$/gim)) {
    sawNumberedAxis = true;
    const name = m[2].toLowerCase().replace(/\s+/g, '_');
    const value = m[3].trim();
    if (value.toUpperCase() !== 'SAME') {
      axes[name] = value;
    }
  }

  // Legacy format: "<axis>: <value>"
  if (!sawNumberedAxis) {
    for (const m of text.matchAll(/^(\w+):\s*(.+)$/gim)) {
      const name = m[1].toLowerCase();
      const value = m[2].trim();
      if (name === 'verdict') continue; // reported separately below
      if (value.toUpperCase() !== 'SAME') {
        axes[name] = value;
      }
    }
  }

  const verdictMatch = text.match(/VERDICT:\s*(.+)/i);
  const verdict = verdictMatch ? verdictMatch[1].trim().toLowerCase() : 'trajectory_unchanged';

  return { axes, verdict };
}
|
|
44
|
+
|
|
45
|
+
/**
 * Score a set of changed axes: start at 5, add one point per changed axis,
 * cap at 10.
 *
 * @param {Object<string, string>} axes - Changed axes keyed by name.
 * @returns {{score: number, changed: Array<{axis: string, value: string}>}} The score and
 *   the changed axes with values truncated to 100 characters.
 */
function scoreDelta(axes) {
  const BASE_SCORE = 5;
  const MAX_SCORE = 10;

  const changed = Object.entries(axes).map(([axis, text]) => ({
    axis,
    value: text.slice(0, 100),
  }));

  return {
    score: Math.min(MAX_SCORE, BASE_SCORE + changed.length),
    changed,
  };
}
|
|
54
|
+
|
|
55
|
+
/**
 * Build a structured judgment delta from the text output of `kdna compare`.
 *
 * @param {string} domain - Domain name recorded in the delta.
 * @param {string} input - Prompt that was compared; truncated to 200 characters
 *   (null/undefined is treated as empty, matching createJudgmentDeltaFromReport).
 * @param {*} responseA - Not used by this function.
 * @param {*} responseB - Not used by this function.
 * @param {string} diffText - Raw compare output, parsed with parseCompareOutput().
 * @param {object} [options] - Optional `model`, `triggeredAxioms`,
 *   `avoidedMisunderstandings` and `selfChecksPassed`.
 * @returns {object} Delta with meta, classification, axes, verdict, score,
 *   changed dimensions, scoring and summary.
 */
function createJudgmentDelta(domain, input, responseA, responseB, diffText, options = {}) {
  const { axes, verdict } = parseCompareOutput(diffText);
  const domainScore = scoreDelta(axes);
  const triggeredAxioms = options.triggeredAxioms || [];
  const avoidedMisunderstandings = options.avoidedMisunderstandings || [];
  // `??` rather than `||` so that a reported value of 0 is kept instead of
  // being turned into "not available".
  const selfChecksPassed = options.selfChecksPassed ?? null;

  return {
    meta: {
      domain,
      input: (input || '').slice(0, 200),
      model: options.model || 'unknown',
      timestamp: new Date().toISOString(),
    },
    classification: {
      without_kdna: axes.classification || 'generic',
      with_kdna: axes.classification ? 'domain_specific' : 'unchanged',
      changed: !!axes.classification,
    },
    axes,
    verdict,
    score: domainScore.score,
    changed_dimensions: domainScore.changed,
    triggered_axioms: triggeredAxioms,
    avoided_misunderstandings: avoidedMisunderstandings,
    self_checks_passed: selfChecksPassed,
    scoring: buildScoring(axes, domainScore, selfChecksPassed),
    summary: buildSummary(domain, domainScore, verdict),
  };
}
|
|
85
|
+
|
|
86
|
+
/**
 * Produce the D1–D7 scoring table for a delta.
 *
 * D1 and D2 are 8 when the corresponding axis changed and 5 otherwise; D3 is
 * fixed at 5; D4 mirrors the delta score; D5 is the self-check pass rate as
 * a percentage string; D6 is 'Pass' when a boundary axis changed; D7 is
 * always 'N/A'.
 *
 * @param {Object<string, string>} axes - Changed axes keyed by name.
 * @param {{score: number}} domainScore - Result of scoreDelta().
 * @param {number|null|undefined} selfChecksPassed - Pass rate, or null/undefined when unknown.
 * @returns {object} Scoring table keyed by dimension id.
 */
function buildScoring(axes, domainScore, selfChecksPassed) {
  // `!= null` covers both null and undefined so a missing value is reported
  // as 'N/A' instead of the literal string "undefined%".
  const hasPassRate = selfChecksPassed != null;
  const boundaryChanged = Boolean(axes.boundary_awareness || axes.boundary);

  return {
    D1_diagnostic_depth: axes.diagnosis ? 8 : 5,
    D2_terminology_precision: axes.terminology ? 8 : 5,
    D3_misunderstanding_detection: 5,
    D4_axiom_alignment: domainScore.score,
    D5_self_check_pass_rate: hasPassRate ? `${selfChecksPassed}%` : 'N/A',
    D6_boundary_respect: boundaryChanged ? 'Pass' : 'N/A',
    D7_risk_avoidance: 'N/A',
  };
}
|
|
99
|
+
|
|
100
|
+
/**
 * Write the one-paragraph summary of a delta.
 *
 * @param {string} domain - Domain name shown in the summary.
 * @param {{changed: Array<{axis: string}>}} domainScore - Result of scoreDelta().
 * @param {string} verdict - Lower-cased verdict, e.g. 'trajectory_changed'.
 * @returns {string} Markdown sentence(s) describing the effect of loading the domain.
 */
function buildSummary(domain, domainScore, verdict) {
  const changed = domainScore.changed.map(c => `**${c.axis}**`).join(', ');
  if (changed.length === 0) {
    return `Loading \`${domain}\` did not significantly alter the judgment trajectory for this input.`;
  }

  // 'trajectory_unchanged' contains the substring 'changed', so a plain
  // includes('changed') would claim a trajectory shift for it as well.
  const trajectoryChanged = verdict.includes('changed') && !verdict.includes('unchanged');
  if (trajectoryChanged) {
    return `Loading \`${domain}\` changed the agent's response across ${domainScore.changed.length} dimensions: ${changed}. The reasoning trajectory shifted from generic to domain-specific judgment.`;
  }
  return `Loading \`${domain}\` produced changes in ${domainScore.changed.length} dimensions: ${changed}.`;
}
|
|
110
|
+
|
|
111
|
+
/**
 * Compare two judgment deltas axis by axis.
 *
 * The five standard axes are compared; an axis absent from a delta counts as
 * 'SAME'. The boundary axis is read from `boundary_awareness` or, failing
 * that, `boundary` — the same pair of keys buildScoring() accepts — and is
 * always reported under the name 'boundary_awareness'.
 *
 * @param {{axes: object, score: number, verdict: string}} delta1 - Earlier delta.
 * @param {{axes: object, score: number, verdict: string}} delta2 - Later delta.
 * @returns {{score_change: number, verdict_before: string, verdict_after: string,
 *   axis_diffs: Array<{axis: string, before: string, after: string}>, improved: boolean}}
 */
function compareDeltas(delta1, delta2) {
  const readAxis = (delta, axis) => {
    const axes = delta.axes || {};
    if (axis === 'boundary_awareness') {
      return axes.boundary_awareness || axes.boundary || 'SAME';
    }
    return axes[axis] || 'SAME';
  };

  const diffs = [];
  for (const axis of ['classification', 'diagnosis', 'actions', 'boundary_awareness', 'terminology']) {
    const before = readAxis(delta1, axis);
    const after = readAxis(delta2, axis);
    if (before !== after) {
      diffs.push({ axis, before, after });
    }
  }

  return {
    score_change: delta2.score - delta1.score,
    verdict_before: delta1.verdict,
    verdict_after: delta2.verdict,
    axis_diffs: diffs,
    improved: delta2.score > delta1.score,
  };
}
|
|
128
|
+
|
|
129
|
+
/**
 * Render a judgment delta as a Markdown report.
 *
 * @param {object} delta - Delta as produced by createJudgmentDelta(); reads
 *   `meta`, `changed_dimensions`, `scoring`, `verdict` and `summary`.
 * @returns {string} Markdown text with newline-separated lines.
 */
function formatDeltaMarkdown(delta) {
  const header = [
    '# KDNA Judgment Comparison Report',
    '',
    `**Domain:** ${delta.meta.domain}`,
    `**Model:** ${delta.meta.model}`,
    `**Date:** ${delta.meta.timestamp}`,
    '',
  ];

  const diffRows = [];
  for (const dimension of delta.changed_dimensions) {
    diffRows.push(`| ${dimension.axis} | **Changed**: ${dimension.value} |`);
  }
  if (!delta.changed_dimensions.length) {
    diffRows.push('| (none) | No significant change |');
  }

  const scoringItems = Object.entries(delta.scoring).map(
    ([dimension, score]) => `- **${dimension}:** ${score}`,
  );

  const humanVerdict = delta.verdict.replace(/_/g, ' ');

  return [
    ...header,
    '## Judgment Diff',
    '',
    '| Dimension | Change |',
    '|-----------|--------|',
    ...diffRows,
    '',
    '## Scoring',
    '',
    ...scoringItems,
    '',
    `**Verdict:** ${humanVerdict}`,
    '',
    delta.summary,
  ].join('\n');
}
|
|
147
|
+
|
|
148
|
+
// ─── JSON report parsing (v0.3.3) ─────────────────────────────────────
|
|
149
|
+
|
|
150
|
+
/**
 * Extract changed axes and a verdict from a JSON compare report.
 *
 * Structured reports carry `diff.axes` (and optionally `diff.verdict`).
 * Legacy reports carry `without_kdna` / `with_kdna` objects, of which only
 * the `classification` fields are compared. A report without `diff` yields
 * no axes.
 *
 * @param {object|null|undefined} report - Parsed JSON report.
 * @returns {{axes: Object<string, string>, verdict: string}}
 */
function parseCompareReportJson(report) {
  const UNCHANGED = 'trajectory_unchanged';
  const diff = report ? report.diff : undefined;
  if (!diff) {
    return { axes: {}, verdict: UNCHANGED };
  }

  const axes = {};

  // Structured report format: keep every truthy axis value that is not "SAME".
  if (diff.axes) {
    Object.entries(diff.axes).forEach(([axis, raw]) => {
      if (!raw) return;
      const text = String(raw);
      if (text.toUpperCase() === 'SAME') return;
      axes[axis] = String(raw);
    });
    return { axes, verdict: diff.verdict || UNCHANGED };
  }

  // Legacy: raw baseline/kdna comparison
  if (report.without_kdna && report.with_kdna) {
    const classificationDiffers = report.without_kdna.classification !== report.with_kdna.classification;
    if (classificationDiffers) {
      axes.classification = 'changed';
    }
    const anyChange = Object.keys(axes).length > 0;
    return { axes, verdict: anyChange ? 'trajectory_changed' : UNCHANGED };
  }

  return { axes: {}, verdict: UNCHANGED };
}
|
|
171
|
+
|
|
172
|
+
/**
 * Build a structured judgment delta from a JSON compare report.
 *
 * @param {string} domain - Domain name recorded in the delta.
 * @param {string} input - Prompt that was compared; truncated to 200 characters.
 * @param {object|null|undefined} report - Parsed JSON report, read via parseCompareReportJson().
 * @param {object} [options] - Optional `model`, `triggeredAxioms`,
 *   `avoidedMisunderstandings` and `selfChecksPassed`.
 * @returns {object} Delta with the same shape as createJudgmentDelta() returns.
 */
function createJudgmentDeltaFromReport(domain, input, report, options = {}) {
  const { axes, verdict } = parseCompareReportJson(report);
  const domainScore = scoreDelta(axes);
  // Normalise once and use the same value for the field and the scoring
  // table: passing the raw (possibly undefined) option to buildScoring()
  // produced the text "undefined%". `??` keeps a legitimate 0.
  const selfChecksPassed = options.selfChecksPassed ?? null;

  return {
    meta: {
      domain,
      input: (input || '').slice(0, 200),
      // parseCompareReportJson() tolerates a missing report, so this lookup
      // must tolerate it as well.
      model: report?.meta?.model || options.model || 'unknown',
      timestamp: new Date().toISOString(),
    },
    classification: {
      without_kdna: axes.classification || 'generic',
      with_kdna: axes.classification ? 'domain_specific' : 'unchanged',
      changed: !!axes.classification,
    },
    axes,
    verdict,
    score: domainScore.score,
    changed_dimensions: domainScore.changed,
    triggered_axioms: options.triggeredAxioms || [],
    avoided_misunderstandings: options.avoidedMisunderstandings || [],
    self_checks_passed: selfChecksPassed,
    scoring: buildScoring(axes, domainScore, selfChecksPassed),
    summary: buildSummary(domain, domainScore, verdict),
  };
}
|
|
191
|
+
|
|
192
|
+
module.exports = { parseCompareOutput, parseCompareReportJson, scoreDelta,
|
|
193
|
+
createJudgmentDelta, createJudgmentDeltaFromReport, compareDeltas, formatDeltaMarkdown };
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Test Lab — Validate that a KDNA domain actually changes agent judgment.
|
|
3
|
+
*
|
|
4
|
+
* Core operations:
|
|
5
|
+
* - Create test cases (input → expected_without_kdna → expected_with_kdna)
|
|
6
|
+
* - Run comparison through kdna-cli compare
|
|
7
|
+
* - Record human rating
|
|
8
|
+
* - Attach tests to cards
|
|
9
|
+
* - Export evals for KDNA domain
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
/**
 * Create a new, unrated test case.
 *
 * @param {*} input - The prompt to compare with and without the domain.
 * @param {object} [options] - Optional `expectedWithout`, `expectedWith` and `domain`.
 * @returns {object} Test case with a fresh `test_<uuid>` id, empty rating
 *   fields, no linked cards and a creation timestamp.
 */
function createTestCase(input, options = {}) {
  const { randomUUID } = require('crypto');
  const { expectedWithout, expectedWith, domain } = options;

  const testCase = {
    id: `test_${randomUUID()}`,
    input,
    expected_without_kdna: expectedWithout || '',
    expected_with_kdna: expectedWith || '',
    domain: domain || null,
    // Rating fields stay null until recordHumanRating() fills them in.
    // `result` is one of 'with_kdna_better' | 'no_difference' | 'without_kdna_better'.
    result: null,
    human_rating: null,
    rated_by: null,
    rated_at: null,
    notes: '',
    linked_cards: [],
    created_at: new Date().toISOString(),
  };
  return testCase;
}
|
|
28
|
+
|
|
29
|
+
/**
 * Record a human rating on a test case (mutates and returns it).
 *
 * @param {object} testCase - Test case to update.
 * @param {string} result - 'with_kdna_better', 'no_difference' or 'without_kdna_better'.
 * @param {*} ratedBy - Identifier of the rater.
 * @param {string} [notes] - Free-form notes.
 * @returns {object} The same test case object.
 * @throws {Error} When `result` is not one of the accepted values.
 */
function recordHumanRating(testCase, result, ratedBy, notes = '') {
  const accepted = ['with_kdna_better', 'no_difference', 'without_kdna_better'];
  const isAccepted = accepted.some(candidate => candidate === result);
  if (!isAccepted) {
    throw new Error(`Invalid result: ${result}. Must be one of: ${accepted.join(', ')}`);
  }

  // The rating is stored under both `result` and `human_rating`.
  return Object.assign(testCase, {
    result,
    human_rating: result,
    rated_by: ratedBy,
    rated_at: new Date().toISOString(),
    notes,
  });
}
|
|
39
|
+
|
|
40
|
+
/**
 * Attach card ids to a test case, keeping existing links and dropping
 * duplicates (mutates and returns the test case).
 *
 * @param {object} testCase - Test case whose `linked_cards` is updated;
 *   a missing list is treated as empty.
 * @param {Array} cardIds - Card ids to add; a missing list is treated as empty.
 * @returns {object} The same test case object.
 */
function linkTestToCards(testCase, cardIds) {
  // applyTestResultsToCards() already tolerates a test case without
  // `linked_cards`; do the same here instead of throwing on spread.
  const existing = testCase.linked_cards || [];
  const added = cardIds || [];
  testCase.linked_cards = [...new Set([...existing, ...added])];
  return testCase;
}
|
|
44
|
+
|
|
45
|
+
/**
 * Propagate a rated test to the cards it is linked to: every linked card
 * whose `status` is 'locked' is transitioned to 'tested' when the test
 * favoured the response with KDNA. Unrated tests change nothing.
 *
 * @param {object} project - Project whose `cards` are looked up by id.
 * @param {object} testCase - Rated test case with `linked_cards`.
 * @returns {object} The same project object.
 */
function applyTestResultsToCards(project, testCase) {
  if (!testCase.result) {
    return project;
  }

  const projectCards = project.cards || [];
  const linkedIds = testCase.linked_cards || [];

  linkedIds.forEach(linkedId => {
    const target = projectCards.find(candidate => candidate.id === linkedId);
    if (!target) {
      return;
    }
    const eligible = target.status === 'locked' && testCase.result === 'with_kdna_better';
    if (!eligible) {
      return;
    }
    // Loaded lazily, only once a transition is actually needed.
    const { transitionCard } = require('../cards');
    try {
      transitionCard(target, 'tested', {
        by: testCase.rated_by || 'testlab',
        reason: `test ${testCase.id}: ${testCase.result}`,
      });
    } catch { /* card may have been already tested */ }
  });

  return project;
}
|
|
60
|
+
|
|
61
|
+
/**
 * Summarise the test results of a project.
 *
 * @param {object} project - Project whose `tests` are counted (missing → none).
 * @returns {{total: number, rated: number, unrated: number, with_kdna_better: number,
 *   with_kdna_better_pct: number, no_difference: number, without_kdna_better: number,
 *   passing: boolean}} Counts per outcome, the share of rated tests that
 *   favour KDNA (rounded percent, 0 when nothing is rated) and the pass flag.
 */
function generateTestSummary(project) {
  const tests = project.tests || [];
  const countResult = outcome => tests.filter(t => t.result === outcome).length;

  const total = tests.length;
  const rated = tests.filter(t => t.result).length;
  const withKdnaBetter = countResult('with_kdna_better');
  const noDiff = countResult('no_difference');
  const withoutBetter = countResult('without_kdna_better');

  return {
    total,
    rated,
    unrated: total - rated,
    with_kdna_better: withKdnaBetter,
    // The divisor is `rated`, so that is what must be non-zero; guarding on
    // `total` yielded NaN for projects whose tests are all still unrated.
    with_kdna_better_pct: rated > 0 ? Math.round((withKdnaBetter / rated) * 100) : 0,
    no_difference: noDiff,
    without_kdna_better: withoutBetter,
    // At least 60% of rated tests should favor KDNA. With no rated tests the
    // threshold is 0, so this is true.
    passing: withKdnaBetter >= Math.ceil(rated * 0.6),
  };
}
|
|
80
|
+
|
|
81
|
+
/**
 * Export the rated tests of a project as plain eval records.
 *
 * @param {object} project - Project whose `tests` are exported (missing → none).
 * @returns {object[]} One record per test that has a `result`; empty
 *   expectations are exported as null.
 */
function exportEvals(project) {
  const evals = [];
  for (const test of project.tests || []) {
    if (!test.result) continue; // unrated tests are not exported
    const {
      id,
      input,
      expected_without_kdna: expectedWithout,
      expected_with_kdna: expectedWith,
      result,
      linked_cards,
      rated_by,
      rated_at,
      notes,
    } = test;
    evals.push({
      id,
      input,
      expected_without_kdna: expectedWithout || null,
      expected_with_kdna: expectedWith || null,
      result,
      linked_cards,
      rated_by,
      rated_at,
      notes,
    });
  }
  return evals;
}
|
|
95
|
+
|
|
96
|
+
/**
 * Describe the `kdna compare` invocation for a domain and input. Nothing is
 * executed; the command and its argument list are returned.
 *
 * @param {string} domainName - Domain to compare.
 * @param {string} input - Prompt passed via `--input`.
 * @param {object} [options] - `reportMd`, `reportJson` (flags) and `output` (path).
 * @returns {{command: string, args: string[], description: string}}
 */
function compareAdapter(domainName, input, options = {}) {
  const { reportMd, reportJson, output } = options;

  const flags = [
    ...(reportMd ? ['--report-md'] : []),
    ...(reportJson ? ['--report-json'] : []),
    ...(output ? ['--output', output] : []),
  ];

  return {
    command: 'kdna',
    args: ['compare', domainName, '--input', input, ...flags],
    description: 'Runs kdna compare to test judgment impact',
  };
}
|
|
108
|
+
|
|
109
|
+
module.exports = {
|
|
110
|
+
createTestCase,
|
|
111
|
+
recordHumanRating,
|
|
112
|
+
linkTestToCards,
|
|
113
|
+
generateTestSummary,
|
|
114
|
+
exportEvals,
|
|
115
|
+
compareAdapter,
|
|
116
|
+
};
|