@aikdna/kdna-cli 0.12.0 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/cli.js +2 -0
- package/src/compare.js +217 -7
package/package.json
CHANGED
package/src/cli.js
CHANGED
|
@@ -80,6 +80,8 @@ Testing & Verification:
|
|
|
80
80
|
verify <name> 3-layer: structure + trust + judgment
|
|
81
81
|
verify <name> --judgment --run-tests Judgment validation with eval cases
|
|
82
82
|
compare <name> --input "..." With/without KDNA reasoning diff
|
|
83
|
+
compare <name> --input "..." --report-md Markdown report format
|
|
84
|
+
compare <name> --input "..." --report-json JSON report with scoring
|
|
83
85
|
diff <name>@<v1> <name>@<v2> Judgment-level diff between versions
|
|
84
86
|
test run <name> --input <file> Record test result against domain
|
|
85
87
|
test import <run> --as-eval Convert test result to eval card
|
package/src/compare.js
CHANGED
|
@@ -29,6 +29,7 @@ const CONFIG_FILE = path.join(USER_KDNA_DIR, 'config.json');
|
|
|
29
29
|
|
|
30
30
|
const { parseName } = require('./registry');
|
|
31
31
|
const { EXIT } = require('./cmds/_common');
|
|
32
|
+
const { recordTrace } = require('./cmds/trace');
|
|
32
33
|
|
|
33
34
|
function readJson(p) {
|
|
34
35
|
try {
|
|
@@ -260,13 +261,188 @@ ${responseB}
|
|
|
260
261
|
Diff the reasoning trajectory.`;
|
|
261
262
|
}
|
|
262
263
|
|
|
264
|
+
// ─── Report output ─────────────────────────────────────────────────────
|
|
265
|
+
|
|
266
|
+
/**
 * Extract the per-axis findings and the overall verdict from the diff text
 * returned by the LLM.
 *
 * Recognised line shapes:
 *   - `<number>. <Axis label>: <finding>`  -> axes[<label, lower-cased>] = finding
 *   - `VERDICT: <value>`                   -> verdict = value, lower-cased
 *
 * Axis labels may contain several words (e.g. "Boundary awareness"); internal
 * whitespace runs are collapsed to a single space so keys stay predictable.
 *
 * @param {string} diffText - Raw diff text; null/undefined is treated as empty.
 * @returns {{ axes: Object<string, string>, verdict: string }} Parsed axes and
 *   verdict. The verdict is 'trajectory_unchanged' when no VERDICT line exists.
 */
function parseDiffText(diffText) {
  const axes = {};
  let verdict = 'trajectory_unchanged';

  // Split on \r?\n: a trailing "\r" left over from CRLF text is a line
  // terminator that `.` does not match, so the `$`-anchored patterns below
  // would otherwise reject every line.
  const lines = String(diffText ?? '').split(/\r?\n/);

  for (const line of lines) {
    // Lazy label capture stops at the first colon, so findings that contain
    // colons themselves ("A: x, B: y") are kept intact in the value.
    const axisMatch = line.match(/^\d+\.\s*(\w[\w ]*?)\s*:\s*(.+)$/);
    if (axisMatch) {
      const key = axisMatch[1].toLowerCase().replace(/\s+/g, ' ');
      axes[key] = axisMatch[2].trim();
    }

    const verdictMatch = line.match(/^VERDICT:\s*(.+)$/i);
    if (verdictMatch) {
      verdict = verdictMatch[1].trim().toLowerCase();
    }
  }

  return { axes, verdict };
}
|
|
284
|
+
|
|
285
|
+
/**
 * Derive a coarse 0-10 score from the parsed diff axes.
 *
 * The score starts from a neutral 5 and gains one point for every axis whose
 * finding is non-empty and is not the literal SAME (compared case-insensitively),
 * capped at 10.
 *
 * @param {Object<string, string>} axes - Axis label -> finding text.
 * @returns {{ score: number, changed: string[] }} The score plus the
 *   lower-cased labels of the axes counted as changed, in insertion order.
 */
function scoreDiff(axes) {
  const changed = Object.entries(axes)
    .filter(([, finding]) => finding && finding.toUpperCase() !== 'SAME')
    .map(([label]) => label.toLowerCase());

  return { score: Math.min(10, 5 + changed.length), changed };
}
|
|
296
|
+
|
|
297
|
+
/**
 * Render the with/without-KDNA comparison as a Markdown report.
 *
 * @param {{ full: string }} parsed - Parsed domain name; `full` is printed.
 * @param {{ version?: string }} manifest - Domain manifest; only `version` is read.
 * @param {{ axioms?: Array, frameworks?: Array<{id: string}> }} core - Core
 *   domain data; used for the axiom count and framework ids.
 * @param {{ self_check?: Array, terminology?: { banned_terms?: Array } }} pat -
 *   Pattern data; used for the self-check and banned-term counts.
 * @param {string} responseA - Baseline (no KDNA) model response.
 * @param {string} responseB - Model response with the domain loaded.
 * @param {string} diffText - Diff text, parsed via parseDiffText().
 * @param {{ provider: string, model: string }} llm - Provider/model labels.
 * @param {string} [userInput] - The compared input text. When omitted the
 *   value following `--input` in process.argv is used, which keeps existing
 *   eight-argument callers working.
 * @returns {string} The full Markdown document.
 */
function emitMarkdownReport(parsed, manifest, core, pat, responseA, responseB, diffText, llm, userInput) {
  const { axes, verdict } = parseDiffText(diffText);
  const domainScore = scoreDiff(axes);
  const axioms = core.axioms || [];
  const selfChecks = pat.self_check || [];
  const bannedTerms = (pat.terminology?.banned_terms || []).map(t => typeof t === 'string' ? t : t.term);

  // Prefer the input handed over by the caller. The argv fallback is guarded
  // so a trailing `--input` with no value yields '?' instead of a TypeError.
  let inputText = userInput;
  if (typeof inputText !== 'string') {
    const argv = process.argv.slice(2);
    const at = argv.indexOf('--input');
    inputText = at >= 0 && typeof argv[at + 1] === 'string' ? argv[at + 1] : '?';
  }

  // First three non-blank lines of a response, rendered as a bullet list.
  const judgmentPath = (response) => String(response ?? '')
    .split('\n')
    .filter(l => l.trim())
    .slice(0, 3)
    .map(l => `- ${l}`)
    .join('\n');

  // An axis counts as changed when it has a finding other than SAME.
  const isChanged = (finding) => Boolean(finding) && finding.toUpperCase() !== 'SAME';

  // The boundary axis may arrive under either label depending on how the
  // diff text was worded/parsed, so both spellings are honoured everywhere.
  const boundaryFinding = axes['boundary awareness'] ?? axes.boundary;
  const boundaryChanged = domainScore.changed.includes('boundary awareness')
    || domainScore.changed.includes('boundary');

  const lines = [];
  lines.push('# KDNA Judgment Comparison Report');
  lines.push('');
  lines.push(`**Domain:** ${parsed.full} (v${manifest.version || '?'})`);
  lines.push(`**Input:** "${inputText.slice(0, 120)}"`);
  lines.push(`**Model:** ${llm.provider} / ${llm.model}`);
  lines.push(`**Date:** ${new Date().toISOString()}`);
  lines.push('');
  lines.push('---');
  lines.push('');
  lines.push('## Without KDNA');
  lines.push('');
  lines.push('### Judgment Path');
  lines.push(judgmentPath(responseA));
  lines.push('');
  lines.push('### Key Deficiencies');
  lines.push('- No domain-specific diagnosis applied');
  lines.push('- Terminal screening');
  lines.push('');
  lines.push('---');
  lines.push('');
  lines.push(`## With KDNA (${parsed.full})`);
  lines.push('');
  lines.push(`### Domain Loaded`);
  lines.push(`- Name: ${parsed.full}`);
  lines.push(`- Axioms applied: ${axioms.length} total`);
  lines.push(`- Frameworks: ${(core.frameworks || []).map(f => f.id).join(', ') || 'none declared'}`);
  lines.push(`- Self-checks: ${selfChecks.length} items`);
  lines.push(`- Banned terms: ${bannedTerms.length}`);
  lines.push('');
  lines.push('### Judgment Path');
  lines.push(judgmentPath(responseB));
  lines.push('');
  lines.push('---');
  lines.push('');
  lines.push('## Judgment Diff');
  lines.push('');
  lines.push('| Dimension | Without KDNA | With KDNA | Change |');
  lines.push('|-----------|:-----------:|:---------:|:------:|');
  const dims = [
    { name: 'Classification', finding: axes.classification },
    { name: 'Diagnostic depth', finding: axes.diagnosis },
    { name: 'Terminology', finding: axes.terminology },
    { name: 'Boundary respected', finding: boundaryFinding },
    { name: 'Action quality', finding: axes.actions },
  ];
  for (const d of dims) {
    lines.push(`| **${d.name}** | Generic | Domain-specific | **${isChanged(d.finding) ? 'Improved' : 'Same'}** |`);
  }
  // Without any self-checks the row reads N/A -> N/A, so it is not an improvement.
  const hasSelfChecks = selfChecks.length > 0;
  lines.push(`| **Self-check rate** | N/A | ${hasSelfChecks ? 'Domain applied' : 'N/A'} | **${hasSelfChecks ? 'Improved' : 'Same'}** |`);
  lines.push('');
  lines.push(`**Verdict:** ${verdict.replace(/_/g, ' ')}`);
  lines.push('');
  lines.push('---');
  lines.push('');
  lines.push('## Scoring');
  lines.push('');
  lines.push(`| D# | Dimension | Score (0-10) |`);
  lines.push('|----|-----------|:-----------:|');
  lines.push(`| D1 | Diagnostic depth | ${domainScore.changed.includes('diagnosis') ? '8' : '5'} |`);
  lines.push(`| D2 | Terminology precision | ${domainScore.changed.includes('terminology') ? '8' : '5'} |`);
  lines.push(`| D3 | Misunderstanding detection | 5 |`);
  lines.push(`| D4 | Axiom alignment | ${domainScore.score} |`);
  lines.push(`| D5 | Self-check pass rate | ${hasSelfChecks ? '100%' : 'N/A'} |`);
  lines.push(`| D6 | Boundary respect | ${boundaryChanged ? 'Pass' : 'N/A'} |`);
  lines.push(`| D7 | Risk avoidance | ${axes.failure ? 'Pass' : 'N/A'} |`);
  lines.push('');
  lines.push('---');
  lines.push('');
  lines.push('## Summary');
  lines.push('');
  const changedDims = domainScore.changed.map(c => `**${c}**`).join(', ');
  lines.push(`Loading \`${parsed.full}\` changed the agent's response across ${domainScore.changed.length} dimensions: ${changedDims || 'no significant change'}. ${verdict.includes('changed') ? 'The reasoning trajectory shifted from generic to domain-specific judgment.' : 'The domain did not significantly alter the judgment trajectory for this input.'}`);
  lines.push('');
  lines.push('*Generated by kdna compare. Copy-pasteable as a GitHub comment, Slack message, or tweet.*');

  return lines.join('\n');
}
|
|
388
|
+
|
|
389
|
+
/**
 * Build the structured (JSON-serialisable) comparison report.
 *
 * @param {{ full: string }} parsed - Parsed domain name; `full` is reported.
 * @param {{ version?: string }} manifest - Domain manifest; only `version` is read.
 * @param {{ axioms?: Array }} core - Core domain data; used for the axiom count.
 * @param {{ self_check?: Array }} pat - Pattern data; used for the self-check count.
 * @param {string} responseA - Baseline (no KDNA) model response.
 * @param {string} responseB - Model response with the domain loaded.
 * @param {string} diffText - Diff text, parsed via parseDiffText().
 * @param {{ provider: string, model: string }} llm - Provider/model labels.
 * @param {string} userInput - The compared input; truncated to 200 characters.
 * @returns {object} Report with `meta`, `without_kdna`, `with_kdna`, `diff`
 *   and `scoring` sections.
 */
function emitJsonReport(parsed, manifest, core, pat, responseA, responseB, diffText, llm, userInput) {
  const { axes, verdict } = parseDiffText(diffText);
  const domainScore = scoreDiff(axes);
  const axioms = core.axioms || [];
  const selfChecks = pat.self_check || [];
  const changed = domainScore.changed;

  // The boundary axis may be labelled "boundary awareness" or just "boundary"
  // in the parsed diff; accept both so this agrees with the Markdown report.
  const boundaryChanged = changed.includes('boundary awareness') || changed.includes('boundary');

  const result = {
    meta: {
      domain: parsed.full,
      domain_version: manifest.version || '?',
      // Tolerate a missing input instead of throwing on `.slice`.
      input: String(userInput ?? '').slice(0, 200),
      model: llm.model,
      provider: llm.provider,
      timestamp: new Date().toISOString(),
    },
    without_kdna: {
      classification: axes.classification || 'generic',
      response_length: responseA.length,
      response_preview: responseA.slice(0, 300),
    },
    with_kdna: {
      domain: parsed.full,
      // Only report a domain-specific classification when the axis actually
      // changed; a finding of SAME means the classification is unchanged.
      classification: changed.includes('classification') ? 'domain_specific' : 'unchanged',
      axioms_available: axioms.length,
      self_checks_available: selfChecks.length,
      response_length: responseB.length,
      response_preview: responseB.slice(0, 300),
    },
    diff: {
      axes,
      verdict,
      score: domainScore.score,
      changed_dimensions: changed,
    },
    scoring: {
      D1_diagnostic_depth: changed.includes('diagnosis') ? 8 : 5,
      D2_terminology_precision: changed.includes('terminology') ? 8 : 5,
      D3_misunderstanding_detection: 5,
      D4_axiom_alignment: domainScore.score,
      D5_self_check_pass_rate: selfChecks.length > 0 ? '100%' : 'N/A',
      D6_boundary_respect: boundaryChanged ? 'Pass' : 'N/A',
      // Same rule the Markdown report applies for D7, so both formats agree.
      D7_risk_avoidance: axes.failure ? 'Pass' : 'N/A',
    },
  };
  return result;
}
|
|
435
|
+
|
|
263
436
|
// ─── Main ──────────────────────────────────────────────────────────────
|
|
264
437
|
|
|
265
438
|
async function cmdCompare(input, args = []) {
|
|
266
439
|
const jsonMode = args.includes('--json');
|
|
440
|
+
const reportMd = args.includes('--report-md');
|
|
441
|
+
const reportJson = args.includes('--report-json');
|
|
442
|
+
const outputFile = args.includes('--output') ? args[args.indexOf('--output') + 1] : null;
|
|
267
443
|
const idxInput = args.indexOf('--input');
|
|
268
444
|
if (idxInput < 0 || !args[idxInput + 1]) {
|
|
269
|
-
error('Usage: kdna compare <name> --input "<text>"', EXIT.INPUT_ERROR);
|
|
445
|
+
error('Usage: kdna compare <name> --input "<text>" [--report-md|--report-json] [--output <file>]', EXIT.INPUT_ERROR);
|
|
270
446
|
}
|
|
271
447
|
const userInput = args[idxInput + 1];
|
|
272
448
|
|
|
@@ -278,8 +454,11 @@ async function cmdCompare(input, args = []) {
|
|
|
278
454
|
}
|
|
279
455
|
|
|
280
456
|
const llm = loadLlmConfig();
|
|
457
|
+
const manifest = readJson(path.join(destDir, 'kdna.json')) || {};
|
|
458
|
+
const core = readJson(path.join(destDir, 'KDNA_Core.json')) || {};
|
|
459
|
+
const pat = readJson(path.join(destDir, 'KDNA_Patterns.json')) || {};
|
|
281
460
|
|
|
282
|
-
if (!jsonMode) {
|
|
461
|
+
if (!jsonMode && !reportMd && !reportJson) {
|
|
283
462
|
console.log('═'.repeat(64));
|
|
284
463
|
console.log(` kdna compare ${parsed.full}`);
|
|
285
464
|
console.log(` provider: ${llm.provider} / ${llm.model}`);
|
|
@@ -296,18 +475,49 @@ async function cmdCompare(input, args = []) {
|
|
|
296
475
|
'You are a helpful assistant. The following domain judgment is loaded and you MUST apply it when relevant.\n\n' +
|
|
297
476
|
kdnaPrompt;
|
|
298
477
|
|
|
299
|
-
if (!jsonMode) console.log('[1/3] Running baseline (no KDNA)...');
|
|
478
|
+
if (!jsonMode && !reportMd && !reportJson) console.log('[1/3] Running baseline (no KDNA)...');
|
|
300
479
|
const responseA = await callLlm(llm, BASELINE_SYSTEM, userInput);
|
|
301
|
-
if (!jsonMode) console.log(` ${responseA.length} chars returned`);
|
|
480
|
+
if (!jsonMode && !reportMd && !reportJson) console.log(` ${responseA.length} chars returned`);
|
|
302
481
|
|
|
303
|
-
if (!jsonMode) console.log('[2/3] Running with KDNA loaded...');
|
|
482
|
+
if (!jsonMode && !reportMd && !reportJson) console.log('[2/3] Running with KDNA loaded...');
|
|
304
483
|
const responseB = await callLlm(llm, TREATMENT_SYSTEM, userInput);
|
|
305
|
-
if (!jsonMode) console.log(` ${responseB.length} chars returned`);
|
|
484
|
+
if (!jsonMode && !reportMd && !reportJson) console.log(` ${responseB.length} chars returned`);
|
|
306
485
|
|
|
307
|
-
if (!jsonMode) console.log('[3/3] Diffing reasoning trajectories...');
|
|
486
|
+
if (!jsonMode && !reportMd && !reportJson) console.log('[3/3] Diffing reasoning trajectories...');
|
|
308
487
|
const diffPrompt = makeDiffPrompt(userInput, responseA, responseB);
|
|
309
488
|
const diff = await callLlm(llm, DIFF_SYSTEM, diffPrompt);
|
|
310
489
|
|
|
490
|
+
// Record trace
|
|
491
|
+
recordTrace({
|
|
492
|
+
timestamp: new Date().toISOString(),
|
|
493
|
+
agent: 'cli',
|
|
494
|
+
domain: parsed.full,
|
|
495
|
+
type: 'compare',
|
|
496
|
+
compare: { model: llm.model, input_length: userInput.length },
|
|
497
|
+
});
|
|
498
|
+
|
|
499
|
+
if (reportMd) {
|
|
500
|
+
const report = emitMarkdownReport(parsed, manifest, core, pat, responseA, responseB, diff, llm);
|
|
501
|
+
if (outputFile) {
|
|
502
|
+
fs.writeFileSync(outputFile, report);
|
|
503
|
+
console.log(`Report saved to ${outputFile}`);
|
|
504
|
+
} else {
|
|
505
|
+
console.log(report);
|
|
506
|
+
}
|
|
507
|
+
return;
|
|
508
|
+
}
|
|
509
|
+
|
|
510
|
+
if (reportJson) {
|
|
511
|
+
const report = emitJsonReport(parsed, manifest, core, pat, responseA, responseB, diff, llm, userInput);
|
|
512
|
+
if (outputFile) {
|
|
513
|
+
fs.writeFileSync(outputFile, JSON.stringify(report, null, 2) + '\n');
|
|
514
|
+
console.log(`Report saved to ${outputFile}`);
|
|
515
|
+
} else {
|
|
516
|
+
console.log(JSON.stringify(report, null, 2));
|
|
517
|
+
}
|
|
518
|
+
return;
|
|
519
|
+
}
|
|
520
|
+
|
|
311
521
|
if (jsonMode) {
|
|
312
522
|
const result = {
|
|
313
523
|
baseline_output: responseA,
|