patina-cli 3.11.0 → 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.patina.default.yaml +29 -29
- package/CHANGELOG.md +53 -0
- package/NOTICE +21 -0
- package/README.md +117 -224
- package/README_JA.md +134 -77
- package/README_KR.md +132 -74
- package/README_ZH.md +137 -80
- package/SKILL.md +11 -20
- package/artifacts/rebaseline-2025/README.md +147 -0
- package/artifacts/rebaseline-2025/human-controls.public.jsonl +250 -0
- package/artifacts/rebaseline-2025/intake.example.jsonl +2 -0
- package/artifacts/rebaseline-2025/intake.local.example.jsonl +25 -0
- package/artifacts/rebaseline-2025/prompts.template.jsonl +7 -0
- package/artifacts/rebaseline-2025/sources.ko-public.jsonl +39 -0
- package/assets/brand/patina-badge.svg +18 -0
- package/assets/brand/patina-mark.svg +8 -0
- package/assets/demo/README.md +79 -0
- package/core/scoring.md +12 -12
- package/core/standalone-prompt.md +3 -1
- package/core/stylometry.md +93 -22
- package/docs/API.md +1554 -0
- package/docs/AUTHENTICATION.md +50 -26
- package/docs/AUTHENTICATION_KR.md +54 -29
- package/docs/BRANDING.md +9 -8
- package/docs/CLI.md +55 -14
- package/docs/COOKBOOK.md +8 -21
- package/docs/DEMO.md +32 -5
- package/docs/EXIT-CODES.md +2 -3
- package/docs/FALSE-POSITIVES.md +63 -0
- package/docs/FAQ.md +9 -1
- package/docs/FAQ_KR.md +3 -1
- package/docs/FLAG-PARITY.md +33 -47
- package/docs/ISSUE-WAVES.md +57 -0
- package/docs/PATTERNS-EN.md +67 -3
- package/docs/PATTERNS-JA.md +68 -2
- package/docs/PATTERNS-KO.md +70 -7
- package/docs/PATTERNS-ZH.md +67 -3
- package/docs/PATTERNS.md +5 -5
- package/docs/RESEARCH-DOCS-PLATFORM.md +54 -0
- package/docs/ROADMAP.md +46 -66
- package/docs/TRANSLATIONESE-KO.md +51 -0
- package/docs/audits/2026-05-deep-research.md +3 -1
- package/docs/benchmarks/README.md +51 -0
- package/docs/benchmarks/detector-comparison.json +69 -9
- package/docs/benchmarks/detector-comparison.md +10 -5
- package/docs/benchmarks/katfish-ko-latest.json +657 -0
- package/docs/benchmarks/katfish-ko-latest.md +77 -0
- package/docs/benchmarks/latest.json +1183 -108
- package/docs/benchmarks/latest.md +84 -60
- package/docs/benchmarks/lexicon-freshness-en-2026-05-22.json +1121 -0
- package/docs/benchmarks/lexicon-freshness-en-2026-05-22.md +136 -0
- package/docs/benchmarks/rebaseline-latest.json +381 -0
- package/docs/benchmarks/rebaseline-latest.md +121 -0
- package/docs/benchmarks/register-stratified-latest.json +164 -0
- package/docs/benchmarks/register-stratified-latest.md +99 -0
- package/docs/benchmarks/register-stratified.md +43 -0
- package/docs/integrations/github-action.md +44 -11
- package/docs/integrations/playground.md +58 -0
- package/docs/integrations/pre-commit.md +5 -5
- package/docs/integrations/release.md +5 -3
- package/docs/integrations/static-sites.md +83 -0
- package/docs/research/2025-rebaseline-plan.md +71 -2
- package/docs/research/2026-rebaseline.md +102 -0
- package/docs/research/adversarial-mps.md +41 -0
- package/docs/research/ai-human-metrics.md +35 -23
- package/docs/research/human-eval-panel.md +42 -0
- package/docs/research/judge-agreement.md +24 -0
- package/docs/research/ko-2025-corpus-sources.md +135 -0
- package/docs/research/lexicon-freshness-audit.md +64 -0
- package/docs/research/zh-ja-lexicon-calibration.md +60 -0
- package/docs/social/patina-launch-copy.md +173 -100
- package/docs/social/patina-launch-execution.md +94 -0
- package/docs/social/patina-launch-korean-first.md +83 -0
- package/docs/social/signs-of-ai-writing.md +26 -0
- package/docs/social/signs-of-ai-writing_KR.md +26 -0
- package/lexicon/ai-en.md +21 -24
- package/lexicon/ai-ja.md +158 -0
- package/lexicon/ai-ko.md +9 -9
- package/lexicon/ai-zh.md +158 -0
- package/lexicon/provenance/ai-en.json +970 -0
- package/lexicon/provenance/ai-ja.json +542 -0
- package/lexicon/provenance/ai-ko.json +866 -0
- package/lexicon/provenance/ai-zh.json +542 -0
- package/package.json +49 -8
- package/patterns/en-communication.md +5 -0
- package/patterns/en-content.md +5 -0
- package/patterns/en-filler.md +5 -0
- package/patterns/en-language.md +29 -1
- package/patterns/en-structure.md +5 -0
- package/patterns/en-style.md +5 -0
- package/patterns/en-viral-hook.md +42 -2
- package/patterns/ja-communication.md +5 -0
- package/patterns/ja-content.md +5 -0
- package/patterns/ja-filler.md +5 -0
- package/patterns/ja-language.md +33 -1
- package/patterns/ja-structure.md +12 -0
- package/patterns/ja-style.md +5 -0
- package/patterns/ja-viral-hook.md +41 -2
- package/patterns/ko-communication.md +5 -0
- package/patterns/ko-content.md +5 -0
- package/patterns/ko-filler.md +5 -0
- package/patterns/ko-language.md +33 -1
- package/patterns/ko-structure.md +25 -6
- package/patterns/ko-style.md +5 -0
- package/patterns/ko-viral-hook.md +38 -2
- package/patterns/zh-communication.md +5 -0
- package/patterns/zh-content.md +5 -0
- package/patterns/zh-filler.md +5 -0
- package/patterns/zh-language.md +37 -1
- package/patterns/zh-structure.md +12 -0
- package/patterns/zh-style.md +5 -0
- package/patterns/zh-viral-hook.md +38 -2
- package/playground/README.md +55 -0
- package/playground/analytics.js +4 -0
- package/playground/analyzer.js +883 -0
- package/playground/app.js +157 -0
- package/playground/data/lexicons.js +343 -0
- package/playground/index.html +138 -0
- package/playground/styles.css +267 -0
- package/profiles/namuwiki.md +111 -0
- package/scripts/adversarial-mps-report.mjs +201 -0
- package/scripts/badge-json.mjs +79 -0
- package/scripts/benchmark-report.mjs +56 -9
- package/scripts/check-release-metadata.mjs +0 -2
- package/scripts/detector-comparison.mjs +7 -7
- package/scripts/generate-playground-data.mjs +77 -0
- package/scripts/katfish-calibration.mjs +464 -0
- package/scripts/lexicon-freshness.mjs +485 -0
- package/scripts/lint.mjs +1 -1
- package/scripts/precommit-score.mjs +4 -3
- package/scripts/prose-score.mjs +81 -5
- package/scripts/rebaseline-intake.mjs +242 -0
- package/scripts/rebaseline-score.mjs +268 -0
- package/scripts/rebaseline-summary.mjs +773 -0
- package/scripts/rebaseline-web-collect.mjs +410 -0
- package/scripts/update-benchmark-ranges.mjs +1 -0
- package/src/api.js +69 -105
- package/src/auth.js +50 -2
- package/src/backends/claude-cli.js +19 -4
- package/src/backends/codex-cli.js +19 -3
- package/src/backends/contract.js +230 -1
- package/src/backends/gemini-cli.js +18 -5
- package/src/backends/index.js +87 -12
- package/src/backends/kimi-cli.js +161 -0
- package/src/cli.js +577 -567
- package/src/commands/doctor.js +2 -2
- package/src/config.js +29 -0
- package/src/errors.js +53 -1
- package/src/features/discourse-tells.js +68 -0
- package/src/features/index.js +82 -8
- package/src/features/lexicon.js +40 -6
- package/src/features/markup-leakage.js +69 -0
- package/src/features/segment.js +41 -0
- package/src/features/signal-strength.js +81 -0
- package/src/features/stylometry.js +231 -1
- package/src/features/translationese.js +127 -0
- package/src/loader.js +76 -0
- package/src/logger.js +22 -23
- package/src/model-defaults.js +55 -0
- package/src/ouroboros.js +31 -0
- package/src/output.js +102 -90
- package/src/prompt-builder.js +103 -68
- package/src/providers.js +51 -4
- package/src/scoring.js +210 -2
- package/src/security.js +75 -0
- package/tests/fixtures/live-quality/en/public-docs-01.md +26 -0
- package/tests/fixtures/live-quality/ko/public-docs-01.md +26 -0
- package/tests/fixtures/suspect-zones/expected-ranges.json +207 -16
- package/tests/fixtures/suspect-zones/ja/ai/ja-ai-04-lexicon.md +11 -0
- package/tests/fixtures/suspect-zones/ja/natural/ja-nat-04-lexicon-cold.md +11 -0
- package/tests/fixtures/suspect-zones/ko/ai/ko-ai-02.md +4 -5
- package/tests/fixtures/suspect-zones/ko/ai/ko-ai-07-ko-diagnostic.md +11 -0
- package/tests/fixtures/suspect-zones/zh/ai/zh-ai-04-lexicon.md +11 -0
- package/tests/fixtures/suspect-zones/zh/natural/zh-nat-04-lexicon-cold.md +11 -0
- package/tests/quality/README.md +188 -11
- package/tests/quality/adversarial-mps/fixtures.jsonl +10 -0
- package/tests/quality/benchmark.mjs +39 -1
- package/tests/quality/dogfood.mjs +5 -3
- package/tests/quality/live-fixtures.jsonl +2 -0
- package/tests/quality/live-quality.mjs +596 -0
- package/tests/quality/ranking-metrics.mjs +136 -0
- package/tests/quality/rebaseline-manifest.example.jsonl +5 -0
- package/vercel.json +53 -0
- package/SKILL-MAX.md +0 -455
- package/docs/internal/HARNESS.md +0 -14
- package/docs/internal/README.md +0 -14
- package/docs/internal/WARP.md +0 -23
- package/patina-max/SKILL.md +0 -523
- package/patina-max/composite.py +0 -457
- package/src/cache.js +0 -106
- package/src/commands/init.js +0 -208
- package/src/manifest.js +0 -162
- package/src/max-mode.js +0 -207
|
@@ -0,0 +1,773 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
// Rebaseline manifest checker for the 2025+ corpus protocol.
|
|
3
|
+
//
|
|
4
|
+
// This script validates metadata-only JSONL manifests, reports matrix coverage,
|
|
5
|
+
// and keeps public performance claims blocked until corpus size + outcome fields
|
|
6
|
+
// meet the process gate in process/pattern-freshness.md.
|
|
7
|
+
|
|
8
|
+
import { createHash } from 'node:crypto';
|
|
9
|
+
import { existsSync, mkdirSync, readFileSync, writeFileSync } from 'node:fs';
|
|
10
|
+
import { dirname, relative, resolve } from 'node:path';
|
|
11
|
+
import { fileURLToPath } from 'node:url';
|
|
12
|
+
|
|
13
|
+
// Directory containing this script, derived from the module URL.
const __dirname = dirname(fileURLToPath(import.meta.url));
// Parent of the script directory; relative input/output paths in this file are resolved against it.
const REPO_ROOT = resolve(__dirname, '..');

// Version number copied into loaded-manifest results and generated summaries.
export const SCHEMA_VERSION = 1;
// Manifest path used when the caller does not supply one.
export const DEFAULT_INPUT = 'artifacts/rebaseline-2025/rebaseline-2026.scored.public.jsonl';
// Default directory for written reports.
export const DEFAULT_REPORT_DIR = 'docs/benchmarks';
// Default file stem for written reports; `.md` and `.json` are appended by writeReportFiles.
export const DEFAULT_REPORT_BASENAME = 'rebaseline-latest';
|
|
20
|
+
|
|
21
|
+
// Canonical values for each manifest dimension.
// - languages x classes x registers is the grid enumerated by summarizeProtocolCoverage.
// - validateRecord rejects languages/classes/registers outside these lists.
// - generatorFamilies are the model families counted toward the public claim gate;
//   other families only produce a warning in validateRecord.
export const MATRIX = {
  languages: ['ko', 'en', 'zh', 'ja'],
  classes: ['ai-like', 'natural-human', 'lightly-edited-ai', 'heavily-edited-ai'],
  registers: ['blog', 'academic-summary', 'product-doc', 'chat-update', 'technical-how-to'],
  generatorFamilies: ['gpt-family', 'claude-family', 'gemini-family', 'open-weight'],
};
|
|
27
|
+
|
|
28
|
+
// Sample-count thresholds used by the coverage and claim-gate checks.
export const TARGETS = {
  // Minimum samples for a language x class x register cell to count as meeting the protocol target.
  protocolPerLanguageClassRegister: 25,
  // Minimum samples for a claim cell to count as qualified in evaluateClaimGate.
  claimPerCell: 100,
  // Minimum number of distinct languages with a qualified cell.
  claimLanguages: 2,
  // Minimum number of distinct generator families with a qualified cell.
  claimGeneratorFamilies: 3,
};
|
|
34
|
+
|
|
35
|
+
// Fields every manifest row must carry; validateRecord reports one
// "missing required field" error per absent entry, in this order.
const REQUIRED_FIELDS = [
  'sample_id',
  'language',
  'class',
  'register',
  'model_family',
  'provider',
  'model',
  'generated_at',
  'prompt_id',
  'decoding',
  'postprocess',
  'redistribution',
  'text_hash',
];
|
|
50
|
+
|
|
51
|
+
// Accepted spellings of the `class` field mapped to the canonical labels in MATRIX.classes.
// Passed to canonicalize() by validateRecord; how the lookup key is normalized is
// decided by canonicalize (defined elsewhere in this file).
const CLASS_ALIASES = new Map([
  ['ai', 'ai-like'],
  ['ai_like', 'ai-like'],
  ['ai-like', 'ai-like'],
  ['generated', 'ai-like'],
  ['human', 'natural-human'],
  ['natural', 'natural-human'],
  ['natural/human', 'natural-human'],
  ['natural-human', 'natural-human'],
  ['lightly edited ai', 'lightly-edited-ai'],
  ['lightly-edited-ai', 'lightly-edited-ai'],
  ['lightly_edited_ai', 'lightly-edited-ai'],
  ['heavily edited ai', 'heavily-edited-ai'],
  ['heavily-edited-ai', 'heavily-edited-ai'],
  ['heavily_edited_ai', 'heavily-edited-ai'],
]);
|
|
67
|
+
|
|
68
|
+
// Accepted spellings of the `register` field mapped to the canonical labels in MATRIX.registers.
// Passed to canonicalize() by validateRecord.
const REGISTER_ALIASES = new Map([
  ['blog', 'blog'],
  ['academic summary', 'academic-summary'],
  ['academic-summary', 'academic-summary'],
  ['product doc', 'product-doc'],
  ['product-doc', 'product-doc'],
  ['chat update', 'chat-update'],
  ['chat/update', 'chat-update'],
  ['chat-update', 'chat-update'],
  ['technical how-to', 'technical-how-to'],
  ['technical howto', 'technical-how-to'],
  ['technical-how-to', 'technical-how-to'],
]);
|
|
81
|
+
|
|
82
|
+
// Accepted spellings of the `model_family` field mapped to canonical family labels.
// Besides the generator families in MATRIX.generatorFamilies this also maps to
// 'human-reference', which validateRecord accepts without a warning.
const MODEL_FAMILY_ALIASES = new Map([
  ['gpt', 'gpt-family'],
  ['gpt-family', 'gpt-family'],
  ['openai', 'gpt-family'],
  ['claude', 'claude-family'],
  ['claude-family', 'claude-family'],
  ['anthropic', 'claude-family'],
  ['gemini', 'gemini-family'],
  ['gemini-family', 'gemini-family'],
  ['google', 'gemini-family'],
  ['open weight', 'open-weight'],
  ['open-weight', 'open-weight'],
  ['open_weight', 'open-weight'],
  ['human', 'human-reference'],
  ['human-reference', 'human-reference'],
]);
|
|
98
|
+
|
|
99
|
+
// Redistribution labels under which a row may carry its `text` (see validateRecord).
const TEXT_ALLOWED_REDIS = new Set(['repo-ok', 'redistributable', 'public', 'public-domain', 'cc0', 'cc-by']);
// Redistribution labels that are recognized but do not permit text; any label in
// neither set triggers an "unrecognized redistribution" warning in validateRecord.
const TEXT_BLOCKED_REDIS = new Set(['metadata-only', 'private', 'no-redistribution', 'hash-only']);
// Classes treated as the positive (AI-derived) side when building claim cells.
const POSITIVE_CLASSES = new Set(['ai-like', 'lightly-edited-ai', 'heavily-edited-ai']);
// Required shape of `text_hash`: the literal prefix "sha256:" followed by 64 lowercase hex digits.
const SHA256_RE = /^sha256:[0-9a-f]{64}$/u;
|
|
103
|
+
|
|
104
|
+
/**
 * Tell whether a redistribution label permits shipping the sample text.
 *
 * @param {*} redistribution - Raw label; passed through normalizeToken before lookup.
 * @returns {boolean} True only when the normalized label is in TEXT_ALLOWED_REDIS.
 */
export function canRedistributeText(redistribution) {
  const label = normalizeToken(redistribution);
  const isAllowed = TEXT_ALLOWED_REDIS.has(label);
  return isAllowed;
}
|
|
107
|
+
|
|
108
|
+
/**
 * Tell whether a redistribution label forbids shipping the sample text.
 *
 * A label blocks text when it is explicitly listed in TEXT_BLOCKED_REDIS or
 * when it is simply not one of the allowed labels.
 *
 * @param {*} redistribution - Raw label; passed through normalizeToken before lookup.
 * @returns {boolean} True when text must not be redistributed.
 */
export function blocksRedistributableText(redistribution) {
  const label = normalizeToken(redistribution);
  if (TEXT_BLOCKED_REDIS.has(label)) {
    return true;
  }
  return !TEXT_ALLOWED_REDIS.has(label);
}
|
|
112
|
+
|
|
113
|
+
/**
 * Parse command-line arguments for the rebaseline summary script.
 *
 * Value flags (`--input`, `--output-dir`, `--basename`) consume the next
 * argument; boolean flags (`--json`, `--write`, `--require-claim-ready`,
 * `--help`/`-h`) switch their option to true.
 *
 * @param {string[]} [argv] - Arguments to parse; defaults to `process.argv.slice(2)`.
 * @returns {{input: string, json: boolean, write: boolean, outputDir: string,
 *   basename: string, requireClaimReady: boolean, help: boolean}} Parsed options.
 * @throws {Error} On an unknown argument, or when a value flag is the last
 *   argument and therefore has no value.
 */
export function parseArgs(argv = process.argv.slice(2)) {
  const args = {
    input: DEFAULT_INPUT,
    json: false,
    write: false,
    outputDir: DEFAULT_REPORT_DIR,
    basename: DEFAULT_REPORT_BASENAME,
    requireClaimReady: false,
    help: false,
  };

  const valueFlags = new Map([
    ['--input', 'input'],
    ['--output-dir', 'outputDir'],
    ['--basename', 'basename'],
  ]);
  const booleanFlags = new Map([
    ['--json', 'json'],
    ['--write', 'write'],
    ['--require-claim-ready', 'requireClaimReady'],
    ['--help', 'help'],
    ['-h', 'help'],
  ]);

  for (let i = 0; i < argv.length; i++) {
    const arg = argv[i];
    if (booleanFlags.has(arg)) {
      args[booleanFlags.get(arg)] = true;
      continue;
    }
    if (!valueFlags.has(arg)) {
      throw new Error(`Unknown argument: ${arg}`);
    }
    const value = argv[++i];
    // Without this check a trailing value flag would silently store `undefined`
    // and the caller would fall back to a default path without any notice.
    if (value === undefined) {
      throw new Error(`Missing value for ${arg}`);
    }
    args[valueFlags.get(arg)] = value;
  }
  return args;
}
|
|
137
|
+
|
|
138
|
+
/**
 * Compute the manifest-style content hash of a text.
 *
 * @param {*} text - Value to hash; coerced with `String()` first.
 * @returns {string} `sha256:` followed by the lowercase hex SHA-256 digest.
 */
export function hashText(text) {
  const hasher = createHash('sha256');
  hasher.update(String(text));
  const hex = hasher.digest('hex');
  return `sha256:${hex}`;
}
|
|
141
|
+
|
|
142
|
+
/**
 * Read a JSONL manifest and validate it row by row.
 *
 * Blank lines are skipped. Each remaining line is parsed as JSON and passed to
 * validateRecord; its errors and warnings are collected with a `line N:` prefix.
 * A row is kept in `records` only when it has no errors and its `sample_id`
 * has not been seen on an earlier line.
 *
 * Problems with the file itself (missing, or present but unreadable, e.g. a
 * directory) are reported through `errors` rather than thrown.
 *
 * @param {string} [inputPath] - Manifest path, resolved against REPO_ROOT.
 * @returns {{schemaVersion: number, path: string, relativePath: string,
 *   records: object[], errors: string[], warnings: string[]}} Load result.
 */
export function loadManifest(inputPath = DEFAULT_INPUT) {
  const abs = resolve(REPO_ROOT, inputPath);
  const rel = relative(REPO_ROOT, abs) || inputPath;
  const result = {
    schemaVersion: SCHEMA_VERSION,
    path: abs,
    relativePath: rel,
    records: [],
    errors: [],
    warnings: [],
  };

  if (!existsSync(abs)) {
    result.errors.push(`manifest not found: ${rel}`);
    return result;
  }

  // existsSync also succeeds for directories and unreadable files; surface the
  // read failure as a validation error instead of crashing the caller.
  let contents;
  try {
    contents = readFileSync(abs, 'utf8');
  } catch (error) {
    result.errors.push(`manifest could not be read: ${rel} (${error.message})`);
    return result;
  }

  const seen = new Set();
  const lines = contents.split(/\r?\n/u);
  for (let index = 0; index < lines.length; index++) {
    const lineNumber = index + 1;
    const line = lines[index].trim();
    if (!line) continue;

    let parsed;
    try {
      parsed = JSON.parse(line);
    } catch (error) {
      result.errors.push(`line ${lineNumber}: invalid JSON (${error.message})`);
      continue;
    }

    const checked = validateRecord(parsed);
    for (const warning of checked.warnings) result.warnings.push(`line ${lineNumber}: ${warning}`);
    for (const error of checked.errors) result.errors.push(`line ${lineNumber}: ${error}`);

    // Ids are remembered even for rows that failed validation, so a later row
    // reusing the same id is still flagged as a duplicate.
    let duplicate = false;
    if (checked.record.sample_id) {
      if (seen.has(checked.record.sample_id)) {
        result.errors.push(`line ${lineNumber}: duplicate sample_id ${checked.record.sample_id}`);
        duplicate = true;
      }
      seen.add(checked.record.sample_id);
    }

    if (checked.errors.length === 0 && !duplicate) result.records.push(checked.record);
  }

  return result;
}
|
|
192
|
+
|
|
193
|
+
/**
 * Validate and normalize one parsed manifest row.
 *
 * Works on a shallow copy of `input`. Language, class, register, model family
 * and redistribution are rewritten to their normalized/canonical form, and
 * `sample_id`, `provider`, `model` and `prompt_id` are trimmed when they are
 * strings. Problems that disqualify the row go to `errors`; problems that only
 * limit how the row is used go to `warnings`.
 *
 * @param {*} input - Parsed JSON value for one manifest line.
 * @returns {{record: object, errors: string[], warnings: string[]}} The
 *   normalized copy (an empty object when `input` is not a plain object)
 *   together with the findings.
 */
export function validateRecord(input) {
  const errors = [];
  const warnings = [];
  const isObject = input && typeof input === 'object' && !Array.isArray(input);
  const record = isObject ? { ...input } : {};

  if (!isObject) {
    errors.push('record must be a JSON object');
    return { record, errors, warnings };
  }

  for (const field of REQUIRED_FIELDS) {
    const value = record[field];
    // Whitespace-only strings are trimmed to '' further down, so they must be
    // treated as missing here; otherwise a row with a blank sample_id would be
    // accepted and would also bypass duplicate detection.
    const isBlankString = typeof value === 'string' && value.trim() === '';
    if (value === undefined || value === null || isBlankString) {
      errors.push(`missing required field: ${field}`);
    }
  }

  record.language = normalizeToken(record.language);
  if (record.language && !MATRIX.languages.includes(record.language)) {
    errors.push(`language must be one of ${MATRIX.languages.join(', ')}`);
  }

  record.class = canonicalize(record.class, CLASS_ALIASES);
  if (record.class && !MATRIX.classes.includes(record.class)) {
    errors.push(`class must be one of ${MATRIX.classes.join(', ')}`);
  }

  record.register = canonicalize(record.register, REGISTER_ALIASES);
  if (record.register && !MATRIX.registers.includes(record.register)) {
    errors.push(`register must be one of ${MATRIX.registers.join(', ')}`);
  }

  // Unknown model families are tolerated (warning only); they simply do not
  // count toward the claim gate.
  record.model_family = canonicalize(record.model_family, MODEL_FAMILY_ALIASES);
  if (record.model_family && !MATRIX.generatorFamilies.includes(record.model_family) && record.model_family !== 'human-reference') {
    warnings.push(`unrecognized model_family ${record.model_family}; it will not count toward the public claim gate`);
  }

  record.redistribution = normalizeToken(record.redistribution);
  if (record.redistribution && !TEXT_ALLOWED_REDIS.has(record.redistribution) && !TEXT_BLOCKED_REDIS.has(record.redistribution)) {
    warnings.push(`unrecognized redistribution ${record.redistribution}; text is treated as non-redistributable`);
  }

  if (typeof record.sample_id === 'string') record.sample_id = record.sample_id.trim();
  if (typeof record.provider === 'string') record.provider = record.provider.trim();
  if (typeof record.model === 'string') record.model = record.model.trim();
  if (typeof record.prompt_id === 'string') record.prompt_id = record.prompt_id.trim();

  if (record.generated_at && Number.isNaN(Date.parse(record.generated_at))) {
    errors.push('generated_at must be an ISO-like date or timestamp');
  }

  if (record.text_hash && !SHA256_RE.test(record.text_hash)) {
    errors.push('text_hash must use sha256:<64 lowercase hex>');
  }

  // Inline text is only acceptable under an allowed redistribution label, and
  // when present it must match the declared hash.
  if (hasText(record)) {
    if (!TEXT_ALLOWED_REDIS.has(record.redistribution)) {
      errors.push(`text is not allowed when redistribution=${record.redistribution || '<missing>'}`);
    }
    const observed = hashText(record.text);
    if (record.text_hash && observed !== record.text_hash) {
      errors.push(`text_hash mismatch: expected ${observed}`);
    }
  }

  if (typeof record.patina_score === 'number' && (record.patina_score < 0 || record.patina_score > 100)) {
    errors.push('patina_score must be between 0 and 100');
  }

  if (record.expected_hot !== undefined && typeof record.expected_hot !== 'boolean') {
    errors.push('expected_hot must be boolean when present');
  }
  if (record.predicted_hot !== undefined && typeof record.predicted_hot !== 'boolean') {
    errors.push('predicted_hot must be boolean when present');
  }
  // Exactly one of the two outcome fields present: the row cannot be scored.
  if ((record.expected_hot === undefined) !== (record.predicted_hot === undefined)) {
    warnings.push('expected_hot and predicted_hot should be recorded together for scored reports');
  }

  if (!isMetadataValue(record.decoding)) errors.push('decoding must be a non-empty object or string');
  if (!isMetadataValue(record.postprocess)) errors.push('postprocess must be a non-empty object or string');

  return { record, errors, warnings };
}
|
|
277
|
+
|
|
278
|
+
/**
 * Build the summary object for a list of validated manifest records.
 *
 * Produces per-dimension counts, protocol-matrix coverage, the public claim
 * gate evaluation, and outcome metrics for the rows that carry both
 * `expected_hot` and `predicted_hot` as booleans.
 *
 * @param {object[]} records - Validated, normalized manifest rows.
 * @param {{input?: string}} [options] - `input` is echoed into the summary.
 * @returns {object} Summary suitable for JSON serialization and for renderMarkdownReport.
 */
export function summarizeManifest(records, options = {}) {
  const countsTowardPositiveClaim = (record) =>
    POSITIVE_CLASSES.has(record.class) && MATRIX.generatorFamilies.includes(record.model_family);
  const isNaturalHuman = (record) => record.class === 'natural-human';
  const hasBothOutcomes = (record) =>
    typeof record.expected_hot === 'boolean' && typeof record.predicted_hot === 'boolean';

  const byLanguage = countBy(records, (record) => record.language);
  const byClass = countBy(records, (record) => record.class);
  const byRegister = countBy(records, (record) => record.register);
  const byModelFamily = countBy(records, (record) => record.model_family);
  const protocolCells = countBy(records, (record) => protocolCellKey(record));

  // Claim cells: positives are keyed by language and family, naturals by language only.
  const positiveClaimCells = countBy(
    records.filter(countsTowardPositiveClaim),
    (record) => `${record.language}|${record.model_family}`
  );
  const naturalClaimCells = countBy(records.filter(isNaturalHuman), (record) => record.language);
  const outcomeRecords = records.filter(hasBothOutcomes);

  return {
    schemaVersion: SCHEMA_VERSION,
    generatedAt: new Date().toISOString(),
    input: options.input || null,
    targets: TARGETS,
    totalRecords: records.length,
    byLanguage,
    byClass,
    byRegister,
    byModelFamily,
    protocolCoverage: summarizeProtocolCoverage(protocolCells),
    claimGate: evaluateClaimGate({ records, positiveClaimCells, naturalClaimCells, outcomeRecords }),
    metrics: summarizeOutcomes(outcomeRecords),
    catchByLanguageFamily: summarizeCatchByLanguageFamily(outcomeRecords),
    falsePositiveByLanguage: summarizeFalsePositiveByLanguage(outcomeRecords),
    metricsByRegister: summarizeOutcomesBy(outcomeRecords, (record) => record.register),
  };
}
|
|
314
|
+
|
|
315
|
+
/**
 * Render a manifest summary as a Markdown report.
 *
 * @param {object} summary - Summary object as produced by summarizeManifest.
 * @param {{errors?: string[], warnings?: string[]}} [validation] - Validation
 *   findings to list; any error switches the Validation section to FAIL.
 * @returns {string} Markdown text with trailing whitespace trimmed and a single final newline.
 */
export function renderMarkdownReport(summary, validation = {}) {
  // Lines of the "Validation" section: a PASS/FAIL banner, then errors and warnings as bullets.
  const validationLines = [];
  if (validation.errors?.length) {
    validationLines.push('Validation: **FAIL**');
    validationLines.push(...validation.errors.map((error) => `- ${escapeMarkdown(error)}`));
  } else {
    validationLines.push('Validation: **PASS**');
  }
  if (validation.warnings?.length) {
    validationLines.push('', 'Warnings:');
    validationLines.push(...validation.warnings.map((warning) => `- ${escapeMarkdown(warning)}`));
  }

  const claim = summary.claimGate;
  const metrics = summary.metrics;

  // The per-group outcome tables at the end are emitted only when at least one
  // row has complete outcome fields (metrics.total is non-zero).
  const markdown = `# Rebaseline Manifest Summary

- Generated at: ${summary.generatedAt}
- Input: ${summary.input ? `\`${escapeMarkdown(summary.input)}\`` : 'not recorded'}
- Records: ${summary.totalRecords}
- Protocol target: ${summary.targets.protocolPerLanguageClassRegister} samples per language × class × register cell
- Public claim target: ${summary.targets.claimPerCell} samples per claim cell, ${summary.targets.claimLanguages}+ languages, ${summary.targets.claimGeneratorFamilies}+ generator families

## Validation

${validationLines.join('\n')}

## Coverage snapshot

### By language

${renderCountTable(summary.byLanguage, MATRIX.languages)}

### By class

${renderCountTable(summary.byClass, MATRIX.classes)}

### By register

${renderCountTable(summary.byRegister, MATRIX.registers)}

### By model family

${renderCountTable(summary.byModelFamily, [...MATRIX.generatorFamilies, 'human-reference'])}

## Protocol matrix

- Populated language × class × register cells: ${summary.protocolCoverage.populatedCells}/${summary.protocolCoverage.totalCells}
- Cells meeting ${summary.targets.protocolPerLanguageClassRegister}+ samples: ${summary.protocolCoverage.cellsMeetingTarget}
- Empty cells: ${summary.protocolCoverage.emptyCells}
- Underfilled populated cells: ${summary.protocolCoverage.underfilledCells.length}

${renderUnderfilled(summary.protocolCoverage.underfilledCells, 12)}

## Public performance claim gate

Public performance claim: **${claim.ready ? 'READY' : 'BLOCKED'}**

${claim.blockers.length ? renderBlockerTable(claim.blockers) : 'Gate conditions met by this manifest.'}

${renderClaimGateStats(claim, metrics, summary.targets)}

## Outcome metrics

${metrics.total ? renderMetrics(metrics) : 'No complete `expected_hot` + `predicted_hot` outcome rows yet. This manifest is corpus metadata, not a benchmark claim.'}

${metrics.total ? `### Catch rate by language × model family\n\n${renderCatchByLanguageFamily(summary.catchByLanguageFamily)}\n\n### False-positive rate by language\n\n${renderFalsePositiveByLanguage(summary.falsePositiveByLanguage)}\n\n### By register\n\n${renderMetricsByRegister(summary.metricsByRegister)}` : ''}
`;
  // Drop the trailing blank lines left when the optional final section is empty.
  return `${markdown.trimEnd()}\n`;
}
|
|
386
|
+
|
|
387
|
+
/**
 * Write the Markdown and JSON reports for a summary.
 *
 * The output directory is resolved against REPO_ROOT and created if needed.
 * The JSON file holds the summary plus a `validation` object with the error
 * and warning lists.
 *
 * @param {object} summary - Summary object as produced by summarizeManifest.
 * @param {{errors?: string[], warnings?: string[]}} [validation] - Validation findings.
 * @param {{outputDir?: string, basename?: string}} [options] - Output location overrides.
 * @returns {{markdownPath: string, jsonPath: string, relativeMarkdownPath: string,
 *   relativeJsonPath: string}} Absolute and REPO_ROOT-relative paths of both files.
 * @throws {Error} When the basename contains characters other than letters,
 *   digits, `.`, `_` or `-`.
 */
export function writeReportFiles(summary, validation = {}, options = {}) {
  const outputDir = options.outputDir || DEFAULT_REPORT_DIR;
  const basename = options.basename || DEFAULT_REPORT_BASENAME;

  // The basename becomes part of a file name, so path separators are rejected.
  const basenameIsSafe = /^[a-z0-9._-]+$/iu.test(basename);
  if (!basenameIsSafe) {
    throw new Error(`Invalid report basename: ${basename}`);
  }

  const targetDir = resolve(REPO_ROOT, outputDir);
  mkdirSync(targetDir, { recursive: true });

  const reportMarkdown = renderMarkdownReport(summary, validation);
  const validationPayload = {
    errors: validation.errors || [],
    warnings: validation.warnings || [],
  };
  const reportJson = JSON.stringify({ ...summary, validation: validationPayload }, null, 2);

  const markdownPath = resolve(targetDir, `${basename}.md`);
  const jsonPath = resolve(targetDir, `${basename}.json`);
  writeFileSync(markdownPath, reportMarkdown);
  writeFileSync(jsonPath, `${reportJson}\n`);

  return {
    markdownPath,
    jsonPath,
    relativeMarkdownPath: relative(REPO_ROOT, markdownPath),
    relativeJsonPath: relative(REPO_ROOT, jsonPath),
  };
}
|
|
417
|
+
|
|
418
|
+
/**
 * Measure how well the language × class × register grid is filled.
 *
 * Every combination from MATRIX is looked up in `protocolCells`; keys that are
 * not part of the grid are ignored.
 *
 * @param {Object<string, number>} protocolCells - Sample counts keyed by `language|class|register`.
 * @returns {{totalCells: number, populatedCells: number, emptyCells: number,
 *   cellsMeetingTarget: number, underfilledCells: {key: string, count: number}[]}}
 *   Grid statistics; `underfilledCells` lists non-empty cells below the protocol target.
 */
function summarizeProtocolCoverage(protocolCells) {
  const gridKeys = MATRIX.languages.flatMap((language) =>
    MATRIX.classes.flatMap((cls) =>
      MATRIX.registers.map((register) => `${language}|${cls}|${register}`)
    )
  );

  const underfilledCells = [];
  let emptyCells = 0;
  let cellsMeetingTarget = 0;

  gridKeys.forEach((key) => {
    const count = protocolCells[key] || 0;
    if (count === 0) {
      emptyCells += 1;
      return;
    }
    if (count >= TARGETS.protocolPerLanguageClassRegister) {
      cellsMeetingTarget += 1;
      return;
    }
    underfilledCells.push({ key, count });
  });

  const totalCells = gridKeys.length;
  return {
    totalCells,
    populatedCells: totalCells - emptyCells,
    emptyCells,
    cellsMeetingTarget,
    underfilledCells,
  };
}
|
|
446
|
+
|
|
447
|
+
/**
 * Decide whether the manifest is large and complete enough for a public
 * performance claim.
 *
 * A claim cell qualifies when its count reaches TARGETS.claimPerCell. The gate
 * is blocked when too few languages or generator families have a qualified
 * positive cell, too few languages have a qualified natural cell, any record
 * lacks outcome fields, or there are no records at all.
 *
 * @param {object} params
 * @param {object[]} params.records - All validated records.
 * @param {Object<string, number>} params.positiveClaimCells - Counts keyed by `language|model_family`.
 * @param {Object<string, number>} params.naturalClaimCells - Counts keyed by language.
 * @param {object[]} params.outcomeRecords - Records with both outcome booleans.
 * @returns {{ready: boolean, blockers: string[],
 *   qualifiedPositiveCells: {key: string, count: number}[],
 *   qualifiedNaturalCells: {key: string, count: number}[]}} Gate verdict and supporting cells.
 */
function evaluateClaimGate({ records, positiveClaimCells, naturalClaimCells, outcomeRecords }) {
  const { claimPerCell, claimLanguages, claimGeneratorFamilies } = TARGETS;

  const selectQualified = (cells) => {
    const qualified = [];
    for (const [key, count] of Object.entries(cells)) {
      if (count >= claimPerCell) qualified.push({ key, count });
    }
    return qualified;
  };

  const qualifiedPositiveCells = selectQualified(positiveClaimCells);
  const qualifiedNaturalCells = selectQualified(naturalClaimCells);

  // Positive keys are `language|family`; natural keys are the bare language.
  const positiveLanguages = new Set();
  const positiveFamilies = new Set();
  for (const { key } of qualifiedPositiveCells) {
    const parts = key.split('|');
    positiveLanguages.add(parts[0]);
    positiveFamilies.add(parts[1]);
  }
  const naturalLanguages = new Set(qualifiedNaturalCells.map((cell) => cell.key));

  const hasRecords = records.length > 0;
  const outcomeComplete = hasRecords && outcomeRecords.length === records.length;

  const blockers = [];
  if (positiveLanguages.size < claimLanguages) {
    blockers.push(`positive corpus has ${positiveLanguages.size}/${claimLanguages} languages with n≥${claimPerCell}`);
  }
  if (positiveFamilies.size < claimGeneratorFamilies) {
    blockers.push(
      `positive corpus has ${positiveFamilies.size}/${claimGeneratorFamilies} generator families with n≥${claimPerCell}`
    );
  }
  if (naturalLanguages.size < claimLanguages) {
    blockers.push(`natural/human corpus has ${naturalLanguages.size}/${claimLanguages} languages with n≥${claimPerCell}`);
  }
  if (!outcomeComplete) {
    blockers.push('expected_hot and predicted_hot outcome rows are incomplete; run a scored report before README claims');
  }
  if (!hasRecords) {
    blockers.push('manifest has no records');
  }

  return {
    ready: blockers.length === 0,
    blockers,
    qualifiedPositiveCells,
    qualifiedNaturalCells,
  };
}
|
|
487
|
+
|
|
488
|
+
/**
 * Compute confusion-matrix counts and derived rates for scored records.
 *
 * `expected_hot` is the ground truth and `predicted_hot` the prediction. Any
 * rate whose denominator is zero is reported as 0.
 *
 * @param {object[]} records - Records carrying `expected_hot` and `predicted_hot`.
 * @returns {object} `tp`/`fp`/`fn`/`tn`/`total` counts, rounded `accuracy`,
 *   `precision`, `recall`, `f1`, `falsePositiveRate`, `falseNegativeRate`, and
 *   the `accuracyCi`, `recallCi`, `falsePositiveRateCi` values from wilsonInterval.
 */
function summarizeOutcomes(records) {
  const metrics = { tp: 0, fp: 0, fn: 0, tn: 0, total: records.length };
  for (const { predicted_hot: predicted, expected_hot: expected } of records) {
    if (predicted) {
      if (expected) metrics.tp += 1;
      else metrics.fp += 1;
    } else if (expected) {
      metrics.fn += 1;
    } else {
      metrics.tn += 1;
    }
  }

  // Division that yields 0 instead of NaN when the denominator is zero.
  const ratio = (numerator, denominator) => (denominator ? numerator / denominator : 0);

  const { tp, fp, fn, tn, total } = metrics;
  const correct = tp + tn;
  const positiveTotal = tp + fn;
  const naturalTotal = fp + tn;

  const accuracy = ratio(correct, total);
  const precision = ratio(tp, tp + fp);
  const recall = ratio(tp, positiveTotal);
  const f1 = ratio(2 * precision * recall, precision + recall);
  const falsePositiveRate = ratio(fp, naturalTotal);
  const falseNegativeRate = ratio(fn, positiveTotal);

  return {
    ...metrics,
    accuracy: round(accuracy),
    precision: round(precision),
    recall: round(recall),
    f1: round(f1),
    falsePositiveRate: round(falsePositiveRate),
    falseNegativeRate: round(falseNegativeRate),
    accuracyCi: wilsonInterval(correct, total),
    recallCi: wilsonInterval(tp, positiveTotal),
    falsePositiveRateCi: wilsonInterval(fp, naturalTotal),
  };
}
|
|
518
|
+
|
|
519
|
+
/**
 * Splits records into groups and summarizes each group's outcomes.
 *
 * @param {Array<object>} records - Outcome rows to partition.
 * @param {(record: object) => unknown} keyFn - Produces the group key; a falsy
 *   result places the record in the `'unknown'` group.
 * @returns {Object<string, object>} Map of group key to `summarizeOutcomes`
 *   output, with keys inserted in `localeCompare` order.
 */
function summarizeOutcomesBy(records, keyFn) {
  // Null-prototype accumulator: a data-derived key such as "constructor" or
  // "__proto__" must not resolve to an inherited Object.prototype member,
  // otherwise the `.push` below would throw.
  const groups = Object.create(null);
  for (const record of records) {
    const key = keyFn(record) || 'unknown';
    if (!groups[key]) groups[key] = [];
    groups[key].push(record);
  }

  return Object.fromEntries(
    Object.entries(groups)
      .sort(([a], [b]) => a.localeCompare(b))
      .map(([key, group]) => [key, summarizeOutcomes(group)])
  );
}
|
|
533
|
+
|
|
534
|
+
/**
 * Reports, per `language|model_family` cell, how many positive-class records
 * were flagged (`predicted_hot` truthy).
 *
 * Only records whose `class` is in `POSITIVE_CLASSES` are considered.
 *
 * @param {Array<object>} records - Outcome rows.
 * @returns {Object<string, object>} Map of `language|model_family` key to
 *   `{ language, modelFamily, n, caught, missed, catchRate, catchRateCi }`,
 *   keys inserted in `localeCompare` order.
 */
function summarizeCatchByLanguageFamily(records) {
  const positiveRecords = records.filter(({ class: recordClass }) => POSITIVE_CLASSES.has(recordClass));
  const cellsByKey = groupBy(
    positiveRecords,
    ({ language, model_family: family }) => `${language}|${family}`
  );

  const summarizeCell = ([cellKey, members]) => {
    // The key was built as `language|model_family`; take the first two segments back out.
    const [language, modelFamily] = cellKey.split('|');
    const n = members.length;
    const caught = members.filter(({ predicted_hot: predicted }) => predicted).length;
    const cell = {
      language,
      modelFamily,
      n,
      caught,
      missed: n - caught,
      catchRate: round(caught / n),
      catchRateCi: wilsonInterval(caught, n),
    };
    return [cellKey, cell];
  };

  const orderedEntries = Object.entries(cellsByKey).sort(([left], [right]) => left.localeCompare(right));
  return Object.fromEntries(orderedEntries.map(summarizeCell));
}
|
|
554
|
+
|
|
555
|
+
/**
 * Reports, per language, how many `natural-human` records were flagged
 * (`predicted_hot` truthy), i.e. false positives.
 *
 * @param {Array<object>} records - Outcome rows.
 * @returns {Object<string, object>} Map of language to
 *   `{ language, n, falsePositives, trueNegatives, falsePositiveRate, falsePositiveRateCi }`,
 *   keys inserted in `localeCompare` order.
 */
function summarizeFalsePositiveByLanguage(records) {
  const naturalRecords = records.filter(({ class: recordClass }) => recordClass === 'natural-human');
  const byLanguage = groupBy(naturalRecords, ({ language }) => language);

  const summarizeLanguage = ([language, members]) => {
    const n = members.length;
    const falsePositives = members.filter(({ predicted_hot: predicted }) => predicted).length;
    const cell = {
      language,
      n,
      falsePositives,
      trueNegatives: n - falsePositives,
      falsePositiveRate: round(falsePositives / n),
      falsePositiveRateCi: wilsonInterval(falsePositives, n),
    };
    return [language, cell];
  };

  const orderedEntries = Object.entries(byLanguage).sort(([left], [right]) => left.localeCompare(right));
  return Object.fromEntries(orderedEntries.map(summarizeLanguage));
}
|
|
574
|
+
|
|
575
|
+
/**
 * Computes a Wilson score interval for a binomial proportion.
 *
 * @param {number} successes - Number of successes observed.
 * @param {number} n - Number of trials; a falsy value yields a `[0, 0]` interval.
 * @param {number} [z=1.959963984540054] - Normal quantile used for the interval.
 * @returns {{low: number, high: number, method: string}} Bounds clamped to
 *   [0, 1] and rounded via `round`, plus a method label.
 */
function wilsonInterval(successes, n, z = 1.959963984540054) {
  // NOTE: the label is a fixed string; it reads "95%" even if a caller passes a different z.
  const method = 'Wilson score interval, 95%';
  if (!n) return { low: 0, high: 0, method };

  const zSquared = z ** 2;
  const phat = successes / n;
  const denom = 1 + zSquared / n;
  const center = (phat + zSquared / (2 * n)) / denom;
  const margin = (z * Math.sqrt((phat * (1 - phat) + zSquared / (4 * n)) / n)) / denom;

  const low = round(Math.max(0, center - margin));
  const high = round(Math.min(1, center + margin));
  return { low, high, method };
}
|
|
587
|
+
|
|
588
|
+
/**
 * Counts records per key.
 *
 * @param {Iterable<object>} records - Items to count.
 * @param {(record: object) => unknown} fn - Produces the bucket key; records
 *   with a falsy key are skipped.
 * @returns {Object<string, number>} Plain object mapping key to occurrence count.
 */
function countBy(records, fn) {
  // Accumulate on a null-prototype object so keys that collide with
  // Object.prototype members ("constructor", "toString", "__proto__") start
  // from an absent entry instead of an inherited value.
  const counts = Object.create(null);
  for (const record of records) {
    const key = fn(record);
    if (!key) continue;
    counts[key] = (counts[key] ?? 0) + 1;
  }
  // Copy into an ordinary object so callers receive the same shape as before.
  return { ...counts };
}
|
|
597
|
+
|
|
598
|
+
/**
 * Groups records into arrays per key, preserving encounter order within a group.
 *
 * @param {Iterable<object>} records - Items to group.
 * @param {(record: object) => unknown} fn - Produces the group key; records
 *   with a falsy key are skipped.
 * @returns {Object<string, Array<object>>} Plain object mapping key to its records.
 */
function groupBy(records, fn) {
  // Accumulate on a null-prototype object: with a plain `{}` a key such as
  // "constructor" or "__proto__" resolves to an inherited member, the bucket
  // is never created, and `.push` throws.
  const groups = Object.create(null);
  for (const record of records) {
    const key = fn(record);
    if (!key) continue;
    if (!groups[key]) groups[key] = [];
    groups[key].push(record);
  }
  // Copy into an ordinary object so callers receive the same shape as before.
  return { ...groups };
}
|
|
608
|
+
|
|
609
|
+
/**
 * Builds the composite key identifying a record's protocol cell.
 *
 * @param {object} record - Row with `language`, `class` and `register` fields.
 * @returns {string} The three fields joined as `language|class|register`.
 */
function protocolCellKey(record) {
  const { language, class: recordClass, register } = record;
  return `${language}|${recordClass}|${register}`;
}
|
|
612
|
+
|
|
613
|
+
/**
 * Trims and lower-cases string values; any non-string value is returned untouched.
 *
 * @param {unknown} value - Candidate token.
 * @returns {unknown} Normalized string, or the original value when it is not a string.
 */
function normalizeToken(value) {
  if (typeof value !== 'string') return value;
  return value.trim().toLowerCase();
}
|
|
616
|
+
|
|
617
|
+
/**
 * Normalizes a token and maps it through an alias table.
 *
 * @param {unknown} value - Raw value; non-strings pass through unchanged.
 * @param {{get: (key: string) => unknown}} aliases - Lookup (e.g. a Map) from
 *   normalized alias to canonical value.
 * @returns {unknown} The alias target when the lookup returns a truthy value,
 *   otherwise the normalized token itself.
 */
function canonicalize(value, aliases) {
  const token = normalizeToken(value);
  if (typeof token === 'string') {
    const canonical = aliases.get(token);
    return canonical || token;
  }
  return token;
}
|
|
622
|
+
|
|
623
|
+
/**
 * Tells whether a record carries a non-empty string in its `text` field.
 *
 * @param {object} record - Row to inspect.
 * @returns {boolean} `true` only when `record.text` is a string of length > 0.
 */
function hasText(record) {
  const { text } = record;
  if (typeof text !== 'string') return false;
  return text.length > 0;
}
|
|
626
|
+
|
|
627
|
+
/**
 * Checks whether a value counts as populated metadata.
 *
 * Accepted: a string with non-whitespace content, or a non-array object with
 * at least one own enumerable key. Everything else is rejected.
 *
 * @param {unknown} value - Candidate metadata value.
 * @returns {boolean} Whether the value is considered populated.
 */
function isMetadataValue(value) {
  if (typeof value === 'string') {
    return value.trim().length > 0;
  }
  const isNonArrayObject = value !== null && typeof value === 'object' && !Array.isArray(value);
  if (!isNonArrayObject) return false;
  return Object.keys(value).length > 0;
}
|
|
632
|
+
|
|
633
|
+
/**
 * Renders a two-column Markdown table of counts.
 *
 * Rows follow `preferredOrder` first, then any remaining keys of `counts` in
 * sorted order; keys absent from `counts` are shown with 0.
 *
 * @param {Object<string, number>} counts - Count per value.
 * @param {Iterable<string>} preferredOrder - Keys to list first.
 * @returns {string} Markdown table text.
 */
function renderCountTable(counts, preferredOrder) {
  const candidates = [...preferredOrder, ...Object.keys(counts).sort()];
  const lines = ['| value | n |', '|---|---:|'];
  // A Set keeps first-seen order while dropping keys listed in both sources.
  for (const key of new Set(candidates)) {
    lines.push(`| ${escapeMarkdown(key)} | ${counts[key] || 0} |`);
  }
  return lines.join('\n');
}
|
|
638
|
+
|
|
639
|
+
/**
 * Renders underfilled protocol cells as a Markdown table, truncated to `limit` rows.
 *
 * @param {Array<{key: string, count: number}>} cells - Underfilled cells; `key`
 *   segments are `|`-separated and shown joined with ` × `.
 * @param {number} limit - Maximum number of rows to print.
 * @returns {string} A table (plus an "omitted" footnote when truncated), or a
 *   fixed sentence when there are no cells.
 */
function renderUnderfilled(cells, limit) {
  if (cells.length === 0) return 'No underfilled populated protocol cells.';

  const lines = ['| cell | n |', '|---|---:|'];
  for (const { key, count } of cells.slice(0, limit)) {
    lines.push(`| ${escapeMarkdown(key.replaceAll('|', ' × '))} | ${count} |`);
  }
  const table = lines.join('\n');

  if (cells.length > limit) {
    return `${table}\n\n_${cells.length - limit} more underfilled cells omitted._`;
  }
  return table;
}
|
|
647
|
+
|
|
648
|
+
/**
 * Renders claim-gate blockers as a single-column Markdown table.
 *
 * @param {Array<string>} blockers - Blocker messages, one per row.
 * @returns {string} Markdown table text (header only when there are no blockers).
 */
function renderBlockerTable(blockers) {
  const header = ['| blocker |', '|---|'];
  const body = blockers.map((blocker) => `| ${escapeMarkdown(blocker)} |`);
  return header.concat(body).join('\n');
}
|
|
655
|
+
|
|
656
|
+
/**
 * Renders the claim-gate counters as a Markdown table.
 *
 * @param {{qualifiedPositiveCells: Array, qualifiedNaturalCells: Array}} claim - Claim-gate result.
 * @param {{total: number}} metrics - Outcome metrics; only `total` is read.
 * @param {{claimPerCell: number}} targets - Thresholds; only `claimPerCell` is read.
 * @returns {string} Markdown table text with three data rows.
 */
function renderClaimGateStats(claim, metrics, targets) {
  const perCell = targets.claimPerCell;
  const entries = [
    [`qualified positive cells (language × generator family, n≥${perCell})`, claim.qualifiedPositiveCells.length],
    [`qualified natural-language cells (language, n≥${perCell})`, claim.qualifiedNaturalCells.length],
    ['outcome rows with expected/predicted labels', metrics.total],
  ];

  const lines = ['| claim-gate count | value |', '|---|---:|'];
  for (const [label, value] of entries) {
    lines.push(`| ${label} | ${value} |`);
  }
  return lines.join('\n');
}
|
|
665
|
+
|
|
666
|
+
/**
 * Renders overall outcome metrics as a Markdown table.
 *
 * @param {object} metrics - Object carrying the rate fields, the `accuracyCi` /
 *   `recallCi` / `falsePositiveRateCi` intervals and the `tp`/`fp`/`fn`/`tn` counts.
 * @returns {string} Markdown table text.
 */
function renderMetrics(metrics) {
  // Formats an interval as "low–high" percentages.
  const range = (ci) => `${pct(ci.low)}–${pct(ci.high)}`;

  const entries = [
    ['accuracy', pct(metrics.accuracy)],
    ['accuracy CI', range(metrics.accuracyCi)],
    ['precision', pct(metrics.precision)],
    ['recall', pct(metrics.recall)],
    ['recall CI', range(metrics.recallCi)],
    ['F1', metrics.f1.toFixed(3)],
    ['false positive rate', pct(metrics.falsePositiveRate)],
    ['false positive rate CI', range(metrics.falsePositiveRateCi)],
    ['false negative rate', pct(metrics.falseNegativeRate)],
    ['TP/FP/FN/TN', `${metrics.tp}/${metrics.fp}/${metrics.fn}/${metrics.tn}`],
  ];

  const lines = ['| metric | value |', '|---|---:|'];
  for (const [label, value] of entries) {
    lines.push(`| ${label} | ${value} |`);
  }
  return lines.join('\n');
}
|
|
682
|
+
|
|
683
|
+
/**
 * Renders per-register outcome metrics as a Markdown table.
 *
 * Registers from `MATRIX.registers` come first, followed by any other keys of
 * the input in sorted order; registers without metrics are left out.
 *
 * @param {Object<string, object>} [metricsByRegister={}] - Metrics keyed by register.
 * @returns {string} Markdown table text, or a fixed sentence when no register has metrics.
 */
function renderMetricsByRegister(metricsByRegister = {}) {
  const candidates = [...MATRIX.registers, ...Object.keys(metricsByRegister).sort()];
  const rows = [];
  for (const register of new Set(candidates)) {
    const metrics = metricsByRegister[register];
    if (!metrics) continue;
    const confusion = `${metrics.tp}/${metrics.fp}/${metrics.fn}/${metrics.tn}`;
    rows.push(
      `| ${escapeMarkdown(register)} | ${metrics.total} | ${pct(metrics.falsePositiveRate)} | ${pct(metrics.falseNegativeRate)} | ${confusion} |`
    );
  }

  if (rows.length === 0) return 'No register-level outcome rows yet.';

  const header = ['| register | n | FP rate | FN rate | TP/FP/FN/TN |', '|---|---:|---:|---:|---:|'];
  return header.concat(rows).join('\n');
}
|
|
698
|
+
|
|
699
|
+
/**
 * Renders catch rates per language × model family as a Markdown table.
 *
 * @param {Object<string, object>} [cells={}] - Cells with `language`,
 *   `modelFamily`, `n`, `catchRate`, `catchRateCi`, `caught` and `missed`.
 * @returns {string} Markdown table sorted by `language|modelFamily`, or a fixed
 *   sentence when there are no cells.
 */
function renderCatchByLanguageFamily(cells = {}) {
  const sortKey = ({ language, modelFamily }) => `${language}|${modelFamily}`;
  const ordered = Object.values(cells).sort((left, right) => sortKey(left).localeCompare(sortKey(right)));

  const rows = ordered.map((cell) => {
    const interval = `${pct(cell.catchRateCi.low)}–${pct(cell.catchRateCi.high)}`;
    return `| ${escapeMarkdown(cell.language)} | ${escapeMarkdown(cell.modelFamily)} | ${cell.n} | ${pct(cell.catchRate)} | ${interval} | ${cell.caught}/${cell.missed} |`;
  });

  if (rows.length === 0) return 'No positive outcome rows yet.';

  const header = [
    '| language | model family | n | catch rate | 95% CI | caught/missed |',
    '|---|---|---:|---:|---:|---:|',
  ];
  return header.concat(rows).join('\n');
}
|
|
710
|
+
|
|
711
|
+
/**
 * Renders false-positive rates per language as a Markdown table.
 *
 * @param {Object<string, object>} [cells={}] - Cells with `language`, `n`,
 *   `falsePositiveRate`, `falsePositiveRateCi`, `falsePositives` and `trueNegatives`.
 * @returns {string} Markdown table sorted by language, or a fixed sentence when
 *   there are no cells.
 */
function renderFalsePositiveByLanguage(cells = {}) {
  const ordered = Object.values(cells).sort((left, right) => left.language.localeCompare(right.language));

  const rows = ordered.map((cell) => {
    const interval = `${pct(cell.falsePositiveRateCi.low)}–${pct(cell.falsePositiveRateCi.high)}`;
    return `| ${escapeMarkdown(cell.language)} | ${cell.n} | ${pct(cell.falsePositiveRate)} | ${interval} | ${cell.falsePositives}/${cell.trueNegatives} |`;
  });

  if (rows.length === 0) return 'No natural-human outcome rows yet.';

  const header = ['| language | n | false-positive rate | 95% CI | FP/TN |', '|---|---:|---:|---:|---:|'];
  return header.concat(rows).join('\n');
}
|
|
722
|
+
|
|
723
|
+
/**
 * Formats a fraction as a percentage with one decimal place.
 *
 * @param {number} value - Fraction (e.g. 0.25); falsy values are treated as 0.
 * @returns {string} Text such as `"25.0%"`.
 */
function pct(value) {
  const percentage = (value || 0) * 100;
  return `${percentage.toFixed(1)}%`;
}
|
|
726
|
+
|
|
727
|
+
/**
 * Rounds a number to a fixed count of decimal places using `Math.round`.
 *
 * @param {number} n - Value to round.
 * @param {number} [digits=3] - Decimal places to keep.
 * @returns {number} The rounded value.
 */
function round(n, digits = 3) {
  const scale = 10 ** digits;
  return Math.round(n * scale) / scale;
}
|
|
730
|
+
|
|
731
|
+
/**
 * Makes a value safe to place inside a single Markdown table cell.
 *
 * `null`/`undefined` render as an em dash, pipes are backslash-escaped, and
 * every line break is flattened to one space.
 *
 * @param {unknown} value - Value to render; coerced with `String`.
 * @returns {string} Single-line, pipe-escaped text.
 */
function escapeMarkdown(value) {
  return String(value ?? '—')
    .replace(/\|/gu, '\\|')
    // Match CRLF and lone CR as well as LF: a carriage return left behind is
    // still a line ending in Markdown and would split the table row.
    .replace(/\r\n|\r|\n/gu, ' ');
}
|
|
734
|
+
|
|
735
|
+
/**
 * Prints CLI usage text to stdout, including the default report location and
 * default input path taken from the module-level `DEFAULT_*` constants.
 */
function printHelp() {
  const helpLines = [
    'Usage: node scripts/rebaseline-summary.mjs [--input <manifest.jsonl>] [--json] [--write] [--require-claim-ready]',
    '',
    'Validates a 2025+ rebaseline JSONL manifest and prints coverage/claim-gate status.',
    `Use --write to refresh ${DEFAULT_REPORT_DIR}/${DEFAULT_REPORT_BASENAME}.{md,json}.`,
    `Default input: ${DEFAULT_INPUT}`,
  ];
  console.log(helpLines.join('\n'));
}
|
|
742
|
+
|
|
743
|
+
/**
 * CLI entry point.
 *
 * Parses arguments, loads and summarizes the manifest, optionally writes the
 * report files (`--write`), and prints either JSON (`--json`) or the Markdown
 * report to stdout.
 *
 * Exit status: 1 when the manifest has validation errors; otherwise 2 when
 * `--require-claim-ready` was given and the claim gate is not ready; otherwise
 * the process default.
 */
function main() {
  const args = parseArgs();
  if (args.help) {
    printHelp();
    return;
  }

  const manifest = loadManifest(args.input);
  const summary = summarizeManifest(manifest.records, { input: manifest.relativePath });
  const validation = { errors: manifest.errors, warnings: manifest.warnings };
  const written = args.write
    ? writeReportFiles(summary, validation, { outputDir: args.outputDir, basename: args.basename })
    : null;

  if (args.json) {
    console.log(JSON.stringify({ ...summary, validation, written }, null, 2));
  } else {
    console.log(renderMarkdownReport(summary, validation));
    if (written) {
      console.log(`Wrote ${written.relativeMarkdownPath}`);
      console.log(`Wrote ${written.relativeJsonPath}`);
    }
  }

  // Set process.exitCode instead of calling process.exit(): exit() terminates
  // immediately and can cut off report output that is still being written to
  // a piped stdout. Validation errors take precedence over the claim gate.
  if (manifest.errors.length) {
    process.exitCode = 1;
    return;
  }
  if (args.requireClaimReady && !summary.claimGate.ready) {
    process.exitCode = 2;
  }
}
|
|
770
|
+
|
|
771
|
+
// Run the CLI only when this file is the script handed to node, i.e. when
// process.argv[1] resolves to this module's own path; importing the module
// does not trigger main().
if (process.argv[1] && fileURLToPath(import.meta.url) === resolve(process.argv[1])) main();
|