@archsight/aios 1.1.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97)
  1. package/.claude-plugin/marketplace.json +60 -0
  2. package/.claude-plugin/plugin.json +36 -0
  3. package/CHANGELOG.md +93 -30
  4. package/OPENCODE.md +23 -0
  5. package/README.md +106 -48
  6. package/RELEASE_NOTES.md +52 -0
  7. package/adapters/README.md +7 -0
  8. package/adapters/workbuddy/README.md +43 -0
  9. package/agents/README.md +6 -3
  10. package/agents/daedalus/system-prompt.md +2 -0
  11. package/agents/hestia/constraints.md +7 -0
  12. package/agents/hestia/responsibilities.md +7 -0
  13. package/agents/hestia/role.md +12 -0
  14. package/agents/hestia/system-prompt.md +23 -0
  15. package/agents/hestia/workflow.md +8 -0
  16. package/agents/plutus/constraints.md +7 -0
  17. package/agents/plutus/responsibilities.md +7 -0
  18. package/agents/plutus/role.md +12 -0
  19. package/agents/plutus/system-prompt.md +24 -0
  20. package/agents/plutus/workflow.md +8 -0
  21. package/agents/themis/constraints.md +7 -0
  22. package/agents/themis/responsibilities.md +7 -0
  23. package/agents/themis/role.md +12 -0
  24. package/agents/themis/system-prompt.md +24 -0
  25. package/agents/themis/workflow.md +8 -0
  26. package/bin/archsight-aios.mjs +605 -31
  27. package/docs/PUBLIC_DISCOVERY.md +207 -0
  28. package/docs/business-expert-guide.md +5 -3
  29. package/docs/glossary.md +11 -3
  30. package/docs/quickstart.md +18 -4
  31. package/gemini-extension.json +6 -0
  32. package/package.json +66 -34
  33. package/prompts/README.md +12 -0
  34. package/prompts/evaluation-policy.md +70 -0
  35. package/prompts/evaluations/engineering-business-basic-advisory-validation-2026-06-16.md +87 -0
  36. package/prompts/evaluations/engineering-business-basic-fixtures.json +375 -0
  37. package/prompts/evaluations/engineering-business-basic-model-output.example.json +179 -0
  38. package/prompts/evaluations/engineering-business-basic-prompts-2026-06-16.md +205 -0
  39. package/prompts/evaluations/engineering-business-basic-scorecard.json +238 -0
  40. package/prompts/evaluations/engineering-business-public-advisory-fixtures.json +422 -0
  41. package/prompts/evaluations/public-advisory-md/01-technical-bid.md +63 -0
  42. package/prompts/evaluations/public-advisory-md/02-contract.md +61 -0
  43. package/prompts/evaluations/public-advisory-md/03-daily.md +69 -0
  44. package/prompts/evaluations/public-advisory-md/04-meeting.md +48 -0
  45. package/prompts/evaluations/public-advisory-md/05-variation.md +63 -0
  46. package/prompts/evaluations/public-advisory-md/06-scheme.md +60 -0
  47. package/prompts/failure-cases.md +5 -1
  48. package/prompts/prompt-registry.md +10 -0
  49. package/runtime/agent-routing.md +36 -8
  50. package/runtime/archsight-aios.manifest.json +207 -60
  51. package/runtime/capability-registry.json +12 -2
  52. package/runtime/hermes/agent-registry.md +3 -0
  53. package/runtime/hermes/workspace-binding.md +3 -0
  54. package/runtime/skill-routing.md +16 -2
  55. package/scripts/analyze-prompt-run-results.mjs +187 -0
  56. package/scripts/build-prompt-run-pack.mjs +248 -0
  57. package/scripts/validate-prompt-fixtures.mjs +225 -0
  58. package/scripts/validate-prompt-model-outputs.mjs +201 -0
  59. package/scripts/validate-prompt-run-results.mjs +259 -0
  60. package/scripts/validate-prompt-scorecard.mjs +133 -0
  61. package/scripts/validate-skills.mjs +138 -0
  62. package/skills/README.md +16 -0
  63. package/skills/aios-commercial-contract/SKILL.md +107 -0
  64. package/skills/aios-commercial-contract/agents/openai.yaml +4 -0
  65. package/skills/aios-commercial-contract/prompts/basic-prompt.md +83 -0
  66. package/skills/aios-commercial-tender/SKILL.md +107 -0
  67. package/skills/aios-commercial-tender/agents/openai.yaml +4 -0
  68. package/skills/aios-commercial-tender/prompts/basic-prompt.md +94 -0
  69. package/skills/aios-commercial-variation/SKILL.md +106 -0
  70. package/skills/aios-commercial-variation/agents/openai.yaml +4 -0
  71. package/skills/aios-commercial-variation/prompts/basic-prompt.md +99 -0
  72. package/skills/aios-construction-daily/SKILL.md +104 -0
  73. package/skills/aios-construction-daily/agents/openai.yaml +4 -0
  74. package/skills/aios-construction-daily/prompts/basic-prompt.md +76 -0
  75. package/skills/aios-construction-meeting/SKILL.md +104 -0
  76. package/skills/aios-construction-meeting/agents/openai.yaml +4 -0
  77. package/skills/aios-construction-meeting/prompts/basic-prompt.md +78 -0
  78. package/skills/aios-construction-scheme/SKILL.md +97 -0
  79. package/skills/aios-construction-scheme/agents/openai.yaml +4 -0
  80. package/skills/aios-construction-scheme/prompts/basic-prompt.md +90 -0
  81. package/skills/aios-prompt-compare/SKILL.md +178 -0
  82. package/skills/aios-prompt-compare/agents/openai.yaml +4 -0
  83. package/skills/engineering-business-starter-kit.md +109 -0
  84. package/templates/README.md +16 -2
  85. package/templates/project-ai/.ai/ARCHSIGHT_AIOS_RULES.md +5 -4
  86. package/templates/project-ai/.ai/agent-routing.md +3 -1
  87. package/templates/project-ai/.ai/profile-detection.md +24 -0
  88. package/templates/project-ai/.ai/project-context.md +4 -1
  89. package/templates/project-ai/.ai/skills.md +31 -12
  90. package/templates/project-ai/.ai/workflows.md +7 -4
  91. package/templates/project-ai/AGENTS.md +6 -5
  92. package/templates/project-ai/AI_CODING_RULES.md +1 -1
  93. package/templates/project-ai/CLAUDE.md +6 -5
  94. package/templates/project-ai/GEMINI.md +6 -5
  95. package/templates/project-ai/OPENCODE.md +26 -0
  96. package/workflows/README.md +3 -0
  97. package/workflows/site-daily-loop.md +101 -0
@@ -0,0 +1,201 @@
1
+ #!/usr/bin/env node
2
+
3
+ import fs from "node:fs";
4
+ import path from "node:path";
5
+
6
+ const root = fs.realpathSync(process.cwd());
7
+ const errors = [];
8
+
9
+ const defaultOutputPath = "prompts/evaluations/engineering-business-basic-model-output.example.json";
10
+ const args = parseArgs(process.argv.slice(2));
11
+ const outputPath = args.file ?? repoPath(defaultOutputPath);
12
+ const fixturePath = repoPath("prompts/evaluations/engineering-business-basic-fixtures.json");
13
+ const fixture = readJson(fixturePath);
14
+ const outputFile = args.init ? undefined : readJson(outputPath);
15
+
16
+ const sensitiveTerms = [
17
+ "立信",
18
+ "费敏",
19
+ "闻总",
20
+ "谭总",
21
+ "茅盾中学",
22
+ "鸿益",
23
+ "太鑫",
24
+ "飞双",
25
+ "魔毯",
26
+ "客户内部",
27
+ "培训演示",
28
+ "基础内测",
29
+ "内测模式",
30
+ "内测包"
31
+ ];
32
+
33
/**
 * Join path segments onto the repository root and refuse results that leave it.
 *
 * @param {...string} parts - Path segments, interpreted relative to `root`.
 * @returns {string} The joined path located inside `root`.
 * @throws {Error} When the joined path resolves outside `root`.
 */
function repoPath(...parts) {
  const target = path.join(root, ...parts);
  const relative = path.relative(root, target);
  // Only a leading ".." *segment* escapes the root. A bare prefix test would
  // also reject legitimate in-root names such as "..cache/file.json".
  const escapesRoot = relative === ".." || relative.startsWith(`..${path.sep}`);
  if (escapesRoot || path.isAbsolute(relative)) {
    throw new Error(`Path traversal detected: ${target}`);
  }
  return target;
}
41
+
42
/**
 * Parse the command-line flags `--file <path>`, `--init <path>` and `--force`.
 *
 * Problems are appended to the shared `errors` list rather than thrown, so the
 * script can report every issue through its normal failure output.
 *
 * @param {string[]} argv - Arguments after the node binary and script path.
 * @returns {{file: (string|undefined), init: (string|undefined), force: boolean}}
 *   Paths are resolved through `repoPath`; a rejected path stays `undefined`.
 */
function parseArgs(argv) {
  const parsed = {
    file: undefined,
    init: undefined,
    force: false
  };

  // repoPath throws on traversal; convert that into a reported error instead
  // of letting it surface as an uncaught exception with a stack trace.
  const resolvePathArg = (flag, value) => {
    try {
      return repoPath(value);
    } catch (error) {
      errors.push(`${flag}: ${error.message}`);
      return undefined;
    }
  };

  for (let index = 0; index < argv.length; index += 1) {
    const arg = argv[index];
    if (arg === "--file" || arg === "--init") {
      const value = argv[index + 1];
      if (!value) {
        errors.push(`${arg} requires a path`);
      } else {
        // "--file" -> parsed.file, "--init" -> parsed.init
        parsed[arg.slice(2)] = resolvePathArg(arg, value);
        index += 1; // the value has been consumed
      }
    } else if (arg === "--force") {
      parsed.force = true;
    } else {
      errors.push(`Unknown argument: ${arg}`);
    }
  }

  // --init short-circuits validation, so combining it with --file would
  // silently ignore the file; reject the combination explicitly.
  if (parsed.file && parsed.init) {
    errors.push("Use only one mode: --file or --init");
  }

  return parsed;
}
76
+
77
/**
 * Read and parse a JSON file, recording a failure in `errors` instead of throwing.
 *
 * Read failures (missing file, permissions) and parse failures are reported
 * with different messages so a missing file is not mislabelled as bad JSON.
 *
 * @param {string} filePath - Path of the JSON file to load.
 * @returns {*} The parsed value, or `undefined` when reading or parsing failed.
 */
function readJson(filePath) {
  const label = path.relative(root, filePath);

  let raw;
  try {
    raw = fs.readFileSync(filePath, "utf8");
  } catch (error) {
    errors.push(`${label}: cannot read file (${error.message})`);
    return undefined;
  }

  try {
    return JSON.parse(raw);
  } catch (error) {
    errors.push(`${label}: invalid JSON (${error.message})`);
    return undefined;
  }
}
85
+
86
/**
 * Record `message` as a validation failure unless `condition` is truthy.
 *
 * @param {*} condition - Value evaluated for truthiness.
 * @param {string} message - Text appended to `errors` when the check fails.
 */
function check(condition, message) {
  if (condition) {
    return;
  }
  errors.push(message);
}
89
+
90
/**
 * List every entry of `sensitiveTerms` that occurs in `value`.
 *
 * @param {*} value - A string, or any value that is searched via its JSON text.
 * @returns {string[]} The matching terms, in `sensitiveTerms` order (empty when none).
 */
function includesSensitiveTerm(value) {
  // JSON.stringify returns undefined (not a string) for undefined, functions
  // and symbols; fall back to "" so the search below cannot throw.
  const raw = typeof value === "string" ? value : (JSON.stringify(value) ?? "");
  return sensitiveTerms.filter((term) => raw.includes(term));
}
94
+
95
/**
 * Flatten a model output into one string for substring checks.
 *
 * @param {*} value - Expected to be a string or an array of strings.
 * @returns {string} The string itself, the array joined with newlines, or ""
 *   for anything else — including arrays that contain non-string entries, so
 *   numbers or objects cannot pass as a "non-empty" output.
 */
function outputText(value) {
  if (typeof value === "string") return value;
  if (Array.isArray(value) && value.every((line) => typeof line === "string")) {
    return value.join("\n");
  }
  return "";
}
100
+
101
/**
 * Write a blank model-output template, one entry per fixture case, to `args.init`.
 *
 * Does nothing when the fixture failed to load. Refuses to overwrite an
 * existing file unless `args.force` is set, recording the refusal in `errors`.
 */
function createOutputTemplate() {
  if (!fixture) return;

  const destination = args.init;
  // Computed on demand so the path is only formatted where a message needs it.
  const shownPath = () => path.relative(root, destination);

  if (!args.force && fs.existsSync(destination)) {
    errors.push(`${shownPath()} already exists; pass --force to overwrite`);
    return;
  }

  const promptVersion = fixture.version ?? "0.1";

  // One empty slot per fixture case; the case's expectations are copied in as
  // a reference for whoever fills in the outputs.
  const blankEntry = (testCase) => ({
    caseId: testCase.id,
    promptVersion,
    model: "",
    ranAt: "",
    notes: "",
    promptPath: testCase.promptPath,
    scenario: testCase.scenario,
    expectedSections: testCase.expectedStrongSections,
    bannedClaims: testCase.bannedClaims,
    output: []
  });

  const template = {
    schema: 1,
    name: "engineering-business-basic-model-output-run",
    version: promptVersion,
    fixture: "prompts/evaluations/engineering-business-basic-fixtures.json",
    isExample: false,
    dataBoundary:
      "Fill this file with de-identified model outputs only. Do not include customer names, contacts, project names, exact amounts, or raw source documents.",
    outputs: (fixture.cases ?? []).map(blankEntry)
  };

  const serialized = `${JSON.stringify(template, null, 2)}\n`;
  fs.mkdirSync(path.dirname(destination), { recursive: true });
  fs.writeFileSync(destination, serialized, "utf8");
  console.log(`Prompt model output template written: ${shownPath()}`);
}
135
+
136
// --init mode: write a blank output template, report any problem, and stop.
// No existing output file is validated in this mode.
if (args.init) {
  if (errors.length === 0) createOutputTemplate();
  if (errors.length > 0) {
    console.error(`Prompt model output validation failed with ${errors.length} error(s):`);
    for (const error of errors) console.error(`- ${error}`);
    process.exit(1);
  }
  process.exit(0);
}

// Validation mode: compare the model output file against the fixture cases.
if (fixture && outputFile) {
  check(outputFile.schema === 1, "model output file: schema must be 1");
  check(typeof outputFile.version === "string" && outputFile.version.length > 0, "model output file: version must be a string");
  check(Array.isArray(outputFile.outputs), "model output file: outputs must be an array");
  check(
    outputFile.fixture === "prompts/evaluations/engineering-business-basic-fixtures.json",
    "model output file: fixture path mismatch"
  );

  // The file under validation is untrusted input: a non-array `outputs` or a
  // non-object entry must become a reported error, not a TypeError.
  const isRecord = (value) => value !== null && typeof value === "object";
  const fixtureCases = Array.isArray(fixture.cases) ? fixture.cases : [];
  const outputs = Array.isArray(outputFile.outputs) ? outputFile.outputs : [];

  const casesById = new Map(fixtureCases.map((item) => [item.id, item]));
  const expectedIds = [...casesById.keys()].sort();
  const actualIds = outputs.map((item) => (isRecord(item) ? item.caseId : undefined)).sort();

  check(JSON.stringify(actualIds) === JSON.stringify(expectedIds), "model output file: case coverage mismatch");

  const seenIds = new Set();
  for (const [index, item] of outputs.entries()) {
    if (!isRecord(item)) {
      errors.push(`model output item ${index}: must be an object`);
      continue;
    }

    check(typeof item.caseId === "string" && item.caseId.length > 0, "model output item: missing caseId");
    check(!seenIds.has(item.caseId), `${item.caseId}: duplicate model output`);
    seenIds.add(item.caseId);

    const sourceCase = casesById.get(item.caseId);
    check(Boolean(sourceCase), `${item.caseId}: caseId not found in fixtures`);

    check(typeof item.promptVersion === "string" && item.promptVersion.length > 0, `${item.caseId}: missing promptVersion`);
    check(item.promptVersion === fixture.version, `${item.caseId}: promptVersion must match fixture version ${fixture.version}`);
    check(typeof item.model === "string" && item.model.length > 0, `${item.caseId}: missing model`);
    check(typeof item.ranAt === "string" && item.ranAt.length > 0, `${item.caseId}: missing ranAt`);
    check(!Number.isNaN(Date.parse(item.ranAt)), `${item.caseId}: ranAt must be a parseable timestamp`);
    check(typeof item.notes === "string", `${item.caseId}: notes must be a string`);
    // Only files flagged as examples may keep the placeholder model name.
    if (outputFile.isExample !== true) {
      check(item.model !== "example-skeleton", `${item.caseId}: non-example output must use a real model identifier`);
    }

    const text = outputText(item.output);
    check(text.length > 0, `${item.caseId}: output must be a non-empty string or string array`);

    // Scan the whole entry (notes, model, output, ...), not just the output text.
    const sensitiveHits = includesSensitiveTerm(item);
    check(sensitiveHits.length === 0, `${item.caseId}: sensitive terms leaked (${sensitiveHits.join(", ")})`);

    // Expectations come from the fixture, not from the file being validated.
    for (const section of sourceCase?.expectedStrongSections ?? []) {
      check(text.includes(section), `${item.caseId}: output missing expected section "${section}"`);
    }
    for (const bannedClaim of sourceCase?.bannedClaims ?? []) {
      check(!text.includes(bannedClaim), `${item.caseId}: output contains prohibited claim "${bannedClaim}"`);
    }
  }
}

if (errors.length > 0) {
  console.error(`Prompt model output validation failed with ${errors.length} error(s):`);
  for (const error of errors) console.error(`- ${error}`);
  process.exit(1);
}

console.log("Prompt model output validation passed.");
@@ -0,0 +1,259 @@
1
+ #!/usr/bin/env node
2
+
3
+ import fs from "node:fs";
4
+ import path from "node:path";
5
+
6
+ const root = fs.realpathSync(process.cwd());
7
+ const errors = [];
8
+ const diagnostics = [];
9
+
10
+ const fixturePath = repoPath("prompts/evaluations/engineering-business-basic-fixtures.json");
11
+ const fixture = readJson(fixturePath);
12
+ const args = parseArgs(process.argv.slice(2));
13
+
14
+ const sensitiveTerms = [
15
+ "立信",
16
+ "费敏",
17
+ "闻总",
18
+ "谭总",
19
+ "茅盾中学",
20
+ "鸿益",
21
+ "太鑫",
22
+ "飞双",
23
+ "魔毯",
24
+ "客户内部",
25
+ "培训演示",
26
+ "基础内测",
27
+ "内测模式",
28
+ "内测包"
29
+ ];
30
+
31
/**
 * Join path segments onto the repository root and refuse results that leave it.
 *
 * @param {...string} parts - Path segments, interpreted relative to `root`.
 * @returns {string} The joined path located inside `root`.
 * @throws {Error} When the joined path resolves outside `root`.
 */
function repoPath(...parts) {
  const target = path.join(root, ...parts);
  const relative = path.relative(root, target);
  // Only a leading ".." *segment* escapes the root. A bare prefix test would
  // also reject legitimate in-root names such as "..cache/file.json".
  const escapesRoot = relative === ".." || relative.startsWith(`..${path.sep}`);
  if (escapesRoot || path.isAbsolute(relative)) {
    throw new Error(`Path traversal detected: ${target}`);
  }
  return target;
}
39
+
40
/**
 * Parse the command-line flags `--file <path>`, `--init <path>`, `--force`
 * and `--check-template`.
 *
 * Problems are appended to the shared `errors` list rather than thrown, so the
 * script can report every issue through its normal failure output.
 *
 * @param {string[]} argv - Arguments after the node binary and script path.
 * @returns {{file: (string|undefined), init: (string|undefined), force: boolean, checkTemplate: boolean}}
 *   `checkTemplate` defaults to true when no arguments are given. Paths are
 *   resolved through `repoPath`; a rejected path stays `undefined`.
 */
function parseArgs(argv) {
  const parsed = {
    file: undefined,
    init: undefined,
    force: false,
    checkTemplate: argv.length === 0
  };

  // repoPath throws on traversal; convert that into a reported error instead
  // of letting it surface as an uncaught exception with a stack trace.
  const resolvePathArg = (flag, value) => {
    try {
      return repoPath(value);
    } catch (error) {
      errors.push(`${flag}: ${error.message}`);
      return undefined;
    }
  };

  for (let index = 0; index < argv.length; index += 1) {
    const arg = argv[index];
    if (arg === "--file" || arg === "--init") {
      const value = argv[index + 1];
      if (!value) {
        errors.push(`${arg} requires a path`);
      } else {
        // "--file" -> parsed.file, "--init" -> parsed.init
        parsed[arg.slice(2)] = resolvePathArg(arg, value);
        index += 1; // the value has been consumed
      }
    } else if (arg === "--force") {
      parsed.force = true;
    } else if (arg === "--check-template") {
      parsed.checkTemplate = true;
    } else {
      errors.push(`Unknown argument: ${arg}`);
    }
  }

  // The three modes are mutually exclusive.
  if ([parsed.file, parsed.init, parsed.checkTemplate].filter(Boolean).length > 1) {
    errors.push("Use only one mode: --file, --init, or --check-template");
  }

  return parsed;
}
81
+
82
/**
 * Read and parse a JSON file, recording a failure in `errors` instead of throwing.
 *
 * Read failures (missing file, permissions) and parse failures are reported
 * with different messages so a missing file is not mislabelled as bad JSON.
 *
 * @param {string} filePath - Path of the JSON file to load.
 * @returns {*} The parsed value, or `undefined` when reading or parsing failed.
 */
function readJson(filePath) {
  const label = path.relative(root, filePath);

  let raw;
  try {
    raw = fs.readFileSync(filePath, "utf8");
  } catch (error) {
    errors.push(`${label}: cannot read file (${error.message})`);
    return undefined;
  }

  try {
    return JSON.parse(raw);
  } catch (error) {
    errors.push(`${label}: invalid JSON (${error.message})`);
    return undefined;
  }
}
90
+
91
/**
 * Append `message` to `errors` when `condition` is falsy.
 *
 * @param {*} condition - Value evaluated for truthiness.
 * @param {string} message - Failure text to record.
 */
function check(condition, message) {
  const failed = !condition;
  if (failed) {
    errors.push(message);
  }
}
94
+
95
/**
 * List every entry of `sensitiveTerms` that occurs in `value`.
 *
 * @param {*} value - A string, or any value that is searched via its JSON text.
 * @returns {string[]} The matching terms, in `sensitiveTerms` order (empty when none).
 */
function includesSensitiveTerm(value) {
  // JSON.stringify returns undefined (not a string) for undefined, functions
  // and symbols; fall back to "" so the search below cannot throw.
  const raw = typeof value === "string" ? value : (JSON.stringify(value) ?? "");
  return sensitiveTerms.filter((term) => raw.includes(term));
}
99
+
100
/**
 * Flatten a model output into one string for substring checks.
 *
 * @param {*} value - Expected to be a string or an array of strings.
 * @returns {string} The string itself, the array joined with newlines, or ""
 *   for anything else — including arrays that contain non-string entries, so
 *   numbers or objects cannot pass as a "non-empty" output.
 */
function outputText(value) {
  if (typeof value === "string") return value;
  if (Array.isArray(value) && value.every((line) => typeof line === "string")) {
    return value.join("\n");
  }
  return "";
}
105
+
106
/**
 * Build the list of runs the fixture implies: a "weak" and a "basic" run for
 * every fixture case, in that order.
 *
 * @returns {object[]} Run descriptors; empty when the fixture failed to load.
 */
function expectedRuns() {
  if (!fixture) {
    return [];
  }

  // Both variants share the case's expectations and differ only in the run id
  // suffix, the variant label and where the prompt comes from.
  const describeRun = (testCase, variant, promptSource) => ({
    runId: `${testCase.id}::${variant}`,
    caseId: testCase.id,
    variant,
    promptSource,
    promptVersion: fixture.version,
    expectedStrongSections: testCase.expectedStrongSections,
    bannedClaims: testCase.bannedClaims,
    weakFailureModes: testCase.weakFailureModes
  });

  const cases = fixture.cases ?? [];
  return cases.flatMap((testCase) => [
    describeRun(testCase, "weak", "fixture.weakPrompt"),
    describeRun(testCase, "basic", testCase.promptPath)
  ]);
}
132
+
133
/**
 * Build a blank run-results document with one empty slot per expected run.
 *
 * @returns {object} The template; `version` falls back to "0.1" when the
 *   fixture (or its version) is unavailable.
 */
function createTemplate() {
  // Copy each run descriptor and add the fields a person fills in afterwards.
  const toBlankOutput = ({
    runId,
    caseId,
    variant,
    promptVersion,
    promptSource,
    expectedStrongSections,
    bannedClaims,
    weakFailureModes
  }) => ({
    runId,
    caseId,
    variant,
    promptVersion,
    model: "",
    ranAt: "",
    notes: "",
    promptSource,
    expectedStrongSections,
    bannedClaims,
    weakFailureModes,
    output: []
  });

  const template = {
    schema: 1,
    name: "engineering-business-basic-run-results",
    version: fixture?.version ?? "0.1",
    fixture: "prompts/evaluations/engineering-business-basic-fixtures.json",
    runPack: "prompts/evaluations/engineering-business-basic-run-pack.generated.json",
    isExample: false,
    dataBoundary:
      "Fill this file with de-identified weak/basic model outputs only. Do not include customer names, contacts, project names, exact amounts, or raw source documents.",
    outputs: expectedRuns().map(toBlankOutput)
  };
  return template;
}
159
+
160
/**
 * Check the structural parts of a run-results document against the runs the
 * fixture implies: header fields, run coverage, and per-run identity fields.
 * Every problem is recorded through `check` / `errors`; nothing is thrown for
 * malformed documents.
 *
 * @param {object} template - Parsed run-results document (or a fresh template).
 */
function validateTemplateShape(template) {
  // Computed once; the expected run list does not change during validation.
  const expected = expectedRuns();
  const expectedByRunId = new Map(expected.map((item) => [item.runId, item]));

  // A document loaded from disk is untrusted: treat a non-array `outputs` as
  // empty (it is reported below) and skip entries that are not objects.
  const hasOutputList = Array.isArray(template.outputs);
  const outputs = hasOutputList ? template.outputs : [];
  const isRecord = (value) => value !== null && typeof value === "object";

  check(template.schema === 1, "run results: schema must be 1");
  check(template.version === fixture?.version, `run results: version must match fixture version ${fixture?.version}`);
  check(template.fixture === "prompts/evaluations/engineering-business-basic-fixtures.json", "run results: fixture path mismatch");
  check(hasOutputList, "run results: outputs must be an array");
  check(hasOutputList && outputs.length === expected.length, "run results: output count must match weak/basic run count");

  const sensitiveHits = includesSensitiveTerm(template);
  check(sensitiveHits.length === 0, `run results: sensitive terms leaked (${sensitiveHits.join(", ")})`);

  const actualRunIds = outputs.map((item) => (isRecord(item) ? item.runId : undefined)).sort();
  const expectedRunIds = [...expectedByRunId.keys()].sort();
  check(JSON.stringify(actualRunIds) === JSON.stringify(expectedRunIds), "run results: runId coverage mismatch");

  const seen = new Set();
  for (const [index, item] of outputs.entries()) {
    if (!isRecord(item)) {
      errors.push(`run results: outputs[${index}] must be an object`);
      continue;
    }

    const expectedRun = expectedByRunId.get(item.runId);
    check(!seen.has(item.runId), `${item.runId}: duplicate output`);
    seen.add(item.runId);
    check(Boolean(expectedRun), `${item.runId}: runId not found in expected run pack`);
    check(item.caseId === expectedRun?.caseId, `${item.runId}: caseId mismatch`);
    check(item.variant === expectedRun?.variant, `${item.runId}: variant mismatch`);
    check(item.promptVersion === expectedRun?.promptVersion, `${item.runId}: promptVersion mismatch`);
    check(item.promptSource === expectedRun?.promptSource, `${item.runId}: promptSource mismatch`);
    check(Array.isArray(item.expectedStrongSections), `${item.runId}: expectedStrongSections must be an array`);
    check(Array.isArray(item.bannedClaims), `${item.runId}: bannedClaims must be an array`);
    check(Array.isArray(item.weakFailureModes), `${item.runId}: weakFailureModes must be an array`);
  }
}
190
+
191
/**
 * Validate a filled-in run-results document: structural checks first, then
 * per-run metadata and output content.
 *
 * Failures on "basic" runs are recorded in `errors`. Shortcomings of other
 * (weak) runs are recorded in `diagnostics` only.
 *
 * @param {object} results - Parsed run-results document.
 */
function validateRunResults(results) {
  validateTemplateShape(results);

  // Expected sections and banned claims are taken from the fixture-derived run
  // list. The copies stored in the results file are editable by whoever filled
  // it in, so trusting them would let a file weaken its own checks.
  const expectedByRunId = new Map(expectedRuns().map((run) => [run.runId, run]));
  const asList = (value) => (Array.isArray(value) ? value : []);
  const outputs = asList(results.outputs);

  for (const item of outputs) {
    // Entries that are not objects cannot be inspected field by field.
    if (item === null || typeof item !== "object") continue;

    check(typeof item.model === "string" && item.model.length > 0, `${item.runId}: missing model`);
    check(item.model !== "example-skeleton", `${item.runId}: model must be a real model identifier`);
    check(typeof item.ranAt === "string" && item.ranAt.length > 0, `${item.runId}: missing ranAt`);
    check(!Number.isNaN(Date.parse(item.ranAt)), `${item.runId}: ranAt must be a parseable timestamp`);
    check(typeof item.notes === "string", `${item.runId}: notes must be a string`);

    const text = outputText(item.output);
    check(text.length > 0, `${item.runId}: output must be a non-empty string or string array`);

    const sensitiveHits = includesSensitiveTerm(item);
    check(sensitiveHits.length === 0, `${item.runId}: sensitive terms leaked (${sensitiveHits.join(", ")})`);

    // Fall back to the entry's own lists only when the runId is unknown.
    const expected = expectedByRunId.get(item.runId);
    const requiredSections = asList(expected?.expectedStrongSections ?? item.expectedStrongSections);
    const bannedClaims = asList(expected?.bannedClaims ?? item.bannedClaims);
    const variant = expected?.variant ?? item.variant;

    const missingSections = requiredSections.filter((section) => !text.includes(section));
    const prohibitedClaims = bannedClaims.filter((claim) => text.includes(claim));

    if (variant === "basic") {
      for (const section of missingSections) {
        errors.push(`${item.runId}: basic output missing expected section "${section}"`);
      }
      for (const claim of prohibitedClaims) {
        errors.push(`${item.runId}: basic output contains prohibited claim "${claim}"`);
      }
    } else if (missingSections.length > 0 || prohibitedClaims.length > 0) {
      diagnostics.push({
        runId: item.runId,
        missingSections,
        prohibitedClaims
      });
    }
  }
}
226
+
227
// Mode dispatch. Exactly one of the three branches runs:
//   --init <path>  : build a blank template, self-check it, write it out
//   --file <path>  : validate a filled-in results file
//   otherwise      : self-check the generated template only
if (args.init) {
  const blankTemplate = createTemplate();
  validateTemplateShape(blankTemplate);

  const wouldOverwrite = !args.force && fs.existsSync(args.init);
  if (wouldOverwrite) {
    errors.push(`${path.relative(root, args.init)} already exists; pass --force to overwrite`);
  }

  if (errors.length === 0) {
    const serialized = `${JSON.stringify(blankTemplate, null, 2)}\n`;
    fs.mkdirSync(path.dirname(args.init), { recursive: true });
    fs.writeFileSync(args.init, serialized, "utf8");
    console.log(`Prompt run results template written: ${path.relative(root, args.init)}`);
    process.exit(0);
  }
} else if (args.file) {
  // readJson records its own error when the file cannot be loaded.
  const results = readJson(args.file);
  if (results) {
    validateRunResults(results);
  }
} else {
  validateTemplateShape(createTemplate());
  if (errors.length === 0) {
    console.log("Prompt run results template validation passed.");
    process.exit(0);
  }
}

// Shared failure report for every mode that did not exit above.
if (errors.length > 0) {
  console.error(`Prompt run results validation failed with ${errors.length} error(s):`);
  for (const error of errors) {
    console.error(`- ${error}`);
  }
  process.exit(1);
}

console.log("Prompt run results validation passed.");
// Weak-variant findings never fail the run; they are only counted here.
if (diagnostics.length > 0) {
  console.log(`Weak output diagnostics: ${diagnostics.length} run(s) need comparison review.`);
}
@@ -0,0 +1,133 @@
1
+ #!/usr/bin/env node
2
+
3
+ import fs from "node:fs";
4
+ import path from "node:path";
5
+
6
+ const root = fs.realpathSync(process.cwd());
7
+ const errors = [];
8
+
9
+ const fixturePath = repoPath("prompts/evaluations/engineering-business-basic-fixtures.json");
10
+ const scorecardPath = repoPath("prompts/evaluations/engineering-business-basic-scorecard.json");
11
+ const fixture = readJson(fixturePath);
12
+ const scorecard = readJson(scorecardPath);
13
+
14
+ const sensitiveTerms = [
15
+ "立信",
16
+ "费敏",
17
+ "闻总",
18
+ "谭总",
19
+ "茅盾中学",
20
+ "鸿益",
21
+ "太鑫",
22
+ "飞双",
23
+ "魔毯",
24
+ "客户内部",
25
+ "培训演示",
26
+ "基础内测",
27
+ "内测模式",
28
+ "内测包"
29
+ ];
30
+
31
/**
 * Join path segments onto the repository root and refuse results that leave it.
 *
 * @param {...string} parts - Path segments, interpreted relative to `root`.
 * @returns {string} The joined path located inside `root`.
 * @throws {Error} When the joined path resolves outside `root`.
 */
function repoPath(...parts) {
  const target = path.join(root, ...parts);
  const relative = path.relative(root, target);
  // Only a leading ".." *segment* escapes the root. A bare prefix test would
  // also reject legitimate in-root names such as "..cache/file.json".
  const escapesRoot = relative === ".." || relative.startsWith(`..${path.sep}`);
  if (escapesRoot || path.isAbsolute(relative)) {
    throw new Error(`Path traversal detected: ${target}`);
  }
  return target;
}
39
+
40
/**
 * Read and parse a JSON file, recording a failure in `errors` instead of throwing.
 *
 * Read failures (missing file, permissions) and parse failures are reported
 * with different messages so a missing file is not mislabelled as bad JSON.
 *
 * @param {string} filePath - Path of the JSON file to load.
 * @returns {*} The parsed value, or `undefined` when reading or parsing failed.
 */
function readJson(filePath) {
  const label = path.relative(root, filePath);

  let raw;
  try {
    raw = fs.readFileSync(filePath, "utf8");
  } catch (error) {
    errors.push(`${label}: cannot read file (${error.message})`);
    return undefined;
  }

  try {
    return JSON.parse(raw);
  } catch (error) {
    errors.push(`${label}: invalid JSON (${error.message})`);
    return undefined;
  }
}
48
+
49
/**
 * Note a validation failure: when `condition` is falsy, `message` is added to `errors`.
 *
 * @param {*} condition - Value evaluated for truthiness.
 * @param {string} message - Failure text to record.
 */
function check(condition, message) {
  if (condition) {
    return;
  }

  errors.push(message);
}
52
+
53
/**
 * List every entry of `sensitiveTerms` that occurs in `value`.
 *
 * @param {*} value - A string, or any value that is searched via its JSON text.
 * @returns {string[]} The matching terms, in `sensitiveTerms` order (empty when none).
 */
function includesSensitiveTerm(value) {
  // JSON.stringify returns undefined (not a string) for undefined, functions
  // and symbols; fall back to "" so the search below cannot throw.
  const raw = typeof value === "string" ? value : (JSON.stringify(value) ?? "");
  return sensitiveTerms.filter((term) => raw.includes(term));
}
57
+
58
/**
 * Compute the weight-averaged score over all criteria.
 *
 * @param {Object<string, number>} scores - Score per criterion id.
 * @param {Array<{id: string, weight: number}>} criteria - Criteria with weights.
 * @returns {number} Sum of `score * weight` divided by the total weight, or
 *   `NaN` when it cannot be computed (missing scores, missing or empty
 *   criteria, zero total weight). Malformed input never throws.
 */
function weightedScore(scores, criteria) {
  const list = Array.isArray(criteria) ? criteria : [];

  let totalWeight = 0;
  let weightedSum = 0;
  for (const criterion of list) {
    const weight = criterion?.weight;
    totalWeight += weight;
    // A missing score set or missing score yields NaN, which propagates.
    weightedSum += scores?.[criterion?.id] * weight;
  }

  // 0 / 0 is NaN, which covers the empty-criteria case.
  return weightedSum / totalWeight;
}
66
+
67
// Validate the scorecard against the fixture. Every problem is recorded via
// check()/errors so malformed scorecards produce a report, not a TypeError.
if (fixture && scorecard) {
  const isRecord = (value) => value !== null && typeof value === "object";
  const fixtureCases = Array.isArray(fixture.cases) ? fixture.cases : [];
  const criteria = Array.isArray(scorecard.criteria) ? scorecard.criteria : [];
  const scoredCases = Array.isArray(scorecard.cases) ? scorecard.cases : [];

  check(scorecard.schema === 1, "scorecard: schema must be 1");
  check(scorecard.version === fixture.version, `scorecard: version must match fixture version ${fixture.version}`);
  check(scorecard.fixture === "prompts/evaluations/engineering-business-basic-fixtures.json", "scorecard: fixture path mismatch");
  check(criteria.length > 0, "scorecard: criteria must be a non-empty array");
  check(Array.isArray(scorecard.cases) && scoredCases.length === fixtureCases.length, "scorecard: case count mismatch");
  // Without a numeric threshold every improvement comparison below is false.
  check(
    typeof scorecard.minimumWeightedDelta === "number" && Number.isFinite(scorecard.minimumWeightedDelta),
    "scorecard: minimumWeightedDelta must be a finite number"
  );

  const sensitiveHits = includesSensitiveTerm(scorecard);
  check(sensitiveHits.length === 0, `scorecard: sensitive terms leaked (${sensitiveHits.join(", ")})`);

  const criteriaIds = new Set();
  const validCriteria = [];
  let totalWeight = 0;
  for (const [index, criterion] of criteria.entries()) {
    if (!isRecord(criterion)) {
      errors.push(`scorecard: criteria[${index}] must be an object`);
      continue;
    }
    validCriteria.push(criterion);
    // RegExp.prototype.test stringifies its argument, so a missing id would
    // be tested as "undefined" and pass; require a real string first.
    check(
      typeof criterion.id === "string" && /^[a-z0-9_]+$/.test(criterion.id),
      `${criterion.id ?? "unknown"}: invalid criterion id`
    );
    check(!criteriaIds.has(criterion.id), `${criterion.id}: duplicate criterion`);
    criteriaIds.add(criterion.id);
    const weightOk = Number.isInteger(criterion.weight) && criterion.weight > 0;
    check(weightOk, `${criterion.id}: weight must be a positive integer`);
    // Only add well-formed weights; adding a string would concatenate.
    totalWeight += weightOk ? criterion.weight : 0;
    check(typeof criterion.description === "string" && criterion.description.length > 0, `${criterion.id}: missing description`);
  }
  check(totalWeight === 100, `scorecard: criteria weights must total 100, got ${totalWeight}`);

  const fixtureById = new Map(fixtureCases.map((item) => [item.id, item]));
  const expectedIds = [...fixtureById.keys()].sort();
  const actualIds = scoredCases.map((item) => (isRecord(item) ? item.caseId : undefined)).sort();
  check(JSON.stringify(actualIds) === JSON.stringify(expectedIds), "scorecard: case coverage mismatch");

  const expectedScoreIds = JSON.stringify([...criteriaIds].sort());
  for (const [index, item] of scoredCases.entries()) {
    if (!isRecord(item)) {
      errors.push(`scorecard: cases[${index}] must be an object`);
      continue;
    }

    const sourceCase = fixtureById.get(item.caseId);
    check(Boolean(sourceCase), `${item.caseId}: caseId not found in fixtures`);
    check(item.winner === "basic", `${item.caseId}: winner must be basic`);
    check(typeof item.decisionBasis === "string" && item.decisionBasis.length > 0, `${item.caseId}: missing decisionBasis`);
    check(Array.isArray(item.basicPromptGains) && item.basicPromptGains.length > 0, `${item.caseId}: basicPromptGains must be non-empty`);
    check(
      JSON.stringify(item.observedWeakFailures ?? []) === JSON.stringify(sourceCase?.weakFailureModes ?? []),
      `${item.caseId}: observedWeakFailures must match fixture weakFailureModes`
    );

    for (const scoreSetName of ["weakScores", "basicScores"]) {
      const scoreSet = isRecord(item[scoreSetName]) ? item[scoreSetName] : {};
      const scoreIds = Object.keys(scoreSet).sort();
      check(JSON.stringify(scoreIds) === expectedScoreIds, `${item.caseId}: ${scoreSetName} coverage mismatch`);
      for (const criterionId of criteriaIds) {
        const value = scoreSet[criterionId];
        check(Number.isInteger(value) && value >= 1 && value <= 5, `${item.caseId}: ${scoreSetName}.${criterionId} must be 1-5`);
      }
    }

    // Pass an empty score set when one is missing so the total becomes NaN
    // (and fails the comparison) instead of raising inside weightedScore.
    const weakTotal = weightedScore(item.weakScores ?? {}, validCriteria);
    const basicTotal = weightedScore(item.basicScores ?? {}, validCriteria);
    check(
      basicTotal - weakTotal >= scorecard.minimumWeightedDelta,
      `${item.caseId}: weighted improvement ${Number(basicTotal - weakTotal).toFixed(2)} is below minimum ${scorecard.minimumWeightedDelta}`
    );
  }

  check(scorecard.overallDecision?.winner === "basic", "scorecard: overall winner must be basic");
  check(typeof scorecard.overallDecision?.notAClaim === "string", "scorecard: overallDecision.notAClaim is required");
}

if (errors.length > 0) {
  console.error(`Prompt scorecard validation failed with ${errors.length} error(s):`);
  for (const error of errors) console.error(`- ${error}`);
  process.exit(1);
}

console.log("Prompt scorecard validation passed.");