@archsight/aios 1.2.0 → 1.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +1 -1
- package/CHANGELOG.md +59 -0
- package/OPENCODE.md +23 -0
- package/README.md +64 -31
- package/RELEASE_NOTES.md +37 -0
- package/adapters/workbuddy/README.md +11 -1
- package/agents/README.md +6 -3
- package/agents/atlas/responsibilities.md +1 -1
- package/agents/atlas/system-prompt.md +1 -1
- package/agents/daedalus/system-prompt.md +2 -0
- package/agents/hestia/constraints.md +7 -0
- package/agents/hestia/responsibilities.md +7 -0
- package/agents/hestia/role.md +12 -0
- package/agents/hestia/system-prompt.md +23 -0
- package/agents/hestia/workflow.md +8 -0
- package/agents/plutus/constraints.md +7 -0
- package/agents/plutus/responsibilities.md +7 -0
- package/agents/plutus/role.md +12 -0
- package/agents/plutus/system-prompt.md +24 -0
- package/agents/plutus/workflow.md +8 -0
- package/agents/themis/constraints.md +7 -0
- package/agents/themis/responsibilities.md +7 -0
- package/agents/themis/role.md +12 -0
- package/agents/themis/system-prompt.md +24 -0
- package/agents/themis/workflow.md +8 -0
- package/bin/archsight-aios.mjs +558 -25
- package/docs/PUBLIC_DISCOVERY.md +16 -2
- package/docs/business-expert-guide.md +5 -3
- package/docs/glossary.md +11 -3
- package/docs/quickstart.md +18 -4
- package/gemini-extension.json +1 -1
- package/governance/README.md +41 -11
- package/governance/agent-boundary.md +1 -2
- package/governance/ai-review-policy.md +1 -2
- package/governance/arbitration-protocol.md +33 -33
- package/governance/context-policy.md +2 -3
- package/governance/delivery-policy.md +1 -2
- package/governance/memory-policy.md +1 -2
- package/governance/security-policy.md +1 -2
- package/memory/decision-records.md +8 -9
- package/package.json +17 -6
- package/prompts/README.md +12 -0
- package/prompts/evaluation-policy.md +70 -0
- package/prompts/evaluations/engineering-business-basic-advisory-validation-2026-06-16.md +87 -0
- package/prompts/evaluations/engineering-business-basic-fixtures.json +375 -0
- package/prompts/evaluations/engineering-business-basic-model-output.example.json +179 -0
- package/prompts/evaluations/engineering-business-basic-prompts-2026-06-16.md +205 -0
- package/prompts/evaluations/engineering-business-basic-scorecard.json +238 -0
- package/prompts/evaluations/engineering-business-public-advisory-fixtures.json +422 -0
- package/prompts/evaluations/public-advisory-md/01-technical-bid.md +63 -0
- package/prompts/evaluations/public-advisory-md/02-contract.md +61 -0
- package/prompts/evaluations/public-advisory-md/03-daily.md +69 -0
- package/prompts/evaluations/public-advisory-md/04-meeting.md +48 -0
- package/prompts/evaluations/public-advisory-md/05-variation.md +63 -0
- package/prompts/evaluations/public-advisory-md/06-scheme.md +60 -0
- package/prompts/failure-cases.md +5 -1
- package/prompts/prompt-registry.md +10 -0
- package/runtime/agent-routing.md +39 -9
- package/runtime/archsight-aios.manifest.json +154 -51
- package/runtime/hermes/agent-registry.md +3 -0
- package/runtime/hermes/workspace-binding.md +3 -0
- package/runtime/skill-routing.md +23 -12
- package/scripts/analyze-prompt-run-results.mjs +187 -0
- package/scripts/build-prompt-run-pack.mjs +248 -0
- package/scripts/validate-prompt-fixtures.mjs +225 -0
- package/scripts/validate-prompt-model-outputs.mjs +201 -0
- package/scripts/validate-prompt-run-results.mjs +259 -0
- package/scripts/validate-prompt-scorecard.mjs +133 -0
- package/scripts/validate-skills.mjs +8 -3
- package/skills/README.md +12 -6
- package/skills/aios/SKILL.md +79 -0
- package/skills/aios/agents/openai.yaml +4 -0
- package/skills/aios-arch/SKILL.md +14 -14
- package/skills/aios-ceo/SKILL.md +13 -13
- package/skills/aios-commercial-contract/SKILL.md +32 -14
- package/skills/aios-commercial-contract/prompts/basic-prompt.md +83 -0
- package/skills/aios-commercial-tender/SKILL.md +31 -13
- package/skills/aios-commercial-tender/prompts/basic-prompt.md +94 -0
- package/skills/aios-commercial-variation/SKILL.md +33 -15
- package/skills/aios-commercial-variation/prompts/basic-prompt.md +99 -0
- package/skills/aios-compare/SKILL.md +92 -0
- package/skills/aios-compare/agents/openai.yaml +4 -0
- package/skills/aios-construction-daily/SKILL.md +32 -14
- package/skills/aios-construction-daily/prompts/basic-prompt.md +76 -0
- package/skills/aios-construction-meeting/SKILL.md +32 -14
- package/skills/aios-construction-meeting/prompts/basic-prompt.md +78 -0
- package/skills/aios-construction-scheme/SKILL.md +28 -10
- package/skills/aios-construction-scheme/prompts/basic-prompt.md +90 -0
- package/skills/aios-plan/SKILL.md +7 -7
- package/skills/aios-prompt-compare/SKILL.md +180 -0
- package/skills/aios-prompt-compare/agents/openai.yaml +4 -0
- package/skills/aios-review/SKILL.md +1 -1
- package/skills/aios-structural/SKILL.md +7 -7
- package/skills/archsight-aios/SKILL.md +40 -0
- package/skills/archsight-aios/agents/openai.yaml +4 -0
- package/skills/engineering-business-starter-kit.md +112 -0
- package/templates/README.md +16 -2
- package/templates/project-ai/.ai/ARCHSIGHT_AIOS_RULES.md +5 -4
- package/templates/project-ai/.ai/agent-routing.md +3 -1
- package/templates/project-ai/.ai/profile-detection.md +24 -0
- package/templates/project-ai/.ai/project-context.md +4 -1
- package/templates/project-ai/.ai/skills.md +36 -24
- package/templates/project-ai/AGENTS.md +6 -5
- package/templates/project-ai/AI_CODING_RULES.md +1 -1
- package/templates/project-ai/CLAUDE.md +6 -5
- package/templates/project-ai/GEMINI.md +6 -5
- package/templates/project-ai/OPENCODE.md +26 -0
- package/workflows/README.md +1 -1
- package/workflows/architecture-review.md +10 -10
- package/workflows/site-daily-loop.md +25 -25
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
import fs from "node:fs";
|
|
4
|
+
import path from "node:path";
|
|
5
|
+
|
|
6
|
+
// Working directory with symlinks resolved; repoPath() measures every path against it.
const root = fs.realpathSync(process.cwd());
// Problems are accumulated here and reported together at the end of the script.
const errors = [];

// Resolve both evaluation inputs first, then parse them, then read the CLI flags.
const [fixturePath, scorecardPath] = [
  "prompts/evaluations/engineering-business-basic-fixtures.json",
  "prompts/evaluations/engineering-business-basic-scorecard.json"
].map((relativePath) => repoPath(relativePath));
const [fixture, scorecard] = [fixturePath, scorecardPath].map((filePath) => readJson(filePath));
const args = parseArgs(process.argv.slice(2));
|
|
14
|
+
|
|
15
|
+
/**
 * Join path segments onto the repository root and refuse results that leave it.
 *
 * @param {...string} parts - Path segments appended to `root`.
 * @returns {string} The joined path.
 * @throws {Error} When the joined path resolves outside `root`.
 */
function repoPath(...parts) {
  const target = path.join(root, ...parts);
  const relative = path.relative(root, target);
  // Only a leading ".." *segment* leaves the root. A name that merely begins
  // with two dots (for example "..cache/file.json") is still inside it.
  const escapesRoot = relative === ".." || relative.startsWith(`..${path.sep}`);
  if (escapesRoot || path.isAbsolute(relative)) {
    throw new Error(`Path traversal detected: ${target}`);
  }
  return target;
}
|
|
23
|
+
|
|
24
|
+
/**
 * Parse the command line of the analysis script.
 *
 * Recognizes `--file <path>` (required) and `--out <path>`; both values are
 * passed through repoPath(). Problems are appended to `errors` rather than thrown.
 *
 * @param {string[]} argv - Arguments after the script name.
 * @returns {{file: (string|undefined), out: (string|undefined)}} Parsed options.
 */
function parseArgs(argv) {
  const parsed = { file: undefined, out: undefined };

  let cursor = 0;
  while (cursor < argv.length) {
    const token = argv[cursor];
    const operand = argv[cursor + 1];

    switch (token) {
      case "--file":
        if (operand) {
          parsed.file = repoPath(operand);
          cursor += 1; // the operand has been consumed
        } else {
          errors.push("--file requires a path");
        }
        break;
      case "--out":
        if (operand) {
          parsed.out = repoPath(operand);
          cursor += 1; // the operand has been consumed
        } else {
          errors.push("--out requires a path");
        }
        break;
      default:
        errors.push(`Unknown argument: ${token}`);
    }

    cursor += 1;
  }

  if (!parsed.file) errors.push("--file is required");

  return parsed;
}
|
|
59
|
+
|
|
60
|
+
/**
 * Read a file as UTF-8 and parse it as JSON.
 *
 * Any failure while reading or parsing is recorded in `errors` (labelled with
 * the path relative to `root`) instead of being thrown.
 *
 * @param {string} filePath - File to load.
 * @returns {*} The parsed value, or `undefined` when loading failed.
 */
function readJson(filePath) {
  let parsed;
  try {
    const text = fs.readFileSync(filePath, "utf8");
    parsed = JSON.parse(text);
  } catch (failure) {
    const label = path.relative(root, filePath);
    errors.push(`${label}: invalid JSON (${failure.message})`);
  }
  return parsed;
}
|
|
68
|
+
|
|
69
|
+
/**
 * Normalize a recorded model output to a single string.
 *
 * @param {*} value - A string, an array of lines, or anything else.
 * @returns {string} The string itself, the array joined with newlines, or "" otherwise.
 */
function outputText(value) {
  if (typeof value === "string") {
    return value;
  }
  return Array.isArray(value) ? value.join("\n") : "";
}
|
|
74
|
+
|
|
75
|
+
/**
 * Compute the weighted average of per-criterion scores.
 *
 * A criterion whose score is missing or not a finite number contributes 0, and
 * a result that cannot be computed (no criteria, zero or invalid total weight)
 * is reported as 0 so that NaN never reaches the rendered report.
 *
 * @param {Object<string, number>|undefined} scores - Scores keyed by criterion id.
 * @param {Array<{id: string, weight: number}>} criteria - Criteria with their weights.
 * @returns {number} The weighted average, or 0 when it is not computable.
 */
function weightedScore(scores, criteria) {
  let weightedSum = 0;
  let totalWeight = 0;

  for (const { id, weight } of criteria) {
    const value = Number(scores?.[id]);
    weightedSum += (Number.isFinite(value) ? value : 0) * weight;
    totalWeight += weight;
  }

  // 0/0 (empty criteria) and invalid weights would otherwise yield NaN or Infinity.
  const average = weightedSum / totalWeight;
  return Number.isFinite(average) ? average : 0;
}
|
|
83
|
+
|
|
84
|
+
/**
 * Compare one variant's output text with the expectations of its fixture case.
 *
 * @param {string} text - Normalized output text of a single run.
 * @param {Object} item - Fixture case; `expectedStrongSections` and `bannedClaims` may be absent.
 * @returns {{missingSections: string[], prohibitedClaims: string[]}} Expected sections
 *   not found in the text, and banned claims that were found in it.
 */
function collectGateFindings(text, item) {
  return {
    missingSections: (item.expectedStrongSections ?? []).filter((section) => !text.includes(section)),
    prohibitedClaims: (item.bannedClaims ?? []).filter((claim) => text.includes(claim))
  };
}

/**
 * Build the analysis summary for a set of archived run results.
 *
 * For every fixture case the weak and basic outputs (looked up by the run ids
 * `<caseId>::weak` and `<caseId>::basic`) are checked for missing sections and
 * banned claims, and the scorecard entry of the case supplies the winner, the
 * decision basis and the weighted score delta.
 *
 * @param {{outputs?: Array<{runId: string, output: *}>}} results - Parsed run results.
 * @returns {Object} Summary with aggregate counters and one entry per case in `caseResults`.
 */
function analyzeResults(results) {
  const outputsByRunId = new Map((results.outputs ?? []).map((item) => [item.runId, item]));
  const scorecardByCase = new Map((scorecard.cases ?? []).map((item) => [item.caseId, item]));
  const criteria = scorecard.criteria ?? [];

  const caseResults = (fixture.cases ?? []).map((item) => {
    const weakText = outputText(outputsByRunId.get(`${item.id}::weak`)?.output);
    const basicText = outputText(outputsByRunId.get(`${item.id}::basic`)?.output);
    const weak = collectGateFindings(weakText, item);
    const basic = collectGateFindings(basicText, item);
    const score = scorecardByCase.get(item.id);

    // A case without any basic output must never pass the gate, even when the
    // fixture lists no expected sections for it.
    const basicPass =
      basicText.length > 0 && basic.missingSections.length === 0 && basic.prohibitedClaims.length === 0;
    const weakDiagnostics = weak.missingSections.length + weak.prohibitedClaims.length;

    // Cases that are absent from the scorecard get a neutral delta of 0.
    const weakScore = score ? weightedScore(score.weakScores, criteria) : 0;
    const basicScore = score ? weightedScore(score.basicScores, criteria) : 0;

    return {
      caseId: item.id,
      scenario: item.scenario,
      basicPass,
      weakDiagnostics,
      weakMissingSections: weak.missingSections,
      basicMissingSections: basic.missingSections,
      weakProhibitedClaims: weak.prohibitedClaims,
      basicProhibitedClaims: basic.prohibitedClaims,
      scorecardWinner: score?.winner ?? "unknown",
      scoreDelta: Number((basicScore - weakScore).toFixed(2)),
      decisionBasis: score?.decisionBasis ?? ""
    };
  });

  return {
    schema: 1,
    name: "engineering-business-basic-run-results-analysis",
    version: fixture.version,
    sourceFile: path.relative(root, args.file),
    totalCases: caseResults.length,
    basicPassCount: caseResults.filter((item) => item.basicPass).length,
    weakDiagnosticCaseCount: caseResults.filter((item) => item.weakDiagnostics > 0).length,
    scorecardBasicWinnerCount: caseResults.filter((item) => item.scorecardWinner === "basic").length,
    caseResults
  };
}
|
|
133
|
+
|
|
134
|
+
/**
 * Render an analysis summary as a Markdown report.
 *
 * The report consists of a header with the aggregate counters, a table with
 * one row per case, and a detail section per case.
 *
 * @param {Object} analysis - Summary with `sourceFile`, the counters and `caseResults`.
 * @returns {string} Markdown text ending with a newline.
 */
function renderMarkdown(analysis) {
  const { totalCases, caseResults } = analysis;
  // An empty list is rendered as "无"; otherwise entries are joined with ";".
  const listOrNone = (entries) => (entries.length === 0 ? "无" : entries.join(";"));

  const header = [
    "# 工程业务基础提示词运行结果分析",
    "",
    `- 来源文件:\`${analysis.sourceFile}\``,
    `- 覆盖 case:${totalCases}`,
    `- 基础提示词通过门禁:${analysis.basicPassCount}/${totalCases}`,
    `- 普通提示词存在结构或边界诊断:${analysis.weakDiagnosticCaseCount}/${totalCases}`,
    `- scorecard 判定基础提示词更优:${analysis.scorecardBasicWinnerCount}/${totalCases}`,
    "",
    "> 本报告只分析已归档的脱敏 run results,不代表未运行模型时的真实效果。",
    "",
    "| Case | Basic Gate | Weak Diagnostics | Score Delta | Decision |",
    "|---|---:|---:|---:|---|"
  ];

  const tableRows = caseResults.map((item) => {
    const gate = item.basicPass ? "pass" : "fail";
    return `| ${item.caseId} | ${gate} | ${item.weakDiagnostics} | ${item.scoreDelta} | ${item.scorecardWinner} |`;
  });

  const details = caseResults.flatMap((item) => [
    `### ${item.caseId}`,
    "",
    `- 场景:${item.scenario}`,
    `- 评分卡依据:${item.decisionBasis}`,
    `- Basic 缺失章节:${listOrNone(item.basicMissingSections)}`,
    `- Basic 禁止结论:${listOrNone(item.basicProhibitedClaims)}`,
    `- Weak 缺失章节:${listOrNone(item.weakMissingSections)}`,
    `- Weak 禁止结论:${listOrNone(item.weakProhibitedClaims)}`,
    ""
  ]);

  const lines = [...header, ...tableRows, "", "## 分场景说明", "", ...details];
  return `${lines.join("\n")}\n`;
}
|
|
170
|
+
|
|
171
|
+
// Load the run results only when --file was accepted; readJson records its own failures.
let results;
if (args.file) {
  results = readJson(args.file);
}

// The analysis needs all three inputs; if any is missing an error has already been recorded.
const inputsReady = Boolean(results && fixture && scorecard);
const analysis = inputsReady ? analyzeResults(results) : undefined;
const report = analysis === undefined ? "" : renderMarkdown(analysis);

// Report every collected problem in one go and stop with a failing exit code.
if (errors.length > 0) {
  console.error(`Prompt run results analysis failed with ${errors.length} error(s):`);
  errors.forEach((message) => console.error(`- ${message}`));
  process.exit(1);
}

// Without --out the report is written to standard output.
if (!args.out) {
  process.stdout.write(report);
} else {
  fs.mkdirSync(path.dirname(args.out), { recursive: true });
  fs.writeFileSync(args.out, report, "utf8");
  console.log(`Prompt run results analysis written: ${path.relative(root, args.out)}`);
}
|
|
@@ -0,0 +1,248 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
import fs from "node:fs";
|
|
4
|
+
import path from "node:path";
|
|
5
|
+
|
|
6
|
+
// Working directory with symlinks resolved; repoPath() measures every path against it.
const root = fs.realpathSync(process.cwd());
// Problems are accumulated here and reported together at the end of the script.
const errors = [];

// CLI flags come first because they decide which fixture file is loaded.
const args = parseArgs(process.argv.slice(2));
const fixturePath = repoPath(args.fixture);
const fixture = readJson(fixturePath);

// Substrings that must not appear anywhere in the generated run pack.
const sensitiveTerms = [
  "立信", "费敏", "闻总", "谭总", "茅盾中学", "鸿益", "太鑫", "飞双", "魔毯",
  "客户内部", "培训演示", "基础内测", "内测模式", "内测包",
  "嘉兴", "绍兴", "杭州",
  "20,000", "20000", "28,000", "28000", "17,800", "17800"
];
|
|
38
|
+
|
|
39
|
+
/**
 * Join path segments onto the repository root and refuse results that leave it.
 *
 * @param {...string} parts - Path segments appended to `root`.
 * @returns {string} The joined path.
 * @throws {Error} When the joined path resolves outside `root`.
 */
function repoPath(...parts) {
  const target = path.join(root, ...parts);
  const relative = path.relative(root, target);
  // Only a leading ".." *segment* leaves the root. A name that merely begins
  // with two dots (for example "..cache/file.json") is still inside it.
  const escapesRoot = relative === ".." || relative.startsWith(`..${path.sep}`);
  if (escapesRoot || path.isAbsolute(relative)) {
    throw new Error(`Path traversal detected: ${target}`);
  }
  return target;
}
|
|
47
|
+
|
|
48
|
+
/**
 * Parse the command line of the run-pack builder.
 *
 * Recognizes `--out <path>`, `--fixture <path>` and the value-less `--check`.
 * The `--fixture` value is checked with repoPath() but stored as given, with
 * backslashes converted to forward slashes. Problems are appended to `errors`.
 *
 * @param {string[]} argv - Arguments after the script name.
 * @returns {{out: (string|undefined), fixture: string}} Parsed options.
 */
function parseArgs(argv) {
  const parsed = {
    out: undefined,
    fixture: "prompts/evaluations/engineering-business-basic-fixtures.json"
  };

  let cursor = 0;
  while (cursor < argv.length) {
    const token = argv[cursor];
    const operand = argv[cursor + 1];

    switch (token) {
      case "--out":
        if (operand) {
          parsed.out = repoPath(operand);
          cursor += 1; // the operand has been consumed
        } else {
          errors.push("--out requires a path");
        }
        break;
      case "--fixture":
        if (operand) {
          repoPath(operand); // called only for its traversal check
          parsed.fixture = operand.replace(/\\/g, "/");
          cursor += 1; // the operand has been consumed
        } else {
          errors.push("--fixture requires a path");
        }
        break;
      case "--check":
        // Accepted flag; it stores nothing.
        break;
      default:
        errors.push(`Unknown argument: ${token}`);
    }

    cursor += 1;
  }

  return parsed;
}
|
|
82
|
+
|
|
83
|
+
/**
 * Load a UTF-8 file and parse its content as JSON.
 *
 * A failure while reading or parsing is pushed onto `errors`, labelled with
 * the path relative to `root`; nothing is thrown for it.
 *
 * @param {string} filePath - File to load.
 * @returns {*} The parsed value, or `undefined` after a failure.
 */
function readJson(filePath) {
  let content;
  try {
    content = JSON.parse(fs.readFileSync(filePath, "utf8"));
  } catch (problem) {
    const shown = path.relative(root, filePath);
    errors.push(`${shown}: invalid JSON (${problem.message})`);
    content = undefined;
  }
  return content;
}
|
|
91
|
+
|
|
92
|
+
/**
 * Read a repository file as UTF-8 text.
 *
 * The path is resolved with repoPath(); any failure (including one raised by
 * repoPath) is pushed onto `errors` instead of being thrown.
 *
 * @param {string} relativePath - Path relative to the repository root.
 * @returns {string} The file content, or "" after a failure.
 */
function readText(relativePath) {
  let text = "";
  try {
    text = fs.readFileSync(repoPath(relativePath), "utf8");
  } catch (problem) {
    errors.push(`${relativePath}: ${problem.message}`);
  }
  return text;
}
|
|
100
|
+
|
|
101
|
+
/**
 * Record `message` in `errors` when `condition` is falsy.
 *
 * @param {*} condition - Value tested for truthiness.
 * @param {string} message - Error text recorded on failure.
 * @returns {void}
 */
function check(condition, message) {
  if (condition) {
    return;
  }
  errors.push(message);
}
|
|
104
|
+
|
|
105
|
+
/**
 * List the entries of `sensitiveTerms` that occur in a value.
 *
 * Strings are searched as they are; any other value is searched in its JSON
 * serialization.
 *
 * @param {*} value - String or JSON-serializable value to scan.
 * @returns {string[]} Matching terms, in the order of `sensitiveTerms`.
 */
function includesSensitiveTerm(value) {
  // JSON.stringify yields undefined for undefined, functions and symbols;
  // such values have no text, so they are scanned as an empty string.
  const raw = typeof value === "string" ? value : JSON.stringify(value) ?? "";
  return sensitiveTerms.filter((term) => raw.includes(term));
}
|
|
109
|
+
|
|
110
|
+
/**
 * Pick the input material for a fixture case.
 *
 * @param {Object} item - Fixture case.
 * @returns {*} A one-element array holding the text read from `markdownFixturePath`
 *   when that property is set; otherwise the case's `sampleInput` as is.
 */
function caseInput(item) {
  const { markdownFixturePath, sampleInput } = item;
  return markdownFixturePath ? [readText(markdownFixturePath)] : sampleInput;
}
|
|
116
|
+
|
|
117
|
+
/**
 * Derive the run pack name from the loaded fixture.
 *
 * @returns {string} A fixed name for the basic fixture set; otherwise the fixture
 *   name (or "prompt-fixture" when there is none) followed by "-run-pack".
 */
function runPackName() {
  const fixtureName = fixture?.name;
  if (fixtureName !== "engineering-business-basic-fixtures") {
    return `${fixtureName ?? "prompt-fixture"}-run-pack`;
  }
  return "engineering-business-basic-run-pack";
}
|
|
123
|
+
|
|
124
|
+
/**
 * Assemble the run pack from the loaded fixture.
 *
 * Every fixture case contributes two runs that share the case data: a "weak"
 * run using the case's `weakPrompt` and a "basic" run using the text read from
 * the case's `promptPath`.
 *
 * @returns {Object|undefined} The run pack, or `undefined` when no fixture was loaded.
 */
function buildRunPack() {
  if (!fixture) return undefined;

  const runs = [];
  for (const item of fixture.cases ?? []) {
    const basicPrompt = readText(item.promptPath);
    const sampleInput = caseInput(item);
    const usesMarkdown = Boolean(item.markdownFixturePath);

    // Fields shared by both variants of the case.
    const shared = {
      caseId: item.id,
      skillId: item.skillId,
      scenario: item.scenario,
      promptVersion: fixture.version,
      inputSummary: item.inputSummary,
      inputFormat: usesMarkdown ? "markdown" : "inline-list",
      sampleInput,
      expectedStrongSections: item.expectedStrongSections,
      bannedClaims: item.bannedClaims,
      weakFailureModes: item.weakFailureModes
    };
    if (usesMarkdown) {
      shared.markdownFixturePath = item.markdownFixturePath;
    }

    const weakRun = {
      ...shared,
      runId: `${item.id}::weak`,
      variant: "weak",
      promptSource: "fixture.weakPrompt",
      prompt: item.weakPrompt
    };
    const basicRun = {
      ...shared,
      runId: `${item.id}::basic`,
      variant: "basic",
      promptSource: item.promptPath,
      prompt: basicPrompt
    };
    runs.push(weakRun, basicRun);
  }

  const runInstructions = [
    "For each item, use prompt as the instruction and sampleInput as the user-provided material.",
    "When inputFormat is markdown, pass the Markdown text as the material under review.",
    "Run weak and basic variants separately for the same caseId.",
    "Save model outputs into the model-output JSON schema and validate with validate-prompt-model-outputs.mjs.",
    "Compare weak and basic outputs using engineering-business-basic-scorecard.json."
  ];

  return {
    schema: 1,
    name: runPackName(),
    version: fixture.version,
    fixture: args.fixture,
    dataBoundary:
      "De-identified weak/basic prompt run pack. Public advisory fixtures use Markdown-normalized synthetic inputs; do not add real customer names, contacts, project names, amounts, dates, locations, or raw source documents.",
    runInstructions,
    runs
  };
}
|
|
178
|
+
|
|
179
|
+
/**
 * Cross-check a run pack against the fixture it was built from.
 *
 * Verifies the pack-level fields, scans the whole pack for sensitive terms,
 * validates every run against its fixture case and finally requires a
 * weak/basic pair for every fixture case. Violations are recorded through check().
 *
 * @param {Object|undefined} runPack - Run pack to validate; nothing is checked
 *   when it or the fixture is missing.
 * @returns {void}
 */
function validateRunPack(runPack) {
  if (!fixture || !runPack) return;

  // Pack-level invariants.
  check(runPack.schema === 1, "run pack: schema must be 1");
  check(runPack.version === fixture.version, `run pack: version must match fixture version ${fixture.version}`);
  check(runPack.fixture === args.fixture, "run pack: fixture path mismatch");
  check(Array.isArray(runPack.runs), "run pack: runs must be an array");
  check(runPack.runs?.length === (fixture.cases?.length ?? 0) * 2, "run pack: must include weak and basic runs for every case");

  const leaked = includesSensitiveTerm(runPack);
  check(leaked.length === 0, `run pack: sensitive terms leaked (${leaked.join(", ")})`);

  const fixtureById = new Map((fixture.cases ?? []).map((item) => [item.id, item]));
  const seenRunIds = new Set();
  const variantsByCase = new Map();

  for (const run of runPack.runs ?? []) {
    const { runId, caseId, variant } = run;

    check(typeof runId === "string" && runId.length > 0, "run pack item: missing runId");
    check(!seenRunIds.has(runId), `${runId}: duplicate runId`);
    seenRunIds.add(runId);

    const expected = fixtureById.get(caseId);
    check(Boolean(expected), `${runId}: caseId not found in fixture`);
    check(variant === "weak" || variant === "basic", `${runId}: variant must be weak or basic`);
    check(run.skillId === expected?.skillId, `${runId}: skillId mismatch`);
    check(run.promptVersion === fixture.version, `${runId}: promptVersion mismatch`);
    check(Array.isArray(run.sampleInput) && run.sampleInput.length > 0, `${runId}: sampleInput must be non-empty`);
    check(typeof run.prompt === "string" && run.prompt.length > 0, `${runId}: prompt must be non-empty`);

    // Markdown-backed cases must carry exactly the content of the fixture file.
    if (expected?.markdownFixturePath) {
      check(run.inputFormat === "markdown", `${runId}: markdown fixture must set inputFormat=markdown`);
      check(run.markdownFixturePath === expected.markdownFixturePath, `${runId}: markdownFixturePath mismatch`);
      check(run.sampleInput.length === 1, `${runId}: markdown fixture sampleInput must contain one markdown string`);
      check(run.sampleInput[0] === readText(expected.markdownFixturePath), `${runId}: sampleInput must match markdown fixture content`);
    }

    // Remember which variants were seen for the case.
    if (!variantsByCase.has(caseId)) {
      variantsByCase.set(caseId, new Set());
    }
    variantsByCase.get(caseId).add(variant);

    if (variant === "weak") {
      check(run.prompt === expected?.weakPrompt, `${runId}: weak prompt must match fixture weakPrompt`);
      check(run.promptSource === "fixture.weakPrompt", `${runId}: weak promptSource mismatch`);
    } else {
      check(run.prompt === readText(expected?.promptPath), `${runId}: basic prompt must match promptPath content`);
      check(run.promptSource === expected?.promptPath, `${runId}: basic promptSource mismatch`);
    }
  }

  fixtureById.forEach((_unused, caseId) => {
    const variants = variantsByCase.get(caseId);
    check(variants?.has("weak") && variants?.has("basic"), `${caseId}: missing weak/basic pair`);
  });
}
|
|
232
|
+
|
|
233
|
+
// Build the pack and validate it against the fixture it came from.
const runPack = buildRunPack();
validateRunPack(runPack);

// Report every collected problem in one go and stop with a failing exit code.
if (errors.length > 0) {
  console.error(`Prompt run pack build failed with ${errors.length} error(s):`);
  errors.forEach((message) => console.error(`- ${message}`));
  process.exit(1);
}

// Without --out nothing is written; the run only confirms that validation passed.
if (!args.out) {
  console.log("Prompt run pack validation passed.");
} else {
  const serialized = `${JSON.stringify(runPack, null, 2)}\n`;
  fs.mkdirSync(path.dirname(args.out), { recursive: true });
  fs.writeFileSync(args.out, serialized, "utf8");
  console.log(`Prompt run pack written: ${path.relative(root, args.out)}`);
}
|
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
import fs from "node:fs";
|
|
4
|
+
import path from "node:path";
|
|
5
|
+
|
|
6
|
+
// Working directory with symlinks resolved; repoPath() measures every path against it.
const root = fs.realpathSync(process.cwd());
// Problems are accumulated here and reported together at the end of the script.
const errors = [];

// Fixture files validated by this script, relative to the repository root.
const fixturePaths = [
  "prompts/evaluations/engineering-business-basic-fixtures.json",
  "prompts/evaluations/engineering-business-public-advisory-fixtures.json"
];

// Each fixture is loaded once; `data` is undefined when loading failed.
const fixtures = [];
for (const fixturePath of fixturePaths) {
  fixtures.push({ path: fixturePath, data: readJson(repoPath(fixturePath)) });
}

// Substrings that must not appear in fixture cases or their Markdown files.
const sensitiveTerms = [
  "立信", "费敏", "闻总", "谭总", "茅盾中学", "鸿益", "太鑫", "飞双", "魔毯",
  "客户内部", "培训演示", "基础内测", "内测模式", "内测包",
  "嘉兴", "绍兴", "杭州",
  "20,000", "20000", "28,000", "28000", "17,800", "17800"
];

// Properties every fixture case must define.
const requiredCaseFields = [
  "id", "skillId", "promptPath", "scenario",
  "sourceSignals", "advisoryComparison", "weakPrompt",
  "inputSummary", "sampleInput", "requiredPromptTerms",
  "expectedOutputShape", "expectedStrongSections",
  "weakFailureModes", "bannedClaims"
];
|
|
60
|
+
|
|
61
|
+
/**
 * Join path segments onto the repository root and refuse results that leave it.
 *
 * @param {...string} parts - Path segments appended to `root`.
 * @returns {string} The joined path.
 * @throws {Error} When the joined path resolves outside `root`.
 */
function repoPath(...parts) {
  const target = path.join(root, ...parts);
  const relative = path.relative(root, target);
  // Only a leading ".." *segment* leaves the root. A name that merely begins
  // with two dots (for example "..cache/file.json") is still inside it.
  const escapesRoot = relative === ".." || relative.startsWith(`..${path.sep}`);
  if (escapesRoot || path.isAbsolute(relative)) {
    throw new Error(`Path traversal detected: ${target}`);
  }
  return target;
}
|
|
69
|
+
|
|
70
|
+
/**
 * Parse the JSON content of a UTF-8 file.
 *
 * Read and parse failures are appended to `errors`, labelled with the path
 * relative to `root`, rather than thrown.
 *
 * @param {string} filePath - File to load.
 * @returns {*} The parsed value, or `undefined` when loading failed.
 */
function readJson(filePath) {
  let data;
  try {
    const source = fs.readFileSync(filePath, "utf8");
    data = JSON.parse(source);
  } catch (cause) {
    errors.push(`${path.relative(root, filePath)}: invalid JSON (${cause.message})`);
    data = undefined;
  }
  return data;
}
|
|
78
|
+
|
|
79
|
+
/**
 * Load a repository file as UTF-8 text.
 *
 * The path goes through repoPath(); a failure there or while reading is
 * appended to `errors` rather than thrown.
 *
 * @param {string} relativePath - Path relative to the repository root.
 * @returns {string} The file content, or "" when it could not be read.
 */
function readText(relativePath) {
  let content = "";
  try {
    const absolutePath = repoPath(relativePath);
    content = fs.readFileSync(absolutePath, "utf8");
  } catch (cause) {
    errors.push(`${relativePath}: ${cause.message}`);
  }
  return content;
}
|
|
87
|
+
|
|
88
|
+
/**
 * Append `message` to `errors` unless `condition` is truthy.
 *
 * @param {*} condition - Value tested for truthiness.
 * @param {string} message - Error text recorded on failure.
 * @returns {void}
 */
function check(condition, message) {
  const failed = !condition;
  if (failed) {
    errors.push(message);
  }
}
|
|
91
|
+
|
|
92
|
+
/**
 * List the entries of `sensitiveTerms` that occur in a value.
 *
 * Strings are searched as they are; any other value is searched in its JSON
 * serialization.
 *
 * @param {*} value - String or JSON-serializable value to scan.
 * @returns {string[]} Matching terms, in the order of `sensitiveTerms`.
 */
function includesSensitiveTerm(value) {
  // JSON.stringify yields undefined for undefined, functions and symbols;
  // such values have no text, so they are scanned as an empty string.
  const raw = typeof value === "string" ? value : JSON.stringify(value) ?? "";
  return sensitiveTerms.filter((term) => raw.includes(term));
}
|
|
96
|
+
|
|
97
|
+
/**
 * Validate the Markdown file referenced by a fixture case.
 *
 * The file must be referenced by a string path ending in ".md", be non-empty,
 * contain the `caseId:` / `skillId:` lines of the case and the synthetic-data
 * notice, and contain none of the sensitive terms. Violations go through check().
 *
 * @param {string} fixturePath - Fixture file path, used to label messages.
 * @param {Object} item - Fixture case whose `markdownFixturePath` is validated.
 * @returns {void}
 */
function validateMarkdownFixture(fixturePath, item) {
  const where = `${fixturePath}/${item.id}`;
  const markdownPath = item.markdownFixturePath;
  const isString = typeof markdownPath === "string";

  check(isString && markdownPath.endsWith(".md"), `${where}: markdownFixturePath must point to a markdown file`);
  if (!isString) return; // nothing can be read without a path

  const markdown = readText(markdownPath);
  const requirements = [
    [markdown.length > 0, "markdown fixture is empty or missing"],
    [markdown.includes(`caseId: ${item.id}`), "markdown fixture must include matching caseId"],
    [markdown.includes(`skillId: ${item.skillId}`), "markdown fixture must include matching skillId"],
    [
      markdown.includes("> 数据说明:以下客户、项目、人员、地点、日期、金额、编号均为虚构。"),
      "markdown fixture must state synthetic boundary"
    ]
  ];
  for (const [satisfied, detail] of requirements) {
    check(satisfied, `${where}: ${detail}`);
  }

  const leaked = includesSensitiveTerm(markdown);
  check(leaked.length === 0, `${where}: markdown fixture leaked sensitive terms (${leaked.join(", ")})`);
}
|
|
113
|
+
|
|
114
|
+
/**
 * Validate the synthetic-data declarations of a public-advisory fixture case.
 *
 * Does nothing unless the fixture name contains "public-advisory". Otherwise
 * the fixture must declare its entities fictional, the case must list every
 * required `syntheticFields` array and mark the data as fictional, and the
 * referenced Markdown fixture is validated as well.
 *
 * @param {string} fixturePath - Fixture file path, used to label messages.
 * @param {Object} fixture - Parsed fixture file.
 * @param {Object} item - Fixture case being validated.
 * @returns {void}
 */
function validateSyntheticFields(fixturePath, fixture, item) {
  const isPublicAdvisory = fixture.name?.includes("public-advisory");
  if (!isPublicAdvisory) return;

  const declared = item.syntheticFields;
  check(fixture.syntheticDataPolicy?.entitiesAreFictional === true, `${fixturePath}: syntheticDataPolicy must mark entities as fictional`);
  check(declared && typeof declared === "object", `${item.id}: missing syntheticFields`);

  const requiredSyntheticFields = ["customer", "project", "location", "dates", "people", "numbers", "documentRefs"];
  requiredSyntheticFields.forEach((field) => {
    const values = declared?.[field];
    check(Array.isArray(values) && values.length > 0, `${item.id}: syntheticFields.${field} must be a non-empty array`);
  });

  // At least one declared value has to carry an explicit "fictional"/"example" marker.
  const serialized = JSON.stringify(declared ?? {});
  const markers = ["虚构", "示例"];
  check(markers.some((marker) => serialized.includes(marker)), `${item.id}: syntheticFields must clearly mark fictional data`);

  validateMarkdownFixture(fixturePath, item);
}
|
|
140
|
+
|
|
141
|
+
/**
 * Validate one parsed fixture file.
 *
 * Checks the file-level shape (schema 1, exactly six cases), then for every
 * case the required fields, unique ids and skills, the skill/prompt path
 * naming, the non-empty list fields, the source-signal prefixes, sensitive
 * terms, the synthetic-data declarations, and that the prompt file contains
 * the required terms, expected sections and banned claims. Finally the set of
 * covered skills must equal the expected six. Violations go through check().
 *
 * @param {string} fixturePath - Fixture file path, used to label messages.
 * @param {Object|undefined} fixture - Parsed fixture; nothing is checked when it is missing.
 * @returns {void}
 */
function validateFixture(fixturePath, fixture) {
  if (!fixture) {
    return;
  }

  check(fixture.schema === 1, `${fixturePath}: schema must be 1`);
  check(Array.isArray(fixture.cases), `${fixturePath}: cases must be an array`);
  check(fixture.cases?.length === 6, `${fixturePath}: must cover 6 engineering-business cases`);

  // A non-array `cases` has already been reported above. Iterating it anyway
  // could throw and discard every error collected so far.
  const cases = Array.isArray(fixture.cases) ? fixture.cases : [];
  const listFields = [
    "inputSummary",
    "sourceSignals",
    "advisoryComparison",
    "sampleInput",
    "requiredPromptTerms",
    "expectedOutputShape",
    "expectedStrongSections",
    "weakFailureModes",
    "bannedClaims"
  ];

  const seenIds = new Set();
  const seenSkills = new Set();

  for (const item of cases) {
    // A null or primitive entry cannot be inspected field by field.
    const isObject = item !== null && typeof item === "object";
    check(isObject, `${fixturePath}: every case must be an object`);
    if (!isObject) {
      continue;
    }

    for (const field of requiredCaseFields) {
      check(Object.prototype.hasOwnProperty.call(item, field), `${fixturePath}/${item.id ?? "unknown"}: missing ${field}`);
    }

    check(!seenIds.has(item.id), `${fixturePath}/${item.id}: duplicate case id`);
    seenIds.add(item.id);
    check(!seenSkills.has(item.skillId), `${fixturePath}/${item.skillId}: duplicate skill fixture`);
    seenSkills.add(item.skillId);

    check(/^aios-[a-z0-9-]+$/.test(item.skillId), `${fixturePath}/${item.id}: invalid skillId`);
    check(item.promptPath === `skills/${item.skillId}/prompts/basic-prompt.md`, `${fixturePath}/${item.id}: promptPath must match skillId`);

    for (const field of listFields) {
      check(Array.isArray(item[field]) && item[field].length > 0, `${fixturePath}/${item.id}: ${field} must be a non-empty array`);
    }

    for (const signal of item.sourceSignals ?? []) {
      check(
        /^(advisory|source-shape|boundary-shape):/.test(signal),
        `${fixturePath}/${item.id}: source signal must use an abstract prefix (${signal})`
      );
    }

    const sensitiveHits = includesSensitiveTerm(item);
    check(sensitiveHits.length === 0, `${fixturePath}/${item.id}: sensitive terms leaked (${sensitiveHits.join(", ")})`);
    validateSyntheticFields(fixturePath, fixture, item);

    // The prompt file itself must mention every term, section and banned claim.
    const prompt = readText(item.promptPath);
    check(prompt.length > 0, `${fixturePath}/${item.id}: prompt file is empty or missing`);
    for (const term of item.requiredPromptTerms ?? []) {
      check(prompt.includes(term), `${fixturePath}/${item.id}: prompt missing required term "${term}"`);
    }
    for (const section of item.expectedStrongSections ?? []) {
      check(prompt.includes(section), `${fixturePath}/${item.id}: prompt missing expected strong section "${section}"`);
    }
    for (const bannedClaim of item.bannedClaims ?? []) {
      check(prompt.includes(bannedClaim), `${fixturePath}/${item.id}: prompt should explicitly list prohibited claim "${bannedClaim}"`);
    }
  }

  const expectedSkills = [
    "aios-commercial-tender",
    "aios-commercial-contract",
    "aios-construction-daily",
    "aios-construction-meeting",
    "aios-commercial-variation",
    "aios-construction-scheme"
  ].sort();
  check(JSON.stringify([...seenSkills].sort()) === JSON.stringify(expectedSkills), `${fixturePath}: skill coverage mismatch`);
}
|
|
214
|
+
|
|
215
|
+
// Validate every loaded fixture; problems accumulate in `errors`.
fixtures.forEach(({ path: fixturePath, data }) => {
  validateFixture(fixturePath, data);
});

// Report every collected problem in one go and stop with a failing exit code.
const failureCount = errors.length;
if (failureCount > 0) {
  console.error(`Prompt fixture validation failed with ${failureCount} error(s):`);
  errors.forEach((message) => console.error(`- ${message}`));
  process.exit(1);
}

console.log("Prompt fixture validation passed.");
|