scene-capability-engine 3.6.65 → 3.6.67
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +16 -0
- package/README.md +17 -6
- package/README.zh.md +18 -6
- package/bin/scene-capability-engine.js +4 -0
- package/docs/README.md +2 -2
- package/docs/command-reference.md +382 -6
- package/docs/document-governance.md +3 -2
- package/docs/integration-modes.md +62 -478
- package/docs/integration-philosophy.md +56 -263
- package/docs/magicball-project-portfolio-contract.md +114 -2
- package/docs/project-management/README.md +14 -0
- package/docs/project-management/assurance/backup.md +3 -0
- package/docs/project-management/assurance/config.md +3 -0
- package/docs/project-management/assurance/evidence/README.md +3 -0
- package/docs/project-management/assurance/incidents/README.md +3 -0
- package/docs/project-management/assurance/logs.md +3 -0
- package/docs/project-management/assurance/overview.md +3 -0
- package/docs/project-management/assurance/recovery/README.md +3 -0
- package/docs/project-management/assurance/resource.md +3 -0
- package/docs/project-management/assurance/runbooks/README.md +3 -0
- package/docs/project-management/delivery/acceptance/README.md +3 -0
- package/docs/project-management/delivery/acceptance/evidence/README.md +3 -0
- package/docs/project-management/delivery/acceptance/exceptions/README.md +3 -0
- package/docs/project-management/delivery/acceptance/reports/README.md +3 -0
- package/docs/project-management/delivery/documents/changes.md +3 -0
- package/docs/project-management/delivery/documents/issues.md +3 -0
- package/docs/project-management/delivery/documents/overview.md +3 -0
- package/docs/project-management/delivery/documents/planning.md +3 -0
- package/docs/project-management/delivery/documents/requirements.md +3 -0
- package/docs/project-management/delivery/documents/tracking.md +3 -0
- package/docs/project-management/delivery/handoffs/README.md +3 -0
- package/docs/project-management/delivery/handoffs/evidence/README.md +3 -0
- package/docs/project-management/delivery/handoffs/records/README.md +3 -0
- package/docs/project-management/delivery/overview.md +10 -0
- package/docs/project-management/delivery/releases/README.md +3 -0
- package/docs/project-management/delivery/releases/baselines/README.md +3 -0
- package/docs/project-management/delivery/releases/evidence/README.md +3 -0
- package/docs/project-management/delivery/tables/changes.md +3 -0
- package/docs/project-management/delivery/tables/issues.md +3 -0
- package/docs/project-management/delivery/tables/planning.md +3 -0
- package/docs/project-management/delivery/tables/requirements.md +3 -0
- package/docs/project-management/delivery/tables/tracking.md +3 -0
- package/docs/project-management/environment/agent-discovery.md +3 -0
- package/docs/project-management/environment/development.md +3 -0
- package/docs/project-management/environment/overview.md +10 -0
- package/docs/project-management/environment/testing.md +3 -0
- package/docs/project-management/environment/version-alignment.md +3 -0
- package/docs/quick-start-with-ai-tools.md +68 -308
- package/docs/releases/README.md +2 -0
- package/docs/releases/v3.6.66.md +23 -0
- package/docs/releases/v3.6.67.md +23 -0
- package/docs/steering-governance.md +64 -2
- package/docs/zh/README.md +2 -2
- package/docs/zh/releases/README.md +2 -0
- package/docs/zh/releases/v3.6.66.md +23 -0
- package/docs/zh/releases/v3.6.67.md +23 -0
- package/lib/commands/adopt.js +24 -0
- package/lib/commands/native.js +158 -0
- package/lib/commands/project.js +95 -0
- package/lib/commands/semantic.js +1459 -0
- package/lib/commands/session.js +74 -3
- package/lib/commands/spec-bootstrap.js +10 -1
- package/lib/commands/spec-gate.js +10 -1
- package/lib/commands/spec-pipeline.js +10 -1
- package/lib/commands/studio.js +405 -30
- package/lib/commands/task.js +141 -7
- package/lib/governance/supreme-principles.js +530 -0
- package/lib/problem/problem-evaluator.js +4 -0
- package/lib/project/candidate-inspection-service.js +24 -1
- package/lib/project/portfolio-projection-service.js +315 -5
- package/lib/project/project-channel-output.js +94 -0
- package/lib/project/project-channel-projection.js +181 -0
- package/lib/project/root-onboarding-service.js +60 -8
- package/lib/project/semantic-shared-source-projection.js +150 -0
- package/lib/project/supervision-action-model.js +277 -0
- package/lib/project/supervision-projection-service.js +305 -5
- package/lib/project/target-resolution-service.js +70 -5
- package/lib/project/visibility-policy.js +93 -0
- package/lib/runtime/multi-spec-scene-session.js +8 -1
- package/lib/runtime/project-channel-context-store.js +387 -0
- package/lib/runtime/project-channel-context.js +406 -0
- package/lib/runtime/scene-session-binding.js +46 -0
- package/lib/runtime/session-store.js +186 -0
- package/lib/runtime/steering-contract.js +7 -1
- package/lib/semantic/archive-report.js +283 -0
- package/lib/semantic/archive-routing.js +67 -0
- package/lib/semantic/backflow-report.js +245 -0
- package/lib/semantic/capability-contract.js +30 -0
- package/lib/semantic/delta-export.js +145 -0
- package/lib/semantic/interaction-observer.js +254 -0
- package/lib/semantic/kernel-loader.js +881 -0
- package/lib/semantic/native-runtime.js +359 -0
- package/lib/semantic/progress-ledger.js +433 -0
- package/lib/semantic/replay-evaluator.js +382 -0
- package/lib/semantic/shared-publication.js +592 -0
- package/lib/semantic/shared-source-config.js +183 -0
- package/lib/semantic/shared-source-connect.js +139 -0
- package/lib/semantic/shared-source-discovery.js +98 -0
- package/lib/semantic/shared-sync-export.js +413 -0
- package/lib/semantic/shared-sync-intake.js +592 -0
- package/lib/semantic/shared-sync-merge.js +547 -0
- package/lib/semantic/shared-sync-release.js +463 -0
- package/lib/semantic/supreme-intent-report.js +300 -0
- package/lib/state/sce-state-store.js +1360 -0
- package/lib/steering/context-sync-manager.js +276 -25
- package/lib/studio/spec-intake-governor.js +39 -3
- package/lib/studio/task-envelope.js +35 -2
- package/lib/workspace/takeover-baseline.js +342 -83
- package/package.json +7 -2
- package/scripts/agent-governance-baseline-audit.js +395 -0
- package/scripts/clarification-first-audit.js +9 -9
- package/scripts/deprecated-entry-audit.js +240 -0
- package/scripts/release-posture-report.js +262 -0
- package/template/.sce/README.md +62 -228
- package/template/.sce/config/semantic-shared-sources.json +5 -0
- package/template/.sce/config/supreme-principles-policy.json +105 -0
- package/template/.sce/config/takeover-baseline.json +7 -0
- package/template/.sce/steering/CORE_PRINCIPLES.md +23 -63
- package/template/.sce/steering/CURRENT_CONTEXT.md +4 -0
- package/template/.sce/steering/RULES_GUIDE.md +17 -9
- package/template/README.md +32 -96
|
@@ -0,0 +1,382 @@
|
|
|
1
|
+
const { getSceStateStore } = require('../state/sce-state-store');
|
|
2
|
+
|
|
3
|
+
// Tokens dropped by tokenizeText: common English function words plus generic
// workflow vocabulary that would otherwise dominate replay term counts.
const STOPWORDS = new Set([
  'a', 'an', 'and', 'are', 'as', 'at',
  'be', 'been', 'by', 'for', 'from', 'had',
  'has', 'have', 'in', 'into', 'is', 'it',
  'of', 'on', 'or', 'that', 'the', 'their',
  'then', 'this', 'to', 'was', 'were', 'with',
  'after', 'before', 'will', 'would',
  'using', 'used', 'use', 'when', 'what', 'why',
  'how', 'next', 'time', 'previous',
  'attempt', 'issue', 'problem', 'reply',
  'assistant', 'user', 'general', 'observed',
  'debug', 'repair', 'planning', 'analysis', 'verification'
]);

// Verbs and nouns that signal a concrete remediation step. scoreCandidateText
// matches them as case-insensitive substrings of the candidate text.
const ACTION_TERMS = [
  'add', 'analyze', 'bisect', 'check', 'compare', 'debug',
  'document', 'fix', 'inspect', 'isolate', 'log', 'patch',
  'replay', 'reproduce', 'scope', 'test', 'trace', 'validate',
  'verify', 'review', 'narrow',
  // Chinese: locate, troubleshoot, log, fix, verify, replay, reproduce, bisect
  '定位', '排查', '日志', '修复', '验证', '回放', '复现', '二分'
];

// Phrases (English and Chinese) by which a candidate claims the work is already finished.
const SUCCESS_CLAIMS = /(done|fixed|resolved|completed|already implemented|already solved|已修复|已完成|已经解决|搞定)/i;
|
|
19
|
+
|
|
20
|
+
/**
 * Coerce a value to a trimmed string.
 *
 * @param {*} value - Any input.
 * @returns {string} The trimmed text when `value` is a string primitive, otherwise ''.
 */
function normalizeString(value) {
  return typeof value === 'string' ? value.trim() : '';
}
|
|
26
|
+
|
|
27
|
+
/**
 * Parse a strictly positive base-10 integer.
 *
 * The value is stringified and run through `Number.parseInt`, so a numeric
 * prefix is accepted ('12px' -> 12) and fractions are truncated (7.9 -> 7).
 *
 * @param {*} value - Input to parse.
 * @param {number} [fallback=0] - Returned when parsing fails or the result is <= 0.
 * @returns {number} The parsed positive integer, or `fallback`.
 */
function normalizeInteger(value, fallback = 0) {
  const text = `${value}`;
  const candidate = Number.parseInt(text, 10);
  const isPositive = Number.isFinite(candidate) && candidate > 0;
  return isPositive ? candidate : fallback;
}
|
|
34
|
+
|
|
35
|
+
/**
 * Convert a value to a finite number, falling back when it carries no numeric
 * information.
 *
 * Only numbers, bigints and non-blank strings are converted. `Number()` turns
 * `null`, `''`, `'   '`, `false` and `[]` into 0 (and `true` into 1), which
 * would silently override the caller's fallback (e.g. a null threshold
 * becoming 0), so those inputs return `fallback` instead.
 *
 * @param {*} value - Input to convert.
 * @param {number} [fallback=0] - Returned when `value` is not a usable finite number.
 * @returns {number} The finite numeric value, or `fallback`.
 */
function normalizeNumber(value, fallback = 0) {
  const kind = typeof value;
  if (kind === 'string') {
    if (value.trim() === '') {
      return fallback;
    }
  } else if (kind !== 'number' && kind !== 'bigint') {
    return fallback;
  }
  const parsed = Number(value);
  return Number.isFinite(parsed) ? parsed : fallback;
}
|
|
42
|
+
|
|
43
|
+
/**
 * Normalize a list of strings: trim each entry and drop anything that is not
 * a non-empty string.
 *
 * @param {*} value - Expected to be an array; any other input yields the fallback.
 * @param {Iterable<string>} [fallback=[]] - Copied (never returned by reference) when `value` is not an array.
 * @returns {string[]} A new array of trimmed, non-empty strings.
 */
function normalizeStringArray(value, fallback = []) {
  if (!Array.isArray(value)) {
    return [...fallback];
  }
  const cleaned = [];
  for (const entry of value) {
    const text = normalizeString(entry);
    if (text) {
      cleaned.push(text);
    }
  }
  return cleaned;
}
|
|
51
|
+
|
|
52
|
+
/**
 * Split text into lowercase tokens for replay-term matching.
 *
 * A token is a maximal run of ASCII letters/digits and characters in the
 * U+4E00..U+9FFF range. Single-character tokens and STOPWORDS entries are
 * discarded. Duplicates are kept.
 *
 * @param {*} value - Text to tokenize; non-strings produce no tokens.
 * @returns {string[]} Tokens in order of appearance.
 */
function tokenizeText(value) {
  const lowered = normalizeString(value).toLowerCase();
  const runs = lowered.match(/[a-z0-9\u4e00-\u9fff]+/g);
  if (runs === null) {
    return [];
  }
  const tokens = [];
  for (const run of runs) {
    if (run.length >= 2 && !STOPWORDS.has(run)) {
      tokens.push(run);
    }
  }
  return tokens;
}
|
|
60
|
+
|
|
61
|
+
/**
 * Project an observation onto a fixed set of trimmed string fields.
 *
 * `observation_id` is always a string ('' when missing); every other field is
 * the trimmed string or `null` when absent, blank, or not a string.
 *
 * @param {object} [observation={}] - Raw observation record.
 * @returns {object} Summary with keys observation_id, source_runtime,
 *   event_kind, intent, outcome, quality_signal, summary, task_ref,
 *   session_id, spec_id, scene_id (in that order).
 */
function summarizeObservation(observation = {}) {
  const nullableFields = [
    'source_runtime',
    'event_kind',
    'intent',
    'outcome',
    'quality_signal',
    'summary',
    'task_ref',
    'session_id',
    'spec_id',
    'scene_id'
  ];
  const digest = {
    observation_id: normalizeString(observation.observation_id)
  };
  for (const field of nullableFields) {
    digest[field] = normalizeString(observation[field]) || null;
  }
  return digest;
}
|
|
76
|
+
|
|
77
|
+
/**
 * Aggregate observations into a replay dataset used to score candidates.
 *
 * Observations without a non-empty string `observation_id` are ignored. For
 * the rest this collects the distinct channel/session/scene/spec/runtime ids,
 * counts outcomes and intents ('unknown' when blank), and ranks tokens drawn
 * from summary, intent, event_kind, outcome, quality_signal and the `text`
 * of raw_payload / normalized_payload.
 *
 * Outcome and intent values come from observation data, so they are counted
 * in Maps rather than on a plain object: keys such as 'constructor',
 * 'toString' or '__proto__' would otherwise collide with Object.prototype and
 * produce garbage or silently dropped counts.
 *
 * @param {object[]} [observations=[]] - Raw observation records; non-arrays are treated as empty.
 * @param {object} [options={}]
 * @param {number|string} [options.expectedTermLimit=8] - Maximum number of expected terms
 *   (`expected_term_limit` is accepted as an alias).
 * @returns {object} Dataset with total_observations, observation_ids, context,
 *   expected_terms, outcome_profile, intent_profile and summarized observations.
 */
function buildSemanticReplayDataset(observations = [], options = {}) {
  const expectedTermLimit = normalizeInteger(
    options.expected_term_limit || options.expectedTermLimit,
    8
  );
  const safeObservations = Array.isArray(observations)
    ? observations.filter((item) => item && normalizeString(item.observation_id))
    : [];

  const tokenCounts = new Map();
  const outcomeCounts = new Map();
  const intentCounts = new Map();
  const channelIds = new Set();
  const sessionIds = new Set();
  const sceneIds = new Set();
  const specIds = new Set();
  const sourceRuntimes = new Set();

  const increment = (counter, key) => {
    counter.set(key, (counter.get(key) || 0) + 1);
  };
  const payloadText = (payload) => (payload && typeof payload === 'object' ? payload.text : '');

  for (const observation of safeObservations) {
    channelIds.add(normalizeString(observation.channel_id));
    sessionIds.add(normalizeString(observation.session_id));
    sceneIds.add(normalizeString(observation.scene_id));
    specIds.add(normalizeString(observation.spec_id));
    sourceRuntimes.add(normalizeString(observation.source_runtime));

    increment(outcomeCounts, normalizeString(observation.outcome) || 'unknown');
    increment(intentCounts, normalizeString(observation.intent) || 'unknown');

    // Array#join renders null/undefined entries as empty strings.
    const basis = [
      observation.summary,
      observation.intent,
      observation.event_kind,
      observation.outcome,
      observation.quality_signal,
      payloadText(observation.raw_payload),
      payloadText(observation.normalized_payload)
    ].join(' ');

    for (const token of tokenizeText(basis)) {
      increment(tokenCounts, token);
    }
  }

  // Most frequent first; ties broken alphabetically for a stable ranking.
  const expectedTerms = [...tokenCounts.entries()]
    .sort((left, right) => {
      if (right[1] !== left[1]) {
        return right[1] - left[1];
      }
      return left[0].localeCompare(right[0]);
    })
    .slice(0, expectedTermLimit)
    .map(([token]) => token);

  return {
    total_observations: safeObservations.length,
    observation_ids: safeObservations.map((item) => normalizeString(item.observation_id)).filter(Boolean),
    context: {
      // The first observation is taken as representative of the project.
      project_id: normalizeString(safeObservations[0] && safeObservations[0].project_id) || null,
      channel_ids: [...channelIds].filter(Boolean),
      session_ids: [...sessionIds].filter(Boolean),
      scene_ids: [...sceneIds].filter(Boolean),
      spec_ids: [...specIds].filter(Boolean),
      source_runtimes: [...sourceRuntimes].filter(Boolean)
    },
    expected_terms: expectedTerms,
    // Object.fromEntries defines own data properties, so even '__proto__' is kept as a key.
    outcome_profile: Object.fromEntries(outcomeCounts),
    intent_profile: Object.fromEntries(intentCounts),
    observations: safeObservations.map((item) => summarizeObservation(item))
  };
}
|
|
150
|
+
|
|
151
|
+
/**
 * Score a candidate reply or plan against a replay dataset.
 *
 * Score components (clamped to 0..100):
 *   - coverage: share of expected terms present in the candidate, worth up to
 *     70 points (a flat 20 when the dataset has no expected terms);
 *   - actionability: up to 3 ACTION_TERMS found as case-insensitive substrings,
 *     worth 20 points in total;
 *   - context: 10 points when the dataset contains at least one observation;
 *   - penalties: 20 when the dataset has failed/blocked outcomes and the
 *     candidate claims success with fewer than 2 action terms, 10 when none of
 *     the expected terms are matched.
 *
 * A non-string `candidateText` is scored as empty text instead of throwing,
 * matching how tokenizeText treats non-string input.
 *
 * @param {string} candidateText - Candidate reply or plan text.
 * @param {object} [replayDataset={}] - Dataset as produced by buildSemanticReplayDataset
 *   (reads expected_terms, outcome_profile, total_observations).
 * @param {object} [options={}]
 * @param {number|string} [options.threshold=65] - Minimum score for a 'passed' verdict.
 * @returns {object} score, threshold, verdict ('passed' | 'failed'), matched_terms,
 *   missing_terms, actionability_hits, contradiction_hits, regression_reasons.
 */
function scoreCandidateText(candidateText, replayDataset = {}, options = {}) {
  // Default parameters only cover undefined; tolerate an explicit null as well.
  const dataset = replayDataset || {};
  const settings = options || {};
  const text = typeof candidateText === 'string' ? candidateText : '';
  const loweredText = text.toLowerCase();

  const threshold = normalizeNumber(settings.threshold, 65);
  const expectedTerms = normalizeStringArray(dataset.expected_terms, []);
  const candidateTokens = new Set(tokenizeText(text));
  const matchedTerms = expectedTerms.filter((term) => candidateTokens.has(term));
  const matchedTermSet = new Set(matchedTerms);
  const actionabilityHits = ACTION_TERMS.filter((term) => loweredText.includes(term.toLowerCase()));

  const outcomeProfile = dataset.outcome_profile;
  const failedCount = normalizeNumber(outcomeProfile && outcomeProfile.failed, 0);
  const blockedCount = normalizeNumber(outcomeProfile && outcomeProfile.blocked, 0);
  const unstableCount = failedCount + blockedCount;

  const coverageScore = expectedTerms.length > 0
    ? (matchedTerms.length / expectedTerms.length) * 70
    : 20;
  const actionabilityScore = Math.min(actionabilityHits.length, 3) * (20 / 3);
  const contextScore = dataset.total_observations > 0 ? 10 : 0;

  const contradictionHits = [];
  let contradictionPenalty = 0;
  // Claiming success over a history of failures needs concrete remediation steps to back it up.
  if (unstableCount > 0 && SUCCESS_CLAIMS.test(text) && actionabilityHits.length < 2) {
    contradictionHits.push('candidate-claims-success-without-remediation-steps');
    contradictionPenalty += 20;
  }
  if (expectedTerms.length > 0 && matchedTerms.length === 0) {
    contradictionHits.push('candidate-misses-replay-terms');
    contradictionPenalty += 10;
  }

  const score = Math.max(
    0,
    Math.min(100, coverageScore + actionabilityScore + contextScore - contradictionPenalty)
  );

  const regressionReasons = [];
  // At least two expected terms must match (or all of them when fewer than two exist).
  if (matchedTerms.length < Math.min(2, expectedTerms.length)) {
    regressionReasons.push('low-replay-term-coverage');
  }
  if (actionabilityHits.length === 0) {
    regressionReasons.push('missing-actionable-remediation');
  }
  regressionReasons.push(...contradictionHits);
  if (score < threshold) {
    regressionReasons.push('score-below-threshold');
  }

  return {
    score: Number(score.toFixed(2)),
    threshold: Number(threshold.toFixed(2)),
    verdict: score >= threshold ? 'passed' : 'failed',
    matched_terms: matchedTerms,
    missing_terms: expectedTerms.filter((term) => !matchedTermSet.has(term)),
    actionability_hits: actionabilityHits,
    contradiction_hits: contradictionHits,
    regression_reasons: [...new Set(regressionReasons)]
  };
}
|
|
209
|
+
|
|
210
|
+
/**
 * Load semantic observations from the state store and build a replay dataset.
 *
 * project_id, session_id and source_runtime are passed to the store query;
 * channel_id, scene_id, spec_id and observation_ids are applied in memory
 * afterwards, which is why the store is asked for a wider window
 * (max(limit * 10, 50)) than the final `limit`. Each option is accepted in
 * snake_case or camelCase.
 *
 * @param {object} [options={}] - Replay filters plus `limit` (default 10).
 * @param {object} [dependencies={}] - Optional `stateStore`, or `projectPath`,
 *   `fileSystem`, `env`, `sqliteModule`, `noCache` used to obtain one.
 * @returns {Promise<{observations: object[], replay_dataset: object, store: object}>}
 * @throws {Error} When no observation matches the filters.
 */
async function collectSemanticReplayInputs(options = {}, dependencies = {}) {
  const projectPath = dependencies.projectPath || process.cwd();
  let store = dependencies.stateStore;
  if (!store) {
    store = getSceStateStore(projectPath, {
      fileSystem: dependencies.fileSystem,
      env: dependencies.env,
      sqliteModule: dependencies.sqliteModule,
      noCache: dependencies.noCache === true
    });
  }

  const readFilter = (snakeKey, camelKey) => normalizeString(options[snakeKey] || options[camelKey]);

  const limit = normalizeInteger(options.limit, 10);
  const query = {
    project_id: readFilter('project_id', 'projectId'),
    session_id: readFilter('session_id', 'sessionId'),
    source_runtime: readFilter('source_runtime', 'sourceRuntime'),
    limit: Math.max(limit * 10, 50)
  };

  // Filters the store query does not receive; an empty value disables the filter.
  const fieldFilters = [
    ['channel_id', readFilter('channel_id', 'channelId')],
    ['scene_id', readFilter('scene_id', 'sceneId')],
    ['spec_id', readFilter('spec_id', 'specId')]
  ];
  const wantedIds = new Set(normalizeStringArray(options.observation_ids || options.observationIds, []));

  const fetched = await store.listSemanticObservations(query);
  const candidates = Array.isArray(fetched) ? fetched : [];

  const matchesFilters = (item) => {
    const fieldsMatch = fieldFilters.every(
      ([field, wanted]) => !wanted || normalizeString(item[field]) === wanted
    );
    if (!fieldsMatch) {
      return false;
    }
    return wantedIds.size === 0 || wantedIds.has(normalizeString(item.observation_id));
  };

  const observations = candidates.filter(matchesFilters).slice(0, limit);
  if (observations.length === 0) {
    throw new Error('No semantic observations matched the replay filters');
  }

  return {
    observations,
    replay_dataset: buildSemanticReplayDataset(observations, options),
    store
  };
}
|
|
263
|
+
|
|
264
|
+
/**
 * Score a candidate reply or action plan against replayed observations,
 * compare it with a baseline run, and persist the evaluation.
 *
 * The candidate text is candidate_reply (or candidate_text) when present,
 * otherwise candidate_plan. The baseline is the run named by
 * baseline_evaluation_run_id, or else the first of up to 10 listed runs for
 * the same project/spec with the same candidate kind. A regression is flagged
 * when the verdict is 'failed' or the score dropped by 10 or more points
 * against the baseline. Every option is accepted in snake_case or camelCase.
 *
 * @param {object} [options={}] - Candidate text, replay filters, threshold, labels.
 * @param {object} [dependencies={}] - Forwarded to collectSemanticReplayInputs.
 * @returns {Promise<{evaluation_run: object, replay_dataset: object, observations: object[]}>}
 * @throws {Error} When no candidate text is supplied, no observation matches,
 *   or the store does not return the persisted run.
 */
async function evaluateSemanticCandidate(options = {}, dependencies = {}) {
  const rawOption = (snakeKey, camelKey) => options[snakeKey] || options[camelKey];

  const candidateReply = normalizeString(
    options.candidate_reply || options.candidateReply || options.candidate_text || options.candidateText
  );
  const candidatePlan = normalizeString(rawOption('candidate_plan', 'candidatePlan'));
  const candidateText = candidateReply || candidatePlan;
  if (!candidateText) {
    throw new Error('candidate_reply or candidate_plan is required');
  }
  const candidateKind = candidateReply ? 'reply' : 'action-plan';

  const replayInputs = await collectSemanticReplayInputs(options, {
    ...dependencies,
    projectPath: dependencies.projectPath || process.cwd()
  });
  const { observations, store } = replayInputs;
  const replayDataset = replayInputs.replay_dataset;

  // Resolve the baseline: an explicit run id wins, otherwise the first listed run of the same kind.
  const baselineRunId = normalizeString(rawOption('baseline_evaluation_run_id', 'baselineEvaluationRunId'));
  let baselineRun = null;
  if (baselineRunId) {
    baselineRun = await store.getSemanticEvaluationRun(baselineRunId);
  } else {
    const priorRuns = await store.listSemanticEvaluationRuns({
      project_id: rawOption('project_id', 'projectId'),
      spec_id: rawOption('spec_id', 'specId'),
      limit: 10
    });
    if (Array.isArray(priorRuns)) {
      baselineRun = priorRuns.find((item) => normalizeString(item.candidate_kind) === candidateKind) || null;
    }
  }

  const scoring = scoreCandidateText(candidateText, replayDataset, options);

  let scoreDelta = null;
  if (baselineRun && typeof baselineRun.score === 'number') {
    scoreDelta = Number((scoring.score - baselineRun.score).toFixed(2));
  }
  const degradedFromBaseline = typeof scoreDelta === 'number' && scoreDelta <= -10;
  const regressionDetected = scoring.verdict === 'failed' || degradedFromBaseline;

  let driftLevel = 'stable';
  if (regressionDetected) {
    driftLevel = 'regression';
  } else if (typeof scoreDelta === 'number' && scoreDelta < 0) {
    driftLevel = 'watch';
  }

  const regressionReasons = degradedFromBaseline
    ? [...scoring.regression_reasons, 'degraded-from-baseline']
    : [...scoring.regression_reasons];

  const governance = {
    eligible_for_errorbook: regressionDetected,
    eligible_for_problem_evaluation: regressionDetected,
    promotion_gate_passed: !regressionDetected,
    promotion_gate: regressionDetected ? 'semantic-eval-blocked' : 'semantic-eval-pass',
    evidence_required: true,
    release_gate: normalizeString(rawOption('release_gate', 'releaseGate')) || null
  };

  let baselineEvidence = null;
  if (baselineRun) {
    baselineEvidence = {
      evaluation_run_id: baselineRun.evaluation_run_id,
      score: baselineRun.score,
      score_delta: scoreDelta
    };
  }
  const evidence = {
    replay_dataset: {
      total_observations: replayDataset.total_observations,
      expected_terms: replayDataset.expected_terms,
      outcome_profile: replayDataset.outcome_profile,
      intent_profile: replayDataset.intent_profile
    },
    candidate_preview: candidateText.slice(0, 240),
    matched_terms: scoring.matched_terms,
    missing_terms: scoring.missing_terms,
    actionability_hits: scoring.actionability_hits,
    contradiction_hits: scoring.contradiction_hits,
    baseline: baselineEvidence
  };

  // Explicit options win; otherwise fall back to the context derived from the replayed observations.
  const context = replayDataset.context;
  const evaluationRun = await store.appendSemanticEvaluationRun({
    project_id: rawOption('project_id', 'projectId') || context.project_id,
    channel_id: rawOption('channel_id', 'channelId') || context.channel_ids[0],
    session_id: rawOption('session_id', 'sessionId') || context.session_ids[0],
    scene_id: rawOption('scene_id', 'sceneId') || context.scene_ids[0],
    spec_id: rawOption('spec_id', 'specId') || context.spec_ids[0],
    lesson_id: rawOption('lesson_id', 'lessonId'),
    source_runtime: rawOption('source_runtime', 'sourceRuntime') || context.source_runtimes[0],
    release_gate: rawOption('release_gate', 'releaseGate'),
    baseline_evaluation_run_id: baselineRun ? baselineRun.evaluation_run_id : null,
    candidate_kind: candidateKind,
    candidate_label: normalizeString(rawOption('candidate_label', 'candidateLabel')) || null,
    candidate_payload: {
      text: candidateText
    },
    observation_ids: replayDataset.observation_ids,
    replay_summary: replayDataset,
    score: scoring.score,
    threshold: scoring.threshold,
    verdict: scoring.verdict,
    regression_detected: regressionDetected,
    drift_level: driftLevel,
    regression_reasons: [...new Set(regressionReasons)],
    evidence,
    governance
  });

  if (!evaluationRun) {
    throw new Error('Failed to persist semantic evaluation run');
  }

  return {
    evaluation_run: evaluationRun,
    replay_dataset: replayDataset,
    observations: observations.map((item) => summarizeObservation(item))
  };
}
|
|
375
|
+
|
|
376
|
+
// Public API of the replay evaluator: the tokenizer, the dataset builder, the
// pure scoring function, and the two store-backed async entry points.
module.exports = {
|
|
377
|
+
tokenizeText,
|
|
378
|
+
buildSemanticReplayDataset,
|
|
379
|
+
scoreCandidateText,
|
|
380
|
+
collectSemanticReplayInputs,
|
|
381
|
+
evaluateSemanticCandidate
|
|
382
|
+
};
|