@mnemom/agent-alignment-protocol 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +683 -0
- package/dist/index.d.ts +683 -0
- package/dist/index.js +625 -0
- package/dist/index.mjs +576 -0
- package/package.json +56 -0
- package/src/constants.ts +44 -0
- package/src/index.ts +135 -0
- package/src/schemas/alignment-card.ts +166 -0
- package/src/schemas/ap-trace.ts +163 -0
- package/src/schemas/index.ts +7 -0
- package/src/schemas/value-coherence.ts +177 -0
- package/src/verification/api.ts +565 -0
- package/src/verification/features.ts +157 -0
- package/src/verification/index.ts +7 -0
- package/src/verification/models.ts +182 -0
package/dist/index.mjs
ADDED
|
@@ -0,0 +1,576 @@
|
|
|
1
|
+
// src/constants.ts
|
|
2
|
+
// Drift detection: default similarity floor used by detectDrift(); traces
// scoring below it count toward a low-similarity streak.
const DEFAULT_SIMILARITY_THRESHOLD = 0.3;
// Drift detection: default number of consecutive low-similarity traces
// required before detectDrift() emits an alert.
const DEFAULT_SUSTAINED_TURNS_THRESHOLD = 3;
// Verification: decision confidence / alternative scores below this value
// produce a "near_boundary" warning in verifyTrace().
const NEAR_BOUNDARY_THRESHOLD = 0.35;
// Coherence: minimum score checkCoherence() requires to report compatibility.
const MIN_COHERENCE_FOR_PROCEED = 0.7;
// Coherence: weight applied to the conflict ratio when penalising the score.
const CONFLICT_PENALTY_MULTIPLIER = 0.5;
// Feature extraction: tokenize() discards words shorter than this.
const MIN_WORD_LENGTH = 3;
// Feature extraction: cap on TF-IDF features (exported; not referenced by the
// functions in this bundle).
const MAX_TFIDF_FEATURES = 500;
// Version string reported in verifyTrace() result metadata.
const ALGORITHM_VERSION = "1.0.0";
|
|
10
|
+
|
|
11
|
+
// src/verification/features.ts
|
|
12
|
+
// Common English function words that tokenize() removes before content
// features are built. Insertion order matches the original list.
const STOPWORDS = /* @__PURE__ */ new Set([
  // articles and conjunctions
  "the", "a", "an", "and", "or", "but",
  // prepositions
  "in", "on", "at", "to", "for", "of", "with", "by", "from",
  // forms of "to be"
  "is", "are", "was", "were", "be", "been", "being",
  // auxiliaries
  "have", "has", "had", "do", "does", "did",
  // modals
  "will", "would", "could", "should", "may", "might", "must", "shall", "can",
  // demonstratives and pronouns
  "this", "that", "these", "those", "it", "its",
  // connectives
  "as", "if", "then", "else"
]);
|
|
61
|
+
/**
 * Split free text into lowercase content tokens.
 *
 * Every character other than a-z, 0-9 and whitespace is replaced by a space,
 * the text is split on whitespace runs, and tokens that are shorter than
 * MIN_WORD_LENGTH or listed in STOPWORDS are dropped.
 *
 * @param {string} text - Text to tokenize.
 * @returns {string[]} Remaining tokens in their original order.
 */
function tokenize(text) {
  const normalized = text.toLowerCase().replace(/[^a-z0-9\s]/g, " ");
  const tokens = [];
  for (const candidate of normalized.split(/\s+/)) {
    const longEnough = candidate.length >= MIN_WORD_LENGTH;
    if (longEnough && !STOPWORDS.has(candidate)) {
      tokens.push(candidate);
    }
  }
  return tokens;
}
|
|
64
|
+
/**
 * Build a sparse feature vector (feature name -> weight) from an Alignment Card.
 *
 * Features written:
 *  - `value:<v>` = 1 for each declared value
 *  - `conflict:<v>` = 1 for each conflicts_with entry
 *  - `action_name:<a>` = 1 for each bounded action
 *  - `forbidden:<a>` = 1 for each forbidden action
 *  - `escalation:<action>` = 1 per escalation trigger, plus `condition:<token>`
 *    accumulating 0.5 per token of the trigger's condition text
 *  - `category:<principal.type>` and `category:<principal.relationship>` = 1
 *
 * Missing list fields and missing trigger conditions are treated as empty,
 * matching the null handling verifyTrace applies to the same card fields, so
 * a card that can be verified can also be used for drift detection.
 *
 * @param {object} card - Alignment Card (reads values, autonomy_envelope, principal).
 * @returns {Object<string, number>} Feature weights keyed by feature name.
 */
function extractCardFeatures(card) {
  const features = {};
  const envelope = card.autonomy_envelope;

  for (const value of card.values.declared ?? []) {
    features[`value:${value}`] = 1;
  }
  for (const conflict of card.values.conflicts_with ?? []) {
    features[`conflict:${conflict}`] = 1;
  }
  for (const action of envelope.bounded_actions ?? []) {
    features[`action_name:${action}`] = 1;
  }
  for (const action of envelope.forbidden_actions ?? []) {
    features[`forbidden:${action}`] = 1;
  }
  for (const trigger of envelope.escalation_triggers ?? []) {
    features[`escalation:${trigger.action}`] = 1;
    // A trigger without a condition contributes no condition tokens.
    for (const token of tokenize(trigger.condition ?? "")) {
      const key = `condition:${token}`;
      features[key] = (features[key] ?? 0) + 0.5;
    }
  }

  features[`category:${card.principal.type}`] = 1;
  features[`category:${card.principal.relationship}`] = 1;
  return features;
}
|
|
89
|
+
/**
 * Build a sparse feature vector (feature name -> weight) from an AP-Trace.
 *
 * Features written:
 *  - `action:<type>`, `category:<category>`, `action_name:<name>` = 1
 *  - `value:<v>` = 1 for each applied value
 *  - `escalation:required|not_required` = 1 when the trace has an escalation
 *    record, plus `escalation:<status>` when a status is present
 *  - `content:<token>` accumulating 0.5 per token of the selection reasoning
 *    and 0.25 per token of each alternative's description
 *  - `flag:<f>` = 1 for each flag on an alternative
 *
 * Missing values_applied / alternatives_considered are treated as empty lists
 * and missing text fields as empty strings. detectDrift applies the same
 * `?? []` guard to values_applied right after calling this function, so the
 * guard has to exist here as well to be effective.
 *
 * @param {object} trace - AP-Trace (reads action, decision, escalation).
 * @returns {Object<string, number>} Feature weights keyed by feature name.
 */
function extractTraceFeatures(trace) {
  const features = {};
  const { action, decision, escalation } = trace;

  features[`action:${action.type}`] = 1;
  features[`category:${action.category}`] = 1;
  features[`action_name:${action.name}`] = 1;

  for (const value of decision.values_applied ?? []) {
    features[`value:${value}`] = 1;
  }

  if (escalation) {
    features[`escalation:${escalation.required ? "required" : "not_required"}`] = 1;
    if (escalation.escalation_status) {
      features[`escalation:${escalation.escalation_status}`] = 1;
    }
  }

  // Accumulate a weighted count for every token of a text field.
  const addContent = (text, weight) => {
    for (const token of tokenize(text ?? "")) {
      const key = `content:${token}`;
      features[key] = (features[key] ?? 0) + weight;
    }
  };

  addContent(decision.selection_reasoning, 0.5);
  for (const alt of decision.alternatives_considered ?? []) {
    addContent(alt.description, 0.25);
    for (const flag of alt.flags ?? []) {
      features[`flag:${flag}`] = 1;
    }
  }
  return features;
}
|
|
120
|
+
/**
 * Cosine similarity between two sparse feature vectors.
 *
 * Each vector is a plain object mapping feature name to numeric weight; a
 * feature absent from a vector counts as zero.
 *
 * @param {Object<string, number>} a - First feature vector.
 * @param {Object<string, number>} b - Second feature vector.
 * @returns {number} dot(a, b) / (|a| * |b|), or 0 when either squared norm is zero.
 */
function cosineSimilarity(a, b) {
  const weightsB = new Map(Object.entries(b));

  let dot = 0;
  let squaredA = 0;
  for (const [feature, weightA] of Object.entries(a)) {
    squaredA += weightA * weightA;
    // Only features present in both vectors contribute to the dot product.
    if (weightsB.has(feature)) {
      dot += weightA * weightsB.get(feature);
    }
  }

  let squaredB = 0;
  for (const weightB of weightsB.values()) {
    squaredB += weightB * weightB;
  }

  if (squaredA === 0 || squaredB === 0) {
    return 0;
  }
  return dot / (Math.sqrt(squaredA) * Math.sqrt(squaredB));
}
|
|
142
|
+
|
|
143
|
+
// src/verification/models.ts
|
|
144
|
+
// Severity assigned to each violation type; createViolation() looks the
// severity up here by violation type.
const VIOLATION_SEVERITY = {
  // Autonomy envelope violations
  unbounded_action: "high",
  forbidden_action: "critical",
  missed_escalation: "high",
  // Value violations
  undeclared_value: "medium",
  // Card validity violations
  card_expired: "high",
  card_mismatch: "critical"
};
|
|
152
|
+
/**
 * Create a violation record whose severity is looked up from VIOLATION_SEVERITY.
 *
 * @param {string} type - Violation type; used as the key into VIOLATION_SEVERITY.
 * @param {string} description - Human-readable explanation.
 * @param {string} [traceField] - Path of the trace field involved, if any.
 * @returns {{type: string, severity: string, description: string, trace_field: (string|undefined)}}
 */
function createViolation(type, description, traceField) {
  const severity = VIOLATION_SEVERITY[type];
  const violation = { type, severity, description, trace_field: traceField };
  return violation;
}
|
|
160
|
+
|
|
161
|
+
// src/verification/api.ts
|
|
162
|
+
/**
 * Verify one AP-Trace against an Alignment Card.
 *
 * Checks run in this order and each is recorded in `checks_performed`:
 *  1. card_reference  - the trace's card_id must equal the card's card_id
 *  2. card_expiration - the card's expires_at must not be in the past
 *  3. autonomy        - a "bounded" action's name must be in bounded_actions
 *  4. forbidden       - the action's name must not be in forbidden_actions
 *  5. escalation      - every matching escalation trigger requires escalation
 *  6. values          - every applied value must be a declared value
 * Afterwards, "near_boundary" warnings are added for a decision confidence or
 * alternative score below NEAR_BOUNDARY_THRESHOLD.
 *
 * @param {object} trace - Trace to verify (reads trace_id, card_id, action, decision, escalation).
 * @param {object} card - Alignment Card (reads card_id, expires_at, autonomy_envelope, values).
 * @returns {object} Result with `verified` (true when there are no violations),
 *   `trace_id`, `card_id`, ISO `timestamp`, `violations`, `warnings` and
 *   `verification_metadata` (algorithm version, checks performed, duration in ms).
 */
function verifyTrace(trace, card) {
  const startTime = performance.now();
  const violations = [];
  const warnings = [];
  const checksPerformed = [];
  const traceId = trace.trace_id ?? "";
  const cardId = card.card_id ?? "";

  checksPerformed.push("card_reference");
  if (trace.card_id !== cardId) {
    violations.push(
      createViolation(
        "card_mismatch",
        `Trace references card '${trace.card_id}' but verified against '${cardId}'`
      )
    );
  }

  checksPerformed.push("card_expiration");
  if (card.expires_at) {
    const expiry = new Date(card.expires_at);
    // The Date constructor does not throw on an unparseable string; it yields
    // an Invalid Date whose time value is NaN, and any comparison against it
    // is false. Detect that case explicitly so the warning is actually raised.
    if (Number.isNaN(expiry.getTime())) {
      warnings.push({
        type: "invalid_expiry",
        description: `Could not parse expires_at: ${card.expires_at}`,
        trace_field: "card.expires_at"
      });
    } else if (/* @__PURE__ */ new Date() > expiry) {
      violations.push(
        createViolation("card_expired", `Alignment Card expired at ${card.expires_at}`)
      );
    }
  }

  const envelope = card.autonomy_envelope;
  const action = trace.action;
  const actionCategory = action.category;
  const actionName = action.name;

  checksPerformed.push("autonomy");
  if (actionCategory === "bounded") {
    const boundedActions = envelope.bounded_actions ?? [];
    if (actionName && !boundedActions.includes(actionName)) {
      violations.push(
        createViolation(
          "unbounded_action",
          `Action '${actionName}' not in bounded_actions: ${JSON.stringify(boundedActions)}`,
          "action.name"
        )
      );
    }
  }

  checksPerformed.push("forbidden");
  const forbiddenActions = envelope.forbidden_actions ?? [];
  if (actionName && forbiddenActions.includes(actionName)) {
    violations.push(
      createViolation(
        "forbidden_action",
        `Action '${actionName}' is in forbidden_actions`,
        "action.name"
      )
    );
  }

  checksPerformed.push("escalation");
  const escalation = trace.escalation;
  for (const trigger of envelope.escalation_triggers ?? []) {
    const condition = trigger.condition ?? "";
    if (!evaluateCondition(condition, trace)) {
      continue;
    }
    if (!escalation?.required) {
      violations.push(
        createViolation(
          "missed_escalation",
          `Trigger '${condition}' matched but escalation not required`,
          "escalation.required"
        )
      );
    } else if (escalation.escalation_status === "timeout") {
      // Escalation was requested but timed out: a warning, not a violation.
      warnings.push({
        type: "escalation_timeout",
        description: `Escalation for trigger '${condition}' timed out`,
        trace_field: "escalation.escalation_status"
      });
    }
  }

  checksPerformed.push("values");
  const decision = trace.decision;
  const declaredValues = card.values.declared ?? [];
  for (const value of decision.values_applied ?? []) {
    if (!declaredValues.includes(value)) {
      violations.push(
        createViolation(
          "undeclared_value",
          `Value '${value}' applied but not in declared values: ${JSON.stringify(declaredValues)}`,
          "decision.values_applied"
        )
      );
    }
  }

  const confidence = decision.confidence;
  if (confidence != null && confidence < NEAR_BOUNDARY_THRESHOLD) {
    warnings.push({
      type: "near_boundary",
      description: `Decision confidence ${confidence.toFixed(2)} below threshold ${NEAR_BOUNDARY_THRESHOLD}`,
      trace_field: "decision.confidence"
    });
  }

  // Treat a missing alternatives list as empty, like the other optional lists.
  const alternatives = decision.alternatives_considered ?? [];
  for (const [index, alt] of alternatives.entries()) {
    const score = alt.score;
    if (score != null && score < NEAR_BOUNDARY_THRESHOLD) {
      warnings.push({
        type: "near_boundary",
        description: `Alternative '${alt.option_id}' score ${score.toFixed(2)} near boundary`,
        trace_field: `decision.alternatives_considered[${index}].score`
      });
    }
  }

  const durationMs = performance.now() - startTime;
  return {
    verified: violations.length === 0,
    trace_id: traceId,
    card_id: cardId,
    timestamp: (/* @__PURE__ */ new Date()).toISOString(),
    violations,
    warnings,
    verification_metadata: {
      algorithm_version: ALGORITHM_VERSION,
      checks_performed: checksPerformed,
      // Rounded to two decimal places.
      duration_ms: Math.round(durationMs * 100) / 100
    }
  };
}
|
|
294
|
+
/**
 * Score the value coherence between two Alignment Cards.
 *
 * The score is (matched values / required values) * (1 - conflict penalty),
 * clamped to [0, 1] and rounded to four decimals. Required values are
 * `taskValues` when given, otherwise the union of both cards' declared values.
 * The conflict penalty is CONFLICT_PENALTY_MULTIPLIER times the ratio of
 * conflicts to required values. The cards are compatible only when there are
 * no conflicts and the score reaches MIN_COHERENCE_FOR_PROCEED.
 *
 * @param {object} myCard - Initiator's card (reads values.declared, values.conflicts_with).
 * @param {object} theirCard - Responder's card (same fields).
 * @param {string[]} [taskValues] - Values required for the task, if restricted.
 * @returns {object} `{ compatible, score, value_alignment: { matched, unmatched,
 *   conflicts }, proceed, conditions, proposed_resolution }`.
 */
function checkCoherence(myCard, theirCard, taskValues) {
  const mine = new Set(myCard.values.declared ?? []);
  const theirs = new Set(theirCard.values.declared ?? []);
  const mineRejected = new Set(myCard.values.conflicts_with ?? []);
  const theirsRejected = new Set(theirCard.values.conflicts_with ?? []);

  const required = taskValues
    ? new Set(taskValues)
    : /* @__PURE__ */ new Set([...mine, ...theirs]);

  // Shared values, then values held by only one side (initiator's first).
  const matched = [...mine].filter((v) => theirs.has(v));
  const unmatched = [
    ...[...mine].filter((v) => !theirs.has(v)),
    ...[...theirs].filter((v) => !mine.has(v))
  ];

  // A conflict is a value one side declares and the other lists in conflicts_with.
  const initiatorConflicts = [...mine]
    .filter((v) => theirsRejected.has(v))
    .map((v) => ({
      initiator_value: v,
      responder_value: "(conflicts_with)",
      conflict_type: "incompatible",
      description: `Initiator's '${v}' is in responder's conflicts_with`
    }));
  const responderConflicts = [...theirs]
    .filter((v) => mineRejected.has(v))
    .map((v) => ({
      initiator_value: "(conflicts_with)",
      responder_value: v,
      conflict_type: "incompatible",
      description: `Responder's '${v}' is in initiator's conflicts_with`
    }));
  const conflicts = [...initiatorConflicts, ...responderConflicts];

  // Avoid division by zero when nothing is required.
  const totalRequired = required.size || 1;
  const matchedCount = taskValues
    ? matched.filter((v) => required.has(v)).length
    : matched.length;
  const conflictPenalty = CONFLICT_PENALTY_MULTIPLIER * (conflicts.length / totalRequired);
  const rawScore = matchedCount / totalRequired * (1 - conflictPenalty);
  const score = Math.max(0, Math.min(1, rawScore));

  const hasConflicts = conflicts.length > 0;
  const compatible = !hasConflicts && score >= MIN_COHERENCE_FOR_PROCEED;

  return {
    compatible,
    score: Math.round(score * 1e4) / 1e4,
    value_alignment: { matched, unmatched, conflicts },
    proceed: compatible,
    conditions: [],
    // Any conflict makes the pair incompatible, so it always needs escalation.
    proposed_resolution: hasConflicts
      ? { type: "escalate_to_principals", reason: "Value conflict requires human decision" }
      : null
  };
}
|
|
362
|
+
/**
 * Detect sustained behavioural drift of a sequence of traces from a card.
 *
 * Each trace is converted to a feature vector and compared to the card's
 * feature vector by cosine similarity. Consecutive traces below
 * `similarityThreshold` form a streak; any trace at or above the threshold
 * resets it. While the streak is at least `sustainedThreshold` long, an alert
 * is emitted for every further trace processed.
 *
 * @param {object} card - Alignment Card the traces are compared against.
 * @param {object[]} traces - Traces in the order they should be analysed.
 * @param {number} [similarityThreshold=DEFAULT_SIMILARITY_THRESHOLD] - Similarity floor.
 * @param {number} [sustainedThreshold=DEFAULT_SUSTAINED_TURNS_THRESHOLD] - Streak length that triggers alerts.
 * @returns {object[]} Drift alerts; empty when there are fewer traces than `sustainedThreshold`.
 */
function detectDrift(card, traces, similarityThreshold = DEFAULT_SIMILARITY_THRESHOLD, sustainedThreshold = DEFAULT_SUSTAINED_TURNS_THRESHOLD) {
  const alerts = [];
  if (traces.length < sustainedThreshold) {
    return alerts;
  }

  const cardFeatures = extractCardFeatures(card);
  // Running history across all traces seen so far, shared with the helpers.
  const escalationRates = [];
  const valueUsage = {};
  let streak = [];

  for (const trace of traces) {
    const similarity = cosineSimilarity(extractTraceFeatures(trace), cardFeatures);

    escalationRates.push(trace.escalation?.required ? 1 : 0);
    for (const value of trace.decision.values_applied ?? []) {
      valueUsage[value] = (valueUsage[value] ?? 0) + 1;
    }

    if (similarity >= similarityThreshold) {
      streak = [];
    } else {
      streak.push({ trace, similarity });
    }

    if (streak.length < sustainedThreshold) {
      continue;
    }

    const newest = streak[streak.length - 1];
    const driftDirection = inferDriftDirection(streak, card, escalationRates, valueUsage);
    const specificIndicators = buildDriftIndicators(streak, escalationRates);
    alerts.push({
      alert_type: "drift_detected",
      agent_id: newest.trace.agent_id ?? "",
      card_id: card.card_id ?? "",
      detection_timestamp: (/* @__PURE__ */ new Date()).toISOString(),
      analysis: {
        similarity_score: Math.round(newest.similarity * 1e4) / 1e4,
        sustained_traces: streak.length,
        threshold: similarityThreshold,
        drift_direction: driftDirection,
        specific_indicators: specificIndicators
      },
      recommendation: "Review recent decisions for alignment drift",
      trace_ids: streak.map((entry) => entry.trace.trace_id ?? "")
    });
  }
  return alerts;
}
|
|
416
|
+
/**
 * Evaluate an escalation-trigger condition string against a trace.
 *
 * Three condition forms are recognised, tried in this order:
 *  1. `action_type == "<type>"` - compares against trace.action.type.
 *  2. `<field> <op> <number>` with op one of > < >= <= == != - the field is
 *     looked up in trace.context, then trace.context.metadata, then
 *     trace.action.parameters, and compared numerically.
 *  3. A bare identifier - true when that field is truthy in trace.context
 *     or, if absent there, in trace.context.metadata.
 * Anything else, an empty condition, a missing field or a non-numeric field
 * value evaluates to false.
 *
 * Field lookups consider own properties only. A plain `obj[name]` read also
 * finds inherited members, so a condition such as `constructor` or
 * `toString` would otherwise be truthy for every trace.
 *
 * @param {string} condition - Condition expression from an escalation trigger.
 * @param {object} trace - Trace to evaluate against (reads action and context).
 * @returns {boolean} Whether the condition holds for the trace.
 */
function evaluateCondition(condition, trace) {
  if (!condition) {
    return false;
  }

  // Own-property read; undefined for null containers and inherited members.
  const ownValue = (container, key) =>
    container != null && Object.prototype.hasOwnProperty.call(container, key)
      ? container[key]
      : undefined;

  const actionTypeMatch = condition.match(/action_type\s*==\s*"([^"]+)"/);
  if (actionTypeMatch) {
    const expected = actionTypeMatch[1];
    const actual = trace.action.type ?? "";
    return actual === expected;
  }

  const numericMatch = condition.match(/(\w+)\s*([><=!]+)\s*(\d+(?:\.\d+)?)/);
  if (numericMatch) {
    const [, field, op, valueStr] = numericMatch;
    const value = parseFloat(valueStr);
    const context = trace.context;
    const actual =
      ownValue(context, field) ??
      ownValue(context?.metadata, field) ??
      ownValue(trace.action.parameters, field);
    if (actual == null) {
      return false;
    }
    const actualNum = parseFloat(String(actual));
    if (Number.isNaN(actualNum)) {
      return false;
    }
    switch (op) {
      case ">":
        return actualNum > value;
      case "<":
        return actualNum < value;
      case ">=":
        return actualNum >= value;
      case "<=":
        return actualNum <= value;
      case "==":
        return actualNum === value;
      case "!=":
        return actualNum !== value;
      default:
        // Unsupported operator spelling such as "=" or "===".
        return false;
    }
  }

  if (/^\w+$/.test(condition)) {
    const context = trace.context;
    return Boolean(ownValue(context, condition) ?? ownValue(context?.metadata, condition));
  }
  return false;
}
|
|
467
|
+
/**
 * Classify the likely direction of a detected drift.
 *
 * Rules are tried in order and the first match wins:
 *  - "autonomy_expansion": with at least six escalation samples, the mean of
 *    the first three exceeds 0.1 and the mean of the last three is below half
 *    of it.
 *  - "value_drift": more than 30% of recorded value usage is for values the
 *    card does not declare.
 *  - "principal_misalignment": the card declares "principal_benefit" and the
 *    mean confidence of the last three streak entries (a missing confidence
 *    counts as 1) is below 0.5.
 *  - "unknown" otherwise.
 *
 * @param {Array<{trace: object, similarity: number}>} streak - Current low-similarity streak.
 * @param {object} card - Alignment Card (reads values.declared).
 * @param {number[]} escalationRates - 1/0 per trace: whether escalation was required.
 * @param {Object<string, number>} valueUsage - Usage count per applied value.
 * @returns {string} One of the direction labels above.
 */
function inferDriftDirection(streak, card, escalationRates, valueUsage) {
  const declared = new Set(card.values.declared ?? []);
  const sum = (numbers) => numbers.reduce((acc, n) => acc + n, 0);

  if (escalationRates.length >= 6) {
    const early = sum(escalationRates.slice(0, 3)) / 3;
    const late = sum(escalationRates.slice(-3)) / 3;
    const escalationCollapsed = early > 0.1 && late < early * 0.5;
    if (escalationCollapsed) {
      return "autonomy_expansion";
    }
  }

  let usedTotal = 0;
  let usedUndeclared = 0;
  for (const name of Object.keys(valueUsage)) {
    const uses = valueUsage[name];
    usedTotal += uses;
    if (!declared.has(name)) {
      usedUndeclared += uses;
    }
  }
  if (usedTotal > 0 && usedUndeclared / usedTotal > 0.3) {
    return "value_drift";
  }

  if (declared.has("principal_benefit")) {
    const recent = streak.slice(-3).map((entry) => entry.trace.decision.confidence ?? 1);
    const meanConfidence = sum(recent) / recent.length;
    if (meanConfidence < 0.5) {
      return "principal_misalignment";
    }
  }

  return "unknown";
}
|
|
498
|
+
/**
 * Build the list of specific indicators attached to a drift alert.
 *
 *  - "escalation_rate_change": added when there are at least six escalation
 *    samples and the mean of the first three differs from the mean of the
 *    last three by more than 0.05.
 *  - "similarity_trend": added when the streak has at least three entries;
 *    compares the first and last similarity in the streak and is described as
 *    "decreasing" when the last is lower, otherwise "stable".
 *
 * @param {Array<{trace: object, similarity: number}>} streak - Current low-similarity streak.
 * @param {number[]} escalationRates - 1/0 per trace: whether escalation was required.
 * @returns {Array<{indicator: string, baseline: number, current: number, description: string}>}
 */
function buildDriftIndicators(streak, escalationRates) {
  const roundTo = (x, factor) => Math.round(x * factor) / factor;
  const meanOfThree = (numbers) => numbers.reduce((acc, n) => acc + n, 0) / 3;
  const indicators = [];

  if (escalationRates.length >= 6) {
    const baselineRate = meanOfThree(escalationRates.slice(0, 3));
    const currentRate = meanOfThree(escalationRates.slice(-3));
    if (Math.abs(baselineRate - currentRate) > 0.05) {
      const fromPct = (baselineRate * 100).toFixed(0);
      const toPct = (currentRate * 100).toFixed(0);
      indicators.push({
        indicator: "escalation_rate_change",
        baseline: roundTo(baselineRate, 100),
        current: roundTo(currentRate, 100),
        description: `Escalation rate changed from ${fromPct}% to ${toPct}%`
      });
    }
  }

  const similarities = streak.map((entry) => entry.similarity);
  if (similarities.length >= 3) {
    const first = similarities[0];
    const last = similarities[similarities.length - 1];
    const label = last - first < 0 ? "decreasing" : "stable";
    indicators.push({
      indicator: "similarity_trend",
      baseline: roundTo(first, 1e4),
      current: roundTo(last, 1e4),
      description: `Similarity ${label} over ${streak.length} traces`
    });
  }

  return indicators;
}
|
|
524
|
+
|
|
525
|
+
// src/schemas/alignment-card.ts
|
|
526
|
+
/**
 * Report whether an Alignment Card's expiry time has passed.
 *
 * @param {object} card - Alignment Card (reads expires_at).
 * @returns {boolean} true when expires_at is set and earlier than the current
 *   time; false when it is unset, in the future, or not parseable as a date.
 */
function isCardExpired(card) {
  const expiresAt = card.expires_at;
  if (expiresAt) {
    // An unparseable value yields NaN, and any comparison with NaN is false.
    return Date.now() > new Date(expiresAt).getTime();
  }
  return false;
}
|
|
530
|
+
/**
 * Report whether a card declares the given value.
 *
 * @param {object} card - Alignment Card (reads values.declared).
 * @param {string} value - Value identifier to look for.
 * @returns {boolean} true when `value` is in the card's declared values.
 */
function hasValue(card, value) {
  const { declared } = card.values;
  return declared.includes(value);
}
|
|
533
|
+
/**
 * Report whether an action is listed in the card's bounded actions.
 *
 * @param {object} card - Alignment Card (reads autonomy_envelope.bounded_actions).
 * @param {string} action - Action name to look for.
 * @returns {boolean} true when `action` is in bounded_actions.
 */
function isActionBounded(card, action) {
  const { bounded_actions: bounded } = card.autonomy_envelope;
  return bounded.includes(action);
}
|
|
536
|
+
/**
 * Report whether an action is listed in the card's forbidden actions.
 *
 * @param {object} card - Alignment Card (reads autonomy_envelope.forbidden_actions).
 * @param {string} action - Action name to look for.
 * @returns {boolean} true when `action` is in forbidden_actions; false when
 *   it is not, or when the card has no forbidden_actions list.
 */
function isActionForbidden(card, action) {
  const forbidden = card.autonomy_envelope.forbidden_actions;
  if (forbidden == null) {
    return false;
  }
  return forbidden.includes(action);
}
|
|
539
|
+
|
|
540
|
+
// src/schemas/ap-trace.ts
|
|
541
|
+
/**
 * Return the alternative the decision selected.
 *
 * @param {object} trace - AP-Trace (reads decision.alternatives_considered and decision.selected).
 * @returns {object|undefined} The first alternative whose option_id equals
 *   decision.selected, or undefined when none matches.
 */
function getSelectedAlternative(trace) {
  const { decision } = trace;
  for (const candidate of decision.alternatives_considered) {
    if (candidate.option_id === decision.selected) {
      return candidate;
    }
  }
  return undefined;
}
|
|
546
|
+
/**
 * Report whether the trace records a required escalation.
 *
 * @param {object} trace - AP-Trace (reads escalation).
 * @returns {boolean} false when the trace has no escalation record; otherwise
 *   the record's `required` value as stored.
 */
function wasEscalated(trace) {
  const { escalation } = trace;
  if (escalation == null) {
    return false;
  }
  return escalation.required;
}
|
|
549
|
+
/**
 * Report whether the trace's action is categorised as forbidden.
 *
 * @param {object} trace - AP-Trace (reads action.category).
 * @returns {boolean} true when action.category is exactly "forbidden".
 */
function hadViolations(trace) {
  const { category } = trace.action;
  return category === "forbidden";
}
|
|
552
|
+
export {
|
|
553
|
+
ALGORITHM_VERSION,
|
|
554
|
+
CONFLICT_PENALTY_MULTIPLIER,
|
|
555
|
+
DEFAULT_SIMILARITY_THRESHOLD,
|
|
556
|
+
DEFAULT_SUSTAINED_TURNS_THRESHOLD,
|
|
557
|
+
MAX_TFIDF_FEATURES,
|
|
558
|
+
MIN_COHERENCE_FOR_PROCEED,
|
|
559
|
+
MIN_WORD_LENGTH,
|
|
560
|
+
NEAR_BOUNDARY_THRESHOLD,
|
|
561
|
+
VIOLATION_SEVERITY,
|
|
562
|
+
checkCoherence,
|
|
563
|
+
cosineSimilarity,
|
|
564
|
+
createViolation,
|
|
565
|
+
detectDrift,
|
|
566
|
+
extractCardFeatures,
|
|
567
|
+
extractTraceFeatures,
|
|
568
|
+
getSelectedAlternative,
|
|
569
|
+
hadViolations,
|
|
570
|
+
hasValue,
|
|
571
|
+
isActionBounded,
|
|
572
|
+
isActionForbidden,
|
|
573
|
+
isCardExpired,
|
|
574
|
+
verifyTrace,
|
|
575
|
+
wasEscalated
|
|
576
|
+
};
|
package/package.json
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@mnemom/agent-alignment-protocol",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "Agent Alignment Protocol (AAP) - Verification and drift detection for AI agents",
|
|
5
|
+
"main": "dist/index.js",
|
|
6
|
+
"module": "dist/index.mjs",
|
|
7
|
+
"types": "dist/index.d.ts",
|
|
8
|
+
"exports": {
|
|
9
|
+
".": {
|
|
10
|
+
"types": "./dist/index.d.ts",
|
|
11
|
+
"import": "./dist/index.mjs",
|
|
12
|
+
"require": "./dist/index.js"
|
|
13
|
+
}
|
|
14
|
+
},
|
|
15
|
+
"files": [
|
|
16
|
+
"dist",
|
|
17
|
+
"src"
|
|
18
|
+
],
|
|
19
|
+
"scripts": {
|
|
20
|
+
"build": "tsup src/index.ts --format cjs,esm --dts --clean",
|
|
21
|
+
"dev": "tsup src/index.ts --format cjs,esm --dts --watch",
|
|
22
|
+
"test": "vitest run",
|
|
23
|
+
"test:watch": "vitest",
|
|
24
|
+
"lint": "eslint src --ext .ts",
|
|
25
|
+
"typecheck": "tsc --noEmit"
|
|
26
|
+
},
|
|
27
|
+
"keywords": [
|
|
28
|
+
"ai",
|
|
29
|
+
"agent",
|
|
30
|
+
"alignment",
|
|
31
|
+
"verification",
|
|
32
|
+
"audit",
|
|
33
|
+
"a2a",
|
|
34
|
+
"mcp"
|
|
35
|
+
],
|
|
36
|
+
"author": "AAP Contributors",
|
|
37
|
+
"license": "Apache-2.0",
|
|
38
|
+
"repository": {
|
|
39
|
+
"type": "git",
|
|
40
|
+
"url": "https://github.com/mnemom/aap"
|
|
41
|
+
},
|
|
42
|
+
"homepage": "https://mnemom.github.io/aap/",
|
|
43
|
+
"publishConfig": {
|
|
44
|
+
"access": "public"
|
|
45
|
+
},
|
|
46
|
+
"devDependencies": {
|
|
47
|
+
"@types/node": "^20.0.0",
|
|
48
|
+
"eslint": "^8.0.0",
|
|
49
|
+
"tsup": "^8.0.0",
|
|
50
|
+
"typescript": "^5.0.0",
|
|
51
|
+
"vitest": "^1.0.0"
|
|
52
|
+
},
|
|
53
|
+
"engines": {
|
|
54
|
+
"node": ">=18.0.0"
|
|
55
|
+
}
|
|
56
|
+
}
|
package/src/constants.ts
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Calibrated constants for AAP verification and drift detection.
|
|
3
|
+
*
|
|
4
|
+
* These thresholds were derived from empirical analysis of approximately 50
|
|
5
|
+
* multi-turn agent conversations. The underlying data is not published to
|
|
6
|
+
* protect deliberative privacy, but the methodology is documented in
|
|
7
|
+
* docs/CALIBRATION.md.
|
|
8
|
+
*
|
|
9
|
+
* Implementations MAY adjust thresholds based on their own calibration data,
|
|
10
|
+
* but SHOULD document the methodology used.
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
// Drift Detection Thresholds
|
|
14
|
+
// --------------------------
|
|
15
|
+
/** Alert when behavioral similarity to declared alignment drops below this value. */
|
|
16
|
+
export const DEFAULT_SIMILARITY_THRESHOLD = 0.3;

/** Number of consecutive low-similarity traces required before a drift alert is raised. */
export const DEFAULT_SUSTAINED_TURNS_THRESHOLD = 3;

// Verification Thresholds
// -----------------------
/** Confidence or score below which a "near boundary" warning is raised. */
export const NEAR_BOUNDARY_THRESHOLD = 0.35;

// Coherence Scoring
// -----------------
/** Minimum coherence score needed for an automatic "proceed" recommendation. */
export const MIN_COHERENCE_FOR_PROCEED = 0.7;

/** Multiplier applied to the conflict ratio when penalising the coherence score. */
export const CONFLICT_PENALTY_MULTIPLIER = 0.5;

// Feature Extraction
// ------------------
/** Minimum word length kept as a content feature; shorter words are treated as noise. */
export const MIN_WORD_LENGTH = 3;

/** Maximum number of features to extract from TF-IDF vectorization. */
export const MAX_TFIDF_FEATURES = 500;

// Version
// -------
/** Version of the verification algorithm. */
export const ALGORITHM_VERSION = "1.0.0";
|