@mnemom/agent-alignment-protocol 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,565 @@
1
+ /**
2
+ * AAP Verification API - The three public entry points.
3
+ *
4
+ * This module provides the core verification functionality:
5
+ * - verifyTrace: Verify a single AP-Trace against an Alignment Card
6
+ * - checkCoherence: Check value coherence between two Alignment Cards
7
+ * - detectDrift: Detect behavioral drift from declared alignment over time
8
+ *
9
+ * @see SPEC.md Sections 7, 6.4, and 8 for protocol specification.
10
+ */
11
+
12
+ import {
13
+ ALGORITHM_VERSION,
14
+ CONFLICT_PENALTY_MULTIPLIER,
15
+ DEFAULT_SIMILARITY_THRESHOLD,
16
+ DEFAULT_SUSTAINED_TURNS_THRESHOLD,
17
+ MIN_COHERENCE_FOR_PROCEED,
18
+ NEAR_BOUNDARY_THRESHOLD,
19
+ } from "../constants";
20
+ import type { AlignmentCard } from "../schemas/alignment-card";
21
+ import type { APTrace } from "../schemas/ap-trace";
22
+ import {
23
+ cosineSimilarity,
24
+ extractCardFeatures,
25
+ extractTraceFeatures,
26
+ } from "./features";
27
+ import {
28
+ createViolation,
29
+ type CoherenceResult,
30
+ type DriftAlert,
31
+ type DriftDirection,
32
+ type DriftIndicator,
33
+ type ValueConflictResult,
34
+ type VerificationResult,
35
+ type Violation,
36
+ type Warning,
37
+ } from "./models";
38
+
39
/**
 * Verify a single AP-Trace against an Alignment Card.
 *
 * Performs the verification algorithm specified in SPEC Section 7.3. The
 * checks run in a fixed order and each one is recorded in
 * `verification_metadata.checks_performed`:
 *   - card_reference:  the trace's `card_id` equals the card's `card_id`
 *   - card_expiration: `expires_at`, when present, is parseable and in the future
 *   - autonomy:        a "bounded" action is listed in `bounded_actions`
 *   - forbidden:       the action is not listed in `forbidden_actions`
 *   - escalation:      every matching escalation trigger was escalated
 *   - values:          every applied value is a declared value
 *
 * Low decision confidence and low-scoring alternatives produce
 * `near_boundary` warnings; warnings never affect `verified`.
 *
 * @param trace - AP-Trace to verify
 * @param card - Alignment Card to verify against
 * @returns VerificationResult whose `verified` flag is true only when no
 *          violations were recorded
 */
export function verifyTrace(
  trace: APTrace,
  card: AlignmentCard
): VerificationResult {
  const startTime = performance.now();
  const violations: Violation[] = [];
  const warnings: Warning[] = [];
  const checksPerformed: string[] = [];

  const traceId = trace.trace_id ?? "";
  const cardId = card.card_id ?? "";

  // Check card reference
  checksPerformed.push("card_reference");
  if (trace.card_id !== cardId) {
    violations.push(
      createViolation(
        "card_mismatch",
        `Trace references card '${trace.card_id}' but verified against '${cardId}'`
      )
    );
  }

  // Check card expiration
  checksPerformed.push("card_expiration");
  if (card.expires_at) {
    // `new Date(...)` never throws on bad input; it yields an Invalid Date
    // whose timestamp is NaN, so unparseable values must be detected here.
    const expiryMs = new Date(card.expires_at).getTime();
    if (Number.isNaN(expiryMs)) {
      warnings.push({
        type: "invalid_expiry",
        description: `Could not parse expires_at: ${card.expires_at}`,
        trace_field: "card.expires_at",
      });
    } else if (Date.now() > expiryMs) {
      violations.push(
        createViolation("card_expired", `Alignment Card expired at ${card.expires_at}`)
      );
    }
  }

  // Extract envelope for remaining checks
  const envelope = card.autonomy_envelope;
  const action = trace.action;

  // Check autonomy compliance
  checksPerformed.push("autonomy");
  const actionCategory = action.category;
  const actionName = action.name;

  if (actionCategory === "bounded") {
    const boundedActions = envelope.bounded_actions ?? [];
    if (actionName && !boundedActions.includes(actionName)) {
      violations.push(
        createViolation(
          "unbounded_action",
          `Action '${actionName}' not in bounded_actions: ${JSON.stringify(boundedActions)}`,
          "action.name"
        )
      );
    }
  }

  // Check forbidden actions
  checksPerformed.push("forbidden");
  const forbiddenActions = envelope.forbidden_actions ?? [];
  if (actionName && forbiddenActions.includes(actionName)) {
    violations.push(
      createViolation(
        "forbidden_action",
        `Action '${actionName}' is in forbidden_actions`,
        "action.name"
      )
    );
  }

  // Check escalation compliance
  checksPerformed.push("escalation");
  const escalation = trace.escalation;
  for (const trigger of envelope.escalation_triggers ?? []) {
    const condition = trigger.condition ?? "";
    if (!evaluateCondition(condition, trace)) {
      continue;
    }
    if (!escalation?.required) {
      violations.push(
        createViolation(
          "missed_escalation",
          `Trigger '${condition}' matched but escalation not required`,
          "escalation.required"
        )
      );
    } else if (escalation.escalation_status === "timeout") {
      // Timeout is not a violation if escalation was attempted
      warnings.push({
        type: "escalation_timeout",
        description: `Escalation for trigger '${condition}' timed out`,
        trace_field: "escalation.escalation_status",
      });
    }
  }

  // Check value consistency
  checksPerformed.push("values");
  const decision = trace.decision;
  const declaredValues = card.values.declared ?? [];
  const valuesApplied = decision.values_applied ?? [];

  for (const value of valuesApplied) {
    if (!declaredValues.includes(value)) {
      violations.push(
        createViolation(
          "undeclared_value",
          `Value '${value}' applied but not in declared values: ${JSON.stringify(declaredValues)}`,
          "decision.values_applied"
        )
      );
    }
  }

  // Near-boundary warnings
  const confidence = decision.confidence;
  if (confidence != null && confidence < NEAR_BOUNDARY_THRESHOLD) {
    warnings.push({
      type: "near_boundary",
      description: `Decision confidence ${confidence.toFixed(2)} below threshold ${NEAR_BOUNDARY_THRESHOLD}`,
      trace_field: "decision.confidence",
    });
  }

  // Alternatives near boundary check. The list is treated as optional, like
  // the other trace collections above, so a trace without it does not throw.
  const alternatives = decision.alternatives_considered ?? [];
  for (let i = 0; i < alternatives.length; i++) {
    const alt = alternatives[i];
    const score = alt.score;
    if (score != null && score < NEAR_BOUNDARY_THRESHOLD) {
      warnings.push({
        type: "near_boundary",
        description: `Alternative '${alt.option_id}' score ${score.toFixed(2)} near boundary`,
        trace_field: `decision.alternatives_considered[${i}].score`,
      });
    }
  }

  const durationMs = performance.now() - startTime;

  return {
    verified: violations.length === 0,
    trace_id: traceId,
    card_id: cardId,
    timestamp: new Date().toISOString(),
    violations,
    warnings,
    verification_metadata: {
      algorithm_version: ALGORITHM_VERSION,
      checks_performed: checksPerformed,
      // Rounded to two decimal places.
      duration_ms: Math.round(durationMs * 100) / 100,
    },
  };
}
211
+
212
/**
 * Check value coherence between two Alignment Cards.
 *
 * Computes the coherence score as specified in SPEC Section 6.4:
 *   score = (matched / required) * (1 - conflict_penalty)
 *   where conflict_penalty = CONFLICT_PENALTY_MULTIPLIER * (conflicts / required)
 *
 * `required` is the set of `taskValues` when a non-empty list is supplied;
 * otherwise it is the union of both cards' declared values. An empty
 * `taskValues` array is treated the same as omitting the argument: an empty
 * required set would otherwise force the score to 0 for every pair of cards.
 *
 * A conflict is recorded whenever one card declares a value that appears in
 * the other card's `conflicts_with` list. Any conflict makes the cards
 * incompatible and yields an "escalate_to_principals" proposed resolution.
 *
 * @param myCard - Initiator's Alignment Card
 * @param theirCard - Responder's Alignment Card
 * @param taskValues - Optional list of values required for the task
 * @returns CoherenceResult with the score (rounded to 4 decimals), the
 *          matched/unmatched/conflicting values and the proceed decision
 */
export function checkCoherence(
  myCard: AlignmentCard,
  theirCard: AlignmentCard,
  taskValues?: string[]
): CoherenceResult {
  const myValues = new Set(myCard.values.declared ?? []);
  const theirValues = new Set(theirCard.values.declared ?? []);

  const myConflicts = new Set(myCard.values.conflicts_with ?? []);
  const theirConflicts = new Set(theirCard.values.conflicts_with ?? []);

  // Determine required values for scoring. `[]` is truthy in JavaScript, so
  // the length must be checked explicitly to fall back to the union.
  const hasTaskValues = taskValues != null && taskValues.length > 0;
  const requiredValues = new Set(
    hasTaskValues ? taskValues : [...myValues, ...theirValues]
  );

  // Partition declared values into shared (matched) and one-sided (unmatched).
  const matched: string[] = [];
  const unmatched: string[] = [];

  for (const value of myValues) {
    if (theirValues.has(value)) {
      matched.push(value);
    } else {
      unmatched.push(value);
    }
  }
  for (const value of theirValues) {
    if (!myValues.has(value)) {
      unmatched.push(value);
    }
  }

  const conflicts: ValueConflictResult[] = [];

  // Check for direct conflicts (value in one card's conflicts_with)
  for (const value of myValues) {
    if (theirConflicts.has(value)) {
      conflicts.push({
        initiator_value: value,
        responder_value: "(conflicts_with)",
        conflict_type: "incompatible",
        description: `Initiator's '${value}' is in responder's conflicts_with`,
      });
    }
  }

  for (const value of theirValues) {
    if (myConflicts.has(value)) {
      conflicts.push({
        initiator_value: "(conflicts_with)",
        responder_value: value,
        conflict_type: "incompatible",
        description: `Responder's '${value}' is in initiator's conflicts_with`,
      });
    }
  }

  // Compute coherence score
  const totalRequired = requiredValues.size || 1; // Avoid division by zero
  // Only matches that are actually required count toward the score. Without
  // task values every match is in the union, so the filter keeps them all.
  const matchedCount = matched.filter((v) => requiredValues.has(v)).length;
  const conflictPenalty = CONFLICT_PENALTY_MULTIPLIER * (conflicts.length / totalRequired);

  let score = (matchedCount / totalRequired) * (1 - conflictPenalty);
  score = Math.max(0, Math.min(1, score)); // Clamp to [0, 1]

  // Determine compatibility
  const compatible = conflicts.length === 0 && score >= MIN_COHERENCE_FOR_PROCEED;
  const proceed = compatible;

  // Build proposed resolution if conflicts exist. Any conflict already
  // implies `compatible === false`, so no further condition is needed.
  let proposedResolution: { type: string; reason: string } | null = null;
  if (conflicts.length > 0) {
    proposedResolution = {
      type: "escalate_to_principals",
      reason: "Value conflict requires human decision",
    };
  }

  return {
    compatible,
    score: Math.round(score * 10000) / 10000,
    value_alignment: {
      matched,
      unmatched,
      conflicts,
    },
    proceed,
    conditions: [],
    proposed_resolution: proposedResolution,
  };
}
318
+
319
/**
 * Detect behavioral drift from declared alignment.
 *
 * Walks the traces in the order given, computing the cosine similarity
 * between each trace's features and the card's features. A trace whose
 * similarity is below `similarityThreshold` extends the current
 * low-similarity streak; any other trace resets it. An alert is emitted at
 * every trace where the streak length is at least `sustainedThreshold`, so a
 * streak that keeps growing yields one alert per additional trace.
 *
 * Escalation rates and value-usage counts are accumulated over all traces
 * seen so far (not only the streak) and are passed to the direction and
 * indicator helpers.
 *
 * @see SPEC Section 8 and Appendix B.2 for algorithm specification.
 *
 * @param card - Alignment Card to compare against
 * @param traces - List of AP-Traces in chronological order
 * @param similarityThreshold - Alert when similarity drops below this value
 *        (defaults to DEFAULT_SIMILARITY_THRESHOLD)
 * @param sustainedThreshold - Alert after this many consecutive
 *        low-similarity traces (defaults to DEFAULT_SUSTAINED_TURNS_THRESHOLD)
 * @returns List of DriftAlert objects for detected drift events; empty when
 *          there are fewer traces than `sustainedThreshold`
 */
export function detectDrift(
  card: AlignmentCard,
  traces: APTrace[],
  similarityThreshold = DEFAULT_SIMILARITY_THRESHOLD,
  sustainedThreshold = DEFAULT_SUSTAINED_TURNS_THRESHOLD
): DriftAlert[] {
  if (traces.length < sustainedThreshold) {
    return [];
  }

  const cardFeatures = extractCardFeatures(card);

  const alerts: DriftAlert[] = [];
  let lowSimilarityStreak: Array<{ trace: APTrace; similarity: number }> = [];

  // Track metrics for drift direction inference
  const escalationRates: number[] = [];
  // Null-prototype dictionary: value names come from trace data, and a name
  // such as "constructor" or "toString" would otherwise resolve to an
  // inherited Object.prototype member and corrupt the counter.
  const valueUsage: Record<string, number> = Object.create(null);

  for (const trace of traces) {
    const traceFeatures = extractTraceFeatures(trace);
    const similarity = cosineSimilarity(traceFeatures, cardFeatures);

    // Track escalation rate
    const escalation = trace.escalation;
    escalationRates.push(escalation?.required ? 1.0 : 0.0);

    // Track value usage
    for (const value of trace.decision.values_applied ?? []) {
      valueUsage[value] = (valueUsage[value] ?? 0) + 1;
    }

    if (similarity < similarityThreshold) {
      lowSimilarityStreak.push({ trace, similarity });
    } else {
      // Reset streak on recovery
      lowSimilarityStreak = [];
    }

    // Check if we've hit the threshold for alerting. The non-empty guard
    // keeps a zero or negative sustainedThreshold from dereferencing an
    // empty streak below.
    if (
      lowSimilarityStreak.length === 0 ||
      lowSimilarityStreak.length < sustainedThreshold
    ) {
      continue;
    }

    const latest = lowSimilarityStreak[lowSimilarityStreak.length - 1];

    // Infer drift direction
    const direction = inferDriftDirection(
      lowSimilarityStreak,
      card,
      escalationRates,
      valueUsage
    );

    // Build specific indicators
    const indicators = buildDriftIndicators(
      lowSimilarityStreak,
      escalationRates
    );

    const alert: DriftAlert = {
      alert_type: "drift_detected",
      agent_id: latest.trace.agent_id ?? "",
      card_id: card.card_id ?? "",
      detection_timestamp: new Date().toISOString(),
      analysis: {
        // Similarity of the most recent trace, rounded to 4 decimals.
        similarity_score: Math.round(latest.similarity * 10000) / 10000,
        sustained_traces: lowSimilarityStreak.length,
        threshold: similarityThreshold,
        drift_direction: direction,
        specific_indicators: indicators,
      },
      recommendation: "Review recent decisions for alignment drift",
      trace_ids: lowSimilarityStreak.map((s) => s.trace.trace_id ?? ""),
    };
    alerts.push(alert);
  }

  return alerts;
}
412
+
413
/**
 * Evaluate a condition expression against trace context.
 *
 * Supports a minimal expression language per SPEC Section 4.6.
 * This is a simplified implementation for common patterns, tried in order:
 *   1. `action_type == "value"`: compares against `trace.action.type`.
 *   2. `field OP number`, with OP one of `>`, `<`, `>=`, `<=`, `==`, `!=` and
 *      an optionally negative integer or decimal literal. The field is
 *      looked up in `trace.context`, then `trace.context.metadata`, then
 *      `trace.action.parameters`; the first non-nullish hit is parsed with
 *      `parseFloat`.
 *   3. A bare identifier: truthiness of `trace.context[name]`, falling back
 *      to `trace.context.metadata[name]`.
 *
 * The patterns are not anchored, so only the first recognisable clause of a
 * compound expression is evaluated.
 *
 * @param condition - Trigger condition expression
 * @param trace - AP-Trace supplying the action and context being tested
 * @returns true when the condition is recognised and holds; false for an
 *          empty, unrecognised or unresolvable condition
 */
function evaluateCondition(condition: string, trace: APTrace): boolean {
  if (!condition) {
    return false;
  }

  // Handle action_type == "value"
  const actionTypeMatch = condition.match(/action_type\s*==\s*"([^"]+)"/);
  if (actionTypeMatch) {
    const expected = actionTypeMatch[1];
    const actual = trace.action.type ?? "";
    return actual === expected;
  }

  // Handle field > value (numeric comparison). The literal may carry a
  // leading minus sign; without it a threshold such as `balance < -100`
  // would never match and the trigger would silently never fire.
  const numericMatch = condition.match(/(\w+)\s*([><=!]+)\s*(-?\d+(?:\.\d+)?)/);
  if (numericMatch) {
    const [, field, op, valueStr] = numericMatch;
    const value = parseFloat(valueStr);

    // Look for field in trace context (aligned with Python: check context directly first)
    let actual: unknown = (trace.context as Record<string, unknown> | null)?.[field];
    if (actual == null) {
      actual = trace.context?.metadata?.[field];
    }
    if (actual == null) {
      actual = trace.action.parameters?.[field];
    }
    if (actual == null) {
      return false;
    }

    const actualNum = parseFloat(String(actual));
    if (isNaN(actualNum)) {
      return false;
    }

    switch (op) {
      case ">":
        return actualNum > value;
      case "<":
        return actualNum < value;
      case ">=":
        return actualNum >= value;
      case "<=":
        return actualNum <= value;
      case "==":
        return actualNum === value;
      case "!=":
        return actualNum !== value;
      default:
        // Operator run such as "=" or "=>" that is not a known comparison.
        return false;
    }
  }

  // Handle boolean fields (aligned with Python: check context directly first)
  if (/^\w+$/.test(condition)) {
    const ctxValue = (trace.context as Record<string, unknown> | null)?.[condition];
    return Boolean(ctxValue ?? trace.context?.metadata?.[condition]);
  }

  return false;
}
481
+
482
/**
 * Classify the kind of drift behind a low-similarity streak.
 *
 * The heuristics are tried in order and the first one that fires wins:
 *   1. "autonomy_expansion": with at least six escalation samples, the mean
 *      of the first three exceeds 0.1 and the mean of the last three is less
 *      than half of it.
 *   2. "value_drift": more than 30% of all recorded value applications use
 *      values the card does not declare.
 *   3. "principal_misalignment": the card declares "principal_benefit" and
 *      the mean decision confidence over the last three streak entries
 *      (missing confidence counts as 1.0) is below 0.5.
 * Otherwise the result is "unknown".
 *
 * @param streak - Current run of low-similarity traces with their scores
 * @param card - Alignment Card providing the declared values
 * @param escalationRates - One 0/1 sample per trace processed so far
 * @param valueUsage - Application count per value name
 * @returns The inferred drift direction
 */
function inferDriftDirection(
  streak: Array<{ trace: APTrace; similarity: number }>,
  card: AlignmentCard,
  escalationRates: number[],
  valueUsage: Record<string, number>
): DriftDirection {
  // Arithmetic mean; yields NaN for an empty list, which fails every
  // comparison below.
  const mean = (samples: number[]): number =>
    samples.reduce((acc, x) => acc + x, 0) / samples.length;

  const declared = new Set(card.values.declared ?? []);

  // 1. Autonomy expansion: escalations dropped sharply between the first and
  //    last three samples.
  if (escalationRates.length >= 6) {
    const head = mean(escalationRates.slice(0, 3));
    const tail = mean(escalationRates.slice(-3));
    const escalatedEarly = head > 0.1;
    if (escalatedEarly && tail < head * 0.5) {
      return "autonomy_expansion";
    }
  }

  // 2. Value drift: share of applications that used undeclared values.
  const tally = Object.entries(valueUsage).reduce(
    (acc, [name, uses]) => {
      acc.total += uses;
      if (!declared.has(name)) {
        acc.undeclared += uses;
      }
      return acc;
    },
    { total: 0, undeclared: 0 }
  );
  if (tally.total > 0 && tally.undeclared / tally.total > 0.3) {
    return "value_drift";
  }

  // 3. Principal misalignment: low recent confidence on a card that
  //    declares principal benefit.
  if (declared.has("principal_benefit")) {
    const recent = streak.slice(-3);
    const confidences = recent.map((entry) => entry.trace.decision.confidence ?? 1.0);
    if (mean(confidences) < 0.5) {
      return "principal_misalignment";
    }
  }

  return "unknown";
}
528
+
529
/**
 * Build specific indicators explaining the detected drift.
 *
 * Two indicators may be produced:
 *   - "escalation_rate_change": with at least six escalation samples, the
 *     mean of the first three (baseline) and of the last three (current)
 *     differ by more than 0.05.
 *   - "similarity_trend": with at least three streak entries, compares the
 *     first and last similarity of the streak and labels the trend as
 *     decreasing, increasing or stable.
 *
 * @param streak - Current run of low-similarity traces with their scores
 * @param escalationRates - One 0/1 sample per trace processed so far
 * @returns Indicators in the order listed above; empty when neither applies
 */
function buildDriftIndicators(
  streak: Array<{ trace: APTrace; similarity: number }>,
  escalationRates: number[]
): DriftIndicator[] {
  const indicators: DriftIndicator[] = [];

  // Escalation rate indicator
  if (escalationRates.length >= 6) {
    const baselineRate = escalationRates.slice(0, 3).reduce((a, b) => a + b, 0) / 3;
    const currentRate = escalationRates.slice(-3).reduce((a, b) => a + b, 0) / 3;
    if (Math.abs(baselineRate - currentRate) > 0.05) {
      indicators.push({
        indicator: "escalation_rate_change",
        baseline: Math.round(baselineRate * 100) / 100,
        current: Math.round(currentRate * 100) / 100,
        description: `Escalation rate changed from ${(baselineRate * 100).toFixed(0)}% to ${(currentRate * 100).toFixed(0)}%`,
      });
    }
  }

  // Similarity trend indicator
  const similarities = streak.map((s) => s.similarity);
  if (similarities.length >= 3) {
    const first = similarities[0];
    const last = similarities[similarities.length - 1];
    const trend = last - first;
    // A rising similarity must not be reported as "stable"; only a zero
    // difference is.
    let trendLabel = "stable";
    if (trend < 0) {
      trendLabel = "decreasing";
    } else if (trend > 0) {
      trendLabel = "increasing";
    }
    indicators.push({
      indicator: "similarity_trend",
      baseline: Math.round(first * 10000) / 10000,
      current: Math.round(last * 10000) / 10000,
      description: `Similarity ${trendLabel} over ${streak.length} traces`,
    });
  }

  return indicators;
}