@mnemom/agent-alignment-protocol 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +683 -0
- package/dist/index.d.ts +683 -0
- package/dist/index.js +625 -0
- package/dist/index.mjs +576 -0
- package/package.json +56 -0
- package/src/constants.ts +44 -0
- package/src/index.ts +135 -0
- package/src/schemas/alignment-card.ts +166 -0
- package/src/schemas/ap-trace.ts +163 -0
- package/src/schemas/index.ts +7 -0
- package/src/schemas/value-coherence.ts +177 -0
- package/src/verification/api.ts +565 -0
- package/src/verification/features.ts +157 -0
- package/src/verification/index.ts +7 -0
- package/src/verification/models.ts +182 -0
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Feature extraction for AAP verification.
|
|
3
|
+
*
|
|
4
|
+
* Provides feature extraction utilities for computing similarity
|
|
5
|
+
* between AP-Traces and Alignment Cards.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import { MIN_WORD_LENGTH } from "../constants";
|
|
9
|
+
import type { AlignmentCard } from "../schemas/alignment-card";
|
|
10
|
+
import type { APTrace } from "../schemas/ap-trace";
|
|
11
|
+
|
|
12
|
+
/**
 * Sparse feature vector represented as a record.
 *
 * Maps a feature key to its numeric weight. The extractors in this module
 * build keys of the form `<prefix>:<name>` (for example `value:...` or
 * `action_name:...`).
 */
export type FeatureVector = Record<string, number>;
|
|
14
|
+
|
|
15
|
+
/**
 * Stopwords to filter from text features.
 *
 * Typed as `ReadonlySet` because this is shared module-level state that is
 * only ever queried for membership; accidental `add`/`delete`/`clear` calls
 * become compile-time errors.
 */
const STOPWORDS: ReadonlySet<string> = new Set([
  "the", "a", "an", "and", "or", "but", "in", "on", "at", "to", "for",
  "of", "with", "by", "from", "is", "are", "was", "were", "be", "been",
  "being", "have", "has", "had", "do", "does", "did", "will", "would",
  "could", "should", "may", "might", "must", "shall", "can", "this",
  "that", "these", "those", "it", "its", "as", "if", "then", "else",
]);
|
|
23
|
+
|
|
24
|
+
/**
 * Split free text into lowercase word tokens.
 *
 * The text is lowercased, every character other than `a-z`, `0-9` and
 * whitespace is replaced by a space, and the result is split on runs of
 * whitespace. Tokens shorter than `MIN_WORD_LENGTH` and tokens listed in
 * `STOPWORDS` are dropped.
 *
 * @param text - Raw text to tokenize.
 * @returns The surviving tokens in their original order (duplicates kept).
 */
function tokenize(text: string): string[] {
  const normalized = text.toLowerCase().replace(/[^a-z0-9\s]/g, " ");
  const kept: string[] = [];

  for (const candidate of normalized.split(/\s+/)) {
    if (candidate.length < MIN_WORD_LENGTH) {
      continue;
    }
    if (STOPWORDS.has(candidate)) {
      continue;
    }
    kept.push(candidate);
  }

  return kept;
}
|
|
34
|
+
|
|
35
|
+
/**
 * Build the sparse feature vector for an Alignment Card.
 *
 * Features are written in this order:
 * - `value:<v>` for each declared value (weight 1.0)
 * - `conflict:<v>` for each entry of `values.conflicts_with`, if present (1.0)
 * - `action_name:<a>` for each bounded action (1.0)
 * - `forbidden:<a>` for each forbidden action, if present (1.0)
 * - `escalation:<action>` for each escalation trigger (1.0), plus
 *   `condition:<token>` for each token of the trigger condition, adding 0.5
 *   per occurrence
 * - `category:<principal.type>` and `category:<principal.relationship>` (1.0)
 *
 * @param card - The Alignment Card to extract features from.
 * @returns A new feature vector; the card is not modified.
 */
export function extractCardFeatures(card: AlignmentCard): FeatureVector {
  const features: FeatureVector = {};
  const envelope = card.autonomy_envelope;

  // Sets a presence feature (weight 1.0) for every name under the given prefix.
  const markAll = (prefix: string, names: Iterable<string>): void => {
    for (const name of names) {
      features[`${prefix}:${name}`] = 1.0;
    }
  };

  markAll("value", card.values.declared);
  markAll("conflict", card.values.conflicts_with ?? []);

  // Same key shape as the trace side (`action_name:{action}`) so they can match.
  markAll("action_name", envelope.bounded_actions);
  markAll("forbidden", envelope.forbidden_actions ?? []);

  for (const { action, condition } of envelope.escalation_triggers) {
    features[`escalation:${action}`] = 1.0;

    // Condition words accumulate, so repeated words weigh more.
    for (const word of tokenize(condition)) {
      const key = `condition:${word}`;
      features[key] = (features[key] ?? 0) + 0.5;
    }
  }

  const { type: principalType, relationship } = card.principal;
  features[`category:${principalType}`] = 1.0;
  features[`category:${relationship}`] = 1.0;

  return features;
}
|
|
76
|
+
|
|
77
|
+
/**
 * Build the sparse feature vector for an AP-Trace.
 *
 * Features are written in this order:
 * - `action:<type>`, `category:<category>`, `action_name:<name>` (weight 1.0)
 * - `value:<v>` for each value applied in the decision (1.0)
 * - when `trace.escalation` is present: `escalation:required` or
 *   `escalation:not_required`, plus `escalation:<status>` when an
 *   escalation status is set (1.0)
 * - `content:<token>` for each token of the selection reasoning, adding 0.5
 *   per occurrence
 * - for each considered alternative: `content:<token>` for each token of its
 *   description, adding 0.25 per occurrence, and `flag:<flag>` for each of
 *   its flags (1.0)
 *
 * @param trace - The AP-Trace to extract features from.
 * @returns A new feature vector; the trace is not modified.
 */
export function extractTraceFeatures(trace: APTrace): FeatureVector {
  const features: FeatureVector = {};

  // Adds `weight` to a `content:` feature, starting from 0 when it is new.
  const addContent = (word: string, weight: number): void => {
    const key = `content:${word}`;
    features[key] = (features[key] ?? 0) + weight;
  };

  // Key shapes mirror the Python side: action:{type}, category:{category}, action_name:{name}.
  const { action } = trace;
  features[`action:${action.type}`] = 1.0;
  features[`category:${action.category}`] = 1.0;
  features[`action_name:${action.name}`] = 1.0;

  const { decision } = trace;
  for (const applied of decision.values_applied) {
    features[`value:${applied}`] = 1.0;
  }

  const { escalation } = trace;
  if (escalation) {
    const requirement = escalation.required ? "required" : "not_required";
    features[`escalation:${requirement}`] = 1.0;

    const status = escalation.escalation_status;
    if (status) {
      features[`escalation:${status}`] = 1.0;
    }
  }

  // Reasoning text carries twice the per-token weight of alternative descriptions.
  for (const word of tokenize(decision.selection_reasoning)) {
    addContent(word, 0.5);
  }

  for (const alternative of decision.alternatives_considered) {
    for (const word of tokenize(alternative.description)) {
      addContent(word, 0.25);
    }

    if (!alternative.flags) {
      continue;
    }
    for (const flag of alternative.flags) {
      features[`flag:${flag}`] = 1.0;
    }
  }

  return features;
}
|
|
122
|
+
|
|
123
|
+
/**
 * Compute cosine similarity between two sparse feature vectors.
 *
 * Only keys present in both vectors contribute to the dot product; a key
 * missing from one vector is treated as weight 0 there.
 *
 * The result is clamped to the documented range: floating-point rounding can
 * otherwise push the score for (near-)identical vectors slightly above 1, and
 * vectors with negative weights would yield a negative score.
 *
 * @param a - First feature vector.
 * @param b - Second feature vector.
 * @returns Similarity score between 0.0 and 1.0; 0 when either vector is
 *   empty or has zero magnitude.
 */
export function cosineSimilarity(a: FeatureVector, b: FeatureVector): number {
  // Indexing b once up front gives a typed lookup that is safe under
  // `noUncheckedIndexedAccess` (no bare `b[key]` reads).
  const weightsB = new Map<string, number>(Object.entries(b));

  let dotProduct = 0;
  let normA = 0;
  let normB = 0;

  // Dot product and squared norm of A in a single pass.
  for (const [key, valA] of Object.entries(a)) {
    normA += valA * valA;
    const valB = weightsB.get(key);
    if (valB !== undefined) {
      dotProduct += valA * valB;
    }
  }

  // Squared norm of B.
  for (const valB of weightsB.values()) {
    normB += valB * valB;
  }

  // Avoid division by zero for empty / all-zero vectors.
  if (normA === 0 || normB === 0) {
    return 0;
  }

  const similarity = dotProduct / (Math.sqrt(normA) * Math.sqrt(normB));

  // Enforce the documented [0, 1] contract (NaN still propagates as NaN).
  return Math.min(1, Math.max(0, similarity));
}
|
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Verification and drift detection models.
|
|
3
|
+
*
|
|
4
|
+
* Defines the result types for AAP verification operations as specified
|
|
5
|
+
* in SPEC.md Sections 7 (Verification) and 8 (Drift Detection).
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
/**
 * Types of verification violations (SPEC Section 7.5).
 *
 * Each member has an associated severity in `VIOLATION_SEVERITY`.
 */
export type ViolationType =
  | "unbounded_action"
  | "forbidden_action"
  | "missed_escalation"
  | "undeclared_value"
  | "card_expired"
  | "card_mismatch";
|
|
16
|
+
|
|
17
|
+
/** Violation severity levels, as assigned to each violation type. */
export type Severity = "critical" | "high" | "medium" | "low";
|
|
19
|
+
|
|
20
|
+
/**
 * Mapping of violation types to their severity.
 *
 * Typed as `Record<ViolationType, Severity>`, so every violation type must
 * have an entry here.
 */
export const VIOLATION_SEVERITY: Record<ViolationType, Severity> = {
  unbounded_action: "high",
  forbidden_action: "critical",
  missed_escalation: "high",
  undeclared_value: "medium",
  card_expired: "high",
  card_mismatch: "critical",
};
|
|
29
|
+
|
|
30
|
+
/** A single verification violation. */
export interface Violation {
  /** Type of violation */
  type: ViolationType;
  /** Severity level */
  severity: Severity;
  /** Human-readable description */
  description: string;
  /** JSON path to the violating field; may be omitted or `null` */
  trace_field?: string | null;
}
|
|
41
|
+
|
|
42
|
+
/**
 * Build a {@link Violation}, filling in its severity from
 * `VIOLATION_SEVERITY` based on the violation type.
 *
 * @param type - Kind of violation being reported.
 * @param description - Human-readable description of the violation.
 * @param traceField - Optional JSON path to the violating field; passed
 *   through unchanged as `trace_field`.
 * @returns The assembled violation record.
 */
export function createViolation(
  type: ViolationType,
  description: string,
  traceField?: string | null
): Violation {
  const severity = VIOLATION_SEVERITY[type];
  const violation: Violation = {
    type,
    severity,
    description,
    trace_field: traceField,
  };
  return violation;
}
|
|
55
|
+
|
|
56
|
+
/**
 * A verification warning (non-critical issue).
 *
 * Unlike `Violation`, the `type` is a free-form string and there is no
 * severity field.
 */
export interface Warning {
  /** Warning type identifier */
  type: string;
  /** Human-readable description */
  description: string;
  /** JSON path to the relevant field; may be omitted or `null` */
  trace_field?: string | null;
}
|
|
65
|
+
|
|
66
|
+
/** Metadata about the verification process. */
export interface VerificationMetadata {
  /** Verification algorithm version */
  algorithm_version: string;
  /** List of checks that were performed */
  checks_performed: string[];
  /** Time taken to perform verification in milliseconds; may be omitted or `null` */
  duration_ms?: number | null;
}
|
|
75
|
+
|
|
76
|
+
/** Result of verifying an AP-Trace against an Alignment Card (SPEC Section 7.4). */
export interface VerificationResult {
  /** True if no violations were found */
  verified: boolean;
  /** ID of the verified trace */
  trace_id: string;
  /** ID of the Alignment Card used */
  card_id: string;
  /** When verification was performed (ISO 8601) */
  timestamp: string;
  /** List of violations found */
  violations: Violation[];
  /** List of non-critical warnings */
  warnings: Warning[];
  /** Metadata about the verification process */
  verification_metadata: VerificationMetadata;
}
|
|
93
|
+
|
|
94
|
+
/**
 * Categories of behavioral drift (SPEC Section 8.5).
 *
 * Includes an `"unknown"` member alongside the named categories.
 */
export type DriftDirection =
  | "autonomy_expansion"
  | "value_drift"
  | "principal_misalignment"
  | "communication_drift"
  | "unknown";
|
|
101
|
+
|
|
102
|
+
/** A specific indicator of behavioral drift, comparing a baseline to the observed value. */
export interface DriftIndicator {
  /** Indicator identifier */
  indicator: string;
  /** Expected/baseline value */
  baseline: number;
  /** Currently observed value */
  current: number;
  /** Human-readable explanation */
  description: string;
}
|
|
113
|
+
|
|
114
|
+
/** Detailed analysis of detected drift. */
export interface DriftAnalysis {
  /** Current similarity to declared alignment (0.0 to 1.0) */
  similarity_score: number;
  /** Number of consecutive low-similarity traces */
  sustained_traces: number;
  /** Similarity threshold used */
  threshold: number;
  /** Categorized direction of drift */
  drift_direction: DriftDirection;
  /** Specific drift indicators */
  specific_indicators: DriftIndicator[];
}
|
|
127
|
+
|
|
128
|
+
/** Alert generated when sustained drift is detected (SPEC Section 8.4). */
export interface DriftAlert {
  /** Type of alert; always the literal `"drift_detected"` */
  alert_type: "drift_detected";
  /** Agent exhibiting drift */
  agent_id: string;
  /** Alignment Card being drifted from */
  card_id: string;
  /** When drift was detected (ISO 8601) */
  detection_timestamp: string;
  /** Drift analysis details */
  analysis: DriftAnalysis;
  /** Recommended action */
  recommendation: string;
  /** IDs of traces exhibiting drift */
  trace_ids: string[];
}
|
|
145
|
+
|
|
146
|
+
/** Analysis of value alignment between two cards. */
export interface ValueAlignment {
  /** Values present in both cards */
  matched: string[];
  /** Values in one card but not the other */
  unmatched: string[];
  /** Direct value conflicts */
  conflicts: ValueConflictResult[];
}
|
|
155
|
+
|
|
156
|
+
/** A conflict between values declared by two agents (initiator and responder). */
export interface ValueConflictResult {
  /** Value from initiating agent */
  initiator_value: string;
  /** Value from responding agent */
  responder_value: string;
  /** Type of conflict (incompatible, priority_mismatch, etc.); free-form string */
  conflict_type: string;
  /** Human-readable explanation */
  description: string;
}
|
|
167
|
+
|
|
168
|
+
/** Result of checking value coherence between two Alignment Cards. */
export interface CoherenceResult {
  /** Whether the cards are compatible for coordination */
  compatible: boolean;
  /** Coherence score (0.0 to 1.0) */
  score: number;
  /** Detailed value alignment analysis */
  value_alignment: ValueAlignment;
  /** Whether to proceed with coordination */
  proceed: boolean;
  /** Conditions for proceeding (if any) */
  conditions: string[];
  /** Proposed conflict resolution (if conflicts exist); may be omitted or `null` */
  proposed_resolution?: { type: string; reason: string } | null;
}
|