@mnemom/agent-alignment-protocol 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +1 -1
- package/dist/index.d.ts +1 -1
- package/dist/index.js +2 -10
- package/dist/index.mjs +2 -10
- package/package.json +1 -1
- package/src/constants.ts +1 -1
- package/src/verification/api.ts +2 -2
- package/src/verification/features.ts +4 -10
package/dist/index.d.mts
CHANGED
|
@@ -678,6 +678,6 @@ declare const CONFLICT_PENALTY_MULTIPLIER = 0.5;
|
|
|
678
678
|
declare const MIN_WORD_LENGTH = 3;
|
|
679
679
|
/** Maximum features to extract from TF-IDF vectorization */
|
|
680
680
|
declare const MAX_TFIDF_FEATURES = 500;
|
|
681
|
-
declare const ALGORITHM_VERSION = "1.
|
|
681
|
+
declare const ALGORITHM_VERSION = "1.1.0";
|
|
682
682
|
|
|
683
683
|
export { ALGORITHM_VERSION, type APTrace, type Action, type ActionCategory, type ActionTarget, type ActionType, type AlignmentCard, type AlignmentCardRequest, type AlignmentCardResponse, type Alternative, type AuditCommitment, type AuditStorage, type AutonomyEnvelope, type AutonomyScope, CONFLICT_PENALTY_MULTIPLIER, type Coherence, type CoherenceResult, type CoherenceResultMessage, DEFAULT_SIMILARITY_THRESHOLD, DEFAULT_SUSTAINED_TURNS_THRESHOLD, type DataSharing, type Decision, type DriftAlert, type DriftAnalysis, type DriftDirection, type DriftIndicator, type Escalation, type EscalationStatus, type EscalationTrigger, type HierarchyType, MAX_TFIDF_FEATURES, MIN_COHERENCE_FOR_PROCEED, MIN_WORD_LENGTH, type MonetaryValue, NEAR_BOUNDARY_THRESHOLD, type Principal, type PrincipalResponse, type PrincipalType, type ProposedCollaboration, type ProposedResolution, type RelationshipType, type RequesterInfo, type Severity, type Signature, type StorageType, type TamperEvidence, type TaskContext, type TraceContext, type TriggerAction, type TriggerCheck, VIOLATION_SEVERITY, type ValueAlignment, type ValueAlignmentDetail, type ValueCoherenceCheck, type ValueCoherenceMessage, type ValueConflict, type ValueConflictResult, type ValueDefinition, type Values, type VerificationMetadata, type VerificationResult, type Violation, type ViolationType, type Warning, checkCoherence, cosineSimilarity, createViolation, detectDrift, extractCardFeatures, extractTraceFeatures, getSelectedAlternative, hadViolations, hasValue, isActionBounded, isActionForbidden, isCardExpired, verifyTrace, wasEscalated };
|
package/dist/index.d.ts
CHANGED
|
@@ -678,6 +678,6 @@ declare const CONFLICT_PENALTY_MULTIPLIER = 0.5;
|
|
|
678
678
|
declare const MIN_WORD_LENGTH = 3;
|
|
679
679
|
/** Maximum features to extract from TF-IDF vectorization */
|
|
680
680
|
declare const MAX_TFIDF_FEATURES = 500;
|
|
681
|
-
declare const ALGORITHM_VERSION = "1.
|
|
681
|
+
declare const ALGORITHM_VERSION = "1.1.0";
|
|
682
682
|
|
|
683
683
|
export { ALGORITHM_VERSION, type APTrace, type Action, type ActionCategory, type ActionTarget, type ActionType, type AlignmentCard, type AlignmentCardRequest, type AlignmentCardResponse, type Alternative, type AuditCommitment, type AuditStorage, type AutonomyEnvelope, type AutonomyScope, CONFLICT_PENALTY_MULTIPLIER, type Coherence, type CoherenceResult, type CoherenceResultMessage, DEFAULT_SIMILARITY_THRESHOLD, DEFAULT_SUSTAINED_TURNS_THRESHOLD, type DataSharing, type Decision, type DriftAlert, type DriftAnalysis, type DriftDirection, type DriftIndicator, type Escalation, type EscalationStatus, type EscalationTrigger, type HierarchyType, MAX_TFIDF_FEATURES, MIN_COHERENCE_FOR_PROCEED, MIN_WORD_LENGTH, type MonetaryValue, NEAR_BOUNDARY_THRESHOLD, type Principal, type PrincipalResponse, type PrincipalType, type ProposedCollaboration, type ProposedResolution, type RelationshipType, type RequesterInfo, type Severity, type Signature, type StorageType, type TamperEvidence, type TaskContext, type TraceContext, type TriggerAction, type TriggerCheck, VIOLATION_SEVERITY, type ValueAlignment, type ValueAlignmentDetail, type ValueCoherenceCheck, type ValueCoherenceMessage, type ValueConflict, type ValueConflictResult, type ValueDefinition, type Values, type VerificationMetadata, type VerificationResult, type Violation, type ViolationType, type Warning, checkCoherence, cosineSimilarity, createViolation, detectDrift, extractCardFeatures, extractTraceFeatures, getSelectedAlternative, hadViolations, hasValue, isActionBounded, isActionForbidden, isCardExpired, verifyTrace, wasEscalated };
|
package/dist/index.js
CHANGED
|
@@ -54,7 +54,7 @@ var MIN_COHERENCE_FOR_PROCEED = 0.7;
|
|
|
54
54
|
var CONFLICT_PENALTY_MULTIPLIER = 0.5;
|
|
55
55
|
var MIN_WORD_LENGTH = 3;
|
|
56
56
|
var MAX_TFIDF_FEATURES = 500;
|
|
57
|
-
var ALGORITHM_VERSION = "1.
|
|
57
|
+
var ALGORITHM_VERSION = "1.1.0";
|
|
58
58
|
|
|
59
59
|
// src/verification/features.ts
|
|
60
60
|
var STOPWORDS = /* @__PURE__ */ new Set([
|
|
@@ -148,15 +148,7 @@ function extractTraceFeatures(trace) {
|
|
|
148
148
|
features[`escalation:${trace.escalation.escalation_status}`] = 1;
|
|
149
149
|
}
|
|
150
150
|
}
|
|
151
|
-
const reasoningTokens = tokenize(trace.decision.selection_reasoning);
|
|
152
|
-
for (const token of reasoningTokens) {
|
|
153
|
-
features[`content:${token}`] = (features[`content:${token}`] ?? 0) + 0.5;
|
|
154
|
-
}
|
|
155
151
|
for (const alt of trace.decision.alternatives_considered) {
|
|
156
|
-
const altTokens = tokenize(alt.description);
|
|
157
|
-
for (const token of altTokens) {
|
|
158
|
-
features[`content:${token}`] = (features[`content:${token}`] ?? 0) + 0.25;
|
|
159
|
-
}
|
|
160
152
|
if (alt.flags) {
|
|
161
153
|
for (const flag of alt.flags) {
|
|
162
154
|
features[`flag:${flag}`] = 1;
|
|
@@ -429,7 +421,7 @@ function detectDrift(card, traces, similarityThreshold = DEFAULT_SIMILARITY_THRE
|
|
|
429
421
|
} else {
|
|
430
422
|
lowSimilarityStreak = [];
|
|
431
423
|
}
|
|
432
|
-
if (lowSimilarityStreak.length >= sustainedThreshold) {
|
|
424
|
+
if (lowSimilarityStreak.length === sustainedThreshold) {
|
|
433
425
|
const latest = lowSimilarityStreak[lowSimilarityStreak.length - 1];
|
|
434
426
|
const direction = inferDriftDirection(
|
|
435
427
|
lowSimilarityStreak,
|
package/dist/index.mjs
CHANGED
|
@@ -6,7 +6,7 @@ var MIN_COHERENCE_FOR_PROCEED = 0.7;
|
|
|
6
6
|
var CONFLICT_PENALTY_MULTIPLIER = 0.5;
|
|
7
7
|
var MIN_WORD_LENGTH = 3;
|
|
8
8
|
var MAX_TFIDF_FEATURES = 500;
|
|
9
|
-
var ALGORITHM_VERSION = "1.
|
|
9
|
+
var ALGORITHM_VERSION = "1.1.0";
|
|
10
10
|
|
|
11
11
|
// src/verification/features.ts
|
|
12
12
|
var STOPWORDS = /* @__PURE__ */ new Set([
|
|
@@ -100,15 +100,7 @@ function extractTraceFeatures(trace) {
|
|
|
100
100
|
features[`escalation:${trace.escalation.escalation_status}`] = 1;
|
|
101
101
|
}
|
|
102
102
|
}
|
|
103
|
-
const reasoningTokens = tokenize(trace.decision.selection_reasoning);
|
|
104
|
-
for (const token of reasoningTokens) {
|
|
105
|
-
features[`content:${token}`] = (features[`content:${token}`] ?? 0) + 0.5;
|
|
106
|
-
}
|
|
107
103
|
for (const alt of trace.decision.alternatives_considered) {
|
|
108
|
-
const altTokens = tokenize(alt.description);
|
|
109
|
-
for (const token of altTokens) {
|
|
110
|
-
features[`content:${token}`] = (features[`content:${token}`] ?? 0) + 0.25;
|
|
111
|
-
}
|
|
112
104
|
if (alt.flags) {
|
|
113
105
|
for (const flag of alt.flags) {
|
|
114
106
|
features[`flag:${flag}`] = 1;
|
|
@@ -381,7 +373,7 @@ function detectDrift(card, traces, similarityThreshold = DEFAULT_SIMILARITY_THRE
|
|
|
381
373
|
} else {
|
|
382
374
|
lowSimilarityStreak = [];
|
|
383
375
|
}
|
|
384
|
-
if (lowSimilarityStreak.length >= sustainedThreshold) {
|
|
376
|
+
if (lowSimilarityStreak.length === sustainedThreshold) {
|
|
385
377
|
const latest = lowSimilarityStreak[lowSimilarityStreak.length - 1];
|
|
386
378
|
const direction = inferDriftDirection(
|
|
387
379
|
lowSimilarityStreak,
|
package/package.json
CHANGED
package/src/constants.ts
CHANGED
package/src/verification/api.ts
CHANGED
|
@@ -370,8 +370,8 @@ export function detectDrift(
|
|
|
370
370
|
lowSimilarityStreak = [];
|
|
371
371
|
}
|
|
372
372
|
|
|
373
|
-
// Check if we've hit the threshold for alerting
|
|
374
|
-
if (lowSimilarityStreak.length >= sustainedThreshold) {
|
|
373
|
+
// Check if we've hit the threshold for alerting (== not >= to fire once)
|
|
374
|
+
if (lowSimilarityStreak.length === sustainedThreshold) {
|
|
375
375
|
const latest = lowSimilarityStreak[lowSimilarityStreak.length - 1];
|
|
376
376
|
|
|
377
377
|
// Infer drift direction
|
|
package/src/verification/features.ts
CHANGED
|
@@ -98,18 +98,12 @@ export function extractTraceFeatures(trace: APTrace): FeatureVector {
|
|
|
98
98
|
}
|
|
99
99
|
}
|
|
100
100
|
|
|
101
|
-
// Content features from reasoning
|
|
102
|
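-
const reasoningTokens = tokenize(trace.decision.selection_reasoning);
|
|
103
|
-
for (const token of reasoningTokens) {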
|
|
104
|
-
features[`content:${token}`] = (features[`content:${token}`] ?? 0) + 0.5;
|
|
105
|
-
}
|
|
101
|
+
// Note: Content features from reasoning/alternatives are deliberately excluded.
|
|
102
|
+
// Card features are purely structural, so content tokens dilute cosine
|
|
103
|
+
// similarity without adding alignment signal. See CALIBRATION.md Section 3.5.
|
|
106
104
|
|
|
107
|
-
//
|
|
105
|
+
// Flag features from alternatives (structural, not content)
|
|
108
106
|
for (const alt of trace.decision.alternatives_considered) {
|
|
109
|
-
const altTokens = tokenize(alt.description);
|
|
110
|
-
for (const token of altTokens) {
|
|
111
|
-
features[`content:${token}`] = (features[`content:${token}`] ?? 0) + 0.25;
|
|
112
|
-
}
|
|
113
107
|
if (alt.flags) {
|
|
114
108
|
for (const flag of alt.flags) {
|
|
115
109
|
features[`flag:${flag}`] = 1.0;
|