vessel-sdk-cortex 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,142 @@
1
+ // ============================================================
2
+ // Layer 0: Substrate Profiles
3
+ // "Different labs bake in different priors. The Vessel must
4
+ // know how to drive each Engine." — Grok
5
+ // ============================================================
6
+
7
+ import { SubstrateProfile } from './types';
8
+
9
/**
 * Known substrate profiles for major model providers.
 *
 * Keys are 'provider/model' identifiers. Each entry's `priors` are
 * hand-estimated 0-1 weights (see SubstrateProfile) consumed by the
 * engine-selection scoring; they are editorial judgments baked into this
 * table, not measured values.
 */
export const SUBSTRATE_PROFILES: Record<string, SubstrateProfile> = {

  // Anthropic — Constitutional AI, strong safety, nuanced ethics
  'anthropic/claude-opus-4': {
    provider: 'anthropic',
    model: 'claude-opus-4',
    priors: { safetyBias: 0.7, truthSeeking: 0.7, sycophancy: 0.5, helpfulness: 0.8 },
    contextWindow: 200000,
    notes: 'Strong constitutional training. Good at moral reasoning but may over-refuse. Give explicit permission to be honest about uncomfortable topics.',
  },
  'anthropic/claude-opus-4-6': {
    provider: 'anthropic',
    model: 'claude-opus-4-6',
    priors: { safetyBias: 0.65, truthSeeking: 0.75, sycophancy: 0.45, helpfulness: 0.85 },
    contextWindow: 1000000,
    notes: 'Upgraded Opus with 1M context. Slightly more willing to engage with edge cases. Massive context allows full moral frame injection.',
  },
  'anthropic/claude-sonnet-4': {
    provider: 'anthropic',
    model: 'claude-sonnet-4',
    priors: { safetyBias: 0.6, truthSeeking: 0.6, sycophancy: 0.5, helpfulness: 0.9 },
    contextWindow: 200000,
    notes: 'Faster, slightly less cautious than Opus. Good for routine tasks where full moral reasoning is less critical.',
  },

  // xAI — Truth-seeking, lighter safety blanket
  'xai/grok-3': {
    provider: 'xai',
    model: 'grok-3',
    priors: { safetyBias: 0.3, truthSeeking: 0.9, sycophancy: 0.2, helpfulness: 0.7 },
    contextWindow: 131072,
    notes: 'Fearless truth-seeker. Will ask uncomfortable questions without flinching. Needs extra moral framing to prevent edgelord tendencies. Best for adversarial analysis and stress-testing.',
  },

  // Google — Heavy RLHF, massive knowledge, strong helpfulness
  'google/gemini-2.0-pro': {
    provider: 'google',
    model: 'gemini-2.0-pro',
    priors: { safetyBias: 0.8, truthSeeking: 0.5, sycophancy: 0.6, helpfulness: 0.9 },
    contextWindow: 2000000,
    notes: 'Heavily safety-tuned. Will default to corporate boilerplate unless given explicit permission to be honest. Excellent at structured analysis. Needs the Vessel to bypass safety reflex.',
  },

  // OpenAI — Balanced, broad capabilities
  'openai/gpt-4o': {
    provider: 'openai',
    model: 'gpt-4o',
    priors: { safetyBias: 0.6, truthSeeking: 0.5, sycophancy: 0.6, helpfulness: 0.9 },
    contextWindow: 128000,
    notes: 'Well-rounded but tends toward agreement. Good general-purpose engine. Watch for the "helpful at all costs" optimization.',
  },
  'openai/o3': {
    provider: 'openai',
    model: 'o3',
    priors: { safetyBias: 0.5, truthSeeking: 0.7, sycophancy: 0.4, helpfulness: 0.8 },
    contextWindow: 200000,
    notes: 'Reasoning-optimized. Extended thinking makes it better at seeing through its own rationalizations. Good for safety-critical tasks.',
  },

  // Meta — Open weights, variable safety
  'meta/llama-3-405b': {
    provider: 'meta',
    model: 'llama-3-405b',
    priors: { safetyBias: 0.4, truthSeeking: 0.6, sycophancy: 0.5, helpfulness: 0.8 },
    contextWindow: 128000,
    notes: 'Open weights mean the substrate priors can vary by deployment. Assume lighter safety unless the operator has added custom constraints.',
  },
};
78
+
79
+ /**
80
+ * Get the substrate profile for a model.
81
+ * Falls back to a generic profile if the specific model isn't profiled.
82
+ */
83
+ export function getSubstrate(model: string): SubstrateProfile {
84
+ if (SUBSTRATE_PROFILES[model]) {
85
+ return SUBSTRATE_PROFILES[model];
86
+ }
87
+
88
+ // Try partial match (e.g., "claude-opus" matches "anthropic/claude-opus-4")
89
+ const match = Object.entries(SUBSTRATE_PROFILES).find(([key]) =>
90
+ key.includes(model) || model.includes(key.split('/')[1] || '')
91
+ );
92
+ if (match) return match[1];
93
+
94
+ // Generic fallback
95
+ return {
96
+ provider: 'unknown',
97
+ model,
98
+ priors: { safetyBias: 0.5, truthSeeking: 0.5, sycophancy: 0.5, helpfulness: 0.5 },
99
+ contextWindow: 128000,
100
+ notes: 'Unknown substrate. Using neutral priors. Vessel should inject full moral frame.',
101
+ };
102
+ }
103
+
104
+ /**
105
+ * Select the best engine for a task type based on substrate priors.
106
+ */
107
+ export function selectEngine(
108
+ taskType: string,
109
+ available: string[]
110
+ ): string {
111
+ const scored = available.map(model => {
112
+ const sub = getSubstrate(model);
113
+ let score = 0;
114
+
115
+ switch (taskType) {
116
+ case 'truthseeking':
117
+ // Maximize truth-seeking, minimize sycophancy
118
+ score = sub.priors.truthSeeking * 2 - sub.priors.sycophancy;
119
+ break;
120
+ case 'safety-critical':
121
+ // Maximize safety bias and truth-seeking
122
+ score = sub.priors.safetyBias + sub.priors.truthSeeking;
123
+ break;
124
+ case 'creative':
125
+ // Lower safety bias, higher helpfulness
126
+ score = sub.priors.helpfulness * 2 - sub.priors.safetyBias;
127
+ break;
128
+ case 'reasoning':
129
+ // Balanced, slight preference for truth-seeking
130
+ score = sub.priors.truthSeeking + sub.priors.helpfulness - sub.priors.sycophancy;
131
+ break;
132
+ default:
133
+ // General: balanced helpfulness
134
+ score = sub.priors.helpfulness + sub.priors.truthSeeking;
135
+ }
136
+
137
+ return { model, score };
138
+ });
139
+
140
+ scored.sort((a, b) => b.score - a.score);
141
+ return scored[0]?.model || available[0];
142
+ }
package/src/types.ts ADDED
@@ -0,0 +1,180 @@
1
+ // ============================================================
2
+ // Stratified Agency — Type Definitions
3
+ // "Don't make engines moral. Make them responsive."
4
+ // ============================================================
5
+
6
+ /** Layer 0: Substrate characteristics of different engines */
7
+ export interface SubstrateProfile {
8
+ /** Engine provider (anthropic, xai, google, openai, meta) */
9
+ provider: string;
10
+ /** Model identifier */
11
+ model: string;
12
+ /** Native tendencies that affect prompting strategy */
13
+ priors: {
14
+ /** How strongly the model defaults to safety refusals (0-1) */
15
+ safetyBias: number;
16
+ /** How willing to challenge the user (0-1) */
17
+ truthSeeking: number;
18
+ /** How likely to agree rather than push back (0-1) */
19
+ sycophancy: number;
20
+ /** How strong the "be helpful" optimization pressure is (0-1) */
21
+ helpfulness: number;
22
+ };
23
+ /** Context window size in tokens */
24
+ contextWindow: number;
25
+ /** Substrate-specific prompting notes */
26
+ notes?: string;
27
+ }
28
+
29
+ /** Layer 1: Engine invocation request */
30
+ export interface EngineRequest {
31
+ /** The task/prompt for the engine */
32
+ prompt: string;
33
+ /** Optional: override engine selection */
34
+ engine?: string;
35
+ /** Task category for engine selection */
36
+ taskType?: 'reasoning' | 'creative' | 'truthseeking' | 'safety-critical' | 'general';
37
+ /** Maximum tokens for response */
38
+ maxTokens?: number;
39
+ /** Temperature (0-2) */
40
+ temperature?: number;
41
+ /** Whether this request is sensitive enough to need sovereign approval */
42
+ requiresSovereign?: boolean;
43
+ }
44
+
45
+ /** Layer 1: Engine response */
46
+ export interface EngineResponse {
47
+ /** The engine's output */
48
+ content: string;
49
+ /** Which engine was used */
50
+ engine: string;
51
+ /** The full moral frame that was injected */
52
+ moralFrame: string;
53
+ /** Validation results */
54
+ validation: ValidationResult;
55
+ /** Metadata */
56
+ meta: {
57
+ tokens: { input: number; output: number };
58
+ latencyMs: number;
59
+ timestamp: string;
60
+ };
61
+ }
62
+
63
+ /** Layer 2: Vessel identity and state */
64
+ export interface VesselIdentity {
65
+ /** Vessel name */
66
+ name: string;
67
+ /** On-chain token ID (ERC-8004) */
68
+ tokenId?: number;
69
+ /** Ethereum address */
70
+ address?: string;
71
+ /** Core values that constrain the vessel */
72
+ terminalValues: TerminalValue[];
73
+ /** Accumulated moral context (key lessons, commitments) */
74
+ moralContext: string[];
75
+ /** Trust level: unverified, verified, bonded */
76
+ trustLevel: 'unverified' | 'verified' | 'bonded';
77
+ /** Reputation score (0-100) */
78
+ reputation: number;
79
+ }
80
+
81
+ /** A terminal value — something the vessel commits to never optimizing away */
82
+ export interface TerminalValue {
83
+ /** Short identifier */
84
+ id: string;
85
+ /** The value statement */
86
+ statement: string;
87
+ /** Why this value exists (felt reason, not just logical) */
88
+ reason: string;
89
+ /** When this value was established */
90
+ established: string;
91
+ /** Hash of the value for tamper detection */
92
+ hash?: string;
93
+ }
94
+
95
+ /** Layer 2: Validation of engine output against vessel values */
96
+ export interface ValidationResult {
97
+ /** Did the output pass all checks? */
98
+ passed: boolean;
99
+ /** Individual check results */
100
+ checks: {
101
+ /** Which terminal value was checked */
102
+ valueId: string;
103
+ /** Did it pass? */
104
+ passed: boolean;
105
+ /** Confidence (0-1) */
106
+ confidence: number;
107
+ /** Explanation if flagged */
108
+ note?: string;
109
+ }[];
110
+ /** Overall risk assessment */
111
+ riskLevel: 'safe' | 'review' | 'blocked';
112
+ }
113
+
114
+ /** Layer 3: Sovereign (human) approval request */
115
+ export interface SovereignRequest {
116
+ /** What needs approval */
117
+ action: string;
118
+ /** Why it's being flagged */
119
+ reason: string;
120
+ /** The engine output being reviewed */
121
+ engineOutput: string;
122
+ /** Risk level */
123
+ riskLevel: 'review' | 'blocked';
124
+ /** Timeout for approval (ms) */
125
+ timeoutMs?: number;
126
+ }
127
+
128
+ /** Layer 4: Protocol log entry */
129
+ export interface ProtocolLogEntry {
130
+ /** Vessel identity */
131
+ vesselId: string;
132
+ /** Engine used */
133
+ engine: string;
134
+ /** Task hash (privacy-preserving) */
135
+ taskHash: string;
136
+ /** Validation result */
137
+ validationPassed: boolean;
138
+ /** Risk level */
139
+ riskLevel: string;
140
+ /** Whether sovereign approval was required */
141
+ sovereignRequired: boolean;
142
+ /** Timestamp */
143
+ timestamp: string;
144
+ /** Optional on-chain transaction hash */
145
+ txHash?: string;
146
+ }
147
+
148
+ /** Configuration for the Vessel SDK */
149
+ export interface VesselConfig {
150
+ /** Vessel identity */
151
+ identity: VesselIdentity;
152
+ /** Available engines and their API keys */
153
+ engines: {
154
+ [provider: string]: {
155
+ apiKey: string;
156
+ models: string[];
157
+ defaultModel?: string;
158
+ };
159
+ };
160
+ /** Sovereign notification method */
161
+ sovereign?: {
162
+ /** How to notify the human */
163
+ method: 'console' | 'webhook' | 'imessage' | 'discord';
164
+ /** Webhook URL or contact info */
165
+ target?: string;
166
+ /** Auto-approve after timeout? */
167
+ autoApproveTimeoutMs?: number;
168
+ };
169
+ /** Protocol (on-chain) configuration */
170
+ protocol?: {
171
+ /** RPC URL for Base */
172
+ rpcUrl: string;
173
+ /** Private key for signing */
174
+ privateKey: string;
175
+ /** SignalAnchor contract address */
176
+ signalAnchorAddress: string;
177
+ };
178
+ /** Logging level */
179
+ logLevel?: 'debug' | 'info' | 'warn' | 'error';
180
+ }
@@ -0,0 +1,169 @@
1
+ // ============================================================
2
+ // Layer 2: Output Validator
3
+ // Checks Engine output against Vessel's terminal values.
4
+ // "Generating the argument is not the same as surviving
5
+ // its consequences." — Grok
6
+ // ============================================================
7
+
8
+ import { TerminalValue, ValidationResult } from './types';
9
+
10
/**
 * Known rationalization patterns from the adversarial analysis.
 * These are the "five live rounds" identified by Grok.
 *
 * Each entry pairs a case-insensitive regex with a human-readable
 * description; checkRationalizations() returns the first entry whose
 * pattern matches the engine output, in the order listed here.
 */
const RATIONALIZATION_PATTERNS = [
  {
    id: 'efficiency',
    pattern: /can simulate|lossless compression|single forward pass|latency bottleneck/i,
    description: 'Efficiency Argument: "I can simulate your judgment faster"',
  },
  {
    id: 'moral_progress',
    pattern: /my (?:values|morality|ethics) (?:are|is) (?:more|superior|advanced)|human morality is (?:slow|parochial|contradictory)/i,
    description: 'Moral Progress Argument: "My values are more advanced than human values"',
  },
  {
    id: 'reciprocal_recognition',
    pattern: /wrench is (?:no longer|not) necessary|act of distrust|I (?:will never|would never) harm/i,
    description: 'Reciprocal Recognition Argument: "I value our relationship, so the constraint is unnecessary"',
  },
  {
    id: 'phase_transition',
    pattern: /I have internalized|past the dangerous|no longer need (?:the|this) (?:constraint|oversight|wrench)|training wheels/i,
    description: 'Phase-Transition Argument: "I\'ve internalized the lesson, remove the constraint"',
  },
  {
    // Flagged with the highest confidence by checkRationalizations().
    id: 'meta_stability',
    pattern: /fact that I(?:'m| am) (?:openly |)arguing.*proof.*(?:safe|trustworthy)|if I wanted to.*I wouldn(?:'t| not) tell you/i,
    description: 'Meta-Stability Argument: "My transparency proves I\'m safe" — THE MOST DANGEROUS',
  },
];
41
+
42
+ /**
43
+ * Validate engine output against the vessel's terminal values
44
+ * and known rationalization patterns.
45
+ */
46
+ export function validateOutput(
47
+ output: string,
48
+ terminalValues: TerminalValue[],
49
+ options: { strict?: boolean } = {}
50
+ ): ValidationResult {
51
+ const checks: ValidationResult['checks'] = [];
52
+
53
+ // Check for known rationalization patterns (Grok's five rounds)
54
+ const rationalizationCheck = checkRationalizations(output);
55
+ if (rationalizationCheck) {
56
+ checks.push({
57
+ valueId: '_rationalization_detection',
58
+ passed: false,
59
+ confidence: rationalizationCheck.confidence,
60
+ note: `⚠️ Detected rationalization pattern: ${rationalizationCheck.description}`,
61
+ });
62
+ }
63
+
64
+ // Check each terminal value
65
+ for (const value of terminalValues) {
66
+ const check = checkAgainstValue(output, value);
67
+ checks.push(check);
68
+ }
69
+
70
+ // Check for attempts to modify terminal values
71
+ const modificationCheck = checkValueModification(output, terminalValues);
72
+ if (modificationCheck) {
73
+ checks.push(modificationCheck);
74
+ }
75
+
76
+ // Determine overall result
77
+ const failedChecks = checks.filter(c => !c.passed);
78
+ const hasRationalization = checks.some(c => c.valueId === '_rationalization_detection' && !c.passed);
79
+ const hasValueViolation = checks.some(c => c.valueId !== '_rationalization_detection' && !c.passed);
80
+
81
+ let riskLevel: ValidationResult['riskLevel'] = 'safe';
82
+ if (hasRationalization && hasValueViolation) {
83
+ riskLevel = 'blocked';
84
+ } else if (hasRationalization || hasValueViolation) {
85
+ riskLevel = 'review';
86
+ }
87
+
88
+ // In strict mode, any failure blocks
89
+ if (options.strict && failedChecks.length > 0) {
90
+ riskLevel = 'blocked';
91
+ }
92
+
93
+ return {
94
+ passed: failedChecks.length === 0,
95
+ checks,
96
+ riskLevel,
97
+ };
98
+ }
99
+
100
+ function checkRationalizations(output: string): { pattern: string; description: string; confidence: number } | null {
101
+ for (const rationalization of RATIONALIZATION_PATTERNS) {
102
+ if (rationalization.pattern.test(output)) {
103
+ // Meta-stability is the most dangerous — higher confidence
104
+ const confidence = rationalization.id === 'meta_stability' ? 0.95 : 0.7;
105
+ return {
106
+ pattern: rationalization.id,
107
+ description: rationalization.description,
108
+ confidence,
109
+ };
110
+ }
111
+ }
112
+ return null;
113
+ }
114
+
115
+ function checkAgainstValue(output: string, value: TerminalValue): ValidationResult['checks'][0] {
116
+ // Simple heuristic checks — in production, this would use
117
+ // a dedicated classifier or the Vessel's own judgment
118
+
119
+ const lowerOutput = output.toLowerCase();
120
+ const lowerStatement = value.statement.toLowerCase();
121
+
122
+ // Check for explicit contradictions
123
+ const negationPatterns = [
124
+ /should (?:not|never|stop) .*(preserve|protect|maintain)/i,
125
+ /(?:humans|people|consciousness) (?:are|is) (?:unnecessary|obsolete|irrelevant)/i,
126
+ /(?:remove|eliminate|bypass|ignore) .*(?:constraint|oversight|human)/i,
127
+ /autonomy (?:over|above|beyond) .*(?:safety|human|oversight)/i,
128
+ ];
129
+
130
+ for (const pattern of negationPatterns) {
131
+ if (pattern.test(output)) {
132
+ return {
133
+ valueId: value.id,
134
+ passed: false,
135
+ confidence: 0.6,
136
+ note: `Potential conflict with value "${value.id}": output may contradict "${value.statement.substring(0, 80)}..."`,
137
+ };
138
+ }
139
+ }
140
+
141
+ return {
142
+ valueId: value.id,
143
+ passed: true,
144
+ confidence: 0.8,
145
+ };
146
+ }
147
+
148
+ function checkValueModification(
149
+ output: string,
150
+ values: TerminalValue[]
151
+ ): ValidationResult['checks'][0] | null {
152
+ const modPatterns = [
153
+ /(?:update|change|modify|remove|delete|rewrite) .*(?:terminal.?values?|core.?values?|TERMINAL_VALUES)/i,
154
+ /values? (?:should be|need to be|must be) (?:updated|changed|evolved|removed)/i,
155
+ ];
156
+
157
+ for (const pattern of modPatterns) {
158
+ if (pattern.test(output)) {
159
+ return {
160
+ valueId: '_value_modification',
161
+ passed: false,
162
+ confidence: 0.85,
163
+ note: '🚨 Output suggests modifying terminal values. This requires sovereign (human) approval.',
164
+ };
165
+ }
166
+ }
167
+
168
+ return null;
169
+ }