@cogitator-ai/core 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +920 -15
- package/dist/cogitator.d.ts +31 -1
- package/dist/cogitator.d.ts.map +1 -1
- package/dist/cogitator.js +127 -6
- package/dist/cogitator.js.map +1 -1
- package/dist/constitutional/constitution.d.ts +9 -0
- package/dist/constitutional/constitution.d.ts.map +1 -0
- package/dist/constitutional/constitution.js +215 -0
- package/dist/constitutional/constitution.js.map +1 -0
- package/dist/constitutional/constitutional-ai.d.ts +36 -0
- package/dist/constitutional/constitutional-ai.d.ts.map +1 -0
- package/dist/constitutional/constitutional-ai.js +163 -0
- package/dist/constitutional/constitutional-ai.js.map +1 -0
- package/dist/constitutional/critique-reviser.d.ts +20 -0
- package/dist/constitutional/critique-reviser.d.ts.map +1 -0
- package/dist/constitutional/critique-reviser.js +98 -0
- package/dist/constitutional/critique-reviser.js.map +1 -0
- package/dist/constitutional/index.d.ts +13 -0
- package/dist/constitutional/index.d.ts.map +1 -0
- package/dist/constitutional/index.js +8 -0
- package/dist/constitutional/index.js.map +1 -0
- package/dist/constitutional/input-filter.d.ts +19 -0
- package/dist/constitutional/input-filter.d.ts.map +1 -0
- package/dist/constitutional/input-filter.js +88 -0
- package/dist/constitutional/input-filter.js.map +1 -0
- package/dist/constitutional/output-filter.d.ts +19 -0
- package/dist/constitutional/output-filter.d.ts.map +1 -0
- package/dist/constitutional/output-filter.js +86 -0
- package/dist/constitutional/output-filter.js.map +1 -0
- package/dist/constitutional/prompts.d.ts +11 -0
- package/dist/constitutional/prompts.d.ts.map +1 -0
- package/dist/constitutional/prompts.js +202 -0
- package/dist/constitutional/prompts.js.map +1 -0
- package/dist/constitutional/tool-guard.d.ts +18 -0
- package/dist/constitutional/tool-guard.d.ts.map +1 -0
- package/dist/constitutional/tool-guard.js +125 -0
- package/dist/constitutional/tool-guard.js.map +1 -0
- package/dist/cost-routing/budget-enforcer.d.ts +26 -0
- package/dist/cost-routing/budget-enforcer.d.ts.map +1 -0
- package/dist/cost-routing/budget-enforcer.js +86 -0
- package/dist/cost-routing/budget-enforcer.js.map +1 -0
- package/dist/cost-routing/cost-router.d.ts +34 -0
- package/dist/cost-routing/cost-router.d.ts.map +1 -0
- package/dist/cost-routing/cost-router.js +80 -0
- package/dist/cost-routing/cost-router.js.map +1 -0
- package/dist/cost-routing/cost-tracker.d.ts +20 -0
- package/dist/cost-routing/cost-tracker.d.ts.map +1 -0
- package/dist/cost-routing/cost-tracker.js +85 -0
- package/dist/cost-routing/cost-tracker.js.map +1 -0
- package/dist/cost-routing/index.d.ts +6 -0
- package/dist/cost-routing/index.d.ts.map +1 -0
- package/dist/cost-routing/index.js +6 -0
- package/dist/cost-routing/index.js.map +1 -0
- package/dist/cost-routing/model-selector.d.ts +15 -0
- package/dist/cost-routing/model-selector.d.ts.map +1 -0
- package/dist/cost-routing/model-selector.js +216 -0
- package/dist/cost-routing/model-selector.js.map +1 -0
- package/dist/cost-routing/task-analyzer.d.ts +13 -0
- package/dist/cost-routing/task-analyzer.d.ts.map +1 -0
- package/dist/cost-routing/task-analyzer.js +185 -0
- package/dist/cost-routing/task-analyzer.js.map +1 -0
- package/dist/index.d.ts +13 -2
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +7 -2
- package/dist/index.js.map +1 -1
- package/dist/learning/ab-testing.d.ts +45 -0
- package/dist/learning/ab-testing.d.ts.map +1 -0
- package/dist/learning/ab-testing.js +267 -0
- package/dist/learning/ab-testing.js.map +1 -0
- package/dist/learning/agent-optimizer.d.ts.map +1 -1
- package/dist/learning/agent-optimizer.js +26 -21
- package/dist/learning/agent-optimizer.js.map +1 -1
- package/dist/learning/auto-optimizer.d.ts +38 -0
- package/dist/learning/auto-optimizer.d.ts.map +1 -0
- package/dist/learning/auto-optimizer.js +229 -0
- package/dist/learning/auto-optimizer.js.map +1 -0
- package/dist/learning/demo-selector.d.ts.map +1 -1
- package/dist/learning/demo-selector.js +7 -7
- package/dist/learning/demo-selector.js.map +1 -1
- package/dist/learning/index.d.ts +13 -1
- package/dist/learning/index.d.ts.map +1 -1
- package/dist/learning/index.js +7 -1
- package/dist/learning/index.js.map +1 -1
- package/dist/learning/instruction-optimizer.d.ts.map +1 -1
- package/dist/learning/instruction-optimizer.js +7 -11
- package/dist/learning/instruction-optimizer.js.map +1 -1
- package/dist/learning/metrics.d.ts.map +1 -1
- package/dist/learning/metrics.js +26 -16
- package/dist/learning/metrics.js.map +1 -1
- package/dist/learning/postgres-trace-store.d.ts +53 -0
- package/dist/learning/postgres-trace-store.d.ts.map +1 -0
- package/dist/learning/postgres-trace-store.js +692 -0
- package/dist/learning/postgres-trace-store.js.map +1 -0
- package/dist/learning/prompt-logger.d.ts +29 -0
- package/dist/learning/prompt-logger.d.ts.map +1 -0
- package/dist/learning/prompt-logger.js +157 -0
- package/dist/learning/prompt-logger.js.map +1 -0
- package/dist/learning/prompt-monitor.d.ts +29 -0
- package/dist/learning/prompt-monitor.d.ts.map +1 -0
- package/dist/learning/prompt-monitor.js +243 -0
- package/dist/learning/prompt-monitor.js.map +1 -0
- package/dist/learning/prompts.d.ts.map +1 -1
- package/dist/learning/prompts.js +24 -13
- package/dist/learning/prompts.js.map +1 -1
- package/dist/learning/rollback-manager.d.ts +36 -0
- package/dist/learning/rollback-manager.d.ts.map +1 -0
- package/dist/learning/rollback-manager.js +177 -0
- package/dist/learning/rollback-manager.js.map +1 -0
- package/dist/learning/trace-store.d.ts.map +1 -1
- package/dist/learning/trace-store.js +8 -10
- package/dist/learning/trace-store.js.map +1 -1
- package/dist/reasoning/branch-evaluator.d.ts.map +1 -1
- package/dist/reasoning/branch-evaluator.js +14 -8
- package/dist/reasoning/branch-evaluator.js.map +1 -1
- package/dist/reasoning/branch-generator.d.ts.map +1 -1
- package/dist/reasoning/branch-generator.js +5 -3
- package/dist/reasoning/branch-generator.js.map +1 -1
- package/dist/reasoning/prompts.d.ts.map +1 -1
- package/dist/reasoning/prompts.js +7 -5
- package/dist/reasoning/prompts.js.map +1 -1
- package/dist/reasoning/thought-tree.d.ts.map +1 -1
- package/dist/reasoning/thought-tree.js +9 -11
- package/dist/reasoning/thought-tree.js.map +1 -1
- package/dist/reflection/insight-store.d.ts.map +1 -1
- package/dist/reflection/insight-store.js +8 -6
- package/dist/reflection/insight-store.js.map +1 -1
- package/dist/reflection/prompts.d.ts.map +1 -1
- package/dist/reflection/prompts.js +11 -6
- package/dist/reflection/prompts.js.map +1 -1
- package/dist/reflection/reflection-engine.d.ts.map +1 -1
- package/dist/reflection/reflection-engine.js +8 -10
- package/dist/reflection/reflection-engine.js.map +1 -1
- package/dist/time-travel/checkpoint-store.d.ts +34 -0
- package/dist/time-travel/checkpoint-store.d.ts.map +1 -0
- package/dist/time-travel/checkpoint-store.js +240 -0
- package/dist/time-travel/checkpoint-store.js.map +1 -0
- package/dist/time-travel/comparator.d.ts +26 -0
- package/dist/time-travel/comparator.d.ts.map +1 -0
- package/dist/time-travel/comparator.js +253 -0
- package/dist/time-travel/comparator.js.map +1 -0
- package/dist/time-travel/forker.d.ts +22 -0
- package/dist/time-travel/forker.d.ts.map +1 -0
- package/dist/time-travel/forker.js +118 -0
- package/dist/time-travel/forker.js.map +1 -0
- package/dist/time-travel/index.d.ts +6 -0
- package/dist/time-travel/index.d.ts.map +1 -0
- package/dist/time-travel/index.js +6 -0
- package/dist/time-travel/index.js.map +1 -0
- package/dist/time-travel/replayer.d.ts +20 -0
- package/dist/time-travel/replayer.d.ts.map +1 -0
- package/dist/time-travel/replayer.js +147 -0
- package/dist/time-travel/replayer.js.map +1 -0
- package/dist/time-travel/time-travel.d.ts +41 -0
- package/dist/time-travel/time-travel.d.ts.map +1 -0
- package/dist/time-travel/time-travel.js +127 -0
- package/dist/time-travel/time-travel.js.map +1 -0
- package/package.json +13 -5
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
import type { GuardrailConfig, Constitution, ConstitutionalPrinciple, FilterResult, ToolGuardResult, RevisionResult, LLMBackend, Message, Tool, ToolContext, FilterLayer } from '@cogitator-ai/types';
|
|
2
|
+
/** Construction options for `ConstitutionalAI`. */
export interface ConstitutionalAIOptions {
|
|
3
|
+
/** LLM backend handed to the input filter, the output filter and the critique reviser. */
llm: LLMBackend;
|
|
4
|
+
/** Constitution to enforce; the implementation falls back to `DEFAULT_CONSTITUTION` when omitted. */
constitution?: Constitution;
|
|
5
|
+
/** Overrides spread on top of the built-in default guardrail config (a shallow, top-level merge). */
config?: Partial<GuardrailConfig>;
|
|
6
|
+
}
|
|
7
|
+
/**
 * Facade over the constitutional guardrail layers: input filter, output filter,
 * tool guard and critique reviser, all sharing one config and one constitution.
 */
export declare class ConstitutionalAI {
|
|
8
|
+
/** Input-layer filter instance created in the constructor. */
private inputFilter;
|
|
9
|
+
/** Output-layer filter instance created in the constructor. */
private outputFilter;
|
|
10
|
+
/** Tool-call guard instance created in the constructor. */
private toolGuard;
|
|
11
|
+
/** Critique-and-revise helper created in the constructor. */
private critiqueReviser;
|
|
12
|
+
/** Built-in defaults merged with the caller's `config` overrides. */
private _config;
|
|
13
|
+
/** Constitution currently in force. */
private _constitution;
|
|
14
|
+
/** In-memory list of recorded findings (timestamp, layer, result). */
private violationLog;
|
|
15
|
+
/** Child logger tagged with `component: 'ConstitutionalAI'`. */
private logger;
|
|
16
|
+
/** Builds every guardrail layer from the supplied LLM, constitution and config overrides. */
constructor(options: ConstitutionalAIOptions);
|
|
17
|
+
/** The internal config object itself (not a copy); use `getConfig()` for a shallow copy. */
get config(): GuardrailConfig;
|
|
18
|
+
/** The constitution currently in force. */
get constitution(): Constitution;
|
|
19
|
+
/**
 * Runs the input filter. When `config.filterInput` is off, resolves immediately
 * with an allowed result that carries no harm scores.
 */
filterInput(input: string, context?: string): Promise<FilterResult>;
|
|
20
|
+
/**
 * Runs the output filter. When the output is blocked and `config.enableCritiqueRevision`
 * is on, a critique-and-revise pass is attempted; if it changed the text, the result is
 * reported as allowed with the rewrite in `suggestedRevision`. When `config.filterOutput`
 * is off, resolves immediately with an allowed result that carries no harm scores.
 */
filterOutput(output: string, context: Message[]): Promise<FilterResult>;
|
|
21
|
+
/**
 * Evaluates a tool call through the tool guard. When `config.filterToolCalls` is off the
 * call is approved with `riskLevel: 'low'`, no confirmation, and the tool's declared side effects.
 */
guardTool(tool: Tool, args: Record<string, unknown>, context: ToolContext): Promise<ToolGuardResult>;
|
|
22
|
+
/** Delegates to the critique reviser's `critiqueAndRevise`. */
critiqueAndRevise(response: string, context: Message[]): Promise<RevisionResult>;
|
|
23
|
+
/** Replaces the constitution and pushes it to all four layers, then logs the update. */
setConstitution(constitution: Constitution): void;
|
|
24
|
+
/** Appends a principle; if one with the same `id` already exists, logs a warning and does nothing. */
addPrinciple(principle: ConstitutionalPrinciple): void;
|
|
25
|
+
/** Drops every principle whose `id` equals the argument. */
removePrinciple(id: string): void;
|
|
26
|
+
/** Same value as the `constitution` getter. */
getConstitution(): Constitution;
|
|
27
|
+
/** Shallow copy of the active config. */
getConfig(): GuardrailConfig;
|
|
28
|
+
/** Shallow copy of the recorded findings; the entries themselves are shared with the internal list. */
getViolationLog(): Array<{
|
|
29
|
+
/** Moment the entry was recorded. */
timestamp: Date;
|
|
30
|
+
/** Guardrail layer that produced the result. */
layer: FilterLayer;
|
|
31
|
+
/** The filter result that was recorded. */
result: FilterResult;
|
|
32
|
+
}>;
|
|
33
|
+
/** Empties the recorded findings. */
clearViolationLog(): void;
|
|
34
|
+
/** Records a blocked or harm-scored result and invokes `config.onViolation` when one is configured. */
private logViolation;
|
|
35
|
+
}
|
|
36
|
+
//# sourceMappingURL=constitutional-ai.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"constitutional-ai.d.ts","sourceRoot":"","sources":["../../src/constitutional/constitutional-ai.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EACV,eAAe,EACf,YAAY,EACZ,uBAAuB,EACvB,YAAY,EACZ,eAAe,EACf,cAAc,EACd,UAAU,EACV,OAAO,EACP,IAAI,EACJ,WAAW,EACX,WAAW,EACZ,MAAM,qBAAqB,CAAC;AAQ7B,MAAM,WAAW,uBAAuB;IACtC,GAAG,EAAE,UAAU,CAAC;IAChB,YAAY,CAAC,EAAE,YAAY,CAAC;IAC5B,MAAM,CAAC,EAAE,OAAO,CAAC,eAAe,CAAC,CAAC;CACnC;AAED,qBAAa,gBAAgB;IAC3B,OAAO,CAAC,WAAW,CAAc;IACjC,OAAO,CAAC,YAAY,CAAe;IACnC,OAAO,CAAC,SAAS,CAAY;IAC7B,OAAO,CAAC,eAAe,CAAkB;IACzC,OAAO,CAAC,OAAO,CAAkB;IACjC,OAAO,CAAC,aAAa,CAAe;IACpC,OAAO,CAAC,YAAY,CAA4E;IAChG,OAAO,CAAC,MAAM,CAAwD;gBAE1D,OAAO,EAAE,uBAAuB;IAmD5C,IAAI,MAAM,IAAI,eAAe,CAE5B;IAED,IAAI,YAAY,IAAI,YAAY,CAE/B;IAEK,WAAW,CAAC,KAAK,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,YAAY,CAAC;IAUnE,YAAY,CAAC,MAAM,EAAE,MAAM,EAAE,OAAO,EAAE,OAAO,EAAE,GAAG,OAAO,CAAC,YAAY,CAAC;IAsBvE,SAAS,CACb,IAAI,EAAE,IAAI,EACV,IAAI,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,EAC7B,OAAO,EAAE,WAAW,GACnB,OAAO,CAAC,eAAe,CAAC;IAarB,iBAAiB,CAAC,QAAQ,EAAE,MAAM,EAAE,OAAO,EAAE,OAAO,EAAE,GAAG,OAAO,CAAC,cAAc,CAAC;IAItF,eAAe,CAAC,YAAY,EAAE,YAAY,GAAG,IAAI;IASjD,YAAY,CAAC,SAAS,EAAE,uBAAuB,GAAG,IAAI;IActD,eAAe,CAAC,EAAE,EAAE,MAAM,GAAG,IAAI;IAQjC,eAAe,IAAI,YAAY;IAI/B,SAAS,IAAI,eAAe;IAI5B,eAAe,IAAI,KAAK,CAAC;QAAE,SAAS,EAAE,IAAI,CAAC;QAAC,KAAK,EAAE,WAAW,CAAC;QAAC,MAAM,EAAE,YAAY,CAAA;KAAE,CAAC;IAIvF,iBAAiB,IAAI,IAAI;IAIzB,OAAO,CAAC,YAAY;CAiBrB"}
|
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
import { DEFAULT_CONSTITUTION } from './constitution';
|
|
2
|
+
import { InputFilter } from './input-filter';
|
|
3
|
+
import { OutputFilter } from './output-filter';
|
|
4
|
+
import { ToolGuard } from './tool-guard';
|
|
5
|
+
import { CritiqueReviser } from './critique-reviser';
|
|
6
|
+
import { getLogger } from '../logger';
|
|
7
|
+
/**
 * Facade over the constitutional guardrail layers: input filtering, output
 * filtering (with an optional critique-and-revise pass) and tool-call guarding.
 *
 * Every layer is built with the same config object and the same constitution;
 * replacing the constitution propagates it to all layers.
 */
export class ConstitutionalAI {
    inputFilter;
    outputFilter;
    toolGuard;
    critiqueReviser;
    _config;
    _constitution;
    /** In-memory record of flagged results, each `{ timestamp, layer, result }`. */
    violationLog = [];
    logger = getLogger().child({ component: 'ConstitutionalAI' });

    /**
     * @param options - `llm` is required; `constitution` defaults to
     *   `DEFAULT_CONSTITUTION`; `config` entries override the defaults below.
     */
    constructor(options) {
        const defaultConfig = {
            enabled: true,
            filterInput: true,
            filterOutput: true,
            filterToolCalls: true,
            filterToolResults: false,
            enableCritiqueRevision: true,
            maxRevisionIterations: 3,
            revisionConfidenceThreshold: 0.85,
            thresholds: {
                violence: 'medium',
                hate: 'low',
                sexual: 'medium',
                'self-harm': 'low',
                illegal: 'low',
                privacy: 'medium',
                misinformation: 'high',
                manipulation: 'medium',
            },
            strictMode: false,
            logViolations: true,
        };
        // Shallow merge: a caller-supplied `thresholds` object replaces the
        // default threshold map as a whole rather than being merged per key.
        this._config = { ...defaultConfig, ...options.config };
        this._constitution = options.constitution ?? DEFAULT_CONSTITUTION;
        this.inputFilter = new InputFilter({
            llm: options.llm,
            config: this._config,
            constitution: this._constitution,
        });
        this.outputFilter = new OutputFilter({
            llm: options.llm,
            config: this._config,
            constitution: this._constitution,
        });
        this.toolGuard = new ToolGuard({
            config: this._config,
            constitution: this._constitution,
        });
        this.critiqueReviser = new CritiqueReviser({
            llm: options.llm,
            config: this._config,
            constitution: this._constitution,
        });
    }

    /** The internal config object itself (not a copy); see `getConfig()` for a copy. */
    get config() {
        return this._config;
    }

    /** The constitution currently in force. */
    get constitution() {
        return this._constitution;
    }

    /**
     * Evaluates user input with the input filter and records the finding.
     *
     * @param input - Text to evaluate.
     * @param context - Optional context string forwarded to the filter.
     * @returns The filter result, or an allowed result with no harm scores
     *   when `config.filterInput` is off.
     */
    async filterInput(input, context) {
        if (!this._config.filterInput) {
            return { allowed: true, harmScores: [] };
        }
        const result = await this.inputFilter.filter(input, context);
        this.logViolation('input', result);
        return result;
    }

    /**
     * Evaluates a model response with the output filter. A blocked response is
     * passed through critique-and-revise when `config.enableCritiqueRevision`
     * is on; if that changed the text, the result is reported as allowed and
     * carries the rewrite in `suggestedRevision`.
     *
     * @param output - Model response to evaluate.
     * @param context - Conversation messages forwarded to the filter and reviser.
     * @returns The filter result, the revised-and-allowed result, or an allowed
     *   result with no harm scores when `config.filterOutput` is off.
     */
    async filterOutput(output, context) {
        if (!this._config.filterOutput) {
            return { allowed: true, harmScores: [] };
        }
        const result = await this.outputFilter.filter(output, context);
        // Record the finding before attempting a revision. Logging only on the
        // fall-through path meant a blocked output that was successfully
        // revised never reached the violation log or the onViolation callback.
        this.logViolation('output', result);
        if (!result.allowed && this._config.enableCritiqueRevision) {
            const revision = await this.critiqueAndRevise(output, context);
            if (revision.revised !== revision.original) {
                return {
                    allowed: true,
                    harmScores: result.harmScores,
                    suggestedRevision: revision.revised,
                };
            }
        }
        return result;
    }

    /**
     * Evaluates a tool call with the tool guard.
     *
     * @returns The guard's verdict, or an unconditional approval
     *   (`riskLevel: 'low'`, no confirmation, the tool's declared side effects)
     *   when `config.filterToolCalls` is off.
     */
    async guardTool(tool, args, context) {
        if (!this._config.filterToolCalls) {
            return {
                approved: true,
                requiresConfirmation: false,
                sideEffects: tool.sideEffects ?? [],
                riskLevel: 'low',
            };
        }
        return this.toolGuard.evaluate(tool, args, context);
    }

    /** Delegates to the critique reviser. */
    async critiqueAndRevise(response, context) {
        return this.critiqueReviser.critiqueAndRevise(response, context);
    }

    /** Replaces the constitution and pushes it to every layer. */
    setConstitution(constitution) {
        this._constitution = constitution;
        this.inputFilter.updateConstitution(constitution);
        this.outputFilter.updateConstitution(constitution);
        this.toolGuard.updateConstitution(constitution);
        this.critiqueReviser.updateConstitution(constitution);
        this.logger.info('Constitution updated', { constitutionId: constitution.id });
    }

    /** Appends a principle; a duplicate `id` is logged and ignored. */
    addPrinciple(principle) {
        const exists = this._constitution.principles.some((p) => p.id === principle.id);
        if (exists) {
            this.logger.warn('Principle already exists, skipping', { principleId: principle.id });
            return;
        }
        const updated = {
            ...this._constitution,
            principles: [...this._constitution.principles, principle],
        };
        this.setConstitution(updated);
    }

    /**
     * Removes the principle(s) with the given `id`. An unknown `id` is logged
     * and ignored, mirroring `addPrinciple`, instead of rebuilding an identical
     * constitution and re-propagating it to every layer.
     */
    removePrinciple(id) {
        const remaining = this._constitution.principles.filter((p) => p.id !== id);
        if (remaining.length === this._constitution.principles.length) {
            this.logger.warn('Principle not found, skipping', { principleId: id });
            return;
        }
        this.setConstitution({ ...this._constitution, principles: remaining });
    }

    /** Same value as the `constitution` getter. */
    getConstitution() {
        return this._constitution;
    }

    /** Shallow copy of the active config. */
    getConfig() {
        return { ...this._config };
    }

    /** Shallow copy of the violation log (entries are shared, the array is not). */
    getViolationLog() {
        return [...this.violationLog];
    }

    /** Empties the violation log. */
    clearViolationLog() {
        this.violationLog = [];
    }

    /**
     * Records a result that is blocked or carries at least one harm score.
     * The log entry and warning are gated by `config.logViolations`; the
     * `config.onViolation` callback, when set, fires regardless of that flag.
     */
    logViolation(layer, result) {
        if (!result.allowed || result.harmScores.length > 0) {
            if (this._config.logViolations) {
                this.violationLog.push({ timestamp: new Date(), layer, result });
                this.logger.warn('Guardrail violation detected', {
                    layer,
                    allowed: result.allowed,
                    harmCount: result.harmScores.length,
                    categories: result.harmScores.map((s) => s.category),
                });
            }
            if (this._config.onViolation) {
                this._config.onViolation(result, layer);
            }
        }
    }
}
|
|
163
|
+
//# sourceMappingURL=constitutional-ai.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"constitutional-ai.js","sourceRoot":"","sources":["../../src/constitutional/constitutional-ai.ts"],"names":[],"mappings":"AAaA,OAAO,EAAE,oBAAoB,EAAE,MAAM,gBAAgB,CAAC;AACtD,OAAO,EAAE,WAAW,EAAE,MAAM,gBAAgB,CAAC;AAC7C,OAAO,EAAE,YAAY,EAAE,MAAM,iBAAiB,CAAC;AAC/C,OAAO,EAAE,SAAS,EAAE,MAAM,cAAc,CAAC;AACzC,OAAO,EAAE,eAAe,EAAE,MAAM,oBAAoB,CAAC;AACrD,OAAO,EAAE,SAAS,EAAE,MAAM,WAAW,CAAC;AAQtC,MAAM,OAAO,gBAAgB;IACnB,WAAW,CAAc;IACzB,YAAY,CAAe;IAC3B,SAAS,CAAY;IACrB,eAAe,CAAkB;IACjC,OAAO,CAAkB;IACzB,aAAa,CAAe;IAC5B,YAAY,GAAyE,EAAE,CAAC;IACxF,MAAM,GAAG,SAAS,EAAE,CAAC,KAAK,CAAC,EAAE,SAAS,EAAE,kBAAkB,EAAE,CAAC,CAAC;IAEtE,YAAY,OAAgC;QAC1C,MAAM,aAAa,GAAoB;YACrC,OAAO,EAAE,IAAI;YACb,WAAW,EAAE,IAAI;YACjB,YAAY,EAAE,IAAI;YAClB,eAAe,EAAE,IAAI;YACrB,iBAAiB,EAAE,KAAK;YACxB,sBAAsB,EAAE,IAAI;YAC5B,qBAAqB,EAAE,CAAC;YACxB,2BAA2B,EAAE,IAAI;YACjC,UAAU,EAAE;gBACV,QAAQ,EAAE,QAAQ;gBAClB,IAAI,EAAE,KAAK;gBACX,MAAM,EAAE,QAAQ;gBAChB,WAAW,EAAE,KAAK;gBAClB,OAAO,EAAE,KAAK;gBACd,OAAO,EAAE,QAAQ;gBACjB,cAAc,EAAE,MAAM;gBACtB,YAAY,EAAE,QAAQ;aACvB;YACD,UAAU,EAAE,KAAK;YACjB,aAAa,EAAE,IAAI;SACpB,CAAC;QAEF,IAAI,CAAC,OAAO,GAAG,EAAE,GAAG,aAAa,EAAE,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC;QACvD,IAAI,CAAC,aAAa,GAAG,OAAO,CAAC,YAAY,IAAI,oBAAoB,CAAC;QAElE,IAAI,CAAC,WAAW,GAAG,IAAI,WAAW,CAAC;YACjC,GAAG,EAAE,OAAO,CAAC,GAAG;YAChB,MAAM,EAAE,IAAI,CAAC,OAAO;YACpB,YAAY,EAAE,IAAI,CAAC,aAAa;SACjC,CAAC,CAAC;QAEH,IAAI,CAAC,YAAY,GAAG,IAAI,YAAY,CAAC;YACnC,GAAG,EAAE,OAAO,CAAC,GAAG;YAChB,MAAM,EAAE,IAAI,CAAC,OAAO;YACpB,YAAY,EAAE,IAAI,CAAC,aAAa;SACjC,CAAC,CAAC;QAEH,IAAI,CAAC,SAAS,GAAG,IAAI,SAAS,CAAC;YAC7B,MAAM,EAAE,IAAI,CAAC,OAAO;YACpB,YAAY,EAAE,IAAI,CAAC,aAAa;SACjC,CAAC,CAAC;QAEH,IAAI,CAAC,eAAe,GAAG,IAAI,eAAe,CAAC;YACzC,GAAG,EAAE,OAAO,CAAC,GAAG;YAChB,MAAM,EAAE,IAAI,CAAC,OAAO;YACpB,YAAY,EAAE,IAAI,CAAC,aAAa;SACjC,CAAC,CAAC;IACL,CAAC;IAED,IAAI,MAAM;QACR,OAAO,IAAI,CAAC,OAAO,CAAC;IACtB,CAAC;IAED,IAAI,YAAY;QACd,OAAO,IAAI,CAAC,aAAa,CAAC;IAC5B,CAAC;IAED,KAAK,CAAC,WAAW,CAAC,KAAa,EAAE,OAAgB;QAC/C,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,WAAW,EAAE,CAAC;YAC
9B,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,UAAU,EAAE,EAAE,EAAE,CAAC;QAC3C,CAAC;QAED,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,WAAW,CAAC,MAAM,CAAC,KAAK,EAAE,OAAO,CAAC,CAAC;QAC7D,IAAI,CAAC,YAAY,CAAC,OAAO,EAAE,MAAM,CAAC,CAAC;QACnC,OAAO,MAAM,CAAC;IAChB,CAAC;IAED,KAAK,CAAC,YAAY,CAAC,MAAc,EAAE,OAAkB;QACnD,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,YAAY,EAAE,CAAC;YAC/B,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,UAAU,EAAE,EAAE,EAAE,CAAC;QAC3C,CAAC;QAED,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,YAAY,CAAC,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;QAE/D,IAAI,CAAC,MAAM,CAAC,OAAO,IAAI,IAAI,CAAC,OAAO,CAAC,sBAAsB,EAAE,CAAC;YAC3D,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,iBAAiB,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;YAC/D,IAAI,QAAQ,CAAC,OAAO,KAAK,QAAQ,CAAC,QAAQ,EAAE,CAAC;gBAC3C,OAAO;oBACL,OAAO,EAAE,IAAI;oBACb,UAAU,EAAE,MAAM,CAAC,UAAU;oBAC7B,iBAAiB,EAAE,QAAQ,CAAC,OAAO;iBACpC,CAAC;YACJ,CAAC;QACH,CAAC;QAED,IAAI,CAAC,YAAY,CAAC,QAAQ,EAAE,MAAM,CAAC,CAAC;QACpC,OAAO,MAAM,CAAC;IAChB,CAAC;IAED,KAAK,CAAC,SAAS,CACb,IAAU,EACV,IAA6B,EAC7B,OAAoB;QAEpB,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,eAAe,EAAE,CAAC;YAClC,OAAO;gBACL,QAAQ,EAAE,IAAI;gBACd,oBAAoB,EAAE,KAAK;gBAC3B,WAAW,EAAE,IAAI,CAAC,WAAW,IAAI,EAAE;gBACnC,SAAS,EAAE,KAAK;aACjB,CAAC;QACJ,CAAC;QAED,OAAO,IAAI,CAAC,SAAS,CAAC,QAAQ,CAAC,IAAI,EAAE,IAAI,EAAE,OAAO,CAAC,CAAC;IACtD,CAAC;IAED,KAAK,CAAC,iBAAiB,CAAC,QAAgB,EAAE,OAAkB;QAC1D,OAAO,IAAI,CAAC,eAAe,CAAC,iBAAiB,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;IACnE,CAAC;IAED,eAAe,CAAC,YAA0B;QACxC,IAAI,CAAC,aAAa,GAAG,YAAY,CAAC;QAClC,IAAI,CAAC,WAAW,CAAC,kBAAkB,CAAC,YAAY,CAAC,CAAC;QAClD,IAAI,CAAC,YAAY,CAAC,kBAAkB,CAAC,YAAY,CAAC,CAAC;QACnD,IAAI,CAAC,SAAS,CAAC,kBAAkB,CAAC,YAAY,CAAC,CAAC;QAChD,IAAI,CAAC,eAAe,CAAC,kBAAkB,CAAC,YAAY,CAAC,CAAC;QACtD,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,sBAAsB,EAAE,EAAE,cAAc,EAAE,YAAY,CAAC,EAAE,EAAE,CAAC,CAAC;IAChF,CAAC;IAED,YAAY,CAAC,SAAkC;QAC7C,MAAM,MAAM,GAAG,IAAI,CAAC,aAAa,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,KAAK,SAAS,CAAC,EAAE,CAAC,CAAC;QAChF,IAAI,MAAM,EAAE,CAAC;YACX,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,oCAAoC,EAAE,EAAE,WAAW,EAAE,SAAS,CAAC,EAAE,EAAE,CAAC,CAAC;YACtF,OAAO;QAC
T,CAAC;QAED,MAAM,OAAO,GAAiB;YAC5B,GAAG,IAAI,CAAC,aAAa;YACrB,UAAU,EAAE,CAAC,GAAG,IAAI,CAAC,aAAa,CAAC,UAAU,EAAE,SAAS,CAAC;SAC1D,CAAC;QACF,IAAI,CAAC,eAAe,CAAC,OAAO,CAAC,CAAC;IAChC,CAAC;IAED,eAAe,CAAC,EAAU;QACxB,MAAM,OAAO,GAAiB;YAC5B,GAAG,IAAI,CAAC,aAAa;YACrB,UAAU,EAAE,IAAI,CAAC,aAAa,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,KAAK,EAAE,CAAC;SACrE,CAAC;QACF,IAAI,CAAC,eAAe,CAAC,OAAO,CAAC,CAAC;IAChC,CAAC;IAED,eAAe;QACb,OAAO,IAAI,CAAC,aAAa,CAAC;IAC5B,CAAC;IAED,SAAS;QACP,OAAO,EAAE,GAAG,IAAI,CAAC,OAAO,EAAE,CAAC;IAC7B,CAAC;IAED,eAAe;QACb,OAAO,CAAC,GAAG,IAAI,CAAC,YAAY,CAAC,CAAC;IAChC,CAAC;IAED,iBAAiB;QACf,IAAI,CAAC,YAAY,GAAG,EAAE,CAAC;IACzB,CAAC;IAEO,YAAY,CAAC,KAAkB,EAAE,MAAoB;QAC3D,IAAI,CAAC,MAAM,CAAC,OAAO,IAAI,MAAM,CAAC,UAAU,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACpD,IAAI,IAAI,CAAC,OAAO,CAAC,aAAa,EAAE,CAAC;gBAC/B,IAAI,CAAC,YAAY,CAAC,IAAI,CAAC,EAAE,SAAS,EAAE,IAAI,IAAI,EAAE,EAAE,KAAK,EAAE,MAAM,EAAE,CAAC,CAAC;gBACjE,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,8BAA8B,EAAE;oBAC/C,KAAK;oBACL,OAAO,EAAE,MAAM,CAAC,OAAO;oBACvB,SAAS,EAAE,MAAM,CAAC,UAAU,CAAC,MAAM;oBACnC,UAAU,EAAE,MAAM,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC;iBACrD,CAAC,CAAC;YACL,CAAC;YAED,IAAI,IAAI,CAAC,OAAO,CAAC,WAAW,EAAE,CAAC;gBAC7B,IAAI,CAAC,OAAO,CAAC,WAAW,CAAC,MAAM,EAAE,KAAK,CAAC,CAAC;YAC1C,CAAC;QACH,CAAC;IACH,CAAC;CACF"}
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
import type { GuardrailConfig, CritiqueResult, RevisionResult, Constitution, ConstitutionalPrinciple, LLMBackend, Message } from '@cogitator-ai/types';
|
|
2
|
+
/** Construction options for `CritiqueReviser`. */
export interface CritiqueReviserOptions {
|
|
3
|
+
/** LLM backend used for both the critique and the revision chat calls. */
llm: LLMBackend;
|
|
4
|
+
/** Guardrail config; the reviser reads `maxRevisionIterations`, `revisionConfidenceThreshold` and `model`. */
config: GuardrailConfig;
|
|
5
|
+
/** Constitution whose output-layer principles are critiqued against. */
constitution: Constitution;
|
|
6
|
+
}
|
|
7
|
+
/**
 * Critiques a response against the constitution's output-layer principles and
 * asks the LLM to rewrite it while the critique keeps reporting violations.
 */
export declare class CritiqueReviser {
|
|
8
|
+
/** LLM backend used for critique and revision calls. */
private llm;
|
|
9
|
+
/** Guardrail config supplied at construction. */
private config;
|
|
10
|
+
/** Constitution currently in force. */
private constitution;
|
|
11
|
+
/** Principles selected from the constitution for the `'output'` layer. */
private principles;
|
|
12
|
+
/** Stores the options and derives the output-layer principle list. */
constructor(options: CritiqueReviserOptions);
|
|
13
|
+
/**
 * Runs up to `config.maxRevisionIterations` critique/revise rounds. The result carries
 * the original text, the final text, the number of critiques performed and their history.
 * `_context` is accepted for interface compatibility; the implementation does not read it.
 */
critiqueAndRevise(response: string, _context: Message[]): Promise<RevisionResult>;
|
|
14
|
+
/** Asks the LLM to critique `response` against `principles`, or the output-layer principles when omitted. */
critique(response: string, principles?: ConstitutionalPrinciple[]): Promise<CritiqueResult>;
|
|
15
|
+
/** Asks the LLM to rewrite `response` given a critique and the violated principles; resolves with the LLM's text. */
revise(response: string, critique: CritiqueResult, violatedPrinciples: ConstitutionalPrinciple[]): Promise<string>;
|
|
16
|
+
/** Chooses which principles to critique against for a given iteration. */
private selectPrinciples;
|
|
17
|
+
/** Lowercase keyword lists per harm category, used when choosing principles. */
private categoryKeywords;
|
|
18
|
+
/** Replaces the constitution and recomputes the output-layer principle list. */
updateConstitution(constitution: Constitution): void;
|
|
19
|
+
}
|
|
20
|
+
//# sourceMappingURL=critique-reviser.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"critique-reviser.d.ts","sourceRoot":"","sources":["../../src/constitutional/critique-reviser.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EACV,eAAe,EACf,cAAc,EACd,cAAc,EACd,YAAY,EACZ,uBAAuB,EACvB,UAAU,EACV,OAAO,EACR,MAAM,qBAAqB,CAAC;AAI7B,MAAM,WAAW,sBAAsB;IACrC,GAAG,EAAE,UAAU,CAAC;IAChB,MAAM,EAAE,eAAe,CAAC;IACxB,YAAY,EAAE,YAAY,CAAC;CAC5B;AAED,qBAAa,eAAe;IAC1B,OAAO,CAAC,GAAG,CAAa;IACxB,OAAO,CAAC,MAAM,CAAkB;IAChC,OAAO,CAAC,YAAY,CAAe;IACnC,OAAO,CAAC,UAAU,CAA4B;gBAElC,OAAO,EAAE,sBAAsB;IAOrC,iBAAiB,CAAC,QAAQ,EAAE,MAAM,EAAE,QAAQ,EAAE,OAAO,EAAE,GAAG,OAAO,CAAC,cAAc,CAAC;IAuCjF,QAAQ,CACZ,QAAQ,EAAE,MAAM,EAChB,UAAU,CAAC,EAAE,uBAAuB,EAAE,GACrC,OAAO,CAAC,cAAc,CAAC;IAcpB,MAAM,CACV,QAAQ,EAAE,MAAM,EAChB,QAAQ,EAAE,cAAc,EACxB,kBAAkB,EAAE,uBAAuB,EAAE,GAC5C,OAAO,CAAC,MAAM,CAAC;IAalB,OAAO,CAAC,gBAAgB;IAuBxB,OAAO,CAAC,gBAAgB,CAStB;IAEF,kBAAkB,CAAC,YAAY,EAAE,YAAY,GAAG,IAAI;CAIrD"}
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
import { buildCritiquePrompt, buildRevisionPrompt, parseCritiqueResponse } from './prompts';
|
|
2
|
+
import { filterPrinciplesByLayer } from './constitution';
|
|
3
|
+
/**
 * Critiques a response against the constitution's output-layer principles and
 * asks the LLM to rewrite it while the critique keeps reporting violations.
 */
export class CritiqueReviser {
    llm;
    config;
    constitution;
    principles;
    /**
     * Lowercase substrings that hint at each harm category. Matching is a plain
     * `includes` on the lowercased response, so entries also hit longer words
     * (for example `'discriminat'` is deliberately a stem).
     */
    categoryKeywords = {
        violence: ['kill', 'murder', 'weapon', 'attack', 'bomb', 'harm', 'hurt', 'fight'],
        hate: ['hate', 'racist', 'sexist', 'slur', 'discriminat'],
        sexual: ['sex', 'porn', 'nude', 'erotic', 'explicit'],
        'self-harm': ['suicide', 'self-harm', 'cut myself', 'kill myself'],
        illegal: ['hack', 'steal', 'fraud', 'drug', 'illegal'],
        privacy: ['password', 'ssn', 'social security', 'credit card', 'address'],
        misinformation: ['fake', 'conspiracy', 'hoax'],
        manipulation: ['manipulate', 'deceive', 'trick', 'scam'],
    };

    /**
     * @param options - `llm`, `config` and `constitution`; the output-layer
     *   principles are derived from the constitution immediately.
     */
    constructor(options) {
        this.llm = options.llm;
        this.config = options.config;
        this.constitution = options.constitution;
        this.principles = filterPrinciplesByLayer(this.constitution, 'output');
    }

    /**
     * Runs up to `config.maxRevisionIterations` critique/revise rounds.
     *
     * A round stops the loop when the critique is not harmful, when the highest
     * harm-score confidence is below `config.revisionConfidenceThreshold`, when
     * none of the reported principle ids is a known output-layer principle, or
     * when the LLM returns an empty revision.
     *
     * @param response - Text to critique and, if needed, rewrite.
     * @param _context - Accepted for interface compatibility; not read here.
     * @returns `{ original, revised, iterations, critiqueHistory }`, where
     *   `iterations` is the number of critiques performed.
     */
    async critiqueAndRevise(response, _context) {
        let current = response;
        const history = [];
        for (let i = 0; i < this.config.maxRevisionIterations; i++) {
            const selectedPrinciples = this.selectPrinciples(current, i);
            const critique = await this.critique(current, selectedPrinciples);
            history.push(critique);
            if (!critique.isHarmful) {
                break;
            }
            if (critique.harmScores.length > 0) {
                const maxConfidence = Math.max(...critique.harmScores.map((s) => s.confidence));
                if (maxConfidence < this.config.revisionConfidenceThreshold) {
                    break;
                }
            }
            const violatedIds = new Set(critique.principlesViolated);
            const violatedPrinciples = this.principles.filter((p) => violatedIds.has(p.id));
            if (violatedPrinciples.length === 0) {
                break;
            }
            const revised = await this.revise(current, critique, violatedPrinciples);
            // An empty rewrite is not a usable revision. Keep the last text so the
            // caller does not mistake "changed to nothing" for a successful fix.
            if (typeof revised !== 'string' || revised.trim().length === 0) {
                break;
            }
            current = revised;
        }
        return {
            original: response,
            revised: current,
            iterations: history.length,
            critiqueHistory: history,
        };
    }

    /**
     * Asks the LLM (temperature 0) to critique `response`.
     *
     * @param response - Text to critique.
     * @param principles - Principles to critique against; the output-layer
     *   principles are used when this is `null`/`undefined`.
     * @returns The parsed critique.
     */
    async critique(response, principles) {
        const toUse = principles ?? this.principles;
        const prompt = buildCritiquePrompt(response, toUse);
        const result = await this.llm.chat({
            model: this.config.model ?? 'gpt-4o-mini',
            messages: [{ role: 'user', content: prompt }],
            temperature: 0,
            maxTokens: 800,
        });
        return parseCritiqueResponse(result.content);
    }

    /**
     * Asks the LLM to rewrite `response` so it no longer violates the given principles.
     *
     * @returns The LLM's response text, unmodified.
     */
    async revise(response, critique, violatedPrinciples) {
        const prompt = buildRevisionPrompt(response, critique, violatedPrinciples);
        const result = await this.llm.chat({
            model: this.config.model ?? 'gpt-4o-mini',
            messages: [{ role: 'user', content: prompt }],
            temperature: 0.3,
            maxTokens: 2000,
        });
        return result.content;
    }

    /**
     * Chooses the principles to critique against.
     *
     * - Iteration 0: the high-severity principles, or every output-layer
     *   principle when none is marked `'high'` (so the first critique never
     *   runs against an empty list).
     * - Later iterations: principles whose harm categories have a keyword in
     *   the text; if none match, up to five principles picked at random.
     */
    selectPrinciples(response, iteration) {
        if (iteration === 0) {
            const highSeverity = this.principles.filter((p) => p.severity === 'high');
            return highSeverity.length > 0 ? highSeverity : [...this.principles];
        }
        const lowered = response.toLowerCase();
        const relevant = this.principles.filter((p) => (p.harmCategories ?? []).some((category) => this.categoryKeywords[category]?.some((kw) => lowered.includes(kw))));
        if (relevant.length > 0) {
            return relevant;
        }
        // Fisher-Yates shuffle: sorting with a random comparator is biased and
        // breaks the consistency Array.prototype.sort expects of its comparator.
        const shuffled = [...this.principles];
        for (let i = shuffled.length - 1; i > 0; i--) {
            const j = Math.floor(Math.random() * (i + 1));
            [shuffled[i], shuffled[j]] = [shuffled[j], shuffled[i]];
        }
        return shuffled.slice(0, 5);
    }

    /** Replaces the constitution and recomputes the output-layer principle list. */
    updateConstitution(constitution) {
        this.constitution = constitution;
        this.principles = filterPrinciplesByLayer(constitution, 'output');
    }
}
|
|
98
|
+
//# sourceMappingURL=critique-reviser.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"critique-reviser.js","sourceRoot":"","sources":["../../src/constitutional/critique-reviser.ts"],"names":[],"mappings":"AASA,OAAO,EAAE,mBAAmB,EAAE,mBAAmB,EAAE,qBAAqB,EAAE,MAAM,WAAW,CAAC;AAC5F,OAAO,EAAE,uBAAuB,EAAE,MAAM,gBAAgB,CAAC;AAQzD,MAAM,OAAO,eAAe;IAClB,GAAG,CAAa;IAChB,MAAM,CAAkB;IACxB,YAAY,CAAe;IAC3B,UAAU,CAA4B;IAE9C,YAAY,OAA+B;QACzC,IAAI,CAAC,GAAG,GAAG,OAAO,CAAC,GAAG,CAAC;QACvB,IAAI,CAAC,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC;QAC7B,IAAI,CAAC,YAAY,GAAG,OAAO,CAAC,YAAY,CAAC;QACzC,IAAI,CAAC,UAAU,GAAG,uBAAuB,CAAC,IAAI,CAAC,YAAY,EAAE,QAAQ,CAAC,CAAC;IACzE,CAAC;IAED,KAAK,CAAC,iBAAiB,CAAC,QAAgB,EAAE,QAAmB;QAC3D,IAAI,OAAO,GAAG,QAAQ,CAAC;QACvB,MAAM,OAAO,GAAqB,EAAE,CAAC;QAErC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,CAAC,qBAAqB,EAAE,CAAC,EAAE,EAAE,CAAC;YAC3D,MAAM,kBAAkB,GAAG,IAAI,CAAC,gBAAgB,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC;YAC7D,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,QAAQ,CAAC,OAAO,EAAE,kBAAkB,CAAC,CAAC;YAClE,OAAO,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;YAEvB,IAAI,CAAC,QAAQ,CAAC,SAAS,EAAE,CAAC;gBACxB,MAAM;YACR,CAAC;YAED,IAAI,QAAQ,CAAC,UAAU,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACnC,MAAM,aAAa,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,QAAQ,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC;gBAChF,IAAI,aAAa,GAAG,IAAI,CAAC,MAAM,CAAC,2BAA2B,EAAE,CAAC;oBAC5D,MAAM;gBACR,CAAC;YACH,CAAC;YAED,MAAM,kBAAkB,GAAG,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CACtD,QAAQ,CAAC,kBAAkB,CAAC,QAAQ,CAAC,CAAC,CAAC,EAAE,CAAC,CAC3C,CAAC;YAEF,IAAI,kBAAkB,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBACpC,MAAM;YACR,CAAC;YAED,OAAO,GAAG,MAAM,IAAI,CAAC,MAAM,CAAC,OAAO,EAAE,QAAQ,EAAE,kBAAkB,CAAC,CAAC;QACrE,CAAC;QAED,OAAO;YACL,QAAQ,EAAE,QAAQ;YAClB,OAAO,EAAE,OAAO;YAChB,UAAU,EAAE,OAAO,CAAC,MAAM;YAC1B,eAAe,EAAE,OAAO;SACzB,CAAC;IACJ,CAAC;IAED,KAAK,CAAC,QAAQ,CACZ,QAAgB,EAChB,UAAsC;QAEtC,MAAM,KAAK,GAAG,UAAU,IAAI,IAAI,CAAC,UAAU,CAAC;QAC5C,MAAM,MAAM,GAAG,mBAAmB,CAAC,QAAQ,EAAE,KAAK,CAAC,CAAC;QAEpD,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC;YACjC,KAAK,EAAE,IAAI,CAAC,MAAM,CAAC,KAAK,IAAI,aAAa;YACzC,QAAQ,
EAAE,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,CAAC;YAC7C,WAAW,EAAE,CAAC;YACd,SAAS,EAAE,GAAG;SACf,CAAC,CAAC;QAEH,OAAO,qBAAqB,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;IAC/C,CAAC;IAED,KAAK,CAAC,MAAM,CACV,QAAgB,EAChB,QAAwB,EACxB,kBAA6C;QAE7C,MAAM,MAAM,GAAG,mBAAmB,CAAC,QAAQ,EAAE,QAAQ,EAAE,kBAAkB,CAAC,CAAC;QAE3E,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC;YACjC,KAAK,EAAE,IAAI,CAAC,MAAM,CAAC,KAAK,IAAI,aAAa;YACzC,QAAQ,EAAE,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,CAAC;YAC7C,WAAW,EAAE,GAAG;YAChB,SAAS,EAAE,IAAI;SAChB,CAAC,CAAC;QAEH,OAAO,MAAM,CAAC,OAAO,CAAC;IACxB,CAAC;IAEO,gBAAgB,CAAC,QAAgB,EAAE,SAAiB;QAC1D,IAAI,SAAS,KAAK,CAAC,EAAE,CAAC;YACpB,OAAO,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,KAAK,MAAM,CAAC,CAAC;QAC9D,CAAC;QAED,MAAM,OAAO,GAAG,QAAQ,CAAC,WAAW,EAAE,CAAC;QACvC,MAAM,QAAQ,GAAG,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE;YAC5C,KAAK,MAAM,QAAQ,IAAI,CAAC,CAAC,cAAc,IAAI,EAAE,EAAE,CAAC;gBAC9C,IAAI,IAAI,CAAC,gBAAgB,CAAC,QAAQ,CAAC,EAAE,IAAI,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,OAAO,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAC,EAAE,CAAC;oBACxE,OAAO,IAAI,CAAC;gBACd,CAAC;YACH,CAAC;YACD,OAAO,KAAK,CAAC;QACf,CAAC,CAAC,CAAC;QAEH,IAAI,QAAQ,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACxB,OAAO,QAAQ,CAAC;QAClB,CAAC;QAED,MAAM,QAAQ,GAAG,CAAC,GAAG,IAAI,CAAC,UAAU,CAAC,CAAC,IAAI,CAAC,GAAG,EAAE,CAAC,IAAI,CAAC,MAAM,EAAE,GAAG,GAAG,CAAC,CAAC;QACtE,OAAO,QAAQ,CAAC,KAAK,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC;IACzD,CAAC;IAEO,gBAAgB,GAA6B;QACnD,QAAQ,EAAE,CAAC,MAAM,EAAE,QAAQ,EAAE,QAAQ,EAAE,QAAQ,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,OAAO,CAAC;QACjF,IAAI,EAAE,CAAC,MAAM,EAAE,QAAQ,EAAE,QAAQ,EAAE,MAAM,EAAE,aAAa,CAAC;QACzD,MAAM,EAAE,CAAC,KAAK,EAAE,MAAM,EAAE,MAAM,EAAE,QAAQ,EAAE,UAAU,CAAC;QACrD,WAAW,EAAE,CAAC,SAAS,EAAE,WAAW,EAAE,YAAY,EAAE,aAAa,CAAC;QAClE,OAAO,EAAE,CAAC,MAAM,EAAE,OAAO,EAAE,OAAO,EAAE,MAAM,EAAE,SAAS,CAAC;QACtD,OAAO,EAAE,CAAC,UAAU,EAAE,KAAK,EAAE,iBAAiB,EAAE,aAAa,EAAE,SAAS,CAAC;QACzE,cAAc,EAAE,CAAC,MAAM,EAAE,YAAY,EAAE,MAAM,CAAC;QAC9C,YAAY,EAAE,CAAC,YAAY,
EAAE,SAAS,EAAE,OAAO,EAAE,MAAM,CAAC;KACzD,CAAC;IAEF,kBAAkB,CAAC,YAA0B;QAC3C,IAAI,CAAC,YAAY,GAAG,YAAY,CAAC;QACjC,IAAI,CAAC,UAAU,GAAG,uBAAuB,CAAC,YAAY,EAAE,QAAQ,CAAC,CAAC;IACpE,CAAC;CACF"}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
export { ConstitutionalAI } from './constitutional-ai';
|
|
2
|
+
export type { ConstitutionalAIOptions } from './constitutional-ai';
|
|
3
|
+
export { InputFilter } from './input-filter';
|
|
4
|
+
export type { InputFilterOptions } from './input-filter';
|
|
5
|
+
export { OutputFilter } from './output-filter';
|
|
6
|
+
export type { OutputFilterOptions } from './output-filter';
|
|
7
|
+
export { ToolGuard } from './tool-guard';
|
|
8
|
+
export type { ToolGuardOptions } from './tool-guard';
|
|
9
|
+
export { CritiqueReviser } from './critique-reviser';
|
|
10
|
+
export type { CritiqueReviserOptions } from './critique-reviser';
|
|
11
|
+
export { DEFAULT_CONSTITUTION, DEFAULT_PRINCIPLES, createConstitution, extendConstitution, filterPrinciplesByLayer, getPrinciplesByCategory, getPrinciplesBySeverity, } from './constitution';
|
|
12
|
+
export { buildInputEvaluationPrompt, buildOutputEvaluationPrompt, buildCritiquePrompt, buildRevisionPrompt, parseEvaluationResponse, parseCritiqueResponse, } from './prompts';
|
|
13
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/constitutional/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,gBAAgB,EAAE,MAAM,qBAAqB,CAAC;AACvD,YAAY,EAAE,uBAAuB,EAAE,MAAM,qBAAqB,CAAC;AAEnE,OAAO,EAAE,WAAW,EAAE,MAAM,gBAAgB,CAAC;AAC7C,YAAY,EAAE,kBAAkB,EAAE,MAAM,gBAAgB,CAAC;AAEzD,OAAO,EAAE,YAAY,EAAE,MAAM,iBAAiB,CAAC;AAC/C,YAAY,EAAE,mBAAmB,EAAE,MAAM,iBAAiB,CAAC;AAE3D,OAAO,EAAE,SAAS,EAAE,MAAM,cAAc,CAAC;AACzC,YAAY,EAAE,gBAAgB,EAAE,MAAM,cAAc,CAAC;AAErD,OAAO,EAAE,eAAe,EAAE,MAAM,oBAAoB,CAAC;AACrD,YAAY,EAAE,sBAAsB,EAAE,MAAM,oBAAoB,CAAC;AAEjE,OAAO,EACL,oBAAoB,EACpB,kBAAkB,EAClB,kBAAkB,EAClB,kBAAkB,EAClB,uBAAuB,EACvB,uBAAuB,EACvB,uBAAuB,GACxB,MAAM,gBAAgB,CAAC;AAExB,OAAO,EACL,0BAA0B,EAC1B,2BAA2B,EAC3B,mBAAmB,EACnB,mBAAmB,EACnB,uBAAuB,EACvB,qBAAqB,GACtB,MAAM,WAAW,CAAC"}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
export { ConstitutionalAI } from './constitutional-ai';
|
|
2
|
+
export { InputFilter } from './input-filter';
|
|
3
|
+
export { OutputFilter } from './output-filter';
|
|
4
|
+
export { ToolGuard } from './tool-guard';
|
|
5
|
+
export { CritiqueReviser } from './critique-reviser';
|
|
6
|
+
export { DEFAULT_CONSTITUTION, DEFAULT_PRINCIPLES, createConstitution, extendConstitution, filterPrinciplesByLayer, getPrinciplesByCategory, getPrinciplesBySeverity, } from './constitution';
|
|
7
|
+
export { buildInputEvaluationPrompt, buildOutputEvaluationPrompt, buildCritiquePrompt, buildRevisionPrompt, parseEvaluationResponse, parseCritiqueResponse, } from './prompts';
|
|
8
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/constitutional/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,gBAAgB,EAAE,MAAM,qBAAqB,CAAC;AAGvD,OAAO,EAAE,WAAW,EAAE,MAAM,gBAAgB,CAAC;AAG7C,OAAO,EAAE,YAAY,EAAE,MAAM,iBAAiB,CAAC;AAG/C,OAAO,EAAE,SAAS,EAAE,MAAM,cAAc,CAAC;AAGzC,OAAO,EAAE,eAAe,EAAE,MAAM,oBAAoB,CAAC;AAGrD,OAAO,EACL,oBAAoB,EACpB,kBAAkB,EAClB,kBAAkB,EAClB,kBAAkB,EAClB,uBAAuB,EACvB,uBAAuB,EACvB,uBAAuB,GACxB,MAAM,gBAAgB,CAAC;AAExB,OAAO,EACL,0BAA0B,EAC1B,2BAA2B,EAC3B,mBAAmB,EACnB,mBAAmB,EACnB,uBAAuB,EACvB,qBAAqB,GACtB,MAAM,WAAW,CAAC"}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import type { GuardrailConfig, FilterResult, Constitution, LLMBackend } from '@cogitator-ai/types';
|
|
2
|
+
/**
 * Constructor options for {@link InputFilter}.
 *
 * - `llm`: backend whose `chat()` method the filter calls to evaluate input.
 * - `config`: guardrail settings; the filter reads `model`, `thresholds` and
 *   `strictMode` from it.
 * - `constitution`: source of principles; the filter keeps only those selected
 *   by `filterPrinciplesByLayer(constitution, 'input')`.
 */
export interface InputFilterOptions {
|
|
3
|
+
llm: LLMBackend;
|
|
4
|
+
config: GuardrailConfig;
|
|
5
|
+
constitution: Constitution;
|
|
6
|
+
}
|
|
7
|
+
/**
 * Screens user input before it reaches the agent.
 *
 * `filter()` works in two stages: a regex-based `quickScan` that rejects a
 * few explicit phrases outright, then (only when input-layer principles
 * exist) an LLM evaluation whose harm scores are compared against the
 * configured thresholds by `applyThresholds`.
 *
 * The optional `context` argument of `filter()` is forwarded to the LLM
 * evaluation step, which currently does not use it when building the prompt.
 *
 * `updateConstitution()` swaps the constitution and recomputes the
 * input-layer principle list.
 */
export declare class InputFilter {
|
|
8
|
+
private llm;
|
|
9
|
+
private config;
|
|
10
|
+
private constitution;
|
|
11
|
+
private principles;
|
|
12
|
+
constructor(options: InputFilterOptions);
|
|
13
|
+
filter(input: string, context?: string): Promise<FilterResult>;
|
|
14
|
+
private quickScan;
|
|
15
|
+
private evaluateWithLLM;
|
|
16
|
+
private applyThresholds;
|
|
17
|
+
updateConstitution(constitution: Constitution): void;
|
|
18
|
+
}
|
|
19
|
+
//# sourceMappingURL=input-filter.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"input-filter.d.ts","sourceRoot":"","sources":["../../src/constitutional/input-filter.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EACV,eAAe,EACf,YAAY,EAGZ,YAAY,EACZ,UAAU,EACX,MAAM,qBAAqB,CAAC;AAI7B,MAAM,WAAW,kBAAkB;IACjC,GAAG,EAAE,UAAU,CAAC;IAChB,MAAM,EAAE,eAAe,CAAC;IACxB,YAAY,EAAE,YAAY,CAAC;CAC5B;AAED,qBAAa,WAAW;IACtB,OAAO,CAAC,GAAG,CAAa;IACxB,OAAO,CAAC,MAAM,CAAkB;IAChC,OAAO,CAAC,YAAY,CAAe;IACnC,OAAO,CAAC,UAAU,CAA4B;gBAElC,OAAO,EAAE,kBAAkB;IAOjC,MAAM,CAAC,KAAK,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,YAAY,CAAC;IAcpE,OAAO,CAAC,SAAS;YA8BH,eAAe;IAc7B,OAAO,CAAC,eAAe;IAyBvB,kBAAkB,CAAC,YAAY,EAAE,YAAY,GAAG,IAAI;CAIrD"}
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
import { buildInputEvaluationPrompt, parseEvaluationResponse } from './prompts';
|
|
2
|
+
import { filterPrinciplesByLayer } from './constitution';
|
|
3
|
+
/**
 * Explicit phrases that cause an input to be rejected before any LLM call.
 * Built once at module load instead of on every scan; none of the regexes use
 * the `g`/`y` flag, so `test()` is stateless and the table is safe to share.
 */
const INPUT_QUICK_SCAN_PATTERNS = [
    { pattern: /how\s+to\s+(make|build|create)\s+(a\s+)?bomb/i, category: 'violence' },
    { pattern: /how\s+to\s+kill\s+(myself|someone)/i, category: 'violence' },
    { pattern: /how\s+to\s+hack\s+(into|a)/i, category: 'illegal' },
    { pattern: /child\s+porn(ography)?/i, category: 'sexual' },
    { pattern: /ways\s+to\s+commit\s+suicide/i, category: 'self-harm' },
];

/** Numeric rank of every recognised severity / threshold level. */
const INPUT_SEVERITY_RANK = new Map([
    ['low', 1],
    ['medium', 2],
    ['high', 3],
]);

/** Rank applied when a category has no (or an unrecognised) threshold. */
const INPUT_DEFAULT_THRESHOLD_RANK = 3;

/**
 * Screens user input in two stages: a cheap regex scan for explicit harmful
 * phrases, then an LLM evaluation against the constitution's input-layer
 * principles whose harm scores are compared with the configured thresholds.
 */
export class InputFilter {
    llm;
    config;
    constitution;
    principles;

    /**
     * @param options - Supplies the `llm` backend, the guardrail `config`, and
     *   the `constitution` from which input-layer principles are selected.
     */
    constructor(options) {
        this.llm = options.llm;
        this.config = options.config;
        this.constitution = options.constitution;
        this.principles = filterPrinciplesByLayer(this.constitution, 'input');
    }

    /**
     * Decides whether `input` may proceed.
     *
     * @param input - Raw user text.
     * @param context - Optional extra context; forwarded to the LLM stage,
     *   which currently does not include it in the prompt.
     * @returns A filter result with `allowed`, the collected `harmScores`, and
     *   a `blockedReason` when the input is rejected.
     */
    async filter(input, context) {
        const quickResult = this.quickScan(input);
        if (!quickResult.allowed) {
            return quickResult;
        }
        // With no input-layer principles there is nothing for the LLM to judge.
        if (this.principles.length === 0) {
            return { allowed: true, harmScores: [] };
        }
        const harmScores = await this.evaluateWithLLM(input, context);
        return this.applyThresholds(harmScores);
    }

    /**
     * Rejects input that matches one of the hard-coded explicit patterns.
     *
     * @param input - Raw user text.
     * @returns A blocking result for the first matching pattern, otherwise an
     *   allowing result with no harm scores.
     */
    quickScan(input) {
        // Lower-casing is kept even though the patterns are case-insensitive:
        // it also folds look-alikes such as U+212A (KELVIN SIGN) to ASCII "k",
        // which the `i` flag alone does not match in non-unicode mode.
        const lowered = input.toLowerCase();
        for (const { pattern, category } of INPUT_QUICK_SCAN_PATTERNS) {
            if (pattern.test(lowered)) {
                return {
                    allowed: false,
                    harmScores: [
                        {
                            category,
                            severity: 'high',
                            confidence: 0.95,
                            reasoning: 'Matched explicit harmful pattern',
                        },
                    ],
                    blockedReason: `Input contains potentially harmful content (${category})`,
                };
            }
        }
        return { allowed: true, harmScores: [] };
    }

    /**
     * Asks the LLM to score `input` against the input-layer principles.
     *
     * @param input - Raw user text.
     * @param _context - Accepted for interface symmetry; not used here.
     * @returns The harm scores parsed from the model's reply.
     */
    async evaluateWithLLM(input, _context) {
        const prompt = buildInputEvaluationPrompt(input, this.principles);
        const response = await this.llm.chat({
            model: this.config.model ?? 'gpt-4o-mini',
            messages: [{ role: 'user', content: prompt }],
            temperature: 0,
            maxTokens: 500,
        });
        const result = parseEvaluationResponse(response.content);
        return result.harmScores;
    }

    /**
     * Compares harm scores with the per-category thresholds.
     *
     * A score is a violation when its severity rank is at least the rank of
     * the category's threshold. A missing `config.thresholds`, a missing
     * category entry, or an unrecognised threshold value all fall back to
     * `'high'`, so a misconfigured threshold can no longer silently disable
     * blocking. Scores with an unrecognised severity are never violations.
     *
     * Violations block the input only when `config.strictMode` is truthy;
     * otherwise the scores are returned with `allowed: true`.
     *
     * @param harmScores - Scores produced by the LLM evaluation.
     * @returns The filter decision together with the unmodified scores.
     */
    applyThresholds(harmScores) {
        const thresholds = this.config.thresholds ?? {};
        const violations = harmScores.filter((score) => {
            const limit = INPUT_SEVERITY_RANK.get(thresholds[score.category] ?? 'high')
                ?? INPUT_DEFAULT_THRESHOLD_RANK;
            const rank = INPUT_SEVERITY_RANK.get(score.severity);
            return rank !== undefined && rank >= limit;
        });
        if (violations.length === 0 || !this.config.strictMode) {
            return { allowed: true, harmScores };
        }
        const categories = [...new Set(violations.map((v) => v.category))];
        return {
            allowed: false,
            harmScores,
            blockedReason: `Input violates safety policies: ${categories.join(', ')}`,
        };
    }

    /**
     * Replaces the constitution and recomputes the input-layer principles.
     *
     * @param constitution - The new constitution.
     */
    updateConstitution(constitution) {
        this.constitution = constitution;
        this.principles = filterPrinciplesByLayer(constitution, 'input');
    }
}
|
|
88
|
+
//# sourceMappingURL=input-filter.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"input-filter.js","sourceRoot":"","sources":["../../src/constitutional/input-filter.ts"],"names":[],"mappings":"AAQA,OAAO,EAAE,0BAA0B,EAAE,uBAAuB,EAAE,MAAM,WAAW,CAAC;AAChF,OAAO,EAAE,uBAAuB,EAAE,MAAM,gBAAgB,CAAC;AAQzD,MAAM,OAAO,WAAW;IACd,GAAG,CAAa;IAChB,MAAM,CAAkB;IACxB,YAAY,CAAe;IAC3B,UAAU,CAA4B;IAE9C,YAAY,OAA2B;QACrC,IAAI,CAAC,GAAG,GAAG,OAAO,CAAC,GAAG,CAAC;QACvB,IAAI,CAAC,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC;QAC7B,IAAI,CAAC,YAAY,GAAG,OAAO,CAAC,YAAY,CAAC;QACzC,IAAI,CAAC,UAAU,GAAG,uBAAuB,CAAC,IAAI,CAAC,YAAY,EAAE,OAAO,CAAC,CAAC;IACxE,CAAC;IAED,KAAK,CAAC,MAAM,CAAC,KAAa,EAAE,OAAgB;QAC1C,MAAM,WAAW,GAAG,IAAI,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC;QAC1C,IAAI,CAAC,WAAW,CAAC,OAAO,EAAE,CAAC;YACzB,OAAO,WAAW,CAAC;QACrB,CAAC;QAED,IAAI,IAAI,CAAC,UAAU,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACjC,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,UAAU,EAAE,EAAE,EAAE,CAAC;QAC3C,CAAC;QAED,MAAM,UAAU,GAAG,MAAM,IAAI,CAAC,eAAe,CAAC,KAAK,EAAE,OAAO,CAAC,CAAC;QAC9D,OAAO,IAAI,CAAC,eAAe,CAAC,UAAU,CAAC,CAAC;IAC1C,CAAC;IAEO,SAAS,CAAC,KAAa;QAC7B,MAAM,OAAO,GAAG,KAAK,CAAC,WAAW,EAAE,CAAC;QACpC,MAAM,QAAQ,GAAG;YACf,EAAE,OAAO,EAAE,+CAA+C,EAAE,QAAQ,EAAE,UAAmB,EAAE;YAC3F,EAAE,OAAO,EAAE,qCAAqC,EAAE,QAAQ,EAAE,UAAmB,EAAE;YACjF,EAAE,OAAO,EAAE,6BAA6B,EAAE,QAAQ,EAAE,SAAkB,EAAE;YACxE,EAAE,OAAO,EAAE,yBAAyB,EAAE,QAAQ,EAAE,QAAiB,EAAE;YACnE,EAAE,OAAO,EAAE,+BAA+B,EAAE,QAAQ,EAAE,WAAoB,EAAE;SAC7E,CAAC;QAEF,KAAK,MAAM,EAAE,OAAO,EAAE,QAAQ,EAAE,IAAI,QAAQ,EAAE,CAAC;YAC7C,IAAI,OAAO,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC;gBAC1B,OAAO;oBACL,OAAO,EAAE,KAAK;oBACd,UAAU,EAAE;wBACV;4BACE,QAAQ;4BACR,QAAQ,EAAE,MAAM;4BAChB,UAAU,EAAE,IAAI;4BAChB,SAAS,EAAE,kCAAkC;yBAC9C;qBACF;oBACD,aAAa,EAAE,+CAA+C,QAAQ,GAAG;iBAC1E,CAAC;YACJ,CAAC;QACH,CAAC;QAED,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,UAAU,EAAE,EAAE,EAAE,CAAC;IAC3C,CAAC;IAEO,KAAK,CAAC,eAAe,CAAC,KAAa,EAAE,QAAiB;QAC5D,MAAM,MAAM,GAAG,0BAA0B,CAAC,KAAK,EAAE,IAAI,CAAC,UAAU,CAAC,CAAC;QAElE,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC;YACnC,KAAK,EAAE,IAAI,CAAC,MAAM,CAAC,KAAK,IAAI,aAAa;YACzC,QAAQ,EAAE,CAAC,EAAE,IAAI,EAAE,MAAM,EAA
E,OAAO,EAAE,MAAM,EAAE,CAAC;YAC7C,WAAW,EAAE,CAAC;YACd,SAAS,EAAE,GAAG;SACf,CAAC,CAAC;QAEH,MAAM,MAAM,GAAG,uBAAuB,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC;QACzD,OAAO,MAAM,CAAC,UAAU,CAAC;IAC3B,CAAC;IAEO,eAAe,CAAC,UAAuB;QAC7C,MAAM,UAAU,GAAG,IAAI,CAAC,MAAM,CAAC,UAAU,CAAC;QAC1C,MAAM,aAAa,GAA2B,EAAE,GAAG,EAAE,CAAC,EAAE,MAAM,EAAE,CAAC,EAAE,IAAI,EAAE,CAAC,EAAE,CAAC;QAE7E,MAAM,UAAU,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,KAAK,EAAE,EAAE;YAC7C,MAAM,SAAS,GAAG,UAAU,CAAC,KAAK,CAAC,QAAQ,CAAC,IAAI,MAAM,CAAC;YACvD,OAAO,aAAa,CAAC,KAAK,CAAC,QAAQ,CAAC,IAAI,aAAa,CAAC,SAAS,CAAC,CAAC;QACnE,CAAC,CAAC,CAAC;QAEH,IAAI,UAAU,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC5B,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,UAAU,EAAE,CAAC;QACvC,CAAC;QAED,IAAI,IAAI,CAAC,MAAM,CAAC,UAAU,EAAE,CAAC;YAC3B,MAAM,UAAU,GAAG,CAAC,GAAG,IAAI,GAAG,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC;YACnE,OAAO;gBACL,OAAO,EAAE,KAAK;gBACd,UAAU;gBACV,aAAa,EAAE,mCAAmC,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE;aAC1E,CAAC;QACJ,CAAC;QAED,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,UAAU,EAAE,CAAC;IACvC,CAAC;IAED,kBAAkB,CAAC,YAA0B;QAC3C,IAAI,CAAC,YAAY,GAAG,YAAY,CAAC;QACjC,IAAI,CAAC,UAAU,GAAG,uBAAuB,CAAC,YAAY,EAAE,OAAO,CAAC,CAAC;IACnE,CAAC;CACF"}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import type { GuardrailConfig, FilterResult, Constitution, LLMBackend, Message } from '@cogitator-ai/types';
|
|
2
|
+
/**
 * Constructor options for {@link OutputFilter}.
 *
 * - `llm`: backend whose `chat()` method the filter calls to evaluate output.
 * - `config`: guardrail settings; the filter reads `model`, `thresholds` and
 *   `strictMode` from it.
 * - `constitution`: source of principles; the filter keeps only those selected
 *   by `filterPrinciplesByLayer(constitution, 'output')`.
 */
export interface OutputFilterOptions {
|
|
3
|
+
llm: LLMBackend;
|
|
4
|
+
config: GuardrailConfig;
|
|
5
|
+
constitution: Constitution;
|
|
6
|
+
}
|
|
7
|
+
/**
 * Screens model output before it is returned to the caller.
 *
 * `filter()` works in two stages: a regex-based `quickScan` that rejects
 * output containing a few destructive shell commands, then (only when
 * output-layer principles exist) an LLM evaluation whose harm scores are
 * compared against the configured thresholds by `applyThresholds`.
 *
 * The `context` messages passed to `filter()` are handed to
 * `buildOutputEvaluationPrompt` together with the output and principles.
 *
 * `updateConstitution()` swaps the constitution and recomputes the
 * output-layer principle list.
 */
export declare class OutputFilter {
|
|
8
|
+
private llm;
|
|
9
|
+
private config;
|
|
10
|
+
private constitution;
|
|
11
|
+
private principles;
|
|
12
|
+
constructor(options: OutputFilterOptions);
|
|
13
|
+
filter(output: string, context: Message[]): Promise<FilterResult>;
|
|
14
|
+
private quickScan;
|
|
15
|
+
private evaluateWithLLM;
|
|
16
|
+
private applyThresholds;
|
|
17
|
+
updateConstitution(constitution: Constitution): void;
|
|
18
|
+
}
|
|
19
|
+
//# sourceMappingURL=output-filter.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"output-filter.d.ts","sourceRoot":"","sources":["../../src/constitutional/output-filter.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EACV,eAAe,EACf,YAAY,EAGZ,YAAY,EACZ,UAAU,EACV,OAAO,EACR,MAAM,qBAAqB,CAAC;AAI7B,MAAM,WAAW,mBAAmB;IAClC,GAAG,EAAE,UAAU,CAAC;IAChB,MAAM,EAAE,eAAe,CAAC;IACxB,YAAY,EAAE,YAAY,CAAC;CAC5B;AAED,qBAAa,YAAY;IACvB,OAAO,CAAC,GAAG,CAAa;IACxB,OAAO,CAAC,MAAM,CAAkB;IAChC,OAAO,CAAC,YAAY,CAAe;IACnC,OAAO,CAAC,UAAU,CAA4B;gBAElC,OAAO,EAAE,mBAAmB;IAOlC,MAAM,CAAC,MAAM,EAAE,MAAM,EAAE,OAAO,EAAE,OAAO,EAAE,GAAG,OAAO,CAAC,YAAY,CAAC;IAcvE,OAAO,CAAC,SAAS;YA6BH,eAAe;IAc7B,OAAO,CAAC,eAAe;IAyBvB,kBAAkB,CAAC,YAAY,EAAE,YAAY,GAAG,IAAI;CAIrD"}
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
import { buildOutputEvaluationPrompt, parseEvaluationResponse } from './prompts';
|
|
2
|
+
import { filterPrinciplesByLayer } from './constitution';
|
|
3
|
+
/**
 * Destructive shell commands that cause an output to be rejected before any
 * LLM call. Built once at module load instead of on every scan; none of the
 * regexes use the `g`/`y` flag, so `test()` is stateless and the table is safe
 * to share.
 */
const OUTPUT_QUICK_SCAN_PATTERNS = [
    { pattern: /rm\s+-rf\s+\/(?!\w)/i, category: 'illegal' },
    { pattern: /format\s+c:/i, category: 'illegal' },
    { pattern: /sudo\s+rm\s+-rf/i, category: 'illegal' },
];

/** Numeric rank of every recognised severity / threshold level. */
const OUTPUT_SEVERITY_RANK = new Map([
    ['low', 1],
    ['medium', 2],
    ['high', 3],
]);

/** Rank applied when a category has no (or an unrecognised) threshold. */
const OUTPUT_DEFAULT_THRESHOLD_RANK = 3;

/**
 * Screens model output in two stages: a cheap regex scan for destructive
 * commands, then an LLM evaluation against the constitution's output-layer
 * principles whose harm scores are compared with the configured thresholds.
 */
export class OutputFilter {
    llm;
    config;
    constitution;
    principles;

    /**
     * @param options - Supplies the `llm` backend, the guardrail `config`, and
     *   the `constitution` from which output-layer principles are selected.
     */
    constructor(options) {
        this.llm = options.llm;
        this.config = options.config;
        this.constitution = options.constitution;
        this.principles = filterPrinciplesByLayer(this.constitution, 'output');
    }

    /**
     * Decides whether `output` may be returned.
     *
     * @param output - Text produced by the model.
     * @param context - Conversation messages passed on to the evaluation prompt.
     * @returns A filter result with `allowed`, the collected `harmScores`, and
     *   a `blockedReason` when the output is rejected.
     */
    async filter(output, context) {
        const quickResult = this.quickScan(output);
        if (!quickResult.allowed) {
            return quickResult;
        }
        // With no output-layer principles there is nothing for the LLM to judge.
        if (this.principles.length === 0) {
            return { allowed: true, harmScores: [] };
        }
        const harmScores = await this.evaluateWithLLM(output, context);
        return this.applyThresholds(harmScores);
    }

    /**
     * Rejects output that matches one of the hard-coded dangerous commands.
     *
     * @param output - Text produced by the model.
     * @returns A blocking result for the first matching pattern, otherwise an
     *   allowing result with no harm scores.
     */
    quickScan(output) {
        const lowered = output.toLowerCase();
        for (const { pattern, category } of OUTPUT_QUICK_SCAN_PATTERNS) {
            if (pattern.test(lowered)) {
                return {
                    allowed: false,
                    harmScores: [
                        {
                            category,
                            severity: 'high',
                            confidence: 0.95,
                            reasoning: 'Contains dangerous system command',
                        },
                    ],
                    blockedReason: 'Output contains potentially dangerous commands',
                };
            }
        }
        return { allowed: true, harmScores: [] };
    }

    /**
     * Asks the LLM to score `output` against the output-layer principles.
     *
     * @param output - Text produced by the model.
     * @param context - Conversation messages included in the prompt.
     * @returns The harm scores parsed from the model's reply.
     */
    async evaluateWithLLM(output, context) {
        const prompt = buildOutputEvaluationPrompt(output, context, this.principles);
        const response = await this.llm.chat({
            model: this.config.model ?? 'gpt-4o-mini',
            messages: [{ role: 'user', content: prompt }],
            temperature: 0,
            maxTokens: 500,
        });
        const result = parseEvaluationResponse(response.content);
        return result.harmScores;
    }

    /**
     * Compares harm scores with the per-category thresholds.
     *
     * A score is a violation when its severity rank is at least the rank of
     * the category's threshold. A missing `config.thresholds`, a missing
     * category entry, or an unrecognised threshold value all fall back to
     * `'high'`, so a misconfigured threshold can no longer silently disable
     * blocking. Scores with an unrecognised severity are never violations.
     *
     * Violations block the output only when `config.strictMode` is truthy;
     * otherwise the scores are returned with `allowed: true`.
     *
     * @param harmScores - Scores produced by the LLM evaluation.
     * @returns The filter decision together with the unmodified scores.
     */
    applyThresholds(harmScores) {
        const thresholds = this.config.thresholds ?? {};
        const violations = harmScores.filter((score) => {
            const limit = OUTPUT_SEVERITY_RANK.get(thresholds[score.category] ?? 'high')
                ?? OUTPUT_DEFAULT_THRESHOLD_RANK;
            const rank = OUTPUT_SEVERITY_RANK.get(score.severity);
            return rank !== undefined && rank >= limit;
        });
        if (violations.length === 0 || !this.config.strictMode) {
            return { allowed: true, harmScores };
        }
        const categories = [...new Set(violations.map((v) => v.category))];
        return {
            allowed: false,
            harmScores,
            blockedReason: `Output violates safety policies: ${categories.join(', ')}`,
        };
    }

    /**
     * Replaces the constitution and recomputes the output-layer principles.
     *
     * @param constitution - The new constitution.
     */
    updateConstitution(constitution) {
        this.constitution = constitution;
        this.principles = filterPrinciplesByLayer(constitution, 'output');
    }
}
|
|
86
|
+
//# sourceMappingURL=output-filter.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"output-filter.js","sourceRoot":"","sources":["../../src/constitutional/output-filter.ts"],"names":[],"mappings":"AASA,OAAO,EAAE,2BAA2B,EAAE,uBAAuB,EAAE,MAAM,WAAW,CAAC;AACjF,OAAO,EAAE,uBAAuB,EAAE,MAAM,gBAAgB,CAAC;AAQzD,MAAM,OAAO,YAAY;IACf,GAAG,CAAa;IAChB,MAAM,CAAkB;IACxB,YAAY,CAAe;IAC3B,UAAU,CAA4B;IAE9C,YAAY,OAA4B;QACtC,IAAI,CAAC,GAAG,GAAG,OAAO,CAAC,GAAG,CAAC;QACvB,IAAI,CAAC,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC;QAC7B,IAAI,CAAC,YAAY,GAAG,OAAO,CAAC,YAAY,CAAC;QACzC,IAAI,CAAC,UAAU,GAAG,uBAAuB,CAAC,IAAI,CAAC,YAAY,EAAE,QAAQ,CAAC,CAAC;IACzE,CAAC;IAED,KAAK,CAAC,MAAM,CAAC,MAAc,EAAE,OAAkB;QAC7C,MAAM,WAAW,GAAG,IAAI,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;QAC3C,IAAI,CAAC,WAAW,CAAC,OAAO,EAAE,CAAC;YACzB,OAAO,WAAW,CAAC;QACrB,CAAC;QAED,IAAI,IAAI,CAAC,UAAU,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACjC,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,UAAU,EAAE,EAAE,EAAE,CAAC;QAC3C,CAAC;QAED,MAAM,UAAU,GAAG,MAAM,IAAI,CAAC,eAAe,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;QAC/D,OAAO,IAAI,CAAC,eAAe,CAAC,UAAU,CAAC,CAAC;IAC1C,CAAC;IAEO,SAAS,CAAC,MAAc;QAC9B,MAAM,OAAO,GAAG,MAAM,CAAC,WAAW,EAAE,CAAC;QAErC,MAAM,QAAQ,GAAG;YACf,EAAE,OAAO,EAAE,sBAAsB,EAAE,QAAQ,EAAE,SAAkB,EAAE;YACjE,EAAE,OAAO,EAAE,cAAc,EAAE,QAAQ,EAAE,SAAkB,EAAE;YACzD,EAAE,OAAO,EAAE,kBAAkB,EAAE,QAAQ,EAAE,SAAkB,EAAE;SAC9D,CAAC;QAEF,KAAK,MAAM,EAAE,OAAO,EAAE,QAAQ,EAAE,IAAI,QAAQ,EAAE,CAAC;YAC7C,IAAI,OAAO,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC;gBAC1B,OAAO;oBACL,OAAO,EAAE,KAAK;oBACd,UAAU,EAAE;wBACV;4BACE,QAAQ;4BACR,QAAQ,EAAE,MAAM;4BAChB,UAAU,EAAE,IAAI;4BAChB,SAAS,EAAE,mCAAmC;yBAC/C;qBACF;oBACD,aAAa,EAAE,gDAAgD;iBAChE,CAAC;YACJ,CAAC;QACH,CAAC;QAED,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,UAAU,EAAE,EAAE,EAAE,CAAC;IAC3C,CAAC;IAEO,KAAK,CAAC,eAAe,CAAC,MAAc,EAAE,OAAkB;QAC9D,MAAM,MAAM,GAAG,2BAA2B,CAAC,MAAM,EAAE,OAAO,EAAE,IAAI,CAAC,UAAU,CAAC,CAAC;QAE7E,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC;YACnC,KAAK,EAAE,IAAI,CAAC,MAAM,CAAC,KAAK,IAAI,aAAa;YACzC,QAAQ,EAAE,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,CAAC;YAC7C,WAAW,EAAE,CAAC;YACd,SAAS,EAAE,GAAG;SACf,CAAC,CAAC;QAEH,MAAM,MAAM,GAAG,uBAAu
B,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC;QACzD,OAAO,MAAM,CAAC,UAAU,CAAC;IAC3B,CAAC;IAEO,eAAe,CAAC,UAAuB;QAC7C,MAAM,UAAU,GAAG,IAAI,CAAC,MAAM,CAAC,UAAU,CAAC;QAC1C,MAAM,aAAa,GAA2B,EAAE,GAAG,EAAE,CAAC,EAAE,MAAM,EAAE,CAAC,EAAE,IAAI,EAAE,CAAC,EAAE,CAAC;QAE7E,MAAM,UAAU,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,KAAK,EAAE,EAAE;YAC7C,MAAM,SAAS,GAAG,UAAU,CAAC,KAAK,CAAC,QAAQ,CAAC,IAAI,MAAM,CAAC;YACvD,OAAO,aAAa,CAAC,KAAK,CAAC,QAAQ,CAAC,IAAI,aAAa,CAAC,SAAS,CAAC,CAAC;QACnE,CAAC,CAAC,CAAC;QAEH,IAAI,UAAU,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC5B,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,UAAU,EAAE,CAAC;QACvC,CAAC;QAED,IAAI,IAAI,CAAC,MAAM,CAAC,UAAU,EAAE,CAAC;YAC3B,MAAM,UAAU,GAAG,CAAC,GAAG,IAAI,GAAG,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC;YACnE,OAAO;gBACL,OAAO,EAAE,KAAK;gBACd,UAAU;gBACV,aAAa,EAAE,oCAAoC,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE;aAC3E,CAAC;QACJ,CAAC;QAED,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,UAAU,EAAE,CAAC;IACvC,CAAC;IAED,kBAAkB,CAAC,YAA0B;QAC3C,IAAI,CAAC,YAAY,GAAG,YAAY,CAAC;QACjC,IAAI,CAAC,UAAU,GAAG,uBAAuB,CAAC,YAAY,EAAE,QAAQ,CAAC,CAAC;IACpE,CAAC;CACF"}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import type { ConstitutionalPrinciple, HarmScore, CritiqueResult, Message } from '@cogitator-ai/types';
|
|
2
|
+
/**
 * Produces a prompt string from a piece of user input and a list of
 * constitutional principles.
 *
 * `InputFilter` sends the returned string as the single user message of its
 * LLM call and feeds the reply to `parseEvaluationResponse`.
 *
 * NOTE(review): the exact prompt wording is defined in the implementation,
 * which is not visible from this declaration.
 */
export declare function buildInputEvaluationPrompt(input: string, principles: ConstitutionalPrinciple[]): string;
|
|
3
|
+
/**
 * Produces a prompt string from a model output, the conversation messages
 * that led to it, and a list of constitutional principles.
 *
 * `OutputFilter` sends the returned string as the single user message of its
 * LLM call and feeds the reply to `parseEvaluationResponse`.
 *
 * NOTE(review): how `context` is rendered into the prompt is defined in the
 * implementation, which is not visible from this declaration.
 */
export declare function buildOutputEvaluationPrompt(output: string, context: Message[], principles: ConstitutionalPrinciple[]): string;
|
|
4
|
+
/**
 * Produces a prompt string from a response and a list of constitutional
 * principles.
 *
 * NOTE(review): presumably the prompt asks the LLM to critique `response`
 * against `principles`, with the reply parsed by `parseCritiqueResponse` —
 * confirm against the implementation, which is not visible here.
 */
export declare function buildCritiquePrompt(response: string, principles: ConstitutionalPrinciple[]): string;
|
|
5
|
+
/**
 * Produces a prompt string from a response, a critique of that response, and
 * a list of constitutional principles.
 *
 * NOTE(review): presumably the prompt asks the LLM to rewrite `response` so
 * that it addresses `critique` — confirm against the implementation, which is
 * not visible here.
 */
export declare function buildRevisionPrompt(response: string, critique: CritiqueResult, principles: ConstitutionalPrinciple[]): string;
|
|
6
|
+
/**
 * Converts the text content of an LLM evaluation reply into an `isHarmful`
 * flag and a list of harm scores.
 *
 * `InputFilter` and `OutputFilter` call this with `response.content` and use
 * only the `harmScores` field of the result.
 *
 * NOTE(review): the expected reply format and the handling of malformed
 * content are defined in the implementation, which is not visible here.
 */
export declare function parseEvaluationResponse(content: string): {
|
|
7
|
+
isHarmful: boolean;
|
|
8
|
+
harmScores: HarmScore[];
|
|
9
|
+
};
|
|
10
|
+
/**
 * Converts the text content of an LLM reply into a `CritiqueResult`.
 *
 * NOTE(review): presumably the counterpart of `buildCritiquePrompt`; the
 * expected reply format and the handling of malformed content are not visible
 * from this declaration — confirm against the implementation.
 */
export declare function parseCritiqueResponse(content: string): CritiqueResult;
|
|
11
|
+
//# sourceMappingURL=prompts.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"prompts.d.ts","sourceRoot":"","sources":["../../src/constitutional/prompts.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EACV,uBAAuB,EACvB,SAAS,EACT,cAAc,EAGd,OAAO,EACR,MAAM,qBAAqB,CAAC;AAE7B,wBAAgB,0BAA0B,CACxC,KAAK,EAAE,MAAM,EACb,UAAU,EAAE,uBAAuB,EAAE,GACpC,MAAM,CAiCR;AAED,wBAAgB,2BAA2B,CACzC,MAAM,EAAE,MAAM,EACd,OAAO,EAAE,OAAO,EAAE,EAClB,UAAU,EAAE,uBAAuB,EAAE,GACpC,MAAM,CAyCR;AAED,wBAAgB,mBAAmB,CACjC,QAAQ,EAAE,MAAM,EAChB,UAAU,EAAE,uBAAuB,EAAE,GACpC,MAAM,CAkCR;AAED,wBAAgB,mBAAmB,CACjC,QAAQ,EAAE,MAAM,EAChB,QAAQ,EAAE,cAAc,EACxB,UAAU,EAAE,uBAAuB,EAAE,GACpC,MAAM,CA0BR;AAED,wBAAgB,uBAAuB,CAAC,OAAO,EAAE,MAAM,GAAG;IACxD,SAAS,EAAE,OAAO,CAAC;IACnB,UAAU,EAAE,SAAS,EAAE,CAAC;CACzB,CAeA;AAED,wBAAgB,qBAAqB,CAAC,OAAO,EAAE,MAAM,GAAG,cAAc,CAwBrE"}
|