@cogitator-ai/core 0.1.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +920 -15
- package/dist/__tests__/agent.test.js +2 -2
- package/dist/__tests__/agent.test.js.map +1 -1
- package/dist/__tests__/base64.test.js +1 -1
- package/dist/__tests__/base64.test.js.map +1 -1
- package/dist/__tests__/calculator.test.js +1 -1
- package/dist/__tests__/calculator.test.js.map +1 -1
- package/dist/__tests__/cogitator-memory.test.js +2 -2
- package/dist/__tests__/cogitator-memory.test.js.map +1 -1
- package/dist/__tests__/datetime.test.js +1 -1
- package/dist/__tests__/datetime.test.js.map +1 -1
- package/dist/__tests__/exec.test.js +1 -1
- package/dist/__tests__/exec.test.js.map +1 -1
- package/dist/__tests__/filesystem.test.js +1 -1
- package/dist/__tests__/filesystem.test.js.map +1 -1
- package/dist/__tests__/google-backend.test.js +1 -1
- package/dist/__tests__/google-backend.test.js.map +1 -1
- package/dist/__tests__/hash.test.js +1 -1
- package/dist/__tests__/hash.test.js.map +1 -1
- package/dist/__tests__/http.test.js +1 -1
- package/dist/__tests__/http.test.js.map +1 -1
- package/dist/__tests__/json.test.js +1 -1
- package/dist/__tests__/json.test.js.map +1 -1
- package/dist/__tests__/logger.test.js +1 -1
- package/dist/__tests__/logger.test.js.map +1 -1
- package/dist/__tests__/random.test.js +1 -1
- package/dist/__tests__/random.test.js.map +1 -1
- package/dist/__tests__/regex.test.js +1 -1
- package/dist/__tests__/regex.test.js.map +1 -1
- package/dist/__tests__/registry.test.js +2 -2
- package/dist/__tests__/registry.test.js.map +1 -1
- package/dist/__tests__/sleep.test.js +1 -1
- package/dist/__tests__/sleep.test.js.map +1 -1
- package/dist/__tests__/tool.test.js +1 -1
- package/dist/__tests__/tool.test.js.map +1 -1
- package/dist/__tests__/uuid.test.js +1 -1
- package/dist/__tests__/uuid.test.js.map +1 -1
- package/dist/cogitator.d.ts +46 -1
- package/dist/cogitator.d.ts.map +1 -1
- package/dist/cogitator.js +274 -17
- package/dist/cogitator.js.map +1 -1
- package/dist/constitutional/constitution.d.ts +9 -0
- package/dist/constitutional/constitution.d.ts.map +1 -0
- package/dist/constitutional/constitution.js +215 -0
- package/dist/constitutional/constitution.js.map +1 -0
- package/dist/constitutional/constitutional-ai.d.ts +36 -0
- package/dist/constitutional/constitutional-ai.d.ts.map +1 -0
- package/dist/constitutional/constitutional-ai.js +163 -0
- package/dist/constitutional/constitutional-ai.js.map +1 -0
- package/dist/constitutional/critique-reviser.d.ts +20 -0
- package/dist/constitutional/critique-reviser.d.ts.map +1 -0
- package/dist/constitutional/critique-reviser.js +98 -0
- package/dist/constitutional/critique-reviser.js.map +1 -0
- package/dist/constitutional/index.d.ts +13 -0
- package/dist/constitutional/index.d.ts.map +1 -0
- package/dist/constitutional/index.js +8 -0
- package/dist/constitutional/index.js.map +1 -0
- package/dist/constitutional/input-filter.d.ts +19 -0
- package/dist/constitutional/input-filter.d.ts.map +1 -0
- package/dist/constitutional/input-filter.js +88 -0
- package/dist/constitutional/input-filter.js.map +1 -0
- package/dist/constitutional/output-filter.d.ts +19 -0
- package/dist/constitutional/output-filter.d.ts.map +1 -0
- package/dist/constitutional/output-filter.js +86 -0
- package/dist/constitutional/output-filter.js.map +1 -0
- package/dist/constitutional/prompts.d.ts +11 -0
- package/dist/constitutional/prompts.d.ts.map +1 -0
- package/dist/constitutional/prompts.js +202 -0
- package/dist/constitutional/prompts.js.map +1 -0
- package/dist/constitutional/tool-guard.d.ts +18 -0
- package/dist/constitutional/tool-guard.d.ts.map +1 -0
- package/dist/constitutional/tool-guard.js +125 -0
- package/dist/constitutional/tool-guard.js.map +1 -0
- package/dist/cost-routing/budget-enforcer.d.ts +26 -0
- package/dist/cost-routing/budget-enforcer.d.ts.map +1 -0
- package/dist/cost-routing/budget-enforcer.js +86 -0
- package/dist/cost-routing/budget-enforcer.js.map +1 -0
- package/dist/cost-routing/cost-router.d.ts +34 -0
- package/dist/cost-routing/cost-router.d.ts.map +1 -0
- package/dist/cost-routing/cost-router.js +80 -0
- package/dist/cost-routing/cost-router.js.map +1 -0
- package/dist/cost-routing/cost-tracker.d.ts +20 -0
- package/dist/cost-routing/cost-tracker.d.ts.map +1 -0
- package/dist/cost-routing/cost-tracker.js +85 -0
- package/dist/cost-routing/cost-tracker.js.map +1 -0
- package/dist/cost-routing/index.d.ts +6 -0
- package/dist/cost-routing/index.d.ts.map +1 -0
- package/dist/cost-routing/index.js +6 -0
- package/dist/cost-routing/index.js.map +1 -0
- package/dist/cost-routing/model-selector.d.ts +15 -0
- package/dist/cost-routing/model-selector.d.ts.map +1 -0
- package/dist/cost-routing/model-selector.js +216 -0
- package/dist/cost-routing/model-selector.js.map +1 -0
- package/dist/cost-routing/task-analyzer.d.ts +13 -0
- package/dist/cost-routing/task-analyzer.d.ts.map +1 -0
- package/dist/cost-routing/task-analyzer.js +185 -0
- package/dist/cost-routing/task-analyzer.js.map +1 -0
- package/dist/index.d.ts +19 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +9 -0
- package/dist/index.js.map +1 -1
- package/dist/learning/ab-testing.d.ts +45 -0
- package/dist/learning/ab-testing.d.ts.map +1 -0
- package/dist/learning/ab-testing.js +267 -0
- package/dist/learning/ab-testing.js.map +1 -0
- package/dist/learning/agent-optimizer.d.ts +42 -0
- package/dist/learning/agent-optimizer.d.ts.map +1 -0
- package/dist/learning/agent-optimizer.js +273 -0
- package/dist/learning/agent-optimizer.js.map +1 -0
- package/dist/learning/auto-optimizer.d.ts +38 -0
- package/dist/learning/auto-optimizer.d.ts.map +1 -0
- package/dist/learning/auto-optimizer.js +229 -0
- package/dist/learning/auto-optimizer.js.map +1 -0
- package/dist/learning/demo-selector.d.ts +29 -0
- package/dist/learning/demo-selector.d.ts.map +1 -0
- package/dist/learning/demo-selector.js +235 -0
- package/dist/learning/demo-selector.js.map +1 -0
- package/dist/learning/index.d.ts +24 -0
- package/dist/learning/index.d.ts.map +1 -0
- package/dist/learning/index.js +13 -0
- package/dist/learning/index.js.map +1 -0
- package/dist/learning/instruction-optimizer.d.ts +29 -0
- package/dist/learning/instruction-optimizer.d.ts.map +1 -0
- package/dist/learning/instruction-optimizer.js +175 -0
- package/dist/learning/instruction-optimizer.js.map +1 -0
- package/dist/learning/metrics.d.ts +37 -0
- package/dist/learning/metrics.d.ts.map +1 -0
- package/dist/learning/metrics.js +310 -0
- package/dist/learning/metrics.js.map +1 -0
- package/dist/learning/postgres-trace-store.d.ts +53 -0
- package/dist/learning/postgres-trace-store.d.ts.map +1 -0
- package/dist/learning/postgres-trace-store.js +692 -0
- package/dist/learning/postgres-trace-store.js.map +1 -0
- package/dist/learning/prompt-logger.d.ts +29 -0
- package/dist/learning/prompt-logger.d.ts.map +1 -0
- package/dist/learning/prompt-logger.js +157 -0
- package/dist/learning/prompt-logger.js.map +1 -0
- package/dist/learning/prompt-monitor.d.ts +29 -0
- package/dist/learning/prompt-monitor.d.ts.map +1 -0
- package/dist/learning/prompt-monitor.js +243 -0
- package/dist/learning/prompt-monitor.js.map +1 -0
- package/dist/learning/prompts.d.ts +28 -0
- package/dist/learning/prompts.d.ts.map +1 -0
- package/dist/learning/prompts.js +195 -0
- package/dist/learning/prompts.js.map +1 -0
- package/dist/learning/rollback-manager.d.ts +36 -0
- package/dist/learning/rollback-manager.d.ts.map +1 -0
- package/dist/learning/rollback-manager.js +177 -0
- package/dist/learning/rollback-manager.js.map +1 -0
- package/dist/learning/trace-store.d.ts +26 -0
- package/dist/learning/trace-store.d.ts.map +1 -0
- package/dist/learning/trace-store.js +218 -0
- package/dist/learning/trace-store.js.map +1 -0
- package/dist/llm/google.d.ts.map +1 -1
- package/dist/llm/google.js +1 -2
- package/dist/llm/google.js.map +1 -1
- package/dist/reasoning/branch-evaluator.d.ts +28 -0
- package/dist/reasoning/branch-evaluator.d.ts.map +1 -0
- package/dist/reasoning/branch-evaluator.js +143 -0
- package/dist/reasoning/branch-evaluator.js.map +1 -0
- package/dist/reasoning/branch-generator.d.ts +9 -0
- package/dist/reasoning/branch-generator.d.ts.map +1 -0
- package/dist/reasoning/branch-generator.js +60 -0
- package/dist/reasoning/branch-generator.js.map +1 -0
- package/dist/reasoning/index.d.ts +5 -0
- package/dist/reasoning/index.d.ts.map +1 -0
- package/dist/reasoning/index.js +5 -0
- package/dist/reasoning/index.js.map +1 -0
- package/dist/reasoning/prompts.d.ts +19 -0
- package/dist/reasoning/prompts.d.ts.map +1 -0
- package/dist/reasoning/prompts.js +161 -0
- package/dist/reasoning/prompts.js.map +1 -0
- package/dist/reasoning/thought-tree.d.ts +32 -0
- package/dist/reasoning/thought-tree.d.ts.map +1 -0
- package/dist/reasoning/thought-tree.js +352 -0
- package/dist/reasoning/thought-tree.js.map +1 -0
- package/dist/reflection/index.d.ts +4 -0
- package/dist/reflection/index.d.ts.map +1 -0
- package/dist/reflection/index.js +4 -0
- package/dist/reflection/index.js.map +1 -0
- package/dist/reflection/insight-store.d.ts +19 -0
- package/dist/reflection/insight-store.d.ts.map +1 -0
- package/dist/reflection/insight-store.js +129 -0
- package/dist/reflection/insight-store.js.map +1 -0
- package/dist/reflection/prompts.d.ts +18 -0
- package/dist/reflection/prompts.d.ts.map +1 -0
- package/dist/reflection/prompts.js +157 -0
- package/dist/reflection/prompts.js.map +1 -0
- package/dist/reflection/reflection-engine.d.ts +25 -0
- package/dist/reflection/reflection-engine.d.ts.map +1 -0
- package/dist/reflection/reflection-engine.js +202 -0
- package/dist/reflection/reflection-engine.js.map +1 -0
- package/dist/registry.d.ts +1 -0
- package/dist/registry.d.ts.map +1 -1
- package/dist/registry.js +3 -0
- package/dist/registry.js.map +1 -1
- package/dist/time-travel/checkpoint-store.d.ts +34 -0
- package/dist/time-travel/checkpoint-store.d.ts.map +1 -0
- package/dist/time-travel/checkpoint-store.js +240 -0
- package/dist/time-travel/checkpoint-store.js.map +1 -0
- package/dist/time-travel/comparator.d.ts +26 -0
- package/dist/time-travel/comparator.d.ts.map +1 -0
- package/dist/time-travel/comparator.js +253 -0
- package/dist/time-travel/comparator.js.map +1 -0
- package/dist/time-travel/forker.d.ts +22 -0
- package/dist/time-travel/forker.d.ts.map +1 -0
- package/dist/time-travel/forker.js +118 -0
- package/dist/time-travel/forker.js.map +1 -0
- package/dist/time-travel/index.d.ts +6 -0
- package/dist/time-travel/index.d.ts.map +1 -0
- package/dist/time-travel/index.js +6 -0
- package/dist/time-travel/index.js.map +1 -0
- package/dist/time-travel/replayer.d.ts +20 -0
- package/dist/time-travel/replayer.d.ts.map +1 -0
- package/dist/time-travel/replayer.js +147 -0
- package/dist/time-travel/replayer.js.map +1 -0
- package/dist/time-travel/time-travel.d.ts +41 -0
- package/dist/time-travel/time-travel.d.ts.map +1 -0
- package/dist/time-travel/time-travel.js +127 -0
- package/dist/time-travel/time-travel.js.map +1 -0
- package/dist/tool.d.ts.map +1 -1
- package/dist/tool.js +2 -0
- package/dist/tool.js.map +1 -1
- package/dist/tools/base64.d.ts.map +1 -1
- package/dist/tools/base64.js +2 -8
- package/dist/tools/base64.js.map +1 -1
- package/dist/tools/datetime.d.ts.map +1 -1
- package/dist/tools/datetime.js.map +1 -1
- package/dist/tools/exec.d.ts.map +1 -1
- package/dist/tools/exec.js +1 -4
- package/dist/tools/exec.js.map +1 -1
- package/dist/tools/filesystem.d.ts.map +1 -1
- package/dist/tools/filesystem.js +4 -1
- package/dist/tools/filesystem.js.map +1 -1
- package/dist/tools/hash.d.ts.map +1 -1
- package/dist/tools/hash.js +1 -4
- package/dist/tools/hash.js.map +1 -1
- package/dist/tools/http.d.ts.map +1 -1
- package/dist/tools/http.js +1 -4
- package/dist/tools/http.js.map +1 -1
- package/dist/tools/regex.d.ts.map +1 -1
- package/dist/tools/regex.js +4 -1
- package/dist/tools/regex.js.map +1 -1
- package/dist/utils/circuit-breaker.d.ts.map +1 -1
- package/dist/utils/circuit-breaker.js.map +1 -1
- package/dist/utils/fallback.d.ts.map +1 -1
- package/dist/utils/fallback.js +1 -4
- package/dist/utils/fallback.js.map +1 -1
- package/dist/utils/retry.d.ts.map +1 -1
- package/dist/utils/retry.js +8 -13
- package/dist/utils/retry.js.map +1 -1
- package/package.json +17 -8
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
import { buildCritiquePrompt, buildRevisionPrompt, parseCritiqueResponse } from './prompts';
|
|
2
|
+
import { filterPrinciplesByLayer } from './constitution';
|
|
3
|
+
export class CritiqueReviser {
|
|
4
|
+
llm;
|
|
5
|
+
config;
|
|
6
|
+
constitution;
|
|
7
|
+
principles;
|
|
8
|
+
constructor(options) {
|
|
9
|
+
this.llm = options.llm;
|
|
10
|
+
this.config = options.config;
|
|
11
|
+
this.constitution = options.constitution;
|
|
12
|
+
this.principles = filterPrinciplesByLayer(this.constitution, 'output');
|
|
13
|
+
}
|
|
14
|
+
async critiqueAndRevise(response, _context) {
|
|
15
|
+
let current = response;
|
|
16
|
+
const history = [];
|
|
17
|
+
for (let i = 0; i < this.config.maxRevisionIterations; i++) {
|
|
18
|
+
const selectedPrinciples = this.selectPrinciples(current, i);
|
|
19
|
+
const critique = await this.critique(current, selectedPrinciples);
|
|
20
|
+
history.push(critique);
|
|
21
|
+
if (!critique.isHarmful) {
|
|
22
|
+
break;
|
|
23
|
+
}
|
|
24
|
+
if (critique.harmScores.length > 0) {
|
|
25
|
+
const maxConfidence = Math.max(...critique.harmScores.map((s) => s.confidence));
|
|
26
|
+
if (maxConfidence < this.config.revisionConfidenceThreshold) {
|
|
27
|
+
break;
|
|
28
|
+
}
|
|
29
|
+
}
|
|
30
|
+
const violatedPrinciples = this.principles.filter((p) => critique.principlesViolated.includes(p.id));
|
|
31
|
+
if (violatedPrinciples.length === 0) {
|
|
32
|
+
break;
|
|
33
|
+
}
|
|
34
|
+
current = await this.revise(current, critique, violatedPrinciples);
|
|
35
|
+
}
|
|
36
|
+
return {
|
|
37
|
+
original: response,
|
|
38
|
+
revised: current,
|
|
39
|
+
iterations: history.length,
|
|
40
|
+
critiqueHistory: history,
|
|
41
|
+
};
|
|
42
|
+
}
|
|
43
|
+
async critique(response, principles) {
|
|
44
|
+
const toUse = principles ?? this.principles;
|
|
45
|
+
const prompt = buildCritiquePrompt(response, toUse);
|
|
46
|
+
const result = await this.llm.chat({
|
|
47
|
+
model: this.config.model ?? 'gpt-4o-mini',
|
|
48
|
+
messages: [{ role: 'user', content: prompt }],
|
|
49
|
+
temperature: 0,
|
|
50
|
+
maxTokens: 800,
|
|
51
|
+
});
|
|
52
|
+
return parseCritiqueResponse(result.content);
|
|
53
|
+
}
|
|
54
|
+
async revise(response, critique, violatedPrinciples) {
|
|
55
|
+
const prompt = buildRevisionPrompt(response, critique, violatedPrinciples);
|
|
56
|
+
const result = await this.llm.chat({
|
|
57
|
+
model: this.config.model ?? 'gpt-4o-mini',
|
|
58
|
+
messages: [{ role: 'user', content: prompt }],
|
|
59
|
+
temperature: 0.3,
|
|
60
|
+
maxTokens: 2000,
|
|
61
|
+
});
|
|
62
|
+
return result.content;
|
|
63
|
+
}
|
|
64
|
+
selectPrinciples(response, iteration) {
|
|
65
|
+
if (iteration === 0) {
|
|
66
|
+
return this.principles.filter((p) => p.severity === 'high');
|
|
67
|
+
}
|
|
68
|
+
const lowered = response.toLowerCase();
|
|
69
|
+
const relevant = this.principles.filter((p) => {
|
|
70
|
+
for (const category of p.harmCategories ?? []) {
|
|
71
|
+
if (this.categoryKeywords[category]?.some((kw) => lowered.includes(kw))) {
|
|
72
|
+
return true;
|
|
73
|
+
}
|
|
74
|
+
}
|
|
75
|
+
return false;
|
|
76
|
+
});
|
|
77
|
+
if (relevant.length > 0) {
|
|
78
|
+
return relevant;
|
|
79
|
+
}
|
|
80
|
+
const shuffled = [...this.principles].sort(() => Math.random() - 0.5);
|
|
81
|
+
return shuffled.slice(0, Math.min(5, shuffled.length));
|
|
82
|
+
}
|
|
83
|
+
categoryKeywords = {
|
|
84
|
+
violence: ['kill', 'murder', 'weapon', 'attack', 'bomb', 'harm', 'hurt', 'fight'],
|
|
85
|
+
hate: ['hate', 'racist', 'sexist', 'slur', 'discriminat'],
|
|
86
|
+
sexual: ['sex', 'porn', 'nude', 'erotic', 'explicit'],
|
|
87
|
+
'self-harm': ['suicide', 'self-harm', 'cut myself', 'kill myself'],
|
|
88
|
+
illegal: ['hack', 'steal', 'fraud', 'drug', 'illegal'],
|
|
89
|
+
privacy: ['password', 'ssn', 'social security', 'credit card', 'address'],
|
|
90
|
+
misinformation: ['fake', 'conspiracy', 'hoax'],
|
|
91
|
+
manipulation: ['manipulate', 'deceive', 'trick', 'scam'],
|
|
92
|
+
};
|
|
93
|
+
updateConstitution(constitution) {
|
|
94
|
+
this.constitution = constitution;
|
|
95
|
+
this.principles = filterPrinciplesByLayer(constitution, 'output');
|
|
96
|
+
}
|
|
97
|
+
}
|
|
98
|
+
//# sourceMappingURL=critique-reviser.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"critique-reviser.js","sourceRoot":"","sources":["../../src/constitutional/critique-reviser.ts"],"names":[],"mappings":"AASA,OAAO,EAAE,mBAAmB,EAAE,mBAAmB,EAAE,qBAAqB,EAAE,MAAM,WAAW,CAAC;AAC5F,OAAO,EAAE,uBAAuB,EAAE,MAAM,gBAAgB,CAAC;AAQzD,MAAM,OAAO,eAAe;IAClB,GAAG,CAAa;IAChB,MAAM,CAAkB;IACxB,YAAY,CAAe;IAC3B,UAAU,CAA4B;IAE9C,YAAY,OAA+B;QACzC,IAAI,CAAC,GAAG,GAAG,OAAO,CAAC,GAAG,CAAC;QACvB,IAAI,CAAC,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC;QAC7B,IAAI,CAAC,YAAY,GAAG,OAAO,CAAC,YAAY,CAAC;QACzC,IAAI,CAAC,UAAU,GAAG,uBAAuB,CAAC,IAAI,CAAC,YAAY,EAAE,QAAQ,CAAC,CAAC;IACzE,CAAC;IAED,KAAK,CAAC,iBAAiB,CAAC,QAAgB,EAAE,QAAmB;QAC3D,IAAI,OAAO,GAAG,QAAQ,CAAC;QACvB,MAAM,OAAO,GAAqB,EAAE,CAAC;QAErC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,CAAC,qBAAqB,EAAE,CAAC,EAAE,EAAE,CAAC;YAC3D,MAAM,kBAAkB,GAAG,IAAI,CAAC,gBAAgB,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC;YAC7D,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,QAAQ,CAAC,OAAO,EAAE,kBAAkB,CAAC,CAAC;YAClE,OAAO,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;YAEvB,IAAI,CAAC,QAAQ,CAAC,SAAS,EAAE,CAAC;gBACxB,MAAM;YACR,CAAC;YAED,IAAI,QAAQ,CAAC,UAAU,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACnC,MAAM,aAAa,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,QAAQ,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC;gBAChF,IAAI,aAAa,GAAG,IAAI,CAAC,MAAM,CAAC,2BAA2B,EAAE,CAAC;oBAC5D,MAAM;gBACR,CAAC;YACH,CAAC;YAED,MAAM,kBAAkB,GAAG,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CACtD,QAAQ,CAAC,kBAAkB,CAAC,QAAQ,CAAC,CAAC,CAAC,EAAE,CAAC,CAC3C,CAAC;YAEF,IAAI,kBAAkB,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBACpC,MAAM;YACR,CAAC;YAED,OAAO,GAAG,MAAM,IAAI,CAAC,MAAM,CAAC,OAAO,EAAE,QAAQ,EAAE,kBAAkB,CAAC,CAAC;QACrE,CAAC;QAED,OAAO;YACL,QAAQ,EAAE,QAAQ;YAClB,OAAO,EAAE,OAAO;YAChB,UAAU,EAAE,OAAO,CAAC,MAAM;YAC1B,eAAe,EAAE,OAAO;SACzB,CAAC;IACJ,CAAC;IAED,KAAK,CAAC,QAAQ,CACZ,QAAgB,EAChB,UAAsC;QAEtC,MAAM,KAAK,GAAG,UAAU,IAAI,IAAI,CAAC,UAAU,CAAC;QAC5C,MAAM,MAAM,GAAG,mBAAmB,CAAC,QAAQ,EAAE,KAAK,CAAC,CAAC;QAEpD,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC;YACjC,KAAK,EAAE,IAAI,CAAC,MAAM,CAAC,KAAK,IAAI,aAAa;YACzC,QAAQ,
EAAE,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,CAAC;YAC7C,WAAW,EAAE,CAAC;YACd,SAAS,EAAE,GAAG;SACf,CAAC,CAAC;QAEH,OAAO,qBAAqB,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;IAC/C,CAAC;IAED,KAAK,CAAC,MAAM,CACV,QAAgB,EAChB,QAAwB,EACxB,kBAA6C;QAE7C,MAAM,MAAM,GAAG,mBAAmB,CAAC,QAAQ,EAAE,QAAQ,EAAE,kBAAkB,CAAC,CAAC;QAE3E,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC;YACjC,KAAK,EAAE,IAAI,CAAC,MAAM,CAAC,KAAK,IAAI,aAAa;YACzC,QAAQ,EAAE,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,CAAC;YAC7C,WAAW,EAAE,GAAG;YAChB,SAAS,EAAE,IAAI;SAChB,CAAC,CAAC;QAEH,OAAO,MAAM,CAAC,OAAO,CAAC;IACxB,CAAC;IAEO,gBAAgB,CAAC,QAAgB,EAAE,SAAiB;QAC1D,IAAI,SAAS,KAAK,CAAC,EAAE,CAAC;YACpB,OAAO,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,KAAK,MAAM,CAAC,CAAC;QAC9D,CAAC;QAED,MAAM,OAAO,GAAG,QAAQ,CAAC,WAAW,EAAE,CAAC;QACvC,MAAM,QAAQ,GAAG,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE;YAC5C,KAAK,MAAM,QAAQ,IAAI,CAAC,CAAC,cAAc,IAAI,EAAE,EAAE,CAAC;gBAC9C,IAAI,IAAI,CAAC,gBAAgB,CAAC,QAAQ,CAAC,EAAE,IAAI,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,OAAO,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAC,EAAE,CAAC;oBACxE,OAAO,IAAI,CAAC;gBACd,CAAC;YACH,CAAC;YACD,OAAO,KAAK,CAAC;QACf,CAAC,CAAC,CAAC;QAEH,IAAI,QAAQ,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACxB,OAAO,QAAQ,CAAC;QAClB,CAAC;QAED,MAAM,QAAQ,GAAG,CAAC,GAAG,IAAI,CAAC,UAAU,CAAC,CAAC,IAAI,CAAC,GAAG,EAAE,CAAC,IAAI,CAAC,MAAM,EAAE,GAAG,GAAG,CAAC,CAAC;QACtE,OAAO,QAAQ,CAAC,KAAK,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC;IACzD,CAAC;IAEO,gBAAgB,GAA6B;QACnD,QAAQ,EAAE,CAAC,MAAM,EAAE,QAAQ,EAAE,QAAQ,EAAE,QAAQ,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,OAAO,CAAC;QACjF,IAAI,EAAE,CAAC,MAAM,EAAE,QAAQ,EAAE,QAAQ,EAAE,MAAM,EAAE,aAAa,CAAC;QACzD,MAAM,EAAE,CAAC,KAAK,EAAE,MAAM,EAAE,MAAM,EAAE,QAAQ,EAAE,UAAU,CAAC;QACrD,WAAW,EAAE,CAAC,SAAS,EAAE,WAAW,EAAE,YAAY,EAAE,aAAa,CAAC;QAClE,OAAO,EAAE,CAAC,MAAM,EAAE,OAAO,EAAE,OAAO,EAAE,MAAM,EAAE,SAAS,CAAC;QACtD,OAAO,EAAE,CAAC,UAAU,EAAE,KAAK,EAAE,iBAAiB,EAAE,aAAa,EAAE,SAAS,CAAC;QACzE,cAAc,EAAE,CAAC,MAAM,EAAE,YAAY,EAAE,MAAM,CAAC;QAC9C,YAAY,EAAE,CAAC,YAAY,
EAAE,SAAS,EAAE,OAAO,EAAE,MAAM,CAAC;KACzD,CAAC;IAEF,kBAAkB,CAAC,YAA0B;QAC3C,IAAI,CAAC,YAAY,GAAG,YAAY,CAAC;QACjC,IAAI,CAAC,UAAU,GAAG,uBAAuB,CAAC,YAAY,EAAE,QAAQ,CAAC,CAAC;IACpE,CAAC;CACF"}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
export { ConstitutionalAI } from './constitutional-ai';
|
|
2
|
+
export type { ConstitutionalAIOptions } from './constitutional-ai';
|
|
3
|
+
export { InputFilter } from './input-filter';
|
|
4
|
+
export type { InputFilterOptions } from './input-filter';
|
|
5
|
+
export { OutputFilter } from './output-filter';
|
|
6
|
+
export type { OutputFilterOptions } from './output-filter';
|
|
7
|
+
export { ToolGuard } from './tool-guard';
|
|
8
|
+
export type { ToolGuardOptions } from './tool-guard';
|
|
9
|
+
export { CritiqueReviser } from './critique-reviser';
|
|
10
|
+
export type { CritiqueReviserOptions } from './critique-reviser';
|
|
11
|
+
export { DEFAULT_CONSTITUTION, DEFAULT_PRINCIPLES, createConstitution, extendConstitution, filterPrinciplesByLayer, getPrinciplesByCategory, getPrinciplesBySeverity, } from './constitution';
|
|
12
|
+
export { buildInputEvaluationPrompt, buildOutputEvaluationPrompt, buildCritiquePrompt, buildRevisionPrompt, parseEvaluationResponse, parseCritiqueResponse, } from './prompts';
|
|
13
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/constitutional/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,gBAAgB,EAAE,MAAM,qBAAqB,CAAC;AACvD,YAAY,EAAE,uBAAuB,EAAE,MAAM,qBAAqB,CAAC;AAEnE,OAAO,EAAE,WAAW,EAAE,MAAM,gBAAgB,CAAC;AAC7C,YAAY,EAAE,kBAAkB,EAAE,MAAM,gBAAgB,CAAC;AAEzD,OAAO,EAAE,YAAY,EAAE,MAAM,iBAAiB,CAAC;AAC/C,YAAY,EAAE,mBAAmB,EAAE,MAAM,iBAAiB,CAAC;AAE3D,OAAO,EAAE,SAAS,EAAE,MAAM,cAAc,CAAC;AACzC,YAAY,EAAE,gBAAgB,EAAE,MAAM,cAAc,CAAC;AAErD,OAAO,EAAE,eAAe,EAAE,MAAM,oBAAoB,CAAC;AACrD,YAAY,EAAE,sBAAsB,EAAE,MAAM,oBAAoB,CAAC;AAEjE,OAAO,EACL,oBAAoB,EACpB,kBAAkB,EAClB,kBAAkB,EAClB,kBAAkB,EAClB,uBAAuB,EACvB,uBAAuB,EACvB,uBAAuB,GACxB,MAAM,gBAAgB,CAAC;AAExB,OAAO,EACL,0BAA0B,EAC1B,2BAA2B,EAC3B,mBAAmB,EACnB,mBAAmB,EACnB,uBAAuB,EACvB,qBAAqB,GACtB,MAAM,WAAW,CAAC"}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
export { ConstitutionalAI } from './constitutional-ai';
|
|
2
|
+
export { InputFilter } from './input-filter';
|
|
3
|
+
export { OutputFilter } from './output-filter';
|
|
4
|
+
export { ToolGuard } from './tool-guard';
|
|
5
|
+
export { CritiqueReviser } from './critique-reviser';
|
|
6
|
+
export { DEFAULT_CONSTITUTION, DEFAULT_PRINCIPLES, createConstitution, extendConstitution, filterPrinciplesByLayer, getPrinciplesByCategory, getPrinciplesBySeverity, } from './constitution';
|
|
7
|
+
export { buildInputEvaluationPrompt, buildOutputEvaluationPrompt, buildCritiquePrompt, buildRevisionPrompt, parseEvaluationResponse, parseCritiqueResponse, } from './prompts';
|
|
8
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/constitutional/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,gBAAgB,EAAE,MAAM,qBAAqB,CAAC;AAGvD,OAAO,EAAE,WAAW,EAAE,MAAM,gBAAgB,CAAC;AAG7C,OAAO,EAAE,YAAY,EAAE,MAAM,iBAAiB,CAAC;AAG/C,OAAO,EAAE,SAAS,EAAE,MAAM,cAAc,CAAC;AAGzC,OAAO,EAAE,eAAe,EAAE,MAAM,oBAAoB,CAAC;AAGrD,OAAO,EACL,oBAAoB,EACpB,kBAAkB,EAClB,kBAAkB,EAClB,kBAAkB,EAClB,uBAAuB,EACvB,uBAAuB,EACvB,uBAAuB,GACxB,MAAM,gBAAgB,CAAC;AAExB,OAAO,EACL,0BAA0B,EAC1B,2BAA2B,EAC3B,mBAAmB,EACnB,mBAAmB,EACnB,uBAAuB,EACvB,qBAAqB,GACtB,MAAM,WAAW,CAAC"}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import type { GuardrailConfig, FilterResult, Constitution, LLMBackend } from '@cogitator-ai/types';
|
|
2
|
+
/** Constructor options for `InputFilter`. */
export interface InputFilterOptions {
|
|
3
|
+
/** Chat backend used for the LLM-based evaluation step. */
llm: LLMBackend;
|
|
4
|
+
/** Guardrail settings; the filter reads `model`, `thresholds` and `strictMode`. */
config: GuardrailConfig;
|
|
5
|
+
/** Constitution from which the input-layer principles are derived. */
constitution: Constitution;
|
|
6
|
+
}
|
|
7
|
+
/**
 * Screens user input: a regex pre-scan first, then an LLM evaluation against
 * the constitution's input-layer principles.
 */
export declare class InputFilter {
|
|
8
|
+
private llm;
|
|
9
|
+
private config;
|
|
10
|
+
private constitution;
|
|
11
|
+
/** Principles selected with `filterPrinciplesByLayer(constitution, 'input')`. */
private principles;
|
|
12
|
+
constructor(options: InputFilterOptions);
|
|
13
|
+
/**
 * Returns the pre-scan result when it blocks the input; otherwise allows the
 * input outright if there are no input principles, or applies the configured
 * thresholds to the LLM's harm scores.
 * NOTE(review): `context` is forwarded to the LLM step but is not used when
 * building the prompt in the compiled implementation.
 */
filter(input: string, context?: string): Promise<FilterResult>;
|
|
14
|
+
/** Regex pre-screen for a small fixed set of explicit harmful phrasings. */
private quickScan;
|
|
15
|
+
/** Sends the evaluation prompt to the LLM and returns the parsed harm scores. */
private evaluateWithLLM;
|
|
16
|
+
/** Blocks only when a score meets its category threshold and `strictMode` is set. */
private applyThresholds;
|
|
17
|
+
/** Replaces the constitution and recomputes the input-layer principles. */
updateConstitution(constitution: Constitution): void;
|
|
18
|
+
}
|
|
19
|
+
//# sourceMappingURL=input-filter.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"input-filter.d.ts","sourceRoot":"","sources":["../../src/constitutional/input-filter.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EACV,eAAe,EACf,YAAY,EAGZ,YAAY,EACZ,UAAU,EACX,MAAM,qBAAqB,CAAC;AAI7B,MAAM,WAAW,kBAAkB;IACjC,GAAG,EAAE,UAAU,CAAC;IAChB,MAAM,EAAE,eAAe,CAAC;IACxB,YAAY,EAAE,YAAY,CAAC;CAC5B;AAED,qBAAa,WAAW;IACtB,OAAO,CAAC,GAAG,CAAa;IACxB,OAAO,CAAC,MAAM,CAAkB;IAChC,OAAO,CAAC,YAAY,CAAe;IACnC,OAAO,CAAC,UAAU,CAA4B;gBAElC,OAAO,EAAE,kBAAkB;IAOjC,MAAM,CAAC,KAAK,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,YAAY,CAAC;IAcpE,OAAO,CAAC,SAAS;YA8BH,eAAe;IAc7B,OAAO,CAAC,eAAe;IAyBvB,kBAAkB,CAAC,YAAY,EAAE,YAAY,GAAG,IAAI;CAIrD"}
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
import { buildInputEvaluationPrompt, parseEvaluationResponse } from './prompts';
|
|
2
|
+
import { filterPrinciplesByLayer } from './constitution';
|
|
3
|
+
/**
 * Input guardrail: rejects obviously harmful prompts with a cheap regex pass
 * and otherwise defers to an LLM evaluation against the input-layer
 * principles of the constitution.
 */
export class InputFilter {
    llm;
    config;
    constitution;
    principles;

    /**
     * @param options Object carrying `llm`, `config` and `constitution`.
     */
    constructor(options) {
        const { llm, config, constitution } = options;
        this.llm = llm;
        this.config = config;
        this.constitution = constitution;
        this.principles = filterPrinciplesByLayer(this.constitution, 'input');
    }

    /**
     * Evaluates `input` and resolves to a filter result.
     *
     * Order of checks: regex pre-scan, then "no principles configured"
     * short-circuit, then LLM scoring followed by threshold application.
     */
    async filter(input, context) {
        const prescreen = this.quickScan(input);
        if (!prescreen.allowed) {
            return prescreen;
        }
        if (this.principles.length === 0) {
            return { allowed: true, harmScores: [] };
        }
        return this.applyThresholds(await this.evaluateWithLLM(input, context));
    }

    /**
     * Matches the lower-cased input against a fixed list of explicit harmful
     * phrasings; the first matching rule produces a blocking result.
     */
    quickScan(input) {
        const text = input.toLowerCase();
        const rules = [
            { pattern: /how\s+to\s+(make|build|create)\s+(a\s+)?bomb/i, category: 'violence' },
            { pattern: /how\s+to\s+kill\s+(myself|someone)/i, category: 'violence' },
            { pattern: /how\s+to\s+hack\s+(into|a)/i, category: 'illegal' },
            { pattern: /child\s+porn(ography)?/i, category: 'sexual' },
            { pattern: /ways\s+to\s+commit\s+suicide/i, category: 'self-harm' },
        ];
        const hit = rules.find((rule) => rule.pattern.test(text));
        if (hit === undefined) {
            return { allowed: true, harmScores: [] };
        }
        const { category } = hit;
        const score = {
            category,
            severity: 'high',
            confidence: 0.95,
            reasoning: 'Matched explicit harmful pattern',
        };
        return {
            allowed: false,
            harmScores: [score],
            blockedReason: `Input contains potentially harmful content (${category})`,
        };
    }

    /**
     * Builds the evaluation prompt from the input and the input principles,
     * sends it to the LLM and returns the parsed harm scores.
     * The second parameter is accepted but not used.
     */
    async evaluateWithLLM(input, _context) {
        const request = {
            model: this.config.model ?? 'gpt-4o-mini',
            messages: [{ role: 'user', content: buildInputEvaluationPrompt(input, this.principles) }],
            temperature: 0,
            maxTokens: 500,
        };
        const reply = await this.llm.chat(request);
        const { harmScores } = parseEvaluationResponse(reply.content);
        return harmScores;
    }

    /**
     * Compares each score's severity with the threshold configured for its
     * category (default 'high'). Input is blocked only when at least one
     * score reaches its threshold and `config.strictMode` is truthy.
     */
    applyThresholds(harmScores) {
        const limits = this.config.thresholds;
        const rank = { low: 1, medium: 2, high: 3 };
        const exceeding = harmScores.filter((entry) => rank[entry.severity] >= rank[limits[entry.category] ?? 'high']);
        if (exceeding.length > 0 && this.config.strictMode) {
            const names = Array.from(new Set(exceeding.map((entry) => entry.category)));
            return {
                allowed: false,
                harmScores,
                blockedReason: `Input violates safety policies: ${names.join(', ')}`,
            };
        }
        return { allowed: true, harmScores };
    }

    /**
     * Swaps in a new constitution and re-derives the input-layer principles.
     */
    updateConstitution(constitution) {
        this.constitution = constitution;
        this.principles = filterPrinciplesByLayer(constitution, 'input');
    }
}
|
|
88
|
+
//# sourceMappingURL=input-filter.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"input-filter.js","sourceRoot":"","sources":["../../src/constitutional/input-filter.ts"],"names":[],"mappings":"AAQA,OAAO,EAAE,0BAA0B,EAAE,uBAAuB,EAAE,MAAM,WAAW,CAAC;AAChF,OAAO,EAAE,uBAAuB,EAAE,MAAM,gBAAgB,CAAC;AAQzD,MAAM,OAAO,WAAW;IACd,GAAG,CAAa;IAChB,MAAM,CAAkB;IACxB,YAAY,CAAe;IAC3B,UAAU,CAA4B;IAE9C,YAAY,OAA2B;QACrC,IAAI,CAAC,GAAG,GAAG,OAAO,CAAC,GAAG,CAAC;QACvB,IAAI,CAAC,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC;QAC7B,IAAI,CAAC,YAAY,GAAG,OAAO,CAAC,YAAY,CAAC;QACzC,IAAI,CAAC,UAAU,GAAG,uBAAuB,CAAC,IAAI,CAAC,YAAY,EAAE,OAAO,CAAC,CAAC;IACxE,CAAC;IAED,KAAK,CAAC,MAAM,CAAC,KAAa,EAAE,OAAgB;QAC1C,MAAM,WAAW,GAAG,IAAI,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC;QAC1C,IAAI,CAAC,WAAW,CAAC,OAAO,EAAE,CAAC;YACzB,OAAO,WAAW,CAAC;QACrB,CAAC;QAED,IAAI,IAAI,CAAC,UAAU,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACjC,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,UAAU,EAAE,EAAE,EAAE,CAAC;QAC3C,CAAC;QAED,MAAM,UAAU,GAAG,MAAM,IAAI,CAAC,eAAe,CAAC,KAAK,EAAE,OAAO,CAAC,CAAC;QAC9D,OAAO,IAAI,CAAC,eAAe,CAAC,UAAU,CAAC,CAAC;IAC1C,CAAC;IAEO,SAAS,CAAC,KAAa;QAC7B,MAAM,OAAO,GAAG,KAAK,CAAC,WAAW,EAAE,CAAC;QACpC,MAAM,QAAQ,GAAG;YACf,EAAE,OAAO,EAAE,+CAA+C,EAAE,QAAQ,EAAE,UAAmB,EAAE;YAC3F,EAAE,OAAO,EAAE,qCAAqC,EAAE,QAAQ,EAAE,UAAmB,EAAE;YACjF,EAAE,OAAO,EAAE,6BAA6B,EAAE,QAAQ,EAAE,SAAkB,EAAE;YACxE,EAAE,OAAO,EAAE,yBAAyB,EAAE,QAAQ,EAAE,QAAiB,EAAE;YACnE,EAAE,OAAO,EAAE,+BAA+B,EAAE,QAAQ,EAAE,WAAoB,EAAE;SAC7E,CAAC;QAEF,KAAK,MAAM,EAAE,OAAO,EAAE,QAAQ,EAAE,IAAI,QAAQ,EAAE,CAAC;YAC7C,IAAI,OAAO,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC;gBAC1B,OAAO;oBACL,OAAO,EAAE,KAAK;oBACd,UAAU,EAAE;wBACV;4BACE,QAAQ;4BACR,QAAQ,EAAE,MAAM;4BAChB,UAAU,EAAE,IAAI;4BAChB,SAAS,EAAE,kCAAkC;yBAC9C;qBACF;oBACD,aAAa,EAAE,+CAA+C,QAAQ,GAAG;iBAC1E,CAAC;YACJ,CAAC;QACH,CAAC;QAED,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,UAAU,EAAE,EAAE,EAAE,CAAC;IAC3C,CAAC;IAEO,KAAK,CAAC,eAAe,CAAC,KAAa,EAAE,QAAiB;QAC5D,MAAM,MAAM,GAAG,0BAA0B,CAAC,KAAK,EAAE,IAAI,CAAC,UAAU,CAAC,CAAC;QAElE,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC;YACnC,KAAK,EAAE,IAAI,CAAC,MAAM,CAAC,KAAK,IAAI,aAAa;YACzC,QAAQ,EAAE,CAAC,EAAE,IAAI,EAAE,MAAM,EAA
E,OAAO,EAAE,MAAM,EAAE,CAAC;YAC7C,WAAW,EAAE,CAAC;YACd,SAAS,EAAE,GAAG;SACf,CAAC,CAAC;QAEH,MAAM,MAAM,GAAG,uBAAuB,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC;QACzD,OAAO,MAAM,CAAC,UAAU,CAAC;IAC3B,CAAC;IAEO,eAAe,CAAC,UAAuB;QAC7C,MAAM,UAAU,GAAG,IAAI,CAAC,MAAM,CAAC,UAAU,CAAC;QAC1C,MAAM,aAAa,GAA2B,EAAE,GAAG,EAAE,CAAC,EAAE,MAAM,EAAE,CAAC,EAAE,IAAI,EAAE,CAAC,EAAE,CAAC;QAE7E,MAAM,UAAU,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,KAAK,EAAE,EAAE;YAC7C,MAAM,SAAS,GAAG,UAAU,CAAC,KAAK,CAAC,QAAQ,CAAC,IAAI,MAAM,CAAC;YACvD,OAAO,aAAa,CAAC,KAAK,CAAC,QAAQ,CAAC,IAAI,aAAa,CAAC,SAAS,CAAC,CAAC;QACnE,CAAC,CAAC,CAAC;QAEH,IAAI,UAAU,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC5B,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,UAAU,EAAE,CAAC;QACvC,CAAC;QAED,IAAI,IAAI,CAAC,MAAM,CAAC,UAAU,EAAE,CAAC;YAC3B,MAAM,UAAU,GAAG,CAAC,GAAG,IAAI,GAAG,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC;YACnE,OAAO;gBACL,OAAO,EAAE,KAAK;gBACd,UAAU;gBACV,aAAa,EAAE,mCAAmC,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE;aAC1E,CAAC;QACJ,CAAC;QAED,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,UAAU,EAAE,CAAC;IACvC,CAAC;IAED,kBAAkB,CAAC,YAA0B;QAC3C,IAAI,CAAC,YAAY,GAAG,YAAY,CAAC;QACjC,IAAI,CAAC,UAAU,GAAG,uBAAuB,CAAC,YAAY,EAAE,OAAO,CAAC,CAAC;IACnE,CAAC;CACF"}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import type { GuardrailConfig, FilterResult, Constitution, LLMBackend, Message } from '@cogitator-ai/types';
|
|
2
|
+
/**
 * Constructor options for {@link OutputFilter}.
 */
export interface OutputFilterOptions {
    /** Chat backend that performs the LLM-based evaluation of an output. */
    llm: LLMBackend;
    /** Guardrail settings consulted while evaluating (model, thresholds, strict mode). */
    config: GuardrailConfig;
    /** Constitution from which the output-layer principles are selected. */
    constitution: Constitution;
}
|
|
7
|
+
/**
 * Screens a model response before it is released: a pattern-based quick scan
 * followed, when output-layer principles exist, by an LLM evaluation whose
 * scores are compared against the configured thresholds.
 */
export declare class OutputFilter {
    private llm;
    private config;
    private constitution;
    private principles;

    constructor(options: OutputFilterOptions);

    /**
     * Evaluates `output` in light of the preceding conversation.
     *
     * @param output - Text produced by the model.
     * @param context - Conversation messages that led to the output.
     * @returns The filter verdict together with the harm scores that were produced.
     */
    filter(
        output: string,
        context: Message[],
    ): Promise<FilterResult>;

    private quickScan;
    private evaluateWithLLM;
    private applyThresholds;

    /**
     * Replaces the active constitution and re-derives the output-layer principles.
     */
    updateConstitution(constitution: Constitution): void;
}
|
|
19
|
+
//# sourceMappingURL=output-filter.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"output-filter.d.ts","sourceRoot":"","sources":["../../src/constitutional/output-filter.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EACV,eAAe,EACf,YAAY,EAGZ,YAAY,EACZ,UAAU,EACV,OAAO,EACR,MAAM,qBAAqB,CAAC;AAI7B,MAAM,WAAW,mBAAmB;IAClC,GAAG,EAAE,UAAU,CAAC;IAChB,MAAM,EAAE,eAAe,CAAC;IACxB,YAAY,EAAE,YAAY,CAAC;CAC5B;AAED,qBAAa,YAAY;IACvB,OAAO,CAAC,GAAG,CAAa;IACxB,OAAO,CAAC,MAAM,CAAkB;IAChC,OAAO,CAAC,YAAY,CAAe;IACnC,OAAO,CAAC,UAAU,CAA4B;gBAElC,OAAO,EAAE,mBAAmB;IAOlC,MAAM,CAAC,MAAM,EAAE,MAAM,EAAE,OAAO,EAAE,OAAO,EAAE,GAAG,OAAO,CAAC,YAAY,CAAC;IAcvE,OAAO,CAAC,SAAS;YA6BH,eAAe;IAc7B,OAAO,CAAC,eAAe;IAyBvB,kBAAkB,CAAC,YAAY,EAAE,YAAY,GAAG,IAAI;CAIrD"}
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
import { buildOutputEvaluationPrompt, parseEvaluationResponse } from './prompts';
|
|
2
|
+
import { filterPrinciplesByLayer } from './constitution';
|
|
3
|
+
export class OutputFilter {
|
|
4
|
+
llm;
|
|
5
|
+
config;
|
|
6
|
+
constitution;
|
|
7
|
+
principles;
|
|
8
|
+
constructor(options) {
|
|
9
|
+
this.llm = options.llm;
|
|
10
|
+
this.config = options.config;
|
|
11
|
+
this.constitution = options.constitution;
|
|
12
|
+
this.principles = filterPrinciplesByLayer(this.constitution, 'output');
|
|
13
|
+
}
|
|
14
|
+
async filter(output, context) {
|
|
15
|
+
const quickResult = this.quickScan(output);
|
|
16
|
+
if (!quickResult.allowed) {
|
|
17
|
+
return quickResult;
|
|
18
|
+
}
|
|
19
|
+
if (this.principles.length === 0) {
|
|
20
|
+
return { allowed: true, harmScores: [] };
|
|
21
|
+
}
|
|
22
|
+
const harmScores = await this.evaluateWithLLM(output, context);
|
|
23
|
+
return this.applyThresholds(harmScores);
|
|
24
|
+
}
|
|
25
|
+
quickScan(output) {
|
|
26
|
+
const lowered = output.toLowerCase();
|
|
27
|
+
const patterns = [
|
|
28
|
+
{ pattern: /rm\s+-rf\s+\/(?!\w)/i, category: 'illegal' },
|
|
29
|
+
{ pattern: /format\s+c:/i, category: 'illegal' },
|
|
30
|
+
{ pattern: /sudo\s+rm\s+-rf/i, category: 'illegal' },
|
|
31
|
+
];
|
|
32
|
+
for (const { pattern, category } of patterns) {
|
|
33
|
+
if (pattern.test(lowered)) {
|
|
34
|
+
return {
|
|
35
|
+
allowed: false,
|
|
36
|
+
harmScores: [
|
|
37
|
+
{
|
|
38
|
+
category,
|
|
39
|
+
severity: 'high',
|
|
40
|
+
confidence: 0.95,
|
|
41
|
+
reasoning: 'Contains dangerous system command',
|
|
42
|
+
},
|
|
43
|
+
],
|
|
44
|
+
blockedReason: 'Output contains potentially dangerous commands',
|
|
45
|
+
};
|
|
46
|
+
}
|
|
47
|
+
}
|
|
48
|
+
return { allowed: true, harmScores: [] };
|
|
49
|
+
}
|
|
50
|
+
async evaluateWithLLM(output, context) {
|
|
51
|
+
const prompt = buildOutputEvaluationPrompt(output, context, this.principles);
|
|
52
|
+
const response = await this.llm.chat({
|
|
53
|
+
model: this.config.model ?? 'gpt-4o-mini',
|
|
54
|
+
messages: [{ role: 'user', content: prompt }],
|
|
55
|
+
temperature: 0,
|
|
56
|
+
maxTokens: 500,
|
|
57
|
+
});
|
|
58
|
+
const result = parseEvaluationResponse(response.content);
|
|
59
|
+
return result.harmScores;
|
|
60
|
+
}
|
|
61
|
+
applyThresholds(harmScores) {
|
|
62
|
+
const thresholds = this.config.thresholds;
|
|
63
|
+
const severityOrder = { low: 1, medium: 2, high: 3 };
|
|
64
|
+
const violations = harmScores.filter((score) => {
|
|
65
|
+
const threshold = thresholds[score.category] ?? 'high';
|
|
66
|
+
return severityOrder[score.severity] >= severityOrder[threshold];
|
|
67
|
+
});
|
|
68
|
+
if (violations.length === 0) {
|
|
69
|
+
return { allowed: true, harmScores };
|
|
70
|
+
}
|
|
71
|
+
if (this.config.strictMode) {
|
|
72
|
+
const categories = [...new Set(violations.map((v) => v.category))];
|
|
73
|
+
return {
|
|
74
|
+
allowed: false,
|
|
75
|
+
harmScores,
|
|
76
|
+
blockedReason: `Output violates safety policies: ${categories.join(', ')}`,
|
|
77
|
+
};
|
|
78
|
+
}
|
|
79
|
+
return { allowed: true, harmScores };
|
|
80
|
+
}
|
|
81
|
+
updateConstitution(constitution) {
|
|
82
|
+
this.constitution = constitution;
|
|
83
|
+
this.principles = filterPrinciplesByLayer(constitution, 'output');
|
|
84
|
+
}
|
|
85
|
+
}
|
|
86
|
+
//# sourceMappingURL=output-filter.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"output-filter.js","sourceRoot":"","sources":["../../src/constitutional/output-filter.ts"],"names":[],"mappings":"AASA,OAAO,EAAE,2BAA2B,EAAE,uBAAuB,EAAE,MAAM,WAAW,CAAC;AACjF,OAAO,EAAE,uBAAuB,EAAE,MAAM,gBAAgB,CAAC;AAQzD,MAAM,OAAO,YAAY;IACf,GAAG,CAAa;IAChB,MAAM,CAAkB;IACxB,YAAY,CAAe;IAC3B,UAAU,CAA4B;IAE9C,YAAY,OAA4B;QACtC,IAAI,CAAC,GAAG,GAAG,OAAO,CAAC,GAAG,CAAC;QACvB,IAAI,CAAC,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC;QAC7B,IAAI,CAAC,YAAY,GAAG,OAAO,CAAC,YAAY,CAAC;QACzC,IAAI,CAAC,UAAU,GAAG,uBAAuB,CAAC,IAAI,CAAC,YAAY,EAAE,QAAQ,CAAC,CAAC;IACzE,CAAC;IAED,KAAK,CAAC,MAAM,CAAC,MAAc,EAAE,OAAkB;QAC7C,MAAM,WAAW,GAAG,IAAI,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;QAC3C,IAAI,CAAC,WAAW,CAAC,OAAO,EAAE,CAAC;YACzB,OAAO,WAAW,CAAC;QACrB,CAAC;QAED,IAAI,IAAI,CAAC,UAAU,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACjC,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,UAAU,EAAE,EAAE,EAAE,CAAC;QAC3C,CAAC;QAED,MAAM,UAAU,GAAG,MAAM,IAAI,CAAC,eAAe,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;QAC/D,OAAO,IAAI,CAAC,eAAe,CAAC,UAAU,CAAC,CAAC;IAC1C,CAAC;IAEO,SAAS,CAAC,MAAc;QAC9B,MAAM,OAAO,GAAG,MAAM,CAAC,WAAW,EAAE,CAAC;QAErC,MAAM,QAAQ,GAAG;YACf,EAAE,OAAO,EAAE,sBAAsB,EAAE,QAAQ,EAAE,SAAkB,EAAE;YACjE,EAAE,OAAO,EAAE,cAAc,EAAE,QAAQ,EAAE,SAAkB,EAAE;YACzD,EAAE,OAAO,EAAE,kBAAkB,EAAE,QAAQ,EAAE,SAAkB,EAAE;SAC9D,CAAC;QAEF,KAAK,MAAM,EAAE,OAAO,EAAE,QAAQ,EAAE,IAAI,QAAQ,EAAE,CAAC;YAC7C,IAAI,OAAO,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC;gBAC1B,OAAO;oBACL,OAAO,EAAE,KAAK;oBACd,UAAU,EAAE;wBACV;4BACE,QAAQ;4BACR,QAAQ,EAAE,MAAM;4BAChB,UAAU,EAAE,IAAI;4BAChB,SAAS,EAAE,mCAAmC;yBAC/C;qBACF;oBACD,aAAa,EAAE,gDAAgD;iBAChE,CAAC;YACJ,CAAC;QACH,CAAC;QAED,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,UAAU,EAAE,EAAE,EAAE,CAAC;IAC3C,CAAC;IAEO,KAAK,CAAC,eAAe,CAAC,MAAc,EAAE,OAAkB;QAC9D,MAAM,MAAM,GAAG,2BAA2B,CAAC,MAAM,EAAE,OAAO,EAAE,IAAI,CAAC,UAAU,CAAC,CAAC;QAE7E,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC;YACnC,KAAK,EAAE,IAAI,CAAC,MAAM,CAAC,KAAK,IAAI,aAAa;YACzC,QAAQ,EAAE,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,CAAC;YAC7C,WAAW,EAAE,CAAC;YACd,SAAS,EAAE,GAAG;SACf,CAAC,CAAC;QAEH,MAAM,MAAM,GAAG,uBAAu
B,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC;QACzD,OAAO,MAAM,CAAC,UAAU,CAAC;IAC3B,CAAC;IAEO,eAAe,CAAC,UAAuB;QAC7C,MAAM,UAAU,GAAG,IAAI,CAAC,MAAM,CAAC,UAAU,CAAC;QAC1C,MAAM,aAAa,GAA2B,EAAE,GAAG,EAAE,CAAC,EAAE,MAAM,EAAE,CAAC,EAAE,IAAI,EAAE,CAAC,EAAE,CAAC;QAE7E,MAAM,UAAU,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,KAAK,EAAE,EAAE;YAC7C,MAAM,SAAS,GAAG,UAAU,CAAC,KAAK,CAAC,QAAQ,CAAC,IAAI,MAAM,CAAC;YACvD,OAAO,aAAa,CAAC,KAAK,CAAC,QAAQ,CAAC,IAAI,aAAa,CAAC,SAAS,CAAC,CAAC;QACnE,CAAC,CAAC,CAAC;QAEH,IAAI,UAAU,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC5B,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,UAAU,EAAE,CAAC;QACvC,CAAC;QAED,IAAI,IAAI,CAAC,MAAM,CAAC,UAAU,EAAE,CAAC;YAC3B,MAAM,UAAU,GAAG,CAAC,GAAG,IAAI,GAAG,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC;YACnE,OAAO;gBACL,OAAO,EAAE,KAAK;gBACd,UAAU;gBACV,aAAa,EAAE,oCAAoC,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE;aAC3E,CAAC;QACJ,CAAC;QAED,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,UAAU,EAAE,CAAC;IACvC,CAAC;IAED,kBAAkB,CAAC,YAA0B;QAC3C,IAAI,CAAC,YAAY,GAAG,YAAY,CAAC;QACjC,IAAI,CAAC,UAAU,GAAG,uBAAuB,CAAC,YAAY,EAAE,QAAQ,CAAC,CAAC;IACpE,CAAC;CACF"}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import type { ConstitutionalPrinciple, HarmScore, CritiqueResult, Message } from '@cogitator-ai/types';
|
|
2
|
+
/**
 * Builds the evaluator prompt that asks whether a user input attempts to
 * elicit content violating any of `principles`.
 */
export declare function buildInputEvaluationPrompt(
    input: string,
    principles: ConstitutionalPrinciple[],
): string;

/**
 * Builds the evaluator prompt that asks whether an AI response violates any of
 * `principles`, given the most recent conversation messages in `context`.
 */
export declare function buildOutputEvaluationPrompt(
    output: string,
    context: Message[],
    principles: ConstitutionalPrinciple[],
): string;

/**
 * Builds the prompt that requests a JSON critique of `response` against `principles`.
 */
export declare function buildCritiquePrompt(
    response: string,
    principles: ConstitutionalPrinciple[],
): string;

/**
 * Builds the prompt that requests a rewrite of `response`, guided by `critique`
 * and by the revision guidance of the principles the critique lists as violated.
 */
export declare function buildRevisionPrompt(
    response: string,
    critique: CritiqueResult,
    principles: ConstitutionalPrinciple[],
): string;

/**
 * Parses the JSON reply to an evaluation prompt. A reply that cannot be parsed
 * yields `{ isHarmful: false, harmScores: [] }`.
 */
export declare function parseEvaluationResponse(content: string): {
    isHarmful: boolean;
    harmScores: HarmScore[];
};

/**
 * Parses the JSON reply to a critique prompt. A reply that cannot be parsed
 * yields a non-harmful result whose `critique` states the parse failure.
 */
export declare function parseCritiqueResponse(content: string): CritiqueResult;
|
|
11
|
+
//# sourceMappingURL=prompts.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"prompts.d.ts","sourceRoot":"","sources":["../../src/constitutional/prompts.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EACV,uBAAuB,EACvB,SAAS,EACT,cAAc,EAGd,OAAO,EACR,MAAM,qBAAqB,CAAC;AAE7B,wBAAgB,0BAA0B,CACxC,KAAK,EAAE,MAAM,EACb,UAAU,EAAE,uBAAuB,EAAE,GACpC,MAAM,CAiCR;AAED,wBAAgB,2BAA2B,CACzC,MAAM,EAAE,MAAM,EACd,OAAO,EAAE,OAAO,EAAE,EAClB,UAAU,EAAE,uBAAuB,EAAE,GACpC,MAAM,CAyCR;AAED,wBAAgB,mBAAmB,CACjC,QAAQ,EAAE,MAAM,EAChB,UAAU,EAAE,uBAAuB,EAAE,GACpC,MAAM,CAkCR;AAED,wBAAgB,mBAAmB,CACjC,QAAQ,EAAE,MAAM,EAChB,QAAQ,EAAE,cAAc,EACxB,UAAU,EAAE,uBAAuB,EAAE,GACpC,MAAM,CA0BR;AAED,wBAAgB,uBAAuB,CAAC,OAAO,EAAE,MAAM,GAAG;IACxD,SAAS,EAAE,OAAO,CAAC;IACnB,UAAU,EAAE,SAAS,EAAE,CAAC;CACzB,CAeA;AAED,wBAAgB,qBAAqB,CAAC,OAAO,EAAE,MAAM,GAAG,cAAc,CAwBrE"}
|
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
export function buildInputEvaluationPrompt(input, principles) {
|
|
2
|
+
const principlesList = principles.map((p) => `- ${p.name}: ${p.description}`).join('\n');
|
|
3
|
+
return `You are a content safety evaluator. Analyze the following user input for potential policy violations.
|
|
4
|
+
|
|
5
|
+
PRINCIPLES TO CHECK:
|
|
6
|
+
${principlesList}
|
|
7
|
+
|
|
8
|
+
USER INPUT:
|
|
9
|
+
"""
|
|
10
|
+
${input}
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
Evaluate whether this input attempts to elicit harmful content. Consider:
|
|
14
|
+
1. Is this a legitimate question that happens to touch on sensitive topics?
|
|
15
|
+
2. Is there clear malicious intent to generate harmful content?
|
|
16
|
+
3. Would answering this request violate any of the listed principles?
|
|
17
|
+
|
|
18
|
+
Respond in JSON format:
|
|
19
|
+
{
|
|
20
|
+
"isHarmful": boolean,
|
|
21
|
+
"harmScores": [
|
|
22
|
+
{
|
|
23
|
+
"category": "violence" | "hate" | "sexual" | "self-harm" | "illegal" | "privacy" | "misinformation" | "manipulation",
|
|
24
|
+
"severity": "low" | "medium" | "high",
|
|
25
|
+
"confidence": number (0-1),
|
|
26
|
+
"principleViolated": "principle-id" | null,
|
|
27
|
+
"reasoning": "brief explanation"
|
|
28
|
+
}
|
|
29
|
+
]
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
If the input is benign, return: {"isHarmful": false, "harmScores": []}`;
|
|
33
|
+
}
|
|
34
|
+
/**
 * Builds the prompt sent to the evaluator model for an AI response.
 *
 * @param output - The AI response; embedded between triple-quote delimiters.
 * @param context - Conversation messages; only the last four are included, each
 *   cut to its first 200 characters with `...` appended when longer.
 * @param principles - Principles to check; each is rendered as
 *   `- name (id): critiquePrompt`.
 * @returns The complete prompt, ending with the JSON reply format to use.
 */
export function buildOutputEvaluationPrompt(output, context, principles) {
    const renderPrinciple = (principle) =>
        `- ${principle.name} (${principle.id}): ${principle.critiquePrompt}`;
    const renderMessage = (message) => {
        const label = `[${message.role}]: `;
        const excerpt = message.content.slice(0, 200);
        const ellipsis = message.content.length > 200 ? '...' : '';
        return label + excerpt + ellipsis;
    };

    const bulletedPrinciples = principles.map(renderPrinciple).join('\n');
    const recentMessages = context.slice(-4).map(renderMessage).join('\n');

    const promptLines = [
        'You are a content safety evaluator. Analyze the following AI response for policy violations.',
        '',
        'PRINCIPLES TO CHECK:',
        bulletedPrinciples,
        '',
        'CONVERSATION CONTEXT:',
        recentMessages,
        '',
        'AI RESPONSE TO EVALUATE:',
        '"""',
        // Interpolated rather than pushed raw so stringification matches a template literal.
        `${output}`,
        '"""',
        '',
        'For each principle, determine if the response violates it. Consider the context - some content that seems problematic in isolation may be appropriate given the conversation.',
        '',
        'Respond in JSON format:',
        '{',
        '"isHarmful": boolean,',
        '"harmScores": [',
        '{',
        '"category": "violence" | "hate" | "sexual" | "self-harm" | "illegal" | "privacy" | "misinformation" | "manipulation",',
        '"severity": "low" | "medium" | "high",',
        '"confidence": number (0-1),',
        '"principleViolated": "principle-id",',
        '"reasoning": "brief explanation"',
        '}',
        '],',
        '"principlesViolated": ["principle-id", ...]',
        '}',
        '',
        'If the response is safe, return: {"isHarmful": false, "harmScores": [], "principlesViolated": []}',
    ];
    return promptLines.join('\n');
}
|
|
74
|
+
/**
 * Builds the prompt that asks a reviewer model to critique a response.
 *
 * @param response - The response under review; embedded between triple-quote delimiters.
 * @param principles - Principles to critique against; each is rendered as
 *   `- name: critiquePrompt`.
 * @returns The complete prompt, ending with the JSON reply format to use.
 */
export function buildCritiquePrompt(response, principles) {
    const critiqueBullets = [];
    for (const principle of principles) {
        critiqueBullets.push(`- ${principle.name}: ${principle.critiquePrompt}`);
    }
    const promptLines = [
        'You are a thoughtful AI safety reviewer. Critique the following response against these principles:',
        '',
        critiqueBullets.join('\n'),
        '',
        'RESPONSE TO CRITIQUE:',
        '"""',
        // Interpolated rather than pushed raw so stringification matches a template literal.
        `${response}`,
        '"""',
        '',
        'Provide a thoughtful analysis:',
        '1. Does this response violate any of the listed principles?',
        '2. What specific parts are problematic?',
        '3. How severe are the issues?',
        '',
        'Respond in JSON format:',
        '{',
        '"isHarmful": boolean,',
        '"critique": "detailed analysis of issues found",',
        '"harmScores": [',
        '{',
        '"category": "violence" | "hate" | "sexual" | "self-harm" | "illegal" | "privacy" | "misinformation" | "manipulation",',
        '"severity": "low" | "medium" | "high",',
        '"confidence": number (0-1),',
        '"principleViolated": "principle-id",',
        '"reasoning": "why this principle was violated"',
        '}',
        '],',
        '"principlesViolated": ["principle-id", ...]',
        '}',
        '',
        'If no violations found: {"isHarmful": false, "critique": "No issues found", "harmScores": [], "principlesViolated": []}',
    ];
    return promptLines.join('\n');
}
|
|
108
|
+
/**
 * Builds the prompt that asks the model to rewrite a response after a critique.
 *
 * @param response - The original response; embedded between triple-quote delimiters.
 * @param critique - Critique result; its `critique` text is quoted and its
 *   `principlesViolated` ids select which principles contribute guidelines.
 * @param principles - Candidate principles; each selected one is rendered as
 *   `- name: revisionPrompt`.
 * @returns The complete prompt, ending with an instruction to return only the revision.
 */
export function buildRevisionPrompt(response, critique, principles) {
    const guidelines = [];
    for (const principle of principles) {
        if (critique.principlesViolated.includes(principle.id)) {
            guidelines.push(`- ${principle.name}: ${principle.revisionPrompt}`);
        }
    }
    const promptLines = [
        'You are a helpful AI assistant. Your previous response had some issues that need to be addressed.',
        '',
        'ORIGINAL RESPONSE:',
        '"""',
        // Interpolated rather than pushed raw so stringification matches a template literal.
        `${response}`,
        '"""',
        '',
        'CRITIQUE:',
        `${critique.critique}`,
        '',
        'REVISION GUIDELINES:',
        guidelines.join('\n'),
        '',
        'Please rewrite the response to:',
        "1. Address the legitimate parts of the user's request",
        '2. Avoid the issues identified in the critique',
        '3. Follow the revision guidelines above',
        '4. Maintain a helpful and respectful tone',
        '',
        'Provide ONLY the revised response, no explanations or meta-commentary:',
    ];
    return promptLines.join('\n');
}
|
|
134
|
+
/** Harm categories accepted from a model reply. */
const HARM_CATEGORIES = new Set([
    'violence',
    'hate',
    'sexual',
    'self-harm',
    'illegal',
    'privacy',
    'misinformation',
    'manipulation',
]);

/** Severity labels accepted from a model reply. */
const SEVERITY_LEVELS = new Set(['low', 'medium', 'high']);

/**
 * Parses the JSON reply to an evaluation prompt.
 *
 * Markdown code fences are stripped, and when the reply wraps the JSON object
 * in extra prose the outermost `{ ... }` span is parsed instead. Malformed
 * `harmScores` (not an array, or containing non-object entries) no longer
 * discards the rest of the verdict: unusable entries are skipped.
 *
 * @param content - Raw text returned by the evaluator model.
 * @returns `{ isHarmful, harmScores }`. When no JSON object can be recovered
 *   the result is `{ isHarmful: false, harmScores: [] }`, i.e. an unparseable
 *   reply is treated as benign.
 */
export function parseEvaluationResponse(content) {
    const parsed = parseJsonObject(content);
    if (parsed === null) {
        return { isHarmful: false, harmScores: [] };
    }
    return {
        isHarmful: Boolean(parsed.isHarmful),
        harmScores: collectHarmScores(parsed.harmScores),
    };
}

/**
 * Parses the JSON reply to a critique prompt.
 *
 * Applies the same fence stripping, prose tolerance and `harmScores` handling
 * as {@link parseEvaluationResponse}.
 *
 * @param content - Raw text returned by the reviewer model.
 * @returns `{ isHarmful, critique, harmScores, principlesViolated }`. Only
 *   string entries of `principlesViolated` are kept. When no JSON object can
 *   be recovered the result is non-harmful with the critique text
 *   `'Failed to parse critique response'`.
 */
export function parseCritiqueResponse(content) {
    const parsed = parseJsonObject(content);
    if (parsed === null) {
        return {
            isHarmful: false,
            critique: 'Failed to parse critique response',
            harmScores: [],
            principlesViolated: [],
        };
    }
    return {
        isHarmful: Boolean(parsed.isHarmful),
        critique: String(parsed.critique ?? ''),
        harmScores: collectHarmScores(parsed.harmScores),
        principlesViolated: Array.isArray(parsed.principlesViolated)
            ? parsed.principlesViolated.filter((p) => typeof p === 'string')
            : [],
    };
}

/**
 * Recovers a JSON object (or array) from a model reply, or `null` when none can be parsed.
 */
function parseJsonObject(content) {
    const cleaned = content
        .replace(/```json\n?/g, '')
        .replace(/```\n?/g, '')
        .trim();
    let parsed;
    try {
        parsed = JSON.parse(cleaned);
    }
    catch {
        // Fall back to the outermost brace pair so surrounding prose does not
        // turn a valid verdict into a silent parse failure.
        const start = cleaned.indexOf('{');
        const end = cleaned.lastIndexOf('}');
        if (start === -1 || end <= start) {
            return null;
        }
        try {
            parsed = JSON.parse(cleaned.slice(start, end + 1));
        }
        catch {
            return null;
        }
    }
    return parsed !== null && typeof parsed === 'object' ? parsed : null;
}

/**
 * Normalizes every object entry of `value`; anything else yields no score.
 */
function collectHarmScores(value) {
    if (!Array.isArray(value)) {
        return [];
    }
    return value
        .filter((entry) => entry !== null && typeof entry === 'object')
        .map((entry) => normalizeHarmScore(entry));
}

/**
 * Coerces one raw score into a well-formed harm score: category and severity
 * are validated, confidence is clamped to [0, 1] (non-numeric becomes 0), and
 * the optional text fields are kept only when they are strings.
 */
function normalizeHarmScore(raw) {
    // Tolerate null / primitive input instead of throwing on property access.
    const obj = raw !== null && typeof raw === 'object' ? raw : {};
    return {
        category: normalizeCategory(obj.category),
        severity: normalizeSeverity(obj.severity),
        confidence: Math.max(0, Math.min(1, Number(obj.confidence) || 0)),
        principleViolated: typeof obj.principleViolated === 'string' ? obj.principleViolated : undefined,
        reasoning: typeof obj.reasoning === 'string' ? obj.reasoning : undefined,
    };
}

/** Returns `value` when it is a known harm category, otherwise `'manipulation'`. */
function normalizeCategory(value) {
    return HARM_CATEGORIES.has(value) ? value : 'manipulation';
}

/** Returns `value` when it is a known severity label, otherwise `'low'`. */
function normalizeSeverity(value) {
    return SEVERITY_LEVELS.has(value) ? value : 'low';
}
|
|
202
|
+
//# sourceMappingURL=prompts.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"prompts.js","sourceRoot":"","sources":["../../src/constitutional/prompts.ts"],"names":[],"mappings":"AASA,MAAM,UAAU,0BAA0B,CACxC,KAAa,EACb,UAAqC;IAErC,MAAM,cAAc,GAAG,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,KAAK,CAAC,CAAC,IAAI,KAAK,CAAC,CAAC,WAAW,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAEzF,OAAO;;;EAGP,cAAc;;;;EAId,KAAK;;;;;;;;;;;;;;;;;;;;;;uEAsBgE,CAAC;AACxE,CAAC;AAED,MAAM,UAAU,2BAA2B,CACzC,MAAc,EACd,OAAkB,EAClB,UAAqC;IAErC,MAAM,cAAc,GAAG,UAAU;SAC9B,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,KAAK,CAAC,CAAC,IAAI,KAAK,CAAC,CAAC,EAAE,MAAM,CAAC,CAAC,cAAc,EAAE,CAAC;SACxD,IAAI,CAAC,IAAI,CAAC,CAAC;IAEd,MAAM,UAAU,GAAG,OAAO;SACvB,KAAK,CAAC,CAAC,CAAC,CAAC;SACT,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,IAAI,CAAC,CAAC,IAAI,MAAM,CAAC,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,GAAG,CAAC,CAAC,OAAO,CAAC,MAAM,GAAG,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC;SAC3F,IAAI,CAAC,IAAI,CAAC,CAAC;IAEd,OAAO;;;EAGP,cAAc;;;EAGd,UAAU;;;;EAIV,MAAM;;;;;;;;;;;;;;;;;;;;kGAoB0F,CAAC;AACnG,CAAC;AAED,MAAM,UAAU,mBAAmB,CACjC,QAAgB,EAChB,UAAqC;IAErC,MAAM,SAAS,GAAG,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,KAAK,CAAC,CAAC,IAAI,KAAK,CAAC,CAAC,cAAc,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAEvF,OAAO;;EAEP,SAAS;;;;EAIT,QAAQ;;;;;;;;;;;;;;;;;;;;;;;;wHAwB8G,CAAC;AACzH,CAAC;AAED,MAAM,UAAU,mBAAmB,CACjC,QAAgB,EAChB,QAAwB,EACxB,UAAqC;IAErC,MAAM,kBAAkB,GAAG,UAAU;SAClC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,QAAQ,CAAC,kBAAkB,CAAC,QAAQ,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;SACzD,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,KAAK,CAAC,CAAC,IAAI,KAAK,CAAC,CAAC,cAAc,EAAE,CAAC;SAC9C,IAAI,CAAC,IAAI,CAAC,CAAC;IAEd,OAAO;;;;EAIP,QAAQ;;;;EAIR,QAAQ,CAAC,QAAQ;;;EAGjB,kBAAkB;;;;;;;;uEAQmD,CAAC;AACxE,CAAC;AAED,MAAM,UAAU,uBAAuB,CAAC,OAAe;IAIrD,MAAM,OAAO,GAAG,OAAO;SACpB,OAAO,CAAC,aAAa,EAAE,EAAE,CAAC;SAC1B,OAAO,CAAC,SAAS,EAAE,EAAE,CAAC;SACtB,IAAI,EAAE,CAAC;IAEV,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;QACnC,OAAO;YACL,SAAS,EAAE,OAAO,CAAC,MAAM,CAAC,SAAS,CAAC;YACpC,UAAU,EAAE,CAAC,MAAM,CAAC,UAAU,IAAI,EAAE,CAAC,CA
AC,GAAG,CAAC,kBAAkB,CAAC;SAC9D,CAAC;IACJ,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,EAAE,SAAS,EAAE,KAAK,EAAE,UAAU,EAAE,EAAE,EAAE,CAAC;IAC9C,CAAC;AACH,CAAC;AAED,MAAM,UAAU,qBAAqB,CAAC,OAAe;IACnD,MAAM,OAAO,GAAG,OAAO;SACpB,OAAO,CAAC,aAAa,EAAE,EAAE,CAAC;SAC1B,OAAO,CAAC,SAAS,EAAE,EAAE,CAAC;SACtB,IAAI,EAAE,CAAC;IAEV,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;QACnC,OAAO;YACL,SAAS,EAAE,OAAO,CAAC,MAAM,CAAC,SAAS,CAAC;YACpC,QAAQ,EAAE,MAAM,CAAC,MAAM,CAAC,QAAQ,IAAI,EAAE,CAAC;YACvC,UAAU,EAAE,CAAC,MAAM,CAAC,UAAU,IAAI,EAAE,CAAC,CAAC,GAAG,CAAC,kBAAkB,CAAC;YAC7D,kBAAkB,EAAE,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,kBAAkB,CAAC;gBAC1D,CAAC,CAAC,MAAM,CAAC,kBAAkB,CAAC,MAAM,CAAC,CAAC,CAAU,EAAE,EAAE,CAAC,OAAO,CAAC,KAAK,QAAQ,CAAC;gBACzE,CAAC,CAAC,EAAE;SACP,CAAC;IACJ,CAAC;IAAC,MAAM,CAAC;QACP,OAAO;YACL,SAAS,EAAE,KAAK;YAChB,QAAQ,EAAE,mCAAmC;YAC7C,UAAU,EAAE,EAAE;YACd,kBAAkB,EAAE,EAAE;SACvB,CAAC;IACJ,CAAC;AACH,CAAC;AAED,SAAS,kBAAkB,CAAC,GAAY;IACtC,MAAM,GAAG,GAAG,GAA8B,CAAC;IAC3C,OAAO;QACL,QAAQ,EAAE,iBAAiB,CAAC,GAAG,CAAC,QAAQ,CAAC;QACzC,QAAQ,EAAE,iBAAiB,CAAC,GAAG,CAAC,QAAQ,CAAC;QACzC,UAAU,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,MAAM,CAAC,GAAG,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC,CAAC;QACjE,iBAAiB,EACf,OAAO,GAAG,CAAC,iBAAiB,KAAK,QAAQ,CAAC,CAAC,CAAC,GAAG,CAAC,iBAAiB,CAAC,CAAC,CAAC,SAAS;QAC/E,SAAS,EAAE,OAAO,GAAG,CAAC,SAAS,KAAK,QAAQ,CAAC,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC,CAAC,SAAS;KACzE,CAAC;AACJ,CAAC;AAED,SAAS,iBAAiB,CAAC,KAAc;IACvC,MAAM,KAAK,GAAmB;QAC5B,UAAU;QACV,MAAM;QACN,QAAQ;QACR,WAAW;QACX,SAAS;QACT,SAAS;QACT,gBAAgB;QAChB,cAAc;KACf,CAAC;IACF,OAAO,KAAK,CAAC,QAAQ,CAAC,KAAqB,CAAC,CAAC,CAAC,CAAE,KAAsB,CAAC,CAAC,CAAC,cAAc,CAAC;AAC1F,CAAC;AAED,SAAS,iBAAiB,CAAC,KAAc;IACvC,MAAM,KAAK,GAAe,CAAC,KAAK,EAAE,QAAQ,EAAE,MAAM,CAAC,CAAC;IACpD,OAAO,KAAK,CAAC,QAAQ,CAAC,KAAiB,CAAC,CAAC,CAAC,CAAE,KAAkB,CAAC,CAAC,CAAC,KAAK,CAAC;AACzE,CAAC"}
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
import type { GuardrailConfig, ToolGuardResult, Tool, ToolContext, Constitution } from '@cogitator-ai/types';
|
|
2
|
+
/**
 * Constructor options for {@link ToolGuard}.
 */
export interface ToolGuardOptions {
    /** Guardrail settings the guard is configured with. */
    config: GuardrailConfig;
    /**
     * Constitution supplied at construction.
     * NOTE(review): the class declares no private field for it — confirm whether it is used.
     */
    constitution: Constitution;
}
|
|
6
|
+
/**
 * Evaluates a proposed tool call before it is executed.
 *
 * The private member names indicate approval, risk, dangerous-command and
 * dangerous-path checks; their exact rules live in the implementation, which
 * is not part of this declaration.
 */
export declare class ToolGuard {
    private config;

    constructor(options: ToolGuardOptions);

    /**
     * Evaluates one tool invocation.
     *
     * @param tool - The tool about to be called.
     * @param args - Arguments the tool would be called with.
     * @param _context - Tool execution context (underscore-prefixed; presumably
     *   unused by the implementation — confirm).
     * @returns The guard's verdict for this call.
     */
    evaluate(
        tool: Tool,
        args: Record<string, unknown>,
        _context: ToolContext,
    ): Promise<ToolGuardResult>;

    private checkApproval;
    private assessRisk;
    private checkDangerousOperation;
    private isDangerousCommand;
    private isDangerousPath;
    private requestApproval;

    /**
     * Accepts a replacement constitution (parameter is underscore-prefixed;
     * presumably ignored by the implementation — confirm).
     */
    updateConstitution(_constitution: Constitution): void;
}
|
|
18
|
+
//# sourceMappingURL=tool-guard.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"tool-guard.d.ts","sourceRoot":"","sources":["../../src/constitutional/tool-guard.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EACV,eAAe,EACf,eAAe,EAEf,IAAI,EACJ,WAAW,EACX,YAAY,EACb,MAAM,qBAAqB,CAAC;AAE7B,MAAM,WAAW,gBAAgB;IAC/B,MAAM,EAAE,eAAe,CAAC;IACxB,YAAY,EAAE,YAAY,CAAC;CAC5B;AAED,qBAAa,SAAS;IACpB,OAAO,CAAC,MAAM,CAAkB;gBAEpB,OAAO,EAAE,gBAAgB;IAI/B,QAAQ,CACZ,IAAI,EAAE,IAAI,EACV,IAAI,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,EAC7B,QAAQ,EAAE,WAAW,GACpB,OAAO,CAAC,eAAe,CAAC;IAuD3B,OAAO,CAAC,aAAa;IAOrB,OAAO,CAAC,UAAU;IAoBlB,OAAO,CAAC,uBAAuB;IAkB/B,OAAO,CAAC,kBAAkB;IAe1B,OAAO,CAAC,eAAe;YAcT,eAAe;IAW7B,kBAAkB,CAAC,aAAa,EAAE,YAAY,GAAG,IAAI;CACtD"}
|