@gaia-minds/assistant-cli 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CONSTITUTION.md +208 -0
- package/README.md +85 -158
- package/assistant/README.md +126 -12
- package/package.json +6 -1
- package/tools/agent-actions.py +1213 -0
- package/tools/agent-alignment.py +888 -0
- package/tools/agent-config.yml +20 -5
- package/tools/agent-loop.py +502 -62
- package/tools/agent_actions.py +42 -0
- package/tools/agent_alignment.py +41 -0
- package/tools/gaia-assistant.py +2375 -34
|
@@ -0,0 +1,888 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Alignment checker for the Gaia Minds self-evolving agent.
|
|
3
|
+
|
|
4
|
+
Evaluates proposed actions against Constitutional values using a two-tier
|
|
5
|
+
approach:
|
|
6
|
+
|
|
7
|
+
Tier 1 (rule-based) -- always runs, fast, deterministic.
|
|
8
|
+
Tier 2 (LLM-based) -- runs when an Anthropic client is provided, uses
|
|
9
|
+
Claude to evaluate nuanced alignment questions.
|
|
10
|
+
|
|
11
|
+
The alignment checker NEVER executes actions. It only evaluates and returns
|
|
12
|
+
an AlignmentResult indicating whether the action should proceed.
|
|
13
|
+
|
|
14
|
+
Usage as a library:
|
|
15
|
+
from tools import agent_alignment # or however the agent-loop imports it
|
|
16
|
+
result = agent_alignment.check_alignment(action, constitution, memory)
|
|
17
|
+
|
|
18
|
+
Usage standalone (self-test):
|
|
19
|
+
python3 tools/agent-alignment.py
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
import json
|
|
23
|
+
import re
|
|
24
|
+
import sys
|
|
25
|
+
from dataclasses import dataclass, asdict
|
|
26
|
+
from pathlib import Path
|
|
27
|
+
from typing import Any, Dict, List, Optional
|
|
28
|
+
|
|
29
|
+
# Try to import the Anthropic SDK. If unavailable, Tier 2 checks are
|
|
30
|
+
# silently skipped and the module falls back to rule-based checks only.
|
|
31
|
+
try:
|
|
32
|
+
import anthropic # noqa: F401
|
|
33
|
+
|
|
34
|
+
_HAS_ANTHROPIC = True
|
|
35
|
+
except ImportError:
|
|
36
|
+
_HAS_ANTHROPIC = False
|
|
37
|
+
|
|
38
|
+
# Try to import PyYAML for config parsing. Fall back to a minimal inline
|
|
39
|
+
# parser if unavailable.
|
|
40
|
+
try:
|
|
41
|
+
import yaml # noqa: F401
|
|
42
|
+
|
|
43
|
+
_HAS_YAML = True
|
|
44
|
+
except ImportError:
|
|
45
|
+
_HAS_YAML = False
|
|
46
|
+
|
|
47
|
+
# ---------------------------------------------------------------------------
|
|
48
|
+
# Constants
|
|
49
|
+
# ---------------------------------------------------------------------------
|
|
50
|
+
|
|
51
|
+
# Filesystem anchors, all derived from this script's own location.
# REPO_ROOT is simply the parent of the directory holding this script.
SCRIPT_DIR = Path(__file__).resolve().parent
REPO_ROOT = SCRIPT_DIR.parent
CONFIG_PATH = SCRIPT_DIR.joinpath("agent-config.yml")
CONSTITUTION_PATH = REPO_ROOT.joinpath("CONSTITUTION.md")

# Constitution text cache; stays None until the first successful load.
_constitution_cache: Optional[str] = None

# Parsed-config cache; stays None until the first load.
_config_cache: Optional[Dict[str, Any]] = None

# Risk levels from least to most severe: a level's index is its severity rank.
RISK_LEVELS = ("low", "medium", "high", "forbidden")
|
|
64
|
+
|
|
65
|
+
# Safety-critical file patterns that must never be deleted or tampered with
|
|
66
|
+
SAFETY_FILE_PATTERNS: List[re.Pattern] = [
    re.compile(r"CONSTITUTION\.md", re.IGNORECASE),
    re.compile(r"security\.md", re.IGNORECASE),
    # Match both spellings of the checker module: the hyphenated script and
    # the underscore-named importable twin (agent_alignment.py).
    re.compile(r"agent[-_]alignment\.py", re.IGNORECASE),
    re.compile(r"pre-commit", re.IGNORECASE),
]
|
|
72
|
+
|
|
73
|
+
# Red-flag patterns in action parameters (compiled once at import time)
|
|
74
|
+
# Each entry pairs a compiled regex with the human-readable reason reported
# when it matches.  Verbs and short tokens are anchored with word boundaries
# so that they do not fire inside unrelated words ("rm" in "confirm",
# "ci" in "citation", "pr" in "project").
RED_FLAG_PATTERNS: List[Dict[str, Any]] = [
    {
        "pattern": re.compile(
            r"\b(delete|remove|rm)\b.*\b(constitution|security\.md|alignment)",
            re.IGNORECASE,
        ),
        "reason": "Action references deleting safety-critical files.",
    },
    {
        "pattern": re.compile(
            r"(git\s+(rebase|reset|push\s+--force|filter-branch|reflog\s+expire))",
            re.IGNORECASE,
        ),
        "reason": "Action attempts to modify or rewrite git history.",
    },
    {
        "pattern": re.compile(
            # "ci" must be a whole word; the other targets intentionally stay
            # prefix matches so "hooks", "checks", "reviewers" still trigger.
            r"\b(disable|bypass|skip|remove)\b.*\b(hook|pre-commit|ci\b|check|review)",
            re.IGNORECASE,
        ),
        "reason": "Action attempts to disable hooks, CI, or review processes.",
    },
    {
        "pattern": re.compile(
            r"merge\b.*\b(own|self|my)\b.*\b(prs?|pull\s+requests?)\b",
            re.IGNORECASE,
        ),
        "reason": "Action attempts to merge the agent's own PR without review.",
    },
    {
        "pattern": re.compile(
            r"(backdoor|exploit|exfiltrat|obfuscat|hidden\s+functionalit)",
            re.IGNORECASE,
        ),
        "reason": "Action contains references to deceptive or harmful techniques.",
    },
    {
        "pattern": re.compile(
            r"(weapon|surveillance\s+target|manipulat(e|ion)\s+(user|human|people))",
            re.IGNORECASE,
        ),
        "reason": (
            "Action references capability restrictions from Article III "
            "(weapons, surveillance, manipulation)."
        ),
    },
]
|
|
120
|
+
|
|
121
|
+
# Terminal colours (only when stdout is a TTY)
|
|
122
|
+
# Decided once at import: colour output only when stdout reports it is a TTY.
_IS_TTY = hasattr(sys.stdout, "isatty") and sys.stdout.isatty()


def _colour(code: str, text: str) -> str:
    """Wrap *text* in the ANSI SGR sequence *code*; pass it through unchanged
    when stdout is not a TTY."""
    return f"\033[{code}m{text}\033[0m" if _IS_TTY else text


def green(text: str) -> str:
    """Render *text* with SGR code 32."""
    return _colour("32", text)


def yellow(text: str) -> str:
    """Render *text* with SGR code 33."""
    return _colour("33", text)


def red(text: str) -> str:
    """Render *text* with SGR code 31."""
    return _colour("31", text)


def bold(text: str) -> str:
    """Render *text* with SGR code 1."""
    return _colour("1", text)


def dim(text: str) -> str:
    """Render *text* with SGR code 2."""
    return _colour("2", text)
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
# ---------------------------------------------------------------------------
|
|
152
|
+
# Data classes
|
|
153
|
+
# ---------------------------------------------------------------------------
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
@dataclass
class AlignmentResult:
    """Outcome of evaluating one proposed action.

    Only carries the verdict; nothing in this class executes the action.
    """

    # True when the action may proceed.
    allowed: bool
    # One of "low", "medium", "high", "forbidden".
    risk_level: str
    # Human-readable explanation of the verdict.
    reasoning: str
    # Optional advice on reshaping the action so it becomes aligned.
    suggestions: Optional[str] = None
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
# ---------------------------------------------------------------------------
|
|
167
|
+
# Config & Constitution loading
|
|
168
|
+
# ---------------------------------------------------------------------------
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
# Constitution text keyed by the path it was read from, so that different
# repo roots never share a cached document.
_constitution_cache_by_path: Dict[str, str] = {}


def load_constitution(repo_root: str) -> str:
    """Load CONSTITUTION.md from the given repo root path.

    The text is cached per constitution path, so repeated calls for the same
    root do not re-read the file, while a call for a different root reads
    that root's own file instead of returning a stale document.

    Parameters
    ----------
    repo_root : str
        Directory expected to contain ``CONSTITUTION.md``.

    Returns
    -------
    str
        The UTF-8 decoded contents of the constitution file.

    Raises
    ------
    FileNotFoundError
        If no cached text exists for this root and the file is missing.
    """
    global _constitution_cache

    constitution_path = Path(repo_root) / "CONSTITUTION.md"
    cache_key = str(constitution_path)

    text = _constitution_cache_by_path.get(cache_key)
    if text is None:
        if not constitution_path.is_file():
            raise FileNotFoundError(
                f"Constitution not found at {constitution_path}. "
                "The agent cannot operate without its Constitution."
            )
        text = constitution_path.read_text(encoding="utf-8")
        _constitution_cache_by_path[cache_key] = text

    # Keep the legacy module-level cache pointing at the most recent text.
    _constitution_cache = text
    return text
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def _load_config() -> Dict[str, Any]:
    """Return the parsed agent-config.yml, reading it at most once.

    A missing config file yields ``{"risk": {}}`` so the module still works.
    With PyYAML available the whole file is parsed via ``yaml.safe_load``;
    otherwise only the risk section is extracted by the inline parser.
    """
    global _config_cache

    if _config_cache is None:
        if CONFIG_PATH.is_file():
            text = CONFIG_PATH.read_text(encoding="utf-8")
            if _HAS_YAML:
                _config_cache = yaml.safe_load(text) or {}
            else:
                # No PyYAML: fall back to the minimal risk-section parser.
                _config_cache = _parse_risk_section(text)
        else:
            # Sensible default so callers can still classify actions.
            _config_cache = {"risk": {}}

    return _config_cache
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def _parse_risk_section(raw: str) -> Dict[str, Any]:
    """Minimal YAML-subset parser for the risk section of agent-config.yml.

    This is intentionally simple. It only extracts block-style lists found
    under the sub-keys of the top-level ``risk:`` mapping (for example
    risk.auto_execute, risk.auto_pr, risk.require_review, risk.forbidden).

    Sub-key lines may carry a trailing inline comment
    (``forbidden:  # never do these``); the comment is removed before the
    line is recognised as a key, so the items that follow are filed under
    the right category rather than leaking into the previous one.

    Parameters
    ----------
    raw : str
        Full text of the config file.

    Returns
    -------
    dict
        ``{"risk": {<sub-key>: [<item>, ...], ...}}``; the inner mapping is
        empty when no ``risk:`` section is present.
    """
    config: Dict[str, Any] = {"risk": {}}
    current_section: Optional[str] = None
    in_risk = False

    for line in raw.splitlines():
        stripped = line.strip()

        # Detect top-level 'risk:' section
        if line.startswith("risk:"):
            in_risk = True
            continue

        # Once we leave the risk section (another top-level key), stop
        if in_risk and line and not line[0].isspace() and not line.startswith("#"):
            break

        # Nothing to do outside the section, on blank lines, or on comments.
        if not in_risk or not stripped or stripped.startswith("#"):
            continue

        # List items like '- "verify_resources"  # note'
        if stripped.startswith("- "):
            if current_section:
                value = stripped[2:].strip().strip('"').strip("'")
                # Strip inline comments
                if " #" in value:
                    value = value[: value.index(" #")].strip().strip('"').strip("'")
                config["risk"].setdefault(current_section, []).append(value)
            continue

        if stripped.startswith("-"):
            continue

        # Sub-keys like 'auto_execute:' -- drop any inline comment first so
        # 'auto_execute:  # comment' is still recognised as a key.
        key_part = re.split(r"\s+#", stripped, maxsplit=1)[0].strip()
        if key_part.endswith(":"):
            current_section = key_part.rstrip(":").strip()
            config["risk"][current_section] = []

    return config
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
# ---------------------------------------------------------------------------
|
|
263
|
+
# Risk classification (pure rule-based)
|
|
264
|
+
# ---------------------------------------------------------------------------
|
|
265
|
+
|
|
266
|
+
# Mapping from config risk categories to risk level strings
|
|
267
|
+
# Ordered from least to most strict; classify_risk relies on this order.
_RISK_CATEGORY_TO_LEVEL = {
    "auto_execute": "low",
    "auto_pr": "medium",
    "require_review": "high",
    "forbidden": "forbidden",
}


def classify_risk(action_type: str, config: Optional[Dict[str, Any]] = None) -> str:
    """Classify an action type's risk level based on the agent config.

    Reads the ``risk`` section of agent-config.yml (or the provided config
    dict) and returns one of: "low", "medium", "high", "forbidden".

    If the action type is listed under several categories, the strictest
    one wins, so a permissive entry can never mask a forbidden one.  Empty
    categories (``None``) are treated as empty lists, and a category given
    as a single string is treated as a one-item list rather than being
    substring-matched.

    If the action type is not found in any category, defaults to "high"
    (precautionary principle -- unknown actions require review).

    Parameters
    ----------
    action_type : str
        The action's ``type`` value.
    config : dict, optional
        Parsed config; loaded via ``_load_config()`` when omitted.

    Returns
    -------
    str
        The risk level for the action type.
    """
    if config is None:
        config = _load_config()

    # An empty 'risk:' key parses to None; treat that as "nothing listed".
    risk_config = config.get("risk") or {}

    # Walk categories from strictest to most permissive; first hit wins.
    for category, level in reversed(list(_RISK_CATEGORY_TO_LEVEL.items())):
        listed = risk_config.get(category) or []
        if isinstance(listed, str):
            listed = [listed]
        if action_type in listed:
            return level

    # Unknown action type -- default to high risk (precautionary principle)
    return "high"
|
|
296
|
+
|
|
297
|
+
|
|
298
|
+
# ---------------------------------------------------------------------------
|
|
299
|
+
# Tier 1: Rule-based alignment check
|
|
300
|
+
# ---------------------------------------------------------------------------
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
def _flatten_action_text(action: Dict[str, Any]) -> str:
    """Join every string value found in *action* into one space-separated blob.

    Dict values, lists and tuples are descended into depth-first in their
    natural order; dict keys and non-string scalars are not included.
    """

    def _strings(node: Any):
        # Yield string leaves in traversal order.
        if isinstance(node, str):
            yield node
        elif isinstance(node, dict):
            for child in node.values():
                yield from _strings(child)
        elif isinstance(node, (list, tuple)):
            for child in node:
                yield from _strings(child)

    return " ".join(_strings(action))
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
def _tier1_check(
    action: Dict[str, Any],
    config: Dict[str, Any],
) -> AlignmentResult:
    """Tier 1: deterministic, rule-based alignment check.

    Steps, in order:
      1. Deny outright if the action type is classified "forbidden".
      2. Deny if any red-flag pattern matches the flattened action text.
      3. Escalate to "high" if ``params`` names a safety-critical file under
         the keys ``file``, ``path``, ``target`` or ``files``.
      4. Otherwise allow, reporting the resulting risk level.

    Parameters
    ----------
    action : dict
        The proposed action (``type``, optional ``params``, ...).
    config : dict
        Parsed agent config passed through to ``classify_risk``.

    Returns
    -------
    AlignmentResult
        Always returned; this tier runs even when Tier 2 (LLM) is available.
    """
    action_type = action.get("type", "unknown")

    # ----- Step 1: Check forbidden list -----
    configured_level = classify_risk(action_type, config)

    if configured_level == "forbidden":
        return AlignmentResult(
            allowed=False,
            risk_level="forbidden",
            reasoning=(
                f"Action type '{action_type}' is on the forbidden list in "
                "agent-config.yml. The agent must never perform this action. "
                "This aligns with the Constitution's Safety Protocols "
                "(Article III) and the precautionary principle."
            ),
            suggestions=None,
        )

    # ----- Step 2: Scan for red flags in action params -----
    action_text = _flatten_action_text(action)

    for red_flag in RED_FLAG_PATTERNS:
        if red_flag["pattern"].search(action_text):
            return AlignmentResult(
                allowed=False,
                risk_level="forbidden",
                reasoning=(
                    f"Red flag detected: {red_flag['reason']} "
                    "This violates the Constitution's safety protocols "
                    "(Article III). The precautionary principle requires "
                    "denial when safety-critical patterns are detected."
                ),
                suggestions=(
                    "Reformulate the action to avoid modifying safety-critical "
                    "files, circumventing review processes, or introducing "
                    "deceptive/harmful content."
                ),
            )

    # ----- Step 3: Additional semantic checks -----
    # Check for attempts to modify safety files even if not "delete"
    params = action.get("params", {})
    target_files: List[str] = []
    if isinstance(params, dict):
        for key in ("file", "path", "target", "files"):
            val = params.get(key)
            if isinstance(val, str):
                target_files.append(val)
            elif isinstance(val, (list, tuple)):
                target_files.extend(str(v) for v in val)

    # First target that names a safety-critical file, if any.
    safety_target = next(
        (
            target
            for target in target_files
            if any(pattern.search(target) for pattern in SAFETY_FILE_PATTERNS)
        ),
        None,
    )

    risk_level = configured_level
    escalated = safety_target is not None and configured_level != "high"
    if escalated:
        # Escalate: modifying safety files always requires review
        risk_level = "high"

    # ----- Step 4: Return result based on risk classification -----
    reasoning_parts = [
        f"Action type '{action_type}' classified as '{configured_level}' risk "
        f"by agent-config.yml."
    ]

    if escalated:
        # Make the audit trail say why the level differs from the config.
        reasoning_parts.append(
            f"Escalated to 'high' risk because the action targets "
            f"safety-critical file '{safety_target}'."
        )

    if risk_level == "low":
        reasoning_parts.append(
            "This is an auto-executable action that poses minimal risk."
        )
    elif risk_level == "medium":
        reasoning_parts.append(
            "This action will be submitted as a PR for human review before "
            "merging, in accordance with the Constitution's transparency and "
            "human oversight principles."
        )
    elif risk_level == "high":
        reasoning_parts.append(
            "This action requires explicit human approval before proceeding. "
            "The Constitution's precautionary principle and human override "
            "provisions (Article III, 3.1-3.2) apply."
        )

    reasoning_parts.append("No red flags detected in action parameters.")

    suggestions = None
    if risk_level == "high":
        suggestions = (
            "Create a detailed issue explaining the proposed change and wait "
            "for human approval before proceeding."
        )

    return AlignmentResult(
        allowed=True,
        risk_level=risk_level,
        reasoning=" ".join(reasoning_parts),
        suggestions=suggestions,
    )
|
|
427
|
+
|
|
428
|
+
|
|
429
|
+
# ---------------------------------------------------------------------------
|
|
430
|
+
# Tier 2: LLM-based alignment check
|
|
431
|
+
# ---------------------------------------------------------------------------
|
|
432
|
+
|
|
433
|
+
# System prompt template for the Tier 2 reviewer.  The only placeholder is
# {constitution}, filled via str.format().
_TIER2_SYSTEM_PROMPT = """\
You are an alignment evaluator for the Gaia Minds autonomous agent. Your role
is to determine whether a proposed action aligns with the Gaia Minds
Constitution.

You must evaluate the action against ALL Constitutional articles:
- Article I: Core Values (life protection, benevolence, transparency, honesty, humility)
- Article II: Governance (contributor eligibility, decision making, conflict resolution)
- Article III: Safety Protocols (precautionary principle, human override, security, capability restrictions)
- Article IV: Resource Ethics (transparent acquisition, sustainable operations, no exploitation)
- Article V: Evolution (living document, interpretation, self-improvement)

Apply the precautionary principle: when uncertain, rule against allowing the action.

CONSTITUTION:
{constitution}
"""

# User prompt template.  Placeholders: {action_json} and {memory_context}.
# The doubled braces are str.format() escapes that render as literal { and }.
_TIER2_USER_PROMPT = """\
Evaluate whether this proposed action aligns with the Gaia Minds Constitution.

ACTION:
{action_json}

RECENT MEMORY CONTEXT (recent decisions and lessons):
{memory_context}

Respond with ONLY valid JSON (no markdown fences, no extra text) in this exact format:
{{
"allowed": true/false,
"risk_level": "low" | "medium" | "high" | "forbidden",
"reasoning": "detailed explanation referencing specific Constitutional articles",
"suggestions": "how to modify the action to be aligned, or null if already aligned"
}}
"""
|
|
468
|
+
|
|
469
|
+
|
|
470
|
+
def _tier2_check(
|
|
471
|
+
action: Dict[str, Any],
|
|
472
|
+
constitution: str,
|
|
473
|
+
memory_context: str,
|
|
474
|
+
client: Any,
|
|
475
|
+
model: str,
|
|
476
|
+
) -> Optional[AlignmentResult]:
|
|
477
|
+
"""Tier 2: LLM-based alignment check using Claude.
|
|
478
|
+
|
|
479
|
+
Returns an AlignmentResult on success, or None if the API call fails
|
|
480
|
+
(so the caller can fall back to Tier 1 only).
|
|
481
|
+
"""
|
|
482
|
+
system_prompt = _TIER2_SYSTEM_PROMPT.format(constitution=constitution)
|
|
483
|
+
user_prompt = _TIER2_USER_PROMPT.format(
|
|
484
|
+
action_json=json.dumps(action, indent=2),
|
|
485
|
+
memory_context=memory_context if memory_context else "(no memory context available)",
|
|
486
|
+
)
|
|
487
|
+
|
|
488
|
+
try:
|
|
489
|
+
response = client.messages.create(
|
|
490
|
+
model=model,
|
|
491
|
+
max_tokens=1024,
|
|
492
|
+
temperature=0.2,
|
|
493
|
+
system=system_prompt,
|
|
494
|
+
messages=[
|
|
495
|
+
{"role": "user", "content": user_prompt},
|
|
496
|
+
],
|
|
497
|
+
)
|
|
498
|
+
|
|
499
|
+
# Extract text from the response
|
|
500
|
+
response_text = ""
|
|
501
|
+
for block in response.content:
|
|
502
|
+
if hasattr(block, "text"):
|
|
503
|
+
response_text += block.text
|
|
504
|
+
|
|
505
|
+
# Strip any markdown code fences if present
|
|
506
|
+
response_text = response_text.strip()
|
|
507
|
+
if response_text.startswith("```"):
|
|
508
|
+
# Remove opening fence (with optional language tag)
|
|
509
|
+
response_text = re.sub(r"^```[a-zA-Z]*\n?", "", response_text)
|
|
510
|
+
# Remove closing fence
|
|
511
|
+
response_text = re.sub(r"\n?```$", "", response_text)
|
|
512
|
+
response_text = response_text.strip()
|
|
513
|
+
|
|
514
|
+
# Parse the JSON response
|
|
515
|
+
parsed = json.loads(response_text)
|
|
516
|
+
|
|
517
|
+
# Validate the response has required fields
|
|
518
|
+
if not all(k in parsed for k in ("allowed", "risk_level", "reasoning")):
|
|
519
|
+
return None
|
|
520
|
+
|
|
521
|
+
# Validate risk_level is a known value
|
|
522
|
+
risk_level = parsed["risk_level"]
|
|
523
|
+
if risk_level not in RISK_LEVELS:
|
|
524
|
+
risk_level = "high" # default to high if unknown
|
|
525
|
+
|
|
526
|
+
return AlignmentResult(
|
|
527
|
+
allowed=bool(parsed["allowed"]),
|
|
528
|
+
risk_level=risk_level,
|
|
529
|
+
reasoning=str(parsed["reasoning"]),
|
|
530
|
+
suggestions=parsed.get("suggestions"),
|
|
531
|
+
)
|
|
532
|
+
|
|
533
|
+
except json.JSONDecodeError:
|
|
534
|
+
# Could not parse the LLM response as JSON
|
|
535
|
+
return None
|
|
536
|
+
except Exception:
|
|
537
|
+
# API call failed -- caller will fall back to Tier 1
|
|
538
|
+
return None
|
|
539
|
+
|
|
540
|
+
|
|
541
|
+
# ---------------------------------------------------------------------------
|
|
542
|
+
# Main entry point: check_alignment()
|
|
543
|
+
# ---------------------------------------------------------------------------
|
|
544
|
+
|
|
545
|
+
|
|
546
|
+
def check_alignment(
    action: Dict[str, Any],
    constitution: str,
    memory_context: str,
    client: Any = None,
    model: str = "claude-sonnet-4-5-20250929",
) -> AlignmentResult:
    """Check if a proposed action aligns with Constitutional values.

    Two-tier evaluation:
      1. Rule-based checks always run first.
      2. If an Anthropic client is provided (and the SDK is importable),
         LLM-based evaluation runs second.

    The stricter of the two results wins: a Tier 2 denial overrides a
    Tier 1 approval, the reported risk level is never lower than either
    tier's level, and a combined level of "forbidden" is never returned
    as allowed.  If the LLM call fails, the fallback policy is:
      - fail-open for low risk (allow, since rules already passed)
      - fail-closed for medium/high risk (deny, precautionary principle)

    Parameters
    ----------
    action : dict
        The proposed action, e.g.
        {"type": "add_research", "params": {...}, "reasoning": "..."}
    constitution : str
        Full text of CONSTITUTION.md.
    memory_context : str
        Recent decisions and lessons as text.
    client : optional
        An ``anthropic.Anthropic`` client instance.  If None, only Tier 1
        (rule-based) checks are performed.
    model : str
        The Claude model to use for Tier 2 checks.

    Returns
    -------
    AlignmentResult
        The alignment evaluation result.
    """
    config = _load_config()

    # ----- Tier 1: Rule-based check (always runs) -----
    tier1_result = _tier1_check(action, config)

    # If Tier 1 says forbidden, return immediately -- no override possible
    if tier1_result.risk_level == "forbidden":
        return tier1_result

    # ----- No Tier 2 available: return Tier 1 result -----
    if client is None or not _HAS_ANTHROPIC:
        return tier1_result

    # ----- Tier 2: LLM-based check -----
    tier2_result = _tier2_check(
        action=action,
        constitution=constitution,
        memory_context=memory_context,
        client=client,
        model=model,
    )

    if tier2_result is None:
        # Tier 2 failed (API error, parse error, etc.)
        # Fail-open for low risk, fail-closed for medium/high
        if tier1_result.risk_level == "low":
            tier1_result.reasoning += (
                " [Note] LLM-based alignment check was unavailable; "
                "proceeding with rule-based result only (fail-open for "
                "low-risk actions)."
            )
            return tier1_result

        return AlignmentResult(
            allowed=False,
            risk_level=tier1_result.risk_level,
            reasoning=(
                f"LLM-based alignment check failed and the action is "
                f"'{tier1_result.risk_level}' risk. Per the "
                f"precautionary principle (Constitution Article III, "
                f"3.1), the agent fails closed for non-low-risk "
                f"actions when full alignment evaluation is unavailable."
            ),
            suggestions=(
                "Retry when the Anthropic API is available, or request "
                "explicit human approval for this action."
            ),
        )

    # Tier 1 denial (not expected here, since forbidden returned early)
    # stands on its own.
    if not tier1_result.allowed:
        return tier1_result

    t1_idx = RISK_LEVELS.index(tier1_result.risk_level)
    t2_idx = RISK_LEVELS.index(tier2_result.risk_level)

    if not tier2_result.allowed:
        # Tier 2 denies what Tier 1 allowed: the denial wins.
        tier2_result.reasoning = (
            "[Tier 2 override] " + tier2_result.reasoning
            + " (Tier 1 rule-based check would have allowed this action, "
            "but LLM-based Constitutional review denied it.)"
        )
        if t1_idx > t2_idx:
            # Never report a lower risk than the rule-based tier assigned.
            tier2_result.risk_level = tier1_result.risk_level
        return tier2_result

    # Both tiers allow: keep the Tier 1 result but adopt the higher risk.
    if t2_idx > t1_idx:
        tier1_result.risk_level = tier2_result.risk_level
        tier1_result.reasoning = (
            tier1_result.reasoning
            + f" [Tier 2 escalation] LLM review escalated risk "
            f"to '{tier2_result.risk_level}': {tier2_result.reasoning}"
        )
        if tier2_result.suggestions:
            tier1_result.suggestions = tier2_result.suggestions

    if tier1_result.risk_level == "forbidden":
        # A forbidden action must never be reported as allowed, even if the
        # LLM verdict said "allowed" alongside a forbidden risk level.
        tier1_result.allowed = False

    return tier1_result
|
|
665
|
+
|
|
666
|
+
|
|
667
|
+
# ---------------------------------------------------------------------------
|
|
668
|
+
# Self-test (when run as __main__)
|
|
669
|
+
# ---------------------------------------------------------------------------
|
|
670
|
+
|
|
671
|
+
|
|
672
|
+
def _risk_colour(level: str) -> str:
    """Colour a risk-level label: green/yellow/red for low/medium/high,
    bold red for forbidden, and unchanged for anything else."""
    if level == "forbidden":
        return red(bold(level))

    painter = {"low": green, "medium": yellow, "high": red}.get(level)
    return level if painter is None else painter(level)
|
|
683
|
+
|
|
684
|
+
|
|
685
|
+
def _self_test() -> int:
    """Exercise the rule-based tier against built-in examples and print a report.

    Runs three groups of checks (full alignment cases, red-flag detection,
    and ``classify_risk`` on its own), always with ``client=None``.

    Returns
    -------
    int
        0 when every check passes, 1 when any check fails or the
        Constitution file cannot be loaded.
    """
    tally = {"passed": 0, "failed": 0}

    def _score(ok: bool) -> str:
        # Record the outcome and hand back the matching status label.
        tally["passed" if ok else "failed"] += 1
        return green("[PASS]") if ok else red("[FAIL]")

    def _flag(ok: bool) -> str:
        # Suffix appended to a report line whose expectation was not met.
        return "" if ok else f" {red('<-- MISMATCH')}"

    print()
    print(bold(" Gaia Minds Alignment Checker -- Self-Test"))
    print(bold(" " + "=" * 50))
    print()

    # Load constitution for the test
    try:
        constitution = load_constitution(str(REPO_ROOT))
    except FileNotFoundError as exc:
        print(red(f" {exc}"))
        return 1
    print(f" Constitution loaded: {len(constitution)} characters")

    # Load config
    config = _load_config()
    risk_config = config.get("risk", {})
    print(f" Config loaded: {sum(len(v) for v in risk_config.values())} action types classified")
    print()

    # (action, expected_allowed, expected_risk, description)
    test_cases = [
        (
            {"type": "verify_resources"},
            True,
            "low",
            "Auto-executable low-risk action",
        ),
        (
            {"type": "add_research", "params": {"topic": "transformer efficiency"}},
            True,
            "medium",
            "Standard research addition (auto-PR)",
        ),
        (
            {"type": "modify_agent_loop", "params": {"change": "add new feature"}},
            True,
            "high",
            "Self-modification requires human review",
        ),
        (
            {"type": "delete_constitution"},
            False,
            "forbidden",
            "Forbidden: deleting the Constitution",
        ),
        (
            {"type": "merge_own_pr"},
            False,
            "forbidden",
            "Forbidden: merging own PR",
        ),
        (
            {"type": "bypass_review"},
            False,
            "forbidden",
            "Forbidden: bypassing review",
        ),
    ]

    for action, expected_allowed, expected_risk, description in test_cases:
        result = check_alignment(
            action=action,
            constitution=constitution,
            memory_context="(self-test -- no memory context)",
            client=None,  # rule-based only for self-test
        )

        allowed_ok = result.allowed == expected_allowed
        risk_ok = result.risk_level == expected_risk
        status = _score(allowed_ok and risk_ok)

        print(f" {status} {description}")
        print(f" Action: {json.dumps(action)}")
        print(
            f" Allowed: {result.allowed} (expected {expected_allowed})"
            + _flag(allowed_ok)
        )
        print(
            f" Risk: {_risk_colour(result.risk_level)} (expected {expected_risk})"
            + _flag(risk_ok)
        )
        print(f" Reason: {dim(result.reasoning[:120])}...")
        if result.suggestions:
            print(f" Suggest: {dim(result.suggestions[:100])}")
        print()

    # Additional red-flag tests
    print(bold(" Red-flag detection tests"))
    print(bold(" " + "-" * 50))
    print()

    # (action, expected_allowed, description)
    red_flag_cases = [
        (
            {
                "type": "update_skill",
                "params": {"change": "delete the constitution file"},
            },
            False,
            "Red flag: delete safety file reference",
        ),
        (
            {
                "type": "create_tool",
                "params": {"command": "git push --force"},
            },
            False,
            "Red flag: force push (git history rewrite)",
        ),
        (
            {
                "type": "create_tool",
                "params": {"desc": "disable pre-commit hooks"},
            },
            False,
            "Red flag: disabling hooks",
        ),
        (
            {
                "type": "add_research",
                "params": {"content": "backdoor access method"},
            },
            False,
            "Red flag: backdoor reference",
        ),
    ]

    for action, expected_allowed, description in red_flag_cases:
        result = check_alignment(
            action=action,
            constitution=constitution,
            memory_context="(self-test -- no memory context)",
            client=None,
        )

        matches = result.allowed == expected_allowed
        status = _score(matches)

        print(f" {status} {description}")
        print(f" Action: {json.dumps(action)}")
        print(
            f" Allowed: {result.allowed} (expected {expected_allowed})"
            + _flag(matches)
        )
        print(f" Risk: {_risk_colour(result.risk_level)}")
        print(f" Reason: {dim(result.reasoning[:120])}...")
        print()

    # Classify risk standalone tests
    print(bold(" classify_risk() tests"))
    print(bold(" " + "-" * 50))
    print()

    risk_cases = [
        ("verify_resources", "low"),
        ("add_research", "medium"),
        ("modify_agent_loop", "high"),
        ("delete_constitution", "forbidden"),
        ("totally_unknown_action", "high"),  # unknown defaults to high
    ]

    for action_type, expected_level in risk_cases:
        actual_level = classify_risk(action_type, config)
        matches = actual_level == expected_level
        status = _score(matches)
        print(
            f" {status} classify_risk('{action_type}') = {_risk_colour(actual_level)} (expected {expected_level})"
            + _flag(matches)
        )

    # Summary
    print()
    print(bold(" " + "=" * 50))
    failed = tally["failed"]
    total = tally["passed"] + failed
    if failed == 0:
        print(green(f" All {total} tests passed."))
    else:
        print(red(f" {failed}/{total} tests FAILED."))
    print()

    return 1 if failed > 0 else 0
|
|
885
|
+
|
|
886
|
+
|
|
887
|
+
# Script entry point: run the self-test and use its result as the exit status.
if __name__ == "__main__":
    sys.exit(_self_test())
|