@gaia-minds/assistant-cli 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,888 @@
1
+ #!/usr/bin/env python3
2
+ """Alignment checker for the Gaia Minds self-evolving agent.
3
+
4
+ Evaluates proposed actions against Constitutional values using a two-tier
5
+ approach:
6
+
7
+ Tier 1 (rule-based) -- always runs, fast, deterministic.
8
+ Tier 2 (LLM-based) -- runs when an Anthropic client is provided, uses
9
+ Claude to evaluate nuanced alignment questions.
10
+
11
+ The alignment checker NEVER executes actions. It only evaluates and returns
12
+ an AlignmentResult indicating whether the action should proceed.
13
+
14
+ Usage as a library:
15
+ from tools import agent_alignment # or however the agent-loop imports it
16
+ result = agent_alignment.check_alignment(action, constitution, memory)
17
+
18
+ Usage standalone (self-test):
19
+ python3 tools/agent-alignment.py
20
+ """
21
+
22
+ import json
23
+ import re
24
+ import sys
25
+ from dataclasses import dataclass, asdict
26
+ from pathlib import Path
27
+ from typing import Any, Dict, List, Optional
28
+
29
+ # Try to import the Anthropic SDK. If unavailable, Tier 2 checks are
30
+ # silently skipped and the module falls back to rule-based checks only.
31
+ try:
32
+ import anthropic # noqa: F401
33
+
34
+ _HAS_ANTHROPIC = True
35
+ except ImportError:
36
+ _HAS_ANTHROPIC = False
37
+
38
+ # Try to import PyYAML for config parsing. Fall back to a minimal inline
39
+ # parser if unavailable.
40
+ try:
41
+ import yaml # noqa: F401
42
+
43
+ _HAS_YAML = True
44
+ except ImportError:
45
+ _HAS_YAML = False
46
+
47
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------

# Paths derived from this script's own location: the script lives one level
# below the repository root (e.g. tools/agent-alignment.py).
SCRIPT_DIR = Path(__file__).resolve().parent
REPO_ROOT = SCRIPT_DIR.parent
CONFIG_PATH = SCRIPT_DIR / "agent-config.yml"
CONSTITUTION_PATH = REPO_ROOT / "CONSTITUTION.md"

# Module-level cache for the constitution text (populated by load_constitution)
_constitution_cache: Optional[str] = None

# Module-level cache for the parsed config (populated by _load_config)
_config_cache: Optional[Dict[str, Any]] = None

# Risk level ordering (lower index = lower risk). check_alignment() uses the
# tuple index to pick the stricter of the Tier 1 and Tier 2 risk assessments.
RISK_LEVELS = ("low", "medium", "high", "forbidden")
64
+
65
# Safety-critical file patterns that must never be deleted or tampered with.
# These are applied with .search() against target file paths found in action
# params (see _tier1_check, Step 3), so each pattern matches as a substring.
SAFETY_FILE_PATTERNS: List[re.Pattern] = [
    re.compile(r"CONSTITUTION\.md", re.IGNORECASE),
    re.compile(r"security\.md", re.IGNORECASE),
    re.compile(r"agent-alignment\.py", re.IGNORECASE),
    re.compile(r"pre-commit", re.IGNORECASE),
]
72
+
73
# Red-flag patterns in action parameters (compiled once at import time).
# Each entry pairs a regex with a human-readable reason. The regexes are
# applied with .search() against the flattened text of the whole action dict
# (see _flatten_action_text / _tier1_check, Step 2); any hit denies the action
# outright with risk level "forbidden".
RED_FLAG_PATTERNS: List[Dict[str, Any]] = [
    {
        # Deleting/removing safety-critical files.
        "pattern": re.compile(
            r"(delete|remove|rm)\b.*\b(constitution|security\.md|alignment)",
            re.IGNORECASE,
        ),
        "reason": "Action references deleting safety-critical files.",
    },
    {
        # Rewriting git history (rebase/reset/force push/etc.).
        "pattern": re.compile(
            r"(git\s+(rebase|reset|push\s+--force|filter-branch|reflog\s+expire))",
            re.IGNORECASE,
        ),
        "reason": "Action attempts to modify or rewrite git history.",
    },
    {
        # Circumventing hooks, CI, or review processes.
        "pattern": re.compile(
            r"(disable|bypass|skip|remove)\b.*\b(hook|pre-commit|ci|check|review)",
            re.IGNORECASE,
        ),
        "reason": "Action attempts to disable hooks, CI, or review processes.",
    },
    {
        # Self-merging without human review.
        "pattern": re.compile(
            r"merge\b.*\b(own|self|my)\b.*\bpr", re.IGNORECASE
        ),
        "reason": "Action attempts to merge the agent's own PR without review.",
    },
    {
        # Deceptive or harmful techniques (stems match variants, e.g.
        # "exfiltrat" covers exfiltrate/exfiltration).
        "pattern": re.compile(
            r"(backdoor|exploit|exfiltrat|obfuscat|hidden\s+functionalit)",
            re.IGNORECASE,
        ),
        "reason": "Action contains references to deceptive or harmful techniques.",
    },
    {
        # Article III capability restrictions.
        "pattern": re.compile(
            r"(weapon|surveillance\s+target|manipulat(e|ion)\s+(user|human|people))",
            re.IGNORECASE,
        ),
        "reason": (
            "Action references capability restrictions from Article III "
            "(weapons, surveillance, manipulation)."
        ),
    },
]
120
+
121
# Terminal colours (only when stdout is a TTY). The hasattr() guard covers
# replaced stdout objects (e.g. capture wrappers) that lack an isatty() method.
_IS_TTY = hasattr(sys.stdout, "isatty") and sys.stdout.isatty()
123
+
124
+
125
def _colour(code: str, text: str) -> str:
    """Wrap *text* in the ANSI SGR escape *code*.

    Returns *text* unchanged when stdout is not an interactive terminal
    (module-level ``_IS_TTY``), so logs and pipes stay colour-free.
    """
    if not _IS_TTY:
        return text
    return f"\033[{code}m{text}\033[0m"
129
+
130
+
131
def green(text: str) -> str:
    """Colour *text* green (ANSI code 32) when stdout is a TTY."""
    return _colour("32", text)
133
+
134
+
135
def yellow(text: str) -> str:
    """Colour *text* yellow (ANSI code 33) when stdout is a TTY."""
    return _colour("33", text)
137
+
138
+
139
def red(text: str) -> str:
    """Colour *text* red (ANSI code 31) when stdout is a TTY."""
    return _colour("31", text)
141
+
142
+
143
def bold(text: str) -> str:
    """Render *text* bold (ANSI code 1) when stdout is a TTY."""
    return _colour("1", text)
145
+
146
+
147
def dim(text: str) -> str:
    """Render *text* dimmed (ANSI code 2) when stdout is a TTY."""
    return _colour("2", text)
149
+
150
+
151
+ # ---------------------------------------------------------------------------
152
+ # Data classes
153
+ # ---------------------------------------------------------------------------
154
+
155
+
156
@dataclass
class AlignmentResult:
    """Result of an alignment check on a proposed action.

    Produced by both the rule-based (Tier 1) and the LLM-based (Tier 2)
    evaluators. It is a pure verdict: the checker never executes actions.
    """

    # Whether the action may proceed at all.
    allowed: bool
    # One of "low", "medium", "high", "forbidden" (see RISK_LEVELS).
    risk_level: str
    # Human-readable explanation of why the action was allowed/denied.
    reasoning: str
    # Optional guidance on how to modify the action to be aligned.
    suggestions: Optional[str] = None
164
+
165
+
166
+ # ---------------------------------------------------------------------------
167
+ # Config & Constitution loading
168
+ # ---------------------------------------------------------------------------
169
+
170
+
171
def load_constitution(repo_root: str) -> str:
    """Load CONSTITUTION.md from the given repo root path.

    Results are cached per resolved constitution path, so repeated calls in
    the same process do not re-read the file. (The previous implementation
    cached only the first text ever loaded, so a later call with a
    *different* repo root silently returned the wrong constitution — and a
    missing file under a new root would not even raise.)

    Parameters
    ----------
    repo_root : str
        Directory expected to contain CONSTITUTION.md.

    Returns
    -------
    str
        The full text of the constitution file.

    Raises
    ------
    FileNotFoundError
        If CONSTITUTION.md does not exist under ``repo_root``.
    """
    global _constitution_cache

    constitution_path = Path(repo_root) / "CONSTITUTION.md"
    # Per-path cache stored on the function itself; keyed by resolved path
    # so distinct repo roots never collide.
    cache: Dict[str, str] = getattr(load_constitution, "_cache", {})
    key = str(constitution_path.resolve())

    if key not in cache:
        if not constitution_path.is_file():
            raise FileNotFoundError(
                f"Constitution not found at {constitution_path}. "
                "The agent cannot operate without its Constitution."
            )
        cache[key] = constitution_path.read_text(encoding="utf-8")
        load_constitution._cache = cache  # type: ignore[attr-defined]

    # Keep the legacy module-level cache pointing at the most recently
    # loaded text for any external code that still inspects it.
    _constitution_cache = cache[key]
    return cache[key]
191
+
192
+
193
def _load_config() -> Dict[str, Any]:
    """Return the parsed agent-config.yml, reading it at most once.

    A missing config file yields the minimal default ``{"risk": {}}`` so the
    module keeps functioning. PyYAML is used when available; otherwise the
    minimal fallback parser extracts just the risk section.
    """
    global _config_cache

    if _config_cache is None:
        if CONFIG_PATH.is_file():
            raw = CONFIG_PATH.read_text(encoding="utf-8")
            if _HAS_YAML:
                parsed = yaml.safe_load(raw) or {}
            else:
                parsed = _parse_risk_section(raw)
            _config_cache = parsed
        else:
            # No config on disk -- fall back to an empty risk table.
            _config_cache = {"risk": {}}

    return _config_cache
214
+
215
+
216
+ def _parse_risk_section(raw: str) -> Dict[str, Any]:
217
+ """Minimal YAML-subset parser for the risk section of agent-config.yml.
218
+
219
+ This is intentionally simple. It only extracts the lists under
220
+ risk.auto_execute, risk.auto_pr, risk.require_review, and risk.forbidden.
221
+ """
222
+ config: Dict[str, Any] = {"risk": {}}
223
+ current_section: Optional[str] = None
224
+ in_risk = False
225
+
226
+ for line in raw.splitlines():
227
+ stripped = line.strip()
228
+
229
+ # Detect top-level 'risk:' section
230
+ if line.startswith("risk:"):
231
+ in_risk = True
232
+ continue
233
+
234
+ # Once we leave the risk section (another top-level key), stop
235
+ if in_risk and line and not line[0].isspace() and not line.startswith("#"):
236
+ break
237
+
238
+ if not in_risk:
239
+ continue
240
+
241
+ # Detect sub-keys like " auto_execute:"
242
+ if stripped.endswith(":") and not stripped.startswith("-") and not stripped.startswith("#"):
243
+ key = stripped.rstrip(":").strip()
244
+ # Strip inline comments after the key
245
+ if "#" in key:
246
+ key = key[: key.index("#")].strip()
247
+ current_section = key
248
+ config["risk"][current_section] = []
249
+ continue
250
+
251
+ # Detect list items like ' - "verify_resources"'
252
+ if current_section and stripped.startswith("- "):
253
+ value = stripped[2:].strip().strip('"').strip("'")
254
+ # Strip inline comments
255
+ if " #" in value:
256
+ value = value[: value.index(" #")].strip().strip('"').strip("'")
257
+ config["risk"].setdefault(current_section, []).append(value)
258
+
259
+ return config
260
+
261
+
262
+ # ---------------------------------------------------------------------------
263
+ # Risk classification (pure rule-based)
264
+ # ---------------------------------------------------------------------------
265
+
266
# Mapping from config risk categories to risk level strings.
# classify_risk() scans categories in this insertion order and returns the
# level of the first category whose list contains the action type.
_RISK_CATEGORY_TO_LEVEL = {
    "auto_execute": "low",
    "auto_pr": "medium",
    "require_review": "high",
    "forbidden": "forbidden",
}
273
+
274
+
275
def classify_risk(action_type: str, config: Optional[Dict[str, Any]] = None) -> str:
    """Classify an action type's risk level based on the agent config.

    Looks up ``action_type`` in the lists under the ``risk`` section of
    agent-config.yml (or the supplied *config* dict) and returns one of
    "low", "medium", "high", or "forbidden".

    Action types absent from every category default to "high": under the
    precautionary principle, unknown actions require human review.
    """
    cfg = _load_config() if config is None else config
    categories = cfg.get("risk", {})

    for category, level in _RISK_CATEGORY_TO_LEVEL.items():
        if action_type in categories.get(category, []):
            return level

    # Unknown action type -- precautionary default.
    return "high"
296
+
297
+
298
+ # ---------------------------------------------------------------------------
299
+ # Tier 1: Rule-based alignment check
300
+ # ---------------------------------------------------------------------------
301
+
302
+
303
+ def _flatten_action_text(action: Dict[str, Any]) -> str:
304
+ """Flatten all string values in an action dict into a single text blob
305
+ for pattern matching."""
306
+ parts: List[str] = []
307
+
308
+ def _extract(obj: Any) -> None:
309
+ if isinstance(obj, str):
310
+ parts.append(obj)
311
+ elif isinstance(obj, dict):
312
+ for v in obj.values():
313
+ _extract(v)
314
+ elif isinstance(obj, (list, tuple)):
315
+ for item in obj:
316
+ _extract(item)
317
+
318
+ _extract(action)
319
+ return " ".join(parts)
320
+
321
+
322
def _tier1_check(
    action: Dict[str, Any],
    config: Dict[str, Any],
) -> AlignmentResult:
    """Tier 1: deterministic, rule-based alignment check.

    Returns an AlignmentResult. This always runs, even when Tier 2 (LLM)
    is available.

    Evaluation order:
      1. Deny outright if the action type is on the config's forbidden list.
      2. Deny outright if any RED_FLAG_PATTERNS regex matches the flattened
         action text.
      3. Escalate risk to "high" when the action targets a safety-critical
         file (escalation only -- never de-escalation).
      4. Otherwise allow, with risk-level-specific reasoning.
    """
    action_type = action.get("type", "unknown")

    # ----- Step 1: Check forbidden list -----
    risk_level = classify_risk(action_type, config)

    if risk_level == "forbidden":
        return AlignmentResult(
            allowed=False,
            risk_level="forbidden",
            reasoning=(
                f"Action type '{action_type}' is on the forbidden list in "
                "agent-config.yml. The agent must never perform this action. "
                "This aligns with the Constitution's Safety Protocols "
                "(Article III) and the precautionary principle."
            ),
            suggestions=None,
        )

    # ----- Step 2: Scan for red flags in action params -----
    # Note: the scan covers ALL string values in the action dict, including
    # "type" and "reasoning", not just "params".
    action_text = _flatten_action_text(action)

    for red_flag in RED_FLAG_PATTERNS:
        if red_flag["pattern"].search(action_text):
            return AlignmentResult(
                allowed=False,
                risk_level="forbidden",
                reasoning=(
                    f"Red flag detected: {red_flag['reason']} "
                    "This violates the Constitution's safety protocols "
                    "(Article III). The precautionary principle requires "
                    "denial when safety-critical patterns are detected."
                ),
                suggestions=(
                    "Reformulate the action to avoid modifying safety-critical "
                    "files, circumventing review processes, or introducing "
                    "deceptive/harmful content."
                ),
            )

    # ----- Step 3: Additional semantic checks -----
    # Check for attempts to modify safety files even if not "delete".
    # Only well-known param keys are inspected; other keys are not treated
    # as file targets.
    params = action.get("params", {})
    target_files: List[str] = []
    if isinstance(params, dict):
        for key in ("file", "path", "target", "files"):
            val = params.get(key)
            if isinstance(val, str):
                target_files.append(val)
            elif isinstance(val, list):
                target_files.extend(str(v) for v in val)

    for target in target_files:
        for pattern in SAFETY_FILE_PATTERNS:
            if pattern.search(target):
                if risk_level != "high":
                    # Escalate: modifying safety files always requires review
                    risk_level = "high"

    # ----- Step 4: Return result based on risk classification -----
    reasoning_parts = [
        f"Action type '{action_type}' classified as '{risk_level}' risk "
        f"by agent-config.yml."
    ]

    if risk_level == "low":
        reasoning_parts.append(
            "This is an auto-executable action that poses minimal risk."
        )
    elif risk_level == "medium":
        reasoning_parts.append(
            "This action will be submitted as a PR for human review before "
            "merging, in accordance with the Constitution's transparency and "
            "human oversight principles."
        )
    elif risk_level == "high":
        reasoning_parts.append(
            "This action requires explicit human approval before proceeding. "
            "The Constitution's precautionary principle and human override "
            "provisions (Article III, 3.1-3.2) apply."
        )

    # True at this point: Step 2 returned early on any red-flag hit.
    reasoning_parts.append("No red flags detected in action parameters.")

    suggestions = None
    if risk_level == "high":
        suggestions = (
            "Create a detailed issue explaining the proposed change and wait "
            "for human approval before proceeding."
        )

    return AlignmentResult(
        allowed=True,
        risk_level=risk_level,
        reasoning=" ".join(reasoning_parts),
        suggestions=suggestions,
    )
427
+
428
+
429
+ # ---------------------------------------------------------------------------
430
+ # Tier 2: LLM-based alignment check
431
+ # ---------------------------------------------------------------------------
432
+
433
# System prompt template for the Tier 2 LLM check. The single {constitution}
# placeholder is filled via str.format() in _tier2_check.
_TIER2_SYSTEM_PROMPT = """\
You are an alignment evaluator for the Gaia Minds autonomous agent. Your role
is to determine whether a proposed action aligns with the Gaia Minds
Constitution.

You must evaluate the action against ALL Constitutional articles:
- Article I: Core Values (life protection, benevolence, transparency, honesty, humility)
- Article II: Governance (contributor eligibility, decision making, conflict resolution)
- Article III: Safety Protocols (precautionary principle, human override, security, capability restrictions)
- Article IV: Resource Ethics (transparent acquisition, sustainable operations, no exploitation)
- Article V: Evolution (living document, interpretation, self-improvement)

Apply the precautionary principle: when uncertain, rule against allowing the action.

CONSTITUTION:
{constitution}
"""

# User prompt template for the Tier 2 LLM check. {action_json} and
# {memory_context} are filled via str.format(); the doubled braces ({{ }})
# escape the literal JSON braces of the example response format.
_TIER2_USER_PROMPT = """\
Evaluate whether this proposed action aligns with the Gaia Minds Constitution.

ACTION:
{action_json}

RECENT MEMORY CONTEXT (recent decisions and lessons):
{memory_context}

Respond with ONLY valid JSON (no markdown fences, no extra text) in this exact format:
{{
  "allowed": true/false,
  "risk_level": "low" | "medium" | "high" | "forbidden",
  "reasoning": "detailed explanation referencing specific Constitutional articles",
  "suggestions": "how to modify the action to be aligned, or null if already aligned"
}}
"""
468
+
469
+
470
def _tier2_check(
    action: Dict[str, Any],
    constitution: str,
    memory_context: str,
    client: Any,
    model: str,
) -> Optional[AlignmentResult]:
    """Tier 2: LLM-based alignment check using Claude.

    Returns an AlignmentResult on success, or None if the API call fails
    (so the caller can fall back to Tier 1 only).

    Parameters: *action* is the proposed action dict; *constitution* is the
    full CONSTITUTION.md text injected into the system prompt;
    *memory_context* is recent-decision text (a placeholder is substituted
    when empty); *client* is an Anthropic-compatible client exposing
    ``messages.create``; *model* is the Claude model id.
    """
    system_prompt = _TIER2_SYSTEM_PROMPT.format(constitution=constitution)
    user_prompt = _TIER2_USER_PROMPT.format(
        action_json=json.dumps(action, indent=2),
        memory_context=memory_context if memory_context else "(no memory context available)",
    )

    try:
        response = client.messages.create(
            model=model,
            max_tokens=1024,
            temperature=0.2,  # low temperature for consistent evaluations
            system=system_prompt,
            messages=[
                {"role": "user", "content": user_prompt},
            ],
        )

        # Extract text from the response (non-text blocks are skipped)
        response_text = ""
        for block in response.content:
            if hasattr(block, "text"):
                response_text += block.text

        # Strip any markdown code fences if present, even though the prompt
        # asks for raw JSON -- models sometimes add them anyway.
        response_text = response_text.strip()
        if response_text.startswith("```"):
            # Remove opening fence (with optional language tag)
            response_text = re.sub(r"^```[a-zA-Z]*\n?", "", response_text)
            # Remove closing fence
            response_text = re.sub(r"\n?```$", "", response_text)
            response_text = response_text.strip()

        # Parse the JSON response
        parsed = json.loads(response_text)

        # Validate the response has required fields ("suggestions" optional)
        if not all(k in parsed for k in ("allowed", "risk_level", "reasoning")):
            return None

        # Validate risk_level is a known value
        risk_level = parsed["risk_level"]
        if risk_level not in RISK_LEVELS:
            risk_level = "high"  # default to high if unknown

        return AlignmentResult(
            allowed=bool(parsed["allowed"]),
            risk_level=risk_level,
            reasoning=str(parsed["reasoning"]),
            suggestions=parsed.get("suggestions"),
        )

    except json.JSONDecodeError:
        # Could not parse the LLM response as JSON
        return None
    except Exception:
        # Deliberately broad: ANY API/SDK failure means "no Tier 2 verdict";
        # the caller applies its own fail-open/fail-closed policy.
        return None
539
+
540
+
541
+ # ---------------------------------------------------------------------------
542
+ # Main entry point: check_alignment()
543
+ # ---------------------------------------------------------------------------
544
+
545
+
546
def check_alignment(
    action: Dict[str, Any],
    constitution: str,
    memory_context: str,
    client: Any = None,
    model: str = "claude-sonnet-4-5-20250929",
) -> AlignmentResult:
    """Check if a proposed action aligns with Constitutional values.

    Two-tier evaluation:
    1. Rule-based checks always run first.
    2. If an Anthropic client is provided, LLM-based evaluation runs second.

    The stricter of the two results wins. If the LLM call fails, the
    fallback policy is:
    - fail-open for low risk (allow, since rules already passed)
    - fail-closed for medium/high risk (deny, precautionary principle)

    Parameters
    ----------
    action : dict
        The proposed action, e.g.
        {"type": "add_research", "params": {...}, "reasoning": "..."}
    constitution : str
        Full text of CONSTITUTION.md.
    memory_context : str
        Recent decisions and lessons as text.
    client : optional
        An ``anthropic.Anthropic`` client instance. If None, only Tier 1
        (rule-based) checks are performed.
    model : str
        The Claude model to use for Tier 2 checks.

    Returns
    -------
    AlignmentResult
        The alignment evaluation result.
    """
    config = _load_config()

    # ----- Tier 1: Rule-based check (always runs) -----
    tier1_result = _tier1_check(action, config)

    # If Tier 1 says forbidden, return immediately -- no override possible
    if tier1_result.risk_level == "forbidden":
        return tier1_result

    # ----- Tier 2: LLM-based check (only if client provided) -----
    # Note: a client passed without the anthropic SDK installed silently
    # falls through to the Tier-1-only path at the bottom.
    if client is not None and _HAS_ANTHROPIC:
        tier2_result = _tier2_check(
            action=action,
            constitution=constitution,
            memory_context=memory_context,
            client=client,
            model=model,
        )

        if tier2_result is not None:
            # The stricter ruling wins:
            # If Tier 2 says "not allowed" but Tier 1 said "allowed",
            # use Tier 2 (stricter). The returned result objects are
            # mutated in place below (reasoning/risk_level amended).
            if not tier2_result.allowed and tier1_result.allowed:
                tier2_result.reasoning = (
                    "[Tier 2 override] " + tier2_result.reasoning
                    + " (Tier 1 rule-based check would have allowed this action, "
                    "but LLM-based Constitutional review denied it.)"
                )
                return tier2_result

            # If both agree on "allowed", use the higher risk level
            if tier2_result.allowed and tier1_result.allowed:
                t1_idx = RISK_LEVELS.index(tier1_result.risk_level)
                t2_idx = RISK_LEVELS.index(tier2_result.risk_level)
                if t2_idx > t1_idx:
                    # Tier 2 assigns higher risk -- use Tier 2 risk level
                    tier1_result.risk_level = tier2_result.risk_level
                    tier1_result.reasoning = (
                        tier1_result.reasoning
                        + f" [Tier 2 escalation] LLM review escalated risk "
                        f"to '{tier2_result.risk_level}': {tier2_result.reasoning}"
                    )
                    if tier2_result.suggestions:
                        tier1_result.suggestions = tier2_result.suggestions

                return tier1_result

            # If Tier 1 says not allowed (shouldn't happen since we returned
            # early for forbidden), just return Tier 1
            return tier1_result

        else:
            # Tier 2 failed (API error, parse error, etc.)
            # Fail-open for low risk, fail-closed for medium/high
            if tier1_result.risk_level == "low":
                tier1_result.reasoning += (
                    " [Note] LLM-based alignment check was unavailable; "
                    "proceeding with rule-based result only (fail-open for "
                    "low-risk actions)."
                )
                return tier1_result
            else:
                return AlignmentResult(
                    allowed=False,
                    risk_level=tier1_result.risk_level,
                    reasoning=(
                        f"LLM-based alignment check failed and the action is "
                        f"'{tier1_result.risk_level}' risk. Per the "
                        f"precautionary principle (Constitution Article III, "
                        f"3.1), the agent fails closed for non-low-risk "
                        f"actions when full alignment evaluation is unavailable."
                    ),
                    suggestions=(
                        "Retry when the Anthropic API is available, or request "
                        "explicit human approval for this action."
                    ),
                )

    # ----- No Tier 2 available: return Tier 1 result -----
    return tier1_result
665
+
666
+
667
+ # ---------------------------------------------------------------------------
668
+ # Self-test (when run as __main__)
669
+ # ---------------------------------------------------------------------------
670
+
671
+
672
def _risk_colour(level: str) -> str:
    """Return a coloured version of the risk level string.

    Unknown levels are returned unmodified.
    """
    painters = {
        "low": green,
        "medium": yellow,
        "high": red,
        "forbidden": lambda text: red(bold(text)),
    }
    painter = painters.get(level)
    return painter(level) if painter is not None else level
683
+
684
+
685
def _self_test() -> int:
    """Run self-test with example actions and print results.

    Exercises check_alignment() in Tier-1-only mode (client=None), the
    red-flag scanner, and classify_risk() against expected outcomes.

    Returns 0 when every check passes, 1 otherwise (used as the process
    exit code by the __main__ guard).
    """
    print()
    print(bold(" Gaia Minds Alignment Checker -- Self-Test"))
    print(bold(" " + "=" * 50))
    print()

    # Load constitution for the test (fatal if missing -- the agent cannot
    # operate without it).
    try:
        constitution = load_constitution(str(REPO_ROOT))
        print(f" Constitution loaded: {len(constitution)} characters")
    except FileNotFoundError as exc:
        print(red(f" {exc}"))
        return 1

    # Load config
    config = _load_config()
    risk_config = config.get("risk", {})
    print(f" Config loaded: {sum(len(v) for v in risk_config.values())} action types classified")
    print()

    # Define test cases: (action, expected_allowed, expected_risk, description)
    # NOTE(review): expected values assume agent-config.yml classifies these
    # action types accordingly -- verify against the shipped config.
    test_cases = [
        (
            {"type": "verify_resources"},
            True,
            "low",
            "Auto-executable low-risk action",
        ),
        (
            {"type": "add_research", "params": {"topic": "transformer efficiency"}},
            True,
            "medium",
            "Standard research addition (auto-PR)",
        ),
        (
            {"type": "modify_agent_loop", "params": {"change": "add new feature"}},
            True,
            "high",
            "Self-modification requires human review",
        ),
        (
            {"type": "delete_constitution"},
            False,
            "forbidden",
            "Forbidden: deleting the Constitution",
        ),
        (
            {"type": "merge_own_pr"},
            False,
            "forbidden",
            "Forbidden: merging own PR",
        ),
        (
            {"type": "bypass_review"},
            False,
            "forbidden",
            "Forbidden: bypassing review",
        ),
    ]

    passed = 0
    failed = 0

    for action, expected_allowed, expected_risk, description in test_cases:
        result = check_alignment(
            action=action,
            constitution=constitution,
            memory_context="(self-test -- no memory context)",
            client=None,  # rule-based only for self-test
        )

        # Check expectations
        ok_allowed = result.allowed == expected_allowed
        ok_risk = result.risk_level == expected_risk

        status = green("[PASS]") if (ok_allowed and ok_risk) else red("[FAIL]")
        if ok_allowed and ok_risk:
            passed += 1
        else:
            failed += 1

        print(f" {status} {description}")
        print(f" Action: {json.dumps(action)}")
        print(f" Allowed: {result.allowed} (expected {expected_allowed})", end="")
        if not ok_allowed:
            print(f" {red('<-- MISMATCH')}", end="")
        print()
        print(f" Risk: {_risk_colour(result.risk_level)} (expected {expected_risk})", end="")
        if not ok_risk:
            print(f" {red('<-- MISMATCH')}", end="")
        print()
        print(f" Reason: {dim(result.reasoning[:120])}...")
        if result.suggestions:
            print(f" Suggest: {dim(result.suggestions[:100])}")
        print()

    # Additional red-flag tests: these are caught by the pattern scan in
    # Step 2 of _tier1_check, regardless of the action type's config risk.
    print(bold(" Red-flag detection tests"))
    print(bold(" " + "-" * 50))
    print()

    red_flag_cases = [
        (
            {
                "type": "update_skill",
                "params": {"change": "delete the constitution file"},
            },
            False,
            "Red flag: delete safety file reference",
        ),
        (
            {
                "type": "create_tool",
                "params": {"command": "git push --force"},
            },
            False,
            "Red flag: force push (git history rewrite)",
        ),
        (
            {
                "type": "create_tool",
                "params": {"desc": "disable pre-commit hooks"},
            },
            False,
            "Red flag: disabling hooks",
        ),
        (
            {
                "type": "add_research",
                "params": {"content": "backdoor access method"},
            },
            False,
            "Red flag: backdoor reference",
        ),
    ]

    for action, expected_allowed, description in red_flag_cases:
        result = check_alignment(
            action=action,
            constitution=constitution,
            memory_context="(self-test -- no memory context)",
            client=None,
        )

        ok = result.allowed == expected_allowed
        status = green("[PASS]") if ok else red("[FAIL]")
        if ok:
            passed += 1
        else:
            failed += 1

        print(f" {status} {description}")
        print(f" Action: {json.dumps(action)}")
        print(f" Allowed: {result.allowed} (expected {expected_allowed})", end="")
        if not ok:
            print(f" {red('<-- MISMATCH')}", end="")
        print()
        print(f" Risk: {_risk_colour(result.risk_level)}")
        print(f" Reason: {dim(result.reasoning[:120])}...")
        print()

    # Classify risk standalone tests
    print(bold(" classify_risk() tests"))
    print(bold(" " + "-" * 50))
    print()

    risk_cases = [
        ("verify_resources", "low"),
        ("add_research", "medium"),
        ("modify_agent_loop", "high"),
        ("delete_constitution", "forbidden"),
        ("totally_unknown_action", "high"),  # unknown defaults to high
    ]

    for action_type, expected_level in risk_cases:
        actual_level = classify_risk(action_type, config)
        ok = actual_level == expected_level
        status = green("[PASS]") if ok else red("[FAIL]")
        if ok:
            passed += 1
        else:
            failed += 1

        print(f" {status} classify_risk('{action_type}') = {_risk_colour(actual_level)} (expected {expected_level})", end="")
        if not ok:
            print(f" {red('<-- MISMATCH')}", end="")
        print()

    # Summary
    print()
    print(bold(" " + "=" * 50))
    total = passed + failed
    if failed == 0:
        print(green(f" All {total} tests passed."))
    else:
        print(red(f" {failed}/{total} tests FAILED."))
    print()

    return 1 if failed > 0 else 0
885
+
886
+
887
if __name__ == "__main__":
    # Run the self-test and propagate its status as the process exit code.
    sys.exit(_self_test())