maestro-harness 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,46 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.egg-info/
6
+ *.egg
7
+ dist/
8
+ build/
9
+
10
+ # Virtual environments
11
+ .venv/
12
+
13
+ # Testing
14
+ .pytest_cache/
15
+ htmlcov/
16
+ .coverage
17
+ coverage.xml
18
+
19
+ # Type checking
20
+ .mypy_cache/
21
+ .pyright/
22
+
23
+ # Ruff
24
+ .ruff_cache/
25
+
26
+ # IDE
27
+ .idea/
28
+ *.swp
29
+ *.swo
30
+ *~
31
+
32
+ # OS
33
+ .DS_Store
34
+ Thumbs.db
35
+
36
+ # uv
37
+ uv.lock
38
+
39
+ # Project
40
+ *.db
41
+ *.sqlite3
42
+ .maestro/
43
+ src/api/
44
+
45
+ # env
46
+ .env
@@ -0,0 +1,8 @@
1
+ Metadata-Version: 2.4
2
+ Name: maestro-harness
3
+ Version: 0.1.0
4
+ Summary: Agent Harness: policy-based guarded execution layer
5
+ Requires-Python: >=3.12
6
+ Requires-Dist: gitpython>=3.1.50
7
+ Requires-Dist: pydantic>=2.10
8
+ Requires-Dist: pyyaml>=6
@@ -0,0 +1,17 @@
1
+ [project]
2
+ name = "maestro-harness"
3
+ version = "0.1.0"
4
+ description = "Agent Harness: policy-based guarded execution layer"
5
+ requires-python = ">=3.12"
6
+ dependencies = [
7
+ "gitpython>=3.1.50",
8
+ "pydantic>=2.10",
9
+ "pyyaml>=6",
10
+ ]
11
+
12
+ [build-system]
13
+ requires = ["hatchling"]
14
+ build-backend = "hatchling.build"
15
+
16
+ [tool.hatch.build.targets.wheel]
17
+ packages = ["src/maestro_harness"]
File without changes
@@ -0,0 +1,39 @@
1
+ """Access Control List: filesystem and command permission checks."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import fnmatch
6
+
7
+ from .policy import AgentPolicy
8
+
9
+
10
class AccessDecision:
    """Outcome of a single ACL check.

    ``allowed`` tells whether the request passed; ``reason`` holds the
    explanation text and defaults to the empty string.
    """

    __slots__ = ("allowed", "reason")

    def __init__(self, allowed: bool, reason: str = "") -> None:
        self.allowed, self.reason = allowed, reason
16
+
17
+
18
def check_file_access(policy: AgentPolicy, path: str, mode: str = "write") -> AccessDecision:
    """Decide whether ``path`` may be accessed under ``policy``.

    Deny patterns win over allow patterns, and a path matching neither list
    is rejected.  Patterns are matched with ``fnmatch.fnmatch``.

    Args:
        policy: Agent policy whose ``filesystem.allow`` / ``filesystem.deny``
            glob lists are consulted.
        path: Path to check.
        mode: Requested access mode.  Accepted for interface compatibility;
            it does not influence the decision.

    Returns:
        An ``AccessDecision``; ``reason`` is filled in on rejection.
    """
    import os.path

    # Collapse "." and ".." segments before matching, otherwise a request such
    # as "src/api/../ui/app.tsx" satisfies an allow rule for "src/api/**" while
    # really addressing a file covered by a deny rule.
    normalized = os.path.normpath(path) if path else path

    for pattern in policy.filesystem.deny:
        # The raw spelling is checked as well so that normalisation can only
        # ever make the deny list stricter, never looser.
        if fnmatch.fnmatch(normalized, pattern) or fnmatch.fnmatch(path, pattern):
            return AccessDecision(False, f"Path '{path}' denied by pattern '{pattern}'")

    for pattern in policy.filesystem.allow:
        if fnmatch.fnmatch(normalized, pattern):
            return AccessDecision(True)

    return AccessDecision(False, f"Path '{path}' not in allow list")
28
+
29
+
30
def _is_control_operator(token: str) -> bool:
    """Return True when ``token`` separates two shell commands (``;``, ``&&``, ``|``, ...)."""
    if not token or not set(token) <= set("();<>|&\n"):
        return False
    # Pure redirections such as ">", ">>" or ">&" stay part of the command;
    # anything containing ";", "|", parentheses or a newline, and a bare "&" /
    # "&&", starts a new command.
    return any(ch in ";|()\n" for ch in token) or set(token) == {"&"}


def _split_shell_segments(command: str) -> list[list[str]]:
    """Tokenise ``command`` and split it at shell control operators.

    Raises:
        ValueError: If the command has unbalanced quotes or a dangling escape.
    """
    import shlex

    lexer = shlex.shlex(command, posix=True, punctuation_chars="();<>|&\n")
    lexer.whitespace = " \t\r"  # a newline separates commands, so it must not vanish
    lexer.whitespace_split = True
    lexer.commenters = ""

    segments: list[list[str]] = [[]]
    for token in lexer:
        if _is_control_operator(token):
            segments.append([])
        else:
            segments[-1].append(token)
    return [segment for segment in segments if segment]


def _matches_rule(tokens: list[str], rule: str) -> bool:
    """Return True when ``tokens`` starts with the whitespace-split words of ``rule``."""
    rule_tokens = rule.split()
    return bool(rule_tokens) and tokens[: len(rule_tokens)] == rule_tokens


def check_command(policy: AgentPolicy, command: str) -> AccessDecision:
    """Decide whether ``command`` may be executed under ``policy``.

    The command line is tokenised with shell quoting rules and split at
    control operators (``;``, ``&``, ``&&``, ``|``, ``||``, parentheses and
    newlines).  Every resulting command is checked: one denied command
    rejects the whole line, and every command must match an allow rule.
    A rule matches when its words equal the leading words of a command, so
    ``"rm -rf"`` matches ``rm -rf /`` regardless of spacing and ``"python"``
    matches any ``python ...`` invocation.

    Punctuation-only arguments (for example a quoted ``';'``) are treated as
    separators as well, which can only make the check stricter.

    NOTE(review): command substitution inside double quotes or backticks is
    not analysed here.

    Args:
        policy: Agent policy whose ``commands.allow`` / ``commands.deny``
            lists are consulted.
        command: Shell command line to check.

    Returns:
        An ``AccessDecision``; ``reason`` is filled in on rejection.
    """
    try:
        segments = _split_shell_segments(command)
    except ValueError as exc:
        # Unbalanced quoting: the shell's interpretation is unknowable, so refuse.
        return AccessDecision(False, f"Command '{command}' could not be parsed: {exc}")

    # Deny takes precedence over every allow rule, across all chained commands.
    for segment in segments:
        for denied in policy.commands.deny:
            if _matches_rule(segment, denied):
                return AccessDecision(False, f"Command '{command}' denied by rule '{denied}'")

    fully_allowed = bool(segments) and all(
        any(_matches_rule(segment, allowed) for allowed in policy.commands.allow)
        for segment in segments
    )
    if fully_allowed:
        return AccessDecision(True)
    return AccessDecision(False, f"Command '{command}' not in allow list")
@@ -0,0 +1,59 @@
1
+ """Budget tracker: token, step, and time limits per agent."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import time
6
+
7
+ from .policy import BudgetConfig
8
+
9
+
10
class BudgetExhaustedError(Exception):
    """Signals that a budget limit has been exceeded."""
12
+
13
+
14
class BudgetTracker:
    """Tracks token, step and wall-clock consumption against a ``BudgetConfig``.

    Counters start at zero.  The wall-clock timer is inactive until
    ``start_timer`` is called; while inactive, elapsed time reads as ``0.0``
    and ``check_time`` never raises.
    """

    def __init__(self, config: BudgetConfig) -> None:
        self._config = config
        self._tokens_used: int = 0
        self._steps_used: int = 0
        self._start_time: float | None = None

    def start_timer(self) -> None:
        """Start (or restart) the wall-clock timer from now."""
        self._start_time = time.monotonic()

    def consume_tokens(self, count: int) -> None:
        """Record ``count`` consumed tokens.

        Raises:
            ValueError: If ``count`` is negative; a negative amount would
                silently hand budget back to the agent.
            BudgetExhaustedError: If the running total exceeds
                ``max_tokens_per_task``.
        """
        if count < 0:
            raise ValueError(f"Token count must be non-negative, got {count}")
        self._tokens_used += count
        if self._tokens_used > self._config.max_tokens_per_task:
            raise BudgetExhaustedError(
                f"Token budget exhausted: {self._tokens_used}/{self._config.max_tokens_per_task}"
            )

    def consume_step(self) -> None:
        """Record one executed step.

        Raises:
            BudgetExhaustedError: If the step count exceeds
                ``max_steps_per_epoch``.
        """
        self._steps_used += 1
        if self._steps_used > self._config.max_steps_per_epoch:
            raise BudgetExhaustedError(
                f"Step budget exhausted: {self._steps_used}/{self._config.max_steps_per_epoch}"
            )

    def check_time(self) -> None:
        """Raise if the wall-clock budget is spent; no-op while the timer is inactive.

        Raises:
            BudgetExhaustedError: If more than ``max_wall_time_minutes`` have
                elapsed since ``start_timer``.
        """
        if self._start_time is None:
            return
        elapsed = self.elapsed_minutes
        if elapsed > self._config.max_wall_time_minutes:
            raise BudgetExhaustedError(
                f"Time budget exhausted: {elapsed:.1f}/{self._config.max_wall_time_minutes} min"
            )

    @property
    def tokens_used(self) -> int:
        """Total tokens recorded so far."""
        return self._tokens_used

    @property
    def steps_used(self) -> int:
        """Total steps recorded so far."""
        return self._steps_used

    @property
    def elapsed_minutes(self) -> float:
        """Minutes since ``start_timer`` was called, or ``0.0`` if it never was."""
        if self._start_time is None:
            return 0.0
        return (time.monotonic() - self._start_time) / 60.0
@@ -0,0 +1,125 @@
1
+ """Harness: unified guard layer composing ACL, budget, validator, and sandbox."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from .budget import BudgetTracker
6
+ from .policy import AgentPolicy, BudgetConfig, HarnessPolicy, load_policy
7
+ from .rollback import RollbackManager
8
+ from .sandbox import LocalSandbox, SandboxResult
9
+ from .validator import Action, ValidationResult, validate_action
10
+
11
+
12
class Harness:
    """Policy-aware action interceptor wrapping all guard layers."""

    def __init__(self, policy_path: str, sandbox_root: str = ".") -> None:
        """Load the policy from ``policy_path`` and bind all guards to ``sandbox_root``."""
        self._policy = load_policy(policy_path)
        self._sandbox = LocalSandbox(sandbox_root)
        self._budgets: dict[str, BudgetTracker] = {}
        self._rollback = RollbackManager(sandbox_root)

    @classmethod
    def from_policy(cls, policy: HarnessPolicy, sandbox_root: str = ".") -> Harness:
        """Create a Harness directly from a HarnessPolicy object."""
        # __init__ is bypassed because it insists on loading the policy from disk.
        instance = cls.__new__(cls)
        instance._policy = policy
        instance._sandbox = LocalSandbox(sandbox_root)
        instance._budgets = {}
        instance._rollback = RollbackManager(sandbox_root)
        return instance

    @property
    def policy(self) -> HarnessPolicy:
        """The policy this harness enforces."""
        return self._policy

    def _get_budget(self, agent_id: str) -> BudgetTracker:
        """Return the tracker for ``agent_id``, creating and starting it on first use."""
        if agent_id not in self._budgets:
            agent_policy = self._resolve_policy(agent_id)
            config = agent_policy.budget if agent_policy else BudgetConfig()
            self._budgets[agent_id] = BudgetTracker(config)
            self._budgets[agent_id].start_timer()
        return self._budgets[agent_id]

    def _normalize_path(self, path: str) -> str:
        """Make path relative to sandbox_root for ACL pattern matching.

        Relative paths are anchored at the sandbox root, because that is
        where the sandbox reads and writes them, not at the process working
        directory.  Symlinks and ``..`` segments are resolved first.  A path
        that ends up outside the root is returned in its resolved absolute
        form, so a traversal string can never match a relative allow
        pattern.
        """
        from pathlib import Path

        root = Path(self._sandbox._root).resolve()
        resolved = (root / path).resolve()
        try:
            return str(resolved.relative_to(root))
        except ValueError:
            return str(resolved)

    def _resolve_policy(self, agent_id: str) -> AgentPolicy | None:
        """Lookup agent policy with fallback to role prefix (e.g. backend-2 -> backend_agent).

        After an exact lookup fails, the id itself and then the id with its
        trailing ``-suffix`` parts removed one at a time (longest prefix
        first) are tried.  Each is looked up as written, with ``_agent``
        appended, and with a trailing ``_agent`` removed.  Hyphenated roles
        such as ``code-review-1`` therefore resolve to ``code-review_agent``
        before falling back to ``code``.
        """
        import re

        agents = self._policy.agents
        if agent_id in agents:
            return agents[agent_id]

        roles = [agent_id]
        prefix = agent_id
        while "-" in prefix:
            prefix = prefix.rpartition("-")[0]
            stripped_tail = agent_id[len(prefix) + 1:]
            # Only word characters and hyphens are accepted as an instance suffix.
            if prefix and re.fullmatch(r"[\w-]+", stripped_tail):
                roles.append(prefix)

        for role in roles:
            for candidate in (role, role + "_agent", role.removesuffix("_agent")):
                if candidate in agents:
                    return agents[candidate]
        return None

    def validate_action(self, agent_id: str, action: Action) -> ValidationResult:
        """Validate ``action`` against the policy resolved for ``agent_id``.

        Agents without a resolvable policy are rejected outright.
        """
        agent_policy = self._resolve_policy(agent_id)
        if agent_policy is None:
            return ValidationResult(allowed=False, errors=[f"Unknown agent '{agent_id}'"])
        # Validate a copy whose target is normalized for ACL matching; the
        # caller's action object is left untouched.
        normalized = action.model_copy()
        if normalized.target:
            normalized.target = self._normalize_path(normalized.target)
        return validate_action(agent_policy, normalized)

    def execute_action(self, agent_id: str, action: Action) -> SandboxResult:
        """Validate, charge the budget for, and then execute ``action``.

        A blocked action yields exit code ``-1`` with the validation errors
        in ``stderr`` and does not consume budget.  A command, when present,
        takes priority over the file operation.  Action types without an
        executor return an empty success result.

        Raises:
            BudgetExhaustedError: If the agent's step or time budget is spent.
        """
        validation = self.validate_action(agent_id, action)
        if not validation.allowed:
            return SandboxResult(-1, "", f"Action blocked: {'; '.join(validation.errors)}")

        budget = self._get_budget(agent_id)
        budget.consume_step()
        budget.check_time()

        if action.command:
            return self._sandbox.run_command(action.command)

        # Operate on exactly the path that was validated, so the ACL check and
        # the file operation cannot disagree about which file is meant.
        target = self._normalize_path(action.target) if action.target else action.target

        if action.action_type in ("CODE_WRITE", "FILE_WRITE") and action.content:
            self._sandbox.write_file(target, action.content)
            return SandboxResult(0, f"Wrote {action.target}", "")

        if action.action_type == "FILE_READ":
            content = self._sandbox.read_file(target)
            return SandboxResult(0, content, "")

        return SandboxResult(0, "", "")

    def check_budget(self, agent_id: str) -> dict[str, int | float]:
        """Return the agent's usage: tokens, steps, and elapsed minutes (2 decimals)."""
        budget = self._get_budget(agent_id)
        return {
            "tokens_used": budget.tokens_used,
            "steps_used": budget.steps_used,
            "elapsed_minutes": round(budget.elapsed_minutes, 2),
        }

    def get_agent_policy(self, agent_id: str) -> AgentPolicy | None:
        """Return the policy that applies to ``agent_id``, or ``None`` if there is none."""
        return self._resolve_policy(agent_id)

    # ------------------------------------------------------------------
    # Rollback API
    # ------------------------------------------------------------------

    def snapshot(self, tag: str = "maestro-rollback") -> str | None:
        """Create a Git snapshot before an execution epoch."""
        return self._rollback.snapshot(tag)

    def rollback(self) -> bool:
        """Reset the working tree to the last snapshot."""
        return self._rollback.rollback()

    def clear_snapshot(self) -> None:
        """Remove the snapshot tag after a successful epoch."""
        self._rollback.clear_snapshot()
@@ -0,0 +1,46 @@
1
+ """Policy models and loader for the Agent Harness."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+
7
+ import yaml
8
+ from pydantic import BaseModel, Field
9
+
10
+
11
class FilesystemRule(BaseModel):
    """Allow and deny pattern lists for filesystem access; both default to empty."""

    allow: list[str] = Field(default_factory=list)
    deny: list[str] = Field(default_factory=list)
14
+
15
+
16
class CommandRule(BaseModel):
    """Allow and deny lists for shell commands; both default to empty."""

    allow: list[str] = Field(default_factory=list)
    deny: list[str] = Field(default_factory=list)
19
+
20
+
21
class BudgetConfig(BaseModel):
    """Resource limits: tokens per task, steps per epoch, wall time in minutes."""

    max_tokens_per_task: int = 50000
    max_steps_per_epoch: int = 30
    max_wall_time_minutes: int = 10
25
+
26
+
27
class HumanGateRule(BaseModel):
    """An action pattern paired with an approval flag (approval defaults to required)."""

    action_pattern: str
    requires_approval: bool = True
30
+
31
+
32
class AgentPolicy(BaseModel):
    """Per-agent policy: filesystem rules, command rules, budget, rollback flag, human gates.

    Every field has a default, so an empty mapping validates to a policy
    with empty allow/deny lists, the default budget and ``rollback=True``.
    """

    filesystem: FilesystemRule = Field(default_factory=FilesystemRule)
    commands: CommandRule = Field(default_factory=CommandRule)
    budget: BudgetConfig = Field(default_factory=BudgetConfig)
    rollback: bool = True
    human_gate: list[HumanGateRule] = Field(default_factory=list)
38
+
39
+
40
class HarnessPolicy(BaseModel):
    """Top-level policy document: a mapping from agent name to ``AgentPolicy``."""

    agents: dict[str, AgentPolicy] = Field(default_factory=dict)
42
+
43
+
44
def load_policy(path: str) -> HarnessPolicy:
    """Read the YAML file at ``path`` and validate it into a ``HarnessPolicy``.

    The file is decoded as UTF-8 regardless of the platform default.  An
    empty or comment-only document yields a policy with no agents instead
    of a validation error about ``None``.

    Raises:
        OSError: If the file cannot be read.
        yaml.YAMLError: If the file is not valid YAML.
        pydantic.ValidationError: If the document does not fit the schema.
    """
    document = yaml.safe_load(Path(path).read_text(encoding="utf-8"))
    # yaml.safe_load returns None for an empty document.
    return HarnessPolicy.model_validate({} if document is None else document)
File without changes
@@ -0,0 +1,96 @@
1
+ """Git-based snapshot and rollback for the Agent Harness."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from pathlib import Path
7
+
8
+ from git import InvalidGitRepositoryError, Repo
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
class RollbackManager:
    """Manages Git snapshots before agent execution epochs.

    ``snapshot`` stashes uncommitted work and places a lightweight tag on
    the current ``HEAD``; ``rollback`` resets the index and working tree to
    that tag.  When no usable repository exists at ``repo_path`` the manager
    is disabled and every operation is a no-op.
    """

    def __init__(self, repo_path: str = ".") -> None:
        """Open the repository at ``repo_path``; disable rollback if there is none."""
        from git.exc import NoSuchPathError

        self._repo_path = Path(repo_path).resolve()
        self._repo: Repo | None = None
        self._snapshot_tag: str | None = None
        self._original_branch: str | None = None
        try:
            self._repo = Repo(self._repo_path)
        except (InvalidGitRepositoryError, NoSuchPathError):
            # NoSuchPathError covers a directory that does not exist (yet); it
            # must not prevent the harness from being constructed.
            logger.warning("No Git repository found at %s; rollback disabled", self._repo_path)

    @property
    def enabled(self) -> bool:
        """True when a non-bare repository is available."""
        return self._repo is not None and not self._repo.bare

    def snapshot(self, tag: str = "maestro-rollback") -> str | None:
        """Tag the current ``HEAD`` for later rollback and return the tag name.

        If the repo has uncommitted or untracked changes they are stashed
        first.  The tag is named ``"<tag>-<first 8 chars of the HEAD sha>"``
        and replaces an existing tag of the same name.

        NOTE(review): nothing in this class restores the stash afterwards;
        confirm that callers are expected to do so.

        Returns:
            The tag name, or ``None`` when rollback is disabled or the
            snapshot could not be created.
        """
        if not self.enabled or self._repo is None:
            return None

        try:
            # Stash any existing changes to keep the tree clean
            if self._repo.is_dirty(untracked_files=True):
                self._repo.git.stash("push", "-u", "-m", f"maestro-auto-stash-{tag}")

            # Create a tag on HEAD for rollback
            head = self._repo.head.commit
            tag_name = f"{tag}-{head.hexsha[:8]}"

            # Remove old tag if it exists
            for existing in list(self._repo.tags):
                if existing.name == tag_name:
                    self._repo.delete_tag(existing)

            self._repo.create_tag(tag_name, ref=head)
            self._snapshot_tag = tag_name
            logger.info("Rollback snapshot created: %s", tag_name)
            return tag_name
        except Exception as exc:
            # Best effort: a failed snapshot must not abort the epoch.
            logger.warning("Failed to create snapshot: %s", exc)
            return None

    def rollback(self, tag: str | None = None) -> bool:
        """Reset the index and working tree to a snapshot tag.

        Args:
            tag: Tag to reset to; defaults to the tag recorded by the most
                recent successful ``snapshot``.

        Returns:
            True if rollback succeeded; False when rollback is disabled, no
            tag is available, or the reset failed.

        NOTE(review): a reset of index and working tree presumably leaves
        untracked files in place; confirm whether files created during the
        epoch should also be removed.
        """
        if not self.enabled or self._repo is None:
            return False

        target_tag = tag or self._snapshot_tag
        if target_tag is None:
            logger.warning("No snapshot tag available for rollback")
            return False

        try:
            commit = self._repo.tags[target_tag].commit
            self._repo.head.reset(commit, index=True, working_tree=True)
            logger.info("Rolled back to %s", target_tag)
            return True
        except Exception as exc:
            logger.warning("Rollback failed: %s", exc)
            return False

    def clear_snapshot(self) -> None:
        """Remove the snapshot tag after a successful epoch.

        The recorded tag name is forgotten even when deleting the tag fails.
        """
        if self._repo is not None and self._snapshot_tag:
            try:
                for tag in list(self._repo.tags):
                    if tag.name == self._snapshot_tag:
                        self._repo.delete_tag(tag)
            except Exception as exc:
                logger.debug("Failed to delete snapshot tag: %s", exc)
        self._snapshot_tag = None
@@ -0,0 +1,58 @@
1
+ """Sandbox abstraction for isolated agent execution."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import subprocess
6
+ from typing import Protocol
7
+
8
+
9
class SandboxResult:
    """Result triple of a sandbox operation: exit code, stdout text, stderr text."""

    __slots__ = ("exit_code", "stdout", "stderr")

    def __init__(self, exit_code: int, stdout: str, stderr: str) -> None:
        self.exit_code, self.stdout, self.stderr = exit_code, stdout, stderr
16
+
17
+
18
class SandboxProvider(Protocol):
    """Structural interface of a sandbox backend: run commands, write files, read files."""

    def run_command(
        self, command: str, cwd: str | None = None, timeout: int = 60
    ) -> SandboxResult:
        """Run ``command`` and return its result."""
        ...

    def write_file(self, path: str, content: str) -> None:
        """Write ``content`` to ``path``."""
        ...

    def read_file(self, path: str) -> str:
        """Return the text stored at ``path``."""
        ...
24
+
25
+
26
class LocalSandbox:
    """Subprocess-based sandbox with directory restriction (Phase 1).

    File operations are confined to ``allowed_root``: a path that resolves
    outside it, whether through an absolute path, ``..`` segments or a
    symlink, is refused with ``PermissionError``.  Commands run through the
    shell with the root as their default working directory.
    """

    def __init__(self, allowed_root: str) -> None:
        self._root = allowed_root

    def _resolve_inside_root(self, path: str):
        """Return ``path`` as a resolved ``Path`` under the root, or raise ``PermissionError``."""
        from pathlib import Path

        root = Path(self._root).resolve()
        candidate = (root / path).resolve()
        try:
            candidate.relative_to(root)
        except ValueError:
            raise PermissionError(f"Path '{path}' escapes sandbox root '{root}'") from None
        return candidate

    def run_command(self, command: str, cwd: str | None = None, timeout: int = 60) -> SandboxResult:
        """Run ``command`` through the shell and capture its output as text.

        Args:
            command: Shell command line.
            cwd: Working directory; defaults to the sandbox root.
            timeout: Seconds to wait before giving up.

        Returns:
            The process's exit code, stdout and stderr; on timeout, exit code
            ``-1`` with an explanatory ``stderr``.
        """
        work_dir = cwd or self._root
        try:
            # SECURITY: shell=True hands the whole string to the shell, so
            # callers must have validated ``command`` before it reaches here.
            proc = subprocess.run(
                command,
                shell=True,
                capture_output=True,
                text=True,
                cwd=work_dir,
                timeout=timeout,
            )
            return SandboxResult(proc.returncode, proc.stdout, proc.stderr)
        except subprocess.TimeoutExpired:
            return SandboxResult(-1, "", f"Command timed out after {timeout}s")

    def write_file(self, path: str, content: str) -> None:
        """Write ``content`` (UTF-8) to ``path`` under the root, creating parent directories.

        Raises:
            PermissionError: If ``path`` resolves outside the sandbox root.
        """
        target = self._resolve_inside_root(path)
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_text(content, encoding="utf-8")

    def read_file(self, path: str) -> str:
        """Return the UTF-8 text stored at ``path`` under the root.

        Raises:
            PermissionError: If ``path`` resolves outside the sandbox root.
            OSError: If the file cannot be read.
        """
        return self._resolve_inside_root(path).read_text(encoding="utf-8")
@@ -0,0 +1,67 @@
1
+ """Action validator: pre-execution safety checks."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+
7
+ from pydantic import BaseModel
8
+
9
+ from .acl import check_command, check_file_access
10
+ from .policy import AgentPolicy
11
+
12
+
13
class Action(BaseModel):
    """A single operation requested by an agent.

    Only ``agent_id`` is required; every other field defaults to the empty
    string.
    """

    agent_id: str
    action_type: str = ""
    target: str = ""
    content: str = ""
    command: str = ""
19
+
20
+
21
class ValidationResult(BaseModel):
    """Verdict of a validation run: the ``allowed`` flag plus collected error messages."""

    allowed: bool = True
    errors: list[str] = []
24
+
25
+
26
# Regular expressions flagged as dangerous, compiled once at import time.
DANGEROUS_PATTERNS = [
    re.compile(source)
    for source in (
        r"\beval\s*\(",
        r"\bos\.system\s*\(",
        r"\bsubprocess\.call\s*\(.*shell\s*=\s*True",
        r"\brm\s+-rf\s+/",
        r"\bos\.remove\s*\(\s*[\"']/?",
    )
]

# Line-count threshold used by the large-deletion check.
MAX_DELETE_LINES = 50
35
+
36
+
37
def validate_action(policy: AgentPolicy, action: Action) -> ValidationResult:
    """Run every pre-execution safety check on ``action``.

    Checks, all of which contribute to ``errors``:

    * file actions (``CODE_WRITE``, ``FILE_WRITE``, ``FILE_DELETE``,
      ``FILE_READ``) need a target that passes the filesystem ACL; a
      missing target is an error unless the action carries a command;
    * a command, when present, must pass the command ACL;
    * neither the content nor the command may match ``DANGEROUS_PATTERNS``;
    * a ``FILE_DELETE`` whose content spans more than ``MAX_DELETE_LINES``
      lines is rejected.

    Returns:
        A ``ValidationResult`` that is allowed only when no check failed.
    """
    errors: list[str] = []
    file_actions = ("CODE_WRITE", "FILE_WRITE", "FILE_DELETE", "FILE_READ")

    # File access check
    if action.action_type in file_actions:
        if action.target:
            mode = "read" if action.action_type == "FILE_READ" else "write"
            decision = check_file_access(policy, action.target, mode)
            if not decision.allowed:
                errors.append(decision.reason)
        elif not action.command:
            # Without a target there is nothing for the ACL to approve.
            errors.append(f"{action.action_type} action requires a target")

    # Command check
    if action.command:
        decision = check_command(policy, action.command)
        if not decision.allowed:
            errors.append(decision.reason)

    # Dangerous pattern check: the command is scanned too, since patterns such
    # as "rm -rf /" describe shell input rather than file content.
    for pattern in DANGEROUS_PATTERNS:
        if pattern.search(action.content) or pattern.search(action.command):
            errors.append(f"Dangerous pattern detected: {pattern.pattern}")

    # Large deletion check
    if action.action_type == "FILE_DELETE" and action.content:
        deleted_lines = action.content.count("\n") + 1
        if deleted_lines > MAX_DELETE_LINES:
            errors.append(
                f"Deletion of {deleted_lines} lines exceeds threshold of {MAX_DELETE_LINES}"
            )

    return ValidationResult(allowed=not errors, errors=errors)
File without changes
@@ -0,0 +1,55 @@
1
+ """Shared fixtures for maestro-harness tests."""
2
+
3
+ from pathlib import Path
4
+
5
+ import pytest
6
+
7
+ from maestro_harness.policy import (
8
+ AgentPolicy,
9
+ BudgetConfig,
10
+ CommandRule,
11
+ FilesystemRule,
12
+ HarnessPolicy,
13
+ )
14
+
15
+
16
@pytest.fixture
def sample_policy() -> HarnessPolicy:
    """Two-agent policy (backend and frontend) shared by the test suite."""
    backend = AgentPolicy(
        filesystem=FilesystemRule(
            allow=["src/api/**", "tests/api/**"],
            deny=["src/ui/**", "*.env"],
        ),
        commands=CommandRule(
            allow=["python", "pytest", "pip"],
            deny=["rm -rf", "curl", "wget"],
        ),
        budget=BudgetConfig(
            max_tokens_per_task=100,
            max_steps_per_epoch=5,
            max_wall_time_minutes=1,
        ),
    )
    frontend = AgentPolicy(
        filesystem=FilesystemRule(
            allow=["src/ui/**", "public/**"],
            deny=["src/api/**"],
        ),
        commands=CommandRule(
            allow=["npm", "npx", "jest"],
            deny=["curl", "sudo"],
        ),
    )
    return HarnessPolicy(agents={"backend_agent": backend, "frontend_agent": frontend})
47
+
48
+
49
@pytest.fixture
def policy_file(tmp_path: Path, sample_policy: HarnessPolicy) -> str:
    """Dump ``sample_policy`` as YAML into ``tmp_path`` and return the file's path."""
    import yaml

    serialized = yaml.dump(sample_policy.model_dump(), default_flow_style=False)
    destination = tmp_path / "harness_policy.yaml"
    destination.write_text(serialized)
    return str(destination)
@@ -0,0 +1,254 @@
1
+ """Tests for maestro-harness: policy, ACL, budget, validator, harness."""
2
+
3
+ from pathlib import Path
4
+
5
+ import pytest
6
+
7
+ from maestro_harness.acl import check_command, check_file_access
8
+ from maestro_harness.budget import BudgetExhaustedError, BudgetTracker
9
+ from maestro_harness.harness import Harness
10
+ from maestro_harness.policy import BudgetConfig, HarnessPolicy, load_policy
11
+ from maestro_harness.validator import Action, validate_action
12
+
13
+
14
class TestPolicyLoader:
    """Loading a policy from a YAML file."""

    def test_load_from_file(self, policy_file: str) -> None:
        """Both agents from the fixture survive the round trip through YAML."""
        agents = load_policy(policy_file).agents
        assert "backend_agent" in agents
        assert "frontend_agent" in agents

    def test_backend_filesystem_rules(self, policy_file: str) -> None:
        """The backend agent's filesystem rules are loaded intact."""
        rules = load_policy(policy_file).agents["backend_agent"].filesystem
        assert "src/api/**" in rules.allow
        assert "src/ui/**" in rules.deny
25
+
26
+
27
class TestACL:
    """File and command ACL decisions for the backend agent."""

    def test_allow_matching_path(self, sample_policy: HarnessPolicy) -> None:
        backend = sample_policy.agents["backend_agent"]
        decision = check_file_access(backend, "src/api/auth.py")
        assert decision.allowed

    def test_deny_takes_precedence(self, sample_policy: HarnessPolicy) -> None:
        backend = sample_policy.agents["backend_agent"]
        decision = check_file_access(backend, "src/ui/app.tsx")
        assert not decision.allowed
        assert "denied" in decision.reason

    def test_no_match_denied(self, sample_policy: HarnessPolicy) -> None:
        backend = sample_policy.agents["backend_agent"]
        decision = check_file_access(backend, "random/file.txt")
        assert not decision.allowed

    def test_env_denied(self, sample_policy: HarnessPolicy) -> None:
        backend = sample_policy.agents["backend_agent"]
        decision = check_file_access(backend, ".env")
        assert not decision.allowed

    def test_command_allowed(self, sample_policy: HarnessPolicy) -> None:
        backend = sample_policy.agents["backend_agent"]
        decision = check_command(backend, "python main.py")
        assert decision.allowed

    def test_command_denied(self, sample_policy: HarnessPolicy) -> None:
        backend = sample_policy.agents["backend_agent"]
        decision = check_command(backend, "rm -rf /")
        assert not decision.allowed

    def test_command_not_in_list(self, sample_policy: HarnessPolicy) -> None:
        backend = sample_policy.agents["backend_agent"]
        decision = check_command(backend, "docker build .")
        assert not decision.allowed
63
+
64
+
65
class TestBudget:
    """Token, step and time accounting of ``BudgetTracker``."""

    def test_consume_tokens(self) -> None:
        budget = BudgetTracker(BudgetConfig(max_tokens_per_task=100))
        budget.consume_tokens(50)
        assert budget.tokens_used == 50

    def test_token_exhaustion(self) -> None:
        budget = BudgetTracker(BudgetConfig(max_tokens_per_task=10))
        with pytest.raises(BudgetExhaustedError, match="Token"):
            budget.consume_tokens(20)

    def test_step_exhaustion(self) -> None:
        budget = BudgetTracker(BudgetConfig(max_steps_per_epoch=2))
        # The first two steps fit the budget; the third must trip it.
        for _ in range(2):
            budget.consume_step()
        with pytest.raises(BudgetExhaustedError, match="Step"):
            budget.consume_step()

    def test_elapsed_tracking(self) -> None:
        budget = BudgetTracker(BudgetConfig())
        budget.start_timer()
        assert budget.elapsed_minutes >= 0
87
+
88
+
89
class TestValidator:
    """``validate_action`` verdicts for backend ``CODE_WRITE`` actions."""

    @staticmethod
    def _validate(sample_policy: HarnessPolicy, **fields: str):
        """Validate a backend ``CODE_WRITE`` action built from ``fields``."""
        backend = sample_policy.agents["backend_agent"]
        action = Action(agent_id="backend_agent", action_type="CODE_WRITE", **fields)
        return validate_action(backend, action)

    def test_valid_write(self, sample_policy: HarnessPolicy) -> None:
        verdict = self._validate(
            sample_policy, target="src/api/auth.py", content="def login(): pass"
        )
        assert verdict.allowed

    def test_invalid_path(self, sample_policy: HarnessPolicy) -> None:
        verdict = self._validate(sample_policy, target="src/ui/app.tsx")
        assert not verdict.allowed

    def test_dangerous_eval(self, sample_policy: HarnessPolicy) -> None:
        verdict = self._validate(
            sample_policy, target="src/api/evil.py", content="result = eval(user_input)"
        )
        assert not verdict.allowed
        assert any("Dangerous" in message for message in verdict.errors)

    def test_dangerous_os_system(self, sample_policy: HarnessPolicy) -> None:
        verdict = self._validate(
            sample_policy, target="src/api/cmd.py", content="os.system('rm -rf /')"
        )
        assert not verdict.allowed

    def test_dangerous_command(self, sample_policy: HarnessPolicy) -> None:
        verdict = self._validate(
            sample_policy,
            target="src/api/x.py",
            content="pass",
            command="rm -rf /tmp/anything",
        )
        assert not verdict.allowed
145
+
146
+
147
class TestHarness:
    """End-to-end behaviour of ``Harness``: validation, execution, budget tracking."""

    def test_validate_known_agent(self, policy_file: str) -> None:
        harness = Harness(policy_file)
        action = Action(
            agent_id="backend_agent",
            action_type="CODE_WRITE",
            target="src/api/auth.py",
        )
        result = harness.validate_action("backend_agent", action)
        assert result.allowed

    def test_validate_unknown_agent(self, policy_file: str) -> None:
        harness = Harness(policy_file)
        action = Action(agent_id="unknown", action_type="CODE_WRITE", target="f.py")
        result = harness.validate_action("unknown", action)
        assert not result.allowed
        assert "Unknown" in result.errors[0]

    def test_execute_blocked(self, policy_file: str) -> None:
        harness = Harness(policy_file)
        action = Action(
            agent_id="backend_agent",
            action_type="CODE_WRITE",
            target="src/ui/app.tsx",
        )
        result = harness.execute_action("backend_agent", action)
        assert result.exit_code == -1
        assert "blocked" in result.stderr

    def test_budget_tracking(self, policy_file: str, tmp_path: Path) -> None:
        # The write is real, so it is confined to tmp_path instead of leaving
        # src/api/auth.py behind in the directory the tests are run from.
        harness = Harness(policy_file, sandbox_root=str(tmp_path))
        action = Action(
            agent_id="backend_agent",
            action_type="CODE_WRITE",
            target="src/api/auth.py",
            content="pass",
        )
        harness.execute_action("backend_agent", action)
        status = harness.check_budget("backend_agent")
        assert status["steps_used"] == 1

    def test_file_read_allowed(self, policy_file: str, tmp_path: Path) -> None:
        harness = Harness(policy_file, sandbox_root=str(tmp_path))
        test_file = tmp_path / "src" / "api" / "auth.py"
        test_file.parent.mkdir(parents=True, exist_ok=True)
        test_file.write_text("def login(): pass")
        action = Action(
            agent_id="backend_agent",
            action_type="FILE_READ",
            target=str(test_file),
        )
        result = harness.execute_action("backend_agent", action)
        assert result.exit_code == 0
        assert "def login(): pass" in result.stdout

    def test_file_read_blocked_by_acl(self, policy_file: str) -> None:
        harness = Harness(policy_file)
        action = Action(
            agent_id="backend_agent",
            action_type="FILE_READ",
            target="src/ui/app.tsx",
        )
        result = harness.execute_action("backend_agent", action)
        assert result.exit_code == -1
        assert "blocked" in result.stderr

    def test_file_write_and_read_roundtrip(self, policy_file: str, tmp_path: Path) -> None:
        harness = Harness(policy_file, sandbox_root=str(tmp_path))
        target = str(tmp_path / "src" / "api" / "new.py")
        write_action = Action(
            agent_id="backend_agent",
            action_type="FILE_WRITE",
            target=target,
            content="x = 42",
        )
        harness.execute_action("backend_agent", write_action)
        read_action = Action(
            agent_id="backend_agent",
            action_type="FILE_READ",
            target=target,
        )
        result = harness.execute_action("backend_agent", read_action)
        assert result.exit_code == 0
        assert "x = 42" in result.stdout

    def test_shell_exec_allowed(self, policy_file: str, tmp_path: Path) -> None:
        import shutil

        # The policy allows the bare name "python"; on hosts that only ship
        # "python3" the command cannot run, which says nothing about the ACL.
        if shutil.which("python") is None:
            pytest.skip("no 'python' executable on PATH")

        harness = Harness(policy_file, sandbox_root=str(tmp_path))
        action = Action(
            agent_id="backend_agent",
            action_type="SHELL_EXEC",
            target="",
            command="python -c 'print(hello)'",
        )
        result = harness.execute_action("backend_agent", action)
        assert result.exit_code != 0  # NameError, but command was allowed to run
        assert "hello" in result.stderr or "NameError" in result.stderr

    def test_shell_exec_blocked(self, policy_file: str) -> None:
        harness = Harness(policy_file)
        action = Action(
            agent_id="backend_agent",
            action_type="SHELL_EXEC",
            target="",
            command="curl https://evil.com",
        )
        result = harness.execute_action("backend_agent", action)
        assert result.exit_code == -1
        assert "blocked" in result.stderr