maestro-harness 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- maestro_harness-0.1.0/.gitignore +46 -0
- maestro_harness-0.1.0/PKG-INFO +8 -0
- maestro_harness-0.1.0/pyproject.toml +17 -0
- maestro_harness-0.1.0/src/maestro_harness/__init__.py +0 -0
- maestro_harness-0.1.0/src/maestro_harness/acl.py +39 -0
- maestro_harness-0.1.0/src/maestro_harness/budget.py +59 -0
- maestro_harness-0.1.0/src/maestro_harness/harness.py +125 -0
- maestro_harness-0.1.0/src/maestro_harness/policy.py +46 -0
- maestro_harness-0.1.0/src/maestro_harness/py.typed +0 -0
- maestro_harness-0.1.0/src/maestro_harness/rollback.py +96 -0
- maestro_harness-0.1.0/src/maestro_harness/sandbox.py +58 -0
- maestro_harness-0.1.0/src/maestro_harness/validator.py +67 -0
- maestro_harness-0.1.0/tests/__init__.py +0 -0
- maestro_harness-0.1.0/tests/conftest.py +55 -0
- maestro_harness-0.1.0/tests/test_harness.py +254 -0
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
*.egg-info/
|
|
6
|
+
*.egg
|
|
7
|
+
dist/
|
|
8
|
+
build/
|
|
9
|
+
|
|
10
|
+
# Virtual environments
|
|
11
|
+
.venv/
|
|
12
|
+
|
|
13
|
+
# Testing
|
|
14
|
+
.pytest_cache/
|
|
15
|
+
htmlcov/
|
|
16
|
+
.coverage
|
|
17
|
+
coverage.xml
|
|
18
|
+
|
|
19
|
+
# Type checking
|
|
20
|
+
.mypy_cache/
|
|
21
|
+
.pyright/
|
|
22
|
+
|
|
23
|
+
# Ruff
|
|
24
|
+
.ruff_cache/
|
|
25
|
+
|
|
26
|
+
# IDE
|
|
27
|
+
.idea/
|
|
28
|
+
*.swp
|
|
29
|
+
*.swo
|
|
30
|
+
*~
|
|
31
|
+
|
|
32
|
+
# OS
|
|
33
|
+
.DS_Store
|
|
34
|
+
Thumbs.db
|
|
35
|
+
|
|
36
|
+
# uv
|
|
37
|
+
uv.lock
|
|
38
|
+
|
|
39
|
+
# Project
|
|
40
|
+
*.db
|
|
41
|
+
*.sqlite3
|
|
42
|
+
.maestro/
|
|
43
|
+
src/api/
|
|
44
|
+
|
|
45
|
+
# env
|
|
46
|
+
.env
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "maestro-harness"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Agent Harness: policy-based guarded execution layer"
|
|
5
|
+
requires-python = ">=3.12"
|
|
6
|
+
dependencies = [
|
|
7
|
+
"gitpython>=3.1.50",
|
|
8
|
+
"pydantic>=2.10",
|
|
9
|
+
"pyyaml>=6",
|
|
10
|
+
]
|
|
11
|
+
|
|
12
|
+
[build-system]
|
|
13
|
+
requires = ["hatchling"]
|
|
14
|
+
build-backend = "hatchling.build"
|
|
15
|
+
|
|
16
|
+
[tool.hatch.build.targets.wheel]
|
|
17
|
+
packages = ["src/maestro_harness"]
|
|
File without changes
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""Access Control List: filesystem and command permission checks."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import fnmatch
|
|
6
|
+
|
|
7
|
+
from .policy import AgentPolicy
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class AccessDecision:
    """Outcome of a single permission check.

    ``allowed`` states whether the request may proceed; ``reason`` carries the
    explanation and defaults to an empty string.
    """

    # Fixed attribute set: instances carry no per-instance __dict__.
    __slots__ = ("allowed", "reason")

    def __init__(self, allowed: bool, reason: str = "") -> None:
        self.allowed, self.reason = allowed, reason
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def check_file_access(policy: AgentPolicy, path: str, mode: str = "write") -> AccessDecision:
    """Decide whether ``path`` is permitted by the agent's filesystem rules.

    Deny patterns are evaluated first and win over allow patterns; a path that
    matches neither list is rejected.

    Args:
        policy: Agent policy whose ``filesystem.allow`` / ``filesystem.deny``
            glob patterns are consulted.
        path: Path to check.
        mode: Accepted for API compatibility; not consulted by the current
            rules.

    Returns:
        An ``AccessDecision``; ``reason`` is filled in on rejection and quotes
        the path exactly as the caller supplied it.
    """
    import os.path

    # Collapse "." and ".." segments before matching. Without this a path such
    # as "src/api/../ui/app.tsx" matches the allow pattern "src/api/**" while
    # slipping past the deny pattern "src/ui/**".
    candidate = os.path.normpath(path) if path else path

    # Deny takes precedence
    for pattern in policy.filesystem.deny:
        if fnmatch.fnmatch(candidate, pattern):
            return AccessDecision(False, f"Path '{path}' denied by pattern '{pattern}'")

    # Check allow list
    for pattern in policy.filesystem.allow:
        if fnmatch.fnmatch(candidate, pattern):
            return AccessDecision(True)

    return AccessDecision(False, f"Path '{path}' not in allow list")
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def check_command(policy: AgentPolicy, command: str) -> AccessDecision:
    """Decide whether ``command`` is permitted by the agent's command rules.

    A rule matches when the command equals it or begins with it followed by
    further arguments. Deny rules are evaluated first and win over allow
    rules; a command that matches neither list is rejected.

    Args:
        policy: Agent policy whose ``commands.allow`` / ``commands.deny``
            rules are consulted.
        command: Command line to check.

    Returns:
        An ``AccessDecision``; ``reason`` is filled in on rejection and quotes
        the command exactly as the caller supplied it.
    """
    # Compare on whitespace-normalised text. Matching the raw string let a
    # multi-word rule such as "rm -rf" be bypassed by a leading space, a
    # doubled space, or by issuing the rule with no further arguments.
    normalized = " ".join(command.split())

    def _matches(rule: str) -> bool:
        rule_text = " ".join(rule.split())
        return normalized == rule_text or normalized.startswith(rule_text + " ")

    # NOTE(review): only the leading words of the command line are inspected.
    # Shell operators (";", "&&", "|", "$(...)") are not parsed here, so a
    # chained command is judged by its first segment only.

    # Deny takes precedence
    for denied in policy.commands.deny:
        if _matches(denied):
            return AccessDecision(False, f"Command '{command}' denied by rule '{denied}'")

    for allowed in policy.commands.allow:
        if _matches(allowed):
            return AccessDecision(True)

    return AccessDecision(False, f"Command '{command}' not in allow list")
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""Budget tracker: token, step, and time limits per agent."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import time
|
|
6
|
+
|
|
7
|
+
from .policy import BudgetConfig
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class BudgetExhaustedError(Exception):
    """Raised when an agent exceeds its token, step, or wall-time budget."""
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class BudgetTracker:
    """Counts tokens, steps, and elapsed time against a ``BudgetConfig``."""

    def __init__(self, config: BudgetConfig) -> None:
        self._config = config
        # None until start_timer() is called; time checks are skipped until then.
        self._start_time: float | None = None
        self._tokens_used: int = 0
        self._steps_used: int = 0

    def start_timer(self) -> None:
        """Record the current monotonic time as the start of the time budget."""
        self._start_time = time.monotonic()

    def consume_tokens(self, count: int) -> None:
        """Add ``count`` tokens to the running total.

        Raises:
            BudgetExhaustedError: If the total now exceeds
                ``max_tokens_per_task``. The total keeps the added amount.
        """
        self._tokens_used += count
        limit = self._config.max_tokens_per_task
        if self._tokens_used <= limit:
            return
        raise BudgetExhaustedError(
            f"Token budget exhausted: {self._tokens_used}/{limit}"
        )

    def consume_step(self) -> None:
        """Count one step.

        Raises:
            BudgetExhaustedError: If the step count now exceeds
                ``max_steps_per_epoch``.
        """
        self._steps_used += 1
        limit = self._config.max_steps_per_epoch
        if self._steps_used <= limit:
            return
        raise BudgetExhaustedError(
            f"Step budget exhausted: {self._steps_used}/{limit}"
        )

    def check_time(self) -> None:
        """Raise ``BudgetExhaustedError`` once the wall-time budget is exceeded.

        Does nothing when the timer has not been started.
        """
        if self._start_time is None:
            return
        minutes = (time.monotonic() - self._start_time) / 60.0
        ceiling = self._config.max_wall_time_minutes
        if minutes > ceiling:
            raise BudgetExhaustedError(
                f"Time budget exhausted: {minutes:.1f}/{ceiling} min"
            )

    @property
    def tokens_used(self) -> int:
        """Tokens consumed so far."""
        return self._tokens_used

    @property
    def steps_used(self) -> int:
        """Steps consumed so far."""
        return self._steps_used

    @property
    def elapsed_minutes(self) -> float:
        """Minutes since ``start_timer()``; ``0.0`` if it was never called."""
        started = self._start_time
        return 0.0 if started is None else (time.monotonic() - started) / 60.0
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
"""Harness: unified guard layer composing ACL, budget, validator, and sandbox."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from .budget import BudgetTracker
|
|
6
|
+
from .policy import AgentPolicy, BudgetConfig, HarnessPolicy, load_policy
|
|
7
|
+
from .rollback import RollbackManager
|
|
8
|
+
from .sandbox import LocalSandbox, SandboxResult
|
|
9
|
+
from .validator import Action, ValidationResult, validate_action
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class Harness:
    """Policy-aware action interceptor wrapping all guard layers.

    Combines policy lookup, per-agent budget trackers, the action validator,
    the local sandbox, and the Git rollback manager behind one object.
    """

    def __init__(self, policy_path: str, sandbox_root: str = ".") -> None:
        """Load the policy file at ``policy_path`` and set up the guards.

        ``sandbox_root`` is handed to both the sandbox and the rollback manager.
        """
        self._policy = load_policy(policy_path)
        self._sandbox = LocalSandbox(sandbox_root)
        self._budgets: dict[str, BudgetTracker] = {}
        self._rollback = RollbackManager(sandbox_root)

    @classmethod
    def from_policy(cls, policy: HarnessPolicy, sandbox_root: str = ".") -> Harness:
        """Create a Harness directly from a HarnessPolicy object."""
        # Bypass __init__ so that no policy file has to be read from disk.
        instance = cls.__new__(cls)
        instance._policy = policy
        instance._sandbox = LocalSandbox(sandbox_root)
        instance._budgets = {}
        instance._rollback = RollbackManager(sandbox_root)
        return instance

    @property
    def policy(self) -> HarnessPolicy:
        """The loaded harness policy."""
        return self._policy

    def _get_budget(self, agent_id: str) -> BudgetTracker:
        """Return the tracker for ``agent_id``, creating and starting it on first use.

        Agents without a resolvable policy get a default ``BudgetConfig``.
        """
        if agent_id not in self._budgets:
            agent_policy = self._resolve_policy(agent_id)
            config = agent_policy.budget if agent_policy else BudgetConfig()
            self._budgets[agent_id] = BudgetTracker(config)
            self._budgets[agent_id].start_timer()
        return self._budgets[agent_id]

    def _normalize_path(self, path: str) -> str:
        """Make path relative to sandbox_root for ACL pattern matching.

        Relative targets are anchored at the sandbox root, which is also where
        the sandbox reads and writes them. A target that resolves outside the
        root is returned as its resolved absolute path.
        """
        from pathlib import Path

        root = Path(self._sandbox._root).resolve()
        # Joining onto the root anchors relative targets there; an absolute
        # target replaces the root in the join and is kept as given.
        resolved = (root / path).resolve()
        try:
            return str(resolved.relative_to(root))
        except ValueError:
            # Return the resolved form, never the raw string: a raw
            # "src/api/../../etc/passwd" would still match an allow pattern
            # such as "src/api/**" even though it points outside the root.
            return str(resolved)

    def _resolve_policy(self, agent_id: str) -> AgentPolicy | None:
        """Lookup agent policy with fallback to role prefix (e.g. backend-2 -> backend_agent)."""
        if agent_id in self._policy.agents:
            return self._policy.agents[agent_id]
        import re

        # Drop everything from the first hyphen on (backend-test -> backend),
        # then try the bare role, the role with an "_agent" suffix, and the
        # role with a trailing "_agent" removed.
        role = re.sub(r'-[\w-]+$', '', agent_id)  # strip suffix like -0, -test
        candidates = [role, role + '_agent', re.sub(r'_agent$', '', role)]
        for c in candidates:
            if c in self._policy.agents:
                return self._policy.agents[c]
        return None

    def validate_action(self, agent_id: str, action: Action) -> ValidationResult:
        """Validate ``action`` against the policy resolved for ``agent_id``.

        Returns a rejected result naming the agent when no policy can be
        resolved. The caller's ``action`` object is not modified; validation
        runs on a copy whose target has been normalised.
        """
        agent_policy = self._resolve_policy(agent_id)
        if agent_policy is None:
            return ValidationResult(allowed=False, errors=[f"Unknown agent '{agent_id}'"])
        # Normalize path for ACL matching
        normalized = action.model_copy()
        if normalized.target:
            normalized.target = self._normalize_path(normalized.target)
        return validate_action(agent_policy, normalized)

    def execute_action(self, agent_id: str, action: Action) -> SandboxResult:
        """Validate ``action`` and, if allowed, execute it in the sandbox.

        Returns:
            A result with exit code ``-1`` and an "Action blocked: ..." message
            in ``stderr`` when validation fails; otherwise the result of the
            command, file write, or file read. Action types with nothing to do
            yield an empty successful result.

        Raises:
            BudgetExhaustedError: Propagated from the budget tracker when the
                step or wall-time budget is exceeded.
        """
        validation = self.validate_action(agent_id, action)
        if not validation.allowed:
            return SandboxResult(-1, "", f"Action blocked: {'; '.join(validation.errors)}")

        # Only actions that passed validation count against the budget.
        budget = self._get_budget(agent_id)
        budget.consume_step()
        budget.check_time()

        # A command takes priority over any file operation on the same action.
        if action.command:
            return self._sandbox.run_command(action.command)

        if action.action_type in ("CODE_WRITE", "FILE_WRITE") and action.content:
            self._sandbox.write_file(action.target, action.content)
            return SandboxResult(0, f"Wrote {action.target}", "")

        if action.action_type == "FILE_READ":
            content = self._sandbox.read_file(action.target)
            return SandboxResult(0, content, "")

        return SandboxResult(0, "", "")

    def check_budget(self, agent_id: str) -> dict[str, int | float]:
        """Return the agent's usage counters.

        Keys are ``tokens_used``, ``steps_used`` and ``elapsed_minutes``
        (rounded to two decimals). Creates the tracker if it does not exist.
        """
        budget = self._get_budget(agent_id)
        return {
            "tokens_used": budget.tokens_used,
            "steps_used": budget.steps_used,
            "elapsed_minutes": round(budget.elapsed_minutes, 2),
        }

    def get_agent_policy(self, agent_id: str) -> AgentPolicy | None:
        """Return the policy resolved for ``agent_id``, or ``None``."""
        return self._resolve_policy(agent_id)

    # ------------------------------------------------------------------
    # Rollback API
    # ------------------------------------------------------------------

    def snapshot(self, tag: str = "maestro-rollback") -> str | None:
        """Create a Git snapshot before an execution epoch."""
        return self._rollback.snapshot(tag)

    def rollback(self) -> bool:
        """Reset the working tree to the last snapshot."""
        return self._rollback.rollback()

    def clear_snapshot(self) -> None:
        """Remove the snapshot tag after a successful epoch."""
        self._rollback.clear_snapshot()
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""Policy models and loader for the Agent Harness."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
import yaml
|
|
8
|
+
from pydantic import BaseModel, Field
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class FilesystemRule(BaseModel):
    """Allow and deny pattern lists for file paths."""

    # Patterns a path may match to be permitted; empty by default.
    allow: list[str] = Field(default_factory=list)
    # Patterns that reject a path; empty by default.
    deny: list[str] = Field(default_factory=list)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class CommandRule(BaseModel):
    """Allow and deny rule lists for commands."""

    # Commands an agent may run; empty by default.
    allow: list[str] = Field(default_factory=list)
    # Commands that are rejected; empty by default.
    deny: list[str] = Field(default_factory=list)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class BudgetConfig(BaseModel):
    """Per-agent limits on tokens, steps, and wall-clock minutes."""

    max_tokens_per_task: int = 50000
    max_steps_per_epoch: int = 30
    max_wall_time_minutes: int = 10
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class HumanGateRule(BaseModel):
    """Pairs an action pattern with an approval flag."""

    # Required; no default is provided.
    action_pattern: str
    requires_approval: bool = True
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class AgentPolicy(BaseModel):
    """Complete rule set for one agent; every section has a default."""

    filesystem: FilesystemRule = Field(default_factory=FilesystemRule)
    commands: CommandRule = Field(default_factory=CommandRule)
    budget: BudgetConfig = Field(default_factory=BudgetConfig)
    rollback: bool = True
    human_gate: list[HumanGateRule] = Field(default_factory=list)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class HarnessPolicy(BaseModel):
    """Top-level policy document: agent policies keyed by agent name."""

    agents: dict[str, AgentPolicy] = Field(default_factory=dict)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def load_policy(path: str) -> HarnessPolicy:
    """Read the YAML file at ``path`` and validate it into a ``HarnessPolicy``.

    An empty file yields a policy with no agents.

    Raises:
        OSError: If the file cannot be read.
        yaml.YAMLError: If the file is not valid YAML.
        pydantic.ValidationError: If the document does not fit the model.
    """
    # Read as UTF-8 explicitly so the result does not depend on the locale.
    text = Path(path).read_text(encoding="utf-8")
    data = yaml.safe_load(text)
    # An empty document parses to None, which the model would reject.
    return HarnessPolicy.model_validate({} if data is None else data)
|
|
File without changes
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
"""Git-based snapshot and rollback for the Agent Harness."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from git import InvalidGitRepositoryError, Repo
|
|
9
|
+
|
|
10
|
+
logger = logging.getLogger(__name__)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class RollbackManager:
    """Manages Git snapshots taken before agent execution epochs.

    A snapshot is a tag placed on the current ``HEAD`` commit. Rolling back
    resets both the index and the working tree to the tagged commit. When the
    path is not a usable Git repository, snapshot and rollback do nothing.
    """

    def __init__(self, repo_path: str = ".") -> None:
        """Open the repository at ``repo_path``; disable rollback if there is none."""
        # Imported locally so this class does not rely on a module-level import.
        from git.exc import NoSuchPathError

        self._repo_path = Path(repo_path).resolve()
        self._repo: Repo | None = None
        self._snapshot_tag: str | None = None
        self._original_branch: str | None = None
        try:
            self._repo = Repo(self._repo_path)
        except (InvalidGitRepositoryError, NoSuchPathError):
            # A missing directory raises NoSuchPathError rather than
            # InvalidGitRepositoryError; both mean there is no repository.
            logger.warning("No Git repository found at %s; rollback disabled", self._repo_path)

    @property
    def enabled(self) -> bool:
        """True when a non-bare repository was opened."""
        return self._repo is not None and not self._repo.bare

    def snapshot(self, tag: str = "maestro-rollback") -> str | None:
        """Tag the current ``HEAD`` for later rollback and return the tag name.

        If the repo has uncommitted changes (including untracked files), they
        are stashed first.
        A lightweight tag is created for easy recovery.

        Returns:
            The tag name, ``"<tag>-<first 8 hex digits of HEAD>"``, or ``None``
            when rollback is disabled or the snapshot could not be created.
        """
        if not self.enabled or self._repo is None:
            return None

        try:
            # Stash any existing changes to keep the tree clean
            # NOTE(review): nothing in this class pops the stash again, so the
            # stashed changes stay in the stash list until restored by hand.
            if self._repo.is_dirty(untracked_files=True):
                self._repo.git.stash("push", "-u", "-m", f"maestro-auto-stash-{tag}")

            # Create a tag on HEAD for rollback
            head = self._repo.head.commit
            tag_name = f"{tag}-{head.hexsha[:8]}"

            # Remove old tag if it exists
            for existing in list(self._repo.tags):
                if existing.name == tag_name:
                    self._repo.delete_tag(existing)

            self._repo.create_tag(tag_name, ref=head)
            self._snapshot_tag = tag_name
            logger.info("Rollback snapshot created: %s", tag_name)
            return tag_name
        except Exception as exc:
            # Best effort: a failed snapshot is reported, not raised.
            logger.warning("Failed to create snapshot: %s", exc)
            return None

    def rollback(self, tag: str | None = None) -> bool:
        """Reset the index and working tree to the snapshot tag.

        Args:
            tag: Tag to reset to; defaults to the tag from the last
                ``snapshot()`` call.

        Returns True if rollback succeeded.
        """
        if not self.enabled or self._repo is None:
            return False

        target_tag = tag or self._snapshot_tag
        if target_tag is None:
            logger.warning("No snapshot tag available for rollback")
            return False

        try:
            commit = self._repo.tags[target_tag].commit
            # NOTE(review): this is a reset only; no clean step is run, so
            # untracked files created after the snapshot presumably remain.
            self._repo.head.reset(commit, index=True, working_tree=True)
            logger.info("Rolled back to %s", target_tag)
            return True
        except Exception as exc:
            logger.warning("Rollback failed: %s", exc)
            return False

    def clear_snapshot(self) -> None:
        """Remove the snapshot tag after a successful epoch."""
        if self._repo is not None and self._snapshot_tag:
            try:
                for tag in list(self._repo.tags):
                    if tag.name == self._snapshot_tag:
                        self._repo.delete_tag(tag)
            except Exception as exc:
                logger.debug("Failed to delete snapshot tag: %s", exc)
        self._snapshot_tag = None
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""Sandbox abstraction for isolated agent execution."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import subprocess
|
|
6
|
+
from typing import Protocol
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class SandboxResult:
    """Exit code plus captured stdout and stderr of one sandbox operation."""

    # Fixed attribute set: instances carry no per-instance __dict__.
    __slots__ = ("exit_code", "stdout", "stderr")

    def __init__(self, exit_code: int, stdout: str, stderr: str) -> None:
        self.exit_code, self.stdout, self.stderr = exit_code, stdout, stderr
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class SandboxProvider(Protocol):
    """Structural interface of a sandbox backend.

    ``LocalSandbox`` provides these same three methods.
    """

    def run_command(self, command: str, cwd: str | None = None, timeout: int = 60) -> SandboxResult:
        ...

    def write_file(self, path: str, content: str) -> None:
        ...

    def read_file(self, path: str) -> str:
        ...
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class LocalSandbox:
    """Subprocess-based sandbox with directory restriction (Phase 1).

    File reads and writes are confined to ``allowed_root``; commands run with
    that directory as their default working directory.
    """

    def __init__(self, allowed_root: str) -> None:
        self._root = allowed_root

    def _confined(self, path: str) -> str:
        """Return the absolute form of ``path``, refusing anything outside the root.

        Raises:
            PermissionError: If ``path`` resolves to a location outside the
                sandbox root (absolute path elsewhere, or ``..`` traversal).
        """
        from pathlib import Path

        root = Path(self._root).resolve()
        # Joining anchors relative paths at the root; an absolute path
        # replaces the root in the join and is then checked like any other.
        target = (root / path).resolve()
        if not target.is_relative_to(root):
            raise PermissionError(f"Path '{path}' is outside sandbox root '{root}'")
        return str(target)

    def run_command(self, command: str, cwd: str | None = None, timeout: int = 60) -> SandboxResult:
        """Run ``command`` through the shell and capture its output.

        Args:
            command: Shell command line.
            cwd: Working directory; defaults to the sandbox root.
            timeout: Seconds to wait before giving up.

        Returns:
            The process exit code with captured stdout/stderr, or exit code
            ``-1`` with a timeout message in ``stderr``.
        """
        work_dir = cwd or self._root
        try:
            # NOTE(review): shell=True hands the whole string to the shell, so
            # operators such as ";" and "&&" are honoured. Callers are expected
            # to have vetted the command before it reaches this point.
            proc = subprocess.run(
                command,
                shell=True,
                capture_output=True,
                text=True,
                cwd=work_dir,
                timeout=timeout,
            )
            return SandboxResult(proc.returncode, proc.stdout, proc.stderr)
        except subprocess.TimeoutExpired:
            return SandboxResult(-1, "", f"Command timed out after {timeout}s")

    def write_file(self, path: str, content: str) -> None:
        """Write ``content`` to ``path`` as UTF-8, creating parent directories.

        Raises:
            PermissionError: If ``path`` lies outside the sandbox root.
        """
        from pathlib import Path

        target = Path(self._confined(path))
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_text(content, encoding="utf-8")

    def read_file(self, path: str) -> str:
        """Return the UTF-8 text of ``path``.

        Raises:
            PermissionError: If ``path`` lies outside the sandbox root.
        """
        from pathlib import Path

        return Path(self._confined(path)).read_text(encoding="utf-8")
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
"""Action validator: pre-execution safety checks."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
|
|
7
|
+
from pydantic import BaseModel
|
|
8
|
+
|
|
9
|
+
from .acl import check_command, check_file_access
|
|
10
|
+
from .policy import AgentPolicy
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class Action(BaseModel):
    """One action an agent wants to perform; only ``agent_id`` is required."""

    agent_id: str
    # The validator recognises "CODE_WRITE", "FILE_WRITE", "FILE_DELETE" and
    # "FILE_READ" as file actions.
    action_type: str = ""
    # File path the action applies to.
    target: str = ""
    # Text scanned for dangerous patterns.
    content: str = ""
    # Command line checked against the command rules.
    command: str = ""
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class ValidationResult(BaseModel):
    """Verdict of a validation run together with the reasons for rejection."""

    allowed: bool = True
    # One message per failed check; empty when the action is allowed.
    errors: list[str] = []
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# Regular expressions whose presence in submitted text causes rejection.
DANGEROUS_PATTERNS = [
    re.compile(source)
    for source in (
        r"\beval\s*\(",
        r"\bos\.system\s*\(",
        r"\bsubprocess\.call\s*\(.*shell\s*=\s*True",
        r"\brm\s+-rf\s+/",
        r"\bos\.remove\s*\(\s*[\"']/?",
    )
]

# A FILE_DELETE whose content spans more lines than this is rejected.
MAX_DELETE_LINES = 50
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def validate_action(policy: AgentPolicy, action: Action) -> ValidationResult:
    """Run every pre-execution safety check on ``action``.

    Checks, in order: file ACL (file actions with a target), command ACL
    (actions with a command), dangerous-pattern scan of the content and the
    command, and the size limit for deletions. All failures are collected
    rather than stopping at the first one.

    Args:
        policy: Policy of the agent performing the action.
        action: The action to validate.

    Returns:
        A ``ValidationResult`` that is allowed only when no check failed;
        ``errors`` holds one message per failed check.
    """
    errors: list[str] = []

    # File access check
    if action.target and action.action_type in (
        "CODE_WRITE", "FILE_WRITE", "FILE_DELETE", "FILE_READ"
    ):
        decision = check_file_access(policy, action.target)
        if not decision.allowed:
            errors.append(decision.reason)

    # Command check
    if action.command:
        decision = check_command(policy, action.command)
        if not decision.allowed:
            errors.append(decision.reason)

    # Dangerous pattern check. The command is scanned as well as the content:
    # it is the text that actually gets executed, and patterns such as
    # "rm -rf /" describe shell input rather than file content.
    scanned = (action.content, action.command)
    for pattern in DANGEROUS_PATTERNS:
        if any(pattern.search(text) for text in scanned):
            errors.append(f"Dangerous pattern detected: {pattern.pattern}")

    # Large deletion check
    if action.action_type == "FILE_DELETE" and action.content:
        deleted_lines = action.content.count("\n") + 1
        if deleted_lines > MAX_DELETE_LINES:
            errors.append(
                f"Deletion of {deleted_lines} lines exceeds threshold of {MAX_DELETE_LINES}"
            )

    return ValidationResult(allowed=len(errors) == 0, errors=errors)
|
|
File without changes
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
"""Shared fixtures for maestro-harness tests."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
import pytest
|
|
6
|
+
|
|
7
|
+
from maestro_harness.policy import (
|
|
8
|
+
AgentPolicy,
|
|
9
|
+
BudgetConfig,
|
|
10
|
+
CommandRule,
|
|
11
|
+
FilesystemRule,
|
|
12
|
+
HarnessPolicy,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@pytest.fixture
def sample_policy() -> HarnessPolicy:
    """Policy with a backend agent and a frontend agent for the test suite."""
    backend = AgentPolicy(
        filesystem=FilesystemRule(
            allow=["src/api/**", "tests/api/**"],
            deny=["src/ui/**", "*.env"],
        ),
        commands=CommandRule(
            allow=["python", "pytest", "pip"],
            deny=["rm -rf", "curl", "wget"],
        ),
        # Small limits so budget exhaustion is cheap to trigger in tests.
        budget=BudgetConfig(
            max_tokens_per_task=100,
            max_steps_per_epoch=5,
            max_wall_time_minutes=1,
        ),
    )
    # The frontend agent keeps the default budget.
    frontend = AgentPolicy(
        filesystem=FilesystemRule(
            allow=["src/ui/**", "public/**"],
            deny=["src/api/**"],
        ),
        commands=CommandRule(
            allow=["npm", "npx", "jest"],
            deny=["curl", "sudo"],
        ),
    )
    return HarnessPolicy(agents={"backend_agent": backend, "frontend_agent": frontend})
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
@pytest.fixture
def policy_file(tmp_path: Path, sample_policy: HarnessPolicy) -> str:
    """Write ``sample_policy`` as YAML into ``tmp_path`` and return the file path."""
    import yaml

    serialized = yaml.dump(sample_policy.model_dump(), default_flow_style=False)
    destination = tmp_path / "harness_policy.yaml"
    destination.write_text(serialized)
    return str(destination)
|
|
@@ -0,0 +1,254 @@
|
|
|
1
|
+
"""Tests for maestro-harness: policy, ACL, budget, validator, harness."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
import pytest
|
|
6
|
+
|
|
7
|
+
from maestro_harness.acl import check_command, check_file_access
|
|
8
|
+
from maestro_harness.budget import BudgetExhaustedError, BudgetTracker
|
|
9
|
+
from maestro_harness.harness import Harness
|
|
10
|
+
from maestro_harness.policy import BudgetConfig, HarnessPolicy, load_policy
|
|
11
|
+
from maestro_harness.validator import Action, validate_action
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class TestPolicyLoader:
    """Loading a policy from a YAML file on disk."""

    def test_load_from_file(self, policy_file: str) -> None:
        agents = load_policy(policy_file).agents
        for expected in ("backend_agent", "frontend_agent"):
            assert expected in agents

    def test_backend_filesystem_rules(self, policy_file: str) -> None:
        rules = load_policy(policy_file).agents["backend_agent"].filesystem
        assert "src/api/**" in rules.allow
        assert "src/ui/**" in rules.deny
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class TestACL:
    """File and command permission checks against the backend agent's rules."""

    def test_allow_matching_path(self, sample_policy: HarnessPolicy) -> None:
        backend = sample_policy.agents["backend_agent"]
        assert check_file_access(backend, "src/api/auth.py").allowed

    def test_deny_takes_precedence(self, sample_policy: HarnessPolicy) -> None:
        backend = sample_policy.agents["backend_agent"]
        decision = check_file_access(backend, "src/ui/app.tsx")
        assert not decision.allowed
        assert "denied" in decision.reason

    def test_no_match_denied(self, sample_policy: HarnessPolicy) -> None:
        backend = sample_policy.agents["backend_agent"]
        assert not check_file_access(backend, "random/file.txt").allowed

    def test_env_denied(self, sample_policy: HarnessPolicy) -> None:
        backend = sample_policy.agents["backend_agent"]
        assert not check_file_access(backend, ".env").allowed

    def test_command_allowed(self, sample_policy: HarnessPolicy) -> None:
        backend = sample_policy.agents["backend_agent"]
        assert check_command(backend, "python main.py").allowed

    def test_command_denied(self, sample_policy: HarnessPolicy) -> None:
        backend = sample_policy.agents["backend_agent"]
        assert not check_command(backend, "rm -rf /").allowed

    def test_command_not_in_list(self, sample_policy: HarnessPolicy) -> None:
        backend = sample_policy.agents["backend_agent"]
        assert not check_command(backend, "docker build .").allowed
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
class TestBudget:
    """Token, step, and time accounting of ``BudgetTracker``."""

    def test_consume_tokens(self) -> None:
        config = BudgetConfig(max_tokens_per_task=100)
        tracker = BudgetTracker(config)
        tracker.consume_tokens(50)
        assert tracker.tokens_used == 50

    def test_token_exhaustion(self) -> None:
        config = BudgetConfig(max_tokens_per_task=10)
        tracker = BudgetTracker(config)
        with pytest.raises(BudgetExhaustedError, match="Token"):
            tracker.consume_tokens(20)

    def test_step_exhaustion(self) -> None:
        config = BudgetConfig(max_steps_per_epoch=2)
        tracker = BudgetTracker(config)
        # The first two steps fit the budget; the third must raise.
        for _ in range(2):
            tracker.consume_step()
        with pytest.raises(BudgetExhaustedError, match="Step"):
            tracker.consume_step()

    def test_elapsed_tracking(self) -> None:
        tracker = BudgetTracker(BudgetConfig())
        tracker.start_timer()
        assert tracker.elapsed_minutes >= 0
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
class TestValidator:
    """Pre-execution checks run by ``validate_action`` for the backend agent."""

    @staticmethod
    def _backend_write(**fields: str) -> Action:
        """Build a CODE_WRITE action for the backend agent from ``fields``."""
        return Action(agent_id="backend_agent", action_type="CODE_WRITE", **fields)

    def test_valid_write(self, sample_policy: HarnessPolicy) -> None:
        backend = sample_policy.agents["backend_agent"]
        action = self._backend_write(target="src/api/auth.py", content="def login(): pass")
        assert validate_action(backend, action).allowed

    def test_invalid_path(self, sample_policy: HarnessPolicy) -> None:
        backend = sample_policy.agents["backend_agent"]
        action = self._backend_write(target="src/ui/app.tsx")
        assert not validate_action(backend, action).allowed

    def test_dangerous_eval(self, sample_policy: HarnessPolicy) -> None:
        backend = sample_policy.agents["backend_agent"]
        action = self._backend_write(
            target="src/api/evil.py",
            content="result = eval(user_input)",
        )
        verdict = validate_action(backend, action)
        assert not verdict.allowed
        assert any("Dangerous" in message for message in verdict.errors)

    def test_dangerous_os_system(self, sample_policy: HarnessPolicy) -> None:
        backend = sample_policy.agents["backend_agent"]
        action = self._backend_write(
            target="src/api/cmd.py",
            content="os.system('rm -rf /')",
        )
        assert not validate_action(backend, action).allowed

    def test_dangerous_command(self, sample_policy: HarnessPolicy) -> None:
        backend = sample_policy.agents["backend_agent"]
        action = self._backend_write(
            target="src/api/x.py",
            content="pass",
            command="rm -rf /tmp/anything",
        )
        assert not validate_action(backend, action).allowed
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
class TestHarness:
    """End-to-end behaviour of ``Harness``: validation, budget, file and shell actions."""

    AGENT = "backend_agent"

    @classmethod
    def _action(cls, action_type: str, **fields: str) -> Action:
        """Build an action of ``action_type`` for the backend agent."""
        return Action(agent_id=cls.AGENT, action_type=action_type, **fields)

    def test_validate_known_agent(self, policy_file: str) -> None:
        harness = Harness(policy_file)
        action = self._action("CODE_WRITE", target="src/api/auth.py")
        assert harness.validate_action("backend_agent", action).allowed

    def test_validate_unknown_agent(self, policy_file: str) -> None:
        harness = Harness(policy_file)
        action = Action(agent_id="unknown", action_type="CODE_WRITE", target="f.py")
        verdict = harness.validate_action("unknown", action)
        assert not verdict.allowed
        assert "Unknown" in verdict.errors[0]

    def test_execute_blocked(self, policy_file: str) -> None:
        harness = Harness(policy_file)
        action = self._action("CODE_WRITE", target="src/ui/app.tsx")
        outcome = harness.execute_action("backend_agent", action)
        assert outcome.exit_code == -1
        assert "blocked" in outcome.stderr

    def test_budget_tracking(self, policy_file: str) -> None:
        harness = Harness(policy_file)
        action = self._action("CODE_WRITE", target="src/api/auth.py", content="pass")
        harness.execute_action("backend_agent", action)
        usage = harness.check_budget("backend_agent")
        assert usage["steps_used"] == 1

    def test_file_read_allowed(self, policy_file: str, tmp_path: Path) -> None:
        harness = Harness(policy_file, sandbox_root=str(tmp_path))
        source_file = tmp_path / "src" / "api" / "auth.py"
        source_file.parent.mkdir(parents=True, exist_ok=True)
        source_file.write_text("def login(): pass")
        action = self._action("FILE_READ", target=str(source_file))
        outcome = harness.execute_action("backend_agent", action)
        assert outcome.exit_code == 0
        assert "def login(): pass" in outcome.stdout

    def test_file_read_blocked_by_acl(self, policy_file: str) -> None:
        harness = Harness(policy_file)
        action = self._action("FILE_READ", target="src/ui/app.tsx")
        outcome = harness.execute_action("backend_agent", action)
        assert outcome.exit_code == -1
        assert "blocked" in outcome.stderr

    def test_file_write_and_read_roundtrip(self, policy_file: str, tmp_path: Path) -> None:
        harness = Harness(policy_file, sandbox_root=str(tmp_path))
        destination = str(tmp_path / "src" / "api" / "new.py")
        harness.execute_action(
            "backend_agent",
            self._action("FILE_WRITE", target=destination, content="x = 42"),
        )
        outcome = harness.execute_action(
            "backend_agent",
            self._action("FILE_READ", target=destination),
        )
        assert outcome.exit_code == 0
        assert "x = 42" in outcome.stdout

    def test_shell_exec_allowed(self, policy_file: str, tmp_path: Path) -> None:
        harness = Harness(policy_file, sandbox_root=str(tmp_path))
        action = self._action("SHELL_EXEC", target="", command="python -c 'print(hello)'")
        outcome = harness.execute_action("backend_agent", action)
        assert outcome.exit_code != 0  # NameError, but command was allowed to run
        assert "hello" in outcome.stderr or "NameError" in outcome.stderr

    def test_shell_exec_blocked(self, policy_file: str) -> None:
        harness = Harness(policy_file)
        action = self._action("SHELL_EXEC", target="", command="curl https://evil.com")
        outcome = harness.execute_action("backend_agent", action)
        assert outcome.exit_code == -1
        assert "blocked" in outcome.stderr
|