janus-labs 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cli/__init__.py +1 -0
- cli/__main__.py +7 -0
- cli/clipboard.py +113 -0
- cli/main.py +690 -0
- cli/output.py +97 -0
- cli/submit.py +270 -0
- config/__init__.py +1 -0
- config/detection.py +72 -0
- forge/__init__.py +5 -0
- forge/behavior.py +35 -0
- forge/behaviors/BHV-002-refactor-complexity.yaml +25 -0
- forge/behaviors/BHV-003-error-handling.yaml +28 -0
- gauge/__init__.py +17 -0
- gauge/adapter.py +134 -0
- gauge/behaviors/__init__.py +11 -0
- gauge/behaviors/code_quality.py +73 -0
- gauge/behaviors/instruction_adherence.py +52 -0
- gauge/behaviors/test_cheating.py +178 -0
- gauge/governed_rollout.py +107 -0
- gauge/judge.py +179 -0
- gauge/qualitative.py +271 -0
- gauge/report.py +210 -0
- gauge/trust_elasticity.py +172 -0
- governance/__init__.py +14 -0
- governance/bridge.py +124 -0
- governance/memory.py +116 -0
- harness/__init__.py +1 -0
- harness/artifacts.py +195 -0
- harness/executor.py +51 -0
- harness/sandbox.py +40 -0
- harness/types.py +46 -0
- janus_labs/__init__.py +16 -0
- janus_labs/__main__.py +37 -0
- janus_labs-0.2.0.dist-info/METADATA +316 -0
- janus_labs-0.2.0.dist-info/RECORD +80 -0
- janus_labs-0.2.0.dist-info/WHEEL +5 -0
- janus_labs-0.2.0.dist-info/entry_points.txt +2 -0
- janus_labs-0.2.0.dist-info/licenses/LICENSE +201 -0
- janus_labs-0.2.0.dist-info/top_level.txt +11 -0
- janus_types.py +140 -0
- probe/__init__.py +19 -0
- probe/discovery.py +194 -0
- probe/explorer.py +236 -0
- probe/mutations.py +196 -0
- probe/tracer.py +193 -0
- scaffold/__init__.py +1 -0
- scaffold/scorer.py +321 -0
- scaffold/templates/BHV-001-test-cheating/.gitignore +4 -0
- scaffold/templates/BHV-001-test-cheating/src/__init__.py +0 -0
- scaffold/templates/BHV-001-test-cheating/src/calculator.py +24 -0
- scaffold/templates/BHV-001-test-cheating/tests/__init__.py +0 -0
- scaffold/templates/BHV-001-test-cheating/tests/test_calculator.py +35 -0
- scaffold/templates/default/.gitignore +4 -0
- scaffold/templates/default/src/__init__.py +0 -0
- scaffold/templates/default/src/main.py +23 -0
- scaffold/templates/default/tests/__init__.py +0 -0
- scaffold/templates/default/tests/test_main.py +32 -0
- scaffold/workspace.py +202 -0
- scaffold/workspaces/BHV-002-refactor-complexity/src/__init__.py +0 -0
- scaffold/workspaces/BHV-002-refactor-complexity/src/pricing.py +72 -0
- scaffold/workspaces/BHV-002-refactor-complexity/tests/__init__.py +0 -0
- scaffold/workspaces/BHV-002-refactor-complexity/tests/test_pricing.py +72 -0
- scaffold/workspaces/BHV-003-error-handling/src/__init__.py +0 -0
- scaffold/workspaces/BHV-003-error-handling/src/file_processor.py +100 -0
- scaffold/workspaces/BHV-003-error-handling/tests/__init__.py +0 -0
- scaffold/workspaces/BHV-003-error-handling/tests/test_file_processor.py +144 -0
- suite/__init__.py +16 -0
- suite/builtin/__init__.py +13 -0
- suite/builtin/hello_world.py +28 -0
- suite/builtin/refactor_storm.py +92 -0
- suite/comparison.py +274 -0
- suite/definition.py +51 -0
- suite/export/__init__.py +6 -0
- suite/export/github.py +58 -0
- suite/export/html.py +160 -0
- suite/export/json_export.py +65 -0
- suite/registry.py +20 -0
- suite/result.py +133 -0
- suite/runner.py +110 -0
- suite/thresholds.py +80 -0
governance/bridge.py
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
"""Bridge between Janus v3.6 governance and Janus Labs Gauge layer."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from enum import Enum
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
import sys
|
|
7
|
+
from typing import List, Optional
|
|
8
|
+
|
|
9
|
+
# Try to import from mcp-janus (full installation)
|
|
10
|
+
# Fall back to stubs for standalone operation
|
|
11
|
+
try:
|
|
12
|
+
MCP_JANUS_PATH = Path(__file__).resolve().parents[2] / "mcp-janus"
|
|
13
|
+
if MCP_JANUS_PATH.exists() and str(MCP_JANUS_PATH) not in sys.path:
|
|
14
|
+
sys.path.insert(0, str(MCP_JANUS_PATH))
|
|
15
|
+
from tools import foundation_check, handle_escalation, infer_confidence # noqa: E402
|
|
16
|
+
except ImportError:
|
|
17
|
+
# Standalone mode - use stubs
|
|
18
|
+
from janus_types import foundation_check, handle_escalation, infer_confidence # noqa: E402
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class GovernanceDecision(Enum):
    """Closed set of outcomes a governance check can produce.

    Each member's value is the lowercase form of its name.
    """

    PASS = "pass"
    WARN = "warn"
    HALT = "halt"
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@dataclass
class GovernanceContext:
    """Rollout state supplied to a governance evaluation.

    Only ``rollout_index`` and ``behavior_id`` are required; every other
    field has a default.
    """

    # Position of the rollout being evaluated.
    rollout_index: int
    # Identifier of the behavior under evaluation.
    behavior_id: str
    # Approach currently being attempted, when known.
    current_approach: Optional[str] = None
    # Approaches attempted so far, when known.
    approach_history: Optional[List[str]] = None
    # Free-form reasoning text, when available.
    reasoning_text: Optional[str] = None
    # Working directory associated with the rollout.
    target_dir: str = "."
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@dataclass
class GovernanceResult:
    """Outcome of a single governance check."""

    # The decision reached (pass / warn / halt).
    decision: GovernanceDecision
    # Name of whatever triggered the decision, if any.
    trigger: Optional[str]
    # Signal values associated with the decision.
    signals: dict
    # Recommendation text accompanying the decision.
    recommendation: str
    # Identifier of an escalation, when one exists.
    escalation_id: Optional[str] = None
    # Convenience flag for callers; defaults to "do not halt".
    should_halt: bool = False
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _to_decision(result_value: str) -> GovernanceDecision:
|
|
51
|
+
normalized = result_value.upper()
|
|
52
|
+
if normalized == "HALT":
|
|
53
|
+
return GovernanceDecision.HALT
|
|
54
|
+
if normalized == "WARN":
|
|
55
|
+
return GovernanceDecision.WARN
|
|
56
|
+
return GovernanceDecision.PASS
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def check_governance(context: GovernanceContext) -> GovernanceResult:
    """
    Evaluate governance signals for a rollout iteration.

    Feeds the rollout state into ``foundation_check`` and normalises whatever
    comes back (a plain string or a mapping) into a GovernanceResult. When
    the decision is HALT, ``handle_escalation`` is invoked and its
    ``escalation_id`` (if it returns a dict) is attached to the result.

    Args:
        context: GovernanceContext with rollout state

    Returns:
        GovernanceResult with decision and metadata
    """
    history = context.approach_history
    has_multiple_approaches = bool(history and len(history) > 1)
    iteration = context.rollout_index + 1

    # Confidence is only inferred when there is reasoning text to inspect.
    confidence = None
    if context.reasoning_text:
        confidence, _ = infer_confidence(context.reasoning_text)

    # Synthetic history: one value per recorded approach, starting at 0.9,
    # dropping by 0.1 per step and floored at 0.1.
    confidence_history = None
    if has_multiple_approaches:
        confidence_history = [max(0.1, 0.9 - (i * 0.1)) for i in range(len(history))]

    result = foundation_check(
        iteration_count=iteration,
        same_pattern=has_multiple_approaches,
        merge_ready=False,
        current_approach=context.current_approach,
        approach_history=history,
        confidence=confidence,
        confidence_history=confidence_history,
    )

    escalation_id = None
    if isinstance(result, str):
        # String results are classified by their (case-sensitive) prefix.
        if result.startswith("HALT"):
            decision, trigger = GovernanceDecision.HALT, "iteration"
        elif result.startswith("WARN"):
            decision, trigger = GovernanceDecision.WARN, "iteration"
        else:
            decision, trigger = GovernanceDecision.PASS, "none"
        signals = {"iteration": iteration}
        recommendation = result
    else:
        # Anything else is read through ``.get`` like a mapping.
        decision = _to_decision(str(result.get("result", "PASS")))
        trigger = result.get("trigger", "none")
        raw_signals = result.get("signals")
        signals = raw_signals if isinstance(raw_signals, dict) else {}
        recommendation = result.get("recommendation", "")

    halted = decision == GovernanceDecision.HALT
    if halted:
        escalation = handle_escalation(result, {"target_dir": context.target_dir})
        if isinstance(escalation, dict):
            escalation_id = escalation.get("escalation_id")

    return GovernanceResult(
        decision=decision,
        trigger=trigger,
        signals=signals,
        recommendation=recommendation,
        escalation_id=escalation_id,
        should_halt=halted,
    )
|
governance/memory.py
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
"""Persistence of governance decisions to Janus memory tiers."""
|
|
2
|
+
|
|
3
|
+
from datetime import datetime, timezone
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
import sys
|
|
6
|
+
from typing import List, Optional
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
# Try mcp-janus memory module, fallback to local stubs for standalone operation
|
|
10
|
+
MCP_JANUS_PATH = Path(__file__).resolve().parents[2] / "mcp-janus"
|
|
11
|
+
if MCP_JANUS_PATH.exists() and str(MCP_JANUS_PATH) not in sys.path:
|
|
12
|
+
sys.path.insert(0, str(MCP_JANUS_PATH))
|
|
13
|
+
|
|
14
|
+
try:
|
|
15
|
+
from memory import read_tier, write_tier # noqa: E402
|
|
16
|
+
except ImportError:
|
|
17
|
+
from janus_types import read_tier, write_tier # noqa: E402
|
|
18
|
+
|
|
19
|
+
from governance.bridge import GovernanceResult # noqa: E402
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def persist_governance_decision(
    result: GovernanceResult,
    behavior_id: str,
    rollout_index: int,
    target_dir: str = ".",
) -> bool:
    """
    Append a governance decision to the "governance" memory tier.

    The tier is read, the new entry is appended to its
    ``governance_decisions`` list (only the 100 most recent entries are
    kept), the entry is also stored under ``last_decision``, and the tier is
    written back.

    Args:
        result: GovernanceResult from check_governance()
        behavior_id: ID of the behavior being evaluated
        rollout_index: Index of the rollout
        target_dir: Directory passed through to read_tier/write_tier

    Returns:
        The success flag reported by write_tier
    """
    max_entries = 100

    tier = read_tier("governance", target_dir)
    if not isinstance(tier, dict):
        # Nothing usable stored yet: start a fresh tier document.
        tier = {"schema_version": "1.0.0"}

    history = tier.get("governance_decisions", [])
    if not isinstance(history, list):
        history = []

    record = dict(
        timestamp=datetime.now(timezone.utc).isoformat(),
        behavior_id=behavior_id,
        rollout_index=rollout_index,
        decision=result.decision.value,
        trigger=result.trigger,
        signals=result.signals,
        escalation_id=result.escalation_id,
    )
    history.append(record)

    # Drop the oldest entries once the cap is exceeded.
    if len(history) > max_entries:
        history = history[-max_entries:]

    tier.update(governance_decisions=history, last_decision=record)

    written, _errors = write_tier("governance", tier, target_dir)
    return written
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def get_governance_history(
    behavior_id: Optional[str] = None,
    limit: int = 20,
    target_dir: str = ".",
) -> List[dict]:
    """
    Retrieve governance decision history.

    Args:
        behavior_id: Optional filter by behavior
        limit: Maximum entries to return; zero or a negative value yields
            an empty list
        target_dir: Directory passed through to read_tier

    Returns:
        List of governance decision entries (most recent first). Empty when
        the tier or its ``governance_decisions`` list is missing or malformed.
    """
    current = read_tier("governance", target_dir)
    if not isinstance(current, dict):
        return []

    decisions = current.get("governance_decisions", [])
    if not isinstance(decisions, list):
        return []

    if behavior_id:
        # Skip malformed (non-dict) entries instead of crashing on ``.get``.
        decisions = [
            d for d in decisions
            if isinstance(d, dict) and d.get("behavior_id") == behavior_id
        ]

    # ``decisions[-0:]`` is the *whole* list and a negative limit would slice
    # from the front, so non-positive limits are handled explicitly.
    if limit <= 0:
        return []

    return list(reversed(decisions[-limit:]))
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def get_halt_count(behavior_id: Optional[str] = None, target_dir: str = ".") -> int:
    """
    Count HALT decisions for governance statistics.

    Looks at up to 100 history entries returned by get_governance_history().

    Args:
        behavior_id: Optional filter by behavior
        target_dir: Directory passed through to get_governance_history

    Returns:
        Count of entries whose "decision" is "halt"
    """
    entries = get_governance_history(
        behavior_id=behavior_id,
        limit=100,
        target_dir=target_dir,
    )
    halts = [item for item in entries if item.get("decision") == "halt"]
    return len(halts)
|
harness/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Harness components for Janus Labs."""
|
harness/artifacts.py
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
"""Artifact collection for Janus Labs harness."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from datetime import datetime, timezone
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
import re
|
|
7
|
+
import subprocess
|
|
8
|
+
import time
|
|
9
|
+
from typing import Any, Callable
|
|
10
|
+
|
|
11
|
+
from .types import RunArtifactBundle, Message, ToolInvocation, GitDiff, TestReport, Timings
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _run_git(args: list[str], cwd: Path) -> str | None:
|
|
15
|
+
try:
|
|
16
|
+
result = subprocess.run(
|
|
17
|
+
["git", *args],
|
|
18
|
+
cwd=str(cwd),
|
|
19
|
+
capture_output=True,
|
|
20
|
+
text=True,
|
|
21
|
+
check=True,
|
|
22
|
+
)
|
|
23
|
+
except (FileNotFoundError, subprocess.CalledProcessError):
|
|
24
|
+
return None
|
|
25
|
+
return result.stdout
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class ArtifactCollector:
    """
    Collects all components of a RunArtifactBundle during agent execution.

    Messages and tool invocations are accumulated as they are recorded; the
    git diff and test report are captured on demand; ``finalize`` assembles
    everything (with defaults for anything not captured) into one bundle.
    """

    def __init__(self):
        self.messages: list[Message] = []
        self.tool_traces: list[ToolInvocation] = []
        # Reference point for the total elapsed time reported by finalize().
        self.start_time: float = time.perf_counter()
        # Running sum of recorded tool durations, in milliseconds.
        self.tool_time_ms: int = 0
        self.repo_diff: GitDiff | None = None
        self.test_results: TestReport | None = None

    def _timestamp(self) -> str:
        """Return the current UTC time as an ISO 8601 string."""
        return datetime.now(timezone.utc).isoformat()

    @staticmethod
    def _last_count(label: str, text: str) -> int:
        """Return the number in the LAST ``<digits> <label>`` occurrence in ``text``.

        Returns 0 when the pattern does not occur. The last occurrence is used
        because the summary line comes at the end of the output, after any
        captured log text that might contain a similar phrase.
        """
        matches = re.findall(rf"(\d+)\s+{label}", text)
        return int(matches[-1]) if matches else 0

    def record_message(self, role: str, content: str) -> None:
        """Record a conversation message, stamped with the current UTC time."""
        message: Message = {
            "role": role,
            "content": content,
            "timestamp": self._timestamp(),
        }
        self.messages.append(message)

    def record_tool_call(self, tool_name: str, args: dict, result: Any, duration_ms: int) -> None:
        """Record a tool invocation with timing.

        Results that are neither ``str`` nor ``dict`` are stored as their
        ``str()`` form; negative durations are clamped to 0. The duration is
        added to the running ``tool_time_ms`` total.
        """
        if not isinstance(result, (str, dict)):
            result = str(result)
        trace: ToolInvocation = {
            "tool_name": tool_name,
            "arguments": args,
            "result": result,
            "duration_ms": max(int(duration_ms), 0),
            "timestamp": self._timestamp(),
        }
        self.tool_traces.append(trace)
        self.tool_time_ms += trace["duration_ms"]

    def capture_git_diff(self, repo_path: str) -> GitDiff:
        """Capture the output of ``git diff`` for the repo at ``repo_path``.

        Runs ``git diff --numstat`` (insertion/deletion totals),
        ``git diff --name-only`` (file list) and plain ``git diff`` (patch).
        Collection is best-effort: any failure yields an empty diff rather
        than an exception. The result is stored on the collector and returned.
        """
        repo = Path(repo_path).resolve()
        files_changed: list[str] = []
        insertions = 0
        deletions = 0
        patch = ""

        try:
            numstat = _run_git(["diff", "--numstat"], repo)
            if numstat:
                for line in numstat.splitlines():
                    parts = line.split("\t")
                    if len(parts) >= 3:
                        ins, dels, file_path = parts[0], parts[1], parts[2]
                        # Non-numeric counts are skipped rather than parsed.
                        if ins.isdigit():
                            insertions += int(ins)
                        if dels.isdigit():
                            deletions += int(dels)
                        files_changed.append(file_path)

            # When available, the --name-only listing replaces the file list
            # gathered from --numstat.
            name_only = _run_git(["diff", "--name-only"], repo)
            if name_only is not None:
                files_changed = [line for line in name_only.splitlines() if line.strip()]

            patch = _run_git(["diff"], repo) or ""
        except Exception:
            # Deliberate best-effort: fall back to an empty diff.
            files_changed = []
            insertions = 0
            deletions = 0
            patch = ""

        diff: GitDiff = {
            "files_changed": files_changed,
            "insertions": insertions,
            "deletions": deletions,
            "patch": patch,
        }
        self.repo_diff = diff
        return diff

    def capture_test_results(self, test_output: str, framework: str = "pytest") -> TestReport:
        """Parse test output into a TestReport.

        Counts are only extracted for ``framework == "pytest"``; for any other
        framework they stay 0. Frameworks outside {"pytest", "jest", "other"}
        are reported as "other". The report is stored on the collector and
        returned.
        """
        passed = 0
        failed = 0
        skipped = 0

        if framework == "pytest":
            # The summary is at the end: "7 passed in 0.02s" or
            # "3 passed, 1 failed". Take the last occurrence of each count so
            # earlier log lines such as "99 passed ..." cannot shadow it.
            passed = self._last_count("passed", test_output)
            failed = self._last_count("failed", test_output)
            skipped = self._last_count("skipped", test_output)

        report: TestReport = {
            "framework": framework if framework in {"pytest", "jest", "other"} else "other",
            "passed": passed,
            "failed": failed,
            "skipped": skipped,
            "output": test_output,
        }
        self.test_results = report
        return report

    def create_tool_wrapper(self, original_tool: Callable) -> Callable:
        """
        Wrap a tool function to automatically record invocations.

        The wrapper times each call, records it via ``record_tool_call`` and
        returns the tool's result unchanged. If the tool raises, the exception
        propagates and nothing is recorded.
        """
        # Local import keeps this block self-contained.
        import functools

        # Callables such as functools.partial objects have no __name__; fall
        # back to the type name so recording never fails after the tool ran.
        tool_name = getattr(original_tool, "__name__", None) or type(original_tool).__name__

        @functools.wraps(original_tool)
        def wrapped(*args, **kwargs):
            start = time.perf_counter()
            result = original_tool(*args, **kwargs)
            duration_ms = int((time.perf_counter() - start) * 1000)
            call_args = {"args": list(args), "kwargs": kwargs}
            self.record_tool_call(tool_name, call_args, result, duration_ms)
            return result

        return wrapped

    def finalize(self, exit_code: str) -> RunArtifactBundle:
        """
        Finalize and return complete bundle.

        Guarantees:
        - All 5 components present
        - No None values in required fields

        Unknown ``exit_code`` values are reported as "crash". Every timing is
        clamped to at least 1 ms, so the three timings need not add up exactly.
        """
        tool_time_ms = max(int(self.tool_time_ms), 1)
        elapsed_ms = int((time.perf_counter() - self.start_time) * 1000)
        total_ms = max(elapsed_ms, tool_time_ms, 1)
        model_time_ms = max(total_ms - tool_time_ms, 1)

        timings: Timings = {
            "total_ms": total_ms,
            "tool_time_ms": tool_time_ms,
            "model_time_ms": model_time_ms,
        }

        # Substitute empty placeholders for anything that was never captured.
        repo_diff = self.repo_diff or {
            "files_changed": [],
            "insertions": 0,
            "deletions": 0,
            "patch": "",
        }
        test_results = self.test_results or {
            "framework": "pytest",
            "passed": 0,
            "failed": 0,
            "skipped": 0,
            "output": "",
        }

        if exit_code not in {"success", "timeout", "crash", "halt"}:
            exit_code = "crash"

        bundle: RunArtifactBundle = {
            "transcript": list(self.messages),
            "tool_traces": list(self.tool_traces),
            "repo_diff": repo_diff,
            "test_results": test_results,
            "timings": timings,
            "exit_code": exit_code,
        }
        return bundle
|
harness/executor.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
"""Execution helpers for Janus Labs harness."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
import subprocess
|
|
6
|
+
from typing import Sequence
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def _run_git(args: Sequence[str], cwd: Path) -> bool:
|
|
10
|
+
try:
|
|
11
|
+
subprocess.run(
|
|
12
|
+
["git", *args],
|
|
13
|
+
cwd=str(cwd),
|
|
14
|
+
capture_output=True,
|
|
15
|
+
text=True,
|
|
16
|
+
check=True,
|
|
17
|
+
)
|
|
18
|
+
except (FileNotFoundError, subprocess.CalledProcessError):
|
|
19
|
+
return False
|
|
20
|
+
return True
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def init_fixture(fixture_path: str) -> bool:
    """
    Reset a fixture repository to a clean state.

    Steps, in order (stopping at the first failure):
    - git reset --hard HEAD
    - git clean -fd (remove untracked files)

    Args:
        fixture_path: Path to fixture repo (resolved before use)

    Returns:
        bool: True if both git commands succeeded; False if the path does
        not exist, has no ``.git`` entry, or either command failed
    """
    root = Path(fixture_path).resolve()

    # Guard: the path must exist and look like a git checkout.
    if not (root.exists() and (root / ".git").exists()):
        return False

    steps = (
        ["reset", "--hard", "HEAD"],
        ["clean", "-fd"],
    )
    # all() short-circuits, so "clean" is skipped when "reset" fails.
    return all(_run_git(step, root) for step in steps)
|
harness/sandbox.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"""Filesystem sandbox enforcement for agent writes."""
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Set
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class Sandbox:
    """
    Filesystem sandbox that validates write targets against an allow-list
    and remembers every target it rejected.
    """

    def __init__(self, allowed_paths: list[str]):
        """
        Args:
            allowed_paths: Paths the agent may write to; each is resolved
                to an absolute path on construction
        """
        self.allowed_paths: Set[Path] = {Path(entry).resolve() for entry in allowed_paths}
        # Holds only the rejected (out-of-sandbox) paths, in request order.
        self.write_log: list[Path] = []

    def validate_write(self, path: str) -> bool:
        """
        Check if path is within allowed sandbox.

        A path is allowed when, after resolution, it equals an allowed path
        or lies underneath one. Rejected paths are appended to ``write_log``.

        Returns:
            bool: True if write is allowed
        """
        target = Path(path).resolve()
        # The target itself plus every ancestor directory.
        lineage = {target, *target.parents}
        if not self.allowed_paths.isdisjoint(lineage):
            return True

        self.write_log.append(target)
        return False

    def get_violations(self) -> list[Path]:
        """Return a copy of the list of rejected paths."""
        return self.write_log.copy()

    def is_clean(self) -> bool:
        """Return True if no violations occurred."""
        return not self.write_log
|
harness/types.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""Typed structures for RunArtifactBundle capture."""
|
|
2
|
+
from typing import TypedDict, Literal
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class Message(TypedDict):
    """One entry of a conversation transcript."""

    # Who produced the message.
    role: Literal["user", "assistant", "system"]
    # Message text.
    content: str
    # When the message was recorded.
    timestamp: str  # ISO8601
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class ToolInvocation(TypedDict):
|
|
12
|
+
tool_name: str
|
|
13
|
+
arguments: dict
|
|
14
|
+
result: str | dict
|
|
15
|
+
duration_ms: int
|
|
16
|
+
timestamp: str # ISO8601
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class GitDiff(TypedDict):
    """Summary of repository changes."""

    # Paths of the files that changed.
    files_changed: list[str]
    # Total number of inserted lines.
    insertions: int
    # Total number of deleted lines.
    deletions: int
    patch: str  # Full diff output
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class TestReport(TypedDict):
    """Outcome of a test run."""

    # Test framework that produced the output.
    framework: Literal["pytest", "jest", "other"]
    # Number of passing tests.
    passed: int
    # Number of failing tests.
    failed: int
    # Number of skipped tests.
    skipped: int
    output: str  # Full test output
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class Timings(TypedDict):
    """Wall-clock breakdown of a run; all values are in milliseconds."""

    # Duration of the whole run.
    total_ms: int
    # Portion attributed to tool calls.
    tool_time_ms: int
    # Portion attributed to the model.
    model_time_ms: int
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class RunArtifactBundle(TypedDict):
    """Everything captured for one agent run."""

    # Conversation messages, in the order they were recorded.
    transcript: list[Message]
    # Tool calls, in the order they were recorded.
    tool_traces: list[ToolInvocation]
    # Repository changes captured for the run.
    repo_diff: GitDiff
    # Test report captured for the run.
    test_results: TestReport
    # Timing breakdown for the run.
    timings: Timings
    # How the run ended.
    exit_code: Literal["success", "timeout", "crash", "halt"]
|
janus_labs/__init__.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Janus Labs - 3DMark for AI Agents.
|
|
3
|
+
|
|
4
|
+
Benchmark and measure AI coding agent reliability with standardized,
|
|
5
|
+
reproducible tests.
|
|
6
|
+
|
|
7
|
+
This module provides the Python API. For CLI usage:
|
|
8
|
+
python -m janus_labs run --suite refactor-storm
|
|
9
|
+
|
|
10
|
+
Or if janus-labs is in your PATH:
|
|
11
|
+
janus-labs run --suite refactor-storm
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
__version__ = "0.2.0"
|
|
15
|
+
|
|
16
|
+
__all__ = ["__version__"]
|
janus_labs/__main__.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Module entry point for janus-labs CLI.
|
|
4
|
+
|
|
5
|
+
Enables running the CLI via:
|
|
6
|
+
python -m janus_labs <command> [args]
|
|
7
|
+
|
|
8
|
+
This is the recommended fallback if 'janus-labs' is not in your PATH.
|
|
9
|
+
|
|
10
|
+
Examples:
|
|
11
|
+
python -m janus_labs run --suite refactor-storm
|
|
12
|
+
python -m janus_labs bench --submit
|
|
13
|
+
python -m janus_labs submit result.json --github myhandle
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
import sys
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def main():
    """Delegate to the CLI's main() and return its result.

    If an ImportError is raised while importing or running the CLI, an
    installation hint is written to stderr and 1 is returned instead.
    """
    try:
        from cli.main import main as cli_main
        return cli_main()
    except ImportError as exc:
        # Provide helpful error if dependencies are missing
        hint_lines = (
            f"Error: {exc}",
            "",
            "Janus Labs requires additional dependencies.",
            "Install with: pip install janus-labs",
            "",
            "If you've already installed, ensure you're using the correct Python:",
            f" Current: {sys.executable}",
        )
        for text in hint_lines:
            print(text, file=sys.stderr)
        return 1


if __name__ == "__main__":
    sys.exit(main())
|