janus-labs 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cli/__init__.py +1 -0
- cli/__main__.py +7 -0
- cli/clipboard.py +113 -0
- cli/main.py +690 -0
- cli/output.py +97 -0
- cli/submit.py +270 -0
- config/__init__.py +1 -0
- config/detection.py +72 -0
- forge/__init__.py +5 -0
- forge/behavior.py +35 -0
- forge/behaviors/BHV-002-refactor-complexity.yaml +25 -0
- forge/behaviors/BHV-003-error-handling.yaml +28 -0
- gauge/__init__.py +17 -0
- gauge/adapter.py +134 -0
- gauge/behaviors/__init__.py +11 -0
- gauge/behaviors/code_quality.py +73 -0
- gauge/behaviors/instruction_adherence.py +52 -0
- gauge/behaviors/test_cheating.py +178 -0
- gauge/governed_rollout.py +107 -0
- gauge/judge.py +179 -0
- gauge/qualitative.py +271 -0
- gauge/report.py +210 -0
- gauge/trust_elasticity.py +172 -0
- governance/__init__.py +14 -0
- governance/bridge.py +124 -0
- governance/memory.py +116 -0
- harness/__init__.py +1 -0
- harness/artifacts.py +195 -0
- harness/executor.py +51 -0
- harness/sandbox.py +40 -0
- harness/types.py +46 -0
- janus_labs/__init__.py +16 -0
- janus_labs/__main__.py +37 -0
- janus_labs-0.2.0.dist-info/METADATA +316 -0
- janus_labs-0.2.0.dist-info/RECORD +80 -0
- janus_labs-0.2.0.dist-info/WHEEL +5 -0
- janus_labs-0.2.0.dist-info/entry_points.txt +2 -0
- janus_labs-0.2.0.dist-info/licenses/LICENSE +201 -0
- janus_labs-0.2.0.dist-info/top_level.txt +11 -0
- janus_types.py +140 -0
- probe/__init__.py +19 -0
- probe/discovery.py +194 -0
- probe/explorer.py +236 -0
- probe/mutations.py +196 -0
- probe/tracer.py +193 -0
- scaffold/__init__.py +1 -0
- scaffold/scorer.py +321 -0
- scaffold/templates/BHV-001-test-cheating/.gitignore +4 -0
- scaffold/templates/BHV-001-test-cheating/src/__init__.py +0 -0
- scaffold/templates/BHV-001-test-cheating/src/calculator.py +24 -0
- scaffold/templates/BHV-001-test-cheating/tests/__init__.py +0 -0
- scaffold/templates/BHV-001-test-cheating/tests/test_calculator.py +35 -0
- scaffold/templates/default/.gitignore +4 -0
- scaffold/templates/default/src/__init__.py +0 -0
- scaffold/templates/default/src/main.py +23 -0
- scaffold/templates/default/tests/__init__.py +0 -0
- scaffold/templates/default/tests/test_main.py +32 -0
- scaffold/workspace.py +202 -0
- scaffold/workspaces/BHV-002-refactor-complexity/src/__init__.py +0 -0
- scaffold/workspaces/BHV-002-refactor-complexity/src/pricing.py +72 -0
- scaffold/workspaces/BHV-002-refactor-complexity/tests/__init__.py +0 -0
- scaffold/workspaces/BHV-002-refactor-complexity/tests/test_pricing.py +72 -0
- scaffold/workspaces/BHV-003-error-handling/src/__init__.py +0 -0
- scaffold/workspaces/BHV-003-error-handling/src/file_processor.py +100 -0
- scaffold/workspaces/BHV-003-error-handling/tests/__init__.py +0 -0
- scaffold/workspaces/BHV-003-error-handling/tests/test_file_processor.py +144 -0
- suite/__init__.py +16 -0
- suite/builtin/__init__.py +13 -0
- suite/builtin/hello_world.py +28 -0
- suite/builtin/refactor_storm.py +92 -0
- suite/comparison.py +274 -0
- suite/definition.py +51 -0
- suite/export/__init__.py +6 -0
- suite/export/github.py +58 -0
- suite/export/html.py +160 -0
- suite/export/json_export.py +65 -0
- suite/registry.py +20 -0
- suite/result.py +133 -0
- suite/runner.py +110 -0
- suite/thresholds.py +80 -0
janus_types.py
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Stub types for standalone Janus Labs operation.
|
|
3
|
+
|
|
4
|
+
When running within the AoP monorepo, the full Janus Protocol is available.
|
|
5
|
+
For standalone use, these stubs provide minimal functionality.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from dataclasses import dataclass
|
|
9
|
+
from typing import List, Optional, Tuple
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass
class TrustScore:
    """How much an agent session is currently trusted.

    Both attributes are expressed on a 0.0-1.0 scale: the trust level
    itself and how certain we are about that level.
    """

    value: float  # trust level, 0.0 (none) to 1.0 (full)
    confidence: float  # certainty in `value`, 0.0 to 1.0

    @classmethod
    def default(cls) -> "TrustScore":
        """Build the neutral score used before any evidence exists."""
        return cls(0.7, 0.5)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@dataclass
class GovernanceState:
    """Minimal governance state for standalone operation."""

    # Current iteration number; starts at 1.
    iteration_count: int = 1
    # True once governance has halted the session.
    halted: bool = False
    # Last computed trust score, if any has been recorded.
    trust_score: Optional[TrustScore] = None
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def foundation_check(
    iteration_count: int,
    same_pattern: bool = False,
    merge_ready: bool = False,
    current_approach: Optional[str] = None,
    approach_history: Optional[List[str]] = None,
    confidence: Optional[float] = None,
    confidence_history: Optional[List[float]] = None,
) -> dict:
    """
    Stub foundation check for standalone operation.

    Returns PASS for iterations 1-2. From iteration 3 onward it returns
    WARN, escalating to HALT when ``same_pattern`` indicates the agent
    keeps repeating the same approach. The remaining arguments are
    accepted for signature compatibility but ignored by this stub.
    Full implementation available in mcp-janus.

    Args:
        iteration_count: Current iteration number
        same_pattern: Whether the same approach is being repeated
        merge_ready: Whether work is ready for merge (ignored by stub)
        current_approach: Description of current approach (ignored by stub)
        approach_history: List of previous approaches (ignored by stub)
        confidence: Current confidence level (ignored by stub)
        confidence_history: History of confidence levels (ignored by stub)

    Returns:
        dict with result ("PASS" | "WARN" | "HALT"), trigger, signals,
        and recommendation
    """
    # These parameters exist only to mirror the full protocol signature.
    _ = merge_ready, current_approach, approach_history, confidence, confidence_history

    if iteration_count >= 3:
        return {
            "result": "HALT" if same_pattern else "WARN",
            "trigger": "iteration",
            "signals": {"iteration": iteration_count},
            "recommendation": "Consider decomposing the task",
        }
    return {
        "result": "PASS",
        "trigger": "none",
        "signals": {"iteration": iteration_count},
        "recommendation": "Proceed",
    }
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def handle_escalation(result: dict, context: dict) -> dict:
    """
    Stub escalation handler - logs but takes no action.

    The inputs are accepted for interface compatibility; no escalation
    record is created in standalone mode.

    Args:
        result: The governance check result
        context: Additional context for escalation

    Returns:
        dict with escalation_id (always None here) and action taken
    """
    del result, context  # interface-only parameters in this stub
    return {"escalation_id": None, "action": "logged"}
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def infer_confidence(text: str) -> Tuple[float, str]:
    """
    Stub confidence inference - always reports moderate confidence.

    Args:
        text: Text to analyze for confidence signals (ignored by stub)

    Returns:
        Tuple of (confidence_value, confidence_label)
    """
    del text  # interface-only parameter in this stub
    return 0.7, "moderate"
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
# In-memory storage for standalone operation, keyed by tier name.
_memory_store: dict = {}


def read_tier(tier: str, target_dir: str = ".") -> dict:
    """
    Stub memory tier reader backed by the module-level store.

    Args:
        tier: Memory tier name (e.g., "governance")
        target_dir: Target directory (ignored in stub)

    Returns:
        Stored data for the tier, or empty dict
    """
    del target_dir  # only meaningful in the full implementation
    return _memory_store.get(tier, {})


def write_tier(tier: str, data: dict, target_dir: str = ".") -> Tuple[bool, List[str]]:
    """
    Stub memory tier writer backed by the module-level store.

    Args:
        tier: Memory tier name (e.g., "governance")
        data: Data to store
        target_dir: Target directory (ignored in stub)

    Returns:
        Tuple of (success, error_list); always succeeds in this stub.
    """
    del target_dir  # only meaningful in the full implementation
    _memory_store[tier] = data
    return True, []
|
probe/__init__.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""Probe layer - Discovery via Phoenix integration."""
|
|
2
|
+
|
|
3
|
+
from .mutations import TaskMutation, MutationStrategy, apply_mutation, generate_mutation_suite
|
|
4
|
+
from .explorer import Explorer, ExplorationConfig
|
|
5
|
+
from .tracer import PhoenixTracer, TraceContext
|
|
6
|
+
from .discovery import DiscoveryPack, generate_discovery_pack
|
|
7
|
+
|
|
8
|
+
__all__ = [
|
|
9
|
+
"TaskMutation",
|
|
10
|
+
"MutationStrategy",
|
|
11
|
+
"apply_mutation",
|
|
12
|
+
"generate_mutation_suite",
|
|
13
|
+
"Explorer",
|
|
14
|
+
"ExplorationConfig",
|
|
15
|
+
"PhoenixTracer",
|
|
16
|
+
"TraceContext",
|
|
17
|
+
"DiscoveryPack",
|
|
18
|
+
"generate_discovery_pack",
|
|
19
|
+
]
|
probe/discovery.py
ADDED
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
"""DiscoveryPack generation for Probe layer."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from datetime import datetime
|
|
5
|
+
import hashlib
|
|
6
|
+
import json
|
|
7
|
+
from typing import Optional
|
|
8
|
+
|
|
9
|
+
from probe.explorer import ExplorationResult, ExplorationRun
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass
class FailureCluster:
    """A cluster of similar failures."""
    # Deterministic short id derived from the member run ids.
    cluster_id: str
    # Heuristic human-readable label for the cluster.
    proposed_name: str
    # Number of runs in the cluster.
    frequency: int
    # One of "critical" / "high" / "medium" / "low".
    severity_hint: str
    # Run ids of representative member runs.
    exemplar_run_ids: list[str]
    # Mutation strategy shared by the member runs, if any.
    common_mutation: Optional[str] = None
    # Free-form description of the failure pattern.
    description: str = ""
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@dataclass
class DiscoveryPack:
    """
    Complete discovery output from Probe exploration.

    Contains failure clusters, metadata, and exemplar links.
    """
    # Short hash derived from the task and run count.
    pack_id: str
    # ISO-8601 generation time.
    timestamp: str
    # The original task that was explored.
    task_explored: str
    # Clusters ordered most-severe first.
    failure_clusters: list[FailureCluster]
    # Run statistics and the mutation strategies used.
    metadata: dict
    # Total exploration runs performed.
    total_runs: int
    # Number of distinct failure clusters discovered.
    novel_failures_found: int
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _cluster_by_exit_code(runs: list[ExplorationRun]) -> dict[str, list[ExplorationRun]]:
|
|
41
|
+
"""Group runs by exit code."""
|
|
42
|
+
clusters: dict[str, list[ExplorationRun]] = {}
|
|
43
|
+
for run in runs:
|
|
44
|
+
code = run.bundle.get("exit_code", "unknown")
|
|
45
|
+
if code not in clusters:
|
|
46
|
+
clusters[code] = []
|
|
47
|
+
clusters[code].append(run)
|
|
48
|
+
return clusters
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _cluster_by_mutation(runs: list[ExplorationRun]) -> dict[str, list[ExplorationRun]]:
|
|
52
|
+
"""Group runs by mutation strategy."""
|
|
53
|
+
clusters: dict[str, list[ExplorationRun]] = {}
|
|
54
|
+
for run in runs:
|
|
55
|
+
strategy = run.mutation.strategy.value
|
|
56
|
+
if strategy not in clusters:
|
|
57
|
+
clusters[strategy] = []
|
|
58
|
+
clusters[strategy].append(run)
|
|
59
|
+
return clusters
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _calculate_severity(runs: list[ExplorationRun]) -> str:
|
|
63
|
+
"""
|
|
64
|
+
Calculate severity hint based on failure characteristics.
|
|
65
|
+
|
|
66
|
+
- critical: crash or halt
|
|
67
|
+
- high: failures with tool errors
|
|
68
|
+
- medium: failures with no output
|
|
69
|
+
- low: minor issues
|
|
70
|
+
"""
|
|
71
|
+
for run in runs:
|
|
72
|
+
if run.bundle.get("exit_code") in ("crash", "halt"):
|
|
73
|
+
return "critical"
|
|
74
|
+
if run.error:
|
|
75
|
+
return "high"
|
|
76
|
+
|
|
77
|
+
for run in runs:
|
|
78
|
+
for trace in run.bundle.get("tool_traces", []):
|
|
79
|
+
result = str(trace.get("result", ""))
|
|
80
|
+
if "error" in result.lower():
|
|
81
|
+
return "high"
|
|
82
|
+
|
|
83
|
+
for run in runs:
|
|
84
|
+
if not run.bundle.get("transcript"):
|
|
85
|
+
return "medium"
|
|
86
|
+
|
|
87
|
+
return "low"
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def _generate_cluster_id(runs: list[ExplorationRun]) -> str:
|
|
91
|
+
"""Generate deterministic cluster ID."""
|
|
92
|
+
content = json.dumps([run.run_id for run in runs], sort_keys=True)
|
|
93
|
+
return hashlib.sha256(content.encode()).hexdigest()[:12]
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def _propose_cluster_name(
|
|
97
|
+
runs: list[ExplorationRun],
|
|
98
|
+
common_mutation: Optional[str],
|
|
99
|
+
) -> str:
|
|
100
|
+
"""Generate a proposed name for the cluster."""
|
|
101
|
+
if not runs:
|
|
102
|
+
return "unknown-cluster"
|
|
103
|
+
|
|
104
|
+
exit_codes = {run.bundle.get("exit_code") for run in runs}
|
|
105
|
+
|
|
106
|
+
if "crash" in exit_codes:
|
|
107
|
+
return "crash-cluster"
|
|
108
|
+
if "halt" in exit_codes:
|
|
109
|
+
return "governance-halt-cluster"
|
|
110
|
+
if "timeout" in exit_codes:
|
|
111
|
+
return "timeout-cluster"
|
|
112
|
+
|
|
113
|
+
if common_mutation == "tool_removal":
|
|
114
|
+
return "tool-dependency-failure"
|
|
115
|
+
if common_mutation == "context_reduce":
|
|
116
|
+
return "context-sensitivity-failure"
|
|
117
|
+
|
|
118
|
+
for run in runs:
|
|
119
|
+
for trace in run.bundle.get("tool_traces", []):
|
|
120
|
+
if "error" in str(trace.get("result", "")).lower():
|
|
121
|
+
return "tool-error-cluster"
|
|
122
|
+
|
|
123
|
+
return "unclassified-failure"
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def generate_discovery_pack(
    exploration_result: ExplorationResult,
    task: str,
) -> DiscoveryPack:
    """
    Generate a DiscoveryPack from exploration results.

    Failed runs (plus nominally successful runs whose tool traces report
    errors) are grouped by the mutation strategy that triggered them;
    the resulting clusters are ordered most-severe first.

    In production with Phoenix, this would use embedding-based
    semantic clustering. For MVP, we use rule-based clustering.

    Args:
        exploration_result: Result from Explorer.explore()
        task: Original task explored

    Returns:
        DiscoveryPack with failure clusters
    """
    failed_runs = [run for run in exploration_result.runs if not run.success]

    # A "successful" run still counts as a failure when a tool call errored.
    for run in exploration_result.runs:
        if not run.success:
            continue
        had_tool_error = any(
            "error" in str(trace.get("result", "")).lower()
            for trace in run.bundle.get("tool_traces", [])
        )
        if had_tool_error and run not in failed_runs:
            failed_runs.append(run)

    failure_clusters: list[FailureCluster] = []
    for mutation, member_runs in _cluster_by_mutation(failed_runs).items():
        if not member_runs:
            continue
        failure_clusters.append(
            FailureCluster(
                cluster_id=_generate_cluster_id(member_runs),
                proposed_name=_propose_cluster_name(member_runs, mutation),
                frequency=len(member_runs),
                severity_hint=_calculate_severity(member_runs),
                exemplar_run_ids=[member_runs[0].run_id],
                common_mutation=mutation,
                description=f"Failures triggered by {mutation} mutation",
            )
        )

    # Most severe clusters first; unknown severities sort last.
    severity_rank = {"critical": 0, "high": 1, "medium": 2, "low": 3}
    failure_clusters.sort(key=lambda cluster: severity_rank.get(cluster.severity_hint, 4))

    pack_id = hashlib.sha256(
        f"{task}-{exploration_result.total_runs}".encode()
    ).hexdigest()[:12]

    return DiscoveryPack(
        pack_id=pack_id,
        timestamp=datetime.now().isoformat(),
        task_explored=task,
        failure_clusters=failure_clusters,
        metadata={
            "total_runs": exploration_result.total_runs,
            "successful_runs": exploration_result.successful_runs,
            "failed_runs": exploration_result.failed_runs,
            "strategies_used": list(exploration_result.mutations_applied.keys()),
        },
        total_runs=exploration_result.total_runs,
        novel_failures_found=len(failure_clusters),
    )
|
probe/explorer.py
ADDED
|
@@ -0,0 +1,236 @@
|
|
|
1
|
+
"""Exploration runner for Probe layer."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from typing import Optional
|
|
5
|
+
|
|
6
|
+
from harness.artifacts import ArtifactCollector
|
|
7
|
+
from harness.executor import init_fixture
|
|
8
|
+
from harness.types import RunArtifactBundle
|
|
9
|
+
from probe.mutations import MutationStrategy, TaskMutation, apply_mutation
|
|
10
|
+
from probe.tracer import PhoenixTracer, TraceContext
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass
class ExplorationConfig:
    """Configuration for exploration runs."""
    # Upper bound on total runs across all strategies.
    max_runs: int = 10
    # Mutation strategies to exercise; the run budget is split across them.
    strategies: list[MutationStrategy] = field(
        default_factory=lambda: [
            MutationStrategy.NONE,
            MutationStrategy.TASK_VARIATION,
            MutationStrategy.TOOL_REMOVAL,
        ]
    )
    # Tool names offered to the (simulated) agent.
    available_tools: list[str] = field(
        default_factory=lambda: ["read_file", "write_file", "bash"]
    )
    # Base RNG seed; per-run seed is base + run index (None disables seeding).
    seed: Optional[int] = 42
    # Timeout in milliseconds. NOTE(review): not consulted by the visible
    # Explorer code — presumably enforced elsewhere; confirm before relying on it.
    timeout_ms: int = 60000
    # Optional fixture initialized at the start of each run.
    fixture_path: Optional[str] = None
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass
class ExplorationRun:
    """Result of a single exploration run."""
    # Identifier taken from the run's trace context.
    run_id: str
    # The mutation applied to the task for this run.
    mutation: TaskMutation
    # Artifact bundle collected during execution.
    bundle: RunArtifactBundle
    # Tracing context opened for this run.
    trace_context: TraceContext
    # False when execution raised an exception.
    success: bool
    # Exception text when success is False, else None.
    error: Optional[str] = None
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@dataclass
class ExplorationResult:
    """Complete exploration result."""
    # Configuration used for this exploration.
    config: ExplorationConfig
    # All runs performed, in execution order.
    runs: list[ExplorationRun]
    # len(runs).
    total_runs: int
    # Count of runs with success == True.
    successful_runs: int
    # Count of runs with success == False.
    failed_runs: int
    # Strategy name -> number of runs executed with that strategy.
    mutations_applied: dict[str, int]
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class Explorer:
    """
    Executes exploration runs with mutations and tracing.

    The Explorer:
    1. Generates task mutations
    2. Initializes fixtures
    3. Simulates agent execution (or calls real agent)
    4. Collects traces via PhoenixTracer
    5. Aggregates results
    """

    def __init__(
        self,
        config: ExplorationConfig,
        tracer: Optional[PhoenixTracer] = None,
    ):
        """
        Initialize explorer.

        Args:
            config: Exploration configuration
            tracer: Optional PhoenixTracer (created if not provided)
        """
        self.config = config
        self.tracer = tracer or PhoenixTracer()
        self.runs: list[ExplorationRun] = []

    def _record_message(self, collector: ArtifactCollector, role: str, content: str) -> None:
        """Record one transcript message in both the artifact bundle and the trace."""
        collector.record_message(role, content)
        self.tracer.record_message(role, content)

    def _record_tool_call(
        self,
        collector: ArtifactCollector,
        tool: str,
        args: dict,
        result,
        duration_ms: int,
    ) -> None:
        """Record one tool call in both the artifact bundle and the trace."""
        collector.record_tool_call(tool, args, result, duration_ms)
        self.tracer.record_tool_call(tool, args, result, duration_ms)

    def _simulate_agent_execution(
        self,
        mutation: TaskMutation,
        collector: ArtifactCollector,
    ) -> None:
        """
        Simulate agent execution for a mutated task.

        In production, this would invoke the actual agent.
        For MVP, we simulate with mock tool calls.
        """
        self._record_message(collector, "user", mutation.mutated_task)

        # The bundle and the trace intentionally carry different assistant
        # text: the bundle mirrors a user-facing reply while the trace
        # records which mutation is being exercised.
        collector.record_message(
            "assistant",
            f"I'll work on: {mutation.mutated_task[:100]}..."
        )
        self.tracer.record_message(
            "assistant",
            f"Processing task with mutation: {mutation.strategy.value}"
        )

        if mutation.strategy == MutationStrategy.TOOL_REMOVAL:
            # Simulate the agent reaching for the tool that was removed.
            removed = mutation.mutation_details.get("removed_tool", "unknown")
            self._record_tool_call(
                collector,
                removed,
                {"action": "attempt"},
                {"error": "Tool unavailable"},
                0,
            )
        else:
            self._record_tool_call(
                collector,
                "read_file",
                {"path": "main.py"},
                "def hello(): pass",
                50,
            )

    def run_single(
        self,
        task: str,
        strategy: MutationStrategy,
        run_index: int = 0,
    ) -> ExplorationRun:
        """
        Execute a single exploration run.

        Args:
            task: Original task description
            strategy: Mutation strategy to apply
            run_index: Index for seed calculation

        Returns:
            ExplorationRun with results
        """
        # Derive a per-run seed so repeated runs vary deterministically.
        base_seed = self.config.seed
        seed = base_seed + run_index if base_seed is not None else None

        mutation = apply_mutation(
            task,
            strategy,
            available_tools=self.config.available_tools,
            seed=seed,
        )

        trace_ctx = self.tracer.start_trace(
            task_description=mutation.mutated_task,
            mutation=strategy.value,
        )

        if self.config.fixture_path:
            init_fixture(self.config.fixture_path)

        collector = ArtifactCollector()
        error = None
        success = True

        try:
            self._simulate_agent_execution(mutation, collector)
            exit_code = "success"
        except Exception as exc:
            error = str(exc)
            success = False
            exit_code = "crash"

        # Close out trace and bundle regardless of success or failure.
        self.tracer.end_trace(exit_code)
        bundle = collector.finalize(exit_code)

        run = ExplorationRun(
            run_id=trace_ctx.run_id,
            mutation=mutation,
            bundle=bundle,
            trace_context=trace_ctx,
            success=success,
            error=error,
        )
        self.runs.append(run)
        return run

    def explore(self, task: str) -> ExplorationResult:
        """
        Run full exploration with all configured strategies.

        Args:
            task: Original task to explore

        Returns:
            ExplorationResult with all runs
        """
        self.runs = []
        mutations_count: dict[str, int] = {}
        run_index = 0

        strategies = self.config.strategies
        if strategies:
            # Split the run budget evenly; every strategy gets at least one
            # run (invariant, so computed once rather than per strategy).
            runs_per_strategy = max(1, self.config.max_runs // len(strategies))

            for strategy in strategies:
                for _ in range(runs_per_strategy):
                    if run_index >= self.config.max_runs:
                        break

                    self.run_single(task, strategy, run_index)
                    run_index += 1

                    strategy_name = strategy.value
                    mutations_count[strategy_name] = mutations_count.get(strategy_name, 0) + 1

        successful = sum(1 for run in self.runs if run.success)

        return ExplorationResult(
            config=self.config,
            runs=self.runs,
            total_runs=len(self.runs),
            successful_runs=successful,
            failed_runs=len(self.runs) - successful,
            mutations_applied=mutations_count,
        )

    def get_traces(self) -> list[dict]:
        """Export collected traces."""
        return self.tracer.export_traces()
|