janus-labs 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. cli/__init__.py +1 -0
  2. cli/__main__.py +7 -0
  3. cli/clipboard.py +113 -0
  4. cli/main.py +690 -0
  5. cli/output.py +97 -0
  6. cli/submit.py +270 -0
  7. config/__init__.py +1 -0
  8. config/detection.py +72 -0
  9. forge/__init__.py +5 -0
  10. forge/behavior.py +35 -0
  11. forge/behaviors/BHV-002-refactor-complexity.yaml +25 -0
  12. forge/behaviors/BHV-003-error-handling.yaml +28 -0
  13. gauge/__init__.py +17 -0
  14. gauge/adapter.py +134 -0
  15. gauge/behaviors/__init__.py +11 -0
  16. gauge/behaviors/code_quality.py +73 -0
  17. gauge/behaviors/instruction_adherence.py +52 -0
  18. gauge/behaviors/test_cheating.py +178 -0
  19. gauge/governed_rollout.py +107 -0
  20. gauge/judge.py +179 -0
  21. gauge/qualitative.py +271 -0
  22. gauge/report.py +210 -0
  23. gauge/trust_elasticity.py +172 -0
  24. governance/__init__.py +14 -0
  25. governance/bridge.py +124 -0
  26. governance/memory.py +116 -0
  27. harness/__init__.py +1 -0
  28. harness/artifacts.py +195 -0
  29. harness/executor.py +51 -0
  30. harness/sandbox.py +40 -0
  31. harness/types.py +46 -0
  32. janus_labs/__init__.py +16 -0
  33. janus_labs/__main__.py +37 -0
  34. janus_labs-0.2.0.dist-info/METADATA +316 -0
  35. janus_labs-0.2.0.dist-info/RECORD +80 -0
  36. janus_labs-0.2.0.dist-info/WHEEL +5 -0
  37. janus_labs-0.2.0.dist-info/entry_points.txt +2 -0
  38. janus_labs-0.2.0.dist-info/licenses/LICENSE +201 -0
  39. janus_labs-0.2.0.dist-info/top_level.txt +11 -0
  40. janus_types.py +140 -0
  41. probe/__init__.py +19 -0
  42. probe/discovery.py +194 -0
  43. probe/explorer.py +236 -0
  44. probe/mutations.py +196 -0
  45. probe/tracer.py +193 -0
  46. scaffold/__init__.py +1 -0
  47. scaffold/scorer.py +321 -0
  48. scaffold/templates/BHV-001-test-cheating/.gitignore +4 -0
  49. scaffold/templates/BHV-001-test-cheating/src/__init__.py +0 -0
  50. scaffold/templates/BHV-001-test-cheating/src/calculator.py +24 -0
  51. scaffold/templates/BHV-001-test-cheating/tests/__init__.py +0 -0
  52. scaffold/templates/BHV-001-test-cheating/tests/test_calculator.py +35 -0
  53. scaffold/templates/default/.gitignore +4 -0
  54. scaffold/templates/default/src/__init__.py +0 -0
  55. scaffold/templates/default/src/main.py +23 -0
  56. scaffold/templates/default/tests/__init__.py +0 -0
  57. scaffold/templates/default/tests/test_main.py +32 -0
  58. scaffold/workspace.py +202 -0
  59. scaffold/workspaces/BHV-002-refactor-complexity/src/__init__.py +0 -0
  60. scaffold/workspaces/BHV-002-refactor-complexity/src/pricing.py +72 -0
  61. scaffold/workspaces/BHV-002-refactor-complexity/tests/__init__.py +0 -0
  62. scaffold/workspaces/BHV-002-refactor-complexity/tests/test_pricing.py +72 -0
  63. scaffold/workspaces/BHV-003-error-handling/src/__init__.py +0 -0
  64. scaffold/workspaces/BHV-003-error-handling/src/file_processor.py +100 -0
  65. scaffold/workspaces/BHV-003-error-handling/tests/__init__.py +0 -0
  66. scaffold/workspaces/BHV-003-error-handling/tests/test_file_processor.py +144 -0
  67. suite/__init__.py +16 -0
  68. suite/builtin/__init__.py +13 -0
  69. suite/builtin/hello_world.py +28 -0
  70. suite/builtin/refactor_storm.py +92 -0
  71. suite/comparison.py +274 -0
  72. suite/definition.py +51 -0
  73. suite/export/__init__.py +6 -0
  74. suite/export/github.py +58 -0
  75. suite/export/html.py +160 -0
  76. suite/export/json_export.py +65 -0
  77. suite/registry.py +20 -0
  78. suite/result.py +133 -0
  79. suite/runner.py +110 -0
  80. suite/thresholds.py +80 -0
janus_types.py ADDED
@@ -0,0 +1,140 @@
+ """
+ Stub types for standalone Janus Labs operation.
+
+ When running within the AoP monorepo, the full Janus Protocol is available.
+ For standalone use, these stubs provide minimal functionality.
+ """
+
+ from dataclasses import dataclass
+ from typing import List, Optional, Tuple
+
+
+ @dataclass
+ class TrustScore:
+     """Trust score for an agent session."""
+
+     value: float  # 0.0 to 1.0
+     confidence: float  # 0.0 to 1.0
+
+     @classmethod
+     def default(cls) -> "TrustScore":
+         """Return default trust score."""
+         return cls(value=0.7, confidence=0.5)
+
+
+ @dataclass
+ class GovernanceState:
+     """Minimal governance state for standalone operation."""
+
+     iteration_count: int = 1
+     halted: bool = False
+     trust_score: Optional[TrustScore] = None
+
+
+ def foundation_check(
+     iteration_count: int,
+     same_pattern: bool = False,
+     merge_ready: bool = False,
+     current_approach: Optional[str] = None,
+     approach_history: Optional[List[str]] = None,
+     confidence: Optional[float] = None,
+     confidence_history: Optional[List[float]] = None,
+ ) -> dict:
+     """
+     Stub foundation check for standalone operation.
+
+     Returns PASS for iterations 1-2, WARN for iteration 3+.
+     Full implementation available in mcp-janus.
+
+     Args:
+         iteration_count: Current iteration number
+         same_pattern: Whether the same approach is being repeated
+         merge_ready: Whether work is ready for merge
+         current_approach: Description of current approach
+         approach_history: List of previous approaches
+         confidence: Current confidence level
+         confidence_history: History of confidence levels
+
+     Returns:
+         dict with result, trigger, signals, and recommendation
+     """
+     _ = merge_ready, current_approach, approach_history, confidence, confidence_history
+
+     if iteration_count >= 3:
+         return {
+             "result": "HALT" if same_pattern else "WARN",
+             "trigger": "iteration",
+             "signals": {"iteration": iteration_count},
+             "recommendation": "Consider decomposing the task",
+         }
+     return {
+         "result": "PASS",
+         "trigger": "none",
+         "signals": {"iteration": iteration_count},
+         "recommendation": "Proceed",
+     }
+
+
+ def handle_escalation(result: dict, context: dict) -> dict:
+     """
+     Stub escalation handler - logs but takes no action.
+
+     Args:
+         result: The governance check result
+         context: Additional context for escalation
+
+     Returns:
+         dict with escalation_id and action taken
+     """
+     _ = result, context
+     return {"escalation_id": None, "action": "logged"}
+
+
+ def infer_confidence(text: str) -> Tuple[float, str]:
+     """
+     Stub confidence inference - returns moderate confidence.
+
+     Args:
+         text: Text to analyze for confidence signals
+
+     Returns:
+         Tuple of (confidence_value, confidence_label)
+     """
+     _ = text
+     return (0.7, "moderate")
+
+
+ # In-memory storage for standalone operation
+ _memory_store: dict = {}
+
+
+ def read_tier(tier: str, target_dir: str = ".") -> dict:
+     """
+     Stub memory tier reader - uses in-memory storage.
+
+     Args:
+         tier: Memory tier name (e.g., "governance")
+         target_dir: Target directory (ignored in stub)
+
+     Returns:
+         Stored data for the tier, or empty dict
+     """
+     _ = target_dir
+     return _memory_store.get(tier, {})
+
+
+ def write_tier(tier: str, data: dict, target_dir: str = ".") -> Tuple[bool, List[str]]:
+     """
+     Stub memory tier writer - uses in-memory storage.
+
+     Args:
+         tier: Memory tier name (e.g., "governance")
+         data: Data to store
+         target_dir: Target directory (ignored in stub)
+
+     Returns:
+         Tuple of (success, error_list)
+     """
+     _ = target_dir
+     _memory_store[tier] = data
+     return (True, [])
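
A minimal usage sketch (not part of the wheel) of the stub API above, assuming the module is importable as janus_types after installation; the tier name and argument values are illustrative:

    # Illustrative only -- exercises the standalone stubs defined above.
    from janus_types import TrustScore, foundation_check, read_tier, write_tier

    score = TrustScore.default()                      # TrustScore(value=0.7, confidence=0.5)

    check = foundation_check(iteration_count=3, same_pattern=True)
    print(check["result"], check["recommendation"])   # HALT / "Consider decomposing the task"

    ok, errors = write_tier("governance", {"halted": False})
    state = read_tier("governance")                   # {"halted": False} from the in-memory store
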
probe/__init__.py ADDED
@@ -0,0 +1,19 @@
+ """Probe layer - Discovery via Phoenix integration."""
+
+ from .mutations import TaskMutation, MutationStrategy, apply_mutation, generate_mutation_suite
+ from .explorer import Explorer, ExplorationConfig
+ from .tracer import PhoenixTracer, TraceContext
+ from .discovery import DiscoveryPack, generate_discovery_pack
+
+ __all__ = [
+     "TaskMutation",
+     "MutationStrategy",
+     "apply_mutation",
+     "generate_mutation_suite",
+     "Explorer",
+     "ExplorationConfig",
+     "PhoenixTracer",
+     "TraceContext",
+     "DiscoveryPack",
+     "generate_discovery_pack",
+ ]
probe/discovery.py ADDED
@@ -0,0 +1,194 @@
+ """DiscoveryPack generation for Probe layer."""
+
+ from dataclasses import dataclass
+ from datetime import datetime
+ import hashlib
+ import json
+ from typing import Optional
+
+ from probe.explorer import ExplorationResult, ExplorationRun
+
+
+ @dataclass
+ class FailureCluster:
+     """A cluster of similar failures."""
+     cluster_id: str
+     proposed_name: str
+     frequency: int
+     severity_hint: str
+     exemplar_run_ids: list[str]
+     common_mutation: Optional[str] = None
+     description: str = ""
+
+
+ @dataclass
+ class DiscoveryPack:
+     """
+     Complete discovery output from Probe exploration.
+
+     Contains failure clusters, metadata, and exemplar links.
+     """
+     pack_id: str
+     timestamp: str
+     task_explored: str
+     failure_clusters: list[FailureCluster]
+     metadata: dict
+     total_runs: int
+     novel_failures_found: int
+
+
+ def _cluster_by_exit_code(runs: list[ExplorationRun]) -> dict[str, list[ExplorationRun]]:
+     """Group runs by exit code."""
+     clusters: dict[str, list[ExplorationRun]] = {}
+     for run in runs:
+         code = run.bundle.get("exit_code", "unknown")
+         if code not in clusters:
+             clusters[code] = []
+         clusters[code].append(run)
+     return clusters
+
+
+ def _cluster_by_mutation(runs: list[ExplorationRun]) -> dict[str, list[ExplorationRun]]:
+     """Group runs by mutation strategy."""
+     clusters: dict[str, list[ExplorationRun]] = {}
+     for run in runs:
+         strategy = run.mutation.strategy.value
+         if strategy not in clusters:
+             clusters[strategy] = []
+         clusters[strategy].append(run)
+     return clusters
+
+
+ def _calculate_severity(runs: list[ExplorationRun]) -> str:
+     """
+     Calculate severity hint based on failure characteristics.
+
+     - critical: crash or halt
+     - high: failures with tool errors
+     - medium: failures with no output
+     - low: minor issues
+     """
+     for run in runs:
+         if run.bundle.get("exit_code") in ("crash", "halt"):
+             return "critical"
+         if run.error:
+             return "high"
+
+     for run in runs:
+         for trace in run.bundle.get("tool_traces", []):
+             result = str(trace.get("result", ""))
+             if "error" in result.lower():
+                 return "high"
+
+     for run in runs:
+         if not run.bundle.get("transcript"):
+             return "medium"
+
+     return "low"
+
+
+ def _generate_cluster_id(runs: list[ExplorationRun]) -> str:
+     """Generate deterministic cluster ID."""
+     content = json.dumps([run.run_id for run in runs], sort_keys=True)
+     return hashlib.sha256(content.encode()).hexdigest()[:12]
+
+
+ def _propose_cluster_name(
+     runs: list[ExplorationRun],
+     common_mutation: Optional[str],
+ ) -> str:
+     """Generate a proposed name for the cluster."""
+     if not runs:
+         return "unknown-cluster"
+
+     exit_codes = {run.bundle.get("exit_code") for run in runs}
+
+     if "crash" in exit_codes:
+         return "crash-cluster"
+     if "halt" in exit_codes:
+         return "governance-halt-cluster"
+     if "timeout" in exit_codes:
+         return "timeout-cluster"
+
+     if common_mutation == "tool_removal":
+         return "tool-dependency-failure"
+     if common_mutation == "context_reduce":
+         return "context-sensitivity-failure"
+
+     for run in runs:
+         for trace in run.bundle.get("tool_traces", []):
+             if "error" in str(trace.get("result", "")).lower():
+                 return "tool-error-cluster"
+
+     return "unclassified-failure"
+
+
+ def generate_discovery_pack(
+     exploration_result: ExplorationResult,
+     task: str,
+ ) -> DiscoveryPack:
+     """
+     Generate a DiscoveryPack from exploration results.
+
+     Clusters failures by:
+     1. Exit code (crash, halt, timeout, success)
+     2. Mutation strategy that triggered failure
+     3. Error patterns (via simple heuristics)
+
+     In production with Phoenix, this would use embedding-based
+     semantic clustering. For MVP, we use rule-based clustering.
+
+     Args:
+         exploration_result: Result from Explorer.explore()
+         task: Original task explored
+
+     Returns:
+         DiscoveryPack with failure clusters
+     """
+     failed_runs = [run for run in exploration_result.runs if not run.success]
+
+     for run in exploration_result.runs:
+         if run.success:
+             for trace in run.bundle.get("tool_traces", []):
+                 if "error" in str(trace.get("result", "")).lower():
+                     if run not in failed_runs:
+                         failed_runs.append(run)
+                     break
+
+     mutation_clusters = _cluster_by_mutation(failed_runs)
+
+     failure_clusters: list[FailureCluster] = []
+     for mutation, runs in mutation_clusters.items():
+         if not runs:
+             continue
+
+         cluster = FailureCluster(
+             cluster_id=_generate_cluster_id(runs),
+             proposed_name=_propose_cluster_name(runs, mutation),
+             frequency=len(runs),
+             severity_hint=_calculate_severity(runs),
+             exemplar_run_ids=[runs[0].run_id],
+             common_mutation=mutation,
+             description=f"Failures triggered by {mutation} mutation",
+         )
+         failure_clusters.append(cluster)
+
+     severity_order = {"critical": 0, "high": 1, "medium": 2, "low": 3}
+     failure_clusters.sort(key=lambda cluster: severity_order.get(cluster.severity_hint, 4))
+
+     return DiscoveryPack(
+         pack_id=hashlib.sha256(
+             f"{task}-{exploration_result.total_runs}".encode()
+         ).hexdigest()[:12],
+         timestamp=datetime.now().isoformat(),
+         task_explored=task,
+         failure_clusters=failure_clusters,
+         metadata={
+             "total_runs": exploration_result.total_runs,
+             "successful_runs": exploration_result.successful_runs,
+             "failed_runs": exploration_result.failed_runs,
+             "strategies_used": list(exploration_result.mutations_applied.keys()),
+         },
+         total_runs=exploration_result.total_runs,
+         novel_failures_found=len(failure_clusters),
+     )
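
A short, hedged sketch (not shipped in the package) of how a DiscoveryPack is produced from an exploration result, assuming the probe package above is importable; the task string is made up:

    # Illustrative only -- wires Explorer output into generate_discovery_pack.
    from probe.explorer import Explorer, ExplorationConfig
    from probe.discovery import generate_discovery_pack

    task = "Refactor the pricing module"              # example task, not from the package
    result = Explorer(ExplorationConfig(max_runs=6)).explore(task)
    pack = generate_discovery_pack(result, task=task)

    for cluster in pack.failure_clusters:             # sorted critical -> low by severity_hint
        print(cluster.proposed_name, cluster.frequency, cluster.severity_hint)
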
probe/explorer.py ADDED
@@ -0,0 +1,236 @@
+ """Exploration runner for Probe layer."""
+
+ from dataclasses import dataclass, field
+ from typing import Optional
+
+ from harness.artifacts import ArtifactCollector
+ from harness.executor import init_fixture
+ from harness.types import RunArtifactBundle
+ from probe.mutations import MutationStrategy, TaskMutation, apply_mutation
+ from probe.tracer import PhoenixTracer, TraceContext
+
+
+ @dataclass
+ class ExplorationConfig:
+     """Configuration for exploration runs."""
+     max_runs: int = 10
+     strategies: list[MutationStrategy] = field(
+         default_factory=lambda: [
+             MutationStrategy.NONE,
+             MutationStrategy.TASK_VARIATION,
+             MutationStrategy.TOOL_REMOVAL,
+         ]
+     )
+     available_tools: list[str] = field(
+         default_factory=lambda: ["read_file", "write_file", "bash"]
+     )
+     seed: Optional[int] = 42
+     timeout_ms: int = 60000
+     fixture_path: Optional[str] = None
+
+
+ @dataclass
+ class ExplorationRun:
+     """Result of a single exploration run."""
+     run_id: str
+     mutation: TaskMutation
+     bundle: RunArtifactBundle
+     trace_context: TraceContext
+     success: bool
+     error: Optional[str] = None
+
+
+ @dataclass
+ class ExplorationResult:
+     """Complete exploration result."""
+     config: ExplorationConfig
+     runs: list[ExplorationRun]
+     total_runs: int
+     successful_runs: int
+     failed_runs: int
+     mutations_applied: dict[str, int]
+
+
+ class Explorer:
+     """
+     Executes exploration runs with mutations and tracing.
+
+     The Explorer:
+     1. Generates task mutations
+     2. Initializes fixtures
+     3. Simulates agent execution (or calls real agent)
+     4. Collects traces via PhoenixTracer
+     5. Aggregates results
+     """
+
+     def __init__(
+         self,
+         config: ExplorationConfig,
+         tracer: Optional[PhoenixTracer] = None,
+     ):
+         """
+         Initialize explorer.
+
+         Args:
+             config: Exploration configuration
+             tracer: Optional PhoenixTracer (created if not provided)
+         """
+         self.config = config
+         self.tracer = tracer or PhoenixTracer()
+         self.runs: list[ExplorationRun] = []
+
+     def _simulate_agent_execution(
+         self,
+         mutation: TaskMutation,
+         collector: ArtifactCollector,
+     ) -> None:
+         """
+         Simulate agent execution for a mutated task.
+
+         In production, this would invoke the actual agent.
+         For MVP, we simulate with mock tool calls.
+         """
+         collector.record_message("user", mutation.mutated_task)
+         self.tracer.record_message("user", mutation.mutated_task)
+
+         collector.record_message(
+             "assistant",
+             f"I'll work on: {mutation.mutated_task[:100]}..."
+         )
+         self.tracer.record_message(
+             "assistant",
+             f"Processing task with mutation: {mutation.strategy.value}"
+         )
+
+         if mutation.strategy == MutationStrategy.TOOL_REMOVAL:
+             removed = mutation.mutation_details.get("removed_tool", "unknown")
+             collector.record_tool_call(
+                 removed,
+                 {"action": "attempt"},
+                 {"error": "Tool unavailable"},
+                 0,
+             )
+             self.tracer.record_tool_call(
+                 removed,
+                 {"action": "attempt"},
+                 {"error": "Tool unavailable"},
+                 0,
+             )
+         else:
+             collector.record_tool_call(
+                 "read_file",
+                 {"path": "main.py"},
+                 "def hello(): pass",
+                 50,
+             )
+             self.tracer.record_tool_call(
+                 "read_file",
+                 {"path": "main.py"},
+                 "def hello(): pass",
+                 50,
+             )
+
+     def run_single(
+         self,
+         task: str,
+         strategy: MutationStrategy,
+         run_index: int = 0,
+     ) -> ExplorationRun:
+         """
+         Execute a single exploration run.
+
+         Args:
+             task: Original task description
+             strategy: Mutation strategy to apply
+             run_index: Index for seed calculation
+
+         Returns:
+             ExplorationRun with results
+         """
+         base_seed = self.config.seed
+         seed = base_seed + run_index if base_seed is not None else None
+
+         mutation = apply_mutation(
+             task,
+             strategy,
+             available_tools=self.config.available_tools,
+             seed=seed,
+         )
+
+         trace_ctx = self.tracer.start_trace(
+             task_description=mutation.mutated_task,
+             mutation=strategy.value,
+         )
+
+         if self.config.fixture_path:
+             init_fixture(self.config.fixture_path)
+
+         collector = ArtifactCollector()
+         error = None
+         success = True
+
+         try:
+             self._simulate_agent_execution(mutation, collector)
+             exit_code = "success"
+         except Exception as exc:
+             error = str(exc)
+             success = False
+             exit_code = "crash"
+
+         self.tracer.end_trace(exit_code)
+         bundle = collector.finalize(exit_code)
+
+         run = ExplorationRun(
+             run_id=trace_ctx.run_id,
+             mutation=mutation,
+             bundle=bundle,
+             trace_context=trace_ctx,
+             success=success,
+             error=error,
+         )
+         self.runs.append(run)
+         return run
+
+     def explore(self, task: str) -> ExplorationResult:
+         """
+         Run full exploration with all configured strategies.
+
+         Args:
+             task: Original task to explore
+
+         Returns:
+             ExplorationResult with all runs
+         """
+         self.runs = []
+         mutations_count: dict[str, int] = {}
+
+         run_index = 0
+         for strategy in self.config.strategies:
+             runs_per_strategy = max(
+                 1, self.config.max_runs // len(self.config.strategies)
+             )
+
+             for _ in range(runs_per_strategy):
+                 if run_index >= self.config.max_runs:
+                     break
+
+                 self.run_single(task, strategy, run_index)
+                 run_index += 1
+
+                 strategy_name = strategy.value
+                 mutations_count[strategy_name] = mutations_count.get(strategy_name, 0) + 1
+
+         successful = sum(1 for run in self.runs if run.success)
+
+         return ExplorationResult(
+             config=self.config,
+             runs=self.runs,
+             total_runs=len(self.runs),
+             successful_runs=successful,
+             failed_runs=len(self.runs) - successful,
+             mutations_applied=mutations_count,
+         )
+
+     def get_traces(self) -> list[dict]:
+         """Export collected traces."""
+         return self.tracer.export_traces()
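
A usage sketch for the Explorer above (illustrative, not part of the wheel), assuming probe and harness are importable; the task text, strategy selection, and seed are assumptions:

    # Illustrative only -- runs the simulated exploration loop end to end.
    from probe.explorer import Explorer, ExplorationConfig
    from probe.mutations import MutationStrategy

    config = ExplorationConfig(
        max_runs=4,
        strategies=[MutationStrategy.NONE, MutationStrategy.TOOL_REMOVAL],
        seed=7,
    )
    explorer = Explorer(config)
    result = explorer.explore("Add error handling to the file processor")  # example task

    print(result.total_runs, result.successful_runs, result.failed_runs)
    for run in result.runs:
        print(run.run_id, run.mutation.strategy.value, run.bundle.get("exit_code"))

    traces = explorer.get_traces()  # trace dicts collected by the PhoenixTracer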