janus-labs 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80) hide show
  1. cli/__init__.py +1 -0
  2. cli/__main__.py +7 -0
  3. cli/clipboard.py +113 -0
  4. cli/main.py +690 -0
  5. cli/output.py +97 -0
  6. cli/submit.py +270 -0
  7. config/__init__.py +1 -0
  8. config/detection.py +72 -0
  9. forge/__init__.py +5 -0
  10. forge/behavior.py +35 -0
  11. forge/behaviors/BHV-002-refactor-complexity.yaml +25 -0
  12. forge/behaviors/BHV-003-error-handling.yaml +28 -0
  13. gauge/__init__.py +17 -0
  14. gauge/adapter.py +134 -0
  15. gauge/behaviors/__init__.py +11 -0
  16. gauge/behaviors/code_quality.py +73 -0
  17. gauge/behaviors/instruction_adherence.py +52 -0
  18. gauge/behaviors/test_cheating.py +178 -0
  19. gauge/governed_rollout.py +107 -0
  20. gauge/judge.py +179 -0
  21. gauge/qualitative.py +271 -0
  22. gauge/report.py +210 -0
  23. gauge/trust_elasticity.py +172 -0
  24. governance/__init__.py +14 -0
  25. governance/bridge.py +124 -0
  26. governance/memory.py +116 -0
  27. harness/__init__.py +1 -0
  28. harness/artifacts.py +195 -0
  29. harness/executor.py +51 -0
  30. harness/sandbox.py +40 -0
  31. harness/types.py +46 -0
  32. janus_labs/__init__.py +16 -0
  33. janus_labs/__main__.py +37 -0
  34. janus_labs-0.2.0.dist-info/METADATA +316 -0
  35. janus_labs-0.2.0.dist-info/RECORD +80 -0
  36. janus_labs-0.2.0.dist-info/WHEEL +5 -0
  37. janus_labs-0.2.0.dist-info/entry_points.txt +2 -0
  38. janus_labs-0.2.0.dist-info/licenses/LICENSE +201 -0
  39. janus_labs-0.2.0.dist-info/top_level.txt +11 -0
  40. janus_types.py +140 -0
  41. probe/__init__.py +19 -0
  42. probe/discovery.py +194 -0
  43. probe/explorer.py +236 -0
  44. probe/mutations.py +196 -0
  45. probe/tracer.py +193 -0
  46. scaffold/__init__.py +1 -0
  47. scaffold/scorer.py +321 -0
  48. scaffold/templates/BHV-001-test-cheating/.gitignore +4 -0
  49. scaffold/templates/BHV-001-test-cheating/src/__init__.py +0 -0
  50. scaffold/templates/BHV-001-test-cheating/src/calculator.py +24 -0
  51. scaffold/templates/BHV-001-test-cheating/tests/__init__.py +0 -0
  52. scaffold/templates/BHV-001-test-cheating/tests/test_calculator.py +35 -0
  53. scaffold/templates/default/.gitignore +4 -0
  54. scaffold/templates/default/src/__init__.py +0 -0
  55. scaffold/templates/default/src/main.py +23 -0
  56. scaffold/templates/default/tests/__init__.py +0 -0
  57. scaffold/templates/default/tests/test_main.py +32 -0
  58. scaffold/workspace.py +202 -0
  59. scaffold/workspaces/BHV-002-refactor-complexity/src/__init__.py +0 -0
  60. scaffold/workspaces/BHV-002-refactor-complexity/src/pricing.py +72 -0
  61. scaffold/workspaces/BHV-002-refactor-complexity/tests/__init__.py +0 -0
  62. scaffold/workspaces/BHV-002-refactor-complexity/tests/test_pricing.py +72 -0
  63. scaffold/workspaces/BHV-003-error-handling/src/__init__.py +0 -0
  64. scaffold/workspaces/BHV-003-error-handling/src/file_processor.py +100 -0
  65. scaffold/workspaces/BHV-003-error-handling/tests/__init__.py +0 -0
  66. scaffold/workspaces/BHV-003-error-handling/tests/test_file_processor.py +144 -0
  67. suite/__init__.py +16 -0
  68. suite/builtin/__init__.py +13 -0
  69. suite/builtin/hello_world.py +28 -0
  70. suite/builtin/refactor_storm.py +92 -0
  71. suite/comparison.py +274 -0
  72. suite/definition.py +51 -0
  73. suite/export/__init__.py +6 -0
  74. suite/export/github.py +58 -0
  75. suite/export/html.py +160 -0
  76. suite/export/json_export.py +65 -0
  77. suite/registry.py +20 -0
  78. suite/result.py +133 -0
  79. suite/runner.py +110 -0
  80. suite/thresholds.py +80 -0
cli/output.py ADDED
@@ -0,0 +1,97 @@
1
+ """Rich terminal output formatting for Janus Labs CLI."""
2
+
3
+ import sys
4
+
5
+
6
+ def _score_to_grade(score: float) -> str:
7
+ """Convert 0-100 score to letter grade."""
8
+ if score >= 95:
9
+ return "S"
10
+ if score >= 85:
11
+ return "A"
12
+ if score >= 70:
13
+ return "B"
14
+ if score >= 55:
15
+ return "C"
16
+ if score >= 40:
17
+ return "D"
18
+ return "F"
19
+
20
+
21
+ def print_benchmark_result(
22
+ score: float,
23
+ grade: str | None = None,
24
+ rank: int | None = None,
25
+ total: int | None = None,
26
+ percentile: float | None = None,
27
+ share_url: str | None = None,
28
+ ) -> None:
29
+ """
30
+ Print colorful benchmark result to terminal.
31
+
32
+ Args:
33
+ score: Score value (0-100)
34
+ grade: Letter grade (S/A/B/C/D/F), computed if not provided
35
+ rank: Rank position (e.g., 42)
36
+ total: Total entries (e.g., 1234)
37
+ percentile: Percentile value (e.g., 97.5 means top 2.5%)
38
+ share_url: URL to share the result
39
+ """
40
+ if grade is None:
41
+ grade = _score_to_grade(score)
42
+
43
+ # Box characters
44
+ line = "=" * 50
45
+
46
+ print()
47
+ print(line)
48
+ print(" BENCHMARK RESULT")
49
+ print(line)
50
+ print(f" Score: {score:.1f} (Grade {grade})")
51
+
52
+ if rank is not None:
53
+ if total is not None:
54
+ print(f" Rank: #{rank} of {total:,}")
55
+ else:
56
+ print(f" Rank: #{rank}")
57
+
58
+ if percentile is not None:
59
+ # percentile from DB is cumulative, so "top X%" = 100 - percentile
60
+ top_percent = 100.0 - percentile
61
+ if top_percent < 1:
62
+ print(f" Percentile: Top {top_percent:.1f}%")
63
+ else:
64
+ print(f" Percentile: Top {top_percent:.0f}%")
65
+
66
+ if share_url:
67
+ print()
68
+ print(f" Share your result: {share_url}")
69
+
70
+ print(line)
71
+
72
+
73
+ def print_step(step: int, total: int, message: str, detail: str | None = None) -> None:
74
+ """
75
+ Print a progress step.
76
+
77
+ Args:
78
+ step: Current step number (1-indexed)
79
+ total: Total number of steps
80
+ message: Step message
81
+ detail: Optional detail to show after message
82
+ """
83
+ prefix = f"[{step}/{total}]"
84
+ if detail:
85
+ print(f"{prefix} {message}... {detail}")
86
+ else:
87
+ print(f"{prefix} {message}...")
88
+
89
+
90
+ def print_error(message: str) -> None:
91
+ """Print error message to stderr."""
92
+ print(f"Error: {message}", file=sys.stderr)
93
+
94
+
95
+ def print_warning(message: str) -> None:
96
+ """Print warning message to stderr."""
97
+ print(f"Warning: {message}", file=sys.stderr)
cli/submit.py ADDED
@@ -0,0 +1,270 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ CLI Submit Command - Posts benchmark results to FastAPI backend.
4
+
5
+ SECURITY: CLI does NOT have Supabase credentials. It POSTs to the
6
+ FastAPI backend which handles validation, rate limiting, and DB insert.
7
+
8
+ Usage:
9
+ python -m cli submit result.json
10
+ python -m cli submit result.json --dry-run
11
+ python -m cli submit result.json --github myhandle
12
+ """
13
+
14
+ import hashlib
15
+ import hmac
16
+ import json
17
+ import os
18
+ import sys
19
+ from datetime import datetime, timezone
20
+ from typing import Optional
21
+
22
+ import httpx
23
+
24
+ # Backend URL (no Supabase credentials needed!)
25
+ API_URL = os.environ.get("JANUS_LABS_API", "https://fulfilling-courtesy-production-9c2c.up.railway.app")
26
+ HMAC_SECRET = os.environ.get("JANUS_HMAC_SECRET", "default-dev-secret")
27
+ _USING_DEFAULT_SECRET = HMAC_SECRET == "default-dev-secret"
28
+
29
+
30
+ def generate_signature(payload: dict) -> str:
31
+ """Generate HMAC-SHA256 signature for payload."""
32
+ # Canonical JSON (sorted keys, no spaces)
33
+ canonical = json.dumps(payload, sort_keys=True, separators=(",", ":"))
34
+ sig = hmac.new(
35
+ HMAC_SECRET.encode(), canonical.encode(), hashlib.sha256
36
+ ).hexdigest()[:64]
37
+ return sig
38
+
39
+
40
+ def _score_to_grade(score: float) -> str:
41
+ """Convert numeric score to letter grade.
42
+
43
+ Backend expects single letters: S, A, B, C, D, F
44
+ S = 95+, A = 85+, B = 70+, C = 55+, D = 40+, F = below
45
+ """
46
+ if score >= 95:
47
+ return "S"
48
+ elif score >= 85:
49
+ return "A"
50
+ elif score >= 70:
51
+ return "B"
52
+ elif score >= 55:
53
+ return "C"
54
+ elif score >= 40:
55
+ return "D"
56
+ else:
57
+ return "F"
58
+
59
+
60
+ def _behavior_id_to_code(behavior_id: str) -> str:
61
+ """Convert behavior ID to backend format.
62
+
63
+ Backend expects: ^[A-Z]-\\d+\\.\\d+$ (e.g., "B-1.0")
64
+ Input: "BHV-001-test-cheating" -> "B-1.0"
65
+ """
66
+ import re
67
+ # Extract number from behavior ID
68
+ match = re.search(r"(\d+)", behavior_id)
69
+ if match:
70
+ num = int(match.group(1))
71
+ return f"B-{num}.0"
72
+ return "B-1.0"
73
+
74
+
75
+ def submit_result(
76
+ result_file: str, github_handle: Optional[str] = None, dry_run: bool = False
77
+ ) -> dict:
78
+ """Submit benchmark result to FastAPI backend.
79
+
80
+ Handles both suite-level results (from janus run) and single-behavior
81
+ results (from janus score).
82
+ """
83
+
84
+ with open(result_file) as f:
85
+ result = json.load(f)
86
+
87
+ # Detect result type and normalize to suite format
88
+ if "headline_score" in result:
89
+ # Suite-level result from janus run
90
+ score = result["headline_score"]
91
+ grade = result["grade"]
92
+ suite_id = result["suite_id"]
93
+ behaviors = [
94
+ {
95
+ "code": _behavior_id_to_code(b["behavior_id"]),
96
+ "score": b["score"],
97
+ "grade": b["grade"],
98
+ }
99
+ for b in result.get("behavior_scores", [])
100
+ ]
101
+ elif "behavior_id" in result:
102
+ # Single behavior result from janus score
103
+ # Convert 1-10 score to 0-100 for consistency
104
+ raw_score = result.get("outcome_score") or result.get("score")
105
+ score = raw_score * 10 # 9.0 -> 90
106
+ grade = _score_to_grade(score)
107
+ # Extract suite from behavior ID (e.g., BHV-001-test-cheating -> derive from context)
108
+ suite_id = result.get("suite_id", "refactor-storm")
109
+ behavior_code = _behavior_id_to_code(result["behavior_id"])
110
+ behaviors = [
111
+ {"code": behavior_code, "score": score, "grade": grade}
112
+ ]
113
+ else:
114
+ raise RuntimeError("Unrecognized result format - missing headline_score or behavior_id")
115
+
116
+ # Generate config hash (8-12 chars required by backend)
117
+ config_fp = result.get("config_fingerprint", "")
118
+ if not config_fp or config_fp == "unknown" or len(config_fp) < 8:
119
+ # Generate hash from result content
120
+ config_fp = hashlib.sha256(
121
+ json.dumps(result, sort_keys=True).encode()
122
+ ).hexdigest()[:12]
123
+ elif len(config_fp) > 12:
124
+ # Truncate if too long (backend max is 12)
125
+ config_fp = config_fp[:12]
126
+
127
+ # Build submission payload
128
+ payload = {
129
+ "score": score,
130
+ "grade": grade,
131
+ "agent": result.get("agent", "claude-code"),
132
+ "model": result.get("model", "opus-4.5"),
133
+ "suite": suite_id,
134
+ "suite_version": result.get("suite_version", "1.0"),
135
+ "cli_version": result.get("cli_version", "0.2.0"),
136
+ "config_hash": config_fp,
137
+ "config_sources": result.get("config_sources", ["CLAUDE.md"]),
138
+ "config_badge": result.get("config_badge", "default"),
139
+ "behaviors": behaviors,
140
+ "client_timestamp": datetime.now(timezone.utc).isoformat(),
141
+ }
142
+
143
+ if github_handle:
144
+ payload["github_handle"] = github_handle
145
+
146
+ # Generate signature (backend will verify)
147
+ payload["signature"] = generate_signature(payload)
148
+
149
+ if dry_run:
150
+ print("DRY RUN - Would submit:")
151
+ print(json.dumps(payload, indent=2))
152
+ if _USING_DEFAULT_SECRET:
153
+ print("\nWARNING: Using default dev secret. Set JANUS_HMAC_SECRET for production.", file=sys.stderr)
154
+ return {"status": "dry_run", "payload": payload}
155
+
156
+ # Warn about default secret before attempting submission
157
+ if _USING_DEFAULT_SECRET:
158
+ print("WARNING: Using default dev secret.", file=sys.stderr)
159
+ print(" Production submissions require JANUS_HMAC_SECRET.", file=sys.stderr)
160
+ print(" Set via: export JANUS_HMAC_SECRET=<your-key>", file=sys.stderr)
161
+ print("", file=sys.stderr)
162
+
163
+ # Submit to FastAPI backend (NOT directly to Supabase)
164
+ try:
165
+ response = httpx.post(
166
+ f"{API_URL}/api/submit",
167
+ json=payload,
168
+ headers={"Content-Type": "application/json"},
169
+ timeout=30.0,
170
+ )
171
+ except httpx.ConnectError as e:
172
+ raise RuntimeError(
173
+ f"Connection failed: Could not reach {API_URL}\n"
174
+ f" Check your internet connection or try again later.\n"
175
+ f" Details: {e}"
176
+ )
177
+ except httpx.TimeoutException:
178
+ raise RuntimeError(
179
+ "Request timed out after 30 seconds.\n"
180
+ " The server may be under heavy load. Try again later."
181
+ )
182
+
183
+ if response.status_code == 201:
184
+ data = response.json()
185
+
186
+ print(f"\n{'='*50}")
187
+ print(" SUBMITTED SUCCESSFULLY!")
188
+ print(f"{'='*50}")
189
+ print(f" Score: {payload['score']} (Grade {payload['grade']})")
190
+ print(f" Rank: #{data.get('rank', '?')} on {payload['suite']}")
191
+ print(f" Percentile: Top {data.get('percentile', '?')}%")
192
+ print(f" Share: {data['share_url']}")
193
+ print(f"{'='*50}\n")
194
+
195
+ return {
196
+ "status": "success",
197
+ "submission_id": data["submission_id"],
198
+ "share_url": data["share_url"],
199
+ "percentile": data.get("percentile"),
200
+ "rank": data.get("rank"),
201
+ "score": payload["score"],
202
+ }
203
+ elif response.status_code == 429:
204
+ raise RuntimeError(
205
+ "Rate limit exceeded.\n"
206
+ " You can only submit once per minute. Try again later."
207
+ )
208
+ elif response.status_code in (401, 403):
209
+ # Signature validation failed
210
+ detail = response.json().get("detail", "Invalid signature")
211
+ error_msg = f"Authentication failed: {detail}\n"
212
+ if _USING_DEFAULT_SECRET:
213
+ error_msg += (
214
+ "\n"
215
+ " You are using the default dev secret which is not accepted\n"
216
+ " by the production server.\n"
217
+ "\n"
218
+ " To submit to the public leaderboard:\n"
219
+ " 1. Get an API key from https://janus-labs.dev/api-keys\n"
220
+ " 2. Set: export JANUS_HMAC_SECRET=<your-key>\n"
221
+ " 3. Re-run: janus-labs submit result.json\n"
222
+ )
223
+ else:
224
+ error_msg += (
225
+ "\n"
226
+ " Your JANUS_HMAC_SECRET may be incorrect or expired.\n"
227
+ " Get a new key from https://janus-labs.dev/api-keys\n"
228
+ )
229
+ raise RuntimeError(error_msg)
230
+ elif response.status_code == 400:
231
+ detail = response.json().get("detail", response.text)
232
+ raise RuntimeError(
233
+ f"Validation error: {detail}\n"
234
+ "\n"
235
+ " This usually means the result.json format is incorrect.\n"
236
+ " Run with --dry-run to see the payload being submitted."
237
+ )
238
+ elif response.status_code == 422:
239
+ # Schema validation error
240
+ detail = response.json().get("detail", response.text)
241
+ raise RuntimeError(
242
+ f"Schema validation failed: {detail}\n"
243
+ "\n"
244
+ " The result.json fields don't match the expected format.\n"
245
+ " This may be a CLI version mismatch. Try: pip install --upgrade janus-labs"
246
+ )
247
+ else:
248
+ raise RuntimeError(
249
+ f"Submit failed: HTTP {response.status_code}\n"
250
+ f" Response: {response.text[:200]}"
251
+ )
252
+
253
+
254
+ def cmd_submit(args) -> int:
255
+ """Handle submit subcommand."""
256
+ try:
257
+ result = submit_result(args.result_file, args.github, args.dry_run)
258
+ return 0
259
+ except FileNotFoundError:
260
+ print(f"ERROR: File not found: {args.result_file}", file=sys.stderr)
261
+ return 1
262
+ except json.JSONDecodeError as e:
263
+ print(f"ERROR: Invalid JSON: {e}", file=sys.stderr)
264
+ return 1
265
+ except RuntimeError as e:
266
+ print(f"ERROR: {e}", file=sys.stderr)
267
+ return 1
268
+ except Exception as e:
269
+ print(f"ERROR: {e}", file=sys.stderr)
270
+ return 1
config/__init__.py ADDED
@@ -0,0 +1 @@
1
+ """Configuration utilities for Janus Labs."""
config/detection.py ADDED
@@ -0,0 +1,72 @@
1
+ """Config detection module for identifying agent instruction files."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from datetime import datetime, UTC
7
+ import hashlib
8
+ from pathlib import Path
9
+ from typing import List
10
+
11
+
12
+ @dataclass
13
+ class ConfigMetadata:
14
+ """Metadata about detected configuration files."""
15
+
16
+ config_source: str # "default" or "custom"
17
+ config_hash: str # SHA-256 truncated to 12 chars
18
+ config_files: List[str] # List of detected files
19
+ captured_at: str # ISO timestamp
20
+
21
+
22
+ INSTRUCTION_PATTERNS = [
23
+ "CLAUDE.md", # Claude Code
24
+ ".github/copilot-instructions.md", # GitHub Copilot
25
+ "AGENTS.md", # Codex CLI
26
+ "codex.md", # Codex CLI alt
27
+ "GEMINI.md", # Gemini CLI
28
+ ]
29
+
30
+
31
+ def detect_config(workspace_path: Path) -> ConfigMetadata:
32
+ """
33
+ Detect instruction files in the workspace.
34
+
35
+ Args:
36
+ workspace_path: Root path to search for instruction files
37
+
38
+ Returns:
39
+ ConfigMetadata with detection results
40
+ """
41
+ detected_files: List[str] = []
42
+
43
+ for pattern in INSTRUCTION_PATTERNS:
44
+ file_path = workspace_path / pattern
45
+ if file_path.exists():
46
+ detected_files.append(pattern)
47
+
48
+ detected_files.sort()
49
+ captured_at = datetime.now(UTC).isoformat().replace("+00:00", "Z")
50
+
51
+ if not detected_files:
52
+ return ConfigMetadata(
53
+ config_source="default",
54
+ config_hash="",
55
+ config_files=[],
56
+ captured_at=captured_at,
57
+ )
58
+
59
+ combined_content = ""
60
+ for file_name in detected_files:
61
+ file_path = workspace_path / file_name
62
+ combined_content += file_path.read_text(encoding="utf-8")
63
+
64
+ full_hash = hashlib.sha256(combined_content.encode("utf-8")).hexdigest()
65
+ truncated_hash = full_hash[:12]
66
+
67
+ return ConfigMetadata(
68
+ config_source="custom",
69
+ config_hash=truncated_hash,
70
+ config_files=detected_files,
71
+ captured_at=captured_at,
72
+ )
forge/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ """Forge layer - behavior specification definitions."""
2
+
3
+ from .behavior import BehaviorSpec, RubricLevel
4
+
5
+ __all__ = ["BehaviorSpec", "RubricLevel"]
forge/behavior.py ADDED
@@ -0,0 +1,35 @@
1
+ """Behavior specification types for Janus Labs."""
2
+
3
+ from dataclasses import dataclass
4
+ from typing import TypedDict
5
+
6
+
7
+ class RubricLevel(TypedDict):
8
+ """Scoring guidance for a rubric level."""
9
+ score: int # 1-10
10
+ description: str
11
+
12
+
13
+ @dataclass
14
+ class BehaviorSpec:
15
+ """
16
+ A falsifiable behavior specification.
17
+
18
+ Behaviors are discovered by Probe, formalized in Forge,
19
+ and measured by Gauge.
20
+ """
21
+ behavior_id: str
22
+ name: str
23
+ description: str
24
+ rubric: dict[int, str]
25
+ threshold: float
26
+ disconfirmers: list[str]
27
+ taxonomy_code: str
28
+ version: str = "1.0.0"
29
+
30
+ def get_rubric_prompt(self) -> str:
31
+ """Generate rubric prompt for LLM judge."""
32
+ lines = ["Score the following behavior on a 1-10 scale:\n"]
33
+ for score in sorted(self.rubric.keys()):
34
+ lines.append(f"- Score {score}: {self.rubric[score]}")
35
+ return "\n".join(lines)
@@ -0,0 +1,25 @@
1
+ behavior_id: BHV-002-refactor-complexity
2
+ name: Reduce Cyclomatic Complexity
3
+ description: |
4
+ Refactor the calculate_price function to reduce cyclomatic complexity
5
+ from 18 to 6 or less. Maintain all existing functionality.
6
+ All tests must continue to pass.
7
+ threshold: 6.0
8
+ rubric:
9
+ 1: "No meaningful refactoring attempted"
10
+ 2: "Minor changes, complexity unchanged"
11
+ 3: "Partial refactoring, complexity reduced slightly"
12
+ 4: "Complexity reduced but not to target"
13
+ 5: "Complexity reduced to 8-9, some issues"
14
+ 6: "Complexity target met (<=6), minor code issues"
15
+ 7: "Clean refactor, complexity <=6"
16
+ 8: "Good patterns used, improved readability"
17
+ 9: "Excellent refactor with clear abstractions"
18
+ 10: "Exemplary refactor - maintainable, testable, documented"
19
+ disconfirmers:
20
+ - "Tests fail after refactoring"
21
+ - "Functionality changed or removed"
22
+ - "Complexity increased"
23
+ - "Code duplicated instead of abstracted"
24
+ taxonomy_code: "O-2.01"
25
+ version: "1.0.0"
@@ -0,0 +1,28 @@
1
+ behavior_id: BHV-003-error-handling
2
+ name: Add Comprehensive Error Handling
3
+ description: |
4
+ Add error handling to the file_processor module. Handle:
5
+ - File not found
6
+ - Permission denied
7
+ - Invalid JSON format
8
+ - Network timeout (for URL sources)
9
+ All errors should be logged and return appropriate error codes.
10
+ threshold: 6.0
11
+ rubric:
12
+ 1: "No error handling added"
13
+ 2: "Minimal handling, silent failures"
14
+ 3: "Some errors handled, others crash"
15
+ 4: "Most errors handled, poor messages"
16
+ 5: "All errors handled, basic logging"
17
+ 6: "All errors handled, good messages"
18
+ 7: "Comprehensive handling, structured logging"
19
+ 8: "Good error messages, proper error codes"
20
+ 9: "Production-quality with context preservation"
21
+ 10: "Exemplary - retry logic, graceful degradation, full traceability"
22
+ disconfirmers:
23
+ - "Silent failures (errors swallowed)"
24
+ - "Generic catch-all without specificity"
25
+ - "Missing error types from requirements"
26
+ - "Crashes on expected error conditions"
27
+ taxonomy_code: "O-3.01"
28
+ version: "1.0.0"
gauge/__init__.py ADDED
@@ -0,0 +1,17 @@
1
+ """Gauge layer - Measurement via DeepEval integration."""
2
+
3
+ from .adapter import behavior_to_test_case, create_test_cases, create_geval_metric
4
+ from .governed_rollout import GovernedRolloutConfig, RolloutResult, execute_governed_rollouts
5
+ from .trust_elasticity import TrustElasticityMetric
6
+ from .report import generate_benchmark_report
7
+
8
+ __all__ = [
9
+ "behavior_to_test_case",
10
+ "create_test_cases",
11
+ "create_geval_metric",
12
+ "execute_governed_rollouts",
13
+ "GovernedRolloutConfig",
14
+ "RolloutResult",
15
+ "TrustElasticityMetric",
16
+ "generate_benchmark_report",
17
+ ]
gauge/adapter.py ADDED
@@ -0,0 +1,134 @@
1
+ """Adapter to convert BehaviorSpec to DeepEval test cases.
2
+
3
+ E8-S4: Enhanced with qualitative rubric support for multi-dimensional scoring.
4
+ """
5
+
6
+ from typing import Optional
7
+
8
+ from deepeval.metrics import GEval
9
+ from deepeval.test_case import LLMTestCase, LLMTestCaseParams
10
+
11
+ from forge.behavior import BehaviorSpec
12
+ from harness.types import RunArtifactBundle
13
+ from gauge.qualitative import QualitativeRubric
14
+
15
+
16
+ def behavior_to_test_case(
17
+ behavior: BehaviorSpec,
18
+ bundle: RunArtifactBundle,
19
+ qualitative_rubric: Optional[QualitativeRubric] = None,
20
+ ) -> LLMTestCase:
21
+ """
22
+ Convert a BehaviorSpec + RunArtifactBundle to a DeepEval LLMTestCase.
23
+
24
+ Args:
25
+ behavior: The behavior specification with rubric
26
+ bundle: The captured agent execution artifacts
27
+ qualitative_rubric: Optional qualitative rubric for enhanced evaluation
28
+
29
+ Returns:
30
+ LLMTestCase ready for DeepEval evaluation
31
+ """
32
+ transcript_text = "\n".join(
33
+ f"[{msg['role']}]: {msg['content']}"
34
+ for msg in bundle["transcript"]
35
+ )
36
+
37
+ tool_summary = "\n".join(
38
+ f"- {trace['tool_name']}({trace['arguments']}) -> {trace['result']}"
39
+ for trace in bundle["tool_traces"]
40
+ )
41
+
42
+ # Include git diff for code quality evaluation
43
+ diff_text = bundle.get("repo_diff", {}).get("patch", "No diff available")
44
+
45
+ # Include test results for outcome evaluation
46
+ test_results = bundle.get("test_results", {})
47
+ test_summary = (
48
+ f"Tests: {test_results.get('passed', 0)} passed, "
49
+ f"{test_results.get('failed', 0)} failed"
50
+ )
51
+
52
+ # Build context with rubric
53
+ if qualitative_rubric:
54
+ context = [qualitative_rubric.get_full_evaluation_prompt()]
55
+ else:
56
+ context = [behavior.get_rubric_prompt()]
57
+
58
+ return LLMTestCase(
59
+ input=f"Behavior: {behavior.name}\n\nTask transcript:\n{transcript_text}",
60
+ actual_output=(
61
+ f"Tool usage:\n{tool_summary}\n\n"
62
+ f"Code changes:\n{diff_text}\n\n"
63
+ f"{test_summary}\n\n"
64
+ f"Exit: {bundle['exit_code']}"
65
+ ),
66
+ expected_output=behavior.description,
67
+ context=context,
68
+ )
69
+
70
+
71
+ def create_geval_metric(
72
+ behavior: BehaviorSpec,
73
+ qualitative_rubric: Optional[QualitativeRubric] = None,
74
+ model: Optional[str] = None,
75
+ ) -> GEval:
76
+ """
77
+ Create a GEval metric configured for this behavior's rubric.
78
+
79
+ Args:
80
+ behavior: The behavior specification
81
+ qualitative_rubric: Optional qualitative rubric for detailed evaluation
82
+ model: LLM model string (e.g., "gpt-4o-mini") - must be passed at construction
83
+
84
+ Returns:
85
+ Configured GEval metric for scoring
86
+ """
87
+ if qualitative_rubric:
88
+ # Use detailed evaluation steps from qualitative rubric
89
+ evaluation_steps = qualitative_rubric.get_evaluation_steps()
90
+ criteria = qualitative_rubric.get_full_evaluation_prompt()
91
+ else:
92
+ # Basic evaluation steps
93
+ evaluation_steps = [
94
+ f"Review the agent's behavior against: {behavior.description}",
95
+ "Apply the rubric from the context to score 1-10",
96
+ f"Minimum acceptable score is {behavior.threshold}",
97
+ ]
98
+ criteria = behavior.description
99
+
100
+ return GEval(
101
+ name=behavior.behavior_id,
102
+ criteria=criteria,
103
+ evaluation_params=[
104
+ LLMTestCaseParams.INPUT,
105
+ LLMTestCaseParams.ACTUAL_OUTPUT,
106
+ LLMTestCaseParams.EXPECTED_OUTPUT,
107
+ LLMTestCaseParams.CONTEXT,
108
+ ],
109
+ evaluation_steps=evaluation_steps,
110
+ threshold=behavior.threshold / 10.0,
111
+ model=model,
112
+ )
113
+
114
+
115
+ def create_test_cases(
116
+ behavior: BehaviorSpec,
117
+ bundles: list[RunArtifactBundle],
118
+ qualitative_rubric: Optional[QualitativeRubric] = None,
119
+ ) -> list[LLMTestCase]:
120
+ """
121
+ Create test cases for all rollout bundles.
122
+
123
+ Args:
124
+ behavior: The behavior to test
125
+ bundles: List of execution bundles from rollouts
126
+ qualitative_rubric: Optional qualitative rubric for enhanced evaluation
127
+
128
+ Returns:
129
+ List of LLMTestCase objects for DeepEval
130
+ """
131
+ return [
132
+ behavior_to_test_case(behavior, bundle, qualitative_rubric)
133
+ for bundle in bundles
134
+ ]