stackfix 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cloudgym/__init__.py +3 -0
- cloudgym/benchmark/__init__.py +0 -0
- cloudgym/benchmark/dataset.py +188 -0
- cloudgym/benchmark/evaluator.py +275 -0
- cloudgym/cli.py +61 -0
- cloudgym/fixer/__init__.py +1 -0
- cloudgym/fixer/cli.py +521 -0
- cloudgym/fixer/detector.py +81 -0
- cloudgym/fixer/formatter.py +55 -0
- cloudgym/fixer/lambda_handler.py +126 -0
- cloudgym/fixer/repairer.py +237 -0
- cloudgym/generator/__init__.py +0 -0
- cloudgym/generator/formatter.py +142 -0
- cloudgym/generator/pipeline.py +271 -0
- cloudgym/inverter/__init__.py +0 -0
- cloudgym/inverter/_cf_injectors.py +705 -0
- cloudgym/inverter/_cf_utils.py +202 -0
- cloudgym/inverter/_hcl_utils.py +182 -0
- cloudgym/inverter/_tf_injectors.py +641 -0
- cloudgym/inverter/_yaml_cf.py +84 -0
- cloudgym/inverter/agentic.py +90 -0
- cloudgym/inverter/engine.py +258 -0
- cloudgym/inverter/programmatic.py +95 -0
- cloudgym/scraper/__init__.py +0 -0
- cloudgym/scraper/aws_samples.py +159 -0
- cloudgym/scraper/github.py +238 -0
- cloudgym/scraper/registry.py +165 -0
- cloudgym/scraper/validator.py +116 -0
- cloudgym/taxonomy/__init__.py +10 -0
- cloudgym/taxonomy/base.py +102 -0
- cloudgym/taxonomy/cloudformation.py +258 -0
- cloudgym/taxonomy/terraform.py +274 -0
- cloudgym/utils/__init__.py +0 -0
- cloudgym/utils/config.py +57 -0
- cloudgym/utils/ollama.py +66 -0
- cloudgym/validator/__init__.py +0 -0
- cloudgym/validator/cloudformation.py +55 -0
- cloudgym/validator/opentofu.py +103 -0
- cloudgym/validator/terraform.py +115 -0
- stackfix-0.1.0.dist-info/METADATA +182 -0
- stackfix-0.1.0.dist-info/RECORD +44 -0
- stackfix-0.1.0.dist-info/WHEEL +4 -0
- stackfix-0.1.0.dist-info/entry_points.txt +3 -0
- stackfix-0.1.0.dist-info/licenses/LICENSE +21 -0
cloudgym/__init__.py
ADDED
|
File without changes
|
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
"""Benchmark dataset management.
|
|
2
|
+
|
|
3
|
+
Curates a balanced subset from the test split for evaluation.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import json
|
|
9
|
+
import logging
|
|
10
|
+
from dataclasses import dataclass
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass
class BenchmarkEntry:
    """A single benchmark entry.

    Mirrors one JSON object in the benchmark JSONL file; see
    BenchmarkDataset.build for the writer that produces these records.
    """

    id: str  # unique record identifier
    format: str  # IaC format tag (e.g. "terraform" / "cloudformation" — TODO confirm full value set)
    broken_config: str  # faulted config text the model must repair
    errors: list[str]  # validation error messages observed on the broken config
    warnings: list[str]  # validation warnings (may be empty)
    fault_types: list[str]  # injected fault-type IDs; curated entries are single-fault
    difficulty: str  # difficulty bucket label assigned upstream
    gold_config: str  # the original, valid configuration
    gold_hash: str  # hash of the gold config, used for deduplication during curation
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class BenchmarkDataset:
    """Manages the curated benchmark dataset."""

    def __init__(self, path: str | Path):
        """Open the dataset at *path*; entries load eagerly if the file exists.

        Args:
            path: Path to a benchmark JSONL file.
        """
        self.path = Path(path)
        self.entries: list[BenchmarkEntry] = []
        if self.path.exists():
            self._load()

    def _load(self) -> None:
        """Load benchmark entries from JSONL."""
        with open(self.path, encoding="utf-8") as f:
            for line in f:
                # Tolerate blank/whitespace-only lines (e.g. a trailing
                # newline) instead of crashing in json.loads.
                if not line.strip():
                    continue
                data = json.loads(line)
                # Keep only declared dataclass fields so extra JSON keys
                # don't raise TypeError in the constructor.
                self.entries.append(BenchmarkEntry(**{
                    k: data[k] for k in BenchmarkEntry.__dataclass_fields__
                    if k in data
                }))
        logger.info("Loaded %d benchmark entries from %s", len(self.entries), self.path)

    def __len__(self) -> int:
        """Number of loaded benchmark entries."""
        return len(self.entries)

    def __iter__(self):
        """Iterate over the loaded BenchmarkEntry objects."""
        return iter(self.entries)

    @staticmethod
    def build(
        test_jsonl: str | Path,
        output_path: str | Path,
        target_size: int = 200,
    ) -> BenchmarkDataset:
        """Curate a benchmark dataset from the test split.

        Curation rules:
        - Single-fault only (one fault type per record)
        - Balanced across categories and difficulties
        - Min 10-line configs (non-trivial)
        - Deduplicated per gold config (max 1 entry per gold hash per fault category)

        Args:
            test_jsonl: Path to test.jsonl from format_and_split.
            output_path: Path to write benchmark.jsonl.
            target_size: Target number of benchmark entries.

        Returns:
            BenchmarkDataset with curated entries.
        """
        test_path = Path(test_jsonl)
        out_path = Path(output_path)
        out_path.parent.mkdir(parents=True, exist_ok=True)

        # Load test records (skipping blank lines, same as _load)
        records = []
        with open(test_path, encoding="utf-8") as f:
            for line in f:
                if not line.strip():
                    continue
                records.append(json.loads(line))

        logger.info("Loaded %d test records for curation", len(records))

        # Filter: single-fault, min 10 lines
        candidates = [
            r for r in records
            if len(r.get("fault_types", [])) == 1
            and len(r.get("broken_config", "").splitlines()) >= 10
            and r.get("errors")  # Must have validation errors
        ]
        logger.info("%d candidates after filtering", len(candidates))

        # Deduplicate: max 1 entry per (gold_hash, fault_category)
        seen: set[tuple[str, str]] = set()
        deduped = []
        for r in candidates:
            fault_id = r["fault_types"][0]
            category = fault_id.split(".")[0] if "." in fault_id else fault_id
            key = (r.get("gold_hash", ""), category)
            if key not in seen:
                seen.add(key)
                deduped.append(r)
        logger.info("%d after deduplication", len(deduped))

        # Balance across categories and difficulties
        selected = _balance_select(deduped, target_size)
        logger.info("Selected %d entries for benchmark", len(selected))

        # Write benchmark JSONL
        with open(out_path, "w", encoding="utf-8") as f:
            for r in selected:
                entry = {
                    "id": r["id"],
                    "format": r["format"],
                    "broken_config": r["broken_config"],
                    "errors": r["errors"],
                    "warnings": r.get("warnings", []),
                    "fault_types": r["fault_types"],
                    "difficulty": r["difficulty"],
                    "gold_config": r["gold_config"],
                    "gold_hash": r.get("gold_hash", ""),
                }
                f.write(json.dumps(entry) + "\n")

        # Write metadata (distributions) next to the benchmark file
        meta = {
            "total_entries": len(selected),
            "source": str(test_path),
            "category_distribution": _count_categories(selected),
            "difficulty_distribution": _count_field(selected, "difficulty"),
            "format_distribution": _count_field(selected, "format"),
        }
        meta_path = out_path.parent / "benchmark_meta.json"
        with open(meta_path, "w", encoding="utf-8") as f:
            json.dump(meta, f, indent=2)

        return BenchmarkDataset(out_path)
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def _balance_select(records: list[dict], target: int) -> list[dict]:
|
|
148
|
+
"""Select records with balanced category/difficulty distribution."""
|
|
149
|
+
by_category: dict[str, list[dict]] = {}
|
|
150
|
+
for r in records:
|
|
151
|
+
fault_id = r["fault_types"][0]
|
|
152
|
+
cat = fault_id.split(".")[0] if "." in fault_id else fault_id
|
|
153
|
+
by_category.setdefault(cat, []).append(r)
|
|
154
|
+
|
|
155
|
+
if not by_category:
|
|
156
|
+
return []
|
|
157
|
+
|
|
158
|
+
per_category = max(1, target // len(by_category))
|
|
159
|
+
selected = []
|
|
160
|
+
|
|
161
|
+
for cat, cat_records in by_category.items():
|
|
162
|
+
# Within category, balance by difficulty
|
|
163
|
+
by_diff: dict[str, list[dict]] = {}
|
|
164
|
+
for r in cat_records:
|
|
165
|
+
by_diff.setdefault(r["difficulty"], []).append(r)
|
|
166
|
+
|
|
167
|
+
per_diff = max(1, per_category // max(len(by_diff), 1))
|
|
168
|
+
for diff, diff_records in by_diff.items():
|
|
169
|
+
selected.extend(diff_records[:per_diff])
|
|
170
|
+
|
|
171
|
+
return selected[:target]
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def _count_categories(records: list[dict]) -> dict[str, int]:
|
|
175
|
+
counts: dict[str, int] = {}
|
|
176
|
+
for r in records:
|
|
177
|
+
fault_id = r["fault_types"][0]
|
|
178
|
+
cat = fault_id.split(".")[0] if "." in fault_id else fault_id
|
|
179
|
+
counts[cat] = counts.get(cat, 0) + 1
|
|
180
|
+
return counts
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def _count_field(records: list[dict], field: str) -> dict[str, int]:
|
|
184
|
+
counts: dict[str, int] = {}
|
|
185
|
+
for r in records:
|
|
186
|
+
val = r.get(field, "unknown")
|
|
187
|
+
counts[val] = counts.get(val, 0) + 1
|
|
188
|
+
return counts
|
|
@@ -0,0 +1,275 @@
|
|
|
1
|
+
"""Evaluation harness for IaC repair benchmark.
|
|
2
|
+
|
|
3
|
+
Validates model-generated fixes via terraform validate / cfn-lint
|
|
4
|
+
and computes pass@k metrics using the unbiased Codex estimator.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import asyncio
|
|
10
|
+
import json
|
|
11
|
+
import logging
|
|
12
|
+
import math
|
|
13
|
+
import shutil
|
|
14
|
+
import tempfile
|
|
15
|
+
from dataclasses import dataclass, field
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from typing import Any, Callable, Awaitable
|
|
18
|
+
|
|
19
|
+
from cloudgym.benchmark.dataset import BenchmarkDataset, BenchmarkEntry
|
|
20
|
+
|
|
21
|
+
logger = logging.getLogger(__name__)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@dataclass
class EvalReport:
    """Evaluation report for a model on the benchmark."""

    model_name: str  # name of the evaluated model
    n_attempts: int  # repair attempts generated per benchmark entry
    total_entries: int  # number of benchmark entries evaluated
    pass_at_k: dict[int, float] = field(default_factory=dict)  # overall pass@k, keyed by k
    per_category: dict[str, dict[int, float]] = field(default_factory=dict)  # fault category -> {k: pass@k}
    per_difficulty: dict[str, dict[int, float]] = field(default_factory=dict)  # difficulty -> {k: pass@k}
    per_format: dict[str, dict[int, float]] = field(default_factory=dict)  # IaC format -> {k: pass@k}
    raw_results: list[dict] = field(default_factory=list)  # per-entry dicts incl. "n" (attempts) and "c" (passes)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
# Type alias for a model repair function:
# an async callable (broken_config, error_messages) -> repaired_config text.
ModelFn = Callable[[str, list[str]], Awaitable[str]]
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class Evaluator:
    """Evaluates model repair attempts against the benchmark."""

    # Max concurrent validations (terraform/cfn-lint)
    DEFAULT_CONCURRENCY = 8

    def __init__(self, benchmark_path: str | Path, concurrency: int | None = None):
        """Load the benchmark and configure validation concurrency.

        Args:
            benchmark_path: Path to benchmark.jsonl.
            concurrency: Max concurrent validator subprocesses; defaults to
                DEFAULT_CONCURRENCY when None or 0.
        """
        self.dataset = BenchmarkDataset(benchmark_path)
        self._tf_cache_dir: Path | None = None
        self._concurrency = concurrency or self.DEFAULT_CONCURRENCY

    async def evaluate_model(
        self,
        model_fn: ModelFn,
        model_name: str = "unknown",
        n_attempts: int = 5,
        k_values: list[int] | None = None,
    ) -> EvalReport:
        """Evaluate a model's repair ability on the benchmark.

        Args:
            model_fn: Async function (broken_config, errors) -> repaired_config.
            model_name: Name of the model being evaluated.
            n_attempts: Number of repair attempts per benchmark entry.
            k_values: k values for pass@k computation. Default [1, 3].

        Returns:
            EvalReport with pass@k metrics and breakdowns.
        """
        if k_values is None:
            k_values = [1, 3]

        # Phase 1: Generate all repairs (serial — model inference isn't parallel-safe)
        # Store as list of (entry, [repaired_configs])
        all_repairs: list[tuple[BenchmarkEntry, list[str | None]]] = []
        for entry in self.dataset:
            repairs: list[str | None] = []
            for attempt in range(n_attempts):
                try:
                    repaired = await model_fn(entry.broken_config, entry.errors)
                    repairs.append(repaired)
                except Exception:
                    logger.exception(
                        "Model failed on entry %s attempt %d", entry.id, attempt
                    )
                    # None marks a failed attempt; it counts as non-passing.
                    repairs.append(None)
            all_repairs.append((entry, repairs))

        # Phase 2: Validate all repairs concurrently
        sem = asyncio.Semaphore(self._concurrency)

        async def _validate(repaired: str | None, fmt: str) -> bool:
            if repaired is None:
                return False
            async with sem:
                return await self._check_repair(repaired, fmt)

        raw_results = []
        for entry, repairs in all_repairs:
            tasks = [_validate(r, entry.format) for r in repairs]
            results = await asyncio.gather(*tasks, return_exceptions=True)
            # Exceptions returned by gather() are not True, so they count
            # as failed attempts.
            passes = sum(
                1 for r in results if r is True
            )
            raw_results.append({
                "id": entry.id,
                "format": entry.format,
                "fault_types": entry.fault_types,
                "difficulty": entry.difficulty,
                "n": n_attempts,
                "c": passes,
            })

        # Compute metrics
        report = EvalReport(
            model_name=model_name,
            n_attempts=n_attempts,
            total_entries=len(self.dataset),
            raw_results=raw_results,
        )

        # Overall pass@k
        for k in k_values:
            report.pass_at_k[k] = _compute_pass_at_k(raw_results, k)

        # Per-category breakdown
        categories = set()
        for r in raw_results:
            for ft in r["fault_types"]:
                cat = ft.split(".")[0] if "." in ft else ft
                categories.add(cat)

        for cat in categories:
            cat_results = [
                r for r in raw_results
                if any(ft.startswith(cat) for ft in r["fault_types"])
            ]
            report.per_category[cat] = {
                k: _compute_pass_at_k(cat_results, k) for k in k_values
            }

        # Per-difficulty breakdown
        difficulties = {r["difficulty"] for r in raw_results}
        for diff in difficulties:
            diff_results = [r for r in raw_results if r["difficulty"] == diff]
            report.per_difficulty[diff] = {
                k: _compute_pass_at_k(diff_results, k) for k in k_values
            }

        # Per-format breakdown
        formats = {r["format"] for r in raw_results}
        for fmt in formats:
            fmt_results = [r for r in raw_results if r["format"] == fmt]
            report.per_format[fmt] = {
                k: _compute_pass_at_k(fmt_results, k) for k in k_values
            }

        return report

    async def _ensure_tf_cache(self) -> Path:
        """Create a cached terraform init directory for fast validation."""
        if self._tf_cache_dir and self._tf_cache_dir.exists():
            return self._tf_cache_dir

        cache = Path(tempfile.mkdtemp(prefix="cloudgym_tf_cache_"))
        # Write a minimal .tf that requires the AWS provider
        (cache / "providers.tf").write_text(
            'terraform {\n required_providers {\n'
            ' aws = {\n source = "hashicorp/aws"\n'
            ' version = "~> 5.0"\n }\n }\n}\n'
        )
        # asyncio is imported at module level; the redundant local import
        # that used to be here was removed.
        proc = await asyncio.create_subprocess_exec(
            "terraform", "init", "-backend=false", "-no-color",
            cwd=cache,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        await proc.communicate()
        # Remove the providers.tf so it doesn't interfere with validation
        (cache / "providers.tf").unlink(missing_ok=True)
        self._tf_cache_dir = cache
        logger.info("Cached terraform init at %s", cache)
        return cache

    async def _check_repair(self, repaired: str, iac_format: str) -> bool:
        """Check if a repaired config passes validation.

        Empty/whitespace-only repairs fail immediately; otherwise dispatch
        on format (terraform/opentofu vs. CloudFormation).
        """
        if not repaired or not repaired.strip():
            return False

        if iac_format in ("terraform", "opentofu"):
            return await self._check_terraform_repair(repaired)
        else:
            return await self._check_cf_repair(repaired)

    async def _check_terraform_repair(self, repaired: str) -> bool:
        """Validate terraform repair using cached init directory."""
        # BUGFIX: tmpdir is created *before* the try block. Previously it
        # was assigned inside `try` after `_ensure_tf_cache()`, so if cache
        # setup raised, the `finally` cleanup referenced an unbound name
        # and raised UnboundLocalError instead of returning False.
        tmpdir = Path(tempfile.mkdtemp(prefix="cloudgym_eval_"))
        try:
            cache = await self._ensure_tf_cache()
            # Symlink .terraform and lock file from cache (much faster than copy)
            tf_dir = cache / ".terraform"
            lock_file = cache / ".terraform.lock.hcl"
            if tf_dir.exists():
                (tmpdir / ".terraform").symlink_to(tf_dir)
            if lock_file.exists():
                (tmpdir / ".terraform.lock.hcl").symlink_to(lock_file)

            (tmpdir / "repaired.tf").write_text(repaired)

            # Skip init, go straight to validate
            proc = await asyncio.create_subprocess_exec(
                "terraform", "validate", "-json", "-no-color",
                cwd=tmpdir,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
            )
            stdout, _ = await proc.communicate()
            result = json.loads(stdout.decode(errors="replace"))
            return result.get("valid", False)
        except Exception:
            logger.exception("Terraform validation error during eval")
            return False
        finally:
            shutil.rmtree(tmpdir, ignore_errors=True)

    async def _check_cf_repair(self, repaired: str) -> bool:
        """Validate CloudFormation repair via cfn-lint."""
        tmpdir = Path(tempfile.mkdtemp(prefix="cloudgym_eval_"))
        tmp_file = tmpdir / "repaired.yaml"
        tmp_file.write_text(repaired)
        try:
            from cloudgym.validator.cloudformation import validate
            result = await validate(tmp_file)
            return result.valid and len(result.errors) == 0
        except Exception:
            logger.exception("CF validation error during eval")
            return False
        finally:
            shutil.rmtree(tmpdir, ignore_errors=True)
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
def _compute_pass_at_k(results: list[dict], k: int) -> float:
    """Compute pass@k using the unbiased Codex estimator.

    pass@k = 1 - C(n-c, k) / C(n, k)

    Where n = total attempts and c = correct attempts for one entry; the
    per-entry estimates are averaged over all entries.
    """
    if not results:
        return 0.0

    def estimate(n: int, c: int) -> float:
        # Fewer attempts than k: degrade to "did any attempt pass?".
        if n < k:
            return 1.0 if c > 0 else 0.0
        # No successful attempt at all.
        if c == 0:
            return 0.0
        # Too few failures to fill a k-sized sample without a success.
        if n - c < k:
            return 1.0
        # Guards above guarantee 0 < k <= n - c <= n, so math.comb is safe.
        return 1.0 - math.comb(n - c, k) / math.comb(n, k)

    return sum(estimate(r["n"], r["c"]) for r in results) / len(results)
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
def _comb(n: int, k: int) -> float:
|
|
272
|
+
"""Compute combination C(n, k) using math.comb."""
|
|
273
|
+
if k < 0 or k > n:
|
|
274
|
+
return 0.0
|
|
275
|
+
return float(math.comb(n, k))
|
cloudgym/cli.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"""Cloud-Gym CLI entry point."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import click
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
# Root command group; subcommands attach themselves via @main.command().
@click.group()
@click.version_option(package_name="cloud-gym")
def main():
    """Cloud-Gym: IaC Repair Benchmark via Environment Inversion."""
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@main.command()
@click.option("--skip-github", is_flag=True)
@click.option("--skip-registry", is_flag=True)
@click.option("--skip-aws", is_flag=True)
@click.option("--skip-validate", is_flag=True)
def scrape(skip_github: bool, skip_registry: bool, skip_aws: bool, skip_validate: bool):
    """Collect gold IaC configurations from various sources."""
    # Imported lazily so other subcommands don't pay the import cost.
    import asyncio

    from scripts.scrape import run_scrape

    pipeline = run_scrape(skip_github, skip_registry, skip_aws, skip_validate)
    asyncio.run(pipeline)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@main.command()
def taxonomy():
    """Display the fault taxonomy as a rich table plus per-category counts."""
    from rich.console import Console
    from rich.table import Table

    # Removed unused IaCFormat from this import (it was never referenced).
    from cloudgym.taxonomy.base import REGISTRY, FaultCategory
    import cloudgym.taxonomy.terraform  # noqa: F401 — triggers registration
    import cloudgym.taxonomy.cloudformation  # noqa: F401

    console = Console()
    table = Table(title="Cloud-Gym Fault Taxonomy")
    table.add_column("ID", style="cyan")
    table.add_column("Category", style="magenta")
    table.add_column("Severity")
    table.add_column("Formats")
    table.add_column("Description")

    # Hoisted out of the loop (it was rebuilt every iteration) and looked
    # up with .get() so an unexpected severity value renders in a neutral
    # color instead of crashing the whole listing with a KeyError.
    severity_colors = {"low": "green", "medium": "yellow", "high": "red"}

    for fault in sorted(REGISTRY.all(), key=lambda f: (f.category.name, f.severity.value)):
        formats = ", ".join(f.value for f in fault.applicable_formats)
        sev_style = severity_colors.get(fault.severity.value, "white")
        table.add_row(
            fault.id,
            fault.category.name,
            f"[{sev_style}]{fault.severity.value}[/{sev_style}]",
            formats,
            fault.description,
        )

    console.print(table)
    console.print(f"\nTotal fault types: {len(REGISTRY)}")
    for cat in FaultCategory:
        count = len(REGISTRY.list_by_category(cat))
        if count > 0:
            console.print(f" {cat.name}: {count}")
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""stackfix: AI-powered Infrastructure-as-Code repair tool."""
|