stackfix 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. cloudgym/__init__.py +3 -0
  2. cloudgym/benchmark/__init__.py +0 -0
  3. cloudgym/benchmark/dataset.py +188 -0
  4. cloudgym/benchmark/evaluator.py +275 -0
  5. cloudgym/cli.py +61 -0
  6. cloudgym/fixer/__init__.py +1 -0
  7. cloudgym/fixer/cli.py +521 -0
  8. cloudgym/fixer/detector.py +81 -0
  9. cloudgym/fixer/formatter.py +55 -0
  10. cloudgym/fixer/lambda_handler.py +126 -0
  11. cloudgym/fixer/repairer.py +237 -0
  12. cloudgym/generator/__init__.py +0 -0
  13. cloudgym/generator/formatter.py +142 -0
  14. cloudgym/generator/pipeline.py +271 -0
  15. cloudgym/inverter/__init__.py +0 -0
  16. cloudgym/inverter/_cf_injectors.py +705 -0
  17. cloudgym/inverter/_cf_utils.py +202 -0
  18. cloudgym/inverter/_hcl_utils.py +182 -0
  19. cloudgym/inverter/_tf_injectors.py +641 -0
  20. cloudgym/inverter/_yaml_cf.py +84 -0
  21. cloudgym/inverter/agentic.py +90 -0
  22. cloudgym/inverter/engine.py +258 -0
  23. cloudgym/inverter/programmatic.py +95 -0
  24. cloudgym/scraper/__init__.py +0 -0
  25. cloudgym/scraper/aws_samples.py +159 -0
  26. cloudgym/scraper/github.py +238 -0
  27. cloudgym/scraper/registry.py +165 -0
  28. cloudgym/scraper/validator.py +116 -0
  29. cloudgym/taxonomy/__init__.py +10 -0
  30. cloudgym/taxonomy/base.py +102 -0
  31. cloudgym/taxonomy/cloudformation.py +258 -0
  32. cloudgym/taxonomy/terraform.py +274 -0
  33. cloudgym/utils/__init__.py +0 -0
  34. cloudgym/utils/config.py +57 -0
  35. cloudgym/utils/ollama.py +66 -0
  36. cloudgym/validator/__init__.py +0 -0
  37. cloudgym/validator/cloudformation.py +55 -0
  38. cloudgym/validator/opentofu.py +103 -0
  39. cloudgym/validator/terraform.py +115 -0
  40. stackfix-0.1.0.dist-info/METADATA +182 -0
  41. stackfix-0.1.0.dist-info/RECORD +44 -0
  42. stackfix-0.1.0.dist-info/WHEEL +4 -0
  43. stackfix-0.1.0.dist-info/entry_points.txt +3 -0
  44. stackfix-0.1.0.dist-info/licenses/LICENSE +21 -0
cloudgym/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ """Cloud-Gym: IaC Repair Benchmark via Environment Inversion."""
2
+
3
+ __version__ = "0.1.0"
File without changes
@@ -0,0 +1,188 @@
1
+ """Benchmark dataset management.
2
+
3
+ Curates a balanced subset from the test split for evaluation.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import json
9
+ import logging
10
+ from dataclasses import dataclass
11
+ from pathlib import Path
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
@dataclass
class BenchmarkEntry:
    """A single benchmark entry.

    Mirrors one JSONL record as written by BenchmarkDataset.build.
    """

    id: str  # unique record identifier
    format: str  # IaC format of the config (e.g. "terraform", "cloudformation")
    broken_config: str  # fault-injected configuration text
    errors: list[str]  # validation errors produced by the broken config
    warnings: list[str]  # validation warnings produced by the broken config
    fault_types: list[str]  # injected fault IDs (curated entries are single-fault)
    difficulty: str  # difficulty label carried over from the source record
    gold_config: str  # original (correct) configuration text
    gold_hash: str  # content hash of the gold config, used for deduplication
29
+
30
+
31
class BenchmarkDataset:
    """Manages the curated benchmark dataset (a JSONL file of BenchmarkEntry records)."""

    def __init__(self, path: str | Path):
        # A missing file yields an empty dataset rather than an error; build()
        # relies on this when constructing the dataset before the file exists.
        self.path = Path(path)
        self.entries: list[BenchmarkEntry] = []
        if self.path.exists():
            self._load()

    def _load(self) -> None:
        """Load benchmark entries from JSONL."""
        with open(self.path) as f:
            for line in f:
                data = json.loads(line)
                # Keep only keys that are BenchmarkEntry fields; extra keys in the
                # record are ignored. A record missing a required field raises
                # TypeError from the dataclass constructor.
                self.entries.append(BenchmarkEntry(**{
                    k: data[k] for k in BenchmarkEntry.__dataclass_fields__
                    if k in data
                }))
        logger.info("Loaded %d benchmark entries from %s", len(self.entries), self.path)

    def __len__(self) -> int:
        return len(self.entries)

    def __iter__(self):
        return iter(self.entries)

    @staticmethod
    def build(
        test_jsonl: str | Path,
        output_path: str | Path,
        target_size: int = 200,
    ) -> BenchmarkDataset:
        """Curate a benchmark dataset from the test split.

        Curation rules (applied in this order):
        - Single-fault only (one fault type per record)
        - Balanced across categories and difficulties
        - Min 10-line configs (non-trivial)
        - Deduplicated per gold config (max 1 entry per gold hash per fault category)

        Args:
            test_jsonl: Path to test.jsonl from format_and_split.
            output_path: Path to write benchmark.jsonl.
            target_size: Target number of benchmark entries.

        Returns:
            BenchmarkDataset with curated entries.
        """
        test_path = Path(test_jsonl)
        out_path = Path(output_path)
        out_path.parent.mkdir(parents=True, exist_ok=True)

        # Load test records
        records = []
        with open(test_path) as f:
            for line in f:
                records.append(json.loads(line))

        logger.info("Loaded %d test records for curation", len(records))

        # Filter: single-fault, min 10 lines
        candidates = [
            r for r in records
            if len(r.get("fault_types", [])) == 1
            and len(r.get("broken_config", "").splitlines()) >= 10
            and r.get("errors")  # Must have validation errors
        ]
        logger.info("%d candidates after filtering", len(candidates))

        # Deduplicate: max 1 entry per (gold_hash, fault_category).
        # Category is the fault-id prefix before the first dot.
        seen: set[tuple[str, str]] = set()
        deduped = []
        for r in candidates:
            fault_id = r["fault_types"][0]
            category = fault_id.split(".")[0] if "." in fault_id else fault_id
            key = (r.get("gold_hash", ""), category)
            if key not in seen:
                seen.add(key)
                deduped.append(r)
        logger.info("%d after deduplication", len(deduped))

        # Balance across categories and difficulties
        selected = _balance_select(deduped, target_size)
        logger.info("Selected %d entries for benchmark", len(selected))

        # Write benchmark JSONL (one entry per line, schema matching BenchmarkEntry)
        with open(out_path, "w") as f:
            for r in selected:
                entry = {
                    "id": r["id"],
                    "format": r["format"],
                    "broken_config": r["broken_config"],
                    "errors": r["errors"],
                    "warnings": r.get("warnings", []),
                    "fault_types": r["fault_types"],
                    "difficulty": r["difficulty"],
                    "gold_config": r["gold_config"],
                    "gold_hash": r.get("gold_hash", ""),
                }
                f.write(json.dumps(entry) + "\n")

        # Write metadata summary alongside the dataset
        meta = {
            "total_entries": len(selected),
            "source": str(test_path),
            "category_distribution": _count_categories(selected),
            "difficulty_distribution": _count_field(selected, "difficulty"),
            "format_distribution": _count_field(selected, "format"),
        }
        meta_path = out_path.parent / "benchmark_meta.json"
        with open(meta_path, "w") as f:
            json.dump(meta, f, indent=2)

        return BenchmarkDataset(out_path)
145
+
146
+
147
+ def _balance_select(records: list[dict], target: int) -> list[dict]:
148
+ """Select records with balanced category/difficulty distribution."""
149
+ by_category: dict[str, list[dict]] = {}
150
+ for r in records:
151
+ fault_id = r["fault_types"][0]
152
+ cat = fault_id.split(".")[0] if "." in fault_id else fault_id
153
+ by_category.setdefault(cat, []).append(r)
154
+
155
+ if not by_category:
156
+ return []
157
+
158
+ per_category = max(1, target // len(by_category))
159
+ selected = []
160
+
161
+ for cat, cat_records in by_category.items():
162
+ # Within category, balance by difficulty
163
+ by_diff: dict[str, list[dict]] = {}
164
+ for r in cat_records:
165
+ by_diff.setdefault(r["difficulty"], []).append(r)
166
+
167
+ per_diff = max(1, per_category // max(len(by_diff), 1))
168
+ for diff, diff_records in by_diff.items():
169
+ selected.extend(diff_records[:per_diff])
170
+
171
+ return selected[:target]
172
+
173
+
174
+ def _count_categories(records: list[dict]) -> dict[str, int]:
175
+ counts: dict[str, int] = {}
176
+ for r in records:
177
+ fault_id = r["fault_types"][0]
178
+ cat = fault_id.split(".")[0] if "." in fault_id else fault_id
179
+ counts[cat] = counts.get(cat, 0) + 1
180
+ return counts
181
+
182
+
183
+ def _count_field(records: list[dict], field: str) -> dict[str, int]:
184
+ counts: dict[str, int] = {}
185
+ for r in records:
186
+ val = r.get(field, "unknown")
187
+ counts[val] = counts.get(val, 0) + 1
188
+ return counts
@@ -0,0 +1,275 @@
1
+ """Evaluation harness for IaC repair benchmark.
2
+
3
+ Validates model-generated fixes via terraform validate / cfn-lint
4
+ and computes pass@k metrics using the unbiased Codex estimator.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import asyncio
10
+ import json
11
+ import logging
12
+ import math
13
+ import shutil
14
+ import tempfile
15
+ from dataclasses import dataclass, field
16
+ from pathlib import Path
17
+ from typing import Any, Callable, Awaitable
18
+
19
+ from cloudgym.benchmark.dataset import BenchmarkDataset, BenchmarkEntry
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
@dataclass
class EvalReport:
    """Evaluation report for a model on the benchmark."""

    model_name: str  # name of the evaluated model
    n_attempts: int  # repair attempts made per benchmark entry
    total_entries: int  # number of benchmark entries evaluated
    # pass@k keyed by k: overall, then broken down by fault category,
    # difficulty label, and IaC format.
    pass_at_k: dict[int, float] = field(default_factory=dict)
    per_category: dict[str, dict[int, float]] = field(default_factory=dict)
    per_difficulty: dict[str, dict[int, float]] = field(default_factory=dict)
    per_format: dict[str, dict[int, float]] = field(default_factory=dict)
    # one dict per entry: id/format/fault_types/difficulty plus
    # n (total attempts) and c (passing attempts)
    raw_results: list[dict] = field(default_factory=list)
36
+
37
+
38
+ # Type alias for model repair function
39
+ ModelFn = Callable[[str, list[str]], Awaitable[str]]
40
+
41
+
42
class Evaluator:
    """Evaluates model repair attempts against the benchmark.

    Repairs are generated serially (model inference is assumed not to be
    parallel-safe) and then validated concurrently, bounded by a semaphore.
    """

    # Max concurrent validations (terraform/cfn-lint)
    DEFAULT_CONCURRENCY = 8

    def __init__(self, benchmark_path: str | Path, concurrency: int | None = None):
        """Args:
            benchmark_path: Path to benchmark.jsonl (see BenchmarkDataset).
            concurrency: Max concurrent validations; defaults to DEFAULT_CONCURRENCY.
        """
        self.dataset = BenchmarkDataset(benchmark_path)
        # Lazily-created terraform working dir with the AWS provider pre-installed.
        self._tf_cache_dir: Path | None = None
        self._concurrency = concurrency or self.DEFAULT_CONCURRENCY

    async def evaluate_model(
        self,
        model_fn: ModelFn,
        model_name: str = "unknown",
        n_attempts: int = 5,
        k_values: list[int] | None = None,
    ) -> EvalReport:
        """Evaluate a model's repair ability on the benchmark.

        Args:
            model_fn: Async function (broken_config, errors) -> repaired_config.
            model_name: Name of the model being evaluated.
            n_attempts: Number of repair attempts per benchmark entry.
            k_values: k values for pass@k computation. Default [1, 3].

        Returns:
            EvalReport with pass@k metrics and breakdowns.
        """
        if k_values is None:
            k_values = [1, 3]

        # Phase 1: Generate all repairs (serial — model inference isn't parallel-safe)
        # Store as list of (entry, [repaired_configs])
        all_repairs: list[tuple[BenchmarkEntry, list[str | None]]] = []
        for entry in self.dataset:
            repairs: list[str | None] = []
            for attempt in range(n_attempts):
                try:
                    repaired = await model_fn(entry.broken_config, entry.errors)
                    repairs.append(repaired)
                except Exception:
                    # A failed generation counts as a failed attempt (None placeholder).
                    logger.exception(
                        "Model failed on entry %s attempt %d", entry.id, attempt
                    )
                    repairs.append(None)
            all_repairs.append((entry, repairs))

        # Phase 2: Validate all repairs concurrently, bounded by the semaphore.
        sem = asyncio.Semaphore(self._concurrency)

        async def _validate(repaired: str | None, fmt: str) -> bool:
            if repaired is None:
                return False
            async with sem:
                return await self._check_repair(repaired, fmt)

        raw_results = []
        for entry, repairs in all_repairs:
            tasks = [_validate(r, entry.format) for r in repairs]
            results = await asyncio.gather(*tasks, return_exceptions=True)
            # Exceptions from validation count as failed attempts.
            passes = sum(1 for r in results if r is True)
            raw_results.append({
                "id": entry.id,
                "format": entry.format,
                "fault_types": entry.fault_types,
                "difficulty": entry.difficulty,
                "n": n_attempts,
                "c": passes,
            })

        # Compute metrics
        report = EvalReport(
            model_name=model_name,
            n_attempts=n_attempts,
            total_entries=len(self.dataset),
            raw_results=raw_results,
        )

        # Overall pass@k
        for k in k_values:
            report.pass_at_k[k] = _compute_pass_at_k(raw_results, k)

        # Per-category breakdown (category = fault-id prefix before the first dot)
        categories = set()
        for r in raw_results:
            for ft in r["fault_types"]:
                cat = ft.split(".")[0] if "." in ft else ft
                categories.add(cat)

        for cat in categories:
            cat_results = [
                r for r in raw_results
                if any(ft.startswith(cat) for ft in r["fault_types"])
            ]
            report.per_category[cat] = {
                k: _compute_pass_at_k(cat_results, k) for k in k_values
            }

        # Per-difficulty breakdown
        difficulties = {r["difficulty"] for r in raw_results}
        for diff in difficulties:
            diff_results = [r for r in raw_results if r["difficulty"] == diff]
            report.per_difficulty[diff] = {
                k: _compute_pass_at_k(diff_results, k) for k in k_values
            }

        # Per-format breakdown
        formats = {r["format"] for r in raw_results}
        for fmt in formats:
            fmt_results = [r for r in raw_results if r["format"] == fmt]
            report.per_format[fmt] = {
                k: _compute_pass_at_k(fmt_results, k) for k in k_values
            }

        return report

    async def _ensure_tf_cache(self) -> Path:
        """Create (once) a terraform-initialized directory for fast validation."""
        if self._tf_cache_dir and self._tf_cache_dir.exists():
            return self._tf_cache_dir

        cache = Path(tempfile.mkdtemp(prefix="cloudgym_tf_cache_"))
        # Minimal .tf that pins the AWS provider so `terraform init` downloads
        # it into .terraform/ for reuse by every validation run.
        (cache / "providers.tf").write_text(
            'terraform {\n  required_providers {\n'
            '    aws = {\n      source = "hashicorp/aws"\n'
            '      version = "~> 5.0"\n    }\n  }\n}\n'
        )
        # NOTE: asyncio is already imported at module level; no local import needed.
        proc = await asyncio.create_subprocess_exec(
            "terraform", "init", "-backend=false", "-no-color",
            cwd=cache,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        await proc.communicate()
        # Remove the providers.tf so it doesn't interfere with validation
        (cache / "providers.tf").unlink(missing_ok=True)
        self._tf_cache_dir = cache
        logger.info("Cached terraform init at %s", cache)
        return cache

    async def _check_repair(self, repaired: str, iac_format: str) -> bool:
        """Check if a repaired config passes validation.

        Empty or whitespace-only output is an immediate failure; otherwise
        dispatch on the IaC format.
        """
        if not repaired or not repaired.strip():
            return False

        if iac_format in ("terraform", "opentofu"):
            return await self._check_terraform_repair(repaired)
        else:
            return await self._check_cf_repair(repaired)

    async def _check_terraform_repair(self, repaired: str) -> bool:
        """Validate a terraform repair using the cached init directory.

        Returns False on any validation or infrastructure error.
        """
        # Initialize before the try: if _ensure_tf_cache raises, the finally
        # must not hit an unbound name (bug fix — tmpdir was previously assigned
        # inside the try, after an await that could raise).
        tmpdir: Path | None = None
        try:
            cache = await self._ensure_tf_cache()
            tmpdir = Path(tempfile.mkdtemp(prefix="cloudgym_eval_"))
            # Symlink .terraform and lock file from cache (much faster than copy)
            tf_dir = cache / ".terraform"
            lock_file = cache / ".terraform.lock.hcl"
            if tf_dir.exists():
                (tmpdir / ".terraform").symlink_to(tf_dir)
            if lock_file.exists():
                (tmpdir / ".terraform.lock.hcl").symlink_to(lock_file)

            (tmpdir / "repaired.tf").write_text(repaired)

            # Skip init, go straight to validate
            proc = await asyncio.create_subprocess_exec(
                "terraform", "validate", "-json", "-no-color",
                cwd=tmpdir,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
            )
            stdout, _ = await proc.communicate()
            result = json.loads(stdout.decode(errors="replace"))
            return result.get("valid", False)
        except Exception:
            logger.exception("Terraform validation error during eval")
            return False
        finally:
            if tmpdir is not None:
                shutil.rmtree(tmpdir, ignore_errors=True)

    async def _check_cf_repair(self, repaired: str) -> bool:
        """Validate a CloudFormation repair via cfn-lint.

        Returns True only when the validator reports valid with zero errors.
        """
        tmpdir = Path(tempfile.mkdtemp(prefix="cloudgym_eval_"))
        tmp_file = tmpdir / "repaired.yaml"
        tmp_file.write_text(repaired)
        try:
            # Local import keeps the validator an optional dependency of this module.
            from cloudgym.validator.cloudformation import validate
            result = await validate(tmp_file)
            return result.valid and len(result.errors) == 0
        except Exception:
            logger.exception("CF validation error during eval")
            return False
        finally:
            shutil.rmtree(tmpdir, ignore_errors=True)
242
+
243
+
244
+ def _compute_pass_at_k(results: list[dict], k: int) -> float:
245
+ """Compute pass@k using the unbiased Codex estimator.
246
+
247
+ pass@k = 1 - C(n-c, k) / C(n, k)
248
+
249
+ Where n = total attempts, c = number of correct attempts.
250
+ """
251
+ if not results:
252
+ return 0.0
253
+
254
+ total = 0.0
255
+ for r in results:
256
+ n = r["n"]
257
+ c = r["c"]
258
+ if n < k:
259
+ # Not enough attempts
260
+ total += 1.0 if c > 0 else 0.0
261
+ elif c == 0:
262
+ total += 0.0
263
+ elif n - c < k:
264
+ total += 1.0
265
+ else:
266
+ total += 1.0 - _comb(n - c, k) / _comb(n, k)
267
+
268
+ return total / len(results)
269
+
270
+
271
+ def _comb(n: int, k: int) -> float:
272
+ """Compute combination C(n, k) using math.comb."""
273
+ if k < 0 or k > n:
274
+ return 0.0
275
+ return float(math.comb(n, k))
cloudgym/cli.py ADDED
@@ -0,0 +1,61 @@
1
+ """Cloud-Gym CLI entry point."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import click
6
+
7
+
8
@click.group()
@click.version_option(package_name="cloud-gym")
def main():
    """Cloud-Gym: IaC Repair Benchmark via Environment Inversion."""
    # Root command group; subcommands attach themselves via @main.command().
12
+
13
+
14
@main.command()
@click.option("--skip-github", is_flag=True)
@click.option("--skip-registry", is_flag=True)
@click.option("--skip-aws", is_flag=True)
@click.option("--skip-validate", is_flag=True)
def scrape(skip_github: bool, skip_registry: bool, skip_aws: bool, skip_validate: bool):
    """Collect gold IaC configurations from various sources."""
    # Imports are deferred so the CLI group stays fast to load.
    import asyncio
    from scripts.scrape import run_scrape

    # Flags are forwarded positionally to run_scrape — presumably each one
    # disables the matching scrape stage; confirm against scripts/scrape.
    asyncio.run(run_scrape(skip_github, skip_registry, skip_aws, skip_validate))
25
+
26
+
27
@main.command()
def taxonomy():
    """Display the fault taxonomy."""
    # Imports are local so the CLI group loads fast; importing the terraform
    # and cloudformation modules populates REGISTRY as an import side effect.
    from rich.console import Console
    from rich.table import Table

    from cloudgym.taxonomy.base import REGISTRY, FaultCategory, IaCFormat
    import cloudgym.taxonomy.terraform  # noqa: F401 — triggers registration
    import cloudgym.taxonomy.cloudformation  # noqa: F401

    console = Console()
    table = Table(title="Cloud-Gym Fault Taxonomy")
    table.add_column("ID", style="cyan")
    table.add_column("Category", style="magenta")
    table.add_column("Severity")
    table.add_column("Formats")
    table.add_column("Description")

    # Stable display order: category name first, then severity value.
    for fault in sorted(REGISTRY.all(), key=lambda f: (f.category.name, f.severity.value)):
        formats = ", ".join(f.value for f in fault.applicable_formats)
        # NOTE(review): raises KeyError for a severity outside low/medium/high —
        # confirm the severity enum has exactly these three values.
        sev_style = {"low": "green", "medium": "yellow", "high": "red"}[fault.severity.value]
        table.add_row(
            fault.id,
            fault.category.name,
            f"[{sev_style}]{fault.severity.value}[/{sev_style}]",
            formats,
            fault.description,
        )

    console.print(table)
    console.print(f"\nTotal fault types: {len(REGISTRY)}")
    # Per-category counts; categories with no registered faults are omitted.
    for cat in FaultCategory:
        count = len(REGISTRY.list_by_category(cat))
        if count > 0:
            console.print(f" {cat.name}: {count}")
@@ -0,0 +1 @@
1
+ """stackfix: AI-powered Infrastructure-as-Code repair tool."""