stackfix 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cloudgym/__init__.py +3 -0
- cloudgym/benchmark/__init__.py +0 -0
- cloudgym/benchmark/dataset.py +188 -0
- cloudgym/benchmark/evaluator.py +275 -0
- cloudgym/cli.py +61 -0
- cloudgym/fixer/__init__.py +1 -0
- cloudgym/fixer/cli.py +521 -0
- cloudgym/fixer/detector.py +81 -0
- cloudgym/fixer/formatter.py +55 -0
- cloudgym/fixer/lambda_handler.py +126 -0
- cloudgym/fixer/repairer.py +237 -0
- cloudgym/generator/__init__.py +0 -0
- cloudgym/generator/formatter.py +142 -0
- cloudgym/generator/pipeline.py +271 -0
- cloudgym/inverter/__init__.py +0 -0
- cloudgym/inverter/_cf_injectors.py +705 -0
- cloudgym/inverter/_cf_utils.py +202 -0
- cloudgym/inverter/_hcl_utils.py +182 -0
- cloudgym/inverter/_tf_injectors.py +641 -0
- cloudgym/inverter/_yaml_cf.py +84 -0
- cloudgym/inverter/agentic.py +90 -0
- cloudgym/inverter/engine.py +258 -0
- cloudgym/inverter/programmatic.py +95 -0
- cloudgym/scraper/__init__.py +0 -0
- cloudgym/scraper/aws_samples.py +159 -0
- cloudgym/scraper/github.py +238 -0
- cloudgym/scraper/registry.py +165 -0
- cloudgym/scraper/validator.py +116 -0
- cloudgym/taxonomy/__init__.py +10 -0
- cloudgym/taxonomy/base.py +102 -0
- cloudgym/taxonomy/cloudformation.py +258 -0
- cloudgym/taxonomy/terraform.py +274 -0
- cloudgym/utils/__init__.py +0 -0
- cloudgym/utils/config.py +57 -0
- cloudgym/utils/ollama.py +66 -0
- cloudgym/validator/__init__.py +0 -0
- cloudgym/validator/cloudformation.py +55 -0
- cloudgym/validator/opentofu.py +103 -0
- cloudgym/validator/terraform.py +115 -0
- stackfix-0.1.0.dist-info/METADATA +182 -0
- stackfix-0.1.0.dist-info/RECORD +44 -0
- stackfix-0.1.0.dist-info/WHEEL +4 -0
- stackfix-0.1.0.dist-info/entry_points.txt +3 -0
- stackfix-0.1.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,271 @@
|
|
|
1
|
+
"""End-to-end training data generation pipeline.
|
|
2
|
+
|
|
3
|
+
Discovers gold configs, runs fault injection (programmatic and/or agentic),
|
|
4
|
+
validates breaks, and outputs training records as JSONL.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import asyncio
|
|
10
|
+
import logging
|
|
11
|
+
import uuid
|
|
12
|
+
from dataclasses import dataclass, field
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
|
|
15
|
+
from cloudgym.generator.formatter import TrainingRecord, format_and_split
|
|
16
|
+
from cloudgym.inverter.engine import InversionEngine, InversionResult
|
|
17
|
+
from cloudgym.taxonomy import REGISTRY
|
|
18
|
+
from cloudgym.taxonomy.base import FaultCategory, FaultType, IaCFormat
|
|
19
|
+
from cloudgym.utils.config import PipelineConfig
|
|
20
|
+
|
|
21
|
+
logger = logging.getLogger(__name__)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@dataclass
class PipelineStats:
    """Counters collected while a pipeline run is in progress.

    All counters start at zero; ``faults_not_applicable`` starts as a fresh
    empty dict for every instance.
    """

    # Number of gold config files discovered for the run.
    total_gold: int = 0
    # Number of broken-config training records produced.
    total_broken: int = 0
    # Gold configs for which no training record was produced.
    resistant_configs: int = 0
    # Fault id -> number of times an inversion attempt yielded no result.
    faults_not_applicable: dict[str, int] = field(default_factory=dict)
    # Error counter reported in the run metadata.
    # NOTE(review): nothing visible in this module increments it -- confirm.
    errors: int = 0
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class PipelineRunner:
    """Runs the full training data generation pipeline.

    The runner discovers gold configs on disk, asks the inversion engine to
    break each of them (programmatically and, optionally, agentically),
    converts every successful inversion into a ``TrainingRecord`` and hands
    the records to ``format_and_split`` for writing.
    """

    def __init__(
        self,
        config: PipelineConfig | None = None,
        max_retries: int = 3,
        concurrency: int = 5,
        skip_validation: bool = True,
    ):
        """Create a runner.

        Args:
            config: Pipeline configuration; a default ``PipelineConfig`` is
                created when omitted.
            max_retries: Forwarded to ``InversionEngine``.
            concurrency: Forwarded to ``InversionEngine``.
            skip_validation: Forwarded to ``InversionEngine``.
        """
        self.config = config or PipelineConfig()
        self.engine = InversionEngine(
            max_retries=max_retries,
            concurrency=concurrency,
            skip_validation=skip_validation,
        )
        self.stats = PipelineStats()

    async def run(
        self,
        gold_dir: str | Path,
        output_dir: str | Path,
        programmatic_variants: int | None = None,
        agentic_variants: int | None = None,
        skip_agentic: bool = False,
    ) -> dict:
        """Run the full pipeline: discover gold -> inject faults -> write JSONL.

        Statistics are reset at the start of every call, so the reported
        numbers always describe this run only.

        Args:
            gold_dir: Directory containing gold configs.
            output_dir: Directory to write JSONL output files.
            programmatic_variants: Number of programmatic faults per gold
                config. ``None`` means "use the configured default"; an
                explicit ``0`` is honoured.
            agentic_variants: Number of agentic faults per gold config.
                ``None`` means "use the configured default".
            skip_agentic: Skip agentic injection entirely.

        Returns:
            Metadata dict from format_and_split (or ``{"total_records": 0}``
            when nothing was produced), always extended with a
            ``"pipeline_stats"`` entry.
        """
        gold_path = Path(gold_dir)

        # Compare against None explicitly: ``x or default`` would silently
        # replace a deliberate 0 with the configured default.
        if programmatic_variants is None:
            n_prog = self.config.programmatic_variants
        else:
            n_prog = programmatic_variants
        if skip_agentic:
            n_agent = 0
        elif agentic_variants is None:
            n_agent = self.config.agentic_variants
        else:
            n_agent = agentic_variants

        # Start from clean counters so a reused runner does not mix runs.
        self.stats = PipelineStats()

        # Discover gold configs
        gold_files = self._discover_gold(gold_path)
        self.stats.total_gold = len(gold_files)
        logger.info("Discovered %d gold configs in %s", len(gold_files), gold_path)

        if not gold_files:
            logger.warning("No gold configs found")
            # Same shape as the normal return value so callers can always
            # read metadata["pipeline_stats"].
            return {"total_records": 0, "pipeline_stats": self._stats_summary()}

        # Generate records
        records: list[TrainingRecord] = []
        for gold_file in gold_files:
            produced = await self._generate_for_file(gold_file, n_prog, n_agent)
            if produced:
                records.extend(produced)
                self.stats.total_broken += len(produced)
            else:
                # No fault could be injected into this config.
                self.stats.resistant_configs += 1

        logger.info(
            "Pipeline complete: %d gold -> %d broken (%d resistant)",
            self.stats.total_gold, self.stats.total_broken, self.stats.resistant_configs,
        )

        # Format and split
        if records:
            metadata = format_and_split(
                records,
                output_dir,
                ratios=(
                    self.config.train_split,
                    self.config.val_split,
                    self.config.test_split,
                ),
            )
        else:
            metadata = {"total_records": 0}

        metadata["pipeline_stats"] = self._stats_summary()
        return metadata

    async def _generate_for_file(
        self, gold_file: Path, n_prog: int, n_agent: int
    ) -> list[TrainingRecord]:
        """Produce all training records for a single gold config.

        Args:
            gold_file: Path of the gold config to break.
            n_prog: Number of programmatic fault types to try.
            n_agent: Maximum number of agentic inversions to try.

        Returns:
            The records produced for this file (possibly empty).
        """
        iac_format = self._detect_format(gold_file)
        applicable_faults = self._get_stratified_faults(iac_format, n_prog)
        produced: list[TrainingRecord] = []

        # Programmatic inversions
        for fault_type in applicable_faults:
            result = await self.engine.invert(
                gold_file, [fault_type], mode="programmatic"
            )
            if result is None:
                key = fault_type.id
                self.stats.faults_not_applicable[key] = (
                    self.stats.faults_not_applicable.get(key, 0) + 1
                )
                continue
            produced.append(self._result_to_record(result, "programmatic"))

        # Agentic inversions: one attempt per category, using the first
        # selected fault of that category. A dict keeps first-seen order, so
        # the categories tried are the same on every run (a set would not be).
        if n_agent > 0:
            first_in_category: dict[str, FaultType] = {}
            for fault_type in applicable_faults:
                first_in_category.setdefault(fault_type.category.name, fault_type)
            for fault_type in list(first_in_category.values())[:n_agent]:
                result = await self.engine.invert(
                    gold_file, [fault_type], mode="agentic"
                )
                if result is not None:
                    produced.append(self._result_to_record(result, "agentic"))

        return produced

    def _stats_summary(self) -> dict:
        """Return the current statistics as a plain dict for the metadata."""
        return {
            "total_gold": self.stats.total_gold,
            "total_broken": self.stats.total_broken,
            "resistant_configs": self.stats.resistant_configs,
            "faults_not_applicable": self.stats.faults_not_applicable,
            "errors": self.stats.errors,
        }

    def _discover_gold(self, gold_dir: Path) -> list[Path]:
        """Find all gold config files below ``gold_dir``.

        A file qualifies when it has one of the known config extensions, is
        larger than 50 bytes and neither it nor any directory between it and
        ``gold_dir`` is hidden (name starting with a dot).

        Args:
            gold_dir: Root directory to search recursively.

        Returns:
            Sorted list of matching paths; empty when ``gold_dir`` is not an
            existing directory.
        """
        extensions = {".tf", ".yaml", ".yml", ".json", ".template"}
        if not gold_dir.is_dir():
            return []

        found: list[Path] = []
        for candidate in gold_dir.rglob("*"):
            if candidate.suffix not in extensions:
                continue
            # Only look at the part of the path below gold_dir: the root
            # itself may legitimately live under a dot-directory or be
            # reached through "..".
            relative_parts = candidate.relative_to(gold_dir).parts
            if any(part.startswith(".") for part in relative_parts):
                continue
            # Skip directories and dangling links that merely look like configs.
            if not candidate.is_file():
                continue
            if candidate.stat().st_size > 50:  # Skip trivially small files
                found.append(candidate)
        return sorted(found)

    def _detect_format(self, path: Path) -> str:
        """Detect IaC format from file path and, if needed, file content.

        Args:
            path: Path of the config file.

        Returns:
            ``"cloudformation"`` or ``"terraform"`` (the fallback).
        """
        import re

        if path.suffix == ".tf":
            return "terraform"
        if "cloudformation" in str(path).lower():
            return "cloudformation"
        if path.suffix not in (".yaml", ".yml", ".json", ".template"):
            return "terraform"

        try:
            text = path.read_text()
        except (OSError, UnicodeDecodeError):
            return "terraform"

        try:
            import yaml

            content = yaml.safe_load(text)
        except Exception:
            # Best effort: PyYAML may be missing, and safe_load rejects the
            # CloudFormation short-form tags (!Ref, !Sub, ...). Fall back to
            # looking for the characteristic top-level keys in the raw text.
            marker = re.search(
                r'^\s*["\']?AWSTemplateFormatVersion["\']?\s*:|^Resources\s*:',
                text,
                re.MULTILINE,
            )
            return "cloudformation" if marker else "terraform"

        if isinstance(content, dict) and (
            "AWSTemplateFormatVersion" in content or "Resources" in content
        ):
            return "cloudformation"
        return "terraform"

    def _get_stratified_faults(self, iac_format: str, n: int) -> list[FaultType]:
        """Select N fault types via round-robin across categories for balance.

        Args:
            iac_format: Format name (``"terraform"``, ``"opentofu"`` or
                ``"cloudformation"``).
            n: Maximum number of fault types to return.

        Returns:
            Up to ``n`` fault types; empty for an unknown format, when the
            registry lists no faults for the format, or when ``n <= 0``.
        """
        fmt_map = {
            "terraform": IaCFormat.TERRAFORM,
            "opentofu": IaCFormat.OPENTOFU,
            "cloudformation": IaCFormat.CLOUDFORMATION,
        }
        iac_fmt = fmt_map.get(iac_format)
        if iac_fmt is None:
            return []

        all_faults = REGISTRY.list_by_format(iac_fmt)
        if not all_faults or n <= 0:
            return []

        # Group by category, keeping registry order inside each group.
        by_category: dict[str, list[FaultType]] = {}
        for ft in all_faults:
            by_category.setdefault(ft.category.name, []).append(ft)
        groups = list(by_category.values())

        # Round-robin: round k takes the k-th fault of every category that
        # still has one, until n faults are selected or all are used.
        selected: list[FaultType] = []
        for depth in range(max(len(group) for group in groups)):
            for group in groups:
                if depth < len(group):
                    selected.append(group[depth])
                    if len(selected) == n:
                        return selected
        return selected

    def _result_to_record(
        self, result: InversionResult, source: str
    ) -> TrainingRecord:
        """Convert an InversionResult to a TrainingRecord.

        Args:
            result: Successful inversion returned by the engine.
            source: Label stored on the record (``"programmatic"`` or
                ``"agentic"`` in this module).

        Returns:
            A new record with a random UUID as its id.
        """
        return TrainingRecord(
            id=str(uuid.uuid4()),
            format=result.iac_format,
            gold_config=result.gold_config,
            broken_config=result.broken_config,
            errors=result.validation_result.errors,
            warnings=result.validation_result.warnings,
            fault_types=[result.fault_type.id],
            fault_description=result.injection.description,
            difficulty=result.fault_type.severity.value,
            source=source,
        )
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
# Convenience function matching original stub
|
|
255
|
+
async def generate_training_data(
    gold_dir: str,
    output_dir: str,
    programmatic_variants: int = 4,
    agentic_variants: int = 2,
) -> dict:
    """Generate training pairs from gold configs.

    Builds a ``PipelineRunner`` with its default settings and runs it once.

    Args:
        gold_dir: Directory containing gold configs.
        output_dir: Directory the pipeline writes its output to.
        programmatic_variants: Number of programmatic faults per gold config.
        agentic_variants: Number of agentic faults per gold config.

    Returns:
        The dict returned by ``PipelineRunner.run``.
    """
    pipeline = PipelineRunner()
    run_metadata = await pipeline.run(
        gold_dir=gold_dir,
        output_dir=output_dir,
        programmatic_variants=programmatic_variants,
        agentic_variants=agentic_variants,
    )
    return run_metadata
|
|
File without changes
|