stackfix 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. cloudgym/__init__.py +3 -0
  2. cloudgym/benchmark/__init__.py +0 -0
  3. cloudgym/benchmark/dataset.py +188 -0
  4. cloudgym/benchmark/evaluator.py +275 -0
  5. cloudgym/cli.py +61 -0
  6. cloudgym/fixer/__init__.py +1 -0
  7. cloudgym/fixer/cli.py +521 -0
  8. cloudgym/fixer/detector.py +81 -0
  9. cloudgym/fixer/formatter.py +55 -0
  10. cloudgym/fixer/lambda_handler.py +126 -0
  11. cloudgym/fixer/repairer.py +237 -0
  12. cloudgym/generator/__init__.py +0 -0
  13. cloudgym/generator/formatter.py +142 -0
  14. cloudgym/generator/pipeline.py +271 -0
  15. cloudgym/inverter/__init__.py +0 -0
  16. cloudgym/inverter/_cf_injectors.py +705 -0
  17. cloudgym/inverter/_cf_utils.py +202 -0
  18. cloudgym/inverter/_hcl_utils.py +182 -0
  19. cloudgym/inverter/_tf_injectors.py +641 -0
  20. cloudgym/inverter/_yaml_cf.py +84 -0
  21. cloudgym/inverter/agentic.py +90 -0
  22. cloudgym/inverter/engine.py +258 -0
  23. cloudgym/inverter/programmatic.py +95 -0
  24. cloudgym/scraper/__init__.py +0 -0
  25. cloudgym/scraper/aws_samples.py +159 -0
  26. cloudgym/scraper/github.py +238 -0
  27. cloudgym/scraper/registry.py +165 -0
  28. cloudgym/scraper/validator.py +116 -0
  29. cloudgym/taxonomy/__init__.py +10 -0
  30. cloudgym/taxonomy/base.py +102 -0
  31. cloudgym/taxonomy/cloudformation.py +258 -0
  32. cloudgym/taxonomy/terraform.py +274 -0
  33. cloudgym/utils/__init__.py +0 -0
  34. cloudgym/utils/config.py +57 -0
  35. cloudgym/utils/ollama.py +66 -0
  36. cloudgym/validator/__init__.py +0 -0
  37. cloudgym/validator/cloudformation.py +55 -0
  38. cloudgym/validator/opentofu.py +103 -0
  39. cloudgym/validator/terraform.py +115 -0
  40. stackfix-0.1.0.dist-info/METADATA +182 -0
  41. stackfix-0.1.0.dist-info/RECORD +44 -0
  42. stackfix-0.1.0.dist-info/WHEEL +4 -0
  43. stackfix-0.1.0.dist-info/entry_points.txt +3 -0
  44. stackfix-0.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,271 @@
1
+ """End-to-end training data generation pipeline.
2
+
3
+ Discovers gold configs, runs fault injection (programmatic and/or agentic),
4
+ validates breaks, and outputs training records as JSONL.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import asyncio
10
+ import logging
11
+ import uuid
12
+ from dataclasses import dataclass, field
13
+ from pathlib import Path
14
+
15
+ from cloudgym.generator.formatter import TrainingRecord, format_and_split
16
+ from cloudgym.inverter.engine import InversionEngine, InversionResult
17
+ from cloudgym.taxonomy import REGISTRY
18
+ from cloudgym.taxonomy.base import FaultCategory, FaultType, IaCFormat
19
+ from cloudgym.utils.config import PipelineConfig
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
@dataclass
class PipelineStats:
    """Counters collected over one pipeline run.

    All counters start at zero; ``faults_not_applicable`` starts as a fresh,
    per-instance empty mapping.
    """

    # Number of gold config files discovered for the run.
    total_gold: int = 0
    # Number of training records (broken variants) produced.
    total_broken: int = 0
    # Number of gold configs that ended up with no training record.
    resistant_configs: int = 0
    # Fault id -> how many times an inversion with that fault returned nothing.
    faults_not_applicable: dict[str, int] = field(default_factory=dict)
    # Error counter, reported alongside the other stats.
    errors: int = 0
33
+
34
+
35
class PipelineRunner:
    """Runs the full training data generation pipeline.

    The runner discovers gold config files, asks the inversion engine to
    break each of them with a balanced set of fault types, converts every
    successful inversion into a ``TrainingRecord`` and hands the records to
    ``format_and_split`` for writing.
    """

    def __init__(
        self,
        config: PipelineConfig | None = None,
        max_retries: int = 3,
        concurrency: int = 5,
        skip_validation: bool = True,
    ):
        """Create a runner.

        Args:
            config: Pipeline settings; a default ``PipelineConfig`` is used
                when omitted.
            max_retries: Forwarded to ``InversionEngine``.
            concurrency: Forwarded to ``InversionEngine``.
            skip_validation: Forwarded to ``InversionEngine``.
        """
        self.config = config or PipelineConfig()
        self.engine = InversionEngine(
            max_retries=max_retries,
            concurrency=concurrency,
            skip_validation=skip_validation,
        )
        self.stats = PipelineStats()

    async def run(
        self,
        gold_dir: str | Path,
        output_dir: str | Path,
        programmatic_variants: int | None = None,
        agentic_variants: int | None = None,
        skip_agentic: bool = False,
    ) -> dict:
        """Run the full pipeline: discover gold -> inject faults -> write JSONL.

        Args:
            gold_dir: Directory containing gold configs.
            output_dir: Directory to write JSONL output files.
            programmatic_variants: Number of programmatic faults per gold
                config. ``None`` means "use the configured value"; an
                explicit ``0`` disables programmatic injection.
            agentic_variants: Number of agentic faults per gold config.
                ``None`` means "use the configured value".
            skip_agentic: Skip agentic injection entirely.

        Returns:
            Metadata dict from format_and_split (or ``{"total_records": 0}``
            when nothing was produced), extended with a ``"pipeline_stats"``
            entry describing this run.
        """
        gold_path = Path(gold_dir)

        # Compare against None rather than using `or`, so that an explicit 0
        # is honoured instead of silently falling back to the config value.
        n_prog = (
            self.config.programmatic_variants
            if programmatic_variants is None
            else programmatic_variants
        )
        if skip_agentic:
            n_agent = 0
        elif agentic_variants is None:
            n_agent = self.config.agentic_variants
        else:
            n_agent = agentic_variants

        # Stats describe a single run; start from a clean slate so counters
        # from an earlier run() on the same runner do not leak into this one.
        self.stats = PipelineStats()

        # Discover gold configs
        gold_files = self._discover_gold(gold_path)
        self.stats.total_gold = len(gold_files)
        logger.info("Discovered %d gold configs in %s", len(gold_files), gold_path)

        if not gold_files:
            logger.warning("No gold configs found")
            return {"total_records": 0, "pipeline_stats": self._stats_payload()}

        # Generate records
        records: list[TrainingRecord] = []
        for gold_file in gold_files:
            produced = await self._process_gold_file(gold_file, n_prog, n_agent)
            if produced:
                records.extend(produced)
                self.stats.total_broken += len(produced)
            else:
                # No fault could be injected into this config at all.
                self.stats.resistant_configs += 1

        logger.info(
            "Pipeline complete: %d gold -> %d broken (%d resistant)",
            self.stats.total_gold, self.stats.total_broken, self.stats.resistant_configs,
        )

        # Format and split
        if records:
            metadata = format_and_split(
                records,
                output_dir,
                ratios=(
                    self.config.train_split,
                    self.config.val_split,
                    self.config.test_split,
                ),
            )
        else:
            metadata = {"total_records": 0}

        metadata["pipeline_stats"] = self._stats_payload()
        return metadata

    async def _process_gold_file(
        self, gold_file: Path, n_prog: int, n_agent: int
    ) -> list[TrainingRecord]:
        """Inject faults into one gold config and return the resulting records.

        Programmatic records come first, followed by agentic ones. Faults
        whose programmatic inversion returns ``None`` are tallied in
        ``self.stats.faults_not_applicable``.
        """
        iac_format = self._detect_format(gold_file)
        applicable_faults = self._get_stratified_faults(iac_format, n_prog)
        produced: list[TrainingRecord] = []

        # Programmatic inversions
        for fault_type in applicable_faults:
            result = await self.engine.invert(
                gold_file, [fault_type], mode="programmatic"
            )
            if result is None:
                key = fault_type.id
                self.stats.faults_not_applicable[key] = (
                    self.stats.faults_not_applicable.get(key, 0) + 1
                )
                continue
            produced.append(self._result_to_record(result, "programmatic"))

        # Agentic inversions: at most one per category, using the first
        # selected fault of that category. A dict keeps first-seen category
        # order, which makes the choice reproducible between runs.
        if n_agent > 0:
            first_per_category: dict[str, FaultType] = {}
            for fault_type in applicable_faults:
                first_per_category.setdefault(fault_type.category.name, fault_type)
            for fault_type in list(first_per_category.values())[:n_agent]:
                result = await self.engine.invert(
                    gold_file, [fault_type], mode="agentic"
                )
                if result is not None:
                    produced.append(self._result_to_record(result, "agentic"))

        return produced

    def _stats_payload(self) -> dict:
        """Return the current run statistics as a plain dict."""
        return {
            "total_gold": self.stats.total_gold,
            "total_broken": self.stats.total_broken,
            "resistant_configs": self.stats.resistant_configs,
            "faults_not_applicable": self.stats.faults_not_applicable,
            "errors": self.stats.errors,
        }

    def _discover_gold(self, gold_dir: Path) -> list[Path]:
        """Find all gold config files below ``gold_dir``.

        A file qualifies when it has one of the known config extensions, is
        larger than 50 bytes, and neither it nor any directory between it and
        ``gold_dir`` is hidden (name starting with a dot).

        Returns:
            Sorted list of matching paths; empty when ``gold_dir`` is missing.
        """
        if not gold_dir.exists():
            return []

        extensions = {".tf", ".yaml", ".yml", ".json", ".template"}
        candidates: list[Path] = []
        for ext in extensions:
            candidates.extend(gold_dir.rglob(f"*{ext}"))

        found: list[Path] = []
        for candidate in candidates:
            # rglob also yields directories whose name happens to match.
            if not candidate.is_file():
                continue
            # Only look at the part of the path below gold_dir: the location
            # of gold_dir itself (e.g. under ~/.cache or reached via "..")
            # must not cause every file to be treated as hidden.
            relative_parts = candidate.relative_to(gold_dir).parts
            if any(part.startswith(".") for part in relative_parts):
                continue
            # Skip trivially small files
            if candidate.stat().st_size <= 50:
                continue
            found.append(candidate)
        return sorted(found)

    def _detect_format(self, path: Path) -> str:
        """Detect IaC format from file path, falling back to file content.

        Returns:
            ``"terraform"`` for ``.tf`` files; ``"cloudformation"`` when the
            path mentions cloudformation or the content looks like a
            CloudFormation template; ``"terraform"`` otherwise.
        """
        if path.suffix == ".tf":
            return "terraform"
        if "cloudformation" in str(path).lower():
            return "cloudformation"
        if path.suffix in (".yaml", ".yml", ".json", ".template"):
            if self._looks_like_cloudformation(path):
                return "cloudformation"
        return "terraform"

    @staticmethod
    def _looks_like_cloudformation(path: Path) -> bool:
        """Return True when the file content has CloudFormation top-level keys."""
        markers = ("AWSTemplateFormatVersion", "Resources")
        try:
            text = path.read_text()
        except (OSError, UnicodeDecodeError) as exc:
            logger.debug("Could not read %s for format detection: %s", path, exc)
            return False

        try:
            import yaml

            content = yaml.safe_load(text)
        except Exception as exc:  # best-effort detection, never fatal
            # safe_load rejects the short-form intrinsic tags (!Ref, !Sub, ...)
            # that are common in CloudFormation YAML, so a parse failure does
            # not mean "not CloudFormation". Fall back to looking for the
            # marker keys at the start of a line.
            logger.debug("YAML parse of %s failed (%s); using text check", path, exc)
            prefixes = tuple(f"{marker}:" for marker in markers)
            return any(line.startswith(prefixes) for line in text.splitlines())

        return isinstance(content, dict) and any(m in content for m in markers)

    def _get_stratified_faults(self, iac_format: str, n: int) -> list[FaultType]:
        """Select N fault types via round-robin across categories for balance.

        Returns fewer than ``n`` faults when the registry has fewer for the
        format, and an empty list for an unknown format or ``n <= 0``.
        """
        fmt_map = {
            "terraform": IaCFormat.TERRAFORM,
            "opentofu": IaCFormat.OPENTOFU,
            "cloudformation": IaCFormat.CLOUDFORMATION,
        }
        iac_fmt = fmt_map.get(iac_format)
        if iac_fmt is None:
            return []

        all_faults = REGISTRY.list_by_format(iac_fmt)
        if not all_faults:
            return []

        # Group by category (dict keeps first-seen category order)
        by_category: dict[str, list[FaultType]] = {}
        for ft in all_faults:
            by_category.setdefault(ft.category.name, []).append(ft)

        # Round-robin across categories: round k takes the k-th fault of
        # every category that still has one.
        selected: list[FaultType] = []
        depth = 0
        while len(selected) < n:
            took_any = False
            for faults in by_category.values():
                if depth < len(faults):
                    selected.append(faults[depth])
                    took_any = True
                    if len(selected) >= n:
                        break
            if not took_any:
                # Every category is exhausted.
                break
            depth += 1

        return selected

    def _result_to_record(
        self, result: InversionResult, source: str
    ) -> TrainingRecord:
        """Convert an InversionResult to a TrainingRecord.

        Args:
            result: Successful inversion produced by the engine.
            source: Label stored on the record ("programmatic" or "agentic").
        """
        return TrainingRecord(
            id=str(uuid.uuid4()),
            format=result.iac_format,
            gold_config=result.gold_config,
            broken_config=result.broken_config,
            errors=result.validation_result.errors,
            warnings=result.validation_result.warnings,
            fault_types=[result.fault_type.id],
            fault_description=result.injection.description,
            difficulty=result.fault_type.severity.value,
            source=source,
        )
252
+
253
+
254
+ # Convenience function matching original stub
255
async def generate_training_data(
    gold_dir: str,
    output_dir: str,
    programmatic_variants: int = 4,
    agentic_variants: int = 2,
) -> dict:
    """Generate training pairs from gold configs.

    Builds a ``PipelineRunner`` with default settings and runs it once.

    Args:
        gold_dir: Directory containing gold configs.
        output_dir: Directory the output files are written to.
        programmatic_variants: Programmatic faults requested per gold config.
        agentic_variants: Agentic faults requested per gold config.

    Returns:
        Statistics about the generation run, as returned by
        ``PipelineRunner.run``.
    """
    pipeline = PipelineRunner()
    run_metadata = await pipeline.run(
        gold_dir=gold_dir,
        output_dir=output_dir,
        programmatic_variants=programmatic_variants,
        agentic_variants=agentic_variants,
    )
    return run_metadata
File without changes