stackfix 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cloudgym/__init__.py +3 -0
- cloudgym/benchmark/__init__.py +0 -0
- cloudgym/benchmark/dataset.py +188 -0
- cloudgym/benchmark/evaluator.py +275 -0
- cloudgym/cli.py +61 -0
- cloudgym/fixer/__init__.py +1 -0
- cloudgym/fixer/cli.py +521 -0
- cloudgym/fixer/detector.py +81 -0
- cloudgym/fixer/formatter.py +55 -0
- cloudgym/fixer/lambda_handler.py +126 -0
- cloudgym/fixer/repairer.py +237 -0
- cloudgym/generator/__init__.py +0 -0
- cloudgym/generator/formatter.py +142 -0
- cloudgym/generator/pipeline.py +271 -0
- cloudgym/inverter/__init__.py +0 -0
- cloudgym/inverter/_cf_injectors.py +705 -0
- cloudgym/inverter/_cf_utils.py +202 -0
- cloudgym/inverter/_hcl_utils.py +182 -0
- cloudgym/inverter/_tf_injectors.py +641 -0
- cloudgym/inverter/_yaml_cf.py +84 -0
- cloudgym/inverter/agentic.py +90 -0
- cloudgym/inverter/engine.py +258 -0
- cloudgym/inverter/programmatic.py +95 -0
- cloudgym/scraper/__init__.py +0 -0
- cloudgym/scraper/aws_samples.py +159 -0
- cloudgym/scraper/github.py +238 -0
- cloudgym/scraper/registry.py +165 -0
- cloudgym/scraper/validator.py +116 -0
- cloudgym/taxonomy/__init__.py +10 -0
- cloudgym/taxonomy/base.py +102 -0
- cloudgym/taxonomy/cloudformation.py +258 -0
- cloudgym/taxonomy/terraform.py +274 -0
- cloudgym/utils/__init__.py +0 -0
- cloudgym/utils/config.py +57 -0
- cloudgym/utils/ollama.py +66 -0
- cloudgym/validator/__init__.py +0 -0
- cloudgym/validator/cloudformation.py +55 -0
- cloudgym/validator/opentofu.py +103 -0
- cloudgym/validator/terraform.py +115 -0
- stackfix-0.1.0.dist-info/METADATA +182 -0
- stackfix-0.1.0.dist-info/RECORD +44 -0
- stackfix-0.1.0.dist-info/WHEEL +4 -0
- stackfix-0.1.0.dist-info/entry_points.txt +3 -0
- stackfix-0.1.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,238 @@
|
|
|
1
|
+
"""GitHub scraper for Terraform and CloudFormation configurations."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import logging
|
|
7
|
+
import os
|
|
8
|
+
import re
|
|
9
|
+
from dataclasses import dataclass, field
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
import httpx
|
|
13
|
+
|
|
14
|
+
from cloudgym.utils.config import GOLD_CF_DIR, GOLD_TF_DIR, ScraperConfig
|
|
15
|
+
|
|
16
|
+
logger = logging.getLogger(__name__)
|
|
17
|
+
|
|
18
|
+
GITHUB_API = "https://api.github.com"
GITHUB_SEARCH_CODE = f"{GITHUB_API}/search/code"
GITHUB_SEARCH_REPOS = f"{GITHUB_API}/search/repositories"

# Patterns that suggest secrets — files matching any of these are skipped.
# Alternatives: AWS access-key IDs (AKIA + 16 uppercase alnum chars), the
# literal "aws_secret_access_key", or a quoted password assignment of 8+ chars.
SECRET_PATTERNS = re.compile(
    r"(AKIA[0-9A-Z]{16}|aws_secret_access_key|password\s*=\s*\".{8,}\")",
    re.IGNORECASE,
)
28
|
+
|
|
29
|
+
@dataclass
class ScrapedFile:
    """A single scraped IaC configuration file."""

    # "owner/repo" of the source repository ("" when absent from the API item)
    repo_full_name: str
    # Path of the file within the repository
    file_path: str
    # Raw file content as text
    content: str
    format: str  # "terraform" or "cloudformation"
    # Git blob SHA reported by the search API; used for de-duplication
    sha: str = ""
+
|
|
40
|
+
@dataclass
class GitHubScraper:
    """Scrapes GitHub for Terraform and CloudFormation configs.

    Candidate files are found via the GitHub code-search API, downloaded
    from raw.githubusercontent.com, and kept only if they pass the quality
    filters (no obvious secrets, minimum size, minimum resource count).
    Results are de-duplicated by git blob SHA within the lifetime of a
    scraper instance.
    """

    # Search queries, token, size/resource thresholds, max_repos, etc.
    config: ScraperConfig = field(default_factory=ScraperConfig)
    # Blob SHAs of files already collected — prevents duplicates across queries.
    _seen_hashes: set[str] = field(default_factory=set, repr=False)

    @property
    def _headers(self) -> dict[str, str]:
        """Request headers; adds a bearer token from config or $GITHUB_TOKEN."""
        token = self.config.github_token or os.environ.get("GITHUB_TOKEN", "")
        headers = {"Accept": "application/vnd.github.v3+json"}
        if token:
            headers["Authorization"] = f"Bearer {token}"
        return headers

    async def scrape_terraform(self) -> list[ScrapedFile]:
        """Search GitHub for Terraform .tf files."""
        results: list[ScrapedFile] = []
        async with httpx.AsyncClient(timeout=30.0) as client:
            for query in self.config.tf_search_queries:
                files = await self._search_code(
                    client,
                    query=f"{query} extension:tf",
                    fmt="terraform",
                )
                results.extend(files)
                if len(results) >= self.config.max_repos:
                    break
                # Respect rate limits between search queries
                await asyncio.sleep(2.0)
        return results

    async def scrape_cloudformation(self) -> list[ScrapedFile]:
        """Search GitHub for CloudFormation templates (YAML, then JSON)."""
        results: list[ScrapedFile] = []
        async with httpx.AsyncClient(timeout=30.0) as client:
            for query in self.config.cf_search_queries:
                files = await self._search_code(
                    client,
                    query=f"{query} extension:yaml",
                    fmt="cloudformation",
                )
                results.extend(files)

                # Also search JSON CF templates for the same query
                files = await self._search_code(
                    client,
                    query=f"{query} extension:json",
                    fmt="cloudformation",
                )
                results.extend(files)

                if len(results) >= self.config.max_repos:
                    break
                await asyncio.sleep(2.0)
        return results

    async def scrape_all(self) -> list[ScrapedFile]:
        """Run Terraform and CloudFormation scraping concurrently."""
        tf_files, cf_files = await asyncio.gather(
            self.scrape_terraform(),
            self.scrape_cloudformation(),
        )
        return tf_files + cf_files

    async def _search_code(
        self,
        client: httpx.AsyncClient,
        query: str,
        fmt: str,
        per_page: int = 30,
    ) -> list[ScrapedFile]:
        """Search GitHub code API and download matching files.

        Never raises for API problems: a 403 triggers a 60 s pause (rate
        limit), and every other HTTP failure is logged and yields an
        empty list.
        """
        results: list[ScrapedFile] = []

        try:
            resp = await client.get(
                GITHUB_SEARCH_CODE,
                params={"q": query, "per_page": per_page},
                headers=self._headers,
            )
            resp.raise_for_status()
        except httpx.HTTPStatusError as exc:
            if exc.response.status_code == 403:
                logger.warning("GitHub API rate limit hit, pausing")
                await asyncio.sleep(60)
                return results
            logger.error("GitHub search failed: %s", exc)
            return results
        except httpx.HTTPError as exc:
            # FIX: transport-level errors (timeouts, DNS/connection failures)
            # were previously uncaught and aborted the entire scraping run.
            logger.error("GitHub search failed: %s", exc)
            return results

        data = resp.json()
        items = data.get("items", [])
        logger.info("GitHub search '%s' returned %d items", query, len(items))

        for item in items:
            sha = item.get("sha", "")
            # FIX: only dedupe on a real SHA — the old code added "" to the
            # seen-set, silently dropping every later item that lacked a sha.
            if sha and sha in self._seen_hashes:
                continue

            # html_url -> raw.githubusercontent.com URL for direct download
            raw_url = item.get("html_url", "").replace(
                "github.com", "raw.githubusercontent.com"
            ).replace("/blob/", "/")

            if not raw_url:
                continue

            content = await self._download_raw(client, raw_url)
            if content is None:
                continue

            if not self._passes_filters(content, fmt):
                continue

            if sha:
                self._seen_hashes.add(sha)
            results.append(
                ScrapedFile(
                    repo_full_name=item.get("repository", {}).get("full_name", ""),
                    file_path=item.get("path", ""),
                    content=content,
                    format=fmt,
                    sha=sha,
                )
            )

        return results

    async def _download_raw(self, client: httpx.AsyncClient, url: str) -> str | None:
        """Download raw file content; None on HTTP error or oversize file."""
        try:
            resp = await client.get(url, headers=self._headers, follow_redirects=True)
            resp.raise_for_status()
            text = resp.text
            if len(text) > self.config.max_file_size_kb * 1024:
                return None
            return text
        except httpx.HTTPError:
            return None

    def _passes_filters(self, content: str, fmt: str) -> bool:
        """Check if file content passes quality filters for *fmt*."""
        # Skip files with obvious secrets (see SECRET_PATTERNS)
        if SECRET_PATTERNS.search(content):
            logger.debug("Skipping file with potential secrets")
            return False

        # Skip very small files — unlikely to be a real config
        if len(content.strip()) < 50:
            return False

        if fmt == "terraform":
            return self._filter_terraform(content)
        elif fmt == "cloudformation":
            return self._filter_cloudformation(content)
        return False

    def _filter_terraform(self, content: str) -> bool:
        """Check Terraform-specific quality criteria (rough block count)."""
        resource_count = content.count("resource ")
        data_count = content.count("data ")
        module_count = content.count("module ")
        total = resource_count + data_count + module_count
        return total >= self.config.min_resources

    def _filter_cloudformation(self, content: str) -> bool:
        """Check CloudFormation-specific quality criteria."""
        has_version = "AWSTemplateFormatVersion" in content
        has_resources = "Resources:" in content or '"Resources"' in content
        if not (has_version or has_resources):
            return False

        # Count resources roughly via their "Type: AWS::..." declarations
        resource_lines = content.count("Type: AWS::") + content.count('"Type": "AWS::')
        return resource_lines >= self.config.min_resources
|
215
|
+
async def save_scraped_files(files: list[ScrapedFile]) -> dict[str, int]:
    """Persist scraped configs into the gold directories.

    Returns a mapping of format name -> number of files written.
    """
    counts = {"terraform": 0, "cloudformation": 0}

    for scraped in files:
        if scraped.format == "terraform":
            target_dir, suffix = GOLD_TF_DIR, ".tf"
        else:
            target_dir = GOLD_CF_DIR
            suffix = ".json" if scraped.file_path.endswith(".json") else ".yaml"

        target_dir.mkdir(parents=True, exist_ok=True)

        # Derive a collision-free filename from repo + path
        raw_name = f"{scraped.repo_full_name}__{scraped.file_path}"
        safe_name = raw_name.replace("/", "_").replace("\\", "_")
        if not safe_name.endswith(suffix):
            safe_name = f"{safe_name}{suffix}"

        (target_dir / safe_name).write_text(scraped.content, encoding="utf-8")
        counts[scraped.format] += 1

    return counts
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
"""Terraform Registry module scraper."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import logging
|
|
7
|
+
from dataclasses import dataclass, field
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
import httpx
|
|
11
|
+
|
|
12
|
+
from cloudgym.utils.config import GOLD_TF_DIR
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
REGISTRY_API = "https://registry.terraform.io/v1"
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass
class RegistryModule:
    """A module scraped from the Terraform Registry.

    Identity fields come from the registry /modules listing (see
    RegistryScraper._list_modules); ``configs`` is filled in later when
    the module's source .tf files are downloaded.
    """

    namespace: str
    name: str
    provider: str
    version: str
    # Source repository URL as reported by the registry ("source" field)
    source_url: str
    configs: list[tuple[str, str]] = field(default_factory=list)  # (filename, content)
|
31
|
+
@dataclass
class RegistryScraper:
    """Scrapes verified modules from the Terraform Registry."""

    # Hard cap on how many modules are processed per scrape() call
    max_modules: int = 100
    providers: list[str] = field(default_factory=lambda: ["aws", "azurerm", "google"])

    async def scrape(self) -> list[RegistryModule]:
        """Fetch verified modules and download their source configs.

        Returns only modules for which at least one .tf file was retrieved.
        """
        modules: list[RegistryModule] = []

        async with httpx.AsyncClient(timeout=30.0) as client:
            for provider in self.providers:
                batch = await self._list_modules(client, provider)
                modules.extend(batch)
                if len(modules) >= self.max_modules:
                    break

            # FIX: enforce max_modules exactly — previously the final batch
            # could overshoot the cap, and sources were fetched for the excess.
            modules = modules[: self.max_modules]

            # Download source for each module concurrently
            tasks = [self._fetch_module_source(client, m) for m in modules]
            await asyncio.gather(*tasks)

        return [m for m in modules if m.configs]

    async def _list_modules(
        self, client: httpx.AsyncClient, provider: str
    ) -> list[RegistryModule]:
        """List verified modules for a given provider (empty list on error)."""
        modules = []
        try:
            resp = await client.get(
                f"{REGISTRY_API}/modules",
                params={
                    "provider": provider,
                    "verified": "true",
                    "limit": 20,
                },
            )
            resp.raise_for_status()
        except httpx.HTTPError as exc:
            logger.error("Registry API failed for %s: %s", provider, exc)
            return modules

        data = resp.json()
        for mod in data.get("modules", []):
            modules.append(
                RegistryModule(
                    namespace=mod.get("namespace", ""),
                    name=mod.get("name", ""),
                    provider=mod.get("provider", ""),
                    version=mod.get("version", ""),
                    source_url=mod.get("source", ""),
                )
            )
        return modules

    async def _fetch_module_source(
        self, client: httpx.AsyncClient, module: RegistryModule
    ) -> None:
        """Fetch the download URL and retrieve .tf files from the source.

        Mutates ``module.configs`` in place; failures are logged at debug
        level and leave the module untouched.
        """
        try:
            # Get the download URL from the registry
            resp = await client.get(
                f"{REGISTRY_API}/modules/"
                f"{module.namespace}/{module.name}/{module.provider}/"
                f"{module.version}/download",
                follow_redirects=True,
            )

            # The download endpoint advertises the source archive in a header
            download_url = resp.headers.get("X-Terraform-Get", "")
            if not download_url:
                return

            # If it's a GitHub source, try to get raw .tf files
            if "github.com" in download_url:
                await self._fetch_github_tf_files(client, download_url, module)

        except httpx.HTTPError as exc:
            logger.debug("Failed to fetch source for %s/%s: %s", module.namespace, module.name, exc)

    async def _fetch_github_tf_files(
        self,
        client: httpx.AsyncClient,
        github_url: str,
        module: RegistryModule,
    ) -> None:
        """Fetch top-level .tf files from a GitHub repository URL."""
        # Convert GitHub URL to API URL for contents,
        # e.g. https://github.com/org/repo -> api.github.com/repos/org/repo/contents
        parts = github_url.rstrip("/").split("github.com/")
        if len(parts) < 2:
            return

        # Strip query strings and "//" sub-path suffixes from the repo path
        repo_path = parts[1].split("?")[0].split("//")[0]
        api_url = f"https://api.github.com/repos/{repo_path}/contents"

        try:
            resp = await client.get(api_url)
            resp.raise_for_status()
            contents = resp.json()
        except (httpx.HTTPError, ValueError):
            return

        if not isinstance(contents, list):
            return

        for item in contents:
            if not isinstance(item, dict):
                continue
            name = item.get("name", "")
            if name.endswith(".tf") and item.get("type") == "file":
                download_url = item.get("download_url", "")
                if download_url:
                    try:
                        file_resp = await client.get(download_url)
                        file_resp.raise_for_status()
                        module.configs.append((name, file_resp.text))
                    except httpx.HTTPError:
                        continue
+
|
|
153
|
+
async def save_registry_modules(modules: list[RegistryModule]) -> int:
    """Save registry module configs to the gold directory.

    Returns the number of files written.
    """
    count = 0
    GOLD_TF_DIR.mkdir(parents=True, exist_ok=True)

    for module in modules:
        for filename, content in module.configs:
            # FIX: include the config's own filename in the output name —
            # it was previously omitted, so every file of a module (and of
            # same-named modules) overwrote the one written before it.
            safe_name = (
                f"registry__{module.namespace}__{module.name}__{filename}"
            ).replace("/", "_")
            out_path = GOLD_TF_DIR / safe_name
            out_path.write_text(content, encoding="utf-8")
            count += 1

    return count
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
"""Gold instance validator — filters scraped configs to only keep valid ones."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import logging
|
|
7
|
+
from dataclasses import dataclass, field
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
from cloudgym.utils.config import GOLD_CF_DIR, GOLD_TF_DIR
|
|
11
|
+
from cloudgym.validator import cloudformation as cf_validator
|
|
12
|
+
from cloudgym.validator import terraform as tf_validator
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass
class ValidationStats:
    """Statistics from a gold validation run."""

    total: int = 0
    valid: int = 0
    invalid: int = 0
    errors: dict[str, list[str]] = field(default_factory=dict)

    @property
    def pass_rate(self) -> float:
        """Fraction of configs that validated; 0.0 when nothing was checked."""
        if self.total > 0:
            return self.valid / self.total
        return 0.0
|
|
30
|
+
|
|
31
|
+
async def validate_gold_terraform(
    directory: Path | None = None,
    concurrency: int = 5,
) -> ValidationStats:
    """Validate every Terraform config in the gold directory.

    Configs that fail validation are deleted from disk, leaving only gold
    instances behind. Validator invocations run concurrently, bounded by
    *concurrency*.
    """
    target_dir = directory or GOLD_TF_DIR
    if not target_dir.exists():
        logger.warning("Terraform gold directory does not exist: %s", target_dir)
        return ValidationStats()

    candidates = list(target_dir.glob("*.tf"))
    stats = ValidationStats(total=len(candidates))
    gate = asyncio.Semaphore(concurrency)

    async def _check(path: Path) -> None:
        # Semaphore bounds how many validator runs happen at once.
        async with gate:
            outcome = await tf_validator.validate(path)
            if outcome.valid:
                stats.valid += 1
                logger.debug("PASS: %s", path.name)
            else:
                stats.invalid += 1
                stats.errors[path.name] = outcome.errors
                logger.info("FAIL (removing): %s — %s", path.name, outcome.errors[:2])
                path.unlink(missing_ok=True)

    await asyncio.gather(*(_check(candidate) for candidate in candidates))
    return stats
63
|
+
|
|
64
|
+
|
|
65
|
+
async def validate_gold_cloudformation(
    directory: Path | None = None,
    concurrency: int = 5,
) -> ValidationStats:
    """Validate every CloudFormation template in the gold directory.

    Templates that fail cfn-lint are deleted from disk, leaving only gold
    instances behind. Validator invocations run concurrently, bounded by
    *concurrency*.
    """
    target_dir = directory or GOLD_CF_DIR
    if not target_dir.exists():
        logger.warning("CloudFormation gold directory does not exist: %s", target_dir)
        return ValidationStats()

    candidates = [*target_dir.glob("*.yaml"), *target_dir.glob("*.json")]
    stats = ValidationStats(total=len(candidates))
    gate = asyncio.Semaphore(concurrency)

    async def _check(path: Path) -> None:
        # Semaphore bounds how many validator runs happen at once.
        async with gate:
            outcome = await cf_validator.validate(path)
            if outcome.valid:
                stats.valid += 1
                logger.debug("PASS: %s", path.name)
            else:
                stats.invalid += 1
                stats.errors[path.name] = outcome.errors
                logger.info("FAIL (removing): %s — %s", path.name, outcome.errors[:2])
                path.unlink(missing_ok=True)

    await asyncio.gather(*(_check(candidate) for candidate in candidates))
    return stats
|
+
|
|
98
|
+
|
|
99
|
+
async def validate_all_gold() -> dict[str, ValidationStats]:
    """Validate all gold configs across formats and log a summary."""
    results = await asyncio.gather(
        validate_gold_terraform(),
        validate_gold_cloudformation(),
    )
    tf_stats, cf_stats = results

    logger.info(
        "Gold validation complete — TF: %d/%d pass (%.0f%%), CF: %d/%d pass (%.0f%%)",
        tf_stats.valid,
        tf_stats.total,
        tf_stats.pass_rate * 100,
        cf_stats.valid,
        cf_stats.total,
        cf_stats.pass_rate * 100,
    )

    return {"terraform": tf_stats, "cloudformation": cf_stats}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
"""Fault taxonomy for IaC environment inversion.
|
|
2
|
+
|
|
3
|
+
Importing this package auto-registers all fault types in the global REGISTRY.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from cloudgym.taxonomy.base import REGISTRY # noqa: F401
|
|
7
|
+
|
|
8
|
+
# Import submodules to trigger fault registration
|
|
9
|
+
import cloudgym.taxonomy.terraform # noqa: F401
|
|
10
|
+
import cloudgym.taxonomy.cloudformation # noqa: F401
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
"""Fault taxonomy base definitions for IaC environment inversion."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from enum import Enum, auto
|
|
7
|
+
from typing import TYPE_CHECKING
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
from collections.abc import Callable
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class FaultCategory(Enum):
    """High-level categories of IaC faults.

    Concrete FaultType definitions (registered on import of the taxonomy
    submodules — see cloudgym/taxonomy/__init__.py) assign themselves to
    one of these buckets; the category name forms the first half of a
    FaultType's id.
    """

    SYNTACTIC = auto()
    REFERENCE = auto()
    SEMANTIC = auto()
    DEPENDENCY = auto()
    PROVIDER = auto()
    SECURITY = auto()
    CROSS_RESOURCE = auto()
    INTRINSIC = auto()  # CloudFormation-specific (intrinsic function faults)
+
|
|
25
|
+
|
|
26
|
+
class IaCFormat(Enum):
    """Supported Infrastructure-as-Code formats.

    Values match the lowercase format tags used elsewhere in the package
    (e.g. ScrapedFile.format uses "terraform" / "cloudformation").
    """

    TERRAFORM = "terraform"
    CLOUDFORMATION = "cloudformation"
    OPENTOFU = "opentofu"
|
+
|
|
33
|
+
|
|
34
|
+
class Severity(Enum):
    """Fault severity / difficulty level.

    Carried on FaultType.severity for taxonomy analysis and benchmark
    stratification.
    """

    LOW = "low"  # Obvious syntax errors, easy to spot
    MEDIUM = "medium"  # Requires understanding of resource relationships
    HIGH = "high"  # Subtle semantic or cross-resource issues
+
|
|
41
|
+
|
|
42
|
+
@dataclass(frozen=True)
class FaultType:
    """An injectable category of IaC breakage (e.g. "missing closing brace").

    Instances are immutable descriptors; the attached metadata supports
    taxonomy analysis and benchmark stratification.
    """

    name: str
    category: FaultCategory
    description: str
    severity: Severity
    applicable_formats: frozenset[IaCFormat]
    example_error: str = ""
    tags: frozenset[str] = field(default_factory=frozenset)

    @property
    def id(self) -> str:
        """Short identifier: category.name (e.g. SYNTACTIC.missing_brace)."""
        return ".".join((self.category.name, self.name))
63
|
+
|
|
64
|
+
@dataclass
class FaultInjection:
    """Record of a single fault injection applied to a config.

    Captures both the before/after text and enough context to locate and
    explain the change.
    """

    fault_type: FaultType
    # Exact text that was replaced
    original_snippet: str
    # Text it was replaced with (the injected fault)
    modified_snippet: str
    location: str  # file path or line range description
    description: str  # human-readable explanation of what was changed
+
|
|
74
|
+
|
|
75
|
+
@dataclass
class FaultRegistry:
    """Central registry of all known fault types, keyed by fault id."""

    _faults: dict[str, FaultType] = field(default_factory=dict)

    def register(self, fault: FaultType) -> FaultType:
        """Store *fault* under its id and return it unchanged."""
        self._faults[fault.id] = fault
        return fault

    def get(self, fault_id: str) -> FaultType | None:
        """Look up a fault by id; None when unknown."""
        return self._faults.get(fault_id)

    def list_by_category(self, category: FaultCategory) -> list[FaultType]:
        """All registered faults belonging to *category*, in insertion order."""
        matches = []
        for fault in self._faults.values():
            if fault.category == category:
                matches.append(fault)
        return matches

    def list_by_format(self, fmt: IaCFormat) -> list[FaultType]:
        """All registered faults applicable to format *fmt*."""
        return [
            fault
            for fault in self._faults.values()
            if fmt in fault.applicable_formats
        ]

    def all(self) -> list[FaultType]:
        """Every registered fault, in registration order."""
        return [*self._faults.values()]

    def __len__(self) -> int:
        return len(self._faults)
|
|
100
|
+
|
|
101
|
+
# Global registry instance — populated as a side effect of importing the
# taxonomy submodules (see cloudgym/taxonomy/__init__.py).
REGISTRY = FaultRegistry()
|