PyPI - aion-evolve - Versions diffs - 0.2.1__py3-none-any.whl - Mend

aion-evolve 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

aion/__init__.py +10 -0
aion/__main__.py +5 -0
aion/cli.py +304 -0
aion/config.py +83 -0
aion/context_extractor.py +243 -0
aion/evaluation.py +142 -0
aion/llm_analyzer.py +219 -0
aion/models.py +100 -0
aion/risk_heuristics.py +90 -0
aion/semgrep_runner.py +50 -0
aion_evolve-0.2.1.dist-info/METADATA +89 -0
aion_evolve-0.2.1.dist-info/RECORD +15 -0
aion_evolve-0.2.1.dist-info/WHEEL +5 -0
aion_evolve-0.2.1.dist-info/entry_points.txt +2 -0
aion_evolve-0.2.1.dist-info/top_level.txt +1 -0

aion/__init__.py ADDED Viewed

@@ -0,0 +1,10 @@
+"""aion package."""
+from importlib.metadata import PackageNotFoundError, version
+# Public API of the package: only the version string is exported.
+__all__ = ["__version__"]
+try:
+    # Read the version from the metadata of the "aion-evolve" distribution.
+    __version__ = version("aion-evolve")
+except PackageNotFoundError:
+    # No distribution metadata was found; fall back to a placeholder version.
+    __version__ = "0.0.0"

aion/__main__.py ADDED Viewed

@@ -0,0 +1,5 @@
+from .cli import app
+# Run the CLI app imported from .cli when this module is executed as a script.
+if __name__ == "__main__":
+    app()

aion/cli.py ADDED Viewed

@@ -0,0 +1,304 @@
+from __future__ import annotations
+import json
+import os
+import subprocess
+from enum import Enum
+from pathlib import Path
+import typer
+from rich.console import Console
+from rich.panel import Panel
+from rich.table import Table
+from .config import AppConfig, ConfigError, load_app_config
+from .context_extractor import ContextExtractor
+from .llm_analyzer import LLMAnalyzer, LLMAnalyzerError
+from .models import Finding, ProjectScanSummary, ScanReport, normalize_path
+from .risk_heuristics import fallback_reasons
+from .semgrep_runner import SemgrepError, SemgrepRunner, semgrep_available
+# Typer application object; the commands below are registered on it.
+app = typer.Typer(help="AION: The Self-Evolving Code Engine. Code Once, Live Forever.", no_args_is_help=True)
+# Diagnostics and warnings go to stderr, results go to stdout.
+stderr_console = Console(stderr=True)
+stdout_console = Console()
+# Lower-case substrings searched for in the first 2000 characters of a file
+# (see _has_ai_marker) to decide whether it is treated as AI-generated.
+AI_MARKERS = (
+    "generated by",
+    "co-authored-by: github-copilot",
+    "github-copilot[bot]",
+    "@cursor",
+    "cursor ai",
+)
+class Provider(str, Enum):
+    """LLM providers selectable via --provider or the `provider` key of .aion.yaml."""
+    anthropic = "anthropic"
+    openai = "openai"
+@app.callback()
+def main() -> None:
+    """AION: The Self-Evolving Code Engine. Code Once, Live Forever."""
+    # Root callback with no options of its own: the body is only the docstring.
+# `scan` command: select the Python files to analyse under `target`, run semgrep
+# and/or the LLM analyzer on each one, then hand a single summary to
+# _exit_with_summary for rendering.
+# NOTE(review): documented with comments rather than a docstring so that the
+# command's generated --help text is left unchanged.
+@app.command()
+def scan(
+    target: Path = typer.Argument(..., exists=True, readable=True, resolve_path=True),
+    ai_generated: list[Path] | None = typer.Option(
+        None,
+        "--ai-generated",
+        help="Explicit files or directories to treat as AI-generated. Can be repeated.",
+    ),
+    provider: Provider | None = typer.Option(None, "--provider", help="LLM provider: anthropic or openai."),
+    model: str | None = typer.Option(None, help="Model name. Defaults depend on provider."),
+    output: str = typer.Option("text", "--output", help="text or json"),
+    verbose: bool = typer.Option(False, "--verbose", help="Print context and raw prompts to stderr."),
+) -> None:
+    # Project root used for config lookup and context extraction: the target
+    # itself when it is a directory, otherwise the directory that contains it.
+    root = target if target.is_dir() else target.parent
+    try:
+        config = load_app_config(root)
+    except ConfigError as exc:
+        raise typer.BadParameter(str(exc)) from exc
+    # Provider precedence: --provider flag, then .aion.yaml, then anthropic.
+    resolved_provider = provider or _provider_from_config(config) or Provider.anthropic
+    api_key = _resolve_api_key(resolved_provider)
+    # The API key is required up front, before any file is looked at, even
+    # though individual files may later be handled in "semgrep-only" mode.
+    if not api_key:
+        raise typer.BadParameter(
+            _missing_api_key_message(resolved_provider)
+        )
+    # Model precedence: --model flag, then .aion.yaml, then the provider default.
+    resolved_model = model or config.model or _default_model_for_provider(resolved_provider)
+    extractor = ContextExtractor(root=root, extra_ignore_patterns=config.ignore_paths)
+    context_profile = extractor.extract()
+    if verbose:
+        stderr_console.print("[bold]Context profile[/bold]")
+        stderr_console.print_json(json.dumps(context_profile.summary_payload(), ensure_ascii=False))
+    candidate_files = _resolve_target_files(target, config.ignore_paths)
+    if not candidate_files:
+        # _exit_with_summary raises typer.Exit here: with the default
+        # files_scanned (presumably 0 — TODO confirm in models.py) both the
+        # json and the text path end the command.
+        _exit_with_summary(ProjectScanSummary(target=normalize_path(target), warnings=["No Python files found."]), output)
+    explicit_targets = _expand_explicit_targets(ai_generated or [])
+    files_to_scan, detection_warnings = _detect_ai_generated_files(candidate_files, explicit_targets, root)
+    summary = ProjectScanSummary(
+        target=normalize_path(target),
+        files_scanned=len(files_to_scan),
+        warnings=detection_warnings,
+    )
+    runner = SemgrepRunner()
+    has_semgrep = semgrep_available()
+    if not has_semgrep:
+        summary.warnings.append("semgrep is not installed; falling back to LLM-only mode.")
+    analyzer = LLMAnalyzer(api_key=api_key, model=resolved_model, provider=resolved_provider.value, verbose=verbose)
+    for file_path in files_to_scan:
+        # Every selected file is reported as AI-generated, including files picked
+        # up by the "scan everything" fallback of _detect_ai_generated_files.
+        report = ScanReport(file=normalize_path(file_path), ai_generated=True)
+        semgrep_findings = []
+        if has_semgrep:
+            try:
+                semgrep_findings = runner.run(file_path)
+            except SemgrepError as exc:
+                # A semgrep failure is downgraded to a warning; the file is
+                # still considered for LLM analysis below.
+                summary.warnings.append(f"semgrep failed for {file_path.name}: {exc}")
+        report.semgrep_findings = semgrep_findings
+        if verbose:
+            stderr_console.print(f"[bold]Estimated token cost[/bold] {file_path}: {analyzer.estimate_tokens(file_path, context_profile)}")
+            if semgrep_findings:
+                stderr_console.print("[bold]Semgrep findings[/bold]")
+                stderr_console.print_json(
+                    json.dumps([finding.model_dump() for finding in semgrep_findings], ensure_ascii=False)
+                )
+        try:
+            reasons = fallback_reasons(file_path, context_profile)
+            if verbose and reasons:
+                stderr_console.print(f"[bold]Fallback reasons[/bold] {file_path}: {', '.join(reasons)}")
+            # The LLM runs when semgrep is unavailable, when semgrep reported
+            # something, or when the heuristics returned at least one reason.
+            should_run_llm = (not has_semgrep) or bool(semgrep_findings) or bool(reasons)
+            if not should_run_llm:
+                report.mode = "semgrep-only"
+                report.findings = []
+            else:
+                report.findings = analyzer.analyze(
+                    file_path,
+                    context_profile,
+                    semgrep_findings,
+                    fallback_signals=reasons,
+                    console=stderr_console,
+                )
+                report.mode = "llm-only" if not has_semgrep else "semgrep+llm"
+        except LLMAnalyzerError as exc:
+            # LLM failure: record a warning and keep the report; report.findings
+            # is not assigned on this path and keeps whatever ScanReport set.
+            summary.warnings.append(f"LLM analysis failed for {file_path.name}: {exc}")
+            report.mode = "semgrep-only" if has_semgrep else "skipped"
+        summary.reports.append(report)
+    # `output` is not validated: any value other than "json" is rendered as text.
+    _exit_with_summary(summary, output)
+def _resolve_target_files(target: Path, extra_ignore_patterns: list[str] | None = None) -> list[Path]:
+    """Return the sorted Python files selected by ``target``.
+    A file target is returned as a one-element list when it ends in ``.py`` and as
+    an empty list otherwise; ignore patterns are not applied to it. A directory
+    target is searched recursively, skipping well-known tooling directories and
+    anything matching ``extra_ignore_patterns``.
+    Args:
+        target: File or directory to scan.
+        extra_ignore_patterns: Optional globs, passed to ``_matches_any_pattern``.
+    Returns:
+        Sorted list of matching paths (empty when nothing qualifies).
+    """
+    extra_ignore_patterns = extra_ignore_patterns or []
+    if target.is_file():
+        return [target] if target.suffix == ".py" else []
+    excluded_dirs = {".git", ".venv", "venv", "node_modules", "__pycache__"}
+    return sorted(
+        path
+        for path in target.rglob("*.py")
+        # Only components below ``target`` are checked: testing the absolute path
+        # would drop every file when the project itself lives under a directory
+        # that happens to be called e.g. "venv".
+        if excluded_dirs.isdisjoint(path.relative_to(target).parts)
+        if not _matches_any_pattern(path, target, extra_ignore_patterns)
+    )
+def _matches_any_pattern(path: Path, root: Path, patterns: list[str]) -> bool:
+    """Tell whether ``path`` matches at least one glob in ``patterns``.
+    Each pattern is tried against the path relative to ``root`` (or to the parent
+    of ``root`` when ``root`` is not a directory) and against the bare file name.
+    If ``path`` is not located under that base, its full POSIX form is used.
+    """
+    try:
+        base = root if root.is_dir() else root.parent
+        relative_text = path.relative_to(base).as_posix()
+    except ValueError:
+        relative_text = path.as_posix()
+    relative_path = Path(relative_text)
+    file_name = Path(path.name)
+    return any(relative_path.match(glob) or file_name.match(glob) for glob in patterns)
+def _resolve_api_key(provider: Provider) -> str | None:
+    """Read the API key for ``provider`` from its environment variable.
+    Returns None when the variable is unset or the provider is not recognised.
+    """
+    env_vars = (
+        (Provider.anthropic, "ANTHROPIC_API_KEY"),
+        (Provider.openai, "OPENAI_API_KEY"),
+    )
+    for known_provider, env_var in env_vars:
+        if provider == known_provider:
+            return os.getenv(env_var)
+    return None
+def _missing_api_key_message(provider: Provider) -> str:
+    """Build the error text shown when the API key for ``provider`` is missing."""
+    messages = (
+        (
+            Provider.anthropic,
+            "ANTHROPIC_API_KEY is not set. Export it before running, for example: export ANTHROPIC_API_KEY=your_key",
+        ),
+        (
+            Provider.openai,
+            "OPENAI_API_KEY is not set. Export it before running, for example: export OPENAI_API_KEY=your_key",
+        ),
+    )
+    for known_provider, message in messages:
+        if provider == known_provider:
+            return message
+    # Generic wording for a provider without a dedicated message.
+    return "Provider API key is not set."
+def _default_model_for_provider(provider: Provider) -> str:
+    """Return the model name used when neither the CLI nor the config names one.
+    Raises:
+        ValueError: if ``provider`` has no default model.
+    """
+    defaults = (
+        (Provider.anthropic, "claude-3-5-sonnet-latest"),
+        (Provider.openai, "gpt-4.1"),
+    )
+    for known_provider, model_name in defaults:
+        if provider == known_provider:
+            return model_name
+    raise ValueError(f"unsupported provider: {provider}")
+def _provider_from_config(config: AppConfig) -> Provider | None:
+    """Convert the provider string of the config into a ``Provider`` member.
+    Returns None when the config does not set a provider.
+    Raises:
+        typer.BadParameter: if the configured value is not a known provider.
+    """
+    configured = config.provider
+    if configured is not None:
+        try:
+            return Provider(configured)
+        except ValueError as exc:
+            raise typer.BadParameter(f"unsupported provider in .aion.yaml: {config.provider}") from exc
+    return None
+def _expand_explicit_targets(targets: list[Path]) -> set[str]:
+    """Flatten --ai-generated arguments into a set of normalized ``.py`` paths.
+    Directories contribute every ``.py`` file found recursively; plain paths are
+    kept only when they end in ``.py``; anything else is ignored.
+    """
+    collected: set[str] = set()
+    for entry in targets:
+        if entry.is_dir():
+            collected.update(normalize_path(found) for found in entry.rglob("*.py"))
+            continue
+        if entry.suffix == ".py":
+            collected.add(normalize_path(entry))
+    return collected
+def _detect_ai_generated_files(
+    candidates: list[Path],
+    explicit_targets: set[str],
+    root: Path,
+) -> tuple[list[Path], list[str]]:
+    """Choose which candidate files are scanned as AI-generated.
+    With explicit targets, only candidates whose normalized path is listed are
+    kept (possibly none). Otherwise files carrying an AI marker in their content
+    or git history are kept, and when no file carries one every candidate is
+    returned.
+    Returns:
+        ``(files, warnings)`` where ``warnings`` explains an empty or fallback result.
+    """
+    notes: list[str] = []
+    if explicit_targets:
+        matched = [candidate for candidate in candidates if normalize_path(candidate) in explicit_targets]
+        if not matched:
+            notes.append("No Python files matched --ai-generated targets.")
+        return matched, notes
+    flagged = []
+    for candidate in candidates:
+        # The git lookup only runs when the file content has no marker.
+        if _has_ai_marker(candidate) or _git_history_has_ai_marker(candidate, root):
+            flagged.append(candidate)
+    if not flagged:
+        notes.append("No AI-generated markers found; scanning all Python files.")
+        return candidates, notes
+    return flagged, notes
+def _has_ai_marker(path: Path) -> bool:
+    """Check the first 2000 characters of ``path`` for any entry of AI_MARKERS.
+    The comparison is case-insensitive; a file that cannot be read counts as
+    having no marker.
+    """
+    try:
+        text = path.read_text(encoding="utf-8", errors="ignore")
+    except OSError:
+        return False
+    head = text[:2000].lower()
+    for marker in AI_MARKERS:
+        if marker in head:
+            return True
+    return False
+def _git_history_has_ai_marker(path: Path, root: Path) -> bool:
+    """Look for AI-tool traces in the git log of ``path``.
+    Runs ``git log`` inside ``root`` for the single file and searches author names
+    and commit messages, case-insensitively. Returns False when git cannot be
+    started or exits with a non-zero status.
+    """
+    try:
+        completed = subprocess.run(
+            ["git", "-C", str(root), "log", "--format=%an%n%B", "--", str(path)],
+            capture_output=True,
+            text=True,
+            check=False,
+        )
+    except OSError:
+        return False
+    if completed.returncode != 0:
+        return False
+    log_text = completed.stdout.lower()
+    # NOTE(review): the bare "cursor" token also matches ordinary commit messages
+    # such as "fix db cursor leak"; confirm this breadth is intended.
+    return any(token in log_text for token in ("github-copilot", "cursor"))
+def _exit_with_summary(summary: ProjectScanSummary, output: str) -> None:
+    """Render ``summary`` and finish the command.
+    ``output == "json"`` prints the serialized summary to stdout and exits with
+    code 0. Any other value uses the text renderer: warnings go to stderr, then
+    either a "nothing scanned" panel (followed by exit code 0) or an overview
+    panel plus one findings table per report that has findings. On that last
+    path the function simply returns instead of raising ``typer.Exit``.
+    """
+    if output == "json":
+        stdout_console.print_json(summary.model_dump_json())
+        raise typer.Exit(code=0)
+    for message in summary.warnings:
+        stderr_console.print(f"[yellow]warning:[/yellow] {message}")
+    if summary.files_scanned == 0:
+        stdout_console.print(Panel("No Python files were scanned.", title="AION"))
+        raise typer.Exit(code=0)
+    overview = f"Target: {summary.target}\nFiles scanned: {summary.files_scanned}\nFindings: {summary.finding_count}"
+    stdout_console.print(Panel(overview, title="AION"))
+    tables_printed = 0
+    for report in summary.sorted_reports():
+        if report.findings:
+            table = Table(title=report.file)
+            table.add_column("Severity")
+            table.add_column("Line", justify="right")
+            for heading in ("Issue", "Context Gap", "Fix"):
+                table.add_column(heading)
+            # Rows are ordered by severity rank, then by line number.
+            for finding in sorted(report.findings, key=_severity_sort_key):
+                table.add_row(finding.severity, str(finding.line), finding.issue, finding.context_gap, finding.fix)
+            stdout_console.print(table)
+            tables_printed += 1
+    if tables_printed == 0:
+        stdout_console.print("[green]No findings reported.[/green]")
+def _severity_sort_key(finding: Finding) -> tuple[int, int]:
+    """Sort key placing critical findings first, then by ascending line number.
+    Severities outside critical/high/medium/low sort after all known ones.
+    """
+    rank_by_severity = {"critical": 0, "high": 1, "medium": 2, "low": 3}
+    rank = rank_by_severity.get(finding.severity, 4)
+    return rank, finding.line

aion/config.py ADDED Viewed

@@ -0,0 +1,83 @@
+from __future__ import annotations
+import ast
+from dataclasses import dataclass, field
+from pathlib import Path
+@dataclass
+class AppConfig:
+    """Settings read from a project's ``.aion.yaml``; every field is optional."""
+    # Provider name as written in the file; validated later by the CLI.
+    provider: str | None = None
+    # Model name override; None means "use the provider default".
+    model: str | None = None
+    # Extra glob patterns for paths that should not be scanned.
+    ignore_paths: list[str] = field(default_factory=list)
+class ConfigError(RuntimeError):
+    """Raised when ``.aion.yaml`` cannot be interpreted."""
+def load_app_config(root: Path) -> AppConfig:
+    """Load ``<root>/.aion.yaml``, or return default settings when it is absent."""
+    config_path = root / ".aion.yaml"
+    return _parse_config(config_path) if config_path.exists() else AppConfig()
+def _parse_config(path: Path) -> AppConfig:
+    """Parse the small YAML subset accepted in ``.aion.yaml``.
+    Supported forms are top-level ``key: value`` pairs and ``key:`` followed by
+    an indented block of ``- item`` lines. Blank lines and lines starting with
+    ``#`` are skipped. Only ``provider``, ``model`` and ``ignore_paths`` are read
+    from the result; other keys are parsed and then ignored.
+    Raises:
+        ConfigError: on an indented top-level line, a line without ``:``, a
+            malformed list item, or a non-list ``ignore_paths``.
+    """
+    lines = path.read_text(encoding="utf-8").splitlines()
+    data: dict[str, object] = {}
+    index = 0
+    while index < len(lines):
+        raw_line = lines[index]
+        stripped = raw_line.strip()
+        index += 1
+        if not stripped or stripped.startswith("#"):
+            continue
+        # Indented content is only legal inside a list block, which is consumed
+        # by the inner loop below; reaching one here means it is misplaced.
+        if raw_line.startswith((" ", "\t")):
+            raise ConfigError(f"unexpected indentation in {path}")
+        if ":" not in raw_line:
+            raise ConfigError(f"invalid config line: {raw_line}")
+        # Split on the first colon only, so values may contain colons.
+        key, value = raw_line.split(":", 1)
+        key = key.strip()
+        value = value.strip()
+        if not value:
+            # "key:" with nothing after it opens a list block.
+            items: list[str] = []
+            while index < len(lines):
+                nested_raw = lines[index]
+                nested = nested_raw.strip()
+                if not nested or nested.startswith("#"):
+                    index += 1
+                    continue
+                # The block ends at the first line not indented by two spaces or
+                # a tab; that line is left for the outer loop to handle.
+                if not nested_raw.startswith(("  ", "\t")):
+                    break
+                if not nested.startswith("- "):
+                    raise ConfigError(f"invalid list item in {path}: {nested_raw}")
+                items.append(_parse_scalar(nested[2:].strip()))
+                index += 1
+            data[key] = items
+            continue
+        data[key] = _parse_scalar(value)
+    ignore_paths = data.get("ignore_paths", [])
+    if not isinstance(ignore_paths, list):
+        raise ConfigError("ignore_paths must be a list")
+    provider = data.get("provider")
+    model = data.get("model")
+    return AppConfig(
+        provider=str(provider) if provider is not None else None,
+        model=str(model) if model is not None else None,
+        ignore_paths=[str(item) for item in ignore_paths],
+    )
+def _parse_scalar(value: str) -> str:
+    """Return ``value`` with one level of surrounding quotes interpreted.
+    Values wrapped in matching single or double quotes are decoded as Python
+    string literals (so escapes such as ``\\n`` work); anything else is returned
+    unchanged.
+    Raises:
+        ConfigError: if a quoted value is not a well-formed literal, e.g. a lone
+            quote character or an unescaped quote in the middle.
+    """
+    single_quoted = value.startswith("'") and value.endswith("'")
+    double_quoted = value.startswith('"') and value.endswith('"')
+    if not (single_quoted or double_quoted):
+        return value
+    try:
+        parsed = ast.literal_eval(value)
+    except (SyntaxError, ValueError) as exc:
+        # Report malformed quoting as a config problem, not a raw parser error.
+        raise ConfigError(f"invalid quoted value in config: {value}") from exc
+    # Text such as `"a", "b"` evaluates to a tuple; keep the raw text in that
+    # case so the declared ``str`` return type always holds.
+    return parsed if isinstance(parsed, str) else value

aion/context_extractor.py ADDED Viewed

@@ -0,0 +1,243 @@
+from __future__ import annotations
+import ast
+import hashlib
+import json
+import random
+from dataclasses import dataclass
+from fnmatch import fnmatch
+from pathlib import Path
+from .models import ContextProfile, normalize_path
+# Directory names whose contents are never inspected.
+DEFAULT_EXCLUDES = {
+    ".git",
+    ".hg",
+    ".svn",
+    ".venv",
+    "venv",
+    "node_modules",
+    "__pycache__",
+}
+# Module-name prefixes, matched against lower-cased import names in
+# ContextExtractor._classify_import, grouped by the role they indicate.
+ORM_IMPORTS = ("sqlalchemy", "django.db", "peewee", "tortoise", "pony", "ormar")
+HTTP_IMPORTS = ("httpx", "requests", "aiohttp", "urllib3")
+LOW_LEVEL_DB_IMPORTS = ("sqlite3", "pymysql", "psycopg2", "mysql.connector", "MySQLdb")
+# Dotted call names recognised as database access, mapped to the label that is
+# recorded in the profile's db_patterns.
+DB_CALL_PATTERNS = {
+    "session.query": "session.query()",
+    "session.execute": "session.execute()",
+    "db.execute": "db.execute()",
+    "cursor.execute": "cursor.execute()",
+    "Model.objects": "Model.objects",
+}
+@dataclass
+class ExtractedFileData:
+    """Facts collected from one parsed Python file (all lists are sorted).
+    The field names double as the keys of the on-disk cache entry, because
+    instances are stored via ``__dict__`` and rebuilt with ``**data``.
+    """
+    # Imported module names.
+    imports: list[str]
+    # Decorator names rendered with a leading "@".
+    decorators: list[str]
+    # Labels from DB_CALL_PATTERNS for the calls found in the file.
+    db_patterns: list[str]
+    # Names of all (async) function definitions.
+    function_names: list[str]
+    # ORM / HTTP client / low-level DB libraries this file imports.
+    orm_candidates: list[str]
+    http_candidates: list[str]
+    low_level_db_imports: list[str]
+class ContextExtractor:
+    """Build a ``ContextProfile`` by statically inspecting Python files under a root.
+    Per-file results are cached in a JSON file keyed by normalized path and
+    validated with the SHA-256 of the file content, so unchanged files are not
+    parsed again on later runs.
+    """
+    def __init__(
+        self,
+        root: Path,
+        max_files: int = 500,
+        cache_path: Path | None = None,
+        extra_ignore_patterns: list[str] | None = None,
+    ):
+        """Prepare an extractor for ``root``.
+        Args:
+            root: Directory to inspect; stored resolved.
+            max_files: Maximum number of files inspected; larger trees are sampled.
+            cache_path: Location of the JSON cache (default ``~/.aion-context.json``).
+            extra_ignore_patterns: fnmatch globs added to the .gitignore-derived ones.
+        """
+        self.root = root.resolve()
+        self.max_files = max_files
+        self.cache_path = cache_path or Path.home() / ".aion-context.json"
+        self._cache = self._load_cache()
+        self._ignore_patterns = self._load_gitignore_patterns()
+        self._ignore_patterns.extend(extra_ignore_patterns or [])
+    def extract(self) -> ContextProfile:
+        """Scan the project and return the aggregated profile.
+        Files that cannot be read or parsed are listed in ``skipped_files``. The
+        cache is written back before returning.
+        """
+        python_files = self._collect_python_files()
+        sampled = False
+        if len(python_files) > self.max_files:
+            sampled = True
+            # Fixed seed: the same tree always yields the same sample.
+            randomizer = random.Random(42)
+            python_files = sorted(randomizer.sample(python_files, self.max_files))
+        profile = ContextProfile(scanned_files=len(python_files), sampled=sampled)
+        orm_votes: dict[str, int] = {}
+        http_votes: dict[str, int] = {}
+        import_set: set[str] = set()
+        decorator_set: set[str] = set()
+        db_pattern_set: set[str] = set()
+        function_set: set[str] = set()
+        low_level_set: set[str] = set()
+        for file_path in python_files:
+            cached = self._extract_with_cache(file_path)
+            if cached is None:
+                profile.skipped_files.append(normalize_path(file_path))
+                continue
+            import_set.update(cached.imports)
+            decorator_set.update(cached.decorators)
+            db_pattern_set.update(cached.db_patterns)
+            function_set.update(cached.function_names)
+            low_level_set.update(cached.low_level_db_imports)
+            # One vote per file and library; the most common library wins below.
+            for orm in cached.orm_candidates:
+                orm_votes[orm] = orm_votes.get(orm, 0) + 1
+            for client in cached.http_candidates:
+                http_votes[client] = http_votes.get(client, 0) + 1
+        # Each list is capped to its first entries in sorted order.
+        profile.imports = sorted(import_set)[:50]
+        profile.auth_decorators = sorted(decorator_set)[:20]
+        profile.db_patterns = sorted(db_pattern_set)[:20]
+        profile.function_names = sorted(function_set)[:30]
+        profile.low_level_db_imports = sorted(low_level_set)[:20]
+        profile.orm = self._pick_top_vote(orm_votes)
+        profile.http_client = self._pick_top_vote(http_votes)
+        self._write_cache()
+        return profile
+    def _collect_python_files(self) -> list[Path]:
+        """Return the sorted ``.py`` files under the root that are not excluded."""
+        files: list[Path] = []
+        for path in self.root.rglob("*.py"):
+            relative_path = path.relative_to(self.root)
+            # Check only components below the root: an ancestor of the project
+            # named e.g. "venv" must not exclude the whole project.
+            if any(part in DEFAULT_EXCLUDES for part in relative_path.parts):
+                continue
+            if self._is_ignored(relative_path.as_posix()):
+                continue
+            files.append(path)
+        return sorted(files)
+    def _is_ignored(self, relative_path: str) -> bool:
+        """Tell whether the relative path or its file name matches an ignore glob."""
+        for pattern in self._ignore_patterns:
+            if fnmatch(relative_path, pattern) or fnmatch(Path(relative_path).name, pattern):
+                return True
+        return False
+    def _load_gitignore_patterns(self) -> list[str]:
+        """Translate the root ``.gitignore`` into fnmatch globs.
+        Comments, blank lines and negations (``!``) are skipped. A leading ``/`` is
+        removed because globs are matched against root-relative paths. Directory
+        entries (``name/``) become ``name/*``.
+        """
+        gitignore = self.root / ".gitignore"
+        if not gitignore.exists():
+            return []
+        patterns: list[str] = []
+        for raw_line in gitignore.read_text(encoding="utf-8", errors="ignore").splitlines():
+            line = raw_line.strip()
+            if not line or line.startswith("#") or line.startswith("!"):
+                continue
+            # Strip the anchor first so "/build/" yields "build/*" and not
+            # "/build/*", which can never match a relative path.
+            anchored = line.lstrip("/")
+            if not anchored:
+                continue
+            if anchored.endswith("/"):
+                # The bare "name/" form is dropped: it cannot match a file path.
+                patterns.append(f"{anchored}*")
+            else:
+                patterns.append(anchored)
+        return patterns
+    def _extract_with_cache(self, file_path: Path) -> ExtractedFileData | None:
+        """Return the data for ``file_path``, reusing the cache when the hash matches.
+        Returns None when the file cannot be read or parsed.
+        """
+        try:
+            content = file_path.read_text(encoding="utf-8", errors="ignore")
+        except OSError:
+            # Unreadable file: report it as skipped instead of aborting the scan.
+            return None
+        digest = hashlib.sha256(content.encode("utf-8")).hexdigest()
+        cache_key = normalize_path(file_path)
+        cached = self._cache.get(cache_key)
+        if isinstance(cached, dict) and cached.get("sha256") == digest:
+            try:
+                return ExtractedFileData(**cached["data"])
+            except (KeyError, TypeError):
+                # Entry from another version or a damaged cache: parse again.
+                pass
+        parsed = self._extract_file(file_path, content)
+        if parsed is None:
+            return None
+        self._cache[cache_key] = {"sha256": digest, "data": parsed.__dict__}
+        return parsed
+    def _extract_file(self, file_path: Path, content: str) -> ExtractedFileData | None:
+        """Parse ``content`` and collect imports, decorators, functions and DB calls.
+        Returns None when the source cannot be parsed.
+        """
+        try:
+            tree = ast.parse(content, filename=str(file_path))
+        except (SyntaxError, ValueError):
+            # ValueError is raised for source containing null bytes on
+            # interpreters that do not report it as a SyntaxError.
+            return None
+        imports: set[str] = set()
+        decorators: set[str] = set()
+        db_patterns: set[str] = set()
+        function_names: set[str] = set()
+        orm_candidates: set[str] = set()
+        http_candidates: set[str] = set()
+        low_level_db_imports: set[str] = set()
+        for node in ast.walk(tree):
+            if isinstance(node, ast.Import):
+                for alias in node.names:
+                    imports.add(alias.name)
+                    self._classify_import(alias.name, orm_candidates, http_candidates, low_level_db_imports)
+            elif isinstance(node, ast.ImportFrom):
+                # Relative imports without a module name ("from . import x") are skipped.
+                module = node.module or ""
+                if module:
+                    imports.add(module)
+                    self._classify_import(module, orm_candidates, http_candidates, low_level_db_imports)
+            elif isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
+                function_names.add(node.name)
+                for decorator in node.decorator_list:
+                    rendered = self._render_name(decorator)
+                    if rendered:
+                        decorators.add(f"@{rendered}")
+            elif isinstance(node, ast.Call):
+                rendered = self._render_name(node.func)
+                if rendered and rendered in DB_CALL_PATTERNS:
+                    db_patterns.add(DB_CALL_PATTERNS[rendered])
+        return ExtractedFileData(
+            imports=sorted(imports),
+            decorators=sorted(decorators),
+            db_patterns=sorted(db_patterns),
+            function_names=sorted(function_names),
+            orm_candidates=sorted(orm_candidates),
+            http_candidates=sorted(http_candidates),
+            low_level_db_imports=sorted(low_level_db_imports),
+        )
+    def _classify_import(
+        self,
+        module_name: str,
+        orm_candidates: set[str],
+        http_candidates: set[str],
+        low_level_db_imports: set[str],
+    ) -> None:
+        """Add ``module_name`` to each candidate set whose known prefix it starts with.
+        Matching is case-insensitive and mutates the three sets in place.
+        """
+        lowered = module_name.lower()
+        for orm in ORM_IMPORTS:
+            if lowered.startswith(orm):
+                # Only the top-level package is recorded ("django.db" -> "django").
+                orm_candidates.add(orm.split(".")[0])
+        for client in HTTP_IMPORTS:
+            if lowered.startswith(client):
+                http_candidates.add(client)
+        for db_import in LOW_LEVEL_DB_IMPORTS:
+            if lowered.startswith(db_import.lower()):
+                low_level_db_imports.add(db_import)
+    def _render_name(self, node: ast.AST) -> str | None:
+        """Render a name, attribute chain or call target as dotted text.
+        Returns None for node types that have no such form.
+        """
+        if isinstance(node, ast.Name):
+            return node.id
+        if isinstance(node, ast.Attribute):
+            base = self._render_name(node.value)
+            return f"{base}.{node.attr}" if base else node.attr
+        if isinstance(node, ast.Call):
+            # A called decorator such as @login_required(...) renders as its callee.
+            return self._render_name(node.func)
+        return None
+    def _pick_top_vote(self, votes: dict[str, int]) -> str | None:
+        """Return the key with the most votes; ties go to the alphabetically first."""
+        if not votes:
+            return None
+        return sorted(votes.items(), key=lambda item: (-item[1], item[0]))[0][0]
+    def _load_cache(self) -> dict[str, dict[str, object]]:
+        """Read the JSON cache; any unreadable or unexpected content yields ``{}``."""
+        if not self.cache_path.exists():
+            return {}
+        try:
+            loaded = json.loads(self.cache_path.read_text(encoding="utf-8"))
+        except (OSError, ValueError):
+            # ValueError covers both invalid JSON and undecodable bytes.
+            return {}
+        # A valid JSON document that is not an object cannot serve as the cache.
+        return loaded if isinstance(loaded, dict) else {}
+    def _write_cache(self) -> None:
+        """Persist the cache; write failures are ignored because it is best-effort."""
+        try:
+            self.cache_path.write_text(
+                json.dumps(self._cache, indent=2, sort_keys=True),
+                encoding="utf-8",
+            )
+        except OSError:
+            pass

aion/evaluation.py ADDED Viewed

@@ -0,0 +1,142 @@
+from __future__ import annotations
+import json
+from dataclasses import dataclass
+from pathlib import Path
+from .llm_analyzer import LLMAnalyzer, LLMAnalyzerError
+from .models import ContextProfile, Finding, SemgrepFinding
+from .risk_heuristics import fallback_reasons
+from .semgrep_runner import SemgrepError, SemgrepRunner, semgrep_available
+@dataclass(frozen=True)
+class FixtureCase:
+    """One labelled evaluation fixture, built from an entry of ``labels.json``."""
+    # Key of the entry in labels.json, relative to the fixtures root.
+    relative_path: str
+    # Source file to analyse.
+    source_path: Path
+    # "<prefix>_context.json" next to the source file (see load_fixture_cases).
+    context_path: Path
+    # Ground-truth label: whether the fixture contains a vulnerability.
+    has_vuln: bool
+    # Label text taken from the "expected_context_gap" field of the entry.
+    expected_context_gap: str
+@dataclass(frozen=True)
+class FixturePrediction:
+    """Analysis outcome for a single fixture case."""
+    case: FixtureCase
+    # Findings returned by the LLM analyzer (empty when it was skipped or failed).
+    findings: list[Finding]
+    semgrep_findings: list[SemgrepFinding]
+    # Whether semgrep was available during the evaluation run.
+    used_semgrep: bool
+    @property
+    def predicted_vulnerable(self) -> bool:
+        """True when at least one finding was reported; semgrep hits alone do not count."""
+        return bool(self.findings)
+@dataclass(frozen=True)
+class EvalMetrics:
+    """Confusion-matrix counts with derived precision and recall."""
+    true_positive: int
+    false_positive: int
+    true_negative: int
+    false_negative: int
+    @property
+    def precision(self) -> float:
+        """TP / (TP + FP); defined as 1.0 when nothing was predicted positive."""
+        predicted_positive = self.true_positive + self.false_positive
+        if not predicted_positive:
+            return 1.0
+        return self.true_positive / predicted_positive
+    @property
+    def recall(self) -> float:
+        """TP / (TP + FN); defined as 1.0 when there are no actual positives."""
+        actual_positive = self.true_positive + self.false_negative
+        if not actual_positive:
+            return 1.0
+        return self.true_positive / actual_positive
+def load_fixture_cases(fixtures_root: Path) -> list[FixtureCase]:
+    """Read ``labels.json`` under ``fixtures_root`` into cases ordered by path.
+    The context file of a case sits next to its source file and is named after
+    the part of the source file name before the first underscore, followed by
+    ``_context.json``.
+    """
+    labels_file = fixtures_root / "labels.json"
+    labels = json.loads(labels_file.read_text(encoding="utf-8"))
+    def build_case(relative_path, metadata) -> FixtureCase:
+        source_path = fixtures_root / relative_path
+        prefix = source_path.name.partition("_")[0]
+        return FixtureCase(
+            relative_path=relative_path,
+            source_path=source_path,
+            context_path=source_path.with_name(f"{prefix}_context.json"),
+            has_vuln=bool(metadata["has_vuln"]),
+            expected_context_gap=str(metadata["expected_context_gap"]),
+        )
+    return [build_case(relative_path, metadata) for relative_path, metadata in sorted(labels.items())]
+def load_context_profile(context_path: Path) -> ContextProfile:
+    """Build a ``ContextProfile`` from the JSON object stored at ``context_path``."""
+    with context_path.open(encoding="utf-8") as handle:
+        fields = json.load(handle)
+    return ContextProfile(**fields)
+def evaluate_cases(
+    cases: list[FixtureCase],
+    api_key: str,
+    model: str = "claude-3-5-sonnet-latest",
+    provider: str = "anthropic",
+    ignore_llm_errors: bool = True,
+) -> list[FixturePrediction]:
+    """Run the semgrep + LLM pipeline on every fixture and collect predictions.
+    The LLM is skipped for a case only when semgrep is available and neither
+    semgrep nor the risk heuristics flagged anything. Semgrep failures count as
+    "no semgrep findings".
+    Args:
+        cases: Fixtures to evaluate.
+        api_key: Key handed to the LLM analyzer.
+        model: Model name handed to the LLM analyzer.
+        provider: Provider name handed to the LLM analyzer.
+        ignore_llm_errors: When true, an ``LLMAnalyzerError`` is recorded as a case
+            without findings; when false it propagates.
+    Returns:
+        One ``FixturePrediction`` per case, in input order.
+    """
+    llm = LLMAnalyzer(api_key=api_key, model=model, provider=provider)
+    semgrep = SemgrepRunner()
+    semgrep_enabled = semgrep_available()
+    results: list[FixturePrediction] = []
+    for case in cases:
+        profile = load_context_profile(case.context_path)
+        static_hits: list[SemgrepFinding] = []
+        if semgrep_enabled:
+            try:
+                static_hits = semgrep.run(case.source_path)
+            except SemgrepError:
+                static_hits = []
+        try:
+            signals = fallback_reasons(case.source_path, profile)
+            llm_needed = (not semgrep_enabled) or bool(static_hits) or bool(signals)
+            if llm_needed:
+                llm_findings = llm.analyze(
+                    case.source_path,
+                    profile,
+                    static_hits,
+                    fallback_signals=signals,
+                )
+            else:
+                llm_findings = []
+        except LLMAnalyzerError:
+            if not ignore_llm_errors:
+                raise
+            llm_findings = []
+        results.append(
+            FixturePrediction(
+                case=case,
+                findings=llm_findings,
+                semgrep_findings=static_hits,
+                used_semgrep=semgrep_enabled,
+            )
+        )
+    return results
+def compute_metrics(predictions: list[FixturePrediction]) -> EvalMetrics:
+    """Tally predictions against the fixture labels into confusion-matrix counts."""
+    # Keys are (actually vulnerable, predicted vulnerable).
+    tally = {
+        (True, True): 0,
+        (True, False): 0,
+        (False, True): 0,
+        (False, False): 0,
+    }
+    for prediction in predictions:
+        actual = bool(prediction.case.has_vuln)
+        predicted = bool(prediction.predicted_vulnerable)
+        tally[(actual, predicted)] += 1
+    return EvalMetrics(
+        true_positive=tally[(True, True)],
+        false_positive=tally[(False, True)],
+        true_negative=tally[(False, False)],
+        false_negative=tally[(True, False)],
+    )

aion/llm_analyzer.py ADDED Viewed

@@ -0,0 +1,219 @@
+from __future__ import annotations
+import json
+from pathlib import Path
+from typing import Literal
+from .models import ContextProfile, Finding, LLMScanResponse, SemgrepFinding
+LLMProvider = Literal["anthropic", "openai"]
+class LLMAnalyzerError(RuntimeError):
+    """Raised when the LLM analysis step cannot be completed."""
+class LLMAnalyzer:
+    """Ask an LLM for structured security findings on one file, chunk by chunk.
+
+    The file is cut into overlapping line windows. Each window is sent to the
+    configured provider through an ``instructor``-wrapped client, the
+    chunk-relative line numbers are mapped back to file lines, and duplicate
+    findings (which the overlap can produce) are dropped.
+    """
+    def __init__(
+        self,
+        api_key: str,
+        model: str,
+        provider: LLMProvider = "anthropic",
+        max_chunk_lines: int = 200,
+        overlap_lines: int = 50,
+        verbose: bool = False,
+    ):
+        """Store the configuration; no client is created until ``analyze`` runs.
+
+        Args:
+            api_key: Key handed to the provider SDK client.
+            model: Model name passed through to the provider.
+            provider: ``"anthropic"`` or ``"openai"``.
+            max_chunk_lines: Largest number of source lines sent per request.
+            overlap_lines: Lines shared between consecutive chunks.
+            verbose: When true and a console is given to ``analyze``, each
+                prompt is printed before it is sent.
+        """
+        self.api_key = api_key
+        self.model = model
+        self.provider = provider
+        self.max_chunk_lines = max_chunk_lines
+        self.overlap_lines = overlap_lines
+        self.verbose = verbose
+    def analyze(
+        self,
+        target: Path,
+        context_profile: ContextProfile,
+        semgrep_findings: list[SemgrepFinding],
+        fallback_signals: list[str] | None = None,
+        console=None,
+    ) -> list[Finding]:
+        """Return de-duplicated findings for ``target`` with file-level lines.
+
+        Args:
+            target: File to analyse; read as UTF-8, undecodable bytes dropped.
+            context_profile: Project summary embedded in every prompt.
+            semgrep_findings: Semgrep results; only those whose line falls in
+                a chunk are included in that chunk's prompt.
+            fallback_signals: Heuristic risk hints; ``None`` means none.
+            console: Optional object with a ``print`` method, used only when
+                ``verbose`` is set.
+
+        Raises:
+            LLMAnalyzerError: If the file cannot be read, a required SDK is
+                missing, the provider is unsupported, or a request fails.
+        """
+        try:
+            source = target.read_text(encoding="utf-8", errors="ignore")
+        except OSError as exc:
+            raise LLMAnalyzerError(f"failed to read {target}: {exc}") from exc
+        client = self._create_client()
+        chunks = self._chunk_source(source)
+        findings: list[Finding] = []
+        for chunk in chunks:
+            prompt = self._build_prompt(
+                target=target,
+                chunk_text=chunk["text"],
+                start_line=chunk["start_line"],
+                end_line=chunk["end_line"],
+                context_profile=context_profile,
+                semgrep_findings=semgrep_findings,
+                fallback_signals=fallback_signals or [],
+            )
+            if self.verbose and console is not None:
+                console.print("[bold]LLM prompt[/bold]")
+                console.print(prompt)
+            try:
+                response = self._create_completion(client, prompt)
+            except Exception as exc:  # noqa: BLE001
+                # Provider SDKs raise many unrelated exception types; callers
+                # only need to handle LLMAnalyzerError.
+                raise LLMAnalyzerError(str(exc)) from exc
+            for finding in response.findings:
+                findings.append(
+                    Finding(
+                        issue=finding.issue,
+                        severity=finding.severity,
+                        # The prompt asks for chunk-relative lines.
+                        line=self._remap_line(chunk["start_line"], finding.line),
+                        context_gap=finding.context_gap,
+                        fix=finding.fix,
+                        semgrep_rule=finding.semgrep_rule,
+                    )
+                )
+        return self._deduplicate(findings)
+    def _create_client(self):
+        """Return an ``instructor`` client wrapping the selected provider SDK.
+
+        Imports are local so a missing SDK surfaces as ``LLMAnalyzerError``
+        rather than failing at module import time.
+
+        Raises:
+            LLMAnalyzerError: If ``instructor`` or the provider SDK is not
+                installed, or the provider name is not recognised.
+        """
+        try:
+            import instructor
+        except ImportError as exc:
+            raise LLMAnalyzerError("instructor is not installed") from exc
+        if self.provider == "anthropic":
+            try:
+                from anthropic import Anthropic
+            except ImportError as exc:
+                raise LLMAnalyzerError("anthropic is not installed") from exc
+            return instructor.from_anthropic(Anthropic(api_key=self.api_key))
+        if self.provider == "openai":
+            try:
+                from openai import OpenAI
+            except ImportError as exc:
+                raise LLMAnalyzerError("openai is not installed") from exc
+            return instructor.from_openai(OpenAI(api_key=self.api_key))
+        raise LLMAnalyzerError(f"unsupported provider: {self.provider}")
+    def _create_completion(self, client, prompt: str) -> LLMScanResponse:
+        """Send ``prompt`` as a single user message and return the parsed reply.
+
+        Both providers are asked for an ``LLMScanResponse`` with an
+        1800-token output cap; only the Anthropic call sets ``temperature=0``.
+
+        Raises:
+            LLMAnalyzerError: If the provider name is not recognised.
+        """
+        if self.provider == "anthropic":
+            return client.messages.create(
+                model=self.model,
+                max_tokens=1800,
+                temperature=0,
+                response_model=LLMScanResponse,
+                messages=[
+                    {
+                        "role": "user",
+                        "content": prompt,
+                    }
+                ],
+            )
+        if self.provider == "openai":
+            return client.chat.completions.create(
+                model=self.model,
+                max_completion_tokens=1800,
+                response_model=LLMScanResponse,
+                messages=[
+                    {
+                        "role": "user",
+                        "content": prompt,
+                    }
+                ],
+            )
+        raise LLMAnalyzerError(f"unsupported provider: {self.provider}")
+    def estimate_tokens(self, target: Path, context_profile: ContextProfile) -> int:
+        """Roughly estimate prompt tokens as characters divided by four.
+
+        Counts the file text plus the JSON-encoded context summary and never
+        returns less than 1. ``OSError`` from reading ``target`` propagates.
+        """
+        source = target.read_text(encoding="utf-8", errors="ignore")
+        payload = json.dumps(context_profile.summary_payload(), ensure_ascii=False)
+        total_chars = len(source) + len(payload)
+        return max(total_chars // 4, 1)
+    def _build_prompt(
+        self,
+        target: Path,
+        chunk_text: str,
+        start_line: int,
+        end_line: int,
+        context_profile: ContextProfile,
+        semgrep_findings: list[SemgrepFinding],
+        fallback_signals: list[str],
+    ) -> str:
+        """Compose the review prompt for one chunk.
+
+        Only semgrep findings whose (file-level) line lies within
+        ``start_line``..``end_line`` are summarised in the prompt.
+        """
+        semgrep_summary = [
+            {
+                "rule": finding.check_id,
+                "line": finding.line,
+                "severity": finding.severity,
+                "message": finding.message,
+            }
+            for finding in semgrep_findings
+            if start_line <= finding.line <= end_line
+        ]
+        return (
+            "You are reviewing AI-generated Python code for security issues.\n"
+            "Focus on context-blindness: places where the code ignores established project patterns.\n"
+            "Return only structured findings matching the response schema.\n\n"
+            f"Target file: {target}\n"
+            f"Chunk line range: {start_line}-{end_line}\n"
+            f"Project context summary: {json.dumps(context_profile.summary_payload(), ensure_ascii=False)}\n"
+            f"Semgrep findings in this chunk: {json.dumps(semgrep_summary, ensure_ascii=False)}\n\n"
+            f"Fallback risk signals: {json.dumps(fallback_signals, ensure_ascii=False)}\n\n"
+            "Rules:\n"
+            "- Report only concrete security issues.\n"
+            "- line must be relative to this chunk, not the whole file.\n"
+            "- context_gap must explain what the AI likely did not know about the project.\n"
+            "- If project context implies ORM/auth/rate-limit usage, mention the exact project pattern being bypassed.\n"
+            "- If fallback risk signals are present, investigate them directly even if Semgrep found nothing.\n"
+            "- Do not treat an empty Semgrep result as evidence that the code is safe.\n"
+            "- Keep fixes actionable and specific.\n\n"
+            "Code chunk:\n"
+            f"{chunk_text}"
+        )
+    def _chunk_source(self, source: str) -> list[dict[str, object]]:
+        """Split ``source`` into overlapping windows of at most ``max_chunk_lines``.
+
+        Returns a list of dicts with ``text``, ``start_line`` and ``end_line``
+        (1-based, inclusive). A source that fits in one window is returned
+        unchanged as a single chunk.
+        """
+        lines = source.splitlines()
+        if len(lines) <= self.max_chunk_lines:
+            return [
+                {
+                    "text": source,
+                    "start_line": 1,
+                    "end_line": max(len(lines), 1),
+                }
+            ]
+        chunks: list[dict[str, object]] = []
+        # A non-positive window, or an overlap >= the window, would give a
+        # zero or negative stride and the loop below would never finish.
+        window = max(self.max_chunk_lines, 1)
+        # A negative overlap must not push the stride past the window, or
+        # lines between consecutive chunks would be skipped.
+        step = min(max(window - self.overlap_lines, 1), window)
+        start = 0
+        while start < len(lines):
+            end = min(start + window, len(lines))
+            chunks.append(
+                {
+                    "text": "\n".join(lines[start:end]),
+                    "start_line": start + 1,
+                    "end_line": end,
+                }
+            )
+            if end >= len(lines):
+                break
+            start += step
+        return chunks
+    def _remap_line(self, chunk_start_line: int, reported_line: int) -> int:
+        """Convert a chunk-relative 1-based line to a file line, never below 1."""
+        return max(chunk_start_line + reported_line - 1, 1)
+    def _deduplicate(self, findings: list[Finding]) -> list[Finding]:
+        """Return findings sorted by (line, issue, severity), one per such key.
+
+        Overlapping chunks can report the same issue twice; the first finding
+        in sorted order is kept for each key.
+        """
+        seen: set[tuple[int, str, str]] = set()
+        unique: list[Finding] = []
+        for finding in sorted(findings, key=lambda item: (item.line, item.issue, item.severity)):
+            key = (finding.line, finding.issue, finding.severity)
+            if key in seen:
+                continue
+            seen.add(key)
+            unique.append(finding)
+        return unique

aion/models.py ADDED Viewed

@@ -0,0 +1,100 @@
+from __future__ import annotations
+from pathlib import Path
+from typing import Literal
+from pydantic import BaseModel, Field
+Severity = Literal["critical", "high", "medium", "low"]
+class ContextProfile(BaseModel):
+    """Summary of a project's conventions that is embedded in LLM prompts.
+
+    NOTE(review): presumably populated by the context extractor, which is
+    not visible in this part of the diff.
+    """
+    orm: str | None = None
+    auth_decorators: list[str] = Field(default_factory=list)
+    db_patterns: list[str] = Field(default_factory=list)
+    low_level_db_imports: list[str] = Field(default_factory=list)
+    http_client: str | None = None
+    imports: list[str] = Field(default_factory=list)
+    function_names: list[str] = Field(default_factory=list)
+    scanned_files: int = 0
+    sampled: bool = False
+    skipped_files: list[str] = Field(default_factory=list)
+    def summary_payload(self) -> dict[str, object]:
+        """Return a size-capped, JSON-serialisable view of the profile.
+
+        List fields are truncated (20, 30 or 50 entries depending on the
+        field) and ``skipped_files`` is left out.
+        """
+        payload: dict[str, object] = {"orm": self.orm}
+        payload["auth_decorators"] = self.auth_decorators[:20]
+        payload["db_patterns"] = self.db_patterns[:20]
+        payload["low_level_db_imports"] = self.low_level_db_imports[:20]
+        payload["http_client"] = self.http_client
+        payload["imports"] = self.imports[:50]
+        payload["function_names"] = self.function_names[:30]
+        payload["scanned_files"] = self.scanned_files
+        payload["sampled"] = self.sampled
+        return payload
+# One semgrep result, reduced to the fields this package keeps.
+class SemgrepFinding(BaseModel):
+    check_id: str  # semgrep rule identifier
+    path: str
+    line: int  # start line of the match
+    end_line: int | None = None  # end line of the match, when reported
+    severity: str = "INFO"  # semgrep's own severity label, not the Severity literal
+    message: str
+    code: str | None = None  # matched source lines, when reported
+    metadata: dict[str, object] = Field(default_factory=dict)
+# A single security issue; also the item schema of LLMScanResponse.findings.
+class Finding(BaseModel):
+    issue: str
+    severity: Severity  # one of "critical", "high", "medium", "low"
+    line: int
+    context_gap: str  # what the code's author likely did not know about the project
+    fix: str
+    semgrep_rule: str | None = None  # presumably the related semgrep rule, if any — confirm
+# Scan result for a single file.
+class ScanReport(BaseModel):
+    file: str
+    findings: list[Finding] = Field(default_factory=list)
+    semgrep_findings: list[SemgrepFinding] = Field(default_factory=list)
+    ai_generated: bool = False  # NOTE(review): presumably set when the file was detected as AI-generated — confirm
+    # Which analysis stages produced this report; defaults to "skipped".
+    mode: Literal["semgrep+llm", "llm-only", "semgrep-only", "skipped"] = "skipped"
+# Structured response schema requested from the LLM (passed as response_model).
+class LLMScanResponse(BaseModel):
+    findings: list[Finding] = Field(default_factory=list)
+class ProjectScanSummary(BaseModel):
+    """Aggregate of the per-file reports produced by one scan of ``target``."""
+    target: str
+    files_scanned: int = 0
+    reports: list[ScanReport] = Field(default_factory=list)
+    warnings: list[str] = Field(default_factory=list)
+    @property
+    def finding_count(self) -> int:
+        """Total number of findings across all reports."""
+        total = 0
+        for entry in self.reports:
+            total += len(entry.findings)
+        return total
+    def sorted_reports(self) -> list[ScanReport]:
+        """Return reports ordered by worst severity first, then by file name.
+
+        Reports without findings sort after every report that has one.
+        """
+        rank = {"critical": 0, "high": 1, "medium": 2, "low": 3}
+        def sort_key(entry: ScanReport) -> tuple[int, str]:
+            # 4 ranks a report with no findings below "low".
+            worst = min((rank[item.severity] for item in entry.findings), default=4)
+            return worst, entry.file
+        return sorted(self.reports, key=sort_key)
+def normalize_path(path: Path) -> str:
+    """Return ``path`` resolved to a string, or as given if resolving fails."""
+    try:
+        resolved = path.resolve()
+    except OSError:
+        return str(path)
+    return str(resolved)

aion/risk_heuristics.py ADDED Viewed

@@ -0,0 +1,90 @@
+from __future__ import annotations
+import ast
+from pathlib import Path
+from .models import ContextProfile
+# Rendered decorator names (see _render_name) that mark a function as a route handler.
+ROUTE_DECORATOR_NAMES = {
+    "app.get",
+    "app.post",
+    "app.put",
+    "app.delete",
+    "app.patch",
+    "router.get",
+    "router.post",
+    "router.put",
+    "router.delete",
+    "router.patch",
+}
+# Substrings that make an assignment target look like a secret (matched on the lower-cased name).
+SECRET_NAME_MARKERS = ("key", "secret", "token", "password")
+# Lower-cased module names treated as direct database drivers.
+LOW_LEVEL_DB_IMPORTS = {"sqlite3", "pymysql", "psycopg2", "mysql.connector", "mysqldb"}
+def fallback_reasons(target: Path, context_profile: ContextProfile) -> list[str]:
+    """Return heuristic risk signals for ``target`` derived from its AST.
+
+    Args:
+        target: Python file to inspect; read as UTF-8, undecodable bytes dropped.
+        context_profile: Project conventions the file is compared against.
+
+    Returns:
+        Human-readable reasons, in a fixed order. The list is empty when
+        nothing is flagged or when the file cannot be read or parsed.
+    """
+    try:
+        source = target.read_text(encoding="utf-8", errors="ignore")
+        tree = ast.parse(source, filename=str(target))
+    except (OSError, SyntaxError, ValueError):
+        # ValueError: before Python 3.12, source containing NUL bytes is
+        # rejected with ValueError instead of SyntaxError.
+        return []
+    reasons: list[str] = []
+    if context_profile.orm and _imports_low_level_db(tree):
+        reasons.append("low-level database access bypasses the project's ORM pattern")
+    if _has_hardcoded_secret(tree):
+        reasons.append("hardcoded secret-like assignment detected")
+    if context_profile.auth_decorators and _has_route_without_auth(tree, context_profile.auth_decorators):
+        reasons.append("route handler is missing the project's auth decorators")
+    return reasons
+def _imports_low_level_db(tree: ast.AST) -> bool:
+    """Report whether the module imports a database driver directly.
+
+    A dotted name matches when it, or any leading package prefix of it, is
+    listed in ``LOW_LEVEL_DB_IMPORTS`` (compared lower-cased). This covers
+    submodule imports such as ``import psycopg2.extras`` as well as
+    ``from mysql import connector``.
+    """
+    def is_low_level(dotted: str) -> bool:
+        parts = dotted.lower().split(".")
+        return any(".".join(parts[:size]) in LOW_LEVEL_DB_IMPORTS for size in range(1, len(parts) + 1))
+    for node in ast.walk(tree):
+        if isinstance(node, ast.Import):
+            if any(is_low_level(alias.name) for alias in node.names):
+                return True
+        elif isinstance(node, ast.ImportFrom):
+            module = node.module or ""
+            if not module:
+                continue
+            if is_low_level(module):
+                return True
+            # For absolute imports the driver may only be named once the
+            # imported alias is appended (``from mysql import connector``).
+            if node.level == 0 and any(is_low_level(f"{module}.{alias.name}") for alias in node.names):
+                return True
+    return False
+def _has_hardcoded_secret(tree: ast.AST) -> bool:
+    """Report whether a string literal is assigned to a secret-looking name.
+
+    Both plain assignments (``token = "..."``) and annotated ones
+    (``token: str = "..."``) are checked. A target looks secret when its
+    rendered, lower-cased name contains one of ``SECRET_NAME_MARKERS``.
+    """
+    for node in ast.walk(tree):
+        if isinstance(node, ast.Assign):
+            targets = node.targets
+        elif isinstance(node, ast.AnnAssign):
+            # An annotated assignment has one target and possibly no value.
+            targets = [node.target]
+        else:
+            continue
+        value = node.value
+        if not (isinstance(value, ast.Constant) and isinstance(value.value, str)):
+            continue
+        for target in targets:
+            name = _render_name(target).lower()
+            if any(marker in name for marker in SECRET_NAME_MARKERS):
+                return True
+    return False
+def _has_route_without_auth(tree: ast.AST, auth_decorators: list[str]) -> bool:
+    """Report whether any route handler carries none of the auth decorators.
+
+    Decorators are compared by their last dotted segment, so
+    ``auth.login_required`` and ``login_required`` are treated as the same.
+    """
+    permitted = {entry.lstrip("@").split(".")[-1] for entry in auth_decorators}
+    handlers = (
+        candidate
+        for candidate in ast.walk(tree)
+        if isinstance(candidate, (ast.FunctionDef, ast.AsyncFunctionDef))
+    )
+    for handler in handlers:
+        rendered = {_render_name(item) for item in handler.decorator_list}
+        # Functions without a route decorator are not handlers.
+        if rendered.isdisjoint(ROUTE_DECORATOR_NAMES):
+            continue
+        protected = False
+        for name in rendered:
+            if name and name.split(".")[-1] in permitted:
+                protected = True
+                break
+        if not protected:
+            return True
+    return False
+def _render_name(node: ast.AST) -> str:
+    """Render a name, attribute chain or call target as a dotted string.
+
+    Calls are replaced by the expression being called. Rendering stops at
+    the first node that is not a name, attribute or call, keeping only the
+    attribute segments collected so far; the result may be ``""``.
+    """
+    segments: list[str] = []
+    current = node
+    while True:
+        if isinstance(current, ast.Call):
+            current = current.func
+        elif isinstance(current, ast.Attribute):
+            segments.append(current.attr)
+            current = current.value
+        elif isinstance(current, ast.Name):
+            segments.append(current.id)
+            break
+        else:
+            break
+    # Segments were collected right-to-left.
+    return ".".join(reversed(segments))

aion/semgrep_runner.py ADDED Viewed

@@ -0,0 +1,50 @@
+from __future__ import annotations
+import json
+import shutil
+import subprocess
+from pathlib import Path
+from .models import SemgrepFinding, normalize_path
+class SemgrepError(RuntimeError):
+    """Raised when semgrep fails or its output cannot be parsed."""
+def semgrep_available() -> bool:
+    """Report whether a ``semgrep`` executable can be found on ``PATH``."""
+    executable = shutil.which("semgrep")
+    return executable is not None
+class SemgrepRunner:
+    """Run the ``semgrep`` CLI on a target and parse its JSON results."""
+    def __init__(self, config: str = "p/python"):
+        """Remember the semgrep ``--config`` value to use for every run."""
+        self.config = config
+    def run(self, target: Path) -> list[SemgrepFinding]:
+        """Scan ``target`` with semgrep and return its findings.
+
+        Raises:
+            SemgrepError: If semgrep cannot be started, exits with a code
+                other than 0 or 1, or prints output that is not a JSON object.
+        """
+        command = ["semgrep", "--json", "--config", self.config, str(target)]
+        try:
+            result = subprocess.run(command, capture_output=True, text=True, check=False)
+        except OSError as exc:
+            # A missing or non-executable binary raises OSError; callers
+            # only handle SemgrepError.
+            raise SemgrepError(f"failed to run semgrep: {exc}") from exc
+        if result.returncode not in (0, 1):
+            raise SemgrepError(result.stderr.strip() or "semgrep failed")
+        try:
+            payload = json.loads(result.stdout or "{}")
+        except json.JSONDecodeError as exc:
+            raise SemgrepError("semgrep returned malformed JSON") from exc
+        if not isinstance(payload, dict):
+            raise SemgrepError("semgrep returned malformed JSON")
+        findings: list[SemgrepFinding] = []
+        for item in payload.get("results", []):
+            start = item.get("start", {})
+            end = item.get("end", {})
+            extra = item.get("extra", {})
+            # ``message`` may be present but null; treat that like a missing one.
+            message = (extra.get("message") or "").strip()
+            findings.append(
+                SemgrepFinding(
+                    check_id=item.get("check_id", "unknown"),
+                    path=normalize_path(Path(item.get("path", str(target)))),
+                    line=start.get("line", 1),
+                    end_line=end.get("line"),
+                    severity=extra.get("severity", "INFO"),
+                    message=message or item.get("check_id", "Semgrep finding"),
+                    code=extra.get("lines"),
+                    metadata=extra.get("metadata", {}),
+                )
+            )
+        return findings

aion_evolve-0.2.1.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,89 @@
+Metadata-Version: 2.4
+Name: aion-evolve
+Version: 0.2.1
+Summary: AION: The Self-Evolving Code Engine. Code Once, Live Forever.
+Project-URL: Homepage, https://github.com/shenxianpeng/aion
+Project-URL: Documentation, https://shenxianpeng.github.io/aion/
+Project-URL: Bug Tracker, https://github.com/shenxianpeng/aion/issues
+Requires-Python: >=3.11
+Description-Content-Type: text/markdown
+Requires-Dist: anthropic>=0.86.0
+Requires-Dist: instructor>=1.14.5
+Requires-Dist: openai>=2.30.0
+Requires-Dist: pydantic>=2.11.0
+Requires-Dist: rich>=14.3.3
+Requires-Dist: semgrep>=1.156.0
+Requires-Dist: typer>=0.23.1
+# AION
+[![Docs](https://img.shields.io/badge/docs-github%20pages-blue)](https://shenxianpeng.github.io/aion/)
+> **Code Once, Live Forever.**
+`AION` is The Self-Evolving Code Engine — designed to end technical debt and keep your codebase in a perpetual state of health.
+AI scans your code continuously, automatically rewrites outdated syntax and risky logic, and delivers an evolved codebase every day. Instead of treating every file in isolation, it builds a lightweight profile of the existing repository, runs `semgrep` as a fast first pass, and only asks the LLM to investigate files that have concrete risk signals or meaningful context gaps. The main differentiator is context-gap reporting, for example: "this file uses `sqlite3`, but the rest of the project uses `sqlalchemy` sessions."
+## Current MVP
+- Python-only scanning
+- Project context extraction via `ast`
+- `semgrep --config p/python` integration
+- LLM-backed structured findings
+- Anthropic and OpenAI providers
+- AI-generated file detection via file markers, git history, or explicit `--ai-generated`
+- Rich terminal output and JSON output
+## Install
+```bash
+uv sync
+```
+## Usage
+```bash
+export ANTHROPIC_API_KEY=your_key
+uv run aion scan ./path/to/project
+uv run aion scan ./path/to/project --ai-generated ./path/to/project/generated_file.py
+uv run aion scan ./path/to/project --output json
+export OPENAI_API_KEY=your_key
+uv run aion scan ./path/to/project --provider openai
+```
+## Config File
+Create `.aion.yaml` in the project root:
+```yaml
+provider: openai
+model: gpt-4.1
+ignore_paths:
+  - tests/*
+  - scripts/generated_*.py
+```
+CLI flags still override config values.
+## Notes
+- If `semgrep` is unavailable, the tool degrades to LLM-only mode and prints a warning.
+- If no AI-generated markers are found, the tool scans all Python files and prints a warning.
+- Context extraction cache is stored at `~/.aion-context.json`.
+- Provider-specific default models, used when `--model` is not set: Anthropic uses `claude-3-5-sonnet-latest`; OpenAI uses `gpt-4.1`.
+## Tests
+```bash
+uv run pytest tests/unit
+uv run pytest -m eval tests/eval
+```
+## Documentation
+Full documentation is published with GitHub Pages:
+- English: `docs/en/`
+- 中文: `docs/zh/`
+- Site URL: `https://shenxianpeng.github.io/aion/`

aion_evolve-0.2.1.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,15 @@
+aion/__init__.py,sha256=N0j6iIy_RoA1JMNfbMwNPS4LNJOf8cGQ-JvM1FsqQ78,211
+aion/__main__.py,sha256=nwyadK0B67PgQNG60KYhqotybP8f0KvfyJ0S5Z7VSsE,60
+aion/cli.py,sha256=1bEbm7Te2SwVAkmm6Z0RwA9YRwd5IT1nPhsS31Fn2OE,10923
+aion/config.py,sha256=kctVKvaiRY065k8drU79LN5-2X-B2Yi7ObBv7D1oi3E,2475
+aion/context_extractor.py,sha256=uUOZLAuDFiJ52NPX_aerft5I4yHFLBFiKiZsPIb0GkM,9065
+aion/evaluation.py,sha256=nsx0zScje1iTVZrHCk6xC12HFvyRtyMJeX-y8Wen02s,4447
+aion/llm_analyzer.py,sha256=Ys6ySkDVrPJvFG71yR-_HFKFArTBSlxeEpzy1knEmt8,8171
+aion/models.py,sha256=vwEfjH0nY9660va7fnEaNxjoDboAY1FTLIrE5aZnUIo,2882
+aion/risk_heuristics.py,sha256=eZYE5jgXbO-SPjvwxWGnSucUNhtwOURpS1sOoKrbebw,3158
+aion/semgrep_runner.py,sha256=aFYSN4rXw5spbziSsLfISl3WXO0ELJD_3t9WBVtKypY,1745
+aion_evolve-0.2.1.dist-info/METADATA,sha256=IRWjfRfubZzuEeCHFFTGMkaiTEsCrQG8DTAbjEEZsZU,2890
+aion_evolve-0.2.1.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
+aion_evolve-0.2.1.dist-info/entry_points.txt,sha256=ESe7xV_00Z1lHI7R4HnQSxbovqXvXtuZqI-k0mN4-TM,38
+aion_evolve-0.2.1.dist-info/top_level.txt,sha256=Di9luRhzG-5XgrNQOA4Iw__XvBajwuEa8zFANbQyIPQ,5
+aion_evolve-0.2.1.dist-info/RECORD,,

aion_evolve-0.2.1.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,5 @@
+Wheel-Version: 1.0
+Generator: setuptools (82.0.1)
+Root-Is-Purelib: true
+Tag: py3-none-any

aion_evolve-0.2.1.dist-info/entry_points.txt ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ [console_scripts]
2	+ aion = aion.cli:app

aion_evolve-0.2.1.dist-info/top_level.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+ aion