repocrunch-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- repocrunch/__init__.py +27 -0
- repocrunch/analyzer.py +91 -0
- repocrunch/api.py +44 -0
- repocrunch/cli.py +83 -0
- repocrunch/client.py +154 -0
- repocrunch/detection.py +117 -0
- repocrunch/extractors/__init__.py +15 -0
- repocrunch/extractors/architecture.py +113 -0
- repocrunch/extractors/health.py +107 -0
- repocrunch/extractors/metadata.py +49 -0
- repocrunch/extractors/security.py +58 -0
- repocrunch/extractors/tech_stack.py +218 -0
- repocrunch/mcp_server.py +28 -0
- repocrunch/models.py +82 -0
- repocrunch/parsers/__init__.py +23 -0
- repocrunch/parsers/build_gradle.py +64 -0
- repocrunch/parsers/cargo_toml.py +21 -0
- repocrunch/parsers/cmakelists.py +26 -0
- repocrunch/parsers/gemfile.py +64 -0
- repocrunch/parsers/go_mod.py +30 -0
- repocrunch/parsers/package_json.py +28 -0
- repocrunch/parsers/pom_xml.py +59 -0
- repocrunch/parsers/pyproject_toml.py +71 -0
- repocrunch/parsers/requirements_txt.py +18 -0
- repocrunch-0.1.0.dist-info/METADATA +218 -0
- repocrunch-0.1.0.dist-info/RECORD +29 -0
- repocrunch-0.1.0.dist-info/WHEEL +4 -0
- repocrunch-0.1.0.dist-info/entry_points.txt +2 -0
- repocrunch-0.1.0.dist-info/licenses/LICENSE +21 -0
repocrunch/__init__.py
ADDED
@@ -0,0 +1,27 @@
"""RepoCrunch — Analyze GitHub repos into structured JSON."""

from __future__ import annotations

import asyncio

from repocrunch.analyzer import analyze_repo
from repocrunch.models import SCHEMA_VERSION, RepoAnalysis

__version__ = "0.1.0"
__all__ = ["analyze", "analyze_sync", "RepoAnalysis", "SCHEMA_VERSION", "__version__"]


async def analyze(
    repo: str,
    token: str | None = None,
) -> RepoAnalysis:
    """Analyze a GitHub repo asynchronously."""
    return await analyze_repo(repo, token=token)


def analyze_sync(
    repo: str,
    token: str | None = None,
) -> RepoAnalysis:
    """Analyze a GitHub repo synchronously."""
    return asyncio.run(analyze_repo(repo, token=token))
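A minimal usage sketch for the public API above (not part of the wheel contents; the repo name is illustrative):

from repocrunch import analyze_sync

# Runs the async pipeline to completion; pass token=... or set GITHUB_TOKEN for higher rate limits.
analysis = analyze_sync("octocat/Hello-World")
print(analysis.repo, analysis.url)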
repocrunch/analyzer.py
ADDED
@@ -0,0 +1,91 @@
"""Orchestrator: parse input → gather data → run extractors → assemble result."""

from __future__ import annotations

import asyncio
import re
from datetime import datetime, timezone

from repocrunch.client import GitHubClient
from repocrunch.extractors.architecture import extract_architecture
from repocrunch.extractors.health import extract_health
from repocrunch.extractors.metadata import extract_metadata
from repocrunch.extractors.security import extract_security
from repocrunch.extractors.tech_stack import extract_tech_stack
from repocrunch.models import RepoAnalysis


def parse_repo_input(raw: str) -> tuple[str, str]:
    """Parse 'owner/repo' or a GitHub URL into (owner, repo)."""
    raw = raw.strip().rstrip("/")

    # Full URL
    match = re.match(r"https?://github\.com/([^/]+)/([^/]+?)(?:\.git)?$", raw)
    if match:
        return match.group(1), match.group(2)

    # owner/repo shorthand
    match = re.match(r"^([A-Za-z0-9_.-]+)/([A-Za-z0-9_.-]+)$", raw)
    if match:
        return match.group(1), match.group(2)

    raise ValueError(f"Cannot parse repo input: {raw!r}. Use 'owner/repo' or a GitHub URL.")


async def analyze_repo(
    repo_input: str,
    token: str | None = None,
    client: GitHubClient | None = None,
) -> RepoAnalysis:
    """Analyze a GitHub repo and return structured results."""
    owner, repo = parse_repo_input(repo_input)
    warnings: list[str] = []

    owns_client = client is None
    if owns_client:
        client = GitHubClient(token=token)

    try:
        # Phase 1: parallel fetch of repo metadata, languages, and file tree
        repo_data, languages, tree_data = await asyncio.gather(
            client.get(f"/repos/{owner}/{repo}"),
            client.get(f"/repos/{owner}/{repo}/languages"),
            client.get(f"/repos/{owner}/{repo}/git/trees/HEAD", params={"recursive": "1"}),
        )

        if repo_data is None:
            raise ValueError(f"Repository not found: {owner}/{repo}")

        tree_data = tree_data or {"tree": []}
        languages = languages or {}
        primary_language = repo_data.get("language")

        # Phase 2: parallel extraction (async extractors run concurrently)
        summary = extract_metadata(repo_data, languages)

        tech_stack, health, security = await asyncio.gather(
            extract_tech_stack(client, owner, repo, tree_data, primary_language),
            extract_health(client, owner, repo, repo_data),
            extract_security(client, owner, repo, tree_data, repo_data, warnings),
        )

        # Architecture is sync — run after tech_stack so we have deps for test detection
        architecture = extract_architecture(tree_data, tech_stack.key_deps)

        # Collect client warnings
        warnings.extend(client.warnings)

        return RepoAnalysis(
            repo=f"{owner}/{repo}",
            url=f"https://github.com/{owner}/{repo}",
            analyzed_at=datetime.now(timezone.utc),
            summary=summary,
            tech_stack=tech_stack,
            architecture=architecture,
            health=health,
            security=security,
            warnings=warnings,
        )
    finally:
        if owns_client:
            await client.close()
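A small sketch of the input forms parse_repo_input accepts (example values, not from the package):

from repocrunch.analyzer import parse_repo_input

print(parse_repo_input("octocat/Hello-World"))                         # ('octocat', 'Hello-World')
print(parse_repo_input("https://github.com/octocat/Hello-World.git"))  # trailing "/" and ".git" are stripped
# parse_repo_input("not a repo") raises ValueError with a usage hint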
repocrunch/api.py
ADDED
@@ -0,0 +1,44 @@
"""FastAPI REST API for RepoCrunch."""

from __future__ import annotations

from fastapi import FastAPI, HTTPException, Query
from fastapi.middleware.cors import CORSMiddleware

from repocrunch import __version__
from repocrunch.analyzer import analyze_repo
from repocrunch.client import RateLimitError

app = FastAPI(
    title="RepoCrunch",
    version=__version__,
    description="Analyze GitHub repos into structured JSON.",
)

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["GET"],
    allow_headers=["*"],
)


@app.get("/analyze")
async def analyze(
    repo: str = Query(description="GitHub repo as 'owner/repo' or URL"),
    github_token: str | None = Query(None, description="GitHub token for higher rate limits"),
):
    try:
        result = await analyze_repo(repo, token=github_token)
        return result.model_dump(mode="json")
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e))
    except RateLimitError:
        raise HTTPException(status_code=429, detail="GitHub API rate limit exhausted")
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


@app.get("/health")
async def health():
    return {"status": "ok", "version": __version__}
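A quick in-process check of both endpoints, as a sketch assuming FastAPI's test client (which needs httpx) is installed; note the /analyze call makes live GitHub API requests:

from fastapi.testclient import TestClient
from repocrunch.api import app

client = TestClient(app)
print(client.get("/health").json())        # {"status": "ok", "version": "0.1.0"}
print(client.get("/analyze", params={"repo": "octocat/Hello-World"}).status_code)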
repocrunch/cli.py
ADDED
@@ -0,0 +1,83 @@
"""Typer CLI for RepoCrunch."""

from __future__ import annotations

import json

import typer

from repocrunch import __version__, analyze_sync

app = typer.Typer(
    name="repocrunch",
    help="Analyze GitHub repos into structured JSON.",
    no_args_is_help=True,
)


@app.command()
def analyze(
    repo: str = typer.Argument(help="GitHub repo as 'owner/repo' or URL"),
    pretty: bool = typer.Option(False, "--pretty", "-p", help="Pretty-print JSON output"),
    field: str | None = typer.Option(None, "--field", "-f", help="Extract a single top-level field"),
    token: str | None = typer.Option(None, "--token", "-t", help="GitHub token (or set GITHUB_TOKEN)"),
) -> None:
    """Analyze a GitHub repository."""
    try:
        result = analyze_sync(repo, token=token)
    except ValueError as e:
        typer.echo(f"Error: {e}", err=True)
        raise typer.Exit(1)
    except Exception as e:
        typer.echo(f"Error: {e}", err=True)
        raise typer.Exit(1)

    data = result.model_dump(mode="json")

    if field:
        if field not in data:
            typer.echo(f"Unknown field: {field}. Available: {', '.join(data.keys())}", err=True)
            raise typer.Exit(1)
        data = data[field]

    indent = 2 if pretty else None
    typer.echo(json.dumps(data, indent=indent, default=str))


@app.command()
def serve(
    host: str = typer.Option("0.0.0.0", help="Host to bind to"),
    port: int = typer.Option(8000, help="Port to bind to"),
) -> None:
    """Start the REST API server."""
    try:
        import uvicorn

        from repocrunch.api import app as fastapi_app
    except ImportError:
        typer.echo("Install API extras: pip install repocrunch[api]", err=True)
        raise typer.Exit(1)

    uvicorn.run(fastapi_app, host=host, port=port)


@app.command()
def mcp() -> None:
    """Start the MCP server (STDIO transport)."""
    try:
        from repocrunch.mcp_server import mcp as mcp_app
    except ImportError:
        typer.echo("Install MCP extras: pip install repocrunch[mcp]", err=True)
        raise typer.Exit(1)

    mcp_app.run()


@app.command()
def version() -> None:
    """Print version information."""
    typer.echo(f"repocrunch {__version__}")


if __name__ == "__main__":
    app()
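The commands above can be exercised without a shell via Typer's test runner, as a sketch; this mirrors "repocrunch analyze octocat/Hello-World --pretty --field summary" on the command line (assuming the console script declared in entry_points.txt is named repocrunch):

from typer.testing import CliRunner
from repocrunch.cli import app

runner = CliRunner()
result = runner.invoke(app, ["analyze", "octocat/Hello-World", "--pretty", "--field", "summary"])
print(result.exit_code)
print(result.output)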
repocrunch/client.py
ADDED
@@ -0,0 +1,154 @@
"""GitHub API client with auth, rate limiting, ETag caching, and retries."""

from __future__ import annotations

import base64
import logging
import os
from collections import OrderedDict
from typing import Any

import httpx

logger = logging.getLogger(__name__)

GITHUB_API = "https://api.github.com"
CACHE_MAX = 200


class RateLimitError(Exception):
    def __init__(self, reset_at: int | None = None):
        self.reset_at = reset_at
        super().__init__("GitHub API rate limit exhausted")


class GitHubClient:
    def __init__(
        self,
        token: str | None = None,
        client: httpx.AsyncClient | None = None,
    ):
        self.token = token or os.environ.get("GITHUB_TOKEN")
        self._external_client = client is not None
        self._client = client or self._make_client()
        self._etag_cache: OrderedDict[str, tuple[str, Any]] = OrderedDict()
        self.rate_remaining: int | None = None
        self.rate_limit: int | None = None
        self.warnings: list[str] = []

    def _make_client(self) -> httpx.AsyncClient:
        headers = {
            "Accept": "application/vnd.github+json",
            "X-GitHub-Api-Version": "2022-11-28",
        }
        if self.token:
            headers["Authorization"] = f"Bearer {self.token}"
        return httpx.AsyncClient(
            base_url=GITHUB_API,
            headers=headers,
            timeout=30.0,
        )

    def _update_rate_info(self, response: httpx.Response) -> None:
        remaining = response.headers.get("X-RateLimit-Remaining")
        if remaining is not None:
            self.rate_remaining = int(remaining)
        limit = response.headers.get("X-RateLimit-Limit")
        if limit is not None:
            self.rate_limit = int(limit)
        if self.rate_remaining is not None and self.rate_remaining < 5:
            self.warnings.append(
                f"GitHub API rate limit low: {self.rate_remaining}/{self.rate_limit} remaining"
            )

    def _cache_set(self, url: str, etag: str, data: Any) -> None:
        if len(self._etag_cache) >= CACHE_MAX:
            self._etag_cache.popitem(last=False)
        self._etag_cache[url] = (etag, data)

    async def get(self, path: str, params: dict | None = None) -> Any:
        """GET a GitHub API endpoint. Returns parsed JSON or None on 404."""
        if self.rate_remaining is not None and self.rate_remaining <= 0:
            raise RateLimitError()

        url = path
        headers: dict[str, str] = {}

        cache_key = f"{path}?{params}" if params else path
        if cache_key in self._etag_cache:
            etag, cached_data = self._etag_cache[cache_key]
            headers["If-None-Match"] = etag

        retries = 2
        for attempt in range(retries + 1):
            try:
                response = await self._client.get(url, params=params, headers=headers)
                break
            except httpx.TransportError:
                if attempt == retries:
                    raise
                continue

        self._update_rate_info(response)

        if response.status_code == 304:
            self._etag_cache.move_to_end(cache_key)
            return self._etag_cache[cache_key][1]

        if response.status_code in (401, 404):
            return None

        if response.status_code == 403:
            if self.rate_remaining is not None and self.rate_remaining <= 0:
                reset = response.headers.get("X-RateLimit-Reset")
                raise RateLimitError(int(reset) if reset else None)
            # Permission denied (e.g. branch protection without admin access)
            return None

        response.raise_for_status()

        data = response.json()
        etag = response.headers.get("ETag")
        if etag:
            self._cache_set(cache_key, etag, data)

        return data

    async def get_file_content(self, owner: str, repo: str, path: str) -> str | None:
        """Get decoded file content from a repo. Returns None if not found."""
        data = await self.get(f"/repos/{owner}/{repo}/contents/{path}")
        if data is None:
            return None
        if isinstance(data, dict) and data.get("encoding") == "base64":
            return base64.b64decode(data["content"]).decode("utf-8", errors="replace")
        return None

    async def get_contributor_count(self, owner: str, repo: str) -> int:
        """Get total contributor count using the Link header pagination trick."""
        response = await self._client.get(
            f"/repos/{owner}/{repo}/contributors",
            params={"per_page": 1, "anon": "true"},
        )
        self._update_rate_info(response)
        if response.status_code != 200:
            return 0

        link = response.headers.get("Link", "")
        if 'rel="last"' in link:
            for part in link.split(","):
                if 'rel="last"' in part:
                    url_part = part.split(";")[0].strip().strip("<>")
                    if "page=" in url_part:
                        page = url_part.split("page=")[-1].split("&")[0]
                        return int(page)
        return len(response.json()) if isinstance(response.json(), list) else 0

    async def close(self) -> None:
        if not self._external_client:
            await self._client.aclose()

    async def __aenter__(self) -> GitHubClient:
        return self

    async def __aexit__(self, *args: Any) -> None:
        await self.close()
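A sketch of driving the client directly as an async context manager (GITHUB_TOKEN is picked up from the environment if no token is passed; the repo and file path are illustrative):

import asyncio
from repocrunch.client import GitHubClient

async def main() -> None:
    async with GitHubClient() as gh:
        repo = await gh.get("/repos/octocat/Hello-World")        # parsed JSON, or None on 404
        readme = await gh.get_file_content("octocat", "Hello-World", "README")
        print(repo["full_name"] if repo else "not found", readme is not None)

asyncio.run(main())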
repocrunch/detection.py
ADDED
@@ -0,0 +1,117 @@
"""Framework and test framework detection maps."""

from __future__ import annotations

# dependency name → framework label
FRAMEWORK_MAP: dict[str, str] = {
    # Python
    "fastapi": "FastAPI",
    "django": "Django",
    "flask": "Flask",
    "starlette": "Starlette",
    "tornado": "Tornado",
    "sanic": "Sanic",
    "litestar": "Litestar",
    "aiohttp": "aiohttp",
    "bottle": "Bottle",
    "falcon": "Falcon",
    "quart": "Quart",
    "streamlit": "Streamlit",
    "gradio": "Gradio",
    # Node.js / TypeScript
    "next": "Next.js",
    "react": "React",
    "vue": "Vue.js",
    "angular": "Angular",
    "@angular/core": "Angular",
    "svelte": "Svelte",
    "express": "Express",
    "nestjs": "NestJS",
    "@nestjs/core": "NestJS",
    "nuxt": "Nuxt",
    "remix": "Remix",
    "@remix-run/react": "Remix",
    "gatsby": "Gatsby",
    "astro": "Astro",
    "hono": "Hono",
    "fastify": "Fastify",
    "koa": "Koa",
    "solid-js": "SolidJS",
    "preact": "Preact",
    # Rust
    "actix-web": "Actix Web",
    "axum": "Axum",
    "rocket": "Rocket",
    "warp": "Warp",
    "tide": "Tide",
    "leptos": "Leptos",
    "yew": "Yew",
    "tauri": "Tauri",
    # Go (module paths)
    "github.com/gin-gonic/gin": "Gin",
    "github.com/gofiber/fiber": "Fiber",
    "github.com/labstack/echo": "Echo",
    "github.com/gorilla/mux": "Gorilla Mux",
    "github.com/go-chi/chi": "Chi",
    "github.com/beego/beego": "Beego",
    # Java / Kotlin
    "org.springframework.boot:spring-boot-starter-web": "Spring Boot",
    "io.quarkus:quarkus-core": "Quarkus",
    "io.micronaut:micronaut-core": "Micronaut",
    "io.vertx:vertx-core": "Vert.x",
    "com.typesafe.play:play_2.13": "Play Framework",
    "com.typesafe.play:play_3": "Play Framework",
    "io.ktor:ktor-server-core": "Ktor",
    # Ruby
    "rails": "Rails",
    "sinatra": "Sinatra",
    "hanami": "Hanami",
    # C / C++
    "Boost": "Boost",
    "Qt5": "Qt",
    "Qt6": "Qt",
    "OpenCV": "OpenCV",
    "SFML": "SFML",
}

# dependency name → test framework label
TEST_FRAMEWORK_MAP: dict[str, str] = {
    # Python
    "pytest": "pytest",
    "unittest": "unittest",
    "nose": "nose",
    "nose2": "nose2",
    # Node.js
    "jest": "Jest",
    "mocha": "Mocha",
    "vitest": "Vitest",
    "@playwright/test": "Playwright",
    "cypress": "Cypress",
    "ava": "AVA",
    "tap": "tap",
    # Rust (built-in, detected from tree)
    # Go (built-in testing package)
    # Java / Kotlin
    "junit": "JUnit",
    "org.junit.jupiter:junit-jupiter": "JUnit 5",
    "org.junit.jupiter:junit-jupiter-api": "JUnit 5",
    "junit:junit": "JUnit 4",
    "org.mockito:mockito-core": "Mockito",
    "org.testng:testng": "TestNG",
    # Ruby
    "rspec": "RSpec",
    "rspec-rails": "RSpec",
    "minitest": "Minitest",
}

# Files in tree that indicate test framework
TEST_FILE_PATTERNS: dict[str, str] = {
    "jest.config": "Jest",
    "vitest.config": "Vitest",
    "cypress.config": "Cypress",
    "playwright.config": "Playwright",
    ".mocharc": "Mocha",
    "pytest.ini": "pytest",
    "setup.cfg": "pytest",  # often contains [tool:pytest]
    "conftest.py": "pytest",
}
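A tiny sketch of how these maps can be consulted (the dependency list is made up; the package's actual lookup logic lives in the tech_stack extractor, which is not shown in this section):

from repocrunch.detection import FRAMEWORK_MAP, TEST_FRAMEWORK_MAP

deps = ["fastapi", "uvicorn", "pytest"]
frameworks = [FRAMEWORK_MAP[d] for d in deps if d in FRAMEWORK_MAP]                            # ["FastAPI"]
test_framework = next((TEST_FRAMEWORK_MAP[d] for d in deps if d in TEST_FRAMEWORK_MAP), None)  # "pytest"
print(frameworks, test_framework)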
repocrunch/extractors/__init__.py
ADDED
@@ -0,0 +1,15 @@
"""Extractors that transform raw GitHub API data into structured models."""

from repocrunch.extractors.architecture import extract_architecture
from repocrunch.extractors.health import extract_health
from repocrunch.extractors.metadata import extract_metadata
from repocrunch.extractors.security import extract_security
from repocrunch.extractors.tech_stack import extract_tech_stack

__all__ = [
    "extract_metadata",
    "extract_tech_stack",
    "extract_architecture",
    "extract_health",
    "extract_security",
]
repocrunch/extractors/architecture.py
ADDED
@@ -0,0 +1,113 @@
"""Extract architecture signals from the file tree."""

from __future__ import annotations

from typing import Any

from repocrunch.detection import TEST_FILE_PATTERNS, TEST_FRAMEWORK_MAP
from repocrunch.models import Architecture


def _get_tree_paths(tree_data: dict[str, Any]) -> set[str]:
    return {item["path"] for item in tree_data.get("tree", []) if item.get("type") == "blob"}


def _detect_monorepo(paths: set[str], tree_data: dict[str, Any]) -> bool:
    dirs = {item["path"] for item in tree_data.get("tree", []) if item.get("type") == "tree"}

    # Workspace indicators
    if "lerna.json" in paths or "pnpm-workspace.yaml" in paths:
        return True

    # Multiple package.json at different levels
    pkg_jsons = [p for p in paths if p.endswith("package.json") and "/" in p]
    if len(pkg_jsons) >= 2:
        return True

    # packages/ or apps/ directories
    if any(d.startswith("packages/") for d in dirs) or any(d.startswith("apps/") for d in dirs):
        return True

    return False


def _detect_docker(paths: set[str]) -> bool:
    return any(
        p == "Dockerfile" or p == "docker-compose.yml" or p == "docker-compose.yaml"
        or p.endswith("/Dockerfile") or p == "compose.yml" or p == "compose.yaml"
        for p in paths
    )


def _detect_ci_cd(paths: set[str]) -> list[str]:
    ci: list[str] = []
    if any(p.startswith(".github/workflows/") for p in paths):
        ci.append("GitHub Actions")
    if ".gitlab-ci.yml" in paths:
        ci.append("GitLab CI")
    if "Jenkinsfile" in paths:
        ci.append("Jenkins")
    if ".circleci/config.yml" in paths or ".circleci/config.yaml" in paths:
        ci.append("CircleCI")
    if ".travis.yml" in paths:
        ci.append("Travis CI")
    if any(p.startswith("azure-pipelines") for p in paths):
        ci.append("Azure Pipelines")
    if "bitbucket-pipelines.yml" in paths:
        ci.append("Bitbucket Pipelines")
    return ci


def _detect_test_framework(paths: set[str], deps: list[str] | None = None) -> tuple[str | None, bool]:
    """Detect test framework and whether tests exist. Returns (framework, has_tests)."""
    framework = None

    # Check deps first
    if deps:
        for dep in deps:
            dep_lower = dep.lower()
            if dep_lower in TEST_FRAMEWORK_MAP:
                framework = TEST_FRAMEWORK_MAP[dep_lower]
                break

    # Check config files in tree
    if not framework:
        for filename, fw in TEST_FILE_PATTERNS.items():
            if any(p.endswith(filename) or p == filename for p in paths):
                framework = fw
                break

    # Check for test directories
    has_tests = any(
        p.startswith("tests/") or p.startswith("test/") or p.startswith("__tests__/")
        or "/tests/" in p or "/test/" in p or "/__tests__/" in p
        or p.endswith("_test.py") or p.endswith("_test.go") or p.endswith("_test.rs")
        or p.endswith(".test.js") or p.endswith(".test.ts") or p.endswith(".test.tsx")
        or p.endswith(".spec.js") or p.endswith(".spec.ts") or p.endswith(".spec.tsx")
        for p in paths
    )

    # Rust/Go have built-in test frameworks
    if has_tests and not framework:
        if any(p.endswith("_test.go") for p in paths):
            framework = "go test"
        elif any(p.endswith("_test.rs") or p.endswith("/tests/") for p in paths):
            framework = "cargo test"

    return framework, has_tests


def extract_architecture(
    tree_data: dict[str, Any],
    deps: list[str] | None = None,
) -> Architecture:
    paths = _get_tree_paths(tree_data)
    test_framework, has_tests = _detect_test_framework(paths, deps)

    return Architecture(
        monorepo=_detect_monorepo(paths, tree_data),
        docker=_detect_docker(paths),
        ci_cd=_detect_ci_cd(paths),
        test_framework=test_framework,
        has_tests=has_tests,
    )
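A sketch that feeds extract_architecture a hand-built payload in the shape returned by the git/trees endpoint (the paths and deps are made up; Architecture comes from repocrunch.models, which is not shown in this section):

from repocrunch.extractors.architecture import extract_architecture

tree_data = {
    "tree": [
        {"path": "Dockerfile", "type": "blob"},
        {"path": ".github/workflows/ci.yml", "type": "blob"},
        {"path": "tests/test_basic.py", "type": "blob"},
        {"path": "packages", "type": "tree"},
        {"path": "packages/core", "type": "tree"},
    ]
}
arch = extract_architecture(tree_data, deps=["pytest"])
print(arch.docker, arch.ci_cd, arch.test_framework, arch.has_tests, arch.monorepo)
# Expected, given the detection rules above: True ['GitHub Actions'] pytest True True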