PyPI - fc-data - Versions diffs - 0.2.0__py3-none-any.whl - Mend

fc-data 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (87) hide show

datasmith/__init__.py +330 -0
datasmith/__init__.pyi +194 -0
datasmith/agents/__init__.py +31 -0
datasmith/agents/classifiers.py +272 -0
datasmith/agents/codex.py +25 -0
datasmith/agents/config.py +108 -0
datasmith/agents/extractors.py +197 -0
datasmith/agents/installed/README.md +52 -0
datasmith/agents/installed/__init__.py +22 -0
datasmith/agents/installed/base.py +240 -0
datasmith/agents/installed/claude.py +134 -0
datasmith/agents/installed/codex.py +91 -0
datasmith/agents/installed/gemini.py +118 -0
datasmith/agents/installed/none.py +27 -0
datasmith/agents/sandbox.py +547 -0
datasmith/agents/synthesizer.py +439 -0
datasmith/agents/templates/AGENTS.md.j2 +150 -0
datasmith/agents/templates/sandbox_verify.py +428 -0
datasmith/docker/__init__.py +31 -0
datasmith/docker/context.py +112 -0
datasmith/docker/images.py +158 -0
datasmith/docker/publish.py +56 -0
datasmith/docker/templates/Dockerfile.base +26 -0
datasmith/docker/templates/Dockerfile.pr +42 -0
datasmith/docker/templates/Dockerfile.repo +11 -0
datasmith/docker/templates/docker_build_base.sh +780 -0
datasmith/docker/templates/docker_build_env.sh +309 -0
datasmith/docker/templates/docker_build_final.sh +106 -0
datasmith/docker/templates/docker_build_pkg.sh +99 -0
datasmith/docker/templates/docker_build_run.sh +124 -0
datasmith/docker/templates/entrypoint.sh +62 -0
datasmith/docker/templates/parser.py +1405 -0
datasmith/docker/templates/profile.sh +199 -0
datasmith/docker/templates/pytest_runner.py +692 -0
datasmith/docker/templates/run-tests.sh +197 -0
datasmith/docker/verifiers.py +131 -0
datasmith/filters.py +154 -0
datasmith/github/__init__.py +22 -0
datasmith/github/client.py +333 -0
datasmith/github/hooks.py +50 -0
datasmith/github/links.py +110 -0
datasmith/github/models.py +206 -0
datasmith/github/render.py +173 -0
datasmith/github/search.py +66 -0
datasmith/github/templates/comment.md.j2 +5 -0
datasmith/github/templates/final.md.j2 +66 -0
datasmith/github/templates/issues.md.j2 +21 -0
datasmith/github/templates/repo.md.j2 +1 -0
datasmith/preflight.py +162 -0
datasmith/publish/__init__.py +13 -0
datasmith/publish/huggingface.py +104 -0
datasmith/publish/pipeline.py +60 -0
datasmith/publish/records.py +91 -0
datasmith/py.typed +1 -0
datasmith/resolution/__init__.py +14 -0
datasmith/resolution/blocklist.py +145 -0
datasmith/resolution/cache.py +120 -0
datasmith/resolution/constants.py +277 -0
datasmith/resolution/dependency_resolver.py +174 -0
datasmith/resolution/git_utils.py +378 -0
datasmith/resolution/import_analyzer.py +66 -0
datasmith/resolution/metadata_parser.py +412 -0
datasmith/resolution/models.py +41 -0
datasmith/resolution/orchestrator.py +522 -0
datasmith/resolution/package_filters.py +312 -0
datasmith/resolution/python_manager.py +110 -0
datasmith/runners/__init__.py +15 -0
datasmith/runners/base.py +112 -0
datasmith/runners/classify_prs.py +48 -0
datasmith/runners/render_problems.py +113 -0
datasmith/runners/resolve_packages.py +66 -0
datasmith/runners/scrape_commits.py +166 -0
datasmith/runners/scrape_repos.py +44 -0
datasmith/runners/synthesize_images.py +310 -0
datasmith/update/__init__.py +5 -0
datasmith/update/cli.py +169 -0
datasmith/update/offline.py +173 -0
datasmith/update/pipeline.py +497 -0
datasmith/utils/__init__.py +18 -0
datasmith/utils/core.py +67 -0
datasmith/utils/db.py +156 -0
datasmith/utils/tokens.py +65 -0
fc_data-0.2.0.dist-info/METADATA +441 -0
fc_data-0.2.0.dist-info/RECORD +87 -0
fc_data-0.2.0.dist-info/WHEEL +4 -0
fc_data-0.2.0.dist-info/entry_points.txt +2 -0
fc_data-0.2.0.dist-info/licenses/LICENSE +28 -0

datasmith/resolution/package_filters.py ADDED Viewed

@@ -0,0 +1,312 @@
+"""Filtering and normalizing package requirements."""
+from __future__ import annotations
+import re
+import shlex
+from collections.abc import Iterable, Mapping
+from pathlib import Path
+from git import Commit
+from .constants import (
+    ALLOWLIST_COMMON_PYPI,
+    CONDA_SYSTEM_PACKAGES,
+    EXTRA_MARKER_RE,
+    GENERIC_LOCAL_NAMES,
+    NOT_REQUIREMENTS,
+    STDLIB,
+)
+from .git_utils import read_blob_text
+def parse_extras_segment(token: str) -> list[str]:
+    """Extract extras from a token like 'package[extra1,extra2]'."""
+    if "[" not in token or not token.endswith("]"):
+        return []
+    segment = token[token.rfind("[") + 1 : -1]
+    if not segment:
+        return []
+    return [part.strip() for part in segment.split(",") if part.strip()]
+def extras_from_install_commands(install_cmds: Iterable[str], extras_available: set[str]) -> set[str]:
+    """Extract extras requested in install commands."""
+    requested: set[str] = set()
+    for cmd in install_cmds:
+        if not cmd:
+            continue
+        for token in shlex.split(cmd):
+            for extra in parse_extras_segment(token):
+                if extra in extras_available:
+                    requested.add(extra)
+    return requested
+def extras_from_matrix(matrix: Mapping[str, set[str]] | None, extras_available: set[str]) -> set[str]:
+    """Extract extras from ASV matrix configuration."""
+    if not matrix:
+        return set()
+    requested: set[str] = set()
+    for values in matrix.values():
+        for value in values:
+            if value in extras_available:
+                requested.add(value)
+    return requested
+def extract_requested_extras(
+    install_cmds: Iterable[str],
+    matrix: Mapping[str, set[str]] | None,
+    available: Iterable[str],
+) -> set[str]:
+    """Extract all requested extras from install commands and matrix."""
+    extras_available = set(available)
+    requested = extras_from_install_commands(install_cmds, extras_available)
+    requested.update(extras_from_matrix(matrix, extras_available))
+    return requested
+def resolve_requirements_file(commit: Commit, rel_path: str, seen: set[str]) -> set[str]:
+    """Recursively resolve a requirements file from a commit."""
+    if rel_path in seen:
+        return set()
+    seen.add(rel_path)
+    requirements: set[str] = set()
+    content = read_blob_text(commit, rel_path)
+    if not content:
+        return requirements
+    for line in content.splitlines():
+        line = line.strip()
+        if not line or line.startswith("#"):
+            continue
+        tokens = line.split()
+        if len(tokens) >= 2 and tokens[0] in {"-r", "--requirement"}:
+            nested_path = tokens[1]
+            if "/" in rel_path:
+                base_dir = "/".join(rel_path.split("/")[:-1])
+                nested_path = f"{base_dir}/{nested_path}"
+            requirements.update(resolve_requirements_file(commit, nested_path, seen))
+            continue
+        requirements.add(line)
+    return requirements
+def split_shell_command(cmd: str) -> list[str]:
+    """Split a shell command on operators like &&, ||, ; into separate commands."""
+    parts = re.split(r"\s*(?:&&|\|\||;)\s*", cmd)
+    return [p.strip() for p in parts if p.strip()]
+def is_valid_direct_url(req: str) -> bool:
+    """Check if a requirement string is a valid direct URL for uv."""
+    if not req or not req.strip():
+        return False
+    req = req.strip()
+    if not (
+        req.startswith("http://")
+        or req.startswith("https://")
+        or req.startswith("git+")
+        or req.startswith("hg+")
+        or req.startswith("svn+")
+        or req.startswith("bzr+")
+        or req.startswith("file://")
+    ):
+        return False
+    ok_exts = (
+        ".whl",
+        ".tar.gz",
+        ".zip",
+        ".tar.bz2",
+        ".tar.lz",
+        ".tar.lzma",
+        ".tar.xz",
+        ".tar.zst",
+        ".tar",
+        ".tbz",
+        ".tgz",
+        ".tlz",
+        ".txz",
+    )
+    return any(req.lower().endswith(ext) for ext in ok_exts)
+def is_valid_pypi_requirement(req: str) -> bool:
+    """Validate if a string looks like a valid PyPI requirement per PEP 508."""
+    if not req or not req.strip():
+        return False
+    req = req.strip()
+    if "{" in req or "}" in req or "$" in req:
+        return False
+    if any(op in req for op in ["&&", "||", ";;", "|", "&"]):
+        return False
+    if req.startswith("--"):
+        return False
+    if any(req.startswith(prefix) for prefix in ["http://", "https://", "git+", "hg+", "svn+", "bzr+", "file://"]):
+        return True
+    if req.startswith("."):
+        return False
+    pkg_match = re.match(r"^([A-Za-z0-9]([A-Za-z0-9._-]*[A-Za-z0-9])?)", req)
+    if not pkg_match:
+        return False
+    pkg_name = pkg_match.group(1)
+    return not (pkg_name.startswith("_") or len(pkg_name) == 1)
+def fix_marker_spacing(req: str) -> str:
+    """Fix missing spaces around 'and' and 'or' operators in PEP 508 markers."""
+    if "#" in req:
+        match = re.search(r"(?<!\s)#", req)
+        if match:
+            req = req[: match.start()]
+    if ";" not in req:
+        return req
+    parts = req.split(";", 1)
+    if len(parts) != 2:
+        return req
+    pkg_spec, marker = parts
+    marker = re.sub(r"(?<=[^\s])and(?=[^\s])", " and ", marker)
+    marker = re.sub(r"(?<=[^\s])or(?=[^\s])", " or ", marker)
+    return f"{pkg_spec};{marker}"
+def normalize_requirement(req: str) -> list[str]:
+    """Normalize a token into one or more requirement strings."""
+    if not req or not req.strip():
+        return []
+    req = req.strip()
+    req = fix_marker_spacing(req)
+    if "{" in req or "}" in req or "$" in req:
+        return []
+    if any(op in req for op in ["&&", "||", ";;", "|", "&"]) or req.startswith("--"):
+        return []
+    if req in {"-r", "--requirement", "-c", "--constraint", "-e", "--editable"}:
+        return []
+    if req.startswith(("http://", "https://", "git+", "hg+", "svn+", "bzr+", "file://")):
+        return [req] if is_valid_direct_url(req) else []
+    if req.startswith("."):
+        return []
+    return [req]
+def project_local_names(project_dir: Path) -> set[str]:
+    """Collect names that look like local modules/packages."""
+    names: set[str] = set()
+    skip_dirs = {"__pycache__", ".git", ".eggs", ".tox", "build", "dist", "node_modules"}
+    for py in project_dir.glob("*.py"):
+        if not py.name.startswith("_"):
+            names.add(py.stem)
+    for item in project_dir.rglob("*"):
+        if any(skip in item.parts for skip in skip_dirs):
+            continue
+        if item.is_dir():
+            if item.name.startswith(".") or item.name.startswith("_"):
+                continue
+            if (item / "__init__.py").exists():
+                names.add(item.name)
+        elif item.suffix == ".py":
+            if not item.name.startswith("_"):
+                names.add(item.stem)
+    return names
+def clean_pinned(reqs: list[str]) -> list[str]:
+    """Removes lower-bound version specifiers from requirements that have both >= and <=."""
+    new_reqs = []
+    for r in reqs:
+        r = re.sub(r"\s+", "", r)
+        if ">=" in r and "<=" in r:
+            pkg_name = extract_pkg_name(r)
+            parts = re.split(r",\s*", r)
+            le_parts = [p for p in parts if "<=" in p]
+            if le_parts:
+                le_parts = [p if pkg_name in p else f"{pkg_name}{p}" for p in le_parts]
+                new_reqs.extend(le_parts)
+            else:
+                new_reqs.append(r)
+        else:
+            new_reqs.append(r)
+    return new_reqs
+def extract_pkg_name(req: str) -> str:
+    """Extract package name from a requirement string."""
+    name = re.split(r"[<>=!;\s]", req, maxsplit=1)[0]
+    if "[" in name:
+        name = name.split("[", 1)[0]
+    return name.strip()
+def filter_requirements_for_pypi(  # noqa: C901
+    requirements: Iterable[str], *, project_dir: Path, own_import_name: str | None
+) -> list[str]:
+    """Remove things that are clearly not PyPI-installable.
+    Args:
+        requirements: Raw requirement strings; blank entries are ignored.
+        project_dir: Project checkout, scanned with ``project_local_names`` to find
+            names that belong to the project itself.
+        own_import_name: Import name of the project, if known; it and its
+            ``-``/``_`` spelling variants are excluded.
+    Returns:
+        The surviving requirements in first-seen order, with ``EXTRA_MARKER_RE``
+        matches and any trailing ``;`` removed, and exact duplicates dropped.
+    """
+    # Imported here rather than at module level.
+    # NOTE(review): presumably to avoid an import cycle with .blocklist - confirm.
+    from .blocklist import get_blocklist, normalize_package_name
+    local_names = project_local_names(project_dir)
+    own_names = set()
+    if own_import_name:
+        own_names |= {own_import_name, own_import_name.replace("-", "_"), own_import_name.replace("_", "-")}
+    dynamic_blocklist = get_blocklist()
+    out: list[str] = []
+    for raw in requirements:
+        if not raw or not raw.strip():
+            continue
+        raw = raw.strip()
+        raw = fix_marker_spacing(raw)
+        # URL/VCS requirements skip every name-based filter below: they are kept
+        # only when is_valid_direct_url accepts them.
+        if raw.startswith(("http://", "https://", "git+", "hg+", "svn+", "bzr+", "file://")):
+            if is_valid_direct_url(raw):
+                out.append(raw)
+            continue
+        name = extract_pkg_name(raw)
+        if not name:
+            continue
+        low = name.lower()
+        # Drop "python" itself and "python" followed by a digit or "." (e.g. "python3.8").
+        if low.startswith("python"):
+            suffix = low[6:]
+            if not suffix or suffix[0].isdigit() or suffix.startswith("."):
+                continue
+        if name.startswith("_") or len(name) == 1:
+            continue
+        # STDLIB is matched on the lower-cased name, NOT_REQUIREMENTS on the name as written.
+        if low in STDLIB or name in NOT_REQUIREMENTS:
+            continue
+        normalized_name = normalize_package_name(name)
+        if normalized_name in dynamic_blocklist:
+            continue
+        if low in CONDA_SYSTEM_PACKAGES:
+            continue
+        # ALLOWLIST_COMMON_PYPI overrides the generic-name and local-name exclusions.
+        if low in GENERIC_LOCAL_NAMES and name not in ALLOWLIST_COMMON_PYPI:
+            continue
+        if name in own_names:
+            continue
+        if name in local_names and name not in ALLOWLIST_COMMON_PYPI:
+            continue
+        out.append(raw)
+    stripped: list[str] = []
+    for r in out:
+        # NOTE(review): EXTRA_MARKER_RE presumably matches `extra == ...` marker
+        # clauses - confirm against constants. A ";" left dangling at the end is removed.
+        r2 = EXTRA_MARKER_RE.sub("", r).strip()
+        r2 = re.sub(r"\s*;\s*$", "", r2)
+        stripped.append(r2)
+    # Order-preserving de-duplication on the exact requirement string.
+    deduped: list[str] = []
+    seen: set[str] = set()
+    for r in stripped:
+        if r not in seen:
+            seen.add(r)
+            deduped.append(r)
+    return deduped

datasmith/resolution/python_manager.py ADDED Viewed

@@ -0,0 +1,110 @@
+"""Python version management and uv interaction."""
+from __future__ import annotations
+import datetime as dt
+import os
+import subprocess
+from pathlib import Path
+from datasmith.utils import get_logger
+logger = get_logger("resolution.python_manager")
+def run_uv(
+    args: list[str],
+    *,
+    input_text: str | None = None,
+    cwd: Path | None = None,
+    extra_env: dict[str, str] | None = None,
+    check: bool = False,
+) -> subprocess.CompletedProcess:
+    """Run a uv command with specified arguments."""
+    env = os.environ.copy()
+    env.setdefault("UV_COLOR", "never")
+    env.setdefault("NO_COLOR", "1")
+    if extra_env:
+        env.update(extra_env)
+    cp = subprocess.run(
+        ["uv", *args],
+        input=input_text.encode("utf-8") if input_text is not None else None,
+        capture_output=True,
+        cwd=str(cwd) if cwd else None,
+        env=env,
+    )
+    if check and cp.returncode != 0:
+        raise RuntimeError(
+            f"uv {' '.join(args)} failed with code {cp.returncode}\n"
+            f"STDOUT:\n{cp.stdout.decode()}\nSTDERR:\n{cp.stderr.decode()}"
+        )
+    return cp
+def ensure_python_version_available(version: str) -> bool:
+    """Ensure uv has the requested Python version available, downloading if needed."""
+    list_cp = run_uv(["python", "list"])
+    if list_cp.returncode == 0:
+        output = list_cp.stdout.decode()
+        if version in output or f"cpython-{version}" in output or version.replace(".", "") in output:
+            return True
+    install_cp = run_uv(["python", "install", version])
+    if install_cp.returncode == 0:
+        logger.debug("Successfully installed Python %s", version)
+        return True
+    logger.debug("Failed to install Python %s: %s", version, install_cp.stderr.decode())
+    return False
+def filter_python_versions_by_commit_date(  # noqa: C901
+    available_versions: set[tuple[int, ...]], commit_date: dt.datetime
+) -> list[tuple[int, ...]]:
+    """Filter Python versions to avoid anachronistic choices.
+    Note: Python 3.7 is excluded since it's EOL and not available in uv.
+    """
+    valid_versions = [v for v in available_versions if v >= (3, 8)]
+    if not valid_versions:
+        return []
+    py_releases = {
+        (3, 7): dt.datetime(2018, 6, 27, tzinfo=dt.timezone.utc),
+        (3, 8): dt.datetime(2019, 10, 14, tzinfo=dt.timezone.utc),
+        (3, 9): dt.datetime(2020, 10, 5, tzinfo=dt.timezone.utc),
+        (3, 10): dt.datetime(2021, 10, 4, tzinfo=dt.timezone.utc),
+        (3, 11): dt.datetime(2022, 10, 24, tzinfo=dt.timezone.utc),
+        (3, 12): dt.datetime(2023, 10, 2, tzinfo=dt.timezone.utc),
+        (3, 13): dt.datetime(2024, 10, 7, tzinfo=dt.timezone.utc),
+    }
+    grace_period = dt.timedelta(days=90)
+    filtered = []
+    for v in valid_versions:
+        version_key = (v[0], v[1])
+        release_date = py_releases.get(version_key)
+        if release_date is None:
+            if commit_date < dt.datetime(2024, 1, 1, tzinfo=dt.timezone.utc):
+                continue
+            filtered.append(v)
+        elif commit_date >= release_date - grace_period:
+            filtered.append(v)
+    if not filtered:
+        inferred = []
+        for version_key, release_date in sorted(py_releases.items(), reverse=True):
+            if version_key < (3, 8):
+                continue
+            if release_date <= commit_date + grace_period:
+                matching = [v for v in valid_versions if (v[0], v[1]) == version_key]
+                if matching:
+                    inferred.extend(matching)
+                elif len(inferred) < 3:
+                    inferred.append(version_key)
+                if len(inferred) >= 3:
+                    break
+        filtered = inferred if inferred else [(3, 8)]
+    return sorted(filtered, reverse=True)

datasmith/runners/__init__.py ADDED Viewed

@@ -0,0 +1,15 @@
+"""ds.runners — Async runner infrastructure with Supabase progress tracking."""
+from datasmith.runners.base import BaseRunner
+from datasmith.runners.classify_prs import ClassifyPRsRunner
+from datasmith.runners.scrape_commits import ScrapeCommitsRunner
+from datasmith.runners.scrape_repos import ScrapeReposRunner
+from datasmith.runners.synthesize_images import SynthesizeImagesRunner
+__all__ = [
+    "BaseRunner",
+    "ClassifyPRsRunner",
+    "ScrapeCommitsRunner",
+    "ScrapeReposRunner",
+    "SynthesizeImagesRunner",
+]

datasmith/runners/base.py ADDED Viewed

@@ -0,0 +1,112 @@
+from __future__ import annotations
+import asyncio
+import time
+import traceback
+import uuid
+from abc import ABC, abstractmethod
+from typing import Any, TypeVar
+from datasmith.utils import get_client, get_logger
+logger = get_logger("runners.base")
+T = TypeVar("T")
+class BaseRunner(ABC):
+    """Abstract async runner with Supabase progress tracking.
+    Subclasses implement ``_process_item``; ``run`` drives it over a list of
+    items with bounded concurrency, counts successes and failures, and writes
+    progress rows to the ``runner_progress`` table and failure rows to the
+    ``runner_failures`` table through the client returned by ``get_client()``.
+    """
+    def __init__(self, name: str, n_concurrent: int = 10) -> None:
+        """Store the runner name and concurrency limit and reset the counters.
+        ``runner_id`` is *name* plus the first eight hex characters of a random UUID.
+        """
+        self.name = name
+        self.runner_id = f"{name}-{uuid.uuid4().hex[:8]}"
+        self._n_concurrent = n_concurrent
+        self._completed = 0
+        self._failed = 0
+        self._total = 0
+        # Timestamp (time.time()) of the most recent progress write.
+        self._last_progress_update = 0.0
+    # Per-item work, supplied by subclasses. An exception raised here is counted
+    # as a failure by run() and does not stop the other items.
+    @abstractmethod
+    async def _process_item(self, item: Any) -> None: ...
+    async def run(self, items: list[Any]) -> None:
+        """Run the runner on a list of items with bounded concurrency.
+        One task is created per item; a semaphore limits how many are inside
+        ``_process_item`` at once. On ``KeyboardInterrupt`` or cancellation all
+        tasks are cancelled and awaited before the exception is re-raised.
+        Progress is written once more when the run ends, whatever the outcome.
+        """
+        self._total = len(items)
+        self._completed = 0
+        self._failed = 0
+        self._init_progress()
+        sem = asyncio.Semaphore(self._n_concurrent)
+        async def _wrapped(item: Any) -> None:
+            async with sem:
+                try:
+                    await self._process_item(item)
+                    self._completed += 1
+                except Exception as exc:
+                    self._failed += 1
+                    # Called inside the except block so traceback.format_exc() sees this exception.
+                    self._log_failure(item, exc)
+                    logger.exception("Failed processing item %s", self._item_id(item))
+                finally:
+                    self._maybe_update_progress()
+        tasks = [asyncio.create_task(_wrapped(item)) for item in items]
+        try:
+            await asyncio.gather(*tasks)
+        except (KeyboardInterrupt, asyncio.CancelledError):
+            for t in tasks:
+                t.cancel()
+            # Wait for the cancellations to settle; their results are discarded.
+            await asyncio.gather(*tasks, return_exceptions=True)
+            raise
+        finally:
+            self._update_progress(force=True)
+    def _item_id(self, item: Any) -> str:
+        """Return ``str(item.cache_key)`` when the item has that attribute, else ``str(item)``."""
+        if hasattr(item, "cache_key"):
+            return str(item.cache_key)
+        return str(item)
+    def _init_progress(self) -> None:
+        """Upsert the initial progress row (zero completed, zero failed).
+        Any exception is swallowed and reported as a warning.
+        """
+        try:
+            client = get_client()
+            client.table("runner_progress").upsert({
+                "runner_id": self.runner_id,
+                "runner_name": self.name,
+                "total": self._total,
+                "completed": 0,
+                "failed": 0,
+            }).execute()
+        except Exception:
+            logger.warning("Failed to initialize progress tracking")
+    def _maybe_update_progress(self) -> None:
+        """Write progress when the processed count is a multiple of 10 or more
+        than 30 seconds have passed since the last write."""
+        now = time.time()
+        if (self._completed + self._failed) % 10 == 0 or now - self._last_progress_update > 30:
+            self._update_progress()
+    def _update_progress(self, force: bool = False) -> None:
+        """Upsert the current counters and record the time of this write.
+        *force* is accepted but not read by this method. Any exception is
+        swallowed and reported as a warning.
+        """
+        self._last_progress_update = time.time()
+        try:
+            client = get_client()
+            client.table("runner_progress").upsert({
+                "runner_id": self.runner_id,
+                "runner_name": self.name,
+                "total": self._total,
+                "completed": self._completed,
+                "failed": self._failed,
+            }).execute()
+        except Exception:
+            logger.warning("Failed to update progress")
+    def _log_failure(self, item: Any, exc: Exception) -> None:
+        """Insert a failure row holding the item id, ``str(exc)`` and the current traceback.
+        Any exception raised while writing the row is swallowed and reported as a warning.
+        """
+        try:
+            client = get_client()
+            client.table("runner_failures").insert({
+                "runner_id": self.runner_id,
+                "item_id": self._item_id(item),
+                "error_message": str(exc),
+                "traceback": traceback.format_exc(),
+            }).execute()
+        except Exception:
+            logger.warning("Failed to log failure for %s", self._item_id(item))

datasmith/runners/classify_prs.py ADDED Viewed

@@ -0,0 +1,48 @@
+from __future__ import annotations
+import asyncio
+import functools
+from typing import Any
+from datasmith.runners.base import BaseRunner
+from datasmith.utils import get_client, get_logger
+logger = get_logger("runners.classify_prs")
+class ClassifyPRsRunner(BaseRunner):
+    """Batch classification of PRs via LLM agents."""
+    def __init__(self, classifier: Any, judge: Any, n_concurrent: int = 5) -> None:
+        super().__init__(name="classify_prs", n_concurrent=n_concurrent)
+        self._classifier = classifier
+        self._judge = judge
+    async def _process_item(self, item: Any) -> None:
+        """Process a PR dict with owner, repo, issue_number, description, patch."""
+        owner = item["owner"]
+        repo = item["repo"]
+        issue_number = item["issue_number"]
+        description = item.get("description", "")
+        patch = item.get("patch", "")
+        file_change_summary = item.get("file_change_summary", "")
+        loop = asyncio.get_running_loop()
+        is_perf, _reason = await loop.run_in_executor(
+            None, functools.partial(self._classifier.classify, description, patch, file_change_summary)
+        )
+        update: dict[str, Any] = {"is_performance_commit": is_perf}
+        if is_perf:
+            decision = await loop.run_in_executor(None, functools.partial(self._judge.classify, description, patch))
+            update["classification"] = decision.category
+            update["difficulty"] = decision.difficulty
+        client = get_client()
+        client.table("pull_requests").update(update).eq("owner", owner).eq("repo", repo).eq(
+            "issue_number", issue_number
+        ).execute()
+        logger.info("Classified %s/%s#%d: perf=%s", owner, repo, issue_number, is_perf)