PyPI - fc-data - Versions diffs - 0.2.0__py3-none-any.whl - Mend

fc-data 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (87) hide show

datasmith/__init__.py +330 -0
datasmith/__init__.pyi +194 -0
datasmith/agents/__init__.py +31 -0
datasmith/agents/classifiers.py +272 -0
datasmith/agents/codex.py +25 -0
datasmith/agents/config.py +108 -0
datasmith/agents/extractors.py +197 -0
datasmith/agents/installed/README.md +52 -0
datasmith/agents/installed/__init__.py +22 -0
datasmith/agents/installed/base.py +240 -0
datasmith/agents/installed/claude.py +134 -0
datasmith/agents/installed/codex.py +91 -0
datasmith/agents/installed/gemini.py +118 -0
datasmith/agents/installed/none.py +27 -0
datasmith/agents/sandbox.py +547 -0
datasmith/agents/synthesizer.py +439 -0
datasmith/agents/templates/AGENTS.md.j2 +150 -0
datasmith/agents/templates/sandbox_verify.py +428 -0
datasmith/docker/__init__.py +31 -0
datasmith/docker/context.py +112 -0
datasmith/docker/images.py +158 -0
datasmith/docker/publish.py +56 -0
datasmith/docker/templates/Dockerfile.base +26 -0
datasmith/docker/templates/Dockerfile.pr +42 -0
datasmith/docker/templates/Dockerfile.repo +11 -0
datasmith/docker/templates/docker_build_base.sh +780 -0
datasmith/docker/templates/docker_build_env.sh +309 -0
datasmith/docker/templates/docker_build_final.sh +106 -0
datasmith/docker/templates/docker_build_pkg.sh +99 -0
datasmith/docker/templates/docker_build_run.sh +124 -0
datasmith/docker/templates/entrypoint.sh +62 -0
datasmith/docker/templates/parser.py +1405 -0
datasmith/docker/templates/profile.sh +199 -0
datasmith/docker/templates/pytest_runner.py +692 -0
datasmith/docker/templates/run-tests.sh +197 -0
datasmith/docker/verifiers.py +131 -0
datasmith/filters.py +154 -0
datasmith/github/__init__.py +22 -0
datasmith/github/client.py +333 -0
datasmith/github/hooks.py +50 -0
datasmith/github/links.py +110 -0
datasmith/github/models.py +206 -0
datasmith/github/render.py +173 -0
datasmith/github/search.py +66 -0
datasmith/github/templates/comment.md.j2 +5 -0
datasmith/github/templates/final.md.j2 +66 -0
datasmith/github/templates/issues.md.j2 +21 -0
datasmith/github/templates/repo.md.j2 +1 -0
datasmith/preflight.py +162 -0
datasmith/publish/__init__.py +13 -0
datasmith/publish/huggingface.py +104 -0
datasmith/publish/pipeline.py +60 -0
datasmith/publish/records.py +91 -0
datasmith/py.typed +1 -0
datasmith/resolution/__init__.py +14 -0
datasmith/resolution/blocklist.py +145 -0
datasmith/resolution/cache.py +120 -0
datasmith/resolution/constants.py +277 -0
datasmith/resolution/dependency_resolver.py +174 -0
datasmith/resolution/git_utils.py +378 -0
datasmith/resolution/import_analyzer.py +66 -0
datasmith/resolution/metadata_parser.py +412 -0
datasmith/resolution/models.py +41 -0
datasmith/resolution/orchestrator.py +522 -0
datasmith/resolution/package_filters.py +312 -0
datasmith/resolution/python_manager.py +110 -0
datasmith/runners/__init__.py +15 -0
datasmith/runners/base.py +112 -0
datasmith/runners/classify_prs.py +48 -0
datasmith/runners/render_problems.py +113 -0
datasmith/runners/resolve_packages.py +66 -0
datasmith/runners/scrape_commits.py +166 -0
datasmith/runners/scrape_repos.py +44 -0
datasmith/runners/synthesize_images.py +310 -0
datasmith/update/__init__.py +5 -0
datasmith/update/cli.py +169 -0
datasmith/update/offline.py +173 -0
datasmith/update/pipeline.py +497 -0
datasmith/utils/__init__.py +18 -0
datasmith/utils/core.py +67 -0
datasmith/utils/db.py +156 -0
datasmith/utils/tokens.py +65 -0
fc_data-0.2.0.dist-info/METADATA +441 -0
fc_data-0.2.0.dist-info/RECORD +87 -0
fc_data-0.2.0.dist-info/WHEEL +4 -0
fc_data-0.2.0.dist-info/entry_points.txt +2 -0
fc_data-0.2.0.dist-info/licenses/LICENSE +28 -0

datasmith/agents/installed/base.py ADDED Viewed

@@ -0,0 +1,240 @@
+"""Abstract interface for CLI-based coding agents."""
+from __future__ import annotations
+import contextlib
+import os
+import shutil
+import signal
+import subprocess
+import threading
+import time
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from datasmith.utils import get_logger
+logger = get_logger("agents.installed")
+@dataclass
+class AgentResult:
+    """Unified result from any installed CLI agent."""
+    # Whether the run counts as successful (e.g. ClaudeAgent sets it from ``returncode == 0``).
+    success: bool
+    # Text extracted from the agent's output stream.
+    output: str = ""
+    # The agent's stdout exactly as captured, before any parsing.
+    raw_output: str = ""
+    # Paths reported by file-editing tool events; a fresh list per instance.
+    files_changed: list[str] = field(default_factory=list)
+    # Run time in seconds.
+    duration_s: float = 0.0
+    # Error description for a failed run (stderr or an explanatory message).
+    error: str = ""
+# Backward-compat alias: the old name refers to the very same class.
+CodexResult = AgentResult
+class InstalledAgent(ABC):
+    """Common contract for coding agents driven through a command-line tool.
+    Every concrete subclass launches one specific CLI (Codex, Claude Code,
+    Gemini CLI) and converts what it prints into an ``AgentResult``.
+    """
+    @abstractmethod
+    def name(self) -> str:
+        """Return the agent's display name."""
+    @abstractmethod
+    def is_available(self) -> bool:
+        """Report whether the agent's CLI binary can be found on PATH."""
+    @abstractmethod
+    def exec(
+        self,
+        prompt: str,
+        timeout: int = 3600,
+        workdir: str | None = None,
+    ) -> AgentResult:
+        """Execute *prompt* without user interaction and return an ``AgentResult``."""
+    def exec_or_dry_run(
+        self,
+        prompt: str,
+        timeout: int = 3600,
+        workdir: str | None = None,
+        dry_run: bool = False,
+    ) -> AgentResult:
+        """Call :meth:`exec`, or only log the prompt when *dry_run* is set.
+        A dry run never starts the agent; it logs the prompt (truncated to 500
+        characters by the format string) and returns a successful placeholder.
+        """
+        if not dry_run:
+            return self.exec(prompt, timeout=timeout, workdir=workdir)
+        prompt_length = len(prompt)
+        logger.info("[DRY RUN] %s command for prompt (%d chars): %.500s", self.name(), prompt_length, prompt)
+        placeholder = AgentResult(
+            success=True,
+            output="[dry run — no execution]",
+            duration_s=0.0,
+        )
+        return placeholder
+    @staticmethod
+    def _which(binary: str) -> bool:
+        """Return True when *binary* resolves to an executable on PATH."""
+        located = shutil.which(binary)
+        return located is not None
+def _kill_process_group(proc: subprocess.Popen[str], sig: int = signal.SIGTERM) -> None:
+    """Deliver *sig* to every process in *proc*'s process group, ignoring failures."""
+    try:
+        group_id = os.getpgid(proc.pid)
+        os.killpg(group_id, sig)
+    except (ProcessLookupError, OSError):
+        # The group is already gone or cannot be signalled; nothing left to do.
+        pass
+# ---------------------------------------------------------------------------
+# Global subprocess registry — allows the SIGINT handler to reach agent
+# processes that live in their own sessions (start_new_session=True) and
+# therefore don't receive CTRL+C from the terminal.
+# ---------------------------------------------------------------------------
+# Additions and removals happen under ``_active_procs_lock``;
+# ``terminate_all_agents`` iterates over a ``list()`` copy without taking it.
+_active_procs: set[subprocess.Popen[str]] = set()
+_active_procs_lock = threading.Lock()
+def _register_proc(proc: subprocess.Popen[str]) -> None:
+    """Add *proc* to the global registry of running agent subprocesses."""
+    with _active_procs_lock:
+        _active_procs.add(proc)
+def _unregister_proc(proc: subprocess.Popen[str]) -> None:
+    """Remove *proc* from the global registry; a no-op if it is not tracked."""
+    with _active_procs_lock:
+        # discard() rather than remove(): unregistering twice must not raise.
+        _active_procs.discard(proc)
+def terminate_all_agents(force: bool = False) -> None:
+    """Signal the process group of every registered agent subprocess.
+    Meant to be invoked by the CLI signal handler, so that threads waiting in
+    ``proc.communicate()`` are released and the program can shut down.
+    SIGTERM is sent by default; pass *force=True* to send SIGKILL.
+    """
+    if force:
+        sig = signal.SIGKILL
+    else:
+        sig = signal.SIGTERM
+    # Work on a copy so the registry can change while signals are being sent.
+    tracked = list(_active_procs)
+    for proc in tracked:
+        _kill_process_group(proc, sig)
+def _terminate_and_wait(proc: subprocess.Popen[str]) -> None:
+    """Stop *proc*'s process group: SIGTERM first, SIGKILL after a 10 s grace period."""
+    grace_period_s = 10
+    _kill_process_group(proc, signal.SIGTERM)
+    try:
+        proc.wait(timeout=grace_period_s)
+        return
+    except subprocess.TimeoutExpired:
+        # Still alive after the grace period: escalate below.
+        pass
+    _kill_process_group(proc, signal.SIGKILL)
+    proc.wait()
+def run_agent_subprocess(
+    cmd: list[str],
+    *,
+    timeout: int = 3600,
+    cwd: str | None = None,
+    env: dict[str, str] | None = None,
+    agent_name: str = "agent",
+) -> tuple[int, str, str, float]:
+    """Run an agent CLI command with process-group cleanup on interrupt or timeout.
+    Args:
+        cmd: Argument vector handed to ``subprocess.Popen`` (no shell involved).
+        timeout: Seconds ``communicate()`` may block before the run is abandoned.
+        cwd: Working directory for the child process, passed through to ``Popen``.
+        env: Environment for the child process, passed through to ``Popen``.
+        agent_name: Label used only in the timeout warning.
+    Returns ``(returncode, stdout, stderr, duration_s)``.
+    On timeout the process is killed and any partial output captured so far
+    is returned with ``returncode=-1``.
+    Raises ``FileNotFoundError`` if the binary is missing and
+    re-raises ``KeyboardInterrupt`` (after cleanup).
+    """
+    start = time.time()
+    # Stays None if Popen itself fails, so the handlers below can tell.
+    proc: subprocess.Popen[str] | None = None
+    try:
+        proc = subprocess.Popen(
+            cmd,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            text=True,
+            cwd=cwd,
+            env=env,
+            # Own session: the child is signalled through its process group
+            # (see _kill_process_group) instead of by the terminal.
+            start_new_session=True,
+        )
+        # Make the child reachable from terminate_all_agents().
+        _register_proc(proc)
+        stdout, stderr = proc.communicate(timeout=timeout)
+        duration = time.time() - start
+        return proc.returncode, stdout, stderr, duration
+    except subprocess.TimeoutExpired as exc:
+        logger.warning("%s timed out after %ds — capturing partial output", agent_name, timeout)
+        # Also terminates the child (via _terminate_and_wait) before reading.
+        partial_stdout, partial_stderr = _collect_partial_output(exc, proc)
+        duration = time.time() - start
+        return -1, partial_stdout, partial_stderr, duration
+    except KeyboardInterrupt:
+        if proc is not None:
+            _terminate_and_wait(proc)
+        raise
+    finally:
+        if proc is not None:
+            _unregister_proc(proc)
+            # Last resort for any exit path that left the child running.
+            if proc.poll() is None:
+                _kill_process_group(proc, signal.SIGKILL)
+                proc.wait()
+def _collect_partial_output(
+    exc: subprocess.TimeoutExpired,
+    proc: subprocess.Popen[str] | None,
+) -> tuple[str, str]:
+    """Return the ``(stdout, stderr)`` text a timed-out agent managed to produce.
+    The output attached to *exc* serves as the baseline. When *proc* is given
+    it is terminated and ``communicate()`` is retried. A ``communicate()``
+    retried after ``TimeoutExpired`` loses no output: it returns the complete
+    streams, including everything *exc* already carries. Its result therefore
+    replaces the baseline instead of being appended to it, which would
+    duplicate the text captured before the timeout.
+    Never raises for a failed retry; the baseline is returned in that case.
+    """
+    def _as_text(data: str | bytes | None) -> str:
+        # TimeoutExpired can carry bytes even when the pipes are in text mode.
+        if not data:
+            return ""
+        return data if isinstance(data, str) else data.decode(errors="replace")
+    partial_stdout = _as_text(exc.stdout)
+    partial_stderr = _as_text(exc.stderr)
+    if proc is None:
+        return partial_stdout, partial_stderr
+    _terminate_and_wait(proc)
+    try:
+        full_stdout, full_stderr = proc.communicate(timeout=5)
+    except Exception:
+        # Best effort: fall back to what was captured at timeout.
+        logger.debug("Failed to read remaining output after timeout", exc_info=True)
+        return partial_stdout, partial_stderr
+    # Prefer the complete streams; keep the baseline if a stream came back empty.
+    return _as_text(full_stdout) or partial_stdout, _as_text(full_stderr) or partial_stderr
+# Registry of concrete agents in preference order.
+# Populated by __init__.py after all subclasses are importable.
+# NOTE(review): get_agent() builds its own name -> class mapping and does not
+# read this list; confirm whether anything else still relies on it.
+_AGENT_CLASSES: list[type[InstalledAgent]] = []
+def get_agent(preference: list[str] | None = None) -> InstalledAgent:
+    """Auto-detect and return the first available agent.
+    *preference* is a list of agent names (lowercase) to try in order.
+    Default: ``["claude", "codex", "gemini"]`` (also used for an empty list).
+    Known names are ``claude``, ``codex``, ``gemini`` and ``none``; any other
+    name is skipped with a warning so that typos do not go unnoticed.
+    Raises ``RuntimeError`` if none are available.
+    """
+    # Imported here rather than at module level: the agent modules import this one.
+    from datasmith.agents.installed.claude import ClaudeAgent
+    from datasmith.agents.installed.codex import CodexAgent
+    from datasmith.agents.installed.gemini import GeminiAgent
+    from datasmith.agents.installed.none import NoneAgent
+    registry: dict[str, type[InstalledAgent]] = {
+        "claude": ClaudeAgent,
+        "codex": CodexAgent,
+        "gemini": GeminiAgent,
+        "none": NoneAgent,
+    }
+    order = preference or ["claude", "codex", "gemini"]
+    for name in order:
+        cls = registry.get(name)
+        if cls is None:
+            logger.warning("Ignoring unknown agent name %r (known: %s)", name, sorted(registry))
+            continue
+        agent = cls()
+        if agent.is_available():
+            logger.info("Auto-detected agent: %s", agent.name())
+            return agent
+    # "none" is built in and always available, so it is not something to install.
+    installable = [key for key in registry if key != "none"]
+    raise RuntimeError(f"No installed CLI agent found. Tried: {order}. Install one of: {installable}")

datasmith/agents/installed/claude.py ADDED Viewed

@@ -0,0 +1,134 @@
+"""Claude Code CLI agent implementation."""
+from __future__ import annotations
+import json
+import os
+from datasmith.agents.installed.base import AgentResult, InstalledAgent, run_agent_subprocess
+from datasmith.utils import get_logger
+logger = get_logger("agents.installed.claude")
+def _extract_assistant_text(message: object) -> str:
+    """Return the plain text carried by a Claude assistant message payload.
+    A string is returned unchanged. For a dict, ``content`` is used: a string
+    as is, a list by joining the ``text`` of its ``"text"`` blocks with
+    newlines. Every other shape yields ``""``.
+    """
+    if isinstance(message, str):
+        return message
+    if not isinstance(message, dict):
+        return ""
+    content = message.get("content", "")
+    if isinstance(content, str):
+        return content
+    if not isinstance(content, list):
+        return ""
+    return "\n".join(
+        block.get("text", "")
+        for block in content
+        if isinstance(block, dict) and block.get("type") == "text"
+    )
+def _parse_claude_stdout(stdout: str) -> tuple[list[str], list[str]]:
+    """Parse Claude Code stream-json output into (output_lines, files_changed).
+    Each non-blank line is decoded as one JSON event:
+    * ``assistant`` events contribute their text, and any ``tool_use`` blocks
+      nested in the message content are checked for file edits (this is
+      where Claude Code reports its tool calls);
+    * ``result`` events contribute their non-empty ``result`` string;
+    * top-level ``tool_use`` events are still checked for file edits.
+    Lines that are not valid JSON are kept verbatim as output; JSON values
+    that are not objects are ignored.
+    """
+    files_changed: list[str] = []
+    output_lines: list[str] = []
+    for raw_line in stdout.splitlines():
+        line = raw_line.strip()
+        if not line:
+            continue
+        try:
+            obj = json.loads(line)
+        except json.JSONDecodeError:
+            output_lines.append(line)
+            continue
+        if not isinstance(obj, dict):
+            continue
+        msg_type = obj.get("type", "")
+        if msg_type == "assistant" and "message" in obj:
+            message = obj["message"]
+            text = _extract_assistant_text(message)
+            if text:
+                output_lines.append(text)
+            # Tool calls arrive as content blocks of the assistant message.
+            content = message.get("content") if isinstance(message, dict) else None
+            if isinstance(content, list):
+                for block in content:
+                    if isinstance(block, dict) and block.get("type") == "tool_use":
+                        _collect_file_change(block, files_changed)
+        elif msg_type == "result":
+            result_text = obj.get("result", "")
+            if isinstance(result_text, str) and result_text:
+                output_lines.append(result_text)
+        elif msg_type == "tool_use":
+            _collect_file_change(obj, files_changed)
+    return output_lines, files_changed
+# Tool names whose invocation writes to a file.
+_FILE_TOOL_NAMES = {"Write", "Edit", "write_file", "edit_file"}
+def _collect_file_change(obj: dict, files_changed: list[str]) -> None:
+    """Append the path targeted by a file-editing ``tool_use`` event to *files_changed*.
+    Events for other tools, events whose ``input`` is not a dict, and events
+    without a ``file_path``/``path`` value leave the list untouched.
+    """
+    if obj.get("name", "") not in _FILE_TOOL_NAMES:
+        return
+    tool_input = obj.get("input", {})
+    if not isinstance(tool_input, dict):
+        return
+    # ``file_path`` wins; ``path`` is the fallback spelling.
+    path = tool_input.get("file_path") or tool_input.get("path", "")
+    if path:
+        files_changed.append(path)
+class ClaudeAgent(InstalledAgent):
+    """Agent backed by the ``claude`` (Claude Code) command-line tool."""
+    def name(self) -> str:
+        return "claude"
+    def is_available(self) -> bool:
+        return self._which("claude")
+    def exec(
+        self,
+        prompt: str,
+        timeout: int = 3600,
+        workdir: str | None = None,
+    ) -> AgentResult:
+        """Run *prompt* through ``claude -p`` and convert the stream-json output.
+        A missing ``claude`` binary is reported as a failed ``AgentResult``
+        rather than raised.
+        """
+        cmd = ["claude", "-p", prompt, "--dangerously-skip-permissions"]
+        cmd += ["--output-format", "stream-json"]
+        cmd += ["--no-session-persistence", "--verbose"]
+        cmd += ["--model", "sonnet"]
+        cmd += ["--effort", "medium"]
+        logger.debug("claude command: %s", " ".join(cmd))
+        # Nesting guard: the child must not see the Claude Code session markers,
+        # otherwise it fails with "cannot be launched inside another Claude Code
+        # session".
+        session_markers = ("CLAUDE_CODE_ENTRYPOINT", "CLAUDECODE")
+        env = {key: value for key, value in os.environ.items() if key not in session_markers}
+        try:
+            returncode, stdout, stderr, duration = run_agent_subprocess(
+                cmd, timeout=timeout, cwd=workdir, env=env, agent_name="claude"
+            )
+        except FileNotFoundError:
+            return AgentResult(
+                success=False,
+                duration_s=0.0,
+                error="claude CLI not found. Install with: npm install -g @anthropic-ai/claude-code",
+            )
+        output_lines, files_changed = _parse_claude_stdout(stdout)
+        succeeded = returncode == 0
+        # Fall back to the raw stream when nothing could be extracted from it.
+        text = "\n".join(output_lines) if output_lines else stdout
+        return AgentResult(
+            success=succeeded,
+            output=text,
+            raw_output=stdout,
+            files_changed=files_changed,
+            duration_s=duration,
+            error="" if succeeded else stderr,
+        )

datasmith/agents/installed/codex.py ADDED Viewed

@@ -0,0 +1,91 @@
+"""Codex CLI agent implementation."""
+from __future__ import annotations
+import json
+from datasmith.agents.installed.base import AgentResult, InstalledAgent, run_agent_subprocess
+from datasmith.utils import get_logger
+logger = get_logger("agents.installed.codex")
+def _parse_codex_stdout(stdout: str) -> tuple[list[str], list[str]]:
+    """Parse JSON stream from Codex stdout into (output_lines, files_changed).
+    Each non-blank line is decoded as one JSON object. A ``file`` value is
+    recorded as a changed file. Output text comes from an ``item`` of type
+    ``agent_message`` (codex >=0.114), else from ``output``, else from
+    ``message``. Only non-empty strings are kept, so both returned lists
+    really contain ``str`` and can be joined safely by the caller.
+    Lines that are not valid JSON are kept verbatim as output.
+    """
+    files_changed: list[str] = []
+    output_lines: list[str] = []
+    def _keep(target: list[str], value: object) -> None:
+        # Structured (non-string) payloads would break "\n".join() downstream.
+        if isinstance(value, str) and value:
+            target.append(value)
+    for raw_line in stdout.splitlines():
+        line = raw_line.strip()
+        if not line:
+            continue
+        try:
+            obj = json.loads(line)
+        except json.JSONDecodeError:
+            output_lines.append(line)
+            continue
+        if not isinstance(obj, dict):
+            continue
+        _keep(files_changed, obj.get("file"))
+        # codex >=0.114 item.completed format
+        item = obj.get("item")
+        if isinstance(item, dict) and item.get("type") == "agent_message":
+            _keep(output_lines, item.get("text", ""))
+        elif "output" in obj:
+            _keep(output_lines, obj["output"])
+        elif "message" in obj:
+            _keep(output_lines, obj["message"])
+    return output_lines, files_changed
+class CodexAgent(InstalledAgent):
+    """Agent backed by the ``codex exec`` command-line tool."""
+    def __init__(self, full_auto: bool = False, sandbox: str = "") -> None:
+        # The sandboxed full-auto mode is used only when both values are set.
+        self._sandbox = sandbox
+        self._full_auto = full_auto
+    def name(self) -> str:
+        return "codex"
+    def is_available(self) -> bool:
+        return self._which("codex")
+    def exec(
+        self,
+        prompt: str,
+        timeout: int = 3600,
+        workdir: str | None = None,
+    ) -> AgentResult:
+        """Run *prompt* through ``codex exec`` and convert its JSON output.
+        A missing ``codex`` binary is reported as a failed ``AgentResult``
+        rather than raised.
+        """
+        if self._full_auto and self._sandbox:
+            approval_flags = ["--full-auto", "--sandbox", self._sandbox]
+        else:
+            approval_flags = ["--dangerously-bypass-approvals-and-sandbox"]
+        cmd = [
+            "codex",
+            "exec",
+            "--model",
+            "gpt-5.4-mini",
+            "-c",
+            "model_reasoning_effort=medium",
+            *approval_flags,
+            "--json",
+            "--ephemeral",
+            prompt,
+        ]
+        logger.debug("codex command: %s", " ".join(cmd))
+        try:
+            returncode, stdout, stderr, duration = run_agent_subprocess(
+                cmd, timeout=timeout, cwd=workdir, agent_name="codex"
+            )
+        except FileNotFoundError:
+            return AgentResult(
+                success=False,
+                duration_s=0.0,
+                error="codex CLI not found. Install with: npm install -g @openai/codex",
+            )
+        output_lines, files_changed = _parse_codex_stdout(stdout)
+        succeeded = returncode == 0
+        # Fall back to the raw stream when nothing could be extracted from it.
+        text = "\n".join(output_lines) if output_lines else stdout
+        return AgentResult(
+            success=succeeded,
+            output=text,
+            raw_output=stdout,
+            files_changed=files_changed,
+            duration_s=duration,
+            error="" if succeeded else stderr,
+        )

datasmith/agents/installed/gemini.py ADDED Viewed

@@ -0,0 +1,118 @@
+"""Gemini CLI agent implementation."""
+from __future__ import annotations
+import json
+from datasmith.agents.installed.base import AgentResult, InstalledAgent, run_agent_subprocess
+from datasmith.utils import get_logger
+logger = get_logger("agents.installed.gemini")
+# Event ``type`` values whose payload is treated as agent text.
+_TEXT_TYPES = {"assistant", "response", "text"}
+# Tool names whose invocation is recorded as a file change.
+_FILE_TOOL_NAMES = {"write_file", "edit_file", "Write", "Edit", "create_file", "update_file"}
+def _parse_gemini_stdout(stdout: str) -> tuple[list[str], list[str]]:
+    """Parse Gemini CLI JSON output into (output_lines, files_changed).
+    Two layouts are understood:
+    * a single JSON document for the whole run, as printed by ``-o json``
+      (possibly spread over several lines), whose ``response`` string is the
+      agent's answer;
+    * a stream with one JSON event per line, dispatched on the event ``type``
+      (text events, ``result``, ``tool_use``/``action``), with ``output`` and
+      ``message`` keys as a fallback for untyped events.
+    Only strings are collected as output, so the caller can join them safely.
+    Lines that are not valid JSON are kept verbatim as output.
+    """
+    files_changed: list[str] = []
+    output_lines: list[str] = []
+    # Try the whole-document layout first; a multi-line document can never be
+    # decoded line by line.
+    try:
+        document = json.loads(stdout)
+    except json.JSONDecodeError:
+        document = None
+    if isinstance(document, dict) and isinstance(document.get("response"), str):
+        if document["response"]:
+            output_lines.append(document["response"])
+        return output_lines, files_changed
+    for raw_line in stdout.splitlines():
+        line = raw_line.strip()
+        if not line:
+            continue
+        try:
+            obj = json.loads(line)
+        except json.JSONDecodeError:
+            output_lines.append(line)
+            continue
+        if not isinstance(obj, dict):
+            continue
+        msg_type = obj.get("type", "")
+        if msg_type in _TEXT_TYPES:
+            _append_text(obj, output_lines)
+        elif msg_type == "result":
+            _append_result(obj, output_lines)
+        elif msg_type in ("tool_use", "action"):
+            _collect_gemini_file(obj, files_changed)
+        elif "output" in obj:
+            # Structured (non-string) payloads would break "\n".join() downstream.
+            if isinstance(obj["output"], str):
+                output_lines.append(obj["output"])
+        elif "message" in obj:
+            if isinstance(obj["message"], str):
+                output_lines.append(obj["message"])
+    return output_lines, files_changed
+def _append_text(obj: dict, output_lines: list[str]) -> None:
+    """Append the text of a text-like event to *output_lines*.
+    The first truthy value among ``text`` and ``message`` is used, with
+    ``content`` as the last resort; it is appended only if it is a non-empty
+    string.
+    """
+    for key in ("text", "message"):
+        candidate = obj.get(key)
+        if candidate:
+            break
+    else:
+        candidate = obj.get("content", "")
+    if not isinstance(candidate, str) or not candidate:
+        return
+    output_lines.append(candidate)
+def _append_result(obj: dict, output_lines: list[str]) -> None:
+    """Append the text of a ``result`` event to *output_lines*.
+    ``result`` is preferred and ``text`` is the fallback; the value is
+    appended only if it is a non-empty string.
+    """
+    result_text = obj.get("result")
+    if not result_text:
+        result_text = obj.get("text", "")
+    if not isinstance(result_text, str) or not result_text:
+        return
+    output_lines.append(result_text)
+def _collect_gemini_file(obj: dict, files_changed: list[str]) -> None:
+    """Append the path targeted by a file-editing tool event to *files_changed*.
+    The tool name is read from ``name`` (fallback ``tool``) and its arguments
+    from ``input`` (fallback ``args``); the path from ``file_path`` (fallback
+    ``path``). Anything that does not match leaves the list untouched.
+    """
+    tool_name = obj.get("name") or obj.get("tool", "")
+    tool_input = obj.get("input") or obj.get("args", {})
+    if not isinstance(tool_input, dict):
+        return
+    if tool_name not in _FILE_TOOL_NAMES:
+        return
+    path = tool_input.get("file_path") or tool_input.get("path", "")
+    if path:
+        files_changed.append(path)
+class GeminiAgent(InstalledAgent):
+    """Gemini CLI agent."""
+    def name(self) -> str:
+        """Return the agent identifier, ``"gemini"``."""
+        return "gemini"
+    def is_available(self) -> bool:
+        """Return True when a ``gemini`` executable is found on PATH."""
+        return self._which("gemini")
+    def exec(
+        self,
+        prompt: str,
+        timeout: int = 3600,
+        workdir: str | None = None,
+    ) -> AgentResult:
+        """Run *prompt* through ``gemini -p`` and normalise the result.
+        Args:
+            prompt: Prompt text passed to the CLI.
+            timeout: Seconds to wait for the CLI before it is stopped.
+            workdir: Directory the CLI is started in.
+        Returns:
+            An ``AgentResult``; ``success`` reflects a zero exit code and
+            ``error`` carries stderr otherwise. A missing ``gemini`` binary is
+            reported as a failed result rather than raised.
+        """
+        cmd = [
+            "gemini",
+            "-p",
+            prompt,
+            "--yolo",
+            "-o",
+            "json",
+            "--model",
+            "gemini-2.5-flash",
+        ]
+        logger.debug("gemini command: %s", " ".join(cmd))
+        try:
+            returncode, stdout, stderr, duration = run_agent_subprocess(
+                cmd, timeout=timeout, cwd=workdir, agent_name="gemini"
+            )
+            output_lines, files_changed = _parse_gemini_stdout(stdout)
+            return AgentResult(
+                success=returncode == 0,
+                # Fall back to the raw stream when nothing could be extracted.
+                output="\n".join(output_lines) if output_lines else stdout,
+                raw_output=stdout,
+                files_changed=files_changed,
+                duration_s=duration,
+                error=stderr if returncode != 0 else "",
+            )
+        except FileNotFoundError:
+            return AgentResult(
+                success=False,
+                duration_s=0.0,
+                # The Gemini CLI is published under Google's npm scope.
+                error="gemini CLI not found. Install with: npm install -g @google/gemini-cli",
+            )

datasmith/agents/installed/none.py ADDED Viewed

@@ -0,0 +1,27 @@
+"""No-op agent that skips LLM generation, relying solely on similar-context matching."""
+from __future__ import annotations
+from datasmith.agents.installed.base import AgentResult, InstalledAgent
+class NoneAgent(InstalledAgent):
+    """Placeholder agent: always reports itself as available but performs no work."""
+    def name(self) -> str:
+        return "none"
+    def is_available(self) -> bool:
+        return True
+    def exec(
+        self,
+        prompt: str,
+        timeout: int = 3600,
+        workdir: str | None = None,
+    ) -> AgentResult:
+        """Ignore every argument and return a failed ``AgentResult``."""
+        skipped = AgentResult(
+            success=False,
+            output="[none agent — no LLM execution]",
+            error="NoneAgent does not execute",
+        )
+        return skipped