PyPI - codeprobe - Versions diffs - 0.1.6__tar.gz → 0.2.0__tar.gz - Mend

codeprobe 0.1.6__tar.gz → 0.2.0__tar.gz

This diff shows the changes between two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (142) hide show

{codeprobe-0.1.6 → codeprobe-0.2.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: codeprobe
-Version: 0.1.6
+Version: 0.2.0
 Summary: Benchmark AI coding agents against your own codebase. Mine real tasks from repo history, run agents, interpret results.
 Author: codeprobe contributors
 License-Expression: Apache-2.0

{codeprobe-0.1.6 → codeprobe-0.2.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "codeprobe"
-version = "0.1.6"
+version = "0.2.0"
 description = "Benchmark AI coding agents against your own codebase. Mine real tasks from repo history, run agents, interpret results."
 readme = "README.md"
 license = "Apache-2.0"

{codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/__init__.py RENAMED Viewed

@@ -1,3 +1,3 @@
 """codeprobe — Benchmark AI coding agents against your own codebase."""
-__version__ = "0.1.6"
+__version__ = "0.2.0"

{codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/adapters/claude.py RENAMED Viewed

@@ -4,6 +4,7 @@ from __future__ import annotations
 import json
 import re
+import shutil
 import subprocess
 import tempfile
 from pathlib import Path
@@ -69,16 +70,31 @@ class ClaudeAdapter(BaseAdapter):
         mcp_path = self._write_mcp_config(config)
         if mcp_path:
-            cmd.extend(["--mcp-config", mcp_path])
+            cmd.extend(["--mcp-config", mcp_path, "--strict-mcp-config"])
         return cmd
     def isolate_session(self, slot_id: int) -> dict[str, str]:
-        """Return a per-slot CLAUDE_CONFIG_DIR for session isolation."""
+        """Return a per-slot CLAUDE_CONFIG_DIR for session isolation.
+        Copies authentication credentials from the real ``~/.claude/``
+        directory so the agent subprocess can authenticate.
+        """
         config_dir = (
             Path(tempfile.gettempdir()) / "codeprobe-claude" / f"slot-{slot_id}"
         )
         config_dir.mkdir(parents=True, exist_ok=True)
+        # Copy auth credentials from the user's real config dir.
+        # Without these the subprocess gets "Not logged in".
+        real_config = Path.home() / ".claude"
+        if real_config.is_dir():
+            for name in ("credentials.json", ".credentials.json"):
+                src = real_config / name
+                dst = config_dir / name
+                if src.is_file():
+                    shutil.copy2(src, dst)
         return {"CLAUDE_CONFIG_DIR": str(config_dir)}
     def parse_output(

{codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/api.py RENAMED Viewed

@@ -58,11 +58,24 @@ def _build_experiment_config(raw: dict) -> ExperimentConfig:
     )
-def _discover_task_dirs(tasks_dir: Path) -> list[Path]:
-    """Find valid task directories (those containing instruction.md)."""
+def _discover_task_dirs(
+    tasks_dir: Path, *, task_ids: tuple[str, ...] = ()
+) -> list[Path]:
+    """Find valid task directories (those containing instruction.md).
+    When *task_ids* is non-empty, only return tasks whose directory name
+    appears in that tuple.
+    """
     if not tasks_dir.is_dir():
         raise FileNotFoundError(f"Tasks directory not found: {tasks_dir}")
+    if task_ids:
+        allowed = set(task_ids)
+        return sorted(
+            d
+            for d in tasks_dir.iterdir()
+            if d.is_dir() and d.name in allowed and (d / "instruction.md").exists()
+        )
     return sorted(
         d for d in tasks_dir.iterdir() if d.is_dir() and (d / "instruction.md").exists()
     )
@@ -103,7 +116,7 @@ def run_experiment(
     experiment = load_experiment(experiment_dir)
     tasks_dir = experiment_dir / experiment.tasks_dir
-    task_dirs = _discover_task_dirs(tasks_dir)
+    task_dirs = _discover_task_dirs(tasks_dir, task_ids=experiment.task_ids)
     if not task_dirs:
         raise ValueError(

{codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/cli/__init__.py RENAMED Viewed

@@ -1,18 +1,89 @@
 """CLI entry point for codeprobe."""
+import json as _json
+import logging
+import sys
 import click
 from codeprobe import __version__
+class _JsonFormatter(logging.Formatter):
+    """Emit one JSON object per log line."""
+    def format(self, record: logging.LogRecord) -> str:
+        payload = {
+            "level": record.levelname,
+            "logger": record.name,
+            "message": record.getMessage(),
+            "timestamp": self.formatTime(record, "%Y-%m-%dT%H:%M:%S%z"),
+        }
+        return _json.dumps(payload)
+def _configure_logging(verbose: int, quiet: bool, log_format: str = "text") -> None:
+    """Configure namespace-scoped logging for codeprobe.* modules.
+    Attaches a StreamHandler to `logging.getLogger("codeprobe")` so that
+    all 26+ codeprobe.* modules emit through hierarchy without touching
+    third-party loggers (httpx, urllib3, etc.).
+    """
+    if quiet:
+        level = logging.WARNING
+    elif verbose >= 1:
+        level = logging.DEBUG
+    else:
+        level = logging.INFO
+    logger = logging.getLogger("codeprobe")
+    logger.setLevel(level)
+    logger.propagate = False  # don't bubble to root
+    # Idempotent: tests / repeat invocations must not duplicate handlers.
+    for h in list(logger.handlers):
+        logger.removeHandler(h)
+    handler = logging.StreamHandler(sys.stderr)
+    if log_format == "json":
+        handler.setFormatter(_JsonFormatter())
+    elif verbose >= 1:
+        fmt = "%(levelname)s %(name)s: %(message)s"
+        handler.setFormatter(logging.Formatter(fmt))
+    else:
+        fmt = "%(levelname)s: %(message)s"
+        handler.setFormatter(logging.Formatter(fmt))
+    logger.addHandler(handler)
 @click.group()
+@click.option(
+    "-v",
+    "--verbose",
+    count=True,
+    help="Increase log verbosity (-v sets DEBUG).",
+)
+@click.option(
+    "-q",
+    "--quiet",
+    is_flag=True,
+    default=False,
+    help="Suppress INFO logs (WARNING and above only).",
+)
+@click.option(
+    "--log-format",
+    type=click.Choice(["text", "json"]),
+    default="text",
+    help="Log output format (default: text). 'json' emits one JSON object per line.",
+)
 @click.version_option(version=__version__, prog_name="codeprobe")
-def main() -> None:
+def main(verbose: int, quiet: bool, log_format: str) -> None:
     """Benchmark AI coding agents against your own codebase.
     Mine real tasks from your repo history, run agents against them,
     and interpret the results to find which setup works best for YOUR code.
     """
+    _configure_logging(verbose=verbose, quiet=quiet, log_format=log_format)
 @main.command()
@@ -121,6 +192,20 @@ def init(path: str) -> None:
     default=False,
     help="Run LLM verification on curated ground truth.",
 )
+@click.option(
+    "--mcp-families",
+    is_flag=True,
+    default=False,
+    help="Include MCP-advantaged task families (symbol-reference-trace, "
+    "type-hierarchy-consumers, change-scope-audit). Only with --org-scale.",
+)
+@click.option(
+    "--sg-repo",
+    default="",
+    help="Sourcegraph repo identifier for ground truth enrichment "
+    "(e.g. github.com/sg-evals/numpy). Defaults to github.com/sg-evals/{repo_name} "
+    "when --mcp-families is used. Requires SOURCEGRAPH_TOKEN env var.",
+)
 def mine(
     path: str,
     count: int,
@@ -139,6 +224,8 @@ def mine(
     curate: bool,
     backends: tuple[str, ...],
     verify_curation_flag: bool,
+    mcp_families: bool,
+    sg_repo: str,
 ) -> None:
     """Mine eval tasks from a repository's history.
@@ -175,6 +262,8 @@ def mine(
         curate=curate,
         backends=backends,
         verify_curation_flag=verify_curation_flag,
+        mcp_families=mcp_families,
+        sg_repo=sg_repo,
     )
@@ -272,6 +361,20 @@ def init_experiment(path: str, name: str, description: str) -> None:
 @click.option(
     "--mcp-config", default=None, help="MCP config as JSON string or file path."
 )
+@click.option(
+    "--instruction-variant",
+    default=None,
+    help="Instruction file variant (e.g., instruction_mcp.md). Default: instruction.md.",
+)
+@click.option(
+    "--preamble",
+    "preambles",
+    multiple=True,
+    help=(
+        "Preamble to prepend to the instruction. Repeatable. "
+        "Built-ins: sourcegraph, github. Or path to a custom .md file."
+    ),
+)
 def add_config(
     path: str,
     label: str,
@@ -279,6 +382,8 @@ def add_config(
     model: str | None,
     permission_mode: str,
     mcp_config: str | None,
+    instruction_variant: str | None,
+    preambles: tuple[str, ...],
 ) -> None:
     """Add a configuration to an existing experiment."""
     from codeprobe.cli.experiment_cmd import experiment_add_config
@@ -290,6 +395,8 @@ def add_config(
         model=model,
         permission_mode=permission_mode,
         mcp_config_str=mcp_config,
+        instruction_variant=instruction_variant,
+        preambles=preambles,
     )

{codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/cli/experiment_cmd.py RENAMED Viewed

@@ -63,6 +63,8 @@ def experiment_add_config(
     model: str | None,
     permission_mode: str,
     mcp_config_str: str | None,
+    instruction_variant: str | None = None,
+    preambles: tuple[str, ...] = (),
 ) -> None:
     """Add a configuration to an existing experiment."""
     exp_dir = Path(path)
@@ -104,6 +106,8 @@ def experiment_add_config(
         model=model,
         permission_mode=permission_mode,
         mcp_config=mcp_config,
+        instruction_variant=instruction_variant,
+        preambles=preambles,
     )
     # Validate the label is a safe path component

{codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/cli/init_cmd.py RENAMED Viewed

@@ -173,12 +173,83 @@ def _prompt_mcp_config() -> str:
         click.echo(f"  Error: '{expanded}' does not exist. Try again.")
+def _detect_sourcegraph_in_mcp(
+    discovered: list[tuple[Path, list[str]]],
+    mcp_data: dict | None = None,
+) -> bool:
+    """Return True if any discovered MCP config contains a Sourcegraph server.
+    Checks server names for common Sourcegraph patterns (e.g.
+    ``sourcegraph``, ``sourcegraph-mcp-server``).
+    """
+    sg_names = {"sourcegraph", "sourcegraph-mcp-server"}
+    for _path, server_names in discovered:
+        for name in server_names:
+            if name.lower() in sg_names:
+                return True
+    if mcp_data:
+        for name in mcp_data.get("mcpServers", {}):
+            if name.lower() in sg_names:
+                return True
+    return False
+def _prompt_sourcegraph_token() -> str:
+    """Prompt for Sourcegraph access token, checking env var first."""
+    import os
+    env_token = os.environ.get("SOURCEGRAPH_TOKEN", "")
+    if env_token:
+        masked = env_token[:4] + "..." + env_token[-4:] if len(env_token) > 8 else "***"
+        click.echo(f"  Found SOURCEGRAPH_TOKEN in environment ({masked})")
+        if click.confirm("  Use this token?", default=True):
+            return env_token
+    return click.prompt("Sourcegraph access token")
+def _prompt_sourcegraph_url() -> str | None:
+    """Prompt for optional custom Sourcegraph instance URL."""
+    url = click.prompt(
+        "Sourcegraph URL (press Enter for sourcegraph.com)",
+        default="",
+        show_default=False,
+    )
+    return url if url else None
 def _goal_mcp(agents: list[str], name: str) -> _Result:
     """Goal 1: MCP comparison prompts."""
     agent = _prompt_agent(agents)
     model = _prompt_model()
-    mcp_path = _prompt_mcp_config()
+    # Check if Sourcegraph is available in discovered MCP configs
+    discovered = _discover_mcp_configs()
+    use_sourcegraph = False
+    if _detect_sourcegraph_in_mcp(discovered):
+        click.echo()
+        click.echo("Detected Sourcegraph MCP server in your configuration.")
+        click.echo("codeprobe can use the HTTP endpoint for better performance.")
+        use_sourcegraph = click.confirm("Use Sourcegraph HTTP MCP?", default=True)
+    else:
+        click.echo()
+        click.echo("Would you like to use Sourcegraph as the MCP server?")
+        use_sourcegraph = click.confirm("Use Sourcegraph?", default=False)
+    if use_sourcegraph:
+        token = _prompt_sourcegraph_token()
+        sg_url = _prompt_sourcegraph_url()
+        return ask_mcp_comparison(
+            experiment_name=name,
+            agent=agent,
+            model=model,
+            sourcegraph_token=token,
+            sourcegraph_url=sg_url,
+        )
+    # Fall back to generic MCP config path
+    mcp_path = _prompt_mcp_config()
     return ask_mcp_comparison(
         experiment_name=name,
         agent=agent,

{codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/cli/mine_cmd.py RENAMED Viewed

@@ -403,6 +403,41 @@ def _clear_tasks_dir(repo_path: Path) -> Path:
     return tasks_dir
+def _record_task_ids_in_experiment(repo_path: Path, task_ids: list[str]) -> None:
+    """Update the experiment's task_ids so ``run`` only executes these tasks.
+    If exactly one experiment exists under ``<repo>/.codeprobe/``, its
+    ``experiment.json`` is updated with the new task ID list.  When zero
+    or multiple experiments exist, this is a no-op (the user must scope
+    manually via ``--config``).
+    """
+    from codeprobe.core.experiment import load_experiment, save_experiment
+    from codeprobe.models.experiment import Experiment
+    codeprobe_dir = repo_path / ".codeprobe"
+    if not codeprobe_dir.is_dir():
+        return
+    candidates = sorted(
+        d
+        for d in codeprobe_dir.iterdir()
+        if d.is_dir() and (d / "experiment.json").is_file()
+    )
+    if len(candidates) != 1:
+        return
+    exp_dir = candidates[0]
+    experiment = load_experiment(exp_dir)
+    updated = Experiment(
+        name=experiment.name,
+        description=experiment.description,
+        configs=experiment.configs,
+        tasks_dir=experiment.tasks_dir,
+        task_ids=tuple(sorted(task_ids)),
+    )
+    save_experiment(exp_dir, updated)
 def _resolve_repo_path(path: str) -> Path:
     """Resolve a path or URL to a local repo directory."""
     if _is_git_url(path):
@@ -483,6 +518,8 @@ def run_mine(
     curate: bool = False,
     backends: tuple[str, ...] = (),
     verify_curation_flag: bool = False,
+    mcp_families: bool = False,
+    sg_repo: str = "",
 ) -> None:
     """Mine eval tasks from a repository."""
     from codeprobe.mining import mine_tasks, write_task_dir
@@ -518,6 +555,8 @@ def run_mine(
             curate=curate,
             backends=backends,
             verify_curation_flag=verify_curation_flag,
+            mcp_families=mcp_families,
+            sg_repo=sg_repo,
         )
         return
@@ -571,6 +610,8 @@ def run_mine(
     for task in tasks:
         write_task_dir(task, tasks_dir, repo_path)
+    _record_task_ids_in_experiment(repo_path, [t.id for t in tasks])
     _show_results_table(tasks)
     warnings = _quality_review(tasks, goal_name, bias)
@@ -603,6 +644,8 @@ def _run_org_scale_mine(
     curate: bool = False,
     backends: tuple[str, ...] = (),
     verify_curation_flag: bool = False,
+    mcp_families: bool = False,
+    sg_repo: str = "",
 ) -> None:
     """Mine org-scale comprehension tasks with oracle verification."""
     from codeprobe.mining.org_scale import mine_org_scale_tasks
@@ -631,12 +674,19 @@ def _run_org_scale_mine(
             click.echo("No families selected. Aborted.")
             return
+    # Default sg_repo from primary repo name if not explicitly provided
+    effective_sg_repo = sg_repo
+    if not effective_sg_repo and mcp_families:
+        effective_sg_repo = f"github.com/sg-evals/{repo_paths[0].name}"
     result = mine_org_scale_tasks(
         repo_paths,
         count=count,
         families=selected_families,
         no_llm=no_llm,
         scan_timeout=scan_timeout,
+        include_mcp_families=mcp_families,
+        sg_repo=effective_sg_repo,
     )
     if not result.tasks:
@@ -682,6 +732,8 @@ def _run_org_scale_mine(
             curation_backends=curation_backends_used,
         )
+    _record_task_ids_in_experiment(primary_repo, [t.id for t in curated_tasks])
     _show_org_scale_results(
         curated_tasks, tasks_dir, primary_repo, curation_backends_used
     )

{codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/cli/probe_cmd.py RENAMED Viewed

@@ -3,10 +3,13 @@
 from __future__ import annotations
 import json
+import logging
 from pathlib import Path
 import click
+logger = logging.getLogger(__name__)
 from codeprobe.probe.generator import DEFAULT_COUNT, MAX_PROBES, MIN_PROBES
@@ -76,7 +79,7 @@ def probe(
     output_dir = Path(output) if output else repo_root / "probes"
     effective_repo_name = repo_name or repo_root.name
-    click.echo(f"Scanning {repo_root} for symbols...", err=True)
+    logger.info("Scanning %s for symbols...", repo_root)
     probes = generate_probes(
         repo_root=repo_root,
         count=count,
@@ -85,12 +88,11 @@ def probe(
     )
     if not probes:
-        click.echo("No probes generated -- no suitable symbols found.", err=True)
+        logger.warning("No probes generated -- no suitable symbols found.")
         raise SystemExit(1)
-    click.echo(
-        f"Generated {len(probes)} probes, writing to {output_dir}...",
-        err=True,
+    logger.info(
+        "Generated %d probes, writing to %s...", len(probes), output_dir
     )
     created = write_probe_tasks(probes, output_dir, effective_repo_name)
@@ -108,9 +110,9 @@ def probe(
         }
         click.echo(json.dumps(summary, indent=2))
     else:
-        click.echo(f"Probe generation complete:", err=True)
-        click.echo(f"  Total probes: {len(probes)}", err=True)
+        logger.info("Probe generation complete:")
+        logger.info("  Total probes: %d", len(probes))
         for tpl_name, tpl_count in sorted(by_template.items()):
-            click.echo(f"  {tpl_name}: {tpl_count}", err=True)
-        click.echo(f"  Output: {output_dir}", err=True)
+            logger.info("  %s: %d", tpl_name, tpl_count)
+        logger.info("  Output: %s", output_dir)
         click.echo(f"Created {len(created)} probe tasks in {output_dir}")

{codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/cli/run_cmd.py RENAMED Viewed

@@ -22,6 +22,27 @@ def _on_task_complete(result: CompletedTask) -> None:
     click.echo(f"  {result.task_id}: {status} ({result.duration_seconds:.1f}s)")
+def _find_tasks(d: Path, *, task_ids: tuple[str, ...] = ()) -> list[Path]:
+    """Discover task subdirectories with instruction.md.
+    When *task_ids* is non-empty, only return tasks whose directory name
+    appears in that tuple.  This scopes task discovery to the current
+    experiment, preventing tasks from other experiments from leaking in.
+    """
+    if not d.is_dir():
+        return []
+    if task_ids:
+        allowed = set(task_ids)
+        return sorted(
+            sd
+            for sd in d.iterdir()
+            if sd.is_dir() and sd.name in allowed and (sd / "instruction.md").exists()
+        )
+    return sorted(
+        sd for sd in d.iterdir() if sd.is_dir() and (sd / "instruction.md").exists()
+    )
 def _print_dry_run(estimate: DryRunEstimate) -> None:
     """Pretty-print a DryRunEstimate to stdout."""
     cost_lo, cost_hi = estimate.estimated_cost_range
@@ -88,17 +109,9 @@ def run_eval(
     tasks_dir = exp_dir / experiment.tasks_dir
     repo_tasks = Path(path).resolve() / ".codeprobe" / experiment.tasks_dir
-    # Prefer whichever location actually has task subdirectories with instruction.md
-    def _find_tasks(d: Path) -> list[Path]:
-        if not d.is_dir():
-            return []
-        return sorted(
-            sd for sd in d.iterdir() if sd.is_dir() and (sd / "instruction.md").exists()
-        )
-    task_dirs = _find_tasks(tasks_dir)
+    task_dirs = _find_tasks(tasks_dir, task_ids=experiment.task_ids)
     if not task_dirs and repo_tasks != tasks_dir:
-        task_dirs = _find_tasks(repo_tasks)
+        task_dirs = _find_tasks(repo_tasks, task_ids=experiment.task_ids)
         if task_dirs:
             tasks_dir = repo_tasks

{codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/cli/wizard.py RENAMED Viewed

@@ -14,19 +14,68 @@ from codeprobe.models.experiment import ExperimentConfig
 _SAFE_NAME = re.compile(r"^[a-zA-Z0-9][a-zA-Z0-9._-]*$")
+_DEFAULT_SOURCEGRAPH_URL = "https://sourcegraph.com"
+def build_sourcegraph_mcp_config(
+    *,
+    token: str,
+    url: str = _DEFAULT_SOURCEGRAPH_URL,
+) -> dict:
+    """Build an HTTP MCP config dict for Sourcegraph.
+    Returns a ``{"mcpServers": {"sourcegraph": {...}}}`` dict suitable for
+    passing as ``mcp_config`` on an :class:`ExperimentConfig`.
+    """
+    base_url = url.rstrip("/")
+    return {
+        "mcpServers": {
+            "sourcegraph": {
+                "type": "http",
+                "url": f"{base_url}/.api/mcp/v1",
+                "headers": {"Authorization": f"token {token}"},
+            }
+        }
+    }
 def ask_mcp_comparison(
     *,
     experiment_name: str,
     agent: str,
     model: str | None,
-    mcp_config_path: str,
+    mcp_config_path: str | None = None,
+    sourcegraph_token: str | None = None,
+    sourcegraph_url: str | None = None,
 ) -> tuple[EvalrcConfig, list[ExperimentConfig]]:
-    """Goal 1: Compare baseline agent vs MCP-augmented agent."""
-    mcp_data = _load_json(mcp_config_path)
+    """Goal 1: Compare baseline agent vs MCP-augmented agent.
+    When *sourcegraph_token* is provided, generates an HTTP-based Sourcegraph
+    MCP config with an ``Authorization`` header and adds the ``sourcegraph``
+    preamble.  Otherwise falls back to loading the MCP config from
+    *mcp_config_path*.
+    """
+    if sourcegraph_token is not None:
+        mcp_data = build_sourcegraph_mcp_config(
+            token=sourcegraph_token,
+            url=sourcegraph_url or _DEFAULT_SOURCEGRAPH_URL,
+        )
+        preambles: tuple[str, ...] = ("sourcegraph",)
+    else:
+        if mcp_config_path is None:
+            raise click.BadParameter(
+                "Either sourcegraph_token or mcp_config_path must be provided."
+            )
+        mcp_data = _load_json(mcp_config_path)
+        preambles = ()
     baseline = ExperimentConfig(label="baseline", agent=agent, model=model)
     with_mcp = ExperimentConfig(
-        label="with-mcp", agent=agent, model=model, mcp_config=mcp_data
+        label="with-mcp",
+        agent=agent,
+        model=model,
+        mcp_config=mcp_data,
+        preambles=preambles,
     )
     evalrc = EvalrcConfig(name=experiment_name, agents=[agent])

codeprobe 0.1.6__tar.gz → 0.2.0__tar.gz

codeprobe 0.1.6__tar.gz → 0.2.0__tar.gz