PyPI - codeprobe - Versions diffs - 0.1.7__tar.gz → 0.2.0__tar.gz - Mend

codeprobe 0.1.7__tar.gz → 0.2.0__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (142) hide show

{codeprobe-0.1.7 → codeprobe-0.2.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: codeprobe
-Version: 0.1.7
+Version: 0.2.0
 Summary: Benchmark AI coding agents against your own codebase. Mine real tasks from repo history, run agents, interpret results.
 Author: codeprobe contributors
 License-Expression: Apache-2.0

{codeprobe-0.1.7 → codeprobe-0.2.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "codeprobe"
-version = "0.1.7"
+version = "0.2.0"
 description = "Benchmark AI coding agents against your own codebase. Mine real tasks from repo history, run agents, interpret results."
 readme = "README.md"
 license = "Apache-2.0"

{codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/__init__.py RENAMED Viewed

@@ -1,3 +1,3 @@
 """codeprobe — Benchmark AI coding agents against your own codebase."""
-__version__ = "0.1.7"
+__version__ = "0.2.0"

{codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/adapters/claude.py RENAMED Viewed

@@ -4,6 +4,7 @@ from __future__ import annotations
 import json
 import re
+import shutil
 import subprocess
 import tempfile
 from pathlib import Path
@@ -69,16 +70,31 @@ class ClaudeAdapter(BaseAdapter):
         mcp_path = self._write_mcp_config(config)
         if mcp_path:
-            cmd.extend(["--mcp-config", mcp_path])
+            cmd.extend(["--mcp-config", mcp_path, "--strict-mcp-config"])
         return cmd
     def isolate_session(self, slot_id: int) -> dict[str, str]:
-        """Return a per-slot CLAUDE_CONFIG_DIR for session isolation."""
+        """Return a per-slot CLAUDE_CONFIG_DIR for session isolation.
+        Copies authentication credentials from the real ``~/.claude/``
+        directory so the agent subprocess can authenticate.
+        """
         config_dir = (
             Path(tempfile.gettempdir()) / "codeprobe-claude" / f"slot-{slot_id}"
         )
         config_dir.mkdir(parents=True, exist_ok=True)
+        # Copy auth credentials from the user's real config dir.
+        # Without these the subprocess gets "Not logged in".
+        real_config = Path.home() / ".claude"
+        if real_config.is_dir():
+            for name in ("credentials.json", ".credentials.json"):
+                src = real_config / name
+                dst = config_dir / name
+                if src.is_file():
+                    shutil.copy2(src, dst)
         return {"CLAUDE_CONFIG_DIR": str(config_dir)}
     def parse_output(

{codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/cli/__init__.py RENAMED Viewed

@@ -1,18 +1,89 @@
 """CLI entry point for codeprobe."""
+import json as _json
+import logging
+import sys
 import click
 from codeprobe import __version__
+class _JsonFormatter(logging.Formatter):
+    """Emit one JSON object per log line."""
+    def format(self, record: logging.LogRecord) -> str:
+        payload = {
+            "level": record.levelname,
+            "logger": record.name,
+            "message": record.getMessage(),
+            "timestamp": self.formatTime(record, "%Y-%m-%dT%H:%M:%S%z"),
+        }
+        return _json.dumps(payload)
+def _configure_logging(verbose: int, quiet: bool, log_format: str = "text") -> None:
+    """Configure namespace-scoped logging for codeprobe.* modules.
+    Attaches a StreamHandler to `logging.getLogger("codeprobe")` so that
+    all 26+ codeprobe.* modules emit through hierarchy without touching
+    third-party loggers (httpx, urllib3, etc.).
+    """
+    if quiet:
+        level = logging.WARNING
+    elif verbose >= 1:
+        level = logging.DEBUG
+    else:
+        level = logging.INFO
+    logger = logging.getLogger("codeprobe")
+    logger.setLevel(level)
+    logger.propagate = False  # don't bubble to root
+    # Idempotent: tests / repeat invocations must not duplicate handlers.
+    for h in list(logger.handlers):
+        logger.removeHandler(h)
+    handler = logging.StreamHandler(sys.stderr)
+    if log_format == "json":
+        handler.setFormatter(_JsonFormatter())
+    elif verbose >= 1:
+        fmt = "%(levelname)s %(name)s: %(message)s"
+        handler.setFormatter(logging.Formatter(fmt))
+    else:
+        fmt = "%(levelname)s: %(message)s"
+        handler.setFormatter(logging.Formatter(fmt))
+    logger.addHandler(handler)
 @click.group()
+@click.option(
+    "-v",
+    "--verbose",
+    count=True,
+    help="Increase log verbosity (-v sets DEBUG).",
+)
+@click.option(
+    "-q",
+    "--quiet",
+    is_flag=True,
+    default=False,
+    help="Suppress INFO logs (WARNING and above only).",
+)
+@click.option(
+    "--log-format",
+    type=click.Choice(["text", "json"]),
+    default="text",
+    help="Log output format (default: text). 'json' emits one JSON object per line.",
+)
 @click.version_option(version=__version__, prog_name="codeprobe")
-def main() -> None:
+def main(verbose: int, quiet: bool, log_format: str) -> None:
     """Benchmark AI coding agents against your own codebase.
     Mine real tasks from your repo history, run agents against them,
     and interpret the results to find which setup works best for YOUR code.
     """
+    _configure_logging(verbose=verbose, quiet=quiet, log_format=log_format)
 @main.command()
@@ -121,6 +192,20 @@ def init(path: str) -> None:
     default=False,
     help="Run LLM verification on curated ground truth.",
 )
+@click.option(
+    "--mcp-families",
+    is_flag=True,
+    default=False,
+    help="Include MCP-advantaged task families (symbol-reference-trace, "
+    "type-hierarchy-consumers, change-scope-audit). Only with --org-scale.",
+)
+@click.option(
+    "--sg-repo",
+    default="",
+    help="Sourcegraph repo identifier for ground truth enrichment "
+    "(e.g. github.com/sg-evals/numpy). Defaults to github.com/sg-evals/{repo_name} "
+    "when --mcp-families is used. Requires SOURCEGRAPH_TOKEN env var.",
+)
 def mine(
     path: str,
     count: int,
@@ -139,6 +224,8 @@ def mine(
     curate: bool,
     backends: tuple[str, ...],
     verify_curation_flag: bool,
+    mcp_families: bool,
+    sg_repo: str,
 ) -> None:
     """Mine eval tasks from a repository's history.
@@ -175,6 +262,8 @@ def mine(
         curate=curate,
         backends=backends,
         verify_curation_flag=verify_curation_flag,
+        mcp_families=mcp_families,
+        sg_repo=sg_repo,
     )
@@ -272,6 +361,20 @@ def init_experiment(path: str, name: str, description: str) -> None:
 @click.option(
     "--mcp-config", default=None, help="MCP config as JSON string or file path."
 )
+@click.option(
+    "--instruction-variant",
+    default=None,
+    help="Instruction file variant (e.g., instruction_mcp.md). Default: instruction.md.",
+)
+@click.option(
+    "--preamble",
+    "preambles",
+    multiple=True,
+    help=(
+        "Preamble to prepend to the instruction. Repeatable. "
+        "Built-ins: sourcegraph, github. Or path to a custom .md file."
+    ),
+)
 def add_config(
     path: str,
     label: str,
@@ -279,6 +382,8 @@ def add_config(
     model: str | None,
     permission_mode: str,
     mcp_config: str | None,
+    instruction_variant: str | None,
+    preambles: tuple[str, ...],
 ) -> None:
     """Add a configuration to an existing experiment."""
     from codeprobe.cli.experiment_cmd import experiment_add_config
@@ -290,6 +395,8 @@ def add_config(
         model=model,
         permission_mode=permission_mode,
         mcp_config_str=mcp_config,
+        instruction_variant=instruction_variant,
+        preambles=preambles,
     )

{codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/cli/experiment_cmd.py RENAMED Viewed

@@ -63,6 +63,8 @@ def experiment_add_config(
     model: str | None,
     permission_mode: str,
     mcp_config_str: str | None,
+    instruction_variant: str | None = None,
+    preambles: tuple[str, ...] = (),
 ) -> None:
     """Add a configuration to an existing experiment."""
     exp_dir = Path(path)
@@ -104,6 +106,8 @@ def experiment_add_config(
         model=model,
         permission_mode=permission_mode,
         mcp_config=mcp_config,
+        instruction_variant=instruction_variant,
+        preambles=preambles,
     )
     # Validate the label is a safe path component

{codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/cli/init_cmd.py RENAMED Viewed

@@ -173,12 +173,83 @@ def _prompt_mcp_config() -> str:
         click.echo(f"  Error: '{expanded}' does not exist. Try again.")
+def _detect_sourcegraph_in_mcp(
+    discovered: list[tuple[Path, list[str]]],
+    mcp_data: dict | None = None,
+) -> bool:
+    """Return True if any discovered MCP config contains a Sourcegraph server.
+    Checks server names for common Sourcegraph patterns (e.g.
+    ``sourcegraph``, ``sourcegraph-mcp-server``).
+    """
+    sg_names = {"sourcegraph", "sourcegraph-mcp-server"}
+    for _path, server_names in discovered:
+        for name in server_names:
+            if name.lower() in sg_names:
+                return True
+    if mcp_data:
+        for name in mcp_data.get("mcpServers", {}):
+            if name.lower() in sg_names:
+                return True
+    return False
+def _prompt_sourcegraph_token() -> str:
+    """Prompt for Sourcegraph access token, checking env var first."""
+    import os
+    env_token = os.environ.get("SOURCEGRAPH_TOKEN", "")
+    if env_token:
+        masked = env_token[:4] + "..." + env_token[-4:] if len(env_token) > 8 else "***"
+        click.echo(f"  Found SOURCEGRAPH_TOKEN in environment ({masked})")
+        if click.confirm("  Use this token?", default=True):
+            return env_token
+    return click.prompt("Sourcegraph access token")
+def _prompt_sourcegraph_url() -> str | None:
+    """Prompt for optional custom Sourcegraph instance URL."""
+    url = click.prompt(
+        "Sourcegraph URL (press Enter for sourcegraph.com)",
+        default="",
+        show_default=False,
+    )
+    return url if url else None
 def _goal_mcp(agents: list[str], name: str) -> _Result:
     """Goal 1: MCP comparison prompts."""
     agent = _prompt_agent(agents)
     model = _prompt_model()
-    mcp_path = _prompt_mcp_config()
+    # Check if Sourcegraph is available in discovered MCP configs
+    discovered = _discover_mcp_configs()
+    use_sourcegraph = False
+    if _detect_sourcegraph_in_mcp(discovered):
+        click.echo()
+        click.echo("Detected Sourcegraph MCP server in your configuration.")
+        click.echo("codeprobe can use the HTTP endpoint for better performance.")
+        use_sourcegraph = click.confirm("Use Sourcegraph HTTP MCP?", default=True)
+    else:
+        click.echo()
+        click.echo("Would you like to use Sourcegraph as the MCP server?")
+        use_sourcegraph = click.confirm("Use Sourcegraph?", default=False)
+    if use_sourcegraph:
+        token = _prompt_sourcegraph_token()
+        sg_url = _prompt_sourcegraph_url()
+        return ask_mcp_comparison(
+            experiment_name=name,
+            agent=agent,
+            model=model,
+            sourcegraph_token=token,
+            sourcegraph_url=sg_url,
+        )
+    # Fall back to generic MCP config path
+    mcp_path = _prompt_mcp_config()
     return ask_mcp_comparison(
         experiment_name=name,
         agent=agent,

{codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/cli/mine_cmd.py RENAMED Viewed

@@ -518,6 +518,8 @@ def run_mine(
     curate: bool = False,
     backends: tuple[str, ...] = (),
     verify_curation_flag: bool = False,
+    mcp_families: bool = False,
+    sg_repo: str = "",
 ) -> None:
     """Mine eval tasks from a repository."""
     from codeprobe.mining import mine_tasks, write_task_dir
@@ -553,6 +555,8 @@ def run_mine(
             curate=curate,
             backends=backends,
             verify_curation_flag=verify_curation_flag,
+            mcp_families=mcp_families,
+            sg_repo=sg_repo,
         )
         return
@@ -640,6 +644,8 @@ def _run_org_scale_mine(
     curate: bool = False,
     backends: tuple[str, ...] = (),
     verify_curation_flag: bool = False,
+    mcp_families: bool = False,
+    sg_repo: str = "",
 ) -> None:
     """Mine org-scale comprehension tasks with oracle verification."""
     from codeprobe.mining.org_scale import mine_org_scale_tasks
@@ -668,12 +674,19 @@ def _run_org_scale_mine(
             click.echo("No families selected. Aborted.")
             return
+    # Default sg_repo from primary repo name if not explicitly provided
+    effective_sg_repo = sg_repo
+    if not effective_sg_repo and mcp_families:
+        effective_sg_repo = f"github.com/sg-evals/{repo_paths[0].name}"
     result = mine_org_scale_tasks(
         repo_paths,
         count=count,
         families=selected_families,
         no_llm=no_llm,
         scan_timeout=scan_timeout,
+        include_mcp_families=mcp_families,
+        sg_repo=effective_sg_repo,
     )
     if not result.tasks:

{codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/cli/probe_cmd.py RENAMED Viewed

@@ -3,10 +3,13 @@
 from __future__ import annotations
 import json
+import logging
 from pathlib import Path
 import click
+logger = logging.getLogger(__name__)
 from codeprobe.probe.generator import DEFAULT_COUNT, MAX_PROBES, MIN_PROBES
@@ -76,7 +79,7 @@ def probe(
     output_dir = Path(output) if output else repo_root / "probes"
     effective_repo_name = repo_name or repo_root.name
-    click.echo(f"Scanning {repo_root} for symbols...", err=True)
+    logger.info("Scanning %s for symbols...", repo_root)
     probes = generate_probes(
         repo_root=repo_root,
         count=count,
@@ -85,12 +88,11 @@ def probe(
     )
     if not probes:
-        click.echo("No probes generated -- no suitable symbols found.", err=True)
+        logger.warning("No probes generated -- no suitable symbols found.")
         raise SystemExit(1)
-    click.echo(
-        f"Generated {len(probes)} probes, writing to {output_dir}...",
-        err=True,
+    logger.info(
+        "Generated %d probes, writing to %s...", len(probes), output_dir
     )
     created = write_probe_tasks(probes, output_dir, effective_repo_name)
@@ -108,9 +110,9 @@ def probe(
         }
         click.echo(json.dumps(summary, indent=2))
     else:
-        click.echo(f"Probe generation complete:", err=True)
-        click.echo(f"  Total probes: {len(probes)}", err=True)
+        logger.info("Probe generation complete:")
+        logger.info("  Total probes: %d", len(probes))
         for tpl_name, tpl_count in sorted(by_template.items()):
-            click.echo(f"  {tpl_name}: {tpl_count}", err=True)
-        click.echo(f"  Output: {output_dir}", err=True)
+            logger.info("  %s: %d", tpl_name, tpl_count)
+        logger.info("  Output: %s", output_dir)
         click.echo(f"Created {len(created)} probe tasks in {output_dir}")

{codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/cli/wizard.py RENAMED Viewed

@@ -14,19 +14,68 @@ from codeprobe.models.experiment import ExperimentConfig
 _SAFE_NAME = re.compile(r"^[a-zA-Z0-9][a-zA-Z0-9._-]*$")
+_DEFAULT_SOURCEGRAPH_URL = "https://sourcegraph.com"
+def build_sourcegraph_mcp_config(
+    *,
+    token: str,
+    url: str = _DEFAULT_SOURCEGRAPH_URL,
+) -> dict:
+    """Build an HTTP MCP config dict for Sourcegraph.
+    Returns a ``{"mcpServers": {"sourcegraph": {...}}}`` dict suitable for
+    passing as ``mcp_config`` on an :class:`ExperimentConfig`.
+    """
+    base_url = url.rstrip("/")
+    return {
+        "mcpServers": {
+            "sourcegraph": {
+                "type": "http",
+                "url": f"{base_url}/.api/mcp/v1",
+                "headers": {"Authorization": f"token {token}"},
+            }
+        }
+    }
 def ask_mcp_comparison(
     *,
     experiment_name: str,
     agent: str,
     model: str | None,
-    mcp_config_path: str,
+    mcp_config_path: str | None = None,
+    sourcegraph_token: str | None = None,
+    sourcegraph_url: str | None = None,
 ) -> tuple[EvalrcConfig, list[ExperimentConfig]]:
-    """Goal 1: Compare baseline agent vs MCP-augmented agent."""
-    mcp_data = _load_json(mcp_config_path)
+    """Goal 1: Compare baseline agent vs MCP-augmented agent.
+    When *sourcegraph_token* is provided, generates an HTTP-based Sourcegraph
+    MCP config with an ``Authorization`` header and adds the ``sourcegraph``
+    preamble.  Otherwise falls back to loading the MCP config from
+    *mcp_config_path*.
+    """
+    if sourcegraph_token is not None:
+        mcp_data = build_sourcegraph_mcp_config(
+            token=sourcegraph_token,
+            url=sourcegraph_url or _DEFAULT_SOURCEGRAPH_URL,
+        )
+        preambles: tuple[str, ...] = ("sourcegraph",)
+    else:
+        if mcp_config_path is None:
+            raise click.BadParameter(
+                "Either sourcegraph_token or mcp_config_path must be provided."
+            )
+        mcp_data = _load_json(mcp_config_path)
+        preambles = ()
     baseline = ExperimentConfig(label="baseline", agent=agent, model=model)
     with_mcp = ExperimentConfig(
-        label="with-mcp", agent=agent, model=model, mcp_config=mcp_data
+        label="with-mcp",
+        agent=agent,
+        model=model,
+        mcp_config=mcp_data,
+        preambles=preambles,
     )
     evalrc = EvalrcConfig(name=experiment_name, agents=[agent])

{codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/core/checkpoint.py RENAMED Viewed

@@ -9,6 +9,7 @@ from __future__ import annotations
 import json
 import logging
 import sqlite3
+import time
 from dataclasses import asdict
 from datetime import datetime, timezone
 from pathlib import Path
@@ -173,31 +174,47 @@ class CheckpointStore:
     def _open(self, db_path: Path) -> sqlite3.Connection:
         """Open (or create) the SQLite database with WAL mode.
-        If the file is corrupt, it is removed and recreated.
-        Automatically migrates the schema for older databases.
+        Retries on transient ``OperationalError`` (lock contention during
+        concurrent creation).  Only removes and recreates the file on
+        non-transient ``DatabaseError`` (genuine corruption).
         """
         db_path.parent.mkdir(parents=True, exist_ok=True)
-        try:
-            conn = sqlite3.connect(str(db_path), timeout=10)
-            conn.execute("PRAGMA journal_mode=WAL")
-            conn.execute(_CREATE_TABLE)
-            self._migrate_schema(conn)
-            conn.commit()
-            return conn
-        except sqlite3.DatabaseError:
-            logger.warning(
-                "Corrupt checkpoint DB at %s — removing and recreating", db_path
-            )
-            db_path.unlink(missing_ok=True)
-            # Also remove WAL/SHM sidecar files
-            db_path.with_suffix(".db-wal").unlink(missing_ok=True)
-            db_path.with_suffix(".db-shm").unlink(missing_ok=True)
-            conn = sqlite3.connect(str(db_path), timeout=10)
-            conn.execute("PRAGMA journal_mode=WAL")
-            conn.execute(_CREATE_TABLE)
-            self._migrate_schema(conn)
-            conn.commit()
-            return conn
+        last_err: Exception | None = None
+        for attempt in range(4):
+            try:
+                conn = sqlite3.connect(str(db_path), timeout=10)
+                conn.execute("PRAGMA journal_mode=WAL")
+                conn.execute(_CREATE_TABLE)
+                self._migrate_schema(conn)
+                conn.commit()
+                return conn
+            except sqlite3.OperationalError as exc:
+                # Transient lock contention — retry with backoff
+                last_err = exc
+                logger.debug(
+                    "Checkpoint DB busy at %s (attempt %d): %s",
+                    db_path,
+                    attempt + 1,
+                    exc,
+                )
+                time.sleep(0.1 * (2**attempt))
+            except sqlite3.DatabaseError:
+                logger.warning(
+                    "Corrupt checkpoint DB at %s — removing and recreating",
+                    db_path,
+                )
+                db_path.unlink(missing_ok=True)
+                db_path.with_suffix(".db-wal").unlink(missing_ok=True)
+                db_path.with_suffix(".db-shm").unlink(missing_ok=True)
+                conn = sqlite3.connect(str(db_path), timeout=10)
+                conn.execute("PRAGMA journal_mode=WAL")
+                conn.execute(_CREATE_TABLE)
+                self._migrate_schema(conn)
+                conn.commit()
+                return conn
+        raise sqlite3.OperationalError(
+            f"Could not open checkpoint DB at {db_path} after 4 attempts: {last_err}"
+        )
     @staticmethod
     def _migrate_schema(conn: sqlite3.Connection) -> None:

{codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/core/executor.py RENAMED Viewed

@@ -166,21 +166,25 @@ def execute_task(
     """
     task_id = task_dir.name
+    # Load task metadata once — used for reward_type auto-detection and
+    # preamble context (e.g. sg_repo for Sourcegraph preamble).
+    _task_meta: dict = {}
+    meta_path = task_dir / "metadata.json"
+    if meta_path.is_file():
+        try:
+            import json as _json
+            _task_meta = _json.loads(meta_path.read_text(encoding="utf-8"))
+        except (ValueError, OSError):
+            pass
     # Auto-detect reward_type from task metadata when caller uses default.
     # Oracle tasks (org-scale) need "continuous" scoring to read reward.txt;
     # the default "binary" would score exit-code-only and always pass.
     if reward_type == "binary":
-        meta_path = task_dir / "metadata.json"
-        if meta_path.is_file():
-            try:
-                import json as _json
-                meta = _json.loads(meta_path.read_text(encoding="utf-8"))
-                task_rt = (meta.get("verification") or {}).get("reward_type")
-                if task_rt and task_rt != "binary":
-                    reward_type = task_rt
-            except (ValueError, OSError):
-                pass  # Stick with caller's default
+        task_rt = (_task_meta.get("verification") or {}).get("reward_type")
+        if task_rt and task_rt != "binary":
+            reward_type = task_rt
     def _error_result(error: str, error_category: str | None = None) -> TaskResult:
         return TaskResult(
@@ -206,6 +210,12 @@ def execute_task(
         )
     if preamble_names and preamble_resolver is not None:
+        # Build extra context from task metadata for preamble templates
+        extra_ctx: dict[str, str] = {}
+        sg_repo = (_task_meta.get("metadata") or {}).get("sg_repo", "")
+        if sg_repo:
+            extra_ctx["sg_repo"] = sg_repo
         try:
             prompt, resolved_preambles = compose_instruction(
                 instruction,
@@ -214,6 +224,7 @@ def execute_task(
                 resolver=preamble_resolver,
                 task_id=task_id,
                 worktree_path=worktree_path,
+                extra_context=extra_ctx or None,
             )
         except (FileNotFoundError, ValueError) as exc:
             return _error_result(f"Preamble resolution failed: {exc}")
@@ -325,7 +336,7 @@ def _git_reset_workdir(repo_path: Path) -> None:
             capture_output=True,
         )
         subprocess.run(
-            ["git", "clean", "-fd"],
+            ["git", "clean", "-fd", "-e", ".codeprobe", "-e", ".codeprobe-worktrees"],
             cwd=repo_path,
             check=True,
             capture_output=True,
@@ -448,6 +459,15 @@ def execute_config(
     """
     checkpointed_ids, results = _restore_checkpointed(checkpoint_store)
+    # Filter checkpointed results to only include tasks in the current
+    # experiment.  Without this, stale entries from prior runs with different
+    # task_ids leak into the results list and inflate/deflate scores.
+    current_task_ids = {d.name for d in task_dirs}
+    checkpointed_ids = {
+        (tid, ri) for tid, ri in checkpointed_ids if tid in current_task_ids
+    }
+    results = [r for r in results if r.task_id in current_task_ids]
     # Build expanded work items: (task_dir, repeat_index) for all repeats
     all_work: list[tuple[Path, int]] = [
         (d, ri) for d in task_dirs for ri in range(repeats)

codeprobe 0.1.7__tar.gz → 0.2.0__tar.gz

codeprobe 0.1.7__tar.gz → 0.2.0__tar.gz