PyPI - codeprobe - Versions diffs - 0.5.2__tar.gz → 0.5.4__tar.gz - Mend

codeprobe 0.5.2__tar.gz → 0.5.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (226) hide show

{codeprobe-0.5.2 → codeprobe-0.5.4}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: codeprobe
-Version: 0.5.2
+Version: 0.5.4
 Summary: Benchmark AI coding agents against your own codebase. Mine real tasks from repo history, run agents, interpret results.
 Author: codeprobe contributors
 License-Expression: Apache-2.0

{codeprobe-0.5.2 → codeprobe-0.5.4}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "codeprobe"
-version = "0.5.2"
+version = "0.5.4"
 description = "Benchmark AI coding agents against your own codebase. Mine real tasks from repo history, run agents, interpret results."
 readme = "README.md"
 license = "Apache-2.0"

{codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/adapters/claude.py RENAMED Viewed

@@ -243,7 +243,13 @@ class ClaudeAdapter(BaseAdapter):
     def build_command(self, prompt: str, config: AgentConfig) -> list[str]:
         binary = self._require_binary()
-        cmd = [binary, "-p", prompt, "--output-format", "json"]
+        # stream-json + --verbose emits newline-delimited events including
+        # every assistant message (with tool_use content blocks) and ends
+        # with a ``type: "result"`` event mirroring the ``json`` envelope.
+        # This is what gives us accurate per-run tool_call_count and
+        # per-tool observability; the collector reconstructs the envelope
+        # from the terminal event.
+        cmd = [binary, "-p", prompt, "--output-format", "stream-json", "--verbose"]
         if config.model:
             cmd.extend(["--model", _normalize_model_for_cli(config.model)])
@@ -262,6 +268,27 @@ class ClaudeAdapter(BaseAdapter):
         if mcp_path:
             cmd.extend(["--mcp-config", mcp_path, "--strict-mcp-config"])
+        # Tool restrictions. Claude CLI has three related flags:
+        #   --tools ""            disables all built-in tools
+        #   --allowedTools X,Y    auto-approves these tools (no permission
+        #                         prompt); names may include MCP tools as
+        #                         ``mcp__<server>__<tool>``
+        #   --disallowedTools X,Y blocks these tools outright
+        # We treat ``allowed_tools`` as a whitelist: when set, built-ins
+        # are disabled (``--tools ""``) and listed names are auto-approved
+        # (``--allowedTools``). This yields true MCP-only runs when the
+        # whitelist contains only ``mcp__*`` names — verified against
+        # claude 2.1.x: without auto-approval the agent hits permission
+        # prompts and ends the turn early.
+        if config.allowed_tools is not None:
+            cmd.extend(["--tools", ""])
+            if config.allowed_tools:
+                cmd.extend(["--allowedTools", ",".join(config.allowed_tools)])
+        if config.disallowed_tools:
+            cmd.extend(
+                ["--disallowedTools", ",".join(config.disallowed_tools)]
+            )
         return cmd
     def isolate_session(self, slot_id: int) -> dict[str, str]:
@@ -290,15 +317,35 @@ class ClaudeAdapter(BaseAdapter):
         return {}
     def parse_output(self, result: subprocess.CompletedProcess[str], duration: float) -> AgentOutput:
-        """Parse Claude CLI JSON envelope into AgentOutput."""
+        """Parse Claude CLI JSON envelope into AgentOutput.
+        Handles both ``--output-format json`` (single envelope) and
+        ``--output-format stream-json --verbose`` (newline-delimited
+        events) — the collector auto-detects. When parsing a stream, the
+        final ``type: "result"`` event carries the same fields as the
+        single-envelope shape, so we reconstruct ``result`` text from it.
+        """
         usage = self._collector.collect(result.stdout)
-        # Extract content text from the JSON envelope
+        # Extract content text. For stream-json, the terminal result event
+        # has a ``result`` field; iterate events to find it. For single
+        # envelope, json.loads works directly.
+        stdout_text = result.stdout
         try:
             envelope = json.loads(result.stdout)
             stdout_text = envelope.get("result", result.stdout)
         except (json.JSONDecodeError, ValueError):
-            stdout_text = result.stdout
+            for line in reversed(result.stdout.splitlines()):
+                line = line.strip()
+                if not line:
+                    continue
+                try:
+                    ev = json.loads(line)
+                except (json.JSONDecodeError, ValueError):
+                    continue
+                if isinstance(ev, dict) and ev.get("type") == "result":
+                    stdout_text = ev.get("result", result.stdout)
+                    break
         return AgentOutput(
             stdout=stdout_text,
@@ -313,4 +360,5 @@ class ClaudeAdapter(BaseAdapter):
             cost_source=usage.cost_source,
             error=usage.error,
             tool_call_count=usage.tool_call_count,
+            tool_use_by_name=usage.tool_use_by_name,
         )

{codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/adapters/protocol.py RENAMED Viewed

@@ -45,6 +45,9 @@ class AgentOutput:
     error: str | None = None
     cost_source: str = "unavailable"
     tool_call_count: int | None = None
+    # Per-tool usage counts (e.g. {"Read": 5, "mcp__sourcegraph__...": 2}).
+    # None when the adapter couldn't capture a streaming transcript.
+    tool_use_by_name: dict[str, int] | None = None
     def __post_init__(self) -> None:
         if self.cost_model not in ALLOWED_COST_MODELS:
@@ -63,12 +66,22 @@ class AgentOutput:
 @dataclass(frozen=True)
 class AgentConfig:
-    """Configuration passed to an agent adapter."""
+    """Configuration passed to an agent adapter.
+    ``allowed_tools`` / ``disallowed_tools`` restrict which tools the agent
+    may call. When both are ``None`` the adapter uses its default tool set.
+    When ``allowed_tools`` is an empty list, the adapter disables all
+    built-in tools (useful for MCP-only experiments: MCP tools are still
+    available because they come from ``mcp_config``, but no built-in
+    ``Read``/``Grep``/``Bash``/etc. are).
+    """
     model: str | None = None
     permission_mode: str = "default"
     timeout_seconds: int = 3600
     mcp_config: dict | None = None
+    allowed_tools: list[str] | None = None
+    disallowed_tools: list[str] | None = None
     extra: dict | None = None
     cwd: str | None = None

{codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/adapters/telemetry.py RENAMED Viewed

@@ -66,6 +66,11 @@ class UsageData:
     cost_source: str = "unavailable"
     error: str | None = None
     tool_call_count: int | None = None
+    # Tool-use counts broken down by tool name (e.g. ``{"Read": 5,
+    # "mcp__sourcegraph__keyword_search": 2}``). Populated only when the
+    # adapter captured a streaming transcript. None means "not captured",
+    # not "no tool calls".
+    tool_use_by_name: dict[str, int] | None = None
     def __post_init__(self) -> None:
         if self.cost_model not in ALLOWED_COST_MODELS:
@@ -145,6 +150,45 @@ def _count_tool_use_blocks(envelope: dict[str, Any]) -> int | None:
     return count
+def _parse_stream_json(raw_output: str) -> tuple[dict[str, Any] | None, int, dict[str, int]]:
+    """Parse a ``--output-format stream-json --verbose`` transcript.
+    Returns ``(result_event, tool_use_count, tool_use_by_name)``.
+    ``result_event`` is the final ``type: "result"`` event (same shape as
+    ``--output-format json`` envelope), or None when the stream is
+    malformed or has no terminal event. ``tool_use_by_name`` aggregates
+    tool-use block counts by tool name (including MCP tools, which appear
+    as ``mcp__<server>__<tool>``), useful for observability.
+    """
+    result_event: dict[str, Any] | None = None
+    tool_use_count = 0
+    by_name: dict[str, int] = {}
+    for line in raw_output.splitlines():
+        line = line.strip()
+        if not line:
+            continue
+        try:
+            ev = json.loads(line)
+        except (json.JSONDecodeError, ValueError):
+            continue
+        if not isinstance(ev, dict):
+            continue
+        if ev.get("type") == "assistant":
+            msg = ev.get("message")
+            if isinstance(msg, dict):
+                for block in msg.get("content", []) or []:
+                    if not isinstance(block, dict):
+                        continue
+                    if block.get("type") == "tool_use":
+                        tool_use_count += 1
+                        name = block.get("name", "")
+                        if isinstance(name, str) and name:
+                            by_name[name] = by_name.get(name, 0) + 1
+        if ev.get("type") == "result":
+            result_event = ev
+    return result_event, tool_use_count, by_name
 class JsonStdoutCollector:
     """Extract telemetry from Claude CLI JSON envelope on stdout.
@@ -162,10 +206,40 @@ class JsonStdoutCollector:
     """
     def collect(self, raw_output: str, **context: Any) -> UsageData:
-        try:
-            envelope = json.loads(raw_output)
-        except (json.JSONDecodeError, ValueError) as exc:
-            return UsageData(error=f"JSON parse failed: {exc}")
+        # Two accepted shapes:
+        #   1. ``--output-format json`` — a single JSON envelope; no
+        #      per-tool-use trace, so tool_call_count stays None.
+        #   2. ``--output-format stream-json --verbose`` — newline-delimited
+        #      events ending in a ``type: "result"`` event that mirrors
+        #      shape (1). We also count ``tool_use`` blocks across all
+        #      ``assistant`` events for accurate tool_call_count.
+        stream_tool_count: int | None = None
+        stream_tool_by_name: dict[str, int] = {}
+        trimmed = raw_output.lstrip()
+        if trimmed.startswith("{\n") or trimmed.startswith("{"):
+            # Try single-envelope path first — most adapters still use
+            # ``--output-format json``.
+            try:
+                envelope = json.loads(raw_output)
+                if envelope.get("type") == "result" and "\n" in raw_output.rstrip():
+                    # Ambiguous: looks like a single-line event from the
+                    # stream. Fall through to stream parsing below.
+                    raise ValueError("ambiguous envelope — retry as stream")
+            except (json.JSONDecodeError, ValueError):
+                envelope = None
+        else:
+            envelope = None
+        if envelope is None:
+            result_ev, stream_tool_count, stream_tool_by_name = _parse_stream_json(
+                raw_output
+            )
+            if result_ev is None:
+                return UsageData(
+                    error="JSON parse failed: output is neither a valid "
+                    "envelope nor a stream-json transcript ending in a "
+                    "'result' event"
+                )
+            envelope = result_ev
         usage = envelope.get("usage")
         if usage is None:
@@ -197,7 +271,13 @@ class JsonStdoutCollector:
             cost_model = "unknown"
             cost_source = "unavailable"
-        tool_call_count = _count_tool_use_blocks(envelope)
+        # Prefer stream-json count when the transcript was streamed — it's
+        # always present and accurate. Fall back to the envelope's
+        # ``messages`` array (when some future CLI flag surfaces it), else
+        # stays None.
+        tool_call_count = stream_tool_count
+        if tool_call_count is None:
+            tool_call_count = _count_tool_use_blocks(envelope)
         return UsageData(
             input_tokens=input_tokens,
@@ -207,6 +287,7 @@ class JsonStdoutCollector:
             cost_model=cost_model,
             cost_source=cost_source,
             tool_call_count=tool_call_count,
+            tool_use_by_name=stream_tool_by_name or None,
             error=envelope_error,
         )

{codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/analysis/stats.py RENAMED Viewed

@@ -583,7 +583,31 @@ def compare_configs(
     elif speed_diff > 0:
         parts.append(f"{speed_diff:.1f}s slower")
-    summary = f"{a.label} vs {b.label}: {', '.join(parts)} " f"\u2192 {winner} wins"
+    # Soften the verdict when the effect is negligible or the test is
+    # underpowered, so we don't confidently declare a "winner" on what may
+    # be noise. Thresholds:
+    #   Cohen's d: |d| < 0.2 is "negligible" (Cohen 1988).
+    #   Cliff's delta: |delta| < 0.147 is "negligible" (Romano et al. 2006).
+    #   p-value > 0.05: not significant at the conventional threshold.
+    scores_tied = abs(score_diff) < 0.01
+    negligible_threshold = 0.2 if eff_method == "cohens_d" else 0.147
+    small_effect = (
+        eff_size is not None and abs(eff_size) < negligible_threshold
+    )
+    not_significant = p_val is not None and p_val > 0.05
+    if scores_tied:
+        verdict = "effectively tied"
+    elif small_effect and not_significant:
+        verdict = f"{winner} nominally ahead (not significant; small effect)"
+    elif small_effect:
+        verdict = f"{winner} nominally ahead (small effect size)"
+    elif not_significant:
+        verdict = f"{winner} nominally ahead (not significant at p=0.05)"
+    else:
+        verdict = f"{winner} wins"
+    summary = f"{a.label} vs {b.label}: {', '.join(parts)} \u2192 {verdict}"
     return PairwiseComparison(
         config_a=a.label,

{codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/api.py RENAMED Viewed

@@ -151,6 +151,8 @@ def run_experiment(
             permission_mode=perm,
             timeout_seconds=timeout,
             mcp_config=exp_config.mcp_config,
+            allowed_tools=exp_config.allowed_tools,
+            disallowed_tools=exp_config.disallowed_tools,
             cwd=str(experiment_dir.resolve()),
         )

{codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/cli/__init__.py RENAMED Viewed

@@ -794,6 +794,23 @@ def init_experiment(
         "Built-ins: sourcegraph, github. Or path to a custom .md file."
     ),
 )
+@click.option(
+    "--allowed-tools",
+    default=None,
+    help=(
+        "Restrict the agent to this comma-separated list of built-in "
+        "tool names (e.g. 'Read,Grep'). Pass an empty string ('') to "
+        "disable all built-in tools for an MCP-only comparison."
+    ),
+)
+@click.option(
+    "--disallowed-tools",
+    default=None,
+    help=(
+        "Block the agent from these comma-separated built-in tool names "
+        "(e.g. 'Bash,Write'). Applies on top of --allowed-tools."
+    ),
+)
 def add_config(
     path: str,
     label: str,
@@ -803,10 +820,19 @@ def add_config(
     mcp_config: str | None,
     instruction_variant: str | None,
     preambles: tuple[str, ...],
+    allowed_tools: str | None,
+    disallowed_tools: str | None,
 ) -> None:
     """Add a configuration to an existing experiment."""
     from codeprobe.cli.experiment_cmd import experiment_add_config
+    # Parse comma-separated tool lists. An empty string means "MCP-only":
+    # disable all built-in tools. None means "adapter default".
+    def _parse_tools(raw: str | None) -> list[str] | None:
+        if raw is None:
+            return None
+        return [t.strip() for t in raw.split(",") if t.strip()]
     experiment_add_config(
         path,
         label=label,
@@ -816,6 +842,8 @@ def add_config(
         mcp_config_str=mcp_config,
         instruction_variant=instruction_variant,
         preambles=preambles,
+        allowed_tools=_parse_tools(allowed_tools),
+        disallowed_tools=_parse_tools(disallowed_tools),
     )

{codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/cli/experiment_cmd.py RENAMED Viewed

@@ -142,6 +142,8 @@ def experiment_add_config(
     mcp_config_str: str | None,
     instruction_variant: str | None = None,
     preambles: tuple[str, ...] = (),
+    allowed_tools: list[str] | None = None,
+    disallowed_tools: list[str] | None = None,
 ) -> None:
     """Add a configuration to an existing experiment."""
     exp_dir = Path(path)
@@ -191,6 +193,8 @@ def experiment_add_config(
         mcp_config=mcp_config,
         instruction_variant=instruction_variant,
         preambles=preambles,
+        allowed_tools=allowed_tools,
+        disallowed_tools=disallowed_tools,
     )
     # Validate the label is a safe path component

{codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/cli/run_cmd.py RENAMED Viewed

@@ -482,6 +482,8 @@ def run_eval(
             permission_mode=perm,
             timeout_seconds=resolved_timeout,
             mcp_config=exp_config.mcp_config,
+            allowed_tools=exp_config.allowed_tools,
+            disallowed_tools=exp_config.disallowed_tools,
             cwd=str(repo_root),
         )

{codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/core/executor.py RENAMED Viewed

@@ -387,6 +387,7 @@ def execute_task(
                 cost_model=output.cost_model,
                 cost_source=output.cost_source,
                 tool_call_count=output.tool_call_count,
+                tool_use_by_name=output.tool_use_by_name,
             )
         # For oracle tasks, the agent writes answer.txt / answer.json to the

{codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/core/experiment.py RENAMED Viewed

@@ -98,7 +98,11 @@ def load_experiment(exp_dir: Path) -> Experiment:
             model=c.get("model"),
             permission_mode=c.get("permission_mode", "default"),
             mcp_config=c.get("mcp_config"),
+            allowed_tools=c.get("allowed_tools"),
+            disallowed_tools=c.get("disallowed_tools"),
             instruction_variant=c.get("instruction_variant"),
+            preambles=tuple(c.get("preambles", ())),
+            reward_type=c.get("reward_type", "binary"),
             extra=c.get("extra", {}),
         )
         for c in data.get("configs", [])

{codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/models/experiment.py RENAMED Viewed

@@ -8,13 +8,23 @@ from typing import Any
 @dataclass(frozen=True)
 class ExperimentConfig:
-    """A single configuration to evaluate (e.g., 'baseline' or 'with-mcp')."""
+    """A single configuration to evaluate (e.g., 'baseline' or 'with-mcp').
+    ``allowed_tools`` / ``disallowed_tools`` restrict which tools the
+    agent is allowed to call during this config's runs. Semantics mirror
+    the underlying CLI (Claude's ``--allowedTools`` / ``--disallowedTools``
+    / ``--tools``). Set ``allowed_tools=[]`` to disable all built-in tools
+    for an MCP-only comparison — MCP tools are still reachable because
+    they come from ``mcp_config``.
+    """
     label: str
     agent: str = "claude"
     model: str | None = None
     permission_mode: str = "default"
     mcp_config: dict | None = None
+    allowed_tools: list[str] | None = None
+    disallowed_tools: list[str] | None = None
     instruction_variant: str | None = None
     preambles: tuple[str, ...] = ()
     reward_type: str = "binary"
@@ -29,6 +39,8 @@ class ExperimentConfig:
             f"ExperimentConfig(label={self.label!r}, agent={self.agent!r}, "
             f"model={self.model!r}, permission_mode={self.permission_mode!r}, "
             f"mcp_config={redacted_mcp!r}, "
+            f"allowed_tools={self.allowed_tools!r}, "
+            f"disallowed_tools={self.disallowed_tools!r}, "
             f"instruction_variant={self.instruction_variant!r}, "
             f"preambles={self.preambles!r}, reward_type={self.reward_type!r}, "
             f"extra={self.extra!r})"
@@ -113,6 +125,9 @@ class CompletedTask:
     cost_model: str = "unknown"
     cost_source: str = "unavailable"
     tool_call_count: int | None = None
+    # Per-tool usage breakdown (e.g. {"Read": 5,
+    # "mcp__sourcegraph__keyword_search": 2}). None when not captured.
+    tool_use_by_name: dict[str, int] | None = None
     error_category: str | None = None
     scoring_details: dict = field(default_factory=dict)
     metadata: dict = field(default_factory=dict)

{codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: codeprobe
-Version: 0.5.2
+Version: 0.5.4
 Summary: Benchmark AI coding agents against your own codebase. Mine real tasks from repo history, run agents, interpret results.
 Author: codeprobe contributors
 License-Expression: Apache-2.0

{codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_adapters.py RENAMED Viewed

@@ -1824,3 +1824,165 @@ class TestClaudeModelNormalization:
             cmd = adapter.build_command("test", config)
         idx = cmd.index("--model")
         assert cmd[idx + 1] == "claude-sonnet-4-6"
+# ---------------------------------------------------------------------------
+# 0.5.4: tool-restriction flags + stream-json tool_use capture
+# ---------------------------------------------------------------------------
+class TestClaudeToolRestrictions:
+    """Claude adapter wires AgentConfig.{allowed,disallowed}_tools to CLI."""
+    def test_allowed_tools_empty_list_maps_to_tools_empty(self) -> None:
+        """``allowed_tools=[]`` means MCP-only → ``--tools ""``."""
+        adapter = ClaudeAdapter()
+        if not adapter.find_binary():
+            pytest.skip("claude binary not available")
+        config = AgentConfig(allowed_tools=[])
+        cmd = adapter.build_command("test", config)
+        # Should contain --tools followed immediately by empty string.
+        assert "--tools" in cmd
+        idx = cmd.index("--tools")
+        assert cmd[idx + 1] == ""
+    def test_allowed_tools_nonempty_emits_both_flags(self) -> None:
+        """Non-empty allowed_tools = whitelist. Adapter disables built-ins
+        via --tools "" AND auto-approves listed names via --allowedTools,
+        because in claude 2.1.x, --allowedTools alone doesn't restrict
+        the available tool set (it just auto-approves) and without both
+        flags the agent either burns turns on permission prompts or calls
+        unlisted tools."""
+        adapter = ClaudeAdapter()
+        if not adapter.find_binary():
+            pytest.skip("claude binary not available")
+        config = AgentConfig(allowed_tools=["Read", "Grep"])
+        cmd = adapter.build_command("test", config)
+        assert "--tools" in cmd
+        assert cmd[cmd.index("--tools") + 1] == ""
+        assert "--allowedTools" in cmd
+        assert cmd[cmd.index("--allowedTools") + 1] == "Read,Grep"
+    def test_disallowed_tools_maps_to_disallowedTools(self) -> None:
+        adapter = ClaudeAdapter()
+        if not adapter.find_binary():
+            pytest.skip("claude binary not available")
+        config = AgentConfig(disallowed_tools=["Bash", "Write"])
+        cmd = adapter.build_command("test", config)
+        assert "--disallowedTools" in cmd
+        idx = cmd.index("--disallowedTools")
+        assert cmd[idx + 1] == "Bash,Write"
+    def test_both_tool_restrictions_coexist(self) -> None:
+        adapter = ClaudeAdapter()
+        if not adapter.find_binary():
+            pytest.skip("claude binary not available")
+        config = AgentConfig(
+            allowed_tools=["Read"], disallowed_tools=["Bash"]
+        )
+        cmd = adapter.build_command("test", config)
+        assert "--allowedTools" in cmd
+        assert "--disallowedTools" in cmd
+    def test_none_tool_restrictions_omit_flags(self) -> None:
+        """Default behavior: no --tools / --allowedTools / --disallowedTools."""
+        adapter = ClaudeAdapter()
+        if not adapter.find_binary():
+            pytest.skip("claude binary not available")
+        config = AgentConfig()
+        cmd = adapter.build_command("test", config)
+        assert "--tools" not in cmd
+        assert "--allowedTools" not in cmd
+        assert "--disallowedTools" not in cmd
+    def test_stream_json_is_default_output_format(self) -> None:
+        """Claude adapter switched to stream-json for tool_use capture."""
+        adapter = ClaudeAdapter()
+        if not adapter.find_binary():
+            pytest.skip("claude binary not available")
+        cmd = adapter.build_command("test", AgentConfig())
+        assert "--output-format" in cmd
+        idx = cmd.index("--output-format")
+        assert cmd[idx + 1] == "stream-json"
+        assert "--verbose" in cmd
+class TestStreamJsonToolUseCapture:
+    """JsonStdoutCollector parses stream-json and counts tool_use blocks."""
+    def _make_stream(self, tool_names: list[str]) -> str:
+        """Build a minimal stream-json transcript with given tool_use blocks."""
+        import json as _json
+        lines = [
+            _json.dumps({
+                "type": "system", "subtype": "init",
+                "mcp_servers": [{"name": "sourcegraph", "status": "connected"}],
+            })
+        ]
+        for name in tool_names:
+            lines.append(_json.dumps({
+                "type": "assistant",
+                "message": {"content": [{"type": "tool_use", "name": name}]},
+            }))
+        # Terminal result event carries the envelope-shape fields.
+        lines.append(_json.dumps({
+            "type": "result",
+            "subtype": "success",
+            "result": "Done.",
+            "is_error": False,
+            "usage": {
+                "input_tokens": 10,
+                "output_tokens": 20,
+                "cache_read_input_tokens": 100,
+            },
+            "total_cost_usd": 0.05,
+        }))
+        return "\n".join(lines) + "\n"
+    def test_counts_all_tool_use_blocks(self) -> None:
+        from codeprobe.adapters.telemetry import JsonStdoutCollector
+        stream = self._make_stream(["Read", "Grep", "Read", "Bash"])
+        u = JsonStdoutCollector().collect(stream)
+        assert u.tool_call_count == 4
+        assert u.tool_use_by_name == {"Read": 2, "Grep": 1, "Bash": 1}
+    def test_counts_mcp_tool_names(self) -> None:
+        """MCP tools show up as ``mcp__<server>__<tool>``; counted correctly."""
+        from codeprobe.adapters.telemetry import JsonStdoutCollector
+        stream = self._make_stream([
+            "Read", "mcp__sourcegraph__keyword_search",
+            "mcp__sourcegraph__find_references",
+        ])
+        u = JsonStdoutCollector().collect(stream)
+        assert u.tool_call_count == 3
+        assert u.tool_use_by_name["mcp__sourcegraph__keyword_search"] == 1
+        assert u.tool_use_by_name["mcp__sourcegraph__find_references"] == 1
+    def test_empty_stream_returns_no_tool_calls(self) -> None:
+        from codeprobe.adapters.telemetry import JsonStdoutCollector
+        stream = self._make_stream([])
+        u = JsonStdoutCollector().collect(stream)
+        assert u.tool_call_count == 0
+        assert u.tool_use_by_name is None  # sentinel: nothing captured
+    def test_single_envelope_still_works(self) -> None:
+        """Back-compat: legacy --output-format json single envelope parses."""
+        from codeprobe.adapters.telemetry import JsonStdoutCollector
+        envelope = {
+            "result": "ok",
+            "usage": {
+                "input_tokens": 5, "output_tokens": 10,
+                "cache_read_input_tokens": 0,
+            },
+            "total_cost_usd": 0.01,
+        }
+        import json as _json
+        u = JsonStdoutCollector().collect(_json.dumps(envelope))
+        assert u.input_tokens == 5
+        assert u.tool_call_count is None  # envelope has no messages

codeprobe 0.5.2__tar.gz → 0.5.4__tar.gz

codeprobe 0.5.2__tar.gz → 0.5.4__tar.gz