pi-llm-debugging 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,6 +1,6 @@
  # pi-llm-debugging
 
- A [pi](https://shittycodingagent.ai) extension that captures the full LLM provider request payload to disk before each call — letting you inspect exactly what gets sent to the model, turn by turn.
+ A [pi](https://shittycodingagent.ai) extension that captures both the full LLM provider request payload **and** the direct HTTP response from the provider to disk for every LLM call — letting you inspect exactly what gets sent to (and received from) the model, turn by turn.
 
  ## Install
 
@@ -16,10 +16,11 @@ pi remove npm:pi-llm-debugging
 
  ## How it works
 
- Every time pi is about to call the LLM, the extension intercepts the raw provider payload and writes it as a JSON file into your project's local `.pi` directory:
+ Every time pi is about to call the LLM, the extension writes **two** JSON files into your project's local `.pi` directory — one for the request, one for the raw provider response:
 
  ```
- .pi/pi-llm-debugging/<session_id>/<seq>.json
+ .pi/pi-llm-debugging/<session_id>/<seq>-req.json
+ .pi/pi-llm-debugging/<session_id>/<seq>-res.json
  ```
 
  - **`session_id`** — the current pi session identifier (visible in the footer bar). Resets on `/new`, `/resume`, and `/fork`.
@@ -30,12 +31,153 @@ For example, a session might produce:
  ```
  .pi/pi-llm-debugging/
  └── abc123def/
- ├── 001.json ← first turn
- ├── 002.json ← second turn (after a tool call loops back)
- └── 003.json
+ ├── 001-req.json ← first turn, request payload
+ ├── 001-res.json ← first turn, raw provider response
+ ├── 002-req.json ← second turn (after a tool call loops back)
+ ├── 002-res.json
+ ├── 003-req.json
+ └── 003-res.json
  ```
 
- Each file is the exact payload the provider receives: the full message history, system prompt, tool definitions, model parameters, and any cache hints.
+ **`<seq>-req.json`** is the exact payload the provider receives: the full message history, system prompt, tool definitions, model parameters, and any cache hints. It is captured via pi's `before_provider_request` event.
+
+ **`<seq>-res.json`** is the direct HTTP response from the provider, captured by transparently intercepting `fetch` for known LLM hosts (Anthropic, OpenAI, Gemini, Groq, Mistral, DeepSeek, xAI, Together, Fireworks, Cohere, Perplexity, OpenRouter, …). Each file has this shape:
+
+ ```jsonc
+ {
+   "url": "https://api.anthropic.com/v1/messages",
+   "method": "POST",
+   "status": 200,
+   "statusText": "OK",
+   "headers": { "content-type": "text/event-stream", ... },
+   "body": "event: message_start\ndata: {...}\n\n...", // raw bytes verbatim
+   "parsedBody": { /* decoded JSON, when content-type is application/json */ }
+ }
+ ```
+
+ For streaming SSE responses, `body` contains the full raw SSE text exactly as sent by the provider, so you can replay or diff it. For non-streamed JSON responses, `parsedBody` holds the decoded object for convenience.
+
+ ## Walkthrough: a 2-turn session
+
+ Let's trace what gets written when you run a single prompt that requires one tool call. Imagine you start a fresh pi session and type:
+
+ > **show me the number of files in cwd**
+
+ pi loops with the model twice: once to decide which tool to call, and once to summarize the tool output. That produces four files:
+
+ ```
+ .pi/pi-llm-debugging/abc123def/
+ ├── 001-req.json ← user prompt is sent
+ ├── 001-res.json ← model replies with a Bash tool call
+ ├── 002-req.json ← prompt + tool call + tool result are sent back
+ └── 002-res.json ← model replies with the final text answer
+ ```
+
+ ### Turn 1 — ask the question
+
+ **`001-req.json`** (trimmed) — just the user message and the model/tool config:
+
+ ```jsonc
+ {
+   "model": "claude-opus-4-6",
+   "system": [{ "type": "text", "text": "You are Claude Code, ..." }],
+   "tools": [ { "name": "Bash", /* ... */ }, /* Read, Edit, ... */ ],
+   "messages": [
+     {
+       "role": "user",
+       "content": [
+         { "type": "text", "text": "show me the number of files in cwd" }
+       ]
+     }
+   ]
+ }
+ ```
+
+ **`001-res.json`** — the raw SSE stream from Anthropic. The model decides to call `Bash`:
+
+ ```jsonc
+ {
+   "url": "https://api.anthropic.com/v1/messages",
+   "status": 200,
+   "headers": { "content-type": "text/event-stream", /* ... */ },
+   "body": "event: message_start\ndata: {\"type\":\"message_start\",\"message\":{\"usage\":{\"input_tokens\":6,\"cache_read_input_tokens\":5996,\"output_tokens\":0}}}\n\nevent: content_block_start\ndata: {\"type\":\"content_block_start\",\"index\":0,\"content_block\":{\"type\":\"tool_use\",\"id\":\"toolu_01GE7771KFZvGYeQhyLmbJqQ\",\"name\":\"Bash\",\"input\":{}}}\n\nevent: content_block_delta\ndata: {\"delta\":{\"type\":\"input_json_delta\",\"partial_json\":\"{\\\"command\\\": \\\"ls -1 | wc -l\\\"}\"}}\n\nevent: message_delta\ndata: {\"delta\":{\"stop_reason\":\"tool_use\"},\"usage\":{\"output_tokens\":72}}\n\nevent: message_stop\ndata: {\"type\":\"message_stop\"}\n\n"
+ }
+ ```
+
+ If you `jq -r '.body' 001-res.json` you'll see the SSE events laid out cleanly. Notable bits in this response:
+
+ - `stop_reason: "tool_use"` — the model wants pi to run a tool before continuing.
+ - `tool_use.name: "Bash"`, `input: { command: "ls -1 | wc -l" }` — exactly what pi will execute.
+ - `usage.cache_read_input_tokens: 5996` — 5996 tokens of system prompt + tool defs hit the prompt cache; only 6 fresh input tokens were billed at full rate.
+
+ pi runs `ls -1 | wc -l` locally and gets `13`. It then loops back to the model.
+
+ ### Turn 2 — the tool result is sent back
+
+ **`002-req.json`** (trimmed) — the conversation has grown by two messages: the assistant's `tool_use` block and a `user`-role `tool_result` carrying the bash output:
+
+ ```jsonc
+ {
+   "model": "claude-opus-4-6",
+   "messages": [
+     {
+       "role": "user",
+       "content": [{ "type": "text", "text": "show me the number of files in cwd" }]
+     },
+     {
+       "role": "assistant",
+       "content": [
+         {
+           "type": "tool_use",
+           "id": "toolu_01GE7771KFZvGYeQhyLmbJqQ",
+           "name": "Bash",
+           "input": { "command": "ls -1 | wc -l" }
+         }
+       ]
+     },
+     {
+       "role": "user",
+       "content": [
+         {
+           "type": "tool_result",
+           "tool_use_id": "toolu_01GE7771KFZvGYeQhyLmbJqQ",
+           "content": " 13\n"
+         }
+       ]
+     }
+   ]
+ }
+ ```
+
+ Diffing `001-req.json` against `002-req.json` is the fastest way to see *exactly* how pi grew the conversation between turns — useful when debugging tool-result formatting or context bloat.
+
+ **`002-res.json`** — with the tool result in hand, the model now answers in plain text:
+
+ ```jsonc
+ {
+   "url": "https://api.anthropic.com/v1/messages",
+   "status": 200,
+   "body": "event: message_start\ndata: {...,\"usage\":{\"input_tokens\":1,\"cache_read_input_tokens\":6089,\"output_tokens\":1}}\n\nevent: content_block_start\ndata: {\"content_block\":{\"type\":\"text\",\"text\":\"\"}}\n\nevent: content_block_delta\ndata: {\"delta\":{\"type\":\"text_delta\",\"text\":\"13\"}}\n\nevent: content_block_delta\ndata: {\"delta\":{\"type\":\"text_delta\",\"text\":\" files/directories in the current working directory.\"}}\n\nevent: message_delta\ndata: {\"delta\":{\"stop_reason\":\"end_turn\"},\"usage\":{\"output_tokens\":17}}\n\nevent: message_stop\ndata: {}\n\n"
+ }
+ ```
+
+ This time:
+
+ - `stop_reason: "end_turn"` — the model is done, the agent loop exits.
+ - The streamed text deltas concatenate to `"13 files/directories in the current working directory."` — which is what you see in the pi UI.
+ - `cache_read_input_tokens: 6089` (vs `5996` on turn 1) — the prior turn's assistant + tool_result blocks were appended into the cache.
+
+ ### What you learned from 4 files
+
+ By reading these 4 files in order you can answer questions like:
+
+ - *Did pi send my exact prompt?* → `001-req.json`
+ - *Why did the model choose Bash and what command did it pick?* → `001-res.json`
+ - *Was the tool result formatted correctly when sent back?* → `002-req.json`
+ - *Did the model actually generate the final answer, or did pi mangle it?* → `002-res.json`
+ - *How much of my context was cached vs fresh?* → `usage` blocks in either `-res.json`
+
+ No guessing, no "works on my machine" — just the bytes that crossed the wire.
 
  ## What you can debug
 
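The walkthrough's advice to diff consecutive `-req.json` files can also be made programmatic. As a hedged TypeScript sketch (this `appendedMessages` helper is hypothetical, not part of the extension), one can list the messages pi appended between two consecutive turns, assuming the conversation only grows by appending, as the walkthrough shows:

```typescript
// Hypothetical helper: given two parsed <seq>-req.json payloads, return the
// messages appended on the later turn. Assumes pi only appends to the
// conversation between turns (no reordering or rewriting).
interface RequestPayload {
  messages: unknown[];
}

function appendedMessages(prev: RequestPayload, next: RequestPayload): unknown[] {
  return next.messages.slice(prev.messages.length);
}

// With the walkthrough's payloads, turn 2 adds the tool_use and tool_result:
const turn1 = { messages: [{ role: "user" }] };
const turn2 = {
  messages: [{ role: "user" }, { role: "assistant" }, { role: "user" }],
};
console.log(appendedMessages(turn1, turn2).length); // 2
```

This is only a convenience over `diff`; the files on disk remain the source of truth.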
@@ -49,6 +191,8 @@ Each file is the exact payload the provider receives: the full message history,
 
  **Compaction quality** — Compare the payload just before and just after a `/compact` to see what the summary replaced and whether important context was preserved.
 
+ **Response-side issues** — Inspect `<seq>-res.json` to see the raw SSE stream, tool-use blocks, stop reason, thinking blocks, cache hits, and token usage reported by the provider. Essential when the model does something surprising and you need to know whether it was the model, the parser, or pi itself.
+
  ## Tips
 
  Add the debugging output to your `.gitignore` so it doesn't end up in version control:
@@ -60,16 +204,19 @@ Add the debugging output to your `.gitignore` so it doesn't end up in version co
  Use `jq` to quickly inspect a payload:
 
  ```bash
- # See just the messages
- jq '.messages' .pi/pi-llm-debugging/<session_id>/001.json
+ # See just the messages sent on the first turn
+ jq '.messages' .pi/pi-llm-debugging/<session_id>/001-req.json
+
+ # Inspect the decoded provider response (non-streamed)
+ jq '.parsedBody' .pi/pi-llm-debugging/<session_id>/001-res.json
 
- # Count tokens approximation: check message content lengths
- jq '[.messages[].content | .. | strings | length] | add' .pi/pi-llm-debugging/<session_id>/001.json
+ # Replay a streamed SSE response to stdout
+ jq -r '.body' .pi/pi-llm-debugging/<session_id>/001-res.json
 
- # Diff two consecutive turns to see what changed
+ # Diff two consecutive request payloads to see what changed
  diff \
-   <(jq . .pi/pi-llm-debugging/<session_id>/001.json) \
-   <(jq . .pi/pi-llm-debugging/<session_id>/002.json)
+   <(jq . .pi/pi-llm-debugging/<session_id>/001-req.json) \
+   <(jq . .pi/pi-llm-debugging/<session_id>/002-req.json)
  ```
 
  Since files are scoped per-project under `.pi/`, each project manages its own debugging output independently — nothing bleeds into other projects or your global `~/.pi` directory.
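Beyond `jq`, the raw SSE text captured in a `-res.json` file is easy to post-process in code. As an illustrative TypeScript sketch (this `splitSseEvents` helper is hypothetical, not shipped with the extension), the captured `body` string can be split into individual SSE event blocks, which are separated by blank lines per the SSE format:

```typescript
// Hypothetical helper: split the raw `body` string of a <seq>-res.json
// capture into individual SSE event blocks. SSE events are delimited by a
// blank line ("\n\n"); trailing empty fragments are dropped.
function splitSseEvents(body: string): string[] {
  return body
    .split("\n\n")
    .map((block) => block.trim())
    .filter((block) => block.length > 0);
}

// Example against the event-stream shape shown in the README above:
const sampleBody =
  'event: message_start\ndata: {"type":"message_start"}\n\n' +
  'event: message_stop\ndata: {"type":"message_stop"}\n\n';
console.log(splitSseEvents(sampleBody).length); // 2
```

Each returned block still contains its `event:` and `data:` lines, so you can filter for, say, only `content_block_delta` events when reconstructing streamed text.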
@@ -1,21 +1,162 @@
  /**
-  * Pi LLM Debugging — Saves the full provider request payload to disk before each LLM call.
+  * Pi LLM Debugging — Saves the full provider request AND the raw provider response
+  * to disk for each LLM call.
   *
-  * Files are written to <project>/.pi/pi-llm-debugging/<pi_session_id>/<sequence>.json
-  * where <sequence> is a zero-padded counter (001, 002, ...).
+  * For every LLM turn, two files are written:
+  *   <project>/.pi/pi-llm-debugging/<pi_session_id>/<seq>-req.json
+  *   <project>/.pi/pi-llm-debugging/<pi_session_id>/<seq>-res.json
   *
-  * Unlike the global save-llm-prompt extension, each project manages its own debugging
-  * files under its local .pi directory, making it easy to review, diff, and gitignore
-  * per-project LLM traffic.
+  * - <seq> is a zero-padded counter (001, 002, ...).
+  * - <seq>-req.json contains the exact payload handed to the provider SDK
+  *   (captured via pi's `before_provider_request` event).
+  * - <seq>-res.json contains the direct HTTP response from the LLM provider
+  *   (captured by monkey-patching globalThis.fetch and teeing the response
+  *   body for known provider hosts). For streaming SSE responses the raw SSE
+  *   text is preserved verbatim inside the `body` field.
   *
-  * The current Pi session ID is shown in the footer bar and updates on /new, /resume, and /fork.
+  * Unlike pi's global save-llm-prompt extension, files are scoped to the
+  * current project's local .pi directory so each project manages its own
+  * debugging output (easy to gitignore, diff, and review).
+  *
+  * The current Pi session ID is shown in the footer bar and updates on
+  * /new, /resume, and /fork.
   */
 
  import type { ExtensionAPI } from "@mariozechner/pi-coding-agent";
  import { mkdirSync, writeFileSync } from "node:fs";
  import { join } from "node:path";
 
+ // Hostnames we consider "LLM provider" traffic worth capturing.
+ const PROVIDER_HOST_PATTERNS: RegExp[] = [
+   /(^|\.)anthropic\.com$/i,
+   /(^|\.)openai\.com$/i,
+   /(^|\.)openai\.azure\.com$/i,
+   /(^|\.)googleapis\.com$/i, // gemini / vertex
+   /(^|\.)generativelanguage\.googleapis\.com$/i,
+   /(^|\.)mistral\.ai$/i,
+   /(^|\.)groq\.com$/i,
+   /(^|\.)deepseek\.com$/i,
+   /(^|\.)x\.ai$/i,
+   /(^|\.)together\.xyz$/i,
+   /(^|\.)fireworks\.ai$/i,
+   /(^|\.)cohere\.(com|ai)$/i,
+   /(^|\.)perplexity\.ai$/i,
+   /(^|\.)openrouter\.ai$/i,
+ ];
+
+ function isProviderUrl(url: string): boolean {
+   try {
+     const host = new URL(url).hostname;
+     return PROVIDER_HOST_PATTERNS.some((re) => re.test(host));
+   } catch {
+     return false;
+   }
+ }
+
+ // Install the fetch interceptor exactly once per Node process. Multiple
+ // pi sessions (or extension re-inits) share the same hook and use a
+ // module-level "current target" to decide where to write the next response.
+ type ResponseTarget = { outDir: string; sequence: number } | null;
+ let currentTarget: ResponseTarget = null;
+ let fetchPatched = false;
+
+ function installFetchInterceptor() {
+   if (fetchPatched) return;
+   fetchPatched = true;
+
+   const originalFetch = globalThis.fetch;
+   if (typeof originalFetch !== "function") return;
+
+   globalThis.fetch = (async (
+     input: Parameters<typeof fetch>[0],
+     init?: Parameters<typeof fetch>[1],
+   ): Promise<Response> => {
+     const url =
+       typeof input === "string"
+         ? input
+         : input instanceof URL
+           ? input.toString()
+           : (input as Request).url;
+
+     const response = await originalFetch(input as any, init as any);
+
+     // Only intercept known provider traffic, and only if we have a
+     // request that hasn't been paired with a response yet.
+     if (!currentTarget || !isProviderUrl(url)) {
+       return response;
+     }
+
+     const target = currentTarget;
+     // One response per request: clear immediately so subsequent fetches
+     // (retries, unrelated calls) don't clobber this slot.
+     currentTarget = null;
+
+     const filename = `${String(target.sequence).padStart(3, "0")}-res.json`;
+     const filepath = join(target.outDir, filename);
+
+     // Tee the body so the caller still gets a fully readable response.
+     // For non-streamed JSON responses, .clone() + .text() is enough.
+     // For SSE streams, clone() also works: both branches can be
+     // consumed independently by the WHATWG fetch implementation.
+     const cloned = response.clone();
+
+     // Fire-and-forget: never block the real request on disk IO.
+     void (async () => {
+       try {
+         const headers: Record<string, string> = {};
+         cloned.headers.forEach((v, k) => {
+           headers[k] = v;
+         });
+         const bodyText = await cloned.text();
+
+         const contentType = headers["content-type"] || "";
+         let parsedBody: unknown = undefined;
+         if (contentType.includes("application/json")) {
+           try {
+             parsedBody = JSON.parse(bodyText);
+           } catch {
+             // keep raw text only
+           }
+         }
+
+         const record = {
+           url,
+           method: (init?.method || (typeof input !== "string" && !(input instanceof URL) ? (input as Request).method : "GET")).toUpperCase(),
+           status: response.status,
+           statusText: response.statusText,
+           headers,
+           // For SSE / text responses, `body` holds the raw stream text.
+           // For JSON responses, `parsedBody` holds the decoded object and
+           // `body` still holds the exact bytes for fidelity.
+           body: bodyText,
+           parsedBody,
+         };
+
+         writeFileSync(filepath, JSON.stringify(record, null, 2), "utf-8");
+       } catch (err) {
+         try {
+           writeFileSync(
+             filepath,
+             JSON.stringify(
+               { url, error: (err as Error)?.message || String(err) },
+               null,
+               2,
+             ),
+             "utf-8",
+           );
+         } catch {
+           // give up silently — debugging must never break the session
+         }
+       }
+     })();
+
+     return response;
+   }) as typeof fetch;
+ }
+
  export default function (pi: ExtensionAPI) {
+   installFetchInterceptor();
+
    let outDir = "";
    let sequence = 0;
 
@@ -46,8 +187,13 @@ export default function (pi: ExtensionAPI) {
    pi.on("before_provider_request", (_event, ctx) => {
      if (!outDir) initSession(ctx);
      sequence++;
-     const filename = `${String(sequence).padStart(3, "0")}.json`;
-     const filepath = join(outDir, filename);
-     writeFileSync(filepath, JSON.stringify(_event.payload, null, 2), "utf-8");
+
+     const seqStr = String(sequence).padStart(3, "0");
+     const reqPath = join(outDir, `${seqStr}-req.json`);
+     writeFileSync(reqPath, JSON.stringify(_event.payload, null, 2), "utf-8");
+
+     // Arm the fetch interceptor to route the very next provider-bound
+     // HTTP response into <seq>-res.json.
+     currentTarget = { outDir, sequence };
    });
  }
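The capture logic above hinges on a standard WHATWG fetch property: `Response.clone()` produces a second response whose body can be consumed independently of the original, so persisting the body never starves the real caller. A minimal standalone sketch of that tee pattern (the `teeBody` name is illustrative; it relies on the `Response` global available in Node 18+):

```typescript
// Illustrative sketch of the clone-and-tee pattern used by the interceptor.
// Both branches of a cloned Response yield the same text.
async function teeBody(
  res: Response,
): Promise<{ forCaller: string; forDisk: string }> {
  const saved = res.clone(); // must clone BEFORE the body is consumed
  const forCaller = await res.text(); // what the real caller would read
  const forDisk = await saved.text(); // what would be persisted to <seq>-res.json
  return { forCaller, forDisk };
}

// Usage with a synthetic response:
const demo = new Response('{"ok":true}', {
  status: 200,
  headers: { "content-type": "application/json" },
});
teeBody(demo).then(({ forCaller, forDisk }) => {
  console.log(forCaller === forDisk); // true: both branches see identical text
});
```

Calling `clone()` after the body has started being read throws, which is why the extension clones immediately after the upstream `fetch` resolves.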
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "pi-llm-debugging",
-   "version": "0.1.0",
+   "version": "0.2.0",
    "description": "Saves LLM provider request payloads to the project's .pi folder for per-project debugging",
    "repository": {
      "type": "git",