pi-ui-extend 0.1.39 → 0.1.41

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/app/app.js CHANGED
@@ -666,6 +666,12 @@ export class PiUiExtendApp {
666
666
  setSessionActivity: (activity) => this.setSessionActivity(activity),
667
667
  addEntry: (entry) => this.addEntry(entry),
668
668
  addSessionAbortedEntry: () => this.sessionEvents.addSessionAbortedEntry(),
669
+ emitSessionAborted: () => {
670
+ const runtime = this.runtime;
671
+ if (!runtime)
672
+ return;
673
+ this.extensionEventBusByRuntime.get(runtime)?.emit("pix:session-aborted", { aborted: true });
674
+ },
669
675
  showToast: (message, kind) => this.showToast(message, kind),
670
676
  dismissActiveDialog: () => this.toastController.dismissActiveDialog(),
671
677
  stopVoiceInput: () => this.voiceController.stopRecording(),
@@ -78,7 +78,7 @@ export const SUBAGENTS_WIDGET_MAX_ROWS = 8;
78
78
  export const DEFAULT_THINKING_TOOL_RULE = {
79
79
  previewLines: 0,
80
80
  direction: "head",
81
- color: "thinkingForeground",
81
+ color: "assistantForeground",
82
82
  };
83
83
  export const TERMINAL_COMMAND_MODIFIER_FLAG = 8;
84
84
  export const GIT_BRANCH_CACHE_MS = 30_000;
@@ -18,6 +18,7 @@ export type AppInputActionControllerHost = {
18
18
  setSessionActivity(activity: SessionActivity): void;
19
19
  addEntry(entry: Entry): void;
20
20
  addSessionAbortedEntry(): void;
21
+ emitSessionAborted(): void;
21
22
  showToast(message: string, kind: "success" | "error" | "warning" | "info"): void;
22
23
  dismissActiveDialog?(): boolean;
23
24
  stopVoiceInput(): Promise<void>;
@@ -88,6 +88,9 @@ export class AppInputActionController {
88
88
  }
89
89
  async abortStreamingSession(runtime, options) {
90
90
  const session = runtime.session;
91
+ // Relay the user-initiated abort to extensions (e.g. the terminal-bell
92
+ // extension) so they can suppress the attention bell for this turn.
93
+ this.host.emitSessionAborted();
91
94
  if (this.abortInFlight) {
92
95
  session.agent.abort();
93
96
  if (options.stopIfAlreadyAborting)
@@ -37,6 +37,17 @@ export async function runProcess(command, args = [], options = {}) {
37
37
  child.once("error", (err) => {
38
38
  error = err;
39
39
  });
40
+ // Writing to stdin after the child has closed it raises EPIPE. This is
41
+ // common with clipboard helpers (xclip/xsel/wl-copy) that exit once they
42
+ // have read enough, or when a candidate command exits early. The child's
43
+ // exit status is still captured by the "close" handler, so treat EPIPE as
44
+ // benign and never let it surface as an unhandled "error" event.
45
+ child.stdin?.once("error", (err) => {
46
+ if (err?.code === "EPIPE")
47
+ return;
48
+ if (error === undefined)
49
+ error = err;
50
+ });
40
51
  child.once("close", (status, signal) => {
41
52
  if (timer)
42
53
  clearTimeout(timer);
@@ -7,7 +7,6 @@ import { formatStructuredText } from "./message-content.js";
7
7
  import { formatSubagentTimestamp, isSubagentRunRenderDetails, isSubagentsToolName, subagentRunName, subagentStatusIcon, taskPreviewMap, } from "../subagents/subagents-model.js";
8
8
  import { formatTodoTaskLine, isTodoDetails, visibleTodoTasks } from "../todo/todo-model.js";
9
9
  import { renderToolBlock } from "./tool-block-renderer.js";
10
- import { thinkingLevelThemeColor } from "./status-line-renderer.js";
11
10
  export function renderConversationToolEntry(entry, width, options) {
12
11
  const todoLines = renderTodoToolEntry(entry, width, options);
13
12
  if (todoLines)
@@ -53,14 +52,14 @@ export function renderThinkingEntry(entry, width, options) {
53
52
  const forceExpanded = Boolean(options.allThinkingExpanded);
54
53
  const compactExpandedText = options.superCompactTools && forceExpanded ? removeBlankLines(expandedText) : expandedText;
55
54
  const expanded = forceExpanded || (entry.expanded && expandedText.trim().length > 0);
56
- const headerColorOverride = entry.level
57
- ? thinkingLevelThemeColor(entry.level, options.colors, options.availableThinkingLevels)
58
- : undefined;
59
55
  const elapsed = thinkingElapsedText(entry, options.currentTimeMs ?? Date.now());
56
+ const headerArgs = [entry.level ? `(${entry.level})` : undefined, elapsed]
57
+ .filter((part) => part !== undefined)
58
+ .join(" ");
60
59
  return renderToolBlock({
61
60
  id: entry.id,
62
61
  toolName: THINKING_TOOL_NAME,
63
- ...(elapsed === undefined ? {} : { headerArgs: elapsed }),
62
+ ...(headerArgs === "" ? {} : { headerArgs }),
64
63
  expanded,
65
64
  status: entry.status,
66
65
  isError: false,
@@ -73,7 +72,6 @@ export function renderThinkingEntry(entry, width, options) {
73
72
  superCompact: Boolean(options.superCompactTools && !forceExpanded),
74
73
  backgroundOverride: options.colors.thinkingMessageBackground,
75
74
  showGutter: true,
76
- ...(headerColorOverride === undefined ? {} : { headerColorOverride }),
77
75
  });
78
76
  }
79
77
  function thinkingElapsedText(entry, currentTimeMs) {
@@ -15,6 +15,13 @@ const TERMINAL_BELL_ATTENTION_EVENT = "pix:terminal-bell:attention";
15
15
  * extensions, so the renderer emits this on the extension event bus.
16
16
  */
17
17
  const RETRY_ACTIVE_EVENT = "pix:retry-active";
18
+ /**
19
+ * Renderer-relayed signal that the user interrupted the session (Esc/Ctrl-C).
20
+ * Payload: `{ aborted: boolean }`. Aborting the SDK stream during tool
21
+ * execution does not always produce an aborted `message_update`, so the
22
+ * renderer relays the abort here to reliably suppress the attention bell.
23
+ */
24
+ const SESSION_ABORTED_EVENT = "pix:session-aborted";
18
25
  const DEFAULT_COMPLETION_NOTIFICATION_TITLE = "Pix - complete";
19
26
  const DEFAULT_ERROR_NOTIFICATION_TITLE = "Pix - error";
20
27
  const DEFAULT_QUESTION_NOTIFICATION_TITLE = "Pix - question";
@@ -220,9 +227,16 @@ function notificationTitleTemplate(defaultTitle) {
220
227
  function willRetryAfterAgentEnd(event) {
221
228
  return event.willRetry === true;
222
229
  }
230
+ function isAbortedMessageUpdate(event) {
231
+ return event.type === "error" && event.reason === "aborted";
232
+ }
223
233
  function failureReasonFromMessageUpdate(event) {
224
234
  if (event.type !== "error")
225
235
  return undefined;
236
+ // The SDK reports a user-initiated interrupt as `{ type: "error", reason: "aborted" }`.
237
+ // That is not a failure the bell should announce, so treat it as "no reason".
238
+ if (event.reason === "aborted")
239
+ return undefined;
226
240
  const reason = event.error?.errorMessage;
227
241
  return typeof reason === "string" ? trimmed(reason) : undefined;
228
242
  }
@@ -387,6 +401,10 @@ export default function terminalBell(pi) {
387
401
  let deferredUntilSubagentsFinish = false;
388
402
  let liveSubagentCount = 0;
389
403
  let lastFailureReason;
404
+ // True when the user interrupted the session this turn (Esc/Ctrl-C). The
405
+ // attention bell should never ring for a user-initiated abort, so this flag
406
+ // suppresses any queued/pending bell until the next agent_start resets it.
407
+ let userAborted = false;
390
408
  // True while the session is in an auto-retry cycle (relayed via the
391
409
  // extension event bus). Suppresses the failure bell on intermediate retry
392
410
  // attempts; the final exhausted failure still rings because no retry-start
@@ -426,6 +444,13 @@ export default function terminalBell(pi) {
426
444
  // queued this bell and the timer firing, suppress the bell entirely.
427
445
  if (retryActive)
428
446
  return;
447
+ // Safety net: if the user aborted after the bell was queued (e.g. an
448
+ // aborted agent_end with no aborted message_update), suppress it.
449
+ if (userAborted) {
450
+ pendingBell = undefined;
451
+ deferredUntilSubagentsFinish = false;
452
+ return;
453
+ }
429
454
  try {
430
455
  if (!ctx.isIdle()) {
431
456
  if (attempt < MAX_IDLE_RETRIES)
@@ -514,15 +539,39 @@ export default function terminalBell(pi) {
514
539
  deferredUntilSubagentsFinish = false;
515
540
  }
516
541
  });
542
+ pi.events.on(SESSION_ABORTED_EVENT, (data) => {
543
+ const aborted = data != null && typeof data === "object" && data.aborted === true;
544
+ if (!aborted)
545
+ return;
546
+ // The user interrupted the session. Aborting during tool execution does
547
+ // not always produce an aborted `message_update`, so the renderer relays
548
+ // the interrupt here. Suppress any pending bell until the next agent_start.
549
+ userAborted = true;
550
+ lastFailureReason = undefined;
551
+ clearTimer();
552
+ pendingBell = undefined;
553
+ deferredUntilSubagentsFinish = false;
554
+ });
517
555
  pi.on("agent_start", async () => {
518
556
  clearTimer();
519
557
  deferredUntilSubagentsFinish = false;
520
558
  lastFailureReason = undefined;
559
+ userAborted = false;
521
560
  retryActive = false;
522
561
  activeSubagentWaitToolCallIds.clear();
523
562
  notifiedAskUserToolCallIds.clear();
524
563
  });
525
564
  pi.on("message_update", async (event) => {
565
+ if (isAbortedMessageUpdate(event.assistantMessageEvent)) {
566
+ // The user interrupted the stream. Suppress any pending bell until
567
+ // the next agent_start.
568
+ userAborted = true;
569
+ lastFailureReason = undefined;
570
+ clearTimer();
571
+ pendingBell = undefined;
572
+ deferredUntilSubagentsFinish = false;
573
+ return;
574
+ }
526
575
  const reason = failureReasonFromMessageUpdate(event.assistantMessageEvent);
527
576
  if (reason) {
528
577
  lastFailureReason = reason;
@@ -554,6 +603,10 @@ export default function terminalBell(pi) {
554
603
  clearTimer();
555
604
  return;
556
605
  }
606
+ if (userAborted) {
607
+ clearTimer();
608
+ return;
609
+ }
557
610
  if (lastFailureReason) {
558
611
  scheduleBell(ctx, idleDelayMs, 0, renderNotificationTemplate(retryFailureMessageTemplate(), {
559
612
  ...buildNotificationTemplateValues(ctx, pi),
@@ -569,6 +622,7 @@ export default function terminalBell(pi) {
569
622
  deferredUntilSubagentsFinish = false;
570
623
  liveSubagentCount = 0;
571
624
  lastFailureReason = undefined;
625
+ userAborted = false;
572
626
  retryActive = false;
573
627
  activeSubagentWaitToolCallIds.clear();
574
628
  notifiedAskUserToolCallIds.clear();
package/dist/config.js CHANGED
@@ -19,7 +19,7 @@ const DEFAULT_TOOL_RENDERER = {
19
19
  color: "toolTitle",
20
20
  },
21
21
  tools: {
22
- thinking: { previewLines: 0, direction: "head", color: "thinkingForeground" },
22
+ thinking: { previewLines: 0, direction: "head", color: "assistantForeground" },
23
23
  bash: { previewLines: 6, direction: "tail", color: "warning" },
24
24
  Bash: { previewLines: 6, direction: "tail", color: "warning" },
25
25
  shell: { previewLines: 6, direction: "tail", color: "warning" },
@@ -10,7 +10,7 @@ export const DEFAULT_PIX_CONFIG_JSONC = String.raw `{
10
10
  "toolRenderer": {
11
11
  "default": { "previewLines": 0, "direction": "head", "color": "toolTitle" },
12
12
  "tools": {
13
- "thinking": { "previewLines": 0, "direction": "head", "color": "thinkingForeground" },
13
+ "thinking": { "previewLines": 0, "direction": "head", "color": "assistantForeground" },
14
14
  "bash": { "previewLines": 6, "direction": "tail", "color": "warning" },
15
15
  "Bash": { "previewLines": 6, "direction": "tail", "color": "warning" },
16
16
  "shell": { "previewLines": 6, "direction": "tail", "color": "warning" },
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "pi-ui-extend",
3
- "version": "0.1.39",
3
+ "version": "0.1.41",
4
4
  "private": false,
5
5
  "type": "module",
6
6
  "bin": {
@@ -11,7 +11,7 @@ At a high level, the process of creating a skill goes like this:
11
11
 
12
12
  - Decide what you want the skill to do and roughly how it should do it
13
13
  - Write a draft of the skill
14
- - Create a few test prompts and run claude-with-access-to-the-skill on them
14
+ - Create a few test prompts and run pi-with-access-to-the-skill on them
15
15
  - Help the user evaluate the results both qualitatively and quantitatively
16
16
  - While the runs happen in the background, draft some quantitative evals if there aren't any (if there are some, you can either use them as is or modify them if you feel something needs to change). Then explain them to the user (or if they already existed, explain the ones that already exist)
17
17
  - Use the `eval-viewer/generate_review.py` script to show the user the results for them to look at, and also let them look at the quantitative metrics
@@ -31,7 +31,7 @@ Cool? Cool.
31
31
 
32
32
  ## Communicating with the user
33
33
 
34
- The skill creator is liable to be used by people across a wide range of familiarity with coding jargon. If you haven't heard (and how could you, it's only very recently that it started), there's a trend now where the power of Claude is inspiring plumbers to open up their terminals, parents and grandparents to google "how to install npm". On the other hand, the bulk of users are probably fairly computer-literate.
34
+ The skill creator is liable to be used by people across a wide range of familiarity with coding jargon. If you haven't heard (and how could you, it's only very recently that it started), there's a trend now where the power of AI coding agents like pi is inspiring plumbers to open up their terminals, parents and grandparents to google "how to install npm". On the other hand, the bulk of users are probably fairly computer-literate.
35
35
 
36
36
  So please pay attention to context cues to understand how to phrase your communication! In the default case, just to give you some idea:
37
37
 
@@ -48,7 +48,7 @@ It's OK to briefly explain terms if you're in doubt, and feel free to clarify te
48
48
 
49
49
  Start by understanding the user's intent. The current conversation might already contain a workflow the user wants to capture (e.g., they say "turn this into a skill"). If so, extract answers from the conversation history first — the tools used, the sequence of steps, corrections the user made, input/output formats observed. The user may need to fill the gaps, and should confirm before proceeding to the next step.
50
50
 
51
- 1. What should this skill enable Claude to do?
51
+ 1. What should this skill enable pi to do?
52
52
  2. When should this skill trigger? (what user phrases/contexts)
53
53
  3. What's the expected output format?
54
54
  4. Should we set up test cases to verify the skill works? Skills with objectively verifiable outputs (file transforms, data extraction, code generation, fixed workflow steps) benefit from test cases. Skills with subjective outputs (writing style, art) often don't need them. Suggest the appropriate default based on the skill type, but let the user decide.
@@ -63,8 +63,8 @@ Check available MCPs - if useful for research (searching docs, finding similar s
63
63
 
64
64
  Based on the user interview, fill in these components:
65
65
 
66
- - **name**: Skill identifier
67
- - **description**: When to trigger, what it does. This is the primary triggering mechanism - include both what the skill does AND specific contexts for when to use it. All "when to use" info goes here, not in the body. Note: currently Claude has a tendency to "undertrigger" skills -- to not use them when they'd be useful. To combat this, please make the skill descriptions a little bit "pushy". So for instance, instead of "How to build a simple fast dashboard to display internal Anthropic data.", you might write "How to build a simple fast dashboard to display internal Anthropic data. Make sure to use this skill whenever the user mentions dashboards, data visualization, internal metrics, or wants to display any kind of company data, even if they don't explicitly ask for a 'dashboard.'"
66
+ - **name**: Skill identifier (lowercase, hyphens, a-z0-9 — see pi's skill validation rules)
67
+ - **description**: When to trigger, what it does. This is the primary triggering mechanism - include both what the skill does AND specific contexts for when to use it. All "when to use" info goes here, not in the body. Note: currently models have a tendency to "undertrigger" skills -- to not use them when they'd be useful. To combat this, please make the skill descriptions a little bit "pushy". So for instance, instead of "How to build a simple fast dashboard to display internal Anthropic data.", you might write "How to build a simple fast dashboard to display internal Anthropic data. Make sure to use this skill whenever the user mentions dashboards, data visualization, internal metrics, or wants to display any kind of company data, even if they don't explicitly ask for a 'dashboard.'"
68
68
  - **compatibility**: Required tools, dependencies (optional, rarely needed)
69
69
  - **the rest of the skill :)**
70
70
 
@@ -106,7 +106,15 @@ cloud-deploy/
106
106
  ├── gcp.md
107
107
  └── azure.md
108
108
  ```
109
- Claude reads only the relevant reference file.
109
+ The agent reads only the relevant reference file.
110
+
111
+ #### How pi discovers and loads skills
112
+
113
+ This matters both for writing skills and for testing them. Pi scans skill locations at startup (`~/.pi/agent/skills/`, `~/.agents/skills/`, project `.pi/skills/` and `.agents/skills/`, settings `skills` arrays, and the `--skill <path>` CLI flag). It extracts each skill's `name` + `description` and lists them in the system prompt as `available_skills`. When a task matches a skill, the agent uses the `read` tool to load the full `SKILL.md` on demand — that's progressive disclosure. Skills also register as `/skill:name` commands the user can invoke directly to force-load them.
114
+
115
+ Two practical consequences for this skill:
116
+ - **Where to install a finished skill**: global skills go in `~/.pi/agent/skills/<skill-name>/`; project skills go in `.pi/skills/<skill-name>/` (only loaded once the project is trusted).
117
+ - **How triggering is measured**: the eval scripts in `scripts/` test triggering by watching whether pi actually performs that `read` on the skill's `SKILL.md`. They invoke `pi -p --mode json --skill <temp-skill-dir>` so the description under test is evaluated in isolation (with `--no-skills` to suppress all other discovered skills).
110
118
 
111
119
  #### Principle of Lack of Surprise
112
120
 
@@ -168,9 +176,9 @@ Put results in `<skill-name>-workspace/` as a sibling to the skill directory. Wi
168
176
 
169
177
  ### Step 1: Spawn all runs (with-skill AND baseline) in the same turn
170
178
 
171
- For each test case, spawn two subagents in the same turn — one with the skill, one without. This is important: don't spawn the with-skill runs first and then come back for baselines later. Launch everything at once so it all finishes around the same time.
179
+ For each test case, spawn two subagents in the same turn — one with the skill, one without. This is important: don't spawn the with-skill runs first and then come back for baselines later. Launch everything at once so it all finishes around the same time. (If your harness has no subagent tool, see "Running without subagents" below — run them inline, one at a time, and skip baselines.)
172
180
 
173
- **With-skill run:**
181
+ **With-skill run** (give the subagent a task like this):
174
182
 
175
183
  ```
176
184
  Execute this task:
@@ -179,6 +187,8 @@ Execute this task:
179
187
  - Input files: <eval files if any, or "none">
180
188
  - Save outputs to: <workspace>/iteration-<N>/eval-<ID>/with_skill/outputs/
181
189
  - Outputs to save: <what the user cares about — e.g., "the .docx file", "the final CSV">
190
+
191
+ Load the skill at <path-to-skill> (read its SKILL.md) before starting, and follow it.
182
192
  ```
183
193
 
184
194
  **Baseline run** (same prompt, but the baseline depends on context):
@@ -206,7 +216,7 @@ Update the `eval_metadata.json` files and `evals/evals.json` with the assertions
206
216
 
207
217
  ### Step 3: As runs complete, capture timing data
208
218
 
209
- When each subagent task completes, you receive a notification containing `total_tokens` and `duration_ms`. Save this data immediately to `timing.json` in the run directory:
219
+ When each subagent task completes, you receive a notification containing `total_tokens` and `duration_ms` (or equivalent usage info from the result). Save this data immediately to `timing.json` in the run directory:
210
220
 
211
221
  ```json
212
222
  {
@@ -216,7 +226,7 @@ When each subagent task completes, you receive a notification containing `total_
216
226
  }
217
227
  ```
218
228
 
219
- This is the only opportunity to capture this data — it comes through the task notification and isn't persisted elsewhere. Process each notification as it arrives rather than trying to batch them.
229
+ If your harness's completion notification carries a different shape, capture whatever token/duration fields it provides. Process each notification as it arrives rather than trying to batch them.
220
230
 
221
231
  ### Step 4: Grade, aggregate, and launch the viewer
222
232
 
@@ -244,7 +254,7 @@ Put each with_skill version before its baseline counterpart.
244
254
  ```
245
255
  For iteration 2+, also pass `--previous-workspace <workspace>/iteration-<N-1>`.
246
256
 
247
- **Cowork / headless environments:** If `webbrowser.open()` is not available or the environment has no display, use `--static <output_path>` to write a standalone HTML file instead of starting a server. Feedback will be downloaded as a `feedback.json` file when the user clicks "Submit All Reviews". After download, copy `feedback.json` into the workspace directory for the next iteration to pick up.
257
+ **Headless / no-display environments:** If `webbrowser.open()` is not available or the environment has no display (remote server, CI), use `--static <output_path>` to write a standalone HTML file instead of starting a server. Feedback will be downloaded as a `feedback.json` file when the user clicks "Submit All Reviews". After download, copy `feedback.json` into the workspace directory for the next iteration to pick up.
248
258
 
249
259
  Note: please use generate_review.py to create the viewer; there's no need to write custom HTML.
250
260
 
@@ -332,7 +342,7 @@ This is optional, requires subagents, and most users won't need it. The human re
332
342
 
333
343
  ## Description Optimization
334
344
 
335
- The description field in SKILL.md frontmatter is the primary mechanism that determines whether Claude invokes a skill. After creating or improving a skill, offer to optimize the description for better triggering accuracy.
345
+ The description field in SKILL.md frontmatter is the primary mechanism that determines whether pi invokes a skill. After creating or improving a skill, offer to optimize the description for better triggering accuracy.
336
346
 
337
347
  ### Step 1: Generate trigger eval queries
338
348
 
@@ -345,7 +355,7 @@ Create 20 eval queries — a mix of should-trigger and should-not-trigger. Save
345
355
  ]
346
356
  ```
347
357
 
348
- The queries must be realistic and something a Claude Code or Claude.ai user would actually type. Not abstract requests, but requests that are concrete and specific and have a good amount of detail. For instance, file paths, personal context about the user's job or situation, column names and values, company names, URLs. A little bit of backstory. Some might be in lowercase or contain abbreviations or typos or casual speech. Use a mix of different lengths, and focus on edge cases rather than making them clear-cut (the user will get a chance to sign off on them).
358
+ The queries must be realistic and something a pi user would actually type. Not abstract requests, but requests that are concrete and specific and have a good amount of detail. For instance, file paths, personal context about the user's job or situation, column names and values, company names, URLs. A little bit of backstory. Some might be in lowercase or contain abbreviations or typos or casual speech. Use a mix of different lengths, and focus on edge cases rather than making them clear-cut (the user will get a chance to sign off on them).
349
359
 
350
360
  Bad: `"Format this data"`, `"Extract text from PDF"`, `"Create a chart"`
351
361
 
@@ -387,17 +397,17 @@ python -m scripts.run_loop \
387
397
  --verbose
388
398
  ```
389
399
 
390
- Use the model ID from your system prompt (the one powering the current session) so the triggering test matches what the user actually experiences.
400
+ Use the model ID from your system prompt (the one powering the current session) so the triggering test matches what the user actually experiences. For pi this is a model pattern like `anthropic/claude-sonnet-4-20250514` or whatever you're running as (with an optional `:thinking` suffix).
391
401
 
392
402
  While it runs, periodically tail the output to give the user updates on which iteration it's on and what the scores look like.
393
403
 
394
- This handles the full optimization loop automatically. It splits the eval set into 60% train and 40% held-out test, evaluates the current description (running each query 3 times to get a reliable trigger rate), then calls Claude to propose improvements based on what failed. It re-evaluates each new description on both train and test, iterating up to 5 times. When it's done, it opens an HTML report in the browser showing the results per iteration and returns JSON with `best_description` — selected by test score rather than train score to avoid overfitting.
404
+ This handles the full optimization loop automatically. It splits the eval set into 60% train and 40% held-out test, evaluates the current description (running each query 3 times to get a reliable trigger rate), then calls pi to propose improvements based on what failed. It re-evaluates each new description on both train and test, iterating up to 5 times. When it's done, it opens an HTML report in the browser showing the results per iteration and returns JSON with `best_description` — selected by test score rather than train score to avoid overfitting.
395
405
 
396
406
  ### How skill triggering works
397
407
 
398
- Understanding the triggering mechanism helps design better eval queries. Skills appear in Claude's `available_skills` list with their name + description, and Claude decides whether to consult a skill based on that description. The important thing to know is that Claude only consults skills for tasks it can't easily handle on its own — simple, one-step queries like "read this PDF" may not trigger a skill even if the description matches perfectly, because Claude can handle them directly with basic tools. Complex, multi-step, or specialized queries reliably trigger skills when the description matches.
408
+ Understanding the triggering mechanism helps design better eval queries. Skills appear in pi's `available_skills` list (in the system prompt) with their name + description, and the agent decides whether to consult a skill based on that description. The important thing to know is that the agent only consults skills for tasks it can't easily handle on its own — simple, one-step queries like "read this PDF" may not trigger a skill even if the description matches perfectly, because the agent can handle them directly with basic tools. Complex, multi-step, or specialized queries reliably trigger skills when the description matches.
399
409
 
400
- This means your eval queries should be substantive enough that Claude would actually benefit from consulting a skill. Simple queries like "read file X" are poor test cases — they won't trigger skills regardless of description quality.
410
+ This means your eval queries should be substantive enough that the agent would actually benefit from consulting a skill. Simple queries like "read file X" are poor test cases — they won't trigger skills regardless of description quality.
401
411
 
402
412
  ### Step 4: Apply the result
403
413
 
@@ -417,45 +427,31 @@ After packaging, direct the user to the resulting `.skill` file path so they can
417
427
 
418
428
  ---
419
429
 
420
- ## Claude.ai-specific instructions
430
+ ## Running without subagents
421
431
 
422
- In Claude.ai, the core workflow is the same (draft → test → review → improve → repeat), but because Claude.ai doesn't have subagents, some mechanics change. Here's what to adapt:
432
+ The core workflow is the same (draft → test → review → improve → repeat), but if your harness has no subagent tool, some mechanics change. Here's what to adapt:
423
433
 
424
434
  **Running test cases**: No subagents means no parallel execution. For each test case, read the skill's SKILL.md, then follow its instructions to accomplish the test prompt yourself. Do them one at a time. This is less rigorous than independent subagents (you wrote the skill and you're also running it, so you have full context), but it's a useful sanity check — and the human review step compensates. Skip the baseline runs — just use the skill to complete the task as requested.
425
435
 
426
- **Reviewing results**: If you can't open a browser (e.g., Claude.ai's VM has no display, or you're on a remote server), skip the browser reviewer entirely. Instead, present results directly in the conversation. For each test case, show the prompt and the output. If the output is a file the user needs to see (like a .docx or .xlsx), save it to the filesystem and tell them where it is so they can download and inspect it. Ask for feedback inline: "How does this look? Anything you'd change?"
436
+ **Reviewing results**: If you can't open a browser (e.g., a remote server with no display), skip the browser reviewer entirely. Instead, present results directly in the conversation. For each test case, show the prompt and the output. If the output is a file the user needs to see (like a .docx or .xlsx), save it to the filesystem and tell them where it is so they can inspect it. Ask for feedback inline: "How does this look? Anything you'd change?"
427
437
 
428
438
  **Benchmarking**: Skip the quantitative benchmarking — it relies on baseline comparisons which aren't meaningful without subagents. Focus on qualitative feedback from the user.
429
439
 
430
- **The iteration loop**: Same as before — improve the skill, rerun the test cases, ask for feedback — just without the browser reviewer in the middle. You can still organize results into iteration directories on the filesystem if you have one.
440
+ **The iteration loop**: Same as before — improve the skill, rerun the test cases, ask for feedback — just without the browser reviewer in the middle. You can still organize results into iteration directories on the filesystem.
431
441
 
432
- **Description optimization**: This section requires the `claude` CLI tool (specifically `claude -p`) which is only available in Claude Code. Skip it if you're on Claude.ai.
442
+ **Description optimization**: This requires the `pi` CLI (specifically `pi -p`) which the eval scripts call via subprocess. It works as long as `pi` is on PATH. If it isn't available in the environment, skip description optimization.
433
443
 
434
444
  **Blind comparison**: Requires subagents. Skip it.
435
445
 
436
- **Packaging**: The `package_skill.py` script works anywhere with Python and a filesystem. On Claude.ai, you can run it and the user can download the resulting `.skill` file.
446
+ **Packaging**: The `package_skill.py` script works anywhere with Python and a filesystem.
437
447
 
438
448
  **Updating an existing skill**: The user might be asking you to update an existing skill, not create a new one. In this case:
439
449
  - **Preserve the original name.** Note the skill's directory name and `name` frontmatter field -- use them unchanged. E.g., if the installed skill is `research-helper`, output `research-helper.skill` (not `research-helper-v2`).
440
- - **Copy to a writeable location before editing.** The installed skill path may be read-only. Copy to `/tmp/skill-name/`, edit there, and package from the copy.
450
+ - **Copy to a writeable location before editing.** An installed skill path may be read-only (e.g., a global `~/.pi/agent/skills/` install). Copy to `/tmp/skill-name/`, edit there, and package from the copy.
441
451
  - **If packaging manually, stage in `/tmp/` first**, then copy to the output directory -- direct writes may fail due to permissions.
442
452
 
443
453
  ---
444
454
 
445
- ## Cowork-Specific Instructions
446
-
447
- If you're in Cowork, the main things to know are:
448
-
449
- - You have subagents, so the main workflow (spawn test cases in parallel, run baselines, grade, etc.) all works. (However, if you run into severe problems with timeouts, it's OK to run the test prompts in series rather than parallel.)
450
- - You don't have a browser or display, so when generating the eval viewer, use `--static <output_path>` to write a standalone HTML file instead of starting a server. Then proffer a link that the user can click to open the HTML in their browser.
451
- - For whatever reason, the Cowork setup seems to disincline Claude from generating the eval viewer after running the tests, so just to reiterate: whether you're in Cowork or in Claude Code, after running tests, you should always generate the eval viewer for the human to look at examples before revising the skill yourself and trying to make corrections, using `generate_review.py` (not writing your own boutique html code). Sorry in advance but I'm gonna go all caps here: GENERATE THE EVAL VIEWER *BEFORE* evaluating inputs yourself. You want to get them in front of the human ASAP!
452
- - Feedback works differently: since there's no running server, the viewer's "Submit All Reviews" button will download `feedback.json` as a file. You can then read it from there (you may have to request access first).
453
- - Packaging works — `package_skill.py` just needs Python and a filesystem.
454
- - Description optimization (`run_loop.py` / `run_eval.py`) should work in Cowork just fine since it uses `claude -p` via subprocess, not a browser, but please save it until you've fully finished making the skill and the user agrees it's in good shape.
455
- - **Updating an existing skill**: The user might be asking you to update an existing skill, not create a new one. Follow the update guidance in the claude.ai section above.
456
-
457
- ---
458
-
459
455
  ## Reference files
460
456
 
461
457
  The agents/ directory contains instructions for specialized subagents. Read them when you need to spawn the relevant subagent.
@@ -473,13 +469,13 @@ Repeating one more time the core loop here for emphasis:
473
469
 
474
470
  - Figure out what the skill is about
475
471
  - Draft or edit the skill
476
- - Run claude-with-access-to-the-skill on test prompts
472
+ - Run pi-with-access-to-the-skill on test prompts
477
473
  - With the user, evaluate the outputs:
478
474
  - Create benchmark.json and run `eval-viewer/generate_review.py` to help the user review them
479
475
  - Run quantitative evals
480
476
  - Repeat until you and the user are satisfied
481
477
  - Package the final skill and return it to the user.
482
478
 
483
- Please add steps to your TodoList, if you have such a thing, to make sure you don't forget. If you're in Cowork, please specifically put "Create evals JSON and run `eval-viewer/generate_review.py` so human can review test cases" in your TodoList to make sure it happens.
479
+ Please add steps to your TodoList, if you have such a thing, to make sure you don't forget. Specifically put "Create evals JSON and run `eval-viewer/generate_review.py` so human can review test cases" in your TodoList to make sure it happens.
484
480
 
485
481
  Good luck!
@@ -545,7 +545,7 @@
545
545
  <div class="header">
546
546
  <div>
547
547
  <h1>Eval Review: <span id="skill-name"></span></h1>
548
- <div class="instructions">Review each output and leave feedback below. Navigate with arrow keys or buttons. When done, copy feedback and paste into Claude Code.</div>
548
+ <div class="instructions">Review each output and leave feedback below. Navigate with arrow keys or buttons. When done, copy feedback and paste into your pi session.</div>
549
549
  </div>
550
550
  <div class="progress" id="progress"></div>
551
551
  </div>
@@ -634,7 +634,7 @@
634
634
  <div class="done-overlay" id="done-overlay">
635
635
  <div class="done-card">
636
636
  <h2>Review Complete</h2>
637
- <p>Your feedback has been saved. Go back to your Claude Code session and tell Claude you're done reviewing.</p>
637
+ <p>Your feedback has been saved. Go back to your pi session and tell the agent you're done reviewing.</p>
638
638
  <div class="btn-row">
639
639
  <button onclick="closeDoneDialog()">OK</button>
640
640
  </div>
@@ -225,7 +225,7 @@ Output from Benchmark mode. Located at `benchmarks/<timestamp>/benchmark.json`.
225
225
  "metadata": {
226
226
  "skill_name": "pdf",
227
227
  "skill_path": "/path/to/pdf",
228
- "executor_model": "claude-sonnet-4-20250514",
228
+ "executor_model": "anthropic/claude-sonnet-4-20250514",
229
229
  "analyzer_model": "most-capable-model",
230
230
  "timestamp": "2026-01-15T10:30:00Z",
231
231
  "evals_run": [1, 2, 3],
@@ -148,7 +148,7 @@ def generate_html(data: dict, auto_refresh: bool = False, skill_name: str = "")
148
148
  <body>
149
149
  <h1>""" + title_prefix + """Skill Description Optimization</h1>
150
150
  <div class="explainer">
151
- <strong>Optimizing your skill's description.</strong> This page updates automatically as Claude tests different versions of your skill's description. Each row is an iteration — a new description attempt. The columns show test queries: green checkmarks mean the skill triggered correctly (or correctly didn't trigger), red crosses mean it got it wrong. The "Train" score shows performance on queries used to improve the description; the "Test" score shows performance on held-out queries the optimizer hasn't seen. When it's done, Claude will apply the best-performing description to your skill.
151
+ <strong>Optimizing your skill's description.</strong> This page updates automatically as pi tests different versions of your skill's description. Each row is an iteration — a new description attempt. The columns show test queries: green checkmarks mean the skill triggered correctly (or correctly didn't trigger), red crosses mean it got it wrong. The "Train" score shows performance on queries used to improve the description; the "Test" score shows performance on held-out queries the optimizer hasn't seen. When it's done, pi will apply the best-performing description to your skill.
152
152
  </div>
153
153
  """]
154
154
 
@@ -2,13 +2,12 @@
2
2
  """Improve a skill description based on eval results.
3
3
 
4
4
  Takes eval results (from run_eval.py) and generates an improved description
5
- by calling `claude -p` as a subprocess (same auth pattern as run_eval.py
6
- uses the session's Claude Code auth, no separate ANTHROPIC_API_KEY needed).
5
+ by calling `pi -p` as a subprocess (uses the session's pi auth/config, no
6
+ separate API key needed).
7
7
  """
8
8
 
9
9
  import argparse
10
10
  import json
11
- import os
12
11
  import re
13
12
  import subprocess
14
13
  import sys
@@ -17,32 +16,26 @@ from pathlib import Path
17
16
  from scripts.utils import parse_skill_md
18
17
 
19
18
 
20
- def _call_claude(prompt: str, model: str | None, timeout: int = 300) -> str:
21
- """Run `claude -p` with the prompt on stdin and return the text response.
19
+ def _call_pi(prompt: str, model: str | None, timeout: int = 300) -> str:
20
+ """Run `pi -p` with the prompt on stdin and return the text response.
22
21
 
23
22
  Prompt goes over stdin (not argv) because it embeds the full SKILL.md
24
23
  body and can easily exceed comfortable argv length.
25
24
  """
26
- cmd = ["claude", "-p", "--output-format", "text"]
25
+ cmd = ["pi", "-p", "--mode", "text", "--no-session"]
27
26
  if model:
28
27
  cmd.extend(["--model", model])
29
28
 
30
- # Remove CLAUDECODE env var to allow nesting claude -p inside a
31
- # Claude Code session. The guard is for interactive terminal conflicts;
32
- # programmatic subprocess usage is safe. Same pattern as run_eval.py.
33
- env = {k: v for k, v in os.environ.items() if k != "CLAUDECODE"}
34
-
35
29
  result = subprocess.run(
36
30
  cmd,
37
31
  input=prompt,
38
32
  capture_output=True,
39
33
  text=True,
40
- env=env,
41
34
  timeout=timeout,
42
35
  )
43
36
  if result.returncode != 0:
44
37
  raise RuntimeError(
45
- f"claude -p exited {result.returncode}\nstderr: {result.stderr}"
38
+ f"pi -p exited {result.returncode}\nstderr: {result.stderr}"
46
39
  )
47
40
  return result.stdout
48
41
 
@@ -58,7 +51,7 @@ def improve_description(
58
51
  log_dir: Path | None = None,
59
52
  iteration: int | None = None,
60
53
  ) -> str:
61
- """Call Claude to improve the description based on eval results."""
54
+ """Call pi to improve the description based on eval results."""
62
55
  failed_triggers = [
63
56
  r for r in eval_results["results"]
64
57
  if r["should_trigger"] and not r["pass"]
@@ -68,7 +61,6 @@ def improve_description(
68
61
  if not r["should_trigger"] and not r["pass"]
69
62
  ]
70
63
 
71
- # Build scores summary
72
64
  train_score = f"{eval_results['summary']['passed']}/{eval_results['summary']['total']}"
73
65
  if test_results:
74
66
  test_score = f"{test_results['summary']['passed']}/{test_results['summary']['total']}"
@@ -76,9 +68,9 @@ def improve_description(
76
68
  else:
77
69
  scores_summary = f"Train: {train_score}"
78
70
 
79
- prompt = f"""You are optimizing a skill description for a Claude Code skill called "{skill_name}". A "skill" is sort of like a prompt, but with progressive disclosure -- there's a title and description that Claude sees when deciding whether to use the skill, and then if it does use the skill, it reads the .md file which has lots more details and potentially links to other resources in the skill folder like helper files and scripts and additional documentation or examples.
71
+ prompt = f"""You are optimizing a skill description for a pi skill called "{skill_name}". A "skill" is sort of like a prompt, but with progressive disclosure -- there's a title and description that the agent sees when deciding whether to use the skill, and then if it does use the skill, it reads the .md file which has lots more details and potentially links to other resources in the skill folder like helper files and scripts and additional documentation or examples.
80
72
 
81
- The description appears in Claude's "available_skills" list. When a user sends a query, Claude decides whether to invoke the skill based solely on the title and on this description. Your goal is to write a description that triggers for relevant queries, and doesn't trigger for irrelevant ones.
73
+ The description appears in the agent's "available_skills" list. When a user sends a query, the agent decides whether to invoke the skill based solely on the title and on this description. Your goal is to write a description that triggers for relevant queries, and doesn't trigger for irrelevant ones.
82
74
 
83
75
  Here's the current description:
84
76
  <current_description>
@@ -134,14 +126,14 @@ Concretely, your description should not be more than about 100-200 words, even i
134
126
  Here are some tips that we've found to work well in writing these descriptions:
135
127
  - The skill should be phrased in the imperative -- "Use this skill for" rather than "this skill does"
136
128
  - The skill description should focus on the user's intent, what they are trying to achieve, vs. the implementation details of how the skill works.
137
- - The description competes with other skills for Claude's attention — make it distinctive and immediately recognizable.
129
+ - The description competes with other skills for the agent's attention — make it distinctive and immediately recognizable.
138
130
  - If you're getting lots of failures after repeated attempts, change things up. Try different sentence structures or wordings.
139
131
 
140
132
  I'd encourage you to be creative and mix up the style in different iterations since you'll have multiple opportunities to try different approaches and we'll just grab the highest-scoring one at the end.
141
133
 
142
134
  Please respond with only the new description text in <new_description> tags, nothing else."""
143
135
 
144
- text = _call_claude(prompt, model)
136
+ text = _call_pi(prompt, model)
145
137
 
146
138
  match = re.search(r"<new_description>(.*?)</new_description>", text, re.DOTALL)
147
139
  description = match.group(1).strip().strip('"') if match else text.strip().strip('"')
@@ -157,9 +149,8 @@ Please respond with only the new description text in <new_description> tags, not
157
149
 
158
150
  # Safety net: the prompt already states the 1024-char hard limit, but if
159
151
  # the model blew past it anyway, make one fresh single-turn call that
160
- # quotes the too-long version and asks for a shorter rewrite. (The old
161
- # SDK path did this as a true multi-turn; `claude -p` is one-shot, so we
162
- # inline the prior output into the new prompt instead.)
152
+ # quotes the too-long version and asks for a shorter rewrite. (pi -p is
153
+ # one-shot, so we inline the prior output into the new prompt instead.)
163
154
  if len(description) > 1024:
164
155
  shorten_prompt = (
165
156
  f"{prompt}\n\n"
@@ -171,7 +162,7 @@ Please respond with only the new description text in <new_description> tags, not
171
162
  f"important trigger words and intent coverage. Respond with only "
172
163
  f"the new description in <new_description> tags."
173
164
  )
174
- shorten_text = _call_claude(shorten_prompt, model)
165
+ shorten_text = _call_pi(shorten_prompt, model)
175
166
  match = re.search(r"<new_description>(.*?)</new_description>", shorten_text, re.DOTALL)
176
167
  shortened = match.group(1).strip().strip('"') if match else shorten_text.strip().strip('"')
177
168
 
@@ -229,7 +220,6 @@ def main():
229
220
  if args.verbose:
230
221
  print(f"Improved: {new_description}", file=sys.stderr)
231
222
 
232
- # Output as JSON with both the new description and updated history
233
223
  output = {
234
224
  "description": new_description,
235
225
  "history": history + [{
@@ -1,16 +1,19 @@
1
1
  #!/usr/bin/env python3
2
2
  """Run trigger evaluation for a skill description.
3
3
 
4
- Tests whether a skill's description causes Claude to trigger (read the skill)
4
+ Tests whether a skill's description causes pi to trigger (read the skill)
5
5
  for a set of queries. Outputs results as JSON.
6
6
  """
7
7
 
8
8
  import argparse
9
9
  import json
10
10
  import os
11
+ import re
11
12
  import select
13
+ import shutil
12
14
  import subprocess
13
15
  import sys
16
+ import tempfile
14
17
  import time
15
18
  import uuid
16
19
  from concurrent.futures import ProcessPoolExecutor, as_completed
@@ -20,82 +23,92 @@ from scripts.utils import parse_skill_md
20
23
 
21
24
 
22
25
  def find_project_root() -> Path:
23
- """Find the project root by walking up from cwd looking for .claude/.
26
+ """Return the working directory pi should run in.
24
27
 
25
- Mimics how Claude Code discovers its project root, so the command file
26
- we create ends up where claude -p will look for it.
28
+ Unlike Claude Code, pi has no `.claude/` project marker that controls
29
+ skill discovery — skills are loaded explicitly via `--skill` (or from
30
+ pi's own skill locations). We simply use the current directory so the
31
+ agent sees the same relative paths the user would.
27
32
  """
28
- current = Path.cwd()
29
- for parent in [current, *current.parents]:
30
- if (parent / ".claude").is_dir():
31
- return parent
32
- return current
33
+ return Path.cwd()
34
+
35
+
36
+ def _safe_skill_name(raw: str, unique_id: str) -> str:
37
+ """Build a frontmatter-valid skill name (lowercase, hyphens, a-z0-9)."""
38
+ base = re.sub(r"[^a-z0-9]+", "-", (raw or "skill").lower()).strip("-") or "skill"
39
+ return f"{base}-{unique_id}"
33
40
 
34
41
 
35
42
  def run_single_query(
36
43
  query: str,
37
44
  skill_name: str,
38
45
  skill_description: str,
46
+ skill_body: str,
39
47
  timeout: int,
40
48
  project_root: str,
41
49
  model: str | None = None,
42
50
  ) -> bool:
43
51
  """Run a single query and return whether the skill was triggered.
44
52
 
45
- Creates a command file in .claude/commands/ so it appears in Claude's
46
- available_skills list, then runs `claude -p` with the raw query.
47
- Uses --include-partial-messages to detect triggering early from
48
- stream events (content_block_start) rather than waiting for the
49
- full assistant message, which only arrives after tool execution.
53
+ Creates a throwaway skill directory whose SKILL.md carries the
54
+ description under test, then runs `pi -p --mode json --skill <dir>`.
55
+ We watch the JSON event stream for a `read` tool call targeting that
56
+ SKILL.md, which is how pi loads a skill once the model decides to use
57
+ it. As soon as we see it, we return True and kill the process so the
58
+ run doesn't keep executing the skill.
50
59
  """
51
60
  unique_id = uuid.uuid4().hex[:8]
52
- clean_name = f"{skill_name}-skill-{unique_id}"
53
- project_commands_dir = Path(project_root) / ".claude" / "commands"
54
- command_file = project_commands_dir / f"{clean_name}.md"
61
+ clean_name = _safe_skill_name(skill_name, unique_id)
62
+ temp_skill_dir = Path(tempfile.mkdtemp(prefix=f"pi-skill-eval-{unique_id}-"))
63
+ skill_md_path = temp_skill_dir / "SKILL.md"
55
64
 
56
65
  try:
57
- project_commands_dir.mkdir(parents=True, exist_ok=True)
58
- # Use YAML block scalar to avoid breaking on quotes in description
66
+ # Write a SKILL.md with the description under test. The body is the
67
+ # real skill body so the model behaves naturally if it does read it,
68
+ # but the triggering decision is driven solely by the description.
59
69
  indented_desc = "\n ".join(skill_description.split("\n"))
60
- command_content = (
70
+ skill_md_content = (
61
71
  f"---\n"
72
+ f"name: {clean_name}\n"
62
73
  f"description: |\n"
63
74
  f" {indented_desc}\n"
64
75
  f"---\n\n"
65
- f"# {skill_name}\n\n"
66
- f"This skill handles: {skill_description}\n"
76
+ f"{skill_body.strip()}\n"
67
77
  )
68
- command_file.write_text(command_content)
78
+ skill_md_path.write_text(skill_md_content)
69
79
 
70
80
  cmd = [
71
- "claude",
72
- "-p", query,
73
- "--output-format", "stream-json",
74
- "--verbose",
75
- "--include-partial-messages",
81
+ "pi",
82
+ "-p", "--mode", "json",
83
+ "--no-session",
84
+ # Only the skill under test should be available, so its
85
+ # description is what gets evaluated in isolation. Explicit
86
+ # --skill paths still load even with --no-skills.
87
+ "--no-skills",
88
+ "--skill", str(temp_skill_dir),
89
+ query,
76
90
  ]
77
91
  if model:
78
92
  cmd.extend(["--model", model])
79
93
 
80
- # Remove CLAUDECODE env var to allow nesting claude -p inside a
81
- # Claude Code session. The guard is for interactive terminal conflicts;
82
- # programmatic subprocess usage is safe.
83
- env = {k: v for k, v in os.environ.items() if k != "CLAUDECODE"}
84
-
85
94
  process = subprocess.Popen(
86
95
  cmd,
87
96
  stdout=subprocess.PIPE,
88
97
  stderr=subprocess.DEVNULL,
89
98
  cwd=project_root,
90
- env=env,
91
99
  )
92
100
 
93
101
  triggered = False
94
102
  start_time = time.time()
95
103
  buffer = ""
96
- # Track state for stream event detection
97
- pending_tool_name = None
98
- accumulated_json = ""
104
+
105
+ def _targets_skill(path: str) -> bool:
106
+ """True if a read target points at the temp skill's SKILL.md."""
107
+ if not path:
108
+ return False
109
+ # The temp dir name embeds unique_id, so this is unique per run
110
+ # and survives absolute/relative/tilde variations.
111
+ return unique_id in path or clean_name in path
99
112
 
100
113
  try:
101
114
  while time.time() - start_time < timeout:
@@ -125,66 +138,42 @@ def run_single_query(
125
138
  except json.JSONDecodeError:
126
139
  continue
127
140
 
128
- # Early detection via stream events
129
- if event.get("type") == "stream_event":
130
- se = event.get("event", {})
131
- se_type = se.get("type", "")
132
-
133
- if se_type == "content_block_start":
134
- cb = se.get("content_block", {})
135
- if cb.get("type") == "tool_use":
136
- tool_name = cb.get("name", "")
137
- if tool_name in ("Skill", "Read"):
138
- pending_tool_name = tool_name
139
- accumulated_json = ""
140
- else:
141
- return False
142
-
143
- elif se_type == "content_block_delta" and pending_tool_name:
144
- delta = se.get("delta", {})
145
- if delta.get("type") == "input_json_delta":
146
- accumulated_json += delta.get("partial_json", "")
147
- if clean_name in accumulated_json:
141
+ etype = event.get("type")
142
+
143
+ # Fully-formed tool call (fires before execution).
144
+ if etype == "message_update":
145
+ ame = event.get("assistantMessageEvent", {})
146
+ if ame.get("type") == "toolcall_end":
147
+ tool_call = ame.get("toolCall", {})
148
+ if tool_call.get("name") == "read":
149
+ path = (tool_call.get("arguments") or {}).get("path", "")
150
+ if _targets_skill(path):
148
151
  return True
149
152
 
150
- elif se_type in ("content_block_stop", "message_stop"):
151
- if pending_tool_name:
152
- return clean_name in accumulated_json
153
- if se_type == "message_stop":
154
- return False
155
-
156
- # Fallback: full assistant message
157
- elif event.get("type") == "assistant":
158
- message = event.get("message", {})
159
- for content_item in message.get("content", []):
160
- if content_item.get("type") != "tool_use":
161
- continue
162
- tool_name = content_item.get("name", "")
163
- tool_input = content_item.get("input", {})
164
- if tool_name == "Skill" and clean_name in tool_input.get("skill", ""):
165
- triggered = True
166
- elif tool_name == "Read" and clean_name in tool_input.get("file_path", ""):
167
- triggered = True
168
- return triggered
169
-
170
- elif event.get("type") == "result":
153
+ # Tool actually started executing — redundant but robust.
154
+ elif etype == "tool_execution_start":
155
+ if event.get("toolName") == "read":
156
+ path = (event.get("args") or {}).get("path", "")
157
+ if _targets_skill(path):
158
+ return True
159
+
160
+ elif etype == "agent_end":
171
161
  return triggered
172
162
  finally:
173
- # Clean up process on any exit path (return, exception, timeout)
174
163
  if process.poll() is None:
175
164
  process.kill()
176
165
  process.wait()
177
166
 
178
167
  return triggered
179
168
  finally:
180
- if command_file.exists():
181
- command_file.unlink()
169
+ shutil.rmtree(temp_skill_dir, ignore_errors=True)
182
170
 
183
171
 
184
172
  def run_eval(
185
173
  eval_set: list[dict],
186
174
  skill_name: str,
187
175
  description: str,
176
+ skill_body: str,
188
177
  num_workers: int,
189
178
  timeout: int,
190
179
  project_root: Path,
@@ -204,6 +193,7 @@ def run_eval(
204
193
  item["query"],
205
194
  skill_name,
206
195
  description,
196
+ skill_body,
207
197
  timeout,
208
198
  str(project_root),
209
199
  model,
@@ -256,6 +246,21 @@ def run_eval(
256
246
  }
257
247
 
258
248
 
249
+ def extract_skill_body(skill_path: Path, full_content: str) -> str:
250
+ """Return the SKILL.md body (everything after the frontmatter)."""
251
+ lines = full_content.split("\n")
252
+ if not lines or lines[0].strip() != "---":
253
+ return full_content
254
+ end_idx = None
255
+ for i, line in enumerate(lines[1:], start=1):
256
+ if line.strip() == "---":
257
+ end_idx = i
258
+ break
259
+ if end_idx is None:
260
+ return full_content
261
+ return "\n".join(lines[end_idx + 1:])
262
+
263
+
259
264
  def main():
260
265
  parser = argparse.ArgumentParser(description="Run trigger evaluation for a skill description")
261
266
  parser.add_argument("--eval-set", required=True, help="Path to eval set JSON file")
@@ -265,7 +270,7 @@ def main():
265
270
  parser.add_argument("--timeout", type=int, default=30, help="Timeout per query in seconds")
266
271
  parser.add_argument("--runs-per-query", type=int, default=3, help="Number of runs per query")
267
272
  parser.add_argument("--trigger-threshold", type=float, default=0.5, help="Trigger rate threshold")
268
- parser.add_argument("--model", default=None, help="Model to use for claude -p (default: user's configured model)")
273
+ parser.add_argument("--model", default=None, help="Model to use for pi -p (default: user's configured model)")
269
274
  parser.add_argument("--verbose", action="store_true", help="Print progress to stderr")
270
275
  args = parser.parse_args()
271
276
 
@@ -278,6 +283,7 @@ def main():
278
283
 
279
284
  name, original_description, content = parse_skill_md(skill_path)
280
285
  description = args.description or original_description
286
+ skill_body = extract_skill_body(skill_path, content)
281
287
  project_root = find_project_root()
282
288
 
283
289
  if args.verbose:
@@ -287,6 +293,7 @@ def main():
287
293
  eval_set=eval_set,
288
294
  skill_name=name,
289
295
  description=description,
296
+ skill_body=skill_body,
290
297
  num_workers=args.num_workers,
291
298
  timeout=args.timeout,
292
299
  project_root=project_root,