pi-ui-extend 0.1.39 → 0.1.43
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- package/dist/app/app.js +6 -0
- package/dist/app/cli/update.js +17 -7
- package/dist/app/constants.js +1 -1
- package/dist/app/input/input-action-controller.d.ts +1 -0
- package/dist/app/input/input-action-controller.js +3 -0
- package/dist/app/process.js +11 -0
- package/dist/app/rendering/conversation-tool-renderer.js +4 -6
- package/dist/bundled-extensions/terminal-bell/index.js +54 -0
- package/dist/config.js +1 -1
- package/dist/default-pix-config.js +1 -1
- package/external/pi-tools-suite/README.md +1 -1
- package/external/pi-tools-suite/package.json +3 -3
- package/external/pi-tools-suite/src/coding-discipline/index.ts +228 -68
- package/package.json +5 -5
- package/skills/skill-creator/SKILL.md +44 -41
- package/skills/skill-creator/eval-viewer/viewer.html +2 -2
- package/skills/skill-creator/references/schemas.md +1 -1
- package/skills/skill-creator/scripts/__pycache__/__init__.cpython-314.pyc +0 -0
- package/skills/skill-creator/scripts/__pycache__/aggregate_benchmark.cpython-314.pyc +0 -0
- package/skills/skill-creator/scripts/__pycache__/generate_report.cpython-314.pyc +0 -0
- package/skills/skill-creator/scripts/__pycache__/improve_description.cpython-314.pyc +0 -0
- package/skills/skill-creator/scripts/__pycache__/package_skill.cpython-314.pyc +0 -0
- package/skills/skill-creator/scripts/__pycache__/run_eval.cpython-314.pyc +0 -0
- package/skills/skill-creator/scripts/__pycache__/run_loop.cpython-314.pyc +0 -0
- package/skills/skill-creator/scripts/__pycache__/utils.cpython-314.pyc +0 -0
- package/skills/skill-creator/scripts/generate_report.py +1 -1
- package/skills/skill-creator/scripts/improve_description.py +14 -24
- package/skills/skill-creator/scripts/run_eval.py +93 -82
- package/skills/skill-creator/scripts/run_loop.py +1 -0
|
@@ -1,16 +1,19 @@
|
|
|
1
1
|
#!/usr/bin/env python3
|
|
2
2
|
"""Run trigger evaluation for a skill description.
|
|
3
3
|
|
|
4
|
-
Tests whether a skill's description causes
|
|
4
|
+
Tests whether a skill's description causes pi to trigger (read the skill)
|
|
5
5
|
for a set of queries. Outputs results as JSON.
|
|
6
6
|
"""
|
|
7
7
|
|
|
8
8
|
import argparse
|
|
9
9
|
import json
|
|
10
10
|
import os
|
|
11
|
+
import re
|
|
11
12
|
import select
|
|
13
|
+
import shutil
|
|
12
14
|
import subprocess
|
|
13
15
|
import sys
|
|
16
|
+
import tempfile
|
|
14
17
|
import time
|
|
15
18
|
import uuid
|
|
16
19
|
from concurrent.futures import ProcessPoolExecutor, as_completed
|
|
@@ -20,82 +23,92 @@ from scripts.utils import parse_skill_md
|
|
|
20
23
|
|
|
21
24
|
|
|
22
25
|
def find_project_root() -> Path:
|
|
23
|
-
"""
|
|
26
|
+
"""Return the working directory pi should run in.
|
|
24
27
|
|
|
25
|
-
|
|
26
|
-
|
|
28
|
+
Unlike Claude Code, pi has no `.claude/` project marker that controls
|
|
29
|
+
skill discovery — skills are loaded explicitly via `--skill` (or from
|
|
30
|
+
pi's own skill locations). We simply use the current directory so the
|
|
31
|
+
agent sees the same relative paths the user would.
|
|
27
32
|
"""
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
+
return Path.cwd()
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _safe_skill_name(raw: str, unique_id: str) -> str:
|
|
37
|
+
"""Build a frontmatter-valid skill name (lowercase, hyphens, a-z0-9)."""
|
|
38
|
+
base = re.sub(r"[^a-z0-9]+", "-", (raw or "skill").lower()).strip("-") or "skill"
|
|
39
|
+
return f"{base}-{unique_id}"
|
|
33
40
|
|
|
34
41
|
|
|
35
42
|
def run_single_query(
|
|
36
43
|
query: str,
|
|
37
44
|
skill_name: str,
|
|
38
45
|
skill_description: str,
|
|
46
|
+
skill_body: str,
|
|
39
47
|
timeout: int,
|
|
40
48
|
project_root: str,
|
|
41
49
|
model: str | None = None,
|
|
42
50
|
) -> bool:
|
|
43
51
|
"""Run a single query and return whether the skill was triggered.
|
|
44
52
|
|
|
45
|
-
Creates a
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
53
|
+
Creates a throwaway skill directory whose SKILL.md carries the
|
|
54
|
+
description under test, then runs `pi -p --mode json --skill <dir>`.
|
|
55
|
+
We watch the JSON event stream for a `read` tool call targeting that
|
|
56
|
+
SKILL.md, which is how pi loads a skill once the model decides to use
|
|
57
|
+
it. As soon as we see it, we return True and kill the process so the
|
|
58
|
+
run doesn't keep executing the skill.
|
|
50
59
|
"""
|
|
51
60
|
unique_id = uuid.uuid4().hex[:8]
|
|
52
|
-
clean_name =
|
|
53
|
-
|
|
54
|
-
|
|
61
|
+
clean_name = _safe_skill_name(skill_name, unique_id)
|
|
62
|
+
temp_skill_dir = Path(tempfile.mkdtemp(prefix=f"pi-skill-eval-{unique_id}-"))
|
|
63
|
+
skill_md_path = temp_skill_dir / "SKILL.md"
|
|
55
64
|
|
|
56
65
|
try:
|
|
57
|
-
|
|
58
|
-
#
|
|
66
|
+
# Write a SKILL.md with the description under test. The body is the
|
|
67
|
+
# real skill body so the model behaves naturally if it does read it,
|
|
68
|
+
# but the triggering decision is driven solely by the description.
|
|
59
69
|
indented_desc = "\n ".join(skill_description.split("\n"))
|
|
60
|
-
|
|
70
|
+
skill_md_content = (
|
|
61
71
|
f"---\n"
|
|
72
|
+
f"name: {clean_name}\n"
|
|
62
73
|
f"description: |\n"
|
|
63
74
|
f" {indented_desc}\n"
|
|
64
75
|
f"---\n\n"
|
|
65
|
-
f"
|
|
66
|
-
f"This skill handles: {skill_description}\n"
|
|
76
|
+
f"{skill_body.strip()}\n"
|
|
67
77
|
)
|
|
68
|
-
|
|
78
|
+
skill_md_path.write_text(skill_md_content)
|
|
69
79
|
|
|
70
80
|
cmd = [
|
|
71
|
-
"
|
|
72
|
-
"-p",
|
|
73
|
-
"--
|
|
74
|
-
|
|
75
|
-
|
|
81
|
+
"pi",
|
|
82
|
+
"-p", "--mode", "json",
|
|
83
|
+
"--no-session",
|
|
84
|
+
# Only the skill under test should be available, so its
|
|
85
|
+
# description is what gets evaluated in isolation. Explicit
|
|
86
|
+
# --skill paths still load even with --no-skills.
|
|
87
|
+
"--no-skills",
|
|
88
|
+
"--skill", str(temp_skill_dir),
|
|
89
|
+
query,
|
|
76
90
|
]
|
|
77
91
|
if model:
|
|
78
92
|
cmd.extend(["--model", model])
|
|
79
93
|
|
|
80
|
-
# Remove CLAUDECODE env var to allow nesting claude -p inside a
|
|
81
|
-
# Claude Code session. The guard is for interactive terminal conflicts;
|
|
82
|
-
# programmatic subprocess usage is safe.
|
|
83
|
-
env = {k: v for k, v in os.environ.items() if k != "CLAUDECODE"}
|
|
84
|
-
|
|
85
94
|
process = subprocess.Popen(
|
|
86
95
|
cmd,
|
|
87
96
|
stdout=subprocess.PIPE,
|
|
88
97
|
stderr=subprocess.DEVNULL,
|
|
89
98
|
cwd=project_root,
|
|
90
|
-
env=env,
|
|
91
99
|
)
|
|
92
100
|
|
|
93
101
|
triggered = False
|
|
94
102
|
start_time = time.time()
|
|
95
103
|
buffer = ""
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
104
|
+
|
|
105
|
+
def _targets_skill(path: str) -> bool:
|
|
106
|
+
"""True if a read target points at the temp skill's SKILL.md."""
|
|
107
|
+
if not path:
|
|
108
|
+
return False
|
|
109
|
+
# The temp dir name embeds unique_id, so this is unique per run
|
|
110
|
+
# and survives absolute/relative/tilde variations.
|
|
111
|
+
return unique_id in path or clean_name in path
|
|
99
112
|
|
|
100
113
|
try:
|
|
101
114
|
while time.time() - start_time < timeout:
|
|
@@ -125,66 +138,46 @@ def run_single_query(
|
|
|
125
138
|
except json.JSONDecodeError:
|
|
126
139
|
continue
|
|
127
140
|
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
if
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
return False
|
|
142
|
-
|
|
143
|
-
elif se_type == "content_block_delta" and pending_tool_name:
|
|
144
|
-
delta = se.get("delta", {})
|
|
145
|
-
if delta.get("type") == "input_json_delta":
|
|
146
|
-
accumulated_json += delta.get("partial_json", "")
|
|
147
|
-
if clean_name in accumulated_json:
|
|
141
|
+
etype = event.get("type")
|
|
142
|
+
|
|
143
|
+
# Fully-formed tool call (fires before execution).
|
|
144
|
+
if etype == "message_update":
|
|
145
|
+
ame = event.get("assistantMessageEvent", {})
|
|
146
|
+
if ame.get("type") == "toolcall_end":
|
|
147
|
+
tool_call = ame.get("toolCall", {})
|
|
148
|
+
# Tool name arrives capitalized ("Read"), not lowercase.
|
|
149
|
+
if (tool_call.get("name") or "").lower() == "read":
|
|
150
|
+
args = tool_call.get("arguments") or {}
|
|
151
|
+
# Read tool's real arg is `file_path`.
|
|
152
|
+
path = args.get("file_path", "") or args.get("path", "")
|
|
153
|
+
if _targets_skill(path):
|
|
148
154
|
return True
|
|
149
155
|
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
for content_item in message.get("content", []):
|
|
160
|
-
if content_item.get("type") != "tool_use":
|
|
161
|
-
continue
|
|
162
|
-
tool_name = content_item.get("name", "")
|
|
163
|
-
tool_input = content_item.get("input", {})
|
|
164
|
-
if tool_name == "Skill" and clean_name in tool_input.get("skill", ""):
|
|
165
|
-
triggered = True
|
|
166
|
-
elif tool_name == "Read" and clean_name in tool_input.get("file_path", ""):
|
|
167
|
-
triggered = True
|
|
168
|
-
return triggered
|
|
169
|
-
|
|
170
|
-
elif event.get("type") == "result":
|
|
156
|
+
# Tool actually started executing — redundant but robust.
|
|
157
|
+
elif etype == "tool_execution_start":
|
|
158
|
+
if (event.get("toolName") or "").lower() == "read":
|
|
159
|
+
args = event.get("args") or {}
|
|
160
|
+
path = args.get("file_path", "") or args.get("path", "")
|
|
161
|
+
if _targets_skill(path):
|
|
162
|
+
return True
|
|
163
|
+
|
|
164
|
+
elif etype == "agent_end":
|
|
171
165
|
return triggered
|
|
172
166
|
finally:
|
|
173
|
-
# Clean up process on any exit path (return, exception, timeout)
|
|
174
167
|
if process.poll() is None:
|
|
175
168
|
process.kill()
|
|
176
169
|
process.wait()
|
|
177
170
|
|
|
178
171
|
return triggered
|
|
179
172
|
finally:
|
|
180
|
-
|
|
181
|
-
command_file.unlink()
|
|
173
|
+
shutil.rmtree(temp_skill_dir, ignore_errors=True)
|
|
182
174
|
|
|
183
175
|
|
|
184
176
|
def run_eval(
|
|
185
177
|
eval_set: list[dict],
|
|
186
178
|
skill_name: str,
|
|
187
179
|
description: str,
|
|
180
|
+
skill_body: str,
|
|
188
181
|
num_workers: int,
|
|
189
182
|
timeout: int,
|
|
190
183
|
project_root: Path,
|
|
@@ -204,6 +197,7 @@ def run_eval(
|
|
|
204
197
|
item["query"],
|
|
205
198
|
skill_name,
|
|
206
199
|
description,
|
|
200
|
+
skill_body,
|
|
207
201
|
timeout,
|
|
208
202
|
str(project_root),
|
|
209
203
|
model,
|
|
@@ -256,6 +250,21 @@ def run_eval(
|
|
|
256
250
|
}
|
|
257
251
|
|
|
258
252
|
|
|
253
|
+
def extract_skill_body(skill_path: Path, full_content: str) -> str:
|
|
254
|
+
"""Return the SKILL.md body (everything after the frontmatter)."""
|
|
255
|
+
lines = full_content.split("\n")
|
|
256
|
+
if not lines or lines[0].strip() != "---":
|
|
257
|
+
return full_content
|
|
258
|
+
end_idx = None
|
|
259
|
+
for i, line in enumerate(lines[1:], start=1):
|
|
260
|
+
if line.strip() == "---":
|
|
261
|
+
end_idx = i
|
|
262
|
+
break
|
|
263
|
+
if end_idx is None:
|
|
264
|
+
return full_content
|
|
265
|
+
return "\n".join(lines[end_idx + 1:])
|
|
266
|
+
|
|
267
|
+
|
|
259
268
|
def main():
|
|
260
269
|
parser = argparse.ArgumentParser(description="Run trigger evaluation for a skill description")
|
|
261
270
|
parser.add_argument("--eval-set", required=True, help="Path to eval set JSON file")
|
|
@@ -265,7 +274,7 @@ def main():
|
|
|
265
274
|
parser.add_argument("--timeout", type=int, default=30, help="Timeout per query in seconds")
|
|
266
275
|
parser.add_argument("--runs-per-query", type=int, default=3, help="Number of runs per query")
|
|
267
276
|
parser.add_argument("--trigger-threshold", type=float, default=0.5, help="Trigger rate threshold")
|
|
268
|
-
parser.add_argument("--model", default=None, help="Model to use for
|
|
277
|
+
parser.add_argument("--model", default=None, help="Model to use for pi -p (default: user's configured model)")
|
|
269
278
|
parser.add_argument("--verbose", action="store_true", help="Print progress to stderr")
|
|
270
279
|
args = parser.parse_args()
|
|
271
280
|
|
|
@@ -278,6 +287,7 @@ def main():
|
|
|
278
287
|
|
|
279
288
|
name, original_description, content = parse_skill_md(skill_path)
|
|
280
289
|
description = args.description or original_description
|
|
290
|
+
skill_body = extract_skill_body(skill_path, content)
|
|
281
291
|
project_root = find_project_root()
|
|
282
292
|
|
|
283
293
|
if args.verbose:
|
|
@@ -287,6 +297,7 @@ def main():
|
|
|
287
297
|
eval_set=eval_set,
|
|
288
298
|
skill_name=name,
|
|
289
299
|
description=description,
|
|
300
|
+
skill_body=skill_body,
|
|
290
301
|
num_workers=args.num_workers,
|
|
291
302
|
timeout=args.timeout,
|
|
292
303
|
project_root=project_root,
|