@rudderhq/agent-runtime-gemini-local 0.2.1 → 0.2.2-canary.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -2
- package/skills/conversation-to-skill/LICENSE.txt +202 -0
- package/skills/conversation-to-skill/SKILL.md +428 -0
- package/skills/conversation-to-skill/agents/analyzer.md +274 -0
- package/skills/conversation-to-skill/agents/comparator.md +202 -0
- package/skills/conversation-to-skill/agents/grader.md +223 -0
- package/skills/conversation-to-skill/assets/eval_review.html +146 -0
- package/skills/conversation-to-skill/eval-viewer/generate_review.py +471 -0
- package/skills/conversation-to-skill/eval-viewer/viewer.html +1325 -0
- package/skills/conversation-to-skill/references/compatibility.md +36 -0
- package/skills/conversation-to-skill/references/description-optimization.md +113 -0
- package/skills/conversation-to-skill/references/evaluation-suite.md +410 -0
- package/skills/conversation-to-skill/references/schemas.md +431 -0
- package/skills/conversation-to-skill/scripts/__init__.py +0 -0
- package/skills/conversation-to-skill/scripts/aggregate_benchmark.py +401 -0
- package/skills/conversation-to-skill/scripts/generate_report.py +335 -0
- package/skills/conversation-to-skill/scripts/improve_description.py +197 -0
- package/skills/conversation-to-skill/scripts/model_backends.py +115 -0
- package/skills/conversation-to-skill/scripts/package_skill.py +136 -0
- package/skills/conversation-to-skill/scripts/quick_validate.py +103 -0
- package/skills/conversation-to-skill/scripts/run_eval.py +363 -0
- package/skills/conversation-to-skill/scripts/run_loop.py +319 -0
- package/skills/conversation-to-skill/scripts/utils.py +223 -0
- package/skills/rudder/references/organization-skills.md +1 -1
- package/skills/skill-creator/SKILL.md +9 -0
- package/skills/skill-optimizer/CHANGELOG.md +29 -0
- package/skills/skill-optimizer/SKILL.md +205 -0
- package/skills/skill-optimizer/references/adapters/creative-brand-content.md +30 -0
- package/skills/skill-optimizer/references/adapters/customer-support-sales.md +30 -0
- package/skills/skill-optimizer/references/adapters/document-data-processing.md +31 -0
- package/skills/skill-optimizer/references/adapters/education-training.md +31 -0
- package/skills/skill-optimizer/references/adapters/finance-accounting.md +31 -0
- package/skills/skill-optimizer/references/adapters/healthcare-operations.md +30 -0
- package/skills/skill-optimizer/references/adapters/hr-people-ops.md +31 -0
- package/skills/skill-optimizer/references/adapters/legal-compliance.md +31 -0
- package/skills/skill-optimizer/references/adapters/operations-supply-chain.md +31 -0
- package/skills/skill-optimizer/references/adapters/personal-productivity.md +29 -0
- package/skills/skill-optimizer/references/adapters/research-knowledge.md +31 -0
- package/skills/skill-optimizer/references/adapters/software-ai.md +31 -0
- package/skills/skill-optimizer/references/domain-adapter-patterns.md +66 -0
- package/skills/skill-optimizer/references/eval-method.md +17 -0
- package/skills/skill-optimizer/references/universal-optimization-lens.md +73 -0
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Skill Packager - Creates a distributable .skill file of a skill folder
|
|
4
|
+
|
|
5
|
+
Usage:
|
|
6
|
+
python -m scripts.package_skill <path/to/skill-folder> [output-directory]
|
|
7
|
+
|
|
8
|
+
Example:
|
|
9
|
+
python -m scripts.package_skill skills/public/my-skill
|
|
10
|
+
python -m scripts.package_skill skills/public/my-skill ./dist
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import fnmatch
|
|
14
|
+
import sys
|
|
15
|
+
import zipfile
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from scripts.quick_validate import validate_skill
|
|
18
|
+
|
|
19
|
+
# Patterns to exclude when packaging skills.
|
|
20
|
+
EXCLUDE_DIRS = {"__pycache__", "node_modules"}
|
|
21
|
+
EXCLUDE_GLOBS = {"*.pyc"}
|
|
22
|
+
EXCLUDE_FILES = {".DS_Store"}
|
|
23
|
+
# Directories excluded only at the skill root (not when nested deeper).
|
|
24
|
+
ROOT_EXCLUDE_DIRS = {"evals"}
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def should_exclude(rel_path: Path) -> bool:
    """Decide whether a file must be left out of the packaged skill.

    ``rel_path`` is expected to be relative to the parent of the skill
    folder, so its first component is the skill folder name and its second
    component (if any) is an entry directly under the skill root.

    A path is excluded when any component is listed in ``EXCLUDE_DIRS``, when
    its second component is listed in ``ROOT_EXCLUDE_DIRS``, when its final
    component is listed in ``EXCLUDE_FILES``, or when that final component
    matches one of the ``EXCLUDE_GLOBS`` patterns.
    """
    components = rel_path.parts
    filename = rel_path.name

    inside_excluded_dir = not EXCLUDE_DIRS.isdisjoint(components)
    # Root-only exclusions apply to the entry right below the skill folder.
    excluded_at_root = len(components) > 1 and components[1] in ROOT_EXCLUDE_DIRS

    if inside_excluded_dir or excluded_at_root or filename in EXCLUDE_FILES:
        return True

    for pattern in EXCLUDE_GLOBS:
        if fnmatch.fnmatch(filename, pattern):
            return True
    return False
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def package_skill(skill_path, output_dir=None):
    """
    Package a skill folder into a .skill file (a zip archive).

    The folder must exist, be a directory, contain a SKILL.md and pass
    ``validate_skill`` before anything is written. Files for which
    ``should_exclude`` returns True are skipped. Archive member names are
    relative to the parent of the skill folder, so every entry is prefixed
    with the skill folder name.

    Args:
        skill_path: Path to the skill folder
        output_dir: Optional output directory for the .skill file (defaults to
            current directory). It is created if it does not exist.

    Returns:
        Path to the created .skill file, or None if error
    """
    skill_path = Path(skill_path).resolve()

    # Validate skill folder exists
    if not skill_path.exists():
        print(f"❌ Error: Skill folder not found: {skill_path}")
        return None

    if not skill_path.is_dir():
        print(f"❌ Error: Path is not a directory: {skill_path}")
        return None

    # Validate SKILL.md exists
    skill_md = skill_path / "SKILL.md"
    if not skill_md.exists():
        print(f"❌ Error: SKILL.md not found in {skill_path}")
        return None

    # Run validation before packaging
    print("🔍 Validating skill...")
    valid, message = validate_skill(skill_path)
    if not valid:
        print(f"❌ Validation failed: {message}")
        print(" Please fix the validation errors before packaging.")
        return None
    print(f"✅ {message}\n")

    # Determine output location
    skill_name = skill_path.name
    if output_dir:
        output_path = Path(output_dir).resolve()
        output_path.mkdir(parents=True, exist_ok=True)
    else:
        output_path = Path.cwd()

    skill_filename = output_path / f"{skill_name}.skill"
    # Resolved once so the archive can be recognised if the walk below finds it.
    archive_real_path = skill_filename.resolve()

    # Create the .skill file (zip format)
    archive_started = False
    try:
        with zipfile.ZipFile(skill_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
            archive_started = True
            # Walk through the skill directory, excluding build artifacts.
            # sorted() keeps the member order stable across filesystems.
            for file_path in sorted(skill_path.rglob('*')):
                if not file_path.is_file():
                    continue
                # When the output directory lies inside the skill folder the
                # archive being written shows up in the walk; never add the
                # archive to itself.
                if file_path.resolve() == archive_real_path:
                    continue
                arcname = file_path.relative_to(skill_path.parent)
                if should_exclude(arcname):
                    print(f" Skipped: {arcname}")
                    continue
                zipf.write(file_path, arcname)
                print(f" Added: {arcname}")

    except Exception as e:
        print(f"❌ Error creating .skill file: {e}")
        if archive_started:
            # Best effort: do not leave a truncated archive that could be
            # mistaken for a finished package.
            try:
                skill_filename.unlink()
            except OSError:
                pass
        return None

    print(f"\n✅ Successfully packaged skill to: {skill_filename}")
    return skill_filename
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def main():
    """Command-line entry point: package the skill named on the command line.

    Reads the skill folder from ``sys.argv[1]`` and an optional output
    directory from ``sys.argv[2]``, then calls ``package_skill``. Exits with
    status 0 when a package path is returned and 1 otherwise, including when
    the skill folder argument is missing.
    """
    if len(sys.argv) < 2:
        # This module imports ``scripts.quick_validate``, so it has to be run
        # as a module from the directory that contains ``scripts/``.
        print("Usage: python -m scripts.package_skill <path/to/skill-folder> [output-directory]")
        print("\nExample:")
        print(" python -m scripts.package_skill skills/public/my-skill")
        print(" python -m scripts.package_skill skills/public/my-skill ./dist")
        sys.exit(1)

    skill_path = sys.argv[1]
    output_dir = sys.argv[2] if len(sys.argv) > 2 else None

    print(f"📦 Packaging skill: {skill_path}")
    if output_dir:
        print(f" Output directory: {output_dir}")
    print()

    result = package_skill(skill_path, output_dir)

    sys.exit(0 if result else 1)


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Quick validation script for skills - minimal version
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import sys
|
|
9
|
+
import re
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
if __package__ in (None, ""):
|
|
13
|
+
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
|
14
|
+
|
|
15
|
+
from scripts.utils import load_skill_frontmatter
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def validate_skill(skill_path):
    """Validate the SKILL.md frontmatter of a skill folder.

    Checks, in order: SKILL.md exists; the frontmatter loads and is a mapping;
    only allowed top-level keys are present; ``name`` and ``description`` are
    present, are strings and are not empty; ``name`` is kebab-case and at most
    64 characters; ``description`` has no angle brackets and is at most 1024
    characters; ``compatibility`` (optional) is a string of at most 500
    characters; ``allowed-tools`` (optional) is a list; ``metadata``
    (optional) is a mapping.

    Args:
        skill_path: Path to the skill folder (str or Path).

    Returns:
        A ``(valid, message)`` tuple. ``valid`` is False on the first failed
        check and ``message`` then describes that failure.
    """
    skill_path = Path(skill_path)

    # Check SKILL.md exists
    skill_md = skill_path / "SKILL.md"
    if not skill_md.exists():
        return False, "SKILL.md not found"

    try:
        frontmatter, _ = load_skill_frontmatter(skill_path)
    except ValueError as exc:
        return False, f"Invalid YAML in frontmatter: {exc}"

    # Everything below relies on mapping methods; report a clear message
    # instead of crashing if the loader hands back something else.
    if not isinstance(frontmatter, dict):
        return False, f"Frontmatter must be a mapping, got {type(frontmatter).__name__}"

    # Define allowed properties
    ALLOWED_PROPERTIES = {"name", "description", "license", "allowed-tools", "metadata", "compatibility"}

    # Check for unexpected properties (excluding nested keys under metadata)
    unexpected_keys = set(frontmatter.keys()) - ALLOWED_PROPERTIES
    if unexpected_keys:
        # str() keeps sorting and joining safe if a key is not a string.
        unexpected_names = sorted(str(key) for key in unexpected_keys)
        return False, (
            f"Unexpected key(s) in SKILL.md frontmatter: {', '.join(unexpected_names)}. "
            f"Allowed properties are: {', '.join(sorted(ALLOWED_PROPERTIES))}"
        )

    # Check required fields
    if "name" not in frontmatter:
        return False, "Missing 'name' in frontmatter"
    if "description" not in frontmatter:
        return False, "Missing 'description' in frontmatter"

    # Extract name for validation
    name = frontmatter.get("name", "")
    if not isinstance(name, str):
        return False, f"Name must be a string, got {type(name).__name__}"
    name = name.strip()
    # A required field that is present but blank is still missing in practice.
    if not name:
        return False, "Name cannot be empty"
    # Check naming convention (kebab-case: lowercase with hyphens)
    if not re.match(r"^[a-z0-9-]+$", name):
        return False, f"Name '{name}' should be kebab-case (lowercase letters, digits, and hyphens only)"
    if name.startswith("-") or name.endswith("-") or "--" in name:
        return False, f"Name '{name}' cannot start/end with hyphen or contain consecutive hyphens"
    # Check name length (max 64 characters per spec)
    if len(name) > 64:
        return False, f"Name is too long ({len(name)} characters). Maximum is 64 characters."

    # Extract and validate description
    description = frontmatter.get("description", "")
    if not isinstance(description, str):
        return False, f"Description must be a string, got {type(description).__name__}"
    description = description.strip()
    if not description:
        return False, "Description cannot be empty"
    # Check for angle brackets
    if "<" in description or ">" in description:
        return False, "Description cannot contain angle brackets (< or >)"
    # Check description length (max 1024 characters per spec)
    if len(description) > 1024:
        return False, f"Description is too long ({len(description)} characters). Maximum is 1024 characters."

    # Validate compatibility field if present (optional). Tested against None,
    # like allowed-tools and metadata below, so falsy non-string values such
    # as 0 or [] are still type-checked.
    compatibility = frontmatter.get("compatibility")
    if compatibility is not None:
        if not isinstance(compatibility, str):
            return False, f"Compatibility must be a string, got {type(compatibility).__name__}"
        if len(compatibility) > 500:
            return False, f"Compatibility is too long ({len(compatibility)} characters). Maximum is 500 characters."

    allowed_tools = frontmatter.get("allowed-tools")
    if allowed_tools is not None and not isinstance(allowed_tools, list):
        return False, f"allowed-tools must be a list, got {type(allowed_tools).__name__}"

    metadata = frontmatter.get("metadata")
    if metadata is not None and not isinstance(metadata, dict):
        return False, f"metadata must be a mapping, got {type(metadata).__name__}"

    return True, "Skill is valid!"
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
# Script entry point: validate the single skill directory given on the command
# line, print the validation message, and exit 0 when valid or 1 otherwise.
if __name__ == "__main__":
    cli_args = sys.argv[1:]
    if len(cli_args) != 1:
        print("Usage: python quick_validate.py <skill_directory>")
        sys.exit(1)

    is_valid, report = validate_skill(cli_args[0])
    print(report)
    if is_valid:
        sys.exit(0)
    sys.exit(1)
|
|
@@ -0,0 +1,363 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Run trigger evaluation for a skill description.
|
|
3
|
+
|
|
4
|
+
Supports two modes:
|
|
5
|
+
- `claude`: uses `claude -p` and detects whether the temporary skill/command was
|
|
6
|
+
actually consulted.
|
|
7
|
+
- `codex`: approximates triggering by asking Codex to judge, using only the skill
|
|
8
|
+
name, description, and user query.
|
|
9
|
+
|
|
10
|
+
The Codex path is an approximation because Codex's local skill mechanism is not
|
|
11
|
+
the same as Claude Code's command discovery. It is still useful for measuring
|
|
12
|
+
whether your description clearly communicates when the skill should be used.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import argparse
|
|
18
|
+
import json
|
|
19
|
+
import os
|
|
20
|
+
import select
|
|
21
|
+
import subprocess
|
|
22
|
+
import sys
|
|
23
|
+
import time
|
|
24
|
+
import uuid
|
|
25
|
+
from concurrent.futures import ProcessPoolExecutor, as_completed
|
|
26
|
+
from pathlib import Path
|
|
27
|
+
|
|
28
|
+
if __package__ in (None, ""):
|
|
29
|
+
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
|
30
|
+
|
|
31
|
+
from scripts.model_backends import detect_backend, extract_first_json_object, generate_text
|
|
32
|
+
from scripts.utils import parse_skill_md
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def find_project_root() -> Path:
    """Return the closest directory that looks like a project root.

    Starting at the current working directory and moving up through its
    ancestors, the first directory containing ``.claude``, ``.git``,
    ``.codex`` or ``.agents`` is returned. If no directory on that chain
    contains one of them, the current working directory is returned.
    """
    start = Path.cwd()
    marker_names = (".claude", ".git", ".codex", ".agents")

    def has_marker(directory: Path) -> bool:
        for marker_name in marker_names:
            if (directory / marker_name).exists():
                return True
        return False

    candidates = (start, *start.parents)
    return next((directory for directory in candidates if has_marker(directory)), start)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _classify_claude_event(event: dict, clean_name: str, state: dict) -> bool | None:
    """Interpret one parsed stream-json event.

    Returns True or False as soon as the event settles whether the temporary
    command was consulted, or None when more output is needed. ``state``
    carries ``pending_tool_name``, ``accumulated_json`` and ``triggered``
    between calls and is updated in place.
    """
    event_type = event.get("type")

    if event_type == "stream_event":
        stream_event = event.get("event", {})
        stream_type = stream_event.get("type", "")

        if stream_type == "content_block_start":
            content_block = stream_event.get("content_block", {})
            if content_block.get("type") == "tool_use":
                tool_name = content_block.get("name", "")
                if tool_name not in ("Skill", "Read"):
                    # A different tool was started: counted as "not triggered".
                    return False
                state["pending_tool_name"] = tool_name
                state["accumulated_json"] = ""

        elif stream_type == "content_block_delta" and state["pending_tool_name"]:
            delta = stream_event.get("delta", {})
            if delta.get("type") == "input_json_delta":
                # Tool input arrives in fragments; match on what has
                # accumulated so far so a hit is reported early.
                state["accumulated_json"] += delta.get("partial_json", "")
                if clean_name in state["accumulated_json"]:
                    return True

        elif stream_type in ("content_block_stop", "message_stop"):
            if state["pending_tool_name"]:
                return clean_name in state["accumulated_json"]
            if stream_type == "message_stop":
                return False

        return None

    if event_type == "assistant":
        message = event.get("message", {})
        for content_item in message.get("content", []):
            if content_item.get("type") != "tool_use":
                continue
            tool_name = content_item.get("name", "")
            tool_input = content_item.get("input", {})
            if tool_name == "Skill" and clean_name in tool_input.get("skill", ""):
                state["triggered"] = True
            elif tool_name == "Read" and clean_name in tool_input.get("file_path", ""):
                state["triggered"] = True
        # The first complete assistant message decides the outcome.
        return state["triggered"]

    if event_type == "result":
        return state["triggered"]

    return None


def _scan_claude_stream(buffer: str, clean_name: str, state: dict) -> tuple[bool | None, str]:
    """Consume every complete line in ``buffer`` and classify its event.

    Returns ``(verdict, rest)``: ``verdict`` is the first non-None result of
    ``_classify_claude_event`` (or None if no line was decisive) and ``rest``
    is the text that has not been consumed yet. Blank lines, lines that are
    not valid JSON and JSON values that are not objects are ignored.
    """
    while "\n" in buffer:
        line, buffer = buffer.split("\n", 1)
        line = line.strip()
        if not line:
            continue

        try:
            event = json.loads(line)
        except json.JSONDecodeError:
            continue
        if not isinstance(event, dict):
            continue

        verdict = _classify_claude_event(event, clean_name, state)
        if verdict is not None:
            return verdict, buffer
    return None, buffer


def run_single_query_claude(
    query: str,
    skill_name: str,
    skill_description: str,
    timeout: int,
    project_root: str,
    model: str | None = None,
) -> bool:
    """Run a single query against Claude Code and detect real skill usage.

    A temporary command file with a unique name is written under
    ``<project_root>/.claude/commands``, ``claude -p`` is run with stream-json
    output, and the stream is watched for a ``Skill`` or ``Read`` tool call
    whose input mentions that unique name. The command file is always removed
    afterwards and the child process is killed if it is still running.

    Args:
        query: User query passed to ``claude -p``.
        skill_name: Skill name used in the temporary command's name and title.
        skill_description: Description written into the command's frontmatter.
        timeout: Maximum number of seconds to keep reading output.
        project_root: Directory used as the child's working directory and as
            the location of ``.claude/commands``.
        model: Optional value for ``--model``.

    Returns:
        True if the temporary command was consulted, False otherwise
        (including when the timeout elapses without a decisive event).
    """
    unique_id = uuid.uuid4().hex[:8]
    clean_name = f"{skill_name}-skill-{unique_id}"
    project_commands_dir = Path(project_root) / ".claude" / "commands"
    command_file = project_commands_dir / f"{clean_name}.md"

    try:
        project_commands_dir.mkdir(parents=True, exist_ok=True)
        indented_desc = "\n ".join(skill_description.split("\n"))
        command_content = (
            f"---\n"
            f"description: |\n"
            f" {indented_desc}\n"
            f"---\n\n"
            f"# {skill_name}\n\n"
            f"This skill handles: {skill_description}\n"
        )
        # Explicit encoding so non-ASCII descriptions do not depend on the locale.
        command_file.write_text(command_content, encoding="utf-8")

        cmd = [
            "claude",
            "-p", query,
            "--output-format", "stream-json",
            "--verbose",
            "--include-partial-messages",
        ]
        if model:
            cmd.extend(["--model", model])

        # CLAUDECODE is removed from the child's environment.
        env = {k: v for k, v in os.environ.items() if k != "CLAUDECODE"}
        process = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
            cwd=project_root,
            env=env,
        )

        state = {"pending_tool_name": None, "accumulated_json": "", "triggered": False}
        start_time = time.time()
        buffer = ""

        try:
            while time.time() - start_time < timeout:
                if process.poll() is not None:
                    # The child has exited: collect whatever it wrote last.
                    remaining = process.stdout.read()
                    if remaining:
                        buffer += remaining.decode("utf-8", errors="replace")
                    break

                ready, _, _ = select.select([process.stdout], [], [], 1.0)
                if not ready:
                    continue

                chunk = os.read(process.stdout.fileno(), 8192)
                if not chunk:
                    break
                buffer += chunk.decode("utf-8", errors="replace")

                verdict, buffer = _scan_claude_stream(buffer, clean_name, state)
                if verdict is not None:
                    return verdict

            # Inspect output that was buffered when the loop ended. The added
            # newline lets a final line without a terminator be parsed too.
            verdict, _ = _scan_claude_stream(buffer + "\n", clean_name, state)
            if verdict is not None:
                return verdict
        finally:
            if process.poll() is None:
                process.kill()
                process.wait()
            if process.stdout is not None:
                process.stdout.close()

        return state["triggered"]
    finally:
        if command_file.exists():
            command_file.unlink()
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def run_single_query_judge(
    query: str,
    skill_name: str,
    skill_description: str,
    timeout: int,
    project_root: str,
    backend: str,
    model: str | None = None,
) -> bool:
    """Ask a model to judge whether the skill should trigger.

    Builds a routing prompt from the skill name, description and query, sends
    it through ``generate_text`` and reads the ``trigger`` field of the first
    JSON object found in the reply.

    Args:
        query: User query to judge.
        skill_name: Skill name shown to the judge.
        skill_description: Skill description shown to the judge.
        timeout: Timeout forwarded to ``generate_text``.
        project_root: Directory forwarded to ``generate_text`` as ``cwd``.
        backend: Backend identifier forwarded to ``generate_text``.
        model: Optional model identifier forwarded to ``generate_text``.

    Returns:
        True when the judge answered that the skill should trigger.
    """
    prompt = f"""You are evaluating skill routing.

Use only the skill name, the skill description, and the user query below.
Ignore hidden context, filesystem contents, tool availability, and implementation details.

Return strict JSON only:
{{"trigger": true, "reason": "one short sentence"}}

Mark "trigger": true only if this skill is clearly the best specialized workflow for the request.
Mark "trigger": false for simple requests, adjacent requests, or requests better handled by another workflow.

Skill name: {skill_name}
Skill description:
{skill_description}

User query:
{query}
"""
    output = generate_text(
        prompt,
        backend=backend,
        model=model,
        cwd=Path(project_root),
        timeout=timeout,
    )
    parsed = extract_first_json_object(output)
    verdict = parsed.get("trigger")
    if isinstance(verdict, str):
        # A quoted answer such as "false" is a non-empty string and would be
        # truthy under bool(); compare its text instead.
        return verdict.strip().lower() == "true"
    return bool(verdict)
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
def run_eval(
    eval_set: list[dict],
    skill_name: str,
    description: str,
    num_workers: int,
    timeout: int,
    project_root: Path,
    runs_per_query: int = 1,
    trigger_threshold: float = 0.5,
    model: str | None = None,
    backend: str = "auto",
) -> dict:
    """Run the full eval set and return results.

    Every item of ``eval_set`` (a dict with ``query`` and ``should_trigger``)
    is evaluated ``runs_per_query`` times in a process pool. The ``claude``
    backend uses ``run_single_query_claude``; any other resolved backend uses
    ``run_single_query_judge``. A run that raises is reported on stderr and
    counted as "did not trigger".

    An item passes when its trigger rate is at or above ``trigger_threshold``
    and it should trigger, or below the threshold and it should not.

    Returns:
        A dict with ``skill_name``, ``description``, the resolved ``backend``,
        ``evaluation_mode`` ("observed" for claude, "judged" otherwise),
        per-query ``results`` in eval-set order, and a ``summary`` with
        ``total``, ``passed`` and ``failed`` counts.
    """
    backend = detect_backend(backend)
    results = []

    # Filled at submission time so results follow eval_set order rather than
    # the order in which worker processes happen to finish.
    query_triggers: dict[str, list[bool]] = {}
    query_items: dict[str, dict] = {}

    with ProcessPoolExecutor(max_workers=num_workers) as executor:
        future_to_query = {}
        for item in eval_set:
            query = item["query"]
            for _ in range(runs_per_query):
                if backend == "claude":
                    future = executor.submit(
                        run_single_query_claude,
                        query,
                        skill_name,
                        description,
                        timeout,
                        str(project_root),
                        model,
                    )
                else:
                    future = executor.submit(
                        run_single_query_judge,
                        query,
                        skill_name,
                        description,
                        timeout,
                        str(project_root),
                        backend,
                        model,
                    )
                future_to_query[future] = query
                query_items[query] = item
                query_triggers.setdefault(query, [])

        for future in as_completed(future_to_query):
            query = future_to_query[future]
            try:
                query_triggers[query].append(future.result())
            except Exception as exc:
                print(f"Warning: query failed: {exc}", file=sys.stderr)
                query_triggers[query].append(False)

    for query, triggers in query_triggers.items():
        item = query_items[query]
        trigger_rate = sum(triggers) / len(triggers)
        should_trigger = item["should_trigger"]
        did_pass = trigger_rate >= trigger_threshold if should_trigger else trigger_rate < trigger_threshold
        results.append({
            "query": query,
            "should_trigger": should_trigger,
            "trigger_rate": trigger_rate,
            "triggers": sum(triggers),
            "runs": len(triggers),
            "pass": did_pass,
        })

    passed = sum(1 for item in results if item["pass"])
    total = len(results)
    mode = "observed" if backend == "claude" else "judged"

    return {
        "skill_name": skill_name,
        "description": description,
        "backend": backend,
        "evaluation_mode": mode,
        "results": results,
        "summary": {
            "total": total,
            "passed": passed,
            "failed": total - passed,
        },
    }
|
|
304
|
+
|
|
305
|
+
|
|
306
|
+
def main():
    """Command-line entry point for the trigger evaluation.

    Parses the arguments, loads the eval set JSON, reads the skill name and
    description from the skill directory (the description can be overridden
    with ``--description``), runs ``run_eval`` and prints its result as JSON
    on stdout. With ``--verbose`` a progress line and a per-query summary are
    written to stderr. Exits with status 1 when the skill directory has no
    SKILL.md.
    """
    parser = argparse.ArgumentParser(description="Run trigger evaluation for a skill description")
    parser.add_argument("--eval-set", required=True, help="Path to eval set JSON file")
    parser.add_argument("--skill-path", required=True, help="Path to skill directory")
    parser.add_argument("--description", default=None, help="Override description to test")
    parser.add_argument("--num-workers", type=int, default=10, help="Number of parallel workers")
    parser.add_argument("--timeout", type=int, default=60, help="Timeout per query in seconds")
    parser.add_argument("--runs-per-query", type=int, default=3, help="Number of runs per query")
    parser.add_argument("--trigger-threshold", type=float, default=0.5, help="Trigger rate threshold")
    parser.add_argument("--model", default=None, help="Optional backend model identifier")
    parser.add_argument("--backend", default="auto", choices=["auto", "claude", "codex"], help="Evaluation backend")
    parser.add_argument("--verbose", action="store_true", help="Print progress to stderr")
    args = parser.parse_args()

    # JSON text is UTF-8; decode it explicitly instead of relying on the
    # platform's default encoding.
    eval_set = json.loads(Path(args.eval_set).read_text(encoding="utf-8"))
    skill_path = Path(args.skill_path)

    if not (skill_path / "SKILL.md").exists():
        print(f"Error: No SKILL.md found at {skill_path}", file=sys.stderr)
        sys.exit(1)

    name, original_description, _ = parse_skill_md(skill_path)
    description = args.description or original_description
    project_root = find_project_root()

    if args.verbose:
        print(f"Evaluating with backend={args.backend}: {description}", file=sys.stderr)

    output = run_eval(
        eval_set=eval_set,
        skill_name=name,
        description=description,
        num_workers=args.num_workers,
        timeout=args.timeout,
        project_root=project_root,
        runs_per_query=args.runs_per_query,
        trigger_threshold=args.trigger_threshold,
        model=args.model,
        backend=args.backend,
    )

    if args.verbose:
        summary = output["summary"]
        print(
            f"Results ({output['evaluation_mode']} via {output['backend']}): "
            f"{summary['passed']}/{summary['total']} passed",
            file=sys.stderr,
        )
        for item in output["results"]:
            status = "PASS" if item["pass"] else "FAIL"
            rate_str = f"{item['triggers']}/{item['runs']}"
            print(f" [{status}] rate={rate_str} expected={item['should_trigger']}: {item['query'][:70]}", file=sys.stderr)

    print(json.dumps(output, indent=2))


if __name__ == "__main__":
    main()
|