@rudderhq/agent-runtime-gemini-local 0.2.1 → 0.2.2-canary.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. package/package.json +2 -2
  2. package/skills/conversation-to-skill/LICENSE.txt +202 -0
  3. package/skills/conversation-to-skill/SKILL.md +428 -0
  4. package/skills/conversation-to-skill/agents/analyzer.md +274 -0
  5. package/skills/conversation-to-skill/agents/comparator.md +202 -0
  6. package/skills/conversation-to-skill/agents/grader.md +223 -0
  7. package/skills/conversation-to-skill/assets/eval_review.html +146 -0
  8. package/skills/conversation-to-skill/eval-viewer/generate_review.py +471 -0
  9. package/skills/conversation-to-skill/eval-viewer/viewer.html +1325 -0
  10. package/skills/conversation-to-skill/references/compatibility.md +36 -0
  11. package/skills/conversation-to-skill/references/description-optimization.md +113 -0
  12. package/skills/conversation-to-skill/references/evaluation-suite.md +410 -0
  13. package/skills/conversation-to-skill/references/schemas.md +431 -0
  14. package/skills/conversation-to-skill/scripts/__init__.py +0 -0
  15. package/skills/conversation-to-skill/scripts/aggregate_benchmark.py +401 -0
  16. package/skills/conversation-to-skill/scripts/generate_report.py +335 -0
  17. package/skills/conversation-to-skill/scripts/improve_description.py +197 -0
  18. package/skills/conversation-to-skill/scripts/model_backends.py +115 -0
  19. package/skills/conversation-to-skill/scripts/package_skill.py +136 -0
  20. package/skills/conversation-to-skill/scripts/quick_validate.py +103 -0
  21. package/skills/conversation-to-skill/scripts/run_eval.py +363 -0
  22. package/skills/conversation-to-skill/scripts/run_loop.py +319 -0
  23. package/skills/conversation-to-skill/scripts/utils.py +223 -0
  24. package/skills/rudder/references/organization-skills.md +1 -1
  25. package/skills/skill-creator/SKILL.md +9 -0
  26. package/skills/skill-optimizer/CHANGELOG.md +29 -0
  27. package/skills/skill-optimizer/SKILL.md +205 -0
  28. package/skills/skill-optimizer/references/adapters/creative-brand-content.md +30 -0
  29. package/skills/skill-optimizer/references/adapters/customer-support-sales.md +30 -0
  30. package/skills/skill-optimizer/references/adapters/document-data-processing.md +31 -0
  31. package/skills/skill-optimizer/references/adapters/education-training.md +31 -0
  32. package/skills/skill-optimizer/references/adapters/finance-accounting.md +31 -0
  33. package/skills/skill-optimizer/references/adapters/healthcare-operations.md +30 -0
  34. package/skills/skill-optimizer/references/adapters/hr-people-ops.md +31 -0
  35. package/skills/skill-optimizer/references/adapters/legal-compliance.md +31 -0
  36. package/skills/skill-optimizer/references/adapters/operations-supply-chain.md +31 -0
  37. package/skills/skill-optimizer/references/adapters/personal-productivity.md +29 -0
  38. package/skills/skill-optimizer/references/adapters/research-knowledge.md +31 -0
  39. package/skills/skill-optimizer/references/adapters/software-ai.md +31 -0
  40. package/skills/skill-optimizer/references/domain-adapter-patterns.md +66 -0
  41. package/skills/skill-optimizer/references/eval-method.md +17 -0
  42. package/skills/skill-optimizer/references/universal-optimization-lens.md +73 -0
@@ -0,0 +1,136 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Skill Packager - Creates a distributable .skill file of a skill folder
4
+
5
+ Usage:
6
+ python -m scripts.package_skill <path/to/skill-folder> [output-directory]
7
+
8
+ Example:
9
+ python -m scripts.package_skill skills/public/my-skill
10
+ python -m scripts.package_skill skills/public/my-skill ./dist
11
+ """
12
+
13
+ import fnmatch
14
+ import sys
15
+ import zipfile
16
+ from pathlib import Path
17
+ from scripts.quick_validate import validate_skill
18
+
19
# Names that are never packaged, wherever they appear in the tree.
EXCLUDE_DIRS = {"__pycache__", "node_modules"}
EXCLUDE_GLOBS = {"*.pyc"}
EXCLUDE_FILES = {".DS_Store"}
# Directory names skipped only when they sit directly under the skill root.
ROOT_EXCLUDE_DIRS = {"evals"}


def should_exclude(rel_path: Path) -> bool:
    """Return True when ``rel_path`` must be left out of the package.

    ``rel_path`` is expected to be relative to the parent of the skill
    folder, so its first component is the skill folder name itself.
    """
    components = rel_path.parts
    # A match on any component, at any depth, rules the path out.
    if not EXCLUDE_DIRS.isdisjoint(components):
        return True
    # components[0] is the skill folder; components[1] is its direct child.
    if len(components) > 1 and components[1] in ROOT_EXCLUDE_DIRS:
        return True
    filename = rel_path.name
    if filename in EXCLUDE_FILES:
        return True
    for pattern in EXCLUDE_GLOBS:
        if fnmatch.fnmatch(filename, pattern):
            return True
    return False
40
+
41
+
42
def package_skill(skill_path, output_dir=None):
    """
    Package a skill folder into a .skill file (a zip archive).

    The folder is checked and validated first; nothing is written when the
    folder is missing, is not a directory, has no SKILL.md, or fails
    validation. Archive entries are stored relative to the parent of the
    skill folder, so every entry starts with the skill folder name.

    Args:
        skill_path: Path to the skill folder
        output_dir: Optional output directory for the .skill file (defaults to
            current directory). It is created if it does not exist.

    Returns:
        Path to the created .skill file, or None if error
    """
    skill_path = Path(skill_path).resolve()

    # Validate skill folder exists
    if not skill_path.exists():
        print(f"❌ Error: Skill folder not found: {skill_path}")
        return None

    if not skill_path.is_dir():
        print(f"❌ Error: Path is not a directory: {skill_path}")
        return None

    # Validate SKILL.md exists
    skill_md = skill_path / "SKILL.md"
    if not skill_md.exists():
        print(f"❌ Error: SKILL.md not found in {skill_path}")
        return None

    # Run validation before packaging
    print("🔍 Validating skill...")
    valid, message = validate_skill(skill_path)
    if not valid:
        print(f"❌ Validation failed: {message}")
        print(" Please fix the validation errors before packaging.")
        return None
    print(f"✅ {message}\n")

    # Determine output location
    skill_name = skill_path.name
    if output_dir:
        output_path = Path(output_dir).resolve()
        output_path.mkdir(parents=True, exist_ok=True)
    else:
        output_path = Path.cwd()

    skill_filename = output_path / f"{skill_name}.skill"
    # Resolved once so the archive can be recognised if the walk reaches it.
    archive_path = skill_filename.resolve()

    # Create the .skill file (zip format)
    try:
        with zipfile.ZipFile(skill_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
            # Walk the skill directory in sorted order so the archive layout
            # does not depend on filesystem enumeration order.
            for file_path in sorted(skill_path.rglob('*')):
                if not file_path.is_file():
                    continue
                arcname = file_path.relative_to(skill_path.parent)
                # When the output directory lies inside the skill folder the
                # walk finds the archive that is being written; adding it
                # would pack the half-written file into itself.
                if file_path.resolve() == archive_path or should_exclude(arcname):
                    print(f" Skipped: {arcname}")
                    continue
                zipf.write(file_path, arcname)
                print(f" Added: {arcname}")

        print(f"\n✅ Successfully packaged skill to: {skill_filename}")
        return skill_filename

    except Exception as e:
        # Top-level boundary for the CLI: report the failure and signal it
        # through the None return value instead of a traceback.
        print(f"❌ Error creating .skill file: {e}")
        return None
109
+
110
+
111
def main():
    """Command-line entry point: package the skill named on the command line.

    Reads the skill folder from ``sys.argv[1]`` and an optional output
    directory from ``sys.argv[2]``. Exits with status 0 when packaging
    succeeds and 1 on a usage error or when packaging fails.
    """
    if len(sys.argv) < 2:
        # This module imports ``scripts.quick_validate``, so it has to be run
        # as a module from the directory that contains ``scripts/``.
        print("Usage: python -m scripts.package_skill <path/to/skill-folder> [output-directory]")
        print("\nExample:")
        print(" python -m scripts.package_skill skills/public/my-skill")
        print(" python -m scripts.package_skill skills/public/my-skill ./dist")
        sys.exit(1)

    skill_path = sys.argv[1]
    output_dir = sys.argv[2] if len(sys.argv) > 2 else None

    print(f"📦 Packaging skill: {skill_path}")
    if output_dir:
        print(f" Output directory: {output_dir}")
    print()

    result = package_skill(skill_path, output_dir)

    # package_skill returns the archive path on success and None on failure.
    sys.exit(0 if result else 1)


if __name__ == "__main__":
    main()
@@ -0,0 +1,103 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Quick validation script for skills - minimal version
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import sys
9
+ import re
10
+ from pathlib import Path
11
+
12
+ if __package__ in (None, ""):
13
+ sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
14
+
15
+ from scripts.utils import load_skill_frontmatter
16
+
17
+
18
def validate_skill(skill_path):
    """Run basic structural checks on a skill folder.

    Checks that SKILL.md exists, that its frontmatter loads, that only known
    top-level keys are used, and that each known field has the expected type
    and size. Checks run in a fixed order and only the first failure is
    reported.

    Args:
        skill_path: Path (str or Path) to the skill folder.

    Returns:
        A ``(valid, message)`` tuple: ``(True, "Skill is valid!")`` on
        success, otherwise ``(False, <reason>)``.
    """
    skill_path = Path(skill_path)

    # Check SKILL.md exists
    skill_md = skill_path / "SKILL.md"
    if not skill_md.exists():
        return False, "SKILL.md not found"

    try:
        frontmatter, _ = load_skill_frontmatter(skill_path)
    except ValueError as exc:
        return False, f"Invalid YAML in frontmatter: {exc}"

    # Everything below indexes the frontmatter by key, so anything other than
    # a mapping is reported as a validation failure instead of crashing.
    if not isinstance(frontmatter, dict):
        return False, f"Frontmatter must be a mapping, got {type(frontmatter).__name__}"

    # Define allowed properties
    ALLOWED_PROPERTIES = {"name", "description", "license", "allowed-tools", "metadata", "compatibility"}

    # Check for unexpected properties (excluding nested keys under metadata)
    unexpected_keys = set(frontmatter.keys()) - ALLOWED_PROPERTIES
    if unexpected_keys:
        # Keys are stringified so a non-string YAML key cannot break the join.
        return False, (
            f"Unexpected key(s) in SKILL.md frontmatter: {', '.join(sorted(map(str, unexpected_keys)))}. "
            f"Allowed properties are: {', '.join(sorted(ALLOWED_PROPERTIES))}"
        )

    # Check required fields
    if "name" not in frontmatter:
        return False, "Missing 'name' in frontmatter"
    if "description" not in frontmatter:
        return False, "Missing 'description' in frontmatter"

    # Validate name
    name = frontmatter["name"]
    if not isinstance(name, str):
        return False, f"Name must be a string, got {type(name).__name__}"
    name = name.strip()
    # A required field that is present but blank is as unusable as a missing one.
    if not name:
        return False, "Name cannot be empty"
    # Check naming convention (kebab-case: lowercase with hyphens)
    if not re.fullmatch(r"[a-z0-9-]+", name):
        return False, f"Name '{name}' should be kebab-case (lowercase letters, digits, and hyphens only)"
    if name.startswith("-") or name.endswith("-") or "--" in name:
        return False, f"Name '{name}' cannot start/end with hyphen or contain consecutive hyphens"
    # Check name length (max 64 characters per spec)
    if len(name) > 64:
        return False, f"Name is too long ({len(name)} characters). Maximum is 64 characters."

    # Validate description
    description = frontmatter["description"]
    if not isinstance(description, str):
        return False, f"Description must be a string, got {type(description).__name__}"
    description = description.strip()
    if not description:
        return False, "Description cannot be empty"
    # Check for angle brackets
    if "<" in description or ">" in description:
        return False, "Description cannot contain angle brackets (< or >)"
    # Check description length (max 1024 characters per spec)
    if len(description) > 1024:
        return False, f"Description is too long ({len(description)} characters). Maximum is 1024 characters."

    # Validate compatibility field if present (optional). The check is on
    # None rather than truthiness so that values such as 0, [] or False are
    # still reported as having the wrong type.
    compatibility = frontmatter.get("compatibility")
    if compatibility is not None:
        if not isinstance(compatibility, str):
            return False, f"Compatibility must be a string, got {type(compatibility).__name__}"
        if len(compatibility) > 500:
            return False, f"Compatibility is too long ({len(compatibility)} characters). Maximum is 500 characters."

    allowed_tools = frontmatter.get("allowed-tools")
    if allowed_tools is not None and not isinstance(allowed_tools, list):
        return False, f"allowed-tools must be a list, got {type(allowed_tools).__name__}"

    metadata = frontmatter.get("metadata")
    if metadata is not None and not isinstance(metadata, dict):
        return False, f"metadata must be a mapping, got {type(metadata).__name__}"

    return True, "Skill is valid!"
94
+
95
+
96
if __name__ == "__main__":
    # Exactly one argument, the skill directory, is accepted.
    cli_args = sys.argv[1:]
    if len(cli_args) != 1:
        print("Usage: python quick_validate.py <skill_directory>")
        sys.exit(1)

    is_valid, report = validate_skill(cli_args[0])
    print(report)
    # The exit status mirrors the validation result: 0 when valid, 1 otherwise.
    if is_valid:
        sys.exit(0)
    sys.exit(1)
@@ -0,0 +1,363 @@
1
+ #!/usr/bin/env python3
2
+ """Run trigger evaluation for a skill description.
3
+
4
+ Supports two modes:
5
+ - `claude`: uses `claude -p` and detects whether the temporary skill/command was
6
+ actually consulted.
7
+ - `codex`: approximates triggering by asking Codex to judge, using only the skill
8
+ name, description, and user query.
9
+
10
+ The Codex path is an approximation because Codex's local skill mechanism is not
11
+ the same as Claude Code's command discovery. It is still useful for measuring
12
+ whether your description clearly communicates when the skill should be used.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import argparse
18
+ import json
19
+ import os
20
+ import select
21
+ import subprocess
22
+ import sys
23
+ import time
24
+ import uuid
25
+ from concurrent.futures import ProcessPoolExecutor, as_completed
26
+ from pathlib import Path
27
+
28
+ if __package__ in (None, ""):
29
+ sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
30
+
31
+ from scripts.model_backends import detect_backend, extract_first_json_object, generate_text
32
+ from scripts.utils import parse_skill_md
33
+
34
+
35
def find_project_root() -> Path:
    """Return the closest directory, from the cwd upward, holding a project marker.

    A directory qualifies when it contains any of ``.claude``, ``.git``,
    ``.codex`` or ``.agents``. The current working directory is returned
    when no directory up to the filesystem root qualifies.
    """
    start = Path.cwd()
    marker_names = (".claude", ".git", ".codex", ".agents")

    def has_marker(directory: Path) -> bool:
        for marker_name in marker_names:
            if (directory / marker_name).exists():
                return True
        return False

    # Nearest first: the cwd itself, then each ancestor in turn.
    candidates = (start, *start.parents)
    return next((directory for directory in candidates if has_marker(directory)), start)
43
+
44
+
45
def run_single_query_claude(
    query: str,
    skill_name: str,
    skill_description: str,
    timeout: int,
    project_root: str,
    model: str | None = None,
) -> bool:
    """Run a single query against Claude Code and detect real skill usage.

    A temporary command file carrying ``skill_description`` is written to
    ``<project_root>/.claude/commands`` under a unique name, ``claude -p`` is
    run on ``query`` with streaming JSON output, and the stream is scanned
    for a ``Skill`` or ``Read`` tool call whose input mentions that unique
    name. The command file is removed again before returning.

    Args:
        query: User prompt passed to ``claude -p``.
        skill_name: Skill name; used in the temporary command's file name and heading.
        skill_description: Description written into the temporary command file.
        timeout: Maximum number of seconds to keep reading the stream.
        project_root: Directory used as the child's working directory and as
            the parent of ``.claude/commands``.
        model: Optional value forwarded as ``--model``.

    Returns:
        True when the stream shows the temporary command being consulted,
        False otherwise (including when the timeout expires first).
    """
    unique_id = uuid.uuid4().hex[:8]
    clean_name = f"{skill_name}-skill-{unique_id}"
    project_commands_dir = Path(project_root) / ".claude" / "commands"
    command_file = project_commands_dir / f"{clean_name}.md"

    try:
        project_commands_dir.mkdir(parents=True, exist_ok=True)
        # Continuation lines get the same indent as the first line so the
        # description stays inside the YAML block scalar.
        indented_desc = "\n ".join(skill_description.split("\n"))
        command_content = (
            f"---\n"
            f"description: |\n"
            f" {indented_desc}\n"
            f"---\n\n"
            f"# {skill_name}\n\n"
            f"This skill handles: {skill_description}\n"
        )
        command_file.write_text(command_content, encoding="utf-8")

        cmd = [
            "claude",
            "-p", query,
            "--output-format", "stream-json",
            "--verbose",
            "--include-partial-messages",
        ]
        if model:
            cmd.extend(["--model", model])

        # The child runs without CLAUDECODE in its environment.
        # NOTE(review): presumably so it does not behave as a nested Claude
        # Code session - confirm.
        env = {k: v for k, v in os.environ.items() if k != "CLAUDECODE"}
        process = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
            cwd=project_root,
            env=env,
        )

        state = {"triggered": False, "pending_tool_name": None, "accumulated_json": ""}
        deadline = time.monotonic() + timeout
        # Raw bytes are buffered and decoded one complete line at a time, so a
        # multi-byte character split across two reads is never mangled.
        buffer = b""

        try:
            while time.monotonic() < deadline:
                exited = process.poll() is not None
                if exited:
                    # Drain whatever the child wrote before it exited.
                    chunk = process.stdout.read() or b""
                else:
                    # select() on a pipe is POSIX-only.
                    ready, _, _ = select.select([process.stdout], [], [], 1.0)
                    if not ready:
                        continue
                    chunk = os.read(process.stdout.fileno(), 8192)

                at_eof = exited or not chunk
                buffer += chunk
                if at_eof and buffer and not buffer.endswith(b"\n"):
                    # Terminate a final unfinished line so it is parsed too.
                    buffer += b"\n"

                while b"\n" in buffer:
                    raw_line, buffer = buffer.split(b"\n", 1)
                    line = raw_line.decode("utf-8", errors="replace").strip()
                    if not line:
                        continue

                    try:
                        event = json.loads(line)
                    except json.JSONDecodeError:
                        continue
                    if not isinstance(event, dict):
                        continue

                    verdict = _classify_claude_event(event, clean_name, state)
                    if verdict is not None:
                        return verdict

                if at_eof:
                    break
        finally:
            if process.poll() is None:
                process.kill()
                process.wait()
            process.stdout.close()

        return state["triggered"]
    finally:
        if command_file.exists():
            command_file.unlink()


def _classify_claude_event(event: dict, clean_name: str, state: dict) -> bool | None:
    """Fold one stream-json event into ``state`` and return a verdict once known.

    Returns True or False as soon as the event settles whether the temporary
    command was consulted, and None while more events are needed.
    """
    event_type = event.get("type")

    if event_type == "stream_event":
        stream_event = event.get("event", {})
        stream_type = stream_event.get("type", "")

        if stream_type == "content_block_start":
            content_block = stream_event.get("content_block", {})
            if content_block.get("type") == "tool_use":
                tool_name = content_block.get("name", "")
                if tool_name not in ("Skill", "Read"):
                    # Any other tool call counts as "skill not used".
                    return False
                state["pending_tool_name"] = tool_name
                state["accumulated_json"] = ""

        elif stream_type == "content_block_delta" and state["pending_tool_name"]:
            delta = stream_event.get("delta", {})
            if delta.get("type") == "input_json_delta":
                # The tool input arrives in fragments; match on the running text.
                state["accumulated_json"] += delta.get("partial_json", "")
                if clean_name in state["accumulated_json"]:
                    return True

        elif stream_type in ("content_block_stop", "message_stop"):
            if state["pending_tool_name"]:
                return clean_name in state["accumulated_json"]
            if stream_type == "message_stop":
                return False

    elif event_type == "assistant":
        # Complete (non-streamed) assistant message: decide on its first tool call.
        message = event.get("message", {})
        for content_item in message.get("content", []):
            if content_item.get("type") != "tool_use":
                continue
            tool_name = content_item.get("name", "")
            tool_input = content_item.get("input", {})
            if tool_name == "Skill" and clean_name in tool_input.get("skill", ""):
                state["triggered"] = True
            elif tool_name == "Read" and clean_name in tool_input.get("file_path", ""):
                state["triggered"] = True
            return state["triggered"]

    elif event_type == "result":
        return state["triggered"]

    return None
176
+
177
+
178
def run_single_query_judge(
    query: str,
    skill_name: str,
    skill_description: str,
    timeout: int,
    project_root: str,
    backend: str,
    model: str | None = None,
) -> bool:
    """Ask a model to judge whether the skill should trigger.

    Builds a prompt from the skill name, the skill description and the user
    query, sends it through ``generate_text`` and reads the ``"trigger"``
    field of the first JSON object found in the reply.

    Args:
        query: User request to classify.
        skill_name: Name of the skill being judged.
        skill_description: Description of the skill being judged.
        timeout: Forwarded to ``generate_text`` as ``timeout``.
        project_root: Forwarded to ``generate_text`` as ``cwd`` (as a Path).
        backend: Forwarded to ``generate_text`` as ``backend``.
        model: Optional model identifier forwarded to ``generate_text``.

    Returns:
        True when the judge's reply marks the skill as triggering.
    """
    prompt = f"""You are evaluating skill routing.

Use only the skill name, the skill description, and the user query below.
Ignore hidden context, filesystem contents, tool availability, and implementation details.

Return strict JSON only:
{{"trigger": true, "reason": "one short sentence"}}

Mark "trigger": true only if this skill is clearly the best specialized workflow for the request.
Mark "trigger": false for simple requests, adjacent requests, or requests better handled by another workflow.

Skill name: {skill_name}
Skill description:
{skill_description}

User query:
{query}
"""
    output = generate_text(
        prompt,
        backend=backend,
        model=model,
        cwd=Path(project_root),
        timeout=timeout,
    )
    parsed = extract_first_json_object(output)
    verdict = parsed.get("trigger")
    # A model may quote the flag ("false"); bool() on a non-empty string is
    # always True, so textual values are interpreted explicitly.
    if isinstance(verdict, str):
        return verdict.strip().lower() in ("true", "yes", "1")
    return bool(verdict)
215
+
216
+
217
def run_eval(
    eval_set: list[dict],
    skill_name: str,
    description: str,
    num_workers: int,
    timeout: int,
    project_root: Path,
    runs_per_query: int = 1,
    trigger_threshold: float = 0.5,
    model: str | None = None,
    backend: str = "auto",
) -> dict:
    """Run the full eval set and return results.

    Every query is submitted ``runs_per_query`` times to a process pool. The
    share of runs that triggered is compared with ``trigger_threshold``: a
    query expected to trigger passes at or above the threshold, and one
    expected not to trigger passes below it. A run that raises is reported on
    stderr and counted as "did not trigger".

    Args:
        eval_set: Items with a ``"query"`` string and a ``"should_trigger"`` flag.
        skill_name: Name of the skill under evaluation.
        description: Skill description under evaluation.
        num_workers: Size of the process pool.
        timeout: Per-query timeout in seconds, forwarded to the query runner.
        project_root: Directory forwarded (as a string) to the query runner.
        runs_per_query: Number of runs per query.
        trigger_threshold: Trigger-rate threshold that separates pass from fail.
        model: Optional model identifier forwarded to the query runner.
        backend: Backend name resolved through ``detect_backend``.

    Returns:
        A dict with ``skill_name``, ``description``, ``backend``,
        ``evaluation_mode``, the per-query ``results`` (in eval-set order)
        and a ``summary`` of total/passed/failed counts.
    """
    backend = detect_backend(backend)

    # Seeded in eval-set order, before any future completes, so the report
    # lists queries in the order they were given rather than in completion
    # order. Repeated queries share one entry and pool their runs.
    query_triggers: dict[str, list[bool]] = {}
    query_items: dict[str, dict] = {}

    with ProcessPoolExecutor(max_workers=num_workers) as executor:
        future_to_query = {}
        for item in eval_set:
            query = item["query"]
            query_items[query] = item
            query_triggers.setdefault(query, [])
            for _ in range(runs_per_query):
                if backend == "claude":
                    future = executor.submit(
                        run_single_query_claude,
                        query,
                        skill_name,
                        description,
                        timeout,
                        str(project_root),
                        model,
                    )
                else:
                    future = executor.submit(
                        run_single_query_judge,
                        query,
                        skill_name,
                        description,
                        timeout,
                        str(project_root),
                        backend,
                        model,
                    )
                future_to_query[future] = query

        for future in as_completed(future_to_query):
            query = future_to_query[future]
            try:
                outcome = future.result()
            except Exception as exc:
                # One failed run must not abort the whole evaluation.
                print(f"Warning: query failed: {exc}", file=sys.stderr)
                outcome = False
            query_triggers[query].append(outcome)

    results = []
    for query, triggers in query_triggers.items():
        if not triggers:
            # Nothing ran for this query (runs_per_query < 1): no rate to report.
            continue
        item = query_items[query]
        trigger_rate = sum(triggers) / len(triggers)
        should_trigger = item["should_trigger"]
        did_pass = trigger_rate >= trigger_threshold if should_trigger else trigger_rate < trigger_threshold
        results.append({
            "query": query,
            "should_trigger": should_trigger,
            "trigger_rate": trigger_rate,
            "triggers": sum(triggers),
            "runs": len(triggers),
            "pass": did_pass,
        })

    passed = sum(1 for item in results if item["pass"])
    total = len(results)
    mode = "observed" if backend == "claude" else "judged"

    return {
        "skill_name": skill_name,
        "description": description,
        "backend": backend,
        "evaluation_mode": mode,
        "results": results,
        "summary": {
            "total": total,
            "passed": passed,
            "failed": total - passed,
        },
    }
304
+
305
+
306
def main():
    """Command-line entry point for the trigger evaluation.

    Parses the options, loads the eval set and the skill's name and
    description, runs the evaluation and prints the result as JSON on
    stdout. Progress and the per-query summary go to stderr when
    ``--verbose`` is given. Exits with status 1 when the skill directory
    has no SKILL.md.
    """
    parser = argparse.ArgumentParser(description="Run trigger evaluation for a skill description")
    parser.add_argument("--eval-set", required=True, help="Path to eval set JSON file")
    parser.add_argument("--skill-path", required=True, help="Path to skill directory")
    parser.add_argument("--description", default=None, help="Override description to test")
    parser.add_argument("--num-workers", type=int, default=10, help="Number of parallel workers")
    parser.add_argument("--timeout", type=int, default=60, help="Timeout per query in seconds")
    parser.add_argument("--runs-per-query", type=int, default=3, help="Number of runs per query")
    parser.add_argument("--trigger-threshold", type=float, default=0.5, help="Trigger rate threshold")
    parser.add_argument("--model", default=None, help="Optional backend model identifier")
    parser.add_argument("--backend", default="auto", choices=["auto", "claude", "codex"], help="Evaluation backend")
    parser.add_argument("--verbose", action="store_true", help="Print progress to stderr")
    args = parser.parse_args()

    # Read as bytes: json.loads then detects UTF-8/16/32 (including a BOM)
    # itself, instead of depending on the locale's default text encoding.
    eval_set = json.loads(Path(args.eval_set).read_bytes())
    skill_path = Path(args.skill_path)

    if not (skill_path / "SKILL.md").exists():
        print(f"Error: No SKILL.md found at {skill_path}", file=sys.stderr)
        sys.exit(1)

    name, original_description, _ = parse_skill_md(skill_path)
    # An empty --description falls back to the description from SKILL.md.
    description = args.description or original_description
    project_root = find_project_root()

    if args.verbose:
        print(f"Evaluating with backend={args.backend}: {description}", file=sys.stderr)

    output = run_eval(
        eval_set=eval_set,
        skill_name=name,
        description=description,
        num_workers=args.num_workers,
        timeout=args.timeout,
        project_root=project_root,
        runs_per_query=args.runs_per_query,
        trigger_threshold=args.trigger_threshold,
        model=args.model,
        backend=args.backend,
    )

    if args.verbose:
        summary = output["summary"]
        print(
            f"Results ({output['evaluation_mode']} via {output['backend']}): "
            f"{summary['passed']}/{summary['total']} passed",
            file=sys.stderr,
        )
        for item in output["results"]:
            status = "PASS" if item["pass"] else "FAIL"
            rate_str = f"{item['triggers']}/{item['runs']}"
            print(f" [{status}] rate={rate_str} expected={item['should_trigger']}: {item['query'][:70]}", file=sys.stderr)

    print(json.dumps(output, indent=2))


if __name__ == "__main__":
    main()