skillnet-ai 0.0.1__py3-none-any.whl → 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1006 @@
+ import ast
+ import json
+ import logging
+ import os
+ import shlex
+ import subprocess
+ import time
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from dataclasses import dataclass
+ from typing import Dict, Any, List, Optional, Tuple, Callable, Iterator
+
+ from openai import OpenAI
+ from tqdm import tqdm
+
+ from skillnet_ai.downloader import SkillDownloader
+ from skillnet_ai.prompts import SKILL_EVALUATION_PROMPT
+
+ logger = logging.getLogger(__name__)
+
+
+ # ==========================================================================
+ # Configuration and data models
+ # ==========================================================================
+
+ @dataclass
+ class EvaluatorConfig:
+     """Configuration for the skill evaluator."""
+     api_key: str
+     base_url: str
+     model: str
+     max_workers: int = 5
+     temperature: float = 0.3
+     cache_dir: str = "./evaluate_cache_dir"
+     run_scripts: bool = False
+     script_timeout_sec: int = 8
+     max_script_runs: int = 5
+     script_python: str = "python"
+     include_script_results: bool = False
+     max_script_output_chars: int = 400
+     github_token: Optional[str] = None
+
+
+ @dataclass
+ class Skill:
+     """Unified representation of a skill."""
+     path: str  # Local path to the skill root directory
+     name: str
+     description: Optional[str] = None
+     category: Optional[str] = None
+     url: Optional[str] = None  # Original URL (when created from URL)
+
+     @classmethod
+     def from_url(
+         cls,
+         url: str,
+         downloader: 'SkillDownloader',
+         cache_dir: str,
+         max_retries: int = 3,
+         retry_delay: float = 2.0,
+         **kwargs
+     ) -> Tuple[Optional['Skill'], Optional[str]]:
+         """
+         Create a Skill from a GitHub URL.
+         If the download fails, it retries; once retries are exhausted it returns
+         (None, error_msg) instead of raising an exception.
+
+         Returns:
+             (Skill, None) success; (None, error_msg) failure.
+         """
+         normalized_url = cls._normalize_url(url)
+         if not normalized_url:
+             return None, f"Invalid GitHub URL: {url}"
+         # Download to the local cache, retrying on failure
+         local_path = None
+         for attempt in range(max_retries):
+             local_path = downloader.download(normalized_url, target_dir=cache_dir)
+             if local_path:
+                 break
+             if attempt < max_retries - 1:
+                 logger.warning(
+                     "Download failed (attempt %d/%d). Retrying in %.1fs...",
+                     attempt + 1, max_retries, retry_delay,
+                 )
+                 time.sleep(retry_delay)
+         if not local_path:
+             return None, f"Failed to download after {max_retries} attempts: {url}"
+         # Derive skill name from URL if not provided
+         name = kwargs.get('name') or normalized_url.rstrip('/').split('/')[-1]
+
+         return cls(
+             path=local_path,
+             name=name,
+             url=url,
+             description=kwargs.get('description'),
+             category=kwargs.get('category')
+         ), None
+
+     @classmethod
+     def from_path(cls, path: str, **kwargs) -> Tuple[Optional['Skill'], Optional[str]]:
+         """Create a Skill from a local directory path.
+
+         Returns:
+             (Skill, None) success; (None, error_msg) failure.
+         """
+         abs_path = os.path.abspath(path)
+         if not os.path.isdir(abs_path):
+             return None, f"Invalid skill path: {path}"
+
+         name = kwargs.get('name') or os.path.basename(abs_path)
+
+         return (
+             cls(
+                 path=abs_path,
+                 name=name,
+                 description=kwargs.get('description'),
+                 category=kwargs.get('category')
+             ),
+             None,
+         )
+
+     @staticmethod
+     def _normalize_url(url: str) -> Optional[str]:
+         """Normalize GitHub URL to /tree/ format."""
+         if not url:
+             return None
+         if "/blob/" in url:
+             return url.replace("/blob/", "/tree/")
+         if "/tree/" in url:
+             return url
+         return None
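+     # _normalize_url sketch (hypothetical URLs, not from this package):
+     #   ".../repo/blob/main/skills/foo" -> ".../repo/tree/main/skills/foo"
+     #   ".../repo/tree/main/skills/foo" -> returned unchanged
+     #   "https://example.com/elsewhere" -> None (no /blob/ or /tree/ segment)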
+
+
+ @dataclass
+ class ScriptExecutionResult:
+     """Result of executing a single script."""
+     path: str
+     status: str  # success | compiled_only | failed | timeout | skipped
+     command: str
+     exit_code: Optional[int] = None
+     error: Optional[str] = None
+     duration_sec: Optional[float] = None
+     note: Optional[str] = None
+
+     def to_dict(self) -> Dict[str, Any]:
+         return {
+             "path": self.path,
+             "status": self.status,
+             "command": self.command,
+             "exit_code": self.exit_code,
+             "error": self.error,
+             "duration_sec": self.duration_sec,
+             "note": self.note,
+         }
+
+
+ class ScriptRunner:
+     """Execute python scripts under scripts/ with safe defaults."""
+
+     PATH_LIKE_EXTS = {
+         ".xml",
+         ".json",
+         ".yaml",
+         ".yml",
+         ".csv",
+         ".tsv",
+         ".txt",
+         ".md",
+         ".ini",
+         ".toml",
+         ".coverage",
+         ".db",
+         ".sqlite",
+         ".sql",
+         ".parquet",
+     }
+
+     def __init__(self, python_bin: str, timeout_sec: int, max_runs: int,
+                  max_output_chars: int):
+         self.python_bin = python_bin
+         self.timeout_sec = timeout_sec
+         self.max_runs = max_runs
+         self.max_output_chars = max_output_chars
+
+     def run_for_skill(self, skill_dir: str) -> List[ScriptExecutionResult]:
+         scripts = self._discover_py_scripts(skill_dir)
+         results: List[ScriptExecutionResult] = []
+
+         for script_path in scripts[: self.max_runs]:
+             results.append(self._run_script(skill_dir, script_path))
+
+         if len(scripts) > self.max_runs:
+             logger.info(
+                 "Found %s scripts, truncated to %s for execution",
+                 len(scripts),
+                 self.max_runs
+             )
+
+         return results
+
+     def _discover_py_scripts(self, skill_dir: str) -> List[str]:
+         paths: List[str] = []
+         for root, _, files in os.walk(skill_dir):
+             if "scripts" not in root.split(os.sep):
+                 continue
+             for filename in files:
+                 if filename.lower().endswith(".py"):
+                     paths.append(os.path.join(root, filename))
+         return sorted(paths)
+
+     def _run_script(self, skill_dir: str, script_path: str) -> ScriptExecutionResult:
+         rel_path = os.path.relpath(script_path, skill_dir)
+         usage_cmd = self._build_usage_command(script_path, rel_path)
+         if usage_cmd:
+             missing_inputs = self._detect_missing_inputs(usage_cmd, skill_dir)
+             if missing_inputs:
+                 compile_result = self._run_command(
+                     [self.python_bin, "-m", "py_compile", rel_path],
+                     skill_dir
+                 )
+                 note = f"missing inputs: {', '.join(missing_inputs)}"
+                 if compile_result["timed_out"]:
+                     return self._result_timeout(rel_path, compile_result, note=note)
+                 if compile_result["exit_code"] == 0:
+                     return self._result_compiled_only(rel_path, compile_result, note=note)
+                 return self._result_failed(rel_path, compile_result, note=note)
+             run_result = self._run_command(usage_cmd, skill_dir)
+             note = "usage-derived command"
+             if run_result["timed_out"]:
+                 return self._result_timeout(rel_path, run_result, note=note)
+             if run_result["exit_code"] == 0:
+                 return self._result_success(rel_path, run_result, note=note)
+             return self._result_failed(rel_path, run_result, note=note)
+
+         compile_result = self._run_command(
+             [self.python_bin, "-m", "py_compile", rel_path],
+             skill_dir
+         )
+         if compile_result["timed_out"]:
+             return self._result_timeout(rel_path, compile_result)
+         if compile_result["exit_code"] == 0:
+             return self._result_compiled_only(
+                 rel_path,
+                 compile_result,
+                 note="no usage examples found; py_compile succeeded",
+             )
+
+         return self._result_failed(
+             rel_path,
+             compile_result,
+             note="no usage examples found",
+         )
+
+     def _result_timeout(self, rel_path: str, result: Dict[str, Any], note: Optional[str] = None) -> ScriptExecutionResult:
+         return ScriptExecutionResult(
+             path=rel_path,
+             status="timeout",
+             command=result["command"],
+             exit_code=None,
+             error=result.get("error"),
+             duration_sec=result["duration_sec"],
+             note=note,
+         )
+
+     def _result_success(self, rel_path: str, result: Dict[str, Any], note: Optional[str] = None) -> ScriptExecutionResult:
+         return ScriptExecutionResult(
+             path=rel_path,
+             status="success",
+             command=result["command"],
+             exit_code=result.get("exit_code"),
+             duration_sec=result["duration_sec"],
+             note=note,
+         )
+
+     def _result_compiled_only(self, rel_path: str, result: Dict[str, Any], note: Optional[str] = None) -> ScriptExecutionResult:
+         return ScriptExecutionResult(
+             path=rel_path,
+             status="compiled_only",
+             command=result["command"],
+             exit_code=result.get("exit_code"),
+             error=self._pick_error(result),
+             duration_sec=result["duration_sec"],
+             note=note,
+         )
+
+     def _result_failed(self, rel_path: str, result: Dict[str, Any], note: Optional[str] = None) -> ScriptExecutionResult:
+         return ScriptExecutionResult(
+             path=rel_path,
+             status="failed",
+             command=result["command"],
+             exit_code=result.get("exit_code"),
+             error=self._pick_error(result),
+             duration_sec=result["duration_sec"],
+             note=note,
+         )
+
+     def _build_usage_command(self, script_path: str,
+                              rel_path: str) -> Optional[List[str]]:
+         script_name = os.path.basename(script_path)
+         usage_lines = self._extract_usage_lines(script_path, script_name)
+         if not usage_lines:
+             return None
+
+         candidates: List[List[str]] = []
+         for line in usage_lines:
+             cmd = self._parse_usage_line(line, rel_path, script_name)
+             if cmd:
+                 candidates.append(cmd)
+
+         if not candidates:
+             return None
+
+         # Prefer runnable commands:
+         # - Commands containing placeholder tokens like "<file>", "[options]", "{path}" are
+         #   often documentation examples and not directly runnable.
+         # - If placeholders exist, prefer a help-style command to at least verify the script
+         #   starts up, otherwise fall back to compilation-only.
+         runnable = [cmd for cmd in candidates if not self._has_placeholder_tokens(cmd)]
+         for cmd in runnable:
+             if not self._is_help_command(cmd):
+                 return cmd
+
+         for cmd in candidates:
+             if self._is_help_command(cmd):
+                 return cmd
+
+         return None
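+     # Candidate-selection sketch (hypothetical usage lines, not from this package):
+     # given candidates parsed from "python foo.py <input.xml>" and "python foo.py --help",
+     # the first contains a placeholder token and is skipped, so the help-style command is
+     # returned and the script's startup path is at least exercised.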
+
+     def _extract_usage_lines(self, script_path: str,
+                              script_name: str) -> List[str]:
+         try:
+             with open(script_path, "r", encoding="utf-8", errors="ignore") as f:
+                 source = f.read()
+         except Exception as e:
+             logger.warning("Failed to read %s: %s", script_path, e)
+             return []
+
+         try:
+             tree = ast.parse(source)
+             doc = ast.get_docstring(tree) or ""
+         except Exception:
+             doc = ""
+
+         if not doc:
+             return []
+
+         lines = doc.splitlines()
+         usage_lines: List[str] = []
+         for idx, line in enumerate(lines):
+             if line.strip().lower().startswith("usage:"):
+                 for follow in lines[idx + 1:]:
+                     if not follow.strip():
+                         break
+                     usage_lines.append(follow.strip())
+
+         if usage_lines:
+             return usage_lines
+
+         for line in lines:
+             stripped = line.strip()
+             if not stripped:
+                 continue
+             if stripped.startswith(("./", "python", "python3")):
+                 usage_lines.append(stripped)
+                 continue
+             if stripped.startswith(script_name):
+                 usage_lines.append(stripped)
+
+         return usage_lines
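+     # Parsing sketch: a module docstring such as (hypothetical)
+     #
+     #     Usage:
+     #         python convert.py input.xml output.json
+     #
+     # yields ["python convert.py input.xml output.json"]. Without a "Usage:" header,
+     # docstring lines starting with "./", "python", or the script's own filename are
+     # collected instead.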
+
+     def _parse_usage_line(self, line: str, rel_path: str,
+                           script_name: str) -> Optional[List[str]]:
+         try:
+             tokens = shlex.split(line)
+         except ValueError:
+             return None
+
+         if not tokens:
+             return None
+
+         python_prefix = tokens[0].startswith("python")
+         if python_prefix:
+             tokens = [self.python_bin] + tokens[1:]
+
+         script_idx = None
+         for idx, token in enumerate(tokens):
+             token_base = os.path.basename(token)
+             if token_base == script_name:
+                 script_idx = idx
+                 break
+
+         if script_idx is None:
+             return None
+
+         tokens[script_idx] = rel_path
+
+         if not python_prefix:
+             tokens = [self.python_bin] + tokens[script_idx:]
+
+         return tokens
+
+     def _iter_non_flag_tokens(self, cmd: List[str]) -> Iterator[str]:
+         for token in cmd:
+             if not token or token.startswith("-"):
+                 continue
+             if token == self.python_bin:
+                 continue
+             yield token
+
+     def _is_help_command(self, cmd: List[str]) -> bool:
+         for token in cmd:
+             lowered = token.lower()
+             if lowered in {"--help", "-h", "help"}:
+                 return True
+         return False
+
+     def _has_placeholder_tokens(self, cmd: List[str]) -> bool:
+         for token in self._iter_non_flag_tokens(cmd):
+             if self._is_placeholder_token(token):
+                 return True
+         return False
+
+     @staticmethod
+     def _is_placeholder_token(token: str) -> bool:
+         # Common usage placeholders: <...>, [...], {...}
+         if any(ch in token for ch in ("<", ">", "[", "]", "{", "}")):
+             return True
+         lowered = token.strip().lower()
+         return lowered in {"options", "[options]", "<options>", "{options}"}
+
+     def _detect_missing_inputs(self, cmd: List[str], cwd: str) -> List[str]:
+         missing: List[str] = []
+         for token in self._iter_non_flag_tokens(cmd):
+             if self._is_placeholder_token(token):
+                 continue
+             if not self._looks_like_path(token):
+                 continue
+             path = token if os.path.isabs(token) else os.path.join(cwd, token)
+             if not os.path.exists(path):
+                 missing.append(token)
+         return missing
+
+     def _looks_like_path(self, token: str) -> bool:
+         if "/" in token or token.startswith("."):
+             return True
+         _, ext = os.path.splitext(token)
+         return ext.lower() in self.PATH_LIKE_EXTS
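+     # Heuristic sketch (hypothetical tokens): "data/input.xml", "./run.txt", and
+     # "report.csv" all look like paths ("/" prefix, "." prefix, or a known extension);
+     # bare words such as "verbose" or "42" do not.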
+
+     def _run_command(self, cmd: List[str], cwd: str) -> Dict[str, Any]:
+         command_str = shlex.join(cmd)
+         start = time.time()
+         try:
+             completed = subprocess.run(
+                 cmd,
+                 cwd=cwd,
+                 text=True,
+                 capture_output=True,
+                 timeout=self.timeout_sec
+             )
+             duration = time.time() - start
+             return {
+                 "command": command_str,
+                 "exit_code": completed.returncode,
+                 "stdout": self._truncate(completed.stdout),
+                 "stderr": self._truncate(completed.stderr),
+                 "duration_sec": round(duration, 3),
+                 "timed_out": False
+             }
+         except subprocess.TimeoutExpired:
+             duration = time.time() - start
+             return {
+                 "command": command_str,
+                 "exit_code": None,
+                 "stdout": "",
+                 "stderr": "",
+                 "duration_sec": round(duration, 3),
+                 "timed_out": True,
+                 "error": f"Timeout after {self.timeout_sec}s"
+             }
+         except FileNotFoundError as e:
+             duration = time.time() - start
+             return {
+                 "command": command_str,
+                 "exit_code": None,
+                 "stdout": "",
+                 "stderr": str(e),
+                 "duration_sec": round(duration, 3),
+                 "timed_out": False,
+                 "error": str(e)
+             }
+
+     def _truncate(self, text: str) -> str:
+         if not text:
+             return ""
+         if len(text) <= self.max_output_chars:
+             return text
+         return text[: self.max_output_chars] + "...[truncated]"
+
+     def _pick_error(self, *results: Dict[str, Any]) -> Optional[str]:
+         for result in results:
+             if not result:
+                 continue
+             stderr = result.get("stderr") or ""
+             stdout = result.get("stdout") or ""
+             if stderr.strip():
+                 return stderr.strip()
+             if stdout.strip():
+                 return stdout.strip()
+             if result.get("error"):
+                 return str(result.get("error"))
+         return None
+
+
+ # ==========================================================================
+ # Skill content loader
+ # ==========================================================================
+
+ class SkillLoader:
+     """Load SKILL.md, scripts, and reference files for a skill."""
+
+     REFERENCE_ALLOWED_EXTS = {
+         ".md",
+         ".txt",
+         ".json",
+         ".yaml",
+         ".yml",
+         ".ini",
+         ".toml",
+         ".cfg",
+         ".csv",
+         ".tsv",
+     }
+
+     @staticmethod
+     def _walk_and_load(skill_dir: str, max_files: int, max_chars: int,
+                        root_filter: Callable[[str], bool],
+                        file_filter: Callable[[str], bool],
+                        skip_skill_md: bool) -> List[Dict[str, str]]:
+         items: List[Dict[str, str]] = []
+         for root, _, files in os.walk(skill_dir):
+             if not root_filter(root):
+                 continue
+             for filename in files:
+                 if len(items) >= max_files:
+                     return items
+                 if skip_skill_md and filename.lower() == "skill.md":
+                     continue
+                 if not file_filter(filename):
+                     continue
+                 filepath = os.path.join(root, filename)
+                 rel_path = os.path.relpath(filepath, skill_dir)
+                 try:
+                     with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
+                         content = f.read(max_chars)
+                     items.append({"path": rel_path, "content": content})
+                 except Exception as e:
+                     logger.warning(f"Skip {filepath}: {e}")
+         return items
+
+     @staticmethod
+     def load_skill_md(skill_dir: str, max_chars: int = 12000) -> Optional[str]:
+         """Load SKILL.md content with optional truncation."""
+         path = SkillLoader._find_file(skill_dir, "skill.md")
+         if not path:
+             logger.warning(f"SKILL.md not found in {skill_dir}")
+             return None
+
+         with open(path, 'r', encoding='utf-8', errors='ignore') as f:
+             content = f.read()
+
+         if len(content) > max_chars:
+             content = content[:max_chars] + "\n\n...[truncated]..."
+
+         return content
+
+     @staticmethod
+     def load_scripts(skill_dir: str, max_files: int = 5,
+                      max_chars: int = 1200) -> List[Dict[str, str]]:
+         """Load a sample of files under the scripts directory."""
+         return SkillLoader._walk_and_load(
+             skill_dir,
+             max_files=max_files,
+             max_chars=max_chars,
+             root_filter=lambda root: "scripts" in root.split(os.sep),
+             file_filter=lambda _filename: True,
+             skip_skill_md=False,
+         )
+
+     @staticmethod
+     def load_references(
+         skill_dir: str,
+         max_files: int = 10,
+         max_chars: int = 4000,
+     ) -> List[Dict[str, str]]:
+         """
+         Load non-script reference files for a skill.
+
+         This is intended for files other than SKILL.md and scripts/,
+         e.g. README.md, references/, assets/, etc.
+         """
+         def file_filter(filename: str) -> bool:
+             ext = os.path.splitext(filename)[1].lower()
+             return (not ext) or (ext in SkillLoader.REFERENCE_ALLOWED_EXTS)
+
+         return SkillLoader._walk_and_load(
+             skill_dir,
+             max_files=max_files,
+             max_chars=max_chars,
+             root_filter=lambda root: "scripts" not in root.split(os.sep),
+             file_filter=file_filter,
+             skip_skill_md=True,
+         )
+
+     @staticmethod
+     def _find_file(directory: str, filename: str) -> Optional[str]:
+         """Recursively find a file in directory (case-insensitive)."""
+         for root, _, files in os.walk(directory):
+             for f in files:
+                 if f.lower() == filename.lower():
+                     return os.path.join(root, f)
+         return None
+
+
+ # ==========================================================================
+ # Prompt builder
+ # ==========================================================================
+
+ class PromptBuilder:
+     """Build prompts for skill evaluation."""
+
+     @staticmethod
+     def _format_file_items(items: List[Dict[str, str]], empty_message: str) -> str:
+         formatted: List[str] = []
+         for item in items:
+             if not isinstance(item, dict):
+                 continue
+             path = item.get("path") or "[unknown path]"
+             content = item.get("content") or ""
+             formatted.append(f"# {path}\n{content}\n")
+         return "\n".join(formatted) if formatted else empty_message
+
+     @staticmethod
+     def build(skill: Skill, skill_md: Optional[str],
+               scripts: List[Dict[str, str]],
+               references: Optional[List[Dict[str, str]]] = None,
+               script_exec_results: Optional[List[ScriptExecutionResult]] = None) -> str:
+         """Build the evaluation prompt for a given skill."""
+         skill_md_block = skill_md or "[SKILL.md not found]"
+
+         if references:
+             references_block = PromptBuilder._format_file_items(
+                 references,
+                 "[No references or additional assets found]",
+             )
+         else:
+             references_block = "[No references or additional assets found]"
+
+         if scripts:
+             scripts_block = PromptBuilder._format_file_items(
+                 scripts,
+                 "[No scripts found]",
+             )
+         else:
+             scripts_block = "[No scripts found]"
+
+         if script_exec_results is None:
+             script_exec_block = "[Scripts not executed]"
+         elif not script_exec_results:
+             script_exec_block = "[No runnable python scripts found]"
+         else:
+             script_exec_block = "\n".join(
+                 PromptBuilder._format_exec_result(r)
+                 for r in script_exec_results
+             )
+
+         return SKILL_EVALUATION_PROMPT.format(
+             skill_name=skill.name,
+             skill_description=skill.description or "N/A",
+             category=skill.category or "N/A",
+             repo_name="N/A",
+             author="N/A",
+             skill_md_block=skill_md_block,
+             references_block=references_block,
+             scripts_block=scripts_block,
+             script_exec_block=script_exec_block
+         )
+
+     @staticmethod
+     def _format_exec_result(result: ScriptExecutionResult) -> str:
+         base = f"- {result.path}: {result.status}"
+         if result.exit_code is not None:
+             base += f" (exit={result.exit_code})"
+         base += f" | cmd: {result.command}"
+         if result.note:
+             base += f" | note: {result.note}"
+         if result.error:
+             clean_error = " ".join(result.error.splitlines())
+             base += f" | error: {clean_error}"
+         return base
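+     # Output sketch (hypothetical values): a failed run renders as
+     #     - scripts/convert.py: failed (exit=2) | cmd: python scripts/convert.py input.xml | note: usage-derived command | error: ModuleNotFoundError: No module named 'lxml'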
+
+
+ # ==========================================================================
+ # LLM client
+ # ==========================================================================
+
+ class LLMClient:
+     """Thin wrapper around the OpenAI client for evaluation calls."""
+
+     def __init__(self, config: EvaluatorConfig):
+         self.client = OpenAI(api_key=config.api_key, base_url=config.base_url)
+         self.model = config.model
+         self.temperature = config.temperature
+
+     def evaluate(self, prompt: str) -> Dict[str, Any]:
+         """Call the LLM with the given prompt and parse the JSON response."""
+         messages = [
+             {
+                 "role": "system",
+                 "content": (
+                     "You are an expert evaluator of AI Agent Skills. "
+                     "Follow the JSON schema and constraints exactly. "
+                     "Use ONLY the provided metadata, SKILL.md, reference files, and script snippets."
+                 )
+             },
+             {"role": "user", "content": prompt}
+         ]
+
+         try:
+             response = self.client.chat.completions.create(
+                 model=self.model,
+                 messages=messages,
+                 response_format={"type": "json_object"},
+                 temperature=self.temperature
+             )
+             raw_response = response.choices[0].message.content
+             return json.loads(raw_response)
+         except Exception as e:
+             logger.error(f"LLM call failed: {e}")
+             raise
+
+
+ # ==========================================================================
+ # Core evaluator
+ # ==========================================================================
+
+ class SkillEvaluator:
+     """
+     Main entry point for evaluating AI skills.
+
+     Typical usage:
+         config = EvaluatorConfig(api_key="your-key", base_url="your-base-url", model="gpt-4o")
+         evaluator = SkillEvaluator(config)
+
+         # Single skill from URL
+         skill, err = Skill.from_url("https://github.com/.../skill", evaluator.downloader, config.cache_dir)
+         if err:
+             result = evaluator._create_error_result(err)
+         else:
+             result = evaluator.evaluate(skill)
+
+         # Single skill from local path
+         skill, err = Skill.from_path("/path/to/skill")
+         if err:
+             result = evaluator._create_error_result(err)
+         else:
+             result = evaluator.evaluate(skill)
+
+         # Batch evaluation
+         skills = [skill1, skill2, skill3]
+         results = evaluator.evaluate_batch(skills)
+     """
+
+     def __init__(self, config: EvaluatorConfig):
+         """Initialize the evaluator with configuration."""
+         if not config.api_key:
+             raise ValueError("API key is required")
+
+         self.config = config
+         self.downloader = SkillDownloader(api_token=config.github_token)
+         self.loader = SkillLoader()
+         self.prompt_builder = PromptBuilder()
+         self.llm_client = LLMClient(config)
+         self.script_runner = ScriptRunner(
+             python_bin=config.script_python,
+             timeout_sec=config.script_timeout_sec,
+             max_runs=config.max_script_runs,
+             max_output_chars=config.max_script_output_chars
+         )
+
+     def evaluate(self, skill: Skill) -> Dict[str, Any]:
+         """
+         Evaluate a single skill.
+
+         Args:
+             skill: A Skill instance to evaluate.
+
+         Returns:
+             A dict containing the evaluation result.
+         """
+         try:
+             # Load content
+             skill_md = self.loader.load_skill_md(skill.path)
+             scripts = self.loader.load_scripts(skill.path)
+             references = self.loader.load_references(skill.path)
+
+             # Optional script execution
+             script_exec_results: Optional[List[ScriptExecutionResult]] = None
+             if self.config.run_scripts:
+                 script_exec_results = self.script_runner.run_for_skill(skill.path)
+
+             # Build prompt
+             prompt = self.prompt_builder.build(
+                 skill,
+                 skill_md,
+                 scripts,
+                 references=references,
+                 script_exec_results=script_exec_results
+             )
+
+             # Call LLM
+             result = self.llm_client.evaluate(prompt)
+             if self.config.include_script_results and script_exec_results is not None:
+                 result["script_execution"] = [
+                     r.to_dict() for r in script_exec_results
+                 ]
+             return result
+
+         except Exception as e:
+             skill_name = getattr(skill, "name", "[unknown skill]")
+             logger.exception("Evaluation failed for %s: %s", skill_name, e)
+             return self._create_error_result(str(e))
+
+     def evaluate_batch(self, skills: List[Skill]) -> List[Dict[str, Any]]:
+         """
+         Evaluate multiple skills in parallel.
+
+         Args:
+             skills: List of Skill objects.
+
+         Returns:
+             List of evaluation results in the same order as input.
+         """
+         results = [None] * len(skills)
+
+         with ThreadPoolExecutor(max_workers=self.config.max_workers) as executor:
+             future_to_idx = {
+                 executor.submit(self.evaluate, skill): idx
+                 for idx, skill in enumerate(skills)
+             }
+
+             with tqdm(total=len(skills), desc="Evaluating skills") as pbar:
+                 for future in as_completed(future_to_idx):
+                     idx = future_to_idx[future]
+                     results[idx] = future.result()
+                     pbar.update(1)
+
+         return results
+
+     def evaluate_from_url(self, url: str, **kwargs) -> Dict[str, Any]:
+         """Convenience helper: create and evaluate a skill from a URL."""
+         skill, err = Skill.from_url(
+             url, self.downloader, self.config.cache_dir, **kwargs
+         )
+         if err:
+             return self._create_error_result(err)
+         return self.evaluate(skill)
+
+     def evaluate_from_path(self, path: str, **kwargs) -> Dict[str, Any]:
+         """Convenience helper: create and evaluate a skill from a local path."""
+         skill, err = Skill.from_path(
+             path, **kwargs
+         )
+         if err:
+             return self._create_error_result(err)
+         return self.evaluate(skill)
+
+     @staticmethod
+     def _create_error_result(error_msg: str) -> Dict[str, Any]:
+         """Create a default error-shaped evaluation result."""
+         error_item = {"level": "Poor", "reason": f"Evaluation failed: {error_msg}"}
+         return {
+             "error": error_msg,
+             "safety": error_item,
+             "completeness": error_item,
+             "executability": error_item,
+             "modifiability": error_item,
+             "cost_awareness": error_item
+         }
+
+
+ # ==========================================================================
+ # CLI entry point
+ # ==========================================================================
+
+ if __name__ == '__main__':
+     # Command-line entry point for batch evaluation from JSONL.
+     import argparse
+
+     parser = argparse.ArgumentParser(description='Evaluate AI Agent Skills')
+     parser.add_argument('--input', required=True, help='Input JSONL file')
+     parser.add_argument('--output', required=True, help='Output JSONL file')
+     parser.add_argument('--api-key', help='OpenAI API key')
+     parser.add_argument('--base-url', help='OpenAI API base URL')
+     parser.add_argument('--model', default='gpt-4o', help='Model name')
+     parser.add_argument('--max-workers', type=int, default=5, help='Max workers')
+     parser.add_argument('--cache-dir', default='./evaluate_cache_dir', help='Cache directory')
+     parser.add_argument('--run-scripts', action='store_true',
+                         help='Execute python scripts under scripts/')
+     parser.add_argument('--script-timeout', type=int, default=8,
+                         help='Timeout seconds per script run')
+     parser.add_argument('--max-script-runs', type=int, default=5,
+                         help='Max python scripts to execute per skill')
+     parser.add_argument('--script-python', default='python',
+                         help='Python executable for running scripts')
+     parser.add_argument('--include-script-results', action='store_true',
+                         help='Attach script execution results to evaluation output')
+     parser.add_argument('--max-script-output-chars', type=int, default=400,
+                         help='Max chars of script stdout/stderr to keep')
+
+     args = parser.parse_args()
+
+     def _load_records(jsonl_path: str) -> List[Dict[str, Any]]:
+         with open(jsonl_path, 'r', encoding='utf-8') as f:
+             return [json.loads(line) for line in f if line.strip()]
+
+     def _build_skills(
+         records: List[Dict[str, Any]],
+         evaluator: 'SkillEvaluator',
+         config: EvaluatorConfig,
+     ) -> Tuple[List[Optional[Skill]], Dict[int, str]]:
+         skills: List[Optional[Skill]] = []
+         errors: Dict[int, str] = {}
+         for idx, rec in enumerate(records):
+             if 'skill_url' in rec:
+                 skill, err = Skill.from_url(
+                     rec['skill_url'],
+                     evaluator.downloader,
+                     config.cache_dir,
+                     name=rec.get('skill_name'),
+                     description=rec.get('skill_description'),
+                     category=rec.get('category')
+                 )
+             elif 'skill_path' in rec:
+                 skill, err = Skill.from_path(
+                     rec['skill_path'],
+                     name=rec.get('skill_name'),
+                     description=rec.get('skill_description'),
+                     category=rec.get('category')
+                 )
+             else:
+                 raise ValueError("Record must have 'skill_url' or 'skill_path'")
+
+             if err:
+                 errors[idx] = err
+                 skills.append(None)
+             else:
+                 skills.append(skill)
+         return skills, errors
+
+     def _evaluate_records(
+         records: List[Dict[str, Any]],
+         evaluator: 'SkillEvaluator',
+         skills: List[Optional[Skill]],
+         errors: Dict[int, str],
+     ) -> List[Dict[str, Any]]:
+         skills_to_eval = [(idx, s) for idx, s in enumerate(skills) if s is not None]
+         idx_to_result: Dict[int, Dict[str, Any]] = {}
+         if skills_to_eval:
+             indices, valid_skills = zip(*skills_to_eval)
+             batch_results = evaluator.evaluate_batch(list(valid_skills))
+             idx_to_result = dict(zip(indices, batch_results))
+         return [
+             evaluator._create_error_result(errors[idx])
+             if idx in errors
+             else idx_to_result[idx]
+             for idx in range(len(records))
+         ]
+
+     def _write_outputs(records: List[Dict[str, Any]], output_jsonl_path: str) -> str:
+         with open(output_jsonl_path, 'w', encoding='utf-8') as f:
+             for rec in records:
+                 f.write(json.dumps(rec, ensure_ascii=False) + '\n')
+         json_path = output_jsonl_path.replace('.jsonl', '.json')
+         with open(json_path, 'w', encoding='utf-8') as f:
+             json.dump({str(i): rec for i, rec in enumerate(records)},
+                       f, ensure_ascii=False, indent=2)
+         return json_path
+
+     config = EvaluatorConfig(
+         api_key=args.api_key or os.getenv('API_KEY'),
+         base_url=args.base_url or os.getenv('BASE_URL'),
+         model=args.model,
+         max_workers=args.max_workers,
+         cache_dir=args.cache_dir,
+         run_scripts=args.run_scripts,
+         script_timeout_sec=args.script_timeout,
+         max_script_runs=args.max_script_runs,
+         script_python=args.script_python,
+         include_script_results=args.include_script_results,
+         max_script_output_chars=args.max_script_output_chars
+     )
+     evaluator = SkillEvaluator(config)
+     records = _load_records(args.input)
+     skills, download_errors = _build_skills(records, evaluator, config)
+     results = _evaluate_records(records, evaluator, skills, download_errors)
+     for rec, result in zip(records, results):
+         rec['evaluation'] = result
+     json_path = _write_outputs(records, args.output)
+
+     print(f"✓ Evaluated {len(results)} skills")
+     print(f"✓ Results saved to {args.output} and {json_path}")
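For reference, each line of the input JSONL is a JSON object carrying either 'skill_url' or 'skill_path', plus optional 'skill_name', 'skill_description', and 'category' fields, matching the record handling in _build_skills above. A minimal sketch follows; the URL and paths are hypothetical, and the module filename is not shown in this diff, while the flags match the argparse definitions above:

    {"skill_url": "https://github.com/org/repo/tree/main/skills/example-skill", "category": "data"}
    {"skill_path": "/path/to/local-skill", "skill_name": "local-skill"}

    python <this-module>.py --input skills.jsonl --output results.jsonl --api-key YOUR_KEY --run-scripts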