skillnet-ai 0.0.1__py3-none-any.whl → 0.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- skillnet_ai/__init__.py +23 -0
- skillnet_ai/analyzer.py +222 -0
- skillnet_ai/cli.py +577 -0
- skillnet_ai/client.py +316 -0
- skillnet_ai/creator.py +1026 -0
- skillnet_ai/downloader.py +156 -0
- skillnet_ai/evaluator.py +1006 -0
- skillnet_ai/models.py +41 -0
- skillnet_ai/prompts.py +885 -0
- skillnet_ai/searcher.py +100 -0
- skillnet_ai-0.0.3.dist-info/METADATA +369 -0
- skillnet_ai-0.0.3.dist-info/RECORD +16 -0
- {skillnet_ai-0.0.1.dist-info → skillnet_ai-0.0.3.dist-info}/WHEEL +1 -1
- skillnet_ai-0.0.3.dist-info/entry_points.txt +2 -0
- skillnet_ai-0.0.3.dist-info/licenses/LICENSE +21 -0
- skillnet_ai-0.0.1.dist-info/METADATA +0 -20
- skillnet_ai-0.0.1.dist-info/RECORD +0 -5
- {skillnet_ai-0.0.1.dist-info → skillnet_ai-0.0.3.dist-info}/top_level.txt +0 -0
skillnet_ai/evaluator.py
ADDED
@@ -0,0 +1,1006 @@

import ast
import json
import logging
import os
import shlex
import subprocess
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass
from typing import Dict, Any, List, Optional, Tuple, Callable, Iterator

from openai import OpenAI
from tqdm import tqdm

from skillnet_ai.downloader import SkillDownloader
from skillnet_ai.prompts import SKILL_EVALUATION_PROMPT

logger = logging.getLogger(__name__)

# ==========================================================================
# Configuration and data models
# ==========================================================================

@dataclass
class EvaluatorConfig:
    """Configuration for the skill evaluator."""
    api_key: str
    base_url: str
    model: str
    max_workers: int = 5
    temperature: float = 0.3
    cache_dir: str = "./evaluate_cache_dir"
    run_scripts: bool = False
    script_timeout_sec: int = 8
    max_script_runs: int = 5
    script_python: str = "python"
    include_script_results: bool = False
    max_script_output_chars: int = 400
    github_token: Optional[str] = None


@dataclass
class Skill:
    """Unified representation of a skill."""
    path: str  # Local path to the skill root directory
    name: str
    description: Optional[str] = None
    category: Optional[str] = None
    url: Optional[str] = None  # Original URL (when created from URL)

    @classmethod
    def from_url(
        cls,
        url: str,
        downloader: 'SkillDownloader',
        cache_dir: str,
        max_retries: int = 3,
        retry_delay: float = 2.0,
        **kwargs
    ) -> Tuple[Optional['Skill'], Optional[str]]:
        """
        Create a Skill from a GitHub URL.
        If the download fails, it retries; once all retries fail, it returns
        (None, error_msg) instead of raising an exception.

        Returns:
            (Skill, None) on success; (None, error_msg) on failure.
        """
        normalized_url = cls._normalize_url(url)
        if not normalized_url:
            return None, f"Invalid GitHub URL: {url}"
        # Download to local cache (with retries in evaluator)
        local_path = None
        for attempt in range(max_retries):
            local_path = downloader.download(normalized_url, target_dir=cache_dir)
            if local_path:
                break
            if attempt < max_retries - 1:
                logger.warning(
                    "Download failed (attempt %d/%d). Retrying in %.1fs...",
                    attempt + 1, max_retries, retry_delay,
                )
                time.sleep(retry_delay)
        if not local_path:
            return None, f"Failed to download after {max_retries} retries: {url}"
        # Derive skill name from URL if not provided
        name = kwargs.get('name') or normalized_url.rstrip('/').split('/')[-1]

        return cls(
            path=local_path,
            name=name,
            url=url,
            description=kwargs.get('description'),
            category=kwargs.get('category')
        ), None

    @classmethod
    def from_path(cls, path: str, **kwargs) -> Tuple[Optional['Skill'], Optional[str]]:
        """Create a Skill from a local directory path.

        Returns:
            (Skill, None) on success; (None, error_msg) on failure.
        """
        abs_path = os.path.abspath(path)
        if not os.path.isdir(abs_path):
            return None, f"Invalid skill path: {path}"

        name = kwargs.get('name') or os.path.basename(abs_path)

        return (
            cls(
                path=abs_path,
                name=name,
                description=kwargs.get('description'),
                category=kwargs.get('category')
            ),
            None,
        )

    @staticmethod
    def _normalize_url(url: str) -> Optional[str]:
        """Normalize GitHub URL to /tree/ format."""
        if not url:
            return None
        if "/blob/" in url:
            return url.replace("/blob/", "/tree/")
        if "/tree/" in url:
            return url
        return None

@dataclass
class ScriptExecutionResult:
    """Result of executing a single script."""
    path: str
    status: str  # success | compiled_only | failed | timeout | skipped
    command: str
    exit_code: Optional[int] = None
    error: Optional[str] = None
    duration_sec: Optional[float] = None
    note: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        return {
            "path": self.path,
            "status": self.status,
            "command": self.command,
            "exit_code": self.exit_code,
            "error": self.error,
            "duration_sec": self.duration_sec,
            "note": self.note,
        }

class ScriptRunner:
    """Execute python scripts under scripts/ with safe defaults."""

    PATH_LIKE_EXTS = {
        ".xml",
        ".json",
        ".yaml",
        ".yml",
        ".csv",
        ".tsv",
        ".txt",
        ".md",
        ".ini",
        ".toml",
        ".coverage",
        ".db",
        ".sqlite",
        ".sql",
        ".parquet",
    }

    def __init__(self, python_bin: str, timeout_sec: int, max_runs: int,
                 max_output_chars: int):
        self.python_bin = python_bin
        self.timeout_sec = timeout_sec
        self.max_runs = max_runs
        self.max_output_chars = max_output_chars

    def run_for_skill(self, skill_dir: str) -> List[ScriptExecutionResult]:
        scripts = self._discover_py_scripts(skill_dir)
        results: List[ScriptExecutionResult] = []

        for script_path in scripts[: self.max_runs]:
            results.append(self._run_script(skill_dir, script_path))

        if len(scripts) > self.max_runs:
            logger.info(
                "Found %s scripts, truncated to %s for execution",
                len(scripts),
                self.max_runs
            )

        return results

    def _discover_py_scripts(self, skill_dir: str) -> List[str]:
        paths: List[str] = []
        for root, _, files in os.walk(skill_dir):
            if "scripts" not in root.split(os.sep):
                continue
            for filename in files:
                if filename.lower().endswith(".py"):
                    paths.append(os.path.join(root, filename))
        return sorted(paths)

    def _run_script(self, skill_dir: str, script_path: str) -> ScriptExecutionResult:
        rel_path = os.path.relpath(script_path, skill_dir)
        usage_cmd = self._build_usage_command(script_path, rel_path)
        if usage_cmd:
            missing_inputs = self._detect_missing_inputs(usage_cmd, skill_dir)
            if missing_inputs:
                compile_result = self._run_command(
                    [self.python_bin, "-m", "py_compile", rel_path],
                    skill_dir
                )
                note = f"missing inputs: {', '.join(missing_inputs)}"
                if compile_result["timed_out"]:
                    return self._result_timeout(rel_path, compile_result, note=note)
                if compile_result["exit_code"] == 0:
                    return self._result_compiled_only(rel_path, compile_result, note=note)
                return self._result_failed(rel_path, compile_result, note=note)
            run_result = self._run_command(usage_cmd, skill_dir)
            note = "usage-derived command"
            if run_result["timed_out"]:
                return self._result_timeout(rel_path, run_result, note=note)
            if run_result["exit_code"] == 0:
                return self._result_success(rel_path, run_result, note=note)
            return self._result_failed(rel_path, run_result, note=note)

        compile_result = self._run_command(
            [self.python_bin, "-m", "py_compile", rel_path],
            skill_dir
        )
        if compile_result["timed_out"]:
            return self._result_timeout(rel_path, compile_result)
        if compile_result["exit_code"] == 0:
            return self._result_compiled_only(
                rel_path,
                compile_result,
                note="no usage examples found; py_compile succeeded",
            )

        return self._result_failed(
            rel_path,
            compile_result,
            note="no usage examples found",
        )

    def _result_timeout(self, rel_path: str, result: Dict[str, Any], note: Optional[str] = None) -> ScriptExecutionResult:
        return ScriptExecutionResult(
            path=rel_path,
            status="timeout",
            command=result["command"],
            exit_code=None,
            error=result.get("error"),
            duration_sec=result["duration_sec"],
            note=note,
        )

    def _result_success(self, rel_path: str, result: Dict[str, Any], note: Optional[str] = None) -> ScriptExecutionResult:
        return ScriptExecutionResult(
            path=rel_path,
            status="success",
            command=result["command"],
            exit_code=result.get("exit_code"),
            duration_sec=result["duration_sec"],
            note=note,
        )

    def _result_compiled_only(self, rel_path: str, result: Dict[str, Any], note: Optional[str] = None) -> ScriptExecutionResult:
        return ScriptExecutionResult(
            path=rel_path,
            status="compiled_only",
            command=result["command"],
            exit_code=result.get("exit_code"),
            error=self._pick_error(result),
            duration_sec=result["duration_sec"],
            note=note,
        )

    def _result_failed(self, rel_path: str, result: Dict[str, Any], note: Optional[str] = None) -> ScriptExecutionResult:
        return ScriptExecutionResult(
            path=rel_path,
            status="failed",
            command=result["command"],
            exit_code=result.get("exit_code"),
            error=self._pick_error(result),
            duration_sec=result["duration_sec"],
            note=note,
        )

    def _build_usage_command(self, script_path: str,
                             rel_path: str) -> Optional[List[str]]:
        script_name = os.path.basename(script_path)
        usage_lines = self._extract_usage_lines(script_path, script_name)
        if not usage_lines:
            return None

        candidates: List[List[str]] = []
        for line in usage_lines:
            cmd = self._parse_usage_line(line, rel_path, script_name)
            if cmd:
                candidates.append(cmd)

        if not candidates:
            return None

        # Prefer runnable commands:
        # - Commands containing placeholder tokens like "<file>", "[options]", "{path}" are
        #   often documentation examples and not directly runnable.
        # - If placeholders exist, prefer a help-style command to at least verify the script
        #   starts up, otherwise fall back to compilation-only.
        runnable = [cmd for cmd in candidates if not self._has_placeholder_tokens(cmd)]
        for cmd in runnable:
            if not self._is_help_command(cmd):
                return cmd

        for cmd in candidates:
            if self._is_help_command(cmd):
                return cmd

        return None

    def _extract_usage_lines(self, script_path: str,
                             script_name: str) -> List[str]:
        try:
            with open(script_path, "r", encoding="utf-8", errors="ignore") as f:
                source = f.read()
        except Exception as e:
            logger.warning("Failed to read %s: %s", script_path, e)
            return []

        try:
            tree = ast.parse(source)
            doc = ast.get_docstring(tree) or ""
        except Exception:
            doc = ""

        if not doc:
            return []

        lines = doc.splitlines()
        usage_lines: List[str] = []
        for idx, line in enumerate(lines):
            if line.strip().lower().startswith("usage:"):
                for follow in lines[idx + 1:]:
                    if not follow.strip():
                        break
                    usage_lines.append(follow.strip())

        if usage_lines:
            return usage_lines

        for line in lines:
            stripped = line.strip()
            if not stripped:
                continue
            if stripped.startswith(("./", "python", "python3")):
                usage_lines.append(stripped)
                continue
            if stripped.startswith(script_name):
                usage_lines.append(stripped)

        return usage_lines

    def _parse_usage_line(self, line: str, rel_path: str,
                          script_name: str) -> Optional[List[str]]:
        try:
            tokens = shlex.split(line)
        except ValueError:
            return None

        if not tokens:
            return None

        python_prefix = tokens[0].startswith("python")
        if python_prefix:
            tokens = [self.python_bin] + tokens[1:]

        script_idx = None
        for idx, token in enumerate(tokens):
            token_base = os.path.basename(token)
            if token_base == script_name:
                script_idx = idx
                break

        if script_idx is None:
            return None

        tokens[script_idx] = rel_path

        if not python_prefix:
            tokens = [self.python_bin] + tokens[script_idx:]

        return tokens

    def _iter_non_flag_tokens(self, cmd: List[str]) -> Iterator[str]:
        for token in cmd:
            if not token or token.startswith("-"):
                continue
            if token == self.python_bin:
                continue
            yield token

    def _is_help_command(self, cmd: List[str]) -> bool:
        for token in cmd:
            lowered = token.lower()
            if lowered in {"--help", "-h", "help"}:
                return True
        return False

    def _has_placeholder_tokens(self, cmd: List[str]) -> bool:
        for token in self._iter_non_flag_tokens(cmd):
            if self._is_placeholder_token(token):
                return True
        return False

    @staticmethod
    def _is_placeholder_token(token: str) -> bool:
        # Common usage placeholders: <...>, [...], {...}
        if any(ch in token for ch in ("<", ">", "[", "]", "{", "}")):
            return True
        lowered = token.strip().lower()
        return lowered in {"options", "[options]", "<options>", "{options}"}

    def _detect_missing_inputs(self, cmd: List[str], cwd: str) -> List[str]:
        missing: List[str] = []
        for token in self._iter_non_flag_tokens(cmd):
            if self._is_placeholder_token(token):
                continue
            if not self._looks_like_path(token):
                continue
            path = token if os.path.isabs(token) else os.path.join(cwd, token)
            if not os.path.exists(path):
                missing.append(token)
        return missing

    def _looks_like_path(self, token: str) -> bool:
        if "/" in token or token.startswith("."):
            return True
        _, ext = os.path.splitext(token)
        return ext.lower() in self.PATH_LIKE_EXTS

    def _run_command(self, cmd: List[str], cwd: str) -> Dict[str, Any]:
        command_str = shlex.join(cmd)
        start = time.time()
        try:
            completed = subprocess.run(
                cmd,
                cwd=cwd,
                text=True,
                capture_output=True,
                timeout=self.timeout_sec
            )
            duration = time.time() - start
            return {
                "command": command_str,
                "exit_code": completed.returncode,
                "stdout": self._truncate(completed.stdout),
                "stderr": self._truncate(completed.stderr),
                "duration_sec": round(duration, 3),
                "timed_out": False
            }
        except subprocess.TimeoutExpired:
            duration = time.time() - start
            return {
                "command": command_str,
                "exit_code": None,
                "stdout": "",
                "stderr": "",
                "duration_sec": round(duration, 3),
                "timed_out": True,
                "error": f"Timeout after {self.timeout_sec}s"
            }
        except FileNotFoundError as e:
            duration = time.time() - start
            return {
                "command": command_str,
                "exit_code": None,
                "stdout": "",
                "stderr": str(e),
                "duration_sec": round(duration, 3),
                "timed_out": False,
                "error": str(e)
            }

    def _truncate(self, text: str) -> str:
        if not text:
            return ""
        if len(text) <= self.max_output_chars:
            return text
        return text[: self.max_output_chars] + "...[truncated]"

    def _pick_error(self, *results: Dict[str, Any]) -> Optional[str]:
        for result in results:
            if not result:
                continue
            stderr = result.get("stderr") or ""
            stdout = result.get("stdout") or ""
            if stderr.strip():
                return stderr.strip()
            if stdout.strip():
                return stdout.strip()
            if result.get("error"):
                return str(result.get("error"))
        return None


# ==========================================================================
# Skill content loader
# ==========================================================================

class SkillLoader:
    """Load SKILL.md, scripts, and reference files for a skill."""

    REFERENCE_ALLOWED_EXTS = {
        ".md",
        ".txt",
        ".json",
        ".yaml",
        ".yml",
        ".ini",
        ".toml",
        ".cfg",
        ".csv",
        ".tsv",
    }

    @staticmethod
    def _walk_and_load(skill_dir: str, max_files: int, max_chars: int,
                       root_filter: Callable[[str], bool],
                       file_filter: Callable[[str], bool],
                       skip_skill_md: bool) -> List[Dict[str, str]]:
        items: List[Dict[str, str]] = []
        for root, _, files in os.walk(skill_dir):
            if not root_filter(root):
                continue
            for filename in files:
                if len(items) >= max_files:
                    return items
                if skip_skill_md and filename.lower() == "skill.md":
                    continue
                if not file_filter(filename):
                    continue
                filepath = os.path.join(root, filename)
                rel_path = os.path.relpath(filepath, skill_dir)
                try:
                    with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
                        content = f.read(max_chars)
                    items.append({"path": rel_path, "content": content})
                except Exception as e:
                    logger.warning(f"Skip {filepath}: {e}")
        return items

    @staticmethod
    def load_skill_md(skill_dir: str, max_chars: int = 12000) -> Optional[str]:
        """Load SKILL.md content with optional truncation."""
        path = SkillLoader._find_file(skill_dir, "skill.md")
        if not path:
            logger.warning(f"SKILL.md not found in {skill_dir}")
            return None

        with open(path, 'r', encoding='utf-8', errors='ignore') as f:
            content = f.read()

        if len(content) > max_chars:
            content = content[:max_chars] + "\n\n...[truncated]..."

        return content

    @staticmethod
    def load_scripts(skill_dir: str, max_files: int = 5,
                     max_chars: int = 1200) -> List[Dict[str, str]]:
        """Load a sample of files under the scripts directory."""
        return SkillLoader._walk_and_load(
            skill_dir,
            max_files=max_files,
            max_chars=max_chars,
            root_filter=lambda root: "scripts" in root.split(os.sep),
            file_filter=lambda _filename: True,
            skip_skill_md=False,
        )

    @staticmethod
    def load_references(
        skill_dir: str,
        max_files: int = 10,
        max_chars: int = 4000,
    ) -> List[Dict[str, str]]:
        """
        Load non-script reference files for a skill.

        This is intended for files other than SKILL.md and scripts/,
        e.g. README.md, references/, assets/, etc.
        """
        def file_filter(filename: str) -> bool:
            ext = os.path.splitext(filename)[1].lower()
            return (not ext) or (ext in SkillLoader.REFERENCE_ALLOWED_EXTS)

        return SkillLoader._walk_and_load(
            skill_dir,
            max_files=max_files,
            max_chars=max_chars,
            root_filter=lambda root: "scripts" not in root.split(os.sep),
            file_filter=file_filter,
            skip_skill_md=True,
        )

    @staticmethod
    def _find_file(directory: str, filename: str) -> Optional[str]:
        """Recursively find a file in directory (case-insensitive)."""
        for root, _, files in os.walk(directory):
            for f in files:
                if f.lower() == filename.lower():
                    return os.path.join(root, f)
        return None


# ==========================================================================
# Prompt builder
# ==========================================================================

class PromptBuilder:
    """Build prompts for skill evaluation."""

    @staticmethod
    def _format_file_items(items: List[Dict[str, str]], empty_message: str) -> str:
        formatted: List[str] = []
        for item in items:
            if not isinstance(item, dict):
                continue
            path = item.get("path") or "[unknown path]"
            content = item.get("content") or ""
            formatted.append(f"# {path}\n{content}\n")
        return "\n".join(formatted) if formatted else empty_message

    @staticmethod
    def build(skill: Skill, skill_md: Optional[str],
              scripts: List[Dict[str, str]],
              references: Optional[List[Dict[str, str]]] = None,
              script_exec_results: Optional[List[ScriptExecutionResult]] = None) -> str:
        """Build the evaluation prompt for a given skill."""
        skill_md_block = skill_md or "[SKILL.md not found]"

        if references:
            references_block = PromptBuilder._format_file_items(
                references,
                "[No references or additional assets found]",
            )
        else:
            references_block = "[No references or additional assets found]"

        if scripts:
            scripts_block = PromptBuilder._format_file_items(
                scripts,
                "[No scripts found]",
            )
        else:
            scripts_block = "[No scripts found]"

        if script_exec_results is None:
            script_exec_block = "[Scripts not executed]"
        elif not script_exec_results:
            script_exec_block = "[No runnable python scripts found]"
        else:
            script_exec_block = "\n".join(
                PromptBuilder._format_exec_result(r)
                for r in script_exec_results
            )

        return SKILL_EVALUATION_PROMPT.format(
            skill_name=skill.name,
            skill_description=skill.description or "N/A",
            category=skill.category or "N/A",
            repo_name="N/A",
            author="N/A",
            skill_md_block=skill_md_block,
            references_block=references_block,
            scripts_block=scripts_block,
            script_exec_block=script_exec_block
        )

    @staticmethod
    def _format_exec_result(result: ScriptExecutionResult) -> str:
        base = f"- {result.path}: {result.status}"
        if result.exit_code is not None:
            base += f" (exit={result.exit_code})"
        base += f" | cmd: {result.command}"
        if result.note:
            base += f" | note: {result.note}"
        if result.error:
            clean_error = " ".join(result.error.splitlines())
            base += f" | error: {clean_error}"
        return base


# ==========================================================================
# LLM client
# ==========================================================================

class LLMClient:
    """Thin wrapper around the OpenAI client for evaluation calls."""

    def __init__(self, config: EvaluatorConfig):
        self.client = OpenAI(api_key=config.api_key, base_url=config.base_url)
        self.model = config.model
        self.temperature = config.temperature

    def evaluate(self, prompt: str) -> Dict[str, Any]:
        """Call the LLM with the given prompt and parse JSON response."""
        messages = [
            {
                "role": "system",
                "content": (
                    "You are an expert evaluator of AI Agent Skills. "
                    "Follow the JSON schema and constraints exactly. "
                    "Use ONLY the provided metadata, SKILL.md, reference files, and scripts snippets."
                )
            },
            {"role": "user", "content": prompt}
        ]

        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=messages,
                response_format={"type": "json_object"},
                temperature=self.temperature
            )
            raw_response = response.choices[0].message.content
            return json.loads(raw_response)
        except Exception as e:
            logger.error(f"LLM call failed: {e}")
            raise


# ==========================================================================
# Core evaluator
# ==========================================================================

class SkillEvaluator:
    """
    Main entry point for evaluating AI skills.

    Typical usage:
        config = EvaluatorConfig(api_key="your-key", base_url="...", model="...")
        evaluator = SkillEvaluator(config)

        # Single skill from URL
        skill, err = Skill.from_url("https://github.com/.../skill", evaluator.downloader, config.cache_dir)
        if err:
            result = evaluator._create_error_result(err)
        else:
            result = evaluator.evaluate(skill)

        # Single skill from local path
        skill, err = Skill.from_path("/path/to/skill")
        result = evaluator.evaluate(skill)

        # Batch evaluation
        skills = [skill1, skill2, skill3]
        results = evaluator.evaluate_batch(skills)
    """

    def __init__(self, config: EvaluatorConfig):
        """Initialize the evaluator with configuration."""
        if not config.api_key:
            raise ValueError("API key is required")

        self.config = config
        self.downloader = SkillDownloader(api_token=config.github_token)
        self.loader = SkillLoader()
        self.prompt_builder = PromptBuilder()
        self.llm_client = LLMClient(config)
        self.script_runner = ScriptRunner(
            python_bin=config.script_python,
            timeout_sec=config.script_timeout_sec,
            max_runs=config.max_script_runs,
            max_output_chars=config.max_script_output_chars
        )

    def evaluate(self, skill: Skill) -> Dict[str, Any]:
        """
        Evaluate a single skill.

        Args:
            skill: A Skill instance to evaluate.

        Returns:
            A dict containing the evaluation result.
        """
        try:
            # Load content
            skill_md = self.loader.load_skill_md(skill.path)
            scripts = self.loader.load_scripts(skill.path)
            references = self.loader.load_references(skill.path)

            # Optional script execution
            script_exec_results: Optional[List[ScriptExecutionResult]] = None
            if self.config.run_scripts:
                script_exec_results = self.script_runner.run_for_skill(skill.path)

            # Build prompt
            prompt = self.prompt_builder.build(
                skill,
                skill_md,
                scripts,
                references=references,
                script_exec_results=script_exec_results
            )

            # Call LLM
            result = self.llm_client.evaluate(prompt)
            if self.config.include_script_results and script_exec_results is not None:
                result["script_execution"] = [
                    r.to_dict() for r in script_exec_results
                ]
            return result

        except Exception as e:
            skill_name = getattr(skill, "name", "[unknown skill]")
            logger.exception("Evaluation failed for %s: %s", skill_name, e)
            return self._create_error_result(str(e))

    def evaluate_batch(self, skills: List[Skill]) -> List[Dict[str, Any]]:
        """
        Evaluate multiple skills in parallel.

        Args:
            skills: List of Skill objects.

        Returns:
            List of evaluation results in the same order as input.
        """
        results = [None] * len(skills)

        with ThreadPoolExecutor(max_workers=self.config.max_workers) as executor:
            future_to_idx = {
                executor.submit(self.evaluate, skill): idx
                for idx, skill in enumerate(skills)
            }

            with tqdm(total=len(skills), desc="Evaluating skills") as pbar:
                for future in as_completed(future_to_idx):
                    idx = future_to_idx[future]
                    results[idx] = future.result()
                    pbar.update(1)

        return results

    def evaluate_from_url(self, url: str, **kwargs) -> Dict[str, Any]:
        """Convenience helper: create and evaluate a skill from a URL."""
        skill, err = Skill.from_url(
            url, self.downloader, self.config.cache_dir, **kwargs
        )
        if err:
            return self._create_error_result(err)
        return self.evaluate(skill)

    def evaluate_from_path(self, path: str, **kwargs) -> Dict[str, Any]:
        """Convenience helper: create and evaluate a skill from a local path."""
        skill, err = Skill.from_path(
            path, **kwargs
        )
        if err:
            return self._create_error_result(err)
        return self.evaluate(skill)

    @staticmethod
    def _create_error_result(error_msg: str) -> Dict[str, Any]:
        """Create a default error-shaped evaluation result."""
        error_item = {"level": "Poor", "reason": f"Evaluation failed: {error_msg}"}
        return {
            "error": error_msg,
            "safety": error_item,
            "completeness": error_item,
            "executability": error_item,
            "modifiability": error_item,
            "cost_awareness": error_item
        }


# ==========================================================================
# CLI entry point
# ==========================================================================

if __name__ == '__main__':
    """Command line entry point for batch evaluation from JSONL."""
    import argparse

    parser = argparse.ArgumentParser(description='Evaluate AI Agent Skills')
    parser.add_argument('--input', required=True, help='Input JSONL file')
    parser.add_argument('--output', required=True, help='Output JSONL file')
    parser.add_argument('--api-key', help='OpenAI API key')
    parser.add_argument('--base-url', help='OpenAI API base URL')
    parser.add_argument('--model', default='gpt-4o', help='Model name')
    parser.add_argument('--max-workers', type=int, default=5, help='Max workers')
    parser.add_argument('--cache-dir', default='./evaluate_cache_dir', help='Cache directory')
    parser.add_argument('--run-scripts', action='store_true',
                        help='Execute python scripts under scripts/')
    parser.add_argument('--script-timeout', type=int, default=8,
                        help='Timeout seconds per script run')
    parser.add_argument('--max-script-runs', type=int, default=5,
                        help='Max python scripts to execute per skill')
    parser.add_argument('--script-python', default='python',
                        help='Python executable for running scripts')
    parser.add_argument('--include-script-results', action='store_true',
                        help='Attach script execution results to evaluation output')
    parser.add_argument('--max-script-output-chars', type=int, default=400,
                        help='Max chars of script stdout/stderr to keep')

    args = parser.parse_args()

    def _load_records(jsonl_path: str) -> List[Dict[str, Any]]:
        with open(jsonl_path, 'r', encoding='utf-8') as f:
            return [json.loads(line) for line in f if line.strip()]

    def _build_skills(
        records: List[Dict[str, Any]],
        evaluator: 'SkillEvaluator',
        config: EvaluatorConfig,
    ) -> Tuple[List[Optional[Skill]], Dict[int, str]]:
        skills: List[Optional[Skill]] = []
        errors: Dict[int, str] = {}
        for idx, rec in enumerate(records):
            if 'skill_url' in rec:
                skill, err = Skill.from_url(
                    rec['skill_url'],
                    evaluator.downloader,
                    config.cache_dir,
                    name=rec.get('skill_name'),
                    description=rec.get('skill_description'),
                    category=rec.get('category')
                )
            elif 'skill_path' in rec:
                skill, err = Skill.from_path(
                    rec['skill_path'],
                    name=rec.get('skill_name'),
                    description=rec.get('skill_description'),
                    category=rec.get('category')
                )
            else:
                raise ValueError("Record must have 'skill_url' or 'skill_path'")

            if err:
                errors[idx] = err
                skills.append(None)
            else:
                skills.append(skill)
        return skills, errors

    def _evaluate_records(
        records: List[Dict[str, Any]],
        evaluator: 'SkillEvaluator',
        skills: List[Optional[Skill]],
        errors: Dict[int, str],
    ) -> List[Dict[str, Any]]:
        skills_to_eval = [(idx, s) for idx, s in enumerate(skills) if s is not None]
        idx_to_result: Dict[int, Dict[str, Any]] = {}
        if skills_to_eval:
            indices, valid_skills = zip(*skills_to_eval)
            batch_results = evaluator.evaluate_batch(list(valid_skills))
            idx_to_result = dict(zip(indices, batch_results))
        return [
            evaluator._create_error_result(errors[idx])
            if idx in errors
            else idx_to_result[idx]
            for idx in range(len(records))
        ]

    def _write_outputs(records: List[Dict[str, Any]], output_jsonl_path: str) -> str:
        with open(output_jsonl_path, 'w', encoding='utf-8') as f:
            for rec in records:
                f.write(json.dumps(rec, ensure_ascii=False) + '\n')
        json_path = output_jsonl_path.replace('.jsonl', '.json')
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump({str(i): rec for i, rec in enumerate(records)},
                      f, ensure_ascii=False, indent=2)
        return json_path

    config = EvaluatorConfig(
        api_key=args.api_key or os.getenv('API_KEY'),
        base_url=args.base_url or os.getenv('BASE_URL'),
        model=args.model,
        max_workers=args.max_workers,
        cache_dir=args.cache_dir,
        run_scripts=args.run_scripts,
        script_timeout_sec=args.script_timeout,
        max_script_runs=args.max_script_runs,
        script_python=args.script_python,
        include_script_results=args.include_script_results,
        max_script_output_chars=args.max_script_output_chars
    )
    evaluator = SkillEvaluator(config)
    records = _load_records(args.input)
    skills, download_errors = _build_skills(records, evaluator, config)
    results = _evaluate_records(records, evaluator, skills, download_errors)
    for rec, result in zip(records, results):
        rec['evaluation'] = result
    json_path = _write_outputs(records, args.output)

    print(f"✓ Evaluated {len(results)} skills")
    print(f"✓ Results saved to {args.output} and {json_path}")