@cristobalme/skill-test 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +140 -0
- package/action/action.yml +87 -0
- package/dist/action/comment.cjs +178 -0
- package/dist/action/comment.cjs.map +1 -0
- package/dist/action/comment.js +151 -0
- package/dist/action/comment.js.map +1 -0
- package/dist/bin/skill-test.cjs +1313 -0
- package/dist/bin/skill-test.cjs.map +1 -0
- package/dist/bin/skill-test.js +1290 -0
- package/dist/bin/skill-test.js.map +1 -0
- package/dist/index.cjs +1217 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +461 -0
- package/dist/index.d.ts +461 -0
- package/dist/index.js +1159 -0
- package/dist/index.js.map +1 -0
- package/examples/skill-test.yml +28 -0
- package/package.json +77 -0
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,461 @@
|
|
|
1
|
+
/** Severity of a lint finding. Only `error` findings fail the lint layer and make the exit code non-zero; `warn` and `info` do not. */
|
|
2
|
+
type Severity = "error" | "warn" | "info";
|
|
3
|
+
/** A single result produced by a lint rule. */
|
|
4
|
+
interface Finding {
|
|
5
|
+
/** Identifier of the rule that produced this finding. NOTE(review): the id format is not visible in these declarations. */
ruleId: string;
|
|
6
|
+
/** How serious the finding is: `"error"`, `"warn"` or `"info"` (see `Severity`). */
severity: Severity;
|
|
7
|
+
/** Text describing the finding. NOTE(review): presumably human-readable and printed by the renderers — confirm. */
message: string;
|
|
8
|
+
/** Absolute or repo-relative path to the file the finding concerns. */
|
|
9
|
+
file: string;
|
|
10
|
+
/** 1-based line number within `file`, when known. */
|
|
11
|
+
line?: number;
|
|
12
|
+
}
|
|
13
|
+
/** Parsed representation of a SKILL.md file. */
|
|
14
|
+
interface ParsedSkill {
|
|
15
|
+
/** Absolute path to the SKILL.md file. */
|
|
16
|
+
path: string;
|
|
17
|
+
/** Absolute path to the skill's root directory (parent of SKILL.md). */
|
|
18
|
+
dir: string;
|
|
19
|
+
/** Raw frontmatter object (may be empty if frontmatter was missing/invalid). */
|
|
20
|
+
frontmatter: Record<string, unknown>;
|
|
21
|
+
/** Markdown body after the frontmatter. */
|
|
22
|
+
body: string;
|
|
23
|
+
/** 1-based line where the body starts in the original file. */
|
|
24
|
+
bodyStartLine: number;
|
|
25
|
+
/** Estimated token count of the body. */
|
|
26
|
+
bodyTokens: number;
|
|
27
|
+
/** Number of lines in the body. */
|
|
28
|
+
bodyLines: number;
|
|
29
|
+
/** True when valid YAML frontmatter delimited by `---` was found. */
|
|
30
|
+
hasFrontmatter: boolean;
|
|
31
|
+
/** Parse-level error message, if the file could not be parsed at all. */
|
|
32
|
+
parseError?: string;
|
|
33
|
+
}
|
|
34
|
+
/** Result of the static lint layer for one skill. */
|
|
35
|
+
interface LintResult {
|
|
36
|
+
/** Literal tag marking this object as the lint layer's result. */
layer: "lint";
|
|
37
|
+
/** Path of the skill this result belongs to. NOTE(review): presumably the SKILL.md path given to `lintSkill` — confirm. */
skillPath: string;
|
|
38
|
+
/** Findings reported for this skill, of any severity. */
findings: Finding[];
|
|
39
|
+
/** True iff no `error`-severity findings. */
|
|
40
|
+
ok: boolean;
|
|
41
|
+
}
|
|
42
|
+
/** Outcome of classifying a single prompt against a skill's metadata. */
|
|
43
|
+
interface ClassificationResult {
|
|
44
|
+
prompt: string;
|
|
45
|
+
/** Whether the model decided the skill would activate. */
|
|
46
|
+
activated: boolean;
|
|
47
|
+
/** Short reason from the model. */
|
|
48
|
+
reason: string;
|
|
49
|
+
/** True when this result came from the on-disk cache. */
|
|
50
|
+
cached: boolean;
|
|
51
|
+
}
|
|
52
|
+
/** Precision/recall/F1 over a labeled corpus of prompts. */
|
|
53
|
+
interface TriggerScore {
|
|
54
|
+
/** Count of prompts labeled should-activate that did activate. */
truePositives: number;
|
|
55
|
+
/** Count of prompts labeled should-not-activate that activated anyway. */
falsePositives: number;
|
|
56
|
+
/** Count of prompts labeled should-not-activate that did not activate. */
trueNegatives: number;
|
|
57
|
+
/** Count of prompts labeled should-activate that did not activate. */
falseNegatives: number;
|
|
58
|
+
/** Precision over the corpus. NOTE(review): conventionally TP / (TP + FP); the zero-denominator convention is not visible here — confirm. */
precision: number;
|
|
59
|
+
/** Recall over the corpus. NOTE(review): conventionally TP / (TP + FN); the zero-denominator convention is not visible here — confirm. */
recall: number;
|
|
60
|
+
/** F1 score. NOTE(review): conventionally the harmonic mean of precision and recall — confirm against the implementation. */
f1: number;
|
|
61
|
+
}
|
|
62
|
+
/** Result of the triggering layer for one skill. */
|
|
63
|
+
interface TriggerResult {
|
|
64
|
+
/** Literal tag marking this object as the trigger layer's result. */
layer: "trigger";
|
|
65
|
+
/** Path of the skill this result belongs to. NOTE(review): presumably the SKILL.md path — confirm. */
skillPath: string;
|
|
66
|
+
/** Confusion-matrix counts plus precision/recall/F1 for the labeled prompts. */
score: TriggerScore;
|
|
67
|
+
/** Per-prompt classification outcomes that the score was computed from. */
classifications: ClassificationResult[];
|
|
68
|
+
/** Prompts that should have activated but didn't. */
|
|
69
|
+
falseNegatives: string[];
|
|
70
|
+
/** Prompts that should not have activated but did. */
|
|
71
|
+
falsePositives: string[];
|
|
72
|
+
/** Whether the trigger layer passed. NOTE(review): the pass criterion is not visible in these declarations — confirm. */
ok: boolean;
|
|
73
|
+
}
|
|
74
|
+
/** A layer that was skipped, with a human-readable reason. */
|
|
75
|
+
interface SkippedLayer {
|
|
76
|
+
/** Which layer was skipped. */
layer: "trigger" | "run";
|
|
77
|
+
/** Always `true`; `TriggerResult` declares no `skipped` property, so this field tells the two apart. */
skipped: true;
|
|
78
|
+
/** Human-readable explanation of why the layer was skipped. */
reason: string;
|
|
79
|
+
}
|
|
80
|
+
/** Aggregate report across all layers run for a set of skills. */
|
|
81
|
+
interface Report {
|
|
82
|
+
/** Per-skill reports that make up this aggregate. */
results: SkillReport[];
|
|
83
|
+
/** Overall: true iff every skill passed every layer that ran. */
|
|
84
|
+
ok: boolean;
|
|
85
|
+
}
|
|
86
|
+
/** Per-skill aggregate across layers. */
|
|
87
|
+
interface SkillReport {
|
|
88
|
+
/** Path of the skill this report is about. NOTE(review): presumably the SKILL.md path — confirm. */
skillPath: string;
|
|
89
|
+
/** Lint layer result. Optional. NOTE(review): presumably absent when the lint layer did not run — confirm. */
lint?: LintResult;
|
|
90
|
+
/** Trigger layer result, or a `SkippedLayer` carrying the reason it was skipped. Optional. */
trigger?: TriggerResult | SkippedLayer;
|
|
91
|
+
/** Whether this skill passed. NOTE(review): presumably true iff every layer that ran passed, mirroring `Report.ok` — confirm. */
ok: boolean;
|
|
92
|
+
}
|
|
93
|
+
/** Process exit codes (documented in the README). */
|
|
94
|
+
declare const EXIT: {
|
|
95
|
+
/** Exit status 0. NOTE(review): presumably "everything passed" — confirm against the README. */
readonly OK: 0;
|
|
96
|
+
/** Exit status 1. NOTE(review): presumably "at least one check failed" — confirm against the README. */
readonly FAILURES: 1;
|
|
97
|
+
/** Exit status 2. NOTE(review): presumably "invalid command-line usage" — confirm against the README. */
readonly USAGE: 2;
|
|
98
|
+
};
|
|
99
|
+
|
|
100
|
+
/**
|
|
101
|
+
* Agent Skills specification constants.
|
|
102
|
+
*
|
|
103
|
+
* Source: https://agentskills.io/specification (fetched 2026-06-04).
|
|
104
|
+
* The Agent Skills format was originally developed by Anthropic and released as
|
|
105
|
+
* an open standard. These constants mirror the live spec verbatim; update them
|
|
106
|
+
* (and the source date) whenever the spec changes.
|
|
107
|
+
*
|
|
108
|
+
* Frontmatter fields:
|
|
109
|
+
* - name (required) Max 64 chars. Lowercase a-z, 0-9 and hyphens.
|
|
110
|
+
* Must not start/end with a hyphen or contain "--".
|
|
111
|
+
* Must match the parent directory name.
|
|
112
|
+
* - description (required) Max 1024 chars. Non-empty. What it does + when to use.
|
|
113
|
+
* - license (optional) License name or reference to a bundled file.
|
|
114
|
+
* - compatibility (optional) Max 500 chars. Environment requirements.
|
|
115
|
+
* - metadata (optional) Map of string keys to string values.
|
|
116
|
+
* - allowed-tools (optional) Space-separated string of pre-approved tools. (Experimental)
|
|
117
|
+
*
|
|
118
|
+
* Body: no format restrictions. Recommended < 5000 tokens once activated, and the
|
|
119
|
+
* main SKILL.md kept under 500 lines; move detail into referenced files.
|
|
120
|
+
* File references should be relative paths from the skill root, one level deep.
|
|
121
|
+
*/
|
|
122
|
+
/** Allowed characters and shape for the `name` field. */
|
|
123
|
+
declare const NAME_PATTERN: RegExp;
|
|
124
|
+
/** Maximum length of the `name` frontmatter field, in characters. */
declare const NAME_MAX_LENGTH = 64;
|
|
125
|
+
/** Maximum length of the `description` frontmatter field, in characters. */
declare const DESCRIPTION_MAX_LENGTH = 1024;
|
|
126
|
+
/** Maximum length of the optional `compatibility` frontmatter field, in characters. */
declare const COMPATIBILITY_MAX_LENGTH = 500;
|
|
127
|
+
/** Recommended ceiling for the activated SKILL.md body. */
|
|
128
|
+
declare const BODY_RECOMMENDED_MAX_TOKENS = 5000;
|
|
129
|
+
/** We warn as the body approaches the ceiling (90%). */
|
|
130
|
+
declare const BODY_WARN_TOKENS: number;
|
|
131
|
+
/** Recommended maximum number of lines in the main SKILL.md. */
|
|
132
|
+
declare const BODY_RECOMMENDED_MAX_LINES = 500;
|
|
133
|
+
/** Known frontmatter fields per the spec, used to flag unknown keys (info only). */
|
|
134
|
+
declare const KNOWN_FRONTMATTER_FIELDS: readonly ["name", "description", "license", "compatibility", "metadata", "allowed-tools"];
|
|
135
|
+
/** URL of the Agent Skills specification that these constants mirror. */
declare const SPEC_URL = "https://agentskills.io/specification";
|
|
136
|
+
|
|
137
|
+
/*
 * `spec_`-prefixed aliases, each typed as `typeof` the same-named spec constant.
 * The `spec` namespace in this file re-exports them under their unprefixed names.
 * NOTE(review): the prefix pattern looks bundler-generated — confirm before editing by hand.
 */
declare const spec_BODY_RECOMMENDED_MAX_LINES: typeof BODY_RECOMMENDED_MAX_LINES;
|
|
138
|
+
declare const spec_BODY_RECOMMENDED_MAX_TOKENS: typeof BODY_RECOMMENDED_MAX_TOKENS;
|
|
139
|
+
declare const spec_BODY_WARN_TOKENS: typeof BODY_WARN_TOKENS;
|
|
140
|
+
declare const spec_COMPATIBILITY_MAX_LENGTH: typeof COMPATIBILITY_MAX_LENGTH;
|
|
141
|
+
declare const spec_DESCRIPTION_MAX_LENGTH: typeof DESCRIPTION_MAX_LENGTH;
|
|
142
|
+
declare const spec_KNOWN_FRONTMATTER_FIELDS: typeof KNOWN_FRONTMATTER_FIELDS;
|
|
143
|
+
declare const spec_NAME_MAX_LENGTH: typeof NAME_MAX_LENGTH;
|
|
144
|
+
declare const spec_NAME_PATTERN: typeof NAME_PATTERN;
|
|
145
|
+
declare const spec_SPEC_URL: typeof SPEC_URL;
|
|
146
|
+
/**
 * Namespace exposing the specification constants under their unprefixed names
 * (e.g. `spec.NAME_MAX_LENGTH`). Each member is the matching `spec_*` alias.
 */
declare namespace spec {
|
|
147
|
+
export { spec_BODY_RECOMMENDED_MAX_LINES as BODY_RECOMMENDED_MAX_LINES, spec_BODY_RECOMMENDED_MAX_TOKENS as BODY_RECOMMENDED_MAX_TOKENS, spec_BODY_WARN_TOKENS as BODY_WARN_TOKENS, spec_COMPATIBILITY_MAX_LENGTH as COMPATIBILITY_MAX_LENGTH, spec_DESCRIPTION_MAX_LENGTH as DESCRIPTION_MAX_LENGTH, spec_KNOWN_FRONTMATTER_FIELDS as KNOWN_FRONTMATTER_FIELDS, spec_NAME_MAX_LENGTH as NAME_MAX_LENGTH, spec_NAME_PATTERN as NAME_PATTERN, spec_SPEC_URL as SPEC_URL };
|
|
148
|
+
}
|
|
149
|
+
|
|
150
|
+
/**
|
|
151
|
+
* Parse a SKILL.md file into frontmatter + body. Never throws on malformed
|
|
152
|
+
* content: parse-level problems are reported via `parseError` / `hasFrontmatter`
|
|
153
|
+
* so lint rules can turn them into findings.
|
|
154
|
+
*/
|
|
155
|
+
declare function parseSkillFile(path: string): ParsedSkill;
|
|
156
|
+
/** Parse already-loaded SKILL.md content. Exposed for tests. */
|
|
157
|
+
declare function parseSkillContent(raw: string, path: string): ParsedSkill;
|
|
158
|
+
|
|
159
|
+
/**
|
|
160
|
+
* Offline token estimator.
|
|
161
|
+
*
|
|
162
|
+
* The static layer must never require a network call or an API key, so we cannot
|
|
163
|
+
* use a real tokenizer service. This heuristic blends a character-based and a
|
|
164
|
+
* word-based estimate, which tracks BPE tokenizers (e.g. cl100k) closely enough
|
|
165
|
+
* for a "is this body near the recommended ceiling?" check. It intentionally
|
|
166
|
+
* errs slightly high so we warn before a body actually exceeds the limit.
|
|
167
|
+
*/
|
|
168
|
+
declare function estimateTokens(text: string): number;
|
|
169
|
+
|
|
170
|
+
/** Run every lint rule against a single SKILL.md file. */
|
|
171
|
+
declare function lintSkill(skillPath: string): LintResult;
|
|
172
|
+
|
|
173
|
+
/** A lint rule: receives a parsed skill and returns the findings it reports (the array type allows none). */
type Rule = (skill: ParsedSkill) => Finding[];
|
|
174
|
+
/** The ordered set of rules run by the lint orchestrator. */
|
|
175
|
+
declare const rules: Rule[];
|
|
176
|
+
|
|
177
|
+
/**
|
|
178
|
+
* Resolve one or more path arguments into a deduplicated list of SKILL.md files.
|
|
179
|
+
*
|
|
180
|
+
* Each argument may be:
|
|
181
|
+
* - a path to a SKILL.md file directly,
|
|
182
|
+
* - a skill directory (containing SKILL.md),
|
|
183
|
+
* - a directory of many skills (walked recursively for SKILL.md files).
|
|
184
|
+
*
|
|
185
|
+
* Glob patterns are expanded by the shell before reaching us; a directory
|
|
186
|
+
* argument is walked recursively, so pointing at a repo of skills works.
|
|
187
|
+
*/
|
|
188
|
+
declare function discoverSkills(paths: string[]): string[];
|
|
189
|
+
|
|
190
|
+
/** Serialize lint results to a stable JSON-friendly object. */
|
|
191
|
+
declare function lintResultsToJson(results: LintResult[]): {
|
|
192
|
+
tool: string;
|
|
193
|
+
layer: string;
|
|
194
|
+
ok: boolean;
|
|
195
|
+
skills: {
|
|
196
|
+
skill: string;
|
|
197
|
+
ok: boolean;
|
|
198
|
+
findings: Finding[];
|
|
199
|
+
}[];
|
|
200
|
+
};
|
|
201
|
+
/** Serialize a full multi-layer report. */
|
|
202
|
+
declare function reportToJson(report: Report): {
|
|
203
|
+
tool: string;
|
|
204
|
+
ok: boolean;
|
|
205
|
+
skills: {
|
|
206
|
+
skill: string;
|
|
207
|
+
ok: boolean;
|
|
208
|
+
lint: {
|
|
209
|
+
ok: boolean;
|
|
210
|
+
findings: Finding[];
|
|
211
|
+
} | undefined;
|
|
212
|
+
trigger: {
|
|
213
|
+
skipped: boolean;
|
|
214
|
+
reason: string;
|
|
215
|
+
ok?: undefined;
|
|
216
|
+
score?: undefined;
|
|
217
|
+
falseNegatives?: undefined;
|
|
218
|
+
falsePositives?: undefined;
|
|
219
|
+
} | {
|
|
220
|
+
ok: boolean;
|
|
221
|
+
score: TriggerScore;
|
|
222
|
+
falseNegatives: string[];
|
|
223
|
+
falsePositives: string[];
|
|
224
|
+
skipped?: undefined;
|
|
225
|
+
reason?: undefined;
|
|
226
|
+
} | undefined;
|
|
227
|
+
}[];
|
|
228
|
+
};
|
|
229
|
+
|
|
230
|
+
/** Render results of the lint layer for a set of skills. */
|
|
231
|
+
declare function renderLintResults(results: LintResult[], opts?: {
|
|
232
|
+
quiet?: boolean;
|
|
233
|
+
}): string;
|
|
234
|
+
/** Render an aggregate Report across all layers (used by `lint` and `check`). */
|
|
235
|
+
declare function renderReport(report: Report, opts?: {
|
|
236
|
+
quiet?: boolean;
|
|
237
|
+
}): string;
|
|
238
|
+
|
|
239
|
+
/** Render a Report as JUnit XML for CI systems. */
|
|
240
|
+
declare function reportToJUnit(report: Report): string;
|
|
241
|
+
|
|
242
|
+
/**
 * Set the color-enabled flag.
 * NOTE(review): presumably toggles ANSI styling in the text renderers — confirm against the implementation.
 *
 * @param on - `true` to enable, `false` to disable.
 */
declare function setColorEnabled(on: boolean): void;
|
|
243
|
+
|
|
244
|
+
/** Value type of the trigger cache: what `TriggerCache.get` returns and `TriggerCache.set` accepts. */
interface CachedClassification {
|
|
245
|
+
/** Cached activation decision. NOTE(review): presumably the same meaning as `ClassificationResult.activated` — confirm. */
activated: boolean;
|
|
246
|
+
/** Cached reason text. NOTE(review): presumably the same meaning as `ClassificationResult.reason` — confirm. */
reason: string;
|
|
247
|
+
}
|
|
248
|
+
/**
|
|
249
|
+
* Disk cache for classifications, keyed by a hash of (model, name, description,
|
|
250
|
+
* prompt). Identical inputs are free to re-run, which keeps CI cheap and
|
|
251
|
+
* deterministic. The cache file is a single JSON object.
|
|
252
|
+
*/
|
|
253
|
+
declare class TriggerCache {
|
|
254
|
+
private readonly file;
|
|
255
|
+
private readonly enabled;
|
|
256
|
+
private store;
|
|
257
|
+
private dirty;
|
|
258
|
+
/**
 * @param file - Cache file. NOTE(review): presumably the path of the single JSON cache file — confirm.
 * @param enabled - Optional switch. NOTE(review): the default and the behavior when `false` are not visible here — confirm.
 */
constructor(file: string, enabled?: boolean);
|
|
259
|
+
/** Build the cache key for a (model, name, description, prompt) tuple; returned as a string. */
key(model: string, name: string, description: string, prompt: string): string;
|
|
260
|
+
/** Look up the entry stored under `key`. NOTE(review): `undefined` presumably signals a miss — confirm. */
get(key: string): CachedClassification | undefined;
|
|
261
|
+
/** Store `value` under `key`. NOTE(review): presumably held in memory until `flush()` — confirm. */
set(key: string, value: CachedClassification): void;
|
|
262
|
+
/** Persist to disk if anything changed. */
|
|
263
|
+
flush(): void;
|
|
264
|
+
}
|
|
265
|
+
|
|
266
|
+
/** Default classifier model. Pinned for reproducibility; override with --model. */
|
|
267
|
+
declare const DEFAULT_MODEL = "claude-haiku-4-5-20251001";
|
|
268
|
+
/** One classification request: skill metadata only — never the skill body. */
|
|
269
|
+
interface ClassifyRequest {
|
|
270
|
+
/** Name of the skill being classified against. */
name: string;
|
|
271
|
+
/** Description of the skill being classified against. */
description: string;
|
|
272
|
+
/** The prompt to classify. */
prompt: string;
|
|
273
|
+
/** Model identifier for this request. NOTE(review): relationship to `ClassifierBackend.model` is not visible here — confirm. */
model: string;
|
|
274
|
+
}
|
|
275
|
+
/** What `ClassifierBackend.classify` resolves to; also the return type of `parseDecision`. */
interface ClassifyResponse {
|
|
276
|
+
/** The activation decision. NOTE(review): presumably whether the model decided the skill would activate — confirm. */
activated: boolean;
|
|
277
|
+
/** Reason accompanying the decision. NOTE(review): presumably the model's short justification — confirm. */
reason: string;
|
|
278
|
+
}
|
|
279
|
+
/**
|
|
280
|
+
* Pluggable classification backend. The Anthropic implementation lives in
|
|
281
|
+
* backend/anthropic.ts; tests provide a mock. Keeping this an interface is what
|
|
282
|
+
* lets the trigger layer be backend-agnostic later.
|
|
283
|
+
*/
|
|
284
|
+
interface ClassifierBackend {
|
|
285
|
+
readonly model: string;
|
|
286
|
+
classify(req: ClassifyRequest): Promise<ClassifyResponse>;
|
|
287
|
+
}
|
|
288
|
+
/** Skill metadata accepted by `classifyPrompts`: just a name and a description. */
interface SkillMeta {
|
|
289
|
+
/** The skill's name. NOTE(review): presumably the `name` frontmatter value — confirm. */
name: string;
|
|
290
|
+
/** The skill's description. NOTE(review): presumably the `description` frontmatter value — confirm. */
description: string;
|
|
291
|
+
}
|
|
292
|
+
/**
|
|
293
|
+
* Classify every prompt against the skill's metadata, using the on-disk cache
|
|
294
|
+
* when available so reruns make zero API calls. Returns one result per prompt in
|
|
295
|
+
* input order.
|
|
296
|
+
*/
|
|
297
|
+
declare function classifyPrompts(meta: SkillMeta, prompts: string[], backend: ClassifierBackend, cache?: TriggerCache): Promise<ClassificationResult[]>;
|
|
298
|
+
|
|
299
|
+
/** Validated shape of a SKILL.test.yaml file. */
|
|
300
|
+
interface TriggerSpec {
|
|
301
|
+
/** Path of the spec file. NOTE(review): presumably the path the spec was loaded from — confirm. */
specPath: string;
|
|
302
|
+
/** Optional pointer to the SKILL.md this spec targets (relative to the spec). */
|
|
303
|
+
skillRef?: string;
|
|
304
|
+
/** Prompts labeled as ones the skill should activate for. */
shouldActivate: string[];
|
|
305
|
+
/** Prompts labeled as ones the skill should not activate for. */
shouldNotActivate: string[];
|
|
306
|
+
/** Behavioral tasks — validated structurally but exercised in Phase 5. */
|
|
307
|
+
tasks: unknown[];
|
|
308
|
+
}
|
|
309
|
+
/**
 * Error type documented as thrown by `loadSpec`, and by `triggerSkill` when a
 * spec exists but is invalid. Declares no members beyond those of `Error`.
 */
declare class SpecValidationError extends Error {
|
|
310
|
+
}
|
|
311
|
+
/** Locate the SKILL.test.yaml that sits beside a SKILL.md, if any. */
|
|
312
|
+
declare function findSpec(skillPath: string): string | undefined;
|
|
313
|
+
/** Load and validate a SKILL.test.yaml file. Throws SpecValidationError. */
|
|
314
|
+
declare function loadSpec(specPath: string): TriggerSpec;
|
|
315
|
+
|
|
316
|
+
/**
|
|
317
|
+
* Compute precision/recall/F1 over the labeled corpus.
|
|
318
|
+
* - should_activate that activated ⇒ true positive
|
|
319
|
+
* - should_activate that didn't ⇒ false negative
|
|
320
|
+
* - should_not_activate that activated ⇒ false positive
|
|
321
|
+
* - should_not_activate that didn't ⇒ true negative
|
|
322
|
+
*/
|
|
323
|
+
declare function computeTriggerResult(skillPath: string, spec: TriggerSpec, classifications: ClassificationResult[]): TriggerResult;
|
|
324
|
+
|
|
325
|
+
interface TriggerOptions {
|
|
326
|
+
/** Backend to classify with. When absent, the layer is skipped (no API key). */
|
|
327
|
+
backend?: ClassifierBackend;
|
|
328
|
+
/** Cache instance. When absent, a default disk cache is created. */
|
|
329
|
+
cache?: TriggerCache;
|
|
330
|
+
/** Directory for the default disk cache. */
|
|
331
|
+
cacheDir?: string;
|
|
332
|
+
/** Disable caching entirely. */
|
|
333
|
+
noCache?: boolean;
|
|
334
|
+
}
|
|
335
|
+
/**
|
|
336
|
+
* Run the triggering layer for one skill. Returns a TriggerResult, or a
|
|
337
|
+
* SkippedLayer with a clear reason when there's no spec, no metadata, or no
|
|
338
|
+
* backend. Throws SpecValidationError when a spec exists but is invalid.
|
|
339
|
+
*/
|
|
340
|
+
declare function triggerSkill(skillPath: string, opts?: TriggerOptions): Promise<TriggerResult | SkippedLayer>;
|
|
341
|
+
|
|
342
|
+
/** Extract the first JSON object from a model response. */
|
|
343
|
+
declare function parseDecision(text: string): ClassifyResponse;
|
|
344
|
+
/** Options accepted by `createAnthropicBackend`; both fields are optional. */
interface AnthropicBackendOptions {
|
|
345
|
+
/** API key to use. NOTE(review): the fallback when omitted (e.g. an environment variable) is not visible here — confirm. */
apiKey?: string;
|
|
346
|
+
/** Model identifier. NOTE(review): presumably defaults to `DEFAULT_MODEL` when omitted — confirm. */
model?: string;
|
|
347
|
+
}
|
|
348
|
+
/** Create a classifier backed by the Anthropic Messages API. */
|
|
349
|
+
declare function createAnthropicBackend(opts?: AnthropicBackendOptions): ClassifierBackend;
|
|
350
|
+
/** True when an API key is available for the trigger/behavioral layers. */
|
|
351
|
+
declare function hasApiKey(): boolean;
|
|
352
|
+
|
|
353
|
+
/**
|
|
354
|
+
* Behavioral graders (Phase 5 — interfaces only).
|
|
355
|
+
*
|
|
356
|
+
* Deterministic graders come first and are the default; `llm_judge` is opt-in
|
|
357
|
+
* and must be validated against hand-labeled cases before it is relied on.
|
|
358
|
+
*/
|
|
359
|
+
/** Declarative grader specs as they appear in SKILL.test.yaml `tasks[].grade`. */
|
|
360
|
+
type GraderSpec = {
|
|
361
|
+
type: "file_exists";
|
|
362
|
+
path: string;
|
|
363
|
+
} | {
|
|
364
|
+
type: "regex_match";
|
|
365
|
+
file?: string;
|
|
366
|
+
pattern: string;
|
|
367
|
+
} | {
|
|
368
|
+
type: "contains";
|
|
369
|
+
file?: string;
|
|
370
|
+
text: string;
|
|
371
|
+
} | {
|
|
372
|
+
type: "exit_code";
|
|
373
|
+
equals: number;
|
|
374
|
+
} | {
|
|
375
|
+
type: "llm_judge";
|
|
376
|
+
rubric: string;
|
|
377
|
+
threshold: number;
|
|
378
|
+
};
|
|
379
|
+
/** Context handed to a grader after a task run completes. */
|
|
380
|
+
interface GradeContext {
|
|
381
|
+
/** Working directory the agent ran in (inside the sandbox mount). */
|
|
382
|
+
workdir: string;
|
|
383
|
+
/** Captured stdout of the run. */
|
|
384
|
+
stdout: string;
|
|
385
|
+
/** Process exit code of the run. */
|
|
386
|
+
exitCode: number;
|
|
387
|
+
}
|
|
388
|
+
/** Outcome returned by `Grader.grade`. */
interface GraderResult {
|
|
389
|
+
/** One of the `type` literals of `GraderSpec`. NOTE(review): presumably the type of the grader that produced this result — confirm. */
type: GraderSpec["type"];
|
|
390
|
+
/** Whether the grader's check passed. */
passed: boolean;
|
|
391
|
+
/** Optional extra text. NOTE(review): presumably explains the outcome — confirm. */
detail?: string;
|
|
392
|
+
}
|
|
393
|
+
/** A grader: the object type returned by `createGrader`. */
interface Grader {
|
|
394
|
+
/** Grade one completed run described by `ctx`; may return the result directly or as a promise. */
grade(ctx: GradeContext): Promise<GraderResult> | GraderResult;
|
|
395
|
+
}
|
|
396
|
+
/** Build a grader from its spec. Implemented in Phase 5. */
|
|
397
|
+
declare function createGrader(_spec: GraderSpec): Grader;
|
|
398
|
+
|
|
399
|
+
/**
|
|
400
|
+
* Execution sandbox (Phase 5 — interfaces only).
|
|
401
|
+
*
|
|
402
|
+
* Skill-provided code is NEVER executed on the host. Tasks run inside an
|
|
403
|
+
* isolated container with only the task fixtures mounted.
|
|
404
|
+
*/
|
|
405
|
+
interface SandboxOptions {
|
|
406
|
+
/** Container image to run in. */
|
|
407
|
+
image: string;
|
|
408
|
+
/** Host paths mounted read-only into the sandbox. */
|
|
409
|
+
mounts: string[];
|
|
410
|
+
/** Network policy; default is no network. */
|
|
411
|
+
network?: "none" | "host";
|
|
412
|
+
}
|
|
413
|
+
/** What `Sandbox.run` resolves to. */
interface SandboxRun {
|
|
414
|
+
/** Standard output of the command. NOTE(review): presumably captured in full as text — confirm. */
stdout: string;
|
|
415
|
+
/** Standard error of the command. NOTE(review): presumably captured in full as text — confirm. */
stderr: string;
|
|
416
|
+
/** Exit code of the command. */
exitCode: number;
|
|
417
|
+
}
|
|
418
|
+
/** A sandbox handle: the object type returned by `createDockerSandbox`. */
interface Sandbox {
|
|
419
|
+
/** Run `command` and resolve with its stdout, stderr and exit code. NOTE(review): how the string is interpreted (shell vs. argv) is not visible here — confirm. */
run(command: string): Promise<SandboxRun>;
|
|
420
|
+
/** Dispose of the sandbox. NOTE(review): what is torn down is not visible in these declarations — confirm. */
dispose(): Promise<void>;
|
|
421
|
+
}
|
|
422
|
+
/** Create a Docker-backed sandbox. Implemented in Phase 5. */
|
|
423
|
+
declare function createDockerSandbox(_opts: SandboxOptions): Sandbox;
|
|
424
|
+
|
|
425
|
+
/**
|
|
426
|
+
* Behavioral task runner (Phase 5 — interfaces only).
|
|
427
|
+
*
|
|
428
|
+
* Each task runs N times (default 5) to surface nondeterminism; the result is a
|
|
429
|
+
* pass *rate*, not a single boolean, and passes on a threshold. Model and
|
|
430
|
+
* temperature are pinned for reproducibility.
|
|
431
|
+
*/
|
|
432
|
+
|
|
433
|
+
/** A behavioral task as it appears under SKILL.test.yaml `tasks`. */
|
|
434
|
+
interface TaskSpec {
|
|
435
|
+
/** Optional setup commands run before the prompt. */
|
|
436
|
+
setup?: string;
|
|
437
|
+
/** The user prompt given to the agent. */
|
|
438
|
+
prompt: string;
|
|
439
|
+
/** One or more graders; deterministic graders are preferred. */
|
|
440
|
+
grade: GraderSpec[];
|
|
441
|
+
}
|
|
442
|
+
/** What `runTask` resolves to: the pass rate for one behavioral task. */
interface TaskRunResult {
|
|
443
|
+
/** Prompt of the task. NOTE(review): presumably copied from `TaskSpec.prompt` — confirm. */
prompt: string;
|
|
444
|
+
/** Fraction of runs that passed all graders (0..1). */
|
|
445
|
+
passRate: number;
|
|
446
|
+
/** Number of runs. NOTE(review): presumably how many times the task was executed (`RunOptions.repeats`) — confirm. */
runs: number;
|
|
447
|
+
/** Overall verdict. NOTE(review): presumably `passRate` compared against `RunOptions.threshold` — confirm. */
passed: boolean;
|
|
448
|
+
}
|
|
449
|
+
interface RunOptions {
|
|
450
|
+
/** How many times to run each task (default 5). */
|
|
451
|
+
repeats?: number;
|
|
452
|
+
/** Pass threshold on the pass rate (default 1.0). */
|
|
453
|
+
threshold?: number;
|
|
454
|
+
/** Pinned model + temperature for reproducibility. */
|
|
455
|
+
model?: string;
|
|
456
|
+
temperature?: number;
|
|
457
|
+
}
|
|
458
|
+
/** Run a behavioral task N times and report a pass rate. Implemented in Phase 5. */
|
|
459
|
+
declare function runTask(_task: TaskSpec, _opts?: RunOptions): Promise<TaskRunResult>;
|
|
460
|
+
|
|
461
|
+
export { type ClassificationResult, type ClassifierBackend, type ClassifyRequest, type ClassifyResponse, DEFAULT_MODEL, EXIT, type Finding, type GradeContext, type Grader, type GraderResult, type GraderSpec, type LintResult, type ParsedSkill, type Report, type RunOptions, type Sandbox, type SandboxOptions, type Severity, type SkillMeta, type SkillReport, type SkippedLayer, SpecValidationError, type TaskRunResult, type TaskSpec, TriggerCache, type TriggerOptions, type TriggerResult, type TriggerScore, type TriggerSpec, classifyPrompts, computeTriggerResult, createAnthropicBackend, createDockerSandbox, createGrader, discoverSkills, estimateTokens, findSpec, hasApiKey, lintResultsToJson, lintSkill, loadSpec, parseDecision, parseSkillContent, parseSkillFile, renderLintResults, renderReport, reportToJUnit, reportToJson, rules, runTask, setColorEnabled, spec, triggerSkill };
|