@skill-test/sdk 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +33 -0
- package/dist/errors.d.ts +14 -0
- package/dist/errors.js +23 -0
- package/dist/generated/report.d.ts +166 -0
- package/dist/generated/report.js +7 -0
- package/dist/generated/validation.d.ts +32 -0
- package/dist/generated/validation.js +7 -0
- package/dist/helpers.d.ts +10 -0
- package/dist/helpers.js +21 -0
- package/dist/index.d.ts +21 -0
- package/dist/index.js +19 -0
- package/dist/runner.d.ts +32 -0
- package/dist/runner.js +92 -0
- package/package.json +40 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Nick DeRobertis
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# @skill-test/sdk
|
|
2
|
+
|
|
3
|
+
The TypeScript SDK for the
|
|
4
|
+
[`skilltest`](https://github.com/nickderobertis/skilltest) CLI. A thin, typed
|
|
5
|
+
wrapper and nothing else: it runs the CLI as a subprocess and types the stable
|
|
6
|
+
`--format json` contract with declarations generated from the CLI's own JSON
|
|
7
|
+
Schemas. Test-framework integrations build on it — use
|
|
8
|
+
[`@skill-test/vitest`](../../plugins/vitest) if you want the vitest helpers; use
|
|
9
|
+
this package directly from any other TypeScript/JavaScript code.
|
|
10
|
+
|
|
11
|
+
```ts
|
|
12
|
+
import { runSkill, validateSkill, assistantText, describeFailures } from "@skill-test/sdk";
|
|
13
|
+
|
|
14
|
+
const report = await runSkill("cases/greet.yaml");
|
|
15
|
+
if (!report.passed) throw new Error(describeFailures(report));
|
|
16
|
+
// Mix in deterministic checks on the transcript:
|
|
17
|
+
const text = assistantText(report.runs[0]!.transcript);
|
|
18
|
+
|
|
19
|
+
const result = await validateSkill("skills/greeter");
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
The `skilltest` binary is resolved from the `bin` option, the `SKILLTEST_BIN`
|
|
23
|
+
env var, or `PATH`; a provider override comes from `provider` or
|
|
24
|
+
`SKILLTEST_PROVIDER`. A failing eval is *reported* (`report.passed` is false),
|
|
25
|
+
not thrown; bad input throws `SkilltestUsageError` (CLI exit 2) and provider
|
|
26
|
+
problems throw `SkilltestProviderError` (exit 3).
|
|
27
|
+
|
|
28
|
+
The types in `src/generated/` are **generated** from
|
|
29
|
+
`schemas/report.schema.json` / `schemas/validation.schema.json` — themselves
|
|
30
|
+
generated from the CLI's own types — via `just gen-contract`, and a drift gate
|
|
31
|
+
in CI fails if anything is stale, so the types cannot diverge from the binary.
|
|
32
|
+
They are types only: the runner trusts the shape after `JSON.parse`, because
|
|
33
|
+
the gate (not runtime re-validation) is what guarantees it.
|
package/dist/errors.d.ts
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Errors mirroring the CLI's exit-code contract. A *test failure* (exit 1) is
|
|
3
|
+
* returned as a Report with `passed === false`, not thrown; *bad input* (exit 2)
|
|
4
|
+
* and *provider failure* (exit 3) are thrown because the author must fix them.
|
|
5
|
+
*/
|
|
6
|
+
export declare class SkilltestError extends Error {
|
|
7
|
+
constructor(message: string);
|
|
8
|
+
}
|
|
9
|
+
export declare class SkilltestUsageError extends SkilltestError {
|
|
10
|
+
constructor(message: string);
|
|
11
|
+
}
|
|
12
|
+
export declare class SkilltestProviderError extends SkilltestError {
|
|
13
|
+
constructor(message: string);
|
|
14
|
+
}
|
package/dist/errors.js
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Errors mirroring the CLI's exit-code contract. A *test failure* (exit 1) is
|
|
3
|
+
* returned as a Report with `passed === false`, not thrown; *bad input* (exit 2)
|
|
4
|
+
* and *provider failure* (exit 3) are thrown because the author must fix them.
|
|
5
|
+
*/
|
|
6
|
+
/**
 * Root of the SDK's error hierarchy. Catch this type to handle every failure
 * the SDK raises on purpose; the two subclasses narrow it to a specific CLI
 * exit code.
 */
export class SkilltestError extends Error {
    /**
     * @param message Human-readable description of what went wrong.
     */
    constructor(message) {
        super(message);
        // Replace the "Error" name inherited from the prototype so stack
        // traces and logs identify the SDK as the source.
        this.name = "SkilltestError";
    }
}
|
|
12
|
+
/**
 * Bad input: the failure the CLI signals with exit code 2. Thrown rather than
 * reported because the test author has to fix the input before anything can
 * run.
 */
export class SkilltestUsageError extends SkilltestError {
    /**
     * @param message Human-readable description of the invalid input.
     */
    constructor(message) {
        super(message);
        // The parent constructor has already set `name`; narrow it to this subclass.
        this.name = "SkilltestUsageError";
    }
}
|
|
18
|
+
/**
 * Provider failure: the failure the CLI signals with exit code 3. Thrown
 * rather than reported because it is an environment problem, not a verdict on
 * the skill under test.
 */
export class SkilltestProviderError extends SkilltestError {
    /**
     * @param message Human-readable description of the provider problem.
     */
    constructor(message) {
        super(message);
        // The parent constructor has already set `name`; narrow it to this subclass.
        this.name = "SkilltestProviderError";
    }
}
|
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Generated from the golden JSON Schemas in schemas/ by `just gen-contract`.
|
|
3
|
+
* DO NOT MODIFY BY HAND — change the Rust report types and regenerate; the
|
|
4
|
+
* contract drift gate fails while this file is stale.
|
|
5
|
+
*/
|
|
6
|
+
/**
|
|
7
|
+
* The kind-specific detail of an eval outcome, for reporting.
|
|
8
|
+
*
|
|
9
|
+
* The variant titles name the generated SDK model for each union arm, so keep
|
|
10
|
+
* them stable: they are part of the SDK API surface.
|
|
11
|
+
*/
|
|
12
|
+
export type EvalDetail = BooleanDetail | NumericDetail;
|
|
13
|
+
/**
|
|
14
|
+
* How a numeric score is compared to its threshold.
|
|
15
|
+
*/
|
|
16
|
+
export type Comparator = "gte" | "gt" | "lte" | "lt";
|
|
17
|
+
/**
|
|
18
|
+
* Who produced a message.
|
|
19
|
+
*/
|
|
20
|
+
export type Role = "user" | "assistant" | "system";
|
|
21
|
+
/**
|
|
22
|
+
* The top-level report for a `skilltest run` invocation.
|
|
23
|
+
*/
|
|
24
|
+
export interface Report {
|
|
25
|
+
/**
|
|
26
|
+
* True iff every run passed.
|
|
27
|
+
*/
|
|
28
|
+
passed: boolean;
|
|
29
|
+
/**
|
|
30
|
+
* Every individual run.
|
|
31
|
+
*/
|
|
32
|
+
runs: CaseRun[];
|
|
33
|
+
/**
|
|
34
|
+
* Aggregate counts.
|
|
35
|
+
*/
|
|
36
|
+
summary: Summary;
|
|
37
|
+
}
|
|
38
|
+
/**
|
|
39
|
+
* The result of running one test case on one (platform, model) pair.
|
|
40
|
+
*/
|
|
41
|
+
export interface CaseRun {
|
|
42
|
+
/**
|
|
43
|
+
* The test case name.
|
|
44
|
+
*/
|
|
45
|
+
case: string;
|
|
46
|
+
/**
|
|
47
|
+
* Per-eval outcomes, in declaration order.
|
|
48
|
+
*/
|
|
49
|
+
evals: EvalOutcome[];
|
|
50
|
+
/**
|
|
51
|
+
* The model this run used.
|
|
52
|
+
*/
|
|
53
|
+
model: string;
|
|
54
|
+
/**
|
|
55
|
+
* True iff every eval in this run passed.
|
|
56
|
+
*/
|
|
57
|
+
passed: boolean;
|
|
58
|
+
/**
|
|
59
|
+
* The harness platform this run used.
|
|
60
|
+
*/
|
|
61
|
+
platform: string;
|
|
62
|
+
/**
|
|
63
|
+
* Absolute-ish path to the skill that was exercised.
|
|
64
|
+
*/
|
|
65
|
+
skill: string;
|
|
66
|
+
/**
|
|
67
|
+
* The full conversation, for debugging and deterministic mix-in checks.
|
|
68
|
+
*/
|
|
69
|
+
transcript: Transcript;
|
|
70
|
+
/**
|
|
71
|
+
* Number of assistant turns produced.
|
|
72
|
+
*/
|
|
73
|
+
turns: number;
|
|
74
|
+
/**
|
|
75
|
+
* Aggregated token/cost usage across every provider call in this run
|
|
76
|
+
* (skill turns + simulated-user turns + judge calls). Omitted when no
|
|
77
|
+
* usage was reported (e.g. the fake provider or a harness that doesn't
|
|
78
|
+
* surface usage).
|
|
79
|
+
*/
|
|
80
|
+
usage?: Usage | null;
|
|
81
|
+
}
|
|
82
|
+
/**
|
|
83
|
+
* The result of running one eval against a transcript.
|
|
84
|
+
*/
|
|
85
|
+
export interface EvalOutcome {
|
|
86
|
+
/**
|
|
87
|
+
* Kind-specific verdict detail.
|
|
88
|
+
*/
|
|
89
|
+
detail: EvalDetail;
|
|
90
|
+
/**
|
|
91
|
+
* The eval's label (name or criterion).
|
|
92
|
+
*/
|
|
93
|
+
label: string;
|
|
94
|
+
/**
|
|
95
|
+
* Whether the eval passed.
|
|
96
|
+
*/
|
|
97
|
+
passed: boolean;
|
|
98
|
+
/**
|
|
99
|
+
* The judge's stated reason.
|
|
100
|
+
*/
|
|
101
|
+
reason: string;
|
|
102
|
+
}
|
|
103
|
+
export interface BooleanDetail {
|
|
104
|
+
expected: boolean;
|
|
105
|
+
kind: "boolean";
|
|
106
|
+
value: boolean;
|
|
107
|
+
}
|
|
108
|
+
export interface NumericDetail {
|
|
109
|
+
comparator: Comparator;
|
|
110
|
+
kind: "numeric";
|
|
111
|
+
threshold: number;
|
|
112
|
+
value: number;
|
|
113
|
+
}
|
|
114
|
+
/**
|
|
115
|
+
* An ordered list of messages. Thin wrapper so the type reads clearly at call
|
|
116
|
+
* sites and so we can grow conversation-level helpers without churn.
|
|
117
|
+
*/
|
|
118
|
+
export interface Transcript {
|
|
119
|
+
messages: Message[];
|
|
120
|
+
}
|
|
121
|
+
/**
|
|
122
|
+
* A single turn in the conversation.
|
|
123
|
+
*/
|
|
124
|
+
export interface Message {
|
|
125
|
+
content: string;
|
|
126
|
+
role: Role;
|
|
127
|
+
}
|
|
128
|
+
/**
|
|
129
|
+
* Token / cost usage for one provider call.
|
|
130
|
+
*
|
|
131
|
+
* Each field is independently optional because not every harness reports every
|
|
132
|
+
* signal (cost is commonly absent on subscription auth; some harnesses report
|
|
133
|
+
* no usage at all). The whole struct is `Option<Usage>` on a turn — `None`
|
|
134
|
+
* means "no signal," not "zero."
|
|
135
|
+
*/
|
|
136
|
+
export interface Usage {
|
|
137
|
+
cost_usd?: number | null;
|
|
138
|
+
input_tokens?: number | null;
|
|
139
|
+
output_tokens?: number | null;
|
|
140
|
+
}
|
|
141
|
+
/**
|
|
142
|
+
* Aggregate pass/fail counts for a report.
|
|
143
|
+
*/
|
|
144
|
+
export interface Summary {
|
|
145
|
+
/**
|
|
146
|
+
* Distinct test cases represented.
|
|
147
|
+
*/
|
|
148
|
+
cases: number;
|
|
149
|
+
/**
|
|
150
|
+
* Runs that failed.
|
|
151
|
+
*/
|
|
152
|
+
failed: number;
|
|
153
|
+
/**
|
|
154
|
+
* Runs that passed.
|
|
155
|
+
*/
|
|
156
|
+
passed: number;
|
|
157
|
+
/**
|
|
158
|
+
* Total (case × platform × model) runs.
|
|
159
|
+
*/
|
|
160
|
+
runs: number;
|
|
161
|
+
/**
|
|
162
|
+
* Aggregated token/cost usage across every run in the report. Omitted
|
|
163
|
+
* when no run reported usage.
|
|
164
|
+
*/
|
|
165
|
+
usage?: Usage | null;
|
|
166
|
+
}
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Generated from the golden JSON Schemas in schemas/ by `just gen-contract`.
|
|
3
|
+
* DO NOT MODIFY BY HAND — change the Rust report types and regenerate; the
|
|
4
|
+
* contract drift gate fails while this file is stale.
|
|
5
|
+
*/
|
|
6
|
+
/**
|
|
7
|
+
* The top-level report for a `skilltest validate` invocation.
|
|
8
|
+
*/
|
|
9
|
+
export interface ValidationReport {
|
|
10
|
+
/**
|
|
11
|
+
* Every finding, in discovery order.
|
|
12
|
+
*/
|
|
13
|
+
findings: ValidationFinding[];
|
|
14
|
+
/**
|
|
15
|
+
* True iff no findings were produced.
|
|
16
|
+
*/
|
|
17
|
+
valid: boolean;
|
|
18
|
+
}
|
|
19
|
+
/**
|
|
20
|
+
* One problem found while validating a skill, as serialized in the
|
|
21
|
+
* `skilltest validate --format json` output.
|
|
22
|
+
*/
|
|
23
|
+
export interface ValidationFinding {
|
|
24
|
+
/**
|
|
25
|
+
* What is wrong and how to fix it.
|
|
26
|
+
*/
|
|
27
|
+
message: string;
|
|
28
|
+
/**
|
|
29
|
+
* The skill directory the finding is about.
|
|
30
|
+
*/
|
|
31
|
+
skill: string;
|
|
32
|
+
}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Hand-written conveniences over the generated contract types. The type
|
|
3
|
+
* checker keeps these honest against `generated/` — a renamed or removed
|
|
4
|
+
* field fails `tsc`, not a user's test.
|
|
5
|
+
*/
|
|
6
|
+
import type { Report, Transcript } from "./generated/report.js";
|
|
7
|
+
/** The assistant turns of a transcript joined — handy for mix-in checks. */
|
|
8
|
+
export declare function assistantText(transcript: Transcript): string;
|
|
9
|
+
/** A one-line-per-failed-eval summary, for assertion messages. */
|
|
10
|
+
export declare function describeFailures(report: Report): string;
|
package/dist/helpers.js
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
/**
 * Concatenate everything the assistant said in a transcript.
 *
 * Messages whose `role` is not `"assistant"` are ignored; the remaining
 * `content` strings are joined, in order, with a newline between them.
 *
 * @param transcript Object holding the conversation in `messages`.
 * @returns The joined assistant text, or `""` when there are no assistant turns.
 */
export function assistantText(transcript) {
    const replies = transcript.messages.reduce((collected, message) => {
        if (message.role === "assistant") {
            collected.push(message.content);
        }
        return collected;
    }, []);
    return replies.join("\n");
}
|
|
8
|
+
/**
 * Summarise what went wrong in a report, one line per failed eval.
 *
 * Only runs with `passed === false` are inspected, and within those only the
 * evals that did not pass. Each line reads
 * `<case> [<platform>/<model>] <label>: <reason>`.
 *
 * @param report The report to summarise.
 * @returns The lines joined with newlines, or `""` when nothing failed;
 *   suitable as an assertion message.
 */
export function describeFailures(report) {
    return report.runs
        .filter((run) => !run.passed)
        .flatMap((run) => run.evals
        .filter((outcome) => !outcome.passed)
        .map((outcome) => `${run.case} [${run.platform}/${run.model}] ${outcome.label}: ${outcome.reason}`))
        .join("\n");
}
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* `@skill-test/sdk` — the TypeScript SDK for the `skilltest` CLI.
|
|
3
|
+
*
|
|
4
|
+
* A thin, typed wrapper around the CLI and nothing else: run test cases,
|
|
5
|
+
* validate skills, and get back objects typed by declarations **generated from
|
|
6
|
+
* the CLI's own JSON Schemas** (`just gen-contract`), so the types cannot
|
|
7
|
+
* drift from the binary. Test frameworks build on this — `@skill-test/vitest`
|
|
8
|
+
* adds the vitest helpers on top.
|
|
9
|
+
*
|
|
10
|
+
* ```ts
|
|
11
|
+
* import { runSkill, assistantText, describeFailures } from "@skill-test/sdk";
|
|
12
|
+
*
|
|
13
|
+
* const report = await runSkill("cases/greet.yaml");
|
|
14
|
+
* if (!report.passed) throw new Error(describeFailures(report));
|
|
15
|
+
* ```
|
|
16
|
+
*/
|
|
17
|
+
export { runSkill, validateSkill, ENV_BIN, ENV_PROVIDER, type RunOptions } from "./runner.js";
|
|
18
|
+
export { SkilltestError, SkilltestProviderError, SkilltestUsageError, } from "./errors.js";
|
|
19
|
+
export { assistantText, describeFailures } from "./helpers.js";
|
|
20
|
+
export type { BooleanDetail, CaseRun, Comparator, EvalDetail, EvalOutcome, Message, NumericDetail, Report, Role, Summary, Transcript, Usage, } from "./generated/report.js";
|
|
21
|
+
export type { ValidationFinding, ValidationReport } from "./generated/validation.js";
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* `@skill-test/sdk` — the TypeScript SDK for the `skilltest` CLI.
|
|
3
|
+
*
|
|
4
|
+
* A thin, typed wrapper around the CLI and nothing else: run test cases,
|
|
5
|
+
* validate skills, and get back objects typed by declarations **generated from
|
|
6
|
+
* the CLI's own JSON Schemas** (`just gen-contract`), so the types cannot
|
|
7
|
+
* drift from the binary. Test frameworks build on this — `@skill-test/vitest`
|
|
8
|
+
* adds the vitest helpers on top.
|
|
9
|
+
*
|
|
10
|
+
* ```ts
|
|
11
|
+
* import { runSkill, assistantText, describeFailures } from "@skill-test/sdk";
|
|
12
|
+
*
|
|
13
|
+
* const report = await runSkill("cases/greet.yaml");
|
|
14
|
+
* if (!report.passed) throw new Error(describeFailures(report));
|
|
15
|
+
* ```
|
|
16
|
+
*/
|
|
17
|
+
export { runSkill, validateSkill, ENV_BIN, ENV_PROVIDER } from "./runner.js";
|
|
18
|
+
export { SkilltestError, SkilltestProviderError, SkilltestUsageError, } from "./errors.js";
|
|
19
|
+
export { assistantText, describeFailures } from "./helpers.js";
|
package/dist/runner.d.ts
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
import type { Report } from "./generated/report.js";
|
|
2
|
+
import type { ValidationReport } from "./generated/validation.js";
|
|
3
|
+
/** Environment variables supplying defaults for the binary and provider. */
|
|
4
|
+
export declare const ENV_BIN = "SKILLTEST_BIN";
|
|
5
|
+
export declare const ENV_PROVIDER = "SKILLTEST_PROVIDER";
|
|
6
|
+
export interface RunOptions {
|
|
7
|
+
/** Path to the `skilltest` binary (default: `$SKILLTEST_BIN` or `skilltest`). */
|
|
8
|
+
bin?: string;
|
|
9
|
+
/** Provider command (default: `$SKILLTEST_PROVIDER`). A string, or an array of words that is joined with single spaces into one command string (elements are not quoted). */
|
|
10
|
+
provider?: string | string[];
|
|
11
|
+
/** Harness platforms to run on (overrides config). */
|
|
12
|
+
platforms?: string[];
|
|
13
|
+
/** Models to run on (overrides config). */
|
|
14
|
+
models?: string[];
|
|
15
|
+
/** Model used for evals and the simulated user. */
|
|
16
|
+
judgeModel?: string;
|
|
17
|
+
/** Cap on assistant turns for multi-turn cases. */
|
|
18
|
+
maxTurns?: number;
|
|
19
|
+
/** Path to a config file. */
|
|
20
|
+
config?: string;
|
|
21
|
+
/** Working directory for the subprocess. */
|
|
22
|
+
cwd?: string;
|
|
23
|
+
}
|
|
24
|
+
/**
|
|
25
|
+
* Run one or more test cases and return the parsed {@link Report}. A failing
|
|
26
|
+
* eval is reported in `report.passed`, not thrown; only bad input
|
|
27
|
+
* ({@link SkilltestUsageError}) and provider failures
|
|
28
|
+
* ({@link SkilltestProviderError}) throw.
|
|
29
|
+
*/
|
|
30
|
+
export declare function runSkill(casePath: string, options?: RunOptions): Promise<Report>;
|
|
31
|
+
/** Validate a skill directory (or a folder of them) and return findings. */
|
|
32
|
+
export declare function validateSkill(path: string, options?: Pick<RunOptions, "bin" | "cwd">): Promise<ValidationReport>;
|
package/dist/runner.js
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Run the `skilltest` CLI as a subprocess and parse its JSON contract.
|
|
3
|
+
*
|
|
4
|
+
* This is the code-level API: call {@link runSkill}, get a typed
|
|
5
|
+
* {@link Report}, assert on `report.passed`, and mix in deterministic checks
|
|
6
|
+
* against the transcript.
|
|
7
|
+
*/
|
|
8
|
+
import { spawn } from "node:child_process";
|
|
9
|
+
import { SkilltestError, SkilltestProviderError, SkilltestUsageError } from "./errors.js";
|
|
10
|
+
/** Environment variables supplying defaults for the binary and provider. */
export const ENV_BIN = "SKILLTEST_BIN";
export const ENV_PROVIDER = "SKILLTEST_PROVIDER";
/**
 * Pick the `skilltest` executable to spawn.
 *
 * Precedence: the explicit `bin` argument, then `$SKILLTEST_BIN`, then the
 * bare command name `skilltest`.
 *
 * An empty string counts as "not set" at every level. `||` is used instead of
 * `??` on purpose: `SKILLTEST_BIN=` in a shell or env file yields `""`, and
 * spawning an empty command name throws a synchronous `TypeError` instead of
 * surfacing as one of the SDK's own errors.
 *
 * @param bin Caller-supplied path or command name, if any.
 * @returns The command to spawn; never empty.
 */
function resolveBin(bin) {
    return bin || process.env[ENV_BIN] || "skilltest";
}
|
|
16
|
+
/**
 * Work out the value for the CLI's `--provider` flag.
 *
 * The explicit `provider` argument wins; when it is `undefined` or `null`,
 * `$SKILLTEST_PROVIDER` is consulted instead. An array is collapsed into one
 * string by joining its elements with single spaces (no quoting is applied).
 *
 * @param provider A command string, an array of command words, or nothing.
 * @returns The provider command string, or `undefined` when neither source
 *   supplies one.
 */
function resolveProvider(provider) {
    const chosen = provider ?? process.env[ENV_PROVIDER];
    if (chosen === undefined) {
        return undefined;
    }
    if (Array.isArray(chosen)) {
        return chosen.join(" ");
    }
    return chosen;
}
|
|
22
|
+
/**
 * Spawn `bin` with `args` and collect everything it writes.
 *
 * @param bin Executable to run.
 * @param args Argument vector handed to the executable.
 * @param cwd Working directory for the child; `undefined` inherits the
 *   parent's.
 * @returns A promise resolving to `{ status, signal, stdout, stderr }` once
 *   the child's stdio streams have closed. `status` is the exit code, or
 *   `null` when the child was terminated by a signal, in which case `signal`
 *   names it.
 * @throws {SkilltestProviderError} As a rejection, when the process cannot be
 *   started.
 */
function capture(bin, args, cwd) {
    return new Promise((resolve, reject) => {
        const fail = (err) => reject(new SkilltestProviderError(`could not run skilltest binary \`${bin}\`: ${err.message}. Set ${ENV_BIN} or pass bin.`));
        let child;
        try {
            child = spawn(bin, args, { cwd });
        }
        catch (err) {
            // spawn() validates its arguments synchronously; report that the
            // same way as an asynchronous launch failure so callers only
            // ever see the SDK's own error types.
            fail(err);
            return;
        }
        // Decode at the stream level: a multi-byte UTF-8 character can be
        // split across two chunks, and decoding each chunk separately would
        // corrupt it.
        child.stdout.setEncoding("utf8");
        child.stderr.setEncoding("utf8");
        let stdout = "";
        let stderr = "";
        child.stdout.on("data", (chunk) => {
            stdout += chunk;
        });
        child.stderr.on("data", (chunk) => {
            stderr += chunk;
        });
        child.on("error", fail);
        child.on("close", (status, signal) => resolve({ status, signal, stdout, stderr }));
    });
}
|
|
37
|
+
/**
 * Translate a finished CLI invocation into the SDK's error contract.
 *
 * Exit codes 0 (all passed) and 1 (some failed) still produce a JSON report,
 * so they return normally and the caller goes on to parse stdout. Anything
 * else throws, using stderr (or stdout when stderr is blank) as the detail.
 *
 * @param result Captured invocation: `status`, `stdout`, `stderr`, and
 *   optionally `signal`.
 * @throws {SkilltestUsageError} On exit code 2.
 * @throws {SkilltestProviderError} On exit code 3.
 * @throws {SkilltestError} On any other exit code, or when the child was
 *   killed by a signal.
 */
function raiseForStatus(result) {
    const { status } = result;
    if (status === 0 || status === 1)
        return;
    const detail = result.stderr.trim() || result.stdout.trim();
    if (status === 2)
        throw new SkilltestUsageError(detail);
    if (status === 3)
        throw new SkilltestProviderError(detail);
    if (status === null) {
        // A null status means the child never exited on its own; say so
        // instead of printing "exited null".
        const how = result.signal ? `signal ${result.signal}` : "a signal";
        throw new SkilltestError(`skilltest was terminated by ${how}${detail ? `: ${detail}` : ""}`);
    }
    throw new SkilltestError(`skilltest exited ${status}: ${detail}`);
}
|
|
48
|
+
/**
 * Decode the CLI's stdout as JSON.
 *
 * The decoded value is handed back without any shape validation. That is
 * deliberate: the SDK's types are generated from the CLI's own JSON Schemas,
 * and the contract drift gate (`just gen-contract --check` in CI) fails when
 * they diverge, so the shape is not re-validated here at runtime.
 *
 * @param stdout Raw standard output captured from the CLI.
 * @returns The decoded JSON value.
 * @throws {SkilltestError} When `stdout` is not valid JSON.
 */
function parse(stdout) {
    let decoded;
    try {
        decoded = JSON.parse(stdout);
    }
    catch (failure) {
        throw new SkilltestError(`skilltest did not emit JSON: ${failure.message}`);
    }
    return decoded;
}
|
|
60
|
+
/**
 * Invoke `skilltest run` on `casePath` and return the parsed report.
 *
 * A failing eval is not an exception: it comes back as a report whose
 * `passed` is false. Exceptions are reserved for runs that produced no usable
 * report at all.
 *
 * @param casePath Test case path passed to `skilltest run`.
 * @param options Optional overrides: `bin`, `provider`, `platforms`,
 *   `models`, `judgeModel`, `maxTurns`, `config`, `cwd`.
 * @returns The report decoded from the CLI's `--format json` output.
 * @throws {SkilltestUsageError} When the CLI exits with status 2 (bad input).
 * @throws {SkilltestProviderError} When the CLI exits with status 3, or the
 *   binary cannot be started.
 * @throws {SkilltestError} On any other non-report exit status, or when
 *   stdout is not JSON.
 */
export async function runSkill(casePath, options = {}) {
    const { config, platforms, models, judgeModel, maxTurns } = options;
    const provider = resolveProvider(options.provider);
    // One flag group per line, in the order the CLI receives them. `--config`
    // is a global flag, so it goes before the `run` subcommand.
    const args = [
        ...(config ? ["--config", config] : []),
        "run", casePath, "--format", "json",
        ...(provider !== undefined ? ["--provider", provider] : []),
        ...[...(platforms ?? [])].flatMap((platform) => ["--platform", platform]),
        ...[...(models ?? [])].flatMap((model) => ["--model", model]),
        ...(judgeModel ? ["--judge-model", judgeModel] : []),
        ...(maxTurns !== undefined ? ["--max-turns", String(maxTurns)] : []),
    ];
    const outcome = await capture(resolveBin(options.bin), args, options.cwd);
    raiseForStatus(outcome);
    return parse(outcome.stdout);
}
|
|
86
|
+
/**
 * Invoke `skilltest validate` on `path` and return the parsed findings report.
 *
 * @param path Skill directory, or a folder containing several.
 * @param options Optional `bin` (executable to run) and `cwd` (working
 *   directory for the subprocess).
 * @returns The validation report decoded from the CLI's `--format json`
 *   output.
 * @throws {SkilltestUsageError} When the CLI exits with status 2.
 * @throws {SkilltestProviderError} When the CLI exits with status 3, or the
 *   binary cannot be started.
 * @throws {SkilltestError} On any other non-report exit status, or when
 *   stdout is not JSON.
 */
export async function validateSkill(path, options = {}) {
    const executable = resolveBin(options.bin);
    const outcome = await capture(executable, ["validate", path, "--format", "json"], options.cwd);
    raiseForStatus(outcome);
    return parse(outcome.stdout);
}
|
package/package.json
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@skill-test/sdk",
|
|
3
|
+
"version": "0.1.1",
|
|
4
|
+
"description": "TypeScript SDK for the skilltest CLI: run AI-skill tests and natural-language evals from TypeScript, with a typed report contract generated from the CLI's JSON Schemas.",
|
|
5
|
+
"license": "MIT",
|
|
6
|
+
"repository": {
|
|
7
|
+
"type": "git",
|
|
8
|
+
"url": "git+https://github.com/nickderobertis/skilltest.git",
|
|
9
|
+
"directory": "sdks/typescript"
|
|
10
|
+
},
|
|
11
|
+
"publishConfig": {
|
|
12
|
+
"access": "public"
|
|
13
|
+
},
|
|
14
|
+
"type": "module",
|
|
15
|
+
"main": "./dist/index.js",
|
|
16
|
+
"types": "./dist/index.d.ts",
|
|
17
|
+
"exports": {
|
|
18
|
+
".": {
|
|
19
|
+
"types": "./dist/index.d.ts",
|
|
20
|
+
"default": "./dist/index.js"
|
|
21
|
+
}
|
|
22
|
+
},
|
|
23
|
+
"files": [
|
|
24
|
+
"dist"
|
|
25
|
+
],
|
|
26
|
+
"devDependencies": {
|
|
27
|
+
"@types/node": "^22.7.5",
|
|
28
|
+
"json-schema-to-typescript": "^15.0.4",
|
|
29
|
+
"typescript": "^5.6.3",
|
|
30
|
+
"vitest": "^2.1.3"
|
|
31
|
+
},
|
|
32
|
+
"nx": {
|
|
33
|
+
"includedScripts": []
|
|
34
|
+
},
|
|
35
|
+
"scripts": {
|
|
36
|
+
"build": "tsc -p tsconfig.build.json",
|
|
37
|
+
"typecheck": "tsc -p tsconfig.json",
|
|
38
|
+
"test": "vitest run"
|
|
39
|
+
}
|
|
40
|
+
}
|