@pauly4010/evalai-sdk 1.4.1 → 1.5.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +85 -0
- package/README.md +205 -543
- package/dist/assertions.d.ts +2 -2
- package/dist/assertions.js +104 -71
- package/dist/batch.js +12 -17
- package/dist/cache.js +7 -11
- package/dist/cli/api.d.ts +108 -0
- package/dist/cli/api.js +130 -0
- package/dist/cli/check.d.ts +28 -13
- package/dist/cli/check.js +249 -142
- package/dist/cli/ci-context.d.ts +6 -0
- package/dist/cli/ci-context.js +110 -0
- package/dist/cli/config.d.ts +30 -0
- package/dist/cli/config.js +207 -0
- package/dist/cli/constants.d.ts +15 -0
- package/dist/cli/constants.js +18 -0
- package/dist/cli/doctor.d.ts +11 -0
- package/dist/cli/doctor.js +82 -0
- package/dist/cli/formatters/github.d.ts +8 -0
- package/dist/cli/formatters/github.js +130 -0
- package/dist/cli/formatters/human.d.ts +6 -0
- package/dist/cli/formatters/human.js +107 -0
- package/dist/cli/formatters/json.d.ts +6 -0
- package/dist/cli/formatters/json.js +10 -0
- package/dist/cli/formatters/pr-comment.d.ts +12 -0
- package/dist/cli/formatters/pr-comment.js +101 -0
- package/dist/cli/formatters/types.d.ts +100 -0
- package/dist/cli/formatters/types.js +5 -0
- package/dist/cli/gate.d.ts +21 -0
- package/dist/cli/gate.js +175 -0
- package/dist/cli/index.d.ts +1 -0
- package/dist/cli/index.js +67 -23
- package/dist/cli/init.d.ts +7 -0
- package/dist/cli/init.js +69 -0
- package/dist/cli/policy-packs.d.ts +23 -0
- package/dist/cli/policy-packs.js +83 -0
- package/dist/cli/profiles.d.ts +28 -0
- package/dist/cli/profiles.js +30 -0
- package/dist/cli/reason-codes.d.ts +17 -0
- package/dist/cli/reason-codes.js +19 -0
- package/dist/cli/render/snippet.d.ts +5 -0
- package/dist/cli/render/snippet.js +15 -0
- package/dist/cli/render/sort.d.ts +10 -0
- package/dist/cli/render/sort.js +24 -0
- package/dist/cli/report/build-check-report.d.ts +19 -0
- package/dist/cli/report/build-check-report.js +124 -0
- package/dist/cli/share.d.ts +17 -0
- package/dist/cli/share.js +83 -0
- package/dist/client.d.ts +2 -2
- package/dist/client.js +144 -132
- package/dist/context.d.ts +1 -1
- package/dist/context.js +4 -6
- package/dist/errors.d.ts +2 -0
- package/dist/errors.js +116 -107
- package/dist/export.d.ts +6 -6
- package/dist/export.js +39 -33
- package/dist/index.d.ts +25 -24
- package/dist/index.js +62 -56
- package/dist/integrations/anthropic.d.ts +1 -1
- package/dist/integrations/anthropic.js +23 -19
- package/dist/integrations/openai-eval.d.ts +57 -0
- package/dist/integrations/openai-eval.js +230 -0
- package/dist/integrations/openai.d.ts +1 -1
- package/dist/integrations/openai.js +23 -19
- package/dist/local.d.ts +2 -2
- package/dist/local.js +25 -25
- package/dist/logger.d.ts +1 -1
- package/dist/logger.js +24 -28
- package/dist/matchers/index.d.ts +1 -0
- package/dist/matchers/index.js +6 -0
- package/dist/matchers/to-pass-gate.d.ts +29 -0
- package/dist/matchers/to-pass-gate.js +35 -0
- package/dist/pagination.d.ts +1 -1
- package/dist/pagination.js +6 -6
- package/dist/snapshot.js +24 -24
- package/dist/streaming.js +11 -11
- package/dist/testing.d.ts +6 -2
- package/dist/testing.js +30 -12
- package/dist/types.d.ts +22 -22
- package/dist/types.js +13 -13
- package/dist/utils/input-hash.d.ts +8 -0
- package/dist/utils/input-hash.js +38 -0
- package/dist/version.d.ts +7 -0
- package/dist/version.js +10 -0
- package/dist/workflows.d.ts +7 -7
- package/dist/workflows.js +44 -44
- package/package.json +102 -90
- package/dist/__tests__/assertions.test.d.ts +0 -1
- package/dist/__tests__/assertions.test.js +0 -288
- package/dist/__tests__/client.test.d.ts +0 -1
- package/dist/__tests__/client.test.js +0 -185
- package/dist/__tests__/testing.test.d.ts +0 -1
- package/dist/__tests__/testing.test.js +0 -230
- package/dist/__tests__/workflows.test.d.ts +0 -1
- package/dist/__tests__/workflows.test.js +0 -222
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* API fetch helpers for evalai check.
|
|
3
|
+
* Captures x-request-id from response headers.
|
|
4
|
+
* Sends X-EvalAI-SDK-Version and X-EvalAI-Spec-Version on all requests.
|
|
5
|
+
*/
|
|
6
|
+
/**
 * Shape of the JSON payload returned by `fetchQualityLatest` on success.
 *
 * Every field is optional and several may additionally be `null`, so callers
 * must check for presence before using a value.
 */
export type QualityLatestData = {
    score?: number;
    /** NOTE(review): presumably the number of test cases in the run (cf. the `--minN` gate) — confirm against the API. */
    total?: number | null;
    /** NOTE(review): the check CLI documents a gate on the value 'weak'; other values are not visible here. */
    evidenceLevel?: string | null;
    baselineScore?: number | null;
    regressionDelta?: number | null;
    baselineMissing?: boolean | null;
    /** Optional per-component figures; units and ranges are not established by this file. */
    breakdown?: {
        passRate?: number;
        safety?: number;
        judge?: number;
    };
    flags?: string[];
    evaluationRunId?: number;
    evaluationId?: number;
    avgLatencyMs?: number | null;
    costUsd?: number | null;
    baselineCostUsd?: number | null;
    baselineRunId?: number | null;
};
|
|
26
|
+
/**
 * Shape of the JSON payload returned by `fetchRunDetails` on success.
 *
 * All fields are optional; `results` holds one entry per recorded result.
 */
export type RunDetailsData = {
    results?: Array<{
        testCaseId?: number;
        status?: string;
        output?: string;
        durationMs?: number;
        assertionsJson?: Record<string, unknown>;
        // NOTE(review): snake_case key among camelCase fields — presumably mirrors the
        // API's relation name for the linked test case; confirm before renaming.
        test_cases?: {
            name?: string;
            input?: string;
            expectedOutput?: string;
        };
    }>;
};
|
|
40
|
+
/**
 * Requests `/api/quality` with `action=latest` for one evaluation.
 *
 * The request carries a `Bearer` Authorization header built from `apiKey`
 * plus the SDK/spec version headers.
 *
 * @param baseUrl API base URL; a single trailing slash is stripped.
 * @param apiKey API key sent as the Bearer token.
 * @param evaluationId Sent as the `evaluationId` query parameter.
 * @param baseline Sent as the `baseline` query parameter.
 * @returns `ok: true` with the parsed JSON body when the response is OK;
 *   otherwise `ok: false` with the HTTP status and raw body text. Errors thrown
 *   while fetching or parsing are reported as `status: 0` with the error
 *   message in `body`. `requestId` is the `x-request-id` response header when present.
 */
export declare function fetchQualityLatest(baseUrl: string, apiKey: string, evaluationId: string, baseline: string): Promise<{
    ok: true;
    data: QualityLatestData;
    requestId?: string;
} | {
    ok: false;
    status: number;
    body: string;
    requestId?: string;
}>;
|
|
50
|
+
/**
 * Requests `/api/evaluations/{evaluationId}/runs/{runId}` and parses the JSON body.
 *
 * @param baseUrl API base URL; a single trailing slash is stripped.
 * @param apiKey API key sent as the Bearer token.
 * @param evaluationId Evaluation whose run is requested.
 * @param runId Run to fetch.
 * @returns `ok: true` with the parsed body, or a bare `ok: false` for any
 *   non-OK response or thrown error — no status or message is reported.
 */
export declare function fetchRunDetails(baseUrl: string, apiKey: string, evaluationId: string, runId: number): Promise<{
    ok: true;
    data: RunDetailsData;
} | {
    ok: false;
}>;
|
|
56
|
+
/**
 * Optional CI metadata that `importRunOnFail` forwards as the `ci` field of
 * its request body. Every field is optional.
 */
export type CiContext = {
    provider?: "github" | "gitlab" | "circle" | "unknown";
    repo?: string;
    sha?: string;
    branch?: string;
    /** NOTE(review): presumably the pull/merge request number — confirm against the producer of this object. */
    pr?: number;
    runUrl?: string;
    actor?: string;
};
|
|
65
|
+
/**
 * One per-test-case result in the `results` array sent by `importRunOnFail`.
 */
export type ImportResult = {
    testCaseId: number;
    status: "passed" | "failed";
    output: string;
    latencyMs?: number;
    costUsd?: number;
    assertionsJson?: Record<string, unknown>;
};
|
|
73
|
+
/**
 * Shape of the JSON payload returned by `publishShare` on success.
 */
export type PublishShareResult = {
    shareId: string;
    shareUrl: string;
    /** NOTE(review): `publishShare` always sends "run" as the requested scope; other values are not visible here. */
    shareScope: string;
};
|
|
78
|
+
/**
 * Requests `/api/evaluations/{evaluationId}/runs/{runId}/export` and parses the JSON body.
 *
 * @param baseUrl API base URL; a single trailing slash is stripped.
 * @param apiKey API key sent as the Bearer token.
 * @param evaluationId Evaluation that owns the run.
 * @param runId Run to export.
 * @returns `ok: true` with the parsed body as `exportData`; otherwise
 *   `ok: false` with the HTTP status and raw body text. Errors thrown while
 *   fetching or parsing are reported as `status: 0` with the error message in `body`.
 */
export declare function fetchRunExport(baseUrl: string, apiKey: string, evaluationId: string, runId: number): Promise<{
    ok: true;
    exportData: Record<string, unknown>;
} | {
    ok: false;
    status: number;
    body: string;
}>;
|
|
86
|
+
/**
 * POSTs export data to `/api/evaluations/{evaluationId}/publish`.
 *
 * The JSON request body contains `exportData`, `shareScope: "run"`,
 * `evaluationRunId`, and `expiresInDays` when that option is neither `null`
 * nor `undefined`.
 *
 * @param baseUrl API base URL; a single trailing slash is stripped.
 * @param apiKey API key sent as the Bearer token.
 * @param evaluationId Evaluation to publish under.
 * @param exportData Export payload sent unchanged.
 * @param evaluationRunId Run the share refers to.
 * @param options Optional settings; `expiresInDays` is forwarded when set.
 * @returns `ok: true` with the parsed response body; otherwise `ok: false`
 *   with the HTTP status and raw body text. Errors thrown while fetching or
 *   parsing are reported as `status: 0` with the error message in `body`.
 */
export declare function publishShare(baseUrl: string, apiKey: string, evaluationId: string, exportData: Record<string, unknown>, evaluationRunId: number, options?: {
    expiresInDays?: number;
}): Promise<{
    ok: true;
    data: PublishShareResult;
} | {
    ok: false;
    status: number;
    body: string;
}>;
|
|
96
|
+
/**
 * POSTs a set of results to `/api/evaluations/{evaluationId}/runs/import`.
 *
 * The JSON request body contains `environment: "dev"`, `results`,
 * `importClientVersion` (defaulting to "evalai-cli"), `ci`, and `checkReport`
 * when that option is neither `null` nor `undefined`. A non-empty
 * `idempotencyKey` is sent as the `Idempotency-Key` header.
 *
 * @param baseUrl API base URL; a single trailing slash is stripped.
 * @param apiKey API key sent as the Bearer token.
 * @param evaluationId Evaluation to import the run into.
 * @param results Per-test-case results to import.
 * @param options Idempotency key, CI context, client version and check report.
 * @returns `ok: true` with the `runId` read from the response body; otherwise
 *   `ok: false` with the HTTP status and raw body text. Errors thrown while
 *   fetching or parsing are reported as `status: 0` with the error message in `body`.
 */
export declare function importRunOnFail(baseUrl: string, apiKey: string, evaluationId: string, results: ImportResult[], options: {
    idempotencyKey?: string;
    ci?: CiContext;
    importClientVersion?: string;
    checkReport?: Record<string, unknown>;
}): Promise<{
    ok: true;
    runId: number;
} | {
    ok: false;
    status: number;
    body: string;
}>;
|
package/dist/cli/api.js
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* API fetch helpers for evalai check.
|
|
4
|
+
* Captures x-request-id from response headers.
|
|
5
|
+
* Sends X-EvalAI-SDK-Version and X-EvalAI-Spec-Version on all requests.
|
|
6
|
+
*/
|
|
7
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
8
|
+
exports.fetchQualityLatest = fetchQualityLatest;
|
|
9
|
+
exports.fetchRunDetails = fetchRunDetails;
|
|
10
|
+
exports.fetchRunExport = fetchRunExport;
|
|
11
|
+
exports.publishShare = publishShare;
|
|
12
|
+
exports.importRunOnFail = importRunOnFail;
|
|
13
|
+
const version_1 = require("../version");
|
|
14
|
+
/** Version headers that every request helper in this module spreads into its own headers. */
const API_HEADERS = {
    "X-EvalAI-SDK-Version": version_1.SDK_VERSION,
    "X-EvalAI-Spec-Version": version_1.SPEC_VERSION,
};
|
|
18
|
+
/**
 * Requests `/api/quality` with `action=latest` for one evaluation.
 *
 * @param baseUrl API base URL; a single trailing slash is stripped.
 * @param apiKey API key sent as the Bearer token.
 * @param evaluationId Sent (URL-encoded) as the `evaluationId` query parameter.
 * @param baseline Sent (URL-encoded) as the `baseline` query parameter.
 * @returns `{ ok: true, data, requestId }` when the response is OK, otherwise
 *   `{ ok: false, status, body, requestId }`. Errors thrown while fetching or
 *   parsing are reported as `status: 0` with the error message in `body`.
 */
async function fetchQualityLatest(baseUrl, apiKey, evaluationId, baseline) {
    const headers = { ...API_HEADERS, Authorization: `Bearer ${apiKey}` };
    // Encode caller-supplied values so characters such as "&", "#" or spaces
    // cannot corrupt the query string or inject extra parameters.
    const query = `evaluationId=${encodeURIComponent(evaluationId)}&action=latest&baseline=${encodeURIComponent(baseline)}`;
    const url = `${baseUrl.replace(/\/$/, "")}/api/quality?${query}`;
    // Declared outside the try block so that a body-read or JSON-parse failure
    // still reports the request id already captured from the response headers.
    let requestId;
    try {
        const res = await fetch(url, { headers });
        requestId = res.headers.get("x-request-id") ?? undefined;
        const body = await res.text();
        if (!res.ok) {
            return { ok: false, status: res.status, body, requestId };
        }
        const data = JSON.parse(body);
        return { ok: true, data, requestId };
    }
    catch (err) {
        const msg = err instanceof Error ? err.message : String(err);
        return { ok: false, status: 0, body: msg, requestId };
    }
}
|
|
36
|
+
/**
 * Requests `/api/evaluations/{evaluationId}/runs/{runId}` and parses the JSON body.
 *
 * Best-effort by design: any non-OK response or thrown error collapses to a
 * bare `{ ok: false }` with no status or message.
 *
 * @param baseUrl API base URL; a single trailing slash is stripped.
 * @param apiKey API key sent as the Bearer token.
 * @param evaluationId Evaluation whose run is requested (URL-encoded into the path).
 * @param runId Run to fetch (URL-encoded into the path).
 * @returns `{ ok: true, data }` on success, otherwise `{ ok: false }`.
 */
async function fetchRunDetails(baseUrl, apiKey, evaluationId, runId) {
    const headers = { ...API_HEADERS, Authorization: `Bearer ${apiKey}` };
    // Encode each path segment so "/", "?" or "#" inside an id cannot change
    // which endpoint is addressed.
    const evalSegment = encodeURIComponent(String(evaluationId));
    const runSegment = encodeURIComponent(String(runId));
    const url = `${baseUrl.replace(/\/$/, "")}/api/evaluations/${evalSegment}/runs/${runSegment}`;
    try {
        const res = await fetch(url, { headers });
        if (!res.ok)
            return { ok: false };
        const data = (await res.json());
        return { ok: true, data };
    }
    catch {
        // Deliberately silent: the declared result type carries no error detail.
        return { ok: false };
    }
}
|
|
50
|
+
/**
 * Requests `/api/evaluations/{evaluationId}/runs/{runId}/export` and parses the JSON body.
 *
 * @param baseUrl API base URL; a single trailing slash is stripped.
 * @param apiKey API key sent as the Bearer token.
 * @param evaluationId Evaluation that owns the run (URL-encoded into the path).
 * @param runId Run to export (URL-encoded into the path).
 * @returns `{ ok: true, exportData }` when the response is OK, otherwise
 *   `{ ok: false, status, body }`. Errors thrown while fetching or parsing are
 *   reported as `status: 0` with the error message in `body`.
 */
async function fetchRunExport(baseUrl, apiKey, evaluationId, runId) {
    const headers = { ...API_HEADERS, Authorization: `Bearer ${apiKey}` };
    // Encode each path segment so "/", "?" or "#" inside an id cannot change
    // which endpoint is addressed.
    const evalSegment = encodeURIComponent(String(evaluationId));
    const runSegment = encodeURIComponent(String(runId));
    const url = `${baseUrl.replace(/\/$/, "")}/api/evaluations/${evalSegment}/runs/${runSegment}/export`;
    try {
        const res = await fetch(url, { headers });
        // Read as text first so the raw body can be returned on a non-OK status.
        const text = await res.text();
        if (!res.ok)
            return { ok: false, status: res.status, body: text };
        const exportData = JSON.parse(text);
        return { ok: true, exportData };
    }
    catch (err) {
        const msg = err instanceof Error ? err.message : String(err);
        return { ok: false, status: 0, body: msg };
    }
}
|
|
66
|
+
/**
 * POSTs export data to `/api/evaluations/{evaluationId}/publish`.
 *
 * @param baseUrl API base URL; a single trailing slash is stripped.
 * @param apiKey API key sent as the Bearer token.
 * @param evaluationId Evaluation to publish under (URL-encoded into the path).
 * @param exportData Export payload sent unchanged in the request body.
 * @param evaluationRunId Run the share refers to.
 * @param options Optional settings; `expiresInDays` is forwarded when it is
 *   neither `null` nor `undefined`.
 * @returns `{ ok: true, data }` when the response is OK, otherwise
 *   `{ ok: false, status, body }`. Errors thrown while fetching or parsing are
 *   reported as `status: 0` with the error message in `body`.
 */
async function publishShare(baseUrl, apiKey, evaluationId, exportData, evaluationRunId, options) {
    const headers = {
        ...API_HEADERS,
        Authorization: `Bearer ${apiKey}`,
        "Content-Type": "application/json",
    };
    const body = {
        exportData,
        shareScope: "run",
        evaluationRunId,
        // Spreading `false` adds nothing, so the key is omitted when no expiry is given.
        ...(options?.expiresInDays != null && { expiresInDays: options.expiresInDays }),
    };
    // Encode the path segment so "/", "?" or "#" inside the id cannot change
    // which endpoint is addressed.
    const evalSegment = encodeURIComponent(String(evaluationId));
    const url = `${baseUrl.replace(/\/$/, "")}/api/evaluations/${evalSegment}/publish`;
    try {
        const res = await fetch(url, {
            method: "POST",
            headers,
            body: JSON.stringify(body),
        });
        // Read as text first so the raw body can be returned on a non-OK status.
        const text = await res.text();
        if (!res.ok)
            return { ok: false, status: res.status, body: text };
        const data = JSON.parse(text);
        return { ok: true, data };
    }
    catch (err) {
        const msg = err instanceof Error ? err.message : String(err);
        return { ok: false, status: 0, body: msg };
    }
}
|
|
96
|
+
/**
 * POSTs a set of results to `/api/evaluations/{evaluationId}/runs/import`.
 *
 * @param baseUrl API base URL; a single trailing slash is stripped.
 * @param apiKey API key sent as the Bearer token.
 * @param evaluationId Evaluation to import the run into (URL-encoded into the path).
 * @param results Per-test-case results sent unchanged in the request body.
 * @param options `idempotencyKey` (sent as the `Idempotency-Key` header when
 *   non-empty), `ci`, `importClientVersion` (defaults to "evalai-cli") and
 *   `checkReport` (included when neither `null` nor `undefined`).
 * @returns `{ ok: true, runId }` when the response is OK, otherwise
 *   `{ ok: false, status, body }`. Errors thrown while fetching or parsing are
 *   reported as `status: 0` with the error message in `body`.
 */
async function importRunOnFail(baseUrl, apiKey, evaluationId, results, options) {
    const headers = {
        ...API_HEADERS,
        Authorization: `Bearer ${apiKey}`,
        "Content-Type": "application/json",
    };
    if (options.idempotencyKey) {
        headers["Idempotency-Key"] = options.idempotencyKey;
    }
    const body = {
        environment: "dev",
        results,
        importClientVersion: options.importClientVersion ?? "evalai-cli",
        ci: options.ci,
        // Spreading `false` adds nothing, so the key is omitted when no report is given.
        ...(options.checkReport != null && { checkReport: options.checkReport }),
    };
    // Encode the path segment so "/", "?" or "#" inside the id cannot change
    // which endpoint is addressed.
    const evalSegment = encodeURIComponent(String(evaluationId));
    const url = `${baseUrl.replace(/\/$/, "")}/api/evaluations/${evalSegment}/runs/import`;
    try {
        const res = await fetch(url, {
            method: "POST",
            headers,
            body: JSON.stringify(body),
        });
        // Read as text first so the raw body can be returned on a non-OK status.
        const text = await res.text();
        if (!res.ok) {
            return { ok: false, status: res.status, body: text };
        }
        const data = JSON.parse(text);
        return { ok: true, runId: data.runId };
    }
    catch (err) {
        const msg = err instanceof Error ? err.message : String(err);
        return { ok: false, status: 0, body: msg };
    }
}
|
package/dist/cli/check.d.ts
CHANGED
|
@@ -14,10 +14,14 @@
|
|
|
14
14
|
* --minN <n> Fail if total test cases < n (low sample size)
|
|
15
15
|
* --allowWeakEvidence If false (default), fail when evidenceLevel is 'weak'
|
|
16
16
|
* --policy <name> Enforce a compliance policy (e.g. HIPAA, SOC2, GDPR)
|
|
17
|
-
* --baseline <mode>
|
|
17
|
+
* --baseline <mode> Baseline comparison mode: "published" (default), "previous", or "production"
|
|
18
18
|
* --evaluationId <id> Required. The evaluation to gate on.
|
|
19
19
|
* --baseUrl <url> API base URL (default: EVALAI_BASE_URL or http://localhost:3000)
|
|
20
20
|
* --apiKey <key> API key (default: EVALAI_API_KEY env var)
|
|
21
|
+
* --share <mode> Share link: "always" | "fail" | "never" (default: never)
|
|
22
|
+
* fail = create public share link only when gate fails (CI-friendly)
|
|
23
|
+
* --pr-comment-out <file> Write PR comment markdown to file (for GitHub Action to post)
|
|
24
|
+
* --profile <name> Preset: strict (95/0/30), balanced (90/2/10), fast (85/5/5). Explicit flags override.
|
|
21
25
|
*
|
|
22
26
|
* Exit codes:
|
|
23
27
|
* 0 — Gate passed
|
|
@@ -28,31 +32,42 @@
|
|
|
28
32
|
* 5 — Invalid arguments
|
|
29
33
|
* 6 — Gate failed: total test cases < minN
|
|
30
34
|
* 7 — Gate failed: weak evidence (evidenceLevel === 'weak')
|
|
35
|
+
* 8 — Gate warned: near-regression (warnDrop ≤ drop < maxDrop)
|
|
31
36
|
*
|
|
32
37
|
* Environment:
|
|
33
38
|
* EVALAI_BASE_URL — API base URL (default: http://localhost:3000)
|
|
34
39
|
* EVALAI_API_KEY — API key for authentication
|
|
35
40
|
*/
|
|
36
|
-
export
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
readonly REGRESSION: 2;
|
|
40
|
-
readonly POLICY_VIOLATION: 3;
|
|
41
|
-
readonly API_ERROR: 4;
|
|
42
|
-
readonly BAD_ARGS: 5;
|
|
43
|
-
readonly LOW_N: 6;
|
|
44
|
-
readonly WEAK_EVIDENCE: 7;
|
|
45
|
-
};
|
|
41
|
+
export { EXIT } from "./constants";
|
|
42
|
+
export type FormatType = "human" | "json" | "github";
|
|
43
|
+
export type ShareMode = "always" | "fail" | "never";
|
|
46
44
|
export interface CheckArgs {
|
|
47
45
|
baseUrl: string;
|
|
48
46
|
apiKey: string;
|
|
49
47
|
minScore: number;
|
|
50
48
|
maxDrop?: number;
|
|
49
|
+
warnDrop?: number;
|
|
51
50
|
minN?: number;
|
|
52
51
|
allowWeakEvidence: boolean;
|
|
53
52
|
evaluationId: string;
|
|
54
53
|
policy?: string;
|
|
55
|
-
baseline:
|
|
54
|
+
baseline: "published" | "previous" | "production" | "auto";
|
|
55
|
+
format: FormatType;
|
|
56
|
+
explain: boolean;
|
|
57
|
+
onFail?: "import";
|
|
58
|
+
share: ShareMode;
|
|
59
|
+
prCommentOut?: string;
|
|
60
|
+
maxCostUsd?: number;
|
|
61
|
+
maxLatencyMs?: number;
|
|
62
|
+
maxCostDeltaUsd?: number;
|
|
56
63
|
}
|
|
57
|
-
export
|
|
64
|
+
export type ParseArgsResult = {
|
|
65
|
+
ok: true;
|
|
66
|
+
args: CheckArgs;
|
|
67
|
+
} | {
|
|
68
|
+
ok: false;
|
|
69
|
+
exitCode: number;
|
|
70
|
+
message: string;
|
|
71
|
+
};
|
|
72
|
+
export declare function parseArgs(argv: string[]): ParseArgsResult;
|
|
58
73
|
export declare function runCheck(args: CheckArgs): Promise<number>;
|