@pauly4010/evalai-sdk 1.4.1 → 1.5.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95) hide show
  1. package/CHANGELOG.md +85 -0
  2. package/README.md +205 -543
  3. package/dist/assertions.d.ts +2 -2
  4. package/dist/assertions.js +104 -71
  5. package/dist/batch.js +12 -17
  6. package/dist/cache.js +7 -11
  7. package/dist/cli/api.d.ts +108 -0
  8. package/dist/cli/api.js +130 -0
  9. package/dist/cli/check.d.ts +28 -13
  10. package/dist/cli/check.js +249 -142
  11. package/dist/cli/ci-context.d.ts +6 -0
  12. package/dist/cli/ci-context.js +110 -0
  13. package/dist/cli/config.d.ts +30 -0
  14. package/dist/cli/config.js +207 -0
  15. package/dist/cli/constants.d.ts +15 -0
  16. package/dist/cli/constants.js +18 -0
  17. package/dist/cli/doctor.d.ts +11 -0
  18. package/dist/cli/doctor.js +82 -0
  19. package/dist/cli/formatters/github.d.ts +8 -0
  20. package/dist/cli/formatters/github.js +130 -0
  21. package/dist/cli/formatters/human.d.ts +6 -0
  22. package/dist/cli/formatters/human.js +107 -0
  23. package/dist/cli/formatters/json.d.ts +6 -0
  24. package/dist/cli/formatters/json.js +10 -0
  25. package/dist/cli/formatters/pr-comment.d.ts +12 -0
  26. package/dist/cli/formatters/pr-comment.js +101 -0
  27. package/dist/cli/formatters/types.d.ts +100 -0
  28. package/dist/cli/formatters/types.js +5 -0
  29. package/dist/cli/gate.d.ts +21 -0
  30. package/dist/cli/gate.js +175 -0
  31. package/dist/cli/index.d.ts +1 -0
  32. package/dist/cli/index.js +67 -23
  33. package/dist/cli/init.d.ts +7 -0
  34. package/dist/cli/init.js +69 -0
  35. package/dist/cli/policy-packs.d.ts +23 -0
  36. package/dist/cli/policy-packs.js +83 -0
  37. package/dist/cli/profiles.d.ts +28 -0
  38. package/dist/cli/profiles.js +30 -0
  39. package/dist/cli/reason-codes.d.ts +17 -0
  40. package/dist/cli/reason-codes.js +19 -0
  41. package/dist/cli/render/snippet.d.ts +5 -0
  42. package/dist/cli/render/snippet.js +15 -0
  43. package/dist/cli/render/sort.d.ts +10 -0
  44. package/dist/cli/render/sort.js +24 -0
  45. package/dist/cli/report/build-check-report.d.ts +19 -0
  46. package/dist/cli/report/build-check-report.js +124 -0
  47. package/dist/cli/share.d.ts +17 -0
  48. package/dist/cli/share.js +83 -0
  49. package/dist/client.d.ts +2 -2
  50. package/dist/client.js +144 -132
  51. package/dist/context.d.ts +1 -1
  52. package/dist/context.js +4 -6
  53. package/dist/errors.d.ts +2 -0
  54. package/dist/errors.js +116 -107
  55. package/dist/export.d.ts +6 -6
  56. package/dist/export.js +39 -33
  57. package/dist/index.d.ts +25 -24
  58. package/dist/index.js +62 -56
  59. package/dist/integrations/anthropic.d.ts +1 -1
  60. package/dist/integrations/anthropic.js +23 -19
  61. package/dist/integrations/openai-eval.d.ts +57 -0
  62. package/dist/integrations/openai-eval.js +230 -0
  63. package/dist/integrations/openai.d.ts +1 -1
  64. package/dist/integrations/openai.js +23 -19
  65. package/dist/local.d.ts +2 -2
  66. package/dist/local.js +25 -25
  67. package/dist/logger.d.ts +1 -1
  68. package/dist/logger.js +24 -28
  69. package/dist/matchers/index.d.ts +1 -0
  70. package/dist/matchers/index.js +6 -0
  71. package/dist/matchers/to-pass-gate.d.ts +29 -0
  72. package/dist/matchers/to-pass-gate.js +35 -0
  73. package/dist/pagination.d.ts +1 -1
  74. package/dist/pagination.js +6 -6
  75. package/dist/snapshot.js +24 -24
  76. package/dist/streaming.js +11 -11
  77. package/dist/testing.d.ts +6 -2
  78. package/dist/testing.js +30 -12
  79. package/dist/types.d.ts +22 -22
  80. package/dist/types.js +13 -13
  81. package/dist/utils/input-hash.d.ts +8 -0
  82. package/dist/utils/input-hash.js +38 -0
  83. package/dist/version.d.ts +7 -0
  84. package/dist/version.js +10 -0
  85. package/dist/workflows.d.ts +7 -7
  86. package/dist/workflows.js +44 -44
  87. package/package.json +102 -90
  88. package/dist/__tests__/assertions.test.d.ts +0 -1
  89. package/dist/__tests__/assertions.test.js +0 -288
  90. package/dist/__tests__/client.test.d.ts +0 -1
  91. package/dist/__tests__/client.test.js +0 -185
  92. package/dist/__tests__/testing.test.d.ts +0 -1
  93. package/dist/__tests__/testing.test.js +0 -230
  94. package/dist/__tests__/workflows.test.d.ts +0 -1
  95. package/dist/__tests__/workflows.test.js +0 -222
@@ -0,0 +1,108 @@
1
+ /**
2
+ * API fetch helpers for evalai check.
3
+ * Captures x-request-id from response headers.
4
+ * Sends X-EvalAI-SDK-Version and X-EvalAI-Spec-Version on all requests.
5
+ */
6
+ export type QualityLatestData = { // Shape of data in the fetchQualityLatest success result; every field is optional.
7
+ score?: number;
8
+ total?: number | null;
9
+ evidenceLevel?: string | null;
10
+ baselineScore?: number | null; // Fields typed with | null may be explicitly null as well as absent.
11
+ regressionDelta?: number | null;
12
+ baselineMissing?: boolean | null;
13
+ breakdown?: { // Optional component numbers; their scale is not visible in this file — TODO confirm.
14
+ passRate?: number;
15
+ safety?: number;
16
+ judge?: number;
17
+ };
18
+ flags?: string[];
19
+ evaluationRunId?: number;
20
+ evaluationId?: number;
21
+ avgLatencyMs?: number | null; // presumably milliseconds, going by the name — verify against the API.
22
+ costUsd?: number | null;
23
+ baselineCostUsd?: number | null;
24
+ baselineRunId?: number | null;
25
+ };
26
+ export type RunDetailsData = { // Shape of data in the fetchRunDetails success result.
27
+ results?: Array<{ // Optional list of result entries; every field of an entry is optional.
28
+ testCaseId?: number;
29
+ status?: string;
30
+ output?: string;
31
+ durationMs?: number;
32
+ assertionsJson?: Record<string, unknown>;
33
+ test_cases?: { // presumably the test case this result belongs to — verify against the API.
34
+ name?: string;
35
+ input?: string;
36
+ expectedOutput?: string;
37
+ };
38
+ }>;
39
+ };
40
+ export declare function fetchQualityLatest(baseUrl: string, apiKey: string, evaluationId: string, baseline: string): Promise<{ // Resolves to a union discriminated by ok.
41
+ ok: true;
42
+ data: QualityLatestData;
43
+ requestId?: string; // Optional on both branches; the file header says x-request-id is captured from response headers.
44
+ } | {
45
+ ok: false;
46
+ status: number;
47
+ body: string;
48
+ requestId?: string;
49
+ }>;
50
+ export declare function fetchRunDetails(baseUrl: string, apiKey: string, evaluationId: string, runId: number): Promise<{ // Resolves to a union discriminated by ok.
51
+ ok: true;
52
+ data: RunDetailsData;
53
+ } | {
54
+ ok: false; // This failure branch exposes no status or body, unlike the other helpers declared in this file.
55
+ }>;
56
+ export type CiContext = { // Optional CI metadata; accepted by importRunOnFail as options.ci. Every field is optional.
57
+ provider?: "github" | "gitlab" | "circle" | "unknown";
58
+ repo?: string;
59
+ sha?: string;
60
+ branch?: string;
61
+ pr?: number; // presumably the pull request number — verify against the code that fills this in.
62
+ runUrl?: string;
63
+ actor?: string;
64
+ };
65
+ export type ImportResult = { // Element type of the results array passed to importRunOnFail.
66
+ testCaseId: number;
67
+ status: "passed" | "failed"; // Only these two literal values are allowed.
68
+ output: string;
69
+ latencyMs?: number;
70
+ costUsd?: number;
71
+ assertionsJson?: Record<string, unknown>;
72
+ };
73
+ export type PublishShareResult = { // Shape of data in the publishShare success result; all three fields are required strings.
74
+ shareId: string;
75
+ shareUrl: string;
76
+ shareScope: string;
77
+ };
78
+ export declare function fetchRunExport(baseUrl: string, apiKey: string, evaluationId: string, runId: number): Promise<{ // Resolves to a union discriminated by ok.
79
+ ok: true;
80
+ exportData: Record<string, unknown>; // Typed as an opaque record; its keys are not described in this file.
81
+ } | {
82
+ ok: false;
83
+ status: number;
84
+ body: string;
85
+ }>;
86
+ export declare function publishShare(baseUrl: string, apiKey: string, evaluationId: string, exportData: Record<string, unknown>, evaluationRunId: number, options?: { // options may be omitted entirely.
87
+ expiresInDays?: number;
88
+ }): Promise<{ // Resolves to a union discriminated by ok.
89
+ ok: true;
90
+ data: PublishShareResult;
91
+ } | {
92
+ ok: false;
93
+ status: number;
94
+ body: string;
95
+ }>;
96
+ export declare function importRunOnFail(baseUrl: string, apiKey: string, evaluationId: string, results: ImportResult[], options: { // options is required, but each of its fields is optional.
97
+ idempotencyKey?: string;
98
+ ci?: CiContext;
99
+ importClientVersion?: string;
100
+ checkReport?: Record<string, unknown>;
101
+ }): Promise<{ // Resolves to a union discriminated by ok.
102
+ ok: true;
103
+ runId: number;
104
+ } | {
105
+ ok: false;
106
+ status: number;
107
+ body: string;
108
+ }>;
@@ -0,0 +1,130 @@
1
+ "use strict";
2
+ /**
3
+ * API fetch helpers for evalai check.
4
+ * Captures x-request-id from response headers.
5
+ * Sends X-EvalAI-SDK-Version and X-EvalAI-Spec-Version on all requests.
6
+ */
7
+ Object.defineProperty(exports, "__esModule", { value: true });
8
+ exports.fetchQualityLatest = fetchQualityLatest;
9
+ exports.fetchRunDetails = fetchRunDetails;
10
+ exports.fetchRunExport = fetchRunExport;
11
+ exports.publishShare = publishShare;
12
+ exports.importRunOnFail = importRunOnFail;
13
+ const version_1 = require("../version");
14
+ const API_HEADERS = { // Version headers; the file header says these are sent on all requests.
15
+ "X-EvalAI-SDK-Version": version_1.SDK_VERSION,
16
+ "X-EvalAI-Spec-Version": version_1.SPEC_VERSION,
17
+ };
18
/**
 * Fetch the latest quality result for an evaluation from `/api/quality`.
 *
 * Never rejects: a failed fetch or an unparsable body is reported as
 * `{ ok: false, status: 0, body: <error message> }`.
 *
 * @param {string} baseUrl API base URL; one trailing slash is stripped.
 * @param {string} apiKey Sent as `Authorization: Bearer <apiKey>`.
 * @param {string} evaluationId Sent as the `evaluationId` query parameter (URL-encoded).
 * @param {string} baseline Sent as the `baseline` query parameter (URL-encoded).
 * @returns `{ ok: true, data, requestId }` when the response is OK and its body
 *   parses as JSON; otherwise `{ ok: false, status, body, requestId }`.
 *   `requestId` is the `x-request-id` response header when a response arrived.
 */
async function fetchQualityLatest(baseUrl, apiKey, evaluationId, baseline) {
    const headers = { ...API_HEADERS, Authorization: `Bearer ${apiKey}` };
    // Encode caller-supplied values so characters such as "&", "#" or spaces
    // cannot truncate the query string or inject extra parameters.
    const query = `evaluationId=${encodeURIComponent(evaluationId)}&action=latest&baseline=${encodeURIComponent(baseline)}`;
    const url = `${baseUrl.replace(/\/$/, "")}/api/quality?${query}`;
    // Declared outside the try so a request id that was already received is
    // still reported when reading or parsing the body fails afterwards.
    let requestId;
    try {
        const res = await fetch(url, { headers });
        requestId = res.headers.get("x-request-id") ?? undefined;
        const body = await res.text();
        if (!res.ok) {
            return { ok: false, status: res.status, body, requestId };
        }
        const data = JSON.parse(body);
        return { ok: true, data, requestId };
    }
    catch (err) {
        const msg = err instanceof Error ? err.message : String(err);
        return { ok: false, status: 0, body: msg, requestId };
    }
}
36
/**
 * Fetch one run of an evaluation from `/api/evaluations/<id>/runs/<runId>`.
 *
 * Best-effort by design: any non-OK response, fetch rejection or JSON failure
 * collapses to `{ ok: false }` with no further detail.
 *
 * @param {string} baseUrl API base URL; one trailing slash is stripped.
 * @param {string} apiKey Sent as `Authorization: Bearer <apiKey>`.
 * @param {string} evaluationId Evaluation id, URL-encoded into the path.
 * @param {number} runId Run id, URL-encoded into the path.
 * @returns `{ ok: true, data }` with the parsed JSON body, or `{ ok: false }`.
 */
async function fetchRunDetails(baseUrl, apiKey, evaluationId, runId) {
    const headers = { ...API_HEADERS, Authorization: `Bearer ${apiKey}` };
    // Encode the ids so "/", "?" or "#" in a value cannot change which
    // endpoint the request is sent to.
    const root = baseUrl.replace(/\/$/, "");
    const url = `${root}/api/evaluations/${encodeURIComponent(evaluationId)}/runs/${encodeURIComponent(runId)}`;
    try {
        const res = await fetch(url, { headers });
        if (!res.ok)
            return { ok: false };
        const data = (await res.json());
        return { ok: true, data };
    }
    catch {
        // Deliberately silent: callers only distinguish success from failure.
        return { ok: false };
    }
}
50
/**
 * Fetch the export payload of a run from
 * `/api/evaluations/<id>/runs/<runId>/export`.
 *
 * Never rejects: a failed fetch or an unparsable body is reported as
 * `{ ok: false, status: 0, body: <error message> }`.
 *
 * @param {string} baseUrl API base URL; one trailing slash is stripped.
 * @param {string} apiKey Sent as `Authorization: Bearer <apiKey>`.
 * @param {string} evaluationId Evaluation id, URL-encoded into the path.
 * @param {number} runId Run id, URL-encoded into the path.
 * @returns `{ ok: true, exportData }` with the parsed JSON body, or
 *   `{ ok: false, status, body }` carrying the HTTP status and raw body text.
 */
async function fetchRunExport(baseUrl, apiKey, evaluationId, runId) {
    const headers = { ...API_HEADERS, Authorization: `Bearer ${apiKey}` };
    // Encode the ids so "/", "?" or "#" in a value cannot change which
    // endpoint the request is sent to.
    const root = baseUrl.replace(/\/$/, "");
    const url = `${root}/api/evaluations/${encodeURIComponent(evaluationId)}/runs/${encodeURIComponent(runId)}/export`;
    try {
        const res = await fetch(url, { headers });
        const text = await res.text();
        if (!res.ok)
            return { ok: false, status: res.status, body: text };
        const exportData = JSON.parse(text);
        return { ok: true, exportData };
    }
    catch (err) {
        const msg = err instanceof Error ? err.message : String(err);
        return { ok: false, status: 0, body: msg };
    }
}
66
/**
 * Publish a run-scoped share by POSTing to `/api/evaluations/<id>/publish`.
 *
 * The JSON request body is `{ exportData, shareScope: "run", evaluationRunId }`
 * plus `expiresInDays` when that option is neither null nor undefined.
 *
 * Never rejects: a failed fetch or an unparsable body is reported as
 * `{ ok: false, status: 0, body: <error message> }`.
 *
 * @param {string} baseUrl API base URL; one trailing slash is stripped.
 * @param {string} apiKey Sent as `Authorization: Bearer <apiKey>`.
 * @param {string} evaluationId Evaluation id, URL-encoded into the path.
 * @param {Record<string, unknown>} exportData Payload forwarded unchanged.
 * @param {number} evaluationRunId Run the share refers to.
 * @param {{ expiresInDays?: number }} [options] Optional settings.
 * @returns `{ ok: true, data }` with the parsed JSON body, or
 *   `{ ok: false, status, body }` carrying the HTTP status and raw body text.
 */
async function publishShare(baseUrl, apiKey, evaluationId, exportData, evaluationRunId, options) {
    const headers = {
        ...API_HEADERS,
        Authorization: `Bearer ${apiKey}`,
        "Content-Type": "application/json",
    };
    const payload = {
        exportData,
        shareScope: "run",
        evaluationRunId,
        // `!= null` keeps an explicit 0 while skipping null and undefined.
        ...(options?.expiresInDays != null && { expiresInDays: options.expiresInDays }),
    };
    // Encode the id so "/", "?" or "#" in a value cannot change which
    // endpoint the request is sent to.
    const url = `${baseUrl.replace(/\/$/, "")}/api/evaluations/${encodeURIComponent(evaluationId)}/publish`;
    try {
        const res = await fetch(url, {
            method: "POST",
            headers,
            body: JSON.stringify(payload),
        });
        const text = await res.text();
        if (!res.ok)
            return { ok: false, status: res.status, body: text };
        const data = JSON.parse(text);
        return { ok: true, data };
    }
    catch (err) {
        const msg = err instanceof Error ? err.message : String(err);
        return { ok: false, status: 0, body: msg };
    }
}
96
/**
 * Import locally produced results as a new run by POSTing to
 * `/api/evaluations/<id>/runs/import`.
 *
 * The JSON request body always has `environment: "dev"`, `results` and
 * `importClientVersion` (default `"evalai-cli"`), plus `ci` and `checkReport`
 * when provided. A truthy `options.idempotencyKey` is sent as the
 * `Idempotency-Key` header.
 *
 * Never rejects: a failed fetch or an unparsable body is reported as
 * `{ ok: false, status: 0, body: <error message> }`.
 *
 * @param {string} baseUrl API base URL; one trailing slash is stripped.
 * @param {string} apiKey Sent as `Authorization: Bearer <apiKey>`.
 * @param {string} evaluationId Evaluation id, URL-encoded into the path.
 * @param {Array<object>} results Result entries forwarded unchanged.
 * @param {object} options `idempotencyKey`, `ci`, `importClientVersion`, `checkReport` (all optional).
 * @returns `{ ok: true, runId }` when the response is OK and its JSON body has
 *   a numeric `runId`; otherwise `{ ok: false, status, body }`.
 */
async function importRunOnFail(baseUrl, apiKey, evaluationId, results, options) {
    const headers = {
        ...API_HEADERS,
        Authorization: `Bearer ${apiKey}`,
        "Content-Type": "application/json",
        ...(options.idempotencyKey ? { "Idempotency-Key": options.idempotencyKey } : {}),
    };
    const payload = {
        environment: "dev",
        results,
        importClientVersion: options.importClientVersion ?? "evalai-cli",
        ci: options.ci, // dropped by JSON.stringify when undefined
        ...(options.checkReport != null && { checkReport: options.checkReport }),
    };
    // Encode the id so "/", "?" or "#" in a value cannot change which
    // endpoint the request is sent to.
    const url = `${baseUrl.replace(/\/$/, "")}/api/evaluations/${encodeURIComponent(evaluationId)}/runs/import`;
    try {
        const res = await fetch(url, {
            method: "POST",
            headers,
            body: JSON.stringify(payload),
        });
        const text = await res.text();
        if (!res.ok) {
            return { ok: false, status: res.status, body: text };
        }
        const data = JSON.parse(text);
        // An OK response without a numeric runId is not a usable success:
        // report it as a failure instead of returning `runId: undefined`.
        if (data == null || typeof data.runId !== "number") {
            return { ok: false, status: res.status, body: text };
        }
        return { ok: true, runId: data.runId };
    }
    catch (err) {
        const msg = err instanceof Error ? err.message : String(err);
        return { ok: false, status: 0, body: msg };
    }
}
@@ -14,10 +14,14 @@
14
14
  * --minN <n> Fail if total test cases < n (low sample size)
15
15
  * --allowWeakEvidence If false (default), fail when evidenceLevel is 'weak'
16
16
  * --policy <name> Enforce a compliance policy (e.g. HIPAA, SOC2, GDPR)
17
- * --baseline <mode> Baseline comparison mode: "published" (default), "previous", or "production"
17
+ * --baseline <mode> Baseline comparison mode: "published" (default), "previous", or "production"
18
18
  * --evaluationId <id> Required. The evaluation to gate on.
19
19
  * --baseUrl <url> API base URL (default: EVALAI_BASE_URL or http://localhost:3000)
20
20
  * --apiKey <key> API key (default: EVALAI_API_KEY env var)
21
+ * --share <mode> Share link: "always" | "fail" | "never" (default: never)
22
+ * fail = create public share link only when gate fails (CI-friendly)
23
+ * --pr-comment-out <file> Write PR comment markdown to file (for GitHub Action to post)
24
+ * --profile <name> Preset: strict (95/0/30), balanced (90/2/10), fast (85/5/5). Explicit flags override.
21
25
  *
22
26
  * Exit codes:
23
27
  * 0 — Gate passed
@@ -28,31 +32,42 @@
28
32
  * 5 — Invalid arguments
29
33
  * 6 — Gate failed: total test cases < minN
30
34
  * 7 — Gate failed: weak evidence (evidenceLevel === 'weak')
35
+ * 8 — Gate warned: near-regression (warnDrop ≤ drop < maxDrop)
31
36
  *
32
37
  * Environment:
33
38
  * EVALAI_BASE_URL — API base URL (default: http://localhost:3000)
34
39
  * EVALAI_API_KEY — API key for authentication
35
40
  */
36
- export declare const EXIT: {
37
- readonly PASS: 0;
38
- readonly SCORE_BELOW: 1;
39
- readonly REGRESSION: 2;
40
- readonly POLICY_VIOLATION: 3;
41
- readonly API_ERROR: 4;
42
- readonly BAD_ARGS: 5;
43
- readonly LOW_N: 6;
44
- readonly WEAK_EVIDENCE: 7;
45
- };
41
+ export { EXIT } from "./constants";
42
+ export type FormatType = "human" | "json" | "github";
43
+ export type ShareMode = "always" | "fail" | "never";
46
44
  export interface CheckArgs {
47
45
  baseUrl: string;
48
46
  apiKey: string;
49
47
  minScore: number;
50
48
  maxDrop?: number;
49
+ warnDrop?: number;
51
50
  minN?: number;
52
51
  allowWeakEvidence: boolean;
53
52
  evaluationId: string;
54
53
  policy?: string;
55
- baseline: 'published' | 'previous' | 'production';
54
+ baseline: "published" | "previous" | "production" | "auto";
55
+ format: FormatType;
56
+ explain: boolean;
57
+ onFail?: "import";
58
+ share: ShareMode;
59
+ prCommentOut?: string;
60
+ maxCostUsd?: number;
61
+ maxLatencyMs?: number;
62
+ maxCostDeltaUsd?: number;
56
63
  }
57
- export declare function parseArgs(argv: string[]): CheckArgs;
64
+ export type ParseArgsResult = { // Return type of parseArgs: a union discriminated by ok.
65
+ ok: true;
66
+ args: CheckArgs;
67
+ } | {
68
+ ok: false;
69
+ exitCode: number; // presumably one of the EXIT codes — verify against parseArgs.
70
+ message: string;
71
+ };
72
+ export declare function parseArgs(argv: string[]): ParseArgsResult;
58
73
  export declare function runCheck(args: CheckArgs): Promise<number>;