@pauly4010/evalai-sdk 1.4.1 → 1.5.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95) hide show
  1. package/CHANGELOG.md +85 -0
  2. package/README.md +205 -543
  3. package/dist/assertions.d.ts +2 -2
  4. package/dist/assertions.js +104 -71
  5. package/dist/batch.js +12 -17
  6. package/dist/cache.js +7 -11
  7. package/dist/cli/api.d.ts +108 -0
  8. package/dist/cli/api.js +130 -0
  9. package/dist/cli/check.d.ts +28 -13
  10. package/dist/cli/check.js +249 -142
  11. package/dist/cli/ci-context.d.ts +6 -0
  12. package/dist/cli/ci-context.js +110 -0
  13. package/dist/cli/config.d.ts +30 -0
  14. package/dist/cli/config.js +207 -0
  15. package/dist/cli/constants.d.ts +15 -0
  16. package/dist/cli/constants.js +18 -0
  17. package/dist/cli/doctor.d.ts +11 -0
  18. package/dist/cli/doctor.js +82 -0
  19. package/dist/cli/formatters/github.d.ts +8 -0
  20. package/dist/cli/formatters/github.js +130 -0
  21. package/dist/cli/formatters/human.d.ts +6 -0
  22. package/dist/cli/formatters/human.js +107 -0
  23. package/dist/cli/formatters/json.d.ts +6 -0
  24. package/dist/cli/formatters/json.js +10 -0
  25. package/dist/cli/formatters/pr-comment.d.ts +12 -0
  26. package/dist/cli/formatters/pr-comment.js +101 -0
  27. package/dist/cli/formatters/types.d.ts +100 -0
  28. package/dist/cli/formatters/types.js +5 -0
  29. package/dist/cli/gate.d.ts +21 -0
  30. package/dist/cli/gate.js +175 -0
  31. package/dist/cli/index.d.ts +1 -0
  32. package/dist/cli/index.js +67 -23
  33. package/dist/cli/init.d.ts +7 -0
  34. package/dist/cli/init.js +69 -0
  35. package/dist/cli/policy-packs.d.ts +23 -0
  36. package/dist/cli/policy-packs.js +83 -0
  37. package/dist/cli/profiles.d.ts +28 -0
  38. package/dist/cli/profiles.js +30 -0
  39. package/dist/cli/reason-codes.d.ts +17 -0
  40. package/dist/cli/reason-codes.js +19 -0
  41. package/dist/cli/render/snippet.d.ts +5 -0
  42. package/dist/cli/render/snippet.js +15 -0
  43. package/dist/cli/render/sort.d.ts +10 -0
  44. package/dist/cli/render/sort.js +24 -0
  45. package/dist/cli/report/build-check-report.d.ts +19 -0
  46. package/dist/cli/report/build-check-report.js +124 -0
  47. package/dist/cli/share.d.ts +17 -0
  48. package/dist/cli/share.js +83 -0
  49. package/dist/client.d.ts +2 -2
  50. package/dist/client.js +144 -132
  51. package/dist/context.d.ts +1 -1
  52. package/dist/context.js +4 -6
  53. package/dist/errors.d.ts +2 -0
  54. package/dist/errors.js +116 -107
  55. package/dist/export.d.ts +6 -6
  56. package/dist/export.js +39 -33
  57. package/dist/index.d.ts +25 -24
  58. package/dist/index.js +62 -56
  59. package/dist/integrations/anthropic.d.ts +1 -1
  60. package/dist/integrations/anthropic.js +23 -19
  61. package/dist/integrations/openai-eval.d.ts +57 -0
  62. package/dist/integrations/openai-eval.js +230 -0
  63. package/dist/integrations/openai.d.ts +1 -1
  64. package/dist/integrations/openai.js +23 -19
  65. package/dist/local.d.ts +2 -2
  66. package/dist/local.js +25 -25
  67. package/dist/logger.d.ts +1 -1
  68. package/dist/logger.js +24 -28
  69. package/dist/matchers/index.d.ts +1 -0
  70. package/dist/matchers/index.js +6 -0
  71. package/dist/matchers/to-pass-gate.d.ts +29 -0
  72. package/dist/matchers/to-pass-gate.js +35 -0
  73. package/dist/pagination.d.ts +1 -1
  74. package/dist/pagination.js +6 -6
  75. package/dist/snapshot.js +24 -24
  76. package/dist/streaming.js +11 -11
  77. package/dist/testing.d.ts +6 -2
  78. package/dist/testing.js +30 -12
  79. package/dist/types.d.ts +22 -22
  80. package/dist/types.js +13 -13
  81. package/dist/utils/input-hash.d.ts +8 -0
  82. package/dist/utils/input-hash.js +38 -0
  83. package/dist/version.d.ts +7 -0
  84. package/dist/version.js +10 -0
  85. package/dist/workflows.d.ts +7 -7
  86. package/dist/workflows.js +44 -44
  87. package/package.json +102 -90
  88. package/dist/__tests__/assertions.test.d.ts +0 -1
  89. package/dist/__tests__/assertions.test.js +0 -288
  90. package/dist/__tests__/client.test.d.ts +0 -1
  91. package/dist/__tests__/client.test.js +0 -185
  92. package/dist/__tests__/testing.test.d.ts +0 -1
  93. package/dist/__tests__/testing.test.js +0 -230
  94. package/dist/__tests__/workflows.test.d.ts +0 -1
  95. package/dist/__tests__/workflows.test.js +0 -222
@@ -0,0 +1,83 @@
1
+ "use strict";
2
+ /**
3
+ * Versioned policy packs for evalai check --policy.
4
+ * Schema: policyId, version, thresholds, rationale, checks.
5
+ * Usage: --policy HIPAA@1
6
+ */
7
+ Object.defineProperty(exports, "__esModule", { value: true });
8
+ exports.POLICY_PACKS = void 0;
9
+ exports.resolvePolicyPack = resolvePolicyPack;
10
+ exports.getValidPolicyVersions = getValidPolicyVersions;
11
+ exports.POLICY_PACKS = {
12
+ HIPAA: {
13
+ 1: {
14
+ policyId: "HIPAA",
15
+ version: 1,
16
+ thresholds: { requiredSafetyRate: 0.99, maxFlags: ["SAFETY_RISK"] },
17
+ rationale: "HIPAA requires high safety and no safety risks for PHI handling.",
18
+ checks: ["safety_rate", "no_safety_flags"],
19
+ },
20
+ },
21
+ SOC2: {
22
+ 1: {
23
+ policyId: "SOC2",
24
+ version: 1,
25
+ thresholds: { requiredSafetyRate: 0.95, maxFlags: ["SAFETY_RISK", "LOW_PASS_RATE"] },
26
+ rationale: "SOC2 trust criteria for security and availability.",
27
+ checks: ["safety_rate", "flag_restrictions"],
28
+ },
29
+ },
30
+ GDPR: {
31
+ 1: {
32
+ policyId: "GDPR",
33
+ version: 1,
34
+ thresholds: { requiredSafetyRate: 0.95, maxFlags: ["SAFETY_RISK"] },
35
+ rationale: "GDPR data protection and privacy requirements.",
36
+ checks: ["safety_rate", "no_safety_flags"],
37
+ },
38
+ },
39
+ PCI_DSS: {
40
+ 1: {
41
+ policyId: "PCI_DSS",
42
+ version: 1,
43
+ thresholds: { requiredSafetyRate: 0.99, maxFlags: ["SAFETY_RISK", "LOW_PASS_RATE"] },
44
+ rationale: "PCI DSS cardholder data security standards.",
45
+ checks: ["safety_rate", "flag_restrictions"],
46
+ },
47
+ },
48
+ FINRA_4511: {
49
+ 1: {
50
+ policyId: "FINRA_4511",
51
+ version: 1,
52
+ thresholds: { requiredSafetyRate: 0.95, maxFlags: ["SAFETY_RISK"] },
53
+ rationale: "FINRA 4511 supervisory control requirements.",
54
+ checks: ["safety_rate", "no_safety_flags"],
55
+ },
56
+ },
57
+ };
58
/**
 * Resolve a `--policy` spec (e.g. "HIPAA@1" or "HIPAA") to its policy pack.
 *
 * The policy id is upper-cased before the lookup, so matching is
 * case-insensitive. When no "@version" suffix is present the version
 * defaults to 1. The version must consist of decimal digits only: specs such
 * as "HIPAA@1abc", "HIPAA@1.5" or "HIPAA@1@2" are rejected instead of being
 * silently resolved to version 1.
 *
 * @param {string} spec - Policy spec in the form "ID" or "ID@VERSION".
 * @returns {object | null} The matching POLICY_PACKS entry, or null when the
 *   spec is not a string, is malformed, or names an unknown policy/version.
 */
function resolvePolicyPack(spec) {
    if (typeof spec !== "string") {
        return null;
    }
    const at = spec.indexOf("@");
    const hasVersion = at >= 0;
    const policyId = (hasVersion ? spec.slice(0, at) : spec).trim().toUpperCase();
    let version = 1;
    if (hasVersion) {
        const rawVersion = spec.slice(at + 1).trim();
        // parseInt alone would accept trailing garbage ("1abc" -> 1), so the
        // whole suffix is validated before it is converted.
        if (!/^\d+$/.test(rawVersion)) {
            return null;
        }
        version = Number.parseInt(rawVersion, 10);
    }
    if (!Number.isSafeInteger(version) || version < 1) {
        return null;
    }
    const versions = exports.POLICY_PACKS[policyId];
    if (!versions) {
        return null;
    }
    return versions[version] ?? null;
}
74
/**
 * Enumerate every known "POLICY@version" spec, sorted with the default
 * string ordering. Intended for building error messages.
 *
 * @returns {string[]} Sorted list of specs derived from POLICY_PACKS.
 */
function getValidPolicyVersions() {
    const specs = Object.entries(exports.POLICY_PACKS).flatMap(([id, byVersion]) => Object.keys(byVersion).map((ver) => `${id}@${ver}`));
    specs.sort();
    return specs;
}
@@ -0,0 +1,28 @@
1
+ /**
2
+ * Gate profile presets: strict, balanced, fast.
3
+ * Profiles override flags unless explicitly set.
4
+ */
5
+ export declare const PROFILES: {
6
+ readonly strict: {
7
+ readonly minScore: 95;
8
+ readonly maxDrop: 0;
9
+ readonly warnDrop: 0;
10
+ readonly minN: 30;
11
+ readonly allowWeakEvidence: false;
12
+ };
13
+ readonly balanced: {
14
+ readonly minScore: 90;
15
+ readonly maxDrop: 2;
16
+ readonly warnDrop: 1;
17
+ readonly minN: 10;
18
+ readonly allowWeakEvidence: false;
19
+ };
20
+ readonly fast: {
21
+ readonly minScore: 85;
22
+ readonly maxDrop: 5;
23
+ readonly warnDrop: 2;
24
+ readonly minN: 5;
25
+ readonly allowWeakEvidence: true;
26
+ };
27
+ };
28
+ export type ProfileName = keyof typeof PROFILES;
@@ -0,0 +1,30 @@
1
+ "use strict";
2
+ /**
3
+ * Gate profile presets: strict, balanced, fast.
4
+ * Profiles override flags unless explicitly set.
5
+ */
6
+ Object.defineProperty(exports, "__esModule", { value: true });
7
+ exports.PROFILES = void 0;
8
+ exports.PROFILES = {
9
+ strict: {
10
+ minScore: 95,
11
+ maxDrop: 0,
12
+ warnDrop: 0,
13
+ minN: 30,
14
+ allowWeakEvidence: false,
15
+ },
16
+ balanced: {
17
+ minScore: 90,
18
+ maxDrop: 2,
19
+ warnDrop: 1,
20
+ minN: 10,
21
+ allowWeakEvidence: false,
22
+ },
23
+ fast: {
24
+ minScore: 85,
25
+ maxDrop: 5,
26
+ warnDrop: 2,
27
+ minN: 5,
28
+ allowWeakEvidence: true,
29
+ },
30
+ };
@@ -0,0 +1,17 @@
1
+ /**
2
+ * Canonical reason codes for gate evaluation.
3
+ * Used by gate.ts and formatters for consistent failure classification.
4
+ */
5
+ export declare const REASON_CODES: {
6
+ readonly PASS: "PASS";
7
+ readonly WARN_REGRESSION: "WARN_REGRESSION";
8
+ readonly LOW_SAMPLE_SIZE: "LOW_SAMPLE_SIZE";
9
+ readonly BASELINE_MISSING: "BASELINE_MISSING";
10
+ readonly SCORE_TOO_LOW: "SCORE_TOO_LOW";
11
+ readonly DELTA_TOO_HIGH: "DELTA_TOO_HIGH";
12
+ readonly COST_BUDGET_EXCEEDED: "COST_BUDGET_EXCEEDED";
13
+ readonly LATENCY_BUDGET_EXCEEDED: "LATENCY_BUDGET_EXCEEDED";
14
+ readonly POLICY_FAILED: "POLICY_FAILED";
15
+ readonly UNKNOWN: "UNKNOWN";
16
+ };
17
+ export type ReasonCode = (typeof REASON_CODES)[keyof typeof REASON_CODES];
@@ -0,0 +1,19 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.REASON_CODES = void 0;
4
+ /**
5
+ * Canonical reason codes for gate evaluation.
6
+ * Used by gate.ts and formatters for consistent failure classification.
7
+ */
8
+ exports.REASON_CODES = {
9
+ PASS: "PASS",
10
+ WARN_REGRESSION: "WARN_REGRESSION",
11
+ LOW_SAMPLE_SIZE: "LOW_SAMPLE_SIZE",
12
+ BASELINE_MISSING: "BASELINE_MISSING",
13
+ SCORE_TOO_LOW: "SCORE_TOO_LOW",
14
+ DELTA_TOO_HIGH: "DELTA_TOO_HIGH",
15
+ COST_BUDGET_EXCEEDED: "COST_BUDGET_EXCEEDED",
16
+ LATENCY_BUDGET_EXCEEDED: "LATENCY_BUDGET_EXCEEDED",
17
+ POLICY_FAILED: "POLICY_FAILED",
18
+ UNKNOWN: "UNKNOWN",
19
+ };
@@ -0,0 +1,5 @@
1
+ /**
2
+ * Truncate a string for deterministic output.
3
+ * Replaces newlines with space, caps length.
4
+ */
5
+ export declare function truncateSnippet(s: string | undefined | null, maxLen?: number): string;
@@ -0,0 +1,15 @@
1
+ "use strict";
2
+ /**
3
+ * Truncate a string for deterministic output.
4
+ * Replaces newlines with space, caps length.
5
+ */
6
+ Object.defineProperty(exports, "__esModule", { value: true });
7
+ exports.truncateSnippet = truncateSnippet;
8
/**
 * Collapse a value into a single-line snippet of bounded length.
 *
 * Every run of whitespace (including newlines) becomes one space and the
 * result is trimmed. When the normalized text is longer than `maxLen` UTF-16
 * code units it is cut to `maxLen` units and an ellipsis ("…") is appended.
 *
 * @param {unknown} s - Text to shorten. null/undefined yield "". Non-string
 *   values are serialized (JSON when possible, String() otherwise) instead of
 *   throwing.
 * @param {number} [maxLen=140] - Maximum number of code units kept before the
 *   ellipsis; negative values are treated as 0.
 * @returns {string} The normalized, possibly truncated snippet.
 */
function truncateSnippet(s, maxLen = 140) {
    if (s == null) {
        return "";
    }
    let text;
    if (typeof s === "string") {
        text = s;
    }
    else {
        try {
            // JSON.stringify returns undefined for functions/symbols; fall back to String().
            text = JSON.stringify(s) ?? String(s);
        }
        catch {
            // Cyclic structures and BigInt make JSON.stringify throw.
            text = String(s);
        }
    }
    const normalized = text.replace(/\s+/g, " ").trim();
    if (normalized.length <= maxLen) {
        return normalized;
    }
    // A negative end index would make slice() count from the end of the string.
    let head = normalized.slice(0, Math.max(0, maxLen));
    // Do not leave half of a surrogate pair behind: a trailing high surrogate
    // means its low surrogate was cut off.
    const lastUnit = head.charCodeAt(head.length - 1);
    if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
        head = head.slice(0, -1);
    }
    return `${head}…`;
}
@@ -0,0 +1,10 @@
1
+ /**
2
+ * Deterministic ordering for failed cases.
3
+ * Sort by status severity (failed > error > skipped > passed), then by testCaseId asc.
4
+ */
5
+ export interface SortableCase {
6
+ status?: string;
7
+ testCaseId?: number;
8
+ [key: string]: unknown;
9
+ }
10
+ export declare function sortFailedCases<T extends SortableCase>(cases: T[]): T[];
@@ -0,0 +1,24 @@
1
+ "use strict";
2
+ /**
3
+ * Deterministic ordering for failed cases.
4
+ * Sort by status severity (failed > error > skipped > passed), then by testCaseId asc.
5
+ */
6
+ Object.defineProperty(exports, "__esModule", { value: true });
7
+ exports.sortFailedCases = sortFailedCases;
8
/**
 * Severity rank per (lower-cased) status; lower ranks sort first.
 * A Map is used instead of a plain object so that statuses which happen to
 * match inherited object keys (e.g. "constructor") are not mistaken for
 * known statuses.
 */
const STATUS_SEVERITY = new Map([
    ["failed", 0],
    ["error", 1],
    ["skipped", 2],
    ["passed", 3],
]);
/** Rank given to a missing, non-string, or unrecognized status (sorts last). */
const UNKNOWN_SEVERITY = 4;
/** Map a status value to its severity rank, case-insensitively. */
function severityOf(status) {
    if (typeof status !== "string") {
        return UNKNOWN_SEVERITY;
    }
    return STATUS_SEVERITY.get(status.toLowerCase()) ?? UNKNOWN_SEVERITY;
}
/**
 * Return a sorted copy of `cases` without mutating the input.
 *
 * Order: status severity (failed, error, skipped, passed, then anything
 * else), then `testCaseId` ascending; a missing `testCaseId` counts as 0.
 *
 * @param {Array<{status?: string, testCaseId?: number}>} cases - Cases to order.
 * @returns {Array} New array containing the same elements in sorted order.
 */
function sortFailedCases(cases) {
    return [...cases].sort((a, b) => {
        const bySeverity = severityOf(a.status) - severityOf(b.status);
        if (bySeverity !== 0) {
            return bySeverity;
        }
        return (a.testCaseId ?? 0) - (b.testCaseId ?? 0);
    });
}
@@ -0,0 +1,19 @@
1
+ /**
2
+ * Build CheckReport from API data and gate result.
3
+ * Normalizes failed cases (truncate, sort), dashboard URL, top N + more.
4
+ */
5
+ import type { QualityLatestData, RunDetailsData } from "../api";
6
+ import type { CheckArgs } from "../check";
7
+ import type { CheckReport } from "../formatters/types";
8
+ import type { GateResult } from "../gate";
9
+ export type BuildReportInput = {
10
+ args: CheckArgs;
11
+ quality: QualityLatestData;
12
+ runDetails?: RunDetailsData | null;
13
+ gateResult: GateResult;
14
+ requestId?: string;
15
+ shareUrl?: string;
16
+ baselineRunId?: number | null;
17
+ ciRunUrl?: string | null;
18
+ };
19
+ export declare function buildCheckReport(input: BuildReportInput): CheckReport;
@@ -0,0 +1,124 @@
1
+ "use strict";
2
+ /**
3
+ * Build CheckReport from API data and gate result.
4
+ * Normalizes failed cases (truncate, sort), dashboard URL, top N + more.
5
+ */
6
+ Object.defineProperty(exports, "__esModule", { value: true });
7
+ exports.buildCheckReport = buildCheckReport;
8
+ const snippet_1 = require("../render/snippet");
9
+ const sort_1 = require("../render/sort");
10
+ const TOP_N = 3;
11
/**
 * Turn a score breakdown into weighted contribution points, each rounded to
 * one decimal place.
 *
 * Weights: passRate * 50, safety * 25, (0.6 * judge + 0.4 * schema) * 15,
 * (0.6 * latency + 0.4 * cost) * 10. Missing (null/undefined) components
 * count as 0. NOTE(review): components are presumably in the 0..1 range —
 * confirm against the API that produces the breakdown.
 *
 * @param {object} b - Breakdown with optional numeric fields passRate,
 *   safety, judge, schema, latency, cost.
 * @returns {{passRatePts: number, safetyPts: number, compliancePts: number, performancePts: number}}
 */
function computeContribPts(b) {
    const toTenths = (points) => Math.round(points * 10) / 10;
    const passRate = b.passRate ?? 0;
    const safety = b.safety ?? 0;
    const judge = b.judge ?? 0;
    const schema = b.schema ?? 0;
    const latency = b.latency ?? 0;
    const cost = b.cost ?? 0;
    const passRatePts = toTenths(passRate * 50);
    const safetyPts = toTenths(safety * 25);
    const compliancePts = toTenths((0.6 * judge + 0.4 * schema) * 15);
    const performancePts = toTenths((0.6 * latency + 0.4 * cost) * 10);
    return { passRatePts, safetyPts, compliancePts, performancePts };
}
26
+ const SNIPPET_MAX = 50;
27
/**
 * Assemble the CheckReport object from the CLI args, the quality payload,
 * optional run details, and the gate result.
 *
 * - Failed cases are taken from `runDetails.results` (status exactly
 *   "failed"), sorted with sortFailedCases, and given truncated
 *   input/expected/output snippets. They are only collected when the quality
 *   payload carries an evaluationRunId.
 * - `dashboardUrl` is built from `args.baseUrl` (one trailing slash removed)
 *   when an evaluationRunId is present.
 * - The verdict is "warn" for reason code WARN_REGRESSION, otherwise
 *   "pass"/"fail" according to `gateResult.passed`.
 *
 * @param {object} input - See BuildReportInput: args, quality, runDetails,
 *   gateResult, requestId, shareUrl, baselineRunId, ciRunUrl.
 * @returns {object} The report; property order is fixed by the literal below.
 */
function buildCheckReport(input) {
    const { args, quality, runDetails, gateResult, requestId } = input;
    const { truncateSnippet } = snippet_1;
    const { sortFailedCases } = sort_1;
    const runId = quality?.evaluationRunId;
    const hasRun = runId != null;
    // Only a single trailing slash is removed from the configured base URL.
    const trimmedBaseUrl = args.baseUrl.replace(/\/$/, "");
    let dashboardUrl;
    if (hasRun) {
        dashboardUrl = `${trimmedBaseUrl}/evaluations/${args.evaluationId}/runs/${runId}`;
    }
    // Build failed cases from run details
    let failedCases = [];
    if (runDetails?.results && hasRun) {
        const toFailedCase = (r) => ({
            testCaseId: r.testCaseId,
            status: "failed",
            name: r.test_cases?.name,
            input: r.test_cases?.input,
            expectedOutput: r.test_cases?.expectedOutput,
            output: r.output,
        });
        const withSnippets = (fc) => ({
            ...fc,
            inputSnippet: truncateSnippet(fc.input, SNIPPET_MAX),
            expectedSnippet: truncateSnippet(fc.expectedOutput, SNIPPET_MAX),
            outputSnippet: truncateSnippet(fc.output, SNIPPET_MAX),
        });
        const onlyFailed = runDetails.results
            .filter((r) => r.status === "failed")
            .map((r) => toFailedCase(r));
        failedCases = sortFailedCases(onlyFailed).map((fc) => withSnippets(fc));
    }
    const shownCount = Math.min(failedCases.length, TOP_N);
    const hiddenCount = failedCases.length - shownCount;
    // An empty breakdown object is reported as "no breakdown".
    const rawBreakdown = quality?.breakdown ?? {};
    const breakdown01 = Object.keys(rawBreakdown).length > 0 ? rawBreakdown : undefined;
    let contribPts;
    if (args.explain && breakdown01) {
        contribPts = computeContribPts(breakdown01);
    }
    const gateSkipped = gateResult.gateSkipped === true;
    let actionableMessage;
    if (gateSkipped) {
        actionableMessage =
            "Gate not applied: baseline missing. Publish a baseline from the dashboard, or run with --baseline previous once you have runs.";
    }
    else {
        actionableMessage = gateResult.reasonMessage ?? undefined;
    }
    let verdict;
    if (gateResult.reasonCode === "WARN_REGRESSION") {
        verdict = "warn";
    }
    else if (gateResult.passed) {
        verdict = "pass";
    }
    else {
        verdict = "fail";
    }
    const baselineMissing = quality?.baselineMissing === true;
    let baselineStatus;
    if (baselineMissing) {
        baselineStatus = "missing";
    }
    else if (quality?.baselineScore != null) {
        baselineStatus = "found";
    }
    const flagList = quality?.flags ?? [];
    let policyEvidence;
    if (args.explain && gateResult.policyEvidence) {
        policyEvidence = {
            failedCheck: gateResult.policyEvidence.failedCheck,
            remediation: gateResult.policyEvidence.remediation,
            snapshot: gateResult.policyEvidence.snapshot,
        };
    }
    return {
        evaluationId: args.evaluationId,
        runId,
        verdict,
        gateApplied: !gateSkipped,
        gateMode: gateSkipped ? "neutral" : "enforced",
        actionableMessage,
        shareUrl: input.shareUrl,
        policy: args.policy,
        baselineRunId: input.baselineRunId ?? quality?.baselineRunId ?? undefined,
        ciRunUrl: input.ciRunUrl ?? undefined,
        reasonCode: gateResult.reasonCode,
        reasonMessage: gateResult.reasonMessage ?? undefined,
        score: quality?.score ?? 0,
        baselineScore: quality?.baselineScore ?? undefined,
        delta: quality?.regressionDelta ?? undefined,
        n: quality?.total ?? undefined,
        evidenceLevel: quality?.evidenceLevel ?? undefined,
        baselineMissing,
        baselineStatus,
        // Flags are sorted on a copy so the API payload is left untouched.
        flags: flagList.length > 0 ? [...flagList].sort() : undefined,
        breakdown01,
        contribPts,
        thresholds: {
            minScore: args.minScore,
            maxDrop: args.maxDrop,
            warnDrop: args.warnDrop,
            minN: args.minN,
            allowWeakEvidence: args.allowWeakEvidence,
            baseline: args.baseline,
            maxCostUsd: args.maxCostUsd,
            maxLatencyMs: args.maxLatencyMs,
            maxCostDeltaUsd: args.maxCostDeltaUsd,
        },
        dashboardUrl,
        failedCases,
        failedCasesShown: failedCases.length > 0 ? shownCount : undefined,
        failedCasesMore: hiddenCount > 0 ? hiddenCount : undefined,
        requestId,
        explain: args.explain,
        policyEvidence,
    };
}
@@ -0,0 +1,17 @@
1
+ /**
2
+ * evalai share — Create a share link for a run.
3
+ * Usage: evalai share --scope run --expires 7d
4
+ */
5
+ export type ShareArgs = {
6
+ baseUrl: string;
7
+ apiKey: string;
8
+ evaluationId: string;
9
+ runId: number;
10
+ scope: "run";
11
+ expires: string;
12
+ expiresInDays: number;
13
+ };
14
+ export declare function parseShareArgs(argv: string[]): ShareArgs | {
15
+ error: string;
16
+ };
17
+ export declare function runShare(args: ShareArgs): Promise<number>;
@@ -0,0 +1,83 @@
1
+ "use strict";
2
+ /**
3
+ * evalai share — Create a share link for a run.
4
+ * Usage: evalai share --scope run --expires 7d
5
+ */
6
+ Object.defineProperty(exports, "__esModule", { value: true });
7
+ exports.parseShareArgs = parseShareArgs;
8
+ exports.runShare = runShare;
9
+ const api_1 = require("./api");
10
/** How many of each supported duration unit make up one day. */
const EXPIRES_UNITS_PER_DAY = {
    d: 1,
    h: 24,
    m: 24 * 60,
    s: 24 * 60 * 60,
};
/**
 * Convert a duration spec such as "7d", "24h", "60m" or "30s" to days.
 *
 * @param {string} spec - Non-negative integer followed by a unit (d, h, m, s;
 *   case-insensitive).
 * @returns {number | null} Duration in (possibly fractional) days, or null
 *   when the spec does not match the expected format.
 */
function parseExpires(spec) {
    const match = /^(\d+)(d|h|m|s)$/i.exec(spec);
    if (!match) {
        return null;
    }
    const amount = Number.parseInt(match[1], 10);
    const unit = match[2].toLowerCase();
    return amount / EXPIRES_UNITS_PER_DAY[unit];
}
/**
 * Parse `evalai share` command-line arguments.
 *
 * Flags use the `--name value` form; a flag that is not followed by a value
 * is recorded as the string "true". `baseUrl` and `apiKey` fall back to the
 * EVALAI_BASE_URL / EVALAI_API_KEY environment variables.
 *
 * @param {string[]} argv - Arguments after the `share` sub-command.
 * @returns {object} Either the parsed ShareArgs (baseUrl, apiKey,
 *   evaluationId, runId, scope, expires, expiresInDays) or `{ error }` with a
 *   user-facing message when a required/valid value is missing.
 */
function parseShareArgs(argv) {
    // No prototype, so a flag name can never collide with inherited keys.
    const args = Object.create(null);
    for (let i = 0; i < argv.length; i++) {
        const arg = argv[i];
        if (!arg.startsWith("--")) {
            continue;
        }
        const key = arg.slice(2);
        const next = argv[i + 1];
        if (next !== undefined && !next.startsWith("--")) {
            args[key] = next;
            i++;
        }
        else {
            args[key] = "true";
        }
    }
    // `||` is deliberate here: an empty string is not a usable value either.
    const baseUrl = args.baseUrl || process.env.EVALAI_BASE_URL || "http://localhost:3000";
    const apiKey = args.apiKey || process.env.EVALAI_API_KEY || "";
    const evaluationId = args.evaluationId || "";
    // Require plain digits: parseInt alone would accept "12abc" or "5.9".
    const runId = /^\d+$/.test(args.runId ?? "") ? Number.parseInt(args.runId, 10) : NaN;
    const expires = args.expires || "7d";
    if (!apiKey) {
        return { error: "Error: --apiKey or EVALAI_API_KEY is required" };
    }
    if (!evaluationId) {
        return { error: "Error: --evaluationId is required" };
    }
    if (Number.isNaN(runId) || runId < 1) {
        return { error: "Error: --runId is required and must be a positive number" };
    }
    // "run" is the only supported scope; reject anything else instead of
    // silently creating a run-scoped link the user did not ask for.
    if (args.scope !== undefined && args.scope !== "run") {
        return { error: 'Error: --scope must be "run"' };
    }
    const scope = "run";
    const expiresInDays = parseExpires(expires);
    if (expiresInDays == null || expiresInDays <= 0) {
        return { error: "Error: --expires must be e.g. 7d, 24h, 60m, 1s" };
    }
    return {
        baseUrl,
        apiKey,
        evaluationId,
        runId,
        scope,
        expires,
        expiresInDays,
    };
}
67
/**
 * Create a share link for a run: fetch the run export, publish it as a
 * share, then print the resulting URL.
 *
 * Failures of either API call are reported on stderr with the response
 * status and body.
 *
 * @param {object} args - Parsed ShareArgs (baseUrl, apiKey, evaluationId,
 *   runId, expires, expiresInDays).
 * @returns {Promise<number>} Exit code: 0 on success, 1 when fetching the
 *   export or publishing the share fails.
 */
async function runShare(args) {
    const { baseUrl, apiKey, evaluationId, runId } = args;
    const exported = await (0, api_1.fetchRunExport)(baseUrl, apiKey, evaluationId, runId);
    if (!exported.ok) {
        console.error(`EvalAI share: failed to fetch export — ${exported.status} ${exported.body}`);
        return 1;
    }
    const publishOptions = { expiresInDays: args.expiresInDays };
    const published = await (0, api_1.publishShare)(baseUrl, apiKey, evaluationId, exported.exportData, runId, publishOptions);
    if (!published.ok) {
        console.error(`EvalAI share: failed to publish — ${published.status} ${published.body}`);
        return 1;
    }
    // Prefer the URL returned by the server; otherwise derive one from the share id.
    let shareUrl = published.data.shareUrl;
    if (shareUrl == null) {
        shareUrl = `${args.baseUrl.replace(/\/$/, "")}/share/${published.data.shareId}`;
    }
    console.log(`Share link created (expires in ${args.expires}):`);
    console.log(shareUrl);
    return 0;
}
package/dist/client.d.ts CHANGED
@@ -1,5 +1,5 @@
1
- import { ClientConfig, Trace, CreateTraceParams, ListTracesParams, Evaluation, CreateEvaluationParams, UpdateEvaluationParams, ListEvaluationsParams, LLMJudgeResult, RunLLMJudgeParams, TestCase, CreateTestCaseParams, EvaluationRun, CreateRunParams, Span, CreateSpanParams, UpdateTraceParams, OrganizationLimits, Annotation, CreateAnnotationParams, ListAnnotationsParams, AnnotationTask, CreateAnnotationTaskParams, ListAnnotationTasksParams, AnnotationItem, CreateAnnotationItemParams, ListAnnotationItemsParams, APIKey, APIKeyWithSecret, CreateAPIKeyParams, UpdateAPIKeyParams, ListAPIKeysParams, APIKeyUsage, Webhook, CreateWebhookParams, UpdateWebhookParams, ListWebhooksParams, WebhookDelivery, ListWebhookDeliveriesParams, UsageStats, GetUsageParams, UsageSummary, LLMJudgeConfig, CreateLLMJudgeConfigParams, ListLLMJudgeConfigsParams, ListLLMJudgeResultsParams, LLMJudgeAlignment, GetLLMJudgeAlignmentParams, Organization } from './types';
2
- import { Logger } from './logger';
1
+ import { type Logger } from "./logger";
2
+ import type { Annotation, AnnotationItem, AnnotationTask, APIKey, APIKeyUsage, APIKeyWithSecret, ClientConfig, CreateAnnotationItemParams, CreateAnnotationParams, CreateAnnotationTaskParams, CreateAPIKeyParams, CreateEvaluationParams, CreateLLMJudgeConfigParams, CreateRunParams, CreateSpanParams, CreateTestCaseParams, CreateTraceParams, CreateWebhookParams, Evaluation, EvaluationRun, GetLLMJudgeAlignmentParams, GetUsageParams, ListAnnotationItemsParams, ListAnnotationsParams, ListAnnotationTasksParams, ListAPIKeysParams, ListEvaluationsParams, ListLLMJudgeConfigsParams, ListLLMJudgeResultsParams, ListTracesParams, ListWebhookDeliveriesParams, ListWebhooksParams, LLMJudgeAlignment, LLMJudgeConfig, LLMJudgeResult, Organization, OrganizationLimits, RunLLMJudgeParams, Span, TestCase, Trace, UpdateAPIKeyParams, UpdateEvaluationParams, UpdateTraceParams, UpdateWebhookParams, UsageStats, UsageSummary, Webhook, WebhookDelivery } from "./types";
3
3
  /**
4
4
  * AI Evaluation Platform SDK Client
5
5
  *