@pauly4010/evalai-sdk 1.4.1 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. package/CHANGELOG.md +34 -0
  2. package/README.md +102 -8
  3. package/dist/cli/api.d.ts +79 -0
  4. package/dist/cli/api.js +74 -0
  5. package/dist/cli/check.d.ts +15 -12
  6. package/dist/cli/check.js +113 -134
  7. package/dist/cli/ci-context.d.ts +6 -0
  8. package/dist/cli/ci-context.js +51 -0
  9. package/dist/cli/config.d.ts +24 -0
  10. package/dist/cli/config.js +158 -0
  11. package/dist/cli/constants.d.ts +13 -0
  12. package/dist/cli/constants.js +16 -0
  13. package/dist/cli/doctor.d.ts +11 -0
  14. package/dist/cli/doctor.js +82 -0
  15. package/dist/cli/formatters/github.d.ts +8 -0
  16. package/dist/cli/formatters/github.js +119 -0
  17. package/dist/cli/formatters/human.d.ts +6 -0
  18. package/dist/cli/formatters/human.js +92 -0
  19. package/dist/cli/formatters/json.d.ts +6 -0
  20. package/dist/cli/formatters/json.js +10 -0
  21. package/dist/cli/formatters/types.d.ts +76 -0
  22. package/dist/cli/formatters/types.js +5 -0
  23. package/dist/cli/gate.d.ts +13 -0
  24. package/dist/cli/gate.js +108 -0
  25. package/dist/cli/index.d.ts +1 -0
  26. package/dist/cli/index.js +31 -5
  27. package/dist/cli/init.d.ts +7 -0
  28. package/dist/cli/init.js +69 -0
  29. package/dist/cli/render/snippet.d.ts +5 -0
  30. package/dist/cli/render/snippet.js +15 -0
  31. package/dist/cli/render/sort.d.ts +10 -0
  32. package/dist/cli/render/sort.js +24 -0
  33. package/dist/cli/report/build-check-report.d.ts +16 -0
  34. package/dist/cli/report/build-check-report.js +94 -0
  35. package/dist/index.d.ts +1 -0
  36. package/dist/index.js +4 -1
  37. package/dist/integrations/openai-eval.d.ts +53 -0
  38. package/dist/integrations/openai-eval.js +226 -0
  39. package/dist/utils/input-hash.d.ts +8 -0
  40. package/dist/utils/input-hash.js +38 -0
  41. package/package.json +5 -1
  42. package/dist/__tests__/assertions.test.d.ts +0 -1
  43. package/dist/__tests__/assertions.test.js +0 -288
  44. package/dist/__tests__/client.test.d.ts +0 -1
  45. package/dist/__tests__/client.test.js +0 -185
  46. package/dist/__tests__/testing.test.d.ts +0 -1
  47. package/dist/__tests__/testing.test.js +0 -230
  48. package/dist/__tests__/workflows.test.d.ts +0 -1
  49. package/dist/__tests__/workflows.test.js +0 -222
@@ -0,0 +1,16 @@
1
/**
 * Build CheckReport from API data and gate result.
 * Normalizes failed cases (truncate, sort), dashboard URL, top N + more.
 */
import type { CheckArgs } from '../check';
import type { QualityLatestData, RunDetailsData } from '../api';
import type { GateResult } from '../gate';
import type { CheckReport } from '../formatters/types';
/** Inputs required to assemble a {@link CheckReport}. */
export type BuildReportInput = {
    /** Parsed CLI arguments (evaluation ID, base URL, thresholds, explain flag). */
    args: CheckArgs;
    /** Latest quality data returned by the API. */
    quality: QualityLatestData;
    /** Optional per-case run details; used to build the failed-case list. */
    runDetails?: RunDetailsData | null;
    /** Pass/fail gate outcome with reason code/message. */
    gateResult: GateResult;
    /** API request ID, propagated into the report for traceability. */
    requestId?: string;
};
export declare function buildCheckReport(input: BuildReportInput): CheckReport;
@@ -0,0 +1,94 @@
1
+ "use strict";
2
+ /**
3
+ * Build CheckReport from API data and gate result.
4
+ * Normalizes failed cases (truncate, sort), dashboard URL, top N + more.
5
+ */
6
+ Object.defineProperty(exports, "__esModule", { value: true });
7
+ exports.buildCheckReport = buildCheckReport;
8
+ const snippet_1 = require("../render/snippet");
9
+ const sort_1 = require("../render/sort");
10
+ const TOP_N = 3;
11
/**
 * Convert a 0..1 score breakdown into weighted contribution points.
 * Weights: passRate*50, safety*25, (0.6*judge+0.4*schema)*15,
 * (0.6*latency+0.4*cost)*10. Missing/null components count as 0 and
 * every result is rounded to one decimal place.
 */
function computeContribPts(b) {
    const round1 = (x) => Math.round(x * 10) / 10;
    const val = (x) => x ?? 0;
    const compliance = 0.6 * val(b.judge) + 0.4 * val(b.schema);
    const performance = 0.6 * val(b.latency) + 0.4 * val(b.cost);
    return {
        passRatePts: round1(val(b.passRate) * 50),
        safetyPts: round1(val(b.safety) * 25),
        compliancePts: round1(compliance * 15),
        performancePts: round1(performance * 10),
    };
}
26
// Max characters kept in each failed-case snippet.
const SNIPPET_MAX = 50;
/**
 * Assemble a CheckReport from quality data, optional run details, and the
 * gate result.
 *
 * Behavior visible in this function:
 * - Missing quality fields default to 0/null; flags default to [].
 * - dashboardUrl is built only when an evaluationRunId is present.
 * - Failed cases come from runDetails.results (status === 'failed'), are
 *   sorted via sortFailedCases, and get input/expected/output snippets
 *   truncated to SNIPPET_MAX characters.
 * - failedCasesShown/failedCasesMore implement "top N + more" (N = TOP_N);
 *   both are omitted (undefined) when there is nothing to show.
 * - contribPts is computed only when args.explain is set and a non-empty
 *   breakdown exists.
 *
 * @param {BuildReportInput} input - args, quality, runDetails, gateResult, requestId.
 * @returns {CheckReport} normalized report consumed by the CLI formatters.
 */
function buildCheckReport(input) {
    const { args, quality, runDetails, gateResult, requestId } = input;
    const score = quality?.score ?? 0;
    const total = quality?.total ?? null;
    const baselineScore = quality?.baselineScore ?? null;
    const regressionDelta = quality?.regressionDelta ?? null;
    const evaluationRunId = quality?.evaluationRunId;
    const breakdown = quality?.breakdown ?? {};
    const flags = (quality?.flags ?? []);
    // Strip one trailing slash so URL segments join cleanly.
    const baseUrl = args.baseUrl.replace(/\/$/, '');
    const dashboardUrl = evaluationRunId != null
        ? `${baseUrl}/evaluations/${args.evaluationId}/runs/${evaluationRunId}`
        : undefined;
    // Build failed cases from run details
    let failedCases = [];
    if (runDetails?.results && evaluationRunId != null) {
        const raw = runDetails.results
            .filter((r) => r.status === 'failed')
            .map((r) => ({
            testCaseId: r.testCaseId,
            status: 'failed',
            name: r.test_cases?.name,
            input: r.test_cases?.input,
            expectedOutput: r.test_cases?.expectedOutput,
            output: r.output,
        }));
        failedCases = (0, sort_1.sortFailedCases)(raw).map((fc) => ({
            ...fc,
            inputSnippet: (0, snippet_1.truncateSnippet)(fc.input, SNIPPET_MAX),
            expectedSnippet: (0, snippet_1.truncateSnippet)(fc.expectedOutput, SNIPPET_MAX),
            outputSnippet: (0, snippet_1.truncateSnippet)(fc.output, SNIPPET_MAX),
        }));
    }
    const failedCasesShown = Math.min(failedCases.length, TOP_N);
    const failedCasesMore = failedCases.length - failedCasesShown;
    // Empty breakdown object is treated as "no breakdown reported".
    const breakdown01 = Object.keys(breakdown).length > 0 ? breakdown : undefined;
    const contribPts = args.explain && breakdown01 ? computeContribPts(breakdown01) : undefined;
    const report = {
        evaluationId: args.evaluationId,
        runId: evaluationRunId,
        verdict: gateResult.passed ? 'pass' : 'fail',
        reasonCode: gateResult.reasonCode,
        reasonMessage: gateResult.reasonMessage ?? undefined,
        score,
        baselineScore: baselineScore ?? undefined,
        delta: regressionDelta ?? undefined,
        n: total ?? undefined,
        evidenceLevel: quality?.evidenceLevel ?? undefined,
        baselineMissing: quality?.baselineMissing === true,
        // Copy before sorting so the input array is not mutated.
        flags: flags.length > 0 ? [...flags].sort() : undefined,
        breakdown01,
        contribPts,
        thresholds: {
            minScore: args.minScore,
            maxDrop: args.maxDrop,
            minN: args.minN,
            allowWeakEvidence: args.allowWeakEvidence,
            baseline: args.baseline,
        },
        dashboardUrl,
        failedCases,
        failedCasesShown: failedCases.length > 0 ? failedCasesShown : undefined,
        failedCasesMore: failedCasesMore > 0 ? failedCasesMore : undefined,
        requestId,
        explain: args.explain,
    };
    return report;
}
package/dist/index.d.ts CHANGED
@@ -27,6 +27,7 @@ export { RequestBatcher } from './batch';
27
27
  export { Logger } from './logger';
28
28
  export { traceOpenAI } from './integrations/openai';
29
29
  export { traceAnthropic } from './integrations/anthropic';
30
+ export { openAIChatEval, type OpenAIChatEvalOptions, type OpenAIChatEvalResult, type OpenAIChatEvalCase, } from './integrations/openai-eval';
30
31
  export { WorkflowTracer, createWorkflowTracer, traceWorkflowStep, traceLangChainAgent, traceCrewAI, traceAutoGen, type WorkflowNode, type WorkflowEdge, type WorkflowDefinition, type WorkflowContext, type WorkflowStatus, type HandoffType, type AgentHandoff, type DecisionAlternative, type DecisionType, type RecordDecisionParams, type LLMProvider, type CostCategory, type RecordCostParams, type CostRecord, type WorkflowTracerOptions, type AgentSpanContext, } from './workflows';
31
32
  export type { ClientConfig as AIEvalConfig, Trace as TraceData, Span as SpanData, Evaluation as EvaluationData, LLMJudgeResult as LLMJudgeData, RetryConfig, GenericMetadata as AnnotationData, TracedResponse, TestCase, TestResult, SnapshotData, ExportOptions, ImportOptions, StreamOptions, BatchOptions } from './types';
32
33
  export { EvaluationTemplates, type EvaluationTemplateType, type FeatureUsage, type OrganizationLimits } from './types';
package/dist/index.js CHANGED
@@ -9,7 +9,7 @@
9
9
  */
10
10
  Object.defineProperty(exports, "__esModule", { value: true });
11
11
  exports.decodeCursor = exports.encodeCursor = exports.autoPaginate = exports.createPaginatedIterator = exports.PaginatedIterator = exports.CacheTTL = exports.RequestCache = exports.RateLimiter = exports.batchRead = exports.streamEvaluation = exports.batchProcess = exports.importData = exports.exportData = exports.compareSnapshots = exports.saveSnapshot = exports.compareWithSnapshot = exports.snapshot = exports.TestSuite = exports.createTestSuite = exports.ContextManager = exports.withContext = exports.getContext = exports.createContext = exports.hasValidCodeSyntax = exports.containsAllRequiredFields = exports.followsInstructions = exports.hasNoToxicity = exports.respondedWithinTime = exports.hasFactualAccuracy = exports.containsLanguage = exports.hasReadabilityScore = exports.matchesSchema = exports.hasNoHallucinations = exports.isValidURL = exports.isValidEmail = exports.withinRange = exports.similarTo = exports.hasSentiment = exports.notContainsPII = exports.containsJSON = exports.hasLength = exports.matchesPattern = exports.containsKeywords = exports.expect = exports.NetworkError = exports.ValidationError = exports.AuthenticationError = exports.RateLimitError = exports.EvalAIError = exports.AIEvalClient = void 0;
12
- exports.EXIT = exports.runCheck = exports.parseArgs = exports.EvaluationTemplates = exports.traceAutoGen = exports.traceCrewAI = exports.traceLangChainAgent = exports.traceWorkflowStep = exports.createWorkflowTracer = exports.WorkflowTracer = exports.traceAnthropic = exports.traceOpenAI = exports.Logger = exports.RequestBatcher = void 0;
12
+ exports.EXIT = exports.runCheck = exports.parseArgs = exports.EvaluationTemplates = exports.traceAutoGen = exports.traceCrewAI = exports.traceLangChainAgent = exports.traceWorkflowStep = exports.createWorkflowTracer = exports.WorkflowTracer = exports.openAIChatEval = exports.traceAnthropic = exports.traceOpenAI = exports.Logger = exports.RequestBatcher = void 0;
13
13
  // Main SDK exports
14
14
  var client_1 = require("./client");
15
15
  Object.defineProperty(exports, "AIEvalClient", { enumerable: true, get: function () { return client_1.AIEvalClient; } });
@@ -94,6 +94,9 @@ var openai_1 = require("./integrations/openai");
94
94
  Object.defineProperty(exports, "traceOpenAI", { enumerable: true, get: function () { return openai_1.traceOpenAI; } });
95
95
  var anthropic_1 = require("./integrations/anthropic");
96
96
  Object.defineProperty(exports, "traceAnthropic", { enumerable: true, get: function () { return anthropic_1.traceAnthropic; } });
97
+ // OpenAI regression eval (local-first, no account required)
98
+ var openai_eval_1 = require("./integrations/openai-eval");
99
+ Object.defineProperty(exports, "openAIChatEval", { enumerable: true, get: function () { return openai_eval_1.openAIChatEval; } });
97
100
  // Workflow tracing (Orchestration Layer)
98
101
  var workflows_1 = require("./workflows");
99
102
  Object.defineProperty(exports, "WorkflowTracer", { enumerable: true, get: function () { return workflows_1.WorkflowTracer; } });
@@ -0,0 +1,53 @@
1
/**
 * openAIChatEval — One-function OpenAI chat regression testing
 *
 * Run local regression tests with OpenAI. No EvalAI account required.
 * CI-friendly output. Optional reportToEvalAI in v1.5.
 *
 * @example
 * ```typescript
 * import { openAIChatEval } from '@pauly4010/evalai-sdk';
 *
 * await openAIChatEval({
 *   name: 'chat-regression',
 *   cases: [
 *     { input: 'Hello', expectedOutput: 'greeting' },
 *     { input: '2 + 2 = ?', expectedOutput: '4' }
 *   ]
 * });
 * ```
 */
import type { TestSuiteCaseResult } from '../testing';
/** A single regression case: the prompt plus how to judge the reply. */
export interface OpenAIChatEvalCase {
    /** Prompt sent as the sole user message. */
    input: string;
    /** When set (and no assertions given), its whitespace-split keywords must all appear in the output. */
    expectedOutput?: string;
    /** Platform test case ID. When provided, used directly for reportToEvalAI (no input matching). */
    testCaseId?: number;
    /** Custom assertions; when present they replace the default expectedOutput keyword check. */
    assertions?: ((output: string) => import('../assertions').AssertionResult)[];
}
export interface OpenAIChatEvalOptions {
    /** Suite name shown in output. */
    name: string;
    /** OpenAI model id. Default: gpt-4o-mini. */
    model?: string;
    /** OpenAI API key. Default: OPENAI_API_KEY environment variable. */
    apiKey?: string;
    cases: OpenAIChatEvalCase[];
    /** v1.5: Upload results to EvalAI platform for an existing evaluation. Requires evaluationId and EVALAI_API_KEY. */
    reportToEvalAI?: boolean;
    /** Evaluation ID (from config or arg). Required when reportToEvalAI is true. */
    evaluationId?: string;
    /** EvalAI API base URL. Default: EVALAI_BASE_URL or http://localhost:3000 */
    baseUrl?: string;
    /** Idempotency key for import (e.g. CI run ID). Prevents duplicate runs on retry. */
    idempotencyKey?: string;
}
/** Aggregate outcome of a local regression run. */
export interface OpenAIChatEvalResult {
    /** Number of cases that passed. */
    passed: number;
    /** Total number of cases executed. */
    total: number;
    /** Pass rate as an integer 0-100. */
    score: number;
    /** Per-case results from the underlying test suite. */
    results: TestSuiteCaseResult[];
    /** Wall-clock duration of the run in milliseconds. */
    durationMs: number;
}
/**
 * Run OpenAI chat regression tests locally.
 * No EvalAI account required. Returns score and prints CI-friendly summary.
 */
export declare function openAIChatEval(options: OpenAIChatEvalOptions): Promise<OpenAIChatEvalResult>;
@@ -0,0 +1,226 @@
1
+ "use strict";
2
+ /**
3
+ * openAIChatEval — One-function OpenAI chat regression testing
4
+ *
5
+ * Run local regression tests with OpenAI. No EvalAI account required.
6
+ * CI-friendly output. Optional reportToEvalAI in v1.5.
7
+ *
8
+ * @example
9
+ * ```typescript
10
+ * import { openAIChatEval } from '@pauly4010/evalai-sdk';
11
+ *
12
+ * await openAIChatEval({
13
+ * name: 'chat-regression',
14
+ * cases: [
15
+ * { input: 'Hello', expectedOutput: 'greeting' },
16
+ * { input: '2 + 2 = ?', expectedOutput: '4' }
17
+ * ]
18
+ * });
19
+ * ```
20
+ */
21
+ Object.defineProperty(exports, "__esModule", { value: true });
22
+ exports.openAIChatEval = openAIChatEval;
23
+ const testing_1 = require("../testing");
24
+ const assertions_1 = require("../assertions");
25
+ const config_1 = require("../cli/config");
26
+ const input_hash_1 = require("../utils/input-hash");
27
+ const MAX_FAILED_CASES_TO_SHOW = 5;
28
/**
 * Lazily load the optional `openai` peer dependency.
 *
 * Returns the OpenAI client constructor. Handles both CommonJS builds
 * (constructor on module.exports) and ESM-interop builds where the
 * constructor lives on `.default` — the original `return require('openai')`
 * broke on the latter.
 *
 * @throws {Error} when the `openai` package is not installed.
 */
function getOpenAI() {
    try {
        // eslint-disable-next-line @typescript-eslint/no-var-requires
        const mod = require('openai');
        // ESM-interop builds expose the constructor on `default`.
        return mod?.default ?? mod;
    }
    catch {
        throw new Error('openai package is required for openAIChatEval. Install with: npm install openai');
    }
}
38
/**
 * Build an executor for the test suite: sends a single user message to the
 * given model and resolves with the assistant's reply text ('' when absent).
 * Temperature is fixed at 0.1 for reproducible regression runs.
 */
function createExecutor(model, apiKey) {
    const OpenAIClient = getOpenAI();
    const client = new OpenAIClient({ apiKey });
    return async (input) => {
        const completion = await client.chat.completions.create({
            model,
            messages: [{ role: 'user', content: input }],
            temperature: 0.1,
        });
        const message = completion.choices[0]?.message;
        return message?.content ?? '';
    };
}
50
/**
 * Print a CI-friendly summary of an eval run to stdout.
 *
 * Shows PASS/FAIL with pass count and score, up to MAX_FAILED_CASES_TO_SHOW
 * failing cases (with a "+ N more" line), and a next-step tip.
 *
 * Uses the `score` already stored on the result instead of recomputing it
 * here — the original recomputed the percentage, which could silently
 * diverge from the value returned to the caller.
 *
 * @param result OpenAIChatEvalResult (passed/total/score/results).
 */
function printSummary(result) {
    const { passed, total, score, results } = result;
    const failed = results.filter((r) => !r.passed);
    const status = failed.length === 0 ? 'PASS' : 'FAIL';
    console.log(`\n${status} ${passed}/${total} (score: ${score})\n`);
    if (failed.length > 0) {
        const toShow = failed.slice(0, MAX_FAILED_CASES_TO_SHOW);
        const more = failed.length - toShow.length;
        console.log(`${failed.length} failing case${failed.length === 1 ? '' : 's'}:`);
        for (const r of toShow) {
            const expected = r.expected ?? '(no expected)';
            console.log(`- "${r.input}" → expected: ${expected}`);
        }
        if (more > 0) {
            console.log(`+ ${more} more`);
        }
        console.log('\nGate this in CI:');
        console.log(' npx -y @pauly4010/evalai-sdk@^1 init');
    }
    else {
        console.log('Tip: Want dashboards and history?');
        console.log('Set EVALAI_API_KEY and connect this to the platform.');
    }
}
75
/**
 * Run OpenAI chat regression tests locally.
 * No EvalAI account required. Returns score and prints CI-friendly summary.
 *
 * API key resolution: options.apiKey, then process.env.OPENAI_API_KEY.
 * When options.reportToEvalAI is true, results are also uploaded to the
 * EvalAI platform (needs an evaluationId — from options or the evalai
 * config — plus EVALAI_API_KEY). Upload problems are logged to stdout and
 * never throw; the local result is always returned.
 *
 * Fix: importClientVersion was hardcoded to the previous release ('1.4.1');
 * it now matches package.json ('1.5.0').
 *
 * @param options - name, cases, and optional model/apiKey/report settings.
 * @returns passed/total counts, 0-100 score, per-case results, duration.
 * @throws {Error} when no OpenAI API key can be resolved.
 */
async function openAIChatEval(options) {
    const { name, model = 'gpt-4o-mini', apiKey, cases } = options;
    const resolvedApiKey = apiKey ?? (typeof process !== 'undefined' && process.env?.OPENAI_API_KEY);
    if (!resolvedApiKey) {
        throw new Error('OPENAI_API_KEY is required. Set it in the environment or pass apiKey to openAIChatEval.');
    }
    const executor = createExecutor(model, resolvedApiKey);
    // Default assertion when none given: whitespace-split expectedOutput
    // keywords must all appear in the model reply.
    const suiteCases = cases.map((c) => {
        const assertions = c.assertions
            ? [...c.assertions]
            : c.expectedOutput
                ? [(output) => (0, assertions_1.expect)(output).toContainKeywords(c.expectedOutput.split(/\s+/).filter(Boolean))]
                : undefined;
        return {
            input: c.input,
            expected: c.expectedOutput,
            assertions,
        };
    });
    const suite = (0, testing_1.createTestSuite)(name, {
        cases: suiteCases,
        executor,
        parallel: true,
    });
    const result = await suite.run();
    const score = result.total > 0 ? Math.round((result.passed / result.total) * 100) : 0;
    const evalResult = {
        passed: result.passed,
        total: result.total,
        score,
        results: result.results,
        durationMs: result.durationMs,
    };
    printSummary(evalResult);
    // v1.5: Optional report to EvalAI platform
    if (options.reportToEvalAI) {
        const config = typeof process !== 'undefined' && process.cwd ? (0, config_1.loadConfig)(process.cwd()) : null;
        const evalId = options.evaluationId || config?.evaluationId;
        if (!evalId || String(evalId).trim() === '') {
            console.log('Run evalai init and set evaluationId to upload results.');
            return evalResult;
        }
        const evalaiKey = (typeof process !== 'undefined' && process.env?.EVALAI_API_KEY) || '';
        if (!evalaiKey) {
            console.log('Set EVALAI_API_KEY to upload results.');
            return evalResult;
        }
        const baseUrl = options.baseUrl ||
            config?.baseUrl ||
            (typeof process !== 'undefined' && process.env?.EVALAI_BASE_URL) ||
            'http://localhost:3000';
        const url = String(baseUrl).replace(/\/$/, '');
        try {
            // Resolve testCaseId for each result: explicit testCaseId in cases, or match by inputHash
            const importResults = [];
            const hasExplicitIds = cases.some((c) => c.testCaseId != null);
            if (hasExplicitIds) {
                // Use testCaseId from cases (same order as results)
                for (let i = 0; i < result.results.length; i++) {
                    const tcId = cases[i]?.testCaseId;
                    if (tcId == null) {
                        console.log('reportToEvalAI: All cases must have testCaseId when any has it.');
                        return evalResult;
                    }
                    importResults.push({
                        testCaseId: tcId,
                        status: result.results[i].passed ? 'passed' : 'failed',
                        output: result.results[i].actual ?? '',
                        latencyMs: result.results[i].durationMs,
                    });
                }
            }
            else {
                // Match by inputHash (same canonicalization as platform)
                const tcRes = await fetch(`${url}/api/evaluations/${evalId}/test-cases?limit=500`, {
                    headers: { Authorization: `Bearer ${evalaiKey}` },
                });
                if (!tcRes.ok) {
                    console.log('Could not fetch test cases. Check evaluationId and EVALAI_API_KEY.');
                    return evalResult;
                }
                const platformCases = (await tcRes.json());
                const hashToIds = new Map();
                for (const tc of platformCases) {
                    const input = tc.input ?? '';
                    if (!input.trim())
                        continue;
                    const hash = (0, input_hash_1.sha256Input)(input);
                    const existing = hashToIds.get(hash) ?? [];
                    existing.push(tc.id);
                    hashToIds.set(hash, existing);
                }
                for (const r of result.results) {
                    const hash = (0, input_hash_1.sha256Input)(r.input ?? '');
                    const ids = hashToIds.get(hash);
                    if (ids == null || ids.length === 0) {
                        console.log(`No platform test case matches input: "${(r.input ?? '').slice(0, 50)}…"`);
                        return evalResult;
                    }
                    if (ids.length > 1) {
                        console.log(`Multiple platform test cases share the same input (hash collision). Use testCaseId in cases.`);
                        return evalResult;
                    }
                    importResults.push({
                        testCaseId: ids[0],
                        status: r.passed ? 'passed' : 'failed',
                        output: r.actual ?? '',
                        latencyMs: r.durationMs,
                    });
                }
            }
            if (importResults.length !== result.results.length) {
                console.log('Could not match all results to platform test cases.');
                return evalResult;
            }
            // Keep in sync with package.json "version" (was stale at 1.4.1).
            const sdkVersion = '1.5.0';
            const headers = {
                'Content-Type': 'application/json',
                Authorization: `Bearer ${evalaiKey}`,
            };
            if (options.idempotencyKey) {
                headers['Idempotency-Key'] = options.idempotencyKey;
            }
            const importRes = await fetch(`${url}/api/evaluations/${evalId}/runs/import`, {
                method: 'POST',
                headers,
                body: JSON.stringify({
                    environment: 'dev',
                    results: importResults,
                    importClientVersion: sdkVersion,
                }),
            });
            if (!importRes.ok) {
                const body = await importRes.text();
                console.log(`Upload failed: ${importRes.status} — ${body}`);
                return evalResult;
            }
            const importData = (await importRes.json());
            if (importData.dashboardUrl) {
                console.log(`Dashboard: ${importData.dashboardUrl}`);
            }
        }
        catch (err) {
            console.log('Upload failed:', err instanceof Error ? err.message : String(err));
        }
    }
    return evalResult;
}
@@ -0,0 +1,8 @@
1
/**
 * Input normalization and hashing for deterministic matching.
 * Must match platform's @/lib/utils/input-hash.ts for reportToEvalAI.
 */
/**
 * Normalize input for stable matching (whitespace, JSON key order).
 * Non-JSON input has whitespace runs collapsed; JSON input is re-serialized
 * with object keys sorted.
 */
export declare function normalizeInput(input: string): string;
/**
 * SHA-256 hash of normalized input.
 * @returns lowercase hex digest of the normalized string.
 */
export declare function sha256Input(s: string): string;
@@ -0,0 +1,38 @@
1
+ "use strict";
2
+ /**
3
+ * Input normalization and hashing for deterministic matching.
4
+ * Must match platform's @/lib/utils/input-hash.ts for reportToEvalAI.
5
+ */
6
+ var __importDefault = (this && this.__importDefault) || function (mod) {
7
+ return (mod && mod.__esModule) ? mod : { "default": mod };
8
+ };
9
+ Object.defineProperty(exports, "__esModule", { value: true });
10
+ exports.normalizeInput = normalizeInput;
11
+ exports.sha256Input = sha256Input;
12
+ const crypto_1 = __importDefault(require("crypto"));
13
/**
 * Recursively rebuild a plain object with its keys in sorted order.
 * Arrays and non-object values are passed through untouched (array elements
 * are not normalized — keep in sync with the platform implementation).
 */
function sortKeys(obj) {
    const out = {};
    const orderedKeys = Object.keys(obj).sort();
    for (const key of orderedKeys) {
        const value = obj[key];
        const isPlainObject = value != null && typeof value === 'object' && !Array.isArray(value);
        out[key] = isPlainObject ? sortKeys(value) : value;
    }
    return out;
}
/**
 * Normalize input for stable matching (whitespace, JSON key order).
 * JSON input is re-serialized with object keys sorted; anything that fails
 * to parse (or to normalize) falls back to trimming and collapsing
 * whitespace runs to single spaces.
 *
 * NOTE(review): non-object JSON is also funneled through sortKeys (e.g.
 * '42' → '{}', and a literal 'null' throws inside sortKeys and takes the
 * whitespace fallback) — presumably mirrored by the platform's input-hash;
 * confirm before changing.
 */
function normalizeInput(input) {
    const trimmed = input.trim();
    try {
        return JSON.stringify(sortKeys(JSON.parse(trimmed)));
    }
    catch {
        return trimmed.replace(/\s+/g, ' ');
    }
}
35
/**
 * SHA-256 hash of normalized input.
 * Hashes the normalizeInput() form of `s`, so inputs differing only in
 * whitespace or JSON key order produce the same digest.
 * @returns lowercase hex digest.
 */
function sha256Input(s) {
    return crypto_1.default.createHash('sha256').update(normalizeInput(s), 'utf8').digest('hex');
}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@pauly4010/evalai-sdk",
3
- "version": "1.4.1",
3
+ "version": "1.5.0",
4
4
  "description": "AI Evaluation Platform SDK - Complete API Coverage with Performance Optimizations",
5
5
  "main": "dist/index.js",
6
6
  "module": "dist/index.js",
@@ -85,6 +85,10 @@
85
85
  "./integrations/anthropic": {
86
86
  "import": "./dist/integrations/anthropic.js",
87
87
  "types": "./dist/integrations/anthropic.d.ts"
88
+ },
89
+ "./integrations/openai-eval": {
90
+ "import": "./dist/integrations/openai-eval.js",
91
+ "types": "./dist/integrations/openai-eval.d.ts"
88
92
  }
89
93
  }
90
94
  }
@@ -1 +0,0 @@
1
- export {};