@pauly4010/evalai-sdk 1.4.0 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +46 -0
- package/README.md +108 -9
- package/dist/cli/api.d.ts +79 -0
- package/dist/cli/api.js +74 -0
- package/dist/cli/check.d.ts +16 -13
- package/dist/cli/check.js +117 -127
- package/dist/cli/ci-context.d.ts +6 -0
- package/dist/cli/ci-context.js +51 -0
- package/dist/cli/config.d.ts +24 -0
- package/dist/cli/config.js +158 -0
- package/dist/cli/constants.d.ts +13 -0
- package/dist/cli/constants.js +16 -0
- package/dist/cli/doctor.d.ts +11 -0
- package/dist/cli/doctor.js +82 -0
- package/dist/cli/formatters/github.d.ts +8 -0
- package/dist/cli/formatters/github.js +119 -0
- package/dist/cli/formatters/human.d.ts +6 -0
- package/dist/cli/formatters/human.js +92 -0
- package/dist/cli/formatters/json.d.ts +6 -0
- package/dist/cli/formatters/json.js +10 -0
- package/dist/cli/formatters/types.d.ts +76 -0
- package/dist/cli/formatters/types.js +5 -0
- package/dist/cli/gate.d.ts +13 -0
- package/dist/cli/gate.js +108 -0
- package/dist/cli/index.d.ts +1 -0
- package/dist/cli/index.js +31 -5
- package/dist/cli/init.d.ts +7 -0
- package/dist/cli/init.js +69 -0
- package/dist/cli/render/snippet.d.ts +5 -0
- package/dist/cli/render/snippet.js +15 -0
- package/dist/cli/render/sort.d.ts +10 -0
- package/dist/cli/render/sort.js +24 -0
- package/dist/cli/report/build-check-report.d.ts +16 -0
- package/dist/cli/report/build-check-report.js +94 -0
- package/dist/index.d.ts +1 -0
- package/dist/index.js +4 -1
- package/dist/integrations/openai-eval.d.ts +53 -0
- package/dist/integrations/openai-eval.js +226 -0
- package/dist/utils/input-hash.d.ts +8 -0
- package/dist/utils/input-hash.js +38 -0
- package/package.json +10 -3
- package/.env.example +0 -0
- package/ADDITIONAL_ISSUES_FOUND.md +0 -174
- package/dist/__tests__/assertions.test.d.ts +0 -1
- package/dist/__tests__/assertions.test.js +0 -288
- package/dist/__tests__/client.test.d.ts +0 -1
- package/dist/__tests__/client.test.js +0 -185
- package/dist/__tests__/testing.test.d.ts +0 -1
- package/dist/__tests__/testing.test.js +0 -230
- package/dist/__tests__/workflows.test.d.ts +0 -1
- package/dist/__tests__/workflows.test.js +0 -222
- package/evalai-sdk-1.2.0.tgz +0 -0
- package/postcss.config.mjs +0 -2
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Build CheckReport from API data and gate result.
|
|
3
|
+
* Normalizes failed cases (truncate, sort), dashboard URL, top N + more.
|
|
4
|
+
*/
|
|
5
|
+
import type { CheckArgs } from '../check';
|
|
6
|
+
import type { QualityLatestData, RunDetailsData } from '../api';
|
|
7
|
+
import type { GateResult } from '../gate';
|
|
8
|
+
import type { CheckReport } from '../formatters/types';
|
|
9
|
+
export type BuildReportInput = {
|
|
10
|
+
args: CheckArgs;
|
|
11
|
+
quality: QualityLatestData;
|
|
12
|
+
runDetails?: RunDetailsData | null;
|
|
13
|
+
gateResult: GateResult;
|
|
14
|
+
requestId?: string;
|
|
15
|
+
};
|
|
16
|
+
/**
 * Build a CheckReport from a BuildReportInput: the parsed CLI args, the latest
 * quality data, optional run details, the gate result and an optional request id.
 */
export declare function buildCheckReport(input: BuildReportInput): CheckReport;
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Build CheckReport from API data and gate result.
|
|
4
|
+
* Normalizes failed cases (truncate, sort), dashboard URL, top N + more.
|
|
5
|
+
*/
|
|
6
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
7
|
+
exports.buildCheckReport = buildCheckReport;
|
|
8
|
+
const snippet_1 = require("../render/snippet");
|
|
9
|
+
const sort_1 = require("../render/sort");
|
|
10
|
+
const TOP_N = 3;
|
|
11
|
+
/**
 * Convert a 0..1 breakdown into weighted contribution points, rounded to one decimal.
 *
 * Weights: passRate*50, safety*25, (0.6*judge + 0.4*schema)*15, (0.6*latency + 0.4*cost)*10.
 * Any component that is null or undefined counts as 0.
 *
 * @param b breakdown object with optional passRate, safety, judge, schema, latency, cost
 * @returns `{ passRatePts, safetyPts, compliancePts, performancePts }`
 */
function computeContribPts(b) {
    // Missing components contribute nothing.
    const component = (key) => b[key] ?? 0;
    // Round to a single decimal place.
    const toTenths = (points) => Math.round(points * 10) / 10;
    const complianceMix = 0.6 * component('judge') + 0.4 * component('schema');
    const performanceMix = 0.6 * component('latency') + 0.4 * component('cost');
    return {
        passRatePts: toTenths(component('passRate') * 50),
        safetyPts: toTenths(component('safety') * 25),
        compliancePts: toTenths(complianceMix * 15),
        performancePts: toTenths(performanceMix * 10),
    };
}
|
|
26
|
+
const SNIPPET_MAX = 50;
|
|
27
|
+
/**
 * Assemble the CheckReport from API data and the gate result.
 *
 * - `dashboardUrl` is only set when the quality data carries an evaluation run id.
 * - `failedCases` lists every result with status 'failed' (sorted by sortFailedCases,
 *   each with input/expected/output snippets truncated to SNIPPET_MAX);
 *   `failedCasesShown` is capped at TOP_N and `failedCasesMore` is the remainder.
 * - `contribPts` is only computed when `args.explain` is set and a non-empty
 *   breakdown is available.
 *
 * @param input `{ args, quality, runDetails, gateResult, requestId }`
 * @returns the report object handed to the formatters
 */
function buildCheckReport(input) {
    const { args, quality, runDetails, gateResult, requestId } = input;
    const runId = quality?.evaluationRunId;
    const runKnown = runId != null;
    // Drop a single trailing slash so the dashboard link has no "//".
    const root = args.baseUrl.replace(/\/$/, '');
    let dashboardUrl;
    if (runKnown) {
        dashboardUrl = `${root}/evaluations/${args.evaluationId}/runs/${runId}`;
    }
    // Shape one failed run result into a report entry.
    const describeFailure = (r) => ({
        testCaseId: r.testCaseId,
        status: 'failed',
        name: r.test_cases?.name,
        input: r.test_cases?.input,
        expectedOutput: r.test_cases?.expectedOutput,
        output: r.output,
    });
    // Add truncated previews next to the full values.
    const attachSnippets = (fc) => ({
        ...fc,
        inputSnippet: (0, snippet_1.truncateSnippet)(fc.input, SNIPPET_MAX),
        expectedSnippet: (0, snippet_1.truncateSnippet)(fc.expectedOutput, SNIPPET_MAX),
        outputSnippet: (0, snippet_1.truncateSnippet)(fc.output, SNIPPET_MAX),
    });
    // Failed cases come from run details and require a known run.
    const failedCases = runDetails?.results && runKnown
        ? (0, sort_1.sortFailedCases)(runDetails.results
            .filter((r) => r.status === 'failed')
            .map(describeFailure)).map(attachSnippets)
        : [];
    const failedTotal = failedCases.length;
    const failedCasesShown = Math.min(failedTotal, TOP_N);
    const failedCasesMore = failedTotal - failedCasesShown;
    const rawBreakdown = quality?.breakdown ?? {};
    const breakdown01 = Object.keys(rawBreakdown).length > 0 ? rawBreakdown : undefined;
    let contribPts;
    if (args.explain && breakdown01) {
        contribPts = computeContribPts(breakdown01);
    }
    const flagList = quality?.flags ?? [];
    return {
        evaluationId: args.evaluationId,
        runId,
        verdict: gateResult.passed ? 'pass' : 'fail',
        reasonCode: gateResult.reasonCode,
        reasonMessage: gateResult.reasonMessage ?? undefined,
        score: quality?.score ?? 0,
        baselineScore: quality?.baselineScore ?? undefined,
        delta: quality?.regressionDelta ?? undefined,
        n: quality?.total ?? undefined,
        evidenceLevel: quality?.evidenceLevel ?? undefined,
        baselineMissing: quality?.baselineMissing === true,
        // Sort a copy so the caller's flags array is not mutated.
        flags: flagList.length > 0 ? [...flagList].sort() : undefined,
        breakdown01,
        contribPts,
        thresholds: {
            minScore: args.minScore,
            maxDrop: args.maxDrop,
            minN: args.minN,
            allowWeakEvidence: args.allowWeakEvidence,
            baseline: args.baseline,
        },
        dashboardUrl,
        failedCases,
        failedCasesShown: failedTotal > 0 ? failedCasesShown : undefined,
        failedCasesMore: failedCasesMore > 0 ? failedCasesMore : undefined,
        requestId,
        explain: args.explain,
    };
}
|
package/dist/index.d.ts
CHANGED
|
@@ -27,6 +27,7 @@ export { RequestBatcher } from './batch';
|
|
|
27
27
|
export { Logger } from './logger';
|
|
28
28
|
export { traceOpenAI } from './integrations/openai';
|
|
29
29
|
export { traceAnthropic } from './integrations/anthropic';
|
|
30
|
+
export { openAIChatEval, type OpenAIChatEvalOptions, type OpenAIChatEvalResult, type OpenAIChatEvalCase, } from './integrations/openai-eval';
|
|
30
31
|
export { WorkflowTracer, createWorkflowTracer, traceWorkflowStep, traceLangChainAgent, traceCrewAI, traceAutoGen, type WorkflowNode, type WorkflowEdge, type WorkflowDefinition, type WorkflowContext, type WorkflowStatus, type HandoffType, type AgentHandoff, type DecisionAlternative, type DecisionType, type RecordDecisionParams, type LLMProvider, type CostCategory, type RecordCostParams, type CostRecord, type WorkflowTracerOptions, type AgentSpanContext, } from './workflows';
|
|
31
32
|
export type { ClientConfig as AIEvalConfig, Trace as TraceData, Span as SpanData, Evaluation as EvaluationData, LLMJudgeResult as LLMJudgeData, RetryConfig, GenericMetadata as AnnotationData, TracedResponse, TestCase, TestResult, SnapshotData, ExportOptions, ImportOptions, StreamOptions, BatchOptions } from './types';
|
|
32
33
|
export { EvaluationTemplates, type EvaluationTemplateType, type FeatureUsage, type OrganizationLimits } from './types';
|
package/dist/index.js
CHANGED
|
@@ -9,7 +9,7 @@
|
|
|
9
9
|
*/
|
|
10
10
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
11
11
|
exports.decodeCursor = exports.encodeCursor = exports.autoPaginate = exports.createPaginatedIterator = exports.PaginatedIterator = exports.CacheTTL = exports.RequestCache = exports.RateLimiter = exports.batchRead = exports.streamEvaluation = exports.batchProcess = exports.importData = exports.exportData = exports.compareSnapshots = exports.saveSnapshot = exports.compareWithSnapshot = exports.snapshot = exports.TestSuite = exports.createTestSuite = exports.ContextManager = exports.withContext = exports.getContext = exports.createContext = exports.hasValidCodeSyntax = exports.containsAllRequiredFields = exports.followsInstructions = exports.hasNoToxicity = exports.respondedWithinTime = exports.hasFactualAccuracy = exports.containsLanguage = exports.hasReadabilityScore = exports.matchesSchema = exports.hasNoHallucinations = exports.isValidURL = exports.isValidEmail = exports.withinRange = exports.similarTo = exports.hasSentiment = exports.notContainsPII = exports.containsJSON = exports.hasLength = exports.matchesPattern = exports.containsKeywords = exports.expect = exports.NetworkError = exports.ValidationError = exports.AuthenticationError = exports.RateLimitError = exports.EvalAIError = exports.AIEvalClient = void 0;
|
|
12
|
-
exports.EXIT = exports.runCheck = exports.parseArgs = exports.EvaluationTemplates = exports.traceAutoGen = exports.traceCrewAI = exports.traceLangChainAgent = exports.traceWorkflowStep = exports.createWorkflowTracer = exports.WorkflowTracer = exports.traceAnthropic = exports.traceOpenAI = exports.Logger = exports.RequestBatcher = void 0;
|
|
12
|
+
exports.EXIT = exports.runCheck = exports.parseArgs = exports.EvaluationTemplates = exports.traceAutoGen = exports.traceCrewAI = exports.traceLangChainAgent = exports.traceWorkflowStep = exports.createWorkflowTracer = exports.WorkflowTracer = exports.openAIChatEval = exports.traceAnthropic = exports.traceOpenAI = exports.Logger = exports.RequestBatcher = void 0;
|
|
13
13
|
// Main SDK exports
|
|
14
14
|
var client_1 = require("./client");
|
|
15
15
|
Object.defineProperty(exports, "AIEvalClient", { enumerable: true, get: function () { return client_1.AIEvalClient; } });
|
|
@@ -94,6 +94,9 @@ var openai_1 = require("./integrations/openai");
|
|
|
94
94
|
Object.defineProperty(exports, "traceOpenAI", { enumerable: true, get: function () { return openai_1.traceOpenAI; } });
|
|
95
95
|
var anthropic_1 = require("./integrations/anthropic");
|
|
96
96
|
Object.defineProperty(exports, "traceAnthropic", { enumerable: true, get: function () { return anthropic_1.traceAnthropic; } });
|
|
97
|
+
// OpenAI regression eval (local-first, no account required)
|
|
98
|
+
var openai_eval_1 = require("./integrations/openai-eval");
|
|
99
|
+
Object.defineProperty(exports, "openAIChatEval", { enumerable: true, get: function () { return openai_eval_1.openAIChatEval; } });
|
|
97
100
|
// Workflow tracing (Orchestration Layer)
|
|
98
101
|
var workflows_1 = require("./workflows");
|
|
99
102
|
Object.defineProperty(exports, "WorkflowTracer", { enumerable: true, get: function () { return workflows_1.WorkflowTracer; } });
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* openAIChatEval — One-function OpenAI chat regression testing
|
|
3
|
+
*
|
|
4
|
+
* Run local regression tests with OpenAI. No EvalAI account required.
|
|
5
|
+
* CI-friendly output. Optional reportToEvalAI in v1.5.
|
|
6
|
+
*
|
|
7
|
+
* @example
|
|
8
|
+
* ```typescript
|
|
9
|
+
* import { openAIChatEval } from '@pauly4010/evalai-sdk';
|
|
10
|
+
*
|
|
11
|
+
* await openAIChatEval({
|
|
12
|
+
* name: 'chat-regression',
|
|
13
|
+
* cases: [
|
|
14
|
+
* { input: 'Hello', expectedOutput: 'greeting' },
|
|
15
|
+
* { input: '2 + 2 = ?', expectedOutput: '4' }
|
|
16
|
+
* ]
|
|
17
|
+
* });
|
|
18
|
+
* ```
|
|
19
|
+
*/
|
|
20
|
+
import type { TestSuiteCaseResult } from '../testing';
|
|
21
|
+
export interface OpenAIChatEvalCase {
|
|
22
|
+
input: string;
|
|
23
|
+
expectedOutput?: string;
|
|
24
|
+
/** Platform test case ID. When provided, used directly for reportToEvalAI (no input matching). */
|
|
25
|
+
testCaseId?: number;
|
|
26
|
+
assertions?: ((output: string) => import('../assertions').AssertionResult)[];
|
|
27
|
+
}
|
|
28
|
+
export interface OpenAIChatEvalOptions {
|
|
29
|
+
name: string;
|
|
30
|
+
model?: string;
|
|
31
|
+
apiKey?: string;
|
|
32
|
+
cases: OpenAIChatEvalCase[];
|
|
33
|
+
/** v1.5: Upload results to EvalAI platform for an existing evaluation. Requires evaluationId and EVALAI_API_KEY. */
|
|
34
|
+
reportToEvalAI?: boolean;
|
|
35
|
+
/** Evaluation ID (from config or arg). Required when reportToEvalAI is true. */
|
|
36
|
+
evaluationId?: string;
|
|
37
|
+
/** EvalAI API base URL. Falls back to `baseUrl` from the loaded evalai config, then EVALAI_BASE_URL, then http://localhost:3000. */
|
|
38
|
+
baseUrl?: string;
|
|
39
|
+
/** Idempotency key for import (e.g. CI run ID). Prevents duplicate runs on retry. */
|
|
40
|
+
idempotencyKey?: string;
|
|
41
|
+
}
|
|
42
|
+
export interface OpenAIChatEvalResult {
|
|
43
|
+
passed: number;
|
|
44
|
+
total: number;
|
|
45
|
+
score: number;
|
|
46
|
+
results: TestSuiteCaseResult[];
|
|
47
|
+
durationMs: number;
|
|
48
|
+
}
|
|
49
|
+
/**
|
|
50
|
+
* Run OpenAI chat regression tests locally.
|
|
51
|
+
* No EvalAI account required. Returns score and prints CI-friendly summary.
|
|
52
|
+
*/
|
|
53
|
+
export declare function openAIChatEval(options: OpenAIChatEvalOptions): Promise<OpenAIChatEvalResult>;
|
|
@@ -0,0 +1,226 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* openAIChatEval — One-function OpenAI chat regression testing
|
|
4
|
+
*
|
|
5
|
+
* Run local regression tests with OpenAI. No EvalAI account required.
|
|
6
|
+
* CI-friendly output. Optional reportToEvalAI in v1.5.
|
|
7
|
+
*
|
|
8
|
+
* @example
|
|
9
|
+
* ```typescript
|
|
10
|
+
* import { openAIChatEval } from '@pauly4010/evalai-sdk';
|
|
11
|
+
*
|
|
12
|
+
* await openAIChatEval({
|
|
13
|
+
* name: 'chat-regression',
|
|
14
|
+
* cases: [
|
|
15
|
+
* { input: 'Hello', expectedOutput: 'greeting' },
|
|
16
|
+
* { input: '2 + 2 = ?', expectedOutput: '4' }
|
|
17
|
+
* ]
|
|
18
|
+
* });
|
|
19
|
+
* ```
|
|
20
|
+
*/
|
|
21
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
22
|
+
exports.openAIChatEval = openAIChatEval;
|
|
23
|
+
const testing_1 = require("../testing");
|
|
24
|
+
const assertions_1 = require("../assertions");
|
|
25
|
+
const config_1 = require("../cli/config");
|
|
26
|
+
const input_hash_1 = require("../utils/input-hash");
|
|
27
|
+
const MAX_FAILED_CASES_TO_SHOW = 5;
|
|
28
|
+
/**
 * Load the optional `openai` package on demand.
 *
 * @returns whatever `require('openai')` yields
 * @throws Error with install instructions when the package cannot be loaded
 */
function getOpenAI() {
    let sdk;
    try {
        // eslint-disable-next-line @typescript-eslint/no-var-requires
        sdk = require('openai');
    }
    catch {
        throw new Error('openai package is required for openAIChatEval. Install with: npm install openai');
    }
    return sdk;
}
|
|
38
|
+
/**
 * Build the executor used by the test suite: an async function that sends one
 * input as a single user message and resolves to the first choice's text.
 *
 * @param model chat model name passed to the completions call
 * @param apiKey OpenAI API key used to construct the client
 * @returns async `(input) => string`; resolves to '' when the first choice has no content
 */
function createExecutor(model, apiKey) {
    const OpenAIClient = getOpenAI();
    const client = new OpenAIClient({ apiKey });
    async function runPrompt(input) {
        const request = {
            model,
            messages: [{ role: 'user', content: input }],
            temperature: 0.1,
        };
        const completion = await client.chat.completions.create(request);
        const text = completion.choices[0]?.message?.content;
        return text ?? '';
    }
    return runPrompt;
}
|
|
50
|
+
/**
 * Print the run summary to stdout.
 *
 * Always prints a PASS/FAIL headline with passed/total and the rounded percentage
 * score. On failure it lists up to MAX_FAILED_CASES_TO_SHOW failing inputs, a
 * "+ N more" line when the list is cut, and a CI hint; on success it prints a
 * tip about connecting to the platform.
 *
 * @param result object with `passed`, `total` and `results`
 */
function printSummary(result) {
    const { passed, total, results } = result;
    let score = 0;
    if (total > 0) {
        score = Math.round((passed / total) * 100);
    }
    const failures = results.filter((r) => !r.passed);
    const verdict = failures.length === 0 ? 'PASS' : 'FAIL';
    console.log(`\n${verdict} ${passed}/${total} (score: ${score})\n`);
    if (failures.length === 0) {
        console.log('Tip: Want dashboards and history?');
        console.log('Set EVALAI_API_KEY and connect this to the platform.');
        return;
    }
    const plural = failures.length === 1 ? '' : 's';
    console.log(`${failures.length} failing case${plural}:`);
    const shown = failures.slice(0, MAX_FAILED_CASES_TO_SHOW);
    shown.forEach((r) => {
        const expected = r.expected ?? '(no expected)';
        console.log(`- "${r.input}" → expected: ${expected}`);
    });
    const hidden = failures.length - shown.length;
    if (hidden > 0) {
        console.log(`+ ${hidden} more`);
    }
    console.log('\nGate this in CI:');
    console.log(' npx -y @pauly4010/evalai-sdk@^1 init');
}
|
|
75
|
+
/**
|
|
76
|
+
* Run OpenAI chat regression tests locally.
|
|
77
|
+
* No EvalAI account required. Returns score and prints CI-friendly summary.
|
|
78
|
+
*/
|
|
79
|
+
async function openAIChatEval(options) {
|
|
80
|
+
const { name, model = 'gpt-4o-mini', apiKey, cases } = options;
|
|
81
|
+
const resolvedApiKey = apiKey ?? (typeof process !== 'undefined' && process.env?.OPENAI_API_KEY);
|
|
82
|
+
if (!resolvedApiKey) {
|
|
83
|
+
throw new Error('OPENAI_API_KEY is required. Set it in the environment or pass apiKey to openAIChatEval.');
|
|
84
|
+
}
|
|
85
|
+
const executor = createExecutor(model, resolvedApiKey);
|
|
86
|
+
const suiteCases = cases.map((c) => {
|
|
87
|
+
const assertions = c.assertions
|
|
88
|
+
? [...c.assertions]
|
|
89
|
+
: c.expectedOutput
|
|
90
|
+
? [(output) => (0, assertions_1.expect)(output).toContainKeywords(c.expectedOutput.split(/\s+/).filter(Boolean))]
|
|
91
|
+
: undefined;
|
|
92
|
+
return {
|
|
93
|
+
input: c.input,
|
|
94
|
+
expected: c.expectedOutput,
|
|
95
|
+
assertions,
|
|
96
|
+
};
|
|
97
|
+
});
|
|
98
|
+
const suite = (0, testing_1.createTestSuite)(name, {
|
|
99
|
+
cases: suiteCases,
|
|
100
|
+
executor,
|
|
101
|
+
parallel: true,
|
|
102
|
+
});
|
|
103
|
+
const result = await suite.run();
|
|
104
|
+
const score = result.total > 0 ? Math.round((result.passed / result.total) * 100) : 0;
|
|
105
|
+
const evalResult = {
|
|
106
|
+
passed: result.passed,
|
|
107
|
+
total: result.total,
|
|
108
|
+
score,
|
|
109
|
+
results: result.results,
|
|
110
|
+
durationMs: result.durationMs,
|
|
111
|
+
};
|
|
112
|
+
printSummary(evalResult);
|
|
113
|
+
// v1.5: Optional report to EvalAI platform
|
|
114
|
+
if (options.reportToEvalAI) {
|
|
115
|
+
const config = typeof process !== 'undefined' && process.cwd ? (0, config_1.loadConfig)(process.cwd()) : null;
|
|
116
|
+
const evalId = options.evaluationId || config?.evaluationId;
|
|
117
|
+
if (!evalId || String(evalId).trim() === '') {
|
|
118
|
+
console.log('Run evalai init and set evaluationId to upload results.');
|
|
119
|
+
return evalResult;
|
|
120
|
+
}
|
|
121
|
+
const evalaiKey = (typeof process !== 'undefined' && process.env?.EVALAI_API_KEY) || '';
|
|
122
|
+
if (!evalaiKey) {
|
|
123
|
+
console.log('Set EVALAI_API_KEY to upload results.');
|
|
124
|
+
return evalResult;
|
|
125
|
+
}
|
|
126
|
+
const baseUrl = options.baseUrl ||
|
|
127
|
+
config?.baseUrl ||
|
|
128
|
+
(typeof process !== 'undefined' && process.env?.EVALAI_BASE_URL) ||
|
|
129
|
+
'http://localhost:3000';
|
|
130
|
+
const url = String(baseUrl).replace(/\/$/, '');
|
|
131
|
+
try {
|
|
132
|
+
// Resolve testCaseId for each result: explicit testCaseId in cases, or match by inputHash
|
|
133
|
+
const importResults = [];
|
|
134
|
+
const hasExplicitIds = cases.some((c) => c.testCaseId != null);
|
|
135
|
+
if (hasExplicitIds) {
|
|
136
|
+
// Use testCaseId from cases (same order as results)
|
|
137
|
+
for (let i = 0; i < result.results.length; i++) {
|
|
138
|
+
const tcId = cases[i]?.testCaseId;
|
|
139
|
+
if (tcId == null) {
|
|
140
|
+
console.log('reportToEvalAI: All cases must have testCaseId when any has it.');
|
|
141
|
+
return evalResult;
|
|
142
|
+
}
|
|
143
|
+
importResults.push({
|
|
144
|
+
testCaseId: tcId,
|
|
145
|
+
status: result.results[i].passed ? 'passed' : 'failed',
|
|
146
|
+
output: result.results[i].actual ?? '',
|
|
147
|
+
latencyMs: result.results[i].durationMs,
|
|
148
|
+
});
|
|
149
|
+
}
|
|
150
|
+
}
|
|
151
|
+
else {
|
|
152
|
+
// Match by inputHash (same canonicalization as platform)
|
|
153
|
+
const tcRes = await fetch(`${url}/api/evaluations/${evalId}/test-cases?limit=500`, {
|
|
154
|
+
headers: { Authorization: `Bearer ${evalaiKey}` },
|
|
155
|
+
});
|
|
156
|
+
if (!tcRes.ok) {
|
|
157
|
+
console.log('Could not fetch test cases. Check evaluationId and EVALAI_API_KEY.');
|
|
158
|
+
return evalResult;
|
|
159
|
+
}
|
|
160
|
+
const platformCases = (await tcRes.json());
|
|
161
|
+
const hashToIds = new Map();
|
|
162
|
+
for (const tc of platformCases) {
|
|
163
|
+
const input = tc.input ?? '';
|
|
164
|
+
if (!input.trim())
|
|
165
|
+
continue;
|
|
166
|
+
const hash = (0, input_hash_1.sha256Input)(input);
|
|
167
|
+
const existing = hashToIds.get(hash) ?? [];
|
|
168
|
+
existing.push(tc.id);
|
|
169
|
+
hashToIds.set(hash, existing);
|
|
170
|
+
}
|
|
171
|
+
for (const r of result.results) {
|
|
172
|
+
const hash = (0, input_hash_1.sha256Input)(r.input ?? '');
|
|
173
|
+
const ids = hashToIds.get(hash);
|
|
174
|
+
if (ids == null || ids.length === 0) {
|
|
175
|
+
console.log(`No platform test case matches input: "${(r.input ?? '').slice(0, 50)}…"`);
|
|
176
|
+
return evalResult;
|
|
177
|
+
}
|
|
178
|
+
if (ids.length > 1) {
|
|
179
|
+
console.log(`Multiple platform test cases share the same input (hash collision). Use testCaseId in cases.`);
|
|
180
|
+
return evalResult;
|
|
181
|
+
}
|
|
182
|
+
importResults.push({
|
|
183
|
+
testCaseId: ids[0],
|
|
184
|
+
status: r.passed ? 'passed' : 'failed',
|
|
185
|
+
output: r.actual ?? '',
|
|
186
|
+
latencyMs: r.durationMs,
|
|
187
|
+
});
|
|
188
|
+
}
|
|
189
|
+
}
|
|
190
|
+
if (importResults.length !== result.results.length) {
|
|
191
|
+
console.log('Could not match all results to platform test cases.');
|
|
192
|
+
return evalResult;
|
|
193
|
+
}
|
|
194
|
+
const sdkVersion = '1.4.1';
|
|
195
|
+
const headers = {
|
|
196
|
+
'Content-Type': 'application/json',
|
|
197
|
+
Authorization: `Bearer ${evalaiKey}`,
|
|
198
|
+
};
|
|
199
|
+
if (options.idempotencyKey) {
|
|
200
|
+
headers['Idempotency-Key'] = options.idempotencyKey;
|
|
201
|
+
}
|
|
202
|
+
const importRes = await fetch(`${url}/api/evaluations/${evalId}/runs/import`, {
|
|
203
|
+
method: 'POST',
|
|
204
|
+
headers,
|
|
205
|
+
body: JSON.stringify({
|
|
206
|
+
environment: 'dev',
|
|
207
|
+
results: importResults,
|
|
208
|
+
importClientVersion: sdkVersion,
|
|
209
|
+
}),
|
|
210
|
+
});
|
|
211
|
+
if (!importRes.ok) {
|
|
212
|
+
const body = await importRes.text();
|
|
213
|
+
console.log(`Upload failed: ${importRes.status} — ${body}`);
|
|
214
|
+
return evalResult;
|
|
215
|
+
}
|
|
216
|
+
const importData = (await importRes.json());
|
|
217
|
+
if (importData.dashboardUrl) {
|
|
218
|
+
console.log(`Dashboard: ${importData.dashboardUrl}`);
|
|
219
|
+
}
|
|
220
|
+
}
|
|
221
|
+
catch (err) {
|
|
222
|
+
console.log('Upload failed:', err instanceof Error ? err.message : String(err));
|
|
223
|
+
}
|
|
224
|
+
}
|
|
225
|
+
return evalResult;
|
|
226
|
+
}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Input normalization and hashing for deterministic matching.
|
|
3
|
+
* Must match platform's @/lib/utils/input-hash.ts for reportToEvalAI.
|
|
4
|
+
*/
|
|
5
|
+
/** Normalize input for stable matching (whitespace, JSON key order). */
|
|
6
|
+
export declare function normalizeInput(input: string): string;
|
|
7
|
+
/** SHA-256 hash of normalized input. */
|
|
8
|
+
export declare function sha256Input(s: string): string;
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Input normalization and hashing for deterministic matching.
|
|
4
|
+
* Must match platform's @/lib/utils/input-hash.ts for reportToEvalAI.
|
|
5
|
+
*/
|
|
6
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
7
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
8
|
+
};
|
|
9
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
10
|
+
exports.normalizeInput = normalizeInput;
|
|
11
|
+
exports.sha256Input = sha256Input;
|
|
12
|
+
const crypto_1 = __importDefault(require("crypto"));
|
|
13
|
+
/**
 * Recursively canonicalize a parsed JSON value: object keys are sorted at every
 * depth (including objects nested inside arrays), array order is preserved, and
 * primitives are returned unchanged.
 */
function sortKeys(value) {
    if (Array.isArray(value)) {
        return value.map((item) => sortKeys(item));
    }
    if (value !== null && typeof value === 'object') {
        // Null prototype so a literal "__proto__" key is stored as ordinary data.
        const sorted = Object.create(null);
        for (const k of Object.keys(value).sort()) {
            sorted[k] = sortKeys(value[k]);
        }
        return sorted;
    }
    return value;
}
/**
 * Normalize input for stable matching (whitespace, JSON key order).
 *
 * - Input that parses as a JSON object or array is re-serialized with sorted keys.
 * - Everything else, including JSON primitives such as `42`, `true` or `null`, is
 *   treated as plain text: trimmed, with whitespace runs collapsed to one space.
 *   Sending primitives through the key-sorting path would serialize every number
 *   and boolean as "{}", making distinct inputs hash identically.
 *
 * NOTE(review): the file header says this must match the platform's input-hash
 * implementation; confirm the platform applies the same rules for primitives and
 * arrays.
 *
 * @param input raw test-case input
 * @returns canonical string form used for hashing
 */
function normalizeInput(input) {
    const s = input.trim();
    try {
        const parsed = JSON.parse(s);
        if (parsed !== null && typeof parsed === 'object') {
            return JSON.stringify(sortKeys(parsed));
        }
    }
    catch {
        // Not JSON: fall through to plain-text normalization.
    }
    return s.replace(/\s+/g, ' ');
}
|
|
35
|
+
/**
 * Hex-encoded SHA-256 digest of the normalized form of `s`.
 *
 * @param s raw input; it is passed through normalizeInput before hashing
 * @returns the digest as a hex string
 */
function sha256Input(s) {
    const hasher = crypto_1.default.createHash('sha256');
    const canonical = normalizeInput(s);
    hasher.update(canonical, 'utf8');
    return hasher.digest('hex');
}
|
package/package.json
CHANGED
|
@@ -1,9 +1,12 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@pauly4010/evalai-sdk",
|
|
3
|
-
"version": "1.
|
|
3
|
+
"version": "1.5.0",
|
|
4
4
|
"description": "AI Evaluation Platform SDK - Complete API Coverage with Performance Optimizations",
|
|
5
|
-
"main": "
|
|
6
|
-
"
|
|
5
|
+
"main": "dist/index.js",
|
|
6
|
+
"module": "dist/index.js",
|
|
7
|
+
"types": "dist/index.d.ts",
|
|
8
|
+
"sideEffects": false,
|
|
9
|
+
"files": ["dist", "README.md", "CHANGELOG.md"],
|
|
7
10
|
"bin": {
|
|
8
11
|
"evalai": "./dist/cli/index.js"
|
|
9
12
|
},
|
|
@@ -82,6 +85,10 @@
|
|
|
82
85
|
"./integrations/anthropic": {
|
|
83
86
|
"import": "./dist/integrations/anthropic.js",
|
|
84
87
|
"types": "./dist/integrations/anthropic.d.ts"
|
|
88
|
+
},
|
|
89
|
+
"./integrations/openai-eval": {
|
|
90
|
+
"import": "./dist/integrations/openai-eval.js",
|
|
91
|
+
"types": "./dist/integrations/openai-eval.d.ts"
|
|
85
92
|
}
|
|
86
93
|
}
|
|
87
94
|
}
|
package/.env.example
DELETED
|
Binary file
|
|
@@ -1,174 +0,0 @@
|
|
|
1
|
-
# Additional Issues Found in Second Review
|
|
2
|
-
|
|
3
|
-
## 🔴 Issues Discovered
|
|
4
|
-
|
|
5
|
-
### 1. **process.env Usage in Browser Context** ⚠️ HIGH PRIORITY
|
|
6
|
-
|
|
7
|
-
**Files**: `client.ts` (lines 105, 116, 178)
|
|
8
|
-
|
|
9
|
-
**Problem**: The SDK uses `process.env` directly, which is undefined in browsers:
|
|
10
|
-
|
|
11
|
-
```typescript
|
|
12
|
-
// Line 105
|
|
13
|
-
this.apiKey = config.apiKey || process.env.EVALAI_API_KEY || ...
|
|
14
|
-
|
|
15
|
-
// Line 116
|
|
16
|
-
const orgIdFromEnv = process.env.EVALAI_ORGANIZATION_ID || ...
|
|
17
|
-
|
|
18
|
-
// Line 178 (in static init method)
|
|
19
|
-
baseUrl: process.env.EVALAI_BASE_URL,
|
|
20
|
-
```
|
|
21
|
-
|
|
22
|
-
**Impact**:
|
|
23
|
-
- Will cause "Cannot read property of undefined" errors in browsers
|
|
24
|
-
- Breaks zero-config initialization in browsers
|
|
25
|
-
- `AIEvalClient.init()` won't work in browsers
|
|
26
|
-
|
|
27
|
-
**Severity**: HIGH - Core functionality breaks in browsers
|
|
28
|
-
|
|
29
|
-
---
|
|
30
|
-
|
|
31
|
-
### 2. **Type Name Collision** 🟡 MEDIUM PRIORITY
|
|
32
|
-
|
|
33
|
-
**Files**: `types.ts` (line 209) and `testing.ts` (line 27)
|
|
34
|
-
|
|
35
|
-
**Problem**: Two different `TestCase` interfaces with same name but different purposes:
|
|
36
|
-
|
|
37
|
-
**types.ts** (Database Model):
|
|
38
|
-
```typescript
|
|
39
|
-
export interface TestCase {
|
|
40
|
-
id: number;
|
|
41
|
-
evaluationId: number;
|
|
42
|
-
input: string;
|
|
43
|
-
expectedOutput: string | null;
|
|
44
|
-
metadata: Record<string, any> | null;
|
|
45
|
-
createdAt: string;
|
|
46
|
-
}
|
|
47
|
-
```
|
|
48
|
-
|
|
49
|
-
**testing.ts** (Test Suite Model):
|
|
50
|
-
```typescript
|
|
51
|
-
export interface TestCase {
|
|
52
|
-
id?: string;
|
|
53
|
-
input: string;
|
|
54
|
-
expected?: string;
|
|
55
|
-
metadata?: Record<string, any>;
|
|
56
|
-
assertions?: ((output: string) => AssertionResult)[];
|
|
57
|
-
}
|
|
58
|
-
```
|
|
59
|
-
|
|
60
|
-
**Impact**:
|
|
61
|
-
- Confusing for developers
|
|
62
|
-
- IDE autocomplete shows wrong interface
|
|
63
|
-
- Only `types.ts` version is exported from index.ts (line 117)
|
|
64
|
-
- Could cause type errors if both are imported
|
|
65
|
-
|
|
66
|
-
**Severity**: MEDIUM - Causes confusion but only types.ts version is publicly exported
|
|
67
|
-
|
|
68
|
-
---
|
|
69
|
-
|
|
70
|
-
### 3. **Dynamic Import Pattern in export.ts** 🟢 LOW PRIORITY
|
|
71
|
-
|
|
72
|
-
**Files**: `export.ts` (lines 296, 316)
|
|
73
|
-
|
|
74
|
-
**Pattern**:
|
|
75
|
-
```typescript
|
|
76
|
-
const fs = await import('fs');
|
|
77
|
-
fs.writeFileSync(filePath, ...);
|
|
78
|
-
```
|
|
79
|
-
|
|
80
|
-
**Issue**:
|
|
81
|
-
- Dynamic import returns a module namespace object
|
|
82
|
-
- Works but is unusual pattern (normally use static imports in Node.js-only files)
|
|
83
|
-
- Could fail in some bundler configurations
|
|
84
|
-
|
|
85
|
-
**Impact**:
|
|
86
|
-
- Works but non-standard
|
|
87
|
-
- Tree-shaking friendly but unnecessary for Node.js-only code
|
|
88
|
-
- Some bundlers might have issues
|
|
89
|
-
|
|
90
|
-
**Severity**: LOW - Works but not best practice
|
|
91
|
-
|
|
92
|
-
---
|
|
93
|
-
|
|
94
|
-
### 4. **TypeScript Module Configuration** 🟢 INFO
|
|
95
|
-
|
|
96
|
-
**File**: `tsconfig.json`
|
|
97
|
-
|
|
98
|
-
**Current**:
|
|
99
|
-
```json
|
|
100
|
-
{
|
|
101
|
-
"module": "commonjs"
|
|
102
|
-
}
|
|
103
|
-
```
|
|
104
|
-
|
|
105
|
-
**Observation**:
|
|
106
|
-
- Using CommonJS but package.json has ES module exports
|
|
107
|
-
- CLI uses `.js` extensions in imports (which is correct for ES modules)
|
|
108
|
-
- Mismatch between TypeScript config and runtime expectations
|
|
109
|
-
|
|
110
|
-
**Impact**:
|
|
111
|
-
- May cause issues with module resolution
|
|
112
|
-
- CLI imports might not work as expected
|
|
113
|
-
- Bundlers might be confused
|
|
114
|
-
|
|
115
|
-
**Severity**: INFO - Currently working but could cause subtle issues
|
|
116
|
-
|
|
117
|
-
---
|
|
118
|
-
|
|
119
|
-
## 📊 Summary
|
|
120
|
-
|
|
121
|
-
| Issue | Severity | Impact | Affected |
|
|
122
|
-
|-------|----------|--------|----------|
|
|
123
|
-
| process.env in browser | 🔴 HIGH | Breaks in browsers | Core client |
|
|
124
|
-
| TestCase collision | 🟡 MEDIUM | Developer confusion | Types |
|
|
125
|
-
| Dynamic imports | 🟢 LOW | Unusual pattern | export.ts |
|
|
126
|
-
| Module config | 🟢 INFO | Potential confusion | Build system |
|
|
127
|
-
|
|
128
|
-
---
|
|
129
|
-
|
|
130
|
-
## ✅ Recommended Fixes
|
|
131
|
-
|
|
132
|
-
### Fix 1: Safe process.env Access
|
|
133
|
-
|
|
134
|
-
Add helper function:
|
|
135
|
-
```typescript
|
|
136
|
-
// utils.ts or client.ts
|
|
137
|
-
function getEnvVar(name: string): string | undefined {
|
|
138
|
-
if (typeof process !== 'undefined' && process.env) {
|
|
139
|
-
return process.env[name];
|
|
140
|
-
}
|
|
141
|
-
return undefined;
|
|
142
|
-
}
|
|
143
|
-
```
|
|
144
|
-
|
|
145
|
-
Then use:
|
|
146
|
-
```typescript
|
|
147
|
-
this.apiKey = config.apiKey || getEnvVar('EVALAI_API_KEY') || ...
|
|
148
|
-
```
|
|
149
|
-
|
|
150
|
-
### Fix 2: Rename Test Suite TestCase
|
|
151
|
-
|
|
152
|
-
Rename in `testing.ts`:
|
|
153
|
-
```typescript
|
|
154
|
-
export interface TestSuiteCase { // Was: TestCase
|
|
155
|
-
id?: string;
|
|
156
|
-
input: string;
|
|
157
|
-
expected?: string;
|
|
158
|
-
// ...
|
|
159
|
-
}
|
|
160
|
-
```
|
|
161
|
-
|
|
162
|
-
### Fix 3: Static Imports in export.ts
|
|
163
|
-
|
|
164
|
-
Since already checked for Node.js environment:
|
|
165
|
-
```typescript
|
|
166
|
-
import * as fs from 'fs'; // Instead of: const fs = await import('fs')
|
|
167
|
-
```
|
|
168
|
-
|
|
169
|
-
### Fix 4: Consider ES Modules
|
|
170
|
-
|
|
171
|
-
Either:
|
|
172
|
-
- Change tsconfig to `"module": "es2020"`
|
|
173
|
-
- Or change package.json exports to use `.cjs` extensions
|
|
174
|
-
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
export {};
|