@pauly4010/evalai-sdk 1.8.0 → 1.9.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +54 -0
- package/README.md +136 -23
- package/dist/assertions.js +51 -18
- package/dist/batch.js +8 -2
- package/dist/cli/api.js +3 -1
- package/dist/cli/check.js +19 -6
- package/dist/cli/ci-context.js +3 -1
- package/dist/cli/ci.d.ts +45 -0
- package/dist/cli/ci.js +192 -0
- package/dist/cli/config.js +28 -8
- package/dist/cli/diff.d.ts +173 -0
- package/dist/cli/diff.js +685 -0
- package/dist/cli/discover.d.ts +84 -0
- package/dist/cli/discover.js +419 -0
- package/dist/cli/doctor.js +62 -19
- package/dist/cli/env.d.ts +21 -0
- package/dist/cli/env.js +42 -0
- package/dist/cli/explain.js +168 -36
- package/dist/cli/formatters/human.js +4 -1
- package/dist/cli/formatters/pr-comment.js +3 -1
- package/dist/cli/gate.js +6 -2
- package/dist/cli/impact-analysis.d.ts +63 -0
- package/dist/cli/impact-analysis.js +252 -0
- package/dist/cli/index.js +185 -0
- package/dist/cli/manifest.d.ts +103 -0
- package/dist/cli/manifest.js +282 -0
- package/dist/cli/migrate.d.ts +41 -0
- package/dist/cli/migrate.js +349 -0
- package/dist/cli/policy-packs.js +8 -2
- package/dist/cli/print-config.js +33 -14
- package/dist/cli/regression-gate.js +8 -2
- package/dist/cli/report/build-check-report.js +8 -2
- package/dist/cli/run.d.ts +101 -0
- package/dist/cli/run.js +395 -0
- package/dist/cli/share.js +3 -1
- package/dist/cli/upgrade.js +2 -1
- package/dist/cli/workspace.d.ts +28 -0
- package/dist/cli/workspace.js +58 -0
- package/dist/client.d.ts +16 -19
- package/dist/client.js +60 -43
- package/dist/client.request.test.d.ts +1 -1
- package/dist/client.request.test.js +222 -147
- package/dist/context.js +3 -1
- package/dist/errors.js +11 -4
- package/dist/export.js +3 -1
- package/dist/index.d.ts +8 -2
- package/dist/index.js +30 -5
- package/dist/integrations/anthropic.d.ts +20 -1
- package/dist/integrations/openai-eval.js +4 -2
- package/dist/integrations/openai.d.ts +24 -1
- package/dist/local.js +3 -1
- package/dist/logger.js +6 -2
- package/dist/pagination.js +6 -2
- package/dist/runtime/adapters/config-to-dsl.d.ts +33 -0
- package/dist/runtime/adapters/config-to-dsl.js +394 -0
- package/dist/runtime/adapters/testsuite-to-dsl.d.ts +63 -0
- package/dist/runtime/adapters/testsuite-to-dsl.js +276 -0
- package/dist/runtime/context.d.ts +26 -0
- package/dist/runtime/context.js +74 -0
- package/dist/runtime/eval.d.ts +46 -0
- package/dist/runtime/eval.js +244 -0
- package/dist/runtime/execution-mode.d.ts +80 -0
- package/dist/runtime/execution-mode.js +357 -0
- package/dist/runtime/executor.d.ts +16 -0
- package/dist/runtime/executor.js +152 -0
- package/dist/runtime/registry.d.ts +78 -0
- package/dist/runtime/registry.js +403 -0
- package/dist/runtime/run-report.d.ts +200 -0
- package/dist/runtime/run-report.js +222 -0
- package/dist/runtime/types.d.ts +356 -0
- package/dist/runtime/types.js +76 -0
- package/dist/testing.d.ts +65 -0
- package/dist/testing.js +49 -2
- package/dist/types.d.ts +100 -69
- package/dist/utils/input-hash.js +4 -1
- package/dist/version.d.ts +1 -1
- package/dist/version.js +1 -1
- package/dist/workflows.js +62 -14
- package/package.json +115 -110
package/dist/testing.d.ts
CHANGED
|
@@ -90,6 +90,47 @@ export interface TestSuiteResult {
|
|
|
90
90
|
/** Case IDs that were retried (flaky recovery) */
|
|
91
91
|
retriedCases?: string[];
|
|
92
92
|
}
|
|
93
|
+
/**
|
|
94
|
+
* Test definition for introspection
|
|
95
|
+
* COMPAT-201: Public TestSuite introspection (minimal getters)
|
|
96
|
+
*/
|
|
97
|
+
export interface TestDefinition {
|
|
98
|
+
/** Test case ID */
|
|
99
|
+
id: string;
|
|
100
|
+
/** Test input */
|
|
101
|
+
input: string;
|
|
102
|
+
/** Expected output */
|
|
103
|
+
expected?: string;
|
|
104
|
+
/** Test metadata */
|
|
105
|
+
metadata?: Record<string, unknown>;
|
|
106
|
+
/** Whether test has assertions */
|
|
107
|
+
hasAssertions: boolean;
|
|
108
|
+
/** Number of assertions */
|
|
109
|
+
assertionCount: number;
|
|
110
|
+
}
|
|
111
|
+
/**
|
|
112
|
+
* Portable suite representation
|
|
113
|
+
* COMPAT-201: Public TestSuite introspection (minimal getters)
|
|
114
|
+
*/
|
|
115
|
+
export interface PortableSuite {
|
|
116
|
+
/** Suite name */
|
|
117
|
+
name: string;
|
|
118
|
+
/** Suite configuration */
|
|
119
|
+
config: TestSuiteConfig;
|
|
120
|
+
/** Test definitions */
|
|
121
|
+
tests: TestDefinition[];
|
|
122
|
+
/** Suite metadata */
|
|
123
|
+
metadata: {
|
|
124
|
+
suiteName?: string;
|
|
125
|
+
tags?: string[];
|
|
126
|
+
defaults?: {
|
|
127
|
+
timeout?: number;
|
|
128
|
+
parallel?: boolean;
|
|
129
|
+
stopOnFailure?: boolean;
|
|
130
|
+
retries?: number;
|
|
131
|
+
};
|
|
132
|
+
};
|
|
133
|
+
}
|
|
93
134
|
/**
|
|
94
135
|
* Test Suite for declarative evaluation testing
|
|
95
136
|
*/
|
|
@@ -115,6 +156,30 @@ export declare class TestSuite {
|
|
|
115
156
|
* Get suite configuration
|
|
116
157
|
*/
|
|
117
158
|
getConfig(): TestSuiteConfig;
|
|
159
|
+
/**
|
|
160
|
+
* Get test definitions for introspection
|
|
161
|
+
* COMPAT-201: Public TestSuite introspection (minimal getters)
|
|
162
|
+
*/
|
|
163
|
+
getTests(): TestDefinition[];
|
|
164
|
+
/**
|
|
165
|
+
* Get suite metadata for introspection
|
|
166
|
+
* COMPAT-201: Public TestSuite introspection (minimal getters)
|
|
167
|
+
*/
|
|
168
|
+
getMetadata(): {
|
|
169
|
+
suiteName?: string;
|
|
170
|
+
tags?: string[];
|
|
171
|
+
defaults?: {
|
|
172
|
+
timeout?: number;
|
|
173
|
+
parallel?: boolean;
|
|
174
|
+
stopOnFailure?: boolean;
|
|
175
|
+
retries?: number;
|
|
176
|
+
};
|
|
177
|
+
};
|
|
178
|
+
/**
|
|
179
|
+
* Convert to portable suite representation
|
|
180
|
+
* COMPAT-201: Public TestSuite introspection (minimal getters)
|
|
181
|
+
*/
|
|
182
|
+
toJSON(): PortableSuite;
|
|
118
183
|
}
|
|
119
184
|
/**
|
|
120
185
|
* Create a test suite
|
package/dist/testing.js
CHANGED
|
@@ -59,7 +59,10 @@ class TestSuite {
|
|
|
59
59
|
if (this.config.executor) {
|
|
60
60
|
const timeout = this.config.timeout || 30000;
|
|
61
61
|
const timeoutPromise = new Promise((_, reject) => setTimeout(() => reject(new Error(`Test timeout after ${timeout}ms`)), timeout));
|
|
62
|
-
actual = await Promise.race([
|
|
62
|
+
actual = await Promise.race([
|
|
63
|
+
this.config.executor(testCase.input),
|
|
64
|
+
timeoutPromise,
|
|
65
|
+
]);
|
|
63
66
|
}
|
|
64
67
|
else if (testCase.expected) {
|
|
65
68
|
actual = testCase.expected; // Use expected as actual if no executor
|
|
@@ -127,7 +130,9 @@ class TestSuite {
|
|
|
127
130
|
const retriedCases = [];
|
|
128
131
|
const retries = this.config.retries ?? 0;
|
|
129
132
|
if (retries > 0 && results.length > 0) {
|
|
130
|
-
const failingIndices = results
|
|
133
|
+
const failingIndices = results
|
|
134
|
+
.map((r, i) => (r.passed ? -1 : i))
|
|
135
|
+
.filter((i) => i >= 0);
|
|
131
136
|
for (let attempt = 0; attempt < retries && failingIndices.length > 0; attempt++) {
|
|
132
137
|
const toRetry = [...failingIndices];
|
|
133
138
|
failingIndices.length = 0;
|
|
@@ -169,6 +174,48 @@ class TestSuite {
|
|
|
169
174
|
getConfig() {
|
|
170
175
|
return { ...this.config };
|
|
171
176
|
}
|
|
177
|
+
/**
|
|
178
|
+
* Get test definitions for introspection
|
|
179
|
+
* COMPAT-201: Public TestSuite introspection (minimal getters)
|
|
180
|
+
*/
|
|
181
|
+
getTests() {
|
|
182
|
+
return this.config.cases.map((testCase, index) => ({
|
|
183
|
+
id: testCase.id || `case-${index}`,
|
|
184
|
+
input: testCase.input,
|
|
185
|
+
expected: testCase.expected,
|
|
186
|
+
metadata: testCase.metadata,
|
|
187
|
+
hasAssertions: !!testCase.assertions && testCase.assertions.length > 0,
|
|
188
|
+
assertionCount: testCase.assertions?.length || 0,
|
|
189
|
+
}));
|
|
190
|
+
}
|
|
191
|
+
/**
|
|
192
|
+
* Get suite metadata for introspection
|
|
193
|
+
* COMPAT-201: Public TestSuite introspection (minimal getters)
|
|
194
|
+
*/
|
|
195
|
+
getMetadata() {
|
|
196
|
+
return {
|
|
197
|
+
suiteName: this.name,
|
|
198
|
+
tags: [], // TestSuite doesn't have tags, but include for future compatibility
|
|
199
|
+
defaults: {
|
|
200
|
+
timeout: this.config.timeout,
|
|
201
|
+
parallel: this.config.parallel,
|
|
202
|
+
stopOnFailure: this.config.stopOnFailure,
|
|
203
|
+
retries: this.config.retries,
|
|
204
|
+
},
|
|
205
|
+
};
|
|
206
|
+
}
|
|
207
|
+
/**
|
|
208
|
+
* Convert to portable suite representation
|
|
209
|
+
* COMPAT-201: Public TestSuite introspection (minimal getters)
|
|
210
|
+
*/
|
|
211
|
+
toJSON() {
|
|
212
|
+
return {
|
|
213
|
+
name: this.name,
|
|
214
|
+
config: this.getConfig(),
|
|
215
|
+
tests: this.getTests(),
|
|
216
|
+
metadata: this.getMetadata(),
|
|
217
|
+
};
|
|
218
|
+
}
|
|
172
219
|
}
|
|
173
220
|
exports.TestSuite = TestSuite;
|
|
174
221
|
/**
|
package/dist/types.d.ts
CHANGED
|
@@ -150,11 +150,15 @@ export interface Span<TMetadata = Record<string, unknown>> {
|
|
|
150
150
|
export interface CreateSpanParams<TMetadata = Record<string, unknown>> {
|
|
151
151
|
name: string;
|
|
152
152
|
spanId: string;
|
|
153
|
+
type: string;
|
|
153
154
|
parentSpanId?: string;
|
|
154
155
|
startTime: string;
|
|
155
156
|
endTime?: string;
|
|
156
157
|
durationMs?: number;
|
|
158
|
+
input?: unknown;
|
|
159
|
+
output?: unknown;
|
|
157
160
|
metadata?: TMetadata;
|
|
161
|
+
evaluationRunId?: number | null;
|
|
158
162
|
}
|
|
159
163
|
/**
|
|
160
164
|
* Evaluation object representing a test evaluation
|
|
@@ -227,17 +231,30 @@ export interface CreateTestCaseParams {
|
|
|
227
231
|
export interface EvaluationRun {
|
|
228
232
|
id: number;
|
|
229
233
|
evaluationId: number;
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
234
|
+
organizationId: number;
|
|
235
|
+
status: string;
|
|
236
|
+
totalCases: number | null;
|
|
237
|
+
passedCases: number | null;
|
|
238
|
+
failedCases: number | null;
|
|
239
|
+
environment: string | null;
|
|
240
|
+
startedAt: string | null;
|
|
233
241
|
completedAt: string | null;
|
|
242
|
+
createdAt: string;
|
|
243
|
+
}
|
|
244
|
+
/**
|
|
245
|
+
* Result of getRun — includes the run and its test results
|
|
246
|
+
*/
|
|
247
|
+
export interface EvaluationRunDetail {
|
|
248
|
+
run: EvaluationRun;
|
|
249
|
+
results: Array<Record<string, unknown>>;
|
|
250
|
+
baselineResults?: Array<Record<string, unknown>>;
|
|
251
|
+
compareRunId?: number;
|
|
234
252
|
}
|
|
235
253
|
/**
|
|
236
254
|
* Parameters for creating an evaluation run
|
|
237
255
|
*/
|
|
238
256
|
export interface CreateRunParams {
|
|
239
|
-
|
|
240
|
-
results?: Record<string, unknown>;
|
|
257
|
+
environment?: string;
|
|
241
258
|
}
|
|
242
259
|
/**
|
|
243
260
|
* LLM Judge evaluation result
|
|
@@ -252,6 +269,15 @@ export interface LLMJudgeResult {
|
|
|
252
269
|
metadata: Record<string, unknown> | null;
|
|
253
270
|
createdAt: string;
|
|
254
271
|
}
|
|
272
|
+
/**
|
|
273
|
+
* Result of a single LLM judge evaluation call
|
|
274
|
+
*/
|
|
275
|
+
export interface LLMJudgeEvaluateResult {
|
|
276
|
+
score: number;
|
|
277
|
+
reasoning: string;
|
|
278
|
+
passed: boolean;
|
|
279
|
+
details: unknown;
|
|
280
|
+
}
|
|
255
281
|
/**
|
|
256
282
|
* Parameters for running an LLM judge evaluation
|
|
257
283
|
*/
|
|
@@ -295,6 +321,10 @@ export declare class SDKError extends Error {
|
|
|
295
321
|
export type AIEvalConfig = ClientConfig;
|
|
296
322
|
export type TraceData<TMetadata = unknown> = Trace<TMetadata>;
|
|
297
323
|
export type SpanData<TMetadata = unknown> = Span<TMetadata>;
|
|
324
|
+
export interface TraceDetail<TMetadata = Record<string, unknown>> {
|
|
325
|
+
trace: Trace<TMetadata>;
|
|
326
|
+
spans: Span<TMetadata>[];
|
|
327
|
+
}
|
|
298
328
|
export type EvaluationData<TMetadata = unknown> = Evaluation<TMetadata>;
|
|
299
329
|
export type LLMJudgeData = LLMJudgeResult;
|
|
300
330
|
export type AnnotationData = unknown;
|
|
@@ -537,7 +567,7 @@ export interface Webhook {
|
|
|
537
567
|
events: string[];
|
|
538
568
|
secret: string;
|
|
539
569
|
status: "active" | "inactive";
|
|
540
|
-
|
|
570
|
+
lastDeliveredAt: string | null;
|
|
541
571
|
createdAt: string;
|
|
542
572
|
updatedAt: string;
|
|
543
573
|
}
|
|
@@ -572,12 +602,12 @@ export interface ListWebhooksParams {
|
|
|
572
602
|
export interface WebhookDelivery {
|
|
573
603
|
id: number;
|
|
574
604
|
webhookId: number;
|
|
575
|
-
|
|
605
|
+
eventType: string;
|
|
576
606
|
payload: Record<string, unknown>;
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
607
|
+
responseBody: string | null;
|
|
608
|
+
responseStatus: number | null;
|
|
609
|
+
status: string;
|
|
610
|
+
attemptCount: number;
|
|
581
611
|
createdAt: string;
|
|
582
612
|
}
|
|
583
613
|
/**
|
|
@@ -586,51 +616,59 @@ export interface WebhookDelivery {
|
|
|
586
616
|
export interface ListWebhookDeliveriesParams {
|
|
587
617
|
limit?: number;
|
|
588
618
|
offset?: number;
|
|
589
|
-
|
|
619
|
+
status?: "success" | "failed" | "pending";
|
|
590
620
|
}
|
|
591
621
|
/**
|
|
592
622
|
* Usage statistics
|
|
593
623
|
*/
|
|
594
624
|
export interface UsageStats {
|
|
595
|
-
|
|
625
|
+
analytics: {
|
|
626
|
+
totalRequests: number;
|
|
627
|
+
avgResponseTime: number;
|
|
628
|
+
errorRate: number;
|
|
629
|
+
successRate: number;
|
|
630
|
+
groupedData: Array<{
|
|
631
|
+
key: string;
|
|
632
|
+
count: number;
|
|
633
|
+
avgResponseTime: number;
|
|
634
|
+
}>;
|
|
635
|
+
};
|
|
596
636
|
period: {
|
|
597
637
|
start: string;
|
|
598
638
|
end: string;
|
|
599
639
|
};
|
|
600
|
-
traces: {
|
|
601
|
-
total: number;
|
|
602
|
-
byStatus: Record<string, number>;
|
|
603
|
-
};
|
|
604
|
-
evaluations: {
|
|
605
|
-
total: number;
|
|
606
|
-
byType: Record<string, number>;
|
|
607
|
-
};
|
|
608
|
-
apiCalls: {
|
|
609
|
-
total: number;
|
|
610
|
-
byEndpoint: Record<string, number>;
|
|
611
|
-
};
|
|
612
640
|
}
|
|
613
641
|
/**
|
|
614
642
|
* Parameters for getting usage stats
|
|
615
643
|
*/
|
|
616
644
|
export interface GetUsageParams {
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
645
|
+
period?: "7d" | "30d" | "90d";
|
|
646
|
+
groupBy?: "endpoint" | "method" | "day";
|
|
647
|
+
limit?: number;
|
|
648
|
+
offset?: number;
|
|
620
649
|
}
|
|
621
650
|
/**
|
|
622
651
|
* Usage summary
|
|
623
652
|
*/
|
|
624
653
|
export interface UsageSummary {
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
654
|
+
summary: {
|
|
655
|
+
totalRequests: number;
|
|
656
|
+
avgResponseTime: number;
|
|
657
|
+
minResponseTime: number;
|
|
658
|
+
maxResponseTime: number;
|
|
659
|
+
errorRate: number;
|
|
660
|
+
successRate: number;
|
|
661
|
+
requestsByStatusCode: Record<string, number>;
|
|
662
|
+
topEndpoints: Array<{
|
|
663
|
+
endpoint: string;
|
|
664
|
+
count: number;
|
|
665
|
+
}>;
|
|
666
|
+
requestsOverTime: Array<{
|
|
667
|
+
date: string;
|
|
668
|
+
count: number;
|
|
669
|
+
}>;
|
|
631
670
|
};
|
|
632
|
-
|
|
633
|
-
billingPeriod: {
|
|
671
|
+
period: {
|
|
634
672
|
start: string;
|
|
635
673
|
end: string;
|
|
636
674
|
};
|
|
@@ -641,13 +679,12 @@ export interface UsageSummary {
|
|
|
641
679
|
export interface LLMJudgeConfig {
|
|
642
680
|
id: number;
|
|
643
681
|
name: string;
|
|
644
|
-
description: string | null;
|
|
645
682
|
model: string;
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
683
|
+
promptTemplate: string;
|
|
684
|
+
criteria: unknown;
|
|
685
|
+
settings: unknown;
|
|
649
686
|
organizationId: number;
|
|
650
|
-
createdBy:
|
|
687
|
+
createdBy: string;
|
|
651
688
|
createdAt: string;
|
|
652
689
|
updatedAt: string;
|
|
653
690
|
}
|
|
@@ -656,13 +693,10 @@ export interface LLMJudgeConfig {
|
|
|
656
693
|
*/
|
|
657
694
|
export interface CreateLLMJudgeConfigParams {
|
|
658
695
|
name: string;
|
|
659
|
-
description?: string;
|
|
660
696
|
model: string;
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
organizationId: number;
|
|
665
|
-
createdBy: number;
|
|
697
|
+
promptTemplate: string;
|
|
698
|
+
criteria?: Record<string, unknown>;
|
|
699
|
+
settings?: Record<string, unknown>;
|
|
666
700
|
}
|
|
667
701
|
/**
|
|
668
702
|
* Parameters for listing LLM judge configs
|
|
@@ -685,28 +719,25 @@ export interface ListLLMJudgeResultsParams {
|
|
|
685
719
|
* LLM Judge alignment analysis
|
|
686
720
|
*/
|
|
687
721
|
export interface LLMJudgeAlignment {
|
|
688
|
-
|
|
689
|
-
|
|
690
|
-
|
|
691
|
-
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
correlation: number;
|
|
722
|
+
alignmentData: Array<{
|
|
723
|
+
testCaseId: number;
|
|
724
|
+
humanScore: number;
|
|
725
|
+
judgeScore: number;
|
|
726
|
+
alignment: number;
|
|
727
|
+
}>;
|
|
728
|
+
metrics: {
|
|
729
|
+
averageAlignment: number;
|
|
730
|
+
totalComparisons: number;
|
|
731
|
+
highAlignment: number;
|
|
732
|
+
lowAlignment: number;
|
|
733
|
+
alignmentRate: number;
|
|
701
734
|
};
|
|
702
735
|
}
|
|
703
736
|
/**
|
|
704
737
|
* Parameters for getting alignment analysis
|
|
705
738
|
*/
|
|
706
739
|
export interface GetLLMJudgeAlignmentParams {
|
|
707
|
-
|
|
708
|
-
startDate?: string;
|
|
709
|
-
endDate?: string;
|
|
740
|
+
evaluationRunId: number;
|
|
710
741
|
}
|
|
711
742
|
/**
|
|
712
743
|
* Organization object
|
|
@@ -714,10 +745,10 @@ export interface GetLLMJudgeAlignmentParams {
|
|
|
714
745
|
export interface Organization {
|
|
715
746
|
id: number;
|
|
716
747
|
name: string;
|
|
717
|
-
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
|
|
748
|
+
role?: string;
|
|
749
|
+
slug?: string;
|
|
750
|
+
plan?: string;
|
|
751
|
+
status?: string;
|
|
752
|
+
createdAt?: string;
|
|
753
|
+
updatedAt?: string;
|
|
723
754
|
}
|
package/dist/utils/input-hash.js
CHANGED
|
@@ -34,5 +34,8 @@ function normalizeInput(input) {
|
|
|
34
34
|
}
|
|
35
35
|
/** SHA-256 hash of normalized input. */
|
|
36
36
|
function sha256Input(s) {
|
|
37
|
-
return node_crypto_1.default
|
|
37
|
+
return node_crypto_1.default
|
|
38
|
+
.createHash("sha256")
|
|
39
|
+
.update(normalizeInput(s), "utf8")
|
|
40
|
+
.digest("hex");
|
|
38
41
|
}
|
package/dist/version.d.ts
CHANGED
package/dist/version.js
CHANGED
|
@@ -6,5 +6,5 @@ exports.SPEC_VERSION = exports.SDK_VERSION = void 0;
|
|
|
6
6
|
* X-EvalAI-SDK-Version: SDK package version
|
|
7
7
|
* X-EvalAI-Spec-Version: OpenAPI spec version (docs/openapi.json info.version)
|
|
8
8
|
*/
|
|
9
|
-
exports.SDK_VERSION = "1.
|
|
9
|
+
exports.SDK_VERSION = "1.9.1";
|
|
10
10
|
exports.SPEC_VERSION = "1.0.0";
|
package/dist/workflows.js
CHANGED
|
@@ -192,6 +192,7 @@ class WorkflowTracer {
|
|
|
192
192
|
await this.client.traces.createSpan(this.currentWorkflow.traceId, {
|
|
193
193
|
name: `Agent: ${agentName}`,
|
|
194
194
|
spanId,
|
|
195
|
+
type: "agent",
|
|
195
196
|
parentSpanId,
|
|
196
197
|
startTime,
|
|
197
198
|
metadata: (0, context_1.mergeWithContext)({
|
|
@@ -215,6 +216,7 @@ class WorkflowTracer {
|
|
|
215
216
|
await this.client.traces.createSpan(this.currentWorkflow.traceId, {
|
|
216
217
|
name: `Agent: ${span.agentName}`,
|
|
217
218
|
spanId: `${span.spanId}-end`,
|
|
219
|
+
type: "agent",
|
|
218
220
|
parentSpanId: span.spanId,
|
|
219
221
|
startTime: span.startTime,
|
|
220
222
|
endTime,
|
|
@@ -226,7 +228,11 @@ class WorkflowTracer {
|
|
|
226
228
|
}),
|
|
227
229
|
});
|
|
228
230
|
this.activeSpans.delete(span.spanId);
|
|
229
|
-
this.log("Ended agent span", {
|
|
231
|
+
this.log("Ended agent span", {
|
|
232
|
+
agentName: span.agentName,
|
|
233
|
+
spanId: span.spanId,
|
|
234
|
+
durationMs,
|
|
235
|
+
});
|
|
230
236
|
}
|
|
231
237
|
// ==========================================================================
|
|
232
238
|
// HANDOFFS
|
|
@@ -261,6 +267,7 @@ class WorkflowTracer {
|
|
|
261
267
|
await this.client.traces.createSpan(this.currentWorkflow.traceId, {
|
|
262
268
|
name: `Handoff: ${fromAgent || "start"} → ${toAgent}`,
|
|
263
269
|
spanId,
|
|
270
|
+
type: "handoff",
|
|
264
271
|
startTime: handoff.timestamp,
|
|
265
272
|
endTime: handoff.timestamp,
|
|
266
273
|
durationMs: 0,
|
|
@@ -306,6 +313,7 @@ class WorkflowTracer {
|
|
|
306
313
|
await this.client.traces.createSpan(this.currentWorkflow.traceId, {
|
|
307
314
|
name: `Decision: ${params.agent} chose ${params.chosen}`,
|
|
308
315
|
spanId,
|
|
316
|
+
type: "decision",
|
|
309
317
|
startTime: timestamp,
|
|
310
318
|
endTime: timestamp,
|
|
311
319
|
durationMs: 0,
|
|
@@ -369,6 +377,7 @@ class WorkflowTracer {
|
|
|
369
377
|
await this.client.traces.createSpan(this.currentWorkflow.traceId, {
|
|
370
378
|
name: `Cost: ${params.provider}/${params.model}`,
|
|
371
379
|
spanId,
|
|
380
|
+
type: "cost",
|
|
372
381
|
startTime: timestamp,
|
|
373
382
|
endTime: timestamp,
|
|
374
383
|
durationMs: 0,
|
|
@@ -417,23 +426,62 @@ class WorkflowTracer {
|
|
|
417
426
|
// Default pricing (can be extended with API lookup)
|
|
418
427
|
const knownPricing = {
|
|
419
428
|
// OpenAI
|
|
420
|
-
"openai/gpt-4": {
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
"openai/gpt-
|
|
429
|
+
"openai/gpt-4": {
|
|
430
|
+
inputPricePerMillion: 30.0,
|
|
431
|
+
outputPricePerMillion: 60.0,
|
|
432
|
+
},
|
|
433
|
+
"openai/gpt-4-turbo": {
|
|
434
|
+
inputPricePerMillion: 10.0,
|
|
435
|
+
outputPricePerMillion: 30.0,
|
|
436
|
+
},
|
|
437
|
+
"openai/gpt-4o": {
|
|
438
|
+
inputPricePerMillion: 5.0,
|
|
439
|
+
outputPricePerMillion: 15.0,
|
|
440
|
+
},
|
|
441
|
+
"openai/gpt-4o-mini": {
|
|
442
|
+
inputPricePerMillion: 0.15,
|
|
443
|
+
outputPricePerMillion: 0.6,
|
|
444
|
+
},
|
|
445
|
+
"openai/gpt-3.5-turbo": {
|
|
446
|
+
inputPricePerMillion: 0.5,
|
|
447
|
+
outputPricePerMillion: 1.5,
|
|
448
|
+
},
|
|
425
449
|
// Anthropic
|
|
426
|
-
"anthropic/claude-3-opus": {
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
450
|
+
"anthropic/claude-3-opus": {
|
|
451
|
+
inputPricePerMillion: 15.0,
|
|
452
|
+
outputPricePerMillion: 75.0,
|
|
453
|
+
},
|
|
454
|
+
"anthropic/claude-3-sonnet": {
|
|
455
|
+
inputPricePerMillion: 3.0,
|
|
456
|
+
outputPricePerMillion: 15.0,
|
|
457
|
+
},
|
|
458
|
+
"anthropic/claude-3-haiku": {
|
|
459
|
+
inputPricePerMillion: 0.25,
|
|
460
|
+
outputPricePerMillion: 1.25,
|
|
461
|
+
},
|
|
462
|
+
"anthropic/claude-3.5-sonnet": {
|
|
463
|
+
inputPricePerMillion: 3.0,
|
|
464
|
+
outputPricePerMillion: 15.0,
|
|
465
|
+
},
|
|
430
466
|
// Google
|
|
431
|
-
"google/gemini-pro": {
|
|
432
|
-
|
|
433
|
-
|
|
467
|
+
"google/gemini-pro": {
|
|
468
|
+
inputPricePerMillion: 0.5,
|
|
469
|
+
outputPricePerMillion: 1.5,
|
|
470
|
+
},
|
|
471
|
+
"google/gemini-1.5-pro": {
|
|
472
|
+
inputPricePerMillion: 3.5,
|
|
473
|
+
outputPricePerMillion: 10.5,
|
|
474
|
+
},
|
|
475
|
+
"google/gemini-1.5-flash": {
|
|
476
|
+
inputPricePerMillion: 0.075,
|
|
477
|
+
outputPricePerMillion: 0.3,
|
|
478
|
+
},
|
|
434
479
|
};
|
|
435
480
|
const key = `${provider}/${model}`;
|
|
436
|
-
return knownPricing[key] || {
|
|
481
|
+
return (knownPricing[key] || {
|
|
482
|
+
inputPricePerMillion: 1.0,
|
|
483
|
+
outputPricePerMillion: 3.0,
|
|
484
|
+
});
|
|
437
485
|
}
|
|
438
486
|
/**
|
|
439
487
|
* Generate a unique ID
|