@evalgate/sdk 2.2.3 → 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +31 -0
- package/README.md +39 -2
- package/dist/assertions.d.ts +186 -6
- package/dist/assertions.js +515 -61
- package/dist/batch.js +4 -4
- package/dist/cache.d.ts +4 -0
- package/dist/cache.js +4 -0
- package/dist/cli/baseline.d.ts +14 -0
- package/dist/cli/baseline.js +43 -3
- package/dist/cli/check.d.ts +5 -2
- package/dist/cli/check.js +20 -12
- package/dist/cli/compare.d.ts +80 -0
- package/dist/cli/compare.js +266 -0
- package/dist/cli/index.js +244 -101
- package/dist/cli/regression-gate.js +23 -0
- package/dist/cli/run.js +22 -0
- package/dist/cli/start.d.ts +26 -0
- package/dist/cli/start.js +130 -0
- package/dist/cli/templates.d.ts +24 -0
- package/dist/cli/templates.js +314 -0
- package/dist/cli/traces.d.ts +109 -0
- package/dist/cli/traces.js +152 -0
- package/dist/cli/validate.d.ts +37 -0
- package/dist/cli/validate.js +252 -0
- package/dist/cli/watch.d.ts +19 -0
- package/dist/cli/watch.js +175 -0
- package/dist/client.js +6 -13
- package/dist/constants.d.ts +2 -0
- package/dist/constants.js +5 -0
- package/dist/index.d.ts +8 -6
- package/dist/index.js +26 -6
- package/dist/integrations/openai.js +83 -60
- package/dist/logger.d.ts +3 -1
- package/dist/logger.js +2 -1
- package/dist/otel.d.ts +130 -0
- package/dist/otel.js +309 -0
- package/dist/runtime/eval.d.ts +14 -4
- package/dist/runtime/eval.js +127 -2
- package/dist/runtime/registry.d.ts +4 -2
- package/dist/runtime/registry.js +11 -3
- package/dist/runtime/run-report.d.ts +1 -1
- package/dist/runtime/run-report.js +7 -4
- package/dist/runtime/types.d.ts +38 -0
- package/dist/testing.d.ts +8 -0
- package/dist/testing.js +45 -10
- package/dist/version.d.ts +2 -2
- package/dist/version.js +2 -2
- package/dist/workflows.d.ts +2 -0
- package/dist/workflows.js +184 -102
- package/package.json +124 -117
|
@@ -52,52 +52,59 @@ function traceOpenAI(openai, evalClient, options = {}) {
|
|
|
52
52
|
const response = await originalCreate(params, requestOptions);
|
|
53
53
|
const durationMs = Date.now() - startTime;
|
|
54
54
|
// Create trace with success status and complete metadata
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
? {
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
55
|
+
// Trace creation is non-fatal — never lose the OpenAI result due to tracing issues
|
|
56
|
+
try {
|
|
57
|
+
const traceMetadata = (0, context_1.mergeWithContext)({
|
|
58
|
+
model: params.model,
|
|
59
|
+
temperature: params.temperature,
|
|
60
|
+
max_tokens: params.max_tokens,
|
|
61
|
+
...(captureInput ? { input: params.messages } : {}),
|
|
62
|
+
...(captureOutput ? { output: response.choices[0]?.message } : {}),
|
|
63
|
+
...(captureMetadata
|
|
64
|
+
? {
|
|
65
|
+
usage: response.usage,
|
|
66
|
+
finish_reason: response.choices[0]?.finish_reason,
|
|
67
|
+
}
|
|
68
|
+
: {}),
|
|
69
|
+
});
|
|
70
|
+
await evalClient.traces?.create({
|
|
71
|
+
name: `OpenAI: ${params.model}`,
|
|
72
|
+
traceId,
|
|
73
|
+
organizationId: organizationId || evalClient.getOrganizationId(),
|
|
74
|
+
status: "success",
|
|
75
|
+
durationMs,
|
|
76
|
+
metadata: traceMetadata,
|
|
77
|
+
});
|
|
78
|
+
}
|
|
79
|
+
catch {
|
|
80
|
+
/* trace failure is non-fatal */
|
|
81
|
+
}
|
|
76
82
|
return response;
|
|
77
83
|
}
|
|
78
84
|
catch (error) {
|
|
79
85
|
const durationMs = Date.now() - startTime;
|
|
80
|
-
// Create trace with error status
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
?.create({
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
86
|
+
// Create trace with error status — non-fatal
|
|
87
|
+
try {
|
|
88
|
+
const errorMetadata = (0, context_1.mergeWithContext)({
|
|
89
|
+
model: params.model,
|
|
90
|
+
temperature: params.temperature,
|
|
91
|
+
max_tokens: params.max_tokens,
|
|
92
|
+
...(captureInput ? { input: params.messages } : {}),
|
|
93
|
+
...(captureMetadata ? { params } : {}),
|
|
94
|
+
error: error instanceof Error ? error.message : String(error),
|
|
95
|
+
});
|
|
96
|
+
await evalClient.traces?.create({
|
|
97
|
+
name: `OpenAI: ${params.model}`,
|
|
98
|
+
traceId,
|
|
99
|
+
organizationId: organizationId || evalClient.getOrganizationId(),
|
|
100
|
+
status: "error",
|
|
101
|
+
durationMs,
|
|
102
|
+
metadata: errorMetadata,
|
|
103
|
+
});
|
|
104
|
+
}
|
|
105
|
+
catch {
|
|
106
|
+
/* trace failure is non-fatal */
|
|
107
|
+
}
|
|
101
108
|
throw error;
|
|
102
109
|
}
|
|
103
110
|
};
|
|
@@ -123,6 +130,7 @@ function traceOpenAI(openai, evalClient, options = {}) {
|
|
|
123
130
|
async function traceOpenAICall(evalClient, name, fn, options = {}) {
|
|
124
131
|
const startTime = Date.now();
|
|
125
132
|
const traceId = `openai-${Date.now()}-${Math.random().toString(36).substr(2, 9)}`;
|
|
133
|
+
// Trace creation is non-fatal — never lose the fn() result due to tracing issues
|
|
126
134
|
try {
|
|
127
135
|
await evalClient.traces?.create({
|
|
128
136
|
name,
|
|
@@ -131,30 +139,45 @@ async function traceOpenAICall(evalClient, name, fn, options = {}) {
|
|
|
131
139
|
status: "pending",
|
|
132
140
|
metadata: (0, context_1.mergeWithContext)({}),
|
|
133
141
|
});
|
|
142
|
+
}
|
|
143
|
+
catch {
|
|
144
|
+
/* trace failure is non-fatal */
|
|
145
|
+
}
|
|
146
|
+
try {
|
|
134
147
|
const result = await fn();
|
|
135
148
|
const durationMs = Date.now() - startTime;
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
149
|
+
try {
|
|
150
|
+
await evalClient.traces?.create({
|
|
151
|
+
name,
|
|
152
|
+
traceId,
|
|
153
|
+
organizationId: options.organizationId || evalClient.getOrganizationId(),
|
|
154
|
+
status: "success",
|
|
155
|
+
durationMs,
|
|
156
|
+
metadata: (0, context_1.mergeWithContext)({}),
|
|
157
|
+
});
|
|
158
|
+
}
|
|
159
|
+
catch {
|
|
160
|
+
/* trace failure is non-fatal */
|
|
161
|
+
}
|
|
144
162
|
return result;
|
|
145
163
|
}
|
|
146
164
|
catch (error) {
|
|
147
165
|
const durationMs = Date.now() - startTime;
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
166
|
+
try {
|
|
167
|
+
await evalClient.traces?.create({
|
|
168
|
+
name,
|
|
169
|
+
traceId,
|
|
170
|
+
organizationId: options.organizationId || evalClient.getOrganizationId(),
|
|
171
|
+
status: "error",
|
|
172
|
+
durationMs,
|
|
173
|
+
metadata: (0, context_1.mergeWithContext)({
|
|
174
|
+
error: error instanceof Error ? error.message : String(error),
|
|
175
|
+
}),
|
|
176
|
+
});
|
|
177
|
+
}
|
|
178
|
+
catch {
|
|
179
|
+
/* trace failure is non-fatal */
|
|
180
|
+
}
|
|
158
181
|
throw error;
|
|
159
182
|
}
|
|
160
183
|
}
|
package/dist/logger.d.ts
CHANGED
package/dist/logger.js
CHANGED
|
@@ -93,9 +93,10 @@ class Logger {
|
|
|
93
93
|
* Create child logger with prefix
|
|
94
94
|
*/
|
|
95
95
|
child(prefix) {
|
|
96
|
+
const resolvedPrefix = typeof prefix === "string" ? prefix : prefix.prefix;
|
|
96
97
|
return new Logger({
|
|
97
98
|
...this.options,
|
|
98
|
-
prefix: `${this.options.prefix}:${
|
|
99
|
+
prefix: `${this.options.prefix}:${resolvedPrefix}`,
|
|
99
100
|
});
|
|
100
101
|
}
|
|
101
102
|
/**
|
package/dist/otel.d.ts
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* OpenTelemetry Export for WorkflowTracer
|
|
3
|
+
*
|
|
4
|
+
* Converts WorkflowTracer spans, decisions, and costs into
|
|
5
|
+
* OpenTelemetry-compatible span data for export to any OTEL collector.
|
|
6
|
+
*
|
|
7
|
+
* Usage:
|
|
8
|
+
* import { OTelExporter } from "@evalgate/sdk/otel";
|
|
9
|
+
*
|
|
10
|
+
* const exporter = new OTelExporter({ endpoint: "http://localhost:4318/v1/traces" });
|
|
11
|
+
* const tracer = new WorkflowTracer(client, { debug: true });
|
|
12
|
+
* // ... run workflow ...
|
|
13
|
+
* await exporter.send(exporter.exportFromTracer(tracer));
|
|
14
|
+
*/
|
|
15
|
+
import type { WorkflowTracer } from "./workflows";
|
|
16
|
+
/**
|
|
17
|
+
* OTEL-compatible span representation
|
|
18
|
+
* Follows the OpenTelemetry Trace specification
|
|
19
|
+
*/
|
|
20
|
+
export interface OTelSpan {
|
|
21
|
+
traceId: string;
|
|
22
|
+
spanId: string;
|
|
23
|
+
parentSpanId?: string;
|
|
24
|
+
name: string;
|
|
25
|
+
/** OTLP SpanKind: 0=UNSPECIFIED, 1=INTERNAL, 2=SERVER, 3=CLIENT, 4=PRODUCER, 5=CONSUMER */
|
|
26
|
+
kind: 0 | 1 | 2 | 3 | 4 | 5;
|
|
27
|
+
startTimeUnixNano: string;
|
|
28
|
+
endTimeUnixNano: string;
|
|
29
|
+
attributes: OTelAttribute[];
|
|
30
|
+
/** OTLP StatusCode: 0=STATUS_CODE_UNSET, 1=STATUS_CODE_OK, 2=STATUS_CODE_ERROR */
|
|
31
|
+
status: {
|
|
32
|
+
code: 0 | 1 | 2;
|
|
33
|
+
message?: string;
|
|
34
|
+
};
|
|
35
|
+
events: OTelEvent[];
|
|
36
|
+
}
|
|
37
|
+
export interface OTelAttribute {
|
|
38
|
+
key: string;
|
|
39
|
+
value: {
|
|
40
|
+
stringValue?: string;
|
|
41
|
+
intValue?: string;
|
|
42
|
+
doubleValue?: number;
|
|
43
|
+
boolValue?: boolean;
|
|
44
|
+
};
|
|
45
|
+
}
|
|
46
|
+
export interface OTelEvent {
|
|
47
|
+
name: string;
|
|
48
|
+
timeUnixNano: string;
|
|
49
|
+
attributes: OTelAttribute[];
|
|
50
|
+
}
|
|
51
|
+
/**
|
|
52
|
+
* OTEL export payload (OTLP JSON format)
|
|
53
|
+
*/
|
|
54
|
+
export interface OTelExportPayload {
|
|
55
|
+
resourceSpans: Array<{
|
|
56
|
+
resource: {
|
|
57
|
+
attributes: OTelAttribute[];
|
|
58
|
+
};
|
|
59
|
+
scopeSpans: Array<{
|
|
60
|
+
scope: {
|
|
61
|
+
name: string;
|
|
62
|
+
version: string;
|
|
63
|
+
};
|
|
64
|
+
spans: OTelSpan[];
|
|
65
|
+
}>;
|
|
66
|
+
}>;
|
|
67
|
+
}
|
|
68
|
+
export interface OTelExporterOptions {
|
|
69
|
+
/** OTEL collector endpoint (default: http://localhost:4318/v1/traces) */
|
|
70
|
+
endpoint?: string;
|
|
71
|
+
/** Service name for resource attributes */
|
|
72
|
+
serviceName?: string;
|
|
73
|
+
/** Additional resource attributes */
|
|
74
|
+
resourceAttributes?: Record<string, string>;
|
|
75
|
+
/** SDK version */
|
|
76
|
+
sdkVersion?: string;
|
|
77
|
+
/** Headers for the export request */
|
|
78
|
+
headers?: Record<string, string>;
|
|
79
|
+
}
|
|
80
|
+
/**
|
|
81
|
+
* OpenTelemetry Exporter for EvalGate WorkflowTracer
|
|
82
|
+
*/
|
|
83
|
+
export declare class OTelExporter {
|
|
84
|
+
private options;
|
|
85
|
+
constructor(options?: OTelExporterOptions);
|
|
86
|
+
/**
|
|
87
|
+
* Export workflow data from a WorkflowTracer instance
|
|
88
|
+
*/
|
|
89
|
+
exportFromTracer(tracer: WorkflowTracer): OTelExportPayload;
|
|
90
|
+
/**
|
|
91
|
+
* Export a run result as OTEL spans
|
|
92
|
+
*/
|
|
93
|
+
exportRunResult(runResult: {
|
|
94
|
+
runId: string;
|
|
95
|
+
metadata: {
|
|
96
|
+
startedAt: number;
|
|
97
|
+
completedAt: number;
|
|
98
|
+
duration: number;
|
|
99
|
+
mode: string;
|
|
100
|
+
};
|
|
101
|
+
results: Array<{
|
|
102
|
+
specId: string;
|
|
103
|
+
name: string;
|
|
104
|
+
filePath: string;
|
|
105
|
+
result: {
|
|
106
|
+
status: string;
|
|
107
|
+
score?: number;
|
|
108
|
+
duration: number;
|
|
109
|
+
error?: string;
|
|
110
|
+
};
|
|
111
|
+
}>;
|
|
112
|
+
summary: {
|
|
113
|
+
passed: number;
|
|
114
|
+
failed: number;
|
|
115
|
+
passRate: number;
|
|
116
|
+
};
|
|
117
|
+
}): OTelExportPayload;
|
|
118
|
+
/**
|
|
119
|
+
* Send payload to OTEL collector via HTTP
|
|
120
|
+
*/
|
|
121
|
+
send(payload: OTelExportPayload): Promise<boolean>;
|
|
122
|
+
private decisionToSpan;
|
|
123
|
+
private handoffToSpan;
|
|
124
|
+
private costToSpan;
|
|
125
|
+
private buildPayload;
|
|
126
|
+
}
|
|
127
|
+
/**
|
|
128
|
+
* Convenience factory
|
|
129
|
+
*/
|
|
130
|
+
export declare function createOTelExporter(options?: OTelExporterOptions): OTelExporter;
|
package/dist/otel.js
ADDED
|
@@ -0,0 +1,309 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* OpenTelemetry Export for WorkflowTracer
|
|
4
|
+
*
|
|
5
|
+
* Converts WorkflowTracer spans, decisions, and costs into
|
|
6
|
+
* OpenTelemetry-compatible span data for export to any OTEL collector.
|
|
7
|
+
*
|
|
8
|
+
* Usage:
|
|
9
|
+
* import { OTelExporter } from "@evalgate/sdk/otel";
|
|
10
|
+
*
|
|
11
|
+
* const exporter = new OTelExporter({ endpoint: "http://localhost:4318/v1/traces" });
|
|
12
|
+
* const tracer = new WorkflowTracer(client, { debug: true });
|
|
13
|
+
* // ... run workflow ...
|
|
14
|
+
* await exporter.send(exporter.exportFromTracer(tracer));
|
|
15
|
+
*/
|
|
16
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
17
|
+
exports.OTelExporter = void 0;
|
|
18
|
+
exports.createOTelExporter = createOTelExporter;
|
|
19
|
+
/**
 * Produce `byteLength` random bytes rendered as a lowercase hex string.
 *
 * Uses the Web Crypto API when the runtime exposes it on `globalThis` and
 * falls back to `Math.random()` otherwise. An all-zero result is re-rolled,
 * because OpenTelemetry treats all-zero trace/span IDs as invalid.
 *
 * `byteLength` must be greater than zero (only 16 and 8 are used here).
 */
function randomHexId(byteLength) {
    const bytes = new Uint8Array(byteLength);
    // Element access keeps this usable in runtimes/typings without a `crypto` global.
    const webCrypto = globalThis["crypto"];
    const hasWebCrypto = Boolean(webCrypto) && typeof webCrypto.getRandomValues === "function";
    do {
        if (hasWebCrypto) {
            webCrypto.getRandomValues(bytes);
        }
        else {
            for (let i = 0; i < byteLength; i++) {
                bytes[i] = Math.floor(Math.random() * 256);
            }
        }
    } while (bytes.every((b) => b === 0));
    return Array.from(bytes, (b) => b.toString(16).padStart(2, "0")).join("");
}
/**
 * Generate a random 16-byte trace ID as 32 lowercase hex characters.
 */
function generateTraceId() {
    return randomHexId(16);
}
/**
 * Generate a random 8-byte span ID as 16 lowercase hex characters.
 */
function generateSpanId() {
    return randomHexId(8);
}
|
|
43
|
+
/**
 * Convert a millisecond value to a nanosecond count rendered as a decimal string.
 *
 * - Integer milliseconds convert exactly via BigInt arithmetic.
 * - Fractional milliseconds (e.g. sub-millisecond durations) keep their
 *   fractional part, rounded to the nearest nanosecond, instead of making
 *   `BigInt()` throw a RangeError.
 * - Non-finite input (NaN / Infinity, e.g. from an invalid Date) yields "0"
 *   rather than throwing, so one bad timestamp cannot abort an export.
 * - bigint input is multiplied directly.
 */
function msToNano(ms) {
    const nanosPerMs = BigInt(1000000);
    if (typeof ms === "bigint") {
        return `${ms * nanosPerMs}`;
    }
    const value = Number(ms);
    if (!Number.isFinite(value)) {
        return "0";
    }
    // Split so the large whole part is converted exactly and only the small
    // fractional remainder goes through floating-point multiplication.
    const wholeMs = Math.trunc(value);
    const fractionalNanos = Math.round((value - wholeMs) * 1000000);
    return `${BigInt(wholeMs) * nanosPerMs + BigInt(fractionalNanos)}`;
}
|
|
49
|
+
/**
 * Create an OTEL attribute `{ key, value }` whose value variant is chosen
 * from the runtime type of `value`:
 *
 * - string                      -> stringValue
 * - integer within int64 range  -> intValue (decimal string)
 * - other finite number         -> doubleValue
 * - NaN / +-Infinity            -> stringValue ("NaN", "Infinity", ...), because
 *                                  JSON.stringify would turn them into null
 * - boolean                     -> boolValue
 * - bigint                      -> intValue (decimal string)
 * - null / undefined            -> stringValue ""
 * - anything else               -> stringValue holding its JSON form, or
 *                                  String(value) when it cannot be serialized
 */
function attr(key, value) {
    switch (typeof value) {
        case "string":
            return { key, value: { stringValue: value } };
        case "number": {
            // Below 2^63, String() of an integer is plain digits (no exponent form).
            if (Number.isInteger(value) && Math.abs(value) < 2 ** 63) {
                return { key, value: { intValue: String(value) } };
            }
            if (Number.isFinite(value)) {
                return { key, value: { doubleValue: value } };
            }
            return { key, value: { stringValue: String(value) } };
        }
        case "boolean":
            return { key, value: { boolValue: value } };
        case "bigint":
            return { key, value: { intValue: String(value) } };
        default: {
            if (value === null || value === undefined) {
                return { key, value: { stringValue: "" } };
            }
            let text;
            try {
                text = JSON.stringify(value);
            }
            catch {
                text = undefined;
            }
            return { key, value: { stringValue: text ?? String(value) } };
        }
    }
}
|
|
64
|
+
/**
 * OpenTelemetry Exporter for EvalGate WorkflowTracer
 *
 * Builds OTLP-JSON shaped payloads (`resourceSpans` -> `scopeSpans` -> `spans`)
 * from a WorkflowTracer or from a run result, and can POST a payload to a
 * collector endpoint. Building a payload never performs I/O; only `send()` does.
 */
class OTelExporter {
    /**
     * @param options Optional overrides. Missing fields fall back to:
     *   endpoint "http://localhost:4318/v1/traces", serviceName "evalgate",
     *   no extra resource attributes, sdkVersion "2.3.0", no extra headers.
     */
    constructor(options = {}) {
        this.options = {
            endpoint: options.endpoint ?? "http://localhost:4318/v1/traces",
            serviceName: options.serviceName ?? "evalgate",
            resourceAttributes: options.resourceAttributes ?? {},
            // NOTE(review): hard-coded fallback; presumably meant to track the
            // package version - confirm whether a shared version constant exists.
            sdkVersion: options.sdkVersion ?? "2.3.0",
            headers: options.headers ?? {},
        };
    }
    /**
     * Export workflow data from a WorkflowTracer instance
     *
     * Emits one root span for the current workflow (when there is one) plus one
     * span per decision, handoff and cost record. Decision and cost records
     * carry no timestamp here, so they are given synthetic, strictly
     * increasing millisecond timestamps ending just before "now".
     *
     * @returns The payload; nothing is sent. Pass it to `send()` to export.
     */
    exportFromTracer(tracer) {
        const workflow = tracer.getCurrentWorkflow();
        const handoffs = tracer.getHandoffs();
        const decisions = tracer.getDecisions();
        const costs = tracer.getCosts();
        const traceId = generateTraceId();
        const rootSpanId = generateSpanId();
        const now = Date.now();
        const spans = [];
        // Children may only point at the root span if that span is actually
        // emitted; otherwise they would reference a parent that does not exist.
        const parentSpanId = workflow ? rootSpanId : undefined;
        // Root workflow span
        if (workflow) {
            const startedAtMs = this.toEpochMs(new Date(workflow.startedAt).getTime(), now);
            spans.push({
                traceId,
                spanId: rootSpanId,
                name: `workflow.${workflow.name}`,
                kind: 1,
                startTimeUnixNano: msToNano(startedAtMs),
                endTimeUnixNano: msToNano(now),
                attributes: [
                    attr("evalgate.workflow.name", workflow.name),
                    attr("evalgate.workflow.id", workflow.id),
                    attr("evalgate.workflow.trace_id", workflow.traceId),
                ],
                status: { code: 1 },
                events: [],
            });
        }
        // Decision spans
        decisions.forEach((decision, i) => {
            spans.push(this.decisionToSpan(traceId, generateSpanId(), parentSpanId, decision, now - decisions.length + i));
        });
        // Handoff spans
        for (const handoff of handoffs) {
            spans.push(this.handoffToSpan(traceId, generateSpanId(), parentSpanId, handoff));
        }
        // Cost spans
        costs.forEach((cost, i) => {
            spans.push(this.costToSpan(traceId, generateSpanId(), parentSpanId, cost, now - costs.length + i));
        });
        return this.buildPayload(spans);
    }
    /**
     * Export a run result as OTEL spans
     *
     * Emits a root span for the run and one child span per spec result. Spec
     * spans are laid out back-to-back starting at the run's start time, each
     * lasting that spec's reported duration.
     *
     * @returns The payload; nothing is sent. Pass it to `send()` to export.
     */
    exportRunResult(runResult) {
        const traceId = generateTraceId();
        const rootSpanId = generateSpanId();
        const spans = [];
        // Normalize to whole, finite milliseconds so timestamp conversion cannot throw.
        const runStartMs = this.toEpochMs(runResult.metadata.startedAt, Date.now());
        const runEndMs = this.toEpochMs(runResult.metadata.completedAt, runStartMs);
        // Root run span
        spans.push({
            traceId,
            spanId: rootSpanId,
            name: `evalgate.run.${runResult.runId}`,
            kind: 1,
            startTimeUnixNano: msToNano(runStartMs),
            endTimeUnixNano: msToNano(runEndMs),
            attributes: [
                attr("evalgate.run.id", runResult.runId),
                attr("evalgate.run.mode", runResult.metadata.mode),
                attr("evalgate.run.duration_ms", runResult.metadata.duration),
                attr("evalgate.run.pass_rate", runResult.summary.passRate),
                attr("evalgate.run.passed", runResult.summary.passed),
                attr("evalgate.run.failed", runResult.summary.failed),
            ],
            status: {
                // Any failed spec marks the whole run span as ERROR (2), else OK (1).
                code: runResult.summary.failed > 0 ? 2 : 1,
            },
            events: [],
        });
        // Per-spec child spans
        let offset = 0;
        for (const spec of runResult.results) {
            const spanId = generateSpanId();
            const specDurationMs = this.toEpochMs(spec.result.duration, 0);
            const specStart = runStartMs + offset;
            const specEnd = specStart + specDurationMs;
            offset += specDurationMs;
            const attributes = [
                attr("evalgate.spec.id", spec.specId),
                attr("evalgate.spec.name", spec.name),
                attr("evalgate.spec.file", spec.filePath),
                attr("evalgate.spec.status", spec.result.status),
                attr("evalgate.spec.duration_ms", spec.result.duration),
            ];
            if (spec.result.score !== undefined) {
                attributes.push(attr("evalgate.spec.score", spec.result.score));
            }
            spans.push({
                traceId,
                spanId,
                parentSpanId: rootSpanId,
                name: `evalgate.spec.${spec.name}`,
                kind: 1,
                startTimeUnixNano: msToNano(specStart),
                endTimeUnixNano: msToNano(specEnd),
                attributes,
                status: {
                    code: spec.result.status === "passed" ? 1 : 2,
                    message: spec.result.error,
                },
                events: [],
            });
        }
        return this.buildPayload(spans);
    }
    /**
     * Send payload to OTEL collector via HTTP
     *
     * POSTs the payload as JSON to the configured endpoint. Configured headers
     * are applied after the default Content-Type, so they can override it.
     *
     * @returns true when the collector answered with a 2xx status; false on a
     *   non-2xx status or when the request itself failed. Never rejects.
     */
    async send(payload) {
        try {
            const response = await fetch(this.options.endpoint, {
                method: "POST",
                headers: {
                    "Content-Type": "application/json",
                    ...this.options.headers,
                },
                body: JSON.stringify(payload),
            });
            if (!response.ok) {
                console.warn(`[OTelExporter] Collector responded with HTTP ${response.status}`);
            }
            return response.ok;
        }
        catch (err) {
            console.warn(`[OTelExporter] Failed to send: ${err instanceof Error ? err.message : String(err)}`);
            return false;
        }
    }
    /**
     * Coerce a value to a whole, finite millisecond count, returning `fallback`
     * when the value is not a finite number (e.g. NaN from an invalid date).
     */
    toEpochMs(value, fallback) {
        const numeric = Number(value);
        return Number.isFinite(numeric) ? Math.round(numeric) : fallback;
    }
    /** Build a 1 ms span describing a single agent decision. */
    decisionToSpan(traceId, spanId, parentSpanId, decision, timestampMs) {
        const attributes = [
            attr("evalgate.decision.agent", decision.agent),
            attr("evalgate.decision.type", decision.type),
            attr("evalgate.decision.chosen", decision.chosen),
            // Tolerate a missing alternatives list instead of throwing.
            attr("evalgate.decision.alternatives", decision.alternatives?.length ?? 0),
        ];
        if (decision.confidence !== undefined) {
            attributes.push(attr("evalgate.decision.confidence", decision.confidence));
        }
        if (decision.reasoning) {
            attributes.push(attr("evalgate.decision.reasoning", decision.reasoning));
        }
        return {
            traceId,
            spanId,
            parentSpanId,
            name: `decision.${decision.agent}.${decision.chosen}`,
            kind: 1,
            startTimeUnixNano: msToNano(timestampMs),
            endTimeUnixNano: msToNano(timestampMs + 1),
            attributes,
            status: { code: 1 },
            events: [],
        };
    }
    /** Build a 1 ms span describing a handoff; a missing source agent is reported as "start". */
    handoffToSpan(traceId, spanId, parentSpanId, handoff) {
        // An unparseable timestamp falls back to the current time rather than aborting the export.
        const ts = this.toEpochMs(new Date(handoff.timestamp).getTime(), Date.now());
        const fromAgent = handoff.fromAgent ?? "start";
        return {
            traceId,
            spanId,
            parentSpanId,
            name: `handoff.${fromAgent}.${handoff.toAgent}`,
            kind: 1,
            startTimeUnixNano: msToNano(ts),
            endTimeUnixNano: msToNano(ts + 1),
            attributes: [
                attr("evalgate.handoff.from", fromAgent),
                attr("evalgate.handoff.to", handoff.toAgent),
                attr("evalgate.handoff.type", handoff.handoffType),
            ],
            status: { code: 1 },
            events: [],
        };
    }
    /** Build a 1 ms span describing one cost record (provider, model, token counts, total cost). */
    costToSpan(traceId, spanId, parentSpanId, cost, timestampMs) {
        return {
            traceId,
            spanId,
            parentSpanId,
            name: `cost.${cost.provider}.${cost.model}`,
            kind: 1,
            startTimeUnixNano: msToNano(timestampMs),
            endTimeUnixNano: msToNano(timestampMs + 1),
            attributes: [
                attr("evalgate.cost.provider", cost.provider),
                attr("evalgate.cost.model", cost.model),
                attr("evalgate.cost.input_tokens", cost.inputTokens),
                attr("evalgate.cost.output_tokens", cost.outputTokens),
                attr("evalgate.cost.total_tokens", cost.totalTokens),
                attr("evalgate.cost.total_usd", cost.totalCost),
            ],
            status: { code: 1 },
            events: [],
        };
    }
    /**
     * Wrap spans in the export envelope: one resource (service + SDK
     * attributes, then any configured extra resource attributes) containing
     * one "evalgate" scope that holds all spans.
     */
    buildPayload(spans) {
        const resourceAttrs = [
            attr("service.name", this.options.serviceName),
            attr("telemetry.sdk.name", "evalgate"),
            attr("telemetry.sdk.version", this.options.sdkVersion),
            attr("telemetry.sdk.language", "nodejs"),
        ];
        for (const [key, value] of Object.entries(this.options.resourceAttributes)) {
            resourceAttrs.push(attr(key, value));
        }
        return {
            resourceSpans: [
                {
                    resource: { attributes: resourceAttrs },
                    scopeSpans: [
                        {
                            scope: {
                                name: "evalgate",
                                version: this.options.sdkVersion,
                            },
                            spans,
                        },
                    ],
                },
            ],
        };
    }
}
|
|
303
|
+
exports.OTelExporter = OTelExporter;
|
|
304
|
+
/**
 * Factory shorthand: builds an OTelExporter from the given (optional)
 * options, equivalent to calling the constructor directly.
 */
function createOTelExporter(options) {
    const exporter = new OTelExporter(options);
    return exporter;
}
|
package/dist/runtime/eval.d.ts
CHANGED
|
@@ -4,12 +4,19 @@
|
|
|
4
4
|
* The core DSL function for defining behavioral specifications.
|
|
5
5
|
* Uses content-addressable identity with AST position for stability.
|
|
6
6
|
*/
|
|
7
|
-
import
|
|
7
|
+
import { createEvalRuntime, disposeActiveRuntime, getActiveRuntime, setActiveRuntime, withRuntime } from "./registry";
|
|
8
|
+
import type { DefineEvalFunction, EvalContext, EvalResult, EvalSpec } from "./types";
|
|
8
9
|
/**
|
|
9
10
|
* Export the defineEval function with proper typing
|
|
10
11
|
* This is the main DSL entry point
|
|
11
12
|
*/
|
|
12
13
|
export declare const defineEval: DefineEvalFunction;
|
|
14
|
+
/**
|
|
15
|
+
* Filter a list of specs according to skip/only semantics:
|
|
16
|
+
* - If any spec has mode === "only", return only those specs
|
|
17
|
+
* - Otherwise, return all specs except those with mode === "skip"
|
|
18
|
+
*/
|
|
19
|
+
export declare function getFilteredSpecs(specs: EvalSpec[]): EvalSpec[];
|
|
13
20
|
/**
|
|
14
21
|
* Convenience export for evalai.test() alias (backward compatibility)
|
|
15
22
|
* Provides alternative naming that matches the original roadmap vision
|
|
@@ -48,8 +55,11 @@ export declare function createResult(config: {
|
|
|
48
55
|
assertions?: EvalResult["assertions"];
|
|
49
56
|
metadata?: Record<string, unknown>;
|
|
50
57
|
error?: string;
|
|
58
|
+
output?: string;
|
|
59
|
+
durationMs?: number;
|
|
60
|
+
tokens?: number;
|
|
51
61
|
}): EvalResult;
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
62
|
+
export { createEvalRuntime, disposeActiveRuntime, getActiveRuntime, setActiveRuntime, withRuntime, };
|
|
63
|
+
export { createContext as createEvalContext };
|
|
64
|
+
export { createLocalExecutor } from "./executor";
|
|
55
65
|
export default defineEval;
|