opensearch-genai-sdk 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/decorators.d.ts +111 -0
- package/dist/decorators.d.ts.map +1 -0
- package/dist/decorators.js +217 -0
- package/dist/decorators.js.map +1 -0
- package/dist/evals/evaluate.d.ts +97 -0
- package/dist/evals/evaluate.d.ts.map +1 -0
- package/dist/evals/evaluate.js +202 -0
- package/dist/evals/evaluate.js.map +1 -0
- package/dist/evals/index.d.ts +12 -0
- package/dist/evals/index.d.ts.map +1 -0
- package/dist/evals/index.js +10 -0
- package/dist/evals/index.js.map +1 -0
- package/dist/evals/protocol.d.ts +69 -0
- package/dist/evals/protocol.d.ts.map +1 -0
- package/dist/evals/protocol.js +73 -0
- package/dist/evals/protocol.js.map +1 -0
- package/dist/index.d.ts +18 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +17 -0
- package/dist/index.js.map +1 -0
- package/dist/register.d.ts +59 -0
- package/dist/register.d.ts.map +1 -0
- package/dist/register.js +149 -0
- package/dist/register.js.map +1 -0
- package/dist/score.d.ts +56 -0
- package/dist/score.d.ts.map +1 -0
- package/dist/score.js +70 -0
- package/dist/score.js.map +1 -0
- package/package.json +53 -0
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Wrapper functions for tracing custom functions as OTEL spans.
|
|
3
|
+
*
|
|
4
|
+
* Provides traceWorkflow, traceTask, traceAgent, and traceTool
|
|
5
|
+
* that create standard OpenTelemetry spans. These are the user-facing
|
|
6
|
+
* API for tracing custom application logic -- the gap that pure
|
|
7
|
+
* auto-instrumentors don't cover.
|
|
8
|
+
*
|
|
9
|
+
* All wrappers produce standard OTEL spans with gen_ai semantic
|
|
10
|
+
* convention attributes. Zero lock-in: remove the wrapper and
|
|
11
|
+
* your code still works.
|
|
12
|
+
*
|
|
13
|
+
* TypeScript does not have Python-style decorators that work on plain
|
|
14
|
+
* functions, so we use higher-order wrapper functions instead:
|
|
15
|
+
*
|
|
16
|
+
* @example
|
|
17
|
+
* ```ts
|
|
18
|
+
* const search = traceTool("web_search", async (query: string) => {
|
|
19
|
+
* return [{ title: `Result: ${query}` }];
|
|
20
|
+
* });
|
|
21
|
+
*
|
|
22
|
+
* const pipeline = traceWorkflow("qa_pipeline", async (question: string) => {
|
|
23
|
+
* const results = await search(question);
|
|
24
|
+
* return summarize(results);
|
|
25
|
+
* });
|
|
26
|
+
* ```
|
|
27
|
+
*/
|
|
28
|
+
/**
 * Optional settings accepted by every trace wrapper
 * (traceWorkflow, traceTask, traceAgent, traceTool).
 */
export interface TraceOptions {
    /**
     * Version number of the traced entity, for tracking changes over time.
     * Omit it to leave the span without a version.
     */
    version?: number;
}
|
|
32
|
+
/**
|
|
33
|
+
* Trace a function as a workflow span.
|
|
34
|
+
*
|
|
35
|
+
* Use for top-level orchestration functions that coordinate
|
|
36
|
+
* multiple tasks, agents, or tool calls.
|
|
37
|
+
*
|
|
38
|
+
* @param name - Span name for the workflow.
|
|
39
|
+
* @param fn - The function to wrap.
|
|
40
|
+
* @param options - Optional trace settings.
|
|
41
|
+
* @returns A wrapped function with the same signature.
|
|
42
|
+
*
|
|
43
|
+
* @example
|
|
44
|
+
* ```ts
|
|
45
|
+
* const pipeline = traceWorkflow("qa_pipeline", async (question: string) => {
|
|
46
|
+
* const plan = await planSteps(question);
|
|
47
|
+
* return await execute(plan);
|
|
48
|
+
* });
|
|
49
|
+
* ```
|
|
50
|
+
*/
|
|
51
|
+
// Generic over fn's argument tuple (TArgs) and return type (TReturn), so the returned wrapper is typed exactly like fn.
export declare function traceWorkflow<TArgs extends unknown[], TReturn>(name: string, fn: (...args: TArgs) => TReturn, options?: TraceOptions): (...args: TArgs) => TReturn;
|
|
52
|
+
/**
|
|
53
|
+
* Trace a function as a task span.
|
|
54
|
+
*
|
|
55
|
+
* Use for individual units of work within a workflow.
|
|
56
|
+
*
|
|
57
|
+
* @param name - Span name for the task.
|
|
58
|
+
* @param fn - The function to wrap.
|
|
59
|
+
* @param options - Optional trace settings.
|
|
60
|
+
* @returns A wrapped function with the same signature.
|
|
61
|
+
*
|
|
62
|
+
* @example
|
|
63
|
+
* ```ts
|
|
64
|
+
* const summarize = traceTask("summarize", async (text: string) => {
|
|
65
|
+
* return llm.generate(`Summarize: ${text}`);
|
|
66
|
+
* });
|
|
67
|
+
* ```
|
|
68
|
+
*/
|
|
69
|
+
// Generic over fn's argument tuple (TArgs) and return type (TReturn), so the returned wrapper is typed exactly like fn.
export declare function traceTask<TArgs extends unknown[], TReturn>(name: string, fn: (...args: TArgs) => TReturn, options?: TraceOptions): (...args: TArgs) => TReturn;
|
|
70
|
+
/**
|
|
71
|
+
* Trace a function as an agent span.
|
|
72
|
+
*
|
|
73
|
+
* Use for autonomous agent logic that makes decisions and
|
|
74
|
+
* invokes tools.
|
|
75
|
+
*
|
|
76
|
+
* @param name - Span name for the agent.
|
|
77
|
+
* @param fn - The function to wrap.
|
|
78
|
+
* @param options - Optional trace settings.
|
|
79
|
+
* @returns A wrapped function with the same signature.
|
|
80
|
+
*
|
|
81
|
+
* @example
|
|
82
|
+
* ```ts
|
|
83
|
+
* const agent = traceAgent("research_agent", async (query: string) => {
|
|
84
|
+
* while (!done) {
|
|
85
|
+
* const action = await decideNextAction(query);
|
|
86
|
+
* result = await executeAction(action);
|
|
87
|
+
* }
|
|
88
|
+
* return result;
|
|
89
|
+
* });
|
|
90
|
+
* ```
|
|
91
|
+
*/
|
|
92
|
+
// Generic over fn's argument tuple (TArgs) and return type (TReturn), so the returned wrapper is typed exactly like fn.
export declare function traceAgent<TArgs extends unknown[], TReturn>(name: string, fn: (...args: TArgs) => TReturn, options?: TraceOptions): (...args: TArgs) => TReturn;
|
|
93
|
+
/**
|
|
94
|
+
* Trace a function as a tool span.
|
|
95
|
+
*
|
|
96
|
+
* Use for tool/function calls invoked by agents.
|
|
97
|
+
*
|
|
98
|
+
* @param name - Span name for the tool.
|
|
99
|
+
* @param fn - The function to wrap.
|
|
100
|
+
* @param options - Optional trace settings.
|
|
101
|
+
* @returns A wrapped function with the same signature.
|
|
102
|
+
*
|
|
103
|
+
* @example
|
|
104
|
+
* ```ts
|
|
105
|
+
* const search = traceTool("web_search", async (query: string) => {
|
|
106
|
+
* return searchApi.query(query);
|
|
107
|
+
* });
|
|
108
|
+
* ```
|
|
109
|
+
*/
|
|
110
|
+
// Generic over fn's argument tuple (TArgs) and return type (TReturn), so the returned wrapper is typed exactly like fn.
export declare function traceTool<TArgs extends unknown[], TReturn>(name: string, fn: (...args: TArgs) => TReturn, options?: TraceOptions): (...args: TArgs) => TReturn;
|
|
111
|
+
//# sourceMappingURL=decorators.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"decorators.d.ts","sourceRoot":"","sources":["../src/decorators.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;GA0BG;AAeH,MAAM,WAAW,YAAY;IAC3B,oDAAoD;IACpD,OAAO,CAAC,EAAE,MAAM,CAAC;CAClB;AAED;;;;;;;;;;;;;;;;;;GAkBG;AACH,wBAAgB,aAAa,CAAC,KAAK,SAAS,OAAO,EAAE,EAAE,OAAO,EAC5D,IAAI,EAAE,MAAM,EACZ,EAAE,EAAE,CAAC,GAAG,IAAI,EAAE,KAAK,KAAK,OAAO,EAC/B,OAAO,CAAC,EAAE,YAAY,GACrB,CAAC,GAAG,IAAI,EAAE,KAAK,KAAK,OAAO,CAE7B;AAED;;;;;;;;;;;;;;;;GAgBG;AACH,wBAAgB,SAAS,CAAC,KAAK,SAAS,OAAO,EAAE,EAAE,OAAO,EACxD,IAAI,EAAE,MAAM,EACZ,EAAE,EAAE,CAAC,GAAG,IAAI,EAAE,KAAK,KAAK,OAAO,EAC/B,OAAO,CAAC,EAAE,YAAY,GACrB,CAAC,GAAG,IAAI,EAAE,KAAK,KAAK,OAAO,CAE7B;AAED;;;;;;;;;;;;;;;;;;;;;GAqBG;AACH,wBAAgB,UAAU,CAAC,KAAK,SAAS,OAAO,EAAE,EAAE,OAAO,EACzD,IAAI,EAAE,MAAM,EACZ,EAAE,EAAE,CAAC,GAAG,IAAI,EAAE,KAAK,KAAK,OAAO,EAC/B,OAAO,CAAC,EAAE,YAAY,GACrB,CAAC,GAAG,IAAI,EAAE,KAAK,KAAK,OAAO,CAE7B;AAED;;;;;;;;;;;;;;;;GAgBG;AACH,wBAAgB,SAAS,CAAC,KAAK,SAAS,OAAO,EAAE,EAAE,OAAO,EACxD,IAAI,EAAE,MAAM,EACZ,EAAE,EAAE,CAAC,GAAG,IAAI,EAAE,KAAK,KAAK,OAAO,EAC/B,OAAO,CAAC,EAAE,YAAY,GACrB,CAAC,GAAG,IAAI,EAAE,KAAK,KAAK,OAAO,CAE7B"}
|
|
@@ -0,0 +1,217 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Wrapper functions for tracing custom functions as OTEL spans.
|
|
3
|
+
*
|
|
4
|
+
* Provides traceWorkflow, traceTask, traceAgent, and traceTool
|
|
5
|
+
* that create standard OpenTelemetry spans. These are the user-facing
|
|
6
|
+
* API for tracing custom application logic -- the gap that pure
|
|
7
|
+
* auto-instrumentors don't cover.
|
|
8
|
+
*
|
|
9
|
+
* All wrappers produce standard OTEL spans with gen_ai semantic
|
|
10
|
+
* convention attributes. Zero lock-in: remove the wrapper and
|
|
11
|
+
* your code still works.
|
|
12
|
+
*
|
|
13
|
+
* TypeScript does not have Python-style decorators that work on plain
|
|
14
|
+
* functions, so we use higher-order wrapper functions instead:
|
|
15
|
+
*
|
|
16
|
+
* @example
|
|
17
|
+
* ```ts
|
|
18
|
+
* const search = traceTool("web_search", async (query: string) => {
|
|
19
|
+
* return [{ title: `Result: ${query}` }];
|
|
20
|
+
* });
|
|
21
|
+
*
|
|
22
|
+
* const pipeline = traceWorkflow("qa_pipeline", async (question: string) => {
|
|
23
|
+
* const results = await search(question);
|
|
24
|
+
* return summarize(results);
|
|
25
|
+
* });
|
|
26
|
+
* ```
|
|
27
|
+
*/
|
|
28
|
+
import { trace, SpanStatusCode } from "@opentelemetry/api";
|
|
29
|
+
/** Span kind values following OpenLLMetry/OTEL GenAI conventions. */
|
|
30
|
+
// Span-kind labels, one per public wrapper (traceWorkflow / traceTask /
// traceAgent / traceTool). Each is passed to makeWrapper as the span kind.
const SPAN_KIND_WORKFLOW = "workflow";
const SPAN_KIND_TASK = "task";
const SPAN_KIND_AGENT = "agent";
const SPAN_KIND_TOOL = "tool";

// Name handed to trace.getTracer() when a wrapped function is invoked.
const TRACER_NAME = "opensearch-genai-sdk";

// Upper bound, in characters, on the serialized input/output stored on a
// span; longer payloads are cut and suffixed with "...(truncated)".
const MAX_ATTRIBUTE_LENGTH = 10 * 1000;
|
|
37
|
+
/**
|
|
38
|
+
* Trace a function as a workflow span.
|
|
39
|
+
*
|
|
40
|
+
* Use for top-level orchestration functions that coordinate
|
|
41
|
+
* multiple tasks, agents, or tool calls.
|
|
42
|
+
*
|
|
43
|
+
* @param name - Span name for the workflow.
|
|
44
|
+
* @param fn - The function to wrap.
|
|
45
|
+
* @param options - Optional trace settings.
|
|
46
|
+
* @returns A wrapped function with the same signature.
|
|
47
|
+
*
|
|
48
|
+
* @example
|
|
49
|
+
* ```ts
|
|
50
|
+
* const pipeline = traceWorkflow("qa_pipeline", async (question: string) => {
|
|
51
|
+
* const plan = await planSteps(question);
|
|
52
|
+
* return await execute(plan);
|
|
53
|
+
* });
|
|
54
|
+
* ```
|
|
55
|
+
*/
|
|
56
|
+
/**
 * Wrap `fn` so each call runs inside a span of kind "workflow".
 *
 * @param name - Span name for the workflow.
 * @param fn - The function to wrap.
 * @param options - Optional trace settings.
 * @returns The wrapper produced by makeWrapper.
 */
export function traceWorkflow(name, fn, options) {
    const kind = SPAN_KIND_WORKFLOW;
    return makeWrapper(name, fn, kind, options);
}
|
|
59
|
+
/**
|
|
60
|
+
* Trace a function as a task span.
|
|
61
|
+
*
|
|
62
|
+
* Use for individual units of work within a workflow.
|
|
63
|
+
*
|
|
64
|
+
* @param name - Span name for the task.
|
|
65
|
+
* @param fn - The function to wrap.
|
|
66
|
+
* @param options - Optional trace settings.
|
|
67
|
+
* @returns A wrapped function with the same signature.
|
|
68
|
+
*
|
|
69
|
+
* @example
|
|
70
|
+
* ```ts
|
|
71
|
+
* const summarize = traceTask("summarize", async (text: string) => {
|
|
72
|
+
* return llm.generate(`Summarize: ${text}`);
|
|
73
|
+
* });
|
|
74
|
+
* ```
|
|
75
|
+
*/
|
|
76
|
+
/**
 * Wrap `fn` so each call runs inside a span of kind "task".
 *
 * @param name - Span name for the task.
 * @param fn - The function to wrap.
 * @param options - Optional trace settings.
 * @returns The wrapper produced by makeWrapper.
 */
export function traceTask(name, fn, options) {
    const kind = SPAN_KIND_TASK;
    return makeWrapper(name, fn, kind, options);
}
|
|
79
|
+
/**
|
|
80
|
+
* Trace a function as an agent span.
|
|
81
|
+
*
|
|
82
|
+
* Use for autonomous agent logic that makes decisions and
|
|
83
|
+
* invokes tools.
|
|
84
|
+
*
|
|
85
|
+
* @param name - Span name for the agent.
|
|
86
|
+
* @param fn - The function to wrap.
|
|
87
|
+
* @param options - Optional trace settings.
|
|
88
|
+
* @returns A wrapped function with the same signature.
|
|
89
|
+
*
|
|
90
|
+
* @example
|
|
91
|
+
* ```ts
|
|
92
|
+
* const agent = traceAgent("research_agent", async (query: string) => {
|
|
93
|
+
* while (!done) {
|
|
94
|
+
* const action = await decideNextAction(query);
|
|
95
|
+
* result = await executeAction(action);
|
|
96
|
+
* }
|
|
97
|
+
* return result;
|
|
98
|
+
* });
|
|
99
|
+
* ```
|
|
100
|
+
*/
|
|
101
|
+
/**
 * Wrap `fn` so each call runs inside a span of kind "agent".
 *
 * @param name - Span name for the agent.
 * @param fn - The function to wrap.
 * @param options - Optional trace settings.
 * @returns The wrapper produced by makeWrapper.
 */
export function traceAgent(name, fn, options) {
    const kind = SPAN_KIND_AGENT;
    return makeWrapper(name, fn, kind, options);
}
|
|
104
|
+
/**
|
|
105
|
+
* Trace a function as a tool span.
|
|
106
|
+
*
|
|
107
|
+
* Use for tool/function calls invoked by agents.
|
|
108
|
+
*
|
|
109
|
+
* @param name - Span name for the tool.
|
|
110
|
+
* @param fn - The function to wrap.
|
|
111
|
+
* @param options - Optional trace settings.
|
|
112
|
+
* @returns A wrapped function with the same signature.
|
|
113
|
+
*
|
|
114
|
+
* @example
|
|
115
|
+
* ```ts
|
|
116
|
+
* const search = traceTool("web_search", async (query: string) => {
|
|
117
|
+
* return searchApi.query(query);
|
|
118
|
+
* });
|
|
119
|
+
* ```
|
|
120
|
+
*/
|
|
121
|
+
/**
 * Wrap `fn` so each call runs inside a span of kind "tool".
 *
 * @param name - Span name for the tool.
 * @param fn - The function to wrap.
 * @param options - Optional trace settings.
 * @returns The wrapper produced by makeWrapper.
 */
export function traceTool(name, fn, options) {
    const kind = SPAN_KIND_TOOL;
    return makeWrapper(name, fn, kind, options);
}
|
|
124
|
+
/**
|
|
125
|
+
* Create a wrapper function that wraps the original function in an OTEL span.
|
|
126
|
+
*
|
|
127
|
+
* Handles both sync and async functions transparently. When the wrapped
|
|
128
|
+
* function returns a Promise, the span is ended when the Promise settles.
|
|
129
|
+
*/
|
|
130
|
+
/**
 * Build a function that runs `fn` inside an active OTEL span.
 *
 * Works for both sync and async functions: if `fn` returns a native Promise,
 * the span is ended when that Promise settles; otherwise it is ended as soon
 * as `fn` returns or throws. The caller's `this` is forwarded to `fn`, so
 * wrapped object methods keep working.
 *
 * @param spanName - Name of the span started on every call.
 * @param fn - The function being traced.
 * @param spanKind - Span-kind label recorded on the span.
 * @param options - Optional trace settings; only `version` is read.
 * @returns A wrapper with the same call signature as `fn`. Its `name`
 *   property is `fn.name`, falling back to `spanName` for anonymous functions.
 * @throws Re-throws (or re-rejects with) whatever `fn` throws, after marking
 *   the span as ERROR and recording the exception.
 */
function makeWrapper(spanName, fn, spanKind, options) {
    // A regular function rather than an arrow so that `this` is the caller's
    // receiver and can be passed through to fn.
    const wrappedFn = function (...args) {
        const receiver = this;
        const tracer = trace.getTracer(TRACER_NAME);
        return tracer.startActiveSpan(spanName, (span) => {
            setSpanAttributes(span, spanKind, options?.version, args);
            // Shared failure path: flag the span, close it, propagate the error.
            const fail = (err) => {
                span.setStatus({ code: SpanStatusCode.ERROR, message: String(err) });
                span.recordException(err);
                span.end();
                throw err;
            };
            let result;
            try {
                result = fn.apply(receiver, args);
            }
            catch (err) {
                return fail(err);
            }
            // Handle async functions: result is a Promise
            if (result instanceof Promise) {
                // Two-argument then(): `fail` only sees rejections of `result`,
                // never an error from the success handler, so the span cannot
                // be ended twice.
                return result.then((resolved) => {
                    setOutput(span, resolved);
                    span.end();
                    return resolved;
                }, fail);
            }
            // Sync function
            setOutput(span, result);
            span.end();
            return result;
        });
    };
    // Preserve function name for debugging
    Object.defineProperty(wrappedFn, "name", { value: fn.name || spanName });
    return wrappedFn;
}
|
|
169
|
+
/**
|
|
170
|
+
* Set standard attributes on a span.
|
|
171
|
+
*/
|
|
172
|
+
/**
 * Stamp the standard attributes on a freshly started span.
 *
 * @param span - The span to annotate.
 * @param spanKind - Kind label, written under two attribute keys.
 * @param version - Optional entity version; skipped when undefined.
 * @param args - Arguments of the traced call, recorded as the span input.
 */
function setSpanAttributes(span, spanKind, version, args) {
    // The same kind label is written under the traceloop key and the gen_ai key.
    // NOTE(review): confirm "workflow"/"task"/"agent"/"tool" are the values
    // downstream consumers expect for gen_ai.operation.name.
    for (const key of ["traceloop.span.kind", "gen_ai.operation.name"]) {
        span.setAttribute(key, spanKind);
    }
    if (version !== undefined) {
        span.setAttribute("traceloop.entity.version", version);
    }
    // Input capture is best-effort; setInput swallows its own failures.
    setInput(span, args);
}
|
|
182
|
+
/**
|
|
183
|
+
* Attempt to capture function input as a span attribute.
|
|
184
|
+
*/
|
|
185
|
+
/**
 * Best-effort capture of the call arguments as the
 * "traceloop.entity.input" span attribute.
 *
 * A single argument is serialized on its own; several arguments are
 * serialized as an array. Nothing is recorded for a call with no arguments
 * or when serialization fails.
 *
 * @param span - The span receiving the attribute.
 * @param args - The argument list of the traced call.
 */
function setInput(span, args) {
    // JSON.stringify cannot encode bigint, so render it as a decimal string.
    const bigintAsString = (_key, val) => (typeof val === "bigint" ? val.toString() : val);
    try {
        if (args.length === 0) {
            return;
        }
        const payload = args.length === 1 ? args[0] : args;
        const json = JSON.stringify(payload, bigintAsString);
        // If JSON.stringify produced no string (e.g. for undefined), the
        // .length access throws and the catch below drops the attribute.
        const text = json.length > MAX_ATTRIBUTE_LENGTH
            ? json.slice(0, MAX_ATTRIBUTE_LENGTH) + "...(truncated)"
            : json;
        span.setAttribute("traceloop.entity.input", text);
    }
    catch {
        // Tracing must never break the traced call: ignore serialization errors.
    }
}
|
|
200
|
+
/**
|
|
201
|
+
* Attempt to capture function output as a span attribute.
|
|
202
|
+
*/
|
|
203
|
+
/**
 * Best-effort capture of a return value as the
 * "traceloop.entity.output" span attribute.
 *
 * Nothing is recorded for null/undefined results or when serialization fails.
 *
 * @param span - The span receiving the attribute.
 * @param result - The value returned (or resolved) by the traced function.
 */
function setOutput(span, result) {
    // JSON.stringify cannot encode bigint, so render it as a decimal string.
    const bigintAsString = (_key, val) => (typeof val === "bigint" ? val.toString() : val);
    try {
        if (result == null) {
            return;
        }
        const json = JSON.stringify(result, bigintAsString);
        // If JSON.stringify produced no string (e.g. for a function), the
        // .length access throws and the catch below drops the attribute.
        const text = json.length > MAX_ATTRIBUTE_LENGTH
            ? json.slice(0, MAX_ATTRIBUTE_LENGTH) + "...(truncated)"
            : json;
        span.setAttribute("traceloop.entity.output", text);
    }
    catch {
        // Tracing must never break the traced call: ignore serialization errors.
    }
}
|
|
217
|
+
//# sourceMappingURL=decorators.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"decorators.js","sourceRoot":"","sources":["../src/decorators.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;GA0BG;AAEH,OAAO,EAAE,KAAK,EAAE,cAAc,EAAQ,MAAM,oBAAoB,CAAC;AAEjE,qEAAqE;AACrE,MAAM,kBAAkB,GAAG,UAAU,CAAC;AACtC,MAAM,cAAc,GAAG,MAAM,CAAC;AAC9B,MAAM,eAAe,GAAG,OAAO,CAAC;AAChC,MAAM,cAAc,GAAG,MAAM,CAAC;AAE9B,MAAM,WAAW,GAAG,sBAAsB,CAAC;AAE3C,2EAA2E;AAC3E,MAAM,oBAAoB,GAAG,MAAM,CAAC;AAOpC;;;;;;;;;;;;;;;;;;GAkBG;AACH,MAAM,UAAU,aAAa,CAC3B,IAAY,EACZ,EAA+B,EAC/B,OAAsB;IAEtB,OAAO,WAAW,CAAC,IAAI,EAAE,EAAE,EAAE,kBAAkB,EAAE,OAAO,CAAC,CAAC;AAC5D,CAAC;AAED;;;;;;;;;;;;;;;;GAgBG;AACH,MAAM,UAAU,SAAS,CACvB,IAAY,EACZ,EAA+B,EAC/B,OAAsB;IAEtB,OAAO,WAAW,CAAC,IAAI,EAAE,EAAE,EAAE,cAAc,EAAE,OAAO,CAAC,CAAC;AACxD,CAAC;AAED;;;;;;;;;;;;;;;;;;;;;GAqBG;AACH,MAAM,UAAU,UAAU,CACxB,IAAY,EACZ,EAA+B,EAC/B,OAAsB;IAEtB,OAAO,WAAW,CAAC,IAAI,EAAE,EAAE,EAAE,eAAe,EAAE,OAAO,CAAC,CAAC;AACzD,CAAC;AAED;;;;;;;;;;;;;;;;GAgBG;AACH,MAAM,UAAU,SAAS,CACvB,IAAY,EACZ,EAA+B,EAC/B,OAAsB;IAEtB,OAAO,WAAW,CAAC,IAAI,EAAE,EAAE,EAAE,cAAc,EAAE,OAAO,CAAC,CAAC;AACxD,CAAC;AAED;;;;;GAKG;AACH,SAAS,WAAW,CAClB,QAAgB,EAChB,EAA+B,EAC/B,QAAgB,EAChB,OAAsB;IAEtB,MAAM,SAAS,GAAG,CAAC,GAAG,IAAW,EAAW,EAAE;QAC5C,MAAM,MAAM,GAAG,KAAK,CAAC,SAAS,CAAC,WAAW,CAAC,CAAC;QAE5C,OAAO,MAAM,CAAC,eAAe,CAAC,QAAQ,EAAE,CAAC,IAAU,EAAE,EAAE;YACrD,iBAAiB,CAAC,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,OAAO,EAAE,IAAI,CAAC,CAAC;YAE1D,IAAI,CAAC;gBACH,MAAM,MAAM,GAAG,EAAE,CAAC,GAAG,IAAI,CAAC,CAAC;gBAE3B,8CAA8C;gBAC9C,IAAI,MAAM,YAAY,OAAO,EAAE,CAAC;oBAC9B,OAAO,MAAM;yBACV,IAAI,CAAC,CAAC,QAAQ,EAAE,EAAE;wBACjB,SAAS,CAAC,IAAI,EAAE,QAAQ,CAAC,CAAC;wBAC1B,IAAI,CAAC,GAAG,EAAE,CAAC;wBACX,OAAO,QAAQ,CAAC;oBAClB,CAAC,CAAC;yBACD,KAAK,CAAC,CAAC,GAAU,EAAE,EAAE;wBACpB,IAAI,CAAC,SAAS,CAAC,EAAE,IAAI,EAAE,cAAc,CAAC,KAAK,EAAE,OAAO,EAAE,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;wBACrE,IAAI,CAAC,eAAe,CAAC,GAAG,CAAC,CAAC;wBAC1B,IAAI,CAAC,GAAG,EAAE,CAAC;wBACX,MAAM,GAAG,CAAC;oBACZ,CAAC,CAAY,CAAC;gBAClB,CAAC;gBAED,gBAAgB;gBAChB,SAAS,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;gBACxB,IAAI,CAAC,GAAG,EA
AE,CAAC;gBACX,OAAO,MAAM,CAAC;YAChB,CAAC;YAAC,OAAO,GAAG,EAAE,CAAC;gBACb,IAAI,CAAC,SAAS,CAAC,EAAE,IAAI,EAAE,cAAc,CAAC,KAAK,EAAE,OAAO,EAAE,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;gBACrE,IAAI,CAAC,eAAe,CAAC,GAAY,CAAC,CAAC;gBACnC,IAAI,CAAC,GAAG,EAAE,CAAC;gBACX,MAAM,GAAG,CAAC;YACZ,CAAC;QACH,CAAC,CAAC,CAAC;IACL,CAAC,CAAC;IAEF,uCAAuC;IACvC,MAAM,CAAC,cAAc,CAAC,SAAS,EAAE,MAAM,EAAE,EAAE,KAAK,EAAE,EAAE,CAAC,IAAI,IAAI,QAAQ,EAAE,CAAC,CAAC;IACzE,OAAO,SAAS,CAAC;AACnB,CAAC;AAED;;GAEG;AACH,SAAS,iBAAiB,CACxB,IAAU,EACV,QAAgB,EAChB,OAA2B,EAC3B,IAAe;IAEf,yCAAyC;IACzC,IAAI,CAAC,YAAY,CAAC,qBAAqB,EAAE,QAAQ,CAAC,CAAC;IACnD,IAAI,CAAC,YAAY,CAAC,uBAAuB,EAAE,QAAQ,CAAC,CAAC;IAErD,IAAI,OAAO,KAAK,SAAS,EAAE,CAAC;QAC1B,IAAI,CAAC,YAAY,CAAC,0BAA0B,EAAE,OAAO,CAAC,CAAC;IACzD,CAAC;IAED,iEAAiE;IACjE,QAAQ,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC;AACvB,CAAC;AAED;;GAEG;AACH,SAAS,QAAQ,CAAC,IAAU,EAAE,IAAe;IAC3C,IAAI,CAAC;QACH,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC;YAAE,OAAO;QAE9B,MAAM,KAAK,GAAG,IAAI,CAAC,MAAM,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC;QACjD,IAAI,UAAU,GAAG,IAAI,CAAC,SAAS,CAAC,KAAK,EAAE,CAAC,IAAI,EAAE,GAAG,EAAE,EAAE,CACnD,OAAO,GAAG,KAAK,QAAQ,CAAC,CAAC,CAAC,GAAG,CAAC,QAAQ,EAAE,CAAC,CAAC,CAAC,GAAG,CAC/C,CAAC;QAEF,IAAI,UAAU,CAAC,MAAM,GAAG,oBAAoB,EAAE,CAAC;YAC7C,UAAU,GAAG,UAAU,CAAC,KAAK,CAAC,CAAC,EAAE,oBAAoB,CAAC,GAAG,gBAAgB,CAAC;QAC5E,CAAC;QAED,IAAI,CAAC,YAAY,CAAC,wBAAwB,EAAE,UAAU,CAAC,CAAC;IAC1D,CAAC;IAAC,MAAM,CAAC;QACP,uCAAuC;IACzC,CAAC;AACH,CAAC;AAED;;GAEG;AACH,SAAS,SAAS,CAAC,IAAU,EAAE,MAAe;IAC5C,IAAI,CAAC;QACH,IAAI,MAAM,KAAK,SAAS,IAAI,MAAM,KAAK,IAAI;YAAE,OAAO;QAEpD,IAAI,UAAU,GAAG,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,GAAG,EAAE,EAAE,CACpD,OAAO,GAAG,KAAK,QAAQ,CAAC,CAAC,CAAC,GAAG,CAAC,QAAQ,EAAE,CAAC,CAAC,CAAC,GAAG,CAC/C,CAAC;QAEF,IAAI,UAAU,CAAC,MAAM,GAAG,oBAAoB,EAAE,CAAC;YAC7C,UAAU,GAAG,UAAU,CAAC,KAAK,CAAC,CAAC,EAAE,oBAAoB,CAAC,GAAG,gBAAgB,CAAC;QAC5E,CAAC;QAED,IAAI,CAAC,YAAY,CAAC,yBAAyB,EAAE,UAAU,CAAC,CAAC;IAC3D,CAAC;IAAC,MAAM,CAAC;QACP,uCAAuC;IACzC,CAAC;AACH,CAAC"}
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Evaluation orchestrator for OpenSearch AI observability.
|
|
3
|
+
*
|
|
4
|
+
* Runs a task function across a dataset, applies scorers to each output,
|
|
5
|
+
* creates OTEL spans for the entire eval run, and stores scores in OpenSearch.
|
|
6
|
+
*/
|
|
7
|
+
import { Score, Scorer } from "./protocol.js";
|
|
8
|
+
/**
|
|
9
|
+
* Result of a single data point evaluation.
|
|
10
|
+
*/
|
|
11
|
+
/**
 * Outcome of evaluating one dataset entry.
 */
export interface EvalResult {
    /** Input value that was passed to the task. */
    input: unknown;
    /** Value the task returned; absent when the task failed. */
    output?: unknown;
    /** Reference answer copied from the dataset entry, if it had one. */
    expected?: unknown;
    /** Scores for this entry, keyed by scorer name. */
    scores: Record<string, Score>;
    /** Stringified error, present only when the task failed. */
    error?: string;
}
|
|
23
|
+
/**
|
|
24
|
+
* Summary of an evaluation run.
|
|
25
|
+
*/
|
|
26
|
+
/**
 * Aggregate outcome of one evaluation run.
 */
export interface EvalSummary {
    /** Name of the evaluation run. */
    name: string;
    /** One result per dataset entry. */
    results: EvalResult[];
    /** Mean score value per scorer, keyed by scorer name. */
    averages: Record<string, number>;
    /** Size of the dataset. */
    total: number;
    /** How many entries failed. */
    errors: number;
}
|
|
38
|
+
/**
|
|
39
|
+
* A single data point in the evaluation dataset.
|
|
40
|
+
*/
|
|
41
|
+
/**
 * One entry of an evaluation dataset.
 */
export interface EvalDatum {
    /** Value handed to the task. */
    input: unknown;
    /** Reference answer to compare against (optional). */
    expected?: unknown;
    /** Any extra fields are accepted. */
    [key: string]: unknown;
}
|
|
49
|
+
/**
 * Options accepted by evaluate().
 */
export interface EvaluateOptions {
    /** Name for this evaluation run. */
    name: string;
    /**
     * The dataset: an array of entries with an `input` and an optional
     * `expected`, or a function returning such an array. The function form
     * is called synchronously, once.
     */
    data: EvalDatum[] | (() => EvalDatum[]);
    /**
     * Function that takes an input and returns the output to score.
     *
     * It should be synchronous. The signature admits a Promise return, but
     * evaluate() is synchronous and does not await it: an async task only
     * produces a warning and no usable output.
     */
    task: (input: unknown) => unknown | Promise<unknown>;
    /** Scorers applied to the output of every entry. */
    scores: Scorer[];
    /** Whether to emit scores as separate OTEL spans for Data Prepper routing. Defaults to true. */
    emitScores?: boolean;
}
|
|
61
|
+
/**
|
|
62
|
+
* Run an evaluation: dataset -> task -> scorers -> OTEL spans.
|
|
63
|
+
*
|
|
64
|
+
* For each data point, runs the task function to produce output,
|
|
65
|
+
* then applies all scorers. Creates OTEL spans for the entire flow.
|
|
66
|
+
* Scores are emitted as OTEL spans through the same exporter pipeline.
|
|
67
|
+
*
|
|
68
|
+
* @example
|
|
69
|
+
* ```ts
|
|
70
|
+
* import { evaluate } from "@opensearch-project/genai-sdk";
|
|
71
|
+
* import type { Scorer } from "@opensearch-project/genai-sdk/evals";
|
|
72
|
+
*
|
|
73
|
+
* const exactMatch: Scorer = {
|
|
74
|
+
* name: "exact_match",
|
|
75
|
+
* score({ output, expected }) {
|
|
76
|
+
* const match = output.trim().toLowerCase() === (expected ?? "").trim().toLowerCase();
|
|
77
|
+
* return { name: "exact_match", value: match ? 1.0 : 0.0 };
|
|
78
|
+
* },
|
|
79
|
+
* };
|
|
80
|
+
*
|
|
81
|
+
* const results = evaluate({
|
|
82
|
+
* name: "qa-eval",
|
|
83
|
+
* data: [
|
|
84
|
+
* { input: "Capital of France?", expected: "Paris" },
|
|
85
|
+
* { input: "2+2?", expected: "4" },
|
|
86
|
+
* ],
|
|
87
|
+
* task: (input) => myLlmCall(input as string),
|
|
88
|
+
* scores: [exactMatch],
|
|
89
|
+
* });
|
|
90
|
+
* ```
|
|
91
|
+
*/
|
|
92
|
+
// Synchronous: the finished EvalSummary is returned directly, not wrapped in a Promise.
export declare function evaluate(options: EvaluateOptions): EvalSummary;
|
|
93
|
+
/**
|
|
94
|
+
* Format an EvalSummary as a human-readable string.
|
|
95
|
+
*/
|
|
96
|
+
// Takes a finished EvalSummary and returns its textual rendering as a string.
export declare function formatEvalSummary(summary: EvalSummary): string;
|
|
97
|
+
//# sourceMappingURL=evaluate.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"evaluate.d.ts","sourceRoot":"","sources":["../../src/evals/evaluate.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAGH,OAAO,EAAE,KAAK,EAAE,MAAM,EAAc,MAAM,eAAe,CAAC;AAK1D;;GAEG;AACH,MAAM,WAAW,UAAU;IACzB,mCAAmC;IACnC,KAAK,EAAE,OAAO,CAAC;IACf,yBAAyB;IACzB,MAAM,CAAC,EAAE,OAAO,CAAC;IACjB,yCAAyC;IACzC,QAAQ,CAAC,EAAE,OAAO,CAAC;IACnB,oCAAoC;IACpC,MAAM,EAAE,MAAM,CAAC,MAAM,EAAE,KAAK,CAAC,CAAC;IAC9B,wCAAwC;IACxC,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAED;;GAEG;AACH,MAAM,WAAW,WAAW;IAC1B,2BAA2B;IAC3B,IAAI,EAAE,MAAM,CAAC;IACb,8BAA8B;IAC9B,OAAO,EAAE,UAAU,EAAE,CAAC;IACtB,gCAAgC;IAChC,QAAQ,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACjC,6BAA6B;IAC7B,KAAK,EAAE,MAAM,CAAC;IACd,oCAAoC;IACpC,MAAM,EAAE,MAAM,CAAC;CAChB;AAED;;GAEG;AACH,MAAM,WAAW,SAAS;IACxB,6BAA6B;IAC7B,KAAK,EAAE,OAAO,CAAC;IACf,sCAAsC;IACtC,QAAQ,CAAC,EAAE,OAAO,CAAC;IACnB,+BAA+B;IAC/B,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC;CACxB;AAED,MAAM,WAAW,eAAe;IAC9B,oCAAoC;IACpC,IAAI,EAAE,MAAM,CAAC;IACb,uGAAuG;IACvG,IAAI,EAAE,SAAS,EAAE,GAAG,CAAC,MAAM,SAAS,EAAE,CAAC,CAAC;IACxC,6EAA6E;IAC7E,IAAI,EAAE,CAAC,KAAK,EAAE,OAAO,KAAK,OAAO,GAAG,OAAO,CAAC,OAAO,CAAC,CAAC;IACrD,gCAAgC;IAChC,MAAM,EAAE,MAAM,EAAE,CAAC;IACjB,gGAAgG;IAChG,UAAU,CAAC,EAAE,OAAO,CAAC;CACtB;AAaD;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA8BG;AACH,wBAAgB,QAAQ,CAAC,OAAO,EAAE,eAAe,GAAG,WAAW,CAiL9D;AAED;;GAEG;AACH,wBAAgB,iBAAiB,CAAC,OAAO,EAAE,WAAW,GAAG,MAAM,CAM9D"}
|
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Evaluation orchestrator for OpenSearch AI observability.
|
|
3
|
+
*
|
|
4
|
+
* Runs a task function across a dataset, applies scorers to each output,
|
|
5
|
+
* creates OTEL spans for the entire eval run, and stores scores in OpenSearch.
|
|
6
|
+
*/
|
|
7
|
+
import { trace, SpanStatusCode } from "@opentelemetry/api";
|
|
8
|
+
import { adaptScore } from "./protocol.js";
|
|
9
|
+
import { score as submitScore } from "../score.js";
|
|
10
|
+
// Name passed to trace.getTracer() for the spans created by evaluate().
const TRACER_NAME = "opensearch-genai-sdk-evals";
|
|
11
|
+
/**
|
|
12
|
+
* Format a span context ID as a hex string.
|
|
13
|
+
*/
|
|
14
|
+
/**
 * Return the trace ID to attach to an emitted score.
 * The ID is passed through unchanged.
 *
 * @param traceId - Trace ID taken from a span context.
 * @returns The same value.
 */
function formatTraceId(traceId) {
    const formatted = traceId;
    return formatted;
}
|
|
17
|
+
/**
 * Return the span ID to attach to an emitted score.
 * The ID is passed through unchanged.
 *
 * @param spanId - Span ID taken from a span context.
 * @returns The same value.
 */
function formatSpanId(spanId) {
    const formatted = spanId;
    return formatted;
}
|
|
20
|
+
/**
|
|
21
|
+
* Run an evaluation: dataset -> task -> scorers -> OTEL spans.
|
|
22
|
+
*
|
|
23
|
+
* For each data point, runs the task function to produce output,
|
|
24
|
+
* then applies all scorers. Creates OTEL spans for the entire flow.
|
|
25
|
+
* Scores are emitted as OTEL spans through the same exporter pipeline.
|
|
26
|
+
*
|
|
27
|
+
* @example
|
|
28
|
+
* ```ts
|
|
29
|
+
* import { evaluate } from "@opensearch-project/genai-sdk";
|
|
30
|
+
* import type { Scorer } from "@opensearch-project/genai-sdk/evals";
|
|
31
|
+
*
|
|
32
|
+
* const exactMatch: Scorer = {
|
|
33
|
+
* name: "exact_match",
|
|
34
|
+
* score({ output, expected }) {
|
|
35
|
+
* const match = output.trim().toLowerCase() === (expected ?? "").trim().toLowerCase();
|
|
36
|
+
* return { name: "exact_match", value: match ? 1.0 : 0.0 };
|
|
37
|
+
* },
|
|
38
|
+
* };
|
|
39
|
+
*
|
|
40
|
+
* const results = evaluate({
|
|
41
|
+
* name: "qa-eval",
|
|
42
|
+
* data: [
|
|
43
|
+
* { input: "Capital of France?", expected: "Paris" },
|
|
44
|
+
* { input: "2+2?", expected: "4" },
|
|
45
|
+
* ],
|
|
46
|
+
* task: (input) => myLlmCall(input as string),
|
|
47
|
+
* scores: [exactMatch],
|
|
48
|
+
* });
|
|
49
|
+
* ```
|
|
50
|
+
*/
|
|
51
|
+
/**
 * Run an evaluation synchronously: dataset -> task -> scorers -> OTEL spans.
 *
 * Span layout: one "evaluate" span for the run, one "eval_item" span per
 * dataset entry, and beneath it an "eval_task" span plus one
 * "eval_score.<scorer>" span per scorer. When `emitScores` is true (the
 * default), each score is also submitted through score(), linked to the item
 * span's trace and span IDs.
 *
 * Failure handling:
 * - A task that throws marks the entry as failed (`error` set,
 *   `summary.errors` incremented) and its scorers are skipped.
 * - A task that returns a Promise is treated the same way, because this
 *   function cannot await it. The Promise is still exposed on
 *   `result.output` for callers that want to await it themselves.
 * - A scorer that throws is logged and skipped; other scorers still run.
 *
 * Scorers receive `input`, `output` and `expected` as strings. Plain objects
 * are rendered as JSON; every other value is rendered with `String()`.
 *
 * @param options - Run name, dataset (or dataset factory), task, scorers,
 *   and the `emitScores` flag.
 * @returns A summary with per-entry results, per-scorer averages, the
 *   dataset size, and the number of failed entries.
 */
export function evaluate(options) {
    const { name, data, task, scores: scorers, emitScores = true } = options;
    const tracer = trace.getTracer(TRACER_NAME);
    // Resolve data if callable
    const dataset = typeof data === "function" ? data() : data;
    const summary = {
        name,
        results: [],
        averages: {},
        total: dataset.length,
        errors: 0,
    };
    // Render a value as text for span attributes and scorer arguments.
    // String() is kept for everything it renders usefully; only values that
    // would come out as the uninformative "[object Object]" (or that cannot
    // be converted at all) are rendered as JSON instead.
    const toText = (value) => {
        if (typeof value !== "object" || value === null) {
            return String(value);
        }
        let text;
        try {
            text = String(value);
        }
        catch {
            // e.g. a null-prototype object has no toString(); fall back to JSON.
        }
        if (text !== undefined && text !== "[object Object]") {
            return text;
        }
        try {
            const json = JSON.stringify(value, (_key, val) => typeof val === "bigint" ? val.toString() : val);
            if (typeof json === "string") {
                return json;
            }
        }
        catch {
            // Circular structure or throwing toJSON(); use the fallback below.
        }
        return text ?? "[object Object]";
    };
    tracer.startActiveSpan("evaluate", {
        attributes: {
            "eval.name": name,
            "eval.dataset_size": dataset.length,
            "eval.scorer_count": scorers.length,
        },
    }, (evalSpan) => {
        try {
            for (let i = 0; i < dataset.length; i++) {
                const datum = dataset[i];
                const inputVal = datum.input;
                const expectedVal = datum.expected;
                const evalResult = {
                    input: inputVal,
                    expected: expectedVal,
                    scores: {},
                };
                tracer.startActiveSpan("eval_item", {
                    attributes: {
                        "eval.item.index": i,
                        "eval.item.input": toText(inputVal).slice(0, 1000),
                    },
                }, (itemSpan) => {
                    try {
                        // Run the task
                        let output;
                        let taskFailed = false;
                        tracer.startActiveSpan("eval_task", (taskSpan) => {
                            try {
                                const result = task(inputVal);
                                if (result instanceof Promise) {
                                    console.warn("[opensearch-genai-sdk] Async task detected in sync evaluate(). " +
                                        "The promise will not be awaited. Use sync tasks or implement evaluateAsync().");
                                    // Keep the promise reachable for callers, and attach a handler so a
                                    // later rejection is reported instead of surfacing as an unhandled rejection.
                                    evalResult.output = result;
                                    result.catch((err) => {
                                        console.warn(`[opensearch-genai-sdk] Async task for item ${i} rejected: ${err}`);
                                    });
                                    // Scoring the text "[object Promise]" would be meaningless: fail the item.
                                    throw new Error("Task returned a Promise; evaluate() is synchronous and cannot score async tasks.");
                                }
                                output = result;
                                evalResult.output = output;
                                taskSpan.setAttribute("eval.task.output", toText(output).slice(0, 1000));
                            }
                            catch (err) {
                                evalResult.error = String(err);
                                summary.errors++;
                                taskSpan.setStatus({ code: SpanStatusCode.ERROR, message: String(err) });
                                taskSpan.recordException(err);
                                taskFailed = true;
                            }
                            finally {
                                taskSpan.end();
                            }
                        });
                        if (!taskFailed) {
                            // Run each scorer
                            for (const scorer of scorers) {
                                const scorerName = scorer.name;
                                tracer.startActiveSpan(`eval_score.${scorerName}`, { attributes: { "eval.scorer.name": scorerName } }, (scoreSpan) => {
                                    try {
                                        const rawResult = scorer.score({
                                            input: toText(inputVal),
                                            output: toText(output),
                                            expected: expectedVal !== undefined ? toText(expectedVal) : undefined,
                                        });
                                        const scoreObj = adaptScore(scorerName, rawResult);
                                        evalResult.scores[scorerName] = scoreObj;
                                        if (scoreObj.value !== undefined) {
                                            scoreSpan.setAttribute("eval.score.value", scoreObj.value);
                                        }
                                        if (scoreObj.label) {
                                            scoreSpan.setAttribute("eval.score.label", scoreObj.label);
                                        }
                                        if (scoreObj.rationale) {
                                            scoreSpan.setAttribute("eval.score.rationale", scoreObj.rationale.slice(0, 500));
                                        }
                                    }
                                    catch (err) {
                                        // A failing scorer must not abort the remaining scorers or items.
                                        console.warn(`[opensearch-genai-sdk] Scorer ${scorerName} failed on item ${i}: ${err}`);
                                        scoreSpan.setStatus({ code: SpanStatusCode.ERROR, message: String(err) });
                                        scoreSpan.recordException(err);
                                    }
                                    finally {
                                        scoreSpan.end();
                                    }
                                });
                            }
                            // Emit scores as OTEL spans
                            if (emitScores) {
                                const spanContext = itemSpan.spanContext();
                                const traceIdHex = formatTraceId(spanContext.traceId);
                                const spanIdHex = formatSpanId(spanContext.spanId);
                                for (const [scorerName, scoreObj] of Object.entries(evalResult.scores)) {
                                    try {
                                        submitScore({
                                            name: scorerName,
                                            value: scoreObj.value,
                                            traceId: traceIdHex,
                                            spanId: spanIdHex,
                                            label: scoreObj.label,
                                            rationale: scoreObj.rationale,
                                            source: "eval",
                                            metadata: { eval_name: name, item_index: i },
                                        });
                                    }
                                    catch (err) {
                                        console.warn(`[opensearch-genai-sdk] Failed to emit score ${scorerName}: ${err}`);
                                    }
                                }
                            }
                        }
                        summary.results.push(evalResult);
                    }
                    finally {
                        // End the item span even if something unexpected throws above.
                        itemSpan.end();
                    }
                });
            }
            // Compute averages
            const scoreTotals = {};
            for (const result of summary.results) {
                for (const [scorerName, scoreObj] of Object.entries(result.scores)) {
                    if (scoreObj.value !== undefined) {
                        if (!scoreTotals[scorerName]) {
                            scoreTotals[scorerName] = [];
                        }
                        scoreTotals[scorerName].push(scoreObj.value);
                    }
                }
            }
            for (const [scorerName, values] of Object.entries(scoreTotals)) {
                // values is never empty: a list is only created when a value is pushed.
                const avg = values.reduce((a, b) => a + b, 0) / values.length;
                summary.averages[scorerName] = avg;
                evalSpan.setAttribute(`eval.avg.${scorerName}`, avg);
            }
            evalSpan.setAttribute("eval.errors", summary.errors);
        }
        finally {
            // End the run span on every exit path so it is always exported.
            evalSpan.end();
        }
    });
    return summary;
}
|
|
192
|
+
/**
 * Render an EvalSummary as multi-line, human-readable text.
 *
 * The first line reports the eval name, the sample count and the error
 * count. Each entry of `summary.averages` then contributes one line with
 * the scorer name and its average formatted to three decimal places.
 *
 * @param summary - Object providing `name`, `total`, `errors` and `averages`.
 * @returns The header and per-scorer lines joined with newline characters.
 */
export function formatEvalSummary(summary) {
    const header = `Eval: ${summary.name} (${summary.total} samples, ${summary.errors} errors)`;
    // One line per scorer, in the iteration order of Object.entries.
    const scorerLines = Object.entries(summary.averages).map(([scorerName, avg]) => ` ${scorerName}: ${avg.toFixed(3)}`);
    return [header, ...scorerLines].join("\n");
}
|
|
202
|
+
//# sourceMappingURL=evaluate.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"evaluate.js","sourceRoot":"","sources":["../../src/evals/evaluate.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAE,KAAK,EAAE,cAAc,EAAQ,MAAM,oBAAoB,CAAC;AACjE,OAAO,EAAiB,UAAU,EAAE,MAAM,eAAe,CAAC;AAC1D,OAAO,EAAE,KAAK,IAAI,WAAW,EAAE,MAAM,aAAa,CAAC;AAEnD,MAAM,WAAW,GAAG,4BAA4B,CAAC;AA2DjD;;GAEG;AACH,SAAS,aAAa,CAAC,OAAe;IACpC,OAAO,OAAO,CAAC;AACjB,CAAC;AAED,SAAS,YAAY,CAAC,MAAc;IAClC,OAAO,MAAM,CAAC;AAChB,CAAC;AAED;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA8BG;AACH,MAAM,UAAU,QAAQ,CAAC,OAAwB;IAC/C,MAAM,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,UAAU,GAAG,IAAI,EAAE,GAAG,OAAO,CAAC;IAEzE,MAAM,MAAM,GAAG,KAAK,CAAC,SAAS,CAAC,WAAW,CAAC,CAAC;IAE5C,2BAA2B;IAC3B,MAAM,OAAO,GAAgB,OAAO,IAAI,KAAK,UAAU,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC;IAExE,MAAM,OAAO,GAAgB;QAC3B,IAAI;QACJ,OAAO,EAAE,EAAE;QACX,QAAQ,EAAE,EAAE;QACZ,KAAK,EAAE,OAAO,CAAC,MAAM;QACrB,MAAM,EAAE,CAAC;KACV,CAAC;IAEF,MAAM,CAAC,eAAe,CACpB,UAAU,EACV;QACE,UAAU,EAAE;YACV,WAAW,EAAE,IAAI;YACjB,mBAAmB,EAAE,OAAO,CAAC,MAAM;YACnC,mBAAmB,EAAE,OAAO,CAAC,MAAM;SACpC;KACF,EACD,CAAC,QAAc,EAAE,EAAE;QACjB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACxC,MAAM,KAAK,GAAG,OAAO,CAAC,CAAC,CAAC,CAAC;YACzB,MAAM,QAAQ,GAAG,KAAK,CAAC,KAAK,CAAC;YAC7B,MAAM,WAAW,GAAG,KAAK,CAAC,QAAQ,CAAC;YAEnC,MAAM,UAAU,GAAe;gBAC7B,KAAK,EAAE,QAAQ;gBACf,QAAQ,EAAE,WAAW;gBACrB,MAAM,EAAE,EAAE;aACX,CAAC;YAEF,MAAM,CAAC,eAAe,CACpB,WAAW,EACX;gBACE,UAAU,EAAE;oBACV,iBAAiB,EAAE,CAAC;oBACpB,iBAAiB,EAAE,MAAM,CAAC,QAAQ,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,IAAI,CAAC;iBACnD;aACF,EACD,CAAC,QAAc,EAAE,EAAE;gBACjB,eAAe;gBACf,IAAI,MAAe,CAAC;gBACpB,IAAI,UAAU,GAAG,KAAK,CAAC;gBAEvB,MAAM,CAAC,eAAe,CAAC,WAAW,EAAE,CAAC,QAAc,EAAE,EAAE;oBACrD,IAAI,CAAC;wBACH,MAAM,MAAM,GAAG,IAAI,CAAC,QAAQ,CAAC,CAAC;wBAE9B,wEAAwE;wBACxE,uEAAuE;wBACvE,8BAA8B;wBAC9B,IAAI,MAAM,YAAY,OAAO,EAAE,CAAC;4BAC9B,OAAO,CAAC,IAAI,CACV,iEAAiE;gCAC/D,+EAA+E,CAClF,CAAC;wBACJ,CAAC;wBAED,MAAM,GAAG,MAAM,CAAC;wBAChB,UAAU,CAAC,MAAM,GAAG,MAAM,CAAC;wBAC3B,QAAQ,CAAC,YAAY,CAAC,kBAAkB
,EAAE,MAAM,CAAC,MAAM,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,IAAI,CAAC,CAAC,CAAC;oBAC3E,CAAC;oBAAC,OAAO,GAAG,EAAE,CAAC;wBACb,UAAU,CAAC,KAAK,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC;wBAC/B,OAAO,CAAC,MAAM,EAAE,CAAC;wBACjB,QAAQ,CAAC,SAAS,CAAC,EAAE,IAAI,EAAE,cAAc,CAAC,KAAK,EAAE,OAAO,EAAE,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;wBACzE,QAAQ,CAAC,eAAe,CAAC,GAAY,CAAC,CAAC;wBACvC,UAAU,GAAG,IAAI,CAAC;oBACpB,CAAC;oBACD,QAAQ,CAAC,GAAG,EAAE,CAAC;gBACjB,CAAC,CAAC,CAAC;gBAEH,IAAI,CAAC,UAAU,EAAE,CAAC;oBAChB,kBAAkB;oBAClB,KAAK,MAAM,MAAM,IAAI,OAAO,EAAE,CAAC;wBAC7B,MAAM,UAAU,GAAG,MAAM,CAAC,IAAI,CAAC;wBAE/B,MAAM,CAAC,eAAe,CACpB,cAAc,UAAU,EAAE,EAC1B,EAAE,UAAU,EAAE,EAAE,kBAAkB,EAAE,UAAU,EAAE,EAAE,EAClD,CAAC,SAAe,EAAE,EAAE;4BAClB,IAAI,CAAC;gCACH,MAAM,SAAS,GAAG,MAAM,CAAC,KAAK,CAAC;oCAC7B,KAAK,EAAE,MAAM,CAAC,QAAQ,CAAC;oCACvB,MAAM,EAAE,MAAM,CAAC,MAAM,CAAC;oCACtB,QAAQ,EAAE,WAAW,KAAK,SAAS,CAAC,CAAC,CAAC,MAAM,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,SAAS;iCACtE,CAAC,CAAC;gCACH,MAAM,QAAQ,GAAG,UAAU,CAAC,UAAU,EAAE,SAAS,CAAC,CAAC;gCACnD,UAAU,CAAC,MAAM,CAAC,UAAU,CAAC,GAAG,QAAQ,CAAC;gCAEzC,IAAI,QAAQ,CAAC,KAAK,KAAK,SAAS,EAAE,CAAC;oCACjC,SAAS,CAAC,YAAY,CAAC,kBAAkB,EAAE,QAAQ,CAAC,KAAK,CAAC,CAAC;gCAC7D,CAAC;gCACD,IAAI,QAAQ,CAAC,KAAK,EAAE,CAAC;oCACnB,SAAS,CAAC,YAAY,CAAC,kBAAkB,EAAE,QAAQ,CAAC,KAAK,CAAC,CAAC;gCAC7D,CAAC;gCACD,IAAI,QAAQ,CAAC,SAAS,EAAE,CAAC;oCACvB,SAAS,CAAC,YAAY,CACpB,sBAAsB,EACtB,QAAQ,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,CACjC,CAAC;gCACJ,CAAC;4BACH,CAAC;4BAAC,OAAO,GAAG,EAAE,CAAC;gCACb,OAAO,CAAC,IAAI,CACV,iCAAiC,UAAU,mBAAmB,CAAC,KAAK,GAAG,EAAE,CAC1E,CAAC;gCACF,SAAS,CAAC,SAAS,CAAC,EAAE,IAAI,EAAE,cAAc,CAAC,KAAK,EAAE,OAAO,EAAE,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;gCAC1E,SAAS,CAAC,eAAe,CAAC,GAAY,CAAC,CAAC;4BAC1C,CAAC;4BACD,SAAS,CAAC,GAAG,EAAE,CAAC;wBAClB,CAAC,CACF,CAAC;oBACJ,CAAC;oBAED,4BAA4B;oBAC5B,IAAI,UAAU,EAAE,CAAC;wBACf,MAAM,WAAW,GAAG,QAAQ,CAAC,WAAW,EAAE,CAAC;wBAC3C,MAAM,UAAU,GAAG,aAAa,CAAC,WAAW,CAAC,OAAO,CAAC,CAAC;wBACtD,MAAM,SAAS,GAAG,YAAY,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC;wBAEnD,KAAK,MAAM,CAAC,UAAU,EAAE,QAAQ,CAAC,IAAI,MAAM
,CAAC,OAAO,CAAC,UAAU,CAAC,MAAM,CAAC,EAAE,CAAC;4BACvE,IAAI,CAAC;gCACH,WAAW,CAAC;oCACV,IAAI,EAAE,UAAU;oCAChB,KAAK,EAAE,QAAQ,CAAC,KAAK;oCACrB,OAAO,EAAE,UAAU;oCACnB,MAAM,EAAE,SAAS;oCACjB,KAAK,EAAE,QAAQ,CAAC,KAAK;oCACrB,SAAS,EAAE,QAAQ,CAAC,SAAS;oCAC7B,MAAM,EAAE,MAAM;oCACd,QAAQ,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,UAAU,EAAE,CAAC,EAAE;iCAC7C,CAAC,CAAC;4BACL,CAAC;4BAAC,OAAO,GAAG,EAAE,CAAC;gCACb,OAAO,CAAC,IAAI,CACV,+CAA+C,UAAU,KAAK,GAAG,EAAE,CACpE,CAAC;4BACJ,CAAC;wBACH,CAAC;oBACH,CAAC;gBACH,CAAC;gBAED,OAAO,CAAC,OAAO,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;gBACjC,QAAQ,CAAC,GAAG,EAAE,CAAC;YACjB,CAAC,CACF,CAAC;QACJ,CAAC;QAED,mBAAmB;QACnB,MAAM,WAAW,GAA6B,EAAE,CAAC;QACjD,KAAK,MAAM,MAAM,IAAI,OAAO,CAAC,OAAO,EAAE,CAAC;YACrC,KAAK,MAAM,CAAC,UAAU,EAAE,QAAQ,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,MAAM,CAAC,EAAE,CAAC;gBACnE,IAAI,QAAQ,CAAC,KAAK,KAAK,SAAS,EAAE,CAAC;oBACjC,IAAI,CAAC,WAAW,CAAC,UAAU,CAAC,EAAE,CAAC;wBAC7B,WAAW,CAAC,UAAU,CAAC,GAAG,EAAE,CAAC;oBAC/B,CAAC;oBACD,WAAW,CAAC,UAAU,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC;gBAC/C,CAAC;YACH,CAAC;QACH,CAAC;QAED,KAAK,MAAM,CAAC,UAAU,EAAE,MAAM,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,WAAW,CAAC,EAAE,CAAC;YAC/D,MAAM,GAAG,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,GAAG,MAAM,CAAC,MAAM,CAAC;YAC9D,OAAO,CAAC,QAAQ,CAAC,UAAU,CAAC,GAAG,GAAG,CAAC;YACnC,QAAQ,CAAC,YAAY,CAAC,YAAY,UAAU,EAAE,EAAE,GAAG,CAAC,CAAC;QACvD,CAAC;QAED,QAAQ,CAAC,YAAY,CAAC,aAAa,EAAE,OAAO,CAAC,MAAM,CAAC,CAAC;QACrD,QAAQ,CAAC,GAAG,EAAE,CAAC;IACjB,CAAC,CACF,CAAC;IAEF,OAAO,OAAO,CAAC;AACjB,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,iBAAiB,CAAC,OAAoB;IACpD,MAAM,KAAK,GAAG,CAAC,SAAS,OAAO,CAAC,IAAI,KAAK,OAAO,CAAC,KAAK,aAAa,OAAO,CAAC,MAAM,UAAU,CAAC,CAAC;IAC7F,KAAK,MAAM,CAAC,UAAU,EAAE,GAAG,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,OAAO,CAAC,QAAQ,CAAC,EAAE,CAAC;QACjE,KAAK,CAAC,IAAI,CAAC,KAAK,UAAU,KAAK,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;IACnD,CAAC;IACD,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC1B,CAAC"}
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Evaluation framework for OpenSearch AI observability.
|
|
3
|
+
*
|
|
4
|
+
* Provides the evaluate() orchestrator and Scorer interface for running
|
|
5
|
+
* evaluations on LLM outputs. Compatible with autoevals, phoenix-evals,
|
|
6
|
+
* or any custom scorer that matches the Scorer interface.
|
|
7
|
+
*/
|
|
8
|
+
export type { Score, Scorer } from "./protocol.js";
|
|
9
|
+
export { adaptScore } from "./protocol.js";
|
|
10
|
+
export type { EvalResult, EvalSummary, EvalDatum, EvaluateOptions } from "./evaluate.js";
|
|
11
|
+
export { evaluate, formatEvalSummary } from "./evaluate.js";
|
|
12
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/evals/index.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAEH,YAAY,EAAE,KAAK,EAAE,MAAM,EAAE,MAAM,eAAe,CAAC;AACnD,OAAO,EAAE,UAAU,EAAE,MAAM,eAAe,CAAC;AAC3C,YAAY,EAAE,UAAU,EAAE,WAAW,EAAE,SAAS,EAAE,eAAe,EAAE,MAAM,eAAe,CAAC;AACzF,OAAO,EAAE,QAAQ,EAAE,iBAAiB,EAAE,MAAM,eAAe,CAAC"}
|