@fallom/trace 0.2.26 → 0.2.28
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/chunk-MSI4HGK6.mjs +1051 -0
- package/dist/chunk-TNNLTWRG.mjs +1045 -0
- package/dist/core-5BF6KLNO.mjs +21 -0
- package/dist/core-SL7FAAJN.mjs +21 -0
- package/dist/index.d.mts +119 -3
- package/dist/index.d.ts +119 -3
- package/dist/index.js +156 -14
- package/dist/index.mjs +130 -3
- package/package.json +2 -2
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import {
|
|
2
|
+
DEFAULT_JUDGE_MODEL,
|
|
3
|
+
_apiKey,
|
|
4
|
+
_baseUrl,
|
|
5
|
+
_initialized,
|
|
6
|
+
compareModels,
|
|
7
|
+
evaluate,
|
|
8
|
+
init,
|
|
9
|
+
uploadResultsPublic
|
|
10
|
+
} from "./chunk-MSI4HGK6.mjs";
|
|
11
|
+
import "./chunk-7P6ASYW6.mjs";
|
|
12
|
+
export {
|
|
13
|
+
DEFAULT_JUDGE_MODEL,
|
|
14
|
+
_apiKey,
|
|
15
|
+
_baseUrl,
|
|
16
|
+
_initialized,
|
|
17
|
+
compareModels,
|
|
18
|
+
evaluate,
|
|
19
|
+
init,
|
|
20
|
+
uploadResultsPublic
|
|
21
|
+
};
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import {
|
|
2
|
+
DEFAULT_JUDGE_MODEL,
|
|
3
|
+
_apiKey,
|
|
4
|
+
_baseUrl,
|
|
5
|
+
_initialized,
|
|
6
|
+
compareModels,
|
|
7
|
+
evaluate,
|
|
8
|
+
init,
|
|
9
|
+
uploadResultsPublic
|
|
10
|
+
} from "./chunk-TNNLTWRG.mjs";
|
|
11
|
+
import "./chunk-7P6ASYW6.mjs";
|
|
12
|
+
export {
|
|
13
|
+
DEFAULT_JUDGE_MODEL,
|
|
14
|
+
_apiKey,
|
|
15
|
+
_baseUrl,
|
|
16
|
+
_initialized,
|
|
17
|
+
compareModels,
|
|
18
|
+
evaluate,
|
|
19
|
+
init,
|
|
20
|
+
uploadResultsPublic
|
|
21
|
+
};
|
package/dist/index.d.mts
CHANGED
|
@@ -130,6 +130,92 @@ declare function init$4(options?: {
|
|
|
130
130
|
*/
|
|
131
131
|
declare function shutdown(): Promise<void>;
|
|
132
132
|
|
|
133
|
+
/**
|
|
134
|
+
* FallomSpan - Manual span for custom operations.
|
|
135
|
+
*
|
|
136
|
+
* Use for non-LLM operations like RAG retrieval, preprocessing, tool execution, etc.
|
|
137
|
+
*
|
|
138
|
+
* @example
|
|
139
|
+
* ```typescript
|
|
140
|
+
* const session = fallom.session({ configKey: "my-agent", sessionId });
|
|
141
|
+
*
|
|
142
|
+
* // Create a manual span
|
|
143
|
+
* const span = session.span("rag.retrieve");
|
|
144
|
+
* span.set({ "rag.query": userQuery, "rag.topK": 5 });
|
|
145
|
+
*
|
|
146
|
+
* const docs = await retrieveDocuments(userQuery);
|
|
147
|
+
* span.set({ "rag.documents.count": docs.length });
|
|
148
|
+
*
|
|
149
|
+
* span.end(); // Sends the span
|
|
150
|
+
* ```
|
|
151
|
+
*/
|
|
152
|
+
|
|
153
|
+
interface SpanOptions {
|
|
154
|
+
/** Parent span ID for nested spans */
|
|
155
|
+
parentSpanId?: string;
|
|
156
|
+
/** Trace ID to continue an existing trace */
|
|
157
|
+
traceId?: string;
|
|
158
|
+
/** Span kind (defaults to "custom") */
|
|
159
|
+
kind?: "custom" | "tool" | "retrieval" | "preprocessing" | "postprocessing";
|
|
160
|
+
}
|
|
161
|
+
declare class FallomSpan {
|
|
162
|
+
private name;
|
|
163
|
+
private ctx;
|
|
164
|
+
private attrs;
|
|
165
|
+
private startTime;
|
|
166
|
+
private ended;
|
|
167
|
+
private _status;
|
|
168
|
+
private _errorMessage?;
|
|
169
|
+
readonly spanId: string;
|
|
170
|
+
readonly traceId: string;
|
|
171
|
+
readonly parentSpanId?: string;
|
|
172
|
+
readonly kind: string;
|
|
173
|
+
constructor(name: string, ctx: SessionContext, options?: SpanOptions);
|
|
174
|
+
/**
|
|
175
|
+
* Set attributes on the span.
|
|
176
|
+
* Can be called multiple times - attributes are merged.
|
|
177
|
+
*/
|
|
178
|
+
set(attributes: Record<string, unknown>): this;
|
|
179
|
+
/**
|
|
180
|
+
* Mark the span as errored.
|
|
181
|
+
*/
|
|
182
|
+
setError(error: Error | string): this;
|
|
183
|
+
/**
|
|
184
|
+
* Get span context for creating child spans.
|
|
185
|
+
*/
|
|
186
|
+
context(): {
|
|
187
|
+
traceId: string;
|
|
188
|
+
spanId: string;
|
|
189
|
+
};
|
|
190
|
+
/**
|
|
191
|
+
* End the span and send it.
|
|
192
|
+
* Must be called for the span to be recorded.
|
|
193
|
+
*/
|
|
194
|
+
end(): void;
|
|
195
|
+
}
|
|
196
|
+
/**
|
|
197
|
+
* Wrap a function to automatically create a span around it.
|
|
198
|
+
* Similar to Braintrust's wrapTraced().
|
|
199
|
+
*
|
|
200
|
+
* @example
|
|
201
|
+
* ```typescript
|
|
202
|
+
* const fetchDocuments = wrapTraced(
|
|
203
|
+
* session,
|
|
204
|
+
* "rag.fetch",
|
|
205
|
+
* async (query: string) => {
|
|
206
|
+
* const docs = await vectorDb.search(query);
|
|
207
|
+
* return docs;
|
|
208
|
+
* }
|
|
209
|
+
* );
|
|
210
|
+
*
|
|
211
|
+
* // Function input/output automatically captured
|
|
212
|
+
* const docs = await fetchDocuments("user query");
|
|
213
|
+
* ```
|
|
214
|
+
*/
|
|
215
|
+
declare function wrapTraced<T extends (...args: any[]) => Promise<any>>(session: {
|
|
216
|
+
span: (name: string, options?: SpanOptions) => FallomSpan;
|
|
217
|
+
}, name: string, fn: T, options?: SpanOptions): T;
|
|
218
|
+
|
|
133
219
|
/**
|
|
134
220
|
* FallomSession - Session-scoped tracing for concurrent-safe operations.
|
|
135
221
|
*/
|
|
@@ -162,6 +248,24 @@ declare class FallomSession {
|
|
|
162
248
|
constructor(options: SessionOptions);
|
|
163
249
|
/** Get the session context. */
|
|
164
250
|
getContext(): SessionContext;
|
|
251
|
+
/**
|
|
252
|
+
* Create a manual span for custom operations.
|
|
253
|
+
*
|
|
254
|
+
* Use for non-LLM operations like RAG retrieval, preprocessing, tool execution, etc.
|
|
255
|
+
* The span uses the session's context (configKey, sessionId, etc.).
|
|
256
|
+
*
|
|
257
|
+
* @example
|
|
258
|
+
* ```typescript
|
|
259
|
+
* const span = session.span("rag.retrieve");
|
|
260
|
+
* span.set({ "rag.query": userQuery, "rag.topK": 5 });
|
|
261
|
+
*
|
|
262
|
+
* const docs = await retrieveDocuments(userQuery);
|
|
263
|
+
* span.set({ "rag.documents.count": docs.length });
|
|
264
|
+
*
|
|
265
|
+
* span.end(); // Must call to send the span
|
|
266
|
+
* ```
|
|
267
|
+
*/
|
|
268
|
+
span(name: string, options?: SpanOptions): FallomSpan;
|
|
165
269
|
/**
|
|
166
270
|
* Get model assignment for this session (A/B testing).
|
|
167
271
|
*/
|
|
@@ -233,15 +337,19 @@ declare function session(options: SessionOptions): FallomSession;
|
|
|
233
337
|
|
|
234
338
|
type trace_FallomSession = FallomSession;
|
|
235
339
|
declare const trace_FallomSession: typeof FallomSession;
|
|
340
|
+
type trace_FallomSpan = FallomSpan;
|
|
341
|
+
declare const trace_FallomSpan: typeof FallomSpan;
|
|
236
342
|
type trace_SessionContext = SessionContext;
|
|
237
343
|
type trace_SessionOptions = SessionOptions;
|
|
344
|
+
type trace_SpanOptions = SpanOptions;
|
|
238
345
|
type trace_TraceContext = TraceContext;
|
|
239
346
|
type trace_TraceData = TraceData;
|
|
240
347
|
type trace_WrapAISDKOptions = WrapAISDKOptions;
|
|
241
348
|
declare const trace_session: typeof session;
|
|
242
349
|
declare const trace_shutdown: typeof shutdown;
|
|
350
|
+
declare const trace_wrapTraced: typeof wrapTraced;
|
|
243
351
|
declare namespace trace {
|
|
244
|
-
export { trace_FallomSession as FallomSession, type trace_SessionContext as SessionContext, type trace_SessionOptions as SessionOptions, type trace_TraceContext as TraceContext, type trace_TraceData as TraceData, type trace_WrapAISDKOptions as WrapAISDKOptions, init$4 as init, trace_session as session, trace_shutdown as shutdown };
|
|
352
|
+
export { trace_FallomSession as FallomSession, trace_FallomSpan as FallomSpan, type trace_SessionContext as SessionContext, type trace_SessionOptions as SessionOptions, type trace_SpanOptions as SpanOptions, type trace_TraceContext as TraceContext, type trace_TraceData as TraceData, type trace_WrapAISDKOptions as WrapAISDKOptions, init$4 as init, trace_session as session, trace_shutdown as shutdown, trace_wrapTraced as wrapTraced };
|
|
245
353
|
}
|
|
246
354
|
|
|
247
355
|
/**
|
|
@@ -531,6 +639,12 @@ interface EvaluateOptions {
|
|
|
531
639
|
/** List of metrics to run (built-in or custom). Default: all built-in metrics */
|
|
532
640
|
metrics?: MetricInput[];
|
|
533
641
|
judgeModel?: string;
|
|
642
|
+
/**
|
|
643
|
+
* Context to provide the LLM judge about the product/domain being evaluated.
|
|
644
|
+
* This helps the judge make better evaluations by understanding what features
|
|
645
|
+
* or capabilities are valid (e.g., won't mark valid features as hallucinations).
|
|
646
|
+
*/
|
|
647
|
+
judgeContext?: string;
|
|
534
648
|
name?: string;
|
|
535
649
|
description?: string;
|
|
536
650
|
verbose?: boolean;
|
|
@@ -566,7 +680,7 @@ declare const METRIC_PROMPTS: Record<MetricName, {
|
|
|
566
680
|
/**
|
|
567
681
|
* Build the G-Eval prompt for the LLM judge.
|
|
568
682
|
*/
|
|
569
|
-
declare function buildGEvalPrompt(criteria: string, steps: string[], systemMessage: string | undefined, inputText: string, outputText: string): string;
|
|
683
|
+
declare function buildGEvalPrompt(criteria: string, steps: string[], systemMessage: string | undefined, inputText: string, outputText: string, judgeContext?: string): string;
|
|
570
684
|
/**
|
|
571
685
|
* Result of running G-Eval on a single metric.
|
|
572
686
|
*/
|
|
@@ -600,6 +714,8 @@ interface RunGEvalOptions {
|
|
|
600
714
|
traceSessionId?: string;
|
|
601
715
|
/** Optional customer ID for tracing (e.g., organization ID) */
|
|
602
716
|
traceCustomerId?: string;
|
|
717
|
+
/** Optional context to provide the judge about the product/domain being evaluated */
|
|
718
|
+
judgeContext?: string;
|
|
603
719
|
}
|
|
604
720
|
/**
|
|
605
721
|
* Run G-Eval for a single metric using OpenRouter.
|
|
@@ -1114,4 +1230,4 @@ declare const _default: {
|
|
|
1114
1230
|
session: typeof session;
|
|
1115
1231
|
};
|
|
1116
1232
|
|
|
1117
|
-
export { type CompareModelsOptions, type DatasetItem, type EvalResult, type EvaluateOptions, FallomExporter, type FallomExporterOptions, FallomSession, type GEvalScore, type InitOptions, type MetricName, type PromptResult, type SessionContext, type SessionOptions, buildGEvalPrompt, calculateAggregateScores, clearMastraPrompt, _default as default, detectRegression, evals, init, models, prompts, runGEval, session, setMastraPrompt, setMastraPromptAB, trace };
|
|
1233
|
+
export { type CompareModelsOptions, type DatasetItem, type EvalResult, type EvaluateOptions, FallomExporter, type FallomExporterOptions, FallomSession, FallomSpan, type GEvalScore, type InitOptions, type MetricName, type PromptResult, type SessionContext, type SessionOptions, type SpanOptions, buildGEvalPrompt, calculateAggregateScores, clearMastraPrompt, _default as default, detectRegression, evals, init, models, prompts, runGEval, session, setMastraPrompt, setMastraPromptAB, trace, wrapTraced };
|
package/dist/index.d.ts
CHANGED
|
@@ -130,6 +130,92 @@ declare function init$4(options?: {
|
|
|
130
130
|
*/
|
|
131
131
|
declare function shutdown(): Promise<void>;
|
|
132
132
|
|
|
133
|
+
/**
|
|
134
|
+
* FallomSpan - Manual span for custom operations.
|
|
135
|
+
*
|
|
136
|
+
* Use for non-LLM operations like RAG retrieval, preprocessing, tool execution, etc.
|
|
137
|
+
*
|
|
138
|
+
* @example
|
|
139
|
+
* ```typescript
|
|
140
|
+
* const session = fallom.session({ configKey: "my-agent", sessionId });
|
|
141
|
+
*
|
|
142
|
+
* // Create a manual span
|
|
143
|
+
* const span = session.span("rag.retrieve");
|
|
144
|
+
* span.set({ "rag.query": userQuery, "rag.topK": 5 });
|
|
145
|
+
*
|
|
146
|
+
* const docs = await retrieveDocuments(userQuery);
|
|
147
|
+
* span.set({ "rag.documents.count": docs.length });
|
|
148
|
+
*
|
|
149
|
+
* span.end(); // Sends the span
|
|
150
|
+
* ```
|
|
151
|
+
*/
|
|
152
|
+
|
|
153
|
+
interface SpanOptions {
|
|
154
|
+
/** Parent span ID for nested spans */
|
|
155
|
+
parentSpanId?: string;
|
|
156
|
+
/** Trace ID to continue an existing trace */
|
|
157
|
+
traceId?: string;
|
|
158
|
+
/** Span kind (defaults to "custom") */
|
|
159
|
+
kind?: "custom" | "tool" | "retrieval" | "preprocessing" | "postprocessing";
|
|
160
|
+
}
|
|
161
|
+
declare class FallomSpan {
|
|
162
|
+
private name;
|
|
163
|
+
private ctx;
|
|
164
|
+
private attrs;
|
|
165
|
+
private startTime;
|
|
166
|
+
private ended;
|
|
167
|
+
private _status;
|
|
168
|
+
private _errorMessage?;
|
|
169
|
+
readonly spanId: string;
|
|
170
|
+
readonly traceId: string;
|
|
171
|
+
readonly parentSpanId?: string;
|
|
172
|
+
readonly kind: string;
|
|
173
|
+
constructor(name: string, ctx: SessionContext, options?: SpanOptions);
|
|
174
|
+
/**
|
|
175
|
+
* Set attributes on the span.
|
|
176
|
+
* Can be called multiple times - attributes are merged.
|
|
177
|
+
*/
|
|
178
|
+
set(attributes: Record<string, unknown>): this;
|
|
179
|
+
/**
|
|
180
|
+
* Mark the span as errored.
|
|
181
|
+
*/
|
|
182
|
+
setError(error: Error | string): this;
|
|
183
|
+
/**
|
|
184
|
+
* Get span context for creating child spans.
|
|
185
|
+
*/
|
|
186
|
+
context(): {
|
|
187
|
+
traceId: string;
|
|
188
|
+
spanId: string;
|
|
189
|
+
};
|
|
190
|
+
/**
|
|
191
|
+
* End the span and send it.
|
|
192
|
+
* Must be called for the span to be recorded.
|
|
193
|
+
*/
|
|
194
|
+
end(): void;
|
|
195
|
+
}
|
|
196
|
+
/**
|
|
197
|
+
* Wrap a function to automatically create a span around it.
|
|
198
|
+
* Similar to Braintrust's wrapTraced().
|
|
199
|
+
*
|
|
200
|
+
* @example
|
|
201
|
+
* ```typescript
|
|
202
|
+
* const fetchDocuments = wrapTraced(
|
|
203
|
+
* session,
|
|
204
|
+
* "rag.fetch",
|
|
205
|
+
* async (query: string) => {
|
|
206
|
+
* const docs = await vectorDb.search(query);
|
|
207
|
+
* return docs;
|
|
208
|
+
* }
|
|
209
|
+
* );
|
|
210
|
+
*
|
|
211
|
+
* // Function input/output automatically captured
|
|
212
|
+
* const docs = await fetchDocuments("user query");
|
|
213
|
+
* ```
|
|
214
|
+
*/
|
|
215
|
+
declare function wrapTraced<T extends (...args: any[]) => Promise<any>>(session: {
|
|
216
|
+
span: (name: string, options?: SpanOptions) => FallomSpan;
|
|
217
|
+
}, name: string, fn: T, options?: SpanOptions): T;
|
|
218
|
+
|
|
133
219
|
/**
|
|
134
220
|
* FallomSession - Session-scoped tracing for concurrent-safe operations.
|
|
135
221
|
*/
|
|
@@ -162,6 +248,24 @@ declare class FallomSession {
|
|
|
162
248
|
constructor(options: SessionOptions);
|
|
163
249
|
/** Get the session context. */
|
|
164
250
|
getContext(): SessionContext;
|
|
251
|
+
/**
|
|
252
|
+
* Create a manual span for custom operations.
|
|
253
|
+
*
|
|
254
|
+
* Use for non-LLM operations like RAG retrieval, preprocessing, tool execution, etc.
|
|
255
|
+
* The span uses the session's context (configKey, sessionId, etc.).
|
|
256
|
+
*
|
|
257
|
+
* @example
|
|
258
|
+
* ```typescript
|
|
259
|
+
* const span = session.span("rag.retrieve");
|
|
260
|
+
* span.set({ "rag.query": userQuery, "rag.topK": 5 });
|
|
261
|
+
*
|
|
262
|
+
* const docs = await retrieveDocuments(userQuery);
|
|
263
|
+
* span.set({ "rag.documents.count": docs.length });
|
|
264
|
+
*
|
|
265
|
+
* span.end(); // Must call to send the span
|
|
266
|
+
* ```
|
|
267
|
+
*/
|
|
268
|
+
span(name: string, options?: SpanOptions): FallomSpan;
|
|
165
269
|
/**
|
|
166
270
|
* Get model assignment for this session (A/B testing).
|
|
167
271
|
*/
|
|
@@ -233,15 +337,19 @@ declare function session(options: SessionOptions): FallomSession;
|
|
|
233
337
|
|
|
234
338
|
type trace_FallomSession = FallomSession;
|
|
235
339
|
declare const trace_FallomSession: typeof FallomSession;
|
|
340
|
+
type trace_FallomSpan = FallomSpan;
|
|
341
|
+
declare const trace_FallomSpan: typeof FallomSpan;
|
|
236
342
|
type trace_SessionContext = SessionContext;
|
|
237
343
|
type trace_SessionOptions = SessionOptions;
|
|
344
|
+
type trace_SpanOptions = SpanOptions;
|
|
238
345
|
type trace_TraceContext = TraceContext;
|
|
239
346
|
type trace_TraceData = TraceData;
|
|
240
347
|
type trace_WrapAISDKOptions = WrapAISDKOptions;
|
|
241
348
|
declare const trace_session: typeof session;
|
|
242
349
|
declare const trace_shutdown: typeof shutdown;
|
|
350
|
+
declare const trace_wrapTraced: typeof wrapTraced;
|
|
243
351
|
declare namespace trace {
|
|
244
|
-
export { trace_FallomSession as FallomSession, type trace_SessionContext as SessionContext, type trace_SessionOptions as SessionOptions, type trace_TraceContext as TraceContext, type trace_TraceData as TraceData, type trace_WrapAISDKOptions as WrapAISDKOptions, init$4 as init, trace_session as session, trace_shutdown as shutdown };
|
|
352
|
+
export { trace_FallomSession as FallomSession, trace_FallomSpan as FallomSpan, type trace_SessionContext as SessionContext, type trace_SessionOptions as SessionOptions, type trace_SpanOptions as SpanOptions, type trace_TraceContext as TraceContext, type trace_TraceData as TraceData, type trace_WrapAISDKOptions as WrapAISDKOptions, init$4 as init, trace_session as session, trace_shutdown as shutdown, trace_wrapTraced as wrapTraced };
|
|
245
353
|
}
|
|
246
354
|
|
|
247
355
|
/**
|
|
@@ -531,6 +639,12 @@ interface EvaluateOptions {
|
|
|
531
639
|
/** List of metrics to run (built-in or custom). Default: all built-in metrics */
|
|
532
640
|
metrics?: MetricInput[];
|
|
533
641
|
judgeModel?: string;
|
|
642
|
+
/**
|
|
643
|
+
* Context to provide the LLM judge about the product/domain being evaluated.
|
|
644
|
+
* This helps the judge make better evaluations by understanding what features
|
|
645
|
+
* or capabilities are valid (e.g., won't mark valid features as hallucinations).
|
|
646
|
+
*/
|
|
647
|
+
judgeContext?: string;
|
|
534
648
|
name?: string;
|
|
535
649
|
description?: string;
|
|
536
650
|
verbose?: boolean;
|
|
@@ -566,7 +680,7 @@ declare const METRIC_PROMPTS: Record<MetricName, {
|
|
|
566
680
|
/**
|
|
567
681
|
* Build the G-Eval prompt for the LLM judge.
|
|
568
682
|
*/
|
|
569
|
-
declare function buildGEvalPrompt(criteria: string, steps: string[], systemMessage: string | undefined, inputText: string, outputText: string): string;
|
|
683
|
+
declare function buildGEvalPrompt(criteria: string, steps: string[], systemMessage: string | undefined, inputText: string, outputText: string, judgeContext?: string): string;
|
|
570
684
|
/**
|
|
571
685
|
* Result of running G-Eval on a single metric.
|
|
572
686
|
*/
|
|
@@ -600,6 +714,8 @@ interface RunGEvalOptions {
|
|
|
600
714
|
traceSessionId?: string;
|
|
601
715
|
/** Optional customer ID for tracing (e.g., organization ID) */
|
|
602
716
|
traceCustomerId?: string;
|
|
717
|
+
/** Optional context to provide the judge about the product/domain being evaluated */
|
|
718
|
+
judgeContext?: string;
|
|
603
719
|
}
|
|
604
720
|
/**
|
|
605
721
|
* Run G-Eval for a single metric using OpenRouter.
|
|
@@ -1114,4 +1230,4 @@ declare const _default: {
|
|
|
1114
1230
|
session: typeof session;
|
|
1115
1231
|
};
|
|
1116
1232
|
|
|
1117
|
-
export { type CompareModelsOptions, type DatasetItem, type EvalResult, type EvaluateOptions, FallomExporter, type FallomExporterOptions, FallomSession, type GEvalScore, type InitOptions, type MetricName, type PromptResult, type SessionContext, type SessionOptions, buildGEvalPrompt, calculateAggregateScores, clearMastraPrompt, _default as default, detectRegression, evals, init, models, prompts, runGEval, session, setMastraPrompt, setMastraPromptAB, trace };
|
|
1233
|
+
export { type CompareModelsOptions, type DatasetItem, type EvalResult, type EvaluateOptions, FallomExporter, type FallomExporterOptions, FallomSession, FallomSpan, type GEvalScore, type InitOptions, type MetricName, type PromptResult, type SessionContext, type SessionOptions, type SpanOptions, buildGEvalPrompt, calculateAggregateScores, clearMastraPrompt, _default as default, detectRegression, evals, init, models, prompts, runGEval, session, setMastraPrompt, setMastraPromptAB, trace, wrapTraced };
|
package/dist/index.js
CHANGED
|
@@ -346,10 +346,15 @@ var init_types = __esm({
|
|
|
346
346
|
});
|
|
347
347
|
|
|
348
348
|
// src/evals/prompts.ts
|
|
349
|
-
function buildGEvalPrompt(criteria, steps, systemMessage, inputText, outputText) {
|
|
349
|
+
function buildGEvalPrompt(criteria, steps, systemMessage, inputText, outputText, judgeContext) {
|
|
350
350
|
const stepsText = steps.map((s, i) => `${i + 1}. ${s}`).join("\n");
|
|
351
351
|
return `You are an expert evaluator assessing LLM outputs using the G-Eval methodology.
|
|
352
|
+
${judgeContext ? `
|
|
353
|
+
## Important Context
|
|
354
|
+
The following context provides background information about the product/domain being evaluated. Use this to inform your evaluation - for example, if the context mentions that certain features or capabilities exist, do not mark responses as hallucinations when they reference those features.
|
|
352
355
|
|
|
356
|
+
${judgeContext}
|
|
357
|
+
` : ""}
|
|
353
358
|
## Evaluation Criteria
|
|
354
359
|
${criteria}
|
|
355
360
|
|
|
@@ -388,7 +393,8 @@ async function runGEval(options) {
|
|
|
388
393
|
openrouterKey,
|
|
389
394
|
fallomApiKey,
|
|
390
395
|
traceSessionId,
|
|
391
|
-
traceCustomerId
|
|
396
|
+
traceCustomerId,
|
|
397
|
+
judgeContext
|
|
392
398
|
} = options;
|
|
393
399
|
const apiKey4 = openrouterKey || process.env.OPENROUTER_API_KEY;
|
|
394
400
|
if (!apiKey4) {
|
|
@@ -406,7 +412,8 @@ async function runGEval(options) {
|
|
|
406
412
|
config.steps,
|
|
407
413
|
systemMessage,
|
|
408
414
|
inputText,
|
|
409
|
-
outputText
|
|
415
|
+
outputText,
|
|
416
|
+
judgeContext
|
|
410
417
|
);
|
|
411
418
|
const startTime = Date.now();
|
|
412
419
|
const response = await fetch(
|
|
@@ -982,14 +989,15 @@ function init4(options = {}) {
|
|
|
982
989
|
}
|
|
983
990
|
_initialized = true;
|
|
984
991
|
}
|
|
985
|
-
async function runGEval2(metric, inputText, outputText, systemMessage, judgeModel) {
|
|
992
|
+
async function runGEval2(metric, inputText, outputText, systemMessage, judgeModel, judgeContext) {
|
|
986
993
|
const metricArg = isCustomMetric(metric) ? { name: metric.name, criteria: metric.criteria, steps: metric.steps } : metric;
|
|
987
994
|
return runGEval({
|
|
988
995
|
metric: metricArg,
|
|
989
996
|
inputText,
|
|
990
997
|
outputText,
|
|
991
998
|
systemMessage,
|
|
992
|
-
judgeModel
|
|
999
|
+
judgeModel,
|
|
1000
|
+
judgeContext
|
|
993
1001
|
});
|
|
994
1002
|
}
|
|
995
1003
|
async function resolveDataset(datasetInput) {
|
|
@@ -1040,6 +1048,7 @@ async function evaluate(options) {
|
|
|
1040
1048
|
dataset: datasetInput,
|
|
1041
1049
|
metrics = [...AVAILABLE_METRICS],
|
|
1042
1050
|
judgeModel = DEFAULT_JUDGE_MODEL,
|
|
1051
|
+
judgeContext,
|
|
1043
1052
|
name,
|
|
1044
1053
|
description,
|
|
1045
1054
|
verbose = true,
|
|
@@ -1102,7 +1111,8 @@ async function evaluate(options) {
|
|
|
1102
1111
|
item.input,
|
|
1103
1112
|
item.output,
|
|
1104
1113
|
item.systemMessage,
|
|
1105
|
-
judgeModel
|
|
1114
|
+
judgeModel,
|
|
1115
|
+
judgeContext
|
|
1106
1116
|
);
|
|
1107
1117
|
const key = isCustomMetric(metric) ? metricName : metricName.replace(/_([a-z])/g, (_, c) => c.toUpperCase());
|
|
1108
1118
|
result[key] = score;
|
|
@@ -1133,6 +1143,7 @@ async function compareModels(options) {
|
|
|
1133
1143
|
models,
|
|
1134
1144
|
metrics = [...AVAILABLE_METRICS],
|
|
1135
1145
|
judgeModel = DEFAULT_JUDGE_MODEL,
|
|
1146
|
+
judgeContext,
|
|
1136
1147
|
includeProduction = true,
|
|
1137
1148
|
modelKwargs = {},
|
|
1138
1149
|
name,
|
|
@@ -1150,6 +1161,7 @@ async function compareModels(options) {
|
|
|
1150
1161
|
dataset,
|
|
1151
1162
|
metrics,
|
|
1152
1163
|
judgeModel,
|
|
1164
|
+
judgeContext,
|
|
1153
1165
|
verbose,
|
|
1154
1166
|
_skipUpload: true
|
|
1155
1167
|
});
|
|
@@ -1206,7 +1218,8 @@ async function compareModels(options) {
|
|
|
1206
1218
|
item.input,
|
|
1207
1219
|
output,
|
|
1208
1220
|
item.systemMessage,
|
|
1209
|
-
judgeModel
|
|
1221
|
+
judgeModel,
|
|
1222
|
+
judgeContext
|
|
1210
1223
|
);
|
|
1211
1224
|
const key = isCustomMetric(metric) ? metricName : metricName.replace(/_([a-z])/g, (_, c) => c.toUpperCase());
|
|
1212
1225
|
result[key] = score;
|
|
@@ -1378,6 +1391,7 @@ var index_exports = {};
|
|
|
1378
1391
|
__export(index_exports, {
|
|
1379
1392
|
FallomExporter: () => FallomExporter,
|
|
1380
1393
|
FallomSession: () => FallomSession,
|
|
1394
|
+
FallomSpan: () => FallomSpan,
|
|
1381
1395
|
buildGEvalPrompt: () => buildGEvalPrompt,
|
|
1382
1396
|
calculateAggregateScores: () => calculateAggregateScores,
|
|
1383
1397
|
clearMastraPrompt: () => clearMastraPrompt,
|
|
@@ -1391,7 +1405,8 @@ __export(index_exports, {
|
|
|
1391
1405
|
session: () => session,
|
|
1392
1406
|
setMastraPrompt: () => setMastraPrompt,
|
|
1393
1407
|
setMastraPromptAB: () => setMastraPromptAB,
|
|
1394
|
-
trace: () => trace_exports
|
|
1408
|
+
trace: () => trace_exports,
|
|
1409
|
+
wrapTraced: () => wrapTraced
|
|
1395
1410
|
});
|
|
1396
1411
|
module.exports = __toCommonJS(index_exports);
|
|
1397
1412
|
|
|
@@ -1399,9 +1414,11 @@ module.exports = __toCommonJS(index_exports);
|
|
|
1399
1414
|
var trace_exports = {};
|
|
1400
1415
|
__export(trace_exports, {
|
|
1401
1416
|
FallomSession: () => FallomSession,
|
|
1417
|
+
FallomSpan: () => FallomSpan,
|
|
1402
1418
|
init: () => init,
|
|
1403
1419
|
session: () => session,
|
|
1404
|
-
shutdown: () => shutdown
|
|
1420
|
+
shutdown: () => shutdown,
|
|
1421
|
+
wrapTraced: () => wrapTraced
|
|
1405
1422
|
});
|
|
1406
1423
|
|
|
1407
1424
|
// src/trace/core.ts
|
|
@@ -2187,6 +2204,109 @@ function generateHexId(length) {
|
|
|
2187
2204
|
return Array.from(bytes).map((b) => b.toString(16).padStart(2, "0")).join("");
|
|
2188
2205
|
}
|
|
2189
2206
|
|
|
2207
|
+
// src/trace/span.ts
|
|
2208
|
+
var FallomSpan = class {
|
|
2209
|
+
constructor(name, ctx, options = {}) {
|
|
2210
|
+
this.name = name;
|
|
2211
|
+
this.ctx = ctx;
|
|
2212
|
+
this.attrs = {};
|
|
2213
|
+
this.ended = false;
|
|
2214
|
+
this._status = "OK";
|
|
2215
|
+
this.spanId = generateHexId(16);
|
|
2216
|
+
this.traceId = options.traceId || generateHexId(32);
|
|
2217
|
+
this.parentSpanId = options.parentSpanId;
|
|
2218
|
+
this.kind = options.kind || "custom";
|
|
2219
|
+
this.startTime = Date.now();
|
|
2220
|
+
}
|
|
2221
|
+
/**
|
|
2222
|
+
* Set attributes on the span.
|
|
2223
|
+
* Can be called multiple times - attributes are merged.
|
|
2224
|
+
*/
|
|
2225
|
+
set(attributes) {
|
|
2226
|
+
if (this.ended) {
|
|
2227
|
+
console.warn("[Fallom] Cannot set attributes on ended span");
|
|
2228
|
+
return this;
|
|
2229
|
+
}
|
|
2230
|
+
Object.assign(this.attrs, attributes);
|
|
2231
|
+
return this;
|
|
2232
|
+
}
|
|
2233
|
+
/**
|
|
2234
|
+
* Mark the span as errored.
|
|
2235
|
+
*/
|
|
2236
|
+
setError(error) {
|
|
2237
|
+
this._status = "ERROR";
|
|
2238
|
+
this._errorMessage = error instanceof Error ? error.message : error;
|
|
2239
|
+
return this;
|
|
2240
|
+
}
|
|
2241
|
+
/**
|
|
2242
|
+
* Get span context for creating child spans.
|
|
2243
|
+
*/
|
|
2244
|
+
context() {
|
|
2245
|
+
return {
|
|
2246
|
+
traceId: this.traceId,
|
|
2247
|
+
spanId: this.spanId
|
|
2248
|
+
};
|
|
2249
|
+
}
|
|
2250
|
+
/**
|
|
2251
|
+
* End the span and send it.
|
|
2252
|
+
* Must be called for the span to be recorded.
|
|
2253
|
+
*/
|
|
2254
|
+
end() {
|
|
2255
|
+
if (this.ended) {
|
|
2256
|
+
console.warn("[Fallom] Span already ended");
|
|
2257
|
+
return;
|
|
2258
|
+
}
|
|
2259
|
+
this.ended = true;
|
|
2260
|
+
if (!isInitialized()) {
|
|
2261
|
+
return;
|
|
2262
|
+
}
|
|
2263
|
+
const endTime = Date.now();
|
|
2264
|
+
sendTrace({
|
|
2265
|
+
config_key: this.ctx.configKey,
|
|
2266
|
+
session_id: this.ctx.sessionId,
|
|
2267
|
+
customer_id: this.ctx.customerId,
|
|
2268
|
+
metadata: this.ctx.metadata,
|
|
2269
|
+
tags: this.ctx.tags,
|
|
2270
|
+
trace_id: this.traceId,
|
|
2271
|
+
span_id: this.spanId,
|
|
2272
|
+
parent_span_id: this.parentSpanId,
|
|
2273
|
+
name: this.name,
|
|
2274
|
+
kind: this.kind,
|
|
2275
|
+
start_time: new Date(this.startTime).toISOString(),
|
|
2276
|
+
end_time: new Date(endTime).toISOString(),
|
|
2277
|
+
duration_ms: endTime - this.startTime,
|
|
2278
|
+
status: this._status,
|
|
2279
|
+
error_message: this._errorMessage,
|
|
2280
|
+
attributes: {
|
|
2281
|
+
"fallom.sdk_version": "2",
|
|
2282
|
+
"fallom.span_type": "manual",
|
|
2283
|
+
...this.attrs
|
|
2284
|
+
}
|
|
2285
|
+
}).catch(() => {
|
|
2286
|
+
});
|
|
2287
|
+
}
|
|
2288
|
+
};
|
|
2289
|
+
function wrapTraced(session2, name, fn, options = {}) {
|
|
2290
|
+
return (async (...args) => {
|
|
2291
|
+
const span = session2.span(name, options);
|
|
2292
|
+
if (args.length === 1) {
|
|
2293
|
+
span.set({ input: args[0] });
|
|
2294
|
+
} else if (args.length > 1) {
|
|
2295
|
+
span.set({ input: args });
|
|
2296
|
+
}
|
|
2297
|
+
try {
|
|
2298
|
+
const result = await fn(...args);
|
|
2299
|
+
span.set({ output: result });
|
|
2300
|
+
span.end();
|
|
2301
|
+
return result;
|
|
2302
|
+
} catch (error) {
|
|
2303
|
+
span.setError(error instanceof Error ? error : String(error));
|
|
2304
|
+
span.end();
|
|
2305
|
+
throw error;
|
|
2306
|
+
}
|
|
2307
|
+
});
|
|
2308
|
+
}
|
|
2309
|
+
|
|
2190
2310
|
// src/prompts.ts
|
|
2191
2311
|
var prompts_exports = {};
|
|
2192
2312
|
__export(prompts_exports, {
|
|
@@ -4056,6 +4176,26 @@ var FallomSession = class {
|
|
|
4056
4176
|
getContext() {
|
|
4057
4177
|
return { ...this.ctx };
|
|
4058
4178
|
}
|
|
4179
|
+
/**
|
|
4180
|
+
* Create a manual span for custom operations.
|
|
4181
|
+
*
|
|
4182
|
+
* Use for non-LLM operations like RAG retrieval, preprocessing, tool execution, etc.
|
|
4183
|
+
* The span uses the session's context (configKey, sessionId, etc.).
|
|
4184
|
+
*
|
|
4185
|
+
* @example
|
|
4186
|
+
* ```typescript
|
|
4187
|
+
* const span = session.span("rag.retrieve");
|
|
4188
|
+
* span.set({ "rag.query": userQuery, "rag.topK": 5 });
|
|
4189
|
+
*
|
|
4190
|
+
* const docs = await retrieveDocuments(userQuery);
|
|
4191
|
+
* span.set({ "rag.documents.count": docs.length });
|
|
4192
|
+
*
|
|
4193
|
+
* span.end(); // Must call to send the span
|
|
4194
|
+
* ```
|
|
4195
|
+
*/
|
|
4196
|
+
span(name, options) {
|
|
4197
|
+
return new FallomSpan(name, this.ctx, options);
|
|
4198
|
+
}
|
|
4059
4199
|
/**
|
|
4060
4200
|
* Get model assignment for this session (A/B testing).
|
|
4061
4201
|
*/
|
|
@@ -4294,7 +4434,7 @@ async function init5(options = {}) {
|
|
|
4294
4434
|
}
|
|
4295
4435
|
|
|
4296
4436
|
// src/mastra.ts
|
|
4297
|
-
var
|
|
4437
|
+
var import_core14 = require("@opentelemetry/core");
|
|
4298
4438
|
var promptContext2 = {};
|
|
4299
4439
|
function setMastraPrompt(promptKey, version) {
|
|
4300
4440
|
promptContext2 = {
|
|
@@ -4344,7 +4484,7 @@ var FallomExporter = class {
|
|
|
4344
4484
|
*/
|
|
4345
4485
|
export(spans, resultCallback) {
|
|
4346
4486
|
if (spans.length === 0) {
|
|
4347
|
-
resultCallback({ code:
|
|
4487
|
+
resultCallback({ code: import_core14.ExportResultCode.SUCCESS });
|
|
4348
4488
|
return;
|
|
4349
4489
|
}
|
|
4350
4490
|
this.log(`Exporting ${spans.length} spans...`);
|
|
@@ -4361,11 +4501,11 @@ var FallomExporter = class {
|
|
|
4361
4501
|
}
|
|
4362
4502
|
const exportPromise = this.sendSpans(spans).then(() => {
|
|
4363
4503
|
this.log("Export successful");
|
|
4364
|
-
resultCallback({ code:
|
|
4504
|
+
resultCallback({ code: import_core14.ExportResultCode.SUCCESS });
|
|
4365
4505
|
}).catch((error) => {
|
|
4366
4506
|
console.error("[FallomExporter] Export failed:", error);
|
|
4367
4507
|
resultCallback({
|
|
4368
|
-
code:
|
|
4508
|
+
code: import_core14.ExportResultCode.FAILED,
|
|
4369
4509
|
error: error instanceof Error ? error : new Error(String(error))
|
|
4370
4510
|
});
|
|
4371
4511
|
});
|
|
@@ -4545,6 +4685,7 @@ var index_default = {
|
|
|
4545
4685
|
0 && (module.exports = {
|
|
4546
4686
|
FallomExporter,
|
|
4547
4687
|
FallomSession,
|
|
4688
|
+
FallomSpan,
|
|
4548
4689
|
buildGEvalPrompt,
|
|
4549
4690
|
calculateAggregateScores,
|
|
4550
4691
|
clearMastraPrompt,
|
|
@@ -4557,5 +4698,6 @@ var index_default = {
|
|
|
4557
4698
|
session,
|
|
4558
4699
|
setMastraPrompt,
|
|
4559
4700
|
setMastraPromptAB,
|
|
4560
|
-
trace
|
|
4701
|
+
trace,
|
|
4702
|
+
wrapTraced
|
|
4561
4703
|
});
|