baselineos 0.2.0-beta.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +17 -0
- package/README.md +198 -0
- package/dist/__evals__/runner.d.ts +2 -0
- package/dist/__evals__/runner.js +14687 -0
- package/dist/__evals__/runner.js.map +1 -0
- package/dist/api/server.d.ts +21 -0
- package/dist/api/server.js +1007 -0
- package/dist/api/server.js.map +1 -0
- package/dist/cli/bin.d.ts +1 -0
- package/dist/cli/bin.js +8427 -0
- package/dist/cli/bin.js.map +1 -0
- package/dist/core/agent-bus.d.ts +110 -0
- package/dist/core/agent-bus.js +242 -0
- package/dist/core/agent-bus.js.map +1 -0
- package/dist/core/cache.d.ts +66 -0
- package/dist/core/cache.js +160 -0
- package/dist/core/cache.js.map +1 -0
- package/dist/core/config.d.ts +1002 -0
- package/dist/core/config.js +429 -0
- package/dist/core/config.js.map +1 -0
- package/dist/core/indexer.d.ts +152 -0
- package/dist/core/indexer.js +481 -0
- package/dist/core/indexer.js.map +1 -0
- package/dist/core/llm-tracer.d.ts +2 -0
- package/dist/core/llm-tracer.js +241 -0
- package/dist/core/llm-tracer.js.map +1 -0
- package/dist/core/memory.d.ts +86 -0
- package/dist/core/memory.js +346 -0
- package/dist/core/memory.js.map +1 -0
- package/dist/core/opa-client.d.ts +51 -0
- package/dist/core/opa-client.js +157 -0
- package/dist/core/opa-client.js.map +1 -0
- package/dist/core/opa-policy-gate.d.ts +133 -0
- package/dist/core/opa-policy-gate.js +454 -0
- package/dist/core/opa-policy-gate.js.map +1 -0
- package/dist/core/orchestrator.d.ts +14 -0
- package/dist/core/orchestrator.js +1297 -0
- package/dist/core/orchestrator.js.map +1 -0
- package/dist/core/pii-detector.d.ts +82 -0
- package/dist/core/pii-detector.js +126 -0
- package/dist/core/pii-detector.js.map +1 -0
- package/dist/core/rag-engine.d.ts +121 -0
- package/dist/core/rag-engine.js +504 -0
- package/dist/core/rag-engine.js.map +1 -0
- package/dist/core/task-queue.d.ts +69 -0
- package/dist/core/task-queue.js +124 -0
- package/dist/core/task-queue.js.map +1 -0
- package/dist/core/telemetry.d.ts +56 -0
- package/dist/core/telemetry.js +94 -0
- package/dist/core/telemetry.js.map +1 -0
- package/dist/core/types.d.ts +328 -0
- package/dist/core/types.js +24 -0
- package/dist/core/types.js.map +1 -0
- package/dist/index.d.ts +21 -0
- package/dist/index.js +12444 -0
- package/dist/index.js.map +1 -0
- package/dist/llm-tracer-CIIujuO-.d.ts +493 -0
- package/dist/mcp/server.d.ts +2651 -0
- package/dist/mcp/server.js +676 -0
- package/dist/mcp/server.js.map +1 -0
- package/dist/orchestrator-DF89k_AK.d.ts +506 -0
- package/package.json +157 -0
- package/templates/README.md +7 -0
- package/templates/baseline.config.ts +207 -0
|
@@ -0,0 +1,493 @@
|
|
|
1
|
+
import { AgentBus } from './core/agent-bus.js';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* ModelVersionRegistry — SIGNAL-048
|
|
5
|
+
*
|
|
6
|
+
* Tracks which model version handled each task execution. Provides a
|
|
7
|
+
* queryable registry of deployed model versions, their metadata, and
|
|
8
|
+
* per-task attribution for audit and regression analysis.
|
|
9
|
+
*
|
|
10
|
+
* Capabilities:
|
|
11
|
+
* - Register model versions with metadata (provider, family, deployment date)
|
|
12
|
+
* - Set the active version per provider
|
|
13
|
+
* - Record per-task model attribution
|
|
14
|
+
* - List all versions / versions by provider
|
|
15
|
+
* - Emit governance:model-version-changed on active version change
|
|
16
|
+
*
|
|
17
|
+
* Usage:
|
|
18
|
+
* const registry = new ModelVersionRegistry({ bus });
|
|
19
|
+
* registry.register('claude-sonnet-4-6', { provider: 'anthropic', family: 'claude-4' });
|
|
20
|
+
* registry.setActive('anthropic', 'claude-sonnet-4-6');
|
|
21
|
+
* registry.recordTaskModel('task-123', 'claude-sonnet-4-6');
|
|
22
|
+
* registry.getForTask('task-123'); // → 'claude-sonnet-4-6'
|
|
23
|
+
*
|
|
24
|
+
* @license Apache-2.0
|
|
25
|
+
*/
|
|
26
|
+
|
|
27
|
+
/**
 * Metadata record describing one registered model version.
 */
interface ModelVersionEntry {
  /** Identifier of the model version (e.g. 'claude-sonnet-4-6'). */
  modelId: string;
  /** Provider key this version belongs to (e.g. 'anthropic'). */
  provider: string;
  /** Optional model family label (e.g. 'claude-4'). */
  family?: string;
  /** ISO date string when this version was first registered */
  registeredAt: string;
  /** ISO date string when this version was last set as active */
  activatedAt?: string;
  /** Arbitrary metadata: context window, pricing tier, capabilities, etc. */
  metadata?: Record<string, unknown>;
}
|
|
38
|
+
/**
 * Constructor options for ModelVersionRegistry.
 */
interface ModelVersionRegistryConfig {
  /**
   * Optional AgentBus. Per the setActive() documentation, active-version
   * changes are published on it as governance:model-version-changed.
   */
  bus?: AgentBus;
}
|
|
41
|
+
/**
 * Registry of model versions with a per-provider "active" selection and
 * per-task model attribution.
 *
 * Ambient declaration: no implementation is present in this file.
 */
declare class ModelVersionRegistry {
  /**
   * Registered versions.
   * NOTE(review): the type is elided in this declaration; presumably keyed
   * by modelId — confirm against the implementation.
   */
  private readonly versions;
  /** provider → active modelId */
  private readonly activeVersions;
  /** taskId → modelId */
  private readonly taskModels;
  /** Presumably the bus supplied through the constructor config — confirm. */
  private readonly bus?;
  /**
   * @param config Optional settings; may be omitted entirely.
   */
  constructor(config?: ModelVersionRegistryConfig);
  /**
   * Register a model version. Idempotent — re-registering updates metadata.
   *
   * @param modelId Identifier of the version to register.
   * @param meta    Entry fields other than `modelId` and `registeredAt`;
   *                `provider` is therefore required.
   */
  register(modelId: string, meta: Omit<ModelVersionEntry, 'modelId' | 'registeredAt'>): void;
  /**
   * Set the active model version for a provider.
   * Publishes governance:model-version-changed on the bus.
   *
   * NOTE(review): behaviour for a modelId that was never registered is not
   * visible from this declaration — confirm.
   *
   * @param provider Provider key (e.g. 'anthropic').
   * @param modelId  Version to mark active for that provider.
   */
  setActive(provider: string, modelId: string): void;
  /**
   * Get the currently active model version for a provider.
   * Returns undefined if none has been set.
   *
   * @param provider Provider key to look up.
   * @returns The active modelId, or undefined.
   */
  getActive(provider: string): string | undefined;
  /**
   * Record which model version was used for a given task.
   *
   * @param taskId  Task identifier.
   * @param modelId Model version that handled the task.
   */
  recordTaskModel(taskId: string, modelId: string): void;
  /**
   * Return the model version used for a given task.
   * Returns undefined if no attribution has been recorded.
   *
   * @param taskId Task identifier.
   * @returns The recorded modelId, or undefined.
   */
  getForTask(taskId: string): string | undefined;
  /**
   * Return entry for a specific model version, or undefined if unknown.
   *
   * @param modelId Identifier of the version to look up.
   */
  get(modelId: string): ModelVersionEntry | undefined;
  /**
   * List all registered model versions, optionally filtered by provider.
   *
   * @param provider When given, only versions of this provider are returned.
   */
  listVersions(provider?: string): ModelVersionEntry[];
  /**
   * List all providers that have an active model version set.
   *
   * @returns One `{ provider, modelId }` pair per provider with an active version.
   */
  listActiveProviders(): Array<{
    provider: string;
    modelId: string;
  }>;
  /**
   * Return a summary of per-model task counts (how many tasks used each model).
   *
   * @returns Map of modelId → number of recorded tasks.
   */
  getTaskCounts(): Map<string, number>;
  /**
   * Remove all task attribution records (e.g., on rotation period boundary).
   */
  clearTaskRecords(): void;
}
|
|
96
|
+
|
|
97
|
+
/**
 * One ground-truth evaluation case: an input, the output expected for it,
 * and the criteria it is judged against.
 */
interface GroundTruthCase {
  /** Unique identifier for this eval case */
  id: string;
  /** Human-readable description */
  description: string;
  /** Simulated input text / task description */
  input: string;
  /** Expected model output (verbatim or representative) */
  expectedOutput: string;
  /** Acceptance criteria labels for this case */
  criteria: string[];
  /** Expected token range [min, max] for cost SLO check */
  expectedTokenRange?: [number, number];
  /** Whether this is a benign (non-refusal) request */
  benign?: boolean;
}
|
|
113
|
+
/**
 * Input handed to a ProdEvalCheck: the case under evaluation plus the
 * output to judge.
 */
interface ProdEvalCheckContext {
  /** The ground-truth case being checked. */
  case: GroundTruthCase;
  /** Simulated output — either expectedOutput or a probe value */
  output: string;
}
|
|
118
|
+
/**
 * Outcome of one check applied to one case.
 */
interface ProdEvalCheckResult {
  /** Identifier of the check that produced this result. */
  checkId: string;
  /** Identifier of the case the check ran against. */
  caseId: string;
  /** Whether the check passed. */
  passed: boolean;
  /** Optional explanation; presumably populated on failure — TODO confirm. */
  reason?: string;
}
|
|
124
|
+
/**
 * A production eval check: receives the context for a single case and
 * resolves to zero or more check results.
 */
type ProdEvalCheck = (
  ctx: ProdEvalCheckContext,
) => Promise<ProdEvalCheckResult[]>;
|
|
125
|
+
/**
 * Aggregate report returned by ProductionEvalPipeline.run().
 */
interface ProdEvalReport {
  /** Identifier of this eval run. */
  runId: string;
  /** Timestamp of the run — presumably an ISO string, as on VersionEvalResult; TODO confirm. */
  timestamp: string;
  /** Number of cases evaluated. */
  totalCases: number;
  /** Number of check results produced across all cases. */
  totalChecks: number;
  /** Count of passing check results — presumably `passed + failed === totalChecks`; TODO confirm. */
  passed: number;
  /** Count of failing check results. */
  failed: number;
  /** Overall score — presumably 0–100 like VersionEvalResult.score; TODO confirm. */
  score: number;
  /** Individual check results. */
  results: ProdEvalCheckResult[];
  /** Duration of the run in milliseconds. */
  durationMs: number;
}
|
|
136
|
+
/**
 * Per-model evaluation result stored in prod-eval-history.json (SIGNAL-053).
 */
interface VersionEvalResult {
  /** Unique run identifier */
  runId: string;
  /** Model ID that was evaluated (e.g. 'claude-sonnet-4-6') */
  modelId: string;
  /** ISO timestamp of the eval run */
  timestamp: string;
  /** Overall score 0–100 */
  score: number;
  /** Fraction of checks that passed (0–1) */
  passRate: number;
  /** Check IDs that failed */
  failedChecks: string[];
  /** Total number of cases evaluated */
  totalCases: number;
}
|
|
155
|
+
/**
 * Result of comparing two VersionEvalResults (SIGNAL-053).
 *
 * Throughout, `a` is the baseline and `b` is the candidate.
 */
interface VersionComparison {
  /** Score delta: b.score − a.score (positive = b is better) */
  deltaScore: number;
  /** PassRate delta: b.passRate − a.passRate */
  deltaPassRate: number;
  /** Checks that failed in b but not in a (new regressions) */
  newRegressions: string[];
  /** Checks that failed in a but not in b (improvements) */
  improvements: string[];
  /** True if b is strictly better than a */
  improved: boolean;
  /**
   * Promotion recommendation:
   *   'promote'  — b is measurably better with no new regressions
   *   'hold'     — marginal improvement or insufficient delta
   *   'rollback' — b is worse than a
   */
  recommend: 'promote' | 'hold' | 'rollback';
  /**
   * 'high' when there are ≥ 10 cases; 'low' when there are fewer, because a
   * small case count may produce noisy results.
   */
  confidence: 'high' | 'low';
}
|
|
181
|
+
/**
 * Constructor options for ProductionEvalPipeline. Every field is optional.
 */
interface ProductionEvalPipelineConfig {
  /** Optional AgentBus; the pipeline's core run is documented as publishing a bus event. */
  bus?: AgentBus;
  /** Path to ground-truth JSON dataset. Default: src/data/prod-eval-dataset.json */
  datasetPath?: string;
  /** Path to write the report JSON. Default: packages/baselineos/prod-eval-report.json */
  reportPath?: string;
  /** Path to the eval history JSON file (SIGNAL-053). Default: src/data/prod-eval-history.json */
  historyPath?: string;
  /**
   * Minimum score delta required to recommend promotion (default: 3).
   * deltaScore must exceed this threshold for 'promote' recommendation.
   *
   * NOTE(review): "minimum" and "must exceed" disagree on whether a delta
   * exactly equal to the threshold promotes — confirm against the
   * implementation.
   */
  promotionThreshold?: number;
  /**
   * ModelVersionRegistry for auto model attribution (SIGNAL-056).
   * When set, every run() call resolves the active model and appends a
   * VersionEvalResult to history automatically — no need to call runForModel().
   * Provider defaults to 'anthropic' unless modelAttributionProvider is set.
   */
  modelVersionRegistry?: ModelVersionRegistry;
  /** Provider key used to look up the active model. Default: 'anthropic' */
  modelAttributionProvider?: string;
}
|
|
204
|
+
/**
 * Runs registered checks against ground-truth cases, produces a
 * ProdEvalReport, and keeps a per-model eval history for version comparison.
 *
 * Ambient declaration: no implementation is present in this file.
 */
declare class ProductionEvalPipeline {
  /** Registered check functions (type elided in this declaration). */
  private readonly checks;
  /** Presumably the config passed to the constructor — confirm. */
  private readonly config;
  /** Presumably the bus supplied through the constructor config — confirm. */
  private readonly bus?;
  /** Presumably the resolved promotionThreshold (documented default: 3) — confirm. */
  private readonly promotionThreshold;
  /**
   * @param config Optional settings; may be omitted entirely.
   */
  constructor(config?: ProductionEvalPipelineConfig);
  /**
   * Register a custom check function.
   *
   * @param check Check to add.
   * @returns This pipeline, so calls can be chained.
   */
  addCheck(check: ProdEvalCheck): this;
  /**
   * Register all five built-in production checks.
   *
   * @returns This pipeline, so calls can be chained.
   */
  addBuiltinChecks(): this;
  /**
   * Load the ground-truth dataset from disk.
   *
   * Synchronous: the cases are returned directly, not as a Promise.
   */
  loadDataset(): GroundTruthCase[];
  /**
   * Run all checks against all ground-truth cases.
   * Returns a ProdEvalReport.
   *
   * When modelVersionRegistry is configured (SIGNAL-056), automatically
   * resolves the active model and appends a VersionEvalResult to history.
   *
   * @param cases Cases to evaluate. NOTE(review): when omitted, presumably
   *              the on-disk dataset is loaded — confirm.
   */
  run(cases?: GroundTruthCase[]): Promise<ProdEvalReport>;
  /**
   * Run the eval suite and record the result under a specific modelId.
   * Appends to prod-eval-history.json for trend tracking.
   *
   * Uses _runCore() directly to avoid double-appending when modelVersionRegistry
   * is also configured.
   *
   * @param modelId Model ID to attribute the run to.
   * @param cases   Cases to evaluate; optional, as for run().
   */
  runForModel(modelId: string, cases?: GroundTruthCase[]): Promise<VersionEvalResult>;
  /**
   * Core eval execution — runs checks, persists report, publishes bus event.
   * Called by both run() and runForModel() to avoid code duplication.
   */
  private _runCore;
  /**
   * Compare two VersionEvalResults and return a promotion recommendation.
   * `a` is the baseline (current production); `b` is the candidate.
   *
   * @param a Baseline result.
   * @param b Candidate result.
   */
  compareVersions(a: VersionEvalResult, b: VersionEvalResult): VersionComparison;
  /**
   * Append a VersionEvalResult to the eval history file.
   *
   * @param result Result to append.
   */
  appendHistory(result: VersionEvalResult): void;
  /**
   * Return all VersionEvalResults, optionally filtered by modelId.
   *
   * @param modelId When given, only results for this model are returned.
   */
  getHistory(modelId?: string): VersionEvalResult[];
  /**
   * Return the rolling average eval score and delta from windowSize entries ago.
   * Returns undefined when insufficient history exists.
   *
   * @param modelId    Model whose history is analysed.
   * @param windowSize Size of the rolling window. NOTE(review): the default
   *                   is not visible from this declaration — confirm.
   * @returns `{ current, rollingAvg, delta }`, or undefined.
   */
  getQualityTrend(modelId: string, windowSize?: number): {
    current: number;
    rollingAvg: number;
    delta: number;
  } | undefined;
  /** Presumably resolves the history file location (see historyPath) — confirm. */
  private _historyPath;
}
|
|
265
|
+
|
|
266
|
+
/**
|
|
267
|
+
* TraceCurator — SIGNAL-047
|
|
268
|
+
*
|
|
269
|
+
* Converts sampled production LLM traces into GroundTruthCase entries
|
|
270
|
+
* that feed the ProductionEvalPipeline dataset. Enables continuous
|
|
271
|
+
* ground-truth expansion from real inference without manual labelling.
|
|
272
|
+
*
|
|
273
|
+
* Architecture:
|
|
274
|
+
* 1. Accept trace records (from Langfuse callbacks or a local trace buffer)
|
|
275
|
+
* 2. Apply sampling strategy (reservoir / recency / confidence-weighted)
|
|
276
|
+
* 3. Score and filter traces using configurable quality criteria
|
|
277
|
+
* 4. Append accepted cases to the ground-truth dataset on disk
|
|
278
|
+
* 5. Publish governance:trace-curated on AgentBus
|
|
279
|
+
*
|
|
280
|
+
* Sampling strategies:
|
|
281
|
+
* reservoir — uniform random sample, bounded by maxDatasetSize
|
|
282
|
+
* recency — keeps the N most recent accepted traces
|
|
283
|
+
* confidence — preferentially retains high-confidence traces
|
|
284
|
+
*
|
|
285
|
+
* Usage:
|
|
286
|
+
* const curator = new TraceCurator({ datasetPath, bus });
|
|
287
|
+
* curator.ingest(traceRecord); // real-time ingestion
|
|
288
|
+
* const result = await curator.flush(); // write accepted cases to disk
|
|
289
|
+
*
|
|
290
|
+
* @license Apache-2.0
|
|
291
|
+
*/
|
|
292
|
+
|
|
293
|
+
/**
 * How TraceCurator samples accepted traces:
 *   'reservoir'  — uniform random sample, bounded by maxDatasetSize
 *   'recency'    — keeps the N most recent accepted traces
 *   'confidence' — preferentially retains high-confidence traces
 */
type SamplingStrategy =
  | 'reservoir'
  | 'recency'
  | 'confidence';
|
|
294
|
+
/**
 * A single production LLM trace offered to TraceCurator for curation.
 */
interface TraceRecord {
  /** Unique trace identifier (task ID in Langfuse) */
  traceId: string;
  /** Human-readable task title */
  title: string;
  /** The input prompt / task description sent to the model */
  input: string;
  /** The model's final output for this trace */
  output: string;
  /** Model confidence signal (0–1). Use self-verify score when available. */
  confidence: number;
  /** Whether this was a benign (non-refusal) request */
  benign?: boolean;
  /** Approximate token count for cost SLO */
  tokens?: number;
  /** ISO timestamp of the trace */
  timestamp: string;
  /** Whether the trace passed self-verification */
  verified?: boolean;
}
|
|
314
|
+
/**
 * Summary returned by TraceCurator.flush().
 */
interface CurationResult {
  /** Number of traces taken in — presumably the buffer size at flush time; TODO confirm. */
  ingested: number;
  /** Number of traces accepted. */
  accepted: number;
  /** Number of traces rejected. */
  rejected: number;
  /** Dataset size — presumably the on-disk dataset after this flush; TODO confirm. */
  datasetSize: number;
  /** Ground-truth cases newly produced by this flush. */
  newCases: GroundTruthCase[];
}
|
|
321
|
+
/**
 * Constructor options for TraceCurator. Every field is optional.
 */
interface TraceCuratorConfig {
  /** Optional AgentBus; the curator is documented as publishing governance:trace-curated on it. */
  bus?: AgentBus;
  /** Path to ground-truth JSON dataset. Default: src/data/prod-eval-dataset.json */
  datasetPath?: string;
  /** Sampling strategy. Default: reservoir */
  strategy?: SamplingStrategy;
  /** Maximum dataset size (oldest entries pruned when exceeded). Default: 500 */
  maxDatasetSize?: number;
  /** Minimum confidence to accept a trace. Default: 0.7 */
  minConfidence?: number;
  /** Only accept traces where self-verify passed. Default: false */
  requireVerified?: boolean;
  /** Minimum input length in characters. Default: 20 */
  minInputLength?: number;
}
|
|
336
|
+
/**
 * Buffers production traces, filters and samples them, and appends the
 * accepted ones to the ground-truth dataset as GroundTruthCase entries.
 *
 * Ambient declaration: no implementation is present in this file.
 */
declare class TraceCurator {
  /** Presumably the bus supplied through the constructor config — confirm. */
  private readonly bus?;
  /** Resolved setting; see TraceCuratorConfig.datasetPath. */
  private readonly datasetPath;
  /** Resolved setting; see TraceCuratorConfig.strategy. */
  private readonly strategy;
  /** Resolved setting; see TraceCuratorConfig.maxDatasetSize. */
  private readonly maxDatasetSize;
  /** Resolved setting; see TraceCuratorConfig.minConfidence. */
  private readonly minConfidence;
  /** Resolved setting; see TraceCuratorConfig.requireVerified. */
  private readonly requireVerified;
  /** Resolved setting; see TraceCuratorConfig.minInputLength. */
  private readonly minInputLength;
  /** In-memory buffer of traces ingested since last flush */
  private readonly buffer;
  /**
   * @param config Optional settings; may be omitted entirely.
   */
  constructor(config?: TraceCuratorConfig);
  /**
   * Ingest a trace record into the buffer.
   * Call flush() to persist accepted cases to disk.
   *
   * @param trace Trace to buffer.
   */
  ingest(trace: TraceRecord): void;
  /**
   * Ingest multiple traces at once.
   *
   * @param traces Traces to buffer.
   */
  ingestBatch(traces: TraceRecord[]): void;
  /**
   * Apply the quality filters to the given traces.
   * Returns accepted TraceRecords without writing to disk.
   *
   * NOTE(review): per the signature this operates on the `traces` argument;
   * confirm whether the internal buffer is consulted as well.
   *
   * @param traces Candidate traces.
   * @returns The traces that passed the filters.
   */
  filter(traces: TraceRecord[]): TraceRecord[];
  /**
   * Flush buffered traces to the ground-truth dataset.
   * Applies sampling strategy, deduplicates by traceId, and respects maxDatasetSize.
   *
   * @returns Counts for this flush plus the newly created cases.
   */
  flush(): Promise<CurationResult>;
  /** Return the current buffer without flushing. */
  peekBuffer(): TraceRecord[];
  /** Clear the buffer without writing to disk. */
  clearBuffer(): void;
  /** Return the number of cases currently in the on-disk dataset. */
  getDatasetSize(): number;
  /**
   * Export the curated dataset to a fine-tuning JSONL file.
   *
   * Each line is a JSON object in the messages format expected by the
   * target API. PII-free by construction — upstream LlmTracer masking
   * ensures no PII reaches the dataset.
   *
   * NOTE(review): that guarantee relies on traces arriving through an
   * LlmTracer with enablePiiMasking left on (it is configurable), and
   * ingest()/ingestBatch() also accept traces directly — confirm that
   * callers cannot bypass masking.
   *
   * Formats:
   *   'openai'    — { "messages": [system?, user, assistant] }
   *   'anthropic' — { "system": "...", "messages": [user, assistant] }
   *
   * @param format   Target fine-tuning API format
   * @param outputPath  Absolute path to write the .jsonl file
   * @param options.systemPrompt  Optional system prompt to include per example
   * @param options.minExamples   Minimum dataset size before warning (default 10)
   * @returns Number of examples exported and skipped, and the output path.
   */
  exportFineTuneDataset(format: 'openai' | 'anthropic', outputPath: string, options?: {
    systemPrompt?: string;
    minExamples?: number;
  }): {
    exported: number;
    skipped: number;
    path: string;
  };
  /** Presumably converts a TraceRecord into a GroundTruthCase — confirm. */
  private toCuratedCase;
  /** Presumably reads the dataset at datasetPath — confirm. */
  private loadDataset;
  /** Presumably applies the configured SamplingStrategy — confirm. */
  private applyStrategy;
}
|
|
400
|
+
|
|
401
|
+
/**
|
|
402
|
+
* LLM Tracer — SIGNAL-014
|
|
403
|
+
*
|
|
404
|
+
* Logs every LLM call made by AnthropicEngine to Langfuse for prompt-level
|
|
405
|
+
* observability. Captures model, token usage, latency, input/output, and
|
|
406
|
+
* task metadata. Traces are keyed by task.id, so a multi-step task (execute
|
|
407
|
+
* → self-verify → review) produces one Langfuse trace with multiple
|
|
408
|
+
* generation spans.
|
|
409
|
+
*
|
|
410
|
+
* Self-hosted Langfuse:
|
|
411
|
+
* docker compose -f docker/docker-compose.monitoring.yml up -d
|
|
412
|
+
* → http://localhost:3001 (admin@baselineos.dev / baseline)
|
|
413
|
+
*
|
|
414
|
+
* Configuration (env or constructor):
|
|
415
|
+
* LANGFUSE_PUBLIC_KEY — project public key (default: baseline-public-key)
|
|
416
|
+
* LANGFUSE_SECRET_KEY — project secret key (default: baseline-secret-key)
|
|
417
|
+
* LANGFUSE_BASE_URL — Langfuse server URL (default: http://localhost:3001)
|
|
418
|
+
*
|
|
419
|
+
* The tracer is fail-safe: any Langfuse error is swallowed silently so it
|
|
420
|
+
* can never interrupt task execution.
|
|
421
|
+
*
|
|
422
|
+
* @license Apache-2.0
|
|
423
|
+
*/
|
|
424
|
+
|
|
425
|
+
/**
 * Constructor options for LlmTracer. Every field is optional.
 *
 * NOTE(review): the documented key defaults are fixed placeholder values;
 * presumably real deployments should override them — confirm.
 */
interface LlmTracerConfig {
  /** Langfuse project public key. Default: LANGFUSE_PUBLIC_KEY env or 'baseline-public-key' */
  publicKey?: string;
  /** Langfuse project secret key. Default: LANGFUSE_SECRET_KEY env or 'baseline-secret-key' */
  secretKey?: string;
  /** Langfuse server base URL. Default: LANGFUSE_BASE_URL env or http://localhost:3001 */
  baseUrl?: string;
  /** Flush batch size (default: 15) */
  flushAt?: number;
  /** Flush interval in ms (default: 30_000) */
  flushInterval?: number;
  /**
   * Scan and redact PII from system prompts, input messages, and output
   * before sending to Langfuse. Default: true (SIGNAL-037).
   */
  enablePiiMasking?: boolean;
  /**
   * TraceCurator to ingest sampled traces into the ground-truth dataset.
   * When provided, each logGeneration() call feeds a TraceRecord into the
   * curator buffer for downstream curation and eval dataset expansion (SIGNAL-049).
   */
  curator?: TraceCurator;
}
|
|
448
|
+
/**
 * Details of a single LLM generation passed to LlmTracer.logGeneration().
 */
interface LlmGenerationOptions {
  /** Span name within the trace, e.g. 'execute', 'self-verify', 'review', 'agent-loop:3' */
  name: string;
  /** Model identifier, e.g. 'claude-sonnet-4-6' */
  model: string;
  /** System prompt sent to the model */
  systemPrompt?: string;
  /** User + assistant turns passed to the model */
  inputMessages: Array<{
    role: string;
    content: unknown;
  }>;
  /** Raw text output from the model */
  output: string;
  /** Token count of the input. */
  inputTokens: number;
  /** Token count of the output. */
  outputTokens: number;
  /** When the generation started. */
  startTime: Date;
  /** When the generation ended. */
  endTime: Date;
  /** Arbitrary metadata attached to the generation (taskId, agentId, etc.) */
  metadata?: Record<string, unknown>;
}
|
|
469
|
+
/**
 * Logs LLM generations to Langfuse, grouping them into one trace per traceId.
 *
 * Ambient declaration: no implementation is present in this file.
 */
declare class LlmTracer {
  /** Langfuse client handle (type elided in this declaration). */
  private readonly langfuse;
  /** PII scanner; per the maskPii docs it is null when masking is disabled. */
  private readonly piiScanner;
  /** Presumably the TraceCurator supplied through the constructor config — confirm. */
  private readonly curator;
  /**
   * @param config Optional settings; may be omitted entirely.
   */
  constructor(config?: LlmTracerConfig);
  /** Redact PII from a string if masking is enabled. No-op when scanner is null. */
  private maskPii;
  /**
   * Log a single LLM generation to Langfuse.
   *
   * Multiple calls with the same `traceId` accumulate under one trace,
   * so all generations for a task are grouped together automatically.
   *
   * @param traceId   task.id — the top-level trace identifier
   * @param traceName task.title — human-readable trace label in the UI
   * @param options   generation details
   */
  logGeneration(traceId: string, traceName: string, options: LlmGenerationOptions): void;
  /** Flush pending spans. Call on graceful shutdown. */
  flush(): Promise<void>;
  /**
   * Shutdown Langfuse client (flushes pending spans).
   *
   * NOTE(review): returns void, so callers cannot await completion of that
   * flush; presumably `await flush()` first when delivery must be
   * guaranteed — confirm.
   */
  shutdown(): void;
}
|
|
492
|
+
|
|
493
|
+
// Public surface of this declaration chunk. The short aliases look
// bundler-generated (presumably re-exported under their real names by the
// package entry points — TODO confirm); do not rename them by hand.
export {
  type CurationResult as C,
  type GroundTruthCase as G,
  LlmTracer as L,
  ModelVersionRegistry as M,
  ProductionEvalPipeline as P,
  type SamplingStrategy as S,
  TraceCurator as T,
  type VersionEvalResult as V,
  type VersionComparison as a,
  type LlmGenerationOptions as b,
  type LlmTracerConfig as c,
  type ModelVersionEntry as d,
  type ModelVersionRegistryConfig as e,
  type ProdEvalCheck as f,
  type ProdEvalCheckContext as g,
  type ProdEvalCheckResult as h,
  type ProdEvalReport as i,
  type ProductionEvalPipelineConfig as j,
  type TraceCuratorConfig as k,
  type TraceRecord as l,
};
|