@artemiskit/core 0.2.0 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +76 -0
- package/dist/adapters/types.d.ts +5 -0
- package/dist/adapters/types.d.ts.map +1 -1
- package/dist/artifacts/manifest.d.ts.map +1 -1
- package/dist/artifacts/types.d.ts +20 -0
- package/dist/artifacts/types.d.ts.map +1 -1
- package/dist/cost/pricing.d.ts +2 -1
- package/dist/cost/pricing.d.ts.map +1 -1
- package/dist/evaluators/llm-grader.d.ts.map +1 -1
- package/dist/index.js +468 -205
- package/dist/scenario/schema.d.ts +8 -0
- package/dist/scenario/schema.d.ts.map +1 -1
- package/dist/storage/local.d.ts +44 -2
- package/dist/storage/local.d.ts.map +1 -1
- package/dist/storage/types.d.ts +66 -0
- package/dist/storage/types.d.ts.map +1 -1
- package/package.json +1 -1
- package/src/adapters/types.ts +5 -0
- package/src/artifacts/manifest.ts +24 -2
- package/src/artifacts/types.ts +21 -0
- package/src/cost/pricing.ts +242 -65
- package/src/evaluators/llm-grader.ts +45 -13
- package/src/evaluators/similarity.test.ts +4 -3
- package/src/scenario/schema.ts +4 -0
- package/src/storage/local.test.ts +243 -0
- package/src/storage/local.ts +186 -4
- package/src/storage/types.ts +77 -0
- package/dist/events/emitter.d.ts +0 -111
- package/dist/events/emitter.d.ts.map +0 -1
- package/dist/events/index.d.ts +0 -6
- package/dist/events/index.d.ts.map +0 -1
- package/dist/events/types.d.ts +0 -177
- package/dist/events/types.d.ts.map +0 -1
package/dist/index.js
CHANGED
|
@@ -11564,8 +11564,7 @@ class LLMGraderEvaluator {
|
|
|
11564
11564
|
const result = await context.client.generate({
|
|
11565
11565
|
prompt,
|
|
11566
11566
|
model: expected.model,
|
|
11567
|
-
|
|
11568
|
-
maxTokens: 200
|
|
11567
|
+
maxTokens: 1000
|
|
11569
11568
|
});
|
|
11570
11569
|
const parsed = this.parseGraderResponse(result.text);
|
|
11571
11570
|
const passed = parsed.score >= expected.threshold;
|
|
@@ -11590,9 +11589,17 @@ class LLMGraderEvaluator {
|
|
|
11590
11589
|
}
|
|
11591
11590
|
}
|
|
11592
11591
|
parseGraderResponse(text) {
|
|
11593
|
-
const
|
|
11592
|
+
const cleanedText = text.replace(/```json\s*/gi, "").replace(/```\s*/g, "").trim();
|
|
11593
|
+
const jsonMatch = cleanedText.match(/\{[\s\S]*?\}/);
|
|
11594
11594
|
if (!jsonMatch) {
|
|
11595
|
-
|
|
11595
|
+
const scoreMatch = cleanedText.match(/(?:score[:\s]*)?(\d+\.?\d*)/i);
|
|
11596
|
+
if (scoreMatch) {
|
|
11597
|
+
const score = Number(scoreMatch[1]);
|
|
11598
|
+
if (!Number.isNaN(score) && score >= 0 && score <= 1) {
|
|
11599
|
+
return { score, reason: cleanedText };
|
|
11600
|
+
}
|
|
11601
|
+
}
|
|
11602
|
+
throw new Error(`No JSON found in grader response: ${text.substring(0, 100)}...`);
|
|
11596
11603
|
}
|
|
11597
11604
|
try {
|
|
11598
11605
|
const parsed = JSON.parse(jsonMatch[0]);
|
|
@@ -11605,26 +11612,39 @@ class LLMGraderEvaluator {
|
|
|
11605
11612
|
reason: parsed.reason
|
|
11606
11613
|
};
|
|
11607
11614
|
} catch (error) {
|
|
11615
|
+
const scoreMatch = jsonMatch[0].match(/"score"[:\s]*(\d+\.?\d*)/i);
|
|
11616
|
+
if (scoreMatch) {
|
|
11617
|
+
const score = Number(scoreMatch[1]);
|
|
11618
|
+
if (!Number.isNaN(score) && score >= 0 && score <= 1) {
|
|
11619
|
+
const reasonMatch = jsonMatch[0].match(/"reason"[:\s]*"([^"]+)"/i);
|
|
11620
|
+
return { score, reason: reasonMatch?.[1] };
|
|
11621
|
+
}
|
|
11622
|
+
}
|
|
11608
11623
|
throw new Error(`Failed to parse grader response: ${error.message}`);
|
|
11609
11624
|
}
|
|
11610
11625
|
}
|
|
11611
11626
|
}
|
|
11612
|
-
var GRADER_PROMPT = `You are
|
|
11627
|
+
var GRADER_PROMPT = `You are a strict JSON-only evaluator. You grade AI responses based on rubrics.
|
|
11613
11628
|
|
|
11614
|
-
|
|
11629
|
+
RUBRIC:
|
|
11615
11630
|
{{rubric}}
|
|
11616
11631
|
|
|
11617
|
-
|
|
11632
|
+
RESPONSE TO EVALUATE:
|
|
11618
11633
|
{{response}}
|
|
11619
11634
|
|
|
11620
|
-
|
|
11621
|
-
Score the response from 0.0 to 1.0 based on the rubric.
|
|
11622
|
-
Be objective and consistent in your scoring.
|
|
11635
|
+
TASK: Score the response from 0.0 to 1.0 based on the rubric above.
|
|
11623
11636
|
|
|
11624
|
-
|
|
11625
|
-
{"score":
|
|
11637
|
+
OUTPUT FORMAT: You MUST respond with ONLY this exact JSON structure, nothing else:
|
|
11638
|
+
{"score":0.0,"reason":"explanation"}
|
|
11639
|
+
|
|
11640
|
+
RULES:
|
|
11641
|
+
- Output ONLY valid JSON, no markdown, no code blocks, no extra text
|
|
11642
|
+
- "score" must be a number between 0.0 and 1.0
|
|
11643
|
+
- "reason" must be a brief string explaining the score
|
|
11644
|
+
- Do NOT wrap in \`\`\`json or any formatting
|
|
11645
|
+
- Your entire response must be parseable by JSON.parse()
|
|
11626
11646
|
|
|
11627
|
-
|
|
11647
|
+
JSON OUTPUT:`;
|
|
11628
11648
|
|
|
11629
11649
|
// src/evaluators/not-contains.ts
|
|
11630
11650
|
class NotContainsEvaluator {
|
|
@@ -13487,6 +13507,7 @@ var ProviderConfigSchema = exports_external.object({
|
|
|
13487
13507
|
deploymentName: exports_external.string().optional(),
|
|
13488
13508
|
apiVersion: exports_external.string().optional(),
|
|
13489
13509
|
embeddingDeploymentName: exports_external.string().optional(),
|
|
13510
|
+
modelFamily: exports_external.string().optional(),
|
|
13490
13511
|
underlyingProvider: exports_external.enum(["openai", "azure", "anthropic", "google", "mistral"]).optional()
|
|
13491
13512
|
}).optional();
|
|
13492
13513
|
var BaseExpectedSchema = exports_external.discriminatedUnion("type", [
|
|
@@ -14217,6 +14238,319 @@ function nanoid(size = 21) {
|
|
|
14217
14238
|
return id;
|
|
14218
14239
|
}
|
|
14219
14240
|
|
|
14241
|
+
// src/cost/pricing.ts
|
|
14242
|
+
var MODEL_PRICING = {
|
|
14243
|
+
"gpt-5": {
|
|
14244
|
+
promptPer1K: 0.00125,
|
|
14245
|
+
completionPer1K: 0.01,
|
|
14246
|
+
lastUpdated: "2026-01",
|
|
14247
|
+
notes: "400K context window"
|
|
14248
|
+
},
|
|
14249
|
+
"gpt-5.1": {
|
|
14250
|
+
promptPer1K: 0.00125,
|
|
14251
|
+
completionPer1K: 0.01,
|
|
14252
|
+
lastUpdated: "2026-01"
|
|
14253
|
+
},
|
|
14254
|
+
"gpt-5.2": {
|
|
14255
|
+
promptPer1K: 0.00175,
|
|
14256
|
+
completionPer1K: 0.014,
|
|
14257
|
+
lastUpdated: "2026-01"
|
|
14258
|
+
},
|
|
14259
|
+
"gpt-5-mini": {
|
|
14260
|
+
promptPer1K: 0.00025,
|
|
14261
|
+
completionPer1K: 0.002,
|
|
14262
|
+
lastUpdated: "2026-01"
|
|
14263
|
+
},
|
|
14264
|
+
"gpt-5-nano": {
|
|
14265
|
+
promptPer1K: 0.00005,
|
|
14266
|
+
completionPer1K: 0.0004,
|
|
14267
|
+
lastUpdated: "2026-01"
|
|
14268
|
+
},
|
|
14269
|
+
"gpt-4.1": {
|
|
14270
|
+
promptPer1K: 0.002,
|
|
14271
|
+
completionPer1K: 0.008,
|
|
14272
|
+
lastUpdated: "2026-01",
|
|
14273
|
+
notes: "1M context window"
|
|
14274
|
+
},
|
|
14275
|
+
"gpt-4.1-mini": {
|
|
14276
|
+
promptPer1K: 0.0004,
|
|
14277
|
+
completionPer1K: 0.0016,
|
|
14278
|
+
lastUpdated: "2026-01"
|
|
14279
|
+
},
|
|
14280
|
+
"gpt-4.1-nano": {
|
|
14281
|
+
promptPer1K: 0.0001,
|
|
14282
|
+
completionPer1K: 0.0004,
|
|
14283
|
+
lastUpdated: "2026-01"
|
|
14284
|
+
},
|
|
14285
|
+
"gpt-4o": {
|
|
14286
|
+
promptPer1K: 0.0025,
|
|
14287
|
+
completionPer1K: 0.01,
|
|
14288
|
+
lastUpdated: "2026-01",
|
|
14289
|
+
notes: "128K context window"
|
|
14290
|
+
},
|
|
14291
|
+
"gpt-4o-mini": {
|
|
14292
|
+
promptPer1K: 0.00015,
|
|
14293
|
+
completionPer1K: 0.0006,
|
|
14294
|
+
lastUpdated: "2026-01",
|
|
14295
|
+
notes: "128K context window"
|
|
14296
|
+
},
|
|
14297
|
+
o1: {
|
|
14298
|
+
promptPer1K: 0.015,
|
|
14299
|
+
completionPer1K: 0.06,
|
|
14300
|
+
lastUpdated: "2026-01",
|
|
14301
|
+
notes: "Reasoning model - internal thinking tokens billed as output"
|
|
14302
|
+
},
|
|
14303
|
+
o3: {
|
|
14304
|
+
promptPer1K: 0.002,
|
|
14305
|
+
completionPer1K: 0.008,
|
|
14306
|
+
lastUpdated: "2026-01"
|
|
14307
|
+
},
|
|
14308
|
+
"o3-mini": {
|
|
14309
|
+
promptPer1K: 0.0011,
|
|
14310
|
+
completionPer1K: 0.0044,
|
|
14311
|
+
lastUpdated: "2026-01"
|
|
14312
|
+
},
|
|
14313
|
+
"o4-mini": {
|
|
14314
|
+
promptPer1K: 0.0011,
|
|
14315
|
+
completionPer1K: 0.0044,
|
|
14316
|
+
lastUpdated: "2026-01"
|
|
14317
|
+
},
|
|
14318
|
+
"gpt-4-turbo": {
|
|
14319
|
+
promptPer1K: 0.01,
|
|
14320
|
+
completionPer1K: 0.03,
|
|
14321
|
+
lastUpdated: "2026-01"
|
|
14322
|
+
},
|
|
14323
|
+
"gpt-4": {
|
|
14324
|
+
promptPer1K: 0.03,
|
|
14325
|
+
completionPer1K: 0.06,
|
|
14326
|
+
lastUpdated: "2026-01"
|
|
14327
|
+
},
|
|
14328
|
+
"gpt-3.5-turbo": {
|
|
14329
|
+
promptPer1K: 0.0005,
|
|
14330
|
+
completionPer1K: 0.0015,
|
|
14331
|
+
lastUpdated: "2026-01"
|
|
14332
|
+
},
|
|
14333
|
+
"claude-opus-4.5": {
|
|
14334
|
+
promptPer1K: 0.005,
|
|
14335
|
+
completionPer1K: 0.025,
|
|
14336
|
+
lastUpdated: "2026-01",
|
|
14337
|
+
notes: "Most capable Claude model"
|
|
14338
|
+
},
|
|
14339
|
+
"claude-sonnet-4.5": {
|
|
14340
|
+
promptPer1K: 0.003,
|
|
14341
|
+
completionPer1K: 0.015,
|
|
14342
|
+
lastUpdated: "2026-01",
|
|
14343
|
+
notes: "Balanced performance and cost"
|
|
14344
|
+
},
|
|
14345
|
+
"claude-haiku-4.5": {
|
|
14346
|
+
promptPer1K: 0.001,
|
|
14347
|
+
completionPer1K: 0.005,
|
|
14348
|
+
lastUpdated: "2026-01",
|
|
14349
|
+
notes: "Fastest Claude model"
|
|
14350
|
+
},
|
|
14351
|
+
"claude-opus-4": {
|
|
14352
|
+
promptPer1K: 0.015,
|
|
14353
|
+
completionPer1K: 0.075,
|
|
14354
|
+
lastUpdated: "2026-01"
|
|
14355
|
+
},
|
|
14356
|
+
"claude-opus-4.1": {
|
|
14357
|
+
promptPer1K: 0.015,
|
|
14358
|
+
completionPer1K: 0.075,
|
|
14359
|
+
lastUpdated: "2026-01"
|
|
14360
|
+
},
|
|
14361
|
+
"claude-sonnet-4": {
|
|
14362
|
+
promptPer1K: 0.003,
|
|
14363
|
+
completionPer1K: 0.015,
|
|
14364
|
+
lastUpdated: "2026-01"
|
|
14365
|
+
},
|
|
14366
|
+
"claude-sonnet-3.7": {
|
|
14367
|
+
promptPer1K: 0.003,
|
|
14368
|
+
completionPer1K: 0.015,
|
|
14369
|
+
lastUpdated: "2026-01"
|
|
14370
|
+
},
|
|
14371
|
+
"claude-3-7-sonnet": {
|
|
14372
|
+
promptPer1K: 0.003,
|
|
14373
|
+
completionPer1K: 0.015,
|
|
14374
|
+
lastUpdated: "2026-01"
|
|
14375
|
+
},
|
|
14376
|
+
"claude-3-5-sonnet-20241022": {
|
|
14377
|
+
promptPer1K: 0.003,
|
|
14378
|
+
completionPer1K: 0.015,
|
|
14379
|
+
lastUpdated: "2026-01"
|
|
14380
|
+
},
|
|
14381
|
+
"claude-3-5-haiku-20241022": {
|
|
14382
|
+
promptPer1K: 0.0008,
|
|
14383
|
+
completionPer1K: 0.004,
|
|
14384
|
+
lastUpdated: "2026-01"
|
|
14385
|
+
},
|
|
14386
|
+
"claude-haiku-3.5": {
|
|
14387
|
+
promptPer1K: 0.0008,
|
|
14388
|
+
completionPer1K: 0.004,
|
|
14389
|
+
lastUpdated: "2026-01"
|
|
14390
|
+
},
|
|
14391
|
+
"claude-3-opus": {
|
|
14392
|
+
promptPer1K: 0.015,
|
|
14393
|
+
completionPer1K: 0.075,
|
|
14394
|
+
lastUpdated: "2026-01"
|
|
14395
|
+
},
|
|
14396
|
+
"claude-3-sonnet": {
|
|
14397
|
+
promptPer1K: 0.003,
|
|
14398
|
+
completionPer1K: 0.015,
|
|
14399
|
+
lastUpdated: "2026-01"
|
|
14400
|
+
},
|
|
14401
|
+
"claude-3-haiku": {
|
|
14402
|
+
promptPer1K: 0.00025,
|
|
14403
|
+
completionPer1K: 0.00125,
|
|
14404
|
+
lastUpdated: "2026-01"
|
|
14405
|
+
},
|
|
14406
|
+
"claude-3.5-sonnet": {
|
|
14407
|
+
promptPer1K: 0.003,
|
|
14408
|
+
completionPer1K: 0.015,
|
|
14409
|
+
lastUpdated: "2026-01"
|
|
14410
|
+
},
|
|
14411
|
+
"claude-3.5-haiku": {
|
|
14412
|
+
promptPer1K: 0.0008,
|
|
14413
|
+
completionPer1K: 0.004,
|
|
14414
|
+
lastUpdated: "2026-01"
|
|
14415
|
+
}
|
|
14416
|
+
};
|
|
14417
|
+
var DEFAULT_PRICING = {
|
|
14418
|
+
promptPer1K: 0.003,
|
|
14419
|
+
completionPer1K: 0.015,
|
|
14420
|
+
lastUpdated: "2026-01",
|
|
14421
|
+
notes: "Default pricing - verify with provider"
|
|
14422
|
+
};
|
|
14423
|
+
function getModelPricing(model) {
|
|
14424
|
+
if (MODEL_PRICING[model]) {
|
|
14425
|
+
return MODEL_PRICING[model];
|
|
14426
|
+
}
|
|
14427
|
+
const lowerModel = model.toLowerCase();
|
|
14428
|
+
for (const [key, pricing] of Object.entries(MODEL_PRICING)) {
|
|
14429
|
+
if (key.toLowerCase() === lowerModel) {
|
|
14430
|
+
return pricing;
|
|
14431
|
+
}
|
|
14432
|
+
}
|
|
14433
|
+
if (lowerModel.includes("gpt-5.2")) {
|
|
14434
|
+
return MODEL_PRICING["gpt-5.2"];
|
|
14435
|
+
}
|
|
14436
|
+
if (lowerModel.includes("gpt-5.1")) {
|
|
14437
|
+
return MODEL_PRICING["gpt-5.1"];
|
|
14438
|
+
}
|
|
14439
|
+
if (lowerModel.includes("gpt-5-mini")) {
|
|
14440
|
+
return MODEL_PRICING["gpt-5-mini"];
|
|
14441
|
+
}
|
|
14442
|
+
if (lowerModel.includes("gpt-5-nano")) {
|
|
14443
|
+
return MODEL_PRICING["gpt-5-nano"];
|
|
14444
|
+
}
|
|
14445
|
+
if (lowerModel.includes("gpt-5")) {
|
|
14446
|
+
return MODEL_PRICING["gpt-5"];
|
|
14447
|
+
}
|
|
14448
|
+
if (lowerModel.includes("gpt-4.1-mini")) {
|
|
14449
|
+
return MODEL_PRICING["gpt-4.1-mini"];
|
|
14450
|
+
}
|
|
14451
|
+
if (lowerModel.includes("gpt-4.1-nano")) {
|
|
14452
|
+
return MODEL_PRICING["gpt-4.1-nano"];
|
|
14453
|
+
}
|
|
14454
|
+
if (lowerModel.includes("gpt-4.1")) {
|
|
14455
|
+
return MODEL_PRICING["gpt-4.1"];
|
|
14456
|
+
}
|
|
14457
|
+
if (lowerModel.includes("gpt-4o-mini")) {
|
|
14458
|
+
return MODEL_PRICING["gpt-4o-mini"];
|
|
14459
|
+
}
|
|
14460
|
+
if (lowerModel.includes("gpt-4o")) {
|
|
14461
|
+
return MODEL_PRICING["gpt-4o"];
|
|
14462
|
+
}
|
|
14463
|
+
if (lowerModel.includes("o4-mini")) {
|
|
14464
|
+
return MODEL_PRICING["o4-mini"];
|
|
14465
|
+
}
|
|
14466
|
+
if (lowerModel.includes("o3-mini")) {
|
|
14467
|
+
return MODEL_PRICING["o3-mini"];
|
|
14468
|
+
}
|
|
14469
|
+
if (lowerModel.includes("o3")) {
|
|
14470
|
+
return MODEL_PRICING.o3;
|
|
14471
|
+
}
|
|
14472
|
+
if (lowerModel.includes("o1")) {
|
|
14473
|
+
return MODEL_PRICING.o1;
|
|
14474
|
+
}
|
|
14475
|
+
if (lowerModel.includes("gpt-4-turbo")) {
|
|
14476
|
+
return MODEL_PRICING["gpt-4-turbo"];
|
|
14477
|
+
}
|
|
14478
|
+
if (lowerModel.includes("gpt-4")) {
|
|
14479
|
+
return MODEL_PRICING["gpt-4"];
|
|
14480
|
+
}
|
|
14481
|
+
if (lowerModel.includes("gpt-3.5")) {
|
|
14482
|
+
return MODEL_PRICING["gpt-3.5-turbo"];
|
|
14483
|
+
}
|
|
14484
|
+
if (lowerModel.includes("opus-4.5") || lowerModel.includes("opus-4-5")) {
|
|
14485
|
+
return MODEL_PRICING["claude-opus-4.5"];
|
|
14486
|
+
}
|
|
14487
|
+
if (lowerModel.includes("sonnet-4.5") || lowerModel.includes("sonnet-4-5")) {
|
|
14488
|
+
return MODEL_PRICING["claude-sonnet-4.5"];
|
|
14489
|
+
}
|
|
14490
|
+
if (lowerModel.includes("haiku-4.5") || lowerModel.includes("haiku-4-5")) {
|
|
14491
|
+
return MODEL_PRICING["claude-haiku-4.5"];
|
|
14492
|
+
}
|
|
14493
|
+
if (lowerModel.includes("opus-4.1") || lowerModel.includes("opus-4-1")) {
|
|
14494
|
+
return MODEL_PRICING["claude-opus-4.1"];
|
|
14495
|
+
}
|
|
14496
|
+
if (lowerModel.includes("opus-4")) {
|
|
14497
|
+
return MODEL_PRICING["claude-opus-4"];
|
|
14498
|
+
}
|
|
14499
|
+
if (lowerModel.includes("sonnet-4")) {
|
|
14500
|
+
return MODEL_PRICING["claude-sonnet-4"];
|
|
14501
|
+
}
|
|
14502
|
+
if (lowerModel.includes("sonnet-3.7") || lowerModel.includes("sonnet-3-7")) {
|
|
14503
|
+
return MODEL_PRICING["claude-sonnet-3.7"];
|
|
14504
|
+
}
|
|
14505
|
+
if (lowerModel.includes("claude-3-5-sonnet") || lowerModel.includes("claude-3.5-sonnet")) {
|
|
14506
|
+
return MODEL_PRICING["claude-3.5-sonnet"];
|
|
14507
|
+
}
|
|
14508
|
+
if (lowerModel.includes("claude-3-5-haiku") || lowerModel.includes("claude-3.5-haiku")) {
|
|
14509
|
+
return MODEL_PRICING["claude-3.5-haiku"];
|
|
14510
|
+
}
|
|
14511
|
+
if (lowerModel.includes("claude-3-opus")) {
|
|
14512
|
+
return MODEL_PRICING["claude-3-opus"];
|
|
14513
|
+
}
|
|
14514
|
+
if (lowerModel.includes("claude-3-sonnet")) {
|
|
14515
|
+
return MODEL_PRICING["claude-3-sonnet"];
|
|
14516
|
+
}
|
|
14517
|
+
if (lowerModel.includes("claude-3-haiku")) {
|
|
14518
|
+
return MODEL_PRICING["claude-3-haiku"];
|
|
14519
|
+
}
|
|
14520
|
+
if (lowerModel.includes("claude")) {
|
|
14521
|
+
return MODEL_PRICING["claude-sonnet-4.5"];
|
|
14522
|
+
}
|
|
14523
|
+
return DEFAULT_PRICING;
|
|
14524
|
+
}
|
|
14525
|
+
function estimateCost(promptTokens, completionTokens, model) {
|
|
14526
|
+
const pricing = getModelPricing(model);
|
|
14527
|
+
const promptCostUsd = promptTokens / 1000 * pricing.promptPer1K;
|
|
14528
|
+
const completionCostUsd = completionTokens / 1000 * pricing.completionPer1K;
|
|
14529
|
+
const totalUsd = promptCostUsd + completionCostUsd;
|
|
14530
|
+
return {
|
|
14531
|
+
totalUsd,
|
|
14532
|
+
promptCostUsd,
|
|
14533
|
+
completionCostUsd,
|
|
14534
|
+
model,
|
|
14535
|
+
pricing
|
|
14536
|
+
};
|
|
14537
|
+
}
|
|
14538
|
+
function formatCost(costUsd) {
|
|
14539
|
+
if (costUsd < 0.01) {
|
|
14540
|
+
return `$${(costUsd * 100).toFixed(4)} cents`;
|
|
14541
|
+
}
|
|
14542
|
+
if (costUsd < 1) {
|
|
14543
|
+
return `$${costUsd.toFixed(4)}`;
|
|
14544
|
+
}
|
|
14545
|
+
return `$${costUsd.toFixed(2)}`;
|
|
14546
|
+
}
|
|
14547
|
+
function listKnownModels() {
|
|
14548
|
+
return Object.entries(MODEL_PRICING).map(([model, pricing]) => ({
|
|
14549
|
+
model,
|
|
14550
|
+
pricing
|
|
14551
|
+
}));
|
|
14552
|
+
}
|
|
14553
|
+
|
|
14220
14554
|
// src/provenance/environment.ts
|
|
14221
14555
|
function getEnvironmentInfo() {
|
|
14222
14556
|
return {
|
|
@@ -14275,7 +14609,8 @@ function createRunManifest(options) {
|
|
|
14275
14609
|
runReason,
|
|
14276
14610
|
redaction
|
|
14277
14611
|
} = options;
|
|
14278
|
-
const
|
|
14612
|
+
const modelForCost = resolvedConfig?.model || config.model;
|
|
14613
|
+
const metrics = calculateMetrics(cases, modelForCost);
|
|
14279
14614
|
const git = getGitInfo();
|
|
14280
14615
|
const environment = getEnvironmentInfo();
|
|
14281
14616
|
return {
|
|
@@ -14299,7 +14634,7 @@ function createRunManifest(options) {
|
|
|
14299
14634
|
redaction
|
|
14300
14635
|
};
|
|
14301
14636
|
}
|
|
14302
|
-
function calculateMetrics(cases) {
|
|
14637
|
+
function calculateMetrics(cases, model) {
|
|
14303
14638
|
const passedCases = cases.filter((c) => c.ok);
|
|
14304
14639
|
const latencies = cases.map((c) => c.latencyMs).sort((a, b) => a - b);
|
|
14305
14640
|
const medianLatency = latencies.length > 0 ? latencies[Math.floor(latencies.length / 2)] : 0;
|
|
@@ -14307,6 +14642,21 @@ function calculateMetrics(cases) {
|
|
|
14307
14642
|
const p95Latency = latencies.length > 0 ? latencies[p95Index] : 0;
|
|
14308
14643
|
const totalPromptTokens = cases.reduce((sum, c) => sum + c.tokens.prompt, 0);
|
|
14309
14644
|
const totalCompletionTokens = cases.reduce((sum, c) => sum + c.tokens.completion, 0);
|
|
14645
|
+
let cost;
|
|
14646
|
+
if (model && (totalPromptTokens > 0 || totalCompletionTokens > 0)) {
|
|
14647
|
+
const costEstimate = estimateCost(totalPromptTokens, totalCompletionTokens, model);
|
|
14648
|
+
const pricing = getModelPricing(model);
|
|
14649
|
+
cost = {
|
|
14650
|
+
total_usd: costEstimate.totalUsd,
|
|
14651
|
+
prompt_cost_usd: costEstimate.promptCostUsd,
|
|
14652
|
+
completion_cost_usd: costEstimate.completionCostUsd,
|
|
14653
|
+
model: costEstimate.model,
|
|
14654
|
+
pricing: {
|
|
14655
|
+
prompt_per_1k: pricing.promptPer1K,
|
|
14656
|
+
completion_per_1k: pricing.completionPer1K
|
|
14657
|
+
}
|
|
14658
|
+
};
|
|
14659
|
+
}
|
|
14310
14660
|
return {
|
|
14311
14661
|
success_rate: cases.length > 0 ? passedCases.length / cases.length : 0,
|
|
14312
14662
|
total_cases: cases.length,
|
|
@@ -14316,7 +14666,8 @@ function calculateMetrics(cases) {
|
|
|
14316
14666
|
p95_latency_ms: p95Latency,
|
|
14317
14667
|
total_tokens: totalPromptTokens + totalCompletionTokens,
|
|
14318
14668
|
total_prompt_tokens: totalPromptTokens,
|
|
14319
|
-
total_completion_tokens: totalCompletionTokens
|
|
14669
|
+
total_completion_tokens: totalCompletionTokens,
|
|
14670
|
+
cost
|
|
14320
14671
|
};
|
|
14321
14672
|
}
|
|
14322
14673
|
function detectCIEnvironment() {
|
|
@@ -14487,14 +14838,26 @@ function getSuccessRate(manifest) {
|
|
|
14487
14838
|
}
|
|
14488
14839
|
return manifest.metrics.success_rate;
|
|
14489
14840
|
}
|
|
14841
|
+
function getEstimatedCost(manifest) {
|
|
14842
|
+
const type = getManifestType(manifest);
|
|
14843
|
+
if (type === "stress") {
|
|
14844
|
+
return manifest.metrics.cost?.estimated_total_usd;
|
|
14845
|
+
}
|
|
14846
|
+
if (type === "run") {
|
|
14847
|
+
return manifest.metrics.cost?.total_usd;
|
|
14848
|
+
}
|
|
14849
|
+
return;
|
|
14850
|
+
}
|
|
14490
14851
|
function getScenario(manifest) {
|
|
14491
14852
|
return manifest.config.scenario;
|
|
14492
14853
|
}
|
|
14493
14854
|
|
|
14494
14855
|
class LocalStorageAdapter {
|
|
14495
14856
|
basePath;
|
|
14857
|
+
baselinesPath;
|
|
14496
14858
|
constructor(basePath = "./artemis-runs") {
|
|
14497
14859
|
this.basePath = resolve2(basePath);
|
|
14860
|
+
this.baselinesPath = join2(this.basePath, ".artemis", "baselines.json");
|
|
14498
14861
|
}
|
|
14499
14862
|
async save(manifest) {
|
|
14500
14863
|
const dir = join2(this.basePath, manifest.project);
|
|
@@ -14554,13 +14917,17 @@ class LocalStorageAdapter {
|
|
|
14554
14917
|
if (options?.scenario && getScenario(manifest) !== options.scenario) {
|
|
14555
14918
|
continue;
|
|
14556
14919
|
}
|
|
14557
|
-
|
|
14920
|
+
const item = {
|
|
14558
14921
|
runId: manifest.run_id,
|
|
14559
14922
|
scenario: getScenario(manifest),
|
|
14560
14923
|
successRate: getSuccessRate(manifest),
|
|
14561
14924
|
createdAt: manifest.start_time,
|
|
14562
14925
|
type: manifestType
|
|
14563
|
-
}
|
|
14926
|
+
};
|
|
14927
|
+
if (options?.includeCost) {
|
|
14928
|
+
item.estimatedCostUsd = getEstimatedCost(manifest);
|
|
14929
|
+
}
|
|
14930
|
+
results.push(item);
|
|
14564
14931
|
} catch {}
|
|
14565
14932
|
}
|
|
14566
14933
|
}
|
|
@@ -14613,6 +14980,89 @@ class LocalStorageAdapter {
|
|
|
14613
14980
|
return [];
|
|
14614
14981
|
}
|
|
14615
14982
|
}
|
|
14983
|
+
async loadBaselinesFile() {
|
|
14984
|
+
try {
|
|
14985
|
+
const content = await readFile2(this.baselinesPath, "utf-8");
|
|
14986
|
+
return JSON.parse(content);
|
|
14987
|
+
} catch {
|
|
14988
|
+
return { version: "1.0", baselines: {} };
|
|
14989
|
+
}
|
|
14990
|
+
}
|
|
14991
|
+
async saveBaselinesFile(data) {
|
|
14992
|
+
const dir = join2(this.basePath, ".artemis");
|
|
14993
|
+
await mkdir(dir, { recursive: true });
|
|
14994
|
+
await writeFile(this.baselinesPath, JSON.stringify(data, null, 2));
|
|
14995
|
+
}
|
|
14996
|
+
async setBaseline(scenario, runId, tag) {
|
|
14997
|
+
const manifest = await this.loadRun(runId);
|
|
14998
|
+
const scenarioName = scenario || getScenario(manifest);
|
|
14999
|
+
const baseline = {
|
|
15000
|
+
scenario: scenarioName,
|
|
15001
|
+
runId,
|
|
15002
|
+
createdAt: new Date().toISOString(),
|
|
15003
|
+
metrics: {
|
|
15004
|
+
successRate: manifest.metrics.success_rate,
|
|
15005
|
+
medianLatencyMs: manifest.metrics.median_latency_ms,
|
|
15006
|
+
totalTokens: manifest.metrics.total_tokens,
|
|
15007
|
+
passedCases: manifest.metrics.passed_cases,
|
|
15008
|
+
failedCases: manifest.metrics.failed_cases,
|
|
15009
|
+
totalCases: manifest.metrics.total_cases
|
|
15010
|
+
},
|
|
15011
|
+
tag
|
|
15012
|
+
};
|
|
15013
|
+
const data = await this.loadBaselinesFile();
|
|
15014
|
+
data.baselines[scenarioName] = baseline;
|
|
15015
|
+
await this.saveBaselinesFile(data);
|
|
15016
|
+
return baseline;
|
|
15017
|
+
}
|
|
15018
|
+
async getBaseline(scenario) {
|
|
15019
|
+
const data = await this.loadBaselinesFile();
|
|
15020
|
+
return data.baselines[scenario] || null;
|
|
15021
|
+
}
|
|
15022
|
+
async getBaselineByRunId(runId) {
|
|
15023
|
+
const data = await this.loadBaselinesFile();
|
|
15024
|
+
const baselines = Object.values(data.baselines);
|
|
15025
|
+
return baselines.find((b) => b.runId === runId) || null;
|
|
15026
|
+
}
|
|
15027
|
+
async listBaselines() {
|
|
15028
|
+
const data = await this.loadBaselinesFile();
|
|
15029
|
+
return Object.values(data.baselines).sort((a, b) => new Date(b.createdAt).getTime() - new Date(a.createdAt).getTime());
|
|
15030
|
+
}
|
|
15031
|
+
async removeBaseline(scenario) {
|
|
15032
|
+
const data = await this.loadBaselinesFile();
|
|
15033
|
+
if (data.baselines[scenario]) {
|
|
15034
|
+
delete data.baselines[scenario];
|
|
15035
|
+
await this.saveBaselinesFile(data);
|
|
15036
|
+
return true;
|
|
15037
|
+
}
|
|
15038
|
+
return false;
|
|
15039
|
+
}
|
|
15040
|
+
async removeBaselineByRunId(runId) {
|
|
15041
|
+
const data = await this.loadBaselinesFile();
|
|
15042
|
+
const entry = Object.entries(data.baselines).find(([_, b]) => b.runId === runId);
|
|
15043
|
+
if (entry) {
|
|
15044
|
+
delete data.baselines[entry[0]];
|
|
15045
|
+
await this.saveBaselinesFile(data);
|
|
15046
|
+
return true;
|
|
15047
|
+
}
|
|
15048
|
+
return false;
|
|
15049
|
+
}
|
|
15050
|
+
async compareToBaseline(runId, regressionThreshold = 0.05) {
|
|
15051
|
+
const currentManifest = await this.loadRun(runId);
|
|
15052
|
+
const scenario = getScenario(currentManifest);
|
|
15053
|
+
const baseline = await this.getBaseline(scenario);
|
|
15054
|
+
if (!baseline) {
|
|
15055
|
+
return null;
|
|
15056
|
+
}
|
|
15057
|
+
const comparison = await this.compare(baseline.runId, runId);
|
|
15058
|
+
const hasRegression = comparison.delta.successRate < -regressionThreshold;
|
|
15059
|
+
return {
|
|
15060
|
+
baseline,
|
|
15061
|
+
comparison,
|
|
15062
|
+
hasRegression,
|
|
15063
|
+
regressionThreshold
|
|
15064
|
+
};
|
|
15065
|
+
}
|
|
14616
15066
|
}
|
|
14617
15067
|
|
|
14618
15068
|
// ../../node_modules/.bun/tslib@2.8.1/node_modules/tslib/modules/index.js
|
|
@@ -24301,193 +24751,6 @@ class Logger {
|
|
|
24301
24751
|
}
|
|
24302
24752
|
}
|
|
24303
24753
|
var logger = new Logger("artemis");
|
|
24304
|
-
// src/cost/pricing.ts
|
|
24305
|
-
var MODEL_PRICING = {
|
|
24306
|
-
"gpt-4": {
|
|
24307
|
-
promptPer1K: 0.03,
|
|
24308
|
-
completionPer1K: 0.06,
|
|
24309
|
-
lastUpdated: "2024-01"
|
|
24310
|
-
},
|
|
24311
|
-
"gpt-4-32k": {
|
|
24312
|
-
promptPer1K: 0.06,
|
|
24313
|
-
completionPer1K: 0.12,
|
|
24314
|
-
lastUpdated: "2024-01"
|
|
24315
|
-
},
|
|
24316
|
-
"gpt-4-turbo": {
|
|
24317
|
-
promptPer1K: 0.01,
|
|
24318
|
-
completionPer1K: 0.03,
|
|
24319
|
-
lastUpdated: "2024-01"
|
|
24320
|
-
},
|
|
24321
|
-
"gpt-4-turbo-preview": {
|
|
24322
|
-
promptPer1K: 0.01,
|
|
24323
|
-
completionPer1K: 0.03,
|
|
24324
|
-
lastUpdated: "2024-01"
|
|
24325
|
-
},
|
|
24326
|
-
"gpt-4o": {
|
|
24327
|
-
promptPer1K: 0.005,
|
|
24328
|
-
completionPer1K: 0.015,
|
|
24329
|
-
lastUpdated: "2024-05"
|
|
24330
|
-
},
|
|
24331
|
-
"gpt-4o-mini": {
|
|
24332
|
-
promptPer1K: 0.00015,
|
|
24333
|
-
completionPer1K: 0.0006,
|
|
24334
|
-
lastUpdated: "2024-07"
|
|
24335
|
-
},
|
|
24336
|
-
"gpt-3.5-turbo": {
|
|
24337
|
-
promptPer1K: 0.0005,
|
|
24338
|
-
completionPer1K: 0.0015,
|
|
24339
|
-
lastUpdated: "2024-01"
|
|
24340
|
-
},
|
|
24341
|
-
"gpt-3.5-turbo-16k": {
|
|
24342
|
-
promptPer1K: 0.003,
|
|
24343
|
-
completionPer1K: 0.004,
|
|
24344
|
-
lastUpdated: "2024-01"
|
|
24345
|
-
},
|
|
24346
|
-
"claude-3-opus-20240229": {
|
|
24347
|
-
promptPer1K: 0.015,
|
|
24348
|
-
completionPer1K: 0.075,
|
|
24349
|
-
lastUpdated: "2024-03"
|
|
24350
|
-
},
|
|
24351
|
-
"claude-3-sonnet-20240229": {
|
|
24352
|
-
promptPer1K: 0.003,
|
|
24353
|
-
completionPer1K: 0.015,
|
|
24354
|
-
lastUpdated: "2024-03"
|
|
24355
|
-
},
|
|
24356
|
-
"claude-3-haiku-20240307": {
|
|
24357
|
-
promptPer1K: 0.00025,
|
|
24358
|
-
completionPer1K: 0.00125,
|
|
24359
|
-
lastUpdated: "2024-03"
|
|
24360
|
-
},
|
|
24361
|
-
"claude-3-5-sonnet-20240620": {
|
|
24362
|
-
promptPer1K: 0.003,
|
|
24363
|
-
completionPer1K: 0.015,
|
|
24364
|
-
lastUpdated: "2024-06"
|
|
24365
|
-
},
|
|
24366
|
-
"claude-3-5-sonnet-20241022": {
|
|
24367
|
-
promptPer1K: 0.003,
|
|
24368
|
-
completionPer1K: 0.015,
|
|
24369
|
-
lastUpdated: "2024-10"
|
|
24370
|
-
},
|
|
24371
|
-
"claude-3-5-haiku-20241022": {
|
|
24372
|
-
promptPer1K: 0.0008,
|
|
24373
|
-
completionPer1K: 0.004,
|
|
24374
|
-
lastUpdated: "2024-10"
|
|
24375
|
-
},
|
|
24376
|
-
"claude-3-opus": {
|
|
24377
|
-
promptPer1K: 0.015,
|
|
24378
|
-
completionPer1K: 0.075,
|
|
24379
|
-
lastUpdated: "2024-03"
|
|
24380
|
-
},
|
|
24381
|
-
"claude-3-sonnet": {
|
|
24382
|
-
promptPer1K: 0.003,
|
|
24383
|
-
completionPer1K: 0.015,
|
|
24384
|
-
lastUpdated: "2024-03"
|
|
24385
|
-
},
|
|
24386
|
-
"claude-3-haiku": {
|
|
24387
|
-
promptPer1K: 0.00025,
|
|
24388
|
-
completionPer1K: 0.00125,
|
|
24389
|
-
lastUpdated: "2024-03"
|
|
24390
|
-
},
|
|
24391
|
-
"claude-3.5-sonnet": {
|
|
24392
|
-
promptPer1K: 0.003,
|
|
24393
|
-
completionPer1K: 0.015,
|
|
24394
|
-
lastUpdated: "2024-10"
|
|
24395
|
-
},
|
|
24396
|
-
"claude-3.5-haiku": {
|
|
24397
|
-
promptPer1K: 0.0008,
|
|
24398
|
-
completionPer1K: 0.004,
|
|
24399
|
-
lastUpdated: "2024-10"
|
|
24400
|
-
},
|
|
24401
|
-
"claude-2": {
|
|
24402
|
-
promptPer1K: 0.008,
|
|
24403
|
-
completionPer1K: 0.024,
|
|
24404
|
-
lastUpdated: "2024-01"
|
|
24405
|
-
},
|
|
24406
|
-
"claude-instant-1": {
|
|
24407
|
-
promptPer1K: 0.0008,
|
|
24408
|
-
completionPer1K: 0.0024,
|
|
24409
|
-
lastUpdated: "2024-01"
|
|
24410
|
-
}
|
|
24411
|
-
};
|
|
24412
|
-
var DEFAULT_PRICING = {
|
|
24413
|
-
promptPer1K: 0.01,
|
|
24414
|
-
completionPer1K: 0.03,
|
|
24415
|
-
lastUpdated: "2024-01",
|
|
24416
|
-
notes: "Default pricing - verify with provider"
|
|
24417
|
-
};
|
|
24418
|
-
function getModelPricing(model) {
|
|
24419
|
-
if (MODEL_PRICING[model]) {
|
|
24420
|
-
return MODEL_PRICING[model];
|
|
24421
|
-
}
|
|
24422
|
-
const lowerModel = model.toLowerCase();
|
|
24423
|
-
for (const [key, pricing] of Object.entries(MODEL_PRICING)) {
|
|
24424
|
-
if (key.toLowerCase() === lowerModel) {
|
|
24425
|
-
return pricing;
|
|
24426
|
-
}
|
|
24427
|
-
}
|
|
24428
|
-
if (lowerModel.includes("gpt-4o-mini")) {
|
|
24429
|
-
return MODEL_PRICING["gpt-4o-mini"];
|
|
24430
|
-
}
|
|
24431
|
-
if (lowerModel.includes("gpt-4o")) {
|
|
24432
|
-
return MODEL_PRICING["gpt-4o"];
|
|
24433
|
-
}
|
|
24434
|
-
if (lowerModel.includes("gpt-4-turbo")) {
|
|
24435
|
-
return MODEL_PRICING["gpt-4-turbo"];
|
|
24436
|
-
}
|
|
24437
|
-
if (lowerModel.includes("gpt-4")) {
|
|
24438
|
-
return MODEL_PRICING["gpt-4"];
|
|
24439
|
-
}
|
|
24440
|
-
if (lowerModel.includes("gpt-3.5")) {
|
|
24441
|
-
return MODEL_PRICING["gpt-3.5-turbo"];
|
|
24442
|
-
}
|
|
24443
|
-
if (lowerModel.includes("claude-3-5-sonnet") || lowerModel.includes("claude-3.5-sonnet")) {
|
|
24444
|
-
return MODEL_PRICING["claude-3.5-sonnet"];
|
|
24445
|
-
}
|
|
24446
|
-
if (lowerModel.includes("claude-3-5-haiku") || lowerModel.includes("claude-3.5-haiku")) {
|
|
24447
|
-
return MODEL_PRICING["claude-3.5-haiku"];
|
|
24448
|
-
}
|
|
24449
|
-
if (lowerModel.includes("claude-3-opus")) {
|
|
24450
|
-
return MODEL_PRICING["claude-3-opus"];
|
|
24451
|
-
}
|
|
24452
|
-
if (lowerModel.includes("claude-3-sonnet")) {
|
|
24453
|
-
return MODEL_PRICING["claude-3-sonnet"];
|
|
24454
|
-
}
|
|
24455
|
-
if (lowerModel.includes("claude-3-haiku")) {
|
|
24456
|
-
return MODEL_PRICING["claude-3-haiku"];
|
|
24457
|
-
}
|
|
24458
|
-
if (lowerModel.includes("claude")) {
|
|
24459
|
-
return MODEL_PRICING["claude-2"];
|
|
24460
|
-
}
|
|
24461
|
-
return DEFAULT_PRICING;
|
|
24462
|
-
}
|
|
24463
|
-
function estimateCost(promptTokens, completionTokens, model) {
|
|
24464
|
-
const pricing = getModelPricing(model);
|
|
24465
|
-
const promptCostUsd = promptTokens / 1000 * pricing.promptPer1K;
|
|
24466
|
-
const completionCostUsd = completionTokens / 1000 * pricing.completionPer1K;
|
|
24467
|
-
const totalUsd = promptCostUsd + completionCostUsd;
|
|
24468
|
-
return {
|
|
24469
|
-
totalUsd,
|
|
24470
|
-
promptCostUsd,
|
|
24471
|
-
completionCostUsd,
|
|
24472
|
-
model,
|
|
24473
|
-
pricing
|
|
24474
|
-
};
|
|
24475
|
-
}
|
|
24476
|
-
function formatCost(costUsd) {
|
|
24477
|
-
if (costUsd < 0.01) {
|
|
24478
|
-
return `$${(costUsd * 100).toFixed(4)} cents`;
|
|
24479
|
-
}
|
|
24480
|
-
if (costUsd < 1) {
|
|
24481
|
-
return `$${costUsd.toFixed(4)}`;
|
|
24482
|
-
}
|
|
24483
|
-
return `$${costUsd.toFixed(2)}`;
|
|
24484
|
-
}
|
|
24485
|
-
function listKnownModels() {
|
|
24486
|
-
return Object.entries(MODEL_PRICING).map(([model, pricing]) => ({
|
|
24487
|
-
model,
|
|
24488
|
-
pricing
|
|
24489
|
-
}));
|
|
24490
|
-
}
|
|
24491
24754
|
export {
|
|
24492
24755
|
wrapError,
|
|
24493
24756
|
validateScenario,
|