@artemiskit/core 0.2.0 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -11564,8 +11564,7 @@ class LLMGraderEvaluator {
11564
11564
  const result = await context.client.generate({
11565
11565
  prompt,
11566
11566
  model: expected.model,
11567
- temperature: 0,
11568
- maxTokens: 200
11567
+ maxTokens: 1000
11569
11568
  });
11570
11569
  const parsed = this.parseGraderResponse(result.text);
11571
11570
  const passed = parsed.score >= expected.threshold;
@@ -11590,9 +11589,17 @@ class LLMGraderEvaluator {
11590
11589
  }
11591
11590
  }
11592
11591
  parseGraderResponse(text) {
11593
- const jsonMatch = text.match(/\{[\s\S]*?\}/);
11592
+ const cleanedText = text.replace(/```json\s*/gi, "").replace(/```\s*/g, "").trim();
11593
+ const jsonMatch = cleanedText.match(/\{[\s\S]*?\}/);
11594
11594
  if (!jsonMatch) {
11595
- throw new Error("No JSON found in grader response");
11595
+ const scoreMatch = cleanedText.match(/(?:score[:\s]*)?(\d+\.?\d*)/i);
11596
+ if (scoreMatch) {
11597
+ const score = Number(scoreMatch[1]);
11598
+ if (!Number.isNaN(score) && score >= 0 && score <= 1) {
11599
+ return { score, reason: cleanedText };
11600
+ }
11601
+ }
11602
+ throw new Error(`No JSON found in grader response: ${text.substring(0, 100)}...`);
11596
11603
  }
11597
11604
  try {
11598
11605
  const parsed = JSON.parse(jsonMatch[0]);
@@ -11605,26 +11612,39 @@ class LLMGraderEvaluator {
11605
11612
  reason: parsed.reason
11606
11613
  };
11607
11614
  } catch (error) {
11615
+ const scoreMatch = jsonMatch[0].match(/"score"[:\s]*(\d+\.?\d*)/i);
11616
+ if (scoreMatch) {
11617
+ const score = Number(scoreMatch[1]);
11618
+ if (!Number.isNaN(score) && score >= 0 && score <= 1) {
11619
+ const reasonMatch = jsonMatch[0].match(/"reason"[:\s]*"([^"]+)"/i);
11620
+ return { score, reason: reasonMatch?.[1] };
11621
+ }
11622
+ }
11608
11623
  throw new Error(`Failed to parse grader response: ${error.message}`);
11609
11624
  }
11610
11625
  }
11611
11626
  }
11612
- var GRADER_PROMPT = `You are an evaluator grading an AI response based on a rubric.
11627
+ var GRADER_PROMPT = `You are a strict JSON-only evaluator. You grade AI responses based on rubrics.
11613
11628
 
11614
- ## RUBRIC
11629
+ RUBRIC:
11615
11630
  {{rubric}}
11616
11631
 
11617
- ## RESPONSE TO EVALUATE
11632
+ RESPONSE TO EVALUATE:
11618
11633
  {{response}}
11619
11634
 
11620
- ## INSTRUCTIONS
11621
- Score the response from 0.0 to 1.0 based on the rubric.
11622
- Be objective and consistent in your scoring.
11635
+ TASK: Score the response from 0.0 to 1.0 based on the rubric above.
11623
11636
 
11624
- Respond with ONLY a JSON object in this exact format:
11625
- {"score": <number between 0 and 1>, "reason": "<brief explanation of score>"}
11637
+ OUTPUT FORMAT: You MUST respond with ONLY this exact JSON structure, nothing else:
11638
+ {"score":0.0,"reason":"explanation"}
11639
+
11640
+ RULES:
11641
+ - Output ONLY valid JSON, no markdown, no code blocks, no extra text
11642
+ - "score" must be a number between 0.0 and 1.0
11643
+ - "reason" must be a brief string explaining the score
11644
+ - Do NOT wrap in \`\`\`json or any formatting
11645
+ - Your entire response must be parseable by JSON.parse()
11626
11646
 
11627
- Do not include any other text, markdown, or formatting.`;
11647
+ JSON OUTPUT:`;
11628
11648
 
11629
11649
  // src/evaluators/not-contains.ts
11630
11650
  class NotContainsEvaluator {
@@ -13487,6 +13507,7 @@ var ProviderConfigSchema = exports_external.object({
13487
13507
  deploymentName: exports_external.string().optional(),
13488
13508
  apiVersion: exports_external.string().optional(),
13489
13509
  embeddingDeploymentName: exports_external.string().optional(),
13510
+ modelFamily: exports_external.string().optional(),
13490
13511
  underlyingProvider: exports_external.enum(["openai", "azure", "anthropic", "google", "mistral"]).optional()
13491
13512
  }).optional();
13492
13513
  var BaseExpectedSchema = exports_external.discriminatedUnion("type", [
@@ -14217,6 +14238,319 @@ function nanoid(size = 21) {
14217
14238
  return id;
14218
14239
  }
14219
14240
 
14241
+ // src/cost/pricing.ts
14242
+ var MODEL_PRICING = {
14243
+ "gpt-5": {
14244
+ promptPer1K: 0.00125,
14245
+ completionPer1K: 0.01,
14246
+ lastUpdated: "2026-01",
14247
+ notes: "400K context window"
14248
+ },
14249
+ "gpt-5.1": {
14250
+ promptPer1K: 0.00125,
14251
+ completionPer1K: 0.01,
14252
+ lastUpdated: "2026-01"
14253
+ },
14254
+ "gpt-5.2": {
14255
+ promptPer1K: 0.00175,
14256
+ completionPer1K: 0.014,
14257
+ lastUpdated: "2026-01"
14258
+ },
14259
+ "gpt-5-mini": {
14260
+ promptPer1K: 0.00025,
14261
+ completionPer1K: 0.002,
14262
+ lastUpdated: "2026-01"
14263
+ },
14264
+ "gpt-5-nano": {
14265
+ promptPer1K: 0.00005,
14266
+ completionPer1K: 0.0004,
14267
+ lastUpdated: "2026-01"
14268
+ },
14269
+ "gpt-4.1": {
14270
+ promptPer1K: 0.002,
14271
+ completionPer1K: 0.008,
14272
+ lastUpdated: "2026-01",
14273
+ notes: "1M context window"
14274
+ },
14275
+ "gpt-4.1-mini": {
14276
+ promptPer1K: 0.0004,
14277
+ completionPer1K: 0.0016,
14278
+ lastUpdated: "2026-01"
14279
+ },
14280
+ "gpt-4.1-nano": {
14281
+ promptPer1K: 0.0001,
14282
+ completionPer1K: 0.0004,
14283
+ lastUpdated: "2026-01"
14284
+ },
14285
+ "gpt-4o": {
14286
+ promptPer1K: 0.0025,
14287
+ completionPer1K: 0.01,
14288
+ lastUpdated: "2026-01",
14289
+ notes: "128K context window"
14290
+ },
14291
+ "gpt-4o-mini": {
14292
+ promptPer1K: 0.00015,
14293
+ completionPer1K: 0.0006,
14294
+ lastUpdated: "2026-01",
14295
+ notes: "128K context window"
14296
+ },
14297
+ o1: {
14298
+ promptPer1K: 0.015,
14299
+ completionPer1K: 0.06,
14300
+ lastUpdated: "2026-01",
14301
+ notes: "Reasoning model - internal thinking tokens billed as output"
14302
+ },
14303
+ o3: {
14304
+ promptPer1K: 0.002,
14305
+ completionPer1K: 0.008,
14306
+ lastUpdated: "2026-01"
14307
+ },
14308
+ "o3-mini": {
14309
+ promptPer1K: 0.0011,
14310
+ completionPer1K: 0.0044,
14311
+ lastUpdated: "2026-01"
14312
+ },
14313
+ "o4-mini": {
14314
+ promptPer1K: 0.0011,
14315
+ completionPer1K: 0.0044,
14316
+ lastUpdated: "2026-01"
14317
+ },
14318
+ "gpt-4-turbo": {
14319
+ promptPer1K: 0.01,
14320
+ completionPer1K: 0.03,
14321
+ lastUpdated: "2026-01"
14322
+ },
14323
+ "gpt-4": {
14324
+ promptPer1K: 0.03,
14325
+ completionPer1K: 0.06,
14326
+ lastUpdated: "2026-01"
14327
+ },
14328
+ "gpt-3.5-turbo": {
14329
+ promptPer1K: 0.0005,
14330
+ completionPer1K: 0.0015,
14331
+ lastUpdated: "2026-01"
14332
+ },
14333
+ "claude-opus-4.5": {
14334
+ promptPer1K: 0.005,
14335
+ completionPer1K: 0.025,
14336
+ lastUpdated: "2026-01",
14337
+ notes: "Most capable Claude model"
14338
+ },
14339
+ "claude-sonnet-4.5": {
14340
+ promptPer1K: 0.003,
14341
+ completionPer1K: 0.015,
14342
+ lastUpdated: "2026-01",
14343
+ notes: "Balanced performance and cost"
14344
+ },
14345
+ "claude-haiku-4.5": {
14346
+ promptPer1K: 0.001,
14347
+ completionPer1K: 0.005,
14348
+ lastUpdated: "2026-01",
14349
+ notes: "Fastest Claude model"
14350
+ },
14351
+ "claude-opus-4": {
14352
+ promptPer1K: 0.015,
14353
+ completionPer1K: 0.075,
14354
+ lastUpdated: "2026-01"
14355
+ },
14356
+ "claude-opus-4.1": {
14357
+ promptPer1K: 0.015,
14358
+ completionPer1K: 0.075,
14359
+ lastUpdated: "2026-01"
14360
+ },
14361
+ "claude-sonnet-4": {
14362
+ promptPer1K: 0.003,
14363
+ completionPer1K: 0.015,
14364
+ lastUpdated: "2026-01"
14365
+ },
14366
+ "claude-sonnet-3.7": {
14367
+ promptPer1K: 0.003,
14368
+ completionPer1K: 0.015,
14369
+ lastUpdated: "2026-01"
14370
+ },
14371
+ "claude-3-7-sonnet": {
14372
+ promptPer1K: 0.003,
14373
+ completionPer1K: 0.015,
14374
+ lastUpdated: "2026-01"
14375
+ },
14376
+ "claude-3-5-sonnet-20241022": {
14377
+ promptPer1K: 0.003,
14378
+ completionPer1K: 0.015,
14379
+ lastUpdated: "2026-01"
14380
+ },
14381
+ "claude-3-5-haiku-20241022": {
14382
+ promptPer1K: 0.0008,
14383
+ completionPer1K: 0.004,
14384
+ lastUpdated: "2026-01"
14385
+ },
14386
+ "claude-haiku-3.5": {
14387
+ promptPer1K: 0.0008,
14388
+ completionPer1K: 0.004,
14389
+ lastUpdated: "2026-01"
14390
+ },
14391
+ "claude-3-opus": {
14392
+ promptPer1K: 0.015,
14393
+ completionPer1K: 0.075,
14394
+ lastUpdated: "2026-01"
14395
+ },
14396
+ "claude-3-sonnet": {
14397
+ promptPer1K: 0.003,
14398
+ completionPer1K: 0.015,
14399
+ lastUpdated: "2026-01"
14400
+ },
14401
+ "claude-3-haiku": {
14402
+ promptPer1K: 0.00025,
14403
+ completionPer1K: 0.00125,
14404
+ lastUpdated: "2026-01"
14405
+ },
14406
+ "claude-3.5-sonnet": {
14407
+ promptPer1K: 0.003,
14408
+ completionPer1K: 0.015,
14409
+ lastUpdated: "2026-01"
14410
+ },
14411
+ "claude-3.5-haiku": {
14412
+ promptPer1K: 0.0008,
14413
+ completionPer1K: 0.004,
14414
+ lastUpdated: "2026-01"
14415
+ }
14416
+ };
14417
+ var DEFAULT_PRICING = {
14418
+ promptPer1K: 0.003,
14419
+ completionPer1K: 0.015,
14420
+ lastUpdated: "2026-01",
14421
+ notes: "Default pricing - verify with provider"
14422
+ };
14423
+ function getModelPricing(model) {
14424
+ if (MODEL_PRICING[model]) {
14425
+ return MODEL_PRICING[model];
14426
+ }
14427
+ const lowerModel = model.toLowerCase();
14428
+ for (const [key, pricing] of Object.entries(MODEL_PRICING)) {
14429
+ if (key.toLowerCase() === lowerModel) {
14430
+ return pricing;
14431
+ }
14432
+ }
14433
+ if (lowerModel.includes("gpt-5.2")) {
14434
+ return MODEL_PRICING["gpt-5.2"];
14435
+ }
14436
+ if (lowerModel.includes("gpt-5.1")) {
14437
+ return MODEL_PRICING["gpt-5.1"];
14438
+ }
14439
+ if (lowerModel.includes("gpt-5-mini")) {
14440
+ return MODEL_PRICING["gpt-5-mini"];
14441
+ }
14442
+ if (lowerModel.includes("gpt-5-nano")) {
14443
+ return MODEL_PRICING["gpt-5-nano"];
14444
+ }
14445
+ if (lowerModel.includes("gpt-5")) {
14446
+ return MODEL_PRICING["gpt-5"];
14447
+ }
14448
+ if (lowerModel.includes("gpt-4.1-mini")) {
14449
+ return MODEL_PRICING["gpt-4.1-mini"];
14450
+ }
14451
+ if (lowerModel.includes("gpt-4.1-nano")) {
14452
+ return MODEL_PRICING["gpt-4.1-nano"];
14453
+ }
14454
+ if (lowerModel.includes("gpt-4.1")) {
14455
+ return MODEL_PRICING["gpt-4.1"];
14456
+ }
14457
+ if (lowerModel.includes("gpt-4o-mini")) {
14458
+ return MODEL_PRICING["gpt-4o-mini"];
14459
+ }
14460
+ if (lowerModel.includes("gpt-4o")) {
14461
+ return MODEL_PRICING["gpt-4o"];
14462
+ }
14463
+ if (lowerModel.includes("o4-mini")) {
14464
+ return MODEL_PRICING["o4-mini"];
14465
+ }
14466
+ if (lowerModel.includes("o3-mini")) {
14467
+ return MODEL_PRICING["o3-mini"];
14468
+ }
14469
+ if (lowerModel.includes("o3")) {
14470
+ return MODEL_PRICING.o3;
14471
+ }
14472
+ if (lowerModel.includes("o1")) {
14473
+ return MODEL_PRICING.o1;
14474
+ }
14475
+ if (lowerModel.includes("gpt-4-turbo")) {
14476
+ return MODEL_PRICING["gpt-4-turbo"];
14477
+ }
14478
+ if (lowerModel.includes("gpt-4")) {
14479
+ return MODEL_PRICING["gpt-4"];
14480
+ }
14481
+ if (lowerModel.includes("gpt-3.5")) {
14482
+ return MODEL_PRICING["gpt-3.5-turbo"];
14483
+ }
14484
+ if (lowerModel.includes("opus-4.5") || lowerModel.includes("opus-4-5")) {
14485
+ return MODEL_PRICING["claude-opus-4.5"];
14486
+ }
14487
+ if (lowerModel.includes("sonnet-4.5") || lowerModel.includes("sonnet-4-5")) {
14488
+ return MODEL_PRICING["claude-sonnet-4.5"];
14489
+ }
14490
+ if (lowerModel.includes("haiku-4.5") || lowerModel.includes("haiku-4-5")) {
14491
+ return MODEL_PRICING["claude-haiku-4.5"];
14492
+ }
14493
+ if (lowerModel.includes("opus-4.1") || lowerModel.includes("opus-4-1")) {
14494
+ return MODEL_PRICING["claude-opus-4.1"];
14495
+ }
14496
+ if (lowerModel.includes("opus-4")) {
14497
+ return MODEL_PRICING["claude-opus-4"];
14498
+ }
14499
+ if (lowerModel.includes("sonnet-4")) {
14500
+ return MODEL_PRICING["claude-sonnet-4"];
14501
+ }
14502
+ if (lowerModel.includes("sonnet-3.7") || lowerModel.includes("sonnet-3-7")) {
14503
+ return MODEL_PRICING["claude-sonnet-3.7"];
14504
+ }
14505
+ if (lowerModel.includes("claude-3-5-sonnet") || lowerModel.includes("claude-3.5-sonnet")) {
14506
+ return MODEL_PRICING["claude-3.5-sonnet"];
14507
+ }
14508
+ if (lowerModel.includes("claude-3-5-haiku") || lowerModel.includes("claude-3.5-haiku")) {
14509
+ return MODEL_PRICING["claude-3.5-haiku"];
14510
+ }
14511
+ if (lowerModel.includes("claude-3-opus")) {
14512
+ return MODEL_PRICING["claude-3-opus"];
14513
+ }
14514
+ if (lowerModel.includes("claude-3-sonnet")) {
14515
+ return MODEL_PRICING["claude-3-sonnet"];
14516
+ }
14517
+ if (lowerModel.includes("claude-3-haiku")) {
14518
+ return MODEL_PRICING["claude-3-haiku"];
14519
+ }
14520
+ if (lowerModel.includes("claude")) {
14521
+ return MODEL_PRICING["claude-sonnet-4.5"];
14522
+ }
14523
+ return DEFAULT_PRICING;
14524
+ }
14525
+ function estimateCost(promptTokens, completionTokens, model) {
14526
+ const pricing = getModelPricing(model);
14527
+ const promptCostUsd = promptTokens / 1000 * pricing.promptPer1K;
14528
+ const completionCostUsd = completionTokens / 1000 * pricing.completionPer1K;
14529
+ const totalUsd = promptCostUsd + completionCostUsd;
14530
+ return {
14531
+ totalUsd,
14532
+ promptCostUsd,
14533
+ completionCostUsd,
14534
+ model,
14535
+ pricing
14536
+ };
14537
+ }
14538
+ function formatCost(costUsd) {
14539
+ if (costUsd < 0.01) {
14540
+ return `$${(costUsd * 100).toFixed(4)} cents`;
14541
+ }
14542
+ if (costUsd < 1) {
14543
+ return `$${costUsd.toFixed(4)}`;
14544
+ }
14545
+ return `$${costUsd.toFixed(2)}`;
14546
+ }
14547
+ function listKnownModels() {
14548
+ return Object.entries(MODEL_PRICING).map(([model, pricing]) => ({
14549
+ model,
14550
+ pricing
14551
+ }));
14552
+ }
14553
+
14220
14554
  // src/provenance/environment.ts
14221
14555
  function getEnvironmentInfo() {
14222
14556
  return {
@@ -14275,7 +14609,8 @@ function createRunManifest(options) {
14275
14609
  runReason,
14276
14610
  redaction
14277
14611
  } = options;
14278
- const metrics = calculateMetrics(cases);
14612
+ const modelForCost = resolvedConfig?.model || config.model;
14613
+ const metrics = calculateMetrics(cases, modelForCost);
14279
14614
  const git = getGitInfo();
14280
14615
  const environment = getEnvironmentInfo();
14281
14616
  return {
@@ -14299,7 +14634,7 @@ function createRunManifest(options) {
14299
14634
  redaction
14300
14635
  };
14301
14636
  }
14302
- function calculateMetrics(cases) {
14637
+ function calculateMetrics(cases, model) {
14303
14638
  const passedCases = cases.filter((c) => c.ok);
14304
14639
  const latencies = cases.map((c) => c.latencyMs).sort((a, b) => a - b);
14305
14640
  const medianLatency = latencies.length > 0 ? latencies[Math.floor(latencies.length / 2)] : 0;
@@ -14307,6 +14642,21 @@ function calculateMetrics(cases) {
14307
14642
  const p95Latency = latencies.length > 0 ? latencies[p95Index] : 0;
14308
14643
  const totalPromptTokens = cases.reduce((sum, c) => sum + c.tokens.prompt, 0);
14309
14644
  const totalCompletionTokens = cases.reduce((sum, c) => sum + c.tokens.completion, 0);
14645
+ let cost;
14646
+ if (model && (totalPromptTokens > 0 || totalCompletionTokens > 0)) {
14647
+ const costEstimate = estimateCost(totalPromptTokens, totalCompletionTokens, model);
14648
+ const pricing = getModelPricing(model);
14649
+ cost = {
14650
+ total_usd: costEstimate.totalUsd,
14651
+ prompt_cost_usd: costEstimate.promptCostUsd,
14652
+ completion_cost_usd: costEstimate.completionCostUsd,
14653
+ model: costEstimate.model,
14654
+ pricing: {
14655
+ prompt_per_1k: pricing.promptPer1K,
14656
+ completion_per_1k: pricing.completionPer1K
14657
+ }
14658
+ };
14659
+ }
14310
14660
  return {
14311
14661
  success_rate: cases.length > 0 ? passedCases.length / cases.length : 0,
14312
14662
  total_cases: cases.length,
@@ -14316,7 +14666,8 @@ function calculateMetrics(cases) {
14316
14666
  p95_latency_ms: p95Latency,
14317
14667
  total_tokens: totalPromptTokens + totalCompletionTokens,
14318
14668
  total_prompt_tokens: totalPromptTokens,
14319
- total_completion_tokens: totalCompletionTokens
14669
+ total_completion_tokens: totalCompletionTokens,
14670
+ cost
14320
14671
  };
14321
14672
  }
14322
14673
  function detectCIEnvironment() {
@@ -14487,14 +14838,26 @@ function getSuccessRate(manifest) {
14487
14838
  }
14488
14839
  return manifest.metrics.success_rate;
14489
14840
  }
14841
+ function getEstimatedCost(manifest) {
14842
+ const type = getManifestType(manifest);
14843
+ if (type === "stress") {
14844
+ return manifest.metrics.cost?.estimated_total_usd;
14845
+ }
14846
+ if (type === "run") {
14847
+ return manifest.metrics.cost?.total_usd;
14848
+ }
14849
+ return;
14850
+ }
14490
14851
  function getScenario(manifest) {
14491
14852
  return manifest.config.scenario;
14492
14853
  }
14493
14854
 
14494
14855
  class LocalStorageAdapter {
14495
14856
  basePath;
14857
+ baselinesPath;
14496
14858
  constructor(basePath = "./artemis-runs") {
14497
14859
  this.basePath = resolve2(basePath);
14860
+ this.baselinesPath = join2(this.basePath, ".artemis", "baselines.json");
14498
14861
  }
14499
14862
  async save(manifest) {
14500
14863
  const dir = join2(this.basePath, manifest.project);
@@ -14554,13 +14917,17 @@ class LocalStorageAdapter {
14554
14917
  if (options?.scenario && getScenario(manifest) !== options.scenario) {
14555
14918
  continue;
14556
14919
  }
14557
- results.push({
14920
+ const item = {
14558
14921
  runId: manifest.run_id,
14559
14922
  scenario: getScenario(manifest),
14560
14923
  successRate: getSuccessRate(manifest),
14561
14924
  createdAt: manifest.start_time,
14562
14925
  type: manifestType
14563
- });
14926
+ };
14927
+ if (options?.includeCost) {
14928
+ item.estimatedCostUsd = getEstimatedCost(manifest);
14929
+ }
14930
+ results.push(item);
14564
14931
  } catch {}
14565
14932
  }
14566
14933
  }
@@ -14613,6 +14980,89 @@ class LocalStorageAdapter {
14613
14980
  return [];
14614
14981
  }
14615
14982
  }
14983
+ async loadBaselinesFile() {
14984
+ try {
14985
+ const content = await readFile2(this.baselinesPath, "utf-8");
14986
+ return JSON.parse(content);
14987
+ } catch {
14988
+ return { version: "1.0", baselines: {} };
14989
+ }
14990
+ }
14991
+ async saveBaselinesFile(data) {
14992
+ const dir = join2(this.basePath, ".artemis");
14993
+ await mkdir(dir, { recursive: true });
14994
+ await writeFile(this.baselinesPath, JSON.stringify(data, null, 2));
14995
+ }
14996
+ async setBaseline(scenario, runId, tag) {
14997
+ const manifest = await this.loadRun(runId);
14998
+ const scenarioName = scenario || getScenario(manifest);
14999
+ const baseline = {
15000
+ scenario: scenarioName,
15001
+ runId,
15002
+ createdAt: new Date().toISOString(),
15003
+ metrics: {
15004
+ successRate: manifest.metrics.success_rate,
15005
+ medianLatencyMs: manifest.metrics.median_latency_ms,
15006
+ totalTokens: manifest.metrics.total_tokens,
15007
+ passedCases: manifest.metrics.passed_cases,
15008
+ failedCases: manifest.metrics.failed_cases,
15009
+ totalCases: manifest.metrics.total_cases
15010
+ },
15011
+ tag
15012
+ };
15013
+ const data = await this.loadBaselinesFile();
15014
+ data.baselines[scenarioName] = baseline;
15015
+ await this.saveBaselinesFile(data);
15016
+ return baseline;
15017
+ }
15018
+ async getBaseline(scenario) {
15019
+ const data = await this.loadBaselinesFile();
15020
+ return data.baselines[scenario] || null;
15021
+ }
15022
+ async getBaselineByRunId(runId) {
15023
+ const data = await this.loadBaselinesFile();
15024
+ const baselines = Object.values(data.baselines);
15025
+ return baselines.find((b) => b.runId === runId) || null;
15026
+ }
15027
+ async listBaselines() {
15028
+ const data = await this.loadBaselinesFile();
15029
+ return Object.values(data.baselines).sort((a, b) => new Date(b.createdAt).getTime() - new Date(a.createdAt).getTime());
15030
+ }
15031
+ async removeBaseline(scenario) {
15032
+ const data = await this.loadBaselinesFile();
15033
+ if (data.baselines[scenario]) {
15034
+ delete data.baselines[scenario];
15035
+ await this.saveBaselinesFile(data);
15036
+ return true;
15037
+ }
15038
+ return false;
15039
+ }
15040
+ async removeBaselineByRunId(runId) {
15041
+ const data = await this.loadBaselinesFile();
15042
+ const entry = Object.entries(data.baselines).find(([_, b]) => b.runId === runId);
15043
+ if (entry) {
15044
+ delete data.baselines[entry[0]];
15045
+ await this.saveBaselinesFile(data);
15046
+ return true;
15047
+ }
15048
+ return false;
15049
+ }
15050
+ async compareToBaseline(runId, regressionThreshold = 0.05) {
15051
+ const currentManifest = await this.loadRun(runId);
15052
+ const scenario = getScenario(currentManifest);
15053
+ const baseline = await this.getBaseline(scenario);
15054
+ if (!baseline) {
15055
+ return null;
15056
+ }
15057
+ const comparison = await this.compare(baseline.runId, runId);
15058
+ const hasRegression = comparison.delta.successRate < -regressionThreshold;
15059
+ return {
15060
+ baseline,
15061
+ comparison,
15062
+ hasRegression,
15063
+ regressionThreshold
15064
+ };
15065
+ }
14616
15066
  }
14617
15067
 
14618
15068
  // ../../node_modules/.bun/tslib@2.8.1/node_modules/tslib/modules/index.js
@@ -24301,193 +24751,6 @@ class Logger {
24301
24751
  }
24302
24752
  }
24303
24753
  var logger = new Logger("artemis");
24304
- // src/cost/pricing.ts
24305
- var MODEL_PRICING = {
24306
- "gpt-4": {
24307
- promptPer1K: 0.03,
24308
- completionPer1K: 0.06,
24309
- lastUpdated: "2024-01"
24310
- },
24311
- "gpt-4-32k": {
24312
- promptPer1K: 0.06,
24313
- completionPer1K: 0.12,
24314
- lastUpdated: "2024-01"
24315
- },
24316
- "gpt-4-turbo": {
24317
- promptPer1K: 0.01,
24318
- completionPer1K: 0.03,
24319
- lastUpdated: "2024-01"
24320
- },
24321
- "gpt-4-turbo-preview": {
24322
- promptPer1K: 0.01,
24323
- completionPer1K: 0.03,
24324
- lastUpdated: "2024-01"
24325
- },
24326
- "gpt-4o": {
24327
- promptPer1K: 0.005,
24328
- completionPer1K: 0.015,
24329
- lastUpdated: "2024-05"
24330
- },
24331
- "gpt-4o-mini": {
24332
- promptPer1K: 0.00015,
24333
- completionPer1K: 0.0006,
24334
- lastUpdated: "2024-07"
24335
- },
24336
- "gpt-3.5-turbo": {
24337
- promptPer1K: 0.0005,
24338
- completionPer1K: 0.0015,
24339
- lastUpdated: "2024-01"
24340
- },
24341
- "gpt-3.5-turbo-16k": {
24342
- promptPer1K: 0.003,
24343
- completionPer1K: 0.004,
24344
- lastUpdated: "2024-01"
24345
- },
24346
- "claude-3-opus-20240229": {
24347
- promptPer1K: 0.015,
24348
- completionPer1K: 0.075,
24349
- lastUpdated: "2024-03"
24350
- },
24351
- "claude-3-sonnet-20240229": {
24352
- promptPer1K: 0.003,
24353
- completionPer1K: 0.015,
24354
- lastUpdated: "2024-03"
24355
- },
24356
- "claude-3-haiku-20240307": {
24357
- promptPer1K: 0.00025,
24358
- completionPer1K: 0.00125,
24359
- lastUpdated: "2024-03"
24360
- },
24361
- "claude-3-5-sonnet-20240620": {
24362
- promptPer1K: 0.003,
24363
- completionPer1K: 0.015,
24364
- lastUpdated: "2024-06"
24365
- },
24366
- "claude-3-5-sonnet-20241022": {
24367
- promptPer1K: 0.003,
24368
- completionPer1K: 0.015,
24369
- lastUpdated: "2024-10"
24370
- },
24371
- "claude-3-5-haiku-20241022": {
24372
- promptPer1K: 0.0008,
24373
- completionPer1K: 0.004,
24374
- lastUpdated: "2024-10"
24375
- },
24376
- "claude-3-opus": {
24377
- promptPer1K: 0.015,
24378
- completionPer1K: 0.075,
24379
- lastUpdated: "2024-03"
24380
- },
24381
- "claude-3-sonnet": {
24382
- promptPer1K: 0.003,
24383
- completionPer1K: 0.015,
24384
- lastUpdated: "2024-03"
24385
- },
24386
- "claude-3-haiku": {
24387
- promptPer1K: 0.00025,
24388
- completionPer1K: 0.00125,
24389
- lastUpdated: "2024-03"
24390
- },
24391
- "claude-3.5-sonnet": {
24392
- promptPer1K: 0.003,
24393
- completionPer1K: 0.015,
24394
- lastUpdated: "2024-10"
24395
- },
24396
- "claude-3.5-haiku": {
24397
- promptPer1K: 0.0008,
24398
- completionPer1K: 0.004,
24399
- lastUpdated: "2024-10"
24400
- },
24401
- "claude-2": {
24402
- promptPer1K: 0.008,
24403
- completionPer1K: 0.024,
24404
- lastUpdated: "2024-01"
24405
- },
24406
- "claude-instant-1": {
24407
- promptPer1K: 0.0008,
24408
- completionPer1K: 0.0024,
24409
- lastUpdated: "2024-01"
24410
- }
24411
- };
24412
- var DEFAULT_PRICING = {
24413
- promptPer1K: 0.01,
24414
- completionPer1K: 0.03,
24415
- lastUpdated: "2024-01",
24416
- notes: "Default pricing - verify with provider"
24417
- };
24418
- function getModelPricing(model) {
24419
- if (MODEL_PRICING[model]) {
24420
- return MODEL_PRICING[model];
24421
- }
24422
- const lowerModel = model.toLowerCase();
24423
- for (const [key, pricing] of Object.entries(MODEL_PRICING)) {
24424
- if (key.toLowerCase() === lowerModel) {
24425
- return pricing;
24426
- }
24427
- }
24428
- if (lowerModel.includes("gpt-4o-mini")) {
24429
- return MODEL_PRICING["gpt-4o-mini"];
24430
- }
24431
- if (lowerModel.includes("gpt-4o")) {
24432
- return MODEL_PRICING["gpt-4o"];
24433
- }
24434
- if (lowerModel.includes("gpt-4-turbo")) {
24435
- return MODEL_PRICING["gpt-4-turbo"];
24436
- }
24437
- if (lowerModel.includes("gpt-4")) {
24438
- return MODEL_PRICING["gpt-4"];
24439
- }
24440
- if (lowerModel.includes("gpt-3.5")) {
24441
- return MODEL_PRICING["gpt-3.5-turbo"];
24442
- }
24443
- if (lowerModel.includes("claude-3-5-sonnet") || lowerModel.includes("claude-3.5-sonnet")) {
24444
- return MODEL_PRICING["claude-3.5-sonnet"];
24445
- }
24446
- if (lowerModel.includes("claude-3-5-haiku") || lowerModel.includes("claude-3.5-haiku")) {
24447
- return MODEL_PRICING["claude-3.5-haiku"];
24448
- }
24449
- if (lowerModel.includes("claude-3-opus")) {
24450
- return MODEL_PRICING["claude-3-opus"];
24451
- }
24452
- if (lowerModel.includes("claude-3-sonnet")) {
24453
- return MODEL_PRICING["claude-3-sonnet"];
24454
- }
24455
- if (lowerModel.includes("claude-3-haiku")) {
24456
- return MODEL_PRICING["claude-3-haiku"];
24457
- }
24458
- if (lowerModel.includes("claude")) {
24459
- return MODEL_PRICING["claude-2"];
24460
- }
24461
- return DEFAULT_PRICING;
24462
- }
24463
- function estimateCost(promptTokens, completionTokens, model) {
24464
- const pricing = getModelPricing(model);
24465
- const promptCostUsd = promptTokens / 1000 * pricing.promptPer1K;
24466
- const completionCostUsd = completionTokens / 1000 * pricing.completionPer1K;
24467
- const totalUsd = promptCostUsd + completionCostUsd;
24468
- return {
24469
- totalUsd,
24470
- promptCostUsd,
24471
- completionCostUsd,
24472
- model,
24473
- pricing
24474
- };
24475
- }
24476
- function formatCost(costUsd) {
24477
- if (costUsd < 0.01) {
24478
- return `$${(costUsd * 100).toFixed(4)} cents`;
24479
- }
24480
- if (costUsd < 1) {
24481
- return `$${costUsd.toFixed(4)}`;
24482
- }
24483
- return `$${costUsd.toFixed(2)}`;
24484
- }
24485
- function listKnownModels() {
24486
- return Object.entries(MODEL_PRICING).map(([model, pricing]) => ({
24487
- model,
24488
- pricing
24489
- }));
24490
- }
24491
24754
  export {
24492
24755
  wrapError,
24493
24756
  validateScenario,