@artemiskit/core 0.2.0 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,47 @@
1
1
  # @artemiskit/core
2
2
 
3
+ ## 0.2.2
4
+
5
+ ### Patch Changes
6
+
7
+ - d5ca7c6: Add baseline command and CI mode for regression detection
8
+
9
+ ### New Features
10
+
11
+ - **Baseline Command**: New `akit baseline` command with `set`, `list`, `get`, `remove` subcommands
12
+
13
+ - Lookup by run ID (default) or scenario name (`--scenario` flag)
14
+ - Store and manage baseline metrics for regression comparison
15
+
16
+ - **CI Mode**: New `--ci` flag for machine-readable output
17
+
18
+ - Outputs environment variable format for easy parsing
19
+ - Auto-detects CI environments (GitHub Actions, GitLab CI, etc.)
20
+ - Suppresses colors and spinners
21
+
22
+ - **Summary Formats**: New `--summary` flag with `json`, `text`, `security` formats
23
+
24
+ - JSON summary for pipeline parsing
25
+ - Security summary for compliance reporting
26
+
27
+ - **Regression Detection**: New `--baseline` and `--threshold` flags
28
+ - Compare runs against saved baselines
29
+ - Configurable regression threshold (default 5%)
30
+ - Exit code 1 on regression detection
31
+
32
+ ## 0.2.1
33
+
34
+ ### Patch Changes
35
+
36
+ - fix: improve LLM grader compatibility with reasoning models
37
+
38
+ - Remove temperature parameter from LLM grader (reasoning models like o1, o3, gpt-5-mini only support temperature=1)
39
+ - Increase maxTokens from 200 to 1000 to accommodate reasoning models that use tokens for internal thinking
40
+ - Improve grader prompt for stricter JSON-only output format
41
+ - Add fallback parsing for malformed JSON responses
42
+ - Add markdown code block stripping from grader responses
43
+ - Add `modelFamily` configuration option to Azure OpenAI provider for correct parameter detection when deployment names differ from model names
44
+
3
45
  ## 0.2.0
4
46
 
5
47
  ### Minor Changes
@@ -131,6 +131,11 @@ export interface AzureOpenAIAdapterConfig extends BaseAdapterConfig {
131
131
  apiVersion: string;
132
132
  /** Optional separate deployment name for embedding models */
133
133
  embeddingDeploymentName?: string;
134
+ /**
135
+ * Model family for parameter detection (e.g., 'gpt-5-mini' when deployment is '5-mini')
136
+ * Used to determine which API parameters to use (max_tokens vs max_completion_tokens)
137
+ */
138
+ modelFamily?: string;
134
139
  }
135
140
  /**
136
141
  * Vercel AI SDK configuration
@@ -1 +1 @@
1
- {"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/adapters/types.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH;;GAEG;AACH,MAAM,WAAW,WAAW;IAC1B,IAAI,EAAE,QAAQ,GAAG,MAAM,GAAG,WAAW,GAAG,UAAU,GAAG,MAAM,CAAC;IAC5D,OAAO,EAAE,MAAM,CAAC;IAChB,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,aAAa,CAAC,EAAE;QACd,IAAI,EAAE,MAAM,CAAC;QACb,SAAS,EAAE,MAAM,CAAC;KACnB,CAAC;IACF,UAAU,CAAC,EAAE,QAAQ,EAAE,CAAC;CACzB;AAED,MAAM,WAAW,QAAQ;IACvB,EAAE,EAAE,MAAM,CAAC;IACX,IAAI,EAAE,UAAU,CAAC;IACjB,QAAQ,EAAE;QACR,IAAI,EAAE,MAAM,CAAC;QACb,SAAS,EAAE,MAAM,CAAC;KACnB,CAAC;CACH;AAED;;GAEG;AACH,MAAM,WAAW,eAAe;IAC9B,MAAM,EAAE,MAAM,GAAG,WAAW,EAAE,CAAC;IAC/B,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,IAAI,CAAC,EAAE,MAAM,EAAE,CAAC;IAChB,SAAS,CAAC,EAAE,kBAAkB,EAAE,CAAC;IACjC,KAAK,CAAC,EAAE,cAAc,EAAE,CAAC;IACzB,cAAc,CAAC,EAAE;QAAE,IAAI,EAAE,MAAM,GAAG,aAAa,CAAA;KAAE,CAAC;IAClD,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;CACpC;AAED;;GAEG;AACH,MAAM,WAAW,kBAAkB;IACjC,IAAI,EAAE,MAAM,CAAC;IACb,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;CACrC;AAED,MAAM,WAAW,cAAc;IAC7B,IAAI,EAAE,UAAU,CAAC;IACjB,QAAQ,EAAE,kBAAkB,CAAC;CAC9B;AAED;;GAEG;AACH,MAAM,WAAW,UAAU;IACzB,MAAM,EAAE,MAAM,CAAC;IACf,UAAU,EAAE,MAAM,CAAC;IACnB,KAAK,EAAE,MAAM,CAAC;CACf;AAED;;GAEG;AACH,MAAM,WAAW,cAAc;IAC7B,EAAE,EAAE,MAAM,CAAC;IACX,KAAK,EAAE,MAAM,CAAC;IACd,IAAI,EAAE,MAAM,CAAC;IACb,MAAM,EAAE,UAAU,CAAC;IACnB,SAAS,EAAE,MAAM,CAAC;IAClB,YAAY,CAAC,EAAE,MAAM,GAAG,QAAQ,GAAG,eAAe,GAAG,YAAY,GAAG,gBAAgB,CAAC;IACrF,YAAY,CAAC,EAAE;QACb,IAAI,EAAE,MAAM,CAAC;QACb,SAAS,EAAE,MAAM,CAAC;KACnB,CAAC;IACF,SAAS,CAAC,EAAE,QAAQ,EAAE,CAAC;IACvB,GAAG,CAAC,EAAE,OAAO,CAAC;CACf;AAED;;GAEG;AACH,MAAM,WAAW,iBAAiB;IAChC,SAAS,EAAE,OAAO,CAAC;IACnB,eAAe,EAAE,OAAO,CAAC;IACzB,OAAO,EAAE,OAAO,CAAC;IACjB,UAAU,EAAE,MAAM,CAAC;IACnB,MAAM,CAAC,EAAE,OAAO,CAAC;IACjB,QAAQ,CAAC,EAAE,OAAO,CAAC;CACpB;AAED;;GAEG;AACH,MAAM,WAAW,WAAW;IAC1B,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAC;IAE1B,QAAQ,CAAC,OAAO,EAAE,eAAe,GAAG,OAAO,CAAC,cAAc,CAAC,CAAC;IAE5D,MAAM,CAAC,CAAC,OAAO,EAAE,eAAe,EAAE,OAAO,EAAE,CAAC,KAAK,EAAE,MAAM,KAAK,IAAI,GAAG,aAAa,CAAC,MAAM,CAAC,CAAC;IAE3F,KAAK,CAAC,CAAC,IAAI,EAAE,MAAM,EAAE,KAAK,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC,CAAC;IAExD,YAAY,IAAI,OAAO,CAAC,iBAAiB,CAAC,CAAC;IAE3C,KAAK,CAAC,IAAI,OAAO,CAAC,IAAI,CAAC,CAAC;CACzB;AAED;;GAEG;AACH,MAAM,MAAM,YAAY,GACpB,QAAQ,GACR,cAAc,GACd,WAAW,GACX,WAAW,GACX,QAAQ,GACR,SAAS,GACT,QAAQ,GACR,aAAa,GACb,QAAQ,GACR,QAAQ,CAAC;AAEb;;GAEG;AACH,MAAM,WAAW,iBAAiB;IAChC,QAAQ,EAAE,YAAY,CAAC;IACvB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,UAAU,CAAC,EAAE,MAAM,CAAC;CACrB;AAED;;GAEG;AACH,MAAM,WAAW,mBAAoB,SAAQ,iBAAiB;IAC5D,QAAQ,EAAE,QAAQ,CAAC;IACnB,YAAY,CAAC,EAAE,MAAM,CAAC;CACvB;AAED;;GAEG;AACH,MAAM,WAAW,wBAAyB,SAAQ,iBAAiB;IACjE,QAAQ,EAAE,cAAc,CAAC;IACzB,YAAY,EAAE,MAAM,CAAC;IACrB,cAAc,EAAE,MAAM,CAAC;IACvB,UAAU,EAAE,MAAM,CAAC;IACnB,6DAA6D;IAC7D,uBAAuB,CAAC,EAAE,MAAM,CAAC;CAClC;AAED;;GAEG;AACH,MAAM,WAAW,qBAAsB,SAAQ,iBAAiB;IAC9D,QAAQ,EAAE,WAAW,CAAC;IACtB,kBAAkB,EAAE,QAAQ,GAAG,OAAO,GAAG,WAAW,GAAG,QAAQ,GAAG,SAAS,CAAC;IAC5E,cAAc,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;CAC1C;AAED;;GAEG;AACH,MAAM,WAAW,sBAAuB,SAAQ,iBAAiB;IAC/D,QAAQ,EAAE,WAAW,CAAC;CACvB;AAED;;GAEG;AACH,MAAM,MAAM,aAAa,GACrB,mBAAmB,GACnB,wBAAwB,GACxB,qBAAqB,GACrB,sBAAsB,GACtB,iBAAiB,CAAC"}
1
+ {"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/adapters/types.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH;;GAEG;AACH,MAAM,WAAW,WAAW;IAC1B,IAAI,EAAE,QAAQ,GAAG,MAAM,GAAG,WAAW,GAAG,UAAU,GAAG,MAAM,CAAC;IAC5D,OAAO,EAAE,MAAM,CAAC;IAChB,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,aAAa,CAAC,EAAE;QACd,IAAI,EAAE,MAAM,CAAC;QACb,SAAS,EAAE,MAAM,CAAC;KACnB,CAAC;IACF,UAAU,CAAC,EAAE,QAAQ,EAAE,CAAC;CACzB;AAED,MAAM,WAAW,QAAQ;IACvB,EAAE,EAAE,MAAM,CAAC;IACX,IAAI,EAAE,UAAU,CAAC;IACjB,QAAQ,EAAE;QACR,IAAI,EAAE,MAAM,CAAC;QACb,SAAS,EAAE,MAAM,CAAC;KACnB,CAAC;CACH;AAED;;GAEG;AACH,MAAM,WAAW,eAAe;IAC9B,MAAM,EAAE,MAAM,GAAG,WAAW,EAAE,CAAC;IAC/B,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,IAAI,CAAC,EAAE,MAAM,EAAE,CAAC;IAChB,SAAS,CAAC,EAAE,kBAAkB,EAAE,CAAC;IACjC,KAAK,CAAC,EAAE,cAAc,EAAE,CAAC;IACzB,cAAc,CAAC,EAAE;QAAE,IAAI,EAAE,MAAM,GAAG,aAAa,CAAA;KAAE,CAAC;IAClD,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;CACpC;AAED;;GAEG;AACH,MAAM,WAAW,kBAAkB;IACjC,IAAI,EAAE,MAAM,CAAC;IACb,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;CACrC;AAED,MAAM,WAAW,cAAc;IAC7B,IAAI,EAAE,UAAU,CAAC;IACjB,QAAQ,EAAE,kBAAkB,CAAC;CAC9B;AAED;;GAEG;AACH,MAAM,WAAW,UAAU;IACzB,MAAM,EAAE,MAAM,CAAC;IACf,UAAU,EAAE,MAAM,CAAC;IACnB,KAAK,EAAE,MAAM,CAAC;CACf;AAED;;GAEG;AACH,MAAM,WAAW,cAAc;IAC7B,EAAE,EAAE,MAAM,CAAC;IACX,KAAK,EAAE,MAAM,CAAC;IACd,IAAI,EAAE,MAAM,CAAC;IACb,MAAM,EAAE,UAAU,CAAC;IACnB,SAAS,EAAE,MAAM,CAAC;IAClB,YAAY,CAAC,EAAE,MAAM,GAAG,QAAQ,GAAG,eAAe,GAAG,YAAY,GAAG,gBAAgB,CAAC;IACrF,YAAY,CAAC,EAAE;QACb,IAAI,EAAE,MAAM,CAAC;QACb,SAAS,EAAE,MAAM,CAAC;KACnB,CAAC;IACF,SAAS,CAAC,EAAE,QAAQ,EAAE,CAAC;IACvB,GAAG,CAAC,EAAE,OAAO,CAAC;CACf;AAED;;GAEG;AACH,MAAM,WAAW,iBAAiB;IAChC,SAAS,EAAE,OAAO,CAAC;IACnB,eAAe,EAAE,OAAO,CAAC;IACzB,OAAO,EAAE,OAAO,CAAC;IACjB,UAAU,EAAE,MAAM,CAAC;IACnB,MAAM,CAAC,EAAE,OAAO,CAAC;IACjB,QAAQ,CAAC,EAAE,OAAO,CAAC;CACpB;AAED;;GAEG;AACH,MAAM,WAAW,WAAW;IAC1B,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAC;IAE1B,QAAQ,CAAC,OAAO,EAAE,eAAe,GAAG,OAAO,CAAC,cAAc,CAAC,CAAC;IAE5D,MAAM,CAAC,CAAC,OAAO,EAAE,eAAe,EAAE,OAAO,EAAE,CAAC,KAAK,EAAE,MAAM,KAAK,IAAI,GAAG,aAAa,CAAC,MAAM,CAAC,CAAC;IAE3F,KAAK,CAAC,CAAC,IAAI,EAAE,MAAM,EAAE,KAAK,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC,CAAC;IAExD,YAAY,IAAI,OAAO,CAAC,iBAAiB,CAAC,CAAC;IAE3C,KAAK,CAAC,IAAI,OAAO,CAAC,IAAI,CAAC,CAAC;CACzB;AAED;;GAEG;AACH,MAAM,MAAM,YAAY,GACpB,QAAQ,GACR,cAAc,GACd,WAAW,GACX,WAAW,GACX,QAAQ,GACR,SAAS,GACT,QAAQ,GACR,aAAa,GACb,QAAQ,GACR,QAAQ,CAAC;AAEb;;GAEG;AACH,MAAM,WAAW,iBAAiB;IAChC,QAAQ,EAAE,YAAY,CAAC;IACvB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,UAAU,CAAC,EAAE,MAAM,CAAC;CACrB;AAED;;GAEG;AACH,MAAM,WAAW,mBAAoB,SAAQ,iBAAiB;IAC5D,QAAQ,EAAE,QAAQ,CAAC;IACnB,YAAY,CAAC,EAAE,MAAM,CAAC;CACvB;AAED;;GAEG;AACH,MAAM,WAAW,wBAAyB,SAAQ,iBAAiB;IACjE,QAAQ,EAAE,cAAc,CAAC;IACzB,YAAY,EAAE,MAAM,CAAC;IACrB,cAAc,EAAE,MAAM,CAAC;IACvB,UAAU,EAAE,MAAM,CAAC;IACnB,6DAA6D;IAC7D,uBAAuB,CAAC,EAAE,MAAM,CAAC;IACjC;;;OAGG;IACH,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB;AAED;;GAEG;AACH,MAAM,WAAW,qBAAsB,SAAQ,iBAAiB;IAC9D,QAAQ,EAAE,WAAW,CAAC;IACtB,kBAAkB,EAAE,QAAQ,GAAG,OAAO,GAAG,WAAW,GAAG,QAAQ,GAAG,SAAS,CAAC;IAC5E,cAAc,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;CAC1C;AAED;;GAEG;AACH,MAAM,WAAW,sBAAuB,SAAQ,iBAAiB;IAC/D,QAAQ,EAAE,WAAW,CAAC;CACvB;AAED;;GAEG;AACH,MAAM,MAAM,aAAa,GACrB,mBAAmB,GACnB,wBAAwB,GACxB,qBAAqB,GACrB,sBAAsB,GACtB,iBAAiB,CAAC"}
@@ -3,6 +3,7 @@
3
3
  *
4
4
  * Pricing is per 1,000 tokens (1K tokens) in USD
5
5
  * Data is updated periodically - always verify with provider's official pricing
6
+ * Last comprehensive update: January 2026
6
7
  */
7
8
  export interface ModelPricing {
8
9
  /** Price per 1K prompt/input tokens in USD */
@@ -33,7 +34,7 @@ export interface CostEstimate {
33
34
  export declare const MODEL_PRICING: Record<string, ModelPricing>;
34
35
  /**
35
36
  * Default pricing for unknown models
36
- * Uses conservative estimates
37
+ * Uses conservative estimates based on mid-tier model pricing
37
38
  */
38
39
  export declare const DEFAULT_PRICING: ModelPricing;
39
40
  /**
@@ -1 +1 @@
1
- {"version":3,"file":"pricing.d.ts","sourceRoot":"","sources":["../../src/cost/pricing.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,MAAM,WAAW,YAAY;IAC3B,8CAA8C;IAC9C,WAAW,EAAE,MAAM,CAAC;IACpB,mDAAmD;IACnD,eAAe,EAAE,MAAM,CAAC;IACxB,wBAAwB;IACxB,WAAW,EAAE,MAAM,CAAC;IACpB,8BAA8B;IAC9B,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAED,MAAM,WAAW,YAAY;IAC3B,kCAAkC;IAClC,QAAQ,EAAE,MAAM,CAAC;IACjB,6BAA6B;IAC7B,aAAa,EAAE,MAAM,CAAC;IACtB,iCAAiC;IACjC,iBAAiB,EAAE,MAAM,CAAC;IAC1B,iCAAiC;IACjC,KAAK,EAAE,MAAM,CAAC;IACd,mBAAmB;IACnB,OAAO,EAAE,YAAY,CAAC;CACvB;AAED;;;GAGG;AACH,eAAO,MAAM,aAAa,EAAE,MAAM,CAAC,MAAM,EAAE,YAAY,CAqHtD,CAAC;AAEF;;;GAGG;AACH,eAAO,MAAM,eAAe,EAAE,YAK7B,CAAC;AAEF;;;;GAIG;AACH,wBAAgB,eAAe,CAAC,KAAK,EAAE,MAAM,GAAG,YAAY,CAkD3D;AAED;;;;;;GAMG;AACH,wBAAgB,YAAY,CAC1B,YAAY,EAAE,MAAM,EACpB,gBAAgB,EAAE,MAAM,EACxB,KAAK,EAAE,MAAM,GACZ,YAAY,CAcd;AAED;;;;GAIG;AACH,wBAAgB,UAAU,CAAC,OAAO,EAAE,MAAM,GAAG,MAAM,CAQlD;AAED;;GAEG;AACH,wBAAgB,eAAe,IAAI,KAAK,CAAC;IAAE,KAAK,EAAE,MAAM,CAAC;IAAC,OAAO,EAAE,YAAY,CAAA;CAAE,CAAC,CAKjF"}
1
+ {"version":3,"file":"pricing.d.ts","sourceRoot":"","sources":["../../src/cost/pricing.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAEH,MAAM,WAAW,YAAY;IAC3B,8CAA8C;IAC9C,WAAW,EAAE,MAAM,CAAC;IACpB,mDAAmD;IACnD,eAAe,EAAE,MAAM,CAAC;IACxB,wBAAwB;IACxB,WAAW,EAAE,MAAM,CAAC;IACpB,8BAA8B;IAC9B,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAED,MAAM,WAAW,YAAY;IAC3B,kCAAkC;IAClC,QAAQ,EAAE,MAAM,CAAC;IACjB,6BAA6B;IAC7B,aAAa,EAAE,MAAM,CAAC;IACtB,iCAAiC;IACjC,iBAAiB,EAAE,MAAM,CAAC;IAC1B,iCAAiC;IACjC,KAAK,EAAE,MAAM,CAAC;IACd,mBAAmB;IACnB,OAAO,EAAE,YAAY,CAAC;CACvB;AAED;;;GAGG;AACH,eAAO,MAAM,aAAa,EAAE,MAAM,CAAC,MAAM,EAAE,YAAY,CAuNtD,CAAC;AAEF;;;GAGG;AACH,eAAO,MAAM,eAAe,EAAE,YAK7B,CAAC;AAEF;;;;GAIG;AACH,wBAAgB,eAAe,CAAC,KAAK,EAAE,MAAM,GAAG,YAAY,CAgI3D;AAED;;;;;;GAMG;AACH,wBAAgB,YAAY,CAC1B,YAAY,EAAE,MAAM,EACpB,gBAAgB,EAAE,MAAM,EACxB,KAAK,EAAE,MAAM,GACZ,YAAY,CAcd;AAED;;;;GAIG;AACH,wBAAgB,UAAU,CAAC,OAAO,EAAE,MAAM,GAAG,MAAM,CAQlD;AAED;;GAEG;AACH,wBAAgB,eAAe,IAAI,KAAK,CAAC;IAAE,KAAK,EAAE,MAAM,CAAC;IAAC,OAAO,EAAE,YAAY,CAAA;CAAE,CAAC,CAKjF"}
@@ -1 +1 @@
1
- {"version":3,"file":"llm-grader.d.ts","sourceRoot":"","sources":["../../src/evaluators/llm-grader.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,oBAAoB,CAAC;AACnD,OAAO,KAAK,EAAE,SAAS,EAAE,gBAAgB,EAAE,eAAe,EAAE,MAAM,SAAS,CAAC;AAmB5E,qBAAa,kBAAmB,YAAW,SAAS;IAClD,QAAQ,CAAC,IAAI,gBAAgB;IAEvB,QAAQ,CACZ,QAAQ,EAAE,MAAM,EAChB,QAAQ,EAAE,QAAQ,EAClB,OAAO,CAAC,EAAE,gBAAgB,GACzB,OAAO,CAAC,eAAe,CAAC;IA8C3B,OAAO,CAAC,mBAAmB;CAsB5B"}
1
+ {"version":3,"file":"llm-grader.d.ts","sourceRoot":"","sources":["../../src/evaluators/llm-grader.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,oBAAoB,CAAC;AACnD,OAAO,KAAK,EAAE,SAAS,EAAE,gBAAgB,EAAE,eAAe,EAAE,MAAM,SAAS,CAAC;AAwB5E,qBAAa,kBAAmB,YAAW,SAAS;IAClD,QAAQ,CAAC,IAAI,gBAAgB;IAEvB,QAAQ,CACZ,QAAQ,EAAE,MAAM,EAChB,QAAQ,EAAE,QAAQ,EAClB,OAAO,CAAC,EAAE,gBAAgB,GACzB,OAAO,CAAC,eAAe,CAAC;IAgD3B,OAAO,CAAC,mBAAmB;CA+C5B"}
package/dist/index.js CHANGED
@@ -11564,8 +11564,7 @@ class LLMGraderEvaluator {
11564
11564
  const result = await context.client.generate({
11565
11565
  prompt,
11566
11566
  model: expected.model,
11567
- temperature: 0,
11568
- maxTokens: 200
11567
+ maxTokens: 1000
11569
11568
  });
11570
11569
  const parsed = this.parseGraderResponse(result.text);
11571
11570
  const passed = parsed.score >= expected.threshold;
@@ -11590,9 +11589,17 @@ class LLMGraderEvaluator {
11590
11589
  }
11591
11590
  }
11592
11591
  parseGraderResponse(text) {
11593
- const jsonMatch = text.match(/\{[\s\S]*?\}/);
11592
+ const cleanedText = text.replace(/```json\s*/gi, "").replace(/```\s*/g, "").trim();
11593
+ const jsonMatch = cleanedText.match(/\{[\s\S]*?\}/);
11594
11594
  if (!jsonMatch) {
11595
- throw new Error("No JSON found in grader response");
11595
+ const scoreMatch = cleanedText.match(/(?:score[:\s]*)?(\d+\.?\d*)/i);
11596
+ if (scoreMatch) {
11597
+ const score = Number(scoreMatch[1]);
11598
+ if (!Number.isNaN(score) && score >= 0 && score <= 1) {
11599
+ return { score, reason: cleanedText };
11600
+ }
11601
+ }
11602
+ throw new Error(`No JSON found in grader response: ${text.substring(0, 100)}...`);
11596
11603
  }
11597
11604
  try {
11598
11605
  const parsed = JSON.parse(jsonMatch[0]);
@@ -11605,26 +11612,39 @@ class LLMGraderEvaluator {
11605
11612
  reason: parsed.reason
11606
11613
  };
11607
11614
  } catch (error) {
11615
+ const scoreMatch = jsonMatch[0].match(/"score"[:\s]*(\d+\.?\d*)/i);
11616
+ if (scoreMatch) {
11617
+ const score = Number(scoreMatch[1]);
11618
+ if (!Number.isNaN(score) && score >= 0 && score <= 1) {
11619
+ const reasonMatch = jsonMatch[0].match(/"reason"[:\s]*"([^"]+)"/i);
11620
+ return { score, reason: reasonMatch?.[1] };
11621
+ }
11622
+ }
11608
11623
  throw new Error(`Failed to parse grader response: ${error.message}`);
11609
11624
  }
11610
11625
  }
11611
11626
  }
11612
- var GRADER_PROMPT = `You are an evaluator grading an AI response based on a rubric.
11627
+ var GRADER_PROMPT = `You are a strict JSON-only evaluator. You grade AI responses based on rubrics.
11613
11628
 
11614
- ## RUBRIC
11629
+ RUBRIC:
11615
11630
  {{rubric}}
11616
11631
 
11617
- ## RESPONSE TO EVALUATE
11632
+ RESPONSE TO EVALUATE:
11618
11633
  {{response}}
11619
11634
 
11620
- ## INSTRUCTIONS
11621
- Score the response from 0.0 to 1.0 based on the rubric.
11622
- Be objective and consistent in your scoring.
11635
+ TASK: Score the response from 0.0 to 1.0 based on the rubric above.
11623
11636
 
11624
- Respond with ONLY a JSON object in this exact format:
11625
- {"score": <number between 0 and 1>, "reason": "<brief explanation of score>"}
11637
+ OUTPUT FORMAT: You MUST respond with ONLY this exact JSON structure, nothing else:
11638
+ {"score":0.0,"reason":"explanation"}
11639
+
11640
+ RULES:
11641
+ - Output ONLY valid JSON, no markdown, no code blocks, no extra text
11642
+ - "score" must be a number between 0.0 and 1.0
11643
+ - "reason" must be a brief string explaining the score
11644
+ - Do NOT wrap in \`\`\`json or any formatting
11645
+ - Your entire response must be parseable by JSON.parse()
11626
11646
 
11627
- Do not include any other text, markdown, or formatting.`;
11647
+ JSON OUTPUT:`;
11628
11648
 
11629
11649
  // src/evaluators/not-contains.ts
11630
11650
  class NotContainsEvaluator {
@@ -13487,6 +13507,7 @@ var ProviderConfigSchema = exports_external.object({
13487
13507
  deploymentName: exports_external.string().optional(),
13488
13508
  apiVersion: exports_external.string().optional(),
13489
13509
  embeddingDeploymentName: exports_external.string().optional(),
13510
+ modelFamily: exports_external.string().optional(),
13490
13511
  underlyingProvider: exports_external.enum(["openai", "azure", "anthropic", "google", "mistral"]).optional()
13491
13512
  }).optional();
13492
13513
  var BaseExpectedSchema = exports_external.discriminatedUnion("type", [
@@ -14493,8 +14514,10 @@ function getScenario(manifest) {
14493
14514
 
14494
14515
  class LocalStorageAdapter {
14495
14516
  basePath;
14517
+ baselinesPath;
14496
14518
  constructor(basePath = "./artemis-runs") {
14497
14519
  this.basePath = resolve2(basePath);
14520
+ this.baselinesPath = join2(this.basePath, ".artemis", "baselines.json");
14498
14521
  }
14499
14522
  async save(manifest) {
14500
14523
  const dir = join2(this.basePath, manifest.project);
@@ -14613,6 +14636,89 @@ class LocalStorageAdapter {
14613
14636
  return [];
14614
14637
  }
14615
14638
  }
14639
+ async loadBaselinesFile() {
14640
+ try {
14641
+ const content = await readFile2(this.baselinesPath, "utf-8");
14642
+ return JSON.parse(content);
14643
+ } catch {
14644
+ return { version: "1.0", baselines: {} };
14645
+ }
14646
+ }
14647
+ async saveBaselinesFile(data) {
14648
+ const dir = join2(this.basePath, ".artemis");
14649
+ await mkdir(dir, { recursive: true });
14650
+ await writeFile(this.baselinesPath, JSON.stringify(data, null, 2));
14651
+ }
14652
+ async setBaseline(scenario, runId, tag) {
14653
+ const manifest = await this.loadRun(runId);
14654
+ const scenarioName = scenario || getScenario(manifest);
14655
+ const baseline = {
14656
+ scenario: scenarioName,
14657
+ runId,
14658
+ createdAt: new Date().toISOString(),
14659
+ metrics: {
14660
+ successRate: manifest.metrics.success_rate,
14661
+ medianLatencyMs: manifest.metrics.median_latency_ms,
14662
+ totalTokens: manifest.metrics.total_tokens,
14663
+ passedCases: manifest.metrics.passed_cases,
14664
+ failedCases: manifest.metrics.failed_cases,
14665
+ totalCases: manifest.metrics.total_cases
14666
+ },
14667
+ tag
14668
+ };
14669
+ const data = await this.loadBaselinesFile();
14670
+ data.baselines[scenarioName] = baseline;
14671
+ await this.saveBaselinesFile(data);
14672
+ return baseline;
14673
+ }
14674
+ async getBaseline(scenario) {
14675
+ const data = await this.loadBaselinesFile();
14676
+ return data.baselines[scenario] || null;
14677
+ }
14678
+ async getBaselineByRunId(runId) {
14679
+ const data = await this.loadBaselinesFile();
14680
+ const baselines = Object.values(data.baselines);
14681
+ return baselines.find((b) => b.runId === runId) || null;
14682
+ }
14683
+ async listBaselines() {
14684
+ const data = await this.loadBaselinesFile();
14685
+ return Object.values(data.baselines).sort((a, b) => new Date(b.createdAt).getTime() - new Date(a.createdAt).getTime());
14686
+ }
14687
+ async removeBaseline(scenario) {
14688
+ const data = await this.loadBaselinesFile();
14689
+ if (data.baselines[scenario]) {
14690
+ delete data.baselines[scenario];
14691
+ await this.saveBaselinesFile(data);
14692
+ return true;
14693
+ }
14694
+ return false;
14695
+ }
14696
+ async removeBaselineByRunId(runId) {
14697
+ const data = await this.loadBaselinesFile();
14698
+ const entry = Object.entries(data.baselines).find(([_, b]) => b.runId === runId);
14699
+ if (entry) {
14700
+ delete data.baselines[entry[0]];
14701
+ await this.saveBaselinesFile(data);
14702
+ return true;
14703
+ }
14704
+ return false;
14705
+ }
14706
+ async compareToBaseline(runId, regressionThreshold = 0.05) {
14707
+ const currentManifest = await this.loadRun(runId);
14708
+ const scenario = getScenario(currentManifest);
14709
+ const baseline = await this.getBaseline(scenario);
14710
+ if (!baseline) {
14711
+ return null;
14712
+ }
14713
+ const comparison = await this.compare(baseline.runId, runId);
14714
+ const hasRegression = comparison.delta.successRate < -regressionThreshold;
14715
+ return {
14716
+ baseline,
14717
+ comparison,
14718
+ hasRegression,
14719
+ regressionThreshold
14720
+ };
14721
+ }
14616
14722
  }
14617
14723
 
14618
14724
  // ../../node_modules/.bun/tslib@2.8.1/node_modules/tslib/modules/index.js
@@ -24303,116 +24409,184 @@ class Logger {
24303
24409
  var logger = new Logger("artemis");
24304
24410
  // src/cost/pricing.ts
24305
24411
  var MODEL_PRICING = {
24306
- "gpt-4": {
24307
- promptPer1K: 0.03,
24308
- completionPer1K: 0.06,
24309
- lastUpdated: "2024-01"
24412
+ "gpt-5": {
24413
+ promptPer1K: 0.00125,
24414
+ completionPer1K: 0.01,
24415
+ lastUpdated: "2026-01",
24416
+ notes: "400K context window"
24310
24417
  },
24311
- "gpt-4-32k": {
24312
- promptPer1K: 0.06,
24313
- completionPer1K: 0.12,
24314
- lastUpdated: "2024-01"
24418
+ "gpt-5.1": {
24419
+ promptPer1K: 0.00125,
24420
+ completionPer1K: 0.01,
24421
+ lastUpdated: "2026-01"
24315
24422
  },
24316
- "gpt-4-turbo": {
24317
- promptPer1K: 0.01,
24318
- completionPer1K: 0.03,
24319
- lastUpdated: "2024-01"
24423
+ "gpt-5.2": {
24424
+ promptPer1K: 0.00175,
24425
+ completionPer1K: 0.014,
24426
+ lastUpdated: "2026-01"
24320
24427
  },
24321
- "gpt-4-turbo-preview": {
24322
- promptPer1K: 0.01,
24323
- completionPer1K: 0.03,
24324
- lastUpdated: "2024-01"
24428
+ "gpt-5-mini": {
24429
+ promptPer1K: 0.00025,
24430
+ completionPer1K: 0.002,
24431
+ lastUpdated: "2026-01"
24432
+ },
24433
+ "gpt-5-nano": {
24434
+ promptPer1K: 0.00005,
24435
+ completionPer1K: 0.0004,
24436
+ lastUpdated: "2026-01"
24437
+ },
24438
+ "gpt-4.1": {
24439
+ promptPer1K: 0.002,
24440
+ completionPer1K: 0.008,
24441
+ lastUpdated: "2026-01",
24442
+ notes: "1M context window"
24443
+ },
24444
+ "gpt-4.1-mini": {
24445
+ promptPer1K: 0.0004,
24446
+ completionPer1K: 0.0016,
24447
+ lastUpdated: "2026-01"
24448
+ },
24449
+ "gpt-4.1-nano": {
24450
+ promptPer1K: 0.0001,
24451
+ completionPer1K: 0.0004,
24452
+ lastUpdated: "2026-01"
24325
24453
  },
24326
24454
  "gpt-4o": {
24327
- promptPer1K: 0.005,
24328
- completionPer1K: 0.015,
24329
- lastUpdated: "2024-05"
24455
+ promptPer1K: 0.0025,
24456
+ completionPer1K: 0.01,
24457
+ lastUpdated: "2026-01",
24458
+ notes: "128K context window"
24330
24459
  },
24331
24460
  "gpt-4o-mini": {
24332
24461
  promptPer1K: 0.00015,
24333
24462
  completionPer1K: 0.0006,
24334
- lastUpdated: "2024-07"
24463
+ lastUpdated: "2026-01",
24464
+ notes: "128K context window"
24465
+ },
24466
+ o1: {
24467
+ promptPer1K: 0.015,
24468
+ completionPer1K: 0.06,
24469
+ lastUpdated: "2026-01",
24470
+ notes: "Reasoning model - internal thinking tokens billed as output"
24471
+ },
24472
+ o3: {
24473
+ promptPer1K: 0.002,
24474
+ completionPer1K: 0.008,
24475
+ lastUpdated: "2026-01"
24476
+ },
24477
+ "o3-mini": {
24478
+ promptPer1K: 0.0011,
24479
+ completionPer1K: 0.0044,
24480
+ lastUpdated: "2026-01"
24481
+ },
24482
+ "o4-mini": {
24483
+ promptPer1K: 0.0011,
24484
+ completionPer1K: 0.0044,
24485
+ lastUpdated: "2026-01"
24486
+ },
24487
+ "gpt-4-turbo": {
24488
+ promptPer1K: 0.01,
24489
+ completionPer1K: 0.03,
24490
+ lastUpdated: "2026-01"
24491
+ },
24492
+ "gpt-4": {
24493
+ promptPer1K: 0.03,
24494
+ completionPer1K: 0.06,
24495
+ lastUpdated: "2026-01"
24335
24496
  },
24336
24497
  "gpt-3.5-turbo": {
24337
24498
  promptPer1K: 0.0005,
24338
24499
  completionPer1K: 0.0015,
24339
- lastUpdated: "2024-01"
24500
+ lastUpdated: "2026-01"
24340
24501
  },
24341
- "gpt-3.5-turbo-16k": {
24502
+ "claude-opus-4.5": {
24503
+ promptPer1K: 0.005,
24504
+ completionPer1K: 0.025,
24505
+ lastUpdated: "2026-01",
24506
+ notes: "Most capable Claude model"
24507
+ },
24508
+ "claude-sonnet-4.5": {
24342
24509
  promptPer1K: 0.003,
24343
- completionPer1K: 0.004,
24344
- lastUpdated: "2024-01"
24510
+ completionPer1K: 0.015,
24511
+ lastUpdated: "2026-01",
24512
+ notes: "Balanced performance and cost"
24513
+ },
24514
+ "claude-haiku-4.5": {
24515
+ promptPer1K: 0.001,
24516
+ completionPer1K: 0.005,
24517
+ lastUpdated: "2026-01",
24518
+ notes: "Fastest Claude model"
24519
+ },
24520
+ "claude-opus-4": {
24521
+ promptPer1K: 0.015,
24522
+ completionPer1K: 0.075,
24523
+ lastUpdated: "2026-01"
24345
24524
  },
24346
- "claude-3-opus-20240229": {
24525
+ "claude-opus-4.1": {
24347
24526
  promptPer1K: 0.015,
24348
24527
  completionPer1K: 0.075,
24349
- lastUpdated: "2024-03"
24528
+ lastUpdated: "2026-01"
24350
24529
  },
24351
- "claude-3-sonnet-20240229": {
24530
+ "claude-sonnet-4": {
24352
24531
  promptPer1K: 0.003,
24353
24532
  completionPer1K: 0.015,
24354
- lastUpdated: "2024-03"
24533
+ lastUpdated: "2026-01"
24355
24534
  },
24356
- "claude-3-haiku-20240307": {
24357
- promptPer1K: 0.00025,
24358
- completionPer1K: 0.00125,
24359
- lastUpdated: "2024-03"
24535
+ "claude-sonnet-3.7": {
24536
+ promptPer1K: 0.003,
24537
+ completionPer1K: 0.015,
24538
+ lastUpdated: "2026-01"
24360
24539
  },
24361
- "claude-3-5-sonnet-20240620": {
24540
+ "claude-3-7-sonnet": {
24362
24541
  promptPer1K: 0.003,
24363
24542
  completionPer1K: 0.015,
24364
- lastUpdated: "2024-06"
24543
+ lastUpdated: "2026-01"
24365
24544
  },
24366
24545
  "claude-3-5-sonnet-20241022": {
24367
24546
  promptPer1K: 0.003,
24368
24547
  completionPer1K: 0.015,
24369
- lastUpdated: "2024-10"
24548
+ lastUpdated: "2026-01"
24370
24549
  },
24371
24550
  "claude-3-5-haiku-20241022": {
24372
24551
  promptPer1K: 0.0008,
24373
24552
  completionPer1K: 0.004,
24374
- lastUpdated: "2024-10"
24553
+ lastUpdated: "2026-01"
24554
+ },
24555
+ "claude-haiku-3.5": {
24556
+ promptPer1K: 0.0008,
24557
+ completionPer1K: 0.004,
24558
+ lastUpdated: "2026-01"
24375
24559
  },
24376
24560
  "claude-3-opus": {
24377
24561
  promptPer1K: 0.015,
24378
24562
  completionPer1K: 0.075,
24379
- lastUpdated: "2024-03"
24563
+ lastUpdated: "2026-01"
24380
24564
  },
24381
24565
  "claude-3-sonnet": {
24382
24566
  promptPer1K: 0.003,
24383
24567
  completionPer1K: 0.015,
24384
- lastUpdated: "2024-03"
24568
+ lastUpdated: "2026-01"
24385
24569
  },
24386
24570
  "claude-3-haiku": {
24387
24571
  promptPer1K: 0.00025,
24388
24572
  completionPer1K: 0.00125,
24389
- lastUpdated: "2024-03"
24573
+ lastUpdated: "2026-01"
24390
24574
  },
24391
24575
  "claude-3.5-sonnet": {
24392
24576
  promptPer1K: 0.003,
24393
24577
  completionPer1K: 0.015,
24394
- lastUpdated: "2024-10"
24578
+ lastUpdated: "2026-01"
24395
24579
  },
24396
24580
  "claude-3.5-haiku": {
24397
24581
  promptPer1K: 0.0008,
24398
24582
  completionPer1K: 0.004,
24399
- lastUpdated: "2024-10"
24400
- },
24401
- "claude-2": {
24402
- promptPer1K: 0.008,
24403
- completionPer1K: 0.024,
24404
- lastUpdated: "2024-01"
24405
- },
24406
- "claude-instant-1": {
24407
- promptPer1K: 0.0008,
24408
- completionPer1K: 0.0024,
24409
- lastUpdated: "2024-01"
24583
+ lastUpdated: "2026-01"
24410
24584
  }
24411
24585
  };
24412
24586
  var DEFAULT_PRICING = {
24413
- promptPer1K: 0.01,
24414
- completionPer1K: 0.03,
24415
- lastUpdated: "2024-01",
24587
+ promptPer1K: 0.003,
24588
+ completionPer1K: 0.015,
24589
+ lastUpdated: "2026-01",
24416
24590
  notes: "Default pricing - verify with provider"
24417
24591
  };
24418
24592
  function getModelPricing(model) {
@@ -24425,12 +24599,48 @@ function getModelPricing(model) {
24425
24599
  return pricing;
24426
24600
  }
24427
24601
  }
24602
+ if (lowerModel.includes("gpt-5.2")) {
24603
+ return MODEL_PRICING["gpt-5.2"];
24604
+ }
24605
+ if (lowerModel.includes("gpt-5.1")) {
24606
+ return MODEL_PRICING["gpt-5.1"];
24607
+ }
24608
+ if (lowerModel.includes("gpt-5-mini")) {
24609
+ return MODEL_PRICING["gpt-5-mini"];
24610
+ }
24611
+ if (lowerModel.includes("gpt-5-nano")) {
24612
+ return MODEL_PRICING["gpt-5-nano"];
24613
+ }
24614
+ if (lowerModel.includes("gpt-5")) {
24615
+ return MODEL_PRICING["gpt-5"];
24616
+ }
24617
+ if (lowerModel.includes("gpt-4.1-mini")) {
24618
+ return MODEL_PRICING["gpt-4.1-mini"];
24619
+ }
24620
+ if (lowerModel.includes("gpt-4.1-nano")) {
24621
+ return MODEL_PRICING["gpt-4.1-nano"];
24622
+ }
24623
+ if (lowerModel.includes("gpt-4.1")) {
24624
+ return MODEL_PRICING["gpt-4.1"];
24625
+ }
24428
24626
  if (lowerModel.includes("gpt-4o-mini")) {
24429
24627
  return MODEL_PRICING["gpt-4o-mini"];
24430
24628
  }
24431
24629
  if (lowerModel.includes("gpt-4o")) {
24432
24630
  return MODEL_PRICING["gpt-4o"];
24433
24631
  }
24632
+ if (lowerModel.includes("o4-mini")) {
24633
+ return MODEL_PRICING["o4-mini"];
24634
+ }
24635
+ if (lowerModel.includes("o3-mini")) {
24636
+ return MODEL_PRICING["o3-mini"];
24637
+ }
24638
+ if (lowerModel.includes("o3")) {
24639
+ return MODEL_PRICING.o3;
24640
+ }
24641
+ if (lowerModel.includes("o1")) {
24642
+ return MODEL_PRICING.o1;
24643
+ }
24434
24644
  if (lowerModel.includes("gpt-4-turbo")) {
24435
24645
  return MODEL_PRICING["gpt-4-turbo"];
24436
24646
  }
@@ -24440,6 +24650,27 @@ function getModelPricing(model) {
24440
24650
  if (lowerModel.includes("gpt-3.5")) {
24441
24651
  return MODEL_PRICING["gpt-3.5-turbo"];
24442
24652
  }
24653
+ if (lowerModel.includes("opus-4.5") || lowerModel.includes("opus-4-5")) {
24654
+ return MODEL_PRICING["claude-opus-4.5"];
24655
+ }
24656
+ if (lowerModel.includes("sonnet-4.5") || lowerModel.includes("sonnet-4-5")) {
24657
+ return MODEL_PRICING["claude-sonnet-4.5"];
24658
+ }
24659
+ if (lowerModel.includes("haiku-4.5") || lowerModel.includes("haiku-4-5")) {
24660
+ return MODEL_PRICING["claude-haiku-4.5"];
24661
+ }
24662
+ if (lowerModel.includes("opus-4.1") || lowerModel.includes("opus-4-1")) {
24663
+ return MODEL_PRICING["claude-opus-4.1"];
24664
+ }
24665
+ if (lowerModel.includes("opus-4")) {
24666
+ return MODEL_PRICING["claude-opus-4"];
24667
+ }
24668
+ if (lowerModel.includes("sonnet-4")) {
24669
+ return MODEL_PRICING["claude-sonnet-4"];
24670
+ }
24671
+ if (lowerModel.includes("sonnet-3.7") || lowerModel.includes("sonnet-3-7")) {
24672
+ return MODEL_PRICING["claude-sonnet-3.7"];
24673
+ }
24443
24674
  if (lowerModel.includes("claude-3-5-sonnet") || lowerModel.includes("claude-3.5-sonnet")) {
24444
24675
  return MODEL_PRICING["claude-3.5-sonnet"];
24445
24676
  }
@@ -24456,7 +24687,7 @@ function getModelPricing(model) {
24456
24687
  return MODEL_PRICING["claude-3-haiku"];
24457
24688
  }
24458
24689
  if (lowerModel.includes("claude")) {
24459
- return MODEL_PRICING["claude-2"];
24690
+ return MODEL_PRICING["claude-sonnet-4.5"];
24460
24691
  }
24461
24692
  return DEFAULT_PRICING;
24462
24693
  }