@plune-ai/cli 0.2.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,215 @@
1
+ interface Summary {
2
+ total: number;
3
+ passed: number;
4
+ failed: number;
5
+ errored: number;
6
+ cost_usd: number;
7
+ duration_ms: number;
8
+ }
9
+ interface AssertionResultRecord {
10
+ type: string;
11
+ passed: boolean;
12
+ score?: number;
13
+ reason?: string;
14
+ }
15
+ interface Usage {
16
+ input_tokens: number;
17
+ output_tokens: number;
18
+ cost_usd: number;
19
+ }
20
+ interface RunError {
21
+ code: string;
22
+ message: string;
23
+ }
24
+ interface RowResult {
25
+ vars: Record<string, unknown>;
26
+ output: string | null;
27
+ cached: boolean;
28
+ usage?: Usage;
29
+ latency_ms?: number;
30
+ error?: RunError;
31
+ assertions: AssertionResultRecord[];
32
+ }
33
+ interface EvalResult {
34
+ id: string;
35
+ tags: string[];
36
+ rows: RowResult[];
37
+ passed: boolean;
38
+ }
39
+ interface RunResult {
40
+ schema: 1;
41
+ plune_version: string;
42
+ started_at: string;
43
+ finished_at: string;
44
+ config_hash: string;
45
+ summary: Summary;
46
+ evals: EvalResult[];
47
+ }
48
+
49
+ interface ProviderConfig {
50
+ type: 'anthropic' | 'openai' | 'openrouter';
51
+ model: string;
52
+ temperature?: number;
53
+ max_tokens?: number;
54
+ concurrency?: number;
55
+ timeout?: number;
56
+ max_retries?: number;
57
+ }
58
+ interface ModelPrice {
59
+ input_per_1k_usd: number;
60
+ output_per_1k_usd: number;
61
+ }
62
+ type PricingMap = Record<string, ModelPrice>;
63
+ interface DatasetRow {
64
+ vars: Record<string, string | number | boolean>;
65
+ expected?: string;
66
+ }
67
+ type DatasetRef = string | {
68
+ examples: DatasetRow[];
69
+ };
70
+ interface ExactMatchAssertion {
71
+ type: 'exact-match';
72
+ value: string;
73
+ trim?: boolean;
74
+ ignore_case?: boolean;
75
+ }
76
+ interface ContainsAssertion {
77
+ type: 'contains';
78
+ value: string;
79
+ ignore_case?: boolean;
80
+ }
81
+ interface ContainsAnyAssertion {
82
+ type: 'contains-any';
83
+ values: string[];
84
+ ignore_case?: boolean;
85
+ }
86
+ interface ContainsAllAssertion {
87
+ type: 'contains-all';
88
+ values: string[];
89
+ ignore_case?: boolean;
90
+ }
91
+ interface JsonSchemaAssertion {
92
+ type: 'json-schema';
93
+ schema: object;
94
+ extract?: 'auto' | 'strict';
95
+ }
96
+ interface LlmJudgeAssertion {
97
+ type: 'llm-judge';
98
+ criteria: string;
99
+ provider?: Partial<ProviderConfig>;
100
+ pass_threshold?: number;
101
+ }
102
+ interface SemanticSimilarityAssertion {
103
+ type: 'semantic-similarity';
104
+ reference: string;
105
+ threshold?: number;
106
+ }
107
+ interface FaithfulnessAssertion {
108
+ type: 'faithfulness';
109
+ context: string;
110
+ threshold?: number;
111
+ }
112
+ interface AnswerRelevanceAssertion {
113
+ type: 'answer-relevance';
114
+ question: string;
115
+ threshold?: number;
116
+ }
117
+ interface ContextPrecisionAssertion {
118
+ type: 'context-precision';
119
+ context: string;
120
+ question: string;
121
+ threshold?: number;
122
+ }
123
+ type AssertionConfig = ExactMatchAssertion | ContainsAssertion | ContainsAnyAssertion | ContainsAllAssertion | JsonSchemaAssertion | LlmJudgeAssertion | SemanticSimilarityAssertion | FaithfulnessAssertion | AnswerRelevanceAssertion | ContextPrecisionAssertion;
124
+ interface EvalConfig {
125
+ id: string;
126
+ description?: string;
127
+ tags?: string[];
128
+ provider?: Partial<ProviderConfig>;
129
+ prompt?: string;
130
+ prompt_file?: string;
131
+ dataset: DatasetRef;
132
+ assertions: AssertionConfig[];
133
+ }
134
+ interface Config {
135
+ version: 1;
136
+ provider: ProviderConfig;
137
+ defaults?: {
138
+ assertions?: AssertionConfig[];
139
+ };
140
+ pricing?: PricingMap;
141
+ evals: EvalConfig[];
142
+ }
143
+
144
+ interface CompletionRequest {
145
+ provider: string;
146
+ model: string;
147
+ temperature: number;
148
+ max_tokens: number;
149
+ prompt_resolved: string;
150
+ }
151
+ interface CompletionResponse {
152
+ output: string;
153
+ usage: {
154
+ input_tokens: number;
155
+ output_tokens: number;
156
+ };
157
+ /**
158
+ * Actual USD cost the provider reported for this call, if it reports one (e.g. OpenRouter via
159
+ * `usage.include`). Absent for providers that only return token counts — the cost is then
160
+ * estimated downstream (ADR-PRC02). Additive + optional: existing providers omit it.
161
+ */
162
+ cost_usd?: number;
163
+ }
164
+ interface CostEstimate {
165
+ cost_usd: number;
166
+ }
167
+ interface Provider<_TConfig extends ProviderConfig = ProviderConfig> {
168
+ complete(req: CompletionRequest): Promise<CompletionResponse>;
169
+ /**
170
+ * Resolve the USD cost for a call's usage. `reportedCostUsd` (optional) is the provider's actual
171
+ * reported cost from `complete()` — when passed it is preferred over the table estimate, unless a
172
+ * config `pricing` override exists (precedence in `resolveCost`, ADR-PRC01). Omitting it (dry-run,
173
+ * judge calls) yields a pure token-based estimate, as before — additive + backward-compatible.
174
+ */
175
+ estimateCost(usage: {
176
+ input_tokens: number;
177
+ output_tokens: number;
178
+ }, reportedCostUsd?: number): CostEstimate;
179
+ }
180
+
181
+ interface Embedder {
182
+ /** Embed a batch of texts into fixed-dimension vectors (one per input, in order). */
183
+ embed(texts: string[]): Promise<Float32Array[]>;
184
+ }
185
+
186
+ interface Cache {
187
+ /** Return the cached completion, or undefined on a miss. `maxAgeMs` (optional) expires old entries. */
188
+ get(key: string, opts?: {
189
+ maxAgeMs?: number;
190
+ }): CompletionResponse | undefined;
191
+ set(key: string, value: CompletionResponse): void;
192
+ clear(): void;
193
+ close(): void;
194
+ }
195
+
196
+ interface RunDeps {
197
+ resolveProvider(cfg: ProviderConfig): Provider;
198
+ embedder: Embedder;
199
+ cache: Cache;
200
+ now: () => number;
201
+ loadDataset(ref: DatasetRef, baseDir: string): DatasetRow[];
202
+ baseDir: string;
203
+ }
204
+
205
+ interface RunOptions {
206
+ dryRun: boolean;
207
+ configPath?: string;
208
+ only?: string[];
209
+ concurrency?: number;
210
+ noCache?: boolean;
211
+ bail?: boolean;
212
+ }
213
+ declare function handleRun(options: RunOptions, depsFactory?: (config: Config, baseDir: string) => RunDeps): Promise<RunResult>;
214
+
215
+ export { type AssertionResultRecord, type Config, type EvalResult, type RowResult, type RunError, type RunOptions, type RunResult, type Summary, type Usage, handleRun as run };
@@ -0,0 +1,215 @@
1
+ interface Summary {
2
+ total: number;
3
+ passed: number;
4
+ failed: number;
5
+ errored: number;
6
+ cost_usd: number;
7
+ duration_ms: number;
8
+ }
9
+ interface AssertionResultRecord {
10
+ type: string;
11
+ passed: boolean;
12
+ score?: number;
13
+ reason?: string;
14
+ }
15
+ interface Usage {
16
+ input_tokens: number;
17
+ output_tokens: number;
18
+ cost_usd: number;
19
+ }
20
+ interface RunError {
21
+ code: string;
22
+ message: string;
23
+ }
24
+ interface RowResult {
25
+ vars: Record<string, unknown>;
26
+ output: string | null;
27
+ cached: boolean;
28
+ usage?: Usage;
29
+ latency_ms?: number;
30
+ error?: RunError;
31
+ assertions: AssertionResultRecord[];
32
+ }
33
+ interface EvalResult {
34
+ id: string;
35
+ tags: string[];
36
+ rows: RowResult[];
37
+ passed: boolean;
38
+ }
39
+ interface RunResult {
40
+ schema: 1;
41
+ plune_version: string;
42
+ started_at: string;
43
+ finished_at: string;
44
+ config_hash: string;
45
+ summary: Summary;
46
+ evals: EvalResult[];
47
+ }
48
+
49
+ interface ProviderConfig {
50
+ type: 'anthropic' | 'openai' | 'openrouter';
51
+ model: string;
52
+ temperature?: number;
53
+ max_tokens?: number;
54
+ concurrency?: number;
55
+ timeout?: number;
56
+ max_retries?: number;
57
+ }
58
+ interface ModelPrice {
59
+ input_per_1k_usd: number;
60
+ output_per_1k_usd: number;
61
+ }
62
+ type PricingMap = Record<string, ModelPrice>;
63
+ interface DatasetRow {
64
+ vars: Record<string, string | number | boolean>;
65
+ expected?: string;
66
+ }
67
+ type DatasetRef = string | {
68
+ examples: DatasetRow[];
69
+ };
70
+ interface ExactMatchAssertion {
71
+ type: 'exact-match';
72
+ value: string;
73
+ trim?: boolean;
74
+ ignore_case?: boolean;
75
+ }
76
+ interface ContainsAssertion {
77
+ type: 'contains';
78
+ value: string;
79
+ ignore_case?: boolean;
80
+ }
81
+ interface ContainsAnyAssertion {
82
+ type: 'contains-any';
83
+ values: string[];
84
+ ignore_case?: boolean;
85
+ }
86
+ interface ContainsAllAssertion {
87
+ type: 'contains-all';
88
+ values: string[];
89
+ ignore_case?: boolean;
90
+ }
91
+ interface JsonSchemaAssertion {
92
+ type: 'json-schema';
93
+ schema: object;
94
+ extract?: 'auto' | 'strict';
95
+ }
96
+ interface LlmJudgeAssertion {
97
+ type: 'llm-judge';
98
+ criteria: string;
99
+ provider?: Partial<ProviderConfig>;
100
+ pass_threshold?: number;
101
+ }
102
+ interface SemanticSimilarityAssertion {
103
+ type: 'semantic-similarity';
104
+ reference: string;
105
+ threshold?: number;
106
+ }
107
+ interface FaithfulnessAssertion {
108
+ type: 'faithfulness';
109
+ context: string;
110
+ threshold?: number;
111
+ }
112
+ interface AnswerRelevanceAssertion {
113
+ type: 'answer-relevance';
114
+ question: string;
115
+ threshold?: number;
116
+ }
117
+ interface ContextPrecisionAssertion {
118
+ type: 'context-precision';
119
+ context: string;
120
+ question: string;
121
+ threshold?: number;
122
+ }
123
+ type AssertionConfig = ExactMatchAssertion | ContainsAssertion | ContainsAnyAssertion | ContainsAllAssertion | JsonSchemaAssertion | LlmJudgeAssertion | SemanticSimilarityAssertion | FaithfulnessAssertion | AnswerRelevanceAssertion | ContextPrecisionAssertion;
124
+ interface EvalConfig {
125
+ id: string;
126
+ description?: string;
127
+ tags?: string[];
128
+ provider?: Partial<ProviderConfig>;
129
+ prompt?: string;
130
+ prompt_file?: string;
131
+ dataset: DatasetRef;
132
+ assertions: AssertionConfig[];
133
+ }
134
+ interface Config {
135
+ version: 1;
136
+ provider: ProviderConfig;
137
+ defaults?: {
138
+ assertions?: AssertionConfig[];
139
+ };
140
+ pricing?: PricingMap;
141
+ evals: EvalConfig[];
142
+ }
143
+
144
+ interface CompletionRequest {
145
+ provider: string;
146
+ model: string;
147
+ temperature: number;
148
+ max_tokens: number;
149
+ prompt_resolved: string;
150
+ }
151
+ interface CompletionResponse {
152
+ output: string;
153
+ usage: {
154
+ input_tokens: number;
155
+ output_tokens: number;
156
+ };
157
+ /**
158
+ * Actual USD cost the provider reported for this call, if it reports one (e.g. OpenRouter via
159
+ * `usage.include`). Absent for providers that only return token counts — the cost is then
160
+ * estimated downstream (ADR-PRC02). Additive + optional: existing providers omit it.
161
+ */
162
+ cost_usd?: number;
163
+ }
164
+ interface CostEstimate {
165
+ cost_usd: number;
166
+ }
167
+ interface Provider<_TConfig extends ProviderConfig = ProviderConfig> {
168
+ complete(req: CompletionRequest): Promise<CompletionResponse>;
169
+ /**
170
+ * Resolve the USD cost for a call's usage. `reportedCostUsd` (optional) is the provider's actual
171
+ * reported cost from `complete()` — when passed it is preferred over the table estimate, unless a
172
+ * config `pricing` override exists (precedence in `resolveCost`, ADR-PRC01). Omitting it (dry-run,
173
+ * judge calls) yields a pure token-based estimate, as before — additive + backward-compatible.
174
+ */
175
+ estimateCost(usage: {
176
+ input_tokens: number;
177
+ output_tokens: number;
178
+ }, reportedCostUsd?: number): CostEstimate;
179
+ }
180
+
181
+ interface Embedder {
182
+ /** Embed a batch of texts into fixed-dimension vectors (one per input, in order). */
183
+ embed(texts: string[]): Promise<Float32Array[]>;
184
+ }
185
+
186
+ interface Cache {
187
+ /** Return the cached completion, or undefined on a miss. `maxAgeMs` (optional) expires old entries. */
188
+ get(key: string, opts?: {
189
+ maxAgeMs?: number;
190
+ }): CompletionResponse | undefined;
191
+ set(key: string, value: CompletionResponse): void;
192
+ clear(): void;
193
+ close(): void;
194
+ }
195
+
196
+ interface RunDeps {
197
+ resolveProvider(cfg: ProviderConfig): Provider;
198
+ embedder: Embedder;
199
+ cache: Cache;
200
+ now: () => number;
201
+ loadDataset(ref: DatasetRef, baseDir: string): DatasetRow[];
202
+ baseDir: string;
203
+ }
204
+
205
+ interface RunOptions {
206
+ dryRun: boolean;
207
+ configPath?: string;
208
+ only?: string[];
209
+ concurrency?: number;
210
+ noCache?: boolean;
211
+ bail?: boolean;
212
+ }
213
+ declare function handleRun(options: RunOptions, depsFactory?: (config: Config, baseDir: string) => RunDeps): Promise<RunResult>;
214
+
215
+ export { type AssertionResultRecord, type Config, type EvalResult, type RowResult, type RunError, type RunOptions, type RunResult, type Summary, type Usage, handleRun as run };