@thinkhive/sdk 3.1.0 → 3.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +279 -128
- package/dist/api/apiKeys.d.ts +252 -0
- package/dist/api/apiKeys.js +298 -0
- package/dist/api/business-metrics.d.ts +188 -0
- package/dist/api/business-metrics.js +213 -0
- package/dist/api/conversation-eval.d.ts +200 -0
- package/dist/api/conversation-eval.js +235 -0
- package/dist/api/deterministic-graders.d.ts +205 -0
- package/dist/api/deterministic-graders.js +191 -0
- package/dist/api/eval-health.d.ts +250 -0
- package/dist/api/eval-health.js +224 -0
- package/dist/api/human-review.d.ts +275 -0
- package/dist/api/human-review.js +236 -0
- package/dist/api/nondeterminism.d.ts +300 -0
- package/dist/api/nondeterminism.js +250 -0
- package/dist/api/quality-metrics.d.ts +303 -0
- package/dist/api/quality-metrics.js +198 -0
- package/dist/api/roi-analytics.d.ts +263 -0
- package/dist/api/roi-analytics.js +204 -0
- package/dist/api/transcript-patterns.d.ts +204 -0
- package/dist/api/transcript-patterns.js +227 -0
- package/dist/core/client.d.ts +82 -8
- package/dist/core/client.js +223 -32
- package/dist/core/config.d.ts +1 -1
- package/dist/core/config.js +2 -2
- package/dist/core/types.d.ts +27 -2
- package/dist/core/types.js +1 -1
- package/dist/index.d.ts +415 -62
- package/dist/index.js +253 -37
- package/package.json +8 -4
|
@@ -0,0 +1,300 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* ThinkHive SDK v3.0 - Non-Determinism API
|
|
3
|
+
*
|
|
4
|
+
* API for pass@k / pass^k analysis to measure LLM evaluation reliability
|
|
5
|
+
*/
|
|
6
|
+
/** Kind of analysis a non-determinism run is created for. */
export type NondeterminismRunType =
    | 'pass_at_k'
    | 'pass_to_k'
    | 'variance'
    | 'reliability';
/** Lifecycle state carried on a run record. */
export type NondeterminismRunStatus =
    | 'pending'
    | 'running'
    | 'completed'
    | 'failed'
    | 'cancelled';
|
|
8
|
+
/**
 * A non-determinism analysis run as returned by the API.
 *
 * Aggregate metrics and `temperature` are typed as strings here, whereas the
 * request-side `CreateRunOptions.temperature` and the per-criterion analysis
 * rates are numbers. NOTE(review): presumably serialized decimals — confirm
 * the wire format and parse before doing arithmetic.
 */
export interface NondeterminismRun {
    /** Identifier of this run. */
    id: string;
    /** Identifier of the owning company. */
    companyId: string;
    /** Identifier of the agent the run belongs to. */
    agentId: string;
    /** Which analysis the run performs. */
    runType: NondeterminismRunType;
    /** The `k` of pass@k / pass^k. */
    kValue: number;
    /** Current lifecycle state. */
    status: NondeterminismRunStatus;
    /** Number of traces in the run. */
    traceCount: number;
    /** Single criterion, when the run was created with one. */
    criterionId?: string;
    /** Criteria evaluated by the run. */
    criteriaIds: string[];
    /** Sampling temperature (string-typed on the record). */
    temperature?: string;
    /** Model name, when set. */
    model?: string;
    /** Progress indicator. NOTE(review): presumably 0-100 — confirm. */
    progressPercent: number;
    /** Aggregate pass@k rate (string-typed). */
    passAtKRate?: string;
    /** Aggregate pass^k rate (string-typed). */
    passToKRate?: string;
    /** Average score variance (string-typed). */
    avgVariance?: string;
    /** Aggregate reliability score (string-typed). */
    reliabilityScore?: string;
    /** Start timestamp, when present. */
    startedAt?: string;
    /** Completion timestamp, when present. */
    completedAt?: string;
    /** Creator of the run, when recorded. */
    createdBy?: string;
    /** Creation timestamp. */
    createdAt: string;
}
|
|
30
|
+
/**
 * One recorded evaluation sample of a run, as returned by the API.
 *
 * `score`, `confidence`, `costUsd` and `temperature` are strings on this
 * record but numbers in `RecordSampleOptions`. NOTE(review): presumably
 * serialized decimals — confirm before parsing.
 */
export interface NondeterminismSample {
    /** Identifier of this sample. */
    id: string;
    /** Run the sample belongs to. */
    runId: string;
    /** Trace that was evaluated. */
    traceId: string;
    /** Criterion the sample was scored against. */
    criterionId: string;
    /** Position of this sample among the repeated evaluations. */
    sampleIndex: number;
    /** Score of the evaluation (string-typed). */
    score: string;
    /** Whether the evaluation passed. */
    passed: boolean;
    /** Free-text reasoning, when supplied. */
    reasoning?: string;
    /** Confidence value (string-typed), when supplied. */
    confidence?: string;
    /** Tokens consumed, when supplied. */
    tokensUsed?: number;
    /** Cost in USD (string-typed), when supplied. */
    costUsd?: string;
    /** Model name, when supplied. */
    model?: string;
    /** Sampling temperature (string-typed), when supplied. */
    temperature?: string;
    /** Latency in milliseconds, when supplied. */
    latencyMs?: number;
    /** Error text, when supplied. */
    error?: string;
    /** Creation timestamp. */
    createdAt: string;
}
|
|
48
|
+
/** Input accepted by `nondeterminism.createRun`. */
export interface CreateRunOptions {
    /** Agent the run is created for. */
    agentId: string;
    /** Single criterion to evaluate. */
    criterionId?: string;
    /** Several criteria to evaluate. */
    criteriaIds?: string[];
    /** The `k` of pass@k / pass^k. */
    kValue: number;
    /** Traces to include in the run. */
    traceIds: string[];
    /** Analysis kind. NOTE(review): the default when omitted is decided elsewhere. */
    runType?: NondeterminismRunType;
    /** Sampling temperature (a number here; string-typed on the run record). */
    temperature?: number;
    /** Model name. */
    model?: string;
}
|
|
58
|
+
/**
 * Input accepted by `nondeterminism.recordSample`.
 *
 * Numeric fields are plain numbers here; the stored `NondeterminismSample`
 * types `score`, `confidence`, `costUsd` and `temperature` as strings.
 */
export interface RecordSampleOptions {
    /** Run to attach the sample to. */
    runId: string;
    /** Trace that was evaluated. */
    traceId: string;
    /** Criterion the sample was scored against. */
    criterionId: string;
    /** Position of this sample among the repeated evaluations. */
    sampleIndex: number;
    /** Score of the evaluation. */
    score: number;
    /** Whether the evaluation passed. */
    passed: boolean;
    /** Free-text reasoning. */
    reasoning?: string;
    /** Confidence value. */
    confidence?: number;
    /** Tokens consumed. */
    tokensUsed?: number;
    /** Cost in USD. */
    costUsd?: number;
    /** Model name. */
    model?: string;
    /** Sampling temperature. */
    temperature?: number;
    /** Latency in milliseconds. */
    latencyMs?: number;
    /** Error text. */
    error?: string;
}
|
|
74
|
+
/** Per-trace aggregation of the samples recorded in a run. */
export interface TraceAnalysis {
    /** Trace being summarized. */
    traceId: string;
    /** Samples recorded for this trace. */
    samples: NondeterminismSample[];
    /** Number of passing samples. */
    passCount: number;
    /** Number of samples considered. */
    totalCount: number;
    /** Pass rate for the trace. NOTE(review): presumably passCount / totalCount — confirm. */
    passRate: number;
    /** Variance of the sample scores. */
    scoreVariance: number;
    /** Mean of the sample scores. */
    meanScore: number;
    /** Consistency verdict. NOTE(review): the rule behind it is not visible in this file. */
    isConsistent: boolean;
}
|
|
84
|
+
/**
 * Per-criterion analysis result.
 *
 * `reliabilityScore` is the field read by the `isReliableEvaluation` and
 * `getReliabilityRecommendation` helpers exported from the same module.
 */
export interface CriterionAnalysis {
    /** Criterion being summarized. */
    criterionId: string;
    /** Per-trace breakdown for this criterion. */
    traceAnalyses: TraceAnalysis[];
    /** pass@k rate (numeric here). */
    passAtKRate: number;
    /** pass^k rate (numeric here). */
    passToKRate: number;
    /** Reliability score compared against 0-1 thresholds by the helpers. */
    reliabilityScore: number;
    /** Reliability verdict supplied with the analysis. */
    isReliable: boolean;
    /** Recommendation text supplied with the analysis. */
    recommendation: string;
}
|
|
93
|
+
/** Result type of `nondeterminism.getRunSummary` and `nondeterminism.analyzeRun`. */
export interface RunSummary {
    /** The run record itself. */
    run: NondeterminismRun;
    /** Per-trace analyses. */
    traceAnalyses: TraceAnalysis[];
    /** Per-criterion analyses. */
    criterionAnalyses: CriterionAnalysis[];
}
|
|
98
|
+
/**
 * Optional filters and paging for `nondeterminism.getRuns`.
 * Each field that is set is forwarded as a query-string parameter of the same name.
 */
export interface ListRunsOptions {
    /** Restrict results to one agent. */
    agentId?: string;
    /** Restrict results to one lifecycle state. */
    status?: NondeterminismRunStatus;
    /** Page size. */
    limit?: number;
    /** Paging offset. */
    offset?: number;
}
|
|
104
|
+
/**
 * Result type of `nondeterminism.getInfo`: descriptive metadata about the
 * analysis concepts plus default settings.
 */
export interface PassAtKInfo {
    /** Explanations of each concept; only the two pass metrics carry a `formula`. */
    concepts: {
        passAtK: { name: string; description: string; formula: string; useCase: string };
        passToK: { name: string; description: string; formula: string; useCase: string };
        variance: { name: string; description: string; useCase: string };
        reliability: { name: string; description: string; useCase: string };
    };
    /** Recommendation texts keyed by string. NOTE(review): the key vocabulary is not visible here. */
    recommendations: Record<string, string>;
    /** Default settings reported alongside the concepts. */
    defaults: { kValue: number; reliabilityThreshold: number; varianceThreshold: number };
}
|
|
136
|
+
/**
 * Client for the non-determinism endpoints: create and drive runs, record
 * samples, and fetch pass@k / pass^k reliability analysis.
 *
 * @example
 * ```typescript
 * const run = await nondeterminism.createRun({
 *   agentId: 'agent_123',
 *   criterionId: 'criterion_456',
 *   kValue: 5,
 *   traceIds: ['trace_1', 'trace_2', 'trace_3'],
 *   runType: 'pass_at_k',
 * });
 * await nondeterminism.startRun(run.id);
 * ```
 */
export declare const nondeterminism: {
    /**
     * Create a new analysis run.
     *
     * @param options - Agent, criteria, `kValue`, traces and optional settings.
     * @returns The created run.
     */
    createRun(options: CreateRunOptions): Promise<NondeterminismRun>;
    /**
     * List runs, optionally filtered and paged.
     *
     * @param options - Filters; omit to list without filters.
     * @example
     * ```typescript
     * const runs = await nondeterminism.getRuns({ agentId: 'agent_123' });
     * ```
     */
    getRuns(options?: ListRunsOptions): Promise<NondeterminismRun[]>;
    /**
     * Fetch one run.
     *
     * @param runId - Identifier of the run, e.g. `'run_123'`.
     */
    getRun(runId: string): Promise<NondeterminismRun>;
    /**
     * Start a run. Resolves with no value.
     *
     * @param runId - Identifier of the run.
     */
    startRun(runId: string): Promise<void>;
    /**
     * Complete a run. Resolves with no value.
     *
     * @param runId - Identifier of the run.
     */
    completeRun(runId: string): Promise<void>;
    /**
     * Record one sample result.
     *
     * @param options - The sample; `runId`, `traceId`, `criterionId`,
     *   `sampleIndex`, `score` and `passed` are required.
     * @returns The stored sample.
     * @example
     * ```typescript
     * const sample = await nondeterminism.recordSample({
     *   runId: 'run_123',
     *   traceId: 'trace_456',
     *   criterionId: 'criterion_789',
     *   sampleIndex: 0,
     *   score: 85,
     *   passed: true,
     *   reasoning: 'Response meets quality criteria',
     * });
     * ```
     */
    recordSample(options: RecordSampleOptions): Promise<NondeterminismSample>;
    /**
     * List the samples recorded for a run.
     *
     * @param runId - Identifier of the run.
     */
    getSamples(runId: string): Promise<NondeterminismSample[]>;
    /**
     * Fetch a run together with its per-trace and per-criterion analyses.
     *
     * @param runId - Identifier of the run.
     * @example
     * ```typescript
     * const summary = await nondeterminism.getRunSummary('run_123');
     * console.log(`Pass@k rate: ${summary.criterionAnalyses[0].passAtKRate}`);
     * ```
     */
    getRunSummary(runId: string): Promise<RunSummary>;
    /**
     * Trigger analysis of a completed run and return the resulting summary.
     *
     * @param runId - Identifier of the run.
     */
    analyzeRun(runId: string): Promise<RunSummary>;
    /**
     * Fetch descriptive information about pass@k analysis.
     *
     * @example
     * ```typescript
     * const info = await nondeterminism.getInfo();
     * console.log(info.concepts.passAtK.description);
     * ```
     */
    getInfo(): Promise<PassAtKInfo>;
};
|
|
247
|
+
/**
 * Pass@k: the chance that at least one of `k` runs passes, given the pass
 * rate of a single run.
 *
 * @param passRate - Single-run pass rate, from 0 to 1.
 * @param k - Number of runs.
 * @returns Probability that at least 1 of the `k` runs passes.
 *
 * @example
 * ```typescript
 * calculatePassAtK(0.7, 3); // ~0.973
 * ```
 */
export declare function calculatePassAtK(passRate: number, k: number): number;
|
|
260
|
+
/**
 * Pass^k: the chance that all `k` runs pass, given the pass rate of a
 * single run.
 *
 * @param passRate - Single-run pass rate, from 0 to 1.
 * @param k - Number of runs.
 * @returns Probability that every one of the `k` runs passes.
 *
 * @example
 * ```typescript
 * calculatePassToK(0.7, 3); // ~0.343
 * ```
 */
export declare function calculatePassToK(passRate: number, k: number): number;
|
|
273
|
+
/**
 * Inverse of pass@k: the single-run pass rate needed to reach a target
 * pass@k over `k` runs.
 *
 * @param targetPassAtK - Desired pass@k probability.
 * @param k - Number of runs.
 * @returns Required single-run pass rate.
 *
 * @example
 * ```typescript
 * requiredPassRateForPassAtK(0.95, 3); // ~0.632
 * ```
 */
export declare function requiredPassRateForPassAtK(targetPassAtK: number, k: number): number;
|
|
286
|
+
/**
 * Check a criterion analysis against a reliability threshold.
 *
 * @param analysis - Criterion analysis result; its `reliabilityScore` is compared.
 * @param reliabilityThreshold - Minimum acceptable reliability score (default 0.8).
 * @returns `true` when the score is at or above the threshold.
 */
export declare function isReliableEvaluation(analysis: CriterionAnalysis, reliabilityThreshold?: number): boolean;
|
|
294
|
+
/**
 * Turn a criterion analysis into an actionable recommendation, chosen from
 * the analysis's `reliabilityScore`.
 *
 * @param analysis - Criterion analysis result.
 * @returns Recommendation text.
 */
export declare function getReliabilityRecommendation(analysis: CriterionAnalysis): string;
|
|
@@ -0,0 +1,250 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* ThinkHive SDK v3.0 - Non-Determinism API
|
|
4
|
+
*
|
|
5
|
+
* API for pass@k / pass^k analysis to measure LLM evaluation reliability
|
|
6
|
+
*/
|
|
7
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
8
|
+
exports.nondeterminism = void 0;
|
|
9
|
+
exports.calculatePassAtK = calculatePassAtK;
|
|
10
|
+
exports.calculatePassToK = calculatePassToK;
|
|
11
|
+
exports.requiredPassRateForPassAtK = requiredPassRateForPassAtK;
|
|
12
|
+
exports.isReliableEvaluation = isReliableEvaluation;
|
|
13
|
+
exports.getReliabilityRecommendation = getReliabilityRecommendation;
|
|
14
|
+
const client_1 = require("../core/client");
|
|
15
|
+
// ============================================================================
|
|
16
|
+
// NON-DETERMINISM API CLIENT
|
|
17
|
+
// ============================================================================
|
|
18
|
+
/**
|
|
19
|
+
* Non-Determinism API client for pass@k analysis and reliability measurement
|
|
20
|
+
*/
|
|
21
|
+
exports.nondeterminism = {
|
|
22
|
+
/**
|
|
23
|
+
* Create a new non-determinism analysis run
|
|
24
|
+
*
|
|
25
|
+
* @example
|
|
26
|
+
* ```typescript
|
|
27
|
+
* const run = await nondeterminism.createRun({
|
|
28
|
+
* agentId: 'agent_123',
|
|
29
|
+
* criterionId: 'criterion_456',
|
|
30
|
+
* kValue: 5,
|
|
31
|
+
* traceIds: ['trace_1', 'trace_2', 'trace_3'],
|
|
32
|
+
* runType: 'pass_at_k',
|
|
33
|
+
* });
|
|
34
|
+
* ```
|
|
35
|
+
*/
|
|
36
|
+
async createRun(options) {
|
|
37
|
+
return (0, client_1.apiRequestWithData)('/nondeterminism/runs', {
|
|
38
|
+
method: 'POST',
|
|
39
|
+
body: options,
|
|
40
|
+
apiVersion: 'v1',
|
|
41
|
+
});
|
|
42
|
+
},
|
|
43
|
+
/**
|
|
44
|
+
* Get non-determinism runs
|
|
45
|
+
*
|
|
46
|
+
* @example
|
|
47
|
+
* ```typescript
|
|
48
|
+
* const runs = await nondeterminism.getRuns({ agentId: 'agent_123' });
|
|
49
|
+
* ```
|
|
50
|
+
*/
|
|
51
|
+
async getRuns(options = {}) {
|
|
52
|
+
const params = new URLSearchParams();
|
|
53
|
+
if (options.agentId)
|
|
54
|
+
params.set('agentId', options.agentId);
|
|
55
|
+
if (options.status)
|
|
56
|
+
params.set('status', options.status);
|
|
57
|
+
if (options.limit)
|
|
58
|
+
params.set('limit', String(options.limit));
|
|
59
|
+
if (options.offset)
|
|
60
|
+
params.set('offset', String(options.offset));
|
|
61
|
+
return (0, client_1.apiRequestWithData)(`/nondeterminism/runs?${params.toString()}`, { apiVersion: 'v1' });
|
|
62
|
+
},
|
|
63
|
+
/**
|
|
64
|
+
* Get a specific run
|
|
65
|
+
*
|
|
66
|
+
* @example
|
|
67
|
+
* ```typescript
|
|
68
|
+
* const run = await nondeterminism.getRun('run_123');
|
|
69
|
+
* ```
|
|
70
|
+
*/
|
|
71
|
+
async getRun(runId) {
|
|
72
|
+
return (0, client_1.apiRequestWithData)(`/nondeterminism/runs/${runId}`, { apiVersion: 'v1' });
|
|
73
|
+
},
|
|
74
|
+
/**
|
|
75
|
+
* Start a run
|
|
76
|
+
*
|
|
77
|
+
* @example
|
|
78
|
+
* ```typescript
|
|
79
|
+
* await nondeterminism.startRun('run_123');
|
|
80
|
+
* ```
|
|
81
|
+
*/
|
|
82
|
+
async startRun(runId) {
|
|
83
|
+
await (0, client_1.apiRequest)(`/nondeterminism/runs/${runId}/start`, {
|
|
84
|
+
method: 'POST',
|
|
85
|
+
apiVersion: 'v1',
|
|
86
|
+
});
|
|
87
|
+
},
|
|
88
|
+
/**
|
|
89
|
+
* Complete a run
|
|
90
|
+
*
|
|
91
|
+
* @example
|
|
92
|
+
* ```typescript
|
|
93
|
+
* await nondeterminism.completeRun('run_123');
|
|
94
|
+
* ```
|
|
95
|
+
*/
|
|
96
|
+
async completeRun(runId) {
|
|
97
|
+
await (0, client_1.apiRequest)(`/nondeterminism/runs/${runId}/complete`, {
|
|
98
|
+
method: 'POST',
|
|
99
|
+
apiVersion: 'v1',
|
|
100
|
+
});
|
|
101
|
+
},
|
|
102
|
+
/**
|
|
103
|
+
* Record a sample result
|
|
104
|
+
*
|
|
105
|
+
* @example
|
|
106
|
+
* ```typescript
|
|
107
|
+
* const sample = await nondeterminism.recordSample({
|
|
108
|
+
* runId: 'run_123',
|
|
109
|
+
* traceId: 'trace_456',
|
|
110
|
+
* criterionId: 'criterion_789',
|
|
111
|
+
* sampleIndex: 0,
|
|
112
|
+
* score: 85,
|
|
113
|
+
* passed: true,
|
|
114
|
+
* reasoning: 'Response meets quality criteria',
|
|
115
|
+
* });
|
|
116
|
+
* ```
|
|
117
|
+
*/
|
|
118
|
+
async recordSample(options) {
|
|
119
|
+
return (0, client_1.apiRequestWithData)('/nondeterminism/samples', {
|
|
120
|
+
method: 'POST',
|
|
121
|
+
body: options,
|
|
122
|
+
apiVersion: 'v1',
|
|
123
|
+
});
|
|
124
|
+
},
|
|
125
|
+
/**
|
|
126
|
+
* Get samples for a run
|
|
127
|
+
*
|
|
128
|
+
* @example
|
|
129
|
+
* ```typescript
|
|
130
|
+
* const samples = await nondeterminism.getSamples('run_123');
|
|
131
|
+
* ```
|
|
132
|
+
*/
|
|
133
|
+
async getSamples(runId) {
|
|
134
|
+
return (0, client_1.apiRequestWithData)(`/nondeterminism/runs/${runId}/samples`, { apiVersion: 'v1' });
|
|
135
|
+
},
|
|
136
|
+
/**
|
|
137
|
+
* Get run summary with analysis
|
|
138
|
+
*
|
|
139
|
+
* @example
|
|
140
|
+
* ```typescript
|
|
141
|
+
* const summary = await nondeterminism.getRunSummary('run_123');
|
|
142
|
+
* console.log(`Pass@k rate: ${summary.criterionAnalyses[0].passAtKRate}`);
|
|
143
|
+
* ```
|
|
144
|
+
*/
|
|
145
|
+
async getRunSummary(runId) {
|
|
146
|
+
return (0, client_1.apiRequestWithData)(`/nondeterminism/runs/${runId}/summary`, { apiVersion: 'v1' });
|
|
147
|
+
},
|
|
148
|
+
/**
|
|
149
|
+
* Trigger analysis of a completed run
|
|
150
|
+
*
|
|
151
|
+
* @example
|
|
152
|
+
* ```typescript
|
|
153
|
+
* const summary = await nondeterminism.analyzeRun('run_123');
|
|
154
|
+
* ```
|
|
155
|
+
*/
|
|
156
|
+
async analyzeRun(runId) {
|
|
157
|
+
return (0, client_1.apiRequestWithData)(`/nondeterminism/runs/${runId}/analyze`, { method: 'POST', apiVersion: 'v1' });
|
|
158
|
+
},
|
|
159
|
+
/**
|
|
160
|
+
* Get information about pass@k analysis
|
|
161
|
+
*
|
|
162
|
+
* @example
|
|
163
|
+
* ```typescript
|
|
164
|
+
* const info = await nondeterminism.getInfo();
|
|
165
|
+
* console.log(info.concepts.passAtK.description);
|
|
166
|
+
* ```
|
|
167
|
+
*/
|
|
168
|
+
async getInfo() {
|
|
169
|
+
return (0, client_1.apiRequestWithData)('/nondeterminism/info', { apiVersion: 'v1' });
|
|
170
|
+
},
|
|
171
|
+
};
|
|
172
|
+
// ============================================================================
|
|
173
|
+
// HELPER FUNCTIONS
|
|
174
|
+
// ============================================================================
|
|
175
|
+
/**
 * Calculate pass@k probability from pass rate: 1 - (1 - passRate)^k.
 *
 * For pass rates in [0, 1) the value is computed as
 * `-expm1(k * log1p(-passRate))`. That is the same quantity, but it does not
 * collapse to exactly 0 when `passRate` is so small that `1 - passRate`
 * rounds to 1. For any other input (exactly 1, out-of-range values, NaN) the
 * direct formula is used, so those results are unchanged.
 *
 * @param passRate - Single-run pass rate (0-1)
 * @param k - Number of runs
 * @returns Probability that at least 1 of k runs passes
 *
 * @example
 * ```typescript
 * const passAtK = calculatePassAtK(0.7, 3); // ~0.973
 * ```
 */
function calculatePassAtK(passRate, k) {
    // log1p(-1) is -Infinity and log1p of anything below -1 is NaN, so only
    // [0, 1) takes the stable path; the negated test also routes NaN here.
    if (!(passRate >= 0 && passRate < 1)) {
        return 1 - Math.pow(1 - passRate, k);
    }
    // `+ 0` normalizes a possible -0 (e.g. when k is 0) to 0.
    return -Math.expm1(k * Math.log1p(-passRate)) + 0;
}
|
|
190
|
+
/**
 * Pass^k: probability that all `k` runs pass, i.e. `passRate` raised to the
 * power `k`.
 *
 * @param passRate - Single-run pass rate (0-1)
 * @param k - Number of runs
 * @returns Probability that all k runs pass
 *
 * @example
 * ```typescript
 * const passToK = calculatePassToK(0.7, 3); // ~0.343
 * ```
 */
function calculatePassToK(passRate, k) {
    const allPass = passRate ** k;
    return allPass;
}
|
|
205
|
+
/**
 * Calculate required pass rate to achieve target pass@k:
 * 1 - (1 - targetPassAtK)^(1/k).
 *
 * For targets in [0, 1) with k > 0 the value is computed as
 * `-expm1(log1p(-targetPassAtK) / k)`. That is the same quantity, but it
 * keeps its precision for very small targets, where `1 - targetPassAtK`
 * rounds to 1 and the direct formula returns exactly 0. Every other input
 * (target of exactly 1, out-of-range values, k <= 0, NaN) goes through the
 * direct formula, so those results are unchanged.
 *
 * @param targetPassAtK - Desired pass@k probability
 * @param k - Number of runs
 * @returns Required single-run pass rate
 *
 * @example
 * ```typescript
 * const requiredRate = requiredPassRateForPassAtK(0.95, 3); // ~0.632
 * ```
 */
function requiredPassRateForPassAtK(targetPassAtK, k) {
    const inStableDomain = targetPassAtK >= 0 && targetPassAtK < 1 && k > 0;
    if (!inStableDomain) {
        return 1 - Math.pow(1 - targetPassAtK, 1 / k);
    }
    // `+ 0` normalizes a possible -0 to 0.
    return -Math.expm1(Math.log1p(-targetPassAtK) / k) + 0;
}
|
|
220
|
+
/**
 * Check whether an analysis meets a reliability threshold.
 *
 * @param analysis - Criterion analysis result; only `reliabilityScore` is read
 * @param reliabilityThreshold - Minimum reliability score (default 0.8)
 * @returns `true` when the score is at or above the threshold
 */
function isReliableEvaluation(analysis, reliabilityThreshold = 0.8) {
    const { reliabilityScore } = analysis;
    return reliabilityScore >= reliabilityThreshold;
}
|
|
230
|
+
/**
 * Map an analysis's reliability score to a recommendation.
 *
 * Bands are checked from the highest down: >= 0.9, >= 0.8, >= 0.6, then
 * everything else (including a score that is not a number).
 *
 * @param analysis - Criterion analysis result; only `reliabilityScore` is read
 * @returns Actionable recommendation string
 */
function getReliabilityRecommendation(analysis) {
    const score = analysis.reliabilityScore;
    if (score >= 0.9) {
        return 'Evaluation is highly reliable. No changes needed.';
    }
    if (score >= 0.8) {
        return 'Evaluation is reliable. Consider minor criteria refinements.';
    }
    if (score >= 0.6) {
        return 'Evaluation has moderate reliability. Add more specific criteria or examples.';
    }
    return 'Evaluation is unreliable. Consider using deterministic checks or restructuring criteria.';
}
|
|
250
|
+
//# sourceMappingURL=data:application/json;base64,{"version":3,"file":"nondeterminism.js","sourceRoot":"","sources":["../../src/api/nondeterminism.ts"],"names":[],"mappings":";AAAA;;;;GAIG;;;AAqUH,4CAEC;AAcD,4CAEC;AAcD,gEAEC;AASD,oDAKC;AAQD,oEAUC;AArYD,2CAAgE;AA4HhE,+EAA+E;AAC/E,6BAA6B;AAC7B,+EAA+E;AAE/E;;GAEG;AACU,QAAA,cAAc,GAAG;IAC5B;;;;;;;;;;;;;OAaG;IACH,KAAK,CAAC,SAAS,CAAC,OAAyB;QACvC,OAAO,IAAA,2BAAkB,EAAoB,sBAAsB,EAAE;YACnE,MAAM,EAAE,MAAM;YACd,IAAI,EAAE,OAAO;YACb,UAAU,EAAE,IAAI;SACjB,CAAC,CAAC;IACL,CAAC;IAED;;;;;;;OAOG;IACH,KAAK,CAAC,OAAO,CAAC,UAA2B,EAAE;QACzC,MAAM,MAAM,GAAG,IAAI,eAAe,EAAE,CAAC;QACrC,IAAI,OAAO,CAAC,OAAO;YAAE,MAAM,CAAC,GAAG,CAAC,SAAS,EAAE,OAAO,CAAC,OAAO,CAAC,CAAC;QAC5D,IAAI,OAAO,CAAC,MAAM;YAAE,MAAM,CAAC,GAAG,CAAC,QAAQ,EAAE,OAAO,CAAC,MAAM,CAAC,CAAC;QACzD,IAAI,OAAO,CAAC,KAAK;YAAE,MAAM,CAAC,GAAG,CAAC,OAAO,EAAE,MAAM,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC;QAC9D,IAAI,OAAO,CAAC,MAAM;YAAE,MAAM,CAAC,GAAG,CAAC,QAAQ,EAAE,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC;QAEjE,OAAO,IAAA,2BAAkB,EACvB,wBAAwB,MAAM,CAAC,QAAQ,EAAE,EAAE,EAC3C,EAAE,UAAU,EAAE,IAAI,EAAE,CACrB,CAAC;IACJ,CAAC;IAED;;;;;;;OAOG;IACH,KAAK,CAAC,MAAM,CAAC,KAAa;QACxB,OAAO,IAAA,2BAAkB,EACvB,wBAAwB,KAAK,EAAE,EAC/B,EAAE,UAAU,EAAE,IAAI,EAAE,CACrB,CAAC;IACJ,CAAC;IAED;;;;;;;OAOG;IACH,KAAK,CAAC,QAAQ,CAAC,KAAa;QAC1B,MAAM,IAAA,mBAAU,EAAC,wBAAwB,KAAK,QAAQ,EAAE;YACtD,MAAM,EAAE,MAAM;YACd,UAAU,EAAE,IAAI;SACjB,CAAC,CAAC;IACL,CAAC;IAED;;;;;;;OAOG;IACH,KAAK,CAAC,WAAW,CAAC,KAAa;QAC7B,MAAM,IAAA,mBAAU,EAAC,wBAAwB,KAAK,WAAW,EAAE;YACzD,MAAM,EAAE,MAAM;YACd,UAAU,EAAE,IAAI;SACjB,CAAC,CAAC;IACL,CAAC;IAED;;;;;;;;;;;;;;;OAeG;IACH,KAAK,CAAC,YAAY,CAAC,OAA4B;QAC7C,OAAO,IAAA,2BAAkB,EAAuB,yBAAyB,EAAE;YACzE,MAAM,EAAE,MAAM;YACd,IAAI,EAAE,OAAO;YACb,UAAU,EAAE,IAAI;SACjB,CAAC,CAAC;IACL,CAAC;IAED;;;;;;;OAOG;IACH,KAAK,CAAC,UAAU,CAAC,KAAa;QAC5B,OAAO,IAAA,2BAAkB,EACvB,wBAAwB,KAAK,UAAU,EACvC,EAAE,UAAU,EAAE,IAAI,EAAE,CACrB,CAAC;IACJ,CAAC;IAED;;;;;;;;OAQG;IACH,KAAK,CAAC,aAAa,CAAC,KAAa;QAC/B,OAAO,IAAA,2BAAkB,EACvB,wBAAwB,KAAK,UAAU,EACvC,EAAE,UAAU,EA
AE,IAAI,EAAE,CACrB,CAAC;IACJ,CAAC;IAED;;;;;;;OAOG;IACH,KAAK,CAAC,UAAU,CAAC,KAAa;QAC5B,OAAO,IAAA,2BAAkB,EACvB,wBAAwB,KAAK,UAAU,EACvC,EAAE,MAAM,EAAE,MAAM,EAAE,UAAU,EAAE,IAAI,EAAE,CACrC,CAAC;IACJ,CAAC;IAED;;;;;;;;OAQG;IACH,KAAK,CAAC,OAAO;QACX,OAAO,IAAA,2BAAkB,EACvB,sBAAsB,EACtB,EAAE,UAAU,EAAE,IAAI,EAAE,CACrB,CAAC;IACJ,CAAC;CACF,CAAC;AAEF,+EAA+E;AAC/E,mBAAmB;AACnB,+EAA+E;AAE/E;;;;;;;;;;;GAWG;AACH,SAAgB,gBAAgB,CAAC,QAAgB,EAAE,CAAS;IAC1D,OAAO,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,QAAQ,EAAE,CAAC,CAAC,CAAC;AACvC,CAAC;AAED;;;;;;;;;;;GAWG;AACH,SAAgB,gBAAgB,CAAC,QAAgB,EAAE,CAAS;IAC1D,OAAO,IAAI,CAAC,GAAG,CAAC,QAAQ,EAAE,CAAC,CAAC,CAAC;AAC/B,CAAC;AAED;;;;;;;;;;;GAWG;AACH,SAAgB,0BAA0B,CAAC,aAAqB,EAAE,CAAS;IACzE,OAAO,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,aAAa,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC;AAChD,CAAC;AAED;;;;;;GAMG;AACH,SAAgB,oBAAoB,CAClC,QAA2B,EAC3B,oBAAoB,GAAG,GAAG;IAE1B,OAAO,QAAQ,CAAC,gBAAgB,IAAI,oBAAoB,CAAC;AAC3D,CAAC;AAED;;;;;GAKG;AACH,SAAgB,4BAA4B,CAAC,QAA2B;IACtE,IAAI,QAAQ,CAAC,gBAAgB,IAAI,GAAG,EAAE,CAAC;QACrC,OAAO,mDAAmD,CAAC;IAC7D,CAAC;SAAM,IAAI,QAAQ,CAAC,gBAAgB,IAAI,GAAG,EAAE,CAAC;QAC5C,OAAO,8DAA8D,CAAC;IACxE,CAAC;SAAM,IAAI,QAAQ,CAAC,gBAAgB,IAAI,GAAG,EAAE,CAAC;QAC5C,OAAO,8EAA8E,CAAC;IACxF,CAAC;SAAM,CAAC;QACN,OAAO,0FAA0F,CAAC;IACpG,CAAC;AACH,CAAC","sourcesContent":["/**\n * ThinkHive SDK v3.0 - Non-Determinism API\n *\n * API for pass@k / pass^k analysis to measure LLM evaluation reliability\n */\n\nimport { apiRequest, apiRequestWithData } from '../core/client';\n\n// ============================================================================\n// TYPES\n// ============================================================================\n\nexport type NondeterminismRunType = 'pass_at_k' | 'pass_to_k' | 'variance' | 'reliability';\nexport type NondeterminismRunStatus = 'pending' | 'running' | 'completed' | 'failed' | 'cancelled';\n\nexport interface NondeterminismRun {\n  id: string;\n  companyId: string;\n  agentId: string;\n  runType: NondeterminismRunType;\n  kValue: 
number;\n  status: NondeterminismRunStatus;\n  traceCount: number;\n  criterionId?: string;\n  criteriaIds: string[];\n  temperature?: string;\n  model?: string;\n  progressPercent: number;\n  passAtKRate?: string;\n  passToKRate?: string;\n  avgVariance?: string;\n  reliabilityScore?: string;\n  startedAt?: string;\n  completedAt?: string;\n  createdBy?: string;\n  createdAt: string;\n}\n\nexport interface NondeterminismSample {\n  id: string;\n  runId: string;\n  traceId: string;\n  criterionId: string;\n  sampleIndex: number;\n  score: string;\n  passed: boolean;\n  reasoning?: string;\n  confidence?: string;\n  tokensUsed?: number;\n  costUsd?: string;\n  model?: string;\n  temperature?: string;\n  latencyMs?: number;\n  error?: string;\n  createdAt: string;\n}\n\nexport interface CreateRunOptions {\n  agentId: string;\n  criterionId?: string;\n  criteriaIds?: string[];\n  kValue: number;\n  traceIds: string[];\n  runType?: NondeterminismRunType;\n  temperature?: number;\n  model?: string;\n}\n\nexport interface RecordSampleOptions {\n  runId: string;\n  traceId: string;\n  criterionId: string;\n  sampleIndex: number;\n  score: number;\n  passed: boolean;\n  reasoning?: string;\n  confidence?: number;\n  tokensUsed?: number;\n  costUsd?: number;\n  model?: string;\n  temperature?: number;\n  latencyMs?: number;\n  error?: string;\n}\n\nexport interface TraceAnalysis {\n  traceId: string;\n  samples: NondeterminismSample[];\n  passCount: number;\n  totalCount: number;\n  passRate: number;\n  scoreVariance: number;\n  meanScore: number;\n  isConsistent: boolean;\n}\n\nexport interface CriterionAnalysis {\n  criterionId: string;\n  traceAnalyses: TraceAnalysis[];\n  passAtKRate: number;\n  passToKRate: number;\n  reliabilityScore: number;\n  isReliable: boolean;\n  recommendation: string;\n}\n\nexport interface RunSummary {\n  run: NondeterminismRun;\n  traceAnalyses: TraceAnalysis[];\n  criterionAnalyses: CriterionAnalysis[];\n}\n\nexport interface 
ListRunsOptions {\n  agentId?: string;\n  status?: NondeterminismRunStatus;\n  limit?: number;\n  offset?: number;\n}\n\nexport interface PassAtKInfo {\n  concepts: {\n    passAtK: { name: string; description: string; formula: string; useCase: string };\n    passToK: { name: string; description: string; formula: string; useCase: string };\n    variance: { name: string; description: string; useCase: string };\n    reliability: { name: string; description: string; useCase: string };\n  };\n  recommendations: Record<string, string>;\n  defaults: { kValue: number; reliabilityThreshold: number; varianceThreshold: number };\n}\n\n// ============================================================================\n// NON-DETERMINISM API CLIENT\n// ============================================================================\n\n/**\n * Non-Determinism API client for pass@k analysis and reliability measurement\n */\nexport const nondeterminism = {\n  /**\n   * Create a new non-determinism analysis run\n   *\n   * @example\n   * ```typescript\n   * const run = await nondeterminism.createRun({\n   *   agentId: 'agent_123',\n   *   criterionId: 'criterion_456',\n   *   kValue: 5,\n   *   traceIds: ['trace_1', 'trace_2', 'trace_3'],\n   *   runType: 'pass_at_k',\n   * });\n   * ```\n   */\n  async createRun(options: CreateRunOptions): Promise<NondeterminismRun> {\n    return apiRequestWithData<NondeterminismRun>('/nondeterminism/runs', {\n      method: 'POST',\n      body: options,\n      apiVersion: 'v1',\n    });\n  },\n\n  /**\n   * Get non-determinism runs\n   *\n   * @example\n   * ```typescript\n   * const runs = await nondeterminism.getRuns({ agentId: 'agent_123' });\n   * ```\n   */\n  async getRuns(options: ListRunsOptions = {}): Promise<NondeterminismRun[]> {\n    const params = new URLSearchParams();\n    if (options.agentId) params.set('agentId', options.agentId);\n    if (options.status) params.set('status', options.status);\n    if (options.limit) params.set('limit', 
String(options.limit));\n    if (options.offset) params.set('offset', String(options.offset));\n\n    return apiRequestWithData<NondeterminismRun[]>(\n      `/nondeterminism/runs?${params.toString()}`,\n      { apiVersion: 'v1' }\n    );\n  },\n\n  /**\n   * Get a specific run\n   *\n   * @example\n   * ```typescript\n   * const run = await nondeterminism.getRun('run_123');\n   * ```\n   */\n  async getRun(runId: string): Promise<NondeterminismRun> {\n    return apiRequestWithData<NondeterminismRun>(\n      `/nondeterminism/runs/${runId}`,\n      { apiVersion: 'v1' }\n    );\n  },\n\n  /**\n   * Start a run\n   *\n   * @example\n   * ```typescript\n   * await nondeterminism.startRun('run_123');\n   * ```\n   */\n  async startRun(runId: string): Promise<void> {\n    await apiRequest(`/nondeterminism/runs/${runId}/start`, {\n      method: 'POST',\n      apiVersion: 'v1',\n    });\n  },\n\n  /**\n   * Complete a run\n   *\n   * @example\n   * ```typescript\n   * await nondeterminism.completeRun('run_123');\n   * ```\n   */\n  async completeRun(runId: string): Promise<void> {\n    await apiRequest(`/nondeterminism/runs/${runId}/complete`, {\n      method: 'POST',\n      apiVersion: 'v1',\n    });\n  },\n\n  /**\n   * Record a sample result\n   *\n   * @example\n   * ```typescript\n   * const sample = await nondeterminism.recordSample({\n   *   runId: 'run_123',\n   *   traceId: 'trace_456',\n   *   criterionId: 'criterion_789',\n   *   sampleIndex: 0,\n   *   score: 85,\n   *   passed: true,\n   *   reasoning: 'Response meets quality criteria',\n   * });\n   * ```\n   */\n  async recordSample(options: RecordSampleOptions): Promise<NondeterminismSample> {\n    return apiRequestWithData<NondeterminismSample>('/nondeterminism/samples', {\n      method: 'POST',\n      body: options,\n      apiVersion: 'v1',\n    });\n  },\n\n  /**\n   * Get samples for a run\n   *\n   * @example\n   * ```typescript\n   * const samples = await nondeterminism.getSamples('run_123');\n   * 
```\n   */\n  async getSamples(runId: string): Promise<NondeterminismSample[]> {\n    return apiRequestWithData<NondeterminismSample[]>(\n      `/nondeterminism/runs/${runId}/samples`,\n      { apiVersion: 'v1' }\n    );\n  },\n\n  /**\n   * Get run summary with analysis\n   *\n   * @example\n   * ```typescript\n   * const summary = await nondeterminism.getRunSummary('run_123');\n   * console.log(`Pass@k rate: ${summary.criterionAnalyses[0].passAtKRate}`);\n   * ```\n   */\n  async getRunSummary(runId: string): Promise<RunSummary> {\n    return apiRequestWithData<RunSummary>(\n      `/nondeterminism/runs/${runId}/summary`,\n      { apiVersion: 'v1' }\n    );\n  },\n\n  /**\n   * Trigger analysis of a completed run\n   *\n   * @example\n   * ```typescript\n   * const summary = await nondeterminism.analyzeRun('run_123');\n   * ```\n   */\n  async analyzeRun(runId: string): Promise<RunSummary> {\n    return apiRequestWithData<RunSummary>(\n      `/nondeterminism/runs/${runId}/analyze`,\n      { method: 'POST', apiVersion: 'v1' }\n    );\n  },\n\n  /**\n   * Get information about pass@k analysis\n   *\n   * @example\n   * ```typescript\n   * const info = await nondeterminism.getInfo();\n   * console.log(info.concepts.passAtK.description);\n   * ```\n   */\n  async getInfo(): Promise<PassAtKInfo> {\n    return apiRequestWithData<PassAtKInfo>(\n      '/nondeterminism/info',\n      { apiVersion: 'v1' }\n    );\n  },\n};\n\n// ============================================================================\n// HELPER FUNCTIONS\n// ============================================================================\n\n/**\n * Calculate pass@k probability from pass rate\n *\n * @param passRate - Single-run pass rate (0-1)\n * @param k - Number of runs\n * @returns Probability that at least 1 of k runs passes\n *\n * @example\n * ```typescript\n * const passAtK = calculatePassAtK(0.7, 3); // ~0.973\n * ```\n */\nexport function calculatePassAtK(passRate: number, k: number): number {\n  
return 1 - Math.pow(1 - passRate, k);\n}\n\n/**\n * Calculate pass^k probability from pass rate\n *\n * @param passRate - Single-run pass rate (0-1)\n * @param k - Number of runs\n * @returns Probability that all k runs pass\n *\n * @example\n * ```typescript\n * const passToK = calculatePassToK(0.7, 3); // ~0.343\n * ```\n */\nexport function calculatePassToK(passRate: number, k: number): number {\n  return Math.pow(passRate, k);\n}\n\n/**\n * Calculate required pass rate to achieve target pass@k\n *\n * @param targetPassAtK - Desired pass@k probability\n * @param k - Number of runs\n * @returns Required single-run pass rate\n *\n * @example\n * ```typescript\n * const requiredRate = requiredPassRateForPassAtK(0.95, 3); // ~0.632\n * ```\n */\nexport function requiredPassRateForPassAtK(targetPassAtK: number, k: number): number {\n  return 1 - Math.pow(1 - targetPassAtK, 1 / k);\n}\n\n/**\n * Determine if evaluation is reliable based on analysis\n *\n * @param analysis - Criterion analysis result\n * @param reliabilityThreshold - Minimum reliability score (default 0.8)\n * @returns Whether the evaluation is considered reliable\n */\nexport function isReliableEvaluation(\n  analysis: CriterionAnalysis,\n  reliabilityThreshold = 0.8\n): boolean {\n  return analysis.reliabilityScore >= reliabilityThreshold;\n}\n\n/**\n * Get recommendation based on reliability analysis\n *\n * @param analysis - Criterion analysis result\n * @returns Actionable recommendation string\n */\nexport function getReliabilityRecommendation(analysis: CriterionAnalysis): string {\n  if (analysis.reliabilityScore >= 0.9) {\n    return 'Evaluation is highly reliable. No changes needed.';\n  } else if (analysis.reliabilityScore >= 0.8) {\n    return 'Evaluation is reliable. Consider minor criteria refinements.';\n  } else if (analysis.reliabilityScore >= 0.6) {\n    return 'Evaluation has moderate reliability. 
Add more specific criteria or examples.';\n  } else {\n    return 'Evaluation is unreliable. Consider using deterministic checks or restructuring criteria.';\n  }\n}\n"]}
|