@thinkhive/sdk 3.1.1 → 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/MIGRATION.md +83 -12
- package/README.md +279 -128
- package/dist/api/agents.d.ts +169 -0
- package/dist/api/agents.js +185 -0
- package/dist/api/apiKeys.d.ts +252 -0
- package/dist/api/apiKeys.js +298 -0
- package/dist/api/business-metrics.d.ts +188 -0
- package/dist/api/business-metrics.js +213 -0
- package/dist/api/calibration.d.ts +0 -62
- package/dist/api/calibration.js +5 -48
- package/dist/api/claims.js +10 -7
- package/dist/api/conversation-eval.d.ts +200 -0
- package/dist/api/conversation-eval.js +235 -0
- package/dist/api/deterministic-graders.d.ts +205 -0
- package/dist/api/deterministic-graders.js +191 -0
- package/dist/api/eval-health.d.ts +250 -0
- package/dist/api/eval-health.js +224 -0
- package/dist/api/human-review.d.ts +275 -0
- package/dist/api/human-review.js +236 -0
- package/dist/api/nondeterminism.d.ts +300 -0
- package/dist/api/nondeterminism.js +250 -0
- package/dist/api/quality-metrics.d.ts +303 -0
- package/dist/api/quality-metrics.js +198 -0
- package/dist/api/roi-analytics.d.ts +263 -0
- package/dist/api/roi-analytics.js +204 -0
- package/dist/api/runs.js +12 -6
- package/dist/api/transcript-patterns.d.ts +204 -0
- package/dist/api/transcript-patterns.js +227 -0
- package/dist/core/client.d.ts +83 -9
- package/dist/core/client.js +229 -34
- package/dist/core/config.d.ts +2 -3
- package/dist/core/config.js +3 -4
- package/dist/core/types.d.ts +57 -4
- package/dist/core/types.js +1 -1
- package/dist/index.d.ts +429 -76
- package/dist/index.js +262 -42
- package/package.json +2 -2
|
@@ -0,0 +1,250 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* ThinkHive SDK v3.0 - Non-Determinism API
|
|
4
|
+
*
|
|
5
|
+
* API for pass@k / pass^k analysis to measure LLM evaluation reliability
|
|
6
|
+
*/
|
|
7
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
8
|
+
exports.nondeterminism = void 0;
|
|
9
|
+
exports.calculatePassAtK = calculatePassAtK;
|
|
10
|
+
exports.calculatePassToK = calculatePassToK;
|
|
11
|
+
exports.requiredPassRateForPassAtK = requiredPassRateForPassAtK;
|
|
12
|
+
exports.isReliableEvaluation = isReliableEvaluation;
|
|
13
|
+
exports.getReliabilityRecommendation = getReliabilityRecommendation;
|
|
14
|
+
const client_1 = require("../core/client");
|
|
15
|
+
// ============================================================================
|
|
16
|
+
// NON-DETERMINISM API CLIENT
|
|
17
|
+
// ============================================================================
|
|
18
|
+
/**
|
|
19
|
+
* Non-Determinism API client for pass@k analysis and reliability measurement
|
|
20
|
+
*/
|
|
21
|
+
exports.nondeterminism = {
|
|
22
|
+
/**
|
|
23
|
+
* Create a new non-determinism analysis run
|
|
24
|
+
*
|
|
25
|
+
* @example
|
|
26
|
+
* ```typescript
|
|
27
|
+
* const run = await nondeterminism.createRun({
|
|
28
|
+
* agentId: 'agent_123',
|
|
29
|
+
* criterionId: 'criterion_456',
|
|
30
|
+
* kValue: 5,
|
|
31
|
+
* traceIds: ['trace_1', 'trace_2', 'trace_3'],
|
|
32
|
+
* runType: 'pass_at_k',
|
|
33
|
+
* });
|
|
34
|
+
* ```
|
|
35
|
+
*/
|
|
36
|
+
async createRun(options) {
|
|
37
|
+
return (0, client_1.apiRequestWithData)('/nondeterminism/runs', {
|
|
38
|
+
method: 'POST',
|
|
39
|
+
body: options,
|
|
40
|
+
apiVersion: 'none',
|
|
41
|
+
});
|
|
42
|
+
},
|
|
43
|
+
/**
|
|
44
|
+
* Get non-determinism runs
|
|
45
|
+
*
|
|
46
|
+
* @example
|
|
47
|
+
* ```typescript
|
|
48
|
+
* const runs = await nondeterminism.getRuns({ agentId: 'agent_123' });
|
|
49
|
+
* ```
|
|
50
|
+
*/
|
|
51
|
+
async getRuns(options = {}) {
|
|
52
|
+
const params = new URLSearchParams();
|
|
53
|
+
if (options.agentId)
|
|
54
|
+
params.set('agentId', options.agentId);
|
|
55
|
+
if (options.status)
|
|
56
|
+
params.set('status', options.status);
|
|
57
|
+
if (options.limit)
|
|
58
|
+
params.set('limit', String(options.limit));
|
|
59
|
+
if (options.offset)
|
|
60
|
+
params.set('offset', String(options.offset));
|
|
61
|
+
return (0, client_1.apiRequestWithData)(`/nondeterminism/runs?${params.toString()}`, { apiVersion: 'none' });
|
|
62
|
+
},
|
|
63
|
+
/**
|
|
64
|
+
* Get a specific run
|
|
65
|
+
*
|
|
66
|
+
* @example
|
|
67
|
+
* ```typescript
|
|
68
|
+
* const run = await nondeterminism.getRun('run_123');
|
|
69
|
+
* ```
|
|
70
|
+
*/
|
|
71
|
+
async getRun(runId) {
|
|
72
|
+
return (0, client_1.apiRequestWithData)(`/nondeterminism/runs/${runId}`, { apiVersion: 'none' });
|
|
73
|
+
},
|
|
74
|
+
/**
|
|
75
|
+
* Start a run
|
|
76
|
+
*
|
|
77
|
+
* @example
|
|
78
|
+
* ```typescript
|
|
79
|
+
* await nondeterminism.startRun('run_123');
|
|
80
|
+
* ```
|
|
81
|
+
*/
|
|
82
|
+
async startRun(runId) {
|
|
83
|
+
await (0, client_1.apiRequest)(`/nondeterminism/runs/${runId}/start`, {
|
|
84
|
+
method: 'POST',
|
|
85
|
+
apiVersion: 'none',
|
|
86
|
+
});
|
|
87
|
+
},
|
|
88
|
+
/**
|
|
89
|
+
* Complete a run
|
|
90
|
+
*
|
|
91
|
+
* @example
|
|
92
|
+
* ```typescript
|
|
93
|
+
* await nondeterminism.completeRun('run_123');
|
|
94
|
+
* ```
|
|
95
|
+
*/
|
|
96
|
+
async completeRun(runId) {
|
|
97
|
+
await (0, client_1.apiRequest)(`/nondeterminism/runs/${runId}/complete`, {
|
|
98
|
+
method: 'POST',
|
|
99
|
+
apiVersion: 'none',
|
|
100
|
+
});
|
|
101
|
+
},
|
|
102
|
+
/**
|
|
103
|
+
* Record a sample result
|
|
104
|
+
*
|
|
105
|
+
* @example
|
|
106
|
+
* ```typescript
|
|
107
|
+
* const sample = await nondeterminism.recordSample({
|
|
108
|
+
* runId: 'run_123',
|
|
109
|
+
* traceId: 'trace_456',
|
|
110
|
+
* criterionId: 'criterion_789',
|
|
111
|
+
* sampleIndex: 0,
|
|
112
|
+
* score: 85,
|
|
113
|
+
* passed: true,
|
|
114
|
+
* reasoning: 'Response meets quality criteria',
|
|
115
|
+
* });
|
|
116
|
+
* ```
|
|
117
|
+
*/
|
|
118
|
+
async recordSample(options) {
|
|
119
|
+
return (0, client_1.apiRequestWithData)('/nondeterminism/samples', {
|
|
120
|
+
method: 'POST',
|
|
121
|
+
body: options,
|
|
122
|
+
apiVersion: 'none',
|
|
123
|
+
});
|
|
124
|
+
},
|
|
125
|
+
/**
|
|
126
|
+
* Get samples for a run
|
|
127
|
+
*
|
|
128
|
+
* @example
|
|
129
|
+
* ```typescript
|
|
130
|
+
* const samples = await nondeterminism.getSamples('run_123');
|
|
131
|
+
* ```
|
|
132
|
+
*/
|
|
133
|
+
async getSamples(runId) {
|
|
134
|
+
return (0, client_1.apiRequestWithData)(`/nondeterminism/runs/${runId}/samples`, { apiVersion: 'none' });
|
|
135
|
+
},
|
|
136
|
+
/**
|
|
137
|
+
* Get run summary with analysis
|
|
138
|
+
*
|
|
139
|
+
* @example
|
|
140
|
+
* ```typescript
|
|
141
|
+
* const summary = await nondeterminism.getRunSummary('run_123');
|
|
142
|
+
* console.log(`Pass@k rate: ${summary.criterionAnalyses[0].passAtKRate}`);
|
|
143
|
+
* ```
|
|
144
|
+
*/
|
|
145
|
+
async getRunSummary(runId) {
|
|
146
|
+
return (0, client_1.apiRequestWithData)(`/nondeterminism/runs/${runId}/summary`, { apiVersion: 'none' });
|
|
147
|
+
},
|
|
148
|
+
/**
|
|
149
|
+
* Trigger analysis of a completed run
|
|
150
|
+
*
|
|
151
|
+
* @example
|
|
152
|
+
* ```typescript
|
|
153
|
+
* const summary = await nondeterminism.analyzeRun('run_123');
|
|
154
|
+
* ```
|
|
155
|
+
*/
|
|
156
|
+
async analyzeRun(runId) {
|
|
157
|
+
return (0, client_1.apiRequestWithData)(`/nondeterminism/runs/${runId}/analyze`, { method: 'POST', apiVersion: 'none' });
|
|
158
|
+
},
|
|
159
|
+
/**
|
|
160
|
+
* Get information about pass@k analysis
|
|
161
|
+
*
|
|
162
|
+
* @example
|
|
163
|
+
* ```typescript
|
|
164
|
+
* const info = await nondeterminism.getInfo();
|
|
165
|
+
* console.log(info.concepts.passAtK.description);
|
|
166
|
+
* ```
|
|
167
|
+
*/
|
|
168
|
+
async getInfo() {
|
|
169
|
+
return (0, client_1.apiRequestWithData)('/nondeterminism/info', { apiVersion: 'none' });
|
|
170
|
+
},
|
|
171
|
+
};
|
|
172
|
+
// ============================================================================
|
|
173
|
+
// HELPER FUNCTIONS
|
|
174
|
+
// ============================================================================
|
|
175
|
+
/**
 * Calculate pass@k probability from pass rate.
 *
 * Computes `1 - (1 - passRate)^k`, the probability that at least one of `k`
 * independent runs passes.
 *
 * @param passRate - Single-run pass rate (0-1)
 * @param k - Number of runs (finite, >= 0)
 * @returns Probability that at least 1 of k runs passes
 * @throws {RangeError} If `passRate` is outside [0, 1] (for example a
 *   percentage such as 70 instead of 0.7) or `k` is negative or not finite.
 *   Such inputs previously produced a value that is not a probability.
 *
 * @example
 * ```typescript
 * const passAtK = calculatePassAtK(0.7, 3); // ~0.973
 * ```
 */
function calculatePassAtK(passRate, k) {
    // Number() applies the same coercion the arithmetic below would, so
    // numeric strings keep working exactly as before.
    const rate = Number(passRate);
    const runs = Number(k);
    // The negated form also rejects NaN.
    if (!(rate >= 0 && rate <= 1)) {
        throw new RangeError(`passRate must be between 0 and 1, received ${passRate}`);
    }
    if (!(Number.isFinite(runs) && runs >= 0)) {
        throw new RangeError(`k must be a finite number >= 0, received ${k}`);
    }
    return 1 - Math.pow(1 - rate, runs);
}
|
|
190
|
+
/**
 * Calculate pass^k probability from pass rate.
 *
 * Computes `passRate^k`, the probability that all `k` independent runs pass.
 *
 * @param passRate - Single-run pass rate (0-1)
 * @param k - Number of runs (finite, >= 0)
 * @returns Probability that all k runs pass
 * @throws {RangeError} If `passRate` is outside [0, 1] (for example a
 *   percentage such as 70 instead of 0.7) or `k` is negative or not finite.
 *   Such inputs previously produced a value that is not a probability.
 *
 * @example
 * ```typescript
 * const passToK = calculatePassToK(0.7, 3); // ~0.343
 * ```
 */
function calculatePassToK(passRate, k) {
    // Number() applies the same coercion Math.pow would, so numeric strings
    // keep working exactly as before.
    const rate = Number(passRate);
    const runs = Number(k);
    // The negated form also rejects NaN.
    if (!(rate >= 0 && rate <= 1)) {
        throw new RangeError(`passRate must be between 0 and 1, received ${passRate}`);
    }
    if (!(Number.isFinite(runs) && runs >= 0)) {
        throw new RangeError(`k must be a finite number >= 0, received ${k}`);
    }
    return Math.pow(rate, runs);
}
|
|
205
|
+
/**
 * Calculate required pass rate to achieve target pass@k.
 *
 * Inverts `passAtK = 1 - (1 - rate)^k`, giving
 * `rate = 1 - (1 - targetPassAtK)^(1 / k)`.
 *
 * @param targetPassAtK - Desired pass@k probability (0-1)
 * @param k - Number of runs (finite, > 0)
 * @returns Required single-run pass rate
 * @throws {RangeError} If `targetPassAtK` is outside [0, 1], or `k` is not a
 *   finite number greater than 0. `k = 0` would divide by zero, and a
 *   negative `k` has no meaning as a run count.
 *
 * @example
 * ```typescript
 * const requiredRate = requiredPassRateForPassAtK(0.95, 3); // ~0.632
 * ```
 */
function requiredPassRateForPassAtK(targetPassAtK, k) {
    // Number() applies the same coercion the arithmetic below would, so
    // numeric strings keep working exactly as before.
    const target = Number(targetPassAtK);
    const runs = Number(k);
    // The negated form also rejects NaN.
    if (!(target >= 0 && target <= 1)) {
        throw new RangeError(`targetPassAtK must be between 0 and 1, received ${targetPassAtK}`);
    }
    if (!(Number.isFinite(runs) && runs > 0)) {
        throw new RangeError(`k must be a finite number > 0, received ${k}`);
    }
    return 1 - Math.pow(1 - target, 1 / runs);
}
|
|
220
|
+
/**
 * Decide whether an evaluation counts as reliable.
 *
 * Compares the analysis' `reliabilityScore` with the threshold using `>=`, so
 * a score exactly equal to the threshold is reliable.
 *
 * @param analysis - Criterion analysis result (only `reliabilityScore` is read)
 * @param reliabilityThreshold - Minimum reliability score (default 0.8)
 * @returns `true` when the score is at or above the threshold
 */
function isReliableEvaluation(analysis, reliabilityThreshold = 0.8) {
    const { reliabilityScore: score } = analysis;
    return score >= reliabilityThreshold;
}
|
|
230
|
+
/**
 * Map a reliability score to an actionable recommendation.
 *
 * Tiers are checked from highest to lowest and the first one whose floor the
 * score reaches (`>=`) wins: 0.9, then 0.8, then 0.6. Anything else, including
 * a missing or non-numeric score, gets the "unreliable" message.
 *
 * @param analysis - Criterion analysis result (only `reliabilityScore` is read)
 * @returns Actionable recommendation string
 */
function getReliabilityRecommendation(analysis) {
    const score = analysis.reliabilityScore;
    // Ordered highest floor first; order matters because tiers overlap.
    const tiers = [
        [0.9, 'Evaluation is highly reliable. No changes needed.'],
        [0.8, 'Evaluation is reliable. Consider minor criteria refinements.'],
        [0.6, 'Evaluation has moderate reliability. Add more specific criteria or examples.'],
    ];
    for (const [floor, advice] of tiers) {
        if (score >= floor) {
            return advice;
        }
    }
    return 'Evaluation is unreliable. Consider using deterministic checks or restructuring criteria.';
}
|
|
250
|
+
//# sourceMappingURL=data:application/json;base64,{"version":3,"file":"nondeterminism.js","sourceRoot":"","sources":["../../src/api/nondeterminism.ts"],"names":[],"mappings":";AAAA;;;;GAIG;;;AAqUH,4CAEC;AAcD,4CAEC;AAcD,gEAEC;AASD,oDAKC;AAQD,oEAUC;AArYD,2CAAgE;AA4HhE,+EAA+E;AAC/E,6BAA6B;AAC7B,+EAA+E;AAE/E;;GAEG;AACU,QAAA,cAAc,GAAG;IAC5B;;;;;;;;;;;;;OAaG;IACH,KAAK,CAAC,SAAS,CAAC,OAAyB;QACvC,OAAO,IAAA,2BAAkB,EAAoB,sBAAsB,EAAE;YACnE,MAAM,EAAE,MAAM;YACd,IAAI,EAAE,OAAO;YACb,UAAU,EAAE,MAAM;SACnB,CAAC,CAAC;IACL,CAAC;IAED;;;;;;;OAOG;IACH,KAAK,CAAC,OAAO,CAAC,UAA2B,EAAE;QACzC,MAAM,MAAM,GAAG,IAAI,eAAe,EAAE,CAAC;QACrC,IAAI,OAAO,CAAC,OAAO;YAAE,MAAM,CAAC,GAAG,CAAC,SAAS,EAAE,OAAO,CAAC,OAAO,CAAC,CAAC;QAC5D,IAAI,OAAO,CAAC,MAAM;YAAE,MAAM,CAAC,GAAG,CAAC,QAAQ,EAAE,OAAO,CAAC,MAAM,CAAC,CAAC;QACzD,IAAI,OAAO,CAAC,KAAK;YAAE,MAAM,CAAC,GAAG,CAAC,OAAO,EAAE,MAAM,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC;QAC9D,IAAI,OAAO,CAAC,MAAM;YAAE,MAAM,CAAC,GAAG,CAAC,QAAQ,EAAE,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC;QAEjE,OAAO,IAAA,2BAAkB,EACvB,wBAAwB,MAAM,CAAC,QAAQ,EAAE,EAAE,EAC3C,EAAE,UAAU,EAAE,MAAM,EAAE,CACvB,CAAC;IACJ,CAAC;IAED;;;;;;;OAOG;IACH,KAAK,CAAC,MAAM,CAAC,KAAa;QACxB,OAAO,IAAA,2BAAkB,EACvB,wBAAwB,KAAK,EAAE,EAC/B,EAAE,UAAU,EAAE,MAAM,EAAE,CACvB,CAAC;IACJ,CAAC;IAED;;;;;;;OAOG;IACH,KAAK,CAAC,QAAQ,CAAC,KAAa;QAC1B,MAAM,IAAA,mBAAU,EAAC,wBAAwB,KAAK,QAAQ,EAAE;YACtD,MAAM,EAAE,MAAM;YACd,UAAU,EAAE,MAAM;SACnB,CAAC,CAAC;IACL,CAAC;IAED;;;;;;;OAOG;IACH,KAAK,CAAC,WAAW,CAAC,KAAa;QAC7B,MAAM,IAAA,mBAAU,EAAC,wBAAwB,KAAK,WAAW,EAAE;YACzD,MAAM,EAAE,MAAM;YACd,UAAU,EAAE,MAAM;SACnB,CAAC,CAAC;IACL,CAAC;IAED;;;;;;;;;;;;;;;OAeG;IACH,KAAK,CAAC,YAAY,CAAC,OAA4B;QAC7C,OAAO,IAAA,2BAAkB,EAAuB,yBAAyB,EAAE;YACzE,MAAM,EAAE,MAAM;YACd,IAAI,EAAE,OAAO;YACb,UAAU,EAAE,MAAM;SACnB,CAAC,CAAC;IACL,CAAC;IAED;;;;;;;OAOG;IACH,KAAK,CAAC,UAAU,CAAC,KAAa;QAC5B,OAAO,IAAA,2BAAkB,EACvB,wBAAwB,KAAK,UAAU,EACvC,EAAE,UAAU,EAAE,MAAM,EAAE,CACvB,CAAC;IACJ,CAAC;IAED;;;;;;;;OAQG;IACH,KAAK,CAAC,aAAa,CAAC,KAAa;QAC/B,OAAO,IAAA,2BAAkB,EACvB,wBAAwB,KAAK,UAAU,EACvC,EAAE,UAAU,EA
AE,MAAM,EAAE,CACvB,CAAC;IACJ,CAAC;IAED;;;;;;;OAOG;IACH,KAAK,CAAC,UAAU,CAAC,KAAa;QAC5B,OAAO,IAAA,2BAAkB,EACvB,wBAAwB,KAAK,UAAU,EACvC,EAAE,MAAM,EAAE,MAAM,EAAE,UAAU,EAAE,MAAM,EAAE,CACvC,CAAC;IACJ,CAAC;IAED;;;;;;;;OAQG;IACH,KAAK,CAAC,OAAO;QACX,OAAO,IAAA,2BAAkB,EACvB,sBAAsB,EACtB,EAAE,UAAU,EAAE,MAAM,EAAE,CACvB,CAAC;IACJ,CAAC;CACF,CAAC;AAEF,+EAA+E;AAC/E,mBAAmB;AACnB,+EAA+E;AAE/E;;;;;;;;;;;GAWG;AACH,SAAgB,gBAAgB,CAAC,QAAgB,EAAE,CAAS;IAC1D,OAAO,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,QAAQ,EAAE,CAAC,CAAC,CAAC;AACvC,CAAC;AAED;;;;;;;;;;;GAWG;AACH,SAAgB,gBAAgB,CAAC,QAAgB,EAAE,CAAS;IAC1D,OAAO,IAAI,CAAC,GAAG,CAAC,QAAQ,EAAE,CAAC,CAAC,CAAC;AAC/B,CAAC;AAED;;;;;;;;;;;GAWG;AACH,SAAgB,0BAA0B,CAAC,aAAqB,EAAE,CAAS;IACzE,OAAO,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,aAAa,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC;AAChD,CAAC;AAED;;;;;;GAMG;AACH,SAAgB,oBAAoB,CAClC,QAA2B,EAC3B,oBAAoB,GAAG,GAAG;IAE1B,OAAO,QAAQ,CAAC,gBAAgB,IAAI,oBAAoB,CAAC;AAC3D,CAAC;AAED;;;;;GAKG;AACH,SAAgB,4BAA4B,CAAC,QAA2B;IACtE,IAAI,QAAQ,CAAC,gBAAgB,IAAI,GAAG,EAAE,CAAC;QACrC,OAAO,mDAAmD,CAAC;IAC7D,CAAC;SAAM,IAAI,QAAQ,CAAC,gBAAgB,IAAI,GAAG,EAAE,CAAC;QAC5C,OAAO,8DAA8D,CAAC;IACxE,CAAC;SAAM,IAAI,QAAQ,CAAC,gBAAgB,IAAI,GAAG,EAAE,CAAC;QAC5C,OAAO,8EAA8E,CAAC;IACxF,CAAC;SAAM,CAAC;QACN,OAAO,0FAA0F,CAAC;IACpG,CAAC;AACH,CAAC","sourcesContent":["/**\n * ThinkHive SDK v3.0 - Non-Determinism API\n *\n * API for pass@k / pass^k analysis to measure LLM evaluation reliability\n */\n\nimport { apiRequest, apiRequestWithData } from '../core/client';\n\n// ============================================================================\n// TYPES\n// ============================================================================\n\nexport type NondeterminismRunType = 'pass_at_k' | 'pass_to_k' | 'variance' | 'reliability';\nexport type NondeterminismRunStatus = 'pending' | 'running' | 'completed' | 'failed' | 'cancelled';\n\nexport interface NondeterminismRun {\n  id: string;\n  companyId: string;\n  agentId: string;\n  runType: NondeterminismRunType;\n  kValue: 
number;\n  status: NondeterminismRunStatus;\n  traceCount: number;\n  criterionId?: string;\n  criteriaIds: string[];\n  temperature?: string;\n  model?: string;\n  progressPercent: number;\n  passAtKRate?: string;\n  passToKRate?: string;\n  avgVariance?: string;\n  reliabilityScore?: string;\n  startedAt?: string;\n  completedAt?: string;\n  createdBy?: string;\n  createdAt: string;\n}\n\nexport interface NondeterminismSample {\n  id: string;\n  runId: string;\n  traceId: string;\n  criterionId: string;\n  sampleIndex: number;\n  score: string;\n  passed: boolean;\n  reasoning?: string;\n  confidence?: string;\n  tokensUsed?: number;\n  costUsd?: string;\n  model?: string;\n  temperature?: string;\n  latencyMs?: number;\n  error?: string;\n  createdAt: string;\n}\n\nexport interface CreateRunOptions {\n  agentId: string;\n  criterionId?: string;\n  criteriaIds?: string[];\n  kValue: number;\n  traceIds: string[];\n  runType?: NondeterminismRunType;\n  temperature?: number;\n  model?: string;\n}\n\nexport interface RecordSampleOptions {\n  runId: string;\n  traceId: string;\n  criterionId: string;\n  sampleIndex: number;\n  score: number;\n  passed: boolean;\n  reasoning?: string;\n  confidence?: number;\n  tokensUsed?: number;\n  costUsd?: number;\n  model?: string;\n  temperature?: number;\n  latencyMs?: number;\n  error?: string;\n}\n\nexport interface TraceAnalysis {\n  traceId: string;\n  samples: NondeterminismSample[];\n  passCount: number;\n  totalCount: number;\n  passRate: number;\n  scoreVariance: number;\n  meanScore: number;\n  isConsistent: boolean;\n}\n\nexport interface CriterionAnalysis {\n  criterionId: string;\n  traceAnalyses: TraceAnalysis[];\n  passAtKRate: number;\n  passToKRate: number;\n  reliabilityScore: number;\n  isReliable: boolean;\n  recommendation: string;\n}\n\nexport interface RunSummary {\n  run: NondeterminismRun;\n  traceAnalyses: TraceAnalysis[];\n  criterionAnalyses: CriterionAnalysis[];\n}\n\nexport interface 
ListRunsOptions {\n  agentId?: string;\n  status?: NondeterminismRunStatus;\n  limit?: number;\n  offset?: number;\n}\n\nexport interface PassAtKInfo {\n  concepts: {\n    passAtK: { name: string; description: string; formula: string; useCase: string };\n    passToK: { name: string; description: string; formula: string; useCase: string };\n    variance: { name: string; description: string; useCase: string };\n    reliability: { name: string; description: string; useCase: string };\n  };\n  recommendations: Record<string, string>;\n  defaults: { kValue: number; reliabilityThreshold: number; varianceThreshold: number };\n}\n\n// ============================================================================\n// NON-DETERMINISM API CLIENT\n// ============================================================================\n\n/**\n * Non-Determinism API client for pass@k analysis and reliability measurement\n */\nexport const nondeterminism = {\n  /**\n   * Create a new non-determinism analysis run\n   *\n   * @example\n   * ```typescript\n   * const run = await nondeterminism.createRun({\n   *   agentId: 'agent_123',\n   *   criterionId: 'criterion_456',\n   *   kValue: 5,\n   *   traceIds: ['trace_1', 'trace_2', 'trace_3'],\n   *   runType: 'pass_at_k',\n   * });\n   * ```\n   */\n  async createRun(options: CreateRunOptions): Promise<NondeterminismRun> {\n    return apiRequestWithData<NondeterminismRun>('/nondeterminism/runs', {\n      method: 'POST',\n      body: options,\n      apiVersion: 'none',\n    });\n  },\n\n  /**\n   * Get non-determinism runs\n   *\n   * @example\n   * ```typescript\n   * const runs = await nondeterminism.getRuns({ agentId: 'agent_123' });\n   * ```\n   */\n  async getRuns(options: ListRunsOptions = {}): Promise<NondeterminismRun[]> {\n    const params = new URLSearchParams();\n    if (options.agentId) params.set('agentId', options.agentId);\n    if (options.status) params.set('status', options.status);\n    if (options.limit) params.set('limit', 
String(options.limit));\n    if (options.offset) params.set('offset', String(options.offset));\n\n    return apiRequestWithData<NondeterminismRun[]>(\n      `/nondeterminism/runs?${params.toString()}`,\n      { apiVersion: 'none' }\n    );\n  },\n\n  /**\n   * Get a specific run\n   *\n   * @example\n   * ```typescript\n   * const run = await nondeterminism.getRun('run_123');\n   * ```\n   */\n  async getRun(runId: string): Promise<NondeterminismRun> {\n    return apiRequestWithData<NondeterminismRun>(\n      `/nondeterminism/runs/${runId}`,\n      { apiVersion: 'none' }\n    );\n  },\n\n  /**\n   * Start a run\n   *\n   * @example\n   * ```typescript\n   * await nondeterminism.startRun('run_123');\n   * ```\n   */\n  async startRun(runId: string): Promise<void> {\n    await apiRequest(`/nondeterminism/runs/${runId}/start`, {\n      method: 'POST',\n      apiVersion: 'none',\n    });\n  },\n\n  /**\n   * Complete a run\n   *\n   * @example\n   * ```typescript\n   * await nondeterminism.completeRun('run_123');\n   * ```\n   */\n  async completeRun(runId: string): Promise<void> {\n    await apiRequest(`/nondeterminism/runs/${runId}/complete`, {\n      method: 'POST',\n      apiVersion: 'none',\n    });\n  },\n\n  /**\n   * Record a sample result\n   *\n   * @example\n   * ```typescript\n   * const sample = await nondeterminism.recordSample({\n   *   runId: 'run_123',\n   *   traceId: 'trace_456',\n   *   criterionId: 'criterion_789',\n   *   sampleIndex: 0,\n   *   score: 85,\n   *   passed: true,\n   *   reasoning: 'Response meets quality criteria',\n   * });\n   * ```\n   */\n  async recordSample(options: RecordSampleOptions): Promise<NondeterminismSample> {\n    return apiRequestWithData<NondeterminismSample>('/nondeterminism/samples', {\n      method: 'POST',\n      body: options,\n      apiVersion: 'none',\n    });\n  },\n\n  /**\n   * Get samples for a run\n   *\n   * @example\n   * ```typescript\n   * const samples = await 
nondeterminism.getSamples('run_123');\n   * ```\n   */\n  async getSamples(runId: string): Promise<NondeterminismSample[]> {\n    return apiRequestWithData<NondeterminismSample[]>(\n      `/nondeterminism/runs/${runId}/samples`,\n      { apiVersion: 'none' }\n    );\n  },\n\n  /**\n   * Get run summary with analysis\n   *\n   * @example\n   * ```typescript\n   * const summary = await nondeterminism.getRunSummary('run_123');\n   * console.log(`Pass@k rate: ${summary.criterionAnalyses[0].passAtKRate}`);\n   * ```\n   */\n  async getRunSummary(runId: string): Promise<RunSummary> {\n    return apiRequestWithData<RunSummary>(\n      `/nondeterminism/runs/${runId}/summary`,\n      { apiVersion: 'none' }\n    );\n  },\n\n  /**\n   * Trigger analysis of a completed run\n   *\n   * @example\n   * ```typescript\n   * const summary = await nondeterminism.analyzeRun('run_123');\n   * ```\n   */\n  async analyzeRun(runId: string): Promise<RunSummary> {\n    return apiRequestWithData<RunSummary>(\n      `/nondeterminism/runs/${runId}/analyze`,\n      { method: 'POST', apiVersion: 'none' }\n    );\n  },\n\n  /**\n   * Get information about pass@k analysis\n   *\n   * @example\n   * ```typescript\n   * const info = await nondeterminism.getInfo();\n   * console.log(info.concepts.passAtK.description);\n   * ```\n   */\n  async getInfo(): Promise<PassAtKInfo> {\n    return apiRequestWithData<PassAtKInfo>(\n      '/nondeterminism/info',\n      { apiVersion: 'none' }\n    );\n  },\n};\n\n// ============================================================================\n// HELPER FUNCTIONS\n// ============================================================================\n\n/**\n * Calculate pass@k probability from pass rate\n *\n * @param passRate - Single-run pass rate (0-1)\n * @param k - Number of runs\n * @returns Probability that at least 1 of k runs passes\n *\n * @example\n * ```typescript\n * const passAtK = calculatePassAtK(0.7, 3); // ~0.973\n * ```\n */\nexport function 
calculatePassAtK(passRate: number, k: number): number {\n  return 1 - Math.pow(1 - passRate, k);\n}\n\n/**\n * Calculate pass^k probability from pass rate\n *\n * @param passRate - Single-run pass rate (0-1)\n * @param k - Number of runs\n * @returns Probability that all k runs pass\n *\n * @example\n * ```typescript\n * const passToK = calculatePassToK(0.7, 3); // ~0.343\n * ```\n */\nexport function calculatePassToK(passRate: number, k: number): number {\n  return Math.pow(passRate, k);\n}\n\n/**\n * Calculate required pass rate to achieve target pass@k\n *\n * @param targetPassAtK - Desired pass@k probability\n * @param k - Number of runs\n * @returns Required single-run pass rate\n *\n * @example\n * ```typescript\n * const requiredRate = requiredPassRateForPassAtK(0.95, 3); // ~0.632\n * ```\n */\nexport function requiredPassRateForPassAtK(targetPassAtK: number, k: number): number {\n  return 1 - Math.pow(1 - targetPassAtK, 1 / k);\n}\n\n/**\n * Determine if evaluation is reliable based on analysis\n *\n * @param analysis - Criterion analysis result\n * @param reliabilityThreshold - Minimum reliability score (default 0.8)\n * @returns Whether the evaluation is considered reliable\n */\nexport function isReliableEvaluation(\n  analysis: CriterionAnalysis,\n  reliabilityThreshold = 0.8\n): boolean {\n  return analysis.reliabilityScore >= reliabilityThreshold;\n}\n\n/**\n * Get recommendation based on reliability analysis\n *\n * @param analysis - Criterion analysis result\n * @returns Actionable recommendation string\n */\nexport function getReliabilityRecommendation(analysis: CriterionAnalysis): string {\n  if (analysis.reliabilityScore >= 0.9) {\n    return 'Evaluation is highly reliable. No changes needed.';\n  } else if (analysis.reliabilityScore >= 0.8) {\n    return 'Evaluation is reliable. Consider minor criteria refinements.';\n  } else if (analysis.reliabilityScore >= 0.6) {\n    return 'Evaluation has moderate reliability. 
Add more specific criteria or examples.';\n  } else {\n    return 'Evaluation is unreliable. Consider using deterministic checks or restructuring criteria.';\n  }\n}\n"]}
|
|
@@ -0,0 +1,303 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* ThinkHive SDK v3.1 - Quality Metrics API
|
|
3
|
+
*
|
|
4
|
+
* RAG Evaluation & Hallucination Detection for AI quality assurance
|
|
5
|
+
*/
|
|
6
|
+
/**
 * A single retrieved chunk supplied to RAG evaluation.
 *
 * Only `content` is required; every other field is optional.
 */
export interface RetrievedContext {
    /** Text of the retrieved chunk. */
    content: string;
    /** Optional index of the chunk. NOTE(review): what it is relative to is not visible here. */
    chunkIndex?: number;
    /** Optional free-form key/value data attached to the chunk. */
    metadata?: Record<string, unknown>;
    /** Optional score for the chunk. NOTE(review): scale and origin are not visible here — confirm. */
    score?: number;
}
|
|
15
|
+
/**
 * A ground-truth (reference) context chunk.
 */
export interface GroundTruthContext {
    /** Text of the reference chunk. */
    content: string;
    /** Optional index of the chunk. NOTE(review): what it is relative to is not visible here. */
    chunkIndex?: number;
}
|
|
22
|
+
/**
 * Evidence for a span of text that was judged to be grounded.
 */
export interface GroundedSpan {
    /** The span text. */
    text: string;
    /** Confidence attached to the judgement. NOTE(review): scale not visible here — confirm. */
    confidence: number;
    /** Optional index of the supporting source chunk. */
    sourceChunkIndex?: number;
}
|
|
30
|
+
/**
 * Evidence for a span of text that was judged to be ungrounded.
 */
export interface UngroundedSpan {
    /** The span text. */
    text: string;
    /** Confidence attached to the judgement. NOTE(review): scale not visible here — confirm. */
    confidence: number;
}
|
|
37
|
+
/**
 * One entry of a citation mapping: a claim, the index it cites, and whether
 * that citation was judged valid.
 */
export interface CitationMap {
    /** The claim text. */
    claim: string;
    /** Index of the cited item. NOTE(review): presumably indexes the retrieved contexts — confirm. */
    citedIndex: number;
    /** Whether the citation is valid. */
    isValid: boolean;
}
|
|
45
|
+
/**
 * Result of a RAG evaluation: individual metric scores, an overall score and
 * letter grade, plus textual issues and recommendations.
 *
 * NOTE(review): the numeric scale of the metric fields is not visible in this
 * declaration file — confirm against the API before comparing to thresholds.
 */
export interface RAGEvaluation {
    // Context metrics
    contextRelevance: number;
    contextPrecision: number;
    contextRecall: number;
    // Answer metrics
    groundedness: number;
    faithfulness: number;
    answerRelevance: number;
    // Citation metrics
    citationAccuracy: number;
    citationCompleteness: number;
    // Aggregate result
    overallScore: number;
    /** Letter grade, `'A'` through `'F'` (there is no `'E'`). */
    grade: 'A' | 'B' | 'C' | 'D' | 'F';
    /** Optional number of grounded spans. */
    groundedSpanCount?: number;
    /** Optional number of ungrounded spans. */
    ungroundedSpanCount?: number;
    /** Issues reported for the evaluation. */
    issues: string[];
    /** Recommendations reported for the evaluation. */
    recommendations: string[];
}
|
|
64
|
+
/**
 * Evidence accompanying a RAG evaluation: the grounded spans, the ungrounded
 * spans, and the citation mapping.
 */
export interface RAGEvidence {
    /** Spans judged to be grounded. */
    groundedSpans: GroundedSpan[];
    /** Spans judged to be ungrounded. */
    ungroundedSpans: UngroundedSpan[];
    /** Citation mapping entries. */
    citationMap: CitationMap[];
}
|
|
72
|
+
/**
 * A single detected hallucination.
 */
export interface HallucinationInstance {
    /** Kind of hallucination. Typed as a plain string; the allowed values are not declared here. */
    type: string;
    /** Severity of this instance. */
    severity: 'low' | 'medium' | 'high' | 'critical';
    /** The offending text. */
    text: string;
    /** Explanation for the finding. */
    explanation: string;
    /** Confidence attached to the finding. NOTE(review): scale not visible here — confirm. */
    confidence: number;
    /** Optional suggested replacement or correction. */
    suggestedFix?: string;
}
|
|
83
|
+
/**
 * Hallucination detection report for one response.
 */
export interface HallucinationReport {
    /** Whether any hallucination was detected. */
    hasHallucinations: boolean;
    /** Aggregate hallucination score. NOTE(review): scale and direction not visible here — confirm. */
    hallucinationScore: number;
    /** Overall risk level of the response. */
    riskLevel: 'low' | 'medium' | 'high' | 'critical';
    /** Count of factual claims. */
    factualClaims: number;
    /** Count of verified claims. */
    verifiedClaims: number;
    /** Count of unverified claims. */
    unverifiedClaims: number;
    /** Textual summary of the report. */
    summary: string;
    /** Recommendations reported with the findings. */
    recommendations: string[];
    /** The individual hallucination instances. */
    instances: HallucinationInstance[];
}
|
|
97
|
+
/**
 * Groundedness analysis result.
 *
 * Unlike `RAGEvaluation.grade`, `grade` here is declared as a plain `string`.
 */
export interface GroundednessResult {
    /** Groundedness score. NOTE(review): scale not visible here — confirm. */
    score: number;
    /** Faithfulness score. */
    faithfulness: number;
    /** Context relevance score. */
    contextRelevance: number;
    /** Grade label. */
    grade: string;
}
|
|
106
|
+
/**
 * Batch evaluation result for a single trace.
 *
 * `error`, `rag` and `hallucination` are all optional. NOTE(review):
 * presumably `error` accompanies `success: false` and the metric objects
 * accompany `success: true` — the declaration does not enforce this.
 */
export interface BatchEvaluationResult {
    /** Identifier of the evaluated trace. */
    traceId: string;
    /** Whether evaluation of this trace succeeded. */
    success: boolean;
    /** Optional error message. */
    error?: string;
    /** Optional condensed RAG result. */
    rag?: {
        score: number;
        grade: string;
        mainIssue?: string;
    };
    /** Optional condensed hallucination result. */
    hallucination?: {
        hasIssues: boolean;
        score: number;
        topIssue?: string;
    };
}
|
|
124
|
+
/**
 * Aggregate summary of a batch evaluation.
 */
export interface BatchEvaluationSummary {
    /** Number of traces in the batch. */
    totalTraces: number;
    /** Number of traces evaluated successfully. */
    successfulEvaluations: number;
    /** Average RAG score. NOTE(review): which traces are averaged is not visible here. */
    avgRagScore: number;
    /**
     * Hallucination rate. NOTE(review): the `evaluateBatch` example prints
     * this value followed by `%`, which suggests a percentage — confirm.
     */
    hallucinationRate: number;
    /** Count of results per letter grade. */
    gradeDistribution: {
        A: number;
        B: number;
        C: number;
        D: number;
        F: number;
    };
}
|
|
140
|
+
/**
 * Quality Metrics API client for RAG evaluation and hallucination detection.
 *
 * This is the ambient declaration only; the implementation lives in the
 * accompanying JavaScript file.
 */
export declare const qualityMetrics: {
    /**
     * Get RAG quality scores for a specific trace.
     *
     * @param traceId - Identifier of the trace to look up.
     * @returns The trace id together with its evaluation and evidence.
     *
     * @example
     * ```typescript
     * const scores = await qualityMetrics.getRagScores('trace_abc123');
     * console.log(`Groundedness: ${scores.evaluation.groundedness}`);
     * console.log(`Grade: ${scores.evaluation.grade}`);
     * ```
     */
    getRagScores(traceId: string): Promise<{
        traceId: string;
        evaluation: RAGEvaluation;
        evidence: RAGEvidence;
    }>;
    /**
     * Get hallucination detection report for a trace.
     *
     * @param traceId - Identifier of the trace to look up.
     * @returns The trace id together with its hallucination report.
     *
     * @example
     * ```typescript
     * const report = await qualityMetrics.getHallucinationReport('trace_abc123');
     * if (report.report.hasHallucinations) {
     *   console.log(`Risk level: ${report.report.riskLevel}`);
     *   for (const instance of report.report.instances) {
     *     console.log(`- ${instance.type}: ${instance.text}`);
     *   }
     * }
     * ```
     */
    getHallucinationReport(traceId: string): Promise<{
        traceId: string;
        report: HallucinationReport;
    }>;
    /**
     * Evaluate RAG quality for provided content (ad-hoc evaluation).
     *
     * @param input - Query, response and retrieved contexts; ground-truth
     *   contexts and citations are optional.
     * @returns The evaluation and its evidence (no trace id, unlike `getRagScores`).
     *
     * @example
     * ```typescript
     * const result = await qualityMetrics.evaluateRag({
     *   query: 'What is the refund policy?',
     *   response: 'You can get a refund within 30 days.',
     *   retrievedContexts: [
     *     { content: 'Our refund policy allows returns within 30 days of purchase.' },
     *   ],
     * });
     * console.log(`Groundedness: ${result.evaluation.groundedness}`);
     * ```
     */
    evaluateRag(input: {
        query: string;
        response: string;
        retrievedContexts: RetrievedContext[];
        groundTruthContexts?: GroundTruthContext[];
        citations?: string[];
    }): Promise<{
        evaluation: RAGEvaluation;
        evidence: RAGEvidence;
    }>;
    /**
     * Detect hallucinations in provided content (ad-hoc detection).
     *
     * @param input - Response and the contexts to check it against; `query`
     *   and `previousResponses` are optional.
     * @returns The hallucination report.
     *
     * @example
     * ```typescript
     * const result = await qualityMetrics.detectHallucinations({
     *   response: 'The product costs $99 and comes with a 2-year warranty.',
     *   contexts: [
     *     { content: 'The product costs $99 with a 1-year warranty.' },
     *   ],
     * });
     * if (result.report.hasHallucinations) {
     *   console.log('Detected hallucinations:', result.report.instances);
     * }
     * ```
     */
    detectHallucinations(input: {
        response: string;
        contexts: Array<{
            content: string;
            metadata?: Record<string, unknown>;
        }>;
        query?: string;
        previousResponses?: string[];
    }): Promise<{
        report: HallucinationReport;
    }>;
    /**
     * Get groundedness analysis for a trace.
     *
     * The inline grounded-span shape uses a required `sourceIndex`, whereas
     * the exported `GroundedSpan` interface names it `sourceChunkIndex` and
     * makes it optional; the two are not interchangeable.
     *
     * @param traceId - Identifier of the trace to look up.
     * @returns Scores, the grounded/ungrounded spans, and span counts.
     *
     * @example
     * ```typescript
     * const result = await qualityMetrics.getGroundedness('trace_abc123');
     * console.log(`Groundedness score: ${result.groundedness.score}`);
     * console.log(`Grounded spans: ${result.summary.groundedSpans}`);
     * ```
     */
    getGroundedness(traceId: string): Promise<{
        traceId: string;
        groundedness: GroundednessResult;
        spans: {
            grounded: Array<{
                text: string;
                confidence: number;
                sourceIndex: number;
            }>;
            ungrounded: Array<{
                text: string;
                confidence: number;
            }>;
        };
        summary: {
            totalSpans: number;
            groundedSpans: number;
            ungroundedSpans: number;
            groundednessRatio: number;
        };
    }>;
    /**
     * Evaluate multiple traces for quality metrics in batch.
     *
     * @param options - Trace ids to evaluate, plus an optional
     *   `includeDetails` flag. NOTE(review): the effect of `includeDetails`
     *   is not visible in this declaration.
     * @returns A batch summary and one result per trace.
     *
     * @example
     * ```typescript
     * const result = await qualityMetrics.evaluateBatch({
     *   traceIds: ['trace_1', 'trace_2', 'trace_3'],
     * });
     * console.log(`Average RAG score: ${result.summary.avgRagScore}`);
     * console.log(`Hallucination rate: ${result.summary.hallucinationRate}%`);
     * ```
     */
    evaluateBatch(options: {
        traceIds: string[];
        includeDetails?: boolean;
    }): Promise<{
        summary: BatchEvaluationSummary;
        results: BatchEvaluationResult[];
    }>;
};
|
|
280
|
+
/**
 * Check if a RAG evaluation passes quality thresholds.
 *
 * @param evaluation - The evaluation to check.
 * @param thresholds - Optional minimums; every field is optional.
 *   NOTE(review): the defaults applied when a field or the whole object is
 *   omitted are not visible in this declaration.
 * @returns Whether the evaluation passes.
 */
export declare function passesQualityThreshold(
    evaluation: RAGEvaluation,
    thresholds?: {
        minGroundedness?: number;
        minOverallScore?: number;
        minGrade?: 'A' | 'B' | 'C' | 'D';
    },
): boolean;
|
|
288
|
+
/**
 * Check if hallucination risk is acceptable.
 *
 * @param report - The hallucination report to check.
 * @param maxRiskLevel - Optional highest acceptable risk level. `'critical'`
 *   is not an allowed value. NOTE(review): the default used when omitted is
 *   not visible in this declaration.
 * @returns Whether the report's risk is acceptable.
 */
export declare function isHallucinationRiskAcceptable(
    report: HallucinationReport,
    maxRiskLevel?: 'low' | 'medium' | 'high',
): boolean;
|
|
292
|
+
/**
 * Get quality recommendations based on evaluation.
 *
 * @param ragEval - The RAG evaluation to base recommendations on.
 * @param hallucinationReport - Optional hallucination report to take into account.
 * @returns A list of recommendation strings.
 */
export declare function getQualityRecommendations(
    ragEval: RAGEvaluation,
    hallucinationReport?: HallucinationReport,
): string[];
|
|
296
|
+
/**
 * Format quality score for display.
 *
 * @param score - The numeric score to format. NOTE(review): expected scale
 *   and output format are not visible in this declaration.
 * @returns The formatted score as a string.
 */
export declare function formatQualityScore(
    score: number,
): string;
|
|
300
|
+
/**
 * Get color indicator for grade.
 *
 * @param grade - Letter grade, `'A'` through `'F'` (there is no `'E'`).
 * @returns One of five color names. NOTE(review): presumably A→green through
 *   F→red in declaration order, but the mapping is not visible here.
 */
export declare function getGradeColor(
    grade: 'A' | 'B' | 'C' | 'D' | 'F',
): 'green' | 'blue' | 'yellow' | 'orange' | 'red';
|