teleportation-cli 1.3.0 → 1.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/hooks/permission_request.mjs +11 -4
- package/.claude/hooks/post_tool_use.mjs +1 -3
- package/.claude/hooks/pre_tool_use.mjs +255 -289
- package/.claude/hooks/session-register.mjs +44 -29
- package/.claude/hooks/session_end.mjs +29 -3
- package/.claude/hooks/session_start.mjs +57 -1
- package/.claude/hooks/stop.mjs +245 -242
- package/.claude/hooks/user_prompt_submit.mjs +1 -3
- package/lib/config/manager.js +45 -1
- package/lib/daemon/session-file-registry.js +207 -0
- package/lib/daemon/task-executor-v2.js +239 -29
- package/lib/daemon/teleportation-daemon.js +469 -29
- package/lib/daemon/timeline-analyzer.js +19 -13
- package/lib/daemon/transcript-ingestion.js +310 -51
- package/lib/daemon/utils.js +0 -9
- package/lib/install/installer.js +126 -3
- package/lib/install/uhr-installer.js +32 -18
- package/lib/intelligence/benchmark.js +240 -0
- package/lib/intelligence/index.js +29 -0
- package/lib/intelligence/rebuild-policies.js +169 -0
- package/lib/intelligence/schema.js +259 -0
- package/lib/intelligence/transcript-mine.js +339 -0
- package/lib/session/metadata.js +23 -5
- package/lib/transcript-sync/lifecycle.js +88 -0
- package/lib/transcript-sync/repo-context.js +45 -0
- package/lib/transcript-sync/worker.js +233 -0
- package/lib/utils/log-sanitizer.js +65 -0
- package/package.json +2 -1
- package/scripts/sync-transcripts.sh +272 -0
- package/teleportation-cli.cjs +295 -4
package/lib/install/installer.js
CHANGED
|
@@ -10,6 +10,7 @@
|
|
|
10
10
|
*/
|
|
11
11
|
|
|
12
12
|
import { copyFile, mkdir, chmod, readFile, writeFile, stat, readdir } from 'fs/promises';
|
|
13
|
+
import { existsSync } from 'fs';
|
|
13
14
|
import { join, dirname, resolve } from 'path';
|
|
14
15
|
import { fileURLToPath } from 'url';
|
|
15
16
|
import { homedir } from 'os';
|
|
@@ -79,6 +80,28 @@ export function checkClaudeCode() {
|
|
|
79
80
|
};
|
|
80
81
|
}
|
|
81
82
|
|
|
83
|
+
/**
 * Detect whether Cursor IDE is present on this machine.
 *
 * Two probes are tried in order: the `.cursor` directory under HOME_DIR,
 * then a `cursor` executable located through `which`.
 *
 * @returns {{valid: boolean, path?: string, error?: string}} `valid: true`
 *   with the detected path, or `valid: false` with an explanatory error.
 */
export function checkCursorIde() {
  const configDir = join(HOME_DIR, '.cursor');
  if (existsSync(configDir)) {
    return { valid: true, path: configDir };
  }

  let binaryPath = '';
  try {
    binaryPath = execSync('which cursor', {
      encoding: 'utf8',
      stdio: ['ignore', 'pipe', 'ignore'],
    }).trim();
  } catch (e) {
    // `which` failed, so no cursor binary is reachable on PATH
  }

  if (binaryPath) {
    return { valid: true, path: binaryPath };
  }

  return {
    valid: false,
    error: 'Cursor IDE not found. Install Cursor or ensure ~/.cursor directory exists.'
  };
}
|
|
104
|
+
|
|
82
105
|
/**
|
|
83
106
|
* Check if Gemini CLI is installed
|
|
84
107
|
*/
|
|
@@ -307,6 +330,88 @@ export async function installGeminiHooks(sourceGeminiHooksDir) {
|
|
|
307
330
|
};
|
|
308
331
|
}
|
|
309
332
|
|
|
333
|
+
/**
 * Install Teleportation hooks for Cursor IDE.
 * Writes/merges ~/.cursor/hooks.json to point at the installed hook scripts.
 * Cursor reads this file natively — no "Third-party skills" toggle required.
 *
 * Merge rules:
 *  - Top-level keys and hook events Teleportation does not manage are kept as-is.
 *  - For each managed event, entries belonging to other tools are kept; any
 *    previous Teleportation entry (matched by script file name) is replaced,
 *    so re-running the installer does not create duplicates.
 *  - A missing or empty hooks.json is treated as "start fresh".
 *
 * @param {string} sourceHooksDir - Absolute path to the installed .claude/hooks/ directory
 * @returns {Promise<{hooksJsonPath: string, hooksInstalled: string[]}>} Path of the
 *   written file and the list of hook event names that were installed.
 * @throws {Error} If the existing hooks.json cannot be read (other than not
 *   existing) or is not valid JSON; the file is left untouched in that case.
 */
export async function installCursorHooks(sourceHooksDir) {
  const cursorDir = join(HOME_DIR, '.cursor');
  const hooksJsonPath = join(cursorDir, 'hooks.json');
  await mkdir(cursorDir, { recursive: true });

  // Cursor hook event name -> Teleportation script that handles it
  const hookScripts = {
    preToolUse: 'pre_tool_use.mjs',
    postToolUse: 'post_tool_use.mjs',
    sessionStart: 'session_start.mjs',
    sessionEnd: 'session_end.mjs',
    stop: 'stop.mjs',
    beforeSubmitPrompt: 'user_prompt_submit.mjs',
  };

  const isPlainObject = (value) =>
    value !== null && typeof value === 'object' && !Array.isArray(value);

  // Load any existing hooks.json so other tools' configuration survives
  let content = null;
  try {
    content = await readFile(hooksJsonPath, 'utf8');
  } catch (e) {
    // Only "file does not exist" means start fresh; anything else is a real error
    if (e.code !== 'ENOENT') throw e;
  }

  let existing = {};
  if (content !== null && content.trim() !== '') {
    let parsed;
    try {
      parsed = JSON.parse(content);
    } catch (e) {
      // Overwriting here would destroy a hand-edited file the user may want to repair
      throw new Error(
        `Existing ${hooksJsonPath} is not valid JSON; refusing to overwrite it (${e.message})`,
        { cause: e },
      );
    }
    // Non-object JSON (null, array, scalar) carries nothing worth merging
    if (isPlainObject(parsed)) existing = parsed;
  }

  const existingHooks = isPlainObject(existing.hooks) ? existing.hooks : {};
  const mergedHooks = { ...existingHooks };

  for (const [eventName, scriptFile] of Object.entries(hookScripts)) {
    const priorEntries = Array.isArray(existingHooks[eventName]) ? existingHooks[eventName] : [];
    // Keep entries owned by other tools; drop stale Teleportation entries for this event
    const foreignEntries = priorEntries.filter((entry) => {
      const command = typeof entry?.command === 'string' ? entry.command : '';
      return !command.includes(scriptFile);
    });
    mergedHooks[eventName] = [
      ...foreignEntries,
      { command: `bun ${join(sourceHooksDir, scriptFile)}` },
    ];
  }

  const merged = {
    ...existing,
    version: 1,
    hooks: mergedHooks,
  };

  await writeFile(hooksJsonPath, JSON.stringify(merged, null, 2));
  return { hooksJsonPath, hooksInstalled: Object.keys(hookScripts) };
}
|
|
375
|
+
|
|
376
|
+
/**
 * Remove Teleportation hooks from ~/.cursor/hooks.json without deleting the file.
 * Preserves hooks belonging to other tools.
 *
 * An entry is considered Teleportation-owned when its `command` string
 * contains one of the Teleportation hook script file names. Event keys left
 * with no entries after filtering are removed. The file is only rewritten
 * when at least one entry was actually removed, so an unrelated hooks.json
 * is not reformatted.
 *
 * @returns {Promise<{success: boolean, hooksJsonPath: string}>} `success: true`
 *   when the hooks were removed or there was nothing to remove.
 * @throws {Error} If hooks.json cannot be read or written (other than not
 *   existing), or is not valid JSON.
 */
export async function uninstallCursorHooks() {
  const hooksJsonPath = join(HOME_DIR, '.cursor', 'hooks.json');
  const teleportationKeys = ['preToolUse', 'postToolUse', 'sessionStart', 'sessionEnd', 'stop', 'beforeSubmitPrompt'];
  const teleportationHookFiles = [
    'pre_tool_use.mjs',
    'post_tool_use.mjs',
    'session_start.mjs',
    'session_end.mjs',
    'stop.mjs',
    'user_prompt_submit.mjs',
  ];

  try {
    const content = await readFile(hooksJsonPath, 'utf8');
    const parsed = JSON.parse(content);
    // Tolerate files whose top level (or `hooks`) is not an object: nothing to remove
    const hooks = parsed?.hooks;
    let changed = false;

    if (hooks !== null && typeof hooks === 'object') {
      for (const key of teleportationKeys) {
        const entries = hooks[key];
        if (!Array.isArray(entries)) continue;

        const kept = entries.filter((entry) => {
          // Entries without a string command cannot be ours; keep them
          const command = typeof entry?.command === 'string' ? entry.command : '';
          return !teleportationHookFiles.some((file) => command.includes(file));
        });
        if (kept.length === entries.length) continue;

        changed = true;
        if (kept.length === 0) {
          delete hooks[key];
        } else {
          hooks[key] = kept;
        }
      }
    }

    if (changed) {
      await writeFile(hooksJsonPath, JSON.stringify(parsed, null, 2));
    }
    return { success: true, hooksJsonPath };
  } catch (e) {
    if (e.code === 'ENOENT') return { success: true, hooksJsonPath }; // Nothing to remove
    throw e;
  }
}
|
|
414
|
+
|
|
310
415
|
/**
|
|
311
416
|
* Copy daemon files to ~/.teleportation/daemon/
|
|
312
417
|
*/
|
|
@@ -662,6 +767,7 @@ export async function verifyInstallation() {
|
|
|
662
767
|
* @param {Object} [options] - Installation options
|
|
663
768
|
* @param {boolean} [options.includeClaude] - Force include Claude hooks (overrides detection)
|
|
664
769
|
* @param {boolean} [options.includeGemini] - Force include Gemini hooks (overrides detection)
|
|
770
|
+
* @param {boolean} [options.includeCursor] - Force include Cursor hooks (overrides detection)
|
|
665
771
|
*/
|
|
666
772
|
export async function install(sourceHooksDir, options = {}) {
|
|
667
773
|
// Pre-flight checks
|
|
@@ -673,6 +779,7 @@ export async function install(sourceHooksDir, options = {}) {
|
|
|
673
779
|
// Check available CLIs
|
|
674
780
|
const claudeCheck = checkClaudeCode();
|
|
675
781
|
const geminiCheck = checkGeminiCli();
|
|
782
|
+
const cursorCheck = checkCursorIde();
|
|
676
783
|
|
|
677
784
|
// Resolve what to install based on options and detection
|
|
678
785
|
const shouldInstallClaude = options.includeClaude !== undefined
|
|
@@ -683,11 +790,15 @@ export async function install(sourceHooksDir, options = {}) {
|
|
|
683
790
|
? options.includeGemini
|
|
684
791
|
: geminiCheck.valid;
|
|
685
792
|
|
|
686
|
-
|
|
687
|
-
|
|
793
|
+
const shouldInstallCursor = options.includeCursor !== undefined
|
|
794
|
+
? options.includeCursor
|
|
795
|
+
: cursorCheck.valid;
|
|
796
|
+
|
|
797
|
+
if (!shouldInstallClaude && !shouldInstallGemini && !shouldInstallCursor) {
|
|
798
|
+
if (options.includeClaude === false && options.includeGemini === false && options.includeCursor === false) {
|
|
688
799
|
throw new Error('No targets selected for installation.');
|
|
689
800
|
}
|
|
690
|
-
throw new Error('Neither Claude Code
|
|
801
|
+
throw new Error('Neither Claude Code, Gemini CLI, nor Cursor IDE found. Please install one of them first, or specify a target.');
|
|
691
802
|
}
|
|
692
803
|
|
|
693
804
|
// Create directories
|
|
@@ -695,6 +806,7 @@ export async function install(sourceHooksDir, options = {}) {
|
|
|
695
806
|
|
|
696
807
|
let hooksInstalled = 0;
|
|
697
808
|
let geminiHooksInstalled = 0;
|
|
809
|
+
let cursorHooksInstalled = 0;
|
|
698
810
|
|
|
699
811
|
// 1. Install Claude hooks
|
|
700
812
|
if (shouldInstallClaude) {
|
|
@@ -719,6 +831,16 @@ export async function install(sourceHooksDir, options = {}) {
|
|
|
719
831
|
}
|
|
720
832
|
}
|
|
721
833
|
|
|
834
|
+
// 3. Install Cursor hooks
|
|
835
|
+
if (shouldInstallCursor) {
|
|
836
|
+
try {
|
|
837
|
+
const cursorResult = await installCursorHooks(resolve(getProjectHooksDir()));
|
|
838
|
+
cursorHooksInstalled = cursorResult.hooksInstalled.length;
|
|
839
|
+
} catch (e) {
|
|
840
|
+
console.warn(`Warning: Cursor hooks failed to install: ${e.message}`);
|
|
841
|
+
}
|
|
842
|
+
}
|
|
843
|
+
|
|
722
844
|
// Install daemon (still goes to ~/.teleportation/daemon/)
|
|
723
845
|
const daemonResult = await installDaemon();
|
|
724
846
|
if (daemonResult.failed.length > 0) {
|
|
@@ -753,6 +875,7 @@ export async function install(sourceHooksDir, options = {}) {
|
|
|
753
875
|
success: true,
|
|
754
876
|
hooksInstalled,
|
|
755
877
|
geminiHooksInstalled,
|
|
878
|
+
cursorHooksInstalled,
|
|
756
879
|
daemonInstalled: daemonResult.installed.length,
|
|
757
880
|
libFilesInstalled: libResult.installed.length,
|
|
758
881
|
settingsFile: getProjectSettings(),
|
|
@@ -13,35 +13,41 @@
|
|
|
13
13
|
*/
|
|
14
14
|
|
|
15
15
|
import { execSync } from 'child_process';
|
|
16
|
-
import { join } from 'path';
|
|
16
|
+
import { join, resolve, dirname } from 'path';
|
|
17
|
+
import { fileURLToPath } from 'url';
|
|
17
18
|
import { tmpdir } from 'os';
|
|
18
19
|
|
|
20
|
+
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
21
|
+
// Absolute path to the local uhr binary shipped with teleportation
|
|
22
|
+
const LOCAL_UHR_BIN = resolve(__dirname, '..', '..', 'node_modules', '.bin', 'uhr');
|
|
23
|
+
|
|
19
24
|
/**
|
|
20
|
-
*
|
|
21
|
-
*
|
|
22
|
-
*
|
|
23
|
-
* 1. `uhr` on the system PATH (via `which uhr`)
|
|
24
|
-
* 2. `node_modules/.bin/uhr` (local project install)
|
|
25
|
-
*
|
|
26
|
-
* @returns {Promise<boolean>} true if UHR CLI is reachable
|
|
25
|
+
* Resolve the uhr binary path, preferring PATH then the local node_modules install.
|
|
26
|
+
* Returns the binary path string, or null if not found.
|
|
27
|
+
* @returns {Promise<string|null>}
|
|
27
28
|
*/
|
|
28
|
-
export async function
|
|
29
|
+
export async function resolveUhrBin() {
  // 1. Prefer whatever `uhr` the shell would pick up from PATH
  let fromPath = '';
  try {
    fromPath = execSync('which uhr', {
      encoding: 'utf8',
      stdio: ['ignore', 'pipe', 'ignore'],
    }).trim();
  } catch (_) {
    // Not in PATH
  }
  if (fromPath) return fromPath;

  // 2. Fall back to the absolute path of the local node_modules/.bin/uhr
  try {
    const localExists = await Bun.file(LOCAL_UHR_BIN).exists();
    if (localExists) return LOCAL_UHR_BIN;
  } catch (_) {
    // continue
  }

  // Neither location has a uhr binary
  return null;
}
|
|
48
|
+
|
|
49
|
+
/**
 * Report whether a uhr binary can be located.
 *
 * @returns {Promise<boolean>} true when resolveUhrBin() yields a path, false when it yields null
 */
export async function isUhrAvailable() {
  const uhrBin = await resolveUhrBin();
  return uhrBin !== null;
}
|
|
46
52
|
|
|
47
53
|
/**
|
|
@@ -89,9 +95,14 @@ export async function installViaUhr(manifestPath, hooksDir, options = {}) {
|
|
|
89
95
|
}
|
|
90
96
|
|
|
91
97
|
// 5. Run uhr install
|
|
98
|
+
const uhrBin = await resolveUhrBin();
|
|
99
|
+
if (!uhrBin) {
|
|
100
|
+
return { success: false, reason: 'UHR CLI not found (checked PATH and node_modules/.bin/uhr)' };
|
|
101
|
+
}
|
|
102
|
+
|
|
92
103
|
const warnings = [];
|
|
93
104
|
try {
|
|
94
|
-
const output = execSync(`
|
|
105
|
+
const output = execSync(`"${uhrBin}" install "${tempManifestPath}"`, {
|
|
95
106
|
encoding: 'utf8',
|
|
96
107
|
stdio: ['ignore', 'pipe', 'pipe'],
|
|
97
108
|
timeout: 30000,
|
|
@@ -123,8 +134,11 @@ export async function installViaUhr(manifestPath, hooksDir, options = {}) {
|
|
|
123
134
|
* @returns {Promise<{success: boolean}>}
|
|
124
135
|
*/
|
|
125
136
|
export async function uninstallViaUhr(serviceName) {
|
|
137
|
+
const uhrBin = await resolveUhrBin();
|
|
138
|
+
if (!uhrBin) return { success: false };
|
|
139
|
+
|
|
126
140
|
try {
|
|
127
|
-
execSync(`
|
|
141
|
+
execSync(`"${uhrBin}" uninstall "${serviceName}"`, {
|
|
128
142
|
encoding: 'utf8',
|
|
129
143
|
stdio: ['ignore', 'pipe', 'pipe'],
|
|
130
144
|
timeout: 30000,
|
|
@@ -0,0 +1,240 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Transcript intelligence benchmark aggregation.
|
|
3
|
+
*
|
|
4
|
+
* Computes comparative performance slices by task class + provider + model.
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
/**
 * Coerce a value to a finite number.
 *
 * @param {*} value - Candidate value; coerced with Number().
 * @returns {number|null} The coerced number, or null when the input is
 *   null/undefined or does not coerce to a finite number.
 */
function toNumberOrNull(value) {
  if (value === null || value === undefined) {
    return null;
  }
  const parsed = Number(value);
  if (!Number.isFinite(parsed)) {
    return null;
  }
  return parsed;
}
|
|
12
|
+
|
|
13
|
+
/**
 * Pass through genuine booleans only.
 *
 * @param {*} value - Candidate value.
 * @returns {boolean|null} The value itself when it is a boolean, otherwise null
 *   (no truthiness coercion is applied).
 */
function toBoolOrNull(value) {
  return typeof value === 'boolean' ? value : null;
}
|
|
17
|
+
|
|
18
|
+
/**
 * Create an empty per-session summary seeded from an event.
 *
 * Identity fields are copied from the event, with placeholder strings used
 * when a field is missing or falsy; all flags and counters start at
 * false / zero.
 *
 * @param {Object} event - Event supplying session_id, task_category, provider and model.
 * @returns {Object} A fresh, mutable session summary.
 */
function initSession(event) {
  const { session_id, task_category, provider, model } = event;

  return {
    // Identity
    session_id: session_id || 'unknown-session',
    task_category: task_category || 'unknown',
    provider: provider || 'unknown',
    model: model || 'unknown',
    // Outcome flags
    success_seen: false,
    failure_seen: false,
    // Accumulated totals
    cost_usd: 0,
    duration_ms: 0,
    tokens_used: 0,
    // Event counters
    approval_prompts: 0,
    approval_escalations: 0,
    tool_events: 0,
  };
}
|
|
34
|
+
|
|
35
|
+
/**
 * Fold a single event into a session summary (mutates `summary`).
 *
 * Bumps the counter matching the event type, records whether a boolean
 * execution success/failure was observed, and adds any finite execution
 * cost, duration and token figures to the running totals.
 *
 * @param {Object} summary - Session summary to update in place.
 * @param {Object} event - Event with event_type and optional approval / execution data.
 * @returns {void}
 */
function updateSessionSummary(summary, event) {
  switch (event.event_type) {
    case 'approval_request':
      summary.approval_prompts += 1;
      break;
    case 'approval_decision':
      // Only decisions that escalate are counted
      if (event.approval?.decision === 'escalate') {
        summary.approval_escalations += 1;
      }
      break;
    case 'tool_call':
      summary.tool_events += 1;
      break;
    default:
      break;
  }

  // Non-boolean success values are treated as "no outcome reported"
  const outcome = toBoolOrNull(event.execution?.success);
  if (outcome === true) {
    summary.success_seen = true;
  } else if (outcome === false) {
    summary.failure_seen = true;
  }

  // Summary totals share their names with the execution fields they accumulate
  for (const field of ['cost_usd', 'duration_ms', 'tokens_used']) {
    const amount = toNumberOrNull(event.execution?.[field]);
    if (amount !== null) {
      summary[field] += amount;
    }
  }
}
|
|
59
|
+
|
|
60
|
+
/**
 * Derive the final completion state for a session summary.
 *
 * 'success' when only successes were seen, 'failed' when only failures were
 * seen, and 'partial' when both or neither were seen.
 *
 * @param {Object} summary - Accumulated session summary.
 * @returns {Object} A shallow copy of the summary with `completion_state` added.
 */
function finalizeSession(summary) {
  const { success_seen: sawSuccess, failure_seen: sawFailure } = summary;

  let completion_state;
  if (sawSuccess && !sawFailure) {
    completion_state = 'success';
  } else if (sawFailure && !sawSuccess) {
    completion_state = 'failed';
  } else {
    completion_state = 'partial';
  }

  return { ...summary, completion_state };
}
|
|
70
|
+
|
|
71
|
+
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param {number[]} values - Numbers to average.
 * @returns {number|null} The mean, or null for an empty list.
 */
function average(values) {
  const count = values.length;
  if (count === 0) {
    return null;
  }

  let total = 0;
  values.forEach((value) => {
    total += value;
  });
  return total / count;
}
|
|
75
|
+
|
|
76
|
+
/**
 * Compute a quality proxy from slice rates.
 *
 * Starts from the success rate and subtracts an escalation penalty (half the
 * escalation rate, capped at 0.25) and a failure penalty (35% of the failure
 * rate). The result is rounded to 4 decimals and floored at 0.
 *
 * @param {{success_rate: number, failure_rate: number, escalation_rate: number}} stats
 * @returns {number} The quality proxy.
 */
function computeQualityProxy(stats) {
  const { success_rate, failure_rate, escalation_rate } = stats;

  const escalationPenalty = Math.min(0.25, escalation_rate * 0.5);
  const failurePenalty = failure_rate * 0.35;

  const rounded = Number((success_rate - escalationPenalty - failurePenalty).toFixed(4));
  return Math.max(0, rounded);
}
|
|
83
|
+
|
|
84
|
+
/**
 * Min-max normalize a "higher is worse" metric into a penalty in [0, 1].
 *
 * @param {number|null|undefined} value - Metric value to normalize.
 * @param {number|null|undefined} min - Lowest observed value.
 * @param {number|null|undefined} max - Highest observed value.
 * @returns {number} 0 when any input is missing or the range is empty or
 *   inverted; otherwise the position of `value` within [min, max], clamped to [0, 1].
 */
function normalizeHigherIsWorse(value, min, max) {
  const hasMissingInput = [value, min, max].some((input) => input == null);
  if (hasMissingInput) {
    return 0;
  }

  const range = max - min;
  if (range <= 0) {
    return 0;
  }

  const fraction = (value - min) / range;
  return Math.max(0, Math.min(1, fraction));
}
|
|
90
|
+
|
|
91
|
+
/**
 * 95% confidence interval for a proportion, using the normal approximation
 * (margin = 1.96 * sqrt(rate * (1 - rate) / sampleSize)).
 *
 * Invalid input yields an all-zero interval rather than NaN: a non-finite
 * rate or sample size, a sample size <= 0, or a rate outside [0, 1]
 * (which would otherwise take the square root of a negative number).
 *
 * @param {number} rate - Observed proportion, expected in [0, 1].
 * @param {number} sampleSize - Number of observations behind the proportion.
 * @returns {{low: number, high: number, margin: number}} Interval bounds clamped
 *   to [0, 1] and the margin, each rounded to 4 decimals.
 */
function proportionConfidenceInterval(rate, sampleSize) {
  const rateIsValid = Number.isFinite(rate) && rate >= 0 && rate <= 1;
  const sampleIsValid = Number.isFinite(sampleSize) && sampleSize > 0;
  if (!rateIsValid || !sampleIsValid) {
    return { low: 0, high: 0, margin: 0 };
  }

  const margin = 1.96 * Math.sqrt((rate * (1 - rate)) / sampleSize);
  return {
    low: Number(Math.max(0, rate - margin).toFixed(4)),
    high: Number(Math.min(1, rate + margin).toFixed(4)),
    margin: Number(margin.toFixed(4)),
  };
}
|
|
103
|
+
|
|
104
|
+
/**
 * Collapse a flat list of events into one benchmark row per session.
 *
 * Events are grouped by `session_id` (events without one share the
 * 'unknown-session' bucket), folded into a running summary, and finalized
 * with a completion state.
 *
 * Entries that are not objects (null, undefined, primitives) are skipped so a
 * single bad record cannot abort the whole aggregation. A session's task
 * category, provider and model come from the first event that supplies them:
 * when the first event of a session lacks a field, the 'unknown' placeholder
 * is replaced by the first later event that carries a value.
 *
 * @param {Object[]} events - Events to aggregate.
 * @returns {Object[]} One finalized summary per session, in first-seen order;
 *   an empty array when `events` is not a non-empty array.
 */
export function buildSessionBenchmarks(events) {
  if (!Array.isArray(events) || events.length === 0) return [];

  const identityFields = ['task_category', 'provider', 'model'];
  const sessions = new Map();

  for (const event of events) {
    if (event === null || typeof event !== 'object') continue;

    const key = event.session_id || 'unknown-session';
    let summary = sessions.get(key);

    if (summary === undefined) {
      summary = initSession(event);
      sessions.set(key, summary);
    } else {
      // Backfill identity fields the session's earlier events did not provide
      for (const field of identityFields) {
        if (summary[field] === 'unknown' && event[field]) {
          summary[field] = event[field];
        }
      }
    }

    updateSessionSummary(summary, event);
  }

  return Array.from(sessions.values(), (summary) => finalizeSession(summary));
}
|
|
118
|
+
|
|
119
|
+
/**
 * Aggregate session benchmarks into slices keyed by task category, provider
 * and model.
 *
 * Each slice reports its sample size, success / failure / escalation rates
 * (rounded to 4 decimals), average cost, duration and tokens per session,
 * completion reliability (1 - failure rate) and a quality proxy. Slices are
 * ordered by quality proxy (descending), then success rate (descending),
 * then average cost (ascending).
 *
 * @param {Object[]} events - Events to aggregate.
 * @param {Object} [options]
 * @param {number} [options.minSessions=1] - Minimum sessions a slice needs to be
 *   reported; non-integer values fall back to 1 and integers below 1 are raised to 1.
 * @returns {Object[]} Sorted slice summaries; empty when there are no sessions.
 */
export function aggregateBenchmarksByTaskClass(events, options = {}) {
  const minSessions = Number.isInteger(options.minSessions) ? Math.max(1, options.minSessions) : 1;

  const sessionRows = buildSessionBenchmarks(events);
  if (sessionRows.length === 0) return [];

  // Group session rows by (task category, provider, model)
  const groups = new Map();
  for (const row of sessionRows) {
    const groupKey = `${row.task_category}::${row.provider}::${row.model}`;
    let group = groups.get(groupKey);
    if (group === undefined) {
      group = {
        task_category: row.task_category,
        provider: row.provider,
        model: row.model,
        sessions: [],
      };
      groups.set(groupKey, group);
    }
    group.sessions.push(row);
  }

  const roundTo4 = (value) => Number(value.toFixed(4));

  const summarizeGroup = ({ task_category, provider, model, sessions }) => {
    const total = sessions.length;

    let successCount = 0;
    let failureCount = 0;
    let promptCount = 0;
    let escalationCount = 0;
    for (const row of sessions) {
      if (row.completion_state === 'success') successCount += 1;
      if (row.completion_state === 'failed') failureCount += 1;
      promptCount += row.approval_prompts;
      escalationCount += row.approval_escalations;
    }

    const stats = {
      success_rate: roundTo4(successCount / total),
      failure_rate: roundTo4(failureCount / total),
      // Escalations are measured against approval prompts; no prompts means a rate of 0
      escalation_rate: promptCount === 0 ? 0 : roundTo4(escalationCount / promptCount),
    };

    return {
      task_category,
      provider,
      model,
      sample_size: total,
      ...stats,
      avg_cost_usd: average(sessions.map((row) => row.cost_usd)),
      avg_duration_ms: average(sessions.map((row) => row.duration_ms)),
      avg_tokens_used: average(sessions.map((row) => row.tokens_used)),
      completion_reliability: roundTo4(1 - stats.failure_rate),
      quality_proxy: computeQualityProxy(stats),
    };
  };

  // Best quality first, then best success rate, then cheapest
  const byRanking = (a, b) => {
    if (b.quality_proxy !== a.quality_proxy) return b.quality_proxy - a.quality_proxy;
    if (b.success_rate !== a.success_rate) return b.success_rate - a.success_rate;
    return a.avg_cost_usd - b.avg_cost_usd;
  };

  return Array.from(groups.values())
    .filter((group) => group.sessions.length >= minSessions)
    .map(summarizeGroup)
    .sort(byRanking);
}
|
|
175
|
+
|
|
176
|
+
/**
 * Score one harness/model combination against comparable benchmark slices.
 *
 * The target slice is matched on provider and model (and task category when
 * given). Its score is a weighted sum: 0.35 success rate, 0.2 quality proxy,
 * 0.15 cost (lower is better), 0.1 latency (lower is better), 0.1 completion
 * reliability and 0.1 for a low escalation rate. Cost and latency are
 * normalized against the min/max of the comparable slices.
 *
 * @param {Object[]} events - Events to aggregate.
 * @param {Object} [options]
 * @param {string} [options.harness] - Provider to score; takes precedence over `provider`.
 * @param {string} [options.provider] - Provider to score.
 * @param {string} [options.model] - Model to score.
 * @param {string} [options.taskCategory] - Restrict comparison to this task category
 *   (`task_category` is accepted as an alias).
 * @param {number} [options.minSessions=30] - Sessions required before the slice is eligible.
 * @returns {Object} `{ eligible: false, reason, ... }` when there is no data, no
 *   matching slice or too few sessions; otherwise `{ eligible: true, score, metrics,
 *   confidence_interval, ... }` for the matched slice.
 */
export function scoreHarnessModelBenchmark(events, options = {}) {
  const provider = options.harness || options.provider;
  const { model } = options;
  const taskCategory = options.taskCategory || options.task_category;
  const minSessions = Number.isInteger(options.minSessions) ? Math.max(1, options.minSessions) : 30;

  // Aggregate without a sample-size floor so small slices can still be reported as insufficient
  const allSlices = aggregateBenchmarksByTaskClass(events, { minSessions: 1 });
  if (allSlices.length === 0) {
    return { eligible: false, reason: 'no_benchmark_data' };
  }

  let comparable = allSlices;
  if (taskCategory) {
    comparable = allSlices.filter((slice) => slice.task_category === taskCategory);
  }

  // `comparable` is already restricted to the requested task category
  const target = comparable.find((slice) => slice.provider === provider && slice.model === model);
  if (!target) {
    return { eligible: false, reason: 'no_matching_slice' };
  }

  if (target.sample_size < minSessions) {
    return {
      eligible: false,
      reason: 'insufficient_evidence',
      sample_size: target.sample_size,
      required_sample_size: minSessions,
    };
  }

  // Min/max of a metric across comparable slices, or nulls when no slice has a finite value
  const boundsOf = (field) => {
    const finiteValues = comparable
      .map((slice) => slice[field])
      .filter((value) => Number.isFinite(value));
    if (finiteValues.length === 0) return { min: null, max: null };
    return { min: Math.min(...finiteValues), max: Math.max(...finiteValues) };
  };

  const costBounds = boundsOf('avg_cost_usd');
  const latencyBounds = boundsOf('avg_duration_ms');

  const normalizedCostPenalty = normalizeHigherIsWorse(target.avg_cost_usd, costBounds.min, costBounds.max);
  const normalizedLatencyPenalty = normalizeHigherIsWorse(target.avg_duration_ms, latencyBounds.min, latencyBounds.max);

  const successTerm = 0.35 * target.success_rate;
  const qualityTerm = 0.2 * target.quality_proxy;
  const costTerm = 0.15 * (1 - normalizedCostPenalty);
  const latencyTerm = 0.1 * (1 - normalizedLatencyPenalty);
  const reliabilityTerm = 0.1 * target.completion_reliability;
  const escalationTerm = 0.1 * (1 - target.escalation_rate);

  const weightedSum = successTerm + qualityTerm + costTerm + latencyTerm + reliabilityTerm + escalationTerm;
  const score = Number(weightedSum.toFixed(4));

  return {
    eligible: true,
    task_category: target.task_category,
    provider: target.provider,
    model: target.model,
    sample_size: target.sample_size,
    score,
    metrics: target,
    confidence_interval: proportionConfidenceInterval(target.success_rate, target.sample_size),
  };
}
|
|
239
|
+
|
|
240
|
+
export { computeQualityProxy, proportionConfidenceInterval };
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
export {
|
|
2
|
+
normalizeTranscriptEvent,
|
|
3
|
+
normalizeTranscriptEvents,
|
|
4
|
+
isValidNormalizedEvent,
|
|
5
|
+
normalizeTranscriptEntry,
|
|
6
|
+
validateNormalizedTranscriptEntry,
|
|
7
|
+
TASK_CATEGORIES,
|
|
8
|
+
EVENT_TYPES,
|
|
9
|
+
APPROVAL_DECISIONS,
|
|
10
|
+
APPROVAL_SOURCES,
|
|
11
|
+
ENTRY_SCHEMA_VERSION,
|
|
12
|
+
} from './schema.js';
|
|
13
|
+
|
|
14
|
+
export {
|
|
15
|
+
mineRequestPatterns,
|
|
16
|
+
extractRequestText,
|
|
17
|
+
tokenize,
|
|
18
|
+
classifyIntent,
|
|
19
|
+
} from './transcript-mine.js';
|
|
20
|
+
|
|
21
|
+
export {
|
|
22
|
+
buildSessionBenchmarks,
|
|
23
|
+
aggregateBenchmarksByTaskClass,
|
|
24
|
+
scoreHarnessModelBenchmark,
|
|
25
|
+
computeQualityProxy,
|
|
26
|
+
proportionConfidenceInterval,
|
|
27
|
+
} from './benchmark.js';
|
|
28
|
+
|
|
29
|
+
export { rebuildPolicyArtifacts } from './rebuild-policies.js';
|