@claudetools/tools 0.8.2 → 0.8.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +41 -0
- package/dist/context/deduplication.d.ts +72 -0
- package/dist/context/deduplication.js +77 -0
- package/dist/context/deduplication.test.d.ts +6 -0
- package/dist/context/deduplication.test.js +84 -0
- package/dist/context/emergency-eviction.d.ts +73 -0
- package/dist/context/emergency-eviction.example.d.ts +13 -0
- package/dist/context/emergency-eviction.example.js +94 -0
- package/dist/context/emergency-eviction.js +226 -0
- package/dist/context/eviction-engine.d.ts +76 -0
- package/dist/context/eviction-engine.example.d.ts +7 -0
- package/dist/context/eviction-engine.example.js +144 -0
- package/dist/context/eviction-engine.js +176 -0
- package/dist/context/example-usage.d.ts +1 -0
- package/dist/context/example-usage.js +128 -0
- package/dist/context/exchange-summariser.d.ts +80 -0
- package/dist/context/exchange-summariser.js +261 -0
- package/dist/context/health-monitor.d.ts +97 -0
- package/dist/context/health-monitor.example.d.ts +1 -0
- package/dist/context/health-monitor.example.js +164 -0
- package/dist/context/health-monitor.js +210 -0
- package/dist/context/importance-scorer.d.ts +94 -0
- package/dist/context/importance-scorer.example.d.ts +1 -0
- package/dist/context/importance-scorer.example.js +140 -0
- package/dist/context/importance-scorer.js +187 -0
- package/dist/context/index.d.ts +9 -0
- package/dist/context/index.js +16 -0
- package/dist/context/session-helper.d.ts +10 -0
- package/dist/context/session-helper.js +51 -0
- package/dist/context/session-store.d.ts +94 -0
- package/dist/context/session-store.js +286 -0
- package/dist/context/usage-estimator.d.ts +131 -0
- package/dist/context/usage-estimator.js +260 -0
- package/dist/context/usage-estimator.test.d.ts +1 -0
- package/dist/context/usage-estimator.test.js +208 -0
- package/dist/context-cli.d.ts +16 -0
- package/dist/context-cli.js +309 -0
- package/dist/evaluation/build-dataset.d.ts +1 -0
- package/dist/evaluation/build-dataset.js +135 -0
- package/dist/evaluation/threshold-eval.d.ts +63 -0
- package/dist/evaluation/threshold-eval.js +250 -0
- package/dist/handlers/codedna-handlers.d.ts +2 -2
- package/dist/handlers/tool-handlers.js +126 -165
- package/dist/helpers/api-client.d.ts +5 -1
- package/dist/helpers/api-client.js +3 -1
- package/dist/helpers/compact-formatter.d.ts +51 -0
- package/dist/helpers/compact-formatter.js +130 -0
- package/dist/helpers/engagement-tracker.d.ts +10 -0
- package/dist/helpers/engagement-tracker.js +61 -0
- package/dist/helpers/error-tracking.js +1 -1
- package/dist/helpers/session-validation.d.ts +76 -0
- package/dist/helpers/session-validation.js +221 -0
- package/dist/helpers/usage-analytics.js +1 -1
- package/dist/hooks/index.d.ts +4 -0
- package/dist/hooks/index.js +6 -0
- package/dist/hooks/post-tool-use-hook-cli.d.ts +2 -0
- package/dist/hooks/post-tool-use-hook-cli.js +34 -0
- package/dist/hooks/post-tool-use.d.ts +67 -0
- package/dist/hooks/post-tool-use.js +234 -0
- package/dist/hooks/stop-hook-cli.d.ts +2 -0
- package/dist/hooks/stop-hook-cli.js +34 -0
- package/dist/hooks/stop.d.ts +64 -0
- package/dist/hooks/stop.js +192 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.js +2 -0
- package/dist/logger.d.ts +1 -1
- package/dist/logger.js +4 -0
- package/dist/resources.js +3 -0
- package/dist/setup.js +206 -2
- package/dist/templates/claude-md.d.ts +1 -1
- package/dist/templates/claude-md.js +23 -35
- package/dist/templates/worker-prompt.js +35 -202
- package/dist/tools.js +26 -20
- package/package.json +6 -2
|
@@ -0,0 +1,309 @@
|
|
|
1
|
+
// =============================================================================
|
|
2
|
+
// Context Management CLI Commands
|
|
3
|
+
// =============================================================================
|
|
4
|
+
// CLI interface for context window management: status, evict, summarise, reset
|
|
5
|
+
import chalk from 'chalk';
|
|
6
|
+
import ora from 'ora';
|
|
7
|
+
import prompts from 'prompts';
|
|
8
|
+
import { getSessionStore } from './context/session-store.js';
|
|
9
|
+
import { createEvictionEngine } from './context/eviction-engine.js';
|
|
10
|
+
import { createExchangeSummariser } from './context/exchange-summariser.js';
|
|
11
|
+
// -----------------------------------------------------------------------------
|
|
12
|
+
// Utility Functions
|
|
13
|
+
// -----------------------------------------------------------------------------
|
|
14
|
+
/**
 * Print a success line to stdout: a green check mark followed by the message.
 *
 * @param msg - Text shown after the check mark.
 */
function success(msg) {
  const marker = chalk.green('✓ ');
  console.log(marker + msg);
}
|
|
17
|
+
/**
 * Print an error line: a red cross followed by the message.
 * Output goes through console.log, i.e. stdout rather than stderr.
 *
 * @param msg - Text shown after the cross.
 */
function error(msg) {
  const marker = chalk.red('✗ ');
  console.log(marker + msg);
}
|
|
20
|
+
/**
 * Print an informational line to stdout: a blue "ℹ" followed by the message.
 *
 * @param msg - Text shown after the marker.
 */
function info(msg) {
  const marker = chalk.blue('ℹ ');
  console.log(marker + msg);
}
|
|
23
|
+
/**
 * Print a warning line: a yellow "⚠" followed by the message.
 * Output goes through console.log, i.e. stdout rather than stderr.
 *
 * @param msg - Text shown after the marker.
 */
function warn(msg) {
  const marker = chalk.yellow('⚠ ');
  console.log(marker + msg);
}
|
|
26
|
+
/**
 * Print a section banner: a blank line, a 50-character cyan rule, the title
 * in bold cyan, a second rule, and a trailing blank line.
 *
 * @param title - Heading text shown between the two rules.
 */
function header(title) {
  const rule = chalk.cyan('━'.repeat(50));
  const heading = chalk.cyan.bold(title);
  console.log('\n' + rule);
  console.log(heading);
  console.log(rule + '\n');
}
|
|
31
|
+
/**
 * Resolve which session a command should operate on.
 *
 * Resolution order:
 * 1. An explicit `--session=<id>` argument.
 * 2. The only stored session, when exactly one exists.
 * 3. An interactive selection prompt when several sessions exist.
 *
 * @param args - CLI arguments to scan for `--session=<id>`.
 * @returns The chosen session ID, or `null` when the flag has no value, no
 *   sessions are stored, or the prompt yields no selection.
 */
async function getSessionId(args) {
  const flag = '--session=';
  const sessionIdArg = args.find((arg) => arg.startsWith(flag));
  if (sessionIdArg !== undefined) {
    // Keep everything after the prefix so IDs that themselves contain '='
    // are not truncated (split('=')[1] would drop the tail).
    const explicitId = sessionIdArg.slice(flag.length);
    if (explicitId === '') {
      // Callers exit on a falsy ID; say why instead of exiting silently.
      error('Missing session ID: use --session=<id>');
      return null;
    }
    return explicitId;
  }

  // No explicit ID: fall back to whatever the store knows about.
  const store = getSessionStore();
  const sessions = await store.listSessions();
  if (sessions.length === 0) {
    error('No active sessions found');
    return null;
  }

  // A single session needs no prompt.
  if (sessions.length === 1) {
    return sessions[0].session_id;
  }

  // Several sessions: let the user pick one.
  const choices = sessions.map((s) => ({
    title: `${s.session_id} (${s.model}, started ${s.started_at.toLocaleString()})`,
    value: s.session_id,
  }));
  const response = await prompts({
    type: 'select',
    name: 'sessionId',
    message: 'Select a session:',
    choices,
  });
  // No selection recorded in the response is reported as null.
  return response.sessionId || null;
}
|
|
64
|
+
/**
 * Render a fill ratio as a percentage with one decimal place, coloured by
 * severity: green below 50%, yellow from 50% up to (not including) 70%,
 * red otherwise.
 *
 * @param fill - Fill ratio, where 1 corresponds to 100%.
 * @returns The coloured percentage string.
 */
function formatFill(fill) {
  const label = `${(fill * 100).toFixed(1)}%`;
  if (fill < 0.5) {
    return chalk.green(label);
  }
  if (fill < 0.7) {
    return chalk.yellow(label);
  }
  // Everything else, including values that fail both comparisons, is red.
  return chalk.red(label);
}
|
|
79
|
+
/**
 * Print a labelled, multi-line summary of a session to stdout.
 *
 * Reads session_id, model, started_at, context_limit, estimated_fill,
 * injected_facts, exchanges and last_updated from the session. The
 * "Summarised Exchanges" line is printed only when at least one exchange
 * has a truthy `summarised_at`.
 *
 * @param session - Session state object to describe.
 */
function formatSessionSummary(session) {
  // Each row is a bold label followed by one or more values.
  const row = (label, ...values) => console.log(chalk.bold(label), ...values);

  row('Session ID:', session.session_id);
  row('Model:', session.model);
  row('Started:', session.started_at.toLocaleString());
  row('Context Limit:', session.context_limit.toLocaleString(), 'tokens');
  row('Estimated Fill:', formatFill(session.estimated_fill));
  // Used tokens are derived from the fill ratio and the limit.
  const usedTokens = Math.round(session.estimated_fill * session.context_limit);
  row('Used Tokens:', usedTokens.toLocaleString());
  row('Injected Facts:', session.injected_facts.length);
  row('Exchanges:', session.exchanges.length);

  let summarisedCount = 0;
  for (const exchange of session.exchanges) {
    if (exchange.summarised_at) {
      summarisedCount += 1;
    }
  }
  if (summarisedCount > 0) {
    row('Summarised Exchanges:', summarisedCount);
  }

  row('Last Updated:', session.last_updated.toLocaleString());
}
|
|
98
|
+
// -----------------------------------------------------------------------------
|
|
99
|
+
// Commands
|
|
100
|
+
// -----------------------------------------------------------------------------
|
|
101
|
+
/**
 * `claudetools context status` — report context usage for one session.
 *
 * Resolves the session via getSessionId, prints its summary, then prints a
 * health line (critical above 85% fill, warning above 60%, healthy otherwise)
 * and a notice when the eviction engine reports that eviction is due.
 *
 * Calls process.exit(1) when no session can be resolved or the session is
 * missing from the store. Any other failure marks the spinner as failed and
 * is rethrown.
 *
 * @param args - CLI arguments; may include `--session=<id>`.
 */
export async function contextStatus(args) {
  header('Context Status');
  const targetId = await getSessionId(args);
  if (!targetId) {
    process.exit(1);
  }
  const progress = ora('Loading session state...').start();
  try {
    const sessionStore = getSessionStore();
    const current = await sessionStore.getSession(targetId);
    if (!current) {
      progress.fail('Session not found');
      process.exit(1);
    }
    progress.succeed('Session loaded');
    console.log();
    formatSessionSummary(current);

    // Health line, keyed off the estimated fill ratio.
    console.log();
    const isCritical = current.estimated_fill > 0.85;
    if (isCritical) {
      warn('Critical: Context window is near full (>85%). Emergency eviction may be triggered.');
    }
    else if (current.estimated_fill > 0.6) {
      warn('Warning: Context window is filling up (>60%). Consider running eviction.');
    }
    else {
      success('Context window is healthy.');
    }

    // Eviction eligibility is decided by the engine, not by the thresholds above.
    const evictor = createEvictionEngine();
    const evictionDue = evictor.shouldEvict(current);
    if (evictionDue) {
      console.log();
      info('Automatic eviction will be triggered at next opportunity.');
    }
  }
  catch (failure) {
    progress.fail('Failed to load session');
    throw failure;
  }
}
|
|
144
|
+
/**
 * `claudetools context evict` — manually run an eviction cycle for a session.
 *
 * Flow: resolve the session, ask the eviction engine whether eviction is
 * needed, show the plan, ask for confirmation, run the eviction, then store
 * the new estimated fill on the session.
 *
 * Calls process.exit(1) when no session can be resolved or the session is
 * missing from the store. Returns early when eviction is not needed or the
 * user declines. Any failure marks the spinner as failed and is rethrown.
 *
 * @param args - CLI arguments; may include `--session=<id>`.
 */
export async function contextEvict(args) {
  header('Context Eviction');
  const targetId = await getSessionId(args);
  if (!targetId) {
    process.exit(1);
  }
  const progress = ora('Loading session state...').start();
  try {
    const sessionStore = getSessionStore();
    const current = await sessionStore.getSession(targetId);
    if (!current) {
      progress.fail('Session not found');
      process.exit(1);
    }
    progress.text = 'Analysing session...';

    // Nothing to do unless the engine says eviction is due.
    const evictor = createEvictionEngine();
    const evictionDue = evictor.shouldEvict(current);
    if (!evictionDue) {
      progress.info(`No eviction needed (fill: ${formatFill(current.estimated_fill)})`);
      return;
    }

    // Show what would be evicted before touching anything.
    const proposal = evictor.getEvictionPlan(current);
    progress.stop();
    console.log();
    console.log(chalk.bold('Eviction Plan:'));
    console.log(chalk.bold('Facts to evict:'), proposal.factsToEvict.length);
    console.log(chalk.bold('Current fill:'), formatFill(current.estimated_fill));
    console.log(chalk.bold('Expected fill after:'), formatFill(proposal.expectedFillAfter));
    if (proposal.includesCritical) {
      warn('Plan includes evicting CRITICAL facts (fill >85%)');
    }

    // The prompt defaults to "yes".
    const { proceed } = await prompts({
      type: 'confirm',
      name: 'proceed',
      message: 'Proceed with eviction?',
      initial: true,
    });
    if (!proceed) {
      info('Eviction cancelled');
      return;
    }

    progress.start('Evicting facts...');
    const outcome = await evictor.runEviction(current);
    progress.succeed('Eviction complete');
    console.log();
    console.log(chalk.bold('Evicted:'), outcome.evictedCount, 'facts');
    console.log(chalk.bold('New fill:'), formatFill(outcome.newEstimatedFill));

    // Only the new fill estimate is written back to the store here.
    await sessionStore.updateSession(targetId, { estimated_fill: outcome.newEstimatedFill });
    success(`Context window reduced from ${formatFill(current.estimated_fill)} to ${formatFill(outcome.newEstimatedFill)}`);
  }
  catch (failure) {
    progress.fail('Eviction failed');
    throw failure;
  }
}
|
|
206
|
+
/**
 * `claudetools context summarise` — summarise and compress a session's
 * exchanges.
 *
 * Flow: resolve the session, ask the summariser whether summarisation is
 * needed, show the counts, ask for confirmation, run the summarisation, then
 * mark the summarised exchanges in the store.
 *
 * Calls process.exit(1) when no session can be resolved or the session is
 * missing from the store. Returns early when summarisation is not needed or
 * the user declines. Any failure marks the spinner as failed and is rethrown.
 *
 * @param args - CLI arguments; may include `--session=<id>`.
 */
export async function contextSummarise(args) {
  header('Exchange Summarisation');
  const sessionId = await getSessionId(args);
  if (!sessionId) {
    process.exit(1);
  }
  const spinner = ora('Loading session state...').start();
  try {
    const store = getSessionStore();
    const session = await store.getSession(sessionId);
    if (!session) {
      spinner.fail('Session not found');
      process.exit(1);
    }
    spinner.text = 'Analysing exchanges...';

    // Nothing to do unless the summariser says so.
    const summariser = createExchangeSummariser();
    if (!summariser.shouldSummarise(session)) {
      spinner.info('No exchanges need summarisation (< 10 unsummarised)');
      return;
    }
    spinner.stop();
    console.log();

    // Record the positions of unsummarised exchanges BEFORE summarising, in
    // case the summariser updates the session object in place.
    const pendingIndices = [];
    session.exchanges.forEach((exchange, index) => {
      if (!exchange.summarised_at) {
        pendingIndices.push(index);
      }
    });
    console.log(chalk.bold('Unsummarised exchanges:'), pendingIndices.length);
    console.log(chalk.bold('Total exchanges:'), session.exchanges.length);

    // The prompt defaults to "yes".
    const confirm = await prompts({
      type: 'confirm',
      name: 'proceed',
      message: 'Proceed with summarisation?',
      initial: true,
    });
    if (!confirm.proceed) {
      info('Summarisation cancelled');
      return;
    }

    spinner.start('Summarising exchanges...');
    const result = await summariser.summariseOldExchanges(session);
    spinner.succeed('Summarisation complete');
    console.log();
    console.log(chalk.bold('Summarised:'), result.summarisedCount, 'exchanges');
    console.log(chalk.bold('Tokens saved:'), result.tokensSaved.toLocaleString());

    // Mark the exchanges that were actually pending rather than positions
    // 0..summarisedCount-1: once earlier exchanges are already summarised,
    // those leading positions would be re-marked and the new ones left alone.
    // NOTE(review): assumes summariseOldExchanges works through the oldest
    // unsummarised exchanges first and that markExchangeSummarised takes a
    // position in session.exchanges — confirm against both implementations.
    const indicesToMark = pendingIndices.slice(0, result.summarisedCount);
    for (const exchangeIndex of indicesToMark) {
      await store.markExchangeSummarised(sessionId, exchangeIndex);
    }
    success(`Compressed ${result.summarisedCount} exchanges, saved ~${result.tokensSaved} tokens`);
  }
  catch (err) {
    spinner.fail('Summarisation failed');
    throw err;
  }
}
|
|
264
|
+
/**
 * `claudetools context reset` — delete a session's stored state.
 *
 * Shows the session summary and a warning listing what will be removed, then
 * asks for confirmation (defaulting to "no") before deleting the session
 * from the store.
 *
 * Calls process.exit(1) when no session can be resolved or the session is
 * missing from the store. Returns early when the user declines. Any failure
 * marks the spinner as failed and is rethrown.
 *
 * @param args - CLI arguments; may include `--session=<id>`.
 */
export async function contextReset(args) {
  header('Context Reset');
  const targetId = await getSessionId(args);
  if (!targetId) {
    process.exit(1);
  }
  const progress = ora('Loading session state...').start();
  try {
    const sessionStore = getSessionStore();
    const current = await sessionStore.getSession(targetId);
    if (!current) {
      progress.fail('Session not found');
      process.exit(1);
    }
    progress.stop();

    // Show exactly what is about to be removed.
    console.log();
    formatSessionSummary(current);
    console.log();
    warn('This will permanently delete all session data including:');
    const doomedItems = [' - Injected facts', ' - Exchange history', ' - Token usage estimates'];
    for (const item of doomedItems) {
      console.log(item);
    }

    // Destructive action: the prompt defaults to "no".
    const { proceed } = await prompts({
      type: 'confirm',
      name: 'proceed',
      message: chalk.red('Are you sure you want to delete this session?'),
      initial: false,
    });
    if (!proceed) {
      info('Reset cancelled');
      return;
    }

    progress.start('Deleting session...');
    await sessionStore.deleteSession(targetId);
    progress.succeed('Session deleted');
    success('Context state cleared');
  }
  catch (failure) {
    progress.fail('Reset failed');
    throw failure;
  }
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
// Empty export: makes this declaration file a module that exposes no symbols.
export {};
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
// =============================================================================
|
|
2
|
+
// Test Dataset Builder
|
|
3
|
+
// =============================================================================
|
|
4
|
+
// Interactive helper to build test dataset with ground truth judgments
|
|
5
|
+
// =============================================================================
|
|
6
|
+
import { getMemoryIndex } from '../helpers/api-client.js';
|
|
7
|
+
import { DEFAULT_USER_ID } from '../helpers/config.js';
|
|
8
|
+
import * as readline from 'readline/promises';
|
|
9
|
+
/**
 * Parse a comma-separated list of 1-based fact numbers (e.g. "1,3,5") into
 * unique zero-based indices, dropping anything outside [0, factCount).
 */
function parseFactSelection(input, factCount) {
  const picked = new Set();
  for (const token of input.split(',')) {
    const index = parseInt(token.trim(), 10) - 1;
    // NaN fails both comparisons, so non-numeric tokens are dropped here too.
    if (index >= 0 && index < factCount) {
      picked.add(index);
    }
  }
  return [...picked];
}

/**
 * Map free-text category input onto one of the categories offered by the
 * prompt, falling back to 'fact' for empty or unrecognised input.
 */
function normaliseCategory(input) {
  const known = ['architecture', 'pattern', 'decision', 'preference', 'fact'];
  const candidate = input.trim().toLowerCase();
  return known.includes(candidate) ? candidate : 'fact';
}

/**
 * Interactive CLI to build a test dataset with ground-truth judgments.
 *
 * For each query entered by the user:
 * 1. Fetches up to 50 facts with no relevance filter and shows them with
 *    their relevance scores.
 * 2. Asks which facts are actually relevant (the ground truth).
 * 3. Asks for a category and a description.
 *
 * When the user types "done", the collected cases are written to
 * `<outputDir>/dataset-<timestamp>.json` and a paste-ready snippet is printed.
 *
 * @param projectId - Project whose memory index is queried.
 * @param userId - User to query as; defaults to DEFAULT_USER_ID.
 * @param outputDir - Directory for the JSON file, resolved against the
 *   current working directory. Defaults to `docs/evaluation`.
 */
async function buildDatasetInteractive(projectId, userId = DEFAULT_USER_ID, outputDir = 'docs/evaluation') {
  const rl = readline.createInterface({
    input: process.stdin,
    output: process.stdout,
  });
  const dataset = [];
  console.log('📝 Interactive Test Dataset Builder\n');
  console.log('This tool helps you create ground truth judgments for threshold evaluation.\n');
  console.log('For each query, you\'ll see all matching facts and mark which are truly relevant.\n');
  try {
    while (true) {
      // Trim so that " done " still ends the session and stored queries are clean.
      const query = (await rl.question('\n🔍 Enter test query (or "done" to finish): ')).trim();
      if (query.toLowerCase() === 'done') {
        break;
      }
      if (!query) {
        console.log('⚠️ Query cannot be empty');
        continue;
      }

      // Fetch all facts (no relevance filter) so the user judges the full set.
      console.log('\n⏳ Fetching facts...\n');
      const result = await getMemoryIndex(projectId, {
        query,
        limit: 50,
        min_relevance: 0,
      }, userId);
      if (result.index.length === 0) {
        console.log('❌ No facts found for this query. Try a different query.\n');
        continue;
      }

      // Display facts with a bar proportional to relevance.
      console.log(`\n📊 Found ${result.index.length} facts:\n`);
      console.log('─'.repeat(80));
      result.index.forEach((entry, i) => {
        const relevanceBar = '█'.repeat(Math.round(entry.relevance * 20));
        console.log(`\n[${i + 1}] ID: ${entry.id.slice(0, 12)}...`);
        console.log(` Relevance: ${(entry.relevance * 100).toFixed(0)}% ${relevanceBar}`);
        console.log(` Category: ${entry.category}`);
        console.log(` Summary: ${entry.summary}`);
      });
      console.log('\n' + '─'.repeat(80));

      // Collect the ground-truth judgments.
      const relevantIndices = await rl.question('\n✅ Which facts are TRULY RELEVANT? (comma-separated numbers, e.g., "1,3,5"): ');
      if (!relevantIndices.trim()) {
        console.log('⚠️ No facts marked as relevant. Skipping this query.');
        continue;
      }
      const indices = parseFactSelection(relevantIndices, result.index.length);
      if (indices.length === 0) {
        console.log('⚠️ Invalid indices. Skipping this query.');
        continue;
      }
      const relevantFactIds = indices.map((i) => result.index[i].id);
      console.log(`\n✓ Marked ${relevantFactIds.length} facts as relevant`);

      // Metadata for the test case.
      const categoryInput = await rl.question('📂 Category (architecture/pattern/decision/preference/fact): ');
      const category = normaliseCategory(categoryInput);
      if (categoryInput.trim() && category !== categoryInput.trim().toLowerCase()) {
        console.log(`⚠️ Unknown category "${categoryInput.trim()}", using "fact".`);
      }
      const description = await rl.question('📝 Description (what is this query testing?): ');

      dataset.push({
        query,
        relevantFactIds,
        category,
        description: description.trim() || query,
      });
      console.log(`\n✅ Added test case ${dataset.length}`);
    }
  }
  finally {
    // Always release stdin, even if a fetch or prompt throws mid-session.
    rl.close();
  }

  if (dataset.length === 0) {
    console.log('\n⚠️ No test cases created.');
    return;
  }

  // Save the dataset under the caller-chosen directory.
  const fs = await import('fs/promises');
  const path = await import('path');
  const targetDir = path.resolve(outputDir);
  const outputPath = path.join(targetDir, `dataset-${Date.now()}.json`);
  await fs.mkdir(targetDir, { recursive: true });
  await fs.writeFile(outputPath, JSON.stringify(dataset, null, 2));
  console.log('\n' + '='.repeat(80));
  console.log(`\n✅ Dataset saved to: ${outputPath}`);
  console.log(`\n📊 Total test cases: ${dataset.length}`);
  console.log(`📋 Total ground truth facts: ${dataset.reduce((sum, d) => sum + d.relevantFactIds.length, 0)}`);

  // Show code to paste into threshold-eval.ts.
  console.log('\n📝 Copy this into threshold-eval.ts buildTestDataset():\n');
  console.log('```typescript');
  console.log('export function buildTestDataset(): QueryJudgment[] {');
  console.log(' return ' + JSON.stringify(dataset, null, 2) + ';');
  console.log('}');
  console.log('```\n');
}
|
|
111
|
+
/**
 * CLI entry point.
 *
 * Takes the project ID from the first command-line argument. Prints usage
 * and exits with status 1 when it is missing; logs the error and exits with
 * status 1 when the interactive builder throws.
 */
async function main() {
  const [, , projectId] = process.argv;
  if (!projectId) {
    console.error('Usage: tsx src/evaluation/build-dataset.ts <project-id>');
    process.exit(1);
  }
  try {
    await buildDatasetInteractive(projectId);
  }
  catch (failure) {
    console.error('\n❌ Error:', failure);
    process.exit(1);
  }
}
|
|
128
|
+
// Run only when this file is the process entry point (ES module check).
// The published package contains the compiled build-dataset.js, so the
// compiled name is accepted as well as the .ts source run through tsx;
// matching only ".ts" means the shipped file never starts main().
const isMainModule = typeof process.argv[1] === 'string' &&
  /build-dataset\.(ts|js)$/.test(process.argv[1]);
if (isMainModule) {
  main().catch((err) => {
    console.error(err);
    process.exit(1);
  });
}
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
/**
 * One test case: a query together with the fact IDs judged relevant to it.
 */
export interface QueryJudgment {
  /** Query text to evaluate. */
  query: string;
  /** IDs of the facts marked as relevant for this query (the ground truth). */
  relevantFactIds: Array<string>;
  /** Category label for the test case. */
  category:
    | 'architecture'
    | 'pattern'
    | 'decision'
    | 'preference'
    | 'fact';
  /** Free-text note on what the query is testing. */
  description: string;
}
|
|
7
|
+
/**
 * Metrics produced by evaluating one threshold value.
 * NOTE(review): how each figure is aggregated is defined in
 * threshold-eval.js, which is not visible here — confirm before relying on
 * the exact semantics.
 */
export interface ThresholdResult {
  /** Threshold value these metrics belong to. */
  threshold: number;

  /** Precision at this threshold. */
  precision: number;
  /** Recall at this threshold. */
  recall: number;
  /** F1 score at this threshold. */
  f1: number;

  /** Number of true positives. */
  truePositives: number;
  /** Number of false positives. */
  falsePositives: number;
  /** Number of false negatives. */
  falseNegatives: number;

  /** Presumably the mean relevance score of the returned facts — confirm. */
  avgRelevanceReturned: number;
  /** Presumably the total number of facts returned — confirm. */
  totalReturned: number;
}
|
|
18
|
+
/**
 * Result of a full evaluation run across several threshold values.
 */
export interface EvaluationReport {
  /** Project the evaluation was run against. */
  projectId: string;
  /** When the evaluation ran, as a string (exact format not visible here). */
  testDate: string;
  /** One entry per evaluated threshold. */
  results: Array<ThresholdResult>;
  /** Threshold values selected by each metric. */
  bestThreshold: {
    /** Threshold chosen by F1. */
    byF1: number;
    /** Threshold chosen by precision. */
    byPrecision: number;
    /** Threshold chosen by recall. */
    byRecall: number;
  };
  /** Test cases the evaluation was run on. */
  dataset: Array<QueryJudgment>;
}
|
|
29
|
+
/**
 * Return the test dataset of query/relevance judgments used for evaluation.
 *
 * Populating the dataset:
 * 1. Pick real queries from your project.
 * 2. Run memory_index with no relevance filter to list every available fact,
 *    e.g. memory_index(query="authentication patterns", min_relevance=0).
 * 3. Review the returned facts and note the IDs of the truly relevant ones.
 * 4. Add an entry for the query with those fact IDs as its ground truth.
 *
 * @returns The judgments that make up the dataset.
 */
export declare function buildTestDataset(): Array<QueryJudgment>;
|
|
45
|
+
/**
 * Evaluate one threshold value against the test dataset.
 *
 * @param projectId - Project to evaluate against.
 * @param threshold - Threshold value under test.
 * @param dataset - Judgments to score against.
 * @param userId - Optional user to evaluate as.
 * @returns The metrics for this threshold.
 */
export declare function evaluateThreshold(
  projectId: string,
  threshold: number,
  dataset: Array<QueryJudgment>,
  userId?: string,
): Promise<ThresholdResult>;
|
|
49
|
+
/**
 * Run the full evaluation across multiple threshold values.
 *
 * @param projectId - Project to evaluate against.
 * @param thresholds - Optional list of threshold values to evaluate.
 * @param userId - Optional user to evaluate as.
 * @returns The evaluation report.
 */
export declare function runThresholdEvaluation(
  projectId: string,
  thresholds?: Array<number>,
  userId?: string,
): Promise<EvaluationReport>;
|
|
53
|
+
/**
 * Render an evaluation report as markdown.
 *
 * @param report - Report to render.
 * @returns The markdown text.
 */
export declare function formatReport(
  report: EvaluationReport,
): string;
|
|
57
|
+
/**
 * Command-line entry point for the threshold evaluation.
 *
 * Usage:
 *   tsx src/evaluation/threshold-eval.ts <project-id>
 *
 * @returns A promise that settles when the run has finished.
 */
export declare function main(): Promise<void>;
|