@claudetools/tools 0.8.2 → 0.8.3

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
@@ -0,0 +1 @@
+ export {};
@@ -0,0 +1,135 @@
+ // =============================================================================
+ // Test Dataset Builder
+ // =============================================================================
+ // Interactive helper to build test dataset with ground truth judgments
+ // =============================================================================
+ import { getMemoryIndex } from '../helpers/api-client.js';
+ import { DEFAULT_USER_ID } from '../helpers/config.js';
+ import * as readline from 'readline/promises';
+ /**
+  * Interactive CLI to build test dataset.
+  *
+  * For each query:
+  * 1. Shows all facts with relevance scores
+  * 2. User marks which are actually relevant (ground truth)
+  * 3. Saves to JSON file
+  */
+ async function buildDatasetInteractive(projectId, userId = DEFAULT_USER_ID) {
+     const rl = readline.createInterface({
+         input: process.stdin,
+         output: process.stdout,
+     });
+     const dataset = [];
+     console.log('šŸ“ Interactive Test Dataset Builder\n');
+     console.log('This tool helps you create ground truth judgments for threshold evaluation.\n');
+     console.log('For each query, you\'ll see all matching facts and mark which are truly relevant.\n');
+     let addMore = true;
+     while (addMore) {
+         // Get query
+         const query = await rl.question('\nšŸ” Enter test query (or "done" to finish): ');
+         if (query.toLowerCase() === 'done') {
+             break;
+         }
+         if (!query.trim()) {
+             console.log('āš ļø Query cannot be empty');
+             continue;
+         }
+         // Fetch all facts (no relevance filter)
+         console.log('\nā³ Fetching facts...\n');
+         const result = await getMemoryIndex(projectId, {
+             query,
+             limit: 50,
+             min_relevance: 0, // Get everything
+         }, userId);
+         if (result.index.length === 0) {
+             console.log('āŒ No facts found for this query. Try a different query.\n');
+             continue;
+         }
+         // Display facts
+         console.log(`\nšŸ“Š Found ${result.index.length} facts:\n`);
+         console.log('─'.repeat(80));
+         result.index.forEach((entry, i) => {
+             const relevanceBar = 'ā–ˆ'.repeat(Math.round(entry.relevance * 20));
+             console.log(`\n[${i + 1}] ID: ${entry.id.slice(0, 12)}...`);
+             console.log(` Relevance: ${(entry.relevance * 100).toFixed(0)}% ${relevanceBar}`);
+             console.log(` Category: ${entry.category}`);
+             console.log(` Summary: ${entry.summary}`);
+         });
+         console.log('\n' + '─'.repeat(80));
+         // Get ground truth judgments
+         const relevantIndices = await rl.question('\nāœ… Which facts are TRULY RELEVANT? (comma-separated numbers, e.g., "1,3,5"): ');
+         if (!relevantIndices.trim()) {
+             console.log('āš ļø No facts marked as relevant. Skipping this query.');
+             continue;
+         }
+         // Parse indices
+         const indices = relevantIndices
+             .split(',')
+             .map(s => parseInt(s.trim()) - 1)
+             .filter(i => i >= 0 && i < result.index.length);
+         if (indices.length === 0) {
+             console.log('āš ļø Invalid indices. Skipping this query.');
+             continue;
+         }
+         const relevantFactIds = indices.map(i => result.index[i].id);
+         console.log(`\nāœ“ Marked ${relevantFactIds.length} facts as relevant`);
+         // Get metadata
+         const category = await rl.question('šŸ“‚ Category (architecture/pattern/decision/preference/fact): ');
+         const description = await rl.question('šŸ“ Description (what is this query testing?): ');
+         // Add to dataset
+         dataset.push({
+             query,
+             relevantFactIds,
+             category: category.trim() || 'fact',
+             description: description.trim() || query,
+         });
+         console.log(`\nāœ… Added test case ${dataset.length}`);
+     }
+     rl.close();
+     // Save dataset
+     if (dataset.length > 0) {
+         const fs = await import('fs/promises');
+         const outputPath = `/Users/oweninnes/Projects/memory/docs/evaluation/dataset-${Date.now()}.json`;
+         await fs.mkdir('/Users/oweninnes/Projects/memory/docs/evaluation', { recursive: true });
+         await fs.writeFile(outputPath, JSON.stringify(dataset, null, 2));
+         console.log('\n' + '='.repeat(80));
+         console.log(`\nāœ… Dataset saved to: ${outputPath}`);
+         console.log(`\nšŸ“Š Total test cases: ${dataset.length}`);
+         console.log(`šŸ“‹ Total ground truth facts: ${dataset.reduce((sum, d) => sum + d.relevantFactIds.length, 0)}`);
+         // Show code to use in threshold-eval.ts
+         console.log('\nšŸ“ Copy this into threshold-eval.ts buildTestDataset():\n');
+         console.log('```typescript');
+         console.log('export function buildTestDataset(): QueryJudgment[] {');
+         console.log(' return ' + JSON.stringify(dataset, null, 2) + ';');
+         console.log('}');
+         console.log('```\n');
+     }
+     else {
+         console.log('\nāš ļø No test cases created.');
+     }
+ }
+ /**
+  * CLI entry point.
+  */
+ async function main() {
+     const projectId = process.argv[2];
+     if (!projectId) {
+         console.error('Usage: tsx src/evaluation/build-dataset.ts <project-id>');
+         process.exit(1);
+     }
+     try {
+         await buildDatasetInteractive(projectId);
+     }
+     catch (error) {
+         console.error('\nāŒ Error:', error);
+         process.exit(1);
+     }
+ }
+ // Run if called directly (ES module check)
+ const isMainModule = process.argv[1] && process.argv[1].endsWith('build-dataset.ts');
+ if (isMainModule) {
+     main().catch(err => {
+         console.error(err);
+         process.exit(1);
+     });
+ }
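After a judging session, the builder prints a snippet to paste into `buildTestDataset()` in threshold-eval.ts. A minimal sketch of what that pasted result might look like, with a hypothetical query and made-up fact IDs (the object shape follows the `dataset.push` call above):

```typescript
// Hypothetical paste produced by one judging session.
// The query text and fact IDs below are illustrative, not real project data.
export function buildTestDataset(): QueryJudgment[] {
    return [
        {
            query: 'authentication patterns',
            relevantFactIds: ['fact-abc123', 'fact-def456'],
            category: 'pattern',
            description: 'How auth is handled across services',
        },
    ];
}
```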
@@ -0,0 +1,63 @@
+ export interface QueryJudgment {
+     query: string;
+     relevantFactIds: string[];
+     category: 'architecture' | 'pattern' | 'decision' | 'preference' | 'fact';
+     description: string;
+ }
+ export interface ThresholdResult {
+     threshold: number;
+     precision: number;
+     recall: number;
+     f1: number;
+     truePositives: number;
+     falsePositives: number;
+     falseNegatives: number;
+     avgRelevanceReturned: number;
+     totalReturned: number;
+ }
+ export interface EvaluationReport {
+     projectId: string;
+     testDate: string;
+     results: ThresholdResult[];
+     bestThreshold: {
+         byF1: number;
+         byPrecision: number;
+         byRecall: number;
+     };
+     dataset: QueryJudgment[];
+ }
+ /**
+  * Build test dataset with query-result relevance judgments.
+  *
+  * To use this framework:
+  * 1. Add real queries from your project
+  * 2. Run memory_index with no filter to see all available facts
+  * 3. Manually judge which fact IDs are relevant to each query
+  * 4. Add judgments to this dataset
+  *
+  * Example workflow:
+  * - Query: "authentication patterns"
+  * - Run: memory_index(query="authentication patterns", min_relevance=0)
+  * - Review returned facts, note IDs of truly relevant ones
+  * - Add to dataset with those fact IDs as ground truth
+  */
+ export declare function buildTestDataset(): QueryJudgment[];
+ /**
+  * Evaluate a single threshold value against the test dataset.
+  */
+ export declare function evaluateThreshold(projectId: string, threshold: number, dataset: QueryJudgment[], userId?: string): Promise<ThresholdResult>;
+ /**
+  * Run full evaluation across multiple threshold values.
+  */
+ export declare function runThresholdEvaluation(projectId: string, thresholds?: number[], userId?: string): Promise<EvaluationReport>;
+ /**
+  * Format evaluation report as markdown.
+  */
+ export declare function formatReport(report: EvaluationReport): string;
+ /**
+  * Run evaluation from command line.
+  *
+  * Usage:
+  *   tsx src/evaluation/threshold-eval.ts <project-id>
+  */
+ export declare function main(): Promise<void>;
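The declarations above also permit programmatic use. A sketch of consuming this API, assuming an ESM context with top-level await; the project ID is hypothetical and the import path mirrors the package's own relative-import style:

```typescript
// Sketch only: 'my-project' and the import path are assumptions.
import { runThresholdEvaluation, formatReport } from './threshold-eval.js';

const report = await runThresholdEvaluation('my-project', [0.2, 0.3, 0.4, 0.5]);
console.log(`Best threshold by F1: ${report.bestThreshold.byF1}`);
console.log(formatReport(report)); // markdown summary table plus recommendations
```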
@@ -0,0 +1,250 @@
+ // =============================================================================
+ // Threshold Evaluation Framework
+ // =============================================================================
+ // Empirical evaluation of relevance threshold values
+ // Tests 0.2, 0.3, 0.4, 0.5 to find optimal balance of precision vs recall
+ // =============================================================================
+ import { getMemoryIndex } from '../helpers/api-client.js';
+ import { DEFAULT_USER_ID } from '../helpers/config.js';
+ // -----------------------------------------------------------------------------
+ // Test Dataset
+ // -----------------------------------------------------------------------------
+ /**
+  * Build test dataset with query-result relevance judgments.
+  *
+  * To use this framework:
+  * 1. Add real queries from your project
+  * 2. Run memory_index with no filter to see all available facts
+  * 3. Manually judge which fact IDs are relevant to each query
+  * 4. Add judgments to this dataset
+  *
+  * Example workflow:
+  * - Query: "authentication patterns"
+  * - Run: memory_index(query="authentication patterns", min_relevance=0)
+  * - Review returned facts, note IDs of truly relevant ones
+  * - Add to dataset with those fact IDs as ground truth
+  */
+ export function buildTestDataset() {
+     return [
+         // Example 1: Architecture query
+         {
+             query: 'context injection system',
+             relevantFactIds: [
+                 // TODO: Fill with actual fact IDs from your project
+                 // These should be manually verified as relevant to this query
+             ],
+             category: 'architecture',
+             description: 'How context injection works in the system',
+         },
+         // Example 2: Pattern query
+         {
+             query: 'task management workflow',
+             relevantFactIds: [
+                 // TODO: Fill with actual fact IDs
+             ],
+             category: 'pattern',
+             description: 'Patterns for task creation and execution',
+         },
+         // Example 3: Decision query
+         {
+             query: 'why use BM25 instead of TF-IDF',
+             relevantFactIds: [
+                 // TODO: Fill with actual fact IDs
+             ],
+             category: 'decision',
+             description: 'Architectural decision rationale',
+         },
+         // TODO: Add more test cases
+         // Aim for 15-20 diverse queries covering different:
+         // - Query types (architecture, pattern, decision, preference, fact)
+         // - Query lengths (short vs long)
+         // - Query specificity (broad vs narrow)
+         // - Expected result counts (1-2 facts vs 10+ facts)
+     ];
+ }
+ // -----------------------------------------------------------------------------
+ // Evaluation Logic
+ // -----------------------------------------------------------------------------
+ /**
+  * Evaluate a single threshold value against the test dataset.
+  */
+ export async function evaluateThreshold(projectId, threshold, dataset, userId = DEFAULT_USER_ID) {
+     let totalTP = 0;
+     let totalFP = 0;
+     let totalFN = 0;
+     let totalRelevanceScore = 0;
+     let totalReturned = 0;
+     for (const judgment of dataset) {
+         try {
+             // Query with this threshold
+             const result = await getMemoryIndex(projectId, {
+                 query: judgment.query,
+                 limit: 50,
+                 min_relevance: threshold,
+             }, userId);
+             const returnedIds = new Set(result.index.map(entry => entry.id));
+             const relevantIds = new Set(judgment.relevantFactIds);
+             // Calculate metrics for this query
+             const tp = Array.from(returnedIds).filter(id => relevantIds.has(id)).length;
+             const fp = Array.from(returnedIds).filter(id => !relevantIds.has(id)).length;
+             const fn = Array.from(relevantIds).filter(id => !returnedIds.has(id)).length;
+             totalTP += tp;
+             totalFP += fp;
+             totalFN += fn;
+             // Track average relevance of returned results
+             const relevanceScores = result.index.map(e => e.relevance);
+             totalRelevanceScore += relevanceScores.reduce((sum, r) => sum + r, 0);
+             totalReturned += result.index.length;
+         }
+         catch (error) {
+             console.error(`Error evaluating query "${judgment.query}":`, error);
+         }
+     }
+     // Calculate aggregate metrics
+     const precision = totalTP + totalFP > 0 ? totalTP / (totalTP + totalFP) : 0;
+     const recall = totalTP + totalFN > 0 ? totalTP / (totalTP + totalFN) : 0;
+     const f1 = precision + recall > 0 ? (2 * precision * recall) / (precision + recall) : 0;
+     const avgRelevance = totalReturned > 0 ? totalRelevanceScore / totalReturned : 0;
+     return {
+         threshold,
+         precision,
+         recall,
+         f1,
+         truePositives: totalTP,
+         falsePositives: totalFP,
+         falseNegatives: totalFN,
+         avgRelevanceReturned: avgRelevance,
+         totalReturned,
+     };
+ }
+ /**
+  * Run full evaluation across multiple threshold values.
+  */
+ export async function runThresholdEvaluation(projectId, thresholds = [0.2, 0.3, 0.4, 0.5], userId = DEFAULT_USER_ID) {
+     const dataset = buildTestDataset();
+     if (dataset.length === 0 || dataset.every(j => j.relevantFactIds.length === 0)) {
+         throw new Error('Test dataset is empty or has no ground truth judgments. ' +
+             'Please fill buildTestDataset() with real queries and fact IDs.');
+     }
+     console.log(`\nšŸ“Š Evaluating ${thresholds.length} thresholds on ${dataset.length} queries...\n`);
+     const results = [];
+     for (const threshold of thresholds) {
+         console.log(`Testing threshold ${threshold}...`);
+         const result = await evaluateThreshold(projectId, threshold, dataset, userId);
+         results.push(result);
+         console.log(` P=${result.precision.toFixed(3)} R=${result.recall.toFixed(3)} F1=${result.f1.toFixed(3)}`);
+     }
+     // Find best thresholds by different metrics
+     const bestF1 = results.reduce((best, curr) => curr.f1 > best.f1 ? curr : best);
+     const bestPrecision = results.reduce((best, curr) => curr.precision > best.precision ? curr : best);
+     const bestRecall = results.reduce((best, curr) => curr.recall > best.recall ? curr : best);
+     return {
+         projectId,
+         testDate: new Date().toISOString(),
+         results,
+         bestThreshold: {
+             byF1: bestF1.threshold,
+             byPrecision: bestPrecision.threshold,
+             byRecall: bestRecall.threshold,
+         },
+         dataset,
+     };
+ }
+ // -----------------------------------------------------------------------------
+ // Reporting
+ // -----------------------------------------------------------------------------
+ /**
+  * Format evaluation report as markdown.
+  */
+ export function formatReport(report) {
+     let md = `# Relevance Threshold Evaluation Report\n\n`;
+     md += `**Project:** ${report.projectId}\n`;
+     md += `**Date:** ${report.testDate}\n`;
+     md += `**Test Queries:** ${report.dataset.length}\n\n`;
+     md += `## Results Summary\n\n`;
+     md += `| Threshold | Precision | Recall | F1 Score | TP | FP | FN | Avg Relevance | Total Returned |\n`;
+     md += `|-----------|-----------|--------|----------|----|----|----|--------------:|---------------:|\n`;
+     for (const r of report.results) {
+         md += `| ${r.threshold.toFixed(1)} `;
+         md += `| ${(r.precision * 100).toFixed(1)}% `;
+         md += `| ${(r.recall * 100).toFixed(1)}% `;
+         md += `| ${(r.f1 * 100).toFixed(1)}% `;
+         md += `| ${r.truePositives} `;
+         md += `| ${r.falsePositives} `;
+         md += `| ${r.falseNegatives} `;
+         md += `| ${(r.avgRelevanceReturned * 100).toFixed(1)}% `;
+         md += `| ${r.totalReturned} |\n`;
+     }
+     md += `\n## Recommendations\n\n`;
+     md += `- **Best F1 Score:** ${report.bestThreshold.byF1} (balanced precision/recall)\n`;
+     md += `- **Best Precision:** ${report.bestThreshold.byPrecision} (fewer false positives)\n`;
+     md += `- **Best Recall:** ${report.bestThreshold.byRecall} (fewer false negatives)\n\n`;
+     md += `## Analysis\n\n`;
+     const current = report.results.find(r => r.threshold === 0.3);
+     const best = report.results.find(r => r.threshold === report.bestThreshold.byF1);
+     if (current && best && current.threshold !== best.threshold) {
+         const precisionChange = ((best.precision - current.precision) / current.precision) * 100;
+         const recallChange = ((best.recall - current.recall) / current.recall) * 100;
+         const f1Change = ((best.f1 - current.f1) / current.f1) * 100;
+         md += `Current threshold (0.3) vs Best threshold (${best.threshold}):\n`;
+         md += `- Precision: ${precisionChange > 0 ? '+' : ''}${precisionChange.toFixed(1)}%\n`;
+         md += `- Recall: ${recallChange > 0 ? '+' : ''}${recallChange.toFixed(1)}%\n`;
+         md += `- F1 Score: ${f1Change > 0 ? '+' : ''}${f1Change.toFixed(1)}%\n\n`;
+         if (f1Change > 5) {
+             md += `**Recommendation:** Change default threshold from 0.3 to ${best.threshold}\n`;
+         }
+         else {
+             md += `**Recommendation:** Keep current threshold (0.3) - minimal improvement from change\n`;
+         }
+     }
+     else if (current && best && current.threshold === best.threshold) {
+         md += `**Recommendation:** Keep current threshold (0.3) - already optimal\n`;
+     }
+     md += `\n## Test Dataset\n\n`;
+     for (const judgment of report.dataset) {
+         md += `### ${judgment.query}\n`;
+         md += `- **Category:** ${judgment.category}\n`;
+         md += `- **Description:** ${judgment.description}\n`;
+         md += `- **Ground Truth Facts:** ${judgment.relevantFactIds.length}\n\n`;
+     }
+     return md;
+ }
+ // -----------------------------------------------------------------------------
+ // CLI Interface
+ // -----------------------------------------------------------------------------
+ /**
+  * Run evaluation from command line.
+  *
+  * Usage:
+  *   tsx src/evaluation/threshold-eval.ts <project-id>
+  */
+ export async function main() {
+     const projectId = process.argv[2];
+     if (!projectId) {
+         console.error('Usage: tsx src/evaluation/threshold-eval.ts <project-id>');
+         process.exit(1);
+     }
+     try {
+         const report = await runThresholdEvaluation(projectId);
+         const markdown = formatReport(report);
+         console.log('\n' + markdown);
+         // Optionally save to file
+         const fs = await import('fs/promises');
+         const outputPath = `/Users/oweninnes/Projects/memory/docs/evaluation/threshold-eval-${Date.now()}.md`;
+         await fs.mkdir('/Users/oweninnes/Projects/memory/docs/evaluation', { recursive: true });
+         await fs.writeFile(outputPath, markdown);
+         console.log(`\nšŸ“„ Report saved to: ${outputPath}`);
+     }
+     catch (error) {
+         console.error('Evaluation failed:', error);
+         process.exit(1);
+     }
+ }
+ // Run if called directly (ES module check)
+ const isMainModule = process.argv[1] && process.argv[1].endsWith('threshold-eval.ts');
+ if (isMainModule) {
+     main().catch(err => {
+         console.error(err);
+         process.exit(1);
+     });
+ }
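To make the aggregate metrics in `evaluateThreshold` concrete, here is a worked example using the same formulas with made-up counts (illustrative numbers, not results from any real run):

```typescript
// The same formulas used in evaluateThreshold, applied to made-up totals.
const totalTP = 8; // relevant facts the query returned
const totalFP = 2; // returned facts that were not relevant
const totalFN = 4; // relevant facts the query missed
const precision = totalTP / (totalTP + totalFP);            // 8 / 10 = 0.8
const recall = totalTP / (totalTP + totalFN);               // 8 / 12 ā‰ˆ 0.667
const f1 = (2 * precision * recall) / (precision + recall); // ā‰ˆ 0.727
```

Raising the threshold generally trades recall for precision; the F1 column in the generated report shows where that trade-off balances.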
@@ -213,7 +213,7 @@ export declare function handleValidateSpec(args: any): Promise<{
      fields: {
          name: string;
          type: import("../codedna/parser.js").FieldType;
-         constraints: ("default" | "length" | "email" | "min" | "max" | "url" | "required" | "pattern" | "nullable" | "unique" | "hashed" | "index" | "immutable" | "textarea" | "switch" | "radio")[];
+         constraints: ("default" | "length" | "email" | "min" | "max" | "url" | "pattern" | "required" | "nullable" | "unique" | "hashed" | "index" | "immutable" | "textarea" | "switch" | "radio")[];
      }[];
  };
  summary: {