@claudetools/tools 0.8.2 → 0.8.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +41 -0
- package/dist/context/deduplication.d.ts +72 -0
- package/dist/context/deduplication.js +77 -0
- package/dist/context/deduplication.test.d.ts +6 -0
- package/dist/context/deduplication.test.js +84 -0
- package/dist/context/emergency-eviction.d.ts +73 -0
- package/dist/context/emergency-eviction.example.d.ts +13 -0
- package/dist/context/emergency-eviction.example.js +94 -0
- package/dist/context/emergency-eviction.js +226 -0
- package/dist/context/eviction-engine.d.ts +76 -0
- package/dist/context/eviction-engine.example.d.ts +7 -0
- package/dist/context/eviction-engine.example.js +144 -0
- package/dist/context/eviction-engine.js +176 -0
- package/dist/context/example-usage.d.ts +1 -0
- package/dist/context/example-usage.js +128 -0
- package/dist/context/exchange-summariser.d.ts +80 -0
- package/dist/context/exchange-summariser.js +261 -0
- package/dist/context/health-monitor.d.ts +97 -0
- package/dist/context/health-monitor.example.d.ts +1 -0
- package/dist/context/health-monitor.example.js +164 -0
- package/dist/context/health-monitor.js +210 -0
- package/dist/context/importance-scorer.d.ts +94 -0
- package/dist/context/importance-scorer.example.d.ts +1 -0
- package/dist/context/importance-scorer.example.js +140 -0
- package/dist/context/importance-scorer.js +187 -0
- package/dist/context/index.d.ts +9 -0
- package/dist/context/index.js +16 -0
- package/dist/context/session-helper.d.ts +10 -0
- package/dist/context/session-helper.js +51 -0
- package/dist/context/session-store.d.ts +94 -0
- package/dist/context/session-store.js +286 -0
- package/dist/context/usage-estimator.d.ts +131 -0
- package/dist/context/usage-estimator.js +260 -0
- package/dist/context/usage-estimator.test.d.ts +1 -0
- package/dist/context/usage-estimator.test.js +208 -0
- package/dist/context-cli.d.ts +16 -0
- package/dist/context-cli.js +309 -0
- package/dist/evaluation/build-dataset.d.ts +1 -0
- package/dist/evaluation/build-dataset.js +135 -0
- package/dist/evaluation/threshold-eval.d.ts +63 -0
- package/dist/evaluation/threshold-eval.js +250 -0
- package/dist/handlers/codedna-handlers.d.ts +2 -2
- package/dist/handlers/tool-handlers.js +126 -165
- package/dist/helpers/api-client.d.ts +5 -1
- package/dist/helpers/api-client.js +3 -1
- package/dist/helpers/compact-formatter.d.ts +51 -0
- package/dist/helpers/compact-formatter.js +130 -0
- package/dist/helpers/engagement-tracker.d.ts +10 -0
- package/dist/helpers/engagement-tracker.js +61 -0
- package/dist/helpers/error-tracking.js +1 -1
- package/dist/helpers/session-validation.d.ts +76 -0
- package/dist/helpers/session-validation.js +221 -0
- package/dist/helpers/usage-analytics.js +1 -1
- package/dist/hooks/index.d.ts +4 -0
- package/dist/hooks/index.js +6 -0
- package/dist/hooks/post-tool-use-hook-cli.d.ts +2 -0
- package/dist/hooks/post-tool-use-hook-cli.js +34 -0
- package/dist/hooks/post-tool-use.d.ts +67 -0
- package/dist/hooks/post-tool-use.js +234 -0
- package/dist/hooks/stop-hook-cli.d.ts +2 -0
- package/dist/hooks/stop-hook-cli.js +34 -0
- package/dist/hooks/stop.d.ts +64 -0
- package/dist/hooks/stop.js +192 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.js +2 -0
- package/dist/logger.d.ts +1 -1
- package/dist/logger.js +4 -0
- package/dist/resources.js +3 -0
- package/dist/setup.js +206 -2
- package/dist/templates/claude-md.d.ts +1 -1
- package/dist/templates/claude-md.js +23 -35
- package/dist/templates/worker-prompt.js +35 -202
- package/dist/tools.js +26 -20
- package/package.json +6 -2
|
@@ -0,0 +1,250 @@
|
|
|
1
|
+
// =============================================================================
|
|
2
|
+
// Threshold Evaluation Framework
|
|
3
|
+
// =============================================================================
|
|
4
|
+
// Empirical evaluation of relevance threshold values
|
|
5
|
+
// Tests 0.2, 0.3, 0.4, 0.5 to find optimal balance of precision vs recall
|
|
6
|
+
// =============================================================================
|
|
7
|
+
import { getMemoryIndex } from '../helpers/api-client.js';
|
|
8
|
+
import { DEFAULT_USER_ID } from '../helpers/config.js';
|
|
9
|
+
// -----------------------------------------------------------------------------
|
|
10
|
+
// Test Dataset
|
|
11
|
+
// -----------------------------------------------------------------------------
|
|
12
|
+
/**
|
|
13
|
+
* Build test dataset with query-result relevance judgments.
|
|
14
|
+
*
|
|
15
|
+
* To use this framework:
|
|
16
|
+
* 1. Add real queries from your project
|
|
17
|
+
* 2. Run memory_index with no filter to see all available facts
|
|
18
|
+
* 3. Manually judge which fact IDs are relevant to each query
|
|
19
|
+
* 4. Add judgments to this dataset
|
|
20
|
+
*
|
|
21
|
+
* Example workflow:
|
|
22
|
+
* - Query: "authentication patterns"
|
|
23
|
+
* - Run: memory_index(query="authentication patterns", min_relevance=0)
|
|
24
|
+
* - Review returned facts, note IDs of truly relevant ones
|
|
25
|
+
* - Add to dataset with those fact IDs as ground truth
|
|
26
|
+
*/
|
|
27
|
+
export function buildTestDataset() {
|
|
28
|
+
return [
|
|
29
|
+
// Example 1: Architecture query
|
|
30
|
+
{
|
|
31
|
+
query: 'context injection system',
|
|
32
|
+
relevantFactIds: [
|
|
33
|
+
// TODO: Fill with actual fact IDs from your project
|
|
34
|
+
// These should be manually verified as relevant to this query
|
|
35
|
+
],
|
|
36
|
+
category: 'architecture',
|
|
37
|
+
description: 'How context injection works in the system',
|
|
38
|
+
},
|
|
39
|
+
// Example 2: Pattern query
|
|
40
|
+
{
|
|
41
|
+
query: 'task management workflow',
|
|
42
|
+
relevantFactIds: [
|
|
43
|
+
// TODO: Fill with actual fact IDs
|
|
44
|
+
],
|
|
45
|
+
category: 'pattern',
|
|
46
|
+
description: 'Patterns for task creation and execution',
|
|
47
|
+
},
|
|
48
|
+
// Example 3: Decision query
|
|
49
|
+
{
|
|
50
|
+
query: 'why use BM25 instead of TF-IDF',
|
|
51
|
+
relevantFactIds: [
|
|
52
|
+
// TODO: Fill with actual fact IDs
|
|
53
|
+
],
|
|
54
|
+
category: 'decision',
|
|
55
|
+
description: 'Architectural decision rationale',
|
|
56
|
+
},
|
|
57
|
+
// TODO: Add more test cases
|
|
58
|
+
// Aim for 15-20 diverse queries covering different:
|
|
59
|
+
// - Query types (architecture, pattern, decision, preference, fact)
|
|
60
|
+
// - Query lengths (short vs long)
|
|
61
|
+
// - Query specificity (broad vs narrow)
|
|
62
|
+
// - Expected result counts (1-2 facts vs 10+ facts)
|
|
63
|
+
];
|
|
64
|
+
}
|
|
65
|
+
// -----------------------------------------------------------------------------
|
|
66
|
+
// Evaluation Logic
|
|
67
|
+
// -----------------------------------------------------------------------------
|
|
68
|
+
/**
|
|
69
|
+
* Evaluate a single threshold value against the test dataset.
|
|
70
|
+
*/
|
|
71
|
+
export async function evaluateThreshold(projectId, threshold, dataset, userId = DEFAULT_USER_ID) {
|
|
72
|
+
let totalTP = 0;
|
|
73
|
+
let totalFP = 0;
|
|
74
|
+
let totalFN = 0;
|
|
75
|
+
let totalRelevanceScore = 0;
|
|
76
|
+
let totalReturned = 0;
|
|
77
|
+
for (const judgment of dataset) {
|
|
78
|
+
try {
|
|
79
|
+
// Query with this threshold
|
|
80
|
+
const result = await getMemoryIndex(projectId, {
|
|
81
|
+
query: judgment.query,
|
|
82
|
+
limit: 50,
|
|
83
|
+
min_relevance: threshold,
|
|
84
|
+
}, userId);
|
|
85
|
+
const returnedIds = new Set(result.index.map(entry => entry.id));
|
|
86
|
+
const relevantIds = new Set(judgment.relevantFactIds);
|
|
87
|
+
// Calculate metrics for this query
|
|
88
|
+
const tp = Array.from(returnedIds).filter(id => relevantIds.has(id)).length;
|
|
89
|
+
const fp = Array.from(returnedIds).filter(id => !relevantIds.has(id)).length;
|
|
90
|
+
const fn = Array.from(relevantIds).filter(id => !returnedIds.has(id)).length;
|
|
91
|
+
totalTP += tp;
|
|
92
|
+
totalFP += fp;
|
|
93
|
+
totalFN += fn;
|
|
94
|
+
// Track average relevance of returned results
|
|
95
|
+
const relevanceScores = result.index.map(e => e.relevance);
|
|
96
|
+
totalRelevanceScore += relevanceScores.reduce((sum, r) => sum + r, 0);
|
|
97
|
+
totalReturned += result.index.length;
|
|
98
|
+
}
|
|
99
|
+
catch (error) {
|
|
100
|
+
console.error(`Error evaluating query "${judgment.query}":`, error);
|
|
101
|
+
}
|
|
102
|
+
}
|
|
103
|
+
// Calculate aggregate metrics
|
|
104
|
+
const precision = totalTP + totalFP > 0 ? totalTP / (totalTP + totalFP) : 0;
|
|
105
|
+
const recall = totalTP + totalFN > 0 ? totalTP / (totalTP + totalFN) : 0;
|
|
106
|
+
const f1 = precision + recall > 0 ? (2 * precision * recall) / (precision + recall) : 0;
|
|
107
|
+
const avgRelevance = totalReturned > 0 ? totalRelevanceScore / totalReturned : 0;
|
|
108
|
+
return {
|
|
109
|
+
threshold,
|
|
110
|
+
precision,
|
|
111
|
+
recall,
|
|
112
|
+
f1,
|
|
113
|
+
truePositives: totalTP,
|
|
114
|
+
falsePositives: totalFP,
|
|
115
|
+
falseNegatives: totalFN,
|
|
116
|
+
avgRelevanceReturned: avgRelevance,
|
|
117
|
+
totalReturned,
|
|
118
|
+
};
|
|
119
|
+
}
|
|
120
|
+
/**
|
|
121
|
+
* Run full evaluation across multiple threshold values.
|
|
122
|
+
*/
|
|
123
|
+
export async function runThresholdEvaluation(projectId, thresholds = [0.2, 0.3, 0.4, 0.5], userId = DEFAULT_USER_ID) {
|
|
124
|
+
const dataset = buildTestDataset();
|
|
125
|
+
if (dataset.length === 0 || dataset.every(j => j.relevantFactIds.length === 0)) {
|
|
126
|
+
throw new Error('Test dataset is empty or has no ground truth judgments. ' +
|
|
127
|
+
'Please fill buildTestDataset() with real queries and fact IDs.');
|
|
128
|
+
}
|
|
129
|
+
console.log(`\nš Evaluating ${thresholds.length} thresholds on ${dataset.length} queries...\n`);
|
|
130
|
+
const results = [];
|
|
131
|
+
for (const threshold of thresholds) {
|
|
132
|
+
console.log(`Testing threshold ${threshold}...`);
|
|
133
|
+
const result = await evaluateThreshold(projectId, threshold, dataset, userId);
|
|
134
|
+
results.push(result);
|
|
135
|
+
console.log(` P=${result.precision.toFixed(3)} R=${result.recall.toFixed(3)} F1=${result.f1.toFixed(3)}`);
|
|
136
|
+
}
|
|
137
|
+
// Find best thresholds by different metrics
|
|
138
|
+
const bestF1 = results.reduce((best, curr) => curr.f1 > best.f1 ? curr : best);
|
|
139
|
+
const bestPrecision = results.reduce((best, curr) => curr.precision > best.precision ? curr : best);
|
|
140
|
+
const bestRecall = results.reduce((best, curr) => curr.recall > best.recall ? curr : best);
|
|
141
|
+
return {
|
|
142
|
+
projectId,
|
|
143
|
+
testDate: new Date().toISOString(),
|
|
144
|
+
results,
|
|
145
|
+
bestThreshold: {
|
|
146
|
+
byF1: bestF1.threshold,
|
|
147
|
+
byPrecision: bestPrecision.threshold,
|
|
148
|
+
byRecall: bestRecall.threshold,
|
|
149
|
+
},
|
|
150
|
+
dataset,
|
|
151
|
+
};
|
|
152
|
+
}
|
|
153
|
+
// -----------------------------------------------------------------------------
|
|
154
|
+
// Reporting
|
|
155
|
+
// -----------------------------------------------------------------------------
|
|
156
|
+
/**
 * Render an evaluation report as a Markdown document.
 *
 * Sections, in order: header (project, date, query count), a results table
 * with one row per evaluated threshold, the best threshold per metric, an
 * analysis comparing the current threshold with the best-F1 threshold, and a
 * listing of the test dataset.
 *
 * The analysis is only written when both the current threshold and the
 * best-F1 threshold appear in `report.results`. Relative changes are shown as
 * percentages of the current threshold's metric; when that metric is 0 the
 * ratio is undefined and the line reads "n/a (baseline is 0)" instead.
 *
 * @param report - Object with `projectId`, `testDate`, `results`,
 *   `bestThreshold` (`byF1`, `byPrecision`, `byRecall`) and `dataset`.
 * @param currentThreshold - Threshold treated as the current default in the
 *   analysis section. Defaults to 0.3.
 * @returns The report as Markdown text.
 */
export function formatReport(report, currentThreshold = 0.3) {
    // One decimal is the table's usual format, but it must not misreport values
    // such as 0.25; show the full value whenever rounding would change it.
    const thresholdLabel = (value) => {
        const rounded = value.toFixed(1);
        return Number(rounded) === value ? rounded : String(value);
    };
    const percent = (fraction) => `${(fraction * 100).toFixed(1)}%`;
    // Relative change in percent, or null when the baseline is 0.
    const relativeChange = (baseline, candidate) => baseline === 0 ? null : ((candidate - baseline) / baseline) * 100;
    const signedPercent = (change) => change === null
        ? 'n/a (baseline is 0)'
        : `${change > 0 ? '+' : ''}${change.toFixed(1)}%`;

    let md = `# Relevance Threshold Evaluation Report\n\n`;
    md += `**Project:** ${report.projectId}\n`;
    md += `**Date:** ${report.testDate}\n`;
    md += `**Test Queries:** ${report.dataset.length}\n\n`;

    md += `## Results Summary\n\n`;
    md += `| Threshold | Precision | Recall | F1 Score | TP | FP | FN | Avg Relevance | Total Returned |\n`;
    md += `|-----------|-----------|--------|----------|----|----|----|--------------:|---------------:|\n`;
    for (const r of report.results) {
        md += `| ${thresholdLabel(r.threshold)} `;
        md += `| ${percent(r.precision)} `;
        md += `| ${percent(r.recall)} `;
        md += `| ${percent(r.f1)} `;
        md += `| ${r.truePositives} `;
        md += `| ${r.falsePositives} `;
        md += `| ${r.falseNegatives} `;
        md += `| ${percent(r.avgRelevanceReturned)} `;
        md += `| ${r.totalReturned} |\n`;
    }

    md += `\n## Recommendations\n\n`;
    md += `- **Best F1 Score:** ${report.bestThreshold.byF1} (balanced precision/recall)\n`;
    md += `- **Best Precision:** ${report.bestThreshold.byPrecision} (fewer false positives)\n`;
    md += `- **Best Recall:** ${report.bestThreshold.byRecall} (fewer false negatives)\n\n`;

    md += `## Analysis\n\n`;
    const current = report.results.find(r => r.threshold === currentThreshold);
    const best = report.results.find(r => r.threshold === report.bestThreshold.byF1);
    if (current && best) {
        if (current.threshold === best.threshold) {
            md += `**Recommendation:** Keep current threshold (${currentThreshold}) - already optimal\n`;
        }
        else {
            const f1Change = relativeChange(current.f1, best.f1);
            md += `Current threshold (${currentThreshold}) vs Best threshold (${best.threshold}):\n`;
            md += `- Precision: ${signedPercent(relativeChange(current.precision, best.precision))}\n`;
            md += `- Recall: ${signedPercent(relativeChange(current.recall, best.recall))}\n`;
            md += `- F1 Score: ${signedPercent(f1Change)}\n\n`;
            // A change is worthwhile when F1 improves by more than 5% relative to
            // the current value; from a zero baseline, any positive F1 qualifies.
            const worthChanging = f1Change === null ? best.f1 > current.f1 : f1Change > 5;
            if (worthChanging) {
                md += `**Recommendation:** Change default threshold from ${currentThreshold} to ${best.threshold}\n`;
            }
            else {
                md += `**Recommendation:** Keep current threshold (${currentThreshold}) - minimal improvement from change\n`;
            }
        }
    }

    md += `\n## Test Dataset\n\n`;
    for (const judgment of report.dataset) {
        md += `### ${judgment.query}\n`;
        md += `- **Category:** ${judgment.category}\n`;
        md += `- **Description:** ${judgment.description}\n`;
        md += `- **Ground Truth Facts:** ${judgment.relevantFactIds.length}\n\n`;
    }
    return md;
}
|
|
212
|
+
// -----------------------------------------------------------------------------
|
|
213
|
+
// CLI Interface
|
|
214
|
+
// -----------------------------------------------------------------------------
|
|
215
|
+
/**
 * Command-line entry point: run the threshold evaluation for one project,
 * print the Markdown report, and save it to a timestamped file.
 *
 * The report is written to `<output-dir>/threshold-eval-<timestamp>.md`.
 * `output-dir` is the optional second argument and defaults to
 * `docs/evaluation` under the current working directory; the directory is
 * created if it does not exist.
 *
 * Exits the process with status 1 when the project ID is missing or when the
 * evaluation or the file write fails.
 *
 * Usage:
 *   tsx src/evaluation/threshold-eval.ts <project-id> [output-dir]
 */
export async function main() {
    const projectId = process.argv[2];
    if (!projectId) {
        console.error('Usage: tsx src/evaluation/threshold-eval.ts <project-id> [output-dir]');
        process.exit(1);
    }
    try {
        const report = await runThresholdEvaluation(projectId);
        const markdown = formatReport(report);
        console.log('\n' + markdown);
        // Persist the report. The location is resolved at run time rather than
        // fixed to one machine's absolute path, so the CLI works on any host.
        const fs = await import('fs/promises');
        const path = await import('path');
        const outputDir = path.resolve(process.argv[3] || path.join('docs', 'evaluation'));
        const outputPath = path.join(outputDir, `threshold-eval-${Date.now()}.md`);
        await fs.mkdir(outputDir, { recursive: true });
        await fs.writeFile(outputPath, markdown);
        console.log(`\nš Report saved to: ${outputPath}`);
    }
    catch (error) {
        console.error('Evaluation failed:', error);
        process.exit(1);
    }
}
|
|
243
|
+
// Run the CLI only when this file is the script Node was started with.
// Both the TypeScript source (run through tsx) and the compiled JavaScript
// file are accepted; a `.ts`-only check would never fire for the compiled file.
const entryScript = process.argv[1];
const isMainModule = Boolean(entryScript) &&
    ['threshold-eval.ts', 'threshold-eval.js'].some(suffix => entryScript.endsWith(suffix));
if (isMainModule) {
    // main() handles its own failures; this catch is a last-resort safety net.
    main().catch(err => {
        console.error(err);
        process.exit(1);
    });
}
|
|
@@ -213,7 +213,7 @@ export declare function handleValidateSpec(args: any): Promise<{
|
|
|
213
213
|
fields: {
|
|
214
214
|
name: string;
|
|
215
215
|
type: import("../codedna/parser.js").FieldType;
|
|
216
|
-
constraints: ("default" | "length" | "email" | "min" | "max" | "url" | "
|
|
216
|
+
constraints: ("default" | "length" | "email" | "min" | "max" | "url" | "pattern" | "required" | "nullable" | "unique" | "hashed" | "index" | "immutable" | "textarea" | "switch" | "radio")[];
|
|
217
217
|
}[];
|
|
218
218
|
};
|
|
219
219
|
summary: {
|
|
@@ -369,7 +369,7 @@ export declare function handleInitProject(args: {
|
|
|
369
369
|
patterns: {
|
|
370
370
|
pattern_id: string;
|
|
371
371
|
name: string;
|
|
372
|
-
category: "hooks" | "
|
|
372
|
+
category: "hooks" | "state" | "components" | "forms" | "validation" | "styling" | "anti-patterns";
|
|
373
373
|
description: string;
|
|
374
374
|
}[];
|
|
375
375
|
summary: {
|