@claudetools/tools 0.8.2 → 0.8.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. package/dist/cli.js +41 -0
  2. package/dist/context/deduplication.d.ts +72 -0
  3. package/dist/context/deduplication.js +77 -0
  4. package/dist/context/deduplication.test.d.ts +6 -0
  5. package/dist/context/deduplication.test.js +84 -0
  6. package/dist/context/emergency-eviction.d.ts +73 -0
  7. package/dist/context/emergency-eviction.example.d.ts +13 -0
  8. package/dist/context/emergency-eviction.example.js +94 -0
  9. package/dist/context/emergency-eviction.js +226 -0
  10. package/dist/context/eviction-engine.d.ts +76 -0
  11. package/dist/context/eviction-engine.example.d.ts +7 -0
  12. package/dist/context/eviction-engine.example.js +144 -0
  13. package/dist/context/eviction-engine.js +176 -0
  14. package/dist/context/example-usage.d.ts +1 -0
  15. package/dist/context/example-usage.js +128 -0
  16. package/dist/context/exchange-summariser.d.ts +80 -0
  17. package/dist/context/exchange-summariser.js +261 -0
  18. package/dist/context/health-monitor.d.ts +97 -0
  19. package/dist/context/health-monitor.example.d.ts +1 -0
  20. package/dist/context/health-monitor.example.js +164 -0
  21. package/dist/context/health-monitor.js +210 -0
  22. package/dist/context/importance-scorer.d.ts +94 -0
  23. package/dist/context/importance-scorer.example.d.ts +1 -0
  24. package/dist/context/importance-scorer.example.js +140 -0
  25. package/dist/context/importance-scorer.js +187 -0
  26. package/dist/context/index.d.ts +9 -0
  27. package/dist/context/index.js +16 -0
  28. package/dist/context/session-helper.d.ts +10 -0
  29. package/dist/context/session-helper.js +51 -0
  30. package/dist/context/session-store.d.ts +94 -0
  31. package/dist/context/session-store.js +286 -0
  32. package/dist/context/usage-estimator.d.ts +131 -0
  33. package/dist/context/usage-estimator.js +260 -0
  34. package/dist/context/usage-estimator.test.d.ts +1 -0
  35. package/dist/context/usage-estimator.test.js +208 -0
  36. package/dist/context-cli.d.ts +16 -0
  37. package/dist/context-cli.js +309 -0
  38. package/dist/evaluation/build-dataset.d.ts +1 -0
  39. package/dist/evaluation/build-dataset.js +135 -0
  40. package/dist/evaluation/threshold-eval.d.ts +63 -0
  41. package/dist/evaluation/threshold-eval.js +250 -0
  42. package/dist/handlers/codedna-handlers.d.ts +2 -2
  43. package/dist/handlers/tool-handlers.js +126 -165
  44. package/dist/helpers/api-client.d.ts +5 -1
  45. package/dist/helpers/api-client.js +3 -1
  46. package/dist/helpers/compact-formatter.d.ts +51 -0
  47. package/dist/helpers/compact-formatter.js +130 -0
  48. package/dist/helpers/engagement-tracker.d.ts +10 -0
  49. package/dist/helpers/engagement-tracker.js +61 -0
  50. package/dist/helpers/error-tracking.js +1 -1
  51. package/dist/helpers/session-validation.d.ts +76 -0
  52. package/dist/helpers/session-validation.js +221 -0
  53. package/dist/helpers/usage-analytics.js +1 -1
  54. package/dist/hooks/index.d.ts +4 -0
  55. package/dist/hooks/index.js +6 -0
  56. package/dist/hooks/post-tool-use-hook-cli.d.ts +2 -0
  57. package/dist/hooks/post-tool-use-hook-cli.js +34 -0
  58. package/dist/hooks/post-tool-use.d.ts +67 -0
  59. package/dist/hooks/post-tool-use.js +234 -0
  60. package/dist/hooks/stop-hook-cli.d.ts +2 -0
  61. package/dist/hooks/stop-hook-cli.js +34 -0
  62. package/dist/hooks/stop.d.ts +64 -0
  63. package/dist/hooks/stop.js +192 -0
  64. package/dist/index.d.ts +3 -0
  65. package/dist/index.js +2 -0
  66. package/dist/logger.d.ts +1 -1
  67. package/dist/logger.js +4 -0
  68. package/dist/resources.js +3 -0
  69. package/dist/setup.js +206 -2
  70. package/dist/templates/claude-md.d.ts +1 -1
  71. package/dist/templates/claude-md.js +23 -35
  72. package/dist/templates/worker-prompt.js +35 -202
  73. package/dist/tools.js +26 -20
  74. package/package.json +6 -2
@@ -0,0 +1,250 @@
1
+ // =============================================================================
2
+ // Threshold Evaluation Framework
3
+ // =============================================================================
4
+ // Empirical evaluation of relevance threshold values
5
+ // Tests 0.2, 0.3, 0.4, 0.5 to find optimal balance of precision vs recall
6
+ // =============================================================================
7
+ import { getMemoryIndex } from '../helpers/api-client.js';
8
+ import { DEFAULT_USER_ID } from '../helpers/config.js';
9
+ // -----------------------------------------------------------------------------
10
+ // Test Dataset
11
+ // -----------------------------------------------------------------------------
12
+ /**
13
+ * Build test dataset with query-result relevance judgments.
14
+ *
15
+ * To use this framework:
16
+ * 1. Add real queries from your project
17
+ * 2. Run memory_index with no filter to see all available facts
18
+ * 3. Manually judge which fact IDs are relevant to each query
19
+ * 4. Add judgments to this dataset
20
+ *
21
+ * Example workflow:
22
+ * - Query: "authentication patterns"
23
+ * - Run: memory_index(query="authentication patterns", min_relevance=0)
24
+ * - Review returned facts, note IDs of truly relevant ones
25
+ * - Add to dataset with those fact IDs as ground truth
26
+ */
27
+ export function buildTestDataset() {
28
+ return [
29
+ // Example 1: Architecture query
30
+ {
31
+ query: 'context injection system',
32
+ relevantFactIds: [
33
+ // TODO: Fill with actual fact IDs from your project
34
+ // These should be manually verified as relevant to this query
35
+ ],
36
+ category: 'architecture',
37
+ description: 'How context injection works in the system',
38
+ },
39
+ // Example 2: Pattern query
40
+ {
41
+ query: 'task management workflow',
42
+ relevantFactIds: [
43
+ // TODO: Fill with actual fact IDs
44
+ ],
45
+ category: 'pattern',
46
+ description: 'Patterns for task creation and execution',
47
+ },
48
+ // Example 3: Decision query
49
+ {
50
+ query: 'why use BM25 instead of TF-IDF',
51
+ relevantFactIds: [
52
+ // TODO: Fill with actual fact IDs
53
+ ],
54
+ category: 'decision',
55
+ description: 'Architectural decision rationale',
56
+ },
57
+ // TODO: Add more test cases
58
+ // Aim for 15-20 diverse queries covering different:
59
+ // - Query types (architecture, pattern, decision, preference, fact)
60
+ // - Query lengths (short vs long)
61
+ // - Query specificity (broad vs narrow)
62
+ // - Expected result counts (1-2 facts vs 10+ facts)
63
+ ];
64
+ }
65
+ // -----------------------------------------------------------------------------
66
+ // Evaluation Logic
67
+ // -----------------------------------------------------------------------------
68
+ /**
69
+ * Evaluate a single threshold value against the test dataset.
70
+ */
71
+ export async function evaluateThreshold(projectId, threshold, dataset, userId = DEFAULT_USER_ID) {
72
+ let totalTP = 0;
73
+ let totalFP = 0;
74
+ let totalFN = 0;
75
+ let totalRelevanceScore = 0;
76
+ let totalReturned = 0;
77
+ for (const judgment of dataset) {
78
+ try {
79
+ // Query with this threshold
80
+ const result = await getMemoryIndex(projectId, {
81
+ query: judgment.query,
82
+ limit: 50,
83
+ min_relevance: threshold,
84
+ }, userId);
85
+ const returnedIds = new Set(result.index.map(entry => entry.id));
86
+ const relevantIds = new Set(judgment.relevantFactIds);
87
+ // Calculate metrics for this query
88
+ const tp = Array.from(returnedIds).filter(id => relevantIds.has(id)).length;
89
+ const fp = Array.from(returnedIds).filter(id => !relevantIds.has(id)).length;
90
+ const fn = Array.from(relevantIds).filter(id => !returnedIds.has(id)).length;
91
+ totalTP += tp;
92
+ totalFP += fp;
93
+ totalFN += fn;
94
+ // Track average relevance of returned results
95
+ const relevanceScores = result.index.map(e => e.relevance);
96
+ totalRelevanceScore += relevanceScores.reduce((sum, r) => sum + r, 0);
97
+ totalReturned += result.index.length;
98
+ }
99
+ catch (error) {
100
+ console.error(`Error evaluating query "${judgment.query}":`, error);
101
+ }
102
+ }
103
+ // Calculate aggregate metrics
104
+ const precision = totalTP + totalFP > 0 ? totalTP / (totalTP + totalFP) : 0;
105
+ const recall = totalTP + totalFN > 0 ? totalTP / (totalTP + totalFN) : 0;
106
+ const f1 = precision + recall > 0 ? (2 * precision * recall) / (precision + recall) : 0;
107
+ const avgRelevance = totalReturned > 0 ? totalRelevanceScore / totalReturned : 0;
108
+ return {
109
+ threshold,
110
+ precision,
111
+ recall,
112
+ f1,
113
+ truePositives: totalTP,
114
+ falsePositives: totalFP,
115
+ falseNegatives: totalFN,
116
+ avgRelevanceReturned: avgRelevance,
117
+ totalReturned,
118
+ };
119
+ }
120
+ /**
121
+ * Run full evaluation across multiple threshold values.
122
+ */
123
+ export async function runThresholdEvaluation(projectId, thresholds = [0.2, 0.3, 0.4, 0.5], userId = DEFAULT_USER_ID) {
124
+ const dataset = buildTestDataset();
125
+ if (dataset.length === 0 || dataset.every(j => j.relevantFactIds.length === 0)) {
126
+ throw new Error('Test dataset is empty or has no ground truth judgments. ' +
127
+ 'Please fill buildTestDataset() with real queries and fact IDs.');
128
+ }
129
+ console.log(`\nšŸ“Š Evaluating ${thresholds.length} thresholds on ${dataset.length} queries...\n`);
130
+ const results = [];
131
+ for (const threshold of thresholds) {
132
+ console.log(`Testing threshold ${threshold}...`);
133
+ const result = await evaluateThreshold(projectId, threshold, dataset, userId);
134
+ results.push(result);
135
+ console.log(` P=${result.precision.toFixed(3)} R=${result.recall.toFixed(3)} F1=${result.f1.toFixed(3)}`);
136
+ }
137
+ // Find best thresholds by different metrics
138
+ const bestF1 = results.reduce((best, curr) => curr.f1 > best.f1 ? curr : best);
139
+ const bestPrecision = results.reduce((best, curr) => curr.precision > best.precision ? curr : best);
140
+ const bestRecall = results.reduce((best, curr) => curr.recall > best.recall ? curr : best);
141
+ return {
142
+ projectId,
143
+ testDate: new Date().toISOString(),
144
+ results,
145
+ bestThreshold: {
146
+ byF1: bestF1.threshold,
147
+ byPrecision: bestPrecision.threshold,
148
+ byRecall: bestRecall.threshold,
149
+ },
150
+ dataset,
151
+ };
152
+ }
153
+ // -----------------------------------------------------------------------------
154
+ // Reporting
155
+ // -----------------------------------------------------------------------------
156
+ /**
157
+ * Format evaluation report as markdown.
158
+ */
159
+ export function formatReport(report) {
160
+ let md = `# Relevance Threshold Evaluation Report\n\n`;
161
+ md += `**Project:** ${report.projectId}\n`;
162
+ md += `**Date:** ${report.testDate}\n`;
163
+ md += `**Test Queries:** ${report.dataset.length}\n\n`;
164
+ md += `## Results Summary\n\n`;
165
+ md += `| Threshold | Precision | Recall | F1 Score | TP | FP | FN | Avg Relevance | Total Returned |\n`;
166
+ md += `|-----------|-----------|--------|----------|----|----|----|--------------:|---------------:|\n`;
167
+ for (const r of report.results) {
168
+ md += `| ${r.threshold.toFixed(1)} `;
169
+ md += `| ${(r.precision * 100).toFixed(1)}% `;
170
+ md += `| ${(r.recall * 100).toFixed(1)}% `;
171
+ md += `| ${(r.f1 * 100).toFixed(1)}% `;
172
+ md += `| ${r.truePositives} `;
173
+ md += `| ${r.falsePositives} `;
174
+ md += `| ${r.falseNegatives} `;
175
+ md += `| ${(r.avgRelevanceReturned * 100).toFixed(1)}% `;
176
+ md += `| ${r.totalReturned} |\n`;
177
+ }
178
+ md += `\n## Recommendations\n\n`;
179
+ md += `- **Best F1 Score:** ${report.bestThreshold.byF1} (balanced precision/recall)\n`;
180
+ md += `- **Best Precision:** ${report.bestThreshold.byPrecision} (fewer false positives)\n`;
181
+ md += `- **Best Recall:** ${report.bestThreshold.byRecall} (fewer false negatives)\n\n`;
182
+ md += `## Analysis\n\n`;
183
+ const current = report.results.find(r => r.threshold === 0.3);
184
+ const best = report.results.find(r => r.threshold === report.bestThreshold.byF1);
185
+ if (current && best && current.threshold !== best.threshold) {
186
+ const precisionChange = ((best.precision - current.precision) / current.precision) * 100;
187
+ const recallChange = ((best.recall - current.recall) / current.recall) * 100;
188
+ const f1Change = ((best.f1 - current.f1) / current.f1) * 100;
189
+ md += `Current threshold (0.3) vs Best threshold (${best.threshold}):\n`;
190
+ md += `- Precision: ${precisionChange > 0 ? '+' : ''}${precisionChange.toFixed(1)}%\n`;
191
+ md += `- Recall: ${recallChange > 0 ? '+' : ''}${recallChange.toFixed(1)}%\n`;
192
+ md += `- F1 Score: ${f1Change > 0 ? '+' : ''}${f1Change.toFixed(1)}%\n\n`;
193
+ if (f1Change > 5) {
194
+ md += `**Recommendation:** Change default threshold from 0.3 to ${best.threshold}\n`;
195
+ }
196
+ else {
197
+ md += `**Recommendation:** Keep current threshold (0.3) - minimal improvement from change\n`;
198
+ }
199
+ }
200
+ else if (current && best && current.threshold === best.threshold) {
201
+ md += `**Recommendation:** Keep current threshold (0.3) - already optimal\n`;
202
+ }
203
+ md += `\n## Test Dataset\n\n`;
204
+ for (const judgment of report.dataset) {
205
+ md += `### ${judgment.query}\n`;
206
+ md += `- **Category:** ${judgment.category}\n`;
207
+ md += `- **Description:** ${judgment.description}\n`;
208
+ md += `- **Ground Truth Facts:** ${judgment.relevantFactIds.length}\n\n`;
209
+ }
210
+ return md;
211
+ }
212
+ // -----------------------------------------------------------------------------
213
+ // CLI Interface
214
+ // -----------------------------------------------------------------------------
215
+ /**
216
+ * Run evaluation from command line.
217
+ *
218
+ * Usage:
219
+ * tsx src/evaluation/threshold-eval.ts <project-id>
220
+ */
221
+ export async function main() {
222
+ const projectId = process.argv[2];
223
+ if (!projectId) {
224
+ console.error('Usage: tsx src/evaluation/threshold-eval.ts <project-id>');
225
+ process.exit(1);
226
+ }
227
+ try {
228
+ const report = await runThresholdEvaluation(projectId);
229
+ const markdown = formatReport(report);
230
+ console.log('\n' + markdown);
231
+ // Optionally save to file
232
+ const fs = await import('fs/promises');
233
+ const outputPath = `/Users/oweninnes/Projects/memory/docs/evaluation/threshold-eval-${Date.now()}.md`;
234
+ await fs.mkdir('/Users/oweninnes/Projects/memory/docs/evaluation', { recursive: true });
235
+ await fs.writeFile(outputPath, markdown);
236
+ console.log(`\nšŸ“„ Report saved to: ${outputPath}`);
237
+ }
238
+ catch (error) {
239
+ console.error('Evaluation failed:', error);
240
+ process.exit(1);
241
+ }
242
+ }
243
+ // Run if called directly (ES module check)
244
+ const isMainModule = process.argv[1] && process.argv[1].endsWith('threshold-eval.ts');
245
+ if (isMainModule) {
246
+ main().catch(err => {
247
+ console.error(err);
248
+ process.exit(1);
249
+ });
250
+ }
@@ -213,7 +213,7 @@ export declare function handleValidateSpec(args: any): Promise<{
213
213
  fields: {
214
214
  name: string;
215
215
  type: import("../codedna/parser.js").FieldType;
216
- constraints: ("default" | "length" | "email" | "min" | "max" | "url" | "required" | "pattern" | "nullable" | "unique" | "hashed" | "index" | "immutable" | "textarea" | "switch" | "radio")[];
216
+ constraints: ("default" | "length" | "email" | "min" | "max" | "url" | "pattern" | "required" | "nullable" | "unique" | "hashed" | "index" | "immutable" | "textarea" | "switch" | "radio")[];
217
217
  }[];
218
218
  };
219
219
  summary: {
@@ -369,7 +369,7 @@ export declare function handleInitProject(args: {
369
369
  patterns: {
370
370
  pattern_id: string;
371
371
  name: string;
372
- category: "hooks" | "components" | "forms" | "state" | "validation" | "styling" | "anti-patterns";
372
+ category: "hooks" | "state" | "components" | "forms" | "validation" | "styling" | "anti-patterns";
373
373
  description: string;
374
374
  }[];
375
375
  summary: {