@claudetools/tools 0.7.9 → 0.8.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/evaluation/build-dataset.d.ts +1 -0
- package/dist/evaluation/build-dataset.js +135 -0
- package/dist/evaluation/threshold-eval.d.ts +63 -0
- package/dist/evaluation/threshold-eval.js +250 -0
- package/dist/handlers/codedna-handlers.d.ts +1 -1
- package/dist/handlers/tool-handlers.js +44 -155
- package/dist/helpers/compact-formatter.d.ts +51 -0
- package/dist/helpers/compact-formatter.js +130 -0
- package/dist/helpers/engagement-tracker.d.ts +10 -0
- package/dist/helpers/engagement-tracker.js +61 -0
- package/dist/helpers/session-validation.d.ts +76 -0
- package/dist/helpers/session-validation.js +221 -0
- package/dist/index.d.ts +1 -0
- package/dist/index.js +1 -0
- package/dist/resources.js +3 -0
- package/dist/templates/claude-md.d.ts +1 -1
- package/dist/templates/claude-md.js +23 -35
- package/dist/templates/worker-prompt.js +35 -202
- package/dist/tools.js +22 -20
- package/package.json +4 -2
package/dist/evaluation/build-dataset.d.ts
@@ -0,0 +1 @@
+export {};
package/dist/evaluation/build-dataset.js
@@ -0,0 +1,135 @@
+// =============================================================================
+// Test Dataset Builder
+// =============================================================================
+// Interactive helper to build test dataset with ground truth judgments
+// =============================================================================
+import { getMemoryIndex } from '../helpers/api-client.js';
+import { DEFAULT_USER_ID } from '../helpers/config.js';
+import * as readline from 'readline/promises';
+/**
+ * Interactive CLI to build test dataset.
+ *
+ * For each query:
+ * 1. Shows all facts with relevance scores
+ * 2. User marks which are actually relevant (ground truth)
+ * 3. Saves to JSON file
+ */
+async function buildDatasetInteractive(projectId, userId = DEFAULT_USER_ID) {
+    const rl = readline.createInterface({
+        input: process.stdin,
+        output: process.stdout,
+    });
+    const dataset = [];
+    console.log('📋 Interactive Test Dataset Builder\n');
+    console.log('This tool helps you create ground truth judgments for threshold evaluation.\n');
+    console.log('For each query, you\'ll see all matching facts and mark which are truly relevant.\n');
+    let addMore = true;
+    while (addMore) {
+        // Get query
+        const query = await rl.question('\n📝 Enter test query (or "done" to finish): ');
+        if (query.toLowerCase() === 'done') {
+            break;
+        }
+        if (!query.trim()) {
+            console.log('⚠️ Query cannot be empty');
+            continue;
+        }
+        // Fetch all facts (no relevance filter)
+        console.log('\n⏳ Fetching facts...\n');
+        const result = await getMemoryIndex(projectId, {
+            query,
+            limit: 50,
+            min_relevance: 0, // Get everything
+        }, userId);
+        if (result.index.length === 0) {
+            console.log('❌ No facts found for this query. Try a different query.\n');
+            continue;
+        }
+        // Display facts
+        console.log(`\n📊 Found ${result.index.length} facts:\n`);
+        console.log('─'.repeat(80));
+        result.index.forEach((entry, i) => {
+            const relevanceBar = '█'.repeat(Math.round(entry.relevance * 20));
+            console.log(`\n[${i + 1}] ID: ${entry.id.slice(0, 12)}...`);
+            console.log(`    Relevance: ${(entry.relevance * 100).toFixed(0)}% ${relevanceBar}`);
+            console.log(`    Category: ${entry.category}`);
+            console.log(`    Summary: ${entry.summary}`);
+        });
+        console.log('\n' + '─'.repeat(80));
+        // Get ground truth judgments
+        const relevantIndices = await rl.question('\n✅ Which facts are TRULY RELEVANT? (comma-separated numbers, e.g., "1,3,5"): ');
+        if (!relevantIndices.trim()) {
+            console.log('⚠️ No facts marked as relevant. Skipping this query.');
+            continue;
+        }
+        // Parse indices
+        const indices = relevantIndices
+            .split(',')
+            .map(s => parseInt(s.trim()) - 1)
+            .filter(i => i >= 0 && i < result.index.length);
+        if (indices.length === 0) {
+            console.log('⚠️ Invalid indices. Skipping this query.');
+            continue;
+        }
+        const relevantFactIds = indices.map(i => result.index[i].id);
+        console.log(`\n✅ Marked ${relevantFactIds.length} facts as relevant`);
+        // Get metadata
+        const category = await rl.question('📂 Category (architecture/pattern/decision/preference/fact): ');
+        const description = await rl.question('📝 Description (what is this query testing?): ');
+        // Add to dataset
+        dataset.push({
+            query,
+            relevantFactIds,
+            category: category.trim() || 'fact',
+            description: description.trim() || query,
+        });
+        console.log(`\n✅ Added test case ${dataset.length}`);
+    }
+    rl.close();
+    // Save dataset
+    if (dataset.length > 0) {
+        const fs = await import('fs/promises');
+        const outputPath = `/Users/oweninnes/Projects/memory/docs/evaluation/dataset-${Date.now()}.json`;
+        await fs.mkdir('/Users/oweninnes/Projects/memory/docs/evaluation', { recursive: true });
+        await fs.writeFile(outputPath, JSON.stringify(dataset, null, 2));
+        console.log('\n' + '='.repeat(80));
+        console.log(`\n✅ Dataset saved to: ${outputPath}`);
+        console.log(`\n📊 Total test cases: ${dataset.length}`);
+        console.log(`📊 Total ground truth facts: ${dataset.reduce((sum, d) => sum + d.relevantFactIds.length, 0)}`);
+        // Show code to use in threshold-eval.ts
+        console.log('\n📋 Copy this into threshold-eval.ts buildTestDataset():\n');
+        console.log('```typescript');
+        console.log('export function buildTestDataset(): QueryJudgment[] {');
+        console.log('    return ' + JSON.stringify(dataset, null, 2) + ';');
+        console.log('}');
+        console.log('```\n');
+    }
+    else {
+        console.log('\n⚠️ No test cases created.');
+    }
+}
+/**
+ * CLI entry point.
+ */
+async function main() {
+    const projectId = process.argv[2];
+    if (!projectId) {
+        console.error('Usage: tsx src/evaluation/build-dataset.ts <project-id>');
+        process.exit(1);
+    }
+    try {
+        await buildDatasetInteractive(projectId);
+    }
+    catch (error) {
+        console.error('\n❌ Error:', error);
+        process.exit(1);
+    }
+}
+// Run if called directly (ES module check)
+const isMainModule = process.argv[1] && process.argv[1].endsWith('build-dataset.ts');
+if (isMainModule) {
+    main().catch(err => {
+        console.error(err);
+        process.exit(1);
+    });
+}
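For orientation, each saved dataset entry matches the QueryJudgment shape declared in threshold-eval.d.ts below. A minimal sketch of one entry; the fact IDs are hypothetical placeholders (real IDs come from the memory index shown during the interactive session):

```typescript
import type { QueryJudgment } from './threshold-eval.js';

// Hypothetical test case; the fact IDs are illustrative placeholders,
// not real identifiers from any project.
const example: QueryJudgment = {
    query: 'authentication patterns',
    relevantFactIds: ['fact-0a1b2c3d4e5f', 'fact-9f8e7d6c5b4a'],
    category: 'pattern',
    description: 'Which stored facts describe the auth implementation',
};
```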
package/dist/evaluation/threshold-eval.d.ts
@@ -0,0 +1,63 @@
+export interface QueryJudgment {
+    query: string;
+    relevantFactIds: string[];
+    category: 'architecture' | 'pattern' | 'decision' | 'preference' | 'fact';
+    description: string;
+}
+export interface ThresholdResult {
+    threshold: number;
+    precision: number;
+    recall: number;
+    f1: number;
+    truePositives: number;
+    falsePositives: number;
+    falseNegatives: number;
+    avgRelevanceReturned: number;
+    totalReturned: number;
+}
+export interface EvaluationReport {
+    projectId: string;
+    testDate: string;
+    results: ThresholdResult[];
+    bestThreshold: {
+        byF1: number;
+        byPrecision: number;
+        byRecall: number;
+    };
+    dataset: QueryJudgment[];
+}
+/**
+ * Build test dataset with query-result relevance judgments.
+ *
+ * To use this framework:
+ * 1. Add real queries from your project
+ * 2. Run memory_index with no filter to see all available facts
+ * 3. Manually judge which fact IDs are relevant to each query
+ * 4. Add judgments to this dataset
+ *
+ * Example workflow:
+ * - Query: "authentication patterns"
+ * - Run: memory_index(query="authentication patterns", min_relevance=0)
+ * - Review returned facts, note IDs of truly relevant ones
+ * - Add to dataset with those fact IDs as ground truth
+ */
+export declare function buildTestDataset(): QueryJudgment[];
+/**
+ * Evaluate a single threshold value against the test dataset.
+ */
+export declare function evaluateThreshold(projectId: string, threshold: number, dataset: QueryJudgment[], userId?: string): Promise<ThresholdResult>;
+/**
+ * Run full evaluation across multiple threshold values.
+ */
+export declare function runThresholdEvaluation(projectId: string, thresholds?: number[], userId?: string): Promise<EvaluationReport>;
+/**
+ * Format evaluation report as markdown.
+ */
+export declare function formatReport(report: EvaluationReport): string;
+/**
+ * Run evaluation from command line.
+ *
+ * Usage:
+ *   tsx src/evaluation/threshold-eval.ts <project-id>
+ */
+export declare function main(): Promise<void>;
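These declarations also allow the evaluation to be driven programmatically rather than via the CLI. A minimal sketch, assuming an in-repo relative import and a placeholder project ID:

```typescript
import { runThresholdEvaluation, formatReport } from './threshold-eval.js';

// 'my-project-id' is a placeholder; thresholds default to [0.2, 0.3, 0.4, 0.5].
const report = await runThresholdEvaluation('my-project-id', [0.2, 0.3, 0.4, 0.5]);
console.log(`Best threshold by F1: ${report.bestThreshold.byF1}`);
console.log(formatReport(report));
```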
package/dist/evaluation/threshold-eval.js
@@ -0,0 +1,250 @@
+// =============================================================================
+// Threshold Evaluation Framework
+// =============================================================================
+// Empirical evaluation of relevance threshold values
+// Tests 0.2, 0.3, 0.4, 0.5 to find optimal balance of precision vs recall
+// =============================================================================
+import { getMemoryIndex } from '../helpers/api-client.js';
+import { DEFAULT_USER_ID } from '../helpers/config.js';
+// -----------------------------------------------------------------------------
+// Test Dataset
+// -----------------------------------------------------------------------------
+/**
+ * Build test dataset with query-result relevance judgments.
+ *
+ * To use this framework:
+ * 1. Add real queries from your project
+ * 2. Run memory_index with no filter to see all available facts
+ * 3. Manually judge which fact IDs are relevant to each query
+ * 4. Add judgments to this dataset
+ *
+ * Example workflow:
+ * - Query: "authentication patterns"
+ * - Run: memory_index(query="authentication patterns", min_relevance=0)
+ * - Review returned facts, note IDs of truly relevant ones
+ * - Add to dataset with those fact IDs as ground truth
+ */
+export function buildTestDataset() {
+    return [
+        // Example 1: Architecture query
+        {
+            query: 'context injection system',
+            relevantFactIds: [
+                // TODO: Fill with actual fact IDs from your project
+                // These should be manually verified as relevant to this query
+            ],
+            category: 'architecture',
+            description: 'How context injection works in the system',
+        },
+        // Example 2: Pattern query
+        {
+            query: 'task management workflow',
+            relevantFactIds: [
+                // TODO: Fill with actual fact IDs
+            ],
+            category: 'pattern',
+            description: 'Patterns for task creation and execution',
+        },
+        // Example 3: Decision query
+        {
+            query: 'why use BM25 instead of TF-IDF',
+            relevantFactIds: [
+                // TODO: Fill with actual fact IDs
+            ],
+            category: 'decision',
+            description: 'Architectural decision rationale',
+        },
+        // TODO: Add more test cases
+        // Aim for 15-20 diverse queries covering different:
+        // - Query types (architecture, pattern, decision, preference, fact)
+        // - Query lengths (short vs long)
+        // - Query specificity (broad vs narrow)
+        // - Expected result counts (1-2 facts vs 10+ facts)
+    ];
+}
+// -----------------------------------------------------------------------------
+// Evaluation Logic
+// -----------------------------------------------------------------------------
+/**
+ * Evaluate a single threshold value against the test dataset.
+ */
+export async function evaluateThreshold(projectId, threshold, dataset, userId = DEFAULT_USER_ID) {
+    let totalTP = 0;
+    let totalFP = 0;
+    let totalFN = 0;
+    let totalRelevanceScore = 0;
+    let totalReturned = 0;
+    for (const judgment of dataset) {
+        try {
+            // Query with this threshold
+            const result = await getMemoryIndex(projectId, {
+                query: judgment.query,
+                limit: 50,
+                min_relevance: threshold,
+            }, userId);
+            const returnedIds = new Set(result.index.map(entry => entry.id));
+            const relevantIds = new Set(judgment.relevantFactIds);
+            // Calculate metrics for this query
+            const tp = Array.from(returnedIds).filter(id => relevantIds.has(id)).length;
+            const fp = Array.from(returnedIds).filter(id => !relevantIds.has(id)).length;
+            const fn = Array.from(relevantIds).filter(id => !returnedIds.has(id)).length;
+            totalTP += tp;
+            totalFP += fp;
+            totalFN += fn;
+            // Track average relevance of returned results
+            const relevanceScores = result.index.map(e => e.relevance);
+            totalRelevanceScore += relevanceScores.reduce((sum, r) => sum + r, 0);
+            totalReturned += result.index.length;
+        }
+        catch (error) {
+            console.error(`Error evaluating query "${judgment.query}":`, error);
+        }
+    }
+    // Calculate aggregate metrics
+    const precision = totalTP + totalFP > 0 ? totalTP / (totalTP + totalFP) : 0;
+    const recall = totalTP + totalFN > 0 ? totalTP / (totalTP + totalFN) : 0;
+    const f1 = precision + recall > 0 ? (2 * precision * recall) / (precision + recall) : 0;
+    const avgRelevance = totalReturned > 0 ? totalRelevanceScore / totalReturned : 0;
+    return {
+        threshold,
+        precision,
+        recall,
+        f1,
+        truePositives: totalTP,
+        falsePositives: totalFP,
+        falseNegatives: totalFN,
+        avgRelevanceReturned: avgRelevance,
+        totalReturned,
+    };
+}
+/**
+ * Run full evaluation across multiple threshold values.
+ */
+export async function runThresholdEvaluation(projectId, thresholds = [0.2, 0.3, 0.4, 0.5], userId = DEFAULT_USER_ID) {
+    const dataset = buildTestDataset();
+    if (dataset.length === 0 || dataset.every(j => j.relevantFactIds.length === 0)) {
+        throw new Error('Test dataset is empty or has no ground truth judgments. ' +
+            'Please fill buildTestDataset() with real queries and fact IDs.');
+    }
+    console.log(`\n🔍 Evaluating ${thresholds.length} thresholds on ${dataset.length} queries...\n`);
+    const results = [];
+    for (const threshold of thresholds) {
+        console.log(`Testing threshold ${threshold}...`);
+        const result = await evaluateThreshold(projectId, threshold, dataset, userId);
+        results.push(result);
+        console.log(`  P=${result.precision.toFixed(3)} R=${result.recall.toFixed(3)} F1=${result.f1.toFixed(3)}`);
+    }
+    // Find best thresholds by different metrics
+    const bestF1 = results.reduce((best, curr) => curr.f1 > best.f1 ? curr : best);
+    const bestPrecision = results.reduce((best, curr) => curr.precision > best.precision ? curr : best);
+    const bestRecall = results.reduce((best, curr) => curr.recall > best.recall ? curr : best);
+    return {
+        projectId,
+        testDate: new Date().toISOString(),
+        results,
+        bestThreshold: {
+            byF1: bestF1.threshold,
+            byPrecision: bestPrecision.threshold,
+            byRecall: bestRecall.threshold,
+        },
+        dataset,
+    };
+}
+// -----------------------------------------------------------------------------
+// Reporting
+// -----------------------------------------------------------------------------
+/**
+ * Format evaluation report as markdown.
+ */
+export function formatReport(report) {
+    let md = `# Relevance Threshold Evaluation Report\n\n`;
+    md += `**Project:** ${report.projectId}\n`;
+    md += `**Date:** ${report.testDate}\n`;
+    md += `**Test Queries:** ${report.dataset.length}\n\n`;
+    md += `## Results Summary\n\n`;
+    md += `| Threshold | Precision | Recall | F1 Score | TP | FP | FN | Avg Relevance | Total Returned |\n`;
+    md += `|-----------|-----------|--------|----------|----|----|----|--------------:|---------------:|\n`;
+    for (const r of report.results) {
+        md += `| ${r.threshold.toFixed(1)} `;
+        md += `| ${(r.precision * 100).toFixed(1)}% `;
+        md += `| ${(r.recall * 100).toFixed(1)}% `;
+        md += `| ${(r.f1 * 100).toFixed(1)}% `;
+        md += `| ${r.truePositives} `;
+        md += `| ${r.falsePositives} `;
+        md += `| ${r.falseNegatives} `;
+        md += `| ${(r.avgRelevanceReturned * 100).toFixed(1)}% `;
+        md += `| ${r.totalReturned} |\n`;
+    }
+    md += `\n## Recommendations\n\n`;
+    md += `- **Best F1 Score:** ${report.bestThreshold.byF1} (balanced precision/recall)\n`;
+    md += `- **Best Precision:** ${report.bestThreshold.byPrecision} (fewer false positives)\n`;
+    md += `- **Best Recall:** ${report.bestThreshold.byRecall} (fewer false negatives)\n\n`;
+    md += `## Analysis\n\n`;
+    const current = report.results.find(r => r.threshold === 0.3);
+    const best = report.results.find(r => r.threshold === report.bestThreshold.byF1);
+    if (current && best && current.threshold !== best.threshold) {
+        const precisionChange = ((best.precision - current.precision) / current.precision) * 100;
+        const recallChange = ((best.recall - current.recall) / current.recall) * 100;
+        const f1Change = ((best.f1 - current.f1) / current.f1) * 100;
+        md += `Current threshold (0.3) vs Best threshold (${best.threshold}):\n`;
+        md += `- Precision: ${precisionChange > 0 ? '+' : ''}${precisionChange.toFixed(1)}%\n`;
+        md += `- Recall: ${recallChange > 0 ? '+' : ''}${recallChange.toFixed(1)}%\n`;
+        md += `- F1 Score: ${f1Change > 0 ? '+' : ''}${f1Change.toFixed(1)}%\n\n`;
+        if (f1Change > 5) {
+            md += `**Recommendation:** Change default threshold from 0.3 to ${best.threshold}\n`;
+        }
+        else {
+            md += `**Recommendation:** Keep current threshold (0.3) - minimal improvement from change\n`;
+        }
+    }
+    else if (current && best && current.threshold === best.threshold) {
+        md += `**Recommendation:** Keep current threshold (0.3) - already optimal\n`;
+    }
+    md += `\n## Test Dataset\n\n`;
+    for (const judgment of report.dataset) {
+        md += `### ${judgment.query}\n`;
+        md += `- **Category:** ${judgment.category}\n`;
+        md += `- **Description:** ${judgment.description}\n`;
+        md += `- **Ground Truth Facts:** ${judgment.relevantFactIds.length}\n\n`;
+    }
+    return md;
+}
+// -----------------------------------------------------------------------------
+// CLI Interface
+// -----------------------------------------------------------------------------
+/**
+ * Run evaluation from command line.
+ *
+ * Usage:
+ *   tsx src/evaluation/threshold-eval.ts <project-id>
+ */
+export async function main() {
+    const projectId = process.argv[2];
+    if (!projectId) {
+        console.error('Usage: tsx src/evaluation/threshold-eval.ts <project-id>');
+        process.exit(1);
+    }
+    try {
+        const report = await runThresholdEvaluation(projectId);
+        const markdown = formatReport(report);
+        console.log('\n' + markdown);
+        // Optionally save to file
+        const fs = await import('fs/promises');
+        const outputPath = `/Users/oweninnes/Projects/memory/docs/evaluation/threshold-eval-${Date.now()}.md`;
+        await fs.mkdir('/Users/oweninnes/Projects/memory/docs/evaluation', { recursive: true });
+        await fs.writeFile(outputPath, markdown);
+        console.log(`\n📄 Report saved to: ${outputPath}`);
+    }
+    catch (error) {
+        console.error('Evaluation failed:', error);
+        process.exit(1);
+    }
+}
+// Run if called directly (ES module check)
+const isMainModule = process.argv[1] && process.argv[1].endsWith('threshold-eval.ts');
+if (isMainModule) {
+    main().catch(err => {
+        console.error(err);
+        process.exit(1);
+    });
+}
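Note that evaluateThreshold pools true positives, false positives, and false negatives across all queries before computing metrics (micro-averaging), rather than averaging per-query scores. A worked sketch with hypothetical counts:

```typescript
// Hypothetical pooled counts at one threshold: 30 TP, 10 FP, 20 FN.
const tp = 30, fp = 10, fn = 20;
const precision = tp / (tp + fp);                            // 30 / 40 = 0.750
const recall = tp / (tp + fn);                               // 30 / 50 = 0.600
const f1 = (2 * precision * recall) / (precision + recall);  // 0.9 / 1.35 ≈ 0.667
console.log({ precision, recall, f1 });
```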
package/dist/handlers/codedna-handlers.d.ts
@@ -213,7 +213,7 @@ export declare function handleValidateSpec(args: any): Promise<{
         fields: {
             name: string;
             type: import("../codedna/parser.js").FieldType;
-            constraints: ("default" | "length" | "email" | "min" | "max" | "url" | "
+            constraints: ("default" | "length" | "email" | "min" | "max" | "url" | "pattern" | "required" | "nullable" | "unique" | "hashed" | "index" | "immutable" | "textarea" | "switch" | "radio")[];
         }[];
     };
     summary: {