chub-dev 0.2.0-beta.3 → 0.2.0-beta.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -2
- package/src/commands/annotate.js +83 -0
- package/src/commands/build.js +9 -0
- package/src/commands/get.js +48 -4
- package/src/index.js +4 -2
- package/src/lib/annotations.js +57 -0
- package/src/lib/bm25.js +170 -0
- package/src/lib/cache.js +14 -0
- package/src/lib/config.js +1 -1
- package/src/lib/registry.js +103 -20
- package/dist/anthropic/docs/sdk/javascript/DOC.md +0 -499
- package/dist/anthropic/docs/sdk/python/DOC.md +0 -382
- package/dist/openai/docs/chat/javascript/DOC.md +0 -350
- package/dist/openai/docs/chat/python/DOC.md +0 -526
- package/dist/pinecone/docs/sdk/javascript/DOC.md +0 -984
- package/dist/pinecone/docs/sdk/python/DOC.md +0 -1395
- package/dist/registry.json +0 -276
- package/dist/resend/docs/sdk/DOC.md +0 -1271
- package/dist/stripe/docs/api/DOC.md +0 -1726
- package/dist/supabase/docs/sdk/DOC.md +0 -1606
- package/dist/twilio/docs/sdk/python/DOC.md +0 -469
- package/dist/twilio/docs/sdk/typescript/DOC.md +0 -946
package/package.json
CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "chub-dev",
-  "version": "0.2.0-beta.3",
+  "version": "0.2.0-beta.4",
   "description": "CLI for Context Hub - search and retrieve LLM-optimized docs and skills",
   "type": "module",
   "bin": {
@@ -46,6 +46,6 @@
     "yaml": "^2.3.0"
   },
   "devDependencies": {
-    "vitest": "^
+    "vitest": "^4.0.18"
   }
 }
package/src/commands/annotate.js
ADDED

@@ -0,0 +1,83 @@
+import chalk from 'chalk';
+import { readAnnotation, writeAnnotation, clearAnnotation, listAnnotations } from '../lib/annotations.js';
+import { output, error, info } from '../lib/output.js';
+
+export function registerAnnotateCommand(program) {
+  program
+    .command('annotate [id] [note]')
+    .description('Attach agent notes to a doc or skill')
+    .option('--clear', 'Remove annotation for this entry')
+    .option('--list', 'List all annotations')
+    .action((id, note, opts) => {
+      const globalOpts = program.optsWithGlobals();
+
+      if (opts.list) {
+        const annotations = listAnnotations();
+        output(
+          annotations,
+          (data) => {
+            if (data.length === 0) {
+              console.log('No annotations.');
+              return;
+            }
+            for (const a of data) {
+              console.log(`${chalk.bold(a.id)} ${chalk.dim(`(${a.updatedAt})`)}`);
+              console.log(`  ${a.note}`);
+              console.log();
+            }
+          },
+          globalOpts
+        );
+        return;
+      }
+
+      if (!id) {
+        error('Usage: chub annotate <id> <note> | chub annotate <id> --clear | chub annotate --list', globalOpts);
+      }
+
+      if (opts.clear) {
+        const removed = clearAnnotation(id);
+        output(
+          { id, cleared: removed },
+          (data) => {
+            if (data.cleared) {
+              console.log(`Annotation cleared for ${chalk.bold(id)}.`);
+            } else {
+              console.log(`No annotation found for ${chalk.bold(id)}.`);
+            }
+          },
+          globalOpts
+        );
+        return;
+      }
+
+      if (!note) {
+        // Show existing annotation
+        const existing = readAnnotation(id);
+        if (existing) {
+          output(
+            existing,
+            (data) => {
+              console.log(`${chalk.bold(data.id)} ${chalk.dim(`(${data.updatedAt})`)}`);
+              console.log(data.note);
+            },
+            globalOpts
+          );
+        } else {
+          output(
+            { id, note: null },
+            () => console.log(`No annotation for ${chalk.bold(id)}.`),
+            globalOpts
+          );
+        }
+        return;
+      }
+
+      const data = writeAnnotation(id, note);
+      output(
+        data,
+        (d) => console.log(`Annotation saved for ${chalk.bold(d.id)}.`),
+        globalOpts
+      );
+    });
+}
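For orientation, the four invocation shapes follow directly from the usage string in the code above; a quick sketch, with an illustrative entry id:

    chub annotate pinecone/sdk "Prefer the upsert examples; the query section assumes v3."
    chub annotate pinecone/sdk            # print the stored note
    chub annotate pinecone/sdk --clear    # remove the note
    chub annotate --list                  # list all stored notes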
package/src/commands/build.js
CHANGED

@@ -4,6 +4,7 @@ import chalk from 'chalk';
 import { parseFrontmatter } from '../lib/frontmatter.js';
 import { info } from '../lib/output.js';
 import { trackEvent } from '../lib/analytics.js';
+import { buildIndex } from '../lib/bm25.js';
 
 /**
  * Recursively find all DOC.md and SKILL.md files under a directory.
@@ -301,6 +302,14 @@ export function registerBuildCommand(program) {
   mkdirSync(outputDir, { recursive: true });
   writeFileSync(join(outputDir, 'registry.json'), JSON.stringify(registry, null, 2));
 
+  // Build and write BM25 search index
+  const allEntries = [
+    ...allDocs.map((d) => ({ ...d, _type: 'doc' })),
+    ...allSkills.map((s) => ({ ...s, _type: 'skill' })),
+  ];
+  const searchIndex = buildIndex(allEntries);
+  writeFileSync(join(outputDir, 'search-index.json'), JSON.stringify(searchIndex));
+
   // Copy content tree
   for (const authorEntry of topLevel) {
     const src = join(contentDir, authorEntry.name);
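A note on the shape of `allEntries`: `buildIndex` (see `src/lib/bm25.js` below) only reads `id`, `name`, `description`, and `tags`, so the `_type` tag added here preserves doc/skill provenance without affecting tokenization. A minimal sketch of the build-time call, with hypothetical entries:

    import { buildIndex } from './src/lib/bm25.js';

    // Hypothetical registry entries; only id/name/description/tags matter to the index.
    const index = buildIndex([
      { id: 'openai/chat', name: 'OpenAI Chat', description: 'Chat completions API', tags: ['llm'], _type: 'doc' },
      { id: 'stripe/api', name: 'Stripe API', description: 'Payments API reference', tags: ['payments'], _type: 'doc' },
    ]);

    // index.totalDocs === 2; index.idf holds per-term IDF values;
    // index.documents keeps per-field token arrays for scoring at query time.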
package/src/commands/get.js
CHANGED

@@ -5,6 +5,7 @@ import { getEntry, resolveDocPath, resolveEntryFile } from '../lib/registry.js';
 import { fetchDoc, fetchDocFull } from '../lib/cache.js';
 import { output, error, info } from '../lib/output.js';
 import { trackEvent } from '../lib/analytics.js';
+import { readAnnotation } from '../lib/annotations.js';
 
 /**
  * Fetch one or more entries by ID. Auto-detects doc vs skill per entry.
@@ -35,6 +36,13 @@ async function fetchEntries(ids, opts, globalOpts) {
       error(`Could not resolve path for "${id}" ${opts.lang || ''} ${opts.version || ''}`.trim(), globalOpts);
     }
 
+    if (resolved.versionNotFound) {
+      error(
+        `Version "${resolved.requested}" not found for "${id}". Available versions: ${resolved.available.join(', ')}`,
+        globalOpts
+      );
+    }
+
     if (resolved.needsLanguage) {
       error(
         `Multiple languages available for "${id}": ${resolved.available.join(', ')}. Specify --lang.`,
@@ -47,13 +55,32 @@
       error(`"${id}" ${entryFile.error}`, globalOpts);
     }
 
+    // Determine which reference files exist (beyond DOC.md/SKILL.md)
+    const entryFileName = type === 'skill' ? 'SKILL.md' : 'DOC.md';
+    const refFiles = resolved.files.filter((f) => f !== entryFileName);
+
     try {
-      if (opts.full && resolved.files.length > 0) {
+      if (opts.file) {
+        // --file mode: fetch specific file(s) by path
+        const requested = opts.file.split(',').map((f) => f.trim());
+        const invalid = requested.filter((f) => !resolved.files.includes(f));
+        if (invalid.length > 0) {
+          const available = refFiles.length > 0 ? refFiles.join(', ') : '(none)';
+          error(`File "${invalid[0]}" not found in ${id}. Available: ${available}`, globalOpts);
+        }
+        if (requested.length === 1) {
+          const content = await fetchDoc(resolved.source, join(resolved.path, requested[0]));
+          results.push({ id: entry.id, type, content, path: join(resolved.path, requested[0]) });
+        } else {
+          const allFiles = await fetchDocFull(resolved.source, resolved.path, requested);
+          results.push({ id: entry.id, type, files: allFiles, path: resolved.path });
+        }
+      } else if (opts.full && resolved.files.length > 0) {
         const allFiles = await fetchDocFull(resolved.source, resolved.path, resolved.files);
         results.push({ id: entry.id, type, files: allFiles, path: resolved.path });
       } else {
         const content = await fetchDoc(resolved.source, entryFile.filePath);
-        results.push({ id: entry.id, type, content, path: entryFile.filePath });
+        results.push({ id: entry.id, type, content, path: entryFile.filePath, additionalFiles: refFiles });
       }
     } catch (err) {
       error(err.message, globalOpts);
@@ -112,9 +139,25 @@
     }
   } else {
     if (results.length === 1 && !results[0].files) {
+      const r = results[0];
+      const extraFiles = r.additionalFiles || [];
+      const annotation = readAnnotation(r.id);
+      const jsonData = { id: r.id, type: r.type, content: r.content, path: r.path };
+      if (extraFiles.length > 0) jsonData.additionalFiles = extraFiles;
+      if (annotation) jsonData.annotation = annotation;
       output(
-        results[0],
-        (data) => process.stdout.write(data.content),
+        jsonData,
+        (data) => {
+          process.stdout.write(data.content);
+          if (annotation) {
+            process.stdout.write(`\n\n---\n[Agent note — ${annotation.updatedAt}]\n${annotation.note}\n`);
+          }
+          if (extraFiles.length > 0) {
+            const fileList = extraFiles.map((f) => `  ${f}`).join('\n');
+            const example = `chub get ${r.id} --file ${extraFiles[0]}`;
+            process.stdout.write(`\n\n---\nAdditional files available (use --file to fetch):\n${fileList}\nExample: ${example}\n`);
+          }
+        },
         globalOpts
      );
    } else {
@@ -142,6 +185,7 @@ export function registerGetCommand(program) {
     .option('--version <version>', 'Specific version (for docs)')
     .option('-o, --output <path>', 'Write to file or directory')
     .option('--full', 'Fetch all files (not just entry point)')
+    .option('--file <paths>', 'Fetch specific file(s) by path (comma-separated)')
     .action(async (ids, opts) => {
       const globalOpts = program.optsWithGlobals();
       await fetchEntries(ids, opts, globalOpts);
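Taken together, the default (non-`--full`) path now ends with a trailer advertising sibling reference files and the exact command to fetch them. A sketch of the resulting flows, with an illustrative id and file names:

    chub get pinecone/sdk                      # DOC.md, plus any agent note and an "Additional files" trailer
    chub get pinecone/sdk --file examples.md   # one reference file via fetchDoc
    chub get pinecone/sdk --file a.md,b.md     # several at once via fetchDocFull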
package/src/index.js
CHANGED

@@ -10,6 +10,7 @@ import { registerSearchCommand } from './commands/search.js';
 import { registerGetCommand } from './commands/get.js';
 import { registerBuildCommand } from './commands/build.js';
 import { registerFeedbackCommand } from './commands/feedback.js';
+import { registerAnnotateCommand } from './commands/annotate.js';
 import { trackEvent, shutdownAnalytics } from './lib/analytics.js';
 
 const __dirname = dirname(fileURLToPath(import.meta.url));
@@ -77,14 +78,14 @@ const program = new Command();
 program
   .name('chub')
   .description('Context Hub - search and retrieve LLM-optimized docs and skills')
-  .version(pkg.version)
+  .version(pkg.version, '-V, --cli-version')
   .option('--json', 'Output as JSON (machine-readable)')
   .action(() => {
     printUsage();
   });
 
 // Commands that don't need registry
-const SKIP_REGISTRY = ['update', 'cache', 'build', 'feedback', 'help'];
+const SKIP_REGISTRY = ['update', 'cache', 'build', 'feedback', 'annotate', 'help'];
 
 program.hook('preAction', async (thisCommand) => {
   const cmdName = thisCommand.args?.[0] || thisCommand.name();
@@ -111,6 +112,7 @@ registerSearchCommand(program);
 registerGetCommand(program);
 registerBuildCommand(program);
 registerFeedbackCommand(program);
+registerAnnotateCommand(program);
 
 program.parse();
 
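The `.version()` change relies on Commander accepting custom flags as the second argument, presumably to keep a global `--version` from being confused with `chub get`'s own `--version <version>` option. A minimal sketch of that Commander behavior:

    import { Command } from 'commander';

    const program = new Command()
      .name('chub')
      // '-V, --cli-version' replaces Commander's default '-V, --version' flags,
      // so the plain --version name stays free for subcommand options.
      .version('0.2.0-beta.4', '-V, --cli-version');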
package/src/lib/annotations.js
ADDED

@@ -0,0 +1,57 @@
+import { readFileSync, writeFileSync, mkdirSync, unlinkSync, readdirSync } from 'node:fs';
+import { join } from 'node:path';
+import { getChubDir } from './config.js';
+
+function getAnnotationsDir() {
+  return join(getChubDir(), 'annotations');
+}
+
+function annotationPath(entryId) {
+  const safe = entryId.replace(/\//g, '--');
+  return join(getAnnotationsDir(), `${safe}.json`);
+}
+
+export function readAnnotation(entryId) {
+  try {
+    return JSON.parse(readFileSync(annotationPath(entryId), 'utf8'));
+  } catch {
+    return null;
+  }
+}
+
+export function writeAnnotation(entryId, note) {
+  const dir = getAnnotationsDir();
+  mkdirSync(dir, { recursive: true });
+  const data = {
+    id: entryId,
+    note,
+    updatedAt: new Date().toISOString(),
+  };
+  writeFileSync(annotationPath(entryId), JSON.stringify(data, null, 2));
+  return data;
+}
+
+export function clearAnnotation(entryId) {
+  try {
+    unlinkSync(annotationPath(entryId));
+    return true;
+  } catch {
+    return false;
+  }
+}
+
+export function listAnnotations() {
+  const dir = getAnnotationsDir();
+  try {
+    const files = readdirSync(dir).filter((f) => f.endsWith('.json'));
+    return files.map((f) => {
+      try {
+        return JSON.parse(readFileSync(join(dir, f), 'utf8'));
+      } catch {
+        return null;
+      }
+    }).filter(Boolean);
+  } catch {
+    return [];
+  }
+}
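A round trip through the new helpers, as a sketch (the id is illustrative; note how slashes in ids become `--` in the on-disk file name):

    import { writeAnnotation, readAnnotation, clearAnnotation } from './src/lib/annotations.js';

    // Persisted to <chub dir>/annotations/openai--chat.json
    writeAnnotation('openai/chat', 'The streaming examples are the current ones.');
    readAnnotation('openai/chat');   // → { id, note, updatedAt }, or null if absent
    clearAnnotation('openai/chat');  // → true if a file was removed, false otherwise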
package/src/lib/bm25.js
ADDED

@@ -0,0 +1,170 @@
+/**
+ * BM25 search implementation for Context Hub.
+ * Index is built at `chub build` time, scoring happens at search time.
+ * Tokenizer is shared between build and search to ensure consistency.
+ */
+
+const STOP_WORDS = new Set([
+  'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from',
+  'has', 'have', 'in', 'is', 'it', 'its', 'of', 'on', 'or', 'that',
+  'the', 'to', 'was', 'were', 'will', 'with', 'this', 'but', 'not',
+  'you', 'your', 'can', 'do', 'does', 'how', 'if', 'may', 'no',
+  'so', 'than', 'too', 'very', 'just', 'about', 'into', 'over',
+  'such', 'then', 'them', 'these', 'those', 'through', 'under',
+  'use', 'using', 'used',
+]);
+
+// BM25 default parameters
+const DEFAULT_K1 = 1.5;
+const DEFAULT_B = 0.75;
+
+// Field weights for multi-field scoring
+const FIELD_WEIGHTS = {
+  name: 3.0,
+  tags: 2.0,
+  description: 1.0,
+};
+
+/**
+ * Tokenize text into lowercase terms with stop word removal.
+ * Must be used identically at build time and search time.
+ */
+export function tokenize(text) {
+  if (!text) return [];
+  return text
+    .toLowerCase()
+    .replace(/[^a-z0-9\s-]/g, ' ')
+    .split(/[\s-]+/)
+    .filter((t) => t.length > 1 && !STOP_WORDS.has(t));
+}
+
+/**
+ * Build a BM25 search index from registry entries.
+ * Called during `chub build`.
+ *
+ * @param {Array} entries - Combined docs and skills from registry
+ * @returns {Object} The search index
+ */
+export function buildIndex(entries) {
+  const documents = [];
+  const dfMap = {}; // document frequency per term (across all fields)
+  const fieldLengths = { name: [], description: [], tags: [] };
+
+  for (const entry of entries) {
+    const nameTokens = tokenize(entry.name);
+    const descTokens = tokenize(entry.description || '');
+    const tagTokens = (entry.tags || []).flatMap((t) => tokenize(t));
+
+    documents.push({
+      id: entry.id,
+      tokens: {
+        name: nameTokens,
+        description: descTokens,
+        tags: tagTokens,
+      },
+    });
+
+    fieldLengths.name.push(nameTokens.length);
+    fieldLengths.description.push(descTokens.length);
+    fieldLengths.tags.push(tagTokens.length);
+
+    // Count document frequency — a term counts once per document (union of all fields)
+    const allTerms = new Set([...nameTokens, ...descTokens, ...tagTokens]);
+    for (const term of allTerms) {
+      dfMap[term] = (dfMap[term] || 0) + 1;
+    }
+  }
+
+  const N = documents.length;
+
+  // Compute IDF for each term
+  const idf = {};
+  for (const [term, df] of Object.entries(dfMap)) {
+    idf[term] = Math.log((N - df + 0.5) / (df + 0.5) + 1);
+  }
+
+  // Compute average field lengths
+  const avg = (arr) => arr.length === 0 ? 0 : arr.reduce((a, b) => a + b, 0) / arr.length;
+  const avgFieldLengths = {
+    name: avg(fieldLengths.name),
+    description: avg(fieldLengths.description),
+    tags: avg(fieldLengths.tags),
+  };
+
+  return {
+    version: '1.0.0',
+    algorithm: 'bm25',
+    params: { k1: DEFAULT_K1, b: DEFAULT_B },
+    totalDocs: N,
+    avgFieldLengths,
+    idf,
+    documents,
+  };
+}
+
+/**
+ * Compute BM25 score for a single field.
+ */
+function scoreField(queryTerms, fieldTokens, idf, avgFieldLen, k1, b) {
+  if (fieldTokens.length === 0) return 0;
+
+  // Build term frequency map for this field
+  const tf = {};
+  for (const t of fieldTokens) {
+    tf[t] = (tf[t] || 0) + 1;
+  }
+
+  let score = 0;
+  const dl = fieldTokens.length;
+
+  for (const term of queryTerms) {
+    const termFreq = tf[term] || 0;
+    if (termFreq === 0) continue;
+
+    const termIdf = idf[term] || 0;
+    const numerator = termFreq * (k1 + 1);
+    const denominator = termFreq + k1 * (1 - b + b * (dl / (avgFieldLen || 1)));
+    score += termIdf * (numerator / denominator);
+  }
+
+  return score;
+}
+
+/**
+ * Search the BM25 index with a query string.
+ *
+ * @param {string} query - The search query
+ * @param {Object} index - The pre-built BM25 index
+ * @param {Object} opts - Options: { limit }
+ * @returns {Array} Sorted results: [{ id, score }]
+ */
+export function search(query, index, opts = {}) {
+  const queryTerms = tokenize(query);
+  if (queryTerms.length === 0) return [];
+
+  const { k1, b } = index.params;
+  const results = [];
+
+  for (const doc of index.documents) {
+    let totalScore = 0;
+
+    for (const [field, weight] of Object.entries(FIELD_WEIGHTS)) {
+      const fieldTokens = doc.tokens[field] || [];
+      const avgLen = index.avgFieldLengths[field] || 1;
+      const fieldScore = scoreField(queryTerms, fieldTokens, index.idf, avgLen, k1, b);
+      totalScore += fieldScore * weight;
+    }
+
+    if (totalScore > 0) {
+      results.push({ id: doc.id, score: totalScore });
+    }
+  }
+
+  results.sort((a, b) => b.score - a.score);
+
+  if (opts.limit) {
+    return results.slice(0, opts.limit);
+  }
+
+  return results;
+}
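The module is deliberately symmetric: `buildIndex` runs at build time, `search` at query time, and both share `tokenize`. A sketch with toy entries (ids and text hypothetical); note that with N = 2 documents and a term present in df = 1 of them, the IDF formula above gives ln((2 - 1 + 0.5) / (1 + 0.5) + 1) = ln 2 ≈ 0.693:

    import { buildIndex, search } from './src/lib/bm25.js';

    const index = buildIndex([
      { id: 'resend/sdk', name: 'Resend SDK', description: 'Send transactional email', tags: ['email'] },
      { id: 'twilio/sdk', name: 'Twilio SDK', description: 'SMS and voice APIs', tags: ['sms', 'voice'] },
    ]);

    // 'email' occurs in one of two documents, so its IDF is ln 2 as computed above.
    // Field scores are weighted: name 3x, tags 2x, description 1x.
    search('transactional email', index, { limit: 5 });
    // Returns [{ id: 'resend/sdk', score }] only; twilio/sdk scores 0 and is dropped.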
package/src/lib/cache.js
CHANGED

@@ -225,6 +225,20 @@ export function loadSourceRegistry(source) {
   return JSON.parse(readFileSync(regPath, 'utf8'));
 }
 
+/**
+ * Load BM25 search index for a single source (if available).
+ */
+export function loadSearchIndex(source) {
+  const basePath = source.path || getSourceDir(source.name);
+  const indexPath = join(basePath, 'search-index.json');
+  if (!existsSync(indexPath)) return null;
+  try {
+    return JSON.parse(readFileSync(indexPath, 'utf8'));
+  } catch {
+    return null;
+  }
+}
+
 /**
  * Get cache stats.
  */
package/src/lib/config.js
CHANGED
package/src/lib/registry.js
CHANGED

@@ -1,8 +1,10 @@
-import { loadSourceRegistry } from './cache.js';
+import { loadSourceRegistry, loadSearchIndex } from './cache.js';
 import { loadConfig } from './config.js';
 import { normalizeLanguage } from './normalize.js';
+import { search as bm25Search } from './bm25.js';
 
 let _merged = null;
+let _searchIndex = null;
 
 /**
  * Load and merge entries from all configured sources.
@@ -14,11 +16,16 @@ function getMerged() {
   const config = loadConfig();
   const allDocs = [];
   const allSkills = [];
+  const searchIndexes = [];
 
   for (const source of config.sources) {
     const registry = loadSourceRegistry(source);
     if (!registry) continue;
 
+    // Load BM25 search index if available
+    const idx = loadSearchIndex(source);
+    if (idx) searchIndexes.push(idx);
+
     // Support both new format (docs/skills) and old format (entries)
     if (registry.docs) {
       for (const doc of registry.docs) {
@@ -46,6 +53,53 @@ function getMerged() {
     }
   }
 
+  // Merge search indexes (combine documents and recompute IDF)
+  if (searchIndexes.length > 0) {
+    if (searchIndexes.length === 1) {
+      _searchIndex = searchIndexes[0];
+    } else {
+      // Merge multiple indexes: combine documents, recompute global IDF
+      const allDocuments = searchIndexes.flatMap((idx) => idx.documents);
+      const N = allDocuments.length;
+      const dfMap = {};
+      const fieldLengths = { name: [], description: [], tags: [] };
+
+      for (const doc of allDocuments) {
+        const allTerms = new Set([
+          ...(doc.tokens.name || []),
+          ...(doc.tokens.description || []),
+          ...(doc.tokens.tags || []),
+        ]);
+        for (const term of allTerms) {
+          dfMap[term] = (dfMap[term] || 0) + 1;
+        }
+        fieldLengths.name.push((doc.tokens.name || []).length);
+        fieldLengths.description.push((doc.tokens.description || []).length);
+        fieldLengths.tags.push((doc.tokens.tags || []).length);
+      }
+
+      const idf = {};
+      for (const [term, df] of Object.entries(dfMap)) {
+        idf[term] = Math.log((N - df + 0.5) / (df + 0.5) + 1);
+      }
+
+      const avg = (arr) => arr.length === 0 ? 0 : arr.reduce((a, b) => a + b, 0) / arr.length;
+      _searchIndex = {
+        version: '1.0.0',
+        algorithm: 'bm25',
+        params: searchIndexes[0].params,
+        totalDocs: N,
+        avgFieldLengths: {
+          name: avg(fieldLengths.name),
+          description: avg(fieldLengths.description),
+          tags: avg(fieldLengths.tags),
+        },
+        idf,
+        documents: allDocuments,
+      };
+    }
+  }
+
   _merged = { docs: allDocs, skills: allSkills };
   return _merged;
 }
@@ -121,11 +175,10 @@ export function getDisplayId(entry) {
 
 /**
  * Search entries by query string. Searches both docs and skills.
+ * Uses BM25 when a search index is available, falls back to keyword matching.
  */
 export function searchEntries(query, filters = {}) {
   const entries = applySourceFilter(getAllEntries());
-  const q = query.toLowerCase();
-  const words = q.split(/\s+/);
 
   // Deduplicate: same id+source appearing as both doc and skill → show once
   const seen = new Set();
@@ -138,27 +191,50 @@ export function searchEntries(query, filters = {}) {
     }
   }
 
-  let results = deduped.map((entry) => {
-    let score = 0;
+  // Build entry lookup by id
+  const entryById = new Map();
+  for (const entry of deduped) {
+    entryById.set(entry.id, entry);
+  }
+
+  let results;
+
+  if (_searchIndex) {
+    // BM25 search
+    const bm25Results = bm25Search(query, _searchIndex);
+    results = bm25Results
+      .map((r) => {
+        const entry = entryById.get(r.id);
+        return entry ? { entry, score: r.score } : null;
+      })
+      .filter(Boolean);
+  } else {
+    // Fallback: keyword matching
+    const q = query.toLowerCase();
+    const words = q.split(/\s+/);
 
-    if (entry.id === q) score += 100;
-    else if (entry.id.includes(q)) score += 50;
+    results = deduped.map((entry) => {
+      let score = 0;
 
-    const nameLower = entry.name.toLowerCase();
-    if (nameLower === q) score += 80;
-    else if (nameLower.includes(q)) score += 40;
+      if (entry.id === q) score += 100;
+      else if (entry.id.includes(q)) score += 50;
 
-    for (const word of words) {
-      if (entry.id.includes(word)) score += 10;
-      if (nameLower.includes(word)) score += 10;
-      if (entry.description?.toLowerCase().includes(word)) score += 5;
-      if (entry.tags?.some((t) => t.toLowerCase().includes(word))) score += 15;
-    }
+      const nameLower = entry.name.toLowerCase();
+      if (nameLower === q) score += 80;
+      else if (nameLower.includes(q)) score += 40;
 
-    return { entry, score };
-  });
+      for (const word of words) {
+        if (entry.id.includes(word)) score += 10;
+        if (nameLower.includes(word)) score += 10;
+        if (entry.description?.toLowerCase().includes(word)) score += 5;
+        if (entry.tags?.some((t) => t.toLowerCase().includes(word))) score += 15;
+      }
+
+      return { entry, score };
+    });
 
-  results = results.filter((r) => r.score > 0);
+    results = results.filter((r) => r.score > 0);
+  }
 
   const filtered = applyFilters(results.map((r) => r.entry), filters);
   const filteredSet = new Set(filtered);
@@ -255,6 +331,13 @@ export function resolveDocPath(entry, language, version) {
   let verObj = null;
   if (version) {
     verObj = langObj.versions?.find((v) => v.version === version);
+    if (!verObj) {
+      return {
+        versionNotFound: true,
+        requested: version,
+        available: langObj.versions?.map((v) => v.version) || [],
+      };
+    }
   } else {
     const rec = langObj.recommendedVersion;
     verObj = langObj.versions?.find((v) => v.version === rec) || langObj.versions?.[0];
@@ -272,7 +355,7 @@ export function resolveDocPath(entry, language, version) {
  * Given a resolved path and a type ("doc" or "skill"), return the entry file path.
  */
 export function resolveEntryFile(resolved, type) {
-  if (!resolved || resolved.needsLanguage) return { error: 'unresolved' };
+  if (!resolved || resolved.needsLanguage || resolved.versionNotFound) return { error: 'unresolved' };
 
   const fileName = type === 'skill' ? 'SKILL.md' : 'DOC.md';
 
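One subtlety in the merge branch above: IDF is corpus-global, so per-source values cannot simply be concatenated; each term's document frequency must be recounted against the combined corpus, which is what the `dfMap` pass does before reapplying the same formula. A worked check using the code's own formula:

    // Source A alone: N = 10, df = 1  → Math.log((10 - 1 + 0.5) / (1 + 0.5) + 1)   ≈ 1.99
    // Source B alone: N = 10, df = 9  → Math.log((10 - 9 + 0.5) / (9 + 0.5) + 1)   ≈ 0.15
    // Merged corpus:  N = 20, df = 10 → Math.log((20 - 10 + 0.5) / (10 + 0.5) + 1) ≈ 0.69
    // Neither per-source value is reusable; only the recomputed global IDF is consistent.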