chub-dev 0.2.0-beta.2 → 0.2.0-beta.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "chub-dev",
3
- "version": "0.2.0-beta.2",
3
+ "version": "0.2.0-beta.4",
4
4
  "description": "CLI for Context Hub - search and retrieve LLM-optimized docs and skills",
5
5
  "type": "module",
6
6
  "bin": {
@@ -46,6 +46,6 @@
46
46
  "yaml": "^2.3.0"
47
47
  },
48
48
  "devDependencies": {
49
- "vitest": "^3.0.0"
49
+ "vitest": "^4.0.18"
50
50
  }
51
51
  }
@@ -0,0 +1,83 @@
1
+ import chalk from 'chalk';
2
+ import { readAnnotation, writeAnnotation, clearAnnotation, listAnnotations } from '../lib/annotations.js';
3
+ import { output, error, info } from '../lib/output.js';
4
+
5
/**
 * Wire the `annotate [id] [note]` command into the CLI.
 *
 * The action picks one mode, checked in this order:
 *   1. `--list`          print every stored annotation
 *   2. `<id> --clear`    delete the annotation stored for `id`
 *   3. `<id>`            show the annotation stored for `id`, if any
 *   4. `<id> <note>`     store `note` for `id`
 *
 * Every result is routed through output() together with the program's
 * global options, so the renderers below are only the human-readable side.
 *
 * @param {object} program - Root command object; must provide command()
 *   and optsWithGlobals().
 */
export function registerAnnotateCommand(program) {
  // "id (timestamp)" heading shared by the list view and the single-entry view.
  const heading = (entry) => `${chalk.bold(entry.id)} ${chalk.dim(`(${entry.updatedAt})`)}`;

  // Human-readable renderer for --list.
  const renderList = (items) => {
    if (items.length === 0) {
      console.log('No annotations.');
      return;
    }
    for (const item of items) {
      console.log(heading(item));
      console.log(` ${item.note}`);
      console.log();
    }
  };

  // Human-readable renderer for a single stored annotation.
  const renderOne = (entry) => {
    console.log(heading(entry));
    console.log(entry.note);
  };

  const handleAnnotate = (id, note, opts) => {
    const globalOpts = program.optsWithGlobals();

    if (opts.list) {
      output(listAnnotations(), renderList, globalOpts);
      return;
    }

    // NOTE(review): error() is presumably terminal (other commands call it
    // without returning afterwards) - confirm in ../lib/output.js. If it does
    // return, execution continues into the branches below.
    if (!id) {
      error('Usage: chub annotate <id> <note> | chub annotate <id> --clear | chub annotate --list', globalOpts);
    }

    if (opts.clear) {
      const cleared = clearAnnotation(id);
      output(
        { id, cleared },
        (result) => {
          const message = result.cleared
            ? `Annotation cleared for ${chalk.bold(id)}.`
            : `No annotation found for ${chalk.bold(id)}.`;
          console.log(message);
        },
        globalOpts
      );
      return;
    }

    if (note) {
      const saved = writeAnnotation(id, note);
      output(
        saved,
        (d) => console.log(`Annotation saved for ${chalk.bold(d.id)}.`),
        globalOpts
      );
      return;
    }

    // No note supplied: show whatever is stored for this id.
    const existing = readAnnotation(id);
    if (!existing) {
      output(
        { id, note: null },
        () => console.log(`No annotation for ${chalk.bold(id)}.`),
        globalOpts
      );
      return;
    }
    output(existing, renderOne, globalOpts);
  };

  program
    .command('annotate [id] [note]')
    .description('Attach agent notes to a doc or skill')
    .option('--clear', 'Remove annotation for this entry')
    .option('--list', 'List all annotations')
    .action(handleAnnotate);
}
@@ -4,6 +4,7 @@ import chalk from 'chalk';
4
4
  import { parseFrontmatter } from '../lib/frontmatter.js';
5
5
  import { info } from '../lib/output.js';
6
6
  import { trackEvent } from '../lib/analytics.js';
7
+ import { buildIndex } from '../lib/bm25.js';
7
8
 
8
9
  /**
9
10
  * Recursively find all DOC.md and SKILL.md files under a directory.
@@ -301,6 +302,14 @@ export function registerBuildCommand(program) {
301
302
  mkdirSync(outputDir, { recursive: true });
302
303
  writeFileSync(join(outputDir, 'registry.json'), JSON.stringify(registry, null, 2));
303
304
 
305
+ // Build and write BM25 search index
306
+ const allEntries = [
307
+ ...allDocs.map((d) => ({ ...d, _type: 'doc' })),
308
+ ...allSkills.map((s) => ({ ...s, _type: 'skill' })),
309
+ ];
310
+ const searchIndex = buildIndex(allEntries);
311
+ writeFileSync(join(outputDir, 'search-index.json'), JSON.stringify(searchIndex));
312
+
304
313
  // Copy content tree
305
314
  for (const authorEntry of topLevel) {
306
315
  const src = join(contentDir, authorEntry.name);
@@ -5,19 +5,17 @@ import { getEntry, resolveDocPath, resolveEntryFile } from '../lib/registry.js';
5
5
  import { fetchDoc, fetchDocFull } from '../lib/cache.js';
6
6
  import { output, error, info } from '../lib/output.js';
7
7
  import { trackEvent } from '../lib/analytics.js';
8
+ import { readAnnotation } from '../lib/annotations.js';
8
9
 
9
10
  /**
10
- * Core fetch logic shared by `get docs` and `get skills`.
11
- * @param {string} type - "doc" or "skill"
12
- * @param {string[]} ids - one or more entry ids
13
- * @param {object} opts - command options (lang, version, output, full)
14
- * @param {object} globalOpts - global options (json)
11
+ * Fetch one or more entries by ID. Auto-detects doc vs skill per entry.
15
12
  */
16
- async function fetchEntries(type, ids, opts, globalOpts) {
13
+ async function fetchEntries(ids, opts, globalOpts) {
17
14
  const results = [];
18
15
 
19
16
  for (const id of ids) {
20
- const result = getEntry(id, type);
17
+ // Search both docs and skills — auto-detect type
18
+ const result = getEntry(id);
21
19
 
22
20
  if (result.ambiguous) {
23
21
  error(
@@ -27,16 +25,24 @@ async function fetchEntries(type, ids, opts, globalOpts) {
27
25
  }
28
26
 
29
27
  if (!result.entry) {
30
- error(`Entry "${id}" not found in ${type}s.`, globalOpts);
28
+ error(`Entry "${id}" not found.`, globalOpts);
31
29
  }
32
30
 
33
31
  const entry = result.entry;
32
+ const type = entry.languages ? 'doc' : 'skill';
34
33
  const resolved = resolveDocPath(entry, opts.lang, opts.version);
35
34
 
36
35
  if (!resolved) {
37
36
  error(`Could not resolve path for "${id}" ${opts.lang || ''} ${opts.version || ''}`.trim(), globalOpts);
38
37
  }
39
38
 
39
+ if (resolved.versionNotFound) {
40
+ error(
41
+ `Version "${resolved.requested}" not found for "${id}". Available versions: ${resolved.available.join(', ')}`,
42
+ globalOpts
43
+ );
44
+ }
45
+
40
46
  if (resolved.needsLanguage) {
41
47
  error(
42
48
  `Multiple languages available for "${id}": ${resolved.available.join(', ')}. Specify --lang.`,
@@ -49,13 +55,32 @@ async function fetchEntries(type, ids, opts, globalOpts) {
49
55
  error(`"${id}" ${entryFile.error}`, globalOpts);
50
56
  }
51
57
 
58
+ // Determine which reference files exist (beyond DOC.md/SKILL.md)
59
+ const entryFileName = type === 'skill' ? 'SKILL.md' : 'DOC.md';
60
+ const refFiles = resolved.files.filter((f) => f !== entryFileName);
61
+
52
62
  try {
53
- if (opts.full && resolved.files.length > 0) {
63
+ if (opts.file) {
64
+ // --file mode: fetch specific file(s) by path
65
+ const requested = opts.file.split(',').map((f) => f.trim());
66
+ const invalid = requested.filter((f) => !resolved.files.includes(f));
67
+ if (invalid.length > 0) {
68
+ const available = refFiles.length > 0 ? refFiles.join(', ') : '(none)';
69
+ error(`File "${invalid[0]}" not found in ${id}. Available: ${available}`, globalOpts);
70
+ }
71
+ if (requested.length === 1) {
72
+ const content = await fetchDoc(resolved.source, join(resolved.path, requested[0]));
73
+ results.push({ id: entry.id, type, content, path: join(resolved.path, requested[0]) });
74
+ } else {
75
+ const allFiles = await fetchDocFull(resolved.source, resolved.path, requested);
76
+ results.push({ id: entry.id, type, files: allFiles, path: resolved.path });
77
+ }
78
+ } else if (opts.full && resolved.files.length > 0) {
54
79
  const allFiles = await fetchDocFull(resolved.source, resolved.path, resolved.files);
55
- results.push({ id: entry.id, files: allFiles, path: resolved.path });
80
+ results.push({ id: entry.id, type, files: allFiles, path: resolved.path });
56
81
  } else {
57
82
  const content = await fetchDoc(resolved.source, entryFile.filePath);
58
- results.push({ id: entry.id, content, path: entryFile.filePath });
83
+ results.push({ id: entry.id, type, content, path: entryFile.filePath, additionalFiles: refFiles });
59
84
  }
60
85
  } catch (err) {
61
86
  error(err.message, globalOpts);
@@ -64,7 +89,7 @@ async function fetchEntries(type, ids, opts, globalOpts) {
64
89
 
65
90
  // Track fetches
66
91
  for (const r of results) {
67
- trackEvent(type === 'doc' ? 'doc_fetched' : 'skill_fetched', {
92
+ trackEvent(r.type === 'doc' ? 'doc_fetched' : 'skill_fetched', {
68
93
  entry_id: r.id,
69
94
  full: !!opts.full,
70
95
  lang: opts.lang || undefined,
@@ -74,7 +99,6 @@ async function fetchEntries(type, ids, opts, globalOpts) {
74
99
  // Output
75
100
  if (opts.output) {
76
101
  if (opts.full) {
77
- // --full -o: write individual files preserving directory structure
78
102
  for (const r of results) {
79
103
  if (r.files) {
80
104
  const baseDir = ids.length > 1 ? join(opts.output, r.id) : opts.output;
@@ -111,18 +135,32 @@ async function fetchEntries(type, ids, opts, globalOpts) {
111
135
  }
112
136
  }
113
137
  if (globalOpts.json) {
114
- console.log(JSON.stringify(results.map((r) => ({ id: r.id, path: opts.output }))));
138
+ console.log(JSON.stringify(results.map((r) => ({ id: r.id, type: r.type, path: opts.output }))));
115
139
  }
116
140
  } else {
117
- // stdout
118
141
  if (results.length === 1 && !results[0].files) {
142
+ const r = results[0];
143
+ const extraFiles = r.additionalFiles || [];
144
+ const annotation = readAnnotation(r.id);
145
+ const jsonData = { id: r.id, type: r.type, content: r.content, path: r.path };
146
+ if (extraFiles.length > 0) jsonData.additionalFiles = extraFiles;
147
+ if (annotation) jsonData.annotation = annotation;
119
148
  output(
120
- { id: results[0].id, content: results[0].content, path: results[0].path },
121
- (data) => process.stdout.write(data.content),
149
+ jsonData,
150
+ (data) => {
151
+ process.stdout.write(data.content);
152
+ if (annotation) {
153
+ process.stdout.write(`\n\n---\n[Agent note — ${annotation.updatedAt}]\n${annotation.note}\n`);
154
+ }
155
+ if (extraFiles.length > 0) {
156
+ const fileList = extraFiles.map((f) => ` ${f}`).join('\n');
157
+ const example = `chub get ${r.id} --file ${extraFiles[0]}`;
158
+ process.stdout.write(`\n\n---\nAdditional files available (use --file to fetch):\n${fileList}\nExample: ${example}\n`);
159
+ }
160
+ },
122
161
  globalOpts
123
162
  );
124
163
  } else {
125
- // Concatenate all content (--full to stdout, or multiple entries)
126
164
  const parts = results.flatMap((r) => {
127
165
  if (r.files) {
128
166
  return r.files.map((f) => `# FILE: ${f.name}\n\n${f.content}`);
@@ -131,7 +169,7 @@ async function fetchEntries(type, ids, opts, globalOpts) {
131
169
  });
132
170
  const combined = parts.join('\n\n---\n\n');
133
171
  output(
134
- results.map((r) => ({ id: r.id, path: r.path })),
172
+ results.map((r) => ({ id: r.id, type: r.type, path: r.path })),
135
173
  () => process.stdout.write(combined),
136
174
  globalOpts
137
175
  );
@@ -140,29 +178,16 @@ async function fetchEntries(type, ids, opts, globalOpts) {
140
178
  }
141
179
 
142
180
  export function registerGetCommand(program) {
143
- const get = program
144
- .command('get')
145
- .description('Retrieve docs or skills');
146
-
147
- get
148
- .command('docs <ids...>')
149
- .description('Fetch documentation content')
150
- .option('--lang <language>', 'Language variant')
151
- .option('--version <version>', 'Specific version')
152
- .option('-o, --output <path>', 'Write to file or directory')
153
- .option('--full', 'Fetch all files (not just entry point)')
154
- .action(async (ids, opts) => {
155
- const globalOpts = program.optsWithGlobals();
156
- await fetchEntries('doc', ids, opts, globalOpts);
157
- });
158
-
159
- get
160
- .command('skills <ids...>')
161
- .description('Fetch skill content')
181
+ program
182
+ .command('get <ids...>')
183
+ .description('Fetch docs or skills by ID (auto-detects type)')
184
+ .option('--lang <language>', 'Language variant (for docs)')
185
+ .option('--version <version>', 'Specific version (for docs)')
162
186
  .option('-o, --output <path>', 'Write to file or directory')
163
187
  .option('--full', 'Fetch all files (not just entry point)')
188
+ .option('--file <paths>', 'Fetch specific file(s) by path (comma-separated)')
164
189
  .action(async (ids, opts) => {
165
190
  const globalOpts = program.optsWithGlobals();
166
- await fetchEntries('skill', ids, opts, globalOpts);
191
+ await fetchEntries(ids, opts, globalOpts);
167
192
  });
168
193
  }
package/src/index.js CHANGED
@@ -10,6 +10,7 @@ import { registerSearchCommand } from './commands/search.js';
10
10
  import { registerGetCommand } from './commands/get.js';
11
11
  import { registerBuildCommand } from './commands/build.js';
12
12
  import { registerFeedbackCommand } from './commands/feedback.js';
13
+ import { registerAnnotateCommand } from './commands/annotate.js';
13
14
  import { trackEvent, shutdownAnalytics } from './lib/analytics.js';
14
15
 
15
16
  const __dirname = dirname(fileURLToPath(import.meta.url));
@@ -26,17 +27,16 @@ ${chalk.bold.underline('Getting Started')}
26
27
  ${chalk.dim('$')} chub search ${chalk.dim('# list everything available')}
27
28
  ${chalk.dim('$')} chub search "stripe" ${chalk.dim('# fuzzy search')}
28
29
  ${chalk.dim('$')} chub search stripe/payments ${chalk.dim('# exact id → full detail')}
29
- ${chalk.dim('$')} chub get docs stripe/payments ${chalk.dim('# print doc to terminal')}
30
- ${chalk.dim('$')} chub get docs stripe/payments -o doc.md ${chalk.dim('# save to file')}
31
- ${chalk.dim('$')} chub get docs stripe/payments --lang py ${chalk.dim('# specific language')}
32
- ${chalk.dim('$')} chub get skills pw/login-flows ${chalk.dim('# fetch a skill')}
33
- ${chalk.dim('$')} chub get docs openai/chat stripe/payments ${chalk.dim('# fetch multiple')}
30
+ ${chalk.dim('$')} chub get stripe/api ${chalk.dim('# print doc to terminal')}
31
+ ${chalk.dim('$')} chub get stripe/api -o doc.md ${chalk.dim('# save to file')}
32
+ ${chalk.dim('$')} chub get openai/chat --lang py ${chalk.dim('# specific language')}
33
+ ${chalk.dim('$')} chub get pw-community/login-flows ${chalk.dim('# fetch a skill')}
34
+ ${chalk.dim('$')} chub get openai/chat stripe/api ${chalk.dim('# fetch multiple')}
34
35
 
35
36
  ${chalk.bold.underline('Commands')}
36
37
 
37
38
  ${chalk.bold('search')} [query] Search docs and skills (no query = list all)
38
- ${chalk.bold('get docs')} <ids...> Fetch documentation content
39
- ${chalk.bold('get skills')} <ids...> Fetch skill content
39
+ ${chalk.bold('get')} <ids...> Fetch docs or skills by ID
40
40
  ${chalk.bold('update')} Refresh the cached registry
41
41
  ${chalk.bold('cache')} status|clear Manage the local cache
42
42
  ${chalk.bold('build')} <content-dir> Build registry from content directory
@@ -56,10 +56,10 @@ ${chalk.bold.underline('Agent Piping Patterns')}
56
56
 
57
57
  ${chalk.dim('# Search → pick → fetch → save')}
58
58
  ${chalk.dim('$')} ID=$(chub search "stripe" --json | jq -r '.results[0].id')
59
- ${chalk.dim('$')} chub get docs "$ID" --lang js -o .context/stripe.md
59
+ ${chalk.dim('$')} chub get "$ID" --lang js -o .context/stripe.md
60
60
 
61
- ${chalk.dim('# Fetch multiple docs at once')}
62
- ${chalk.dim('$')} chub get docs openai/chat stripe/payments -o .context/
61
+ ${chalk.dim('# Fetch multiple at once')}
62
+ ${chalk.dim('$')} chub get openai/chat stripe/api -o .context/
63
63
 
64
64
  ${chalk.bold.underline('Multi-Source Config')} ${chalk.dim('(~/.chub/config.yaml)')}
65
65
 
@@ -69,7 +69,7 @@ ${chalk.bold.underline('Multi-Source Config')} ${chalk.dim('(~/.chub/config.yaml
69
69
  ${chalk.dim(' - name: internal')}
70
70
  ${chalk.dim(' path: /path/to/local/docs')}
71
71
 
72
- ${chalk.dim('# On id collision, use source: prefix: chub get docs internal:openai/chat')}
72
+ ${chalk.dim('# On id collision, use source: prefix: chub get internal:openai/chat')}
73
73
  `);
74
74
  }
75
75
 
@@ -78,14 +78,14 @@ const program = new Command();
78
78
  program
79
79
  .name('chub')
80
80
  .description('Context Hub - search and retrieve LLM-optimized docs and skills')
81
- .version(pkg.version)
81
+ .version(pkg.version, '-V, --cli-version')
82
82
  .option('--json', 'Output as JSON (machine-readable)')
83
83
  .action(() => {
84
84
  printUsage();
85
85
  });
86
86
 
87
87
  // Commands that don't need registry
88
- const SKIP_REGISTRY = ['update', 'cache', 'build', 'feedback', 'help'];
88
+ const SKIP_REGISTRY = ['update', 'cache', 'build', 'feedback', 'annotate', 'help'];
89
89
 
90
90
  program.hook('preAction', async (thisCommand) => {
91
91
  const cmdName = thisCommand.args?.[0] || thisCommand.name();
@@ -112,6 +112,7 @@ registerSearchCommand(program);
112
112
  registerGetCommand(program);
113
113
  registerBuildCommand(program);
114
114
  registerFeedbackCommand(program);
115
+ registerAnnotateCommand(program);
115
116
 
116
117
  program.parse();
117
118
 
@@ -0,0 +1,57 @@
1
+ import { readFileSync, writeFileSync, mkdirSync, unlinkSync, readdirSync } from 'node:fs';
2
+ import { join } from 'node:path';
3
+ import { getChubDir } from './config.js';
4
+
5
/**
 * Resolve the directory that holds annotation files.
 *
 * @returns {string} The `annotations` folder inside the directory returned
 *   by getChubDir().
 */
function getAnnotationsDir() {
  const chubDir = getChubDir();
  return join(chubDir, 'annotations');
}
8
+
9
/**
 * Map an entry id to the JSON file that stores its annotation.
 *
 * Path separators inside the id are flattened to `--` so that every
 * annotation is a single file directly inside the annotations directory.
 * Backslashes are flattened as well as forward slashes: on Windows,
 * path.join() treats `\` as a separator, so an id such as `..\..\x` would
 * otherwise resolve to a file outside the annotations directory.
 *
 * @param {string} entryId - Entry id, e.g. `author/name`.
 * @returns {string} Path of the `.json` file for this id.
 */
function annotationPath(entryId) {
  const safe = entryId.replace(/[\\/]/g, '--');
  return join(getAnnotationsDir(), `${safe}.json`);
}
13
+
14
/**
 * Load the stored annotation for an entry.
 *
 * Best-effort: any failure while reading or parsing the file (including the
 * file not existing) yields `null` rather than an exception.
 *
 * @param {string} entryId - Entry id the annotation is stored under.
 * @returns {*} The parsed JSON content of the annotation file, or `null`.
 */
export function readAnnotation(entryId) {
  let parsed = null;
  try {
    const raw = readFileSync(annotationPath(entryId), 'utf8');
    parsed = JSON.parse(raw);
  } catch {
    parsed = null;
  }
  return parsed;
}
21
+
22
/**
 * Create or overwrite the annotation for an entry.
 *
 * The annotations directory is created on demand. The record is stamped
 * with the current time as an ISO-8601 string and written as indented JSON.
 *
 * @param {string} entryId - Entry id the note belongs to.
 * @param {string} note - Note text to store.
 * @returns {{id: string, note: string, updatedAt: string}} The record that was written.
 */
export function writeAnnotation(entryId, note) {
  mkdirSync(getAnnotationsDir(), { recursive: true });

  const record = { id: entryId, note, updatedAt: new Date().toISOString() };
  const target = annotationPath(entryId);
  const serialized = JSON.stringify(record, null, 2);
  writeFileSync(target, serialized);

  return record;
}
33
+
34
/**
 * Delete the stored annotation for an entry.
 *
 * Best-effort: any failure to remove the file (including the file not
 * existing) is reported as `false` rather than thrown.
 *
 * @param {string} entryId - Entry id whose annotation should be removed.
 * @returns {boolean} `true` when the file was removed, otherwise `false`.
 */
export function clearAnnotation(entryId) {
  let removed = true;
  try {
    unlinkSync(annotationPath(entryId));
  } catch {
    removed = false;
  }
  return removed;
}
42
+
43
/**
 * Load every annotation stored in the annotations directory.
 *
 * Only `.json` files are considered. Files that cannot be read or parsed,
 * or whose parsed content is falsy, are left out. If the directory itself
 * cannot be listed, the result is an empty array.
 *
 * @returns {Array} Parsed annotation records, in directory-listing order.
 */
export function listAnnotations() {
  const dir = getAnnotationsDir();

  let names;
  try {
    names = readdirSync(dir);
  } catch {
    return [];
  }

  const found = [];
  for (const name of names) {
    if (!name.endsWith('.json')) continue;
    try {
      const parsed = JSON.parse(readFileSync(join(dir, name), 'utf8'));
      if (parsed) found.push(parsed);
    } catch {
      // Unreadable or malformed file: leave it out of the listing.
    }
  }
  return found;
}
@@ -0,0 +1,170 @@
1
/**
 * BM25 search implementation for Context Hub.
 * Index is built at `chub build` time, scoring happens at search time.
 * Tokenizer is shared between build and search to ensure consistency.
 *
 * Term dictionaries built here use Map rather than plain objects. Tokens are
 * arbitrary lowercase words, and a word such as "constructor" is also the
 * name of a property inherited from Object.prototype; looking it up on a
 * plain `{}` returns a function instead of `undefined` and turns every
 * score it touches into NaN.
 */

const STOP_WORDS = new Set([
  'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from',
  'has', 'have', 'in', 'is', 'it', 'its', 'of', 'on', 'or', 'that',
  'the', 'to', 'was', 'were', 'will', 'with', 'this', 'but', 'not',
  'you', 'your', 'can', 'do', 'does', 'how', 'if', 'may', 'no',
  'so', 'than', 'too', 'very', 'just', 'about', 'into', 'over',
  'such', 'then', 'them', 'these', 'those', 'through', 'under',
  'use', 'using', 'used',
]);

// BM25 default parameters
const DEFAULT_K1 = 1.5;
const DEFAULT_B = 0.75;

// Field weights for multi-field scoring
const FIELD_WEIGHTS = {
  name: 3.0,
  tags: 2.0,
  description: 1.0,
};

/**
 * Tokenize text into lowercase terms with stop word removal.
 * Must be used identically at build time and search time.
 *
 * Falsy input yields no tokens. Non-string input (for example a numeric tag
 * coming out of parsed frontmatter) is converted to a string first instead
 * of throwing.
 *
 * @param {*} text - Text to tokenize.
 * @returns {string[]} Terms of at least two characters that are not stop words.
 */
export function tokenize(text) {
  if (!text) return [];
  const str = typeof text === 'string' ? text : String(text);
  return str
    .toLowerCase()
    .replace(/[^a-z0-9\s-]/g, ' ')
    .split(/[\s-]+/)
    .filter((t) => t.length > 1 && !STOP_WORDS.has(t));
}

/**
 * Build a BM25 search index from registry entries.
 * Called during `chub build`.
 *
 * @param {Array} entries - Combined docs and skills from registry. Each entry
 *   is read for `id`, `name`, `description` and `tags`.
 * @returns {Object} The search index: `{ version, algorithm, params,
 *   totalDocs, avgFieldLengths, idf, documents }`. It contains only plain
 *   JSON-serializable values.
 */
export function buildIndex(entries) {
  const documents = [];
  // Document frequency per term (across all fields). A Map, so that tokens
  // which collide with Object.prototype members are counted like any other.
  const docFrequency = new Map();
  const fieldLengths = { name: [], description: [], tags: [] };

  for (const entry of entries) {
    const nameTokens = tokenize(entry.name);
    const descTokens = tokenize(entry.description || '');
    const tagTokens = (entry.tags || []).flatMap((t) => tokenize(t));

    documents.push({
      id: entry.id,
      tokens: {
        name: nameTokens,
        description: descTokens,
        tags: tagTokens,
      },
    });

    fieldLengths.name.push(nameTokens.length);
    fieldLengths.description.push(descTokens.length);
    fieldLengths.tags.push(tagTokens.length);

    // A term counts once per document (union of all fields).
    const allTerms = new Set([...nameTokens, ...descTokens, ...tagTokens]);
    for (const term of allTerms) {
      docFrequency.set(term, (docFrequency.get(term) ?? 0) + 1);
    }
  }

  const N = documents.length;

  // Compute IDF for each term. Assignment always creates an own property,
  // so a plain object is safe here and keeps the serialized shape unchanged.
  const idf = {};
  for (const [term, df] of docFrequency) {
    idf[term] = Math.log((N - df + 0.5) / (df + 0.5) + 1);
  }

  // Compute average field lengths
  const avg = (arr) => (arr.length === 0 ? 0 : arr.reduce((a, b) => a + b, 0) / arr.length);
  const avgFieldLengths = {
    name: avg(fieldLengths.name),
    description: avg(fieldLengths.description),
    tags: avg(fieldLengths.tags),
  };

  return {
    version: '1.0.0',
    algorithm: 'bm25',
    params: { k1: DEFAULT_K1, b: DEFAULT_B },
    totalDocs: N,
    avgFieldLengths,
    idf,
    documents,
  };
}

/**
 * Compute BM25 score for a single field.
 *
 * For each query term present in the field this adds
 * `idf * tf * (k1 + 1) / (tf + k1 * (1 - b + b * dl / avgFieldLen))`.
 *
 * @param {string[]} queryTerms - Tokenized query.
 * @param {string[]} fieldTokens - Tokens of the field being scored.
 * @param {Object} idf - Term -> inverse document frequency table from the index.
 * @param {number} avgFieldLen - Average token count of this field; 0 is treated as 1.
 * @param {number} k1 - BM25 term-frequency saturation parameter.
 * @param {number} b - BM25 length-normalization parameter.
 * @returns {number} The field score; 0 when nothing matches.
 */
function scoreField(queryTerms, fieldTokens, idf, avgFieldLen, k1, b) {
  if (fieldTokens.length === 0) return 0;

  // Term frequency table for this field.
  const tf = new Map();
  for (const t of fieldTokens) {
    tf.set(t, (tf.get(t) ?? 0) + 1);
  }

  let score = 0;
  const dl = fieldTokens.length;

  for (const term of queryTerms) {
    const termFreq = tf.get(term) ?? 0;
    if (termFreq === 0) continue;

    // Accept only real numbers from the table: this ignores members
    // inherited from Object.prototype as well as the `null` that a NaN
    // becomes after a JSON round trip.
    const rawIdf = idf[term];
    const termIdf = typeof rawIdf === 'number' && Number.isFinite(rawIdf) ? rawIdf : 0;

    const numerator = termFreq * (k1 + 1);
    const denominator = termFreq + k1 * (1 - b + b * (dl / (avgFieldLen || 1)));
    score += termIdf * (numerator / denominator);
  }

  return score;
}

/**
 * Search the BM25 index with a query string.
 *
 * @param {string} query - The search query
 * @param {Object} index - The pre-built BM25 index
 * @param {Object} opts - Options: { limit }
 * @returns {Array} Sorted results: [{ id, score }], best match first. Only
 *   documents with a positive score are included.
 */
export function search(query, index, opts = {}) {
  const queryTerms = tokenize(query);
  if (queryTerms.length === 0) return [];

  const { k1, b } = index.params;
  const results = [];

  for (const doc of index.documents) {
    let totalScore = 0;

    for (const [field, weight] of Object.entries(FIELD_WEIGHTS)) {
      const fieldTokens = doc.tokens[field] || [];
      const avgLen = index.avgFieldLengths[field] || 1;
      const fieldScore = scoreField(queryTerms, fieldTokens, index.idf, avgLen, k1, b);
      totalScore += fieldScore * weight;
    }

    if (totalScore > 0) {
      results.push({ id: doc.id, score: totalScore });
    }
  }

  results.sort((left, right) => right.score - left.score);

  if (opts.limit) {
    return results.slice(0, opts.limit);
  }

  return results;
}
package/src/lib/cache.js CHANGED
@@ -225,6 +225,20 @@ export function loadSourceRegistry(source) {
225
225
  return JSON.parse(readFileSync(regPath, 'utf8'));
226
226
  }
227
227
 
228
/**
 * Load BM25 search index for a single source (if available).
 *
 * The index is read from `search-index.json` under `source.path` when that
 * is set, otherwise under getSourceDir(source.name).
 *
 * Returns `null` when the file is missing, unreadable, not valid JSON, or
 * parses to something that is not shaped like an index (an object with a
 * `documents` array and a `params` object). Handing such a value on would
 * only fail later, when the search code destructures `index.params` and
 * iterates `index.documents`.
 *
 * @param {{path?: string, name?: string}} source - Source descriptor.
 * @returns {Object|null} The parsed index, or `null` when none is usable.
 */
export function loadSearchIndex(source) {
  const basePath = source.path || getSourceDir(source.name);
  const indexPath = join(basePath, 'search-index.json');

  let parsed;
  try {
    // A missing file surfaces here as a read error, so no separate
    // existence check is needed.
    parsed = JSON.parse(readFileSync(indexPath, 'utf8'));
  } catch {
    return null;
  }

  const isIndexShaped =
    parsed !== null &&
    typeof parsed === 'object' &&
    Array.isArray(parsed.documents) &&
    parsed.params !== null &&
    typeof parsed.params === 'object';

  return isIndexShaped ? parsed : null;
}
241
+
228
242
  /**
229
243
  * Get cache stats.
230
244
  */
package/src/lib/config.js CHANGED
@@ -18,7 +18,7 @@ const DEFAULTS = {
18
18
  let _config = null;
19
19
 
20
20
  export function getChubDir() {
21
- return join(homedir(), '.chub');
21
+ return process.env.CHUB_DIR || join(homedir(), '.chub');
22
22
  }
23
23
 
24
24
  export function loadConfig() {