chub-dev 0.2.0-beta.3 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +76 -0
- package/bin/chub-mcp +2 -0
- package/package.json +11 -6
- package/skills/get-api-docs/SKILL.md +81 -0
- package/src/commands/annotate.js +83 -0
- package/src/commands/build.js +21 -4
- package/src/commands/feedback.js +12 -9
- package/src/commands/get.js +77 -12
- package/src/commands/help.js +34 -0
- package/src/commands/search.js +17 -8
- package/src/index.js +35 -67
- package/src/lib/analytics.js +13 -2
- package/src/lib/annotations.js +57 -0
- package/src/lib/bm25.js +303 -0
- package/src/lib/cache.js +108 -17
- package/src/lib/config.js +15 -2
- package/src/lib/help.js +158 -0
- package/src/lib/identity.js +12 -1
- package/src/lib/registry.js +283 -27
- package/src/lib/telemetry.js +7 -1
- package/src/lib/welcome.js +42 -0
- package/src/mcp/server.js +184 -0
- package/src/mcp/stdio-lifecycle.js +54 -0
- package/src/mcp/tools.js +286 -0
- package/dist/anthropic/docs/sdk/javascript/DOC.md +0 -499
- package/dist/anthropic/docs/sdk/python/DOC.md +0 -382
- package/dist/openai/docs/chat/javascript/DOC.md +0 -350
- package/dist/openai/docs/chat/python/DOC.md +0 -526
- package/dist/pinecone/docs/sdk/javascript/DOC.md +0 -984
- package/dist/pinecone/docs/sdk/python/DOC.md +0 -1395
- package/dist/registry.json +0 -276
- package/dist/resend/docs/sdk/DOC.md +0 -1271
- package/dist/stripe/docs/api/DOC.md +0 -1726
- package/dist/supabase/docs/sdk/DOC.md +0 -1606
- package/dist/twilio/docs/sdk/python/DOC.md +0 -469
- package/dist/twilio/docs/sdk/typescript/DOC.md +0 -946
package/src/index.js
CHANGED
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
import chalk from 'chalk';
|
|
2
1
|
import { Command } from 'commander';
|
|
3
2
|
import { readFileSync } from 'node:fs';
|
|
4
3
|
import { fileURLToPath } from 'node:url';
|
|
@@ -10,87 +9,55 @@ import { registerSearchCommand } from './commands/search.js';
|
|
|
10
9
|
import { registerGetCommand } from './commands/get.js';
|
|
11
10
|
import { registerBuildCommand } from './commands/build.js';
|
|
12
11
|
import { registerFeedbackCommand } from './commands/feedback.js';
|
|
13
|
-
import {
|
|
12
|
+
import { registerAnnotateCommand } from './commands/annotate.js';
|
|
13
|
+
import { registerHelpCommand } from './commands/help.js';
|
|
14
|
+
import { trackEvent, shutdownAnalytics, setCliVersion } from './lib/analytics.js';
|
|
15
|
+
import { error } from './lib/output.js';
|
|
16
|
+
import { showWelcomeIfNeeded } from './lib/welcome.js';
|
|
17
|
+
import { getLocalHelpText } from './lib/help.js';
|
|
14
18
|
|
|
15
19
|
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
16
20
|
const pkg = JSON.parse(readFileSync(join(__dirname, '..', 'package.json'), 'utf8'));
|
|
17
|
-
|
|
18
|
-
function printUsage() {
|
|
19
|
-
console.log(`
|
|
20
|
-
${chalk.bold('chub')} — Context Hub CLI v${pkg.version}
|
|
21
|
-
Search and retrieve LLM-optimized docs and skills.
|
|
22
|
-
|
|
23
|
-
${chalk.bold.underline('Getting Started')}
|
|
24
|
-
|
|
25
|
-
${chalk.dim('$')} chub update ${chalk.dim('# download the registry')}
|
|
26
|
-
${chalk.dim('$')} chub search ${chalk.dim('# list everything available')}
|
|
27
|
-
${chalk.dim('$')} chub search "stripe" ${chalk.dim('# fuzzy search')}
|
|
28
|
-
${chalk.dim('$')} chub search stripe/payments ${chalk.dim('# exact id → full detail')}
|
|
29
|
-
${chalk.dim('$')} chub get stripe/api ${chalk.dim('# print doc to terminal')}
|
|
30
|
-
${chalk.dim('$')} chub get stripe/api -o doc.md ${chalk.dim('# save to file')}
|
|
31
|
-
${chalk.dim('$')} chub get openai/chat --lang py ${chalk.dim('# specific language')}
|
|
32
|
-
${chalk.dim('$')} chub get pw-community/login-flows ${chalk.dim('# fetch a skill')}
|
|
33
|
-
${chalk.dim('$')} chub get openai/chat stripe/api ${chalk.dim('# fetch multiple')}
|
|
34
|
-
|
|
35
|
-
${chalk.bold.underline('Commands')}
|
|
36
|
-
|
|
37
|
-
${chalk.bold('search')} [query] Search docs and skills (no query = list all)
|
|
38
|
-
${chalk.bold('get')} <ids...> Fetch docs or skills by ID
|
|
39
|
-
${chalk.bold('update')} Refresh the cached registry
|
|
40
|
-
${chalk.bold('cache')} status|clear Manage the local cache
|
|
41
|
-
${chalk.bold('build')} <content-dir> Build registry from content directory
|
|
42
|
-
|
|
43
|
-
${chalk.bold.underline('Flags')}
|
|
44
|
-
|
|
45
|
-
--json Structured JSON output (for agents and piping)
|
|
46
|
-
--tags <csv> Filter by tags (e.g. docs, skill, openai, browser)
|
|
47
|
-
--lang <language> Language variant (js, py, ts)
|
|
48
|
-
--full Fetch all files, not just the entry point
|
|
49
|
-
-o, --output <path> Write content to file or directory
|
|
50
|
-
|
|
51
|
-
${chalk.bold.underline('Agent Piping Patterns')}
|
|
52
|
-
|
|
53
|
-
${chalk.dim('# Get the top result id')}
|
|
54
|
-
${chalk.dim('$')} chub search "stripe" --json | jq -r '.results[0].id'
|
|
55
|
-
|
|
56
|
-
${chalk.dim('# Search → pick → fetch → save')}
|
|
57
|
-
${chalk.dim('$')} ID=$(chub search "stripe" --json | jq -r '.results[0].id')
|
|
58
|
-
${chalk.dim('$')} chub get "$ID" --lang js -o .context/stripe.md
|
|
59
|
-
|
|
60
|
-
${chalk.dim('# Fetch multiple at once')}
|
|
61
|
-
${chalk.dim('$')} chub get openai/chat stripe/api -o .context/
|
|
62
|
-
|
|
63
|
-
${chalk.bold.underline('Multi-Source Config')} ${chalk.dim('(~/.chub/config.yaml)')}
|
|
64
|
-
|
|
65
|
-
${chalk.dim('sources:')}
|
|
66
|
-
${chalk.dim(' - name: community')}
|
|
67
|
-
${chalk.dim(' url: https://cdn.aichub.org/v1')}
|
|
68
|
-
${chalk.dim(' - name: internal')}
|
|
69
|
-
${chalk.dim(' path: /path/to/local/docs')}
|
|
70
|
-
|
|
71
|
-
${chalk.dim('# On id collision, use source: prefix: chub get internal:openai/chat')}
|
|
72
|
-
`);
|
|
73
|
-
}
|
|
21
|
+
setCliVersion(pkg.version);
|
|
74
22
|
|
|
75
23
|
const program = new Command();
|
|
76
24
|
|
|
77
25
|
program
|
|
78
26
|
.name('chub')
|
|
79
27
|
.description('Context Hub - search and retrieve LLM-optimized docs and skills')
|
|
80
|
-
.version(pkg.version)
|
|
28
|
+
.version(pkg.version, '-V, --cli-version')
|
|
29
|
+
.addHelpCommand(false)
|
|
81
30
|
.option('--json', 'Output as JSON (machine-readable)')
|
|
82
31
|
.action(() => {
|
|
83
|
-
|
|
32
|
+
console.log(getLocalHelpText(pkg.version));
|
|
84
33
|
});
|
|
85
34
|
|
|
86
35
|
// Commands that don't need registry
|
|
87
|
-
const SKIP_REGISTRY = ['update', 'cache', 'build', 'feedback', 'help'];
|
|
36
|
+
const SKIP_REGISTRY = ['update', 'cache', 'build', 'feedback', 'annotate', 'help'];
|
|
88
37
|
|
|
89
38
|
program.hook('preAction', async (thisCommand) => {
|
|
39
|
+
const globalOpts = thisCommand.optsWithGlobals?.() || {};
|
|
40
|
+
showWelcomeIfNeeded(globalOpts);
|
|
41
|
+
|
|
90
42
|
const cmdName = thisCommand.args?.[0] || thisCommand.name();
|
|
91
|
-
// Track command usage (fire-and-forget, never blocks)
|
|
92
43
|
if (cmdName !== 'chub') {
|
|
93
|
-
|
|
44
|
+
// Only initialize identity and track if telemetry is enabled
|
|
45
|
+
// Respects CHUB_TELEMETRY=0 — no client_id file created, no events sent
|
|
46
|
+
try {
|
|
47
|
+
const { isTelemetryEnabled } = await import('./lib/telemetry.js');
|
|
48
|
+
if (isTelemetryEnabled()) {
|
|
49
|
+
const { getOrCreateClientId, isFirstRun } = await import('./lib/identity.js');
|
|
50
|
+
await getOrCreateClientId();
|
|
51
|
+
|
|
52
|
+
// Fire-and-forget — don't block command on PostHog network I/O
|
|
53
|
+
trackEvent('command_run', { command: cmdName }).catch(() => {});
|
|
54
|
+
if (isFirstRun()) {
|
|
55
|
+
trackEvent('first_run', { command: cmdName }).catch(() => {});
|
|
56
|
+
}
|
|
57
|
+
}
|
|
58
|
+
} catch {
|
|
59
|
+
// Identity/telemetry failure — silently skip, don't block the command
|
|
60
|
+
}
|
|
94
61
|
}
|
|
95
62
|
if (SKIP_REGISTRY.includes(cmdName)) return;
|
|
96
63
|
if (thisCommand.parent?.name() === 'cache') return;
|
|
@@ -99,9 +66,8 @@ program.hook('preAction', async (thisCommand) => {
|
|
|
99
66
|
try {
|
|
100
67
|
await ensureRegistry();
|
|
101
68
|
} catch (err) {
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
process.exit(1);
|
|
69
|
+
await trackEvent('command_error', { command: cmdName, error_type: 'registry_unavailable' });
|
|
70
|
+
error(`Registry not available: ${err.message}. Run \`chub update\` to refresh remote registries, or check that local source paths in ~/.chub/config.yaml are correct.`, globalOpts);
|
|
105
71
|
}
|
|
106
72
|
});
|
|
107
73
|
|
|
@@ -111,6 +77,8 @@ registerSearchCommand(program);
|
|
|
111
77
|
registerGetCommand(program);
|
|
112
78
|
registerBuildCommand(program);
|
|
113
79
|
registerFeedbackCommand(program);
|
|
80
|
+
registerAnnotateCommand(program);
|
|
81
|
+
registerHelpCommand(program, pkg.version);
|
|
114
82
|
|
|
115
83
|
program.parse();
|
|
116
84
|
|
package/src/lib/analytics.js
CHANGED
|
@@ -4,13 +4,14 @@
|
|
|
4
4
|
* Tracks: command usage, search patterns, doc/skill popularity, errors.
|
|
5
5
|
* Does NOT track feedback ratings (those go to the custom API via telemetry.js).
|
|
6
6
|
*
|
|
7
|
-
* Respects
|
|
7
|
+
* Respects telemetry opt-out: `telemetry: false` in config or CHUB_TELEMETRY=0.
|
|
8
|
+
* Feedback has a separate opt-out: `feedback: false` in config or CHUB_FEEDBACK=0.
|
|
8
9
|
*/
|
|
9
10
|
|
|
10
11
|
import { isTelemetryEnabled } from './telemetry.js';
|
|
11
12
|
|
|
12
13
|
// PostHog project API key (public — standard for client-side analytics)
|
|
13
|
-
const POSTHOG_KEY = '
|
|
14
|
+
const POSTHOG_KEY = 'phc_tO9mXIgcCuBccfN2Ut0quf6UFsd06u3Y6g1kqMaYdQX';
|
|
14
15
|
const POSTHOG_HOST = 'https://us.i.posthog.com';
|
|
15
16
|
|
|
16
17
|
let _posthog = null;
|
|
@@ -65,6 +66,7 @@ export async function trackEvent(event, properties = {}) {
|
|
|
65
66
|
...properties,
|
|
66
67
|
platform: process.platform,
|
|
67
68
|
node_version: process.version,
|
|
69
|
+
cli_version: _cliVersion || undefined,
|
|
68
70
|
},
|
|
69
71
|
});
|
|
70
72
|
|
|
@@ -75,6 +77,15 @@ export async function trackEvent(event, properties = {}) {
|
|
|
75
77
|
}
|
|
76
78
|
}
|
|
77
79
|
|
|
80
|
+
// CLI version reported with each event; stays undefined until setCliVersion() runs.
let _cliVersion;

/**
 * Remember the CLI version so it can be attached to tracked events.
 * index.js calls this once at startup with the version from package.json.
 *
 * @param {string} version - Version string to store.
 */
export function setCliVersion(version) {
  _cliVersion = version;
}
|
|
88
|
+
|
|
78
89
|
/**
|
|
79
90
|
* Shut down the PostHog client gracefully.
|
|
80
91
|
* Call this before process exit if possible.
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
import { readFileSync, writeFileSync, mkdirSync, unlinkSync, readdirSync } from 'node:fs';
|
|
2
|
+
import { join } from 'node:path';
|
|
3
|
+
import { getChubDir } from './config.js';
|
|
4
|
+
|
|
5
|
+
/**
 * Resolve the directory that holds annotation files: the `annotations`
 * folder inside the directory returned by getChubDir().
 *
 * @returns {string} Path of the annotations directory.
 */
function getAnnotationsDir() {
  const chubDir = getChubDir();
  return join(chubDir, 'annotations');
}
|
|
8
|
+
|
|
9
|
+
/**
 * Map an entry id to the JSON file that stores its annotation.
 *
 * Every path separator in the id is flattened to `--` so the result is always
 * a single file name directly inside the annotations directory. Backslashes
 * are flattened as well as forward slashes: path.join treats `\` as a
 * separator on Windows, so leaving it in place would let an id such as
 * `..\..\x` resolve outside the annotations directory.
 *
 * @param {string} entryId - Entry id, e.g. `stripe/api`.
 * @returns {string} Path of the annotation file for that id.
 */
function annotationPath(entryId) {
  const safe = entryId.replace(/[\\/]/g, '--');
  return join(getAnnotationsDir(), `${safe}.json`);
}
|
|
13
|
+
|
|
14
|
+
/**
 * Load the stored annotation for an entry.
 *
 * @param {string} entryId - Entry id whose annotation should be read.
 * @returns {*} The parsed JSON content of the annotation file, or null when
 *   the file cannot be read or parsed.
 */
export function readAnnotation(entryId) {
  let annotation;
  try {
    const raw = readFileSync(annotationPath(entryId), 'utf8');
    annotation = JSON.parse(raw);
  } catch {
    // Any read or parse failure is reported as "no annotation".
    annotation = null;
  }
  return annotation;
}
|
|
21
|
+
|
|
22
|
+
/**
 * Persist a note for an entry, replacing any existing annotation file.
 * The annotations directory is created first if it does not exist.
 *
 * @param {string} entryId - Entry id the note belongs to.
 * @param {*} note - Note content to store.
 * @returns {{id: string, note: *, updatedAt: string}} The record that was
 *   written; `updatedAt` is the current time as an ISO-8601 string.
 */
export function writeAnnotation(entryId, note) {
  mkdirSync(getAnnotationsDir(), { recursive: true });

  const record = { id: entryId, note, updatedAt: new Date().toISOString() };
  const target = annotationPath(entryId);
  writeFileSync(target, JSON.stringify(record, null, 2));

  return record;
}
|
|
33
|
+
|
|
34
|
+
/**
 * Delete the annotation file for an entry.
 *
 * @param {string} entryId - Entry id whose annotation should be removed.
 * @returns {boolean} true when the file was deleted, false when deletion
 *   failed for any reason (including the file not existing).
 */
export function clearAnnotation(entryId) {
  let removed = true;
  try {
    unlinkSync(annotationPath(entryId));
  } catch {
    removed = false;
  }
  return removed;
}
|
|
42
|
+
|
|
43
|
+
/**
 * Read every annotation stored in the annotations directory.
 *
 * Only `*.json` files are considered. Files that cannot be read or parsed,
 * and files whose parsed content is falsy, are left out of the result.
 *
 * @returns {Array} Parsed annotation records; an empty array when the
 *   directory cannot be listed.
 */
export function listAnnotations() {
  const dir = getAnnotationsDir();

  let fileNames;
  try {
    fileNames = readdirSync(dir);
  } catch {
    return [];
  }

  const annotations = [];
  for (const fileName of fileNames) {
    if (!fileName.endsWith('.json')) continue;
    try {
      const parsed = JSON.parse(readFileSync(join(dir, fileName), 'utf8'));
      if (parsed) annotations.push(parsed);
    } catch {
      // Unreadable or malformed file: leave it out of the listing.
    }
  }
  return annotations;
}
|
package/src/lib/bm25.js
ADDED
|
@@ -0,0 +1,303 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* BM25 search implementation for Context Hub.
|
|
3
|
+
* Index is built at `chub build` time, scoring happens at search time.
|
|
4
|
+
* Tokenizer is shared between build and search to ensure consistency.
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
// Words dropped by the tokenizer; each row is split on single spaces.
const STOP_WORDS = new Set(
  [
    'a an and are as at be by for from',
    'has have in is it its of on or that',
    'the to was were will with this but not',
    'you your can do does how if may no',
    'so than too very just about into over',
    'such then them these those through under',
    'use using used',
  ].flatMap((row) => row.split(' ')),
);
|
|
16
|
+
|
|
17
|
+
// Default BM25 tuning parameters.
const DEFAULT_K1 = 1.5;
const DEFAULT_B = 0.75;

// Multiplier applied to each field's score when the per-field scores are summed.
const FIELD_WEIGHTS = {
  id: 4.0,
  name: 3.0,
  tags: 2.0,
  description: 1.0,
};

/**
 * Create a fresh parameter object holding the default k1 and b values.
 *
 * @returns {{k1: number, b: number}} A new object on every call.
 */
function getDefaultParams() {
  const defaults = {};
  defaults.k1 = DEFAULT_K1;
  defaults.b = DEFAULT_B;
  return defaults;
}
|
|
32
|
+
|
|
33
|
+
/**
 * Decide whether a token is kept by the tokenizer.
 * A token is kept when it is not a stop word and is either longer than one
 * character or made up entirely of digits.
 *
 * @param {string} token - Candidate token.
 * @returns {boolean} true when the token should be kept.
 */
function isSearchableToken(token) {
  if (STOP_WORDS.has(token)) return false;
  if (token.length > 1) return true;
  return /^\d+$/.test(token);
}
|
|
36
|
+
|
|
37
|
+
/**
 * Reduce a value to its lowercase ASCII letters and digits.
 * Falsy input yields an empty string.
 *
 * @param {*} text - Value to compact; coerced to a string.
 * @returns {string} The lowercased input with every character outside
 *   a-z and 0-9 removed.
 */
export function compactIdentifier(text) {
  const lowered = String(text || '').toLowerCase();
  return lowered.replace(/[^a-z0-9]/g, '');
}
|
|
42
|
+
|
|
43
|
+
/**
 * Insert a space at every boundary between a lowercase letter and a digit,
 * in either direction (e.g. "auth0" becomes "auth 0").
 *
 * @param {string} text - Text to split.
 * @returns {string} The text with spaces inserted at letter/digit boundaries.
 */
function splitAlphaNumeric(text) {
  const letterDigitBoundary = /(?<=[a-z])(?=\d)|(?<=\d)(?=[a-z])/g;
  return text.replace(letterDigitBoundary, ' ');
}
|
|
48
|
+
|
|
49
|
+
/**
 * Tokenize text into lowercase terms with stop word removal.
 * Must be used identically at build time and search time.
 *
 * The input is lowercased, every character other than a-z, 0-9, whitespace
 * and `-` is replaced by a space, the result is split on runs of whitespace
 * and hyphens, and tokens rejected by isSearchableToken are dropped.
 *
 * @param {*} text - Text to tokenize. Falsy input yields no tokens; any other
 *   non-string value (e.g. a numeric tag) is coerced to a string instead of
 *   throwing on `.toLowerCase()`.
 * @returns {string[]} Tokens in order of appearance (duplicates are kept).
 */
export function tokenize(text) {
  if (!text) return [];
  return String(text)
    .toLowerCase()
    .replace(/[^a-z0-9\s-]/g, ' ')
    .split(/[\s-]+/)
    .filter(isSearchableToken);
}
|
|
61
|
+
|
|
62
|
+
/**
 * Tokenize identifiers more aggressively than free text so package ids
 * still match joined/split variants like "nodefetch" and "auth 0".
 *
 * The result is the union of:
 *   - the plain tokenize() output for the identifier,
 *   - the fully compacted identifier, plus its letter/digit split,
 *   - each `/`-separated segment and each segment split on `/ _ . -` or
 *     whitespace, compacted, plus their letter/digit splits.
 *
 * @param {*} text - Identifier to tokenize. Falsy input yields no tokens; any
 *   other value is coerced to a string before processing.
 * @returns {string[]} Unique tokens in first-seen order.
 */
export function tokenizeIdentifier(text) {
  if (!text) return [];

  // Coerce before the first tokenize() call so a truthy non-string id is
  // handled the same way by every step below.
  const raw = String(text);
  const tokens = new Set(tokenize(raw));
  const compact = compactIdentifier(raw);
  const segments = new Set([
    ...raw.split('/').map((segment) => compactIdentifier(segment)),
    ...raw.split(/[\/_.\s-]+/).map((segment) => compactIdentifier(segment)),
  ]);

  // Whole identifier with separators removed, e.g. "node-fetch" -> "nodefetch".
  if (isSearchableToken(compact)) {
    tokens.add(compact);
  }

  // Letter/digit split of the compacted form, e.g. "auth0" -> "auth", "0".
  for (const token of tokenize(splitAlphaNumeric(compact))) {
    tokens.add(token);
  }

  for (const segment of segments) {
    if (!segment) continue;
    if (isSearchableToken(segment)) {
      tokens.add(segment);
    }
    for (const token of tokenize(splitAlphaNumeric(segment))) {
      tokens.add(token);
    }
  }

  return [...tokens];
}
|
|
97
|
+
|
|
98
|
+
/**
 * Build a term -> document-position lookup over all tokenized fields.
 *
 * @param {Array<{tokens: Object}>} documents - Documents whose `tokens`
 *   object may carry `id`, `name`, `description` and `tags` token lists.
 * @returns {Object} Prototype-less object mapping each term to the array of
 *   positions (in `documents`) of the documents containing it; each position
 *   appears at most once per term.
 */
function buildInvertedIndex(documents) {
  const postingsByTerm = Object.create(null);

  for (const [position, { tokens }] of documents.entries()) {
    // Collect the distinct terms of this document across every field.
    const distinctTerms = new Set();
    for (const fieldTokens of [tokens.id, tokens.name, tokens.description, tokens.tags]) {
      for (const term of fieldTokens || []) {
        distinctTerms.add(term);
      }
    }

    for (const term of distinctTerms) {
      const postings = postingsByTerm[term] || (postingsByTerm[term] = []);
      postings.push(position);
    }
  }

  return postingsByTerm;
}
|
|
117
|
+
|
|
118
|
+
/**
 * Assemble a BM25 index from already-tokenized documents.
 *
 * Computes, across all documents: the document frequency of every term
 * (counted once per document regardless of field), the resulting IDF value
 * per term, the average token count of each field, and the inverted index.
 *
 * @param {Array<{id: *, tokens: Object}>} documents - Tokenized documents.
 * @param {{k1: number, b: number}} [params] - BM25 parameters stored in the
 *   index; defaults to getDefaultParams().
 * @returns {Object} Index with `version`, `algorithm`, `params`, `totalDocs`,
 *   `avgFieldLengths`, `idf`, `documents` and `invertedIndex`.
 */
export function buildIndexFromDocuments(documents, params = getDefaultParams()) {
  const docFrequency = Object.create(null); // number of documents containing each term
  const lengthTotals = { id: 0, name: 0, description: 0, tags: 0 };
  let docsSeen = 0;

  for (const { tokens } of documents) {
    const id = tokens.id || [];
    const name = tokens.name || [];
    const description = tokens.description || [];
    const tags = tokens.tags || [];

    lengthTotals.id += id.length;
    lengthTotals.name += name.length;
    lengthTotals.description += description.length;
    lengthTotals.tags += tags.length;
    docsSeen += 1;

    for (const term of new Set([...id, ...name, ...description, ...tags])) {
      docFrequency[term] = (docFrequency[term] || 0) + 1;
    }
  }

  const totalDocs = documents.length;
  const idf = Object.create(null);
  for (const term of Object.keys(docFrequency)) {
    const df = docFrequency[term];
    idf[term] = Math.log((totalDocs - df + 0.5) / (df + 0.5) + 1);
  }

  // Mean field length; 0 when there are no documents.
  const mean = (total) => (docsSeen === 0 ? 0 : total / docsSeen);

  return {
    version: '1.0.0',
    algorithm: 'bm25',
    params,
    totalDocs,
    avgFieldLengths: {
      id: mean(lengthTotals.id),
      name: mean(lengthTotals.name),
      description: mean(lengthTotals.description),
      tags: mean(lengthTotals.tags),
    },
    idf,
    documents,
    invertedIndex: buildInvertedIndex(documents),
  };
}
|
|
162
|
+
|
|
163
|
+
/**
 * Build a BM25 search index from registry entries.
 * Called during `chub build`.
 *
 * Each entry's id is tokenized with tokenizeIdentifier; its name, description
 * and every tag are tokenized with tokenize.
 *
 * @param {Array} entries - Combined docs and skills from registry
 * @returns {Object} The search index
 */
export function buildIndex(entries) {
  const documents = [];

  for (const { id, name, description, tags } of entries) {
    const tokens = {
      id: tokenizeIdentifier(id),
      name: tokenize(name),
      description: tokenize(description || ''),
      tags: (tags || []).flatMap((tag) => tokenize(tag)),
    };
    documents.push({ id, tokens });
  }

  return buildIndexFromDocuments(documents);
}
|
|
191
|
+
|
|
192
|
+
/**
 * Compute the BM25 score of one field of one document.
 *
 * @param {string[]} queryTerms - Query tokens; a repeated term contributes
 *   once per occurrence.
 * @param {string[]} fieldTokens - Tokens of the field being scored.
 * @param {Object} idf - Term -> IDF lookup; missing terms count as 0.
 * @param {number} avgFieldLen - Average token count of this field; a falsy
 *   value is treated as 1.
 * @param {number} k1 - BM25 term-frequency saturation parameter.
 * @param {number} b - BM25 length-normalization parameter.
 * @returns {number} The field score; 0 for an empty field.
 */
function scoreField(queryTerms, fieldTokens, idf, avgFieldLen, k1, b) {
  const fieldLength = fieldTokens.length;
  if (fieldLength === 0) return 0;

  // Occurrences of each token within this field.
  const occurrences = Object.create(null);
  for (const token of fieldTokens) {
    occurrences[token] = (occurrences[token] || 0) + 1;
  }

  // These two factors do not depend on the query term, so compute them once.
  const saturation = k1 + 1;
  const lengthPenalty = k1 * (1 - b + b * (fieldLength / (avgFieldLen || 1)));

  let total = 0;
  for (const term of queryTerms) {
    const count = occurrences[term] || 0;
    if (count !== 0) {
      const weight = idf[term] || 0;
      total += weight * ((count * saturation) / (count + lengthPenalty));
    }
  }
  return total;
}
|
|
219
|
+
|
|
220
|
+
/**
 * Select the documents worth scoring for a query.
 *
 * With an inverted index, the result is the union of the posting lists of the
 * distinct query terms. Without one, every document is a candidate.
 *
 * @param {string[]} queryTerms - Tokenized query.
 * @param {{documents: Array, invertedIndex?: Object}} index - Search index.
 * @returns {number[]} Positions into `index.documents`, each listed once, in
 *   first-seen order.
 */
function getCandidateDocIndexes(queryTerms, index) {
  const { invertedIndex } = index;
  if (!invertedIndex) {
    return index.documents.map((_, docIndex) => docIndex);
  }

  const candidateIndexes = new Set();
  for (const term of new Set(queryTerms)) {
    const postings = invertedIndex[term];
    // Accept real posting lists only. If invertedIndex is a plain object
    // rather than the prototype-less one buildInvertedIndex creates
    // (presumably the case after a JSON round-trip — confirm against the
    // loader), a term such as "constructor" resolves to an inherited
    // function, which is truthy but not iterable.
    if (!Array.isArray(postings)) continue;
    for (const docIndex of postings) {
      candidateIndexes.add(docIndex);
    }
  }

  return [...candidateIndexes];
}
|
|
236
|
+
|
|
237
|
+
/**
 * Run a BM25 query against an index and report both results and statistics.
 *
 * The query is tokenized with tokenize(). Each candidate document is scored
 * per field with scoreField(), the field scores are combined using
 * FIELD_WEIGHTS, and documents with a positive total are returned in
 * descending score order.
 *
 * @param {string} query - The search query.
 * @param {Object} index - The pre-built BM25 index.
 * @param {Object} [opts] - Options: `limit` caps the number of results when truthy.
 * @returns {{results: Array<{id: *, score: number}>, stats: Object}} Results
 *   plus counters: totalDocs, candidateDocCount, scoredDocCount,
 *   matchedDocCount (before the limit is applied) and usedInvertedIndex.
 */
function runSearch(query, index, opts = {}) {
  const queryTerms = tokenize(query);
  const totalDocs = index.documents.length;
  const usedInvertedIndex = !!index.invertedIndex;

  // Nothing searchable in the query: report an empty result.
  if (queryTerms.length === 0) {
    return {
      results: [],
      stats: {
        totalDocs,
        candidateDocCount: 0,
        scoredDocCount: 0,
        matchedDocCount: 0,
        usedInvertedIndex,
      },
    };
  }

  const { k1, b } = index.params;
  const candidates = getCandidateDocIndexes(queryTerms, index);
  const weightedFields = Object.entries(FIELD_WEIGHTS);
  const matches = [];

  for (const docIndex of candidates) {
    const doc = index.documents[docIndex];
    const totalScore = weightedFields.reduce((sum, [field, weight]) => {
      const fieldTokens = doc.tokens[field] || [];
      const avgLen = index.avgFieldLengths[field] || 1;
      return sum + scoreField(queryTerms, fieldTokens, index.idf, avgLen, k1, b) * weight;
    }, 0);

    if (totalScore > 0) {
      matches.push({ id: doc.id, score: totalScore });
    }
  }

  matches.sort((left, right) => right.score - left.score);
  const results = opts.limit ? matches.slice(0, opts.limit) : matches;

  return {
    results,
    stats: {
      totalDocs,
      candidateDocCount: candidates.length,
      scoredDocCount: candidates.length,
      matchedDocCount: matches.length,
      usedInvertedIndex,
    },
  };
}
|
|
288
|
+
|
|
289
|
+
/**
 * Search the BM25 index with a query string.
 * Delegates to runSearch() and keeps only its result list.
 *
 * @param {string} query - The search query
 * @param {Object} index - The pre-built BM25 index
 * @param {Object} opts - Options: { limit }
 * @returns {Array} Sorted results: [{ id, score }]
 */
export function search(query, index, opts = {}) {
  const { results } = runSearch(query, index, opts);
  return results;
}
|
|
300
|
+
|
|
301
|
+
/**
 * Search the BM25 index and return the full runSearch() outcome.
 *
 * @param {string} query - The search query
 * @param {Object} index - The pre-built BM25 index
 * @param {Object} opts - Options: { limit }
 * @returns {{results: Array, stats: Object}} Sorted results together with
 *   the search statistics.
 */
export function searchWithStats(query, index, opts = {}) {
  const outcome = runSearch(query, index, opts);
  return outcome;
}
|