chub-dev 0.2.0-beta.4 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +76 -0
- package/bin/chub-mcp +2 -0
- package/package.json +10 -5
- package/skills/get-api-docs/SKILL.md +81 -0
- package/src/commands/annotate.js +1 -1
- package/src/commands/build.js +12 -4
- package/src/commands/feedback.js +12 -9
- package/src/commands/get.js +32 -11
- package/src/commands/help.js +34 -0
- package/src/commands/search.js +17 -8
- package/src/index.js +31 -65
- package/src/lib/analytics.js +13 -2
- package/src/lib/bm25.js +185 -52
- package/src/lib/cache.js +94 -17
- package/src/lib/config.js +14 -1
- package/src/lib/help.js +158 -0
- package/src/lib/identity.js +12 -1
- package/src/lib/registry.js +236 -63
- package/src/lib/telemetry.js +7 -1
- package/src/lib/welcome.js +42 -0
- package/src/mcp/server.js +184 -0
- package/src/mcp/stdio-lifecycle.js +54 -0
- package/src/mcp/tools.js +286 -0
package/src/index.js
CHANGED
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
import chalk from 'chalk';
|
|
2
1
|
import { Command } from 'commander';
|
|
3
2
|
import { readFileSync } from 'node:fs';
|
|
4
3
|
import { fileURLToPath } from 'node:url';
|
|
@@ -11,67 +10,15 @@ import { registerGetCommand } from './commands/get.js';
|
|
|
11
10
|
import { registerBuildCommand } from './commands/build.js';
|
|
12
11
|
import { registerFeedbackCommand } from './commands/feedback.js';
|
|
13
12
|
import { registerAnnotateCommand } from './commands/annotate.js';
|
|
14
|
-
import {
|
|
13
|
+
import { registerHelpCommand } from './commands/help.js';
|
|
14
|
+
import { trackEvent, shutdownAnalytics, setCliVersion } from './lib/analytics.js';
|
|
15
|
+
import { error } from './lib/output.js';
|
|
16
|
+
import { showWelcomeIfNeeded } from './lib/welcome.js';
|
|
17
|
+
import { getLocalHelpText } from './lib/help.js';
|
|
15
18
|
|
|
16
19
|
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
17
20
|
const pkg = JSON.parse(readFileSync(join(__dirname, '..', 'package.json'), 'utf8'));
|
|
18
|
-
|
|
19
|
-
function printUsage() {
|
|
20
|
-
console.log(`
|
|
21
|
-
${chalk.bold('chub')} — Context Hub CLI v${pkg.version}
|
|
22
|
-
Search and retrieve LLM-optimized docs and skills.
|
|
23
|
-
|
|
24
|
-
${chalk.bold.underline('Getting Started')}
|
|
25
|
-
|
|
26
|
-
${chalk.dim('$')} chub update ${chalk.dim('# download the registry')}
|
|
27
|
-
${chalk.dim('$')} chub search ${chalk.dim('# list everything available')}
|
|
28
|
-
${chalk.dim('$')} chub search "stripe" ${chalk.dim('# fuzzy search')}
|
|
29
|
-
${chalk.dim('$')} chub search stripe/payments ${chalk.dim('# exact id → full detail')}
|
|
30
|
-
${chalk.dim('$')} chub get stripe/api ${chalk.dim('# print doc to terminal')}
|
|
31
|
-
${chalk.dim('$')} chub get stripe/api -o doc.md ${chalk.dim('# save to file')}
|
|
32
|
-
${chalk.dim('$')} chub get openai/chat --lang py ${chalk.dim('# specific language')}
|
|
33
|
-
${chalk.dim('$')} chub get pw-community/login-flows ${chalk.dim('# fetch a skill')}
|
|
34
|
-
${chalk.dim('$')} chub get openai/chat stripe/api ${chalk.dim('# fetch multiple')}
|
|
35
|
-
|
|
36
|
-
${chalk.bold.underline('Commands')}
|
|
37
|
-
|
|
38
|
-
${chalk.bold('search')} [query] Search docs and skills (no query = list all)
|
|
39
|
-
${chalk.bold('get')} <ids...> Fetch docs or skills by ID
|
|
40
|
-
${chalk.bold('update')} Refresh the cached registry
|
|
41
|
-
${chalk.bold('cache')} status|clear Manage the local cache
|
|
42
|
-
${chalk.bold('build')} <content-dir> Build registry from content directory
|
|
43
|
-
|
|
44
|
-
${chalk.bold.underline('Flags')}
|
|
45
|
-
|
|
46
|
-
--json Structured JSON output (for agents and piping)
|
|
47
|
-
--tags <csv> Filter by tags (e.g. docs, skill, openai, browser)
|
|
48
|
-
--lang <language> Language variant (js, py, ts)
|
|
49
|
-
--full Fetch all files, not just the entry point
|
|
50
|
-
-o, --output <path> Write content to file or directory
|
|
51
|
-
|
|
52
|
-
${chalk.bold.underline('Agent Piping Patterns')}
|
|
53
|
-
|
|
54
|
-
${chalk.dim('# Get the top result id')}
|
|
55
|
-
${chalk.dim('$')} chub search "stripe" --json | jq -r '.results[0].id'
|
|
56
|
-
|
|
57
|
-
${chalk.dim('# Search → pick → fetch → save')}
|
|
58
|
-
${chalk.dim('$')} ID=$(chub search "stripe" --json | jq -r '.results[0].id')
|
|
59
|
-
${chalk.dim('$')} chub get "$ID" --lang js -o .context/stripe.md
|
|
60
|
-
|
|
61
|
-
${chalk.dim('# Fetch multiple at once')}
|
|
62
|
-
${chalk.dim('$')} chub get openai/chat stripe/api -o .context/
|
|
63
|
-
|
|
64
|
-
${chalk.bold.underline('Multi-Source Config')} ${chalk.dim('(~/.chub/config.yaml)')}
|
|
65
|
-
|
|
66
|
-
${chalk.dim('sources:')}
|
|
67
|
-
${chalk.dim(' - name: community')}
|
|
68
|
-
${chalk.dim(' url: https://cdn.aichub.org/v1')}
|
|
69
|
-
${chalk.dim(' - name: internal')}
|
|
70
|
-
${chalk.dim(' path: /path/to/local/docs')}
|
|
71
|
-
|
|
72
|
-
${chalk.dim('# On id collision, use source: prefix: chub get internal:openai/chat')}
|
|
73
|
-
`);
|
|
74
|
-
}
|
|
21
|
+
setCliVersion(pkg.version);
|
|
75
22
|
|
|
76
23
|
const program = new Command();
|
|
77
24
|
|
|
@@ -79,19 +26,38 @@ program
|
|
|
79
26
|
.name('chub')
|
|
80
27
|
.description('Context Hub - search and retrieve LLM-optimized docs and skills')
|
|
81
28
|
.version(pkg.version, '-V, --cli-version')
|
|
29
|
+
.addHelpCommand(false)
|
|
82
30
|
.option('--json', 'Output as JSON (machine-readable)')
|
|
83
31
|
.action(() => {
|
|
84
|
-
|
|
32
|
+
console.log(getLocalHelpText(pkg.version));
|
|
85
33
|
});
|
|
86
34
|
|
|
87
35
|
// Commands that don't need registry
|
|
88
36
|
const SKIP_REGISTRY = ['update', 'cache', 'build', 'feedback', 'annotate', 'help'];
|
|
89
37
|
|
|
90
38
|
program.hook('preAction', async (thisCommand) => {
|
|
39
|
+
const globalOpts = thisCommand.optsWithGlobals?.() || {};
|
|
40
|
+
showWelcomeIfNeeded(globalOpts);
|
|
41
|
+
|
|
91
42
|
const cmdName = thisCommand.args?.[0] || thisCommand.name();
|
|
92
|
-
// Track command usage (fire-and-forget, never blocks)
|
|
93
43
|
if (cmdName !== 'chub') {
|
|
94
|
-
|
|
44
|
+
// Only initialize identity and track if telemetry is enabled
|
|
45
|
+
// Respects CHUB_TELEMETRY=0 — no client_id file created, no events sent
|
|
46
|
+
try {
|
|
47
|
+
const { isTelemetryEnabled } = await import('./lib/telemetry.js');
|
|
48
|
+
if (isTelemetryEnabled()) {
|
|
49
|
+
const { getOrCreateClientId, isFirstRun } = await import('./lib/identity.js');
|
|
50
|
+
await getOrCreateClientId();
|
|
51
|
+
|
|
52
|
+
// Fire-and-forget — don't block command on PostHog network I/O
|
|
53
|
+
trackEvent('command_run', { command: cmdName }).catch(() => {});
|
|
54
|
+
if (isFirstRun()) {
|
|
55
|
+
trackEvent('first_run', { command: cmdName }).catch(() => {});
|
|
56
|
+
}
|
|
57
|
+
}
|
|
58
|
+
} catch {
|
|
59
|
+
// Identity/telemetry failure — silently skip, don't block the command
|
|
60
|
+
}
|
|
95
61
|
}
|
|
96
62
|
if (SKIP_REGISTRY.includes(cmdName)) return;
|
|
97
63
|
if (thisCommand.parent?.name() === 'cache') return;
|
|
@@ -100,9 +66,8 @@ program.hook('preAction', async (thisCommand) => {
|
|
|
100
66
|
try {
|
|
101
67
|
await ensureRegistry();
|
|
102
68
|
} catch (err) {
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
process.exit(1);
|
|
69
|
+
await trackEvent('command_error', { command: cmdName, error_type: 'registry_unavailable' });
|
|
70
|
+
error(`Registry not available: ${err.message}. Run \`chub update\` to refresh remote registries, or check that local source paths in ~/.chub/config.yaml are correct.`, globalOpts);
|
|
106
71
|
}
|
|
107
72
|
});
|
|
108
73
|
|
|
@@ -113,6 +78,7 @@ registerGetCommand(program);
|
|
|
113
78
|
registerBuildCommand(program);
|
|
114
79
|
registerFeedbackCommand(program);
|
|
115
80
|
registerAnnotateCommand(program);
|
|
81
|
+
registerHelpCommand(program, pkg.version);
|
|
116
82
|
|
|
117
83
|
program.parse();
|
|
118
84
|
|
package/src/lib/analytics.js
CHANGED
|
@@ -4,13 +4,14 @@
|
|
|
4
4
|
* Tracks: command usage, search patterns, doc/skill popularity, errors.
|
|
5
5
|
* Does NOT track feedback ratings (those go to the custom API via telemetry.js).
|
|
6
6
|
*
|
|
7
|
-
* Respects
|
|
7
|
+
* Respects telemetry opt-out: `telemetry: false` in config or CHUB_TELEMETRY=0.
|
|
8
|
+
* Feedback has a separate opt-out: `feedback: false` in config or CHUB_FEEDBACK=0.
|
|
8
9
|
*/
|
|
9
10
|
|
|
10
11
|
import { isTelemetryEnabled } from './telemetry.js';
|
|
11
12
|
|
|
12
13
|
// PostHog project API key (public — standard for client-side analytics)
|
|
13
|
-
const POSTHOG_KEY = '
|
|
14
|
+
const POSTHOG_KEY = 'phc_tO9mXIgcCuBccfN2Ut0quf6UFsd06u3Y6g1kqMaYdQX';
|
|
14
15
|
const POSTHOG_HOST = 'https://us.i.posthog.com';
|
|
15
16
|
|
|
16
17
|
let _posthog = null;
|
|
@@ -65,6 +66,7 @@ export async function trackEvent(event, properties = {}) {
|
|
|
65
66
|
...properties,
|
|
66
67
|
platform: process.platform,
|
|
67
68
|
node_version: process.version,
|
|
69
|
+
cli_version: _cliVersion || undefined,
|
|
68
70
|
},
|
|
69
71
|
});
|
|
70
72
|
|
|
@@ -75,6 +77,15 @@ export async function trackEvent(event, properties = {}) {
|
|
|
75
77
|
}
|
|
76
78
|
}
|
|
77
79
|
|
|
80
|
+
let _cliVersion;
|
|
81
|
+
/**
|
|
82
|
+
* Set the CLI version for inclusion in all events.
|
|
83
|
+
* Called once from index.js at startup.
|
|
84
|
+
*/
|
|
85
|
+
export function setCliVersion(version) {
|
|
86
|
+
_cliVersion = version;
|
|
87
|
+
}
|
|
88
|
+
|
|
78
89
|
/**
|
|
79
90
|
* Shut down the PostHog client gracefully.
|
|
80
91
|
* Call this before process exit if possible.
|
package/src/lib/bm25.js
CHANGED
|
@@ -20,11 +20,32 @@ const DEFAULT_B = 0.75;
|
|
|
20
20
|
|
|
21
21
|
// Field weights for multi-field scoring
|
|
22
22
|
const FIELD_WEIGHTS = {
|
|
23
|
+
id: 4.0,
|
|
23
24
|
name: 3.0,
|
|
24
25
|
tags: 2.0,
|
|
25
26
|
description: 1.0,
|
|
26
27
|
};
|
|
27
28
|
|
|
29
|
+
function getDefaultParams() {
|
|
30
|
+
return { k1: DEFAULT_K1, b: DEFAULT_B };
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
function isSearchableToken(token) {
|
|
34
|
+
return (token.length > 1 || /^\d+$/.test(token)) && !STOP_WORDS.has(token);
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
export function compactIdentifier(text) {
|
|
38
|
+
return String(text || '')
|
|
39
|
+
.toLowerCase()
|
|
40
|
+
.replace(/[^a-z0-9]/g, '');
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
function splitAlphaNumeric(text) {
|
|
44
|
+
return text
|
|
45
|
+
.replace(/([a-z])(\d)/g, '$1 $2')
|
|
46
|
+
.replace(/(\d)([a-z])/g, '$1 $2');
|
|
47
|
+
}
|
|
48
|
+
|
|
28
49
|
/**
|
|
29
50
|
* Tokenize text into lowercase terms with stop word removal.
|
|
30
51
|
* Must be used identically at build time and search time.
|
|
@@ -35,73 +56,139 @@ export function tokenize(text) {
|
|
|
35
56
|
.toLowerCase()
|
|
36
57
|
.replace(/[^a-z0-9\s-]/g, ' ')
|
|
37
58
|
.split(/[\s-]+/)
|
|
38
|
-
.filter(
|
|
59
|
+
.filter(isSearchableToken);
|
|
39
60
|
}
|
|
40
61
|
|
|
41
62
|
/**
|
|
42
|
-
*
|
|
43
|
-
*
|
|
44
|
-
*
|
|
45
|
-
* @param {Array} entries - Combined docs and skills from registry
|
|
46
|
-
* @returns {Object} The search index
|
|
63
|
+
* Tokenize identifiers more aggressively than free text so package ids
|
|
64
|
+
* still match joined/split variants like "nodefetch" and "auth 0".
|
|
47
65
|
*/
|
|
48
|
-
export function
|
|
49
|
-
|
|
50
|
-
const dfMap = {}; // document frequency per term (across all fields)
|
|
51
|
-
const fieldLengths = { name: [], description: [], tags: [] };
|
|
66
|
+
export function tokenizeIdentifier(text) {
|
|
67
|
+
if (!text) return [];
|
|
52
68
|
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
69
|
+
const tokens = new Set(tokenize(text));
|
|
70
|
+
const raw = String(text);
|
|
71
|
+
const compact = compactIdentifier(raw);
|
|
72
|
+
const segments = new Set([
|
|
73
|
+
...raw.split('/').map((segment) => compactIdentifier(segment)),
|
|
74
|
+
...raw.split(/[\/_.\s-]+/).map((segment) => compactIdentifier(segment)),
|
|
75
|
+
]);
|
|
57
76
|
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
77
|
+
if (isSearchableToken(compact)) {
|
|
78
|
+
tokens.add(compact);
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
for (const token of tokenize(splitAlphaNumeric(compact))) {
|
|
82
|
+
tokens.add(token);
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
for (const segment of segments) {
|
|
86
|
+
if (!segment) continue;
|
|
87
|
+
if (isSearchableToken(segment)) {
|
|
88
|
+
tokens.add(segment);
|
|
89
|
+
}
|
|
90
|
+
for (const token of tokenize(splitAlphaNumeric(segment))) {
|
|
91
|
+
tokens.add(token);
|
|
92
|
+
}
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
return [...tokens];
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
function buildInvertedIndex(documents) {
|
|
99
|
+
const invertedIndex = Object.create(null);
|
|
100
|
+
|
|
101
|
+
for (const [docIndex, doc] of documents.entries()) {
|
|
102
|
+
const allTerms = new Set([
|
|
103
|
+
...(doc.tokens.id || []),
|
|
104
|
+
...(doc.tokens.name || []),
|
|
105
|
+
...(doc.tokens.description || []),
|
|
106
|
+
...(doc.tokens.tags || []),
|
|
107
|
+
]);
|
|
108
|
+
|
|
109
|
+
for (const term of allTerms) {
|
|
110
|
+
if (!invertedIndex[term]) invertedIndex[term] = [];
|
|
111
|
+
invertedIndex[term].push(docIndex);
|
|
112
|
+
}
|
|
113
|
+
}
|
|
114
|
+
|
|
115
|
+
return invertedIndex;
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
export function buildIndexFromDocuments(documents, params = getDefaultParams()) {
|
|
119
|
+
const dfMap = Object.create(null); // document frequency per term (across all fields)
|
|
120
|
+
const fieldLengths = { id: [], name: [], description: [], tags: [] };
|
|
121
|
+
|
|
122
|
+
for (const doc of documents) {
|
|
123
|
+
const idTokens = doc.tokens.id || [];
|
|
124
|
+
const nameTokens = doc.tokens.name || [];
|
|
125
|
+
const descTokens = doc.tokens.description || [];
|
|
126
|
+
const tagTokens = doc.tokens.tags || [];
|
|
66
127
|
|
|
128
|
+
fieldLengths.id.push(idTokens.length);
|
|
67
129
|
fieldLengths.name.push(nameTokens.length);
|
|
68
130
|
fieldLengths.description.push(descTokens.length);
|
|
69
131
|
fieldLengths.tags.push(tagTokens.length);
|
|
70
132
|
|
|
71
|
-
|
|
72
|
-
const allTerms = new Set([...nameTokens, ...descTokens, ...tagTokens]);
|
|
133
|
+
const allTerms = new Set([...idTokens, ...nameTokens, ...descTokens, ...tagTokens]);
|
|
73
134
|
for (const term of allTerms) {
|
|
74
135
|
dfMap[term] = (dfMap[term] || 0) + 1;
|
|
75
136
|
}
|
|
76
137
|
}
|
|
77
138
|
|
|
78
139
|
const N = documents.length;
|
|
79
|
-
|
|
80
|
-
// Compute IDF for each term
|
|
81
|
-
const idf = {};
|
|
140
|
+
const idf = Object.create(null);
|
|
82
141
|
for (const [term, df] of Object.entries(dfMap)) {
|
|
83
142
|
idf[term] = Math.log((N - df + 0.5) / (df + 0.5) + 1);
|
|
84
143
|
}
|
|
85
144
|
|
|
86
|
-
// Compute average field lengths
|
|
87
145
|
const avg = (arr) => arr.length === 0 ? 0 : arr.reduce((a, b) => a + b, 0) / arr.length;
|
|
88
|
-
const avgFieldLengths = {
|
|
89
|
-
name: avg(fieldLengths.name),
|
|
90
|
-
description: avg(fieldLengths.description),
|
|
91
|
-
tags: avg(fieldLengths.tags),
|
|
92
|
-
};
|
|
93
|
-
|
|
94
146
|
return {
|
|
95
147
|
version: '1.0.0',
|
|
96
148
|
algorithm: 'bm25',
|
|
97
|
-
params
|
|
149
|
+
params,
|
|
98
150
|
totalDocs: N,
|
|
99
|
-
avgFieldLengths,
|
|
151
|
+
avgFieldLengths: {
|
|
152
|
+
id: avg(fieldLengths.id),
|
|
153
|
+
name: avg(fieldLengths.name),
|
|
154
|
+
description: avg(fieldLengths.description),
|
|
155
|
+
tags: avg(fieldLengths.tags),
|
|
156
|
+
},
|
|
100
157
|
idf,
|
|
101
158
|
documents,
|
|
159
|
+
invertedIndex: buildInvertedIndex(documents),
|
|
102
160
|
};
|
|
103
161
|
}
|
|
104
162
|
|
|
163
|
+
/**
|
|
164
|
+
* Build a BM25 search index from registry entries.
|
|
165
|
+
* Called during `chub build`.
|
|
166
|
+
*
|
|
167
|
+
* @param {Array} entries - Combined docs and skills from registry
|
|
168
|
+
* @returns {Object} The search index
|
|
169
|
+
*/
|
|
170
|
+
export function buildIndex(entries) {
|
|
171
|
+
const documents = [];
|
|
172
|
+
|
|
173
|
+
for (const entry of entries) {
|
|
174
|
+
const idTokens = tokenizeIdentifier(entry.id);
|
|
175
|
+
const nameTokens = tokenize(entry.name);
|
|
176
|
+
const descTokens = tokenize(entry.description || '');
|
|
177
|
+
const tagTokens = (entry.tags || []).flatMap((t) => tokenize(t));
|
|
178
|
+
|
|
179
|
+
documents.push({
|
|
180
|
+
id: entry.id,
|
|
181
|
+
tokens: {
|
|
182
|
+
id: idTokens,
|
|
183
|
+
name: nameTokens,
|
|
184
|
+
description: descTokens,
|
|
185
|
+
tags: tagTokens,
|
|
186
|
+
},
|
|
187
|
+
});
|
|
188
|
+
}
|
|
189
|
+
return buildIndexFromDocuments(documents);
|
|
190
|
+
}
|
|
191
|
+
|
|
105
192
|
/**
|
|
106
193
|
* Compute BM25 score for a single field.
|
|
107
194
|
*/
|
|
@@ -109,7 +196,7 @@ function scoreField(queryTerms, fieldTokens, idf, avgFieldLen, k1, b) {
|
|
|
109
196
|
if (fieldTokens.length === 0) return 0;
|
|
110
197
|
|
|
111
198
|
// Build term frequency map for this field
|
|
112
|
-
const tf = {};
|
|
199
|
+
const tf = Object.create(null);
|
|
113
200
|
for (const t of fieldTokens) {
|
|
114
201
|
tf[t] = (tf[t] || 0) + 1;
|
|
115
202
|
}
|
|
@@ -130,22 +217,46 @@ function scoreField(queryTerms, fieldTokens, idf, avgFieldLen, k1, b) {
|
|
|
130
217
|
return score;
|
|
131
218
|
}
|
|
132
219
|
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
220
|
+
function getCandidateDocIndexes(queryTerms, index) {
|
|
221
|
+
if (!index.invertedIndex) {
|
|
222
|
+
return index.documents.map((_, docIndex) => docIndex);
|
|
223
|
+
}
|
|
224
|
+
|
|
225
|
+
const candidateIndexes = new Set();
|
|
226
|
+
for (const term of new Set(queryTerms)) {
|
|
227
|
+
const postings = index.invertedIndex[term];
|
|
228
|
+
if (!postings) continue;
|
|
229
|
+
for (const docIndex of postings) {
|
|
230
|
+
candidateIndexes.add(docIndex);
|
|
231
|
+
}
|
|
232
|
+
}
|
|
233
|
+
|
|
234
|
+
return [...candidateIndexes];
|
|
235
|
+
}
|
|
236
|
+
|
|
237
|
+
function runSearch(query, index, opts = {}) {
|
|
142
238
|
const queryTerms = tokenize(query);
|
|
143
|
-
|
|
239
|
+
const totalDocs = index.documents.length;
|
|
240
|
+
|
|
241
|
+
if (queryTerms.length === 0) {
|
|
242
|
+
return {
|
|
243
|
+
results: [],
|
|
244
|
+
stats: {
|
|
245
|
+
totalDocs,
|
|
246
|
+
candidateDocCount: 0,
|
|
247
|
+
scoredDocCount: 0,
|
|
248
|
+
matchedDocCount: 0,
|
|
249
|
+
usedInvertedIndex: !!index.invertedIndex,
|
|
250
|
+
},
|
|
251
|
+
};
|
|
252
|
+
}
|
|
144
253
|
|
|
145
254
|
const { k1, b } = index.params;
|
|
146
255
|
const results = [];
|
|
256
|
+
const candidateDocIndexes = getCandidateDocIndexes(queryTerms, index);
|
|
147
257
|
|
|
148
|
-
for (const doc of index.documents) {
|
|
258
|
+
for (const docIndex of candidateDocIndexes) {
|
|
259
|
+
const doc = index.documents[docIndex];
|
|
149
260
|
let totalScore = 0;
|
|
150
261
|
|
|
151
262
|
for (const [field, weight] of Object.entries(FIELD_WEIGHTS)) {
|
|
@@ -161,10 +272,32 @@ export function search(query, index, opts = {}) {
|
|
|
161
272
|
}
|
|
162
273
|
|
|
163
274
|
results.sort((a, b) => b.score - a.score);
|
|
275
|
+
const limitedResults = opts.limit ? results.slice(0, opts.limit) : results;
|
|
164
276
|
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
277
|
+
return {
|
|
278
|
+
results: limitedResults,
|
|
279
|
+
stats: {
|
|
280
|
+
totalDocs,
|
|
281
|
+
candidateDocCount: candidateDocIndexes.length,
|
|
282
|
+
scoredDocCount: candidateDocIndexes.length,
|
|
283
|
+
matchedDocCount: results.length,
|
|
284
|
+
usedInvertedIndex: !!index.invertedIndex,
|
|
285
|
+
},
|
|
286
|
+
};
|
|
287
|
+
}
|
|
288
|
+
|
|
289
|
+
/**
|
|
290
|
+
* Search the BM25 index with a query string.
|
|
291
|
+
*
|
|
292
|
+
* @param {string} query - The search query
|
|
293
|
+
* @param {Object} index - The pre-built BM25 index
|
|
294
|
+
* @param {Object} opts - Options: { limit }
|
|
295
|
+
* @returns {Array} Sorted results: [{ id, score }]
|
|
296
|
+
*/
|
|
297
|
+
export function search(query, index, opts = {}) {
|
|
298
|
+
return runSearch(query, index, opts).results;
|
|
299
|
+
}
|
|
168
300
|
|
|
169
|
-
|
|
301
|
+
export function searchWithStats(query, index, opts = {}) {
|
|
302
|
+
return runSearch(query, index, opts);
|
|
170
303
|
}
|
package/src/lib/cache.js
CHANGED
|
@@ -31,6 +31,10 @@ function getSourceRegistryPath(sourceName) {
|
|
|
31
31
|
return join(getSourceDir(sourceName), 'registry.json');
|
|
32
32
|
}
|
|
33
33
|
|
|
34
|
+
function getSourceSearchIndexPath(sourceName) {
|
|
35
|
+
return join(getSourceDir(sourceName), 'search-index.json');
|
|
36
|
+
}
|
|
37
|
+
|
|
34
38
|
function readMeta(sourceName) {
|
|
35
39
|
try {
|
|
36
40
|
return JSON.parse(readFileSync(getSourceMetaPath(sourceName), 'utf8'));
|
|
@@ -47,38 +51,99 @@ function writeMeta(sourceName, meta) {
|
|
|
47
51
|
|
|
48
52
|
function isSourceCacheFresh(sourceName) {
|
|
49
53
|
const meta = readMeta(sourceName);
|
|
50
|
-
if (!meta.lastUpdated) return false;
|
|
54
|
+
if (!meta.lastUpdated && meta.lastUpdated !== 0) return false;
|
|
51
55
|
const config = loadConfig();
|
|
52
56
|
const age = (Date.now() - meta.lastUpdated) / 1000;
|
|
53
57
|
return age < config.refresh_interval;
|
|
54
58
|
}
|
|
55
59
|
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
60
|
+
function isTimestampFresh(timestamp) {
|
|
61
|
+
if (timestamp === undefined || timestamp === null) return false;
|
|
62
|
+
const config = loadConfig();
|
|
63
|
+
const age = (Date.now() - timestamp) / 1000;
|
|
64
|
+
return age < config.refresh_interval;
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
function hasFreshSearchIndexState(sourceName) {
|
|
68
|
+
if (existsSync(getSourceSearchIndexPath(sourceName))) {
|
|
69
|
+
return true;
|
|
62
70
|
}
|
|
63
71
|
|
|
64
|
-
const
|
|
72
|
+
const meta = readMeta(sourceName);
|
|
73
|
+
return meta.searchIndexAvailable === false && isTimestampFresh(meta.searchIndexCheckedAt);
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
function shouldFetchRemoteRegistry(sourceName, force = false) {
|
|
77
|
+
if (force) return true;
|
|
78
|
+
return !(
|
|
79
|
+
isSourceCacheFresh(sourceName)
|
|
80
|
+
&& existsSync(getSourceRegistryPath(sourceName))
|
|
81
|
+
&& hasFreshSearchIndexState(sourceName)
|
|
82
|
+
);
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
async function fetchRemoteText(url) {
|
|
65
86
|
const controller = new AbortController();
|
|
66
87
|
const timeout = setTimeout(() => controller.abort(), 30000);
|
|
67
|
-
let res;
|
|
68
88
|
try {
|
|
69
|
-
res = await fetch(url, { signal: controller.signal });
|
|
89
|
+
const res = await fetch(url, { signal: controller.signal });
|
|
90
|
+
if (!res.ok) {
|
|
91
|
+
throw new Error(`${res.status} ${res.statusText}`);
|
|
92
|
+
}
|
|
93
|
+
return await res.text();
|
|
70
94
|
} finally {
|
|
71
95
|
clearTimeout(timeout);
|
|
72
96
|
}
|
|
73
|
-
|
|
74
|
-
|
|
97
|
+
}
|
|
98
|
+
|
|
99
|
+
/**
|
|
100
|
+
* Fetch registry for a single remote source.
|
|
101
|
+
*/
|
|
102
|
+
async function fetchRemoteRegistry(source, force = false) {
|
|
103
|
+
if (!shouldFetchRemoteRegistry(source.name, force)) {
|
|
104
|
+
return;
|
|
105
|
+
}
|
|
106
|
+
|
|
107
|
+
const registryUrl = `${source.url}/registry.json`;
|
|
108
|
+
let registryText;
|
|
109
|
+
try {
|
|
110
|
+
registryText = await fetchRemoteText(registryUrl);
|
|
111
|
+
} catch (err) {
|
|
112
|
+
throw new Error(`Failed to fetch registry from ${source.name}: ${err.message}`);
|
|
75
113
|
}
|
|
76
114
|
|
|
77
|
-
const data = await res.text();
|
|
78
115
|
const dir = getSourceDir(source.name);
|
|
79
116
|
mkdirSync(dir, { recursive: true });
|
|
80
|
-
writeFileSync(getSourceRegistryPath(source.name), data);
|
|
81
|
-
|
|
117
|
+
writeFileSync(getSourceRegistryPath(source.name), registryText);
|
|
118
|
+
|
|
119
|
+
const searchIndexUrl = `${source.url}/search-index.json`;
|
|
120
|
+
const searchIndexCheckedAt = Date.now();
|
|
121
|
+
let searchIndexAvailable;
|
|
122
|
+
try {
|
|
123
|
+
const searchIndexText = await fetchRemoteText(searchIndexUrl);
|
|
124
|
+
writeFileSync(getSourceSearchIndexPath(source.name), searchIndexText);
|
|
125
|
+
searchIndexAvailable = true;
|
|
126
|
+
} catch (err) {
|
|
127
|
+
// Avoid serving a stale local search index after a registry refresh.
|
|
128
|
+
rmSync(getSourceSearchIndexPath(source.name), { force: true });
|
|
129
|
+
if (err.message?.startsWith('404 ')) {
|
|
130
|
+
searchIndexAvailable = false;
|
|
131
|
+
}
|
|
132
|
+
}
|
|
133
|
+
|
|
134
|
+
const nextMeta = {
|
|
135
|
+
...readMeta(source.name),
|
|
136
|
+
lastUpdated: Date.now(),
|
|
137
|
+
};
|
|
138
|
+
delete nextMeta.searchIndexAvailable;
|
|
139
|
+
delete nextMeta.searchIndexCheckedAt;
|
|
140
|
+
|
|
141
|
+
if (searchIndexAvailable !== undefined) {
|
|
142
|
+
nextMeta.searchIndexAvailable = searchIndexAvailable;
|
|
143
|
+
nextMeta.searchIndexCheckedAt = searchIndexCheckedAt;
|
|
144
|
+
}
|
|
145
|
+
|
|
146
|
+
writeMeta(source.name, nextMeta);
|
|
82
147
|
}
|
|
83
148
|
|
|
84
149
|
/**
|
|
@@ -141,6 +206,14 @@ export async function fetchFullBundle(sourceName) {
|
|
|
141
206
|
writeFileSync(getSourceRegistryPath(sourceName), regData);
|
|
142
207
|
}
|
|
143
208
|
|
|
209
|
+
const extractedSearchIndex = join(dataDir, 'search-index.json');
|
|
210
|
+
if (existsSync(extractedSearchIndex)) {
|
|
211
|
+
const searchIndexData = readFileSync(extractedSearchIndex, 'utf8');
|
|
212
|
+
writeFileSync(getSourceSearchIndexPath(sourceName), searchIndexData);
|
|
213
|
+
} else {
|
|
214
|
+
rmSync(getSourceSearchIndexPath(sourceName), { force: true });
|
|
215
|
+
}
|
|
216
|
+
|
|
144
217
|
writeMeta(sourceName, { ...readMeta(sourceName), lastUpdated: Date.now(), fullBundle: true });
|
|
145
218
|
rmSync(tmpPath, { force: true });
|
|
146
219
|
}
|
|
@@ -187,7 +260,7 @@ export async function fetchDoc(source, docPath) {
|
|
|
187
260
|
const content = await res.text();
|
|
188
261
|
|
|
189
262
|
// Cache locally
|
|
190
|
-
const dir =
|
|
263
|
+
const dir = dirname(cachedPath);
|
|
191
264
|
mkdirSync(dir, { recursive: true });
|
|
192
265
|
writeFileSync(cachedPath, content);
|
|
193
266
|
|
|
@@ -327,7 +400,7 @@ export async function ensureRegistry() {
|
|
|
327
400
|
// Auto-refresh stale remote registries (best-effort)
|
|
328
401
|
for (const source of config.sources) {
|
|
329
402
|
if (source.path) continue;
|
|
330
|
-
if (
|
|
403
|
+
if (shouldFetchRemoteRegistry(source.name)) {
|
|
331
404
|
try { await fetchRemoteRegistry(source); } catch { /* use stale */ }
|
|
332
405
|
}
|
|
333
406
|
}
|
|
@@ -341,6 +414,10 @@ export async function ensureRegistry() {
|
|
|
341
414
|
const defaultDir = getSourceDir('default');
|
|
342
415
|
mkdirSync(defaultDir, { recursive: true });
|
|
343
416
|
writeFileSync(getSourceRegistryPath('default'), readFileSync(bundledRegistry, 'utf8'));
|
|
417
|
+
const bundledSearchIndex = join(getBundledDir(), 'search-index.json');
|
|
418
|
+
if (existsSync(bundledSearchIndex)) {
|
|
419
|
+
writeFileSync(getSourceSearchIndexPath('default'), readFileSync(bundledSearchIndex, 'utf8'));
|
|
420
|
+
}
|
|
344
421
|
writeMeta('default', { lastUpdated: 0, bundledSeed: true }); // lastUpdated=0 → stale, so chub update will refresh
|
|
345
422
|
return;
|
|
346
423
|
}
|