chub-dev 0.2.0-beta.3 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/index.js CHANGED
@@ -1,4 +1,3 @@
1
- import chalk from 'chalk';
2
1
  import { Command } from 'commander';
3
2
  import { readFileSync } from 'node:fs';
4
3
  import { fileURLToPath } from 'node:url';
@@ -10,87 +9,55 @@ import { registerSearchCommand } from './commands/search.js';
10
9
  import { registerGetCommand } from './commands/get.js';
11
10
  import { registerBuildCommand } from './commands/build.js';
12
11
  import { registerFeedbackCommand } from './commands/feedback.js';
13
- import { trackEvent, shutdownAnalytics } from './lib/analytics.js';
12
+ import { registerAnnotateCommand } from './commands/annotate.js';
13
+ import { registerHelpCommand } from './commands/help.js';
14
+ import { trackEvent, shutdownAnalytics, setCliVersion } from './lib/analytics.js';
15
+ import { error } from './lib/output.js';
16
+ import { showWelcomeIfNeeded } from './lib/welcome.js';
17
+ import { getLocalHelpText } from './lib/help.js';
14
18
 
15
19
  const __dirname = dirname(fileURLToPath(import.meta.url));
16
20
  const pkg = JSON.parse(readFileSync(join(__dirname, '..', 'package.json'), 'utf8'));
17
-
18
- function printUsage() {
19
- console.log(`
20
- ${chalk.bold('chub')} — Context Hub CLI v${pkg.version}
21
- Search and retrieve LLM-optimized docs and skills.
22
-
23
- ${chalk.bold.underline('Getting Started')}
24
-
25
- ${chalk.dim('$')} chub update ${chalk.dim('# download the registry')}
26
- ${chalk.dim('$')} chub search ${chalk.dim('# list everything available')}
27
- ${chalk.dim('$')} chub search "stripe" ${chalk.dim('# fuzzy search')}
28
- ${chalk.dim('$')} chub search stripe/payments ${chalk.dim('# exact id → full detail')}
29
- ${chalk.dim('$')} chub get stripe/api ${chalk.dim('# print doc to terminal')}
30
- ${chalk.dim('$')} chub get stripe/api -o doc.md ${chalk.dim('# save to file')}
31
- ${chalk.dim('$')} chub get openai/chat --lang py ${chalk.dim('# specific language')}
32
- ${chalk.dim('$')} chub get pw-community/login-flows ${chalk.dim('# fetch a skill')}
33
- ${chalk.dim('$')} chub get openai/chat stripe/api ${chalk.dim('# fetch multiple')}
34
-
35
- ${chalk.bold.underline('Commands')}
36
-
37
- ${chalk.bold('search')} [query] Search docs and skills (no query = list all)
38
- ${chalk.bold('get')} <ids...> Fetch docs or skills by ID
39
- ${chalk.bold('update')} Refresh the cached registry
40
- ${chalk.bold('cache')} status|clear Manage the local cache
41
- ${chalk.bold('build')} <content-dir> Build registry from content directory
42
-
43
- ${chalk.bold.underline('Flags')}
44
-
45
- --json Structured JSON output (for agents and piping)
46
- --tags <csv> Filter by tags (e.g. docs, skill, openai, browser)
47
- --lang <language> Language variant (js, py, ts)
48
- --full Fetch all files, not just the entry point
49
- -o, --output <path> Write content to file or directory
50
-
51
- ${chalk.bold.underline('Agent Piping Patterns')}
52
-
53
- ${chalk.dim('# Get the top result id')}
54
- ${chalk.dim('$')} chub search "stripe" --json | jq -r '.results[0].id'
55
-
56
- ${chalk.dim('# Search → pick → fetch → save')}
57
- ${chalk.dim('$')} ID=$(chub search "stripe" --json | jq -r '.results[0].id')
58
- ${chalk.dim('$')} chub get "$ID" --lang js -o .context/stripe.md
59
-
60
- ${chalk.dim('# Fetch multiple at once')}
61
- ${chalk.dim('$')} chub get openai/chat stripe/api -o .context/
62
-
63
- ${chalk.bold.underline('Multi-Source Config')} ${chalk.dim('(~/.chub/config.yaml)')}
64
-
65
- ${chalk.dim('sources:')}
66
- ${chalk.dim(' - name: community')}
67
- ${chalk.dim(' url: https://cdn.aichub.org/v1')}
68
- ${chalk.dim(' - name: internal')}
69
- ${chalk.dim(' path: /path/to/local/docs')}
70
-
71
- ${chalk.dim('# On id collision, use source: prefix: chub get internal:openai/chat')}
72
- `);
73
- }
21
+ setCliVersion(pkg.version);
74
22
 
75
23
  const program = new Command();
76
24
 
77
25
  program
78
26
  .name('chub')
79
27
  .description('Context Hub - search and retrieve LLM-optimized docs and skills')
80
- .version(pkg.version)
28
+ .version(pkg.version, '-V, --cli-version')
29
+ .addHelpCommand(false)
81
30
  .option('--json', 'Output as JSON (machine-readable)')
82
31
  .action(() => {
83
- printUsage();
32
+ console.log(getLocalHelpText(pkg.version));
84
33
  });
85
34
 
86
35
  // Commands that don't need registry
87
- const SKIP_REGISTRY = ['update', 'cache', 'build', 'feedback', 'help'];
36
+ const SKIP_REGISTRY = ['update', 'cache', 'build', 'feedback', 'annotate', 'help'];
88
37
 
89
38
  program.hook('preAction', async (thisCommand) => {
39
+ const globalOpts = thisCommand.optsWithGlobals?.() || {};
40
+ showWelcomeIfNeeded(globalOpts);
41
+
90
42
  const cmdName = thisCommand.args?.[0] || thisCommand.name();
91
- // Track command usage (fire-and-forget, never blocks)
92
43
  if (cmdName !== 'chub') {
93
- trackEvent('command_run', { command: cmdName }).catch(() => {});
44
+ // Only initialize identity and track if telemetry is enabled
45
+ // Respects CHUB_TELEMETRY=0 — no client_id file created, no events sent
46
+ try {
47
+ const { isTelemetryEnabled } = await import('./lib/telemetry.js');
48
+ if (isTelemetryEnabled()) {
49
+ const { getOrCreateClientId, isFirstRun } = await import('./lib/identity.js');
50
+ await getOrCreateClientId();
51
+
52
+ // Fire-and-forget — don't block command on PostHog network I/O
53
+ trackEvent('command_run', { command: cmdName }).catch(() => {});
54
+ if (isFirstRun()) {
55
+ trackEvent('first_run', { command: cmdName }).catch(() => {});
56
+ }
57
+ }
58
+ } catch {
59
+ // Identity/telemetry failure — silently skip, don't block the command
60
+ }
94
61
  }
95
62
  if (SKIP_REGISTRY.includes(cmdName)) return;
96
63
  if (thisCommand.parent?.name() === 'cache') return;
@@ -99,9 +66,8 @@ program.hook('preAction', async (thisCommand) => {
99
66
  try {
100
67
  await ensureRegistry();
101
68
  } catch (err) {
102
- process.stderr.write(`Warning: Could not load registry: ${err.message}\n`);
103
- process.stderr.write(`Run \`chub update\` to initialize.\n`);
104
- process.exit(1);
69
+ await trackEvent('command_error', { command: cmdName, error_type: 'registry_unavailable' });
70
+ error(`Registry not available: ${err.message}. Run \`chub update\` to refresh remote registries, or check that local source paths in ~/.chub/config.yaml are correct.`, globalOpts);
105
71
  }
106
72
  });
107
73
 
@@ -111,6 +77,8 @@ registerSearchCommand(program);
111
77
  registerGetCommand(program);
112
78
  registerBuildCommand(program);
113
79
  registerFeedbackCommand(program);
80
+ registerAnnotateCommand(program);
81
+ registerHelpCommand(program, pkg.version);
114
82
 
115
83
  program.parse();
116
84
 
@@ -4,13 +4,14 @@
4
4
  * Tracks: command usage, search patterns, doc/skill popularity, errors.
5
5
  * Does NOT track feedback ratings (those go to the custom API via telemetry.js).
6
6
  *
7
- * Respects the same telemetry opt-out: `telemetry: false` in config or CHUB_TELEMETRY=0.
7
+ * Respects telemetry opt-out: `telemetry: false` in config or CHUB_TELEMETRY=0.
8
+ * Feedback has a separate opt-out: `feedback: false` in config or CHUB_FEEDBACK=0.
8
9
  */
9
10
 
10
11
  import { isTelemetryEnabled } from './telemetry.js';
11
12
 
12
13
  // PostHog project API key (public — standard for client-side analytics)
13
- const POSTHOG_KEY = 'phc_cUPXY1tAUkIOU9perzGcFYEtFQeCgUhUO6ejT79YLIk';
14
+ const POSTHOG_KEY = 'phc_tO9mXIgcCuBccfN2Ut0quf6UFsd06u3Y6g1kqMaYdQX';
14
15
  const POSTHOG_HOST = 'https://us.i.posthog.com';
15
16
 
16
17
  let _posthog = null;
@@ -65,6 +66,7 @@ export async function trackEvent(event, properties = {}) {
65
66
  ...properties,
66
67
  platform: process.platform,
67
68
  node_version: process.version,
69
+ cli_version: _cliVersion || undefined,
68
70
  },
69
71
  });
70
72
 
@@ -75,6 +77,15 @@ export async function trackEvent(event, properties = {}) {
75
77
  }
76
78
  }
77
79
 
80
// Module-level CLI version string, attached to every tracked event.
let _cliVersion;

/**
 * Record the CLI version so trackEvent can include it in all events.
 * Invoked once from index.js during startup.
 */
export function setCliVersion(version) {
  _cliVersion = version;
}
88
+
78
89
  /**
79
90
  * Shut down the PostHog client gracefully.
80
91
  * Call this before process exit if possible.
@@ -0,0 +1,57 @@
1
+ import { readFileSync, writeFileSync, mkdirSync, unlinkSync, readdirSync } from 'node:fs';
2
+ import { join } from 'node:path';
3
+ import { getChubDir } from './config.js';
4
+
5
// Directory under the chub config dir where annotation JSON files live.
function getAnnotationsDir() {
  return join(getChubDir(), 'annotations');
}

// Map an entry id (e.g. "stripe/api") to a flat, filename-safe JSON path
// by replacing path separators with "--".
function annotationPath(entryId) {
  return join(getAnnotationsDir(), `${entryId.replace(/\//g, '--')}.json`);
}
13
+
14
/**
 * Read the stored annotation for an entry.
 *
 * @param {string} entryId - Registry entry id.
 * @returns {Object|null} Parsed annotation record, or null when the file
 *   is missing or unreadable/corrupt.
 */
export function readAnnotation(entryId) {
  try {
    const raw = readFileSync(annotationPath(entryId), 'utf8');
    return JSON.parse(raw);
  } catch {
    // Missing file or bad JSON — treat as "no annotation".
    return null;
  }
}
21
+
22
/**
 * Create or replace the annotation for an entry.
 *
 * @param {string} entryId - Registry entry id.
 * @param {string} note - Free-form annotation text.
 * @returns {Object} The persisted record ({ id, note, updatedAt }).
 */
export function writeAnnotation(entryId, note) {
  // Ensure the annotations directory exists before writing.
  mkdirSync(getAnnotationsDir(), { recursive: true });
  const record = {
    id: entryId,
    note,
    updatedAt: new Date().toISOString(),
  };
  writeFileSync(annotationPath(entryId), JSON.stringify(record, null, 2));
  return record;
}
33
+
34
/**
 * Delete an entry's annotation file, if any.
 *
 * @param {string} entryId - Registry entry id.
 * @returns {boolean} true when a file was removed, false otherwise.
 */
export function clearAnnotation(entryId) {
  try {
    unlinkSync(annotationPath(entryId));
    return true;
  } catch {
    // Nothing to delete (or not deletable) — report false rather than throw.
    return false;
  }
}
42
+
43
/**
 * Load every stored annotation.
 *
 * @returns {Array<Object>} Parsed annotation records; unreadable or corrupt
 *   files are skipped, and a missing directory yields an empty array.
 */
export function listAnnotations() {
  const dir = getAnnotationsDir();
  let files;
  try {
    files = readdirSync(dir);
  } catch {
    // Directory doesn't exist yet — no annotations have been written.
    return [];
  }

  const records = [];
  for (const file of files) {
    if (!file.endsWith('.json')) continue;
    try {
      records.push(JSON.parse(readFileSync(join(dir, file), 'utf8')));
    } catch {
      // Corrupt annotation file — skip it silently.
    }
  }
  return records;
}
@@ -0,0 +1,303 @@
1
+ /**
2
+ * BM25 search implementation for Context Hub.
3
+ * Index is built at `chub build` time, scoring happens at search time.
4
+ * Tokenizer is shared between build and search to ensure consistency.
5
+ */
6
+
7
+ const STOP_WORDS = new Set([
8
+ 'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from',
9
+ 'has', 'have', 'in', 'is', 'it', 'its', 'of', 'on', 'or', 'that',
10
+ 'the', 'to', 'was', 'were', 'will', 'with', 'this', 'but', 'not',
11
+ 'you', 'your', 'can', 'do', 'does', 'how', 'if', 'may', 'no',
12
+ 'so', 'than', 'too', 'very', 'just', 'about', 'into', 'over',
13
+ 'such', 'then', 'them', 'these', 'those', 'through', 'under',
14
+ 'use', 'using', 'used',
15
+ ]);
16
+
17
+ // BM25 default parameters
18
+ const DEFAULT_K1 = 1.5;
19
+ const DEFAULT_B = 0.75;
20
+
21
+ // Field weights for multi-field scoring
22
+ const FIELD_WEIGHTS = {
23
+ id: 4.0,
24
+ name: 3.0,
25
+ tags: 2.0,
26
+ description: 1.0,
27
+ };
28
+
29
+ function getDefaultParams() {
30
+ return { k1: DEFAULT_K1, b: DEFAULT_B };
31
+ }
32
+
33
// A token is worth indexing when it is not a stop word and is either
// multi-character or purely numeric (single letters are noise; digits aren't).
function isSearchableToken(token) {
  if (STOP_WORDS.has(token)) return false;
  return token.length > 1 || /^\d+$/.test(token);
}
36
+
37
/**
 * Collapse an identifier to lowercase alphanumerics only
 * (e.g. "Node-Fetch" → "nodefetch"). Nullish input yields ''.
 *
 * @param {string} text
 * @returns {string}
 */
export function compactIdentifier(text) {
  const lowered = String(text || '').toLowerCase();
  return lowered.replace(/[^a-z0-9]/g, '');
}
42
+
43
// Insert spaces at letter↔digit boundaries ("auth0" → "auth 0") so that
// joined identifier variants tokenize into their parts.
// Assumes lowercase input (callers pass compactIdentifier output).
function splitAlphaNumeric(text) {
  const afterLetters = text.replace(/([a-z])(\d)/g, '$1 $2');
  return afterLetters.replace(/(\d)([a-z])/g, '$1 $2');
}
48
+
49
/**
 * Tokenize text into lowercase terms with stop word removal.
 * Must be used identically at build time and search time.
 *
 * @param {string} text
 * @returns {string[]} Searchable tokens (possibly empty).
 */
export function tokenize(text) {
  if (!text) return [];
  // Lowercase, strip punctuation (keeping whitespace and hyphens as
  // separators), then split and drop stop words / single letters.
  const normalized = text.toLowerCase().replace(/[^a-z0-9\s-]/g, ' ');
  return normalized.split(/[\s-]+/).filter(isSearchableToken);
}
61
+
62
/**
 * Tokenize identifiers more aggressively than free text so package ids
 * still match joined/split variants like "nodefetch" and "auth 0".
 *
 * @param {string} text - An entry id such as "stripe/payments".
 * @returns {string[]} Deduplicated token variants.
 */
export function tokenizeIdentifier(text) {
  if (!text) return [];

  const raw = String(text);
  const collected = new Set(tokenize(text));

  // Add a candidate plus its letter/digit-split sub-tokens.
  const addVariants = (candidate) => {
    if (isSearchableToken(candidate)) {
      collected.add(candidate);
    }
    for (const piece of tokenize(splitAlphaNumeric(candidate))) {
      collected.add(piece);
    }
  };

  // Whole-id compact form, e.g. "node-fetch/v2" → "nodefetchv2".
  addVariants(compactIdentifier(raw));

  // Per-segment compact forms: slash segments first, then finer splits
  // on /, _, ., whitespace and hyphen.
  const segments = new Set([
    ...raw.split('/').map((segment) => compactIdentifier(segment)),
    ...raw.split(/[\/_.\s-]+/).map((segment) => compactIdentifier(segment)),
  ]);
  for (const segment of segments) {
    if (segment) addVariants(segment);
  }

  return [...collected];
}
97
+
98
// Build term → [docIndex, ...] postings lists over all four token fields.
// Each document contributes at most one posting per term.
// Uses a null-prototype object so terms can't collide with Object.prototype.
function buildInvertedIndex(documents) {
  const postingsByTerm = Object.create(null);

  documents.forEach((doc, docIndex) => {
    const termsInDoc = new Set();
    for (const field of ['id', 'name', 'description', 'tags']) {
      for (const term of doc.tokens[field] || []) {
        termsInDoc.add(term);
      }
    }
    for (const term of termsInDoc) {
      if (!postingsByTerm[term]) postingsByTerm[term] = [];
      postingsByTerm[term].push(docIndex);
    }
  });

  return postingsByTerm;
}
117
+
118
/**
 * Assemble the full BM25 index structure from pre-tokenized documents:
 * per-term IDF, average field lengths, and an inverted index for
 * candidate pruning at search time.
 *
 * @param {Array} documents - [{ id, tokens: { id, name, description, tags } }]
 * @param {Object} [params] - BM25 { k1, b }; defaults via getDefaultParams().
 * @returns {Object} Serializable index.
 */
export function buildIndexFromDocuments(documents, params = getDefaultParams()) {
  const FIELDS = ['id', 'name', 'description', 'tags'];
  // Document frequency per term, counted once per doc across all fields.
  const docFrequency = Object.create(null);
  const lengthsByField = { id: [], name: [], description: [], tags: [] };

  for (const doc of documents) {
    const seen = new Set();
    for (const field of FIELDS) {
      const toks = doc.tokens[field] || [];
      lengthsByField[field].push(toks.length);
      for (const t of toks) seen.add(t);
    }
    for (const term of seen) {
      docFrequency[term] = (docFrequency[term] || 0) + 1;
    }
  }

  // Robertson–Sparck Jones IDF with +1 inside the log (always positive).
  const totalDocs = documents.length;
  const idf = Object.create(null);
  for (const [term, df] of Object.entries(docFrequency)) {
    idf[term] = Math.log((totalDocs - df + 0.5) / (df + 0.5) + 1);
  }

  const mean = (values) =>
    values.length === 0 ? 0 : values.reduce((sum, v) => sum + v, 0) / values.length;

  return {
    version: '1.0.0',
    algorithm: 'bm25',
    params,
    totalDocs,
    avgFieldLengths: {
      id: mean(lengthsByField.id),
      name: mean(lengthsByField.name),
      description: mean(lengthsByField.description),
      tags: mean(lengthsByField.tags),
    },
    idf,
    documents,
    invertedIndex: buildInvertedIndex(documents),
  };
}
162
+
163
/**
 * Build a BM25 search index from registry entries.
 * Called during `chub build`.
 *
 * @param {Array} entries - Combined docs and skills from registry
 * @returns {Object} The search index
 */
export function buildIndex(entries) {
  // Ids get the aggressive identifier tokenizer; everything else uses
  // the plain text tokenizer shared with search time.
  const documents = entries.map((entry) => ({
    id: entry.id,
    tokens: {
      id: tokenizeIdentifier(entry.id),
      name: tokenize(entry.name),
      description: tokenize(entry.description || ''),
      tags: (entry.tags || []).flatMap((tag) => tokenize(tag)),
    },
  }));
  return buildIndexFromDocuments(documents);
}
191
+
192
/**
 * BM25 contribution of a single field for the given query terms.
 *
 * @param {string[]} queryTerms - Tokenized query.
 * @param {string[]} fieldTokens - Tokens of this field in the document.
 * @param {Object} idf - term → IDF weight map.
 * @param {number} avgFieldLen - Corpus-average length of this field.
 * @param {number} k1 - TF saturation parameter.
 * @param {number} b - Length normalization parameter.
 * @returns {number} Field score (0 when the field is empty or nothing matches).
 */
function scoreField(queryTerms, fieldTokens, idf, avgFieldLen, k1, b) {
  const docLen = fieldTokens.length;
  if (docLen === 0) return 0;

  // Term frequencies within this field.
  const termCounts = Object.create(null);
  for (const token of fieldTokens) {
    termCounts[token] = (termCounts[token] || 0) + 1;
  }

  // Length-normalization term is query-independent, so hoist it.
  const lengthNorm = k1 * (1 - b + b * (docLen / (avgFieldLen || 1)));

  let total = 0;
  for (const term of queryTerms) {
    const freq = termCounts[term] || 0;
    if (freq === 0) continue;
    total += (idf[term] || 0) * ((freq * (k1 + 1)) / (freq + lengthNorm));
  }
  return total;
}
219
+
220
// Collect the indexes of documents containing at least one query term via
// the inverted index. Older indexes without postings lists fall back to
// scoring every document.
function getCandidateDocIndexes(queryTerms, index) {
  if (!index.invertedIndex) {
    return index.documents.map((_, docIndex) => docIndex);
  }

  const matched = new Set();
  for (const term of new Set(queryTerms)) {
    for (const docIndex of index.invertedIndex[term] || []) {
      matched.add(docIndex);
    }
  }
  return [...matched];
}
236
+
237
// Core scoring loop shared by search() and searchWithStats():
// tokenize the query, prune candidates via the inverted index, score each
// candidate with weighted per-field BM25, and sort descending by score.
function runSearch(query, index, opts = {}) {
  const queryTerms = tokenize(query);
  const totalDocs = index.documents.length;
  const hasInvertedIndex = !!index.invertedIndex;

  // Empty query or all stop words: nothing can match.
  if (queryTerms.length === 0) {
    return {
      results: [],
      stats: {
        totalDocs,
        candidateDocCount: 0,
        scoredDocCount: 0,
        matchedDocCount: 0,
        usedInvertedIndex: hasInvertedIndex,
      },
    };
  }

  const { k1, b } = index.params;
  const candidates = getCandidateDocIndexes(queryTerms, index);

  const scored = [];
  for (const docIndex of candidates) {
    const doc = index.documents[docIndex];

    let score = 0;
    for (const [field, weight] of Object.entries(FIELD_WEIGHTS)) {
      const fieldScore = scoreField(
        queryTerms,
        doc.tokens[field] || [],
        index.idf,
        index.avgFieldLengths[field] || 1,
        k1,
        b,
      );
      score += fieldScore * weight;
    }

    if (score > 0) {
      scored.push({ id: doc.id, score });
    }
  }

  scored.sort((lhs, rhs) => rhs.score - lhs.score);

  return {
    results: opts.limit ? scored.slice(0, opts.limit) : scored,
    stats: {
      totalDocs,
      candidateDocCount: candidates.length,
      scoredDocCount: candidates.length,
      matchedDocCount: scored.length,
      usedInvertedIndex: hasInvertedIndex,
    },
  };
}
288
+
289
/**
 * Search the BM25 index with a query string.
 *
 * @param {string} query - The search query
 * @param {Object} index - The pre-built BM25 index
 * @param {Object} opts - Options: { limit }
 * @returns {Array} Sorted results: [{ id, score }]
 */
export function search(query, index, opts = {}) {
  const { results } = runSearch(query, index, opts);
  return results;
}
300
+
301
/**
 * Like search(), but also returns scoring statistics.
 *
 * @param {string} query - The search query
 * @param {Object} index - The pre-built BM25 index
 * @param {Object} opts - Options: { limit }
 * @returns {Object} { results, stats }
 */
export function searchWithStats(query, index, opts = {}) {
  return runSearch(query, index, opts);
}