@comfanion/workflow 4.38.3-dev.2 → 4.38.4-dev.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,573 +0,0 @@
1
- // OpenCode Vectorizer - Semantic Code Search with Multi-Index Support
2
- // Part of @comfanion/workflow
3
-
4
- import { pipeline, env } from '@xenova/transformers';
5
- import * as lancedb from 'vectordb';
6
- import fs from 'fs/promises';
7
- import path from 'path';
8
- import crypto from 'crypto';
9
-
10
// Debug logging is enabled when the DEBUG environment variable mentions
// "vectorizer" or is exactly "*".
const debugSetting = process.env.DEBUG;
const DEBUG = debugSetting?.includes('vectorizer') || debugSetting === '*';

// Outside debug mode, apply the quiet transformers.js settings in one step:
// local models allowed, browser cache off, log level lowered to errors.
if (!DEBUG) {
  Object.assign(env, {
    allowLocalModels: true,
    useBrowserCache: false,
    logLevel: 'error',
  });
}
18
-
19
// File extensions grouped by the kind of content they hold.
const CODE_EXTENSIONS = [
  'js', 'ts', 'jsx', 'tsx', 'mjs', 'cjs', 'py', 'go', 'rs', 'java', 'kt',
  'swift', 'c', 'cpp', 'h', 'hpp', 'cs', 'rb', 'php', 'scala', 'clj',
];
const DOC_EXTENSIONS = ['md', 'mdx', 'txt', 'rst', 'adoc'];
const CONFIG_EXTENSIONS = ['yaml', 'yml', 'json', 'toml', 'ini', 'env', 'xml'];
// The catch-all preset takes only these config formats (ini/env/xml stay config-only).
const ALL_EXTENSIONS = [...CODE_EXTENSIONS, ...DOC_EXTENSIONS, 'yaml', 'yml', 'json', 'toml'];

// Builds "<prefix>/*.{ext1,ext2,...}".
const globFor = (extensions, prefix = '**') => `${prefix}/*.{${extensions.join(',')}}`;

// Returns a fresh array each time so presets never share an ignore list.
const toolingIgnores = () => ['**/node_modules/**', '**/.git/**', '**/.opencode/**'];

/**
 * Default index presets (can be overridden by config.yaml).
 * Each entry holds the glob to index, the globs to skip and a short label.
 */
const DEFAULT_PRESETS = {
  code: {
    pattern: globFor(CODE_EXTENSIONS),
    ignore: [
      '**/node_modules/**',
      '**/.git/**',
      '**/dist/**',
      '**/build/**',
      '**/.opencode/**',
      '**/docs/**',
      '**/vendor/**',
      '**/__pycache__/**',
    ],
    description: 'Source code files (excludes docs, vendor, node_modules)',
  },
  docs: {
    pattern: globFor(DOC_EXTENSIONS, 'docs/**'),
    ignore: [],
    description: 'Documentation in docs/ folder',
  },
  config: {
    pattern: globFor(CONFIG_EXTENSIONS),
    ignore: toolingIgnores(),
    description: 'Configuration files',
  },
  all: {
    pattern: globFor(ALL_EXTENSIONS),
    ignore: toolingIgnores(),
    description: 'All supported files',
  },
};

// Mutable module state; starts from the defaults and is populated from
// config.yaml if available.
let INDEX_PRESETS = { ...DEFAULT_PRESETS };
let GLOBAL_IGNORE = [];
let EMBEDDING_MODEL = 'Xenova/all-MiniLM-L6-v2'; // Default: fast model
49
-
50
/**
 * Load vectorizer settings from `<projectRoot>/.opencode/config.yaml`.
 *
 * The file is scanned with regular expressions, not a YAML parser, so only
 * this fixed layout (two-space indentation steps) is recognised:
 *
 *   vectorizer:
 *     model: "<name>"
 *     exclude:
 *       - <dir-or-glob>
 *     indexes:
 *       code|docs|config:
 *         enabled: true|false
 *         pattern: "<glob>"
 *         ignore:
 *           - <glob>
 *
 * Side effects: reassigns the module-level EMBEDDING_MODEL and GLOBAL_IGNORE
 * and replaces entries of INDEX_PRESETS. An index with `enabled: false` is
 * not written, so its existing preset stays in place. Any error (including a
 * missing file) is swallowed and only reported when DEBUG is on.
 *
 * @param {string} projectRoot - Project root directory
 * @returns {Promise<void>}
 */
async function loadConfig(projectRoot) {
  // Turns a "- value" YAML list line into the bare, unquoted value.
  const listItemValue = (line) => line.replace(/^\s*-\s*/, '').replace(/["']/g, '').trim();

  try {
    const configPath = path.join(projectRoot, '.opencode', 'config.yaml');
    // Normalise CRLF: "." never matches "\r", so the list patterns below
    // would otherwise stop after the first item of a Windows-edited file.
    const content = (await fs.readFile(configPath, 'utf8')).replace(/\r\n/g, '\n');

    // JavaScript has no `\Z` anchor (it matches a literal "Z"), so the end of
    // input is written as "no character follows": (?![\s\S]). Without it a
    // `vectorizer:` section placed last in the file was never found.
    const vectorizerMatch = content.match(/^vectorizer:([\s\S]*?)(?=^[a-z]|(?![\s\S]))/m);
    if (!vectorizerMatch) return;

    const section = vectorizerMatch[1];

    // Embedding model
    const modelMatch = section.match(/^\s{2}model:\s*["']?([^"'\n]+)["']?/m);
    if (modelMatch) {
      EMBEDDING_MODEL = modelMatch[1].trim();
      if (DEBUG) console.log('[vectorizer] Using model from config:', EMBEDDING_MODEL);
    }

    // Global exclude list; bare names become "**/<name>/**" directory globs.
    // Quotes are stripped the same way as for per-index `ignore` entries.
    const excludeMatch = section.match(/^\s{2}exclude:\s*\n((?:\s{4}-\s+.+\n?)*)/m);
    if (excludeMatch) {
      GLOBAL_IGNORE = excludeMatch[1]
        .split('\n')
        .map(listItemValue)
        .filter(Boolean)
        .map((entry) => (entry.includes('*') ? entry : `**/${entry}/**`));
    }

    // Indexes section
    const indexesMatch = section.match(
      /^\s{2}indexes:\s*\n([\s\S]*?)(?=^\s{2}[a-z]|\s{2}exclude:|(?![\s\S]))/m
    );
    if (!indexesMatch) return;

    const indexesSection = indexesMatch[1];

    // Only these three index names are read from the file.
    for (const indexName of ['code', 'docs', 'config']) {
      // Each index block runs until the next four-space key or the end of
      // the section; the end-of-input branch keeps the last index parseable.
      const indexRegex = new RegExp(
        `^\\s{4}${indexName}:\\s*\\n([\\s\\S]*?)(?=^\\s{4}[a-z]|(?![\\s\\S]))`,
        'm'
      );
      const indexMatch = indexesSection.match(indexRegex);
      if (!indexMatch) continue;

      const indexSection = indexMatch[1];

      // `enabled` defaults to true when the key is absent.
      const enabledMatch = indexSection.match(/^\s+enabled:\s*(true|false)/m);
      const enabled = enabledMatch ? enabledMatch[1] === 'true' : true;

      // Fall back to the built-in pattern when none is configured.
      const patternMatch = indexSection.match(/^\s+pattern:\s*["']?([^"'\n]+)["']?/m);
      const pattern = patternMatch ? patternMatch[1].trim() : DEFAULT_PRESETS[indexName]?.pattern;

      const ignoreMatch = indexSection.match(/^\s+ignore:\s*\n((?:\s+-\s+.+\n?)*)/m);
      const ignore = ignoreMatch
        ? ignoreMatch[1].split('\n').map(listItemValue).filter(Boolean)
        : [];

      if (enabled && pattern) {
        INDEX_PRESETS[indexName] = {
          pattern,
          ignore,
          description: `${indexName} files from config.yaml`
        };
      }
    }

    if (DEBUG) console.log('[vectorizer] Loaded config:', { INDEX_PRESETS, GLOBAL_IGNORE });
  } catch (e) {
    // Best effort: a missing or unreadable config means "keep what we have".
    if (DEBUG) console.log('[vectorizer] Using default presets (no config.yaml)');
  }
}
128
-
129
/**
 * Semantic index over one named slice of a project.
 *
 * Each index lives in `<projectRoot>/.opencode/vectors/<indexName>/` and
 * consists of a LanceDB database (table `chunks`) plus `hashes.json`, a map
 * from project-relative file path to the MD5 of the content last indexed.
 */
class CodebaseIndexer {
  /**
   * @param {string} projectRoot - Project root directory
   * @param {string} indexName - Name of the index (e.g., 'code', 'docs', 'config')
   */
  constructor(projectRoot, indexName = 'code') {
    this.root = projectRoot;
    this.indexName = indexName;
    this.baseDir = path.join(projectRoot, '.opencode', 'vectors');
    this.cacheDir = path.join(this.baseDir, indexName);
    this.model = null;
    this.db = null;
    this.hashes = {};
    this.configLoaded = false;
  }

  /**
   * Load config (once per instance), create the cache directory, connect to
   * the database and read the hash cache.
   * @returns {Promise<CodebaseIndexer>} this instance, for chaining
   */
  async init() {
    // Load config on first init
    if (!this.configLoaded) {
      await loadConfig(this.root);
      this.configLoaded = true;
    }
    await fs.mkdir(this.cacheDir, { recursive: true });
    this.db = await lancedb.connect(path.join(this.cacheDir, 'lancedb'));
    await this.loadHashes();
    return this;
  }

  /**
   * Lazily create the feature-extraction pipeline for EMBEDDING_MODEL.
   * @returns {Promise<Function>} the cached pipeline
   */
  async loadModel() {
    if (!this.model) {
      if (DEBUG) console.log(`[vectorizer] Loading embedding model: ${EMBEDDING_MODEL}...`);
      // Model options:
      // - Xenova/all-MiniLM-L6-v2: fast, 384 dims, ~10 files/10sec
      // - Xenova/bge-base-en-v1.5: quality, 768 dims, ~3 files/10sec
      this.model = await pipeline('feature-extraction', EMBEDDING_MODEL, {
        progress_callback: DEBUG ? undefined : null // Suppress progress bar unless DEBUG
      });
      if (DEBUG) console.log(`[vectorizer] Model loaded: ${EMBEDDING_MODEL}`);
    }
    return this.model;
  }

  /** Drop the model reference and, when `global.gc` is exposed, request a GC run. */
  async unloadModel() {
    this.model = null;
    if (global.gc) global.gc();
  }

  /**
   * Read `hashes.json` into `this.hashes`. A missing, unreadable or
   * malformed file (including JSON that is not a plain object) yields `{}`.
   */
  async loadHashes() {
    try {
      const hashFile = path.join(this.cacheDir, 'hashes.json');
      const parsed = JSON.parse(await fs.readFile(hashFile, 'utf8'));
      // `null`, arrays and primitives are valid JSON but would break every
      // later `this.hashes[relPath]` access or Object.keys() walk.
      const isPlainObject = parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed);
      this.hashes = isPlainObject ? parsed : {};
    } catch {
      this.hashes = {};
    }
  }

  /** Persist `this.hashes` to `hashes.json` in the cache directory. */
  async saveHashes() {
    const hashFile = path.join(this.cacheDir, 'hashes.json');
    await fs.writeFile(hashFile, JSON.stringify(this.hashes, null, 2));
  }

  /**
   * @param {string} content - File content
   * @returns {string} hex MD5 digest, used only for change detection
   */
  fileHash(content) {
    return crypto.createHash('md5').update(content).digest('hex');
  }

  /**
   * Check if file is archived (should be excluded from default search).
   * Archived if:
   * - one of its parent folders is named `archive`
   * - the file starts with front matter containing `archived: true`
   *
   * @param {string} relPath - Project-relative path ("/" or "\" separated)
   * @param {string} content - File content
   * @returns {boolean}
   */
  isArchived(relPath, content) {
    // path.relative() yields "\" separators on Windows, so split on both.
    const folders = relPath.split(/[\\/]/).slice(0, -1);
    if (folders.includes('archive')) {
      return true;
    }

    // Front matter: YAML between --- markers at the very start of the file.
    // `\r?` keeps this working for files saved with CRLF line endings.
    const frontmatterMatch = content.match(/^---\r?\n([\s\S]*?)\r?\n---/);
    return frontmatterMatch !== null && /^archived:\s*true/m.test(frontmatterMatch[1]);
  }

  /**
   * Embed text with mean pooling and normalisation.
   * @param {string} text
   * @returns {Promise<number[]>} embedding as a plain array
   */
  async embed(text) {
    const model = await this.loadModel();
    const result = await model(text, { pooling: 'mean', normalize: true });
    return Array.from(result.data);
  }

  /**
   * Split content into chunks for embedding.
   *
   * Splitting happens on line boundaries only: lines are accumulated until
   * adding the next one would exceed `maxChars`. A single line longer than
   * `maxChars` is kept whole, so a chunk can exceed the limit.
   *
   * @param {string} content
   * @param {number} maxChars - Target chunk size in characters
   * @returns {string[]} at least one chunk (`['']` for empty content)
   */
  chunkCode(content, maxChars = 1500) {
    const chunks = [];
    let current = [];
    let currentLen = 0;

    for (const line of content.split('\n')) {
      if (currentLen + line.length > maxChars && current.length > 0) {
        chunks.push(current.join('\n'));
        current = [];
        currentLen = 0;
      }
      current.push(line);
      currentLen += line.length + 1; // +1 for the newline that join() restores
    }

    if (current.length > 0) {
      chunks.push(current.join('\n'));
    }

    return chunks;
  }

  /**
   * Check if file needs re-indexing based on content hash.
   * @param {string} filePath - Absolute file path
   * @param {string} content - Current file content
   * @returns {boolean}
   */
  needsIndex(filePath, content) {
    const relPath = path.relative(this.root, filePath);
    return this.hashes[relPath] !== this.fileHash(content);
  }

  /**
   * Open the `chunks` table.
   * @returns {Promise<object|null>} the table, or null when it does not exist yet
   */
  async openChunksTable() {
    const tables = await this.db.tableNames();
    return tables.includes('chunks') ? this.db.openTable('chunks') : null;
  }

  /**
   * Best-effort removal of every stored chunk of one file.
   * Never throws: if the table has no `delete` method or the call fails, the
   * stale rows simply stay (the behaviour before this cleanup existed).
   *
   * @param {object} table - Opened `chunks` table
   * @param {string} relPath - Project-relative file path
   * @returns {Promise<boolean>} true when the delete call succeeded
   */
  async deleteFileChunks(table, relPath) {
    if (typeof table.delete !== 'function') return false;
    try {
      // The filter is an SQL expression; double any single quote in the path.
      const escaped = relPath.replace(/'/g, "''");
      await table.delete(`file = '${escaped}'`);
      return true;
    } catch (e) {
      console.warn(`Cannot remove stale chunks for ${relPath}: ${e.message}`);
      return false;
    }
  }

  /**
   * Index a single file.
   * @param {string} filePath - Absolute file path
   * @returns {Promise<boolean>} true if the file was indexed, false if it was
   *   skipped (unchanged or unreadable)
   */
  async indexFile(filePath) {
    const relPath = path.relative(this.root, filePath);

    let content;
    try {
      content = await fs.readFile(filePath, 'utf8');
    } catch (e) {
      console.warn(`Cannot read ${relPath}: ${e.message}`);
      return false;
    }

    const hash = this.fileHash(content);

    // Skip if unchanged
    if (this.hashes[relPath] === hash) {
      return false;
    }

    const chunks = this.chunkCode(content);
    const archived = this.isArchived(relPath, content);
    const data = [];

    for (let i = 0; i < chunks.length; i++) {
      const embedding = await this.embed(chunks[i]);
      data.push({
        file: relPath,
        chunk_index: i,
        content: chunks[i],
        vector: embedding,
        archived
      });
    }

    // Add to database
    const table = await this.openChunksTable();
    if (table) {
      // A file with a recorded hash already has rows in the table; drop them
      // first so search does not return the old and the new version.
      if (Object.hasOwn(this.hashes, relPath)) {
        await this.deleteFileChunks(table, relPath);
      }
      await table.add(data);
    } else {
      await this.db.createTable('chunks', data);
    }

    // Update hash cache
    this.hashes[relPath] = hash;
    await this.saveHashes();

    return true;
  }

  /**
   * Semantic search across indexed codebase.
   * @param {string} query - Search query
   * @param {number} limit - Max results (default 5)
   * @param {boolean} includeArchived - Include archived files (default false)
   * @returns {Promise<object[]>} up to `limit` rows as returned by the table search
   */
  async search(query, limit = 5, includeArchived = false) {
    const table = await this.openChunksTable();
    if (!table) {
      return [];
    }

    const queryEmbedding = await this.embed(query);

    // Archived rows are filtered after the fetch, so over-fetch to leave
    // room; fewer than `limit` rows can still come back.
    const fetchLimit = includeArchived ? limit : limit * 3;
    let results = await table.search(queryEmbedding).limit(fetchLimit).execute();

    if (!includeArchived) {
      results = results.filter((row) => !row.archived);
    }

    return results.slice(0, limit);
  }

  /**
   * Resolve the glob pattern and the combined ignore list for this index:
   * preset ignore + global ignore + extra ignore. Extra entries without a
   * `*` are treated as directory names and wrapped as a recursive glob.
   *
   * @param {string[]} extraIgnore - Additional patterns to ignore
   * @returns {{ pattern: string, ignore: string[] }}
   */
  resolveGlobSettings(extraIgnore = []) {
    // Unknown index names fall back to the default code preset.
    const preset = INDEX_PRESETS[this.indexName] || DEFAULT_PRESETS.code;
    const ignore = [
      ...(preset.ignore || []),
      ...GLOBAL_IGNORE,
      ...extraIgnore.map((p) => (p.includes('*') ? p : `**/${p}/**`))
    ];
    return { pattern: preset.pattern, ignore };
  }

  /**
   * Check if index needs full reindex (files don't match current patterns).
   * @param {string[]} extraIgnore - Additional patterns to ignore
   * @returns {Promise<{needsReindex: boolean, reason: string, currentCount: number, expectedCount: number}>}
   */
  async checkHealth(extraIgnore = []) {
    const { glob } = await import('glob');
    const { pattern, ignore } = this.resolveGlobSettings(extraIgnore);

    const expectedFiles = await glob(pattern, {
      cwd: this.root,
      nodir: true,
      ignore
    });

    const currentCount = Object.keys(this.hashes).length;
    const expectedCount = expectedFiles.length;

    if (currentCount === 0 && expectedCount > 0) {
      return { needsReindex: true, reason: 'empty', currentCount, expectedCount };
    }

    // Tolerate drift of up to 20% of the expected files, or 5 files,
    // whichever is larger.
    const diff = Math.abs(currentCount - expectedCount);
    const threshold = Math.max(5, expectedCount * 0.2);

    if (diff > threshold) {
      return { needsReindex: true, reason: 'mismatch', currentCount, expectedCount };
    }

    return { needsReindex: false, reason: 'ok', currentCount, expectedCount };
  }

  /**
   * Freshen index - re-check every file recorded in the hash cache.
   *
   * - A file that can no longer be read is dropped from the hash cache and
   *   its chunks are removed from the table (best effort).
   * - A file whose content changed is reindexed. If reindexing fails, the
   *   old hash is kept so the file is retried on the next run; the failure
   *   is not counted as a deletion.
   *
   * @returns {Promise<{checked: number, updated: number, deleted: number}>}
   */
  async freshen() {
    let checked = 0;
    let updated = 0;
    const removed = [];

    for (const relPath of Object.keys(this.hashes)) {
      checked++;
      const filePath = path.join(this.root, relPath);

      let content;
      try {
        content = await fs.readFile(filePath, 'utf8');
      } catch {
        // File deleted or unreadable - remove from index
        delete this.hashes[relPath];
        removed.push(relPath);
        continue;
      }

      if (this.hashes[relPath] === this.fileHash(content)) continue;

      try {
        if (await this.indexFile(filePath)) updated++;
      } catch (e) {
        console.warn(`Cannot reindex ${relPath}: ${e.message}`);
      }
    }

    if (removed.length > 0) {
      try {
        const table = await this.openChunksTable();
        if (table) {
          for (const relPath of removed) {
            await this.deleteFileChunks(table, relPath);
          }
        }
      } catch (e) {
        // Chunk cleanup must not prevent the hash cache from being saved.
        console.warn(`Cannot purge chunks of removed files: ${e.message}`);
      }
      await this.saveHashes();
    }

    return { checked, updated, deleted: removed.length };
  }

  /**
   * Index all files matching the preset pattern.
   * @param {function} onProgress - Optional callback(indexed, total, currentFile)
   * @param {string[]} extraIgnore - Additional patterns to ignore
   * @returns {Promise<{indexed: number, skipped: number, total: number}>}
   */
  async indexAll(onProgress = null, extraIgnore = []) {
    const { glob } = await import('glob');
    const { pattern, ignore } = this.resolveGlobSettings(extraIgnore);

    const files = await glob(pattern, {
      cwd: this.root,
      nodir: true,
      ignore
    });

    let indexed = 0;
    let skipped = 0;

    for (const relPath of files) {
      const filePath = path.join(this.root, relPath);
      try {
        const wasIndexed = await this.indexFile(filePath);
        if (wasIndexed) {
          indexed++;
          if (onProgress) onProgress(indexed, files.length, relPath);
        } else {
          skipped++;
        }
      } catch (e) {
        // One failing file must not abort the whole run; count it as skipped.
        if (DEBUG) console.warn(`[vectorizer] Skipped ${relPath}: ${e.message}`);
        skipped++;
      }
    }

    return { indexed, skipped, total: files.length };
  }

  /**
   * Index a single file by path (convenience method).
   * @param {string} filePath - Absolute path, or path relative to the project root
   * @returns {Promise<boolean>} same as indexFile()
   */
  async indexSingleFile(filePath) {
    const absPath = path.isAbsolute(filePath)
      ? filePath
      : path.join(this.root, filePath);
    return await this.indexFile(absPath);
  }

  /**
   * Get indexing statistics for this index.
   * @returns {Promise<{indexName: string, description: string, model: string, fileCount: number, chunkCount: number}>}
   */
  async getStats() {
    const fileCount = Object.keys(this.hashes).length;
    let chunkCount = 0;

    try {
      const table = await this.openChunksTable();
      if (table) {
        chunkCount = await table.countRows();
      }
    } catch {
      // Best effort: an unreadable table is reported as zero chunks.
    }

    const preset = INDEX_PRESETS[this.indexName];

    return {
      indexName: this.indexName,
      description: preset?.description || 'Custom index',
      model: EMBEDDING_MODEL,
      fileCount,
      chunkCount
    };
  }

  /**
   * Get statistics for all indexes found under the vectors directory.
   * Indexes with neither files nor chunks, or that fail to open, are omitted.
   * @returns {Promise<object[]>} one getStats() result per non-empty index
   */
  async getAllStats() {
    const stats = [];

    try {
      const entries = await fs.readdir(this.baseDir, { withFileTypes: true });

      for (const entry of entries) {
        if (entry.isDirectory() && entry.name !== 'lancedb') {
          try {
            const indexer = await new CodebaseIndexer(this.root, entry.name).init();
            const stat = await indexer.getStats();
            if (stat.fileCount > 0 || stat.chunkCount > 0) {
              stats.push(stat);
            }
          } catch {
            // Best effort: skip an index that cannot be opened.
          }
        }
      }
    } catch {
      // No vectors directory yet: there are no indexes to report.
    }

    return stats;
  }

  /**
   * Clear this index's data and re-initialise it empty.
   */
  async clear() {
    await fs.rm(this.cacheDir, { recursive: true, force: true });
    this.hashes = {};
    await this.init();
  }

  /**
   * Clear all indexes, then re-initialise this one empty.
   */
  async clearAll() {
    await fs.rm(this.baseDir, { recursive: true, force: true });
    this.hashes = {};
    await this.init();
  }

  /**
   * List all available index names (sub-directories of the vectors directory).
   * @returns {Promise<string[]>} empty when the directory cannot be read
   */
  async listIndexes() {
    const indexes = [];

    try {
      const entries = await fs.readdir(this.baseDir, { withFileTypes: true });

      for (const entry of entries) {
        if (entry.isDirectory() && entry.name !== 'lancedb') {
          indexes.push(entry.name);
        }
      }
    } catch {
      // No vectors directory yet: nothing to list.
    }

    return indexes;
  }
}
567
-
568
/**
 * Report the embedding model currently in effect.
 * Reads the module-level setting at call time, so the value reflects
 * config.yaml once the configuration has been loaded.
 * @returns {string} model identifier
 */
function getEmbeddingModel() {
  const activeModel = EMBEDDING_MODEL;
  return activeModel;
}
572
-
573
- export { CodebaseIndexer, INDEX_PRESETS, getEmbeddingModel };
@@ -1,16 +0,0 @@
1
- {
2
- "name": "opencode-vectorizer",
3
- "version": "1.0.0",
4
- "description": "Semantic code search for OpenCode Workflow",
5
- "type": "module",
6
- "private": true,
7
- "main": "index.js",
8
- "dependencies": {
9
- "@xenova/transformers": "^2.17.0",
10
- "glob": "^10.3.10",
11
- "vectordb": "^0.4.0"
12
- },
13
- "engines": {
14
- "node": ">=18"
15
- }
16
- }