@comfanion/usethis_search 0.1.4 → 0.2.0-dev.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,17 +1,36 @@
1
- # @comfanion/usethis_search
1
+ # 🔍 @comfanion/usethis_search
2
2
 
3
- OpenCode plugin that provides semantic search and index management tools.
3
+ **Semantic code search with automatic indexing**
4
4
 
5
- ## Tools
5
+ Forget about `grep` and `find` — search code by meaning, not by text!
6
6
 
7
- - `search` (semantic search)
8
- - `codeindex` (index status, list, reindex)
7
+ ---
9
8
 
10
- ## Storage
9
+ ## ✨ What is this?
11
10
 
12
- - Vectors are stored in `.opencode/vectors/<index>/` in the project.
11
+ An OpenCode plugin that adds **smart search** to your project:
13
12
 
14
- ## Install (OpenCode)
13
+ - 🧠 **Semantic search** — finds code by meaning, even when words don't match
14
+ - 🔀 **Hybrid search (v2)** — combines vector similarity + BM25 keyword matching
15
+ - 🧩 **Semantic chunking (v2)** — structure-aware splitting for Markdown (headings) and code (functions/classes)
16
+ - 🏷️ **Rich metadata (v2)** — filter by file type, language, date, tags
17
+ - ⚡ **Automatic indexing** — files are indexed on change (zero effort)
18
+ - 📦 **Local vectorization** — works offline, no API keys needed
19
+ - 🎯 **Three indexes** — separate for code, docs, and configs
20
+ - 📊 **Quality metrics (v2)** — track search relevance and usage
21
+ - 🌍 **Multilingual** — supports Ukrainian, Russian, and English
22
+
23
+ ---
24
+
25
+ ## 🚀 Quick Start
26
+
27
+ ### Installation
28
+
29
+ ```bash
30
+ npm install @comfanion/usethis_search
31
+ ```
32
+
33
+ ### Configuration
15
34
 
16
35
  Add to `opencode.json`:
17
36
 
@@ -20,3 +39,400 @@ Add to `opencode.json`:
20
39
  "plugin": ["@comfanion/usethis_search"]
21
40
  }
22
41
  ```
42
+
43
+ ### First Run
44
+
45
+ On OpenCode startup, the plugin automatically:
46
+ 1. Creates indexes for code and documentation
47
+ 2. Indexes all project files
48
+ 3. Shows progress via toast notifications
49
+
50
+ **The first indexing run may take a while:**
51
+ - < 20 files — Quick coffee? ☕
52
+ - < 100 files — ~1min. Stretch break? 🧘
53
+ - < 500 files — ~3min. Make coffee ☕ and relax 🛋️
54
+ - 500+ files — ~10min. Go touch grass 🌿 or take a nap 😴
55
+
56
+ ---
57
+
58
+ ## 🎯 How to Use
59
+
60
+ ### Search
61
+
62
+ ```javascript
63
+ // Search for authentication logic
64
+ search({
65
+ query: "authentication logic",
66
+ index: "code"
67
+ })
68
+
69
+ // Search for deployment instructions
70
+ search({
71
+ query: "how to deploy",
72
+ index: "docs"
73
+ })
74
+
75
+ // Search for API keys in configs
76
+ search({
77
+ query: "API keys",
78
+ index: "config"
79
+ })
80
+
81
+ // Search across all indexes
82
+ search({
83
+ query: "database connection",
84
+ searchAll: true
85
+ })
86
+
87
+ // v2: Hybrid search (vector + keyword matching)
88
+ search({
89
+ query: "getUserById",
90
+ hybrid: true
91
+ })
92
+
93
+ // v2: Filter by file type and language
94
+ search({
95
+ query: "authentication logic",
96
+ fileType: "code",
97
+ language: "typescript"
98
+ })
99
+
100
+ // v2: Filter by date
101
+ search({
102
+ query: "recent changes",
103
+ modifiedAfter: "2024-06-01"
104
+ })
105
+
106
+ // v2: Filter by frontmatter tags
107
+ search({
108
+ query: "security",
109
+ tags: "auth,security"
110
+ })
111
+ ```
112
+
113
+ ### Index Management
114
+
115
+ ```javascript
116
+ // List all indexes
117
+ codeindex({ action: "list" })
118
+
119
+ // Check specific index status
120
+ codeindex({ action: "status", index: "code" })
121
+
122
+ // Reindex
123
+ codeindex({ action: "reindex", index: "code" })
124
+
125
+ // Index specific directory
126
+ codeindex({
127
+ action: "reindex",
128
+ index: "docs",
129
+ dir: "docs/"
130
+ })
131
+
132
+ // v2: Run quality tests against gold dataset
133
+ codeindex({ action: "test", index: "code" })
134
+ ```
135
+
136
+ ---
137
+
138
+ ## 🧠 How It Works
139
+
140
+ ### Semantic Search
141
+
142
+ Instead of searching for exact text matches, the plugin:
143
+ 1. **Cleans** content (removes TOC, noise, auto-generated markers)
144
+ 2. **Chunks** intelligently (Markdown by headings, code by functions/classes)
145
+ 3. Converts chunks into **vectors** (numerical representations of meaning)
146
+ 4. Compares the vector of your query with the vectors of the indexed chunks
147
+ 5. Optionally combines with **BM25 keyword search** (hybrid mode)
148
+ 6. Returns the most **semantically similar** fragments with rich metadata
149
+
150
+ **Example:**
151
+ ```javascript
152
+ // You search for: "user authentication"
153
+ // It will find code with:
154
+ // - "login handler"
155
+ // - "verify credentials"
156
+ // - "session management"
157
+ // Even if the words "user" and "authentication" are absent!
158
+ ```
159
+
160
+ ### Automatic Indexing
161
+
162
+ The plugin tracks file changes and automatically updates indexes:
163
+
164
+ 1. **On OpenCode startup** — checks all indexes, updates stale ones
165
+ 2. **On file edit** — queues file for reindexing
166
+ 3. **After 1 second** (debounce) — indexes changed files
167
+
168
+ **Configuration in `.opencode/vectorizer.yaml`:**
169
+
170
+ ```yaml
171
+ vectorizer:
172
+ enabled: true # Enable plugin
173
+ auto_index: true # Automatic indexing
174
+ debounce_ms: 1000 # Delay before indexing (ms)
175
+
176
+ # v2: Content cleaning
177
+ cleaning:
178
+ remove_toc: true
179
+ remove_frontmatter_metadata: false
180
+ remove_imports: false
181
+ remove_comments: false
182
+
183
+ # v2: Semantic chunking
184
+ chunking:
185
+ strategy: "semantic" # fixed | semantic
186
+ markdown:
187
+ split_by_headings: true
188
+ min_chunk_size: 200
189
+ max_chunk_size: 2000
190
+ preserve_heading_hierarchy: true
191
+ code:
192
+ split_by_functions: true
193
+ include_function_signature: true
194
+ min_chunk_size: 300
195
+ max_chunk_size: 1500
196
+
197
+ # v2: Hybrid search
198
+ search:
199
+ hybrid: false # vector + BM25
200
+ bm25_weight: 0.3
201
+
202
+ # v2: Quality monitoring
203
+ quality:
204
+ enable_metrics: false
205
+ enable_cache: true
206
+
207
+ indexes:
208
+ code:
209
+ enabled: true
210
+ docs:
211
+ enabled: true
212
+ config:
213
+ enabled: false
214
+
215
+ exclude:
216
+ - node_modules
217
+ - vendor
218
+ - dist
219
+ - build
220
+ - __pycache__
221
+ ```
222
+
223
+ ---
224
+
225
+ ## 📦 Data Structure
226
+
227
+ Indexes are stored locally in your project:
228
+
229
+ ```
230
+ .opencode/
231
+ vectors/
232
+ code/ # Code index
233
+ data/ # LanceDB tables
234
+ hashes.json # File hashes (for change detection)
235
+ docs/ # Documentation index
236
+ data/
237
+ hashes.json
238
+ vectorizer.yaml # Configuration
239
+ indexer.log # Indexing log (written when DEBUG is set, e.g. DEBUG=file-indexer or DEBUG=*)
240
+ ```
241
+
242
+ ---
243
+
244
+ ## 🎨 Usage Examples
245
+
246
+ ### 1. Find all API endpoints
247
+
248
+ ```javascript
249
+ search({
250
+ query: "REST API endpoints routes",
251
+ index: "code"
252
+ })
253
+ ```
254
+
255
+ ### 2. Find testing documentation
256
+
257
+ ```javascript
258
+ search({
259
+ query: "how to write tests",
260
+ index: "docs"
261
+ })
262
+ ```
263
+
264
+ ### 3. Find database configuration
265
+
266
+ ```javascript
267
+ search({
268
+ query: "database connection settings",
269
+ index: "config"
270
+ })
271
+ ```
272
+
273
+ ### 4. Find error handling
274
+
275
+ ```javascript
276
+ search({
277
+ query: "error handling try catch",
278
+ index: "code",
279
+ limit: 20 // More results
280
+ })
281
+ ```
282
+
283
+ ### 5. Search across entire project
284
+
285
+ ```javascript
286
+ search({
287
+ query: "authentication",
288
+ searchAll: true // Searches in code, docs, config
289
+ })
290
+ ```
291
+
292
+ ---
293
+
294
+ ## 🛠️ Configuration
295
+
296
+ ### Disable automatic indexing
297
+
298
+ ```yaml
299
+ # .opencode/vectorizer.yaml
300
+ vectorizer:
301
+ enabled: true
302
+ auto_index: false # Manual indexing only
303
+ ```
304
+
305
+ ### Add custom index
306
+
307
+ ```yaml
308
+ vectorizer:
309
+ indexes:
310
+ tests:
311
+ enabled: true
312
+ extensions: [.test.js, .spec.ts]
313
+ ```
314
+
315
+ ### Change indexing delay
316
+
317
+ ```yaml
318
+ vectorizer:
319
+ debounce_ms: 3000 # 3 seconds instead of 1
320
+ ```
321
+
322
+ ### Temporarily skip automatic indexing (environment variable)
323
+
324
+ ```bash
325
+ export OPENCODE_SKIP_AUTO_INDEX=1
326
+ ```
327
+
328
+ ---
329
+
330
+ ## 🐛 Debugging
331
+
332
+ ### Enable logs
333
+
334
+ ```bash
335
+ export DEBUG=file-indexer
336
+ # or
337
+ export DEBUG=*
338
+ ```
339
+
340
+ Logs will be in `.opencode/indexer.log`
341
+
342
+ ### Reindex everything
343
+
344
+ ```javascript
345
+ codeindex({ action: "reindex", index: "code" })
346
+ codeindex({ action: "reindex", index: "docs" })
347
+ ```
348
+
349
+ ### Check index status
350
+
351
+ ```javascript
352
+ codeindex({ action: "list" })
353
+ ```
354
+
355
+ ---
356
+
357
+ ## 🌟 Advantages
358
+
359
+ ### Compared to `grep`/`find`
360
+
361
+ | Feature | grep/find | usethis_search |
362
+ |---------|-----------|----------------|
363
+ | Text search | ✅ | ✅ |
364
+ | Semantic search | ❌ | ✅ |
365
+ | Finds synonyms | ❌ | ✅ |
366
+ | Understands context | ❌ | ✅ |
367
+ | Works offline | ✅ | ✅ |
368
+ | Auto-updates | ❌ | ✅ |
369
+
370
+ ### Compared to online search (GitHub Copilot, ChatGPT)
371
+
372
+ | Feature | Online | usethis_search |
373
+ |---------|--------|----------------|
374
+ | Works offline | ❌ | ✅ |
375
+ | Privacy | ❌ | ✅ |
376
+ | Free | ❌ | ✅ |
377
+ | Speed | 🐌 | ⚡ |
378
+ | Knows your code | ❌ | ✅ |
379
+
380
+ ---
381
+
382
+ ## 📊 Technical Details
383
+
384
+ - **Vectorization:** [@xenova/transformers](https://github.com/xenova/transformers.js) (ONNX Runtime)
385
+ - **Vector DB:** [LanceDB](https://lancedb.com/) (local, serverless)
386
+ - **Model:** `Xenova/all-MiniLM-L6-v2` (multilingual, 384 dimensions)
387
+ - **Model size:** ~23 MB (downloaded once)
388
+ - **Speed:** ~0.5 sec/file (after model loading)
389
+
390
+ ### v2 Architecture
391
+
392
+ ```
393
+ File → Content Cleaner → Chunker Factory → Embedder → LanceDB
394
+ ├── Markdown Chunker (heading-aware)
395
+ ├── Code Chunker (function/class-aware)
396
+ └── Fixed Chunker (fallback)
397
+
398
+ Query → Query Cache → Embedder → Vector Search ─┐
399
+ └──────────→ BM25 Search ────┤→ Hybrid Merge → Filter → Results
400
+
401
+ Metadata Filter (type, lang, date, tags)
402
+ ```
403
+
404
+ ### New Modules (v2)
405
+
406
+ | Module | Purpose |
407
+ |--------|---------|
408
+ | `content-cleaner.ts` | Remove noise (TOC, breadcrumbs, markers) |
409
+ | `metadata-extractor.ts` | Extract file_type, language, tags, dates |
410
+ | `markdown-chunker.ts` | Heading-aware splitting with hierarchy |
411
+ | `code-chunker.ts` | Function/class-aware splitting |
412
+ | `chunker-factory.ts` | Route to correct chunker by file type |
413
+ | `bm25-index.ts` | Inverted index for keyword search |
414
+ | `hybrid-search.ts` | Merge vector + BM25 scores |
415
+ | `query-cache.ts` | LRU cache for query embeddings |
416
+ | `search-metrics.ts` | Track search quality metrics |
417
+
418
+ ---
419
+
420
+ ## 🤝 Contributing
421
+
422
+ Found a bug? Have an idea? Open an issue or PR!
423
+
424
+ ---
425
+
426
+ ## 📄 License
427
+
428
+ MIT
429
+
430
+ ---
431
+
432
+ ## 🎉 Authors
433
+
434
+ Made with ❤️ by the **Comfanion** team
435
+
436
+ ---
437
+
438
+ **Search smart, not hard!** 🚀
package/file-indexer.ts CHANGED
@@ -326,6 +326,8 @@ async function ensureIndexOnSessionStart(
326
326
  return { totalFiles, elapsedSeconds, action }
327
327
  }
328
328
 
329
+ const STALE_THRESHOLD_MS = 5 * 60 * 1000 // 5 minutes — evict stuck entries
330
+
329
331
  async function processPendingFiles(projectRoot: string, config: VectorizerConfig): Promise<void> {
330
332
  if (pendingFiles.size === 0) return
331
333
  if (SKIP_AUTO_INDEX) {
@@ -335,6 +337,7 @@ async function processPendingFiles(projectRoot: string, config: VectorizerConfig
335
337
 
336
338
  const now = Date.now()
337
339
  const filesToProcess: Map<string, string[]> = new Map()
340
+ const staleKeys: string[] = []
338
341
 
339
342
  for (const [filePath, info] of pendingFiles.entries()) {
340
343
  if (now - info.timestamp >= config.debounce_ms) {
@@ -342,9 +345,17 @@ async function processPendingFiles(projectRoot: string, config: VectorizerConfig
342
345
  files.push(filePath)
343
346
  filesToProcess.set(info.indexName, files)
344
347
  pendingFiles.delete(filePath)
348
+ } else if (now - info.timestamp > STALE_THRESHOLD_MS) {
349
+ staleKeys.push(filePath)
345
350
  }
346
351
  }
347
352
 
353
+ // Evict entries stuck for >5 minutes (prevents unbounded growth)
354
+ for (const key of staleKeys) {
355
+ debug(`Evicting stale pending file: ${key}`)
356
+ pendingFiles.delete(key)
357
+ }
358
+
348
359
  if (filesToProcess.size === 0) return
349
360
 
350
361
  debug(`Processing ${filesToProcess.size} index(es)...`)
@@ -425,6 +436,9 @@ export const FileIndexerPlugin: Plugin = async ({ directory, client }) => {
425
436
  }, 1000)
426
437
  }
427
438
 
439
+ let lastProcessTime = Date.now()
440
+ const MAX_DEBOUNCE_WAIT_MS = 5000 // Force processing after 5s of rapid edits
441
+
428
442
  function queueFileForIndexing(filePath: string): void {
429
443
  const relativePath = path.relative(directory, filePath)
430
444
  if (relativePath.startsWith("..") || path.isAbsolute(relativePath)) return
@@ -439,9 +453,15 @@ export const FileIndexerPlugin: Plugin = async ({ directory, client }) => {
439
453
  if (processingTimeout) {
440
454
  clearTimeout(processingTimeout)
441
455
  }
456
+
457
+ // If rapid edits keep resetting the timer, force processing after MAX_DEBOUNCE_WAIT_MS
458
+ const timeSinceLast = Date.now() - lastProcessTime
459
+ const waitTime = timeSinceLast > MAX_DEBOUNCE_WAIT_MS ? 0 : config.debounce_ms + 100
460
+
442
461
  processingTimeout = setTimeout(async () => {
462
+ lastProcessTime = Date.now()
443
463
  await processPendingFiles(directory, config)
444
- }, config.debounce_ms + 100)
464
+ }, waitTime)
445
465
  }
446
466
 
447
467
  return {
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@comfanion/usethis_search",
3
- "version": "0.1.4",
4
- "description": "OpenCode plugin: semantic search + code index management",
3
+ "version": "0.2.0-dev.0",
4
+ "description": "OpenCode plugin: semantic search + code index management (v2: hybrid search, semantic chunking, metadata filtering)",
5
5
  "type": "module",
6
6
  "main": "./index.ts",
7
7
  "exports": {
@@ -16,6 +16,16 @@
16
16
  "tools/search.ts",
17
17
  "tools/codeindex.ts",
18
18
  "vectorizer/index.js",
19
+ "vectorizer/content-cleaner.ts",
20
+ "vectorizer/metadata-extractor.ts",
21
+ "vectorizer/bm25-index.ts",
22
+ "vectorizer/hybrid-search.ts",
23
+ "vectorizer/query-cache.ts",
24
+ "vectorizer/search-metrics.ts",
25
+ "vectorizer/chunkers/markdown-chunker.ts",
26
+ "vectorizer/chunkers/code-chunker.ts",
27
+ "vectorizer/chunkers/chunker-factory.ts",
28
+ "vectorizer.yaml",
19
29
  "README.md",
20
30
  "LICENSE"
21
31
  ],