npm - @comfanion/usethis_search - Versions diffs - 0.1.4 → 0.2.0-dev.0 - Mend

@comfanion/usethis_search 0.1.4 → 0.2.0-dev.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

package/README.md +424 -8
package/file-indexer.ts +21 -1
package/package.json +12 -2
package/tools/codeindex.ts +135 -16
package/tools/search.ts +46 -11
package/vectorizer/bm25-index.ts +155 -0
package/vectorizer/chunkers/chunker-factory.ts +98 -0
package/vectorizer/chunkers/code-chunker.ts +325 -0
package/vectorizer/chunkers/markdown-chunker.ts +177 -0
package/vectorizer/content-cleaner.ts +136 -0
package/vectorizer/hybrid-search.ts +97 -0
package/vectorizer/index.js +395 -16
package/vectorizer/metadata-extractor.ts +125 -0
package/vectorizer/query-cache.ts +126 -0
package/vectorizer/search-metrics.ts +155 -0
package/vectorizer.yaml +81 -0

package/README.md CHANGED Viewed

@@ -1,17 +1,36 @@
-# @comfanion/usethis_search
+# 🔍 @comfanion/usethis_search
-OpenCode plugin that provides semantic search and index management tools.
+**Semantic code search with automatic indexing**
-## Tools
+Forget about `grep` and `find` — search code by meaning, not by text!
-- `search` (semantic search)
-- `codeindex` (index status, list, reindex)
+---
-## Storage
+## ✨ What is this?
-- Vectors are stored in `.opencode/vectors/<index>/` in the project.
+An OpenCode plugin that adds **smart search** to your project:
-## Install (OpenCode)
+- 🧠 **Semantic search** — finds code by meaning, even when words don't match
+- 🔀 **Hybrid search (v2)** — combines vector similarity + BM25 keyword matching
+- 🧩 **Semantic chunking (v2)** — structure-aware splitting for Markdown (headings) and code (functions/classes)
+- 🏷️ **Rich metadata (v2)** — filter by file type, language, date, tags
+- ⚡ **Automatic indexing** — files are indexed on change (zero effort)
+- 📦 **Local vectorization** — works offline, no API keys needed
+- 🎯 **Three indexes** — separate for code, docs, and configs
+- 📊 **Quality metrics (v2)** — track search relevance and usage
+- 🌍 **Multilingual** — supports Ukrainian, Russian, and English
+---
+## 🚀 Quick Start
+### Installation
+```bash
+npm install @comfanion/usethis_search
+```
+### Configuration
 Add to `opencode.json`:
@@ -20,3 +39,400 @@ Add to `opencode.json`:
   "plugin": ["@comfanion/usethis_search"]
 }
 ```
+### First Run
+On OpenCode startup, the plugin automatically:
+1. Creates indexes for code and documentation
+2. Indexes all project files
+3. Shows progress via toast notifications
+**First indexing may take time:**
+- < 20 files — Quick coffee? ☕
+- < 100 files — ~1min. Stretch break? 🧘
+- < 500 files — ~3min. Make coffee ☕ and relax 🛋️
+- 500+ files — ~10min. Go touch grass 🌿 or take a nap 😴
+---
+## 🎯 How to Use
+### Search
+```javascript
+// Search for authentication logic
+search({
+  query: "authentication logic",
+  index: "code"
+})
+// Search for deployment instructions
+search({
+  query: "how to deploy",
+  index: "docs"
+})
+// Search for API keys in configs
+search({
+  query: "API keys",
+  index: "config"
+})
+// Search across all indexes
+search({
+  query: "database connection",
+  searchAll: true
+})
+// v2: Hybrid search (vector + keyword matching)
+search({
+  query: "getUserById",
+  hybrid: true
+})
+// v2: Filter by file type and language
+search({
+  query: "authentication logic",
+  fileType: "code",
+  language: "typescript"
+})
+// v2: Filter by date
+search({
+  query: "recent changes",
+  modifiedAfter: "2024-06-01"
+})
+// v2: Filter by frontmatter tags
+search({
+  query: "security",
+  tags: "auth,security"
+})
+```
+### Index Management
+```javascript
+// List all indexes
+codeindex({ action: "list" })
+// Check specific index status
+codeindex({ action: "status", index: "code" })
+// Reindex
+codeindex({ action: "reindex", index: "code" })
+// Index specific directory
+codeindex({
+  action: "reindex",
+  index: "docs",
+  dir: "docs/"
+})
+// v2: Run quality tests against gold dataset
+codeindex({ action: "test", index: "code" })
+```
+---
+## 🧠 How It Works
+### Semantic Search
+Instead of searching for exact text matches, the plugin:
+1. **Cleans** content (removes TOC, noise, auto-generated markers)
+2. **Chunks** intelligently (Markdown by headings, code by functions/classes)
+3. Converts chunks into **vectors** (numerical representations of meaning)
+4. Compares the vector of your query with the vectors of the indexed chunks
+5. Optionally combines with **BM25 keyword search** (hybrid mode)
+6. Returns the most **semantically similar** fragments with rich metadata
+**Example:**
+```javascript
+// You search for: "user authentication"
+// It will find code with:
+// - "login handler"
+// - "verify credentials"
+// - "session management"
+// Even if the words "user" and "authentication" are absent!
+```
+### Automatic Indexing
+The plugin tracks file changes and automatically updates indexes:
+1. **On OpenCode startup** — checks all indexes, updates stale ones
+2. **On file edit** — queues file for reindexing
+3. **After 1 second** (debounce) — indexes changed files
+**Configuration in `.opencode/vectorizer.yaml`:**
+```yaml
+vectorizer:
+  enabled: true          # Enable plugin
+  auto_index: true       # Automatic indexing
+  debounce_ms: 1000      # Delay before indexing (ms)
+  # v2: Content cleaning
+  cleaning:
+    remove_toc: true
+    remove_frontmatter_metadata: false
+    remove_imports: false
+    remove_comments: false
+  # v2: Semantic chunking
+  chunking:
+    strategy: "semantic"  # fixed | semantic
+    markdown:
+      split_by_headings: true
+      min_chunk_size: 200
+      max_chunk_size: 2000
+      preserve_heading_hierarchy: true
+    code:
+      split_by_functions: true
+      include_function_signature: true
+      min_chunk_size: 300
+      max_chunk_size: 1500
+  # v2: Hybrid search
+  search:
+    hybrid: false         # vector + BM25
+    bm25_weight: 0.3
+  # v2: Quality monitoring
+  quality:
+    enable_metrics: false
+    enable_cache: true
+  indexes:
+    code:
+      enabled: true
+    docs:
+      enabled: true
+    config:
+      enabled: false
+  exclude:
+    - node_modules
+    - vendor
+    - dist
+    - build
+    - __pycache__
+```
+---
+## 📦 Data Structure
+Indexes are stored locally in your project:
+```
+.opencode/
+  vectors/
+    code/              # Code index
+      data/            # LanceDB tables
+      hashes.json      # File hashes (for change detection)
+    docs/              # Documentation index
+      data/
+      hashes.json
+  vectorizer.yaml      # Configuration
+  indexer.log          # Indexing log (written when DEBUG is set, e.g. DEBUG=file-indexer or DEBUG=*)
+```
+---
+## 🎨 Usage Examples
+### 1. Find all API endpoints
+```javascript
+search({
+  query: "REST API endpoints routes",
+  index: "code"
+})
+```
+### 2. Find testing documentation
+```javascript
+search({
+  query: "how to write tests",
+  index: "docs"
+})
+```
+### 3. Find database configuration
+```javascript
+search({
+  query: "database connection settings",
+  index: "config"
+})
+```
+### 4. Find error handling
+```javascript
+search({
+  query: "error handling try catch",
+  index: "code",
+  limit: 20  // More results
+})
+```
+### 5. Search across entire project
+```javascript
+search({
+  query: "authentication",
+  searchAll: true  // Searches in code, docs, config
+})
+```
+---
+## 🛠️ Configuration
+### Disable automatic indexing
+```yaml
+# .opencode/vectorizer.yaml
+vectorizer:
+  enabled: true
+  auto_index: false  # Manual indexing only
+```
+### Add custom index
+```yaml
+vectorizer:
+  indexes:
+    tests:
+      enabled: true
+      extensions: [.test.js, .spec.ts]
+```
+### Change indexing delay
+```yaml
+vectorizer:
+  debounce_ms: 3000  # 3 seconds instead of 1
+```
+### Temporarily disable automatic indexing
+```bash
+export OPENCODE_SKIP_AUTO_INDEX=1
+```
+---
+## 🐛 Debugging
+### Enable logs
+```bash
+export DEBUG=file-indexer
+# or
+export DEBUG=*
+```
+Logs will be in `.opencode/indexer.log`
+### Reindex everything
+```javascript
+codeindex({ action: "reindex", index: "code" })
+codeindex({ action: "reindex", index: "docs" })
+```
+### Check index status
+```javascript
+codeindex({ action: "list" })
+```
+---
+## 🌟 Advantages
+### Compared to `grep`/`find`
+| Feature | grep/find | usethis_search |
+|---------|-----------|----------------|
+| Text search | ✅ | ✅ |
+| Semantic search | ❌ | ✅ |
+| Finds synonyms | ❌ | ✅ |
+| Understands context | ❌ | ✅ |
+| Works offline | ✅ | ✅ |
+| Auto-updates | ❌ | ✅ |
+### Compared to online search (GitHub Copilot, ChatGPT)
+| Feature | Online | usethis_search |
+|---------|--------|----------------|
+| Works offline | ❌ | ✅ |
+| Privacy | ❌ | ✅ |
+| Free | ❌ | ✅ |
+| Speed | 🐌 | ⚡ |
+| Knows your code | ❌ | ✅ |
+---
+## 📊 Technical Details
+- **Vectorization:** [@xenova/transformers](https://github.com/xenova/transformers.js) (ONNX Runtime)
+- **Vector DB:** [LanceDB](https://lancedb.com/) (local, serverless)
+- **Model:** `Xenova/all-MiniLM-L6-v2` (384 dimensions; trained primarily on English text, so quality on other languages may be lower)
+- **Model size:** ~23 MB (downloaded once)
+- **Speed:** ~0.5 sec/file (after model loading)
+### v2 Architecture
+```
+File → Content Cleaner → Chunker Factory → Embedder → LanceDB
+                           ├── Markdown Chunker (heading-aware)
+                           ├── Code Chunker (function/class-aware)
+                           └── Fixed Chunker (fallback)
+Query → Query Cache → Embedder → Vector Search ─┐
+                    └──────────→ BM25 Search ────┤→ Hybrid Merge → Filter → Results
+                                                 │
+                                        Metadata Filter (type, lang, date, tags)
+```
+### New Modules (v2)
+| Module | Purpose |
+|--------|---------|
+| `content-cleaner.ts` | Remove noise (TOC, breadcrumbs, markers) |
+| `metadata-extractor.ts` | Extract file_type, language, tags, dates |
+| `markdown-chunker.ts` | Heading-aware splitting with hierarchy |
+| `code-chunker.ts` | Function/class-aware splitting |
+| `chunker-factory.ts` | Route to correct chunker by file type |
+| `bm25-index.ts` | Inverted index for keyword search |
+| `hybrid-search.ts` | Merge vector + BM25 scores |
+| `query-cache.ts` | LRU cache for query embeddings |
+| `search-metrics.ts` | Track search quality metrics |
+---
+## 🤝 Contributing
+Found a bug? Have an idea? Open an issue or PR!
+---
+## 📄 License
+MIT
+---
+## 🎉 Authors
+Made with ❤️ by the **Comfanion** team
+---
+**Search smart, not hard!** 🚀

package/file-indexer.ts CHANGED Viewed

@@ -326,6 +326,8 @@ async function ensureIndexOnSessionStart(
   return { totalFiles, elapsedSeconds, action }
 }
+const STALE_THRESHOLD_MS = 5 * 60 * 1000 // 5 minutes — evict stuck entries
 async function processPendingFiles(projectRoot: string, config: VectorizerConfig): Promise<void> {
   if (pendingFiles.size === 0) return
   if (SKIP_AUTO_INDEX) {
@@ -335,6 +337,7 @@ async function processPendingFiles(projectRoot: string, config: VectorizerConfig
   const now = Date.now()
   const filesToProcess: Map<string, string[]> = new Map()
+  const staleKeys: string[] = []
   for (const [filePath, info] of pendingFiles.entries()) {
     if (now - info.timestamp >= config.debounce_ms) {
@@ -342,9 +345,17 @@ async function processPendingFiles(projectRoot: string, config: VectorizerConfig
       files.push(filePath)
       filesToProcess.set(info.indexName, files)
       pendingFiles.delete(filePath)
+    } else if (now - info.timestamp > STALE_THRESHOLD_MS) {
+      staleKeys.push(filePath)
     }
   }
+  // Evict entries stuck for >5 minutes (prevents unbounded growth)
+  for (const key of staleKeys) {
+    debug(`Evicting stale pending file: ${key}`)
+    pendingFiles.delete(key)
+  }
   if (filesToProcess.size === 0) return
   debug(`Processing ${filesToProcess.size} index(es)...`)
@@ -425,6 +436,9 @@ export const FileIndexerPlugin: Plugin = async ({ directory, client }) => {
     }, 1000)
   }
+  let lastProcessTime = Date.now()
+  const MAX_DEBOUNCE_WAIT_MS = 5000 // Force processing after 5s of rapid edits
   function queueFileForIndexing(filePath: string): void {
     const relativePath = path.relative(directory, filePath)
     if (relativePath.startsWith("..") || path.isAbsolute(relativePath)) return
@@ -439,9 +453,15 @@ export const FileIndexerPlugin: Plugin = async ({ directory, client }) => {
     if (processingTimeout) {
       clearTimeout(processingTimeout)
     }
+    // If rapid edits keep resetting the timer, force processing after MAX_DEBOUNCE_WAIT_MS
+    const timeSinceLast = Date.now() - lastProcessTime
+    const waitTime = timeSinceLast > MAX_DEBOUNCE_WAIT_MS ? 0 : config.debounce_ms + 100
     processingTimeout = setTimeout(async () => {
+      lastProcessTime = Date.now()
       await processPendingFiles(directory, config)
-    }, config.debounce_ms + 100)
+    }, waitTime)
   }
   return {

package/package.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "name": "@comfanion/usethis_search",
-  "version": "0.1.4",
-  "description": "OpenCode plugin: semantic search + code index management",
+  "version": "0.2.0-dev.0",
+  "description": "OpenCode plugin: semantic search + code index management (v2: hybrid search, semantic chunking, metadata filtering)",
   "type": "module",
   "main": "./index.ts",
   "exports": {
@@ -16,6 +16,16 @@
     "tools/search.ts",
     "tools/codeindex.ts",
     "vectorizer/index.js",
+    "vectorizer/content-cleaner.ts",
+    "vectorizer/metadata-extractor.ts",
+    "vectorizer/bm25-index.ts",
+    "vectorizer/hybrid-search.ts",
+    "vectorizer/query-cache.ts",
+    "vectorizer/search-metrics.ts",
+    "vectorizer/chunkers/markdown-chunker.ts",
+    "vectorizer/chunkers/code-chunker.ts",
+    "vectorizer/chunkers/chunker-factory.ts",
+    "vectorizer.yaml",
     "README.md",
     "LICENSE"
   ],