npm - @comfanion/usethis_search - Versions diffs - 3.0.0-dev.0 → 3.0.0-dev.10 - Mend

@comfanion/usethis_search 3.0.0-dev.0 → 3.0.0-dev.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13)

package/api.ts +92 -0
package/file-indexer.ts +14 -1
package/index.ts +20 -6
package/package.json +5 -3
package/tools/codeindex.ts +173 -7
package/tools/search.ts +1 -1
package/vectorizer/analyzers/lsp-analyzer.ts +225 -94
package/vectorizer/analyzers/lsp-client.ts +369 -0
package/vectorizer/graph-builder.ts +106 -3
package/vectorizer/graph-db.ts +192 -0
package/vectorizer/{index.js → index.ts} +114 -11
package/vectorizer/usage-tracker.ts +204 -0
package/tools/read-interceptor.ts +0 -54

package/vectorizer/{index.js → index.ts} RENAMED Viewed

@@ -17,6 +17,7 @@ import { QueryCache, DEFAULT_CACHE_CONFIG } from "./query-cache.ts";
 import { SearchMetrics } from "./search-metrics.ts";
 import { GraphDB } from "./graph-db.ts";
 import { GraphBuilder } from "./graph-builder.ts";
+import { UsageTracker } from "./usage-tracker.ts";
 // Suppress transformers.js logs unless DEBUG is set
 const DEBUG = process.env.DEBUG?.includes("vectorizer") || process.env.DEBUG === "*";
@@ -354,6 +355,8 @@ class CodebaseIndexer {
     this.metrics = null;       // lazy-loaded SearchMetrics
     this.graphDB = null;       // Graph DB for relationships
     this.graphBuilder = null;   // Graph builder orchestrator
+    this._chunkCache = null;   // Lazy Map<chunk_id, row> for findChunkById
+    this.usageTracker = null;  // Usage tracking & provenance (v3)
   }
   async init() {
@@ -371,6 +374,10 @@ class CodebaseIndexer {
     this.graphDB = await new GraphDB(graphPath).init();
     this.graphBuilder = new GraphBuilder(this.graphDB, this.root);
+    // Usage tracker — provenance & usage stats
+    this.usageTracker = new UsageTracker(this.cacheDir);
+    await this.usageTracker.load();
     return this;
   }
@@ -399,6 +406,18 @@ class CodebaseIndexer {
     }
     this._bm25Rows = null;
     this.metrics = null;
+    // Close graph DB to release LevelDB lock
+    if (this.graphDB) {
+      try { await this.graphDB.close(); } catch { /* best effort */ }
+      this.graphDB = null;
+      this.graphBuilder = null;
+    }
+    // Save & release usage tracker
+    if (this.usageTracker) {
+      try { await this.usageTracker.save(); } catch { /* best effort */ }
+      this.usageTracker = null;
+    }
+    this._chunkCache = null;
     clearQueryCache();
     if (global.gc) global.gc();
   }
@@ -521,6 +540,13 @@ class CodebaseIndexer {
     await this.graphDB.deleteByFile(relPath);
     await this.graphBuilder.buildEdges(relPath, content, chunksWithIds, fileMeta.file_type);
+    // FR-054: Store graph build timestamp + file hash as metadata triple
+    try {
+      await this.graphDB.setFileMeta(relPath, hash, Date.now());
+    } catch {
+      // non-fatal — metadata is advisory
+    }
     const data = [];
     for (let i = 0; i < chunksWithIds.length; i++) {
       const embedding = await this.embed(chunksWithIds[i].content);
@@ -577,7 +603,13 @@ class CodebaseIndexer {
     if (!tables.includes(tableName)) return null;
     const table = await this.db.openTable(tableName);
-    const allRows = await table.search([0]).limit(100000).execute();
+    let allRows;
+    try {
+      allRows = await table.search([0]).limit(100000).execute();
+    } catch (e) {
+      if (DEBUG) console.log("[vectorizer] BM25 index build failed (corrupted table?):", e.message);
+      return null;
+    }
     if (allRows.length === 0) return null;
@@ -617,7 +649,14 @@ class CodebaseIndexer {
                        (options.tags && options.tags.length > 0);
     const isHybrid = HYBRID_CONFIG.enabled || options.hybrid;
     const fetchLimit = (hasFilters || isHybrid) ? Math.max(limit * 3, 50) : limit;
-    let results = await table.search(queryEmbedding).limit(fetchLimit).execute();
+    let results;
+    try {
+      results = await table.search(queryEmbedding).limit(fetchLimit).execute();
+    } catch (e) {
+      // LanceDB schema error (e.g. missing vector column) — index is corrupted
+      if (DEBUG) console.log("[vectorizer] Vector search failed (corrupted index?):", e.message);
+      return [];
+    }
     // ── Hybrid search ───────────────────────────────────────────────────────
     if (HYBRID_CONFIG.enabled || options.hybrid) {
@@ -769,20 +808,59 @@ class CodebaseIndexer {
         neighbors.sort((a, b) => b.score - a.score);
         result.relatedContext = neighbors.slice(0, 3);
+        // FR-060: Record provenance for each attached chunk
+        if (this.usageTracker) {
+          for (const n of result.relatedContext) {
+            this.usageTracker.recordProvenance(query, result.chunk_id, n.chunk_id, n.relation);
+          }
+        }
       }
     }
+    // FR-061: Record usage counts for all returned chunks (main + attached)
+    if (this.usageTracker) {
+      const allChunkIds = [];
+      for (const r of finalResults) {
+        if (r.chunk_id) allChunkIds.push(r.chunk_id);
+        if (r.relatedContext) {
+          for (const rc of r.relatedContext) {
+            if (rc.chunk_id) allChunkIds.push(rc.chunk_id);
+          }
+        }
+      }
+      this.usageTracker.recordSearchResults(allChunkIds);
+      // Save asynchronously (non-blocking)
+      this.usageTracker.save().catch(() => {});
+    }
     return finalResults;
   }
   async findChunkById(chunkId) { // resolves to the row whose chunk_id matches, or null
-    const tableName = "chunks";
-    const tables = await this.db.tableNames();
-    if (!tables.includes(tableName)) return null;
+    // Lazy-build an in-memory Map keyed by chunk_id on first call, so later lookups are a Map.get instead of a table scan.
+    // The cache lives until unloadModel() clears it. NOTE(review): nothing in this method refreshes it after new chunks are indexed — confirm callers reset _chunkCache.
+    if (!this._chunkCache) {
+      const tableName = "chunks";
+      const tables = await this.db.tableNames();
+      if (!tables.includes(tableName)) return null; // no table yet: cache stays unset, so the next call checks again
-    const table = await this.db.openTable(tableName);
-    const rows = await table.search([0]).limit(100000).execute();
-    return rows.find(r => r.chunk_id === chunkId) || null;
+      const table = await this.db.openTable(tableName);
+      let rows;
+      try {
+        rows = await table.search([0]).limit(100000).execute(); // presumably a dummy query used to fetch all rows — TODO confirm; rows past the 100000 limit are never cached
+      } catch (e) {
+        if (DEBUG) console.log("[vectorizer] Chunk cache build failed (corrupted table?):", e.message);
+        return null; // cache stays unset, so a later call retries the build
+      }
+      this._chunkCache = new Map();
+      for (const row of rows) {
+        if (row.chunk_id) { // rows with a falsy chunk_id are left out of the cache
+          this._chunkCache.set(row.chunk_id, row); // a repeated chunk_id overwrites the earlier row
+        }
+      }
+    }
+    return this._chunkCache.get(chunkId) || null;
   }
   cosineSimilarity(vecA, vecB) {
@@ -880,14 +958,17 @@ class CodebaseIndexer {
     let indexed = 0;
     let skipped = 0;
+    const total = files.length;
-    for (const relPath of files) {
+    for (let i = 0; i < files.length; i++) {
+      const relPath = files[i];
       const filePath = path.join(this.root, relPath);
       try {
         const wasIndexed = await this.indexFile(filePath);
         if (wasIndexed) {
           indexed++;
-          if (onProgress) onProgress(indexed, files.length, relPath);
+          // FR-053: progress indicator includes graph building phase
+          if (onProgress) onProgress(indexed, total, relPath, i + 1);
         } else {
           skipped++;
         }
@@ -896,7 +977,29 @@ class CodebaseIndexer {
       }
     }
-    return { indexed, skipped, total: files.length };
+    // FR-005: Build semantic similarity edges as post-pass
+    // Only if we actually indexed new files and have a graph builder
+    let semanticEdges = 0;
+    if (indexed > 0 && this.graphBuilder && this.graphDB) {
+      try {
+        const tableName = "chunks";
+        const tables = await this.db.tableNames();
+        if (tables.includes(tableName)) {
+          const table = await this.db.openTable(tableName);
+          const allRows = await table.search([0]).limit(100000).execute();
+          const chunkData = allRows
+            .filter(r => r.chunk_id && r.vector)
+            .map(r => ({ chunk_id: r.chunk_id, vector: Array.from(r.vector), file: r.file }));
+          semanticEdges = await this.graphBuilder.buildSemanticEdges(chunkData, 0.8, 3);
+          if (DEBUG) console.log(`[vectorizer] Built ${semanticEdges} semantic similarity edges`);
+        }
+      } catch (e) {
+        if (DEBUG) console.log(`[vectorizer] Semantic edge building failed:`, e.message);
+        // non-fatal — explicit edges still work
+      }
+    }
+    return { indexed, skipped, total, semanticEdges };
   }
   async indexSingleFile(filePath) {

package/vectorizer/usage-tracker.ts ADDED Viewed

@@ -0,0 +1,204 @@
+/**
+ * Usage Tracker — records provenance and usage statistics for chunks.
+ *
+ * FR-060: Record provenance for each attached chunk {query, main_chunk_id, attached_via_edge_type}
+ * FR-061: Increment usage_count when chunk appears in search results
+ * FR-062: API to query "where is chunk X used?" → list of referencing chunks
+ * FR-063: Use usage_count as additional ranking signal
+ *
+ * Storage: JSON file at .opencode/vectors/<index>/usage-stats.json
+ * Updated asynchronously (non-blocking to search).
+ */
+import fs from "fs/promises"
+import path from "path"
+// ---------------------------------------------------------------------------
+// Types
+// ---------------------------------------------------------------------------
+export interface ProvenanceRecord { // one entry per context attachment, created by UsageTracker.recordProvenance()
+  /** The search query that triggered this attachment */
+  query: string
+  /** The main result chunk that caused context attachment */
+  mainChunkId: string
+  /** The edge type that linked main → attached chunk */
+  edgeType: string
+  /** When the record was created, as a Date.now() value (epoch milliseconds) */
+  timestamp: number
+}
+export interface ChunkUsageStats { // per-chunk entry; created with usageCount 0, lastUsed 0 and no provenance
+  /** How many times this chunk appeared in search results (main or attached) */
+  usageCount: number
+  /** Last time this chunk was returned in a search result (Date.now() value; 0 until first recorded) */
+  lastUsed: number
+  /** Recent provenance records (max 20 per chunk to limit storage; the oldest entries are dropped first) */
+  provenance: ProvenanceRecord[]
+}
+export interface UsageData { // root object that UsageTracker serializes to usage-stats.json
+  /** Per-chunk usage statistics, keyed by chunk_id */
+  chunks: Record<string, ChunkUsageStats>
+  /** Global counters: totalSearches is incremented once per recordSearchResults() call */
+  totalSearches: number
+  lastUpdated: number // Date.now() value; set when empty data is created in load() and refreshed by save() before writing
+}
+const MAX_PROVENANCE_PER_CHUNK = 20 // upper bound on ChunkUsageStats.provenance length; enforced in recordProvenance()
+// ---------------------------------------------------------------------------
+// UsageTracker
+// ---------------------------------------------------------------------------
+export class UsageTracker { // in-memory usage/provenance store backed by a JSON file; call load() before recording
+  private data: UsageData | null = null // null until load() has run; while null, record*() are no-ops and getters return empty values
+  private dirty = false // true when data has changes that save() has not yet written
+  private savePath: string // path.join(cacheDir, "usage-stats.json")
+  constructor(private cacheDir: string) {
+    this.savePath = path.join(cacheDir, "usage-stats.json")
+  }
+  // ---- lifecycle: load() reads stats from savePath, save() writes them back when dirty ----
+  async load(): Promise<void> {
+    try {
+      const raw = await fs.readFile(this.savePath, "utf-8")
+      this.data = JSON.parse(raw) // NOTE(review): the parsed shape is not validated — valid JSON such as `null` or an object without `chunks` would break later calls
+    } catch {
+      this.data = { chunks: {}, totalSearches: 0, lastUpdated: Date.now() } // read or parse failure (e.g. no file yet): start from empty stats
+    }
+  }
+  async save(): Promise<void> {
+    if (!this.dirty || !this.data) return // nothing changed since the last successful write, or load() has not run
+    this.data.lastUpdated = Date.now()
+    try {
+      await fs.mkdir(path.dirname(this.savePath), { recursive: true })
+      await fs.writeFile(this.savePath, JSON.stringify(this.data, null, 2), "utf-8") // NOTE(review): overlapping save() calls are not serialized and would write the same file concurrently — confirm callers avoid this
+      this.dirty = false // NOTE(review): cleared after the awaits — a record*() call made while the write was in flight has its dirty flag reset here, so that change waits for the next mutation to be saved
+    } catch {
+      // non-fatal: the error is swallowed and `dirty` is left as-is, so a later save() tries again
+    }
+  }
+  // ---- FR-060: record provenance ------------------------------------------
+  /**
+   * Record that `attachedChunkId` was attached to `mainChunkId` as context
+   * for `query`, via `edgeType` relation. Stored under the attached chunk; does nothing before load().
+   */
+  recordProvenance(
+    query: string,
+    mainChunkId: string,
+    attachedChunkId: string,
+    edgeType: string,
+  ): void {
+    if (!this.data) return
+    const stats = this.ensureChunkStats(attachedChunkId) // usageCount and lastUsed are not touched here
+    stats.provenance.push({
+      query,
+      mainChunkId,
+      edgeType,
+      timestamp: Date.now(),
+    })
+    // Cap provenance history: keep only the newest MAX_PROVENANCE_PER_CHUNK records
+    if (stats.provenance.length > MAX_PROVENANCE_PER_CHUNK) {
+      stats.provenance = stats.provenance.slice(-MAX_PROVENANCE_PER_CHUNK)
+    }
+    this.dirty = true
+  }
+  // ---- FR-061: increment usage_count --------------------------------------
+  /**
+   * Record that these chunk IDs appeared in search results.
+   * Call once per search with all result chunk IDs (main + attached); an ID listed twice is counted twice.
+   */
+  recordSearchResults(chunkIds: string[]): void {
+    if (!this.data) return
+    this.data.totalSearches++ // incremented even when chunkIds is empty
+    const now = Date.now() // one timestamp shared by every chunk in this call
+    for (const id of chunkIds) {
+      const stats = this.ensureChunkStats(id)
+      stats.usageCount++
+      stats.lastUsed = now
+    }
+    this.dirty = true
+  }
+  // ---- FR-062: "where is chunk X used?" -----------------------------------
+  /**
+   * Get provenance info for a chunk: which queries led to it,
+   * which main chunks it was attached to, via which edges. Empty array if the chunk is untracked.
+   */
+  getChunkProvenance(chunkId: string): ProvenanceRecord[] {
+    if (!this.data) return []
+    return this.data.chunks[chunkId]?.provenance ?? [] // returns the stored array itself, not a copy
+  }
+  /**
+   * Get usage stats for a chunk (null if the chunk is untracked or load() has not run).
+   */
+  getChunkStats(chunkId: string): ChunkUsageStats | null {
+    if (!this.data) return null
+    return this.data.chunks[chunkId] ?? null // returns the stored object itself, not a copy
+  }
+  // ---- FR-063: usage_count as ranking signal ------------------------------
+  /**
+   * Get usage count for a chunk (0 if never seen).
+   * Used as additional ranking signal in search.
+   */
+  getUsageCount(chunkId: string): number {
+    if (!this.data) return 0
+    return this.data.chunks[chunkId]?.usageCount ?? 0
+  }
+  /**
+   * Get a usage boost factor for ranking (0.0 – 1.0).
+   * Normalized: most-used chunk → 1.0, unused → 0.0.
+   */
+  getUsageBoost(chunkId: string): number {
+    if (!this.data) return 0
+    const stats = this.data.chunks[chunkId]
+    if (!stats || stats.usageCount === 0) return 0
+    // Find max usage count across all chunks for normalization (scans every tracked chunk on each call)
+    let maxUsage = 1 // floor of 1 keeps the divisor non-zero
+    for (const s of Object.values(this.data.chunks)) {
+      if (s.usageCount > maxUsage) maxUsage = s.usageCount
+    }
+    return stats.usageCount / maxUsage
+  }
+  // ---- summary ------------------------------------------------------------
+  /**
+   * Get global usage summary (all zeros if load() has not run).
+   */
+  getSummary(): { totalSearches: number; trackedChunks: number; lastUpdated: number } {
+    if (!this.data) return { totalSearches: 0, trackedChunks: 0, lastUpdated: 0 }
+    return {
+      totalSearches: this.data.totalSearches,
+      trackedChunks: Object.keys(this.data.chunks).length,
+      lastUpdated: this.data.lastUpdated,
+    }
+  }
+  // ---- internals: get-or-create the stats entry; callers must have checked this.data first ----
+  private ensureChunkStats(chunkId: string): ChunkUsageStats {
+    if (!this.data!.chunks[chunkId]) { // NOTE(review): plain property lookup — an id equal to an inherited Object.prototype key (e.g. "constructor") would look already present; confirm chunk ids cannot collide
+      this.data!.chunks[chunkId] = {
+        usageCount: 0,
+        lastUsed: 0,
+        provenance: [],
+      }
+    }
+    return this.data!.chunks[chunkId]
+  }
+}

package/tools/read-interceptor.ts DELETED Viewed

@@ -1,54 +0,0 @@
-import { tool } from "@opencode-ai/plugin"
-import path from "path"
-import { CodebaseIndexer } from "../vectorizer/index.js"
-export default tool({
-  description: `Read file with graph-aware context attachment. When available, this tool searches the file in the index and returns content + related context from the graph (imports, links, etc.).
-Use this instead of the standard Read tool for better context awareness.`,
-  args: {
-    filePath: tool.schema.string().describe("Path to the file to read"),
-  },
-  async execute(args) { // returns a markdown string: the file's indexed chunks, then de-duplicated related context
-    const projectRoot = process.cwd()
-    const filePath = path.isAbsolute(args.filePath) ? args.filePath : path.join(projectRoot, args.filePath)
-    const relPath = path.relative(projectRoot, filePath)
-    const indexer = await new CodebaseIndexer(projectRoot, "code").init()
-    const results = await indexer.search(relPath, 20, false, {}) // passes the relative path as the first search() argument and 20 as the second — presumably query text and result limit, TODO confirm
-    const fileChunks = results.filter(r => r.file === relPath) // only hits from this exact file are kept, so output is limited to what the search returned
-    await indexer.unloadModel() // NOTE(review): not in a finally block — skipped if init() or search() above throws
-    if (fileChunks.length === 0) {
-      return `File "${relPath}" not indexed. Use original Read tool or run codeindex({ action: "reindex", index: "code" })`
-    }
-    let output = `## ${relPath}\n\n`
-    output += `### Content\n\n`
-    for (const chunk of fileChunks) { // chunks are emitted in search-result order
-      output += chunk.content + "\n\n"
-    }
-    const allRelated = fileChunks
-      .flatMap(c => c.relatedContext || [])
-      .filter((r, i, arr) => arr.findIndex(x => x.chunk_id === r.chunk_id) === i) // de-duplicate by chunk_id, keeping the first occurrence
-    if (allRelated.length > 0) {
-      output += `### Related Context\n\n`
-      for (const rel of allRelated) {
-        const snippet = rel.content.length > 300 // related content is truncated to 300 characters plus "..."
-          ? rel.content.substring(0, 300) + "..."
-          : rel.content
-        output += `**${rel.file}** (${rel.relation})\n`
-        output += `\`\`\`\n${snippet}\n\`\`\`\n\n`
-      }
-    }
-    return output
-  },
-})