@comfanion/usethis_search 4.3.1 → 4.5.0

@@ -0,0 +1,397 @@
+ /**
+  * Query Decomposer — splits complex queries into focused sub-queries.
+  *
+  * Problem: Long, multi-concept queries produce "diluted" embeddings
+  * because the embedding model (all-MiniLM-L6-v2, 384d) averages all
+  * token vectors into one. "JWT authentication middleware that validates
+  * permissions" → a blurry vector between auth, JWT, middleware, permissions.
+  *
+  * Solution: Decompose into focused sub-queries, search each independently,
+  * merge results via Reciprocal Rank Fusion (RRF).
+  *
+  * Strategy (no LLM — pure heuristics):
+  *   1. Short queries (≤4 significant words) → pass through unchanged
+  *   2. Medium queries (5-8 significant words) → extract keyword core + original
+  *   3. Long queries (9+ significant words) → keyword core + concept clusters (capped at maxSubQueries)
+  *
+  * All decomposition is deterministic and fast (<1ms).
+  */
+
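A minimal usage sketch of the pipeline this file adds (illustrative only; `searchChunks` is a hypothetical per-query vector-search function, not part of this module):

    const { subQueries } = decomposeQuery("JWT authentication middleware that validates user permissions")
    const resultSets = await Promise.all(subQueries.map(q => searchChunks(q)))  // one ranked list per sub-query
    const merged = rrfMerge(resultSets, 60, 10)                                 // fuse into a single top-10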
+ // ── Types ───────────────────────────────────────────────────────────────────
+
+ export interface DecompositionResult {
+   /** Original query, echoed back verbatim */
+   original: string
+   /** Focused sub-queries (includes the original when it is short enough) */
+   subQueries: string[]
+   /** Whether decomposition was applied */
+   decomposed: boolean
+   /** Strategy used */
+   strategy: "passthrough" | "keyword-core" | "concept-split"
+ }
+
+ export interface DecomposerConfig {
+   /** Enable/disable decomposition */
+   enabled: boolean
+   /** Min significant words to trigger decomposition */
+   minWords: number
+   /** Max sub-queries to generate in total */
+   maxSubQueries: number
+   /** Min words per sub-query */
+   minSubQueryWords: number
+ }
+
+ export const DEFAULT_DECOMPOSER_CONFIG: DecomposerConfig = {
+   enabled: true,
+   minWords: 5,
+   maxSubQueries: 4,
+   minSubQueryWords: 2,
+ }
+
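Callers can tweak individual knobs by spreading the defaults (illustrative):

    decomposeQuery(query, { ...DEFAULT_DECOMPOSER_CONFIG, maxSubQueries: 3 })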
+ // ── Stop words (shared with BM25 + extras for query context) ────────────────
+
+ const STOP_WORDS = new Set([
+   "the", "a", "an", "is", "are", "was", "were", "be", "been", "being",
+   "have", "has", "had", "do", "does", "did", "will", "would", "could",
+   "should", "may", "might", "shall", "can", "need", "must",
+   "and", "or", "but", "not", "no", "nor",
+   "in", "on", "at", "to", "for", "of", "with", "by", "from", "as",
+   "into", "about", "between", "through", "during", "before", "after",
+   "this", "that", "these", "those", "it", "its",
+   "i", "you", "he", "she", "we", "they", "me", "him", "her", "us", "them",
+   "my", "your", "his", "our", "their",
+   "what", "which", "who", "whom", "where", "when", "how", "why",
+   "if", "then", "else", "so", "than", "too", "very",
+   // Query-specific stop words (common in agent queries)
+   "find", "search", "look", "show", "get", "give", "tell",
+   "using", "used", "uses", "use",
+   "like", "such", "also", "just", "only",
+   "all", "any", "each", "every", "some",
+   "code", "file", "files", "function", "class", "method",
+   "implement", "implementation", "implements", "implemented",
+   "related", "relevant", "similar",
+   "please", "help", "want", "need",
+ ])
+
+ // ── Connectors that signal concept boundaries ───────────────────────────────
+
+ const CONCEPT_CONNECTORS = new Set([
+   "and", "or", "that", "which", "where", "when", "while",
+   "with", "using", "through", "via", "for", "including",
+   "also", "both", "either", "neither",
+ ])
+
+ // ── Domain compound terms (keep together) ───────────────────────────────────
+
+ const COMPOUND_TERMS: Array<[string, string]> = [
+   ["error", "handling"],
+   ["event", "sourcing"],
+   ["dependency", "injection"],
+   ["access", "control"],
+   ["rate", "limiting"],
+   ["load", "balancing"],
+   ["unit", "test"],
+   ["integration", "test"],
+   ["api", "endpoint"],
+   ["api", "gateway"],
+   ["data", "model"],
+   ["data", "transfer"],
+   ["database", "connection"],
+   ["file", "system"],
+   ["message", "queue"],
+   ["state", "management"],
+   ["type", "checking"],
+   ["code", "review"],
+   ["pull", "request"],
+   ["design", "pattern"],
+   ["repository", "pattern"],
+   ["factory", "pattern"],
+   ["observer", "pattern"],
+   ["middleware", "chain"],
+   ["call", "hierarchy"],
+   ["graph", "traversal"],
+ ]
+
+ // ── Tokenizer ───────────────────────────────────────────────────────────────
+
+ /**
+  * Tokenize query into lowercase words, preserving compound terms.
+  */
+ export function tokenizeQuery(query: string): string[] {
+   const raw = query
+     .toLowerCase()
+     .replace(/[^a-z0-9_\-]/g, " ")
+     .split(/\s+/)
+     .filter(t => t.length > 1)
+
+   // Merge compound terms
+   const merged: string[] = []
+   let i = 0
+   while (i < raw.length) {
+     let found = false
+     if (i < raw.length - 1) {
+       for (const [a, b] of COMPOUND_TERMS) {
+         if (raw[i] === a && raw[i + 1] === b) {
+           merged.push(`${a}_${b}`)
+           i += 2
+           found = true
+           break
+         }
+       }
+     }
+     if (!found) {
+       merged.push(raw[i])
+       i++
+     }
+   }
+
+   return merged
+ }
+
+ /**
+  * Extract significant (non-stop) words from token list.
+  */
+ export function extractSignificant(tokens: string[]): string[] {
+   return tokens.filter(t => !STOP_WORDS.has(t) && t.length > 2)
+ }
+
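Traced through the two helpers above (compound merging runs before stop-word filtering, so "rate limiting" survives as one token):

    tokenizeQuery("find code using rate limiting for the api gateway")
    // → ["find", "code", "using", "rate_limiting", "for", "the", "api_gateway"]
    extractSignificant(["find", "code", "using", "rate_limiting", "for", "the", "api_gateway"])
    // → ["rate_limiting", "api_gateway"]   (the rest are stop words)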
+ // ── Concept Clustering ──────────────────────────────────────────────────────
+
+ /**
+  * Split tokens into concept groups at connector boundaries.
+  *
+  * "JWT authentication middleware that validates user permissions for API endpoints"
+  * → ["JWT authentication middleware", "validates user permissions", "API endpoints"]
+  */
+ export function splitByConcepts(tokens: string[]): string[][] {
+   const groups: string[][] = []
+   let current: string[] = []
+
+   for (const token of tokens) {
+     if (CONCEPT_CONNECTORS.has(token)) {
+       if (current.length > 0) {
+         groups.push(current)
+         current = []
+       }
+       // Skip the connector itself
+     } else {
+       current.push(token)
+     }
+   }
+
+   if (current.length > 0) {
+     groups.push(current)
+   }
+
+   return groups
+ }
+
+ /**
+  * Merge small concept groups with neighbors to meet minimum size.
+  */
+ function mergeSmallGroups(groups: string[][], minSize: number): string[][] {
+   if (groups.length <= 1) return groups
+
+   const merged: string[][] = []
+   let buffer: string[] = []
+
+   for (const group of groups) {
+     buffer.push(...group)
+     // Extract significant words to check if buffer is "big enough"
+     const sig = extractSignificant(buffer)
+     if (sig.length >= minSize) {
+       merged.push([...buffer])
+       buffer = []
+     }
+   }
+
+   // Remaining buffer: merge with last group or push as-is
+   if (buffer.length > 0) {
+     if (merged.length > 0) {
+       merged[merged.length - 1].push(...buffer)
+     } else {
+       merged.push(buffer)
+     }
+   }
+
+   return merged
+ }
+
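Traced example of the two functions above: connectors split the token stream, then undersized groups are folded into their neighbors:

    splitByConcepts(["error_handling", "with", "retries", "and", "backoff"])
    // → [["error_handling"], ["retries"], ["backoff"]]   ("with"/"and" are connectors)
    // mergeSmallGroups(groups, 2) then folds the one-word groups together:
    // → [["error_handling", "retries", "backoff"]]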
+ // ── Keyword Core Extraction ─────────────────────────────────────────────────
+
+ /**
+  * Extract a "keyword core" — the most important 3-4 words from the query.
+  * Uses a simple heuristic: take significant words, prefer longer/rarer ones.
+  */
+ export function extractKeywordCore(significant: string[], maxWords: number = 3): string {
+   // Score words: longer words and compound terms score higher
+   const scored = significant.map(w => ({
+     word: w,
+     score: w.length + (w.includes("_") ? 5 : 0),
+   }))
+
+   scored.sort((a, b) => b.score - a.score)
+   const top = scored.slice(0, maxWords).map(s => s.word)
+
+   // Restore original order
+   const ordered = significant.filter(w => top.includes(w))
+   return ordered.slice(0, maxWords).join(" ").replace(/_/g, " ")
+ }
+
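Worked example (scores follow directly from the length-plus-compound-bonus heuristic):

    extractKeywordCore(["jwt", "authentication", "middleware", "validates", "user", "permissions"], 3)
    // scores: authentication=14, permissions=11, middleware=10, validates=9, user=4, jwt=3
    // top 3, restored to query order → "authentication middleware permissions"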
+ // ── Main Decomposer ─────────────────────────────────────────────────────────
+
+ /**
+  * Decompose a search query into focused sub-queries.
+  *
+  * @param query The original search query
+  * @param config Decomposer configuration
+  * @returns DecompositionResult with sub-queries and metadata
+  */
+ export function decomposeQuery(
+   query: string,
+   config: DecomposerConfig = DEFAULT_DECOMPOSER_CONFIG,
+ ): DecompositionResult {
+   if (!config.enabled) {
+     return {
+       original: query,
+       subQueries: [query],
+       decomposed: false,
+       strategy: "passthrough",
+     }
+   }
+
+   const tokens = tokenizeQuery(query)
+   const significant = extractSignificant(tokens)
+
+   // ── Strategy 1: Short query → passthrough ─────────────────────────────────
+   if (significant.length < config.minWords) {
+     return {
+       original: query,
+       subQueries: [query],
+       decomposed: false,
+       strategy: "passthrough",
+     }
+   }
+
+   // ── Strategy 2: Medium query (5-8 significant words) → keyword core ───────
+   if (significant.length <= 8) {
+     const core = extractKeywordCore(significant, 3)
+     const subQueries = [query]
+
+     // Only add core if it's meaningfully different from original
+     if (core !== query.toLowerCase().trim() && core.split(" ").length >= config.minSubQueryWords) {
+       subQueries.push(core)
+     }
+
+     return {
+       original: query,
+       subQueries: subQueries.slice(0, config.maxSubQueries),
+       decomposed: subQueries.length > 1,
+       strategy: subQueries.length > 1 ? "keyword-core" : "passthrough",
+     }
+   }
+
+   // ── Strategy 3: Long query (9+ significant words) → concept split ─────────
+   const conceptGroups = splitByConcepts(tokens)
+   const mergedGroups = mergeSmallGroups(conceptGroups, config.minSubQueryWords)
+
+   const subQueries: string[] = []
+
+   // Always include keyword core as first sub-query (highest signal)
+   const core = extractKeywordCore(significant, 4)
+   if (core.split(" ").length >= config.minSubQueryWords) {
+     subQueries.push(core)
+   }
+
+   // Add concept groups as sub-queries
+   for (const group of mergedGroups) {
+     const groupSig = extractSignificant(group)
+     if (groupSig.length >= config.minSubQueryWords) {
+       const subQuery = groupSig.join(" ").replace(/_/g, " ")
+       // Avoid duplicates
+       if (!subQueries.includes(subQuery)) {
+         subQueries.push(subQuery)
+       }
+     }
+   }
+
+   // If we still have room, add the original (truncated to first N significant words)
+   if (subQueries.length < config.maxSubQueries) {
+     const truncated = significant.slice(0, 6).join(" ").replace(/_/g, " ")
+     if (!subQueries.includes(truncated)) {
+       subQueries.push(truncated)
+     }
+   }
+
+   // Ensure we don't exceed max
+   const finalQueries = subQueries.slice(0, config.maxSubQueries)
+
+   return {
+     original: query,
+     subQueries: finalQueries.length > 0 ? finalQueries : [query],
+     decomposed: finalQueries.length > 1,
+     strategy: finalQueries.length > 1 ? "concept-split" : "passthrough",
+   }
+ }
+
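End-to-end example of the medium-query path (six significant words triggers the keyword-core strategy; traced against the code above):

    decomposeQuery("JWT authentication middleware validates user permissions")
    // → {
    //     original: "JWT authentication middleware validates user permissions",
    //     subQueries: [
    //       "JWT authentication middleware validates user permissions",
    //       "authentication middleware permissions",   // keyword core
    //     ],
    //     decomposed: true,
    //     strategy: "keyword-core",
    //   }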
+ // ── RRF Merge ───────────────────────────────────────────────────────────────
+
+ /**
+  * Reciprocal Rank Fusion — merge ranked result lists from multiple sub-queries.
+  *
+  * RRF score = sum(1 / (k + rank_i)) for each sub-query where the result appears.
+  *
+  * @param resultSets Array of result arrays, each sorted by relevance (best first)
+  * @param k RRF constant (default: 60, standard value from the paper)
+  * @param limit Max results to return
+  * @returns Merged results sorted by RRF score, with _rrfScore and _combinedScore set
+  */
+ export function rrfMerge(
+   resultSets: Array<Array<Record<string, any>>>,
+   k: number = 60,
+   limit: number = 10,
+ ): Array<Record<string, any>> {
+   if (resultSets.length === 0) return []
+   if (resultSets.length === 1) return resultSets[0].slice(0, limit)
+
+   // Build RRF scores keyed by chunk identity (file:chunk_index)
+   const scoreMap = new Map<string, { row: Record<string, any>; rrfScore: number; bestOriginalScore: number }>()
+
+   for (const results of resultSets) {
+     for (let rank = 0; rank < results.length; rank++) {
+       const r = results[rank]
+       const key = `${r.file}:${r.chunk_index}`
+       const rrfContribution = 1 / (k + rank + 1) // rank is 0-based, RRF uses 1-based
+
+       const existing = scoreMap.get(key)
+       const originalScore = r._combinedScore ?? (r._distance != null ? Math.max(0, 1 - r._distance / 2) : 0)
+
+       if (existing) {
+         existing.rrfScore += rrfContribution
+         // Keep the row with the best original score (most metadata)
+         if (originalScore > existing.bestOriginalScore) {
+           existing.row = r
+           existing.bestOriginalScore = originalScore
+         }
+       } else {
+         scoreMap.set(key, {
+           row: r,
+           rrfScore: rrfContribution,
+           bestOriginalScore: originalScore,
+         })
+       }
+     }
+   }
+
+   // Sort by RRF score and return
+   const merged = Array.from(scoreMap.values())
+     .sort((a, b) => b.rrfScore - a.rrfScore)
+     .slice(0, limit)
+     .map(entry => ({
+       ...entry.row,
+       _rrfScore: entry.rrfScore,
+       _combinedScore: entry.bestOriginalScore, // preserve for downstream compatibility
+     }))
+
+   return merged
+ }
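Worked arithmetic for the formula with k = 60 (ranks 1-based, so each list contributes 1 / (60 + rank)):

    // chunk A: rank 1 in two sub-query lists → 1/61 + 1/61 ≈ 0.0328
    // chunk B: rank 1 in one list only       → 1/61        ≈ 0.0164
    // chunk C: rank 3 and rank 5             → 1/63 + 1/65 ≈ 0.0313
    // A > C > B — agreement across sub-queries outweighs a single high rank.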
@@ -46,6 +46,8 @@ export interface UsageData {
  }

  const MAX_PROVENANCE_PER_CHUNK = 20
+ const MAX_TRACKED_CHUNKS = 5000 // Cap total tracked chunks to prevent unbounded growth
+ const STALE_CHUNK_AGE_MS = 7 * 24 * 60 * 60 * 1000 // 7 days — chunks unused for longer are evicted

  // ---------------------------------------------------------------------------
  // UsageTracker
@@ -126,6 +128,40 @@ export class UsageTracker {
       stats.lastUsed = now
     }
     this.dirty = true
+
+    // Evict stale + over-cap entries periodically (every 50 searches)
+    if (this.data.totalSearches % 50 === 0) {
+      this.evictStaleChunks()
+    }
+  }
+
+  /**
+   * Evict chunks not used within STALE_CHUNK_AGE_MS, then cap at MAX_TRACKED_CHUNKS.
+   * Keeps the most recently used chunks.
+   */
+  private evictStaleChunks(): void {
+    if (!this.data) return
+    const now = Date.now()
+    const chunks = this.data.chunks
+
+    // Phase 1: remove stale (not used in 7 days)
+    for (const [id, stats] of Object.entries(chunks)) {
+      if (stats.lastUsed > 0 && now - stats.lastUsed > STALE_CHUNK_AGE_MS) {
+        delete chunks[id]
+        this.dirty = true
+      }
+    }
+
+    // Phase 2: if still over cap, evict least-used
+    const entries = Object.entries(chunks)
+    if (entries.length > MAX_TRACKED_CHUNKS) {
+      entries.sort((a, b) => a[1].lastUsed - b[1].lastUsed)
+      const toRemove = entries.length - MAX_TRACKED_CHUNKS
+      for (let i = 0; i < toRemove; i++) {
+        delete chunks[entries[i][0]]
+      }
+      this.dirty = true
+    }
   }

   // ---- FR-062: "where is chunk X used?" -----------------------------------
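Back-of-envelope on the new bounds (illustrative; assumes a search touches roughly 20 chunks):

    // Eviction runs on every 50th search, so between passes the map can
    // overshoot MAX_TRACKED_CHUNKS by at most ~50 × 20 = 1000 entries.
    // Each pass first drops entries idle > 7 days, then trims to the 5000
    // most recently used, keeping steady-state memory bounded.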
package/vectorizer.yaml CHANGED
@@ -62,12 +62,19 @@ vectorizer:
   max_chunks: 100 # Max number of chunks in workspace
   attach_top_n: 10 # Top N search chunks to attach with full content
   attach_related_per_chunk: 3 # Max graph relation chunks per main chunk
- min_score_main: 0.65 # Min score for main chunks
- min_score_related: 0.5 # Min score for graph relation chunks
+ min_score_main: 0.5 # Min score for main chunks
+ min_score_related: 0.35 # Min score for graph relation chunks
  persist_content: false # Save full chunk content in snapshots (debug mode)
  auto_prune_search: true # Replace old search outputs with compact summaries
  substitute_tool_outputs: true # Replace read() outputs when chunks in workspace

+ # Query decomposition (v4 — improves long query relevance)
+ decomposition:
+   enabled: true # Split complex queries into focused sub-queries
+   min_words: 5 # Min significant words to trigger decomposition
+   max_sub_queries: 4 # Max sub-queries (including keyword core)
+   min_sub_query_words: 2 # Min words per sub-query
+
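These snake_case keys presumably feed the camelCase `DecomposerConfig` fields from the TypeScript module above; a hypothetical loader mapping (`yaml` here is an assumed parsed-config object, not part of this diff):

    const decomposerConfig: DecomposerConfig = {
      enabled: yaml.decomposition.enabled,                      // true
      minWords: yaml.decomposition.min_words,                   // 5
      maxSubQueries: yaml.decomposition.max_sub_queries,        // 4
      minSubQueryWords: yaml.decomposition.min_sub_query_words, // 2
    }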
  # Quality monitoring (v2)
  quality:
  enable_metrics: false # Track search quality metrics