@comfanion/usethis_search 0.1.5 → 0.2.0-dev.0
This diff shows the changes between publicly released versions of this package as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the registry.
- package/README.md +98 -7
- package/file-indexer.ts +21 -1
- package/package.json +12 -2
- package/tools/codeindex.ts +135 -16
- package/tools/search.ts +46 -11
- package/vectorizer/bm25-index.ts +155 -0
- package/vectorizer/chunkers/chunker-factory.ts +98 -0
- package/vectorizer/chunkers/code-chunker.ts +325 -0
- package/vectorizer/chunkers/markdown-chunker.ts +177 -0
- package/vectorizer/content-cleaner.ts +136 -0
- package/vectorizer/hybrid-search.ts +97 -0
- package/vectorizer/index.js +395 -16
- package/vectorizer/metadata-extractor.ts +125 -0
- package/vectorizer/query-cache.ts +126 -0
- package/vectorizer/search-metrics.ts +155 -0
- package/vectorizer.yaml +81 -0
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Query Embedding Cache — LRU cache for query embeddings.
|
|
3
|
+
*
|
|
4
|
+
* Avoids recomputing embeddings for identical/recent queries.
|
|
5
|
+
* Uses a simple Map-based LRU with TTL eviction.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
export interface QueryCacheConfig {
|
|
9
|
+
maxSize: number // max entries (default: 100)
|
|
10
|
+
ttl: number // time-to-live in ms (default: 1 hour)
|
|
11
|
+
}
|
|
12
|
+
|
|
13
|
+
export const DEFAULT_CACHE_CONFIG: QueryCacheConfig = {
|
|
14
|
+
maxSize: 100,
|
|
15
|
+
ttl: 3600000, // 1 hour
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
interface CacheEntry {
|
|
19
|
+
embedding: number[]
|
|
20
|
+
timestamp: number
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
export class QueryCache {
|
|
24
|
+
private cache: Map<string, CacheEntry> = new Map()
|
|
25
|
+
private config: QueryCacheConfig
|
|
26
|
+
private cleanupInterval?: ReturnType<typeof setInterval>
|
|
27
|
+
|
|
28
|
+
constructor(config: QueryCacheConfig = DEFAULT_CACHE_CONFIG) {
|
|
29
|
+
this.config = config
|
|
30
|
+
// Periodic eviction of expired entries every 5 minutes
|
|
31
|
+
this.cleanupInterval = setInterval(() => {
|
|
32
|
+
this.evictExpired()
|
|
33
|
+
}, 300_000)
|
|
34
|
+
// Allow Node to exit even if interval is running
|
|
35
|
+
if (this.cleanupInterval && typeof this.cleanupInterval === "object" && "unref" in this.cleanupInterval) {
|
|
36
|
+
this.cleanupInterval.unref()
|
|
37
|
+
}
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
/**
|
|
41
|
+
* Get cached embedding for a query string.
|
|
42
|
+
* Returns `null` if not found or expired.
|
|
43
|
+
*/
|
|
44
|
+
get(query: string): number[] | null {
|
|
45
|
+
const key = this.normalizeKey(query)
|
|
46
|
+
const entry = this.cache.get(key)
|
|
47
|
+
|
|
48
|
+
if (!entry) return null
|
|
49
|
+
|
|
50
|
+
// Check TTL
|
|
51
|
+
if (Date.now() - entry.timestamp > this.config.ttl) {
|
|
52
|
+
this.cache.delete(key)
|
|
53
|
+
return null
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
// Move to end (most recently used) — delete & re-insert
|
|
57
|
+
this.cache.delete(key)
|
|
58
|
+
this.cache.set(key, entry)
|
|
59
|
+
|
|
60
|
+
return entry.embedding
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
/**
|
|
64
|
+
* Store embedding for a query string.
|
|
65
|
+
*/
|
|
66
|
+
set(query: string, embedding: number[]): void {
|
|
67
|
+
const key = this.normalizeKey(query)
|
|
68
|
+
|
|
69
|
+
// If already exists, delete first (to update position)
|
|
70
|
+
this.cache.delete(key)
|
|
71
|
+
|
|
72
|
+
// Evict oldest if at capacity
|
|
73
|
+
if (this.cache.size >= this.config.maxSize) {
|
|
74
|
+
const oldest = this.cache.keys().next().value
|
|
75
|
+
if (oldest !== undefined) {
|
|
76
|
+
this.cache.delete(oldest)
|
|
77
|
+
}
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
this.cache.set(key, { embedding, timestamp: Date.now() })
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
/** Check if a query is cached (and not expired). */
|
|
84
|
+
has(query: string): boolean {
|
|
85
|
+
return this.get(query) !== null
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
/** Remove all entries. */
|
|
89
|
+
clear(): void {
|
|
90
|
+
this.cache.clear()
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
/** Stop periodic cleanup and release all memory. */
|
|
94
|
+
destroy(): void {
|
|
95
|
+
if (this.cleanupInterval) {
|
|
96
|
+
clearInterval(this.cleanupInterval)
|
|
97
|
+
this.cleanupInterval = undefined
|
|
98
|
+
}
|
|
99
|
+
this.cache.clear()
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
/** Current number of (non-expired) entries. */
|
|
103
|
+
get size(): number {
|
|
104
|
+
// Lazy: don't evict expired on size check
|
|
105
|
+
return this.cache.size
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
/** Evict all expired entries. */
|
|
109
|
+
evictExpired(): number {
|
|
110
|
+
const now = Date.now()
|
|
111
|
+
let evicted = 0
|
|
112
|
+
for (const [key, entry] of this.cache) {
|
|
113
|
+
if (now - entry.timestamp > this.config.ttl) {
|
|
114
|
+
this.cache.delete(key)
|
|
115
|
+
evicted++
|
|
116
|
+
}
|
|
117
|
+
}
|
|
118
|
+
return evicted
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
// ── Internal ──────────────────────────────────────────────────────────────
|
|
122
|
+
|
|
123
|
+
private normalizeKey(query: string): string {
|
|
124
|
+
return query.trim().toLowerCase()
|
|
125
|
+
}
|
|
126
|
+
}
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Search Metrics — tracks search quality and usage patterns.
|
|
3
|
+
*
|
|
4
|
+
* Logs queries, scores, result counts, and computes aggregate stats.
|
|
5
|
+
* Persists to .opencode/vectors/metrics.json.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import fs from "fs/promises"
|
|
9
|
+
import path from "path"
|
|
10
|
+
|
|
11
|
+
// ── Types ───────────────────────────────────────────────────────────────────
|
|
12
|
+
|
|
13
|
+
export interface QueryRecord {
|
|
14
|
+
query: string
|
|
15
|
+
timestamp: string // ISO
|
|
16
|
+
index: string
|
|
17
|
+
results_count: number
|
|
18
|
+
avg_score: number
|
|
19
|
+
top_score: number
|
|
20
|
+
hybrid: boolean
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
export interface MetricsSummary {
|
|
24
|
+
total_queries: number
|
|
25
|
+
avg_results_per_query: number
|
|
26
|
+
zero_results_rate: number
|
|
27
|
+
avg_relevance: number
|
|
28
|
+
avg_top_score: number
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
export interface MetricsData {
|
|
32
|
+
queries: QueryRecord[]
|
|
33
|
+
summary: MetricsSummary
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
// ── Constants ───────────────────────────────────────────────────────────────
|
|
37
|
+
|
|
38
|
+
const MAX_STORED_QUERIES = 500 // keep last N queries
|
|
39
|
+
const HIGH_RELEVANCE_THRESHOLD = 0.7
|
|
40
|
+
|
|
41
|
+
// ── SearchMetrics class ─────────────────────────────────────────────────────
|
|
42
|
+
|
|
43
|
+
export class SearchMetrics {
|
|
44
|
+
private metricsPath: string
|
|
45
|
+
private data: MetricsData
|
|
46
|
+
|
|
47
|
+
constructor(projectRoot: string) {
|
|
48
|
+
this.metricsPath = path.join(projectRoot, ".opencode", "vectors", "metrics.json")
|
|
49
|
+
this.data = { queries: [], summary: this.emptySummary() }
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
/** Load metrics from disk. */
|
|
53
|
+
async load(): Promise<void> {
|
|
54
|
+
try {
|
|
55
|
+
const raw = await fs.readFile(this.metricsPath, "utf8")
|
|
56
|
+
this.data = JSON.parse(raw)
|
|
57
|
+
// Trim if loaded data exceeds limit (e.g. from older version)
|
|
58
|
+
if (this.data.queries && this.data.queries.length > MAX_STORED_QUERIES) {
|
|
59
|
+
this.data.queries = this.data.queries.slice(-MAX_STORED_QUERIES)
|
|
60
|
+
this.data.summary = this.computeSummary()
|
|
61
|
+
}
|
|
62
|
+
} catch {
|
|
63
|
+
this.data = { queries: [], summary: this.emptySummary() }
|
|
64
|
+
}
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
/** Save metrics to disk. */
|
|
68
|
+
async save(): Promise<void> {
|
|
69
|
+
try {
|
|
70
|
+
const dir = path.dirname(this.metricsPath)
|
|
71
|
+
await fs.mkdir(dir, { recursive: true })
|
|
72
|
+
await fs.writeFile(this.metricsPath, JSON.stringify(this.data, null, 2))
|
|
73
|
+
} catch {
|
|
74
|
+
// non-fatal
|
|
75
|
+
}
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
/**
|
|
79
|
+
* Record a search query and its results.
|
|
80
|
+
* @param scores Array of relevance scores (1 - distance) for each result
|
|
81
|
+
*/
|
|
82
|
+
recordQuery(
|
|
83
|
+
query: string,
|
|
84
|
+
index: string,
|
|
85
|
+
scores: number[],
|
|
86
|
+
hybrid: boolean = false,
|
|
87
|
+
): void {
|
|
88
|
+
const record: QueryRecord = {
|
|
89
|
+
query,
|
|
90
|
+
timestamp: new Date().toISOString(),
|
|
91
|
+
index,
|
|
92
|
+
results_count: scores.length,
|
|
93
|
+
avg_score: scores.length > 0 ? scores.reduce((a, b) => a + b, 0) / scores.length : 0,
|
|
94
|
+
top_score: scores.length > 0 ? Math.max(...scores) : 0,
|
|
95
|
+
hybrid,
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
this.data.queries.push(record)
|
|
99
|
+
|
|
100
|
+
// Trim to max
|
|
101
|
+
if (this.data.queries.length > MAX_STORED_QUERIES) {
|
|
102
|
+
this.data.queries = this.data.queries.slice(-MAX_STORED_QUERIES)
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
// Recompute summary
|
|
106
|
+
this.data.summary = this.computeSummary()
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
/** Get current summary. */
|
|
110
|
+
getSummary(): MetricsSummary {
|
|
111
|
+
return this.data.summary
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
/** Get raw query records (last N). */
|
|
115
|
+
getQueries(limit: number = 50): QueryRecord[] {
|
|
116
|
+
return this.data.queries.slice(-limit)
|
|
117
|
+
}
|
|
118
|
+
|
|
119
|
+
/** Compute context relevance: % of queries where top_score > threshold. */
|
|
120
|
+
getContextRelevance(): number {
|
|
121
|
+
if (this.data.queries.length === 0) return 0
|
|
122
|
+
const relevant = this.data.queries.filter((q) => q.top_score >= HIGH_RELEVANCE_THRESHOLD)
|
|
123
|
+
return relevant.length / this.data.queries.length
|
|
124
|
+
}
|
|
125
|
+
|
|
126
|
+
// ── Internal ──────────────────────────────────────────────────────────────
|
|
127
|
+
|
|
128
|
+
private computeSummary(): MetricsSummary {
|
|
129
|
+
const queries = this.data.queries
|
|
130
|
+
if (queries.length === 0) return this.emptySummary()
|
|
131
|
+
|
|
132
|
+
const totalResults = queries.reduce((sum, q) => sum + q.results_count, 0)
|
|
133
|
+
const zeroResults = queries.filter((q) => q.results_count === 0).length
|
|
134
|
+
const avgRelevance = queries.reduce((sum, q) => sum + q.avg_score, 0) / queries.length
|
|
135
|
+
const avgTopScore = queries.reduce((sum, q) => sum + q.top_score, 0) / queries.length
|
|
136
|
+
|
|
137
|
+
return {
|
|
138
|
+
total_queries: queries.length,
|
|
139
|
+
avg_results_per_query: totalResults / queries.length,
|
|
140
|
+
zero_results_rate: zeroResults / queries.length,
|
|
141
|
+
avg_relevance: Math.round(avgRelevance * 1000) / 1000,
|
|
142
|
+
avg_top_score: Math.round(avgTopScore * 1000) / 1000,
|
|
143
|
+
}
|
|
144
|
+
}
|
|
145
|
+
|
|
146
|
+
private emptySummary(): MetricsSummary {
|
|
147
|
+
return {
|
|
148
|
+
total_queries: 0,
|
|
149
|
+
avg_results_per_query: 0,
|
|
150
|
+
zero_results_rate: 0,
|
|
151
|
+
avg_relevance: 0,
|
|
152
|
+
avg_top_score: 0,
|
|
153
|
+
}
|
|
154
|
+
}
|
|
155
|
+
}
|
package/vectorizer.yaml
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
vectorizer:
  # Enable/disable vectorizer functionality
  enabled: true

  # Auto-index files when they change (requires file-indexer plugin)
  auto_index: true

  # Embedding model used to vectorize chunks
  model: "Xenova/all-MiniLM-L6-v2"

  # Debounce time in ms (wait before indexing after file change)
  debounce_ms: 1000

  # Content cleaning before chunking (v2)
  cleaning:
    remove_toc: true
    remove_frontmatter_metadata: false
    remove_imports: false
    remove_comments: false

  # Chunking strategy (v2)
  chunking:
    strategy: "semantic" # fixed | semantic
    # Markdown chunker settings (chunk sizes are in characters)
    markdown:
      split_by_headings: true
      min_chunk_size: 200
      max_chunk_size: 2000
      preserve_heading_hierarchy: true
    # Code chunker settings (chunk sizes are in characters)
    code:
      split_by_functions: true
      include_function_signature: true
      min_chunk_size: 300
      max_chunk_size: 1500
    # Fallback fixed-size chunker
    fixed:
      max_chars: 1500

  # Search configuration (v2)
  search:
    hybrid: false # Enable hybrid search (vector + BM25)
    bm25_weight: 0.3 # BM25 weight in hybrid mode (0.0-1.0)

  # Quality monitoring (v2)
  quality:
    enable_metrics: false # Track search quality metrics
    enable_cache: true # LRU cache for query embeddings

  # Indexes to maintain - each has pattern (what to include) and ignore (what to skip)
  indexes:

    # Documentation index - markdown, text files
    docs:
      enabled: true
      pattern: "docs/**/*.{md,mdx,txt,rst,adoc}"
      ignore: []

    # Configuration index - yaml, json, toml (disabled by default)
    config:
      enabled: false
      pattern: "**/*.{yaml,yml,json,toml,ini}"
      ignore:
        - "**/node_modules/**"
        - "**/.git/**"
        - "**/dist/**"
        - "**/build/**"
        - "**/.opencode/**"
        - "**/docs/**"
        - "**/vendor/**"
        - "**/__pycache__/**"
        - "**/*.min.js"
        - "**/*.bundle.js"
        - "**/package-lock.json"
        - "**/yarn.lock"

  # Global exclude patterns (applied to ALL indexes, in addition to per-index ignore)
  exclude:
    - node_modules
    - vendor
    - dist
    - build
    - out
    - __pycache__
|