npm - @comfanion/usethis_search - Versions diffs - 0.1.5 → 0.2.0-dev.0 - Mend

@comfanion/usethis_search 0.1.5 → 0.2.0-dev.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

package/README.md +98 -7
package/file-indexer.ts +21 -1
package/package.json +12 -2
package/tools/codeindex.ts +135 -16
package/tools/search.ts +46 -11
package/vectorizer/bm25-index.ts +155 -0
package/vectorizer/chunkers/chunker-factory.ts +98 -0
package/vectorizer/chunkers/code-chunker.ts +325 -0
package/vectorizer/chunkers/markdown-chunker.ts +177 -0
package/vectorizer/content-cleaner.ts +136 -0
package/vectorizer/hybrid-search.ts +97 -0
package/vectorizer/index.js +395 -16
package/vectorizer/metadata-extractor.ts +125 -0
package/vectorizer/query-cache.ts +126 -0
package/vectorizer/search-metrics.ts +155 -0
package/vectorizer.yaml +81 -0

package/vectorizer/bm25-index.ts ADDED Viewed

@@ -0,0 +1,155 @@
+/**
+ * BM25 Index — keyword-based search using Okapi BM25 scoring.
+ *
+ * Builds an inverted index from chunk content and scores queries
+ * against it. Designed to complement vector similarity search.
+ */
+// ── BM25 parameters ────────────────────────────────────────────────────────
+const K1 = 1.2   // term-frequency saturation: scales the length-normalized term in the BM25 denominator
+const B = 0.75   // length-normalization weight: 0 ignores document length, 1 applies the full length/average ratio
+// ── Types ───────────────────────────────────────────────────────────────────
+interface DocEntry {
+  id: number                      // position of the document in the array passed to build()
+  termFreqs: Map<string, number>  // token → number of occurrences in this document
+  length: number     // number of tokens returned by tokenize() for this document
+}
+export interface BM25Result {
+  id: number     // document id, i.e. its array index at build time
+  score: number  // BM25 score summed over the query terms; search() sorts by it in descending order
+}
+// ── Tokenizer ───────────────────────────────────────────────────────────────
+/** English function words that `tokenize` drops before indexing or querying. */
+const STOP_WORDS = new Set([
+  // articles, forms of "to be"
+  "the", "a", "an", "is", "are", "was", "were", "be", "been", "being",
+  // auxiliaries and modals
+  "have", "has", "had", "do", "does", "did", "will", "would", "could",
+  "should", "may", "might", "shall", "can", "need", "must",
+  // conjunctions and negations
+  "and", "or", "but", "not", "no", "nor",
+  // prepositions
+  "in", "on", "at", "to", "for", "of", "with", "by", "from", "as",
+  "into", "about", "between", "through", "during", "before", "after",
+  // demonstratives and pronouns
+  "this", "that", "these", "those", "it", "its",
+  "i", "you", "he", "she", "we", "they", "me", "him", "her", "us", "them",
+  "my", "your", "his", "our", "their",
+  // question words
+  "what", "which", "who", "whom", "where", "when", "how", "why",
+  // misc connectives and intensifiers
+  "if", "then", "else", "so", "than", "too", "very",
+])
+/**
+ * Break text into lowercase search terms.
+ *
+ * A term is a maximal run of `a-z`, `0-9`, `_` or `-` in the lowercased
+ * input; every other character acts as a separator. Terms of length 1 and
+ * terms listed in `STOP_WORDS` are discarded.
+ *
+ * @param text Raw document or query text.
+ * @returns Terms in order of appearance (duplicates preserved).
+ */
+export function tokenize(text: string): string[] {
+  const lowered = text.toLowerCase()
+  const candidates = lowered.match(/[a-z0-9_\-]+/g) ?? []
+  const terms: string[] = []
+  for (const candidate of candidates) {
+    if (candidate.length <= 1) continue
+    if (STOP_WORDS.has(candidate)) continue
+    terms.push(candidate)
+  }
+  return terms
+}
+// ── BM25 Index class ────────────────────────────────────────────────────────
+/**
+ * In-memory Okapi BM25 index over a list of text documents.
+ *
+ * Documents are identified by their position in the array given to
+ * `build`. Every `build` call replaces the previous index contents.
+ */
+export class BM25Index {
+  private docs: DocEntry[] = []
+  private invertedIndex: Map<string, Set<number>> = new Map()
+  private avgDocLength: number = 0
+  private docCount: number = 0
+  /**
+   * Build index from a list of text documents.
+   * Each document is identified by its array index.
+   *
+   * @param documents Raw text of each document; tokenized with `tokenize`.
+   */
+  build(documents: string[]): void {
+    this.docs = []
+    this.invertedIndex = new Map()
+    let totalLength = 0
+    for (let i = 0; i < documents.length; i++) {
+      const tokens = tokenize(documents[i])
+      const termFreqs = new Map<string, number>()
+      for (const token of tokens) {
+        termFreqs.set(token, (termFreqs.get(token) ?? 0) + 1)
+        let postings = this.invertedIndex.get(token)
+        if (!postings) {
+          postings = new Set()
+          this.invertedIndex.set(token, postings)
+        }
+        postings.add(i)
+      }
+      this.docs.push({ id: i, termFreqs, length: tokens.length })
+      totalLength += tokens.length
+    }
+    this.docCount = documents.length
+    this.avgDocLength = this.docCount > 0 ? totalLength / this.docCount : 0
+  }
+  /**
+   * Score a query against indexed documents.
+   *
+   * @param query Free-text query; tokenized with `tokenize`.
+   * @param limit Maximum number of results. Zero, negative or NaN yields `[]`.
+   * @returns Matching documents sorted by descending score; documents that
+   *          share no term with the query are omitted.
+   */
+  search(query: string, limit: number = 50): BM25Result[] {
+    // A non-positive limit can never return anything; without this guard a
+    // negative value would reach slice(0, limit) and cut results off the tail.
+    if (!(limit > 0)) return []
+    const queryTerms = tokenize(query)
+    if (queryTerms.length === 0) return []
+    // IDF depends only on the term, so compute it once per distinct query
+    // term while collecting candidate docs (any doc containing a query term).
+    const idfByTerm = new Map<string, number>()
+    const candidateIds = new Set<number>()
+    for (const term of queryTerms) {
+      if (idfByTerm.has(term)) continue
+      const postings = this.invertedIndex.get(term)
+      if (!postings) continue
+      const df = postings.size
+      idfByTerm.set(term, Math.log((this.docCount - df + 0.5) / (df + 0.5) + 1))
+      for (const id of postings) candidateIds.add(id)
+    }
+    if (candidateIds.size === 0) return []
+    // Score each candidate. A candidate holds at least one token, so
+    // avgDocLength is > 0 here and the division below is safe.
+    const results: BM25Result[] = []
+    for (const docId of candidateIds) {
+      const doc = this.docs[docId]
+      const lengthNorm = K1 * (1 - B + B * (doc.length / this.avgDocLength))
+      let score = 0
+      // Iterate the raw query terms so a term repeated in the query
+      // contributes once per repetition.
+      for (const term of queryTerms) {
+        const tf = doc.termFreqs.get(term) ?? 0
+        if (tf === 0) continue
+        const idf = idfByTerm.get(term) ?? 0
+        const tfNorm = (tf * (K1 + 1)) / (tf + lengthNorm)
+        score += idf * tfNorm
+      }
+      if (score > 0) {
+        results.push({ id: docId, score })
+      }
+    }
+    results.sort((a, b) => b.score - a.score)
+    return results.slice(0, limit)
+  }
+  /** Number of indexed documents. */
+  get size(): number {
+    return this.docCount
+  }
+  /** Number of unique terms. */
+  get vocabularySize(): number {
+    return this.invertedIndex.size
+  }
+  /** Release all memory held by the index. */
+  clear(): void {
+    this.docs = []
+    this.invertedIndex = new Map()
+    this.avgDocLength = 0
+    this.docCount = 0
+  }
+}

package/vectorizer/chunkers/chunker-factory.ts ADDED Viewed

@@ -0,0 +1,98 @@
+/**
+ * Chunker Factory — selects the appropriate chunker based on file type.
+ *
+ * Routes: markdown → markdown-chunker, code → code-chunker, else → fixed.
+ */
+import { chunkMarkdown, type MarkdownChunkConfig, DEFAULT_MD_CONFIG } from "./markdown-chunker"
+import { chunkCode, type CodeChunkConfig, DEFAULT_CODE_CONFIG } from "./code-chunker"
+import type { FileType } from "../metadata-extractor"
+// ── Types ───────────────────────────────────────────────────────────────────
+export type ChunkingStrategy = "fixed" | "semantic" | "hybrid"  // chunkContent only special-cases "fixed"; the other two take the same path
+export interface ChunkingConfig {
+  strategy: ChunkingStrategy       // "fixed" forces the line-packing chunker for every file
+  markdown: MarkdownChunkConfig    // passed to chunkMarkdown
+  code: CodeChunkConfig            // passed to chunkCode
+  fixed: { max_chars: number }     // size budget handed to the fixed chunker
+}
+export const DEFAULT_CHUNKING_CONFIG: ChunkingConfig = {
+  strategy: "semantic",
+  markdown: DEFAULT_MD_CONFIG,
+  code: DEFAULT_CODE_CONFIG,
+  fixed: { max_chars: 1500 },
+}
+/** Unified chunk output from any chunker. */
+export interface UnifiedChunk {
+  content: string            // chunk text
+  heading_context?: string   // copied from markdown chunks only
+  function_name?: string     // copied from code chunks only
+  class_name?: string        // copied from code chunks only
+}
+// ── Fixed chunker (legacy) ──────────────────────────────────────────────────
+/**
+ * Pack whole lines into chunks of roughly `maxChars` characters.
+ *
+ * Each line costs its length plus one (for the newline). A chunk is closed
+ * as soon as adding the next line would exceed the budget; a single line
+ * longer than the budget still becomes its own chunk. Lines are never split.
+ */
+function chunkFixed(content: string, maxChars: number): UnifiedChunk[] {
+  const out: UnifiedChunk[] = []
+  let pending: string[] = []
+  let pendingChars = 0
+  const flush = (): void => {
+    out.push({ content: pending.join("\n") })
+    pending = []
+    pendingChars = 0
+  }
+  content.split("\n").forEach((row) => {
+    const cost = row.length + 1
+    const wouldOverflow = pendingChars + cost > maxChars
+    if (wouldOverflow && pending.length > 0) flush()
+    pending.push(row)
+    pendingChars += cost
+  })
+  if (pending.length > 0) flush()
+  return out
+}
+// ── Public API ──────────────────────────────────────────────────────────────
+/**
+ * Split `content` into chunks with the chunker that fits the file.
+ *
+ * Routing: strategy "fixed" always uses the fixed chunker. Otherwise docs
+ * files (or language "markdown") go to the markdown chunker, code files go
+ * to the code chunker, and everything else falls back to the fixed chunker.
+ *
+ * @param content  Full file text.
+ * @param fileType File category from the metadata extractor.
+ * @param language Language label; only "markdown" is inspected here.
+ * @param config   Chunking options; defaults to `DEFAULT_CHUNKING_CONFIG`.
+ */
+export function chunkContent(
+  content: string,
+  fileType: FileType,
+  language: string,
+  config: ChunkingConfig = DEFAULT_CHUNKING_CONFIG,
+): UnifiedChunk[] {
+  const structureAware = config.strategy !== "fixed"
+  const isMarkdown = fileType === "docs" || language === "markdown"
+  if (structureAware && isMarkdown) {
+    return chunkMarkdown(content, config.markdown).map((piece) => {
+      const { heading_context } = piece
+      return { content: piece.content, heading_context }
+    })
+  }
+  if (structureAware && fileType === "code") {
+    return chunkCode(content, config.code).map((piece) => {
+      const { function_name, class_name } = piece
+      return { content: piece.content, function_name, class_name }
+    })
+  }
+  // Forced "fixed" strategy, config files, or unknown file types.
+  return chunkFixed(content, config.fixed.max_chars)
+}

package/vectorizer/chunkers/code-chunker.ts ADDED Viewed

@@ -0,0 +1,325 @@
+/**
+ * Code Chunker — splits source code by functions, classes, and exports.
+ *
+ * Uses regex-based parsing (no AST dependency) to detect function/class
+ * boundaries. Falls back to line-based splitting for unstructured code.
+ */
+export interface CodeChunkConfig {
+  min_chunk_size: number               // gaps between blocks shorter than this (in characters) are merged into the previous chunk
+  max_chunk_size: number               // chunks longer than this (in characters) are re-split on line boundaries
+  split_by_functions: boolean          // false → skip block detection and split purely by lines
+  include_function_signature: boolean  // NOTE(review): not read by any code visible in this file — confirm usage elsewhere
+}
+export const DEFAULT_CODE_CONFIG: CodeChunkConfig = {
+  min_chunk_size: 300,
+  max_chunk_size: 1500,
+  split_by_functions: true,
+  include_function_signature: true,
+}
+export interface CodeChunk {
+  content: string          // chunk text
+  function_name?: string   // set for function and method chunks
+  class_name?: string      // set for class chunks and for pieces cut out of a class
+}
+// ── Block detection ─────────────────────────────────────────────────────────
+interface CodeBlock {
+  type: "function" | "class" | "method" | "other"  // NOTE(review): "other" is never produced by the visible detection code
+  name: string          // identifier captured by the matching declaration pattern
+  className?: string    // owning class; set for "method" blocks
+  startLine: number     // 0-based index of the declaration line
+  endLine: number       // 0-based index of the last line of the block (inclusive)
+}
+/**
+ * Detect function/class blocks with regex heuristics and brace counting.
+ * Works for JS/TS/Go/Rust/Java/C-family languages; Python-style headers are
+ * delegated to indentation detection by `findBlockEnd`.
+ *
+ * Classes are reported as one "class" block followed by one "method" block
+ * per method found inside them. Lines after a detected block's end are
+ * scanned next, so top-level blocks never overlap.
+ *
+ * @param lines File content split into lines.
+ * @returns Blocks in order of appearance, with 0-based inclusive line ranges.
+ */
+function detectBlocks(lines: string[]): CodeBlock[] {
+  const blocks: CodeBlock[] = []
+  // The indented-function and method patterns also match control-flow
+  // statements such as `  if (x) {`; these captured "names" are rejected.
+  const controlKeywords = new Set(["if", "for", "while", "switch", "catch", "with"])
+  // Patterns for function/class declarations
+  const fnPatterns = [
+    // JS/TS: function name(, async function name(, export function
+    /(?:export\s+)?(?:async\s+)?function\s+(\w+)/,
+    // Arrow: const name = (…) =>  or  const name = async (
+    /(?:export\s+)?(?:const|let|var)\s+(\w+)\s*=\s*(?:async\s+)?(?:\([^)]*\)|[^=])\s*=>/,
+    // Method inside class: name( or async name(
+    /^\s+(?:async\s+)?(\w+)\s*\([^)]*\)\s*(?::\s*\w[^{]*)?\s*\{/,
+    // Go: func Name(
+    /^func\s+(?:\([^)]*\)\s+)?(\w+)\s*\(/,
+    // Rust: fn name(
+    /(?:pub\s+)?(?:async\s+)?fn\s+(\w+)/,
+    // Python def
+    /^\s*(?:async\s+)?def\s+(\w+)\s*\(/,
+  ]
+  const classPatterns = [
+    // JS/TS/Java/C#: class Name
+    /(?:export\s+)?(?:abstract\s+)?class\s+(\w+)/,
+    // Rust: struct/enum/impl
+    /(?:pub\s+)?(?:struct|enum|impl)\s+(\w+)/,
+    // Python class
+    /^class\s+(\w+)/,
+  ]
+  // Indented `name(...) {` with optional modifiers and return type.
+  const methodPattern = /^\s+(?:(?:public|private|protected|static|async|override)\s+)*(\w+)\s*\([^)]*\)\s*(?::\s*[^{]*)?\s*\{/
+  // First pattern (in list order) that matches the line, or null.
+  const firstMatch = (text: string, patterns: RegExp[]): RegExpMatchArray | null => {
+    for (const pat of patterns) {
+      const hit = text.match(pat)
+      if (hit) return hit
+    }
+    return null
+  }
+  let i = 0
+  while (i < lines.length) {
+    const line = lines[i]
+    // Check for class
+    const classMatch = firstMatch(line, classPatterns)
+    if (classMatch) {
+      const name = classMatch[1]
+      const endLine = findBlockEnd(lines, i)
+      blocks.push({ type: "class", name, startLine: i, endLine })
+      // Look for methods inside (constructors included)
+      for (let j = i + 1; j < endLine; j++) {
+        const methodMatch = lines[j].match(methodPattern)
+        if (!methodMatch || controlKeywords.has(methodMatch[1])) continue
+        const mEnd = findBlockEnd(lines, j)
+        blocks.push({
+          type: "method",
+          name: methodMatch[1],
+          className: name,
+          startLine: j,
+          endLine: mEnd,
+        })
+        // Skip the method body so nested statements are not re-examined.
+        j = mEnd
+      }
+      i = endLine + 1
+      continue
+    }
+    // Check for standalone function
+    const fnMatch = firstMatch(line, fnPatterns)
+    if (fnMatch && !controlKeywords.has(fnMatch[1])) {
+      const name = fnMatch[1]
+      const endLine = findBlockEnd(lines, i)
+      blocks.push({ type: "function", name, startLine: i, endLine })
+      i = endLine + 1
+      continue
+    }
+    i++
+  }
+  return blocks
+}
+/**
+ * Locate the last line of the block that starts at `startLine`.
+ *
+ * A header that ends in `:` and contains no `{` is treated as indent-based
+ * and handed to `findPythonBlockEnd`. Otherwise braces are counted from the
+ * header onward and the block ends on the first line where the count is back
+ * to zero (or below) after at least one `{` was seen. If that never happens,
+ * the result is capped at 50 lines past the header, or the last line.
+ */
+function findBlockEnd(lines: string[], startLine: number): number {
+  const header = lines[startLine]
+  const endsWithColon = header.match(/:\s*$/)
+  if (endsWithColon && !header.includes("{")) {
+    return findPythonBlockEnd(lines, startLine)
+  }
+  let depth = 0
+  let opened = false
+  for (let idx = startLine; idx < lines.length; idx += 1) {
+    for (const ch of lines[idx]) {
+      if (ch === "{") {
+        depth += 1
+        opened = true
+      } else if (ch === "}") {
+        depth -= 1
+      }
+    }
+    if (opened && depth <= 0) return idx
+  }
+  // Braces never balanced: fall back to a bounded window.
+  return Math.min(startLine + 50, lines.length - 1)
+}
+/**
+ * Locate the last line of an indent-based (Python-style) block.
+ *
+ * Scans forward from the header and stops at the first non-blank line whose
+ * indentation is not deeper than the header's; the line before it is the
+ * block end. Blank lines never end the block. Without such a line the block
+ * runs to the end of the file.
+ */
+function findPythonBlockEnd(lines: string[], startLine: number): number {
+  // Width of the leading whitespace run (whole length for all-whitespace text).
+  const indentWidth = (text: string): number => {
+    const firstInk = text.search(/\S/)
+    return firstInk === -1 ? text.length : firstInk
+  }
+  const headerIndent = indentWidth(lines[startLine])
+  let cursor = startLine + 1
+  while (cursor < lines.length) {
+    const row = lines[cursor]
+    const isBlank = row.trim() === ""
+    if (!isBlank && indentWidth(row) <= headerIndent) {
+      return cursor - 1
+    }
+    cursor += 1
+  }
+  return lines.length - 1
+}
+// ── Fallback: line-based splitting ──────────────────────────────────────────
+/**
+ * Group consecutive lines into chunks of roughly `maxChars` characters.
+ *
+ * Each line costs its length plus one. A chunk is closed when the next line
+ * would push it over the budget; a single over-long line is kept whole as
+ * its own chunk. An empty `lines` array yields no chunks.
+ */
+function splitByLines(lines: string[], maxChars: number): CodeChunk[] {
+  const chunks: CodeChunk[] = []
+  let windowStart = 0 // index of the first line in the chunk being built
+  let windowChars = 0
+  for (let idx = 0; idx < lines.length; idx++) {
+    const weight = lines[idx].length + 1
+    const windowHasLines = idx > windowStart
+    if (windowChars + weight > maxChars && windowHasLines) {
+      chunks.push({ content: lines.slice(windowStart, idx).join("\n") })
+      windowStart = idx
+      windowChars = 0
+    }
+    windowChars += weight
+  }
+  if (windowStart < lines.length) {
+    chunks.push({ content: lines.slice(windowStart).join("\n") })
+  }
+  return chunks
+}
+// ── Public API ──────────────────────────────────────────────────────────────
+/**
+ * Chunk source code by functions/classes.
+ *
+ * With `split_by_functions` off, or when no block is detected, the content
+ * is split purely on line boundaries. Otherwise each detected function or
+ * class becomes a chunk, code between blocks becomes "gap" chunks (small
+ * gaps are appended to the previous chunk), and a class larger than
+ * `max_chunk_size` is broken into its declaration/preamble, one chunk per
+ * method, and a tail. Any chunk still over `max_chunk_size` is re-split by
+ * lines, and whitespace-only chunks are dropped.
+ *
+ * @param content Full source text.
+ * @param config  Chunking options; defaults to `DEFAULT_CODE_CONFIG`.
+ * @returns Chunks in source order, tagged with function/class names when known.
+ */
+export function chunkCode(
+  content: string,
+  config: CodeChunkConfig = DEFAULT_CODE_CONFIG,
+): CodeChunk[] {
+  const lines = content.split("\n")
+  if (!config.split_by_functions) {
+    return splitByLines(lines, config.max_chunk_size)
+  }
+  const blocks = detectBlocks(lines)
+  if (blocks.length === 0) {
+    // No recognizable blocks — fallback
+    return splitByLines(lines, config.max_chunk_size)
+  }
+  const chunks: CodeChunk[] = []
+  // Index of the last line already emitted; -1 means nothing yet.
+  let lastEnd = -1
+  for (const block of blocks) {
+    // Methods are emitted through their owning class block, not on their own.
+    if (block.type === "method") continue
+    // Gap before this block
+    if (block.startLine > lastEnd + 1) {
+      const gapContent = lines.slice(lastEnd + 1, block.startLine).join("\n").trim()
+      if (gapContent.length >= config.min_chunk_size) {
+        chunks.push({ content: gapContent })
+      } else if (gapContent.length > 0 && chunks.length > 0) {
+        // Merge small gap with previous chunk
+        chunks[chunks.length - 1].content += "\n\n" + gapContent
+      } else if (gapContent.length > 0) {
+        chunks.push({ content: gapContent })
+      }
+    }
+    const blockContent = lines.slice(block.startLine, block.endLine + 1).join("\n")
+    if (blockContent.length > config.max_chunk_size && block.type === "class") {
+      // Split class into methods
+      const methods = blocks.filter(
+        (b) => b.type === "method" && b.className === block.name,
+      )
+      if (methods.length > 0) {
+        // Start one line before the class so the preamble slice below
+        // begins at the declaration line itself instead of skipping it.
+        let classLastEnd = block.startLine - 1
+        for (const method of methods) {
+          // Class declaration + preamble, or gap between two methods
+          if (method.startLine > classLastEnd + 1) {
+            const gap = lines.slice(classLastEnd + 1, method.startLine).join("\n").trim()
+            if (gap) {
+              chunks.push({
+                content: gap,
+                class_name: block.name,
+              })
+            }
+          }
+          chunks.push({
+            content: lines.slice(method.startLine, method.endLine + 1).join("\n"),
+            function_name: method.name,
+            class_name: block.name,
+          })
+          classLastEnd = method.endLine
+        }
+        // Class tail (anything after the last method, including the closing brace)
+        if (classLastEnd < block.endLine) {
+          const tail = lines.slice(classLastEnd + 1, block.endLine + 1).join("\n").trim()
+          if (tail) {
+            chunks.push({ content: tail, class_name: block.name })
+          }
+        }
+      } else {
+        // No methods found — split by lines
+        const subChunks = splitByLines(
+          lines.slice(block.startLine, block.endLine + 1),
+          config.max_chunk_size,
+        )
+        for (const sc of subChunks) {
+          sc.class_name = block.name
+          chunks.push(sc)
+        }
+      }
+    } else {
+      chunks.push({
+        content: blockContent,
+        function_name: block.type === "function" ? block.name : undefined,
+        class_name: block.type === "class" ? block.name : block.className,
+      })
+    }
+    lastEnd = block.endLine
+  }
+  // Trailing code after last block
+  if (lastEnd < lines.length - 1) {
+    const trailing = lines.slice(lastEnd + 1).join("\n").trim()
+    if (trailing.length > 0) {
+      chunks.push({ content: trailing })
+    }
+  }
+  // Final: split any chunk still too large, keeping its name tags on every part
+  const result: CodeChunk[] = []
+  for (const chunk of chunks) {
+    if (chunk.content.length > config.max_chunk_size) {
+      const parts = splitByLines(chunk.content.split("\n"), config.max_chunk_size)
+      for (const p of parts) {
+        result.push({ ...chunk, content: p.content })
+      }
+    } else {
+      result.push(chunk)
+    }
+  }
+  return result.filter((c) => c.content.trim().length > 0)
+}