npm - @comfanion/usethis_search - Versions diffs - 0.1.5 → 0.2.0-dev.0 - Mend

@comfanion/usethis_search 0.1.5 → 0.2.0-dev.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

package/README.md +98 -7
package/file-indexer.ts +21 -1
package/package.json +12 -2
package/tools/codeindex.ts +135 -16
package/tools/search.ts +46 -11
package/vectorizer/bm25-index.ts +155 -0
package/vectorizer/chunkers/chunker-factory.ts +98 -0
package/vectorizer/chunkers/code-chunker.ts +325 -0
package/vectorizer/chunkers/markdown-chunker.ts +177 -0
package/vectorizer/content-cleaner.ts +136 -0
package/vectorizer/hybrid-search.ts +97 -0
package/vectorizer/index.js +395 -16
package/vectorizer/metadata-extractor.ts +125 -0
package/vectorizer/query-cache.ts +126 -0
package/vectorizer/search-metrics.ts +155 -0
package/vectorizer.yaml +81 -0

package/vectorizer/index.js CHANGED Viewed

@@ -1,4 +1,5 @@
 // OpenCode Vectorizer - Semantic Code Search with Multi-Index Support
+// v2: Content cleaning, semantic chunking, hybrid search, metadata, cache, metrics
 import { pipeline, env } from "@xenova/transformers";
 import * as lancedb from "vectordb";
@@ -6,6 +7,15 @@ import fs from "fs/promises";
 import path from "path";
 import crypto from "crypto";
+// ── New modules ─────────────────────────────────────────────────────────────
+import { cleanContent, DEFAULT_CLEANING_CONFIG } from "./content-cleaner.ts";
+import { extractFileMetadata, detectFileType, detectLanguage } from "./metadata-extractor.ts";
+import { chunkContent, DEFAULT_CHUNKING_CONFIG } from "./chunkers/chunker-factory.ts";
+import { BM25Index } from "./bm25-index.ts";
+import { mergeResults, DEFAULT_HYBRID_CONFIG } from "./hybrid-search.ts";
+import { QueryCache, DEFAULT_CACHE_CONFIG } from "./query-cache.ts";
+import { SearchMetrics } from "./search-metrics.ts";
 // Suppress transformers.js logs unless DEBUG is set
 const DEBUG = process.env.DEBUG?.includes("vectorizer") || process.env.DEBUG === "*";
 if (!DEBUG) {
@@ -57,6 +67,13 @@ let GLOBAL_IGNORE = [];
 // Default embedding model (fast). Can be overridden by config.
 let EMBEDDING_MODEL = "Xenova/all-MiniLM-L6-v2";
+// ── Extended config parsed from YAML ────────────────────────────────────────
+let CLEANING_CONFIG = { ...DEFAULT_CLEANING_CONFIG };
+let CHUNKING_CONFIG = { ...DEFAULT_CHUNKING_CONFIG };
+let HYBRID_CONFIG = { ...DEFAULT_HYBRID_CONFIG };
+let METRICS_ENABLED = false;
+let CACHE_ENABLED = true;
 function defaultVectorizerYaml() {
   return (
     `vectorizer:\n` +
@@ -64,6 +81,40 @@ function defaultVectorizerYaml() {
     `  auto_index: true\n` +
     `  model: \"${EMBEDDING_MODEL}\"\n` +
     `  debounce_ms: 1000\n` +
+    `\n` +
+    `  # Content cleaning before chunking\n` +
+    `  cleaning:\n` +
+    `    remove_toc: true\n` +
+    `    remove_frontmatter_metadata: false\n` +
+    `    remove_imports: false\n` +
+    `    remove_comments: false\n` +
+    `\n` +
+    `  # Chunking strategy\n` +
+    `  chunking:\n` +
+    `    strategy: \"semantic\"  # fixed | semantic\n` +
+    `    markdown:\n` +
+    `      split_by_headings: true\n` +
+    `      min_chunk_size: 200\n` +
+    `      max_chunk_size: 2000\n` +
+    `      preserve_heading_hierarchy: true\n` +
+    `    code:\n` +
+    `      split_by_functions: true\n` +
+    `      include_function_signature: true\n` +
+    `      min_chunk_size: 300\n` +
+    `      max_chunk_size: 1500\n` +
+    `    fixed:\n` +
+    `      max_chars: 1500\n` +
+    `\n` +
+    `  # Hybrid search (vector + BM25)\n` +
+    `  search:\n` +
+    `    hybrid: false\n` +
+    `    bm25_weight: 0.3\n` +
+    `\n` +
+    `  # Quality monitoring\n` +
+    `  quality:\n` +
+    `    enable_metrics: false\n` +
+    `    enable_cache: true\n` +
+    `\n` +
     `  indexes:\n` +
     `    code:\n` +
     `      enabled: true\n` +
@@ -104,8 +155,25 @@ async function ensureDefaultConfig(projectRoot) {
   }
 }
+// ── YAML mini-parser helpers ────────────────────────────────────────────────
+function parseBool(section, key, fallback) {
+  const m = section.match(new RegExp(`^\\s+${key}:\\s*(true|false)`, "m"));
+  return m ? m[1] === "true" : fallback;
+}
+function parseNumber(section, key, fallback) {
+  const m = section.match(new RegExp(`^\\s+${key}:\\s*(\\d+(?:\\.\\d+)?)`, "m"));
+  return m ? parseFloat(m[1]) : fallback;
+}
+function parseString(section, key, fallback) {
+  const m = section.match(new RegExp(`^\\s+${key}:\\s*["']?([^"'\\n]+)["']?`, "m"));
+  return m ? m[1].trim() : fallback;
+}
 /**
- * Load index configuration from .opencode/vectorizer.yaml (preferred) or .opencode/config.yaml.
+ * Load index configuration from .opencode/vectorizer.yaml.
  */
 async function loadConfig(projectRoot) {
   try {
@@ -142,6 +210,61 @@ async function loadConfig(projectRoot) {
       if (DEBUG) console.log("[vectorizer] Using model from config:", EMBEDDING_MODEL);
     }
+    // ── Parse cleaning config ───────────────────────────────────────────────
+    const cleaningMatch = section.match(/^\s{2}cleaning:\s*\n([\s\S]*?)(?=^\s{2}[a-zA-Z_\-]+:|\Z)/m);
+    if (cleaningMatch) {
+      const cs = cleaningMatch[1];
+      CLEANING_CONFIG = {
+        remove_toc: parseBool(cs, "remove_toc", true),
+        remove_frontmatter_metadata: parseBool(cs, "remove_frontmatter_metadata", false),
+        remove_imports: parseBool(cs, "remove_imports", false),
+        remove_comments: parseBool(cs, "remove_comments", false),
+      };
+    }
+    // ── Parse chunking config ───────────────────────────────────────────────
+    const chunkingMatch = section.match(/^\s{2}chunking:\s*\n([\s\S]*?)(?=^\s{2}[a-zA-Z_\-]+:|\Z)/m);
+    if (chunkingMatch) {
+      const cs = chunkingMatch[1];
+      const strategy = parseString(cs, "strategy", "semantic");
+      CHUNKING_CONFIG = {
+        strategy: strategy,
+        markdown: {
+          split_by_headings: parseBool(cs, "split_by_headings", true),
+          min_chunk_size: parseNumber(cs, "min_chunk_size", 200),
+          max_chunk_size: parseNumber(cs, "max_chunk_size", 2000),
+          preserve_heading_hierarchy: parseBool(cs, "preserve_heading_hierarchy", true),
+        },
+        code: {
+          split_by_functions: parseBool(cs, "split_by_functions", true),
+          include_function_signature: parseBool(cs, "include_function_signature", true),
+          min_chunk_size: parseNumber(cs, "min_chunk_size", 300),
+          max_chunk_size: parseNumber(cs, "max_chunk_size", 1500),
+        },
+        fixed: {
+          max_chars: parseNumber(cs, "max_chars", 1500),
+        },
+      };
+    }
+    // ── Parse search config ─────────────────────────────────────────────────
+    const searchMatch = section.match(/^\s{2}search:\s*\n([\s\S]*?)(?=^\s{2}[a-zA-Z_\-]+:|\Z)/m);
+    if (searchMatch) {
+      const ss = searchMatch[1];
+      HYBRID_CONFIG = {
+        enabled: parseBool(ss, "hybrid", false),
+        bm25_weight: parseNumber(ss, "bm25_weight", 0.3),
+      };
+    }
+    // ── Parse quality config ────────────────────────────────────────────────
+    const qualityMatch = section.match(/^\s{2}quality:\s*\n([\s\S]*?)(?=^\s{2}[a-zA-Z_\-]+:|\Z)/m);
+    if (qualityMatch) {
+      const qs = qualityMatch[1];
+      METRICS_ENABLED = parseBool(qs, "enable_metrics", false);
+      CACHE_ENABLED = parseBool(qs, "enable_cache", true);
+    }
     // Parse global exclude
     const excludeMatch = section.match(/^\s{2}exclude:\s*\n((?:\s{4}-\s+.+\n?)*)/m);
     if (excludeMatch) {
@@ -196,12 +319,25 @@ async function loadConfig(projectRoot) {
       }
     }
-    if (DEBUG) console.log("[vectorizer] Loaded config:", { INDEX_PRESETS, GLOBAL_IGNORE });
+    if (DEBUG) console.log("[vectorizer] Loaded config:", { INDEX_PRESETS, GLOBAL_IGNORE, HYBRID_CONFIG, CHUNKING_CONFIG });
   } catch {
     if (DEBUG) console.log("[vectorizer] Using default presets (config load failed)");
   }
 }
+// ── Shared query cache (singleton per process) ─────────────────────────────
+let _queryCache = null;
+function getQueryCache() {
+  if (!_queryCache) _queryCache = new QueryCache(DEFAULT_CACHE_CONFIG);
+  return _queryCache;
+}
+function clearQueryCache() {
+  if (_queryCache) {
+    _queryCache.destroy();
+    _queryCache = null;
+  }
+}
 class CodebaseIndexer {
   constructor(projectRoot, indexName = "code") {
     this.root = projectRoot;
@@ -212,6 +348,8 @@ class CodebaseIndexer {
     this.db = null;
     this.hashes = {};
     this.configLoaded = false;
+    this.bm25 = null;          // lazy-built BM25 index
+    this.metrics = null;       // lazy-loaded SearchMetrics
   }
   async init() {
@@ -227,17 +365,30 @@ class CodebaseIndexer {
   async loadModel() {
     if (!this.model) {
-      if (DEBUG) console.log(`[vectorizer] Loading embedding model: ${EMBEDDING_MODEL}...`);
-      this.model = await pipeline("feature-extraction", EMBEDDING_MODEL, {
-        progress_callback: DEBUG ? undefined : null,
-      });
-      if (DEBUG) console.log(`[vectorizer] Model loaded: ${EMBEDDING_MODEL}`);
+      try {
+        if (DEBUG) console.log(`[vectorizer] Loading embedding model: ${EMBEDDING_MODEL}...`);
+        this.model = await pipeline("feature-extraction", EMBEDDING_MODEL, {
+          progress_callback: DEBUG ? undefined : null,
+        });
+        if (DEBUG) console.log(`[vectorizer] Model loaded: ${EMBEDDING_MODEL}`);
+      } catch (error) {
+        this.model = null;
+        throw new Error(`Model loading failed: ${error.message || error}`);
+      }
     }
     return this.model;
   }
   async unloadModel() {
+    // Drops the embedding model reference together with the in-memory BM25
+    // index, its row snapshot and the metrics object, then destroys the
+    // module-level query cache (shared by every indexer in this process).
     this.model = null;
+    // Release BM25 data held in memory
+    if (this.bm25) {
+      this.bm25.clear();
+      this.bm25 = null;
+    }
+    this._bm25Rows = null;
+    this.metrics = null;
+    clearQueryCache();
+    // global.gc is only defined when the runtime exposes it; otherwise this is a no-op.
     if (global.gc) global.gc();
   }
@@ -274,12 +425,28 @@ class CodebaseIndexer {
     return false;
   }
+  // ── Embedding (with optional cache) ───────────────────────────────────────
   async embed(text) {
+    // Loads the model on demand, runs it on `text` with mean pooling and
+    // normalization requested, and returns the result data as a plain array.
     const model = await this.loadModel();
     const result = await model(text, { pooling: "mean", normalize: true });
     return Array.from(result.data);
   }
+  async embedQuery(text) {
+    if (CACHE_ENABLED) {
+      const cache = getQueryCache();
+      const cached = cache.get(text);
+      if (cached) return cached;
+      const embedding = await this.embed(text);
+      cache.set(text, embedding);
+      return embedding;
+    }
+    return this.embed(text);
+  }
+  // ── Legacy chunker (kept for backward compat / "fixed" strategy) ──────────
   chunkCode(content, maxChars = 1500) {
     const chunks = [];
     const lines = content.split("\n");
@@ -309,6 +476,8 @@ class CodebaseIndexer {
     return this.hashes[relPath] !== currentHash;
   }
+  // ── Index a single file (v2: cleaning + semantic chunking + metadata) ─────
   async indexFile(filePath) {
     const relPath = path.relative(this.root, filePath);
@@ -324,21 +493,39 @@ class CodebaseIndexer {
       return false;
     }
-    const chunks = this.chunkCode(content);
+    // Extract metadata
+    const fileMeta = await extractFileMetadata(filePath, content);
     const archived = this.isArchived(relPath, content);
-    const data = [];
+    // Clean content before chunking
+    const cleaned = cleanContent(content, fileMeta.file_type, CLEANING_CONFIG);
+    // Semantic chunking
+    const chunks = chunkContent(cleaned, fileMeta.file_type, fileMeta.language, CHUNKING_CONFIG);
+    const data = [];
     for (let i = 0; i < chunks.length; i++) {
-      const embedding = await this.embed(chunks[i]);
+      const embedding = await this.embed(chunks[i].content);
       data.push({
         file: relPath,
         chunk_index: i,
-        content: chunks[i],
+        content: chunks[i].content,
         vector: embedding,
         archived: archived,
+        // v2 metadata
+        file_type: fileMeta.file_type,
+        language: fileMeta.language,
+        last_modified: fileMeta.last_modified,
+        file_size: fileMeta.file_size,
+        heading_context: chunks[i].heading_context || "",
+        function_name: chunks[i].function_name || "",
+        class_name: chunks[i].class_name || "",
+        tags: (fileMeta.tags || []).join(","),
       });
     }
+    if (data.length === 0) return false;
     const tableName = "chunks";
     const tables = await this.db.tableNames();
     if (tables.includes(tableName)) {
@@ -351,27 +538,189 @@ class CodebaseIndexer {
     this.hashes[relPath] = hash;
     await this.saveHashes();
+    // Invalidate BM25 index (needs rebuild) — release memory
+    if (this.bm25) {
+      this.bm25.clear();
+      this.bm25 = null;
+    }
+    this._bm25Rows = null;
     return true;
   }
-  async search(query, limit = 5, includeArchived = false) {
+  // ── BM25 index management ────────────────────────────────────────────────
+  /**
+   * Return the BM25 index for this indexer, building it from the "chunks"
+   * table on first use. Returns null when the table is missing or empty.
+   * The rows used for the build are kept in this._bm25Rows so BM25 hit ids
+   * (positions in the sorted row list) can be mapped back to rows.
+   */
+  async ensureBM25() {
+    if (this.bm25) return this.bm25;
+    const tableName = "chunks";
+    const tables = await this.db.tableNames();
+    if (!tables.includes(tableName)) return null;
+    const table = await this.db.openTable(tableName);
+    // NOTE(review): a one-element query vector is used to pull rows for a full
+    // scan, capped at 100000 — confirm the vectordb version in use accepts a
+    // vector whose length differs from the stored embeddings.
+    const allRows = await table.search([0]).limit(100000).execute();
+    if (allRows.length === 0) return null;
+    // Sort for stable ID mapping between builds
+    allRows.sort((a, b) => {
+      const ka = `${a.file}:${a.chunk_index}`;
+      const kb = `${b.file}:${b.chunk_index}`;
+      return ka.localeCompare(kb);
+    });
+    // Release previous data before rebuilding
+    // (this.bm25 can only be set here if it was assigned while the awaits above were pending)
+    if (this.bm25) this.bm25.clear();
+    this._bm25Rows = null;
+    this.bm25 = new BM25Index();
+    this.bm25.build(allRows.map((r) => r.content));
+    this._bm25Rows = allRows;
+    return this.bm25;
+  }
+  // ── Search (v2: hybrid + metadata filters + metrics) ──────────────────────
+  async search(query, limit = 5, includeArchived = false, options = {}) {
     const tableName = "chunks";
     const tables = await this.db.tableNames();
     if (!tables.includes(tableName)) {
       return [];
     }
-    const queryEmbedding = await this.embed(query);
+    const queryEmbedding = await this.embedQuery(query);
     const table = await this.db.openTable(tableName);
-    const fetchLimit = includeArchived ? limit : limit * 3;
+    // Only over-fetch when filters or hybrid search are active
+    const hasFilters = !includeArchived || options.fileType || options.language ||
+                       options.modifiedAfter || options.modifiedBefore ||
+                       (options.tags && options.tags.length > 0);
+    const isHybrid = HYBRID_CONFIG.enabled || options.hybrid;
+    const fetchLimit = (hasFilters || isHybrid) ? Math.max(limit * 3, 50) : limit;
     let results = await table.search(queryEmbedding).limit(fetchLimit).execute();
+    // ── Hybrid search ───────────────────────────────────────────────────────
+    if (HYBRID_CONFIG.enabled || options.hybrid) {
+      try {
+        const bm25 = await this.ensureBM25();
+        if (bm25 && this._bm25Rows) {
+          const bm25Results = bm25.search(query, fetchLimit);
+          // Build score maps
+          const vectorScores = new Map();
+          for (let i = 0; i < results.length; i++) {
+            const score = results[i]._distance != null ? 1 - results[i]._distance : 0.5;
+            vectorScores.set(i, score);
+          }
+          const bm25Scores = new Map();
+          for (const r of bm25Results) {
+            bm25Scores.set(r.id, r.score);
+          }
+          // We need a unified ID space. Since vector results and BM25 results
+          // reference different row sets, we use the full table rows for BM25
+          // and merge by file+chunk_index key.
+          const resultMap = new Map();
+          for (let i = 0; i < results.length; i++) {
+            const key = `${results[i].file}:${results[i].chunk_index}`;
+            const vs = results[i]._distance != null ? 1 - results[i]._distance : 0.5;
+            resultMap.set(key, { row: results[i], vectorScore: vs, bm25Score: 0 });
+          }
+          for (const br of bm25Results) {
+            if (br.id < this._bm25Rows.length) {
+              const bRow = this._bm25Rows[br.id];
+              const key = `${bRow.file}:${bRow.chunk_index}`;
+              if (resultMap.has(key)) {
+                resultMap.get(key).bm25Score = br.score;
+              } else {
+                resultMap.set(key, { row: bRow, vectorScore: 0, bm25Score: br.score });
+              }
+            }
+          }
+          // Normalize BM25 scores
+          let maxBM25 = 0;
+          for (const v of resultMap.values()) {
+            if (v.bm25Score > maxBM25) maxBM25 = v.bm25Score;
+          }
+          const bw = (options.bm25_weight ?? HYBRID_CONFIG.bm25_weight) || 0.3;
+          const vw = 1 - bw;
+          const merged = [];
+          for (const v of resultMap.values()) {
+            const normBM25 = maxBM25 > 0 ? v.bm25Score / maxBM25 : 0;
+            const combined = vw * v.vectorScore + bw * normBM25;
+            merged.push({ ...v.row, _combinedScore: combined, _distance: v.row._distance });
+          }
+          merged.sort((a, b) => b._combinedScore - a._combinedScore);
+          results = merged;
+        }
+      } catch (e) {
+        if (DEBUG) console.log("[vectorizer] Hybrid search fallback:", e.message);
+        // Fall through to vector-only results
+      }
+    }
+    // ── Metadata filters ──────────────────────────────────────────────────
     if (!includeArchived) {
       results = results.filter((r) => !r.archived);
     }
-    return results.slice(0, limit);
+    if (options.fileType) {
+      results = results.filter((r) => r.file_type === options.fileType);
+    }
+    if (options.language) {
+      results = results.filter((r) => r.language === options.language);
+    }
+    if (options.modifiedAfter) {
+      const after = new Date(options.modifiedAfter).getTime();
+      results = results.filter((r) => r.last_modified && new Date(r.last_modified).getTime() >= after);
+    }
+    if (options.modifiedBefore) {
+      const before = new Date(options.modifiedBefore).getTime();
+      results = results.filter((r) => r.last_modified && new Date(r.last_modified).getTime() <= before);
+    }
+    if (options.tags && options.tags.length > 0) {
+      results = results.filter((r) => {
+        const rowTags = (r.tags || "").split(",").filter(Boolean);
+        return options.tags.some((t) => rowTags.includes(t));
+      });
+    }
+    const finalResults = results.slice(0, limit);
+    // ── Metrics tracking ────────────────────────────────────────────────────
+    if (METRICS_ENABLED) {
+      try {
+        if (!this.metrics) {
+          this.metrics = new SearchMetrics(this.root);
+          await this.metrics.load();
+        }
+        const scores = finalResults.map((r) =>
+          r._combinedScore != null
+            ? r._combinedScore
+            : r._distance != null
+              ? 1 - r._distance
+              : 0
+        );
+        this.metrics.recordQuery(query, this.indexName, scores, HYBRID_CONFIG.enabled || !!options.hybrid);
+        await this.metrics.save();
+      } catch {
+        // non-fatal
+      }
+    }
+    return finalResults;
   }
   async checkHealth(extraIgnore = []) {
@@ -478,7 +827,14 @@ class CodebaseIndexer {
   async indexSingleFile(filePath) {
     const absPath = path.isAbsolute(filePath) ? filePath : path.join(this.root, filePath);
-    return await this.indexFile(absPath);
+    // Prevent path traversal outside project root
+    const normalized = path.normalize(absPath);
+    const relative = path.relative(this.root, normalized);
+    if (relative.startsWith("..") || path.isAbsolute(relative)) {
+      if (DEBUG) console.log(`[vectorizer] Path traversal blocked: ${filePath}`);
+      return false;
+    }
+    return await this.indexFile(normalized);
   }
   async getStats() {
@@ -500,6 +856,12 @@ class CodebaseIndexer {
       model: EMBEDDING_MODEL,
       fileCount,
       chunkCount,
+      features: {
+        chunking: CHUNKING_CONFIG.strategy,
+        hybrid: HYBRID_CONFIG.enabled,
+        metrics: METRICS_ENABLED,
+        cache: CACHE_ENABLED,
+      },
     };
   }
@@ -525,12 +887,19 @@ class CodebaseIndexer {
   async clear() {
+    // Removes this indexer's cache directory, resets the file-hash map and the
+    // in-memory BM25/metrics state, then re-runs init().
     await fs.rm(this.cacheDir, { recursive: true, force: true });
     this.hashes = {};
+    if (this.bm25) { this.bm25.clear(); this.bm25 = null; }
+    this._bm25Rows = null;
+    this.metrics = null;
     await this.init();
   }
   async clearAll() {
+    // Removes the whole base directory (not just this indexer's cache dir),
+    // resets in-memory state, destroys the shared query cache, then re-runs init().
     await fs.rm(this.baseDir, { recursive: true, force: true });
     this.hashes = {};
+    if (this.bm25) { this.bm25.clear(); this.bm25 = null; }
+    this._bm25Rows = null;
+    this.metrics = null;
+    clearQueryCache();
     await this.init();
   }
@@ -546,6 +915,16 @@ class CodebaseIndexer {
     } catch {}
     return indexes;
   }
+  // ── Metrics access ────────────────────────────────────────────────────────
+  async getMetrics() {
+    if (!this.metrics) {
+      this.metrics = new SearchMetrics(this.root);
+      await this.metrics.load();
+    }
+    return this.metrics.getSummary();
+  }
 }
 function getEmbeddingModel() {

package/vectorizer/metadata-extractor.ts ADDED Viewed

@@ -0,0 +1,125 @@
+/**
+ * Metadata Extractor — derives file-level metadata from file path + content.
+ *
+ * Produces file_type, language, last_modified, file_size, and frontmatter tags.
+ * The per-chunk fields declared on ChunkMetadata (heading_context,
+ * function_name, class_name) are not computed by the functions in this file.
+ */
+import path from "path"
+import fs from "fs/promises"
+// ── Types ───────────────────────────────────────────────────────────────────
+export type FileType = "code" | "docs" | "config"
+/** File-level metadata shared by every chunk produced from one file. */
+export interface FileMetadata {
+  file_type: FileType
+  language: string
+  last_modified: string   // ISO timestamp
+  file_size: number       // bytes
+  tags: string[]          // front-matter tags; empty for non-docs files
+}
+/**
+ * Shape of a single chunk record: the file-level metadata plus the chunk's
+ * own location, text, and embedding. The optional fields are per-chunk
+ * context that is not computed by the functions in this file.
+ */
+export interface ChunkMetadata extends FileMetadata {
+  file: string
+  chunk_index: number
+  content: string
+  vector: number[]
+  archived: boolean
+  heading_context?: string
+  function_name?: string
+  class_name?: string
+}
+// ── Extension maps ──────────────────────────────────────────────────────────
+// Lower-case file extension (with leading dot) → language name for source files.
+const CODE_EXTENSIONS: Record<string, string> = {
+  ".js": "javascript", ".mjs": "javascript", ".cjs": "javascript",
+  ".ts": "typescript", ".tsx": "typescript", ".jsx": "javascript",
+  ".py": "python",
+  ".go": "go",
+  ".rs": "rust",
+  ".java": "java", ".kt": "kotlin",
+  ".swift": "swift",
+  ".c": "c", ".cpp": "cpp", ".h": "c", ".hpp": "cpp",
+  ".cs": "csharp",
+  ".rb": "ruby",
+  ".php": "php",
+  ".scala": "scala",
+  ".clj": "clojure",
+}
+// Extensions classified as documentation.
+const DOC_EXTENSIONS = new Set([".md", ".mdx", ".txt", ".rst", ".adoc"])
+// Extensions classified as configuration.
+// NOTE(review): lookups use path.extname(), which returns "" for a bare dotfile
+// such as ".env", so the ".env" entry only matches names like "prod.env" — confirm intent.
+const CONFIG_EXTENSIONS = new Set([
+  ".yaml", ".yml", ".json", ".toml", ".ini", ".xml", ".env",
+])
+// ── Helpers ─────────────────────────────────────────────────────────────────
+/**
+ * Classify a path as "code", "docs" or "config" from its extension
+ * (case-insensitive). Unrecognised extensions are treated as "code".
+ */
+export function detectFileType(filePath: string): FileType {
+  const extension = path.extname(filePath).toLowerCase()
+  if (!CODE_EXTENSIONS[extension]) {
+    if (DOC_EXTENSIONS.has(extension)) return "docs"
+    if (CONFIG_EXTENSIONS.has(extension)) return "config"
+  }
+  return "code"
+}
+/**
+ * Map a path's extension (case-insensitive) to a language label: the mapped
+ * name for code files, "markdown" for .md/.mdx, the bare extension for other
+ * docs and config files, and "unknown" otherwise.
+ */
+export function detectLanguage(filePath: string): string {
+  const extension = path.extname(filePath).toLowerCase()
+  const codeLanguage = CODE_EXTENSIONS[extension]
+  if (codeLanguage) return codeLanguage
+  if (DOC_EXTENSIONS.has(extension)) {
+    const isMarkdown = extension === ".md" || extension === ".mdx"
+    return isMarkdown ? "markdown" : extension.slice(1)
+  }
+  return CONFIG_EXTENSIONS.has(extension) ? extension.slice(1) : "unknown"
+}
+/** Extract tags from YAML front-matter (Markdown only). */
+/**
+ * Extract the `tags` list from a leading YAML front-matter block.
+ *
+ * Supports the block form (`tags:` followed by `- item` lines, indented or
+ * not) and the inline form (`tags: [a, b]`). LF and CRLF line endings and a
+ * leading BOM are accepted; surrounding quotes on individual tags are removed.
+ *
+ * @param content Full file text.
+ * @returns Tag strings in document order, or [] when there is no front-matter
+ *          or no recognised `tags` entry.
+ */
+export function extractFrontmatterTags(content: string): string[] {
+  const match = content.match(/^\uFEFF?---\r?\n([\s\S]*?)\r?\n---/)
+  if (!match) return []
+  const fm = match[1]
+  // Trim a tag and drop one pair of matching surrounding quotes.
+  const unquote = (t: string): string => t.trim().replace(/^(["'])(.*)\1$/, "$2").trim()
+  // Look for `tags:` key — array or inline
+  const tagsMatch = fm.match(/^tags:[ \t]*\r?\n((?:\s*-[ \t]+.+\r?\n?)*)/m)
+  if (tagsMatch) {
+    return tagsMatch[1]
+      .split(/\r?\n/)
+      .map((l) => unquote(l.replace(/^\s*-\s*/, "")))
+      .filter(Boolean)
+  }
+  // Inline: tags: [a, b, c]
+  const inlineMatch = fm.match(/^tags:\s*\[([^\]]*)\]/m)
+  if (inlineMatch) {
+    return inlineMatch[1].split(",").map(unquote).filter(Boolean)
+  }
+  return []
+}
+// ── Public API ──────────────────────────────────────────────────────────────
+/**
+ * Extract base file-level metadata (without per-chunk fields).
+ */
+export async function extractFileMetadata(
+  filePath: string,
+  content: string,
+): Promise<FileMetadata> {
+  // Fallbacks used when the file cannot be stat'ed: "now" and the UTF-8 byte
+  // length of the supplied content.
+  const stamp = {
+    modified: new Date().toISOString(),
+    size: Buffer.byteLength(content, "utf8"),
+  }
+  try {
+    const { mtime, size } = await fs.stat(filePath)
+    stamp.modified = mtime.toISOString()
+    stamp.size = size
+  } catch {
+    // keep the fallbacks
+  }
+  const fileType = detectFileType(filePath)
+  const language = detectLanguage(filePath)
+  // Front-matter tags are only read for documentation files.
+  const tags = fileType === "docs" ? extractFrontmatterTags(content) : []
+  return {
+    file_type: fileType,
+    language,
+    last_modified: stamp.modified,
+    file_size: stamp.size,
+    tags,
+  }
+}