npm - @o-lang/semantic-doc-search - Versions diffs - 1.0.23 → 1.0.24 - Mend

@o-lang/semantic-doc-search 1.0.23 → 1.0.24

This diff shows the differences between the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

package/.cache/embeddings/Xenova/all-MiniLM-L6-v2/config.json +25 -0
package/.cache/embeddings/Xenova/all-MiniLM-L6-v2/onnx/model_quantized.onnx +0 -0
package/.cache/embeddings/Xenova/all-MiniLM-L6-v2/tokenizer.json +30686 -0
package/.cache/embeddings/Xenova/all-MiniLM-L6-v2/tokenizer_config.json +15 -0
package/embeddings.json +1 -514
package/package.json +2 -1
package/src/embeddings/local.js +77 -86
package/src/index.js +76 -36
package/src/test-doc-search.js +13 -0
package/test-doc-search-batch.js +36 -0
package/test-doc-search.js +22 -0
package/test-single-doc.js +32 -0

package/src/embeddings/local.js CHANGED Viewed

@@ -1,123 +1,114 @@
-// src/embeddings/local.js
 /**
- * LocalEmbedding - REAL semantic embeddings using all-MiniLM-L6-v2
- * Uses dynamic import to work with ESM packages in CommonJS environment
+ * LocalEmbedding
+ * ----------------
+ * Real semantic embeddings using all-MiniLM-L6-v2
+ * - Singleton model load
+ * - No silent failures
+ * - No zero vectors
+ * - Deterministic behavior
  */
 class LocalEmbedding {
   constructor() {
     this.dim = 384;
-    this.modelPromise = null;
-    this.transformersPromise = null;
+    // Resolved feature-extraction pipeline; set once a load succeeds.
+    this.model = null;
+    // In-flight load promise shared by concurrent callers; cleared on failure.
+    this.loading = null;
   }
-  /**
-   * Lazy-load the @xenova/transformers package
-   */
-  async getTransformers() {
-    if (!this.transformersPromise) {
-      this.transformersPromise = import('@xenova/transformers');
-    }
-    return this.transformersPromise;
-  }
+  /* ---------------- INTERNAL ---------------- */
-  /**
-   * Lazy-load the embedding model
-   */
-  async getModel() {
-    if (!this.modelPromise) {
-      const { pipeline, env } = await this.getTransformers();
-      // Configure transformers
-      env.allowLocalModels = true;
-      env.backends.onnx.warmup = false;
-      console.log('🔄 Loading local embedding model (first run may take 1-2 minutes)...');
-      this.modelPromise = pipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2', {
-        revision: 'main',
-        cache_dir: './.cache/embeddings'
-      }).then(model => {
-        console.log('✅ Local embedding model loaded successfully!');
+  /**
+   * Load the feature-extraction pipeline on first use and reuse it afterwards.
+   *
+   * Callers that arrive while a load is in flight await the same promise, so
+   * `pipeline()` is invoked once per attempt. If the attempt fails, the
+   * in-flight slot is cleared so that a later call can retry instead of
+   * re-throwing the same cached rejection forever.
+   *
+   * @returns {Promise<Function>} the pipeline returned by `pipeline()`
+   * @throws whatever the dynamic import or `pipeline()` rejects with
+   */
+  async loadModel() {
+    if (this.model) return this.model;
+    if (!this.loading) {
+      this.loading = (async () => {
+        // Dynamic import: the file is CommonJS, the dependency is loaded via import().
+        const { pipeline, env } = await import("@xenova/transformers");
+        // Safe defaults
+        env.allowLocalModels = true;
+        env.backends.onnx.warmup = false;
+        console.log("🔄 Loading local embedding model (first run only)...");
+        const model = await pipeline(
+          "feature-extraction",
+          "Xenova/all-MiniLM-L6-v2",
+          {
+            revision: "main",
+            // NOTE(review): relative path; presumably resolved against the
+            // process cwd rather than this package's directory — confirm,
+            // since the package ships model files under its own .cache/.
+            cache_dir: "./.cache/embeddings",
+          }
+        );
+        console.log("✅ Local embedding model ready");
         return model;
-      }).catch(error => {
-        console.error('❌ Failed to load local embedding model:', error.message);
-        throw error;
-      });
+      })();
     }
-    return this.modelPromise;
+    const pending = this.loading;
+    try {
+      this.model = await pending;
+    } catch (err) {
+      // Only clear the slot if it still holds the attempt that just failed,
+      // so a newer attempt started in the meantime is not discarded.
+      if (this.loading === pending) this.loading = null;
+      throw err;
+    }
+    return this.model;
   }
+  /* ---------------- PUBLIC API ---------------- */
   /**
-   * Generate REAL semantic embedding for text
+   * Generate embedding for a single string
+   *
+   * @param {string} text - non-empty, non-whitespace input
+   * @returns {Promise<number[]>} plain array of `this.dim` numbers
+   * @throws {Error} if `text` is not a non-empty string, if the model output
+   *   does not have `this.dim` entries, or if loading/running the model fails
    */
   async embed(text) {
-    if (!text || !text.trim()) {
-      return new Array(this.dim).fill(0);
+    if (typeof text !== "string" || !text.trim()) {
+      throw new Error("Embedding input must be a non-empty string");
     }
+    // Loaded outside the try block: load failures are not logged as
+    // per-text embedding failures.
+    const model = await this.loadModel();
     try {
-      const model = await this.getModel();
-      const output = await model(text, {
-        pooling: 'mean',
-        normalize: true
+      const output = await model(text, {
+        pooling: "mean",
+        normalize: true,
       });
-      return Array.from(output.data);
-    } catch (error) {
-      console.error(`❌ Embedding failed for: "${text.substring(0, 50)}..."`);
-      return new Array(this.dim).fill(0);
+      const vector = Array.from(output.data);
+      if (vector.length !== this.dim) {
+        throw new Error(
+          `Invalid embedding dimension: ${vector.length} (expected ${this.dim})`
+        );
+      }
+      return vector;
+    } catch (err) {
+      // Log a short prefix of the input for context, then let the caller decide.
+      console.error(
+        `❌ Embedding failed for text: "${text.slice(0, 60)}..."`
+      );
+      throw err;
     }
   }
   /**
-   * Batch embedding for multiple strings
+   * Batch embedding (sequential, safe)
+   *
+   * Items are embedded one after another; the first item that fails rejects
+   * the whole batch.
+   *
+   * @param {string[]} [texts=[]] - strings to embed
+   * @returns {Promise<number[][]>} one vector per input, in input order
+   * @throws {Error} if `texts` is not an array, or if any `embed()` call throws
    */
-  async embedBatch(textArray = []) {
-    if (!Array.isArray(textArray)) {
+  async embedBatch(texts = []) {
+    if (!Array.isArray(texts)) {
       throw new Error("embedBatch expects an array of strings");
     }
-    const embeddings = [];
-    for (const text of textArray) {
-      const embedding = await this.embed(text);
-      embeddings.push(embedding);
+    const results = [];
+    for (const text of texts) {
+      results.push(await this.embed(text));
     }
-    return embeddings;
+    return results;
   }
   /**
-   * Get embedding dimension
+   * Return embedding dimension
+   *
+   * @returns {number} the value of `this.dim` (384)
    */
   getDimension() {
     return this.dim;
   }
 }
-/**
- * Convenience function for compatibility
- */
-async function createEmbeddingWithRetry(text, options = {}, retries = 2) {
-  const embedder = new LocalEmbedding();
-  for (let attempt = 1; attempt <= retries; attempt++) {
-    try {
-      const embedding = await embedder.embed(text);
-      const isAllZeros = embedding.every(val => val === 0);
-      if (isAllZeros && (text || '').trim()) {
-        if (attempt === retries) {
-          console.warn(`⚠️ Embedding is all zeros for text: "${text.substring(0, 50)}..."`);
-        }
-        throw new Error('Embedding returned all zeros');
-      }
-      return embedding;
-    } catch (err) {
-      if (attempt === retries) {
-        console.error(`❌ All ${retries} attempts failed for embedding text: "${text.substring(0, 50)}..."`);
-        throw err;
-      }
-      console.warn(`⚠️ Embedding attempt ${attempt} failed, retrying...`);
-      await new Promise(resolve => setTimeout(resolve, 100 * attempt));
-    }
-  }
-}
+/* ---------------- SINGLETON EXPORT ---------------- */
+// One embedder per process, so every consumer shares the same loaded model.
+const embedder = new LocalEmbedding();
-module.exports = { LocalEmbedding, createEmbeddingWithRetry };
+module.exports = embedder;
+// Keep the class reachable as a named export so existing
+// `const { LocalEmbedding } = require(...)` consumers keep working.
+module.exports.LocalEmbedding = LocalEmbedding;

package/src/index.js CHANGED Viewed

@@ -1,6 +1,6 @@
 const fs = require("fs");
 const path = require("path");
-const { LocalEmbedding } = require("./embeddings/local.js");
+const embedder = require("./embeddings/local.js"); // ✅ singleton embedder
 const { chunkText } = require("./utils/chunker.js");
 const { extractKeywords } = require("./utils/extractText.js");
 const { highlightMatches } = require("./utils/highlight.js");
@@ -59,8 +59,7 @@ class DatabaseAdapter {
+  /**
+   * Create a MongoClient from `context.MONGO_URI`, store it on `this.mongo`,
+   * and open the connection.
+   *
+   * @param {object} context - read for `MONGO_URI`; the value is not validated here
+   * @returns {Promise<void>} resolves once `connect()` has completed
+   */
   async initMongo(context) {
+    // Required inside the method, so `mongodb` is only loaded when this runs.
     const { MongoClient } = require("mongodb");
-    const uri = context.MONGO_URI;
-    this.mongo = new MongoClient(uri);
+    this.mongo = new MongoClient(context.MONGO_URI);
     await this.mongo.connect();
   }
@@ -113,8 +112,8 @@ class DatabaseAdapter {
 async function loadAllDocuments(context) {
   const docs = [];
   const db = new DatabaseAdapter();
   try {
     await db.initialize(context);
     docs.push(...(await db.queryDocuments(context)));
@@ -125,9 +124,9 @@ async function loadAllDocuments(context) {
     : path.join(process.cwd(), "docs");
   if (fs.existsSync(baseDir)) {
-    const files = fs.readdirSync(baseDir).filter(f =>
-      f.endsWith(".txt") || f.endsWith(".md")
-    );
+    const files = fs
+      .readdirSync(baseDir)
+      .filter(f => f.endsWith(".txt") || f.endsWith(".md"));
     for (const file of files) {
       docs.push({
@@ -145,56 +144,86 @@ async function loadAllDocuments(context) {
 async function performHybridDocQA(query, context) {
   const cache = loadCache();
-  const embedder = new LocalEmbedding({ dimension: 384 });
-  const store = VectorRouter.create({
+  const MIN_SCORE = context.minScore ?? 0.75;
+  const topK = context.topK ?? 5;
+  const vectorStore = VectorRouter.create({
     backend: context.vectorBackend || "memory",
-    dimension: 384,
+    dimension: embedder.getDimension(),
     ...context,
   });
+  console.log(
+    "🧠 Vector store methods:",
+    Object.getOwnPropertyNames(Object.getPrototypeOf(vectorStore))
+  );
   const documents = await loadAllDocuments(context);
-  if (!documents.length) return { text: "", meta: {} };
+  console.log("📄 Documents loaded:", documents.length);
+  if (!documents.length) return { text: "(No documents found)", meta: { matches: 0 } };
+  // Multi-document ingestion
   for (const doc of documents) {
-    if (!cache[doc.id]) {
-      cache[doc.id] = true;
-      const chunks = chunkText(doc.content, 500);
-      for (let i = 0; i < chunks.length; i++) {
-        const vector = await embedder.embed(chunks[i]);
-        await store.upsert({
-          id: `${doc.id}:${i}`,
-          vector,
-          content: chunks[i],
-          source: doc.source,
-        });
+    const chunks = chunkText(doc.content, 500);
+    console.log(`📦 ${doc.id} split into ${chunks.length} chunks`);
+    for (let i = 0; i < chunks.length; i++) {
+      console.log("🧩 Chunk to embed:", chunks[i]?.substring(0, 100));
+      const vector = await embedder.embed(chunks[i]);
+      if (!vector || vector.every(v => v === 0)) {
+        console.warn("⚠️ Zero or invalid embedding, skipping chunk");
+        continue;
       }
+      await vectorStore.upsert({
+        id: `${doc.id}:${i}`,
+        vector,
+        content: chunks[i],
+        source: doc.source,
+      });
+      console.log(`✅ Upserted ${doc.id}:${i}`);
     }
   }
   saveCache(cache);
+  // Embed the query
   const queryVector = await embedder.embed(query);
-  const results = await store.search({
-    embedding: queryVector,
-    topK: 5,
-  });
+  if (!queryVector || queryVector.every(v => v === 0)) {
+    console.warn("⚠️ Query embedding invalid");
+    return { text: "(Query could not be embedded)", meta: { matches: 0 } };
+  }
+  // Top-K + similarity threshold
+  const results = await vectorStore.query(queryVector, { topK });
+  const filtered = results.filter(r => r.score >= MIN_SCORE);
+  console.log(`🔍 Search results: ${filtered.length} (after applying minScore=${MIN_SCORE})`);
+  if (!filtered.length) {
+    return { text: "(No relevant match found)", meta: { matches: 0 } };
+  }
   return {
     text: highlightMatches(
-      results.map(r => r.content).join("\n\n"),
+      filtered.map(r => r.content).join("\n\n"),
       extractKeywords(query)
     ),
-    meta: { matches: results.length },
+    meta: { matches: filtered.length },
   };
 }
 /* ---------------- PGVECTOR SEARCH ---------------- */
 async function performPgVectorSearch(query, context) {
-  const adapter = new PgVectorAdapter({ POSTGRES_URL: context.POSTGRES_URL });
-  const embedder = new LocalEmbedding({ dimension: 384 });
+  const adapter = new PgVectorAdapter({
+    POSTGRES_URL: context.POSTGRES_URL,
+  });
   const vector = await embedder.embed(query);
   const results = await adapter.search(vector, 5);
@@ -220,14 +249,25 @@ async function performDocQA(query, context) {
 async function docSearchResolver(action, context) {
   if (!action.startsWith("Ask doc-search")) return;
+  // Extract the query string
   const match = action.match(/"(.*)"|'(.*)'/);
-  const query = match
-    ? match[1] || match[2]
-    : action.replace("Ask doc-search", "").trim();
+  const query = match ? match[1] || match[2] : action.replace("Ask doc-search", "").trim();
+  // Optional: extract topK and minScore if provided in action, e.g. "Ask doc-search 'Vacation policy' topK=3 minScore=0.8"
+  let topK = 5;
+  let minScore = 0.75;
+  const topKMatch = action.match(/topK\s*=\s*(\d+)/i);
+  if (topKMatch) topK = parseInt(topKMatch[1], 10);
-  return performDocQA(query, context);
+  const minScoreMatch = action.match(/minScore\s*=\s*(0?\.\d+|1(\.0)?)/i);
+  if (minScoreMatch) minScore = parseFloat(minScoreMatch[1]);
+  // Pass these into context for hybrid search
+  const searchContext = { ...context, topK, minScore };
+  return performDocQA(query, searchContext);
 }
 docSearchResolver.resolverName = "doc-search";
 module.exports = docSearchResolver;

package/src/test-doc-search.js ADDED Viewed

@@ -0,0 +1,13 @@
+// Smoke test: run one doc-search query against ./docs with the memory backend.
+const docSearch = require("./index");
+(async () => {
+  try {
+    const result = await docSearch(
+      'Ask doc-search "vacation policy"',
+      {
+        doc_root: "./docs",
+        vectorBackend: "memory"
+      }
+    );
+    console.log(result);
+  } catch (err) {
+    // Without this handler a failure would surface as an unhandled rejection.
+    console.error("❌ doc-search test failed:", err);
+    process.exitCode = 1;
+  }
+})();

package/test-doc-search-batch.js ADDED Viewed

@@ -0,0 +1,36 @@
+// test-doc-search-batch.js
+// Runs several doc-search queries in sequence and prints each result.
+const docSearchResolver = require("./src/index.js");
+(async () => {
+  try {
+    const context = {
+      doc_root: "./docs",        // folder with .txt or .md files
+      vectorBackend: "memory",   // can also switch to "pgvector" if configured
+    };
+    const queries = [
+      "Semantic search",
+      "Vacation policy",
+      "Employee onboarding",
+      "Leave requests",
+      "HR compliance"
+    ];
+    console.log("🔎 Running batch doc-search...");
+    for (const query of queries) {
+      const action = `Ask doc-search "${query}"`;
+      const result = await docSearchResolver(action, context);
+      console.log("\n====================================");
+      console.log(`Query: "${query}"`);
+      console.log("Text:\n", result.text || "(No matches found)");
+      console.log("Meta:", result.meta);
+      console.log("====================================");
+    }
+    console.log("\n✅ Batch search complete!");
+  } catch (err) {
+    console.error("❌ Batch doc-search test failed:", err);
+    // Report the failure through the exit status, not only in the log.
+    process.exitCode = 1;
+  }
+})();

package/test-doc-search.js ADDED Viewed

@@ -0,0 +1,22 @@
+// test-doc-search.js
+// Runs a single doc-search query and prints the text and metadata returned.
+const docSearchResolver = require("./src/index.js");
+(async () => {
+  try {
+    const context = {
+      doc_root: "./docs",        // folder containing .txt or .md files
+      vectorBackend: "memory",   // can also use "pgvector" if configured
+    };
+    const action = 'Ask doc-search "Semantic search"'; // Example query
+    console.log("🔎 Running doc-search...");
+    const result = await docSearchResolver(action, context);
+    console.log("✅ Search Result:");
+    console.log("Text:\n", result.text);
+    console.log("Meta:", result.meta);
+  } catch (err) {
+    console.error("❌ doc-search test failed:", err);
+    // Report the failure through the exit status, not only in the log.
+    process.exitCode = 1;
+  }
+})();

package/test-single-doc.js ADDED Viewed

@@ -0,0 +1,32 @@
+// test-single-doc.js
+// Embeds docs/sample1.txt chunk by chunk into an in-memory vector store,
+// then runs one query against it and prints the scored results.
+const fs = require("fs");
+const path = require("path");
+// local.js exports the shared embedder instance (module.exports = embedder),
+// so it is used directly instead of constructing a LocalEmbedding here.
+// This file sits at the package root, hence the src/ prefix on the paths.
+const embedder = require("./src/embeddings/local.js");
+const { chunkText } = require("./src/utils/chunker.js");
+// NOTE(review): assumes vectorRouter lives under src/adapters, next to the
+// other modules this script uses — confirm the path.
+const VectorRouter = require("./src/adapters/vectorRouter");
+(async () => {
+  try {
+    const docPath = path.join(process.cwd(), "docs", "sample1.txt");
+    const content = fs.readFileSync(docPath, "utf8");
+    const chunks = chunkText(content, 500);
+    console.log(`Document split into ${chunks.length} chunk(s)`);
+    const vectorStore = VectorRouter.create({ backend: "memory", dimension: embedder.getDimension() });
+    for (let i = 0; i < chunks.length; i++) {
+      const vector = await embedder.embed(chunks[i]);
+      console.log(`Chunk ${i} embedding first 5 dims:`, vector.slice(0, 5));
+      await vectorStore.upsert({ id: `sample1:${i}`, vector, content: chunks[i], source: "file:sample1.txt" });
+    }
+    const query = "Semantic search";
+    const queryVector = await embedder.embed(query);
+    const results = await vectorStore.query(queryVector, { topK: 5 });
+    results.forEach((r, idx) => {
+      console.log(`Result ${idx}: score=${r.score.toFixed(3)} content=${r.content.substring(0, 50)}...`);
+    });
+  } catch (err) {
+    console.error("❌ single-doc test failed:", err);
+    // Report the failure through the exit status, not only in the log.
+    process.exitCode = 1;
+  }
+})();