@o-lang/semantic-doc-search 1.0.22 → 1.0.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/embeddings/local.js CHANGED
@@ -1,123 +1,114 @@
-// src/embeddings/local.js
-
 /**
- * LocalEmbedding - REAL semantic embeddings using all-MiniLM-L6-v2
- * Uses dynamic import to work with ESM packages in CommonJS environment
+ * LocalEmbedding
+ * ----------------
+ * Real semantic embeddings using all-MiniLM-L6-v2
+ * - Singleton model load
+ * - No silent failures
+ * - No zero vectors
+ * - Deterministic behavior
  */
+
 class LocalEmbedding {
   constructor() {
     this.dim = 384;
-    this.modelPromise = null;
-    this.transformersPromise = null;
+    this.model = null;
+    this.loading = null;
   }
 
-  /**
-   * Lazy-load the @xenova/transformers package
-   */
-  async getTransformers() {
-    if (!this.transformersPromise) {
-      this.transformersPromise = import('@xenova/transformers');
-    }
-    return this.transformersPromise;
-  }
+  /* ---------------- INTERNAL ---------------- */
 
-  /**
-   * Lazy-load the embedding model
-   */
-  async getModel() {
-    if (!this.modelPromise) {
-      const { pipeline, env } = await this.getTransformers();
-
-      // Configure transformers
-      env.allowLocalModels = true;
-      env.backends.onnx.warmup = false;
-
-      console.log('šŸ”„ Loading local embedding model (first run may take 1-2 minutes)...');
-
-      this.modelPromise = pipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2', {
-        revision: 'main',
-        cache_dir: './.cache/embeddings'
-      }).then(model => {
-        console.log('āœ… Local embedding model loaded successfully!');
+  async loadModel() {
+    if (this.model) return this.model;
+
+    if (!this.loading) {
+      this.loading = (async () => {
+        const { pipeline, env } = await import("@xenova/transformers");
+
+        // Safe defaults
+        env.allowLocalModels = true;
+        env.backends.onnx.warmup = false;
+
+        console.log("šŸ”„ Loading local embedding model (first run only)...");
+
+        const model = await pipeline(
+          "feature-extraction",
+          "Xenova/all-MiniLM-L6-v2",
+          {
+            revision: "main",
+            cache_dir: "./.cache/embeddings",
+          }
+        );
+
+        console.log("āœ… Local embedding model ready");
         return model;
-      }).catch(error => {
-        console.error('āŒ Failed to load local embedding model:', error.message);
-        throw error;
-      });
+      })();
     }
-    return this.modelPromise;
+
+    this.model = await this.loading;
+    return this.model;
   }
 
+  /* ---------------- PUBLIC API ---------------- */
+
   /**
-   * Generate REAL semantic embedding for text
+   * Generate embedding for a single string
    */
   async embed(text) {
-    if (!text || !text.trim()) {
-      return new Array(this.dim).fill(0);
+    if (typeof text !== "string" || !text.trim()) {
+      throw new Error("Embedding input must be a non-empty string");
     }
 
+    const model = await this.loadModel();
+
     try {
-      const model = await this.getModel();
-      const output = await model(text, {
-        pooling: 'mean',
-        normalize: true
+      const output = await model(text, {
+        pooling: "mean",
+        normalize: true,
       });
-      return Array.from(output.data);
-    } catch (error) {
-      console.error(`āŒ Embedding failed for: "${text.substring(0, 50)}..."`);
-      return new Array(this.dim).fill(0);
+
+      const vector = Array.from(output.data);
+
+      if (vector.length !== this.dim) {
+        throw new Error(
+          `Invalid embedding dimension: ${vector.length} (expected ${this.dim})`
+        );
+      }
+
+      return vector;
+    } catch (err) {
+      console.error(
+        `āŒ Embedding failed for text: "${text.slice(0, 60)}..."`
+      );
+      throw err;
     }
   }
 
   /**
-   * Batch embedding for multiple strings
+   * Batch embedding (sequential, safe)
    */
-  async embedBatch(textArray = []) {
-    if (!Array.isArray(textArray)) {
+  async embedBatch(texts = []) {
+    if (!Array.isArray(texts)) {
       throw new Error("embedBatch expects an array of strings");
     }
-    const embeddings = [];
-    for (const text of textArray) {
-      const embedding = await this.embed(text);
-      embeddings.push(embedding);
+
+    const results = [];
+    for (const text of texts) {
+      results.push(await this.embed(text));
     }
-    return embeddings;
+    return results;
   }
 
   /**
-   * Get embedding dimension
+   * Return embedding dimension
    */
   getDimension() {
     return this.dim;
   }
 }
 
-/**
- * Convenience function for compatibility
- */
-async function createEmbeddingWithRetry(text, options = {}, retries = 2) {
-  const embedder = new LocalEmbedding();
-
-  for (let attempt = 1; attempt <= retries; attempt++) {
-    try {
-      const embedding = await embedder.embed(text);
-      const isAllZeros = embedding.every(val => val === 0);
-      if (isAllZeros && (text || '').trim()) {
-        if (attempt === retries) {
-          console.warn(`āš ļø Embedding is all zeros for text: "${text.substring(0, 50)}..."`);
-        }
-        throw new Error('Embedding returned all zeros');
-      }
-      return embedding;
-    } catch (err) {
-      if (attempt === retries) {
-        console.error(`āŒ All ${retries} attempts failed for embedding text: "${text.substring(0, 50)}..."`);
-        throw err;
-      }
-      console.warn(`āš ļø Embedding attempt ${attempt} failed, retrying...`);
-      await new Promise(resolve => setTimeout(resolve, 100 * attempt));
-    }
-  }
-}
+/* ---------------- SINGLETON EXPORT ---------------- */
+
+// One embedder per process (CRITICAL)
+const embedder = new LocalEmbedding();
 
-module.exports = { LocalEmbedding, createEmbeddingWithRetry };
+module.exports = embedder;
package/src/index.js CHANGED
@@ -1,16 +1,16 @@
 const fs = require("fs");
 const path = require("path");
-const { createLLM } = require("./llm/router.js");
-const { LocalEmbedding } = require("./embeddings/local.js");
+const embedder = require("./embeddings/local.js"); // āœ… singleton embedder
 const { chunkText } = require("./utils/chunker.js");
 const { extractKeywords } = require("./utils/extractText.js");
-const { cosine } = require("./utils/similarity.js");
 const { highlightMatches } = require("./utils/highlight.js");
-const PgVectorAdapter = require("./adapters/pgvectorAdapter.js");
 const VectorRouter = require("./adapters/vectorRouter");
+const PgVectorAdapter = require("./adapters/pgvectorAdapter.js");
 
 const CACHE_PATH = path.join(process.cwd(), "embeddings.json");
 
+/* ---------------- UTIL ---------------- */
+
 function safeResolve(base, userPath) {
   const resolved = path.resolve(base, userPath);
   if (!resolved.startsWith(path.resolve(base))) {
@@ -44,13 +44,9 @@ class DatabaseAdapter {
   async initialize(context) {
     if (this.initialized) return;
 
-    if (context.db_type === "mongodb" || context.MONGO_URI) {
-      await this.initMongo(context);
-    } else if (context.db_type === "sqlite" || context.db_path) {
-      await this.initSQLite(context);
-    } else if (context.db_type === "postgres" || context.POSTGRES_URL) {
-      await this.initPostgres(context);
-    }
+    if (context.MONGO_URI) await this.initMongo(context);
+    else if (context.db_path) await this.initSQLite(context);
+    else if (context.POSTGRES_URL) await this.initPostgres(context);
 
     this.initialized = true;
   }
@@ -58,74 +54,50 @@ class DatabaseAdapter {
   async initSQLite(context) {
     const Database = require("better-sqlite3");
     const dbPath = context.db_path || "./database.db";
-    const dbDir = path.dirname(path.resolve(dbPath));
-    if (!fs.existsSync(dbDir)) {
-      throw new Error(`SQLite database directory not found: ${dbDir}`);
-    }
-    this.sqliteClient = new Database(dbPath, { readonly: true });
-  }
-
-  async querySQLite(query, params = []) {
-    const stmt = this.sqliteClient.prepare(query);
-    return stmt.all(...params);
+    this.sqlite = new Database(dbPath, { readonly: true });
   }
 
   async initMongo(context) {
     const { MongoClient } = require("mongodb");
-    const uri =
-      context.MONGO_URI ||
-      `mongodb://localhost:27017/${context.db_name || "olang"}`;
-    this.mongoClient = new MongoClient(uri);
-    await this.mongoClient.connect();
-  }
-
-  async queryMongo(collectionName, filter = {}, projection = {}) {
-    const db = this.mongoClient.db(process.env.DB_NAME || "olang");
-    return db.collection(collectionName).find(filter, { projection }).toArray();
+    this.mongo = new MongoClient(context.MONGO_URI);
+    await this.mongo.connect();
   }
 
   async initPostgres(context) {
     const { Pool } = require("pg");
-    this.postgresClient = new Pool({
-      connectionString: context.POSTGRES_URL,
-    });
-  }
-
-  async queryPostgres(query, params = []) {
-    const result = await this.postgresClient.query(query, params);
-    return result.rows;
+    this.pg = new Pool({ connectionString: context.POSTGRES_URL });
   }
 
   async queryDocuments(context) {
     const table = context.db_table || "documents";
-    const contentCol = context.db_content_column || "content";
     const idCol = context.db_id_column || "id";
+    const contentCol = context.db_content_column || "content";
 
-    if (context.MONGO_URI) {
-      const rows = await this.queryMongo(table);
-      return rows.map((r) => ({
-        id: r._id?.toString(),
+    if (this.mongo) {
+      const rows = await this.mongo.db().collection(table).find({}).toArray();
+      return rows.map(r => ({
+        id: r._id.toString(),
         content: r[contentCol] || "",
         source: `mongodb:${table}`,
       }));
     }
 
-    if (context.db_path) {
-      const rows = await this.querySQLite(
-        `SELECT ${idCol}, ${contentCol} FROM ${table}`
-      );
-      return rows.map((r) => ({
+    if (this.sqlite) {
+      const rows = this.sqlite
+        .prepare(`SELECT ${idCol}, ${contentCol} FROM ${table}`)
+        .all();
+      return rows.map(r => ({
        id: r[idCol],
        content: r[contentCol],
        source: `sqlite:${table}`,
      }));
    }
 
-    if (context.POSTGRES_URL) {
-      const rows = await this.queryPostgres(
+    if (this.pg) {
+      const res = await this.pg.query(
        `SELECT ${idCol}, ${contentCol} FROM ${table}`
      );
-      return rows.map((r) => ({
+      return res.rows.map(r => ({
        id: r[idCol],
        content: r[contentCol],
        source: `postgres:${table}`,
@@ -154,13 +126,13 @@ async function loadAllDocuments(context) {
   if (fs.existsSync(baseDir)) {
     const files = fs
       .readdirSync(baseDir)
-      .filter((f) => f.endsWith(".txt") || f.endsWith(".md"));
+      .filter(f => f.endsWith(".txt") || f.endsWith(".md"));
 
-    for (const f of files) {
+    for (const file of files) {
       docs.push({
-        id: f,
-        content: fs.readFileSync(path.join(baseDir, f), "utf8"),
-        source: `file:${f}`,
+        id: file,
+        content: fs.readFileSync(path.join(baseDir, file), "utf8"),
+        source: `file:${file}`,
       });
     }
   }
@@ -170,74 +142,102 @@ async function loadAllDocuments(context) {
 
 /* ---------------- HYBRID VECTOR SEARCH ---------------- */
 
-async function performHybridDocQA(query, context = {}) {
+async function performHybridDocQA(query, context) {
   const cache = loadCache();
-  const embedder = new LocalEmbedding({ dimension: 384 });
+
+  const MIN_SCORE = context.minScore ?? 0.75;
+  const topK = context.topK ?? 5;
 
   const vectorStore = VectorRouter.create({
     backend: context.vectorBackend || "memory",
-    dimension: 384,
+    dimension: embedder.getDimension(),
     ...context,
   });
 
+  console.log(
+    "🧠 Vector store methods:",
+    Object.getOwnPropertyNames(Object.getPrototypeOf(vectorStore))
+  );
+
   const documents = await loadAllDocuments(context);
-  if (!documents.length) {
-    return { text: "", meta: {} };
-  }
+  console.log("šŸ“„ Documents loaded:", documents.length);
+
+  if (!documents.length) return { text: "(No documents found)", meta: { matches: 0 } };
 
+  // Multi-document ingestion
   for (const doc of documents) {
-    if (!cache[doc.id]) {
-      cache[doc.id] = true;
-      const chunks = chunkText(doc.content, 500);
-
-      for (let i = 0; i < chunks.length; i++) {
-        const vector = await embedder.embed(chunks[i]);
-        await vectorStore.upsert({
-          id: `${doc.id}:${i}`,
-          vector,
-          content: chunks[i],
-          source: doc.source,
-        });
+    const chunks = chunkText(doc.content, 500);
+    console.log(`šŸ“¦ ${doc.id} split into ${chunks.length} chunks`);
+
+    for (let i = 0; i < chunks.length; i++) {
+      console.log("🧩 Chunk to embed:", chunks[i]?.substring(0, 100));
+
+      const vector = await embedder.embed(chunks[i]);
+      if (!vector || vector.every(v => v === 0)) {
+        console.warn("āš ļø Zero or invalid embedding, skipping chunk");
+        continue;
      }
+
+      await vectorStore.upsert({
+        id: `${doc.id}:${i}`,
+        vector,
+        content: chunks[i],
+        source: doc.source,
+      });
+
+      console.log(`āœ… Upserted ${doc.id}:${i}`);
    }
  }
 
  saveCache(cache);
 
+  // Embed the query
  const queryVector = await embedder.embed(query);
-  const results = await vectorStore.query(queryVector, 5);
+  if (!queryVector || queryVector.every(v => v === 0)) {
+    console.warn("āš ļø Query embedding invalid");
+    return { text: "(Query could not be embedded)", meta: { matches: 0 } };
+  }
+
+  // Top-K + similarity threshold
+  const results = await vectorStore.query(queryVector, { topK });
+  const filtered = results.filter(r => r.score >= MIN_SCORE);
+
+  console.log(`šŸ” Search results: ${filtered.length} (after applying minScore=${MIN_SCORE})`);
+
+  if (!filtered.length) {
+    return { text: "(No relevant match found)", meta: { matches: 0 } };
+  }
 
  return {
    text: highlightMatches(
-      results.map((r) => r.content).join("\n\n"),
+      filtered.map(r => r.content).join("\n\n"),
      extractKeywords(query)
    ),
-    meta: { matches: results.length },
+    meta: { matches: filtered.length },
  };
 }
 
+
 /* ---------------- PGVECTOR SEARCH ---------------- */
 
-async function performPgVectorSearch(query, context = {}) {
+async function performPgVectorSearch(query, context) {
   const adapter = new PgVectorAdapter({
     POSTGRES_URL: context.POSTGRES_URL,
   });
 
-  const embedder = new LocalEmbedding({ dimension: 384 });
   const vector = await embedder.embed(query);
-  const results = await adapter.query(vector, 5);
-
+  const results = await adapter.search(vector, 5);
   await adapter.close();
 
   return {
-    text: results.map((r) => r.content).join("\n\n"),
+    text: results.map(r => r.content).join("\n\n"),
     meta: { matches: results.length },
   };
 }
 
 /* ---------------- ROUTER ---------------- */
 
-async function performDocQA(query, context = {}) {
+async function performDocQA(query, context) {
   if (context.POSTGRES_URL) {
     return performPgVectorSearch(query, context);
   }
@@ -247,14 +247,26 @@ async function performDocQA(query, context = {})
 /* ---------------- O-LANG RESOLVER ---------------- */
 
 async function docSearchResolver(action, context) {
-  if (action.startsWith("Ask doc-search")) {
-    const match = action.match(/"(.*)"|'(.*)'/);
-    const query = match
-      ? match[1] || match[2]
-      : action.replace("Ask doc-search", "").trim();
+  if (!action.startsWith("Ask doc-search")) return;
 
-    return performDocQA(query, context);
-  }
+  // Extract the query string
+  const match = action.match(/"(.*)"|'(.*)'/);
+  const query = match ? match[1] || match[2] : action.replace("Ask doc-search", "").trim();
+
+  // Optional: extract topK and minScore if provided in action, e.g. "Ask doc-search 'Vacation policy' topK=3 minScore=0.8"
+  let topK = 5;
+  let minScore = 0.75;
+
+  const topKMatch = action.match(/topK\s*=\s*(\d+)/i);
+  if (topKMatch) topK = parseInt(topKMatch[1], 10);
+
+  const minScoreMatch = action.match(/minScore\s*=\s*(0?\.\d+|1(\.0)?)/i);
+  if (minScoreMatch) minScore = parseFloat(minScoreMatch[1]);
+
+  // Pass these into context for hybrid search
+  const searchContext = { ...context, topK, minScore };
+
+  return performDocQA(query, searchContext);
 }
 
 docSearchResolver.resolverName = "doc-search";
@@ -0,0 +1,13 @@
+const docSearch = require("./index");
+
+(async () => {
+  const result = await docSearch(
+    'Ask doc-search "vacation policy"',
+    {
+      doc_root: "./docs",
+      vectorBackend: "memory"
+    }
+  );
+
+  console.log(result);
+})();
@@ -0,0 +1,36 @@
+// test-doc-search-batch.js
+const docSearchResolver = require("./src/index.js");
+
+(async () => {
+  try {
+    const context = {
+      doc_root: "./docs", // folder with .txt or .md files
+      vectorBackend: "memory", // can also switch to "pgvector" if configured
+    };
+
+    const queries = [
+      "Semantic search",
+      "Vacation policy",
+      "Employee onboarding",
+      "Leave requests",
+      "HR compliance"
+    ];
+
+    console.log("šŸ”Ž Running batch doc-search...");
+
+    for (const query of queries) {
+      const action = `Ask doc-search "${query}"`;
+      const result = await docSearchResolver(action, context);
+
+      console.log("\n====================================");
+      console.log(`Query: "${query}"`);
+      console.log("Text:\n", result.text || "(No matches found)");
+      console.log("Meta:", result.meta);
+      console.log("====================================");
+    }
+
+    console.log("\nāœ… Batch search complete!");
+  } catch (err) {
+    console.error("āŒ Batch doc-search test failed:", err);
+  }
+})();
@@ -0,0 +1,22 @@
+// test-doc-search.js
+const docSearchResolver = require("./src/index.js");
+
+(async () => {
+  try {
+    const context = {
+      doc_root: "./docs", // folder containing .txt or .md files
+      vectorBackend: "memory", // can also use "pgvector" if configured
+    };
+
+    const action = 'Ask doc-search "Semantic search"'; // Example query
+    console.log("šŸ”Ž Running doc-search...");
+
+    const result = await docSearchResolver(action, context);
+
+    console.log("āœ… Search Result:");
+    console.log("Text:\n", result.text);
+    console.log("Meta:", result.meta);
+  } catch (err) {
+    console.error("āŒ doc-search test failed:", err);
+  }
+})();
@@ -0,0 +1,32 @@
+// test-single-doc.js
+const path = require("path");
+const { LocalEmbedding } = require("./embeddings/local.js");
+const { chunkText } = require("./utils/chunker.js");
+const VectorRouter = require("./adapters/vectorRouter");
+
+(async () => {
+  const embedder = new LocalEmbedding();
+  const docPath = path.join(process.cwd(), "docs", "sample1.txt");
+  const fs = require("fs");
+  const content = fs.readFileSync(docPath, "utf8");
+
+  const chunks = chunkText(content, 500);
+  console.log(`Document split into ${chunks.length} chunk(s)`);
+
+  const vectorStore = VectorRouter.create({ backend: "memory", dimension: embedder.getDimension() });
+
+  for (let i = 0; i < chunks.length; i++) {
+    const vector = await embedder.embed(chunks[i]);
+    console.log(`Chunk ${i} embedding first 5 dims:`, vector.slice(0, 5));
+
+    await vectorStore.upsert({ id: `sample1:${i}`, vector, content: chunks[i], source: "file:sample1.txt" });
+  }
+
+  const query = "Semantic search";
+  const queryVector = await embedder.embed(query);
+
+  const results = await vectorStore.query(queryVector, { topK: 5 });
+  results.forEach((r, idx) => {
+    console.log(`Result ${idx}: score=${r.score.toFixed(3)} content=${r.content.substring(0, 50)}...`);
+  });
+})();
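
The export shape of src/embeddings/local.js changed in this release from named exports to a singleton instance, and embed() now throws on invalid input instead of returning a zero vector. A minimal consumer-side sketch of the difference, assuming the module remains reachable via a deep require (the specifier below is an assumption, not a documented entry point):

// Consumer-side sketch (assumed deep-require path; not a documented entry point).

// 1.0.22: named exports
// const { LocalEmbedding } = require("@o-lang/semantic-doc-search/src/embeddings/local.js");
// const embedder = new LocalEmbedding();

// 1.0.24: the module export is the shared instance itself
const embedder = require("@o-lang/semantic-doc-search/src/embeddings/local.js");

(async () => {
  const vector = await embedder.embed("vacation policy");
  console.log(embedder.getDimension(), vector.length); // 384 384

  // embed() now rejects empty input instead of returning a zero vector
  await embedder.embed("   ").catch(err => console.error(err.message));
})();

Code that previously relied on createEmbeddingWithRetry must handle the thrown error itself; that helper was removed in 1.0.24.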