npm - @o-lang/semantic-doc-search - Versions diffs - 1.0.40 → 1.0.42 - Mend

@o-lang/semantic-doc-search 1.0.40 → 1.0.42

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21)

package/README.md +17 -12
package/package.json +37 -38
package/src/embeddings/local.js +12 -2
package/src/embeddings/local.js.bak +153 -0
package/src/index.js +1 -1
package/src/resolver.js +59 -179
package/src/services/docQA.js +56 -19
package/src/utils/formatResults.js +9 -10
package/.cache/embeddings/Xenova/all-MiniLM-L6-v2/config.json +0 -25
package/.cache/embeddings/Xenova/all-MiniLM-L6-v2/onnx/model_quantized.onnx +0 -0
package/.cache/embeddings/Xenova/all-MiniLM-L6-v2/tokenizer.json +0 -30686
package/.cache/embeddings/Xenova/all-MiniLM-L6-v2/tokenizer_config.json +0 -15
package/.env.example +0 -0
package/bin/cli.js +0 -58
package/docs/sample1.txt +0 -1
package/docs/vacation policy +0 -5
package/embeddings.json +0 -3
package/test-doc-search-batch.js +0 -36
package/test-doc-search.js +0 -40
package/test-embed.js +0 -10
package/test-single-doc.js +0 -32

package/README.md CHANGED

@@ -1,24 +1,29 @@
-# @olang/semantic-doc-search
+# @o-lang/semantic-doc-search
-O-lang Semantic Document Search Resolver with hybrid search, embeddings, rerank, and streaming.
+Semantic document retrieval engine for O-Lang workflows.
+This package provides vector-based document search (RAG retrieval layer) that integrates with O-Lang kernel workflows. It handles document ingestion, chunking, embedding, and similarity search, returning LLM-ready context outputs.
 ---
 ## Features
-- Hybrid search (semantic embeddings + lexical scoring)
-- Supports multiple file types: `.txt`, `.md`, `.pdf`, `.html`, `.docx`
-- Streaming token-by-token output via SSE
-- Reranking support (Cohere, Groq, local)
-- Flexible vector adapters: in-memory, Redis, Pinecone
-- Prebuilt prompt templates: summarize, short-answer, bullet-points, cite-sources
-- CLI for quick testing
-- Safe path resolution & chunking for long documents
-- Persistent embeddings cache (`embeddings.json`)
+- Semantic vector search using embeddings
+- Document ingestion from local filesystem (`.txt`, `.md`)
+- Automatic text chunking for large documents
+- Pluggable embedding providers (local, OpenAI, Groq, etc.)
+- Multiple vector database support:
+  - In-memory store
+  - Redis (adapter)
+  - PostgreSQL / pgvector (adapter)
+  - Pinecone (adapter)
+- Embedding cache support (`embeddings.json`)
+- Normalized LLM-ready output format (`text + matches`)
+- Designed for O-Lang `.ol` workflow integration
 ---
 ## Installation
 ```bash
-npm install @olang/semantic-doc-search
+npm install @o-lang/semantic-doc-search

package/package.json CHANGED

@@ -1,38 +1,37 @@
-{
-  "name": "@o-lang/semantic-doc-search",
-  "version": "1.0.40",
-  "description": "O-lang Semantic Document Search Resolver with hybrid search, embeddings, rerank, and streaming.",
-  "main": "src/index.js",
-  "type": "commonjs",
-  "bin": {
-    "olang-doc-search": "bin/cli.js"
-  },
-  "scripts": {
-    "start": "node bin/cli.js",
-    "test": "echo \"No tests yet\""
-  },
-  "dependencies": {
-    "@anthropic-ai/sdk": "*",
-    "@xenova/transformers": "^2.17.2",
-    "axios": "^1.7.2",
-    "docx": "^7.0.0",
-    "dotenv": "^17.2.3",
-    "express": "^4.18.2",
-    "groq-sdk": "^0.5.0",
-    "jsdom": "^22.1.0",
-    "minimist": "^1.2.8",
-    "node-stream-zip": "*",
-    "openai": "^4.3.1",
-    "pdf-parse": "^1.1.1",
-    "pg": "^8.16.3",
-    "pgvector": "^0.2.1",
-    "pinecone-client": "^1.0.0",
-    "readline": "^1.3.0",
-    "redis": "^5.2.0"
-  },
-  "devDependencies": {
-    "eslint": "^8.46.0",
-    "jest": "^29.7.0",
-    "nodemon": "^2.0.22"
-  }
-}
+{
+  "name": "@o-lang/semantic-doc-search",
+  "version": "1.0.42",
+  "description": "O-Lang semantic document search resolver with vector embeddings",
+  "main": "src/index.js",
+  "exports": {
+    ".": "./src/index.js",
+    "./resolver": "./src/resolver.js",
+    "./embeddings/local": "./src/embeddings/local.js"
+  },
+  "files": [
+    "src/",
+    "package.json",
+    "README.md"
+  ],
+  "scripts": {
+    "test": "node src/test-doc-search.js",
+    "start": "node src/index.js"
+  },
+  "keywords": [
+    "o-lang",
+    "resolver",
+    "semantic-search",
+    "rag",
+    "embeddings"
+  ],
+  "author": "O-Lang Team <info@olang.cloud>",
+  "license": "MIT",
+  "dependencies": {
+    "@xenova/transformers": "^2.14.0",
+    "axios": "^1.6.0",
+    "dotenv": "^16.6.1"
+  },
+  "engines": {
+    "node": ">=18.0.0"
+  }
+}

package/src/embeddings/local.js CHANGED

@@ -149,5 +149,15 @@ class LocalEmbedding {
   }
 }
-const embedder = new LocalEmbedding();
-module.exports = embedder;
+// ✅ EXPORT AS FACTORY FUNCTION (what resolver expects)
+// Usage: const embed = await embedder({ dimension: 384 })
+// Returns: async (text) => vector
+const embedderInstance = new LocalEmbedding();
+module.exports = async ({ dimension = 384 } = {}) => {
+  if (dimension && typeof dimension === 'number') {
+    embedderInstance.dim = dimension;
+  }
+  // Return bound embed method that resolver can call: await embed(text)
+  return embedderInstance.embed.bind(embedderInstance);
+};

package/src/embeddings/local.js.bak ADDED

@@ -0,0 +1,153 @@
+/**
+ * LocalEmbedding
+ * ----------------
+ * Real semantic embeddings using all-MiniLM-L6-v2
+ * - Singleton model load
+ * - No silent failures
+ * - No zero vectors
+ * - Deterministic behavior
+ * - DEFENSIVE against method detaching & invalid vectors
+ * - WINDOWS-SAFE (disables SIMD, threads, proxy)
+ * - TENSOR-SAFE (handles Float32Array, Array, and all ONNX tensor types)
+ */
+class LocalEmbedding {
+  constructor() {
+    this.dim = 384;
+    this.model = null;
+    this.loading = null;
+    // 🔒 Bind methods to prevent resolver breakage
+    this.loadModel = this.loadModel.bind(this);
+    this.embed = this.embed.bind(this);
+    this.embedBatch = this.embedBatch.bind(this);
+    this.getDimension = this.getDimension.bind(this);
+  }
+  /* ---------------- INTERNAL ---------------- */
+  async loadModel() {
+    if (this.model) return this.model;
+    if (!this.loading) {
+      this.loading = (async () => {
+        // ⚠️ CRITICAL: Configure environment BEFORE loading model
+        const { env } = await import("@xenova/transformers");
+        // Safe settings for all platforms (harmless on macOS/Linux, essential on Windows)
+        env.backends.onnx.wasm.simd = false;      // Avoids AVX/SIMD crashes on older CPUs
+        env.backends.onnx.wasm.threads = false;   // Prevents threading issues in Node
+        env.backends.onnx.wasm.proxy = false;     // Avoids proxy complications
+        env.allowLocalModels = true;
+        env.backends.onnx.warmup = false;
+        env.cacheDir = "./.cache/embeddings";     // Explicit, project-local cache
+        console.log("🔄 Loading local embedding model (first run only)...");
+        console.log("⚙️  Using WASM (SIMD disabled) for cross-platform compatibility");
+        const { pipeline } = await import("@xenova/transformers");
+        const model = await pipeline(
+          "feature-extraction",
+          "Xenova/all-MiniLM-L6-v2",
+          {
+            revision: "main",
+            cache_dir: "./.cache/embeddings",
+          }
+        );
+        console.log("✅ Local embedding model ready");
+        return model;
+      })();
+    }
+    this.model = await this.loading;
+    return this.model;
+  }
+  /* ---------------- PUBLIC API ---------------- */
+  async embed(text) {
+    if (typeof text !== "string" || !text.trim()) {
+      throw new Error("Embedding input must be a non-empty string");
+    }
+    const model = await this.loadModel();
+    try {
+      const output = await model(text, {
+        pooling: "mean",
+        normalize: true,
+      });
+      // 🔍 DEBUG: Inspect output structure
+      console.log("🔍 Model output type:", typeof output);
+      if (output && typeof output === 'object') {
+        console.log("🔍 Output keys:", Object.keys(output));
+        console.log("🔍 Output dims:", output.dims);
+        console.log("🔍 output.data type:", Object.prototype.toString.call(output.data));
+        console.log("🔍 Is TypedArray?", ArrayBuffer.isView(output.data));
+      }
+      // ✅ UNIVERSAL EXTRACTION: handles Float32Array, Array, and all tensor forms
+      let vector = null;
+      if (output && output.data !== undefined) {
+        // Handle Float32Array, Uint8Array, etc. (standard in ONNX/WASM)
+        if (ArrayBuffer.isView(output.data)) {
+          vector = Array.from(output.data);
+        }
+        // Handle plain JS array (older backends or CPU mode)
+        else if (Array.isArray(output.data)) {
+          vector = Array.from(output.data);
+        }
+      }
+      // Handle batch output: [tensor]
+      else if (Array.isArray(output) && output[0]?.data !== undefined) {
+        if (ArrayBuffer.isView(output[0].data)) {
+          vector = Array.from(output[0].data);
+        } else if (Array.isArray(output[0].data)) {
+          vector = Array.from(output[0].data);
+        }
+      }
+      // Fallback: raw array (rare)
+      else if (Array.isArray(output)) {
+        vector = output;
+      }
+      // Final validation
+      if (!Array.isArray(vector) || vector.length !== this.dim) {
+        console.error("❌ Invalid embedding vector length:", vector?.length);
+        console.error("❌ First few values:", vector?.slice?.(0, 5));
+        throw new Error(`Invalid embedding dimension: ${vector?.length || 0} (expected ${this.dim})`);
+      }
+      return vector;
+    } catch (err) {
+      console.error(
+        `❌ Embedding failed for text: "${text.slice(0, 60)}..."`,
+        err.message
+      );
+      throw err;
+    }
+  }
+  async embedBatch(texts = []) {
+    if (!Array.isArray(texts)) {
+      throw new Error("embedBatch expects an array of strings");
+    }
+    const results = [];
+    for (const text of texts) {
+      results.push(await this.embed(text));
+    }
+    return results;
+  }
+  getDimension() {
+    return this.dim;
+  }
+}
+const embedder = new LocalEmbedding();
+module.exports = embedder;

package/src/index.js CHANGED

@@ -1,4 +1,3 @@
-// index.js (6 lines)
 const semanticResolver = require("./resolver");
 async function docSearchResolver(action, context) {
@@ -6,4 +5,5 @@ async function docSearchResolver(action, context) {
 }
 docSearchResolver.resolverName = "doc-search";
+docSearchResolver.version = "1.0.41";
 module.exports = docSearchResolver;

package/src/resolver.js CHANGED

@@ -1,5 +1,5 @@
 const VectorRouter = require("./adapters/vectorRouter");
-const embedder = require("./embeddings/local"); // singleton embedder
+const embedder = require("./embeddings/local");
 const { extractQuery } = require("./utils/extractQuery");
 const { formatResults } = require("./utils/formatResults");
 const fs = require("fs");
@@ -8,7 +8,9 @@ const crypto = require("crypto");
 const CACHE_PATH = path.join(process.cwd(), "embeddings.json");
-// Load cache for ingestion guard
+// ─────────────────────────────────────────────
+// Helpers (UNCHANGED)
+// ─────────────────────────────────────────────
 function loadCache() {
   try {
     if (fs.existsSync(CACHE_PATH)) {
@@ -24,226 +26,104 @@ function saveCache(cache) {
   } catch {}
 }
-/**
- * Clean text for embedding (defensive)
- */
 function sanitizeTextForEmbedding(text) {
   if (typeof text !== "string") return "";
-  // Remove wrapping quotes and extra whitespace
   return text.replace(/^["']|["']$/g, "").trim();
 }
-/**
- * Load documents from doc_root if provided
- */
-function loadDocumentsFromContext(context) {
-  if (context.documents && Array.isArray(context.documents)) {
-    return context.documents;
-  }
-  if (context.doc_root) {
-    const baseDir = path.resolve(process.cwd(), context.doc_root);
-    if (fs.existsSync(baseDir)) {
-      const files = fs.readdirSync(baseDir).filter(f => f.endsWith('.txt') || f.endsWith('.md'));
-      const docs = files.map(file => {
-        try {
-          const content = fs.readFileSync(path.join(baseDir, file), 'utf8');
-          return { id: file, content, source: `file:${file}` };
-        } catch (err) {
-          console.warn(`⚠️ Failed to read ${file}:`, err.message);
-          return null;
-        }
-      }).filter(Boolean);
-      console.log(`📄 Loaded ${docs.length} document(s) from ${baseDir}`);
-      docs.forEach(d => console.log(`  - ${d.id} (${d.content?.length || 0} chars)`));
-      return docs;
-    }
-  }
-  return [];
-}
-/**
- * Convert any array-like (Float32Array, etc.) to plain JS array
- */
-function toPlainArray(input) {
-  if (!input) return null;
-  if (Array.isArray(input)) return input;
-  if (ArrayBuffer.isView(input)) return Array.from(input);
-  return null;
-}
-/**
- * Hash text for caching
- */
 function hashText(str) {
   return crypto.createHash("sha256").update(str).digest("hex");
 }
-/**
- * Semantic Doc Search Resolver
- */
+// ─────────────────────────────────────────────
+// 🔥 MAIN RESOLVER
+// ─────────────────────────────────────────────
 async function resolver(action, context = {}) {
   if (typeof action !== "string") return;
-  if (!action.toLowerCase().startsWith("ask doc-search")) return;
-  let query = extractQuery(action);
-  query = sanitizeTextForEmbedding(query);
-  if (!query) return { text: "(Empty query)", meta: { matches: 0 } };
-  // Vector backend
   const vectorStore = VectorRouter.create(context);
+  const embed = await embedder({ dimension: 384 });
-  // Ensure backend supports search
-  if (!vectorStore.supports("vector.search")) {
-    throw new Error("Vector backend does not support vector.search");
-  }
-  // Load documents (from context.documents OR doc_root)
-  const documents = loadDocumentsFromContext(context);
-  console.log("🔄 Starting ingestion for", documents.length, "documents");
-  // ✅ ONLY USE CACHE FOR PERSISTENT BACKENDS
+  const doc_root = context.doc_root || "./docs";
   const useCache = !!context.POSTGRES_URL || !!context.REDIS_URL;
   const cache = useCache ? loadCache() : {};
-  // --- Document ingestion ---
-  if (documents.length > 0) {
-    if (!vectorStore.supports("vector.insert")) {
-      throw new Error("Vector backend does not support vector.insert");
-    }
+  // =====================================================
+  // ✅ 1. VECTOR INSERT (INGEST)
+  // =====================================================
+  if (action.includes("vector.insert")) {
+    let inserted = 0;
-    for (const doc of documents) {
-      console.log("📄 Processing doc:", doc.id, "content length:", doc.content?.length);
-      if (!doc?.content) {
-        console.warn("⚠️ Skipping empty doc:", doc?.id);
-        continue;
-      }
+    if (fs.existsSync(doc_root)) {
+      const files = fs.readdirSync(doc_root);
-      // ✅ CORRECT PATH: Adjust if chunker.js is in src/utils/
-      let chunks;
-      try {
-        const chunkText = require("./utils/chunker.js").chunkText;
-        chunks = chunkText(doc.content, 500) || [doc.content];
-        console.log("📦", doc.id, "split into", chunks.length, "chunks");
-      } catch (err) {
-        console.warn("⚠️ Chunking failed, using full doc:", err.message);
-        chunks = [doc.content];
-      }
+      for (const file of files) {
+        const fullPath = path.join(doc_root, file);
+        if (!fs.statSync(fullPath).isFile()) continue;
-      for (let i = 0; i < chunks.length; i++) {
-        const text = sanitizeTextForEmbedding(chunks[i]);
-        console.log("🧩 Chunk", `${doc.id}:${i}`, "text:", JSON.stringify(text));
-        if (!text) {
-          console.warn(`⚠️ Skipping empty chunk ${doc.id}:${i}`);
-          continue;
-        }
+        const content = fs.readFileSync(fullPath, "utf8");
+        if (!content) continue;
-        const hash = hashText(text);
-        // ✅ ONLY CHECK CACHE FOR PERSISTENT BACKENDS
-        if (useCache && cache[hash]) {
-          console.log(`⏭️  Skipping already ingested chunk ${doc.id}:${i}`);
-          continue;
-        }
+        const chunkText = require("./utils/chunker").chunkText;
+        const chunks = chunkText(content, 500, 50);
-        // 🔒 DEFENSIVE EMBEDDING
-        let rawVector;
-        try {
-          rawVector = await embedder.embed(text);
-        } catch (err) {
-          console.warn(`⚠️ Embedding failed for chunk ${doc.id}:${i} ("${text.slice(0, 30)}..."):`, err.message);
-          continue;
-        }
+        for (let i = 0; i < chunks.length; i++) {
+          const text = sanitizeTextForEmbedding(chunks[i]);
+          if (!text) continue;
-        // Validate vector type
-        if (!rawVector || (!Array.isArray(rawVector) && !ArrayBuffer.isView(rawVector))) {
-          console.warn(`⚠️ Invalid vector type for chunk ${doc.id}:${i}:`, typeof rawVector);
-          continue;
-        }
+          const hash = hashText(text);
+          if (useCache && cache[hash]) continue;
-        // Check for zero vectors
-        const isZero = rawVector.every?.(v => v === 0) || false;
-        if (isZero) {
-          console.warn(`⚠️ Zero vector for chunk ${doc.id}:${i}`);
-          continue;
-        }
-        // Normalize to plain array for storage
-        const vector = toPlainArray(rawVector);
-        if (!vector) {
-          console.warn(`⚠️ Failed to normalize vector for chunk ${doc.id}:${i}`);
-          continue;
-        }
+          const rawVector = await embed(text);
+          const vector = Array.from(rawVector);
-        try {
           await vectorStore.upsert({
-            id: `${doc.id}:${i}`,
+            id: `${file}:${i}`,
             vector,
             content: text,
-            source: doc.source,
+            source: `file:${file}`,
           });
-          // ✅ ONLY UPDATE CACHE FOR PERSISTENT BACKENDS
-          if (useCache) {
-            cache[hash] = true;
-          }
-          console.log(`✅ Upserted ${doc.id}:${i}`);
-        } catch (err) {
-          console.warn(`⚠️ Upsert failed for ${doc.id}:${i}:`, err.message);
-          continue;
+          if (useCache) cache[hash] = true;
+          inserted++;
         }
       }
     }
-    // ✅ ONLY SAVE CACHE FOR PERSISTENT BACKENDS
-    if (useCache) {
-      saveCache(cache);
-    }
-  }
-  // --- QUERY EMBEDDING ---
-  let rawQueryVector;
-  try {
-    rawQueryVector = await embedder.embed(query);
-  } catch (err) {
-    console.error(`❌ Query embedding failed: "${query}"`, err.message);
-    return { text: "(Query embedding failed)", meta: { matches: 0 } };
-  }
+    if (useCache) saveCache(cache);
+    if (vectorStore.close) await vectorStore.close();
-  // ✅ FULLY COMPLETE VALIDATION LINE
-  if (!rawQueryVector || (!Array.isArray(rawQueryVector) && !ArrayBuffer.isView(rawQueryVector))) {
-    console.error("❌ Invalid query vector type:", typeof rawQueryVector);
-    return { text: "(Invalid query vector)", meta: { matches: 0 } };
+    return { inserted, doc_root };
   }
-  const isZeroQuery = rawQueryVector.every?.(v => v === 0) || false;
-  if (isZeroQuery) {
-    console.warn("⚠️ Zero vector for query");
-    return { text: "(Query produced zero vector)", meta: { matches: 0 } };
-  }
+  // =====================================================
+  // ✅ 2. VECTOR SEARCH
+  // =====================================================
+  if (action.includes("vector.search")) {
+    const query = sanitizeTextForEmbedding(extractQuery(action));
+    if (!query) return { text: "", matches: [] };
-  const queryVector = toPlainArray(rawQueryVector);
-  if (!queryVector) {
-    return { text: "(Failed to normalize query vector)", meta: { matches: 0 } };
-  }
+    const rawQueryVector = await embed(query);
+    const queryVector = Array.from(rawQueryVector);
-  // --- SEARCH ---
-  try {
-    console.log("🔍 Executing vector search...");
     const results = await vectorStore.query(queryVector, {
       topK: context.topK || 5,
-      minScore: context.minScore || 0,
     });
-    console.log("📊 Raw search results:", results.length);
-    results.forEach((r, i) => console.log(`  ${i}: score=${r.score?.toFixed(4)}, content="${r.content?.substring(0, 50)}..."`));
+    if (vectorStore.close) await vectorStore.close();
     return formatResults(results, query);
-  } catch (err) {
-    console.error("❌ Vector search failed:", err.message);
-    return { text: "(Search failed)", meta: { matches: 0 } };
   }
+  // =====================================================
+  // ❌ REMOVE THIS (legacy)
+  // =====================================================
+  // if (action.startsWith("Ask doc-search")) { ... }
+  return;
 }
+resolver.resolverName = "vector";
+resolver.version = "1.0.0";
 module.exports = resolver;