npm - @o-lang/semantic-doc-search - Versions diffs - 1.0.39 → 1.0.41 - Mend

@o-lang/semantic-doc-search 1.0.39 → 1.0.41

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5)

package/README.md +17 -12
package/package.json +1 -1
package/src/resolver.js +61 -173
package/src/services/docQA.js +56 -19
package/src/utils/formatResults.js +9 -10

package/README.md CHANGED Viewed

@@ -1,24 +1,29 @@
-# @olang/semantic-doc-search
+# @o-lang/semantic-doc-search
-O-lang Semantic Document Search Resolver with hybrid search, embeddings, rerank, and streaming.
+Semantic document retrieval engine for O-Lang workflows.
+This package provides vector-based document search (RAG retrieval layer) that integrates with O-Lang kernel workflows. It handles document ingestion, chunking, embedding, and similarity search, returning LLM-ready context outputs.
 ---
 ## Features
-- Hybrid search (semantic embeddings + lexical scoring)
-- Supports multiple file types: `.txt`, `.md`, `.pdf`, `.html`, `.docx`
-- Streaming token-by-token output via SSE
-- Reranking support (Cohere, Groq, local)
-- Flexible vector adapters: in-memory, Redis, Pinecone
-- Prebuilt prompt templates: summarize, short-answer, bullet-points, cite-sources
-- CLI for quick testing
-- Safe path resolution & chunking for long documents
-- Persistent embeddings cache (`embeddings.json`)
+- Semantic vector search using embeddings
+- Document ingestion from local filesystem (`.txt`, `.md`)
+- Automatic text chunking for large documents
+- Pluggable embedding providers (local, OpenAI, Groq, etc.)
+- Multiple vector database support:
+  - In-memory store
+  - Redis (adapter)
+  - PostgreSQL / pgvector (adapter)
+  - Pinecone (adapter)
+- Embedding cache support (`embeddings.json`)
+- Normalized LLM-ready output format (`text + matches`)
+- Designed for O-Lang `.ol` workflow integration
 ---
 ## Installation
 ```bash
-npm install @olang/semantic-doc-search
+npm install @o-lang/semantic-doc-search

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@o-lang/semantic-doc-search",
-  "version": "1.0.39",
+  "version": "1.0.41",
   "description": "O-lang Semantic Document Search Resolver with hybrid search, embeddings, rerank, and streaming.",
   "main": "src/index.js",
   "type": "commonjs",

package/src/resolver.js CHANGED Viewed

@@ -1,5 +1,5 @@
 const VectorRouter = require("./adapters/vectorRouter");
-const embedder = require("./embeddings/local"); // singleton embedder
+const embedder = require("./embeddings/local");
 const { extractQuery } = require("./utils/extractQuery");
 const { formatResults } = require("./utils/formatResults");
 const fs = require("fs");
@@ -8,7 +8,9 @@ const crypto = require("crypto");
 const CACHE_PATH = path.join(process.cwd(), "embeddings.json");
-// Load cache for ingestion guard
+// ─────────────────────────────────────────────
+// Helpers (UNCHANGED)
+// ─────────────────────────────────────────────
 function loadCache() {
   try {
     if (fs.existsSync(CACHE_PATH)) {
@@ -24,218 +26,104 @@ function saveCache(cache) {
   } catch {}
 }
-/**
- * Clean text for embedding (defensive)
- */
 function sanitizeTextForEmbedding(text) {
   if (typeof text !== "string") return "";
-  // Remove wrapping quotes and extra whitespace
   return text.replace(/^["']|["']$/g, "").trim();
 }
-/**
- * Load documents from doc_root if provided
- */
-function loadDocumentsFromContext(context) {
-  if (context.documents && Array.isArray(context.documents)) {
-    return context.documents;
-  }
-  if (context.doc_root) {
-    const baseDir = path.resolve(process.cwd(), context.doc_root);
-    if (fs.existsSync(baseDir)) {
-      const files = fs.readdirSync(baseDir).filter(f => f.endsWith('.txt') || f.endsWith('.md'));
-      const docs = files.map(file => {
-        try {
-          const content = fs.readFileSync(path.join(baseDir, file), 'utf8');
-          return { id: file, content, source: `file:${file}` };
-        } catch (err) {
-          console.warn(`⚠️ Failed to read ${file}:`, err.message);
-          return null;
-        }
-      }).filter(Boolean);
-      console.log(`📄 Loaded ${docs.length} document(s) from ${baseDir}`);
-      docs.forEach(d => console.log(`  - ${d.id} (${d.content?.length || 0} chars)`));
-      return docs;
-    }
-  }
-  return [];
-}
-/**
- * Convert any array-like (Float32Array, etc.) to plain JS array
- */
-function toPlainArray(input) {
-  if (!input) return null;
-  if (Array.isArray(input)) return input;
-  if (ArrayBuffer.isView(input)) return Array.from(input);
-  return null;
-}
-/**
- * Hash text for caching
- */
 function hashText(str) {
   return crypto.createHash("sha256").update(str).digest("hex");
 }
-/**
- * Semantic Doc Search Resolver
- */
+// ─────────────────────────────────────────────
+// 🔥 MAIN RESOLVER
+// ─────────────────────────────────────────────
 async function resolver(action, context = {}) {
   if (typeof action !== "string") return;
-  if (!action.toLowerCase().startsWith("ask doc-search")) return;
-  let query = extractQuery(action);
-  query = sanitizeTextForEmbedding(query);
-  if (!query) return { text: "(Empty query)", meta: { matches: 0 } };
-  // Vector backend
   const vectorStore = VectorRouter.create(context);
+  const embed = await embedder({ dimension: 384 });
-  // Ensure backend supports search
-  if (!vectorStore.supports("vector.search")) {
-    throw new Error("Vector backend does not support vector.search");
-  }
+  const doc_root = context.doc_root || "./docs";
+  const useCache = !!context.POSTGRES_URL || !!context.REDIS_URL;
+  const cache = useCache ? loadCache() : {};
-  // Load documents (from context.documents OR doc_root)
-  const documents = loadDocumentsFromContext(context);
-  console.log("🔄 Starting ingestion for", documents.length, "documents");
+  // =====================================================
+  // ✅ 1. VECTOR INSERT (INGEST)
+  // =====================================================
+  if (action.includes("vector.insert")) {
+    let inserted = 0;
-  // Ingestion guard cache
-  const cache = loadCache();
-  // --- Document ingestion ---
-  if (documents.length > 0) {
-    if (!vectorStore.supports("vector.insert")) {
-      throw new Error("Vector backend does not support vector.insert");
-    }
-    for (const doc of documents) {
-      console.log("📄 Processing doc:", doc.id, "content length:", doc.content?.length);
-      if (!doc?.content) {
-        console.warn("⚠️ Skipping empty doc:", doc?.id);
-        continue;
-      }
-      // ✅ CORRECT PATH: Adjust if chunker.js is in src/utils/
-      let chunks;
-      try {
-        const chunkText = require("./utils/chunker.js").chunkText;
-        chunks = chunkText(doc.content, 500) || [doc.content];
-        console.log("📦", doc.id, "split into", chunks.length, "chunks");
-      } catch (err) {
-        console.warn("⚠️ Chunking failed, using full doc:", err.message);
-        chunks = [doc.content];
-      }
+    if (fs.existsSync(doc_root)) {
+      const files = fs.readdirSync(doc_root);
-      for (let i = 0; i < chunks.length; i++) {
-        const text = sanitizeTextForEmbedding(chunks[i]);
-        console.log("🧩 Chunk", `${doc.id}:${i}`, "text:", JSON.stringify(text));
-        if (!text) {
-          console.warn(`⚠️ Skipping empty chunk ${doc.id}:${i}`);
-          continue;
-        }
+      for (const file of files) {
+        const fullPath = path.join(doc_root, file);
+        if (!fs.statSync(fullPath).isFile()) continue;
-        const hash = hashText(text);
-        if (cache[hash]) {
-          console.log(`⏭️  Skipping already ingested chunk ${doc.id}:${i}`);
-          continue;
-        }
+        const content = fs.readFileSync(fullPath, "utf8");
+        if (!content) continue;
-        // 🔒 DEFENSIVE EMBEDDING
-        let rawVector;
-        try {
-          rawVector = await embedder.embed(text);
-        } catch (err) {
-          console.warn(`⚠️ Embedding failed for chunk ${doc.id}:${i} ("${text.slice(0, 30)}..."):`, err.message);
-          continue;
-        }
+        const chunkText = require("./utils/chunker").chunkText;
+        const chunks = chunkText(content, 500, 50);
-        // Validate vector type
-        if (!rawVector || (!Array.isArray(rawVector) && !ArrayBuffer.isView(rawVector))) {
-          console.warn(`⚠️ Invalid vector type for chunk ${doc.id}:${i}:`, typeof rawVector);
-          continue;
-        }
+        for (let i = 0; i < chunks.length; i++) {
+          const text = sanitizeTextForEmbedding(chunks[i]);
+          if (!text) continue;
-        // Check for zero vectors
-        const isZero = rawVector.every?.(v => v === 0) || false;
-        if (isZero) {
-          console.warn(`⚠️ Zero vector for chunk ${doc.id}:${i}`);
-          continue;
-        }
+          const hash = hashText(text);
+          if (useCache && cache[hash]) continue;
-        // Normalize to plain array for storage
-        const vector = toPlainArray(rawVector);
-        if (!vector) {
-          console.warn(`⚠️ Failed to normalize vector for chunk ${doc.id}:${i}`);
-          continue;
-        }
+          const rawVector = await embed(text);
+          const vector = Array.from(rawVector);
-        try {
           await vectorStore.upsert({
-            id: `${doc.id}:${i}`,
+            id: `${file}:${i}`,
             vector,
             content: text,
-            source: doc.source,
+            source: `file:${file}`,
           });
-          cache[hash] = true;
-          console.log(`✅ Upserted ${doc.id}:${i}`);
-        } catch (err) {
-          console.warn(`⚠️ Upsert failed for ${doc.id}:${i}:`, err.message);
-          continue;
+          if (useCache) cache[hash] = true;
+          inserted++;
         }
       }
     }
-    saveCache(cache);
-  }
-  // --- QUERY EMBEDDING ---
-  let rawQueryVector;
-  try {
-    rawQueryVector = await embedder.embed(query);
-  } catch (err) {
-    console.error(`❌ Query embedding failed: "${query}"`, err.message);
-    return { text: "(Query embedding failed)", meta: { matches: 0 } };
-  }
+    if (useCache) saveCache(cache);
+    if (vectorStore.close) await vectorStore.close();
-  // ✅ FULLY COMPLETE VALIDATION LINE
-  if (!rawQueryVector || (!Array.isArray(rawQueryVector) && !ArrayBuffer.isView(rawQueryVector))) {
-    console.error("❌ Invalid query vector type:", typeof rawQueryVector);
-    return { text: "(Invalid query vector)", meta: { matches: 0 } };
+    return { inserted, doc_root };
   }
-  const isZeroQuery = rawQueryVector.every?.(v => v === 0) || false;
-  if (isZeroQuery) {
-    console.warn("⚠️ Zero vector for query");
-    return { text: "(Query produced zero vector)", meta: { matches: 0 } };
-  }
+  // =====================================================
+  // ✅ 2. VECTOR SEARCH
+  // =====================================================
+  if (action.includes("vector.search")) {
+    const query = sanitizeTextForEmbedding(extractQuery(action));
+    if (!query) return { text: "", matches: [] };
-  const queryVector = toPlainArray(rawQueryVector);
-  if (!queryVector) {
-    return { text: "(Failed to normalize query vector)", meta: { matches: 0 } };
-  }
+    const rawQueryVector = await embed(query);
+    const queryVector = Array.from(rawQueryVector);
-  // --- SEARCH ---
-  try {
-    console.log("🔍 Executing vector search...");
     const results = await vectorStore.query(queryVector, {
       topK: context.topK || 5,
-      minScore: context.minScore || 0,
     });
-    console.log("📊 Raw search results:", results.length);
-    results.forEach((r, i) => console.log(`  ${i}: score=${r.score?.toFixed(4)}, content="${r.content?.substring(0, 50)}..."`));
+    if (vectorStore.close) await vectorStore.close();
     return formatResults(results, query);
-  } catch (err) {
-    console.error("❌ Vector search failed:", err.message);
-    return { text: "(Search failed)", meta: { matches: 0 } };
   }
+  // =====================================================
+  // ❌ REMOVE THIS (legacy)
+  // =====================================================
+  // if (action.startsWith("Ask doc-search")) { ... }
+  return;
 }
+resolver.resolverName = "vector";
+resolver.version = "1.0.0";
 module.exports = resolver;

package/src/services/docQA.js CHANGED Viewed

@@ -2,20 +2,22 @@ const VectorRouter = require("../adapters/vectorRouter");
 const embedder = require("../embeddings/local");
 const extractText = require("../utils/extractText");
 const chunkText = require("../utils/chunker");
+const formatResults = require("../utils/formatResults");
 const fs = require("fs");
 const path = require("path");
 async function performDocQA(
   query,
   {
-    doc_root,
-    vectorBackend = "pgvector",
+    doc_root = "./docs",
+    vectorBackend = "memory", // 🔥 default to memory like Python fallback
     dimension = 384,
-    migrate_on_demand = false,
     POSTGRES_URL,
+    topK = 5,
     ...config
   } = {}
 ) {
+  // ── Create vector store
   const store = VectorRouter.create({
     backend: vectorBackend,
     dimension,
@@ -25,30 +27,65 @@ async function performDocQA(
   const embed = await embedder({ dimension });
-  if (migrate_on_demand && doc_root) {
-    for (const file of fs.readdirSync(doc_root)) {
+  // ─────────────────────────────────────────────
+  // 🔥 ALWAYS INGEST (Python parity)
+  // ─────────────────────────────────────────────
+  if (doc_root && fs.existsSync(doc_root)) {
+    const files = fs.readdirSync(doc_root);
+    for (const file of files) {
       const fullPath = path.join(doc_root, file);
       if (!fs.statSync(fullPath).isFile()) continue;
+      if (!file.endsWith(".txt") && !file.endsWith(".md")) continue;
+      try {
+        const text = await extractText(fullPath);
+        if (!text || !text.trim()) continue;
+        const chunks = chunkText(text, 500, 50) || [text];
+        for (let i = 0; i < chunks.length; i++) {
+          const chunk = chunks[i];
+          if (!chunk.trim()) continue;
-      const text = await extractText(fullPath);
-      const chunks = chunkText(text);
-      for (let i = 0; i < chunks.length; i++) {
-        await store.upsert({
-          id: `${file}-${i}`,
-          vector: await embed(chunks[i]),
-          content: chunks[i],
-          source: file,
-          metadata: { chunk: i }
-        });
+          try {
+            await store.upsert({
+              id: `${file}:${i}`,
+              vector: await embed(chunk),
+              content: chunk,
+              source: `file:${file}`,
+              metadata: { chunk: i }
+            });
+          } catch (err) {
+            console.warn("⚠️ Chunk failed:", err.message);
+          }
+        }
+      } catch (err) {
+        console.error("❌ Failed to process file:", file, err.message);
       }
     }
   }
-  const results = await store.query(await embed(query), { topK: 5 });
+  // ─────────────────────────────────────────────
+  // 🔍 SEARCH
+  // ─────────────────────────────────────────────
+  let matches = [];
+  try {
+    const queryVector = await embed(query);
+    matches = await store.query(queryVector, { topK });
+  } catch (err) {
+    console.error("❌ Search failed:", err.message);
+  }
   if (store.close) await store.close();
-  return results;
+  // ─────────────────────────────────────────────
+  // ✅ FORMAT LIKE PYTHON
+  // ─────────────────────────────────────────────
+  return formatResults(matches, query);
 }
-module.exports = performDocQA;
+module.exports = performDocQA;

package/src/utils/formatResults.js CHANGED Viewed

@@ -1,15 +1,14 @@
-/**
- * Normalizes vector search results for O-Lang workflows.
- * Returns both structured matches AND a plain .text field for LLM prompts.
- */
-function formatResults(results = [], query) {
-  // ✅ Generate plain text from all matches
-  const text = results.map(r => r.content).join('\n\n');
+function formatResults(results = [], query = "") {
+  const safeResults = Array.isArray(results) ? results : [];
+  const text = safeResults.length
+    ? safeResults.map(r => r.content).join('\n\n')
+    : "";
   return {
     query,
-    text, // ← THIS IS THE KEY ADDITION
-    matches: results.map(r => ({
+    text,
+    matches: safeResults.map(r => ({
       id: r.id,
       content: r.content,
       source: r.source,