npm - @o-lang/semantic-doc-search - Versions diffs - 1.0.32 → 1.0.34 - Mend

@o-lang/semantic-doc-search 1.0.32 → 1.0.34

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (4) hide show

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@o-lang/semantic-doc-search",
-  "version": "1.0.32",
+  "version": "1.0.34",
   "description": "O-lang Semantic Document Search Resolver with hybrid search, embeddings, rerank, and streaming.",
   "main": "src/index.js",
   "type": "commonjs",

package/src/embeddings/local.js CHANGED Viewed

@@ -7,6 +7,8 @@
  * - No zero vectors
  * - Deterministic behavior
  * - DEFENSIVE against method detaching & invalid vectors
+ * - WINDOWS-SAFE (disables SIMD, threads, proxy)
+ * - TENSOR-SAFE (handles Float32Array, Array, and all ONNX tensor types)
  */
 class LocalEmbedding {
@@ -29,13 +31,21 @@ class LocalEmbedding {
     if (!this.loading) {
       this.loading = (async () => {
-        const { pipeline, env } = await import("@xenova/transformers");
+        // ⚠️ CRITICAL: Configure environment BEFORE loading model
+        const { env } = await import("@xenova/transformers");
-        // Safe defaults
+        // Safe settings for all platforms (harmless on macOS/Linux, essential on Windows)
+        env.backends.onnx.wasm.simd = false;      // Avoids AVX/SIMD crashes on older CPUs
+        env.backends.onnx.wasm.threads = false;   // Prevents threading issues in Node
+        env.backends.onnx.wasm.proxy = false;     // Avoids proxy complications
         env.allowLocalModels = true;
         env.backends.onnx.warmup = false;
+        env.cacheDir = "./.cache/embeddings";     // Explicit, project-local cache
         console.log("🔄 Loading local embedding model (first run only)...");
+        console.log("⚙️  Using WASM (SIMD disabled) for cross-platform compatibility");
+        const { pipeline } = await import("@xenova/transformers");
         const model = await pipeline(
           "feature-extraction",
@@ -57,9 +67,6 @@ class LocalEmbedding {
   /* ---------------- PUBLIC API ---------------- */
-  /**
-   * Generate embedding for a single string
-   */
   async embed(text) {
     if (typeof text !== "string" || !text.trim()) {
       throw new Error("Embedding input must be a non-empty string");
@@ -73,11 +80,45 @@ class LocalEmbedding {
         normalize: true,
       });
-      // ✅ Defensive: ensure vector is a valid array
-      const vector = Array.isArray(output?.data) ? Array.from(output.data) : null;
+      // 🔍 DEBUG: Inspect output structure
+      console.log("🔍 Model output type:", typeof output);
+      if (output && typeof output === 'object') {
+        console.log("🔍 Output keys:", Object.keys(output));
+        console.log("🔍 Output dims:", output.dims);
+        console.log("🔍 output.data type:", Object.prototype.toString.call(output.data));
+        console.log("🔍 Is TypedArray?", ArrayBuffer.isView(output.data));
+      }
+      // ✅ UNIVERSAL EXTRACTION: handles Float32Array, Array, and all tensor forms
+      let vector = null;
+      if (output && output.data !== undefined) {
+        // Handle Float32Array, Uint8Array, etc. (standard in ONNX/WASM)
+        if (ArrayBuffer.isView(output.data)) {
+          vector = Array.from(output.data);
+        }
+        // Handle plain JS array (older backends or CPU mode)
+        else if (Array.isArray(output.data)) {
+          vector = Array.from(output.data);
+        }
+      }
+      // Handle batch output: [tensor]
+      else if (Array.isArray(output) && output[0]?.data !== undefined) {
+        if (ArrayBuffer.isView(output[0].data)) {
+          vector = Array.from(output[0].data);
+        } else if (Array.isArray(output[0].data)) {
+          vector = Array.from(output[0].data);
+        }
+      }
+      // Fallback: raw array (rare)
+      else if (Array.isArray(output)) {
+        vector = output;
+      }
+      // Final validation
       if (!Array.isArray(vector) || vector.length !== this.dim) {
-        console.error("❌ Invalid embedding vector returned:", vector);
+        console.error("❌ Invalid embedding vector length:", vector?.length);
+        console.error("❌ First few values:", vector?.slice?.(0, 5));
         throw new Error(`Invalid embedding dimension: ${vector?.length || 0} (expected ${this.dim})`);
       }
@@ -85,15 +126,12 @@ class LocalEmbedding {
     } catch (err) {
       console.error(
         `❌ Embedding failed for text: "${text.slice(0, 60)}..."`,
-        err
+        err.message
       );
       throw err;
     }
   }
-  /**
-   * Batch embedding (sequential, safe)
-   */
   async embedBatch(texts = []) {
     if (!Array.isArray(texts)) {
       throw new Error("embedBatch expects an array of strings");
@@ -106,15 +144,10 @@ class LocalEmbedding {
     return results;
   }
-  /**
-   * Return embedding dimension
-   */
   getDimension() {
     return this.dim;
   }
 }
-/* ---------------- SINGLETON EXPORT ---------------- */
 const embedder = new LocalEmbedding();
-module.exports = embedder;
+module.exports = embedder;

package/src/resolver.js CHANGED Viewed

@@ -1,5 +1,5 @@
 const VectorRouter = require("./adapters/vectorRouter");
-const embedder = require("./embeddings/local"); // ✅ singleton embedder
+const embedder = require("./embeddings/local"); // singleton embedder
 const { extractQuery } = require("./utils/extractQuery");
 const { formatResults } = require("./utils/formatResults");
 const fs = require("fs");
@@ -8,7 +8,7 @@ const crypto = require("crypto");
 const CACHE_PATH = path.join(process.cwd(), "embeddings.json");
-// ---------------- Cache utils ----------------
+// Load cache for ingestion guard
 function loadCache() {
   try {
     if (fs.existsSync(CACHE_PATH)) {
@@ -17,24 +17,36 @@ function loadCache() {
   } catch {}
   return {};
 }
 function saveCache(cache) {
   try {
     fs.writeFileSync(CACHE_PATH, JSON.stringify(cache, null, 2));
   } catch {}
 }
 function hashContent(str) {
   return crypto.createHash("sha256").update(str).digest("hex");
 }
-// ---------------- Resolver ----------------
+/**
+ * Clean text for embedding (defensive)
+ */
+function sanitizeTextForEmbedding(text) {
+  if (typeof text !== "string") return "";
+  // Remove wrapping quotes and extra whitespace
+  return text.replace(/^["']|["']$/g, "").trim();
+}
+/**
+ * Semantic Doc Search Resolver
+ */
 async function resolver(action, context = {}) {
   if (typeof action !== "string") return;
   if (!action.toLowerCase().startsWith("ask doc-search")) return;
-  // Extract & sanitize query
-  const queryRaw = extractQuery(action);
-  const query = typeof queryRaw === "string" ? queryRaw.replace(/^["']|["']$/g, "").trim() : "";
-  if (!query) throw new Error("Query is empty after sanitization");
+  let query = extractQuery(action);
+  query = sanitizeTextForEmbedding(query);
+  if (!query) return { text: "(Empty query)", meta: { matches: 0 } };
   // Vector backend
   const vectorStore = VectorRouter.create(context);
@@ -61,14 +73,14 @@ async function resolver(action, context = {}) {
     for (const doc of context.documents) {
       const chunks = doc.chunks || [doc.content];
       for (let i = 0; i < chunks.length; i++) {
-        const text = chunks[i];
-        if (!text || typeof text !== "string") continue;
+        const text = sanitizeTextForEmbedding(chunks[i]);
+        if (!text) continue;
         const hash = hashContent(text);
         if (cache[hash]) continue; // Skip already ingested
         const vector = await embedder.embed(text);
-        if (!Array.isArray(vector) || vector.every(v => v === 0)) continue;
+        if (!vector || vector.every(v => v === 0)) continue;
         await vectorStore.upsert({
           id: `${doc.id}:${i}`,
@@ -85,10 +97,12 @@ async function resolver(action, context = {}) {
   // Embed query & search
   const queryVector = await embedder.embed(query);
-  if (!Array.isArray(queryVector) || queryVector.length !== embedder.getDimension()) {
-    throw new Error("Query embedding invalid or not a proper array");
+  if (!queryVector || queryVector.every(v => v === 0)) {
+    console.warn("⚠️ Query embedding invalid");
+    return { text: "(Query could not be embedded)", meta: { matches: 0 } };
   }
+  // Top-K + similarity threshold
   const results = await vectorStore.query({
     vector: queryVector,
     topK: context.topK || 5,

package/test-embed.js ADDED Viewed

@@ -0,0 +1,10 @@
+// test-embed.js
+const embedder = require("./src/embeddings/local");
+async function test() {
+  console.log("Model dimension:", embedder.getDimension());
+  const vector = await embedder.embed("hello world");
+  console.log("Embedding result:", vector?.length, vector);
+}
+test().catch(console.error);