npm - @o-lang/semantic-doc-search - Versions diffs - 1.0.12 → 1.0.14 - Mend

@o-lang/semantic-doc-search 1.0.12 → 1.0.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (2)

package/package.json +1 -1
package/src/embeddings/local.js +91 -23

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "@o-lang/semantic-doc-search",
-  "version": "1.0.12",
+  "version": "1.0.14",
   "description": "O-lang Semantic Document Search Resolver with hybrid search, embeddings, rerank, and streaming.",
   "main": "src/index.js",
   "type": "commonjs",

package/src/embeddings/local.js CHANGED

@@ -1,53 +1,121 @@
 // src/embeddings/local.js
-const crypto = require("crypto");
 /**
- * LocalEmbedding
- * Generates deterministic "fake" embeddings for offline testing or fallback.
- * Each string will produce a consistent vector based on a hash.
- * Note: Not semantic, just a placeholder for testing.
+ * LocalEmbedding - REAL semantic embeddings using all-MiniLM-L6-v2
+ * Uses dynamic import to work with ESM packages in CommonJS environment
  */
 class LocalEmbedding {
-  constructor(dim = 512) {
-    this.dim = dim;
+  constructor() {
+    this.dim = 384;
+    this.modelPromise = null;
+    this.transformersPromise = null;
   }
   /**
-   * Convert text → deterministic pseudo-vector
+   * Lazy-load the @xenova/transformers package
    */
-  embed(text) {
-    if (!text || !text.trim()) return new Array(this.dim).fill(0);
+  async getTransformers() {
+    if (!this.transformersPromise) {
+      this.transformersPromise = import('@xenova/transformers');
+    }
+    return this.transformersPromise;
+  }
-    const hash = crypto.createHash("sha256").update(text).digest();
-    const vector = [];
+  /**
+   * Lazy-load the embedding model
+   */
+  async getModel() {
+    if (!this.modelPromise) {
+      const { pipeline, env } = await this.getTransformers();
+      // Configure transformers
+      env.allowLocalModels = true;
+      env.backends.onnx.warmup = false;
+      console.log('🔄 Loading local embedding model (first run may take 1-2 minutes)...');
+      this.modelPromise = pipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2', {
+        revision: 'main',
+        cache_dir: './.cache/embeddings'
+      }).then(model => {
+        console.log('✅ Local embedding model loaded successfully!');
+        return model;
+      }).catch(error => {
+        console.error('❌ Failed to load local embedding model:', error.message);
+        throw error;
+      });
+    }
+    return this.modelPromise;
+  }
-    for (let i = 0; i < this.dim; i++) {
-      vector.push(hash[i % hash.length] / 255); // normalize 0–1
+  /**
+   * Generate REAL semantic embedding for text
+   */
+  async embed(text) {
+    if (!text || !text.trim()) {
+      return new Array(this.dim).fill(0);
     }
-    return vector;
+    try {
+      const model = await this.getModel();
+      const output = await model(text, {
+        pooling: 'mean',
+        normalize: true
+      });
+      return Array.from(output.data);
+    } catch (error) {
+      console.error(`❌ Embedding failed for: "${text.substring(0, 50)}..."`);
+      return new Array(this.dim).fill(0);
+    }
   }
   /**
    * Batch embedding for multiple strings
    */
-  embedBatch(textArray = []) {
-    if (!Array.isArray(textArray)) throw new Error("embedBatch expects an array");
-    return textArray.map(text => this.embed(text));
+  async embedBatch(textArray = []) {
+    if (!Array.isArray(textArray)) {
+      throw new Error("embedBatch expects an array of strings");
+    }
+    const embeddings = [];
+    for (const text of textArray) {
+      const embedding = await this.embed(text);
+      embeddings.push(embedding);
+    }
+    return embeddings;
+  }
+  /**
+   * Get embedding dimension
+   */
+  getDimension() {
+    return this.dim;
   }
 }
 /**
- * Convenience function for index.js
- * Retries local embedding generation (mostly placeholder, but keeps API compatible)
+ * Convenience function for compatibility
  */
-async function createEmbeddingWithRetry(text, options = {}, retries = 1) {
+async function createEmbeddingWithRetry(text, options = {}, retries = 2) {
   const embedder = new LocalEmbedding();
   for (let attempt = 1; attempt <= retries; attempt++) {
     try {
-      return embedder.embed(text);
+      const embedding = await embedder.embed(text);
+      const isAllZeros = embedding.every(val => val === 0);
+      if (isAllZeros && (text || '').trim()) {
+        if (attempt === retries) {
+          console.warn(`⚠️ Embedding is all zeros for text: "${text.substring(0, 50)}..."`);
+        }
+        throw new Error('Embedding returned all zeros');
+      }
+      return embedding;
     } catch (err) {
-      if (attempt === retries) throw err;
+      if (attempt === retries) {
+        console.error(`❌ All ${retries} attempts failed for embedding text: "${text.substring(0, 50)}..."`);
+        throw err;
+      }
+      console.warn(`⚠️ Embedding attempt ${attempt} failed, retrying...`);
+      await new Promise(resolve => setTimeout(resolve, 100 * attempt));
     }
   }
 }