@o-lang/semantic-doc-search 1.0.12 → 1.0.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@o-lang/semantic-doc-search",
3
- "version": "1.0.12",
3
+ "version": "1.0.14",
4
4
  "description": "O-lang Semantic Document Search Resolver with hybrid search, embeddings, rerank, and streaming.",
5
5
  "main": "src/index.js",
6
6
  "type": "commonjs",
package/src/embeddings/local.js CHANGED
@@ -1,53 +1,121 @@
1
1
  // src/embeddings/local.js
2
- const crypto = require("crypto");
3
2
 
4
3
  /**
5
- * LocalEmbedding
6
- * Generates deterministic "fake" embeddings for offline testing or fallback.
7
- * Each string will produce a consistent vector based on a hash.
8
- * Note: Not semantic, just a placeholder for testing.
4
+ * LocalEmbedding - REAL semantic embeddings using all-MiniLM-L6-v2
5
+ * Uses dynamic import to work with ESM packages in CommonJS environment
9
6
  */
10
7
  class LocalEmbedding {
11
- constructor(dim = 512) {
12
- this.dim = dim;
8
+ constructor() {
9
+ this.dim = 384;
10
+ this.modelPromise = null;
11
+ this.transformersPromise = null;
13
12
  }
14
13
 
15
14
  /**
16
- * Convert text deterministic pseudo-vector
15
+ * Lazy-load the @xenova/transformers package
17
16
  */
18
- embed(text) {
19
- if (!text || !text.trim()) return new Array(this.dim).fill(0);
17
+ async getTransformers() {
18
+ if (!this.transformersPromise) {
19
+ this.transformersPromise = import('@xenova/transformers');
20
+ }
21
+ return this.transformersPromise;
22
+ }
20
23
 
21
- const hash = crypto.createHash("sha256").update(text).digest();
22
- const vector = [];
24
+ /**
25
+ * Lazy-load the embedding model
26
+ */
27
+ async getModel() {
28
+ if (!this.modelPromise) {
29
+ const { pipeline, env } = await this.getTransformers();
30
+
31
+ // Configure transformers
32
+ env.allowLocalModels = true;
33
+ env.backends.onnx.warmup = false;
34
+
35
+ console.log('🔄 Loading local embedding model (first run may take 1-2 minutes)...');
36
+
37
+ this.modelPromise = pipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2', {
38
+ revision: 'main',
39
+ cache_dir: './.cache/embeddings'
40
+ }).then(model => {
41
+ console.log('✅ Local embedding model loaded successfully!');
42
+ return model;
43
+ }).catch(error => {
44
+ console.error('❌ Failed to load local embedding model:', error.message);
45
+ throw error;
46
+ });
47
+ }
48
+ return this.modelPromise;
49
+ }
23
50
 
24
- for (let i = 0; i < this.dim; i++) {
25
- vector.push(hash[i % hash.length] / 255); // normalize 0–1
51
+ /**
52
+ * Generate REAL semantic embedding for text
53
+ */
54
+ async embed(text) {
55
+ if (!text || !text.trim()) {
56
+ return new Array(this.dim).fill(0);
26
57
  }
27
58
 
28
- return vector;
59
+ try {
60
+ const model = await this.getModel();
61
+ const output = await model(text, {
62
+ pooling: 'mean',
63
+ normalize: true
64
+ });
65
+ return Array.from(output.data);
66
+ } catch (error) {
67
+ console.error(`❌ Embedding failed for: "${text.substring(0, 50)}..."`);
68
+ return new Array(this.dim).fill(0);
69
+ }
29
70
  }
30
71
 
31
72
  /**
32
73
  * Batch embedding for multiple strings
33
74
  */
34
- embedBatch(textArray = []) {
35
- if (!Array.isArray(textArray)) throw new Error("embedBatch expects an array");
36
- return textArray.map(text => this.embed(text));
75
+ async embedBatch(textArray = []) {
76
+ if (!Array.isArray(textArray)) {
77
+ throw new Error("embedBatch expects an array of strings");
78
+ }
79
+ const embeddings = [];
80
+ for (const text of textArray) {
81
+ const embedding = await this.embed(text);
82
+ embeddings.push(embedding);
83
+ }
84
+ return embeddings;
85
+ }
86
+
87
+ /**
88
+ * Get embedding dimension
89
+ */
90
+ getDimension() {
91
+ return this.dim;
37
92
  }
38
93
  }
39
94
 
40
95
  /**
41
- * Convenience function for index.js
42
- * Retries local embedding generation (mostly placeholder, but keeps API compatible)
96
+ * Convenience function for compatibility
43
97
  */
44
- async function createEmbeddingWithRetry(text, options = {}, retries = 1) {
98
+ async function createEmbeddingWithRetry(text, options = {}, retries = 2) {
45
99
  const embedder = new LocalEmbedding();
100
+
46
101
  for (let attempt = 1; attempt <= retries; attempt++) {
47
102
  try {
48
- return embedder.embed(text);
103
+ const embedding = await embedder.embed(text);
104
+ const isAllZeros = embedding.every(val => val === 0);
105
+ if (isAllZeros && (text || '').trim()) {
106
+ if (attempt === retries) {
107
+ console.warn(`⚠️ Embedding is all zeros for text: "${text.substring(0, 50)}..."`);
108
+ }
109
+ throw new Error('Embedding returned all zeros');
110
+ }
111
+ return embedding;
49
112
  } catch (err) {
50
- if (attempt === retries) throw err;
113
+ if (attempt === retries) {
114
+ console.error(`❌ All ${retries} attempts failed for embedding text: "${text.substring(0, 50)}..."`);
115
+ throw err;
116
+ }
117
+ console.warn(`⚠️ Embedding attempt ${attempt} failed, retrying...`);
118
+ await new Promise(resolve => setTimeout(resolve, 100 * attempt));
51
119
  }
52
120
  }
53
121
  }