@o-lang/semantic-doc-search 1.0.12 → 1.0.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/embeddings/local.js +122 -24
package/package.json
CHANGED
package/src/embeddings/local.js
CHANGED
|
@@ -1,53 +1,151 @@
|
|
|
1
1
|
// src/embeddings/local.js
|
|
2
|
-
const
|
|
2
|
+
const { pipeline, env } = require('@xenova/transformers');
|
|
3
|
+
|
|
4
|
+
// Configure transformers to work in Node.js
|
|
5
|
+
env.allowLocalModels = true; // Let the library resolve model files from local disk (presumably the on-disk cache — confirm against the transformers.js env docs)
|
|
6
|
+
env.backends.onnx.warmup = false; // Intended to speed up startup. NOTE(review): confirm the ONNX backend actually reads a `warmup` setting.
|
|
3
7
|
|
|
4
8
|
/**
|
|
5
|
-
* LocalEmbedding
|
|
6
|
-
*
|
|
7
|
-
*
|
|
8
|
-
*
|
|
9
|
+
* LocalEmbedding - REAL semantic embeddings using all-MiniLM-L6-v2
|
|
10
|
+
* This is a compact, high-quality sentence transformer that:
|
|
11
|
+
* - Understands semantic meaning of text
|
|
12
|
+
* - Produces embeddings with negative and positive values (-1 to 1)
|
|
13
|
+
* - Works offline after first download
|
|
14
|
+
* - Is optimized for CPU (no GPU required)
|
|
15
|
+
* - Produces 384-dimensional vectors compatible with pgvector
|
|
9
16
|
*/
|
|
10
17
|
class LocalEmbedding {
  /**
   * Create an embedder. The model itself is not loaded here; it is loaded
   * lazily on the first call to getModel() / embed().
   */
  constructor() {
    this.dim = 384; // all-MiniLM-L6-v2 output dimension
    this.modelPromise = null; // cached promise for the feature-extraction pipeline
    this.isModelLoading = false; // true only while a load is in flight
  }

  /**
   * Lazy-load the embedding model (only loads when first needed).
   *
   * Concurrent callers share one in-flight load. If the load fails, the
   * cached promise is discarded so that a later call starts a fresh load
   * instead of replaying the same rejection forever.
   *
   * @returns {Promise<Function>} Resolves to the pipeline callable.
   * @throws Rejects with the loader's error when the model cannot be loaded.
   */
  async getModel() {
    if (!this.modelPromise) {
      this.isModelLoading = true;
      console.log('🔄 Loading local embedding model (first run may take 1-2 minutes)...');

      const loading = pipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2', {
        revision: 'main',
        cache_dir: './.cache/embeddings' // Cache model locally
      }).then(model => {
        console.log('✅ Local embedding model loaded successfully!');
        this.isModelLoading = false;
        return model;
      }).catch(error => {
        console.error('❌ Failed to load local embedding model:', error.message);
        this.isModelLoading = false;
        // Drop the failed promise (unless a newer load already replaced it)
        // so the next getModel() call can try again.
        if (this.modelPromise === loading) {
          this.modelPromise = null;
        }
        throw error;
      });

      this.modelPromise = loading;
    }
    return this.modelPromise;
  }

  /**
   * Generate a semantic embedding for text.
   *
   * Falsy or whitespace-only input yields a zero vector. Any failure while
   * loading or running the model is logged and also yields a zero vector,
   * so callers that need to detect failure must check for all zeros.
   *
   * @param {string} text - Input text to embed
   * @returns {Promise<number[]>} Vector of exactly `this.dim` numbers
   * @throws {TypeError} If `text` is truthy but not a string
   */
  async embed(text) {
    if (!text) {
      // Return zero vector for empty text
      return new Array(this.dim).fill(0);
    }
    if (typeof text !== 'string') {
      throw new TypeError(`embed expects a string, received ${typeof text}`);
    }
    if (!text.trim()) {
      return new Array(this.dim).fill(0);
    }

    try {
      const model = await this.getModel();

      // Mean pooling + normalization, requested from the pipeline itself
      const output = await model(text, {
        pooling: 'mean',
        normalize: true
      });

      // Convert the typed array to a regular array
      const embedding = Array.from(output.data);

      if (embedding.length === this.dim) {
        return embedding;
      }

      // Unexpected size: pad with zeros or truncate so callers always
      // receive exactly this.dim values.
      console.warn(`⚠️ Expected ${this.dim} dimensions, got ${embedding.length}`);
      if (embedding.length < this.dim) {
        return [...embedding, ...new Array(this.dim - embedding.length).fill(0)];
      }
      return embedding.slice(0, this.dim);
    } catch (error) {
      console.error(`❌ Embedding generation failed for text: "${text.substring(0, 50)}..."`);
      console.error('Error:', error.message);

      // Fallback to zero vector to prevent complete failure
      return new Array(this.dim).fill(0);
    }
  }

  /**
   * Batch embedding for multiple strings. Items are embedded one at a time,
   * in order, rather than concurrently.
   *
   * @param {string[]} [textArray=[]] - Texts to embed
   * @returns {Promise<number[][]>} One embedding per input, in input order
   * @throws {Error} If `textArray` is not an array
   */
  async embedBatch(textArray = []) {
    if (!Array.isArray(textArray)) {
      throw new Error("embedBatch expects an array of strings");
    }

    const embeddings = [];
    for (const text of textArray) {
      const embedding = await this.embed(text);
      embeddings.push(embedding);
    }
    return embeddings;
  }

  /**
   * Get embedding dimension.
   *
   * @returns {number} The configured vector length (`this.dim`)
   */
  getDimension() {
    return this.dim;
  }
}
|
|
39
117
|
|
|
40
118
|
/**
|
|
41
|
-
* Convenience function for
|
|
42
|
-
*
|
|
119
|
+
* Convenience function for compatibility with existing code
|
|
120
|
+
* Creates embedding with retry logic
|
|
43
121
|
*/
|
|
44
|
-
async function createEmbeddingWithRetry(text, options = {}, retries =
|
|
122
|
+
/**
 * Embed `text` with a LocalEmbedding, retrying when the attempt throws or
 * when non-empty text comes back as an all-zero vector (an all-zero vector
 * for non-empty text is treated as a failure here).
 *
 * Each retry uses a fresh LocalEmbedding so that a failure held by the
 * previous instance cannot make the retry fail the same way.
 *
 * @param {string} text - Text to embed
 * @param {object} [options={}] - Accepted for compatibility; not read by this function
 * @param {number} [retries=2] - Total number of attempts; values below 1 or
 *   non-finite values are treated as 1, fractions are rounded down
 * @returns {Promise<number[]>} The embedding vector
 * @throws The last error once every attempt has failed
 */
async function createEmbeddingWithRetry(text, options = {}, retries = 2) {
  // Always make at least one attempt; a zero/negative/fractional count would
  // otherwise fall out of the loop and resolve to undefined.
  const maxAttempts = Number.isFinite(retries) && retries >= 1 ? Math.floor(retries) : 1;
  // Safe preview for log messages even when `text` is not a string.
  const preview = (text == null ? '' : String(text)).substring(0, 50);
  const hasContent = typeof text === 'string' && text.trim() !== '';

  let embedder = new LocalEmbedding();

  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
    try {
      const embedding = await embedder.embed(text);

      // Verify embedding is valid (not all zeros)
      const isAllZeros = embedding.every(val => val === 0);
      if (isAllZeros && hasContent) {
        if (attempt === maxAttempts) {
          console.warn(`⚠️ Embedding is all zeros for text: "${preview}..."`);
        }
        throw new Error('Embedding returned all zeros');
      }

      return embedding;
    } catch (err) {
      if (attempt === maxAttempts) {
        console.error(`❌ All ${maxAttempts} attempts failed for embedding text: "${preview}..."`);
        throw err;
      }

      console.warn(`⚠️ Embedding attempt ${attempt} failed, retrying...`);
      // Add small delay before retry
      await new Promise(resolve => setTimeout(resolve, 100 * attempt));
      // Start the next attempt from a clean embedder.
      embedder = new LocalEmbedding();
    }
  }
}
|