@o-lang/semantic-doc-search 1.0.13 → 1.0.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/embeddings/local.js +25 -55
package/package.json
CHANGED
package/src/embeddings/local.js
CHANGED
|
@@ -1,44 +1,47 @@
|
|
|
1
1
|
// src/embeddings/local.js
|
|
2
|
-
const { pipeline, env } = require('@xenova/transformers');
|
|
3
|
-
|
|
4
|
-
// Configure transformers to work in Node.js
|
|
5
|
-
env.allowLocalModels = true;
|
|
6
|
-
env.backends.onnx.warmup = false; // Faster startup
|
|
7
2
|
|
|
8
3
|
/**
|
|
9
4
|
* LocalEmbedding - REAL semantic embeddings using all-MiniLM-L6-v2
|
|
10
|
-
*
|
|
11
|
-
* - Understands semantic meaning of text
|
|
12
|
-
* - Produces embeddings with negative and positive values (-1 to 1)
|
|
13
|
-
* - Works offline after first download
|
|
14
|
-
* - Is optimized for CPU (no GPU required)
|
|
15
|
-
* - Produces 384-dimensional vectors compatible with pgvector
|
|
5
|
+
* Uses dynamic import to work with ESM packages in CommonJS environment
|
|
16
6
|
*/
|
|
17
7
|
class LocalEmbedding {
|
|
18
8
|
constructor() {
|
|
19
|
-
this.dim = 384;
|
|
9
|
+
this.dim = 384;
|
|
20
10
|
this.modelPromise = null;
|
|
21
|
-
this.isModelLoading = false;
|
|
11
|
+
this.transformersPromise = null;
|
|
12
|
+
}
|
|
13
|
+
|
|
14
|
+
/**
|
|
15
|
+
* Lazy-load the @xenova/transformers package
|
|
16
|
+
*/
|
|
17
|
+
async getTransformers() {
|
|
18
|
+
if (!this.transformersPromise) {
|
|
19
|
+
this.transformersPromise = import('@xenova/transformers');
|
|
20
|
+
}
|
|
21
|
+
return this.transformersPromise;
|
|
22
22
|
}
|
|
23
23
|
|
|
24
24
|
/**
|
|
25
|
-
* Lazy-load the embedding model
|
|
25
|
+
* Lazy-load the embedding model
|
|
26
26
|
*/
|
|
27
27
|
async getModel() {
|
|
28
28
|
if (!this.modelPromise) {
|
|
29
|
-
|
|
29
|
+
const { pipeline, env } = await this.getTransformers();
|
|
30
|
+
|
|
31
|
+
// Configure transformers
|
|
32
|
+
env.allowLocalModels = true;
|
|
33
|
+
env.backends.onnx.warmup = false;
|
|
34
|
+
|
|
30
35
|
console.log('🔄 Loading local embedding model (first run may take 1-2 minutes)...');
|
|
31
36
|
|
|
32
37
|
this.modelPromise = pipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2', {
|
|
33
38
|
revision: 'main',
|
|
34
|
-
cache_dir: './.cache/embeddings'
|
|
39
|
+
cache_dir: './.cache/embeddings'
|
|
35
40
|
}).then(model => {
|
|
36
41
|
console.log('✅ Local embedding model loaded successfully!');
|
|
37
|
-
this.isModelLoading = false;
|
|
38
42
|
return model;
|
|
39
43
|
}).catch(error => {
|
|
40
44
|
console.error('❌ Failed to load local embedding model:', error.message);
|
|
41
|
-
this.isModelLoading = false;
|
|
42
45
|
throw error;
|
|
43
46
|
});
|
|
44
47
|
}
|
|
@@ -47,58 +50,32 @@ class LocalEmbedding {
|
|
|
47
50
|
|
|
48
51
|
/**
|
|
49
52
|
* Generate REAL semantic embedding for text
|
|
50
|
-
* @param {string} text - Input text to embed
|
|
51
|
-
* @returns {number[]} - 384-dimensional embedding vector with values typically between -1 and 1
|
|
52
53
|
*/
|
|
53
54
|
async embed(text) {
|
|
54
55
|
if (!text || !text.trim()) {
|
|
55
|
-
// Return zero vector for empty text
|
|
56
56
|
return new Array(this.dim).fill(0);
|
|
57
57
|
}
|
|
58
58
|
|
|
59
59
|
try {
|
|
60
60
|
const model = await this.getModel();
|
|
61
|
-
|
|
62
|
-
// Generate embedding with mean pooling and normalization
|
|
63
|
-
// This matches the standard sentence-transformers approach
|
|
64
61
|
const output = await model(text, {
|
|
65
62
|
pooling: 'mean',
|
|
66
63
|
normalize: true
|
|
67
64
|
});
|
|
68
|
-
|
|
69
|
-
// Convert Float32Array to regular array
|
|
70
|
-
const embedding = Array.from(output.data);
|
|
71
|
-
|
|
72
|
-
// Verify dimension
|
|
73
|
-
if (embedding.length !== this.dim) {
|
|
74
|
-
console.warn(`⚠️ Expected ${this.dim} dimensions, got ${embedding.length}`);
|
|
75
|
-
// Pad or truncate to correct dimension
|
|
76
|
-
if (embedding.length < this.dim) {
|
|
77
|
-
return [...embedding, ...new Array(this.dim - embedding.length).fill(0)];
|
|
78
|
-
} else {
|
|
79
|
-
return embedding.slice(0, this.dim);
|
|
80
|
-
}
|
|
81
|
-
}
|
|
82
|
-
|
|
83
|
-
return embedding;
|
|
84
|
-
|
|
65
|
+
return Array.from(output.data);
|
|
85
66
|
} catch (error) {
|
|
86
|
-
console.error(`❌ Embedding
|
|
87
|
-
console.error('Error:', error.message);
|
|
88
|
-
|
|
89
|
-
// Fallback to zero vector to prevent complete failure
|
|
67
|
+
console.error(`❌ Embedding failed for: "${text.substring(0, 50)}..."`);
|
|
90
68
|
return new Array(this.dim).fill(0);
|
|
91
69
|
}
|
|
92
70
|
}
|
|
93
71
|
|
|
94
72
|
/**
|
|
95
|
-
* Batch embedding for multiple strings
|
|
73
|
+
* Batch embedding for multiple strings
|
|
96
74
|
*/
|
|
97
75
|
async embedBatch(textArray = []) {
|
|
98
76
|
if (!Array.isArray(textArray)) {
|
|
99
77
|
throw new Error("embedBatch expects an array of strings");
|
|
100
78
|
}
|
|
101
|
-
|
|
102
79
|
const embeddings = [];
|
|
103
80
|
for (const text of textArray) {
|
|
104
81
|
const embedding = await this.embed(text);
|
|
@@ -116,8 +93,7 @@ class LocalEmbedding {
|
|
|
116
93
|
}
|
|
117
94
|
|
|
118
95
|
/**
|
|
119
|
-
* Convenience function for compatibility
|
|
120
|
-
* Creates embedding with retry logic
|
|
96
|
+
* Convenience function for compatibility
|
|
121
97
|
*/
|
|
122
98
|
async function createEmbeddingWithRetry(text, options = {}, retries = 2) {
|
|
123
99
|
const embedder = new LocalEmbedding();
|
|
@@ -125,8 +101,6 @@ async function createEmbeddingWithRetry(text, options = {}, retries = 2) {
|
|
|
125
101
|
for (let attempt = 1; attempt <= retries; attempt++) {
|
|
126
102
|
try {
|
|
127
103
|
const embedding = await embedder.embed(text);
|
|
128
|
-
|
|
129
|
-
// Verify embedding is valid (not all zeros)
|
|
130
104
|
const isAllZeros = embedding.every(val => val === 0);
|
|
131
105
|
if (isAllZeros && (text || '').trim()) {
|
|
132
106
|
if (attempt === retries) {
|
|
@@ -134,17 +108,13 @@ async function createEmbeddingWithRetry(text, options = {}, retries = 2) {
|
|
|
134
108
|
}
|
|
135
109
|
throw new Error('Embedding returned all zeros');
|
|
136
110
|
}
|
|
137
|
-
|
|
138
111
|
return embedding;
|
|
139
|
-
|
|
140
112
|
} catch (err) {
|
|
141
113
|
if (attempt === retries) {
|
|
142
114
|
console.error(`❌ All ${retries} attempts failed for embedding text: "${text.substring(0, 50)}..."`);
|
|
143
115
|
throw err;
|
|
144
116
|
}
|
|
145
|
-
|
|
146
117
|
console.warn(`⚠️ Embedding attempt ${attempt} failed, retrying...`);
|
|
147
|
-
// Add small delay before retry
|
|
148
118
|
await new Promise(resolve => setTimeout(resolve, 100 * attempt));
|
|
149
119
|
}
|
|
150
120
|
}
|