@o-lang/semantic-doc-search 1.0.12 → 1.0.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/embeddings/local.js +91 -23
package/package.json
CHANGED
package/src/embeddings/local.js
CHANGED
|
@@ -1,53 +1,121 @@
|
|
|
1
1
|
// src/embeddings/local.js
|
|
2
|
-
const crypto = require("crypto");
|
|
3
2
|
|
|
4
3
|
/**
|
|
5
|
-
* LocalEmbedding
|
|
6
|
-
*
|
|
7
|
-
* Each string will produce a consistent vector based on a hash.
|
|
8
|
-
* Note: Not semantic, just a placeholder for testing.
|
|
4
|
+
* LocalEmbedding - REAL semantic embeddings using all-MiniLM-L6-v2
|
|
5
|
+
* Uses dynamic import to work with ESM packages in CommonJS environment
|
|
9
6
|
*/
|
|
10
7
|
class LocalEmbedding {
|
|
11
|
-
constructor(
|
|
12
|
-
this.dim =
|
|
8
|
+
constructor() {
|
|
9
|
+
this.dim = 384;
|
|
10
|
+
this.modelPromise = null;
|
|
11
|
+
this.transformersPromise = null;
|
|
13
12
|
}
|
|
14
13
|
|
|
15
14
|
/**
|
|
16
|
-
*
|
|
15
|
+
* Lazy-load the @xenova/transformers package
|
|
17
16
|
*/
|
|
18
|
-
|
|
19
|
-
if (!
|
|
17
|
+
async getTransformers() {
|
|
18
|
+
if (!this.transformersPromise) {
|
|
19
|
+
this.transformersPromise = import('@xenova/transformers');
|
|
20
|
+
}
|
|
21
|
+
return this.transformersPromise;
|
|
22
|
+
}
|
|
20
23
|
|
|
21
|
-
|
|
22
|
-
|
|
24
|
+
/**
|
|
25
|
+
* Lazy-load the embedding model
|
|
26
|
+
*/
|
|
27
|
+
async getModel() {
|
|
28
|
+
if (!this.modelPromise) {
|
|
29
|
+
const { pipeline, env } = await this.getTransformers();
|
|
30
|
+
|
|
31
|
+
// Configure transformers
|
|
32
|
+
env.allowLocalModels = true;
|
|
33
|
+
env.backends.onnx.warmup = false;
|
|
34
|
+
|
|
35
|
+
console.log('🔄 Loading local embedding model (first run may take 1-2 minutes)...');
|
|
36
|
+
|
|
37
|
+
this.modelPromise = pipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2', {
|
|
38
|
+
revision: 'main',
|
|
39
|
+
cache_dir: './.cache/embeddings'
|
|
40
|
+
}).then(model => {
|
|
41
|
+
console.log('✅ Local embedding model loaded successfully!');
|
|
42
|
+
return model;
|
|
43
|
+
}).catch(error => {
|
|
44
|
+
console.error('❌ Failed to load local embedding model:', error.message);
|
|
45
|
+
throw error;
|
|
46
|
+
});
|
|
47
|
+
}
|
|
48
|
+
return this.modelPromise;
|
|
49
|
+
}
|
|
23
50
|
|
|
24
|
-
|
|
25
|
-
|
|
51
|
+
/**
|
|
52
|
+
* Generate REAL semantic embedding for text
|
|
53
|
+
*/
|
|
54
|
+
async embed(text) {
|
|
55
|
+
if (!text || !text.trim()) {
|
|
56
|
+
return new Array(this.dim).fill(0);
|
|
26
57
|
}
|
|
27
58
|
|
|
28
|
-
|
|
59
|
+
try {
|
|
60
|
+
const model = await this.getModel();
|
|
61
|
+
const output = await model(text, {
|
|
62
|
+
pooling: 'mean',
|
|
63
|
+
normalize: true
|
|
64
|
+
});
|
|
65
|
+
return Array.from(output.data);
|
|
66
|
+
} catch (error) {
|
|
67
|
+
console.error(`❌ Embedding failed for: "${text.substring(0, 50)}..."`);
|
|
68
|
+
return new Array(this.dim).fill(0);
|
|
69
|
+
}
|
|
29
70
|
}
|
|
30
71
|
|
|
31
72
|
/**
|
|
32
73
|
* Batch embedding for multiple strings
|
|
33
74
|
*/
|
|
34
|
-
embedBatch(textArray = []) {
|
|
35
|
-
if (!Array.isArray(textArray))
|
|
36
|
-
|
|
75
|
+
async embedBatch(textArray = []) {
|
|
76
|
+
if (!Array.isArray(textArray)) {
|
|
77
|
+
throw new Error("embedBatch expects an array of strings");
|
|
78
|
+
}
|
|
79
|
+
const embeddings = [];
|
|
80
|
+
for (const text of textArray) {
|
|
81
|
+
const embedding = await this.embed(text);
|
|
82
|
+
embeddings.push(embedding);
|
|
83
|
+
}
|
|
84
|
+
return embeddings;
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
/**
|
|
88
|
+
* Get embedding dimension
|
|
89
|
+
*/
|
|
90
|
+
getDimension() {
|
|
91
|
+
return this.dim;
|
|
37
92
|
}
|
|
38
93
|
}
|
|
39
94
|
|
|
40
95
|
/**
|
|
41
|
-
* Convenience function for
|
|
42
|
-
* Retries local embedding generation (mostly placeholder, but keeps API compatible)
|
|
96
|
+
* Convenience function for compatibility
|
|
43
97
|
*/
|
|
44
|
-
async function createEmbeddingWithRetry(text, options = {}, retries =
|
|
98
|
+
async function createEmbeddingWithRetry(text, options = {}, retries = 2) {
|
|
45
99
|
const embedder = new LocalEmbedding();
|
|
100
|
+
|
|
46
101
|
for (let attempt = 1; attempt <= retries; attempt++) {
|
|
47
102
|
try {
|
|
48
|
-
|
|
103
|
+
const embedding = await embedder.embed(text);
|
|
104
|
+
const isAllZeros = embedding.every(val => val === 0);
|
|
105
|
+
if (isAllZeros && (text || '').trim()) {
|
|
106
|
+
if (attempt === retries) {
|
|
107
|
+
console.warn(`⚠️ Embedding is all zeros for text: "${text.substring(0, 50)}..."`);
|
|
108
|
+
}
|
|
109
|
+
throw new Error('Embedding returned all zeros');
|
|
110
|
+
}
|
|
111
|
+
return embedding;
|
|
49
112
|
} catch (err) {
|
|
50
|
-
if (attempt === retries)
|
|
113
|
+
if (attempt === retries) {
|
|
114
|
+
console.error(`❌ All ${retries} attempts failed for embedding text: "${text.substring(0, 50)}..."`);
|
|
115
|
+
throw err;
|
|
116
|
+
}
|
|
117
|
+
console.warn(`⚠️ Embedding attempt ${attempt} failed, retrying...`);
|
|
118
|
+
await new Promise(resolve => setTimeout(resolve, 100 * attempt));
|
|
51
119
|
}
|
|
52
120
|
}
|
|
53
121
|
}
|