@o-lang/semantic-doc-search 1.0.33 → 1.0.35
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/embeddings/local.js +51 -18
- package/src/resolver.js +63 -11
- package/test-embed.js +10 -0
package/package.json
CHANGED
package/src/embeddings/local.js
CHANGED
|
@@ -7,6 +7,8 @@
|
|
|
7
7
|
* - No zero vectors
|
|
8
8
|
* - Deterministic behavior
|
|
9
9
|
* - DEFENSIVE against method detaching & invalid vectors
|
|
10
|
+
* - WINDOWS-SAFE (disables SIMD, threads, proxy)
|
|
11
|
+
* - TENSOR-SAFE (handles Float32Array, Array, and all ONNX tensor types)
|
|
10
12
|
*/
|
|
11
13
|
|
|
12
14
|
class LocalEmbedding {
|
|
@@ -29,13 +31,21 @@ class LocalEmbedding {
|
|
|
29
31
|
|
|
30
32
|
if (!this.loading) {
|
|
31
33
|
this.loading = (async () => {
|
|
32
|
-
|
|
34
|
+
// ⚠️ CRITICAL: Configure environment BEFORE loading model
|
|
35
|
+
const { env } = await import("@xenova/transformers");
|
|
33
36
|
|
|
34
|
-
// Safe
|
|
37
|
+
// Safe settings for all platforms (harmless on macOS/Linux, essential on Windows)
|
|
38
|
+
env.backends.onnx.wasm.simd = false; // Avoids AVX/SIMD crashes on older CPUs
|
|
39
|
+
env.backends.onnx.wasm.threads = false; // Prevents threading issues in Node
|
|
40
|
+
env.backends.onnx.wasm.proxy = false; // Avoids proxy complications
|
|
35
41
|
env.allowLocalModels = true;
|
|
36
42
|
env.backends.onnx.warmup = false;
|
|
43
|
+
env.cacheDir = "./.cache/embeddings"; // Explicit, project-local cache
|
|
37
44
|
|
|
38
45
|
console.log("🔄 Loading local embedding model (first run only)...");
|
|
46
|
+
console.log("⚙️ Using WASM (SIMD disabled) for cross-platform compatibility");
|
|
47
|
+
|
|
48
|
+
const { pipeline } = await import("@xenova/transformers");
|
|
39
49
|
|
|
40
50
|
const model = await pipeline(
|
|
41
51
|
"feature-extraction",
|
|
@@ -57,9 +67,6 @@ class LocalEmbedding {
|
|
|
57
67
|
|
|
58
68
|
/* ---------------- PUBLIC API ---------------- */
|
|
59
69
|
|
|
60
|
-
/**
|
|
61
|
-
* Generate embedding for a single string
|
|
62
|
-
*/
|
|
63
70
|
async embed(text) {
|
|
64
71
|
if (typeof text !== "string" || !text.trim()) {
|
|
65
72
|
throw new Error("Embedding input must be a non-empty string");
|
|
@@ -73,11 +80,45 @@ class LocalEmbedding {
|
|
|
73
80
|
normalize: true,
|
|
74
81
|
});
|
|
75
82
|
|
|
76
|
-
//
|
|
77
|
-
|
|
83
|
+
// 🔍 DEBUG: Inspect output structure
|
|
84
|
+
console.log("🔍 Model output type:", typeof output);
|
|
85
|
+
if (output && typeof output === 'object') {
|
|
86
|
+
console.log("🔍 Output keys:", Object.keys(output));
|
|
87
|
+
console.log("🔍 Output dims:", output.dims);
|
|
88
|
+
console.log("🔍 output.data type:", Object.prototype.toString.call(output.data));
|
|
89
|
+
console.log("🔍 Is TypedArray?", ArrayBuffer.isView(output.data));
|
|
90
|
+
}
|
|
78
91
|
|
|
92
|
+
// ✅ UNIVERSAL EXTRACTION: handles Float32Array, Array, and all tensor forms
|
|
93
|
+
let vector = null;
|
|
94
|
+
|
|
95
|
+
if (output && output.data !== undefined) {
|
|
96
|
+
// Handle Float32Array, Uint8Array, etc. (standard in ONNX/WASM)
|
|
97
|
+
if (ArrayBuffer.isView(output.data)) {
|
|
98
|
+
vector = Array.from(output.data);
|
|
99
|
+
}
|
|
100
|
+
// Handle plain JS array (older backends or CPU mode)
|
|
101
|
+
else if (Array.isArray(output.data)) {
|
|
102
|
+
vector = Array.from(output.data);
|
|
103
|
+
}
|
|
104
|
+
}
|
|
105
|
+
// Handle batch output: [tensor]
|
|
106
|
+
else if (Array.isArray(output) && output[0]?.data !== undefined) {
|
|
107
|
+
if (ArrayBuffer.isView(output[0].data)) {
|
|
108
|
+
vector = Array.from(output[0].data);
|
|
109
|
+
} else if (Array.isArray(output[0].data)) {
|
|
110
|
+
vector = Array.from(output[0].data);
|
|
111
|
+
}
|
|
112
|
+
}
|
|
113
|
+
// Fallback: raw array (rare)
|
|
114
|
+
else if (Array.isArray(output)) {
|
|
115
|
+
vector = output;
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
// Final validation
|
|
79
119
|
if (!Array.isArray(vector) || vector.length !== this.dim) {
|
|
80
|
-
console.error("❌ Invalid embedding vector
|
|
120
|
+
console.error("❌ Invalid embedding vector length:", vector?.length);
|
|
121
|
+
console.error("❌ First few values:", vector?.slice?.(0, 5));
|
|
81
122
|
throw new Error(`Invalid embedding dimension: ${vector?.length || 0} (expected ${this.dim})`);
|
|
82
123
|
}
|
|
83
124
|
|
|
@@ -85,15 +126,12 @@ class LocalEmbedding {
|
|
|
85
126
|
} catch (err) {
|
|
86
127
|
console.error(
|
|
87
128
|
`❌ Embedding failed for text: "${text.slice(0, 60)}..."`,
|
|
88
|
-
err
|
|
129
|
+
err.message
|
|
89
130
|
);
|
|
90
131
|
throw err;
|
|
91
132
|
}
|
|
92
133
|
}
|
|
93
134
|
|
|
94
|
-
/**
|
|
95
|
-
* Batch embedding (sequential, safe)
|
|
96
|
-
*/
|
|
97
135
|
async embedBatch(texts = []) {
|
|
98
136
|
if (!Array.isArray(texts)) {
|
|
99
137
|
throw new Error("embedBatch expects an array of strings");
|
|
@@ -106,15 +144,10 @@ class LocalEmbedding {
|
|
|
106
144
|
return results;
|
|
107
145
|
}
|
|
108
146
|
|
|
109
|
-
/**
|
|
110
|
-
* Return embedding dimension
|
|
111
|
-
*/
|
|
112
147
|
getDimension() {
|
|
113
148
|
return this.dim;
|
|
114
149
|
}
|
|
115
150
|
}
|
|
116
151
|
|
|
117
|
-
/* ---------------- SINGLETON EXPORT ---------------- */
|
|
118
|
-
|
|
119
152
|
const embedder = new LocalEmbedding();
|
|
120
|
-
module.exports = embedder;
|
|
153
|
+
module.exports = embedder;
|
package/src/resolver.js
CHANGED
|
@@ -37,6 +37,39 @@ function sanitizeTextForEmbedding(text) {
|
|
|
37
37
|
return text.replace(/^["']|["']$/g, "").trim();
|
|
38
38
|
}
|
|
39
39
|
|
|
40
|
+
/**
 * Resolve the documents to ingest for a resolver call.
 *
 * Precedence:
 *   1. `context.documents`, returned as-is when it is an array.
 *   2. Every regular `.txt` / `.md` file directly inside `context.doc_root`
 *      (resolved against the current working directory; not recursive).
 *   3. An empty array when neither source is usable.
 *
 * @param {object} context - Resolver context; may carry `documents` and/or `doc_root`.
 * @returns {Array<object>} The caller-supplied documents, or one
 *   `{ id, content, source }` record per file found under `doc_root`.
 */
function loadDocumentsFromContext(context) {
  if (context.documents && Array.isArray(context.documents)) {
    return context.documents;
  }

  if (!context.doc_root) {
    return [];
  }

  const baseDir = path.resolve(process.cwd(), context.doc_root);

  // existsSync() is also true for a regular file; readdirSync() would then
  // throw ENOTDIR, so require an actual directory.
  if (!fs.existsSync(baseDir) || !fs.statSync(baseDir).isDirectory()) {
    return [];
  }

  return fs
    .readdirSync(baseDir)
    .filter((file) => file.endsWith('.txt') || file.endsWith('.md'))
    // A sub-directory named e.g. "archive.md" passes the extension filter but
    // would make readFileSync() throw EISDIR. statSync follows symlinks, so
    // links that point at regular files are still loaded.
    .filter((file) => fs.statSync(path.join(baseDir, file)).isFile())
    .map((file) => ({
      id: file,
      content: fs.readFileSync(path.join(baseDir, file), 'utf8'),
      source: `file:${file}`
    }));
}
|
|
62
|
+
|
|
63
|
+
/**
 * Convert an embedding-like value into a plain JS array.
 *
 * @param {*} input - Candidate vector: a plain array, a typed array, or anything else.
 * @returns {Array|null} `input` itself when it is already a plain array, a new
 *   plain array copied from a typed array (Float32Array, etc.), or `null` for
 *   every other value (falsy input, strings, objects, DataView).
 */
function toPlainArray(input) {
  if (!input) return null;
  if (Array.isArray(input)) return input;

  // ArrayBuffer.isView() is also true for DataView, which has no indexed
  // elements: Array.from() would silently turn it into [] instead of
  // reporting the value as unusable.
  if (ArrayBuffer.isView(input) && !(input instanceof DataView)) {
    return Array.from(input);
  }

  return null;
}
|
|
72
|
+
|
|
40
73
|
/**
|
|
41
74
|
* Semantic Doc Search Resolver
|
|
42
75
|
*/
|
|
@@ -56,21 +89,24 @@ async function resolver(action, context = {}) {
|
|
|
56
89
|
console.log("🩺 Vector health:", await vectorStore.health());
|
|
57
90
|
}
|
|
58
91
|
|
|
59
|
-
// Ingestion guard cache
|
|
60
|
-
const cache = loadCache();
|
|
61
|
-
|
|
62
92
|
// Ensure backend supports search
|
|
63
93
|
if (!vectorStore.supports("vector.search")) {
|
|
64
94
|
throw new Error("Vector backend does not support vector.search");
|
|
65
95
|
}
|
|
66
96
|
|
|
67
|
-
//
|
|
68
|
-
|
|
97
|
+
// Load documents (from context.documents OR doc_root)
|
|
98
|
+
const documents = loadDocumentsFromContext(context);
|
|
99
|
+
|
|
100
|
+
// Ingestion guard cache
|
|
101
|
+
const cache = loadCache();
|
|
102
|
+
|
|
103
|
+
// --- Document ingestion ---
|
|
104
|
+
if (documents.length > 0) {
|
|
69
105
|
if (!vectorStore.supports("vector.insert")) {
|
|
70
106
|
throw new Error("Vector backend does not support vector.insert");
|
|
71
107
|
}
|
|
72
108
|
|
|
73
|
-
for (const doc of
|
|
109
|
+
for (const doc of documents) {
|
|
74
110
|
const chunks = doc.chunks || [doc.content];
|
|
75
111
|
for (let i = 0; i < chunks.length; i++) {
|
|
76
112
|
const text = sanitizeTextForEmbedding(chunks[i]);
|
|
@@ -79,8 +115,18 @@ async function resolver(action, context = {}) {
|
|
|
79
115
|
const hash = hashContent(text);
|
|
80
116
|
if (cache[hash]) continue; // Skip already ingested
|
|
81
117
|
|
|
82
|
-
const
|
|
83
|
-
if (!
|
|
118
|
+
const rawVector = await embedder.embed(text);
|
|
119
|
+
if (!rawVector || rawVector.every(v => v === 0)) {
|
|
120
|
+
console.warn("⚠️ Skipping chunk with zero embedding:", text.slice(0, 50));
|
|
121
|
+
continue;
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
// 🔒 Normalize to plain JS array
|
|
125
|
+
const vector = toPlainArray(rawVector);
|
|
126
|
+
if (!vector) {
|
|
127
|
+
console.warn("⚠️ Skipping chunk with invalid embedding type:", typeof rawVector);
|
|
128
|
+
continue;
|
|
129
|
+
}
|
|
84
130
|
|
|
85
131
|
await vectorStore.upsert({
|
|
86
132
|
id: `${doc.id}:${i}`,
|
|
@@ -96,12 +142,18 @@ async function resolver(action, context = {}) {
|
|
|
96
142
|
}
|
|
97
143
|
|
|
98
144
|
// Embed query & search
|
|
99
|
-
const
|
|
100
|
-
if (!
|
|
145
|
+
const rawQueryVector = await embedder.embed(query);
|
|
146
|
+
if (!rawQueryVector || rawQueryVector.every(v => v === 0)) {
|
|
101
147
|
console.warn("⚠️ Query embedding invalid");
|
|
102
148
|
return { text: "(Query could not be embedded)", meta: { matches: 0 } };
|
|
103
149
|
}
|
|
104
150
|
|
|
151
|
+
// 🔒 Normalize query vector too (for consistency)
|
|
152
|
+
const queryVector = toPlainArray(rawQueryVector);
|
|
153
|
+
if (!queryVector) {
|
|
154
|
+
return { text: "(Query vector invalid)", meta: { matches: 0 } };
|
|
155
|
+
}
|
|
156
|
+
|
|
105
157
|
// Top-K + similarity threshold
|
|
106
158
|
const results = await vectorStore.query({
|
|
107
159
|
vector: queryVector,
|
|
@@ -112,4 +164,4 @@ async function resolver(action, context = {}) {
|
|
|
112
164
|
return formatResults(results, query);
|
|
113
165
|
}
|
|
114
166
|
|
|
115
|
-
module.exports = resolver;
|
|
167
|
+
module.exports = resolver;
|
package/test-embed.js
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
// test-embed.js
// Manual smoke test for the local embedder: prints the reported dimension,
// embeds one short string, and prints the resulting vector and its length.
const embedder = require("./src/embeddings/local");

/**
 * Run a single embedding round-trip and log the outcome.
 *
 * @returns {Promise<void>} Resolves after the result is logged; rejects with
 *   whatever `embedder.embed` throws.
 */
async function test() {
  console.log("Model dimension:", embedder.getDimension());
  const vector = await embedder.embed("hello world");
  console.log("Embedding result:", vector?.length, vector);
}

test().catch((err) => {
  console.error(err);
  // Report the failure to the shell/CI; logging alone leaves the exit code at 0.
  process.exitCode = 1;
});
|