@o-lang/semantic-doc-search 1.0.33 → 1.0.35

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@o-lang/semantic-doc-search",
3
- "version": "1.0.33",
3
+ "version": "1.0.35",
4
4
  "description": "O-lang Semantic Document Search Resolver with hybrid search, embeddings, rerank, and streaming.",
5
5
  "main": "src/index.js",
6
6
  "type": "commonjs",
@@ -7,6 +7,8 @@
7
7
  * - No zero vectors
8
8
  * - Deterministic behavior
9
9
  * - DEFENSIVE against method detaching & invalid vectors
10
+ * - WINDOWS-SAFE (disables SIMD, threads, proxy)
11
+ * - TENSOR-SAFE (handles Float32Array, Array, and all ONNX tensor types)
10
12
  */
11
13
 
12
14
  class LocalEmbedding {
@@ -29,13 +31,21 @@ class LocalEmbedding {
29
31
 
30
32
  if (!this.loading) {
31
33
  this.loading = (async () => {
32
- const { pipeline, env } = await import("@xenova/transformers");
34
+ // ⚠️ CRITICAL: Configure environment BEFORE loading model
35
+ const { env } = await import("@xenova/transformers");
33
36
 
34
- // Safe defaults
37
+ // Safe settings for all platforms (harmless on macOS/Linux, essential on Windows)
38
+ env.backends.onnx.wasm.simd = false; // Avoids AVX/SIMD crashes on older CPUs
39
+ env.backends.onnx.wasm.threads = false; // Prevents threading issues in Node
40
+ env.backends.onnx.wasm.proxy = false; // Avoids proxy complications
35
41
  env.allowLocalModels = true;
36
42
  env.backends.onnx.warmup = false;
43
+ env.cacheDir = "./.cache/embeddings"; // Explicit, project-local cache
37
44
 
38
45
  console.log("🔄 Loading local embedding model (first run only)...");
46
+ console.log("⚙️ Using WASM (SIMD disabled) for cross-platform compatibility");
47
+
48
+ const { pipeline } = await import("@xenova/transformers");
39
49
 
40
50
  const model = await pipeline(
41
51
  "feature-extraction",
@@ -57,9 +67,6 @@ class LocalEmbedding {
57
67
 
58
68
  /* ---------------- PUBLIC API ---------------- */
59
69
 
60
- /**
61
- * Generate embedding for a single string
62
- */
63
70
  async embed(text) {
64
71
  if (typeof text !== "string" || !text.trim()) {
65
72
  throw new Error("Embedding input must be a non-empty string");
@@ -73,11 +80,45 @@ class LocalEmbedding {
73
80
  normalize: true,
74
81
  });
75
82
 
76
- // Defensive: ensure vector is a valid array
77
- const vector = Array.isArray(output?.data) ? Array.from(output.data) : null;
83
+ // 🔍 DEBUG: Inspect output structure
84
+ console.log("🔍 Model output type:", typeof output);
85
+ if (output && typeof output === 'object') {
86
+ console.log("🔍 Output keys:", Object.keys(output));
87
+ console.log("🔍 Output dims:", output.dims);
88
+ console.log("🔍 output.data type:", Object.prototype.toString.call(output.data));
89
+ console.log("🔍 Is TypedArray?", ArrayBuffer.isView(output.data));
90
+ }
78
91
 
92
+ // ✅ UNIVERSAL EXTRACTION: handles Float32Array, Array, and all tensor forms
93
+ let vector = null;
94
+
95
+ if (output && output.data !== undefined) {
96
+ // Handle Float32Array, Uint8Array, etc. (standard in ONNX/WASM)
97
+ if (ArrayBuffer.isView(output.data)) {
98
+ vector = Array.from(output.data);
99
+ }
100
+ // Handle plain JS array (older backends or CPU mode)
101
+ else if (Array.isArray(output.data)) {
102
+ vector = Array.from(output.data);
103
+ }
104
+ }
105
+ // Handle batch output: [tensor]
106
+ else if (Array.isArray(output) && output[0]?.data !== undefined) {
107
+ if (ArrayBuffer.isView(output[0].data)) {
108
+ vector = Array.from(output[0].data);
109
+ } else if (Array.isArray(output[0].data)) {
110
+ vector = Array.from(output[0].data);
111
+ }
112
+ }
113
+ // Fallback: raw array (rare)
114
+ else if (Array.isArray(output)) {
115
+ vector = output;
116
+ }
117
+
118
+ // Final validation
79
119
  if (!Array.isArray(vector) || vector.length !== this.dim) {
80
- console.error("❌ Invalid embedding vector returned:", vector);
120
+ console.error("❌ Invalid embedding vector length:", vector?.length);
121
+ console.error("❌ First few values:", vector?.slice?.(0, 5));
81
122
  throw new Error(`Invalid embedding dimension: ${vector?.length || 0} (expected ${this.dim})`);
82
123
  }
83
124
 
@@ -85,15 +126,12 @@ class LocalEmbedding {
85
126
  } catch (err) {
86
127
  console.error(
87
128
  `❌ Embedding failed for text: "${text.slice(0, 60)}..."`,
88
- err
129
+ err.message
89
130
  );
90
131
  throw err;
91
132
  }
92
133
  }
93
134
 
94
- /**
95
- * Batch embedding (sequential, safe)
96
- */
97
135
  async embedBatch(texts = []) {
98
136
  if (!Array.isArray(texts)) {
99
137
  throw new Error("embedBatch expects an array of strings");
@@ -106,15 +144,10 @@ class LocalEmbedding {
106
144
  return results;
107
145
  }
108
146
 
109
- /**
110
- * Return embedding dimension
111
- */
112
147
  getDimension() {
113
148
  return this.dim;
114
149
  }
115
150
  }
116
151
 
117
- /* ---------------- SINGLETON EXPORT ---------------- */
118
-
119
152
  const embedder = new LocalEmbedding();
120
- module.exports = embedder;
153
+ module.exports = embedder;
package/src/resolver.js CHANGED
@@ -37,6 +37,39 @@ function sanitizeTextForEmbedding(text) {
37
37
  return text.replace(/^["']|["']$/g, "").trim();
38
38
  }
39
39
 
/**
 * Collect the documents to ingest for a resolver call.
 *
 * Inline documents win: when `context.documents` is an array it is returned
 * as-is (same reference). Otherwise, when `context.doc_root` is set, every
 * regular file directly inside that directory whose name ends in `.txt` or
 * `.md` is read as UTF-8. The directory is resolved against `process.cwd()`
 * and is not searched recursively.
 *
 * @param {{documents?: Array<object>, doc_root?: string}|null|undefined} context
 *   Resolver context.
 * @returns {Array<object>} `context.documents`, or one
 *   `{ id, content, source }` record per matching file (sorted by file name),
 *   or an empty array when there is nothing to load.
 * @throws {Error} Propagates filesystem errors other than a missing or
 *   non-directory `doc_root` (for example permission errors).
 */
function loadDocumentsFromContext(context) {
  if (Array.isArray(context?.documents)) {
    return context.documents;
  }

  if (!context?.doc_root) {
    return [];
  }

  const baseDir = path.resolve(process.cwd(), context.doc_root);

  let entries;
  try {
    entries = fs.readdirSync(baseDir);
  } catch (err) {
    // A missing path, or a path that is a file rather than a directory,
    // simply means "no documents". Checking existence first would let the
    // file case through and crash with ENOTDIR.
    if (err.code === "ENOENT" || err.code === "ENOTDIR") {
      return [];
    }
    throw err;
  }

  const documents = [];
  // readdir order is platform-dependent; sort so ingestion is deterministic.
  for (const file of entries.sort()) {
    if (!file.endsWith(".txt") && !file.endsWith(".md")) {
      continue;
    }

    const fullPath = path.join(baseDir, file);
    // statSync follows symlinks, so linked files are still picked up, while
    // directories that merely look like documents ("notes.md/") are skipped
    // instead of failing with EISDIR on read.
    if (!fs.statSync(fullPath).isFile()) {
      continue;
    }

    documents.push({
      id: file,
      content: fs.readFileSync(fullPath, "utf8"),
      source: `file:${file}`
    });
  }

  return documents;
}
62
+
/**
 * Convert an embedding container to a plain JS array.
 *
 * @param {*} input Candidate vector: a plain array or a typed array
 *   (Float32Array, Uint8Array, ...).
 * @returns {Array|null} The same array instance when `input` is already a
 *   plain array, a fresh array copy for typed arrays, and `null` for anything
 *   else (including `null`, `undefined` and `DataView`).
 */
function toPlainArray(input) {
  if (Array.isArray(input)) {
    return input;
  }

  // ArrayBuffer.isView() is also true for DataView, which is neither iterable
  // nor array-like: Array.from() would silently return [] for it. Treat it as
  // an unsupported type instead of an empty vector.
  if (ArrayBuffer.isView(input) && !(input instanceof DataView)) {
    return Array.from(input);
  }

  return null;
}
72
+
40
73
  /**
41
74
  * Semantic Doc Search Resolver
42
75
  */
@@ -56,21 +89,24 @@ async function resolver(action, context = {}) {
56
89
  console.log("🩺 Vector health:", await vectorStore.health());
57
90
  }
58
91
 
59
- // Ingestion guard cache
60
- const cache = loadCache();
61
-
62
92
  // Ensure backend supports search
63
93
  if (!vectorStore.supports("vector.search")) {
64
94
  throw new Error("Vector backend does not support vector.search");
65
95
  }
66
96
 
67
- // --- Document ingestion (optional) ---
68
- if (context.documents && Array.isArray(context.documents)) {
97
+ // Load documents (from context.documents OR doc_root)
98
+ const documents = loadDocumentsFromContext(context);
99
+
100
+ // Ingestion guard cache
101
+ const cache = loadCache();
102
+
103
+ // --- Document ingestion ---
104
+ if (documents.length > 0) {
69
105
  if (!vectorStore.supports("vector.insert")) {
70
106
  throw new Error("Vector backend does not support vector.insert");
71
107
  }
72
108
 
73
- for (const doc of context.documents) {
109
+ for (const doc of documents) {
74
110
  const chunks = doc.chunks || [doc.content];
75
111
  for (let i = 0; i < chunks.length; i++) {
76
112
  const text = sanitizeTextForEmbedding(chunks[i]);
@@ -79,8 +115,18 @@ async function resolver(action, context = {}) {
79
115
  const hash = hashContent(text);
80
116
  if (cache[hash]) continue; // Skip already ingested
81
117
 
82
- const vector = await embedder.embed(text);
83
- if (!vector || vector.every(v => v === 0)) continue;
118
+ const rawVector = await embedder.embed(text);
119
+ if (!rawVector || rawVector.every(v => v === 0)) {
120
+ console.warn("⚠️ Skipping chunk with zero embedding:", text.slice(0, 50));
121
+ continue;
122
+ }
123
+
124
+ // 🔒 Normalize to plain JS array
125
+ const vector = toPlainArray(rawVector);
126
+ if (!vector) {
127
+ console.warn("⚠️ Skipping chunk with invalid embedding type:", typeof rawVector);
128
+ continue;
129
+ }
84
130
 
85
131
  await vectorStore.upsert({
86
132
  id: `${doc.id}:${i}`,
@@ -96,12 +142,18 @@ async function resolver(action, context = {}) {
96
142
  }
97
143
 
98
144
  // Embed query & search
99
- const queryVector = await embedder.embed(query);
100
- if (!queryVector || queryVector.every(v => v === 0)) {
145
+ const rawQueryVector = await embedder.embed(query);
146
+ if (!rawQueryVector || rawQueryVector.every(v => v === 0)) {
101
147
  console.warn("⚠️ Query embedding invalid");
102
148
  return { text: "(Query could not be embedded)", meta: { matches: 0 } };
103
149
  }
104
150
 
151
+ // 🔒 Normalize query vector too (for consistency)
152
+ const queryVector = toPlainArray(rawQueryVector);
153
+ if (!queryVector) {
154
+ return { text: "(Query vector invalid)", meta: { matches: 0 } };
155
+ }
156
+
105
157
  // Top-K + similarity threshold
106
158
  const results = await vectorStore.query({
107
159
  vector: queryVector,
@@ -112,4 +164,4 @@ async function resolver(action, context = {}) {
112
164
  return formatResults(results, query);
113
165
  }
114
166
 
115
- module.exports = resolver;
167
+ module.exports = resolver;
package/test-embed.js ADDED
@@ -0,0 +1,10 @@
1
+ // test-embed.js
2
+ const embedder = require("./src/embeddings/local");
3
+
/**
 * Manual smoke check for the local embedder: logs the configured dimension,
 * embeds a fixed phrase, and logs the resulting vector with its length.
 *
 * @returns {Promise<void>} Rejects if the embedder throws.
 */
async function test() {
  const dimension = embedder.getDimension();
  console.log("Model dimension:", dimension);

  const embedding = await embedder.embed("hello world");
  const embeddingLength = embedding?.length;
  console.log("Embedding result:", embeddingLength, embedding);
}
9
+
10
+ test().catch(console.error);