@o-lang/semantic-doc-search 1.0.23 → 1.0.25

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,123 +1,114 @@
1
- // src/embeddings/local.js
2
-
3
1
  /**
4
- * LocalEmbedding - REAL semantic embeddings using all-MiniLM-L6-v2
5
- * Uses dynamic import to work with ESM packages in CommonJS environment
2
+ * LocalEmbedding
3
+ * ----------------
4
+ * Real semantic embeddings using all-MiniLM-L6-v2
5
+ * - Singleton model load
6
+ * - No silent failures
7
+ * - No zero vectors
8
+ * - Deterministic behavior
6
9
  */
10
+
7
11
  class LocalEmbedding {
8
12
  constructor() {
9
13
  this.dim = 384;
10
- this.modelPromise = null;
11
- this.transformersPromise = null;
14
+ this.model = null;
15
+ this.loading = null;
12
16
  }
13
17
 
14
- /**
15
- * Lazy-load the @xenova/transformers package
16
- */
17
- async getTransformers() {
18
- if (!this.transformersPromise) {
19
- this.transformersPromise = import('@xenova/transformers');
20
- }
21
- return this.transformersPromise;
22
- }
18
+ /* ---------------- INTERNAL ---------------- */
23
19
 
24
- /**
25
- * Lazy-load the embedding model
26
- */
27
- async getModel() {
28
- if (!this.modelPromise) {
29
- const { pipeline, env } = await this.getTransformers();
30
-
31
- // Configure transformers
32
- env.allowLocalModels = true;
33
- env.backends.onnx.warmup = false;
34
-
35
- console.log('🔄 Loading local embedding model (first run may take 1-2 minutes)...');
36
-
37
- this.modelPromise = pipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2', {
38
- revision: 'main',
39
- cache_dir: './.cache/embeddings'
40
- }).then(model => {
41
- console.log('✅ Local embedding model loaded successfully!');
20
+ async loadModel() {
21
+ if (this.model) return this.model;
22
+
23
+ if (!this.loading) {
24
+ this.loading = (async () => {
25
+ const { pipeline, env } = await import("@xenova/transformers");
26
+
27
+ // Safe defaults
28
+ env.allowLocalModels = true;
29
+ env.backends.onnx.warmup = false;
30
+
31
+ console.log("🔄 Loading local embedding model (first run only)...");
32
+
33
+ const model = await pipeline(
34
+ "feature-extraction",
35
+ "Xenova/all-MiniLM-L6-v2",
36
+ {
37
+ revision: "main",
38
+ cache_dir: "./.cache/embeddings",
39
+ }
40
+ );
41
+
42
+ console.log("✅ Local embedding model ready");
42
43
  return model;
43
- }).catch(error => {
44
- console.error('❌ Failed to load local embedding model:', error.message);
45
- throw error;
46
- });
44
+ })();
47
45
  }
48
- return this.modelPromise;
46
+
47
+ this.model = await this.loading;
48
+ return this.model;
49
49
  }
50
50
 
51
+ /* ---------------- PUBLIC API ---------------- */
52
+
51
53
  /**
52
- * Generate REAL semantic embedding for text
54
+ * Generate embedding for a single string
53
55
  */
54
56
  async embed(text) {
55
- if (!text || !text.trim()) {
56
- return new Array(this.dim).fill(0);
57
+ if (typeof text !== "string" || !text.trim()) {
58
+ throw new Error("Embedding input must be a non-empty string");
57
59
  }
58
60
 
61
+ const model = await this.loadModel();
62
+
59
63
  try {
60
- const model = await this.getModel();
61
- const output = await model(text, {
62
- pooling: 'mean',
63
- normalize: true
64
+ const output = await model(text, {
65
+ pooling: "mean",
66
+ normalize: true,
64
67
  });
65
- return Array.from(output.data);
66
- } catch (error) {
67
- console.error(`❌ Embedding failed for: "${text.substring(0, 50)}..."`);
68
- return new Array(this.dim).fill(0);
68
+
69
+ const vector = Array.from(output.data);
70
+
71
+ if (vector.length !== this.dim) {
72
+ throw new Error(
73
+ `Invalid embedding dimension: ${vector.length} (expected ${this.dim})`
74
+ );
75
+ }
76
+
77
+ return vector;
78
+ } catch (err) {
79
+ console.error(
80
+ `❌ Embedding failed for text: "${text.slice(0, 60)}..."`
81
+ );
82
+ throw err;
69
83
  }
70
84
  }
71
85
 
72
86
  /**
73
- * Batch embedding for multiple strings
87
+ * Batch embedding (sequential, safe)
74
88
  */
75
- async embedBatch(textArray = []) {
76
- if (!Array.isArray(textArray)) {
89
+ async embedBatch(texts = []) {
90
+ if (!Array.isArray(texts)) {
77
91
  throw new Error("embedBatch expects an array of strings");
78
92
  }
79
- const embeddings = [];
80
- for (const text of textArray) {
81
- const embedding = await this.embed(text);
82
- embeddings.push(embedding);
93
+
94
+ const results = [];
95
+ for (const text of texts) {
96
+ results.push(await this.embed(text));
83
97
  }
84
- return embeddings;
98
+ return results;
85
99
  }
86
100
 
87
101
  /**
88
- * Get embedding dimension
102
+ * Return embedding dimension
89
103
  */
90
104
  getDimension() {
91
105
  return this.dim;
92
106
  }
93
107
  }
94
108
 
95
- /**
96
- * Convenience function for compatibility
97
- */
98
- async function createEmbeddingWithRetry(text, options = {}, retries = 2) {
99
- const embedder = new LocalEmbedding();
100
-
101
- for (let attempt = 1; attempt <= retries; attempt++) {
102
- try {
103
- const embedding = await embedder.embed(text);
104
- const isAllZeros = embedding.every(val => val === 0);
105
- if (isAllZeros && (text || '').trim()) {
106
- if (attempt === retries) {
107
- console.warn(`⚠️ Embedding is all zeros for text: "${text.substring(0, 50)}..."`);
108
- }
109
- throw new Error('Embedding returned all zeros');
110
- }
111
- return embedding;
112
- } catch (err) {
113
- if (attempt === retries) {
114
- console.error(`❌ All ${retries} attempts failed for embedding text: "${text.substring(0, 50)}..."`);
115
- throw err;
116
- }
117
- console.warn(`⚠️ Embedding attempt ${attempt} failed, retrying...`);
118
- await new Promise(resolve => setTimeout(resolve, 100 * attempt));
119
- }
120
- }
121
- }
109
+ /* ---------------- SINGLETON EXPORT ---------------- */
110
+
111
+ // One embedder per process (CRITICAL)
112
+ const embedder = new LocalEmbedding();
122
113
 
123
- module.exports = { LocalEmbedding, createEmbeddingWithRetry };
114
+ module.exports = embedder;
package/src/index.js CHANGED
@@ -1,6 +1,6 @@
1
1
  const fs = require("fs");
2
2
  const path = require("path");
3
- const { LocalEmbedding } = require("./embeddings/local.js");
3
+ const embedder = require("./embeddings/local.js"); // ✅ singleton embedder
4
4
  const { chunkText } = require("./utils/chunker.js");
5
5
  const { extractKeywords } = require("./utils/extractText.js");
6
6
  const { highlightMatches } = require("./utils/highlight.js");
@@ -59,8 +59,7 @@ class DatabaseAdapter {
59
59
 
60
60
  async initMongo(context) {
61
61
  const { MongoClient } = require("mongodb");
62
- const uri = context.MONGO_URI;
63
- this.mongo = new MongoClient(uri);
62
+ this.mongo = new MongoClient(context.MONGO_URI);
64
63
  await this.mongo.connect();
65
64
  }
66
65
 
@@ -113,8 +112,8 @@ class DatabaseAdapter {
113
112
 
114
113
  async function loadAllDocuments(context) {
115
114
  const docs = [];
116
-
117
115
  const db = new DatabaseAdapter();
116
+
118
117
  try {
119
118
  await db.initialize(context);
120
119
  docs.push(...(await db.queryDocuments(context)));
@@ -125,9 +124,9 @@ async function loadAllDocuments(context) {
125
124
  : path.join(process.cwd(), "docs");
126
125
 
127
126
  if (fs.existsSync(baseDir)) {
128
- const files = fs.readdirSync(baseDir).filter(f =>
129
- f.endsWith(".txt") || f.endsWith(".md")
130
- );
127
+ const files = fs
128
+ .readdirSync(baseDir)
129
+ .filter(f => f.endsWith(".txt") || f.endsWith(".md"));
131
130
 
132
131
  for (const file of files) {
133
132
  docs.push({
@@ -145,56 +144,86 @@ async function loadAllDocuments(context) {
145
144
 
146
145
  async function performHybridDocQA(query, context) {
147
146
  const cache = loadCache();
148
- const embedder = new LocalEmbedding({ dimension: 384 });
149
147
 
150
- const store = VectorRouter.create({
148
+ const MIN_SCORE = context.minScore ?? 0.75;
149
+ const topK = context.topK ?? 5;
150
+
151
+ const vectorStore = VectorRouter.create({
151
152
  backend: context.vectorBackend || "memory",
152
- dimension: 384,
153
+ dimension: embedder.getDimension(),
153
154
  ...context,
154
155
  });
155
156
 
157
+ console.log(
158
+ "🧠 Vector store methods:",
159
+ Object.getOwnPropertyNames(Object.getPrototypeOf(vectorStore))
160
+ );
161
+
156
162
  const documents = await loadAllDocuments(context);
157
- if (!documents.length) return { text: "", meta: {} };
163
+ console.log("📄 Documents loaded:", documents.length);
164
+
165
+ if (!documents.length) return { text: "(No documents found)", meta: { matches: 0 } };
158
166
 
167
+ // Multi-document ingestion
159
168
  for (const doc of documents) {
160
- if (!cache[doc.id]) {
161
- cache[doc.id] = true;
162
- const chunks = chunkText(doc.content, 500);
163
-
164
- for (let i = 0; i < chunks.length; i++) {
165
- const vector = await embedder.embed(chunks[i]);
166
- await store.upsert({
167
- id: `${doc.id}:${i}`,
168
- vector,
169
- content: chunks[i],
170
- source: doc.source,
171
- });
169
+ const chunks = chunkText(doc.content, 500);
170
+ console.log(`📦 ${doc.id} split into ${chunks.length} chunks`);
171
+
172
+ for (let i = 0; i < chunks.length; i++) {
173
+ console.log("🧩 Chunk to embed:", chunks[i]?.substring(0, 100));
174
+
175
+ const vector = await embedder.embed(chunks[i]);
176
+ if (!vector || vector.every(v => v === 0)) {
177
+ console.warn("⚠️ Zero or invalid embedding, skipping chunk");
178
+ continue;
172
179
  }
180
+
181
+ await vectorStore.upsert({
182
+ id: `${doc.id}:${i}`,
183
+ vector,
184
+ content: chunks[i],
185
+ source: doc.source,
186
+ });
187
+
188
+ console.log(`✅ Upserted ${doc.id}:${i}`);
173
189
  }
174
190
  }
175
191
 
176
192
  saveCache(cache);
177
193
 
194
+ // Embed the query
178
195
  const queryVector = await embedder.embed(query);
179
- const results = await store.search({
180
- embedding: queryVector,
181
- topK: 5,
182
- });
196
+ if (!queryVector || queryVector.every(v => v === 0)) {
197
+ console.warn("⚠️ Query embedding invalid");
198
+ return { text: "(Query could not be embedded)", meta: { matches: 0 } };
199
+ }
200
+
201
+ // Top-K + similarity threshold
202
+ const results = await vectorStore.query(queryVector, { topK });
203
+ const filtered = results.filter(r => r.score >= MIN_SCORE);
204
+
205
+ console.log(`🔍 Search results: ${filtered.length} (after applying minScore=${MIN_SCORE})`);
206
+
207
+ if (!filtered.length) {
208
+ return { text: "(No relevant match found)", meta: { matches: 0 } };
209
+ }
183
210
 
184
211
  return {
185
212
  text: highlightMatches(
186
- results.map(r => r.content).join("\n\n"),
213
+ filtered.map(r => r.content).join("\n\n"),
187
214
  extractKeywords(query)
188
215
  ),
189
- meta: { matches: results.length },
216
+ meta: { matches: filtered.length },
190
217
  };
191
218
  }
192
219
 
220
+
193
221
  /* ---------------- PGVECTOR SEARCH ---------------- */
194
222
 
195
223
  async function performPgVectorSearch(query, context) {
196
- const adapter = new PgVectorAdapter({ POSTGRES_URL: context.POSTGRES_URL });
197
- const embedder = new LocalEmbedding({ dimension: 384 });
224
+ const adapter = new PgVectorAdapter({
225
+ POSTGRES_URL: context.POSTGRES_URL,
226
+ });
198
227
 
199
228
  const vector = await embedder.embed(query);
200
229
  const results = await adapter.search(vector, 5);
@@ -220,14 +249,36 @@ async function performDocQA(query, context) {
220
249
  async function docSearchResolver(action, context) {
221
250
  if (!action.startsWith("Ask doc-search")) return;
222
251
 
252
+ // Extract the query string
223
253
  const match = action.match(/"(.*)"|'(.*)'/);
224
- const query = match
225
- ? match[1] || match[2]
226
- : action.replace("Ask doc-search", "").trim();
254
+ const query = match ? match[1] || match[2] : action.replace("Ask doc-search", "").trim();
255
+
256
+ // Internal settings (hidden from workflow)
257
+ const INTERNAL_TOPK = 5; // how many nearest chunks to fetch
258
+ const INTERNAL_MINSCORE = 0.75; // similarity threshold
259
+
260
+ // Perform the search
261
+ const results = await performDocQA(query, { ...context, topK: INTERNAL_TOPK, minScore: INTERNAL_MINSCORE });
262
+
263
+ // If no results meet the threshold, fallback to the best match
264
+ let finalResults;
265
+ if (!results || results.meta.matches === 0) {
266
+ const fallback = await performDocQA(query, { ...context, topK: 1, minScore: 0 });
267
+ finalResults = fallback;
268
+ } else {
269
+ finalResults = results;
270
+ }
227
271
 
228
- return performDocQA(query, context);
272
+ // Ensure LLM always receives text
273
+ if (!finalResults || !finalResults.text || finalResults.text.trim() === "") {
274
+ return {
275
+ text: `No relevant information found for "${query}".`,
276
+ meta: { matches: 0 }
277
+ };
278
+ }
279
+
280
+ return finalResults;
229
281
  }
230
282
 
231
283
  docSearchResolver.resolverName = "doc-search";
232
284
  module.exports = docSearchResolver;
233
-
@@ -0,0 +1,13 @@
1
+ const docSearch = require("./index");
2
+
3
+ (async () => {
4
+ const result = await docSearch(
5
+ 'Ask doc-search "vacation policy"',
6
+ {
7
+ doc_root: "./docs",
8
+ vectorBackend: "memory"
9
+ }
10
+ );
11
+
12
+ console.log(result);
13
+ })();
@@ -0,0 +1,36 @@
1
+ // test-doc-search-batch.js
2
+ const docSearchResolver = require("./src/index.js");
3
+
4
+ (async () => {
5
+ try {
6
+ const context = {
7
+ doc_root: "./docs", // folder with .txt or .md files
8
+ vectorBackend: "memory", // can also switch to "pgvector" if configured
9
+ };
10
+
11
+ const queries = [
12
+ "Semantic search",
13
+ "Vacation policy",
14
+ "Employee onboarding",
15
+ "Leave requests",
16
+ "HR compliance"
17
+ ];
18
+
19
+ console.log("🔎 Running batch doc-search...");
20
+
21
+ for (const query of queries) {
22
+ const action = `Ask doc-search "${query}"`;
23
+ const result = await docSearchResolver(action, context);
24
+
25
+ console.log("\n====================================");
26
+ console.log(`Query: "${query}"`);
27
+ console.log("Text:\n", result.text || "(No matches found)");
28
+ console.log("Meta:", result.meta);
29
+ console.log("====================================");
30
+ }
31
+
32
+ console.log("\n✅ Batch search complete!");
33
+ } catch (err) {
34
+ console.error("❌ Batch doc-search test failed:", err);
35
+ }
36
+ })();
@@ -0,0 +1,22 @@
1
+ // test-doc-search.js
2
+ const docSearchResolver = require("./src/index.js");
3
+
4
+ (async () => {
5
+ try {
6
+ const context = {
7
+ doc_root: "./docs", // folder containing .txt or .md files
8
+ vectorBackend: "memory", // can also use "pgvector" if configured
9
+ };
10
+
11
+ const action = 'Ask doc-search "Semantic search"'; // Example query
12
+ console.log("🔎 Running doc-search...");
13
+
14
+ const result = await docSearchResolver(action, context);
15
+
16
+ console.log("✅ Search Result:");
17
+ console.log("Text:\n", result.text);
18
+ console.log("Meta:", result.meta);
19
+ } catch (err) {
20
+ console.error("❌ doc-search test failed:", err);
21
+ }
22
+ })();
@@ -0,0 +1,32 @@
1
+ // test-single-doc.js
2
+ const path = require("path");
3
+ const { LocalEmbedding } = require("./embeddings/local.js");
4
+ const { chunkText } = require("./utils/chunker.js");
5
+ const VectorRouter = require("./adapters/vectorRouter");
6
+
7
+ (async () => {
8
+ const embedder = new LocalEmbedding();
9
+ const docPath = path.join(process.cwd(), "docs", "sample1.txt");
10
+ const fs = require("fs");
11
+ const content = fs.readFileSync(docPath, "utf8");
12
+
13
+ const chunks = chunkText(content, 500);
14
+ console.log(`Document split into ${chunks.length} chunk(s)`);
15
+
16
+ const vectorStore = VectorRouter.create({ backend: "memory", dimension: embedder.getDimension() });
17
+
18
+ for (let i = 0; i < chunks.length; i++) {
19
+ const vector = await embedder.embed(chunks[i]);
20
+ console.log(`Chunk ${i} embedding first 5 dims:`, vector.slice(0, 5));
21
+
22
+ await vectorStore.upsert({ id: `sample1:${i}`, vector, content: chunks[i], source: "file:sample1.txt" });
23
+ }
24
+
25
+ const query = "Semantic search";
26
+ const queryVector = await embedder.embed(query);
27
+
28
+ const results = await vectorStore.query(queryVector, { topK: 5 });
29
+ results.forEach((r, idx) => {
30
+ console.log(`Result ${idx}: score=${r.score.toFixed(3)} content=${r.content.substring(0, 50)}...`);
31
+ });
32
+ })();