@o-lang/semantic-doc-search 1.0.22 → 1.0.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/embeddings/local.js CHANGED
@@ -1,123 +1,114 @@
-// src/embeddings/local.js
-
 /**
- * LocalEmbedding - REAL semantic embeddings using all-MiniLM-L6-v2
- * Uses dynamic import to work with ESM packages in CommonJS environment
+ * LocalEmbedding
+ * ----------------
+ * Real semantic embeddings using all-MiniLM-L6-v2
+ * - Singleton model load
+ * - No silent failures
+ * - No zero vectors
+ * - Deterministic behavior
  */
+
 class LocalEmbedding {
   constructor() {
     this.dim = 384;
-    this.modelPromise = null;
-    this.transformersPromise = null;
+    this.model = null;
+    this.loading = null;
   }
 
-  /**
-   * Lazy-load the @xenova/transformers package
-   */
-  async getTransformers() {
-    if (!this.transformersPromise) {
-      this.transformersPromise = import('@xenova/transformers');
-    }
-    return this.transformersPromise;
-  }
+  /* ---------------- INTERNAL ---------------- */
 
-  /**
-   * Lazy-load the embedding model
-   */
-  async getModel() {
-    if (!this.modelPromise) {
-      const { pipeline, env } = await this.getTransformers();
-
-      // Configure transformers
-      env.allowLocalModels = true;
-      env.backends.onnx.warmup = false;
-
-      console.log('šŸ”„ Loading local embedding model (first run may take 1-2 minutes)...');
-
-      this.modelPromise = pipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2', {
-        revision: 'main',
-        cache_dir: './.cache/embeddings'
-      }).then(model => {
-        console.log('āœ… Local embedding model loaded successfully!');
+  async loadModel() {
+    if (this.model) return this.model;
+
+    if (!this.loading) {
+      this.loading = (async () => {
+        const { pipeline, env } = await import("@xenova/transformers");
+
+        // Safe defaults
+        env.allowLocalModels = true;
+        env.backends.onnx.warmup = false;
+
+        console.log("šŸ”„ Loading local embedding model (first run only)...");
+
+        const model = await pipeline(
+          "feature-extraction",
+          "Xenova/all-MiniLM-L6-v2",
+          {
+            revision: "main",
+            cache_dir: "./.cache/embeddings",
+          }
+        );
+
+        console.log("āœ… Local embedding model ready");
         return model;
-      }).catch(error => {
-        console.error('āŒ Failed to load local embedding model:', error.message);
-        throw error;
-      });
+      })();
     }
-    return this.modelPromise;
+
+    this.model = await this.loading;
+    return this.model;
   }
 
+  /* ---------------- PUBLIC API ---------------- */
+
   /**
-   * Generate REAL semantic embedding for text
+   * Generate embedding for a single string
    */
   async embed(text) {
-    if (!text || !text.trim()) {
-      return new Array(this.dim).fill(0);
+    if (typeof text !== "string" || !text.trim()) {
+      throw new Error("Embedding input must be a non-empty string");
     }
 
+    const model = await this.loadModel();
+
     try {
-      const model = await this.getModel();
-      const output = await model(text, {
-        pooling: 'mean',
-        normalize: true
+      const output = await model(text, {
+        pooling: "mean",
+        normalize: true,
       });
-      return Array.from(output.data);
-    } catch (error) {
-      console.error(`āŒ Embedding failed for: "${text.substring(0, 50)}..."`);
-      return new Array(this.dim).fill(0);
+
+      const vector = Array.from(output.data);
+
+      if (vector.length !== this.dim) {
+        throw new Error(
+          `Invalid embedding dimension: ${vector.length} (expected ${this.dim})`
+        );
+      }
+
+      return vector;
+    } catch (err) {
+      console.error(
+        `āŒ Embedding failed for text: "${text.slice(0, 60)}..."`
+      );
+      throw err;
     }
   }
 
   /**
-   * Batch embedding for multiple strings
+   * Batch embedding (sequential, safe)
    */
-  async embedBatch(textArray = []) {
-    if (!Array.isArray(textArray)) {
+  async embedBatch(texts = []) {
+    if (!Array.isArray(texts)) {
       throw new Error("embedBatch expects an array of strings");
     }
-    const embeddings = [];
-    for (const text of textArray) {
-      const embedding = await this.embed(text);
-      embeddings.push(embedding);
+
+    const results = [];
+    for (const text of texts) {
+      results.push(await this.embed(text));
     }
-    return embeddings;
+    return results;
   }
 
   /**
-   * Get embedding dimension
+   * Return embedding dimension
    */
   getDimension() {
     return this.dim;
   }
 }
 
-/**
- * Convenience function for compatibility
- */
-async function createEmbeddingWithRetry(text, options = {}, retries = 2) {
-  const embedder = new LocalEmbedding();
-
-  for (let attempt = 1; attempt <= retries; attempt++) {
-    try {
-      const embedding = await embedder.embed(text);
-      const isAllZeros = embedding.every(val => val === 0);
-      if (isAllZeros && (text || '').trim()) {
-        if (attempt === retries) {
-          console.warn(`āš ļø Embedding is all zeros for text: "${text.substring(0, 50)}..."`);
-        }
-        throw new Error('Embedding returned all zeros');
-      }
-      return embedding;
-    } catch (err) {
-      if (attempt === retries) {
-        console.error(`āŒ All ${retries} attempts failed for embedding text: "${text.substring(0, 50)}..."`);
-        throw err;
-      }
-      console.warn(`āš ļø Embedding attempt ${attempt} failed, retrying...`);
-      await new Promise(resolve => setTimeout(resolve, 100 * attempt));
-    }
-  }
-}
+/* ---------------- SINGLETON EXPORT ---------------- */
+
+// One embedder per process (CRITICAL)
+const embedder = new LocalEmbedding();
 
-module.exports = { LocalEmbedding, createEmbeddingWithRetry };
+module.exports = embedder;
package/src/index.js CHANGED
@@ -1,16 +1,16 @@
 const fs = require("fs");
 const path = require("path");
-const { createLLM } = require("./llm/router.js");
-const { LocalEmbedding } = require("./embeddings/local.js");
+const embedder = require("./embeddings/local.js"); // āœ… singleton embedder
 const { chunkText } = require("./utils/chunker.js");
 const { extractKeywords } = require("./utils/extractText.js");
-const { cosine } = require("./utils/similarity.js");
 const { highlightMatches } = require("./utils/highlight.js");
-const PgVectorAdapter = require("./adapters/pgvectorAdapter.js");
 const VectorRouter = require("./adapters/vectorRouter");
+const PgVectorAdapter = require("./adapters/pgvectorAdapter.js");
 
 const CACHE_PATH = path.join(process.cwd(), "embeddings.json");
 
+/* ---------------- UTIL ---------------- */
+
 function safeResolve(base, userPath) {
   const resolved = path.resolve(base, userPath);
   if (!resolved.startsWith(path.resolve(base))) {
@@ -44,13 +44,9 @@ class DatabaseAdapter {
   async initialize(context) {
     if (this.initialized) return;
 
-    if (context.db_type === "mongodb" || context.MONGO_URI) {
-      await this.initMongo(context);
-    } else if (context.db_type === "sqlite" || context.db_path) {
-      await this.initSQLite(context);
-    } else if (context.db_type === "postgres" || context.POSTGRES_URL) {
-      await this.initPostgres(context);
-    }
+    if (context.MONGO_URI) await this.initMongo(context);
+    else if (context.db_path) await this.initSQLite(context);
+    else if (context.POSTGRES_URL) await this.initPostgres(context);
 
     this.initialized = true;
   }
@@ -58,74 +54,50 @@ class DatabaseAdapter {
   async initSQLite(context) {
     const Database = require("better-sqlite3");
     const dbPath = context.db_path || "./database.db";
-    const dbDir = path.dirname(path.resolve(dbPath));
-    if (!fs.existsSync(dbDir)) {
-      throw new Error(`SQLite database directory not found: ${dbDir}`);
-    }
-    this.sqliteClient = new Database(dbPath, { readonly: true });
-  }
-
-  async querySQLite(query, params = []) {
-    const stmt = this.sqliteClient.prepare(query);
-    return stmt.all(...params);
+    this.sqlite = new Database(dbPath, { readonly: true });
   }
 
   async initMongo(context) {
     const { MongoClient } = require("mongodb");
-    const uri =
-      context.MONGO_URI ||
-      `mongodb://localhost:27017/${context.db_name || "olang"}`;
-    this.mongoClient = new MongoClient(uri);
-    await this.mongoClient.connect();
-  }
-
-  async queryMongo(collectionName, filter = {}, projection = {}) {
-    const db = this.mongoClient.db(process.env.DB_NAME || "olang");
-    return db.collection(collectionName).find(filter, { projection }).toArray();
+    this.mongo = new MongoClient(context.MONGO_URI);
+    await this.mongo.connect();
   }
 
   async initPostgres(context) {
     const { Pool } = require("pg");
-    this.postgresClient = new Pool({
-      connectionString: context.POSTGRES_URL,
-    });
-  }
-
-  async queryPostgres(query, params = []) {
-    const result = await this.postgresClient.query(query, params);
-    return result.rows;
+    this.pg = new Pool({ connectionString: context.POSTGRES_URL });
   }
 
   async queryDocuments(context) {
     const table = context.db_table || "documents";
-    const contentCol = context.db_content_column || "content";
     const idCol = context.db_id_column || "id";
+    const contentCol = context.db_content_column || "content";
 
-    if (context.MONGO_URI) {
-      const rows = await this.queryMongo(table);
-      return rows.map((r) => ({
-        id: r._id?.toString(),
+    if (this.mongo) {
+      const rows = await this.mongo.db().collection(table).find({}).toArray();
+      return rows.map(r => ({
+        id: r._id.toString(),
         content: r[contentCol] || "",
         source: `mongodb:${table}`,
       }));
     }
 
-    if (context.db_path) {
-      const rows = await this.querySQLite(
-        `SELECT ${idCol}, ${contentCol} FROM ${table}`
-      );
-      return rows.map((r) => ({
+    if (this.sqlite) {
+      const rows = this.sqlite
+        .prepare(`SELECT ${idCol}, ${contentCol} FROM ${table}`)
+        .all();
+      return rows.map(r => ({
        id: r[idCol],
        content: r[contentCol],
        source: `sqlite:${table}`,
      }));
    }
 
-    if (context.POSTGRES_URL) {
-      const rows = await this.queryPostgres(
+    if (this.pg) {
+      const res = await this.pg.query(
        `SELECT ${idCol}, ${contentCol} FROM ${table}`
      );
-      return rows.map((r) => ({
+      return res.rows.map(r => ({
        id: r[idCol],
        content: r[contentCol],
        source: `postgres:${table}`,
@@ -154,13 +126,13 @@ async function loadAllDocuments(context) {
   if (fs.existsSync(baseDir)) {
     const files = fs
       .readdirSync(baseDir)
-      .filter((f) => f.endsWith(".txt") || f.endsWith(".md"));
+      .filter(f => f.endsWith(".txt") || f.endsWith(".md"));
 
-    for (const f of files) {
+    for (const file of files) {
       docs.push({
-        id: f,
-        content: fs.readFileSync(path.join(baseDir, f), "utf8"),
-        source: `file:${f}`,
+        id: file,
+        content: fs.readFileSync(path.join(baseDir, file), "utf8"),
+        source: `file:${file}`,
       });
     }
   }
@@ -170,74 +142,102 @@ async function loadAllDocuments(context) {
 
 /* ---------------- HYBRID VECTOR SEARCH ---------------- */
 
-async function performHybridDocQA(query, context = {}) {
+async function performHybridDocQA(query, context) {
   const cache = loadCache();
-  const embedder = new LocalEmbedding({ dimension: 384 });
+
+  const MIN_SCORE = context.minScore ?? 0.75;
+  const topK = context.topK ?? 5;
 
   const vectorStore = VectorRouter.create({
     backend: context.vectorBackend || "memory",
-    dimension: 384,
+    dimension: embedder.getDimension(),
     ...context,
   });
 
+  console.log(
+    "🧠 Vector store methods:",
+    Object.getOwnPropertyNames(Object.getPrototypeOf(vectorStore))
+  );
+
   const documents = await loadAllDocuments(context);
-  if (!documents.length) {
-    return { text: "", meta: {} };
-  }
+  console.log("šŸ“„ Documents loaded:", documents.length);
+
+  if (!documents.length) return { text: "(No documents found)", meta: { matches: 0 } };
 
+  // Multi-document ingestion
   for (const doc of documents) {
-    if (!cache[doc.id]) {
-      cache[doc.id] = true;
-      const chunks = chunkText(doc.content, 500);
-
-      for (let i = 0; i < chunks.length; i++) {
-        const vector = await embedder.embed(chunks[i]);
-        await vectorStore.upsert({
-          id: `${doc.id}:${i}`,
-          vector,
-          content: chunks[i],
-          source: doc.source,
-        });
+    const chunks = chunkText(doc.content, 500);
+    console.log(`šŸ“¦ ${doc.id} split into ${chunks.length} chunks`);
+
+    for (let i = 0; i < chunks.length; i++) {
+      console.log("🧩 Chunk to embed:", chunks[i]?.substring(0, 100));
+
+      const vector = await embedder.embed(chunks[i]);
+      if (!vector || vector.every(v => v === 0)) {
+        console.warn("āš ļø Zero or invalid embedding, skipping chunk");
+        continue;
      }
+
+      await vectorStore.upsert({
+        id: `${doc.id}:${i}`,
+        vector,
+        content: chunks[i],
+        source: doc.source,
+      });
+
+      console.log(`āœ… Upserted ${doc.id}:${i}`);
    }
  }
 
  saveCache(cache);
 
+  // Embed the query
  const queryVector = await embedder.embed(query);
-  const results = await vectorStore.query(queryVector, 5);
+  if (!queryVector || queryVector.every(v => v === 0)) {
+    console.warn("āš ļø Query embedding invalid");
+    return { text: "(Query could not be embedded)", meta: { matches: 0 } };
+  }
+
+  // Top-K + similarity threshold
+  const results = await vectorStore.query(queryVector, { topK });
+  const filtered = results.filter(r => r.score >= MIN_SCORE);
+
+  console.log(`šŸ” Search results: ${filtered.length} (after applying minScore=${MIN_SCORE})`);
+
+  if (!filtered.length) {
+    return { text: "(No relevant match found)", meta: { matches: 0 } };
+  }
 
  return {
    text: highlightMatches(
-      results.map((r) => r.content).join("\n\n"),
+      filtered.map(r => r.content).join("\n\n"),
      extractKeywords(query)
    ),
-    meta: { matches: results.length },
+    meta: { matches: filtered.length },
  };
 }
 
+
 /* ---------------- PGVECTOR SEARCH ---------------- */
 
-async function performPgVectorSearch(query, context = {}) {
+async function performPgVectorSearch(query, context) {
   const adapter = new PgVectorAdapter({
     POSTGRES_URL: context.POSTGRES_URL,
   });
 
-  const embedder = new LocalEmbedding({ dimension: 384 });
   const vector = await embedder.embed(query);
-  const results = await adapter.query(vector, 5);
-
+  const results = await adapter.search(vector, 5);
   await adapter.close();
 
   return {
-    text: results.map((r) => r.content).join("\n\n"),
+    text: results.map(r => r.content).join("\n\n"),
     meta: { matches: results.length },
   };
 }
 
 /* ---------------- ROUTER ---------------- */
 
-async function performDocQA(query, context = {}) {
+async function performDocQA(query, context) {
   if (context.POSTGRES_URL) {
     return performPgVectorSearch(query, context);
   }
@@ -247,14 +247,26 @@ async function performDocQA(query, context = {})
 /* ---------------- O-LANG RESOLVER ---------------- */
 
 async function docSearchResolver(action, context) {
-  if (action.startsWith("Ask doc-search")) {
-    const match = action.match(/"(.*)"|'(.*)'/);
-    const query = match
-      ? match[1] || match[2]
-      : action.replace("Ask doc-search", "").trim();
+  if (!action.startsWith("Ask doc-search")) return;
 
-    return performDocQA(query, context);
-  }
+  // Extract the query string
+  const match = action.match(/"(.*)"|'(.*)'/);
+  const query = match ? match[1] || match[2] : action.replace("Ask doc-search", "").trim();
+
+  // Optional: extract topK and minScore if provided in action, e.g. "Ask doc-search 'Vacation policy' topK=3 minScore=0.8"
+  let topK = 5;
+  let minScore = 0.75;
+
+  const topKMatch = action.match(/topK\s*=\s*(\d+)/i);
+  if (topKMatch) topK = parseInt(topKMatch[1], 10);
+
+  const minScoreMatch = action.match(/minScore\s*=\s*(0?\.\d+|1(\.0)?)/i);
+  if (minScoreMatch) minScore = parseFloat(minScoreMatch[1]);
+
+  // Pass these into context for hybrid search
+  const searchContext = { ...context, topK, minScore };
+
+  return performDocQA(query, searchContext);
 }
 
 docSearchResolver.resolverName = "doc-search";
@@ -0,0 +1,13 @@
+const docSearch = require("./index");
+
+(async () => {
+  const result = await docSearch(
+    'Ask doc-search "vacation policy"',
+    {
+      doc_root: "./docs",
+      vectorBackend: "memory"
+    }
+  );
+
+  console.log(result);
+})();
@@ -0,0 +1,36 @@
+// test-doc-search-batch.js
+const docSearchResolver = require("./src/index.js");
+
+(async () => {
+  try {
+    const context = {
+      doc_root: "./docs", // folder with .txt or .md files
+      vectorBackend: "memory", // can also switch to "pgvector" if configured
+    };
+
+    const queries = [
+      "Semantic search",
+      "Vacation policy",
+      "Employee onboarding",
+      "Leave requests",
+      "HR compliance"
+    ];
+
+    console.log("šŸ”Ž Running batch doc-search...");
+
+    for (const query of queries) {
+      const action = `Ask doc-search "${query}"`;
+      const result = await docSearchResolver(action, context);
+
+      console.log("\n====================================");
+      console.log(`Query: "${query}"`);
+      console.log("Text:\n", result.text || "(No matches found)");
+      console.log("Meta:", result.meta);
+      console.log("====================================");
+    }
+
+    console.log("\nāœ… Batch search complete!");
+  } catch (err) {
+    console.error("āŒ Batch doc-search test failed:", err);
+  }
+})();
@@ -0,0 +1,22 @@
+// test-doc-search.js
+const docSearchResolver = require("./src/index.js");
+
+(async () => {
+  try {
+    const context = {
+      doc_root: "./docs", // folder containing .txt or .md files
+      vectorBackend: "memory", // can also use "pgvector" if configured
+    };
+
+    const action = 'Ask doc-search "Semantic search"'; // Example query
+    console.log("šŸ”Ž Running doc-search...");
+
+    const result = await docSearchResolver(action, context);
+
+    console.log("āœ… Search Result:");
+    console.log("Text:\n", result.text);
+    console.log("Meta:", result.meta);
+  } catch (err) {
+    console.error("āŒ doc-search test failed:", err);
+  }
+})();
@@ -0,0 +1,32 @@
+// test-single-doc.js
+const path = require("path");
+const { LocalEmbedding } = require("./embeddings/local.js");
+const { chunkText } = require("./utils/chunker.js");
+const VectorRouter = require("./adapters/vectorRouter");
+
+(async () => {
+  const embedder = new LocalEmbedding();
+  const docPath = path.join(process.cwd(), "docs", "sample1.txt");
+  const fs = require("fs");
+  const content = fs.readFileSync(docPath, "utf8");
+
+  const chunks = chunkText(content, 500);
+  console.log(`Document split into ${chunks.length} chunk(s)`);
+
+  const vectorStore = VectorRouter.create({ backend: "memory", dimension: embedder.getDimension() });
+
+  for (let i = 0; i < chunks.length; i++) {
+    const vector = await embedder.embed(chunks[i]);
+    console.log(`Chunk ${i} embedding first 5 dims:`, vector.slice(0, 5));
+
+    await vectorStore.upsert({ id: `sample1:${i}`, vector, content: chunks[i], source: "file:sample1.txt" });
+  }
+
+  const query = "Semantic search";
+  const queryVector = await embedder.embed(query);
+
+  const results = await vectorStore.query(queryVector, { topK: 5 });
+  results.forEach((r, idx) => {
+    console.log(`Result ${idx}: score=${r.score.toFixed(3)} content=${r.content.substring(0, 50)}...`);
+  });
+})();
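
The export shape of src/embeddings/local.js changed in this release from named exports to a singleton instance, and embed() now throws on invalid input instead of returning a zero vector. A minimal consumer-side sketch of the difference, assuming the module remains reachable via a deep require (the specifier below is an assumption, not a documented entry point):

// Consumer-side sketch (assumed deep-require path; not a documented entry point).

// 1.0.22: named exports
// const { LocalEmbedding } = require("@o-lang/semantic-doc-search/src/embeddings/local.js");
// const embedder = new LocalEmbedding();

// 1.0.24: the module export is the shared instance itself
const embedder = require("@o-lang/semantic-doc-search/src/embeddings/local.js");

(async () => {
  const vector = await embedder.embed("vacation policy");
  console.log(embedder.getDimension(), vector.length); // 384 384

  // embed() now rejects empty input instead of returning a zero vector
  await embedder.embed("   ").catch(err => console.error(err.message));
})();

Code that previously relied on createEmbeddingWithRetry must handle the thrown error itself; that helper was removed in 1.0.24.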