@o-lang/semantic-doc-search 1.0.23 → 1.0.24
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.cache/embeddings/Xenova/all-MiniLM-L6-v2/config.json +25 -0
- package/.cache/embeddings/Xenova/all-MiniLM-L6-v2/onnx/model_quantized.onnx +0 -0
- package/.cache/embeddings/Xenova/all-MiniLM-L6-v2/tokenizer.json +30686 -0
- package/.cache/embeddings/Xenova/all-MiniLM-L6-v2/tokenizer_config.json +15 -0
- package/embeddings.json +1 -514
- package/package.json +2 -1
- package/src/embeddings/local.js +77 -86
- package/src/index.js +76 -36
- package/src/test-doc-search.js +13 -0
- package/test-doc-search-batch.js +36 -0
- package/test-doc-search.js +22 -0
- package/test-single-doc.js +32 -0
package/src/embeddings/local.js
CHANGED
|
@@ -1,123 +1,114 @@
|
|
|
1
|
-
// src/embeddings/local.js
|
|
2
|
-
|
|
3
1
|
/**
 * LocalEmbedding
 * ----------------
 * Sentence embeddings computed in-process with the Xenova/all-MiniLM-L6-v2
 * model through @xenova/transformers.
 *
 * - The model pipeline is loaded lazily, at most once per instance;
 *   concurrent callers share the same in-flight load.
 * - A failed load is not cached: the next call starts a fresh attempt.
 * - Nothing fails silently: invalid input, a model error, or a vector of the
 *   wrong length all surface as a thrown Error.
 */
class LocalEmbedding {
  constructor() {
    // Length every embedding must have; enforced in embed().
    this.dim = 384;
    // Resolved feature-extraction pipeline, or null until the first load.
    this.model = null;
    // Promise of the in-flight load, shared by concurrent callers.
    this.loading = null;
  }

  /* ---------------- INTERNAL ---------------- */

  /**
   * Load the feature-extraction pipeline, reusing a finished or in-flight load.
   *
   * @returns {Promise<Function>} the pipeline returned by @xenova/transformers
   * @throws whatever the dynamic import or pipeline() rejects with
   */
  async loadModel() {
    if (this.model) return this.model;

    if (!this.loading) {
      this.loading = (async () => {
        const { pipeline, env } = await import("@xenova/transformers");

        // Safe defaults
        env.allowLocalModels = true;
        env.backends.onnx.warmup = false;

        console.log("š Loading local embedding model (first run only)...");

        const model = await pipeline(
          "feature-extraction",
          "Xenova/all-MiniLM-L6-v2",
          {
            revision: "main",
            // NOTE(review): relative path — presumably resolved against the
            // process working directory rather than this package; confirm.
            cache_dir: "./.cache/embeddings",
          }
        );

        console.log("✅ Local embedding model ready");
        return model;
      })();
    }

    const pending = this.loading;
    try {
      this.model = await pending;
    } catch (err) {
      // Drop the rejected promise so a later call can retry instead of
      // re-throwing the same cached failure forever. Only clear it if no
      // other caller has already started a newer attempt.
      if (this.loading === pending) this.loading = null;
      throw err;
    }
    return this.model;
  }

  /* ---------------- PUBLIC API ---------------- */

  /**
   * Generate embedding for a single string
   *
   * @param {string} text - non-empty, non-whitespace input
   * @returns {Promise<number[]>} plain array of `this.dim` numbers
   * @throws {Error} if `text` is not a non-empty string, if the model fails,
   *   or if the produced vector does not have `this.dim` entries
   */
  async embed(text) {
    if (typeof text !== "string" || !text.trim()) {
      throw new Error("Embedding input must be a non-empty string");
    }

    const model = await this.loadModel();

    try {
      const output = await model(text, {
        pooling: "mean",
        normalize: true,
      });

      // Copy the typed-array payload into a plain array for callers.
      const vector = Array.from(output.data);

      if (vector.length !== this.dim) {
        throw new Error(
          `Invalid embedding dimension: ${vector.length} (expected ${this.dim})`
        );
      }

      return vector;
    } catch (err) {
      // Log which input failed, then let the caller decide what to do.
      console.error(
        `ā Embedding failed for text: "${text.slice(0, 60)}..."`
      );
      throw err;
    }
  }

  /**
   * Batch embedding (sequential, safe)
   *
   * Texts are embedded one after another; the first failure rejects the
   * whole batch.
   *
   * @param {string[]} [texts=[]]
   * @returns {Promise<number[][]>} one vector per input, in input order
   * @throws {Error} if `texts` is not an array, or if any embed() call throws
   */
  async embedBatch(texts = []) {
    if (!Array.isArray(texts)) {
      throw new Error("embedBatch expects an array of strings");
    }

    const results = [];
    for (const text of texts) {
      results.push(await this.embed(text));
    }
    return results;
  }

  /**
   * Return embedding dimension
   *
   * @returns {number}
   */
  getDimension() {
    return this.dim;
  }
}
|
|
94
108
|
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
const embedder = new LocalEmbedding();
|
|
100
|
-
|
|
101
|
-
for (let attempt = 1; attempt <= retries; attempt++) {
|
|
102
|
-
try {
|
|
103
|
-
const embedding = await embedder.embed(text);
|
|
104
|
-
const isAllZeros = embedding.every(val => val === 0);
|
|
105
|
-
if (isAllZeros && (text || '').trim()) {
|
|
106
|
-
if (attempt === retries) {
|
|
107
|
-
console.warn(`ā ļø Embedding is all zeros for text: "${text.substring(0, 50)}..."`);
|
|
108
|
-
}
|
|
109
|
-
throw new Error('Embedding returned all zeros');
|
|
110
|
-
}
|
|
111
|
-
return embedding;
|
|
112
|
-
} catch (err) {
|
|
113
|
-
if (attempt === retries) {
|
|
114
|
-
console.error(`ā All ${retries} attempts failed for embedding text: "${text.substring(0, 50)}..."`);
|
|
115
|
-
throw err;
|
|
116
|
-
}
|
|
117
|
-
console.warn(`ā ļø Embedding attempt ${attempt} failed, retrying...`);
|
|
118
|
-
await new Promise(resolve => setTimeout(resolve, 100 * attempt));
|
|
119
|
-
}
|
|
120
|
-
}
|
|
121
|
-
}
|
|
109
|
+
/* ---------------- SINGLETON EXPORT ---------------- */

// One embedder per process (CRITICAL): every module that requires this file
// receives the same instance, so the model is loaded at most once.
const embedder = new LocalEmbedding();

// Keep the class reachable for callers that still write
// `const { LocalEmbedding } = require("./embeddings/local.js")`.
embedder.LocalEmbedding = LocalEmbedding;

module.exports = embedder;
|
package/src/index.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
const fs = require("fs");
|
|
2
2
|
const path = require("path");
|
|
3
|
-
const
|
|
3
|
+
const embedder = require("./embeddings/local.js"); // ✅ singleton embedder
|
|
4
4
|
const { chunkText } = require("./utils/chunker.js");
|
|
5
5
|
const { extractKeywords } = require("./utils/extractText.js");
|
|
6
6
|
const { highlightMatches } = require("./utils/highlight.js");
|
|
@@ -59,8 +59,7 @@ class DatabaseAdapter {
|
|
|
59
59
|
|
|
60
60
|
async initMongo(context) {
|
|
61
61
|
const { MongoClient } = require("mongodb");
|
|
62
|
-
|
|
63
|
-
this.mongo = new MongoClient(uri);
|
|
62
|
+
this.mongo = new MongoClient(context.MONGO_URI);
|
|
64
63
|
await this.mongo.connect();
|
|
65
64
|
}
|
|
66
65
|
|
|
@@ -113,8 +112,8 @@ class DatabaseAdapter {
|
|
|
113
112
|
|
|
114
113
|
async function loadAllDocuments(context) {
|
|
115
114
|
const docs = [];
|
|
116
|
-
|
|
117
115
|
const db = new DatabaseAdapter();
|
|
116
|
+
|
|
118
117
|
try {
|
|
119
118
|
await db.initialize(context);
|
|
120
119
|
docs.push(...(await db.queryDocuments(context)));
|
|
@@ -125,9 +124,9 @@ async function loadAllDocuments(context) {
|
|
|
125
124
|
: path.join(process.cwd(), "docs");
|
|
126
125
|
|
|
127
126
|
if (fs.existsSync(baseDir)) {
|
|
128
|
-
const files = fs
|
|
129
|
-
|
|
130
|
-
|
|
127
|
+
const files = fs
|
|
128
|
+
.readdirSync(baseDir)
|
|
129
|
+
.filter(f => f.endsWith(".txt") || f.endsWith(".md"));
|
|
131
130
|
|
|
132
131
|
for (const file of files) {
|
|
133
132
|
docs.push({
|
|
@@ -145,56 +144,86 @@ async function loadAllDocuments(context) {
|
|
|
145
144
|
|
|
146
145
|
/**
 * Answer `query` against every loaded document using vector similarity.
 *
 * Steps: load the documents, split each into chunks, embed and upsert every
 * chunk into a vector store, embed the query, fetch the `topK` nearest
 * chunks and keep those whose score reaches the minimum score.
 *
 * @param {string} query - the user's search text
 * @param {object} context - resolver context; reads `minScore` (default 0.75),
 *   `topK` (default 5) and `vectorBackend` (default "memory"), and is also
 *   spread into the vector-store options and passed to loadAllDocuments()
 * @returns {Promise<{text: *, meta: {matches: number}}>} `text` is the output
 *   of highlightMatches() over the matching chunks, or a parenthesised
 *   placeholder message when there is nothing to return
 */
async function performHybridDocQA(query, context) {
  // NOTE(review): the cache is loaded and written back unchanged here —
  // presumably intended for embedding reuse; confirm.
  const cache = loadCache();

  const MIN_SCORE = context.minScore ?? 0.75;
  const topK = context.topK ?? 5;

  const vectorStore = VectorRouter.create({
    backend: context.vectorBackend || "memory",
    dimension: embedder.getDimension(),
    // Spread last, so keys present on `context` win over the two above.
    ...context,
  });

  console.log(
    "š§ Vector store methods:",
    Object.getOwnPropertyNames(Object.getPrototypeOf(vectorStore))
  );

  const documents = await loadAllDocuments(context);
  console.log("š Documents loaded:", documents.length);

  if (!documents.length) return { text: "(No documents found)", meta: { matches: 0 } };

  // Multi-document ingestion
  for (const doc of documents) {
    const chunks = chunkText(doc.content, 500);
    console.log(`š¦ ${doc.id} split into ${chunks.length} chunks`);

    for (let i = 0; i < chunks.length; i++) {
      const chunk = chunks[i];
      console.log("š§© Chunk to embed:", chunk?.substring(0, 100));

      // embedder.embed() throws on non-string or blank input; skip such
      // chunks here so one empty chunk cannot abort the whole search.
      if (typeof chunk !== "string" || !chunk.trim()) {
        console.warn(`Skipping blank chunk ${doc.id}:${i}`);
        continue;
      }

      const vector = await embedder.embed(chunk);
      if (!vector || vector.every(v => v === 0)) {
        console.warn("ā ļø Zero or invalid embedding, skipping chunk");
        continue;
      }

      await vectorStore.upsert({
        id: `${doc.id}:${i}`,
        vector,
        content: chunk,
        source: doc.source,
      });

      console.log(`✅ Upserted ${doc.id}:${i}`);
    }
  }

  saveCache(cache);

  // A blank query cannot be embedded (embed() would throw); report it the
  // same way as any other unusable query embedding.
  if (typeof query !== "string" || !query.trim()) {
    console.warn("ā ļø Query embedding invalid");
    return { text: "(Query could not be embedded)", meta: { matches: 0 } };
  }

  // Embed the query
  const queryVector = await embedder.embed(query);
  if (!queryVector || queryVector.every(v => v === 0)) {
    console.warn("ā ļø Query embedding invalid");
    return { text: "(Query could not be embedded)", meta: { matches: 0 } };
  }

  // Top-K + similarity threshold
  const results = await vectorStore.query(queryVector, { topK });
  const filtered = results.filter(r => r.score >= MIN_SCORE);

  console.log(`š Search results: ${filtered.length} (after applying minScore=${MIN_SCORE})`);

  if (!filtered.length) {
    return { text: "(No relevant match found)", meta: { matches: 0 } };
  }

  return {
    text: highlightMatches(
      filtered.map(r => r.content).join("\n\n"),
      extractKeywords(query)
    ),
    meta: { matches: filtered.length },
  };
}
|
|
192
219
|
|
|
220
|
+
|
|
193
221
|
/* ---------------- PGVECTOR SEARCH ---------------- */
|
|
194
222
|
|
|
195
223
|
async function performPgVectorSearch(query, context) {
|
|
196
|
-
const adapter = new PgVectorAdapter({
|
|
197
|
-
|
|
224
|
+
const adapter = new PgVectorAdapter({
|
|
225
|
+
POSTGRES_URL: context.POSTGRES_URL,
|
|
226
|
+
});
|
|
198
227
|
|
|
199
228
|
const vector = await embedder.embed(query);
|
|
200
229
|
const results = await adapter.search(vector, 5);
|
|
@@ -220,14 +249,25 @@ async function performDocQA(query, context) {
|
|
|
220
249
|
/**
 * Resolver entry point for actions of the form
 *   Ask doc-search "<query>" [topK=<n>] [minScore=<0..1>]
 *
 * The query is the quoted text when quotes are present; otherwise it is the
 * rest of the action with any option tokens removed. `topK` and `minScore`
 * are resolved in this order: value given in the action, then the value on
 * `context`, then the built-in default (5 and 0.75). Values given in the
 * action are ignored when out of range (topK < 1, minScore outside [0, 1]).
 *
 * @param {string} action - the raw action text
 * @param {object} [context] - resolver context; copied, never mutated
 * @returns {Promise<*>|undefined} the result of performDocQA(), or
 *   undefined when the action is not addressed to doc-search
 */
async function docSearchResolver(action, context) {
  if (typeof action !== "string" || !action.startsWith("Ask doc-search")) return;

  const TOP_K_OPTION = /topK\s*=\s*(\d+)/i;
  const MIN_SCORE_OPTION = /minScore\s*=\s*(\d*\.?\d+)/i;

  // Extract the query string
  const match = action.match(/"(.*)"|'(.*)'/);
  const query = match
    ? match[1] || match[2]
    : action
        .replace("Ask doc-search", "")
        .replace(TOP_K_OPTION, "")
        .replace(MIN_SCORE_OPTION, "")
        .trim();

  // Look for options only outside the quoted query, so that a query such as
  // "what does topK=3 mean" is not mistaken for an option.
  const optionText = match
    ? action.slice(0, match.index) + action.slice(match.index + match[0].length)
    : action;

  // Optional: extract topK and minScore if provided in action,
  // e.g. "Ask doc-search 'Vacation policy' topK=3 minScore=0.8".
  // Caller-supplied context values act as the defaults.
  let topK = context?.topK ?? 5;
  let minScore = context?.minScore ?? 0.75;

  const topKMatch = optionText.match(TOP_K_OPTION);
  if (topKMatch) {
    const parsedTopK = Number.parseInt(topKMatch[1], 10);
    if (parsedTopK > 0) topK = parsedTopK;
  }

  const minScoreMatch = optionText.match(MIN_SCORE_OPTION);
  if (minScoreMatch) {
    const parsedMinScore = Number.parseFloat(minScoreMatch[1]);
    if (parsedMinScore >= 0 && parsedMinScore <= 1) minScore = parsedMinScore;
  }

  // Pass these into context for hybrid search
  const searchContext = { ...context, topK, minScore };

  return performDocQA(query, searchContext);
}
|
|
230
271
|
|
|
231
272
|
docSearchResolver.resolverName = "doc-search";
|
|
232
273
|
module.exports = docSearchResolver;
|
|
233
|
-
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
// test-doc-search-batch.js
// Manual smoke test: sends several queries through the doc-search resolver,
// one after another, and prints each result. Exits non-zero on failure.
const docSearchResolver = require("./src/index.js");

(async () => {
  try {
    const context = {
      doc_root: "./docs", // folder with .txt or .md files
      vectorBackend: "memory", // can also switch to "pgvector" if configured
    };

    const queries = [
      "Semantic search",
      "Vacation policy",
      "Employee onboarding",
      "Leave requests",
      "HR compliance",
    ];

    console.log("š Running batch doc-search...");

    for (const query of queries) {
      const action = `Ask doc-search "${query}"`;
      const result = await docSearchResolver(action, context);

      console.log("\n====================================");
      console.log(`Query: "${query}"`);
      // The resolver may return nothing; do not crash while printing.
      console.log("Text:\n", result?.text || "(No matches found)");
      console.log("Meta:", result?.meta);
      console.log("====================================");
    }

    console.log("\n✅ Batch search complete!");
  } catch (err) {
    console.error("ā Batch doc-search test failed:", err);
    // Make the failure visible to whatever launched this script.
    process.exitCode = 1;
  }
})();
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
// test-doc-search.js
// Manual smoke test: sends one query through the doc-search resolver and
// prints the result. Exits non-zero on failure.
const docSearchResolver = require("./src/index.js");

(async () => {
  try {
    const context = {
      doc_root: "./docs", // folder containing .txt or .md files
      vectorBackend: "memory", // can also use "pgvector" if configured
    };

    const action = 'Ask doc-search "Semantic search"'; // Example query
    console.log("š Running doc-search...");

    const result = await docSearchResolver(action, context);

    console.log("✅ Search Result:");
    // The resolver may return nothing; do not crash while printing.
    console.log("Text:\n", result?.text);
    console.log("Meta:", result?.meta);
  } catch (err) {
    console.error("ā doc-search test failed:", err);
    // Make the failure visible to whatever launched this script.
    process.exitCode = 1;
  }
})();
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
// test-single-doc.js
// Manual smoke test: chunks docs/sample1.txt, embeds every chunk into an
// in-memory vector store, then prints the top matches for a sample query.
// Exits non-zero on failure.
const fs = require("fs");
const path = require("path");
// Paths are relative to the package root, where this script lives next to
// test-doc-search.js. local.js exports the shared embedder instance itself,
// not the LocalEmbedding class.
const embedder = require("./src/embeddings/local.js");
const { chunkText } = require("./src/utils/chunker.js");
const VectorRouter = require("./src/adapters/vectorRouter");

(async () => {
  try {
    const docPath = path.join(process.cwd(), "docs", "sample1.txt");
    const content = fs.readFileSync(docPath, "utf8");

    const chunks = chunkText(content, 500);
    console.log(`Document split into ${chunks.length} chunk(s)`);

    const vectorStore = VectorRouter.create({
      backend: "memory",
      dimension: embedder.getDimension(),
    });

    for (let i = 0; i < chunks.length; i++) {
      const vector = await embedder.embed(chunks[i]);
      console.log(`Chunk ${i} embedding first 5 dims:`, vector.slice(0, 5));

      await vectorStore.upsert({
        id: `sample1:${i}`,
        vector,
        content: chunks[i],
        source: "file:sample1.txt",
      });
    }

    const query = "Semantic search";
    const queryVector = await embedder.embed(query);

    const results = await vectorStore.query(queryVector, { topK: 5 });
    results.forEach((r, idx) => {
      console.log(`Result ${idx}: score=${r.score.toFixed(3)} content=${r.content.substring(0, 50)}...`);
    });
  } catch (err) {
    // Without this the rejection would go unhandled and unreported.
    console.error("test-single-doc failed:", err);
    process.exitCode = 1;
  }
})();
|