@o-lang/semantic-doc-search 1.0.22 → 1.0.24
- package/.cache/embeddings/Xenova/all-MiniLM-L6-v2/config.json +25 -0
- package/.cache/embeddings/Xenova/all-MiniLM-L6-v2/onnx/model_quantized.onnx +0 -0
- package/.cache/embeddings/Xenova/all-MiniLM-L6-v2/tokenizer.json +30686 -0
- package/.cache/embeddings/Xenova/all-MiniLM-L6-v2/tokenizer_config.json +15 -0
- package/embeddings.json +1 -514
- package/package.json +2 -1
- package/src/embeddings/local.js +77 -86
- package/src/index.js +103 -91
- package/src/test-doc-search.js +13 -0
- package/test-doc-search-batch.js +36 -0
- package/test-doc-search.js +22 -0
- package/test-single-doc.js +32 -0
package/src/embeddings/local.js
CHANGED
@@ -1,123 +1,114 @@
-// src/embeddings/local.js
-
 /**
- * LocalEmbedding
- *
+ * LocalEmbedding
+ * ----------------
+ * Real semantic embeddings using all-MiniLM-L6-v2
+ * - Singleton model load
+ * - No silent failures
+ * - No zero vectors
+ * - Deterministic behavior
  */
+
 class LocalEmbedding {
   constructor() {
     this.dim = 384;
-    this.
-    this.
+    this.model = null;
+    this.loading = null;
   }
 
-
-   * Lazy-load the @xenova/transformers package
-   */
-  async getTransformers() {
-    if (!this.transformersPromise) {
-      this.transformersPromise = import('@xenova/transformers');
-    }
-    return this.transformersPromise;
-  }
+  /* ---------------- INTERNAL ---------------- */
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+  async loadModel() {
+    if (this.model) return this.model;
+
+    if (!this.loading) {
+      this.loading = (async () => {
+        const { pipeline, env } = await import("@xenova/transformers");
+
+        // Safe defaults
+        env.allowLocalModels = true;
+        env.backends.onnx.warmup = false;
+
+        console.log("Loading local embedding model (first run only)...");
+
+        const model = await pipeline(
+          "feature-extraction",
+          "Xenova/all-MiniLM-L6-v2",
+          {
+            revision: "main",
+            cache_dir: "./.cache/embeddings",
+          }
+        );
+
+        console.log("✅ Local embedding model ready");
         return model;
-      })
-        console.error('❌ Failed to load local embedding model:', error.message);
-        throw error;
-      });
+      })();
     }
-
+
+    this.model = await this.loading;
+    return this.model;
   }
 
+  /* ---------------- PUBLIC API ---------------- */
+
   /**
-   * Generate
+   * Generate embedding for a single string
    */
   async embed(text) {
-    if (
-
+    if (typeof text !== "string" || !text.trim()) {
+      throw new Error("Embedding input must be a non-empty string");
     }
 
+    const model = await this.loadModel();
+
     try {
-      const
-
-
-        normalize: true
+      const output = await model(text, {
+        pooling: "mean",
+        normalize: true,
       });
-
-
-
-
+
+      const vector = Array.from(output.data);
+
+      if (vector.length !== this.dim) {
+        throw new Error(
+          `Invalid embedding dimension: ${vector.length} (expected ${this.dim})`
+        );
+      }
+
+      return vector;
+    } catch (err) {
+      console.error(
+        `❌ Embedding failed for text: "${text.slice(0, 60)}..."`
+      );
+      throw err;
     }
   }
 
   /**
-   * Batch embedding
+   * Batch embedding (sequential, safe)
    */
-  async embedBatch(
-    if (!Array.isArray(
+  async embedBatch(texts = []) {
+    if (!Array.isArray(texts)) {
       throw new Error("embedBatch expects an array of strings");
     }
-
-
-
-
+
+    const results = [];
+    for (const text of texts) {
+      results.push(await this.embed(text));
     }
-    return
+    return results;
   }
 
   /**
-   *
+   * Return embedding dimension
    */
   getDimension() {
     return this.dim;
   }
 }
 
-
-
-
-
-const embedder = new LocalEmbedding();
-
-  for (let attempt = 1; attempt <= retries; attempt++) {
-    try {
-      const embedding = await embedder.embed(text);
-      const isAllZeros = embedding.every(val => val === 0);
-      if (isAllZeros && (text || '').trim()) {
-        if (attempt === retries) {
-          console.warn(`⚠️ Embedding is all zeros for text: "${text.substring(0, 50)}..."`);
-        }
-        throw new Error('Embedding returned all zeros');
-      }
-      return embedding;
-    } catch (err) {
-      if (attempt === retries) {
-        console.error(`❌ All ${retries} attempts failed for embedding text: "${text.substring(0, 50)}..."`);
-        throw err;
-      }
-      console.warn(`⚠️ Embedding attempt ${attempt} failed, retrying...`);
-      await new Promise(resolve => setTimeout(resolve, 100 * attempt));
-    }
-  }
-}
+/* ---------------- SINGLETON EXPORT ---------------- */
+
+// One embedder per process (CRITICAL)
+const embedder = new LocalEmbedding();
 
-module.exports =
+module.exports = embedder;
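The net effect in local.js: the retry wrapper and the lazy getTransformers() helper are gone, replaced by a promise-guarded loadModel() singleton, and the module now exports the shared instance rather than the class. A minimal usage sketch of the new surface — the deep require path and sample strings are assumptions for illustration, not part of the diff:

// usage-sketch.js — illustrative only, not shipped in the package
const embedder = require("@o-lang/semantic-doc-search/src/embeddings/local.js");

(async () => {
  // Single string -> 384-dim mean-pooled, normalized vector; empty input throws
  const vector = await embedder.embed("Vacation policy");
  console.log(vector.length); // 384, enforced by the dimension check

  // Sequential batch; each element goes through the same validation as embed()
  const vectors = await embedder.embedBatch(["onboarding", "leave requests"]);
  console.log(vectors.length, embedder.getDimension()); // 2 384
})();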
package/src/index.js
CHANGED
@@ -1,16 +1,16 @@
 const fs = require("fs");
 const path = require("path");
-const
-const { LocalEmbedding } = require("./embeddings/local.js");
+const embedder = require("./embeddings/local.js"); // ✅ singleton embedder
 const { chunkText } = require("./utils/chunker.js");
 const { extractKeywords } = require("./utils/extractText.js");
-const { cosine } = require("./utils/similarity.js");
 const { highlightMatches } = require("./utils/highlight.js");
-const PgVectorAdapter = require("./adapters/pgvectorAdapter.js");
 const VectorRouter = require("./adapters/vectorRouter");
+const PgVectorAdapter = require("./adapters/pgvectorAdapter.js");
 
 const CACHE_PATH = path.join(process.cwd(), "embeddings.json");
 
+/* ---------------- UTIL ---------------- */
+
 function safeResolve(base, userPath) {
   const resolved = path.resolve(base, userPath);
   if (!resolved.startsWith(path.resolve(base))) {
@@ -44,13 +44,9 @@ class DatabaseAdapter {
   async initialize(context) {
     if (this.initialized) return;
 
-    if (context.
-
-
-      await this.initSQLite(context);
-    } else if (context.db_type === "postgres" || context.POSTGRES_URL) {
-      await this.initPostgres(context);
-    }
+    if (context.MONGO_URI) await this.initMongo(context);
+    else if (context.db_path) await this.initSQLite(context);
+    else if (context.POSTGRES_URL) await this.initPostgres(context);
 
     this.initialized = true;
   }
@@ -58,74 +54,50 @@ class DatabaseAdapter {
   async initSQLite(context) {
     const Database = require("better-sqlite3");
     const dbPath = context.db_path || "./database.db";
-
-    if (!fs.existsSync(dbDir)) {
-      throw new Error(`SQLite database directory not found: ${dbDir}`);
-    }
-    this.sqliteClient = new Database(dbPath, { readonly: true });
-  }
-
-  async querySQLite(query, params = []) {
-    const stmt = this.sqliteClient.prepare(query);
-    return stmt.all(...params);
+    this.sqlite = new Database(dbPath, { readonly: true });
   }
 
   async initMongo(context) {
     const { MongoClient } = require("mongodb");
-
-
-      `mongodb://localhost:27017/${context.db_name || "olang"}`;
-    this.mongoClient = new MongoClient(uri);
-    await this.mongoClient.connect();
-  }
-
-  async queryMongo(collectionName, filter = {}, projection = {}) {
-    const db = this.mongoClient.db(process.env.DB_NAME || "olang");
-    return db.collection(collectionName).find(filter, { projection }).toArray();
+    this.mongo = new MongoClient(context.MONGO_URI);
+    await this.mongo.connect();
   }
 
   async initPostgres(context) {
     const { Pool } = require("pg");
-    this.
-      connectionString: context.POSTGRES_URL,
-    });
-  }
-
-  async queryPostgres(query, params = []) {
-    const result = await this.postgresClient.query(query, params);
-    return result.rows;
+    this.pg = new Pool({ connectionString: context.POSTGRES_URL });
   }
 
   async queryDocuments(context) {
     const table = context.db_table || "documents";
-    const contentCol = context.db_content_column || "content";
     const idCol = context.db_id_column || "id";
+    const contentCol = context.db_content_column || "content";
 
-    if (
-      const rows = await this.
-      return rows.map(
-        id: r._id
+    if (this.mongo) {
+      const rows = await this.mongo.db().collection(table).find({}).toArray();
+      return rows.map(r => ({
+        id: r._id.toString(),
         content: r[contentCol] || "",
         source: `mongodb:${table}`,
       }));
     }
 
-    if (
-      const rows =
-        `SELECT ${idCol}, ${contentCol} FROM ${table}`
-
-      return rows.map(
+    if (this.sqlite) {
+      const rows = this.sqlite
+        .prepare(`SELECT ${idCol}, ${contentCol} FROM ${table}`)
+        .all();
+      return rows.map(r => ({
         id: r[idCol],
         content: r[contentCol],
         source: `sqlite:${table}`,
       }));
     }
 
-    if (
-      const
+    if (this.pg) {
+      const res = await this.pg.query(
         `SELECT ${idCol}, ${contentCol} FROM ${table}`
       );
-      return rows.map(
+      return res.rows.map(r => ({
         id: r[idCol],
         content: r[contentCol],
         source: `postgres:${table}`,
@@ -154,13 +126,13 @@ async function loadAllDocuments(context) {
   if (fs.existsSync(baseDir)) {
     const files = fs
       .readdirSync(baseDir)
-      .filter(
+      .filter(f => f.endsWith(".txt") || f.endsWith(".md"));
 
-    for (const
+    for (const file of files) {
       docs.push({
-        id:
-        content: fs.readFileSync(path.join(baseDir,
-        source: `file:${
+        id: file,
+        content: fs.readFileSync(path.join(baseDir, file), "utf8"),
+        source: `file:${file}`,
       });
     }
   }
@@ -170,74 +142,102 @@ async function loadAllDocuments(context) {
 
 /* ---------------- HYBRID VECTOR SEARCH ---------------- */
 
-async function performHybridDocQA(query, context
+async function performHybridDocQA(query, context) {
   const cache = loadCache();
-
+
+  const MIN_SCORE = context.minScore ?? 0.75;
+  const topK = context.topK ?? 5;
 
   const vectorStore = VectorRouter.create({
     backend: context.vectorBackend || "memory",
-    dimension:
+    dimension: embedder.getDimension(),
     ...context,
   });
 
+  console.log(
+    "🧠 Vector store methods:",
+    Object.getOwnPropertyNames(Object.getPrototypeOf(vectorStore))
+  );
+
   const documents = await loadAllDocuments(context);
-
-
-  }
+  console.log("Documents loaded:", documents.length);
+
+  if (!documents.length) return { text: "(No documents found)", meta: { matches: 0 } };
 
+  // Multi-document ingestion
   for (const doc of documents) {
-
-
-
-
-
-
-
-
-
-      source: doc.source,
-    });
+    const chunks = chunkText(doc.content, 500);
+    console.log(`📦 ${doc.id} split into ${chunks.length} chunks`);
+
+    for (let i = 0; i < chunks.length; i++) {
+      console.log("🧩 Chunk to embed:", chunks[i]?.substring(0, 100));
+
+      const vector = await embedder.embed(chunks[i]);
+      if (!vector || vector.every(v => v === 0)) {
+        console.warn("⚠️ Zero or invalid embedding, skipping chunk");
+        continue;
       }
+
+      await vectorStore.upsert({
+        id: `${doc.id}:${i}`,
+        vector,
+        content: chunks[i],
+        source: doc.source,
+      });
+
+      console.log(`✅ Upserted ${doc.id}:${i}`);
     }
   }
 
   saveCache(cache);
 
+  // Embed the query
   const queryVector = await embedder.embed(query);
-
+  if (!queryVector || queryVector.every(v => v === 0)) {
+    console.warn("⚠️ Query embedding invalid");
+    return { text: "(Query could not be embedded)", meta: { matches: 0 } };
+  }
+
+  // Top-K + similarity threshold
+  const results = await vectorStore.query(queryVector, { topK });
+  const filtered = results.filter(r => r.score >= MIN_SCORE);
+
+  console.log(`Search results: ${filtered.length} (after applying minScore=${MIN_SCORE})`);
+
+  if (!filtered.length) {
+    return { text: "(No relevant match found)", meta: { matches: 0 } };
+  }
 
   return {
     text: highlightMatches(
-
+      filtered.map(r => r.content).join("\n\n"),
       extractKeywords(query)
     ),
-    meta: { matches:
+    meta: { matches: filtered.length },
   };
 }
 
+
 /* ---------------- PGVECTOR SEARCH ---------------- */
 
-async function performPgVectorSearch(query, context
+async function performPgVectorSearch(query, context) {
   const adapter = new PgVectorAdapter({
     POSTGRES_URL: context.POSTGRES_URL,
   });
 
-  const embedder = new LocalEmbedding({ dimension: 384 });
   const vector = await embedder.embed(query);
-  const results = await adapter.
-
+  const results = await adapter.search(vector, 5);
   await adapter.close();
 
   return {
-    text: results.map(
+    text: results.map(r => r.content).join("\n\n"),
     meta: { matches: results.length },
   };
 }
 
 /* ---------------- ROUTER ---------------- */
 
-async function performDocQA(query, context
+async function performDocQA(query, context) {
   if (context.POSTGRES_URL) {
     return performPgVectorSearch(query, context);
   }
@@ -247,14 +247,26 @@ async function performDocQA(query, context = {}) {
 /* ---------------- O-LANG RESOLVER ---------------- */
 
 async function docSearchResolver(action, context) {
-  if (action.startsWith("Ask doc-search"))
-    const match = action.match(/"(.*)"|'(.*)'/);
-    const query = match
-      ? match[1] || match[2]
-      : action.replace("Ask doc-search", "").trim();
+  if (!action.startsWith("Ask doc-search")) return;
 
-
-
+  // Extract the query string
+  const match = action.match(/"(.*)"|'(.*)'/);
+  const query = match ? match[1] || match[2] : action.replace("Ask doc-search", "").trim();
+
+  // Optional: extract topK and minScore if provided in action, e.g. "Ask doc-search 'Vacation policy' topK=3 minScore=0.8"
+  let topK = 5;
+  let minScore = 0.75;
+
+  const topKMatch = action.match(/topK\s*=\s*(\d+)/i);
+  if (topKMatch) topK = parseInt(topKMatch[1], 10);
+
+  const minScoreMatch = action.match(/minScore\s*=\s*(0?\.\d+|1(\.0)?)/i);
+  if (minScoreMatch) minScore = parseFloat(minScoreMatch[1]);
+
+  // Pass these into context for hybrid search
+  const searchContext = { ...context, topK, minScore };
+
+  return performDocQA(query, searchContext);
 }
 
 docSearchResolver.resolverName = "doc-search";
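With the resolver rewrite, topK and minScore can ride along inline in the action string and fall back to 5 and 0.75 otherwise. An invocation sketch — the context values are illustrative, and the export of docSearchResolver from src/index.js is inferred from the test files below:

// resolver-sketch.js — illustrative only
const docSearchResolver = require("@o-lang/semantic-doc-search/src/index.js");

(async () => {
  const result = await docSearchResolver(
    'Ask doc-search "Vacation policy" topK=3 minScore=0.8', // inline params parsed by regex
    { doc_root: "./docs", vectorBackend: "memory" }
  );
  console.log(result.text);
  console.log(result.meta); // { matches: <number of chunks scoring >= minScore> }
})();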
package/test-doc-search-batch.js
ADDED
@@ -0,0 +1,36 @@
+// test-doc-search-batch.js
+const docSearchResolver = require("./src/index.js");
+
+(async () => {
+  try {
+    const context = {
+      doc_root: "./docs", // folder with .txt or .md files
+      vectorBackend: "memory", // can also switch to "pgvector" if configured
+    };
+
+    const queries = [
+      "Semantic search",
+      "Vacation policy",
+      "Employee onboarding",
+      "Leave requests",
+      "HR compliance"
+    ];
+
+    console.log("Running batch doc-search...");
+
+    for (const query of queries) {
+      const action = `Ask doc-search "${query}"`;
+      const result = await docSearchResolver(action, context);
+
+      console.log("\n====================================");
+      console.log(`Query: "${query}"`);
+      console.log("Text:\n", result.text || "(No matches found)");
+      console.log("Meta:", result.meta);
+      console.log("====================================");
+    }
+
+    console.log("\n✅ Batch search complete!");
+  } catch (err) {
+    console.error("❌ Batch doc-search test failed:", err);
+  }
+})();
package/test-doc-search.js
ADDED
@@ -0,0 +1,22 @@
+// test-doc-search.js
+const docSearchResolver = require("./src/index.js");
+
+(async () => {
+  try {
+    const context = {
+      doc_root: "./docs", // folder containing .txt or .md files
+      vectorBackend: "memory", // can also use "pgvector" if configured
+    };
+
+    const action = 'Ask doc-search "Semantic search"'; // Example query
+    console.log("Running doc-search...");
+
+    const result = await docSearchResolver(action, context);
+
+    console.log("✅ Search Result:");
+    console.log("Text:\n", result.text);
+    console.log("Meta:", result.meta);
+  } catch (err) {
+    console.error("❌ doc-search test failed:", err);
+  }
+})();
package/test-single-doc.js
ADDED
@@ -0,0 +1,32 @@
+// test-single-doc.js
+const path = require("path");
+const { LocalEmbedding } = require("./embeddings/local.js");
+const { chunkText } = require("./utils/chunker.js");
+const VectorRouter = require("./adapters/vectorRouter");
+
+(async () => {
+  const embedder = new LocalEmbedding();
+  const docPath = path.join(process.cwd(), "docs", "sample1.txt");
+  const fs = require("fs");
+  const content = fs.readFileSync(docPath, "utf8");
+
+  const chunks = chunkText(content, 500);
+  console.log(`Document split into ${chunks.length} chunk(s)`);
+
+  const vectorStore = VectorRouter.create({ backend: "memory", dimension: embedder.getDimension() });
+
+  for (let i = 0; i < chunks.length; i++) {
+    const vector = await embedder.embed(chunks[i]);
+    console.log(`Chunk ${i} embedding first 5 dims:`, vector.slice(0, 5));
+
+    await vectorStore.upsert({ id: `sample1:${i}`, vector, content: chunks[i], source: "file:sample1.txt" });
+  }
+
+  const query = "Semantic search";
+  const queryVector = await embedder.embed(query);
+
+  const results = await vectorStore.query(queryVector, { topK: 5 });
+  results.forEach((r, idx) => {
+    console.log(`Result ${idx}: score=${r.score.toFixed(3)} content=${r.content.substring(0, 50)}...`);
+  });
+})();
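One caveat visible from the diff itself: test-single-doc.js destructures { LocalEmbedding } from ./embeddings/local.js, but in 1.0.24 that module lives at ./src/embeddings/local.js relative to the package root and exports the singleton instance (module.exports = embedder), not the class — so the script as published would fail at require time or at new LocalEmbedding(). A sketch of the presumably intended imports, assuming the script runs from the package root (an assumption, not part of the published diff):

// test-single-doc.js (adjusted imports — assumed intent, not from the published diff)
const embedder = require("./src/embeddings/local.js"); // singleton instance, not a class
const { chunkText } = require("./src/utils/chunker.js");
const VectorRouter = require("./src/adapters/vectorRouter");
// ...then call embedder.embed(...) directly instead of constructing LocalEmbedding.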