@o-lang/semantic-doc-search 1.0.40 → 1.0.41
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +17 -12
- package/package.json +1 -1
- package/src/resolver.js +59 -179
- package/src/services/docQA.js +56 -19
- package/src/utils/formatResults.js +9 -10
package/README.md
CHANGED
|
@@ -1,24 +1,29 @@
|
|
|
1
|
-
# @
|
|
1
|
+
# @o-lang/semantic-doc-search
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
Semantic document retrieval engine for O-Lang workflows.
|
|
4
|
+
|
|
5
|
+
This package provides vector-based document search (RAG retrieval layer) that integrates with O-Lang kernel workflows. It handles document ingestion, chunking, embedding, and similarity search, returning LLM-ready context outputs.
|
|
4
6
|
|
|
5
7
|
---
|
|
6
8
|
|
|
7
9
|
## Features
|
|
8
10
|
|
|
9
|
-
-
|
|
10
|
-
-
|
|
11
|
-
-
|
|
12
|
-
-
|
|
13
|
-
-
|
|
14
|
-
-
|
|
15
|
-
-
|
|
16
|
-
-
|
|
17
|
-
-
|
|
11
|
+
- Semantic vector search using embeddings
|
|
12
|
+
- Document ingestion from local filesystem (`.txt`, `.md`)
|
|
13
|
+
- Automatic text chunking for large documents
|
|
14
|
+
- Pluggable embedding providers (local, OpenAI, Groq, etc.)
|
|
15
|
+
- Multiple vector database support:
|
|
16
|
+
- In-memory store
|
|
17
|
+
- Redis (adapter)
|
|
18
|
+
- PostgreSQL / pgvector (adapter)
|
|
19
|
+
- Pinecone (adapter)
|
|
20
|
+
- Embedding cache support (`embeddings.json`)
|
|
21
|
+
- Normalized LLM-ready output format (`text + matches`)
|
|
22
|
+
- Designed for O-Lang `.ol` workflow integration
|
|
18
23
|
|
|
19
24
|
---
|
|
20
25
|
|
|
21
26
|
## Installation
|
|
22
27
|
|
|
23
28
|
```bash
|
|
24
|
-
npm install @
|
|
29
|
+
npm install @o-lang/semantic-doc-search
|
package/package.json
CHANGED
package/src/resolver.js
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
const VectorRouter = require("./adapters/vectorRouter");
|
|
2
|
-
const embedder = require("./embeddings/local");
|
|
2
|
+
const embedder = require("./embeddings/local");
|
|
3
3
|
const { extractQuery } = require("./utils/extractQuery");
|
|
4
4
|
const { formatResults } = require("./utils/formatResults");
|
|
5
5
|
const fs = require("fs");
|
|
@@ -8,7 +8,9 @@ const crypto = require("crypto");
|
|
|
8
8
|
|
|
9
9
|
const CACHE_PATH = path.join(process.cwd(), "embeddings.json");
|
|
10
10
|
|
|
11
|
-
//
|
|
11
|
+
// ─────────────────────────────────────────────
|
|
12
|
+
// Helpers (UNCHANGED)
|
|
13
|
+
// ─────────────────────────────────────────────
|
|
12
14
|
function loadCache() {
|
|
13
15
|
try {
|
|
14
16
|
if (fs.existsSync(CACHE_PATH)) {
|
|
@@ -24,226 +26,104 @@ function saveCache(cache) {
|
|
|
24
26
|
} catch {}
|
|
25
27
|
}
|
|
26
28
|
|
|
27
|
-
/**
|
|
28
|
-
* Clean text for embedding (defensive)
|
|
29
|
-
*/
|
|
30
29
|
function sanitizeTextForEmbedding(text) {
|
|
31
30
|
if (typeof text !== "string") return "";
|
|
32
|
-
// Remove wrapping quotes and extra whitespace
|
|
33
31
|
return text.replace(/^["']|["']$/g, "").trim();
|
|
34
32
|
}
|
|
35
33
|
|
|
36
|
-
/**
|
|
37
|
-
* Load documents from doc_root if provided
|
|
38
|
-
*/
|
|
39
|
-
function loadDocumentsFromContext(context) {
|
|
40
|
-
if (context.documents && Array.isArray(context.documents)) {
|
|
41
|
-
return context.documents;
|
|
42
|
-
}
|
|
43
|
-
|
|
44
|
-
if (context.doc_root) {
|
|
45
|
-
const baseDir = path.resolve(process.cwd(), context.doc_root);
|
|
46
|
-
if (fs.existsSync(baseDir)) {
|
|
47
|
-
const files = fs.readdirSync(baseDir).filter(f => f.endsWith('.txt') || f.endsWith('.md'));
|
|
48
|
-
const docs = files.map(file => {
|
|
49
|
-
try {
|
|
50
|
-
const content = fs.readFileSync(path.join(baseDir, file), 'utf8');
|
|
51
|
-
return { id: file, content, source: `file:${file}` };
|
|
52
|
-
} catch (err) {
|
|
53
|
-
console.warn(`⚠️ Failed to read ${file}:`, err.message);
|
|
54
|
-
return null;
|
|
55
|
-
}
|
|
56
|
-
}).filter(Boolean);
|
|
57
|
-
|
|
58
|
-
console.log(`📄 Loaded ${docs.length} document(s) from ${baseDir}`);
|
|
59
|
-
docs.forEach(d => console.log(` - ${d.id} (${d.content?.length || 0} chars)`));
|
|
60
|
-
return docs;
|
|
61
|
-
}
|
|
62
|
-
}
|
|
63
|
-
|
|
64
|
-
return [];
|
|
65
|
-
}
|
|
66
|
-
|
|
67
|
-
/**
|
|
68
|
-
* Convert any array-like (Float32Array, etc.) to plain JS array
|
|
69
|
-
*/
|
|
70
|
-
function toPlainArray(input) {
|
|
71
|
-
if (!input) return null;
|
|
72
|
-
if (Array.isArray(input)) return input;
|
|
73
|
-
if (ArrayBuffer.isView(input)) return Array.from(input);
|
|
74
|
-
return null;
|
|
75
|
-
}
|
|
76
|
-
|
|
77
|
-
/**
|
|
78
|
-
* Hash text for caching
|
|
79
|
-
*/
|
|
80
34
|
function hashText(str) {
|
|
81
35
|
return crypto.createHash("sha256").update(str).digest("hex");
|
|
82
36
|
}
|
|
83
37
|
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
38
|
+
// ─────────────────────────────────────────────
|
|
39
|
+
// 🔥 MAIN RESOLVER
|
|
40
|
+
// ─────────────────────────────────────────────
|
|
87
41
|
async function resolver(action, context = {}) {
|
|
88
42
|
if (typeof action !== "string") return;
|
|
89
|
-
if (!action.toLowerCase().startsWith("ask doc-search")) return;
|
|
90
43
|
|
|
91
|
-
let query = extractQuery(action);
|
|
92
|
-
query = sanitizeTextForEmbedding(query);
|
|
93
|
-
if (!query) return { text: "(Empty query)", meta: { matches: 0 } };
|
|
94
|
-
|
|
95
|
-
// Vector backend
|
|
96
44
|
const vectorStore = VectorRouter.create(context);
|
|
45
|
+
const embed = await embedder({ dimension: 384 });
|
|
97
46
|
|
|
98
|
-
|
|
99
|
-
if (!vectorStore.supports("vector.search")) {
|
|
100
|
-
throw new Error("Vector backend does not support vector.search");
|
|
101
|
-
}
|
|
102
|
-
|
|
103
|
-
// Load documents (from context.documents OR doc_root)
|
|
104
|
-
const documents = loadDocumentsFromContext(context);
|
|
105
|
-
console.log("🔄 Starting ingestion for", documents.length, "documents");
|
|
106
|
-
|
|
107
|
-
// ✅ ONLY USE CACHE FOR PERSISTENT BACKENDS
|
|
47
|
+
const doc_root = context.doc_root || "./docs";
|
|
108
48
|
const useCache = !!context.POSTGRES_URL || !!context.REDIS_URL;
|
|
109
49
|
const cache = useCache ? loadCache() : {};
|
|
110
50
|
|
|
111
|
-
//
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
51
|
+
// =====================================================
|
|
52
|
+
// ✅ 1. VECTOR INSERT (INGEST)
|
|
53
|
+
// =====================================================
|
|
54
|
+
if (action.includes("vector.insert")) {
|
|
55
|
+
let inserted = 0;
|
|
116
56
|
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
if (!doc?.content) {
|
|
121
|
-
console.warn("⚠️ Skipping empty doc:", doc?.id);
|
|
122
|
-
continue;
|
|
123
|
-
}
|
|
57
|
+
if (fs.existsSync(doc_root)) {
|
|
58
|
+
const files = fs.readdirSync(doc_root);
|
|
124
59
|
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
const chunkText = require("./utils/chunker.js").chunkText;
|
|
129
|
-
chunks = chunkText(doc.content, 500) || [doc.content];
|
|
130
|
-
console.log("📦", doc.id, "split into", chunks.length, "chunks");
|
|
131
|
-
} catch (err) {
|
|
132
|
-
console.warn("⚠️ Chunking failed, using full doc:", err.message);
|
|
133
|
-
chunks = [doc.content];
|
|
134
|
-
}
|
|
60
|
+
for (const file of files) {
|
|
61
|
+
const fullPath = path.join(doc_root, file);
|
|
62
|
+
if (!fs.statSync(fullPath).isFile()) continue;
|
|
135
63
|
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
console.log("🧩 Chunk", `${doc.id}:${i}`, "text:", JSON.stringify(text));
|
|
139
|
-
|
|
140
|
-
if (!text) {
|
|
141
|
-
console.warn(`⚠️ Skipping empty chunk ${doc.id}:${i}`);
|
|
142
|
-
continue;
|
|
143
|
-
}
|
|
64
|
+
const content = fs.readFileSync(fullPath, "utf8");
|
|
65
|
+
if (!content) continue;
|
|
144
66
|
|
|
145
|
-
const
|
|
146
|
-
|
|
147
|
-
if (useCache && cache[hash]) {
|
|
148
|
-
console.log(`⏭️ Skipping already ingested chunk ${doc.id}:${i}`);
|
|
149
|
-
continue;
|
|
150
|
-
}
|
|
67
|
+
const chunkText = require("./utils/chunker").chunkText;
|
|
68
|
+
const chunks = chunkText(content, 500, 50);
|
|
151
69
|
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
rawVector = await embedder.embed(text);
|
|
156
|
-
} catch (err) {
|
|
157
|
-
console.warn(`⚠️ Embedding failed for chunk ${doc.id}:${i} ("${text.slice(0, 30)}..."):`, err.message);
|
|
158
|
-
continue;
|
|
159
|
-
}
|
|
70
|
+
for (let i = 0; i < chunks.length; i++) {
|
|
71
|
+
const text = sanitizeTextForEmbedding(chunks[i]);
|
|
72
|
+
if (!text) continue;
|
|
160
73
|
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
console.warn(`⚠️ Invalid vector type for chunk ${doc.id}:${i}:`, typeof rawVector);
|
|
164
|
-
continue;
|
|
165
|
-
}
|
|
74
|
+
const hash = hashText(text);
|
|
75
|
+
if (useCache && cache[hash]) continue;
|
|
166
76
|
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
if (isZero) {
|
|
170
|
-
console.warn(`⚠️ Zero vector for chunk ${doc.id}:${i}`);
|
|
171
|
-
continue;
|
|
172
|
-
}
|
|
173
|
-
|
|
174
|
-
// Normalize to plain array for storage
|
|
175
|
-
const vector = toPlainArray(rawVector);
|
|
176
|
-
if (!vector) {
|
|
177
|
-
console.warn(`⚠️ Failed to normalize vector for chunk ${doc.id}:${i}`);
|
|
178
|
-
continue;
|
|
179
|
-
}
|
|
77
|
+
const rawVector = await embed(text);
|
|
78
|
+
const vector = Array.from(rawVector);
|
|
180
79
|
|
|
181
|
-
try {
|
|
182
80
|
await vectorStore.upsert({
|
|
183
|
-
id: `${
|
|
81
|
+
id: `${file}:${i}`,
|
|
184
82
|
vector,
|
|
185
83
|
content: text,
|
|
186
|
-
source:
|
|
84
|
+
source: `file:${file}`,
|
|
187
85
|
});
|
|
188
|
-
|
|
189
|
-
if (useCache)
|
|
190
|
-
|
|
191
|
-
}
|
|
192
|
-
console.log(`✅ Upserted ${doc.id}:${i}`);
|
|
193
|
-
} catch (err) {
|
|
194
|
-
console.warn(`⚠️ Upsert failed for ${doc.id}:${i}:`, err.message);
|
|
195
|
-
continue;
|
|
86
|
+
|
|
87
|
+
if (useCache) cache[hash] = true;
|
|
88
|
+
inserted++;
|
|
196
89
|
}
|
|
197
90
|
}
|
|
198
91
|
}
|
|
199
|
-
// ✅ ONLY SAVE CACHE FOR PERSISTENT BACKENDS
|
|
200
|
-
if (useCache) {
|
|
201
|
-
saveCache(cache);
|
|
202
|
-
}
|
|
203
|
-
}
|
|
204
92
|
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
try {
|
|
208
|
-
rawQueryVector = await embedder.embed(query);
|
|
209
|
-
} catch (err) {
|
|
210
|
-
console.error(`❌ Query embedding failed: "${query}"`, err.message);
|
|
211
|
-
return { text: "(Query embedding failed)", meta: { matches: 0 } };
|
|
212
|
-
}
|
|
93
|
+
if (useCache) saveCache(cache);
|
|
94
|
+
if (vectorStore.close) await vectorStore.close();
|
|
213
95
|
|
|
214
|
-
|
|
215
|
-
if (!rawQueryVector || (!Array.isArray(rawQueryVector) && !ArrayBuffer.isView(rawQueryVector))) {
|
|
216
|
-
console.error("❌ Invalid query vector type:", typeof rawQueryVector);
|
|
217
|
-
return { text: "(Invalid query vector)", meta: { matches: 0 } };
|
|
96
|
+
return { inserted, doc_root };
|
|
218
97
|
}
|
|
219
98
|
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
99
|
+
// =====================================================
|
|
100
|
+
// ✅ 2. VECTOR SEARCH
|
|
101
|
+
// =====================================================
|
|
102
|
+
if (action.includes("vector.search")) {
|
|
103
|
+
const query = sanitizeTextForEmbedding(extractQuery(action));
|
|
104
|
+
if (!query) return { text: "", matches: [] };
|
|
225
105
|
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
return { text: "(Failed to normalize query vector)", meta: { matches: 0 } };
|
|
229
|
-
}
|
|
106
|
+
const rawQueryVector = await embed(query);
|
|
107
|
+
const queryVector = Array.from(rawQueryVector);
|
|
230
108
|
|
|
231
|
-
// --- SEARCH ---
|
|
232
|
-
try {
|
|
233
|
-
console.log("🔍 Executing vector search...");
|
|
234
109
|
const results = await vectorStore.query(queryVector, {
|
|
235
110
|
topK: context.topK || 5,
|
|
236
|
-
minScore: context.minScore || 0,
|
|
237
111
|
});
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
112
|
+
|
|
113
|
+
if (vectorStore.close) await vectorStore.close();
|
|
114
|
+
|
|
242
115
|
return formatResults(results, query);
|
|
243
|
-
} catch (err) {
|
|
244
|
-
console.error("❌ Vector search failed:", err.message);
|
|
245
|
-
return { text: "(Search failed)", meta: { matches: 0 } };
|
|
246
116
|
}
|
|
117
|
+
|
|
118
|
+
// =====================================================
|
|
119
|
+
// ❌ REMOVE THIS (legacy)
|
|
120
|
+
// =====================================================
|
|
121
|
+
// if (action.startsWith("Ask doc-search")) { ... }
|
|
122
|
+
|
|
123
|
+
return;
|
|
247
124
|
}
|
|
248
125
|
|
|
126
|
+
resolver.resolverName = "vector";
|
|
127
|
+
resolver.version = "1.0.0";
|
|
128
|
+
|
|
249
129
|
module.exports = resolver;
|
package/src/services/docQA.js
CHANGED
|
@@ -2,20 +2,22 @@ const VectorRouter = require("../adapters/vectorRouter");
|
|
|
2
2
|
const embedder = require("../embeddings/local");
|
|
3
3
|
const extractText = require("../utils/extractText");
|
|
4
4
|
const chunkText = require("../utils/chunker");
|
|
5
|
+
const formatResults = require("../utils/formatResults");
|
|
5
6
|
const fs = require("fs");
|
|
6
7
|
const path = require("path");
|
|
7
8
|
|
|
8
9
|
async function performDocQA(
|
|
9
10
|
query,
|
|
10
11
|
{
|
|
11
|
-
doc_root,
|
|
12
|
-
vectorBackend = "
|
|
12
|
+
doc_root = "./docs",
|
|
13
|
+
vectorBackend = "memory", // 🔥 default to memory like Python fallback
|
|
13
14
|
dimension = 384,
|
|
14
|
-
migrate_on_demand = false,
|
|
15
15
|
POSTGRES_URL,
|
|
16
|
+
topK = 5,
|
|
16
17
|
...config
|
|
17
18
|
} = {}
|
|
18
19
|
) {
|
|
20
|
+
// ── Create vector store
|
|
19
21
|
const store = VectorRouter.create({
|
|
20
22
|
backend: vectorBackend,
|
|
21
23
|
dimension,
|
|
@@ -25,30 +27,65 @@ async function performDocQA(
|
|
|
25
27
|
|
|
26
28
|
const embed = await embedder({ dimension });
|
|
27
29
|
|
|
28
|
-
|
|
29
|
-
|
|
30
|
+
// ─────────────────────────────────────────────
|
|
31
|
+
// 🔥 ALWAYS INGEST (Python parity)
|
|
32
|
+
// ─────────────────────────────────────────────
|
|
33
|
+
if (doc_root && fs.existsSync(doc_root)) {
|
|
34
|
+
const files = fs.readdirSync(doc_root);
|
|
35
|
+
|
|
36
|
+
for (const file of files) {
|
|
30
37
|
const fullPath = path.join(doc_root, file);
|
|
38
|
+
|
|
31
39
|
if (!fs.statSync(fullPath).isFile()) continue;
|
|
40
|
+
if (!file.endsWith(".txt") && !file.endsWith(".md")) continue;
|
|
41
|
+
|
|
42
|
+
try {
|
|
43
|
+
const text = await extractText(fullPath);
|
|
44
|
+
if (!text || !text.trim()) continue;
|
|
45
|
+
|
|
46
|
+
const chunks = chunkText(text, 500, 50) || [text];
|
|
47
|
+
|
|
48
|
+
for (let i = 0; i < chunks.length; i++) {
|
|
49
|
+
const chunk = chunks[i];
|
|
50
|
+
if (!chunk.trim()) continue;
|
|
32
51
|
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
52
|
+
try {
|
|
53
|
+
await store.upsert({
|
|
54
|
+
id: `${file}:${i}`,
|
|
55
|
+
vector: await embed(chunk),
|
|
56
|
+
content: chunk,
|
|
57
|
+
source: `file:${file}`,
|
|
58
|
+
metadata: { chunk: i }
|
|
59
|
+
});
|
|
60
|
+
} catch (err) {
|
|
61
|
+
console.warn("⚠️ Chunk failed:", err.message);
|
|
62
|
+
}
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
} catch (err) {
|
|
66
|
+
console.error("❌ Failed to process file:", file, err.message);
|
|
44
67
|
}
|
|
45
68
|
}
|
|
46
69
|
}
|
|
47
70
|
|
|
48
|
-
|
|
71
|
+
// ─────────────────────────────────────────────
|
|
72
|
+
// 🔍 SEARCH
|
|
73
|
+
// ─────────────────────────────────────────────
|
|
74
|
+
let matches = [];
|
|
75
|
+
|
|
76
|
+
try {
|
|
77
|
+
const queryVector = await embed(query);
|
|
78
|
+
matches = await store.query(queryVector, { topK });
|
|
79
|
+
} catch (err) {
|
|
80
|
+
console.error("❌ Search failed:", err.message);
|
|
81
|
+
}
|
|
49
82
|
|
|
50
83
|
if (store.close) await store.close();
|
|
51
|
-
|
|
84
|
+
|
|
85
|
+
// ─────────────────────────────────────────────
|
|
86
|
+
// ✅ FORMAT LIKE PYTHON
|
|
87
|
+
// ─────────────────────────────────────────────
|
|
88
|
+
return formatResults(matches, query);
|
|
52
89
|
}
|
|
53
90
|
|
|
54
|
-
module.exports = performDocQA;
|
|
91
|
+
module.exports = performDocQA;
|
|
@@ -1,15 +1,14 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
1
|
+
function formatResults(results = [], query = "") {
|
|
2
|
+
const safeResults = Array.isArray(results) ? results : [];
|
|
3
|
+
|
|
4
|
+
const text = safeResults.length
|
|
5
|
+
? safeResults.map(r => r.content).join('\n\n')
|
|
6
|
+
: "";
|
|
7
|
+
|
|
9
8
|
return {
|
|
10
9
|
query,
|
|
11
|
-
text,
|
|
12
|
-
matches:
|
|
10
|
+
text,
|
|
11
|
+
matches: safeResults.map(r => ({
|
|
13
12
|
id: r.id,
|
|
14
13
|
content: r.content,
|
|
15
14
|
source: r.source,
|