@o-lang/semantic-doc-search 1.0.39 → 1.0.41
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +17 -12
- package/package.json +1 -1
- package/src/resolver.js +61 -173
- package/src/services/docQA.js +56 -19
- package/src/utils/formatResults.js +9 -10
package/README.md
CHANGED
|
@@ -1,24 +1,29 @@
|
|
|
1
|
-
# @
|
|
1
|
+
# @o-lang/semantic-doc-search
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
Semantic document retrieval engine for O-Lang workflows.
|
|
4
|
+
|
|
5
|
+
This package provides vector-based document search (RAG retrieval layer) that integrates with O-Lang kernel workflows. It handles document ingestion, chunking, embedding, and similarity search, returning LLM-ready context outputs.
|
|
4
6
|
|
|
5
7
|
---
|
|
6
8
|
|
|
7
9
|
## Features
|
|
8
10
|
|
|
9
|
-
-
|
|
10
|
-
-
|
|
11
|
-
-
|
|
12
|
-
-
|
|
13
|
-
-
|
|
14
|
-
-
|
|
15
|
-
-
|
|
16
|
-
-
|
|
17
|
-
-
|
|
11
|
+
- Semantic vector search using embeddings
|
|
12
|
+
- Document ingestion from local filesystem (`.txt`, `.md`)
|
|
13
|
+
- Automatic text chunking for large documents
|
|
14
|
+
- Pluggable embedding providers (local, OpenAI, Groq, etc.)
|
|
15
|
+
- Multiple vector database support:
|
|
16
|
+
- In-memory store
|
|
17
|
+
- Redis (adapter)
|
|
18
|
+
- PostgreSQL / pgvector (adapter)
|
|
19
|
+
- Pinecone (adapter)
|
|
20
|
+
- Embedding cache support (`embeddings.json`)
|
|
21
|
+
- Normalized LLM-ready output format (`text + matches`)
|
|
22
|
+
- Designed for O-Lang `.ol` workflow integration
|
|
18
23
|
|
|
19
24
|
---
|
|
20
25
|
|
|
21
26
|
## Installation
|
|
22
27
|
|
|
23
28
|
```bash
|
|
24
|
-
npm install @
|
|
29
|
+
npm install @o-lang/semantic-doc-search
|
package/package.json
CHANGED
package/src/resolver.js
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
const VectorRouter = require("./adapters/vectorRouter");
|
|
2
|
-
const embedder = require("./embeddings/local");
|
|
2
|
+
const embedder = require("./embeddings/local");
|
|
3
3
|
const { extractQuery } = require("./utils/extractQuery");
|
|
4
4
|
const { formatResults } = require("./utils/formatResults");
|
|
5
5
|
const fs = require("fs");
|
|
@@ -8,7 +8,9 @@ const crypto = require("crypto");
|
|
|
8
8
|
|
|
9
9
|
const CACHE_PATH = path.join(process.cwd(), "embeddings.json");
|
|
10
10
|
|
|
11
|
-
//
|
|
11
|
+
// ─────────────────────────────────────────────
|
|
12
|
+
// Helpers (UNCHANGED)
|
|
13
|
+
// ─────────────────────────────────────────────
|
|
12
14
|
function loadCache() {
|
|
13
15
|
try {
|
|
14
16
|
if (fs.existsSync(CACHE_PATH)) {
|
|
@@ -24,218 +26,104 @@ function saveCache(cache) {
|
|
|
24
26
|
} catch {}
|
|
25
27
|
}
|
|
26
28
|
|
|
27
|
-
/**
|
|
28
|
-
* Clean text for embedding (defensive)
|
|
29
|
-
*/
|
|
30
29
|
function sanitizeTextForEmbedding(text) {
|
|
31
30
|
if (typeof text !== "string") return "";
|
|
32
|
-
// Remove wrapping quotes and extra whitespace
|
|
33
31
|
return text.replace(/^["']|["']$/g, "").trim();
|
|
34
32
|
}
|
|
35
33
|
|
|
36
|
-
/**
|
|
37
|
-
* Load documents from doc_root if provided
|
|
38
|
-
*/
|
|
39
|
-
function loadDocumentsFromContext(context) {
|
|
40
|
-
if (context.documents && Array.isArray(context.documents)) {
|
|
41
|
-
return context.documents;
|
|
42
|
-
}
|
|
43
|
-
|
|
44
|
-
if (context.doc_root) {
|
|
45
|
-
const baseDir = path.resolve(process.cwd(), context.doc_root);
|
|
46
|
-
if (fs.existsSync(baseDir)) {
|
|
47
|
-
const files = fs.readdirSync(baseDir).filter(f => f.endsWith('.txt') || f.endsWith('.md'));
|
|
48
|
-
const docs = files.map(file => {
|
|
49
|
-
try {
|
|
50
|
-
const content = fs.readFileSync(path.join(baseDir, file), 'utf8');
|
|
51
|
-
return { id: file, content, source: `file:${file}` };
|
|
52
|
-
} catch (err) {
|
|
53
|
-
console.warn(`⚠️ Failed to read ${file}:`, err.message);
|
|
54
|
-
return null;
|
|
55
|
-
}
|
|
56
|
-
}).filter(Boolean);
|
|
57
|
-
|
|
58
|
-
console.log(`📄 Loaded ${docs.length} document(s) from ${baseDir}`);
|
|
59
|
-
docs.forEach(d => console.log(` - ${d.id} (${d.content?.length || 0} chars)`));
|
|
60
|
-
return docs;
|
|
61
|
-
}
|
|
62
|
-
}
|
|
63
|
-
|
|
64
|
-
return [];
|
|
65
|
-
}
|
|
66
|
-
|
|
67
|
-
/**
|
|
68
|
-
* Convert any array-like (Float32Array, etc.) to plain JS array
|
|
69
|
-
*/
|
|
70
|
-
function toPlainArray(input) {
|
|
71
|
-
if (!input) return null;
|
|
72
|
-
if (Array.isArray(input)) return input;
|
|
73
|
-
if (ArrayBuffer.isView(input)) return Array.from(input);
|
|
74
|
-
return null;
|
|
75
|
-
}
|
|
76
|
-
|
|
77
|
-
/**
|
|
78
|
-
* Hash text for caching
|
|
79
|
-
*/
|
|
80
34
|
function hashText(str) {
|
|
81
35
|
return crypto.createHash("sha256").update(str).digest("hex");
|
|
82
36
|
}
|
|
83
37
|
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
38
|
+
// ─────────────────────────────────────────────
|
|
39
|
+
// 🔥 MAIN RESOLVER
|
|
40
|
+
// ─────────────────────────────────────────────
|
|
87
41
|
async function resolver(action, context = {}) {
|
|
88
42
|
if (typeof action !== "string") return;
|
|
89
|
-
if (!action.toLowerCase().startsWith("ask doc-search")) return;
|
|
90
|
-
|
|
91
|
-
let query = extractQuery(action);
|
|
92
|
-
query = sanitizeTextForEmbedding(query);
|
|
93
|
-
if (!query) return { text: "(Empty query)", meta: { matches: 0 } };
|
|
94
43
|
|
|
95
|
-
// Vector backend
|
|
96
44
|
const vectorStore = VectorRouter.create(context);
|
|
45
|
+
const embed = await embedder({ dimension: 384 });
|
|
97
46
|
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
}
|
|
47
|
+
const doc_root = context.doc_root || "./docs";
|
|
48
|
+
const useCache = !!context.POSTGRES_URL || !!context.REDIS_URL;
|
|
49
|
+
const cache = useCache ? loadCache() : {};
|
|
102
50
|
|
|
103
|
-
//
|
|
104
|
-
|
|
105
|
-
|
|
51
|
+
// =====================================================
|
|
52
|
+
// ✅ 1. VECTOR INSERT (INGEST)
|
|
53
|
+
// =====================================================
|
|
54
|
+
if (action.includes("vector.insert")) {
|
|
55
|
+
let inserted = 0;
|
|
106
56
|
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
// --- Document ingestion ---
|
|
111
|
-
if (documents.length > 0) {
|
|
112
|
-
if (!vectorStore.supports("vector.insert")) {
|
|
113
|
-
throw new Error("Vector backend does not support vector.insert");
|
|
114
|
-
}
|
|
115
|
-
|
|
116
|
-
for (const doc of documents) {
|
|
117
|
-
console.log("📄 Processing doc:", doc.id, "content length:", doc.content?.length);
|
|
118
|
-
|
|
119
|
-
if (!doc?.content) {
|
|
120
|
-
console.warn("⚠️ Skipping empty doc:", doc?.id);
|
|
121
|
-
continue;
|
|
122
|
-
}
|
|
123
|
-
|
|
124
|
-
// ✅ CORRECT PATH: Adjust if chunker.js is in src/utils/
|
|
125
|
-
let chunks;
|
|
126
|
-
try {
|
|
127
|
-
const chunkText = require("./utils/chunker.js").chunkText;
|
|
128
|
-
chunks = chunkText(doc.content, 500) || [doc.content];
|
|
129
|
-
console.log("📦", doc.id, "split into", chunks.length, "chunks");
|
|
130
|
-
} catch (err) {
|
|
131
|
-
console.warn("⚠️ Chunking failed, using full doc:", err.message);
|
|
132
|
-
chunks = [doc.content];
|
|
133
|
-
}
|
|
57
|
+
if (fs.existsSync(doc_root)) {
|
|
58
|
+
const files = fs.readdirSync(doc_root);
|
|
134
59
|
|
|
135
|
-
for (
|
|
136
|
-
const
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
if (!text) {
|
|
140
|
-
console.warn(`⚠️ Skipping empty chunk ${doc.id}:${i}`);
|
|
141
|
-
continue;
|
|
142
|
-
}
|
|
60
|
+
for (const file of files) {
|
|
61
|
+
const fullPath = path.join(doc_root, file);
|
|
62
|
+
if (!fs.statSync(fullPath).isFile()) continue;
|
|
143
63
|
|
|
144
|
-
const
|
|
145
|
-
if (
|
|
146
|
-
console.log(`⏭️ Skipping already ingested chunk ${doc.id}:${i}`);
|
|
147
|
-
continue;
|
|
148
|
-
}
|
|
64
|
+
const content = fs.readFileSync(fullPath, "utf8");
|
|
65
|
+
if (!content) continue;
|
|
149
66
|
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
try {
|
|
153
|
-
rawVector = await embedder.embed(text);
|
|
154
|
-
} catch (err) {
|
|
155
|
-
console.warn(`⚠️ Embedding failed for chunk ${doc.id}:${i} ("${text.slice(0, 30)}..."):`, err.message);
|
|
156
|
-
continue;
|
|
157
|
-
}
|
|
67
|
+
const chunkText = require("./utils/chunker").chunkText;
|
|
68
|
+
const chunks = chunkText(content, 500, 50);
|
|
158
69
|
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
continue;
|
|
163
|
-
}
|
|
70
|
+
for (let i = 0; i < chunks.length; i++) {
|
|
71
|
+
const text = sanitizeTextForEmbedding(chunks[i]);
|
|
72
|
+
if (!text) continue;
|
|
164
73
|
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
if (isZero) {
|
|
168
|
-
console.warn(`⚠️ Zero vector for chunk ${doc.id}:${i}`);
|
|
169
|
-
continue;
|
|
170
|
-
}
|
|
74
|
+
const hash = hashText(text);
|
|
75
|
+
if (useCache && cache[hash]) continue;
|
|
171
76
|
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
if (!vector) {
|
|
175
|
-
console.warn(`⚠️ Failed to normalize vector for chunk ${doc.id}:${i}`);
|
|
176
|
-
continue;
|
|
177
|
-
}
|
|
77
|
+
const rawVector = await embed(text);
|
|
78
|
+
const vector = Array.from(rawVector);
|
|
178
79
|
|
|
179
|
-
try {
|
|
180
80
|
await vectorStore.upsert({
|
|
181
|
-
id: `${
|
|
81
|
+
id: `${file}:${i}`,
|
|
182
82
|
vector,
|
|
183
83
|
content: text,
|
|
184
|
-
source:
|
|
84
|
+
source: `file:${file}`,
|
|
185
85
|
});
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
console.warn(`⚠️ Upsert failed for ${doc.id}:${i}:`, err.message);
|
|
190
|
-
continue;
|
|
86
|
+
|
|
87
|
+
if (useCache) cache[hash] = true;
|
|
88
|
+
inserted++;
|
|
191
89
|
}
|
|
192
90
|
}
|
|
193
91
|
}
|
|
194
|
-
saveCache(cache);
|
|
195
|
-
}
|
|
196
92
|
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
try {
|
|
200
|
-
rawQueryVector = await embedder.embed(query);
|
|
201
|
-
} catch (err) {
|
|
202
|
-
console.error(`❌ Query embedding failed: "${query}"`, err.message);
|
|
203
|
-
return { text: "(Query embedding failed)", meta: { matches: 0 } };
|
|
204
|
-
}
|
|
93
|
+
if (useCache) saveCache(cache);
|
|
94
|
+
if (vectorStore.close) await vectorStore.close();
|
|
205
95
|
|
|
206
|
-
|
|
207
|
-
if (!rawQueryVector || (!Array.isArray(rawQueryVector) && !ArrayBuffer.isView(rawQueryVector))) {
|
|
208
|
-
console.error("❌ Invalid query vector type:", typeof rawQueryVector);
|
|
209
|
-
return { text: "(Invalid query vector)", meta: { matches: 0 } };
|
|
96
|
+
return { inserted, doc_root };
|
|
210
97
|
}
|
|
211
98
|
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
99
|
+
// =====================================================
|
|
100
|
+
// ✅ 2. VECTOR SEARCH
|
|
101
|
+
// =====================================================
|
|
102
|
+
if (action.includes("vector.search")) {
|
|
103
|
+
const query = sanitizeTextForEmbedding(extractQuery(action));
|
|
104
|
+
if (!query) return { text: "", matches: [] };
|
|
217
105
|
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
return { text: "(Failed to normalize query vector)", meta: { matches: 0 } };
|
|
221
|
-
}
|
|
106
|
+
const rawQueryVector = await embed(query);
|
|
107
|
+
const queryVector = Array.from(rawQueryVector);
|
|
222
108
|
|
|
223
|
-
// --- SEARCH ---
|
|
224
|
-
try {
|
|
225
|
-
console.log("🔍 Executing vector search...");
|
|
226
109
|
const results = await vectorStore.query(queryVector, {
|
|
227
110
|
topK: context.topK || 5,
|
|
228
|
-
minScore: context.minScore || 0,
|
|
229
111
|
});
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
112
|
+
|
|
113
|
+
if (vectorStore.close) await vectorStore.close();
|
|
114
|
+
|
|
234
115
|
return formatResults(results, query);
|
|
235
|
-
} catch (err) {
|
|
236
|
-
console.error("❌ Vector search failed:", err.message);
|
|
237
|
-
return { text: "(Search failed)", meta: { matches: 0 } };
|
|
238
116
|
}
|
|
117
|
+
|
|
118
|
+
// =====================================================
|
|
119
|
+
// ❌ REMOVE THIS (legacy)
|
|
120
|
+
// =====================================================
|
|
121
|
+
// if (action.startsWith("Ask doc-search")) { ... }
|
|
122
|
+
|
|
123
|
+
return;
|
|
239
124
|
}
|
|
240
125
|
|
|
126
|
+
resolver.resolverName = "vector";
|
|
127
|
+
resolver.version = "1.0.0";
|
|
128
|
+
|
|
241
129
|
module.exports = resolver;
|
package/src/services/docQA.js
CHANGED
|
@@ -2,20 +2,22 @@ const VectorRouter = require("../adapters/vectorRouter");
|
|
|
2
2
|
const embedder = require("../embeddings/local");
|
|
3
3
|
const extractText = require("../utils/extractText");
|
|
4
4
|
const chunkText = require("../utils/chunker");
|
|
5
|
+
const formatResults = require("../utils/formatResults");
|
|
5
6
|
const fs = require("fs");
|
|
6
7
|
const path = require("path");
|
|
7
8
|
|
|
8
9
|
async function performDocQA(
|
|
9
10
|
query,
|
|
10
11
|
{
|
|
11
|
-
doc_root,
|
|
12
|
-
vectorBackend = "
|
|
12
|
+
doc_root = "./docs",
|
|
13
|
+
vectorBackend = "memory", // 🔥 default to memory like Python fallback
|
|
13
14
|
dimension = 384,
|
|
14
|
-
migrate_on_demand = false,
|
|
15
15
|
POSTGRES_URL,
|
|
16
|
+
topK = 5,
|
|
16
17
|
...config
|
|
17
18
|
} = {}
|
|
18
19
|
) {
|
|
20
|
+
// ── Create vector store
|
|
19
21
|
const store = VectorRouter.create({
|
|
20
22
|
backend: vectorBackend,
|
|
21
23
|
dimension,
|
|
@@ -25,30 +27,65 @@ async function performDocQA(
|
|
|
25
27
|
|
|
26
28
|
const embed = await embedder({ dimension });
|
|
27
29
|
|
|
28
|
-
|
|
29
|
-
|
|
30
|
+
// ─────────────────────────────────────────────
|
|
31
|
+
// 🔥 ALWAYS INGEST (Python parity)
|
|
32
|
+
// ─────────────────────────────────────────────
|
|
33
|
+
if (doc_root && fs.existsSync(doc_root)) {
|
|
34
|
+
const files = fs.readdirSync(doc_root);
|
|
35
|
+
|
|
36
|
+
for (const file of files) {
|
|
30
37
|
const fullPath = path.join(doc_root, file);
|
|
38
|
+
|
|
31
39
|
if (!fs.statSync(fullPath).isFile()) continue;
|
|
40
|
+
if (!file.endsWith(".txt") && !file.endsWith(".md")) continue;
|
|
41
|
+
|
|
42
|
+
try {
|
|
43
|
+
const text = await extractText(fullPath);
|
|
44
|
+
if (!text || !text.trim()) continue;
|
|
45
|
+
|
|
46
|
+
const chunks = chunkText(text, 500, 50) || [text];
|
|
47
|
+
|
|
48
|
+
for (let i = 0; i < chunks.length; i++) {
|
|
49
|
+
const chunk = chunks[i];
|
|
50
|
+
if (!chunk.trim()) continue;
|
|
32
51
|
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
52
|
+
try {
|
|
53
|
+
await store.upsert({
|
|
54
|
+
id: `${file}:${i}`,
|
|
55
|
+
vector: await embed(chunk),
|
|
56
|
+
content: chunk,
|
|
57
|
+
source: `file:${file}`,
|
|
58
|
+
metadata: { chunk: i }
|
|
59
|
+
});
|
|
60
|
+
} catch (err) {
|
|
61
|
+
console.warn("⚠️ Chunk failed:", err.message);
|
|
62
|
+
}
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
} catch (err) {
|
|
66
|
+
console.error("❌ Failed to process file:", file, err.message);
|
|
44
67
|
}
|
|
45
68
|
}
|
|
46
69
|
}
|
|
47
70
|
|
|
48
|
-
|
|
71
|
+
// ─────────────────────────────────────────────
|
|
72
|
+
// 🔍 SEARCH
|
|
73
|
+
// ─────────────────────────────────────────────
|
|
74
|
+
let matches = [];
|
|
75
|
+
|
|
76
|
+
try {
|
|
77
|
+
const queryVector = await embed(query);
|
|
78
|
+
matches = await store.query(queryVector, { topK });
|
|
79
|
+
} catch (err) {
|
|
80
|
+
console.error("❌ Search failed:", err.message);
|
|
81
|
+
}
|
|
49
82
|
|
|
50
83
|
if (store.close) await store.close();
|
|
51
|
-
|
|
84
|
+
|
|
85
|
+
// ─────────────────────────────────────────────
|
|
86
|
+
// ✅ FORMAT LIKE PYTHON
|
|
87
|
+
// ─────────────────────────────────────────────
|
|
88
|
+
return formatResults(matches, query);
|
|
52
89
|
}
|
|
53
90
|
|
|
54
|
-
module.exports = performDocQA;
|
|
91
|
+
module.exports = performDocQA;
|
package/src/utils/formatResults.js
CHANGED
|
@@ -1,15 +1,14 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
1
|
+
function formatResults(results = [], query = "") {
|
|
2
|
+
const safeResults = Array.isArray(results) ? results : [];
|
|
3
|
+
|
|
4
|
+
const text = safeResults.length
|
|
5
|
+
? safeResults.map(r => r.content).join('\n\n')
|
|
6
|
+
: "";
|
|
7
|
+
|
|
9
8
|
return {
|
|
10
9
|
query,
|
|
11
|
-
text,
|
|
12
|
-
matches:
|
|
10
|
+
text,
|
|
11
|
+
matches: safeResults.map(r => ({
|
|
13
12
|
id: r.id,
|
|
14
13
|
content: r.content,
|
|
15
14
|
source: r.source,
|