@o-lang/semantic-doc-search 1.0.40 → 1.0.41

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,24 +1,29 @@
1
- # @olang/semantic-doc-search
1
+ # @o-lang/semantic-doc-search
2
2
 
3
- O-lang Semantic Document Search Resolver with hybrid search, embeddings, rerank, and streaming.
3
+ Semantic document retrieval engine for O-Lang workflows.
4
+
5
+ This package provides vector-based document search (RAG retrieval layer) that integrates with O-Lang kernel workflows. It handles document ingestion, chunking, embedding, and similarity search, returning LLM-ready context outputs.
4
6
 
5
7
  ---
6
8
 
7
9
  ## Features
8
10
 
9
- - Hybrid search (semantic embeddings + lexical scoring)
10
- - Supports multiple file types: `.txt`, `.md`, `.pdf`, `.html`, `.docx`
11
- - Streaming token-by-token output via SSE
12
- - Reranking support (Cohere, Groq, local)
13
- - Flexible vector adapters: in-memory, Redis, Pinecone
14
- - Prebuilt prompt templates: summarize, short-answer, bullet-points, cite-sources
15
- - CLI for quick testing
16
- - Safe path resolution & chunking for long documents
17
- - Persistent embeddings cache (`embeddings.json`)
11
+ - Semantic vector search using embeddings
12
+ - Document ingestion from local filesystem (`.txt`, `.md`)
13
+ - Automatic text chunking for large documents
14
+ - Pluggable embedding providers (local, OpenAI, Groq, etc.)
15
+ - Multiple vector database support:
16
+ - In-memory store
17
+ - Redis (adapter)
18
+ - PostgreSQL / pgvector (adapter)
19
+ - Pinecone (adapter)
20
+ - Embedding cache support (`embeddings.json`)
21
+ - Normalized LLM-ready output format (`text + matches`)
22
+ - Designed for O-Lang `.ol` workflow integration
18
23
 
19
24
  ---
20
25
 
21
26
  ## Installation
22
27
 
23
28
  ```bash
24
- npm install @olang/semantic-doc-search
29
+ npm install @o-lang/semantic-doc-search
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@o-lang/semantic-doc-search",
3
- "version": "1.0.40",
3
+ "version": "1.0.41",
4
4
  "description": "O-lang Semantic Document Search Resolver with hybrid search, embeddings, rerank, and streaming.",
5
5
  "main": "src/index.js",
6
6
  "type": "commonjs",
package/src/resolver.js CHANGED
@@ -1,5 +1,5 @@
1
1
  const VectorRouter = require("./adapters/vectorRouter");
2
- const embedder = require("./embeddings/local"); // singleton embedder
2
+ const embedder = require("./embeddings/local");
3
3
  const { extractQuery } = require("./utils/extractQuery");
4
4
  const { formatResults } = require("./utils/formatResults");
5
5
  const fs = require("fs");
@@ -8,7 +8,9 @@ const crypto = require("crypto");
8
8
 
9
9
  const CACHE_PATH = path.join(process.cwd(), "embeddings.json");
10
10
 
11
- // Load cache for ingestion guard
11
+ // ─────────────────────────────────────────────
12
+ // Helpers (UNCHANGED)
13
+ // ─────────────────────────────────────────────
12
14
  function loadCache() {
13
15
  try {
14
16
  if (fs.existsSync(CACHE_PATH)) {
@@ -24,226 +26,104 @@ function saveCache(cache) {
24
26
  } catch {}
25
27
  }
26
28
 
27
- /**
28
- * Clean text for embedding (defensive)
29
- */
30
29
  function sanitizeTextForEmbedding(text) {
31
30
  if (typeof text !== "string") return "";
32
- // Remove wrapping quotes and extra whitespace
33
31
  return text.replace(/^["']|["']$/g, "").trim();
34
32
  }
35
33
 
36
- /**
37
- * Load documents from doc_root if provided
38
- */
39
- function loadDocumentsFromContext(context) {
40
- if (context.documents && Array.isArray(context.documents)) {
41
- return context.documents;
42
- }
43
-
44
- if (context.doc_root) {
45
- const baseDir = path.resolve(process.cwd(), context.doc_root);
46
- if (fs.existsSync(baseDir)) {
47
- const files = fs.readdirSync(baseDir).filter(f => f.endsWith('.txt') || f.endsWith('.md'));
48
- const docs = files.map(file => {
49
- try {
50
- const content = fs.readFileSync(path.join(baseDir, file), 'utf8');
51
- return { id: file, content, source: `file:${file}` };
52
- } catch (err) {
53
- console.warn(`⚠️ Failed to read ${file}:`, err.message);
54
- return null;
55
- }
56
- }).filter(Boolean);
57
-
58
- console.log(`📄 Loaded ${docs.length} document(s) from ${baseDir}`);
59
- docs.forEach(d => console.log(` - ${d.id} (${d.content?.length || 0} chars)`));
60
- return docs;
61
- }
62
- }
63
-
64
- return [];
65
- }
66
-
67
- /**
68
- * Convert any array-like (Float32Array, etc.) to plain JS array
69
- */
70
- function toPlainArray(input) {
71
- if (!input) return null;
72
- if (Array.isArray(input)) return input;
73
- if (ArrayBuffer.isView(input)) return Array.from(input);
74
- return null;
75
- }
76
-
77
- /**
78
- * Hash text for caching
79
- */
80
34
  function hashText(str) {
81
35
  return crypto.createHash("sha256").update(str).digest("hex");
82
36
  }
83
37
 
84
- /**
85
- * Semantic Doc Search Resolver
86
- */
38
+ // ─────────────────────────────────────────────
39
+ // 🔥 MAIN RESOLVER
40
+ // ─────────────────────────────────────────────
87
41
  async function resolver(action, context = {}) {
88
42
  if (typeof action !== "string") return;
89
- if (!action.toLowerCase().startsWith("ask doc-search")) return;
90
43
 
91
- let query = extractQuery(action);
92
- query = sanitizeTextForEmbedding(query);
93
- if (!query) return { text: "(Empty query)", meta: { matches: 0 } };
94
-
95
- // Vector backend
96
44
  const vectorStore = VectorRouter.create(context);
45
+ const embed = await embedder({ dimension: 384 });
97
46
 
98
- // Ensure backend supports search
99
- if (!vectorStore.supports("vector.search")) {
100
- throw new Error("Vector backend does not support vector.search");
101
- }
102
-
103
- // Load documents (from context.documents OR doc_root)
104
- const documents = loadDocumentsFromContext(context);
105
- console.log("🔄 Starting ingestion for", documents.length, "documents");
106
-
107
- // ✅ ONLY USE CACHE FOR PERSISTENT BACKENDS
47
+ const doc_root = context.doc_root || "./docs";
108
48
  const useCache = !!context.POSTGRES_URL || !!context.REDIS_URL;
109
49
  const cache = useCache ? loadCache() : {};
110
50
 
111
- // --- Document ingestion ---
112
- if (documents.length > 0) {
113
- if (!vectorStore.supports("vector.insert")) {
114
- throw new Error("Vector backend does not support vector.insert");
115
- }
51
+ // =====================================================
52
+ // ✅ 1. VECTOR INSERT (INGEST)
53
+ // =====================================================
54
+ if (action.includes("vector.insert")) {
55
+ let inserted = 0;
116
56
 
117
- for (const doc of documents) {
118
- console.log("📄 Processing doc:", doc.id, "content length:", doc.content?.length);
119
-
120
- if (!doc?.content) {
121
- console.warn("⚠️ Skipping empty doc:", doc?.id);
122
- continue;
123
- }
57
+ if (fs.existsSync(doc_root)) {
58
+ const files = fs.readdirSync(doc_root);
124
59
 
125
- // CORRECT PATH: Adjust if chunker.js is in src/utils/
126
- let chunks;
127
- try {
128
- const chunkText = require("./utils/chunker.js").chunkText;
129
- chunks = chunkText(doc.content, 500) || [doc.content];
130
- console.log("📦", doc.id, "split into", chunks.length, "chunks");
131
- } catch (err) {
132
- console.warn("⚠️ Chunking failed, using full doc:", err.message);
133
- chunks = [doc.content];
134
- }
60
+ for (const file of files) {
61
+ const fullPath = path.join(doc_root, file);
62
+ if (!fs.statSync(fullPath).isFile()) continue;
135
63
 
136
- for (let i = 0; i < chunks.length; i++) {
137
- const text = sanitizeTextForEmbedding(chunks[i]);
138
- console.log("🧩 Chunk", `${doc.id}:${i}`, "text:", JSON.stringify(text));
139
-
140
- if (!text) {
141
- console.warn(`⚠️ Skipping empty chunk ${doc.id}:${i}`);
142
- continue;
143
- }
64
+ const content = fs.readFileSync(fullPath, "utf8");
65
+ if (!content) continue;
144
66
 
145
- const hash = hashText(text);
146
- // ONLY CHECK CACHE FOR PERSISTENT BACKENDS
147
- if (useCache && cache[hash]) {
148
- console.log(`⏭️ Skipping already ingested chunk ${doc.id}:${i}`);
149
- continue;
150
- }
67
+ const chunkText = require("./utils/chunker").chunkText;
68
+ const chunks = chunkText(content, 500, 50);
151
69
 
152
- // 🔒 DEFENSIVE EMBEDDING
153
- let rawVector;
154
- try {
155
- rawVector = await embedder.embed(text);
156
- } catch (err) {
157
- console.warn(`⚠️ Embedding failed for chunk ${doc.id}:${i} ("${text.slice(0, 30)}..."):`, err.message);
158
- continue;
159
- }
70
+ for (let i = 0; i < chunks.length; i++) {
71
+ const text = sanitizeTextForEmbedding(chunks[i]);
72
+ if (!text) continue;
160
73
 
161
- // Validate vector type
162
- if (!rawVector || (!Array.isArray(rawVector) && !ArrayBuffer.isView(rawVector))) {
163
- console.warn(`⚠️ Invalid vector type for chunk ${doc.id}:${i}:`, typeof rawVector);
164
- continue;
165
- }
74
+ const hash = hashText(text);
75
+ if (useCache && cache[hash]) continue;
166
76
 
167
- // Check for zero vectors
168
- const isZero = rawVector.every?.(v => v === 0) || false;
169
- if (isZero) {
170
- console.warn(`⚠️ Zero vector for chunk ${doc.id}:${i}`);
171
- continue;
172
- }
173
-
174
- // Normalize to plain array for storage
175
- const vector = toPlainArray(rawVector);
176
- if (!vector) {
177
- console.warn(`⚠️ Failed to normalize vector for chunk ${doc.id}:${i}`);
178
- continue;
179
- }
77
+ const rawVector = await embed(text);
78
+ const vector = Array.from(rawVector);
180
79
 
181
- try {
182
80
  await vectorStore.upsert({
183
- id: `${doc.id}:${i}`,
81
+ id: `${file}:${i}`,
184
82
  vector,
185
83
  content: text,
186
- source: doc.source,
84
+ source: `file:${file}`,
187
85
  });
188
- // ✅ ONLY UPDATE CACHE FOR PERSISTENT BACKENDS
189
- if (useCache) {
190
- cache[hash] = true;
191
- }
192
- console.log(`✅ Upserted ${doc.id}:${i}`);
193
- } catch (err) {
194
- console.warn(`⚠️ Upsert failed for ${doc.id}:${i}:`, err.message);
195
- continue;
86
+
87
+ if (useCache) cache[hash] = true;
88
+ inserted++;
196
89
  }
197
90
  }
198
91
  }
199
- // ✅ ONLY SAVE CACHE FOR PERSISTENT BACKENDS
200
- if (useCache) {
201
- saveCache(cache);
202
- }
203
- }
204
92
 
205
- // --- QUERY EMBEDDING ---
206
- let rawQueryVector;
207
- try {
208
- rawQueryVector = await embedder.embed(query);
209
- } catch (err) {
210
- console.error(`❌ Query embedding failed: "${query}"`, err.message);
211
- return { text: "(Query embedding failed)", meta: { matches: 0 } };
212
- }
93
+ if (useCache) saveCache(cache);
94
+ if (vectorStore.close) await vectorStore.close();
213
95
 
214
- // FULLY COMPLETE VALIDATION LINE
215
- if (!rawQueryVector || (!Array.isArray(rawQueryVector) && !ArrayBuffer.isView(rawQueryVector))) {
216
- console.error("❌ Invalid query vector type:", typeof rawQueryVector);
217
- return { text: "(Invalid query vector)", meta: { matches: 0 } };
96
+ return { inserted, doc_root };
218
97
  }
219
98
 
220
- const isZeroQuery = rawQueryVector.every?.(v => v === 0) || false;
221
- if (isZeroQuery) {
222
- console.warn("⚠️ Zero vector for query");
223
- return { text: "(Query produced zero vector)", meta: { matches: 0 } };
224
- }
99
+ // =====================================================
100
+ // 2. VECTOR SEARCH
101
+ // =====================================================
102
+ if (action.includes("vector.search")) {
103
+ const query = sanitizeTextForEmbedding(extractQuery(action));
104
+ if (!query) return { text: "", matches: [] };
225
105
 
226
- const queryVector = toPlainArray(rawQueryVector);
227
- if (!queryVector) {
228
- return { text: "(Failed to normalize query vector)", meta: { matches: 0 } };
229
- }
106
+ const rawQueryVector = await embed(query);
107
+ const queryVector = Array.from(rawQueryVector);
230
108
 
231
- // --- SEARCH ---
232
- try {
233
- console.log("🔍 Executing vector search...");
234
109
  const results = await vectorStore.query(queryVector, {
235
110
  topK: context.topK || 5,
236
- minScore: context.minScore || 0,
237
111
  });
238
-
239
- console.log("📊 Raw search results:", results.length);
240
- results.forEach((r, i) => console.log(` ${i}: score=${r.score?.toFixed(4)}, content="${r.content?.substring(0, 50)}..."`));
241
-
112
+
113
+ if (vectorStore.close) await vectorStore.close();
114
+
242
115
  return formatResults(results, query);
243
- } catch (err) {
244
- console.error("❌ Vector search failed:", err.message);
245
- return { text: "(Search failed)", meta: { matches: 0 } };
246
116
  }
117
+
118
+ // =====================================================
119
+ // ❌ REMOVE THIS (legacy)
120
+ // =====================================================
121
+ // if (action.startsWith("Ask doc-search")) { ... }
122
+
123
+ return;
247
124
  }
248
125
 
126
+ resolver.resolverName = "vector";
127
+ resolver.version = "1.0.0";
128
+
249
129
  module.exports = resolver;
@@ -2,20 +2,22 @@ const VectorRouter = require("../adapters/vectorRouter");
2
2
  const embedder = require("../embeddings/local");
3
3
  const extractText = require("../utils/extractText");
4
4
  const chunkText = require("../utils/chunker");
5
+ const formatResults = require("../utils/formatResults");
5
6
  const fs = require("fs");
6
7
  const path = require("path");
7
8
 
8
9
  async function performDocQA(
9
10
  query,
10
11
  {
11
- doc_root,
12
- vectorBackend = "pgvector",
12
+ doc_root = "./docs",
13
+ vectorBackend = "memory", // 🔥 default to memory like Python fallback
13
14
  dimension = 384,
14
- migrate_on_demand = false,
15
15
  POSTGRES_URL,
16
+ topK = 5,
16
17
  ...config
17
18
  } = {}
18
19
  ) {
20
+ // ── Create vector store
19
21
  const store = VectorRouter.create({
20
22
  backend: vectorBackend,
21
23
  dimension,
@@ -25,30 +27,65 @@ async function performDocQA(
25
27
 
26
28
  const embed = await embedder({ dimension });
27
29
 
28
- if (migrate_on_demand && doc_root) {
29
- for (const file of fs.readdirSync(doc_root)) {
30
+ // ─────────────────────────────────────────────
31
+ // 🔥 ALWAYS INGEST (Python parity)
32
+ // ─────────────────────────────────────────────
33
+ if (doc_root && fs.existsSync(doc_root)) {
34
+ const files = fs.readdirSync(doc_root);
35
+
36
+ for (const file of files) {
30
37
  const fullPath = path.join(doc_root, file);
38
+
31
39
  if (!fs.statSync(fullPath).isFile()) continue;
40
+ if (!file.endsWith(".txt") && !file.endsWith(".md")) continue;
41
+
42
+ try {
43
+ const text = await extractText(fullPath);
44
+ if (!text || !text.trim()) continue;
45
+
46
+ const chunks = chunkText(text, 500, 50) || [text];
47
+
48
+ for (let i = 0; i < chunks.length; i++) {
49
+ const chunk = chunks[i];
50
+ if (!chunk.trim()) continue;
32
51
 
33
- const text = await extractText(fullPath);
34
- const chunks = chunkText(text);
35
-
36
- for (let i = 0; i < chunks.length; i++) {
37
- await store.upsert({
38
- id: `${file}-${i}`,
39
- vector: await embed(chunks[i]),
40
- content: chunks[i],
41
- source: file,
42
- metadata: { chunk: i }
43
- });
52
+ try {
53
+ await store.upsert({
54
+ id: `${file}:${i}`,
55
+ vector: await embed(chunk),
56
+ content: chunk,
57
+ source: `file:${file}`,
58
+ metadata: { chunk: i }
59
+ });
60
+ } catch (err) {
61
+ console.warn("⚠️ Chunk failed:", err.message);
62
+ }
63
+ }
64
+
65
+ } catch (err) {
66
+ console.error("❌ Failed to process file:", file, err.message);
44
67
  }
45
68
  }
46
69
  }
47
70
 
48
- const results = await store.query(await embed(query), { topK: 5 });
71
+ // ─────────────────────────────────────────────
72
+ // 🔍 SEARCH
73
+ // ─────────────────────────────────────────────
74
+ let matches = [];
75
+
76
+ try {
77
+ const queryVector = await embed(query);
78
+ matches = await store.query(queryVector, { topK });
79
+ } catch (err) {
80
+ console.error("❌ Search failed:", err.message);
81
+ }
49
82
 
50
83
  if (store.close) await store.close();
51
- return results;
84
+
85
+ // ─────────────────────────────────────────────
86
+ // ✅ FORMAT LIKE PYTHON
87
+ // ─────────────────────────────────────────────
88
+ return formatResults(matches, query);
52
89
  }
53
90
 
54
- module.exports = performDocQA;
91
+ module.exports = performDocQA;
@@ -1,15 +1,14 @@
1
- /**
2
- * Normalizes vector search results for O-Lang workflows.
3
- * Returns both structured matches AND a plain .text field for LLM prompts.
4
- */
5
- function formatResults(results = [], query) {
6
- // ✅ Generate plain text from all matches
7
- const text = results.map(r => r.content).join('\n\n');
8
-
1
+ function formatResults(results = [], query = "") {
2
+ const safeResults = Array.isArray(results) ? results : [];
3
+
4
+ const text = safeResults.length
5
+ ? safeResults.map(r => r.content).join('\n\n')
6
+ : "";
7
+
9
8
  return {
10
9
  query,
11
- text, // ← THIS IS THE KEY ADDITION
12
- matches: results.map(r => ({
10
+ text,
11
+ matches: safeResults.map(r => ({
13
12
  id: r.id,
14
13
  content: r.content,
15
14
  source: r.source,