@o-lang/semantic-doc-search 1.0.39 → 1.0.41

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,24 +1,29 @@
1
- # @olang/semantic-doc-search
1
+ # @o-lang/semantic-doc-search
2
2
 
3
- O-lang Semantic Document Search Resolver with hybrid search, embeddings, rerank, and streaming.
3
+ Semantic document retrieval engine for O-Lang workflows.
4
+
5
+ This package provides vector-based document search (RAG retrieval layer) that integrates with O-Lang kernel workflows. It handles document ingestion, chunking, embedding, and similarity search, returning LLM-ready context outputs.
4
6
 
5
7
  ---
6
8
 
7
9
  ## Features
8
10
 
9
- - Hybrid search (semantic embeddings + lexical scoring)
10
- - Supports multiple file types: `.txt`, `.md`, `.pdf`, `.html`, `.docx`
11
- - Streaming token-by-token output via SSE
12
- - Reranking support (Cohere, Groq, local)
13
- - Flexible vector adapters: in-memory, Redis, Pinecone
14
- - Prebuilt prompt templates: summarize, short-answer, bullet-points, cite-sources
15
- - CLI for quick testing
16
- - Safe path resolution & chunking for long documents
17
- - Persistent embeddings cache (`embeddings.json`)
11
+ - Semantic vector search using embeddings
12
+ - Document ingestion from local filesystem (`.txt`, `.md`)
13
+ - Automatic text chunking for large documents
14
+ - Pluggable embedding providers (local, OpenAI, Groq, etc.)
15
+ - Multiple vector database support:
16
+ - In-memory store
17
+ - Redis (adapter)
18
+ - PostgreSQL / pgvector (adapter)
19
+ - Pinecone (adapter)
20
+ - Embedding cache support (`embeddings.json`)
21
+ - Normalized LLM-ready output format (`text + matches`)
22
+ - Designed for O-Lang `.ol` workflow integration
18
23
 
19
24
  ---
20
25
 
21
26
  ## Installation
22
27
 
23
28
  ```bash
24
- npm install @olang/semantic-doc-search
29
+ npm install @o-lang/semantic-doc-search
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@o-lang/semantic-doc-search",
3
- "version": "1.0.39",
3
+ "version": "1.0.41",
4
4
  "description": "O-lang Semantic Document Search Resolver with hybrid search, embeddings, rerank, and streaming.",
5
5
  "main": "src/index.js",
6
6
  "type": "commonjs",
package/src/resolver.js CHANGED
@@ -1,5 +1,5 @@
1
1
  const VectorRouter = require("./adapters/vectorRouter");
2
- const embedder = require("./embeddings/local"); // singleton embedder
2
+ const embedder = require("./embeddings/local");
3
3
  const { extractQuery } = require("./utils/extractQuery");
4
4
  const { formatResults } = require("./utils/formatResults");
5
5
  const fs = require("fs");
@@ -8,7 +8,9 @@ const crypto = require("crypto");
8
8
 
9
9
  const CACHE_PATH = path.join(process.cwd(), "embeddings.json");
10
10
 
11
- // Load cache for ingestion guard
11
+ // ─────────────────────────────────────────────
12
+ // Helpers (UNCHANGED)
13
+ // ─────────────────────────────────────────────
12
14
  function loadCache() {
13
15
  try {
14
16
  if (fs.existsSync(CACHE_PATH)) {
@@ -24,218 +26,104 @@ function saveCache(cache) {
24
26
  } catch {}
25
27
  }
26
28
 
27
- /**
28
- * Clean text for embedding (defensive)
29
- */
30
29
  function sanitizeTextForEmbedding(text) {
31
30
  if (typeof text !== "string") return "";
32
- // Remove wrapping quotes and extra whitespace
33
31
  return text.replace(/^["']|["']$/g, "").trim();
34
32
  }
35
33
 
36
- /**
37
- * Load documents from doc_root if provided
38
- */
39
- function loadDocumentsFromContext(context) {
40
- if (context.documents && Array.isArray(context.documents)) {
41
- return context.documents;
42
- }
43
-
44
- if (context.doc_root) {
45
- const baseDir = path.resolve(process.cwd(), context.doc_root);
46
- if (fs.existsSync(baseDir)) {
47
- const files = fs.readdirSync(baseDir).filter(f => f.endsWith('.txt') || f.endsWith('.md'));
48
- const docs = files.map(file => {
49
- try {
50
- const content = fs.readFileSync(path.join(baseDir, file), 'utf8');
51
- return { id: file, content, source: `file:${file}` };
52
- } catch (err) {
53
- console.warn(`⚠️ Failed to read ${file}:`, err.message);
54
- return null;
55
- }
56
- }).filter(Boolean);
57
-
58
- console.log(`📄 Loaded ${docs.length} document(s) from ${baseDir}`);
59
- docs.forEach(d => console.log(` - ${d.id} (${d.content?.length || 0} chars)`));
60
- return docs;
61
- }
62
- }
63
-
64
- return [];
65
- }
66
-
67
- /**
68
- * Convert any array-like (Float32Array, etc.) to plain JS array
69
- */
70
- function toPlainArray(input) {
71
- if (!input) return null;
72
- if (Array.isArray(input)) return input;
73
- if (ArrayBuffer.isView(input)) return Array.from(input);
74
- return null;
75
- }
76
-
77
- /**
78
- * Hash text for caching
79
- */
80
34
  function hashText(str) {
81
35
  return crypto.createHash("sha256").update(str).digest("hex");
82
36
  }
83
37
 
84
- /**
85
- * Semantic Doc Search Resolver
86
- */
38
+ // ─────────────────────────────────────────────
39
+ // 🔥 MAIN RESOLVER
40
+ // ─────────────────────────────────────────────
87
41
  async function resolver(action, context = {}) {
88
42
  if (typeof action !== "string") return;
89
- if (!action.toLowerCase().startsWith("ask doc-search")) return;
90
-
91
- let query = extractQuery(action);
92
- query = sanitizeTextForEmbedding(query);
93
- if (!query) return { text: "(Empty query)", meta: { matches: 0 } };
94
43
 
95
- // Vector backend
96
44
  const vectorStore = VectorRouter.create(context);
45
+ const embed = await embedder({ dimension: 384 });
97
46
 
98
- // Ensure backend supports search
99
- if (!vectorStore.supports("vector.search")) {
100
- throw new Error("Vector backend does not support vector.search");
101
- }
47
+ const doc_root = context.doc_root || "./docs";
48
+ const useCache = !!context.POSTGRES_URL || !!context.REDIS_URL;
49
+ const cache = useCache ? loadCache() : {};
102
50
 
103
- // Load documents (from context.documents OR doc_root)
104
- const documents = loadDocumentsFromContext(context);
105
- console.log("🔄 Starting ingestion for", documents.length, "documents");
51
+ // =====================================================
52
+ // 1. VECTOR INSERT (INGEST)
53
+ // =====================================================
54
+ if (action.includes("vector.insert")) {
55
+ let inserted = 0;
106
56
 
107
- // Ingestion guard cache
108
- const cache = loadCache();
109
-
110
- // --- Document ingestion ---
111
- if (documents.length > 0) {
112
- if (!vectorStore.supports("vector.insert")) {
113
- throw new Error("Vector backend does not support vector.insert");
114
- }
115
-
116
- for (const doc of documents) {
117
- console.log("📄 Processing doc:", doc.id, "content length:", doc.content?.length);
118
-
119
- if (!doc?.content) {
120
- console.warn("⚠️ Skipping empty doc:", doc?.id);
121
- continue;
122
- }
123
-
124
- // ✅ CORRECT PATH: Adjust if chunker.js is in src/utils/
125
- let chunks;
126
- try {
127
- const chunkText = require("./utils/chunker.js").chunkText;
128
- chunks = chunkText(doc.content, 500) || [doc.content];
129
- console.log("📦", doc.id, "split into", chunks.length, "chunks");
130
- } catch (err) {
131
- console.warn("⚠️ Chunking failed, using full doc:", err.message);
132
- chunks = [doc.content];
133
- }
57
+ if (fs.existsSync(doc_root)) {
58
+ const files = fs.readdirSync(doc_root);
134
59
 
135
- for (let i = 0; i < chunks.length; i++) {
136
- const text = sanitizeTextForEmbedding(chunks[i]);
137
- console.log("🧩 Chunk", `${doc.id}:${i}`, "text:", JSON.stringify(text));
138
-
139
- if (!text) {
140
- console.warn(`⚠️ Skipping empty chunk ${doc.id}:${i}`);
141
- continue;
142
- }
60
+ for (const file of files) {
61
+ const fullPath = path.join(doc_root, file);
62
+ if (!fs.statSync(fullPath).isFile()) continue;
143
63
 
144
- const hash = hashText(text);
145
- if (cache[hash]) {
146
- console.log(`⏭️ Skipping already ingested chunk ${doc.id}:${i}`);
147
- continue;
148
- }
64
+ const content = fs.readFileSync(fullPath, "utf8");
65
+ if (!content) continue;
149
66
 
150
- // 🔒 DEFENSIVE EMBEDDING
151
- let rawVector;
152
- try {
153
- rawVector = await embedder.embed(text);
154
- } catch (err) {
155
- console.warn(`⚠️ Embedding failed for chunk ${doc.id}:${i} ("${text.slice(0, 30)}..."):`, err.message);
156
- continue;
157
- }
67
+ const chunkText = require("./utils/chunker").chunkText;
68
+ const chunks = chunkText(content, 500, 50);
158
69
 
159
- // Validate vector type
160
- if (!rawVector || (!Array.isArray(rawVector) && !ArrayBuffer.isView(rawVector))) {
161
- console.warn(`⚠️ Invalid vector type for chunk ${doc.id}:${i}:`, typeof rawVector);
162
- continue;
163
- }
70
+ for (let i = 0; i < chunks.length; i++) {
71
+ const text = sanitizeTextForEmbedding(chunks[i]);
72
+ if (!text) continue;
164
73
 
165
- // Check for zero vectors
166
- const isZero = rawVector.every?.(v => v === 0) || false;
167
- if (isZero) {
168
- console.warn(`⚠️ Zero vector for chunk ${doc.id}:${i}`);
169
- continue;
170
- }
74
+ const hash = hashText(text);
75
+ if (useCache && cache[hash]) continue;
171
76
 
172
- // Normalize to plain array for storage
173
- const vector = toPlainArray(rawVector);
174
- if (!vector) {
175
- console.warn(`⚠️ Failed to normalize vector for chunk ${doc.id}:${i}`);
176
- continue;
177
- }
77
+ const rawVector = await embed(text);
78
+ const vector = Array.from(rawVector);
178
79
 
179
- try {
180
80
  await vectorStore.upsert({
181
- id: `${doc.id}:${i}`,
81
+ id: `${file}:${i}`,
182
82
  vector,
183
83
  content: text,
184
- source: doc.source,
84
+ source: `file:${file}`,
185
85
  });
186
- cache[hash] = true;
187
- console.log(`✅ Upserted ${doc.id}:${i}`);
188
- } catch (err) {
189
- console.warn(`⚠️ Upsert failed for ${doc.id}:${i}:`, err.message);
190
- continue;
86
+
87
+ if (useCache) cache[hash] = true;
88
+ inserted++;
191
89
  }
192
90
  }
193
91
  }
194
- saveCache(cache);
195
- }
196
92
 
197
- // --- QUERY EMBEDDING ---
198
- let rawQueryVector;
199
- try {
200
- rawQueryVector = await embedder.embed(query);
201
- } catch (err) {
202
- console.error(`❌ Query embedding failed: "${query}"`, err.message);
203
- return { text: "(Query embedding failed)", meta: { matches: 0 } };
204
- }
93
+ if (useCache) saveCache(cache);
94
+ if (vectorStore.close) await vectorStore.close();
205
95
 
206
- // FULLY COMPLETE VALIDATION LINE
207
- if (!rawQueryVector || (!Array.isArray(rawQueryVector) && !ArrayBuffer.isView(rawQueryVector))) {
208
- console.error("❌ Invalid query vector type:", typeof rawQueryVector);
209
- return { text: "(Invalid query vector)", meta: { matches: 0 } };
96
+ return { inserted, doc_root };
210
97
  }
211
98
 
212
- const isZeroQuery = rawQueryVector.every?.(v => v === 0) || false;
213
- if (isZeroQuery) {
214
- console.warn("⚠️ Zero vector for query");
215
- return { text: "(Query produced zero vector)", meta: { matches: 0 } };
216
- }
99
+ // =====================================================
100
+ // 2. VECTOR SEARCH
101
+ // =====================================================
102
+ if (action.includes("vector.search")) {
103
+ const query = sanitizeTextForEmbedding(extractQuery(action));
104
+ if (!query) return { text: "", matches: [] };
217
105
 
218
- const queryVector = toPlainArray(rawQueryVector);
219
- if (!queryVector) {
220
- return { text: "(Failed to normalize query vector)", meta: { matches: 0 } };
221
- }
106
+ const rawQueryVector = await embed(query);
107
+ const queryVector = Array.from(rawQueryVector);
222
108
 
223
- // --- SEARCH ---
224
- try {
225
- console.log("🔍 Executing vector search...");
226
109
  const results = await vectorStore.query(queryVector, {
227
110
  topK: context.topK || 5,
228
- minScore: context.minScore || 0,
229
111
  });
230
-
231
- console.log("📊 Raw search results:", results.length);
232
- results.forEach((r, i) => console.log(` ${i}: score=${r.score?.toFixed(4)}, content="${r.content?.substring(0, 50)}..."`));
233
-
112
+
113
+ if (vectorStore.close) await vectorStore.close();
114
+
234
115
  return formatResults(results, query);
235
- } catch (err) {
236
- console.error("❌ Vector search failed:", err.message);
237
- return { text: "(Search failed)", meta: { matches: 0 } };
238
116
  }
117
+
118
+ // =====================================================
119
+ // ❌ REMOVE THIS (legacy)
120
+ // =====================================================
121
+ // if (action.startsWith("Ask doc-search")) { ... }
122
+
123
+ return;
239
124
  }
240
125
 
126
+ resolver.resolverName = "vector";
127
+ resolver.version = "1.0.0";
128
+
241
129
  module.exports = resolver;
@@ -2,20 +2,22 @@ const VectorRouter = require("../adapters/vectorRouter");
2
2
  const embedder = require("../embeddings/local");
3
3
  const extractText = require("../utils/extractText");
4
4
  const chunkText = require("../utils/chunker");
5
+ const formatResults = require("../utils/formatResults");
5
6
  const fs = require("fs");
6
7
  const path = require("path");
7
8
 
8
9
  async function performDocQA(
9
10
  query,
10
11
  {
11
- doc_root,
12
- vectorBackend = "pgvector",
12
+ doc_root = "./docs",
13
+ vectorBackend = "memory", // 🔥 default to memory like Python fallback
13
14
  dimension = 384,
14
- migrate_on_demand = false,
15
15
  POSTGRES_URL,
16
+ topK = 5,
16
17
  ...config
17
18
  } = {}
18
19
  ) {
20
+ // ── Create vector store
19
21
  const store = VectorRouter.create({
20
22
  backend: vectorBackend,
21
23
  dimension,
@@ -25,30 +27,65 @@ async function performDocQA(
25
27
 
26
28
  const embed = await embedder({ dimension });
27
29
 
28
- if (migrate_on_demand && doc_root) {
29
- for (const file of fs.readdirSync(doc_root)) {
30
+ // ─────────────────────────────────────────────
31
+ // 🔥 ALWAYS INGEST (Python parity)
32
+ // ─────────────────────────────────────────────
33
+ if (doc_root && fs.existsSync(doc_root)) {
34
+ const files = fs.readdirSync(doc_root);
35
+
36
+ for (const file of files) {
30
37
  const fullPath = path.join(doc_root, file);
38
+
31
39
  if (!fs.statSync(fullPath).isFile()) continue;
40
+ if (!file.endsWith(".txt") && !file.endsWith(".md")) continue;
41
+
42
+ try {
43
+ const text = await extractText(fullPath);
44
+ if (!text || !text.trim()) continue;
45
+
46
+ const chunks = chunkText(text, 500, 50) || [text];
47
+
48
+ for (let i = 0; i < chunks.length; i++) {
49
+ const chunk = chunks[i];
50
+ if (!chunk.trim()) continue;
32
51
 
33
- const text = await extractText(fullPath);
34
- const chunks = chunkText(text);
35
-
36
- for (let i = 0; i < chunks.length; i++) {
37
- await store.upsert({
38
- id: `${file}-${i}`,
39
- vector: await embed(chunks[i]),
40
- content: chunks[i],
41
- source: file,
42
- metadata: { chunk: i }
43
- });
52
+ try {
53
+ await store.upsert({
54
+ id: `${file}:${i}`,
55
+ vector: await embed(chunk),
56
+ content: chunk,
57
+ source: `file:${file}`,
58
+ metadata: { chunk: i }
59
+ });
60
+ } catch (err) {
61
+ console.warn("⚠️ Chunk failed:", err.message);
62
+ }
63
+ }
64
+
65
+ } catch (err) {
66
+ console.error("❌ Failed to process file:", file, err.message);
44
67
  }
45
68
  }
46
69
  }
47
70
 
48
- const results = await store.query(await embed(query), { topK: 5 });
71
+ // ─────────────────────────────────────────────
72
+ // 🔍 SEARCH
73
+ // ─────────────────────────────────────────────
74
+ let matches = [];
75
+
76
+ try {
77
+ const queryVector = await embed(query);
78
+ matches = await store.query(queryVector, { topK });
79
+ } catch (err) {
80
+ console.error("❌ Search failed:", err.message);
81
+ }
49
82
 
50
83
  if (store.close) await store.close();
51
- return results;
84
+
85
+ // ─────────────────────────────────────────────
86
+ // ✅ FORMAT LIKE PYTHON
87
+ // ─────────────────────────────────────────────
88
+ return formatResults(matches, query);
52
89
  }
53
90
 
54
- module.exports = performDocQA;
91
+ module.exports = performDocQA;
@@ -1,15 +1,14 @@
1
- /**
2
- * Normalizes vector search results for O-Lang workflows.
3
- * Returns both structured matches AND a plain .text field for LLM prompts.
4
- */
5
- function formatResults(results = [], query) {
6
- // ✅ Generate plain text from all matches
7
- const text = results.map(r => r.content).join('\n\n');
8
-
1
+ function formatResults(results = [], query = "") {
2
+ const safeResults = Array.isArray(results) ? results : [];
3
+
4
+ const text = safeResults.length
5
+ ? safeResults.map(r => r.content).join('\n\n')
6
+ : "";
7
+
9
8
  return {
10
9
  query,
11
- text, // ← THIS IS THE KEY ADDITION
12
- matches: results.map(r => ({
10
+ text,
11
+ matches: safeResults.map(r => ({
13
12
  id: r.id,
14
13
  content: r.content,
15
14
  source: r.source,