@o-lang/semantic-doc-search 1.0.20 → 1.0.22

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/package.json +1 -1
  2. package/src/index.js +161 -127
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@o-lang/semantic-doc-search",
3
- "version": "1.0.20",
3
+ "version": "1.0.22",
4
4
  "description": "O-lang Semantic Document Search Resolver with hybrid search, embeddings, rerank, and streaming.",
5
5
  "main": "src/index.js",
6
6
  "type": "commonjs",
package/src/index.js CHANGED
@@ -1,4 +1,3 @@
1
- // doc-search.js
2
1
  const fs = require("fs");
3
2
  const path = require("path");
4
3
  const { createLLM } = require("./llm/router.js");
@@ -14,29 +13,45 @@ const CACHE_PATH = path.join(process.cwd(), "embeddings.json");
14
13
 
15
14
  function safeResolve(base, userPath) {
16
15
  const resolved = path.resolve(base, userPath);
17
- if (!resolved.startsWith(path.resolve(base))) throw new Error("Path traversal detected");
16
+ if (!resolved.startsWith(path.resolve(base))) {
17
+ throw new Error("Path traversal detected");
18
+ }
18
19
  return resolved;
19
20
  }
20
21
 
21
22
  function loadCache() {
22
23
  try {
23
- if (fs.existsSync(CACHE_PATH)) return JSON.parse(fs.readFileSync(CACHE_PATH, "utf8")) || {};
24
+ if (fs.existsSync(CACHE_PATH)) {
25
+ return JSON.parse(fs.readFileSync(CACHE_PATH, "utf8")) || {};
26
+ }
24
27
  } catch {}
25
28
  return {};
26
29
  }
27
30
 
28
31
  function saveCache(cache) {
29
- try { fs.writeFileSync(CACHE_PATH, JSON.stringify(cache, null, 2)); } catch {}
32
+ try {
33
+ fs.writeFileSync(CACHE_PATH, JSON.stringify(cache, null, 2));
34
+ } catch {}
30
35
  }
31
36
 
32
- // ------------------- DATABASE ADAPTER -------------------
37
+ /* ---------------- DATABASE ADAPTER ---------------- */
38
+
33
39
  class DatabaseAdapter {
34
- constructor() { this.initialized = false; }
40
+ constructor() {
41
+ this.initialized = false;
42
+ }
43
+
35
44
  async initialize(context) {
36
45
  if (this.initialized) return;
37
- if (context.db_type === "mongodb" || context.MONGO_URI) await this.initMongo(context);
38
- else if (context.db_type === "sqlite" || context.db_path) await this.initSQLite(context);
39
- else if (context.db_type === "postgres" || context.POSTGRES_URL) await this.initPostgres(context);
46
+
47
+ if (context.db_type === "mongodb" || context.MONGO_URI) {
48
+ await this.initMongo(context);
49
+ } else if (context.db_type === "sqlite" || context.db_path) {
50
+ await this.initSQLite(context);
51
+ } else if (context.db_type === "postgres" || context.POSTGRES_URL) {
52
+ await this.initPostgres(context);
53
+ }
54
+
40
55
  this.initialized = true;
41
56
  }
42
57
 
@@ -44,183 +59,202 @@ class DatabaseAdapter {
44
59
  const Database = require("better-sqlite3");
45
60
  const dbPath = context.db_path || "./database.db";
46
61
  const dbDir = path.dirname(path.resolve(dbPath));
47
- if (!fs.existsSync(dbDir)) throw new Error(`SQLite database directory not found: ${dbDir}`);
62
+ if (!fs.existsSync(dbDir)) {
63
+ throw new Error(`SQLite database directory not found: ${dbDir}`);
64
+ }
48
65
  this.sqliteClient = new Database(dbPath, { readonly: true });
49
66
  }
50
67
 
51
68
  async querySQLite(query, params = []) {
52
- if (!this.sqliteClient) throw new Error("SQLite client not initialized");
53
69
  const stmt = this.sqliteClient.prepare(query);
54
70
  return stmt.all(...params);
55
71
  }
56
72
 
57
73
  async initMongo(context) {
58
74
  const { MongoClient } = require("mongodb");
59
- const uri = context.MONGO_URI || `mongodb://localhost:27017/${context.db_name || "olang"}`;
75
+ const uri =
76
+ context.MONGO_URI ||
77
+ `mongodb://localhost:27017/${context.db_name || "olang"}`;
60
78
  this.mongoClient = new MongoClient(uri);
61
79
  await this.mongoClient.connect();
62
80
  }
63
81
 
64
82
  async queryMongo(collectionName, filter = {}, projection = {}) {
65
- if (!this.mongoClient) throw new Error("MongoDB client not initialized");
66
- const db = this.mongoClient.db(process.env.DB_NAME || context.db_name || "olang");
67
- return await db.collection(collectionName).find(filter, { projection }).toArray();
83
+ const db = this.mongoClient.db(process.env.DB_NAME || "olang");
84
+ return db.collection(collectionName).find(filter, { projection }).toArray();
68
85
  }
69
86
 
70
87
  async initPostgres(context) {
71
88
  const { Pool } = require("pg");
72
- const poolConfig = {
89
+ this.postgresClient = new Pool({
73
90
  connectionString: context.POSTGRES_URL,
74
- host: context.DB_HOST || "localhost",
75
- port: parseInt(context.DB_PORT) || 5432,
76
- user: context.DB_USER,
77
- password: context.DB_PASSWORD,
78
- database: context.DB_NAME || "olang",
79
- };
80
- Object.keys(poolConfig).forEach((k) => {
81
- if (poolConfig[k] === undefined || poolConfig[k] === null) delete poolConfig[k];
82
91
  });
83
- this.postgresClient = new Pool(poolConfig);
84
92
  }
85
93
 
86
94
  async queryPostgres(query, params = []) {
87
- if (!this.postgresClient) throw new Error("PostgreSQL client not initialized");
88
95
  const result = await this.postgresClient.query(query, params);
89
96
  return result.rows;
90
97
  }
91
98
 
92
99
  async queryDocuments(context) {
93
- const { db_type, db_table = "documents", db_content_column = "content", db_id_column = "id" } = context;
94
- if (db_type === "mongodb" || context.MONGO_URI) {
95
- const { filter, projection } = this.buildMongoQuery(context);
96
- const results = await this.queryMongo(db_table, filter, projection);
97
- return results.map((doc) => ({
98
- id: doc._id?.toString() || doc.id || doc[db_id_column],
99
- content: doc[db_content_column] || doc.content || doc.text || "",
100
- source: `mongodb:${db_table}`,
101
- }));
102
- } else if (db_type === "sqlite" || context.db_path) {
103
- const { sql, params } = this.buildSqlQuery(context);
104
- const results = await this.querySQLite(sql, params);
105
- return results.map((row) => ({
106
- id: row[db_id_column],
107
- content: row[db_content_column],
108
- source: `sqlite:${db_table}`,
109
- }));
110
- } else if (db_type === "postgres" || context.POSTGRES_URL) {
111
- const { sql, params } = this.buildSqlQuery(context);
112
- const results = await this.queryPostgres(sql, params);
113
- return results.map((row) => ({
114
- id: row[db_id_column],
115
- content: row[db_content_column],
116
- source: `postgres:${db_table}`,
100
+ const table = context.db_table || "documents";
101
+ const contentCol = context.db_content_column || "content";
102
+ const idCol = context.db_id_column || "id";
103
+
104
+ if (context.MONGO_URI) {
105
+ const rows = await this.queryMongo(table);
106
+ return rows.map((r) => ({
107
+ id: r._id?.toString(),
108
+ content: r[contentCol] || "",
109
+ source: `mongodb:${table}`,
117
110
  }));
118
111
  }
119
- return [];
120
- }
121
112
 
122
- buildMongoQuery(context) {
123
- const { doc_filter = {}, doc_projection = {} } = context;
124
- let filter = {};
125
- if (typeof doc_filter === "string") {
126
- try { filter = JSON.parse(doc_filter); } catch { filter = { $text: { $search: doc_filter } }; }
127
- } else if (typeof doc_filter === "object" && Object.keys(doc_filter).length > 0) filter = doc_filter;
128
- const projection = typeof doc_projection === "string" ? JSON.parse(doc_projection) : doc_projection;
129
- return { filter, projection };
130
- }
113
+ if (context.db_path) {
114
+ const rows = await this.querySQLite(
115
+ `SELECT ${idCol}, ${contentCol} FROM ${table}`
116
+ );
117
+ return rows.map((r) => ({
118
+ id: r[idCol],
119
+ content: r[contentCol],
120
+ source: `sqlite:${table}`,
121
+ }));
122
+ }
131
123
 
132
- buildSqlQuery(context) {
133
- const { db_content_column = "content", db_id_column = "id", doc_where = "1=1", doc_params = [] } = context;
134
- let params = doc_params;
135
- if (typeof doc_params === "string") {
136
- try { params = JSON.parse(doc_params); } catch { params = [doc_params]; }
124
+ if (context.POSTGRES_URL) {
125
+ const rows = await this.queryPostgres(
126
+ `SELECT ${idCol}, ${contentCol} FROM ${table}`
127
+ );
128
+ return rows.map((r) => ({
129
+ id: r[idCol],
130
+ content: r[contentCol],
131
+ source: `postgres:${table}`,
132
+ }));
137
133
  }
138
- const table = context.db_table || "documents";
139
- const sql = `SELECT ${db_id_column}, ${db_content_column} FROM ${table} WHERE ${doc_where}`;
140
- return { sql, params };
141
- }
142
134
 
143
- async close() {
144
- if (this.sqliteClient) { try { this.sqliteClient.close(); } catch {} this.sqliteClient = null; }
145
- if (this.mongoClient) { try { await this.mongoClient.close(); } catch {} this.mongoClient = null; }
146
- if (this.postgresClient) { try { await this.postgresClient.end(); } catch {} this.postgresClient = null; }
147
- this.initialized = false;
135
+ return [];
148
136
  }
149
137
  }
150
138
 
151
- // ------------------- DOCUMENT LOADING -------------------
152
- async function loadDocumentsFromDatabase(context) {
153
- if (!context.db_type && !context.db_path && !context.MONGO_URI && !context.POSTGRES_URL) return null;
154
- const dbAdapter = new DatabaseAdapter();
155
- try { await dbAdapter.initialize(context); return await dbAdapter.queryDocuments(context); } catch (e) { console.error("🗃️ [doc-search] Database load error:", e.message); return null; }
156
- }
139
+ /* ---------------- DOCUMENT LOADING ---------------- */
157
140
 
158
141
  async function loadAllDocuments(context) {
159
- const documents = [];
160
- const dbDocs = await loadDocumentsFromDatabase(context);
161
- if (dbDocs) documents.push(...dbDocs);
142
+ const docs = [];
143
+ const db = new DatabaseAdapter();
144
+
145
+ try {
146
+ await db.initialize(context);
147
+ docs.push(...(await db.queryDocuments(context)));
148
+ } catch {}
149
+
150
+ const baseDir = context.doc_root
151
+ ? safeResolve(process.cwd(), context.doc_root)
152
+ : path.join(process.cwd(), "docs");
162
153
 
163
- const baseDir = context.doc_root ? safeResolve(process.cwd(), context.doc_root) : path.join(process.cwd(), "docs");
164
154
  if (fs.existsSync(baseDir)) {
165
- const files = fs.readdirSync(baseDir).filter((f) => f.endsWith(".txt") || f.endsWith(".md"));
166
- for (const file of files) {
167
- try {
168
- const content = fs.readFileSync(path.join(baseDir, file), "utf8");
169
- documents.push({ id: file, content, source: `file:${file}` });
170
- } catch (e) { console.warn(`⚠️ [doc-search] Failed to read file ${file}: ${e.message}`); }
155
+ const files = fs
156
+ .readdirSync(baseDir)
157
+ .filter((f) => f.endsWith(".txt") || f.endsWith(".md"));
158
+
159
+ for (const f of files) {
160
+ docs.push({
161
+ id: f,
162
+ content: fs.readFileSync(path.join(baseDir, f), "utf8"),
163
+ source: `file:${f}`,
164
+ });
171
165
  }
172
166
  }
173
- return documents;
174
- }
175
167
 
176
- // ------------------- VECTOR MIGRATION -------------------
177
- async function checkPgVectorHasData(pgVectorAdapter) {
178
- try { const result = await pgVectorAdapter.pool.query("SELECT COUNT(*) FROM doc_embeddings"); return parseInt(result.rows[0].count) > 0; } catch { return false; }
168
+ return docs;
179
169
  }
180
170
 
181
- async function migrateDocumentsToPgVector(docRoot, pgVectorAdapter, embedder) {
182
- const baseDir = safeResolve(process.cwd(), docRoot);
183
- if (!fs.existsSync(baseDir)) { console.log("📁 No docs directory found, skipping migration"); return; }
184
- const files = fs.readdirSync(baseDir).filter((f) => f.endsWith(".txt") || f.endsWith(".md"));
185
- console.log(`🔄 Migrating ${files.length} documents to pgvector...`);
186
- for (const file of files) {
187
- try {
188
- const content = fs.readFileSync(path.join(baseDir, file), "utf8");
189
- const vector = await embedder.embed(content);
190
- await pgVectorAdapter.upsert({ id: file, vector, content, source: `file:${file}` });
191
- console.log(`✅ Migrated ${file}`);
192
- } catch (e) { console.warn(`⚠️ Failed to migrate ${file}: ${e.message}`); }
171
+ /* ---------------- HYBRID VECTOR SEARCH ---------------- */
172
+
173
+ async function performHybridDocQA(query, context = {}) {
174
+ const cache = loadCache();
175
+ const embedder = new LocalEmbedding({ dimension: 384 });
176
+
177
+ const vectorStore = VectorRouter.create({
178
+ backend: context.vectorBackend || "memory",
179
+ dimension: 384,
180
+ ...context,
181
+ });
182
+
183
+ const documents = await loadAllDocuments(context);
184
+ if (!documents.length) {
185
+ return { text: "", meta: {} };
193
186
  }
194
- }
195
187
 
196
- // ------------------- VECTOR SEARCH (AUTO SWITCH) -------------------
197
- async function performVectorQA(query, context = {}) {
198
- const postgresUrl = context.POSTGRES_URL || process.env.POSTGRES_URL;
199
- const vectorBackend = context.vectorBackend;
200
-
201
- if (postgresUrl) {
202
- return await performPgVectorSearch(query, context);
203
- } else if (vectorBackend) {
204
- return await performVectorSearch(query, context);
205
- } else {
206
- return await performHybridDocQA(query, context);
188
+ for (const doc of documents) {
189
+ if (!cache[doc.id]) {
190
+ cache[doc.id] = true;
191
+ const chunks = chunkText(doc.content, 500);
192
+
193
+ for (let i = 0; i < chunks.length; i++) {
194
+ const vector = await embedder.embed(chunks[i]);
195
+ await vectorStore.upsert({
196
+ id: `${doc.id}:${i}`,
197
+ vector,
198
+ content: chunks[i],
199
+ source: doc.source,
200
+ });
201
+ }
202
+ }
207
203
  }
204
+
205
+ saveCache(cache);
206
+
207
+ const queryVector = await embedder.embed(query);
208
+ const results = await vectorStore.query(queryVector, 5);
209
+
210
+ return {
211
+ text: highlightMatches(
212
+ results.map((r) => r.content).join("\n\n"),
213
+ extractKeywords(query)
214
+ ),
215
+ meta: { matches: results.length },
216
+ };
217
+ }
218
+
219
+ /* ---------------- PGVECTOR SEARCH ---------------- */
220
+
221
+ async function performPgVectorSearch(query, context = {}) {
222
+ const adapter = new PgVectorAdapter({
223
+ POSTGRES_URL: context.POSTGRES_URL,
224
+ });
225
+
226
+ const embedder = new LocalEmbedding({ dimension: 384 });
227
+ const vector = await embedder.embed(query);
228
+ const results = await adapter.query(vector, 5);
229
+
230
+ await adapter.close();
231
+
232
+ return {
233
+ text: results.map((r) => r.content).join("\n\n"),
234
+ meta: { matches: results.length },
235
+ };
208
236
  }
209
237
 
210
- // ------------------- HYBRID + VECTOR SEARCH FUNCTIONS -------------------
211
- // [Keep performPgVectorSearch, performHybridDocQA, loadAllDocuments, chunking, cache logic identical to previous full file]
238
+ /* ---------------- ROUTER ---------------- */
212
239
 
213
240
  async function performDocQA(query, context = {}) {
214
- return await performVectorQA(query, context);
241
+ if (context.POSTGRES_URL) {
242
+ return performPgVectorSearch(query, context);
243
+ }
244
+ return performHybridDocQA(query, context);
215
245
  }
216
246
 
247
+ /* ---------------- O-LANG RESOLVER ---------------- */
248
+
217
249
  async function docSearchResolver(action, context) {
218
- if (action.startsWith("Ask doc-search ")) {
250
+ if (action.startsWith("Ask doc-search")) {
219
251
  const match = action.match(/"(.*)"|'(.*)'/);
220
- const query = match ? match[1] || match[2] : action.replace(/^Ask doc-search\s+/, "").trim();
221
- return await performDocQA(query, context);
252
+ const query = match
253
+ ? match[1] || match[2]
254
+ : action.replace("Ask doc-search", "").trim();
255
+
256
+ return performDocQA(query, context);
222
257
  }
223
- return undefined;
224
258
  }
225
259
 
226
260
  docSearchResolver.resolverName = "doc-search";