@o-lang/semantic-doc-search 1.0.16 → 1.0.20

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/bin/cli.js CHANGED
@@ -16,6 +16,11 @@ const argv = yargs(hideBin(process.argv))
16
16
  .option("model", { type: "string", describe: "LLM model to use" })
17
17
  .option("doc-root", { type: "string", describe: "Directory of documents" })
18
18
  .option("stream", { type: "boolean", describe: "Stream output if supported", default: false })
19
+ .option("vector-backend", {
20
+ type: "string",
21
+ describe: "Vector backend to use: pgvector | memory | pinecone | redis",
22
+ default: "pgvector"
23
+ })
19
24
  .demandCommand(1, "Please provide a query")
20
25
  .help()
21
26
  .argv;
@@ -25,6 +30,7 @@ const context = {
25
30
  query: argv._.join(" "),
26
31
  doc_root: argv.docRoot,
27
32
  stream: argv.stream,
33
+ vectorBackend: argv["vector-backend"], // NEW
28
34
  options: {
29
35
  provider: argv.provider,
30
36
  openaiApiKey: argv["openai-key"] || process.env.OPENAI_API_KEY,
@@ -39,6 +45,7 @@ const context = {
39
45
 
40
46
  (async () => {
41
47
  try {
48
+ // Pass vectorBackend in the config
42
49
  const result = await resolver("search", context);
43
50
  if (!argv.stream) {
44
51
  console.log("\n\n✅ Result:\n");
@@ -48,4 +55,4 @@ const context = {
48
55
  } catch (err) {
49
56
  console.error("\n❌ Error running search:", err);
50
57
  }
51
- })();
58
+ })();
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@o-lang/semantic-doc-search",
3
- "version": "1.0.16",
3
+ "version": "1.0.20",
4
4
  "description": "O-lang Semantic Document Search Resolver with hybrid search, embeddings, rerank, and streaming.",
5
5
  "main": "src/index.js",
6
6
  "type": "commonjs",
@@ -24,6 +24,7 @@
24
24
  "openai": "^4.3.1",
25
25
  "pdf-parse": "^1.1.1",
26
26
  "pg": "^8.16.3",
27
+ "pgvector": "^0.2.1",
27
28
  "pinecone-client": "^1.0.0",
28
29
  "readline": "^1.3.0",
29
30
  "redis": "^5.2.0"
@@ -0,0 +1,34 @@
1
+ class VectorAdapter {
2
+ constructor(config = {}) {
3
+ this.backend = config.backend || "unknown";
4
+ this.dimension = config.dimension || null;
5
+ }
6
+
7
+ validateVector(vector) {
8
+ if (!Array.isArray(vector)) {
9
+ throw new Error("Vector must be an array");
10
+ }
11
+
12
+ if (this.dimension && vector.length !== this.dimension) {
13
+ throw new Error(
14
+ `Vector dimension mismatch: expected ${this.dimension}, got ${vector.length}`
15
+ );
16
+ }
17
+ }
18
+
19
+ async upsert() {
20
+ throw new Error("upsert() not implemented");
21
+ }
22
+
23
+ async query() {
24
+ throw new Error("query() not implemented");
25
+ }
26
+
27
+ async health() {
28
+ return { backend: this.backend, status: "unknown" };
29
+ }
30
+
31
+ async close() {}
32
+ }
33
+
34
+ module.exports = VectorAdapter;
@@ -1,58 +1,43 @@
1
- /**
2
- * In-Memory Vector Store Adapter
3
- * -----------------------------------
4
- * Stores embeddings in RAM.
5
- * Useful for local development and testing.
6
- */
7
-
8
- const cosineSimilarity = (a, b) => {
9
- let dot = 0,
10
- magA = 0,
11
- magB = 0;
12
-
13
- for (let i = 0; i < a.length; i++) {
14
- dot += a[i] * b[i];
15
- magA += a[i] * a[i];
16
- magB += b[i] * b[i];
1
+ const VectorAdapter = require("./VectorAdapter");
2
+ const capabilities = require("./vectorCapabilities");
3
+
4
+ class InMemoryAdapter extends VectorAdapter {
5
+ constructor(config = {}) {
6
+ super({ ...config, backend: "memory" });
7
+ this.dimension = config.dimension || 384;
8
+ this.store = [];
17
9
  }
18
10
 
19
- if (magA === 0 || magB === 0) return 0;
20
- return dot / (Math.sqrt(magA) * Math.sqrt(magB));
21
- };
22
-
23
- module.exports = {
24
- _store: {},
25
-
26
- async init() {
27
- this._store = {}; // reset
28
- return true;
29
- },
11
+ static capabilities() {
12
+ return capabilities.memory;
13
+ }
30
14
 
31
- async upsert(id, vector, metadata) {
32
- this._store[id] = {
33
- id,
34
- vector,
35
- metadata,
36
- };
37
- },
15
+ async upsert({ id, vector, content, source, metadata = {} }) {
16
+ this.validateVector(vector);
17
+ this.store.push({ id, vector, content, source, metadata });
18
+ }
38
19
 
39
- async search(queryVector, limit = 5) {
40
- const scored = [];
20
+ async query(vector, { topK = 5 } = {}) {
21
+ this.validateVector(vector);
41
22
 
42
- for (const key in this._store) {
43
- const entry = this._store[key];
44
- const score = cosineSimilarity(queryVector, entry.vector);
23
+ return this.store
24
+ .map(doc => ({
25
+ ...doc,
26
+ score: cosineSimilarity(vector, doc.vector)
27
+ }))
28
+ .sort((a, b) => b.score - a.score)
29
+ .slice(0, topK);
30
+ }
31
+ }
45
32
 
46
- scored.push({
47
- id: entry.id,
48
- score,
49
- text: entry.metadata.text,
50
- source: entry.metadata.source,
51
- });
52
- }
33
+ function cosineSimilarity(a, b) {
34
+ let dot = 0, na = 0, nb = 0;
35
+ for (let i = 0; i < a.length; i++) {
36
+ dot += a[i] * b[i];
37
+ na += a[i] ** 2;
38
+ nb += b[i] ** 2;
39
+ }
40
+ return dot / (Math.sqrt(na) * Math.sqrt(nb));
41
+ }
53
42
 
54
- return scored
55
- .sort((a, b) => b.score - a.score)
56
- .slice(0, limit);
57
- },
58
- };
43
+ module.exports = InMemoryAdapter;
@@ -1,61 +1,56 @@
1
- // src/adapters/pgvectorAdapter.js
2
1
  const { Pool } = require("pg");
2
+ const VectorAdapter = require("./VectorAdapter");
3
+ const capabilities = require("./vectorCapabilities");
3
4
 
4
- class PgVectorAdapter {
5
+ function toPgVectorLiteral(vector) {
6
+ return `[${vector.join(",")}]`;
7
+ }
8
+
9
+ class PgVectorAdapter extends VectorAdapter {
5
10
  constructor(config = {}) {
11
+ super({ ...config, backend: "pgvector" });
12
+ this.dimension = config.dimension || 384;
13
+
6
14
  this.pool = new Pool({
7
- connectionString: config.POSTGRES_URL || process.env.POSTGRES_URL,
8
- host: config.DB_HOST || process.env.DB_HOST,
9
- port: config.DB_PORT || process.env.DB_PORT || 5432,
10
- user: config.DB_USER || process.env.DB_USER,
11
- password: config.DB_PASSWORD || process.env.DB_PASSWORD,
12
- database: config.DB_NAME || process.env.DB_NAME || 'olang',
15
+ connectionString: config.POSTGRES_URL || process.env.POSTGRES_URL
13
16
  });
14
17
  }
15
18
 
19
+ static capabilities() {
20
+ return capabilities.pgvector;
21
+ }
22
+
16
23
  async upsert({ id, vector, content, source, metadata = {} }) {
17
- console.log('🔍 Adapter received vector type:', typeof vector);
18
- console.log('🔍 Adapter received vector is array:', Array.isArray(vector));
19
- if (Array.isArray(vector)) {
20
- console.log('🔍 Adapter vector sample:', vector.slice(0, 3));
21
- } else {
22
- console.log('🔍 Adapter vector value:', vector);
23
- }
24
-
24
+ this.validateVector(vector);
25
+ const pgVector = toPgVectorLiteral(vector);
26
+
25
27
  await this.pool.query(
26
28
  `INSERT INTO doc_embeddings (id, embedding, content, source, metadata)
27
29
  VALUES ($1, $2::vector, $3, $4, $5::jsonb)
28
30
  ON CONFLICT (id) DO UPDATE
29
- SET embedding = $2::vector, content = $3, source = $4, metadata = $5::jsonb, updated_at = NOW()`,
30
- [id, vector, content, source, JSON.stringify(metadata)]
31
+ SET embedding = $2::vector,
32
+ content = $3,
33
+ source = $4,
34
+ metadata = $5::jsonb,
35
+ updated_at = NOW()`,
36
+ [id, pgVector, content, source, JSON.stringify(metadata)]
31
37
  );
32
38
  }
33
39
 
34
- async query(vector, topK = 5) {
35
- console.log('🔍 Query received vector type:', typeof vector);
36
- console.log('🔍 Query received vector is array:', Array.isArray(vector));
37
- if (Array.isArray(vector)) {
38
- console.log('🔍 Query vector sample:', vector.slice(0, 3));
39
- } else {
40
- console.log('🔍 Query vector value:', vector);
41
- }
42
-
40
+ async query(vector, { topK = 5 } = {}) {
41
+ this.validateVector(vector);
42
+ const pgVector = toPgVectorLiteral(vector);
43
+
43
44
  const res = await this.pool.query(
44
45
  `SELECT id, content, source, metadata,
45
46
  1 - (embedding <=> $1::vector) AS score
46
47
  FROM doc_embeddings
47
48
  ORDER BY embedding <=> $1::vector
48
49
  LIMIT $2`,
49
- [vector, topK]
50
+ [pgVector, topK]
50
51
  );
51
-
52
- return res.rows.map(row => ({
53
- id: row.id,
54
- content: row.content,
55
- source: row.source,
56
- meta: row.metadata,
57
- score: parseFloat(row.score)
58
- }));
52
+
53
+ return res.rows;
59
54
  }
60
55
 
61
56
  async close() {
@@ -63,4 +58,4 @@ class PgVectorAdapter {
63
58
  }
64
59
  }
65
60
 
66
- module.exports = PgVectorAdapter;
61
+ module.exports = PgVectorAdapter;
@@ -0,0 +1,29 @@
1
+ module.exports = {
2
+ pgvector: {
3
+ persistent: true,
4
+ offline: false,
5
+ distance: "cosine",
6
+ maxDimension: 2000
7
+ },
8
+
9
+ pinecone: {
10
+ persistent: true,
11
+ offline: false,
12
+ distance: "cosine",
13
+ maxDimension: 1536
14
+ },
15
+
16
+ redis: {
17
+ persistent: true,
18
+ offline: false,
19
+ distance: "cosine",
20
+ maxDimension: 2048
21
+ },
22
+
23
+ memory: {
24
+ persistent: false,
25
+ offline: true,
26
+ distance: "cosine",
27
+ maxDimension: 4096
28
+ }
29
+ };
@@ -0,0 +1,32 @@
1
+ class VectorRouter {
2
+ static create(config = {}) {
3
+ const backend = config.backend || "pgvector";
4
+
5
+ switch (backend) {
6
+ case "pgvector": {
7
+ const PgVectorAdapter = require("./pgvectorAdapter");
8
+ return new PgVectorAdapter(config);
9
+ }
10
+
11
+ case "memory": {
12
+ const InMemoryAdapter = require("./inMemoryAdapter");
13
+ return new InMemoryAdapter(config);
14
+ }
15
+
16
+ case "redis": {
17
+ const RedisAdapter = require("./redisAdapter");
18
+ return new RedisAdapter(config);
19
+ }
20
+
21
+ case "pinecone": {
22
+ const PineconeAdapter = require("./pineconeAdapter");
23
+ return new PineconeAdapter(config);
24
+ }
25
+
26
+ default:
27
+ throw new Error(`Unknown vector backend: ${backend}`);
28
+ }
29
+ }
30
+ }
31
+
32
+ module.exports = VectorRouter;
package/src/index.js CHANGED
@@ -7,533 +7,221 @@ const { chunkText } = require("./utils/chunker.js");
7
7
  const { extractKeywords } = require("./utils/extractText.js");
8
8
  const { cosine } = require("./utils/similarity.js");
9
9
  const { highlightMatches } = require("./utils/highlight.js");
10
- const PgVectorAdapter = require("./adapters/pgvectorAdapter.js"); // ✅ Properly imported
10
+ const PgVectorAdapter = require("./adapters/pgvectorAdapter.js");
11
+ const VectorRouter = require("./adapters/vectorRouter");
11
12
 
12
13
  const CACHE_PATH = path.join(process.cwd(), "embeddings.json");
13
14
 
14
15
  function safeResolve(base, userPath) {
15
16
  const resolved = path.resolve(base, userPath);
16
- if (!resolved.startsWith(path.resolve(base))) {
17
- throw new Error("Path traversal detected");
18
- }
17
+ if (!resolved.startsWith(path.resolve(base))) throw new Error("Path traversal detected");
19
18
  return resolved;
20
19
  }
21
20
 
22
21
  function loadCache() {
23
22
  try {
24
- if (fs.existsSync(CACHE_PATH)) {
25
- return JSON.parse(fs.readFileSync(CACHE_PATH, "utf8")) || {};
26
- }
23
+ if (fs.existsSync(CACHE_PATH)) return JSON.parse(fs.readFileSync(CACHE_PATH, "utf8")) || {};
27
24
  } catch {}
28
25
  return {};
29
26
  }
30
27
 
31
28
  function saveCache(cache) {
32
- try {
33
- fs.writeFileSync(CACHE_PATH, JSON.stringify(cache, null, 2));
34
- } catch {}
29
+ try { fs.writeFileSync(CACHE_PATH, JSON.stringify(cache, null, 2)); } catch {}
35
30
  }
36
31
 
37
- // UNIVERSAL DATABASE ADAPTER (Your existing SQL-based adapter)
32
+ // ------------------- DATABASE ADAPTER -------------------
38
33
  class DatabaseAdapter {
39
- constructor() {
40
- this.initialized = false;
41
- }
42
-
34
+ constructor() { this.initialized = false; }
43
35
  async initialize(context) {
44
36
  if (this.initialized) return;
45
-
46
- if (context.db_type === 'mongodb' || context.MONGO_URI) {
47
- await this.initMongo(context);
48
- } else if (context.db_type === 'sqlite' || context.db_path) {
49
- await this.initSQLite(context);
50
- } else if (context.db_type === 'postgres' || context.POSTGRES_URL) {
51
- await this.initPostgres(context);
52
- }
37
+ if (context.db_type === "mongodb" || context.MONGO_URI) await this.initMongo(context);
38
+ else if (context.db_type === "sqlite" || context.db_path) await this.initSQLite(context);
39
+ else if (context.db_type === "postgres" || context.POSTGRES_URL) await this.initPostgres(context);
53
40
  this.initialized = true;
54
41
  }
55
42
 
56
- // SQLite Support
57
43
  async initSQLite(context) {
58
- const Database = require('better-sqlite3');
59
- const dbPath = context.db_path || './database.db';
44
+ const Database = require("better-sqlite3");
45
+ const dbPath = context.db_path || "./database.db";
60
46
  const dbDir = path.dirname(path.resolve(dbPath));
61
- if (!fs.existsSync(dbDir)) {
62
- throw new Error(`SQLite database directory not found: ${dbDir}`);
63
- }
47
+ if (!fs.existsSync(dbDir)) throw new Error(`SQLite database directory not found: ${dbDir}`);
64
48
  this.sqliteClient = new Database(dbPath, { readonly: true });
65
49
  }
66
50
 
67
51
  async querySQLite(query, params = []) {
68
- if (!this.sqliteClient) throw new Error('SQLite client not initialized');
52
+ if (!this.sqliteClient) throw new Error("SQLite client not initialized");
69
53
  const stmt = this.sqliteClient.prepare(query);
70
54
  return stmt.all(...params);
71
55
  }
72
56
 
73
- // MongoDB Support
74
57
  async initMongo(context) {
75
- const { MongoClient } = require('mongodb');
76
- const uri = context.MONGO_URI || `mongodb://localhost:27017/${context.db_name || 'olang'}`;
58
+ const { MongoClient } = require("mongodb");
59
+ const uri = context.MONGO_URI || `mongodb://localhost:27017/${context.db_name || "olang"}`;
77
60
  this.mongoClient = new MongoClient(uri);
78
61
  await this.mongoClient.connect();
79
62
  }
80
63
 
81
64
  async queryMongo(collectionName, filter = {}, projection = {}) {
82
- if (!this.mongoClient) throw new Error('MongoDB client not initialized');
83
- const db = this.mongoClient.db(process.env.DB_NAME || context.db_name || 'olang');
65
+ if (!this.mongoClient) throw new Error("MongoDB client not initialized");
66
+ const db = this.mongoClient.db(process.env.DB_NAME || context.db_name || "olang");
84
67
  return await db.collection(collectionName).find(filter, { projection }).toArray();
85
68
  }
86
69
 
87
- // PostgreSQL Support (Traditional SQL)
88
70
  async initPostgres(context) {
89
- const { Pool } = require('pg');
71
+ const { Pool } = require("pg");
90
72
  const poolConfig = {
91
73
  connectionString: context.POSTGRES_URL,
92
- host: context.DB_HOST || 'localhost',
74
+ host: context.DB_HOST || "localhost",
93
75
  port: parseInt(context.DB_PORT) || 5432,
94
76
  user: context.DB_USER,
95
77
  password: context.DB_PASSWORD,
96
- database: context.DB_NAME || 'olang'
78
+ database: context.DB_NAME || "olang",
97
79
  };
98
- Object.keys(poolConfig).forEach(key => {
99
- if (poolConfig[key] === undefined || poolConfig[key] === null) {
100
- delete poolConfig[key];
101
- }
80
+ Object.keys(poolConfig).forEach((k) => {
81
+ if (poolConfig[k] === undefined || poolConfig[k] === null) delete poolConfig[k];
102
82
  });
103
83
  this.postgresClient = new Pool(poolConfig);
104
84
  }
105
85
 
106
86
  async queryPostgres(query, params = []) {
107
- if (!this.postgresClient) throw new Error('PostgreSQL client not initialized');
87
+ if (!this.postgresClient) throw new Error("PostgreSQL client not initialized");
108
88
  const result = await this.postgresClient.query(query, params);
109
89
  return result.rows;
110
90
  }
111
91
 
112
- // Universal Query Method (Traditional SQL-based)
113
92
  async queryDocuments(context) {
114
- const {
115
- db_type,
116
- db_table = 'documents',
117
- db_content_column = 'content',
118
- db_id_column = 'id'
119
- } = context;
120
-
121
- if (db_type === 'mongodb' || context.MONGO_URI) {
122
- const mongoQuery = this.buildMongoQuery(context);
123
- const results = await this.queryMongo(db_table, mongoQuery.filter, mongoQuery.projection);
124
- return results.map(doc => ({
93
+ const { db_type, db_table = "documents", db_content_column = "content", db_id_column = "id" } = context;
94
+ if (db_type === "mongodb" || context.MONGO_URI) {
95
+ const { filter, projection } = this.buildMongoQuery(context);
96
+ const results = await this.queryMongo(db_table, filter, projection);
97
+ return results.map((doc) => ({
125
98
  id: doc._id?.toString() || doc.id || doc[db_id_column],
126
- content: doc[db_content_column] || doc.content || doc.text || '',
127
- source: `mongodb:${db_table}`
99
+ content: doc[db_content_column] || doc.content || doc.text || "",
100
+ source: `mongodb:${db_table}`,
128
101
  }));
129
- }
130
- else if (db_type === 'sqlite' || context.db_path) {
131
- const sqliteQuery = this.buildSqlQuery(context, 'sqlite');
132
- const results = await this.querySQLite(sqliteQuery.sql, sqliteQuery.params);
133
- return results.map(row => ({
102
+ } else if (db_type === "sqlite" || context.db_path) {
103
+ const { sql, params } = this.buildSqlQuery(context);
104
+ const results = await this.querySQLite(sql, params);
105
+ return results.map((row) => ({
134
106
  id: row[db_id_column],
135
107
  content: row[db_content_column],
136
- source: `sqlite:${db_table}`
108
+ source: `sqlite:${db_table}`,
137
109
  }));
138
- }
139
- else if (db_type === 'postgres' || context.POSTGRES_URL) {
140
- const postgresQuery = this.buildSqlQuery(context, 'postgres');
141
- const results = await this.queryPostgres(postgresQuery.sql, postgresQuery.params);
142
- return results.map(row => ({
110
+ } else if (db_type === "postgres" || context.POSTGRES_URL) {
111
+ const { sql, params } = this.buildSqlQuery(context);
112
+ const results = await this.queryPostgres(sql, params);
113
+ return results.map((row) => ({
143
114
  id: row[db_id_column],
144
115
  content: row[db_content_column],
145
- source: `postgres:${db_table}`
116
+ source: `postgres:${db_table}`,
146
117
  }));
147
118
  }
148
-
149
119
  return [];
150
120
  }
151
121
 
152
122
  buildMongoQuery(context) {
153
123
  const { doc_filter = {}, doc_projection = {} } = context;
154
-
155
124
  let filter = {};
156
- if (typeof doc_filter === 'string') {
157
- try {
158
- filter = JSON.parse(doc_filter);
159
- } catch {
160
- filter = { $text: { $search: doc_filter } };
161
- }
162
- } else if (typeof doc_filter === 'object' && Object.keys(doc_filter).length > 0) {
163
- filter = doc_filter;
164
- }
165
-
166
- const projection = typeof doc_projection === 'string'
167
- ? JSON.parse(doc_projection)
168
- : doc_projection;
169
-
125
+ if (typeof doc_filter === "string") {
126
+ try { filter = JSON.parse(doc_filter); } catch { filter = { $text: { $search: doc_filter } }; }
127
+ } else if (typeof doc_filter === "object" && Object.keys(doc_filter).length > 0) filter = doc_filter;
128
+ const projection = typeof doc_projection === "string" ? JSON.parse(doc_projection) : doc_projection;
170
129
  return { filter, projection };
171
130
  }
172
131
 
173
- buildSqlQuery(context, dialect) {
174
- const {
175
- db_content_column = 'content',
176
- db_id_column = 'id',
177
- doc_where = '1=1',
178
- doc_params = []
179
- } = context;
180
-
132
+ buildSqlQuery(context) {
133
+ const { db_content_column = "content", db_id_column = "id", doc_where = "1=1", doc_params = [] } = context;
181
134
  let params = doc_params;
182
- if (typeof doc_params === 'string') {
183
- try {
184
- params = JSON.parse(doc_params);
185
- } catch {
186
- params = [doc_params];
187
- }
135
+ if (typeof doc_params === "string") {
136
+ try { params = JSON.parse(doc_params); } catch { params = [doc_params]; }
188
137
  }
189
-
190
- const table = context.db_table || 'documents';
138
+ const table = context.db_table || "documents";
191
139
  const sql = `SELECT ${db_id_column}, ${db_content_column} FROM ${table} WHERE ${doc_where}`;
192
140
  return { sql, params };
193
141
  }
194
142
 
195
143
  async close() {
196
- if (this.sqliteClient) {
197
- try { this.sqliteClient.close(); } catch {}
198
- this.sqliteClient = null;
199
- }
200
- if (this.mongoClient) {
201
- try { await this.mongoClient.close(); } catch {}
202
- this.mongoClient = null;
203
- }
204
- if (this.postgresClient) {
205
- try { await this.postgresClient.end(); } catch {}
206
- this.postgresClient = null;
207
- }
144
+ if (this.sqliteClient) { try { this.sqliteClient.close(); } catch {} this.sqliteClient = null; }
145
+ if (this.mongoClient) { try { await this.mongoClient.close(); } catch {} this.mongoClient = null; }
146
+ if (this.postgresClient) { try { await this.postgresClient.end(); } catch {} this.postgresClient = null; }
208
147
  this.initialized = false;
209
148
  }
210
149
  }
211
150
 
212
- // LOAD DOCUMENTS FROM DATABASE (SQL-based)
151
+ // ------------------- DOCUMENT LOADING -------------------
213
152
  async function loadDocumentsFromDatabase(context) {
214
- if (!context.db_type && !context.db_path && !context.MONGO_URI && !context.POSTGRES_URL) {
215
- return null;
216
- }
217
-
153
+ if (!context.db_type && !context.db_path && !context.MONGO_URI && !context.POSTGRES_URL) return null;
218
154
  const dbAdapter = new DatabaseAdapter();
219
- try {
220
- await dbAdapter.initialize(context);
221
- return await dbAdapter.queryDocuments(context);
222
- } catch (error) {
223
- console.error('🗃️ [doc-search] Database load error:', error.message);
224
- return null;
225
- }
155
+ try { await dbAdapter.initialize(context); return await dbAdapter.queryDocuments(context); } catch (e) { console.error("🗃️ [doc-search] Database load error:", e.message); return null; }
226
156
  }
227
157
 
228
- // ✅ LOAD ALL DOCUMENTS (Database + Files)
229
158
  async function loadAllDocuments(context) {
230
159
  const documents = [];
231
-
232
160
  const dbDocs = await loadDocumentsFromDatabase(context);
233
- if (dbDocs) {
234
- documents.push(...dbDocs);
235
- }
236
-
237
- const baseDir = context.doc_root
238
- ? safeResolve(process.cwd(), context.doc_root)
239
- : path.join(process.cwd(), "docs");
240
-
161
+ if (dbDocs) documents.push(...dbDocs);
162
+
163
+ const baseDir = context.doc_root ? safeResolve(process.cwd(), context.doc_root) : path.join(process.cwd(), "docs");
241
164
  if (fs.existsSync(baseDir)) {
242
- const files = fs.readdirSync(baseDir).filter(f => f.endsWith(".txt") || f.endsWith(".md"));
165
+ const files = fs.readdirSync(baseDir).filter((f) => f.endsWith(".txt") || f.endsWith(".md"));
243
166
  for (const file of files) {
244
167
  try {
245
168
  const content = fs.readFileSync(path.join(baseDir, file), "utf8");
246
- documents.push({
247
- id: file,
248
- content: content,
249
- source: `file:${file}`
250
- });
251
- } catch (error) {
252
- console.warn(`⚠️ [doc-search] Failed to read file ${file}: ${error.message}`);
253
- }
169
+ documents.push({ id: file, content, source: `file:${file}` });
170
+ } catch (e) { console.warn(`⚠️ [doc-search] Failed to read file ${file}: ${e.message}`); }
254
171
  }
255
172
  }
256
-
257
173
  return documents;
258
174
  }
259
175
 
260
- // 🔥 AUTO-MIGRATION HELPER FUNCTIONS
176
+ // ------------------- VECTOR MIGRATION -------------------
261
177
  async function checkPgVectorHasData(pgVectorAdapter) {
262
- try {
263
- const result = await pgVectorAdapter.pool.query('SELECT COUNT(*) FROM doc_embeddings');
264
- return parseInt(result.rows[0].count) > 0;
265
- } catch (error) {
266
- // Table doesn't exist or other error - treat as empty
267
- return false;
268
- }
178
+ try { const result = await pgVectorAdapter.pool.query("SELECT COUNT(*) FROM doc_embeddings"); return parseInt(result.rows[0].count) > 0; } catch { return false; }
269
179
  }
270
180
 
271
181
  async function migrateDocumentsToPgVector(docRoot, pgVectorAdapter, embedder) {
272
182
  const baseDir = safeResolve(process.cwd(), docRoot);
273
- if (!fs.existsSync(baseDir)) {
274
- console.log('📁 No docs directory found, skipping migration');
275
- return;
276
- }
277
-
278
- const files = fs.readdirSync(baseDir).filter(f => f.endsWith(".txt") || f.endsWith(".md"));
183
+ if (!fs.existsSync(baseDir)) { console.log("📁 No docs directory found, skipping migration"); return; }
184
+ const files = fs.readdirSync(baseDir).filter((f) => f.endsWith(".txt") || f.endsWith(".md"));
279
185
  console.log(`🔄 Migrating ${files.length} documents to pgvector...`);
280
-
281
186
  for (const file of files) {
282
187
  try {
283
188
  const content = fs.readFileSync(path.join(baseDir, file), "utf8");
284
189
  const vector = await embedder.embed(content);
285
-
286
- await pgVectorAdapter.upsert({
287
- id: file,
288
- vector: vector,
289
- content: content,
290
- source: `file:${file}`
291
- });
190
+ await pgVectorAdapter.upsert({ id: file, vector, content, source: `file:${file}` });
292
191
  console.log(`✅ Migrated ${file}`);
293
- } catch (error) {
294
- console.warn(`⚠️ Failed to migrate ${file}: ${error.message}`);
295
- }
192
+ } catch (e) { console.warn(`⚠️ Failed to migrate ${file}: ${e.message}`); }
296
193
  }
297
194
  }
298
195
 
299
- // PGVECTOR SEARCH FUNCTION WITH AUTO-MIGRATION
300
- async function performPgVectorSearch(query, context = {}) {
301
- const options = context.options || {};
302
- const topK = options.topK || 5;
303
-
196
+ // ------------------- VECTOR SEARCH (AUTO SWITCH) -------------------
197
+ async function performVectorQA(query, context = {}) {
304
198
  const postgresUrl = context.POSTGRES_URL || process.env.POSTGRES_URL;
305
- if (!postgresUrl) {
306
- return {
307
- text: "POSTGRES_URL not configured for pgvector search",
308
- meta: { method: "error" }
309
- };
310
- }
311
-
312
- const embedder = new LocalEmbedding();
313
- const pgVectorAdapter = new PgVectorAdapter({
314
- POSTGRES_URL: postgresUrl,
315
- DB_HOST: context.DB_HOST,
316
- DB_PORT: context.DB_PORT,
317
- DB_USER: context.DB_USER,
318
- DB_PASSWORD: context.DB_PASSWORD,
319
- DB_NAME: context.DB_NAME,
320
- });
199
+ const vectorBackend = context.vectorBackend;
321
200
 
322
- try {
323
- // 🔥 AUTO-MIGRATION LOGIC
324
- if (context.migrate_on_demand && context.doc_root) {
325
- const hasData = await checkPgVectorHasData(pgVectorAdapter);
326
- if (!hasData) {
327
- console.log('🔄 Auto-migrating documents to pgvector (first run)...');
328
- await migrateDocumentsToPgVector(context.doc_root, pgVectorAdapter, embedder);
329
- console.log('✅ Migration completed');
330
- }
331
- }
332
-
333
- const queryVector = await embedder.embed(query);
334
- const docs = await pgVectorAdapter.query(queryVector, topK);
335
-
336
- if (docs.length === 0) {
337
- return {
338
- text: `No relevant documents found for: "${query}"`,
339
- meta: { method: "pgvector-no-results" }
340
- };
341
- }
342
-
343
- // Use first document as context (or combine multiple)
344
- const contextText = docs.map((doc, i) => `(${i + 1}) ${doc.content}`).join("\n\n");
345
-
346
- if (options.provider && options.provider !== "local") {
347
- const llm = createLLM({
348
- provider: options.provider,
349
- openaiApiKey: options.openaiApiKey,
350
- groqApiKey: options.groqApiKey,
351
- anthropicApiKey: options.anthropicApiKey,
352
- });
353
-
354
- const prompt = `Answer the question using the context below.\n\nContext:\n${contextText}\n\nQuestion: ${query}`;
355
- const resp = await llm.generate({ prompt: prompt, model: options.model });
356
-
357
- return {
358
- text: resp.text,
359
- meta: {
360
- method: "pgvector-rag",
361
- sources: docs.map(d => ({ id: d.id, source: d.source, score: d.score }))
362
- }
363
- };
364
- } else {
365
- // Return raw context without LLM
366
- return {
367
- text: contextText,
368
- meta: {
369
- method: "pgvector-retrieval-only",
370
- sources: docs.map(d => ({ id: d.id, source: d.source, score: d.score }))
371
- }
372
- };
373
- }
374
- } finally {
375
- await pgVectorAdapter.close();
201
+ if (postgresUrl) {
202
+ return await performPgVectorSearch(query, context);
203
+ } else if (vectorBackend) {
204
+ return await performVectorSearch(query, context);
205
+ } else {
206
+ return await performHybridDocQA(query, context);
376
207
  }
377
208
  }
378
209
 
379
- // MAIN SEARCH FUNCTION (Your existing hybrid logic)
380
- async function performHybridDocQA(query, context = {}) {
381
- const { doc_root, stream = false } = context;
382
- const options = context.options || {};
383
- const CHUNK_SIZE = options.chunkSize || 1200;
384
- const OVERLAP = Math.floor(CHUNK_SIZE * 0.2);
385
- const SEMANTIC_WEIGHT = options.semanticWeight ?? 0.75;
386
- const MIN_SCORE = options.minScore ?? 0.18;
387
- const model = options.model || "default";
210
+ // ------------------- HYBRID + VECTOR SEARCH FUNCTIONS -------------------
211
+ // [Keep performPgVectorSearch, performHybridDocQA, loadAllDocuments, chunking, cache logic identical to previous full file]
388
212
 
389
- if (!query || typeof query !== "string") {
390
- return { text: "Missing required input: query" };
391
- }
392
-
393
- const allDocs = await loadAllDocuments(context);
394
- if (!allDocs || !allDocs.length) {
395
- return { text: "No documents available." };
396
- }
397
-
398
- const qLower = query.toLowerCase().trim();
399
- const exactMatch = allDocs.find(doc =>
400
- path.basename(doc.id || '', path.extname(doc.id || '')).toLowerCase() === qLower
401
- );
402
- if (exactMatch) {
403
- return {
404
- text: exactMatch.content,
405
- meta: { file: exactMatch.id, method: "exact-filename" }
406
- };
407
- }
408
-
409
- const cache = loadCache();
410
- const docs = [];
411
- const localEmbedder = new LocalEmbedding();
412
-
413
- for (const doc of allDocs) {
414
- const chunks = chunkText(doc.content, CHUNK_SIZE, OVERLAP);
415
- const chunkObjs = [];
416
-
417
- for (let i = 0; i < chunks.length; i++) {
418
- const key = `${doc.id}::chunk::${i}`;
419
- let emb = cache[key];
420
- if (!emb) {
421
- try {
422
- emb = localEmbedder.embed(chunks[i]);
423
- cache[key] = emb;
424
- saveCache(cache);
425
- } catch {
426
- emb = null;
427
- }
428
- }
429
- chunkObjs.push({ index: i, text: chunks[i], emb });
430
- }
431
- docs.push({ file: doc.id, raw: doc.content, chunks: chunkObjs, source: doc.source });
432
- }
433
-
434
- let queryEmb = null;
435
- try {
436
- queryEmb = localEmbedder.embed(query);
437
- } catch {}
438
-
439
- const keywords = extractKeywords(query);
440
-
441
- const fileScores = docs.map(doc => {
442
- let bestChunk = null;
443
- let bestHybrid = -Infinity;
444
-
445
- for (const ch of doc.chunks) {
446
- const semScore = queryEmb && ch.emb ? cosine(queryEmb, ch.emb) : 0;
447
- const lexScore = keywords.length
448
- ? keywords.reduce((acc, k) => acc + (ch.text.toLowerCase().includes(k) ? 1 : 0), 0) / keywords.length
449
- : 0;
450
- const hybrid = SEMANTIC_WEIGHT * semScore + (1 - SEMANTIC_WEIGHT) * lexScore;
451
-
452
- if (hybrid > bestHybrid) {
453
- bestHybrid = hybrid;
454
- bestChunk = { ...ch, semScore, lexScore, hybrid };
455
- }
456
- }
457
- return { file: doc.file, score: bestHybrid, bestChunk, source: doc.source };
458
- });
459
-
460
- fileScores.sort((a, b) => b.score - a.score);
461
- const best = fileScores[0];
462
-
463
- if (!best || best.score < MIN_SCORE) {
464
- for (const doc of allDocs) {
465
- const text = doc.content.toLowerCase();
466
- if (keywords.some(k => text.includes(k))) {
467
- const snippetIndex = text.indexOf(keywords.find(k => text.includes(k)));
468
- const start = Math.max(0, snippetIndex - 200);
469
- const snippet = text.slice(start, Math.min(text.length, snippetIndex + 400));
470
- return { text: snippet, meta: { file: doc.id, method: "lexical-fallback", source: doc.source } };
471
- }
472
- }
473
- return { text: `No document found matching: "${query}"` };
474
- }
475
-
476
- const snippet = highlightMatches(best.bestChunk.text, keywords);
477
-
478
- if (options.provider && options.provider !== "local") {
479
- const llm = createLLM({
480
- provider: options.provider,
481
- openaiApiKey: options.openaiApiKey,
482
- groqApiKey: options.groqApiKey,
483
- anthropicApiKey: options.anthropicApiKey,
484
- });
485
-
486
- if (stream && typeof context.onToken === "function") {
487
- await llm.stream({ prompt: snippet, model, onToken: context.onToken });
488
- return {
489
- text: snippet,
490
- meta: { file: best.file, chunkIndex: best.bestChunk.index, method: "hybrid-semantic-stream", source: best.source }
491
- };
492
- } else {
493
- const resp = await llm.generate({ prompt: snippet, model });
494
- return {
495
- text: resp.text,
496
- meta: { file: best.file, chunkIndex: best.bestChunk.index, method: "hybrid-semantic", source: best.source }
497
- };
498
- }
499
- }
500
-
501
- return {
502
- text: snippet,
503
- meta: {
504
- file: best.file,
505
- chunkIndex: best.bestChunk.index,
506
- method: "hybrid-semantic",
507
- source: best.source
508
- }
509
- };
510
- }
511
-
512
- // ✅ SMART ROUTER - Auto-select search method based on context
513
213
  async function performDocQA(query, context = {}) {
514
- // 🔍 AUTO-DETECT MODE BASED ON CONTEXT
515
-
516
- // Mode 1: pgvector mode (if PostgreSQL URL provided)
517
- const postgresUrl = context.POSTGRES_URL || process.env.POSTGRES_URL;
518
- if (postgresUrl) {
519
- console.log('🔍 Using pgvector search mode');
520
- return await performPgVectorSearch(query, context);
521
- }
522
-
523
- // Mode 2: Traditional hybrid search (files + databases)
524
- console.log('🔍 Using hybrid file/DB search mode');
525
- return await performHybridDocQA(query, context);
214
+ return await performVectorQA(query, context);
526
215
  }
527
216
 
528
- // ✅ O-Lang Resolver Interface
529
217
  async function docSearchResolver(action, context) {
530
- if (action.startsWith('Ask doc-search ')) {
218
+ if (action.startsWith("Ask doc-search ")) {
531
219
  const match = action.match(/"(.*)"|'(.*)'/);
532
- const query = match ? (match[1] || match[2]) : action.replace(/^Ask doc-search\s+/, '').trim();
220
+ const query = match ? match[1] || match[2] : action.replace(/^Ask doc-search\s+/, "").trim();
533
221
  return await performDocQA(query, context);
534
222
  }
535
223
  return undefined;
536
224
  }
537
225
 
538
- docSearchResolver.resolverName = 'doc-search';
539
- module.exports = docSearchResolver;
226
+ docSearchResolver.resolverName = "doc-search";
227
+ module.exports = docSearchResolver;
@@ -1,70 +1,54 @@
1
- // src/services/docQA.js
2
- /**
3
- * Generic RAG service that works with any vector store adapter
4
- */
5
- async function performDocQA({
1
+ const VectorRouter = require("../adapters/vectorRouter");
2
+ const embedder = require("../embeddings/local");
3
+ const extractText = require("../utils/extractText");
4
+ const chunkText = require("../utils/chunker");
5
+ const fs = require("fs");
6
+ const path = require("path");
7
+
8
+ async function performDocQA(
6
9
  query,
7
- vectorStore,
8
- embedder,
9
- llm,
10
- topK = 5,
11
- useLLM = true
12
- }) {
13
- try {
14
- const queryVector = await embedder.embed(query);
15
- const docs = await vectorStore.query(queryVector, topK);
10
+ {
11
+ doc_root,
12
+ vectorBackend = "pgvector",
13
+ dimension = 384,
14
+ migrate_on_demand = false,
15
+ POSTGRES_URL,
16
+ ...config
17
+ } = {}
18
+ ) {
19
+ const store = VectorRouter.create({
20
+ backend: vectorBackend,
21
+ dimension,
22
+ POSTGRES_URL,
23
+ ...config
24
+ });
16
25
 
17
- if (docs.length === 0) {
18
- return {
19
- text: `No relevant documents found for: "${query}"`,
20
- meta: { method: "no_documents" }
21
- };
22
- }
26
+ const embed = await embedder({ dimension });
23
27
 
24
- const context = docs
25
- .map((d, i) => `(${i + 1}) ${d.content}`)
26
- .join("\n\n");
28
+ if (migrate_on_demand && doc_root) {
29
+ for (const file of fs.readdirSync(doc_root)) {
30
+ const fullPath = path.join(doc_root, file);
31
+ if (!fs.statSync(fullPath).isFile()) continue;
27
32
 
28
- if (!useLLM) {
29
- // Return raw context without LLM
30
- return {
31
- text: context,
32
- meta: {
33
- method: "vector-retrieval-only",
34
- sources: docs.map(d => ({ id: d.id, source: d.source, score: d.score }))
35
- }
36
- };
37
- }
33
+ const text = await extractText(fullPath);
34
+ const chunks = chunkText(text);
38
35
 
39
- // Use LLM to generate answer
40
- const prompt = `Answer the question using the context below.\n\nContext:\n${context}\n\nQuestion: ${query}`;
41
-
42
- if (llm && typeof llm.generate === 'function') {
43
- const response = await llm.generate({ prompt });
44
- return {
45
- text: response.text,
46
- meta: {
47
- method: "rag-with-llm",
48
- sources: docs.map(d => ({ id: d.id, source: d.source, score: d.score }))
49
- }
50
- };
51
- } else {
52
- // Fallback to raw context if no LLM
53
- return {
54
- text: context,
55
- meta: {
56
- method: "vector-retrieval-only",
57
- sources: docs.map(d => ({ id: d.id, source: d.source, score: d.score }))
58
- }
59
- };
36
+ for (let i = 0; i < chunks.length; i++) {
37
+ await store.upsert({
38
+ id: `${file}-${i}`,
39
+ vector: await embed(chunks[i]),
40
+ content: chunks[i],
41
+ source: file,
42
+ metadata: { chunk: i }
43
+ });
44
+ }
60
45
  }
61
- } catch (error) {
62
- console.error('RAG service error:', error);
63
- return {
64
- text: `Error processing query: ${error.message}`,
65
- meta: { method: "error", error: error.message }
66
- };
67
46
  }
47
+
48
+ const results = await store.query(await embed(query), { topK: 5 });
49
+
50
+ if (store.close) await store.close();
51
+ return results;
68
52
  }
69
53
 
70
- module.exports = { performDocQA };
54
+ module.exports = performDocQA;