@purplesquirrel/watsonx-mcp-server 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/dependabot.yml +21 -0
- package/.github/workflows/ci.yml +36 -0
- package/LICENSE +21 -0
- package/README.md +245 -0
- package/TROUBLESHOOTING.md +176 -0
- package/batch-processor.js +345 -0
- package/batch-results/classify-1765765720041.json +106 -0
- package/batch-results/full-analysis-1765765676586.json +193 -0
- package/docs/index.html +572 -0
- package/docs/specs.html +613 -0
- package/document-analyzer.js +353 -0
- package/embedding-index.js +318 -0
- package/embeddings-index.json +38761 -0
- package/index.js +551 -0
- package/linkedin-post.md +92 -0
- package/package.json +28 -0
- package/test-watsonx.js +29 -0
|
@@ -0,0 +1,353 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* watsonx Document Analyzer
|
|
4
|
+
* Analyzes documents from external drive using IBM watsonx.ai
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
import { WatsonXAI } from "@ibm-cloud/watsonx-ai";
|
|
8
|
+
import { IamAuthenticator } from "ibm-cloud-sdk-core";
|
|
9
|
+
import fs from "fs/promises";
|
|
10
|
+
import path from "path";
|
|
11
|
+
|
|
12
|
+
// Configuration
// Credentials and endpoint are read from the environment; only the service
// URL has a built-in default.
const WATSONX_API_KEY = process.env.WATSONX_API_KEY;
const WATSONX_URL = process.env.WATSONX_URL || "https://us-south.ml.cloud.ibm.com";
const WATSONX_SPACE_ID = process.env.WATSONX_SPACE_ID;

// External drive paths
// The root used to be fixed to one machine's mount point. It can now be
// overridden with WATSONX_EXTERNAL_DRIVE; the previous value stays the default
// so existing setups keep working unchanged.
const EXTERNAL_DRIVE = process.env.WATSONX_EXTERNAL_DRIVE || "/Volumes/Virtual Server/_NEW";
const DOCUMENTS_PATH = `${EXTERNAL_DRIVE}/Documents`;
const TRAINING_PATH = `${EXTERNAL_DRIVE}/Code/AI/Training`;
|
|
21
|
+
|
|
22
|
+
// Initialize watsonx client
// Holds the single client instance once it has been created.
let client = null;

/**
 * Return the shared watsonx.ai client, creating it on first use.
 *
 * @returns {object|null} The client, or null when WATSONX_API_KEY is not set
 *   and no client has been created yet.
 */
function getClient() {
  // Nothing to do if the client already exists or cannot be built.
  if (client || !WATSONX_API_KEY) {
    return client;
  }

  const authenticator = new IamAuthenticator({
    apikey: WATSONX_API_KEY,
  });
  client = WatsonXAI.newInstance({
    version: "2024-05-31",
    serviceUrl: WATSONX_URL,
    authenticator,
  });
  return client;
}
|
|
37
|
+
|
|
38
|
+
/**
 * Summarize a document using the watsonx Granite model.
 *
 * Only the first 4000 characters of `text` are sent; longer input is cut at
 * that point and suffixed with "...".
 *
 * @param {string} text - Document text to summarize.
 * @param {number} [maxLength=200] - Word budget quoted to the model in the prompt.
 * @returns {Promise<string>} Trimmed generated text, or "" when the response
 *   contains none.
 * @throws {Error} When no watsonx client is available.
 */
async function summarizeDocument(text, maxLength = 200) {
  const watsonx = getClient();
  if (!watsonx) {
    throw new Error("watsonx client not initialized");
  }

  // Truncate very long documents
  const charLimit = 4000;
  const excerpt = text.length > charLimit ? `${text.substring(0, charLimit)}...` : text;

  const prompt = [
    `Summarize the following document in ${maxLength} words or less. Focus on the key points and main ideas.`,
    "",
    "Document:",
    excerpt,
    "",
    "Summary:",
  ].join("\n");

  const { result } = await watsonx.generateText({
    modelId: "ibm/granite-3-3-8b-instruct",
    spaceId: WATSONX_SPACE_ID,
    input: prompt,
    parameters: {
      max_new_tokens: 300,
      temperature: 0.3,
      stop_sequences: ["\n\n"],
    },
  });

  const generated = result.results?.[0]?.generated_text;
  return generated?.trim() || "";
}
|
|
66
|
+
|
|
67
|
+
/**
 * Generate one embedding vector per input text with the watsonx Slate model.
 *
 * @param {string[]} texts - Texts to embed, sent as a single request.
 * @returns {Promise<Array>} The `embedding` field of each result, or [] when
 *   the response carries no results.
 * @throws {Error} When no watsonx client is available.
 */
async function generateEmbeddings(texts) {
  const watsonx = getClient();
  if (!watsonx) {
    throw new Error("watsonx client not initialized");
  }

  const request = {
    modelId: "ibm/slate-125m-english-rtrvr-v2",
    spaceId: WATSONX_SPACE_ID,
    inputs: texts,
  };
  const { result } = await watsonx.embedText(request);

  const vectors = result.results?.map(({ embedding }) => embedding);
  return vectors || [];
}
|
|
82
|
+
|
|
83
|
+
/**
 * Ask the watsonx Granite model for a document's type, main topics, key
 * entities and sentiment.
 *
 * Only the first 3000 characters of `text` are sent; longer input is cut at
 * that point and suffixed with "...".
 *
 * @param {string} text - Document text to analyze.
 * @returns {Promise<string>} Trimmed generated text, or "" when the response
 *   contains none.
 * @throws {Error} When no watsonx client is available.
 */
async function analyzeDocument(text) {
  const watsonx = getClient();
  if (!watsonx) {
    throw new Error("watsonx client not initialized");
  }

  const charLimit = 3000;
  const excerpt = text.length > charLimit ? `${text.substring(0, charLimit)}...` : text;

  const prompt = [
    "Analyze the following document and provide:",
    "1. Document Type (e.g., technical documentation, article, notes, code, etc.)",
    "2. Main Topics (comma-separated list of 3-5 topics)",
    "3. Key Entities (people, organizations, technologies mentioned)",
    "4. Sentiment (positive, negative, neutral)",
    "",
    "Document:",
    excerpt,
    "",
    "Analysis:",
  ].join("\n");

  const { result } = await watsonx.generateText({
    modelId: "ibm/granite-3-3-8b-instruct",
    spaceId: WATSONX_SPACE_ID,
    input: prompt,
    parameters: {
      max_new_tokens: 300,
      temperature: 0.2,
    },
  });

  const generated = result.results?.[0]?.generated_text;
  return generated?.trim() || "";
}
|
|
113
|
+
|
|
114
|
+
/**
 * Answer a question about a document with the watsonx Granite model.
 *
 * Only the first 3500 characters of `text` are sent; longer input is cut at
 * that point and suffixed with "...".
 *
 * @param {string} text - Document text used as context.
 * @param {string} question - Question inserted into the prompt.
 * @returns {Promise<string>} Trimmed generated text, or "" when the response
 *   contains none.
 * @throws {Error} When no watsonx client is available.
 */
async function questionDocument(text, question) {
  const watsonx = getClient();
  if (!watsonx) {
    throw new Error("watsonx client not initialized");
  }

  const charLimit = 3500;
  const excerpt = text.length > charLimit ? `${text.substring(0, charLimit)}...` : text;

  const prompt = [
    "Based on the following document, answer the question.",
    "",
    "Document:",
    excerpt,
    "",
    `Question: ${question}`,
    "",
    "Answer:",
  ].join("\n");

  const { result } = await watsonx.generateText({
    modelId: "ibm/granite-3-3-8b-instruct",
    spaceId: WATSONX_SPACE_ID,
    input: prompt,
    parameters: {
      max_new_tokens: 300,
      temperature: 0.3,
    },
  });

  const generated = result.results?.[0]?.generated_text;
  return generated?.trim() || "";
}
|
|
142
|
+
|
|
143
|
+
/**
 * Calculate the cosine similarity between two numeric vectors.
 *
 * Iterates over the indices of `a`, so `b` is expected to have at least as
 * many elements.
 *
 * @param {number[]} a - First vector.
 * @param {number[]} b - Second vector.
 * @returns {number} Similarity in [-1, 1]. Returns 0 when either vector has
 *   zero magnitude (including empty vectors), where the cosine is undefined.
 */
function cosineSimilarity(a, b) {
  let dotProduct = 0;
  let normA = 0;
  let normB = 0;
  for (let i = 0; i < a.length; i++) {
    dotProduct += a[i] * b[i];
    normA += a[i] * a[i];
    normB += b[i] * b[i];
  }

  const magnitude = Math.sqrt(normA) * Math.sqrt(normB);
  // Without this guard a zero vector yields 0/0 = NaN, and NaN scores make
  // the callers' sort comparators return NaN (undefined ordering).
  if (magnitude === 0) {
    return 0;
  }
  return dotProduct / magnitude;
}
|
|
157
|
+
|
|
158
|
+
/**
 * Rank documents by cosine similarity between their embeddings and the
 * embedding of a query.
 *
 * @param {string} query - Search text; embedded with generateEmbeddings.
 * @param {Array} documentEmbeddings - One embedding per document.
 * @param {Array} documents - Documents, index-aligned with documentEmbeddings.
 * @param {number} [topK=5] - Maximum number of results to return.
 * @returns {Promise<Array<{index: number, similarity: number, document: *}>>}
 *   The best matches, highest similarity first.
 */
async function semanticSearch(query, documentEmbeddings, documents, topK = 5) {
  const embedded = await generateEmbeddings([query]);
  const queryEmbedding = embedded[0];

  const byDescendingSimilarity = (left, right) => right.similarity - left.similarity;

  const scored = documentEmbeddings.map((embedding, index) => {
    const similarity = cosineSimilarity(queryEmbedding, embedding);
    return { index, similarity, document: documents[index] };
  });

  return scored.sort(byDescendingSimilarity).slice(0, topK);
}
|
|
173
|
+
|
|
174
|
+
/**
 * Read and parse documents_catalog.json from the training directory.
 *
 * @returns {Promise<*>} The parsed JSON content of the catalog file.
 */
async function loadDocumentCatalog() {
  const raw = await fs.readFile(`${TRAINING_PATH}/documents_catalog.json`, "utf-8");
  return JSON.parse(raw);
}
|
|
182
|
+
|
|
183
|
+
/**
 * Read up to `count` .txt files from the documents directory.
 *
 * Files are taken in the order returned by the directory listing. A file that
 * cannot be read is skipped without reporting an error.
 *
 * @param {number} [count=10] - Maximum number of .txt files to consider.
 * @returns {Promise<Array<{filename: string, content: string}>>} One entry
 *   per readable file, with content limited to its first 5000 characters.
 */
async function readSampleDocuments(count = 10) {
  const entries = await fs.readdir(DOCUMENTS_PATH);
  const selected = entries.filter((name) => name.endsWith(".txt")).slice(0, count);

  const documents = [];
  for (const filename of selected) {
    let content;
    try {
      content = await fs.readFile(`${DOCUMENTS_PATH}/${filename}`, "utf-8");
    } catch {
      // Skip files that can't be read
      continue;
    }
    // Limit size
    documents.push({ filename, content: content.substring(0, 5000) });
  }
  return documents;
}
|
|
204
|
+
|
|
205
|
+
// Main execution

/**
 * Print a titled block of multi-line text framed by dashed rules.
 *
 * @param {string} title - Heading printed above the frame.
 * @param {string} body - Text to print; continuation lines get a leading space.
 */
function printSection(title, body) {
  const rule = " " + "-".repeat(60);
  console.log(` ${title}`);
  console.log(rule);
  console.log(" " + body.split("\n").join("\n "));
  console.log(rule);
}

/**
 * Read a document from DOCUMENTS_PATH, run a model task on it and print the
 * result. A failure is reported on stderr and marks the process as failed
 * (exit status 1) instead of leaving it at 0.
 *
 * @param {string} filename - File name relative to DOCUMENTS_PATH.
 * @param {{title: string, progress: string, showLength?: boolean}} options -
 *   Section title, progress message, and whether to print the document length.
 * @param {(content: string) => Promise<string>} task - Produces the text to print.
 */
async function runOnDocument(filename, { title, progress, showLength = true }, task) {
  try {
    const content = await fs.readFile(`${DOCUMENTS_PATH}/${filename}`, "utf-8");
    if (showLength) {
      console.log(` Document length: ${content.length} characters`);
    }
    console.log(progress);
    const output = await task(content);
    printSection(title, output);
  } catch (err) {
    console.error(` Error: ${err.message}`);
    // Let callers (shells, CI) see that the command did not succeed.
    process.exitCode = 1;
  }
}

/**
 * Command-line entry point: prints the banner, checks that credentials are
 * configured and dispatches on the command in process.argv[2].
 *
 * Exits with status 1 when credentials or required arguments are missing.
 * Prints the usage text when the command is missing or unknown.
 */
async function main() {
  const [, , command, arg, extraArg] = process.argv;

  console.log("╔══════════════════════════════════════════════════════════════╗");
  console.log("║ watsonx Document Analyzer ║");
  console.log("║ Powered by IBM Granite 3.3 ║");
  console.log("╚══════════════════════════════════════════════════════════════╝");
  console.log("");

  if (!WATSONX_API_KEY || !WATSONX_SPACE_ID) {
    console.error("Error: WATSONX_API_KEY and WATSONX_SPACE_ID must be set");
    process.exit(1);
  }

  switch (command) {
    case "catalog": {
      console.log("📚 Loading document catalog...");
      const catalog = await loadDocumentCatalog();
      console.log(` Total documents: ${catalog.total}`);
      console.log(` Sample documents:`);
      for (const doc of catalog.documents.slice(0, 10)) {
        console.log(` - ${doc.filename} (${doc.type}, ${doc.size_bytes} bytes)`);
      }
      break;
    }

    case "summarize": {
      const filename = arg || "1002519.txt";
      console.log(`📝 Summarizing: ${filename}`);
      await runOnDocument(
        filename,
        { title: "Summary:", progress: "\n Generating summary with watsonx...\n" },
        (content) => summarizeDocument(content),
      );
      break;
    }

    case "analyze": {
      const filename = arg || "1002519.txt";
      console.log(`🔍 Analyzing: ${filename}`);
      await runOnDocument(
        filename,
        { title: "Analysis:", progress: "\n Analyzing with watsonx...\n" },
        (content) => analyzeDocument(content),
      );
      break;
    }

    case "question": {
      const filename = arg;
      const question = extraArg;
      if (!filename || !question) {
        console.log("Usage: document-analyzer.js question <filename> '<question>'");
        process.exit(1);
      }
      console.log(`❓ Asking question about: ${filename}`);
      console.log(` Question: ${question}`);
      await runOnDocument(
        filename,
        { title: "Answer:", progress: "\n Answering with watsonx...\n", showLength: false },
        (content) => questionDocument(content, question),
      );
      break;
    }

    case "embed": {
      console.log("🔢 Generating embeddings for sample documents...");
      const docs = await readSampleDocuments(5);
      console.log(` Loaded ${docs.length} documents`);

      // Only the first 500 characters of each document are embedded.
      const texts = docs.map((d) => d.content.substring(0, 500));
      console.log(" Generating embeddings with watsonx...");
      const embeddings = await generateEmbeddings(texts);

      console.log("\n Embeddings generated:");
      docs.forEach((doc, i) => {
        console.log(` - ${doc.filename}: ${embeddings[i]?.length || 0} dimensions`);
      });
      break;
    }

    case "search": {
      const query = arg;
      if (!query) {
        console.log("Usage: document-analyzer.js search '<query>'");
        process.exit(1);
      }
      console.log(`🔍 Semantic search: "${query}"`);
      console.log(" Loading sample documents...");
      const docs = await readSampleDocuments(20);
      console.log(` Loaded ${docs.length} documents`);

      console.log(" Generating embeddings...");
      const texts = docs.map((d) => d.content.substring(0, 500));
      const embeddings = await generateEmbeddings(texts);

      console.log(" Searching...\n");
      const results = await semanticSearch(query, embeddings, docs, 5);

      console.log(" Top results:");
      results.forEach((r, i) => {
        console.log(` ${i + 1}. ${r.document.filename} (similarity: ${r.similarity.toFixed(4)})`);
        console.log(` ${r.document.content.substring(0, 100).replace(/\n/g, " ")}...`);
      });
      break;
    }

    default: {
      const usage = [
        "Usage: document-analyzer.js <command> [args]",
        "",
        "Commands:",
        " catalog - List documents from training catalog",
        " summarize [file] - Summarize a document",
        " analyze [file] - Analyze document type and topics",
        " question <file> <q> - Ask a question about a document",
        " embed - Generate embeddings for sample docs",
        " search <query> - Semantic search across documents",
        "",
        "Examples:",
        " document-analyzer.js summarize 1002519.txt",
        " document-analyzer.js analyze 1002519.txt",
        " document-analyzer.js question 1002519.txt 'What is this about?'",
        " document-analyzer.js search 'IBM Cloud Satellite'",
      ];
      for (const line of usage) {
        console.log(line);
      }
    }
  }
}

main().catch((err) => {
  console.error(err);
  // An unexpected failure must not look like success to the calling shell.
  process.exitCode = 1;
});
|
|
@@ -0,0 +1,318 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* watsonx Embedding Index
|
|
4
|
+
* Builds and queries a persistent embedding index for RAG
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
import { WatsonXAI } from "@ibm-cloud/watsonx-ai";
|
|
8
|
+
import { IamAuthenticator } from "ibm-cloud-sdk-core";
|
|
9
|
+
import fs from "fs/promises";
|
|
10
|
+
import path from "path";
|
|
11
|
+
|
|
12
|
+
// Configuration
// Credentials and endpoint are read from the environment; only the service
// URL has a built-in default.
const WATSONX_API_KEY = process.env.WATSONX_API_KEY;
const WATSONX_URL = process.env.WATSONX_URL || "https://us-south.ml.cloud.ibm.com";
const WATSONX_SPACE_ID = process.env.WATSONX_SPACE_ID;

// Paths
// Both locations used to be fixed to one machine's file system. They can now
// be overridden with WATSONX_EXTERNAL_DRIVE and WATSONX_INDEX_PATH; the
// previous values stay the defaults so existing setups keep working unchanged.
const EXTERNAL_DRIVE = process.env.WATSONX_EXTERNAL_DRIVE || "/Volumes/Virtual Server/_NEW";
const DOCUMENTS_PATH = `${EXTERNAL_DRIVE}/Documents`;
const INDEX_PATH =
  process.env.WATSONX_INDEX_PATH || "/Users/matthewkarsten/watsonx-mcp-server/embeddings-index.json";
|
|
21
|
+
|
|
22
|
+
// Holds the single watsonx client instance once it has been created.
let client = null;

/**
 * Return the shared watsonx.ai client, creating it on first use.
 *
 * @returns {object|null} The client, or null when WATSONX_API_KEY is not set
 *   and no client has been created yet.
 */
function getClient() {
  // Nothing to do if the client already exists or cannot be built.
  if (client || !WATSONX_API_KEY) {
    return client;
  }

  const authenticator = new IamAuthenticator({
    apikey: WATSONX_API_KEY,
  });
  client = WatsonXAI.newInstance({
    version: "2024-05-31",
    serviceUrl: WATSONX_URL,
    authenticator,
  });
  return client;
}
|
|
36
|
+
|
|
37
|
+
/**
 * Generate embeddings for texts (batch) with the watsonx Slate model.
 *
 * @param {string[]} texts - Texts to embed, sent as a single request.
 * @returns {Promise<Array>} The `embedding` field of each result, or [] when
 *   the response carries no results.
 * @throws {Error} When no watsonx client is available.
 */
async function generateEmbeddings(texts) {
  const watsonx = getClient();
  // getClient() returns null without an API key; fail with a clear message
  // rather than a TypeError from calling embedText on null.
  if (!watsonx) {
    throw new Error("watsonx client not initialized");
  }

  const response = await watsonx.embedText({
    modelId: "ibm/slate-125m-english-rtrvr-v2",
    spaceId: WATSONX_SPACE_ID,
    inputs: texts,
  });
  return response.result.results?.map((r) => r.embedding) || [];
}
|
|
49
|
+
|
|
50
|
+
/**
 * Calculate the cosine similarity between two numeric vectors.
 *
 * Iterates over the indices of `a`, so `b` is expected to have at least as
 * many elements.
 *
 * @param {number[]} a - First vector.
 * @param {number[]} b - Second vector.
 * @returns {number} Similarity in [-1, 1]. Returns 0 when either vector has
 *   zero magnitude (including empty vectors), where the cosine is undefined.
 */
function cosineSimilarity(a, b) {
  let dot = 0;
  let normA = 0;
  let normB = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    normA += a[i] * a[i];
    normB += b[i] * b[i];
  }

  const magnitude = Math.sqrt(normA) * Math.sqrt(normB);
  // Without this guard a zero vector yields 0/0 = NaN, and NaN scores make
  // the callers' sort comparators return NaN (undefined ordering).
  if (magnitude === 0) {
    return 0;
  }
  return dot / magnitude;
}
|
|
62
|
+
|
|
63
|
+
/**
 * Load the embedding index from INDEX_PATH, or create an empty one.
 *
 * A missing file is the normal "not built yet" case and silently yields an
 * empty index. Any other failure (unreadable file, invalid JSON) also yields
 * an empty index, but is reported on stderr so that a damaged index is not
 * mistaken for one that was never built.
 *
 * @returns {Promise<{documents: Array, embeddings: Array, metadata: object}>}
 *   The parsed index file, or a fresh empty index.
 */
async function loadIndex() {
  try {
    const data = await fs.readFile(INDEX_PATH, "utf-8");
    return JSON.parse(data);
  } catch (err) {
    if (err?.code !== "ENOENT") {
      console.warn(` Warning: could not load index at ${INDEX_PATH}: ${err?.message ?? err}`);
    }
    return { documents: [], embeddings: [], metadata: { created: new Date().toISOString(), count: 0 } };
  }
}
|
|
74
|
+
|
|
75
|
+
/**
 * Stamp the index metadata and write the index to INDEX_PATH as
 * pretty-printed JSON.
 *
 * Mutates `index.metadata`: sets `updated` to the current time and `count`
 * to the number of documents.
 *
 * @param {{documents: Array, embeddings: Array, metadata: object}} index -
 *   Index to persist.
 * @returns {Promise<void>}
 */
async function saveIndex(index) {
  Object.assign(index.metadata, {
    updated: new Date().toISOString(),
    count: index.documents.length,
  });

  const serialized = JSON.stringify(index, null, 2);
  await fs.writeFile(INDEX_PATH, serialized);
}
|
|
83
|
+
|
|
84
|
+
/**
 * Build the embedding index from the .txt files in DOCUMENTS_PATH and save it.
 *
 * Files are embedded in batches of 10, using the first 500 characters of each
 * file. Unreadable files are skipped. The previous index is replaced, not
 * merged.
 *
 * @param {number} [maxDocs=100] - Maximum number of .txt files to index.
 * @returns {Promise<{documents: Array, embeddings: Array, metadata: object}>}
 *   The index that was written.
 * @throws {Error} When the embedding service returns a different number of
 *   vectors than texts submitted, which would misalign documents and
 *   embeddings in the stored index.
 */
async function buildIndex(maxDocs = 100) {
  console.log("📚 Building embedding index...");
  console.log(` Source: ${DOCUMENTS_PATH}`);
  console.log(` Max documents: ${maxDocs}`);

  const files = await fs.readdir(DOCUMENTS_PATH);
  const txtFiles = files.filter((f) => f.endsWith(".txt")).slice(0, maxDocs);

  console.log(` Found ${txtFiles.length} text files`);

  const index = { documents: [], embeddings: [], metadata: { created: new Date().toISOString() } };

  // Process in batches of 10
  const batchSize = 10;
  const totalBatches = Math.ceil(txtFiles.length / batchSize);
  for (let i = 0; i < txtFiles.length; i += batchSize) {
    const batch = txtFiles.slice(i, i + batchSize);
    const texts = [];
    const docs = [];

    for (const file of batch) {
      let content;
      try {
        content = await fs.readFile(`${DOCUMENTS_PATH}/${file}`, "utf-8");
      } catch {
        // Skip unreadable files
        continue;
      }
      const truncated = content.substring(0, 500); // First 500 chars for embedding
      texts.push(truncated);
      docs.push({
        filename: file,
        preview: truncated.substring(0, 200).replace(/\n/g, " "),
        length: content.length,
      });
    }

    if (texts.length === 0) {
      continue;
    }

    console.log(` Processing batch ${Math.floor(i / batchSize) + 1}/${totalBatches}...`);
    const embeddings = await generateEmbeddings(texts);

    // documents[k] and embeddings[k] must describe the same file; a short
    // response would otherwise store undefined vectors that break queries.
    if (embeddings.length !== docs.length) {
      throw new Error(
        `Embedding service returned ${embeddings.length} vectors for ${docs.length} documents`,
      );
    }

    index.documents.push(...docs);
    index.embeddings.push(...embeddings);
  }

  await saveIndex(index);
  console.log(`\n✅ Index built with ${index.documents.length} documents`);
  console.log(` Saved to: ${INDEX_PATH}`);

  return index;
}
|
|
138
|
+
|
|
139
|
+
/**
 * Search the stored index for the documents most similar to a query.
 *
 * @param {string} query - Search text; embedded with generateEmbeddings.
 * @param {number} [topK=5] - Maximum number of results to return.
 * @returns {Promise<Array>} Index document entries extended with a
 *   `similarity` score, highest first; [] when the index has no documents.
 */
async function queryIndex(query, topK = 5) {
  console.log(`🔍 Searching: "${query}"`);

  const index = await loadIndex();
  const documentCount = index.documents.length;
  if (documentCount === 0) {
    console.log(" Index is empty. Run 'build' first.");
    return [];
  }

  console.log(` Searching ${documentCount} documents...`);

  const embedded = await generateEmbeddings([query]);
  const queryEmbedding = embedded[0];

  const byDescendingSimilarity = (left, right) => right.similarity - left.similarity;

  const scored = index.embeddings.map((vector, position) => {
    const similarity = cosineSimilarity(queryEmbedding, vector);
    return { ...index.documents[position], similarity };
  });

  return scored.sort(byDescendingSimilarity).slice(0, topK);
}
|
|
163
|
+
|
|
164
|
+
/**
 * RAG: retrieve the documents most similar to a question, then ask the
 * watsonx Granite model to answer from their content.
 *
 * Prints the retrieved documents, the answer and the source list to stdout.
 * Returns early when the index yields no results.
 *
 * @param {string} question - Question to answer.
 * @param {number} [topK=3] - Number of documents to retrieve as context.
 * @returns {Promise<void>}
 */
async function ragQuery(question, topK = 3) {
  console.log(`💡 RAG Query: "${question}"`);

  // Retrieve relevant documents
  const matches = await queryIndex(question, topK);
  if (matches.length === 0) {
    console.log(" No documents found. Build index first.");
    return;
  }

  // Load the first 1500 characters of each match; unreadable files are skipped
  const contexts = [];
  for (const { filename, similarity } of matches) {
    let content;
    try {
      content = await fs.readFile(`${DOCUMENTS_PATH}/${filename}`, "utf-8");
    } catch {
      // Skip
      continue;
    }
    contexts.push({ filename, content: content.substring(0, 1500), similarity });
  }

  console.log(`\n Retrieved ${contexts.length} relevant documents:`);
  contexts.forEach((context, position) => {
    console.log(` ${position + 1}. ${context.filename} (similarity: ${context.similarity.toFixed(4)})`);
  });

  // Generate answer using watsonx
  const watsonx = getClient();
  const contextText = contexts
    .map((context) => `[${context.filename}]\n${context.content}`)
    .join("\n\n---\n\n");

  console.log("\n Generating answer with Granite 3.3...\n");

  const prompt = [
    "You are a helpful assistant. Answer the question based on the provided context documents. If the answer is not in the context, say so.",
    "",
    "Context Documents:",
    contextText,
    "",
    `Question: ${question}`,
    "",
    "Answer:",
  ].join("\n");

  const { result } = await watsonx.generateText({
    modelId: "ibm/granite-3-3-8b-instruct",
    spaceId: WATSONX_SPACE_ID,
    input: prompt,
    parameters: {
      max_new_tokens: 400,
      temperature: 0.3,
    },
  });

  const generated = result.results?.[0]?.generated_text;
  const answer = generated?.trim() || "No answer generated";

  const rule = " " + "─".repeat(60);
  console.log(rule);
  console.log(" Answer:");
  console.log(" " + answer.split("\n").join("\n "));
  console.log(rule);
  console.log("\n Sources:");
  for (const context of contexts) {
    console.log(` - ${context.filename}`);
  }
}
|
|
230
|
+
|
|
231
|
+
/**
 * Print summary statistics for the stored index: document count, creation
 * and update timestamps, file location, and up to five sample documents.
 *
 * @returns {Promise<void>}
 */
async function showStats() {
  const { documents, metadata } = await loadIndex();

  const lines = [
    "📊 Index Statistics",
    " " + "─".repeat(40),
    ` Documents indexed: ${documents.length}`,
    ` Created: ${metadata.created || "N/A"}`,
    ` Updated: ${metadata.updated || "N/A"}`,
    ` Index file: ${INDEX_PATH}`,
  ];
  for (const line of lines) {
    console.log(line);
  }

  if (documents.length === 0) {
    return;
  }

  console.log("\n Sample documents:");
  for (const entry of documents.slice(0, 5)) {
    console.log(` - ${entry.filename} (${entry.length} chars)`);
  }
}
|
|
250
|
+
|
|
251
|
+
// Main

/**
 * Command-line entry point: prints the banner, checks that credentials are
 * configured and dispatches on the command in process.argv[2].
 *
 * Exits with status 1 when credentials or required arguments are missing.
 * Prints the usage text when the command is missing or unknown.
 */
async function main() {
  const [, , command, arg] = process.argv;

  console.log("╔══════════════════════════════════════════════════════════════╗");
  console.log("║ watsonx Embedding Index & RAG ║");
  console.log("║ Powered by IBM Granite 3.3 + Slate ║");
  console.log("╚══════════════════════════════════════════════════════════════╝");
  console.log("");

  if (!WATSONX_API_KEY || !WATSONX_SPACE_ID) {
    console.error("Error: WATSONX_API_KEY and WATSONX_SPACE_ID must be set");
    process.exit(1);
  }

  switch (command) {
    case "build": {
      // Parse as base 10 and fall back to the default for anything that is
      // not a positive integer; a negative count would make slice() drop
      // files from the end of the listing instead of limiting it.
      const requested = Number.parseInt(arg, 10);
      const maxDocs = Number.isInteger(requested) && requested > 0 ? requested : 100;
      await buildIndex(maxDocs);
      break;
    }

    case "search": {
      if (!arg) {
        console.log("Usage: embedding-index.js search '<query>'");
        process.exit(1);
      }
      const results = await queryIndex(arg, 10);
      console.log("\n Top results:");
      results.forEach((r, i) => {
        console.log(` ${i + 1}. ${r.filename} (${r.similarity.toFixed(4)})`);
        console.log(` ${r.preview.substring(0, 80)}...`);
      });
      break;
    }

    case "rag": {
      if (!arg) {
        console.log("Usage: embedding-index.js rag '<question>'");
        process.exit(1);
      }
      await ragQuery(arg);
      break;
    }

    case "stats": {
      await showStats();
      break;
    }

    default: {
      const usage = [
        "Usage: embedding-index.js <command> [args]",
        "",
        "Commands:",
        " build [count] - Build embedding index (default: 100 docs)",
        " search <query> - Search the index",
        " rag <question> - RAG: Retrieve docs and generate answer",
        " stats - Show index statistics",
        "",
        "Examples:",
        " embedding-index.js build 200",
        " embedding-index.js search 'IBM Cloud'",
        " embedding-index.js rag 'How do I set up AWS for Satellite?'",
      ];
      for (const line of usage) {
        console.log(line);
      }
    }
  }
}

main().catch((err) => {
  console.error(err);
  // An unexpected failure must not look like success to the calling shell.
  process.exitCode = 1;
});
|