@pheem49/mint 1.2.4 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,15 +1,14 @@
1
1
  const fs = require('fs');
2
2
  const path = require('path');
3
3
  const os = require('os');
4
+ const crypto = require('crypto');
4
5
  const { GoogleGenAI } = require('@google/genai');
5
6
  const pdf = require('pdf-parse');
6
7
  const mammoth = require('mammoth');
7
8
  const xlsx = require('xlsx');
8
- const axios = require('axios');
9
- const cheerio = require('cheerio');
10
9
  const { readConfig } = require('../System/config_manager');
11
10
 
12
- // Handle electron dependency safely for benchmarks/tests
11
+ // Handle electron dependency safely
13
12
  let app;
14
13
  try {
15
14
  const electron = require('electron');
@@ -20,7 +19,7 @@ try {
20
19
 
21
20
  let ai = null;
22
21
  let activeApiKey = '';
23
- const initialEnvKey = (process.env.GEMINI_API_KEY || '').trim();
22
+ let DatabaseSync = null;
24
23
 
25
24
  function resolveApiKey() {
26
25
  let settingsKey = '';
@@ -30,53 +29,63 @@ function resolveApiKey() {
30
29
  } catch (e) {
31
30
  settingsKey = '';
32
31
  }
33
-
34
- const envKey = initialEnvKey;
35
- const selectedKey = settingsKey || envKey || '';
36
-
37
- if (selectedKey !== (process.env.GEMINI_API_KEY || '')) {
38
- process.env.GEMINI_API_KEY = selectedKey;
39
- }
40
-
32
+ const selectedKey = settingsKey || process.env.GEMINI_API_KEY || '';
41
33
  activeApiKey = selectedKey;
42
34
  return selectedKey;
43
35
  }
44
36
 
/**
 * Returns the shared Gemini client, (re)creating it when no client exists yet
 * or when the resolved API key differs from the one used previously.
 *
 * @returns {GoogleGenAI} The cached client instance.
 */
function getAiClient() {
  // resolveApiKey() overwrites `activeApiKey` as a side effect, so the previous
  // value has to be captured BEFORE calling it; comparing afterwards would
  // always report "unchanged" and a rotated key would never take effect.
  const previousKey = activeApiKey;
  const currentKey = resolveApiKey();
  if (!ai || currentKey !== previousKey) {
    ai = new GoogleGenAI({ apiKey: currentKey });
  }
  return ai;
}
53
44
 
/**
 * Resolves the on-disk location of the knowledge-base SQLite file.
 *
 * When an Electron `app` with `getPath` is available the file lives in its
 * `userData` directory; otherwise it lives in `~/.mint`, which is created if
 * it does not exist yet.
 *
 * @returns {string} Absolute path of the database file.
 */
function getDbPath() {
  const dbFileName = 'mint-knowledge.sqlite';
  const hasElectronPaths = Boolean(app && app.getPath);

  if (!hasElectronPaths) {
    const fallbackDir = path.join(os.homedir(), '.mint');
    if (!fs.existsSync(fallbackDir)) {
      fs.mkdirSync(fallbackDir, { recursive: true });
    }
    return path.join(fallbackDir, dbFileName);
  }

  return path.join(app.getPath('userData'), dbFileName);
}
65
54
 
/**
 * Returns the `DatabaseSync` constructor from `node:sqlite`, loading the
 * module on first use and caching it in the module-level `DatabaseSync`
 * variable for later calls.
 *
 * @returns {Function} The `DatabaseSync` class.
 */
function getDatabaseSync() {
  if (DatabaseSync) {
    return DatabaseSync;
  }
  const sqlite = require('node:sqlite');
  DatabaseSync = sqlite.DatabaseSync;
  return DatabaseSync;
}
77
61
 
// Lazily opened database handle shared by every caller in this module.
let dbInstance = null;

/**
 * Returns the shared SQLite handle, opening the database file and creating the
 * schema (tables `sources` and `chunks`, plus an index on `chunks.source_id`)
 * on the first call.
 *
 * The handle is cached only after the schema statements succeed. If schema
 * creation throws, the connection is closed and the error is rethrown, so the
 * next call retries instead of returning a half-initialised database.
 *
 * @returns {object} The opened `DatabaseSync` instance.
 * @throws Whatever `node:sqlite` throws while opening the file or running the
 *   schema statements.
 */
function getDb() {
  if (dbInstance) return dbInstance;

  const dbPath = getDbPath();
  const Database = getDatabaseSync();
  const db = new Database(dbPath);

  try {
    // `ON DELETE CASCADE` relies on foreign-key enforcement; per the node:sqlite
    // documentation the `enableForeignKeyConstraints` option defaults to true.
    db.exec(`
      CREATE TABLE IF NOT EXISTS sources (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        path TEXT UNIQUE,
        name TEXT,
        hash TEXT,
        last_indexed DATETIME DEFAULT CURRENT_TIMESTAMP
      );
      CREATE TABLE IF NOT EXISTS chunks (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        source_id INTEGER,
        text TEXT,
        embedding BLOB,
        FOREIGN KEY(source_id) REFERENCES sources(id) ON DELETE CASCADE
      );
      CREATE INDEX IF NOT EXISTS idx_chunks_source ON chunks(source_id);
    `);
  } catch (err) {
    // Do not leak the connection or cache a database without tables.
    db.close();
    throw err;
  }

  dbInstance = db;
  return dbInstance;
}
81
90
 
82
91
  async function generateEmbedding(text) {
@@ -85,138 +94,203 @@ async function generateEmbedding(text) {
85
94
  model: 'gemini-embedding-001',
86
95
  contents: text,
87
96
  });
88
- // The google/genai package returns an array of embeddings
89
97
  return response.embeddings[0].values;
90
98
  }
91
99
 
100
+
/**
 * Computes the cosine similarity of two numeric vectors.
 *
 * Iterates over the indices of `vecA`; callers are expected to pass vectors of
 * equal length.
 *
 * @param {ArrayLike<number>} vecA - First vector.
 * @param {ArrayLike<number>} vecB - Second vector.
 * @returns {number} Similarity in [-1, 1], or 0 when either vector has zero
 *   magnitude (the ratio would otherwise be 0/0 = NaN).
 */
function cosineSimilarity(vecA, vecB) {
  let dotProduct = 0;
  let normA = 0;
  let normB = 0;
  for (let i = 0; i < vecA.length; i++) {
    dotProduct += vecA[i] * vecB[i];
    normA += vecA[i] * vecA[i];
    normB += vecB[i] * vecB[i];
  }
  // A zero-magnitude vector has no direction; report "no similarity" rather
  // than letting NaN leak into score comparisons and sorts.
  if (normA === 0 || normB === 0) return 0;
  return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB));
}
104
110
 
/**
 * Computes the MD5 digest of a file's bytes, used as a change fingerprint.
 *
 * @param {string} filePath - Path of the file to read (read synchronously, in full).
 * @returns {string} Lower-case hexadecimal MD5 digest.
 */
function getFileHash(filePath) {
  const hasher = crypto.createHash('md5');
  const bytes = fs.readFileSync(filePath);
  hasher.update(bytes);
  return hasher.digest('hex');
}
115
+
/**
 * Splits text into fixed-size chunks where each chunk starts
 * `maxChars - overlap` characters after the previous one, so consecutive
 * chunks share `overlap` characters.
 *
 * @param {string} text - Text to split.
 * @param {number} [maxChars=1000] - Maximum length of each chunk.
 * @param {number} [overlap=200] - Characters shared between consecutive chunks.
 * @returns {string[]} Chunks in document order; empty for empty text.
 * @throws {RangeError} If `maxChars` is not positive or `overlap` is not
 *   smaller than `maxChars`. The window would then never advance, and the
 *   loop would run forever while growing the result array.
 */
function chunkText(text, maxChars = 1000, overlap = 200) {
  const step = maxChars - overlap;
  // Written as negated comparisons so NaN is rejected as well.
  if (!(maxChars > 0) || !(step > 0)) {
    throw new RangeError(
      `chunkText requires maxChars > 0 and overlap < maxChars (got maxChars=${maxChars}, overlap=${overlap})`
    );
  }

  const chunks = [];
  for (let start = 0; start < text.length; start += step) {
    chunks.push(text.slice(start, start + maxChars));
  }
  return chunks;
}
115
126
 
/**
 * Indexes one file into the knowledge base: extracts its text, splits it into
 * chunks, generates an embedding per chunk and stores everything in SQLite.
 *
 * Directories are delegated to `indexFolder`. Files larger than 10 MB, files
 * without text, and files whose content hash matches the stored one are not
 * (re)indexed. All failures are caught and reported through the returned
 * message; this function does not reject.
 *
 * @param {string} filePath - Path of the file (or directory) to index.
 * @returns {Promise<string>} Human-readable status message. Successful
 *   indexing starts with "✅".
 */
async function indexFile(filePath) {
  try {
    if (!fs.existsSync(filePath)) return `ไม่พบไฟล์: ${filePath}`;
    const stats = fs.statSync(filePath);
    if (stats.isDirectory()) return await indexFolder(filePath);
    if (stats.size > 10 * 1024 * 1024) return `ไฟล์ใหญ่เกินไป (> 10MB): ${filePath}`;

    const hash = getFileHash(filePath);
    const db = getDb();
    const selectSource = db.prepare("SELECT id, hash FROM sources WHERE path = ?");

    // Skip files that were already indexed with identical content.
    const known = selectSource.get(filePath);
    if (known && known.hash === hash) {
      return `⏩ ${path.basename(filePath)} ไม่มีการเปลี่ยนแปลง (ข้ามการอ่าน)`;
    }

    console.log(`[RAG] Indexing ${filePath}...`);
    let content = '';
    const ext = path.extname(filePath).toLowerCase();

    // Extraction logic
    if (ext === '.pdf') {
      const data = await pdf(fs.readFileSync(filePath));
      content = data.text;
    } else if (ext === '.docx') {
      const res = await mammoth.extractRawText({ path: filePath });
      content = res.value;
    } else if (ext === '.xlsx') {
      const wb = xlsx.readFile(filePath);
      content = wb.SheetNames.map(n => xlsx.utils.sheet_to_csv(wb.Sheets[n])).join('\n');
    } else {
      content = fs.readFileSync(filePath, 'utf8');
    }

    if (!content.trim()) return `⚠️ ไฟล์ไม่มีข้อความ: ${filePath}`;

    // Generate every embedding BEFORE opening the transaction. The database
    // connection is shared, so awaiting inside BEGIN…COMMIT would let another
    // indexFile call (indexFolder runs several concurrently) issue its own
    // BEGIN/COMMIT/ROLLBACK on the same connection in the middle of this one.
    const chunks = chunkText(content);
    const preparedChunks = [];
    for (const chunk of chunks) {
      const embedding = await generateEmbedding(chunk);
      preparedChunks.push({
        text: chunk,
        embeddingBlob: Buffer.from(new Float32Array(embedding).buffer),
      });
    }

    // Everything below is synchronous, so the transaction cannot interleave
    // with other callers.
    db.exec("BEGIN TRANSACTION");
    try {
      // Re-read the row: it may have been created or changed while the
      // embeddings were being generated.
      const current = selectSource.get(filePath);
      let sourceId;
      if (current) {
        sourceId = current.id;
        db.prepare("DELETE FROM chunks WHERE source_id = ?").run(sourceId);
        db.prepare("UPDATE sources SET hash = ?, last_indexed = CURRENT_TIMESTAMP WHERE id = ?").run(hash, sourceId);
      } else {
        const inserted = db
          .prepare("INSERT INTO sources (path, name, hash) VALUES (?, ?, ?)")
          .run(filePath, path.basename(filePath), hash);
        sourceId = inserted.lastInsertRowid;
      }

      const insertChunk = db.prepare("INSERT INTO chunks (source_id, text, embedding) VALUES (?, ?, ?)");
      for (const prepared of preparedChunks) {
        insertChunk.run(sourceId, prepared.text, prepared.embeddingBlob);
      }
      db.exec("COMMIT");
    } catch (e) {
      try {
        db.exec("ROLLBACK");
      } catch (rollbackErr) {
        // Keep the original failure as the reported error.
        console.error('[RAG] Rollback failed:', rollbackErr);
      }
      throw e;
    }

    return `✅ Successfully indexed ${path.basename(filePath)} (${chunks.length} chunks)`;
  } catch (err) {
    console.error('[RAG] Error:', err);
    return `❌ Failed to index: ${err.message}`;
  }
}
195
195
 
/**
 * Walks a directory tree and collects the path of every entry that is not a
 * directory.
 *
 * @param {string} dirPath - Directory to walk.
 * @param {string[]} [arrayOfFiles=[]] - Accumulator that found paths are
 *   appended to; the same array is returned.
 * @returns {Promise<string[]>} The accumulator, with paths in traversal order.
 */
async function getAllFiles(dirPath, arrayOfFiles = []) {
  const entries = await fs.promises.readdir(dirPath, { withFileTypes: true });

  for (const entry of entries) {
    const entryPath = path.join(dirPath, entry.name);
    if (!entry.isDirectory()) {
      arrayOfFiles.push(entryPath);
      continue;
    }
    await getAllFiles(entryPath, arrayOfFiles);
  }

  return arrayOfFiles;
}
212
+
/**
 * Indexes every file found under a folder (recursively) by calling
 * `indexFile` on each one.
 *
 * Files are handed to `indexFile` in groups of `BATCH_SIZE`; each group is
 * awaited as a whole before the next group starts. A result counts as
 * "learned" when its status message starts with "✅"; every other result
 * counts as skipped.
 *
 * @param {string} folderPath - Folder to index.
 * @returns {Promise<string>} Summary message with learned and skipped counts.
 */
async function indexFolder(folderPath) {
  console.log(`[RAG] Indexing folder: ${folderPath}`);
  const files = await getAllFiles(folderPath);
  console.log(`[RAG] Found ${files.length} files to check.`);

  const BATCH_SIZE = 5;
  let indexedCount = 0;
  let skippedCount = 0;

  let offset = 0;
  while (offset < files.length) {
    const group = files.slice(offset, offset + BATCH_SIZE);
    const outcomes = await Promise.all(group.map((file) => indexFile(file)));
    for (const outcome of outcomes) {
      if (outcome && outcome.startsWith('✅')) {
        indexedCount += 1;
      } else {
        skippedCount += 1;
      }
    }
    offset += BATCH_SIZE;
  }

  console.log(`[RAG] Indexing complete. ${indexedCount} new/updated, ${skippedCount} skipped.`);
  return `📂 Folder indexing complete: ${indexedCount} learned, ${skippedCount} skipped.`;
}
235
+
/**
 * Finds the stored chunks that are most similar to a query.
 *
 * The query is embedded, compared by cosine similarity against at most
 * `MAX_CHUNKS_TO_SEARCH` stored chunk embeddings, and the best `topK` chunks
 * scoring above 0.65 are returned together with the name of their source.
 *
 * Note: the scan query has a LIMIT but no ORDER BY, so when the table holds
 * more rows than the limit, which rows get scanned is decided by SQLite.
 *
 * @param {string} query - Text to search for.
 * @param {number} [topK=3] - Maximum number of matches to return.
 * @returns {Promise<Array<{text: string, source: string, score: number}>|null>}
 *   Matches sorted by descending score, or null when the knowledge base is
 *   empty, nothing scores above the threshold, or the search fails (the
 *   error is logged).
 */
async function searchKnowledge(query, topK = 3) {
  const startTime = Date.now();
  const db = getDb();
  const MAX_CHUNKS_TO_SEARCH = 2000; // Limit search to keep it fast
  const MIN_SCORE = 0.65;
  const FLOAT_BYTES = Float32Array.BYTES_PER_ELEMENT;

  const countRes = db.prepare("SELECT COUNT(*) as count FROM chunks").get();
  if (!countRes || countRes.count === 0) return null;

  try {
    const queryVector = await generateEmbedding(query);
    const queryTyped = new Float32Array(queryVector);

    // The query norm is the same for every row; compute it once.
    let queryNormSq = 0;
    for (let i = 0; i < queryTyped.length; i++) {
      queryNormSq += queryTyped[i] * queryTyped[i];
    }
    const queryNorm = Math.sqrt(queryNormSq);

    const results = [];
    const stmt = db.prepare("SELECT text, embedding, source_id FROM chunks LIMIT ?");
    let processed = 0;

    for (const row of stmt.iterate(MAX_CHUNKS_TO_SEARCH)) {
      const raw = row.embedding;
      if (!raw) continue;
      processed++;

      // A Float32Array view needs a byte length and byte offset that are
      // multiples of 4, otherwise the constructor throws and would abort the
      // whole search. Skip truncated blobs and copy misaligned ones.
      if (raw.byteLength % FLOAT_BYTES !== 0) continue;
      const bytes = raw.byteOffset % FLOAT_BYTES === 0 ? raw : new Uint8Array(raw);
      const chunkVector = new Float32Array(bytes.buffer, bytes.byteOffset, bytes.byteLength / FLOAT_BYTES);

      // Vectors of a different dimension cannot be compared meaningfully.
      if (chunkVector.length !== queryTyped.length) continue;

      let dotProduct = 0;
      let chunkNormSq = 0;
      for (let i = 0; i < queryTyped.length; i++) {
        const b = chunkVector[i];
        dotProduct += queryTyped[i] * b;
        chunkNormSq += b * b;
      }
      // A zero norm makes this NaN, which fails the threshold test below.
      const score = dotProduct / (queryNorm * Math.sqrt(chunkNormSq));

      if (score > MIN_SCORE) {
        results.push({ text: row.text, score, source_id: row.source_id });
      }
    }

    if (results.length > 0) {
      results.sort((a, b) => b.score - a.score);
      const top = results.slice(0, topK);

      // Resolve source names with bound parameters instead of building the
      // IN (...) list by string concatenation.
      const sourceIds = [...new Set(top.map(t => t.source_id))];
      const placeholders = sourceIds.map(() => '?').join(',');
      const sources = db.prepare(`SELECT id, name FROM sources WHERE id IN (${placeholders})`).all(...sourceIds);
      const sourceMap = Object.fromEntries(sources.map(s => [s.id, s.name]));

      console.log(`[RAG] Search took ${Date.now() - startTime}ms for ${processed} chunks.`);
      return top.map(t => ({
        text: t.text,
        source: sourceMap[t.source_id],
        score: t.score
      }));
    }
  } catch (e) {
    console.error("[RAG] Search Error:", e);
  }
  return null;
}
221
294
 
222
- module.exports = { indexFile, searchKnowledge };
295
+
296
+ module.exports = { indexFile, indexFolder, searchKnowledge };