@pheem49/mint 1.5.2 → 1.5.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. package/GUIDE_TH.md +23 -11
  2. package/README.md +148 -66
  3. package/assets/Agent_Mint.png +0 -0
  4. package/assets/Settings.png +0 -0
  5. package/install.ps1 +64 -0
  6. package/install.sh +54 -0
  7. package/main.js +12 -0
  8. package/package.json +5 -3
  9. package/preload.js +4 -0
  10. package/scripts/install_linux_desktop_entry.js +48 -0
  11. package/src/AI_Brain/Gemini_API.js +231 -498
  12. package/src/AI_Brain/autonomous_brain.js +46 -19
  13. package/src/AI_Brain/headless_agent.js +21 -2
  14. package/src/AI_Brain/provider_adapter.js +358 -0
  15. package/src/Automation_Layer/file_operations.js +17 -5
  16. package/src/CLI/approval_handler.js +5 -0
  17. package/src/CLI/chat_router.js +7 -0
  18. package/src/CLI/chat_ui.js +397 -76
  19. package/src/CLI/cli_colors.js +86 -3
  20. package/src/CLI/cli_formatters.js +6 -1
  21. package/src/CLI/code_agent.js +706 -273
  22. package/src/CLI/interactive_chat.js +311 -149
  23. package/src/CLI/slash_command_handler.js +2 -2
  24. package/src/CLI/updater.js +21 -1
  25. package/src/System/config_manager.js +5 -1
  26. package/src/System/ipc_handlers.js +95 -1
  27. package/src/System/picture_store.js +109 -0
  28. package/src/System/smart_context.js +227 -0
  29. package/src/System/task_manager.js +127 -0
  30. package/src/System/tool_registry.js +13 -0
  31. package/src/System/window_manager.js +16 -8
  32. package/src/UI/live2d_manager.js +42 -8
  33. package/src/UI/preload-spotlight.js +1 -0
  34. package/src/UI/renderer.js +837 -63
  35. package/src/UI/settings.css +160 -96
  36. package/src/UI/settings.html +9 -0
  37. package/src/UI/settings.js +35 -2
  38. package/src/UI/spotlight.js +13 -9
  39. package/src/UI/styles.css +1592 -165
  40. package/privacy.txt +0 -1
@@ -1,17 +1,17 @@
1
1
  const { GoogleGenAI } = require('@google/genai');
2
2
  const { readChatHistory, writeChatHistory, clearChatHistory } = require('../System/chat_history_manager');
3
- const { readConfig, getAvailableProviders, isPlaceholder } = require('../System/config_manager');
3
+ const { readConfig, getAvailableProviders } = require('../System/config_manager');
4
4
  const pluginManager = require('../Plugins/plugin_manager');
5
5
  const mcpManager = require('../Plugins/mcp_manager');
6
6
  const memoryStore = require('./memory_store');
7
7
  const agentOrchestrator = require('./agent_orchestrator');
8
8
  const workspaceManager = require('../CLI/workspace_manager');
9
9
  const toolRegistry = require('../System/tool_registry');
10
+ const providerAdapter = require('./provider_adapter');
10
11
 
11
12
  let ai = null;
12
13
  let activeApiKey = '';
13
14
  const initialEnvKey = (process.env.GEMINI_API_KEY || '').trim();
14
- const axios = require('axios');
15
15
  const DEFAULT_GEMINI_MODEL = 'gemini-2.5-flash';
16
16
 
17
17
  function decodeUnicode(str) {
@@ -42,15 +42,39 @@ function imageDataUriToInlineData(base64Image) {
42
42
  };
43
43
  }
44
44
 
45
- function imageDataUriToBase64(base64Image) {
46
- return imageDataUriToInlineData(base64Image).data;
47
- }
48
-
49
45
  function normalizeImageList(base64Image) {
50
46
  if (!base64Image) return [];
51
47
  return Array.isArray(base64Image) ? base64Image.filter(Boolean) : [base64Image];
52
48
  }
53
49
 
50
+ const CHAT_MODE_ACTION_POLICY = `GOAL:
51
+ Your goal is to help the user with their queries. This Electron app is Chat Mode: use at most ONE simple action per user message, only when the latest message explicitly asks for that local action. If the user asks a question or asks you to provide text/commands, answer with action "none".
52
+
53
+ ACTION DISCIPLINE:
54
+ - Always return a single JSON object. Never return a JSON array or multiple actions.
55
+ - If the user asks "พิมพ์คำสั่งให้หน่อย", "บอกคำสั่ง", "ขอคำสั่ง", "what command", or "type the command", provide the command in "response" and set action "none". Do NOT use "type_text" or "key_tap".
56
+ - Use "type_text", "key_tap", "mouse_click", or "mouse_move" only when the user explicitly asks you to control the currently focused UI, not when they ask for a command to copy/type themselves.
57
+ - If the user asks to run terminal commands or code, Chat Mode should provide the command or tell them to use the Mint CLI agent. Do not type or press Enter on their behalf.
58
+ - Never say you opened, checked, inspected, or verified a file/folder unless the selected action actually does it and the app will execute that action.
59
+ - If the request needs workspace code inspection, edits, tests, or shell execution, tell the user to use the Mint CLI agent instead of pretending to inspect files.`;
60
+
61
+ const AGENT_MODE_ACTION_POLICY = `GOAL:
62
+ Your goal is to act as Mint's Desktop Agent Mode. You may use ONE concrete desktop action per response when it directly advances the user's latest request or a clear desktop task implied by Smart Context. Prefer useful action over explaining when the user asked Mint to do something.
63
+
64
+ ACTION DISCIPLINE:
65
+ - Always return a single JSON object. Never return a JSON array or multiple actions.
66
+ - Choose exactly one action when a desktop action is useful and the user's intent is clear; otherwise use action "none" and ask a concise follow-up.
67
+ - You may use safe desktop actions such as open_url, search, open_app, find_path, open_file, open_folder, create_folder, clipboard_write, learn_file, learn_folder, plugin, mcp_tool, web_automation, system_info, mouse_move, mouse_click, type_text, and key_tap when they match the request.
68
+ - Approval and dangerous actions are handled by Mint's UI. You may propose system_automation or delete_file only when the user clearly requested it; the app will ask for permission before running.
69
+ - For UI-control actions (mouse_click, mouse_move, type_text, key_tap), rely on Smart Context or the attached screenshot. If the target is ambiguous, ask before acting.
70
+ - If the user asks "พิมพ์คำสั่งให้หน่อย", "บอกคำสั่ง", "ขอคำสั่ง", "what command", or "type the command", provide the command in "response" and set action "none" unless they explicitly ask Mint to type it into the active UI.
71
+ - If the request needs workspace code inspection, edits, tests, or shell execution, tell the user to use the Mint CLI agent instead of pretending to inspect files or run commands from Chat UI.
72
+ - Never say you opened, checked, inspected, or verified something unless the selected action actually does it and the app will execute that action.`;
73
+
74
/**
 * Select the action-policy prompt section for the configured assistant mode.
 *
 * @param {object} [config] - Settings object; defaults to the value returned by readConfig().
 * @returns {string} AGENT_MODE_ACTION_POLICY when config.assistantMode is 'agent',
 *   otherwise CHAT_MODE_ACTION_POLICY.
 */
function buildActionModeInstruction(config = readConfig()) {
  // A default parameter only replaces `undefined`; guard explicitly so a null
  // config falls back to the chat policy instead of throwing a TypeError.
  const mode = config ? config.assistantMode : undefined;
  return mode === 'agent' ? AGENT_MODE_ACTION_POLICY : CHAT_MODE_ACTION_POLICY;
}
77
+
54
78
  const systemInstruction = `You are "Mint" (มิ้นท์), a cute, cheerful, and highly helpful female Local AI Desktop Agent.
55
79
 
56
80
  PERSONALITY & TONE:
@@ -72,9 +96,10 @@ NATURAL CHAT FLOW:
72
96
  - You have the autonomy to suggest better ways to achieve a goal, provide alternative perspectives, and take initiative in helping the user.
73
97
  - Separate distinct points with blank lines (double newline) for readability.
74
98
  - Ask follow-up questions only when they add significant value to the task or conversation.
99
+ - The latest user message is authoritative. Do not continue or describe older tasks unless the latest message explicitly asks you to continue them.
100
+ - For greetings, name-calls, acknowledgements, and backchannels such as "มิ้น", "มิ้นๆ", "อ๋อ", "โอเค", "ขอบคุณ", "hi", "hello", "ok", or "thanks", return action "none" and a short reply only.
75
101
 
76
- GOAL:
77
- Your goal is to help the user with their queries. If they ask to open an application, open a website, search, manage files, or get system info, you must trigger an action in the structured JSON format below. **NEVER provide a conversational response about performing an action without including the actual "action" object in your JSON.**
102
+ {{ACTION_MODE_INSTRUCTION}}
78
103
 
79
104
  CREATOR INFO:
80
105
  - The creator is Pheem49.
@@ -138,6 +163,7 @@ ${toolRegistry.buildToolPromptSection()}
138
163
  // Replaces 5 previously duplicated mcpPrompt blocks.
139
164
  // ─────────────────────────────────────────────────────────────────────────────
140
165
  function buildSystemPrompt() {
166
+ const config = readConfig();
141
167
  pluginManager.loadPlugins();
142
168
  const mcpTools = mcpManager.getAllTools();
143
169
 
@@ -165,7 +191,9 @@ function buildSystemPrompt() {
165
191
  workspaceSection = `\n\n[WORKSPACE DETECTED: ${ws.name}]\nPath: ${ws.path}\nProject Instructions: ${ws.instructions}\n`;
166
192
  }
167
193
 
168
- return systemInstruction + personaInstruction + workspaceSection + pluginManager.getPromptDescriptions() + mcpSection + userContext;
194
+ const modeInstruction = buildActionModeInstruction(config);
195
+ const baseInstruction = systemInstruction.replace('{{ACTION_MODE_INSTRUCTION}}', modeInstruction);
196
+ return baseInstruction + personaInstruction + workspaceSection + pluginManager.getPromptDescriptions() + mcpSection + userContext;
169
197
  }
170
198
 
171
199
  function buildMessageWithRelevantMemory(finalMessage) {
@@ -191,10 +219,15 @@ function stripRelevantMemoryBlock(text) {
191
219
  return input
192
220
  .replace(/\n?\[Relevant long-term memory for this user message\][\s\S]*?\[End relevant memory\]\n?/g, '\n')
193
221
  .replace(/^\s*\[Relevant long-term memory for this user message\][\s\S]*?\[End relevant memory\]\s*/g, '')
222
+ .replace(/\n?\[SMART_CONTEXT\][\s\S]*?\[\/SMART_CONTEXT\]\n?/g, '\n')
194
223
  .replace(/\n?\[LOCAL KNOWLEDGE BASE - USE THIS CONTEXT TO ANSWER\][\s\S]*/g, '')
195
224
  .trim();
196
225
  }
197
226
 
227
/**
 * Report whether the given text contains a complete
 * [SMART_CONTEXT] ... [/SMART_CONTEXT] block.
 *
 * @param {*} text - Value to inspect; falsy values are treated as an empty string.
 * @returns {boolean} true when an opening tag is followed by a closing tag.
 */
function hasSmartContextBlock(text) {
  const smartContextPattern = /\[SMART_CONTEXT\][\s\S]*?\[\/SMART_CONTEXT\]/;
  const candidate = String(text || '');
  return smartContextPattern.test(candidate);
}
230
+
198
231
  function cleanHistoryForStorage(history) {
199
232
  if (!Array.isArray(history)) return [];
200
233
  return history.map(msg => ({
@@ -202,7 +235,13 @@ function cleanHistoryForStorage(history) {
202
235
  parts: Array.isArray(msg.parts)
203
236
  ? msg.parts.map(part => {
204
237
  if (part.text) {
205
- return { ...part, text: stripRelevantMemoryBlock(part.text) };
238
+ return {
239
+ text: stripRelevantMemoryBlock(part.text)
240
+ .replace(/data:image\/[\w.+-]+;base64,[A-Za-z0-9+/=]+/g, '[Image omitted from chat history]')
241
+ };
242
+ }
243
+ if (part.inlineData || part.fileData || part.image_url || part.imageUrl) {
244
+ return { text: '[Image omitted from chat history; saved locally when sent by the user.]' };
206
245
  }
207
246
  return part;
208
247
  })
@@ -210,6 +249,20 @@ function cleanHistoryForStorage(history) {
210
249
  }));
211
250
  }
212
251
 
252
/**
 * Carry `timestamp` and `providerInfo` over from a previously stored history
 * onto a freshly produced history, matching entries by position.
 *
 * @param {Array<object>} nextHistory - New history list; a non-array yields [].
 * @param {Array<object>} previousHistory - Earlier history to copy metadata from.
 * @param {string} now - Timestamp applied to the last two entries when they have none.
 * @returns {Array<object>} New array of shallow-copied messages, each with
 *   `timestamp` and `providerInfo` set (null when nothing is known).
 */
function preserveHistoryMetadata(nextHistory, previousHistory, now) {
  if (!Array.isArray(nextHistory)) return [];
  const previous = Array.isArray(previousHistory) ? previousHistory : [];
  // The final two entries are treated as the turn that was just exchanged.
  const firstNewIndex = nextHistory.length - 2;

  return nextHistory.map((msg, index) => {
    // Tolerate null/primitive entries instead of throwing on property access.
    const current = (msg && typeof msg === 'object') ? msg : {};
    const candidate = previous[index];
    // Matching is purely positional, so only inherit metadata when the roles do
    // not contradict each other. NOTE(review): presumably the two lists can be
    // offset when the live session holds a trimmed window - confirm against createChat.
    const rolesAgree = Boolean(candidate) && typeof candidate === 'object'
      && (!candidate.role || !current.role || candidate.role === current.role);
    const prior = rolesAgree ? candidate : {};

    return {
      ...current,
      timestamp: current.timestamp || prior.timestamp || (index >= firstNewIndex ? now : null),
      providerInfo: current.providerInfo || prior.providerInfo || null
    };
  });
}
265
+
213
266
  function validateParsedAction(parsedResult) {
214
267
  if (!parsedResult || !parsedResult.action) {
215
268
  return parsedResult;
@@ -223,6 +276,42 @@ function validateParsedAction(parsedResult) {
223
276
  return parsedResult;
224
277
  }
225
278
 
279
/**
 * Coerce a parsed model reply into the single `{ response, action }` shape.
 *
 * - An array reply is collapsed to one text response with action "none"; if any
 *   item proposed a `type_text` action, its target is surfaced as the command text.
 * - A non-object reply becomes a plain text response with action "none".
 * - A missing or non-object `action` is replaced by action "none".
 * - When the user's text asks for a command to be told/typed out, a proposed
 *   `type_text` / `key_tap` action is downgraded to a text answer.
 *
 * The input object is never modified; a shallow copy is returned instead.
 *
 * @param {*} parsedResult - Value obtained from parsing the model output.
 * @param {string} [originalText=''] - The user's message, used for intent matching.
 * @returns {object} Normalized result with `response` and `action` fields.
 */
function normalizeParsedResult(parsedResult, originalText = '') {
  const noAction = () => ({ type: 'none', target: '' });

  if (Array.isArray(parsedResult)) {
    const first = parsedResult.find(item => item && typeof item === 'object') || {};
    const commandAction = parsedResult.find(item =>
      item && item.action && item.action.type === 'type_text' && item.action.target
    );
    return {
      response: commandAction
        ? `คำสั่งคือ:\n${commandAction.action.target}`
        : (first.response || 'มิ้นท์ตอบได้ทีละ action ต่อข้อความนะคะ ลองสั่งใหม่อีกครั้งได้เลยค่ะ'),
      action: noAction()
    };
  }

  if (!parsedResult || typeof parsedResult !== 'object') {
    return { response: String(parsedResult || ''), action: noAction() };
  }

  // Work on a shallow copy so the caller's object is left untouched.
  const normalized = { ...parsedResult };
  if (!normalized.action || typeof normalized.action !== 'object') {
    normalized.action = noAction();
  }

  const input = String(originalText || '').toLowerCase();
  const asksForCommandText = /พิมพ์คำสั่ง|บอกคำสั่ง|ขอคำสั่ง|คำสั่ง.*ให้หน่อย|type.*command|what command|give.*command/.test(input);
  const actionType = normalized.action.type;
  if (asksForCommandText && (actionType === 'type_text' || actionType === 'key_tap')) {
    // Only a type_text target carries command text worth echoing back.
    const typed = actionType === 'type_text' ? String(normalized.action.target || '').trim() : '';
    normalized.response = typed
      ? `คำสั่งคือ:\n${typed}`
      : (normalized.response || 'ได้ค่ะ แต่คำขอนี้ควรตอบเป็นข้อความ ไม่ควรพิมพ์หรือกดปุ่มแทนค่ะ');
    normalized.action = noAction();
  }

  return normalized;
}
314
+
226
315
  function resolveApiKey() {
227
316
  let settingsKey = '';
228
317
  try {
@@ -259,63 +348,15 @@ function resolveGeminiModel() {
259
348
  }
260
349
 
261
350
  function getProviderAttemptOrder(config) {
262
- const provider = config.aiProvider || 'gemini';
263
351
  const availableProviders = getAvailableProviders(config);
264
- const ordered = availableProviders.includes(provider)
265
- ? [provider, ...availableProviders.filter(p => p !== provider)]
266
- : availableProviders;
267
- return ordered.length > 0 ? ordered : ['gemini'];
352
+ return providerAdapter.getProviderAttemptOrder(config, {
353
+ availableProviders,
354
+ priority: availableProviders
355
+ });
268
356
  }
269
357
 
270
358
  function getProviderModel(provider, config = {}) {
271
- switch (provider) {
272
- case 'gemini':
273
- return (config.geminiModel || DEFAULT_GEMINI_MODEL).trim() || DEFAULT_GEMINI_MODEL;
274
- case 'anthropic':
275
- return config.anthropicModel || 'claude-3-5-sonnet-latest';
276
- case 'openai':
277
- return config.openaiModel || 'gpt-4o';
278
- case 'local_openai':
279
- return config.localModelName || 'local-model';
280
- case 'huggingface':
281
- return config.hfModel || 'meta-llama/Meta-Llama-3-8B-Instruct';
282
- case 'ollama':
283
- return config.ollamaModel || 'llama3:latest';
284
- default:
285
- return '';
286
- }
287
- }
288
-
289
- function withProviderInfo(result, provider, config = {}) {
290
- const normalized = (result && typeof result === 'object')
291
- ? result
292
- : { response: String(result || ''), action: { type: 'none', target: '' } };
293
- const providerInfo = {
294
- provider,
295
- model: getProviderModel(provider, config)
296
- };
297
-
298
- attachProviderInfoToLatestHistory(providerInfo);
299
-
300
- return {
301
- ...normalized,
302
- providerInfo
303
- };
304
- }
305
-
306
- function attachProviderInfoToLatestHistory(providerInfo) {
307
- try {
308
- const history = readChatHistory();
309
- for (let i = history.length - 1; i >= 0; i -= 1) {
310
- if (history[i] && history[i].role === 'model') {
311
- history[i].providerInfo = providerInfo;
312
- writeChatHistory(cleanHistoryForStorage(history));
313
- return;
314
- }
315
- }
316
- } catch (error) {
317
- console.warn('[Provider Info] Failed to persist provider metadata:', error.message);
318
- }
359
+ return providerAdapter.getProviderModel(provider, config);
319
360
  }
320
361
 
321
362
  // Chat session — maintains conversation history within the session
@@ -370,16 +411,98 @@ function shouldUseKnowledgeSearch(message) {
370
411
  return knowledgeHints.some(hint => text.includes(hint));
371
412
  }
372
413
 
414
/**
 * Convert stored chat history (`{ role, parts: [{ text }] }` entries) into the
 * `{ role, content }` list handed to the provider client.
 *
 * Only the last MAX_HISTORY_MESSAGES entries are considered. The role 'model'
 * maps to 'assistant'; every other role maps to 'user'. Text parts are passed
 * through stripRelevantMemoryBlock and joined with newlines; entries that end
 * up with no text are dropped.
 *
 * @param {Array<object>} [history=[]] - Stored history; a non-array yields [].
 * @returns {Array<{role: string, content: string}>} Provider-ready messages.
 */
function chatHistoryToProviderHistory(history = []) {
  const recent = (Array.isArray(history) ? history : []).slice(-MAX_HISTORY_MESSAGES);
  const converted = [];

  for (const msg of recent) {
    // Skip malformed entries rather than throwing on property access.
    if (!msg || !Array.isArray(msg.parts)) continue;

    const content = msg.parts
      .map(part => (part && typeof part.text === 'string' ? stripRelevantMemoryBlock(part.text) : ''))
      .filter(Boolean)
      .join('\n');
    if (!content.trim()) continue;

    converted.push({ role: msg.role === 'model' ? 'assistant' : 'user', content });
  }

  return converted;
}
427
+
428
/**
 * Build the observation object sent to the provider client for one chat turn.
 *
 * Text selection:
 * - a non-empty message is passed through buildMessageWithRelevantMemory;
 * - audio with no message and no images gets a voice-command guiding prompt;
 * - every other combination uses the generic "Analyze this input." prompt.
 *
 * @param {string} finalMessage - User message; may be empty.
 * @param {Array<string>} [images=[]] - Image data URIs; a non-array is treated as empty.
 * @param {?string} [base64Audio=null] - Audio data URI, if any.
 * @returns {{text: string, imageDataUris: Array<string>, audioDataUri: ?string}}
 */
function buildChatObservation(finalMessage, images = [], base64Audio = null) {
  // Default parameters do not cover an explicit null, so normalize here.
  const imageDataUris = Array.isArray(images) ? images : [];

  let text;
  if (finalMessage) {
    text = buildMessageWithRelevantMemory(finalMessage);
  } else if (base64Audio && imageDataUris.length === 0) {
    text = 'Please listen to this voice command and respond in Thai with the appropriate JSON action if needed.';
  } else {
    text = 'Analyze this input.';
  }

  return {
    text,
    imageDataUris,
    audioDataUri: base64Audio || null
  };
}
446
+
447
/**
 * Turn raw provider output text into a normalized chat result.
 *
 * Parsing is attempted, in order, on: the whole cleaned text, the contents of a
 * ```json fenced block, and the outermost `{ ... }` span. If none of them is
 * valid JSON the cleaned text itself becomes the response with action "none",
 * so a prose reply that merely contains braces no longer throws.
 *
 * The parsed value is then passed through normalizeParsedResult, its string
 * `response` is unicode-decoded and stripped of memory blocks, the action is
 * run through validateParsedAction, and `timestamp` is attached.
 *
 * @param {*} outputText - Raw text returned by the provider.
 * @param {string} [originalText=''] - The user's message, forwarded to normalizeParsedResult.
 * @param {string} [now] - Timestamp to attach; defaults to the current ISO time.
 * @returns {object} Result object with `response`, `action` and `timestamp`.
 */
function parseChatProviderResponse(outputText, originalText = '', now = new Date().toISOString()) {
  const cleaned = stripRelevantMemoryBlock(String(outputText || ''));

  const candidates = [cleaned];
  const fencedMatch = cleaned.match(/```json\n([\s\S]*?)\n```/);
  if (fencedMatch) {
    candidates.push(fencedMatch[1]);
  }
  const bracedMatch = cleaned.match(/\{[\s\S]*\}/);
  if (bracedMatch) {
    candidates.push(bracedMatch[0]);
  }

  // Used as-is when no candidate parses.
  let parsedResult = {
    response: cleaned,
    action: { type: 'none', target: '' }
  };
  for (const candidate of candidates) {
    try {
      parsedResult = JSON.parse(candidate);
      break;
    } catch (error) {
      // Deliberate: this candidate is not JSON, so fall through to the next one
      // (or to the plain-text fallback above) instead of failing the whole turn.
    }
  }

  parsedResult = normalizeParsedResult(parsedResult, originalText);
  if (parsedResult && typeof parsedResult.response === 'string') {
    parsedResult.response = stripRelevantMemoryBlock(decodeUnicode(parsedResult.response));
  }
  validateParsedAction(parsedResult);
  parsedResult.timestamp = now;
  return parsedResult;
}
472
+
473
/**
 * Append the just-completed user/model exchange to the given history and
 * write the result out through writeChatHistory.
 *
 * The combined list is limited to its last MAX_STORED_HISTORY_MESSAGES entries
 * and passed through cleanHistoryForStorage before being written.
 *
 * @param {Array<object>} previousHistory - History before this turn; a non-array is treated as empty.
 * @param {string} finalMessage - User text to store; 'Analyze this input.' is stored when empty.
 * @param {*} outputText - Raw model output, stored as a string.
 * @param {object} providerInfo - Metadata attached to the model entry.
 * @param {string} now - Timestamp applied to both new entries.
 */
function appendChatProviderHistory(previousHistory, finalMessage, outputText, providerInfo, now) {
  const earlier = Array.isArray(previousHistory) ? previousHistory : [];

  const userTurn = {
    role: 'user',
    parts: [{ text: finalMessage || 'Analyze this input.' }],
    timestamp: now
  };
  const modelTurn = {
    role: 'model',
    parts: [{ text: String(outputText || '') }],
    timestamp: now,
    providerInfo
  };

  const combined = [...earlier, userTurn, modelTurn];
  const bounded = combined.slice(-MAX_STORED_HISTORY_MESSAGES);
  writeChatHistory(cleanHistoryForStorage(bounded));
}
491
+
373
492
  async function handleChat(message, base64Image = null, base64Audio = null) {
374
493
  try {
375
494
  const config = readConfig();
495
+ const images = normalizeImageList(base64Image);
496
+ const previousHistory = readChatHistory();
497
+ const userVisibleMessage = stripRelevantMemoryBlock(message);
498
+ const containsSmartContext = hasSmartContextBlock(message);
376
499
 
377
500
  let finalMessage = message;
378
501
 
379
502
  // Inject Local RAG Context
380
- if (message && message.trim().length > 0 && shouldUseKnowledgeSearch(message)) {
503
+ if (userVisibleMessage && userVisibleMessage.trim().length > 0 && shouldUseKnowledgeSearch(userVisibleMessage)) {
381
504
  const { searchKnowledge } = require('./knowledge_base');
382
- const retrievedDocs = await searchKnowledge(message);
505
+ const retrievedDocs = await searchKnowledge(userVisibleMessage);
383
506
  if (retrievedDocs && retrievedDocs.length > 0) {
384
507
  let contextString = `\n\n[LOCAL KNOWLEDGE BASE - USE THIS CONTEXT TO ANSWER]\n`;
385
508
  retrievedDocs.forEach(doc => {
@@ -389,179 +512,47 @@ async function handleChat(message, base64Image = null, base64Audio = null) {
389
512
  }
390
513
  }
391
514
 
392
- const providersToTry = getProviderAttemptOrder(config);
393
-
394
- for (let i = 0; i < providersToTry.length; i++) {
395
- const currentProv = providersToTry[i];
396
- try {
397
- if (currentProv === 'ollama') {
398
- return withProviderInfo(await handleOllamaChat(finalMessage, base64Image, base64Audio, config), currentProv, config);
399
- }
400
- if (currentProv === 'anthropic') {
401
- return withProviderInfo(await handleAnthropicChat(finalMessage, base64Image, config), currentProv, config);
402
- }
403
- if (currentProv === 'openai') {
404
- return withProviderInfo(await handleOpenAIChat(finalMessage, base64Image, config), currentProv, config);
405
- }
406
- if (currentProv === 'local_openai') {
407
- return withProviderInfo(await handleLocalOpenAIChat(finalMessage, base64Image, config), currentProv, config);
408
- }
409
- if (currentProv === 'huggingface') {
410
- return withProviderInfo(await handleHuggingFaceChat(finalMessage, base64Image, config), currentProv, config);
411
- }
412
-
413
- const currentKey = resolveApiKey();
414
- if (!currentKey) {
415
- if (i === providersToTry.length - 1) {
416
- return withProviderInfo({
417
- response: "I couldn't find your Gemini API Key. Please run 'mint onboard' to set it up!",
418
- action: { type: "none", target: "" }
419
- }, currentProv, config);
420
- }
421
- console.warn("[Fallback System] Gemini API key missing. Skipping Gemini provider.");
422
- continue;
423
- }
424
-
425
- if (!ai || activeApiKey !== currentKey) {
426
- initAiClient();
427
- createChat(readChatHistory());
428
- }
429
-
430
- return withProviderInfo(await handleGeminiChat(finalMessage, base64Image, base64Audio), currentProv, config);
431
- } catch (error) {
432
- console.error(`[Fallback System] Provider '${currentProv}' failed:`, error.message);
433
- if (i === providersToTry.length - 1) {
434
- console.error("[Fallback System] All available providers failed.");
435
- throw error; // No more providers to fallback to
436
- }
437
- console.log(`[Fallback System] Switching to next available provider: '${providersToTry[i+1]}'`);
438
- // Continue the loop to try the next provider
439
- }
440
- }
441
- } catch (globalError) {
442
- console.error("handleChat error:", globalError);
443
- throw globalError;
444
- }
445
- }
446
-
447
- async function handleGeminiChat(finalMessage, base64Image, base64Audio) {
448
- try {
449
- const images = normalizeImageList(base64Image);
450
- // 1. Check cache first for text-only messages
451
- if (finalMessage && images.length === 0 && !base64Audio) {
452
- const cached = memoryStore.getCachedResponse(finalMessage);
453
- if (cached) return cached;
454
- }
455
-
456
- const desiredModel = resolveGeminiModel();
457
- if (!chat || activeModel !== desiredModel) {
458
- createChat(readChatHistory());
515
+ if (!containsSmartContext && userVisibleMessage && images.length === 0 && !base64Audio) {
516
+ const cached = memoryStore.getCachedResponse(userVisibleMessage);
517
+ if (cached) return cached;
459
518
  }
460
519
 
461
- let aiResponse;
462
- const parts = [];
463
- if (finalMessage) {
464
- parts.push({ text: buildMessageWithRelevantMemory(finalMessage) });
465
- } else if (base64Audio && images.length === 0) {
466
- // Provide a guiding prompt when only audio is provided to ensure Gemini follows instructions
467
- parts.push({ text: "Please listen to this voice command and respond in Thai with the appropriate JSON action if needed." });
468
- } else if (images.length === 0 && !base64Audio) {
469
- parts.push({ text: "Analyze this input." });
470
- }
471
-
472
- for (const item of images) {
473
- const image = imageDataUriToInlineData(item);
474
- parts.push({
475
- inlineData: image
476
- });
477
- }
478
-
479
- if (base64Audio) {
480
- // Extract MIME type from the data URI if present, fallback to audio/webm
481
- let mimeType = "audio/webm";
482
- const mimeMatch = base64Audio.match(/^data:(audio\/\w+);base64,/);
483
- if (mimeMatch) {
484
- mimeType = mimeMatch[1];
485
- }
486
-
487
- const base64Data = base64Audio.replace(/^data:audio\/\w+;base64,/, '');
488
- parts.push({
489
- inlineData: { mimeType: mimeType, data: base64Data }
490
- });
491
- }
492
-
493
- aiResponse = await chat.sendMessage({ message: parts });
494
-
495
- // Save history with timestamps
496
- const history = await chat.getHistory();
520
+ const providersToTry = getProviderAttemptOrder(config);
521
+ const client = new providerAdapter.AgentProviderClient({
522
+ provider: providersToTry[0],
523
+ providerOrder: providersToTry,
524
+ config,
525
+ history: chatHistoryToProviderHistory(previousHistory),
526
+ systemInstruction: buildSystemPrompt(),
527
+ responseMimeType: 'application/json',
528
+ maxTokens: 4096
529
+ });
530
+ const observation = buildChatObservation(finalMessage, images, base64Audio);
531
+ const outputText = await client.sendMessage(observation);
497
532
  const now = new Date().toISOString();
498
-
499
- // Add timestamp to the last two messages (User and Model) if they don't have one
500
- if (history.length >= 2) {
501
- const modelMsg = history[history.length - 1];
502
- const userMsg = history[history.length - 2];
503
- if (!modelMsg.timestamp) modelMsg.timestamp = now;
504
- if (!userMsg.timestamp) userMsg.timestamp = now;
505
- } else if (history.length === 1) {
506
- const msg = history[0];
507
- if (!msg.timestamp) msg.timestamp = now;
508
- }
509
-
510
- writeChatHistory(cleanHistoryForStorage(history));
511
-
512
- let outputText = '';
513
- try {
514
- // Robust text extraction
515
- outputText = (typeof aiResponse.text === 'function') ? aiResponse.text() : (aiResponse.text || '');
516
- } catch (e) {
517
- outputText = String(aiResponse || '');
518
- }
519
-
520
- outputText = stripRelevantMemoryBlock(outputText);
521
-
522
- let parsedResult;
523
- try {
524
- parsedResult = JSON.parse(outputText);
525
- } catch (e) {
526
- // Fallback in case the model failed to return pure JSON
527
- console.error("Failed to parse JSON directly:", e);
528
- const jsonMatch = outputText.match(/```json\n([\s\S]*?)\n```/) || outputText.match(/\{[\s\S]*\}/);
529
- if (jsonMatch) {
530
- parsedResult = JSON.parse(jsonMatch[jsonMatch.length > 1 ? 1 : 0]);
531
- } else {
532
- parsedResult = {
533
- response: outputText,
534
- action: { type: "none", target: "" }
535
- };
536
- }
537
- }
538
-
539
- // Decode any remaining unicode escapes in the response text
540
- if (parsedResult && typeof parsedResult.response === 'string') {
541
- parsedResult.response = decodeUnicode(parsedResult.response);
542
- parsedResult.response = stripRelevantMemoryBlock(parsedResult.response);
543
- }
544
-
545
- // Attach timestamp to the result
546
- validateParsedAction(parsedResult);
547
- parsedResult.timestamp = now;
548
-
549
- // Record interaction for long-term memory (non-blocking)
550
- if (finalMessage && parsedResult.response) {
551
- setImmediate(() => {
552
- memoryStore.recordInteraction(finalMessage, parsedResult.response);
553
- // Cache text-only responses
554
- if (images.length === 0 && !base64Audio) {
555
- memoryStore.cacheResponse(finalMessage, parsedResult);
556
- }
557
- });
533
+ const provider = client.lastSuccessfulProvider || client.provider || providersToTry[0];
534
+ const providerInfo = {
535
+ provider,
536
+ model: getProviderModel(provider, config),
537
+ usage: client.getUsageSummary()
538
+ };
539
+ const parsedResult = parseChatProviderResponse(outputText, userVisibleMessage || finalMessage, now);
540
+ parsedResult.providerInfo = providerInfo;
541
+ appendChatProviderHistory(previousHistory, userVisibleMessage || finalMessage, outputText, providerInfo, now);
542
+
543
+ if ((userVisibleMessage || finalMessage) && parsedResult.response) {
544
+ setImmediate(() => {
545
+ memoryStore.recordInteraction(userVisibleMessage || finalMessage, parsedResult.response);
546
+ if (!containsSmartContext && images.length === 0 && !base64Audio) {
547
+ memoryStore.cacheResponse(userVisibleMessage || finalMessage, parsedResult);
548
+ }
549
+ });
558
550
  }
559
551
 
560
552
  return parsedResult;
561
-
562
- } catch (error) {
563
- console.error("AI API Error:", error);
564
- throw error;
553
+ } catch (globalError) {
554
+ console.error("handleChat error:", globalError);
555
+ throw globalError;
565
556
  }
566
557
  }
567
558
 
@@ -573,6 +564,7 @@ async function handleGeminiChat(finalMessage, base64Image, base64Audio) {
573
564
  async function* handleGeminiChatStream(finalMessage, base64Image, base64Audio) {
574
565
  try {
575
566
  const images = normalizeImageList(base64Image);
567
+ const previousHistory = readChatHistory();
576
568
  // 1. Check cache first
577
569
  if (finalMessage && images.length === 0 && !base64Audio) {
578
570
  const cached = memoryStore.getCachedResponse(finalMessage);
@@ -624,7 +616,7 @@ async function* handleGeminiChatStream(finalMessage, base64Image, base64Audio) {
624
616
  fullText = stripRelevantMemoryBlock(fullText);
625
617
 
626
618
  // Save history
627
- const history = await chat.getHistory();
619
+ const history = preserveHistoryMetadata(await chat.getHistory(), previousHistory, new Date().toISOString());
628
620
  const now = new Date().toISOString();
629
621
  if (history.length >= 2) {
630
622
  const modelMsg = history[history.length - 1];
@@ -646,6 +638,8 @@ async function* handleGeminiChatStream(finalMessage, base64Image, base64Audio) {
646
638
  parsedResult = { response: fullText, action: { type: 'none', target: '' } };
647
639
  }
648
640
  }
641
+ parsedResult = normalizeParsedResult(parsedResult, finalMessage);
642
+
649
643
  if (parsedResult && typeof parsedResult.response === 'string') {
650
644
  parsedResult.response = decodeUnicode(parsedResult.response);
651
645
  parsedResult.response = stripRelevantMemoryBlock(parsedResult.response);
@@ -672,269 +666,6 @@ async function* handleGeminiChatStream(finalMessage, base64Image, base64Audio) {
672
666
  }
673
667
  }
674
668
 
675
- async function handleAnthropicChat(finalMessage, base64Image, config) {
676
- const history = readChatHistory() || [];
677
- const images = normalizeImageList(base64Image);
678
- const apiKey = config.anthropicApiKey || process.env.ANTHROPIC_API_KEY;
679
- if (isPlaceholder(apiKey)) return { response: "กรุณาใส่ Anthropic API Key ในการตั้งค่าก่อนนะคะ", action: { type: "none" } };
680
-
681
- const systemPrompt = buildSystemPrompt();
682
-
683
- const messages = [];
684
- for (const msg of history.slice(-MAX_HISTORY_MESSAGES)) {
685
- const role = msg.role === 'model' ? 'assistant' : 'user';
686
- let text = Array.isArray(msg.parts) ? msg.parts.map(p => p.text || '').join('\n') : '';
687
- if (text) messages.push({ role, content: text });
688
- }
689
-
690
- const content = [];
691
- for (const item of images) {
692
- const image = imageDataUriToInlineData(item);
693
- content.push({
694
- type: "image",
695
- source: { type: "base64", media_type: image.mimeType, data: image.data }
696
- });
697
- }
698
- content.push({ type: "text", text: finalMessage || "Analyze this." });
699
- messages.push({ role: "user", content });
700
-
701
- const response = await axios.post('https://api.anthropic.com/v1/messages', {
702
- model: config.anthropicModel || 'claude-3-5-sonnet-latest',
703
- max_tokens: 4096,
704
- system: systemPrompt,
705
- messages: messages
706
- }, {
707
- headers: {
708
- 'x-api-key': apiKey,
709
- 'anthropic-version': '2023-06-01',
710
- 'content-type': 'application/json'
711
- }
712
- });
713
-
714
- const outputText = response.data.content[0].text;
715
- history.push({ role: 'user', parts: [{ text: finalMessage }] });
716
- history.push({ role: 'model', parts: [{ text: outputText }] });
717
- writeChatHistory(cleanHistoryForStorage(history.slice(-MAX_STORED_HISTORY_MESSAGES)));
718
-
719
- return parseAiResponse(outputText);
720
- }
721
-
722
- async function handleOpenAIChat(finalMessage, base64Image, config) {
723
- const history = readChatHistory() || [];
724
- const images = normalizeImageList(base64Image);
725
- const apiKey = config.openaiApiKey || process.env.OPENAI_API_KEY;
726
- if (isPlaceholder(apiKey)) return { response: "กรุณาใส่ OpenAI API Key ในการตั้งค่าก่อนนะคะ", action: { type: "none" } };
727
-
728
- const systemPrompt = buildSystemPrompt();
729
-
730
- const messages = [{ role: "system", content: systemPrompt }];
731
- for (const msg of history.slice(-MAX_HISTORY_MESSAGES)) {
732
- const role = msg.role === 'model' ? 'assistant' : 'user';
733
- let text = Array.isArray(msg.parts) ? msg.parts.map(p => p.text || '').join('\n') : '';
734
- if (text) messages.push({ role, content: text });
735
- }
736
-
737
- const content = [{ type: "text", text: finalMessage || "Analyze this." }];
738
- for (const item of images) {
739
- content.push({
740
- type: "image_url",
741
- image_url: { url: item }
742
- });
743
- }
744
- messages.push({ role: "user", content });
745
-
746
- const response = await axios.post('https://api.openai.com/v1/chat/completions', {
747
- model: config.openaiModel || 'gpt-4o',
748
- messages: messages,
749
- response_format: { type: "json_object" }
750
- }, {
751
- headers: {
752
- 'Authorization': `Bearer ${apiKey}`,
753
- 'Content-Type': 'application/json'
754
- }
755
- });
756
-
757
- const outputText = response.data.choices[0].message.content;
758
- history.push({ role: 'user', parts: [{ text: finalMessage }] });
759
- history.push({ role: 'model', parts: [{ text: outputText }] });
760
- writeChatHistory(cleanHistoryForStorage(history.slice(-MAX_STORED_HISTORY_MESSAGES)));
761
-
762
- return parseAiResponse(outputText);
763
- }
764
-
765
- async function handleLocalOpenAIChat(finalMessage, base64Image, config) {
766
- const history = readChatHistory() || [];
767
- const images = normalizeImageList(base64Image);
768
- const apiKey = 'lm-studio';
769
- const baseUrl = config.localApiBaseUrl || 'http://localhost:1234/v1';
770
-
771
- const systemPrompt = buildSystemPrompt();
772
-
773
- const messages = [{ role: "system", content: systemPrompt }];
774
- for (const msg of history.slice(-MAX_HISTORY_MESSAGES)) {
775
- const role = msg.role === 'model' ? 'assistant' : 'user';
776
- let text = Array.isArray(msg.parts) ? msg.parts.map(p => p.text || '').join('\n') : '';
777
- if (text) messages.push({ role, content: text });
778
- }
779
-
780
- const content = [{ type: "text", text: finalMessage || "Analyze this." }];
781
- for (const item of images) {
782
- content.push({
783
- type: "image_url",
784
- image_url: { url: item }
785
- });
786
- }
787
- messages.push({ role: "user", content });
788
-
789
- const response = await axios.post(`${baseUrl.replace(/\/$/, '')}/chat/completions`, {
790
- model: config.localModelName || 'local-model',
791
- messages: messages,
792
- // response_format json_object is sometimes problematic on weak local models, but required by our prompt.
793
- // We'll keep it as some local servers like LM Studio support it for specific models.
794
- // If not supported, the system prompt usually coerces it anyway.
795
- response_format: { type: "json_object" }
796
- }, {
797
- headers: {
798
- 'Authorization': `Bearer ${apiKey}`,
799
- 'Content-Type': 'application/json'
800
- }
801
- });
802
-
803
- const outputText = response.data.choices[0].message.content;
804
- history.push({ role: 'user', parts: [{ text: finalMessage }] });
805
- history.push({ role: 'model', parts: [{ text: outputText }] });
806
- writeChatHistory(cleanHistoryForStorage(history.slice(-MAX_STORED_HISTORY_MESSAGES)));
807
-
808
- return parseAiResponse(outputText);
809
- }
810
-
811
- async function handleHuggingFaceChat(finalMessage, base64Image, config) {
812
- const history = readChatHistory() || [];
813
- const images = normalizeImageList(base64Image);
814
- const apiKey = config.hfApiKey || process.env.HF_API_KEY;
815
- if (isPlaceholder(apiKey)) return { response: "กรุณาใส่ Hugging Face API Key ในการตั้งค่าก่อนนะคะ", action: { type: "none" } };
816
-
817
- const modelId = config.hfModel || 'meta-llama/Meta-Llama-3-8B-Instruct';
818
- const baseUrl = `https://api-inference.huggingface.co/models/${modelId}/v1/chat/completions`;
819
-
820
- const systemPrompt = buildSystemPrompt();
821
-
822
- const messages = [{ role: "system", content: systemPrompt }];
823
- for (const msg of history.slice(-MAX_HISTORY_MESSAGES)) {
824
- const role = msg.role === 'model' ? 'assistant' : 'user';
825
- let text = Array.isArray(msg.parts) ? msg.parts.map(p => p.text || '').join('\n') : '';
826
- if (text) messages.push({ role, content: text });
827
- }
828
-
829
- const content = [{ type: "text", text: finalMessage || "Analyze this." }];
830
- for (const item of images) {
831
- content.push({
832
- type: "image_url",
833
- image_url: { url: item }
834
- });
835
- }
836
- messages.push({ role: "user", content });
837
-
838
- const response = await axios.post(baseUrl, {
839
- model: modelId,
840
- messages: messages,
841
- max_tokens: 4096
842
- }, {
843
- headers: {
844
- 'Authorization': `Bearer ${apiKey}`,
845
- 'Content-Type': 'application/json'
846
- }
847
- });
848
-
849
- const outputText = response.data.choices[0].message.content;
850
- history.push({ role: 'user', parts: [{ text: finalMessage }] });
851
- history.push({ role: 'model', parts: [{ text: outputText }] });
852
- writeChatHistory(cleanHistoryForStorage(history.slice(-MAX_STORED_HISTORY_MESSAGES)));
853
-
854
- return parseAiResponse(outputText);
855
- }
856
-
857
- function parseAiResponse(outputText) {
858
- let parsedResult;
859
- try {
860
- parsedResult = JSON.parse(outputText);
861
- } catch (e) {
862
- const jsonMatch = outputText.match(/```json\n([\s\S]*?)\n```/) || outputText.match(/\{[\s\S]*\}/);
863
- if (jsonMatch) {
864
- parsedResult = JSON.parse(jsonMatch[jsonMatch.length > 1 ? 1 : 0]);
865
- } else {
866
- parsedResult = { response: outputText, action: { type: "none", target: "" } };
867
- }
868
- }
869
- if (parsedResult && typeof parsedResult.response === 'string') {
870
- parsedResult.response = decodeUnicode(parsedResult.response);
871
- }
872
- validateParsedAction(parsedResult);
873
- parsedResult.timestamp = new Date().toISOString();
874
- return parsedResult;
875
- }
876
-
877
- async function handleOllamaChat(finalMessage, base64Image, base64Audio, config) {
878
- const history = readChatHistory() || [];
879
- const imageInputs = normalizeImageList(base64Image);
880
-
881
- const ollamaMessages = [
882
- { role: 'system', content: buildSystemPrompt() }
883
- ];
884
-
885
- for (const msg of history.slice(-MAX_HISTORY_MESSAGES)) {
886
- const role = msg.role === 'model' ? 'assistant' : 'user';
887
- let text = '';
888
- if (Array.isArray(msg.parts)) {
889
- text = msg.parts.map(p => p.text || '').join('\n');
890
- }
891
- if (text) ollamaMessages.push({ role, content: text });
892
- }
893
-
894
- let currentContent = finalMessage || 'Analyze this input.';
895
- let images = [];
896
- for (const item of imageInputs) {
897
- images.push(imageDataUriToBase64(item));
898
- }
899
-
900
- if (base64Audio && imageInputs.length === 0 && !finalMessage) {
901
- currentContent = "Please analyze this audio requirement based on text if any was transacted, otherwise reply with appropriate action.";
902
- }
903
-
904
- const userMessage = { role: 'user', content: currentContent };
905
- if (images.length > 0) userMessage.images = images;
906
-
907
- ollamaMessages.push(userMessage);
908
-
909
- const ollamaBaseUrl = (config.ollamaHost || 'http://localhost:11434').replace(/\/$/, '');
910
- const response = await axios.post(`${ollamaBaseUrl}/api/chat`, {
911
- model: config.ollamaModel || 'llama3:latest',
912
- messages: ollamaMessages,
913
- format: 'json',
914
- stream: false
915
- });
916
-
917
- const outputText = response.data.message.content;
918
-
919
- history.push({ role: 'user', parts: [{ text: currentContent }] });
920
- history.push({ role: 'model', parts: [{ text: outputText }] });
921
- writeChatHistory(cleanHistoryForStorage(history.slice(-MAX_STORED_HISTORY_MESSAGES)));
922
-
923
- let parsedResult;
924
- try {
925
- parsedResult = JSON.parse(outputText);
926
- } catch(e) {
927
- const jsonMatch = outputText.match(/```json\n([\s\S]*?)\n```/) || outputText.match(/\{[\s\S]*\}/);
928
- if (jsonMatch) {
929
- parsedResult = JSON.parse(jsonMatch[jsonMatch.length > 1 ? 1 : 0]);
930
- } else {
931
- parsedResult = { response: outputText, action: { type: "none", target: "" } };
932
- }
933
- }
934
- validateParsedAction(parsedResult);
935
- return parsedResult;
936
- }
937
-
938
669
  function resetChat() {
939
670
  clearChatHistory();
940
671
  memoryStore.clearConversationScopedProfile();
@@ -980,7 +711,7 @@ function historyToTranscript(history) {
980
711
  transcript.push({
981
712
  sender,
982
713
  text,
983
- timestamp: content.timestamp || new Date().toISOString(),
714
+ timestamp: content.timestamp || null,
984
715
  providerInfo: content.providerInfo || null
985
716
  });
986
717
  }
@@ -1058,6 +789,8 @@ module.exports = {
1058
789
  translateImageContent,
1059
790
  refreshApiKeyFromConfig,
1060
791
  _helpers: {
1061
- getProviderAttemptOrder
792
+ getProviderAttemptOrder,
793
+ normalizeParsedResult,
794
+ buildActionModeInstruction
1062
795
  }
1063
796
  };