@beltoinc/slyos-sdk 1.5.1 → 1.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/create-chatbot.sh +160 -90
  2. package/package.json +1 -1
package/create-chatbot.sh CHANGED
@@ -292,128 +292,198 @@ function printWelcome() {
292
292
  }
293
293
 
294
294
  /**
295
- * Send message to AI and get response
295
+ * Clean up model output stop hallucinated Q&A chains, strip artifacts
296
+ */
297
+ function cleanResponse(text) {
298
+ return text
299
+ // CRITICAL: Cut at first hallucinated Q&A follow-up
300
+ .split(/\n\s*Q\s*:/)[0]
301
+ // Cut at hallucinated role prefixes
302
+ .split(/\n\s*(User|Human|System|Question|A:|Answer):/i)[0]
303
+ // Strip repeated garbage chars
304
+ .replace(/(.)\1{5,}/g, '')
305
+ // Strip leading role prefixes
306
+ .replace(/^(assistant|system|answer|response|AI)\s*[:]\s*/i, '')
307
+ // Strip any remaining mid-response role prefix
308
+ .replace(/^\s*(assistant|AI)\s*[:]\s*/im, '')
309
+ .trim();
310
+ }
311
+
312
+ /**
313
+ * Send message to AI and get response — with timing metrics and streaming
296
314
  */
297
315
  async function sendMessage(userMessage) {
298
316
  try {
299
- console.log(`${colors.dim}Thinking...${colors.reset}`);
300
-
317
+ const totalStart = Date.now();
301
318
  let assistantMessage = '';
302
319
  let sourceInfo = '';
320
+ let retrievalMs = 0;
321
+ let firstTokenMs = 0;
322
+ let generationMs = 0;
323
+ let tokensGenerated = 0;
303
324
 
304
325
  if (config.kbId) {
305
- // RAG mode: call API directly to get relevant chunks, then generate locally with context
326
+ // ── RAG MODE ──
306
327
  console.log(`${colors.dim}Searching knowledge base...${colors.reset}`);
328
+ const retrievalStart = Date.now();
329
+
307
330
  try {
308
331
  const token = await getAuthToken();
309
- if (!token) throw new Error('Could not authenticate — check your API key');
310
- // Adapt chunk count to model's context window
332
+ if (!token) throw new Error('Could not authenticate');
333
+
311
334
  const modelCtx = sdk.getModelContextWindow?.() || 2048;
312
- const topK = modelCtx <= 2048 ? 2 : modelCtx <= 4096 ? 3 : 5;
335
+ const memoryMB = 8192; // default; could read from sdk.getDeviceProfile()
336
+ const cpuCores = 8;
337
+
338
+ // Dynamic config based on device + model
339
+ const deviceTier = (memoryMB >= 8192 && cpuCores >= 8) ? 'high' : (memoryMB >= 4096) ? 'mid' : 'low';
340
+ const topK = deviceTier === 'high' ? 3 : deviceTier === 'mid' ? 2 : 1;
341
+ const maxContextChars = modelCtx <= 2048
342
+ ? (deviceTier === 'high' ? 600 : deviceTier === 'mid' ? 400 : 300)
343
+ : modelCtx <= 4096
344
+ ? (deviceTier === 'high' ? 1500 : 1000)
345
+ : 2000;
346
+ const maxGenTokens = modelCtx <= 2048
347
+ ? (deviceTier === 'high' ? 200 : 150)
348
+ : Math.min(400, Math.floor(modelCtx / 4));
349
+
313
350
  const ragRes = await fetch(`${config.server}/api/rag/knowledge-bases/${config.kbId}/query`, {
314
351
  method: 'POST',
315
- headers: {
316
- 'Content-Type': 'application/json',
317
- 'Authorization': `Bearer ${token}`
318
- },
352
+ headers: { 'Content-Type': 'application/json', 'Authorization': `Bearer ${token}` },
319
353
  body: JSON.stringify({ query: userMessage, top_k: topK, model_id: config.model })
320
354
  });
321
- if (!ragRes.ok) {
322
- const errText = await ragRes.text();
323
- throw new Error(`RAG query failed: ${ragRes.status} - ${errText}`);
324
- }
355
+ if (!ragRes.ok) throw new Error(`RAG ${ragRes.status}`);
325
356
  const ragData = await ragRes.json();
326
- const chunks = ragData.retrieved_chunks || [];
327
-
328
- // Check if chunks are relevant enough (similarity > 0.3)
329
- const goodChunks = chunks.filter(c => (c.similarity_score || 0) > 0.3);
330
-
331
- if (goodChunks.length > 0) {
332
- const ctxWindow = sdk.getModelContextWindow?.() || 2048;
333
-
334
- // AGGRESSIVE context limits — small models choke on long prompts
335
- // ~4 chars per token on average, reserve 60% of window for generation
336
- const maxContextTokens = Math.floor(ctxWindow * 0.3);
337
- const maxContextChars = ctxWindow <= 2048 ? 400 : ctxWindow <= 4096 ? 1000 : 2000;
338
- const maxGenTokens = ctxWindow <= 2048 ? 150 : Math.min(300, Math.floor(ctxWindow / 4));
357
+ const chunks = (ragData.retrieved_chunks || []).filter(c => (c.similarity_score || 0) > 0.3);
358
+ retrievalMs = Date.now() - retrievalStart;
339
359
 
340
- // Use only the single best chunk for small models
341
- const bestChunk = goodChunks[0];
360
+ if (chunks.length > 0) {
361
+ const bestChunk = chunks[0];
342
362
  let context = bestChunk.content
343
- .replace(/[^\x20-\x7E\n]/g, ' ') // Strip non-ASCII/control chars
344
- .replace(/\s{2,}/g, ' ') // Collapse whitespace
345
- .replace(/<[^>]+>/g, ' ') // Strip HTML tags
346
- .replace(/https?:\/\/\S+/g, '') // Strip URLs
347
- .replace(/[{}()\[\]]/g, '') // Strip brackets/braces
348
- .trim();
363
+ .replace(/[^\x20-\x7E\n]/g, ' ').replace(/\s{2,}/g, ' ')
364
+ .replace(/<[^>]+>/g, ' ').replace(/https?:\/\/\S+/g, '')
365
+ .replace(/[{}()\[\]]/g, '').trim();
349
366
  if (context.length > maxContextChars) context = context.substring(0, maxContextChars);
350
367
 
351
- console.log(`${colors.dim}Context: ${context.length} chars from "${bestChunk.document_name}"${colors.reset}`);
352
-
353
- // Minimal prompt — every token counts
354
- const prompt = `${context}\n\nQ: ${userMessage}\nA:`;
355
- const response = await sdk.generate(config.model, prompt, {
356
- temperature: 0.6,
357
- maxTokens: maxGenTokens
358
- });
359
- assistantMessage = (typeof response === 'string' ? response : response?.text || response?.content || '') || '';
360
-
361
- // Collect source names
362
- const sources = [...new Set(goodChunks.map(c => c.document_name || c.source).filter(Boolean))];
363
- if (sources.length > 0) {
364
- sourceInfo = `\n${colors.dim}[Sources: ${sources.join(', ')}]${colors.reset}`;
368
+ console.log(`${colors.dim}Context: ${context.length} chars from "${bestChunk.document_name}" [retrieval: ${retrievalMs}ms, tier: ${deviceTier}]${colors.reset}`);
369
+
370
+ // Instruction prompt — avoids Q&A chain hallucination
371
+ const prompt = `Based on this information:\n${context}\n\nAnswer briefly: ${userMessage}\n\n`;
372
+
373
+ // Stream tokens
374
+ const genStart = Date.now();
375
+ let firstToken = false;
376
+ process.stdout.write(`\n${colors.bright}${colors.magenta}AI:${colors.reset} `);
377
+
378
+ if (sdk.generateStream) {
379
+ const result = await sdk.generateStream(config.model, prompt, {
380
+ temperature: 0.6,
381
+ maxTokens: maxGenTokens,
382
+ onToken: (token, partial) => {
383
+ if (!firstToken) {
384
+ firstTokenMs = Date.now() - genStart;
385
+ firstToken = true;
386
+ }
387
+ process.stdout.write(token);
388
+ }
389
+ });
390
+ assistantMessage = result.text || '';
391
+ if (!firstToken) firstTokenMs = result.firstTokenMs || 0;
392
+ tokensGenerated = result.tokensGenerated || assistantMessage.split(/\s+/).length;
393
+ } else {
394
+ // Fallback: no streaming
395
+ const response = await sdk.generate(config.model, prompt, {
396
+ temperature: 0.6,
397
+ maxTokens: maxGenTokens
398
+ });
399
+ assistantMessage = (typeof response === 'string' ? response : response?.text || '') || '';
400
+ firstTokenMs = Date.now() - genStart;
401
+ tokensGenerated = assistantMessage.split(/\s+/).length;
402
+ process.stdout.write(assistantMessage);
365
403
  }
404
+ generationMs = Date.now() - genStart;
405
+
406
+ // Source info
407
+ const sources = [...new Set(chunks.map(c => c.document_name || c.source).filter(Boolean))];
408
+ if (sources.length > 0) sourceInfo = `\n${colors.dim}[Sources: ${sources.join(', ')}]${colors.reset}`;
366
409
  } else {
367
- // No relevant chunks answer conversationally
368
- console.log(`${colors.dim}No RAG context found, using plain generation...${colors.reset}`);
369
- const prompt = `The user said: "${userMessage}"\nGive a brief, friendly response:\n`;
370
- const response = await sdk.generate(config.model, prompt, {
371
- temperature: 0.7,
372
- maxTokens: 100
373
- });
374
- assistantMessage = (typeof response === 'string' ? response : response?.text || response?.content || '') || '';
410
+ console.log(`${colors.dim}No relevant context found [retrieval: ${retrievalMs}ms]${colors.reset}`);
411
+ const genStart = Date.now();
412
+ process.stdout.write(`\n${colors.bright}${colors.magenta}AI:${colors.reset} `);
413
+
414
+ if (sdk.generateStream) {
415
+ const result = await sdk.generateStream(config.model, `Answer briefly: ${userMessage}\n\n`, {
416
+ temperature: 0.7, maxTokens: 100,
417
+ onToken: (token) => {
418
+ if (!firstTokenMs) firstTokenMs = Date.now() - genStart;
419
+ process.stdout.write(token);
420
+ }
421
+ });
422
+ assistantMessage = result.text || '';
423
+ tokensGenerated = result.tokensGenerated || assistantMessage.split(/\s+/).length;
424
+ } else {
425
+ const response = await sdk.generate(config.model, `Answer briefly: ${userMessage}\n\n`, {
426
+ temperature: 0.7, maxTokens: 100
427
+ });
428
+ assistantMessage = (typeof response === 'string' ? response : response?.text || '') || '';
429
+ firstTokenMs = Date.now() - genStart;
430
+ tokensGenerated = assistantMessage.split(/\s+/).length;
431
+ process.stdout.write(assistantMessage);
432
+ }
433
+ generationMs = Date.now() - genStart;
375
434
  }
376
435
  } catch (ragErr) {
377
- console.log(`${colors.yellow}RAG lookup failed: ${ragErr.message}${colors.reset}`);
378
- const prompt = `The user said: "${userMessage}"\nGive a brief, friendly response:\n`;
379
- const response = await sdk.generate(config.model, prompt, {
380
- temperature: 0.7,
381
- maxTokens: 100
436
+ console.log(`${colors.yellow}RAG failed: ${ragErr.message}${colors.reset}`);
437
+ const genStart = Date.now();
438
+ const response = await sdk.generate(config.model, `Answer briefly: ${userMessage}\n\n`, {
439
+ temperature: 0.7, maxTokens: 100
382
440
  });
383
- assistantMessage = (typeof response === 'string' ? response : response?.text || response?.content || '') || '';
441
+ assistantMessage = (typeof response === 'string' ? response : response?.text || '') || '';
442
+ firstTokenMs = Date.now() - genStart;
443
+ generationMs = firstTokenMs;
444
+ tokensGenerated = assistantMessage.split(/\s+/).length;
445
+ process.stdout.write(`\n${colors.bright}${colors.magenta}AI:${colors.reset} ${assistantMessage}`);
384
446
  }
385
447
  } else {
386
- // Plain mode: direct generation (no RAG)
387
- const prompt = `The user said: "${userMessage}"\nGive a brief, helpful response:\n`;
388
- const response = await sdk.generate(config.model, prompt, {
389
- temperature: 0.7,
390
- maxTokens: 150
391
- });
392
- assistantMessage = (typeof response === 'string' ? response : response?.text || response?.content || '') || '';
448
+ // ── PLAIN MODE ──
449
+ const genStart = Date.now();
450
+ process.stdout.write(`\n${colors.bright}${colors.magenta}AI:${colors.reset} `);
451
+
452
+ if (sdk.generateStream) {
453
+ const result = await sdk.generateStream(config.model, `Answer briefly: ${userMessage}\n\n`, {
454
+ temperature: 0.7, maxTokens: 150,
455
+ onToken: (token) => {
456
+ if (!firstTokenMs) firstTokenMs = Date.now() - genStart;
457
+ process.stdout.write(token);
458
+ }
459
+ });
460
+ assistantMessage = result.text || '';
461
+ tokensGenerated = result.tokensGenerated || assistantMessage.split(/\s+/).length;
462
+ } else {
463
+ const response = await sdk.generate(config.model, `Answer briefly: ${userMessage}\n\n`, {
464
+ temperature: 0.7, maxTokens: 150
465
+ });
466
+ assistantMessage = (typeof response === 'string' ? response : response?.text || '') || '';
467
+ firstTokenMs = Date.now() - genStart;
468
+ tokensGenerated = assistantMessage.split(/\s+/).length;
469
+ process.stdout.write(assistantMessage);
470
+ }
471
+ generationMs = Date.now() - genStart;
393
472
  }
394
473
 
395
- // Clean up model output artifacts
396
- assistantMessage = assistantMessage
397
- // Strip repeated garbage chars (!!!, ???, etc)
398
- .replace(/(.)\1{5,}/g, '')
399
- // Strip leading role prefixes the model loves to emit
400
- .replace(/^(assistant|system|answer|response|AI)\s*[:]\s*/i, '')
401
- // Remove leading partial sentences (fragments before the real answer)
402
- .replace(/^[a-z][^.!?]{0,40}\.\s*/i, function(match) {
403
- // Only strip if it looks like a fragment (< 50 chars ending in period)
404
- return match.length < 50 && !match.includes(' is ') ? '' : match;
405
- })
406
- // Stop at any hallucinated role prefixes mid-response
407
- .split(/\n\s*(User|Human|System|Question):/i)[0]
408
- // Strip any remaining leading role prefix after newline
409
- .replace(/^\s*(assistant|AI)\s*[:]\s*/im, '')
410
- .trim();
474
+ // Clean up hallucinated Q&A chains
475
+ assistantMessage = cleanResponse(assistantMessage);
476
+
477
+ const totalMs = Date.now() - totalStart;
478
+ const tokPerSec = generationMs > 0 ? (tokensGenerated / (generationMs / 1000)).toFixed(1) : '0';
479
+
480
+ // Print timing summary
481
+ console.log(sourceInfo);
482
+ console.log(`${colors.dim}⏱ retrieval: ${retrievalMs}ms | first token: ${firstTokenMs}ms | generation: ${generationMs}ms | total: ${totalMs}ms | ${tokensGenerated} tokens @ ${tokPerSec} tok/s${colors.reset}\n`);
411
483
 
412
484
  if (!assistantMessage || assistantMessage.length < 3) {
413
- assistantMessage = '(No response generated — try rephrasing your question)';
485
+ console.log(`${colors.yellow}(No response generated — try rephrasing your question)${colors.reset}\n`);
414
486
  }
415
-
416
- console.log(`\n${colors.bright}${colors.magenta}AI:${colors.reset} ${assistantMessage}${sourceInfo}\n`);
417
487
  } catch (error) {
418
488
  console.error(`\n${colors.red}Error:${colors.reset} ${error.message}\n`);
419
489
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@beltoinc/slyos-sdk",
3
- "version": "1.5.1",
3
+ "version": "1.5.2",
4
4
  "description": "SlyOS - On-Device AI SDK for Web and Node.js",
5
5
  "main": "dist/index.js",
6
6
  "types": "dist/index.d.ts",