@beltoinc/slyos-sdk 1.5.0 → 1.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/create-chatbot.sh CHANGED
@@ -292,122 +292,198 @@ function printWelcome() {
 }
 
 /**
- * Send message to AI and get response
+ * Clean up model output: stop hallucinated Q&A chains, strip artifacts
+ */
+function cleanResponse(text) {
+  return text
+    // CRITICAL: Cut at first hallucinated Q&A follow-up
+    .split(/\n\s*Q\s*:/)[0]
+    // Cut at hallucinated role prefixes
+    .split(/\n\s*(User|Human|System|Question|A|Answer):/i)[0]
+    // Strip repeated garbage chars
+    .replace(/(.)\1{5,}/g, '')
+    // Strip leading role prefixes
+    .replace(/^(assistant|system|answer|response|AI)\s*[:]\s*/i, '')
+    // Strip any remaining mid-response role prefix
+    .replace(/^\s*(assistant|AI)\s*[:]\s*/im, '')
+    .trim();
+}
+
+/**
+ * Send message to AI and get response — with timing metrics and streaming
  */
 async function sendMessage(userMessage) {
   try {
-    console.log(`${colors.dim}Thinking...${colors.reset}`);
-
+    const totalStart = Date.now();
     let assistantMessage = '';
     let sourceInfo = '';
+    let retrievalMs = 0;
+    let firstTokenMs = 0;
+    let generationMs = 0;
+    let tokensGenerated = 0;
 
     if (config.kbId) {
-      // RAG mode: call API directly to get relevant chunks, then generate locally with context
+      // ── RAG MODE ──
       console.log(`${colors.dim}Searching knowledge base...${colors.reset}`);
+      const retrievalStart = Date.now();
+
       try {
         const token = await getAuthToken();
-        if (!token) throw new Error('Could not authenticate — check your API key');
-        // Adapt chunk count to model's context window
+        if (!token) throw new Error('Could not authenticate');
+
         const modelCtx = sdk.getModelContextWindow?.() || 2048;
-        const topK = modelCtx <= 2048 ? 2 : modelCtx <= 4096 ? 3 : 5;
+        const memoryMB = 8192; // default; could read from sdk.getDeviceProfile()
+        const cpuCores = 8;
+
+        // Dynamic config based on device + model
+        const deviceTier = (memoryMB >= 8192 && cpuCores >= 8) ? 'high' : (memoryMB >= 4096) ? 'mid' : 'low';
+        const topK = deviceTier === 'high' ? 3 : deviceTier === 'mid' ? 2 : 1;
+        const maxContextChars = modelCtx <= 2048
+          ? (deviceTier === 'high' ? 600 : deviceTier === 'mid' ? 400 : 300)
+          : modelCtx <= 4096
+            ? (deviceTier === 'high' ? 1500 : 1000)
+            : 2000;
+        const maxGenTokens = modelCtx <= 2048
+          ? (deviceTier === 'high' ? 200 : 150)
+          : Math.min(400, Math.floor(modelCtx / 4));
+
         const ragRes = await fetch(`${config.server}/api/rag/knowledge-bases/${config.kbId}/query`, {
           method: 'POST',
-          headers: {
-            'Content-Type': 'application/json',
-            'Authorization': `Bearer ${token}`
-          },
+          headers: { 'Content-Type': 'application/json', 'Authorization': `Bearer ${token}` },
           body: JSON.stringify({ query: userMessage, top_k: topK, model_id: config.model })
         });
-        if (!ragRes.ok) {
-          const errText = await ragRes.text();
-          throw new Error(`RAG query failed: ${ragRes.status} - ${errText}`);
-        }
+        if (!ragRes.ok) throw new Error(`RAG ${ragRes.status}`);
         const ragData = await ragRes.json();
-        const chunks = ragData.retrieved_chunks || [];
-
-        // Check if chunks are relevant enough (similarity > 0.3)
-        const goodChunks = chunks.filter(c => (c.similarity_score || 0) > 0.3);
-
-        if (goodChunks.length > 0) {
-          // Keep context SHORT — small models need room to generate
-          const ctxWindow = sdk.getModelContextWindow?.() || 2048;
-          // Reserve at least 40% of context window for generation
-          const maxContextChars = ctxWindow <= 2048 ? 800 : ctxWindow <= 4096 ? 1500 : 3000;
-          const maxGenTokens = ctxWindow <= 2048 ? 150 : Math.min(300, Math.floor(ctxWindow / 4));
-
-          // Clean and truncate context — strip weird chars, fit model window
-          let context = goodChunks.map(c => c.content).join('\n')
-            .replace(/[^\x20-\x7E\n]/g, ' ') // Strip non-ASCII/control chars
-            .replace(/\s{3,}/g, ' ') // Collapse excessive whitespace
-            .replace(/<[^>]+>/g, ' ') // Strip any leftover HTML tags
-            .replace(/https?:\/\/\S+/g, '') // Strip URLs to save tokens
-            .trim();
+        const chunks = (ragData.retrieved_chunks || []).filter(c => (c.similarity_score || 0) > 0.3);
+        retrievalMs = Date.now() - retrievalStart;
+
+        if (chunks.length > 0) {
+          const bestChunk = chunks[0];
+          let context = bestChunk.content
+            .replace(/[^\x20-\x7E\n]/g, ' ').replace(/\s{2,}/g, ' ')
+            .replace(/<[^>]+>/g, ' ').replace(/https?:\/\/\S+/g, '')
+            .replace(/[{}()\[\]]/g, '').trim();
           if (context.length > maxContextChars) context = context.substring(0, maxContextChars);
 
-          // Instruction-style prompt that small models understand
-          const prompt = `Use the following information to answer the question.\n\nInfo: ${context}\n\nQuestion: ${userMessage}\nAnswer:`;
-          const response = await sdk.generate(config.model, prompt, {
-            temperature: 0.6,
-            maxTokens: maxGenTokens
-          });
-          assistantMessage = (typeof response === 'string' ? response : response?.text || response?.content || '') || '';
-
-          // Collect source names
-          const sources = [...new Set(goodChunks.map(c => c.document_name || c.source).filter(Boolean))];
-          if (sources.length > 0) {
-            sourceInfo = `\n${colors.dim}[Sources: ${sources.join(', ')}]${colors.reset}`;
+          console.log(`${colors.dim}Context: ${context.length} chars from "${bestChunk.document_name}" [retrieval: ${retrievalMs}ms, tier: ${deviceTier}]${colors.reset}`);
+
+          // Instruction prompt avoids Q&A chain hallucination
+          const prompt = `Based on this information:\n${context}\n\nAnswer briefly: ${userMessage}\n\n`;
+
+          // Stream tokens
+          const genStart = Date.now();
+          let firstToken = false;
+          process.stdout.write(`\n${colors.bright}${colors.magenta}AI:${colors.reset} `);
+
+          if (sdk.generateStream) {
+            const result = await sdk.generateStream(config.model, prompt, {
+              temperature: 0.6,
+              maxTokens: maxGenTokens,
+              onToken: (token, partial) => {
+                if (!firstToken) {
+                  firstTokenMs = Date.now() - genStart;
+                  firstToken = true;
+                }
+                process.stdout.write(token);
+              }
+            });
+            assistantMessage = result.text || '';
+            if (!firstToken) firstTokenMs = result.firstTokenMs || 0;
+            tokensGenerated = result.tokensGenerated || assistantMessage.split(/\s+/).length;
+          } else {
+            // Fallback: no streaming
+            const response = await sdk.generate(config.model, prompt, {
+              temperature: 0.6,
+              maxTokens: maxGenTokens
+            });
+            assistantMessage = (typeof response === 'string' ? response : response?.text || '') || '';
+            firstTokenMs = Date.now() - genStart;
+            tokensGenerated = assistantMessage.split(/\s+/).length;
+            process.stdout.write(assistantMessage);
           }
+          generationMs = Date.now() - genStart;
+
+          // Source info
+          const sources = [...new Set(chunks.map(c => c.document_name || c.source).filter(Boolean))];
+          if (sources.length > 0) sourceInfo = `\n${colors.dim}[Sources: ${sources.join(', ')}]${colors.reset}`;
         } else {
-          // No relevant chunks answer conversationally
-          console.log(`${colors.dim}No RAG context found, using plain generation...${colors.reset}`);
-          const prompt = `The user said: "${userMessage}"\nGive a brief, friendly response:\n`;
-          const response = await sdk.generate(config.model, prompt, {
-            temperature: 0.7,
-            maxTokens: 100
-          });
-          assistantMessage = (typeof response === 'string' ? response : response?.text || response?.content || '') || '';
+          console.log(`${colors.dim}No relevant context found [retrieval: ${retrievalMs}ms]${colors.reset}`);
+          const genStart = Date.now();
+          process.stdout.write(`\n${colors.bright}${colors.magenta}AI:${colors.reset} `);
+
+          if (sdk.generateStream) {
+            const result = await sdk.generateStream(config.model, `Answer briefly: ${userMessage}\n\n`, {
+              temperature: 0.7, maxTokens: 100,
+              onToken: (token) => {
+                if (!firstTokenMs) firstTokenMs = Date.now() - genStart;
+                process.stdout.write(token);
+              }
+            });
+            assistantMessage = result.text || '';
+            tokensGenerated = result.tokensGenerated || assistantMessage.split(/\s+/).length;
+          } else {
+            const response = await sdk.generate(config.model, `Answer briefly: ${userMessage}\n\n`, {
+              temperature: 0.7, maxTokens: 100
+            });
+            assistantMessage = (typeof response === 'string' ? response : response?.text || '') || '';
+            firstTokenMs = Date.now() - genStart;
+            tokensGenerated = assistantMessage.split(/\s+/).length;
+            process.stdout.write(assistantMessage);
+          }
+          generationMs = Date.now() - genStart;
         }
       } catch (ragErr) {
-        console.log(`${colors.yellow}RAG lookup failed: ${ragErr.message}${colors.reset}`);
-        const prompt = `The user said: "${userMessage}"\nGive a brief, friendly response:\n`;
-        const response = await sdk.generate(config.model, prompt, {
-          temperature: 0.7,
-          maxTokens: 100
+        console.log(`${colors.yellow}RAG failed: ${ragErr.message}${colors.reset}`);
+        const genStart = Date.now();
+        const response = await sdk.generate(config.model, `Answer briefly: ${userMessage}\n\n`, {
+          temperature: 0.7, maxTokens: 100
         });
-        assistantMessage = (typeof response === 'string' ? response : response?.text || response?.content || '') || '';
+        assistantMessage = (typeof response === 'string' ? response : response?.text || '') || '';
+        firstTokenMs = Date.now() - genStart;
+        generationMs = firstTokenMs;
+        tokensGenerated = assistantMessage.split(/\s+/).length;
+        process.stdout.write(`\n${colors.bright}${colors.magenta}AI:${colors.reset} ${assistantMessage}`);
       }
     } else {
-      // Plain mode: direct generation (no RAG)
-      const prompt = `The user said: "${userMessage}"\nGive a brief, helpful response:\n`;
-      const response = await sdk.generate(config.model, prompt, {
-        temperature: 0.7,
-        maxTokens: 150
-      });
-      assistantMessage = (typeof response === 'string' ? response : response?.text || response?.content || '') || '';
+      // ── PLAIN MODE ──
+      const genStart = Date.now();
+      process.stdout.write(`\n${colors.bright}${colors.magenta}AI:${colors.reset} `);
+
+      if (sdk.generateStream) {
+        const result = await sdk.generateStream(config.model, `Answer briefly: ${userMessage}\n\n`, {
+          temperature: 0.7, maxTokens: 150,
+          onToken: (token) => {
+            if (!firstTokenMs) firstTokenMs = Date.now() - genStart;
+            process.stdout.write(token);
+          }
+        });
+        assistantMessage = result.text || '';
+        tokensGenerated = result.tokensGenerated || assistantMessage.split(/\s+/).length;
+      } else {
+        const response = await sdk.generate(config.model, `Answer briefly: ${userMessage}\n\n`, {
+          temperature: 0.7, maxTokens: 150
+        });
+        assistantMessage = (typeof response === 'string' ? response : response?.text || '') || '';
+        firstTokenMs = Date.now() - genStart;
+        tokensGenerated = assistantMessage.split(/\s+/).length;
+        process.stdout.write(assistantMessage);
+      }
+      generationMs = Date.now() - genStart;
     }
 
-    // Clean up model output artifacts
-    assistantMessage = assistantMessage
-      // Strip repeated garbage chars (!!!, ???, etc)
-      .replace(/(.)\1{5,}/g, '')
-      // Strip leading role prefixes the model loves to emit
-      .replace(/^(assistant|system|answer|response|AI)\s*[:]\s*/i, '')
-      // Remove leading partial sentences (fragments before the real answer)
-      .replace(/^[a-z][^.!?]{0,40}\.\s*/i, function(match) {
-        // Only strip if it looks like a fragment (< 50 chars ending in period)
-        return match.length < 50 && !match.includes(' is ') ? '' : match;
-      })
-      // Stop at any hallucinated role prefixes mid-response
-      .split(/\n\s*(User|Human|System|Question):/i)[0]
-      // Strip any remaining leading role prefix after newline
-      .replace(/^\s*(assistant|AI)\s*[:]\s*/im, '')
-      .trim();
+    // Clean up hallucinated Q&A chains
+    assistantMessage = cleanResponse(assistantMessage);
+
+    const totalMs = Date.now() - totalStart;
+    const tokPerSec = generationMs > 0 ? (tokensGenerated / (generationMs / 1000)).toFixed(1) : '0';
+
+    // Print timing summary
+    console.log(sourceInfo);
+    console.log(`${colors.dim}⏱ retrieval: ${retrievalMs}ms | first token: ${firstTokenMs}ms | generation: ${generationMs}ms | total: ${totalMs}ms | ${tokensGenerated} tokens @ ${tokPerSec} tok/s${colors.reset}\n`);
 
     if (!assistantMessage || assistantMessage.length < 3) {
-      assistantMessage = '(No response generated — try rephrasing your question)';
+      console.log(`${colors.yellow}(No response generated — try rephrasing your question)${colors.reset}\n`);
     }
-
-    console.log(`\n${colors.bright}${colors.magenta}AI:${colors.reset} ${assistantMessage}${sourceInfo}\n`);
   } catch (error) {
     console.error(`\n${colors.red}Error:${colors.reset} ${error.message}\n`);
   }
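
Taken together, the script changes replace the old inline cleanup chain with the shared cleanResponse() helper and route generation through sdk.generateStream() whenever it exists. A minimal sketch of what cleanResponse() does to a typical small-model transcript (the sample strings are hypothetical; the regexes are the ones added above):

    // Model output that drifts into an invented Q&A chain
    const raw = 'AI: Paris is the capital of France.\nQ: What about Germany?\nA: Berlin.';
    cleanResponse(raw);
    // -> 'Paris is the capital of France.'
    // The split on /\n\s*Q\s*:/ cuts everything from the first "Q:" line onward,
    // and the leading-role replace strips the "AI: " prefix.

Note also that with the hardcoded 8192 MB / 8-core profile, a 2048-token model resolves to deviceTier 'high', topK 3, maxContextChars 600, and maxGenTokens 200 under the new dynamic config.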
package/dist/index.d.ts CHANGED
@@ -44,7 +44,7 @@ interface ProgressEvent {
   detail?: any;
 }
 interface SlyEvent {
-  type: 'auth' | 'device_registered' | 'device_profiled' | 'model_download_start' | 'model_download_progress' | 'model_loaded' | 'inference_start' | 'inference_complete' | 'error' | 'fallback_success' | 'fallback_error' | 'telemetry_flushed';
+  type: 'auth' | 'device_registered' | 'device_profiled' | 'model_download_start' | 'model_download_progress' | 'model_loaded' | 'inference_start' | 'inference_complete' | 'error' | 'fallback_success' | 'fallback_error' | 'telemetry_flushed' | 'token';
   data?: any;
   timestamp: number;
 }
@@ -126,6 +126,7 @@ interface RAGOptions {
   modelId: string;
   temperature?: number;
   maxTokens?: number;
+  onToken?: (token: string, partial: string) => void;
 }
 interface RAGChunk {
   id: string;
@@ -142,6 +143,23 @@ interface RAGResponse {
   context: string;
   latencyMs: number;
   tierUsed: 1 | 2 | 3;
+  timing: {
+    retrievalMs: number;
+    contextBuildMs: number;
+    firstTokenMs: number;
+    generationMs: number;
+    totalMs: number;
+    tokensGenerated: number;
+    tokensPerSecond: number;
+  };
+  config: {
+    maxContextChars: number;
+    maxGenTokens: number;
+    chunkSize: number;
+    topK: number;
+    contextWindowUsed: number;
+    deviceTier: 'low' | 'mid' | 'high';
+  };
 }
 interface OfflineIndex {
   metadata: {
@@ -224,6 +242,18 @@ declare class SlyOS {
     quant?: QuantizationLevel;
   }): Promise<void>;
   generate(modelId: string, prompt: string, options?: GenerateOptions): Promise<string>;
+  /**
+   * Stream text generation token-by-token.
+   * Calls onToken callback for each generated token.
+   */
+  generateStream(modelId: string, prompt: string, options?: GenerateOptions & {
+    onToken?: (token: string, partial: string) => void;
+  }): Promise<{
+    text: string;
+    firstTokenMs: number;
+    totalMs: number;
+    tokensGenerated: number;
+  }>;
   transcribe(modelId: string, audioInput: any, options?: TranscribeOptions): Promise<string>;
   chatCompletion(modelId: string, request: OpenAIChatCompletionRequest): Promise<OpenAIChatCompletionResponse>;
   bedrockInvoke(modelId: string, request: BedrockInvokeRequest): Promise<BedrockInvokeResponse>;
@@ -235,6 +265,10 @@ declare class SlyOS {
   private mapModelToOpenAI;
   private localEmbeddingModel;
   private offlineIndexes;
+  /**
+   * Compute dynamic RAG parameters based on device profile and model.
+   */
+  private computeRAGConfig;
   /**
    * Tier 2: Cloud-indexed RAG with local inference.
    * Retrieves relevant chunks from server, generates response locally.
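
The new declarations make the streaming contract explicit: generateStream() resolves with the full text plus timing counters, RAGResponse now reports per-stage timing along with the dynamic config (presumably produced by the new private computeRAGConfig), and SlyEvent gains a 'token' event type. A usage sketch against the declared generateStream() signature (the model id is a placeholder, and partial is presumably the accumulated text so far):

    const result = await sdk.generateStream('some-model-id', 'Answer briefly: What is RAG?\n\n', {
      temperature: 0.7,
      maxTokens: 150,
      onToken: (token, partial) => process.stdout.write(token)
    });
    console.log(`\n${result.tokensGenerated} tokens, first token after ${result.firstTokenMs}ms, total ${result.totalMs}ms`);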