@beltoinc/slyos-sdk 1.5.0 → 1.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/create-chatbot.sh +162 -86
- package/dist/index.d.ts +35 -1
- package/dist/index.js +276 -87
- package/package.json +1 -1
- package/src/index.ts +318 -101
package/create-chatbot.sh
CHANGED
|
@@ -292,122 +292,198 @@ function printWelcome() {
|
|
|
292
292
|
}
|
|
293
293
|
|
|
294
294
|
/**
|
|
295
|
-
*
|
|
295
|
+
* Clean up model output — stop hallucinated Q&A chains, strip artifacts
|
|
296
|
+
*/
|
|
297
|
+
function cleanResponse(text) {
  // Purpose: post-process raw model output so only the first answer is kept.
  //   - drops a role label at the very start ("Answer:", "AI:", "A:", ...)
  //   - cuts the text at the first hallucinated follow-up turn
  //     ("\nQ:", "\nUser:", "\nA:", ...)
  //   - removes runs of 6+ identical characters (degenerate repetition)
  //
  // @param {string} text  Raw model output. Any non-string value is treated
  //                       as an empty response instead of throwing.
  // @returns {string}     The cleaned, trimmed answer (may be empty).
  const raw = typeof text === 'string' ? text : '';

  return raw
    // Strip a leading role label BEFORE cutting. Otherwise a response that
    // begins with a newline followed by "Answer:" / "A:" would be matched by
    // the role-prefix cut below and the whole answer would be discarded.
    .replace(/^\s*(?:assistant|system|answer|response|AI|A)\s*:\s*/i, '')
    // CRITICAL: cut at the first hallucinated Q&A follow-up.
    .split(/\n\s*Q\s*:/)[0]
    // Cut at hallucinated role prefixes. The colon lives outside the group so
    // that a bare "A:" is matched (an "A:" alternative followed by another ":"
    // would only ever match the literal text "A::").
    .split(/\n\s*(?:User|Human|System|Question|A|Answer)\s*:/i)[0]
    // Strip repeated garbage chars (same character 6 or more times in a row).
    .replace(/(.)\1{5,}/g, '')
    // Strip a role label exposed at the start once the garbage is removed.
    .replace(/^(assistant|system|answer|response|AI)\s*[:]\s*/i, '')
    // Strip the first remaining line-leading "assistant:" / "AI:" label.
    .replace(/^\s*(assistant|AI)\s*[:]\s*/im, '')
    .trim();
}
|
|
311
|
+
|
|
312
|
+
/**
|
|
313
|
+
* Send message to AI and get response — with timing metrics and streaming
|
|
296
314
|
*/
|
|
297
315
|
async function sendMessage(userMessage) {
|
|
298
316
|
try {
|
|
299
|
-
|
|
300
|
-
|
|
317
|
+
const totalStart = Date.now();
|
|
301
318
|
let assistantMessage = '';
|
|
302
319
|
let sourceInfo = '';
|
|
320
|
+
let retrievalMs = 0;
|
|
321
|
+
let firstTokenMs = 0;
|
|
322
|
+
let generationMs = 0;
|
|
323
|
+
let tokensGenerated = 0;
|
|
303
324
|
|
|
304
325
|
if (config.kbId) {
|
|
305
|
-
// RAG
|
|
326
|
+
// ── RAG MODE ──
|
|
306
327
|
console.log(`${colors.dim}Searching knowledge base...${colors.reset}`);
|
|
328
|
+
const retrievalStart = Date.now();
|
|
329
|
+
|
|
307
330
|
try {
|
|
308
331
|
const token = await getAuthToken();
|
|
309
|
-
if (!token) throw new Error('Could not authenticate
|
|
310
|
-
|
|
332
|
+
if (!token) throw new Error('Could not authenticate');
|
|
333
|
+
|
|
311
334
|
const modelCtx = sdk.getModelContextWindow?.() || 2048;
|
|
312
|
-
const
|
|
335
|
+
const memoryMB = 8192; // default; could read from sdk.getDeviceProfile()
|
|
336
|
+
const cpuCores = 8;
|
|
337
|
+
|
|
338
|
+
// Dynamic config based on device + model
|
|
339
|
+
const deviceTier = (memoryMB >= 8192 && cpuCores >= 8) ? 'high' : (memoryMB >= 4096) ? 'mid' : 'low';
|
|
340
|
+
const topK = deviceTier === 'high' ? 3 : deviceTier === 'mid' ? 2 : 1;
|
|
341
|
+
const maxContextChars = modelCtx <= 2048
|
|
342
|
+
? (deviceTier === 'high' ? 600 : deviceTier === 'mid' ? 400 : 300)
|
|
343
|
+
: modelCtx <= 4096
|
|
344
|
+
? (deviceTier === 'high' ? 1500 : 1000)
|
|
345
|
+
: 2000;
|
|
346
|
+
const maxGenTokens = modelCtx <= 2048
|
|
347
|
+
? (deviceTier === 'high' ? 200 : 150)
|
|
348
|
+
: Math.min(400, Math.floor(modelCtx / 4));
|
|
349
|
+
|
|
313
350
|
const ragRes = await fetch(`${config.server}/api/rag/knowledge-bases/${config.kbId}/query`, {
|
|
314
351
|
method: 'POST',
|
|
315
|
-
headers: {
|
|
316
|
-
'Content-Type': 'application/json',
|
|
317
|
-
'Authorization': `Bearer ${token}`
|
|
318
|
-
},
|
|
352
|
+
headers: { 'Content-Type': 'application/json', 'Authorization': `Bearer ${token}` },
|
|
319
353
|
body: JSON.stringify({ query: userMessage, top_k: topK, model_id: config.model })
|
|
320
354
|
});
|
|
321
|
-
if (!ragRes.ok) {
|
|
322
|
-
const errText = await ragRes.text();
|
|
323
|
-
throw new Error(`RAG query failed: ${ragRes.status} - ${errText}`);
|
|
324
|
-
}
|
|
355
|
+
if (!ragRes.ok) throw new Error(`RAG ${ragRes.status}`);
|
|
325
356
|
const ragData = await ragRes.json();
|
|
326
|
-
const chunks = ragData.retrieved_chunks || [];
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
const maxContextChars = ctxWindow <= 2048 ? 800 : ctxWindow <= 4096 ? 1500 : 3000;
|
|
336
|
-
const maxGenTokens = ctxWindow <= 2048 ? 150 : Math.min(300, Math.floor(ctxWindow / 4));
|
|
337
|
-
|
|
338
|
-
// Clean and truncate context — strip weird chars, fit model window
|
|
339
|
-
let context = goodChunks.map(c => c.content).join('\n')
|
|
340
|
-
.replace(/[^\x20-\x7E\n]/g, ' ') // Strip non-ASCII/control chars
|
|
341
|
-
.replace(/\s{3,}/g, ' ') // Collapse excessive whitespace
|
|
342
|
-
.replace(/<[^>]+>/g, ' ') // Strip any leftover HTML tags
|
|
343
|
-
.replace(/https?:\/\/\S+/g, '') // Strip URLs to save tokens
|
|
344
|
-
.trim();
|
|
357
|
+
const chunks = (ragData.retrieved_chunks || []).filter(c => (c.similarity_score || 0) > 0.3);
|
|
358
|
+
retrievalMs = Date.now() - retrievalStart;
|
|
359
|
+
|
|
360
|
+
if (chunks.length > 0) {
|
|
361
|
+
const bestChunk = chunks[0];
|
|
362
|
+
let context = bestChunk.content
|
|
363
|
+
.replace(/[^\x20-\x7E\n]/g, ' ').replace(/\s{2,}/g, ' ')
|
|
364
|
+
.replace(/<[^>]+>/g, ' ').replace(/https?:\/\/\S+/g, '')
|
|
365
|
+
.replace(/[{}()\[\]]/g, '').trim();
|
|
345
366
|
if (context.length > maxContextChars) context = context.substring(0, maxContextChars);
|
|
346
367
|
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
if (
|
|
358
|
-
|
|
368
|
+
console.log(`${colors.dim}Context: ${context.length} chars from "${bestChunk.document_name}" [retrieval: ${retrievalMs}ms, tier: ${deviceTier}]${colors.reset}`);
|
|
369
|
+
|
|
370
|
+
// Instruction prompt — avoids Q&A chain hallucination
|
|
371
|
+
const prompt = `Based on this information:\n${context}\n\nAnswer briefly: ${userMessage}\n\n`;
|
|
372
|
+
|
|
373
|
+
// Stream tokens
|
|
374
|
+
const genStart = Date.now();
|
|
375
|
+
let firstToken = false;
|
|
376
|
+
process.stdout.write(`\n${colors.bright}${colors.magenta}AI:${colors.reset} `);
|
|
377
|
+
|
|
378
|
+
if (sdk.generateStream) {
|
|
379
|
+
const result = await sdk.generateStream(config.model, prompt, {
|
|
380
|
+
temperature: 0.6,
|
|
381
|
+
maxTokens: maxGenTokens,
|
|
382
|
+
onToken: (token, partial) => {
|
|
383
|
+
if (!firstToken) {
|
|
384
|
+
firstTokenMs = Date.now() - genStart;
|
|
385
|
+
firstToken = true;
|
|
386
|
+
}
|
|
387
|
+
process.stdout.write(token);
|
|
388
|
+
}
|
|
389
|
+
});
|
|
390
|
+
assistantMessage = result.text || '';
|
|
391
|
+
if (!firstToken) firstTokenMs = result.firstTokenMs || 0;
|
|
392
|
+
tokensGenerated = result.tokensGenerated || assistantMessage.split(/\s+/).length;
|
|
393
|
+
} else {
|
|
394
|
+
// Fallback: no streaming
|
|
395
|
+
const response = await sdk.generate(config.model, prompt, {
|
|
396
|
+
temperature: 0.6,
|
|
397
|
+
maxTokens: maxGenTokens
|
|
398
|
+
});
|
|
399
|
+
assistantMessage = (typeof response === 'string' ? response : response?.text || '') || '';
|
|
400
|
+
firstTokenMs = Date.now() - genStart;
|
|
401
|
+
tokensGenerated = assistantMessage.split(/\s+/).length;
|
|
402
|
+
process.stdout.write(assistantMessage);
|
|
359
403
|
}
|
|
404
|
+
generationMs = Date.now() - genStart;
|
|
405
|
+
|
|
406
|
+
// Source info
|
|
407
|
+
const sources = [...new Set(chunks.map(c => c.document_name || c.source).filter(Boolean))];
|
|
408
|
+
if (sources.length > 0) sourceInfo = `\n${colors.dim}[Sources: ${sources.join(', ')}]${colors.reset}`;
|
|
360
409
|
} else {
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
410
|
+
console.log(`${colors.dim}No relevant context found [retrieval: ${retrievalMs}ms]${colors.reset}`);
|
|
411
|
+
const genStart = Date.now();
|
|
412
|
+
process.stdout.write(`\n${colors.bright}${colors.magenta}AI:${colors.reset} `);
|
|
413
|
+
|
|
414
|
+
if (sdk.generateStream) {
|
|
415
|
+
const result = await sdk.generateStream(config.model, `Answer briefly: ${userMessage}\n\n`, {
|
|
416
|
+
temperature: 0.7, maxTokens: 100,
|
|
417
|
+
onToken: (token) => {
|
|
418
|
+
if (!firstTokenMs) firstTokenMs = Date.now() - genStart;
|
|
419
|
+
process.stdout.write(token);
|
|
420
|
+
}
|
|
421
|
+
});
|
|
422
|
+
assistantMessage = result.text || '';
|
|
423
|
+
tokensGenerated = result.tokensGenerated || assistantMessage.split(/\s+/).length;
|
|
424
|
+
} else {
|
|
425
|
+
const response = await sdk.generate(config.model, `Answer briefly: ${userMessage}\n\n`, {
|
|
426
|
+
temperature: 0.7, maxTokens: 100
|
|
427
|
+
});
|
|
428
|
+
assistantMessage = (typeof response === 'string' ? response : response?.text || '') || '';
|
|
429
|
+
firstTokenMs = Date.now() - genStart;
|
|
430
|
+
tokensGenerated = assistantMessage.split(/\s+/).length;
|
|
431
|
+
process.stdout.write(assistantMessage);
|
|
432
|
+
}
|
|
433
|
+
generationMs = Date.now() - genStart;
|
|
369
434
|
}
|
|
370
435
|
} catch (ragErr) {
|
|
371
|
-
console.log(`${colors.yellow}RAG
|
|
372
|
-
const
|
|
373
|
-
const response = await sdk.generate(config.model,
|
|
374
|
-
temperature: 0.7,
|
|
375
|
-
maxTokens: 100
|
|
436
|
+
console.log(`${colors.yellow}RAG failed: ${ragErr.message}${colors.reset}`);
|
|
437
|
+
const genStart = Date.now();
|
|
438
|
+
const response = await sdk.generate(config.model, `Answer briefly: ${userMessage}\n\n`, {
|
|
439
|
+
temperature: 0.7, maxTokens: 100
|
|
376
440
|
});
|
|
377
|
-
assistantMessage = (typeof response === 'string' ? response : response?.text ||
|
|
441
|
+
assistantMessage = (typeof response === 'string' ? response : response?.text || '') || '';
|
|
442
|
+
firstTokenMs = Date.now() - genStart;
|
|
443
|
+
generationMs = firstTokenMs;
|
|
444
|
+
tokensGenerated = assistantMessage.split(/\s+/).length;
|
|
445
|
+
process.stdout.write(`\n${colors.bright}${colors.magenta}AI:${colors.reset} ${assistantMessage}`);
|
|
378
446
|
}
|
|
379
447
|
} else {
|
|
380
|
-
//
|
|
381
|
-
const
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
448
|
+
// ── PLAIN MODE ──
|
|
449
|
+
const genStart = Date.now();
|
|
450
|
+
process.stdout.write(`\n${colors.bright}${colors.magenta}AI:${colors.reset} `);
|
|
451
|
+
|
|
452
|
+
if (sdk.generateStream) {
|
|
453
|
+
const result = await sdk.generateStream(config.model, `Answer briefly: ${userMessage}\n\n`, {
|
|
454
|
+
temperature: 0.7, maxTokens: 150,
|
|
455
|
+
onToken: (token) => {
|
|
456
|
+
if (!firstTokenMs) firstTokenMs = Date.now() - genStart;
|
|
457
|
+
process.stdout.write(token);
|
|
458
|
+
}
|
|
459
|
+
});
|
|
460
|
+
assistantMessage = result.text || '';
|
|
461
|
+
tokensGenerated = result.tokensGenerated || assistantMessage.split(/\s+/).length;
|
|
462
|
+
} else {
|
|
463
|
+
const response = await sdk.generate(config.model, `Answer briefly: ${userMessage}\n\n`, {
|
|
464
|
+
temperature: 0.7, maxTokens: 150
|
|
465
|
+
});
|
|
466
|
+
assistantMessage = (typeof response === 'string' ? response : response?.text || '') || '';
|
|
467
|
+
firstTokenMs = Date.now() - genStart;
|
|
468
|
+
tokensGenerated = assistantMessage.split(/\s+/).length;
|
|
469
|
+
process.stdout.write(assistantMessage);
|
|
470
|
+
}
|
|
471
|
+
generationMs = Date.now() - genStart;
|
|
387
472
|
}
|
|
388
473
|
|
|
389
|
-
// Clean up
|
|
390
|
-
assistantMessage = assistantMessage
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
return match.length < 50 && !match.includes(' is ') ? '' : match;
|
|
399
|
-
})
|
|
400
|
-
// Stop at any hallucinated role prefixes mid-response
|
|
401
|
-
.split(/\n\s*(User|Human|System|Question):/i)[0]
|
|
402
|
-
// Strip any remaining leading role prefix after newline
|
|
403
|
-
.replace(/^\s*(assistant|AI)\s*[:]\s*/im, '')
|
|
404
|
-
.trim();
|
|
474
|
+
// Clean up hallucinated Q&A chains
|
|
475
|
+
assistantMessage = cleanResponse(assistantMessage);
|
|
476
|
+
|
|
477
|
+
const totalMs = Date.now() - totalStart;
|
|
478
|
+
const tokPerSec = generationMs > 0 ? (tokensGenerated / (generationMs / 1000)).toFixed(1) : '0';
|
|
479
|
+
|
|
480
|
+
// Print timing summary
|
|
481
|
+
console.log(sourceInfo);
|
|
482
|
+
console.log(`${colors.dim}⏱ retrieval: ${retrievalMs}ms | first token: ${firstTokenMs}ms | generation: ${generationMs}ms | total: ${totalMs}ms | ${tokensGenerated} tokens @ ${tokPerSec} tok/s${colors.reset}\n`);
|
|
405
483
|
|
|
406
484
|
if (!assistantMessage || assistantMessage.length < 3) {
|
|
407
|
-
|
|
485
|
+
console.log(`${colors.yellow}(No response generated — try rephrasing your question)${colors.reset}\n`);
|
|
408
486
|
}
|
|
409
|
-
|
|
410
|
-
console.log(`\n${colors.bright}${colors.magenta}AI:${colors.reset} ${assistantMessage}${sourceInfo}\n`);
|
|
411
487
|
} catch (error) {
|
|
412
488
|
console.error(`\n${colors.red}Error:${colors.reset} ${error.message}\n`);
|
|
413
489
|
}
|
package/dist/index.d.ts
CHANGED
|
@@ -44,7 +44,7 @@ interface ProgressEvent {
|
|
|
44
44
|
detail?: any;
|
|
45
45
|
}
|
|
46
46
|
interface SlyEvent {
|
|
47
|
-
type: 'auth' | 'device_registered' | 'device_profiled' | 'model_download_start' | 'model_download_progress' | 'model_loaded' | 'inference_start' | 'inference_complete' | 'error' | 'fallback_success' | 'fallback_error' | 'telemetry_flushed';
|
|
47
|
+
type: 'auth' | 'device_registered' | 'device_profiled' | 'model_download_start' | 'model_download_progress' | 'model_loaded' | 'inference_start' | 'inference_complete' | 'error' | 'fallback_success' | 'fallback_error' | 'telemetry_flushed' | 'token';
|
|
48
48
|
data?: any;
|
|
49
49
|
timestamp: number;
|
|
50
50
|
}
|
|
@@ -126,6 +126,7 @@ interface RAGOptions {
|
|
|
126
126
|
modelId: string;
|
|
127
127
|
temperature?: number;
|
|
128
128
|
maxTokens?: number;
|
|
129
|
+
onToken?: (token: string, partial: string) => void;
|
|
129
130
|
}
|
|
130
131
|
interface RAGChunk {
|
|
131
132
|
id: string;
|
|
@@ -142,6 +143,23 @@ interface RAGResponse {
|
|
|
142
143
|
context: string;
|
|
143
144
|
latencyMs: number;
|
|
144
145
|
tierUsed: 1 | 2 | 3;
|
|
146
|
+
timing: {
|
|
147
|
+
retrievalMs: number;
|
|
148
|
+
contextBuildMs: number;
|
|
149
|
+
firstTokenMs: number;
|
|
150
|
+
generationMs: number;
|
|
151
|
+
totalMs: number;
|
|
152
|
+
tokensGenerated: number;
|
|
153
|
+
tokensPerSecond: number;
|
|
154
|
+
};
|
|
155
|
+
config: {
|
|
156
|
+
maxContextChars: number;
|
|
157
|
+
maxGenTokens: number;
|
|
158
|
+
chunkSize: number;
|
|
159
|
+
topK: number;
|
|
160
|
+
contextWindowUsed: number;
|
|
161
|
+
deviceTier: 'low' | 'mid' | 'high';
|
|
162
|
+
};
|
|
145
163
|
}
|
|
146
164
|
interface OfflineIndex {
|
|
147
165
|
metadata: {
|
|
@@ -224,6 +242,18 @@ declare class SlyOS {
|
|
|
224
242
|
quant?: QuantizationLevel;
|
|
225
243
|
}): Promise<void>;
|
|
226
244
|
generate(modelId: string, prompt: string, options?: GenerateOptions): Promise<string>;
|
|
245
|
+
/**
|
|
246
|
+
* Stream text generation token-by-token.
|
|
247
|
+
* Calls onToken callback for each generated token.
|
|
248
|
+
*/
|
|
249
|
+
generateStream(modelId: string, prompt: string, options?: GenerateOptions & {
|
|
250
|
+
onToken?: (token: string, partial: string) => void;
|
|
251
|
+
}): Promise<{
|
|
252
|
+
text: string;
|
|
253
|
+
firstTokenMs: number;
|
|
254
|
+
totalMs: number;
|
|
255
|
+
tokensGenerated: number;
|
|
256
|
+
}>;
|
|
227
257
|
transcribe(modelId: string, audioInput: any, options?: TranscribeOptions): Promise<string>;
|
|
228
258
|
chatCompletion(modelId: string, request: OpenAIChatCompletionRequest): Promise<OpenAIChatCompletionResponse>;
|
|
229
259
|
bedrockInvoke(modelId: string, request: BedrockInvokeRequest): Promise<BedrockInvokeResponse>;
|
|
@@ -235,6 +265,10 @@ declare class SlyOS {
|
|
|
235
265
|
private mapModelToOpenAI;
|
|
236
266
|
private localEmbeddingModel;
|
|
237
267
|
private offlineIndexes;
|
|
268
|
+
/**
|
|
269
|
+
* Compute dynamic RAG parameters based on device profile and model.
|
|
270
|
+
*/
|
|
271
|
+
private computeRAGConfig;
|
|
238
272
|
/**
|
|
239
273
|
* Tier 2: Cloud-indexed RAG with local inference.
|
|
240
274
|
* Retrieves relevant chunks from server, generates response locally.
|