@beltoinc/slyos-sdk 1.5.0 → 1.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/create-chatbot.sh CHANGED
@@ -329,23 +329,29 @@ async function sendMessage(userMessage) {
329
329
  const goodChunks = chunks.filter(c => (c.similarity_score || 0) > 0.3);
330
330
 
331
331
  if (goodChunks.length > 0) {
332
- // Keep context SHORT — small models need room to generate
333
332
  const ctxWindow = sdk.getModelContextWindow?.() || 2048;
334
- // Reserve at least 40% of context window for generation
335
- const maxContextChars = ctxWindow <= 2048 ? 800 : ctxWindow <= 4096 ? 1500 : 3000;
333
+
334
+ // AGGRESSIVE context limits — small models choke on long prompts
335
+ // ~4 chars per token on average; cap context at 30% of the window, reserving ~70% for generation
336
+ const maxContextTokens = Math.floor(ctxWindow * 0.3);
337
+ const maxContextChars = ctxWindow <= 2048 ? 400 : ctxWindow <= 4096 ? 1000 : 2000;
336
338
  const maxGenTokens = ctxWindow <= 2048 ? 150 : Math.min(300, Math.floor(ctxWindow / 4));
337
339
 
338
- // Clean and truncate context strip weird chars, fit model window
339
- let context = goodChunks.map(c => c.content).join('\n')
340
+ // Use only the single best chunk for small models
341
+ const bestChunk = goodChunks[0];
342
+ let context = bestChunk.content
340
343
  .replace(/[^\x20-\x7E\n]/g, ' ') // Strip non-ASCII/control chars
341
- .replace(/\s{3,}/g, ' ') // Collapse excessive whitespace
342
- .replace(/<[^>]+>/g, ' ') // Strip any leftover HTML tags
343
- .replace(/https?:\/\/\S+/g, '') // Strip URLs to save tokens
344
+ .replace(/\s{2,}/g, ' ') // Collapse whitespace
345
+ .replace(/<[^>]+>/g, ' ') // Strip HTML tags
346
+ .replace(/https?:\/\/\S+/g, '') // Strip URLs
347
+ .replace(/[{}()\[\]]/g, '') // Strip brackets/braces
344
348
  .trim();
345
349
  if (context.length > maxContextChars) context = context.substring(0, maxContextChars);
346
350
 
347
- // Instruction-style prompt that small models understand
348
- const prompt = `Use the following information to answer the question.\n\nInfo: ${context}\n\nQuestion: ${userMessage}\nAnswer:`;
351
+ console.log(`${colors.dim}Context: ${context.length} chars from "${bestChunk.document_name}"${colors.reset}`);
352
+
353
+ // Minimal prompt — every token counts
354
+ const prompt = `${context}\n\nQ: ${userMessage}\nA:`;
349
355
  const response = await sdk.generate(config.model, prompt, {
350
356
  temperature: 0.6,
351
357
  maxTokens: maxGenTokens
package/dist/index.d.ts CHANGED
@@ -44,7 +44,7 @@ interface ProgressEvent {
44
44
  detail?: any;
45
45
  }
46
46
  interface SlyEvent {
47
- type: 'auth' | 'device_registered' | 'device_profiled' | 'model_download_start' | 'model_download_progress' | 'model_loaded' | 'inference_start' | 'inference_complete' | 'error' | 'fallback_success' | 'fallback_error' | 'telemetry_flushed';
47
+ type: 'auth' | 'device_registered' | 'device_profiled' | 'model_download_start' | 'model_download_progress' | 'model_loaded' | 'inference_start' | 'inference_complete' | 'error' | 'fallback_success' | 'fallback_error' | 'telemetry_flushed' | 'token';
48
48
  data?: any;
49
49
  timestamp: number;
50
50
  }
@@ -126,6 +126,7 @@ interface RAGOptions {
126
126
  modelId: string;
127
127
  temperature?: number;
128
128
  maxTokens?: number;
129
+ onToken?: (token: string, partial: string) => void;
129
130
  }
130
131
  interface RAGChunk {
131
132
  id: string;
@@ -142,6 +143,23 @@ interface RAGResponse {
142
143
  context: string;
143
144
  latencyMs: number;
144
145
  tierUsed: 1 | 2 | 3;
146
+ timing: {
147
+ retrievalMs: number;
148
+ contextBuildMs: number;
149
+ firstTokenMs: number;
150
+ generationMs: number;
151
+ totalMs: number;
152
+ tokensGenerated: number;
153
+ tokensPerSecond: number;
154
+ };
155
+ config: {
156
+ maxContextChars: number;
157
+ maxGenTokens: number;
158
+ chunkSize: number;
159
+ topK: number;
160
+ contextWindowUsed: number;
161
+ deviceTier: 'low' | 'mid' | 'high';
162
+ };
145
163
  }
146
164
  interface OfflineIndex {
147
165
  metadata: {
@@ -224,6 +242,18 @@ declare class SlyOS {
224
242
  quant?: QuantizationLevel;
225
243
  }): Promise<void>;
226
244
  generate(modelId: string, prompt: string, options?: GenerateOptions): Promise<string>;
245
+ /**
246
+ * Stream text generation token-by-token.
247
+ * Calls onToken callback for each generated token.
248
+ */
249
+ generateStream(modelId: string, prompt: string, options?: GenerateOptions & {
250
+ onToken?: (token: string, partial: string) => void;
251
+ }): Promise<{
252
+ text: string;
253
+ firstTokenMs: number;
254
+ totalMs: number;
255
+ tokensGenerated: number;
256
+ }>;
227
257
  transcribe(modelId: string, audioInput: any, options?: TranscribeOptions): Promise<string>;
228
258
  chatCompletion(modelId: string, request: OpenAIChatCompletionRequest): Promise<OpenAIChatCompletionResponse>;
229
259
  bedrockInvoke(modelId: string, request: BedrockInvokeRequest): Promise<BedrockInvokeResponse>;
@@ -235,6 +265,10 @@ declare class SlyOS {
235
265
  private mapModelToOpenAI;
236
266
  private localEmbeddingModel;
237
267
  private offlineIndexes;
268
+ /**
269
+ * Compute dynamic RAG parameters based on device profile and model.
270
+ */
271
+ private computeRAGConfig;
238
272
  /**
239
273
  * Tier 2: Cloud-indexed RAG with local inference.
240
274
  * Retrieves relevant chunks from server, generates response locally.
package/dist/index.js CHANGED
@@ -870,6 +870,61 @@ class SlyOS {
870
870
  throw error;
871
871
  }
872
872
  }
873
+ /**
874
+ * Stream text generation token-by-token.
875
+ * Calls onToken callback for each generated token.
876
+ */
877
+ async generateStream(modelId, prompt, options = {}) {
878
+ if (!this.models.has(modelId)) {
879
+ await this.loadModel(modelId);
880
+ }
881
+ const loaded = this.models.get(modelId);
882
+ if (!loaded)
883
+ throw new Error(`Model "${modelId}" not loaded`);
884
+ const { pipe, info, contextWindow } = loaded;
885
+ if (info.category !== 'llm')
886
+ throw new Error(`Not an LLM`);
887
+ const maxTokens = Math.min(options.maxTokens || 100, contextWindow || 2048);
888
+ const startTime = Date.now();
889
+ let firstTokenTime = 0;
890
+ let accumulated = '';
891
+ this.emitProgress('generating', 0, `Streaming (max ${maxTokens} tokens)...`);
892
+ try {
893
+ const result = await pipe(prompt, {
894
+ max_new_tokens: maxTokens,
895
+ temperature: options.temperature || 0.7,
896
+ top_p: options.topP || 0.9,
897
+ do_sample: true,
898
+ // Transformers.js streamer callback
899
+ callback_function: (output) => {
900
+ if (!firstTokenTime)
901
+ firstTokenTime = Date.now() - startTime;
902
+ if (output && output.length > 0) {
903
+ // output is token IDs, we need to decode
904
+ // The callback in transformers.js v3 gives decoded text tokens
905
+ const tokenText = typeof output === 'string' ? output : '';
906
+ if (tokenText) {
907
+ accumulated += tokenText;
908
+ options.onToken?.(tokenText, accumulated);
909
+ this.emitEvent('token', { token: tokenText, partial: accumulated });
910
+ }
911
+ }
912
+ }
913
+ });
914
+ const rawOutput = result[0].generated_text;
915
+ const response = rawOutput.startsWith(prompt) ? rawOutput.slice(prompt.length).trim() : rawOutput.trim();
916
+ if (!firstTokenTime)
917
+ firstTokenTime = Date.now() - startTime;
918
+ const totalMs = Date.now() - startTime;
919
+ const tokensGenerated = response.split(/\s+/).length;
920
+ this.emitProgress('ready', 100, `Streamed ${tokensGenerated} tokens in ${(totalMs / 1000).toFixed(1)}s`);
921
+ return { text: response, firstTokenMs: firstTokenTime, totalMs, tokensGenerated };
922
+ }
923
+ catch (error) {
924
+ this.emitProgress('error', 0, `Stream failed: ${error.message}`);
925
+ throw error;
926
+ }
927
+ }
873
928
  // ── Inference: Transcribe ───────────────────────────────────────
874
929
  async transcribe(modelId, audioInput, options = {}) {
875
930
  if (!this.models.has(modelId)) {
@@ -1179,6 +1234,45 @@ class SlyOS {
1179
1234
  };
1180
1235
  return modelMapping[slyModelId] || 'gpt-4o-mini';
1181
1236
  }
1237
+ /**
1238
+ * Compute dynamic RAG parameters based on device profile and model.
1239
+ */
1240
+ computeRAGConfig(modelId) {
1241
+ const contextWindow = this.modelContextWindow || 2048;
1242
+ const memoryMB = this.deviceProfile?.memoryMB || 4096;
1243
+ const cpuCores = this.deviceProfile?.cpuCores || 4;
1244
+ const hasGPU = !!(this.deviceProfile?.gpuRenderer || this.deviceProfile?.webgpuAvailable);
1245
+ // Determine device tier
1246
+ let deviceTier = 'low';
1247
+ if (memoryMB >= 8192 && cpuCores >= 8)
1248
+ deviceTier = 'high';
1249
+ else if (memoryMB >= 4096 && cpuCores >= 4)
1250
+ deviceTier = 'mid';
1251
+ // Context chars: scale with context window AND device capability
1252
+ let maxContextChars;
1253
+ if (contextWindow <= 2048) {
1254
+ maxContextChars = deviceTier === 'high' ? 600 : deviceTier === 'mid' ? 400 : 300;
1255
+ }
1256
+ else if (contextWindow <= 4096) {
1257
+ maxContextChars = deviceTier === 'high' ? 1500 : deviceTier === 'mid' ? 1000 : 600;
1258
+ }
1259
+ else {
1260
+ maxContextChars = deviceTier === 'high' ? 3000 : deviceTier === 'mid' ? 2000 : 1000;
1261
+ }
1262
+ // Gen tokens: scale with device tier
1263
+ let maxGenTokens;
1264
+ if (contextWindow <= 2048) {
1265
+ maxGenTokens = deviceTier === 'high' ? 200 : deviceTier === 'mid' ? 150 : 100;
1266
+ }
1267
+ else {
1268
+ maxGenTokens = deviceTier === 'high' ? 400 : deviceTier === 'mid' ? 300 : 150;
1269
+ }
1270
+ // Chunk size: larger chunks for bigger context windows
1271
+ const chunkSize = contextWindow <= 2048 ? 256 : contextWindow <= 4096 ? 512 : 1024;
1272
+ // TopK: more chunks for powerful devices
1273
+ const topK = deviceTier === 'high' ? 5 : deviceTier === 'mid' ? 3 : 1;
1274
+ return { maxContextChars, maxGenTokens, chunkSize, topK, contextWindow, deviceTier };
1275
+ }
1182
1276
  /**
1183
1277
  * Tier 2: Cloud-indexed RAG with local inference.
1184
1278
  * Retrieves relevant chunks from server, generates response locally.
@@ -1188,27 +1282,52 @@ class SlyOS {
1188
1282
  try {
1189
1283
  if (!this.token)
1190
1284
  throw new Error('Not authenticated. Call init() first.');
1285
+ const ragConfig = this.computeRAGConfig(options.modelId);
1191
1286
  // Step 1: Retrieve relevant chunks from backend
1287
+ const retrievalStart = Date.now();
1192
1288
  const searchResponse = await axios.post(`${this.apiUrl}/api/rag/knowledge-bases/${options.knowledgeBaseId}/query`, {
1193
1289
  query: options.query,
1194
- top_k: options.topK || 5,
1290
+ top_k: options.topK || ragConfig.topK,
1195
1291
  model_id: options.modelId
1196
1292
  }, { headers: { Authorization: `Bearer ${this.token}` } });
1293
+ const retrievalMs = Date.now() - retrievalStart;
1197
1294
  let { retrieved_chunks, prompt_template, context } = searchResponse.data;
1198
- // Apply context window limits
1199
- const contextWindow = this.modelContextWindow || 2048;
1200
- const maxContextChars = (contextWindow - 200) * 3; // Rough token-to-char ratio, reserving 200 tokens
1201
- if (context && context.length > maxContextChars) {
1202
- context = context.substring(0, maxContextChars) + '...';
1295
+ // Step 2: Build context with dynamic limits
1296
+ const contextBuildStart = Date.now();
1297
+ if (context && context.length > ragConfig.maxContextChars) {
1298
+ context = context.substring(0, ragConfig.maxContextChars);
1203
1299
  }
1204
- // Step 2: Generate response locally using the augmented prompt
1205
- const response = await this.generate(options.modelId, prompt_template, {
1206
- temperature: options.temperature,
1207
- maxTokens: options.maxTokens,
1208
- });
1300
+ // If no prompt_template from server, build minimal one
1301
+ if (!prompt_template) {
1302
+ prompt_template = `${context}\n\nQ: ${options.query}\nA:`;
1303
+ }
1304
+ const contextBuildMs = Date.now() - contextBuildStart;
1305
+ // Step 3: Generate response — stream if callback provided
1306
+ const genStart = Date.now();
1307
+ let response;
1308
+ let firstTokenMs = 0;
1309
+ if (options.onToken) {
1310
+ const streamResult = await this.generateStream(options.modelId, prompt_template, {
1311
+ temperature: options.temperature,
1312
+ maxTokens: options.maxTokens || ragConfig.maxGenTokens,
1313
+ onToken: options.onToken,
1314
+ });
1315
+ response = streamResult.text;
1316
+ firstTokenMs = streamResult.firstTokenMs;
1317
+ }
1318
+ else {
1319
+ response = await this.generate(options.modelId, prompt_template, {
1320
+ temperature: options.temperature,
1321
+ maxTokens: options.maxTokens || ragConfig.maxGenTokens,
1322
+ });
1323
+ firstTokenMs = Date.now() - genStart; // approximate
1324
+ }
1325
+ const generationMs = Date.now() - genStart;
1326
+ const totalMs = Date.now() - startTime;
1327
+ const tokensGenerated = response.split(/\s+/).length;
1209
1328
  return {
1210
1329
  query: options.query,
1211
- retrievedChunks: retrieved_chunks.map((c) => ({
1330
+ retrievedChunks: (retrieved_chunks || []).map((c) => ({
1212
1331
  id: c.id,
1213
1332
  documentId: c.document_id,
1214
1333
  documentName: c.document_name,
@@ -1218,8 +1337,25 @@ class SlyOS {
1218
1337
  })),
1219
1338
  generatedResponse: response,
1220
1339
  context,
1221
- latencyMs: Date.now() - startTime,
1340
+ latencyMs: totalMs,
1222
1341
  tierUsed: 2,
1342
+ timing: {
1343
+ retrievalMs,
1344
+ contextBuildMs,
1345
+ firstTokenMs,
1346
+ generationMs,
1347
+ totalMs,
1348
+ tokensGenerated,
1349
+ tokensPerSecond: generationMs > 0 ? tokensGenerated / (generationMs / 1000) : 0,
1350
+ },
1351
+ config: {
1352
+ maxContextChars: ragConfig.maxContextChars,
1353
+ maxGenTokens: ragConfig.maxGenTokens,
1354
+ chunkSize: ragConfig.chunkSize,
1355
+ topK: options.topK || ragConfig.topK,
1356
+ contextWindowUsed: ragConfig.contextWindow,
1357
+ deviceTier: ragConfig.deviceTier,
1358
+ },
1223
1359
  };
1224
1360
  }
1225
1361
  catch (error) {
@@ -1234,56 +1370,66 @@ class SlyOS {
1234
1370
  async ragQueryLocal(options) {
1235
1371
  const startTime = Date.now();
1236
1372
  try {
1373
+ const ragConfig = this.computeRAGConfig(options.modelId);
1237
1374
  // Step 1: Load embedding model if needed
1238
1375
  if (!this.localEmbeddingModel) {
1239
1376
  await this.loadEmbeddingModel();
1240
1377
  }
1241
- // Adapt chunk size based on context window for efficiency
1242
- const contextWindow = this.modelContextWindow || 2048;
1243
- const chunkSize = contextWindow <= 1024 ? 256 : contextWindow <= 2048 ? 512 : 1024;
1244
- const overlap = Math.floor(chunkSize / 4);
1245
- // Step 2: Chunk documents if not already chunked
1378
+ // Step 2: Chunk and embed documents (dynamic chunk size)
1379
+ const retrievalStart = Date.now();
1246
1380
  const allChunks = [];
1247
1381
  for (const doc of options.documents) {
1248
- const chunks = this.chunkTextLocal(doc.content, chunkSize, overlap);
1382
+ const chunks = this.chunkTextLocal(doc.content, ragConfig.chunkSize, Math.floor(ragConfig.chunkSize / 4));
1249
1383
  for (const chunk of chunks) {
1250
1384
  const embedding = await this.embedTextLocal(chunk);
1251
1385
  allChunks.push({ content: chunk, documentName: doc.name || 'Document', embedding });
1252
1386
  }
1253
1387
  }
1254
- // Step 3: Embed query
1388
+ // Step 3: Embed query and search
1255
1389
  const queryEmbedding = await this.embedTextLocal(options.query);
1256
- // Step 4: Cosine similarity search
1257
1390
  const scored = allChunks
1258
1391
  .filter(c => c.embedding)
1259
- .map(c => ({
1260
- ...c,
1261
- similarityScore: this.cosineSimilarity(queryEmbedding, c.embedding)
1262
- }))
1392
+ .map(c => ({ ...c, similarityScore: this.cosineSimilarity(queryEmbedding, c.embedding) }))
1263
1393
  .sort((a, b) => b.similarityScore - a.similarityScore)
1264
- .slice(0, options.topK || 5);
1265
- // Step 5: Build context with size limits — keep context SHORT so model has room to generate
1266
- const maxContextChars = contextWindow <= 2048 ? 800 : contextWindow <= 4096 ? 1500 : 3000;
1267
- let contextLength = 0;
1268
- const contextParts = [];
1269
- for (const c of scored) {
1270
- const part = `[Source: ${c.documentName}]\n${c.content}`;
1271
- if (contextLength + part.length <= maxContextChars) {
1272
- contextParts.push(part);
1273
- contextLength += part.length + 10; // Account for separator
1274
- }
1275
- else {
1276
- break;
1277
- }
1394
+ .slice(0, options.topK || ragConfig.topK);
1395
+ const retrievalMs = Date.now() - retrievalStart;
1396
+ // Step 4: Build context
1397
+ const contextBuildStart = Date.now();
1398
+ const bestChunk = scored[0];
1399
+ let context = bestChunk.content
1400
+ .replace(/[^\x20-\x7E\n]/g, ' ')
1401
+ .replace(/\s{2,}/g, ' ')
1402
+ .replace(/<[^>]+>/g, ' ')
1403
+ .replace(/https?:\/\/\S+/g, '')
1404
+ .replace(/[{}()\[\]]/g, '')
1405
+ .trim();
1406
+ if (context.length > ragConfig.maxContextChars)
1407
+ context = context.substring(0, ragConfig.maxContextChars);
1408
+ const prompt = `${context}\n\nQ: ${options.query}\nA:`;
1409
+ const contextBuildMs = Date.now() - contextBuildStart;
1410
+ // Step 5: Generate — stream if callback provided
1411
+ const genStart = Date.now();
1412
+ let response;
1413
+ let firstTokenMs = 0;
1414
+ if (options.onToken) {
1415
+ const streamResult = await this.generateStream(options.modelId, prompt, {
1416
+ temperature: options.temperature || 0.6,
1417
+ maxTokens: options.maxTokens || ragConfig.maxGenTokens,
1418
+ onToken: options.onToken,
1419
+ });
1420
+ response = streamResult.text;
1421
+ firstTokenMs = streamResult.firstTokenMs;
1278
1422
  }
1279
- const context = contextParts.join('\n\n---\n\n');
1280
- const prompt = `Use the following information to answer the question.\n\nInfo: ${context}\n\nQuestion: ${options.query}\nAnswer:`;
1281
- // Step 6: Generate locally
1282
- const maxGen = contextWindow <= 2048 ? 150 : Math.min(300, Math.floor(contextWindow / 4));
1283
- const response = await this.generate(options.modelId, prompt, {
1284
- temperature: options.temperature || 0.6,
1285
- maxTokens: options.maxTokens || maxGen,
1286
- });
1423
+ else {
1424
+ response = await this.generate(options.modelId, prompt, {
1425
+ temperature: options.temperature || 0.6,
1426
+ maxTokens: options.maxTokens || ragConfig.maxGenTokens,
1427
+ });
1428
+ firstTokenMs = Date.now() - genStart;
1429
+ }
1430
+ const generationMs = Date.now() - genStart;
1431
+ const totalMs = Date.now() - startTime;
1432
+ const tokensGenerated = response.split(/\s+/).length;
1287
1433
  return {
1288
1434
  query: options.query,
1289
1435
  retrievedChunks: scored.map((c, i) => ({
@@ -1296,8 +1442,25 @@ class SlyOS {
1296
1442
  })),
1297
1443
  generatedResponse: response,
1298
1444
  context,
1299
- latencyMs: Date.now() - startTime,
1445
+ latencyMs: totalMs,
1300
1446
  tierUsed: 1,
1447
+ timing: {
1448
+ retrievalMs,
1449
+ contextBuildMs,
1450
+ firstTokenMs,
1451
+ generationMs,
1452
+ totalMs,
1453
+ tokensGenerated,
1454
+ tokensPerSecond: generationMs > 0 ? tokensGenerated / (generationMs / 1000) : 0,
1455
+ },
1456
+ config: {
1457
+ maxContextChars: ragConfig.maxContextChars,
1458
+ maxGenTokens: ragConfig.maxGenTokens,
1459
+ chunkSize: ragConfig.chunkSize,
1460
+ topK: options.topK || ragConfig.topK,
1461
+ contextWindowUsed: ragConfig.contextWindow,
1462
+ deviceTier: ragConfig.deviceTier,
1463
+ },
1301
1464
  };
1302
1465
  }
1303
1466
  catch (error) {
@@ -1312,52 +1475,61 @@ class SlyOS {
1312
1475
  async ragQueryOffline(options) {
1313
1476
  const startTime = Date.now();
1314
1477
  const index = this.offlineIndexes.get(options.knowledgeBaseId);
1315
- if (!index) {
1316
- throw new Error(`Knowledge base "${options.knowledgeBaseId}" not synced. Call syncKnowledgeBase() first.`);
1317
- }
1318
- // Check expiry
1319
- if (new Date(index.metadata.expires_at) < new Date()) {
1320
- throw new Error('Offline index has expired. Please re-sync.');
1321
- }
1478
+ if (!index)
1479
+ throw new Error(`KB "${options.knowledgeBaseId}" not synced.`);
1480
+ if (new Date(index.metadata.expires_at) < new Date())
1481
+ throw new Error('Offline index expired.');
1322
1482
  try {
1483
+ const ragConfig = this.computeRAGConfig(options.modelId);
1323
1484
  // Load embedding model
1324
- if (!this.localEmbeddingModel) {
1485
+ if (!this.localEmbeddingModel)
1325
1486
  await this.loadEmbeddingModel();
1326
- }
1327
- // Embed query
1328
- const queryEmbedding = await this.embedTextLocal(options.query);
1329
1487
  // Search offline index
1488
+ const retrievalStart = Date.now();
1489
+ const queryEmbedding = await this.embedTextLocal(options.query);
1330
1490
  const scored = index.chunks
1331
1491
  .filter(c => c.embedding && c.embedding.length > 0)
1332
- .map(c => ({
1333
- ...c,
1334
- similarityScore: this.cosineSimilarity(queryEmbedding, c.embedding)
1335
- }))
1492
+ .map(c => ({ ...c, similarityScore: this.cosineSimilarity(queryEmbedding, c.embedding) }))
1336
1493
  .sort((a, b) => b.similarityScore - a.similarityScore)
1337
- .slice(0, options.topK || 5);
1338
- // Build context with size limits — keep context SHORT so model has room to generate
1339
- const contextWindow = this.modelContextWindow || 2048;
1340
- const maxContextChars = contextWindow <= 2048 ? 800 : contextWindow <= 4096 ? 1500 : 3000;
1341
- let contextLength = 0;
1342
- const contextParts = [];
1343
- for (const c of scored) {
1344
- const part = `[Source: ${c.document_name}]\n${c.content}`;
1345
- if (contextLength + part.length <= maxContextChars) {
1346
- contextParts.push(part);
1347
- contextLength += part.length + 10;
1348
- }
1349
- else {
1350
- break;
1351
- }
1494
+ .slice(0, options.topK || ragConfig.topK);
1495
+ const retrievalMs = Date.now() - retrievalStart;
1496
+ // Build context
1497
+ const contextBuildStart = Date.now();
1498
+ const bestChunk = scored[0];
1499
+ let context = bestChunk.content
1500
+ .replace(/[^\x20-\x7E\n]/g, ' ')
1501
+ .replace(/\s{2,}/g, ' ')
1502
+ .replace(/<[^>]+>/g, ' ')
1503
+ .replace(/https?:\/\/\S+/g, '')
1504
+ .replace(/[{}()\[\]]/g, '')
1505
+ .trim();
1506
+ if (context.length > ragConfig.maxContextChars)
1507
+ context = context.substring(0, ragConfig.maxContextChars);
1508
+ const prompt = `${context}\n\nQ: ${options.query}\nA:`;
1509
+ const contextBuildMs = Date.now() - contextBuildStart;
1510
+ // Generate
1511
+ const genStart = Date.now();
1512
+ let response;
1513
+ let firstTokenMs = 0;
1514
+ if (options.onToken) {
1515
+ const streamResult = await this.generateStream(options.modelId, prompt, {
1516
+ temperature: options.temperature || 0.6,
1517
+ maxTokens: options.maxTokens || ragConfig.maxGenTokens,
1518
+ onToken: options.onToken,
1519
+ });
1520
+ response = streamResult.text;
1521
+ firstTokenMs = streamResult.firstTokenMs;
1352
1522
  }
1353
- const context = contextParts.join('\n\n---\n\n');
1354
- const prompt = `Use the following information to answer the question.\n\nInfo: ${context}\n\nQuestion: ${options.query}\nAnswer:`;
1355
- // Generate locally
1356
- const maxGen = contextWindow <= 2048 ? 150 : Math.min(300, Math.floor(contextWindow / 4));
1357
- const response = await this.generate(options.modelId, prompt, {
1358
- temperature: options.temperature || 0.6,
1359
- maxTokens: options.maxTokens || maxGen,
1360
- });
1523
+ else {
1524
+ response = await this.generate(options.modelId, prompt, {
1525
+ temperature: options.temperature || 0.6,
1526
+ maxTokens: options.maxTokens || ragConfig.maxGenTokens,
1527
+ });
1528
+ firstTokenMs = Date.now() - genStart;
1529
+ }
1530
+ const generationMs = Date.now() - genStart;
1531
+ const totalMs = Date.now() - startTime;
1532
+ const tokensGenerated = response.split(/\s+/).length;
1361
1533
  return {
1362
1534
  query: options.query,
1363
1535
  retrievedChunks: scored.map(c => ({
@@ -1370,8 +1542,25 @@ class SlyOS {
1370
1542
  })),
1371
1543
  generatedResponse: response,
1372
1544
  context,
1373
- latencyMs: Date.now() - startTime,
1545
+ latencyMs: totalMs,
1374
1546
  tierUsed: 3,
1547
+ timing: {
1548
+ retrievalMs,
1549
+ contextBuildMs,
1550
+ firstTokenMs,
1551
+ generationMs,
1552
+ totalMs,
1553
+ tokensGenerated,
1554
+ tokensPerSecond: generationMs > 0 ? tokensGenerated / (generationMs / 1000) : 0,
1555
+ },
1556
+ config: {
1557
+ maxContextChars: ragConfig.maxContextChars,
1558
+ maxGenTokens: ragConfig.maxGenTokens,
1559
+ chunkSize: ragConfig.chunkSize,
1560
+ topK: options.topK || ragConfig.topK,
1561
+ contextWindowUsed: ragConfig.contextWindow,
1562
+ deviceTier: ragConfig.deviceTier,
1563
+ },
1375
1564
  };
1376
1565
  }
1377
1566
  catch (error) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@beltoinc/slyos-sdk",
3
- "version": "1.5.0",
3
+ "version": "1.5.1",
4
4
  "description": "SlyOS - On-Device AI SDK for Web and Node.js",
5
5
  "main": "dist/index.js",
6
6
  "types": "dist/index.d.ts",
package/src/index.ts CHANGED
@@ -69,7 +69,7 @@ interface ProgressEvent {
69
69
  }
70
70
 
71
71
  interface SlyEvent {
72
- type: 'auth' | 'device_registered' | 'device_profiled' | 'model_download_start' | 'model_download_progress' | 'model_loaded' | 'inference_start' | 'inference_complete' | 'error' | 'fallback_success' | 'fallback_error' | 'telemetry_flushed';
72
+ type: 'auth' | 'device_registered' | 'device_profiled' | 'model_download_start' | 'model_download_progress' | 'model_loaded' | 'inference_start' | 'inference_complete' | 'error' | 'fallback_success' | 'fallback_error' | 'telemetry_flushed' | 'token';
73
73
  data?: any;
74
74
  timestamp: number;
75
75
  }
@@ -174,6 +174,8 @@ interface RAGOptions {
174
174
  modelId: string;
175
175
  temperature?: number;
176
176
  maxTokens?: number;
177
+ // NEW: streaming callback
178
+ onToken?: (token: string, partial: string) => void;
177
179
  }
178
180
 
179
181
  interface RAGChunk {
@@ -192,6 +194,25 @@ interface RAGResponse {
192
194
  context: string;
193
195
  latencyMs: number;
194
196
  tierUsed: 1 | 2 | 3;
197
+ // NEW: detailed timing metrics
198
+ timing: {
199
+ retrievalMs: number; // Time spent retrieving/embedding chunks
200
+ contextBuildMs: number; // Time spent building context
201
+ firstTokenMs: number; // Time to first token (from generation start)
202
+ generationMs: number; // Total generation time
203
+ totalMs: number; // End-to-end latency
204
+ tokensGenerated: number; // Number of tokens in response
205
+ tokensPerSecond: number; // Generation throughput
206
+ };
207
+ // NEW: dynamic config used
208
+ config: {
209
+ maxContextChars: number;
210
+ maxGenTokens: number;
211
+ chunkSize: number;
212
+ topK: number;
213
+ contextWindowUsed: number;
214
+ deviceTier: 'low' | 'mid' | 'high';
215
+ };
195
216
  }
196
217
 
197
218
  interface OfflineIndex {
@@ -1145,6 +1166,68 @@ class SlyOS {
1145
1166
  }
1146
1167
  }
1147
1168
 
1169
+ /**
1170
+ * Stream text generation token-by-token.
1171
+ * Calls onToken callback for each generated token.
1172
+ */
1173
+ async generateStream(
1174
+ modelId: string,
1175
+ prompt: string,
1176
+ options: GenerateOptions & { onToken?: (token: string, partial: string) => void } = {}
1177
+ ): Promise<{ text: string; firstTokenMs: number; totalMs: number; tokensGenerated: number }> {
1178
+ if (!this.models.has(modelId)) {
1179
+ await this.loadModel(modelId);
1180
+ }
1181
+ const loaded = this.models.get(modelId);
1182
+ if (!loaded) throw new Error(`Model "${modelId}" not loaded`);
1183
+ const { pipe, info, contextWindow } = loaded;
1184
+ if (info.category !== 'llm') throw new Error(`Not an LLM`);
1185
+
1186
+ const maxTokens = Math.min(options.maxTokens || 100, contextWindow || 2048);
1187
+ const startTime = Date.now();
1188
+ let firstTokenTime = 0;
1189
+ let accumulated = '';
1190
+
1191
+ this.emitProgress('generating', 0, `Streaming (max ${maxTokens} tokens)...`);
1192
+
1193
+ try {
1194
+ const result = await pipe(prompt, {
1195
+ max_new_tokens: maxTokens,
1196
+ temperature: options.temperature || 0.7,
1197
+ top_p: options.topP || 0.9,
1198
+ do_sample: true,
1199
+ // Transformers.js streamer callback
1200
+ callback_function: (output: any) => {
1201
+ if (!firstTokenTime) firstTokenTime = Date.now() - startTime;
1202
+ if (output && output.length > 0) {
1203
+ // output is token IDs, we need to decode
1204
+ // The callback in transformers.js v3 gives decoded text tokens
1205
+ const tokenText = typeof output === 'string' ? output : '';
1206
+ if (tokenText) {
1207
+ accumulated += tokenText;
1208
+ options.onToken?.(tokenText, accumulated);
1209
+ this.emitEvent('token', { token: tokenText, partial: accumulated });
1210
+ }
1211
+ }
1212
+ }
1213
+ });
1214
+
1215
+ const rawOutput = result[0].generated_text;
1216
+ const response = rawOutput.startsWith(prompt) ? rawOutput.slice(prompt.length).trim() : rawOutput.trim();
1217
+
1218
+ if (!firstTokenTime) firstTokenTime = Date.now() - startTime;
1219
+ const totalMs = Date.now() - startTime;
1220
+ const tokensGenerated = response.split(/\s+/).length;
1221
+
1222
+ this.emitProgress('ready', 100, `Streamed ${tokensGenerated} tokens in ${(totalMs/1000).toFixed(1)}s`);
1223
+
1224
+ return { text: response, firstTokenMs: firstTokenTime, totalMs, tokensGenerated };
1225
+ } catch (error: any) {
1226
+ this.emitProgress('error', 0, `Stream failed: ${error.message}`);
1227
+ throw error;
1228
+ }
1229
+ }
1230
+
1148
1231
  // ── Inference: Transcribe ───────────────────────────────────────
1149
1232
 
1150
1233
  async transcribe(modelId: string, audioInput: any, options: TranscribeOptions = {}): Promise<string> {
@@ -1495,6 +1578,54 @@ class SlyOS {
1495
1578
  private localEmbeddingModel: any = null;
1496
1579
  private offlineIndexes: Map<string, OfflineIndex> = new Map();
1497
1580
 
1581
+ /**
1582
+ * Compute dynamic RAG parameters based on device profile and model.
1583
+ */
1584
+ private computeRAGConfig(modelId: string): {
1585
+ maxContextChars: number;
1586
+ maxGenTokens: number;
1587
+ chunkSize: number;
1588
+ topK: number;
1589
+ contextWindow: number;
1590
+ deviceTier: 'low' | 'mid' | 'high';
1591
+ } {
1592
+ const contextWindow = this.modelContextWindow || 2048;
1593
+ const memoryMB = this.deviceProfile?.memoryMB || 4096;
1594
+ const cpuCores = this.deviceProfile?.cpuCores || 4;
1595
+ const hasGPU = !!(this.deviceProfile?.gpuRenderer || this.deviceProfile?.webgpuAvailable);
1596
+
1597
+ // Determine device tier
1598
+ let deviceTier: 'low' | 'mid' | 'high' = 'low';
1599
+ if (memoryMB >= 8192 && cpuCores >= 8) deviceTier = 'high';
1600
+ else if (memoryMB >= 4096 && cpuCores >= 4) deviceTier = 'mid';
1601
+
1602
+ // Context chars: scale with context window AND device capability
1603
+ let maxContextChars: number;
1604
+ if (contextWindow <= 2048) {
1605
+ maxContextChars = deviceTier === 'high' ? 600 : deviceTier === 'mid' ? 400 : 300;
1606
+ } else if (contextWindow <= 4096) {
1607
+ maxContextChars = deviceTier === 'high' ? 1500 : deviceTier === 'mid' ? 1000 : 600;
1608
+ } else {
1609
+ maxContextChars = deviceTier === 'high' ? 3000 : deviceTier === 'mid' ? 2000 : 1000;
1610
+ }
1611
+
1612
+ // Gen tokens: scale with device tier
1613
+ let maxGenTokens: number;
1614
+ if (contextWindow <= 2048) {
1615
+ maxGenTokens = deviceTier === 'high' ? 200 : deviceTier === 'mid' ? 150 : 100;
1616
+ } else {
1617
+ maxGenTokens = deviceTier === 'high' ? 400 : deviceTier === 'mid' ? 300 : 150;
1618
+ }
1619
+
1620
+ // Chunk size: larger chunks for bigger context windows
1621
+ const chunkSize = contextWindow <= 2048 ? 256 : contextWindow <= 4096 ? 512 : 1024;
1622
+
1623
+ // TopK: more chunks for powerful devices
1624
+ const topK = deviceTier === 'high' ? 5 : deviceTier === 'mid' ? 3 : 1;
1625
+
1626
+ return { maxContextChars, maxGenTokens, chunkSize, topK, contextWindow, deviceTier };
1627
+ }
1628
+
1498
1629
  /**
1499
1630
  * Tier 2: Cloud-indexed RAG with local inference.
1500
1631
  * Retrieves relevant chunks from server, generates response locally.
@@ -1505,36 +1636,61 @@ class SlyOS {
1505
1636
  try {
1506
1637
  if (!this.token) throw new Error('Not authenticated. Call init() first.');
1507
1638
 
1639
+ const ragConfig = this.computeRAGConfig(options.modelId);
1640
+
1508
1641
  // Step 1: Retrieve relevant chunks from backend
1642
+ const retrievalStart = Date.now();
1509
1643
  const searchResponse = await axios.post(
1510
1644
  `${this.apiUrl}/api/rag/knowledge-bases/${options.knowledgeBaseId}/query`,
1511
1645
  {
1512
1646
  query: options.query,
1513
- top_k: options.topK || 5,
1647
+ top_k: options.topK || ragConfig.topK,
1514
1648
  model_id: options.modelId
1515
1649
  },
1516
1650
  { headers: { Authorization: `Bearer ${this.token}` } }
1517
1651
  );
1652
+ const retrievalMs = Date.now() - retrievalStart;
1518
1653
 
1519
1654
  let { retrieved_chunks, prompt_template, context } = searchResponse.data;
1520
1655
 
1521
- // Apply context window limits
1522
- const contextWindow = this.modelContextWindow || 2048;
1523
- const maxContextChars = (contextWindow - 200) * 3; // Rough token-to-char ratio, reserving 200 tokens
1524
-
1525
- if (context && context.length > maxContextChars) {
1526
- context = context.substring(0, maxContextChars) + '...';
1656
+ // Step 2: Build context with dynamic limits
1657
+ const contextBuildStart = Date.now();
1658
+ if (context && context.length > ragConfig.maxContextChars) {
1659
+ context = context.substring(0, ragConfig.maxContextChars);
1527
1660
  }
1528
-
1529
- // Step 2: Generate response locally using the augmented prompt
1530
- const response = await this.generate(options.modelId, prompt_template, {
1531
- temperature: options.temperature,
1532
- maxTokens: options.maxTokens,
1533
- });
1661
+ // If no prompt_template from server, build minimal one
1662
+ if (!prompt_template) {
1663
+ prompt_template = `${context}\n\nQ: ${options.query}\nA:`;
1664
+ }
1665
+ const contextBuildMs = Date.now() - contextBuildStart;
1666
+
1667
+ // Step 3: Generate response — stream if callback provided
1668
+ const genStart = Date.now();
1669
+ let response: string;
1670
+ let firstTokenMs = 0;
1671
+
1672
+ if (options.onToken) {
1673
+ const streamResult = await this.generateStream(options.modelId, prompt_template, {
1674
+ temperature: options.temperature,
1675
+ maxTokens: options.maxTokens || ragConfig.maxGenTokens,
1676
+ onToken: options.onToken,
1677
+ });
1678
+ response = streamResult.text;
1679
+ firstTokenMs = streamResult.firstTokenMs;
1680
+ } else {
1681
+ response = await this.generate(options.modelId, prompt_template, {
1682
+ temperature: options.temperature,
1683
+ maxTokens: options.maxTokens || ragConfig.maxGenTokens,
1684
+ });
1685
+ firstTokenMs = Date.now() - genStart; // approximate
1686
+ }
1687
+ const generationMs = Date.now() - genStart;
1688
+ const totalMs = Date.now() - startTime;
1689
+ const tokensGenerated = response.split(/\s+/).length;
1534
1690
 
1535
1691
  return {
1536
1692
  query: options.query,
1537
- retrievedChunks: retrieved_chunks.map((c: any) => ({
1693
+ retrievedChunks: (retrieved_chunks || []).map((c: any) => ({
1538
1694
  id: c.id,
1539
1695
  documentId: c.document_id,
1540
1696
  documentName: c.document_name,
@@ -1544,8 +1700,25 @@ class SlyOS {
1544
1700
  })),
1545
1701
  generatedResponse: response,
1546
1702
  context,
1547
- latencyMs: Date.now() - startTime,
1703
+ latencyMs: totalMs,
1548
1704
  tierUsed: 2,
1705
+ timing: {
1706
+ retrievalMs,
1707
+ contextBuildMs,
1708
+ firstTokenMs,
1709
+ generationMs,
1710
+ totalMs,
1711
+ tokensGenerated,
1712
+ tokensPerSecond: generationMs > 0 ? tokensGenerated / (generationMs / 1000) : 0,
1713
+ },
1714
+ config: {
1715
+ maxContextChars: ragConfig.maxContextChars,
1716
+ maxGenTokens: ragConfig.maxGenTokens,
1717
+ chunkSize: ragConfig.chunkSize,
1718
+ topK: options.topK || ragConfig.topK,
1719
+ contextWindowUsed: ragConfig.contextWindow,
1720
+ deviceTier: ragConfig.deviceTier,
1721
+ },
1549
1722
  };
1550
1723
  } catch (error: any) {
1551
1724
  this.emitEvent('error', { stage: 'rag_query', error: error.message });
@@ -1561,63 +1734,70 @@ class SlyOS {
1561
1734
  const startTime = Date.now();
1562
1735
 
1563
1736
  try {
1737
+ const ragConfig = this.computeRAGConfig(options.modelId);
1738
+
1564
1739
  // Step 1: Load embedding model if needed
1565
1740
  if (!this.localEmbeddingModel) {
1566
1741
  await this.loadEmbeddingModel();
1567
1742
  }
1568
1743
 
1569
- // Adapt chunk size based on context window for efficiency
1570
- const contextWindow = this.modelContextWindow || 2048;
1571
- const chunkSize = contextWindow <= 1024 ? 256 : contextWindow <= 2048 ? 512 : 1024;
1572
- const overlap = Math.floor(chunkSize / 4);
1573
-
1574
- // Step 2: Chunk documents if not already chunked
1744
+ // Step 2: Chunk and embed documents (dynamic chunk size)
1745
+ const retrievalStart = Date.now();
1575
1746
  const allChunks: Array<{ content: string; documentName: string; embedding?: number[] }> = [];
1576
1747
  for (const doc of options.documents) {
1577
- const chunks = this.chunkTextLocal(doc.content, chunkSize, overlap);
1748
+ const chunks = this.chunkTextLocal(doc.content, ragConfig.chunkSize, Math.floor(ragConfig.chunkSize / 4));
1578
1749
  for (const chunk of chunks) {
1579
1750
  const embedding = await this.embedTextLocal(chunk);
1580
1751
  allChunks.push({ content: chunk, documentName: doc.name || 'Document', embedding });
1581
1752
  }
1582
1753
  }
1583
1754
 
1584
- // Step 3: Embed query
1755
+ // Step 3: Embed query and search
1585
1756
  const queryEmbedding = await this.embedTextLocal(options.query);
1586
-
1587
- // Step 4: Cosine similarity search
1588
1757
  const scored = allChunks
1589
1758
  .filter(c => c.embedding)
1590
- .map(c => ({
1591
- ...c,
1592
- similarityScore: this.cosineSimilarity(queryEmbedding, c.embedding!)
1593
- }))
1759
+ .map(c => ({ ...c, similarityScore: this.cosineSimilarity(queryEmbedding, c.embedding!) }))
1594
1760
  .sort((a, b) => b.similarityScore - a.similarityScore)
1595
- .slice(0, options.topK || 5);
1596
-
1597
- // Step 5: Build context with size limits — keep context SHORT so model has room to generate
1598
- const maxContextChars = contextWindow <= 2048 ? 800 : contextWindow <= 4096 ? 1500 : 3000;
1599
- let contextLength = 0;
1600
- const contextParts: string[] = [];
1601
-
1602
- for (const c of scored) {
1603
- const part = `[Source: ${c.documentName}]\n${c.content}`;
1604
- if (contextLength + part.length <= maxContextChars) {
1605
- contextParts.push(part);
1606
- contextLength += part.length + 10; // Account for separator
1607
- } else {
1608
- break;
1609
- }
1761
+ .slice(0, options.topK || ragConfig.topK);
1762
+ const retrievalMs = Date.now() - retrievalStart;
1763
+
1764
+ // Step 4: Build context
1765
+ const contextBuildStart = Date.now();
1766
+ const bestChunk = scored[0];
1767
+ let context = bestChunk.content
1768
+ .replace(/[^\x20-\x7E\n]/g, ' ')
1769
+ .replace(/\s{2,}/g, ' ')
1770
+ .replace(/<[^>]+>/g, ' ')
1771
+ .replace(/https?:\/\/\S+/g, '')
1772
+ .replace(/[{}()\[\]]/g, '')
1773
+ .trim();
1774
+ if (context.length > ragConfig.maxContextChars) context = context.substring(0, ragConfig.maxContextChars);
1775
+ const prompt = `${context}\n\nQ: ${options.query}\nA:`;
1776
+ const contextBuildMs = Date.now() - contextBuildStart;
1777
+
1778
+ // Step 5: Generate — stream if callback provided
1779
+ const genStart = Date.now();
1780
+ let response: string;
1781
+ let firstTokenMs = 0;
1782
+
1783
+ if (options.onToken) {
1784
+ const streamResult = await this.generateStream(options.modelId, prompt, {
1785
+ temperature: options.temperature || 0.6,
1786
+ maxTokens: options.maxTokens || ragConfig.maxGenTokens,
1787
+ onToken: options.onToken,
1788
+ });
1789
+ response = streamResult.text;
1790
+ firstTokenMs = streamResult.firstTokenMs;
1791
+ } else {
1792
+ response = await this.generate(options.modelId, prompt, {
1793
+ temperature: options.temperature || 0.6,
1794
+ maxTokens: options.maxTokens || ragConfig.maxGenTokens,
1795
+ });
1796
+ firstTokenMs = Date.now() - genStart;
1610
1797
  }
1611
-
1612
- const context = contextParts.join('\n\n---\n\n');
1613
- const prompt = `Use the following information to answer the question.\n\nInfo: ${context}\n\nQuestion: ${options.query}\nAnswer:`;
1614
-
1615
- // Step 6: Generate locally
1616
- const maxGen = contextWindow <= 2048 ? 150 : Math.min(300, Math.floor(contextWindow / 4));
1617
- const response = await this.generate(options.modelId, prompt, {
1618
- temperature: options.temperature || 0.6,
1619
- maxTokens: options.maxTokens || maxGen,
1620
- });
1798
+ const generationMs = Date.now() - genStart;
1799
+ const totalMs = Date.now() - startTime;
1800
+ const tokensGenerated = response.split(/\s+/).length;
1621
1801
 
1622
1802
  return {
1623
1803
  query: options.query,
@@ -1631,8 +1811,25 @@ class SlyOS {
1631
1811
  })),
1632
1812
  generatedResponse: response,
1633
1813
  context,
1634
- latencyMs: Date.now() - startTime,
1814
+ latencyMs: totalMs,
1635
1815
  tierUsed: 1,
1816
+ timing: {
1817
+ retrievalMs,
1818
+ contextBuildMs,
1819
+ firstTokenMs,
1820
+ generationMs,
1821
+ totalMs,
1822
+ tokensGenerated,
1823
+ tokensPerSecond: generationMs > 0 ? tokensGenerated / (generationMs / 1000) : 0,
1824
+ },
1825
+ config: {
1826
+ maxContextChars: ragConfig.maxContextChars,
1827
+ maxGenTokens: ragConfig.maxGenTokens,
1828
+ chunkSize: ragConfig.chunkSize,
1829
+ topK: options.topK || ragConfig.topK,
1830
+ contextWindowUsed: ragConfig.contextWindow,
1831
+ deviceTier: ragConfig.deviceTier,
1832
+ },
1636
1833
  };
1637
1834
  } catch (error: any) {
1638
1835
  this.emitEvent('error', { stage: 'rag_local', error: error.message });
@@ -1648,59 +1845,62 @@ class SlyOS {
1648
1845
  const startTime = Date.now();
1649
1846
 
1650
1847
  const index = this.offlineIndexes.get(options.knowledgeBaseId);
1651
- if (!index) {
1652
- throw new Error(`Knowledge base "${options.knowledgeBaseId}" not synced. Call syncKnowledgeBase() first.`);
1653
- }
1654
-
1655
- // Check expiry
1656
- if (new Date(index.metadata.expires_at) < new Date()) {
1657
- throw new Error('Offline index has expired. Please re-sync.');
1658
- }
1848
+ if (!index) throw new Error(`KB "${options.knowledgeBaseId}" not synced.`);
1849
+ if (new Date(index.metadata.expires_at) < new Date()) throw new Error('Offline index expired.');
1659
1850
 
1660
1851
  try {
1661
- // Load embedding model
1662
- if (!this.localEmbeddingModel) {
1663
- await this.loadEmbeddingModel();
1664
- }
1852
+ const ragConfig = this.computeRAGConfig(options.modelId);
1665
1853
 
1666
- // Embed query
1667
- const queryEmbedding = await this.embedTextLocal(options.query);
1854
+ // Load embedding model
1855
+ if (!this.localEmbeddingModel) await this.loadEmbeddingModel();
1668
1856
 
1669
1857
  // Search offline index
1858
+ const retrievalStart = Date.now();
1859
+ const queryEmbedding = await this.embedTextLocal(options.query);
1670
1860
  const scored = index.chunks
1671
1861
  .filter(c => c.embedding && c.embedding.length > 0)
1672
- .map(c => ({
1673
- ...c,
1674
- similarityScore: this.cosineSimilarity(queryEmbedding, c.embedding!)
1675
- }))
1862
+ .map(c => ({ ...c, similarityScore: this.cosineSimilarity(queryEmbedding, c.embedding!) }))
1676
1863
  .sort((a, b) => b.similarityScore - a.similarityScore)
1677
- .slice(0, options.topK || 5);
1678
-
1679
- // Build context with size limits — keep context SHORT so model has room to generate
1680
- const contextWindow = this.modelContextWindow || 2048;
1681
- const maxContextChars = contextWindow <= 2048 ? 800 : contextWindow <= 4096 ? 1500 : 3000;
1682
- let contextLength = 0;
1683
- const contextParts: string[] = [];
1684
-
1685
- for (const c of scored) {
1686
- const part = `[Source: ${c.document_name}]\n${c.content}`;
1687
- if (contextLength + part.length <= maxContextChars) {
1688
- contextParts.push(part);
1689
- contextLength += part.length + 10;
1690
- } else {
1691
- break;
1692
- }
1864
+ .slice(0, options.topK || ragConfig.topK);
1865
+ const retrievalMs = Date.now() - retrievalStart;
1866
+
1867
+ // Build context
1868
+ const contextBuildStart = Date.now();
1869
+ const bestChunk = scored[0];
1870
+ let context = bestChunk.content
1871
+ .replace(/[^\x20-\x7E\n]/g, ' ')
1872
+ .replace(/\s{2,}/g, ' ')
1873
+ .replace(/<[^>]+>/g, ' ')
1874
+ .replace(/https?:\/\/\S+/g, '')
1875
+ .replace(/[{}()\[\]]/g, '')
1876
+ .trim();
1877
+ if (context.length > ragConfig.maxContextChars) context = context.substring(0, ragConfig.maxContextChars);
1878
+ const prompt = `${context}\n\nQ: ${options.query}\nA:`;
1879
+ const contextBuildMs = Date.now() - contextBuildStart;
1880
+
1881
+ // Generate
1882
+ const genStart = Date.now();
1883
+ let response: string;
1884
+ let firstTokenMs = 0;
1885
+
1886
+ if (options.onToken) {
1887
+ const streamResult = await this.generateStream(options.modelId, prompt, {
1888
+ temperature: options.temperature || 0.6,
1889
+ maxTokens: options.maxTokens || ragConfig.maxGenTokens,
1890
+ onToken: options.onToken,
1891
+ });
1892
+ response = streamResult.text;
1893
+ firstTokenMs = streamResult.firstTokenMs;
1894
+ } else {
1895
+ response = await this.generate(options.modelId, prompt, {
1896
+ temperature: options.temperature || 0.6,
1897
+ maxTokens: options.maxTokens || ragConfig.maxGenTokens,
1898
+ });
1899
+ firstTokenMs = Date.now() - genStart;
1693
1900
  }
1694
-
1695
- const context = contextParts.join('\n\n---\n\n');
1696
- const prompt = `Use the following information to answer the question.\n\nInfo: ${context}\n\nQuestion: ${options.query}\nAnswer:`;
1697
-
1698
- // Generate locally
1699
- const maxGen = contextWindow <= 2048 ? 150 : Math.min(300, Math.floor(contextWindow / 4));
1700
- const response = await this.generate(options.modelId, prompt, {
1701
- temperature: options.temperature || 0.6,
1702
- maxTokens: options.maxTokens || maxGen,
1703
- });
1901
+ const generationMs = Date.now() - genStart;
1902
+ const totalMs = Date.now() - startTime;
1903
+ const tokensGenerated = response.split(/\s+/).length;
1704
1904
 
1705
1905
  return {
1706
1906
  query: options.query,
@@ -1714,8 +1914,25 @@ class SlyOS {
1714
1914
  })),
1715
1915
  generatedResponse: response,
1716
1916
  context,
1717
- latencyMs: Date.now() - startTime,
1917
+ latencyMs: totalMs,
1718
1918
  tierUsed: 3,
1919
+ timing: {
1920
+ retrievalMs,
1921
+ contextBuildMs,
1922
+ firstTokenMs,
1923
+ generationMs,
1924
+ totalMs,
1925
+ tokensGenerated,
1926
+ tokensPerSecond: generationMs > 0 ? tokensGenerated / (generationMs / 1000) : 0,
1927
+ },
1928
+ config: {
1929
+ maxContextChars: ragConfig.maxContextChars,
1930
+ maxGenTokens: ragConfig.maxGenTokens,
1931
+ chunkSize: ragConfig.chunkSize,
1932
+ topK: options.topK || ragConfig.topK,
1933
+ contextWindowUsed: ragConfig.contextWindow,
1934
+ deviceTier: ragConfig.deviceTier,
1935
+ },
1719
1936
  };
1720
1937
  } catch (error: any) {
1721
1938
  this.emitEvent('error', { stage: 'rag_offline', error: error.message });