@beltoinc/slyos-sdk 1.5.0 → 1.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/create-chatbot.sh CHANGED
@@ -329,23 +329,29 @@ async function sendMessage(userMessage) {
329
329
  const goodChunks = chunks.filter(c => (c.similarity_score || 0) > 0.3);
330
330
 
331
331
  if (goodChunks.length > 0) {
332
- // Keep context SHORT — small models need room to generate
333
332
  const ctxWindow = sdk.getModelContextWindow?.() || 2048;
334
- // Reserve at least 40% of context window for generation
335
- const maxContextChars = ctxWindow <= 2048 ? 800 : ctxWindow <= 4096 ? 1500 : 3000;
333
+
334
+ // AGGRESSIVE context limits — small models choke on long prompts
335
+ // ~4 chars per token on average; cap context at 30% of the window, reserving ~70% for generation
336
+ const maxContextTokens = Math.floor(ctxWindow * 0.3);
337
+ const maxContextChars = ctxWindow <= 2048 ? 400 : ctxWindow <= 4096 ? 1000 : 2000;
336
338
  const maxGenTokens = ctxWindow <= 2048 ? 150 : Math.min(300, Math.floor(ctxWindow / 4));
337
339
 
338
- // Clean and truncate context strip weird chars, fit model window
339
- let context = goodChunks.map(c => c.content).join('\n')
340
+ // Use only the single best chunk for small models
341
+ const bestChunk = goodChunks[0];
342
+ let context = bestChunk.content
340
343
  .replace(/[^\x20-\x7E\n]/g, ' ') // Strip non-ASCII/control chars
341
- .replace(/\s{3,}/g, ' ') // Collapse excessive whitespace
342
- .replace(/<[^>]+>/g, ' ') // Strip any leftover HTML tags
343
- .replace(/https?:\/\/\S+/g, '') // Strip URLs to save tokens
344
+ .replace(/\s{2,}/g, ' ') // Collapse whitespace
345
+ .replace(/<[^>]+>/g, ' ') // Strip HTML tags
346
+ .replace(/https?:\/\/\S+/g, '') // Strip URLs
347
+ .replace(/[{}()\[\]]/g, '') // Strip brackets/braces
344
348
  .trim();
345
349
  if (context.length > maxContextChars) context = context.substring(0, maxContextChars);
346
350
 
347
- // Instruction-style prompt that small models understand
348
- const prompt = `Use the following information to answer the question.\n\nInfo: ${context}\n\nQuestion: ${userMessage}\nAnswer:`;
351
+ console.log(`${colors.dim}Context: ${context.length} chars from "${bestChunk.document_name}"${colors.reset}`);
352
+
353
+ // Minimal prompt — every token counts
354
+ const prompt = `${context}\n\nQ: ${userMessage}\nA:`;
349
355
  const response = await sdk.generate(config.model, prompt, {
350
356
  temperature: 0.6,
351
357
  maxTokens: maxGenTokens
package/dist/index.d.ts CHANGED
@@ -44,7 +44,7 @@ interface ProgressEvent {
44
44
  detail?: any;
45
45
  }
46
46
  interface SlyEvent {
47
- type: 'auth' | 'device_registered' | 'device_profiled' | 'model_download_start' | 'model_download_progress' | 'model_loaded' | 'inference_start' | 'inference_complete' | 'error' | 'fallback_success' | 'fallback_error' | 'telemetry_flushed';
47
+ type: 'auth' | 'device_registered' | 'device_profiled' | 'model_download_start' | 'model_download_progress' | 'model_loaded' | 'inference_start' | 'inference_complete' | 'error' | 'fallback_success' | 'fallback_error' | 'telemetry_flushed' | 'token';
48
48
  data?: any;
49
49
  timestamp: number;
50
50
  }
@@ -126,6 +126,7 @@ interface RAGOptions {
126
126
  modelId: string;
127
127
  temperature?: number;
128
128
  maxTokens?: number;
129
+ onToken?: (token: string, partial: string) => void;
129
130
  }
130
131
  interface RAGChunk {
131
132
  id: string;
@@ -142,6 +143,23 @@ interface RAGResponse {
142
143
  context: string;
143
144
  latencyMs: number;
144
145
  tierUsed: 1 | 2 | 3;
146
+ timing: {
147
+ retrievalMs: number;
148
+ contextBuildMs: number;
149
+ firstTokenMs: number;
150
+ generationMs: number;
151
+ totalMs: number;
152
+ tokensGenerated: number;
153
+ tokensPerSecond: number;
154
+ };
155
+ config: {
156
+ maxContextChars: number;
157
+ maxGenTokens: number;
158
+ chunkSize: number;
159
+ topK: number;
160
+ contextWindowUsed: number;
161
+ deviceTier: 'low' | 'mid' | 'high';
162
+ };
145
163
  }
146
164
  interface OfflineIndex {
147
165
  metadata: {
@@ -224,6 +242,18 @@ declare class SlyOS {
224
242
  quant?: QuantizationLevel;
225
243
  }): Promise<void>;
226
244
  generate(modelId: string, prompt: string, options?: GenerateOptions): Promise<string>;
245
+ /**
246
+ * Stream text generation token-by-token.
247
+ * Calls onToken callback for each generated token.
248
+ */
249
+ generateStream(modelId: string, prompt: string, options?: GenerateOptions & {
250
+ onToken?: (token: string, partial: string) => void;
251
+ }): Promise<{
252
+ text: string;
253
+ firstTokenMs: number;
254
+ totalMs: number;
255
+ tokensGenerated: number;
256
+ }>;
227
257
  transcribe(modelId: string, audioInput: any, options?: TranscribeOptions): Promise<string>;
228
258
  chatCompletion(modelId: string, request: OpenAIChatCompletionRequest): Promise<OpenAIChatCompletionResponse>;
229
259
  bedrockInvoke(modelId: string, request: BedrockInvokeRequest): Promise<BedrockInvokeResponse>;
@@ -235,6 +265,10 @@ declare class SlyOS {
235
265
  private mapModelToOpenAI;
236
266
  private localEmbeddingModel;
237
267
  private offlineIndexes;
268
+ /**
269
+ * Compute dynamic RAG parameters based on device profile and model.
270
+ */
271
+ private computeRAGConfig;
238
272
  /**
239
273
  * Tier 2: Cloud-indexed RAG with local inference.
240
274
  * Retrieves relevant chunks from server, generates response locally.
package/dist/index.js CHANGED
@@ -870,6 +870,61 @@ class SlyOS {
870
870
  throw error;
871
871
  }
872
872
  }
873
+ /**
874
+ * Stream text generation token-by-token.
875
+ * Calls onToken callback for each generated token.
876
+ */
877
+ async generateStream(modelId, prompt, options = {}) {
878
+ if (!this.models.has(modelId)) {
879
+ await this.loadModel(modelId);
880
+ }
881
+ const loaded = this.models.get(modelId);
882
+ if (!loaded)
883
+ throw new Error(`Model "${modelId}" not loaded`);
884
+ const { pipe, info, contextWindow } = loaded;
885
+ if (info.category !== 'llm')
886
+ throw new Error(`Not an LLM`);
887
+ const maxTokens = Math.min(options.maxTokens || 100, contextWindow || 2048);
888
+ const startTime = Date.now();
889
+ let firstTokenTime = 0;
890
+ let accumulated = '';
891
+ this.emitProgress('generating', 0, `Streaming (max ${maxTokens} tokens)...`);
892
+ try {
893
+ const result = await pipe(prompt, {
894
+ max_new_tokens: maxTokens,
895
+ temperature: options.temperature || 0.7,
896
+ top_p: options.topP || 0.9,
897
+ do_sample: true,
898
+ // Transformers.js streamer callback
899
+ callback_function: (output) => {
900
+ if (!firstTokenTime)
901
+ firstTokenTime = Date.now() - startTime;
902
+ if (output && output.length > 0) {
903
+ // output is token IDs, we need to decode
904
+ // The callback in transformers.js v3 gives decoded text tokens
905
+ const tokenText = typeof output === 'string' ? output : '';
906
+ if (tokenText) {
907
+ accumulated += tokenText;
908
+ options.onToken?.(tokenText, accumulated);
909
+ this.emitEvent('token', { token: tokenText, partial: accumulated });
910
+ }
911
+ }
912
+ }
913
+ });
914
+ const rawOutput = result[0].generated_text;
915
+ const response = rawOutput.startsWith(prompt) ? rawOutput.slice(prompt.length).trim() : rawOutput.trim();
916
+ if (!firstTokenTime)
917
+ firstTokenTime = Date.now() - startTime;
918
+ const totalMs = Date.now() - startTime;
919
+ const tokensGenerated = response.split(/\s+/).length;
920
+ this.emitProgress('ready', 100, `Streamed ${tokensGenerated} tokens in ${(totalMs / 1000).toFixed(1)}s`);
921
+ return { text: response, firstTokenMs: firstTokenTime, totalMs, tokensGenerated };
922
+ }
923
+ catch (error) {
924
+ this.emitProgress('error', 0, `Stream failed: ${error.message}`);
925
+ throw error;
926
+ }
927
+ }
873
928
  // ── Inference: Transcribe ───────────────────────────────────────
874
929
  async transcribe(modelId, audioInput, options = {}) {
875
930
  if (!this.models.has(modelId)) {
@@ -1179,6 +1234,45 @@ class SlyOS {
1179
1234
  };
1180
1235
  return modelMapping[slyModelId] || 'gpt-4o-mini';
1181
1236
  }
1237
+ /**
1238
+ * Compute dynamic RAG parameters based on device profile and model.
1239
+ */
1240
+ computeRAGConfig(modelId) {
1241
+ const contextWindow = this.modelContextWindow || 2048;
1242
+ const memoryMB = this.deviceProfile?.memoryMB || 4096;
1243
+ const cpuCores = this.deviceProfile?.cpuCores || 4;
1244
+ const hasGPU = !!(this.deviceProfile?.gpuRenderer || this.deviceProfile?.webgpuAvailable);
1245
+ // Determine device tier
1246
+ let deviceTier = 'low';
1247
+ if (memoryMB >= 8192 && cpuCores >= 8)
1248
+ deviceTier = 'high';
1249
+ else if (memoryMB >= 4096 && cpuCores >= 4)
1250
+ deviceTier = 'mid';
1251
+ // Context chars: scale with context window AND device capability
1252
+ let maxContextChars;
1253
+ if (contextWindow <= 2048) {
1254
+ maxContextChars = deviceTier === 'high' ? 600 : deviceTier === 'mid' ? 400 : 300;
1255
+ }
1256
+ else if (contextWindow <= 4096) {
1257
+ maxContextChars = deviceTier === 'high' ? 1500 : deviceTier === 'mid' ? 1000 : 600;
1258
+ }
1259
+ else {
1260
+ maxContextChars = deviceTier === 'high' ? 3000 : deviceTier === 'mid' ? 2000 : 1000;
1261
+ }
1262
+ // Gen tokens: scale with device tier
1263
+ let maxGenTokens;
1264
+ if (contextWindow <= 2048) {
1265
+ maxGenTokens = deviceTier === 'high' ? 200 : deviceTier === 'mid' ? 150 : 100;
1266
+ }
1267
+ else {
1268
+ maxGenTokens = deviceTier === 'high' ? 400 : deviceTier === 'mid' ? 300 : 150;
1269
+ }
1270
+ // Chunk size: larger chunks for bigger context windows
1271
+ const chunkSize = contextWindow <= 2048 ? 256 : contextWindow <= 4096 ? 512 : 1024;
1272
+ // TopK: more chunks for powerful devices
1273
+ const topK = deviceTier === 'high' ? 5 : deviceTier === 'mid' ? 3 : 1;
1274
+ return { maxContextChars, maxGenTokens, chunkSize, topK, contextWindow, deviceTier };
1275
+ }
1182
1276
  /**
1183
1277
  * Tier 2: Cloud-indexed RAG with local inference.
1184
1278
  * Retrieves relevant chunks from server, generates response locally.
@@ -1188,27 +1282,52 @@ class SlyOS {
1188
1282
  try {
1189
1283
  if (!this.token)
1190
1284
  throw new Error('Not authenticated. Call init() first.');
1285
+ const ragConfig = this.computeRAGConfig(options.modelId);
1191
1286
  // Step 1: Retrieve relevant chunks from backend
1287
+ const retrievalStart = Date.now();
1192
1288
  const searchResponse = await axios.post(`${this.apiUrl}/api/rag/knowledge-bases/${options.knowledgeBaseId}/query`, {
1193
1289
  query: options.query,
1194
- top_k: options.topK || 5,
1290
+ top_k: options.topK || ragConfig.topK,
1195
1291
  model_id: options.modelId
1196
1292
  }, { headers: { Authorization: `Bearer ${this.token}` } });
1293
+ const retrievalMs = Date.now() - retrievalStart;
1197
1294
  let { retrieved_chunks, prompt_template, context } = searchResponse.data;
1198
- // Apply context window limits
1199
- const contextWindow = this.modelContextWindow || 2048;
1200
- const maxContextChars = (contextWindow - 200) * 3; // Rough token-to-char ratio, reserving 200 tokens
1201
- if (context && context.length > maxContextChars) {
1202
- context = context.substring(0, maxContextChars) + '...';
1295
+ // Step 2: Build context with dynamic limits
1296
+ const contextBuildStart = Date.now();
1297
+ if (context && context.length > ragConfig.maxContextChars) {
1298
+ context = context.substring(0, ragConfig.maxContextChars);
1203
1299
  }
1204
- // Step 2: Generate response locally using the augmented prompt
1205
- const response = await this.generate(options.modelId, prompt_template, {
1206
- temperature: options.temperature,
1207
- maxTokens: options.maxTokens,
1208
- });
1300
+ // If no prompt_template from server, build minimal one
1301
+ if (!prompt_template) {
1302
+ prompt_template = `${context}\n\nQ: ${options.query}\nA:`;
1303
+ }
1304
+ const contextBuildMs = Date.now() - contextBuildStart;
1305
+ // Step 3: Generate response — stream if callback provided
1306
+ const genStart = Date.now();
1307
+ let response;
1308
+ let firstTokenMs = 0;
1309
+ if (options.onToken) {
1310
+ const streamResult = await this.generateStream(options.modelId, prompt_template, {
1311
+ temperature: options.temperature,
1312
+ maxTokens: options.maxTokens || ragConfig.maxGenTokens,
1313
+ onToken: options.onToken,
1314
+ });
1315
+ response = streamResult.text;
1316
+ firstTokenMs = streamResult.firstTokenMs;
1317
+ }
1318
+ else {
1319
+ response = await this.generate(options.modelId, prompt_template, {
1320
+ temperature: options.temperature,
1321
+ maxTokens: options.maxTokens || ragConfig.maxGenTokens,
1322
+ });
1323
+ firstTokenMs = Date.now() - genStart; // approximate
1324
+ }
1325
+ const generationMs = Date.now() - genStart;
1326
+ const totalMs = Date.now() - startTime;
1327
+ const tokensGenerated = response.split(/\s+/).length;
1209
1328
  return {
1210
1329
  query: options.query,
1211
- retrievedChunks: retrieved_chunks.map((c) => ({
1330
+ retrievedChunks: (retrieved_chunks || []).map((c) => ({
1212
1331
  id: c.id,
1213
1332
  documentId: c.document_id,
1214
1333
  documentName: c.document_name,
@@ -1218,8 +1337,25 @@ class SlyOS {
1218
1337
  })),
1219
1338
  generatedResponse: response,
1220
1339
  context,
1221
- latencyMs: Date.now() - startTime,
1340
+ latencyMs: totalMs,
1222
1341
  tierUsed: 2,
1342
+ timing: {
1343
+ retrievalMs,
1344
+ contextBuildMs,
1345
+ firstTokenMs,
1346
+ generationMs,
1347
+ totalMs,
1348
+ tokensGenerated,
1349
+ tokensPerSecond: generationMs > 0 ? tokensGenerated / (generationMs / 1000) : 0,
1350
+ },
1351
+ config: {
1352
+ maxContextChars: ragConfig.maxContextChars,
1353
+ maxGenTokens: ragConfig.maxGenTokens,
1354
+ chunkSize: ragConfig.chunkSize,
1355
+ topK: options.topK || ragConfig.topK,
1356
+ contextWindowUsed: ragConfig.contextWindow,
1357
+ deviceTier: ragConfig.deviceTier,
1358
+ },
1223
1359
  };
1224
1360
  }
1225
1361
  catch (error) {
@@ -1234,56 +1370,66 @@ class SlyOS {
1234
1370
  async ragQueryLocal(options) {
1235
1371
  const startTime = Date.now();
1236
1372
  try {
1373
+ const ragConfig = this.computeRAGConfig(options.modelId);
1237
1374
  // Step 1: Load embedding model if needed
1238
1375
  if (!this.localEmbeddingModel) {
1239
1376
  await this.loadEmbeddingModel();
1240
1377
  }
1241
- // Adapt chunk size based on context window for efficiency
1242
- const contextWindow = this.modelContextWindow || 2048;
1243
- const chunkSize = contextWindow <= 1024 ? 256 : contextWindow <= 2048 ? 512 : 1024;
1244
- const overlap = Math.floor(chunkSize / 4);
1245
- // Step 2: Chunk documents if not already chunked
1378
+ // Step 2: Chunk and embed documents (dynamic chunk size)
1379
+ const retrievalStart = Date.now();
1246
1380
  const allChunks = [];
1247
1381
  for (const doc of options.documents) {
1248
- const chunks = this.chunkTextLocal(doc.content, chunkSize, overlap);
1382
+ const chunks = this.chunkTextLocal(doc.content, ragConfig.chunkSize, Math.floor(ragConfig.chunkSize / 4));
1249
1383
  for (const chunk of chunks) {
1250
1384
  const embedding = await this.embedTextLocal(chunk);
1251
1385
  allChunks.push({ content: chunk, documentName: doc.name || 'Document', embedding });
1252
1386
  }
1253
1387
  }
1254
- // Step 3: Embed query
1388
+ // Step 3: Embed query and search
1255
1389
  const queryEmbedding = await this.embedTextLocal(options.query);
1256
- // Step 4: Cosine similarity search
1257
1390
  const scored = allChunks
1258
1391
  .filter(c => c.embedding)
1259
- .map(c => ({
1260
- ...c,
1261
- similarityScore: this.cosineSimilarity(queryEmbedding, c.embedding)
1262
- }))
1392
+ .map(c => ({ ...c, similarityScore: this.cosineSimilarity(queryEmbedding, c.embedding) }))
1263
1393
  .sort((a, b) => b.similarityScore - a.similarityScore)
1264
- .slice(0, options.topK || 5);
1265
- // Step 5: Build context with size limits — keep context SHORT so model has room to generate
1266
- const maxContextChars = contextWindow <= 2048 ? 800 : contextWindow <= 4096 ? 1500 : 3000;
1267
- let contextLength = 0;
1268
- const contextParts = [];
1269
- for (const c of scored) {
1270
- const part = `[Source: ${c.documentName}]\n${c.content}`;
1271
- if (contextLength + part.length <= maxContextChars) {
1272
- contextParts.push(part);
1273
- contextLength += part.length + 10; // Account for separator
1274
- }
1275
- else {
1276
- break;
1277
- }
1394
+ .slice(0, options.topK || ragConfig.topK);
1395
+ const retrievalMs = Date.now() - retrievalStart;
1396
+ // Step 4: Build context
1397
+ const contextBuildStart = Date.now();
1398
+ const bestChunk = scored[0];
1399
+ let context = bestChunk.content
1400
+ .replace(/[^\x20-\x7E\n]/g, ' ')
1401
+ .replace(/\s{2,}/g, ' ')
1402
+ .replace(/<[^>]+>/g, ' ')
1403
+ .replace(/https?:\/\/\S+/g, '')
1404
+ .replace(/[{}()\[\]]/g, '')
1405
+ .trim();
1406
+ if (context.length > ragConfig.maxContextChars)
1407
+ context = context.substring(0, ragConfig.maxContextChars);
1408
+ const prompt = `${context}\n\nQ: ${options.query}\nA:`;
1409
+ const contextBuildMs = Date.now() - contextBuildStart;
1410
+ // Step 5: Generate — stream if callback provided
1411
+ const genStart = Date.now();
1412
+ let response;
1413
+ let firstTokenMs = 0;
1414
+ if (options.onToken) {
1415
+ const streamResult = await this.generateStream(options.modelId, prompt, {
1416
+ temperature: options.temperature || 0.6,
1417
+ maxTokens: options.maxTokens || ragConfig.maxGenTokens,
1418
+ onToken: options.onToken,
1419
+ });
1420
+ response = streamResult.text;
1421
+ firstTokenMs = streamResult.firstTokenMs;
1278
1422
  }
1279
- const context = contextParts.join('\n\n---\n\n');
1280
- const prompt = `Use the following information to answer the question.\n\nInfo: ${context}\n\nQuestion: ${options.query}\nAnswer:`;
1281
- // Step 6: Generate locally
1282
- const maxGen = contextWindow <= 2048 ? 150 : Math.min(300, Math.floor(contextWindow / 4));
1283
- const response = await this.generate(options.modelId, prompt, {
1284
- temperature: options.temperature || 0.6,
1285
- maxTokens: options.maxTokens || maxGen,
1286
- });
1423
+ else {
1424
+ response = await this.generate(options.modelId, prompt, {
1425
+ temperature: options.temperature || 0.6,
1426
+ maxTokens: options.maxTokens || ragConfig.maxGenTokens,
1427
+ });
1428
+ firstTokenMs = Date.now() - genStart;
1429
+ }
1430
+ const generationMs = Date.now() - genStart;
1431
+ const totalMs = Date.now() - startTime;
1432
+ const tokensGenerated = response.split(/\s+/).length;
1287
1433
  return {
1288
1434
  query: options.query,
1289
1435
  retrievedChunks: scored.map((c, i) => ({
@@ -1296,8 +1442,25 @@ class SlyOS {
1296
1442
  })),
1297
1443
  generatedResponse: response,
1298
1444
  context,
1299
- latencyMs: Date.now() - startTime,
1445
+ latencyMs: totalMs,
1300
1446
  tierUsed: 1,
1447
+ timing: {
1448
+ retrievalMs,
1449
+ contextBuildMs,
1450
+ firstTokenMs,
1451
+ generationMs,
1452
+ totalMs,
1453
+ tokensGenerated,
1454
+ tokensPerSecond: generationMs > 0 ? tokensGenerated / (generationMs / 1000) : 0,
1455
+ },
1456
+ config: {
1457
+ maxContextChars: ragConfig.maxContextChars,
1458
+ maxGenTokens: ragConfig.maxGenTokens,
1459
+ chunkSize: ragConfig.chunkSize,
1460
+ topK: options.topK || ragConfig.topK,
1461
+ contextWindowUsed: ragConfig.contextWindow,
1462
+ deviceTier: ragConfig.deviceTier,
1463
+ },
1301
1464
  };
1302
1465
  }
1303
1466
  catch (error) {
@@ -1312,52 +1475,61 @@ class SlyOS {
1312
1475
  async ragQueryOffline(options) {
1313
1476
  const startTime = Date.now();
1314
1477
  const index = this.offlineIndexes.get(options.knowledgeBaseId);
1315
- if (!index) {
1316
- throw new Error(`Knowledge base "${options.knowledgeBaseId}" not synced. Call syncKnowledgeBase() first.`);
1317
- }
1318
- // Check expiry
1319
- if (new Date(index.metadata.expires_at) < new Date()) {
1320
- throw new Error('Offline index has expired. Please re-sync.');
1321
- }
1478
+ if (!index)
1479
+ throw new Error(`KB "${options.knowledgeBaseId}" not synced.`);
1480
+ if (new Date(index.metadata.expires_at) < new Date())
1481
+ throw new Error('Offline index expired.');
1322
1482
  try {
1483
+ const ragConfig = this.computeRAGConfig(options.modelId);
1323
1484
  // Load embedding model
1324
- if (!this.localEmbeddingModel) {
1485
+ if (!this.localEmbeddingModel)
1325
1486
  await this.loadEmbeddingModel();
1326
- }
1327
- // Embed query
1328
- const queryEmbedding = await this.embedTextLocal(options.query);
1329
1487
  // Search offline index
1488
+ const retrievalStart = Date.now();
1489
+ const queryEmbedding = await this.embedTextLocal(options.query);
1330
1490
  const scored = index.chunks
1331
1491
  .filter(c => c.embedding && c.embedding.length > 0)
1332
- .map(c => ({
1333
- ...c,
1334
- similarityScore: this.cosineSimilarity(queryEmbedding, c.embedding)
1335
- }))
1492
+ .map(c => ({ ...c, similarityScore: this.cosineSimilarity(queryEmbedding, c.embedding) }))
1336
1493
  .sort((a, b) => b.similarityScore - a.similarityScore)
1337
- .slice(0, options.topK || 5);
1338
- // Build context with size limits — keep context SHORT so model has room to generate
1339
- const contextWindow = this.modelContextWindow || 2048;
1340
- const maxContextChars = contextWindow <= 2048 ? 800 : contextWindow <= 4096 ? 1500 : 3000;
1341
- let contextLength = 0;
1342
- const contextParts = [];
1343
- for (const c of scored) {
1344
- const part = `[Source: ${c.document_name}]\n${c.content}`;
1345
- if (contextLength + part.length <= maxContextChars) {
1346
- contextParts.push(part);
1347
- contextLength += part.length + 10;
1348
- }
1349
- else {
1350
- break;
1351
- }
1494
+ .slice(0, options.topK || ragConfig.topK);
1495
+ const retrievalMs = Date.now() - retrievalStart;
1496
+ // Build context
1497
+ const contextBuildStart = Date.now();
1498
+ const bestChunk = scored[0];
1499
+ let context = bestChunk.content
1500
+ .replace(/[^\x20-\x7E\n]/g, ' ')
1501
+ .replace(/\s{2,}/g, ' ')
1502
+ .replace(/<[^>]+>/g, ' ')
1503
+ .replace(/https?:\/\/\S+/g, '')
1504
+ .replace(/[{}()\[\]]/g, '')
1505
+ .trim();
1506
+ if (context.length > ragConfig.maxContextChars)
1507
+ context = context.substring(0, ragConfig.maxContextChars);
1508
+ const prompt = `${context}\n\nQ: ${options.query}\nA:`;
1509
+ const contextBuildMs = Date.now() - contextBuildStart;
1510
+ // Generate
1511
+ const genStart = Date.now();
1512
+ let response;
1513
+ let firstTokenMs = 0;
1514
+ if (options.onToken) {
1515
+ const streamResult = await this.generateStream(options.modelId, prompt, {
1516
+ temperature: options.temperature || 0.6,
1517
+ maxTokens: options.maxTokens || ragConfig.maxGenTokens,
1518
+ onToken: options.onToken,
1519
+ });
1520
+ response = streamResult.text;
1521
+ firstTokenMs = streamResult.firstTokenMs;
1352
1522
  }
1353
- const context = contextParts.join('\n\n---\n\n');
1354
- const prompt = `Use the following information to answer the question.\n\nInfo: ${context}\n\nQuestion: ${options.query}\nAnswer:`;
1355
- // Generate locally
1356
- const maxGen = contextWindow <= 2048 ? 150 : Math.min(300, Math.floor(contextWindow / 4));
1357
- const response = await this.generate(options.modelId, prompt, {
1358
- temperature: options.temperature || 0.6,
1359
- maxTokens: options.maxTokens || maxGen,
1360
- });
1523
+ else {
1524
+ response = await this.generate(options.modelId, prompt, {
1525
+ temperature: options.temperature || 0.6,
1526
+ maxTokens: options.maxTokens || ragConfig.maxGenTokens,
1527
+ });
1528
+ firstTokenMs = Date.now() - genStart;
1529
+ }
1530
+ const generationMs = Date.now() - genStart;
1531
+ const totalMs = Date.now() - startTime;
1532
+ const tokensGenerated = response.split(/\s+/).length;
1361
1533
  return {
1362
1534
  query: options.query,
1363
1535
  retrievedChunks: scored.map(c => ({
@@ -1370,8 +1542,25 @@ class SlyOS {
1370
1542
  })),
1371
1543
  generatedResponse: response,
1372
1544
  context,
1373
- latencyMs: Date.now() - startTime,
1545
+ latencyMs: totalMs,
1374
1546
  tierUsed: 3,
1547
+ timing: {
1548
+ retrievalMs,
1549
+ contextBuildMs,
1550
+ firstTokenMs,
1551
+ generationMs,
1552
+ totalMs,
1553
+ tokensGenerated,
1554
+ tokensPerSecond: generationMs > 0 ? tokensGenerated / (generationMs / 1000) : 0,
1555
+ },
1556
+ config: {
1557
+ maxContextChars: ragConfig.maxContextChars,
1558
+ maxGenTokens: ragConfig.maxGenTokens,
1559
+ chunkSize: ragConfig.chunkSize,
1560
+ topK: options.topK || ragConfig.topK,
1561
+ contextWindowUsed: ragConfig.contextWindow,
1562
+ deviceTier: ragConfig.deviceTier,
1563
+ },
1375
1564
  };
1376
1565
  }
1377
1566
  catch (error) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@beltoinc/slyos-sdk",
3
- "version": "1.5.0",
3
+ "version": "1.5.1",
4
4
  "description": "SlyOS - On-Device AI SDK for Web and Node.js",
5
5
  "main": "dist/index.js",
6
6
  "types": "dist/index.d.ts",
package/src/index.ts CHANGED
@@ -69,7 +69,7 @@ interface ProgressEvent {
69
69
  }
70
70
 
71
71
  interface SlyEvent {
72
- type: 'auth' | 'device_registered' | 'device_profiled' | 'model_download_start' | 'model_download_progress' | 'model_loaded' | 'inference_start' | 'inference_complete' | 'error' | 'fallback_success' | 'fallback_error' | 'telemetry_flushed';
72
+ type: 'auth' | 'device_registered' | 'device_profiled' | 'model_download_start' | 'model_download_progress' | 'model_loaded' | 'inference_start' | 'inference_complete' | 'error' | 'fallback_success' | 'fallback_error' | 'telemetry_flushed' | 'token';
73
73
  data?: any;
74
74
  timestamp: number;
75
75
  }
@@ -174,6 +174,8 @@ interface RAGOptions {
174
174
  modelId: string;
175
175
  temperature?: number;
176
176
  maxTokens?: number;
177
+ // NEW: streaming callback
178
+ onToken?: (token: string, partial: string) => void;
177
179
  }
178
180
 
179
181
  interface RAGChunk {
@@ -192,6 +194,25 @@ interface RAGResponse {
192
194
  context: string;
193
195
  latencyMs: number;
194
196
  tierUsed: 1 | 2 | 3;
197
+ // NEW: detailed timing metrics
198
+ timing: {
199
+ retrievalMs: number; // Time spent retrieving/embedding chunks
200
+ contextBuildMs: number; // Time spent building context
201
+ firstTokenMs: number; // Time to first token (from generation start)
202
+ generationMs: number; // Total generation time
203
+ totalMs: number; // End-to-end latency
204
+ tokensGenerated: number; // Number of tokens in response
205
+ tokensPerSecond: number; // Generation throughput
206
+ };
207
+ // NEW: dynamic config used
208
+ config: {
209
+ maxContextChars: number;
210
+ maxGenTokens: number;
211
+ chunkSize: number;
212
+ topK: number;
213
+ contextWindowUsed: number;
214
+ deviceTier: 'low' | 'mid' | 'high';
215
+ };
195
216
  }
196
217
 
197
218
  interface OfflineIndex {
@@ -1145,6 +1166,68 @@ class SlyOS {
1145
1166
  }
1146
1167
  }
1147
1168
 
1169
+ /**
1170
+ * Stream text generation token-by-token.
1171
+ * Calls onToken callback for each generated token.
1172
+ */
1173
+ async generateStream(
1174
+ modelId: string,
1175
+ prompt: string,
1176
+ options: GenerateOptions & { onToken?: (token: string, partial: string) => void } = {}
1177
+ ): Promise<{ text: string; firstTokenMs: number; totalMs: number; tokensGenerated: number }> {
1178
+ if (!this.models.has(modelId)) {
1179
+ await this.loadModel(modelId);
1180
+ }
1181
+ const loaded = this.models.get(modelId);
1182
+ if (!loaded) throw new Error(`Model "${modelId}" not loaded`);
1183
+ const { pipe, info, contextWindow } = loaded;
1184
+ if (info.category !== 'llm') throw new Error(`Not an LLM`);
1185
+
1186
+ const maxTokens = Math.min(options.maxTokens || 100, contextWindow || 2048);
1187
+ const startTime = Date.now();
1188
+ let firstTokenTime = 0;
1189
+ let accumulated = '';
1190
+
1191
+ this.emitProgress('generating', 0, `Streaming (max ${maxTokens} tokens)...`);
1192
+
1193
+ try {
1194
+ const result = await pipe(prompt, {
1195
+ max_new_tokens: maxTokens,
1196
+ temperature: options.temperature || 0.7,
1197
+ top_p: options.topP || 0.9,
1198
+ do_sample: true,
1199
+ // Transformers.js streamer callback
1200
+ callback_function: (output: any) => {
1201
+ if (!firstTokenTime) firstTokenTime = Date.now() - startTime;
1202
+ if (output && output.length > 0) {
1203
+ // output is token IDs, we need to decode
1204
+ // The callback in transformers.js v3 gives decoded text tokens
1205
+ const tokenText = typeof output === 'string' ? output : '';
1206
+ if (tokenText) {
1207
+ accumulated += tokenText;
1208
+ options.onToken?.(tokenText, accumulated);
1209
+ this.emitEvent('token', { token: tokenText, partial: accumulated });
1210
+ }
1211
+ }
1212
+ }
1213
+ });
1214
+
1215
+ const rawOutput = result[0].generated_text;
1216
+ const response = rawOutput.startsWith(prompt) ? rawOutput.slice(prompt.length).trim() : rawOutput.trim();
1217
+
1218
+ if (!firstTokenTime) firstTokenTime = Date.now() - startTime;
1219
+ const totalMs = Date.now() - startTime;
1220
+ const tokensGenerated = response.split(/\s+/).length;
1221
+
1222
+ this.emitProgress('ready', 100, `Streamed ${tokensGenerated} tokens in ${(totalMs/1000).toFixed(1)}s`);
1223
+
1224
+ return { text: response, firstTokenMs: firstTokenTime, totalMs, tokensGenerated };
1225
+ } catch (error: any) {
1226
+ this.emitProgress('error', 0, `Stream failed: ${error.message}`);
1227
+ throw error;
1228
+ }
1229
+ }
1230
+
1148
1231
  // ── Inference: Transcribe ───────────────────────────────────────
1149
1232
 
1150
1233
  async transcribe(modelId: string, audioInput: any, options: TranscribeOptions = {}): Promise<string> {
@@ -1495,6 +1578,54 @@ class SlyOS {
1495
1578
  private localEmbeddingModel: any = null;
1496
1579
  private offlineIndexes: Map<string, OfflineIndex> = new Map();
1497
1580
 
1581
+ /**
1582
+ * Compute dynamic RAG parameters based on device profile and model.
1583
+ */
1584
+ private computeRAGConfig(modelId: string): {
1585
+ maxContextChars: number;
1586
+ maxGenTokens: number;
1587
+ chunkSize: number;
1588
+ topK: number;
1589
+ contextWindow: number;
1590
+ deviceTier: 'low' | 'mid' | 'high';
1591
+ } {
1592
+ const contextWindow = this.modelContextWindow || 2048;
1593
+ const memoryMB = this.deviceProfile?.memoryMB || 4096;
1594
+ const cpuCores = this.deviceProfile?.cpuCores || 4;
1595
+ const hasGPU = !!(this.deviceProfile?.gpuRenderer || this.deviceProfile?.webgpuAvailable);
1596
+
1597
+ // Determine device tier
1598
+ let deviceTier: 'low' | 'mid' | 'high' = 'low';
1599
+ if (memoryMB >= 8192 && cpuCores >= 8) deviceTier = 'high';
1600
+ else if (memoryMB >= 4096 && cpuCores >= 4) deviceTier = 'mid';
1601
+
1602
+ // Context chars: scale with context window AND device capability
1603
+ let maxContextChars: number;
1604
+ if (contextWindow <= 2048) {
1605
+ maxContextChars = deviceTier === 'high' ? 600 : deviceTier === 'mid' ? 400 : 300;
1606
+ } else if (contextWindow <= 4096) {
1607
+ maxContextChars = deviceTier === 'high' ? 1500 : deviceTier === 'mid' ? 1000 : 600;
1608
+ } else {
1609
+ maxContextChars = deviceTier === 'high' ? 3000 : deviceTier === 'mid' ? 2000 : 1000;
1610
+ }
1611
+
1612
+ // Gen tokens: scale with device tier
1613
+ let maxGenTokens: number;
1614
+ if (contextWindow <= 2048) {
1615
+ maxGenTokens = deviceTier === 'high' ? 200 : deviceTier === 'mid' ? 150 : 100;
1616
+ } else {
1617
+ maxGenTokens = deviceTier === 'high' ? 400 : deviceTier === 'mid' ? 300 : 150;
1618
+ }
1619
+
1620
+ // Chunk size: larger chunks for bigger context windows
1621
+ const chunkSize = contextWindow <= 2048 ? 256 : contextWindow <= 4096 ? 512 : 1024;
1622
+
1623
+ // TopK: more chunks for powerful devices
1624
+ const topK = deviceTier === 'high' ? 5 : deviceTier === 'mid' ? 3 : 1;
1625
+
1626
+ return { maxContextChars, maxGenTokens, chunkSize, topK, contextWindow, deviceTier };
1627
+ }
1628
+
1498
1629
  /**
1499
1630
  * Tier 2: Cloud-indexed RAG with local inference.
1500
1631
  * Retrieves relevant chunks from server, generates response locally.
@@ -1505,36 +1636,61 @@ class SlyOS {
1505
1636
  try {
1506
1637
  if (!this.token) throw new Error('Not authenticated. Call init() first.');
1507
1638
 
1639
+ const ragConfig = this.computeRAGConfig(options.modelId);
1640
+
1508
1641
  // Step 1: Retrieve relevant chunks from backend
1642
+ const retrievalStart = Date.now();
1509
1643
  const searchResponse = await axios.post(
1510
1644
  `${this.apiUrl}/api/rag/knowledge-bases/${options.knowledgeBaseId}/query`,
1511
1645
  {
1512
1646
  query: options.query,
1513
- top_k: options.topK || 5,
1647
+ top_k: options.topK || ragConfig.topK,
1514
1648
  model_id: options.modelId
1515
1649
  },
1516
1650
  { headers: { Authorization: `Bearer ${this.token}` } }
1517
1651
  );
1652
+ const retrievalMs = Date.now() - retrievalStart;
1518
1653
 
1519
1654
  let { retrieved_chunks, prompt_template, context } = searchResponse.data;
1520
1655
 
1521
- // Apply context window limits
1522
- const contextWindow = this.modelContextWindow || 2048;
1523
- const maxContextChars = (contextWindow - 200) * 3; // Rough token-to-char ratio, reserving 200 tokens
1524
-
1525
- if (context && context.length > maxContextChars) {
1526
- context = context.substring(0, maxContextChars) + '...';
1656
+ // Step 2: Build context with dynamic limits
1657
+ const contextBuildStart = Date.now();
1658
+ if (context && context.length > ragConfig.maxContextChars) {
1659
+ context = context.substring(0, ragConfig.maxContextChars);
1527
1660
  }
1528
-
1529
- // Step 2: Generate response locally using the augmented prompt
1530
- const response = await this.generate(options.modelId, prompt_template, {
1531
- temperature: options.temperature,
1532
- maxTokens: options.maxTokens,
1533
- });
1661
+ // If no prompt_template from server, build minimal one
1662
+ if (!prompt_template) {
1663
+ prompt_template = `${context}\n\nQ: ${options.query}\nA:`;
1664
+ }
1665
+ const contextBuildMs = Date.now() - contextBuildStart;
1666
+
1667
+ // Step 3: Generate response — stream if callback provided
1668
+ const genStart = Date.now();
1669
+ let response: string;
1670
+ let firstTokenMs = 0;
1671
+
1672
+ if (options.onToken) {
1673
+ const streamResult = await this.generateStream(options.modelId, prompt_template, {
1674
+ temperature: options.temperature,
1675
+ maxTokens: options.maxTokens || ragConfig.maxGenTokens,
1676
+ onToken: options.onToken,
1677
+ });
1678
+ response = streamResult.text;
1679
+ firstTokenMs = streamResult.firstTokenMs;
1680
+ } else {
1681
+ response = await this.generate(options.modelId, prompt_template, {
1682
+ temperature: options.temperature,
1683
+ maxTokens: options.maxTokens || ragConfig.maxGenTokens,
1684
+ });
1685
+ firstTokenMs = Date.now() - genStart; // approximate
1686
+ }
1687
+ const generationMs = Date.now() - genStart;
1688
+ const totalMs = Date.now() - startTime;
1689
+ const tokensGenerated = response.split(/\s+/).length;
1534
1690
 
1535
1691
  return {
1536
1692
  query: options.query,
1537
- retrievedChunks: retrieved_chunks.map((c: any) => ({
1693
+ retrievedChunks: (retrieved_chunks || []).map((c: any) => ({
1538
1694
  id: c.id,
1539
1695
  documentId: c.document_id,
1540
1696
  documentName: c.document_name,
@@ -1544,8 +1700,25 @@ class SlyOS {
1544
1700
  })),
1545
1701
  generatedResponse: response,
1546
1702
  context,
1547
- latencyMs: Date.now() - startTime,
1703
+ latencyMs: totalMs,
1548
1704
  tierUsed: 2,
1705
+ timing: {
1706
+ retrievalMs,
1707
+ contextBuildMs,
1708
+ firstTokenMs,
1709
+ generationMs,
1710
+ totalMs,
1711
+ tokensGenerated,
1712
+ tokensPerSecond: generationMs > 0 ? tokensGenerated / (generationMs / 1000) : 0,
1713
+ },
1714
+ config: {
1715
+ maxContextChars: ragConfig.maxContextChars,
1716
+ maxGenTokens: ragConfig.maxGenTokens,
1717
+ chunkSize: ragConfig.chunkSize,
1718
+ topK: options.topK || ragConfig.topK,
1719
+ contextWindowUsed: ragConfig.contextWindow,
1720
+ deviceTier: ragConfig.deviceTier,
1721
+ },
1549
1722
  };
1550
1723
  } catch (error: any) {
1551
1724
  this.emitEvent('error', { stage: 'rag_query', error: error.message });
@@ -1561,63 +1734,70 @@ class SlyOS {
1561
1734
  const startTime = Date.now();
1562
1735
 
1563
1736
  try {
1737
+ const ragConfig = this.computeRAGConfig(options.modelId);
1738
+
1564
1739
  // Step 1: Load embedding model if needed
1565
1740
  if (!this.localEmbeddingModel) {
1566
1741
  await this.loadEmbeddingModel();
1567
1742
  }
1568
1743
 
1569
- // Adapt chunk size based on context window for efficiency
1570
- const contextWindow = this.modelContextWindow || 2048;
1571
- const chunkSize = contextWindow <= 1024 ? 256 : contextWindow <= 2048 ? 512 : 1024;
1572
- const overlap = Math.floor(chunkSize / 4);
1573
-
1574
- // Step 2: Chunk documents if not already chunked
1744
+ // Step 2: Chunk and embed documents (dynamic chunk size)
1745
+ const retrievalStart = Date.now();
1575
1746
  const allChunks: Array<{ content: string; documentName: string; embedding?: number[] }> = [];
1576
1747
  for (const doc of options.documents) {
1577
- const chunks = this.chunkTextLocal(doc.content, chunkSize, overlap);
1748
+ const chunks = this.chunkTextLocal(doc.content, ragConfig.chunkSize, Math.floor(ragConfig.chunkSize / 4));
1578
1749
  for (const chunk of chunks) {
1579
1750
  const embedding = await this.embedTextLocal(chunk);
1580
1751
  allChunks.push({ content: chunk, documentName: doc.name || 'Document', embedding });
1581
1752
  }
1582
1753
  }
1583
1754
 
1584
- // Step 3: Embed query
1755
+ // Step 3: Embed query and search
1585
1756
  const queryEmbedding = await this.embedTextLocal(options.query);
1586
-
1587
- // Step 4: Cosine similarity search
1588
1757
  const scored = allChunks
1589
1758
  .filter(c => c.embedding)
1590
- .map(c => ({
1591
- ...c,
1592
- similarityScore: this.cosineSimilarity(queryEmbedding, c.embedding!)
1593
- }))
1759
+ .map(c => ({ ...c, similarityScore: this.cosineSimilarity(queryEmbedding, c.embedding!) }))
1594
1760
  .sort((a, b) => b.similarityScore - a.similarityScore)
1595
- .slice(0, options.topK || 5);
1596
-
1597
- // Step 5: Build context with size limits — keep context SHORT so model has room to generate
1598
- const maxContextChars = contextWindow <= 2048 ? 800 : contextWindow <= 4096 ? 1500 : 3000;
1599
- let contextLength = 0;
1600
- const contextParts: string[] = [];
1601
-
1602
- for (const c of scored) {
1603
- const part = `[Source: ${c.documentName}]\n${c.content}`;
1604
- if (contextLength + part.length <= maxContextChars) {
1605
- contextParts.push(part);
1606
- contextLength += part.length + 10; // Account for separator
1607
- } else {
1608
- break;
1609
- }
1761
+ .slice(0, options.topK || ragConfig.topK);
1762
+ const retrievalMs = Date.now() - retrievalStart;
1763
+
1764
+ // Step 4: Build context
1765
+ const contextBuildStart = Date.now();
1766
+ const bestChunk = scored[0];
1767
+ let context = bestChunk.content
1768
+ .replace(/[^\x20-\x7E\n]/g, ' ')
1769
+ .replace(/\s{2,}/g, ' ')
1770
+ .replace(/<[^>]+>/g, ' ')
1771
+ .replace(/https?:\/\/\S+/g, '')
1772
+ .replace(/[{}()\[\]]/g, '')
1773
+ .trim();
1774
+ if (context.length > ragConfig.maxContextChars) context = context.substring(0, ragConfig.maxContextChars);
1775
+ const prompt = `${context}\n\nQ: ${options.query}\nA:`;
1776
+ const contextBuildMs = Date.now() - contextBuildStart;
1777
+
1778
+ // Step 5: Generate — stream if callback provided
1779
+ const genStart = Date.now();
1780
+ let response: string;
1781
+ let firstTokenMs = 0;
1782
+
1783
+ if (options.onToken) {
1784
+ const streamResult = await this.generateStream(options.modelId, prompt, {
1785
+ temperature: options.temperature || 0.6,
1786
+ maxTokens: options.maxTokens || ragConfig.maxGenTokens,
1787
+ onToken: options.onToken,
1788
+ });
1789
+ response = streamResult.text;
1790
+ firstTokenMs = streamResult.firstTokenMs;
1791
+ } else {
1792
+ response = await this.generate(options.modelId, prompt, {
1793
+ temperature: options.temperature || 0.6,
1794
+ maxTokens: options.maxTokens || ragConfig.maxGenTokens,
1795
+ });
1796
+ firstTokenMs = Date.now() - genStart;
1610
1797
  }
1611
-
1612
- const context = contextParts.join('\n\n---\n\n');
1613
- const prompt = `Use the following information to answer the question.\n\nInfo: ${context}\n\nQuestion: ${options.query}\nAnswer:`;
1614
-
1615
- // Step 6: Generate locally
1616
- const maxGen = contextWindow <= 2048 ? 150 : Math.min(300, Math.floor(contextWindow / 4));
1617
- const response = await this.generate(options.modelId, prompt, {
1618
- temperature: options.temperature || 0.6,
1619
- maxTokens: options.maxTokens || maxGen,
1620
- });
1798
+ const generationMs = Date.now() - genStart;
1799
+ const totalMs = Date.now() - startTime;
1800
+ const tokensGenerated = response.split(/\s+/).length;
1621
1801
 
1622
1802
  return {
1623
1803
  query: options.query,
@@ -1631,8 +1811,25 @@ class SlyOS {
1631
1811
  })),
1632
1812
  generatedResponse: response,
1633
1813
  context,
1634
- latencyMs: Date.now() - startTime,
1814
+ latencyMs: totalMs,
1635
1815
  tierUsed: 1,
1816
+ timing: {
1817
+ retrievalMs,
1818
+ contextBuildMs,
1819
+ firstTokenMs,
1820
+ generationMs,
1821
+ totalMs,
1822
+ tokensGenerated,
1823
+ tokensPerSecond: generationMs > 0 ? tokensGenerated / (generationMs / 1000) : 0,
1824
+ },
1825
+ config: {
1826
+ maxContextChars: ragConfig.maxContextChars,
1827
+ maxGenTokens: ragConfig.maxGenTokens,
1828
+ chunkSize: ragConfig.chunkSize,
1829
+ topK: options.topK || ragConfig.topK,
1830
+ contextWindowUsed: ragConfig.contextWindow,
1831
+ deviceTier: ragConfig.deviceTier,
1832
+ },
1636
1833
  };
1637
1834
  } catch (error: any) {
1638
1835
  this.emitEvent('error', { stage: 'rag_local', error: error.message });
@@ -1648,59 +1845,62 @@ class SlyOS {
1648
1845
  const startTime = Date.now();
1649
1846
 
1650
1847
  const index = this.offlineIndexes.get(options.knowledgeBaseId);
1651
- if (!index) {
1652
- throw new Error(`Knowledge base "${options.knowledgeBaseId}" not synced. Call syncKnowledgeBase() first.`);
1653
- }
1654
-
1655
- // Check expiry
1656
- if (new Date(index.metadata.expires_at) < new Date()) {
1657
- throw new Error('Offline index has expired. Please re-sync.');
1658
- }
1848
+ if (!index) throw new Error(`KB "${options.knowledgeBaseId}" not synced.`);
1849
+ if (new Date(index.metadata.expires_at) < new Date()) throw new Error('Offline index expired.');
1659
1850
 
1660
1851
  try {
1661
- // Load embedding model
1662
- if (!this.localEmbeddingModel) {
1663
- await this.loadEmbeddingModel();
1664
- }
1852
+ const ragConfig = this.computeRAGConfig(options.modelId);
1665
1853
 
1666
- // Embed query
1667
- const queryEmbedding = await this.embedTextLocal(options.query);
1854
+ // Load embedding model
1855
+ if (!this.localEmbeddingModel) await this.loadEmbeddingModel();
1668
1856
 
1669
1857
  // Search offline index
1858
+ const retrievalStart = Date.now();
1859
+ const queryEmbedding = await this.embedTextLocal(options.query);
1670
1860
  const scored = index.chunks
1671
1861
  .filter(c => c.embedding && c.embedding.length > 0)
1672
- .map(c => ({
1673
- ...c,
1674
- similarityScore: this.cosineSimilarity(queryEmbedding, c.embedding!)
1675
- }))
1862
+ .map(c => ({ ...c, similarityScore: this.cosineSimilarity(queryEmbedding, c.embedding!) }))
1676
1863
  .sort((a, b) => b.similarityScore - a.similarityScore)
1677
- .slice(0, options.topK || 5);
1678
-
1679
- // Build context with size limits — keep context SHORT so model has room to generate
1680
- const contextWindow = this.modelContextWindow || 2048;
1681
- const maxContextChars = contextWindow <= 2048 ? 800 : contextWindow <= 4096 ? 1500 : 3000;
1682
- let contextLength = 0;
1683
- const contextParts: string[] = [];
1684
-
1685
- for (const c of scored) {
1686
- const part = `[Source: ${c.document_name}]\n${c.content}`;
1687
- if (contextLength + part.length <= maxContextChars) {
1688
- contextParts.push(part);
1689
- contextLength += part.length + 10;
1690
- } else {
1691
- break;
1692
- }
1864
+ .slice(0, options.topK || ragConfig.topK);
1865
+ const retrievalMs = Date.now() - retrievalStart;
1866
+
1867
+ // Build context
1868
+ const contextBuildStart = Date.now();
1869
+ const bestChunk = scored[0];
1870
+ let context = bestChunk.content
1871
+ .replace(/[^\x20-\x7E\n]/g, ' ')
1872
+ .replace(/\s{2,}/g, ' ')
1873
+ .replace(/<[^>]+>/g, ' ')
1874
+ .replace(/https?:\/\/\S+/g, '')
1875
+ .replace(/[{}()\[\]]/g, '')
1876
+ .trim();
1877
+ if (context.length > ragConfig.maxContextChars) context = context.substring(0, ragConfig.maxContextChars);
1878
+ const prompt = `${context}\n\nQ: ${options.query}\nA:`;
1879
+ const contextBuildMs = Date.now() - contextBuildStart;
1880
+
1881
+ // Generate
1882
+ const genStart = Date.now();
1883
+ let response: string;
1884
+ let firstTokenMs = 0;
1885
+
1886
+ if (options.onToken) {
1887
+ const streamResult = await this.generateStream(options.modelId, prompt, {
1888
+ temperature: options.temperature || 0.6,
1889
+ maxTokens: options.maxTokens || ragConfig.maxGenTokens,
1890
+ onToken: options.onToken,
1891
+ });
1892
+ response = streamResult.text;
1893
+ firstTokenMs = streamResult.firstTokenMs;
1894
+ } else {
1895
+ response = await this.generate(options.modelId, prompt, {
1896
+ temperature: options.temperature || 0.6,
1897
+ maxTokens: options.maxTokens || ragConfig.maxGenTokens,
1898
+ });
1899
+ firstTokenMs = Date.now() - genStart;
1693
1900
  }
1694
-
1695
- const context = contextParts.join('\n\n---\n\n');
1696
- const prompt = `Use the following information to answer the question.\n\nInfo: ${context}\n\nQuestion: ${options.query}\nAnswer:`;
1697
-
1698
- // Generate locally
1699
- const maxGen = contextWindow <= 2048 ? 150 : Math.min(300, Math.floor(contextWindow / 4));
1700
- const response = await this.generate(options.modelId, prompt, {
1701
- temperature: options.temperature || 0.6,
1702
- maxTokens: options.maxTokens || maxGen,
1703
- });
1901
+ const generationMs = Date.now() - genStart;
1902
+ const totalMs = Date.now() - startTime;
1903
+ const tokensGenerated = response.split(/\s+/).length;
1704
1904
 
1705
1905
  return {
1706
1906
  query: options.query,
@@ -1714,8 +1914,25 @@ class SlyOS {
1714
1914
  })),
1715
1915
  generatedResponse: response,
1716
1916
  context,
1717
- latencyMs: Date.now() - startTime,
1917
+ latencyMs: totalMs,
1718
1918
  tierUsed: 3,
1919
+ timing: {
1920
+ retrievalMs,
1921
+ contextBuildMs,
1922
+ firstTokenMs,
1923
+ generationMs,
1924
+ totalMs,
1925
+ tokensGenerated,
1926
+ tokensPerSecond: generationMs > 0 ? tokensGenerated / (generationMs / 1000) : 0,
1927
+ },
1928
+ config: {
1929
+ maxContextChars: ragConfig.maxContextChars,
1930
+ maxGenTokens: ragConfig.maxGenTokens,
1931
+ chunkSize: ragConfig.chunkSize,
1932
+ topK: options.topK || ragConfig.topK,
1933
+ contextWindowUsed: ragConfig.contextWindow,
1934
+ deviceTier: ragConfig.deviceTier,
1935
+ },
1719
1936
  };
1720
1937
  } catch (error: any) {
1721
1938
  this.emitEvent('error', { stage: 'rag_offline', error: error.message });