npm - @emilshirokikh/slyos-sdk - Versions diffs - 1.3.2 → 1.3.3 - Mend

@emilshirokikh/slyos-sdk 1.3.2 → 1.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

package/dist/index.js CHANGED Viewed

@@ -81,6 +81,25 @@ function selectQuantization(memoryMB, modelId) {
     }
     return 'q4'; // fallback
 }
+// ─── Context Window Detection ──────────────────────────────────────
+async function detectContextWindowFromHF(hfModelId) {
+    try {
+        const configUrl = `https://huggingface.co/${hfModelId}/raw/main/config.json`;
+        const response = await axios.get(configUrl, { timeout: 5000 });
+        const config = response.data;
+        // Try multiple context window field names
+        const contextWindow = config.max_position_embeddings ||
+            config.n_positions ||
+            config.max_seq_len ||
+            config.model_max_length ||
+            2048;
+        return contextWindow;
+    }
+    catch {
+        // Default if config cannot be fetched
+        return 2048;
+    }
+}
 // ─── Device Profiling ───────────────────────────────────────────────
 async function profileDevice() {
     const isNode = typeof window === 'undefined';
@@ -148,6 +167,12 @@ class SlyOS {
         this.token = null;
         this.models = new Map();
         this.deviceProfile = null;
+        this.modelContextWindow = 0;
+        // ═══════════════════════════════════════════════════════════
+        // RAG — Retrieval Augmented Generation
+        // ═══════════════════════════════════════════════════════════
+        this.localEmbeddingModel = null;
+        this.offlineIndexes = new Map();
         this.apiKey = config.apiKey;
         this.apiUrl = config.apiUrl || 'https://api.slyos.world';
         this.deviceId = `device-${Date.now()}-${Math.random().toString(36).substr(2, 9)}`;
@@ -177,6 +202,9 @@ class SlyOS {
     getDeviceProfile() {
         return this.deviceProfile;
     }
+    getModelContextWindow() {
+        return this.modelContextWindow;
+    }
     // ── Smart Model Recommendation ──────────────────────────────────
     recommendModel(category = 'llm') {
         if (!this.deviceProfile) {
@@ -271,6 +299,31 @@ class SlyOS {
         }
         return Object.fromEntries(Object.entries(grouped).map(([cat, models]) => [cat, { models }]));
     }
+    async searchModels(query, options) {
+        try {
+            const limit = options?.limit || 20;
+            const filters = ['onnx']; // Filter for ONNX models only
+            if (options?.task) {
+                filters.push(options.task);
+            }
+            const filterString = filters.map(f => `"${f}"`).join(',');
+            const url = `https://huggingface.co/api/models?search=${encodeURIComponent(query)}&filter=${encodeURIComponent(`[${filterString}]`)}&sort=downloads&direction=-1&limit=${limit}`;
+            const response = await axios.get(url, { timeout: 10000 });
+            const models = Array.isArray(response.data) ? response.data : [];
+            return models.map((model) => ({
+                id: model.id,
+                name: model.id.split('/')[1] || model.id,
+                downloads: model.downloads || 0,
+                likes: model.likes || 0,
+                task: model.task || 'unknown',
+                size_category: model.size_category || 'unknown',
+            }));
+        }
+        catch (error) {
+            this.emitEvent('error', { stage: 'model_search', error: error.message });
+            throw new Error(`Model search failed: ${error.message}`);
+        }
+    }
     canRunModel(modelId, quant) {
         const info = modelMap[modelId];
         if (!info)
@@ -297,25 +350,37 @@ class SlyOS {
     }
     async loadModel(modelId, options) {
         const info = modelMap[modelId];
-        if (!info) {
-            const available = Object.keys(modelMap).join(', ');
-            throw new Error(`Unknown model "${modelId}". Available: ${available}`);
-        }
-        // Determine quantization
-        let quant = options?.quant || 'fp32';
-        if (!options?.quant && this.deviceProfile) {
-            quant = selectQuantization(this.deviceProfile.memoryMB, modelId);
-            this.emitProgress('downloading', 0, `Auto-selected ${quant.toUpperCase()} quantization for your device`);
-        }
-        // Check feasibility
-        const check = this.canRunModel(modelId, quant);
-        if (!check.canRun) {
-            this.emitProgress('error', 0, check.reason);
-            throw new Error(check.reason);
-        }
-        const estimatedSize = info.sizesMB[quant];
-        this.emitProgress('downloading', 0, `Downloading ${modelId} (${quant.toUpperCase()}, ~${estimatedSize}MB)...`);
-        this.emitEvent('model_download_start', { modelId, quant, estimatedSizeMB: estimatedSize });
+        let hfModelId;
+        let task;
+        let estimatedSize;
+        // Handle curated models
+        if (info) {
+            hfModelId = info.hfModel;
+            task = info.task;
+            // Determine quantization
+            let quant = options?.quant || 'fp32';
+            if (!options?.quant && this.deviceProfile) {
+                quant = selectQuantization(this.deviceProfile.memoryMB, modelId);
+                this.emitProgress('downloading', 0, `Auto-selected ${quant.toUpperCase()} quantization for your device`);
+            }
+            // Check feasibility
+            const check = this.canRunModel(modelId, quant);
+            if (!check.canRun) {
+                this.emitProgress('error', 0, check.reason);
+                throw new Error(check.reason);
+            }
+            estimatedSize = info.sizesMB[quant];
+            this.emitProgress('downloading', 0, `Downloading ${modelId} (${quant.toUpperCase()}, ~${estimatedSize}MB)...`);
+            this.emitEvent('model_download_start', { modelId, quant, estimatedSizeMB: estimatedSize });
+        }
+        else {
+            // Handle custom HuggingFace models
+            hfModelId = modelId;
+            task = 'text-generation'; // Default task
+            estimatedSize = 2048; // Default estimate
+            this.emitProgress('downloading', 0, `Loading custom HuggingFace model: ${modelId}...`);
+            this.emitEvent('model_download_start', { modelId, custom: true, estimatedSizeMB: estimatedSize });
+        }
         // Map quant to dtype for HuggingFace
         const dtypeMap = {
             q4: 'q4',
@@ -326,9 +391,14 @@ class SlyOS {
         let lastReportedPercent = 0;
         const startTime = Date.now();
         try {
-            const pipe = await pipeline(info.task, info.hfModel, {
+            // For custom HF models, detect context window
+            let detectedContextWindow = 2048;
+            if (!info) {
+                detectedContextWindow = await detectContextWindowFromHF(hfModelId);
+            }
+            const pipe = await pipeline(task, hfModelId, {
                 device: 'cpu',
-                dtype: dtypeMap[quant],
+                dtype: 'q4', // Default to q4 for stability
                 progress_callback: (progressData) => {
                     // HuggingFace transformers sends progress events during download
                     if (progressData && typeof progressData === 'object') {
@@ -357,12 +427,22 @@ class SlyOS {
                 },
             });
             const loadTime = Date.now() - startTime;
-            const contextWindow = this.deviceProfile
-                ? recommendContextWindow(this.deviceProfile.memoryMB, quant)
-                : 2048;
-            this.models.set(modelId, { pipe, info, quant, contextWindow });
-            this.emitProgress('ready', 100, `${modelId} loaded (${quant.toUpperCase()}, ${(loadTime / 1000).toFixed(1)}s, ctx: ${contextWindow})`);
-            this.emitEvent('model_loaded', { modelId, quant, loadTimeMs: loadTime, contextWindow });
+            let contextWindow;
+            if (info) {
+                // For curated models, use recommendContextWindow
+                const quant = options?.quant || (this.deviceProfile ? selectQuantization(this.deviceProfile.memoryMB, modelId) : 'q4');
+                contextWindow = this.deviceProfile
+                    ? recommendContextWindow(this.deviceProfile.memoryMB, quant)
+                    : 2048;
+            }
+            else {
+                // For custom HF models, use detected context window
+                contextWindow = detectedContextWindow;
+            }
+            this.modelContextWindow = contextWindow;
+            this.models.set(modelId, { pipe, info, quant: 'q4', contextWindow });
+            this.emitProgress('ready', 100, `${modelId} loaded (q4, ${(loadTime / 1000).toFixed(1)}s, ctx: ${contextWindow})`);
+            this.emitEvent('model_loaded', { modelId, quant: 'q4', loadTimeMs: loadTime, contextWindow });
             // Telemetry
             if (this.token) {
                 await axios.post(`${this.apiUrl}/api/telemetry`, {
@@ -370,7 +450,7 @@ class SlyOS {
                     event_type: 'model_load',
                     model_id: modelId,
                     success: true,
-                    metadata: { quant, loadTimeMs: loadTime, contextWindow },
+                    metadata: { quant: 'q4', loadTimeMs: loadTime, contextWindow, custom: !info },
                 }, {
                     headers: { Authorization: `Bearer ${this.token}` },
                 }).catch(() => { });
@@ -760,6 +840,284 @@ class SlyOS {
         };
         return modelMapping[slyModelId] || 'gpt-4o-mini';
     }
+    /**
+     * Tier 2: Cloud-indexed RAG with local inference.
+     * Retrieves relevant chunks from server, generates response locally.
+     */
+    async ragQuery(options) {
+        const startTime = Date.now();
+        try {
+            if (!this.token)
+                throw new Error('Not authenticated. Call init() first.');
+            // Step 1: Retrieve relevant chunks from backend
+            const searchResponse = await axios.post(`${this.apiUrl}/api/rag/knowledge-bases/${options.knowledgeBaseId}/query`, {
+                query: options.query,
+                top_k: options.topK || 5,
+                model_id: options.modelId
+            }, { headers: { Authorization: `Bearer ${this.token}` } });
+            let { retrieved_chunks, prompt_template, context } = searchResponse.data;
+            // Apply context window limits
+            const contextWindow = this.modelContextWindow || 2048;
+            const maxContextChars = (contextWindow - 200) * 3; // Rough token-to-char ratio, reserving 200 tokens
+            if (context && context.length > maxContextChars) {
+                context = context.substring(0, maxContextChars) + '...';
+            }
+            // Step 2: Generate response locally using the augmented prompt
+            const response = await this.generate(options.modelId, prompt_template, {
+                temperature: options.temperature,
+                maxTokens: options.maxTokens,
+            });
+            return {
+                query: options.query,
+                retrievedChunks: retrieved_chunks.map((c) => ({
+                    id: c.id,
+                    documentId: c.document_id,
+                    documentName: c.document_name,
+                    content: c.content,
+                    similarityScore: c.similarity_score,
+                    metadata: c.metadata
+                })),
+                generatedResponse: response,
+                context,
+                latencyMs: Date.now() - startTime,
+                tierUsed: 2,
+            };
+        }
+        catch (error) {
+            this.emitEvent('error', { stage: 'rag_query', error: error.message });
+            throw new Error(`RAG query failed: ${error.message}`);
+        }
+    }
+    /**
+     * Tier 1: Fully local RAG. Zero network calls.
+     * Documents are chunked/embedded on-device, retrieval and generation all local.
+     */
+    async ragQueryLocal(options) {
+        const startTime = Date.now();
+        try {
+            // Step 1: Load embedding model if needed
+            if (!this.localEmbeddingModel) {
+                await this.loadEmbeddingModel();
+            }
+            // Adapt chunk size based on context window for efficiency
+            const contextWindow = this.modelContextWindow || 2048;
+            const chunkSize = contextWindow <= 1024 ? 256 : contextWindow <= 2048 ? 512 : 1024;
+            const overlap = Math.floor(chunkSize / 4);
+            // Step 2: Chunk documents if not already chunked
+            const allChunks = [];
+            for (const doc of options.documents) {
+                const chunks = this.chunkTextLocal(doc.content, chunkSize, overlap);
+                for (const chunk of chunks) {
+                    const embedding = await this.embedTextLocal(chunk);
+                    allChunks.push({ content: chunk, documentName: doc.name || 'Document', embedding });
+                }
+            }
+            // Step 3: Embed query
+            const queryEmbedding = await this.embedTextLocal(options.query);
+            // Step 4: Cosine similarity search
+            const scored = allChunks
+                .filter(c => c.embedding)
+                .map(c => ({
+                ...c,
+                similarityScore: this.cosineSimilarity(queryEmbedding, c.embedding)
+            }))
+                .sort((a, b) => b.similarityScore - a.similarityScore)
+                .slice(0, options.topK || 5);
+            // Step 5: Build context with size limits
+            const maxContextChars = (contextWindow - 200) * 3; // Rough token-to-char ratio, reserving 200 tokens
+            let contextLength = 0;
+            const contextParts = [];
+            for (const c of scored) {
+                const part = `[Source: ${c.documentName}]\n${c.content}`;
+                if (contextLength + part.length <= maxContextChars) {
+                    contextParts.push(part);
+                    contextLength += part.length + 10; // Account for separator
+                }
+                else {
+                    break;
+                }
+            }
+            const context = contextParts.join('\n\n---\n\n');
+            const prompt = `You are a helpful assistant. Answer based ONLY on the following context:\n\n${context}\n\nQuestion: ${options.query}\n\nAnswer:`;
+            // Step 6: Generate locally
+            const response = await this.generate(options.modelId, prompt, {
+                temperature: options.temperature,
+                maxTokens: options.maxTokens,
+            });
+            return {
+                query: options.query,
+                retrievedChunks: scored.map((c, i) => ({
+                    id: `local-${i}`,
+                    documentId: 'local',
+                    documentName: c.documentName,
+                    content: c.content,
+                    similarityScore: c.similarityScore,
+                    metadata: {}
+                })),
+                generatedResponse: response,
+                context,
+                latencyMs: Date.now() - startTime,
+                tierUsed: 1,
+            };
+        }
+        catch (error) {
+            this.emitEvent('error', { stage: 'rag_local', error: error.message });
+            throw new Error(`Local RAG failed: ${error.message}`);
+        }
+    }
+    /**
+     * Tier 3: Offline RAG using a synced knowledge base.
+     * First call syncKnowledgeBase(), then use this for offline queries.
+     */
+    async ragQueryOffline(options) {
+        const startTime = Date.now();
+        const index = this.offlineIndexes.get(options.knowledgeBaseId);
+        if (!index) {
+            throw new Error(`Knowledge base "${options.knowledgeBaseId}" not synced. Call syncKnowledgeBase() first.`);
+        }
+        // Check expiry
+        if (new Date(index.metadata.expires_at) < new Date()) {
+            throw new Error('Offline index has expired. Please re-sync.');
+        }
+        try {
+            // Load embedding model
+            if (!this.localEmbeddingModel) {
+                await this.loadEmbeddingModel();
+            }
+            // Embed query
+            const queryEmbedding = await this.embedTextLocal(options.query);
+            // Search offline index
+            const scored = index.chunks
+                .filter(c => c.embedding && c.embedding.length > 0)
+                .map(c => ({
+                ...c,
+                similarityScore: this.cosineSimilarity(queryEmbedding, c.embedding)
+            }))
+                .sort((a, b) => b.similarityScore - a.similarityScore)
+                .slice(0, options.topK || 5);
+            // Build context with size limits
+            const contextWindow = this.modelContextWindow || 2048;
+            const maxContextChars = (contextWindow - 200) * 3; // Rough token-to-char ratio, reserving 200 tokens
+            let contextLength = 0;
+            const contextParts = [];
+            for (const c of scored) {
+                const part = `[Source: ${c.document_name}]\n${c.content}`;
+                if (contextLength + part.length <= maxContextChars) {
+                    contextParts.push(part);
+                    contextLength += part.length + 10; // Account for separator
+                }
+                else {
+                    break;
+                }
+            }
+            const context = contextParts.join('\n\n---\n\n');
+            const prompt = `You are a helpful assistant. Answer based ONLY on the following context:\n\n${context}\n\nQuestion: ${options.query}\n\nAnswer:`;
+            // Generate locally
+            const response = await this.generate(options.modelId, prompt, {
+                temperature: options.temperature,
+                maxTokens: options.maxTokens,
+            });
+            return {
+                query: options.query,
+                retrievedChunks: scored.map(c => ({
+                    id: c.id,
+                    documentId: c.document_id,
+                    documentName: c.document_name,
+                    content: c.content,
+                    similarityScore: c.similarityScore,
+                    metadata: c.metadata
+                })),
+                generatedResponse: response,
+                context,
+                latencyMs: Date.now() - startTime,
+                tierUsed: 3,
+            };
+        }
+        catch (error) {
+            this.emitEvent('error', { stage: 'rag_offline', error: error.message });
+            throw new Error(`Offline RAG failed: ${error.message}`);
+        }
+    }
+    /**
+     * Sync a knowledge base for offline use (Tier 3).
+     * Downloads chunks + embeddings from server, stores locally.
+     */
+    async syncKnowledgeBase(knowledgeBaseId, deviceId) {
+        try {
+            if (!this.token)
+                throw new Error('Not authenticated. Call init() first.');
+            const response = await axios.post(`${this.apiUrl}/api/rag/knowledge-bases/${knowledgeBaseId}/sync`, { device_id: deviceId || this.deviceId || 'sdk-device' }, { headers: { Authorization: `Bearer ${this.token}` } });
+            const { sync_package, chunk_count, package_size_mb, expires_at } = response.data;
+            this.offlineIndexes.set(knowledgeBaseId, sync_package);
+            return {
+                chunkCount: chunk_count,
+                sizeMb: package_size_mb,
+                expiresAt: expires_at
+            };
+        }
+        catch (error) {
+            throw new Error(`Sync failed: ${error.message}`);
+        }
+    }
+    // --- RAG Helper Methods ---
+    async loadEmbeddingModel() {
+        this.emitProgress('downloading', 0, 'Loading embedding model (all-MiniLM-L6-v2)...');
+        try {
+            const { pipeline } = await import('@huggingface/transformers');
+            this.localEmbeddingModel = await pipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2');
+            this.emitProgress('ready', 100, 'Embedding model loaded');
+        }
+        catch (error) {
+            this.emitProgress('error', 0, `Embedding model failed: ${error.message}`);
+            throw error;
+        }
+    }
+    async embedTextLocal(text) {
+        if (!this.localEmbeddingModel)
+            throw new Error('Embedding model not loaded');
+        const result = await this.localEmbeddingModel(text, { pooling: 'mean', normalize: true });
+        // Handle different tensor output formats (v2 vs v3 of transformers)
+        if (result.data)
+            return Array.from(result.data);
+        if (result.tolist)
+            return result.tolist().flat();
+        if (Array.isArray(result))
+            return result.flat();
+        throw new Error('Unexpected embedding output format');
+    }
+    cosineSimilarity(a, b) {
+        let dot = 0, normA = 0, normB = 0;
+        for (let i = 0; i < a.length; i++) {
+            dot += a[i] * b[i];
+            normA += a[i] * a[i];
+            normB += b[i] * b[i];
+        }
+        const denom = Math.sqrt(normA) * Math.sqrt(normB);
+        return denom === 0 ? 0 : dot / denom;
+    }
+    chunkTextLocal(text, chunkSize = 512, overlap = 128) {
+        if (!text || text.length === 0)
+            return [];
+        if (overlap >= chunkSize)
+            overlap = Math.floor(chunkSize * 0.25);
+        const chunks = [];
+        let start = 0;
+        while (start < text.length) {
+            let end = start + chunkSize;
+            if (end < text.length) {
+                const bp = Math.max(text.lastIndexOf('.', end), text.lastIndexOf('\n', end));
+                if (bp > start + chunkSize / 2)
+                    end = bp + 1;
+            }
+            const chunk = text.slice(start, end).trim();
+            if (chunk.length > 20)
+                chunks.push(chunk);
+            start = end - overlap;
+            if (start >= text.length)
+                break;
+        }
+        return chunks;
+    }
     // ── Static OpenAI Compatible Factory ────────────────────────────────
     static openaiCompatible(config) {
         const instance = new SlyOS({

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@emilshirokikh/slyos-sdk",
-  "version": "1.3.2",
+  "version": "1.3.3",
   "description": "SlyOS - On-Device AI SDK for Web and Node.js",
   "main": "dist/index.js",
   "types": "dist/index.d.ts",