npm - @emilshirokikh/slyos-sdk - Versions diffs - 1.3.2 → 1.3.3 - Mend

@emilshirokikh/slyos-sdk 1.3.2 → 1.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

package/src/index.ts CHANGED Viewed

@@ -151,6 +151,57 @@ interface OpenAICompatibleClient {
   };
 }
+// ─── RAG Types ──────────────────────────────────────────────────
+/**
+ * Options accepted by the RAG query methods (`ragQuery`, `ragQueryLocal`,
+ * `ragQueryOffline`).
+ */
+interface RAGOptions {
+  // Knowledge base identifier: `ragQuery` places it in the request URL and
+  // `ragQueryOffline` uses it as the key into the synced offline indexes.
+  knowledgeBaseId: string;
+  // The question text: sent to the server by `ragQuery`, embedded locally by
+  // `ragQueryLocal` / `ragQueryOffline`.
+  query: string;
+  // Number of chunks to retrieve; the RAG methods fall back to 5 when this is
+  // omitted (or 0, because they default with `||`).
+  topK?: number;
+  // Model id passed to `generate()`; `ragQuery` also sends it as `model_id`.
+  modelId: string;
+  // Forwarded unchanged to `generate()`.
+  temperature?: number;
+  // Forwarded unchanged to `generate()`.
+  maxTokens?: number;
+}
+/**
+ * One retrieved chunk as returned to callers inside `RAGResponse.retrievedChunks`.
+ */
+interface RAGChunk {
+  // Server/offline chunk id, or `local-<rank>` when produced by `ragQueryLocal`.
+  id: string;
+  // Source document id; `ragQueryLocal` always reports the literal 'local'.
+  documentId: string;
+  // Source document name (`ragQueryLocal` uses 'Document' when none was given).
+  documentName: string;
+  // The chunk text.
+  content: string;
+  // `similarity_score` from the server in `ragQuery`; cosine similarity against
+  // the query embedding in `ragQueryLocal` / `ragQueryOffline`.
+  similarityScore: number;
+  // Passed through from the source chunk; `ragQueryLocal` supplies `{}`.
+  metadata?: Record<string, any>;
+}
+/**
+ * Result returned by all three RAG query methods.
+ */
+interface RAGResponse {
+  // Echo of `options.query`.
+  query: string;
+  // The chunks that were selected for this query, best match first in the
+  // local/offline tiers (they sort by descending similarity).
+  retrievedChunks: RAGChunk[];
+  // The value returned by `generate()`.
+  generatedResponse: string;
+  // The context string: server-provided (and length-capped) in `ragQuery`,
+  // or the joined "[Source: …]" parts in `ragQueryLocal` / `ragQueryOffline`.
+  context: string;
+  // Wall-clock milliseconds from method entry to building the response.
+  latencyMs: number;
+  // 1 = `ragQueryLocal`, 2 = `ragQuery`, 3 = `ragQueryOffline`.
+  tierUsed: 1 | 2 | 3;
+}
+/**
+ * Shape of an offline knowledge-base index. `syncKnowledgeBase` stores the
+ * server's `sync_package` under this type and `ragQueryOffline` searches it.
+ * Field names are snake_case because the object is stored as received.
+ */
+interface OfflineIndex {
+  // NOTE(review): only `expires_at` is read by the code visible in this diff;
+  // the meaning of the remaining metadata fields is inferred from their names
+  // and should be confirmed against the server API.
+  metadata: {
+    kb_id: string;
+    kb_name: string;
+    chunk_size: number;
+    embedding_dim: number;
+    total_chunks: number;
+    synced_at: string;
+    // Parsed with `new Date(...)` by `ragQueryOffline`, which refuses to query
+    // once this instant is in the past.
+    expires_at: string;
+    sync_token: string;
+  };
+  chunks: Array<{
+    id: string;
+    document_id: string;
+    document_name: string;
+    content: string;
+    chunk_index: number;
+    // Chunks whose embedding is null or empty are skipped by `ragQueryOffline`.
+    embedding: number[] | null;
+    metadata: Record<string, any>;
+  }>;
+}
 // ─── Model Registry ─────────────────────────────────────────────────
 const modelMap: Record<string, ModelInfo> = {
@@ -231,6 +282,29 @@ function selectQuantization(memoryMB: number, modelId: string): QuantizationLeve
   return 'q4'; // fallback
 }
+// ─── Context Window Detection ──────────────────────────────────────
+/**
+ * Best-effort lookup of a HuggingFace model's context window.
+ *
+ * Fetches the model's `config.json` from the Hub and returns the first of
+ * `max_position_embeddings`, `n_positions`, `max_seq_len`, `model_max_length`
+ * that is a positive, finite number.
+ *
+ * @param hfModelId - Hub repository id, e.g. `org/model`.
+ * @returns The detected context window, or 2048 when the config cannot be
+ *          fetched or none of the fields holds a usable number. Never rejects.
+ */
+async function detectContextWindowFromHF(hfModelId: string): Promise<number> {
+  const fallbackWindow = 2048;
+  try {
+    const configUrl = `https://huggingface.co/${hfModelId}/raw/main/config.json`;
+    const response = await axios.get(configUrl, { timeout: 5000 });
+    const config = response.data;
+    // Try multiple context window field names, in priority order
+    const candidates: unknown[] = [
+      config.max_position_embeddings,
+      config.n_positions,
+      config.max_seq_len,
+      config.model_max_length,
+    ];
+    for (const candidate of candidates) {
+      // Only accept real numbers: a string, NaN, Infinity or a non-positive
+      // value would otherwise leak out of a function typed Promise<number>
+      // and corrupt the context-size arithmetic done by the RAG methods.
+      if (typeof candidate === 'number' && Number.isFinite(candidate) && candidate > 0) {
+        return candidate;
+      }
+    }
+    return fallbackWindow;
+  } catch {
+    // Default if config cannot be fetched (or is not an object)
+    return fallbackWindow;
+  }
+}
 // ─── Device Profiling ───────────────────────────────────────────────
 async function profileDevice(): Promise<DeviceProfile> {
@@ -307,6 +381,7 @@ class SlyOS {
   private onProgress: ProgressCallback | null;
   private onEvent: EventCallback | null;
   private fallbackConfig: FallbackConfig | null;
+  private modelContextWindow: number = 0;
   constructor(config: SlyOSConfigWithFallback) {
     this.apiKey = config.apiKey;
@@ -345,6 +420,10 @@ class SlyOS {
     return this.deviceProfile;
   }
+  /**
+   * Returns the context window recorded by `loadModel` for the most recently
+   * loaded model. The field starts at 0, so 0 means no model has been loaded
+   * yet; the RAG methods treat 0 as 2048.
+   */
+  getModelContextWindow(): number {
+    return this.modelContextWindow;
+  }
   // ── Smart Model Recommendation ──────────────────────────────────
   recommendModel(category: ModelCategory = 'llm'): { modelId: string; quant: QuantizationLevel; contextWindow: number; reason: string } | null {
@@ -453,6 +532,41 @@ class SlyOS {
     );
   }
+  /**
+   * Search the HuggingFace Hub for ONNX models, most-downloaded first.
+   *
+   * @param query - Free-text search string.
+   * @param options - `limit` caps the number of results (default 20); `task`
+   *                  adds a second tag filter (e.g. `text-generation`).
+   * @returns One summary entry per model. `task` and `size_category` are
+   *          `'unknown'` when the Hub response does not carry them.
+   * @throws Error prefixed with "Model search failed:" on any request failure
+   *         (an `error` event with stage `model_search` is emitted first).
+   */
+  async searchModels(query: string, options?: { limit?: number; task?: string }): Promise<Array<{
+    id: string;
+    name: string;
+    downloads: number;
+    likes: number;
+    task: string;
+    size_category: string;
+  }>> {
+    try {
+      const limit = options?.limit || 20;
+      const params = new URLSearchParams({
+        search: query,
+        sort: 'downloads',
+        direction: '-1',
+        limit: String(limit),
+      });
+      // Tag filters go out as repeated `filter` parameters (the form the Hub
+      // API documents) instead of a JSON-array string.
+      params.append('filter', 'onnx'); // Filter for ONNX models only
+      if (options?.task) {
+        params.append('filter', options.task);
+      }
+      const url = `https://huggingface.co/api/models?${params.toString()}`;
+      const response = await axios.get(url, { timeout: 10000 });
+      const models = Array.isArray(response.data) ? response.data : [];
+      return models
+        // Skip malformed entries rather than failing the whole search on one bad row.
+        .filter((model: any) => model && typeof model.id === 'string')
+        .map((model: any) => ({
+          id: model.id,
+          name: model.id.split('/')[1] || model.id,
+          downloads: model.downloads || 0,
+          likes: model.likes || 0,
+          // The Hub reports the task as `pipeline_tag`; `task` is kept as a fallback.
+          task: model.pipeline_tag || model.task || 'unknown',
+          size_category: model.size_category || 'unknown',
+        }));
+    } catch (error: any) {
+      this.emitEvent('error', { stage: 'model_search', error: error.message });
+      throw new Error(`Model search failed: ${error.message}`);
+    }
+  }
   canRunModel(modelId: string, quant?: QuantizationLevel): { canRun: boolean; reason: string; recommendedQuant: QuantizationLevel } {
     const info = modelMap[modelId];
     if (!info) return { canRun: false, reason: `Unknown model "${modelId}"`, recommendedQuant: 'q4' };
@@ -482,29 +596,42 @@ class SlyOS {
   async loadModel(modelId: string, options?: { quant?: QuantizationLevel }): Promise<void> {
     const info = modelMap[modelId];
-    if (!info) {
-      const available = Object.keys(modelMap).join(', ');
-      throw new Error(`Unknown model "${modelId}". Available: ${available}`);
-    }
+    let hfModelId: string;
+    let task: string;
+    let estimatedSize: number;
+    // Handle curated models
+    if (info) {
+      hfModelId = info.hfModel;
+      task = info.task;
+      // Determine quantization
+      let quant: QuantizationLevel = options?.quant || 'fp32';
+      if (!options?.quant && this.deviceProfile) {
+        quant = selectQuantization(this.deviceProfile.memoryMB, modelId);
+        this.emitProgress('downloading', 0, `Auto-selected ${quant.toUpperCase()} quantization for your device`);
+      }
-    // Determine quantization
-    let quant: QuantizationLevel = options?.quant || 'fp32';
-    if (!options?.quant && this.deviceProfile) {
-      quant = selectQuantization(this.deviceProfile.memoryMB, modelId);
-      this.emitProgress('downloading', 0, `Auto-selected ${quant.toUpperCase()} quantization for your device`);
-    }
+      // Check feasibility
+      const check = this.canRunModel(modelId, quant);
+      if (!check.canRun) {
+        this.emitProgress('error', 0, check.reason);
+        throw new Error(check.reason);
+      }
-    // Check feasibility
-    const check = this.canRunModel(modelId, quant);
-    if (!check.canRun) {
-      this.emitProgress('error', 0, check.reason);
-      throw new Error(check.reason);
+      estimatedSize = info.sizesMB[quant];
+      this.emitProgress('downloading', 0, `Downloading ${modelId} (${quant.toUpperCase()}, ~${estimatedSize}MB)...`);
+      this.emitEvent('model_download_start', { modelId, quant, estimatedSizeMB: estimatedSize });
+    } else {
+      // Handle custom HuggingFace models
+      hfModelId = modelId;
+      task = 'text-generation'; // Default task
+      estimatedSize = 2048; // Default estimate
+      this.emitProgress('downloading', 0, `Loading custom HuggingFace model: ${modelId}...`);
+      this.emitEvent('model_download_start', { modelId, custom: true, estimatedSizeMB: estimatedSize });
     }
-    const estimatedSize = info.sizesMB[quant];
-    this.emitProgress('downloading', 0, `Downloading ${modelId} (${quant.toUpperCase()}, ~${estimatedSize}MB)...`);
-    this.emitEvent('model_download_start', { modelId, quant, estimatedSizeMB: estimatedSize });
     // Map quant to dtype for HuggingFace
     const dtypeMap: Record<QuantizationLevel, string> = {
       q4: 'q4',
@@ -517,9 +644,15 @@ class SlyOS {
     const startTime = Date.now();
     try {
-      const pipe = await pipeline(info.task as any, info.hfModel, {
+      // For custom HF models, detect context window
+      let detectedContextWindow = 2048;
+      if (!info) {
+        detectedContextWindow = await detectContextWindowFromHF(hfModelId);
+      }
+      const pipe = await pipeline(task as any, hfModelId, {
         device: 'cpu',
-        dtype: dtypeMap[quant] as any,
+        dtype: 'q4' as any, // Default to q4 for stability
         progress_callback: (progressData: any) => {
           // HuggingFace transformers sends progress events during download
           if (progressData && typeof progressData === 'object') {
@@ -549,14 +682,24 @@ class SlyOS {
       });
       const loadTime = Date.now() - startTime;
-      const contextWindow = this.deviceProfile
-        ? recommendContextWindow(this.deviceProfile.memoryMB, quant)
-        : 2048;
+      let contextWindow: number;
+      if (info) {
+        // For curated models, use recommendContextWindow
+        const quant = options?.quant || (this.deviceProfile ? selectQuantization(this.deviceProfile.memoryMB, modelId) : 'q4');
+        contextWindow = this.deviceProfile
+          ? recommendContextWindow(this.deviceProfile.memoryMB, quant)
+          : 2048;
+      } else {
+        // For custom HF models, use detected context window
+        contextWindow = detectedContextWindow;
+      }
-      this.models.set(modelId, { pipe, info, quant, contextWindow });
+      this.modelContextWindow = contextWindow;
+      this.models.set(modelId, { pipe, info, quant: 'q4', contextWindow });
-      this.emitProgress('ready', 100, `${modelId} loaded (${quant.toUpperCase()}, ${(loadTime / 1000).toFixed(1)}s, ctx: ${contextWindow})`);
-      this.emitEvent('model_loaded', { modelId, quant, loadTimeMs: loadTime, contextWindow });
+      this.emitProgress('ready', 100, `${modelId} loaded (q4, ${(loadTime / 1000).toFixed(1)}s, ctx: ${contextWindow})`);
+      this.emitEvent('model_loaded', { modelId, quant: 'q4', loadTimeMs: loadTime, contextWindow });
       // Telemetry
       if (this.token) {
@@ -565,7 +708,7 @@ class SlyOS {
           event_type: 'model_load',
           model_id: modelId,
           success: true,
-          metadata: { quant, loadTimeMs: loadTime, contextWindow },
+          metadata: { quant: 'q4', loadTimeMs: loadTime, contextWindow, custom: !info },
         }, {
           headers: { Authorization: `Bearer ${this.token}` },
         }).catch(() => {});
@@ -1000,6 +1143,320 @@ class SlyOS {
     return modelMapping[slyModelId] || 'gpt-4o-mini';
   }
+  // ═══════════════════════════════════════════════════════════
+  // RAG — Retrieval Augmented Generation
+  // ═══════════════════════════════════════════════════════════
+  // Lazily-created feature-extraction pipeline; null until loadEmbeddingModel()
+  // has completed successfully.
+  private localEmbeddingModel: any = null;
+  // Offline indexes stored by syncKnowledgeBase(), keyed by knowledge base id.
+  // Held in memory only, as far as this code shows.
+  private offlineIndexes: Map<string, OfflineIndex> = new Map();
+  /**
+   * Tier 2: Cloud-indexed RAG with local inference.
+   * Retrieves relevant chunks (and a ready-made prompt) from the server, then
+   * generates the response locally with `generate()`.
+   *
+   * The server-provided context is capped to roughly the loaded model's
+   * context window (3 characters per token, 200 tokens reserved). When the
+   * cap applies and the server prompt embeds that context verbatim, the
+   * prompt is rewritten to use the capped context as well.
+   *
+   * @param options - Knowledge base, query, model and generation settings.
+   * @returns Retrieved chunks, generated text, the (possibly capped) context
+   *          and elapsed time; `tierUsed` is 2.
+   * @throws Error prefixed with "RAG query failed:" when not authenticated or
+   *         when retrieval/generation fails (an `error` event with stage
+   *         `rag_query` is emitted first).
+   */
+  async ragQuery(options: RAGOptions): Promise<RAGResponse> {
+    const startTime = Date.now();
+    try {
+      if (!this.token) throw new Error('Not authenticated. Call init() first.');
+      // Step 1: Retrieve relevant chunks from backend
+      const searchResponse = await axios.post(
+        `${this.apiUrl}/api/rag/knowledge-bases/${options.knowledgeBaseId}/query`,
+        {
+          query: options.query,
+          top_k: options.topK || 5,
+          model_id: options.modelId
+        },
+        { headers: { Authorization: `Bearer ${this.token}` } }
+      );
+      const { retrieved_chunks, prompt_template, context: serverContext } = searchResponse.data;
+      // Apply context window limits
+      const contextWindow = this.modelContextWindow || 2048;
+      const maxContextChars = (contextWindow - 200) * 3; // Rough token-to-char ratio, reserving 200 tokens
+      let context = serverContext;
+      let prompt = prompt_template;
+      if (serverContext && serverContext.length > maxContextChars) {
+        const cappedContext = serverContext.substring(0, maxContextChars) + '...';
+        context = cappedContext;
+        // Capping `context` alone never reached the model, because the prompt
+        // is what gets generated from. Swap the capped text into the prompt
+        // when the full context appears in it verbatim; otherwise the prompt
+        // is left exactly as the server sent it.
+        if (typeof prompt_template === 'string' && prompt_template.includes(serverContext)) {
+          // Function replacer so `$` sequences in the context are not interpreted.
+          prompt = prompt_template.replace(serverContext, () => cappedContext);
+        }
+      }
+      // Step 2: Generate response locally using the augmented prompt
+      const response = await this.generate(options.modelId, prompt, {
+        temperature: options.temperature,
+        maxTokens: options.maxTokens,
+      });
+      // Tolerate a response without chunks instead of failing after generation.
+      const chunkRows: any[] = Array.isArray(retrieved_chunks) ? retrieved_chunks : [];
+      return {
+        query: options.query,
+        retrievedChunks: chunkRows.map((c: any) => ({
+          id: c.id,
+          documentId: c.document_id,
+          documentName: c.document_name,
+          content: c.content,
+          similarityScore: c.similarity_score,
+          metadata: c.metadata
+        })),
+        generatedResponse: response,
+        context,
+        latencyMs: Date.now() - startTime,
+        tierUsed: 2,
+      };
+    } catch (error: any) {
+      this.emitEvent('error', { stage: 'rag_query', error: error.message });
+      throw new Error(`RAG query failed: ${error.message}`);
+    }
+  }
+  /**
+   * Tier 1: Fully local RAG.
+   * The supplied documents are chunked and embedded on-device, ranked against
+   * the query by cosine similarity, and the answer is generated locally. The
+   * embedding model is loaded on first use.
+   *
+   * @param options - RAG options plus the raw `documents` to search.
+   * @returns Ranked chunks, generated text, the assembled context and elapsed
+   *          time; `tierUsed` is 1.
+   * @throws Error prefixed with "Local RAG failed:" on any failure (an `error`
+   *         event with stage `rag_local` is emitted first).
+   */
+  async ragQueryLocal(options: RAGOptions & { documents: Array<{ content: string; name?: string }> }): Promise<RAGResponse> {
+    const startTime = Date.now();
+    try {
+      // Make sure the embedding pipeline exists before any embedding call.
+      if (!this.localEmbeddingModel) {
+        await this.loadEmbeddingModel();
+      }
+
+      // Smaller context windows get smaller chunks; overlap is a quarter chunk.
+      const contextWindow = this.modelContextWindow || 2048;
+      let chunkSize: number;
+      if (contextWindow <= 1024) {
+        chunkSize = 256;
+      } else if (contextWindow <= 2048) {
+        chunkSize = 512;
+      } else {
+        chunkSize = 1024;
+      }
+      const overlap = Math.floor(chunkSize / 4);
+
+      // Chunk every document and embed each piece, one at a time.
+      const embedded: Array<{ content: string; documentName: string; embedding?: number[] }> = [];
+      for (const doc of options.documents) {
+        const documentName = doc.name || 'Document';
+        for (const piece of this.chunkTextLocal(doc.content, chunkSize, overlap)) {
+          const embedding = await this.embedTextLocal(piece);
+          embedded.push({ content: piece, documentName, embedding });
+        }
+      }
+
+      // Embed the question itself.
+      const queryEmbedding = await this.embedTextLocal(options.query);
+
+      // Score every embedded chunk and keep the best topK.
+      const withScores = embedded
+        .filter(entry => entry.embedding)
+        .map(entry => ({
+          ...entry,
+          similarityScore: this.cosineSimilarity(queryEmbedding, entry.embedding!)
+        }));
+      withScores.sort((left, right) => right.similarityScore - left.similarityScore);
+      const scored = withScores.slice(0, options.topK || 5);
+
+      // Add chunks to the context, best first, until the character budget is hit.
+      const maxContextChars = (contextWindow - 200) * 3; // Rough token-to-char ratio, reserving 200 tokens
+      const contextParts: string[] = [];
+      let usedChars = 0;
+      for (const hit of scored) {
+        const part = `[Source: ${hit.documentName}]\n${hit.content}`;
+        if (usedChars + part.length > maxContextChars) {
+          break;
+        }
+        contextParts.push(part);
+        usedChars += part.length + 10; // Account for separator
+      }
+      const context = contextParts.join('\n\n---\n\n');
+      const prompt = `You are a helpful assistant. Answer based ONLY on the following context:\n\n${context}\n\nQuestion: ${options.query}\n\nAnswer:`;
+
+      // Generate the answer with the local model.
+      const generationSettings = {
+        temperature: options.temperature,
+        maxTokens: options.maxTokens,
+      };
+      const response = await this.generate(options.modelId, prompt, generationSettings);
+
+      const retrievedChunks = scored.map((hit, rank) => ({
+        id: `local-${rank}`,
+        documentId: 'local',
+        documentName: hit.documentName,
+        content: hit.content,
+        similarityScore: hit.similarityScore,
+        metadata: {}
+      }));
+      return {
+        query: options.query,
+        retrievedChunks,
+        generatedResponse: response,
+        context,
+        latencyMs: Date.now() - startTime,
+        tierUsed: 1,
+      };
+    } catch (error: any) {
+      this.emitEvent('error', { stage: 'rag_local', error: error.message });
+      throw new Error(`Local RAG failed: ${error.message}`);
+    }
+  }
+  /**
+   * Tier 3: Offline RAG over a previously synced knowledge base.
+   * Requires a prior `syncKnowledgeBase()` call for the same id; the query is
+   * embedded locally, matched against the stored chunk embeddings, and the
+   * answer is generated locally.
+   *
+   * @param options - Knowledge base, query, model and generation settings.
+   * @returns Ranked chunks, generated text, the assembled context and elapsed
+   *          time; `tierUsed` is 3.
+   * @throws Error (unprefixed) when the knowledge base was never synced or its
+   *         index has expired; Error prefixed with "Offline RAG failed:" for
+   *         failures during embedding, search or generation (an `error` event
+   *         with stage `rag_offline` is emitted first).
+   */
+  async ragQueryOffline(options: RAGOptions): Promise<RAGResponse> {
+    const startTime = Date.now();
+
+    // Precondition checks happen outside the try block, so their errors
+    // propagate without the "Offline RAG failed" prefix or an error event.
+    const index = this.offlineIndexes.get(options.knowledgeBaseId);
+    if (!index) {
+      throw new Error(`Knowledge base "${options.knowledgeBaseId}" not synced. Call syncKnowledgeBase() first.`);
+    }
+    const expiresAtMs = new Date(index.metadata.expires_at).getTime();
+    if (expiresAtMs < Date.now()) {
+      throw new Error('Offline index has expired. Please re-sync.');
+    }
+
+    try {
+      // Make sure the embedding pipeline exists, then embed the question.
+      if (!this.localEmbeddingModel) {
+        await this.loadEmbeddingModel();
+      }
+      const queryEmbedding = await this.embedTextLocal(options.query);
+
+      // Score every chunk that carries an embedding and keep the best topK.
+      const withScores = index.chunks
+        .filter(entry => entry.embedding && entry.embedding.length > 0)
+        .map(entry => ({
+          ...entry,
+          similarityScore: this.cosineSimilarity(queryEmbedding, entry.embedding!)
+        }));
+      withScores.sort((left, right) => right.similarityScore - left.similarityScore);
+      const scored = withScores.slice(0, options.topK || 5);
+
+      // Add chunks to the context, best first, until the character budget is hit.
+      const contextWindow = this.modelContextWindow || 2048;
+      const maxContextChars = (contextWindow - 200) * 3; // Rough token-to-char ratio, reserving 200 tokens
+      const contextParts: string[] = [];
+      let usedChars = 0;
+      for (const hit of scored) {
+        const part = `[Source: ${hit.document_name}]\n${hit.content}`;
+        if (usedChars + part.length > maxContextChars) {
+          break;
+        }
+        contextParts.push(part);
+        usedChars += part.length + 10; // Account for separator
+      }
+      const context = contextParts.join('\n\n---\n\n');
+      const prompt = `You are a helpful assistant. Answer based ONLY on the following context:\n\n${context}\n\nQuestion: ${options.query}\n\nAnswer:`;
+
+      // Generate the answer with the local model.
+      const generationSettings = {
+        temperature: options.temperature,
+        maxTokens: options.maxTokens,
+      };
+      const response = await this.generate(options.modelId, prompt, generationSettings);
+
+      const retrievedChunks = scored.map(hit => ({
+        id: hit.id,
+        documentId: hit.document_id,
+        documentName: hit.document_name,
+        content: hit.content,
+        similarityScore: hit.similarityScore,
+        metadata: hit.metadata
+      }));
+      return {
+        query: options.query,
+        retrievedChunks,
+        generatedResponse: response,
+        context,
+        latencyMs: Date.now() - startTime,
+        tierUsed: 3,
+      };
+    } catch (error: any) {
+      this.emitEvent('error', { stage: 'rag_offline', error: error.message });
+      throw new Error(`Offline RAG failed: ${error.message}`);
+    }
+  }
+  /**
+   * Sync a knowledge base for offline use (Tier 3).
+   * Requests a sync package (chunks + embeddings) from the server and keeps
+   * it in this instance's offline index map for `ragQueryOffline`.
+   *
+   * @param knowledgeBaseId - Knowledge base to sync.
+   * @param deviceId - Device identifier sent to the server; falls back to
+   *                   `this.deviceId`, then to the literal 'sdk-device'.
+   * @returns Chunk count, package size in MB and expiry as reported by the server.
+   * @throws Error prefixed with "Sync failed:" when not authenticated, when the
+   *         request fails, or when the server response carries no usable sync
+   *         package (an `error` event with stage `rag_sync` is emitted first).
+   */
+  async syncKnowledgeBase(knowledgeBaseId: string, deviceId?: string): Promise<{ chunkCount: number; sizeMb: number; expiresAt: string }> {
+    try {
+      if (!this.token) throw new Error('Not authenticated. Call init() first.');
+      const response = await axios.post(
+        `${this.apiUrl}/api/rag/knowledge-bases/${knowledgeBaseId}/sync`,
+        { device_id: deviceId || this.deviceId || 'sdk-device' },
+        { headers: { Authorization: `Bearer ${this.token}` } }
+      );
+      const { sync_package, chunk_count, package_size_mb, expires_at } = response.data;
+      // ragQueryOffline reads `metadata.expires_at` and iterates `chunks`;
+      // reject a package lacking either now instead of storing it and failing
+      // later with an unrelated TypeError.
+      if (!sync_package || !sync_package.metadata || !Array.isArray(sync_package.chunks)) {
+        throw new Error('Server returned an invalid sync package');
+      }
+      this.offlineIndexes.set(knowledgeBaseId, sync_package);
+      return {
+        chunkCount: chunk_count,
+        sizeMb: package_size_mb,
+        expiresAt: expires_at
+      };
+    } catch (error: any) {
+      // Report through the event channel like the other RAG methods do.
+      this.emitEvent('error', { stage: 'rag_sync', error: error.message });
+      throw new Error(`Sync failed: ${error.message}`);
+    }
+  }
+  // --- RAG Helper Methods ---
+  /**
+   * Loads the `Xenova/all-MiniLM-L6-v2` feature-extraction pipeline into
+   * `this.localEmbeddingModel`, reporting progress before and after.
+   * On failure an 'error' progress update is emitted and the original error
+   * is rethrown unchanged.
+   */
+  private async loadEmbeddingModel(): Promise<void> {
+    this.emitProgress('downloading', 0, 'Loading embedding model (all-MiniLM-L6-v2)...');
+    try {
+      // Imported lazily so the library is only pulled in when RAG is used.
+      const transformers = await import('@huggingface/transformers');
+      const extractor = await transformers.pipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2');
+      this.localEmbeddingModel = extractor;
+      this.emitProgress('ready', 100, 'Embedding model loaded');
+    } catch (failure: any) {
+      this.emitProgress('error', 0, `Embedding model failed: ${failure.message}`);
+      throw failure;
+    }
+  }
+  /**
+   * Embeds one text with the loaded embedding pipeline (mean pooling,
+   * normalized) and returns the vector as a plain number array.
+   *
+   * @throws Error when the embedding model has not been loaded, or when the
+   *         pipeline output is in none of the recognised shapes.
+   */
+  private async embedTextLocal(text: string): Promise<number[]> {
+    if (!this.localEmbeddingModel) {
+      throw new Error('Embedding model not loaded');
+    }
+    const output = await this.localEmbeddingModel(text, { pooling: 'mean', normalize: true });
+
+    // The output shape differs between transformers versions (v2 vs v3);
+    // probe in order: typed-array `data`, `tolist()`, then a plain array.
+    if (output.data) {
+      return Array.from(output.data);
+    }
+    if (output.tolist) {
+      return output.tolist().flat();
+    }
+    if (Array.isArray(output)) {
+      return output.flat();
+    }
+    throw new Error('Unexpected embedding output format');
+  }
+  /**
+   * Cosine similarity of two equal-length vectors.
+   *
+   * @returns A value in [-1, 1], or 0 when either vector has zero magnitude
+   *          (including two empty vectors).
+   * @throws Error when the vectors differ in length. This happens when stored
+   *         embeddings come from a different model than the query embedding;
+   *         without the check the result would be NaN or a score computed
+   *         over a prefix only, silently corrupting the ranking.
+   */
+  private cosineSimilarity(a: number[], b: number[]): number {
+    if (a.length !== b.length) {
+      throw new Error(`Embedding dimension mismatch: ${a.length} vs ${b.length}`);
+    }
+    let dot = 0, normA = 0, normB = 0;
+    for (let i = 0; i < a.length; i++) {
+      dot += a[i] * b[i];
+      normA += a[i] * a[i];
+      normB += b[i] * b[i];
+    }
+    const denom = Math.sqrt(normA) * Math.sqrt(normB);
+    // Zero-magnitude vectors have no direction; report "no similarity".
+    return denom === 0 ? 0 : dot / denom;
+  }
+  /**
+   * Splits text into overlapping, character-based chunks.
+   *
+   * Each window is `chunkSize` characters; when a '.' or newline falls in the
+   * second half of the window, the chunk is cut just after it instead.
+   * Consecutive chunks share `overlap` characters. Chunks are trimmed, and
+   * those of 20 characters or fewer are dropped.
+   *
+   * @param text - Text to split; empty input yields `[]`.
+   * @param chunkSize - Window size in characters; must be positive.
+   * @param overlap - Characters shared between neighbours; values >= chunkSize
+   *                  are replaced by a quarter of chunkSize, negatives by 0.
+   * @returns The chunks in document order.
+   * @throws Error when `chunkSize` is not a positive number (this previously
+   *         looped forever).
+   */
+  private chunkTextLocal(text: string, chunkSize: number = 512, overlap: number = 128): string[] {
+    if (!text || text.length === 0) return [];
+    if (!(chunkSize > 0)) throw new Error('chunkSize must be a positive number');
+    if (overlap >= chunkSize) overlap = Math.floor(chunkSize * 0.25);
+    if (!(overlap > 0)) overlap = 0; // negative or NaN overlap means no overlap
+    const chunks: string[] = [];
+    let start = 0;
+    while (start < text.length) {
+      let end = start + chunkSize;
+      if (end < text.length) {
+        // Prefer a sentence/line boundary, but only if it keeps the chunk
+        // longer than half a window.
+        const bp = Math.max(text.lastIndexOf('.', end), text.lastIndexOf('\n', end));
+        if (bp > start + chunkSize / 2) end = bp + 1;
+      }
+      const chunk = text.slice(start, end).trim();
+      if (chunk.length > 20) chunks.push(chunk);
+      // This window reached the end of the text. Stepping back by `overlap`
+      // would only emit a tail already contained in the chunk just produced.
+      if (end >= text.length) break;
+      // Guarantee forward progress: after a boundary cut, a large overlap can
+      // put `end - overlap` at or before `start`.
+      const next = end - overlap;
+      start = next > start ? next : end;
+    }
+    return chunks;
+  }
   // ── Static OpenAI Compatible Factory ────────────────────────────────
   static openaiCompatible(config: { apiKey: string; apiUrl?: string; fallback?: FallbackConfig }): OpenAICompatibleClient {
@@ -1045,4 +1502,8 @@ export type {
   FallbackConfig,
   FallbackProvider,
   OpenAICompatibleClient,
+  RAGOptions,
+  RAGChunk,
+  RAGResponse,
+  OfflineIndex,
 };