@emilshirokikh/slyos-sdk 1.3.2 → 1.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -81,6 +81,25 @@ function selectQuantization(memoryMB, modelId) {
81
81
  }
82
82
  return 'q4'; // fallback
83
83
  }
84
+ // ─── Context Window Detection ──────────────────────────────────────
85
/**
 * Look up a HuggingFace model's context window from its published config.json.
 *
 * Several field names are checked in priority order; the first one holding a
 * positive, finite number (or a numeric string) wins. Anything else — a missing
 * field, a non-numeric value, a non-object response body, or a failed request —
 * results in the 2048-token default, so this function never rejects.
 *
 * @param {string} hfModelId - HuggingFace repository id, e.g. "org/model".
 * @returns {Promise<number>} Context window in tokens (2048 when it cannot be determined).
 */
async function detectContextWindowFromHF(hfModelId) {
    const DEFAULT_CONTEXT_WINDOW = 2048;
    // Priority order matters: the first usable field is returned.
    const CANDIDATE_FIELDS = ['max_position_embeddings', 'n_positions', 'max_seq_len', 'model_max_length'];
    try {
        const configUrl = `https://huggingface.co/${hfModelId}/raw/main/config.json`;
        const response = await axios.get(configUrl, { timeout: 5000 });
        const config = response?.data;
        // A non-JSON body (e.g. an HTML error page) arrives as a string; don't probe it.
        if (config === null || typeof config !== 'object') {
            return DEFAULT_CONTEXT_WINDOW;
        }
        for (const field of CANDIDATE_FIELDS) {
            const raw = config[field];
            const value = typeof raw === 'string' && raw.trim() !== '' ? Number(raw) : raw;
            // Only a real, positive number is safe to feed into the context-budget arithmetic downstream.
            if (typeof value === 'number' && Number.isFinite(value) && value > 0) {
                return value;
            }
        }
        return DEFAULT_CONTEXT_WINDOW;
    }
    catch {
        // Deliberate best-effort: detection failure must not block model loading.
        return DEFAULT_CONTEXT_WINDOW;
    }
}
84
103
  // ─── Device Profiling ───────────────────────────────────────────────
85
104
  async function profileDevice() {
86
105
  const isNode = typeof window === 'undefined';
@@ -148,6 +167,12 @@ class SlyOS {
148
167
  this.token = null;
149
168
  this.models = new Map();
150
169
  this.deviceProfile = null;
170
+ this.modelContextWindow = 0;
171
+ // ═══════════════════════════════════════════════════════════
172
+ // RAG — Retrieval Augmented Generation
173
+ // ═══════════════════════════════════════════════════════════
174
+ this.localEmbeddingModel = null;
175
+ this.offlineIndexes = new Map();
151
176
  this.apiKey = config.apiKey;
152
177
  this.apiUrl = config.apiUrl || 'https://api.slyos.world';
153
178
  this.deviceId = `device-${Date.now()}-${Math.random().toString(36).substr(2, 9)}`;
@@ -177,6 +202,9 @@ class SlyOS {
177
202
  getDeviceProfile() {
178
203
  return this.deviceProfile;
179
204
  }
205
+ getModelContextWindow() {
206
+ return this.modelContextWindow;
207
+ }
180
208
  // ── Smart Model Recommendation ──────────────────────────────────
181
209
  recommendModel(category = 'llm') {
182
210
  if (!this.deviceProfile) {
@@ -271,6 +299,31 @@ class SlyOS {
271
299
  }
272
300
  return Object.fromEntries(Object.entries(grouped).map(([cat, models]) => [cat, { models }]));
273
301
  }
302
+ async searchModels(query, options) {
303
+ try {
304
+ const limit = options?.limit || 20;
305
+ const filters = ['onnx']; // Filter for ONNX models only
306
+ if (options?.task) {
307
+ filters.push(options.task);
308
+ }
309
+ const filterString = filters.map(f => `"${f}"`).join(',');
310
+ const url = `https://huggingface.co/api/models?search=${encodeURIComponent(query)}&filter=${encodeURIComponent(`[${filterString}]`)}&sort=downloads&direction=-1&limit=${limit}`;
311
+ const response = await axios.get(url, { timeout: 10000 });
312
+ const models = Array.isArray(response.data) ? response.data : [];
313
+ return models.map((model) => ({
314
+ id: model.id,
315
+ name: model.id.split('/')[1] || model.id,
316
+ downloads: model.downloads || 0,
317
+ likes: model.likes || 0,
318
+ task: model.task || 'unknown',
319
+ size_category: model.size_category || 'unknown',
320
+ }));
321
+ }
322
+ catch (error) {
323
+ this.emitEvent('error', { stage: 'model_search', error: error.message });
324
+ throw new Error(`Model search failed: ${error.message}`);
325
+ }
326
+ }
274
327
  canRunModel(modelId, quant) {
275
328
  const info = modelMap[modelId];
276
329
  if (!info)
@@ -297,25 +350,37 @@ class SlyOS {
297
350
  }
298
351
  async loadModel(modelId, options) {
299
352
  const info = modelMap[modelId];
300
- if (!info) {
301
- const available = Object.keys(modelMap).join(', ');
302
- throw new Error(`Unknown model "${modelId}". Available: ${available}`);
303
- }
304
- // Determine quantization
305
- let quant = options?.quant || 'fp32';
306
- if (!options?.quant && this.deviceProfile) {
307
- quant = selectQuantization(this.deviceProfile.memoryMB, modelId);
308
- this.emitProgress('downloading', 0, `Auto-selected ${quant.toUpperCase()} quantization for your device`);
309
- }
310
- // Check feasibility
311
- const check = this.canRunModel(modelId, quant);
312
- if (!check.canRun) {
313
- this.emitProgress('error', 0, check.reason);
314
- throw new Error(check.reason);
315
- }
316
- const estimatedSize = info.sizesMB[quant];
317
- this.emitProgress('downloading', 0, `Downloading ${modelId} (${quant.toUpperCase()}, ~${estimatedSize}MB)...`);
318
- this.emitEvent('model_download_start', { modelId, quant, estimatedSizeMB: estimatedSize });
353
+ let hfModelId;
354
+ let task;
355
+ let estimatedSize;
356
+ // Handle curated models
357
+ if (info) {
358
+ hfModelId = info.hfModel;
359
+ task = info.task;
360
+ // Determine quantization
361
+ let quant = options?.quant || 'fp32';
362
+ if (!options?.quant && this.deviceProfile) {
363
+ quant = selectQuantization(this.deviceProfile.memoryMB, modelId);
364
+ this.emitProgress('downloading', 0, `Auto-selected ${quant.toUpperCase()} quantization for your device`);
365
+ }
366
+ // Check feasibility
367
+ const check = this.canRunModel(modelId, quant);
368
+ if (!check.canRun) {
369
+ this.emitProgress('error', 0, check.reason);
370
+ throw new Error(check.reason);
371
+ }
372
+ estimatedSize = info.sizesMB[quant];
373
+ this.emitProgress('downloading', 0, `Downloading ${modelId} (${quant.toUpperCase()}, ~${estimatedSize}MB)...`);
374
+ this.emitEvent('model_download_start', { modelId, quant, estimatedSizeMB: estimatedSize });
375
+ }
376
+ else {
377
+ // Handle custom HuggingFace models
378
+ hfModelId = modelId;
379
+ task = 'text-generation'; // Default task
380
+ estimatedSize = 2048; // Default estimate
381
+ this.emitProgress('downloading', 0, `Loading custom HuggingFace model: ${modelId}...`);
382
+ this.emitEvent('model_download_start', { modelId, custom: true, estimatedSizeMB: estimatedSize });
383
+ }
319
384
  // Map quant to dtype for HuggingFace
320
385
  const dtypeMap = {
321
386
  q4: 'q4',
@@ -326,9 +391,14 @@ class SlyOS {
326
391
  let lastReportedPercent = 0;
327
392
  const startTime = Date.now();
328
393
  try {
329
- const pipe = await pipeline(info.task, info.hfModel, {
394
+ // For custom HF models, detect context window
395
+ let detectedContextWindow = 2048;
396
+ if (!info) {
397
+ detectedContextWindow = await detectContextWindowFromHF(hfModelId);
398
+ }
399
+ const pipe = await pipeline(task, hfModelId, {
330
400
  device: 'cpu',
331
- dtype: dtypeMap[quant],
401
+ dtype: 'q4', // Default to q4 for stability
332
402
  progress_callback: (progressData) => {
333
403
  // HuggingFace transformers sends progress events during download
334
404
  if (progressData && typeof progressData === 'object') {
@@ -357,12 +427,22 @@ class SlyOS {
357
427
  },
358
428
  });
359
429
  const loadTime = Date.now() - startTime;
360
- const contextWindow = this.deviceProfile
361
- ? recommendContextWindow(this.deviceProfile.memoryMB, quant)
362
- : 2048;
363
- this.models.set(modelId, { pipe, info, quant, contextWindow });
364
- this.emitProgress('ready', 100, `${modelId} loaded (${quant.toUpperCase()}, ${(loadTime / 1000).toFixed(1)}s, ctx: ${contextWindow})`);
365
- this.emitEvent('model_loaded', { modelId, quant, loadTimeMs: loadTime, contextWindow });
430
+ let contextWindow;
431
+ if (info) {
432
+ // For curated models, use recommendContextWindow
433
+ const quant = options?.quant || (this.deviceProfile ? selectQuantization(this.deviceProfile.memoryMB, modelId) : 'q4');
434
+ contextWindow = this.deviceProfile
435
+ ? recommendContextWindow(this.deviceProfile.memoryMB, quant)
436
+ : 2048;
437
+ }
438
+ else {
439
+ // For custom HF models, use detected context window
440
+ contextWindow = detectedContextWindow;
441
+ }
442
+ this.modelContextWindow = contextWindow;
443
+ this.models.set(modelId, { pipe, info, quant: 'q4', contextWindow });
444
+ this.emitProgress('ready', 100, `${modelId} loaded (q4, ${(loadTime / 1000).toFixed(1)}s, ctx: ${contextWindow})`);
445
+ this.emitEvent('model_loaded', { modelId, quant: 'q4', loadTimeMs: loadTime, contextWindow });
366
446
  // Telemetry
367
447
  if (this.token) {
368
448
  await axios.post(`${this.apiUrl}/api/telemetry`, {
@@ -370,7 +450,7 @@ class SlyOS {
370
450
  event_type: 'model_load',
371
451
  model_id: modelId,
372
452
  success: true,
373
- metadata: { quant, loadTimeMs: loadTime, contextWindow },
453
+ metadata: { quant: 'q4', loadTimeMs: loadTime, contextWindow, custom: !info },
374
454
  }, {
375
455
  headers: { Authorization: `Bearer ${this.token}` },
376
456
  }).catch(() => { });
@@ -760,6 +840,284 @@ class SlyOS {
760
840
  };
761
841
  return modelMapping[slyModelId] || 'gpt-4o-mini';
762
842
  }
843
+ /**
844
+ * Tier 2: Cloud-indexed RAG with local inference.
845
+ * Retrieves relevant chunks from server, generates response locally.
846
+ */
847
+ async ragQuery(options) {
848
+ const startTime = Date.now();
849
+ try {
850
+ if (!this.token)
851
+ throw new Error('Not authenticated. Call init() first.');
852
+ // Step 1: Retrieve relevant chunks from backend
853
+ const searchResponse = await axios.post(`${this.apiUrl}/api/rag/knowledge-bases/${options.knowledgeBaseId}/query`, {
854
+ query: options.query,
855
+ top_k: options.topK || 5,
856
+ model_id: options.modelId
857
+ }, { headers: { Authorization: `Bearer ${this.token}` } });
858
+ let { retrieved_chunks, prompt_template, context } = searchResponse.data;
859
+ // Apply context window limits
860
+ const contextWindow = this.modelContextWindow || 2048;
861
+ const maxContextChars = (contextWindow - 200) * 3; // Rough token-to-char ratio, reserving 200 tokens
862
+ if (context && context.length > maxContextChars) {
863
+ context = context.substring(0, maxContextChars) + '...';
864
+ }
865
+ // Step 2: Generate response locally using the augmented prompt
866
+ const response = await this.generate(options.modelId, prompt_template, {
867
+ temperature: options.temperature,
868
+ maxTokens: options.maxTokens,
869
+ });
870
+ return {
871
+ query: options.query,
872
+ retrievedChunks: retrieved_chunks.map((c) => ({
873
+ id: c.id,
874
+ documentId: c.document_id,
875
+ documentName: c.document_name,
876
+ content: c.content,
877
+ similarityScore: c.similarity_score,
878
+ metadata: c.metadata
879
+ })),
880
+ generatedResponse: response,
881
+ context,
882
+ latencyMs: Date.now() - startTime,
883
+ tierUsed: 2,
884
+ };
885
+ }
886
+ catch (error) {
887
+ this.emitEvent('error', { stage: 'rag_query', error: error.message });
888
+ throw new Error(`RAG query failed: ${error.message}`);
889
+ }
890
+ }
891
+ /**
892
+ * Tier 1: Fully local RAG. Zero network calls.
893
+ * Documents are chunked/embedded on-device, retrieval and generation all local.
894
+ */
895
+ async ragQueryLocal(options) {
896
+ const startTime = Date.now();
897
+ try {
898
+ // Step 1: Load embedding model if needed
899
+ if (!this.localEmbeddingModel) {
900
+ await this.loadEmbeddingModel();
901
+ }
902
+ // Adapt chunk size based on context window for efficiency
903
+ const contextWindow = this.modelContextWindow || 2048;
904
+ const chunkSize = contextWindow <= 1024 ? 256 : contextWindow <= 2048 ? 512 : 1024;
905
+ const overlap = Math.floor(chunkSize / 4);
906
+ // Step 2: Chunk documents if not already chunked
907
+ const allChunks = [];
908
+ for (const doc of options.documents) {
909
+ const chunks = this.chunkTextLocal(doc.content, chunkSize, overlap);
910
+ for (const chunk of chunks) {
911
+ const embedding = await this.embedTextLocal(chunk);
912
+ allChunks.push({ content: chunk, documentName: doc.name || 'Document', embedding });
913
+ }
914
+ }
915
+ // Step 3: Embed query
916
+ const queryEmbedding = await this.embedTextLocal(options.query);
917
+ // Step 4: Cosine similarity search
918
+ const scored = allChunks
919
+ .filter(c => c.embedding)
920
+ .map(c => ({
921
+ ...c,
922
+ similarityScore: this.cosineSimilarity(queryEmbedding, c.embedding)
923
+ }))
924
+ .sort((a, b) => b.similarityScore - a.similarityScore)
925
+ .slice(0, options.topK || 5);
926
+ // Step 5: Build context with size limits
927
+ const maxContextChars = (contextWindow - 200) * 3; // Rough token-to-char ratio, reserving 200 tokens
928
+ let contextLength = 0;
929
+ const contextParts = [];
930
+ for (const c of scored) {
931
+ const part = `[Source: ${c.documentName}]\n${c.content}`;
932
+ if (contextLength + part.length <= maxContextChars) {
933
+ contextParts.push(part);
934
+ contextLength += part.length + 10; // Account for separator
935
+ }
936
+ else {
937
+ break;
938
+ }
939
+ }
940
+ const context = contextParts.join('\n\n---\n\n');
941
+ const prompt = `You are a helpful assistant. Answer based ONLY on the following context:\n\n${context}\n\nQuestion: ${options.query}\n\nAnswer:`;
942
+ // Step 6: Generate locally
943
+ const response = await this.generate(options.modelId, prompt, {
944
+ temperature: options.temperature,
945
+ maxTokens: options.maxTokens,
946
+ });
947
+ return {
948
+ query: options.query,
949
+ retrievedChunks: scored.map((c, i) => ({
950
+ id: `local-${i}`,
951
+ documentId: 'local',
952
+ documentName: c.documentName,
953
+ content: c.content,
954
+ similarityScore: c.similarityScore,
955
+ metadata: {}
956
+ })),
957
+ generatedResponse: response,
958
+ context,
959
+ latencyMs: Date.now() - startTime,
960
+ tierUsed: 1,
961
+ };
962
+ }
963
+ catch (error) {
964
+ this.emitEvent('error', { stage: 'rag_local', error: error.message });
965
+ throw new Error(`Local RAG failed: ${error.message}`);
966
+ }
967
+ }
968
+ /**
969
+ * Tier 3: Offline RAG using a synced knowledge base.
970
+ * First call syncKnowledgeBase(), then use this for offline queries.
971
+ */
972
+ async ragQueryOffline(options) {
973
+ const startTime = Date.now();
974
+ const index = this.offlineIndexes.get(options.knowledgeBaseId);
975
+ if (!index) {
976
+ throw new Error(`Knowledge base "${options.knowledgeBaseId}" not synced. Call syncKnowledgeBase() first.`);
977
+ }
978
+ // Check expiry
979
+ if (new Date(index.metadata.expires_at) < new Date()) {
980
+ throw new Error('Offline index has expired. Please re-sync.');
981
+ }
982
+ try {
983
+ // Load embedding model
984
+ if (!this.localEmbeddingModel) {
985
+ await this.loadEmbeddingModel();
986
+ }
987
+ // Embed query
988
+ const queryEmbedding = await this.embedTextLocal(options.query);
989
+ // Search offline index
990
+ const scored = index.chunks
991
+ .filter(c => c.embedding && c.embedding.length > 0)
992
+ .map(c => ({
993
+ ...c,
994
+ similarityScore: this.cosineSimilarity(queryEmbedding, c.embedding)
995
+ }))
996
+ .sort((a, b) => b.similarityScore - a.similarityScore)
997
+ .slice(0, options.topK || 5);
998
+ // Build context with size limits
999
+ const contextWindow = this.modelContextWindow || 2048;
1000
+ const maxContextChars = (contextWindow - 200) * 3; // Rough token-to-char ratio, reserving 200 tokens
1001
+ let contextLength = 0;
1002
+ const contextParts = [];
1003
+ for (const c of scored) {
1004
+ const part = `[Source: ${c.document_name}]\n${c.content}`;
1005
+ if (contextLength + part.length <= maxContextChars) {
1006
+ contextParts.push(part);
1007
+ contextLength += part.length + 10; // Account for separator
1008
+ }
1009
+ else {
1010
+ break;
1011
+ }
1012
+ }
1013
+ const context = contextParts.join('\n\n---\n\n');
1014
+ const prompt = `You are a helpful assistant. Answer based ONLY on the following context:\n\n${context}\n\nQuestion: ${options.query}\n\nAnswer:`;
1015
+ // Generate locally
1016
+ const response = await this.generate(options.modelId, prompt, {
1017
+ temperature: options.temperature,
1018
+ maxTokens: options.maxTokens,
1019
+ });
1020
+ return {
1021
+ query: options.query,
1022
+ retrievedChunks: scored.map(c => ({
1023
+ id: c.id,
1024
+ documentId: c.document_id,
1025
+ documentName: c.document_name,
1026
+ content: c.content,
1027
+ similarityScore: c.similarityScore,
1028
+ metadata: c.metadata
1029
+ })),
1030
+ generatedResponse: response,
1031
+ context,
1032
+ latencyMs: Date.now() - startTime,
1033
+ tierUsed: 3,
1034
+ };
1035
+ }
1036
+ catch (error) {
1037
+ this.emitEvent('error', { stage: 'rag_offline', error: error.message });
1038
+ throw new Error(`Offline RAG failed: ${error.message}`);
1039
+ }
1040
+ }
1041
+ /**
1042
+ * Sync a knowledge base for offline use (Tier 3).
1043
+ * Downloads chunks + embeddings from server, stores locally.
1044
+ */
1045
+ async syncKnowledgeBase(knowledgeBaseId, deviceId) {
1046
+ try {
1047
+ if (!this.token)
1048
+ throw new Error('Not authenticated. Call init() first.');
1049
+ const response = await axios.post(`${this.apiUrl}/api/rag/knowledge-bases/${knowledgeBaseId}/sync`, { device_id: deviceId || this.deviceId || 'sdk-device' }, { headers: { Authorization: `Bearer ${this.token}` } });
1050
+ const { sync_package, chunk_count, package_size_mb, expires_at } = response.data;
1051
+ this.offlineIndexes.set(knowledgeBaseId, sync_package);
1052
+ return {
1053
+ chunkCount: chunk_count,
1054
+ sizeMb: package_size_mb,
1055
+ expiresAt: expires_at
1056
+ };
1057
+ }
1058
+ catch (error) {
1059
+ throw new Error(`Sync failed: ${error.message}`);
1060
+ }
1061
+ }
1062
+ // --- RAG Helper Methods ---
1063
+ async loadEmbeddingModel() {
1064
+ this.emitProgress('downloading', 0, 'Loading embedding model (all-MiniLM-L6-v2)...');
1065
+ try {
1066
+ const { pipeline } = await import('@huggingface/transformers');
1067
+ this.localEmbeddingModel = await pipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2');
1068
+ this.emitProgress('ready', 100, 'Embedding model loaded');
1069
+ }
1070
+ catch (error) {
1071
+ this.emitProgress('error', 0, `Embedding model failed: ${error.message}`);
1072
+ throw error;
1073
+ }
1074
+ }
1075
+ async embedTextLocal(text) {
1076
+ if (!this.localEmbeddingModel)
1077
+ throw new Error('Embedding model not loaded');
1078
+ const result = await this.localEmbeddingModel(text, { pooling: 'mean', normalize: true });
1079
+ // Handle different tensor output formats (v2 vs v3 of transformers)
1080
+ if (result.data)
1081
+ return Array.from(result.data);
1082
+ if (result.tolist)
1083
+ return result.tolist().flat();
1084
+ if (Array.isArray(result))
1085
+ return result.flat();
1086
+ throw new Error('Unexpected embedding output format');
1087
+ }
1088
+ cosineSimilarity(a, b) {
1089
+ let dot = 0, normA = 0, normB = 0;
1090
+ for (let i = 0; i < a.length; i++) {
1091
+ dot += a[i] * b[i];
1092
+ normA += a[i] * a[i];
1093
+ normB += b[i] * b[i];
1094
+ }
1095
+ const denom = Math.sqrt(normA) * Math.sqrt(normB);
1096
+ return denom === 0 ? 0 : dot / denom;
1097
+ }
1098
+ chunkTextLocal(text, chunkSize = 512, overlap = 128) {
1099
+ if (!text || text.length === 0)
1100
+ return [];
1101
+ if (overlap >= chunkSize)
1102
+ overlap = Math.floor(chunkSize * 0.25);
1103
+ const chunks = [];
1104
+ let start = 0;
1105
+ while (start < text.length) {
1106
+ let end = start + chunkSize;
1107
+ if (end < text.length) {
1108
+ const bp = Math.max(text.lastIndexOf('.', end), text.lastIndexOf('\n', end));
1109
+ if (bp > start + chunkSize / 2)
1110
+ end = bp + 1;
1111
+ }
1112
+ const chunk = text.slice(start, end).trim();
1113
+ if (chunk.length > 20)
1114
+ chunks.push(chunk);
1115
+ start = end - overlap;
1116
+ if (start >= text.length)
1117
+ break;
1118
+ }
1119
+ return chunks;
1120
+ }
763
1121
  // ── Static OpenAI Compatible Factory ────────────────────────────────
764
1122
  static openaiCompatible(config) {
765
1123
  const instance = new SlyOS({
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@emilshirokikh/slyos-sdk",
3
- "version": "1.3.2",
3
+ "version": "1.3.3",
4
4
  "description": "SlyOS - On-Device AI SDK for Web and Node.js",
5
5
  "main": "dist/index.js",
6
6
  "types": "dist/index.d.ts",