@beltoinc/slyos-sdk 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js ADDED
@@ -0,0 +1,1483 @@
1
+ import axios from 'axios';
2
+ import { pipeline, env } from '@huggingface/transformers';
3
// @ts-ignore - Force CPU in Node.js
// Disable the WASM worker proxy when the ONNX/WASM backend is present,
// so inference runs in-process on the CPU.
const onnxWasm = env.backends?.onnx?.wasm;
if (onnxWasm) {
    onnxWasm.proxy = false;
}
7
// ─── Model Registry ─────────────────────────────────────────────────
// Curated models the SDK can size and load. Per entry:
//   hfModel   — backing HuggingFace repo id passed to pipeline()
//   task      — transformers.js pipeline task name
//   category  — 'llm' (text generation) or 'stt' (speech recognition)
//   sizesMB   — approximate download size per quantization level
//   minRAM_MB — minimum device RAM per quantization level (used by
//               selectQuantization / canRunModel feasibility checks)
const modelMap = {
    // LLM models (1B+)
    'quantum-1.7b': {
        hfModel: 'HuggingFaceTB/SmolLM2-1.7B-Instruct',
        task: 'text-generation',
        category: 'llm',
        sizesMB: { q4: 900, q8: 1700, fp16: 3400, fp32: 6800 },
        minRAM_MB: { q4: 2048, q8: 3072, fp16: 5120, fp32: 8192 },
    },
    'quantum-3b': {
        hfModel: 'Qwen/Qwen2.5-3B-Instruct',
        task: 'text-generation',
        category: 'llm',
        sizesMB: { q4: 1600, q8: 3200, fp16: 6400, fp32: 12800 },
        minRAM_MB: { q4: 3072, q8: 5120, fp16: 8192, fp32: 16384 },
    },
    'quantum-code-3b': {
        hfModel: 'Qwen/Qwen2.5-Coder-3B-Instruct',
        task: 'text-generation',
        category: 'llm',
        sizesMB: { q4: 1600, q8: 3200, fp16: 6400, fp32: 12800 },
        minRAM_MB: { q4: 3072, q8: 5120, fp16: 8192, fp32: 16384 },
    },
    // NOTE: id says "8b" but the backing repo is the 7B Qwen model.
    'quantum-8b': {
        hfModel: 'Qwen/Qwen2.5-7B-Instruct',
        task: 'text-generation',
        category: 'llm',
        sizesMB: { q4: 4200, q8: 8400, fp16: 16800, fp32: 33600 },
        minRAM_MB: { q4: 6144, q8: 10240, fp16: 20480, fp32: 40960 },
    },
    // STT models
    'voicecore-base': {
        hfModel: 'onnx-community/whisper-base',
        task: 'automatic-speech-recognition',
        category: 'stt',
        sizesMB: { q4: 40, q8: 75, fp16: 150, fp32: 300 },
        minRAM_MB: { q4: 512, q8: 512, fp16: 1024, fp32: 2048 },
    },
    'voicecore-small': {
        hfModel: 'onnx-community/whisper-small',
        task: 'automatic-speech-recognition',
        category: 'stt',
        sizesMB: { q4: 100, q8: 200, fp16: 400, fp32: 800 },
        minRAM_MB: { q4: 1024, q8: 1024, fp16: 2048, fp32: 4096 },
    },
};
54
// ─── Context Window Sizing ──────────────────────────────────────────
// Picks a context window from available RAM and quantization level.
// Smaller quants leave more RAM for KV cache, so they start from a
// smaller base that scales up with device memory (capped at 32k/16k).
function recommendContextWindow(memoryMB, quant) {
    const baseByQuant = { q4: 1024, q8: 2048, fp16: 4096 };
    const base = baseByQuant[quant] ?? 8192; // fp32 (or unknown) → 8192
    if (memoryMB >= 16384) {
        return Math.min(base * 4, 32768);
    }
    if (memoryMB >= 8192) {
        return Math.min(base * 2, 16384);
    }
    if (memoryMB >= 4096) {
        return base;
    }
    // Low-memory devices: halve the base but never go below 512.
    return Math.max(512, Math.floor(base / 2));
}
66
// Chooses the quantization level for a model given device RAM.
// Unknown model ids fall back to q4.
function selectQuantization(memoryMB, modelId) {
    const info = modelMap[modelId];
    if (!info) {
        return 'q4';
    }
    // ONNX/WASM has protobuf size limits — fp16 files >2GB crash on many systems.
    // For LLMs, cap at q4 via WASM. FP16/Q8 need native backends (llama.cpp).
    // STT models are small enough for q8/fp16.
    if (info.category === 'llm') {
        return 'q4'; // safest for ONNX/WASM across all platforms
    }
    // STT: pick the highest-quality quant whose RAM floor the device meets.
    const preferred = ['fp16', 'q8', 'q4'].find((q) => memoryMB >= info.minRAM_MB[q]);
    return preferred ?? 'q4';
}
84
// ─── Context Window Detection ──────────────────────────────────────
// Fetches a HuggingFace model's config.json and reads its context
// window. Different model families name the field differently, so
// several names are tried in order. Returns 2048 on any failure.
async function detectContextWindowFromHF(hfModelId) {
    const FALLBACK = 2048;
    try {
        const configUrl = `https://huggingface.co/${hfModelId}/raw/main/config.json`;
        const { data: config } = await axios.get(configUrl, { timeout: 5000 });
        return (config.max_position_embeddings ||
            config.n_positions ||
            config.max_seq_len ||
            config.model_max_length ||
            FALLBACK);
    }
    catch {
        // Network error / missing config: use a conservative default.
        return FALLBACK;
    }
}
103
// ─── SDK Version ────────────────────────────────────────────────────
// Reported in device registration and the ready banner. Kept in sync
// with the published package version (was stale at '1.4.1' while the
// package itself is 1.5.0).
const SDK_VERSION = '1.5.0';
105
// ─── Persistent Device Identity ─────────────────────────────────────
// SHA-256 hashes a string and returns the first 32 hex characters.
// Uses node:crypto in Node and Web Crypto (crypto.subtle) in browsers.
async function hashString(str) {
    if (typeof window === 'undefined') {
        // Node.js path
        const { createHash } = await import('crypto');
        return createHash('sha256').update(str).digest('hex').substring(0, 32);
    }
    // Browser path: digest raw UTF-8 bytes, then hex-encode.
    const bytes = new TextEncoder().encode(str);
    const digest = await crypto.subtle.digest('SHA-256', bytes);
    const hex = Array.from(new Uint8Array(digest), (b) => b.toString(16).padStart(2, '0')).join('');
    return hex.substring(0, 32);
}
122
// Returns a stable device id, creating and persisting one on first use.
// Node: stored at ~/.slyos/device-id. Browser: stored in localStorage.
// Falls back to a non-persistent id when storage is unavailable.
async function getOrCreateDeviceId() {
    // Time-prefixed random base36 id. Math.random() is not
    // cryptographically secure, which is fine here: this is a label,
    // not a secret. (Replaces deprecated String.prototype.substr.)
    const freshId = () => `device-${Date.now()}-${Math.random().toString(36).slice(2, 14)}`;
    const isNode = typeof window === 'undefined';
    if (isNode) {
        // Node.js: persist in ~/.slyos/device-id
        try {
            const fs = await import('fs');
            const path = await import('path');
            const os = await import('os');
            const slyosDir = path.join(os.homedir(), '.slyos');
            const idFile = path.join(slyosDir, 'device-id');
            try {
                const existing = fs.readFileSync(idFile, 'utf-8').trim();
                if (existing)
                    return existing;
            }
            catch { /* first run: file does not exist yet */ }
            const deviceId = freshId();
            fs.mkdirSync(slyosDir, { recursive: true });
            fs.writeFileSync(idFile, deviceId);
            return deviceId;
        }
        catch {
            // Filesystem unavailable (sandbox/read-only): ephemeral id.
            return freshId();
        }
    }
    else {
        // Browser: persist in localStorage (may throw in private mode).
        const key = 'slyos_device_id';
        try {
            const existing = localStorage.getItem(key);
            if (existing)
                return existing;
        }
        catch { }
        const deviceId = freshId();
        try {
            localStorage.setItem(key, deviceId);
        }
        catch { }
        return deviceId;
    }
}
164
// Builds a hardware fingerprint by hashing environment traits.
// Node: CPU model, total RAM, platform, arch, core count.
// Browser: core count, deviceMemory, platform, WebGL renderer, screen size.
// Returns the 32-hex-char SHA-256 prefix produced by hashString().
async function generateDeviceFingerprint() {
    const isNode = typeof window === 'undefined';
    let components = [];
    if (isNode) {
        try {
            const os = await import('os');
            const cpus = os.cpus();
            components.push(cpus[0]?.model || 'unknown-cpu');
            components.push(String(os.totalmem()));
            components.push(os.platform());
            components.push(os.arch());
            components.push(String(cpus.length));
        }
        catch { } // best effort — an empty component list still hashes
    }
    else {
        components.push(String(navigator.hardwareConcurrency || 0));
        components.push(String(navigator.deviceMemory || 0));
        components.push(navigator.platform || 'unknown');
        // WebGL renderer for GPU fingerprint
        try {
            const canvas = document.createElement('canvas');
            const gl = canvas.getContext('webgl') || canvas.getContext('experimental-webgl');
            if (gl) {
                const ext = gl.getExtension('WEBGL_debug_renderer_info');
                if (ext) {
                    components.push(gl.getParameter(ext.UNMASKED_RENDERER_WEBGL) || 'unknown-gpu');
                }
            }
        }
        catch { } // WebGL may be blocked (e.g. privacy settings)
        components.push(String(screen.width || 0));
        components.push(String(screen.height || 0));
    }
    // '|' delimiter keeps component boundaries unambiguous in the hash input.
    return await hashString(components.join('|'));
}
200
// ─── Enhanced Device Profiling ──────────────────────────────────────
// Detects the GPU via WebGL's debug-renderer extension and estimates
// VRAM from the renderer string. Node (no window) → { renderer: null, vramMb: 0 }.
function detectGPU() {
    const none = { renderer: null, vramMb: 0 };
    if (typeof window === 'undefined') {
        return none;
    }
    try {
        const canvas = document.createElement('canvas');
        const gl = canvas.getContext('webgl') || canvas.getContext('experimental-webgl');
        if (!gl) {
            return none;
        }
        const ext = gl.getExtension('WEBGL_debug_renderer_info');
        const renderer = ext ? gl.getParameter(ext.UNMASKED_RENDERER_WEBGL) : null;
        let vramMb = 0;
        if (renderer) {
            // Explicit "NNN MB" in the renderer string wins outright.
            const explicit = renderer.match(/(\d+)\s*MB/i);
            if (explicit) {
                vramMb = parseInt(explicit[1], 10);
            }
            else {
                // Rough VRAM guesses keyed on GPU family; first match wins.
                const heuristics = [
                    [/RTX\s*40/i, 8192],
                    [/RTX\s*30/i, 6144],
                    [/GTX/i, 4096],
                    [/Apple M[2-4]/i, 8192],
                    [/Apple M1/i, 4096],
                    [/Intel/i, 1024],
                ];
                const hit = heuristics.find(([re]) => re.test(renderer));
                if (hit) {
                    vramMb = hit[1];
                }
            }
        }
        return { renderer, vramMb };
    }
    catch {
        return none;
    }
}
236
// Identifies the host browser (or Node) from the user agent string.
// Probe order matters: Edge UAs also contain "Chrome", and Chrome UAs
// also contain "Safari", so more specific tokens are checked first.
function detectBrowser() {
    if (typeof window === 'undefined' || typeof navigator === 'undefined') {
        return { name: 'node', version: process.version || 'unknown' };
    }
    const ua = navigator.userAgent;
    const probes = [
        ['Edge', /Edg\//i, /Edg\/([\d.]+)/],
        ['Chrome', /Chrome\//i, /Chrome\/([\d.]+)/],
        ['Firefox', /Firefox\//i, /Firefox\/([\d.]+)/],
        // Safari reports its version in a separate "Version/" token.
        ['Safari', /Safari\//i, /Version\/([\d.]+)/],
    ];
    for (const [name, detect, extract] of probes) {
        if (detect.test(ua)) {
            return { name, version: ua.match(extract)?.[1] || '' };
        }
    }
    return { name: 'unknown', version: '' };
}
258
// Reads the Network Information API (vendor-prefixed in some browsers).
// Returns e.g. '4g'/'wifi', or 'unknown' when unsupported.
function detectNetworkType() {
    if (typeof navigator === 'undefined') {
        return 'unknown';
    }
    const conn = navigator.connection || navigator.mozConnection || navigator.webkitConnection;
    return conn ? (conn.effectiveType || conn.type || 'unknown') : 'unknown';
}
266
// Measures round-trip latency to the API health endpoint in ms.
// Tries a cheap HEAD first, falls back to GET for servers that reject
// HEAD, and returns -1 when the API is unreachable.
async function measureApiLatency(apiUrl) {
    const timedProbe = async (method) => {
        const start = Date.now();
        await axios[method](`${apiUrl}/api/health`, { timeout: 5000 });
        return Date.now() - start;
    };
    try {
        return await timedProbe('head');
    }
    catch {
        try {
            return await timedProbe('get');
        }
        catch {
            return -1;
        }
    }
}
283
// ─── Device Profiling ───────────────────────────────────────────────
// Gathers a capability snapshot of the host: CPU/RAM/storage, platform,
// GPU, browser, network, screen, and WASM/WebGPU availability. All
// probes are best-effort; failures fall back to conservative defaults
// (4 cores, 4GB RAM, ~10GB storage).
async function profileDevice() {
    const isNode = typeof window === 'undefined';
    let cpuCores = 4;
    let memoryMB = 4096;
    let estimatedStorageMB = 10000;
    let platform = isNode ? 'nodejs' : 'web';
    let os = 'unknown';
    if (isNode) {
        // Node.js environment
        try {
            const osModule = await import('os');
            cpuCores = osModule.cpus().length;
            memoryMB = Math.round(osModule.totalmem() / (1024 * 1024));
            os = `${osModule.platform()} ${osModule.release()}`;
            // Estimate free disk via df-like check.
            // NOTE(review): assumes POSIX `df -m` column layout (parts[3] =
            // Available); on Windows the fallback "0 0 0 0" path is taken.
            try {
                const { execSync } = await import('child_process');
                const dfOutput = execSync('df -m . 2>/dev/null || echo "0 0 0 0"', { encoding: 'utf-8' });
                const lines = dfOutput.trim().split('\n');
                if (lines.length > 1) {
                    const parts = lines[1].split(/\s+/);
                    estimatedStorageMB = parseInt(parts[3]) || 10000; // Available column
                }
            }
            catch {
                estimatedStorageMB = 10000;
            }
        }
        catch {
            // Fallback: keep the conservative defaults above
        }
    }
    else {
        // Browser environment
        cpuCores = navigator.hardwareConcurrency || 4;
        memoryMB = (navigator.deviceMemory || 4) * 1024; // deviceMemory is in GB
        os = navigator.userAgent; // full UA stands in for an OS string here
        // Storage Manager API (Chrome 61+)
        try {
            if (navigator.storage && navigator.storage.estimate) {
                const estimate = await navigator.storage.estimate();
                estimatedStorageMB = Math.round((estimate.quota || 0) / (1024 * 1024));
            }
        }
        catch {
            estimatedStorageMB = 5000;
        }
    }
    // Baseline quant/context use the smallest curated LLM as reference.
    const recommendedQuant = selectQuantization(memoryMB, 'quantum-1.7b'); // default baseline
    const maxContextWindow = recommendContextWindow(memoryMB, recommendedQuant);
    // Enhanced profiling
    const gpu = detectGPU();
    const browser = detectBrowser();
    const networkType = detectNetworkType();
    const timezone = Intl?.DateTimeFormat?.()?.resolvedOptions?.()?.timeZone || 'unknown';
    let screenWidth = 0, screenHeight = 0, pixelRatio = 0;
    let wasmAvailable = false, webgpuAvailable = false;
    if (!isNode) {
        screenWidth = screen?.width || 0;
        screenHeight = screen?.height || 0;
        pixelRatio = window?.devicePixelRatio || 1;
    }
    // Capability detection
    try {
        wasmAvailable = typeof WebAssembly !== 'undefined';
    }
    catch { }
    if (!isNode) {
        try {
            webgpuAvailable = !!navigator.gpu;
        }
        catch { }
    }
    // Zero/empty probe results are mapped to undefined so that the
    // registration payload can send them as nulls.
    return {
        cpuCores,
        memoryMB,
        estimatedStorageMB,
        platform,
        os,
        recommendedQuant,
        maxContextWindow,
        gpuRenderer: gpu.renderer || undefined,
        gpuVramMb: gpu.vramMb || undefined,
        screenWidth: screenWidth || undefined,
        screenHeight: screenHeight || undefined,
        pixelRatio: pixelRatio || undefined,
        browserName: browser.name,
        browserVersion: browser.version,
        networkType,
        timezone,
        wasmAvailable,
        webgpuAvailable,
    };
}
378
+ class SlyOS {
379
/**
 * @param {object} config
 * @param {string} config.apiKey - SlyOS API key (exchanged for a token in initialize()).
 * @param {string} [config.apiUrl] - API base URL; defaults to the hosted service.
 * @param {Function} [config.onProgress] - Callback for stage/percent progress updates.
 * @param {Function} [config.onEvent] - Callback for typed lifecycle events.
 * @param {object} [config.fallback] - Fallback configuration (stored; usage not shown in this file section).
 */
constructor(config) {
    this.token = null; // bearer token, set by initialize()
    this.models = new Map(); // modelId -> { pipe, info, quant, contextWindow }
    this.deviceProfile = null; // populated by initialize()/analyzeDevice()
    this.modelContextWindow = 0; // context window of most recently loaded model
    // Telemetry batching
    this.telemetryBuffer = [];
    this.telemetryFlushTimer = null;
    // ═══════════════════════════════════════════════════════════
    // RAG — Retrieval Augmented Generation
    // ═══════════════════════════════════════════════════════════
    this.localEmbeddingModel = null;
    this.offlineIndexes = new Map();
    this.apiKey = config.apiKey;
    this.apiUrl = config.apiUrl || 'https://api.slyos.world';
    this.deviceId = ''; // Set asynchronously in initialize()
    this.onProgress = config.onProgress || null;
    this.onEvent = config.onEvent || null;
    this.fallbackConfig = config.fallback || null;
}
399
+ // ── Progress & Event Helpers ────────────────────────────────────
400
+ emitProgress(stage, progress, message, detail) {
401
+ if (this.onProgress) {
402
+ this.onProgress({ stage, progress, message, detail });
403
+ }
404
+ }
405
+ emitEvent(type, data) {
406
+ if (this.onEvent) {
407
+ this.onEvent({ type, data, timestamp: Date.now() });
408
+ }
409
+ }
410
+ // ── Telemetry Batching ─────────────────────────────────────────
411
+ recordTelemetry(entry) {
412
+ this.telemetryBuffer.push(entry);
413
+ if (this.telemetryBuffer.length >= SlyOS.TELEMETRY_BATCH_SIZE) {
414
+ this.flushTelemetry();
415
+ }
416
+ else if (!this.telemetryFlushTimer) {
417
+ this.telemetryFlushTimer = setTimeout(() => this.flushTelemetry(), SlyOS.TELEMETRY_FLUSH_INTERVAL);
418
+ }
419
+ }
420
// Drains the telemetry buffer and POSTs it as one batch.
// Skips when the buffer is empty or the SDK is not yet authenticated.
// On network failure, entries are re-queued (capped at the newest 100
// to bound memory) for the next flush attempt.
async flushTelemetry() {
    if (this.telemetryFlushTimer) {
        clearTimeout(this.telemetryFlushTimer);
        this.telemetryFlushTimer = null;
    }
    if (this.telemetryBuffer.length === 0 || !this.token)
        return;
    // Take a snapshot so entries recorded during the request aren't lost.
    const batch = [...this.telemetryBuffer];
    this.telemetryBuffer = [];
    try {
        await axios.post(`${this.apiUrl}/api/devices/telemetry`, {
            device_id: this.deviceId,
            metrics: batch,
        }, {
            headers: { Authorization: `Bearer ${this.token}` },
            timeout: 10000,
        });
        this.emitEvent('telemetry_flushed', { count: batch.length });
    }
    catch {
        // Put back on failure for next attempt
        this.telemetryBuffer.unshift(...batch);
        // Cap buffer to prevent memory leak
        if (this.telemetryBuffer.length > 100) {
            this.telemetryBuffer = this.telemetryBuffer.slice(-100);
        }
    }
}
448
+ // ── Device Analysis ─────────────────────────────────────────────
449
+ async analyzeDevice() {
450
+ try {
451
+ this.emitProgress('profiling', 10, 'Analyzing device capabilities...');
452
+ this.deviceProfile = await profileDevice();
453
+ this.emitProgress('profiling', 100, `Device: ${this.deviceProfile.cpuCores} cores, ${Math.round(this.deviceProfile.memoryMB / 1024 * 10) / 10}GB RAM`);
454
+ this.emitEvent('device_profiled', this.deviceProfile);
455
+ return this.deviceProfile;
456
+ }
457
+ catch (err) {
458
+ this.emitEvent('error', { method: 'analyzeDevice', error: err.message });
459
+ throw new Error(`Device analysis failed: ${err.message}`);
460
+ }
461
+ }
462
// Last device profile captured by initialize()/analyzeDevice(), or null.
getDeviceProfile() {
    return this.deviceProfile;
}
// Context window of the most recently loaded model (0 before any load).
getModelContextWindow() {
    return this.modelContextWindow;
}
// Persistent device identifier ('' until initialize() resolves it).
getDeviceId() {
    return this.deviceId;
}
// SDK version string baked into this build.
getSdkVersion() {
    return SDK_VERSION;
}
474
+ // Flush remaining telemetry and clean up timers
475
+ async destroy() {
476
+ await this.flushTelemetry();
477
+ if (this.telemetryFlushTimer) {
478
+ clearTimeout(this.telemetryFlushTimer);
479
+ this.telemetryFlushTimer = null;
480
+ }
481
+ }
482
// ── Smart Model Recommendation ──────────────────────────────────
// Picks the largest curated model of `category` that fits device RAM,
// together with quant and context window. Requires a prior
// analyzeDevice()/initialize(). Falls back to the smallest model at Q4
// when nothing fits; returns null only if the category has no models.
recommendModel(category = 'llm') {
    if (!this.deviceProfile) {
        throw new Error('Call analyzeDevice() first to get a recommendation.');
    }
    const mem = this.deviceProfile.memoryMB;
    const candidates = Object.entries(modelMap).filter(([_, info]) => info.category === category);
    // Sort by size descending — pick the biggest model that fits
    for (const [id, info] of candidates.sort((a, b) => b[1].sizesMB.q4 - a[1].sizesMB.q4)) {
        const quant = selectQuantization(mem, id);
        if (mem >= info.minRAM_MB[quant]) {
            const ctx = recommendContextWindow(mem, quant);
            return {
                modelId: id,
                quant,
                contextWindow: ctx,
                reason: `Best model for ${Math.round(mem / 1024)}GB RAM at ${quant.toUpperCase()} precision`,
            };
        }
    }
    // Fallback to smallest
    const smallest = candidates.sort((a, b) => a[1].sizesMB.q4 - b[1].sizesMB.q4)[0];
    if (smallest) {
        return {
            modelId: smallest[0],
            quant: 'q4',
            contextWindow: 512,
            reason: 'Limited device memory — using smallest available model at Q4',
        };
    }
    return null;
}
514
// ── Initialize ──────────────────────────────────────────────────
// Boot sequence: resolve device id → profile device → fingerprint →
// authenticate (fatal on failure) → measure latency → register device
// (non-fatal) → arm telemetry timer. Returns the device profile.
async initialize() {
    this.emitProgress('initializing', 0, 'Starting SlyOS...');
    // Step 1: Persistent device ID
    this.deviceId = await getOrCreateDeviceId();
    // Step 2: Profile device (enhanced)
    this.emitProgress('profiling', 5, 'Detecting device capabilities...');
    this.deviceProfile = await profileDevice();
    // Step 2b: Generate device fingerprint
    this.deviceProfile.deviceFingerprint = await generateDeviceFingerprint();
    this.emitProgress('profiling', 20, `Detected: ${this.deviceProfile.cpuCores} CPU cores, ${Math.round(this.deviceProfile.memoryMB / 1024 * 10) / 10}GB RAM${this.deviceProfile.gpuRenderer ? ', GPU: ' + this.deviceProfile.gpuRenderer.substring(0, 30) : ''}`);
    this.emitEvent('device_profiled', this.deviceProfile);
    // Step 3: Authenticate — the only fatal step in this sequence.
    this.emitProgress('initializing', 40, 'Authenticating with API key...');
    try {
        const authRes = await axios.post(`${this.apiUrl}/api/auth/sdk`, {
            apiKey: this.apiKey,
        });
        this.token = authRes.data.token;
        this.emitProgress('initializing', 60, 'Authenticated successfully');
        this.emitEvent('auth', { success: true });
    }
    catch (err) {
        this.emitProgress('error', 0, `Authentication failed: ${err.message}`);
        this.emitEvent('error', { stage: 'auth', error: err.message });
        throw new Error(`SlyOS auth failed: ${err.response?.data?.error || err.message}`);
    }
    // Step 4: Measure API latency (-1 means unreachable; skip recording)
    const latency = await measureApiLatency(this.apiUrl);
    if (latency > 0)
        this.deviceProfile.latencyToApiMs = latency;
    // Step 5: Register device with full intelligence profile
    this.emitProgress('initializing', 70, 'Registering device...');
    try {
        // Determine supported quantizations based on memory
        const mem = this.deviceProfile.memoryMB;
        const supportedQuants = ['q4'];
        if (mem >= 4096)
            supportedQuants.push('q8');
        if (mem >= 8192)
            supportedQuants.push('fp16');
        if (mem >= 16384)
            supportedQuants.push('fp32');
        // Determine recommended tier (1..3) from RAM + core count
        let recommendedTier = 1;
        if (mem >= 8192 && this.deviceProfile.cpuCores >= 4)
            recommendedTier = 2;
        if (mem >= 16384 && this.deviceProfile.cpuCores >= 8)
            recommendedTier = 3;
        await axios.post(`${this.apiUrl}/api/devices/register`, {
            device_id: this.deviceId,
            device_fingerprint: this.deviceProfile.deviceFingerprint,
            platform: this.deviceProfile.platform,
            os_version: this.deviceProfile.os,
            total_memory_mb: this.deviceProfile.memoryMB,
            cpu_cores: this.deviceProfile.cpuCores,
            // Enhanced fields
            gpu_renderer: this.deviceProfile.gpuRenderer || null,
            gpu_vram_mb: this.deviceProfile.gpuVramMb || null,
            screen_width: this.deviceProfile.screenWidth || null,
            screen_height: this.deviceProfile.screenHeight || null,
            pixel_ratio: this.deviceProfile.pixelRatio || null,
            browser_name: this.deviceProfile.browserName || null,
            browser_version: this.deviceProfile.browserVersion || null,
            sdk_version: SDK_VERSION,
            network_type: this.deviceProfile.networkType || null,
            latency_to_api_ms: this.deviceProfile.latencyToApiMs || null,
            timezone: this.deviceProfile.timezone || null,
            // Capabilities
            wasm_available: this.deviceProfile.wasmAvailable || false,
            webgpu_available: this.deviceProfile.webgpuAvailable || false,
            supported_quants: supportedQuants,
            recommended_tier: recommendedTier,
        }, {
            headers: { Authorization: `Bearer ${this.token}` },
        });
        this.emitProgress('initializing', 90, 'Device registered');
        this.emitEvent('device_registered', { deviceId: this.deviceId, fingerprint: this.deviceProfile.deviceFingerprint });
    }
    catch (err) {
        // Non-fatal — device registration shouldn't block usage
        this.emitProgress('initializing', 90, 'Device registration skipped (non-fatal)');
    }
    // Step 6: Start telemetry flush timer.
    // NOTE(review): SlyOS.TELEMETRY_FLUSH_INTERVAL is assumed to be a
    // static defined elsewhere in this file — confirm it exists.
    this.telemetryFlushTimer = setTimeout(() => this.flushTelemetry(), SlyOS.TELEMETRY_FLUSH_INTERVAL);
    this.emitProgress('ready', 100, `SlyOS v${SDK_VERSION} ready — ${this.deviceProfile.recommendedQuant.toUpperCase()}, ${this.deviceProfile.gpuRenderer ? 'GPU detected' : 'CPU only'}`);
    return this.deviceProfile;
}
602
+ // ── Model Loading ───────────────────────────────────────────────
603
+ getAvailableModels() {
604
+ const grouped = { llm: [], stt: [] };
605
+ for (const [id, info] of Object.entries(modelMap)) {
606
+ if (!grouped[info.category])
607
+ grouped[info.category] = [];
608
+ grouped[info.category].push({
609
+ id,
610
+ sizesMB: info.sizesMB,
611
+ minRAM_MB: info.minRAM_MB,
612
+ });
613
+ }
614
+ return Object.fromEntries(Object.entries(grouped).map(([cat, models]) => [cat, { models }]));
615
+ }
616
+ async searchModels(query, options) {
617
+ try {
618
+ const limit = options?.limit || 20;
619
+ const filters = ['onnx']; // Filter for ONNX models only
620
+ if (options?.task) {
621
+ filters.push(options.task);
622
+ }
623
+ const filterString = filters.map(f => `"${f}"`).join(',');
624
+ const url = `https://huggingface.co/api/models?search=${encodeURIComponent(query)}&filter=${encodeURIComponent(`[${filterString}]`)}&sort=downloads&direction=-1&limit=${limit}`;
625
+ const response = await axios.get(url, { timeout: 10000 });
626
+ const models = Array.isArray(response.data) ? response.data : [];
627
+ return models.map((model) => ({
628
+ id: model.id,
629
+ name: model.id.split('/')[1] || model.id,
630
+ downloads: model.downloads || 0,
631
+ likes: model.likes || 0,
632
+ task: model.task || 'unknown',
633
+ size_category: model.size_category || 'unknown',
634
+ }));
635
+ }
636
+ catch (error) {
637
+ this.emitEvent('error', { stage: 'model_search', error: error.message });
638
+ throw new Error(`Model search failed: ${error.message}`);
639
+ }
640
+ }
641
// Feasibility check: can this device run `modelId` (optionally at a
// specific `quant`)? Returns { canRun, reason, recommendedQuant }.
// Unknown model ids → canRun:false; no device profile yet →
// optimistically canRun:true with a prompt to call initialize().
canRunModel(modelId, quant) {
    const info = modelMap[modelId];
    if (!info)
        return { canRun: false, reason: `Unknown model "${modelId}"`, recommendedQuant: 'q4' };
    if (!this.deviceProfile)
        return { canRun: true, reason: 'Device not profiled yet — call initialize() first', recommendedQuant: 'q4' };
    const mem = this.deviceProfile.memoryMB;
    const bestQuant = selectQuantization(mem, modelId);
    // Caller asked for a specific quant the device can't support.
    if (quant && mem < info.minRAM_MB[quant]) {
        return {
            canRun: false,
            reason: `Not enough RAM for ${quant.toUpperCase()} (need ${info.minRAM_MB[quant]}MB, have ${mem}MB). Try ${bestQuant.toUpperCase()} instead.`,
            recommendedQuant: bestQuant,
        };
    }
    // Even the smallest quant (q4) does not fit.
    if (mem < info.minRAM_MB.q4) {
        return {
            canRun: false,
            reason: `Model requires at least ${info.minRAM_MB.q4}MB RAM even at Q4. Device has ${mem}MB.`,
            recommendedQuant: 'q4',
        };
    }
    return { canRun: true, reason: `OK at ${bestQuant.toUpperCase()} precision`, recommendedQuant: bestQuant };
}
665
// Downloads and caches a model pipeline. Curated ids resolve through
// modelMap (with RAM feasibility checks); any other id is treated as a
// raw HuggingFace repo (text-generation, context window read from its
// config.json). Emits progress/events and best-effort telemetry.
// NOTE(review): a quant is auto-selected for curated models, but the
// pipeline below hardcodes dtype 'q4' and `dtypeMap` is never used —
// the selected quant only affects the feasibility check, not the
// actual download. Confirm whether that is intentional.
async loadModel(modelId, options) {
    const info = modelMap[modelId];
    let hfModelId;
    let task;
    let estimatedSize;
    // Handle curated models
    if (info) {
        hfModelId = info.hfModel;
        task = info.task;
        // Determine quantization
        let quant = options?.quant || 'fp32';
        if (!options?.quant && this.deviceProfile) {
            quant = selectQuantization(this.deviceProfile.memoryMB, modelId);
            this.emitProgress('downloading', 0, `Auto-selected ${quant.toUpperCase()} quantization for your device`);
        }
        // Check feasibility
        const check = this.canRunModel(modelId, quant);
        if (!check.canRun) {
            this.emitProgress('error', 0, check.reason);
            throw new Error(check.reason);
        }
        estimatedSize = info.sizesMB[quant];
        this.emitProgress('downloading', 0, `Downloading ${modelId} (${quant.toUpperCase()}, ~${estimatedSize}MB)...`);
        this.emitEvent('model_download_start', { modelId, quant, estimatedSizeMB: estimatedSize });
    }
    else {
        // Handle custom HuggingFace models
        hfModelId = modelId;
        task = 'text-generation'; // Default task
        estimatedSize = 2048; // Default estimate
        this.emitProgress('downloading', 0, `Loading custom HuggingFace model: ${modelId}...`);
        this.emitEvent('model_download_start', { modelId, custom: true, estimatedSizeMB: estimatedSize });
    }
    // Map quant to dtype for HuggingFace
    // NOTE(review): declared but unused — see header note.
    const dtypeMap = {
        q4: 'q4',
        q8: 'q8',
        fp16: 'fp16',
        fp32: 'fp32',
    };
    let lastReportedPercent = 0;
    const startTime = Date.now();
    try {
        // For custom HF models, detect context window
        let detectedContextWindow = 2048;
        if (!info) {
            detectedContextWindow = await detectContextWindowFromHF(hfModelId);
        }
        const pipe = await pipeline(task, hfModelId, {
            device: 'cpu',
            dtype: 'q4', // Default to q4 for stability
            progress_callback: (progressData) => {
                // HuggingFace transformers sends progress events during download
                if (progressData && typeof progressData === 'object') {
                    let percent = 0;
                    let msg = 'Downloading...';
                    if (progressData.status === 'progress' && progressData.progress !== undefined) {
                        percent = Math.round(progressData.progress);
                        const loaded = progressData.loaded ? `${Math.round(progressData.loaded / 1024 / 1024)}MB` : '';
                        const total = progressData.total ? `${Math.round(progressData.total / 1024 / 1024)}MB` : '';
                        msg = loaded && total ? `Downloading: ${loaded} / ${total}` : `Downloading: ${percent}%`;
                    }
                    else if (progressData.status === 'done') {
                        percent = 100;
                        msg = progressData.file ? `Downloaded ${progressData.file}` : 'Download complete';
                    }
                    else if (progressData.status === 'initiate') {
                        msg = progressData.file ? `Starting download: ${progressData.file}` : 'Initiating download...';
                    }
                    // Only emit if progress meaningfully changed (avoid flooding)
                    if (percent !== lastReportedPercent || progressData.status === 'done' || progressData.status === 'initiate') {
                        lastReportedPercent = percent;
                        this.emitProgress('downloading', percent, msg, progressData);
                        this.emitEvent('model_download_progress', { modelId, percent, ...progressData });
                    }
                }
            },
        });
        const loadTime = Date.now() - startTime;
        let contextWindow;
        if (info) {
            // For curated models, use recommendContextWindow
            const quant = options?.quant || (this.deviceProfile ? selectQuantization(this.deviceProfile.memoryMB, modelId) : 'q4');
            contextWindow = this.deviceProfile
                ? recommendContextWindow(this.deviceProfile.memoryMB, quant)
                : 2048;
        }
        else {
            // For custom HF models, use detected context window
            contextWindow = detectedContextWindow;
        }
        this.modelContextWindow = contextWindow;
        this.models.set(modelId, { pipe, info, quant: 'q4', contextWindow });
        this.emitProgress('ready', 100, `${modelId} loaded (q4, ${(loadTime / 1000).toFixed(1)}s, ctx: ${contextWindow})`);
        this.emitEvent('model_loaded', { modelId, quant: 'q4', loadTimeMs: loadTime, contextWindow });
        // Telemetry — best effort; failures are swallowed deliberately.
        if (this.token) {
            await axios.post(`${this.apiUrl}/api/telemetry`, {
                device_id: this.deviceId,
                event_type: 'model_load',
                model_id: modelId,
                success: true,
                metadata: { quant: 'q4', loadTimeMs: loadTime, contextWindow, custom: !info },
            }, {
                headers: { Authorization: `Bearer ${this.token}` },
            }).catch(() => { });
        }
    }
    catch (error) {
        this.emitProgress('error', 0, `Failed to load ${modelId}: ${error.message}`);
        this.emitEvent('error', { stage: 'model_load', modelId, error: error.message });
        if (this.token) {
            await axios.post(`${this.apiUrl}/api/telemetry`, {
                device_id: this.deviceId,
                event_type: 'model_load',
                model_id: modelId,
                success: false,
                error_message: error.message,
            }, {
                headers: { Authorization: `Bearer ${this.token}` },
            }).catch(() => { });
        }
        throw error;
    }
}
790
+ // ── Inference: Generate ─────────────────────────────────────────
791
+ async generate(modelId, prompt, options = {}) {
792
+ if (!this.models.has(modelId)) {
793
+ await this.loadModel(modelId);
794
+ }
795
+ const loaded = this.models.get(modelId);
796
+ if (!loaded) {
797
+ throw new Error(`Model "${modelId}" failed to load. Check your connection and model ID.`);
798
+ }
799
+ const { pipe, info, contextWindow } = loaded;
800
+ if (info.category !== 'llm') {
801
+ throw new Error(`Model "${modelId}" is not an LLM. Use transcribe() for STT models.`);
802
+ }
803
+ const maxTokens = Math.min(options.maxTokens || 100, contextWindow || 2048);
804
+ this.emitProgress('generating', 0, `Generating response (max ${maxTokens} tokens)...`);
805
+ this.emitEvent('inference_start', { modelId, maxTokens });
806
+ const startTime = Date.now();
807
+ try {
808
+ const result = await pipe(prompt, {
809
+ max_new_tokens: maxTokens,
810
+ temperature: options.temperature || 0.7,
811
+ top_p: options.topP || 0.9,
812
+ do_sample: true,
813
+ });
814
+ const rawOutput = result[0].generated_text;
815
+ // HuggingFace transformers returns the prompt + generated text concatenated.
816
+ // Strip the original prompt so we only return the NEW tokens.
817
+ const response = rawOutput.startsWith(prompt)
818
+ ? rawOutput.slice(prompt.length).trim()
819
+ : rawOutput.trim();
820
+ const latency = Date.now() - startTime;
821
+ const tokensGenerated = response.split(/\s+/).length;
822
+ const tokensPerSec = (tokensGenerated / (latency / 1000)).toFixed(1);
823
+ this.emitProgress('ready', 100, `Generated ${tokensGenerated} tokens in ${(latency / 1000).toFixed(1)}s (${tokensPerSec} tok/s)`);
824
+ this.emitEvent('inference_complete', { modelId, latencyMs: latency, tokensGenerated, tokensPerSec: parseFloat(tokensPerSec) });
825
+ // Batch telemetry (new device intelligence)
826
+ this.recordTelemetry({
827
+ latency_ms: latency,
828
+ tokens_generated: tokensGenerated,
829
+ success: true,
830
+ model_id: modelId,
831
+ timestamp: Date.now(),
832
+ });
833
+ // Legacy telemetry (backwards compatible)
834
+ if (this.token) {
835
+ await axios.post(`${this.apiUrl}/api/telemetry`, {
836
+ device_id: this.deviceId,
837
+ event_type: 'inference',
838
+ model_id: modelId,
839
+ latency_ms: latency,
840
+ tokens_generated: tokensGenerated,
841
+ success: true,
842
+ }, {
843
+ headers: { Authorization: `Bearer ${this.token}` },
844
+ }).catch(() => { });
845
+ }
846
+ return response;
847
+ }
848
+ catch (error) {
849
+ this.emitProgress('error', 0, `Generation failed: ${error.message}`);
850
+ this.emitEvent('error', { stage: 'inference', modelId, error: error.message });
851
+ // Batch telemetry (failure)
852
+ this.recordTelemetry({
853
+ latency_ms: 0,
854
+ tokens_generated: 0,
855
+ success: false,
856
+ model_id: modelId,
857
+ timestamp: Date.now(),
858
+ });
859
+ if (this.token) {
860
+ await axios.post(`${this.apiUrl}/api/telemetry`, {
861
+ device_id: this.deviceId,
862
+ event_type: 'inference',
863
+ model_id: modelId,
864
+ success: false,
865
+ error_message: error.message,
866
+ }, {
867
+ headers: { Authorization: `Bearer ${this.token}` },
868
+ }).catch(() => { });
869
+ }
870
+ throw error;
871
+ }
872
+ }
873
+ // ── Inference: Transcribe ───────────────────────────────────────
874
+ async transcribe(modelId, audioInput, options = {}) {
875
+ if (!this.models.has(modelId)) {
876
+ await this.loadModel(modelId);
877
+ }
878
+ const loaded = this.models.get(modelId);
879
+ if (!loaded) {
880
+ throw new Error(`Model "${modelId}" failed to load. Check your connection and model ID.`);
881
+ }
882
+ const { pipe, info } = loaded;
883
+ if (info.category !== 'stt') {
884
+ throw new Error(`Model "${modelId}" is not an STT model. Use generate() for LLMs.`);
885
+ }
886
+ this.emitProgress('transcribing', 0, 'Transcribing audio...');
887
+ this.emitEvent('inference_start', { modelId, type: 'transcription' });
888
+ const startTime = Date.now();
889
+ try {
890
+ const result = await pipe(audioInput, {
891
+ language: options.language || 'en',
892
+ return_timestamps: options.returnTimestamps || false,
893
+ });
894
+ const text = result.text;
895
+ const latency = Date.now() - startTime;
896
+ this.emitProgress('ready', 100, `Transcribed in ${(latency / 1000).toFixed(1)}s`);
897
+ this.emitEvent('inference_complete', { modelId, latencyMs: latency, type: 'transcription' });
898
+ if (this.token) {
899
+ await axios.post(`${this.apiUrl}/api/telemetry`, {
900
+ device_id: this.deviceId,
901
+ event_type: 'inference',
902
+ model_id: modelId,
903
+ latency_ms: latency,
904
+ success: true,
905
+ }, {
906
+ headers: { Authorization: `Bearer ${this.token}` },
907
+ }).catch(() => { });
908
+ }
909
+ return text;
910
+ }
911
+ catch (error) {
912
+ this.emitProgress('error', 0, `Transcription failed: ${error.message}`);
913
+ this.emitEvent('error', { stage: 'transcription', modelId, error: error.message });
914
+ if (this.token) {
915
+ await axios.post(`${this.apiUrl}/api/telemetry`, {
916
+ device_id: this.deviceId,
917
+ event_type: 'inference',
918
+ model_id: modelId,
919
+ success: false,
920
+ error_message: error.message,
921
+ }, {
922
+ headers: { Authorization: `Bearer ${this.token}` },
923
+ }).catch(() => { });
924
+ }
925
+ throw error;
926
+ }
927
+ }
928
+ // ── OpenAI Compatibility ────────────────────────────────────────────
929
+ async chatCompletion(modelId, request) {
930
+ try {
931
+ // Convert OpenAI message format to a prompt string
932
+ const prompt = request.messages
933
+ .map(msg => {
934
+ if (msg.role === 'system') {
935
+ return `System: ${msg.content}`;
936
+ }
937
+ else if (msg.role === 'user') {
938
+ return `User: ${msg.content}`;
939
+ }
940
+ else {
941
+ return `Assistant: ${msg.content}`;
942
+ }
943
+ })
944
+ .join('\n\n');
945
+ const response = await this.generate(modelId, prompt, {
946
+ temperature: request.temperature,
947
+ maxTokens: request.max_tokens,
948
+ topP: request.top_p,
949
+ });
950
+ // Estimate token counts (rough approximation: ~4 chars per token)
951
+ const promptTokens = Math.ceil(prompt.length / 4);
952
+ const completionTokens = Math.ceil(response.length / 4);
953
+ return {
954
+ id: `chat-${Date.now()}-${Math.random().toString(36).substr(2, 9)}`,
955
+ object: 'chat.completion',
956
+ created: Math.floor(Date.now() / 1000),
957
+ model: modelId,
958
+ choices: [
959
+ {
960
+ index: 0,
961
+ message: {
962
+ role: 'assistant',
963
+ content: response,
964
+ },
965
+ finish_reason: 'stop',
966
+ },
967
+ ],
968
+ usage: {
969
+ prompt_tokens: promptTokens,
970
+ completion_tokens: completionTokens,
971
+ total_tokens: promptTokens + completionTokens,
972
+ },
973
+ };
974
+ }
975
+ catch (error) {
976
+ // Fallback to cloud provider if configured
977
+ if (this.fallbackConfig?.provider === 'openai') {
978
+ return this.fallbackToOpenAI(modelId, request);
979
+ }
980
+ else if (this.fallbackConfig?.provider === 'bedrock') {
981
+ return this.fallbackToBedrock(modelId, request);
982
+ }
983
+ throw error;
984
+ }
985
+ }
986
+ // ── AWS Bedrock Compatibility ──────────────────────────────────────
987
+ async bedrockInvoke(modelId, request) {
988
+ try {
989
+ const response = await this.generate(modelId, request.inputText, {
990
+ temperature: request.textGenerationConfig?.temperature,
991
+ maxTokens: request.textGenerationConfig?.maxTokenCount,
992
+ topP: request.textGenerationConfig?.topP,
993
+ });
994
+ // Estimate token counts
995
+ const inputTokens = Math.ceil(request.inputText.length / 4);
996
+ const outputTokens = Math.ceil(response.length / 4);
997
+ return {
998
+ results: [
999
+ {
1000
+ outputText: response,
1001
+ tokenCount: outputTokens,
1002
+ },
1003
+ ],
1004
+ input_text_token_count: inputTokens,
1005
+ };
1006
+ }
1007
+ catch (error) {
1008
+ // Fallback to cloud provider if configured
1009
+ if (this.fallbackConfig?.provider === 'bedrock') {
1010
+ return this.fallbackToBedrockCloud(modelId, request);
1011
+ }
1012
+ else if (this.fallbackConfig?.provider === 'openai') {
1013
+ return this.fallbackToOpenAICloud(modelId, request);
1014
+ }
1015
+ throw error;
1016
+ }
1017
+ }
1018
+ // ── Fallback: OpenAI Cloud ────────────────────────────────────────
1019
+ async fallbackToOpenAI(modelId, request) {
1020
+ if (!this.fallbackConfig) {
1021
+ throw new Error('OpenAI fallback not configured');
1022
+ }
1023
+ const mappedModel = this.mapModelToOpenAI(modelId);
1024
+ const payload = {
1025
+ model: this.fallbackConfig.model || mappedModel,
1026
+ messages: request.messages,
1027
+ temperature: request.temperature,
1028
+ max_tokens: request.max_tokens,
1029
+ top_p: request.top_p,
1030
+ frequency_penalty: request.frequency_penalty,
1031
+ presence_penalty: request.presence_penalty,
1032
+ stop: request.stop,
1033
+ };
1034
+ try {
1035
+ const response = await axios.post('https://api.openai.com/v1/chat/completions', payload, {
1036
+ headers: {
1037
+ Authorization: `Bearer ${this.fallbackConfig.apiKey}`,
1038
+ 'Content-Type': 'application/json',
1039
+ },
1040
+ });
1041
+ this.emitEvent('fallback_success', { provider: 'openai', originalModel: modelId, mappedModel: this.fallbackConfig.model });
1042
+ return response.data;
1043
+ }
1044
+ catch (error) {
1045
+ this.emitProgress('error', 0, `OpenAI fallback failed: ${error.message}`);
1046
+ this.emitEvent('fallback_error', { provider: 'openai', error: error.message });
1047
+ throw error;
1048
+ }
1049
+ }
1050
+ async fallbackToBedrock(modelId, request) {
1051
+ if (!this.fallbackConfig) {
1052
+ throw new Error('Bedrock fallback not configured');
1053
+ }
1054
+ // Convert OpenAI format to Bedrock's expected format (simplified)
1055
+ const lastMessage = request.messages[request.messages.length - 1];
1056
+ const inputText = lastMessage.content;
1057
+ const bedrockResponse = await this.invokeBedrockCloud(inputText, {
1058
+ temperature: request.temperature,
1059
+ maxTokenCount: request.max_tokens,
1060
+ topP: request.top_p,
1061
+ });
1062
+ // Convert Bedrock response back to OpenAI format
1063
+ const promptTokens = Math.ceil(inputText.length / 4);
1064
+ const completionTokens = bedrockResponse.results[0].tokenCount;
1065
+ this.emitEvent('fallback_success', { provider: 'bedrock', originalModel: modelId, mappedModel: this.fallbackConfig.model });
1066
+ return {
1067
+ id: `chat-${Date.now()}-${Math.random().toString(36).substr(2, 9)}`,
1068
+ object: 'chat.completion',
1069
+ created: Math.floor(Date.now() / 1000),
1070
+ model: modelId,
1071
+ choices: [
1072
+ {
1073
+ index: 0,
1074
+ message: {
1075
+ role: 'assistant',
1076
+ content: bedrockResponse.results[0].outputText,
1077
+ },
1078
+ finish_reason: 'stop',
1079
+ },
1080
+ ],
1081
+ usage: {
1082
+ prompt_tokens: promptTokens,
1083
+ completion_tokens: completionTokens,
1084
+ total_tokens: promptTokens + completionTokens,
1085
+ },
1086
+ };
1087
+ }
1088
+ async fallbackToOpenAICloud(modelId, request) {
1089
+ if (!this.fallbackConfig) {
1090
+ throw new Error('OpenAI fallback not configured');
1091
+ }
1092
+ const mappedModel = this.mapModelToOpenAI(modelId);
1093
+ const payload = {
1094
+ model: this.fallbackConfig.model || mappedModel,
1095
+ messages: [{ role: 'user', content: request.inputText }],
1096
+ temperature: request.textGenerationConfig?.temperature,
1097
+ max_tokens: request.textGenerationConfig?.maxTokenCount,
1098
+ top_p: request.textGenerationConfig?.topP,
1099
+ };
1100
+ try {
1101
+ const response = await axios.post('https://api.openai.com/v1/chat/completions', payload, {
1102
+ headers: {
1103
+ Authorization: `Bearer ${this.fallbackConfig.apiKey}`,
1104
+ 'Content-Type': 'application/json',
1105
+ },
1106
+ });
1107
+ const outputText = response.data.choices[0].message.content;
1108
+ const inputTokens = Math.ceil(request.inputText.length / 4);
1109
+ const outputTokens = response.data.usage.completion_tokens;
1110
+ this.emitEvent('fallback_success', { provider: 'openai', originalModel: modelId, mappedModel: this.fallbackConfig.model });
1111
+ return {
1112
+ results: [
1113
+ {
1114
+ outputText,
1115
+ tokenCount: outputTokens,
1116
+ },
1117
+ ],
1118
+ input_text_token_count: inputTokens,
1119
+ };
1120
+ }
1121
+ catch (error) {
1122
+ this.emitProgress('error', 0, `OpenAI fallback failed: ${error.message}`);
1123
+ this.emitEvent('fallback_error', { provider: 'openai', error: error.message });
1124
+ throw error;
1125
+ }
1126
+ }
1127
+ async fallbackToBedrockCloud(modelId, request) {
1128
+ if (!this.fallbackConfig) {
1129
+ throw new Error('Bedrock fallback not configured');
1130
+ }
1131
+ try {
1132
+ return await this.invokeBedrockCloud(request.inputText, request.textGenerationConfig);
1133
+ }
1134
+ catch (error) {
1135
+ this.emitProgress('error', 0, `Bedrock fallback failed: ${error.message}`);
1136
+ this.emitEvent('fallback_error', { provider: 'bedrock', error: error.message });
1137
+ throw error;
1138
+ }
1139
+ }
1140
+ async invokeBedrockCloud(inputText, config) {
1141
+ if (!this.fallbackConfig) {
1142
+ throw new Error('Bedrock fallback not configured');
1143
+ }
1144
+ const region = this.fallbackConfig.region || 'us-east-1';
1145
+ const model = this.fallbackConfig.model || 'anthropic.claude-3-sonnet-20240229-v1:0';
1146
+ // Bedrock endpoint format: https://bedrock-runtime.{region}.amazonaws.com/model/{modelId}/invoke
1147
+ const endpoint = `https://bedrock-runtime.${region}.amazonaws.com/model/${model}/invoke`;
1148
+ const payload = {
1149
+ inputText,
1150
+ textGenerationConfig: {
1151
+ maxTokenCount: config?.maxTokenCount || 256,
1152
+ temperature: config?.temperature || 0.7,
1153
+ topP: config?.topP || 0.9,
1154
+ topK: config?.topK,
1155
+ stopSequences: config?.stopSequences,
1156
+ },
1157
+ };
1158
+ try {
1159
+ const response = await axios.post(endpoint, payload, {
1160
+ headers: {
1161
+ Authorization: `Bearer ${this.fallbackConfig.apiKey}`,
1162
+ 'Content-Type': 'application/json',
1163
+ 'X-Amz-Target': 'AmazonBedrockRuntime.InvokeModel',
1164
+ },
1165
+ });
1166
+ this.emitEvent('fallback_success', { provider: 'bedrock', model });
1167
+ return response.data;
1168
+ }
1169
+ catch (error) {
1170
+ throw new Error(`Bedrock invocation failed: ${error.message}`);
1171
+ }
1172
+ }
1173
+ mapModelToOpenAI(slyModelId) {
1174
+ const modelMapping = {
1175
+ 'quantum-1.7b': 'gpt-4o-mini',
1176
+ 'quantum-3b': 'gpt-4o',
1177
+ 'quantum-code-3b': 'gpt-4o',
1178
+ 'quantum-8b': 'gpt-4-turbo',
1179
+ };
1180
+ return modelMapping[slyModelId] || 'gpt-4o-mini';
1181
+ }
1182
/**
 * Tier 2: Cloud-indexed RAG with local inference.
 * Retrieves relevant chunks from server, generates response locally.
 *
 * @param {{knowledgeBaseId: string, query: string, modelId: string,
 *          topK?: number, temperature?: number, maxTokens?: number}} options
 * @returns {Promise<object>} Result with query, retrievedChunks,
 *          generatedResponse, context, latencyMs, tierUsed: 2.
 * @throws Error wrapping the underlying failure message if not
 *         authenticated or if retrieval/generation fails.
 */
async ragQuery(options) {
    const startTime = Date.now();
    try {
        if (!this.token)
            throw new Error('Not authenticated. Call init() first.');
        // Step 1: Retrieve relevant chunks from backend
        const searchResponse = await axios.post(`${this.apiUrl}/api/rag/knowledge-bases/${options.knowledgeBaseId}/query`, {
            query: options.query,
            top_k: options.topK || 5,
            model_id: options.modelId
        }, { headers: { Authorization: `Bearer ${this.token}` } });
        let { retrieved_chunks, prompt_template, context } = searchResponse.data;
        // Apply context window limits
        // NOTE(review): `this.modelContextWindow` is not set anywhere visible in
        // this chunk — presumably populated during model load; confirm, otherwise
        // this always falls back to 2048.
        const contextWindow = this.modelContextWindow || 2048;
        const maxContextChars = (contextWindow - 200) * 3; // Rough token-to-char ratio, reserving 200 tokens
        // NOTE(review): this truncation only shortens the `context` value echoed
        // back to the caller; generation below uses the server-provided
        // `prompt_template` untouched, so an oversized prompt is NOT actually
        // trimmed here — verify the server enforces its own context limit.
        if (context && context.length > maxContextChars) {
            context = context.substring(0, maxContextChars) + '...';
        }
        // Step 2: Generate response locally using the augmented prompt
        const response = await this.generate(options.modelId, prompt_template, {
            temperature: options.temperature,
            maxTokens: options.maxTokens,
        });
        // Normalize server snake_case chunk fields to the SDK's camelCase shape.
        return {
            query: options.query,
            retrievedChunks: retrieved_chunks.map((c) => ({
                id: c.id,
                documentId: c.document_id,
                documentName: c.document_name,
                content: c.content,
                similarityScore: c.similarity_score,
                metadata: c.metadata
            })),
            generatedResponse: response,
            context,
            latencyMs: Date.now() - startTime,
            tierUsed: 2,
        };
    }
    catch (error) {
        this.emitEvent('error', { stage: 'rag_query', error: error.message });
        throw new Error(`RAG query failed: ${error.message}`);
    }
}
1230
/**
 * Tier 1: Fully local RAG. Zero network calls.
 * Documents are chunked/embedded on-device, retrieval and generation all local.
 *
 * @param {{query: string, modelId: string,
 *          documents: Array<{name?: string, content: string}>,
 *          topK?: number, temperature?: number, maxTokens?: number}} options
 * @returns {Promise<object>} Result with query, retrievedChunks,
 *          generatedResponse, context, latencyMs, tierUsed: 1.
 */
async ragQueryLocal(options) {
    const startTime = Date.now();
    try {
        // Step 1: Load embedding model if needed
        if (!this.localEmbeddingModel) {
            await this.loadEmbeddingModel();
        }
        // Adapt chunk size based on context window for efficiency
        // NOTE(review): assumes `this.modelContextWindow` is populated elsewhere
        // (not visible in this chunk) — otherwise falls back to 2048.
        const contextWindow = this.modelContextWindow || 2048;
        const chunkSize = contextWindow <= 1024 ? 256 : contextWindow <= 2048 ? 512 : 1024;
        const overlap = Math.floor(chunkSize / 4);
        // Step 2: Chunk documents if not already chunked
        // Embeddings are computed sequentially, one chunk at a time.
        const allChunks = [];
        for (const doc of options.documents) {
            const chunks = this.chunkTextLocal(doc.content, chunkSize, overlap);
            for (const chunk of chunks) {
                const embedding = await this.embedTextLocal(chunk);
                allChunks.push({ content: chunk, documentName: doc.name || 'Document', embedding });
            }
        }
        // Step 3: Embed query
        const queryEmbedding = await this.embedTextLocal(options.query);
        // Step 4: Cosine similarity search — rank all chunks, keep top K.
        const scored = allChunks
            .filter(c => c.embedding)
            .map(c => ({
                ...c,
                similarityScore: this.cosineSimilarity(queryEmbedding, c.embedding)
            }))
            .sort((a, b) => b.similarityScore - a.similarityScore)
            .slice(0, options.topK || 5);
        // Step 5: Build context with size limits — keep context SHORT so model has room to generate
        const maxContextChars = contextWindow <= 2048 ? 800 : contextWindow <= 4096 ? 1500 : 3000;
        let contextLength = 0;
        const contextParts = [];
        for (const c of scored) {
            const part = `[Source: ${c.documentName}]\n${c.content}`;
            // The admission check ignores the separator; the +10 below only pads
            // the running total, so the final context can slightly exceed the cap.
            if (contextLength + part.length <= maxContextChars) {
                contextParts.push(part);
                contextLength += part.length + 10; // Account for separator
            }
            else {
                break;
            }
        }
        const context = contextParts.join('\n\n---\n\n');
        const prompt = `Use the following information to answer the question.\n\nInfo: ${context}\n\nQuestion: ${options.query}\nAnswer:`;
        // Step 6: Generate locally
        // NOTE: `||` means temperature 0 / maxTokens 0 fall back to defaults here.
        const maxGen = contextWindow <= 2048 ? 150 : Math.min(300, Math.floor(contextWindow / 4));
        const response = await this.generate(options.modelId, prompt, {
            temperature: options.temperature || 0.6,
            maxTokens: options.maxTokens || maxGen,
        });
        return {
            query: options.query,
            retrievedChunks: scored.map((c, i) => ({
                id: `local-${i}`,
                documentId: 'local',
                documentName: c.documentName,
                content: c.content,
                similarityScore: c.similarityScore,
                metadata: {}
            })),
            generatedResponse: response,
            context,
            latencyMs: Date.now() - startTime,
            tierUsed: 1,
        };
    }
    catch (error) {
        this.emitEvent('error', { stage: 'rag_local', error: error.message });
        throw new Error(`Local RAG failed: ${error.message}`);
    }
}
1308
/**
 * Tier 3: Offline RAG using a synced knowledge base.
 * First call syncKnowledgeBase(), then use this for offline queries.
 *
 * @param {{knowledgeBaseId: string, query: string, modelId: string,
 *          topK?: number, temperature?: number, maxTokens?: number}} options
 * @returns {Promise<object>} Result with query, retrievedChunks,
 *          generatedResponse, context, latencyMs, tierUsed: 3.
 * @throws If the knowledge base was never synced, the synced index has
 *         expired, or embedding/generation fails.
 */
async ragQueryOffline(options) {
    const startTime = Date.now();
    const index = this.offlineIndexes.get(options.knowledgeBaseId);
    if (!index) {
        throw new Error(`Knowledge base "${options.knowledgeBaseId}" not synced. Call syncKnowledgeBase() first.`);
    }
    // Check expiry
    if (new Date(index.metadata.expires_at) < new Date()) {
        throw new Error('Offline index has expired. Please re-sync.');
    }
    try {
        // Load embedding model
        if (!this.localEmbeddingModel) {
            await this.loadEmbeddingModel();
        }
        // Embed query
        const queryEmbedding = await this.embedTextLocal(options.query);
        // Search offline index — rank pre-embedded chunks, keep top K.
        const scored = index.chunks
            .filter(c => c.embedding && c.embedding.length > 0)
            .map(c => ({
                ...c,
                similarityScore: this.cosineSimilarity(queryEmbedding, c.embedding)
            }))
            .sort((a, b) => b.similarityScore - a.similarityScore)
            .slice(0, options.topK || 5);
        // Build context with size limits — keep context SHORT so model has room to generate
        // NOTE(review): assumes `this.modelContextWindow` is populated elsewhere
        // (not visible in this chunk) — otherwise falls back to 2048.
        const contextWindow = this.modelContextWindow || 2048;
        const maxContextChars = contextWindow <= 2048 ? 800 : contextWindow <= 4096 ? 1500 : 3000;
        let contextLength = 0;
        const contextParts = [];
        for (const c of scored) {
            const part = `[Source: ${c.document_name}]\n${c.content}`;
            // Admission check ignores the separator width; the +10 only pads the
            // running total, so the cap is approximate.
            if (contextLength + part.length <= maxContextChars) {
                contextParts.push(part);
                contextLength += part.length + 10;
            }
            else {
                break;
            }
        }
        const context = contextParts.join('\n\n---\n\n');
        const prompt = `Use the following information to answer the question.\n\nInfo: ${context}\n\nQuestion: ${options.query}\nAnswer:`;
        // Generate locally
        const maxGen = contextWindow <= 2048 ? 150 : Math.min(300, Math.floor(contextWindow / 4));
        const response = await this.generate(options.modelId, prompt, {
            temperature: options.temperature || 0.6,
            maxTokens: options.maxTokens || maxGen,
        });
        // Chunks come from the server sync package in snake_case; convert.
        return {
            query: options.query,
            retrievedChunks: scored.map(c => ({
                id: c.id,
                documentId: c.document_id,
                documentName: c.document_name,
                content: c.content,
                similarityScore: c.similarityScore,
                metadata: c.metadata
            })),
            generatedResponse: response,
            context,
            latencyMs: Date.now() - startTime,
            tierUsed: 3,
        };
    }
    catch (error) {
        this.emitEvent('error', { stage: 'rag_offline', error: error.message });
        throw new Error(`Offline RAG failed: ${error.message}`);
    }
}
1382
+ /**
1383
+ * Sync a knowledge base for offline use (Tier 3).
1384
+ * Downloads chunks + embeddings from server, stores locally.
1385
+ */
1386
+ async syncKnowledgeBase(knowledgeBaseId, deviceId) {
1387
+ try {
1388
+ if (!this.token)
1389
+ throw new Error('Not authenticated. Call init() first.');
1390
+ const response = await axios.post(`${this.apiUrl}/api/rag/knowledge-bases/${knowledgeBaseId}/sync`, { device_id: deviceId || this.deviceId || 'sdk-device' }, { headers: { Authorization: `Bearer ${this.token}` } });
1391
+ const { sync_package, chunk_count, package_size_mb, expires_at } = response.data;
1392
+ this.offlineIndexes.set(knowledgeBaseId, sync_package);
1393
+ return {
1394
+ chunkCount: chunk_count,
1395
+ sizeMb: package_size_mb,
1396
+ expiresAt: expires_at
1397
+ };
1398
+ }
1399
+ catch (error) {
1400
+ throw new Error(`Sync failed: ${error.message}`);
1401
+ }
1402
+ }
1403
+ // --- RAG Helper Methods ---
1404
+ async loadEmbeddingModel() {
1405
+ this.emitProgress('downloading', 0, 'Loading embedding model (all-MiniLM-L6-v2)...');
1406
+ try {
1407
+ const { pipeline } = await import('@huggingface/transformers');
1408
+ this.localEmbeddingModel = await pipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2');
1409
+ this.emitProgress('ready', 100, 'Embedding model loaded');
1410
+ }
1411
+ catch (error) {
1412
+ this.emitProgress('error', 0, `Embedding model failed: ${error.message}`);
1413
+ throw error;
1414
+ }
1415
+ }
1416
+ async embedTextLocal(text) {
1417
+ if (!this.localEmbeddingModel)
1418
+ throw new Error('Embedding model not loaded');
1419
+ const result = await this.localEmbeddingModel(text, { pooling: 'mean', normalize: true });
1420
+ // Handle different tensor output formats (v2 vs v3 of transformers)
1421
+ if (result.data)
1422
+ return Array.from(result.data);
1423
+ if (result.tolist)
1424
+ return result.tolist().flat();
1425
+ if (Array.isArray(result))
1426
+ return result.flat();
1427
+ throw new Error('Unexpected embedding output format');
1428
+ }
1429
+ cosineSimilarity(a, b) {
1430
+ let dot = 0, normA = 0, normB = 0;
1431
+ for (let i = 0; i < a.length; i++) {
1432
+ dot += a[i] * b[i];
1433
+ normA += a[i] * a[i];
1434
+ normB += b[i] * b[i];
1435
+ }
1436
+ const denom = Math.sqrt(normA) * Math.sqrt(normB);
1437
+ return denom === 0 ? 0 : dot / denom;
1438
+ }
1439
+ chunkTextLocal(text, chunkSize = 512, overlap = 128) {
1440
+ if (!text || text.length === 0)
1441
+ return [];
1442
+ if (overlap >= chunkSize)
1443
+ overlap = Math.floor(chunkSize * 0.25);
1444
+ const chunks = [];
1445
+ let start = 0;
1446
+ while (start < text.length) {
1447
+ let end = start + chunkSize;
1448
+ if (end < text.length) {
1449
+ const bp = Math.max(text.lastIndexOf('.', end), text.lastIndexOf('\n', end));
1450
+ if (bp > start + chunkSize / 2)
1451
+ end = bp + 1;
1452
+ }
1453
+ const chunk = text.slice(start, end).trim();
1454
+ if (chunk.length > 20)
1455
+ chunks.push(chunk);
1456
+ start = end - overlap;
1457
+ if (start >= text.length)
1458
+ break;
1459
+ }
1460
+ return chunks;
1461
+ }
1462
+ // ── Static OpenAI Compatible Factory ────────────────────────────────
1463
+ static openaiCompatible(config) {
1464
+ const instance = new SlyOS({
1465
+ apiKey: config.apiKey,
1466
+ apiUrl: config.apiUrl,
1467
+ fallback: { ...config.fallback, provider: config.fallback?.provider || 'openai' },
1468
+ });
1469
+ return {
1470
+ chat: {
1471
+ completions: {
1472
+ async create(request) {
1473
+ const { model, ...chatRequest } = request;
1474
+ return instance.chatCompletion(model, chatRequest);
1475
+ },
1476
+ },
1477
+ },
1478
+ };
1479
+ }
1480
+ }
1481
// Static telemetry-batching defaults: flush after 10 queued events or
// every 60 seconds, whichever comes first.
SlyOS.TELEMETRY_BATCH_SIZE = 10;
SlyOS.TELEMETRY_FLUSH_INTERVAL = 60000; // 60 seconds
export default SlyOS;