@beltoinc/slyos-sdk 1.5.5 → 1.5.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/index.ts ADDED
@@ -0,0 +1,2147 @@
1
+ import axios from 'axios';
2
+ import { pipeline, env } from '@huggingface/transformers';
3
+
4
+ // @ts-ignore - Force CPU in Node.js
5
+ if (env.backends?.onnx?.wasm) { // only touch the setting when the ONNX wasm backend config object exists
6
+ env.backends.onnx.wasm.proxy = false; // module-load side effect: turns the wasm `proxy` option off (NOTE(review): confirm exact effect against the transformers.js `env` docs)
7
+ }
8
+
9
+ // ─── Types ──────────────────────────────────────────────────────────
10
+
11
+ interface SlyOSConfig { // Options accepted by the SlyOS constructor.
12
+ apiKey: string; // posted to `${apiUrl}/api/auth/sdk` by initialize()
13
+ apiUrl?: string; // API base URL; the constructor falls back to 'https://api.slyos.world' when falsy
14
+ onProgress?: ProgressCallback; // invoked with every ProgressEvent the SDK emits
15
+ onEvent?: EventCallback; // invoked with every SlyEvent the SDK emits
16
+ }
17
+
18
+ interface GenerateOptions { // Optional text-generation settings; the consumer is outside this chunk — TODO confirm how each field is applied
19
+ temperature?: number;
20
+ maxTokens?: number;
21
+ topP?: number;
22
+ }
23
+
24
+ interface TranscribeOptions { // Optional speech-to-text settings; the consumer is outside this chunk — TODO confirm how each field is applied
25
+ language?: string;
26
+ returnTimestamps?: boolean;
27
+ }
28
+
29
+ type ModelCategory = 'llm' | 'stt'; // registry grouping: text-generation models vs. speech-recognition models (see modelMap)
30
+ type QuantizationLevel = 'q4' | 'q8' | 'fp16' | 'fp32'; // precision variants; also the keys of ModelInfo.sizesMB / minRAM_MB
31
+
32
+ interface ModelInfo { // One entry of the built-in model registry (modelMap).
33
+ hfModel: string; // Hugging Face repository id, e.g. 'onnx-community/whisper-base'
34
+ task: string; // task name, e.g. 'text-generation' or 'automatic-speech-recognition'
35
+ category: ModelCategory; // used by recommendModel()/getAvailableModels() to group entries
36
+ sizesMB: Record<QuantizationLevel, number>; // size in MB per quantization level
37
+ minRAM_MB: Record<QuantizationLevel, number>; // RAM (MB) compared against device memory per quantization level
38
+ }
39
+
40
+ interface DeviceProfile { // Snapshot produced by profileDevice() and sent to the API during device registration.
41
+ cpuCores: number; // os.cpus().length in Node, navigator.hardwareConcurrency in browsers (default 4)
42
+ memoryMB: number; // os.totalmem() in MB in Node, navigator.deviceMemory (GB) * 1024 in browsers (default 4096)
43
+ estimatedStorageMB: number; // `df -m .` available column in Node, StorageManager quota in browsers
44
+ platform: 'web' | 'nodejs'; // 'nodejs' when `window` is undefined
45
+ os: string; // "<platform> <release>" in Node, the user-agent string in browsers
46
+ recommendedQuant: QuantizationLevel; // selectQuantization() result for the 'quantum-1.7b' baseline
47
+ maxContextWindow: number; // recommendContextWindow(memoryMB, recommendedQuant)
48
+ // Enhanced device intelligence fields
49
+ deviceFingerprint?: string; // filled in by initialize() via generateDeviceFingerprint()
50
+ gpuRenderer?: string; // WebGL unmasked renderer string; undefined when not detected
51
+ gpuVramMb?: number; // heuristic estimate derived from the renderer string
52
+ screenWidth?: number;
53
+ screenHeight?: number;
54
+ pixelRatio?: number;
55
+ browserName?: string; // 'node' outside the browser
56
+ browserVersion?: string;
57
+ networkType?: string; // connection.effectiveType / type, or 'unknown'
58
+ latencyToApiMs?: number; // set by initialize() only when the health probe reports > 0 ms
59
+ timezone?: string; // Intl resolved time zone, or 'unknown'
60
+ wasmAvailable?: boolean; // true when the WebAssembly global exists
61
+ webgpuAvailable?: boolean; // true when navigator.gpu is present (browser only)
62
+ }
63
+
64
+ interface ProgressEvent { // Payload handed to SlyOSConfig.onProgress by emitProgress().
65
+ stage: 'initializing' | 'profiling' | 'downloading' | 'loading' | 'ready' | 'generating' | 'transcribing' | 'error';
66
+ progress: number; // 0-100
67
+ message: string; // human-readable status line
68
+ detail?: any; // optional extra data forwarded untouched by emitProgress()
69
+ }
70
+
71
+ interface SlyEvent { // Payload handed to SlyOSConfig.onEvent by emitEvent().
72
+ type: 'auth' | 'device_registered' | 'device_profiled' | 'model_download_start' | 'model_download_progress' | 'model_loaded' | 'inference_start' | 'inference_complete' | 'error' | 'fallback_success' | 'fallback_error' | 'telemetry_flushed' | 'token';
73
+ data?: any; // event-specific payload, shape depends on `type`
74
+ timestamp: number; // Date.now() at emission time
75
+ }
76
+
77
+ type ProgressCallback = (event: ProgressEvent) => void; // listener signature for SlyOSConfig.onProgress
78
+ type EventCallback = (event: SlyEvent) => void; // listener signature for SlyOSConfig.onEvent
79
+
80
+ // ─── OpenAI Compatibility Types ──────────────────────────────────────
81
+
82
+ interface OpenAIMessage { // One chat turn in the OpenAI-compatible request/response shapes.
83
+ role: 'system' | 'user' | 'assistant';
84
+ content: string;
85
+ }
86
+
87
+ interface OpenAIChatCompletionRequest { // Request body accepted by the OpenAI-compatible client (model is added by OpenAICompatibleClient).
88
+ messages: OpenAIMessage[];
89
+ temperature?: number;
90
+ top_p?: number;
91
+ max_tokens?: number;
92
+ frequency_penalty?: number;
93
+ presence_penalty?: number;
94
+ stop?: string | string[];
95
+ }
96
+
97
+ interface OpenAIChoice { // One entry of OpenAIChatCompletionResponse.choices.
98
+ index: number;
99
+ message: OpenAIMessage;
100
+ finish_reason: string;
101
+ }
102
+
103
+ interface OpenAIUsage { // Token accounting block of a chat-completion response.
104
+ prompt_tokens: number;
105
+ completion_tokens: number;
106
+ total_tokens: number;
107
+ }
108
+
109
+ interface OpenAIChatCompletionResponse { // Response shape returned by the OpenAI-compatible client.
110
+ id: string;
111
+ object: 'chat.completion'; // fixed literal discriminator
112
+ created: number; // NOTE(review): presumably a Unix timestamp as in the OpenAI API — confirm where it is populated
113
+ model: string;
114
+ choices: OpenAIChoice[];
115
+ usage: OpenAIUsage;
116
+ }
117
+
118
+ // ─── AWS Bedrock Compatibility Types ─────────────────────────────────
119
+
120
+ interface BedrockTextGenerationConfig { // Optional generation settings nested inside BedrockInvokeRequest.
121
+ maxTokenCount?: number;
122
+ temperature?: number;
123
+ topP?: number;
124
+ topK?: number;
125
+ stopSequences?: string[];
126
+ }
127
+
128
+ interface BedrockInvokeRequest { // Bedrock-compatible invoke request: prompt text plus optional settings.
129
+ inputText: string;
130
+ textGenerationConfig?: BedrockTextGenerationConfig;
131
+ }
132
+
133
+ interface BedrockResult { // One generated output inside BedrockInvokeResponse.results.
134
+ outputText: string;
135
+ tokenCount: number;
136
+ }
137
+
138
+ interface BedrockInvokeResponse { // Bedrock-compatible invoke response.
139
+ results: BedrockResult[];
140
+ input_text_token_count?: number;
141
+ }
142
+
143
+ // ─── Fallback Configuration ─────────────────────────────────────────
144
+
145
+ type FallbackProvider = 'openai' | 'bedrock'; // providers a FallbackConfig may name
146
+
147
+ interface FallbackConfig { // Fallback provider settings; stored by the SlyOS constructor as `fallbackConfig`.
148
+ provider: FallbackProvider;
149
+ apiKey: string; // credential for the fallback provider (distinct from SlyOSConfig.apiKey)
150
+ model: string;
151
+ region?: string; // for Bedrock
152
+ }
153
+
154
+ interface SlyOSConfigWithFallback extends SlyOSConfig { // Constructor argument type: base config plus optional fallback.
155
+ fallback?: FallbackConfig; // the constructor stores null when omitted
156
+ }
157
+
158
+ // ─── OpenAI Compatible Client ───────────────────────────────────────
159
+
160
+ interface OpenAICompatibleClient { // Object shaped as `client.chat.completions.create(...)`.
161
+ chat: {
162
+ completions: {
163
+ create(request: OpenAIChatCompletionRequest & { model: string }): Promise<OpenAIChatCompletionResponse>; // request must also carry a `model` id
164
+ };
165
+ };
166
+ }
167
+
168
+ // ─── RAG Types ──────────────────────────────────────────────────
169
+
170
+ interface RAGOptions { // Input for a retrieval-augmented generation call (the consumer is outside this chunk).
171
+ knowledgeBaseId: string;
172
+ query: string;
173
+ topK?: number; // NOTE(review): presumably the number of chunks to retrieve — confirm against the caller
174
+ modelId: string;
175
+ temperature?: number;
176
+ maxTokens?: number;
177
+ // NEW: streaming callback
178
+ onToken?: (token: string, partial: string) => void; // receives the new token and the text so far, per the parameter names — TODO confirm
179
+ }
180
+
181
+ interface RAGChunk { // One retrieved chunk as surfaced in RAGResponse.retrievedChunks.
182
+ id: string;
183
+ documentId: string;
184
+ documentName: string;
185
+ content: string;
186
+ similarityScore: number;
187
+ metadata?: Record<string, any>;
188
+ }
189
+
190
+ interface RAGResponse { // Result of a RAG call: answer, supporting chunks, timing and the config that was used.
191
+ query: string;
192
+ retrievedChunks: RAGChunk[];
193
+ generatedResponse: string;
194
+ context: string;
195
+ latencyMs: number;
196
+ tierUsed: 1 | 2 | 3;
197
+ // NEW: detailed timing metrics
198
+ timing: {
199
+ retrievalMs: number; // Time spent retrieving/embedding chunks
200
+ contextBuildMs: number; // Time spent building context
201
+ firstTokenMs: number; // Time to first token (from generation start)
202
+ generationMs: number; // Total generation time
203
+ totalMs: number; // End-to-end latency
204
+ tokensGenerated: number; // Number of tokens in response
205
+ tokensPerSecond: number; // Generation throughput
206
+ };
207
+ // NEW: dynamic config used
208
+ config: {
209
+ maxContextChars: number;
210
+ maxGenTokens: number;
211
+ chunkSize: number;
212
+ topK: number;
213
+ contextWindowUsed: number;
214
+ deviceTier: 'low' | 'mid' | 'high';
215
+ };
216
+ }
217
+
218
+ interface OfflineIndex { // Knowledge-base snapshot: sync metadata plus its chunks (producer/consumer are outside this chunk).
219
+ metadata: {
220
+ kb_id: string;
221
+ kb_name: string;
222
+ chunk_size: number;
223
+ embedding_dim: number;
224
+ total_chunks: number;
225
+ synced_at: string; // NOTE(review): timestamp format not visible here — confirm (likely ISO 8601)
226
+ expires_at: string;
227
+ sync_token: string;
228
+ };
229
+ chunks: Array<{
230
+ id: string;
231
+ document_id: string;
232
+ document_name: string;
233
+ content: string;
234
+ chunk_index: number;
235
+ embedding: number[] | null; // may be null, so consumers must handle chunks without an embedding
236
+ metadata: Record<string, any>;
237
+ }>;
238
+ }
239
+
240
+ // ─── Model Registry ─────────────────────────────────────────────────
241
+
242
+ const modelMap: Record<string, ModelInfo> = { // Built-in registry: SlyOS model id -> Hugging Face repo, task and per-quantization size/RAM figures.
243
+ // LLM models (1B+)
244
+ 'quantum-1.7b': { // also the baseline id profileDevice() passes to selectQuantization()
245
+ hfModel: 'onnx-community/SmolLM2-1.7B-Instruct',
246
+ task: 'text-generation',
247
+ category: 'llm',
248
+ sizesMB: { q4: 900, q8: 1700, fp16: 3400, fp32: 6800 },
249
+ minRAM_MB: { q4: 2048, q8: 3072, fp16: 5120, fp32: 8192 },
250
+ },
251
+ 'quantum-3b': {
252
+ hfModel: 'onnx-community/Qwen2.5-3B-Instruct',
253
+ task: 'text-generation',
254
+ category: 'llm',
255
+ sizesMB: { q4: 1600, q8: 3200, fp16: 6400, fp32: 12800 },
256
+ minRAM_MB: { q4: 3072, q8: 5120, fp16: 8192, fp32: 16384 },
257
+ },
258
+ 'quantum-code-3b': {
259
+ hfModel: 'onnx-community/Qwen2.5-Coder-3B-Instruct',
260
+ task: 'text-generation',
261
+ category: 'llm',
262
+ sizesMB: { q4: 1600, q8: 3200, fp16: 6400, fp32: 12800 },
263
+ minRAM_MB: { q4: 3072, q8: 5120, fp16: 8192, fp32: 16384 },
264
+ },
265
+ 'quantum-8b': { // NOTE(review): id says 8b while the mapped repository is a 7B model — confirm naming is intentional
266
+ hfModel: 'onnx-community/Qwen2.5-7B-Instruct',
267
+ task: 'text-generation',
268
+ category: 'llm',
269
+ sizesMB: { q4: 4200, q8: 8400, fp16: 16800, fp32: 33600 },
270
+ minRAM_MB: { q4: 6144, q8: 10240, fp16: 20480, fp32: 40960 },
271
+ },
272
+ // STT models
273
+ 'voicecore-base': {
274
+ hfModel: 'onnx-community/whisper-base',
275
+ task: 'automatic-speech-recognition',
276
+ category: 'stt',
277
+ sizesMB: { q4: 40, q8: 75, fp16: 150, fp32: 300 },
278
+ minRAM_MB: { q4: 512, q8: 512, fp16: 1024, fp32: 2048 },
279
+ },
280
+ 'voicecore-small': {
281
+ hfModel: 'onnx-community/whisper-small',
282
+ task: 'automatic-speech-recognition',
283
+ category: 'stt',
284
+ sizesMB: { q4: 100, q8: 200, fp16: 400, fp32: 800 },
285
+ minRAM_MB: { q4: 1024, q8: 1024, fp16: 2048, fp32: 4096 },
286
+ },
287
+ };
288
+
289
+ // ─── Context Window Sizing ──────────────────────────────────────────
290
+
291
/**
 * Suggests a context-window size from device RAM and quantization level.
 *
 * A starting size is chosen per quantization (q4 → 1024, q8 → 2048,
 * fp16 → 4096, anything else → 8192) and then scaled by a memory tier:
 *   - 16384 MB or more: ×4, capped at 32768
 *   - 8192 MB or more:  ×2, capped at 16384
 *   - 4096 MB or more:  unchanged
 *   - otherwise:        halved (rounded down), never below 512
 *
 * NOTE(review): the previous inline comment said a smaller quant gives a
 * larger window, but the table grows with precision — confirm the intent.
 *
 * @param memoryMB Device memory in megabytes.
 * @param quant    Quantization level the model will run at.
 * @returns The recommended context-window size.
 */
function recommendContextWindow(memoryMB: number, quant: QuantizationLevel): number {
  let startingSize: number;
  switch (quant) {
    case 'q4':
      startingSize = 1024;
      break;
    case 'q8':
      startingSize = 2048;
      break;
    case 'fp16':
      startingSize = 4096;
      break;
    default:
      startingSize = 8192;
  }

  if (memoryMB >= 16384) {
    return Math.min(startingSize * 4, 32768);
  }
  if (memoryMB >= 8192) {
    return Math.min(startingSize * 2, 16384);
  }
  if (memoryMB >= 4096) {
    return startingSize;
  }
  return Math.max(512, Math.floor(startingSize / 2));
}
300
+
301
/**
 * Picks the quantization level to use for a registry model on this device.
 *
 * - Unknown model ids and every LLM resolve to 'q4'. ONNX/WASM has protobuf
 *   size limits (fp16 files over 2 GB crash on many systems), so LLMs are
 *   capped at q4; higher precision needs a native backend.
 * - STT models take the highest-quality level among fp16, q8, q4 whose
 *   `minRAM_MB` requirement is met, falling back to 'q4' when none is.
 *
 * @param memoryMB Device memory in megabytes.
 * @param modelId  Key into `modelMap`.
 */
function selectQuantization(memoryMB: number, modelId: string): QuantizationLevel {
  const entry = modelMap[modelId];
  if (!entry || entry.category === 'llm') {
    return 'q4';
  }

  // Best quality first; fp32 is intentionally never selected here.
  const qualityOrder: QuantizationLevel[] = ['fp16', 'q8', 'q4'];
  const affordable = qualityOrder.find(level => memoryMB >= entry.minRAM_MB[level]);
  return affordable ?? 'q4';
}
319
+
320
+ // ─── Context Window Detection ──────────────────────────────────────
321
+
322
/**
 * Reads a model's context-window size from its Hugging Face `config.json`.
 *
 * The fields `max_position_embeddings`, `n_positions`, `max_seq_len` and
 * `model_max_length` are checked in that order; the first one holding a
 * positive finite number wins. Non-numeric or non-positive values are
 * skipped so the function always honours its `Promise<number>` contract.
 *
 * @param hfModelId Hugging Face repository id, e.g. "org/model".
 * @returns The detected size, or 2048 when the config cannot be fetched or
 *          carries no usable value. Never rejects.
 */
async function detectContextWindowFromHF(hfModelId: string): Promise<number> {
  const DEFAULT_CONTEXT_WINDOW = 2048;

  try {
    const configUrl = `https://huggingface.co/${hfModelId}/raw/main/config.json`;
    const response = await axios.get(configUrl, { timeout: 5000 });
    const config = response.data;
    if (config === null || typeof config !== 'object') {
      return DEFAULT_CONTEXT_WINDOW;
    }

    // Different architectures publish the limit under different names.
    const candidates: unknown[] = [
      config.max_position_embeddings,
      config.n_positions,
      config.max_seq_len,
      config.model_max_length,
    ];
    for (const candidate of candidates) {
      if (typeof candidate === 'number' && Number.isFinite(candidate) && candidate > 0) {
        return candidate;
      }
    }
    return DEFAULT_CONTEXT_WINDOW;
  } catch {
    // Network failure, timeout or 404: fall back to the default.
    return DEFAULT_CONTEXT_WINDOW;
  }
}
342
+
343
+ // ─── SDK Version ────────────────────────────────────────────────────
344
// Version string reported to the API as `sdk_version` and returned by
// getSdkVersion(). It was left at '1.4.1' although this source ships in
// package version 1.5.6; keep it in sync with package.json on every release.
const SDK_VERSION = '1.5.6';
345
+
346
+ // ─── Persistent Device Identity ─────────────────────────────────────
347
+
348
/**
 * Returns the first 32 hex characters of the SHA-256 digest of `str`
 * (UTF-8 encoded).
 *
 * Web Crypto is used whenever `globalThis.crypto.subtle` exists, which covers
 * browsers, web workers and other runtimes without a `window` global. Node's
 * built-in `crypto` module is the fallback. Both paths give the same output.
 *
 * @param str Text to hash.
 * @returns A 32-character lowercase hex string.
 */
async function hashString(str: string): Promise<string> {
  // Feature-detect instead of testing `typeof window`: a missing `window`
  // does not imply that Node's `crypto` module can be imported.
  const subtle: SubtleCrypto | undefined = (globalThis as any).crypto?.subtle;

  if (subtle) {
    const digest = await subtle.digest('SHA-256', new TextEncoder().encode(str));
    const hex = Array.from(new Uint8Array(digest), byte => byte.toString(16).padStart(2, '0')).join('');
    return hex.substring(0, 32);
  }

  const nodeCrypto = await import('crypto');
  return nodeCrypto.createHash('sha256').update(str).digest('hex').substring(0, 32);
}
363
+
364
/**
 * Returns this installation's device id, creating and persisting one on
 * first use.
 *
 * - Browser (`window` defined): stored under the `slyos_device_id`
 *   localStorage key.
 * - Node.js: stored in `~/.slyos/device-id`.
 *
 * Persistence is best-effort. If storage cannot be read or written, a fresh
 * id of the form `device-<timestamp>-<random>` is returned without being
 * saved.
 */
async function getOrCreateDeviceId(): Promise<string> {
  const mintId = (): string =>
    `device-${Date.now()}-${Math.random().toString(36).slice(2, 14)}`;

  if (typeof window !== 'undefined') {
    const storageKey = 'slyos_device_id';
    try {
      const stored = localStorage.getItem(storageKey);
      if (stored) {
        return stored;
      }
    } catch {}

    const freshId = mintId();
    try {
      localStorage.setItem(storageKey, freshId);
    } catch {}
    return freshId;
  }

  try {
    const fs = await import('fs');
    const path = await import('path');
    const os = await import('os');
    const configDir = path.join(os.homedir(), '.slyos');
    const idPath = path.join(configDir, 'device-id');

    try {
      const stored = fs.readFileSync(idPath, 'utf-8').trim();
      if (stored) {
        return stored;
      }
    } catch {}

    const freshId = mintId();
    fs.mkdirSync(configDir, { recursive: true });
    fs.writeFileSync(idPath, freshId);
    return freshId;
  } catch {
    // Home directory not writable (or modules unavailable): use an unsaved id.
    return mintId();
  }
}
401
+
402
/**
 * Builds a hardware fingerprint by hashing a '|'-joined list of device traits.
 *
 * - Node.js: first CPU model, total memory, platform, architecture, CPU count.
 * - Browser: hardwareConcurrency, deviceMemory, navigator.platform, the WebGL
 *   unmasked renderer (when the debug extension is exposed) and screen size.
 *
 * Traits that cannot be read are left out rather than failing.
 *
 * @returns The value produced by `hashString` for the joined traits.
 */
async function generateDeviceFingerprint(): Promise<string> {
  const traits: string[] = [];

  if (typeof window === 'undefined') {
    try {
      const os = await import('os');
      const cpuList = os.cpus();
      traits.push(cpuList[0]?.model || 'unknown-cpu');
      traits.push(String(os.totalmem()));
      traits.push(os.platform());
      traits.push(os.arch());
      traits.push(String(cpuList.length));
    } catch {}
  } else {
    traits.push(String(navigator.hardwareConcurrency || 0));
    traits.push(String((navigator as any).deviceMemory || 0));
    traits.push(navigator.platform || 'unknown');

    // WebGL renderer string serves as the GPU trait.
    try {
      const probeCanvas = document.createElement('canvas');
      const glContext = probeCanvas.getContext('webgl') || probeCanvas.getContext('experimental-webgl') as WebGLRenderingContext | null;
      if (glContext) {
        const debugInfo = glContext.getExtension('WEBGL_debug_renderer_info');
        if (debugInfo) {
          traits.push(glContext.getParameter(debugInfo.UNMASKED_RENDERER_WEBGL) || 'unknown-gpu');
        }
      }
    } catch {}

    traits.push(String(screen.width || 0));
    traits.push(String(screen.height || 0));
  }

  return hashString(traits.join('|'));
}
437
+
438
+ // ─── Enhanced Device Profiling ──────────────────────────────────────
439
+
440
/**
 * Detects the GPU through WebGL and estimates its VRAM.
 *
 * Outside a browser, or when WebGL is unavailable or throws, the result is
 * `{ renderer: null, vramMb: 0 }`. The VRAM figure is a rough guess: an
 * explicit "<n> MB" in the renderer string wins, otherwise the first matching
 * family pattern in the table below is used, otherwise 0.
 */
function detectGPU(): { renderer: string | null; vramMb: number } {
  if (typeof window === 'undefined') {
    return { renderer: null, vramMb: 0 };
  }

  try {
    const probeCanvas = document.createElement('canvas');
    const glContext = probeCanvas.getContext('webgl') || probeCanvas.getContext('experimental-webgl') as WebGLRenderingContext | null;
    if (!glContext) {
      return { renderer: null, vramMb: 0 };
    }

    const debugInfo = glContext.getExtension('WEBGL_debug_renderer_info');
    const renderer = debugInfo ? glContext.getParameter(debugInfo.UNMASKED_RENDERER_WEBGL) : null;

    let vramMb = 0;
    if (renderer) {
      const explicitSize = renderer.match(/(\d+)\s*MB/i);
      if (explicitSize) {
        vramMb = parseInt(explicitSize[1]);
      } else {
        // Ordered: the first matching family decides the estimate.
        const familyEstimates: Array<[RegExp, number]> = [
          [/RTX\s*40/i, 8192],
          [/RTX\s*30/i, 6144],
          [/GTX/i, 4096],
          [/Apple M[2-4]/i, 8192],
          [/Apple M1/i, 4096],
          [/Intel/i, 1024],
        ];
        const matched = familyEstimates.find(([pattern]) => pattern.test(renderer));
        if (matched) {
          vramMb = matched[1];
        }
      }
    }
    return { renderer, vramMb };
  } catch {
    return { renderer: null, vramMb: 0 };
  }
}
465
+
466
/**
 * Identifies the host browser from the user-agent string.
 *
 * Without `window` or `navigator` the result is
 * `{ name: 'node', version: process.version || 'unknown' }`. Otherwise the
 * rules are tried in order (Edge, Chrome, Firefox, Safari), because Edge and
 * Chrome user agents also contain later tokens. An unmatched user agent gives
 * `{ name: 'unknown', version: '' }`.
 */
function detectBrowser(): { name: string; version: string } {
  if (typeof window === 'undefined' || typeof navigator === 'undefined') {
    return { name: 'node', version: process.version || 'unknown' };
  }

  const userAgent = navigator.userAgent;
  const rules: Array<{ name: string; present: RegExp; version: RegExp }> = [
    { name: 'Edge', present: /Edg\//i, version: /Edg\/([\d.]+)/ },
    { name: 'Chrome', present: /Chrome\//i, version: /Chrome\/([\d.]+)/ },
    { name: 'Firefox', present: /Firefox\//i, version: /Firefox\/([\d.]+)/ },
    { name: 'Safari', present: /Safari\//i, version: /Version\/([\d.]+)/ },
  ];

  for (const rule of rules) {
    if (rule.present.test(userAgent)) {
      const versionMatch = userAgent.match(rule.version);
      return { name: rule.name, version: versionMatch?.[1] || '' };
    }
  }
  return { name: 'unknown', version: '' };
}
475
+
476
/**
 * Reports the connection type from the Network Information API.
 *
 * Checks `navigator.connection` and its `moz`/`webkit` prefixed variants and
 * returns `effectiveType`, else `type`. Returns 'unknown' when `navigator`,
 * the connection object, or both fields are missing.
 */
function detectNetworkType(): string {
  if (typeof navigator === 'undefined') {
    return 'unknown';
  }
  const nav = navigator as any;
  const connection = nav.connection || nav.mozConnection || nav.webkitConnection;
  return (connection && (connection.effectiveType || connection.type)) || 'unknown';
}
482
+
483
/**
 * Measures round-trip time to `${apiUrl}/api/health`.
 *
 * A HEAD request is tried first. If it fails for any reason, one GET request
 * is tried. Each attempt has a 5 second timeout and is timed on its own.
 *
 * @param apiUrl API base URL (no trailing slash expected).
 * @returns Elapsed milliseconds of the first successful attempt, or -1 when
 *          both attempts fail. Never rejects.
 */
async function measureApiLatency(apiUrl: string): Promise<number> {
  const attempts: Array<() => Promise<unknown>> = [
    () => axios.head(`${apiUrl}/api/health`, { timeout: 5000 }),
    () => axios.get(`${apiUrl}/api/health`, { timeout: 5000 }),
  ];

  for (const attempt of attempts) {
    const startedAt = Date.now();
    try {
      await attempt();
      return Date.now() - startedAt;
    } catch {
      // Fall through to the next attempt.
    }
  }
  return -1;
}
498
+
499
+ // ─── Device Profiling ───────────────────────────────────────────────
500
+
501
/**
 * Collects a hardware/software profile of the current device.
 *
 * Defaults (4 cores, 4096 MB RAM, 10000 MB storage, os 'unknown') are used
 * for any value that cannot be read.
 *
 * - Node.js: CPU count, total memory and OS come from the `os` module. Free
 *   disk space is the "Available" column of `df -m .`, falling back to 10000
 *   when the command fails or gives no data row.
 * - Browser: values come from `navigator`. Storage is the StorageManager
 *   quota, or 5000 when the estimate call throws.
 *
 * The recommended quantization uses the 'quantum-1.7b' baseline, and the
 * context window follows from it. GPU, browser, network, timezone, screen and
 * capability flags are added afterwards. Zero or empty optional values are
 * reported as `undefined`.
 */
async function profileDevice(): Promise<DeviceProfile> {
  const isNode = typeof window === 'undefined';
  const platform: 'web' | 'nodejs' = isNode ? 'nodejs' : 'web';

  let cpuCores = 4;
  let memoryMB = 4096;
  let estimatedStorageMB = 10000;
  let os = 'unknown';

  if (!isNode) {
    // Browser environment
    cpuCores = navigator.hardwareConcurrency || 4;
    memoryMB = ((navigator as any).deviceMemory || 4) * 1024; // deviceMemory is in GB
    os = navigator.userAgent;

    // Storage Manager API (Chrome 61+)
    try {
      if (navigator.storage && navigator.storage.estimate) {
        const storageEstimate = await navigator.storage.estimate();
        estimatedStorageMB = Math.round((storageEstimate.quota || 0) / (1024 * 1024));
      }
    } catch {
      estimatedStorageMB = 5000;
    }
  } else {
    // Node.js environment. Values read before a failure are kept.
    try {
      const nodeOs = await import('os');
      cpuCores = nodeOs.cpus().length;
      memoryMB = Math.round(nodeOs.totalmem() / (1024 * 1024));
      os = `${nodeOs.platform()} ${nodeOs.release()}`;

      try {
        const { execSync } = await import('child_process');
        const dfRows = execSync('df -m . 2>/dev/null || echo "0 0 0 0"', { encoding: 'utf-8' })
          .trim()
          .split('\n');
        if (dfRows.length > 1) {
          const columns = dfRows[1].split(/\s+/);
          estimatedStorageMB = parseInt(columns[3]) || 10000; // Available column
        }
      } catch {
        estimatedStorageMB = 10000;
      }
    } catch {
      // Keep whatever defaults remain.
    }
  }

  const recommendedQuant = selectQuantization(memoryMB, 'quantum-1.7b'); // default baseline
  const maxContextWindow = recommendContextWindow(memoryMB, recommendedQuant);

  const gpu = detectGPU();
  const browser = detectBrowser();
  const networkType = detectNetworkType();
  const timezone = Intl?.DateTimeFormat?.()?.resolvedOptions?.()?.timeZone || 'unknown';

  const screenWidth = isNode ? 0 : screen?.width || 0;
  const screenHeight = isNode ? 0 : screen?.height || 0;
  const pixelRatio = isNode ? 0 : window?.devicePixelRatio || 1;

  const wasmAvailable = typeof WebAssembly !== 'undefined';
  let webgpuAvailable = false;
  if (!isNode) {
    try {
      webgpuAvailable = !!(navigator as any).gpu;
    } catch {}
  }

  return {
    cpuCores,
    memoryMB,
    estimatedStorageMB,
    platform,
    os,
    recommendedQuant,
    maxContextWindow,
    gpuRenderer: gpu.renderer || undefined,
    gpuVramMb: gpu.vramMb || undefined,
    screenWidth: screenWidth || undefined,
    screenHeight: screenHeight || undefined,
    pixelRatio: pixelRatio || undefined,
    browserName: browser.name,
    browserVersion: browser.version,
    networkType,
    timezone,
    wasmAvailable,
    webgpuAvailable,
  };
}
594
+
595
+ // ─── Main SDK Class ─────────────────────────────────────────────────
596
+
597
+ interface TelemetryEntry { // One buffered metric; batches are posted as `metrics` to /api/devices/telemetry by flushTelemetry().
598
+ latency_ms: number;
599
+ tokens_generated: number;
600
+ success: boolean;
601
+ model_id: string;
602
+ timestamp: number; // NOTE(review): presumably Date.now() at record time — producers are outside this chunk
603
+ }
604
+
605
+ class SlyOS {
606
+ private apiKey: string;
607
+ private apiUrl: string;
608
+ private deviceId: string;
609
+ private token: string | null = null;
610
+ private models: Map<string, any> = new Map();
611
+ private deviceProfile: DeviceProfile | null = null;
612
+ private onProgress: ProgressCallback | null;
613
+ private onEvent: EventCallback | null;
614
+ private fallbackConfig: FallbackConfig | null;
615
+ private modelContextWindow: number = 0;
616
+ // Telemetry batching
617
+ private telemetryBuffer: TelemetryEntry[] = [];
618
+ private telemetryFlushTimer: any = null;
619
+ private static readonly TELEMETRY_BATCH_SIZE = 10;
620
+ private static readonly TELEMETRY_FLUSH_INTERVAL = 60000; // 60 seconds
621
+
622
+ constructor(config: SlyOSConfigWithFallback) {
623
+ this.apiKey = config.apiKey;
624
+ this.apiUrl = config.apiUrl || 'https://api.slyos.world';
625
+ this.deviceId = ''; // Set asynchronously in initialize()
626
+ this.onProgress = config.onProgress || null;
627
+ this.onEvent = config.onEvent || null;
628
+ this.fallbackConfig = config.fallback || null;
629
+ }
630
+
631
+ // ── Progress & Event Helpers ────────────────────────────────────
632
+
633
+ private emitProgress(stage: ProgressEvent['stage'], progress: number, message: string, detail?: any) {
634
+ if (this.onProgress) {
635
+ this.onProgress({ stage, progress, message, detail });
636
+ }
637
+ }
638
+
639
+ private emitEvent(type: SlyEvent['type'], data?: any) {
640
+ if (this.onEvent) {
641
+ this.onEvent({ type, data, timestamp: Date.now() });
642
+ }
643
+ }
644
+
645
+ // ── Telemetry Batching ─────────────────────────────────────────
646
+
647
+ private recordTelemetry(entry: TelemetryEntry) {
648
+ this.telemetryBuffer.push(entry);
649
+ if (this.telemetryBuffer.length >= SlyOS.TELEMETRY_BATCH_SIZE) {
650
+ this.flushTelemetry();
651
+ } else if (!this.telemetryFlushTimer) {
652
+ this.telemetryFlushTimer = setTimeout(() => this.flushTelemetry(), SlyOS.TELEMETRY_FLUSH_INTERVAL);
653
+ }
654
+ }
655
+
656
+ private async flushTelemetry() {
657
+ if (this.telemetryFlushTimer) {
658
+ clearTimeout(this.telemetryFlushTimer);
659
+ this.telemetryFlushTimer = null;
660
+ }
661
+ if (this.telemetryBuffer.length === 0 || !this.token) return;
662
+
663
+ const batch = [...this.telemetryBuffer];
664
+ this.telemetryBuffer = [];
665
+
666
+ try {
667
+ await axios.post(`${this.apiUrl}/api/devices/telemetry`, {
668
+ device_id: this.deviceId,
669
+ metrics: batch,
670
+ }, {
671
+ headers: { Authorization: `Bearer ${this.token}` },
672
+ timeout: 10000,
673
+ });
674
+ this.emitEvent('telemetry_flushed', { count: batch.length });
675
+ } catch {
676
+ // Put back on failure for next attempt
677
+ this.telemetryBuffer.unshift(...batch);
678
+ // Cap buffer to prevent memory leak
679
+ if (this.telemetryBuffer.length > 100) {
680
+ this.telemetryBuffer = this.telemetryBuffer.slice(-100);
681
+ }
682
+ }
683
+ }
684
+
685
+ // ── Device Analysis ─────────────────────────────────────────────
686
+
687
+ async analyzeDevice(): Promise<DeviceProfile> {
688
+ try {
689
+ this.emitProgress('profiling', 10, 'Analyzing device capabilities...');
690
+ this.deviceProfile = await profileDevice();
691
+ this.emitProgress('profiling', 100, `Device: ${this.deviceProfile.cpuCores} cores, ${Math.round(this.deviceProfile.memoryMB / 1024 * 10) / 10}GB RAM`);
692
+ this.emitEvent('device_profiled', this.deviceProfile);
693
+ return this.deviceProfile;
694
+ } catch (err: any) {
695
+ this.emitEvent('error', { method: 'analyzeDevice', error: err.message });
696
+ throw new Error(`Device analysis failed: ${err.message}`);
697
+ }
698
+ }
699
+
700
+ getDeviceProfile(): DeviceProfile | null { // profile cached by analyzeDevice()/initialize(); null until one of them has run
700
+ return this.deviceProfile;
701
+ }
702
+
703
+ getModelContextWindow(): number { // starts at 0; the code that assigns it is outside this chunk
704
+ return this.modelContextWindow;
705
+ }
706
+
707
+ getDeviceId(): string { // empty string until initialize() has resolved the persistent device id
708
+ return this.deviceId;
709
+ }
710
+
711
+ getSdkVersion(): string { // the module-level SDK_VERSION constant, also sent as `sdk_version` at registration
712
+ return SDK_VERSION;
713
+ }
715
+
716
+ // Flush remaining telemetry and clean up timers
717
+ async destroy(): Promise<void> {
718
+ await this.flushTelemetry();
719
+ if (this.telemetryFlushTimer) {
720
+ clearTimeout(this.telemetryFlushTimer);
721
+ this.telemetryFlushTimer = null;
722
+ }
723
+ }
724
+
725
+ // ── Smart Model Recommendation ──────────────────────────────────
726
+
727
+ recommendModel(category: ModelCategory = 'llm'): { modelId: string; quant: QuantizationLevel; contextWindow: number; reason: string } | null {
728
+ if (!this.deviceProfile) {
729
+ throw new Error('Call analyzeDevice() first to get a recommendation.');
730
+ }
731
+
732
+ const mem = this.deviceProfile.memoryMB;
733
+ const candidates = Object.entries(modelMap).filter(([_, info]) => info.category === category);
734
+
735
+ // Sort by size descending — pick the biggest model that fits
736
+ for (const [id, info] of candidates.sort((a, b) => b[1].sizesMB.q4 - a[1].sizesMB.q4)) {
737
+ const quant = selectQuantization(mem, id);
738
+ if (mem >= info.minRAM_MB[quant]) {
739
+ const ctx = recommendContextWindow(mem, quant);
740
+ return {
741
+ modelId: id,
742
+ quant,
743
+ contextWindow: ctx,
744
+ reason: `Best model for ${Math.round(mem / 1024)}GB RAM at ${quant.toUpperCase()} precision`,
745
+ };
746
+ }
747
+ }
748
+
749
+ // Fallback to smallest
750
+ const smallest = candidates.sort((a, b) => a[1].sizesMB.q4 - b[1].sizesMB.q4)[0];
751
+ if (smallest) {
752
+ return {
753
+ modelId: smallest[0],
754
+ quant: 'q4',
755
+ contextWindow: 512,
756
+ reason: 'Limited device memory — using smallest available model at Q4',
757
+ };
758
+ }
759
+
760
+ return null;
761
+ }
762
+
763
+ // ── Initialize ──────────────────────────────────────────────────
764
+
765
+ async initialize(): Promise<DeviceProfile> {
766
+ this.emitProgress('initializing', 0, 'Starting SlyOS...');
767
+
768
+ // Step 1: Persistent device ID
769
+ this.deviceId = await getOrCreateDeviceId();
770
+
771
+ // Step 2: Profile device (enhanced)
772
+ this.emitProgress('profiling', 5, 'Detecting device capabilities...');
773
+ this.deviceProfile = await profileDevice();
774
+
775
+ // Step 2b: Generate device fingerprint
776
+ this.deviceProfile.deviceFingerprint = await generateDeviceFingerprint();
777
+
778
+ this.emitProgress('profiling', 20, `Detected: ${this.deviceProfile.cpuCores} CPU cores, ${Math.round(this.deviceProfile.memoryMB / 1024 * 10) / 10}GB RAM${this.deviceProfile.gpuRenderer ? ', GPU: ' + this.deviceProfile.gpuRenderer.substring(0, 30) : ''}`);
779
+ this.emitEvent('device_profiled', this.deviceProfile);
780
+
781
+ // Step 3: Authenticate
782
+ this.emitProgress('initializing', 40, 'Authenticating with API key...');
783
+ try {
784
+ const authRes = await axios.post(`${this.apiUrl}/api/auth/sdk`, {
785
+ apiKey: this.apiKey,
786
+ });
787
+ this.token = authRes.data.token;
788
+ this.emitProgress('initializing', 60, 'Authenticated successfully');
789
+ this.emitEvent('auth', { success: true });
790
+ } catch (err: any) {
791
+ this.emitProgress('error', 0, `Authentication failed: ${err.message}`);
792
+ this.emitEvent('error', { stage: 'auth', error: err.message });
793
+ throw new Error(`SlyOS auth failed: ${err.response?.data?.error || err.message}`);
794
+ }
795
+
796
+ // Step 4: Measure API latency
797
+ const latency = await measureApiLatency(this.apiUrl);
798
+ if (latency > 0) this.deviceProfile.latencyToApiMs = latency;
799
+
800
+ // Step 5: Register device with full intelligence profile
801
+ this.emitProgress('initializing', 70, 'Registering device...');
802
+ try {
803
+ // Determine supported quantizations based on memory
804
+ const mem = this.deviceProfile.memoryMB;
805
+ const supportedQuants: string[] = ['q4'];
806
+ if (mem >= 4096) supportedQuants.push('q8');
807
+ if (mem >= 8192) supportedQuants.push('fp16');
808
+ if (mem >= 16384) supportedQuants.push('fp32');
809
+
810
+ // Determine recommended tier
811
+ let recommendedTier = 1;
812
+ if (mem >= 8192 && this.deviceProfile.cpuCores >= 4) recommendedTier = 2;
813
+ if (mem >= 16384 && this.deviceProfile.cpuCores >= 8) recommendedTier = 3;
814
+
815
+ await axios.post(`${this.apiUrl}/api/devices/register`, {
816
+ device_id: this.deviceId,
817
+ device_fingerprint: this.deviceProfile.deviceFingerprint,
818
+ platform: this.deviceProfile.platform,
819
+ os_version: this.deviceProfile.os,
820
+ total_memory_mb: this.deviceProfile.memoryMB,
821
+ cpu_cores: this.deviceProfile.cpuCores,
822
+ // Enhanced fields
823
+ gpu_renderer: this.deviceProfile.gpuRenderer || null,
824
+ gpu_vram_mb: this.deviceProfile.gpuVramMb || null,
825
+ screen_width: this.deviceProfile.screenWidth || null,
826
+ screen_height: this.deviceProfile.screenHeight || null,
827
+ pixel_ratio: this.deviceProfile.pixelRatio || null,
828
+ browser_name: this.deviceProfile.browserName || null,
829
+ browser_version: this.deviceProfile.browserVersion || null,
830
+ sdk_version: SDK_VERSION,
831
+ network_type: this.deviceProfile.networkType || null,
832
+ latency_to_api_ms: this.deviceProfile.latencyToApiMs || null,
833
+ timezone: this.deviceProfile.timezone || null,
834
+ // Capabilities
835
+ wasm_available: this.deviceProfile.wasmAvailable || false,
836
+ webgpu_available: this.deviceProfile.webgpuAvailable || false,
837
+ supported_quants: supportedQuants,
838
+ recommended_tier: recommendedTier,
839
+ }, {
840
+ headers: { Authorization: `Bearer ${this.token}` },
841
+ });
842
+ this.emitProgress('initializing', 90, 'Device registered');
843
+ this.emitEvent('device_registered', { deviceId: this.deviceId, fingerprint: this.deviceProfile.deviceFingerprint });
844
+ } catch (err: any) {
845
+ // Non-fatal — device registration shouldn't block usage
846
+ this.emitProgress('initializing', 90, 'Device registration skipped (non-fatal)');
847
+ }
848
+
849
+ // Step 6: Start telemetry flush timer
850
+ this.telemetryFlushTimer = setTimeout(() => this.flushTelemetry(), SlyOS.TELEMETRY_FLUSH_INTERVAL);
851
+
852
+ this.emitProgress('ready', 100, `SlyOS v${SDK_VERSION} ready — ${this.deviceProfile.recommendedQuant.toUpperCase()}, ${this.deviceProfile.gpuRenderer ? 'GPU detected' : 'CPU only'}`);
853
+
854
+ return this.deviceProfile;
855
+ }
856
+
857
+ // ── Model Loading ───────────────────────────────────────────────
858
+
859
+ getAvailableModels(): Record<string, { models: { id: string; sizesMB: Record<string, number>; minRAM_MB: Record<string, number> }[] }> {
860
+ const grouped: Record<string, any[]> = { llm: [], stt: [] };
861
+ for (const [id, info] of Object.entries(modelMap)) {
862
+ if (!grouped[info.category]) grouped[info.category] = [];
863
+ grouped[info.category].push({
864
+ id,
865
+ sizesMB: info.sizesMB,
866
+ minRAM_MB: info.minRAM_MB,
867
+ });
868
+ }
869
+ return Object.fromEntries(
870
+ Object.entries(grouped).map(([cat, models]) => [cat, { models }])
871
+ );
872
+ }
873
+
874
/**
 * Searches the Hugging Face Hub for ONNX models matching `query`, most
 * downloaded first.
 *
 * @param query   Free-text search string.
 * @param options `limit` caps the number of results (default 20); `task`
 *                adds a second tag filter next to `onnx`.
 * @returns One summary entry per model returned by the Hub.
 * @throws Error prefixed with "Model search failed:" when the request fails;
 *         an `error` event with stage `model_search` is emitted first.
 */
async searchModels(query: string, options?: { limit?: number; task?: string }): Promise<Array<{
  id: string;
  name: string;
  downloads: number;
  likes: number;
  task: string;
  size_category: string;
}>> {
  try {
    const limit = options?.limit || 20;

    const params = new URLSearchParams({
      search: query,
      sort: 'downloads',
      direction: '-1',
      limit: String(limit),
    });
    // Tag filters are sent as repeated `filter` parameters (the form the
    // official Hub client emits) instead of a JSON-encoded array string.
    params.append('filter', 'onnx'); // ONNX models only
    if (options?.task) {
      params.append('filter', options.task);
    }

    const url = `https://huggingface.co/api/models?${params.toString()}`;
    const response = await axios.get(url, { timeout: 10000 });
    const models = Array.isArray(response.data) ? response.data : [];

    return models.map((model: any) => {
      // Tolerate entries without an `id` rather than failing the whole search.
      const id = String(model.id ?? model.modelId ?? '');
      return {
        id,
        name: id.split('/')[1] || id,
        downloads: model.downloads || 0,
        likes: model.likes || 0,
        // The Hub reports a model's task as `pipeline_tag`; `task` is kept as
        // a secondary lookup for backward compatibility.
        task: model.pipeline_tag || model.task || 'unknown',
        size_category: model.size_category || 'unknown',
      };
    });
  } catch (error: any) {
    this.emitEvent('error', { stage: 'model_search', error: error.message });
    throw new Error(`Model search failed: ${error.message}`);
  }
}
908
+
909
/**
 * Checks whether a curated model fits in this device's memory.
 *
 * @param modelId Key into `modelMap`.
 * @param quant   Optional precision to test; when omitted only the Q4
 *                minimum is checked.
 * @returns `canRun`, a human-readable `reason`, and the precision suggested
 *          for this device. Unknown models report `canRun: false`; a device
 *          that has not been profiled yet reports `canRun: true` with a hint
 *          to call `initialize()`.
 */
canRunModel(modelId: string, quant?: QuantizationLevel): { canRun: boolean; reason: string; recommendedQuant: QuantizationLevel } {
  const info = modelMap[modelId];
  if (!info) return { canRun: false, reason: `Unknown model "${modelId}"`, recommendedQuant: 'q4' };
  if (!this.deviceProfile) return { canRun: true, reason: 'Device not profiled yet — call initialize() first', recommendedQuant: 'q4' };

  const mem = this.deviceProfile.memoryMB;

  // Check the Q4 floor first. Otherwise a request for a specific precision on
  // a device that cannot even hold Q4 would be told to "try" another
  // precision that cannot run either.
  if (mem < info.minRAM_MB.q4) {
    return {
      canRun: false,
      reason: `Model requires at least ${info.minRAM_MB.q4}MB RAM even at Q4. Device has ${mem}MB.`,
      recommendedQuant: 'q4',
    };
  }

  const bestQuant = selectQuantization(mem, modelId);

  if (quant && mem < info.minRAM_MB[quant]) {
    return {
      canRun: false,
      reason: `Not enough RAM for ${quant.toUpperCase()} (need ${info.minRAM_MB[quant]}MB, have ${mem}MB). Try ${bestQuant.toUpperCase()} instead.`,
      recommendedQuant: bestQuant,
    };
  }

  return { canRun: true, reason: `OK at ${bestQuant.toUpperCase()} precision`, recommendedQuant: bestQuant };
}
935
+
936
/**
 * Downloads and initializes a model pipeline and caches it in `this.models`.
 *
 * `modelId` is either a curated key of `modelMap` or, when not found there,
 * a Hugging Face repo id loaded as a `text-generation` pipeline.
 *
 * @param modelId Curated model key or Hugging Face repo id.
 * @param options `quant` selects the precision handed to the pipeline. When
 *                omitted, Q4 is loaded (the stability default).
 * @throws Error when a curated model fails the `canRunModel` check, or when
 *         the pipeline fails to load (the underlying error is rethrown after
 *         progress/event/telemetry reporting).
 */
async loadModel(modelId: string, options?: { quant?: QuantizationLevel }): Promise<void> {
  const info = modelMap[modelId];

  // Precision actually passed to the pipeline. An explicit request is
  // honored; otherwise Q4 is used. Everything reported below (feasibility
  // check, size estimate, events, cache entry, telemetry) uses this same
  // value so it always describes what was really loaded.
  const quant: QuantizationLevel = options?.quant ?? 'q4';

  let hfModelId: string;
  let task: string;
  let estimatedSize: number;

  if (info) {
    // Curated model: repo id and task come from the catalog.
    hfModelId = info.hfModel;
    task = info.task;

    // Check feasibility for the precision that will be loaded.
    const check = this.canRunModel(modelId, quant);
    if (!check.canRun) {
      this.emitProgress('error', 0, check.reason);
      throw new Error(check.reason);
    }

    estimatedSize = info.sizesMB[quant];
    this.emitProgress('downloading', 0, `Downloading ${modelId} (${quant.toUpperCase()}, ~${estimatedSize}MB)...`);
    this.emitEvent('model_download_start', { modelId, quant, estimatedSizeMB: estimatedSize });
  } else {
    // Custom Hugging Face model: no catalog metadata is available.
    hfModelId = modelId;
    task = 'text-generation'; // Default task
    estimatedSize = 2048; // Default estimate

    this.emitProgress('downloading', 0, `Loading custom HuggingFace model: ${modelId}...`);
    this.emitEvent('model_download_start', { modelId, custom: true, estimatedSizeMB: estimatedSize });
  }

  // Map quant to dtype for HuggingFace
  const dtypeMap: Record<QuantizationLevel, string> = {
    q4: 'q4',
    q8: 'q8',
    fp16: 'fp16',
    fp32: 'fp32',
  };

  let lastReportedPercent = 0;
  const startTime = Date.now();

  try {
    // For custom HF models, detect context window
    let detectedContextWindow = 2048;
    if (!info) {
      detectedContextWindow = await detectContextWindowFromHF(hfModelId);
    }

    const pipe = await pipeline(task as any, hfModelId, {
      device: 'cpu',
      dtype: dtypeMap[quant] as any,
      progress_callback: (progressData: any) => {
        // HuggingFace transformers sends progress events during download
        if (progressData && typeof progressData === 'object') {
          let percent = 0;
          let msg = 'Downloading...';

          if (progressData.status === 'progress' && progressData.progress !== undefined) {
            percent = Math.round(progressData.progress);
            const loaded = progressData.loaded ? `${Math.round(progressData.loaded / 1024 / 1024)}MB` : '';
            const total = progressData.total ? `${Math.round(progressData.total / 1024 / 1024)}MB` : '';
            msg = loaded && total ? `Downloading: ${loaded} / ${total}` : `Downloading: ${percent}%`;
          } else if (progressData.status === 'done') {
            percent = 100;
            msg = progressData.file ? `Downloaded ${progressData.file}` : 'Download complete';
          } else if (progressData.status === 'initiate') {
            msg = progressData.file ? `Starting download: ${progressData.file}` : 'Initiating download...';
          }

          // Only emit if progress meaningfully changed (avoid flooding)
          if (percent !== lastReportedPercent || progressData.status === 'done' || progressData.status === 'initiate') {
            lastReportedPercent = percent;
            this.emitProgress('downloading', percent, msg, progressData);
            this.emitEvent('model_download_progress', { modelId, percent, ...progressData });
          }
        }
      },
    });

    const loadTime = Date.now() - startTime;
    let contextWindow: number;

    if (info) {
      // Curated models: size the context window for the loaded precision.
      contextWindow = this.deviceProfile
        ? recommendContextWindow(this.deviceProfile.memoryMB, quant)
        : 2048;
    } else {
      // Custom HF models: use the detected context window.
      contextWindow = detectedContextWindow;
    }

    this.modelContextWindow = contextWindow;
    // `info` is undefined for custom models; consumers must tolerate that.
    this.models.set(modelId, { pipe, info, quant, contextWindow });

    this.emitProgress('ready', 100, `${modelId} loaded (${quant}, ${(loadTime / 1000).toFixed(1)}s, ctx: ${contextWindow})`);
    this.emitEvent('model_loaded', { modelId, quant, loadTimeMs: loadTime, contextWindow });

    // Telemetry (best-effort: failures are ignored)
    if (this.token) {
      await axios.post(`${this.apiUrl}/api/telemetry`, {
        device_id: this.deviceId,
        event_type: 'model_load',
        model_id: modelId,
        success: true,
        metadata: { quant, loadTimeMs: loadTime, contextWindow, custom: !info },
      }, {
        headers: { Authorization: `Bearer ${this.token}` },
      }).catch(() => {});
    }
  } catch (error: any) {
    this.emitProgress('error', 0, `Failed to load ${modelId}: ${error.message}`);
    this.emitEvent('error', { stage: 'model_load', modelId, error: error.message });

    if (this.token) {
      await axios.post(`${this.apiUrl}/api/telemetry`, {
        device_id: this.deviceId,
        event_type: 'model_load',
        model_id: modelId,
        success: false,
        error_message: error.message,
      }, {
        headers: { Authorization: `Bearer ${this.token}` },
      }).catch(() => {});
    }
    throw error;
  }
}
1073
+
1074
+ // ── Inference: Generate ─────────────────────────────────────────
1075
+
1076
/**
 * Generates text with an LLM, loading the model on first use.
 *
 * @param modelId Curated model key or custom Hugging Face repo id.
 * @param prompt  Raw prompt string, or a chat-style messages array.
 * @param options Sampling options. `maxTokens` defaults to 100 and is capped
 *                at the model's context window. Falsy `temperature`/`topP`
 *                values (including 0) fall back to 0.7 / 0.9.
 * @returns Only the newly generated text: the assistant reply for message
 *          input, or the output with the echoed prompt stripped for string
 *          input.
 * @throws Error when the model cannot be loaded, is a curated non-LLM model,
 *         or the pipeline call fails (rethrown after reporting).
 */
async generate(modelId: string, prompt: string | Array<{role: string; content: string}>, options: GenerateOptions = {}): Promise<string> {
  if (!this.models.has(modelId)) {
    await this.loadModel(modelId);
  }

  const loaded = this.models.get(modelId);
  if (!loaded) {
    throw new Error(`Model "${modelId}" failed to load. Check your connection and model ID.`);
  }
  const { pipe, info, contextWindow } = loaded;
  // Custom Hugging Face models are cached without catalog metadata (`info`
  // is undefined) and are always loaded as text-generation pipelines, so
  // only curated models can fail this category check.
  if (info && info.category !== 'llm') {
    throw new Error(`Model "${modelId}" is not an LLM. Use transcribe() for STT models.`);
  }

  const maxTokens = Math.min(options.maxTokens || 100, contextWindow || 2048);
  const isMessages = Array.isArray(prompt);

  this.emitProgress('generating', 0, `Generating response (max ${maxTokens} tokens)...`);
  this.emitEvent('inference_start', { modelId, maxTokens });
  const startTime = Date.now();

  try {
    const result = await pipe(prompt, {
      max_new_tokens: maxTokens,
      temperature: options.temperature || 0.7,
      top_p: options.topP || 0.9,
      do_sample: true,
      repetition_penalty: 1.1,
    });

    let response: string;
    if (isMessages) {
      // When using messages format, the pipeline returns the assistant's reply
      // in the last message of the generated conversation
      const generated = result[0].generated_text;
      if (Array.isArray(generated)) {
        // Transformers.js returns messages array — extract assistant reply
        const assistantMsg = generated.filter((m: any) => m.role === 'assistant').pop();
        response = assistantMsg?.content?.trim() || '';
      } else {
        response = typeof generated === 'string' ? generated.trim() : '';
      }
    } else {
      const rawOutput = result[0].generated_text;
      // HuggingFace transformers returns the prompt + generated text concatenated.
      // Strip the original prompt so we only return the NEW tokens.
      response = (typeof rawOutput === 'string' && rawOutput.startsWith(prompt as string))
        ? rawOutput.slice((prompt as string).length).trim()
        : (typeof rawOutput === 'string' ? rawOutput.trim() : '');
    }
    const latency = Date.now() - startTime;
    // Whitespace-delimited word count used as a token estimate. Empty pieces
    // are dropped so an empty reply counts as 0 (same rule as generateStream).
    const tokensGenerated = response.split(/\s+/).filter(Boolean).length;
    // Guard against a zero-millisecond latency producing Infinity/NaN.
    const tokensPerSec = latency > 0 ? (tokensGenerated / (latency / 1000)).toFixed(1) : '0.0';

    this.emitProgress('ready', 100, `Generated ${tokensGenerated} tokens in ${(latency / 1000).toFixed(1)}s (${tokensPerSec} tok/s)`);
    this.emitEvent('inference_complete', { modelId, latencyMs: latency, tokensGenerated, tokensPerSec: parseFloat(tokensPerSec) });

    // Batch telemetry (new device intelligence)
    this.recordTelemetry({
      latency_ms: latency,
      tokens_generated: tokensGenerated,
      success: true,
      model_id: modelId,
      timestamp: Date.now(),
    });

    // Legacy telemetry (backwards compatible); failures are ignored
    if (this.token) {
      await axios.post(`${this.apiUrl}/api/telemetry`, {
        device_id: this.deviceId,
        event_type: 'inference',
        model_id: modelId,
        latency_ms: latency,
        tokens_generated: tokensGenerated,
        success: true,
      }, {
        headers: { Authorization: `Bearer ${this.token}` },
      }).catch(() => {});
    }

    return response;
  } catch (error: any) {
    this.emitProgress('error', 0, `Generation failed: ${error.message}`);
    this.emitEvent('error', { stage: 'inference', modelId, error: error.message });

    // Batch telemetry (failure)
    this.recordTelemetry({
      latency_ms: 0,
      tokens_generated: 0,
      success: false,
      model_id: modelId,
      timestamp: Date.now(),
    });

    if (this.token) {
      await axios.post(`${this.apiUrl}/api/telemetry`, {
        device_id: this.deviceId,
        event_type: 'inference',
        model_id: modelId,
        success: false,
        error_message: error.message,
      }, {
        headers: { Authorization: `Bearer ${this.token}` },
      }).catch(() => {});
    }
    throw error;
  }
}
1184
+
1185
/**
 * Stream text generation token-by-token.
 *
 * Calls `options.onToken` (and emits a `token` event) for each new piece of
 * text reported by the pipeline callback, then resolves with the final text
 * and timing figures.
 *
 * @param modelId Curated model key or custom Hugging Face repo id.
 * @param prompt  Raw prompt string, or a chat-style messages array.
 * @param options Sampling options plus an optional `onToken(token, partial)`
 *                callback. `maxTokens` defaults to 100 and is capped at the
 *                model's context window.
 * @returns Final text, time to the first callback (or total time when no
 *          callback fired), total time, and a whitespace-based token count.
 * @throws Error when the model is not loaded, is a curated non-LLM model, or
 *         the pipeline call fails (rethrown after reporting).
 */
async generateStream(
  modelId: string,
  prompt: string | Array<{role: string; content: string}>,
  options: GenerateOptions & { onToken?: (token: string, partial: string) => void } = {}
): Promise<{ text: string; firstTokenMs: number; totalMs: number; tokensGenerated: number }> {
  if (!this.models.has(modelId)) {
    await this.loadModel(modelId);
  }
  const loaded = this.models.get(modelId);
  if (!loaded) throw new Error(`Model "${modelId}" not loaded`);
  const { pipe, info, contextWindow } = loaded;
  // Custom Hugging Face models are cached without catalog metadata (`info`
  // is undefined) and are loaded as text-generation pipelines, so only
  // curated models can fail this check.
  if (info && info.category !== 'llm') throw new Error(`Not an LLM`);

  const maxTokens = Math.min(options.maxTokens || 100, contextWindow || 2048);
  const isMessages = Array.isArray(prompt);
  const startTime = Date.now();
  let firstTokenTime = 0;
  let accumulated = '';
  let prevText = '';

  this.emitProgress('generating', 0, `Streaming (max ${maxTokens} tokens)...`);
  // Mirror generate(): every inference_complete/error has a matching start.
  this.emitEvent('inference_start', { modelId, maxTokens });

  try {
    const result = await pipe(prompt, {
      max_new_tokens: maxTokens,
      temperature: options.temperature || 0.7,
      top_p: options.topP || 0.9,
      do_sample: true,
      repetition_penalty: 1.1,
      // NOTE(review): Transformers.js v3 documents streaming through a
      // `streamer` (TextStreamer) option — confirm that the installed
      // version still invokes `callback_function`; if it does not, no
      // tokens are streamed and only the final result is returned.
      callback_function: (output: any) => {
        if (!firstTokenTime) firstTokenTime = Date.now() - startTime;

        // The callback payload is handled in two shapes:
        //   1. a string (decoded text so far)
        //   2. an object exposing a `text` property
        // Anything else (e.g. raw token ids) is ignored.
        let tokenText = '';
        if (typeof output === 'string') {
          tokenText = output;
        } else if (output && typeof output === 'object') {
          if (output.text) tokenText = output.text;
        }

        if (tokenText && tokenText !== prevText) {
          // If the payload is cumulative, emit only the new suffix.
          const newPart = tokenText.startsWith(prevText) ? tokenText.slice(prevText.length) : tokenText;
          prevText = tokenText;
          if (newPart) {
            accumulated += newPart;
            options.onToken?.(newPart, accumulated);
            this.emitEvent('token', { token: newPart, partial: accumulated });
          }
        }
      }
    });

    let response: string;
    if (isMessages) {
      const generated = result[0].generated_text;
      if (Array.isArray(generated)) {
        const assistantMsg = generated.filter((m: any) => m.role === 'assistant').pop();
        response = assistantMsg?.content?.trim() || '';
      } else {
        response = typeof generated === 'string' ? generated.trim() : '';
      }
    } else {
      const rawOutput = result[0].generated_text;
      // Strip the echoed prompt so only the newly generated text is returned.
      response = (typeof rawOutput === 'string' && rawOutput.startsWith(prompt as string))
        ? rawOutput.slice((prompt as string).length).trim()
        : (typeof rawOutput === 'string' ? rawOutput.trim() : '');
    }

    // No callback fired: fall back to the total time as first-token time.
    if (!firstTokenTime) firstTokenTime = Date.now() - startTime;
    const totalMs = Date.now() - startTime;
    const tokensGenerated = response.split(/\s+/).filter(Boolean).length;

    this.emitProgress('ready', 100, `Streamed ${tokensGenerated} tokens in ${(totalMs/1000).toFixed(1)}s`);
    this.emitEvent('inference_complete', { modelId, latencyMs: totalMs, tokensGenerated });

    // Batch telemetry (device intelligence)
    this.recordTelemetry({
      latency_ms: totalMs,
      tokens_generated: tokensGenerated,
      success: true,
      model_id: modelId,
      timestamp: Date.now(),
    });

    // Legacy telemetry — updates analytics_daily for dashboard counts
    if (this.token) {
      await axios.post(`${this.apiUrl}/api/telemetry`, {
        device_id: this.deviceId,
        event_type: 'inference',
        model_id: modelId,
        latency_ms: totalMs,
        tokens_generated: tokensGenerated,
        success: true,
      }, {
        headers: { Authorization: `Bearer ${this.token}` },
      }).catch(() => {});
    }

    return { text: response, firstTokenMs: firstTokenTime, totalMs, tokensGenerated };
  } catch (error: any) {
    this.emitProgress('error', 0, `Stream failed: ${error.message}`);
    // Report failures the same way generate() does so listeners and batch
    // telemetry also see streaming errors.
    this.emitEvent('error', { stage: 'inference', modelId, error: error.message });
    this.recordTelemetry({
      latency_ms: 0,
      tokens_generated: 0,
      success: false,
      model_id: modelId,
      timestamp: Date.now(),
    });
    throw error;
  }
}
1300
+
1301
+ // ── Inference: Transcribe ───────────────────────────────────────
1302
+
1303
/**
 * Transcribes audio with a speech-to-text model, loading it on first use.
 *
 * @param modelId    Curated STT model key.
 * @param audioInput Audio passed straight through to the pipeline.
 * @param options    `language` (default 'en') and `returnTimestamps`
 *                   (default false).
 * @returns The `text` field of the pipeline result.
 * @throws Error when the model cannot be loaded, is not a curated STT model,
 *         or the pipeline call fails (rethrown after reporting).
 */
async transcribe(modelId: string, audioInput: any, options: TranscribeOptions = {}): Promise<string> {
  if (!this.models.has(modelId)) {
    await this.loadModel(modelId);
  }

  const loaded = this.models.get(modelId);
  if (!loaded) {
    throw new Error(`Model "${modelId}" failed to load. Check your connection and model ID.`);
  }
  const { pipe, info } = loaded;
  // Custom Hugging Face models are cached without catalog metadata (`info`
  // is undefined) and are loaded as text-generation pipelines. Reject them
  // with the intended message instead of a TypeError on `info.category`.
  if (!info || info.category !== 'stt') {
    throw new Error(`Model "${modelId}" is not an STT model. Use generate() for LLMs.`);
  }

  this.emitProgress('transcribing', 0, 'Transcribing audio...');
  this.emitEvent('inference_start', { modelId, type: 'transcription' });
  const startTime = Date.now();

  try {
    const result = await pipe(audioInput, {
      language: options.language || 'en',
      return_timestamps: options.returnTimestamps || false,
    });

    const text = result.text;
    const latency = Date.now() - startTime;

    this.emitProgress('ready', 100, `Transcribed in ${(latency / 1000).toFixed(1)}s`);
    this.emitEvent('inference_complete', { modelId, latencyMs: latency, type: 'transcription' });

    // Telemetry is best-effort: failures are ignored.
    if (this.token) {
      await axios.post(`${this.apiUrl}/api/telemetry`, {
        device_id: this.deviceId,
        event_type: 'inference',
        model_id: modelId,
        latency_ms: latency,
        success: true,
      }, {
        headers: { Authorization: `Bearer ${this.token}` },
      }).catch(() => {});
    }

    return text;
  } catch (error: any) {
    this.emitProgress('error', 0, `Transcription failed: ${error.message}`);
    this.emitEvent('error', { stage: 'transcription', modelId, error: error.message });

    if (this.token) {
      await axios.post(`${this.apiUrl}/api/telemetry`, {
        device_id: this.deviceId,
        event_type: 'inference',
        model_id: modelId,
        success: false,
        error_message: error.message,
      }, {
        headers: { Authorization: `Bearer ${this.token}` },
      }).catch(() => {});
    }
    throw error;
  }
}
1364
+
1365
+ // ── OpenAI Compatibility ────────────────────────────────────────────
1366
+
1367
/**
 * OpenAI-compatible chat completion served by a local model.
 *
 * The messages are passed to `generate()` as-is and the reply is wrapped in
 * an OpenAI-style response. If anything in the local path throws and a
 * fallback provider is configured, the request is forwarded to that
 * provider instead.
 *
 * @param modelId Local model to run.
 * @param request OpenAI-style request; `messages`, `temperature`,
 *                `max_tokens` and `top_p` are used locally.
 * @returns An OpenAI-style response with a single `stop` choice and
 *          estimated token usage (~4 characters per token).
 * @throws The local error when no fallback provider is configured.
 */
async chatCompletion(modelId: string, request: OpenAIChatCompletionRequest): Promise<OpenAIChatCompletionResponse> {
  try {
    // Pass messages directly to generate() — Transformers.js v3 applies the model's
    // chat template automatically, which produces much better results than raw text
    const messages = request.messages.map(msg => ({
      role: msg.role,
      content: msg.content,
    }));

    const response = await this.generate(modelId, messages, {
      temperature: request.temperature,
      maxTokens: request.max_tokens,
      topP: request.top_p,
    });

    // Estimate token counts (rough approximation: ~4 chars per token).
    // The prompt size is the combined length of all message contents. The
    // previous code read an undeclared `prompt` identifier here, which
    // resolves to the browser's global `prompt()` function or throws a
    // ReferenceError where no such global exists.
    const promptChars = messages.reduce(
      (total, msg) => total + (typeof msg.content === 'string' ? msg.content.length : 0),
      0
    );
    const promptTokens = Math.ceil(promptChars / 4);
    const completionTokens = Math.ceil(response.length / 4);

    return {
      id: `chat-${Date.now()}-${Math.random().toString(36).slice(2, 11)}`,
      object: 'chat.completion',
      created: Math.floor(Date.now() / 1000),
      model: modelId,
      choices: [
        {
          index: 0,
          message: {
            role: 'assistant',
            content: response,
          },
          finish_reason: 'stop',
        },
      ],
      usage: {
        prompt_tokens: promptTokens,
        completion_tokens: completionTokens,
        total_tokens: promptTokens + completionTokens,
      },
    };
  } catch (error: any) {
    // Fallback to cloud provider if configured
    if (this.fallbackConfig?.provider === 'openai') {
      return this.fallbackToOpenAI(modelId, request);
    } else if (this.fallbackConfig?.provider === 'bedrock') {
      return this.fallbackToBedrock(modelId, request);
    }
    throw error;
  }
}
1417
+
1418
+ // ── AWS Bedrock Compatibility ──────────────────────────────────────
1419
+
1420
/**
 * Bedrock-compatible text invocation served by a local model.
 *
 * Runs `generate()` on `request.inputText` and wraps the reply in a
 * Bedrock-style response. When the local path throws and a fallback
 * provider is configured, the request is forwarded to that provider.
 *
 * @param modelId Local model to run.
 * @param request Bedrock-style request (`inputText`, `textGenerationConfig`).
 * @returns A single-result response with estimated token counts
 *          (~4 characters per token).
 * @throws The local error when no fallback provider is configured.
 */
async bedrockInvoke(modelId: string, request: BedrockInvokeRequest): Promise<BedrockInvokeResponse> {
  try {
    const generationOptions = {
      temperature: request.textGenerationConfig?.temperature,
      maxTokens: request.textGenerationConfig?.maxTokenCount,
      topP: request.textGenerationConfig?.topP,
    };
    const outputText = await this.generate(modelId, request.inputText, generationOptions);

    // Rough token estimates: one token per four characters.
    const charsPerToken = 4;
    const inputTokenEstimate = Math.ceil(request.inputText.length / charsPerToken);
    const outputTokenEstimate = Math.ceil(outputText.length / charsPerToken);

    const invokeResponse = {
      results: [{ outputText, tokenCount: outputTokenEstimate }],
      input_text_token_count: inputTokenEstimate,
    };
    return invokeResponse;
  } catch (error: any) {
    // Local inference failed: hand over to the configured cloud provider.
    switch (this.fallbackConfig?.provider) {
      case 'bedrock':
        return this.fallbackToBedrockCloud(modelId, request);
      case 'openai':
        return this.fallbackToOpenAICloud(modelId, request);
      default:
        throw error;
    }
  }
}
1451
+
1452
+ // ── Fallback: OpenAI Cloud ────────────────────────────────────────
1453
+
1454
/**
 * Forwards an OpenAI-style chat request to the OpenAI API.
 *
 * The target model is `fallbackConfig.model` when set, otherwise the
 * OpenAI model mapped from `modelId`.
 *
 * @param modelId Local model id that the request was originally meant for.
 * @param request OpenAI-style chat completion request.
 * @returns The response body returned by OpenAI, unmodified.
 * @throws Error when no fallback is configured, or the HTTP error after a
 *         `fallback_error` event has been emitted.
 */
private async fallbackToOpenAI(modelId: string, request: OpenAIChatCompletionRequest): Promise<OpenAIChatCompletionResponse> {
  if (!this.fallbackConfig) {
    throw new Error('OpenAI fallback not configured');
  }

  const mappedModel = this.mapModelToOpenAI(modelId);
  const payload = {
    model: this.fallbackConfig.model || mappedModel,
    messages: request.messages,
    temperature: request.temperature,
    max_tokens: request.max_tokens,
    top_p: request.top_p,
    frequency_penalty: request.frequency_penalty,
    presence_penalty: request.presence_penalty,
    stop: request.stop,
  };

  try {
    const response = await axios.post('https://api.openai.com/v1/chat/completions', payload, {
      headers: {
        Authorization: `Bearer ${this.fallbackConfig.apiKey}`,
        'Content-Type': 'application/json',
      },
    });

    // Report the model actually sent to OpenAI; `fallbackConfig.model` alone
    // is undefined whenever the mapped default was used.
    this.emitEvent('fallback_success', { provider: 'openai', originalModel: modelId, mappedModel: payload.model });
    return response.data;
  } catch (error: any) {
    this.emitProgress('error', 0, `OpenAI fallback failed: ${error.message}`);
    this.emitEvent('fallback_error', { provider: 'openai', error: error.message });
    throw error;
  }
}
1487
+
1488
/**
 * Serves an OpenAI-style chat request through the Bedrock fallback.
 *
 * Simplified conversion: only the content of the LAST message is sent to
 * Bedrock; earlier messages are not forwarded.
 *
 * @param modelId Local model id that the request was originally meant for.
 * @param request OpenAI-style chat completion request.
 * @returns An OpenAI-style response built from the first Bedrock result.
 * @throws Error when no fallback is configured, the request has no
 *         messages, Bedrock returns no results, or the invocation fails.
 *         Failures after the configuration check emit `fallback_error`.
 */
private async fallbackToBedrock(modelId: string, request: OpenAIChatCompletionRequest): Promise<OpenAIChatCompletionResponse> {
  if (!this.fallbackConfig) {
    throw new Error('Bedrock fallback not configured');
  }

  try {
    // Convert OpenAI format to Bedrock's expected format (simplified)
    const lastMessage = request.messages[request.messages.length - 1];
    if (!lastMessage) {
      throw new Error('Bedrock fallback requires at least one message');
    }
    const inputText = lastMessage.content;

    const bedrockResponse = await this.invokeBedrockCloud(inputText, {
      temperature: request.temperature,
      maxTokenCount: request.max_tokens,
      topP: request.top_p,
    });

    const firstResult = bedrockResponse.results?.[0];
    if (!firstResult) {
      throw new Error('Bedrock response contained no results');
    }

    // Convert Bedrock response back to OpenAI format.
    // Prompt size is estimated at ~4 characters per token; the completion
    // count comes from Bedrock, with the same estimate as a fallback.
    const promptTokens = Math.ceil(inputText.length / 4);
    const completionTokens = firstResult.tokenCount ?? Math.ceil((firstResult.outputText?.length ?? 0) / 4);

    this.emitEvent('fallback_success', { provider: 'bedrock', originalModel: modelId, mappedModel: this.fallbackConfig.model });

    return {
      id: `chat-${Date.now()}-${Math.random().toString(36).slice(2, 11)}`,
      object: 'chat.completion',
      created: Math.floor(Date.now() / 1000),
      model: modelId,
      choices: [
        {
          index: 0,
          message: {
            role: 'assistant',
            content: firstResult.outputText,
          },
          finish_reason: 'stop',
        },
      ],
      usage: {
        prompt_tokens: promptTokens,
        completion_tokens: completionTokens,
        total_tokens: promptTokens + completionTokens,
      },
    };
  } catch (error: any) {
    // Same failure reporting as the other fallback paths.
    this.emitProgress('error', 0, `Bedrock fallback failed: ${error.message}`);
    this.emitEvent('fallback_error', { provider: 'bedrock', error: error.message });
    throw error;
  }
}
1531
+
1532
/**
 * Serves a Bedrock-style request through the OpenAI chat completions API.
 *
 * `request.inputText` is sent as a single user message. The target model is
 * `fallbackConfig.model` when set, otherwise the OpenAI model mapped from
 * `modelId`.
 *
 * @param modelId Local model id that the request was originally meant for.
 * @param request Bedrock-style request (`inputText`, `textGenerationConfig`).
 * @returns A Bedrock-style response built from the first OpenAI choice.
 * @throws Error when no fallback is configured, or the request/response
 *         error after a `fallback_error` event has been emitted.
 */
private async fallbackToOpenAICloud(modelId: string, request: BedrockInvokeRequest): Promise<BedrockInvokeResponse> {
  if (!this.fallbackConfig) {
    throw new Error('OpenAI fallback not configured');
  }

  const mappedModel = this.mapModelToOpenAI(modelId);
  const payload = {
    model: this.fallbackConfig.model || mappedModel,
    messages: [{ role: 'user', content: request.inputText }],
    temperature: request.textGenerationConfig?.temperature,
    max_tokens: request.textGenerationConfig?.maxTokenCount,
    top_p: request.textGenerationConfig?.topP,
  };

  try {
    const response = await axios.post('https://api.openai.com/v1/chat/completions', payload, {
      headers: {
        Authorization: `Bearer ${this.fallbackConfig.apiKey}`,
        'Content-Type': 'application/json',
      },
    });

    const outputText = response.data.choices[0].message.content;
    // Input size is estimated at ~4 characters per token. The output count
    // comes from OpenAI's usage block, with the same estimate as a fallback
    // if that block is missing.
    const inputTokens = Math.ceil(request.inputText.length / 4);
    const outputTokens = response.data.usage?.completion_tokens
      ?? Math.ceil((typeof outputText === 'string' ? outputText.length : 0) / 4);

    // Report the model actually sent to OpenAI; `fallbackConfig.model` alone
    // is undefined whenever the mapped default was used.
    this.emitEvent('fallback_success', { provider: 'openai', originalModel: modelId, mappedModel: payload.model });

    return {
      results: [
        {
          outputText,
          tokenCount: outputTokens,
        },
      ],
      input_text_token_count: inputTokens,
    };
  } catch (error: any) {
    this.emitProgress('error', 0, `OpenAI fallback failed: ${error.message}`);
    this.emitEvent('fallback_error', { provider: 'openai', error: error.message });
    throw error;
  }
}
1575
+
1576
/**
 * Serves a Bedrock-style request through the Bedrock cloud fallback.
 *
 * @param modelId Local model id the request was meant for (not used here).
 * @param request Bedrock-style request forwarded to `invokeBedrockCloud`.
 * @returns The response produced by `invokeBedrockCloud`.
 * @throws Error when no fallback is configured, or the invocation error
 *         after error progress and a `fallback_error` event were emitted.
 */
private async fallbackToBedrockCloud(modelId: string, request: BedrockInvokeRequest): Promise<BedrockInvokeResponse> {
  const configured = Boolean(this.fallbackConfig);
  if (!configured) {
    throw new Error('Bedrock fallback not configured');
  }

  const { inputText, textGenerationConfig } = request;
  try {
    const cloudResponse = await this.invokeBedrockCloud(inputText, textGenerationConfig);
    return cloudResponse;
  } catch (error: any) {
    // Surface the failure to listeners, then let the caller handle it.
    this.emitProgress('error', 0, `Bedrock fallback failed: ${error.message}`);
    this.emitEvent('fallback_error', { provider: 'bedrock', error: error.message });
    throw error;
  }
}
1589
+
1590
/**
 * Sends a text-generation request to the Bedrock runtime `invoke` endpoint.
 *
 * Region defaults to `us-east-1` and the model to
 * `anthropic.claude-3-sonnet-20240229-v1:0` when not set in `fallbackConfig`.
 *
 * NOTE(review): the request is authenticated with a bearer token built from
 * `fallbackConfig.apiKey`, and the body always uses the
 * `inputText`/`textGenerationConfig` shape — confirm both are accepted by
 * the target model and account.
 *
 * @param inputText Prompt text.
 * @param config    Optional generation settings. `maxTokenCount` defaults to
 *                  256, `temperature` to 0.7 and `topP` to 0.9.
 * @returns The response body returned by the endpoint, unmodified.
 * @throws Error when no fallback is configured, or an Error prefixed with
 *         "Bedrock invocation failed:" when the request fails.
 */
private async invokeBedrockCloud(inputText: string, config?: BedrockTextGenerationConfig): Promise<BedrockInvokeResponse> {
  if (!this.fallbackConfig) {
    throw new Error('Bedrock fallback not configured');
  }

  const region = this.fallbackConfig.region || 'us-east-1';
  const model = this.fallbackConfig.model || 'anthropic.claude-3-sonnet-20240229-v1:0';

  // Bedrock endpoint format: https://bedrock-runtime.{region}.amazonaws.com/model/{modelId}/invoke
  // The model id is a path segment and may contain ':' or '/' (e.g. ARNs),
  // so it is percent-encoded.
  const endpoint = `https://bedrock-runtime.${region}.amazonaws.com/model/${encodeURIComponent(model)}/invoke`;

  const payload = {
    inputText,
    textGenerationConfig: {
      maxTokenCount: config?.maxTokenCount || 256,
      // `??` rather than `||`: an explicit temperature of 0 must be
      // forwarded instead of being replaced by the default.
      temperature: config?.temperature ?? 0.7,
      topP: config?.topP || 0.9,
      topK: config?.topK,
      stopSequences: config?.stopSequences,
    },
  };

  try {
    const response = await axios.post(endpoint, payload, {
      headers: {
        Authorization: `Bearer ${this.fallbackConfig.apiKey}`,
        'Content-Type': 'application/json',
        'X-Amz-Target': 'AmazonBedrockRuntime.InvokeModel',
      },
    });

    this.emitEvent('fallback_success', { provider: 'bedrock', model });
    return response.data;
  } catch (error: any) {
    throw new Error(`Bedrock invocation failed: ${error.message}`);
  }
}
1627
+
1628
/**
 * Maps a SlyOS model id to the OpenAI model used as its cloud stand-in.
 *
 * @param slyModelId Local model id.
 * @returns The mapped OpenAI model name, or `gpt-4o-mini` for any id that
 *          has no explicit mapping.
 */
private mapModelToOpenAI(slyModelId: string): string {
  const modelMapping: Record<string, string> = {
    'quantum-1.7b': 'gpt-4o-mini',
    'quantum-3b': 'gpt-4o',
    'quantum-code-3b': 'gpt-4o',
    'quantum-8b': 'gpt-4-turbo',
  };
  // Own-property check: ids such as "constructor" or "toString" would
  // otherwise resolve to inherited Object.prototype members, not a string.
  return Object.prototype.hasOwnProperty.call(modelMapping, slyModelId)
    ? modelMapping[slyModelId]
    : 'gpt-4o-mini';
}
1637
+
1638
+ // ═══════════════════════════════════════════════════════════
1639
+ // RAG — Retrieval Augmented Generation
1640
+ // ═══════════════════════════════════════════════════════════
1641
+
1642
// Local embedding model handle; starts unset (null).
// NOTE(review): presumably populated lazily by the RAG code — confirm.
private localEmbeddingModel: any = null;
// Offline indexes, keyed by string.
private offlineIndexes = new Map<string, OfflineIndex>();
1644
+
1645
/**
 * Compute dynamic RAG parameters based on device profile and model.
 *
 * The context window is taken from the cache entry of `modelId` when that
 * model is already loaded; otherwise from the most recently loaded model
 * (`this.modelContextWindow`), and finally defaults to 2048. Memory and CPU
 * default to 4096 MB / 4 cores when the device has not been profiled.
 *
 * Device tiers: `high` needs >= 8192 MB and >= 8 cores, `mid` needs
 * >= 4096 MB and >= 4 cores, everything else is `low`.
 *
 * @param modelId Model the RAG query will run on.
 * @returns Context/generation budgets, chunk size and top-K for retrieval,
 *          plus the context window and device tier they were derived from.
 */
private computeRAGConfig(modelId: string): {
  maxContextChars: number;
  maxGenTokens: number;
  chunkSize: number;
  topK: number;
  contextWindow: number;
  deviceTier: 'low' | 'mid' | 'high';
} {
  // Prefer the window recorded for this specific model: with several models
  // loaded, `this.modelContextWindow` only reflects the last one loaded.
  const contextWindow = this.models.get(modelId)?.contextWindow || this.modelContextWindow || 2048;
  const memoryMB = this.deviceProfile?.memoryMB || 4096;
  const cpuCores = this.deviceProfile?.cpuCores || 4;

  // Determine device tier
  let deviceTier: 'low' | 'mid' | 'high' = 'low';
  if (memoryMB >= 8192 && cpuCores >= 8) deviceTier = 'high';
  else if (memoryMB >= 4096 && cpuCores >= 4) deviceTier = 'mid';

  // Context chars: scale with context window AND device capability
  let maxContextChars: number;
  if (contextWindow <= 2048) {
    maxContextChars = deviceTier === 'high' ? 600 : deviceTier === 'mid' ? 400 : 300;
  } else if (contextWindow <= 4096) {
    maxContextChars = deviceTier === 'high' ? 1500 : deviceTier === 'mid' ? 1000 : 600;
  } else {
    maxContextChars = deviceTier === 'high' ? 3000 : deviceTier === 'mid' ? 2000 : 1000;
  }

  // Gen tokens: scale with device tier
  let maxGenTokens: number;
  if (contextWindow <= 2048) {
    maxGenTokens = deviceTier === 'high' ? 200 : deviceTier === 'mid' ? 150 : 100;
  } else {
    maxGenTokens = deviceTier === 'high' ? 400 : deviceTier === 'mid' ? 300 : 150;
  }

  // Chunk size: larger chunks for bigger context windows
  const chunkSize = contextWindow <= 2048 ? 256 : contextWindow <= 4096 ? 512 : 1024;

  // TopK: more chunks for powerful devices
  const topK = deviceTier === 'high' ? 5 : deviceTier === 'mid' ? 3 : 1;

  return { maxContextChars, maxGenTokens, chunkSize, topK, contextWindow, deviceTier };
}
1692
+
1693
/**
 * Tier 2: Cloud-indexed RAG with local inference.
 * Retrieves relevant chunks from server, generates response locally.
 *
 * @param options - Query text, knowledge base id, model id and optional
 *   overrides (`topK`, `maxTokens`, `temperature`, `onToken`). Supplying
 *   `onToken` switches generation to the streaming path.
 * @returns The generated answer together with the retrieved chunks, the
 *   context that was placed in the prompt, timing figures and the effective
 *   RAG configuration.
 * @throws Error prefixed with "RAG query failed: " when the SDK is not
 *   authenticated or when retrieval/generation fails. An `error` event with
 *   stage `rag_query` is emitted before the error is rethrown.
 */
async ragQuery(options: RAGOptions): Promise<RAGResponse> {
  const startTime = Date.now();

  try {
    if (!this.token) throw new Error('Not authenticated. Call init() first.');

    const ragConfig = this.computeRAGConfig(options.modelId);
    const topK = options.topK || ragConfig.topK;
    const maxTokens = options.maxTokens || ragConfig.maxGenTokens;

    // Step 1: Retrieve relevant chunks from backend
    const retrievalStart = Date.now();
    const searchResponse = await axios.post(
      `${this.apiUrl}/api/rag/knowledge-bases/${options.knowledgeBaseId}/query`,
      {
        query: options.query,
        top_k: topK,
        model_id: options.modelId
      },
      { headers: { Authorization: `Bearer ${this.token}` } }
    );
    const retrievalMs = Date.now() - retrievalStart;

    const { retrieved_chunks, context: rawContext } = searchResponse.data;

    // Step 2: Build context with dynamic limits
    const contextBuildStart = Date.now();
    // A missing/non-string context must not end up in the prompt as the
    // literal text "undefined" or "null"; fall back to an empty context.
    let context: string = typeof rawContext === 'string' ? rawContext : '';
    if (context.length > ragConfig.maxContextChars) {
      context = context.substring(0, ragConfig.maxContextChars);
    }

    // Build messages array for proper chat template handling.
    // This uses the model's built-in chat template (e.g. <|im_start|> for SmolLM/Qwen)
    // which produces dramatically better results than raw text prompts.
    const messages: Array<{role: string; content: string}> = [
      { role: 'system', content: `Answer questions using only the following context. Be concise.\n\n${context}` },
      { role: 'user', content: options.query },
    ];
    const contextBuildMs = Date.now() - contextBuildStart;

    // Step 3: Generate response — stream if callback provided
    const genStart = Date.now();
    let response: string;
    let firstTokenMs = 0;

    if (options.onToken) {
      const streamResult = await this.generateStream(options.modelId, messages, {
        temperature: options.temperature,
        maxTokens,
        onToken: options.onToken,
      });
      response = streamResult.text;
      firstTokenMs = streamResult.firstTokenMs;
    } else {
      response = await this.generate(options.modelId, messages, {
        temperature: options.temperature,
        maxTokens,
      });
      firstTokenMs = Date.now() - genStart; // approximate: non-streaming has no per-token timing
    }
    const generationMs = Date.now() - genStart;
    const totalMs = Date.now() - startTime;

    // Whitespace-separated word count is used as a token estimate. Trim first:
    // ''.split(/\s+/) yields [''] (length 1), which would report one token
    // for an empty response and over-count padded responses.
    const trimmedResponse = response.trim();
    const tokensGenerated = trimmedResponse.length > 0 ? trimmedResponse.split(/\s+/).length : 0;

    return {
      query: options.query,
      retrievedChunks: (retrieved_chunks || []).map((c: any) => ({
        id: c.id,
        documentId: c.document_id,
        documentName: c.document_name,
        content: c.content,
        similarityScore: c.similarity_score,
        metadata: c.metadata
      })),
      generatedResponse: response,
      context,
      latencyMs: totalMs,
      tierUsed: 2,
      timing: {
        retrievalMs,
        contextBuildMs,
        firstTokenMs,
        generationMs,
        totalMs,
        tokensGenerated,
        tokensPerSecond: generationMs > 0 ? tokensGenerated / (generationMs / 1000) : 0,
      },
      config: {
        maxContextChars: ragConfig.maxContextChars,
        maxGenTokens: ragConfig.maxGenTokens,
        chunkSize: ragConfig.chunkSize,
        topK,
        contextWindowUsed: ragConfig.contextWindow,
        deviceTier: ragConfig.deviceTier,
      },
    };
  } catch (error: unknown) {
    // Narrow before reading `.message`; non-Error throwables are stringified.
    const message = error instanceof Error ? error.message : String(error);
    this.emitEvent('error', { stage: 'rag_query', error: message });
    throw new Error(`RAG query failed: ${message}`);
  }
}
1796
+
1797
/**
 * Tier 1: Fully local RAG.
 * Documents are chunked/embedded on-device, retrieval and generation all local.
 * This method itself issues no backend requests; the first call may trigger
 * loading of the embedding model (reported through progress events).
 *
 * Only the single best-scoring chunk is placed in the prompt; the remaining
 * top-K chunks are returned in `retrievedChunks` for inspection.
 *
 * @param options - Standard RAG options plus the `documents` to search.
 *   `temperature` defaults to 0.6 when omitted (an explicit 0 is honoured).
 * @returns The generated answer with scored chunks, prompt context, timing
 *   figures and the effective RAG configuration (`tierUsed` is 1).
 * @throws Error prefixed with "Local RAG failed: " when the documents yield
 *   no searchable chunks or when embedding/generation fails. An `error`
 *   event with stage `rag_local` is emitted first.
 */
async ragQueryLocal(options: RAGOptions & { documents: Array<{ content: string; name?: string }> }): Promise<RAGResponse> {
  const startTime = Date.now();

  try {
    const ragConfig = this.computeRAGConfig(options.modelId);
    const topK = options.topK || ragConfig.topK;
    const maxTokens = options.maxTokens || ragConfig.maxGenTokens;
    // `??` instead of `||`: temperature 0 is a valid request and must not be
    // silently replaced by the 0.6 default.
    const temperature = options.temperature ?? 0.6;

    // Step 1: Load embedding model if needed
    if (!this.localEmbeddingModel) {
      await this.loadEmbeddingModel();
    }

    // Step 2: Chunk and embed documents (dynamic chunk size, 25% overlap)
    const retrievalStart = Date.now();
    const allChunks: Array<{ content: string; documentName: string; embedding: number[] }> = [];
    for (const doc of options.documents ?? []) {
      const chunks = this.chunkTextLocal(doc.content, ragConfig.chunkSize, Math.floor(ragConfig.chunkSize / 4));
      for (const chunk of chunks) {
        const embedding = await this.embedTextLocal(chunk);
        allChunks.push({ content: chunk, documentName: doc.name || 'Document', embedding });
      }
    }

    // Step 3: Embed query and rank chunks by cosine similarity
    const queryEmbedding = await this.embedTextLocal(options.query);
    const scored = allChunks
      .map(c => ({ ...c, similarityScore: this.cosineSimilarity(queryEmbedding, c.embedding) }))
      .sort((a, b) => b.similarityScore - a.similarityScore)
      .slice(0, topK);
    const retrievalMs = Date.now() - retrievalStart;

    // Step 4: Build context
    const contextBuildStart = Date.now();
    const bestChunk = scored[0];
    if (!bestChunk) {
      // Previously this surfaced as an opaque "Cannot read properties of
      // undefined" TypeError when no chunk was available.
      throw new Error('No searchable chunks were produced from the provided documents.');
    }
    // Sanitise the chunk for the prompt: replace everything outside printable
    // ASCII (newlines kept), collapse whitespace runs, drop tags, URLs and
    // bracket characters.
    let context = bestChunk.content
      .replace(/[^\x20-\x7E\n]/g, ' ')
      .replace(/\s{2,}/g, ' ')
      .replace(/<[^>]+>/g, ' ')
      .replace(/https?:\/\/\S+/g, '')
      .replace(/[{}()\[\]]/g, '')
      .trim();
    if (context.length > ragConfig.maxContextChars) context = context.substring(0, ragConfig.maxContextChars);
    const messages: Array<{role: string; content: string}> = [
      { role: 'system', content: `Answer questions using only the following context. Be concise.\n\n${context}` },
      { role: 'user', content: options.query },
    ];
    const contextBuildMs = Date.now() - contextBuildStart;

    // Step 5: Generate — stream if callback provided
    const genStart = Date.now();
    let response: string;
    let firstTokenMs = 0;

    if (options.onToken) {
      const streamResult = await this.generateStream(options.modelId, messages, {
        temperature,
        maxTokens,
        onToken: options.onToken,
      });
      response = streamResult.text;
      firstTokenMs = streamResult.firstTokenMs;
    } else {
      response = await this.generate(options.modelId, messages, {
        temperature,
        maxTokens,
      });
      firstTokenMs = Date.now() - genStart; // approximate: no per-token timing here
    }
    const generationMs = Date.now() - genStart;
    const totalMs = Date.now() - startTime;

    // Word count as token estimate; trim first because ''.split(/\s+/)
    // returns [''] and would count an empty response as one token.
    const trimmedResponse = response.trim();
    const tokensGenerated = trimmedResponse.length > 0 ? trimmedResponse.split(/\s+/).length : 0;

    return {
      query: options.query,
      retrievedChunks: scored.map((c, i) => ({
        id: `local-${i}`,
        documentId: 'local',
        documentName: c.documentName,
        content: c.content,
        similarityScore: c.similarityScore,
        metadata: {}
      })),
      generatedResponse: response,
      context,
      latencyMs: totalMs,
      tierUsed: 1,
      timing: {
        retrievalMs,
        contextBuildMs,
        firstTokenMs,
        generationMs,
        totalMs,
        tokensGenerated,
        tokensPerSecond: generationMs > 0 ? tokensGenerated / (generationMs / 1000) : 0,
      },
      config: {
        maxContextChars: ragConfig.maxContextChars,
        maxGenTokens: ragConfig.maxGenTokens,
        chunkSize: ragConfig.chunkSize,
        topK,
        contextWindowUsed: ragConfig.contextWindow,
        deviceTier: ragConfig.deviceTier,
      },
    };
  } catch (error: unknown) {
    // Narrow before reading `.message`; non-Error throwables are stringified.
    const message = error instanceof Error ? error.message : String(error);
    this.emitEvent('error', { stage: 'rag_local', error: message });
    throw new Error(`Local RAG failed: ${message}`);
  }
}
1910
+
1911
/**
 * Tier 3: Offline RAG using a synced knowledge base.
 * First call syncKnowledgeBase(), then use this for offline queries.
 *
 * The query is embedded on-device and compared against the chunk embeddings
 * stored in the synced index. Only the single best-scoring chunk is placed
 * in the prompt; the remaining top-K chunks are returned in
 * `retrievedChunks`.
 *
 * @param options - Standard RAG options. `temperature` defaults to 0.6 when
 *   omitted (an explicit 0 is honoured).
 * @returns The generated answer with scored chunks, prompt context, timing
 *   figures and the effective RAG configuration (`tierUsed` is 3).
 * @throws Error when the knowledge base has not been synced or its index has
 *   expired (thrown as-is, no event emitted), and Error prefixed with
 *   "Offline RAG failed: " for failures during retrieval/generation (an
 *   `error` event with stage `rag_offline` is emitted first).
 */
async ragQueryOffline(options: RAGOptions): Promise<RAGResponse> {
  const startTime = Date.now();

  const index = this.offlineIndexes.get(options.knowledgeBaseId);
  if (!index) throw new Error(`KB "${options.knowledgeBaseId}" not synced.`);
  if (new Date(index.metadata.expires_at) < new Date()) throw new Error('Offline index expired.');

  try {
    const ragConfig = this.computeRAGConfig(options.modelId);
    const topK = options.topK || ragConfig.topK;
    const maxTokens = options.maxTokens || ragConfig.maxGenTokens;
    // `??` instead of `||`: temperature 0 is a valid request and must not be
    // silently replaced by the 0.6 default.
    const temperature = options.temperature ?? 0.6;

    // Load embedding model
    if (!this.localEmbeddingModel) await this.loadEmbeddingModel();

    // Search offline index (chunks without an embedding are ignored)
    const retrievalStart = Date.now();
    const queryEmbedding = await this.embedTextLocal(options.query);
    const scored = index.chunks
      .filter(c => c.embedding && c.embedding.length > 0)
      .map(c => ({ ...c, similarityScore: this.cosineSimilarity(queryEmbedding, c.embedding!) }))
      .sort((a, b) => b.similarityScore - a.similarityScore)
      .slice(0, topK);
    const retrievalMs = Date.now() - retrievalStart;

    // Build context
    const contextBuildStart = Date.now();
    const bestChunk = scored[0];
    if (!bestChunk) {
      // Previously this surfaced as an opaque "Cannot read properties of
      // undefined" TypeError when the index held no embedded chunks.
      throw new Error(`Offline index for KB "${options.knowledgeBaseId}" contains no chunks with embeddings.`);
    }
    // Sanitise the chunk for the prompt: replace everything outside printable
    // ASCII (newlines kept), collapse whitespace runs, drop tags, URLs and
    // bracket characters.
    let context = bestChunk.content
      .replace(/[^\x20-\x7E\n]/g, ' ')
      .replace(/\s{2,}/g, ' ')
      .replace(/<[^>]+>/g, ' ')
      .replace(/https?:\/\/\S+/g, '')
      .replace(/[{}()\[\]]/g, '')
      .trim();
    if (context.length > ragConfig.maxContextChars) context = context.substring(0, ragConfig.maxContextChars);
    const messages: Array<{role: string; content: string}> = [
      { role: 'system', content: `Answer questions using only the following context. Be concise.\n\n${context}` },
      { role: 'user', content: options.query },
    ];
    const contextBuildMs = Date.now() - contextBuildStart;

    // Generate — stream if callback provided
    const genStart = Date.now();
    let response: string;
    let firstTokenMs = 0;

    if (options.onToken) {
      const streamResult = await this.generateStream(options.modelId, messages, {
        temperature,
        maxTokens,
        onToken: options.onToken,
      });
      response = streamResult.text;
      firstTokenMs = streamResult.firstTokenMs;
    } else {
      response = await this.generate(options.modelId, messages, {
        temperature,
        maxTokens,
      });
      firstTokenMs = Date.now() - genStart; // approximate: no per-token timing here
    }
    const generationMs = Date.now() - genStart;
    const totalMs = Date.now() - startTime;

    // Word count as token estimate; trim first because ''.split(/\s+/)
    // returns [''] and would count an empty response as one token.
    const trimmedResponse = response.trim();
    const tokensGenerated = trimmedResponse.length > 0 ? trimmedResponse.split(/\s+/).length : 0;

    return {
      query: options.query,
      retrievedChunks: scored.map(c => ({
        id: c.id,
        documentId: c.document_id,
        documentName: c.document_name,
        content: c.content,
        similarityScore: c.similarityScore,
        metadata: c.metadata
      })),
      generatedResponse: response,
      context,
      latencyMs: totalMs,
      tierUsed: 3,
      timing: {
        retrievalMs,
        contextBuildMs,
        firstTokenMs,
        generationMs,
        totalMs,
        tokensGenerated,
        tokensPerSecond: generationMs > 0 ? tokensGenerated / (generationMs / 1000) : 0,
      },
      config: {
        maxContextChars: ragConfig.maxContextChars,
        maxGenTokens: ragConfig.maxGenTokens,
        chunkSize: ragConfig.chunkSize,
        topK,
        contextWindowUsed: ragConfig.contextWindow,
        deviceTier: ragConfig.deviceTier,
      },
    };
  } catch (error: unknown) {
    // Narrow before reading `.message`; non-Error throwables are stringified.
    const message = error instanceof Error ? error.message : String(error);
    this.emitEvent('error', { stage: 'rag_offline', error: message });
    throw new Error(`Offline RAG failed: ${message}`);
  }
}
2016
+
2017
/**
 * Sync a knowledge base for offline use (Tier 3).
 * Downloads chunks + embeddings from server and stores the package in
 * `this.offlineIndexes` under the knowledge base id.
 *
 * @param knowledgeBaseId - Id of the knowledge base to sync.
 * @param deviceId - Optional device id sent to the server; falls back to
 *   `this.deviceId`, then to the literal `'sdk-device'`.
 * @returns Chunk count, package size in MB and expiry timestamp as reported
 *   by the server.
 * @throws Error prefixed with "Sync failed: " when not authenticated, when
 *   the request fails, or when the server reply carries no usable sync
 *   package.
 */
async syncKnowledgeBase(knowledgeBaseId: string, deviceId?: string): Promise<{ chunkCount: number; sizeMb: number; expiresAt: string }> {
  try {
    if (!this.token) throw new Error('Not authenticated. Call init() first.');

    const response = await axios.post(
      `${this.apiUrl}/api/rag/knowledge-bases/${knowledgeBaseId}/sync`,
      { device_id: deviceId || this.deviceId || 'sdk-device' },
      { headers: { Authorization: `Bearer ${this.token}` } }
    );

    const { sync_package, chunk_count, package_size_mb, expires_at } = response.data ?? {};

    // Offline queries read `chunks` and `metadata.expires_at` from the stored
    // package. Reject a malformed reply here instead of storing it and
    // crashing later with an unrelated TypeError.
    const packageIsUsable =
      sync_package != null &&
      Array.isArray(sync_package.chunks) &&
      sync_package.metadata != null &&
      typeof sync_package.metadata === 'object';
    if (!packageIsUsable) {
      throw new Error('Server returned an invalid sync package.');
    }

    this.offlineIndexes.set(knowledgeBaseId, sync_package);

    return {
      chunkCount: chunk_count,
      sizeMb: package_size_mb,
      expiresAt: expires_at
    };
  } catch (error: unknown) {
    // Narrow before reading `.message`; non-Error throwables are stringified.
    const message = error instanceof Error ? error.message : String(error);
    throw new Error(`Sync failed: ${message}`);
  }
}
2043
+
2044
+ // --- RAG Helper Methods ---
2045
+
2046
/**
 * Creates the `feature-extraction` pipeline for `Xenova/all-MiniLM-L6-v2`
 * and stores it on `this.localEmbeddingModel`.
 *
 * Progress is reported through `emitProgress`: `downloading` before the
 * load, `ready` on success, `error` on failure. Load errors are rethrown
 * unchanged after being reported.
 */
private async loadEmbeddingModel(): Promise<void> {
  this.emitProgress('downloading', 0, 'Loading embedding model (all-MiniLM-L6-v2)...');

  try {
    // Resolved lazily via dynamic import at the time the model is needed.
    const { pipeline: createPipeline } = await import('@huggingface/transformers');
    const extractor = await createPipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2');
    this.localEmbeddingModel = extractor;
    this.emitProgress('ready', 100, 'Embedding model loaded');
  } catch (error: any) {
    this.emitProgress('error', 0, `Embedding model failed: ${error.message}`);
    throw error;
  }
}
2057
+
2058
/**
 * Embeds `text` with the loaded embedding pipeline, requesting mean pooling
 * and normalisation, and returns the vector as a plain number array.
 *
 * @param text - Text to embed.
 * @returns The embedding as a flat `number[]`.
 * @throws Error when the embedding model has not been loaded, or when the
 *   pipeline output has none of the recognised shapes.
 */
private async embedTextLocal(text: string): Promise<number[]> {
  if (!this.localEmbeddingModel) {
    throw new Error('Embedding model not loaded');
  }

  const output = await this.localEmbeddingModel(text, { pooling: 'mean', normalize: true });

  // The output shape differs between transformers versions (v2 vs v3), so
  // probe the known shapes in order: `.data`, then `.tolist()`, then a
  // plain (possibly nested) array.
  if (output.data) {
    return Array.from(output.data);
  } else if (output.tolist) {
    return output.tolist().flat();
  } else if (Array.isArray(output)) {
    return output.flat();
  }

  throw new Error('Unexpected embedding output format');
}
2067
+
2068
/**
 * Cosine similarity of two vectors.
 *
 * When the vectors differ in length only their common prefix is compared.
 * Reading past the end of the shorter vector would inject `undefined` into
 * the arithmetic and yield NaN, which breaks the comparator of any sort
 * that ranks by this score.
 *
 * @param a - First vector.
 * @param b - Second vector.
 * @returns A value in [-1, 1] (up to floating-point rounding); 0 when
 *   either compared vector has zero magnitude, when there is nothing to
 *   compare, or when the result would not be a finite number.
 */
private cosineSimilarity(a: number[], b: number[]): number {
  const length = Math.min(a.length, b.length);
  let dot = 0, normA = 0, normB = 0;
  for (let i = 0; i < length; i++) {
    dot += a[i] * b[i];
    normA += a[i] * a[i];
    normB += b[i] * b[i];
  }
  const denom = Math.sqrt(normA) * Math.sqrt(normB);
  if (denom === 0) return 0;
  const score = dot / denom;
  // Guards against NaN/Infinity coming from non-finite vector components.
  return Number.isFinite(score) ? score : 0;
}
2078
+
2079
/**
 * Splits `text` into overlapping chunks of roughly `chunkSize` characters.
 *
 * A window that does not reach the end of the text is cut just after the
 * last '.' or newline found at or before the window end, provided that
 * break lies in the second half of the window. Chunks are trimmed and
 * dropped when 20 characters or shorter.
 *
 * @param text - Text to split; empty input yields an empty array.
 * @param chunkSize - Target window size in characters (must be >= 1).
 * @param overlap - Characters shared between consecutive windows. Negative
 *   or NaN values are treated as 0; values >= `chunkSize` are replaced by
 *   25% of `chunkSize`.
 * @returns The chunks in document order.
 * @throws RangeError when `chunkSize` is not a finite number >= 1 (the
 *   window could never advance).
 */
private chunkTextLocal(text: string, chunkSize: number = 512, overlap: number = 128): string[] {
  if (!text || text.length === 0) return [];
  if (!Number.isFinite(chunkSize) || chunkSize < 1) {
    throw new RangeError(`chunkSize must be a finite number >= 1, got ${chunkSize}`);
  }
  if (!(overlap >= 0)) overlap = 0; // negative or NaN
  if (overlap >= chunkSize) overlap = Math.floor(chunkSize * 0.25);

  const chunks: string[] = [];
  let start = 0;
  while (start < text.length) {
    let end = start + chunkSize;
    if (end < text.length) {
      // Prefer ending on a sentence/line boundary, but only when that keeps
      // the chunk longer than half the target size.
      const bp = Math.max(text.lastIndexOf('.', end), text.lastIndexOf('\n', end));
      if (bp > start + chunkSize / 2) end = bp + 1;
    }
    const chunk = text.slice(start, end).trim();
    if (chunk.length > 20) chunks.push(chunk);

    // This window already reached the end of the text. Stepping back by
    // `overlap` would only emit a chunk that is a pure suffix of this one.
    if (end >= text.length) break;

    // Always move forward. With an overlap larger than the chunk actually
    // taken, `end - overlap` can fall at or before `start` (even below 0,
    // where slice() counts from the end of the string) and loop forever.
    const next = end - overlap;
    start = next > start ? next : end;
  }
  return chunks;
}
2097
+
2098
+ // ── Static OpenAI Compatible Factory ────────────────────────────────
2099
+
2100
/**
 * Builds an OpenAI-style client backed by a new SlyOS instance.
 *
 * The fallback provider defaults to `'openai'` when the supplied fallback
 * config does not name one.
 *
 * @param config - API key, optional API URL and optional fallback settings.
 * @returns An object exposing `chat.completions.create(request)`, which
 *   splits `model` off the request and forwards the rest to
 *   `chatCompletion` on the underlying instance.
 */
static openaiCompatible(config: { apiKey: string; apiUrl?: string; fallback?: FallbackConfig }): OpenAICompatibleClient {
  const { apiKey, apiUrl, fallback } = config;
  const fallbackConfig = { ...fallback, provider: fallback?.provider || 'openai' } as FallbackConfig;
  const sdk = new SlyOS({ apiKey, apiUrl, fallback: fallbackConfig });

  const create = async (
    request: OpenAIChatCompletionRequest & { model: string }
  ): Promise<OpenAIChatCompletionResponse> => {
    const { model, ...chatRequest } = request;
    return sdk.chatCompletion(model, chatRequest);
  };

  return { chat: { completions: { create } } };
}
2118
+ }
2119
+
2120
+ export default SlyOS;
2121
+ export type {
2122
+ SlyOSConfig,
2123
+ SlyOSConfigWithFallback,
2124
+ GenerateOptions,
2125
+ TranscribeOptions,
2126
+ DeviceProfile,
2127
+ ProgressEvent,
2128
+ SlyEvent,
2129
+ QuantizationLevel,
2130
+ ModelCategory,
2131
+ OpenAIMessage,
2132
+ OpenAIChatCompletionRequest,
2133
+ OpenAIChatCompletionResponse,
2134
+ OpenAIChoice,
2135
+ OpenAIUsage,
2136
+ BedrockTextGenerationConfig,
2137
+ BedrockInvokeRequest,
2138
+ BedrockInvokeResponse,
2139
+ BedrockResult,
2140
+ FallbackConfig,
2141
+ FallbackProvider,
2142
+ OpenAICompatibleClient,
2143
+ RAGOptions,
2144
+ RAGChunk,
2145
+ RAGResponse,
2146
+ OfflineIndex,
2147
+ };