@emilshirokikh/slyos-sdk 1.3.2 → 1.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/index.ts CHANGED
@@ -151,6 +151,57 @@ interface OpenAICompatibleClient {
151
151
  };
152
152
  }
153
153
 
154
+ // ─── RAG Types ──────────────────────────────────────────────────
155
+
156
/** Parameters accepted by the RAG query methods. */
interface RAGOptions {
  /** Identifier of the knowledge base to query. */
  knowledgeBaseId: string;
  /** The user's question / search text. */
  query: string;
  /** Number of chunks to retrieve; the query methods use 5 when this is omitted. */
  topK?: number;
  /** Identifier of the model used to generate the answer. */
  modelId: string;
  /** Passed through to generation unchanged. */
  temperature?: number;
  /** Passed through to generation unchanged. */
  maxTokens?: number;
}
164
+
165
/** One retrieved passage, as returned to callers in `RAGResponse.retrievedChunks`. */
interface RAGChunk {
  id: string;
  /** Identifier of the document the passage came from. */
  documentId: string;
  /** Display name of the document the passage came from. */
  documentName: string;
  /** Text of the passage. */
  content: string;
  /** Relevance of the passage to the query; higher means more similar. */
  similarityScore: number;
  metadata?: Record<string, any>;
}
173
+
174
/** Result of a RAG query. */
interface RAGResponse {
  /** The query that was asked. */
  query: string;
  /** Chunks selected by the retrieval step, best match first. */
  retrievedChunks: RAGChunk[];
  /** Text produced by the generation step. */
  generatedResponse: string;
  /** Context text assembled from the retrieved chunks. */
  context: string;
  /** Wall-clock duration of the whole query in milliseconds. */
  latencyMs: number;
  /** Which tier served the query: 1 = ragQueryLocal, 2 = ragQuery, 3 = ragQueryOffline. */
  tierUsed: 1 | 2 | 3;
}
182
+
183
/** A knowledge base package downloaded for offline querying. */
interface OfflineIndex {
  /** Descriptive information about the synced knowledge base. */
  metadata: {
    kb_id: string;
    kb_name: string;
    chunk_size: number;
    embedding_dim: number;
    total_chunks: number;
    synced_at: string;
    /** Date string; offline queries are refused once this moment has passed. */
    expires_at: string;
    sync_token: string;
  };
  /** The chunks of the knowledge base. */
  chunks: Array<{
    id: string;
    document_id: string;
    document_name: string;
    content: string;
    chunk_index: number;
    /** Embedding vector; chunks with a null or empty embedding are skipped by offline search. */
    embedding: number[] | null;
    metadata: Record<string, any>;
  }>;
}
204
+
154
205
  // ─── Model Registry ─────────────────────────────────────────────────
155
206
 
156
207
  const modelMap: Record<string, ModelInfo> = {
@@ -231,6 +282,29 @@ function selectQuantization(memoryMB: number, modelId: string): QuantizationLeve
231
282
  return 'q4'; // fallback
232
283
  }
233
284
 
285
+ // ─── Context Window Detection ──────────────────────────────────────
286
+
287
/**
 * Look up a model's context window from its Hugging Face `config.json`.
 *
 * Fetches `https://huggingface.co/<hfModelId>/raw/main/config.json` and returns
 * the first usable value among `max_position_embeddings`, `n_positions`,
 * `max_seq_len` and `model_max_length`.
 *
 * @param hfModelId Hugging Face repository id, e.g. `org/model`.
 * @returns The detected context window, or 2048 when the config cannot be
 *          fetched or contains no usable value. This function never rejects.
 */
async function detectContextWindowFromHF(hfModelId: string): Promise<number> {
  const fallbackWindow = 2048;
  const candidateKeys = ['max_position_embeddings', 'n_positions', 'max_seq_len', 'model_max_length'];

  try {
    // Encode each path segment separately so the "/" between owner and repo survives.
    const repoPath = hfModelId.split('/').map(encodeURIComponent).join('/');
    const configUrl = `https://huggingface.co/${repoPath}/raw/main/config.json`;
    const response = await axios.get(configUrl, { timeout: 5000 });

    // The body may arrive as text; parse it so the field lookup below works either way.
    let config: unknown = response.data;
    if (typeof config === 'string') {
      config = JSON.parse(config);
    }
    if (typeof config !== 'object' || config === null) {
      return fallbackWindow;
    }

    const fields = config as Record<string, unknown>;
    for (const key of candidateKeys) {
      const value = fields[key];
      // Only a finite positive number is a usable window; anything else
      // (strings, null, 0, NaN) falls through to the next candidate.
      if (typeof value === 'number' && Number.isFinite(value) && value >= 1) {
        return Math.floor(value);
      }
    }
    return fallbackWindow;
  } catch {
    // Network failure, timeout or malformed JSON: use the default.
    return fallbackWindow;
  }
}
307
+
234
308
  // ─── Device Profiling ───────────────────────────────────────────────
235
309
 
236
310
  async function profileDevice(): Promise<DeviceProfile> {
@@ -307,6 +381,7 @@ class SlyOS {
307
381
  private onProgress: ProgressCallback | null;
308
382
  private onEvent: EventCallback | null;
309
383
  private fallbackConfig: FallbackConfig | null;
384
+ private modelContextWindow: number = 0;
310
385
 
311
386
  constructor(config: SlyOSConfigWithFallback) {
312
387
  this.apiKey = config.apiKey;
@@ -345,6 +420,10 @@ class SlyOS {
345
420
  return this.deviceProfile;
346
421
  }
347
422
 
423
/**
 * Context window recorded by the most recent successful `loadModel()` call.
 *
 * @returns The stored context window, or 0 when no model has been loaded yet.
 */
getModelContextWindow(): number {
  return this.modelContextWindow;
}
426
+
348
427
  // ── Smart Model Recommendation ──────────────────────────────────
349
428
 
350
429
  recommendModel(category: ModelCategory = 'llm'): { modelId: string; quant: QuantizationLevel; contextWindow: number; reason: string } | null {
@@ -453,6 +532,41 @@ class SlyOS {
453
532
  );
454
533
  }
455
534
 
535
/**
 * Search the Hugging Face Hub for ONNX models matching a free-text query.
 * Results are requested sorted by download count, highest first.
 *
 * @param query   Free-text search string.
 * @param options Optional `limit` (positive number, default 20) and a `task`
 *                tag used as an additional filter.
 * @returns One summary object per model returned by the Hub.
 * @throws Error `Model search failed: …` when the request fails; an `error`
 *         event with stage `model_search` is emitted before throwing.
 */
async searchModels(query: string, options?: { limit?: number; task?: string }): Promise<Array<{
  id: string;
  name: string;
  downloads: number;
  likes: number;
  task: string;
  size_category: string;
}>> {
  try {
    // Fall back to 20 for a missing, non-numeric, fractional-below-one or negative limit.
    const requestedLimit = options?.limit;
    const limit =
      typeof requestedLimit === 'number' && Number.isFinite(requestedLimit) && requestedLimit >= 1
        ? Math.floor(requestedLimit)
        : 20;

    const params = new URLSearchParams({
      search: query,
      sort: 'downloads',
      direction: '-1',
      limit: String(limit),
    });
    // The Hub takes one `filter` parameter per tag; ONNX is always required.
    params.append('filter', 'onnx');
    if (options?.task) {
      params.append('filter', options.task);
    }
    const url = `https://huggingface.co/api/models?${params.toString()}`;

    const response = await axios.get(url, { timeout: 10000 });
    const models: any[] = Array.isArray(response.data) ? response.data : [];

    return models
      // Skip malformed entries instead of crashing on `id.split`.
      .filter((model: any) => model && typeof model.id === 'string')
      .map((model: any) => ({
        id: model.id,
        name: model.id.split('/')[1] || model.id,
        downloads: model.downloads || 0,
        likes: model.likes || 0,
        // The Hub reports the task as `pipeline_tag`; `task` is kept as a fallback.
        task: model.pipeline_tag || model.task || 'unknown',
        size_category: model.size_category || 'unknown',
      }));
  } catch (error: unknown) {
    const message = error instanceof Error ? error.message : String(error);
    this.emitEvent('error', { stage: 'model_search', error: message });
    throw new Error(`Model search failed: ${message}`);
  }
}
569
+
456
570
  canRunModel(modelId: string, quant?: QuantizationLevel): { canRun: boolean; reason: string; recommendedQuant: QuantizationLevel } {
457
571
  const info = modelMap[modelId];
458
572
  if (!info) return { canRun: false, reason: `Unknown model "${modelId}"`, recommendedQuant: 'q4' };
@@ -482,29 +596,42 @@ class SlyOS {
482
596
 
483
597
  async loadModel(modelId: string, options?: { quant?: QuantizationLevel }): Promise<void> {
484
598
  const info = modelMap[modelId];
485
- if (!info) {
486
- const available = Object.keys(modelMap).join(', ');
487
- throw new Error(`Unknown model "${modelId}". Available: ${available}`);
488
- }
599
+ let hfModelId: string;
600
+ let task: string;
601
+ let estimatedSize: number;
602
+
603
+ // Handle curated models
604
+ if (info) {
605
+ hfModelId = info.hfModel;
606
+ task = info.task;
607
+
608
+ // Determine quantization
609
+ let quant: QuantizationLevel = options?.quant || 'fp32';
610
+ if (!options?.quant && this.deviceProfile) {
611
+ quant = selectQuantization(this.deviceProfile.memoryMB, modelId);
612
+ this.emitProgress('downloading', 0, `Auto-selected ${quant.toUpperCase()} quantization for your device`);
613
+ }
489
614
 
490
- // Determine quantization
491
- let quant: QuantizationLevel = options?.quant || 'fp32';
492
- if (!options?.quant && this.deviceProfile) {
493
- quant = selectQuantization(this.deviceProfile.memoryMB, modelId);
494
- this.emitProgress('downloading', 0, `Auto-selected ${quant.toUpperCase()} quantization for your device`);
495
- }
615
+ // Check feasibility
616
+ const check = this.canRunModel(modelId, quant);
617
+ if (!check.canRun) {
618
+ this.emitProgress('error', 0, check.reason);
619
+ throw new Error(check.reason);
620
+ }
496
621
 
497
- // Check feasibility
498
- const check = this.canRunModel(modelId, quant);
499
- if (!check.canRun) {
500
- this.emitProgress('error', 0, check.reason);
501
- throw new Error(check.reason);
622
+ estimatedSize = info.sizesMB[quant];
623
+ this.emitProgress('downloading', 0, `Downloading ${modelId} (${quant.toUpperCase()}, ~${estimatedSize}MB)...`);
624
+ this.emitEvent('model_download_start', { modelId, quant, estimatedSizeMB: estimatedSize });
625
+ } else {
626
+ // Handle custom HuggingFace models
627
+ hfModelId = modelId;
628
+ task = 'text-generation'; // Default task
629
+ estimatedSize = 2048; // Default estimate
630
+
631
+ this.emitProgress('downloading', 0, `Loading custom HuggingFace model: ${modelId}...`);
632
+ this.emitEvent('model_download_start', { modelId, custom: true, estimatedSizeMB: estimatedSize });
502
633
  }
503
634
 
504
- const estimatedSize = info.sizesMB[quant];
505
- this.emitProgress('downloading', 0, `Downloading ${modelId} (${quant.toUpperCase()}, ~${estimatedSize}MB)...`);
506
- this.emitEvent('model_download_start', { modelId, quant, estimatedSizeMB: estimatedSize });
507
-
508
635
  // Map quant to dtype for HuggingFace
509
636
  const dtypeMap: Record<QuantizationLevel, string> = {
510
637
  q4: 'q4',
@@ -517,9 +644,15 @@ class SlyOS {
517
644
  const startTime = Date.now();
518
645
 
519
646
  try {
520
- const pipe = await pipeline(info.task as any, info.hfModel, {
647
+ // For custom HF models, detect context window
648
+ let detectedContextWindow = 2048;
649
+ if (!info) {
650
+ detectedContextWindow = await detectContextWindowFromHF(hfModelId);
651
+ }
652
+
653
+ const pipe = await pipeline(task as any, hfModelId, {
521
654
  device: 'cpu',
522
- dtype: dtypeMap[quant] as any,
655
+ dtype: 'q4' as any, // Default to q4 for stability
523
656
  progress_callback: (progressData: any) => {
524
657
  // HuggingFace transformers sends progress events during download
525
658
  if (progressData && typeof progressData === 'object') {
@@ -549,14 +682,24 @@ class SlyOS {
549
682
  });
550
683
 
551
684
  const loadTime = Date.now() - startTime;
552
- const contextWindow = this.deviceProfile
553
- ? recommendContextWindow(this.deviceProfile.memoryMB, quant)
554
- : 2048;
685
+ let contextWindow: number;
686
+
687
+ if (info) {
688
+ // For curated models, use recommendContextWindow
689
+ const quant = options?.quant || (this.deviceProfile ? selectQuantization(this.deviceProfile.memoryMB, modelId) : 'q4');
690
+ contextWindow = this.deviceProfile
691
+ ? recommendContextWindow(this.deviceProfile.memoryMB, quant)
692
+ : 2048;
693
+ } else {
694
+ // For custom HF models, use detected context window
695
+ contextWindow = detectedContextWindow;
696
+ }
555
697
 
556
- this.models.set(modelId, { pipe, info, quant, contextWindow });
698
+ this.modelContextWindow = contextWindow;
699
+ this.models.set(modelId, { pipe, info, quant: 'q4', contextWindow });
557
700
 
558
- this.emitProgress('ready', 100, `${modelId} loaded (${quant.toUpperCase()}, ${(loadTime / 1000).toFixed(1)}s, ctx: ${contextWindow})`);
559
- this.emitEvent('model_loaded', { modelId, quant, loadTimeMs: loadTime, contextWindow });
701
+ this.emitProgress('ready', 100, `${modelId} loaded (q4, ${(loadTime / 1000).toFixed(1)}s, ctx: ${contextWindow})`);
702
+ this.emitEvent('model_loaded', { modelId, quant: 'q4', loadTimeMs: loadTime, contextWindow });
560
703
 
561
704
  // Telemetry
562
705
  if (this.token) {
@@ -565,7 +708,7 @@ class SlyOS {
565
708
  event_type: 'model_load',
566
709
  model_id: modelId,
567
710
  success: true,
568
- metadata: { quant, loadTimeMs: loadTime, contextWindow },
711
+ metadata: { quant: 'q4', loadTimeMs: loadTime, contextWindow, custom: !info },
569
712
  }, {
570
713
  headers: { Authorization: `Bearer ${this.token}` },
571
714
  }).catch(() => {});
@@ -1000,6 +1143,320 @@ class SlyOS {
1000
1143
  return modelMapping[slyModelId] || 'gpt-4o-mini';
1001
1144
  }
1002
1145
 
1146
+ // ═══════════════════════════════════════════════════════════
1147
+ // RAG — Retrieval Augmented Generation
1148
+ // ═══════════════════════════════════════════════════════════
1149
+
1150
/** Feature-extraction pipeline used for on-device embeddings; null until loadEmbeddingModel() has run. */
private localEmbeddingModel: any = null;
/** Knowledge bases synced for offline use, keyed by knowledge base id. */
private offlineIndexes: Map<string, OfflineIndex> = new Map();
1152
+
1153
/**
 * Tier 2: Cloud-indexed RAG with local inference.
 * Retrieves relevant chunks from the server, then generates the response locally.
 *
 * @param options Knowledge base, query and generation settings.
 * @returns The retrieved chunks, the generated answer, the (possibly truncated)
 *          context and the total latency; `tierUsed` is 2.
 * @throws Error `RAG query failed: …` when not authenticated, when the request
 *         fails or when generation fails; an `error` event with stage
 *         `rag_query` is emitted before throwing.
 */
async ragQuery(options: RAGOptions): Promise<RAGResponse> {
  const startTime = Date.now();

  try {
    if (!this.token) throw new Error('Not authenticated. Call init() first.');

    // Step 1: Retrieve relevant chunks from the backend.
    const searchResponse = await axios.post(
      `${this.apiUrl}/api/rag/knowledge-bases/${encodeURIComponent(options.knowledgeBaseId)}/query`,
      {
        query: options.query,
        top_k: options.topK || 5,
        model_id: options.modelId
      },
      { headers: { Authorization: `Bearer ${this.token}` } }
    );

    const payload = searchResponse.data ?? {};
    const retrievedChunks: any[] = Array.isArray(payload.retrieved_chunks) ? payload.retrieved_chunks : [];
    const fullContext: string = typeof payload.context === 'string' ? payload.context : '';
    let prompt = payload.prompt_template;

    // Apply context window limits.
    const contextWindow = this.modelContextWindow || 2048;
    // Rough token-to-char ratio, reserving 200 tokens; never negative.
    const maxContextChars = Math.max(0, (contextWindow - 200) * 3);

    let context = fullContext;
    if (context.length > maxContextChars) {
      context = context.substring(0, maxContextChars) + '...';
      // Apply the same truncation to the prompt; otherwise the limit would only
      // affect the returned `context` while the model still receives the
      // oversized text. Only possible when the template embeds the context verbatim.
      if (typeof prompt === 'string' && prompt.includes(fullContext)) {
        prompt = prompt.split(fullContext).join(context);
      }
    }

    // Step 2: Generate the response locally using the augmented prompt.
    const response = await this.generate(options.modelId, prompt, {
      temperature: options.temperature,
      maxTokens: options.maxTokens,
    });

    return {
      query: options.query,
      retrievedChunks: retrievedChunks.map((c: any) => ({
        id: c.id,
        documentId: c.document_id,
        documentName: c.document_name,
        content: c.content,
        similarityScore: c.similarity_score,
        metadata: c.metadata
      })),
      generatedResponse: response,
      context,
      latencyMs: Date.now() - startTime,
      tierUsed: 2,
    };
  } catch (error: unknown) {
    const message = error instanceof Error ? error.message : String(error);
    this.emitEvent('error', { stage: 'rag_query', error: message });
    throw new Error(`RAG query failed: ${message}`);
  }
}
1210
+
1211
/**
 * Tier 1: Fully local RAG.
 * The supplied documents are chunked and embedded on-device, ranked against
 * the query by cosine similarity, and the best chunks are used as context for
 * local generation.
 *
 * This method issues no HTTP request itself; `loadEmbeddingModel()` and
 * `generate()` are called and may perform their own I/O.
 *
 * @param options RAG settings plus the `documents` to search. Documents
 *                without a `name` are labelled `Document`.
 * @returns The top chunks, the generated answer, the assembled context and the
 *          total latency; `tierUsed` is 1.
 * @throws Error `Local RAG failed: …` on any failure; an `error` event with
 *         stage `rag_local` is emitted before throwing.
 */
async ragQueryLocal(options: RAGOptions & { documents: Array<{ content: string; name?: string }> }): Promise<RAGResponse> {
  const startTime = Date.now();

  try {
    // Step 1: Load the embedding model if needed.
    if (!this.localEmbeddingModel) {
      await this.loadEmbeddingModel();
    }

    // Adapt the chunk size to the context window: smaller windows get smaller chunks.
    const contextWindow = this.modelContextWindow || 2048;
    const chunkSize = contextWindow <= 1024 ? 256 : contextWindow <= 2048 ? 512 : 1024;
    const overlap = Math.floor(chunkSize / 4);

    // Step 2: Chunk every document and embed each chunk.
    const allChunks: Array<{ content: string; documentName: string; embedding: number[] }> = [];
    for (const doc of options.documents) {
      const documentName = doc.name || 'Document';
      for (const chunk of this.chunkTextLocal(doc.content, chunkSize, overlap)) {
        const embedding = await this.embedTextLocal(chunk);
        allChunks.push({ content: chunk, documentName, embedding });
      }
    }

    // Step 3: Embed the query.
    const queryEmbedding = await this.embedTextLocal(options.query);

    // Step 4: Rank chunks by cosine similarity and keep the best ones.
    const scored = allChunks
      .map(c => ({
        ...c,
        similarityScore: this.cosineSimilarity(queryEmbedding, c.embedding)
      }))
      .sort((a, b) => b.similarityScore - a.similarityScore)
      .slice(0, options.topK || 5);

    // Step 5: Build the context within the size budget.
    // Rough token-to-char ratio, reserving 200 tokens; never negative.
    const maxContextChars = Math.max(0, (contextWindow - 200) * 3);
    let contextLength = 0;
    const contextParts: string[] = [];

    for (const c of scored) {
      const part = `[Source: ${c.documentName}]\n${c.content}`;
      // Stop at the first chunk that does not fit, so the context keeps ranking order.
      if (contextLength + part.length > maxContextChars) break;
      contextParts.push(part);
      contextLength += part.length + 10; // Account for separator
    }

    const context = contextParts.join('\n\n---\n\n');
    const prompt = `You are a helpful assistant. Answer based ONLY on the following context:\n\n${context}\n\nQuestion: ${options.query}\n\nAnswer:`;

    // Step 6: Generate locally.
    const response = await this.generate(options.modelId, prompt, {
      temperature: options.temperature,
      maxTokens: options.maxTokens,
    });

    return {
      query: options.query,
      retrievedChunks: scored.map((c, i) => ({
        id: `local-${i}`,
        documentId: 'local',
        documentName: c.documentName,
        content: c.content,
        similarityScore: c.similarityScore,
        metadata: {}
      })),
      generatedResponse: response,
      context,
      latencyMs: Date.now() - startTime,
      tierUsed: 1,
    };
  } catch (error: unknown) {
    const message = error instanceof Error ? error.message : String(error);
    this.emitEvent('error', { stage: 'rag_local', error: message });
    throw new Error(`Local RAG failed: ${message}`);
  }
}
1296
+
1297
/**
 * Tier 3: Offline RAG using a synced knowledge base.
 * Call `syncKnowledgeBase()` first, then use this method for offline queries.
 *
 * @param options Knowledge base, query and generation settings.
 * @returns The top chunks, the generated answer, the assembled context and the
 *          total latency; `tierUsed` is 3.
 * @throws Error when the knowledge base has not been synced or its index has
 *         expired (thrown as-is, without an `error` event).
 * @throws Error `Offline RAG failed: …` on any later failure; an `error` event
 *         with stage `rag_offline` is emitted before throwing.
 */
async ragQueryOffline(options: RAGOptions): Promise<RAGResponse> {
  const startTime = Date.now();

  const index = this.offlineIndexes.get(options.knowledgeBaseId);
  if (!index) {
    throw new Error(`Knowledge base "${options.knowledgeBaseId}" not synced. Call syncKnowledgeBase() first.`);
  }

  // Check expiry.
  // NOTE(review): an unparseable `expires_at` compares as "not expired" here — confirm that is intended.
  if (new Date(index.metadata.expires_at) < new Date()) {
    throw new Error('Offline index has expired. Please re-sync.');
  }

  try {
    // Load the embedding model if needed.
    if (!this.localEmbeddingModel) {
      await this.loadEmbeddingModel();
    }

    // Embed the query.
    const queryEmbedding = await this.embedTextLocal(options.query);

    // Search the offline index; chunks without an embedding cannot be ranked.
    const scored = index.chunks
      .filter((c): c is typeof c & { embedding: number[] } => Array.isArray(c.embedding) && c.embedding.length > 0)
      .map(c => ({
        ...c,
        similarityScore: this.cosineSimilarity(queryEmbedding, c.embedding)
      }))
      .sort((a, b) => b.similarityScore - a.similarityScore)
      .slice(0, options.topK || 5);

    // Build the context within the size budget.
    const contextWindow = this.modelContextWindow || 2048;
    // Rough token-to-char ratio, reserving 200 tokens; never negative.
    const maxContextChars = Math.max(0, (contextWindow - 200) * 3);
    let contextLength = 0;
    const contextParts: string[] = [];

    for (const c of scored) {
      const part = `[Source: ${c.document_name}]\n${c.content}`;
      // Stop at the first chunk that does not fit, so the context keeps ranking order.
      if (contextLength + part.length > maxContextChars) break;
      contextParts.push(part);
      contextLength += part.length + 10; // Account for separator
    }

    const context = contextParts.join('\n\n---\n\n');
    const prompt = `You are a helpful assistant. Answer based ONLY on the following context:\n\n${context}\n\nQuestion: ${options.query}\n\nAnswer:`;

    // Generate locally.
    const response = await this.generate(options.modelId, prompt, {
      temperature: options.temperature,
      maxTokens: options.maxTokens,
    });

    return {
      query: options.query,
      retrievedChunks: scored.map(c => ({
        id: c.id,
        documentId: c.document_id,
        documentName: c.document_name,
        content: c.content,
        similarityScore: c.similarityScore,
        metadata: c.metadata
      })),
      generatedResponse: response,
      context,
      latencyMs: Date.now() - startTime,
      tierUsed: 3,
    };
  } catch (error: unknown) {
    const message = error instanceof Error ? error.message : String(error);
    this.emitEvent('error', { stage: 'rag_offline', error: message });
    throw new Error(`Offline RAG failed: ${message}`);
  }
}
1378
+
1379
/**
 * Sync a knowledge base for offline use (Tier 3).
 * Downloads the sync package from the server and keeps it in memory, keyed by
 * knowledge base id, so `ragQueryOffline()` can search it.
 *
 * @param knowledgeBaseId Knowledge base to download.
 * @param deviceId Device identifier reported to the server; falls back to this
 *                 instance's device id, then to `sdk-device`.
 * @returns Chunk count, package size in MB and expiry as reported by the server.
 * @throws Error `Sync failed: …` when not authenticated, when the request
 *         fails or when the response carries no usable package; an `error`
 *         event with stage `rag_sync` is emitted before throwing.
 */
async syncKnowledgeBase(knowledgeBaseId: string, deviceId?: string): Promise<{ chunkCount: number; sizeMb: number; expiresAt: string }> {
  try {
    if (!this.token) throw new Error('Not authenticated. Call init() first.');

    const response = await axios.post(
      `${this.apiUrl}/api/rag/knowledge-bases/${encodeURIComponent(knowledgeBaseId)}/sync`,
      { device_id: deviceId || this.deviceId || 'sdk-device' },
      { headers: { Authorization: `Bearer ${this.token}` } }
    );

    const { sync_package, chunk_count, package_size_mb, expires_at } = response.data ?? {};

    // Reject a malformed package now; storing it would only surface later as
    // an opaque TypeError inside ragQueryOffline().
    const looksValid =
      typeof sync_package === 'object' &&
      sync_package !== null &&
      typeof sync_package.metadata === 'object' &&
      sync_package.metadata !== null &&
      Array.isArray(sync_package.chunks);
    if (!looksValid) {
      throw new Error('Server returned an invalid sync package');
    }

    this.offlineIndexes.set(knowledgeBaseId, sync_package);

    return {
      chunkCount: chunk_count,
      sizeMb: package_size_mb,
      expiresAt: expires_at
    };
  } catch (error: unknown) {
    const message = error instanceof Error ? error.message : String(error);
    this.emitEvent('error', { stage: 'rag_sync', error: message });
    throw new Error(`Sync failed: ${message}`);
  }
}
1405
+
1406
+ // --- RAG Helper Methods ---
1407
+
1408
/**
 * Create the `Xenova/all-MiniLM-L6-v2` feature-extraction pipeline and store
 * it for later embedding calls. Progress is reported through `emitProgress`.
 *
 * @throws Whatever the import or pipeline creation throws, after an `error`
 *         progress update has been emitted.
 */
private async loadEmbeddingModel(): Promise<void> {
  this.emitProgress('downloading', 0, 'Loading embedding model (all-MiniLM-L6-v2)...');
  try {
    // Imported on demand so the library is only loaded when embeddings are needed.
    const { pipeline } = await import('@huggingface/transformers');
    this.localEmbeddingModel = await pipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2');
    this.emitProgress('ready', 100, 'Embedding model loaded');
  } catch (error: unknown) {
    const message = error instanceof Error ? error.message : String(error);
    this.emitProgress('error', 0, `Embedding model failed: ${message}`);
    throw error;
  }
}
1419
+
1420
/**
 * Embed a piece of text with the local embedding model, using mean pooling
 * and normalization.
 *
 * @param text Text to embed.
 * @returns The embedding as a plain number array.
 * @throws Error when the embedding model is not loaded or the pipeline
 *         returns a value in none of the recognised shapes.
 */
private async embedTextLocal(text: string): Promise<number[]> {
  if (!this.localEmbeddingModel) throw new Error('Embedding model not loaded');
  const result = await this.localEmbeddingModel(text, { pooling: 'mean', normalize: true });

  // Handle different tensor output formats (v2 vs v3 of transformers).
  // A null/undefined result falls through to the explicit format error below.
  if (result) {
    if (result.data) return Array.from(result.data);
    if (result.tolist) return result.tolist().flat();
    if (Array.isArray(result)) return result.flat();
  }
  throw new Error('Unexpected embedding output format');
}
1429
+
1430
/**
 * Cosine similarity of two vectors.
 *
 * @param a First vector.
 * @param b Second vector.
 * @returns The cosine similarity, or 0 when the vectors differ in length, when
 *          either has zero magnitude, or when the result is not a finite
 *          number. Always returning a real number keeps callers' ranking
 *          comparators well-defined.
 */
private cosineSimilarity(a: number[], b: number[]): number {
  // Vectors of different dimensionality are not comparable; reading past the
  // shorter one would yield NaN and poison any sort that uses the score.
  if (a.length !== b.length) return 0;

  let dot = 0;
  let sumSqA = 0;
  let sumSqB = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    sumSqA += a[i] * a[i];
    sumSqB += b[i] * b[i];
  }

  const denom = Math.sqrt(sumSqA) * Math.sqrt(sumSqB);
  if (denom === 0) return 0;

  const score = dot / denom;
  return Number.isFinite(score) ? score : 0;
}
1440
+
1441
/**
 * Split text into overlapping chunks, preferring to end a chunk just after a
 * period or newline when one lies in the second half of the chunk.
 *
 * @param text      Text to split; empty input yields no chunks.
 * @param chunkSize Maximum chunk length in characters (default 512). Values
 *                  below 1 or non-finite values fall back to 512.
 * @param overlap   Characters shared between consecutive chunks (default 128).
 *                  Negative values are treated as 0; values of `chunkSize` or
 *                  more are replaced by a quarter of `chunkSize`.
 * @returns Trimmed chunks longer than 20 characters, in document order.
 */
private chunkTextLocal(text: string, chunkSize: number = 512, overlap: number = 128): string[] {
  if (!text || text.length === 0) return [];

  // Sanitize sizes so the loop below always moves forward.
  if (!Number.isFinite(chunkSize) || chunkSize < 1) chunkSize = 512;
  chunkSize = Math.floor(chunkSize);
  if (!Number.isFinite(overlap) || overlap < 0) overlap = 0;
  if (overlap >= chunkSize) overlap = Math.floor(chunkSize * 0.25);

  const chunks: string[] = [];
  let start = 0;
  while (start < text.length) {
    let end = Math.min(start + chunkSize, text.length);
    if (end < text.length) {
      // Prefer a sentence or line boundary, but only if it keeps the chunk
      // longer than half the target size.
      const bp = Math.max(text.lastIndexOf('.', end), text.lastIndexOf('\n', end));
      if (bp > start + chunkSize / 2) end = bp + 1;
    }

    const chunk = text.slice(start, end).trim();
    if (chunk.length > 20) chunks.push(chunk);

    // The chunk just taken reached the end of the text. Stepping back by
    // `overlap` would only emit a redundant suffix of it.
    if (end >= text.length) break;

    // Step back by `overlap`, unless that would not advance (possible when a
    // boundary shortened the chunk below the overlap); then continue without overlap.
    const next = end - overlap;
    start = next > start ? next : end;
  }
  return chunks;
}
1459
+
1003
1460
  // ── Static OpenAI Compatible Factory ────────────────────────────────
1004
1461
 
1005
1462
  static openaiCompatible(config: { apiKey: string; apiUrl?: string; fallback?: FallbackConfig }): OpenAICompatibleClient {
@@ -1045,4 +1502,8 @@ export type {
1045
1502
  FallbackConfig,
1046
1503
  FallbackProvider,
1047
1504
  OpenAICompatibleClient,
1505
+ RAGOptions,
1506
+ RAGChunk,
1507
+ RAGResponse,
1508
+ OfflineIndex,
1048
1509
  };