@emilshirokikh/slyos-sdk 1.3.2 → 1.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +16 -16
- package/create-chatbot.sh +145 -20
- package/dist/index.d.ts +94 -1
- package/dist/index.js +386 -28
- package/package.json +1 -1
- package/src/index.ts +489 -28
package/src/index.ts
CHANGED
|
@@ -151,6 +151,57 @@ interface OpenAICompatibleClient {
|
|
|
151
151
|
};
|
|
152
152
|
}
|
|
153
153
|
|
|
154
|
+
// ─── RAG Types ──────────────────────────────────────────────────
|
|
155
|
+
|
|
156
|
+
/**
 * Input accepted by the RAG query methods (`ragQuery`, `ragQueryLocal`,
 * `ragQueryOffline`).
 */
interface RAGOptions {
  /** Knowledge base to query: part of the request path in `ragQuery`, the offline-index key in `ragQueryOffline`. */
  knowledgeBaseId: string;
  /** Model used to generate the answer; forwarded to `generate`. */
  modelId: string;
  /** The user's question. */
  query: string;
  /** How many chunks to retrieve; the query methods use 5 when this is omitted (or 0). */
  topK?: number;
  /** Forwarded unchanged to `generate` as its `temperature` option. */
  temperature?: number;
  /** Forwarded unchanged to `generate` as its `maxTokens` option. */
  maxTokens?: number;
}
|
|
164
|
+
|
|
165
|
+
/**
 * One retrieved passage as reported back to the caller in
 * `RAGResponse.retrievedChunks`.
 */
interface RAGChunk {
  /** Chunk identifier; `ragQueryLocal` synthesises `local-<rank>` ids. */
  id: string;
  /** Identifier of the source document; `ragQueryLocal` always reports `'local'`. */
  documentId: string;
  /** Human-readable name of the source document. */
  documentName: string;
  /** The chunk text. */
  content: string;
  /**
   * Relevance of the chunk to the query: the server's `similarity_score` for
   * `ragQuery`, an on-device cosine similarity for the local/offline tiers.
   */
  similarityScore: number;
  /** Free-form metadata passed through from the index; `{}` for fully local queries. */
  metadata?: Record<string, any>;
}
|
|
173
|
+
|
|
174
|
+
/**
 * Result of a RAG query, identical in shape for all three tiers.
 */
interface RAGResponse {
  /** The query string the caller supplied, echoed back. */
  query: string;
  /** Text returned by `generate` for the augmented prompt. */
  generatedResponse: string;
  /** The chunks selected for the query, most relevant first for the local/offline tiers. */
  retrievedChunks: RAGChunk[];
  /** The context text associated with the answer (server-provided for tier 2, assembled on-device otherwise). */
  context: string;
  /** Wall-clock duration of the whole call in milliseconds. */
  latencyMs: number;
  /** Which tier produced the answer: 1 = fully local, 2 = cloud-indexed, 3 = offline synced index. */
  tierUsed: 1 | 2 | 3;
}
|
|
182
|
+
|
|
183
|
+
/**
 * A knowledge base snapshot downloaded by `syncKnowledgeBase` and queried by
 * `ragQueryOffline`. Field names are snake_case because the object is stored
 * exactly as the server returned it.
 */
interface OfflineIndex {
  /** Descriptive information about the snapshot. */
  metadata: {
    kb_id: string;
    kb_name: string;
    chunk_size: number;
    embedding_dim: number;
    total_chunks: number;
    synced_at: string;
    /** Parsed with `new Date(...)` by `ragQueryOffline`; a past date makes the index unusable until re-synced. */
    expires_at: string;
    sync_token: string;
  };
  /** The indexed passages. */
  chunks: {
    id: string;
    document_id: string;
    document_name: string;
    content: string;
    chunk_index: number;
    /** Embedding vector; chunks whose embedding is `null` or empty are ignored by `ragQueryOffline`. */
    embedding: number[] | null;
    metadata: Record<string, any>;
  }[];
}
|
|
204
|
+
|
|
154
205
|
// ─── Model Registry ─────────────────────────────────────────────────
|
|
155
206
|
|
|
156
207
|
const modelMap: Record<string, ModelInfo> = {
|
|
@@ -231,6 +282,29 @@ function selectQuantization(memoryMB: number, modelId: string): QuantizationLeve
|
|
|
231
282
|
return 'q4'; // fallback
|
|
232
283
|
}
|
|
233
284
|
|
|
285
|
+
// ─── Context Window Detection ──────────────────────────────────────
|
|
286
|
+
|
|
287
|
+
/**
 * Best-effort lookup of a model's context window from the `config.json` of a
 * HuggingFace repository.
 *
 * The first of `max_position_embeddings`, `n_positions`, `max_seq_len`,
 * `model_max_length` that holds a positive, finite number wins. Anything else
 * (missing field, non-numeric value, unreachable config, malformed JSON) yields
 * the 2048 default, so the function never rejects.
 *
 * @param hfModelId Repository id, e.g. `org/model`.
 * @returns The detected context window, or 2048 when it cannot be determined.
 */
async function detectContextWindowFromHF(hfModelId: string): Promise<number> {
  const defaultContextWindow = 2048;
  const candidateFields = ['max_position_embeddings', 'n_positions', 'max_seq_len', 'model_max_length'];

  try {
    const configUrl = `https://huggingface.co/${hfModelId}/raw/main/config.json`;
    const response = await axios.get(configUrl, { timeout: 5000 });

    // The raw endpoint may hand the body back as text; parse it ourselves in that case.
    // A parse failure lands in the catch below and produces the default.
    let config: unknown = response.data;
    if (typeof config === 'string') {
      config = JSON.parse(config);
    }
    if (typeof config !== 'object' || config === null) {
      return defaultContextWindow;
    }

    const fields = config as Record<string, unknown>;
    for (const field of candidateFields) {
      const raw = fields[field];
      // Accept numeric strings too, but never let a non-number escape a Promise<number>.
      const value = typeof raw === 'number' ? raw : typeof raw === 'string' ? Number(raw) : NaN;
      if (Number.isFinite(value) && value > 0) {
        return Math.floor(value);
      }
    }

    return defaultContextWindow;
  } catch {
    // Default if config cannot be fetched or parsed
    return defaultContextWindow;
  }
}
|
|
307
|
+
|
|
234
308
|
// ─── Device Profiling ───────────────────────────────────────────────
|
|
235
309
|
|
|
236
310
|
async function profileDevice(): Promise<DeviceProfile> {
|
|
@@ -307,6 +381,7 @@ class SlyOS {
|
|
|
307
381
|
private onProgress: ProgressCallback | null;
|
|
308
382
|
private onEvent: EventCallback | null;
|
|
309
383
|
private fallbackConfig: FallbackConfig | null;
|
|
384
|
+
// Context window recorded by loadModel() for the most recently loaded model; stays 0 until a model has been loaded.
private modelContextWindow: number = 0;
|
|
310
385
|
|
|
311
386
|
constructor(config: SlyOSConfigWithFallback) {
|
|
312
387
|
this.apiKey = config.apiKey;
|
|
@@ -345,6 +420,10 @@ class SlyOS {
|
|
|
345
420
|
return this.deviceProfile;
|
|
346
421
|
}
|
|
347
422
|
|
|
423
|
+
/**
 * Reports the context window recorded for the most recently loaded model.
 *
 * @returns The stored context window; 0 when no model has been loaded yet.
 */
getModelContextWindow(): number {
  const { modelContextWindow } = this;
  return modelContextWindow;
}
|
|
426
|
+
|
|
348
427
|
// ── Smart Model Recommendation ──────────────────────────────────
|
|
349
428
|
|
|
350
429
|
recommendModel(category: ModelCategory = 'llm'): { modelId: string; quant: QuantizationLevel; contextWindow: number; reason: string } | null {
|
|
@@ -453,6 +532,41 @@ class SlyOS {
|
|
|
453
532
|
);
|
|
454
533
|
}
|
|
455
534
|
|
|
535
|
+
/**
 * Searches the HuggingFace Hub for ONNX models matching `query`, most
 * downloaded first.
 *
 * @param query Free-text search string.
 * @param options.limit Maximum number of results; any value that is not a positive finite number falls back to 20.
 * @param options.task Optional extra tag to filter on (e.g. a pipeline task name).
 * @returns One summary per model. `task` and `size_category` are `'unknown'` when the Hub does not report them.
 * @throws Error prefixed with `Model search failed:` when the request fails; an `error` event is emitted first.
 */
async searchModels(query: string, options?: { limit?: number; task?: string }): Promise<Array<{
  id: string;
  name: string;
  downloads: number;
  likes: number;
  task: string;
  size_category: string;
}>> {
  try {
    const requestedLimit = options?.limit;
    const limit =
      typeof requestedLimit === 'number' && Number.isFinite(requestedLimit) && requestedLimit > 0
        ? Math.floor(requestedLimit)
        : 20;

    // The Hub expects one `filter=<tag>` parameter per tag, not a JSON-encoded array.
    const tags = ['onnx']; // Filter for ONNX models only
    if (options?.task) {
      tags.push(options.task);
    }
    const filterQuery = tags.map(tag => `filter=${encodeURIComponent(tag)}`).join('&');
    const url = `https://huggingface.co/api/models?search=${encodeURIComponent(query)}&${filterQuery}&sort=downloads&direction=-1&limit=${limit}`;

    const response = await axios.get(url, { timeout: 10000 });
    const models: any[] = Array.isArray(response.data) ? response.data : [];

    return models
      // Skip malformed entries instead of letting one bad record fail the whole search.
      .filter((model: any) => typeof model?.id === 'string')
      .map((model: any) => ({
        id: model.id,
        name: model.id.split('/')[1] || model.id,
        downloads: model.downloads || 0,
        likes: model.likes || 0,
        // The Hub reports the task as `pipeline_tag`; `task` is kept as a fallback.
        task: model.pipeline_tag || model.task || 'unknown',
        size_category: model.size_category || 'unknown',
      }));
  } catch (error: unknown) {
    const message = error instanceof Error ? error.message : String(error);
    this.emitEvent('error', { stage: 'model_search', error: message });
    throw new Error(`Model search failed: ${message}`);
  }
}
|
|
569
|
+
|
|
456
570
|
canRunModel(modelId: string, quant?: QuantizationLevel): { canRun: boolean; reason: string; recommendedQuant: QuantizationLevel } {
|
|
457
571
|
const info = modelMap[modelId];
|
|
458
572
|
if (!info) return { canRun: false, reason: `Unknown model "${modelId}"`, recommendedQuant: 'q4' };
|
|
@@ -482,29 +596,42 @@ class SlyOS {
|
|
|
482
596
|
|
|
483
597
|
async loadModel(modelId: string, options?: { quant?: QuantizationLevel }): Promise<void> {
|
|
484
598
|
const info = modelMap[modelId];
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
599
|
+
let hfModelId: string;
|
|
600
|
+
let task: string;
|
|
601
|
+
let estimatedSize: number;
|
|
602
|
+
|
|
603
|
+
// Handle curated models
|
|
604
|
+
if (info) {
|
|
605
|
+
hfModelId = info.hfModel;
|
|
606
|
+
task = info.task;
|
|
607
|
+
|
|
608
|
+
// Determine quantization
|
|
609
|
+
let quant: QuantizationLevel = options?.quant || 'fp32';
|
|
610
|
+
if (!options?.quant && this.deviceProfile) {
|
|
611
|
+
quant = selectQuantization(this.deviceProfile.memoryMB, modelId);
|
|
612
|
+
this.emitProgress('downloading', 0, `Auto-selected ${quant.toUpperCase()} quantization for your device`);
|
|
613
|
+
}
|
|
489
614
|
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
615
|
+
// Check feasibility
|
|
616
|
+
const check = this.canRunModel(modelId, quant);
|
|
617
|
+
if (!check.canRun) {
|
|
618
|
+
this.emitProgress('error', 0, check.reason);
|
|
619
|
+
throw new Error(check.reason);
|
|
620
|
+
}
|
|
496
621
|
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
622
|
+
estimatedSize = info.sizesMB[quant];
|
|
623
|
+
this.emitProgress('downloading', 0, `Downloading ${modelId} (${quant.toUpperCase()}, ~${estimatedSize}MB)...`);
|
|
624
|
+
this.emitEvent('model_download_start', { modelId, quant, estimatedSizeMB: estimatedSize });
|
|
625
|
+
} else {
|
|
626
|
+
// Handle custom HuggingFace models
|
|
627
|
+
hfModelId = modelId;
|
|
628
|
+
task = 'text-generation'; // Default task
|
|
629
|
+
estimatedSize = 2048; // Default estimate
|
|
630
|
+
|
|
631
|
+
this.emitProgress('downloading', 0, `Loading custom HuggingFace model: ${modelId}...`);
|
|
632
|
+
this.emitEvent('model_download_start', { modelId, custom: true, estimatedSizeMB: estimatedSize });
|
|
502
633
|
}
|
|
503
634
|
|
|
504
|
-
const estimatedSize = info.sizesMB[quant];
|
|
505
|
-
this.emitProgress('downloading', 0, `Downloading ${modelId} (${quant.toUpperCase()}, ~${estimatedSize}MB)...`);
|
|
506
|
-
this.emitEvent('model_download_start', { modelId, quant, estimatedSizeMB: estimatedSize });
|
|
507
|
-
|
|
508
635
|
// Map quant to dtype for HuggingFace
|
|
509
636
|
const dtypeMap: Record<QuantizationLevel, string> = {
|
|
510
637
|
q4: 'q4',
|
|
@@ -517,9 +644,15 @@ class SlyOS {
|
|
|
517
644
|
const startTime = Date.now();
|
|
518
645
|
|
|
519
646
|
try {
|
|
520
|
-
|
|
647
|
+
// For custom HF models, detect context window
|
|
648
|
+
let detectedContextWindow = 2048;
|
|
649
|
+
if (!info) {
|
|
650
|
+
detectedContextWindow = await detectContextWindowFromHF(hfModelId);
|
|
651
|
+
}
|
|
652
|
+
|
|
653
|
+
const pipe = await pipeline(task as any, hfModelId, {
|
|
521
654
|
device: 'cpu',
|
|
522
|
-
dtype:
|
|
655
|
+
dtype: 'q4' as any, // Default to q4 for stability
|
|
523
656
|
progress_callback: (progressData: any) => {
|
|
524
657
|
// HuggingFace transformers sends progress events during download
|
|
525
658
|
if (progressData && typeof progressData === 'object') {
|
|
@@ -549,14 +682,24 @@ class SlyOS {
|
|
|
549
682
|
});
|
|
550
683
|
|
|
551
684
|
const loadTime = Date.now() - startTime;
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
685
|
+
let contextWindow: number;
|
|
686
|
+
|
|
687
|
+
if (info) {
|
|
688
|
+
// For curated models, use recommendContextWindow
|
|
689
|
+
const quant = options?.quant || (this.deviceProfile ? selectQuantization(this.deviceProfile.memoryMB, modelId) : 'q4');
|
|
690
|
+
contextWindow = this.deviceProfile
|
|
691
|
+
? recommendContextWindow(this.deviceProfile.memoryMB, quant)
|
|
692
|
+
: 2048;
|
|
693
|
+
} else {
|
|
694
|
+
// For custom HF models, use detected context window
|
|
695
|
+
contextWindow = detectedContextWindow;
|
|
696
|
+
}
|
|
555
697
|
|
|
556
|
-
this.
|
|
698
|
+
this.modelContextWindow = contextWindow;
|
|
699
|
+
this.models.set(modelId, { pipe, info, quant: 'q4', contextWindow });
|
|
557
700
|
|
|
558
|
-
this.emitProgress('ready', 100, `${modelId} loaded (
|
|
559
|
-
this.emitEvent('model_loaded', { modelId, quant, loadTimeMs: loadTime, contextWindow });
|
|
701
|
+
this.emitProgress('ready', 100, `${modelId} loaded (q4, ${(loadTime / 1000).toFixed(1)}s, ctx: ${contextWindow})`);
|
|
702
|
+
this.emitEvent('model_loaded', { modelId, quant: 'q4', loadTimeMs: loadTime, contextWindow });
|
|
560
703
|
|
|
561
704
|
// Telemetry
|
|
562
705
|
if (this.token) {
|
|
@@ -565,7 +708,7 @@ class SlyOS {
|
|
|
565
708
|
event_type: 'model_load',
|
|
566
709
|
model_id: modelId,
|
|
567
710
|
success: true,
|
|
568
|
-
metadata: { quant, loadTimeMs: loadTime, contextWindow },
|
|
711
|
+
metadata: { quant: 'q4', loadTimeMs: loadTime, contextWindow, custom: !info },
|
|
569
712
|
}, {
|
|
570
713
|
headers: { Authorization: `Bearer ${this.token}` },
|
|
571
714
|
}).catch(() => {});
|
|
@@ -1000,6 +1143,320 @@ class SlyOS {
|
|
|
1000
1143
|
return modelMapping[slyModelId] || 'gpt-4o-mini';
|
|
1001
1144
|
}
|
|
1002
1145
|
|
|
1146
|
+
// ═══════════════════════════════════════════════════════════
|
|
1147
|
+
// RAG — Retrieval Augmented Generation
|
|
1148
|
+
// ═══════════════════════════════════════════════════════════
|
|
1149
|
+
|
|
1150
|
+
// Local feature-extraction pipeline used for on-device embeddings; null until loadEmbeddingModel() assigns it.
private localEmbeddingModel: any = null;
|
|
1151
|
+
// Synced offline packages keyed by knowledge base id; filled by syncKnowledgeBase() and read by ragQueryOffline().
private offlineIndexes: Map<string, OfflineIndex> = new Map();
|
|
1152
|
+
|
|
1153
|
+
/**
 * Tier 2: Cloud-indexed RAG with local inference.
 * Retrieves relevant chunks from the server, then generates the response locally.
 *
 * The server returns the chunks, a ready-made prompt and the context it was
 * built from. When that context exceeds the loaded model's budget it is
 * truncated, and the truncated text is also substituted into the prompt
 * (only when the prompt is a string containing the original context verbatim)
 * so the limit applies to what the model actually receives.
 *
 * @param options Query, knowledge base, model and generation settings.
 * @returns The generated answer plus the retrieved chunks (`tierUsed` is 2).
 * @throws Error prefixed with `RAG query failed:` when not authenticated or when
 *         retrieval/generation fails; an `error` event is emitted first.
 */
async ragQuery(options: RAGOptions): Promise<RAGResponse> {
  const startTime = Date.now();

  try {
    if (!this.token) throw new Error('Not authenticated. Call init() first.');

    // Step 1: Retrieve relevant chunks from backend
    const searchResponse = await axios.post(
      `${this.apiUrl}/api/rag/knowledge-bases/${encodeURIComponent(options.knowledgeBaseId)}/query`,
      {
        query: options.query,
        top_k: options.topK || 5,
        model_id: options.modelId
      },
      { headers: { Authorization: `Bearer ${this.token}` } }
    );

    const payload = searchResponse.data ?? {};
    // A response without chunks is reported as "nothing retrieved" rather than a TypeError.
    const retrievedChunks: any[] = Array.isArray(payload.retrieved_chunks) ? payload.retrieved_chunks : [];
    let prompt = payload.prompt_template;
    let context = payload.context;

    // Apply context window limits
    const contextWindow = this.modelContextWindow || 2048;
    // Rough token-to-char ratio, reserving 200 tokens; never negative for tiny windows.
    const maxContextChars = Math.max(0, (contextWindow - 200) * 3);

    if (typeof context === 'string' && context.length > maxContextChars) {
      const truncated = context.substring(0, maxContextChars) + '...';
      if (typeof prompt === 'string' && prompt.includes(context)) {
        // Function replacer so `$` sequences in the context are not treated as patterns.
        prompt = prompt.replace(context, () => truncated);
      }
      context = truncated;
    }

    // Step 2: Generate response locally using the augmented prompt
    const response = await this.generate(options.modelId, prompt, {
      temperature: options.temperature,
      maxTokens: options.maxTokens,
    });

    return {
      query: options.query,
      retrievedChunks: retrievedChunks.map((c: any) => ({
        id: c.id,
        documentId: c.document_id,
        documentName: c.document_name,
        content: c.content,
        similarityScore: c.similarity_score,
        metadata: c.metadata
      })),
      generatedResponse: response,
      context,
      latencyMs: Date.now() - startTime,
      tierUsed: 2,
    };
  } catch (error: unknown) {
    const message = error instanceof Error ? error.message : String(error);
    this.emitEvent('error', { stage: 'rag_query', error: message });
    throw new Error(`RAG query failed: ${message}`);
  }
}
|
|
1210
|
+
|
|
1211
|
+
/**
 * Tier 1: fully local RAG.
 * The supplied documents are chunked and embedded on-device, ranked against
 * the query by cosine similarity, and the best chunks are packed into a prompt
 * that is answered by the local model.
 *
 * @param options Standard RAG options plus the `documents` to search.
 * @returns The generated answer plus the top-ranked chunks (`tierUsed` is 1).
 * @throws Error prefixed with `Local RAG failed:`; an `error` event is emitted first.
 */
async ragQueryLocal(options: RAGOptions & { documents: Array<{ content: string; name?: string }> }): Promise<RAGResponse> {
  const startedAt = Date.now();

  try {
    // Make sure the embedding pipeline exists before any text is embedded.
    if (!this.localEmbeddingModel) {
      await this.loadEmbeddingModel();
    }

    // Smaller context windows get smaller chunks; overlap is a quarter of the chunk.
    const windowSize = this.modelContextWindow || 2048;
    let chunkSize: number;
    if (windowSize <= 1024) {
      chunkSize = 256;
    } else if (windowSize <= 2048) {
      chunkSize = 512;
    } else {
      chunkSize = 1024;
    }
    const overlap = Math.floor(chunkSize / 4);

    // Chunk every document and embed each piece, one at a time, in document order.
    const embedded: Array<{ content: string; documentName: string; embedding?: number[] }> = [];
    for (const entry of options.documents) {
      for (const piece of this.chunkTextLocal(entry.content, chunkSize, overlap)) {
        const vector = await this.embedTextLocal(piece);
        embedded.push({ content: piece, documentName: entry.name || 'Document', embedding: vector });
      }
    }

    // Rank chunks against the query and keep the best topK (5 by default).
    const queryVector = await this.embedTextLocal(options.query);
    const ranked = embedded
      .filter(item => item.embedding)
      .map(item => ({
        ...item,
        similarityScore: this.cosineSimilarity(queryVector, item.embedding!)
      }))
      .sort((left, right) => right.similarityScore - left.similarityScore)
      .slice(0, options.topK || 5);

    // Pack ranked chunks into the context until the character budget is used up.
    // Budget: rough token-to-char ratio of 3, with 200 tokens held in reserve.
    const budgetChars = (windowSize - 200) * 3;
    const sections: string[] = [];
    let usedChars = 0;
    for (const hit of ranked) {
      const section = `[Source: ${hit.documentName}]\n${hit.content}`;
      if (usedChars + section.length > budgetChars) {
        break;
      }
      sections.push(section);
      usedChars += section.length + 10; // Account for separator
    }

    const context = sections.join('\n\n---\n\n');
    const prompt = `You are a helpful assistant. Answer based ONLY on the following context:\n\n${context}\n\nQuestion: ${options.query}\n\nAnswer:`;

    // Answer with the local model.
    const answer = await this.generate(options.modelId, prompt, {
      temperature: options.temperature,
      maxTokens: options.maxTokens,
    });

    const retrievedChunks = ranked.map((hit, rank) => ({
      id: `local-${rank}`,
      documentId: 'local',
      documentName: hit.documentName,
      content: hit.content,
      similarityScore: hit.similarityScore,
      metadata: {}
    }));

    return {
      query: options.query,
      retrievedChunks,
      generatedResponse: answer,
      context,
      latencyMs: Date.now() - startedAt,
      tierUsed: 1,
    };
  } catch (error: any) {
    this.emitEvent('error', { stage: 'rag_local', error: error.message });
    throw new Error(`Local RAG failed: ${error.message}`);
  }
}
|
|
1296
|
+
|
|
1297
|
+
/**
 * Tier 3: Offline RAG using a synced knowledge base.
 * First call syncKnowledgeBase(), then use this for offline queries.
 *
 * The query is embedded with the local embedding model and compared against the
 * embeddings stored in the synced index. Only chunks whose embedding has the
 * same dimensionality as the query embedding are scored; vectors of different
 * sizes are not comparable and would otherwise produce NaN or meaningless
 * rankings.
 *
 * @param options Query, knowledge base, model and generation settings.
 * @returns The generated answer plus the top-ranked chunks (`tierUsed` is 3).
 * @throws Error when the knowledge base was never synced or the index has expired
 *         (thrown as-is, no event), or an error prefixed with `Offline RAG failed:`
 *         for embedding/generation problems (an `error` event is emitted first).
 */
async ragQueryOffline(options: RAGOptions): Promise<RAGResponse> {
  const startTime = Date.now();

  const index = this.offlineIndexes.get(options.knowledgeBaseId);
  if (!index) {
    throw new Error(`Knowledge base "${options.knowledgeBaseId}" not synced. Call syncKnowledgeBase() first.`);
  }

  // Check expiry
  if (new Date(index.metadata.expires_at) < new Date()) {
    throw new Error('Offline index has expired. Please re-sync.');
  }

  try {
    // Load embedding model
    if (!this.localEmbeddingModel) {
      await this.loadEmbeddingModel();
    }

    // Embed query
    const queryEmbedding = await this.embedTextLocal(options.query);

    // Keep only chunks that actually carry an embedding...
    const storedChunks = Array.isArray(index.chunks) ? index.chunks : [];
    const embeddedChunks = storedChunks.filter(
      (c): c is typeof c & { embedding: number[] } => !!c.embedding && c.embedding.length > 0
    );
    // ...and, of those, only the ones living in the same vector space as the query.
    const comparableChunks = embeddedChunks.filter(c => c.embedding.length === queryEmbedding.length);

    if (embeddedChunks.length > 0 && comparableChunks.length === 0) {
      throw new Error(
        `Embedding dimension mismatch: index uses ${embeddedChunks[0].embedding.length}, local embedding model produces ${queryEmbedding.length}`
      );
    }

    // Search offline index
    const scored = comparableChunks
      .map(c => ({
        ...c,
        similarityScore: this.cosineSimilarity(queryEmbedding, c.embedding)
      }))
      .sort((a, b) => b.similarityScore - a.similarityScore)
      .slice(0, options.topK || 5);

    // Build context with size limits
    const contextWindow = this.modelContextWindow || 2048;
    const maxContextChars = (contextWindow - 200) * 3; // Rough token-to-char ratio, reserving 200 tokens
    let contextLength = 0;
    const contextParts: string[] = [];

    for (const c of scored) {
      const part = `[Source: ${c.document_name}]\n${c.content}`;
      if (contextLength + part.length > maxContextChars) {
        break;
      }
      contextParts.push(part);
      contextLength += part.length + 10; // Account for separator
    }

    const context = contextParts.join('\n\n---\n\n');
    const prompt = `You are a helpful assistant. Answer based ONLY on the following context:\n\n${context}\n\nQuestion: ${options.query}\n\nAnswer:`;

    // Generate locally
    const response = await this.generate(options.modelId, prompt, {
      temperature: options.temperature,
      maxTokens: options.maxTokens,
    });

    return {
      query: options.query,
      retrievedChunks: scored.map(c => ({
        id: c.id,
        documentId: c.document_id,
        documentName: c.document_name,
        content: c.content,
        similarityScore: c.similarityScore,
        metadata: c.metadata
      })),
      generatedResponse: response,
      context,
      latencyMs: Date.now() - startTime,
      tierUsed: 3,
    };
  } catch (error: unknown) {
    const message = error instanceof Error ? error.message : String(error);
    this.emitEvent('error', { stage: 'rag_offline', error: message });
    throw new Error(`Offline RAG failed: ${message}`);
  }
}
|
|
1378
|
+
|
|
1379
|
+
/**
 * Sync a knowledge base for offline use (Tier 3).
 * Downloads chunks + embeddings from the server and keeps them in memory,
 * keyed by `knowledgeBaseId`, for later `ragQueryOffline` calls.
 *
 * The returned package is checked for the `metadata` object and `chunks` array
 * that `ragQueryOffline` relies on before it is stored, so a malformed response
 * fails here with a clear message instead of later with a TypeError.
 *
 * @param knowledgeBaseId Knowledge base to download.
 * @param deviceId Optional device id; falls back to the SDK's device id, then `'sdk-device'`.
 * @returns Chunk count, package size and expiry as reported by the server.
 * @throws Error prefixed with `Sync failed:` when not authenticated, when the request
 *         fails, or when the response is malformed; an `error` event is emitted first.
 */
async syncKnowledgeBase(knowledgeBaseId: string, deviceId?: string): Promise<{ chunkCount: number; sizeMb: number; expiresAt: string }> {
  try {
    if (!this.token) throw new Error('Not authenticated. Call init() first.');

    const response = await axios.post(
      `${this.apiUrl}/api/rag/knowledge-bases/${encodeURIComponent(knowledgeBaseId)}/sync`,
      { device_id: deviceId || this.deviceId || 'sdk-device' },
      { headers: { Authorization: `Bearer ${this.token}` } }
    );

    const { sync_package, chunk_count, package_size_mb, expires_at } = response.data ?? {};

    const looksLikeIndex =
      typeof sync_package === 'object' &&
      sync_package !== null &&
      typeof sync_package.metadata === 'object' &&
      sync_package.metadata !== null &&
      Array.isArray(sync_package.chunks);
    if (!looksLikeIndex) {
      throw new Error('Server returned an invalid sync package');
    }

    this.offlineIndexes.set(knowledgeBaseId, sync_package);

    return {
      chunkCount: chunk_count,
      sizeMb: package_size_mb,
      expiresAt: expires_at
    };
  } catch (error: unknown) {
    const message = error instanceof Error ? error.message : String(error);
    // Mirror the other RAG methods, which all surface failures through the event channel.
    this.emitEvent('error', { stage: 'rag_sync', error: message });
    throw new Error(`Sync failed: ${message}`);
  }
}
|
|
1405
|
+
|
|
1406
|
+
// --- RAG Helper Methods ---
|
|
1407
|
+
|
|
1408
|
+
/**
 * Creates the local feature-extraction pipeline (`Xenova/all-MiniLM-L6-v2`)
 * and stores it on the instance, reporting progress before and after.
 *
 * @throws Re-throws whatever the import or pipeline creation throws, after
 *         reporting it through the progress callback.
 */
private async loadEmbeddingModel(): Promise<void> {
  this.emitProgress('downloading', 0, 'Loading embedding model (all-MiniLM-L6-v2)...');

  try {
    // Loaded lazily so the dependency is only pulled in when RAG is actually used.
    const { pipeline: createPipeline } = await import('@huggingface/transformers');
    const embedder = await createPipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2');
    this.localEmbeddingModel = embedder;
    this.emitProgress('ready', 100, 'Embedding model loaded');
  } catch (error: any) {
    this.emitProgress('error', 0, `Embedding model failed: ${error.message}`);
    throw error;
  }
}
|
|
1419
|
+
|
|
1420
|
+
/**
 * Embeds `text` with the local embedding pipeline (mean pooling, normalised)
 * and returns the vector as a plain number array.
 *
 * @param text Text to embed.
 * @returns The embedding as a flat array.
 * @throws Error when the embedding model has not been loaded, or when the
 *         pipeline output has none of the recognised shapes.
 */
private async embedTextLocal(text: string): Promise<number[]> {
  if (!this.localEmbeddingModel) {
    throw new Error('Embedding model not loaded');
  }

  const output = await this.localEmbeddingModel(text, { pooling: 'mean', normalize: true });

  // Handle different tensor output formats (v2 vs v3 of transformers),
  // checked in this order: raw data buffer, tolist(), plain array.
  if (output.data) {
    return Array.from(output.data);
  } else if (output.tolist) {
    return output.tolist().flat();
  } else if (Array.isArray(output)) {
    return output.flat();
  }

  throw new Error('Unexpected embedding output format');
}
|
|
1429
|
+
|
|
1430
|
+
/**
 * Cosine similarity of two vectors.
 *
 * Only the components both vectors share (the first `min(a.length, b.length)`)
 * take part, so a shorter `b` no longer yields NaN. The result is always a
 * finite number: NaN would make the sort comparators used by the RAG methods
 * inconsistent.
 *
 * @param a First vector.
 * @param b Second vector.
 * @returns Similarity in [-1, 1]; 0 when either vector has zero magnitude or
 *          the computation is not finite.
 */
private cosineSimilarity(a: number[], b: number[]): number {
  const sharedLength = Math.min(a.length, b.length);

  let dot = 0, normA = 0, normB = 0;
  for (let i = 0; i < sharedLength; i++) {
    dot += a[i] * b[i];
    normA += a[i] * a[i];
    normB += b[i] * b[i];
  }

  const denom = Math.sqrt(normA) * Math.sqrt(normB);
  if (denom === 0) return 0;

  const similarity = dot / denom;
  return Number.isFinite(similarity) ? similarity : 0;
}
|
|
1440
|
+
|
|
1441
|
+
/**
 * Splits `text` into overlapping chunks of roughly `chunkSize` characters,
 * preferring to end a chunk just after a `.` or newline when one falls in the
 * second half of the window. Chunks of 20 characters or fewer (after trimming)
 * are dropped.
 *
 * Compared with a naive sliding window this implementation guarantees
 * termination: an invalid `chunkSize` falls back to 512, the overlap is clamped
 * below the chunk size, every iteration advances by at least one character, and
 * the loop stops once a chunk has reached the end of the text (so no redundant
 * tail chunk that is fully contained in the previous one is produced).
 *
 * @param text Text to split; empty input yields `[]`.
 * @param chunkSize Target chunk length in characters (default 512).
 * @param overlap Characters shared between consecutive chunks (default 128).
 * @returns The chunks in document order.
 */
private chunkTextLocal(text: string, chunkSize: number = 512, overlap: number = 128): string[] {
  if (!text || text.length === 0) return [];

  // A zero, negative or non-finite size would never advance through the text.
  const size = Number.isFinite(chunkSize) && chunkSize >= 1 ? Math.floor(chunkSize) : 512;

  let sharedChars = Number.isFinite(overlap) && overlap > 0 ? Math.floor(overlap) : 0;
  if (sharedChars >= size) sharedChars = Math.floor(size * 0.25);

  const chunks: string[] = [];
  let start = 0;

  while (start < text.length) {
    let end = start + size;

    if (end < text.length) {
      // lastIndexOf searches backwards from `end` inclusive, so a break found
      // exactly at `end` makes the chunk one character longer than `size`.
      const bp = Math.max(text.lastIndexOf('.', end), text.lastIndexOf('\n', end));
      if (bp > start + size / 2) end = bp + 1;
    }

    const chunk = text.slice(start, end).trim();
    if (chunk.length > 20) chunks.push(chunk);

    // The chunk just taken already reaches the end of the text.
    if (end >= text.length) break;

    // Step back by the overlap, but never to or before the current start.
    start = Math.max(end - sharedChars, start + 1);
  }

  return chunks;
}
|
|
1459
|
+
|
|
1003
1460
|
// ── Static OpenAI Compatible Factory ────────────────────────────────
|
|
1004
1461
|
|
|
1005
1462
|
static openaiCompatible(config: { apiKey: string; apiUrl?: string; fallback?: FallbackConfig }): OpenAICompatibleClient {
|
|
@@ -1045,4 +1502,8 @@ export type {
|
|
|
1045
1502
|
FallbackConfig,
|
|
1046
1503
|
FallbackProvider,
|
|
1047
1504
|
OpenAICompatibleClient,
|
|
1505
|
+
RAGOptions,
|
|
1506
|
+
RAGChunk,
|
|
1507
|
+
RAGResponse,
|
|
1508
|
+
OfflineIndex,
|
|
1048
1509
|
};
|