@beltoinc/slyos-sdk 1.5.0 → 1.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/create-chatbot.sh +16 -10
- package/dist/index.d.ts +35 -1
- package/dist/index.js +276 -87
- package/package.json +1 -1
- package/src/index.ts +318 -101
package/create-chatbot.sh
CHANGED
|
@@ -329,23 +329,29 @@ async function sendMessage(userMessage) {
|
|
|
329
329
|
const goodChunks = chunks.filter(c => (c.similarity_score || 0) > 0.3);
|
|
330
330
|
|
|
331
331
|
if (goodChunks.length > 0) {
|
|
332
|
-
// Keep context SHORT — small models need room to generate
|
|
333
332
|
const ctxWindow = sdk.getModelContextWindow?.() || 2048;
|
|
334
|
-
|
|
335
|
-
|
|
333
|
+
|
|
334
|
+
// AGGRESSIVE context limits — small models choke on long prompts
|
|
335
|
+
// ~4 chars per token on average, reserve 60% of window for generation
|
|
336
|
+
const maxContextTokens = Math.floor(ctxWindow * 0.3);
|
|
337
|
+
const maxContextChars = ctxWindow <= 2048 ? 400 : ctxWindow <= 4096 ? 1000 : 2000;
|
|
336
338
|
const maxGenTokens = ctxWindow <= 2048 ? 150 : Math.min(300, Math.floor(ctxWindow / 4));
|
|
337
339
|
|
|
338
|
-
//
|
|
339
|
-
|
|
340
|
+
// Use only the single best chunk for small models
|
|
341
|
+
const bestChunk = goodChunks[0];
|
|
342
|
+
let context = bestChunk.content
|
|
340
343
|
.replace(/[^\x20-\x7E\n]/g, ' ') // Strip non-ASCII/control chars
|
|
341
|
-
.replace(/\s{
|
|
342
|
-
.replace(/<[^>]+>/g, ' ') // Strip
|
|
343
|
-
.replace(/https?:\/\/\S+/g, '') // Strip URLs
|
|
344
|
+
.replace(/\s{2,}/g, ' ') // Collapse whitespace
|
|
345
|
+
.replace(/<[^>]+>/g, ' ') // Strip HTML tags
|
|
346
|
+
.replace(/https?:\/\/\S+/g, '') // Strip URLs
|
|
347
|
+
.replace(/[{}()\[\]]/g, '') // Strip brackets/braces
|
|
344
348
|
.trim();
|
|
345
349
|
if (context.length > maxContextChars) context = context.substring(0, maxContextChars);
|
|
346
350
|
|
|
347
|
-
|
|
348
|
-
|
|
351
|
+
console.log(`${colors.dim}Context: ${context.length} chars from "${bestChunk.document_name}"${colors.reset}`);
|
|
352
|
+
|
|
353
|
+
// Minimal prompt — every token counts
|
|
354
|
+
const prompt = `${context}\n\nQ: ${userMessage}\nA:`;
|
|
349
355
|
const response = await sdk.generate(config.model, prompt, {
|
|
350
356
|
temperature: 0.6,
|
|
351
357
|
maxTokens: maxGenTokens
|
package/dist/index.d.ts
CHANGED
|
@@ -44,7 +44,7 @@ interface ProgressEvent {
|
|
|
44
44
|
detail?: any;
|
|
45
45
|
}
|
|
46
46
|
interface SlyEvent {
|
|
47
|
-
type: 'auth' | 'device_registered' | 'device_profiled' | 'model_download_start' | 'model_download_progress' | 'model_loaded' | 'inference_start' | 'inference_complete' | 'error' | 'fallback_success' | 'fallback_error' | 'telemetry_flushed';
|
|
47
|
+
type: 'auth' | 'device_registered' | 'device_profiled' | 'model_download_start' | 'model_download_progress' | 'model_loaded' | 'inference_start' | 'inference_complete' | 'error' | 'fallback_success' | 'fallback_error' | 'telemetry_flushed' | 'token';
|
|
48
48
|
data?: any;
|
|
49
49
|
timestamp: number;
|
|
50
50
|
}
|
|
@@ -126,6 +126,7 @@ interface RAGOptions {
|
|
|
126
126
|
modelId: string;
|
|
127
127
|
temperature?: number;
|
|
128
128
|
maxTokens?: number;
|
|
129
|
+
onToken?: (token: string, partial: string) => void;
|
|
129
130
|
}
|
|
130
131
|
interface RAGChunk {
|
|
131
132
|
id: string;
|
|
@@ -142,6 +143,23 @@ interface RAGResponse {
|
|
|
142
143
|
context: string;
|
|
143
144
|
latencyMs: number;
|
|
144
145
|
tierUsed: 1 | 2 | 3;
|
|
146
|
+
timing: {
|
|
147
|
+
retrievalMs: number;
|
|
148
|
+
contextBuildMs: number;
|
|
149
|
+
firstTokenMs: number;
|
|
150
|
+
generationMs: number;
|
|
151
|
+
totalMs: number;
|
|
152
|
+
tokensGenerated: number;
|
|
153
|
+
tokensPerSecond: number;
|
|
154
|
+
};
|
|
155
|
+
config: {
|
|
156
|
+
maxContextChars: number;
|
|
157
|
+
maxGenTokens: number;
|
|
158
|
+
chunkSize: number;
|
|
159
|
+
topK: number;
|
|
160
|
+
contextWindowUsed: number;
|
|
161
|
+
deviceTier: 'low' | 'mid' | 'high';
|
|
162
|
+
};
|
|
145
163
|
}
|
|
146
164
|
interface OfflineIndex {
|
|
147
165
|
metadata: {
|
|
@@ -224,6 +242,18 @@ declare class SlyOS {
|
|
|
224
242
|
quant?: QuantizationLevel;
|
|
225
243
|
}): Promise<void>;
|
|
226
244
|
generate(modelId: string, prompt: string, options?: GenerateOptions): Promise<string>;
|
|
245
|
+
/**
|
|
246
|
+
* Stream text generation token-by-token.
|
|
247
|
+
* Calls onToken callback for each generated token.
|
|
248
|
+
*/
|
|
249
|
+
generateStream(modelId: string, prompt: string, options?: GenerateOptions & {
|
|
250
|
+
onToken?: (token: string, partial: string) => void;
|
|
251
|
+
}): Promise<{
|
|
252
|
+
text: string;
|
|
253
|
+
firstTokenMs: number;
|
|
254
|
+
totalMs: number;
|
|
255
|
+
tokensGenerated: number;
|
|
256
|
+
}>;
|
|
227
257
|
transcribe(modelId: string, audioInput: any, options?: TranscribeOptions): Promise<string>;
|
|
228
258
|
chatCompletion(modelId: string, request: OpenAIChatCompletionRequest): Promise<OpenAIChatCompletionResponse>;
|
|
229
259
|
bedrockInvoke(modelId: string, request: BedrockInvokeRequest): Promise<BedrockInvokeResponse>;
|
|
@@ -235,6 +265,10 @@ declare class SlyOS {
|
|
|
235
265
|
private mapModelToOpenAI;
|
|
236
266
|
private localEmbeddingModel;
|
|
237
267
|
private offlineIndexes;
|
|
268
|
+
/**
|
|
269
|
+
* Compute dynamic RAG parameters based on device profile and model.
|
|
270
|
+
*/
|
|
271
|
+
private computeRAGConfig;
|
|
238
272
|
/**
|
|
239
273
|
* Tier 2: Cloud-indexed RAG with local inference.
|
|
240
274
|
* Retrieves relevant chunks from server, generates response locally.
|
package/dist/index.js
CHANGED
|
@@ -870,6 +870,61 @@ class SlyOS {
|
|
|
870
870
|
throw error;
|
|
871
871
|
}
|
|
872
872
|
}
|
|
873
|
+
/**
|
|
874
|
+
* Stream text generation token-by-token.
|
|
875
|
+
* Calls onToken callback for each generated token.
|
|
876
|
+
*/
|
|
877
|
+
async generateStream(modelId, prompt, options = {}) {
|
|
878
|
+
if (!this.models.has(modelId)) {
|
|
879
|
+
await this.loadModel(modelId);
|
|
880
|
+
}
|
|
881
|
+
const loaded = this.models.get(modelId);
|
|
882
|
+
if (!loaded)
|
|
883
|
+
throw new Error(`Model "${modelId}" not loaded`);
|
|
884
|
+
const { pipe, info, contextWindow } = loaded;
|
|
885
|
+
if (info.category !== 'llm')
|
|
886
|
+
throw new Error(`Not an LLM`);
|
|
887
|
+
const maxTokens = Math.min(options.maxTokens || 100, contextWindow || 2048);
|
|
888
|
+
const startTime = Date.now();
|
|
889
|
+
let firstTokenTime = 0;
|
|
890
|
+
let accumulated = '';
|
|
891
|
+
this.emitProgress('generating', 0, `Streaming (max ${maxTokens} tokens)...`);
|
|
892
|
+
try {
|
|
893
|
+
const result = await pipe(prompt, {
|
|
894
|
+
max_new_tokens: maxTokens,
|
|
895
|
+
temperature: options.temperature || 0.7,
|
|
896
|
+
top_p: options.topP || 0.9,
|
|
897
|
+
do_sample: true,
|
|
898
|
+
// Transformers.js streamer callback
|
|
899
|
+
callback_function: (output) => {
|
|
900
|
+
if (!firstTokenTime)
|
|
901
|
+
firstTokenTime = Date.now() - startTime;
|
|
902
|
+
if (output && output.length > 0) {
|
|
903
|
+
// output is token IDs, we need to decode
|
|
904
|
+
// The callback in transformers.js v3 gives decoded text tokens
|
|
905
|
+
const tokenText = typeof output === 'string' ? output : '';
|
|
906
|
+
if (tokenText) {
|
|
907
|
+
accumulated += tokenText;
|
|
908
|
+
options.onToken?.(tokenText, accumulated);
|
|
909
|
+
this.emitEvent('token', { token: tokenText, partial: accumulated });
|
|
910
|
+
}
|
|
911
|
+
}
|
|
912
|
+
}
|
|
913
|
+
});
|
|
914
|
+
const rawOutput = result[0].generated_text;
|
|
915
|
+
const response = rawOutput.startsWith(prompt) ? rawOutput.slice(prompt.length).trim() : rawOutput.trim();
|
|
916
|
+
if (!firstTokenTime)
|
|
917
|
+
firstTokenTime = Date.now() - startTime;
|
|
918
|
+
const totalMs = Date.now() - startTime;
|
|
919
|
+
const tokensGenerated = response.split(/\s+/).length;
|
|
920
|
+
this.emitProgress('ready', 100, `Streamed ${tokensGenerated} tokens in ${(totalMs / 1000).toFixed(1)}s`);
|
|
921
|
+
return { text: response, firstTokenMs: firstTokenTime, totalMs, tokensGenerated };
|
|
922
|
+
}
|
|
923
|
+
catch (error) {
|
|
924
|
+
this.emitProgress('error', 0, `Stream failed: ${error.message}`);
|
|
925
|
+
throw error;
|
|
926
|
+
}
|
|
927
|
+
}
|
|
873
928
|
// ── Inference: Transcribe ───────────────────────────────────────
|
|
874
929
|
async transcribe(modelId, audioInput, options = {}) {
|
|
875
930
|
if (!this.models.has(modelId)) {
|
|
@@ -1179,6 +1234,45 @@ class SlyOS {
|
|
|
1179
1234
|
};
|
|
1180
1235
|
return modelMapping[slyModelId] || 'gpt-4o-mini';
|
|
1181
1236
|
}
|
|
1237
|
+
/**
|
|
1238
|
+
* Compute dynamic RAG parameters based on device profile and model.
|
|
1239
|
+
*/
|
|
1240
|
+
computeRAGConfig(modelId) {
|
|
1241
|
+
const contextWindow = this.modelContextWindow || 2048;
|
|
1242
|
+
const memoryMB = this.deviceProfile?.memoryMB || 4096;
|
|
1243
|
+
const cpuCores = this.deviceProfile?.cpuCores || 4;
|
|
1244
|
+
const hasGPU = !!(this.deviceProfile?.gpuRenderer || this.deviceProfile?.webgpuAvailable);
|
|
1245
|
+
// Determine device tier
|
|
1246
|
+
let deviceTier = 'low';
|
|
1247
|
+
if (memoryMB >= 8192 && cpuCores >= 8)
|
|
1248
|
+
deviceTier = 'high';
|
|
1249
|
+
else if (memoryMB >= 4096 && cpuCores >= 4)
|
|
1250
|
+
deviceTier = 'mid';
|
|
1251
|
+
// Context chars: scale with context window AND device capability
|
|
1252
|
+
let maxContextChars;
|
|
1253
|
+
if (contextWindow <= 2048) {
|
|
1254
|
+
maxContextChars = deviceTier === 'high' ? 600 : deviceTier === 'mid' ? 400 : 300;
|
|
1255
|
+
}
|
|
1256
|
+
else if (contextWindow <= 4096) {
|
|
1257
|
+
maxContextChars = deviceTier === 'high' ? 1500 : deviceTier === 'mid' ? 1000 : 600;
|
|
1258
|
+
}
|
|
1259
|
+
else {
|
|
1260
|
+
maxContextChars = deviceTier === 'high' ? 3000 : deviceTier === 'mid' ? 2000 : 1000;
|
|
1261
|
+
}
|
|
1262
|
+
// Gen tokens: scale with device tier
|
|
1263
|
+
let maxGenTokens;
|
|
1264
|
+
if (contextWindow <= 2048) {
|
|
1265
|
+
maxGenTokens = deviceTier === 'high' ? 200 : deviceTier === 'mid' ? 150 : 100;
|
|
1266
|
+
}
|
|
1267
|
+
else {
|
|
1268
|
+
maxGenTokens = deviceTier === 'high' ? 400 : deviceTier === 'mid' ? 300 : 150;
|
|
1269
|
+
}
|
|
1270
|
+
// Chunk size: larger chunks for bigger context windows
|
|
1271
|
+
const chunkSize = contextWindow <= 2048 ? 256 : contextWindow <= 4096 ? 512 : 1024;
|
|
1272
|
+
// TopK: more chunks for powerful devices
|
|
1273
|
+
const topK = deviceTier === 'high' ? 5 : deviceTier === 'mid' ? 3 : 1;
|
|
1274
|
+
return { maxContextChars, maxGenTokens, chunkSize, topK, contextWindow, deviceTier };
|
|
1275
|
+
}
|
|
1182
1276
|
/**
|
|
1183
1277
|
* Tier 2: Cloud-indexed RAG with local inference.
|
|
1184
1278
|
* Retrieves relevant chunks from server, generates response locally.
|
|
@@ -1188,27 +1282,52 @@ class SlyOS {
|
|
|
1188
1282
|
try {
|
|
1189
1283
|
if (!this.token)
|
|
1190
1284
|
throw new Error('Not authenticated. Call init() first.');
|
|
1285
|
+
const ragConfig = this.computeRAGConfig(options.modelId);
|
|
1191
1286
|
// Step 1: Retrieve relevant chunks from backend
|
|
1287
|
+
const retrievalStart = Date.now();
|
|
1192
1288
|
const searchResponse = await axios.post(`${this.apiUrl}/api/rag/knowledge-bases/${options.knowledgeBaseId}/query`, {
|
|
1193
1289
|
query: options.query,
|
|
1194
|
-
top_k: options.topK ||
|
|
1290
|
+
top_k: options.topK || ragConfig.topK,
|
|
1195
1291
|
model_id: options.modelId
|
|
1196
1292
|
}, { headers: { Authorization: `Bearer ${this.token}` } });
|
|
1293
|
+
const retrievalMs = Date.now() - retrievalStart;
|
|
1197
1294
|
let { retrieved_chunks, prompt_template, context } = searchResponse.data;
|
|
1198
|
-
//
|
|
1199
|
-
const
|
|
1200
|
-
|
|
1201
|
-
|
|
1202
|
-
context = context.substring(0, maxContextChars) + '...';
|
|
1295
|
+
// Step 2: Build context with dynamic limits
|
|
1296
|
+
const contextBuildStart = Date.now();
|
|
1297
|
+
if (context && context.length > ragConfig.maxContextChars) {
|
|
1298
|
+
context = context.substring(0, ragConfig.maxContextChars);
|
|
1203
1299
|
}
|
|
1204
|
-
//
|
|
1205
|
-
|
|
1206
|
-
|
|
1207
|
-
|
|
1208
|
-
|
|
1300
|
+
// If no prompt_template from server, build minimal one
|
|
1301
|
+
if (!prompt_template) {
|
|
1302
|
+
prompt_template = `${context}\n\nQ: ${options.query}\nA:`;
|
|
1303
|
+
}
|
|
1304
|
+
const contextBuildMs = Date.now() - contextBuildStart;
|
|
1305
|
+
// Step 3: Generate response — stream if callback provided
|
|
1306
|
+
const genStart = Date.now();
|
|
1307
|
+
let response;
|
|
1308
|
+
let firstTokenMs = 0;
|
|
1309
|
+
if (options.onToken) {
|
|
1310
|
+
const streamResult = await this.generateStream(options.modelId, prompt_template, {
|
|
1311
|
+
temperature: options.temperature,
|
|
1312
|
+
maxTokens: options.maxTokens || ragConfig.maxGenTokens,
|
|
1313
|
+
onToken: options.onToken,
|
|
1314
|
+
});
|
|
1315
|
+
response = streamResult.text;
|
|
1316
|
+
firstTokenMs = streamResult.firstTokenMs;
|
|
1317
|
+
}
|
|
1318
|
+
else {
|
|
1319
|
+
response = await this.generate(options.modelId, prompt_template, {
|
|
1320
|
+
temperature: options.temperature,
|
|
1321
|
+
maxTokens: options.maxTokens || ragConfig.maxGenTokens,
|
|
1322
|
+
});
|
|
1323
|
+
firstTokenMs = Date.now() - genStart; // approximate
|
|
1324
|
+
}
|
|
1325
|
+
const generationMs = Date.now() - genStart;
|
|
1326
|
+
const totalMs = Date.now() - startTime;
|
|
1327
|
+
const tokensGenerated = response.split(/\s+/).length;
|
|
1209
1328
|
return {
|
|
1210
1329
|
query: options.query,
|
|
1211
|
-
retrievedChunks: retrieved_chunks.map((c) => ({
|
|
1330
|
+
retrievedChunks: (retrieved_chunks || []).map((c) => ({
|
|
1212
1331
|
id: c.id,
|
|
1213
1332
|
documentId: c.document_id,
|
|
1214
1333
|
documentName: c.document_name,
|
|
@@ -1218,8 +1337,25 @@ class SlyOS {
|
|
|
1218
1337
|
})),
|
|
1219
1338
|
generatedResponse: response,
|
|
1220
1339
|
context,
|
|
1221
|
-
latencyMs:
|
|
1340
|
+
latencyMs: totalMs,
|
|
1222
1341
|
tierUsed: 2,
|
|
1342
|
+
timing: {
|
|
1343
|
+
retrievalMs,
|
|
1344
|
+
contextBuildMs,
|
|
1345
|
+
firstTokenMs,
|
|
1346
|
+
generationMs,
|
|
1347
|
+
totalMs,
|
|
1348
|
+
tokensGenerated,
|
|
1349
|
+
tokensPerSecond: generationMs > 0 ? tokensGenerated / (generationMs / 1000) : 0,
|
|
1350
|
+
},
|
|
1351
|
+
config: {
|
|
1352
|
+
maxContextChars: ragConfig.maxContextChars,
|
|
1353
|
+
maxGenTokens: ragConfig.maxGenTokens,
|
|
1354
|
+
chunkSize: ragConfig.chunkSize,
|
|
1355
|
+
topK: options.topK || ragConfig.topK,
|
|
1356
|
+
contextWindowUsed: ragConfig.contextWindow,
|
|
1357
|
+
deviceTier: ragConfig.deviceTier,
|
|
1358
|
+
},
|
|
1223
1359
|
};
|
|
1224
1360
|
}
|
|
1225
1361
|
catch (error) {
|
|
@@ -1234,56 +1370,66 @@ class SlyOS {
|
|
|
1234
1370
|
async ragQueryLocal(options) {
|
|
1235
1371
|
const startTime = Date.now();
|
|
1236
1372
|
try {
|
|
1373
|
+
const ragConfig = this.computeRAGConfig(options.modelId);
|
|
1237
1374
|
// Step 1: Load embedding model if needed
|
|
1238
1375
|
if (!this.localEmbeddingModel) {
|
|
1239
1376
|
await this.loadEmbeddingModel();
|
|
1240
1377
|
}
|
|
1241
|
-
//
|
|
1242
|
-
const
|
|
1243
|
-
const chunkSize = contextWindow <= 1024 ? 256 : contextWindow <= 2048 ? 512 : 1024;
|
|
1244
|
-
const overlap = Math.floor(chunkSize / 4);
|
|
1245
|
-
// Step 2: Chunk documents if not already chunked
|
|
1378
|
+
// Step 2: Chunk and embed documents (dynamic chunk size)
|
|
1379
|
+
const retrievalStart = Date.now();
|
|
1246
1380
|
const allChunks = [];
|
|
1247
1381
|
for (const doc of options.documents) {
|
|
1248
|
-
const chunks = this.chunkTextLocal(doc.content, chunkSize,
|
|
1382
|
+
const chunks = this.chunkTextLocal(doc.content, ragConfig.chunkSize, Math.floor(ragConfig.chunkSize / 4));
|
|
1249
1383
|
for (const chunk of chunks) {
|
|
1250
1384
|
const embedding = await this.embedTextLocal(chunk);
|
|
1251
1385
|
allChunks.push({ content: chunk, documentName: doc.name || 'Document', embedding });
|
|
1252
1386
|
}
|
|
1253
1387
|
}
|
|
1254
|
-
// Step 3: Embed query
|
|
1388
|
+
// Step 3: Embed query and search
|
|
1255
1389
|
const queryEmbedding = await this.embedTextLocal(options.query);
|
|
1256
|
-
// Step 4: Cosine similarity search
|
|
1257
1390
|
const scored = allChunks
|
|
1258
1391
|
.filter(c => c.embedding)
|
|
1259
|
-
.map(c => ({
|
|
1260
|
-
...c,
|
|
1261
|
-
similarityScore: this.cosineSimilarity(queryEmbedding, c.embedding)
|
|
1262
|
-
}))
|
|
1392
|
+
.map(c => ({ ...c, similarityScore: this.cosineSimilarity(queryEmbedding, c.embedding) }))
|
|
1263
1393
|
.sort((a, b) => b.similarityScore - a.similarityScore)
|
|
1264
|
-
.slice(0, options.topK ||
|
|
1265
|
-
|
|
1266
|
-
|
|
1267
|
-
|
|
1268
|
-
const
|
|
1269
|
-
|
|
1270
|
-
|
|
1271
|
-
|
|
1272
|
-
|
|
1273
|
-
|
|
1274
|
-
}
|
|
1275
|
-
|
|
1276
|
-
|
|
1277
|
-
|
|
1394
|
+
.slice(0, options.topK || ragConfig.topK);
|
|
1395
|
+
const retrievalMs = Date.now() - retrievalStart;
|
|
1396
|
+
// Step 4: Build context
|
|
1397
|
+
const contextBuildStart = Date.now();
|
|
1398
|
+
const bestChunk = scored[0];
|
|
1399
|
+
let context = bestChunk.content
|
|
1400
|
+
.replace(/[^\x20-\x7E\n]/g, ' ')
|
|
1401
|
+
.replace(/\s{2,}/g, ' ')
|
|
1402
|
+
.replace(/<[^>]+>/g, ' ')
|
|
1403
|
+
.replace(/https?:\/\/\S+/g, '')
|
|
1404
|
+
.replace(/[{}()\[\]]/g, '')
|
|
1405
|
+
.trim();
|
|
1406
|
+
if (context.length > ragConfig.maxContextChars)
|
|
1407
|
+
context = context.substring(0, ragConfig.maxContextChars);
|
|
1408
|
+
const prompt = `${context}\n\nQ: ${options.query}\nA:`;
|
|
1409
|
+
const contextBuildMs = Date.now() - contextBuildStart;
|
|
1410
|
+
// Step 5: Generate — stream if callback provided
|
|
1411
|
+
const genStart = Date.now();
|
|
1412
|
+
let response;
|
|
1413
|
+
let firstTokenMs = 0;
|
|
1414
|
+
if (options.onToken) {
|
|
1415
|
+
const streamResult = await this.generateStream(options.modelId, prompt, {
|
|
1416
|
+
temperature: options.temperature || 0.6,
|
|
1417
|
+
maxTokens: options.maxTokens || ragConfig.maxGenTokens,
|
|
1418
|
+
onToken: options.onToken,
|
|
1419
|
+
});
|
|
1420
|
+
response = streamResult.text;
|
|
1421
|
+
firstTokenMs = streamResult.firstTokenMs;
|
|
1278
1422
|
}
|
|
1279
|
-
|
|
1280
|
-
|
|
1281
|
-
|
|
1282
|
-
|
|
1283
|
-
|
|
1284
|
-
|
|
1285
|
-
|
|
1286
|
-
|
|
1423
|
+
else {
|
|
1424
|
+
response = await this.generate(options.modelId, prompt, {
|
|
1425
|
+
temperature: options.temperature || 0.6,
|
|
1426
|
+
maxTokens: options.maxTokens || ragConfig.maxGenTokens,
|
|
1427
|
+
});
|
|
1428
|
+
firstTokenMs = Date.now() - genStart;
|
|
1429
|
+
}
|
|
1430
|
+
const generationMs = Date.now() - genStart;
|
|
1431
|
+
const totalMs = Date.now() - startTime;
|
|
1432
|
+
const tokensGenerated = response.split(/\s+/).length;
|
|
1287
1433
|
return {
|
|
1288
1434
|
query: options.query,
|
|
1289
1435
|
retrievedChunks: scored.map((c, i) => ({
|
|
@@ -1296,8 +1442,25 @@ class SlyOS {
|
|
|
1296
1442
|
})),
|
|
1297
1443
|
generatedResponse: response,
|
|
1298
1444
|
context,
|
|
1299
|
-
latencyMs:
|
|
1445
|
+
latencyMs: totalMs,
|
|
1300
1446
|
tierUsed: 1,
|
|
1447
|
+
timing: {
|
|
1448
|
+
retrievalMs,
|
|
1449
|
+
contextBuildMs,
|
|
1450
|
+
firstTokenMs,
|
|
1451
|
+
generationMs,
|
|
1452
|
+
totalMs,
|
|
1453
|
+
tokensGenerated,
|
|
1454
|
+
tokensPerSecond: generationMs > 0 ? tokensGenerated / (generationMs / 1000) : 0,
|
|
1455
|
+
},
|
|
1456
|
+
config: {
|
|
1457
|
+
maxContextChars: ragConfig.maxContextChars,
|
|
1458
|
+
maxGenTokens: ragConfig.maxGenTokens,
|
|
1459
|
+
chunkSize: ragConfig.chunkSize,
|
|
1460
|
+
topK: options.topK || ragConfig.topK,
|
|
1461
|
+
contextWindowUsed: ragConfig.contextWindow,
|
|
1462
|
+
deviceTier: ragConfig.deviceTier,
|
|
1463
|
+
},
|
|
1301
1464
|
};
|
|
1302
1465
|
}
|
|
1303
1466
|
catch (error) {
|
|
@@ -1312,52 +1475,61 @@ class SlyOS {
|
|
|
1312
1475
|
async ragQueryOffline(options) {
|
|
1313
1476
|
const startTime = Date.now();
|
|
1314
1477
|
const index = this.offlineIndexes.get(options.knowledgeBaseId);
|
|
1315
|
-
if (!index)
|
|
1316
|
-
throw new Error(`
|
|
1317
|
-
|
|
1318
|
-
|
|
1319
|
-
if (new Date(index.metadata.expires_at) < new Date()) {
|
|
1320
|
-
throw new Error('Offline index has expired. Please re-sync.');
|
|
1321
|
-
}
|
|
1478
|
+
if (!index)
|
|
1479
|
+
throw new Error(`KB "${options.knowledgeBaseId}" not synced.`);
|
|
1480
|
+
if (new Date(index.metadata.expires_at) < new Date())
|
|
1481
|
+
throw new Error('Offline index expired.');
|
|
1322
1482
|
try {
|
|
1483
|
+
const ragConfig = this.computeRAGConfig(options.modelId);
|
|
1323
1484
|
// Load embedding model
|
|
1324
|
-
if (!this.localEmbeddingModel)
|
|
1485
|
+
if (!this.localEmbeddingModel)
|
|
1325
1486
|
await this.loadEmbeddingModel();
|
|
1326
|
-
}
|
|
1327
|
-
// Embed query
|
|
1328
|
-
const queryEmbedding = await this.embedTextLocal(options.query);
|
|
1329
1487
|
// Search offline index
|
|
1488
|
+
const retrievalStart = Date.now();
|
|
1489
|
+
const queryEmbedding = await this.embedTextLocal(options.query);
|
|
1330
1490
|
const scored = index.chunks
|
|
1331
1491
|
.filter(c => c.embedding && c.embedding.length > 0)
|
|
1332
|
-
.map(c => ({
|
|
1333
|
-
...c,
|
|
1334
|
-
similarityScore: this.cosineSimilarity(queryEmbedding, c.embedding)
|
|
1335
|
-
}))
|
|
1492
|
+
.map(c => ({ ...c, similarityScore: this.cosineSimilarity(queryEmbedding, c.embedding) }))
|
|
1336
1493
|
.sort((a, b) => b.similarityScore - a.similarityScore)
|
|
1337
|
-
.slice(0, options.topK ||
|
|
1338
|
-
|
|
1339
|
-
|
|
1340
|
-
const
|
|
1341
|
-
|
|
1342
|
-
|
|
1343
|
-
|
|
1344
|
-
|
|
1345
|
-
|
|
1346
|
-
|
|
1347
|
-
|
|
1348
|
-
|
|
1349
|
-
|
|
1350
|
-
|
|
1351
|
-
|
|
1494
|
+
.slice(0, options.topK || ragConfig.topK);
|
|
1495
|
+
const retrievalMs = Date.now() - retrievalStart;
|
|
1496
|
+
// Build context
|
|
1497
|
+
const contextBuildStart = Date.now();
|
|
1498
|
+
const bestChunk = scored[0];
|
|
1499
|
+
let context = bestChunk.content
|
|
1500
|
+
.replace(/[^\x20-\x7E\n]/g, ' ')
|
|
1501
|
+
.replace(/\s{2,}/g, ' ')
|
|
1502
|
+
.replace(/<[^>]+>/g, ' ')
|
|
1503
|
+
.replace(/https?:\/\/\S+/g, '')
|
|
1504
|
+
.replace(/[{}()\[\]]/g, '')
|
|
1505
|
+
.trim();
|
|
1506
|
+
if (context.length > ragConfig.maxContextChars)
|
|
1507
|
+
context = context.substring(0, ragConfig.maxContextChars);
|
|
1508
|
+
const prompt = `${context}\n\nQ: ${options.query}\nA:`;
|
|
1509
|
+
const contextBuildMs = Date.now() - contextBuildStart;
|
|
1510
|
+
// Generate
|
|
1511
|
+
const genStart = Date.now();
|
|
1512
|
+
let response;
|
|
1513
|
+
let firstTokenMs = 0;
|
|
1514
|
+
if (options.onToken) {
|
|
1515
|
+
const streamResult = await this.generateStream(options.modelId, prompt, {
|
|
1516
|
+
temperature: options.temperature || 0.6,
|
|
1517
|
+
maxTokens: options.maxTokens || ragConfig.maxGenTokens,
|
|
1518
|
+
onToken: options.onToken,
|
|
1519
|
+
});
|
|
1520
|
+
response = streamResult.text;
|
|
1521
|
+
firstTokenMs = streamResult.firstTokenMs;
|
|
1352
1522
|
}
|
|
1353
|
-
|
|
1354
|
-
|
|
1355
|
-
|
|
1356
|
-
|
|
1357
|
-
|
|
1358
|
-
|
|
1359
|
-
|
|
1360
|
-
|
|
1523
|
+
else {
|
|
1524
|
+
response = await this.generate(options.modelId, prompt, {
|
|
1525
|
+
temperature: options.temperature || 0.6,
|
|
1526
|
+
maxTokens: options.maxTokens || ragConfig.maxGenTokens,
|
|
1527
|
+
});
|
|
1528
|
+
firstTokenMs = Date.now() - genStart;
|
|
1529
|
+
}
|
|
1530
|
+
const generationMs = Date.now() - genStart;
|
|
1531
|
+
const totalMs = Date.now() - startTime;
|
|
1532
|
+
const tokensGenerated = response.split(/\s+/).length;
|
|
1361
1533
|
return {
|
|
1362
1534
|
query: options.query,
|
|
1363
1535
|
retrievedChunks: scored.map(c => ({
|
|
@@ -1370,8 +1542,25 @@ class SlyOS {
|
|
|
1370
1542
|
})),
|
|
1371
1543
|
generatedResponse: response,
|
|
1372
1544
|
context,
|
|
1373
|
-
latencyMs:
|
|
1545
|
+
latencyMs: totalMs,
|
|
1374
1546
|
tierUsed: 3,
|
|
1547
|
+
timing: {
|
|
1548
|
+
retrievalMs,
|
|
1549
|
+
contextBuildMs,
|
|
1550
|
+
firstTokenMs,
|
|
1551
|
+
generationMs,
|
|
1552
|
+
totalMs,
|
|
1553
|
+
tokensGenerated,
|
|
1554
|
+
tokensPerSecond: generationMs > 0 ? tokensGenerated / (generationMs / 1000) : 0,
|
|
1555
|
+
},
|
|
1556
|
+
config: {
|
|
1557
|
+
maxContextChars: ragConfig.maxContextChars,
|
|
1558
|
+
maxGenTokens: ragConfig.maxGenTokens,
|
|
1559
|
+
chunkSize: ragConfig.chunkSize,
|
|
1560
|
+
topK: options.topK || ragConfig.topK,
|
|
1561
|
+
contextWindowUsed: ragConfig.contextWindow,
|
|
1562
|
+
deviceTier: ragConfig.deviceTier,
|
|
1563
|
+
},
|
|
1375
1564
|
};
|
|
1376
1565
|
}
|
|
1377
1566
|
catch (error) {
|
package/package.json
CHANGED
package/src/index.ts
CHANGED
|
@@ -69,7 +69,7 @@ interface ProgressEvent {
|
|
|
69
69
|
}
|
|
70
70
|
|
|
71
71
|
interface SlyEvent {
|
|
72
|
-
type: 'auth' | 'device_registered' | 'device_profiled' | 'model_download_start' | 'model_download_progress' | 'model_loaded' | 'inference_start' | 'inference_complete' | 'error' | 'fallback_success' | 'fallback_error' | 'telemetry_flushed';
|
|
72
|
+
type: 'auth' | 'device_registered' | 'device_profiled' | 'model_download_start' | 'model_download_progress' | 'model_loaded' | 'inference_start' | 'inference_complete' | 'error' | 'fallback_success' | 'fallback_error' | 'telemetry_flushed' | 'token';
|
|
73
73
|
data?: any;
|
|
74
74
|
timestamp: number;
|
|
75
75
|
}
|
|
@@ -174,6 +174,8 @@ interface RAGOptions {
|
|
|
174
174
|
modelId: string;
|
|
175
175
|
temperature?: number;
|
|
176
176
|
maxTokens?: number;
|
|
177
|
+
// NEW: streaming callback
|
|
178
|
+
onToken?: (token: string, partial: string) => void;
|
|
177
179
|
}
|
|
178
180
|
|
|
179
181
|
interface RAGChunk {
|
|
@@ -192,6 +194,25 @@ interface RAGResponse {
|
|
|
192
194
|
context: string;
|
|
193
195
|
latencyMs: number;
|
|
194
196
|
tierUsed: 1 | 2 | 3;
|
|
197
|
+
// NEW: detailed timing metrics
|
|
198
|
+
timing: {
|
|
199
|
+
retrievalMs: number; // Time spent retrieving/embedding chunks
|
|
200
|
+
contextBuildMs: number; // Time spent building context
|
|
201
|
+
firstTokenMs: number; // Time to first token (from generation start)
|
|
202
|
+
generationMs: number; // Total generation time
|
|
203
|
+
totalMs: number; // End-to-end latency
|
|
204
|
+
tokensGenerated: number; // Number of tokens in response
|
|
205
|
+
tokensPerSecond: number; // Generation throughput
|
|
206
|
+
};
|
|
207
|
+
// NEW: dynamic config used
|
|
208
|
+
config: {
|
|
209
|
+
maxContextChars: number;
|
|
210
|
+
maxGenTokens: number;
|
|
211
|
+
chunkSize: number;
|
|
212
|
+
topK: number;
|
|
213
|
+
contextWindowUsed: number;
|
|
214
|
+
deviceTier: 'low' | 'mid' | 'high';
|
|
215
|
+
};
|
|
195
216
|
}
|
|
196
217
|
|
|
197
218
|
interface OfflineIndex {
|
|
@@ -1145,6 +1166,68 @@ class SlyOS {
|
|
|
1145
1166
|
}
|
|
1146
1167
|
}
|
|
1147
1168
|
|
|
1169
|
+
/**
|
|
1170
|
+
* Stream text generation token-by-token.
|
|
1171
|
+
* Calls onToken callback for each generated token.
|
|
1172
|
+
*/
|
|
1173
|
+
async generateStream(
|
|
1174
|
+
modelId: string,
|
|
1175
|
+
prompt: string,
|
|
1176
|
+
options: GenerateOptions & { onToken?: (token: string, partial: string) => void } = {}
|
|
1177
|
+
): Promise<{ text: string; firstTokenMs: number; totalMs: number; tokensGenerated: number }> {
|
|
1178
|
+
if (!this.models.has(modelId)) {
|
|
1179
|
+
await this.loadModel(modelId);
|
|
1180
|
+
}
|
|
1181
|
+
const loaded = this.models.get(modelId);
|
|
1182
|
+
if (!loaded) throw new Error(`Model "${modelId}" not loaded`);
|
|
1183
|
+
const { pipe, info, contextWindow } = loaded;
|
|
1184
|
+
if (info.category !== 'llm') throw new Error(`Not an LLM`);
|
|
1185
|
+
|
|
1186
|
+
const maxTokens = Math.min(options.maxTokens || 100, contextWindow || 2048);
|
|
1187
|
+
const startTime = Date.now();
|
|
1188
|
+
let firstTokenTime = 0;
|
|
1189
|
+
let accumulated = '';
|
|
1190
|
+
|
|
1191
|
+
this.emitProgress('generating', 0, `Streaming (max ${maxTokens} tokens)...`);
|
|
1192
|
+
|
|
1193
|
+
try {
|
|
1194
|
+
const result = await pipe(prompt, {
|
|
1195
|
+
max_new_tokens: maxTokens,
|
|
1196
|
+
temperature: options.temperature || 0.7,
|
|
1197
|
+
top_p: options.topP || 0.9,
|
|
1198
|
+
do_sample: true,
|
|
1199
|
+
// Transformers.js streamer callback
|
|
1200
|
+
callback_function: (output: any) => {
|
|
1201
|
+
if (!firstTokenTime) firstTokenTime = Date.now() - startTime;
|
|
1202
|
+
if (output && output.length > 0) {
|
|
1203
|
+
// output is token IDs, we need to decode
|
|
1204
|
+
// The callback in transformers.js v3 gives decoded text tokens
|
|
1205
|
+
const tokenText = typeof output === 'string' ? output : '';
|
|
1206
|
+
if (tokenText) {
|
|
1207
|
+
accumulated += tokenText;
|
|
1208
|
+
options.onToken?.(tokenText, accumulated);
|
|
1209
|
+
this.emitEvent('token', { token: tokenText, partial: accumulated });
|
|
1210
|
+
}
|
|
1211
|
+
}
|
|
1212
|
+
}
|
|
1213
|
+
});
|
|
1214
|
+
|
|
1215
|
+
const rawOutput = result[0].generated_text;
|
|
1216
|
+
const response = rawOutput.startsWith(prompt) ? rawOutput.slice(prompt.length).trim() : rawOutput.trim();
|
|
1217
|
+
|
|
1218
|
+
if (!firstTokenTime) firstTokenTime = Date.now() - startTime;
|
|
1219
|
+
const totalMs = Date.now() - startTime;
|
|
1220
|
+
const tokensGenerated = response.split(/\s+/).length;
|
|
1221
|
+
|
|
1222
|
+
this.emitProgress('ready', 100, `Streamed ${tokensGenerated} tokens in ${(totalMs/1000).toFixed(1)}s`);
|
|
1223
|
+
|
|
1224
|
+
return { text: response, firstTokenMs: firstTokenTime, totalMs, tokensGenerated };
|
|
1225
|
+
} catch (error: any) {
|
|
1226
|
+
this.emitProgress('error', 0, `Stream failed: ${error.message}`);
|
|
1227
|
+
throw error;
|
|
1228
|
+
}
|
|
1229
|
+
}
|
|
1230
|
+
|
|
1148
1231
|
// ── Inference: Transcribe ───────────────────────────────────────
|
|
1149
1232
|
|
|
1150
1233
|
async transcribe(modelId: string, audioInput: any, options: TranscribeOptions = {}): Promise<string> {
|
|
@@ -1495,6 +1578,54 @@ class SlyOS {
|
|
|
1495
1578
|
private localEmbeddingModel: any = null;
|
|
1496
1579
|
private offlineIndexes: Map<string, OfflineIndex> = new Map();
|
|
1497
1580
|
|
|
1581
|
+
/**
 * Compute dynamic RAG parameters based on device profile and model.
 *
 * Inputs are read from instance state rather than from the argument:
 * - `this.modelContextWindow` — falls back to 2048 when unset or 0.
 * - `this.deviceProfile.memoryMB` — falls back to 4096 when unset or 0.
 * - `this.deviceProfile.cpuCores` — falls back to 4 when unset or 0.
 *
 * Device tier rules (both conditions must hold):
 * - `high`: memoryMB >= 8192 and cpuCores >= 8
 * - `mid`:  memoryMB >= 4096 and cpuCores >= 4
 * - `low`:  everything else
 * With no device profile available the fallbacks therefore yield `mid`.
 *
 * @param modelId - Currently unused by this method; kept so existing call
 *   sites (`computeRAGConfig(options.modelId)`) remain valid.
 * @returns The prompt-context character budget, generation token budget,
 *   chunk size, default top-K, the context window that was used, and the
 *   resolved device tier.
 */
private computeRAGConfig(modelId: string): {
  maxContextChars: number;
  maxGenTokens: number;
  chunkSize: number;
  topK: number;
  contextWindow: number;
  deviceTier: 'low' | 'mid' | 'high';
} {
  // `||` (not `??`) is deliberate: a reported value of 0 is treated as "unknown".
  const contextWindow = this.modelContextWindow || 2048;
  const memoryMB = this.deviceProfile?.memoryMB || 4096;
  const cpuCores = this.deviceProfile?.cpuCores || 4;

  // Determine device tier
  let deviceTier: 'low' | 'mid' | 'high' = 'low';
  if (memoryMB >= 8192 && cpuCores >= 8) deviceTier = 'high';
  else if (memoryMB >= 4096 && cpuCores >= 4) deviceTier = 'mid';

  // Context chars: scale with context window AND device capability
  let maxContextChars: number;
  if (contextWindow <= 2048) {
    maxContextChars = deviceTier === 'high' ? 600 : deviceTier === 'mid' ? 400 : 300;
  } else if (contextWindow <= 4096) {
    maxContextChars = deviceTier === 'high' ? 1500 : deviceTier === 'mid' ? 1000 : 600;
  } else {
    maxContextChars = deviceTier === 'high' ? 3000 : deviceTier === 'mid' ? 2000 : 1000;
  }

  // Gen tokens: scale with device tier (only two context-window bands here)
  let maxGenTokens: number;
  if (contextWindow <= 2048) {
    maxGenTokens = deviceTier === 'high' ? 200 : deviceTier === 'mid' ? 150 : 100;
  } else {
    maxGenTokens = deviceTier === 'high' ? 400 : deviceTier === 'mid' ? 300 : 150;
  }

  // Chunk size: larger chunks for bigger context windows
  const chunkSize = contextWindow <= 2048 ? 256 : contextWindow <= 4096 ? 512 : 1024;

  // TopK: more chunks for powerful devices
  const topK = deviceTier === 'high' ? 5 : deviceTier === 'mid' ? 3 : 1;

  return { maxContextChars, maxGenTokens, chunkSize, topK, contextWindow, deviceTier };
}
|
|
1628
|
+
|
|
1498
1629
|
/**
|
|
1499
1630
|
* Tier 2: Cloud-indexed RAG with local inference.
|
|
1500
1631
|
* Retrieves relevant chunks from server, generates response locally.
|
|
@@ -1505,36 +1636,61 @@ class SlyOS {
|
|
|
1505
1636
|
try {
|
|
1506
1637
|
if (!this.token) throw new Error('Not authenticated. Call init() first.');
|
|
1507
1638
|
|
|
1639
|
+
const ragConfig = this.computeRAGConfig(options.modelId);
|
|
1640
|
+
|
|
1508
1641
|
// Step 1: Retrieve relevant chunks from backend
|
|
1642
|
+
const retrievalStart = Date.now();
|
|
1509
1643
|
const searchResponse = await axios.post(
|
|
1510
1644
|
`${this.apiUrl}/api/rag/knowledge-bases/${options.knowledgeBaseId}/query`,
|
|
1511
1645
|
{
|
|
1512
1646
|
query: options.query,
|
|
1513
|
-
top_k: options.topK ||
|
|
1647
|
+
top_k: options.topK || ragConfig.topK,
|
|
1514
1648
|
model_id: options.modelId
|
|
1515
1649
|
},
|
|
1516
1650
|
{ headers: { Authorization: `Bearer ${this.token}` } }
|
|
1517
1651
|
);
|
|
1652
|
+
const retrievalMs = Date.now() - retrievalStart;
|
|
1518
1653
|
|
|
1519
1654
|
let { retrieved_chunks, prompt_template, context } = searchResponse.data;
|
|
1520
1655
|
|
|
1521
|
-
//
|
|
1522
|
-
const
|
|
1523
|
-
|
|
1524
|
-
|
|
1525
|
-
if (context && context.length > maxContextChars) {
|
|
1526
|
-
context = context.substring(0, maxContextChars) + '...';
|
|
1656
|
+
// Step 2: Build context with dynamic limits
|
|
1657
|
+
const contextBuildStart = Date.now();
|
|
1658
|
+
if (context && context.length > ragConfig.maxContextChars) {
|
|
1659
|
+
context = context.substring(0, ragConfig.maxContextChars);
|
|
1527
1660
|
}
|
|
1528
|
-
|
|
1529
|
-
|
|
1530
|
-
|
|
1531
|
-
|
|
1532
|
-
|
|
1533
|
-
|
|
1661
|
+
// If no prompt_template from server, build minimal one
|
|
1662
|
+
if (!prompt_template) {
|
|
1663
|
+
prompt_template = `${context}\n\nQ: ${options.query}\nA:`;
|
|
1664
|
+
}
|
|
1665
|
+
const contextBuildMs = Date.now() - contextBuildStart;
|
|
1666
|
+
|
|
1667
|
+
// Step 3: Generate response — stream if callback provided
|
|
1668
|
+
const genStart = Date.now();
|
|
1669
|
+
let response: string;
|
|
1670
|
+
let firstTokenMs = 0;
|
|
1671
|
+
|
|
1672
|
+
if (options.onToken) {
|
|
1673
|
+
const streamResult = await this.generateStream(options.modelId, prompt_template, {
|
|
1674
|
+
temperature: options.temperature,
|
|
1675
|
+
maxTokens: options.maxTokens || ragConfig.maxGenTokens,
|
|
1676
|
+
onToken: options.onToken,
|
|
1677
|
+
});
|
|
1678
|
+
response = streamResult.text;
|
|
1679
|
+
firstTokenMs = streamResult.firstTokenMs;
|
|
1680
|
+
} else {
|
|
1681
|
+
response = await this.generate(options.modelId, prompt_template, {
|
|
1682
|
+
temperature: options.temperature,
|
|
1683
|
+
maxTokens: options.maxTokens || ragConfig.maxGenTokens,
|
|
1684
|
+
});
|
|
1685
|
+
firstTokenMs = Date.now() - genStart; // approximate
|
|
1686
|
+
}
|
|
1687
|
+
const generationMs = Date.now() - genStart;
|
|
1688
|
+
const totalMs = Date.now() - startTime;
|
|
1689
|
+
const tokensGenerated = response.split(/\s+/).length;
|
|
1534
1690
|
|
|
1535
1691
|
return {
|
|
1536
1692
|
query: options.query,
|
|
1537
|
-
retrievedChunks: retrieved_chunks.map((c: any) => ({
|
|
1693
|
+
retrievedChunks: (retrieved_chunks || []).map((c: any) => ({
|
|
1538
1694
|
id: c.id,
|
|
1539
1695
|
documentId: c.document_id,
|
|
1540
1696
|
documentName: c.document_name,
|
|
@@ -1544,8 +1700,25 @@ class SlyOS {
|
|
|
1544
1700
|
})),
|
|
1545
1701
|
generatedResponse: response,
|
|
1546
1702
|
context,
|
|
1547
|
-
latencyMs:
|
|
1703
|
+
latencyMs: totalMs,
|
|
1548
1704
|
tierUsed: 2,
|
|
1705
|
+
timing: {
|
|
1706
|
+
retrievalMs,
|
|
1707
|
+
contextBuildMs,
|
|
1708
|
+
firstTokenMs,
|
|
1709
|
+
generationMs,
|
|
1710
|
+
totalMs,
|
|
1711
|
+
tokensGenerated,
|
|
1712
|
+
tokensPerSecond: generationMs > 0 ? tokensGenerated / (generationMs / 1000) : 0,
|
|
1713
|
+
},
|
|
1714
|
+
config: {
|
|
1715
|
+
maxContextChars: ragConfig.maxContextChars,
|
|
1716
|
+
maxGenTokens: ragConfig.maxGenTokens,
|
|
1717
|
+
chunkSize: ragConfig.chunkSize,
|
|
1718
|
+
topK: options.topK || ragConfig.topK,
|
|
1719
|
+
contextWindowUsed: ragConfig.contextWindow,
|
|
1720
|
+
deviceTier: ragConfig.deviceTier,
|
|
1721
|
+
},
|
|
1549
1722
|
};
|
|
1550
1723
|
} catch (error: any) {
|
|
1551
1724
|
this.emitEvent('error', { stage: 'rag_query', error: error.message });
|
|
@@ -1561,63 +1734,70 @@ class SlyOS {
|
|
|
1561
1734
|
const startTime = Date.now();
|
|
1562
1735
|
|
|
1563
1736
|
try {
|
|
1737
|
+
const ragConfig = this.computeRAGConfig(options.modelId);
|
|
1738
|
+
|
|
1564
1739
|
// Step 1: Load embedding model if needed
|
|
1565
1740
|
if (!this.localEmbeddingModel) {
|
|
1566
1741
|
await this.loadEmbeddingModel();
|
|
1567
1742
|
}
|
|
1568
1743
|
|
|
1569
|
-
//
|
|
1570
|
-
const
|
|
1571
|
-
const chunkSize = contextWindow <= 1024 ? 256 : contextWindow <= 2048 ? 512 : 1024;
|
|
1572
|
-
const overlap = Math.floor(chunkSize / 4);
|
|
1573
|
-
|
|
1574
|
-
// Step 2: Chunk documents if not already chunked
|
|
1744
|
+
// Step 2: Chunk and embed documents (dynamic chunk size)
|
|
1745
|
+
const retrievalStart = Date.now();
|
|
1575
1746
|
const allChunks: Array<{ content: string; documentName: string; embedding?: number[] }> = [];
|
|
1576
1747
|
for (const doc of options.documents) {
|
|
1577
|
-
const chunks = this.chunkTextLocal(doc.content, chunkSize,
|
|
1748
|
+
const chunks = this.chunkTextLocal(doc.content, ragConfig.chunkSize, Math.floor(ragConfig.chunkSize / 4));
|
|
1578
1749
|
for (const chunk of chunks) {
|
|
1579
1750
|
const embedding = await this.embedTextLocal(chunk);
|
|
1580
1751
|
allChunks.push({ content: chunk, documentName: doc.name || 'Document', embedding });
|
|
1581
1752
|
}
|
|
1582
1753
|
}
|
|
1583
1754
|
|
|
1584
|
-
// Step 3: Embed query
|
|
1755
|
+
// Step 3: Embed query and search
|
|
1585
1756
|
const queryEmbedding = await this.embedTextLocal(options.query);
|
|
1586
|
-
|
|
1587
|
-
// Step 4: Cosine similarity search
|
|
1588
1757
|
const scored = allChunks
|
|
1589
1758
|
.filter(c => c.embedding)
|
|
1590
|
-
.map(c => ({
|
|
1591
|
-
...c,
|
|
1592
|
-
similarityScore: this.cosineSimilarity(queryEmbedding, c.embedding!)
|
|
1593
|
-
}))
|
|
1759
|
+
.map(c => ({ ...c, similarityScore: this.cosineSimilarity(queryEmbedding, c.embedding!) }))
|
|
1594
1760
|
.sort((a, b) => b.similarityScore - a.similarityScore)
|
|
1595
|
-
.slice(0, options.topK ||
|
|
1596
|
-
|
|
1597
|
-
|
|
1598
|
-
|
|
1599
|
-
|
|
1600
|
-
const
|
|
1601
|
-
|
|
1602
|
-
|
|
1603
|
-
|
|
1604
|
-
|
|
1605
|
-
|
|
1606
|
-
|
|
1607
|
-
|
|
1608
|
-
|
|
1609
|
-
|
|
1761
|
+
.slice(0, options.topK || ragConfig.topK);
|
|
1762
|
+
const retrievalMs = Date.now() - retrievalStart;
|
|
1763
|
+
|
|
1764
|
+
// Step 4: Build context
|
|
1765
|
+
const contextBuildStart = Date.now();
|
|
1766
|
+
const bestChunk = scored[0];
|
|
1767
|
+
let context = bestChunk.content
|
|
1768
|
+
.replace(/[^\x20-\x7E\n]/g, ' ')
|
|
1769
|
+
.replace(/\s{2,}/g, ' ')
|
|
1770
|
+
.replace(/<[^>]+>/g, ' ')
|
|
1771
|
+
.replace(/https?:\/\/\S+/g, '')
|
|
1772
|
+
.replace(/[{}()\[\]]/g, '')
|
|
1773
|
+
.trim();
|
|
1774
|
+
if (context.length > ragConfig.maxContextChars) context = context.substring(0, ragConfig.maxContextChars);
|
|
1775
|
+
const prompt = `${context}\n\nQ: ${options.query}\nA:`;
|
|
1776
|
+
const contextBuildMs = Date.now() - contextBuildStart;
|
|
1777
|
+
|
|
1778
|
+
// Step 5: Generate — stream if callback provided
|
|
1779
|
+
const genStart = Date.now();
|
|
1780
|
+
let response: string;
|
|
1781
|
+
let firstTokenMs = 0;
|
|
1782
|
+
|
|
1783
|
+
if (options.onToken) {
|
|
1784
|
+
const streamResult = await this.generateStream(options.modelId, prompt, {
|
|
1785
|
+
temperature: options.temperature || 0.6,
|
|
1786
|
+
maxTokens: options.maxTokens || ragConfig.maxGenTokens,
|
|
1787
|
+
onToken: options.onToken,
|
|
1788
|
+
});
|
|
1789
|
+
response = streamResult.text;
|
|
1790
|
+
firstTokenMs = streamResult.firstTokenMs;
|
|
1791
|
+
} else {
|
|
1792
|
+
response = await this.generate(options.modelId, prompt, {
|
|
1793
|
+
temperature: options.temperature || 0.6,
|
|
1794
|
+
maxTokens: options.maxTokens || ragConfig.maxGenTokens,
|
|
1795
|
+
});
|
|
1796
|
+
firstTokenMs = Date.now() - genStart;
|
|
1610
1797
|
}
|
|
1611
|
-
|
|
1612
|
-
const
|
|
1613
|
-
const
|
|
1614
|
-
|
|
1615
|
-
// Step 6: Generate locally
|
|
1616
|
-
const maxGen = contextWindow <= 2048 ? 150 : Math.min(300, Math.floor(contextWindow / 4));
|
|
1617
|
-
const response = await this.generate(options.modelId, prompt, {
|
|
1618
|
-
temperature: options.temperature || 0.6,
|
|
1619
|
-
maxTokens: options.maxTokens || maxGen,
|
|
1620
|
-
});
|
|
1798
|
+
const generationMs = Date.now() - genStart;
|
|
1799
|
+
const totalMs = Date.now() - startTime;
|
|
1800
|
+
const tokensGenerated = response.split(/\s+/).length;
|
|
1621
1801
|
|
|
1622
1802
|
return {
|
|
1623
1803
|
query: options.query,
|
|
@@ -1631,8 +1811,25 @@ class SlyOS {
|
|
|
1631
1811
|
})),
|
|
1632
1812
|
generatedResponse: response,
|
|
1633
1813
|
context,
|
|
1634
|
-
latencyMs:
|
|
1814
|
+
latencyMs: totalMs,
|
|
1635
1815
|
tierUsed: 1,
|
|
1816
|
+
timing: {
|
|
1817
|
+
retrievalMs,
|
|
1818
|
+
contextBuildMs,
|
|
1819
|
+
firstTokenMs,
|
|
1820
|
+
generationMs,
|
|
1821
|
+
totalMs,
|
|
1822
|
+
tokensGenerated,
|
|
1823
|
+
tokensPerSecond: generationMs > 0 ? tokensGenerated / (generationMs / 1000) : 0,
|
|
1824
|
+
},
|
|
1825
|
+
config: {
|
|
1826
|
+
maxContextChars: ragConfig.maxContextChars,
|
|
1827
|
+
maxGenTokens: ragConfig.maxGenTokens,
|
|
1828
|
+
chunkSize: ragConfig.chunkSize,
|
|
1829
|
+
topK: options.topK || ragConfig.topK,
|
|
1830
|
+
contextWindowUsed: ragConfig.contextWindow,
|
|
1831
|
+
deviceTier: ragConfig.deviceTier,
|
|
1832
|
+
},
|
|
1636
1833
|
};
|
|
1637
1834
|
} catch (error: any) {
|
|
1638
1835
|
this.emitEvent('error', { stage: 'rag_local', error: error.message });
|
|
@@ -1648,59 +1845,62 @@ class SlyOS {
|
|
|
1648
1845
|
const startTime = Date.now();
|
|
1649
1846
|
|
|
1650
1847
|
const index = this.offlineIndexes.get(options.knowledgeBaseId);
|
|
1651
|
-
if (!index) {
|
|
1652
|
-
|
|
1653
|
-
}
|
|
1654
|
-
|
|
1655
|
-
// Check expiry
|
|
1656
|
-
if (new Date(index.metadata.expires_at) < new Date()) {
|
|
1657
|
-
throw new Error('Offline index has expired. Please re-sync.');
|
|
1658
|
-
}
|
|
1848
|
+
if (!index) throw new Error(`KB "${options.knowledgeBaseId}" not synced.`);
|
|
1849
|
+
if (new Date(index.metadata.expires_at) < new Date()) throw new Error('Offline index expired.');
|
|
1659
1850
|
|
|
1660
1851
|
try {
|
|
1661
|
-
|
|
1662
|
-
if (!this.localEmbeddingModel) {
|
|
1663
|
-
await this.loadEmbeddingModel();
|
|
1664
|
-
}
|
|
1852
|
+
const ragConfig = this.computeRAGConfig(options.modelId);
|
|
1665
1853
|
|
|
1666
|
-
//
|
|
1667
|
-
|
|
1854
|
+
// Load embedding model
|
|
1855
|
+
if (!this.localEmbeddingModel) await this.loadEmbeddingModel();
|
|
1668
1856
|
|
|
1669
1857
|
// Search offline index
|
|
1858
|
+
const retrievalStart = Date.now();
|
|
1859
|
+
const queryEmbedding = await this.embedTextLocal(options.query);
|
|
1670
1860
|
const scored = index.chunks
|
|
1671
1861
|
.filter(c => c.embedding && c.embedding.length > 0)
|
|
1672
|
-
.map(c => ({
|
|
1673
|
-
...c,
|
|
1674
|
-
similarityScore: this.cosineSimilarity(queryEmbedding, c.embedding!)
|
|
1675
|
-
}))
|
|
1862
|
+
.map(c => ({ ...c, similarityScore: this.cosineSimilarity(queryEmbedding, c.embedding!) }))
|
|
1676
1863
|
.sort((a, b) => b.similarityScore - a.similarityScore)
|
|
1677
|
-
.slice(0, options.topK ||
|
|
1678
|
-
|
|
1679
|
-
|
|
1680
|
-
|
|
1681
|
-
const
|
|
1682
|
-
|
|
1683
|
-
|
|
1684
|
-
|
|
1685
|
-
|
|
1686
|
-
|
|
1687
|
-
|
|
1688
|
-
|
|
1689
|
-
|
|
1690
|
-
|
|
1691
|
-
|
|
1692
|
-
|
|
1864
|
+
.slice(0, options.topK || ragConfig.topK);
|
|
1865
|
+
const retrievalMs = Date.now() - retrievalStart;
|
|
1866
|
+
|
|
1867
|
+
// Build context
|
|
1868
|
+
const contextBuildStart = Date.now();
|
|
1869
|
+
const bestChunk = scored[0];
|
|
1870
|
+
let context = bestChunk.content
|
|
1871
|
+
.replace(/[^\x20-\x7E\n]/g, ' ')
|
|
1872
|
+
.replace(/\s{2,}/g, ' ')
|
|
1873
|
+
.replace(/<[^>]+>/g, ' ')
|
|
1874
|
+
.replace(/https?:\/\/\S+/g, '')
|
|
1875
|
+
.replace(/[{}()\[\]]/g, '')
|
|
1876
|
+
.trim();
|
|
1877
|
+
if (context.length > ragConfig.maxContextChars) context = context.substring(0, ragConfig.maxContextChars);
|
|
1878
|
+
const prompt = `${context}\n\nQ: ${options.query}\nA:`;
|
|
1879
|
+
const contextBuildMs = Date.now() - contextBuildStart;
|
|
1880
|
+
|
|
1881
|
+
// Generate
|
|
1882
|
+
const genStart = Date.now();
|
|
1883
|
+
let response: string;
|
|
1884
|
+
let firstTokenMs = 0;
|
|
1885
|
+
|
|
1886
|
+
if (options.onToken) {
|
|
1887
|
+
const streamResult = await this.generateStream(options.modelId, prompt, {
|
|
1888
|
+
temperature: options.temperature || 0.6,
|
|
1889
|
+
maxTokens: options.maxTokens || ragConfig.maxGenTokens,
|
|
1890
|
+
onToken: options.onToken,
|
|
1891
|
+
});
|
|
1892
|
+
response = streamResult.text;
|
|
1893
|
+
firstTokenMs = streamResult.firstTokenMs;
|
|
1894
|
+
} else {
|
|
1895
|
+
response = await this.generate(options.modelId, prompt, {
|
|
1896
|
+
temperature: options.temperature || 0.6,
|
|
1897
|
+
maxTokens: options.maxTokens || ragConfig.maxGenTokens,
|
|
1898
|
+
});
|
|
1899
|
+
firstTokenMs = Date.now() - genStart;
|
|
1693
1900
|
}
|
|
1694
|
-
|
|
1695
|
-
const
|
|
1696
|
-
const
|
|
1697
|
-
|
|
1698
|
-
// Generate locally
|
|
1699
|
-
const maxGen = contextWindow <= 2048 ? 150 : Math.min(300, Math.floor(contextWindow / 4));
|
|
1700
|
-
const response = await this.generate(options.modelId, prompt, {
|
|
1701
|
-
temperature: options.temperature || 0.6,
|
|
1702
|
-
maxTokens: options.maxTokens || maxGen,
|
|
1703
|
-
});
|
|
1901
|
+
const generationMs = Date.now() - genStart;
|
|
1902
|
+
const totalMs = Date.now() - startTime;
|
|
1903
|
+
const tokensGenerated = response.split(/\s+/).length;
|
|
1704
1904
|
|
|
1705
1905
|
return {
|
|
1706
1906
|
query: options.query,
|
|
@@ -1714,8 +1914,25 @@ class SlyOS {
|
|
|
1714
1914
|
})),
|
|
1715
1915
|
generatedResponse: response,
|
|
1716
1916
|
context,
|
|
1717
|
-
latencyMs:
|
|
1917
|
+
latencyMs: totalMs,
|
|
1718
1918
|
tierUsed: 3,
|
|
1919
|
+
timing: {
|
|
1920
|
+
retrievalMs,
|
|
1921
|
+
contextBuildMs,
|
|
1922
|
+
firstTokenMs,
|
|
1923
|
+
generationMs,
|
|
1924
|
+
totalMs,
|
|
1925
|
+
tokensGenerated,
|
|
1926
|
+
tokensPerSecond: generationMs > 0 ? tokensGenerated / (generationMs / 1000) : 0,
|
|
1927
|
+
},
|
|
1928
|
+
config: {
|
|
1929
|
+
maxContextChars: ragConfig.maxContextChars,
|
|
1930
|
+
maxGenTokens: ragConfig.maxGenTokens,
|
|
1931
|
+
chunkSize: ragConfig.chunkSize,
|
|
1932
|
+
topK: options.topK || ragConfig.topK,
|
|
1933
|
+
contextWindowUsed: ragConfig.contextWindow,
|
|
1934
|
+
deviceTier: ragConfig.deviceTier,
|
|
1935
|
+
},
|
|
1719
1936
|
};
|
|
1720
1937
|
} catch (error: any) {
|
|
1721
1938
|
this.emitEvent('error', { stage: 'rag_offline', error: error.message });
|