client-llm-preprocessor 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +162 -0
- package/dist/index.d.ts +1319 -0
- package/dist/index.js +1040 -0
- package/package.json +67 -0
package/dist/index.d.ts
ADDED
@@ -0,0 +1,1319 @@
import * as webllm from '@mlc-ai/web-llm';

/**
 * Advanced Internal LLM Logging System
 *
 * Captures detailed insights into LLM processing:
 * - Token-by-token generation (if streaming available)
 * - Prompt construction steps
 * - Intermediate processing states
 * - Validation steps
 * - Performance metrics
 */

class InternalLogger {
  constructor(options = {}) {
    this.enabled = options.enabled !== false; // Default: enabled
    this.verbose = options.verbose || false;
    this.logLevel = options.logLevel || 'info'; // 'debug', 'info', 'warn', 'error'
    this.logs = [];
    this.maxLogs = options.maxLogs || 1000;
    this.onLogCallback = options.onLogCallback || null;
  }

  /**
   * Enable or disable logging
   */
  setEnabled(enabled) {
    this.enabled = enabled;
  }

  /**
   * Set verbosity level
   */
  setVerbose(verbose) {
    this.verbose = verbose;
  }

  /**
   * Log an event with metadata
   */
  log(level, category, message, data = {}) {
    if (!this.enabled) return;

    const logEntry = {
      timestamp: new Date().toISOString(),
      level,
      category,
      message,
      data,
      stackTrace: this.verbose ? new Error().stack : undefined
    };

    // Store log
    this.logs.push(logEntry);
    if (this.logs.length > this.maxLogs) {
      this.logs.shift(); // Remove oldest
    }

    // Console output based on level
    if (this.shouldLog(level)) {
      const prefix = `[${level.toUpperCase()}] [${category}]`;
      console.log(`${prefix} ${message}`, data);
    }

    // Callback for external handlers
    if (this.onLogCallback) {
      this.onLogCallback(logEntry);
    }
  }

  /**
   * Check if log level should be output
   */
  shouldLog(level) {
    const levels = { debug: 0, info: 1, warn: 2, error: 3 };
    return levels[level] >= levels[this.logLevel];
  }

  /**
   * Log prompt construction
   */
  logPromptConstruction(operation, originalPrompt, finalPrompt, options = {}) {
    this.log('debug', 'PROMPT', 'Constructing prompt', {
      operation,
      originalLength: originalPrompt.length,
      finalLength: finalPrompt.length,
      options,
      promptPreview: finalPrompt.substring(0, 200) + '...'
    });
  }

  /**
   * Log token generation (if streaming available)
   */
  logTokenGeneration(token, cumulativeText, tokenIndex) {
    if (this.verbose) {
      this.log('debug', 'TOKEN', `Generated token ${tokenIndex}`, {
        token,
        cumulativeLength: cumulativeText.length,
        tokenIndex
      });
    }
  }

  /**
   * Log LLM inference start
   */
  logInferenceStart(prompt, options) {
    this.log('info', 'INFERENCE', 'Starting LLM inference', {
      promptLength: prompt.length,
      temperature: options.temperature,
      maxTokens: options.maxTokens,
      promptPreview: prompt.substring(0, 100) + '...'
    });
  }

  /**
   * Log LLM inference completion
   */
  logInferenceComplete(response, duration, tokenCount) {
    this.log('info', 'INFERENCE', 'LLM inference completed', {
      responseLength: response.length,
      duration: `${duration}ms`,
      estimatedTokens: tokenCount,
      responsePreview: response.substring(0, 200) + '...'
    });
  }

  /**
   * Log validation step
   */
  logValidation(step, input, output, isValid, error = null) {
    this.log(isValid ? 'info' : 'warn', 'VALIDATION', `Validation: ${step}`, {
      inputPreview: typeof input === 'string' ? input.substring(0, 100) : input,
      outputPreview: typeof output === 'string' ? output.substring(0, 100) : output,
      isValid,
      error: error?.message
    });
  }

  /**
   * Log pipeline step
   */
  logPipelineStep(stepIndex, stepName, input, output, duration) {
    this.log('info', 'PIPELINE', `Pipeline step ${stepIndex + 1}: ${stepName}`, {
      inputLength: typeof input === 'string' ? input.length : 'N/A',
      outputLength: typeof output === 'string' ? output.length : 'N/A',
      duration: `${duration}ms`,
      inputPreview: typeof input === 'string' ? input.substring(0, 50) : input,
      outputPreview: typeof output === 'string' ? output.substring(0, 50) : output
    });
  }

  /**
   * Log error with context
   */
  logError(operation, error, context = {}) {
    this.log('error', 'ERROR', `Error in ${operation}`, {
      error: error.message,
      stack: error.stack,
      context
    });
  }

  /**
   * Log performance metrics
   */
  logPerformance(operation, metrics) {
    this.log('info', 'PERFORMANCE', `Performance: ${operation}`, metrics);
  }

  /**
   * Get all logs
   */
  getLogs(filter = {}) {
    let filtered = [...this.logs];

    if (filter.level) {
      filtered = filtered.filter(log => log.level === filter.level);
    }

    if (filter.category) {
      filtered = filtered.filter(log => log.category === filter.category);
    }

    if (filter.since) {
      const sinceTime = new Date(filter.since).getTime();
      filtered = filtered.filter(log => new Date(log.timestamp).getTime() >= sinceTime);
    }

    return filtered;
  }

  /**
   * Get logs as formatted string
   */
  getLogsAsString(filter = {}) {
    const logs = this.getLogs(filter);
    return logs.map(log => {
      return `[${log.timestamp}] [${log.level}] [${log.category}] ${log.message}`;
    }).join('\n');
  }

  /**
   * Clear logs
   */
  clear() {
    this.logs = [];
  }

  /**
   * Export logs as JSON
   */
  exportLogs() {
    return JSON.stringify(this.logs, null, 2);
  }

  /**
   * Get summary statistics
   */
  getStats() {
    const stats = {
      totalLogs: this.logs.length,
      byLevel: {},
      byCategory: {},
      errors: 0,
      warnings: 0,
      timeRange: {
        start: this.logs[0]?.timestamp,
        end: this.logs[this.logs.length - 1]?.timestamp
      }
    };

    this.logs.forEach(log => {
      stats.byLevel[log.level] = (stats.byLevel[log.level] || 0) + 1;
      stats.byCategory[log.category] = (stats.byCategory[log.category] || 0) + 1;
      if (log.level === 'error') stats.errors++;
      if (log.level === 'warn') stats.warnings++;
    });

    return stats;
  }
}
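
/*
 * Illustrative usage (editor's sketch, not part of the published file): drive
 * the logger directly and inspect its in-memory buffer.
 *
 *   const logger = new InternalLogger({
 *     logLevel: 'debug',
 *     maxLogs: 100,
 *     onLogCallback: (entry) => console.debug('sink:', entry.category),
 *   });
 *   logger.log('info', 'DEMO', 'hello', { n: 1 });
 *   logger.log('warn', 'DEMO', 'careful');
 *   logger.getStats();                          // { totalLogs: 2, warnings: 1, ... }
 *   logger.getLogsAsString({ level: 'warn' });  // "[...] [warn] [DEMO] careful"
 */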

// Singleton instance
let defaultLogger = null;

/**
 * Get or create default logger
 */
function getLogger(options = {}) {
  if (!defaultLogger) {
    defaultLogger = new InternalLogger(options);
  }
  return defaultLogger;
}

/**
 * WebLLM Engine Wrapper
 * Handles model loading and inference with detailed internal logging
 */
class LLMEngine {
  constructor(options = {}) {
    this.engine = null;
    this.model = null;
    this.logger = options.logger || getLogger(options.loggerOptions);
    this.streamingEnabled = options.streaming !== false; // Try streaming by default
  }

  /**
   * Load a WebLLM model
   * @param {string} model - Model name (default: "Llama-3.2-1B-Instruct-q4f16_1-MLC")
   * @returns {Promise<void>}
   */
  async loadModel(model = "Llama-3.2-1B-Instruct-q4f16_1-MLC") {
    if (this.engine && this.model === model) {
      this.logger.log('info', 'MODEL', 'Model already loaded, skipping');
      return;
    }

    const startTime = Date.now();
    this.logger.log('info', 'MODEL', `Loading model: ${model}`, { model });

    try {
      this.engine = await webllm.CreateMLCEngine(model, {
        initProgressCallback: (report) => {
          if (report.progress) {
            const progress = (report.progress * 100).toFixed(1);
            this.logger.log('info', 'MODEL', `Loading progress: ${progress}%`, {
              progress: parseFloat(progress),
              report
            });
          }
        },
      });

      this.model = model;
      const loadTime = Date.now() - startTime;
      this.logger.log('info', 'MODEL', 'Model loaded successfully', {
        model,
        loadTime: `${loadTime}ms`
      });
    } catch (error) {
      this.logger.logError('loadModel', error, { model });
      throw new Error(`Failed to load model: ${error.message}`);
    }
  }

  /**
   * Run inference with the loaded model
   * Captures detailed internal state including token-by-token generation
   * @param {string} prompt - The prompt to send to the model
   * @param {Object} options - Generation options
   * @returns {Promise<string>}
   */
  async run(prompt, options = {}) {
    if (!this.engine) {
      throw new Error("Model not loaded. Call loadModel() first.");
    }

    const {
      temperature = 0.7,
      maxTokens = 512,
      stopSequences = [],
      stream = this.streamingEnabled, // Try streaming for token-by-token logging
    } = options;

    const startTime = Date.now();
    this.logger.logInferenceStart(prompt, { temperature, maxTokens, stopSequences, stream });

    try {
      let fullResponse = '';
      let tokenCount = 0;
      const tokens = [];

      // Try streaming first for token-by-token visibility
      if (stream && this.engine.chat?.completions?.createStream) {
        this.logger.log('info', 'INFERENCE', 'Using streaming mode for token-by-token logging');

        try {
          const completionStream = await this.engine.chat.completions.createStream({
            messages: [{ role: "user", content: prompt }],
            temperature,
            max_tokens: maxTokens,
            stop: stopSequences.length > 0 ? stopSequences : undefined,
          });

          // Capture each token as it's generated
          for await (const chunk of completionStream) {
            const delta = chunk.choices?.[0]?.delta?.content || '';
            if (delta) {
              fullResponse += delta;
              tokenCount++;
              tokens.push(delta);

              // Log each token (if verbose)
              this.logger.logTokenGeneration(delta, fullResponse, tokenCount);
            }
          }

          this.logger.log('info', 'INFERENCE', 'Streaming completed', {
            totalTokens: tokenCount,
            responseLength: fullResponse.length
          });
        } catch (streamError) {
          // Fallback to non-streaming if streaming fails
          this.logger.log('warn', 'INFERENCE', 'Streaming failed, falling back to non-streaming', {
            error: streamError.message
          });
          return await this.runNonStreaming(prompt, options, startTime);
        }
      } else {
        // Non-streaming mode
        return await this.runNonStreaming(prompt, options, startTime);
      }

      const duration = Date.now() - startTime;
      this.logger.logInferenceComplete(fullResponse, duration, tokenCount);

      // Log token sequence for analysis
      this.logger.log('debug', 'INFERENCE', 'Token sequence captured', {
        tokenCount,
        tokens: tokens.slice(0, 20), // First 20 tokens
        fullSequenceLength: tokens.length
      });

      return fullResponse;
    } catch (error) {
      const duration = Date.now() - startTime;
      this.logger.logError('run', error, {
        promptLength: prompt.length,
        duration: `${duration}ms`,
        options
      });
      throw new Error(`Inference failed: ${error.message}`);
    }
  }

  /**
   * Non-streaming inference (fallback)
   * @private
   */
  async runNonStreaming(prompt, options, startTime) {
    const {
      temperature = 0.7,
      maxTokens = 512,
      stopSequences = [],
    } = options;

    this.logger.log('info', 'INFERENCE', 'Using non-streaming mode');

    const response = await this.engine.chat.completions.create({
      messages: [{ role: "user", content: prompt }],
      temperature,
      max_tokens: maxTokens,
      stop: stopSequences.length > 0 ? stopSequences : undefined,
    });

    const result = response.choices[0].message.content;
    const duration = Date.now() - (startTime || Date.now());
    const estimatedTokens = Math.ceil(result.length / 4); // Rough estimate: ~4 chars per token

    this.logger.logInferenceComplete(result, duration, estimatedTokens);

    return result;
  }

  /**
   * Check if model is loaded
   * @returns {boolean}
   */
  isLoaded() {
    return this.engine !== null;
  }

  /**
   * Get the logger instance
   * @returns {InternalLogger}
   */
  getLogger() {
    return this.logger;
  }

  /**
   * Enable/disable streaming for token-by-token logging
   */
  setStreaming(enabled) {
    this.streamingEnabled = enabled;
    this.logger.log('info', 'ENGINE', `Streaming ${enabled ? 'enabled' : 'disabled'}`);
  }
}
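
/*
 * Illustrative usage (editor's sketch, assuming a WebGPU-capable browser; the
 * model weights are fetched by WebLLM on first load):
 *
 *   const engine = new LLMEngine({ loggerOptions: { logLevel: 'info' } });
 *   await engine.loadModel(); // "Llama-3.2-1B-Instruct-q4f16_1-MLC" by default
 *   const reply = await engine.run('Summarize: WebGPU exposes GPU compute to the web.', {
 *     temperature: 0.2,
 *     maxTokens: 64,
 *   });
 */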

/**
 * Non-LLM cleaning using rules and regex
 * Fast, deterministic, works without model
 * All options are opt-in (default: false) - user chooses what to remove
 */

/**
 * Clean text using rule-based approach (no LLM)
 * @param {string} text - Text to clean
 * @param {Object} options - Cleaning options (all optional, default: false)
 * @param {boolean} options.removeHtml - Remove HTML tags (default: false)
 * @param {boolean} options.removeUrls - Remove URLs (default: false)
 * @param {boolean} options.removeExtraWhitespace - Remove extra whitespace (default: false)
 * @param {boolean} options.removeLineBreaks - Remove line breaks (default: false)
 * @param {boolean} options.removeSpecialChars - Remove special characters (default: false)
 * @param {boolean} options.decodeHtmlEntities - Decode HTML entities like &amp; (default: false)
 * @returns {string}
 */
function cleanWithRules(text, options = {}) {
  const {
    removeHtml = false,
    removeUrls = false,
    removeExtraWhitespace = false,
    removeLineBreaks = false,
    removeSpecialChars = false,
    decodeHtmlEntities = false,
  } = options;

  let cleaned = text;

  // Decode HTML entities (if requested)
  if (decodeHtmlEntities) {
    cleaned = cleaned
      .replace(/&nbsp;/g, ' ')
      .replace(/&amp;/g, '&')
      .replace(/&lt;/g, '<')
      .replace(/&gt;/g, '>')
      .replace(/&quot;/g, '"')
      .replace(/&#39;/g, "'")
      .replace(/&apos;/g, "'")
      .replace(/&#x2F;/g, '/');
  }

  // Remove HTML tags (if requested)
  if (removeHtml) {
    cleaned = cleaned.replace(/<[^>]+>/g, '');
  }

  // Remove URLs (if requested)
  if (removeUrls) {
    cleaned = cleaned.replace(/https?:\/\/[^\s]+/g, '');
  }

  // Remove line breaks (if requested)
  if (removeLineBreaks) {
    cleaned = cleaned.replace(/[\r\n]+/g, ' ');
  }

  // Remove extra whitespace (if requested)
  if (removeExtraWhitespace) {
    // Replace multiple spaces with single space
    cleaned = cleaned.replace(/[ \t]+/g, ' ');
    // Remove leading/trailing whitespace from each line
    if (!removeLineBreaks) {
      cleaned = cleaned.split('\n').map(line => line.trim()).join('\n');
    }
    // Remove multiple newlines (if line breaks not removed)
    if (!removeLineBreaks) {
      cleaned = cleaned.replace(/\n{3,}/g, '\n\n');
    }
    // Trim overall
    cleaned = cleaned.trim();
  }

  // Remove special characters (if requested)
  if (removeSpecialChars) {
    // Keep alphanumeric, spaces, and basic punctuation
    cleaned = cleaned.replace(/[^\w\s.,!?;:()\-'"]/g, '');
  }

  return cleaned;
}
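
/*
 * Illustrative usage (editor's sketch): rule-based cleaning is synchronous and
 * strictly opt-in, so unselected options leave the text untouched.
 *
 *   cleanWithRules('<p>Hello&nbsp;&amp;   welcome</p>', {
 *     decodeHtmlEntities: true,
 *     removeHtml: true,
 *     removeExtraWhitespace: true,
 *   });
 *   // => 'Hello & welcome'
 */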

/**
 * Clean text by removing noise, HTML, and irrelevant content
 * Uses LLM if available, falls back to rule-based cleaning if not
 * All options are opt-in (default: false) - user chooses what to remove
 * @param {LLMEngine|null} engine - The LLM engine instance (can be null)
 * @param {string} text - Text to clean
 * @param {Object} options - Cleaning options (all optional, default: false)
 * @param {boolean} options.removeHtml - Remove HTML tags (default: false)
 * @param {boolean} options.removeUrls - Remove URLs (default: false)
 * @param {boolean} options.removeExtraWhitespace - Remove extra whitespace (default: false)
 * @param {boolean} options.removeLineBreaks - Remove line breaks (default: false)
 * @param {boolean} options.removeSpecialChars - Remove special characters (default: false)
 * @param {boolean} options.decodeHtmlEntities - Decode HTML entities like &amp; (default: false)
 * @param {string} options.customInstructions - Additional cleaning instructions (requires LLM)
 * @param {boolean} options.useLLM - Force LLM usage if model is loaded (default: auto-detect)
 * @returns {Promise<string>}
 */
async function clean(engine, text, options = {}) {
  const {
    removeHtml = false,
    removeUrls = false,
    removeExtraWhitespace = false,
    removeLineBreaks = false,
    removeSpecialChars = false,
    decodeHtmlEntities = false,
    customInstructions = "",
    useLLM = null, // null = auto-detect, true = force LLM, false = force rules
  } = options;

  // Check if we should use LLM
  const shouldUseLLM = useLLM !== false &&
    engine !== null &&
    engine.isLoaded() &&
    (useLLM === true || customInstructions !== "");

  if (!shouldUseLLM) {
    // Use fast rule-based cleaning (no LLM needed)
    const logger = engine?.getLogger();
    if (logger) {
      logger.log('info', 'CLEAN', 'Using rule-based cleaning (no LLM)', {
        reason: !engine ? 'No engine' : !engine.isLoaded() ? 'Model not loaded' : 'useLLM=false',
        options: { removeHtml, removeUrls, removeExtraWhitespace, removeLineBreaks, removeSpecialChars, decodeHtmlEntities }
      });
    }

    return cleanWithRules(text, {
      removeHtml,
      removeUrls,
      removeExtraWhitespace,
      removeLineBreaks,
      removeSpecialChars,
      decodeHtmlEntities
    });
  }

  // Use LLM for semantic cleaning (especially if customInstructions provided)
  const logger = engine.getLogger();

  // Build prompt based on user's selections
  const cleaningSteps = [];

  if (removeHtml) cleaningSteps.push('HTML tags');
  if (removeUrls) cleaningSteps.push('URLs');
  if (removeExtraWhitespace) cleaningSteps.push('extra whitespace');
  if (removeLineBreaks) cleaningSteps.push('line breaks');
  if (removeSpecialChars) cleaningSteps.push('special characters');
  if (decodeHtmlEntities) cleaningSteps.push('decode HTML entities');

  let originalPrompt = `Clean the following text`;
  let prompt = originalPrompt;

  if (cleaningSteps.length > 0) {
    prompt += ` by removing: ${cleaningSteps.join(', ')}`;
  } else if (!customInstructions) {
    // If no options selected and no custom instructions, just return text
    return text;
  }

  // Add instruction to preserve meaning
  prompt += `. IMPORTANT: Do NOT modify the meaning or remove important information. Only remove what was requested.`;

  if (customInstructions) {
    prompt += ` Also: ${customInstructions}`;
  }

  prompt += `:\n\n${text}`;

  logger.logPromptConstruction('clean', originalPrompt, prompt, options);
  logger.log('info', 'CLEAN', 'Using LLM-based cleaning');

  const result = await engine.run(prompt, { temperature: 0.3 });
  const cleaned = result.trim();

  logger.log('info', 'CLEAN', 'LLM cleaning completed', {
    originalLength: text.length,
    finalLength: cleaned.length
  });

  return cleaned;
}
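
/*
 * Illustrative usage (editor's sketch): with no engine the call degrades to
 * cleanWithRules(); with a loaded engine plus customInstructions it takes the
 * LLM path.
 *
 *   await clean(null, text, { removeUrls: true });  // rule-based only
 *   await clean(engine, text, {
 *     useLLM: true,
 *     customInstructions: 'Remove all greetings',
 *   });                                             // LLM-based
 */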

/**
 * Chunk text into smaller pieces
 * @param {string} text - Text to chunk
 * @param {Object} options - Chunking options
 * @returns {string[]}
 */
function chunk(text, options = {}) {
  const {
    size = 500, // Character count per chunk
    overlap = 0, // Overlap between chunks (in characters)
    strategy = "character", // "character", "sentence", "word"
  } = options;

  if (!text || text.length === 0) {
    return [];
  }

  const chunks = [];

  if (strategy === "character") {
    // Guard against a non-positive step (overlap >= size), which would loop forever
    const step = Math.max(1, size - overlap);
    for (let i = 0; i < text.length; i += step) {
      chunks.push(text.slice(i, i + size));
    }
  } else if (strategy === "sentence") {
    const sentences = text.match(/[^.!?]+[.!?]+/g) || [text];
    let currentChunk = "";

    for (const sentence of sentences) {
      if (currentChunk.length + sentence.length > size && currentChunk) {
        chunks.push(currentChunk.trim());
        currentChunk = sentence;
      } else {
        currentChunk += sentence;
      }
    }

    if (currentChunk.trim()) {
      chunks.push(currentChunk.trim());
    }
  } else if (strategy === "word") {
    const words = text.split(/\s+/);
    let currentChunk = [];
    let currentSize = 0;

    for (const word of words) {
      if (currentSize + word.length > size && currentChunk.length > 0) {
        chunks.push(currentChunk.join(" "));
        currentChunk = [word];
        currentSize = word.length;
      } else {
        currentChunk.push(word);
        currentSize += word.length + 1; // +1 for space
      }
    }

    if (currentChunk.length > 0) {
      chunks.push(currentChunk.join(" "));
    }
  }

  return chunks.filter(ch => ch.length > 0);
}
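
/*
 * Illustrative outputs (editor's sketch) for the three strategies:
 *
 *   chunk('abcdefghij', { size: 4 });             // ['abcd', 'efgh', 'ij']
 *   chunk('abcdefghij', { size: 4, overlap: 2 }); // ['abcd', 'cdef', 'efgh', 'ghij', 'ij']
 *   chunk('One. Two! Three?', { size: 10, strategy: 'sentence' }); // ['One. Two!', 'Three?']
 *   chunk('a bb ccc dddd', { size: 5, strategy: 'word' });         // ['a bb', 'ccc', 'dddd']
 */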

/**
 * Rule-based validation utilities
 * Prevents hallucinations by validating LLM output against source text
 */

/**
 * Validate JSON structure and parse safely
 */
function validateJSON(text, expectedFields = []) {
  try {
    // Strip markdown code blocks if present (e.g., ```json ... ```)
    const jsonMatch = text.match(/```(?:json)?\s*([\s\S]*?)\s*```/);
    const cleanText = jsonMatch ? jsonMatch[1] : text;
    const parsed = JSON.parse(cleanText);

    // If fields specified, check they exist
    if (expectedFields.length > 0) {
      const missingFields = expectedFields.filter(field => !(field in parsed));
      if (missingFields.length > 0) {
        return {
          isValid: false,
          error: `Missing required fields: ${missingFields.join(', ')}`,
          data: parsed
        };
      }
    }

    return {
      isValid: true,
      data: parsed
    };
  } catch (error) {
    return {
      isValid: false,
      error: `Invalid JSON: ${error.message}`,
      data: null
    };
  }
}
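
/*
 * Illustrative usage (editor's sketch): markdown fences around the JSON are
 * stripped before parsing, and missing fields are reported rather than thrown.
 *
 *   validateJSON('```json\n{"name":"Ada"}\n```', ['name', 'email']);
 *   // => { isValid: false, error: 'Missing required fields: email', data: { name: 'Ada' } }
 */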

/**
 * Format-specific validators
 */
const validators = {
  email: (value) => /^[^\s@]+@[^\s@]+\.[^\s@]+$/.test(value),
  phone: (value) => /^[\d\s\-\+\(\)]+$/.test(value) && value.replace(/\D/g, '').length >= 7,
  url: (value) => /^https?:\/\/.+/.test(value),
};

/**
 * Verify extracted data exists in source text
 * Prevents hallucination by checking if extracted values appear in original text
 * Now uses exact matching for structured fields and format validation
 */
function verifyExtraction(extracted, sourceText, fields = []) {
  const sourceLower = sourceText.toLowerCase();
  const issues = [];

  if (typeof extracted === 'object' && extracted !== null) {
    // Check each field
    for (const [key, value] of Object.entries(extracted)) {
      if (fields.length > 0 && !fields.includes(key)) {
        continue; // Skip fields not in expected list
      }

      if (value && typeof value === 'string' && value.trim().length > 0) {
        const valueLower = value.toLowerCase();

        // First, try exact substring match (case-insensitive)
        let foundInSource = sourceLower.includes(valueLower);

        // If not found, try format-specific validation
        if (!foundInSource) {
          // Check if it matches expected format
          const fieldType = key.toLowerCase();
          if (validators[fieldType]) {
            if (!validators[fieldType](value)) {
              issues.push({
                field: key,
                value,
                reason: `Invalid ${fieldType} format`
              });
              continue;
            }
          }

          // For non-exact matches, try word-level matching with stricter threshold
          const words = valueLower.split(/\s+/).filter(w => w.length > 3);
          const matchedWords = words.filter(word => sourceLower.includes(word));
          const matchRatio = words.length > 0 ? matchedWords.length / words.length : 0;

          // Require at least 80% of words to match (stricter than before)
          foundInSource = matchRatio >= 0.8;
        }

        if (!foundInSource) {
          issues.push({
            field: key,
            value,
            reason: 'Value not found in source text (possible hallucination)'
          });
        }
      }
    }
  }

  return {
    isValid: issues.length === 0,
    issues,
    extracted
  };
}
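
/*
 * Illustrative usage (editor's sketch): a well-formed value that never appears
 * in the source text is flagged as a possible hallucination.
 *
 *   const source = 'Contact Ada Lovelace at ada@example.com.';
 *   verifyExtraction({ name: 'Ada Lovelace', email: 'bob@example.com' }, source);
 *   // => { isValid: false,
 *   //      issues: [{ field: 'email', value: 'bob@example.com',
 *   //                 reason: 'Value not found in source text (possible hallucination)' }],
 *   //      extracted: { ... } }
 */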

/**
 * Clean and normalize extracted values
 */
function normalizeExtracted(extracted) {
  if (typeof extracted !== 'object' || extracted === null) {
    return extracted;
  }

  const normalized = {};
  for (const [key, value] of Object.entries(extracted)) {
    if (typeof value === 'string') {
      // Remove common LLM artifacts
      normalized[key] = value
        .trim()
        .replace(/^["']|["']$/g, '') // Remove quotes
        .replace(/^[-•*]\s*/, '') // Remove list markers
        .trim();
    } else {
      normalized[key] = value;
    }
  }

  return normalized;
}

/**
 * Validate extraction result with multiple checks
 */
function validateExtraction(llmOutput, sourceText, options = {}) {
  const {
    format = 'json',
    fields = [],
    strict = true, // If true, reject if validation fails
  } = options;

  let parsed = llmOutput.trim();

  // Step 1: Parse JSON if needed
  if (format === 'json') {
    const jsonResult = validateJSON(parsed, fields);
    if (!jsonResult.isValid) {
      return {
        isValid: false,
        error: jsonResult.error,
        raw: llmOutput,
        validated: null
      };
    }
    parsed = jsonResult.data;
  }

  // Step 2: Normalize
  const normalized = normalizeExtracted(parsed);

  // Step 3: Verify against source
  const verification = verifyExtraction(normalized, sourceText, fields);

  // Step 4: Return result
  if (strict && !verification.isValid) {
    return {
      isValid: false,
      error: 'Extraction validation failed',
      issues: verification.issues,
      raw: llmOutput,
      validated: null
    };
  }

  return {
    isValid: true,
    raw: llmOutput,
    validated: normalized,
    warnings: verification.issues // Include warnings even if not strict
  };
}
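
/*
 * Illustrative usage (editor's sketch): strict mode turns verification issues
 * into a failed result instead of mere warnings.
 *
 *   validateExtraction('{"email":"bob@example.com"}',
 *                      'Reach Ada at ada@example.com', {
 *     format: 'json',
 *     fields: ['email'],
 *     strict: true,
 *   });
 *   // => { isValid: false, error: 'Extraction validation failed', issues: [...], validated: null }
 */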

/**
 * Extract specific information from text
 * Uses rule-based validation to prevent hallucinations
 * @param {LLMEngine} engine - The LLM engine instance
 * @param {string} text - Text to extract from
 * @param {Object} options - Extraction options
 * @returns {Promise<string>}
 */
async function extract(engine, text, options = {}) {
  const logger = engine.getLogger();

  const {
    what = "key information", // What to extract
    format = "text", // "text", "json", "list"
    fields = [], // Specific fields to extract (for JSON)
    validate = true, // Enable rule-based validation
    strict = false, // If true, throw error on validation failure
  } = options;

  // Log prompt construction
  const originalPrompt = `Extract ${what} from the following text`;
  let prompt = originalPrompt;

  if (format === "json") {
    if (fields.length > 0) {
      prompt += ` in JSON format with these fields: ${fields.join(", ")}`;
    } else {
      prompt += ` in JSON format`;
    }
  } else if (format === "list") {
    prompt += ` as a list`;
  }

  prompt += `:\n\n${text}`;

  logger.logPromptConstruction('extract', originalPrompt, prompt, options);

  // Run LLM extraction
  const llmResult = await engine.run(prompt, { temperature: 0.3 });
  const rawResult = llmResult.trim();

  logger.log('info', 'EXTRACT', 'LLM extraction completed', {
    format,
    fields,
    resultLength: rawResult.length
  });

  // Apply validation if enabled and format is JSON
  if (validate && format === "json") {
    logger.log('info', 'VALIDATION', 'Starting rule-based validation');

    const validation = validateExtraction(rawResult, text, {
      format,
      fields,
      strict
    });

    logger.logValidation('extract', text, validation.validated || rawResult, validation.isValid,
      validation.error ? new Error(validation.error) : null);

    if (!validation.isValid) {
      if (strict) {
        throw new Error(`Extraction validation failed: ${validation.error}`);
      } else {
        logger.log('warn', 'VALIDATION', 'Validation failed but continuing (non-strict mode)', {
          error: validation.error,
          issues: validation.issues
        });
        // Return raw result with warning
        return rawResult;
      }
    }

    // Return validated result
    if (validation.validated) {
      logger.log('info', 'VALIDATION', 'Validation passed, returning validated data');
      return JSON.stringify(validation.validated, null, 2);
    }
  }

  return rawResult;
}
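
/*
 * Illustrative usage (editor's sketch): JSON extraction with validation on and
 * strict off, so hallucinated fields surface as warnings in the logs rather
 * than exceptions.
 *
 *   const out = await extract(engine, 'Reach Ada at ada@example.com', {
 *     what: 'contact details',
 *     format: 'json',
 *     fields: ['name', 'email'],
 *     validate: true,
 *     strict: false,
 *   });
 */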

/**
 * Client-Side LLM Preprocessor
 *
 * A flexible SDK for preprocessing text using local LLM models in the browser.
 * Supports cleaning, extraction, and custom prompts.
 */
class Preprocessor {
  constructor(options = {}) {
    this.engine = new LLMEngine(options);
    this.isModelLoaded = false;
    this.logger = this.engine.getLogger();
  }

  /**
   * Load the WebLLM model
   * @param {string} model - Model name (default: "Llama-3.2-1B-Instruct-q4f16_1-MLC")
   * @returns {Promise<void>}
   */
  async loadModel(model) {
    await this.engine.loadModel(model);
    this.isModelLoaded = true;
    this.logger.log('info', 'PREPROCESSOR', 'Model loaded and ready');
  }

  /**
   * Check if WebGPU is supported in the current environment
   * @returns {Promise<boolean>}
   */
  async checkWebGPU() {
    if (typeof navigator === 'undefined' || !navigator.gpu) {
      return false;
    }
    try {
      const adapter = await navigator.gpu.requestAdapter();
      return !!adapter;
    } catch (e) {
      return false;
    }
  }
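
  /*
   * Illustrative usage (editor's sketch): probe for WebGPU before paying the
   * model-download cost; rule-based clean() and chunk() work either way.
   *
   *   const p = new Preprocessor();
   *   if (await p.checkWebGPU()) {
   *     await p.loadModel();
   *   } else {
   *     console.warn('WebGPU unavailable; LLM features disabled.');
   *   }
   */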

  /**
   * Get the logger instance for accessing internal logs
   * @returns {InternalLogger}
   */
  getLogger() {
    return this.logger;
  }

  /**
   * Enable/disable internal logging
   */
  setLogging(enabled, verbose = false) {
    this.logger.setEnabled(enabled);
    this.logger.setVerbose(verbose);
    this.logger.log('info', 'PREPROCESSOR', `Logging ${enabled ? 'enabled' : 'disabled'}`, { verbose });
  }

  /**
   * Ensure model is loaded
   * @private
   */
  _ensureLoaded() {
    if (!this.isModelLoaded && !this.engine.isLoaded()) {
      throw new Error(
        "Model not loaded. Call loadModel() first before using preprocessing functions."
      );
    }
  }

  /**
   * Clean text
   * Works with or without LLM model loaded
   * Uses rule-based cleaning if model not loaded, LLM if available
   * All options are opt-in (default: false) - user chooses what to remove
   * @param {string} text - Text to clean
   * @param {Object} options - Cleaning options (all optional, default: false)
   * @param {boolean} options.removeHtml - Remove HTML tags (default: false)
   * @param {boolean} options.removeUrls - Remove URLs (default: false)
   * @param {boolean} options.removeExtraWhitespace - Remove extra whitespace (default: false)
   * @param {boolean} options.removeLineBreaks - Remove line breaks (default: false)
   * @param {boolean} options.removeSpecialChars - Remove special characters (default: false)
   * @param {boolean} options.decodeHtmlEntities - Decode HTML entities like &amp; (default: false)
   * @param {string} options.customInstructions - Additional cleaning instructions (requires LLM)
   * @param {boolean} options.useLLM - Force LLM usage (requires model loaded)
   * @returns {Promise<string>}
   *
   * @example
   * // No options - returns text as-is
   * await p.clean(text);
   *
   * // User chooses what to remove
   * await p.clean(text, { removeHtml: true, removeExtraWhitespace: true });
   *
   * // Use LLM for semantic cleaning
   * await p.clean(text, { useLLM: true, customInstructions: "Remove all dates" });
   */
  async clean(text, options = {}) {
    // Don't require the model - cleaning can work without it.
    // Only require it if explicitly using the LLM or custom instructions.
    if (options.useLLM === true || options.customInstructions) {
      this._ensureLoaded();
    }

    return await clean(this.engine, text, options);
  }

  /**
   * Extract information from text
   * @param {string} text - Text to extract from
   * @param {Object} options - Extraction options
   * @returns {Promise<string>}
   */
  async extract(text, options = {}) {
    this._ensureLoaded();
    return await extract(this.engine, text, options);
  }

  /**
   * Chunk text into smaller pieces (non-LLM, fast operation)
   * Works immediately, no model needed
   * @param {string} text - Text to chunk
   * @param {Object} options - Chunking options
   * @returns {string[]}
   */
  chunk(text, options = {}) {
    // No model check needed - chunk is a pure string operation
    return chunk(text, options);
  }

  /**
   * Run a custom prompt on text
   * @param {string} text - Input text
   * @param {string|Object} instruction - Custom instruction or config object
   * @param {Object} options - Generation options
   * @returns {Promise<string>}
   */
  async prompt(text, instruction, options = {}) {
    this._ensureLoaded();

    let promptText;
    let genOptions = { ...options };

    if (typeof instruction === "string") {
      promptText = `${instruction}\n\n${text}`;
    } else if (typeof instruction === "object") {
      // Advanced prompt configuration
      const { instruction: inst, format, temperature, maxTokens } = instruction;

      promptText = inst;
      if (format) {
        if (typeof format === "object") {
          promptText += `\n\nReturn the result in JSON format with these fields: ${JSON.stringify(format)}`;
        } else {
          promptText += `\n\nFormat: ${format}`;
        }
      }
      promptText += `\n\n${text}`;

      if (temperature !== undefined) genOptions.temperature = temperature;
      if (maxTokens !== undefined) genOptions.maxTokens = maxTokens;
    } else {
      throw new Error("Instruction must be a string or object");
    }

    return await this.engine.run(promptText, genOptions);
  }

  /**
   * Enforce correct pipeline ordering
   * Always ensures: clean → extract (if both present)
   * @private
   */
  _enforcePipelineOrder(pipeline) {
    const ordered = [...pipeline];
    const cleanIndex = ordered.findIndex(step =>
      step === "clean" || (typeof step === "object" && step.clean !== undefined)
    );
    const extractIndex = ordered.findIndex(step =>
      step === "extract" || (typeof step === "object" && step.extract !== undefined)
    );

    // If both clean and extract exist, ensure clean comes first
    if (cleanIndex !== -1 && extractIndex !== -1 && cleanIndex > extractIndex) {
      this.logger.log('warn', 'PIPELINE', 'Reordering pipeline: clean must come before extract', {
        originalOrder: ordered.map(s => typeof s === 'string' ? s : Object.keys(s)[0]),
        reordered: true
      });

      // Move clean before extract
      const cleanStep = ordered.splice(cleanIndex, 1)[0];
      const newExtractIndex = ordered.findIndex(step =>
        step === "extract" || (typeof step === "object" && step.extract !== undefined)
      );
      ordered.splice(newExtractIndex, 0, cleanStep);
    }

    return ordered;
  }
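
  /*
   * Illustrative ordering behavior (editor's sketch): extract-before-clean is
   * silently reordered; other steps keep their relative positions.
   *
   *   p._enforcePipelineOrder(['extract', 'clean', { prompt: 'Summarize' }]);
   *   // => ['clean', 'extract', { prompt: 'Summarize' }]
   */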

  /**
   * Process text with multiple operations in a pipeline
   * Automatically enforces correct ordering (clean → extract)
   * @param {string} text - Input text
   * @param {Array} pipeline - Array of operations to apply
   * @returns {Promise<string|string[]>}
   *
   * @example
   * await p.pipeline(text, [
   *   "extract", // Will be reordered to run after clean
   *   "clean",
   *   { prompt: "Rewrite in pirate style" }
   * ]);
   */
  async pipeline(text, pipeline) {
    this._ensureLoaded();

    if (!Array.isArray(pipeline) || pipeline.length === 0) {
      throw new Error("Pipeline must be a non-empty array");
    }

    // Enforce correct ordering
    const orderedPipeline = this._enforcePipelineOrder(pipeline);

    this.logger.log('info', 'PIPELINE', 'Starting pipeline execution', {
      stepCount: orderedPipeline.length,
      steps: orderedPipeline.map(s => typeof s === 'string' ? s : Object.keys(s)[0])
    });

    let result = text;
    const startTime = Date.now();

    for (let i = 0; i < orderedPipeline.length; i++) {
      const step = orderedPipeline[i];
      const stepStartTime = Date.now();
      const stepName = typeof step === 'string' ? step : Object.keys(step)[0] || 'unknown';

      try {
        if (typeof step === "string") {
          // Built-in operation name
          switch (step) {
            case "clean":
              result = await this.clean(result);
              break;
            case "extract":
              result = await this.extract(result);
              break;
            case "chunk":
              result = this.chunk(result);
              break;
            default:
              throw new Error(`Unknown operation: ${step}`);
          }
        } else if (typeof step === "object") {
          // Custom operation with options
          if (step.prompt) {
            result = await this.prompt(result, step.prompt, step.options || {});
          } else if (step.clean) {
            result = await this.clean(result, step.clean);
          } else if (step.extract) {
            result = await this.extract(result, step.extract);
          } else if (step.chunk) {
            result = this.chunk(result, step.chunk);
          } else {
            throw new Error(`Unknown operation object: ${JSON.stringify(step)}`);
          }
        } else {
          throw new Error(`Invalid pipeline step: ${step}`);
        }

        // If chunking was applied, result is now an array
        if (Array.isArray(result)) {
          this.logger.log('info', 'PIPELINE', 'Chunking applied, stopping pipeline', {
            chunks: result.length
          });
          break; // Can't process arrays further
        }

        const stepDuration = Date.now() - stepStartTime;
        this.logger.logPipelineStep(i, stepName, text, result, stepDuration);
      } catch (error) {
        this.logger.logError(`pipeline step ${i + 1} (${stepName})`, error, {
          step,
          inputLength: typeof text === 'string' ? text.length : 'N/A'
        });
        throw error;
      }
    }

    const totalDuration = Date.now() - startTime;
    this.logger.logPerformance('pipeline', {
      totalSteps: orderedPipeline.length,
      duration: `${totalDuration}ms`,
      averageStepTime: `${(totalDuration / orderedPipeline.length).toFixed(2)}ms`
    });

    return result;
  }

  /**
   * Process text with a simple configuration object
   * @param {string} text - Input text
   * @param {Object} config - Processing configuration
   * @returns {Promise<string>}
   *
   * @example
   * await p.process(text, {
   *   clean: true,
   *   extract: { format: "json", fields: ["name", "email"] },
   *   customPrompt: "Convert to bullet points"
   * });
   */
  async process(text, config = {}) {
    this._ensureLoaded();

    let result = text;

    // Apply operations in order
    if (config.clean) {
      result = await this.clean(
        result,
        typeof config.clean === "object" ? config.clean : {}
      );
    }

    if (config.extract) {
      result = await this.extract(
        result,
        typeof config.extract === "object" ? config.extract : {}
      );
    }

    if (config.customPrompt) {
      result = await this.prompt(result, config.customPrompt, config.promptOptions || {});
    }

    if (config.chunk) {
      result = this.chunk(
        result,
        typeof config.chunk === "object" ? config.chunk : {}
      );
    }

    return result;
  }
}

export { LLMEngine, Preprocessor, chunk, clean, cleanWithRules, extract };