client-llm-preprocessor 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1319 @@
+ import * as webllm from '@mlc-ai/web-llm';
+
+ /**
+  * Advanced Internal LLM Logging System
+  *
+  * Captures detailed insights into LLM processing:
+  * - Token-by-token generation (if streaming available)
+  * - Prompt construction steps
+  * - Intermediate processing states
+  * - Validation steps
+  * - Performance metrics
+  */
+
+ class InternalLogger {
+   constructor(options = {}) {
+     this.enabled = options.enabled !== false; // Default: enabled
+     this.verbose = options.verbose || false;
+     this.logLevel = options.logLevel || 'info'; // 'debug', 'info', 'warn', 'error'
+     this.logs = [];
+     this.maxLogs = options.maxLogs || 1000;
+     this.onLogCallback = options.onLogCallback || null;
+   }
+
+   /**
+    * Enable or disable logging
+    */
+   setEnabled(enabled) {
+     this.enabled = enabled;
+   }
+
+   /**
+    * Set verbosity level
+    */
+   setVerbose(verbose) {
+     this.verbose = verbose;
+   }
+
+   /**
+    * Log an event with metadata
+    */
+   log(level, category, message, data = {}) {
+     if (!this.enabled) return;
+
+     const logEntry = {
+       timestamp: new Date().toISOString(),
+       level,
+       category,
+       message,
+       data,
+       stackTrace: this.verbose ? new Error().stack : undefined
+     };
+
+     // Store log
+     this.logs.push(logEntry);
+     if (this.logs.length > this.maxLogs) {
+       this.logs.shift(); // Remove oldest
+     }
+
+     // Console output based on level
+     if (this.shouldLog(level)) {
+       const prefix = `[${level.toUpperCase()}] [${category}]`;
+       console.log(`${prefix} ${message}`, data);
+     }
+
+     // Callback for external handlers
+     if (this.onLogCallback) {
+       this.onLogCallback(logEntry);
+     }
+   }
+
+   /**
+    * Check if log level should be output
+    */
+   shouldLog(level) {
+     const levels = { debug: 0, info: 1, warn: 2, error: 3 };
+     return levels[level] >= levels[this.logLevel];
+   }
+
+   /**
+    * Log prompt construction
+    */
+   logPromptConstruction(operation, originalPrompt, finalPrompt, options = {}) {
+     this.log('debug', 'PROMPT', 'Constructing prompt', {
+       operation,
+       originalLength: originalPrompt.length,
+       finalLength: finalPrompt.length,
+       options,
+       promptPreview: finalPrompt.substring(0, 200) + '...'
+     });
+   }
+
+   /**
+    * Log token generation (if streaming available)
+    */
+   logTokenGeneration(token, cumulativeText, tokenIndex) {
+     if (this.verbose) {
+       this.log('debug', 'TOKEN', `Generated token ${tokenIndex}`, {
+         token,
+         cumulativeLength: cumulativeText.length,
+         tokenIndex
+       });
+     }
+   }
+
+   /**
+    * Log LLM inference start
+    */
+   logInferenceStart(prompt, options) {
+     this.log('info', 'INFERENCE', 'Starting LLM inference', {
+       promptLength: prompt.length,
+       temperature: options.temperature,
+       maxTokens: options.maxTokens,
+       promptPreview: prompt.substring(0, 100) + '...'
+     });
+   }
+
+   /**
+    * Log LLM inference completion
+    */
+   logInferenceComplete(response, duration, tokenCount) {
+     this.log('info', 'INFERENCE', 'LLM inference completed', {
+       responseLength: response.length,
+       duration: `${duration}ms`,
+       estimatedTokens: tokenCount,
+       responsePreview: response.substring(0, 200) + '...'
+     });
+   }
+
+   /**
+    * Log validation step
+    */
+   logValidation(step, input, output, isValid, error = null) {
+     this.log(isValid ? 'info' : 'warn', 'VALIDATION', `Validation: ${step}`, {
+       inputPreview: typeof input === 'string' ? input.substring(0, 100) : input,
+       outputPreview: typeof output === 'string' ? output.substring(0, 100) : output,
+       isValid,
+       error: error?.message
+     });
+   }
+
+   /**
+    * Log pipeline step
+    */
+   logPipelineStep(stepIndex, stepName, input, output, duration) {
+     this.log('info', 'PIPELINE', `Pipeline step ${stepIndex + 1}: ${stepName}`, {
+       inputLength: typeof input === 'string' ? input.length : 'N/A',
+       outputLength: typeof output === 'string' ? output.length : 'N/A',
+       duration: `${duration}ms`,
+       inputPreview: typeof input === 'string' ? input.substring(0, 50) : input,
+       outputPreview: typeof output === 'string' ? output.substring(0, 50) : output
+     });
+   }
+
+   /**
+    * Log error with context
+    */
+   logError(operation, error, context = {}) {
+     this.log('error', 'ERROR', `Error in ${operation}`, {
+       error: error.message,
+       stack: error.stack,
+       context
+     });
+   }
+
+   /**
+    * Log performance metrics
+    */
+   logPerformance(operation, metrics) {
+     this.log('info', 'PERFORMANCE', `Performance: ${operation}`, metrics);
+   }
+
+   /**
+    * Get all logs
+    */
+   getLogs(filter = {}) {
+     let filtered = [...this.logs];
+
+     if (filter.level) {
+       filtered = filtered.filter(log => log.level === filter.level);
+     }
+
+     if (filter.category) {
+       filtered = filtered.filter(log => log.category === filter.category);
+     }
+
+     if (filter.since) {
+       const sinceTime = new Date(filter.since).getTime();
+       filtered = filtered.filter(log => new Date(log.timestamp).getTime() >= sinceTime);
+     }
+
+     return filtered;
+   }
+
+   /**
+    * Get logs as formatted string
+    */
+   getLogsAsString(filter = {}) {
+     const logs = this.getLogs(filter);
+     return logs.map(log => {
+       return `[${log.timestamp}] [${log.level}] [${log.category}] ${log.message}`;
+     }).join('\n');
+   }
+
+   /**
+    * Clear logs
+    */
+   clear() {
+     this.logs = [];
+   }
+
+   /**
+    * Export logs as JSON
+    */
+   exportLogs() {
+     return JSON.stringify(this.logs, null, 2);
+   }
+
+   /**
+    * Get summary statistics
+    */
+   getStats() {
+     const stats = {
+       totalLogs: this.logs.length,
+       byLevel: {},
+       byCategory: {},
+       errors: 0,
+       warnings: 0,
+       timeRange: {
+         start: this.logs[0]?.timestamp,
+         end: this.logs[this.logs.length - 1]?.timestamp
+       }
+     };
+
+     this.logs.forEach(log => {
+       stats.byLevel[log.level] = (stats.byLevel[log.level] || 0) + 1;
+       stats.byCategory[log.category] = (stats.byCategory[log.category] || 0) + 1;
+       if (log.level === 'error') stats.errors++;
+       if (log.level === 'warn') stats.warnings++;
+     });
+
+     return stats;
+   }
+ }
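+
+ // Illustrative usage sketch (not executed; the category and data values are
+ // hypothetical):
+ // const logger = new InternalLogger({ logLevel: 'debug' });
+ // logger.log('info', 'DEMO', 'Processing started', { docId: 42 });
+ // logger.getLogs({ category: 'DEMO' }); // -> [{ timestamp, level: 'info', ... }]
+ // logger.getStats().byCategory;         // -> { DEMO: 1 }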
+
+ // Singleton instance
+ let defaultLogger = null;
+
+ /**
+  * Get or create default logger
+  */
+ function getLogger(options = {}) {
+   if (!defaultLogger) {
+     defaultLogger = new InternalLogger(options);
+   }
+   return defaultLogger;
+ }
+
+ /**
+  * WebLLM Engine Wrapper
+  * Handles model loading and inference with detailed internal logging
+  */
+ class LLMEngine {
+   constructor(options = {}) {
+     this.engine = null;
+     this.model = null;
+     this.logger = options.logger || getLogger(options.loggerOptions);
+     this.streamingEnabled = options.streaming !== false; // Try streaming by default
+   }
+
+   /**
+    * Load a WebLLM model
+    * @param {string} model - Model name (default: "Llama-3.2-1B-Instruct-q4f16_1-MLC")
+    * @returns {Promise<void>}
+    */
+   async loadModel(model = "Llama-3.2-1B-Instruct-q4f16_1-MLC") {
+     if (this.engine && this.model === model) {
+       this.logger.log('info', 'MODEL', 'Model already loaded, skipping');
+       return;
+     }
+
+     const startTime = Date.now();
+     this.logger.log('info', 'MODEL', `Loading model: ${model}`, { model });
+
+     try {
+       this.engine = await webllm.CreateMLCEngine(model, {
+         initProgressCallback: (report) => {
+           if (report.progress) {
+             const progress = (report.progress * 100).toFixed(1);
+             this.logger.log('info', 'MODEL', `Loading progress: ${progress}%`, {
+               progress: parseFloat(progress),
+               report
+             });
+           }
+         },
+       });
+
+       this.model = model;
+       const loadTime = Date.now() - startTime;
+       this.logger.log('info', 'MODEL', 'Model loaded successfully', {
+         model,
+         loadTime: `${loadTime}ms`
+       });
+     } catch (error) {
+       this.logger.logError('loadModel', error, { model });
+       throw new Error(`Failed to load model: ${error.message}`);
+     }
+   }
+
+   /**
+    * Run inference with the loaded model
+    * Captures detailed internal state including token-by-token generation
+    * @param {string} prompt - The prompt to send to the model
+    * @param {Object} options - Generation options
+    * @returns {Promise<string>}
+    */
+   async run(prompt, options = {}) {
+     if (!this.engine) {
+       throw new Error("Model not loaded. Call loadModel() first.");
+     }
+
+     const {
+       temperature = 0.7,
+       maxTokens = 512,
+       stopSequences = [],
+       stream = this.streamingEnabled, // Try streaming for token-by-token logging
+     } = options;
+
+     const startTime = Date.now();
+     this.logger.logInferenceStart(prompt, { temperature, maxTokens, stopSequences, stream });
+
+     try {
+       let fullResponse = '';
+       let tokenCount = 0;
+       const tokens = [];
+
+       // Try streaming first for token-by-token visibility
+       if (stream && this.engine.chat?.completions?.create) {
+         this.logger.log('info', 'INFERENCE', 'Using streaming mode for token-by-token logging');
+
+         try {
+           // web-llm exposes an OpenAI-style API: passing stream: true
+           // makes create() return an async iterable of completion chunks
+           const completionStream = await this.engine.chat.completions.create({
+             stream: true,
+             messages: [{ role: "user", content: prompt }],
+             temperature,
+             max_tokens: maxTokens,
+             stop: stopSequences.length > 0 ? stopSequences : undefined,
+           });
+
+           // Capture each token as it's generated
+           for await (const chunk of completionStream) {
+             const delta = chunk.choices?.[0]?.delta?.content || '';
+             if (delta) {
+               fullResponse += delta;
+               tokenCount++;
+               tokens.push(delta);
+
+               // Log each token (if verbose)
+               this.logger.logTokenGeneration(delta, fullResponse, tokenCount);
+             }
+           }
+
+           this.logger.log('info', 'INFERENCE', 'Streaming completed', {
+             totalTokens: tokenCount,
+             responseLength: fullResponse.length
+           });
+         } catch (streamError) {
+           // Fallback to non-streaming if streaming fails
+           this.logger.log('warn', 'INFERENCE', 'Streaming failed, falling back to non-streaming', {
+             error: streamError.message
+           });
+           return await this.runNonStreaming(prompt, options, startTime);
+         }
+       } else {
+         // Non-streaming mode
+         return await this.runNonStreaming(prompt, options, startTime);
+       }
+
+       const duration = Date.now() - startTime;
+       this.logger.logInferenceComplete(fullResponse, duration, tokenCount);
+
+       // Log token sequence for analysis
+       this.logger.log('debug', 'INFERENCE', 'Token sequence captured', {
+         tokenCount,
+         tokens: tokens.slice(0, 20), // First 20 tokens
+         fullSequenceLength: tokens.length
+       });
+
+       return fullResponse;
+     } catch (error) {
+       const duration = Date.now() - startTime;
+       this.logger.logError('run', error, {
+         promptLength: prompt.length,
+         duration: `${duration}ms`,
+         options
+       });
+       throw new Error(`Inference failed: ${error.message}`);
+     }
+   }
+
+   /**
+    * Non-streaming inference (fallback)
+    * @private
+    */
+   async runNonStreaming(prompt, options, startTime) {
+     const {
+       temperature = 0.7,
+       maxTokens = 512,
+       stopSequences = [],
+     } = options;
+
+     this.logger.log('info', 'INFERENCE', 'Using non-streaming mode');
+
+     const response = await this.engine.chat.completions.create({
+       messages: [{ role: "user", content: prompt }],
+       temperature,
+       max_tokens: maxTokens,
+       stop: stopSequences.length > 0 ? stopSequences : undefined,
+     });
+
+     const result = response.choices[0].message.content;
+     const duration = Date.now() - (startTime || Date.now());
+     const estimatedTokens = Math.ceil(result.length / 4); // Rough estimate: ~4 chars per token
+
+     this.logger.logInferenceComplete(result, duration, estimatedTokens);
+
+     return result;
+   }
+
+   /**
+    * Check if model is loaded
+    * @returns {boolean}
+    */
+   isLoaded() {
+     return this.engine !== null;
+   }
+
+   /**
+    * Get the logger instance
+    * @returns {InternalLogger}
+    */
+   getLogger() {
+     return this.logger;
+   }
+
+   /**
+    * Enable/disable streaming for token-by-token logging
+    */
+   setStreaming(enabled) {
+     this.streamingEnabled = enabled;
+     this.logger.log('info', 'ENGINE', `Streaming ${enabled ? 'enabled' : 'disabled'}`);
+   }
+ }
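+
+ // Illustrative usage sketch (assumes a WebGPU-capable browser; the model
+ // weights are downloaded on first load):
+ // const engine = new LLMEngine({ streaming: true });
+ // await engine.loadModel(); // default Llama-3.2-1B model
+ // const answer = await engine.run('Say hi in five words.', { maxTokens: 32 });
+ // engine.getLogger().getLogsAsString({ category: 'INFERENCE' });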
+
+ /**
+  * Non-LLM cleaning using rules and regex
+  * Fast, deterministic, works without model
+  * All options are opt-in (default: false) - user chooses what to remove
+  */
+
+ /**
+  * Clean text using rule-based approach (no LLM)
+  * @param {string} text - Text to clean
+  * @param {Object} options - Cleaning options (all optional, default: false)
+  * @param {boolean} options.removeHtml - Remove HTML tags (default: false)
+  * @param {boolean} options.removeUrls - Remove URLs (default: false)
+  * @param {boolean} options.removeExtraWhitespace - Remove extra whitespace (default: false)
+  * @param {boolean} options.removeLineBreaks - Remove line breaks (default: false)
+  * @param {boolean} options.removeSpecialChars - Remove special characters (default: false)
+  * @param {boolean} options.decodeHtmlEntities - Decode HTML entities like &amp; (default: false)
+  * @returns {string}
+  */
+ function cleanWithRules(text, options = {}) {
+   const {
+     removeHtml = false,
+     removeUrls = false,
+     removeExtraWhitespace = false,
+     removeLineBreaks = false,
+     removeSpecialChars = false,
+     decodeHtmlEntities = false,
+   } = options;
+
+   let cleaned = text;
+
+   // Decode HTML entities (if requested)
+   if (decodeHtmlEntities) {
+     cleaned = cleaned
+       .replace(/&nbsp;/g, ' ')
+       .replace(/&amp;/g, '&')
+       .replace(/&lt;/g, '<')
+       .replace(/&gt;/g, '>')
+       .replace(/&quot;/g, '"')
+       .replace(/&#39;/g, "'")
+       .replace(/&#x27;/g, "'")
+       .replace(/&#x2F;/g, '/');
+   }
+
+   // Remove HTML tags (if requested)
+   if (removeHtml) {
+     cleaned = cleaned.replace(/<[^>]+>/g, '');
+   }
+
+   // Remove URLs (if requested)
+   if (removeUrls) {
+     cleaned = cleaned.replace(/https?:\/\/[^\s]+/g, '');
+   }
+
+   // Remove line breaks (if requested)
+   if (removeLineBreaks) {
+     cleaned = cleaned.replace(/[\r\n]+/g, ' ');
+   }
+
+   // Remove extra whitespace (if requested)
+   if (removeExtraWhitespace) {
+     // Replace multiple spaces with single space
+     cleaned = cleaned.replace(/[ \t]+/g, ' ');
+     if (!removeLineBreaks) {
+       // Remove leading/trailing whitespace from each line
+       cleaned = cleaned.split('\n').map(line => line.trim()).join('\n');
+       // Remove multiple newlines
+       cleaned = cleaned.replace(/\n{3,}/g, '\n\n');
+     }
+     // Trim overall
+     cleaned = cleaned.trim();
+   }
+
+   // Remove special characters (if requested)
+   if (removeSpecialChars) {
+     // Keep alphanumeric, spaces, and basic punctuation
+     cleaned = cleaned.replace(/[^\w\s.,!?;:()\-'"]/g, '');
+   }
+
+   return cleaned;
+ }
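+
+ // Example (rule-based path; synchronous, no model required). Note that entity
+ // decoding runs before tag stripping, so decoded markup is also removed:
+ // cleanWithRules('<p>Fish &amp; chips</p>', {
+ //   decodeHtmlEntities: true,
+ //   removeHtml: true,
+ //   removeExtraWhitespace: true,
+ // });
+ // -> 'Fish & chips'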
+
+ /**
+  * Clean text by removing noise, HTML, and irrelevant content
+  * Uses LLM if available, falls back to rule-based cleaning if not
+  * All options are opt-in (default: false) - user chooses what to remove
+  * @param {LLMEngine|null} engine - The LLM engine instance (can be null)
+  * @param {string} text - Text to clean
+  * @param {Object} options - Cleaning options (all optional, default: false)
+  * @param {boolean} options.removeHtml - Remove HTML tags (default: false)
+  * @param {boolean} options.removeUrls - Remove URLs (default: false)
+  * @param {boolean} options.removeExtraWhitespace - Remove extra whitespace (default: false)
+  * @param {boolean} options.removeLineBreaks - Remove line breaks (default: false)
+  * @param {boolean} options.removeSpecialChars - Remove special characters (default: false)
+  * @param {boolean} options.decodeHtmlEntities - Decode HTML entities like &amp; (default: false)
+  * @param {string} options.customInstructions - Additional cleaning instructions (requires LLM)
+  * @param {boolean} options.useLLM - Force LLM usage if model is loaded (default: auto-detect)
+  * @returns {Promise<string>}
+  */
+ async function clean(engine, text, options = {}) {
+   const {
+     removeHtml = false,
+     removeUrls = false,
+     removeExtraWhitespace = false,
+     removeLineBreaks = false,
+     removeSpecialChars = false,
+     decodeHtmlEntities = false,
+     customInstructions = "",
+     useLLM = null, // null = auto-detect, true = force LLM, false = force rules
+   } = options;
+
+   // Check if we should use LLM
+   const shouldUseLLM = useLLM !== false &&
+     engine !== null &&
+     engine.isLoaded() &&
+     (useLLM === true || customInstructions !== "");
+
+   if (!shouldUseLLM) {
+     // Use fast rule-based cleaning (no LLM needed)
+     const logger = engine?.getLogger();
+     if (logger) {
+       logger.log('info', 'CLEAN', 'Using rule-based cleaning (no LLM)', {
+         reason: !engine ? 'No engine' : !engine.isLoaded() ? 'Model not loaded' : 'useLLM=false',
+         options: { removeHtml, removeUrls, removeExtraWhitespace, removeLineBreaks, removeSpecialChars, decodeHtmlEntities }
+       });
+     }
+
+     return cleanWithRules(text, {
+       removeHtml,
+       removeUrls,
+       removeExtraWhitespace,
+       removeLineBreaks,
+       removeSpecialChars,
+       decodeHtmlEntities
+     });
+   }
+
+   // Use LLM for semantic cleaning (especially if customInstructions provided)
+   const logger = engine.getLogger();
+
+   // Build prompt based on user's selections
+   const cleaningSteps = [];
+
+   if (removeHtml) cleaningSteps.push('HTML tags');
+   if (removeUrls) cleaningSteps.push('URLs');
+   if (removeExtraWhitespace) cleaningSteps.push('extra whitespace');
+   if (removeLineBreaks) cleaningSteps.push('line breaks');
+   if (removeSpecialChars) cleaningSteps.push('special characters');
+   if (decodeHtmlEntities) cleaningSteps.push('decode HTML entities');
+
+   let originalPrompt = `Clean the following text`;
+   let prompt = originalPrompt;
+
+   if (cleaningSteps.length > 0) {
+     prompt += ` by removing: ${cleaningSteps.join(', ')}`;
+   } else if (!customInstructions) {
+     // If no options selected and no custom instructions, just return text
+     return text;
+   }
+
+   // Add instruction to preserve meaning
+   prompt += `. IMPORTANT: Do NOT modify the meaning or remove important information. Only remove what was requested.`;
+
+   if (customInstructions) {
+     prompt += ` Also: ${customInstructions}`;
+   }
+
+   prompt += `:\n\n${text}`;
+
+   logger.logPromptConstruction('clean', originalPrompt, prompt, options);
+   logger.log('info', 'CLEAN', 'Using LLM-based cleaning');
+
+   const result = await engine.run(prompt, { temperature: 0.3 });
+   const cleaned = result.trim();
+
+   logger.log('info', 'CLEAN', 'LLM cleaning completed', {
+     originalLength: text.length,
+     finalLength: cleaned.length
+   });
+
+   return cleaned;
+ }
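+
+ // Illustrative sketch: with engine = null the call resolves via the rule-based
+ // path, so no model is needed; customInstructions forces the LLM path instead:
+ // await clean(null, rawHtml, { removeHtml: true, removeExtraWhitespace: true });
+ // await clean(engine, rawHtml, { customInstructions: 'Remove boilerplate footers' });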
+
+ /**
+  * Chunk text into smaller pieces
+  * @param {string} text - Text to chunk
+  * @param {Object} options - Chunking options
+  * @returns {string[]}
+  */
+ function chunk(text, options = {}) {
+   const {
+     size = 500, // Character count per chunk
+     overlap = 0, // Overlap between chunks (in characters)
+     strategy = "character", // "character", "sentence", "word"
+   } = options;
+
+   if (!text || text.length === 0) {
+     return [];
+   }
+
+   const chunks = [];
+
+   if (strategy === "character") {
+     // Guard against a non-advancing loop when overlap >= size
+     const step = Math.max(1, size - overlap);
+     for (let i = 0; i < text.length; i += step) {
+       chunks.push(text.slice(i, i + size));
+     }
+   } else if (strategy === "sentence") {
+     const sentences = text.match(/[^.!?]+[.!?]+/g) || [text];
+     let currentChunk = "";
+
+     for (const sentence of sentences) {
+       if (currentChunk.length + sentence.length > size && currentChunk) {
+         chunks.push(currentChunk.trim());
+         currentChunk = sentence;
+       } else {
+         currentChunk += sentence;
+       }
+     }
+
+     if (currentChunk.trim()) {
+       chunks.push(currentChunk.trim());
+     }
+   } else if (strategy === "word") {
+     const words = text.split(/\s+/);
+     let currentChunk = [];
+     let currentSize = 0;
+
+     for (const word of words) {
+       if (currentSize + word.length > size && currentChunk.length > 0) {
+         chunks.push(currentChunk.join(" "));
+         currentChunk = [word];
+         currentSize = word.length;
+       } else {
+         currentChunk.push(word);
+         currentSize += word.length + 1; // +1 for space
+       }
+     }
+
+     if (currentChunk.length > 0) {
+       chunks.push(currentChunk.join(" "));
+     }
+   }
+
+   return chunks.filter(ch => ch.length > 0);
+ }
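+
+ // Example: with size 6 and overlap 2 the window advances 4 characters at a
+ // time, so consecutive chunks share a 2-character seam:
+ // chunk('abcdefghij', { size: 6, overlap: 2, strategy: 'character' });
+ // -> ['abcdef', 'efghij', 'ij']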
+
+ /**
+  * Rule-based validation utilities
+  * Prevents hallucinations by validating LLM output against source text
+  */
+
+ /**
+  * Validate JSON structure and parse safely
+  */
+ function validateJSON(text, expectedFields = []) {
+   try {
+     // Strip markdown code blocks if present (e.g., ```json ... ```)
+     const jsonMatch = text.match(/```(?:json)?\s*([\s\S]*?)\s*```/);
+     const cleanText = jsonMatch ? jsonMatch[1] : text;
+     const parsed = JSON.parse(cleanText);
+
+     // If fields specified, check they exist
+     if (expectedFields.length > 0) {
+       const missingFields = expectedFields.filter(field => !(field in parsed));
+       if (missingFields.length > 0) {
+         return {
+           isValid: false,
+           error: `Missing required fields: ${missingFields.join(', ')}`,
+           data: parsed
+         };
+       }
+     }
+
+     return {
+       isValid: true,
+       data: parsed
+     };
+   } catch (error) {
+     return {
+       isValid: false,
+       error: `Invalid JSON: ${error.message}`,
+       data: null
+     };
+   }
+ }
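+
+ // Example: validateJSON tolerates fenced output, which small models often emit:
+ // validateJSON('```json\n{"name": "Ada"}\n```', ['name', 'email']);
+ // -> { isValid: false, error: 'Missing required fields: email', data: { name: 'Ada' } }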
+
+ /**
+  * Format-specific validators
+  */
+ const validators = {
+   email: (value) => /^[^\s@]+@[^\s@]+\.[^\s@]+$/.test(value),
+   phone: (value) => /^[\d\s\-\+\(\)]+$/.test(value) && value.replace(/\D/g, '').length >= 7,
+   url: (value) => /^https?:\/\/.+/.test(value),
+ };
+
+ /**
+  * Verify extracted data exists in source text
+  * Prevents hallucination by checking if extracted values appear in the original text
+  * Uses exact matching for structured fields plus format validation
+  */
+ function verifyExtraction(extracted, sourceText, fields = []) {
+   const sourceLower = sourceText.toLowerCase();
+   const issues = [];
+
+   if (typeof extracted === 'object' && extracted !== null) {
+     // Check each field
+     for (const [key, value] of Object.entries(extracted)) {
+       if (fields.length > 0 && !fields.includes(key)) {
+         continue; // Skip fields not in expected list
+       }
+
+       if (value && typeof value === 'string' && value.trim().length > 0) {
+         const valueLower = value.toLowerCase();
+
+         // First, try exact substring match (case-insensitive)
+         let foundInSource = sourceLower.includes(valueLower);
+
+         // If not found, try format-specific validation
+         if (!foundInSource) {
+           // Check if it matches the expected format
+           const fieldType = key.toLowerCase();
+           if (validators[fieldType]) {
+             if (!validators[fieldType](value)) {
+               issues.push({
+                 field: key,
+                 value,
+                 reason: `Invalid ${fieldType} format`
+               });
+               continue;
+             }
+           }
+
+           // For non-exact matches, try word-level matching with a strict threshold
+           const words = valueLower.split(/\s+/).filter(w => w.length > 3);
+           const matchedWords = words.filter(word => sourceLower.includes(word));
+           const matchRatio = words.length > 0 ? matchedWords.length / words.length : 0;
+
+           // Require at least 80% of significant words to match
+           foundInSource = matchRatio >= 0.8;
+         }
+
+         if (!foundInSource) {
+           issues.push({
+             field: key,
+             value,
+             reason: 'Value not found in source text (possible hallucination)'
+           });
+         }
+       }
+     }
+   }
+
+   return {
+     isValid: issues.length === 0,
+     issues,
+     extracted
+   };
+ }
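+
+ // Example: a value that never appears in the source is flagged as a possible
+ // hallucination (a key named 'email', 'phone', or 'url' would additionally
+ // route through its format validator):
+ // verifyExtraction({ name: 'Bob' }, 'Alice wrote this note.');
+ // -> { isValid: false,
+ //      issues: [{ field: 'name', value: 'Bob',
+ //                 reason: 'Value not found in source text (possible hallucination)' }],
+ //      extracted: { name: 'Bob' } }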
+
+ /**
+  * Clean and normalize extracted values
+  */
+ function normalizeExtracted(extracted) {
+   if (typeof extracted !== 'object' || extracted === null) {
+     return extracted;
+   }
+
+   const normalized = {};
+   for (const [key, value] of Object.entries(extracted)) {
+     if (typeof value === 'string') {
+       // Remove common LLM artifacts
+       normalized[key] = value
+         .trim()
+         .replace(/^["']|["']$/g, '') // Remove quotes
+         .replace(/^[-•*]\s*/, '') // Remove list markers
+         .trim();
+     } else {
+       normalized[key] = value;
+     }
+   }
+
+   return normalized;
+ }
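+
+ // Example: strips wrapping quotes and list markers that small models tend to add:
+ // normalizeExtracted({ name: '"Ada Lovelace"', role: '- mathematician' });
+ // -> { name: 'Ada Lovelace', role: 'mathematician' }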
+
+ /**
+  * Validate extraction result with multiple checks
+  */
+ function validateExtraction(llmOutput, sourceText, options = {}) {
+   const {
+     format = 'json',
+     fields = [],
+     strict = true, // If true, reject if validation fails
+   } = options;
+
+   let parsed = llmOutput.trim();
+
+   // Step 1: Parse JSON if needed
+   if (format === 'json') {
+     const jsonResult = validateJSON(parsed, fields);
+     if (!jsonResult.isValid) {
+       return {
+         isValid: false,
+         error: jsonResult.error,
+         raw: llmOutput,
+         validated: null
+       };
+     }
+     parsed = jsonResult.data;
+   }
+
+   // Step 2: Normalize
+   const normalized = normalizeExtracted(parsed);
+
+   // Step 3: Verify against source
+   const verification = verifyExtraction(normalized, sourceText, fields);
+
+   // Step 4: Return result
+   if (strict && !verification.isValid) {
+     return {
+       isValid: false,
+       error: 'Extraction validation failed',
+       issues: verification.issues,
+       raw: llmOutput,
+       validated: null
+     };
+   }
+
+   return {
+     isValid: true,
+     raw: llmOutput,
+     validated: normalized,
+     warnings: verification.issues // Include warnings even if not strict
+   };
+ }
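+
+ // Illustrative sketch of the full pipeline: parse -> normalize -> verify.
+ // With strict: false, mismatches come back as warnings instead of a rejection:
+ // validateExtraction('{"email": "ada@example.com"}',
+ //   'Contact Ada at ada@example.com.', { fields: ['email'], strict: false });
+ // -> { isValid: true, raw: '{"email": "ada@example.com"}',
+ //      validated: { email: 'ada@example.com' }, warnings: [] }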
+
+ /**
+  * Extract specific information from text
+  * Uses rule-based validation to prevent hallucinations
+  * @param {LLMEngine} engine - The LLM engine instance
+  * @param {string} text - Text to extract from
+  * @param {Object} options - Extraction options
+  * @returns {Promise<string>}
+  */
+ async function extract(engine, text, options = {}) {
+   const logger = engine.getLogger();
+
+   const {
+     what = "key information", // What to extract
+     format = "text", // "text", "json", "list"
+     fields = [], // Specific fields to extract (for JSON)
+     validate = true, // Enable rule-based validation
+     strict = false, // If true, throw error on validation failure
+   } = options;
+
+   // Log prompt construction
+   const originalPrompt = `Extract ${what} from the following text`;
+   let prompt = originalPrompt;
+
+   if (format === "json") {
+     if (fields.length > 0) {
+       prompt += ` in JSON format with these fields: ${fields.join(", ")}`;
+     } else {
+       prompt += ` in JSON format`;
+     }
+   } else if (format === "list") {
+     prompt += ` as a list`;
+   }
+
+   prompt += `:\n\n${text}`;
+
+   logger.logPromptConstruction('extract', originalPrompt, prompt, options);
+
+   // Run LLM extraction
+   const llmResult = await engine.run(prompt, { temperature: 0.3 });
+   const rawResult = llmResult.trim();
+
+   logger.log('info', 'EXTRACT', 'LLM extraction completed', {
+     format,
+     fields,
+     resultLength: rawResult.length
+   });
+
+   // Apply validation if enabled and format is JSON
+   if (validate && format === "json") {
+     logger.log('info', 'VALIDATION', 'Starting rule-based validation');
+
+     const validation = validateExtraction(rawResult, text, {
+       format,
+       fields,
+       strict
+     });
+
+     logger.logValidation('extract', text, validation.validated || rawResult, validation.isValid,
+       validation.error ? new Error(validation.error) : null);
+
+     if (!validation.isValid) {
+       if (strict) {
+         throw new Error(`Extraction validation failed: ${validation.error}`);
+       } else {
+         logger.log('warn', 'VALIDATION', 'Validation failed but continuing (non-strict mode)', {
+           error: validation.error,
+           issues: validation.issues
+         });
+         // Return raw result with warning
+         return rawResult;
+       }
+     }
+
+     // Return validated result
+     if (validation.validated) {
+       logger.log('info', 'VALIDATION', 'Validation passed, returning validated data');
+       return JSON.stringify(validation.validated, null, 2);
+     }
+   }
+
+   return rawResult;
+ }
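+
+ // Illustrative usage sketch (assumes a loaded engine; the field names are
+ // hypothetical and the exact output depends on the model):
+ // const card = 'Jane Doe - jane@acme.io - +1 555 0100';
+ // const json = await extract(engine, card, {
+ //   what: 'contact details',
+ //   format: 'json',
+ //   fields: ['name', 'email', 'phone'],
+ // });
+ // -> a validated JSON string such as '{ "name": "Jane Doe", "email": ... }'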
+
+ /**
+  * Client-Side LLM Preprocessor
+  *
+  * A flexible SDK for preprocessing text using local LLM models in the browser.
+  * Supports cleaning, extraction, and custom prompts.
+  */
+ class Preprocessor {
+   constructor(options = {}) {
+     this.engine = new LLMEngine(options);
+     this.isModelLoaded = false;
+     this.logger = this.engine.getLogger();
+   }
+
+   /**
+    * Load the WebLLM model
+    * @param {string} model - Model name (default: "Llama-3.2-1B-Instruct-q4f16_1-MLC")
+    * @returns {Promise<void>}
+    */
+   async loadModel(model) {
+     await this.engine.loadModel(model);
+     this.isModelLoaded = true;
+     this.logger.log('info', 'PREPROCESSOR', 'Model loaded and ready');
+   }
+
+   /**
+    * Check if WebGPU is supported in the current environment
+    * @returns {Promise<boolean>}
+    */
+   async checkWebGPU() {
+     if (typeof navigator === 'undefined' || !navigator.gpu) {
+       return false;
+     }
+     try {
+       const adapter = await navigator.gpu.requestAdapter();
+       return !!adapter;
+     } catch (e) {
+       return false;
+     }
+   }
+
+   /**
+    * Get the logger instance for accessing internal logs
+    * @returns {InternalLogger}
+    */
+   getLogger() {
+     return this.logger;
+   }
+
+   /**
+    * Enable/disable internal logging
+    */
+   setLogging(enabled, verbose = false) {
+     this.logger.setEnabled(enabled);
+     this.logger.setVerbose(verbose);
+     this.logger.log('info', 'PREPROCESSOR', `Logging ${enabled ? 'enabled' : 'disabled'}`, { verbose });
+   }
+
+   /**
+    * Ensure model is loaded
+    * @private
+    */
+   _ensureLoaded() {
+     if (!this.isModelLoaded && !this.engine.isLoaded()) {
+       throw new Error(
+         "Model not loaded. Call loadModel() first before using preprocessing functions."
+       );
+     }
+   }
+
+   /**
+    * Clean text
+    * Works with or without an LLM model loaded:
+    * uses rule-based cleaning if no model is loaded, the LLM if available.
+    * All options are opt-in (default: false) - user chooses what to remove
+    * @param {string} text - Text to clean
+    * @param {Object} options - Cleaning options (all optional, default: false)
+    * @param {boolean} options.removeHtml - Remove HTML tags (default: false)
+    * @param {boolean} options.removeUrls - Remove URLs (default: false)
+    * @param {boolean} options.removeExtraWhitespace - Remove extra whitespace (default: false)
+    * @param {boolean} options.removeLineBreaks - Remove line breaks (default: false)
+    * @param {boolean} options.removeSpecialChars - Remove special characters (default: false)
+    * @param {boolean} options.decodeHtmlEntities - Decode HTML entities like &amp; (default: false)
+    * @param {string} options.customInstructions - Additional cleaning instructions (requires LLM)
+    * @param {boolean} options.useLLM - Force LLM usage (requires a loaded model)
+    * @returns {Promise<string>}
+    *
+    * @example
+    * // No options - returns text as-is
+    * await p.clean(text);
+    *
+    * // User chooses what to remove
+    * await p.clean(text, { removeHtml: true, removeExtraWhitespace: true });
+    *
+    * // Use LLM for semantic cleaning
+    * await p.clean(text, { useLLM: true, customInstructions: "Remove all dates" });
+    */
+   async clean(text, options = {}) {
+     // Don't require the model - cleaning can work without it.
+     // Only require it if explicitly using the LLM or custom instructions.
+     if (options.useLLM === true || options.customInstructions) {
+       this._ensureLoaded();
+     }
+
+     return await clean(this.engine, text, options);
+   }
+
+   /**
+    * Extract information from text
+    * @param {string} text - Text to extract from
+    * @param {Object} options - Extraction options
+    * @returns {Promise<string>}
+    */
+   async extract(text, options = {}) {
+     this._ensureLoaded();
+     return await extract(this.engine, text, options);
+   }
+
+   /**
+    * Chunk text into smaller pieces (non-LLM, fast operation)
+    * Works immediately, no model needed
+    * @param {string} text - Text to chunk
+    * @param {Object} options - Chunking options
+    * @returns {string[]}
+    */
+   chunk(text, options = {}) {
+     // No model check needed - chunk is a pure string operation
+     return chunk(text, options);
+   }
+
+   /**
+    * Run a custom prompt on text
+    * @param {string} text - Input text
+    * @param {string|Object} instruction - Custom instruction or config object
+    * @param {Object} options - Generation options
+    * @returns {Promise<string>}
+    */
+   async prompt(text, instruction, options = {}) {
+     this._ensureLoaded();
+
+     let promptText;
+     let genOptions = { ...options };
+
+     if (typeof instruction === "string") {
+       promptText = `${instruction}\n\n${text}`;
+     } else if (typeof instruction === "object") {
+       // Advanced prompt configuration
+       const { instruction: inst, format, temperature, maxTokens } = instruction;
+
+       promptText = inst;
+       if (format) {
+         if (typeof format === "object") {
+           promptText += `\n\nReturn the result in JSON format with these fields: ${JSON.stringify(format)}`;
+         } else {
+           promptText += `\n\nFormat: ${format}`;
+         }
+       }
+       promptText += `\n\n${text}`;
+
+       if (temperature !== undefined) genOptions.temperature = temperature;
+       if (maxTokens !== undefined) genOptions.maxTokens = maxTokens;
+     } else {
+       throw new Error("Instruction must be a string or object");
+     }
+
+     return await this.engine.run(promptText, genOptions);
+   }
+
+   /**
+    * Enforce correct pipeline ordering
+    * Always ensures: clean → extract (if both present)
+    * @private
+    */
+   _enforcePipelineOrder(pipeline) {
+     const ordered = [...pipeline];
+     const cleanIndex = ordered.findIndex(step =>
+       step === "clean" || (typeof step === "object" && step.clean !== undefined)
+     );
+     const extractIndex = ordered.findIndex(step =>
+       step === "extract" || (typeof step === "object" && step.extract !== undefined)
+     );
+
+     // If both clean and extract exist, ensure clean comes first
+     if (cleanIndex !== -1 && extractIndex !== -1 && cleanIndex > extractIndex) {
+       this.logger.log('warn', 'PIPELINE', 'Reordering pipeline: clean must come before extract', {
+         originalOrder: ordered.map(s => typeof s === 'string' ? s : Object.keys(s)[0]),
+         reordered: true
+       });
+
+       // Move clean before extract
+       const cleanStep = ordered.splice(cleanIndex, 1)[0];
+       const newExtractIndex = ordered.findIndex(step =>
+         step === "extract" || (typeof step === "object" && step.extract !== undefined)
+       );
+       ordered.splice(newExtractIndex, 0, cleanStep);
+     }
+
+     return ordered;
+   }
+
+   /**
+    * Process text with multiple operations in a pipeline
+    * Automatically enforces correct ordering (clean → extract)
+    * @param {string} text - Input text
+    * @param {Array} pipeline - Array of operations to apply
+    * @returns {Promise<string|string[]>}
+    *
+    * @example
+    * await p.pipeline(text, [
+    *   "extract", // Will be reordered to run after clean
+    *   "clean",
+    *   { prompt: "Rewrite in pirate style" }
+    * ]);
+    */
+   async pipeline(text, pipeline) {
+     this._ensureLoaded();
+
+     if (!Array.isArray(pipeline) || pipeline.length === 0) {
+       throw new Error("Pipeline must be a non-empty array");
+     }
+
+     // Enforce correct ordering
+     const orderedPipeline = this._enforcePipelineOrder(pipeline);
+
+     this.logger.log('info', 'PIPELINE', 'Starting pipeline execution', {
+       stepCount: orderedPipeline.length,
+       steps: orderedPipeline.map(s => typeof s === 'string' ? s : Object.keys(s)[0])
+     });
+
+     let result = text;
+     const startTime = Date.now();
+
+     for (let i = 0; i < orderedPipeline.length; i++) {
+       const step = orderedPipeline[i];
+       const stepStartTime = Date.now();
+       const stepName = typeof step === 'string' ? step : Object.keys(step)[0] || 'unknown';
+       const stepInput = result; // This step's actual input (the previous step's output)
+
+       try {
+         if (typeof step === "string") {
+           // Built-in operation name
+           switch (step) {
+             case "clean":
+               result = await this.clean(result);
+               break;
+             case "extract":
+               result = await this.extract(result);
+               break;
+             case "chunk":
+               result = this.chunk(result);
+               break;
+             default:
+               throw new Error(`Unknown operation: ${step}`);
+           }
+         } else if (typeof step === "object") {
+           // Custom operation with options
+           if (step.prompt) {
+             result = await this.prompt(result, step.prompt, step.options || {});
+           } else if (step.clean) {
+             result = await this.clean(result, step.clean);
+           } else if (step.extract) {
+             result = await this.extract(result, step.extract);
+           } else if (step.chunk) {
+             result = this.chunk(result, step.chunk);
+           } else {
+             throw new Error(`Unknown operation object: ${JSON.stringify(step)}`);
+           }
+         } else {
+           throw new Error(`Invalid pipeline step: ${step}`);
+         }
+
+         // If chunking was applied, result is now an array
+         if (Array.isArray(result)) {
+           this.logger.log('info', 'PIPELINE', 'Chunking applied, stopping pipeline', {
+             chunks: result.length
+           });
+           break; // Can't process arrays further
+         }
+
+         const stepDuration = Date.now() - stepStartTime;
+         this.logger.logPipelineStep(i, stepName, stepInput, result, stepDuration);
+       } catch (error) {
+         this.logger.logError(`pipeline step ${i + 1} (${stepName})`, error, {
+           step,
+           inputLength: typeof stepInput === 'string' ? stepInput.length : 'N/A'
+         });
+         throw error;
+       }
+     }
+
+     const totalDuration = Date.now() - startTime;
+     this.logger.logPerformance('pipeline', {
+       totalSteps: orderedPipeline.length,
+       duration: `${totalDuration}ms`,
+       averageStepTime: `${(totalDuration / orderedPipeline.length).toFixed(2)}ms`
+     });
+
+     return result;
+   }
+
+   /**
+    * Process text with a simple configuration object
+    * @param {string} text - Input text
+    * @param {Object} config - Processing configuration
+    * @returns {Promise<string|string[]>}
+    *
+    * @example
+    * await p.process(text, {
+    *   clean: true,
+    *   extract: { format: "json", fields: ["name", "email"] },
+    *   customPrompt: "Convert to bullet points"
+    * });
+    */
+   async process(text, config = {}) {
+     this._ensureLoaded();
+
+     let result = text;
+
+     // Apply operations in order
+     if (config.clean) {
+       result = await this.clean(
+         result,
+         typeof config.clean === "object" ? config.clean : {}
+       );
+     }
+
+     if (config.extract) {
+       result = await this.extract(
+         result,
+         typeof config.extract === "object" ? config.extract : {}
+       );
+     }
+
+     if (config.customPrompt) {
+       result = await this.prompt(result, config.customPrompt, config.promptOptions || {});
+     }
+
+     if (config.chunk) {
+       result = this.chunk(
+         result,
+         typeof config.chunk === "object" ? config.chunk : {}
+       );
+     }
+
+     return result;
+   }
+ }
+
+ export { LLMEngine, Preprocessor, chunk, clean, cleanWithRules, extract };
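+
+ // End-to-end usage sketch (illustrative; assumes a module context with
+ // top-level await, and pageHtml as the caller's input string):
+ // import { Preprocessor } from 'client-llm-preprocessor';
+ //
+ // const p = new Preprocessor();
+ // if (await p.checkWebGPU()) {
+ //   await p.loadModel(); // default model
+ //   const result = await p.process(pageHtml, {
+ //     clean: { removeHtml: true, removeExtraWhitespace: true },
+ //     extract: { format: 'json', fields: ['title', 'author'] },
+ //   });
+ //   console.log(p.getLogger().getStats());
+ // } else {
+ //   // No WebGPU: rule-based cleaning and chunking still work
+ //   const cleaned = await p.clean(pageHtml, { removeHtml: true });
+ //   const chunks = p.chunk(cleaned, { size: 800 });
+ // }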