client-llm-preprocessor 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js ADDED
@@ -0,0 +1,1043 @@
+// src/engine.js
+import * as webllm from "@mlc-ai/web-llm";
+
+// src/utils/logger.js
+var InternalLogger = class {
+  constructor(options = {}) {
+    this.enabled = options.enabled !== false;
+    this.verbose = options.verbose || false;
+    this.logLevel = options.logLevel || "info";
+    this.logs = [];
+    this.maxLogs = options.maxLogs || 1e3;
+    this.onLogCallback = options.onLogCallback || null;
+  }
+  /**
+   * Enable or disable logging
+   */
+  setEnabled(enabled) {
+    this.enabled = enabled;
+  }
+  /**
+   * Set verbosity level
+   */
+  setVerbose(verbose) {
+    this.verbose = verbose;
+  }
+  /**
+   * Log an event with metadata
+   */
+  log(level, category, message, data = {}) {
+    if (!this.enabled) return;
+    const logEntry = {
+      timestamp: (/* @__PURE__ */ new Date()).toISOString(),
+      level,
+      category,
+      message,
+      data,
+      stackTrace: this.verbose ? new Error().stack : void 0
+    };
+    this.logs.push(logEntry);
+    if (this.logs.length > this.maxLogs) {
+      this.logs.shift();
+    }
+    if (this.shouldLog(level)) {
+      const prefix = `[${level.toUpperCase()}] [${category}]`;
+      console.log(`${prefix} ${message}`, data);
+    }
+    if (this.onLogCallback) {
+      this.onLogCallback(logEntry);
+    }
+  }
+  /**
+   * Check if log level should be output
+   */
+  shouldLog(level) {
+    const levels = { debug: 0, info: 1, warn: 2, error: 3 };
+    return levels[level] >= levels[this.logLevel];
+  }
+  /**
+   * Log prompt construction
+   */
+  logPromptConstruction(operation, originalPrompt, finalPrompt, options = {}) {
+    this.log("debug", "PROMPT", "Constructing prompt", {
+      operation,
+      originalLength: originalPrompt.length,
+      finalLength: finalPrompt.length,
+      options,
+      promptPreview: finalPrompt.substring(0, 200) + "..."
+    });
+  }
+  /**
+   * Log token generation (if streaming available)
+   */
+  logTokenGeneration(token, cumulativeText, tokenIndex) {
+    if (this.verbose) {
+      this.log("debug", "TOKEN", `Generated token ${tokenIndex}`, {
+        token,
+        cumulativeLength: cumulativeText.length,
+        tokenIndex
+      });
+    }
+  }
+  /**
+   * Log LLM inference start
+   */
+  logInferenceStart(prompt, options) {
+    this.log("info", "INFERENCE", "Starting LLM inference", {
+      promptLength: prompt.length,
+      temperature: options.temperature,
+      maxTokens: options.maxTokens,
+      promptPreview: prompt.substring(0, 100) + "..."
+    });
+  }
+  /**
+   * Log LLM inference completion
+   */
+  logInferenceComplete(response, duration, tokenCount) {
+    this.log("info", "INFERENCE", "LLM inference completed", {
+      responseLength: response.length,
+      duration: `${duration}ms`,
+      estimatedTokens: tokenCount,
+      responsePreview: response.substring(0, 200) + "..."
+    });
+  }
+  /**
+   * Log validation step
+   */
+  logValidation(step, input, output, isValid, error = null) {
+    this.log(isValid ? "info" : "warn", "VALIDATION", `Validation: ${step}`, {
+      inputPreview: typeof input === "string" ? input.substring(0, 100) : input,
+      outputPreview: typeof output === "string" ? output.substring(0, 100) : output,
+      isValid,
+      error: error?.message
+    });
+  }
+  /**
+   * Log pipeline step
+   */
+  logPipelineStep(stepIndex, stepName, input, output, duration) {
+    this.log("info", "PIPELINE", `Pipeline step ${stepIndex + 1}: ${stepName}`, {
+      inputLength: typeof input === "string" ? input.length : "N/A",
+      outputLength: typeof output === "string" ? output.length : "N/A",
+      duration: `${duration}ms`,
+      inputPreview: typeof input === "string" ? input.substring(0, 50) : input,
+      outputPreview: typeof output === "string" ? output.substring(0, 50) : output
+    });
+  }
+  /**
+   * Log error with context
+   */
+  logError(operation, error, context = {}) {
+    this.log("error", "ERROR", `Error in ${operation}`, {
+      error: error.message,
+      stack: error.stack,
+      context
+    });
+  }
+  /**
+   * Log performance metrics
+   */
+  logPerformance(operation, metrics) {
+    this.log("info", "PERFORMANCE", `Performance: ${operation}`, metrics);
+  }
+  /**
+   * Get all logs
+   */
+  getLogs(filter = {}) {
+    let filtered = [...this.logs];
+    if (filter.level) {
+      filtered = filtered.filter((log) => log.level === filter.level);
+    }
+    if (filter.category) {
+      filtered = filtered.filter((log) => log.category === filter.category);
+    }
+    if (filter.since) {
+      const sinceTime = new Date(filter.since).getTime();
+      filtered = filtered.filter((log) => new Date(log.timestamp).getTime() >= sinceTime);
+    }
+    return filtered;
+  }
+  /**
+   * Get logs as formatted string
+   */
+  getLogsAsString(filter = {}) {
+    const logs = this.getLogs(filter);
+    return logs.map((log) => {
+      return `[${log.timestamp}] [${log.level}] [${log.category}] ${log.message}`;
+    }).join("\n");
+  }
+  /**
+   * Clear logs
+   */
+  clear() {
+    this.logs = [];
+  }
+  /**
+   * Export logs as JSON
+   */
+  exportLogs() {
+    return JSON.stringify(this.logs, null, 2);
+  }
+  /**
+   * Get summary statistics
+   */
+  getStats() {
+    const stats = {
+      totalLogs: this.logs.length,
+      byLevel: {},
+      byCategory: {},
+      errors: 0,
+      warnings: 0,
+      timeRange: {
+        start: this.logs[0]?.timestamp,
+        end: this.logs[this.logs.length - 1]?.timestamp
+      }
+    };
+    this.logs.forEach((log) => {
+      stats.byLevel[log.level] = (stats.byLevel[log.level] || 0) + 1;
+      stats.byCategory[log.category] = (stats.byCategory[log.category] || 0) + 1;
+      if (log.level === "error") stats.errors++;
+      if (log.level === "warn") stats.warnings++;
+    });
+    return stats;
+  }
+};
+var defaultLogger = null;
+function getLogger(options = {}) {
+  if (!defaultLogger) {
+    defaultLogger = new InternalLogger(options);
+  }
+  return defaultLogger;
+}
+
+// src/engine.js
+var LLMEngine = class {
+  constructor(options = {}) {
+    this.engine = null;
+    this.model = null;
+    this.logger = options.logger || getLogger(options.loggerOptions);
+    this.streamingEnabled = options.streaming !== false;
+  }
+  /**
+   * Load a WebLLM model
+   * @param {string} model - Model name (default: "Llama-3.2-1B-Instruct-q4f16_1-MLC")
+   * @returns {Promise<void>}
+   */
+  async loadModel(model = "Llama-3.2-1B-Instruct-q4f16_1-MLC") {
+    if (this.engine && this.model === model) {
+      this.logger.log("info", "MODEL", "Model already loaded, skipping");
+      return;
+    }
+    const startTime = Date.now();
+    this.logger.log("info", "MODEL", `Loading model: ${model}`, { model });
+    try {
+      this.engine = await webllm.CreateMLCEngine(model, {
+        initProgressCallback: (report) => {
+          if (report.progress) {
+            const progress = (report.progress * 100).toFixed(1);
+            this.logger.log("info", "MODEL", `Loading progress: ${progress}%`, {
+              progress: parseFloat(progress),
+              report
+            });
+          }
+        }
+      });
+      this.model = model;
+      const loadTime = Date.now() - startTime;
+      this.logger.log("info", "MODEL", "Model loaded successfully", {
+        model,
+        loadTime: `${loadTime}ms`
+      });
+    } catch (error) {
+      this.logger.logError("loadModel", error, { model });
+      throw new Error(`Failed to load model: ${error.message}`);
+    }
+  }
+  /**
+   * Run inference with the loaded model
+   * Captures detailed internal state including token-by-token generation
+   * @param {string} prompt - The prompt to send to the model
+   * @param {Object} options - Generation options
+   * @returns {Promise<string>}
+   */
+  async run(prompt, options = {}) {
+    if (!this.engine) {
+      throw new Error("Model not loaded. Call loadModel() first.");
+    }
+    const {
+      temperature = 0.7,
+      maxTokens = 512,
+      stopSequences = [],
+      stream = this.streamingEnabled
+      // Try streaming for token-by-token logging
+    } = options;
+    const startTime = Date.now();
+    this.logger.logInferenceStart(prompt, { temperature, maxTokens, stopSequences, stream });
+    try {
+      let fullResponse = "";
+      let tokenCount = 0;
+      const tokens = [];
+      if (stream && this.engine.chat?.completions?.createStream) {
+        this.logger.log("info", "INFERENCE", "Using streaming mode for token-by-token logging");
+        try {
+          const stream2 = await this.engine.chat.completions.createStream({
+            messages: [{ role: "user", content: prompt }],
+            temperature,
+            max_tokens: maxTokens,
+            stop: stopSequences.length > 0 ? stopSequences : void 0
+          });
+          for await (const chunk2 of stream2) {
+            const delta = chunk2.choices?.[0]?.delta?.content || "";
+            if (delta) {
+              fullResponse += delta;
+              tokenCount++;
+              tokens.push(delta);
+              this.logger.logTokenGeneration(delta, fullResponse, tokenCount);
+            }
+          }
+          this.logger.log("info", "INFERENCE", "Streaming completed", {
+            totalTokens: tokenCount,
+            responseLength: fullResponse.length
+          });
+        } catch (streamError) {
+          this.logger.log("warn", "INFERENCE", "Streaming failed, falling back to non-streaming", {
+            error: streamError.message
+          });
+          return await this.runNonStreaming(prompt, options, startTime);
+        }
+      } else {
+        return await this.runNonStreaming(prompt, options, startTime);
+      }
+      const duration = Date.now() - startTime;
+      this.logger.logInferenceComplete(fullResponse, duration, tokenCount);
+      this.logger.log("debug", "INFERENCE", "Token sequence captured", {
+        tokenCount,
+        tokens: tokens.slice(0, 20),
+        // First 20 tokens
+        fullSequenceLength: tokens.length
+      });
+      return fullResponse;
+    } catch (error) {
+      const duration = Date.now() - startTime;
+      this.logger.logError("run", error, {
+        promptLength: prompt.length,
+        duration: `${duration}ms`,
+        options
+      });
+      throw new Error(`Inference failed: ${error.message}`);
+    }
+  }
+  /**
+   * Non-streaming inference (fallback)
+   * @private
+   */
+  async runNonStreaming(prompt, options, startTime) {
+    const {
+      temperature = 0.7,
+      maxTokens = 512,
+      stopSequences = []
+    } = options;
+    this.logger.log("info", "INFERENCE", "Using non-streaming mode");
+    const response = await this.engine.chat.completions.create({
+      messages: [{ role: "user", content: prompt }],
+      temperature,
+      max_tokens: maxTokens,
+      stop: stopSequences.length > 0 ? stopSequences : void 0
+    });
+    const result = response.choices[0].message.content;
+    const duration = Date.now() - (startTime || Date.now());
+    const estimatedTokens = Math.ceil(result.length / 4);
+    this.logger.logInferenceComplete(result, duration, estimatedTokens);
+    return result;
+  }
+  /**
+   * Check if model is loaded
+   * @returns {boolean}
+   */
+  isLoaded() {
+    return this.engine !== null;
+  }
+  /**
+   * Get the logger instance
+   * @returns {InternalLogger}
+   */
+  getLogger() {
+    return this.logger;
+  }
+  /**
+   * Enable/disable streaming for token-by-token logging
+   */
+  setStreaming(enabled) {
+    this.streamingEnabled = enabled;
+    this.logger.log("info", "ENGINE", `Streaming ${enabled ? "enabled" : "disabled"}`);
+  }
+};
+
+// src/preprocess/clean-rules.js
+function cleanWithRules(text, options = {}) {
+  const {
+    removeHtml = false,
+    removeUrls = false,
+    removeExtraWhitespace = false,
+    removeLineBreaks = false,
+    removeSpecialChars = false,
+    decodeHtmlEntities = false
+  } = options;
+  let cleaned = text;
+  if (decodeHtmlEntities) {
+    cleaned = cleaned.replace(/&nbsp;/g, " ").replace(/&amp;/g, "&").replace(/&lt;/g, "<").replace(/&gt;/g, ">").replace(/&quot;/g, '"').replace(/&#39;/g, "'").replace(/&#x27;/g, "'").replace(/&#x2F;/g, "/");
+  }
+  if (removeHtml) {
+    cleaned = cleaned.replace(/<[^>]+>/g, "");
+  }
+  if (removeUrls) {
+    cleaned = cleaned.replace(/https?:\/\/[^\s]+/g, "");
+  }
+  if (removeLineBreaks) {
+    cleaned = cleaned.replace(/[\r\n]+/g, " ");
+  }
+  if (removeExtraWhitespace) {
+    cleaned = cleaned.replace(/[ \t]+/g, " ");
+    if (!removeLineBreaks) {
+      cleaned = cleaned.split("\n").map((line) => line.trim()).join("\n");
+    }
+    if (!removeLineBreaks) {
+      cleaned = cleaned.replace(/\n{3,}/g, "\n\n");
+    }
+    cleaned = cleaned.trim();
+  }
+  if (removeSpecialChars) {
+    cleaned = cleaned.replace(/[^\w\s.,!?;:()\-'"]/g, "");
+  }
+  return cleaned;
+}
+
+// src/preprocess/clean.js
+async function clean(engine, text, options = {}) {
+  const {
+    removeHtml = false,
+    removeUrls = false,
+    removeExtraWhitespace = false,
+    removeLineBreaks = false,
+    removeSpecialChars = false,
+    decodeHtmlEntities = false,
+    customInstructions = "",
+    useLLM = null
+    // null = auto-detect, true = force LLM, false = force rules
+  } = options;
+  const shouldUseLLM = useLLM !== false && engine !== null && engine.isLoaded() && (useLLM === true || customInstructions !== "");
+  if (!shouldUseLLM) {
+    const logger2 = engine?.getLogger();
+    if (logger2) {
+      logger2.log("info", "CLEAN", "Using rule-based cleaning (no LLM)", {
+        reason: !engine ? "No engine" : !engine.isLoaded() ? "Model not loaded" : "useLLM=false",
+        options: { removeHtml, removeUrls, removeExtraWhitespace, removeLineBreaks, removeSpecialChars, decodeHtmlEntities }
+      });
+    }
+    return cleanWithRules(text, {
+      removeHtml,
+      removeUrls,
+      removeExtraWhitespace,
+      removeLineBreaks,
+      removeSpecialChars,
+      decodeHtmlEntities
+    });
+  }
+  const logger = engine.getLogger();
+  const cleaningSteps = [];
+  if (removeHtml) cleaningSteps.push("HTML tags");
+  if (removeUrls) cleaningSteps.push("URLs");
+  if (removeExtraWhitespace) cleaningSteps.push("extra whitespace");
+  if (removeLineBreaks) cleaningSteps.push("line breaks");
+  if (removeSpecialChars) cleaningSteps.push("special characters");
+  if (decodeHtmlEntities) cleaningSteps.push("decode HTML entities");
+  let originalPrompt = `Clean the following text`;
+  let prompt = originalPrompt;
+  if (cleaningSteps.length > 0) {
+    prompt += ` by removing: ${cleaningSteps.join(", ")}`;
+  } else if (!customInstructions) {
+    return text;
+  }
+  prompt += `. IMPORTANT: Do NOT modify the meaning or remove important information. Only remove what was requested.`;
+  if (customInstructions) {
+    prompt += ` Also: ${customInstructions}`;
+  }
+  prompt += `:
+
+${text}`;
+  logger.logPromptConstruction("clean", originalPrompt, prompt, options);
+  logger.log("info", "CLEAN", "Using LLM-based cleaning");
+  const result = await engine.run(prompt, { temperature: 0.3 });
+  const cleaned = result.trim();
+  logger.log("info", "CLEAN", "LLM cleaning completed", {
+    originalLength: text.length,
+    finalLength: cleaned.length
+  });
+  return cleaned;
+}
+
+// src/preprocess/chunk.js
+function chunk(text, options = {}) {
+  const {
+    size = 500,
+    // Character count per chunk
+    overlap = 0,
+    // Overlap between chunks (in characters)
+    strategy = "character"
+    // "character", "sentence", "word"
+  } = options;
+  if (!text || text.length === 0) {
+    return [];
+  }
+  const chunks = [];
+ if (strategy === "character") {
494
+ for (let i = 0; i < text.length; i += size - overlap) {
495
+ chunks.push(text.slice(i, i + size));
496
+ }
497
+ } else if (strategy === "sentence") {
498
+ const sentences = text.match(/[^.!?]+[.!?]+/g) || [text];
499
+ let currentChunk = "";
500
+ for (const sentence of sentences) {
501
+ if (currentChunk.length + sentence.length > size && currentChunk) {
502
+ chunks.push(currentChunk.trim());
503
+ currentChunk = sentence;
504
+ } else {
505
+ currentChunk += sentence;
506
+ }
507
+ }
508
+ if (currentChunk.trim()) {
509
+ chunks.push(currentChunk.trim());
510
+ }
511
+ } else if (strategy === "word") {
512
+ const words = text.split(/\s+/);
513
+ let currentChunk = [];
514
+ let currentSize = 0;
515
+ for (const word of words) {
516
+ if (currentSize + word.length > size && currentChunk.length > 0) {
517
+ chunks.push(currentChunk.join(" "));
518
+ currentChunk = [word];
519
+ currentSize = word.length;
520
+ } else {
521
+ currentChunk.push(word);
522
+ currentSize += word.length + 1;
523
+ }
524
+ }
525
+ if (currentChunk.length > 0) {
526
+ chunks.push(currentChunk.join(" "));
527
+ }
528
+ }
529
+ return chunks.filter((ch) => ch.length > 0);
530
+ }
531
+
532
+ // src/utils/validation.js
533
+ function validateJSON(text, expectedFields = []) {
534
+ try {
535
+ const jsonMatch = text.match(/```(?:json)?\s*([\s\S]*?)\s*```/);
536
+ const cleanText = jsonMatch ? jsonMatch[1] : text;
537
+ const parsed = JSON.parse(cleanText);
538
+ if (expectedFields.length > 0) {
539
+ const missingFields = expectedFields.filter((field) => !(field in parsed));
540
+ if (missingFields.length > 0) {
541
+ return {
542
+ isValid: false,
543
+ error: `Missing required fields: ${missingFields.join(", ")}`,
544
+ data: parsed
545
+ };
546
+ }
547
+ }
548
+ return {
549
+ isValid: true,
550
+ data: parsed
551
+ };
552
+ } catch (error) {
553
+ return {
554
+ isValid: false,
555
+ error: `Invalid JSON: ${error.message}`,
556
+ data: null
557
+ };
558
+ }
559
+ }
560
+ var validators = {
561
+ email: (value) => /^[^\s@]+@[^\s@]+\.[^\s@]+$/.test(value),
562
+ phone: (value) => /^[\d\s\-\+\(\)]+$/.test(value) && value.replace(/\D/g, "").length >= 7,
563
+ url: (value) => /^https?:\/\/.+/.test(value)
564
+ };
565
+ function verifyExtraction(extracted, sourceText, fields = []) {
566
+ const sourceLower = sourceText.toLowerCase();
567
+ const issues = [];
568
+ if (typeof extracted === "object" && extracted !== null) {
569
+ for (const [key, value] of Object.entries(extracted)) {
570
+ if (fields.length > 0 && !fields.includes(key)) {
571
+ continue;
572
+ }
573
+ if (value && typeof value === "string" && value.trim().length > 0) {
574
+ const valueLower = value.toLowerCase();
575
+ let foundInSource = sourceLower.includes(valueLower);
576
+ if (!foundInSource) {
577
+ const fieldType = key.toLowerCase();
578
+ if (validators[fieldType]) {
579
+ if (!validators[fieldType](value)) {
580
+ issues.push({
581
+ field: key,
582
+ value,
583
+ reason: `Invalid ${fieldType} format`
584
+ });
585
+ continue;
586
+ }
587
+ }
588
+ const words = valueLower.split(/\s+/).filter((w) => w.length > 3);
589
+ const matchedWords = words.filter((word) => sourceLower.includes(word));
590
+ const matchRatio = words.length > 0 ? matchedWords.length / words.length : 0;
591
+ foundInSource = matchRatio >= 0.8;
592
+ }
593
+ if (!foundInSource) {
594
+ issues.push({
595
+ field: key,
596
+ value,
597
+ reason: "Value not found in source text (possible hallucination)"
598
+ });
599
+ }
600
+ }
601
+ }
602
+ }
603
+ return {
604
+ isValid: issues.length === 0,
605
+ issues,
606
+ extracted
607
+ };
608
+ }
609
+ function normalizeExtracted(extracted) {
610
+ if (typeof extracted !== "object" || extracted === null) {
611
+ return extracted;
612
+ }
613
+ const normalized = {};
614
+ for (const [key, value] of Object.entries(extracted)) {
615
+ if (typeof value === "string") {
616
+ normalized[key] = value.trim().replace(/^["']|["']$/g, "").replace(/^[-•*]\s*/, "").trim();
617
+ } else {
618
+ normalized[key] = value;
619
+ }
620
+ }
621
+ return normalized;
622
+ }
623
+ function validateExtraction(llmOutput, sourceText, options = {}) {
624
+ const {
625
+ format = "json",
626
+ fields = [],
627
+ strict = true
628
+ // If true, reject if validation fails
629
+ } = options;
630
+ let parsed = llmOutput.trim();
631
+ if (format === "json") {
632
+ const jsonResult = validateJSON(parsed, fields);
633
+ if (!jsonResult.isValid) {
634
+ return {
635
+ isValid: false,
636
+ error: jsonResult.error,
637
+ raw: llmOutput,
638
+ validated: null
639
+ };
640
+ }
641
+ parsed = jsonResult.data;
642
+ }
643
+ const normalized = normalizeExtracted(parsed);
644
+ const verification = verifyExtraction(normalized, sourceText, fields);
645
+ if (strict && !verification.isValid) {
646
+ return {
647
+ isValid: false,
648
+ error: "Extraction validation failed",
649
+ issues: verification.issues,
650
+ raw: llmOutput,
651
+ validated: null
652
+ };
653
+ }
654
+ return {
655
+ isValid: true,
656
+ raw: llmOutput,
657
+ validated: normalized,
658
+ warnings: verification.issues
659
+ // Include warnings even if not strict
660
+ };
661
+ }
662
+
663
+ // src/preprocess/extract.js
664
+ async function extract(engine, text, options = {}) {
665
+ const logger = engine.getLogger();
666
+ const {
667
+ what = "key information",
668
+ // What to extract
669
+ format = "text",
670
+ // "text", "json", "list"
671
+ fields = [],
672
+ // Specific fields to extract (for JSON)
673
+ validate = true,
674
+ // Enable rule-based validation
675
+ strict = false
676
+ // If true, throw error on validation failure
677
+ } = options;
678
+ const originalPrompt = `Extract ${what} from the following text`;
679
+ let prompt = originalPrompt;
680
+ if (format === "json") {
681
+ if (fields.length > 0) {
682
+ prompt += ` in JSON format with these fields: ${fields.join(", ")}`;
683
+ } else {
684
+ prompt += ` in JSON format`;
685
+ }
686
+ } else if (format === "list") {
687
+ prompt += ` as a list`;
688
+ }
689
+ prompt += `:
690
+
691
+ ${text}`;
692
+ logger.logPromptConstruction("extract", originalPrompt, prompt, options);
693
+ const llmResult = await engine.run(prompt, { temperature: 0.3 });
694
+ const rawResult = llmResult.trim();
695
+ logger.log("info", "EXTRACT", "LLM extraction completed", {
696
+ format,
697
+ fields,
698
+ resultLength: rawResult.length
699
+ });
700
+ if (validate && format === "json") {
701
+ logger.log("info", "VALIDATION", "Starting rule-based validation");
702
+ const validation = validateExtraction(rawResult, text, {
703
+ format,
704
+ fields,
705
+ strict
706
+ });
707
+ logger.logValidation(
708
+ "extract",
709
+ text,
710
+ validation.validated || rawResult,
711
+ validation.isValid,
712
+ validation.error ? new Error(validation.error) : null
713
+ );
714
+ if (!validation.isValid) {
715
+ if (strict) {
716
+ throw new Error(`Extraction validation failed: ${validation.error}`);
717
+ } else {
718
+ logger.log("warn", "VALIDATION", "Validation failed but continuing (non-strict mode)", {
719
+ error: validation.error,
720
+ issues: validation.issues
721
+ });
722
+ return rawResult;
723
+ }
724
+ }
725
+ if (validation.validated) {
726
+ logger.log("info", "VALIDATION", "Validation passed, returning validated data");
727
+ return JSON.stringify(validation.validated, null, 2);
728
+ }
729
+ }
730
+ return rawResult;
731
+ }
732
+
733
+ // src/index.js
734
+ var Preprocessor = class {
735
+ constructor(options = {}) {
736
+ this.engine = new LLMEngine(options);
737
+ this.isModelLoaded = false;
738
+ this.logger = this.engine.getLogger();
739
+ }
740
+ /**
741
+ * Load the WebLLM model
742
+ * @param {string} model - Model name (default: "Llama-3.2-1B-Instruct-q4f16_1-MLC")
743
+ * @returns {Promise<void>}
744
+ */
745
+ async loadModel(model) {
746
+ await this.engine.loadModel(model);
747
+ this.isModelLoaded = true;
748
+ this.logger.log("info", "PREPROCESSOR", "Model loaded and ready");
749
+ }
750
+ /**
751
+ * Check if WebGPU is supported in the current environment
752
+ * @returns {Promise<boolean>}
753
+ */
754
+ async checkWebGPU() {
755
+ if (typeof navigator === "undefined" || !navigator.gpu) {
756
+ return false;
757
+ }
758
+ try {
759
+ const adapter = await navigator.gpu.requestAdapter();
760
+ return !!adapter;
761
+ } catch (e) {
762
+ return false;
763
+ }
764
+ }
765
+ /**
766
+ * Get the logger instance for accessing internal logs
767
+ * @returns {InternalLogger}
768
+ */
769
+ getLogger() {
770
+ return this.logger;
771
+ }
772
+ /**
773
+ * Enable/disable internal logging
774
+ */
775
+ setLogging(enabled, verbose = false) {
776
+ this.logger.setEnabled(enabled);
777
+ this.logger.setVerbose(verbose);
778
+ this.logger.log("info", "PREPROCESSOR", `Logging ${enabled ? "enabled" : "disabled"}`, { verbose });
779
+ }
780
+ /**
781
+ * Ensure model is loaded
782
+ * @private
783
+ */
784
+ _ensureLoaded() {
785
+ if (!this.isModelLoaded && !this.engine.isLoaded()) {
786
+ throw new Error(
787
+ "Model not loaded. Call loadModel() first before using preprocessing functions."
788
+ );
789
+ }
790
+ }
791
+ /**
792
+ * Clean text
793
+ * Works with or without LLM model loaded
794
+ * Uses rule-based cleaning if model not loaded, LLM if available
795
+ * All options are opt-in (default: false) - user chooses what to remove
796
+ * @param {string} text - Text to clean
797
+ * @param {Object} options - Cleaning options (all optional, default: false)
798
+ * @param {boolean} options.removeHtml - Remove HTML tags (default: false)
799
+ * @param {boolean} options.removeUrls - Remove URLs (default: false)
800
+ * @param {boolean} options.removeExtraWhitespace - Remove extra whitespace (default: false)
801
+ * @param {boolean} options.removeLineBreaks - Remove line breaks (default: false)
802
+ * @param {boolean} options.removeSpecialChars - Remove special characters (default: false)
803
+ * @param {boolean} options.decodeHtmlEntities - Decode HTML entities like &amp; (default: false)
804
+ * @param {string} options.customInstructions - Additional cleaning instructions (requires LLM)
805
+ * @param {boolean} options.useLLM - Force LLM usage (requires model loaded)
806
+ * @returns {Promise<string>|string}
807
+ *
808
+ * @example
809
+ * // No options - returns text as-is
810
+ * await p.clean(text);
811
+ *
812
+ * // User chooses what to remove
813
+ * await p.clean(text, { removeHtml: true, removeExtraWhitespace: true });
814
+ *
815
+ * // Use LLM for semantic cleaning
816
+ * await p.clean(text, { useLLM: true, customInstructions: "Remove all dates" });
817
+ */
818
+ async clean(text, options = {}) {
819
+ if (options.useLLM === true || options.customInstructions) {
820
+ this._ensureLoaded();
821
+ }
822
+ return await clean(this.engine, text, options);
823
+ }
824
+ /**
825
+ * Extract information from text
826
+ * @param {string} text - Text to extract from
827
+ * @param {Object} options - Extraction options
828
+ * @returns {Promise<string>}
829
+ */
830
+ async extract(text, options = {}) {
831
+ this._ensureLoaded();
832
+ return await extract(this.engine, text, options);
833
+ }
834
+ /**
835
+ * Chunk text into smaller pieces (non-LLM, fast operation)
836
+ * Works immediately, no model needed
837
+ * @param {string} text - Text to chunk
838
+ * @param {Object} options - Chunking options
839
+ * @returns {string[]}
840
+ */
841
+ chunk(text, options = {}) {
842
+ return chunk(text, options);
843
+ }
844
+ /**
845
+ * Run a custom prompt on text
846
+ * @param {string} text - Input text
847
+ * @param {string|Object} instruction - Custom instruction or config object
848
+ * @param {Object} options - Generation options
849
+ * @returns {Promise<string>}
850
+ */
851
+ async prompt(text, instruction, options = {}) {
852
+ this._ensureLoaded();
853
+ let promptText;
854
+ let genOptions = { ...options };
855
+ if (typeof instruction === "string") {
856
+ promptText = `${instruction}
857
+
858
+ ${text}`;
859
+ } else if (typeof instruction === "object") {
860
+ const { instruction: inst, format, temperature, maxTokens } = instruction;
861
+ promptText = inst;
862
+ if (format) {
863
+ if (typeof format === "object") {
864
+ promptText += `
865
+
866
+ Return the result in JSON format with these fields: ${JSON.stringify(format)}`;
867
+ } else {
868
+ promptText += `
869
+
870
+ Format: ${format}`;
871
+ }
872
+ }
873
+ promptText += `
874
+
875
+ ${text}`;
876
+ if (temperature !== void 0) genOptions.temperature = temperature;
877
+ if (maxTokens !== void 0) genOptions.maxTokens = maxTokens;
878
+ } else {
879
+ throw new Error("Instruction must be a string or object");
880
+ }
881
+ return await this.engine.run(promptText, genOptions);
882
+ }
883
+ /**
884
+ * Enforce correct pipeline ordering
885
+ * Always ensures: clean → extract (if both present)
886
+ * @private
887
+ */
888
+ _enforcePipelineOrder(pipeline) {
889
+ const ordered = [...pipeline];
890
+ const cleanIndex = ordered.findIndex(
891
+ (step) => step === "clean" || typeof step === "object" && step.clean !== void 0
892
+ );
893
+ const extractIndex = ordered.findIndex(
894
+ (step) => step === "extract" || typeof step === "object" && step.extract !== void 0
895
+ );
896
+ if (cleanIndex !== -1 && extractIndex !== -1 && cleanIndex > extractIndex) {
897
+ this.logger.log("warn", "PIPELINE", "Reordering pipeline: clean must come before extract", {
898
+ originalOrder: ordered.map((s) => typeof s === "string" ? s : Object.keys(s)[0]),
899
+ reordered: true
900
+ });
901
+ const cleanStep = ordered.splice(cleanIndex, 1)[0];
902
+ const newExtractIndex = ordered.findIndex(
903
+ (step) => step === "extract" || typeof step === "object" && step.extract !== void 0
904
+ );
905
+ ordered.splice(newExtractIndex, 0, cleanStep);
906
+ }
907
+ return ordered;
908
+ }
909
+ /**
910
+ * Process text with multiple operations in a pipeline
911
+ * Automatically enforces correct ordering (clean → extract)
912
+ * @param {string} text - Input text
913
+ * @param {Array} pipeline - Array of operations to apply
914
+ * @returns {Promise<string|string[]>}
915
+ *
916
+ * @example
917
+ * await p.pipeline(text, [
918
+ * "extract", // Will be reordered to run after clean
919
+ * "clean",
920
+ * { prompt: "Rewrite in pirate style" }
921
+ * ]);
922
+ */
923
+ async pipeline(text, pipeline) {
924
+ this._ensureLoaded();
925
+ if (!Array.isArray(pipeline) || pipeline.length === 0) {
926
+ throw new Error("Pipeline must be a non-empty array");
927
+ }
928
+ const orderedPipeline = this._enforcePipelineOrder(pipeline);
929
+ this.logger.log("info", "PIPELINE", "Starting pipeline execution", {
930
+ stepCount: orderedPipeline.length,
931
+ steps: orderedPipeline.map((s) => typeof s === "string" ? s : Object.keys(s)[0])
932
+ });
933
+ let result = text;
934
+ const startTime = Date.now();
935
+ for (let i = 0; i < orderedPipeline.length; i++) {
936
+      const step = orderedPipeline[i];
+      const stepStartTime = Date.now();
+      // Capture this step's input before the step overwrites it with its output
+      const stepInput = result;
+      const stepName = typeof step === "string" ? step : Object.keys(step)[0] || "unknown";
+      try {
+        if (typeof step === "string") {
+          switch (step) {
+            case "clean":
+              result = await this.clean(result);
+              break;
+            case "extract":
+              result = await this.extract(result);
+              break;
+            case "chunk":
+              result = this.chunk(result);
+              break;
+            default:
+              throw new Error(`Unknown operation: ${step}`);
+          }
+        } else if (typeof step === "object") {
+          if (step.prompt) {
+            result = await this.prompt(result, step.prompt, step.options || {});
+          } else if (step.clean) {
+            result = await this.clean(result, step.clean);
+          } else if (step.extract) {
+            result = await this.extract(result, step.extract);
+          } else if (step.chunk) {
+            result = this.chunk(result, step.chunk);
+          } else {
+            throw new Error(`Unknown operation object: ${JSON.stringify(step)}`);
+          }
+        } else {
+          throw new Error(`Invalid pipeline step: ${step}`);
+        }
+        if (Array.isArray(result)) {
+          this.logger.log("info", "PIPELINE", "Chunking applied, stopping pipeline", {
+            chunks: result.length
+          });
+          break;
+        }
+        const stepDuration = Date.now() - stepStartTime;
+        this.logger.logPipelineStep(i, stepName, stepInput, result, stepDuration);
+      } catch (error) {
+        this.logger.logError(`pipeline step ${i + 1} (${stepName})`, error, {
+          step,
+          inputLength: typeof stepInput === "string" ? stepInput.length : "N/A"
+        });
+        throw error;
+      }
+    }
+    const totalDuration = Date.now() - startTime;
+    this.logger.logPerformance("pipeline", {
+      totalSteps: orderedPipeline.length,
+      duration: `${totalDuration}ms`,
+      averageStepTime: `${(totalDuration / orderedPipeline.length).toFixed(2)}ms`
+    });
+    return result;
+  }
+  /**
+   * Process text with a simple configuration object
+   * @param {string} text - Input text
+   * @param {Object} config - Processing configuration
+   * @returns {Promise<string>}
+   *
+   * @example
+   * await p.process(text, {
+   *   clean: true,
+   *   extract: { format: "json", fields: ["name", "email"] },
+   *   customPrompt: "Convert to bullet points"
+   * });
+   */
+  async process(text, config = {}) {
+    this._ensureLoaded();
+    let result = text;
+    if (config.clean) {
+      result = await this.clean(
+        result,
+        typeof config.clean === "object" ? config.clean : {}
+      );
+    }
+    if (config.extract) {
+      result = await this.extract(
+        result,
+        typeof config.extract === "object" ? config.extract : {}
+      );
+    }
+    if (config.customPrompt) {
+      result = await this.prompt(result, config.customPrompt, config.promptOptions || {});
+    }
+    if (config.chunk) {
+      result = this.chunk(
+        result,
+        typeof config.chunk === "object" ? config.chunk : {}
+      );
+    }
+    return result;
+  }
+};
+export {
+  LLMEngine,
+  Preprocessor,
+  chunk,
+  clean,
+  cleanWithRules,
+  extract
+};
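
For context, here is a minimal usage sketch assembled from the exported surface above. It is an illustration, not documentation shipped with the package: the import specifier assumes the package name from the header, and the sample inputs are placeholders.

import { Preprocessor } from "client-llm-preprocessor";

const p = new Preprocessor();

// Rule-based operations work without a model.
const chunks = p.chunk("First sentence. Second sentence. Third sentence.", {
  size: 40,
  strategy: "sentence"
});
const tidy = await p.clean("<p>Hello&nbsp;world</p>", {
  removeHtml: true,
  decodeHtmlEntities: true
});

// LLM-backed operations need WebGPU and a loaded model.
if (await p.checkWebGPU()) {
  await p.loadModel(); // default: "Llama-3.2-1B-Instruct-q4f16_1-MLC"
  const contact = await p.extract(tidy, {
    what: "contact details",
    format: "json",
    fields: ["name", "email"]
  });
  console.log(contact, p.getLogger().getStats());
}

Note that clean() without useLLM or customInstructions stays rule-based even when a model is loaded, so the call above never triggers inference; extract() always requires the model.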