client-llm-preprocessor 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +162 -0
- package/dist/index.d.ts +1319 -0
- package/dist/index.js +1040 -0
- package/package.json +67 -0
package/dist/index.js
ADDED
@@ -0,0 +1,1040 @@
// src/engine.js
import * as webllm from "@mlc-ai/web-llm";

// src/utils/logger.js
var InternalLogger = class {
  constructor(options = {}) {
    this.enabled = options.enabled !== false;
    this.verbose = options.verbose || false;
    this.logLevel = options.logLevel || "info";
    this.logs = [];
    this.maxLogs = options.maxLogs || 1e3;
    this.onLogCallback = options.onLogCallback || null;
  }
  /**
   * Enable or disable logging
   */
  setEnabled(enabled) {
    this.enabled = enabled;
  }
  /**
   * Set verbosity level
   */
  setVerbose(verbose) {
    this.verbose = verbose;
  }
  /**
   * Log an event with metadata
   */
  log(level, category, message, data = {}) {
    if (!this.enabled) return;
    const logEntry = {
      timestamp: (/* @__PURE__ */ new Date()).toISOString(),
      level,
      category,
      message,
      data,
      stackTrace: this.verbose ? new Error().stack : void 0
    };
    this.logs.push(logEntry);
    if (this.logs.length > this.maxLogs) {
      this.logs.shift();
    }
    if (this.shouldLog(level)) {
      const prefix = `[${level.toUpperCase()}] [${category}]`;
      console.log(`${prefix} ${message}`, data);
    }
    if (this.onLogCallback) {
      this.onLogCallback(logEntry);
    }
  }
  /**
   * Check if log level should be output
   */
  shouldLog(level) {
    const levels = { debug: 0, info: 1, warn: 2, error: 3 };
    return levels[level] >= levels[this.logLevel];
  }
  /**
   * Log prompt construction
   */
  logPromptConstruction(operation, originalPrompt, finalPrompt, options = {}) {
    this.log("debug", "PROMPT", "Constructing prompt", {
      operation,
      originalLength: originalPrompt.length,
      finalLength: finalPrompt.length,
      options,
      promptPreview: finalPrompt.substring(0, 200) + "..."
    });
  }
  /**
   * Log token generation (if streaming available)
   */
  logTokenGeneration(token, cumulativeText, tokenIndex) {
    if (this.verbose) {
      this.log("debug", "TOKEN", `Generated token ${tokenIndex}`, {
        token,
        cumulativeLength: cumulativeText.length,
        tokenIndex
      });
    }
  }
  /**
   * Log LLM inference start
   */
  logInferenceStart(prompt, options) {
    this.log("info", "INFERENCE", "Starting LLM inference", {
      promptLength: prompt.length,
      temperature: options.temperature,
      maxTokens: options.maxTokens,
      promptPreview: prompt.substring(0, 100) + "..."
    });
  }
  /**
   * Log LLM inference completion
   */
  logInferenceComplete(response, duration, tokenCount) {
    this.log("info", "INFERENCE", "LLM inference completed", {
      responseLength: response.length,
      duration: `${duration}ms`,
      estimatedTokens: tokenCount,
      responsePreview: response.substring(0, 200) + "..."
    });
  }
  /**
   * Log validation step
   */
  logValidation(step, input, output, isValid, error = null) {
    this.log(isValid ? "info" : "warn", "VALIDATION", `Validation: ${step}`, {
      inputPreview: typeof input === "string" ? input.substring(0, 100) : input,
      outputPreview: typeof output === "string" ? output.substring(0, 100) : output,
      isValid,
      error: error?.message
    });
  }
  /**
   * Log pipeline step
   */
  logPipelineStep(stepIndex, stepName, input, output, duration) {
    this.log("info", "PIPELINE", `Pipeline step ${stepIndex + 1}: ${stepName}`, {
      inputLength: typeof input === "string" ? input.length : "N/A",
      outputLength: typeof output === "string" ? output.length : "N/A",
      duration: `${duration}ms`,
      inputPreview: typeof input === "string" ? input.substring(0, 50) : input,
      outputPreview: typeof output === "string" ? output.substring(0, 50) : output
    });
  }
  /**
   * Log error with context
   */
  logError(operation, error, context = {}) {
    this.log("error", "ERROR", `Error in ${operation}`, {
      error: error.message,
      stack: error.stack,
      context
    });
  }
  /**
   * Log performance metrics
   */
  logPerformance(operation, metrics) {
    this.log("info", "PERFORMANCE", `Performance: ${operation}`, metrics);
  }
  /**
   * Get all logs
   */
  getLogs(filter = {}) {
    let filtered = [...this.logs];
    if (filter.level) {
      filtered = filtered.filter((log) => log.level === filter.level);
    }
    if (filter.category) {
      filtered = filtered.filter((log) => log.category === filter.category);
    }
    if (filter.since) {
      const sinceTime = new Date(filter.since).getTime();
      filtered = filtered.filter((log) => new Date(log.timestamp).getTime() >= sinceTime);
    }
    return filtered;
  }
  /**
   * Get logs as formatted string
   */
  getLogsAsString(filter = {}) {
    const logs = this.getLogs(filter);
    return logs.map((log) => {
      return `[${log.timestamp}] [${log.level}] [${log.category}] ${log.message}`;
    }).join("\n");
  }
  /**
   * Clear logs
   */
  clear() {
    this.logs = [];
  }
  /**
   * Export logs as JSON
   */
  exportLogs() {
    return JSON.stringify(this.logs, null, 2);
  }
  /**
   * Get summary statistics
   */
  getStats() {
    const stats = {
      totalLogs: this.logs.length,
      byLevel: {},
      byCategory: {},
      errors: 0,
      warnings: 0,
      timeRange: {
        start: this.logs[0]?.timestamp,
        end: this.logs[this.logs.length - 1]?.timestamp
      }
    };
    this.logs.forEach((log) => {
      stats.byLevel[log.level] = (stats.byLevel[log.level] || 0) + 1;
      stats.byCategory[log.category] = (stats.byCategory[log.category] || 0) + 1;
      if (log.level === "error") stats.errors++;
      if (log.level === "warn") stats.warnings++;
    });
    return stats;
  }
};
var defaultLogger = null;
function getLogger(options = {}) {
  if (!defaultLogger) {
    defaultLogger = new InternalLogger(options);
  }
  return defaultLogger;
}
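// Usage sketch for the logger above (values like `docId` are illustrative):
//   const logger = getLogger({ logLevel: "debug", maxLogs: 500 });
//   logger.log("info", "APP", "Processing started", { docId: 42 });
//   logger.getLogs({ level: "error" });  // filter by level, category, or since
//   logger.getStats();                   // counts by level/category plus time range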
// src/engine.js
var LLMEngine = class {
  constructor(options = {}) {
    this.engine = null;
    this.model = null;
    this.logger = options.logger || getLogger(options.loggerOptions);
    this.streamingEnabled = options.streaming !== false;
  }
  /**
   * Load a WebLLM model
   * @param {string} model - Model name (default: "Llama-3.2-1B-Instruct-q4f16_1-MLC")
   * @returns {Promise<void>}
   */
  async loadModel(model = "Llama-3.2-1B-Instruct-q4f16_1-MLC") {
    if (this.engine && this.model === model) {
      this.logger.log("info", "MODEL", "Model already loaded, skipping");
      return;
    }
    const startTime = Date.now();
    this.logger.log("info", "MODEL", `Loading model: ${model}`, { model });
    try {
      this.engine = await webllm.CreateMLCEngine(model, {
        initProgressCallback: (report) => {
          if (report.progress) {
            const progress = (report.progress * 100).toFixed(1);
            this.logger.log("info", "MODEL", `Loading progress: ${progress}%`, {
              progress: parseFloat(progress),
              report
            });
          }
        }
      });
      this.model = model;
      const loadTime = Date.now() - startTime;
      this.logger.log("info", "MODEL", "Model loaded successfully", {
        model,
        loadTime: `${loadTime}ms`
      });
    } catch (error) {
      this.logger.logError("loadModel", error, { model });
      throw new Error(`Failed to load model: ${error.message}`);
    }
  }
  /**
   * Run inference with the loaded model
   * Captures detailed internal state including token-by-token generation
   * @param {string} prompt - The prompt to send to the model
   * @param {Object} options - Generation options
   * @returns {Promise<string>}
   */
  async run(prompt, options = {}) {
    if (!this.engine) {
      throw new Error("Model not loaded. Call loadModel() first.");
    }
    const {
      temperature = 0.7,
      maxTokens = 512,
      stopSequences = [],
      stream = this.streamingEnabled
      // Try streaming for token-by-token logging
    } = options;
    const startTime = Date.now();
    this.logger.logInferenceStart(prompt, { temperature, maxTokens, stopSequences, stream });
    try {
      let fullResponse = "";
      let tokenCount = 0;
      const tokens = [];
      if (stream && this.engine.chat?.completions?.createStream) {
        this.logger.log("info", "INFERENCE", "Using streaming mode for token-by-token logging");
        try {
          const stream2 = await this.engine.chat.completions.createStream({
            messages: [{ role: "user", content: prompt }],
            temperature,
            max_tokens: maxTokens,
            stop: stopSequences.length > 0 ? stopSequences : void 0
          });
          for await (const chunk2 of stream2) {
            const delta = chunk2.choices?.[0]?.delta?.content || "";
            if (delta) {
              fullResponse += delta;
              tokenCount++;
              tokens.push(delta);
              this.logger.logTokenGeneration(delta, fullResponse, tokenCount);
            }
          }
          this.logger.log("info", "INFERENCE", "Streaming completed", {
            totalTokens: tokenCount,
            responseLength: fullResponse.length
          });
        } catch (streamError) {
          this.logger.log("warn", "INFERENCE", "Streaming failed, falling back to non-streaming", {
            error: streamError.message
          });
          return await this.runNonStreaming(prompt, options, startTime);
        }
      } else {
        return await this.runNonStreaming(prompt, options, startTime);
      }
      const duration = Date.now() - startTime;
      this.logger.logInferenceComplete(fullResponse, duration, tokenCount);
      this.logger.log("debug", "INFERENCE", "Token sequence captured", {
        tokenCount,
        tokens: tokens.slice(0, 20),
        // First 20 tokens
        fullSequenceLength: tokens.length
      });
      return fullResponse;
    } catch (error) {
      const duration = Date.now() - startTime;
      this.logger.logError("run", error, {
        promptLength: prompt.length,
        duration: `${duration}ms`,
        options
      });
      throw new Error(`Inference failed: ${error.message}`);
    }
  }
  /**
   * Non-streaming inference (fallback)
   * @private
   */
  async runNonStreaming(prompt, options, startTime) {
    const {
      temperature = 0.7,
      maxTokens = 512,
      stopSequences = []
    } = options;
    this.logger.log("info", "INFERENCE", "Using non-streaming mode");
    const response = await this.engine.chat.completions.create({
      messages: [{ role: "user", content: prompt }],
      temperature,
      max_tokens: maxTokens,
      stop: stopSequences.length > 0 ? stopSequences : void 0
    });
    const result = response.choices[0].message.content;
    const duration = Date.now() - (startTime || Date.now());
    const estimatedTokens = Math.ceil(result.length / 4);
    this.logger.logInferenceComplete(result, duration, estimatedTokens);
    return result;
  }
  /**
   * Check if model is loaded
   * @returns {boolean}
   */
  isLoaded() {
    return this.engine !== null;
  }
  /**
   * Get the logger instance
   * @returns {InternalLogger}
   */
  getLogger() {
    return this.logger;
  }
  /**
   * Enable/disable streaming for token-by-token logging
   */
  setStreaming(enabled) {
    this.streamingEnabled = enabled;
    this.logger.log("info", "ENGINE", `Streaming ${enabled ? "enabled" : "disabled"}`);
  }
};
|
+
// src/preprocess/clean-rules.js
|
|
377
|
+
function cleanWithRules(text, options = {}) {
|
|
378
|
+
const {
|
|
379
|
+
removeHtml = false,
|
|
380
|
+
removeUrls = false,
|
|
381
|
+
removeExtraWhitespace = false,
|
|
382
|
+
removeLineBreaks = false,
|
|
383
|
+
removeSpecialChars = false,
|
|
384
|
+
decodeHtmlEntities = false
|
|
385
|
+
} = options;
|
|
386
|
+
let cleaned = text;
|
|
387
|
+
if (decodeHtmlEntities) {
|
|
388
|
+
cleaned = cleaned.replace(/ /g, " ").replace(/&/g, "&").replace(/</g, "<").replace(/>/g, ">").replace(/"/g, '"').replace(/'/g, "'").replace(/'/g, "'").replace(///g, "/");
|
|
389
|
+
}
|
|
390
|
+
if (removeHtml) {
|
|
391
|
+
cleaned = cleaned.replace(/<[^>]+>/g, "");
|
|
392
|
+
}
|
|
393
|
+
if (removeUrls) {
|
|
394
|
+
cleaned = cleaned.replace(/https?:\/\/[^\s]+/g, "");
|
|
395
|
+
}
|
|
396
|
+
if (removeLineBreaks) {
|
|
397
|
+
cleaned = cleaned.replace(/[\r\n]+/g, " ");
|
|
398
|
+
}
|
|
399
|
+
if (removeExtraWhitespace) {
|
|
400
|
+
cleaned = cleaned.replace(/[ \t]+/g, " ");
|
|
401
|
+
if (!removeLineBreaks) {
|
|
402
|
+
cleaned = cleaned.split("\n").map((line) => line.trim()).join("\n");
|
|
403
|
+
}
|
|
404
|
+
if (!removeLineBreaks) {
|
|
405
|
+
cleaned = cleaned.replace(/\n{3,}/g, "\n\n");
|
|
406
|
+
}
|
|
407
|
+
cleaned = cleaned.trim();
|
|
408
|
+
}
|
|
409
|
+
if (removeSpecialChars) {
|
|
410
|
+
cleaned = cleaned.replace(/[^\w\s.,!?;:()\-'"]/g, "");
|
|
411
|
+
}
|
|
412
|
+
return cleaned;
|
|
413
|
+
}
|
|
414
|
+
|
|
415
|
+
// src/preprocess/clean.js
|
|
416
|
+
async function clean(engine, text, options = {}) {
|
|
417
|
+
const {
|
|
418
|
+
removeHtml = false,
|
|
419
|
+
removeUrls = false,
|
|
420
|
+
removeExtraWhitespace = false,
|
|
421
|
+
removeLineBreaks = false,
|
|
422
|
+
removeSpecialChars = false,
|
|
423
|
+
decodeHtmlEntities = false,
|
|
424
|
+
customInstructions = "",
|
|
425
|
+
useLLM = null
|
|
426
|
+
// null = auto-detect, true = force LLM, false = force rules
|
|
427
|
+
} = options;
|
|
428
|
+
const shouldUseLLM = useLLM !== false && engine !== null && engine.isLoaded() && (useLLM === true || customInstructions !== "");
|
|
429
|
+
if (!shouldUseLLM) {
|
|
430
|
+
const logger2 = engine?.getLogger();
|
|
431
|
+
if (logger2) {
|
|
432
|
+
logger2.log("info", "CLEAN", "Using rule-based cleaning (no LLM)", {
|
|
433
|
+
reason: !engine ? "No engine" : !engine.isLoaded() ? "Model not loaded" : "useLLM=false",
|
|
434
|
+
options: { removeHtml, removeUrls, removeExtraWhitespace, removeLineBreaks, removeSpecialChars, decodeHtmlEntities }
|
|
435
|
+
});
|
|
436
|
+
}
|
|
437
|
+
return cleanWithRules(text, {
|
|
438
|
+
removeHtml,
|
|
439
|
+
removeUrls,
|
|
440
|
+
removeExtraWhitespace,
|
|
441
|
+
removeLineBreaks,
|
|
442
|
+
removeSpecialChars,
|
|
443
|
+
decodeHtmlEntities
|
|
444
|
+
});
|
|
445
|
+
}
|
|
446
|
+
const logger = engine.getLogger();
|
|
447
|
+
const cleaningSteps = [];
|
|
448
|
+
if (removeHtml) cleaningSteps.push("HTML tags");
|
|
449
|
+
if (removeUrls) cleaningSteps.push("URLs");
|
|
450
|
+
if (removeExtraWhitespace) cleaningSteps.push("extra whitespace");
|
|
451
|
+
if (removeLineBreaks) cleaningSteps.push("line breaks");
|
|
452
|
+
if (removeSpecialChars) cleaningSteps.push("special characters");
|
|
453
|
+
if (decodeHtmlEntities) cleaningSteps.push("decode HTML entities");
|
|
454
|
+
let originalPrompt = `Clean the following text`;
|
|
455
|
+
let prompt = originalPrompt;
|
|
456
|
+
if (cleaningSteps.length > 0) {
|
|
457
|
+
prompt += ` by removing: ${cleaningSteps.join(", ")}`;
|
|
458
|
+
} else if (!customInstructions) {
|
|
459
|
+
return text;
|
|
460
|
+
}
|
|
461
|
+
prompt += `. IMPORTANT: Do NOT modify the meaning or remove important information. Only remove what was requested.`;
|
|
462
|
+
if (customInstructions) {
|
|
463
|
+
prompt += ` Also: ${customInstructions}`;
|
|
464
|
+
}
|
|
465
|
+
prompt += `:
|
|
466
|
+
|
|
467
|
+
${text}`;
|
|
468
|
+
logger.logPromptConstruction("clean", originalPrompt, prompt, options);
|
|
469
|
+
logger.log("info", "CLEAN", "Using LLM-based cleaning");
|
|
470
|
+
const result = await engine.run(prompt, { temperature: 0.3 });
|
|
471
|
+
const cleaned = result.trim();
|
|
472
|
+
logger.log("info", "CLEAN", "LLM cleaning completed", {
|
|
473
|
+
originalLength: text.length,
|
|
474
|
+
finalLength: cleaned.length
|
|
475
|
+
});
|
|
476
|
+
return cleaned;
|
|
477
|
+
}
|
|
478
|
+
|
|
479
|
+
// src/preprocess/chunk.js
|
|
480
|
+
function chunk(text, options = {}) {
|
|
481
|
+
const {
|
|
482
|
+
size = 500,
|
|
483
|
+
// Character count per chunk
|
|
484
|
+
overlap = 0,
|
|
485
|
+
// Overlap between chunks (in characters)
|
|
486
|
+
strategy = "character"
|
|
487
|
+
// "character", "sentence", "word"
|
|
488
|
+
} = options;
|
|
489
|
+
if (!text || text.length === 0) {
|
|
490
|
+
return [];
|
|
491
|
+
}
|
|
492
|
+
const chunks = [];
|
|
493
|
+
if (strategy === "character") {
|
|
494
|
+
for (let i = 0; i < text.length; i += size - overlap) {
|
|
495
|
+
chunks.push(text.slice(i, i + size));
|
|
496
|
+
}
|
|
497
|
+
} else if (strategy === "sentence") {
|
|
498
|
+
const sentences = text.match(/[^.!?]+[.!?]+/g) || [text];
|
|
499
|
+
let currentChunk = "";
|
|
500
|
+
for (const sentence of sentences) {
|
|
501
|
+
if (currentChunk.length + sentence.length > size && currentChunk) {
|
|
502
|
+
chunks.push(currentChunk.trim());
|
|
503
|
+
currentChunk = sentence;
|
|
504
|
+
} else {
|
|
505
|
+
currentChunk += sentence;
|
|
506
|
+
}
|
|
507
|
+
}
|
|
508
|
+
if (currentChunk.trim()) {
|
|
509
|
+
chunks.push(currentChunk.trim());
|
|
510
|
+
}
|
|
511
|
+
} else if (strategy === "word") {
|
|
512
|
+
const words = text.split(/\s+/);
|
|
513
|
+
let currentChunk = [];
|
|
514
|
+
let currentSize = 0;
|
|
515
|
+
for (const word of words) {
|
|
516
|
+
if (currentSize + word.length > size && currentChunk.length > 0) {
|
|
517
|
+
chunks.push(currentChunk.join(" "));
|
|
518
|
+
currentChunk = [word];
|
|
519
|
+
currentSize = word.length;
|
|
520
|
+
} else {
|
|
521
|
+
currentChunk.push(word);
|
|
522
|
+
currentSize += word.length + 1;
|
|
523
|
+
}
|
|
524
|
+
}
|
|
525
|
+
if (currentChunk.length > 0) {
|
|
526
|
+
chunks.push(currentChunk.join(" "));
|
|
527
|
+
}
|
|
528
|
+
}
|
|
529
|
+
return chunks.filter((ch) => ch.length > 0);
|
|
530
|
+
}
|
|
531
|
+
|
|
532
// src/utils/validation.js
function validateJSON(text, expectedFields = []) {
  try {
    const jsonMatch = text.match(/```(?:json)?\s*([\s\S]*?)\s*```/);
    const cleanText = jsonMatch ? jsonMatch[1] : text;
    const parsed = JSON.parse(cleanText);
    if (expectedFields.length > 0) {
      const missingFields = expectedFields.filter((field) => !(field in parsed));
      if (missingFields.length > 0) {
        return {
          isValid: false,
          error: `Missing required fields: ${missingFields.join(", ")}`,
          data: parsed
        };
      }
    }
    return {
      isValid: true,
      data: parsed
    };
  } catch (error) {
    return {
      isValid: false,
      error: `Invalid JSON: ${error.message}`,
      data: null
    };
  }
}
var validators = {
  email: (value) => /^[^\s@]+@[^\s@]+\.[^\s@]+$/.test(value),
  phone: (value) => /^[\d\s\-\+\(\)]+$/.test(value) && value.replace(/\D/g, "").length >= 7,
  url: (value) => /^https?:\/\/.+/.test(value)
};
function verifyExtraction(extracted, sourceText, fields = []) {
  const sourceLower = sourceText.toLowerCase();
  const issues = [];
  if (typeof extracted === "object" && extracted !== null) {
    for (const [key, value] of Object.entries(extracted)) {
      if (fields.length > 0 && !fields.includes(key)) {
        continue;
      }
      if (value && typeof value === "string" && value.trim().length > 0) {
        const valueLower = value.toLowerCase();
        let foundInSource = sourceLower.includes(valueLower);
        if (!foundInSource) {
          const fieldType = key.toLowerCase();
          if (validators[fieldType]) {
            if (!validators[fieldType](value)) {
              issues.push({
                field: key,
                value,
                reason: `Invalid ${fieldType} format`
              });
              continue;
            }
          }
          const words = valueLower.split(/\s+/).filter((w) => w.length > 3);
          const matchedWords = words.filter((word) => sourceLower.includes(word));
          const matchRatio = words.length > 0 ? matchedWords.length / words.length : 0;
          foundInSource = matchRatio >= 0.8;
        }
        if (!foundInSource) {
          issues.push({
            field: key,
            value,
            reason: "Value not found in source text (possible hallucination)"
          });
        }
      }
    }
  }
  return {
    isValid: issues.length === 0,
    issues,
    extracted
  };
}
function normalizeExtracted(extracted) {
  if (typeof extracted !== "object" || extracted === null) {
    return extracted;
  }
  const normalized = {};
  for (const [key, value] of Object.entries(extracted)) {
    if (typeof value === "string") {
      normalized[key] = value.trim().replace(/^["']|["']$/g, "").replace(/^[-•*]\s*/, "").trim();
    } else {
      normalized[key] = value;
    }
  }
  return normalized;
}
function validateExtraction(llmOutput, sourceText, options = {}) {
  const {
    format = "json",
    fields = [],
    strict = true
    // If true, reject if validation fails
  } = options;
  let parsed = llmOutput.trim();
  if (format === "json") {
    const jsonResult = validateJSON(parsed, fields);
    if (!jsonResult.isValid) {
      return {
        isValid: false,
        error: jsonResult.error,
        raw: llmOutput,
        validated: null
      };
    }
    parsed = jsonResult.data;
  }
  const normalized = normalizeExtracted(parsed);
  const verification = verifyExtraction(normalized, sourceText, fields);
  if (strict && !verification.isValid) {
    return {
      isValid: false,
      error: "Extraction validation failed",
      issues: verification.issues,
      raw: llmOutput,
      validated: null
    };
  }
  return {
    isValid: true,
    raw: llmOutput,
    validated: normalized,
    warnings: verification.issues
    // Include warnings even if not strict
  };
}
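// Usage sketch: validating an LLM extraction against its source text.
//   const src = "Contact Jane Doe at jane@example.com.";
//   validateExtraction('{"name": "Jane Doe", "email": "jane@example.com"}', src,
//     { format: "json", fields: ["name", "email"], strict: true });
//   // => { isValid: true, validated: { name: "Jane Doe", email: "jane@example.com" }, ... }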
663
|
+
// src/preprocess/extract.js
|
|
664
|
+
async function extract(engine, text, options = {}) {
|
|
665
|
+
const logger = engine.getLogger();
|
|
666
|
+
const {
|
|
667
|
+
what = "key information",
|
|
668
|
+
// What to extract
|
|
669
|
+
format = "text",
|
|
670
|
+
// "text", "json", "list"
|
|
671
|
+
fields = [],
|
|
672
|
+
// Specific fields to extract (for JSON)
|
|
673
|
+
validate = true,
|
|
674
|
+
// Enable rule-based validation
|
|
675
|
+
strict = false
|
|
676
|
+
// If true, throw error on validation failure
|
|
677
|
+
} = options;
|
|
678
|
+
const originalPrompt = `Extract ${what} from the following text`;
|
|
679
|
+
let prompt = originalPrompt;
|
|
680
|
+
if (format === "json") {
|
|
681
|
+
if (fields.length > 0) {
|
|
682
|
+
prompt += ` in JSON format with these fields: ${fields.join(", ")}`;
|
|
683
|
+
} else {
|
|
684
|
+
prompt += ` in JSON format`;
|
|
685
|
+
}
|
|
686
|
+
} else if (format === "list") {
|
|
687
|
+
prompt += ` as a list`;
|
|
688
|
+
}
|
|
689
|
+
prompt += `:
|
|
690
|
+
|
|
691
|
+
${text}`;
|
|
692
|
+
logger.logPromptConstruction("extract", originalPrompt, prompt, options);
|
|
693
|
+
const llmResult = await engine.run(prompt, { temperature: 0.3 });
|
|
694
|
+
const rawResult = llmResult.trim();
|
|
695
|
+
logger.log("info", "EXTRACT", "LLM extraction completed", {
|
|
696
|
+
format,
|
|
697
|
+
fields,
|
|
698
|
+
resultLength: rawResult.length
|
|
699
|
+
});
|
|
700
|
+
if (validate && format === "json") {
|
|
701
|
+
logger.log("info", "VALIDATION", "Starting rule-based validation");
|
|
702
|
+
const validation = validateExtraction(rawResult, text, {
|
|
703
|
+
format,
|
|
704
|
+
fields,
|
|
705
|
+
strict
|
|
706
|
+
});
|
|
707
|
+
logger.logValidation(
|
|
708
|
+
"extract",
|
|
709
|
+
text,
|
|
710
|
+
validation.validated || rawResult,
|
|
711
|
+
validation.isValid,
|
|
712
|
+
validation.error ? new Error(validation.error) : null
|
|
713
|
+
);
|
|
714
|
+
if (!validation.isValid) {
|
|
715
|
+
if (strict) {
|
|
716
|
+
throw new Error(`Extraction validation failed: ${validation.error}`);
|
|
717
|
+
} else {
|
|
718
|
+
logger.log("warn", "VALIDATION", "Validation failed but continuing (non-strict mode)", {
|
|
719
|
+
error: validation.error,
|
|
720
|
+
issues: validation.issues
|
|
721
|
+
});
|
|
722
|
+
return rawResult;
|
|
723
|
+
}
|
|
724
|
+
}
|
|
725
|
+
if (validation.validated) {
|
|
726
|
+
logger.log("info", "VALIDATION", "Validation passed, returning validated data");
|
|
727
|
+
return JSON.stringify(validation.validated, null, 2);
|
|
728
|
+
}
|
|
729
|
+
}
|
|
730
|
+
return rawResult;
|
|
731
|
+
}
|
|
732
|
+
|
|
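// Usage sketch for extract() (assumes a loaded engine; `emailText` is illustrative):
//   const json = await extract(engine, emailText, {
//     what: "contact details", format: "json",
//     fields: ["name", "email"], strict: false
//   });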
733
|
+
// src/index.js
|
|
734
|
+
var Preprocessor = class {
|
|
735
|
+
constructor(options = {}) {
|
|
736
|
+
this.engine = new LLMEngine(options);
|
|
737
|
+
this.isModelLoaded = false;
|
|
738
|
+
this.logger = this.engine.getLogger();
|
|
739
|
+
}
|
|
740
|
+
/**
|
|
741
|
+
* Load the WebLLM model
|
|
742
|
+
* @param {string} model - Model name (default: "Llama-3.2-1B-Instruct-q4f16_1-MLC")
|
|
743
|
+
* @returns {Promise<void>}
|
|
744
|
+
*/
|
|
745
|
+
async loadModel(model) {
|
|
746
|
+
await this.engine.loadModel(model);
|
|
747
|
+
this.isModelLoaded = true;
|
|
748
|
+
this.logger.log("info", "PREPROCESSOR", "Model loaded and ready");
|
|
749
|
+
}
|
|
750
|
+
/**
|
|
751
|
+
* Check if WebGPU is supported in the current environment
|
|
752
|
+
* @returns {Promise<boolean>}
|
|
753
|
+
*/
|
|
754
|
+
async checkWebGPU() {
|
|
755
|
+
if (typeof navigator === "undefined" || !navigator.gpu) {
|
|
756
|
+
return false;
|
|
757
|
+
}
|
|
758
|
+
try {
|
|
759
|
+
const adapter = await navigator.gpu.requestAdapter();
|
|
760
|
+
return !!adapter;
|
|
761
|
+
} catch (e) {
|
|
762
|
+
return false;
|
|
763
|
+
}
|
|
764
|
+
}
|
|
765
|
+
/**
|
|
766
|
+
* Get the logger instance for accessing internal logs
|
|
767
|
+
* @returns {InternalLogger}
|
|
768
|
+
*/
|
|
769
|
+
getLogger() {
|
|
770
|
+
return this.logger;
|
|
771
|
+
}
|
|
772
|
+
/**
|
|
773
|
+
* Enable/disable internal logging
|
|
774
|
+
*/
|
|
775
|
+
setLogging(enabled, verbose = false) {
|
|
776
|
+
this.logger.setEnabled(enabled);
|
|
777
|
+
this.logger.setVerbose(verbose);
|
|
778
|
+
this.logger.log("info", "PREPROCESSOR", `Logging ${enabled ? "enabled" : "disabled"}`, { verbose });
|
|
779
|
+
}
|
|
780
|
+
/**
|
|
781
|
+
* Ensure model is loaded
|
|
782
|
+
* @private
|
|
783
|
+
*/
|
|
784
|
+
_ensureLoaded() {
|
|
785
|
+
if (!this.isModelLoaded && !this.engine.isLoaded()) {
|
|
786
|
+
throw new Error(
|
|
787
|
+
"Model not loaded. Call loadModel() first before using preprocessing functions."
|
|
788
|
+
);
|
|
789
|
+
}
|
|
790
|
+
}
|
|
791
|
+
/**
|
|
792
|
+
* Clean text
|
|
793
|
+
* Works with or without LLM model loaded
|
|
794
|
+
* Uses rule-based cleaning if model not loaded, LLM if available
|
|
795
|
+
* All options are opt-in (default: false) - user chooses what to remove
|
|
796
|
+
* @param {string} text - Text to clean
|
|
797
|
+
* @param {Object} options - Cleaning options (all optional, default: false)
|
|
798
|
+
* @param {boolean} options.removeHtml - Remove HTML tags (default: false)
|
|
799
|
+
* @param {boolean} options.removeUrls - Remove URLs (default: false)
|
|
800
|
+
* @param {boolean} options.removeExtraWhitespace - Remove extra whitespace (default: false)
|
|
801
|
+
* @param {boolean} options.removeLineBreaks - Remove line breaks (default: false)
|
|
802
|
+
* @param {boolean} options.removeSpecialChars - Remove special characters (default: false)
|
|
803
|
+
* @param {boolean} options.decodeHtmlEntities - Decode HTML entities like & (default: false)
|
|
804
|
+
* @param {string} options.customInstructions - Additional cleaning instructions (requires LLM)
|
|
805
|
+
* @param {boolean} options.useLLM - Force LLM usage (requires model loaded)
|
|
806
|
+
* @returns {Promise<string>|string}
|
|
807
|
+
*
|
|
808
|
+
* @example
|
|
809
|
+
* // No options - returns text as-is
|
|
810
|
+
* await p.clean(text);
|
|
811
|
+
*
|
|
812
|
+
* // User chooses what to remove
|
|
813
|
+
* await p.clean(text, { removeHtml: true, removeExtraWhitespace: true });
|
|
814
|
+
*
|
|
815
|
+
* // Use LLM for semantic cleaning
|
|
816
|
+
* await p.clean(text, { useLLM: true, customInstructions: "Remove all dates" });
|
|
817
|
+
*/
|
|
818
|
+
async clean(text, options = {}) {
|
|
819
|
+
if (options.useLLM === true || options.customInstructions) {
|
|
820
|
+
this._ensureLoaded();
|
|
821
|
+
}
|
|
822
|
+
return await clean(this.engine, text, options);
|
|
823
|
+
}
|
|
824
|
+
/**
|
|
825
|
+
* Extract information from text
|
|
826
|
+
* @param {string} text - Text to extract from
|
|
827
|
+
* @param {Object} options - Extraction options
|
|
828
|
+
* @returns {Promise<string>}
|
|
829
|
+
*/
|
|
830
|
+
async extract(text, options = {}) {
|
|
831
|
+
this._ensureLoaded();
|
|
832
|
+
return await extract(this.engine, text, options);
|
|
833
|
+
}
|
|
834
|
+
/**
|
|
835
|
+
* Chunk text into smaller pieces (non-LLM, fast operation)
|
|
836
|
+
* Works immediately, no model needed
|
|
837
|
+
* @param {string} text - Text to chunk
|
|
838
|
+
* @param {Object} options - Chunking options
|
|
839
|
+
* @returns {string[]}
|
|
840
|
+
*/
|
|
841
|
+
chunk(text, options = {}) {
|
|
842
|
+
return chunk(text, options);
|
|
843
|
+
}
|
|
844
|
+
/**
|
|
845
|
+
* Run a custom prompt on text
|
|
846
|
+
* @param {string} text - Input text
|
|
847
|
+
* @param {string|Object} instruction - Custom instruction or config object
|
|
848
|
+
* @param {Object} options - Generation options
|
|
849
|
+
* @returns {Promise<string>}
|
|
850
|
+
*/
|
|
851
|
+
async prompt(text, instruction, options = {}) {
|
|
852
|
+
this._ensureLoaded();
|
|
853
|
+
let promptText;
|
|
854
|
+
let genOptions = { ...options };
|
|
855
|
+
if (typeof instruction === "string") {
|
|
856
|
+
promptText = `${instruction}
|
|
857
|
+
|
|
858
|
+
${text}`;
|
|
859
|
+
} else if (typeof instruction === "object") {
|
|
860
|
+
const { instruction: inst, format, temperature, maxTokens } = instruction;
|
|
861
|
+
promptText = inst;
|
|
862
|
+
if (format) {
|
|
863
|
+
if (typeof format === "object") {
|
|
864
|
+
promptText += `
|
|
865
|
+
|
|
866
|
+
Return the result in JSON format with these fields: ${JSON.stringify(format)}`;
|
|
867
|
+
} else {
|
|
868
|
+
promptText += `
|
|
869
|
+
|
|
870
|
+
Format: ${format}`;
|
|
871
|
+
}
|
|
872
|
+
}
|
|
873
|
+
promptText += `
|
|
874
|
+
|
|
875
|
+
${text}`;
|
|
876
|
+
if (temperature !== void 0) genOptions.temperature = temperature;
|
|
877
|
+
if (maxTokens !== void 0) genOptions.maxTokens = maxTokens;
|
|
878
|
+
} else {
|
|
879
|
+
throw new Error("Instruction must be a string or object");
|
|
880
|
+
}
|
|
881
|
+
return await this.engine.run(promptText, genOptions);
|
|
882
|
+
}
|
|
883
|
+
/**
|
|
884
|
+
* Enforce correct pipeline ordering
|
|
885
|
+
* Always ensures: clean → extract (if both present)
|
|
886
|
+
* @private
|
|
887
|
+
*/
|
|
888
|
+
_enforcePipelineOrder(pipeline) {
|
|
889
|
+
const ordered = [...pipeline];
|
|
890
|
+
const cleanIndex = ordered.findIndex(
|
|
891
|
+
(step) => step === "clean" || typeof step === "object" && step.clean !== void 0
|
|
892
|
+
);
|
|
893
|
+
const extractIndex = ordered.findIndex(
|
|
894
|
+
(step) => step === "extract" || typeof step === "object" && step.extract !== void 0
|
|
895
|
+
);
|
|
896
|
+
if (cleanIndex !== -1 && extractIndex !== -1 && cleanIndex > extractIndex) {
|
|
897
|
+
this.logger.log("warn", "PIPELINE", "Reordering pipeline: clean must come before extract", {
|
|
898
|
+
originalOrder: ordered.map((s) => typeof s === "string" ? s : Object.keys(s)[0]),
|
|
899
|
+
reordered: true
|
|
900
|
+
});
|
|
901
|
+
const cleanStep = ordered.splice(cleanIndex, 1)[0];
|
|
902
|
+
const newExtractIndex = ordered.findIndex(
|
|
903
|
+
(step) => step === "extract" || typeof step === "object" && step.extract !== void 0
|
|
904
|
+
);
|
|
905
|
+
ordered.splice(newExtractIndex, 0, cleanStep);
|
|
906
|
+
}
|
|
907
|
+
return ordered;
|
|
908
|
+
}
|
|
909
|
+
/**
|
|
910
|
+
* Process text with multiple operations in a pipeline
|
|
911
|
+
* Automatically enforces correct ordering (clean → extract)
|
|
912
|
+
* @param {string} text - Input text
|
|
913
|
+
* @param {Array} pipeline - Array of operations to apply
|
|
914
|
+
* @returns {Promise<string|string[]>}
|
|
915
|
+
*
|
|
916
|
+
* @example
|
|
917
|
+
* await p.pipeline(text, [
|
|
918
|
+
* "extract", // Will be reordered to run after clean
|
|
919
|
+
* "clean",
|
|
920
|
+
* { prompt: "Rewrite in pirate style" }
|
|
921
|
+
* ]);
|
|
922
|
+
*/
|
|
923
|
+
async pipeline(text, pipeline) {
|
|
924
|
+
this._ensureLoaded();
|
|
925
|
+
if (!Array.isArray(pipeline) || pipeline.length === 0) {
|
|
926
|
+
throw new Error("Pipeline must be a non-empty array");
|
|
927
|
+
}
|
|
928
|
+
const orderedPipeline = this._enforcePipelineOrder(pipeline);
|
|
929
|
+
this.logger.log("info", "PIPELINE", "Starting pipeline execution", {
|
|
930
|
+
stepCount: orderedPipeline.length,
|
|
931
|
+
steps: orderedPipeline.map((s) => typeof s === "string" ? s : Object.keys(s)[0])
|
|
932
|
+
});
|
|
933
|
+
let result = text;
|
|
934
|
+
const startTime = Date.now();
|
|
935
|
+
for (let i = 0; i < orderedPipeline.length; i++) {
|
|
936
|
+
const step = orderedPipeline[i];
|
|
937
|
+
const stepStartTime = Date.now();
|
|
938
|
+
const stepName = typeof step === "string" ? step : Object.keys(step)[0] || "unknown";
|
|
939
|
+
try {
|
|
940
|
+
if (typeof step === "string") {
|
|
941
|
+
switch (step) {
|
|
942
|
+
case "clean":
|
|
943
|
+
result = await this.clean(result);
|
|
944
|
+
break;
|
|
945
|
+
case "extract":
|
|
946
|
+
result = await this.extract(result);
|
|
947
|
+
break;
|
|
948
|
+
case "chunk":
|
|
949
|
+
result = this.chunk(result);
|
|
950
|
+
break;
|
|
951
|
+
default:
|
|
952
|
+
throw new Error(`Unknown operation: ${step}`);
|
|
953
|
+
}
|
|
954
|
+
} else if (typeof step === "object") {
|
|
955
|
+
if (step.prompt) {
|
|
956
|
+
result = await this.prompt(result, step.prompt, step.options || {});
|
|
957
|
+
} else if (step.clean) {
|
|
958
|
+
result = await this.clean(result, step.clean);
|
|
959
|
+
} else if (step.extract) {
|
|
960
|
+
result = await this.extract(result, step.extract);
|
|
961
|
+
} else if (step.chunk) {
|
|
962
|
+
result = this.chunk(result, step.chunk);
|
|
963
|
+
} else {
|
|
964
|
+
throw new Error(`Unknown operation object: ${JSON.stringify(step)}`);
|
|
965
|
+
}
|
|
966
|
+
} else {
|
|
967
|
+
throw new Error(`Invalid pipeline step: ${step}`);
|
|
968
|
+
}
|
|
969
|
+
if (Array.isArray(result)) {
|
|
970
|
+
this.logger.log("info", "PIPELINE", "Chunking applied, stopping pipeline", {
|
|
971
|
+
chunks: result.length
|
|
972
|
+
});
|
|
973
|
+
break;
|
|
974
|
+
}
|
|
975
|
+
const stepDuration = Date.now() - stepStartTime;
|
|
976
|
+
this.logger.logPipelineStep(i, stepName, text, result, stepDuration);
|
|
977
|
+
} catch (error) {
|
|
978
|
+
this.logger.logError(`pipeline step ${i + 1} (${stepName})`, error, {
|
|
979
|
+
step,
|
|
980
|
+
inputLength: typeof text === "string" ? text.length : "N/A"
|
|
981
|
+
});
|
|
982
|
+
throw error;
|
|
983
|
+
}
|
|
984
|
+
}
|
|
985
|
+
const totalDuration = Date.now() - startTime;
|
|
986
|
+
this.logger.logPerformance("pipeline", {
|
|
987
|
+
totalSteps: orderedPipeline.length,
|
|
988
|
+
duration: `${totalDuration}ms`,
|
|
989
|
+
averageStepTime: `${(totalDuration / orderedPipeline.length).toFixed(2)}ms`
|
|
990
|
+
});
|
|
991
|
+
return result;
|
|
992
|
+
}
|
|
993
|
+
/**
|
|
994
|
+
* Process text with a simple configuration object
|
|
995
|
+
* @param {string} text - Input text
|
|
996
|
+
* @param {Object} config - Processing configuration
|
|
997
|
+
* @returns {Promise<string>}
|
|
998
|
+
*
|
|
999
|
+
* @example
|
|
1000
|
+
* await p.process(text, {
|
|
1001
|
+
* clean: true,
|
|
1002
|
+
* extract: { format: "json", fields: ["name", "email"] },
|
|
1003
|
+
* customPrompt: "Convert to bullet points"
|
|
1004
|
+
* });
|
|
1005
|
+
*/
|
|
1006
|
+
async process(text, config = {}) {
|
|
1007
|
+
this._ensureLoaded();
|
|
1008
|
+
let result = text;
|
|
1009
|
+
if (config.clean) {
|
|
1010
|
+
result = await this.clean(
|
|
1011
|
+
result,
|
|
1012
|
+
typeof config.clean === "object" ? config.clean : {}
|
|
1013
|
+
);
|
|
1014
|
+
}
|
|
1015
|
+
if (config.extract) {
|
|
1016
|
+
result = await this.extract(
|
|
1017
|
+
result,
|
|
1018
|
+
typeof config.extract === "object" ? config.extract : {}
|
|
1019
|
+
);
|
|
1020
|
+
}
|
|
1021
|
+
if (config.customPrompt) {
|
|
1022
|
+
result = await this.prompt(result, config.customPrompt, config.promptOptions || {});
|
|
1023
|
+
}
|
|
1024
|
+
if (config.chunk) {
|
|
1025
|
+
result = this.chunk(
|
|
1026
|
+
result,
|
|
1027
|
+
typeof config.chunk === "object" ? config.chunk : {}
|
|
1028
|
+
);
|
|
1029
|
+
}
|
|
1030
|
+
return result;
|
|
1031
|
+
}
|
|
1032
|
+
};
|
|
1033
|
+
export {
|
|
1034
|
+
LLMEngine,
|
|
1035
|
+
Preprocessor,
|
|
1036
|
+
chunk,
|
|
1037
|
+
clean,
|
|
1038
|
+
cleanWithRules,
|
|
1039
|
+
extract
|
|
1040
|
+
};
|
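// Usage sketch for the public API (WebGPU support and a model download are
// required for the LLM-backed steps; `rawHtml` is illustrative):
//   import { Preprocessor } from "client-llm-preprocessor";
//   const p = new Preprocessor();
//   if (await p.checkWebGPU()) {
//     await p.loadModel();
//     const out = await p.pipeline(rawHtml, [
//       { clean: { removeHtml: true, removeExtraWhitespace: true } },
//       { extract: { format: "json", fields: ["title", "author"] } }
//     ]);
//   }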