@realtimex/email-automator 2.6.4 → 2.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -24,6 +24,28 @@ export const EmailAnalysisSchema = z.object({
24
24
  action_items: z.array(z.string()).optional()
25
25
  .describe('Action items mentioned in the email'),
26
26
  });
27
+ // Context-Aware Analysis Schema - AI evaluates email against user's rules
28
+ export const ContextAwareAnalysisSchema = z.object({
29
+ // Classification (kept for UI/logging)
30
+ summary: z.string().describe('A brief summary of the email content'),
31
+ category: z.enum(['spam', 'newsletter', 'promotional', 'transactional', 'social', 'support', 'client', 'internal', 'personal', 'other'])
32
+ .describe('The category of the email'),
33
+ priority: z.enum(['High', 'Medium', 'Low'])
34
+ .describe('The urgency of the email'),
35
+ // Rule Matching (core of context-aware engine)
36
+ matched_rule: z.object({
37
+ rule_id: z.string().nullable().describe('ID of the matched rule, or null if no match'),
38
+ rule_name: z.string().nullable().describe('Name of the matched rule'),
39
+ confidence: z.number().min(0).max(1).describe('Confidence score for the match (0-1)'),
40
+ reasoning: z.string().describe('Explanation of why this rule was matched or why no rule matched'),
41
+ }),
42
+ // Actions to execute (derived from matched rule)
43
+ actions_to_execute: z.array(z.enum(['none', 'delete', 'archive', 'draft', 'read', 'star']))
44
+ .describe('Actions to execute based on the matched rule'),
45
+ // Intent-aware draft content (if draft action is included)
46
+ draft_content: z.string().optional()
47
+ .describe('Generated draft reply if the action includes drafting'),
48
+ });
27
49
  export class IntelligenceService {
28
50
  client = null;
29
51
  model = 'gpt-4o-mini';
@@ -144,7 +166,8 @@ REQUIRED JSON STRUCTURE:
144
166
  temperature: 0.1,
145
167
  });
146
168
  rawResponse = response.choices[0]?.message?.content || '';
147
- console.log('[Intelligence] Raw LLM Response received (length:', rawResponse.length, ')');
169
+ const usage = response.usage;
170
+ console.log('[Intelligence] Raw LLM Response received (length:', rawResponse.length, ')', { usage });
148
171
  // Clean the response: Find first '{' and last '}'
149
172
  let jsonStr = rawResponse.trim();
150
173
  const startIdx = jsonStr.indexOf('{');
@@ -162,7 +185,8 @@ REQUIRED JSON STRUCTURE:
162
185
  if (eventLogger && emailId) {
163
186
  await eventLogger.analysis('Decided', emailId, {
164
187
  ...validated,
165
- _raw_response: rawResponse
188
+ _raw_response: rawResponse,
189
+ usage: usage // Include token usage
166
190
  });
167
191
  }
168
192
  return validated;
@@ -210,6 +234,173 @@ Please write a reply.`,
210
234
  return null;
211
235
  }
212
236
  }
237
+ /**
238
+ * Context-Aware Analysis: AI evaluates email against user's rules semantically
239
+ * This is the core of the new automation engine
240
+ *
241
+ * @param compiledRulesContext - Pre-compiled rules context string (from user_settings.compiled_rule_context)
242
+ * OR RuleContext[] for backwards compatibility
243
+ */
244
+ async analyzeEmailWithRules(content, context, compiledRulesContext, eventLogger, emailId) {
245
+ console.log('[Intelligence] analyzeEmailWithRules called for:', context.subject);
246
+ if (!this.isReady()) {
247
+ console.log('[Intelligence] Not ready, skipping');
248
+ logger.warn('Intelligence service not ready, skipping analysis');
249
+ if (eventLogger) {
250
+ await eventLogger.info('Skipped', 'AI Analysis skipped: Model not configured.', undefined, emailId);
251
+ }
252
+ return null;
253
+ }
254
+ // Prepare content
255
+ const cleanedContent = ContentCleaner.cleanEmailBody(content).substring(0, 2500);
256
+ // Use pre-compiled context if string, otherwise build from RuleContext[] (backwards compat)
257
+ let rulesContext;
258
+ let rulesCount;
259
+ if (typeof compiledRulesContext === 'string') {
260
+ // Fast path: use pre-compiled context
261
+ rulesContext = compiledRulesContext || '\n[No rules defined - analyze email but take no actions]\n';
262
+ rulesCount = (rulesContext.match(/Rule \d+/g) || []).length;
263
+ }
264
+ else {
265
+ // Backwards compatibility: build from RuleContext[]
266
+ const rules = compiledRulesContext;
267
+ rulesCount = rules.length;
268
+ rulesContext = rules.length > 0
269
+ ? rules.map((r, i) => `
270
+ ### Rule ${i + 1}: "${r.name}" (ID: ${r.id})
271
+ - Description: ${r.description || 'No description provided'}
272
+ - Intent: ${r.intent || 'General automation'}
273
+ - Actions: ${r.actions.join(', ')}
274
+ ${r.draft_instructions ? `- Draft Instructions: "${r.draft_instructions}"` : ''}
275
+ `).join('\n')
276
+ : '\n[No rules defined - analyze email but take no actions]\n';
277
+ }
278
+ const systemPrompt = `You are an AI Email Automation Agent.
279
+
280
+ ## Your Operating Rules
281
+ The user has defined the following automation rules. Your job is to:
282
+ 1. Analyze the incoming email
283
+ 2. Determine if ANY rule semantically matches this email's context
284
+ 3. Match based on INTENT, not just keywords
285
+
286
+ ${rulesContext}
287
+
288
+ ## Category Definitions (choose the most accurate)
289
+ - **client**: Business inquiries, RFPs, quote requests, project discussions, potential customers reaching out
290
+ - **support**: Help requests, bug reports, technical questions from existing users
291
+ - **internal**: Messages from colleagues, team communications
292
+ - **transactional**: Receipts, confirmations, shipping updates, account notifications
293
+ - **newsletter**: Subscribed content, digests, updates from services you signed up for
294
+ - **promotional**: UNSOLICITED marketing, cold sales pitches, ads - NOT legitimate business inquiries
295
+ - **spam**: Scams, phishing, junk mail
296
+ - **social**: Social media notifications, friend requests
297
+ - **personal**: Friends, family, personal matters
298
+ - **other**: Anything that doesn't fit above
299
+
300
+ ## Matching Guidelines
301
+ - A "decline sales" rule should match ANY sales pitch, not just ones with "sales" in the subject
302
+ - Match the rule that best fits the USER'S INTENT
303
+ - Only match if you are confident (>= 0.7 confidence)
304
+ - If no rule clearly matches, return null for rule_id
305
+ - If a matched rule includes "draft" action, generate an appropriate draft using the rule's intent
306
+
307
+ ## CRITICAL: Distinguish Between Inbound vs Outbound
308
+ **INBOUND (Client Inquiries - NOT promotional):**
309
+ - User is RECEIVING a request for quote/proposal/service
310
+ - Examples: "Please send me a quote", "RFP: [project]", "Can you provide pricing", "I need a quote asap"
311
+ - Category: client, support, or transactional (NEVER promotional)
312
+
313
+ **OUTBOUND (Sales/Marketing - IS promotional):**
314
+ - User is RECEIVING a sales pitch or marketing message
315
+ - Examples: "Get a FREE quote today!", "Limited offer", "Don't miss out", "Special discount"
316
+ - Category: promotional, spam, or newsletter
317
+
318
+ **Key Distinction:** If someone is ASKING the user for something (quote, proposal, service), it's a CLIENT INQUIRY, not promotional content.
319
+
320
+ ## Email Context
321
+ - Current Date: ${new Date().toISOString()}
322
+ - Subject: ${context.subject}
323
+ - From: ${context.sender}
324
+ - Date: ${context.date}
325
+
326
+ ## Required JSON Response
327
+ {
328
+ "summary": "Brief summary of the email",
329
+ "category": "spam|newsletter|promotional|transactional|social|support|client|internal|personal|other",
330
+ "priority": "High|Medium|Low",
331
+ "matched_rule": {
332
+ "rule_id": "UUID or null",
333
+ "rule_name": "Rule name or null",
334
+ "confidence": 0.0-1.0,
335
+ "reasoning": "Why this rule was or wasn't matched"
336
+ },
337
+ "actions_to_execute": ["none"] or ["archive", "read", etc.],
338
+ "draft_content": "Optional: draft reply if action includes 'draft'"
339
+ }
340
+
341
+ Return ONLY valid JSON.`;
342
+ // Log thinking phase
343
+ if (eventLogger) {
344
+ try {
345
+ await eventLogger.info('Thinking', `Context-aware analysis: ${context.subject}`, {
346
+ model: this.model,
347
+ system_prompt: systemPrompt,
348
+ content_preview: cleanedContent,
349
+ rules_count: rulesCount,
350
+ }, emailId);
351
+ }
352
+ catch (err) {
353
+ console.error('[Intelligence] Failed to log thinking event:', err);
354
+ }
355
+ }
356
+ let rawResponse = '';
357
+ try {
358
+ const response = await this.client.chat.completions.create({
359
+ model: this.model,
360
+ messages: [
361
+ { role: 'system', content: systemPrompt },
362
+ { role: 'user', content: cleanedContent || '[Empty email body]' },
363
+ ],
364
+ temperature: 0.1,
365
+ });
366
+ rawResponse = response.choices[0]?.message?.content || '';
367
+ const usage = response.usage;
368
+ console.log('[Intelligence] Context-aware response received (length:', rawResponse.length, ')', { usage });
369
+ // Parse JSON from response
370
+ let jsonStr = rawResponse.trim();
371
+ const startIdx = jsonStr.indexOf('{');
372
+ const endIdx = jsonStr.lastIndexOf('}');
373
+ if (startIdx === -1 || endIdx === -1) {
374
+ throw new Error('Response did not contain a valid JSON object');
375
+ }
376
+ jsonStr = jsonStr.substring(startIdx, endIdx + 1);
377
+ const parsed = JSON.parse(jsonStr);
378
+ const validated = ContextAwareAnalysisSchema.parse(parsed);
379
+ logger.debug('Context-aware analysis complete', {
380
+ matched_rule: validated.matched_rule.rule_name,
381
+ confidence: validated.matched_rule.confidence,
382
+ actions: validated.actions_to_execute,
383
+ });
384
+ if (eventLogger && emailId) {
385
+ await eventLogger.analysis('Decided', emailId, {
386
+ ...validated,
387
+ _raw_response: rawResponse,
388
+ usage: usage // Include token usage
389
+ });
390
+ }
391
+ return validated;
392
+ }
393
+ catch (error) {
394
+ console.error('[Intelligence] Context-aware analysis failed:', error);
395
+ if (eventLogger) {
396
+ await eventLogger.error('Error', {
397
+ error: error instanceof Error ? error.message : String(error),
398
+ raw_response: rawResponse || 'No response received from LLM'
399
+ }, emailId);
400
+ }
401
+ return null;
402
+ }
403
+ }
213
404
  async testConnection() {
214
405
  if (!this.isReady()) {
215
406
  return { success: false, message: 'Intelligence service not initialized. Check your API Key.' };
@@ -454,7 +454,66 @@ export class EmailProcessorService {
454
454
  autoSubmitted: parsed.headers.get('auto-submitted')?.toString(),
455
455
  mailer: parsed.headers.get('x-mailer')?.toString()
456
456
  };
457
- // 3. Analyze with AI
457
+ // 3. Fetch account for action execution
458
+ const { data: account } = await this.supabase
459
+ .from('email_accounts')
460
+ .select('*')
461
+ .eq('id', email.account_id)
462
+ .single();
463
+ // 4. Fetch pre-compiled rule context (fast path - no loop/formatting)
464
+ // Falls back to building context if not cached
465
+ let compiledContext = settings?.compiled_rule_context || null;
466
+ // Fetch rules for action execution (need attachments, instructions)
467
+ const { data: rules } = await this.supabase
468
+ .from('rules')
469
+ .select('*')
470
+ .eq('user_id', userId)
471
+ .eq('is_enabled', true)
472
+ .order('priority', { ascending: false });
473
+ // Fallback: build context if not pre-compiled
474
+ if (!compiledContext && rules && rules.length > 0) {
475
+ compiledContext = rules.map((r, i) => {
476
+ // Build human-readable condition text
477
+ let conditionText = '';
478
+ if (r.condition) {
479
+ const cond = r.condition;
480
+ if (cond.field) {
481
+ conditionText = `When ${cond.field}`;
482
+ if (cond.operator === 'equals') {
483
+ conditionText += ` equals "${cond.value}"`;
484
+ }
485
+ else if (cond.operator === 'contains') {
486
+ conditionText += ` contains "${cond.value}"`;
487
+ }
488
+ else if (cond.operator === 'domain_equals') {
489
+ conditionText += ` domain equals "${cond.value}"`;
490
+ }
491
+ else {
492
+ conditionText += ` ${cond.operator} "${cond.value}"`;
493
+ }
494
+ }
495
+ if (cond.is_useless === true) {
496
+ conditionText += (conditionText ? ' AND ' : 'When ') + 'email is useless/low-value';
497
+ }
498
+ if (cond.ai_priority) {
499
+ conditionText += (conditionText ? ' AND ' : 'When ') + `AI priority is "${cond.ai_priority}"`;
500
+ }
501
+ // Extract older_than_days from condition JSONB
502
+ if (cond.older_than_days) {
503
+ conditionText += (conditionText ? ' AND ' : 'When ') + `email is older than ${cond.older_than_days} days`;
504
+ }
505
+ }
506
+ return `Rule ${i + 1} [ID: ${r.id}]\n` +
507
+ ` Name: ${r.name}\n` +
508
+ (r.description ? ` Description: ${r.description}\n` : '') +
509
+ (r.intent ? ` Intent: ${r.intent}\n` : '') +
510
+ (conditionText ? ` Condition: ${conditionText}\n` : '') +
511
+ ` Actions: ${r.actions?.join(', ') || r.action || 'none'}\n` +
512
+ (r.instructions ? ` Draft Instructions: ${r.instructions}\n` : '') +
513
+ '\n';
514
+ }).join('');
515
+ }
516
+ // 5. Context-Aware Analysis: AI evaluates email against user's rules
458
517
  const intelligenceService = getIntelligenceService(settings?.llm_model || settings?.llm_base_url || settings?.llm_api_key
459
518
  ? {
460
519
  model: settings.llm_model,
@@ -462,7 +521,7 @@ export class EmailProcessorService {
462
521
  apiKey: settings.llm_api_key,
463
522
  }
464
523
  : undefined);
465
- const analysis = await intelligenceService.analyzeEmail(cleanContent, {
524
+ const analysis = await intelligenceService.analyzeEmailWithRules(cleanContent, {
466
525
  subject: email.subject || '',
467
526
  sender: email.sender || '',
468
527
  date: email.date || '',
@@ -471,39 +530,41 @@ export class EmailProcessorService {
471
530
  autoTrashSpam: settings?.auto_trash_spam,
472
531
  smartDrafts: settings?.smart_drafts,
473
532
  },
474
- }, eventLogger || undefined, email.id);
533
+ }, compiledContext || '', // Pre-compiled context (fast path)
534
+ eventLogger || undefined, email.id);
475
535
  if (!analysis) {
476
536
  throw new Error('AI analysis returned no result');
477
537
  }
478
- // 4. Update the email record with results
538
+ // 6. Update the email record with context-aware results
479
539
  await this.supabase
480
540
  .from('emails')
481
541
  .update({
482
542
  category: analysis.category,
483
- is_useless: analysis.is_useless,
484
543
  ai_analysis: analysis,
485
- suggested_actions: analysis.suggested_actions || [],
486
- suggested_action: analysis.suggested_actions?.[0] || 'none',
544
+ suggested_actions: analysis.actions_to_execute || [],
545
+ suggested_action: analysis.actions_to_execute?.[0] || 'none',
546
+ matched_rule_id: analysis.matched_rule.rule_id,
547
+ matched_rule_confidence: analysis.matched_rule.confidence,
487
548
  processing_status: 'completed'
488
549
  })
489
550
  .eq('id', email.id);
490
- // 5. Execute automation rules
491
- // Fetch account and rules needed for execution
492
- const { data: account } = await this.supabase
493
- .from('email_accounts')
494
- .select('*')
495
- .eq('id', email.account_id)
496
- .single();
497
- const { data: rules } = await this.supabase
498
- .from('rules')
499
- .select('*')
500
- .eq('user_id', userId)
501
- .eq('is_enabled', true);
502
- if (account && rules) {
503
- const tempResult = { processed: 0, deleted: 0, drafted: 0, errors: 0 };
504
- // Ensure email object for rules has the analysis fields merged in
505
- const emailForRules = { ...email, ...analysis };
506
- await this.executeRules(account, emailForRules, analysis, rules, settings, tempResult, eventLogger);
551
+ // 7. Execute actions if rule matched with sufficient confidence
552
+ if (account && analysis.matched_rule.rule_id && analysis.matched_rule.confidence >= 0.7) {
553
+ const matchedRule = rules?.find(r => r.id === analysis.matched_rule.rule_id);
554
+ if (eventLogger) {
555
+ await eventLogger.info('Rule Matched', `"${analysis.matched_rule.rule_name}" matched with ${(analysis.matched_rule.confidence * 100).toFixed(0)}% confidence`, { reasoning: analysis.matched_rule.reasoning }, email.id);
556
+ }
557
+ // Execute each action from the AI's decision
558
+ for (const action of analysis.actions_to_execute) {
559
+ if (action === 'none')
560
+ continue;
561
+ // Use AI-generated draft content if available
562
+ const draftContent = action === 'draft' ? analysis.draft_content : undefined;
563
+ await this.executeAction(account, email, action, draftContent, eventLogger, `Rule: ${matchedRule?.name || analysis.matched_rule.rule_name}`, matchedRule?.attachments);
564
+ }
565
+ }
566
+ else if (eventLogger && rules && rules.length > 0) {
567
+ await eventLogger.info('No Match', analysis.matched_rule.reasoning, { confidence: analysis.matched_rule.confidence }, email.id);
507
568
  }
508
569
  // Mark log as success
509
570
  if (log) {
@@ -1,98 +1,114 @@
1
1
  export class ContentCleaner {
2
2
  /**
3
3
  * Cleans email body by removing noise, quoted replies, and footers.
4
- * Ported from Python ContentCleaner.
4
+ * optimized for LLM processing.
5
5
  */
6
6
  static cleanEmailBody(text) {
7
7
  if (!text)
8
8
  return "";
9
9
  const originalText = text;
10
- // 0. Lightweight HTML -> Markdown Conversion
11
- // Structure: <br>, <p> -> Newlines
12
- text = text.replace(/<br\s*\/?>/gi, '\n');
13
- text = text.replace(/<\/p>/gi, '\n\n');
14
- text = text.replace(/<p.*?>/gi, ''); // Open p tags just gone
15
- // Structure: Headers <h1>-<h6> -> # Title
16
- text = text.replace(/<h[1-6].*?>(.*?)<\/h[1-6]>/gsi, (match, p1) => `\n# ${p1}\n`);
17
- // Structure: Lists <li> -> - Item
18
- text = text.replace(/<li.*?>(.*?)<\/li>/gsi, (match, p1) => `\n- ${p1}`);
19
- text = text.replace(/<ul.*?>/gi, '');
20
- text = text.replace(/<\/ul>/gi, '\n');
21
- // Links: <a href=\"...\">text</a> -> [text](href)
22
- text = text.replace(/<a\s+(?:[^>]*?\s+)?href=\"([^\"]*)\"[^>]*>(.*?)<\/a>/gsi, (match, href, content) => `[${content}](${href})`);
23
- // Images: <img src=\"...\" alt=\"...\"> -> ![alt](src)
24
- text = text.replace(/<img\s+(?:[^>]*?\s+)?src=\"([^\"]*)\"(?:[^>]*?\s+)?alt=\"([^\"]*)\"[^>]*>/gsi, (match, src, alt) => `![${alt}](${src})`);
25
- // Style/Script removal (strictly remove content)
26
- text = text.replace(/<script.*?>.*?<\/script>/gsi, '');
27
- text = text.replace(/<style.*?>.*?<\/style>/gsi, '');
28
- // Final Strip of remaining tags
29
- text = text.replace(/<[^>]+>/g, ' ');
30
- // Entity decoding (Basic)
31
- text = text.replace(/&nbsp;/gi, ' ');
32
- text = text.replace(/&amp;/gi, '&');
33
- text = text.replace(/&lt;/gi, '<');
34
- text = text.replace(/&gt;/gi, '>');
35
- text = text.replace(/&quot;/gi, '"');
36
- text = text.replace(/&#39;/gi, "'");
10
+ // 1. Detect if content is actually HTML
11
+ const isHtml = /<[a-z][\s\S]*>/i.test(text);
12
+ if (isHtml) {
13
+ // Lightweight HTML -> Markdown Conversion
14
+ // Structure: <br>, <p> -> Newlines
15
+ text = text.replace(/<br\s*\/?>/gi, '\n');
16
+ text = text.replace(/<\/p>/gi, '\n\n');
17
+ text = text.replace(/<p.*?>/gi, '');
18
+ // Structure: Headers <h1>-<h6> -> # Title
19
+ text = text.replace(/<h[1-6].*?>(.*?)<\/h[1-6]>/gsi, (match, p1) => `\n# ${p1}\n`);
20
+ // Structure: Lists <li> -> - Item
21
+ text = text.replace(/<li.*?>(.*?)<\/li>/gsi, (match, p1) => `\n- ${p1}`);
22
+ text = text.replace(/<ul.*?>/gi, '');
23
+ text = text.replace(/<\/ul>/gi, '\n');
24
+ // Links: <a href=\"...\">text</a> -> [text](href)
25
+ text = text.replace(/<a\s+(?:[^>]*?\s+)?href=\"([^\"]*)\"[^>]*>(.*?)<\/a>/gsi, (match, href, content) => `[${content}](${href})`);
26
+ // Images: <img src=\"...\" alt=\"...\"> -> ![alt](src)
27
+ text = text.replace(/<img\s+(?:[^>]*?\s+)?src=\"([^\"]*)\"(?:[^>]*?\s+)?alt=\"([^\"]*)\"[^>]*>/gsi, (match, src, alt) => `![${alt}](${src})`);
28
+ // Style/Script removal (strictly remove content)
29
+ text = text.replace(/<script.*?>.*?<\/script>/gsi, '');
30
+ text = text.replace(/<style.*?>.*?<\/style>/gsi, '');
31
+ // Final Strip of remaining tags
32
+ text = text.replace(/<[^>]+>/g, ' ');
33
+ // Entity decoding (Basic)
34
+ text = text.replace(/&nbsp;/gi, ' ');
35
+ text = text.replace(/&amp;/gi, '&');
36
+ text = text.replace(/&lt;/gi, '<');
37
+ text = text.replace(/&gt;/gi, '>');
38
+ text = text.replace(/&quot;/gi, '"');
39
+ text = text.replace(/&#39;/gi, "'");
40
+ }
37
41
  const lines = text.split('\n');
38
42
  const cleanedLines = [];
39
- // Heuristics for reply headers
40
- const replyHeaderPatterns = [
43
+ // Patterns that usually mark the START of a reply chain or a generic footer
44
+ const truncationPatterns = [
41
45
  /^On .* wrote:$/i,
42
- /^From: .*$/i,
43
- /^Sent: .*$/i,
44
- /^To: .*$/i,
45
- /^Subject: .*$/i
46
+ /^From: .* <.*>$/i,
47
+ /^-----Original Message-----$/i,
48
+ /^________________________________$/i,
49
+ /^Sent from my iPhone$/i,
50
+ /^Sent from my Android$/i,
51
+ /^Get Outlook for/i,
52
+ /^--$/ // Standard signature separator
46
53
  ];
47
- // Heuristics for footers
48
- const footerPatterns = [
49
- /unsubscribe/i,
54
+ // Patterns for lines that should be stripped but NOT truncate the whole email
55
+ const noisePatterns = [
56
+ /view in browser/i,
57
+ /click here to view/i,
58
+ /legal notice/i,
59
+ /all rights reserved/i,
50
60
  /privacy policy/i,
51
61
  /terms of service/i,
52
- /view in browser/i,
53
- /copyright \d{4}/i
62
+ /unsubscribe/i
54
63
  ];
55
64
  for (let line of lines) {
56
65
  let lineStripped = line.trim();
66
+ if (!lineStripped) {
67
+ cleanedLines.push("");
68
+ continue;
69
+ }
57
70
  // 2. Quoted text removal (lines starting with >)
58
71
  if (lineStripped.startsWith('>')) {
59
72
  continue;
60
73
  }
61
- // 3. Check for specific reply separators
62
- // If we hit a reply header, we truncate the rest
63
- if (/^On .* wrote:$/i.test(lineStripped)) {
64
- break;
74
+ // 3. Truncation check: If we hit a reply header, we stop entirely
75
+ let shouldTruncate = false;
76
+ for (const pattern of truncationPatterns) {
77
+ if (pattern.test(lineStripped)) {
78
+ shouldTruncate = true;
79
+ break;
80
+ }
65
81
  }
66
- // 4. Footer removal (only on very short lines to avoid stripping body content)
67
- if (lineStripped.length < 60) {
68
- let isFooter = false;
69
- for (const pattern of footerPatterns) {
82
+ if (shouldTruncate)
83
+ break;
84
+ // 4. Noise check: Strip boilerplate lines
85
+ let isNoise = false;
86
+ if (lineStripped.length < 100) {
87
+ for (const pattern of noisePatterns) {
70
88
  if (pattern.test(lineStripped)) {
71
- isFooter = true;
89
+ isNoise = true;
72
90
  break;
73
91
  }
74
92
  }
75
- if (isFooter) {
76
- continue;
77
- }
78
93
  }
94
+ if (isNoise)
95
+ continue;
79
96
  cleanedLines.push(line);
80
97
  }
81
98
  // Reassemble
82
99
  text = cleanedLines.join('\n');
83
- // Safety Fallback: If cleaning stripped everything, return original (truncated)
84
- if (!text.trim() || text.length < 10) {
85
- text = originalText.substring(0, 3000);
86
- }
87
- // Collapse multiple newlines
100
+ // Collapse whitespace
88
101
  text = text.replace(/\n{3,}/g, '\n\n');
102
+ text = text.replace(/[ \t]{2,}/g, ' ');
103
+ // Safety Fallback: If cleaning stripped too much, return original text truncated
104
+ if (text.trim().length < 20 && originalText.trim().length > 20) {
105
+ return originalText.substring(0, 3000).trim();
106
+ }
89
107
  // Sanitize LLM Special Tokens
90
108
  text = text.replace(/<\|/g, '< |');
91
109
  text = text.replace(/\|>/g, '| >');
92
110
  text = text.replace(/\[INST\]/gi, '[ INST ]');
93
111
  text = text.replace(/\[\/INST\]/gi, '[ /INST ]');
94
- text = text.replace(/<s>/gi, '&lt;s&gt;');
95
- text = text.replace(/<\/s>/gi, '&lt;/s&gt;');
96
112
  return text.trim();
97
113
  }
98
114
  }