@realtimex/email-automator 2.6.4 → 2.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/api/src/middleware/validation.ts +7 -0
- package/api/src/services/intelligence.ts +232 -7
- package/api/src/services/processor.ts +153 -69
- package/api/src/services/supabase.ts +5 -2
- package/api/src/utils/contentCleaner.ts +80 -66
- package/dist/api/src/middleware/validation.js +7 -0
- package/dist/api/src/services/intelligence.js +193 -2
- package/dist/api/src/services/processor.js +85 -24
- package/dist/api/src/utils/contentCleaner.js +74 -58
- package/dist/assets/index-aTk6SbAd.js +97 -0
- package/dist/assets/index-npWWfPF9.css +1 -0
- package/dist/index.html +2 -2
- package/package.json +1 -1
- package/supabase/migrations/20260119000000_context_aware_rules.sql +44 -0
- package/supabase/migrations/20260119000001_compiled_rule_context.sql +128 -0
- package/supabase/migrations/20260119000002_fix_compiled_context_conditions.sql +137 -0
- package/dist/assets/index-BSHZ3lFn.js +0 -97
- package/dist/assets/index-CRQKk5IW.css +0 -1
|
@@ -24,6 +24,28 @@ export const EmailAnalysisSchema = z.object({
|
|
|
24
24
|
action_items: z.array(z.string()).optional()
|
|
25
25
|
.describe('Action items mentioned in the email'),
|
|
26
26
|
});
|
|
27
|
+
// Context-Aware Analysis Schema - AI evaluates email against user's rules
|
|
28
|
+
export const ContextAwareAnalysisSchema = z.object({
|
|
29
|
+
// Classification (kept for UI/logging)
|
|
30
|
+
summary: z.string().describe('A brief summary of the email content'),
|
|
31
|
+
category: z.enum(['spam', 'newsletter', 'promotional', 'transactional', 'social', 'support', 'client', 'internal', 'personal', 'other'])
|
|
32
|
+
.describe('The category of the email'),
|
|
33
|
+
priority: z.enum(['High', 'Medium', 'Low'])
|
|
34
|
+
.describe('The urgency of the email'),
|
|
35
|
+
// Rule Matching (core of context-aware engine)
|
|
36
|
+
matched_rule: z.object({
|
|
37
|
+
rule_id: z.string().nullable().describe('ID of the matched rule, or null if no match'),
|
|
38
|
+
rule_name: z.string().nullable().describe('Name of the matched rule'),
|
|
39
|
+
confidence: z.number().min(0).max(1).describe('Confidence score for the match (0-1)'),
|
|
40
|
+
reasoning: z.string().describe('Explanation of why this rule was matched or why no rule matched'),
|
|
41
|
+
}),
|
|
42
|
+
// Actions to execute (derived from matched rule)
|
|
43
|
+
actions_to_execute: z.array(z.enum(['none', 'delete', 'archive', 'draft', 'read', 'star']))
|
|
44
|
+
.describe('Actions to execute based on the matched rule'),
|
|
45
|
+
// Intent-aware draft content (if draft action is included)
|
|
46
|
+
draft_content: z.string().optional()
|
|
47
|
+
.describe('Generated draft reply if the action includes drafting'),
|
|
48
|
+
});
|
|
27
49
|
export class IntelligenceService {
|
|
28
50
|
client = null;
|
|
29
51
|
model = 'gpt-4o-mini';
|
|
@@ -144,7 +166,8 @@ REQUIRED JSON STRUCTURE:
|
|
|
144
166
|
temperature: 0.1,
|
|
145
167
|
});
|
|
146
168
|
rawResponse = response.choices[0]?.message?.content || '';
|
|
147
|
-
|
|
169
|
+
const usage = response.usage;
|
|
170
|
+
console.log('[Intelligence] Raw LLM Response received (length:', rawResponse.length, ')', { usage });
|
|
148
171
|
// Clean the response: Find first '{' and last '}'
|
|
149
172
|
let jsonStr = rawResponse.trim();
|
|
150
173
|
const startIdx = jsonStr.indexOf('{');
|
|
@@ -162,7 +185,8 @@ REQUIRED JSON STRUCTURE:
|
|
|
162
185
|
if (eventLogger && emailId) {
|
|
163
186
|
await eventLogger.analysis('Decided', emailId, {
|
|
164
187
|
...validated,
|
|
165
|
-
_raw_response: rawResponse
|
|
188
|
+
_raw_response: rawResponse,
|
|
189
|
+
usage: usage // Include token usage
|
|
166
190
|
});
|
|
167
191
|
}
|
|
168
192
|
return validated;
|
|
@@ -210,6 +234,173 @@ Please write a reply.`,
|
|
|
210
234
|
return null;
|
|
211
235
|
}
|
|
212
236
|
}
|
|
237
|
+
/**
|
|
238
|
+
* Context-Aware Analysis: AI evaluates email against user's rules semantically
|
|
239
|
+
* This is the core of the new automation engine
|
|
240
|
+
*
|
|
241
|
+
* @param compiledRulesContext - Pre-compiled rules context string (from user_settings.compiled_rule_context)
|
|
242
|
+
* OR RuleContext[] for backwards compatibility
|
|
243
|
+
*/
|
|
244
|
+
async analyzeEmailWithRules(content, context, compiledRulesContext, eventLogger, emailId) {
|
|
245
|
+
console.log('[Intelligence] analyzeEmailWithRules called for:', context.subject);
|
|
246
|
+
if (!this.isReady()) {
|
|
247
|
+
console.log('[Intelligence] Not ready, skipping');
|
|
248
|
+
logger.warn('Intelligence service not ready, skipping analysis');
|
|
249
|
+
if (eventLogger) {
|
|
250
|
+
await eventLogger.info('Skipped', 'AI Analysis skipped: Model not configured.', undefined, emailId);
|
|
251
|
+
}
|
|
252
|
+
return null;
|
|
253
|
+
}
|
|
254
|
+
// Prepare content
|
|
255
|
+
const cleanedContent = ContentCleaner.cleanEmailBody(content).substring(0, 2500);
|
|
256
|
+
// Use pre-compiled context if string, otherwise build from RuleContext[] (backwards compat)
|
|
257
|
+
let rulesContext;
|
|
258
|
+
let rulesCount;
|
|
259
|
+
if (typeof compiledRulesContext === 'string') {
|
|
260
|
+
// Fast path: use pre-compiled context
|
|
261
|
+
rulesContext = compiledRulesContext || '\n[No rules defined - analyze email but take no actions]\n';
|
|
262
|
+
rulesCount = (rulesContext.match(/Rule \d+/g) || []).length;
|
|
263
|
+
}
|
|
264
|
+
else {
|
|
265
|
+
// Backwards compatibility: build from RuleContext[]
|
|
266
|
+
const rules = compiledRulesContext;
|
|
267
|
+
rulesCount = rules.length;
|
|
268
|
+
rulesContext = rules.length > 0
|
|
269
|
+
? rules.map((r, i) => `
|
|
270
|
+
### Rule ${i + 1}: "${r.name}" (ID: ${r.id})
|
|
271
|
+
- Description: ${r.description || 'No description provided'}
|
|
272
|
+
- Intent: ${r.intent || 'General automation'}
|
|
273
|
+
- Actions: ${r.actions.join(', ')}
|
|
274
|
+
${r.draft_instructions ? `- Draft Instructions: "${r.draft_instructions}"` : ''}
|
|
275
|
+
`).join('\n')
|
|
276
|
+
: '\n[No rules defined - analyze email but take no actions]\n';
|
|
277
|
+
}
|
|
278
|
+
const systemPrompt = `You are an AI Email Automation Agent.
|
|
279
|
+
|
|
280
|
+
## Your Operating Rules
|
|
281
|
+
The user has defined the following automation rules. Your job is to:
|
|
282
|
+
1. Analyze the incoming email
|
|
283
|
+
2. Determine if ANY rule semantically matches this email's context
|
|
284
|
+
3. Match based on INTENT, not just keywords
|
|
285
|
+
|
|
286
|
+
${rulesContext}
|
|
287
|
+
|
|
288
|
+
## Category Definitions (choose the most accurate)
|
|
289
|
+
- **client**: Business inquiries, RFPs, quote requests, project discussions, potential customers reaching out
|
|
290
|
+
- **support**: Help requests, bug reports, technical questions from existing users
|
|
291
|
+
- **internal**: Messages from colleagues, team communications
|
|
292
|
+
- **transactional**: Receipts, confirmations, shipping updates, account notifications
|
|
293
|
+
- **newsletter**: Subscribed content, digests, updates from services you signed up for
|
|
294
|
+
- **promotional**: UNSOLICITED marketing, cold sales pitches, ads - NOT legitimate business inquiries
|
|
295
|
+
- **spam**: Scams, phishing, junk mail
|
|
296
|
+
- **social**: Social media notifications, friend requests
|
|
297
|
+
- **personal**: Friends, family, personal matters
|
|
298
|
+
- **other**: Anything that doesn't fit above
|
|
299
|
+
|
|
300
|
+
## Matching Guidelines
|
|
301
|
+
- A "decline sales" rule should match ANY sales pitch, not just ones with "sales" in the subject
|
|
302
|
+
- Match the rule that best fits the USER'S INTENT
|
|
303
|
+
- Only match if you are confident (>= 0.7 confidence)
|
|
304
|
+
- If no rule clearly matches, return null for rule_id
|
|
305
|
+
- If a matched rule includes "draft" action, generate an appropriate draft using the rule's intent
|
|
306
|
+
|
|
307
|
+
## CRITICAL: Distinguish Between Inbound vs Outbound
|
|
308
|
+
**INBOUND (Client Inquiries - NOT promotional):**
|
|
309
|
+
- User is RECEIVING a request for quote/proposal/service
|
|
310
|
+
- Examples: "Please send me a quote", "RFP: [project]", "Can you provide pricing", "I need a quote asap"
|
|
311
|
+
- Category: client, support, or transactional (NEVER promotional)
|
|
312
|
+
|
|
313
|
+
**OUTBOUND (Sales/Marketing - IS promotional):**
|
|
314
|
+
- User is RECEIVING a sales pitch or marketing message
|
|
315
|
+
- Examples: "Get a FREE quote today!", "Limited offer", "Don't miss out", "Special discount"
|
|
316
|
+
- Category: promotional, spam, or newsletter
|
|
317
|
+
|
|
318
|
+
**Key Distinction:** If someone is ASKING the user for something (quote, proposal, service), it's a CLIENT INQUIRY, not promotional content.
|
|
319
|
+
|
|
320
|
+
## Email Context
|
|
321
|
+
- Current Date: ${new Date().toISOString()}
|
|
322
|
+
- Subject: ${context.subject}
|
|
323
|
+
- From: ${context.sender}
|
|
324
|
+
- Date: ${context.date}
|
|
325
|
+
|
|
326
|
+
## Required JSON Response
|
|
327
|
+
{
|
|
328
|
+
"summary": "Brief summary of the email",
|
|
329
|
+
"category": "spam|newsletter|promotional|transactional|social|support|client|internal|personal|other",
|
|
330
|
+
"priority": "High|Medium|Low",
|
|
331
|
+
"matched_rule": {
|
|
332
|
+
"rule_id": "UUID or null",
|
|
333
|
+
"rule_name": "Rule name or null",
|
|
334
|
+
"confidence": 0.0-1.0,
|
|
335
|
+
"reasoning": "Why this rule was or wasn't matched"
|
|
336
|
+
},
|
|
337
|
+
"actions_to_execute": ["none"] or ["archive", "read", etc.],
|
|
338
|
+
"draft_content": "Optional: draft reply if action includes 'draft'"
|
|
339
|
+
}
|
|
340
|
+
|
|
341
|
+
Return ONLY valid JSON.`;
|
|
342
|
+
// Log thinking phase
|
|
343
|
+
if (eventLogger) {
|
|
344
|
+
try {
|
|
345
|
+
await eventLogger.info('Thinking', `Context-aware analysis: ${context.subject}`, {
|
|
346
|
+
model: this.model,
|
|
347
|
+
system_prompt: systemPrompt,
|
|
348
|
+
content_preview: cleanedContent,
|
|
349
|
+
rules_count: rulesCount,
|
|
350
|
+
}, emailId);
|
|
351
|
+
}
|
|
352
|
+
catch (err) {
|
|
353
|
+
console.error('[Intelligence] Failed to log thinking event:', err);
|
|
354
|
+
}
|
|
355
|
+
}
|
|
356
|
+
let rawResponse = '';
|
|
357
|
+
try {
|
|
358
|
+
const response = await this.client.chat.completions.create({
|
|
359
|
+
model: this.model,
|
|
360
|
+
messages: [
|
|
361
|
+
{ role: 'system', content: systemPrompt },
|
|
362
|
+
{ role: 'user', content: cleanedContent || '[Empty email body]' },
|
|
363
|
+
],
|
|
364
|
+
temperature: 0.1,
|
|
365
|
+
});
|
|
366
|
+
rawResponse = response.choices[0]?.message?.content || '';
|
|
367
|
+
const usage = response.usage;
|
|
368
|
+
console.log('[Intelligence] Context-aware response received (length:', rawResponse.length, ')', { usage });
|
|
369
|
+
// Parse JSON from response
|
|
370
|
+
let jsonStr = rawResponse.trim();
|
|
371
|
+
const startIdx = jsonStr.indexOf('{');
|
|
372
|
+
const endIdx = jsonStr.lastIndexOf('}');
|
|
373
|
+
if (startIdx === -1 || endIdx === -1) {
|
|
374
|
+
throw new Error('Response did not contain a valid JSON object');
|
|
375
|
+
}
|
|
376
|
+
jsonStr = jsonStr.substring(startIdx, endIdx + 1);
|
|
377
|
+
const parsed = JSON.parse(jsonStr);
|
|
378
|
+
const validated = ContextAwareAnalysisSchema.parse(parsed);
|
|
379
|
+
logger.debug('Context-aware analysis complete', {
|
|
380
|
+
matched_rule: validated.matched_rule.rule_name,
|
|
381
|
+
confidence: validated.matched_rule.confidence,
|
|
382
|
+
actions: validated.actions_to_execute,
|
|
383
|
+
});
|
|
384
|
+
if (eventLogger && emailId) {
|
|
385
|
+
await eventLogger.analysis('Decided', emailId, {
|
|
386
|
+
...validated,
|
|
387
|
+
_raw_response: rawResponse,
|
|
388
|
+
usage: usage // Include token usage
|
|
389
|
+
});
|
|
390
|
+
}
|
|
391
|
+
return validated;
|
|
392
|
+
}
|
|
393
|
+
catch (error) {
|
|
394
|
+
console.error('[Intelligence] Context-aware analysis failed:', error);
|
|
395
|
+
if (eventLogger) {
|
|
396
|
+
await eventLogger.error('Error', {
|
|
397
|
+
error: error instanceof Error ? error.message : String(error),
|
|
398
|
+
raw_response: rawResponse || 'No response received from LLM'
|
|
399
|
+
}, emailId);
|
|
400
|
+
}
|
|
401
|
+
return null;
|
|
402
|
+
}
|
|
403
|
+
}
|
|
213
404
|
async testConnection() {
|
|
214
405
|
if (!this.isReady()) {
|
|
215
406
|
return { success: false, message: 'Intelligence service not initialized. Check your API Key.' };
|
|
@@ -454,7 +454,66 @@ export class EmailProcessorService {
|
|
|
454
454
|
autoSubmitted: parsed.headers.get('auto-submitted')?.toString(),
|
|
455
455
|
mailer: parsed.headers.get('x-mailer')?.toString()
|
|
456
456
|
};
|
|
457
|
-
// 3.
|
|
457
|
+
// 3. Fetch account for action execution
|
|
458
|
+
const { data: account } = await this.supabase
|
|
459
|
+
.from('email_accounts')
|
|
460
|
+
.select('*')
|
|
461
|
+
.eq('id', email.account_id)
|
|
462
|
+
.single();
|
|
463
|
+
// 4. Fetch pre-compiled rule context (fast path - no loop/formatting)
|
|
464
|
+
// Falls back to building context if not cached
|
|
465
|
+
let compiledContext = settings?.compiled_rule_context || null;
|
|
466
|
+
// Fetch rules for action execution (need attachments, instructions)
|
|
467
|
+
const { data: rules } = await this.supabase
|
|
468
|
+
.from('rules')
|
|
469
|
+
.select('*')
|
|
470
|
+
.eq('user_id', userId)
|
|
471
|
+
.eq('is_enabled', true)
|
|
472
|
+
.order('priority', { ascending: false });
|
|
473
|
+
// Fallback: build context if not pre-compiled
|
|
474
|
+
if (!compiledContext && rules && rules.length > 0) {
|
|
475
|
+
compiledContext = rules.map((r, i) => {
|
|
476
|
+
// Build human-readable condition text
|
|
477
|
+
let conditionText = '';
|
|
478
|
+
if (r.condition) {
|
|
479
|
+
const cond = r.condition;
|
|
480
|
+
if (cond.field) {
|
|
481
|
+
conditionText = `When ${cond.field}`;
|
|
482
|
+
if (cond.operator === 'equals') {
|
|
483
|
+
conditionText += ` equals "${cond.value}"`;
|
|
484
|
+
}
|
|
485
|
+
else if (cond.operator === 'contains') {
|
|
486
|
+
conditionText += ` contains "${cond.value}"`;
|
|
487
|
+
}
|
|
488
|
+
else if (cond.operator === 'domain_equals') {
|
|
489
|
+
conditionText += ` domain equals "${cond.value}"`;
|
|
490
|
+
}
|
|
491
|
+
else {
|
|
492
|
+
conditionText += ` ${cond.operator} "${cond.value}"`;
|
|
493
|
+
}
|
|
494
|
+
}
|
|
495
|
+
if (cond.is_useless === true) {
|
|
496
|
+
conditionText += (conditionText ? ' AND ' : 'When ') + 'email is useless/low-value';
|
|
497
|
+
}
|
|
498
|
+
if (cond.ai_priority) {
|
|
499
|
+
conditionText += (conditionText ? ' AND ' : 'When ') + `AI priority is "${cond.ai_priority}"`;
|
|
500
|
+
}
|
|
501
|
+
// Extract older_than_days from condition JSONB
|
|
502
|
+
if (cond.older_than_days) {
|
|
503
|
+
conditionText += (conditionText ? ' AND ' : 'When ') + `email is older than ${cond.older_than_days} days`;
|
|
504
|
+
}
|
|
505
|
+
}
|
|
506
|
+
return `Rule ${i + 1} [ID: ${r.id}]\n` +
|
|
507
|
+
` Name: ${r.name}\n` +
|
|
508
|
+
(r.description ? ` Description: ${r.description}\n` : '') +
|
|
509
|
+
(r.intent ? ` Intent: ${r.intent}\n` : '') +
|
|
510
|
+
(conditionText ? ` Condition: ${conditionText}\n` : '') +
|
|
511
|
+
` Actions: ${r.actions?.join(', ') || r.action || 'none'}\n` +
|
|
512
|
+
(r.instructions ? ` Draft Instructions: ${r.instructions}\n` : '') +
|
|
513
|
+
'\n';
|
|
514
|
+
}).join('');
|
|
515
|
+
}
|
|
516
|
+
// 5. Context-Aware Analysis: AI evaluates email against user's rules
|
|
458
517
|
const intelligenceService = getIntelligenceService(settings?.llm_model || settings?.llm_base_url || settings?.llm_api_key
|
|
459
518
|
? {
|
|
460
519
|
model: settings.llm_model,
|
|
@@ -462,7 +521,7 @@ export class EmailProcessorService {
|
|
|
462
521
|
apiKey: settings.llm_api_key,
|
|
463
522
|
}
|
|
464
523
|
: undefined);
|
|
465
|
-
const analysis = await intelligenceService.
|
|
524
|
+
const analysis = await intelligenceService.analyzeEmailWithRules(cleanContent, {
|
|
466
525
|
subject: email.subject || '',
|
|
467
526
|
sender: email.sender || '',
|
|
468
527
|
date: email.date || '',
|
|
@@ -471,39 +530,41 @@ export class EmailProcessorService {
|
|
|
471
530
|
autoTrashSpam: settings?.auto_trash_spam,
|
|
472
531
|
smartDrafts: settings?.smart_drafts,
|
|
473
532
|
},
|
|
474
|
-
},
|
|
533
|
+
}, compiledContext || '', // Pre-compiled context (fast path)
|
|
534
|
+
eventLogger || undefined, email.id);
|
|
475
535
|
if (!analysis) {
|
|
476
536
|
throw new Error('AI analysis returned no result');
|
|
477
537
|
}
|
|
478
|
-
//
|
|
538
|
+
// 6. Update the email record with context-aware results
|
|
479
539
|
await this.supabase
|
|
480
540
|
.from('emails')
|
|
481
541
|
.update({
|
|
482
542
|
category: analysis.category,
|
|
483
|
-
is_useless: analysis.is_useless,
|
|
484
543
|
ai_analysis: analysis,
|
|
485
|
-
suggested_actions: analysis.
|
|
486
|
-
suggested_action: analysis.
|
|
544
|
+
suggested_actions: analysis.actions_to_execute || [],
|
|
545
|
+
suggested_action: analysis.actions_to_execute?.[0] || 'none',
|
|
546
|
+
matched_rule_id: analysis.matched_rule.rule_id,
|
|
547
|
+
matched_rule_confidence: analysis.matched_rule.confidence,
|
|
487
548
|
processing_status: 'completed'
|
|
488
549
|
})
|
|
489
550
|
.eq('id', email.id);
|
|
490
|
-
//
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
await
|
|
551
|
+
// 7. Execute actions if rule matched with sufficient confidence
|
|
552
|
+
if (account && analysis.matched_rule.rule_id && analysis.matched_rule.confidence >= 0.7) {
|
|
553
|
+
const matchedRule = rules?.find(r => r.id === analysis.matched_rule.rule_id);
|
|
554
|
+
if (eventLogger) {
|
|
555
|
+
await eventLogger.info('Rule Matched', `"${analysis.matched_rule.rule_name}" matched with ${(analysis.matched_rule.confidence * 100).toFixed(0)}% confidence`, { reasoning: analysis.matched_rule.reasoning }, email.id);
|
|
556
|
+
}
|
|
557
|
+
// Execute each action from the AI's decision
|
|
558
|
+
for (const action of analysis.actions_to_execute) {
|
|
559
|
+
if (action === 'none')
|
|
560
|
+
continue;
|
|
561
|
+
// Use AI-generated draft content if available
|
|
562
|
+
const draftContent = action === 'draft' ? analysis.draft_content : undefined;
|
|
563
|
+
await this.executeAction(account, email, action, draftContent, eventLogger, `Rule: ${matchedRule?.name || analysis.matched_rule.rule_name}`, matchedRule?.attachments);
|
|
564
|
+
}
|
|
565
|
+
}
|
|
566
|
+
else if (eventLogger && rules && rules.length > 0) {
|
|
567
|
+
await eventLogger.info('No Match', analysis.matched_rule.reasoning, { confidence: analysis.matched_rule.confidence }, email.id);
|
|
507
568
|
}
|
|
508
569
|
// Mark log as success
|
|
509
570
|
if (log) {
|
|
@@ -1,98 +1,114 @@
|
|
|
1
1
|
export class ContentCleaner {
|
|
2
2
|
/**
|
|
3
3
|
* Cleans email body by removing noise, quoted replies, and footers.
|
|
4
|
-
*
|
|
4
|
+
* optimized for LLM processing.
|
|
5
5
|
*/
|
|
6
6
|
static cleanEmailBody(text) {
|
|
7
7
|
if (!text)
|
|
8
8
|
return "";
|
|
9
9
|
const originalText = text;
|
|
10
|
-
//
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
10
|
+
// 1. Detect if content is actually HTML
|
|
11
|
+
const isHtml = /<[a-z][\s\S]*>/i.test(text);
|
|
12
|
+
if (isHtml) {
|
|
13
|
+
// Lightweight HTML -> Markdown Conversion
|
|
14
|
+
// Structure: <br>, <p> -> Newlines
|
|
15
|
+
text = text.replace(/<br\s*\/?>/gi, '\n');
|
|
16
|
+
text = text.replace(/<\/p>/gi, '\n\n');
|
|
17
|
+
text = text.replace(/<p.*?>/gi, '');
|
|
18
|
+
// Structure: Headers <h1>-<h6> -> # Title
|
|
19
|
+
text = text.replace(/<h[1-6].*?>(.*?)<\/h[1-6]>/gsi, (match, p1) => `\n# ${p1}\n`);
|
|
20
|
+
// Structure: Lists <li> -> - Item
|
|
21
|
+
text = text.replace(/<li.*?>(.*?)<\/li>/gsi, (match, p1) => `\n- ${p1}`);
|
|
22
|
+
text = text.replace(/<ul.*?>/gi, '');
|
|
23
|
+
text = text.replace(/<\/ul>/gi, '\n');
|
|
24
|
+
// Links: <a href=\"...\">text</a> -> [text](href)
|
|
25
|
+
text = text.replace(/<a\s+(?:[^>]*?\s+)?href=\"([^\"]*)\"[^>]*>(.*?)<\/a>/gsi, (match, href, content) => `[${content}](${href})`);
|
|
26
|
+
// Images: <img src=\"...\" alt=\"...\"> -> ![alt](src)
|
|
27
|
+
text = text.replace(/<img\s+(?:[^>]*?\s+)?src=\"([^\"]*)\"(?:[^>]*?\s+)?alt=\"([^\"]*)\"[^>]*>/gsi, (match, src, alt) => `![${alt}](${src})`);
|
|
28
|
+
// Style/Script removal (strictly remove content)
|
|
29
|
+
text = text.replace(/<script.*?>.*?<\/script>/gsi, '');
|
|
30
|
+
text = text.replace(/<style.*?>.*?<\/style>/gsi, '');
|
|
31
|
+
// Final Strip of remaining tags
|
|
32
|
+
text = text.replace(/<[^>]+>/g, ' ');
|
|
33
|
+
// Entity decoding (Basic)
|
|
34
|
+
text = text.replace(/&nbsp;/gi, ' ');
|
|
35
|
+
text = text.replace(/&amp;/gi, '&');
|
|
36
|
+
text = text.replace(/&lt;/gi, '<');
|
|
37
|
+
text = text.replace(/&gt;/gi, '>');
|
|
38
|
+
text = text.replace(/&quot;/gi, '"');
|
|
39
|
+
text = text.replace(/&#39;/gi, "'");
|
|
40
|
+
}
|
|
37
41
|
const lines = text.split('\n');
|
|
38
42
|
const cleanedLines = [];
|
|
39
|
-
//
|
|
40
|
-
const
|
|
43
|
+
// Patterns that usually mark the START of a reply chain or a generic footer
|
|
44
|
+
const truncationPatterns = [
|
|
41
45
|
/^On .* wrote:$/i,
|
|
42
|
-
/^From:
|
|
43
|
-
|
|
44
|
-
/^
|
|
45
|
-
/^
|
|
46
|
+
/^From: .* <.*>$/i,
|
|
47
|
+
/^-----Original Message-----$/i,
|
|
48
|
+
/^________________________________$/i,
|
|
49
|
+
/^Sent from my iPhone$/i,
|
|
50
|
+
/^Sent from my Android$/i,
|
|
51
|
+
/^Get Outlook for/i,
|
|
52
|
+
/^--$/ // Standard signature separator
|
|
46
53
|
];
|
|
47
|
-
//
|
|
48
|
-
const
|
|
49
|
-
/
|
|
54
|
+
// Patterns for lines that should be stripped but NOT truncate the whole email
|
|
55
|
+
const noisePatterns = [
|
|
56
|
+
/view in browser/i,
|
|
57
|
+
/click here to view/i,
|
|
58
|
+
/legal notice/i,
|
|
59
|
+
/all rights reserved/i,
|
|
50
60
|
/privacy policy/i,
|
|
51
61
|
/terms of service/i,
|
|
52
|
-
/
|
|
53
|
-
/copyright \d{4}/i
|
|
62
|
+
/unsubscribe/i
|
|
54
63
|
];
|
|
55
64
|
for (let line of lines) {
|
|
56
65
|
let lineStripped = line.trim();
|
|
66
|
+
if (!lineStripped) {
|
|
67
|
+
cleanedLines.push("");
|
|
68
|
+
continue;
|
|
69
|
+
}
|
|
57
70
|
// 2. Quoted text removal (lines starting with >)
|
|
58
71
|
if (lineStripped.startsWith('>')) {
|
|
59
72
|
continue;
|
|
60
73
|
}
|
|
61
|
-
// 3.
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
74
|
+
// 3. Truncation check: If we hit a reply header, we stop entirely
|
|
75
|
+
let shouldTruncate = false;
|
|
76
|
+
for (const pattern of truncationPatterns) {
|
|
77
|
+
if (pattern.test(lineStripped)) {
|
|
78
|
+
shouldTruncate = true;
|
|
79
|
+
break;
|
|
80
|
+
}
|
|
65
81
|
}
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
82
|
+
if (shouldTruncate)
|
|
83
|
+
break;
|
|
84
|
+
// 4. Noise check: Strip boilerplate lines
|
|
85
|
+
let isNoise = false;
|
|
86
|
+
if (lineStripped.length < 100) {
|
|
87
|
+
for (const pattern of noisePatterns) {
|
|
70
88
|
if (pattern.test(lineStripped)) {
|
|
71
|
-
|
|
89
|
+
isNoise = true;
|
|
72
90
|
break;
|
|
73
91
|
}
|
|
74
92
|
}
|
|
75
|
-
if (isFooter) {
|
|
76
|
-
continue;
|
|
77
|
-
}
|
|
78
93
|
}
|
|
94
|
+
if (isNoise)
|
|
95
|
+
continue;
|
|
79
96
|
cleanedLines.push(line);
|
|
80
97
|
}
|
|
81
98
|
// Reassemble
|
|
82
99
|
text = cleanedLines.join('\n');
|
|
83
|
-
//
|
|
84
|
-
if (!text.trim() || text.length < 10) {
|
|
85
|
-
text = originalText.substring(0, 3000);
|
|
86
|
-
}
|
|
87
|
-
// Collapse multiple newlines
|
|
100
|
+
// Collapse whitespace
|
|
88
101
|
text = text.replace(/\n{3,}/g, '\n\n');
|
|
102
|
+
text = text.replace(/[ \t]{2,}/g, ' ');
|
|
103
|
+
// Safety Fallback: If cleaning stripped too much, return original text truncated
|
|
104
|
+
if (text.trim().length < 20 && originalText.trim().length > 20) {
|
|
105
|
+
return originalText.substring(0, 3000).trim();
|
|
106
|
+
}
|
|
89
107
|
// Sanitize LLM Special Tokens
|
|
90
108
|
text = text.replace(/<\|/g, '< |');
|
|
91
109
|
text = text.replace(/\|>/g, '| >');
|
|
92
110
|
text = text.replace(/\[INST\]/gi, '[ INST ]');
|
|
93
111
|
text = text.replace(/\[\/INST\]/gi, '[ /INST ]');
|
|
94
|
-
text = text.replace(/<s>/gi, '<s>');
|
|
95
|
-
text = text.replace(/<\/s>/gi, '</s>');
|
|
96
112
|
return text.trim();
|
|
97
113
|
}
|
|
98
114
|
}
|