npm - @realtimex/email-automator - Versions diffs - 2.6.4 → 2.7.0 - Mend

@realtimex/email-automator 2.6.4 → 2.7.0

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (18)

package/api/src/middleware/validation.ts +7 -0
package/api/src/services/intelligence.ts +232 -7
package/api/src/services/processor.ts +153 -69
package/api/src/services/supabase.ts +5 -2
package/api/src/utils/contentCleaner.ts +80 -66
package/dist/api/src/middleware/validation.js +7 -0
package/dist/api/src/services/intelligence.js +193 -2
package/dist/api/src/services/processor.js +85 -24
package/dist/api/src/utils/contentCleaner.js +74 -58
package/dist/assets/index-aTk6SbAd.js +97 -0
package/dist/assets/index-npWWfPF9.css +1 -0
package/dist/index.html +2 -2
package/package.json +1 -1
package/supabase/migrations/20260119000000_context_aware_rules.sql +44 -0
package/supabase/migrations/20260119000001_compiled_rule_context.sql +128 -0
package/supabase/migrations/20260119000002_fix_compiled_context_conditions.sql +137 -0
package/dist/assets/index-BSHZ3lFn.js +0 -97
package/dist/assets/index-CRQKk5IW.css +0 -1

package/dist/api/src/services/intelligence.js CHANGED

@@ -24,6 +24,28 @@ export const EmailAnalysisSchema = z.object({
     action_items: z.array(z.string()).optional()
         .describe('Action items mentioned in the email'),
 });
+// Context-Aware Analysis Schema - AI evaluates email against user's rules
+export const ContextAwareAnalysisSchema = z.object({
+    // Classification (kept for UI/logging)
+    summary: z.string().describe('A brief summary of the email content'),
+    category: z.enum(['spam', 'newsletter', 'promotional', 'transactional', 'social', 'support', 'client', 'internal', 'personal', 'other'])
+        .describe('The category of the email'),
+    priority: z.enum(['High', 'Medium', 'Low'])
+        .describe('The urgency of the email'),
+    // Rule Matching (core of context-aware engine)
+    matched_rule: z.object({
+        rule_id: z.string().nullable().describe('ID of the matched rule, or null if no match'),
+        rule_name: z.string().nullable().describe('Name of the matched rule'),
+        confidence: z.number().min(0).max(1).describe('Confidence score for the match (0-1)'),
+        reasoning: z.string().describe('Explanation of why this rule was matched or why no rule matched'),
+    }),
+    // Actions to execute (derived from matched rule)
+    actions_to_execute: z.array(z.enum(['none', 'delete', 'archive', 'draft', 'read', 'star']))
+        .describe('Actions to execute based on the matched rule'),
+    // Intent-aware draft content (if draft action is included)
+    draft_content: z.string().optional()
+        .describe('Generated draft reply if the action includes drafting'),
+});
 export class IntelligenceService {
     client = null;
     model = 'gpt-4o-mini';
@@ -144,7 +166,8 @@ REQUIRED JSON STRUCTURE:
                 temperature: 0.1,
             });
             rawResponse = response.choices[0]?.message?.content || '';
-            console.log('[Intelligence] Raw LLM Response received (length:', rawResponse.length, ')');
+            const usage = response.usage;
+            console.log('[Intelligence] Raw LLM Response received (length:', rawResponse.length, ')', { usage });
             // Clean the response: Find first '{' and last '}'
             let jsonStr = rawResponse.trim();
             const startIdx = jsonStr.indexOf('{');
@@ -162,7 +185,8 @@ REQUIRED JSON STRUCTURE:
             if (eventLogger && emailId) {
                 await eventLogger.analysis('Decided', emailId, {
                     ...validated,
-                    _raw_response: rawResponse
+                    _raw_response: rawResponse,
+                    usage: usage // Include token usage
                 });
             }
             return validated;
@@ -210,6 +234,173 @@ Please write a reply.`,
             return null;
         }
     }
+    /**
+     * Context-Aware Analysis: AI evaluates email against user's rules semantically
+     * This is the core of the new automation engine
+     *
+     * @param compiledRulesContext - Pre-compiled rules context string (from user_settings.compiled_rule_context)
+     *                               OR RuleContext[] for backwards compatibility
+     */
+    async analyzeEmailWithRules(content, context, compiledRulesContext, eventLogger, emailId) {
+        console.log('[Intelligence] analyzeEmailWithRules called for:', context.subject);
+        if (!this.isReady()) {
+            console.log('[Intelligence] Not ready, skipping');
+            logger.warn('Intelligence service not ready, skipping analysis');
+            if (eventLogger) {
+                await eventLogger.info('Skipped', 'AI Analysis skipped: Model not configured.', undefined, emailId);
+            }
+            return null;
+        }
+        // Prepare content
+        const cleanedContent = ContentCleaner.cleanEmailBody(content).substring(0, 2500);
+        // Use pre-compiled context if string, otherwise build from RuleContext[] (backwards compat)
+        let rulesContext;
+        let rulesCount;
+        if (typeof compiledRulesContext === 'string') {
+            // Fast path: use pre-compiled context
+            rulesContext = compiledRulesContext || '\n[No rules defined - analyze email but take no actions]\n';
+            rulesCount = (rulesContext.match(/Rule \d+/g) || []).length;
+        }
+        else {
+            // Backwards compatibility: build from RuleContext[]
+            const rules = compiledRulesContext;
+            rulesCount = rules.length;
+            rulesContext = rules.length > 0
+                ? rules.map((r, i) => `
+### Rule ${i + 1}: "${r.name}" (ID: ${r.id})
+- Description: ${r.description || 'No description provided'}
+- Intent: ${r.intent || 'General automation'}
+- Actions: ${r.actions.join(', ')}
+${r.draft_instructions ? `- Draft Instructions: "${r.draft_instructions}"` : ''}
+`).join('\n')
+                : '\n[No rules defined - analyze email but take no actions]\n';
+        }
+        const systemPrompt = `You are an AI Email Automation Agent.
+## Your Operating Rules
+The user has defined the following automation rules. Your job is to:
+1. Analyze the incoming email
+2. Determine if ANY rule semantically matches this email's context
+3. Match based on INTENT, not just keywords
+${rulesContext}
+## Category Definitions (choose the most accurate)
+- **client**: Business inquiries, RFPs, quote requests, project discussions, potential customers reaching out
+- **support**: Help requests, bug reports, technical questions from existing users
+- **internal**: Messages from colleagues, team communications
+- **transactional**: Receipts, confirmations, shipping updates, account notifications
+- **newsletter**: Subscribed content, digests, updates from services you signed up for
+- **promotional**: UNSOLICITED marketing, cold sales pitches, ads - NOT legitimate business inquiries
+- **spam**: Scams, phishing, junk mail
+- **social**: Social media notifications, friend requests
+- **personal**: Friends, family, personal matters
+- **other**: Anything that doesn't fit above
+## Matching Guidelines
+- A "decline sales" rule should match ANY sales pitch, not just ones with "sales" in the subject
+- Match the rule that best fits the USER'S INTENT
+- Only match if you are confident (>= 0.7 confidence)
+- If no rule clearly matches, return null for rule_id
+- If a matched rule includes "draft" action, generate an appropriate draft using the rule's intent
+## CRITICAL: Distinguish Between Inbound vs Outbound
+**INBOUND (Client Inquiries - NOT promotional):**
+- User is RECEIVING a request for quote/proposal/service
+- Examples: "Please send me a quote", "RFP: [project]", "Can you provide pricing", "I need a quote asap"
+- Category: client, support, or transactional (NEVER promotional)
+**OUTBOUND (Sales/Marketing - IS promotional):**
+- User is RECEIVING a sales pitch or marketing message
+- Examples: "Get a FREE quote today!", "Limited offer", "Don't miss out", "Special discount"
+- Category: promotional, spam, or newsletter
+**Key Distinction:** If someone is ASKING the user for something (quote, proposal, service), it's a CLIENT INQUIRY, not promotional content.
+## Email Context
+- Current Date: ${new Date().toISOString()}
+- Subject: ${context.subject}
+- From: ${context.sender}
+- Date: ${context.date}
+## Required JSON Response
+{
+  "summary": "Brief summary of the email",
+  "category": "spam|newsletter|promotional|transactional|social|support|client|internal|personal|other",
+  "priority": "High|Medium|Low",
+  "matched_rule": {
+    "rule_id": "UUID or null",
+    "rule_name": "Rule name or null",
+    "confidence": 0.0-1.0,
+    "reasoning": "Why this rule was or wasn't matched"
+  },
+  "actions_to_execute": ["none"] or ["archive", "read", etc.],
+  "draft_content": "Optional: draft reply if action includes 'draft'"
+}
+Return ONLY valid JSON.`;
+        // Log thinking phase
+        if (eventLogger) {
+            try {
+                await eventLogger.info('Thinking', `Context-aware analysis: ${context.subject}`, {
+                    model: this.model,
+                    system_prompt: systemPrompt,
+                    content_preview: cleanedContent,
+                    rules_count: rulesCount,
+                }, emailId);
+            }
+            catch (err) {
+                console.error('[Intelligence] Failed to log thinking event:', err);
+            }
+        }
+        let rawResponse = '';
+        try {
+            const response = await this.client.chat.completions.create({
+                model: this.model,
+                messages: [
+                    { role: 'system', content: systemPrompt },
+                    { role: 'user', content: cleanedContent || '[Empty email body]' },
+                ],
+                temperature: 0.1,
+            });
+            rawResponse = response.choices[0]?.message?.content || '';
+            const usage = response.usage;
+            console.log('[Intelligence] Context-aware response received (length:', rawResponse.length, ')', { usage });
+            // Parse JSON from response
+            let jsonStr = rawResponse.trim();
+            const startIdx = jsonStr.indexOf('{');
+            const endIdx = jsonStr.lastIndexOf('}');
+            if (startIdx === -1 || endIdx === -1) {
+                throw new Error('Response did not contain a valid JSON object');
+            }
+            jsonStr = jsonStr.substring(startIdx, endIdx + 1);
+            const parsed = JSON.parse(jsonStr);
+            const validated = ContextAwareAnalysisSchema.parse(parsed);
+            logger.debug('Context-aware analysis complete', {
+                matched_rule: validated.matched_rule.rule_name,
+                confidence: validated.matched_rule.confidence,
+                actions: validated.actions_to_execute,
+            });
+            if (eventLogger && emailId) {
+                await eventLogger.analysis('Decided', emailId, {
+                    ...validated,
+                    _raw_response: rawResponse,
+                    usage: usage // Include token usage
+                });
+            }
+            return validated;
+        }
+        catch (error) {
+            console.error('[Intelligence] Context-aware analysis failed:', error);
+            if (eventLogger) {
+                await eventLogger.error('Error', {
+                    error: error instanceof Error ? error.message : String(error),
+                    raw_response: rawResponse || 'No response received from LLM'
+                }, emailId);
+            }
+            return null;
+        }
+    }
     async testConnection() {
         if (!this.isReady()) {
             return { success: false, message: 'Intelligence service not initialized. Check your API Key.' };

package/dist/api/src/services/processor.js CHANGED

@@ -454,7 +454,66 @@ export class EmailProcessorService {
                 autoSubmitted: parsed.headers.get('auto-submitted')?.toString(),
                 mailer: parsed.headers.get('x-mailer')?.toString()
             };
-            // 3. Analyze with AI
+            // 3. Fetch account for action execution
+            const { data: account } = await this.supabase
+                .from('email_accounts')
+                .select('*')
+                .eq('id', email.account_id)
+                .single();
+            // 4. Fetch pre-compiled rule context (fast path - no loop/formatting)
+            // Falls back to building context if not cached
+            let compiledContext = settings?.compiled_rule_context || null;
+            // Fetch rules for action execution (need attachments, instructions)
+            const { data: rules } = await this.supabase
+                .from('rules')
+                .select('*')
+                .eq('user_id', userId)
+                .eq('is_enabled', true)
+                .order('priority', { ascending: false });
+            // Fallback: build context if not pre-compiled
+            if (!compiledContext && rules && rules.length > 0) {
+                compiledContext = rules.map((r, i) => {
+                    // Build human-readable condition text
+                    let conditionText = '';
+                    if (r.condition) {
+                        const cond = r.condition;
+                        if (cond.field) {
+                            conditionText = `When ${cond.field}`;
+                            if (cond.operator === 'equals') {
+                                conditionText += ` equals "${cond.value}"`;
+                            }
+                            else if (cond.operator === 'contains') {
+                                conditionText += ` contains "${cond.value}"`;
+                            }
+                            else if (cond.operator === 'domain_equals') {
+                                conditionText += ` domain equals "${cond.value}"`;
+                            }
+                            else {
+                                conditionText += ` ${cond.operator} "${cond.value}"`;
+                            }
+                        }
+                        if (cond.is_useless === true) {
+                            conditionText += (conditionText ? ' AND ' : 'When ') + 'email is useless/low-value';
+                        }
+                        if (cond.ai_priority) {
+                            conditionText += (conditionText ? ' AND ' : 'When ') + `AI priority is "${cond.ai_priority}"`;
+                        }
+                        // Extract older_than_days from condition JSONB
+                        if (cond.older_than_days) {
+                            conditionText += (conditionText ? ' AND ' : 'When ') + `email is older than ${cond.older_than_days} days`;
+                        }
+                    }
+                    return `Rule ${i + 1} [ID: ${r.id}]\n` +
+                        `  Name: ${r.name}\n` +
+                        (r.description ? `  Description: ${r.description}\n` : '') +
+                        (r.intent ? `  Intent: ${r.intent}\n` : '') +
+                        (conditionText ? `  Condition: ${conditionText}\n` : '') +
+                        `  Actions: ${r.actions?.join(', ') || r.action || 'none'}\n` +
+                        (r.instructions ? `  Draft Instructions: ${r.instructions}\n` : '') +
+                        '\n';
+                }).join('');
+            }
+            // 5. Context-Aware Analysis: AI evaluates email against user's rules
             const intelligenceService = getIntelligenceService(settings?.llm_model || settings?.llm_base_url || settings?.llm_api_key
                 ? {
                     model: settings.llm_model,
@@ -462,7 +521,7 @@ export class EmailProcessorService {
                     apiKey: settings.llm_api_key,
                 }
                 : undefined);
-            const analysis = await intelligenceService.analyzeEmail(cleanContent, {
+            const analysis = await intelligenceService.analyzeEmailWithRules(cleanContent, {
                 subject: email.subject || '',
                 sender: email.sender || '',
                 date: email.date || '',
@@ -471,39 +530,41 @@ export class EmailProcessorService {
                     autoTrashSpam: settings?.auto_trash_spam,
                     smartDrafts: settings?.smart_drafts,
                 },
-            }, eventLogger || undefined, email.id);
+            }, compiledContext || '', // Pre-compiled context (fast path)
+            eventLogger || undefined, email.id);
             if (!analysis) {
                 throw new Error('AI analysis returned no result');
             }
-            // 4. Update the email record with results
+            // 6. Update the email record with context-aware results
             await this.supabase
                 .from('emails')
                 .update({
                 category: analysis.category,
-                is_useless: analysis.is_useless,
                 ai_analysis: analysis,
-                suggested_actions: analysis.suggested_actions || [],
-                suggested_action: analysis.suggested_actions?.[0] || 'none',
+                suggested_actions: analysis.actions_to_execute || [],
+                suggested_action: analysis.actions_to_execute?.[0] || 'none',
+                matched_rule_id: analysis.matched_rule.rule_id,
+                matched_rule_confidence: analysis.matched_rule.confidence,
                 processing_status: 'completed'
             })
                 .eq('id', email.id);
-            // 5. Execute automation rules
-            // Fetch account and rules needed for execution
-            const { data: account } = await this.supabase
-                .from('email_accounts')
-                .select('*')
-                .eq('id', email.account_id)
-                .single();
-            const { data: rules } = await this.supabase
-                .from('rules')
-                .select('*')
-                .eq('user_id', userId)
-                .eq('is_enabled', true);
-            if (account && rules) {
-                const tempResult = { processed: 0, deleted: 0, drafted: 0, errors: 0 };
-                // Ensure email object for rules has the analysis fields merged in
-                const emailForRules = { ...email, ...analysis };
-                await this.executeRules(account, emailForRules, analysis, rules, settings, tempResult, eventLogger);
+            // 7. Execute actions if rule matched with sufficient confidence
+            if (account && analysis.matched_rule.rule_id && analysis.matched_rule.confidence >= 0.7) {
+                const matchedRule = rules?.find(r => r.id === analysis.matched_rule.rule_id);
+                if (eventLogger) {
+                    await eventLogger.info('Rule Matched', `"${analysis.matched_rule.rule_name}" matched with ${(analysis.matched_rule.confidence * 100).toFixed(0)}% confidence`, { reasoning: analysis.matched_rule.reasoning }, email.id);
+                }
+                // Execute each action from the AI's decision
+                for (const action of analysis.actions_to_execute) {
+                    if (action === 'none')
+                        continue;
+                    // Use AI-generated draft content if available
+                    const draftContent = action === 'draft' ? analysis.draft_content : undefined;
+                    await this.executeAction(account, email, action, draftContent, eventLogger, `Rule: ${matchedRule?.name || analysis.matched_rule.rule_name}`, matchedRule?.attachments);
+                }
+            }
+            else if (eventLogger && rules && rules.length > 0) {
+                await eventLogger.info('No Match', analysis.matched_rule.reasoning, { confidence: analysis.matched_rule.confidence }, email.id);
             }
             // Mark log as success
             if (log) {

package/dist/api/src/utils/contentCleaner.js CHANGED

@@ -1,98 +1,114 @@
 export class ContentCleaner {
     /**
      * Cleans email body by removing noise, quoted replies, and footers.
-     * Ported from Python ContentCleaner.
+     * optimized for LLM processing.
      */
     static cleanEmailBody(text) {
         if (!text)
             return "";
         const originalText = text;
-        // 0. Lightweight HTML -> Markdown Conversion
-        // Structure: <br>, <p> -> Newlines
-        text = text.replace(/<br\s*\/?>/gi, '\n');
-        text = text.replace(/<\/p>/gi, '\n\n');
-        text = text.replace(/<p.*?>/gi, ''); // Open p tags just gone
-        // Structure: Headers <h1>-<h6> -> # Title
-        text = text.replace(/<h[1-6].*?>(.*?)<\/h[1-6]>/gsi, (match, p1) => `\n# ${p1}\n`);
-        // Structure: Lists <li> -> - Item
-        text = text.replace(/<li.*?>(.*?)<\/li>/gsi, (match, p1) => `\n- ${p1}`);
-        text = text.replace(/<ul.*?>/gi, '');
-        text = text.replace(/<\/ul>/gi, '\n');
-        // Links: <a href=\"...\">text</a> -> [text](href)
-        text = text.replace(/<a\s+(?:[^>]*?\s+)?href=\"([^\"]*)\"[^>]*>(.*?)<\/a>/gsi, (match, href, content) => `[${content}](${href})`);
-        // Images: <img src=\"...\" alt=\"...\"> -> ![alt](src)
-        text = text.replace(/<img\s+(?:[^>]*?\s+)?src=\"([^\"]*)\"(?:[^>]*?\s+)?alt=\"([^\"]*)\"[^>]*>/gsi, (match, src, alt) => `![${alt}](${src})`);
-        // Style/Script removal (strictly remove content)
-        text = text.replace(/<script.*?>.*?<\/script>/gsi, '');
-        text = text.replace(/<style.*?>.*?<\/style>/gsi, '');
-        // Final Strip of remaining tags
-        text = text.replace(/<[^>]+>/g, ' ');
-        // Entity decoding (Basic)
-        text = text.replace(/&nbsp;/gi, ' ');
-        text = text.replace(/&amp;/gi, '&');
-        text = text.replace(/&lt;/gi, '<');
-        text = text.replace(/&gt;/gi, '>');
-        text = text.replace(/&quot;/gi, '"');
-        text = text.replace(/&#39;/gi, "'");
+        // 1. Detect if content is actually HTML
+        const isHtml = /<[a-z][\s\S]*>/i.test(text);
+        if (isHtml) {
+            // Lightweight HTML -> Markdown Conversion
+            // Structure: <br>, <p> -> Newlines
+            text = text.replace(/<br\s*\/?>/gi, '\n');
+            text = text.replace(/<\/p>/gi, '\n\n');
+            text = text.replace(/<p.*?>/gi, '');
+            // Structure: Headers <h1>-<h6> -> # Title
+            text = text.replace(/<h[1-6].*?>(.*?)<\/h[1-6]>/gsi, (match, p1) => `\n# ${p1}\n`);
+            // Structure: Lists <li> -> - Item
+            text = text.replace(/<li.*?>(.*?)<\/li>/gsi, (match, p1) => `\n- ${p1}`);
+            text = text.replace(/<ul.*?>/gi, '');
+            text = text.replace(/<\/ul>/gi, '\n');
+            // Links: <a href=\"...\">text</a> -> [text](href)
+            text = text.replace(/<a\s+(?:[^>]*?\s+)?href=\"([^\"]*)\"[^>]*>(.*?)<\/a>/gsi, (match, href, content) => `[${content}](${href})`);
+            // Images: <img src=\"...\" alt=\"...\"> -> ![alt](src)
+            text = text.replace(/<img\s+(?:[^>]*?\s+)?src=\"([^\"]*)\"(?:[^>]*?\s+)?alt=\"([^\"]*)\"[^>]*>/gsi, (match, src, alt) => `![${alt}](${src})`);
+            // Style/Script removal (strictly remove content)
+            text = text.replace(/<script.*?>.*?<\/script>/gsi, '');
+            text = text.replace(/<style.*?>.*?<\/style>/gsi, '');
+            // Final Strip of remaining tags
+            text = text.replace(/<[^>]+>/g, ' ');
+            // Entity decoding (Basic)
+            text = text.replace(/&nbsp;/gi, ' ');
+            text = text.replace(/&amp;/gi, '&');
+            text = text.replace(/&lt;/gi, '<');
+            text = text.replace(/&gt;/gi, '>');
+            text = text.replace(/&quot;/gi, '"');
+            text = text.replace(/&#39;/gi, "'");
+        }
         const lines = text.split('\n');
         const cleanedLines = [];
-        // Heuristics for reply headers
-        const replyHeaderPatterns = [
+        // Patterns that usually mark the START of a reply chain or a generic footer
+        const truncationPatterns = [
             /^On .* wrote:$/i,
-            /^From: .*$/i,
-            /^Sent: .*$/i,
-            /^To: .*$/i,
-            /^Subject: .*$/i
+            /^From: .* <.*>$/i,
+            /^-----Original Message-----$/i,
+            /^________________________________$/i,
+            /^Sent from my iPhone$/i,
+            /^Sent from my Android$/i,
+            /^Get Outlook for/i,
+            /^--$/ // Standard signature separator
         ];
-        // Heuristics for footers
-        const footerPatterns = [
-            /unsubscribe/i,
+        // Patterns for lines that should be stripped but NOT truncate the whole email
+        const noisePatterns = [
+            /view in browser/i,
+            /click here to view/i,
+            /legal notice/i,
+            /all rights reserved/i,
             /privacy policy/i,
             /terms of service/i,
-            /view in browser/i,
-            /copyright \d{4}/i
+            /unsubscribe/i
         ];
         for (let line of lines) {
             let lineStripped = line.trim();
+            if (!lineStripped) {
+                cleanedLines.push("");
+                continue;
+            }
             // 2. Quoted text removal (lines starting with >)
             if (lineStripped.startsWith('>')) {
                 continue;
             }
-            // 3. Check for specific reply separators
-            // If we hit a reply header, we truncate the rest
-            if (/^On .* wrote:$/i.test(lineStripped)) {
-                break;
+            // 3. Truncation check: If we hit a reply header, we stop entirely
+            let shouldTruncate = false;
+            for (const pattern of truncationPatterns) {
+                if (pattern.test(lineStripped)) {
+                    shouldTruncate = true;
+                    break;
+                }
             }
-            // 4. Footer removal (only on very short lines to avoid stripping body content)
-            if (lineStripped.length < 60) {
-                let isFooter = false;
-                for (const pattern of footerPatterns) {
+            if (shouldTruncate)
+                break;
+            // 4. Noise check: Strip boilerplate lines
+            let isNoise = false;
+            if (lineStripped.length < 100) {
+                for (const pattern of noisePatterns) {
                     if (pattern.test(lineStripped)) {
-                        isFooter = true;
+                        isNoise = true;
                         break;
                     }
                 }
-                if (isFooter) {
-                    continue;
-                }
             }
+            if (isNoise)
+                continue;
             cleanedLines.push(line);
         }
         // Reassemble
         text = cleanedLines.join('\n');
-        // Safety Fallback: If cleaning stripped everything, return original (truncated)
-        if (!text.trim() || text.length < 10) {
-            text = originalText.substring(0, 3000);
-        }
-        // Collapse multiple newlines
+        // Collapse whitespace
         text = text.replace(/\n{3,}/g, '\n\n');
+        text = text.replace(/[ \t]{2,}/g, ' ');
+        // Safety Fallback: If cleaning stripped too much, return original text truncated
+        if (text.trim().length < 20 && originalText.trim().length > 20) {
+            return originalText.substring(0, 3000).trim();
+        }
         // Sanitize LLM Special Tokens
         text = text.replace(/<\|/g, '< |');
         text = text.replace(/\|>/g, '| >');
         text = text.replace(/\[INST\]/gi, '[ INST ]');
         text = text.replace(/\[\/INST\]/gi, '[ /INST ]');
-        text = text.replace(/<s>/gi, '&lt;s&gt;');
-        text = text.replace(/<\/s>/gi, '&lt;/s&gt;');
         return text.trim();
     }
 }