@realtimex/email-automator 2.4.5 → 2.6.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,88 @@
1
+ import fs from 'fs/promises';
2
+ import path from 'path';
3
+ import os from 'os';
4
+ import { createLogger } from '../utils/logger.js';
5
+
6
+ const logger = createLogger('StorageService');
7
+
8
/**
 * Filesystem-backed store for raw email content.
 *
 * Files are written beneath a storage directory: either a caller-supplied
 * path or the default `<cwd>/data/emails`.
 */
export class StorageService {
  private readonly defaultPath: string;

  constructor() {
    // Project-relative default, resolved once against the working directory
    // at construction time.
    this.defaultPath = path.resolve(process.cwd(), 'data', 'emails');
  }

  /**
   * Ensures the storage directory exists and is writable.
   *
   * @param customPath Directory to use; `null`, `undefined` or `''` selects the default.
   * @returns The directory that was validated (exactly as supplied / defaulted).
   * @throws Error when the directory cannot be created or written to.
   */
  async ensureDirectory(customPath?: string | null): Promise<string> {
    // `||` rather than `??` on purpose: an empty string also falls back to the default.
    const targetPath = customPath || this.defaultPath;
    try {
      await fs.mkdir(targetPath, { recursive: true });
      // Probe writability with a per-call unique file name. A fixed name lets
      // concurrent callers unlink each other's probe, which would surface as a
      // spurious "not writable" failure.
      const probeName = `.write_test-${process.pid}-${Date.now()}-${Math.random().toString(36).slice(2)}`;
      const probeFile = path.join(targetPath, probeName);
      await fs.writeFile(probeFile, 'ok');
      await fs.unlink(probeFile);
      return targetPath;
    } catch (error) {
      logger.error('Storage directory validation failed', error, { targetPath });
      throw new Error(`Storage path "${targetPath}" is not accessible or writable.`);
    }
  }

  /**
   * Saves raw email content to disk.
   *
   * @param content Raw email text, written as UTF-8.
   * @param filename File name relative to the storage directory.
   * @param customPath Optional storage directory override (see `ensureDirectory`).
   * @returns The absolute path to the saved file.
   * @throws Error when `filename` resolves to the storage directory itself or
   *         to a location outside it; rethrows any filesystem error from the write.
   */
  async saveEmail(content: string, filename: string, customPath?: string | null): Promise<string> {
    // Resolve so the returned path is absolute even for a relative customPath.
    const baseDir = path.resolve(await this.ensureDirectory(customPath));
    const filePath = path.join(baseDir, filename);

    // Containment check: reject names such as "../x" that would escape baseDir.
    const relative = path.relative(baseDir, filePath);
    const escapesBase =
      relative === '' ||
      relative === '..' ||
      relative.startsWith(`..${path.sep}`) ||
      path.isAbsolute(relative);
    if (escapesBase) {
      const invalid = new Error(
        `Invalid email filename "${filename}": must resolve to a file inside the storage directory.`
      );
      logger.error('Failed to save email to disk', invalid, { filePath });
      throw invalid;
    }

    try {
      await fs.writeFile(filePath, content, 'utf8');
      logger.debug('Email saved to disk', { filePath });
      return filePath;
    } catch (error) {
      logger.error('Failed to save email to disk', error, { filePath });
      throw error;
    }
  }

  /**
   * Reads email content from disk as UTF-8.
   *
   * @param filePath Path of the stored email file.
   * @throws Rethrows the filesystem error (after logging it) when the read fails.
   */
  async readEmail(filePath: string): Promise<string> {
    try {
      return await fs.readFile(filePath, 'utf8');
    } catch (error) {
      logger.error('Failed to read email from disk', error, { filePath });
      throw error;
    }
  }

  /**
   * Deletes an email file from disk on a best-effort basis.
   *
   * Never throws: a missing file is ignored silently, any other failure is
   * logged as a warning.
   *
   * @param filePath Path of the stored email file.
   */
  async deleteEmail(filePath: string): Promise<void> {
    try {
      await fs.unlink(filePath);
      logger.debug('Email deleted from disk', { filePath });
    } catch (error) {
      // Narrow the unknown error instead of casting to `any`.
      const code =
        typeof error === 'object' && error !== null && 'code' in error
          ? (error as { code?: unknown }).code
          : undefined;
      // If the file doesn't exist there is nothing left to delete.
      if (code !== 'ENOENT') {
        logger.warn('Failed to delete email from disk', { error, filePath });
      }
    }
  }
}
80
+
81
/** Module-wide StorageService instance; stays `null` until first requested. */
let storageService: StorageService | null = null;

/**
 * Returns the shared StorageService, constructing it on the first call and
 * handing back that same instance on every later call.
 */
export function getStorageService(): StorageService {
  storageService ??= new StorageService();
  return storageService;
}
@@ -103,6 +103,12 @@ export interface Email {
103
103
  action_taken: string | null; // Deprecated
104
104
  actions_taken?: string[];
105
105
  created_at: string;
106
+ email_accounts?: EmailAccount;
107
+ // ETL fields
108
+ file_path?: string | null;
109
+ processing_status: 'pending' | 'processing' | 'completed' | 'failed';
110
+ processing_error?: string | null;
111
+ retry_count: number;
106
112
  }
107
113
 
108
114
  export interface Rule {
@@ -5,11 +5,12 @@ export class ContentCleaner {
5
5
  */
6
6
  static cleanEmailBody(text: string): string {
7
7
  if (!text) return "";
8
+ const originalText = text;
8
9
 
9
10
  // 0. Lightweight HTML -> Markdown Conversion
10
11
 
11
12
  // Structure: <br>, <p> -> Newlines
12
- text = text.replace(/<br\s*\/?\?>/gi, '\n');
13
+ text = text.replace(/<br\s*\/?>/gi, '\n');
13
14
  text = text.replace(/<\/p>/gi, '\n\n');
14
15
  text = text.replace(/<p.*?>/gi, ''); // Open p tags just gone
15
16
 
@@ -72,13 +73,13 @@ export class ContentCleaner {
72
73
  }
73
74
 
74
75
  // 3. Check for specific reply separators
75
- // If we hit a reply header, we truncate the rest (Aggressive strategy per Python code)
76
+ // If we hit a reply header, we truncate the rest
76
77
  if (/^On .* wrote:$/i.test(lineStripped)) {
77
78
  break;
78
79
  }
79
80
 
80
- // 4. Footer removal (simple check on short lines)
81
- if (lineStripped.length < 100) {
81
+ // 4. Footer removal (only on very short lines to avoid stripping body content)
82
+ if (lineStripped.length < 60) {
82
83
  let isFooter = false;
83
84
  for (const pattern of footerPatterns) {
84
85
  if (pattern.test(lineStripped)) {
@@ -97,11 +98,15 @@ export class ContentCleaner {
97
98
  // Reassemble
98
99
  text = cleanedLines.join('\n');
99
100
 
101
+ // Safety Fallback: If cleaning stripped everything, return original (truncated)
102
+ if (!text.trim() || text.length < 10) {
103
+ text = originalText.substring(0, 3000);
104
+ }
105
+
100
106
  // Collapse multiple newlines
101
107
  text = text.replace(/\n{3,}/g, '\n\n');
102
108
 
103
- // Sanitize LLM Special Tokens (Prevent Prompt Injection/Confusion)
104
- // Break sequences like <|channel|>, [INST], <s>
109
+ // Sanitize LLM Special Tokens
105
110
  text = text.replace(/<\|/g, '< |');
106
111
  text = text.replace(/\|>/g, '| >');
107
112
  text = text.replace(/\[INST\]/gi, '[ INST ]');
@@ -3,6 +3,7 @@ import { asyncHandler, NotFoundError } from '../middleware/errorHandler.js';
3
3
  import { authMiddleware } from '../middleware/auth.js';
4
4
  import { apiRateLimit } from '../middleware/rateLimit.js';
5
5
  import { createLogger } from '../utils/logger.js';
6
+ import { getStorageService } from '../services/storage.js';
6
7
  const router = Router();
7
8
  const logger = createLogger('EmailsRoutes');
8
9
  // List emails with pagination and filters
@@ -64,19 +65,44 @@ router.get('/:emailId', authMiddleware, asyncHandler(async (req, res) => {
64
65
  }
65
66
  res.json({ email: data });
66
67
  }));
68
// Get raw email content (.eml)
// Looks up the email row (scoped to the authenticated user's accounts), reads
// the stored file through the storage service and sends it as a download.
router.get('/:emailId/raw', authMiddleware, asyncHandler(async (req, res) => {
    const { emailId } = req.params;
    const { data: email, error } = await req.supabase
        .from('emails')
        .select('file_path, subject, email_accounts!inner(user_id)')
        .eq('id', emailId)
        .eq('email_accounts.user_id', req.user.id)
        .single();
    if (error || !email || !email.file_path) {
        throw new NotFoundError('Raw Email');
    }
    const storageService = getStorageService();
    let content;
    try {
        content = await storageService.readEmail(email.file_path);
    }
    catch (readError) {
        // The DB row can reference a file that is no longer on disk; report
        // that as "not found" instead of letting it surface as a generic failure.
        if (readError && readError.code === 'ENOENT') {
            throw new NotFoundError('Raw Email');
        }
        throw readError;
    }
    // Restrict the download name to [A-Za-z0-9._-] and cap its length so an
    // arbitrarily long subject cannot produce an oversized header value.
    const MAX_BASENAME_LENGTH = 100;
    const baseName = `${email.subject || 'email'}`
        .replace(/[^a-z0-9._-]/gi, '_')
        .slice(0, MAX_BASENAME_LENGTH);
    const filename = `${baseName}.eml`;
    res.setHeader('Content-Type', 'message/rfc822');
    res.setHeader('Content-Disposition', `attachment; filename="${filename}"`);
    res.send(content);
}));
67
87
  // Delete email record (not the actual email from provider)
68
88
  router.delete('/:emailId', apiRateLimit, authMiddleware, asyncHandler(async (req, res) => {
69
89
  const { emailId } = req.params;
70
90
  // Verify ownership first
71
91
  const { data: email } = await req.supabase
72
92
  .from('emails')
73
- .select('id, email_accounts!inner(user_id)')
93
+ .select('id, file_path, email_accounts!inner(user_id)')
74
94
  .eq('id', emailId)
75
95
  .eq('email_accounts.user_id', req.user.id)
76
96
  .single();
77
97
  if (!email) {
78
98
  throw new NotFoundError('Email');
79
99
  }
100
+ // 1. Delete from disk
101
+ if (email.file_path) {
102
+ const storageService = getStorageService();
103
+ await storageService.deleteEmail(email.file_path);
104
+ }
105
+ // 2. Delete from DB
80
106
  const { error } = await req.supabase
81
107
  .from('emails')
82
108
  .delete()
@@ -46,7 +46,11 @@ router.post('/', apiRateLimit, authMiddleware, validateBody(schemas.createRule),
46
46
  // Update rule
47
47
  router.patch('/:ruleId', apiRateLimit, authMiddleware, validateParams(z.object({ ruleId: schemas.uuid })), validateBody(schemas.updateRule), asyncHandler(async (req, res) => {
48
48
  const { ruleId } = req.params;
49
- const updates = req.body;
49
+ const updates = { ...req.body };
50
+ // Ensure legacy action is in sync if actions array is provided
51
+ if (updates.actions && Array.isArray(updates.actions) && updates.actions.length > 0) {
52
+ updates.action = updates.actions[0];
53
+ }
50
54
  const { data, error } = await req.supabase
51
55
  .from('rules')
52
56
  .update(updates)
@@ -58,7 +62,7 @@ router.patch('/:ruleId', apiRateLimit, authMiddleware, validateParams(z.object({
58
62
  throw error;
59
63
  if (!data)
60
64
  throw new NotFoundError('Rule');
61
- logger.info('Rule updated', { ruleId, userId: req.user.id });
65
+ logger.info('Rule updated', { ruleId, actions: data.actions, userId: req.user.id });
62
66
  res.json({ rule: data });
63
67
  }));
64
68
  // Delete rule
@@ -113,182 +113,93 @@ export class GmailService {
113
113
  throw error;
114
114
  return data;
115
115
  }
116
- async fetchMessages(account, options = {}) {
117
- const gmail = await this.getAuthenticatedClient(account);
118
- const { maxResults = config.processing.batchSize, query, pageToken } = options;
119
- const response = await gmail.users.messages.list({
120
- userId: 'me',
121
- maxResults,
122
- q: query,
123
- pageToken,
124
- });
125
- const messages = [];
126
- for (const msg of response.data.messages || []) {
127
- if (!msg.id)
128
- continue;
129
- try {
130
- const detail = await gmail.users.messages.get({
131
- userId: 'me',
132
- id: msg.id,
133
- format: 'full',
134
- });
135
- const parsed = this.parseMessage(detail.data);
136
- if (parsed) {
137
- messages.push(parsed);
138
- }
139
- }
140
- catch (error) {
141
- logger.warn('Failed to fetch message details', { messageId: msg.id, error });
142
- }
143
- }
144
- return {
145
- messages,
146
- nextPageToken: response.data.nextPageToken ?? undefined,
147
- };
148
- }
149
116
  /**
150
- * Fetch messages in OLDEST-FIRST order using "Fetch IDs → Sort → Hydrate" strategy.
117
+ * Fetch messages in OLDEST-FIRST order using "Fetch IDs → Reverse → Hydrate" strategy.
151
118
  *
152
- * Gmail API always returns newest first and doesn't support sorting.
153
- * To process oldest emails first (critical for checkpoint-based sync), we:
154
- * 1. Fetch ALL message IDs matching the query (lightweight, paginated)
155
- * 2. Sort by internalDate ascending (oldest first)
119
+ * Gmail API always returns newest first. To process absolute oldest emails first:
120
+ * 1. Fetch ALL message IDs matching the query (lightweight)
121
+ * 2. Reverse the list (turning Newest-First into Oldest-First)
156
122
  * 3. Take first N messages (limit)
157
- * 4. Hydrate only those N messages with full details
158
- *
159
- * This ensures we never skip emails when using max_emails pagination.
123
+ * 4. Hydrate ONLY those N messages
160
124
  */
161
125
  async fetchMessagesOldestFirst(account, options) {
162
- const { limit, query, maxIdsToFetch = 1000 } = options;
163
- // Step 1: Fetch all message IDs (lightweight)
164
- const allIds = await this.fetchAllMessageIds(account, query, maxIdsToFetch);
126
+ const { limit, query } = options;
127
+ // Step 1: Fetch IDs (No hydration yet, so this is fast)
128
+ const allIds = await this.fetchAllMessageIds(account, query);
165
129
  if (allIds.length === 0) {
166
130
  return { messages: [], hasMore: false };
167
131
  }
168
- logger.debug('Fetched message IDs', { count: allIds.length, query });
169
- // Step 2: Sort by internalDate ascending (oldest first)
170
- allIds.sort((a, b) => parseInt(a.internalDate) - parseInt(b.internalDate));
171
- // Step 3: Take first N IDs
132
+ // Step 2: Reverse to get oldest first
133
+ allIds.reverse();
134
+ // Step 3: Take the window we need
172
135
  const idsToHydrate = allIds.slice(0, limit);
173
136
  const hasMore = allIds.length > limit;
174
- // Step 4: Hydrate those specific messages
175
- const messages = await this.hydrateMessages(account, idsToHydrate.map(m => m.id));
176
- // Re-sort hydrated messages by internalDate (maintain order)
177
- messages.sort((a, b) => parseInt(a.internalDate) - parseInt(b.internalDate));
137
+ logger.debug('Hydrating oldest emails', { totalFound: allIds.length, hydrating: idsToHydrate.length });
138
+ // Step 4: Hydrate only the target messages
139
+ const messages = await this.hydrateMessages(account, idsToHydrate);
178
140
  return { messages, hasMore };
179
141
  }
180
142
  /**
181
143
  * Fetch all message IDs matching a query (lightweight, paginated).
182
- * Uses minimal fields for speed: only id and internalDate.
144
+ * Collects IDs only to remain fast even for large result sets.
183
145
  */
184
- async fetchAllMessageIds(account, query, maxIds) {
146
+ async fetchAllMessageIds(account, query) {
185
147
  const gmail = await this.getAuthenticatedClient(account);
186
- const results = [];
148
+ const allIds = [];
187
149
  let pageToken;
150
+ const MAX_IDS = 5000; // Efficient chunk size for finding the "bottom" of recent emails
188
151
  do {
189
152
  const response = await gmail.users.messages.list({
190
153
  userId: 'me',
191
154
  q: query,
192
155
  pageToken,
193
156
  maxResults: 500, // Max allowed per page
194
- // Note: messages.list only returns id and threadId, not internalDate
195
- // We need to fetch internalDate separately with minimal format
196
157
  });
197
158
  const messageRefs = response.data.messages || [];
198
- // Fetch internalDate for each message (using metadata format for speed)
199
159
  for (const ref of messageRefs) {
200
- if (!ref.id || results.length >= maxIds)
201
- break;
202
- try {
203
- const msg = await gmail.users.messages.get({
204
- userId: 'me',
205
- id: ref.id,
206
- format: 'minimal', // Only returns id, threadId, labelIds, snippet, internalDate
207
- });
208
- if (msg.data.id && msg.data.internalDate) {
209
- results.push({
210
- id: msg.data.id,
211
- internalDate: msg.data.internalDate,
212
- });
213
- }
214
- }
215
- catch (error) {
216
- logger.warn('Failed to fetch message metadata', { messageId: ref.id });
217
- }
160
+ if (ref.id)
161
+ allIds.push(ref.id);
218
162
  }
219
163
  pageToken = response.data.nextPageToken ?? undefined;
220
- } while (pageToken && results.length < maxIds);
221
- return results;
164
+ } while (pageToken && allIds.length < MAX_IDS);
165
+ logger.info('Collected matching message IDs', { total: allIds.length, query });
166
+ return allIds;
222
167
  }
223
168
  /**
224
- * Hydrate specific messages by ID (fetch full details).
169
+ * Hydrate specific messages by ID (fetch raw RFC822 data).
225
170
  */
226
171
  async hydrateMessages(account, messageIds) {
227
172
  const gmail = await this.getAuthenticatedClient(account);
228
173
  const messages = [];
229
- for (const id of messageIds) {
230
- try {
231
- const detail = await gmail.users.messages.get({
232
- userId: 'me',
233
- id,
234
- format: 'full',
235
- });
236
- const parsed = this.parseMessage(detail.data);
237
- if (parsed) {
238
- messages.push(parsed);
174
+ // Hydrate in small parallel batches to avoid rate limits
175
+ const BATCH_SIZE = 10;
176
+ for (let i = 0; i < messageIds.length; i += BATCH_SIZE) {
177
+ const batch = messageIds.slice(i, i + BATCH_SIZE);
178
+ const hydrated = await Promise.all(batch.map(async (id) => {
179
+ try {
180
+ const detail = await gmail.users.messages.get({
181
+ userId: 'me',
182
+ id,
183
+ format: 'raw',
184
+ });
185
+ if (detail.data.raw) {
186
+ return {
187
+ id: detail.data.id,
188
+ threadId: detail.data.threadId,
189
+ internalDate: detail.data.internalDate,
190
+ raw: detail.data.raw
191
+ };
192
+ }
239
193
  }
240
- }
241
- catch (error) {
242
- logger.warn('Failed to hydrate message', { messageId: id, error });
243
- }
194
+ catch (error) {
195
+ logger.warn('Failed to hydrate message', { messageId: id, error });
196
+ }
197
+ return null;
198
+ }));
199
+ messages.push(...hydrated.filter((m) => m !== null));
244
200
  }
245
201
  return messages;
246
202
  }
247
- parseMessage(message) {
248
- if (!message.id || !message.threadId)
249
- return null;
250
- const headers = message.payload?.headers || [];
251
- const getHeader = (name) => headers.find(h => h.name?.toLowerCase() === name.toLowerCase())?.value || '';
252
- let body = '';
253
- const payload = message.payload;
254
- if (payload?.parts) {
255
- // Multipart message
256
- const textPart = payload.parts.find(p => p.mimeType === 'text/plain');
257
- const htmlPart = payload.parts.find(p => p.mimeType === 'text/html');
258
- const part = textPart || htmlPart || payload.parts[0];
259
- body = this.decodeBody(part?.body?.data);
260
- }
261
- else if (payload?.body?.data) {
262
- body = this.decodeBody(payload.body.data);
263
- }
264
- return {
265
- id: message.id,
266
- threadId: message.threadId,
267
- subject: getHeader('Subject') || 'No Subject',
268
- sender: getHeader('From'),
269
- recipient: getHeader('To'),
270
- date: getHeader('Date'),
271
- internalDate: message.internalDate || '', // Gmail's internal timestamp (ms since epoch)
272
- body,
273
- snippet: message.snippet || '',
274
- headers: {
275
- importance: getHeader('Importance') || getHeader('X-Priority'),
276
- listUnsubscribe: getHeader('List-Unsubscribe'),
277
- autoSubmitted: getHeader('Auto-Submitted'),
278
- mailer: getHeader('X-Mailer'),
279
- }
280
- };
281
- }
282
- decodeBody(data) {
283
- if (!data)
284
- return '';
285
- try {
286
- return Buffer.from(data, 'base64').toString('utf-8');
287
- }
288
- catch {
289
- return '';
290
- }
291
- }
292
203
  async trashMessage(account, messageId) {
293
204
  const gmail = await this.getAuthenticatedClient(account);
294
205
  await gmail.users.messages.trash({ userId: 'me', id: messageId });
@@ -123,7 +123,8 @@ REQUIRED JSON STRUCTURE:
123
123
  await eventLogger.info('Thinking', `Analyzing email: ${context.subject}`, {
124
124
  model: this.model,
125
125
  system_prompt: systemPrompt,
126
- content_preview: cleanedContent
126
+ content_preview: cleanedContent,
127
+ content_length: cleanedContent.length
127
128
  }, emailId);
128
129
  }
129
130
  catch (err) {
@@ -103,9 +103,7 @@ export class MicrosoftService {
103
103
  async fetchMessages(account, options = {}) {
104
104
  const accessToken = account.access_token || '';
105
105
  const { top = 20, skip = 0, filter } = options;
106
- // IMPORTANT: Use ascending order to fetch OLDEST emails first
107
- // This ensures checkpoint-based pagination works correctly and doesn't skip emails
108
- let url = `https://graph.microsoft.com/v1.0/me/messages?$top=${top}&$skip=${skip}&$orderby=receivedDateTime asc&$select=id,conversationId,subject,from,toRecipients,receivedDateTime,body,bodyPreview,importance`;
106
+ let url = `https://graph.microsoft.com/v1.0/me/messages?$top=${top}&$skip=${skip}&$orderby=receivedDateTime asc&$select=id,conversationId`;
109
107
  if (filter) {
110
108
  url += `&$filter=${encodeURIComponent(filter)}`;
111
109
  }
@@ -121,19 +119,29 @@ export class MicrosoftService {
121
119
  throw new Error('Failed to fetch messages from Outlook');
122
120
  }
123
121
  const data = await response.json();
124
- const messages = (data.value || []).map((msg) => ({
125
- id: msg.id,
126
- conversationId: msg.conversationId,
127
- subject: msg.subject || 'No Subject',
128
- sender: msg.from?.emailAddress?.address || 'Unknown',
129
- recipient: msg.toRecipients?.[0]?.emailAddress?.address || '',
130
- date: msg.receivedDateTime,
131
- body: msg.body?.content || '',
132
- snippet: msg.bodyPreview || '',
133
- headers: {
134
- importance: msg.importance,
122
+ const messageRefs = data.value || [];
123
+ const messages = [];
124
+ // For each message, fetch the raw MIME content
125
+ for (const ref of messageRefs) {
126
+ try {
127
+ const rawResponse = await fetch(`https://graph.microsoft.com/v1.0/me/messages/${ref.id}/$value`, {
128
+ headers: {
129
+ Authorization: `Bearer ${accessToken}`,
130
+ },
131
+ });
132
+ if (rawResponse.ok) {
133
+ const rawMime = await rawResponse.text();
134
+ messages.push({
135
+ id: ref.id,
136
+ conversationId: ref.conversationId,
137
+ raw: rawMime
138
+ });
139
+ }
140
+ }
141
+ catch (error) {
142
+ logger.warn('Failed to fetch raw content for Outlook message', { messageId: ref.id, error });
135
143
  }
136
- }));
144
+ }
137
145
  return {
138
146
  messages,
139
147
  hasMore: !!data['@odata.nextLink'],
@@ -195,13 +203,15 @@ export class MicrosoftService {
195
203
  }
196
204
  async createDraft(account, originalMessageId, replyContent) {
197
205
  const accessToken = account.access_token || '';
198
- // Get original message
199
- const originalResponse = await fetch(`https://graph.microsoft.com/v1.0/me/messages/${originalMessageId}`, {
206
+ // Get original message (minimal metadata)
207
+ const originalResponse = await fetch(`https://graph.microsoft.com/v1.0/me/messages/${originalMessageId}?$select=id,conversationId`, {
200
208
  headers: {
201
209
  Authorization: `Bearer ${accessToken}`,
202
210
  },
203
211
  });
204
- const original = await originalResponse.json();
212
+ if (!originalResponse.ok) {
213
+ throw new Error('Failed to fetch original message metadata');
214
+ }
205
215
  // Create reply draft
206
216
  const response = await fetch(`https://graph.microsoft.com/v1.0/me/messages/${originalMessageId}/createReply`, {
207
217
  method: 'POST',