jxa-mail-mcp 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of jxa-mail-mcp might be problematic. Click here for more details.

@@ -0,0 +1,485 @@
1
+ """Direct disk reading of Apple Mail .emlx files.
2
+
3
+ This module reads emails directly from ~/Library/Mail/V10/ for fast indexing.
4
+ Requires Full Disk Access permission for the terminal.
5
+
6
+ Mail.app storage structure:
7
+ ~/Library/Mail/V10/
8
+ ├── [Account-UUID]/
9
+ │ └── [Mailbox].mbox/
10
+ │ └── Data/x/y/Messages/
11
+ │ ├── 12345.emlx
12
+ │ └── 12346.emlx
13
+ └── MailData/
14
+ └── Envelope Index # SQLite with metadata
15
+
16
+ .emlx file format:
17
+ 1255 ← Byte count of MIME content
18
+ From: sender@example.com ← RFC 5322 headers + body
19
+ Subject: Hello
20
+ ...
21
+ <?xml version="1.0"?> ← Plist metadata footer
22
+ <plist>...</plist>
23
+ """
24
+
25
+ from __future__ import annotations
26
+
27
+ import email
28
+ import re
29
+ import sqlite3
30
+ from dataclasses import dataclass
31
+ from email.header import decode_header, make_header
32
+ from pathlib import Path
33
+ from typing import TYPE_CHECKING
34
+
35
+ if TYPE_CHECKING:
36
+ from collections.abc import Iterator
37
+
38
# Name of the versioned folder under ~/Library/Mail that holds the mail store.
# NOTE(review): the previous comment said "V10 for macOS Catalina+"; the folder
# number differs between macOS releases - confirm which releases use V10.
MAIL_VERSION = "V10"

# Largest .emlx file that will be read into memory (25 MB). Bigger files are
# skipped to prevent OOM from malformed/huge files.
MAX_EMLX_SIZE = 25 * 1024**2
43
+
44
+
45
@dataclass
class EmlxEmail:
    """One message parsed from an Apple Mail ``.emlx`` file."""

    # Numeric message id; parse_emlx() takes it from the .emlx file name.
    id: int
    # Decoded "Subject" header ("" when the header is absent).
    subject: str
    # Decoded "From" header ("" when the header is absent).
    sender: str
    # Body text extracted from the MIME message.
    content: str
    # Raw "Date" header value as found in the message ("" when absent).
    date_received: str
    # Location of the .emlx file this record was parsed from.
    emlx_path: Path
55
+
56
+
57
def find_mail_directory() -> Path:
    """
    Find the Apple Mail data directory.

    Returns:
        Path to ~/Library/Mail/<MAIL_VERSION>/

    Raises:
        FileNotFoundError: If directory doesn't exist
        PermissionError: If the directory cannot be accessed (typically
            because Full Disk Access is not granted)
    """
    mail_dir = Path.home() / "Library" / "Mail" / MAIL_VERSION

    try:
        # Both probes live inside the try block: exists() has to stat the
        # path and can itself raise PermissionError, which would otherwise
        # escape without the Full Disk Access guidance below.
        if not mail_dir.exists():
            raise FileNotFoundError(
                f"Mail directory not found: {mail_dir}\n"
                "Ensure Apple Mail has been used on this Mac."
            )

        # Listing the directory is the actual access test; only the first
        # entry is requested so nothing is materialised.
        next(mail_dir.iterdir(), None)
    except PermissionError as e:
        raise PermissionError(
            f"Cannot access {mail_dir}\n"
            "Grant Full Disk Access to Terminal:\n"
            "  System Settings → Privacy & Security → Full Disk Access"
        ) from e

    return mail_dir
87
+
88
+
89
def find_envelope_index(mail_dir: Path) -> Path:
    """
    Find the Envelope Index SQLite database.

    The documented layout places ``MailData/`` inside the versioned mail
    directory, so that location is checked first. The sibling location
    (``mail_dir.parent / "MailData"``), which earlier code used exclusively,
    is kept as a fallback so existing setups keep working.

    Args:
        mail_dir: Path to ~/Library/Mail/V10/

    Returns:
        Path to the Envelope Index database

    Raises:
        FileNotFoundError: If database not found in either location
    """
    candidates = (
        mail_dir / "MailData" / "Envelope Index",
        mail_dir.parent / "MailData" / "Envelope Index",
    )

    for candidate in candidates:
        if candidate.exists():
            return candidate

    # Report the primary (documented) location in the error message.
    raise FileNotFoundError(
        f"Envelope Index not found: {candidates[0]}\n"
        "Ensure Apple Mail has synced email."
    )
112
+
113
+
114
def read_envelope_index(mail_dir: Path) -> dict[int, dict]:
    """
    Read the Envelope Index database to get message metadata.

    Args:
        mail_dir: Path to ~/Library/Mail/V10/

    Returns:
        Dict mapping message ID (``messages.ROWID``) to a metadata dict with:
        - account: Account part of the mailbox URL ("Unknown" if missing)
        - mailbox: Mailbox part of the mailbox URL ("Unknown" if missing)
        - subject: Value of ``messages.subject`` ("" if NULL)
        - sender: Value of ``messages.sender`` ("" if NULL)
        - date_received: ISO date string ("" if it cannot be converted)

        An empty dict is returned when the expected tables do not exist.

    Raises:
        FileNotFoundError: If the Envelope Index cannot be located
        sqlite3.OperationalError: For database errors other than a
            missing table
    """
    envelope_path = find_envelope_index(mail_dir)

    # Build the URI from a percent-encoded file URI so that characters such
    # as "?", "#" or "%" in the path cannot be mistaken for URI syntax.
    # mode=ro opens the database read-only to avoid locking issues.
    db_uri = f"{envelope_path.absolute().as_uri()}?mode=ro"
    conn = sqlite3.connect(db_uri, uri=True)
    conn.row_factory = sqlite3.Row

    result: dict[int, dict] = {}

    try:
        # NOTE(review): schema varies by macOS version; on some schemas
        # messages.subject / messages.sender may hold row references rather
        # than text - verify against a real Envelope Index.
        cursor = conn.execute("""
            SELECT
                m.ROWID as id,
                m.subject,
                m.sender,
                m.date_received,
                m.mailbox as mailbox_id,
                mb.url as mailbox_url
            FROM messages m
            LEFT JOIN mailboxes mb ON m.mailbox = mb.ROWID
            ORDER BY m.date_received DESC
        """)

        for row in cursor:
            # LEFT JOIN may yield NULL for the URL when no mailbox matches.
            account, mailbox = _parse_mailbox_url(row["mailbox_url"] or "")

            result[row["id"]] = {
                "account": account,
                "mailbox": mailbox,
                "subject": row["subject"] or "",
                "sender": row["sender"] or "",
                "date_received": _format_timestamp(row["date_received"]),
            }

    except sqlite3.OperationalError as e:
        # A missing table means an unexpected schema: return what we have
        # (normally nothing) so callers can fall back to scanning .emlx
        # files. Any other operational error is a real failure.
        if "no such table" not in str(e).lower():
            raise
    finally:
        conn.close()

    return result
186
+
187
+
188
+ def _parse_mailbox_url(url: str) -> tuple[str, str]:
189
+ """
190
+ Parse a mailbox URL to extract account and mailbox names.
191
+
192
+ Args:
193
+ url: mailbox://account-uuid/mailbox-name
194
+
195
+ Returns:
196
+ (account_name, mailbox_name) tuple
197
+ """
198
+ if not url:
199
+ return ("Unknown", "Unknown")
200
+
201
+ # Remove mailbox:// prefix
202
+ path = url.replace("mailbox://", "")
203
+
204
+ # Split by /
205
+ parts = path.split("/", 1)
206
+
207
+ if len(parts) >= 2:
208
+ account = parts[0] or "Unknown"
209
+ mailbox = parts[1] or "Unknown"
210
+ return (account, mailbox)
211
+
212
+ return (parts[0] if parts else "Unknown", "Unknown")
213
+
214
+
215
+ def _format_timestamp(timestamp: float | int | None) -> str:
216
+ """Convert Core Data timestamp to ISO string."""
217
+ if timestamp is None:
218
+ return ""
219
+
220
+ # Core Data timestamps are seconds since Jan 1, 2001
221
+ # Convert to Unix timestamp (seconds since Jan 1, 1970)
222
+ import datetime
223
+
224
+ CORE_DATA_EPOCH = 978307200 # Jan 1, 2001 in Unix time
225
+
226
+ try:
227
+ unix_ts = timestamp + CORE_DATA_EPOCH
228
+ dt = datetime.datetime.fromtimestamp(unix_ts, tz=datetime.UTC)
229
+ return dt.isoformat()
230
+ except (OSError, ValueError, OverflowError):
231
+ return ""
232
+
233
+
234
def _decode_header_value(raw_value) -> str:
    """Decode an RFC 2047 header value to text, falling back to the raw value."""
    if not raw_value:
        return ""
    try:
        return str(make_header(decode_header(raw_value)))
    except (UnicodeDecodeError, LookupError):
        # Undecodable or unknown charset: keep the raw header, but always
        # as a plain str so the dataclass field types hold.
        return str(raw_value)


def parse_emlx(path: Path) -> EmlxEmail | None:
    """
    Parse a single .emlx file.

    .emlx format:
    1. First line: byte count of MIME content
    2. MIME message (RFC 5322)
    3. XML plist footer with Apple metadata

    Args:
        path: Path to .emlx file (the file name stem must be the numeric
            message id)

    Returns:
        EmlxEmail with parsed content, or None if parsing fails, the file
        is larger than MAX_EMLX_SIZE, or the file name is not numeric
    """
    try:
        # The message id comes from the filename; validate it first so that
        # files with non-numeric names are rejected without reading them.
        msg_id = int(path.stem)

        # Check file size to prevent OOM from huge/malformed files
        if path.stat().st_size > MAX_EMLX_SIZE:
            return None

        raw = path.read_bytes()

        # The first line holds the byte count of the MIME content.
        newline_idx = raw.find(b"\n")
        if newline_idx == -1:
            return None

        byte_count = int(raw[:newline_idx].strip())
        if byte_count < 0:
            # A negative count would slice from the end of the file and
            # silently produce a bogus, empty message.
            return None

        # Everything after the MIME content (the plist footer) is ignored.
        mime_start = newline_idx + 1
        msg = email.message_from_bytes(raw[mime_start:mime_start + byte_count])

        return EmlxEmail(
            id=msg_id,
            subject=_decode_header_value(msg["Subject"]),
            sender=_decode_header_value(msg["From"]),
            content=_extract_body_text(msg),
            date_received=str(msg["Date"] or ""),
            emlx_path=path,
        )

    except Exception:
        # Deliberately best-effort: any unreadable or malformed file
        # (including a non-numeric byte count or file name) is skipped.
        return None
313
+
314
+
315
+ def _extract_body_text(msg: email.message.Message) -> str:
316
+ """
317
+ Extract plain text body from email message.
318
+
319
+ Handles multipart messages, preferring text/plain over text/html.
320
+ """
321
+ if msg.is_multipart():
322
+ text_parts = []
323
+ for part in msg.walk():
324
+ content_type = part.get_content_type()
325
+ if content_type == "text/plain":
326
+ payload = part.get_payload(decode=True)
327
+ if payload:
328
+ charset = part.get_content_charset() or "utf-8"
329
+ try:
330
+ decoded = payload.decode(charset, errors="replace")
331
+ text_parts.append(decoded)
332
+ except (UnicodeDecodeError, LookupError):
333
+ decoded = payload.decode("utf-8", errors="replace")
334
+ text_parts.append(decoded)
335
+ if text_parts:
336
+ return "\n".join(text_parts)
337
+
338
+ # Fallback to HTML if no plain text
339
+ for part in msg.walk():
340
+ content_type = part.get_content_type()
341
+ if content_type == "text/html":
342
+ payload = part.get_payload(decode=True)
343
+ if payload:
344
+ charset = part.get_content_charset() or "utf-8"
345
+ try:
346
+ html = payload.decode(charset, errors="replace")
347
+ return _strip_html(html)
348
+ except (UnicodeDecodeError, LookupError):
349
+ pass
350
+ return ""
351
+ else:
352
+ payload = msg.get_payload(decode=True)
353
+ if payload:
354
+ charset = msg.get_content_charset() or "utf-8"
355
+ try:
356
+ text = payload.decode(charset, errors="replace")
357
+ if msg.get_content_type() == "text/html":
358
+ return _strip_html(text)
359
+ return text
360
+ except (UnicodeDecodeError, LookupError):
361
+ return payload.decode("utf-8", errors="replace")
362
+ return ""
363
+
364
+
365
+ def _strip_html(html: str) -> str:
366
+ """
367
+ Robust HTML to text conversion using BeautifulSoup.
368
+
369
+ Uses a proper HTML parser instead of regex to prevent XSS bypass
370
+ attacks from malformed HTML like <<script> or nested tags.
371
+ """
372
+ try:
373
+ from bs4 import BeautifulSoup
374
+
375
+ soup = BeautifulSoup(html, "html.parser")
376
+
377
+ # Remove script and style elements completely
378
+ for element in soup(["script", "style"]):
379
+ element.decompose()
380
+
381
+ # Get text with newlines as separators
382
+ text = soup.get_text(separator="\n", strip=True)
383
+
384
+ # Collapse multiple newlines
385
+ text = re.sub(r"\n\s*\n", "\n\n", text)
386
+ text = re.sub(r" +", " ", text)
387
+
388
+ return text.strip()
389
+
390
+ except Exception:
391
+ # Fallback: return empty string if parsing fails entirely
392
+ # This is safer than returning potentially malicious content
393
+ return ""
394
+
395
+
396
def scan_emlx_files(mail_dir: Path) -> Iterator[Path]:
    """
    Lazily walk the Mail directory tree and yield every .emlx file.

    Files whose name contains ".partial.emlx" (partial downloads) are
    left out.

    Args:
        mail_dir: Path to ~/Library/Mail/V10/

    Yields:
        Paths to .emlx files
    """
    partial_marker = ".partial.emlx"
    # Expected location: account-uuid/mailbox.mbox/Data/x/y/Messages/
    yield from (
        candidate
        for candidate in mail_dir.rglob("*.emlx")
        if partial_marker not in candidate.name
    )
412
+
413
+
414
def scan_all_emails(mail_dir: Path) -> Iterator[dict]:
    """
    Yield one dict per readable email found under the Mail directory.

    Envelope Index metadata (when it can be read) is merged with the
    content parsed from each .emlx file. Files that fail to parse are
    skipped.

    Args:
        mail_dir: Path to ~/Library/Mail/V10/

    Yields:
        Email dicts with: id, account, mailbox, subject, sender,
        content, date_received
    """
    # Metadata is optional: a missing or unreadable index just means every
    # message falls back to path-derived account/mailbox names.
    try:
        envelope_rows = read_envelope_index(mail_dir)
    except (FileNotFoundError, sqlite3.Error):
        envelope_rows = {}

    for emlx_path in scan_emlx_files(mail_dir):
        message = parse_emlx(emlx_path)
        if not message:
            continue

        info = envelope_rows.get(message.id, {})
        if not info:
            # No index entry: derive account/mailbox from the file location.
            inferred_account, inferred_mailbox = _infer_account_mailbox(
                emlx_path, mail_dir
            )
            info = {"account": inferred_account, "mailbox": inferred_mailbox}

        # Header values parsed from the file win for subject/sender; the
        # index value wins for the received date.
        record = {
            "id": message.id,
            "account": info.get("account", "Unknown"),
            "mailbox": info.get("mailbox", "Unknown"),
            "subject": message.subject or info.get("subject", ""),
            "sender": message.sender or info.get("sender", ""),
            "content": message.content,
            "date_received": info.get("date_received") or message.date_received,
        }
        yield record
459
+
460
+
461
+ def _infer_account_mailbox(emlx_path: Path, mail_dir: Path) -> tuple[str, str]:
462
+ """
463
+ Infer account and mailbox from .emlx file path.
464
+
465
+ Path structure: V10/account-uuid/mailbox.mbox/Data/.../Messages/id.emlx
466
+ """
467
+ try:
468
+ relative = emlx_path.relative_to(mail_dir)
469
+ parts = relative.parts
470
+
471
+ # First part is account UUID
472
+ account = parts[0] if parts else "Unknown"
473
+
474
+ # Second part is mailbox.mbox
475
+ mailbox = "Unknown"
476
+ if len(parts) > 1:
477
+ mbox_part = parts[1]
478
+ if mbox_part.endswith(".mbox"):
479
+ mailbox = mbox_part[:-5] # Remove .mbox suffix
480
+ else:
481
+ mailbox = mbox_part
482
+
483
+ return (account, mailbox)
484
+ except ValueError:
485
+ return ("Unknown", "Unknown")