@forwardimpact/basecamp 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +229 -0
  3. package/build.js +124 -0
  4. package/config/scheduler.json +28 -0
  5. package/package.json +37 -0
  6. package/scheduler.js +552 -0
  7. package/scripts/build-pkg.sh +117 -0
  8. package/scripts/compile.sh +26 -0
  9. package/scripts/install.sh +108 -0
  10. package/scripts/pkg-resources/conclusion.html +62 -0
  11. package/scripts/pkg-resources/welcome.html +64 -0
  12. package/scripts/postinstall +46 -0
  13. package/scripts/uninstall.sh +56 -0
  14. package/template/.claude/settings.json +40 -0
  15. package/template/.claude/skills/create-presentations/SKILL.md +75 -0
  16. package/template/.claude/skills/create-presentations/references/slide.css +35 -0
  17. package/template/.claude/skills/create-presentations/scripts/convert-to-pdf.js +32 -0
  18. package/template/.claude/skills/doc-collab/SKILL.md +112 -0
  19. package/template/.claude/skills/draft-emails/SKILL.md +191 -0
  20. package/template/.claude/skills/draft-emails/scripts/scan-emails.sh +33 -0
  21. package/template/.claude/skills/extract-entities/SKILL.md +466 -0
  22. package/template/.claude/skills/extract-entities/references/TEMPLATES.md +131 -0
  23. package/template/.claude/skills/extract-entities/scripts/state.py +100 -0
  24. package/template/.claude/skills/meeting-prep/SKILL.md +135 -0
  25. package/template/.claude/skills/organize-files/SKILL.md +146 -0
  26. package/template/.claude/skills/organize-files/scripts/organize-by-type.sh +42 -0
  27. package/template/.claude/skills/organize-files/scripts/summarize.sh +21 -0
  28. package/template/.claude/skills/sync-apple-calendar/SKILL.md +101 -0
  29. package/template/.claude/skills/sync-apple-calendar/references/SCHEMA.md +80 -0
  30. package/template/.claude/skills/sync-apple-calendar/scripts/sync.py +233 -0
  31. package/template/.claude/skills/sync-apple-mail/SKILL.md +131 -0
  32. package/template/.claude/skills/sync-apple-mail/references/SCHEMA.md +88 -0
  33. package/template/.claude/skills/sync-apple-mail/scripts/parse-emlx.py +104 -0
  34. package/template/.claude/skills/sync-apple-mail/scripts/sync.py +348 -0
  35. package/template/CLAUDE.md +152 -0
  36. package/template/USER.md +5 -0
@@ -0,0 +1,80 @@
1
+ # Apple Calendar Database Schema
2
+
3
+ The Apple Calendar SQLite database uses Core Data. The key tables and their
4
+ actual column names (verified on macOS Sonoma+) are listed below.
5
+
6
+ ## CalendarItem (events and reminders)
7
+
8
+ | Column | Type | Notes |
9
+ | ---------------- | ------- | ---------------------------------------------------------- |
10
+ | `ROWID` | INTEGER | Primary key |
11
+ | `summary` | TEXT | Event title |
12
+ | `start_date` | REAL | Core Data timestamp (seconds since 2001-01-01 UTC) |
13
+ | `end_date` | REAL | Core Data timestamp (null for all-day events) |
14
+ | `start_tz` | TEXT | IANA timezone (e.g., `Europe/Paris`), `_float` for all-day |
15
+ | `end_tz` | TEXT | IANA timezone, `_float` for all-day |
16
+ | `all_day` | INTEGER | 1 = all-day event |
17
+ | `location_id` | INTEGER | FK → Location.ROWID |
18
+ | `description` | TEXT | Event notes/body |
19
+ | `organizer_id` | INTEGER | FK → Identity.ROWID |
20
+ | `calendar_id` | INTEGER | FK → Calendar.ROWID |
21
+ | `has_attendees` | INTEGER | 1 = event has attendees |
22
+ | `conference_url` | TEXT | Video call URL (often null — check description too) |
23
+ | `entity_type` | INTEGER | 2 = calendar event |
24
+
25
+ ## Identity (organizer info)
26
+
27
+ | Column | Type | Notes |
28
+ | -------------- | ---- | ------------------------------------------------------------- |
29
+ | `display_name` | TEXT | Full name (e.g., `"Chen, Sarah"`) |
30
+ | `address` | TEXT | Email with `mailto:` prefix (e.g., `"mailto:sarah@acme.com"`) |
31
+ | `first_name` | TEXT | Usually null — `display_name` is the reliable field |
32
+ | `last_name` | TEXT | Usually null — `display_name` is the reliable field |
33
+
34
+ **IMPORTANT:** Identity does NOT have an `email` column. Use `address` and strip
35
+ the `mailto:` prefix. Use `display_name` for the name (not
36
+ `first_name`/`last_name`, which are typically null).
37
+
38
+ ## Participant (attendees and organizer)
39
+
40
+ | Column | Type | Notes |
41
+ | ------------- | ------- | -------------------------------------------------- |
42
+ | `ROWID` | INTEGER | Primary key |
43
+ | `entity_type` | INTEGER | 7 = attendee, 8 = organizer |
44
+ | `owner_id` | INTEGER | FK → CalendarItem.ROWID |
45
+ | `identity_id` | INTEGER | FK → Identity.ROWID (for display_name lookup) |
46
+ | `email` | TEXT | Email address (no `mailto:` prefix) |
47
+ | `status` | INTEGER | EKParticipantStatus (see mapping below) |
48
+ | `role` | INTEGER | 0 = unknown, 1 = required, 2 = optional, 3 = chair |
49
+ | `is_self` | INTEGER | 1 = this is the calendar owner |
50
+
51
+ **IMPORTANT:** Participant does NOT have `display_name`, `first_name`, or
52
+ `last_name` columns. To get the attendee's name, JOIN with Identity via
53
+ `identity_id`. There is NO `Attendee` table — only use `Participant`.
54
+
55
+ ### EKParticipantStatus mapping
56
+
57
+ | Value | Status |
58
+ | ----- | ---------- |
59
+ | 0 | unknown |
60
+ | 1 | pending |
61
+ | 2 | accepted |
62
+ | 3 | declined |
63
+ | 4 | tentative |
64
+ | 5 | delegated |
65
+ | 6 | completed |
66
+ | 7 | in-process |
67
+
68
+ ## Calendar (calendar metadata)
69
+
70
+ | Column | Type | Notes |
71
+ | ------- | ------- | ------------- |
72
+ | `ROWID` | INTEGER | Primary key |
73
+ | `title` | TEXT | Calendar name |
74
+
75
+ ## Location
76
+
77
+ | Column | Type | Notes |
78
+ | ------- | ------- | --------------- |
79
+ | `ROWID` | INTEGER | Primary key |
80
+ | `title` | TEXT | Location string |
@@ -0,0 +1,233 @@
1
+ #!/usr/bin/env python3
2
+ """Sync Apple Calendar events to ~/.cache/fit/basecamp/apple_calendar/ as JSON.
3
+
4
+ Queries the macOS Calendar SQLite database for events in a sliding window of
5
+ 14 days before to 14 days after now and writes one JSON file per event.
6
+
7
+ Usage: python3 scripts/sync.py
8
+
9
+ Requires: macOS with Calendar app configured and Full Disk Access granted.
10
+ """
11
+
12
+ import json
13
+ import os
14
+ import subprocess
15
+ from datetime import datetime, timezone, timedelta
16
+
17
# Reference instant for Core Data timestamps (seconds since 2001-01-01 UTC).
EPOCH = datetime(year=2001, month=1, day=1, tzinfo=timezone.utc)

# Directory that receives one JSON file per synced event.
OUTDIR = os.path.expanduser("~/.cache/fit/basecamp/apple_calendar")

# Candidate database locations, checked in order.
DB_PATHS = list(
    map(
        os.path.expanduser,
        (
            "~/Library/Group Containers/group.com.apple.calendar/Calendar.sqlitedb",
            "~/Library/Calendars/Calendar.sqlitedb",
        ),
    )
)

# Participant.status integer -> label; the list position is the integer code.
STATUS_MAP = dict(
    enumerate(
        (
            "unknown",
            "pending",
            "accepted",
            "declined",
            "tentative",
            "delegated",
            "completed",
            "in-process",
        )
    )
)

# Participant.role integer -> label; the list position is the integer code.
ROLE_MAP = dict(enumerate(("unknown", "required", "optional", "chair")))
39
+
40
+
41
def find_db():
    """Return the path of the first existing Apple Calendar database.

    The candidates in ``DB_PATHS`` are checked in order.

    Raises:
        SystemExit: when none of the candidates exists. The message is
            written to stderr and the process exit status is 1.
    """
    for candidate in DB_PATHS:
        if os.path.exists(candidate):
            return candidate
    # Raise SystemExit directly instead of calling exit(): that helper is
    # injected by the ``site`` module and is missing under ``python -S`` or in
    # frozen builds. A string argument is printed to stderr with status 1.
    raise SystemExit(
        "Error: Apple Calendar database not found. Is Calendar configured?"
    )
47
+
48
+
49
def query(db, sql):
    """Run ``sql`` against ``db`` with the ``sqlite3`` CLI and return the rows.

    The database is opened read-only and the CLI is asked for JSON output.
    If the first attempt fails with "database is locked", the command is
    retried once after a 2 second pause.

    Args:
        db: Path of the SQLite database file.
        sql: SQL text passed to the CLI as a single argument.

    Returns:
        The parsed JSON output (a list of row dicts), or an empty list when
        there is no output, the command fails, the ``sqlite3`` binary cannot
        be started, or the output is not valid JSON. Failures are reported
        on stdout.
    """
    import time  # kept local: the file does not import time at module level

    command = ["sqlite3", "-readonly", "-json", db, sql]
    for attempt in range(2):
        try:
            result = subprocess.run(command, capture_output=True, text=True)
        except OSError as err:
            # e.g. the sqlite3 binary is not installed or not executable
            print(f"SQLite error: {err}")
            return []
        if result.returncode == 0:
            break
        if attempt == 0 and "database is locked" in result.stderr:
            time.sleep(2)
            continue
        print(f"SQLite error: {result.stderr.strip()}")
        return []

    output = result.stdout.strip()
    if not output:
        return []
    try:
        return json.loads(output)
    except json.JSONDecodeError as err:
        print(f"SQLite error: could not parse sqlite3 JSON output: {err}")
        return []
67
+
68
+
69
def coredata_to_iso(ts, tz_name=None):
    """Convert a Core Data timestamp to an ISO 8601 string.

    Args:
        ts: Seconds relative to ``EPOCH``, or None.
        tz_name: Optional timezone name. When given and not ``"_float"``, the
            result is expressed in that zone; otherwise it stays in UTC.

    Returns:
        The ISO 8601 string, or None when ``ts`` is None. If the timezone
        cannot be resolved, the UTC representation is returned.
    """
    if ts is None:
        return None
    moment = EPOCH + timedelta(seconds=ts)
    if tz_name and tz_name != "_float":
        try:
            from zoneinfo import ZoneInfo

            moment = moment.astimezone(ZoneInfo(tz_name))
        except (ImportError, KeyError, ValueError, OSError):
            # Best effort: zoneinfo unavailable (ImportError), unknown zone
            # (ZoneInfoNotFoundError is a KeyError), malformed key
            # (ValueError) or unreadable tz data (OSError). Keep UTC.
            pass
    return moment.isoformat()
82
+
83
+
84
def _strip_mailto(address):
    """Return ``address`` without a leading ``mailto:``; None when it is empty."""
    if not address:
        return None
    if address.startswith("mailto:"):
        return address[len("mailto:"):]
    return address


def _zone_or_none(tz_name):
    """Return ``tz_name`` unless it is the ``_float`` marker, which maps to None."""
    return None if tz_name == "_float" else tz_name


def _format_attendees(rows):
    """Turn Participant rows into attendee dicts, skipping rows without an email."""
    return [
        {
            "email": row["email"],
            "name": (row.get("display_name") or "").strip() or None,
            "status": STATUS_MAP.get(row.get("status"), "unknown"),
            "role": ROLE_MAP.get(row.get("role"), "unknown"),
            "self": bool(row.get("is_self")),
        }
        for row in rows
        if row.get("email")
    ]


def _event_to_json(ev, attendee_rows):
    """Build the JSON-serializable dict written for one event row."""
    org_email = _strip_mailto(ev.get("organizer_email"))
    organizer = None
    if org_email:
        organizer = {
            "email": org_email,
            "name": (ev.get("organizer_name") or "").strip() or None,
        }
    attendees = _format_attendees(attendee_rows)
    return {
        "id": f"apple_cal_{ev['id']}",
        "summary": ev["summary"],
        "start": {
            "dateTime": coredata_to_iso(ev["start_date"], ev.get("start_tz")),
            "timeZone": _zone_or_none(ev.get("start_tz")),
        },
        "end": {
            # Events without an end date reuse the start date.
            "dateTime": coredata_to_iso(
                ev["end_date"] or ev["start_date"], ev.get("end_tz")
            ),
            "timeZone": _zone_or_none(ev.get("end_tz")),
        },
        "allDay": bool(ev.get("all_day")),
        "location": ev.get("location") or None,
        "description": ev.get("description") or None,
        "conferenceUrl": ev.get("conference_url") or None,
        "calendar": ev.get("calendar_name") or None,
        "organizer": organizer,
        "attendees": attendees or None,
    }


def main():
    """Sync events from 14 days before to 14 days after now into ``OUTDIR``.

    Fetches the events and their attendees with two queries, writes one
    ``<ROWID>.json`` file per event, deletes every other ``.json`` file in
    ``OUTDIR`` and prints a summary.
    """
    db = find_db()
    os.makedirs(OUTDIR, exist_ok=True)

    now = datetime.now(timezone.utc)
    start = now - timedelta(days=14)
    end = now + timedelta(days=14)
    start_ts = (start - EPOCH).total_seconds()
    end_ts = (end - EPOCH).total_seconds()

    # Fetch every event overlapping the window with a single query.
    events = query(
        db,
        f"""
        SELECT
            ci.ROWID AS id,
            ci.summary,
            ci.start_date,
            ci.end_date,
            ci.start_tz,
            ci.end_tz,
            ci.all_day,
            ci.description,
            ci.has_attendees,
            ci.conference_url,
            loc.title AS location,
            cal.title AS calendar_name,
            org.address AS organizer_email,
            org.display_name AS organizer_name
        FROM CalendarItem ci
        LEFT JOIN Location loc ON loc.ROWID = ci.location_id
        LEFT JOIN Calendar cal ON cal.ROWID = ci.calendar_id
        LEFT JOIN Identity org ON org.ROWID = ci.organizer_id
        WHERE ci.start_date <= {end_ts}
            AND COALESCE(ci.end_date, ci.start_date) >= {start_ts}
            AND ci.summary IS NOT NULL
            AND ci.summary != ''
        ORDER BY ci.start_date ASC
        LIMIT 1000;
        """,
    )

    # Batch-fetch all attendees in one query (avoids N+1), grouped by event.
    attendees_by_event = {}
    if events:
        id_list = ",".join(str(ev["id"]) for ev in events)
        attendee_rows = query(
            db,
            f"""
            SELECT
                p.owner_id,
                p.email,
                p.status,
                p.role,
                p.is_self,
                p.entity_type,
                i.display_name
            FROM Participant p
            LEFT JOIN Identity i ON i.ROWID = p.identity_id
            WHERE p.owner_id IN ({id_list})
                AND p.entity_type = 7;
            """,
        )
        for row in attendee_rows:
            attendees_by_event.setdefault(row["owner_id"], []).append(row)

    # Write one JSON file per event.
    written = set()
    for ev in events:
        eid = ev["id"]
        fname = f"{eid}.json"
        with open(os.path.join(OUTDIR, fname), "w", encoding="utf-8") as f:
            json.dump(
                _event_to_json(ev, attendees_by_event.get(eid, [])), f, indent=2
            )
        written.add(fname)

    # Remove files for events that are no longer in the window.
    # NOTE(review): query() also returns [] on SQLite errors, so a failed
    # events query makes every cached file look stale and removes it here.
    removed = 0
    for fname in os.listdir(OUTDIR):
        if fname.endswith(".json") and fname not in written:
            os.remove(os.path.join(OUTDIR, fname))
            removed += 1

    print("Apple Calendar Sync Complete")
    print(f"Events synced: {len(written)}")
    print(f"Time window: {start.date()} to {end.date()}")
    print(f"Files cleaned up: {removed} (outside window)")
    print(f"Output: {OUTDIR}")


if __name__ == "__main__":
    main()
@@ -0,0 +1,131 @@
1
+ ---
2
+ name: sync-apple-mail
3
+ description: Sync email threads from the macOS Mail app's local SQLite database into ~/.cache/fit/basecamp/apple_mail/ as markdown files. Use on a schedule or when the user asks to sync their email. Requires macOS with Mail app configured and Full Disk Access granted.
4
+ compatibility: Requires macOS with Apple Mail configured and Full Disk Access granted to the terminal
5
+ ---
6
+
7
+ # Sync Apple Mail
8
+
9
+ Sync email threads from the macOS Mail app's local SQLite database into
10
+ `~/.cache/fit/basecamp/apple_mail/` as markdown files. This is an automated data
11
+ pipeline skill — it ingests raw email data that other skills (like
12
+ `extract-entities`) consume downstream.
13
+
14
+ ## Trigger
15
+
16
+ Run this skill on a schedule (every 5 minutes) or when the user asks to sync
17
+ their email.
18
+
19
+ ## Prerequisites
20
+
21
+ - macOS with the built-in Mail app configured
22
+ - Full Disk Access granted to the terminal (System Settings → Privacy & Security
23
+ → Full Disk Access)
24
+
25
+ ## Inputs
26
+
27
+ - `~/.cache/fit/basecamp/state/apple_mail_last_sync` — last sync timestamp
28
+ (single-line text file)
29
+ - `~/Library/Mail/V*/MailData/Envelope Index` — Apple Mail SQLite database
30
+
31
+ ## Outputs
32
+
33
+ - `~/.cache/fit/basecamp/apple_mail/{thread_id}.md` — one markdown file per
34
+ email thread
35
+ - `~/.cache/fit/basecamp/state/apple_mail_last_sync` — updated with new sync
36
+ timestamp
37
+
38
+ ---
39
+
40
+ ## Implementation
41
+
42
+ Run the sync as a single Python script. This avoids N+1 shell invocations and
43
+ handles all data transformation in one pass:
44
+
45
+ python3 scripts/sync.py
46
+
47
+ The script:
48
+
49
+ 1. Finds the Mail database (`~/Library/Mail/V*/MailData/Envelope Index`)
50
+ 2. Loads last sync timestamp (or defaults to 30 days ago for first sync)
51
+ 3. Discovers the thread grouping column (`conversation_id` or `thread_id`)
52
+ 4. Finds threads with new messages since last sync (up to 500)
53
+ 5. For each thread: fetches messages, batch-fetches recipients, parses `.emlx`
54
+ files for full email bodies (falling back to database summaries)
55
+ 6. Writes one markdown file per thread to `~/.cache/fit/basecamp/apple_mail/`
56
+ 7. Updates sync state timestamp
57
+ 8. Reports summary (threads processed, files written)
58
+
59
+ The script calls `scripts/parse-emlx.py` to extract plain text bodies from
60
+ `.emlx` / `.partial.emlx` files (handles HTML-only emails by stripping tags).
61
+
62
+ ## Database Schema
63
+
64
+ See [references/SCHEMA.md](references/SCHEMA.md) for the complete Apple Mail
65
+ SQLite schema including table structures, column names, and important caveats
66
+ (e.g., `date_received` holds Unix timestamps, not Core Data timestamps;
67
+ `addresses.comment` holds display names; `recipients` columns are
68
+ `message`/`address`, not `message_id`/`address_id`).
69
+
70
+ ## Output Format
71
+
72
+ Each `{thread_id}.md` file:
73
+
74
+ ```markdown
75
+ # {Base Subject}
76
+
77
+ **Thread ID:** {thread_id}
78
+ **Message Count:** {count}
79
+ **Flags:** mailing-list, automated
80
+
81
+ ---
82
+
83
+ ### From: {sender_name} <{sender_email}>
84
+ **Date:** {YYYY-MM-DD HH:MM:SS UTC}
85
+ **To:** {name} <{email}>, {name2} <{email2}>
86
+ **Cc:** {name} <{email}>
87
+
88
+ {email_body_or_summary}
89
+
90
+ ---
91
+
92
+ ### From: {next_sender_name} <{next_sender_email}>
93
+ **Date:** {next_date}
94
+ **To:** ...
95
+ **Cc:** ...
96
+
97
+ {next_body}
98
+ ```
99
+
100
+ Rules:
101
+
102
+ - Use the **base subject** (from `subject` column, without `subject_prefix`) as
103
+ the `# heading`.
104
+ - **Flags line** — only include when at least one flag is set:
105
+ - `mailing-list` if any message in the thread has `list_id_hash != 0`
106
+ - `automated` if any message has `automated_conversation = 1`
107
+ - Omit the `**Flags:**` line entirely if neither flag applies.
108
+ - **Sender** — format as `{sender_name} <{sender_email}>` when display name is
109
+ present, otherwise just `{sender_email}`.
110
+ - **To/Cc** — include per-message. Format each recipient as `{name} <{email}>`
111
+ when name exists, otherwise just `{email}`. Omit the line if that field has no
112
+ recipients.
113
+
114
+ ## Error Handling
115
+
116
+ - Database not found → Mail not configured, report and stop
117
+ - Permission denied → Full Disk Access not granted, report and stop
118
+ - Database locked → wait 2 seconds, retry once
119
+ - `.emlx` / `.partial.emlx` not found → fall back to database summary field
120
+ - `.emlx` parse error → fall back to database summary field
121
+ - HTML-only email → strip tags and use as plain text body (handled by
122
+ parse-emlx.py)
123
+ - `find` timeout → skip that message's body, use summary
124
+ - Always update sync state, even on partial success
125
+
126
+ ## Constraints
127
+
128
+ - Open database read-only (`-readonly`)
129
+ - Only sync Inbox and Sent folders
130
+ - Limit to 500 threads per run
131
+ - Incremental: only threads with new messages since last sync
@@ -0,0 +1,88 @@
1
+ # Apple Mail Database Schema
2
+
3
+ The Apple Mail SQLite database (`Envelope Index`) stores email metadata. The key
4
+ tables and their actual column names (verified on macOS Sequoia / V10) are listed below.
5
+
6
+ Typical path: `~/Library/Mail/V10/MailData/Envelope Index`
7
+
8
+ ## messages (email metadata)
9
+
10
+ | Column | Type | Notes |
11
+ | ------------------------ | ------- | -------------------------------------------------------- |
12
+ | `ROWID` | INTEGER | Primary key |
13
+ | `sender` | INTEGER | FK → addresses.ROWID |
14
+ | `subject` | INTEGER | FK → subjects.ROWID |
15
+ | `subject_prefix` | TEXT | `Re:`, `Fwd:`, etc. (directly on messages, not subjects) |
16
+ | `summary` | INTEGER | FK → summaries.ROWID |
17
+ | `date_sent` | INTEGER | Unix timestamp (seconds since 1970-01-01 UTC) |
18
+ | `date_received` | INTEGER | Unix timestamp (seconds since 1970-01-01 UTC) |
19
+ | `mailbox` | INTEGER | FK → mailboxes.ROWID |
20
+ | `deleted` | INTEGER | 1 = deleted |
21
+ | `conversation_id` | INTEGER | Thread grouping ID |
22
+ | `list_id_hash` | INTEGER | Non-zero = mailing list message |
23
+ | `automated_conversation` | INTEGER | 1 = automated/machine-generated |
24
+ | `read` | INTEGER | 1 = read |
25
+ | `flagged` | INTEGER | 1 = flagged |
26
+
27
+ **IMPORTANT:** `date_received` stores **Unix timestamps** (seconds since
28
+ 1970-01-01 UTC), NOT Core Data timestamps (which use 2001-01-01 epoch). Do NOT
29
+ apply Core Data conversion.
30
+
31
+ ## addresses (sender and recipient addresses)
32
+
33
+ | Column | Type | Notes |
34
+ | --------- | ------- | ------------------------------------- |
35
+ | `ROWID` | INTEGER | Primary key |
36
+ | `address` | TEXT | Email address |
37
+ | `comment` | TEXT | Display name (e.g., `"Olsson, Dick"`) |
38
+
39
+ **IMPORTANT:** The display name is in `comment`, not a `name` or `display_name`
40
+ column.
41
+
42
+ ## subjects
43
+
44
+ | Column | Type | Notes |
45
+ | --------- | ------- | ----------------- |
46
+ | `ROWID` | INTEGER | Primary key |
47
+ | `subject` | TEXT | Base subject text |
48
+
49
+ Note: `subject_prefix` (Re:, Fwd:, etc.) is stored on the `messages` table
50
+ directly, not here.
51
+
52
+ ## recipients (To/Cc/Bcc per message)
53
+
54
+ | Column | Type | Notes |
55
+ | ---------- | ------- | --------------------------- |
56
+ | `ROWID` | INTEGER | Primary key |
57
+ | `message` | INTEGER | FK → messages.ROWID |
58
+ | `address` | INTEGER | FK → addresses.ROWID |
59
+ | `type` | INTEGER | 0 = To, 1 = Cc, 2 = Bcc |
60
+ | `position` | INTEGER | Order within the type group |
61
+
62
+ **IMPORTANT:** Column names are `message` and `address` (not `message_id` or
63
+ `address_id`).
64
+
65
+ ## summaries (Apple Intelligence email summaries)
66
+
67
+ | Column | Type | Notes |
68
+ | --------- | ------- | ------------ |
69
+ | `ROWID` | INTEGER | Primary key |
70
+ | `summary` | TEXT | Summary text |
71
+
72
+ ## mailboxes
73
+
74
+ | Column | Type | Notes |
75
+ | ------- | ------- | -------------------------------- |
76
+ | `ROWID` | INTEGER | Primary key |
77
+ | `url` | TEXT | Mailbox URL (IMAP or EWS format) |
78
+
79
+ ### Mailbox URL patterns
80
+
81
+ - Standard IMAP: `imap://user@host/INBOX`, `imap://user@host/Sent Messages`
82
+ - EWS (Exchange): `ews://UUID/Inbox`, `ews://UUID/Sent%20Items`
83
+
84
+ Use case-insensitive `LIKE` patterns to match both:
85
+
86
+ - `%/Inbox%` (catches IMAP `/INBOX` and EWS `/Inbox`)
87
+ - `%/INBOX%` (explicit uppercase match)
88
+ - `%/Sent%` (catches `Sent Messages`, `Sent Items`, `Sent%20Items`)
@@ -0,0 +1,104 @@
1
+ #!/usr/bin/env python3
2
+ """Parse a macOS Mail .emlx or .partial.emlx file and output the plain text body.
3
+
4
+ Usage: python3 scripts/parse-emlx.py <path-to-emlx-file>
5
+
6
+ The .emlx format is: first line = byte count, then RFC822 message, then Apple
7
+ plist. This script extracts and prints the plain text body.
8
+
9
+ If the email has no text/plain part (HTML-only), falls back to stripping HTML
10
+ tags and outputting as plain text.
11
+
12
+ Exit codes:
13
+ 0 — success (body printed to stdout)
14
+ 1 — file not found or parse error (message on stderr)
15
+ """
16
+
17
+ import email
18
+ import html as html_mod
19
+ import re
20
+ import sys
21
+
22
+
23
def html_to_text(html):
    """Strip HTML markup and return readable plain text. Uses only stdlib.

    Args:
        html: HTML source as a string.

    Returns:
        The text content with comments, ``<style>``/``<script>`` blocks and
        tags removed, entities decoded, line breaks inserted at ``<br>`` and
        at the end of common block elements, and whitespace collapsed.
    """
    # Remove comments first: a ">" inside a comment would otherwise end the
    # generic tag match early and leak the rest of the comment into the text.
    text = re.sub(r"<!--.*?-->", "", html, flags=re.DOTALL)
    # Remove style and script blocks together with their content
    text = re.sub(
        r"<(style|script)\b[^>]*>.*?</\1\s*>",
        "",
        text,
        flags=re.DOTALL | re.IGNORECASE,
    )
    # Line breaks: <br> and the end of common block-level elements, so that
    # adjacent blocks such as <div>a</div><div>b</div> do not run together
    text = re.sub(r"<br\s*/?\s*>", "\n", text, flags=re.IGNORECASE)
    text = re.sub(r"</(?:p|div|tr|li|h[1-6])\s*>", "\n", text, flags=re.IGNORECASE)
    # Strip remaining tags
    text = re.sub(r"<[^>]+>", "", text)
    # Decode entities after tag stripping so "&lt;b&gt;" survives as text
    text = html_mod.unescape(text)
    # Collapse whitespace; trimming spaces around newlines lets runs of
    # "blank" lines that only contain spaces collapse as well
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r" ?\n ?", "\n", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()
40
+
41
+
42
def _decode_payload(part):
    """Return the decoded text of ``part``, or None when its payload is empty."""
    payload = part.get_payload(decode=True)
    if not payload:
        return None
    charset = part.get_content_charset() or "utf-8"
    try:
        return payload.decode(charset, errors="replace")
    except LookupError:
        # The declared charset label is unknown to Python; fall back to UTF-8
        # with replacement rather than losing the whole body.
        return payload.decode("utf-8", errors="replace")


def extract_body(msg):
    """Extract the plain text body from an email message, with HTML fallback.

    Args:
        msg: A parsed ``email.message.Message``.

    Returns:
        The first non-empty ``text/plain`` part; otherwise the first non-empty
        ``text/html`` part converted with ``html_to_text``; otherwise None.
        Sub-parts marked ``Content-Disposition: attachment`` are ignored.
    """
    body = None
    html_body = None

    # walk() yields the message itself first, so a non-multipart message is
    # handled by the same loop as a multipart one.
    for part in msg.walk():
        if part is not msg and part.get_content_disposition() == "attachment":
            # An attached .txt or .html file is not the message body
            continue
        ct = part.get_content_type()
        if ct == "text/plain" and body is None:
            body = _decode_payload(part)
        elif ct == "text/html" and html_body is None:
            html_body = _decode_payload(part)

    if body:
        return body
    if html_body:
        return html_to_text(html_body)
    return None
76
+
77
+
78
def parse_emlx(path):
    """Print the From and Date headers, a ``---`` separator and the body of an .emlx file.

    The first line of the file is read as a byte count; that many bytes are
    then parsed as the message. On a missing file or any other error, a
    message is written to stderr and the process exits with status 1.
    """
    try:
        with open(path, "rb") as handle:
            declared_length = int(handle.readline())
            message = email.message_from_bytes(handle.read(declared_length))

        sender = message.get("From", "Unknown")
        sent_on = message.get("Date", "")
        print(f"From: {sender}")
        print(f"Date: {sent_on}")
        print("---")

        text = extract_body(message)
        if text:
            print(text)
    except FileNotFoundError:
        print(f"Error: File not found: {path}", file=sys.stderr)
        sys.exit(1)
    except Exception as exc:
        print(f"Error parsing {path}: {exc}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    cli_args = sys.argv[1:]
    # Exactly one positional argument is expected: the .emlx path.
    if len(cli_args) != 1:
        print("Usage: python3 scripts/parse-emlx.py <path>", file=sys.stderr)
        sys.exit(1)
    parse_emlx(cli_args[0])