@chainlesschain/personal-data-hub 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. package/README.md +241 -0
  2. package/__tests__/adapter-spec.test.js +78 -0
  3. package/__tests__/adapters/email-adapter.test.js +605 -0
  4. package/__tests__/adapters/email-imap-session.test.js +334 -0
  5. package/__tests__/adapters/email-parser.test.js +244 -0
  6. package/__tests__/adapters/email-providers.test.js +84 -0
  7. package/__tests__/analysis.test.js +302 -0
  8. package/__tests__/batch.test.js +133 -0
  9. package/__tests__/bridges-cc-kg.test.js +231 -0
  10. package/__tests__/bridges-cc-llm.test.js +191 -0
  11. package/__tests__/bridges-cc-rag.test.js +162 -0
  12. package/__tests__/ids.test.js +45 -0
  13. package/__tests__/key-providers.test.js +126 -0
  14. package/__tests__/kg-derive.test.js +219 -0
  15. package/__tests__/llm-client.test.js +122 -0
  16. package/__tests__/mock-adapter.test.js +93 -0
  17. package/__tests__/prompt-builder.test.js +204 -0
  18. package/__tests__/query-parser.test.js +150 -0
  19. package/__tests__/rag-derive.test.js +169 -0
  20. package/__tests__/registry.test.js +304 -0
  21. package/__tests__/schemas.test.js +331 -0
  22. package/__tests__/vault.test.js +506 -0
  23. package/lib/adapter-spec.js +155 -0
  24. package/lib/adapters/email-imap/email-adapter.js +398 -0
  25. package/lib/adapters/email-imap/email-parser.js +177 -0
  26. package/lib/adapters/email-imap/imap-session.js +294 -0
  27. package/lib/adapters/email-imap/index.js +26 -0
  28. package/lib/adapters/email-imap/providers.js +111 -0
  29. package/lib/analysis.js +226 -0
  30. package/lib/batch.js +123 -0
  31. package/lib/bridges/cc-kg-sink.js +264 -0
  32. package/lib/bridges/cc-llm-adapter.js +169 -0
  33. package/lib/bridges/cc-rag-sink.js +118 -0
  34. package/lib/bridges/index.js +44 -0
  35. package/lib/constants.js +92 -0
  36. package/lib/ids.js +103 -0
  37. package/lib/index.js +141 -0
  38. package/lib/key-providers.js +146 -0
  39. package/lib/kg-derive.js +214 -0
  40. package/lib/llm-client.js +171 -0
  41. package/lib/migrations.js +246 -0
  42. package/lib/mock-adapter.js +199 -0
  43. package/lib/prompt-builder.js +205 -0
  44. package/lib/query-parser.js +250 -0
  45. package/lib/rag-derive.js +186 -0
  46. package/lib/registry.js +398 -0
  47. package/lib/schemas.js +379 -0
  48. package/lib/vault.js +883 -0
  49. package/package.json +63 -0
  50. package/vitest.config.js +10 -0
@@ -0,0 +1,398 @@
1
+ /**
2
+ * EmailAdapter — Phase 5.1 of the Personal Data Hub.
3
+ *
4
+ * Connects to a user's IMAP mailbox (QQ / 189 / 163 / Outlook / Gmail /
5
+ * custom), incrementally syncs new envelopes since the last watermark,
6
+ * and emits one RawEvent per email. Body parsing + LLM classification +
7
+ * 6-template extraction land in Phase 5.2–5.4; this phase just gets the
8
+ * envelope flow working end-to-end with proper UIDVALIDITY-change
9
+ * handling.
10
+ *
11
+ * Watermark format `<uidValidity>:<lastUid>`:
12
+ * - Same UIDVALIDITY → fetch UID > lastUid (incremental)
13
+ * - Changed → reset lastUid = 0; vault dedupes via Message-ID
14
+ */
15
+
16
+ "use strict";
17
+
18
+ const {
19
+ EVENT_SUBTYPES,
20
+ PERSON_SUBTYPES,
21
+ CAPTURED_BY,
22
+ } = require("../../constants");
23
+ const { newId } = require("../../ids");
24
+ const { resolveProvider } = require("./providers");
25
+ const {
26
+ ImapSession,
27
+ ImapAuthFailedError,
28
+ ImapConnectionFailedError,
29
+ } = require("./imap-session");
30
+ const { parseRawEmail } = require("./email-parser");
31
+
32
+ const NAME = "email-imap";
33
+ const VERSION = "0.2.0"; // bumped for Phase 5.2 body parsing
34
+
35
/**
 * IMAP mailbox adapter for the Personal Data Hub.
 *
 * `sync()` streams messages from the configured account as RawEvents and
 * `normalize()` converts one RawEvent into vault entities. IMAP access goes
 * through an injectable session factory and MIME parsing through an
 * injectable parser, so both can be replaced in tests.
 */
class EmailAdapter {
  /**
   * @param {object} opts
   * @param {object} opts.account
   *   Must carry `email` (a string containing "@") and a non-empty
   *   `authCode`; the whole object is also handed to `resolveProvider`.
   * @param {function} [opts.sessionFactory]
   *   `(cfg) => session`. Defaults to `new ImapSession(cfg)`.
   * @param {function} [opts.parser]
   *   `async (rawBuffer) => ParsedEmail`. Defaults to `parseRawEmail`.
   * @param {number} [opts.maxBodyChars=8000]
   *   Soft cap applied to the body text stored in `content.text`.
   * @throws {Error} when `opts` or `opts.account` is missing or malformed.
   */
  constructor(opts) {
    if (!opts || typeof opts !== "object") {
      throw new Error("EmailAdapter: opts required");
    }
    const account = opts.account;
    if (!account || typeof account !== "object") {
      throw new Error("EmailAdapter: opts.account required");
    }
    if (typeof account.email !== "string" || !account.email.includes("@")) {
      throw new Error("EmailAdapter: account.email must be a full address");
    }
    if (typeof account.authCode !== "string" || account.authCode.length === 0) {
      throw new Error("EmailAdapter: account.authCode required (provider authorization code)");
    }

    this.account = account;
    this._provider = resolveProvider(account);
    this._sessionFactory = typeof opts.sessionFactory === "function"
      ? opts.sessionFactory
      : (cfg) => new ImapSession(cfg);

    // Opt-out hook for tests that don't want to depend on mailparser.
    this._parser = typeof opts.parser === "function" ? opts.parser : parseRawEmail;

    // Soft cap on bodies stored in vault content.text — long newsletter
    // HTML can be megabytes; trimming keeps downstream budgets sane.
    this._maxBodyChars = Number.isFinite(opts.maxBodyChars) && opts.maxBodyChars > 0
      ? opts.maxBodyChars
      : 8000;

    // The disclosure text must describe the cap that is actually in force,
    // not the default. With the default of 8000 this renders as "~8k".
    const capLabel = this._maxBodyChars >= 1000
      ? `${Math.round(this._maxBodyChars / 1000)}k`
      : String(this._maxBodyChars);

    this.name = NAME;
    this.version = VERSION;
    this.capabilities = ["sync:imap", "auth:authcode", "parse:mime-body", "parse:attachment-metadata"];
    this.rateLimits = { perMinute: 60 };
    this.dataDisclosure = {
      fields: [
        "email:headers (from/to/subject/date/messageId)",
        "email:flags + uid + internalDate",
        `email:body (text + html, capped to ~${capLabel} chars)`,
        "email:attachment-metadata (filename, contentType, size, sha256; no file bytes saved in v0)",
      ],
      sensitivity: "high",
      legalGate: false,
    };
  }

  /**
   * Open a session with the account credentials, then close it again.
   *
   * Connection errors are reported in the return value instead of being
   * thrown. An error thrown by the session factory itself is not caught.
   *
   * @param {object} [_ctx] unused
   * @returns {Promise<object>} `{ ok: true, account, provider }` on success,
   *   otherwise `{ ok: false, reason, error }` where `reason` is
   *   "AUTH_FAILED", "CONNECTION_FAILED" or "UNKNOWN".
   */
  async authenticate(_ctx = {}) {
    const session = this._sessionFactory(this._sessionConfig());
    try {
      await session.connect();
      return { ok: true, account: this.account.email, provider: this._provider.providerId };
    } catch (err) {
      if (err instanceof ImapAuthFailedError) {
        return { ok: false, reason: "AUTH_FAILED", error: err.message };
      }
      if (err instanceof ImapConnectionFailedError) {
        return { ok: false, reason: "CONNECTION_FAILED", error: err.message };
      }
      return { ok: false, reason: "UNKNOWN", error: err && err.message ? err.message : String(err) };
    } finally {
      // Best-effort close: a failure here must not mask the real outcome.
      try { await session.close(); } catch (_e) {}
    }
  }

  /**
   * Health probe built on `authenticate()`.
   *
   * @returns {Promise<object>} `{ ok: true, lastChecked }` or
   *   `{ ok: false, reason, error }`.
   */
  async healthCheck() {
    const r = await this.authenticate();
    if (r.ok) return { ok: true, lastChecked: Date.now() };
    return { ok: false, reason: r.reason || "unknown", error: r.error };
  }

  /**
   * Async generator yielding one RawEvent per fetched message.
   *
   * Folders are opened in order. When the watermark carries a UIDVALIDITY
   * that differs from the opened mailbox's, fetching restarts from UID 0;
   * otherwise it resumes after the watermark's UID.
   *
   * NOTE(review): the single `lastUid` from the watermark is applied to
   * every folder in the list — confirm callers sync one folder per
   * watermark, since IMAP UIDs are scoped per mailbox.
   *
   * @param {object} [opts]
   * @param {string[]} [opts.folders] defaults to the provider's folder list
   * @param {number} [opts.maxPerFolder=5000] stop after this many events per folder
   * @param {string} [opts.sinceWatermark] `<uidValidity>:<lastUid>`
   */
  async *sync(opts = {}) {
    const folders = Array.isArray(opts.folders) && opts.folders.length > 0
      ? opts.folders
      : this._provider.folders;
    const maxPerFolder = Number.isFinite(opts.maxPerFolder) && opts.maxPerFolder > 0
      ? opts.maxPerFolder
      : 5000;
    const watermark = typeof opts.sinceWatermark === "string" ? opts.sinceWatermark : "";
    const { uidValidity: prevUv, lastUid: prevLastUid } = parseWatermark(watermark);

    const session = this._sessionFactory(this._sessionConfig());
    try {
      await session.connect();

      for (const folder of folders) {
        const mb = await session.openMailbox(folder);
        const uvChanged = prevUv !== null && String(prevUv) !== String(mb.uidValidity);
        const since = uvChanged ? 0 : prevLastUid;

        let emitted = 0;
        for await (const env of session.fetchFullSince(since)) {
          // Parse the body in the adapter (not the session) so the session
          // stays a thin IMAP wrapper. Parse failures degrade gracefully:
          // the event is still emitted, carrying only the error message.
          let parsedBody = null;
          try {
            if (env.source && env.source.length > 0) {
              parsedBody = await this._parser(env.source);
            }
          } catch (parseErr) {
            parsedBody = {
              parseError: parseErr && parseErr.message ? parseErr.message : String(parseErr),
            };
          }
          yield this._envelopeToRawEvent(env, folder, parsedBody);
          emitted += 1;
          if (emitted >= maxPerFolder) break;
        }
      }
    } finally {
      // Best-effort close, also reached when the consumer stops iterating.
      try { await session.close(); } catch (_e) {}
    }
  }

  /**
   * Convert a RawEvent produced by `sync()` into vault entities.
   *
   * Emits one message event and, when the first `from` entry has a usable
   * address, one contact person for the sender.
   *
   * @param {object} raw RawEvent; `raw.payload` is the envelope
   * @returns {object} `{ events, persons, places, items, topics }`
   * @throws {Error} when `raw` or `raw.payload` is missing
   */
  normalize(raw) {
    if (!raw || typeof raw !== "object" || !raw.payload) {
      throw new Error("EmailAdapter.normalize: missing raw or raw.payload");
    }
    const env = raw.payload;
    const ingestedAt = Date.now();
    // First usable timestamp wins: server date, header date, capture time, now.
    const occurredAt = (env.internalDate instanceof Date ? env.internalDate.getTime() : 0)
      || (env.date instanceof Date ? env.date.getTime() : 0)
      || raw.capturedAt
      || ingestedAt;

    const persons = [];
    let actorId = "person-self";
    // Only a non-empty string address can identify the sender; a null entry
    // or a non-string address falls back to "person-self" instead of throwing.
    const sender = Array.isArray(env.from) && env.from.length > 0 ? env.from[0] : null;
    if (sender && typeof sender.address === "string" && sender.address.length > 0) {
      const senderAddr = sender.address.toLowerCase();
      const senderId = `person-email-${senderAddr}`;
      const senderName = sender.name || senderAddr;
      persons.push({
        id: senderId,
        type: "person",
        subtype: PERSON_SUBTYPES.CONTACT,
        names: [senderName],
        identifiers: { email: [senderAddr] },
        ingestedAt,
        source: this._source(senderAddr, env.internalDate),
      });
      actorId = senderId;
    }

    const participants = ["person-self"];
    if (actorId !== "person-self") participants.push(actorId);

    const subject = env.subject || "(no subject)";

    // Prefer the parsed text body, then stripped HTML, then a one-line
    // summary built from the envelope.
    const parsedBody = env.parsedBody || null;
    let contentText;
    if (parsedBody && typeof parsedBody.textBody === "string" && parsedBody.textBody.length > 0) {
      contentText = trim(parsedBody.textBody, this._maxBodyChars);
    } else if (parsedBody && typeof parsedBody.htmlBody === "string" && parsedBody.htmlBody.length > 0) {
      contentText = trim(stripHtml(parsedBody.htmlBody), this._maxBodyChars);
    } else {
      contentText = `From: ${env.from && env.from[0] ? formatAddr(env.from[0]) : "?"}; To: ${formatRecipients(env.to)}; subject: ${subject}`;
    }

    // A custom parser may return anything; only an array can be mapped.
    const attachments = parsedBody && Array.isArray(parsedBody.attachments)
      ? parsedBody.attachments.map((a) => ({
        filename: a.filename,
        contentType: a.contentType,
        contentDisposition: a.contentDisposition,
        size: a.size,
        sha256: a.sha256,
        isInline: a.isInline,
        isEncrypted: a.isEncrypted,
      }))
      : null;

    const event = {
      id: newId(),
      type: "event",
      subtype: EVENT_SUBTYPES.MESSAGE,
      occurredAt,
      actor: actorId,
      participants,
      content: {
        title: subject,
        text: contentText,
      },
      ingestedAt,
      source: this._source(env.messageId, env.internalDate),
      extra: {
        emailFolder: env.folder,
        messageId: env.messageId,
        from: env.from || [],
        to: env.to || [],
        cc: env.cc || [],
        flags: env.flags || [],
        uid: env.uid,
        size: env.size,
        accountEmail: this.account.email,
        ...(attachments ? { attachments } : {}),
        ...(parsedBody && parsedBody.contentSha256
          ? { rawSha256: parsedBody.contentSha256 }
          : {}),
        ...(parsedBody && parsedBody.parseError
          ? { parseError: parsedBody.parseError }
          : {}),
        // Keep only the small allowlist of indicator headers, not the full bag.
        ...(parsedBody && parsedBody.headers
          ? { indicatorHeaders: pickIndicatorHeaders(parsedBody.headers) }
          : {}),
      },
    };

    return { events: [event], persons, places: [], items: [], topics: [] };
  }

  /**
   * Build the config object handed to the session factory.
   *
   * @returns {object} `{ host, port, secure, user, authCode }`
   */
  _sessionConfig() {
    return {
      host: this._provider.host,
      port: this._provider.port,
      secure: this._provider.secure,
      user: this.account.email,
      authCode: this.account.authCode,
    };
  }

  /**
   * Wrap a fetched envelope as a RawEvent.
   *
   * @param {object} env envelope yielded by the session
   * @param {string} folder mailbox the envelope came from
   * @param {object|null} parsedBody parser result, or null when none
   * @returns {object} `{ adapter, originalId, capturedAt, payload }`
   */
  _envelopeToRawEvent(env, folder, parsedBody) {
    // Messages without a Message-ID get a synthetic id from account/folder/uid.
    const originalId = env.messageId && env.messageId.length > 0
      ? env.messageId
      : `mid-fallback:${this.account.email}:${folder}:${env.uid}`;
    const capturedAt = env.internalDate instanceof Date && env.internalDate.getTime() > 0
      ? env.internalDate.getTime()
      : Date.now();
    // Drop the raw `source` from the payload: archiving it would bloat the
    // raw-event store, and the parsed body is enough for re-derivation.
    const { source: _src, ...envNoSource } = env;
    return {
      adapter: NAME,
      originalId,
      capturedAt,
      payload: {
        ...envNoSource,
        folder,
        ...(parsedBody ? { parsedBody } : {}),
      },
    };
  }

  /**
   * Build the provenance record attached to every emitted entity.
   *
   * @param {*} originalId kept only when it is a non-empty string
   * @param {*} internalDate used as capture time when it is a positive Date
   * @returns {object} `{ adapter, adapterVersion, capturedAt, capturedBy, originalId }`
   */
  _source(originalId, internalDate) {
    return {
      adapter: NAME,
      adapterVersion: VERSION,
      capturedAt: internalDate instanceof Date && internalDate.getTime() > 0
        ? internalDate.getTime()
        : Date.now(),
      capturedBy: CAPTURED_BY.API,
      originalId: typeof originalId === "string" && originalId.length > 0 ? originalId : undefined,
    };
  }
}
307
+
308
/**
 * Split a `<uidValidity>:<lastUid>` watermark into its two parts.
 *
 * Anything that is not a non-empty string containing ":" yields the
 * "no watermark" result. The UID part is read with `parseInt`; values that
 * are not a positive number become 0. The validity part is returned as the
 * raw text before the first ":".
 *
 * @param {*} s
 * @returns {{uidValidity: (string|null), lastUid: number}}
 */
function parseWatermark(s) {
  const none = { uidValidity: null, lastUid: 0 };
  if (typeof s !== "string" || s.length === 0) return none;

  const sep = s.indexOf(":");
  if (sep === -1) return none;

  const parsedUid = parseInt(s.slice(sep + 1), 10);
  const lastUid = Number.isFinite(parsedUid) && parsedUid > 0 ? parsedUid : 0;
  return { uidValidity: s.slice(0, sep), lastUid };
}
321
+
322
/**
 * Serialize a watermark as `<uidValidity>:<lastUid>`.
 *
 * A null/undefined validity is written as an empty string, and a UID that
 * is not a finite positive number is written as 0.
 *
 * @param {*} uidValidity
 * @param {*} lastUid
 * @returns {string}
 */
function formatWatermark(uidValidity, lastUid) {
  let validityPart = "";
  if (uidValidity !== null && uidValidity !== undefined) {
    validityPart = String(uidValidity);
  }
  const uidIsUsable = Number.isFinite(lastUid) && lastUid > 0;
  return [validityPart, uidIsUsable ? lastUid : 0].join(":");
}
327
+
328
/**
 * Render one address entry as `Name <address>`, or just the address when
 * there is no name. Entries without an address render as "?".
 *
 * @param {object} a entry with `address` and optional `name`
 * @returns {string}
 */
function formatAddr(a) {
  const hasAddress = Boolean(a) && Boolean(a.address);
  if (!hasAddress) return "?";
  if (a.name) return `${a.name} <${a.address}>`;
  return a.address;
}
332
+
333
/**
 * Render a recipient list for the envelope-only summary line.
 *
 * Shows at most the first three entries (each via `formatAddr`) and
 * appends ` (+N more)` for the rest. A missing or empty list renders as "?".
 *
 * @param {object[]} list
 * @returns {string}
 */
function formatRecipients(list) {
  if (!Array.isArray(list) || list.length === 0) return "?";

  const shown = list
    .slice(0, 3)
    .map((entry) => formatAddr(entry))
    .join(", ");
  const hidden = list.length - 3;

  if (hidden > 0) return `${shown} (+${hidden} more)`;
  return shown;
}
338
+
339
/**
 * Cap a string at `max` UTF-16 code units and append a
 * `…[truncated N chars]` marker when anything was cut.
 *
 * The cut never lands between the two halves of a surrogate pair: if it
 * would, the whole character is dropped, so the result contains no lone
 * surrogate that a later UTF-8 encode would turn into U+FFFD.
 *
 * @param {*} s non-strings yield ""
 * @param {number} max maximum number of code units to keep
 * @returns {string}
 */
function trim(s, max) {
  if (typeof s !== "string") return "";
  if (s.length <= max) return s;

  let cut = max;
  const before = s.charCodeAt(cut - 1);
  const after = s.charCodeAt(cut);
  const splitsPair = before >= 0xd800 && before <= 0xdbff
    && after >= 0xdc00 && after <= 0xdfff;
  if (splitsPair) cut -= 1;

  return `${s.slice(0, cut)}…[truncated ${s.length - cut} chars]`;
}
344
+
345
/**
 * Quick HTML→plaintext for messages that have no usable text/plain part.
 *
 * Removes `<style>` and `<script>` elements with their content, turns
 * `<br>` into a newline and `</p>` into a blank line, drops every other
 * tag, decodes a small set of entities, collapses runs of 3+ newlines and
 * trims the result.
 *
 * Entities are decoded in a single pass so already-escaped text is not
 * decoded twice (`&amp;lt;` stays `&lt;`). Numeric entities, decimal and
 * hex, are decoded with `String.fromCodePoint`, so characters above U+FFFF
 * survive. Numeric entities outside the Unicode range are left untouched.
 *
 * @param {*} html coerced with `String()`
 * @returns {string}
 */
function stripHtml(html) {
  const named = { nbsp: " ", amp: "&", lt: "<", gt: ">", quot: '"' };
  const entity = /&(?:(nbsp|amp|lt|gt|quot)|#(\d+)|#[xX]([0-9a-fA-F]+));/g;

  return String(html)
    .replace(/<style\b[^>]*>[\s\S]*?<\/style>/gi, "")
    .replace(/<script\b[^>]*>[\s\S]*?<\/script>/gi, "")
    .replace(/<br\b[^>]*>/gi, "\n")
    .replace(/<\/p>/gi, "\n\n")
    .replace(/<[^>]+>/g, "")
    .replace(entity, (match, name, dec, hex) => {
      if (name !== undefined) return named[name];
      const codePoint = dec !== undefined ? parseInt(dec, 10) : parseInt(hex, 16);
      // fromCodePoint throws a RangeError beyond U+10FFFF; keep the text as-is.
      if (!Number.isInteger(codePoint) || codePoint > 0x10ffff) return match;
      return String.fromCodePoint(codePoint);
    })
    .replace(/\n{3,}/g, "\n\n")
    .trim();
}
366
+
367
/**
 * Header names copied onto each event for later classification, so the
 * event row does not have to carry the full header bag.
 */
const INDICATOR_HEADERS = [
  "list-unsubscribe",
  "list-id",
  "x-mailer",
  "x-priority",
  "auto-submitted",
  "precedence",
  "x-amazon-mail-relay-type",
  "feedback-id",
  "x-campaign",
  "x-mc-user",
];

/**
 * Copy the allowlisted headers that are present (not `undefined`) from a
 * lowercase-keyed header object.
 *
 * @param {*} headers anything that is not a non-null object yields `{}`
 * @returns {object}
 */
function pickIndicatorHeaders(headers) {
  const picked = {};
  if (!headers || typeof headers !== "object") return picked;

  INDICATOR_HEADERS.forEach((name) => {
    if (headers[name] !== undefined) picked[name] = headers[name];
  });
  return picked;
}
391
+
392
+ module.exports = {
393
+ EmailAdapter,
394
+ parseWatermark,
395
+ formatWatermark,
396
+ NAME,
397
+ VERSION,
398
+ };
@@ -0,0 +1,177 @@
1
+ /**
2
+ * Email body parser — Phase 5.2 of the Personal Data Hub.
3
+ *
4
+ * Wraps `mailparser.simpleParser` to consume the raw RFC822 source we
5
+ * now fetch via ImapSession.fetchFullSince, returning a normalized shape
6
+ * the EmailAdapter can stuff into `payload.parsedBody` (so downstream
7
+ * Phase 5.3 classifiers and 5.4 template extractors don't need to redo
8
+ * the parsing).
9
+ *
10
+ * What we keep vs. drop from mailparser output:
11
+ *
12
+ * headers → Map<string, any>, flattened to a plain object
13
+ * keyed by lowercased name. Useful for Phase 5.3 rules
14
+ * (List-Unsubscribe, X-* mailer headers, etc.)
15
+ * textBody → text/plain content (UTF-8 string)
16
+ * htmlBody → text/html content (UTF-8 string)
17
+ * attachments → Array<AttachmentMetadata> — METADATA ONLY in v0
18
+ * per Adapter_Email_IMAP.md §3 OQ-3. Disk persistence
19
+ * + image thumbnail localization land in v0.2.
20
+ * contentSha256→ hex digest of the entire raw source. Lets the vault
21
+ * short-circuit duplicate ingest (same message-id
22
+ * re-uploaded by a misconfigured server).
23
+ *
24
+ * Charset handling is delegated to mailparser + iconv-lite (transitively
25
+ * via libmime). GBK / GB2312 / Big5 / ShiftJIS all "just work" — we
26
+ * never see raw bytes for text content.
27
+ */
28
+
29
+ "use strict";
30
+
31
+ const crypto = require("node:crypto");
32
+ const { simpleParser } = require("mailparser");
33
+
34
+ /**
35
+ * @typedef {object} AttachmentMetadata
36
+ * @property {string} filename original filename (decoded; mailparser handles RFC 2231)
37
+ * @property {string} contentType MIME type, e.g. "application/pdf"
38
+ * @property {string} contentDisposition "attachment" | "inline" | ...
39
+ * @property {string} [contentId] for inline cid: references
40
+ * @property {number} size byte length of decoded content
41
+ * @property {string} sha256 hex digest of decoded content
42
+ * @property {boolean} isInline
43
+ * @property {boolean} isEncrypted heuristic: PDF whose %PDF header is followed by /Encrypt
44
+ * @property {string} [relatesTo] contentId pointer of related Multipart/Related parent (if any); stored as a string
45
+ */
46
+
47
+ /**
48
+ * @typedef {object} ParsedEmail
49
+ * @property {object} headers flat, lowercase-keyed
50
+ * @property {string} textBody may be ""
51
+ * @property {string} htmlBody may be ""
52
+ * @property {AttachmentMetadata[]} attachments
53
+ * @property {string} contentSha256 hex of the raw source
54
+ * @property {number} sourceBytes raw size
55
+ * @property {string|null} subject convenience copy (mailparser also lower-cases per-charset)
56
+ * @property {Date|null} date header Date (separate from envelope.internalDate)
57
+ */
58
+
59
/**
 * Parse an RFC822 raw email and return a ParsedEmail.
 *
 * The raw bytes are hashed with SHA-256 before parsing, header names are
 * lowercased into a plain object, and both bodies are capped at
 * `maxBodyChars`.
 *
 * @param {Buffer|Uint8Array|string} rawSource
 *   Buffers and other Uint8Arrays are used byte-for-byte; any other value
 *   is stringified and encoded as UTF-8.
 * @param {object} [opts] null is treated like `{}`
 * @param {boolean} [opts.keepAttachmentBuffers=false]
 *   Forwarded to `attachmentMeta` as its second argument.
 * @param {number} [opts.maxBodyChars=200000]
 *   Soft cap on text/html body length; must be a finite positive number,
 *   otherwise the default applies.
 * @returns {Promise<ParsedEmail>}
 *   `date` is null when the message has no date or the value is not a
 *   valid date.
 * @throws {Error} when `rawSource` is null/undefined, or when mailparser
 *   rejects (the original error is attached as `cause`).
 */
async function parseRawEmail(rawSource, opts = {}) {
  if (rawSource == null) {
    throw new Error("parseRawEmail: rawSource required (Buffer or string)");
  }
  const options = opts || {};

  let buf;
  if (Buffer.isBuffer(rawSource)) {
    buf = rawSource;
  } else if (rawSource instanceof Uint8Array) {
    // View the same bytes as a Buffer; String() would yield "1,2,3,…".
    buf = Buffer.from(rawSource.buffer, rawSource.byteOffset, rawSource.byteLength);
  } else {
    buf = Buffer.from(String(rawSource), "utf8");
  }
  const contentSha256 = crypto.createHash("sha256").update(buf).digest("hex");

  const keepBufs = !!options.keepAttachmentBuffers;
  const maxBody = Number.isFinite(options.maxBodyChars) && options.maxBodyChars > 0
    ? options.maxBodyChars
    : 200_000;

  // simpleParser handles charset, multipart, nested message/rfc822, etc.
  // skipImageLinks is switched on; per the mailparser docs this stops CID
  // attachments from being inlined into the HTML as data: URLs.
  let parsed;
  try {
    parsed = await simpleParser(buf, { skipImageLinks: true });
  } catch (err) {
    const wrapped = new Error(
      `parseRawEmail: mailparser failed — ${err && err.message ? err.message : err}`
    );
    wrapped.cause = err;
    throw wrapped;
  }

  // Headers arrive as a Map (or, defensively, a plain object). Flatten to
  // a plain object keyed by lowercased name; values are passed through.
  const headers = {};
  if (parsed.headers instanceof Map) {
    for (const [k, v] of parsed.headers.entries()) {
      headers[String(k).toLowerCase()] = v;
    }
  } else if (parsed.headers && typeof parsed.headers === "object") {
    for (const k of Object.keys(parsed.headers)) {
      headers[k.toLowerCase()] = parsed.headers[k];
    }
  }

  const textBody = trim(parsed.text || "", maxBody);
  const htmlBody = trim(parsed.html || "", maxBody);

  const attachments = Array.isArray(parsed.attachments)
    ? parsed.attachments.map((a) => attachmentMeta(a, keepBufs))
    : [];

  // Never hand an Invalid Date downstream: the contract is Date or null.
  let date = null;
  if (parsed.date) {
    const candidate = parsed.date instanceof Date ? parsed.date : new Date(parsed.date);
    if (!Number.isNaN(candidate.getTime())) date = candidate;
  }

  return {
    headers,
    textBody,
    htmlBody,
    attachments,
    contentSha256,
    sourceBytes: buf.length,
    subject: typeof parsed.subject === "string" ? parsed.subject : null,
    date,
  };
}
132
+
133
+ // ─── helpers ───────────────────────────────────────────────────────────
134
+
135
/**
 * Cap a string at `max` UTF-16 code units and append a
 * `…[truncated N chars]` marker when anything was cut.
 *
 * The cut never lands between the two halves of a surrogate pair: if it
 * would, the whole character is dropped, so the result contains no lone
 * surrogate that a later UTF-8 encode would turn into U+FFFD.
 *
 * @param {*} s non-strings yield ""
 * @param {number} max maximum number of code units to keep
 * @returns {string}
 */
function trim(s, max) {
  if (typeof s !== "string") return "";
  if (s.length <= max) return s;

  let cut = max;
  const before = s.charCodeAt(cut - 1);
  const after = s.charCodeAt(cut);
  const splitsPair = before >= 0xd800 && before <= 0xdbff
    && after >= 0xdc00 && after <= 0xdfff;
  if (splitsPair) cut -= 1;

  return `${s.slice(0, cut)}…[truncated ${s.length - cut} chars]`;
}
140
+
141
/**
 * Reduce one mailparser attachment to its metadata record.
 *
 * Fields fall back to defaults when the parser did not supply a string:
 * empty filename, "application/octet-stream", "attachment". Size prefers
 * the reported `size` and falls back to the content length. The digest is
 * empty when there is no Buffer content. Angle brackets at the ends of
 * `contentId` are removed.
 *
 * @param {object} a attachment as produced by the parser
 * @param {boolean} keepBuf also expose the content Buffer as `buffer`
 * @returns {AttachmentMetadata}
 */
function attachmentMeta(a, keepBuf) {
  const bytes = Buffer.isBuffer(a.content) ? a.content : null;

  let digest = "";
  let fallbackSize = 0;
  if (bytes) {
    digest = crypto.createHash("sha256").update(bytes).digest("hex");
    fallbackSize = bytes.length;
  }

  let contentId;
  if (typeof a.contentId === "string") {
    contentId = a.contentId.replace(/^<|>$/g, "");
  }

  const stringOr = (value, fallback) => (typeof value === "string" ? value : fallback);

  const meta = {
    filename: stringOr(a.filename, ""),
    contentType: stringOr(a.contentType, "application/octet-stream"),
    contentDisposition: stringOr(a.contentDisposition, "attachment"),
    contentId,
    size: a.size || fallbackSize,
    sha256: digest,
    isInline: a.contentDisposition === "inline" || !!a.cid,
    isEncrypted: detectEncryptedPdf(a, bytes),
  };

  if (a.related) meta.relatesTo = String(a.related);
  if (keepBuf && bytes) meta.buffer = bytes;
  return meta;
}
161
+
162
/**
 * Cheap heuristic to flag encrypted PDFs.
 *
 * Requires a content type containing "pdf" and a `%PDF-` marker within the
 * first 8 bytes, then looks for `/Encrypt` in the first 4 KB and in the
 * last 4 KB of the content. The tail is scanned because the `/Encrypt`
 * entry belongs to the file trailer (ISO 32000-1 §7.5.5), which ordinary
 * non-linearized PDFs write at the end of the file.
 *
 * False positives are acceptable for a flag like this; a non-string
 * content type simply yields false.
 *
 * @param {object} a attachment; only `contentType` is read
 * @param {Buffer|null} content decoded attachment bytes
 * @returns {boolean}
 */
function detectEncryptedPdf(a, content) {
  if (!content) return false;

  const type = a && typeof a.contentType === "string" ? a.contentType.toLowerCase() : "";
  if (!type.includes("pdf")) return false;
  if (!content.subarray(0, 8).toString("ascii").includes("%PDF-")) return false;

  const windowBytes = 4096;
  const marker = "/Encrypt";
  if (content.subarray(0, windowBytes).toString("binary").includes(marker)) return true;

  // Small files were fully covered by the head scan.
  if (content.length <= windowBytes) return false;
  return content.subarray(content.length - windowBytes).toString("binary").includes(marker);
}
174
+
175
+ module.exports = {
176
+ parseRawEmail,
177
+ };