@chainlesschain/personal-data-hub 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +241 -0
- package/__tests__/adapter-spec.test.js +78 -0
- package/__tests__/adapters/email-adapter.test.js +605 -0
- package/__tests__/adapters/email-imap-session.test.js +334 -0
- package/__tests__/adapters/email-parser.test.js +244 -0
- package/__tests__/adapters/email-providers.test.js +84 -0
- package/__tests__/analysis.test.js +302 -0
- package/__tests__/batch.test.js +133 -0
- package/__tests__/bridges-cc-kg.test.js +231 -0
- package/__tests__/bridges-cc-llm.test.js +191 -0
- package/__tests__/bridges-cc-rag.test.js +162 -0
- package/__tests__/ids.test.js +45 -0
- package/__tests__/key-providers.test.js +126 -0
- package/__tests__/kg-derive.test.js +219 -0
- package/__tests__/llm-client.test.js +122 -0
- package/__tests__/mock-adapter.test.js +93 -0
- package/__tests__/prompt-builder.test.js +204 -0
- package/__tests__/query-parser.test.js +150 -0
- package/__tests__/rag-derive.test.js +169 -0
- package/__tests__/registry.test.js +304 -0
- package/__tests__/schemas.test.js +331 -0
- package/__tests__/vault.test.js +506 -0
- package/lib/adapter-spec.js +155 -0
- package/lib/adapters/email-imap/email-adapter.js +398 -0
- package/lib/adapters/email-imap/email-parser.js +177 -0
- package/lib/adapters/email-imap/imap-session.js +294 -0
- package/lib/adapters/email-imap/index.js +26 -0
- package/lib/adapters/email-imap/providers.js +111 -0
- package/lib/analysis.js +226 -0
- package/lib/batch.js +123 -0
- package/lib/bridges/cc-kg-sink.js +264 -0
- package/lib/bridges/cc-llm-adapter.js +169 -0
- package/lib/bridges/cc-rag-sink.js +118 -0
- package/lib/bridges/index.js +44 -0
- package/lib/constants.js +92 -0
- package/lib/ids.js +103 -0
- package/lib/index.js +141 -0
- package/lib/key-providers.js +146 -0
- package/lib/kg-derive.js +214 -0
- package/lib/llm-client.js +171 -0
- package/lib/migrations.js +246 -0
- package/lib/mock-adapter.js +199 -0
- package/lib/prompt-builder.js +205 -0
- package/lib/query-parser.js +250 -0
- package/lib/rag-derive.js +186 -0
- package/lib/registry.js +398 -0
- package/lib/schemas.js +379 -0
- package/lib/vault.js +883 -0
- package/package.json +63 -0
- package/vitest.config.js +10 -0
|
@@ -0,0 +1,398 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* EmailAdapter — Phase 5.1 of the Personal Data Hub.
|
|
3
|
+
*
|
|
4
|
+
* Connects to a user's IMAP mailbox (QQ / 189 / 163 / Outlook / Gmail /
|
|
5
|
+
* custom), incrementally syncs new envelopes since the last watermark,
|
|
6
|
+
* and emits one RawEvent per email. Body parsing + LLM classification +
|
|
7
|
+
* 6-template extraction land in Phase 5.2–5.4; this phase just gets the
|
|
8
|
+
* envelope flow working end-to-end with proper UIDVALIDITY-change
|
|
9
|
+
* handling.
|
|
10
|
+
*
|
|
11
|
+
* Watermark format `<uidValidity>:<lastUid>`:
|
|
12
|
+
* - Same UIDVALIDITY → fetch UID > lastUid (incremental)
|
|
13
|
+
* - Changed → reset lastUid = 0; vault dedupes via Message-ID
|
|
14
|
+
*/
|
|
15
|
+
|
|
16
|
+
"use strict";
|
|
17
|
+
|
|
18
|
+
const {
|
|
19
|
+
EVENT_SUBTYPES,
|
|
20
|
+
PERSON_SUBTYPES,
|
|
21
|
+
CAPTURED_BY,
|
|
22
|
+
} = require("../../constants");
|
|
23
|
+
const { newId } = require("../../ids");
|
|
24
|
+
const { resolveProvider } = require("./providers");
|
|
25
|
+
const {
|
|
26
|
+
ImapSession,
|
|
27
|
+
ImapAuthFailedError,
|
|
28
|
+
ImapConnectionFailedError,
|
|
29
|
+
} = require("./imap-session");
|
|
30
|
+
const { parseRawEmail } = require("./email-parser");
|
|
31
|
+
|
|
32
|
+
const NAME = "email-imap";
|
|
33
|
+
const VERSION = "0.2.0"; // bumped for Phase 5.2 body parsing
|
|
34
|
+
|
|
35
|
+
/**
 * Adapter that pulls emails from an IMAP mailbox and turns them into hub
 * records.
 *
 * Lifecycle as implemented below:
 *   - `authenticate()` / `healthCheck()` open and close a throw-away session.
 *   - `sync()` is an async generator yielding one RawEvent per fetched email.
 *   - `normalize()` converts one RawEvent into `{ events, persons, ... }`.
 */
class EmailAdapter {
  /**
   * @param {object} opts
   * @param {object} opts.account
   *   must carry `email` (a string containing "@") and a non-empty string
   *   `authCode`; the whole object is also handed to `resolveProvider`.
   * @param {function} [opts.sessionFactory]
   *   `(cfg) => session`; defaults to `new ImapSession(cfg)`.
   * @param {function} [opts.parser]
   *   `async (rawBuffer) => ParsedEmail`; defaults to `parseRawEmail`.
   * @param {number} [opts.maxBodyChars=8000]
   *   cap applied to `content.text` in `normalize()`.
   * @throws {Error} when `opts` or any required account field is missing/invalid
   */
  constructor(opts) {
    if (!opts || typeof opts !== "object") {
      throw new Error("EmailAdapter: opts required");
    }
    const account = opts.account;
    if (!account || typeof account !== "object") {
      throw new Error("EmailAdapter: opts.account required");
    }
    if (typeof account.email !== "string" || !account.email.includes("@")) {
      throw new Error("EmailAdapter: account.email must be a full address");
    }
    if (typeof account.authCode !== "string" || account.authCode.length === 0) {
      throw new Error("EmailAdapter: account.authCode required (provider authorization code)");
    }

    this.account = account;
    this._provider = resolveProvider(account);
    this._sessionFactory = typeof opts.sessionFactory === "function"
      ? opts.sessionFactory
      : (cfg) => new ImapSession(cfg);

    // Phase 5.2: opt-out hook for tests that don't want to depend on
    // mailparser. parser must be `async (rawBuffer) => ParsedEmail`.
    this._parser = typeof opts.parser === "function" ? opts.parser : parseRawEmail;
    // Soft cap on bodies stored in vault content.text — long newsletter
    // HTML can be megabytes; trimming keeps `events` row + KG triple
    // + RAG embed budgets sane.
    this._maxBodyChars = Number.isFinite(opts.maxBodyChars) && opts.maxBodyChars > 0
      ? opts.maxBodyChars
      : 8000;

    this.name = NAME;
    this.version = VERSION;
    this.capabilities = ["sync:imap", "auth:authcode", "parse:mime-body", "parse:attachment-metadata"];
    this.rateLimits = { perMinute: 60 };
    this.dataDisclosure = {
      fields: [
        "email:headers (from/to/subject/date/messageId)",
        "email:flags + uid + internalDate",
        "email:body (text + html, capped to ~8k chars)",
        "email:attachment-metadata (filename, contentType, size, sha256; no file bytes saved in v0)",
      ],
      sensitivity: "high",
      legalGate: false,
    };
  }

  /**
   * Open a session, report whether the connect succeeded, and always close
   * the session again. Never throws for connect failures: they are mapped
   * to `{ ok: false, reason, error }`.
   *
   * @returns {Promise<object>}
   *   `{ ok: true, account, provider }` on success, otherwise
   *   `{ ok: false, reason: "AUTH_FAILED" | "CONNECTION_FAILED" | "UNKNOWN", error }`.
   */
  async authenticate(_ctx = {}) {
    const session = this._sessionFactory(this._sessionConfig());
    try {
      await session.connect();
      return { ok: true, account: this.account.email, provider: this._provider.providerId };
    } catch (err) {
      if (err instanceof ImapAuthFailedError) {
        return { ok: false, reason: "AUTH_FAILED", error: err.message };
      }
      if (err instanceof ImapConnectionFailedError) {
        return { ok: false, reason: "CONNECTION_FAILED", error: err.message };
      }
      return { ok: false, reason: "UNKNOWN", error: err && err.message ? err.message : String(err) };
    } finally {
      // Best-effort close: a failure here must not mask the result above.
      try { await session.close(); } catch (_e) { /* ignored on purpose */ }
    }
  }

  /**
   * Health probe built on `authenticate()`.
   *
   * @returns {Promise<object>}
   *   `{ ok: true, lastChecked }` or `{ ok: false, reason, error }`.
   */
  async healthCheck() {
    const r = await this.authenticate();
    if (r.ok) return { ok: true, lastChecked: Date.now() };
    return { ok: false, reason: r.reason || "unknown", error: r.error };
  }

  /**
   * Async generator: connects, walks the requested folders, and yields one
   * RawEvent per message returned by `session.fetchFullSince(since)`.
   *
   * @param {object} [opts]
   * @param {string[]} [opts.folders]  defaults to the provider's folder list
   * @param {number} [opts.maxPerFolder=5000]  stop after this many events per folder
   * @param {string} [opts.sinceWatermark]  `<uidValidity>:<lastUid>`; anything else means "from the start"
   * @yields {object} RawEvent as built by `_envelopeToRawEvent`
   */
  async *sync(opts = {}) {
    const folders = Array.isArray(opts.folders) && opts.folders.length > 0
      ? opts.folders
      : this._provider.folders;
    const maxPerFolder = Number.isFinite(opts.maxPerFolder) && opts.maxPerFolder > 0
      ? opts.maxPerFolder
      : 5000;
    const watermark = typeof opts.sinceWatermark === "string" ? opts.sinceWatermark : "";
    // NOTE(review): the watermark is parsed once and then compared against
    // every folder's UIDVALIDITY below, so with more than one folder the
    // same <uidValidity>:<lastUid> pair is applied to all of them.
    const { uidValidity: prevUv, lastUid: prevLastUid } = parseWatermark(watermark);

    const session = this._sessionFactory(this._sessionConfig());
    try {
      await session.connect();

      for (const folder of folders) {
        const mb = await session.openMailbox(folder);
        // A changed UIDVALIDITY invalidates the stored UID, so restart at 0.
        const uvChanged = prevUv !== null && String(prevUv) !== String(mb.uidValidity);
        const since = uvChanged ? 0 : prevLastUid;

        let emitted = 0;
        for await (const env of session.fetchFullSince(since)) {
          // Parse the body in the adapter (not the session) so the
          // session stays a thin IMAP wrapper. Parse failures degrade
          // gracefully — emit the raw event without parsedBody so the
          // registry's invalidCount tracker isn't tripped by every
          // weird MIME structure we hit in the wild.
          let parsedBody = null;
          try {
            if (env.source && env.source.length > 0) {
              parsedBody = await this._parser(env.source);
            }
          } catch (parseErr) {
            // Phase 5.3 classifier rules can still fire on envelope-only
            // facts; we just lose body text + attachments for this email.
            parsedBody = {
              parseError: parseErr && parseErr.message ? parseErr.message : String(parseErr),
            };
          }
          yield this._envelopeToRawEvent(env, folder, parsedBody);
          emitted += 1;
          if (emitted >= maxPerFolder) break;
        }
      }
    } finally {
      // Best-effort close; also runs when the consumer stops iterating early.
      try { await session.close(); } catch (_e) { /* ignored on purpose */ }
    }
  }

  /**
   * Convert one RawEvent (as yielded by `sync`) into hub records.
   *
   * @param {object} raw  must have a truthy `payload`
   * @returns {object} `{ events: [event], persons, places: [], items: [], topics: [] }`
   * @throws {Error} when `raw` or `raw.payload` is missing
   */
  normalize(raw) {
    if (!raw || typeof raw !== "object" || !raw.payload) {
      throw new Error("EmailAdapter.normalize: missing raw or raw.payload");
    }
    const env = raw.payload;
    const ingestedAt = Date.now();
    // First usable timestamp wins: internalDate, header date, capture time, now.
    const occurredAt = (env.internalDate instanceof Date ? env.internalDate.getTime() : 0)
      || (env.date instanceof Date ? env.date.getTime() : 0)
      || raw.capturedAt
      || ingestedAt;

    const persons = [];
    let actorId = "person-self";
    // Guard the first sender entry: it may be null/undefined or carry a
    // non-string address, either of which used to throw a TypeError here.
    const sender = Array.isArray(env.from) && env.from.length > 0 ? env.from[0] : null;
    if (sender && typeof sender.address === "string" && sender.address.length > 0) {
      const senderAddr = sender.address.toLowerCase();
      const senderId = `person-email-${senderAddr}`;
      const senderName = sender.name || senderAddr;
      persons.push({
        id: senderId,
        type: "person",
        subtype: PERSON_SUBTYPES.CONTACT,
        names: [senderName],
        identifiers: { email: [senderAddr] },
        ingestedAt,
        source: this._source(senderAddr, env.internalDate),
      });
      actorId = senderId;
    }

    const participants = ["person-self"];
    if (actorId !== "person-self") participants.push(actorId);

    const subject = env.subject || "(no subject)";

    // Phase 5.2: prefer the parsed text body over the envelope-only
    // placeholder. Falls back to the recipient prose when body parsing
    // failed or the email was envelope-only fetched.
    const parsedBody = env.parsedBody || null;
    let contentText;
    if (parsedBody && typeof parsedBody.textBody === "string" && parsedBody.textBody.length > 0) {
      contentText = trim(parsedBody.textBody, this._maxBodyChars);
    } else if (parsedBody && typeof parsedBody.htmlBody === "string" && parsedBody.htmlBody.length > 0) {
      // For HTML-only newsletters where the text/plain part is empty,
      // keep a crude strip — analysis prompts handle HTML fine, but
      // BM25 tokenization works better on stripped text.
      contentText = trim(stripHtml(parsedBody.htmlBody), this._maxBodyChars);
    } else {
      contentText = `From: ${env.from && env.from[0] ? formatAddr(env.from[0]) : "?"}; To: ${formatRecipients(env.to)}; subject: ${subject}`;
    }

    const event = {
      id: newId(),
      type: "event",
      subtype: EVENT_SUBTYPES.MESSAGE,
      occurredAt,
      actor: actorId,
      participants,
      content: {
        title: subject,
        text: contentText,
      },
      ingestedAt,
      source: this._source(env.messageId, env.internalDate),
      extra: {
        emailFolder: env.folder,
        messageId: env.messageId,
        from: env.from || [],
        to: env.to || [],
        cc: env.cc || [],
        flags: env.flags || [],
        uid: env.uid,
        size: env.size,
        accountEmail: this.account.email,
        // Only map when it really is an array; the parser is an injectable
        // hook, so its output shape is not guaranteed.
        ...(parsedBody && Array.isArray(parsedBody.attachments)
          ? {
              attachments: parsedBody.attachments.map((a) => ({
                filename: a.filename,
                contentType: a.contentType,
                contentDisposition: a.contentDisposition,
                size: a.size,
                sha256: a.sha256,
                isInline: a.isInline,
                isEncrypted: a.isEncrypted,
              })),
            }
          : {}),
        ...(parsedBody && parsedBody.contentSha256
          ? { rawSha256: parsedBody.contentSha256 }
          : {}),
        ...(parsedBody && parsedBody.parseError
          ? { parseError: parsedBody.parseError }
          : {}),
        // List-Unsubscribe + other indicator headers will fuel Phase 5.3
        // classification — stash a small allowlist now.
        ...(parsedBody && parsedBody.headers
          ? {
              indicatorHeaders: pickIndicatorHeaders(parsedBody.headers),
            }
          : {}),
      },
    };

    return { events: [event], persons, places: [], items: [], topics: [] };
  }

  /**
   * Connection settings handed to the session factory.
   *
   * @returns {object} `{ host, port, secure, user, authCode }`
   */
  _sessionConfig() {
    return {
      host: this._provider.host,
      port: this._provider.port,
      secure: this._provider.secure,
      user: this.account.email,
      authCode: this.account.authCode,
    };
  }

  /**
   * Wrap a fetched envelope as a RawEvent.
   *
   * @param {object} env  envelope from the session; `source` is dropped from the payload
   * @param {string} folder  folder the envelope came from
   * @param {object|null} parsedBody  parser output, or `{ parseError }`, or null
   * @returns {object} `{ adapter, originalId, capturedAt, payload }`
   */
  _envelopeToRawEvent(env, folder, parsedBody) {
    // Without a Message-ID, fall back to an id built from account/folder/UID.
    const originalId = env.messageId && env.messageId.length > 0
      ? env.messageId
      : `mid-fallback:${this.account.email}:${folder}:${env.uid}`;
    const capturedAt = env.internalDate instanceof Date && env.internalDate.getTime() > 0
      ? env.internalDate.getTime()
      : Date.now();
    // Strip the raw `source` Buffer from payload — keeping it would
    // bloat the vault's raw_events archive 100x (raw is in worst case
    // hundreds of KB per email; the parsed body alone is enough for
    // re-derive). The source is recoverable by re-syncing if absolutely
    // needed.
    const { source: _src, ...envNoSource } = env;
    return {
      adapter: NAME,
      originalId,
      capturedAt,
      payload: {
        ...envNoSource,
        folder,
        ...(parsedBody ? { parsedBody } : {}),
      },
    };
  }

  /**
   * Build the provenance (`source`) record attached to events and persons.
   *
   * @param {*} originalId  kept only when it is a non-empty string
   * @param {*} internalDate  used for `capturedAt` when it is a Date with a positive timestamp
   * @returns {object} `{ adapter, adapterVersion, capturedAt, capturedBy, originalId }`
   */
  _source(originalId, internalDate) {
    return {
      adapter: NAME,
      adapterVersion: VERSION,
      capturedAt: internalDate instanceof Date && internalDate.getTime() > 0
        ? internalDate.getTime()
        : Date.now(),
      capturedBy: CAPTURED_BY.API,
      originalId: typeof originalId === "string" && originalId.length > 0 ? originalId : undefined,
    };
  }
}
|
|
307
|
+
|
|
308
|
+
/**
 * Split a `<uidValidity>:<lastUid>` watermark at its first colon.
 *
 * @param {*} s  watermark string; non-strings, "" and strings without ":" mean "no watermark"
 * @returns {{uidValidity: (string|null), lastUid: number}}
 *   `uidValidity` is the text before the first colon (null when there is
 *   no watermark); `lastUid` is the positive integer after it, else 0.
 */
function parseWatermark(s) {
  const sep = typeof s === "string" ? s.indexOf(":") : -1;
  if (sep < 0) {
    return { uidValidity: null, lastUid: 0 };
  }
  const parsedUid = parseInt(s.slice(sep + 1), 10);
  const lastUid = Number.isFinite(parsedUid) && parsedUid > 0 ? parsedUid : 0;
  return { uidValidity: s.slice(0, sep), lastUid };
}
|
|
321
|
+
|
|
322
|
+
/**
 * Serialize a watermark as `<uidValidity>:<lastUid>`.
 *
 * @param {*} uidValidity  stringified; null/undefined become the empty string
 * @param {*} lastUid  kept when it is a finite number greater than 0, otherwise 0
 * @returns {string}
 */
function formatWatermark(uidValidity, lastUid) {
  let uvPart = "";
  if (uidValidity != null) {
    uvPart = String(uidValidity);
  }
  let uidPart = 0;
  if (Number.isFinite(lastUid) && lastUid > 0) {
    uidPart = lastUid;
  }
  return [uvPart, uidPart].join(":");
}
|
|
327
|
+
|
|
328
|
+
/**
 * Render one address entry for display.
 *
 * @param {*} a  expected to look like `{ name?, address }`
 * @returns {*} "?" when there is no address, `name <address>` when a name
 *   is present, otherwise the address itself.
 */
function formatAddr(a) {
  const address = a ? a.address : undefined;
  if (!address) {
    return "?";
  }
  if (a.name) {
    return `${a.name} <${address}>`;
  }
  return address;
}
|
|
332
|
+
|
|
333
|
+
/**
 * Render a recipient list: the first three entries via `formatAddr`,
 * comma-separated, followed by ` (+N more)` when the list is longer.
 *
 * @param {*} list  array of address entries
 * @returns {string} "?" for a non-array or an empty array
 */
function formatRecipients(list) {
  if (!Array.isArray(list) || list.length === 0) {
    return "?";
  }
  const shown = list.slice(0, 3).map((entry) => formatAddr(entry));
  const head = shown.join(", ");
  if (list.length > 3) {
    return `${head} (+${list.length - 3} more)`;
  }
  return head;
}
|
|
338
|
+
|
|
339
|
+
/**
 * Cap a string at `max` UTF-16 units and append a truncation marker.
 *
 * The cut never lands between the two halves of a surrogate pair: if it
 * would, it moves back one unit so the result does not end the kept text
 * with a lone high surrogate.
 *
 * @param {*} s  value to cap; non-strings yield ""
 * @param {number} max  maximum number of UTF-16 units to keep
 * @returns {string} `s` unchanged when it fits, otherwise the kept prefix
 *   followed by `…[truncated N chars]`, N being the number of units dropped.
 */
function trim(s, max) {
  if (typeof s !== "string") return "";
  if (s.length <= max) return s;
  let cut = max;
  if (cut > 0) {
    const last = s.charCodeAt(cut - 1);
    const next = s.charCodeAt(cut);
    const splitsPair = last >= 0xd800 && last <= 0xdbff && next >= 0xdc00 && next <= 0xdfff;
    if (splitsPair) cut -= 1;
  }
  return `${s.slice(0, cut)}…[truncated ${s.length - cut} chars]`;
}
|
|
344
|
+
|
|
345
|
+
/**
 * Quick HTML→plaintext for cases where text/plain part is missing.
 * Phase 5.4 templating may upgrade to cheerio if structure matters,
 * but for BM25 tokenization + LLM prompt prose, a basic strip is fine.
 *
 * Steps, in order: drop <style>/<script> elements with their content,
 * turn <br> into "\n" and </p> into "\n\n", drop every remaining tag,
 * decode character references, collapse runs of 3+ newlines, trim.
 *
 * Character references are decoded in a single pass so already-decoded
 * output is never decoded again (an escaped "lt" reference stays escaped
 * text instead of turning into "<"). Supported: the named references
 * nbsp, amp, lt, gt, quot, apos, plus decimal and hexadecimal numeric
 * references. Numeric values above U+10FFFF are left untouched.
 *
 * @param {*} html  stringified before processing
 * @returns {string}
 */
function stripHtml(html) {
  const named = { nbsp: " ", amp: "&", lt: "<", gt: ">", quot: '"', apos: "'" };
  const decodeReference = (match, dec, hex, name) => {
    if (name !== undefined) return named[name.toLowerCase()];
    const codePoint = dec !== undefined ? parseInt(dec, 10) : parseInt(hex, 16);
    // fromCodePoint throws above U+10FFFF; keep such references verbatim.
    return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : match;
  };
  return String(html)
    .replace(/<style\b[^>]*>[\s\S]*?<\/style>/gi, "")
    .replace(/<script\b[^>]*>[\s\S]*?<\/script>/gi, "")
    .replace(/<br\b[^>]*>/gi, "\n")
    .replace(/<\/p>/gi, "\n\n")
    .replace(/<[^>]+>/g, "")
    .replace(/&(?:#(\d+)|#x([0-9a-f]+)|(nbsp|amp|lt|gt|quot|apos));/gi, decodeReference)
    .replace(/\n{3,}/g, "\n\n")
    .trim();
}
|
|
366
|
+
|
|
367
|
+
/**
 * Allowlist of header names (lowercase) copied onto each Event row for
 * the Phase 5.3 classifier rules, so the full header bag is not stored.
 */
const INDICATOR_HEADERS = [
  "list-unsubscribe",
  "list-id",
  "x-mailer",
  "x-priority",
  "auto-submitted",
  "precedence",
  "x-amazon-mail-relay-type",
  "feedback-id",
  "x-campaign",
  "x-mc-user",
];

/**
 * Copy the allowlisted headers that are present (value !== undefined).
 *
 * @param {*} headers  header bag keyed by lowercase name
 * @returns {object} new object in allowlist order; {} for a non-object input
 */
function pickIndicatorHeaders(headers) {
  if (!headers || typeof headers !== "object") {
    return {};
  }
  const present = INDICATOR_HEADERS.filter((key) => headers[key] !== undefined);
  return Object.fromEntries(present.map((key) => [key, headers[key]]));
}
|
|
391
|
+
|
|
392
|
+
module.exports = {
|
|
393
|
+
EmailAdapter,
|
|
394
|
+
parseWatermark,
|
|
395
|
+
formatWatermark,
|
|
396
|
+
NAME,
|
|
397
|
+
VERSION,
|
|
398
|
+
};
|
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Email body parser — Phase 5.2 of the Personal Data Hub.
|
|
3
|
+
*
|
|
4
|
+
* Wraps `mailparser.simpleParser` to consume the raw RFC822 source we
|
|
5
|
+
* now fetch via ImapSession.fetchFullSince, returning a normalized shape
|
|
6
|
+
* the EmailAdapter can stuff into `payload.parsedBody` (so downstream
|
|
7
|
+
* Phase 5.3 classifiers and 5.4 template extractors don't need to redo
|
|
8
|
+
* the parsing).
|
|
9
|
+
*
|
|
10
|
+
* What we keep vs. drop from mailparser output:
|
|
11
|
+
*
|
|
12
|
+
* headers → Map<string, any>, flattened to a plain object
|
|
13
|
+
* keyed by lowercased name. Useful for Phase 5.3 rules
|
|
14
|
+
* (List-Unsubscribe, X-* mailer headers, etc.)
|
|
15
|
+
* textBody → text/plain content (UTF-8 string)
|
|
16
|
+
* htmlBody → text/html content (UTF-8 string)
|
|
17
|
+
* attachments → Array<AttachmentMetadata> — METADATA ONLY in v0
|
|
18
|
+
* per Adapter_Email_IMAP.md §3 OQ-3. Disk persistence
|
|
19
|
+
* + image thumbnail localization land in v0.2.
|
|
20
|
+
* contentSha256→ hex digest of the entire raw source. Lets the vault
|
|
21
|
+
* short-circuit duplicate ingest (same message-id
|
|
22
|
+
* re-uploaded by a misconfigured server).
|
|
23
|
+
*
|
|
24
|
+
* Charset handling is delegated to mailparser + iconv-lite (transitively
|
|
25
|
+
* via libmime). GBK / GB2312 / Big5 / ShiftJIS all "just work" — we
|
|
26
|
+
* never see raw bytes for text content.
|
|
27
|
+
*/
|
|
28
|
+
|
|
29
|
+
"use strict";
|
|
30
|
+
|
|
31
|
+
const crypto = require("node:crypto");
|
|
32
|
+
const { simpleParser } = require("mailparser");
|
|
33
|
+
|
|
34
|
+
/**
|
|
35
|
+
* @typedef {object} AttachmentMetadata
|
|
36
|
+
* @property {string} filename original filename (decoded; mailparser handles RFC 2231)
|
|
37
|
+
* @property {string} contentType MIME type, e.g. "application/pdf"
|
|
38
|
+
* @property {string} contentDisposition "attachment" | "inline" | ...
|
|
39
|
+
* @property {string} [contentId] for inline cid: references
|
|
40
|
+
* @property {number} size byte length of decoded content
|
|
41
|
+
* @property {string} sha256 hex digest of decoded content
|
|
42
|
+
* @property {boolean} isInline
|
|
43
|
+
* @property {boolean} isEncrypted heuristic: PDF whose %PDF header is followed by /Encrypt
|
|
44
|
+
* @property {string} [relatesTo] stringified `related` value of the parsed attachment; set only when that value is truthy
|
|
45
|
+
*/
|
|
46
|
+
|
|
47
|
+
/**
|
|
48
|
+
* @typedef {object} ParsedEmail
|
|
49
|
+
* @property {object} headers flat, lowercase-keyed
|
|
50
|
+
* @property {string} textBody may be ""
|
|
51
|
+
* @property {string} htmlBody may be ""
|
|
52
|
+
* @property {AttachmentMetadata[]} attachments
|
|
53
|
+
* @property {string} contentSha256 hex of the raw source
|
|
54
|
+
* @property {number} sourceBytes raw size
|
|
55
|
+
* @property {string|null} subject convenience copy (mailparser also lower-cases per-charset)
|
|
56
|
+
* @property {Date|null} date header Date (separate from envelope.internalDate)
|
|
57
|
+
*/
|
|
58
|
+
|
|
59
|
+
/**
 * Parse an RFC822 raw email (Buffer or string). Returns a ParsedEmail or
 * throws on a fatal parse failure (rare — mailparser is very lenient).
 *
 * @param {Buffer|string} rawSource
 *   non-Buffer input is stringified and encoded as UTF-8 before hashing
 *   and parsing.
 * @param {object} [opts]
 * @param {boolean} [opts.keepAttachmentBuffers=false]
 *   if true, AttachmentMetadata.buffer (Buffer) is also attached.
 *   Default false — v0 saves only metadata to keep prompt + KG
 *   budgets sane. PDF decryption (Phase 5.5) will pass true.
 * @param {number} [opts.maxBodyChars=200000]
 *   soft cap on text/html body length. Long Newsletter html in
 *   particular can be megabytes; trimming here keeps vault row
 *   sizes reasonable. Truncated bodies get `…[truncated N chars]`
 *   appended.
 * @returns {Promise<ParsedEmail>}
 * @throws {Error} when `rawSource` is null/undefined, or when mailparser
 *   rejects (the original error is available as `.cause`)
 */
async function parseRawEmail(rawSource, opts = {}) {
  if (rawSource == null) {
    throw new Error("parseRawEmail: rawSource required (Buffer or string)");
  }
  // Tolerate an explicit `null` options argument.
  const options = opts || {};
  const buf = Buffer.isBuffer(rawSource) ? rawSource : Buffer.from(String(rawSource), "utf8");
  const contentSha256 = crypto.createHash("sha256").update(buf).digest("hex");

  const keepBufs = !!options.keepAttachmentBuffers;
  const maxBody = Number.isFinite(options.maxBodyChars) && options.maxBodyChars > 0
    ? options.maxBodyChars
    : 200_000;

  // simpleParser handles charset, multipart, nested message/rfc822, etc.
  // skipImageLinks is switched on; every other parser option keeps its
  // default.
  let parsed;
  try {
    parsed = await simpleParser(buf, { skipImageLinks: true });
  } catch (err) {
    const wrapped = new Error(
      `parseRawEmail: mailparser failed — ${err && err.message ? err.message : err}`
    );
    wrapped.cause = err;
    throw wrapped;
  }

  // Headers: mailparser returns a Map. Flatten to plain object,
  // lowercased keys. Multi-value headers become arrays.
  const headers = {};
  if (parsed.headers instanceof Map) {
    for (const [k, v] of parsed.headers.entries()) {
      headers[String(k).toLowerCase()] = v;
    }
  } else if (parsed.headers && typeof parsed.headers === "object") {
    for (const k of Object.keys(parsed.headers)) {
      headers[k.toLowerCase()] = parsed.headers[k];
    }
  }

  const textBody = trim(parsed.text || "", maxBody);
  const htmlBody = trim(parsed.html || "", maxBody);

  const attachments = Array.isArray(parsed.attachments)
    ? parsed.attachments.map((a) => attachmentMeta(a, keepBufs))
    : [];

  // Normalize the header date to a valid Date or null, so callers never
  // receive an "Invalid Date" instance.
  let date = null;
  if (parsed.date) {
    const candidate = parsed.date instanceof Date ? parsed.date : new Date(parsed.date);
    if (!Number.isNaN(candidate.getTime())) date = candidate;
  }

  return {
    headers,
    textBody,
    htmlBody,
    attachments,
    contentSha256,
    sourceBytes: buf.length,
    subject: typeof parsed.subject === "string" ? parsed.subject : null,
    date,
  };
}
|
|
132
|
+
|
|
133
|
+
// ─── helpers ───────────────────────────────────────────────────────────
|
|
134
|
+
|
|
135
|
+
/**
 * Cap a string at `max` UTF-16 units and append a truncation marker.
 *
 * The cut never lands between the two halves of a surrogate pair: if it
 * would, it moves back one unit so the kept text does not end with a
 * lone high surrogate.
 *
 * @param {*} s  value to cap; non-strings yield ""
 * @param {number} max  maximum number of UTF-16 units to keep
 * @returns {string} `s` unchanged when it fits, otherwise the kept prefix
 *   followed by `…[truncated N chars]`, N being the number of units dropped.
 */
function trim(s, max) {
  if (typeof s !== "string") return "";
  if (s.length <= max) return s;
  let cut = max;
  if (cut > 0) {
    const last = s.charCodeAt(cut - 1);
    const next = s.charCodeAt(cut);
    const splitsPair = last >= 0xd800 && last <= 0xdbff && next >= 0xdc00 && next <= 0xdfff;
    if (splitsPair) cut -= 1;
  }
  return `${s.slice(0, cut)}…[truncated ${s.length - cut} chars]`;
}
|
|
140
|
+
|
|
141
|
+
/**
 * Reduce one parsed attachment to the metadata record stored by the hub.
 *
 * @param {object} a  attachment as produced by the mail parser
 * @param {boolean} keepBuf  when true and the content is a Buffer, also expose it as `buffer`
 * @returns {object} metadata with filename, contentType, contentDisposition,
 *   contentId, size, sha256, isInline, isEncrypted, plus optional
 *   `relatesTo` and `buffer`
 */
function attachmentMeta(a, keepBuf) {
  const stringOr = (value, fallback) => (typeof value === "string" ? value : fallback);

  let content = null;
  if (Buffer.isBuffer(a.content)) {
    content = a.content;
  }

  // Prefer the parser-reported size; fall back to the decoded byte length.
  const size = a.size || (content ? content.length : 0);

  let sha256 = "";
  if (content) {
    sha256 = crypto.createHash("sha256").update(content).digest("hex");
  }

  // Strip one leading "<" and one trailing ">" from the Content-ID.
  let contentId;
  if (typeof a.contentId === "string") {
    contentId = a.contentId.replace(/^<|>$/g, "");
  }

  const meta = {
    filename: stringOr(a.filename, ""),
    contentType: stringOr(a.contentType, "application/octet-stream"),
    contentDisposition: stringOr(a.contentDisposition, "attachment"),
    contentId,
    size,
    sha256,
    isInline: a.contentDisposition === "inline" || !!a.cid,
    isEncrypted: detectEncryptedPdf(a, content),
  };
  if (a.related) {
    meta.relatesTo = String(a.related);
  }
  if (keepBuf && content) {
    meta.buffer = content;
  }
  return meta;
}
|
|
161
|
+
|
|
162
|
+
/**
 * Cheap heuristic to flag encrypted PDFs (for Phase 5.5). Requires a
 * PDF-ish content type and a `%PDF-` marker within the first 8 bytes,
 * then looks for the `/Encrypt` key anywhere in the content.
 *
 * The whole buffer is searched rather than only its first 4 KB: the
 * `/Encrypt` entry belongs to the trailer, which sits at the end of the
 * file unless the PDF is linearized, so a head-only scan misses most
 * encrypted files. False positives are fine — the decryption pass will
 * just no-op.
 *
 * @param {object} a  attachment; only `contentType` is read
 * @param {Buffer|null} content  decoded attachment bytes
 * @returns {boolean} false when there is no content, the content type is
 *   not a string containing "pdf", or the `%PDF-` marker is missing
 */
function detectEncryptedPdf(a, content) {
  if (!content) return false;
  const contentType = a ? a.contentType : undefined;
  // A non-string content type used to throw on `.toLowerCase()`.
  if (typeof contentType !== "string" || !contentType.toLowerCase().includes("pdf")) return false;
  if (!content.subarray(0, 8).toString("ascii").includes("%PDF-")) return false;
  return content.includes("/Encrypt");
}
|
|
174
|
+
|
|
175
|
+
module.exports = {
|
|
176
|
+
parseRawEmail,
|
|
177
|
+
};
|