@chainlesschain/personal-data-hub 0.1.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (154) hide show
  1. package/__tests__/adapters/ai-chat-cookie-capture-spec.test.js +211 -0
  2. package/__tests__/adapters/ai-chat-health-checker.test.js +262 -0
  3. package/__tests__/adapters/ai-chat-history.test.js +396 -0
  4. package/__tests__/adapters/ai-chat-http-client.test.js +242 -0
  5. package/__tests__/adapters/ai-chat-vendors.test.js +874 -0
  6. package/__tests__/adapters/alipay-bill-adapter.test.js +538 -0
  7. package/__tests__/adapters/email-adapter.test.js +138 -1
  8. package/__tests__/adapters/email-classifier.test.js +347 -0
  9. package/__tests__/adapters/email-pdf-extractor.test.js +529 -0
  10. package/__tests__/adapters/email-retry-progress.test.js +294 -0
  11. package/__tests__/adapters/email-templates.test.js +699 -0
  12. package/__tests__/adapters/social-toutiao-kuaishou-scaffold.test.js +269 -0
  13. package/__tests__/adapters/system-data-adapter.test.js +440 -0
  14. package/__tests__/adapters/system-data-android-ingest.test.js +144 -0
  15. package/__tests__/adapters/system-data-android.test.js +387 -0
  16. package/__tests__/adapters/system-data-disclosure.test.js +153 -0
  17. package/__tests__/adapters/wechat-bootstrap.test.js +240 -0
  18. package/__tests__/adapters/wechat-env-probe.test.js +162 -0
  19. package/__tests__/adapters/wechat-frida-agent.test.js +191 -0
  20. package/__tests__/adapters/wechat-frida-integration.test.js +149 -0
  21. package/__tests__/adapters/wechat-frida-key-provider.test.js +188 -0
  22. package/__tests__/adapters/wechat-md5-key-provider.test.js +101 -0
  23. package/__tests__/analysis-skills.test.js +556 -0
  24. package/__tests__/analysis.test.js +329 -1
  25. package/__tests__/e2e/ai-chat-cross-source-journey.test.js +213 -0
  26. package/__tests__/e2e/full-user-journey.test.js +188 -0
  27. package/__tests__/entity-resolver-ingest-hook.test.js +177 -0
  28. package/__tests__/entity-resolver-stages.test.js +411 -0
  29. package/__tests__/entity-resolver-vault.test.js +246 -0
  30. package/__tests__/entity-resolver.test.js +526 -0
  31. package/__tests__/fixtures/entity-resolver-200-mock.json +96 -0
  32. package/__tests__/integration/ai-chat-history-registry.test.js +228 -0
  33. package/__tests__/integration/aichat-wizard-end-to-end.test.js +282 -0
  34. package/__tests__/integration/cross-adapter-pipelines.test.js +396 -0
  35. package/__tests__/integration/wechat-bootstrap-end-to-end.test.js +390 -0
  36. package/__tests__/longtail-adapters.test.js +217 -0
  37. package/__tests__/mobile-extractor.test.js +288 -0
  38. package/__tests__/registry.test.js +4 -2
  39. package/__tests__/shopping-adapters.test.js +296 -0
  40. package/__tests__/sidecar-contacts-cross-validate.test.js +163 -0
  41. package/__tests__/sidecar-supervisor.test.js +120 -0
  42. package/__tests__/social-adapters.test.js +206 -0
  43. package/__tests__/travel-adapters.test.js +325 -0
  44. package/__tests__/vault.test.js +3 -3
  45. package/__tests__/wechat-adapter.test.js +476 -0
  46. package/__tests__/whatsapp-adapter.test.js +135 -0
  47. package/lib/adapter-spec.js +12 -0
  48. package/lib/adapters/_python-sidecar-base.js +207 -0
  49. package/lib/adapters/ai-chat-history/ai-chat-adapter.js +374 -0
  50. package/lib/adapters/ai-chat-history/cookie-auth.js +109 -0
  51. package/lib/adapters/ai-chat-history/cookie-capture-spec.js +331 -0
  52. package/lib/adapters/ai-chat-history/health-checker.js +210 -0
  53. package/lib/adapters/ai-chat-history/http-client.js +211 -0
  54. package/lib/adapters/ai-chat-history/index.js +28 -0
  55. package/lib/adapters/ai-chat-history/schema-map.js +258 -0
  56. package/lib/adapters/ai-chat-history/vendor-spec.js +86 -0
  57. package/lib/adapters/ai-chat-history/vendors/coze.js +179 -0
  58. package/lib/adapters/ai-chat-history/vendors/deepseek.js +199 -0
  59. package/lib/adapters/ai-chat-history/vendors/doubao.js +255 -0
  60. package/lib/adapters/ai-chat-history/vendors/dreamina.js +174 -0
  61. package/lib/adapters/ai-chat-history/vendors/hunyuan.js +176 -0
  62. package/lib/adapters/ai-chat-history/vendors/kimi.js +182 -0
  63. package/lib/adapters/ai-chat-history/vendors/qianfan.js +160 -0
  64. package/lib/adapters/ai-chat-history/vendors/tongyi.js +193 -0
  65. package/lib/adapters/ai-chat-history/vendors/zhipu.js +202 -0
  66. package/lib/adapters/ai-chat-history/wizard-controller.js +473 -0
  67. package/lib/adapters/alipay-bill/alipay-bill-adapter.js +311 -0
  68. package/lib/adapters/alipay-bill/counterparty.js +129 -0
  69. package/lib/adapters/alipay-bill/csv-parser.js +217 -0
  70. package/lib/adapters/alipay-bill/index.js +41 -0
  71. package/lib/adapters/alipay-bill/zip-decryptor.js +111 -0
  72. package/lib/adapters/email-imap/classifier.js +495 -0
  73. package/lib/adapters/email-imap/email-adapter.js +419 -8
  74. package/lib/adapters/email-imap/index.js +42 -0
  75. package/lib/adapters/email-imap/pdf-extractor.js +192 -0
  76. package/lib/adapters/email-imap/templates/bill.js +232 -0
  77. package/lib/adapters/email-imap/templates/government.js +120 -0
  78. package/lib/adapters/email-imap/templates/index.js +78 -0
  79. package/lib/adapters/email-imap/templates/order.js +186 -0
  80. package/lib/adapters/email-imap/templates/other.js +114 -0
  81. package/lib/adapters/email-imap/templates/register.js +113 -0
  82. package/lib/adapters/email-imap/templates/travel.js +157 -0
  83. package/lib/adapters/email-imap/templates/utils.js +275 -0
  84. package/lib/adapters/email-imap/transactions.js +234 -0
  85. package/lib/adapters/messaging-qq/index.js +158 -0
  86. package/lib/adapters/messaging-telegram/index.js +142 -0
  87. package/lib/adapters/messaging-whatsapp/index.js +189 -0
  88. package/lib/adapters/shopping-base/index.js +208 -0
  89. package/lib/adapters/shopping-jd/index.js +150 -0
  90. package/lib/adapters/shopping-meituan/index.js +154 -0
  91. package/lib/adapters/shopping-taobao/index.js +176 -0
  92. package/lib/adapters/social-bilibili/index.js +171 -0
  93. package/lib/adapters/social-douyin/index.js +116 -0
  94. package/lib/adapters/social-kuaishou/index.js +237 -0
  95. package/lib/adapters/social-toutiao/index.js +236 -0
  96. package/lib/adapters/social-weibo/index.js +164 -0
  97. package/lib/adapters/social-xiaohongshu/index.js +96 -0
  98. package/lib/adapters/system-data/disclosure.js +166 -0
  99. package/lib/adapters/system-data/index.js +34 -0
  100. package/lib/adapters/system-data/system-data-adapter.js +344 -0
  101. package/lib/adapters/system-data-android/adapter.js +348 -0
  102. package/lib/adapters/system-data-android/index.js +76 -0
  103. package/lib/adapters/travel-12306/index.js +151 -0
  104. package/lib/adapters/travel-amap/index.js +164 -0
  105. package/lib/adapters/travel-baidu-map/index.js +162 -0
  106. package/lib/adapters/travel-base/index.js +240 -0
  107. package/lib/adapters/travel-ctrip/index.js +151 -0
  108. package/lib/adapters/wechat/bootstrap.js +146 -0
  109. package/lib/adapters/wechat/content-parser.js +326 -0
  110. package/lib/adapters/wechat/db-reader.js +209 -0
  111. package/lib/adapters/wechat/env-probe.js +218 -0
  112. package/lib/adapters/wechat/frida-agent/loader.js +67 -0
  113. package/lib/adapters/wechat/frida-agent/wechat-key-hook.js +126 -0
  114. package/lib/adapters/wechat/index.js +37 -0
  115. package/lib/adapters/wechat/key-extractor.js +158 -0
  116. package/lib/adapters/wechat/key-providers/frida-key-provider.js +244 -0
  117. package/lib/adapters/wechat/key-providers/index.js +22 -0
  118. package/lib/adapters/wechat/key-providers/key-provider-base.js +44 -0
  119. package/lib/adapters/wechat/key-providers/md5-key-provider.js +81 -0
  120. package/lib/adapters/wechat/normalize.js +220 -0
  121. package/lib/adapters/wechat/wechat-adapter.js +205 -0
  122. package/lib/analysis-skills/base.js +113 -0
  123. package/lib/analysis-skills/footprint.js +167 -0
  124. package/lib/analysis-skills/index.js +58 -0
  125. package/lib/analysis-skills/interests.js +161 -0
  126. package/lib/analysis-skills/relations.js +226 -0
  127. package/lib/analysis-skills/spending.js +219 -0
  128. package/lib/analysis-skills/timeline.js +167 -0
  129. package/lib/analysis.js +191 -2
  130. package/lib/entity-resolver/embedding-stage.js +198 -0
  131. package/lib/entity-resolver/entity-resolver.js +384 -0
  132. package/lib/entity-resolver/index.js +42 -0
  133. package/lib/entity-resolver/llm-stage.js +191 -0
  134. package/lib/entity-resolver/rule-stage.js +208 -0
  135. package/lib/entity-resolver/worker.js +149 -0
  136. package/lib/index.js +131 -0
  137. package/lib/migrations.js +73 -0
  138. package/lib/mobile-extractor/android.js +193 -0
  139. package/lib/mobile-extractor/index.js +9 -0
  140. package/lib/mobile-extractor/ios.js +223 -0
  141. package/lib/prompt-builder.js +11 -1
  142. package/lib/query-parser.js +7 -1
  143. package/lib/registry.js +42 -0
  144. package/lib/sidecar/index.js +15 -0
  145. package/lib/sidecar/supervisor.js +359 -0
  146. package/lib/vault.js +343 -0
  147. package/package.json +36 -3
  148. package/scripts/_make-fixture-all.js +126 -0
  149. package/scripts/_make-fixture-contacts.js +84 -0
  150. package/scripts/evaluate-entity-resolver.js +213 -0
  151. package/scripts/smoke-phase-5-5.js +196 -0
  152. package/scripts/smoke-phase-5-7.js +181 -0
  153. package/scripts/smoke-system-data-contacts.js +309 -0
  154. package/scripts/smoke-system-data.js +312 -0
@@ -28,9 +28,13 @@ const {
28
28
  ImapConnectionFailedError,
29
29
  } = require("./imap-session");
30
30
  const { parseRawEmail } = require("./email-parser");
31
+ const { classifyEmail, CATEGORIES } = require("./classifier");
32
+ const { extractFields } = require("./templates");
33
+ const { extractPdfText, passwordsFromHints } = require("./pdf-extractor");
34
+ const { extractTransactions } = require("./transactions");
31
35
 
32
36
  const NAME = "email-imap";
33
- const VERSION = "0.2.0"; // bumped for Phase 5.2 body parsing
37
+ const VERSION = "0.6.0"; // Phase 5.7 retry-with-backoff + progress streaming
34
38
 
35
39
  class EmailAdapter {
36
40
  constructor(opts) {
@@ -64,9 +68,78 @@ class EmailAdapter {
64
68
  ? opts.maxBodyChars
65
69
  : 8000;
66
70
 
71
+ // Phase 5.3: classifier configuration.
72
+ // - opts.llm: optional LLMClient for Layer 2 + Phase-5.4 `other`-template
73
+ // summarization; absent → Layer 1 + regex-only extractors.
74
+ // - opts.classifier: custom orchestrator (override for tests).
75
+ // - opts.minLayer1Confidence: short-circuit threshold (default 0.85).
76
+ // - opts.disableClassification: skip both layers entirely.
77
+ this._llm = opts.llm && typeof opts.llm.chat === "function" ? opts.llm : null;
78
+ this._classifier = typeof opts.classifier === "function" ? opts.classifier : classifyEmail;
79
+ this._minLayer1Confidence = Number.isFinite(opts.minLayer1Confidence)
80
+ ? opts.minLayer1Confidence
81
+ : 0.85;
82
+ this._disableClassification = !!opts.disableClassification;
83
+
84
+ // Phase 5.4: template field extractor (regex-first, LLM-optional).
85
+ // - opts.extractor: custom dispatcher (test seam).
86
+ // - opts.disableExtraction: skip the per-email field-extraction call
87
+ // (e.g. when the registry only needs envelope+classification).
88
+ this._extractor = typeof opts.extractor === "function" ? opts.extractor : extractFields;
89
+ this._disableExtraction = !!opts.disableExtraction;
90
+
91
+ // Phase 5.5: PDF attachment decryption + transactions extraction.
92
+ // See pdf-extractor.js + transactions.js. Test seam: opts.pdfExtractor.
93
+ this._pdfExtractor = typeof opts.pdfExtractor === "function" ? opts.pdfExtractor : extractPdfText;
94
+ this._transactionsExtractor = typeof opts.transactionsExtractor === "function"
95
+ ? opts.transactionsExtractor
96
+ : extractTransactions;
97
+ const hintsList = passwordsFromHints(opts.pdfPasswordHints || {});
98
+ const userList = Array.isArray(opts.pdfPasswords)
99
+ ? opts.pdfPasswords.filter((p) => typeof p === "string")
100
+ : [];
101
+ this._pdfPasswords = [...userList, ...hintsList].filter((v, i, arr) => arr.indexOf(v) === i);
102
+ this._disablePdfExtraction = !!opts.disablePdfExtraction;
103
+
104
+ // Phase 5.7: connection retry + progress streaming.
105
+ // - opts.maxConnectRetries (default 3): total attempts including first.
106
+ // Set to 1 to disable retry. Retries fire ONLY on transient errors
107
+ // (ECONNRESET / ETIMEDOUT / EPIPE / socket disconnects); AUTH_FAILED
108
+ // and MAILBOX_NOT_FOUND short-circuit.
109
+ // - opts.retryBaseDelayMs (default 200): exponential backoff base.
110
+ // Actual delays: 200ms → 600ms between 3 attempts (1800ms only before a 4th).
111
+ // - opts.onProgress (callback): receives {phase, adapter, ...payload}
112
+ // events throughout sync(). Registry forwards via onSyncEvent so the
113
+ // WS/IPC layer can stream to UI. Phases:
114
+ // "connecting" {attempt, maxAttempts}
115
+ // "connected" {attempt}
116
+ // "mailbox-opened" {mailbox, exists, uidValidity}
117
+ // "fetching" {mailbox, current, total, uid}
118
+ // "decrypting-pdf" {filename} (listed, but sync() does not appear to emit it yet)
119
+ // "done" {emitted, durationMs}
120
+ // "error" {failingPhase, attempt, retriable, code, message}
121
+ this._maxConnectRetries = Number.isFinite(opts.maxConnectRetries) && opts.maxConnectRetries > 0
122
+ ? opts.maxConnectRetries
123
+ : 3;
124
+ this._retryBaseDelayMs = Number.isFinite(opts.retryBaseDelayMs) && opts.retryBaseDelayMs > 0
125
+ ? opts.retryBaseDelayMs
126
+ : 200;
127
+ this._onProgress = typeof opts.onProgress === "function" ? opts.onProgress : null;
128
+
67
129
  this.name = NAME;
68
130
  this.version = VERSION;
69
- this.capabilities = ["sync:imap", "auth:authcode", "parse:mime-body", "parse:attachment-metadata"];
131
+ this.capabilities = [
132
+ "sync:imap",
133
+ "auth:authcode",
134
+ "parse:mime-body",
135
+ "parse:attachment-metadata",
136
+ "classify:layer1-rules",
137
+ ...(this._llm ? ["classify:layer2-llm"] : []),
138
+ "extract:6-templates",
139
+ ...(this._disablePdfExtraction ? [] : ["decrypt:pdf-bills"]),
140
+ "sync:retry-backoff",
141
+ "sync:progress-stream",
142
+ ];
70
143
  this.rateLimits = { perMinute: 60 };
71
144
  this.dataDisclosure = {
72
145
  fields: [
@@ -74,6 +147,12 @@ class EmailAdapter {
74
147
  "email:flags + uid + internalDate",
75
148
  "email:body (text + html, capped to ~8k chars)",
76
149
  "email:attachment-metadata (filename, contentType, size, sha256; no file bytes saved in v0)",
150
+ "classification:layer-1-rule-or-layer-2-llm-category (bill_bank/order/travel/etc.)",
151
+ ...(this._disablePdfExtraction
152
+ ? []
153
+ : [
154
+ "bill-transactions:date+description+amount+balance (extracted from decrypted PDF attachments; PDF bytes themselves never persist)",
155
+ ]),
77
156
  ],
78
157
  sensitivity: "high",
79
158
  legalGate: false,
@@ -114,17 +193,46 @@ class EmailAdapter {
114
193
  const watermark = typeof opts.sinceWatermark === "string" ? opts.sinceWatermark : "";
115
194
  const { uidValidity: prevUv, lastUid: prevLastUid } = parseWatermark(watermark);
116
195
 
196
+ // Phase 5.7: per-sync progress hook. opts.onProgress, when given, takes
197
+ // precedence over the constructor's onProgress, so callers (registry /
198
+ // tests) can pass a fresh callback per sync without mutating the adapter.
199
+ const syncOnProgress = typeof opts.onProgress === "function"
200
+ ? opts.onProgress
201
+ : this._onProgress;
202
+ const emitProgress = (phase, payload = {}) => {
203
+ if (!syncOnProgress) return;
204
+ try {
205
+ syncOnProgress({ phase, adapter: NAME, ...payload });
206
+ } catch (_e) {
207
+ // Listener errors must NOT abort the sync.
208
+ }
209
+ };
210
+
211
+ const syncStart = Date.now();
117
212
  const session = this._sessionFactory(this._sessionConfig());
213
+ let totalEmitted = 0;
118
214
  try {
119
- await session.connect();
215
+ // Phase 5.7: connect with retry on transient errors.
216
+ await this._connectWithRetry(session, emitProgress);
120
217
 
121
218
  for (const folder of folders) {
122
219
  const mb = await session.openMailbox(folder);
220
+ emitProgress("mailbox-opened", {
221
+ mailbox: folder,
222
+ exists: mb.exists,
223
+ uidValidity: mb.uidValidity,
224
+ });
123
225
  const uvChanged = prevUv !== null && String(prevUv) !== String(mb.uidValidity);
124
226
  const since = uvChanged ? 0 : prevLastUid;
125
227
 
126
228
  let emitted = 0;
127
229
  for await (const env of session.fetchFullSince(since)) {
230
+ emitProgress("fetching", {
231
+ mailbox: folder,
232
+ current: emitted + 1,
233
+ total: mb.exists,
234
+ uid: env.uid,
235
+ });
128
236
  // Parse the body in the adapter (not the session) so the
129
237
  // session stays a thin IMAP wrapper. Parse failures degrade
130
238
  // gracefully — emit the raw event without parsedBody so the
@@ -133,25 +241,132 @@ class EmailAdapter {
133
241
  let parsedBody = null;
134
242
  try {
135
243
  if (env.source && env.source.length > 0) {
136
- parsedBody = await this._parser(env.source);
244
+ // Phase 5.5: ask parser to keep attachment buffers when
245
+ // we may need to decrypt PDFs. Buffers are stripped from
246
+ // the emitted RawEvent (in _envelopeToRawEvent) so the
247
+ // vault doesn't archive megabytes of PDF bytes.
248
+ parsedBody = await this._parser(env.source, {
249
+ keepAttachmentBuffers: !this._disablePdfExtraction,
250
+ });
137
251
  }
138
252
  } catch (parseErr) {
139
- // Phase 5.3 classifier rules can still fire on envelope-only
253
+ // Layer 1 classifier rules can still fire on envelope-only
140
254
  // facts; we just lose body text + attachments for this email.
141
255
  parsedBody = {
142
256
  parseError: parseErr && parseErr.message ? parseErr.message : String(parseErr),
143
257
  };
144
258
  }
145
- yield this._envelopeToRawEvent(env, folder, parsedBody);
259
+
260
+ // Phase 5.3: classify. Layer 1 runs synchronously on
261
+ // (from, subject, headers, attachment hints). If under the
262
+ // confidence threshold AND we have an LLM, Layer 2 fires.
263
+ // Classifier errors degrade to OTHER (never abort sync).
264
+ let classification = null;
265
+ if (!this._disableClassification) {
266
+ try {
267
+ classification = await this._classifier(
268
+ this._classifierInput(env, parsedBody),
269
+ {
270
+ llm: this._llm,
271
+ minLayer1Confidence: this._minLayer1Confidence,
272
+ }
273
+ );
274
+ } catch (err) {
275
+ classification = {
276
+ category: CATEGORIES.OTHER,
277
+ confidence: 0,
278
+ layer: "error",
279
+ error: err && err.message ? err.message : String(err),
280
+ };
281
+ }
282
+ }
283
+
284
+ // Phase 5.4: per-category template extraction.
285
+ // Dispatcher routes via classification.category, so a missing
286
+ // classification (disableClassification=true) maps to "other".
287
+ let extraction = null;
288
+ if (!this._disableExtraction) {
289
+ try {
290
+ extraction = await this._extractor(
291
+ this._classifierInput(env, parsedBody),
292
+ classification || { category: CATEGORIES.OTHER },
293
+ { llm: this._llm }
294
+ );
295
+ } catch (err) {
296
+ extraction = {
297
+ template: "other",
298
+ fields: {},
299
+ confidence: 0,
300
+ warnings: [`extractor threw: ${err && err.message ? err.message : err}`],
301
+ };
302
+ }
303
+ }
304
+
305
+ // Phase 5.5: PDF attachment decryption + transactions extraction.
306
+ // Runs only when field extraction produced a "bill" / "travel" template AND
307
+ // the email has at least one PDF attachment; ones lacking a buffer are reported as errors.
308
+ // Errors captured per-attachment, never thrown.
309
+ if (
310
+ !this._disablePdfExtraction
311
+ && extraction
312
+ && (extraction.template === "bill" || extraction.template === "travel")
313
+ && parsedBody
314
+ && Array.isArray(parsedBody.attachments)
315
+ && parsedBody.attachments.some((a) => isPdfAttachment(a))
316
+ ) {
317
+ await this._runPdfExtraction(parsedBody, extraction);
318
+ }
319
+
320
+ yield this._envelopeToRawEvent(env, folder, parsedBody, classification, extraction);
146
321
  emitted += 1;
322
+ totalEmitted += 1;
147
323
  if (emitted >= maxPerFolder) break;
148
324
  }
149
325
  }
326
+ emitProgress("done", { emitted: totalEmitted, durationMs: Date.now() - syncStart });
150
327
  } finally {
151
328
  try { await session.close(); } catch (_e) {}
152
329
  }
153
330
  }
154
331
 
332
+ /**
333
+ * Phase 5.7: connect with retry on transient errors. Auth failures
334
+ * (AUTH_FAILED) and mailbox-not-found (MAILBOX_NOT_FOUND) bypass retry —
335
+ * those are user errors that won't fix themselves. Network blips
336
+ * (ECONNRESET / ETIMEDOUT / EPIPE / socket errors / generic
337
+ * CONNECTION_FAILED) get up to `_maxConnectRetries` attempts with
338
+ * exponential backoff.
339
+ */
340
+ async _connectWithRetry(session, emitProgress) {
341
+ const maxAttempts = this._maxConnectRetries;
342
+ let lastErr = null;
343
+ for (let attempt = 1; attempt <= maxAttempts; attempt += 1) {
344
+ emitProgress("connecting", { attempt, maxAttempts });
345
+ try {
346
+ await session.connect();
347
+ emitProgress("connected", { attempt });
348
+ return;
349
+ } catch (err) {
350
+ lastErr = err;
351
+ const transient = isTransientImapError(err);
352
+ emitProgress("error", {
353
+ failingPhase: "connecting",
354
+ attempt,
355
+ retriable: transient && attempt < maxAttempts,
356
+ code: err && err.code,
357
+ message: err && err.message ? err.message : String(err),
358
+ });
359
+ if (!transient || attempt >= maxAttempts) {
360
+ throw err;
361
+ }
362
+ // Exponential backoff: base * 3^(attempt-1)
363
+ const delay = this._retryBaseDelayMs * Math.pow(3, attempt - 1);
364
+ await sleep(delay);
365
+ }
366
+ }
367
+ throw lastErr;
368
+ }
369
+
155
370
  normalize(raw) {
156
371
  if (!raw || typeof raw !== "object" || !raw.payload) {
157
372
  throw new Error("EmailAdapter.normalize: missing raw or raw.payload");
@@ -251,9 +466,54 @@ class EmailAdapter {
251
466
  indicatorHeaders: pickIndicatorHeaders(parsedBody.headers),
252
467
  }
253
468
  : {}),
469
+ // Phase 5.3: per-email category + which layer / rule decided it.
470
+ // Phase 5.4 template extractors dispatched on `.classified`.
471
+ ...(env.classification
472
+ ? {
473
+ classified: env.classification.category,
474
+ classification: {
475
+ category: env.classification.category,
476
+ confidence: env.classification.confidence,
477
+ layer: env.classification.layer,
478
+ ...(env.classification.ruleName ? { ruleName: env.classification.ruleName } : {}),
479
+ ...(env.classification.reason ? { reason: env.classification.reason } : {}),
480
+ },
481
+ }
482
+ : {}),
483
+ // Phase 5.4: structured fields from the per-category template.
484
+ // Stored at top of extra so analysis prompts + KG ingestors can
485
+ // see them without spelunking through `extraction.fields`.
486
+ ...(env.extraction && env.extraction.fields
487
+ ? {
488
+ fields: env.extraction.fields,
489
+ extractionTemplate: env.extraction.template,
490
+ extractionConfidence: env.extraction.confidence,
491
+ ...(env.extraction.warnings && env.extraction.warnings.length > 0
492
+ ? { extractionWarnings: env.extraction.warnings }
493
+ : {}),
494
+ // Phase 5.5: per-attachment decrypt+extract summary so the
495
+ // UI can flag "could not unlock this bill" + transactions
496
+ // count. Actual transactions list lives at fields.transactions.
497
+ ...(env.extraction.pdfExtraction
498
+ ? { pdfExtraction: env.extraction.pdfExtraction }
499
+ : {}),
500
+ }
501
+ : {}),
254
502
  },
255
503
  };
256
504
 
505
+ // Phase 5.4 compliance redaction: emails containing verification
506
+ // codes (OTP / 2FA) must NEVER persist their body in vault — even
507
+ // an "expired" OTP is sensitive evidence of session activity.
508
+ // Adapter_Email_IMAP.md §9.2 mandates "verificationCodePresent =
509
+ // store metadata only".
510
+ if (env.extraction
511
+ && env.extraction.template === "register"
512
+ && env.extraction.fields
513
+ && env.extraction.fields.verificationCodePresent) {
514
+ event.content.text = "(redacted: verification code email)";
515
+ }
516
+
257
517
  return { events: [event], persons, places: [], items: [], topics: [] };
258
518
  }
259
519
 
@@ -267,7 +527,7 @@ class EmailAdapter {
267
527
  };
268
528
  }
269
529
 
270
- _envelopeToRawEvent(env, folder, parsedBody) {
530
+ _envelopeToRawEvent(env, folder, parsedBody, classification, extraction) {
271
531
  const originalId = env.messageId && env.messageId.length > 0
272
532
  ? env.messageId
273
533
  : `mid-fallback:${this.account.email}:${folder}:${env.uid}`;
@@ -280,6 +540,10 @@ class EmailAdapter {
280
540
  // re-derive). The source is recoverable by re-syncing if absolutely
281
541
  // needed.
282
542
  const { source: _src, ...envNoSource } = env;
543
+ // Phase 5.5: also strip attachment buffers from parsedBody. Buffers
544
+ // are loaded for PDF decryption then discarded — vault keeps only
545
+ // metadata (filename / contentType / size / sha256).
546
+ const safeBody = parsedBody ? stripAttachmentBuffers(parsedBody) : null;
283
547
  return {
284
548
  adapter: NAME,
285
549
  originalId,
@@ -287,11 +551,90 @@ class EmailAdapter {
287
551
  payload: {
288
552
  ...envNoSource,
289
553
  folder,
290
- ...(parsedBody ? { parsedBody } : {}),
554
+ ...(safeBody ? { parsedBody: safeBody } : {}),
555
+ ...(classification ? { classification } : {}),
556
+ ...(extraction ? { extraction } : {}),
291
557
  },
292
558
  };
293
559
  }
294
560
 
561
+ /**
562
+ * Phase 5.5 helper: for each PDF attachment with a buffer, try to
563
+ * decrypt + extract text + parse transactions. Stores the combined list in
563
+ * `extraction.fields.transactions` (overwriting any existing value) and stamps
564
+ * `pdfExtraction` metadata on the extraction so UI can surface failures.
566
+ *
567
+ * Side effects: mutates `extraction.fields` + adds `extraction.pdfExtraction`.
568
+ * Errors captured, never thrown — preserves the "sync never aborts on
569
+ * a single bad email" invariant.
570
+ */
571
+ async _runPdfExtraction(parsedBody, extraction) {
572
+ const pdfAtts = parsedBody.attachments.filter((a) => isPdfAttachment(a));
573
+ const results = [];
574
+ const allTxns = [];
575
+
576
+ for (const a of pdfAtts) {
577
+ if (!Buffer.isBuffer(a.buffer)) {
578
+ results.push({
579
+ filename: a.filename,
580
+ decrypted: false,
581
+ attempted: 0,
582
+ error: "no buffer (parser keepAttachmentBuffers=false?)",
583
+ });
584
+ continue;
585
+ }
586
+ try {
587
+ const r = await this._pdfExtractor(a.buffer, { passwords: this._pdfPasswords });
588
+ const summary = {
589
+ filename: a.filename,
590
+ decrypted: r.decrypted,
591
+ attempted: r.attempted,
592
+ wasEncrypted: r.wasEncrypted,
593
+ pageCount: r.pageCount,
594
+ ...(r.password !== undefined ? { passwordUsed: "***" } : {}), // never persist the real password
595
+ ...(r.error ? { error: r.error } : {}),
596
+ };
597
+ if (r.decrypted && typeof r.text === "string" && r.text.length > 0) {
598
+ const txns = this._transactionsExtractor(r.text);
599
+ if (Array.isArray(txns) && txns.length > 0) {
600
+ for (const t of txns) {
601
+ t.attachmentSha256 = a.sha256;
602
+ allTxns.push(t);
603
+ }
604
+ summary.transactionsExtracted = txns.length;
605
+ } else {
606
+ summary.transactionsExtracted = 0;
607
+ }
608
+ }
609
+ results.push(summary);
610
+ } catch (err) {
611
+ results.push({
612
+ filename: a.filename,
613
+ decrypted: false,
614
+ attempted: 0,
615
+ error: err && err.message ? err.message : String(err),
616
+ });
617
+ }
618
+ }
619
+
620
+ if (allTxns.length > 0) {
621
+ extraction.fields = extraction.fields || {};
622
+ extraction.fields.transactions = allTxns;
623
+ }
624
+ extraction.pdfExtraction = results;
625
+ }
626
+
627
+ _classifierInput(env, parsedBody) {
628
+ return {
629
+ from: env.from,
630
+ subject: env.subject,
631
+ attachments: parsedBody && Array.isArray(parsedBody.attachments) ? parsedBody.attachments : [],
632
+ textBody: (parsedBody && parsedBody.textBody) || "",
633
+ htmlBody: (parsedBody && parsedBody.htmlBody) || "",
634
+ headers: parsedBody && parsedBody.headers ? parsedBody.headers : {},
635
+ };
636
+ }
637
+
295
638
  _source(originalId, internalDate) {
296
639
  return {
297
640
  adapter: NAME,
@@ -389,6 +732,74 @@ function pickIndicatorHeaders(headers) {
389
732
  return out;
390
733
  }
391
734
 
735
+ /**
736
+ * Phase 5.7: decide if an IMAP error is worth retrying. Transient
737
+ * network blips (ECONNRESET / ETIMEDOUT / EPIPE / connect-failed / socket-
738
+ * disconnect / "connection lost") get a retry; auth failures / mailbox
739
+ * misconfig do NOT.
740
+ */
741
+ function isTransientImapError(err) {
742
+ if (!err) return false;
743
+ if (err.code === "AUTH_FAILED" || err.code === "MAILBOX_NOT_FOUND") return false;
744
+ if (err.code === "CONNECTION_FAILED") return true;
745
+ // Node-level network error codes
746
+ const networkCodes = new Set([
747
+ "ECONNRESET",
748
+ "ETIMEDOUT",
749
+ "EPIPE",
750
+ "ECONNREFUSED",
751
+ "ENETUNREACH",
752
+ "EAI_AGAIN",
753
+ "ENOTFOUND", // DNS can be transient on flaky networks
754
+ ]);
755
+ if (err.code && networkCodes.has(err.code)) return true;
756
+ if (err.cause && err.cause.code && networkCodes.has(err.cause.code)) return true;
757
+ const msg = (err.message || "").toLowerCase();
758
+ if (
759
+ msg.includes("timed out")
760
+ || msg.includes("timeout")
761
+ || msg.includes("socket disconnect")
762
+ || msg.includes("connection lost")
763
+ || msg.includes("connection reset")
764
+ || msg.includes("write after end")
765
+ ) {
766
+ return true;
767
+ }
768
+ return false;
769
+ }
770
+
771
+ function sleep(ms) {
772
+ return new Promise((resolve) => setTimeout(resolve, ms));
773
+ }
774
+
775
+ /**
776
+ * Phase 5.5: detect PDF attachments. Goes by contentType first, falls
777
+ * back to filename extension for senders that omit the MIME type.
778
+ */
779
+ function isPdfAttachment(a) {
780
+ if (!a || typeof a !== "object") return false;
781
+ if (typeof a.contentType === "string" && a.contentType.toLowerCase().includes("pdf")) return true;
782
+ if (typeof a.filename === "string" && /\.pdf$/i.test(a.filename)) return true;
783
+ return false;
784
+ }
785
+
786
+ /**
787
+ * Phase 5.5: drop Buffer fields from each attachment before the parsed
788
+ * body lands in the emitted RawEvent. Vault row size + WS-gateway
789
+ * serialization cost would be dominated by attachment bytes otherwise.
790
+ */
791
+ function stripAttachmentBuffers(parsedBody) {
792
+ if (!parsedBody || !Array.isArray(parsedBody.attachments)) return parsedBody;
793
+ return {
794
+ ...parsedBody,
795
+ attachments: parsedBody.attachments.map((a) => {
796
+ if (!a || a.buffer == null) return a;
797
+ const { buffer: _b, ...rest } = a;
798
+ return rest;
799
+ }),
800
+ };
801
+ }
802
+
392
803
  module.exports = {
393
804
  EmailAdapter,
394
805
  parseWatermark,
@@ -9,6 +9,27 @@ const {
9
9
  ImapMailboxNotFoundError,
10
10
  } = require("./imap-session");
11
11
  const { parseRawEmail } = require("./email-parser");
12
+ const {
13
+ CATEGORIES,
14
+ ALL_CATEGORIES,
15
+ LAYER1_RULES,
16
+ classifyLayer1,
17
+ classifyLayer2,
18
+ classifyEmail,
19
+ LAYER2_SYSTEM_PROMPT,
20
+ } = require("./classifier");
21
+ const {
22
+ extractFields,
23
+ CATEGORY_TO_EXTRACTOR,
24
+ extractBill,
25
+ extractOrder,
26
+ extractTravel,
27
+ extractGovernment,
28
+ extractRegister,
29
+ extractOther,
30
+ } = require("./templates");
31
+ const { extractPdfText, passwordsFromHints } = require("./pdf-extractor");
32
+ const { extractTransactions } = require("./transactions");
12
33
 
13
34
  module.exports = {
14
35
  EmailAdapter,
@@ -23,4 +44,25 @@ module.exports = {
23
44
  ImapConnectionFailedError,
24
45
  ImapMailboxNotFoundError,
25
46
  parseRawEmail,
47
+ // Phase 5.3 classifier
48
+ EMAIL_CATEGORIES: CATEGORIES,
49
+ EMAIL_ALL_CATEGORIES: ALL_CATEGORIES,
50
+ EMAIL_LAYER1_RULES: LAYER1_RULES,
51
+ classifyEmailLayer1: classifyLayer1,
52
+ classifyEmailLayer2: classifyLayer2,
53
+ classifyEmail,
54
+ EMAIL_LAYER2_SYSTEM_PROMPT: LAYER2_SYSTEM_PROMPT,
55
+ // Phase 5.4 — 6 template extractors + dispatcher
56
+ extractEmailFields: extractFields,
57
+ EMAIL_CATEGORY_EXTRACTORS: CATEGORY_TO_EXTRACTOR,
58
+ extractBill,
59
+ extractOrder,
60
+ extractTravel,
61
+ extractGovernment,
62
+ extractRegister,
63
+ extractOther,
64
+ // Phase 5.5 — PDF decryption + transactions extraction
65
+ extractPdfText,
66
+ pdfPasswordsFromHints: passwordsFromHints,
67
+ extractTransactions,
26
68
  };