@chainlesschain/personal-data-hub 0.1.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/__tests__/adapters/ai-chat-cookie-capture-spec.test.js +211 -0
- package/__tests__/adapters/ai-chat-health-checker.test.js +262 -0
- package/__tests__/adapters/ai-chat-history.test.js +396 -0
- package/__tests__/adapters/ai-chat-http-client.test.js +242 -0
- package/__tests__/adapters/ai-chat-vendors.test.js +874 -0
- package/__tests__/adapters/alipay-bill-adapter.test.js +538 -0
- package/__tests__/adapters/email-adapter.test.js +138 -1
- package/__tests__/adapters/email-classifier.test.js +347 -0
- package/__tests__/adapters/email-pdf-extractor.test.js +529 -0
- package/__tests__/adapters/email-retry-progress.test.js +294 -0
- package/__tests__/adapters/email-templates.test.js +699 -0
- package/__tests__/adapters/social-toutiao-kuaishou-scaffold.test.js +269 -0
- package/__tests__/adapters/system-data-adapter.test.js +440 -0
- package/__tests__/adapters/system-data-android-ingest.test.js +144 -0
- package/__tests__/adapters/system-data-android.test.js +387 -0
- package/__tests__/adapters/system-data-disclosure.test.js +153 -0
- package/__tests__/adapters/wechat-bootstrap.test.js +240 -0
- package/__tests__/adapters/wechat-env-probe.test.js +162 -0
- package/__tests__/adapters/wechat-frida-agent.test.js +191 -0
- package/__tests__/adapters/wechat-frida-integration.test.js +149 -0
- package/__tests__/adapters/wechat-frida-key-provider.test.js +188 -0
- package/__tests__/adapters/wechat-md5-key-provider.test.js +101 -0
- package/__tests__/analysis-skills.test.js +556 -0
- package/__tests__/analysis.test.js +329 -1
- package/__tests__/e2e/ai-chat-cross-source-journey.test.js +213 -0
- package/__tests__/e2e/full-user-journey.test.js +188 -0
- package/__tests__/entity-resolver-ingest-hook.test.js +177 -0
- package/__tests__/entity-resolver-stages.test.js +411 -0
- package/__tests__/entity-resolver-vault.test.js +246 -0
- package/__tests__/entity-resolver.test.js +526 -0
- package/__tests__/fixtures/entity-resolver-200-mock.json +96 -0
- package/__tests__/integration/ai-chat-history-registry.test.js +228 -0
- package/__tests__/integration/aichat-wizard-end-to-end.test.js +282 -0
- package/__tests__/integration/cross-adapter-pipelines.test.js +396 -0
- package/__tests__/integration/wechat-bootstrap-end-to-end.test.js +390 -0
- package/__tests__/longtail-adapters.test.js +217 -0
- package/__tests__/mobile-extractor.test.js +288 -0
- package/__tests__/registry.test.js +4 -2
- package/__tests__/shopping-adapters.test.js +296 -0
- package/__tests__/sidecar-contacts-cross-validate.test.js +163 -0
- package/__tests__/sidecar-supervisor.test.js +120 -0
- package/__tests__/social-adapters.test.js +206 -0
- package/__tests__/travel-adapters.test.js +325 -0
- package/__tests__/vault.test.js +3 -3
- package/__tests__/wechat-adapter.test.js +476 -0
- package/__tests__/whatsapp-adapter.test.js +135 -0
- package/lib/adapter-spec.js +12 -0
- package/lib/adapters/_python-sidecar-base.js +207 -0
- package/lib/adapters/ai-chat-history/ai-chat-adapter.js +374 -0
- package/lib/adapters/ai-chat-history/cookie-auth.js +109 -0
- package/lib/adapters/ai-chat-history/cookie-capture-spec.js +331 -0
- package/lib/adapters/ai-chat-history/health-checker.js +210 -0
- package/lib/adapters/ai-chat-history/http-client.js +211 -0
- package/lib/adapters/ai-chat-history/index.js +28 -0
- package/lib/adapters/ai-chat-history/schema-map.js +258 -0
- package/lib/adapters/ai-chat-history/vendor-spec.js +86 -0
- package/lib/adapters/ai-chat-history/vendors/coze.js +179 -0
- package/lib/adapters/ai-chat-history/vendors/deepseek.js +199 -0
- package/lib/adapters/ai-chat-history/vendors/doubao.js +255 -0
- package/lib/adapters/ai-chat-history/vendors/dreamina.js +174 -0
- package/lib/adapters/ai-chat-history/vendors/hunyuan.js +176 -0
- package/lib/adapters/ai-chat-history/vendors/kimi.js +182 -0
- package/lib/adapters/ai-chat-history/vendors/qianfan.js +160 -0
- package/lib/adapters/ai-chat-history/vendors/tongyi.js +193 -0
- package/lib/adapters/ai-chat-history/vendors/zhipu.js +202 -0
- package/lib/adapters/ai-chat-history/wizard-controller.js +473 -0
- package/lib/adapters/alipay-bill/alipay-bill-adapter.js +311 -0
- package/lib/adapters/alipay-bill/counterparty.js +129 -0
- package/lib/adapters/alipay-bill/csv-parser.js +217 -0
- package/lib/adapters/alipay-bill/index.js +41 -0
- package/lib/adapters/alipay-bill/zip-decryptor.js +111 -0
- package/lib/adapters/email-imap/classifier.js +495 -0
- package/lib/adapters/email-imap/email-adapter.js +419 -8
- package/lib/adapters/email-imap/index.js +42 -0
- package/lib/adapters/email-imap/pdf-extractor.js +192 -0
- package/lib/adapters/email-imap/templates/bill.js +232 -0
- package/lib/adapters/email-imap/templates/government.js +120 -0
- package/lib/adapters/email-imap/templates/index.js +78 -0
- package/lib/adapters/email-imap/templates/order.js +186 -0
- package/lib/adapters/email-imap/templates/other.js +114 -0
- package/lib/adapters/email-imap/templates/register.js +113 -0
- package/lib/adapters/email-imap/templates/travel.js +157 -0
- package/lib/adapters/email-imap/templates/utils.js +275 -0
- package/lib/adapters/email-imap/transactions.js +234 -0
- package/lib/adapters/messaging-qq/index.js +158 -0
- package/lib/adapters/messaging-telegram/index.js +142 -0
- package/lib/adapters/messaging-whatsapp/index.js +189 -0
- package/lib/adapters/shopping-base/index.js +208 -0
- package/lib/adapters/shopping-jd/index.js +150 -0
- package/lib/adapters/shopping-meituan/index.js +154 -0
- package/lib/adapters/shopping-taobao/index.js +176 -0
- package/lib/adapters/social-bilibili/index.js +171 -0
- package/lib/adapters/social-douyin/index.js +116 -0
- package/lib/adapters/social-kuaishou/index.js +237 -0
- package/lib/adapters/social-toutiao/index.js +236 -0
- package/lib/adapters/social-weibo/index.js +164 -0
- package/lib/adapters/social-xiaohongshu/index.js +96 -0
- package/lib/adapters/system-data/disclosure.js +166 -0
- package/lib/adapters/system-data/index.js +34 -0
- package/lib/adapters/system-data/system-data-adapter.js +344 -0
- package/lib/adapters/system-data-android/adapter.js +348 -0
- package/lib/adapters/system-data-android/index.js +76 -0
- package/lib/adapters/travel-12306/index.js +151 -0
- package/lib/adapters/travel-amap/index.js +164 -0
- package/lib/adapters/travel-baidu-map/index.js +162 -0
- package/lib/adapters/travel-base/index.js +240 -0
- package/lib/adapters/travel-ctrip/index.js +151 -0
- package/lib/adapters/wechat/bootstrap.js +146 -0
- package/lib/adapters/wechat/content-parser.js +326 -0
- package/lib/adapters/wechat/db-reader.js +209 -0
- package/lib/adapters/wechat/env-probe.js +218 -0
- package/lib/adapters/wechat/frida-agent/loader.js +67 -0
- package/lib/adapters/wechat/frida-agent/wechat-key-hook.js +126 -0
- package/lib/adapters/wechat/index.js +37 -0
- package/lib/adapters/wechat/key-extractor.js +158 -0
- package/lib/adapters/wechat/key-providers/frida-key-provider.js +244 -0
- package/lib/adapters/wechat/key-providers/index.js +22 -0
- package/lib/adapters/wechat/key-providers/key-provider-base.js +44 -0
- package/lib/adapters/wechat/key-providers/md5-key-provider.js +81 -0
- package/lib/adapters/wechat/normalize.js +220 -0
- package/lib/adapters/wechat/wechat-adapter.js +205 -0
- package/lib/analysis-skills/base.js +113 -0
- package/lib/analysis-skills/footprint.js +167 -0
- package/lib/analysis-skills/index.js +58 -0
- package/lib/analysis-skills/interests.js +161 -0
- package/lib/analysis-skills/relations.js +226 -0
- package/lib/analysis-skills/spending.js +219 -0
- package/lib/analysis-skills/timeline.js +167 -0
- package/lib/analysis.js +191 -2
- package/lib/entity-resolver/embedding-stage.js +198 -0
- package/lib/entity-resolver/entity-resolver.js +384 -0
- package/lib/entity-resolver/index.js +42 -0
- package/lib/entity-resolver/llm-stage.js +191 -0
- package/lib/entity-resolver/rule-stage.js +208 -0
- package/lib/entity-resolver/worker.js +149 -0
- package/lib/index.js +131 -0
- package/lib/migrations.js +73 -0
- package/lib/mobile-extractor/android.js +193 -0
- package/lib/mobile-extractor/index.js +9 -0
- package/lib/mobile-extractor/ios.js +223 -0
- package/lib/prompt-builder.js +11 -1
- package/lib/query-parser.js +7 -1
- package/lib/registry.js +42 -0
- package/lib/sidecar/index.js +15 -0
- package/lib/sidecar/supervisor.js +359 -0
- package/lib/vault.js +343 -0
- package/package.json +36 -3
- package/scripts/_make-fixture-all.js +126 -0
- package/scripts/_make-fixture-contacts.js +84 -0
- package/scripts/evaluate-entity-resolver.js +213 -0
- package/scripts/smoke-phase-5-5.js +196 -0
- package/scripts/smoke-phase-5-7.js +181 -0
- package/scripts/smoke-system-data-contacts.js +309 -0
- package/scripts/smoke-system-data.js +312 -0
|
@@ -28,9 +28,13 @@ const {
|
|
|
28
28
|
ImapConnectionFailedError,
|
|
29
29
|
} = require("./imap-session");
|
|
30
30
|
const { parseRawEmail } = require("./email-parser");
|
|
31
|
+
const { classifyEmail, CATEGORIES } = require("./classifier");
|
|
32
|
+
const { extractFields } = require("./templates");
|
|
33
|
+
const { extractPdfText, passwordsFromHints } = require("./pdf-extractor");
|
|
34
|
+
const { extractTransactions } = require("./transactions");
|
|
31
35
|
|
|
32
36
|
const NAME = "email-imap";
|
|
33
|
-
const VERSION = "0.
|
|
37
|
+
const VERSION = "0.6.0"; // Phase 5.7 — retry-with-backoff + progress streaming
|
|
34
38
|
|
|
35
39
|
class EmailAdapter {
|
|
36
40
|
constructor(opts) {
|
|
@@ -64,9 +68,78 @@ class EmailAdapter {
|
|
|
64
68
|
? opts.maxBodyChars
|
|
65
69
|
: 8000;
|
|
66
70
|
|
|
71
|
+
// Phase 5.3: classifier configuration.
|
|
72
|
+
// - opts.llm: optional LLMClient for Layer 2 + Phase-5.4 `other`-template
|
|
73
|
+
// summarization; absent → Layer 1 + regex-only extractors.
|
|
74
|
+
// - opts.classifier: custom orchestrator (override for tests).
|
|
75
|
+
// - opts.minLayer1Confidence: short-circuit threshold (default 0.85).
|
|
76
|
+
// - opts.disableClassification: skip both layers entirely.
|
|
77
|
+
this._llm = opts.llm && typeof opts.llm.chat === "function" ? opts.llm : null;
|
|
78
|
+
this._classifier = typeof opts.classifier === "function" ? opts.classifier : classifyEmail;
|
|
79
|
+
this._minLayer1Confidence = Number.isFinite(opts.minLayer1Confidence)
|
|
80
|
+
? opts.minLayer1Confidence
|
|
81
|
+
: 0.85;
|
|
82
|
+
this._disableClassification = !!opts.disableClassification;
|
|
83
|
+
|
|
84
|
+
// Phase 5.4: template field extractor (regex-first, LLM-optional).
|
|
85
|
+
// - opts.extractor: custom dispatcher (test seam).
|
|
86
|
+
// - opts.disableExtraction: skip the per-email field-extraction call
|
|
87
|
+
// (e.g. when the registry only needs envelope+classification).
|
|
88
|
+
this._extractor = typeof opts.extractor === "function" ? opts.extractor : extractFields;
|
|
89
|
+
this._disableExtraction = !!opts.disableExtraction;
|
|
90
|
+
|
|
91
|
+
// Phase 5.5: PDF attachment decryption + transactions extraction.
|
|
92
|
+
// See pdf-extractor.js + transactions.js. Test seam: opts.pdfExtractor.
|
|
93
|
+
this._pdfExtractor = typeof opts.pdfExtractor === "function" ? opts.pdfExtractor : extractPdfText;
|
|
94
|
+
this._transactionsExtractor = typeof opts.transactionsExtractor === "function"
|
|
95
|
+
? opts.transactionsExtractor
|
|
96
|
+
: extractTransactions;
|
|
97
|
+
const hintsList = passwordsFromHints(opts.pdfPasswordHints || {});
|
|
98
|
+
const userList = Array.isArray(opts.pdfPasswords)
|
|
99
|
+
? opts.pdfPasswords.filter((p) => typeof p === "string")
|
|
100
|
+
: [];
|
|
101
|
+
this._pdfPasswords = [...userList, ...hintsList].filter((v, i, arr) => arr.indexOf(v) === i);
|
|
102
|
+
this._disablePdfExtraction = !!opts.disablePdfExtraction;
|
|
103
|
+
|
|
104
|
+
// Phase 5.7: connection retry + progress streaming.
|
|
105
|
+
// - opts.maxConnectRetries (default 3): total attempts including first.
|
|
106
|
+
// Set to 1 to disable retry. Retries fire ONLY on transient errors
|
|
107
|
+
// (ECONNRESET / ETIMEDOUT / EPIPE / socket disconnects); AUTH_FAILED
|
|
108
|
+
// and MAILBOX_NOT_FOUND short-circuit.
|
|
109
|
+
// - opts.retryBaseDelayMs (default 200): exponential backoff base.
|
|
110
|
+
// Delay before retry k is base * 3^(k-1): 200ms → 600ms for the default 3 attempts (1800ms only if a 4th attempt is allowed).
|
|
111
|
+
// - opts.onProgress (callback): receives {phase, adapter, ...payload} events
|
|
112
|
+
// throughout sync(). Registry forwards via onSyncEvent so the WS/IPC
|
|
113
|
+
// layer can stream to UI. Phases:
|
|
114
|
+
// "connecting" {attempt, maxAttempts}
|
|
115
|
+
// "connected" {attempt}
|
|
116
|
+
// "mailbox-opened" {mailbox, exists, uidValidity}
|
|
117
|
+
// "fetching" {mailbox, current, total, uid} (total = mb.exists, not the number of messages left to fetch)
|
|
118
|
+
// "decrypting-pdf" {filename} (NOTE(review): no emit of this phase is visible in sync() or _runPdfExtraction — confirm)
|
|
119
|
+
// "done" {emitted, durationMs}
|
|
120
|
+
// "error" {failingPhase, attempt, retriable, code, message}
|
|
121
|
+
this._maxConnectRetries = Number.isFinite(opts.maxConnectRetries) && opts.maxConnectRetries > 0
|
|
122
|
+
? opts.maxConnectRetries
|
|
123
|
+
: 3;
|
|
124
|
+
this._retryBaseDelayMs = Number.isFinite(opts.retryBaseDelayMs) && opts.retryBaseDelayMs > 0
|
|
125
|
+
? opts.retryBaseDelayMs
|
|
126
|
+
: 200;
|
|
127
|
+
this._onProgress = typeof opts.onProgress === "function" ? opts.onProgress : null;
|
|
128
|
+
|
|
67
129
|
this.name = NAME;
|
|
68
130
|
this.version = VERSION;
|
|
69
|
-
this.capabilities = [
|
|
131
|
+
this.capabilities = [
|
|
132
|
+
"sync:imap",
|
|
133
|
+
"auth:authcode",
|
|
134
|
+
"parse:mime-body",
|
|
135
|
+
"parse:attachment-metadata",
|
|
136
|
+
"classify:layer1-rules",
|
|
137
|
+
...(this._llm ? ["classify:layer2-llm"] : []),
|
|
138
|
+
"extract:6-templates",
|
|
139
|
+
...(this._disablePdfExtraction ? [] : ["decrypt:pdf-bills"]),
|
|
140
|
+
"sync:retry-backoff",
|
|
141
|
+
"sync:progress-stream",
|
|
142
|
+
];
|
|
70
143
|
this.rateLimits = { perMinute: 60 };
|
|
71
144
|
this.dataDisclosure = {
|
|
72
145
|
fields: [
|
|
@@ -74,6 +147,12 @@ class EmailAdapter {
|
|
|
74
147
|
"email:flags + uid + internalDate",
|
|
75
148
|
"email:body (text + html, capped to ~8k chars)",
|
|
76
149
|
"email:attachment-metadata (filename, contentType, size, sha256; no file bytes saved in v0)",
|
|
150
|
+
"classification:layer-1-rule-or-layer-2-llm-category (bill_bank/order/travel/etc.)",
|
|
151
|
+
...(this._disablePdfExtraction
|
|
152
|
+
? []
|
|
153
|
+
: [
|
|
154
|
+
"bill-transactions:date+description+amount+balance (extracted from decrypted PDF attachments; PDF bytes themselves never persist)",
|
|
155
|
+
]),
|
|
77
156
|
],
|
|
78
157
|
sensitivity: "high",
|
|
79
158
|
legalGate: false,
|
|
@@ -114,17 +193,46 @@ class EmailAdapter {
|
|
|
114
193
|
const watermark = typeof opts.sinceWatermark === "string" ? opts.sinceWatermark : "";
|
|
115
194
|
const { uidValidity: prevUv, lastUid: prevLastUid } = parseWatermark(watermark);
|
|
116
195
|
|
|
196
|
+
// Phase 5.7: per-sync progress hook — opts.onProgress overrides the constructor's, which is the fallback.
|
|
197
|
+
// Callers (registry / tests) can pass a fresh callback per sync without
|
|
198
|
+
// mutating the adapter instance.
|
|
199
|
+
const syncOnProgress = typeof opts.onProgress === "function"
|
|
200
|
+
? opts.onProgress
|
|
201
|
+
: this._onProgress;
|
|
202
|
+
const emitProgress = (phase, payload = {}) => {
|
|
203
|
+
if (!syncOnProgress) return;
|
|
204
|
+
try {
|
|
205
|
+
syncOnProgress({ phase, adapter: NAME, ...payload });
|
|
206
|
+
} catch (_e) {
|
|
207
|
+
// Listener errors must NOT abort the sync.
|
|
208
|
+
}
|
|
209
|
+
};
|
|
210
|
+
|
|
211
|
+
const syncStart = Date.now();
|
|
117
212
|
const session = this._sessionFactory(this._sessionConfig());
|
|
213
|
+
let totalEmitted = 0;
|
|
118
214
|
try {
|
|
119
|
-
|
|
215
|
+
// Phase 5.7: connect with retry on transient errors.
|
|
216
|
+
await this._connectWithRetry(session, emitProgress);
|
|
120
217
|
|
|
121
218
|
for (const folder of folders) {
|
|
122
219
|
const mb = await session.openMailbox(folder);
|
|
220
|
+
emitProgress("mailbox-opened", {
|
|
221
|
+
mailbox: folder,
|
|
222
|
+
exists: mb.exists,
|
|
223
|
+
uidValidity: mb.uidValidity,
|
|
224
|
+
});
|
|
123
225
|
const uvChanged = prevUv !== null && String(prevUv) !== String(mb.uidValidity);
|
|
124
226
|
const since = uvChanged ? 0 : prevLastUid;
|
|
125
227
|
|
|
126
228
|
let emitted = 0;
|
|
127
229
|
for await (const env of session.fetchFullSince(since)) {
|
|
230
|
+
emitProgress("fetching", {
|
|
231
|
+
mailbox: folder,
|
|
232
|
+
current: emitted + 1,
|
|
233
|
+
total: mb.exists,
|
|
234
|
+
uid: env.uid,
|
|
235
|
+
});
|
|
128
236
|
// Parse the body in the adapter (not the session) so the
|
|
129
237
|
// session stays a thin IMAP wrapper. Parse failures degrade
|
|
130
238
|
// gracefully — emit the raw event without parsedBody so the
|
|
@@ -133,25 +241,132 @@ class EmailAdapter {
|
|
|
133
241
|
let parsedBody = null;
|
|
134
242
|
try {
|
|
135
243
|
if (env.source && env.source.length > 0) {
|
|
136
|
-
|
|
244
|
+
// Phase 5.5: ask parser to keep attachment buffers when
|
|
245
|
+
// we may need to decrypt PDFs. Buffers are stripped from
|
|
246
|
+
// the emitted RawEvent (in _envelopeToRawEvent) so the
|
|
247
|
+
// vault doesn't archive megabytes of PDF bytes.
|
|
248
|
+
parsedBody = await this._parser(env.source, {
|
|
249
|
+
keepAttachmentBuffers: !this._disablePdfExtraction,
|
|
250
|
+
});
|
|
137
251
|
}
|
|
138
252
|
} catch (parseErr) {
|
|
139
|
-
//
|
|
253
|
+
// Layer 1 classifier rules can still fire on envelope-only
|
|
140
254
|
// facts; we just lose body text + attachments for this email.
|
|
141
255
|
parsedBody = {
|
|
142
256
|
parseError: parseErr && parseErr.message ? parseErr.message : String(parseErr),
|
|
143
257
|
};
|
|
144
258
|
}
|
|
145
|
-
|
|
259
|
+
|
|
260
|
+
// Phase 5.3: classify. Layer 1 runs synchronously on
|
|
261
|
+
// (from, subject, headers, attachment hints). If under the
|
|
262
|
+
// confidence threshold AND we have an LLM, Layer 2 fires.
|
|
263
|
+
// Classifier errors degrade to OTHER (never abort sync).
|
|
264
|
+
let classification = null;
|
|
265
|
+
if (!this._disableClassification) {
|
|
266
|
+
try {
|
|
267
|
+
classification = await this._classifier(
|
|
268
|
+
this._classifierInput(env, parsedBody),
|
|
269
|
+
{
|
|
270
|
+
llm: this._llm,
|
|
271
|
+
minLayer1Confidence: this._minLayer1Confidence,
|
|
272
|
+
}
|
|
273
|
+
);
|
|
274
|
+
} catch (err) {
|
|
275
|
+
classification = {
|
|
276
|
+
category: CATEGORIES.OTHER,
|
|
277
|
+
confidence: 0,
|
|
278
|
+
layer: "error",
|
|
279
|
+
error: err && err.message ? err.message : String(err),
|
|
280
|
+
};
|
|
281
|
+
}
|
|
282
|
+
}
|
|
283
|
+
|
|
284
|
+
// Phase 5.4: per-category template extraction.
|
|
285
|
+
// Dispatcher routes via classification.category, so a missing
|
|
286
|
+
// classification (disableClassification=true) maps to "other".
|
|
287
|
+
let extraction = null;
|
|
288
|
+
if (!this._disableExtraction) {
|
|
289
|
+
try {
|
|
290
|
+
extraction = await this._extractor(
|
|
291
|
+
this._classifierInput(env, parsedBody),
|
|
292
|
+
classification || { category: CATEGORIES.OTHER },
|
|
293
|
+
{ llm: this._llm }
|
|
294
|
+
);
|
|
295
|
+
} catch (err) {
|
|
296
|
+
extraction = {
|
|
297
|
+
template: "other",
|
|
298
|
+
fields: {},
|
|
299
|
+
confidence: 0,
|
|
300
|
+
warnings: [`extractor threw: ${err && err.message ? err.message : err}`],
|
|
301
|
+
};
|
|
302
|
+
}
|
|
303
|
+
}
|
|
304
|
+
|
|
305
|
+
// Phase 5.5: PDF attachment decryption + transactions extraction.
|
|
306
|
+
// Runs only when the extraction template is bill / travel (so never when extraction is disabled) AND
|
|
307
|
+
// has at least one PDF attachment whose buffer is available.
|
|
308
|
+
// Errors captured per-attachment, never thrown.
|
|
309
|
+
if (
|
|
310
|
+
!this._disablePdfExtraction
|
|
311
|
+
&& extraction
|
|
312
|
+
&& (extraction.template === "bill" || extraction.template === "travel")
|
|
313
|
+
&& parsedBody
|
|
314
|
+
&& Array.isArray(parsedBody.attachments)
|
|
315
|
+
&& parsedBody.attachments.some((a) => isPdfAttachment(a))
|
|
316
|
+
) {
|
|
317
|
+
await this._runPdfExtraction(parsedBody, extraction);
|
|
318
|
+
}
|
|
319
|
+
|
|
320
|
+
yield this._envelopeToRawEvent(env, folder, parsedBody, classification, extraction);
|
|
146
321
|
emitted += 1;
|
|
322
|
+
totalEmitted += 1;
|
|
147
323
|
if (emitted >= maxPerFolder) break;
|
|
148
324
|
}
|
|
149
325
|
}
|
|
326
|
+
emitProgress("done", { emitted: totalEmitted, durationMs: Date.now() - syncStart });
|
|
150
327
|
} finally {
|
|
151
328
|
try { await session.close(); } catch (_e) {}
|
|
152
329
|
}
|
|
153
330
|
}
|
|
154
331
|
|
|
332
|
+
/**
|
|
333
|
+
* Phase 5.7: connect with retry on transient errors. Auth failures
|
|
334
|
+
* (AUTH_FAILED) and mailbox-not-found (MAILBOX_NOT_FOUND) bypass retry —
|
|
335
|
+
* those are user errors that won't fix themselves. Network blips
|
|
336
|
+
* (ECONNRESET / ETIMEDOUT / EPIPE / socket errors / generic
|
|
337
|
+
* CONNECTION_FAILED) get up to `_maxConnectRetries` attempts with
|
|
338
|
+
* exponential backoff.
|
|
339
|
+
*/
|
|
340
|
+
async _connectWithRetry(session, emitProgress) {
|
|
341
|
+
const maxAttempts = this._maxConnectRetries;
|
|
342
|
+
let lastErr = null;
|
|
343
|
+
for (let attempt = 1; attempt <= maxAttempts; attempt += 1) {
|
|
344
|
+
emitProgress("connecting", { attempt, maxAttempts });
|
|
345
|
+
try {
|
|
346
|
+
await session.connect();
|
|
347
|
+
emitProgress("connected", { attempt });
|
|
348
|
+
return;
|
|
349
|
+
} catch (err) {
|
|
350
|
+
lastErr = err;
|
|
351
|
+
const transient = isTransientImapError(err);
|
|
352
|
+
emitProgress("error", {
|
|
353
|
+
failingPhase: "connecting",
|
|
354
|
+
attempt,
|
|
355
|
+
retriable: transient && attempt < maxAttempts,
|
|
356
|
+
code: err && err.code,
|
|
357
|
+
message: err && err.message ? err.message : String(err),
|
|
358
|
+
});
|
|
359
|
+
if (!transient || attempt >= maxAttempts) {
|
|
360
|
+
throw err;
|
|
361
|
+
}
|
|
362
|
+
// Exponential backoff: base * 3^(attempt-1)
|
|
363
|
+
const delay = this._retryBaseDelayMs * Math.pow(3, attempt - 1);
|
|
364
|
+
await sleep(delay);
|
|
365
|
+
}
|
|
366
|
+
}
|
|
367
|
+
throw lastErr;
|
|
368
|
+
}
|
|
369
|
+
|
|
155
370
|
normalize(raw) {
|
|
156
371
|
if (!raw || typeof raw !== "object" || !raw.payload) {
|
|
157
372
|
throw new Error("EmailAdapter.normalize: missing raw or raw.payload");
|
|
@@ -251,9 +466,54 @@ class EmailAdapter {
|
|
|
251
466
|
indicatorHeaders: pickIndicatorHeaders(parsedBody.headers),
|
|
252
467
|
}
|
|
253
468
|
: {}),
|
|
469
|
+
// Phase 5.3: per-email category + which layer / rule decided it.
|
|
470
|
+
// Phase 5.4 template extractors dispatched on `.classified`.
|
|
471
|
+
...(env.classification
|
|
472
|
+
? {
|
|
473
|
+
classified: env.classification.category,
|
|
474
|
+
classification: {
|
|
475
|
+
category: env.classification.category,
|
|
476
|
+
confidence: env.classification.confidence,
|
|
477
|
+
layer: env.classification.layer,
|
|
478
|
+
...(env.classification.ruleName ? { ruleName: env.classification.ruleName } : {}),
|
|
479
|
+
...(env.classification.reason ? { reason: env.classification.reason } : {}),
|
|
480
|
+
},
|
|
481
|
+
}
|
|
482
|
+
: {}),
|
|
483
|
+
// Phase 5.4: structured fields from the per-category template.
|
|
484
|
+
// Stored at top of extra so analysis prompts + KG ingestors can
|
|
485
|
+
// see them without spelunking through `extraction.fields`.
|
|
486
|
+
...(env.extraction && env.extraction.fields
|
|
487
|
+
? {
|
|
488
|
+
fields: env.extraction.fields,
|
|
489
|
+
extractionTemplate: env.extraction.template,
|
|
490
|
+
extractionConfidence: env.extraction.confidence,
|
|
491
|
+
...(env.extraction.warnings && env.extraction.warnings.length > 0
|
|
492
|
+
? { extractionWarnings: env.extraction.warnings }
|
|
493
|
+
: {}),
|
|
494
|
+
// Phase 5.5: per-attachment decrypt+extract summary so the
|
|
495
|
+
// UI can flag "could not unlock this bill" + transactions
|
|
496
|
+
// count. Actual transactions list lives at fields.transactions.
|
|
497
|
+
...(env.extraction.pdfExtraction
|
|
498
|
+
? { pdfExtraction: env.extraction.pdfExtraction }
|
|
499
|
+
: {}),
|
|
500
|
+
}
|
|
501
|
+
: {}),
|
|
254
502
|
},
|
|
255
503
|
};
|
|
256
504
|
|
|
505
|
+
// Phase 5.4 compliance redaction: emails containing verification
|
|
506
|
+
// codes (OTP / 2FA) must NEVER persist their body in vault — even
|
|
507
|
+
// an "expired" OTP is sensitive evidence of session activity.
|
|
508
|
+
// Adapter_Email_IMAP.md §9.2 mandates "verificationCodePresent =
|
|
509
|
+
// store metadata only".
|
|
510
|
+
if (env.extraction
|
|
511
|
+
&& env.extraction.template === "register"
|
|
512
|
+
&& env.extraction.fields
|
|
513
|
+
&& env.extraction.fields.verificationCodePresent) {
|
|
514
|
+
event.content.text = "(redacted: verification code email)";
|
|
515
|
+
}
|
|
516
|
+
|
|
257
517
|
return { events: [event], persons, places: [], items: [], topics: [] };
|
|
258
518
|
}
|
|
259
519
|
|
|
@@ -267,7 +527,7 @@ class EmailAdapter {
|
|
|
267
527
|
};
|
|
268
528
|
}
|
|
269
529
|
|
|
270
|
-
_envelopeToRawEvent(env, folder, parsedBody) {
|
|
530
|
+
_envelopeToRawEvent(env, folder, parsedBody, classification, extraction) {
|
|
271
531
|
const originalId = env.messageId && env.messageId.length > 0
|
|
272
532
|
? env.messageId
|
|
273
533
|
: `mid-fallback:${this.account.email}:${folder}:${env.uid}`;
|
|
@@ -280,6 +540,10 @@ class EmailAdapter {
|
|
|
280
540
|
// re-derive). The source is recoverable by re-syncing if absolutely
|
|
281
541
|
// needed.
|
|
282
542
|
const { source: _src, ...envNoSource } = env;
|
|
543
|
+
// Phase 5.5: also strip attachment buffers from parsedBody. Buffers
|
|
544
|
+
// are loaded for PDF decryption then discarded — vault keeps only
|
|
545
|
+
// metadata (filename / contentType / size / sha256).
|
|
546
|
+
const safeBody = parsedBody ? stripAttachmentBuffers(parsedBody) : null;
|
|
283
547
|
return {
|
|
284
548
|
adapter: NAME,
|
|
285
549
|
originalId,
|
|
@@ -287,11 +551,90 @@ class EmailAdapter {
|
|
|
287
551
|
payload: {
|
|
288
552
|
...envNoSource,
|
|
289
553
|
folder,
|
|
290
|
-
...(
|
|
554
|
+
...(safeBody ? { parsedBody: safeBody } : {}),
|
|
555
|
+
...(classification ? { classification } : {}),
|
|
556
|
+
...(extraction ? { extraction } : {}),
|
|
291
557
|
},
|
|
292
558
|
};
|
|
293
559
|
}
|
|
294
560
|
|
|
561
|
+
/**
|
|
562
|
+
* Phase 5.5 helper: for each PDF attachment with a buffer, try to
|
|
563
|
+
* decrypt + extract text + parse transactions. Merges results into
|
|
564
|
+
* `extraction.fields.transactions` and stamps `pdfExtraction` metadata
|
|
565
|
+
* on the extraction so UI can surface failures.
|
|
566
|
+
*
|
|
567
|
+
* Side effects: mutates `extraction.fields` + adds `extraction.pdfExtraction`.
|
|
568
|
+
* Errors captured, never thrown — preserves the "sync never aborts on
|
|
569
|
+
* a single bad email" invariant.
|
|
570
|
+
*/
|
|
571
|
+
async _runPdfExtraction(parsedBody, extraction) {
|
|
572
|
+
const pdfAtts = parsedBody.attachments.filter((a) => isPdfAttachment(a));
|
|
573
|
+
const results = [];
|
|
574
|
+
const allTxns = [];
|
|
575
|
+
|
|
576
|
+
for (const a of pdfAtts) {
|
|
577
|
+
if (!Buffer.isBuffer(a.buffer)) {
|
|
578
|
+
results.push({
|
|
579
|
+
filename: a.filename,
|
|
580
|
+
decrypted: false,
|
|
581
|
+
attempted: 0,
|
|
582
|
+
error: "no buffer (parser keepAttachmentBuffers=false?)",
|
|
583
|
+
});
|
|
584
|
+
continue;
|
|
585
|
+
}
|
|
586
|
+
try {
|
|
587
|
+
const r = await this._pdfExtractor(a.buffer, { passwords: this._pdfPasswords });
|
|
588
|
+
const summary = {
|
|
589
|
+
filename: a.filename,
|
|
590
|
+
decrypted: r.decrypted,
|
|
591
|
+
attempted: r.attempted,
|
|
592
|
+
wasEncrypted: r.wasEncrypted,
|
|
593
|
+
pageCount: r.pageCount,
|
|
594
|
+
...(r.password !== undefined ? { passwordUsed: "***" } : {}), // never persist the real password
|
|
595
|
+
...(r.error ? { error: r.error } : {}),
|
|
596
|
+
};
|
|
597
|
+
if (r.decrypted && typeof r.text === "string" && r.text.length > 0) {
|
|
598
|
+
const txns = this._transactionsExtractor(r.text);
|
|
599
|
+
if (Array.isArray(txns) && txns.length > 0) {
|
|
600
|
+
for (const t of txns) {
|
|
601
|
+
t.attachmentSha256 = a.sha256;
|
|
602
|
+
allTxns.push(t);
|
|
603
|
+
}
|
|
604
|
+
summary.transactionsExtracted = txns.length;
|
|
605
|
+
} else {
|
|
606
|
+
summary.transactionsExtracted = 0;
|
|
607
|
+
}
|
|
608
|
+
}
|
|
609
|
+
results.push(summary);
|
|
610
|
+
} catch (err) {
|
|
611
|
+
results.push({
|
|
612
|
+
filename: a.filename,
|
|
613
|
+
decrypted: false,
|
|
614
|
+
attempted: 0,
|
|
615
|
+
error: err && err.message ? err.message : String(err),
|
|
616
|
+
});
|
|
617
|
+
}
|
|
618
|
+
}
|
|
619
|
+
|
|
620
|
+
if (allTxns.length > 0) {
|
|
621
|
+
extraction.fields = extraction.fields || {};
|
|
622
|
+
extraction.fields.transactions = allTxns;
|
|
623
|
+
}
|
|
624
|
+
extraction.pdfExtraction = results;
|
|
625
|
+
}
|
|
626
|
+
|
|
627
|
+
_classifierInput(env, parsedBody) {
|
|
628
|
+
return {
|
|
629
|
+
from: env.from,
|
|
630
|
+
subject: env.subject,
|
|
631
|
+
attachments: parsedBody && Array.isArray(parsedBody.attachments) ? parsedBody.attachments : [],
|
|
632
|
+
textBody: (parsedBody && parsedBody.textBody) || "",
|
|
633
|
+
htmlBody: (parsedBody && parsedBody.htmlBody) || "",
|
|
634
|
+
headers: parsedBody && parsedBody.headers ? parsedBody.headers : {},
|
|
635
|
+
};
|
|
636
|
+
}
|
|
637
|
+
|
|
295
638
|
_source(originalId, internalDate) {
|
|
296
639
|
return {
|
|
297
640
|
adapter: NAME,
|
|
@@ -389,6 +732,74 @@ function pickIndicatorHeaders(headers) {
|
|
|
389
732
|
return out;
|
|
390
733
|
}
|
|
391
734
|
|
|
735
|
+
/**
|
|
736
|
+
* Phase 5.7: decide if an IMAP error is worth retrying. Transient
|
|
737
|
+
* network blips (ECONNRESET / ETIMEDOUT / EPIPE / connect-failed / socket-
|
|
738
|
+
* disconnect / "connection lost") get a retry; auth failures / mailbox
|
|
739
|
+
* misconfig do NOT.
|
|
740
|
+
*/
|
|
741
|
+
function isTransientImapError(err) {
|
|
742
|
+
if (!err) return false;
|
|
743
|
+
if (err.code === "AUTH_FAILED" || err.code === "MAILBOX_NOT_FOUND") return false;
|
|
744
|
+
if (err.code === "CONNECTION_FAILED") return true;
|
|
745
|
+
// Node-level network error codes
|
|
746
|
+
const networkCodes = new Set([
|
|
747
|
+
"ECONNRESET",
|
|
748
|
+
"ETIMEDOUT",
|
|
749
|
+
"EPIPE",
|
|
750
|
+
"ECONNREFUSED",
|
|
751
|
+
"ENETUNREACH",
|
|
752
|
+
"EAI_AGAIN",
|
|
753
|
+
"ENOTFOUND", // DNS can be transient on flaky networks
|
|
754
|
+
]);
|
|
755
|
+
if (err.code && networkCodes.has(err.code)) return true;
|
|
756
|
+
if (err.cause && err.cause.code && networkCodes.has(err.cause.code)) return true;
|
|
757
|
+
const msg = (err.message || "").toLowerCase();
|
|
758
|
+
if (
|
|
759
|
+
msg.includes("timed out")
|
|
760
|
+
|| msg.includes("timeout")
|
|
761
|
+
|| msg.includes("socket disconnect")
|
|
762
|
+
|| msg.includes("connection lost")
|
|
763
|
+
|| msg.includes("connection reset")
|
|
764
|
+
|| msg.includes("write after end")
|
|
765
|
+
) {
|
|
766
|
+
return true;
|
|
767
|
+
}
|
|
768
|
+
return false;
|
|
769
|
+
}
|
|
770
|
+
|
|
771
|
+
function sleep(ms) {
|
|
772
|
+
return new Promise((resolve) => setTimeout(resolve, ms));
|
|
773
|
+
}
|
|
774
|
+
|
|
775
|
+
/**
|
|
776
|
+
* Phase 5.5: detect PDF attachments. Goes by contentType first, falls
|
|
777
|
+
* back to filename extension for senders that omit the MIME type.
|
|
778
|
+
*/
|
|
779
|
+
function isPdfAttachment(a) {
|
|
780
|
+
if (!a || typeof a !== "object") return false;
|
|
781
|
+
if (typeof a.contentType === "string" && a.contentType.toLowerCase().includes("pdf")) return true;
|
|
782
|
+
if (typeof a.filename === "string" && /\.pdf$/i.test(a.filename)) return true;
|
|
783
|
+
return false;
|
|
784
|
+
}
|
|
785
|
+
|
|
786
|
+
/**
|
|
787
|
+
* Phase 5.5: drop Buffer fields from each attachment before the parsed
|
|
788
|
+
* body lands in the emitted RawEvent. Vault row size + WS-gateway
|
|
789
|
+
* serialization cost would be dominated by attachment bytes otherwise.
|
|
790
|
+
*/
|
|
791
|
+
function stripAttachmentBuffers(parsedBody) {
|
|
792
|
+
if (!parsedBody || !Array.isArray(parsedBody.attachments)) return parsedBody;
|
|
793
|
+
return {
|
|
794
|
+
...parsedBody,
|
|
795
|
+
attachments: parsedBody.attachments.map((a) => {
|
|
796
|
+
if (!a || a.buffer == null) return a;
|
|
797
|
+
const { buffer: _b, ...rest } = a;
|
|
798
|
+
return rest;
|
|
799
|
+
}),
|
|
800
|
+
};
|
|
801
|
+
}
|
|
802
|
+
|
|
392
803
|
module.exports = {
|
|
393
804
|
EmailAdapter,
|
|
394
805
|
parseWatermark,
|
|
@@ -9,6 +9,27 @@ const {
|
|
|
9
9
|
ImapMailboxNotFoundError,
|
|
10
10
|
} = require("./imap-session");
|
|
11
11
|
const { parseRawEmail } = require("./email-parser");
|
|
12
|
+
const {
|
|
13
|
+
CATEGORIES,
|
|
14
|
+
ALL_CATEGORIES,
|
|
15
|
+
LAYER1_RULES,
|
|
16
|
+
classifyLayer1,
|
|
17
|
+
classifyLayer2,
|
|
18
|
+
classifyEmail,
|
|
19
|
+
LAYER2_SYSTEM_PROMPT,
|
|
20
|
+
} = require("./classifier");
|
|
21
|
+
const {
|
|
22
|
+
extractFields,
|
|
23
|
+
CATEGORY_TO_EXTRACTOR,
|
|
24
|
+
extractBill,
|
|
25
|
+
extractOrder,
|
|
26
|
+
extractTravel,
|
|
27
|
+
extractGovernment,
|
|
28
|
+
extractRegister,
|
|
29
|
+
extractOther,
|
|
30
|
+
} = require("./templates");
|
|
31
|
+
const { extractPdfText, passwordsFromHints } = require("./pdf-extractor");
|
|
32
|
+
const { extractTransactions } = require("./transactions");
|
|
12
33
|
|
|
13
34
|
module.exports = {
|
|
14
35
|
EmailAdapter,
|
|
@@ -23,4 +44,25 @@ module.exports = {
|
|
|
23
44
|
ImapConnectionFailedError,
|
|
24
45
|
ImapMailboxNotFoundError,
|
|
25
46
|
parseRawEmail,
|
|
47
|
+
// Phase 5.3 classifier
|
|
48
|
+
EMAIL_CATEGORIES: CATEGORIES,
|
|
49
|
+
EMAIL_ALL_CATEGORIES: ALL_CATEGORIES,
|
|
50
|
+
EMAIL_LAYER1_RULES: LAYER1_RULES,
|
|
51
|
+
classifyEmailLayer1: classifyLayer1,
|
|
52
|
+
classifyEmailLayer2: classifyLayer2,
|
|
53
|
+
classifyEmail,
|
|
54
|
+
EMAIL_LAYER2_SYSTEM_PROMPT: LAYER2_SYSTEM_PROMPT,
|
|
55
|
+
// Phase 5.4 — 6 template extractors + dispatcher
|
|
56
|
+
extractEmailFields: extractFields,
|
|
57
|
+
EMAIL_CATEGORY_EXTRACTORS: CATEGORY_TO_EXTRACTOR,
|
|
58
|
+
extractBill,
|
|
59
|
+
extractOrder,
|
|
60
|
+
extractTravel,
|
|
61
|
+
extractGovernment,
|
|
62
|
+
extractRegister,
|
|
63
|
+
extractOther,
|
|
64
|
+
// Phase 5.5 — PDF decryption + transactions extraction
|
|
65
|
+
extractPdfText,
|
|
66
|
+
pdfPasswordsFromHints: passwordsFromHints,
|
|
67
|
+
extractTransactions,
|
|
26
68
|
};
|