@chainlesschain/personal-data-hub 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/__tests__/adapters/ai-chat-history.test.js +395 -0
- package/__tests__/adapters/ai-chat-http-client.test.js +242 -0
- package/__tests__/adapters/ai-chat-vendors.test.js +733 -0
- package/__tests__/adapters/alipay-bill-adapter.test.js +538 -0
- package/__tests__/adapters/email-adapter.test.js +138 -1
- package/__tests__/adapters/email-classifier.test.js +347 -0
- package/__tests__/adapters/email-pdf-extractor.test.js +529 -0
- package/__tests__/adapters/email-retry-progress.test.js +294 -0
- package/__tests__/adapters/email-templates.test.js +699 -0
- package/__tests__/adapters/system-data-adapter.test.js +440 -0
- package/__tests__/adapters/system-data-disclosure.test.js +153 -0
- package/__tests__/analysis-skills.test.js +409 -0
- package/__tests__/entity-resolver-ingest-hook.test.js +177 -0
- package/__tests__/entity-resolver-stages.test.js +411 -0
- package/__tests__/entity-resolver-vault.test.js +246 -0
- package/__tests__/entity-resolver.test.js +526 -0
- package/__tests__/fixtures/entity-resolver-200-mock.json +96 -0
- package/__tests__/longtail-adapters.test.js +217 -0
- package/__tests__/mobile-extractor.test.js +288 -0
- package/__tests__/shopping-adapters.test.js +296 -0
- package/__tests__/sidecar-contacts-cross-validate.test.js +163 -0
- package/__tests__/sidecar-supervisor.test.js +120 -0
- package/__tests__/social-adapters.test.js +206 -0
- package/__tests__/travel-adapters.test.js +325 -0
- package/__tests__/vault.test.js +3 -3
- package/__tests__/wechat-adapter.test.js +476 -0
- package/__tests__/whatsapp-adapter.test.js +135 -0
- package/lib/adapter-spec.js +12 -0
- package/lib/adapters/_python-sidecar-base.js +207 -0
- package/lib/adapters/ai-chat-history/ai-chat-adapter.js +335 -0
- package/lib/adapters/ai-chat-history/cookie-auth.js +109 -0
- package/lib/adapters/ai-chat-history/http-client.js +211 -0
- package/lib/adapters/ai-chat-history/index.js +28 -0
- package/lib/adapters/ai-chat-history/schema-map.js +221 -0
- package/lib/adapters/ai-chat-history/vendor-spec.js +85 -0
- package/lib/adapters/ai-chat-history/vendors/coze.js +179 -0
- package/lib/adapters/ai-chat-history/vendors/deepseek.js +199 -0
- package/lib/adapters/ai-chat-history/vendors/dreamina.js +174 -0
- package/lib/adapters/ai-chat-history/vendors/hunyuan.js +176 -0
- package/lib/adapters/ai-chat-history/vendors/kimi.js +182 -0
- package/lib/adapters/ai-chat-history/vendors/qianfan.js +160 -0
- package/lib/adapters/ai-chat-history/vendors/tongyi.js +193 -0
- package/lib/adapters/ai-chat-history/vendors/zhipu.js +202 -0
- package/lib/adapters/alipay-bill/alipay-bill-adapter.js +307 -0
- package/lib/adapters/alipay-bill/counterparty.js +129 -0
- package/lib/adapters/alipay-bill/csv-parser.js +217 -0
- package/lib/adapters/alipay-bill/index.js +41 -0
- package/lib/adapters/alipay-bill/zip-decryptor.js +111 -0
- package/lib/adapters/email-imap/classifier.js +495 -0
- package/lib/adapters/email-imap/email-adapter.js +419 -8
- package/lib/adapters/email-imap/index.js +42 -0
- package/lib/adapters/email-imap/pdf-extractor.js +192 -0
- package/lib/adapters/email-imap/templates/bill.js +232 -0
- package/lib/adapters/email-imap/templates/government.js +120 -0
- package/lib/adapters/email-imap/templates/index.js +78 -0
- package/lib/adapters/email-imap/templates/order.js +186 -0
- package/lib/adapters/email-imap/templates/other.js +114 -0
- package/lib/adapters/email-imap/templates/register.js +113 -0
- package/lib/adapters/email-imap/templates/travel.js +157 -0
- package/lib/adapters/email-imap/templates/utils.js +275 -0
- package/lib/adapters/email-imap/transactions.js +234 -0
- package/lib/adapters/messaging-qq/index.js +158 -0
- package/lib/adapters/messaging-telegram/index.js +142 -0
- package/lib/adapters/messaging-whatsapp/index.js +189 -0
- package/lib/adapters/shopping-base/index.js +208 -0
- package/lib/adapters/shopping-jd/index.js +150 -0
- package/lib/adapters/shopping-meituan/index.js +154 -0
- package/lib/adapters/shopping-taobao/index.js +176 -0
- package/lib/adapters/social-bilibili/index.js +171 -0
- package/lib/adapters/social-douyin/index.js +116 -0
- package/lib/adapters/social-weibo/index.js +164 -0
- package/lib/adapters/social-xiaohongshu/index.js +96 -0
- package/lib/adapters/system-data/disclosure.js +166 -0
- package/lib/adapters/system-data/index.js +34 -0
- package/lib/adapters/system-data/system-data-adapter.js +344 -0
- package/lib/adapters/travel-12306/index.js +151 -0
- package/lib/adapters/travel-amap/index.js +164 -0
- package/lib/adapters/travel-baidu-map/index.js +162 -0
- package/lib/adapters/travel-base/index.js +240 -0
- package/lib/adapters/travel-ctrip/index.js +151 -0
- package/lib/adapters/wechat/content-parser.js +326 -0
- package/lib/adapters/wechat/db-reader.js +209 -0
- package/lib/adapters/wechat/index.js +28 -0
- package/lib/adapters/wechat/key-extractor.js +158 -0
- package/lib/adapters/wechat/normalize.js +220 -0
- package/lib/adapters/wechat/wechat-adapter.js +205 -0
- package/lib/analysis-skills/base.js +113 -0
- package/lib/analysis-skills/footprint.js +167 -0
- package/lib/analysis-skills/index.js +58 -0
- package/lib/analysis-skills/interests.js +161 -0
- package/lib/analysis-skills/relations.js +226 -0
- package/lib/analysis-skills/spending.js +216 -0
- package/lib/analysis-skills/timeline.js +167 -0
- package/lib/entity-resolver/embedding-stage.js +198 -0
- package/lib/entity-resolver/entity-resolver.js +384 -0
- package/lib/entity-resolver/index.js +42 -0
- package/lib/entity-resolver/llm-stage.js +191 -0
- package/lib/entity-resolver/rule-stage.js +208 -0
- package/lib/entity-resolver/worker.js +149 -0
- package/lib/index.js +115 -0
- package/lib/migrations.js +73 -0
- package/lib/mobile-extractor/android.js +193 -0
- package/lib/mobile-extractor/index.js +9 -0
- package/lib/mobile-extractor/ios.js +223 -0
- package/lib/registry.js +42 -0
- package/lib/sidecar/index.js +15 -0
- package/lib/sidecar/supervisor.js +359 -0
- package/lib/vault.js +266 -0
- package/package.json +29 -3
- package/scripts/_make-fixture-all.js +126 -0
- package/scripts/_make-fixture-contacts.js +84 -0
- package/scripts/evaluate-entity-resolver.js +213 -0
- package/scripts/smoke-phase-5-5.js +196 -0
- package/scripts/smoke-phase-5-7.js +181 -0
- package/scripts/smoke-system-data-contacts.js +309 -0
- package/scripts/smoke-system-data.js +312 -0
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Phase 5.5 — PDF text extractor with password-trial loop.
|
|
3
|
+
*
|
|
4
|
+
* Bank credit-card statements arrive as **encrypted** PDF attachments
|
|
5
|
+
* (~80% of Chinese issuers). Common password formats:
|
|
6
|
+
* - 身份证后 6 位 (last 6 of national ID)
|
|
7
|
+
* - 手机后 6 位 (last 6 of registered phone)
|
|
8
|
+
* - 卡号尾 6 位 (last 6 of card number)
|
|
9
|
+
* - 卡号尾 4 位 (last 4 of card number)
|
|
10
|
+
* - 用户生日 YYYYMMDD (date of birth)
|
|
11
|
+
*
|
|
12
|
+
* Strategy (per design doc §6.2.1 + T4 in §12):
|
|
13
|
+
* 1. Try ALL passwords in the supplied list (user-config'd + adapter
|
|
14
|
+
* heuristics) plus an empty-password attempt first (PDFs flagged
|
|
15
|
+
* as `/Encrypt` but with empty owner-password decrypt successfully).
|
|
16
|
+
* 2. Return on FIRST success.
|
|
17
|
+
* 3. If every password fails, return `{ decrypted:false, attempted:N }`
|
|
18
|
+
* so the caller can mark the bill as `unparsable` instead of
|
|
19
|
+
* silently dropping it. The user can later add a password to
|
|
20
|
+
* their list and re-run sync — the cached watermark plus
|
|
21
|
+
* reclassification on retry will pick it up.
|
|
22
|
+
*
|
|
23
|
+
* Implementation:
|
|
24
|
+
* - This module wraps `pdf-parse`. To avoid loading the heavy pdf.js
|
|
25
|
+
* bundle at hub require-time (cold start cost for non-email
|
|
26
|
+
* consumers), the dep is lazy-loaded on first call. `pdf-parse`
|
|
27
|
+
* itself imports pdfjs-dist v1.10 internally.
|
|
28
|
+
* - Callers may inject a custom `pdfParseImpl` for testing — this
|
|
29
|
+
* keeps the unit tests entirely free of pdf-parse / pdfjs deps
|
|
30
|
+
* (the real lib is pulled in only by integration / smoke).
|
|
31
|
+
* - Outputs the first 200_000 chars of extracted text (same cap as
|
|
32
|
+
* mailparser body output) so downstream LLM prompts stay sane.
|
|
33
|
+
*
|
|
34
|
+
* @typedef {object} PdfExtractResult
|
|
35
|
+
* @property {boolean} decrypted true if SOME password (or no-password)
|
|
36
|
+
* successfully opened the PDF.
|
|
37
|
+
* @property {string} text extracted plain text (empty when !decrypted)
|
|
38
|
+
* @property {string=} password   the non-empty password that opened the PDF.
*                                Undefined when the PDF opened on the initial
*                                empty / no-password attempt (this includes
*                                PDFs that are flagged as encrypted but open
*                                without a password).
|
|
42
|
+
* @property {number} attempted count of password trials performed
|
|
43
|
+
* @property {boolean} wasEncrypted whether the PDF was actually encrypted
|
|
44
|
+
* (vs `isEncrypted` heuristic returning
|
|
45
|
+
* a false-positive — pdf-parse reports
|
|
46
|
+
* this via `info.IsEncrypted`).
|
|
47
|
+
* @property {number} pageCount pages, when known.
|
|
48
|
+
* @property {string=} error last error message if extraction failed
|
|
49
|
+
*/
|
|
50
|
+
|
|
51
|
+
"use strict";
|
|
52
|
+
|
|
53
|
+
/**
 * Extract text from a PDF buffer, trying candidate passwords in order until
 * one opens the document or the list is exhausted.
 *
 * The empty password is always tried first, then the caller's candidates
 * (non-strings dropped, duplicates removed, order preserved). A failure that
 * looks like a wrong-password error moves on to the next candidate; any other
 * failure ends the loop immediately. Neither case throws — the function
 * returns `{decrypted:false, ...}` so callers can mark the bill unparsable
 * and surface a "missing password" hint to the UI.
 *
 * NOTE: when no `pdfParseImpl` is injected, the lazy `loadPdfParse()` call may
 * still reject if the dependency cannot be loaded.
 *
 * @param {Buffer} buffer                      PDF bytes (required)
 * @param {object} [opts]
 * @param {string[]} [opts.passwords=[]]       candidate passwords to try, in order
 * @param {number}   [opts.maxTextChars=200000] cap applied to the extracted text
 * @param {Function} [opts.pdfParseImpl]       DI seam: `async (buf, {password?}) => {text, numpages, info?}`.
 *                                             Defaults to lazy-loaded `pdf-parse`.
 * @returns {Promise<PdfExtractResult>}
 */
async function extractPdfText(buffer, opts = {}) {
  if (!Buffer.isBuffer(buffer)) {
    return {
      decrypted: false,
      text: "",
      attempted: 0,
      wasEncrypted: false,
      pageCount: 0,
      error: "buffer required",
    };
  }

  // Tolerate an explicit `null` options argument.
  const options = opts || {};
  const maxTextChars = Number.isFinite(options.maxTextChars) && options.maxTextChars > 0
    ? options.maxTextChars
    : 200_000;
  const userPasswords = Array.isArray(options.passwords)
    ? options.passwords.filter((p) => typeof p === "string")
    : [];
  const pdfParse = typeof options.pdfParseImpl === "function"
    ? options.pdfParseImpl
    : await loadPdfParse();

  // Empty password FIRST, then user candidates. A Set deduplicates while
  // preserving insertion order — user lists often include "" themselves.
  const trial = [...new Set(["", ...userPasswords])];

  let lastError = null;
  let attempted = 0;
  let wasEncrypted = false;

  for (const pw of trial) {
    attempted += 1;
    try {
      const parsed = await pdfParse(buffer, { password: pw });
      const info = parsed && parsed.info ? parsed.info : {};
      if (info.IsEncrypted) wasEncrypted = true;
      const text = trim((parsed && parsed.text) || "", maxTextChars);
      return {
        decrypted: true,
        text,
        // The empty-password attempt is reported as "no password".
        password: pw === "" ? undefined : pw,
        attempted,
        wasEncrypted,
        pageCount: (parsed && parsed.numpages) || 0,
      };
    } catch (err) {
      lastError = err && err.message ? err.message : String(err);
      // Wrong-password failures are recognised by either the message or the
      // error's name (e.g. "PasswordException"), so a password error with an
      // unusual message still advances to the next candidate.
      const errName = err && typeof err.name === "string" ? err.name : "";
      if (/password/i.test(lastError) || /password/i.test(errName)) {
        wasEncrypted = true;
        continue; // try next candidate
      }
      // Non-password error (corrupt file, unsupported feature, ...) → more
      // passwords will not help.
      return {
        decrypted: false,
        text: "",
        attempted,
        wasEncrypted,
        pageCount: 0,
        error: lastError,
      };
    }
  }

  // Reaching this point means every attempt ended in a password error.
  return {
    decrypted: false,
    text: "",
    attempted,
    wasEncrypted: true,
    pageCount: 0,
    error: lastError || "all passwords failed",
  };
}
|
|
142
|
+
|
|
143
|
+
/**
 * Build the candidate password list from per-user profile hints.
 *
 * Hints are read in a fixed priority order (ID card, phone, card last 6,
 * card last 4, date of birth). Only non-empty strings are kept and
 * duplicates are dropped, keeping the first occurrence.
 *
 * @param {object} [hints]              missing / null / non-object → `[]`
 * @param {string} [hints.idCardLast6]
 * @param {string} [hints.phoneLast6]
 * @param {string} [hints.cardLast6]
 * @param {string} [hints.cardLast4]
 * @param {string} [hints.dobYYYYMMDD]
 * @returns {string[]}
 */
function passwordsFromHints(hints = {}) {
  // A `null` argument bypasses the default parameter; treat it as "no hints".
  const source = hints !== null && typeof hints === "object" ? hints : {};
  const priority = ["idCardLast6", "phoneLast6", "cardLast6", "cardLast4", "dobYYYYMMDD"];

  // A Set keeps insertion order, which gives ordered de-duplication.
  const unique = new Set();
  for (const key of priority) {
    const value = source[key];
    if (typeof value === "string" && value.length > 0) unique.add(value);
  }
  return [...unique];
}
|
|
166
|
+
|
|
167
|
+
// ─── helpers ────────────────────────────────────────────────────────────
|
|
168
|
+
|
|
169
|
+
/**
 * Cap a string at `max` UTF-16 code units, appending a marker that states
 * how many units were dropped. Non-string input yields "".
 *
 * The cut never lands between the two halves of a surrogate pair: if the
 * last kept unit would be a lone high surrogate, it is dropped as well so
 * the output stays well-formed.
 *
 * @param {*} s
 * @param {number} max
 * @returns {string}
 */
function trim(s, max) {
  if (typeof s !== "string") return "";
  if (s.length <= max) return s;

  let cut = Math.max(0, Math.floor(max));
  if (cut > 0) {
    const lastKept = s.charCodeAt(cut - 1);
    // 0xD800–0xDBFF is the high-surrogate range.
    if (lastKept >= 0xd800 && lastKept <= 0xdbff) cut -= 1;
  }
  return `${s.slice(0, cut)}…[truncated ${s.length - cut} chars]`;
}
|
|
174
|
+
|
|
175
|
+
// Memoized `pdf-parse` module; stays null until the first successful load.
let _pdfParseCache = null;

/**
 * Lazily `require` the `pdf-parse` module and cache it for later calls.
 *
 * @returns {Promise<*>} whatever `require("pdf-parse")` returns
 * @throws {Error} when the require fails; the underlying error is attached
 *                 as `cause` so the real failure is not lost.
 */
async function loadPdfParse() {
  if (_pdfParseCache) return _pdfParseCache;
  try {
    // eslint-disable-next-line global-require
    _pdfParseCache = require("pdf-parse");
  } catch (err) {
    const detail = err && err.message ? err.message : err;
    throw new Error(
      `pdf-parse not installed — Phase 5.5 PDF extraction needs it. ${detail}`,
      { cause: err }
    );
  }
  return _pdfParseCache;
}
|
|
188
|
+
|
|
189
|
+
module.exports = {
|
|
190
|
+
extractPdfText,
|
|
191
|
+
passwordsFromHints,
|
|
192
|
+
};
|
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* bill template extractor — Phase 5.4 of the Personal Data Hub.
|
|
3
|
+
*
|
|
4
|
+
* Handles emails classified as bill_bank or bill_credit. Pulls structured
|
|
5
|
+
* financial fields out of the body + headers so Phase 5.5 (PDF
|
|
6
|
+
* decryption) and the future spending analyzer have something to grip:
|
|
7
|
+
*
|
|
8
|
+
* amount { value, currency, direction? }
|
|
9
|
+
* dueAmount same shape — separate from `amount` so total-billed
|
|
10
|
+
* vs total-due aren't conflated
|
|
11
|
+
* billingPeriod { start: Date, end: Date } when both detected
|
|
12
|
+
* dueDate Date (always parsed from explicit "最后还款日" /
|
|
13
|
+
* "due date" keywords, never inferred)
|
|
14
|
+
* accountIdentifier "尾号 1234" / "**** 5678" — last 4 digits only
|
|
15
|
+
* institution sender display name OR domain root
|
|
16
|
+
* billingMonth "YYYY-MM" string (heuristic from subject + dueDate)
|
|
17
|
+
*
|
|
18
|
+
* Strategy:
|
|
19
|
+
* 1. Regex over textBody + subject — single pass, deterministic.
|
|
20
|
+
* 2. Pick the most-plausible amount via `selectPrimaryAmount`. When
|
|
21
|
+
* both 应还/应付 and a generic amount are present, the directional
|
|
22
|
+
* one wins.
|
|
23
|
+
* 3. If `opts.llm` provided AND regex coverage < 60%, ask the LLM to
|
|
24
|
+
* fill gaps (Phase 5.4 leaves this as a stub — Phase 5.5 wires it
|
|
25
|
+
* after we ground PDF-text against actual bank statements).
|
|
26
|
+
*
|
|
27
|
+
* Returns { template:"bill", fields, confidence, warnings }.
|
|
28
|
+
*/
|
|
29
|
+
|
|
30
|
+
"use strict";
|
|
31
|
+
|
|
32
|
+
const {
|
|
33
|
+
extractAmounts,
|
|
34
|
+
extractDates,
|
|
35
|
+
extractAccountTails,
|
|
36
|
+
selectPrimaryAmount,
|
|
37
|
+
dateToMs,
|
|
38
|
+
} = require("./utils");
|
|
39
|
+
|
|
40
|
+
// Keyword anchors used to locate a value window in the email text. Each one
// consumes the keyword plus an optional separator, which may be an ASCII
// colon or the fullwidth colon (U+FF1A) common in Chinese text.
const DUE_DATE_KEYWORDS = /(最后还款日|还款日|账单到期日|due\s*date|payment\s*due|应还日期)\s*[:\uFF1A]?\s*/i;
const PERIOD_KEYWORDS = /(账单周期|账期|结账周期|billing\s*period|statement\s*period)\s*[:\uFF1A]?\s*/i;
const DUE_AMOUNT_KEYWORDS = /(应还金额|本期应还|本期欠款|应还合计|最低还款额|amount\s*due|total\s*due)\s*[:\uFF1A]?\s*/i;
|
|
43
|
+
|
|
44
|
+
/**
 * Extract structured billing fields from an email classified as
 * bill_bank / bill_credit.
 *
 * Works purely from regex helpers over the subject and body text: the
 * primary amount, an explicitly keyworded due amount / due date / billing
 * period, the account tail, the sending institution and a billing-month key.
 * Fields that cannot be detected are omitted from `fields`.
 *
 * @param {object} email — must include from/subject/textBody (or htmlBody)
 * @param {object} [_opts] currently unused by this extractor
 * @param {{chat:Function}} [_opts.llm]
 * @returns {Promise<{template:"bill",fields:object,confidence:number,warnings:string[]}>}
 */
async function extractBill(email, _opts = {}) {
  const warnings = [];
  const textParts = collectSearchableText(email);

  // ── 1. amount + dueAmount ──────────────────────────────────────────
  const allAmounts = textParts
    .flatMap((t) => extractAmounts(t.body).map((a) => ({ ...a, source: t.label })));

  // dueAmount: first amount found in the 40 chars after a due-amount keyword.
  let dueAmount = null;
  for (const windowText of keywordWindows(textParts, DUE_AMOUNT_KEYWORDS, 40)) {
    const a = extractAmounts(windowText)[0];
    if (a) {
      dueAmount = { value: a.value, currency: a.currency, raw: a.raw };
      break;
    }
  }

  const primary = selectPrimaryAmount(allAmounts);
  const amount = primary
    ? { value: primary.value, currency: primary.currency, direction: primary.direction || "out" }
    : null;
  if (!amount) warnings.push("no monetary amount detected");

  // ── 2. dueDate — only from an explicit keyword, never inferred ─────
  let dueDate = null;
  for (const windowText of keywordWindows(textParts, DUE_DATE_KEYWORDS, 40)) {
    const dates = extractDates(windowText);
    if (dates.length > 0) {
      dueDate = dates[0].date;
      break;
    }
  }

  // ── 3. billingPeriod — needs two dates within 80 chars of the keyword
  let billingPeriod = null;
  for (const windowText of keywordWindows(textParts, PERIOD_KEYWORDS, 80)) {
    const dates = extractDates(windowText);
    if (dates.length >= 2) {
      billingPeriod = { start: dates[0].date, end: dates[1].date };
      break;
    }
  }

  // ── 4. account identifier (last 4) ────────────────────────────────
  const tails = textParts.flatMap((t) => extractAccountTails(t.body));
  const accountIdentifier = tails.length > 0 ? `**** ${tails[0].last4}` : null;

  // ── 5. institution — from sender display name, fall back to domain ─
  const institution = resolveInstitution(email);

  // ── 6. billingMonth heuristic ──────────────────────────────────────
  const billingMonth = inferBillingMonth(billingPeriod, dueDate, email.subject);

  const fields = {
    ...(amount ? { amount } : {}),
    ...(dueAmount ? { dueAmount } : {}),
    ...(dueDate ? { dueDate: dateToMs(dueDate) } : {}),
    ...(billingPeriod
      ? {
          billingPeriod: {
            startMs: dateToMs(billingPeriod.start),
            endMs: dateToMs(billingPeriod.end),
          },
        }
      : {}),
    ...(accountIdentifier ? { accountIdentifier } : {}),
    ...(institution ? { institution } : {}),
    ...(billingMonth ? { billingMonth } : {}),
  };

  return {
    template: "bill",
    fields,
    confidence: confidenceFor(fields),
    warnings,
  };
}

/**
 * Yield, for each text part whose body matches `keywordRe`, the `size`
 * characters that follow the first match of the keyword.
 */
function* keywordWindows(textParts, keywordRe, size) {
  for (const part of textParts) {
    const m = part.body.match(keywordRe);
    if (!m) continue;
    const start = m.index + m[0].length;
    yield part.body.slice(start, start + size);
  }
}

/**
 * Derive the "YYYY-MM" billing month: billing-period start when known, else
 * the month before the due date, else a month named in the subject line.
 */
function inferBillingMonth(billingPeriod, dueDate, subject) {
  if (billingPeriod && billingPeriod.start instanceof Date) {
    return formatMonthKey(billingPeriod.start);
  }

  if (dueDate instanceof Date && !Number.isNaN(dueDate.getTime())) {
    // "11 月对账单 due 12-25" → bill is for the month BEFORE the due date.
    // The day is pinned to the 1st: subtracting a month from e.g. Mar 31
    // in place would overflow ("Feb 31") and land back in March.
    return formatMonthKey(new Date(dueDate.getFullYear(), dueDate.getMonth() - 1, 1));
  }

  const m = String(subject || "").match(/(?:(\d{4})\s*年\s*)?(\d{1,2})\s*月.*(?:对账单|月结|账单)/);
  if (!m) return null;

  const month = +m[2];
  if (month < 1 || month > 12) return null; // e.g. "13月" is not a month

  // Prefer a year written in the subject; otherwise assume the most recent
  // occurrence of that month relative to the current clock.
  const statedYear = m[1] ? +m[1] : NaN;
  let year;
  if (statedYear >= 1970 && statedYear <= 2099) {
    year = statedYear;
  } else {
    const now = new Date();
    year = month > now.getMonth() + 1 ? now.getFullYear() - 1 : now.getFullYear();
  }
  return formatMonthKey(new Date(year, month - 1, 1));
}
|
|
154
|
+
|
|
155
|
+
// ─── helpers ────────────────────────────────────────────────────────────
|
|
156
|
+
|
|
157
|
+
/**
 * Gather the labelled text sections the bill extractor searches: the
 * subject, then the plain-text body, or — only when there is no plain-text
 * body — the HTML body with markup stripped.
 *
 * @param {object} email
 * @returns {{label:string, body:string}[]}
 */
function collectSearchableText(email) {
  const sections = [];
  const add = (label, body) => {
    sections.push({ label, body });
  };

  if (email.subject) add("subject", email.subject);

  if (email.textBody) {
    add("textBody", email.textBody);
  } else if (email.htmlBody) {
    add("htmlBody", stripHtml(email.htmlBody));
  }

  return sections;
}
|
|
164
|
+
|
|
165
|
+
/**
 * Reduce an HTML body to searchable plain text: drop <style>/<script>
 * blocks, replace remaining tags with a space, decode the non-breaking-space
 * and ampersand entities, then collapse whitespace runs to single spaces.
 *
 * The ampersand entity is decoded last so an escaped entity is not decoded
 * twice.
 *
 * @param {*} html coerced with String()
 * @returns {string}
 */
function stripHtml(html) {
  return String(html)
    .replace(/<style\b[^>]*>[\s\S]*?<\/style>/gi, "")
    .replace(/<script\b[^>]*>[\s\S]*?<\/script>/gi, "")
    .replace(/<[^>]+>/g, " ")
    .replace(/&(?:nbsp|#160);/g, " ")
    .replace(/&(?:amp|#38);/g, "&")
    .replace(/\s+/g, " ");
}
|
|
174
|
+
|
|
175
|
+
// Known bank mail domains → friendly institution names. A Map is used so
// that lookups can never hit inherited Object.prototype keys.
const KNOWN_BANK_DOMAINS = new Map([
  ["cmbchina.com", "招商银行"],
  ["ccb.com.cn", "建设银行"],
  ["boc.cn", "中国银行"],
  ["bochk.cn", "中国银行"],
  ["bochk.com", "中国银行"],
  ["icbc.com.cn", "工商银行"],
  ["psbc.com", "邮储银行"],
  ["abchina.com", "农业银行"],
  ["bankcomm.com", "交通银行"],
  ["spdb.com.cn", "浦发银行"],
  ["cmbc.com.cn", "民生银行"],
  ["cebbank.com", "光大银行"],
  ["citicbank.com", "中信银行"],
  ["hxb.com.cn", "华夏银行"],
]);

/**
 * Resolve the institution behind a bill email from its first sender.
 *
 * Order of preference: the sender display name; a friendly name for a known
 * bank domain (exact match, then each parent domain with at least two
 * labels, so `credit.ccb.com.cn` resolves via `ccb.com.cn`); the lower-cased
 * sender domain itself.
 *
 * @param {object} email expects `email.from` as an array of `{name?, address?}`
 * @returns {string|null} null when there is no sender or no "@" in the address
 */
function resolveInstitution(email) {
  const sender = Array.isArray(email.from) ? email.from[0] : null;
  if (!sender) return null;
  if (sender.name) return sender.name;

  const addr = sender.address || "";
  const at = addr.lastIndexOf("@");
  if (at < 0) return null;

  const domain = addr.slice(at + 1).toLowerCase();
  const exact = KNOWN_BANK_DOMAINS.get(domain);
  if (exact) return exact;

  // Walk up the parent domains, longest first, stopping at two labels.
  const labels = domain.split(".");
  for (let i = 1; labels.length - i >= 2; i += 1) {
    const hit = KNOWN_BANK_DOMAINS.get(labels.slice(i).join("."));
    if (hit) return hit;
  }
  return domain;
}
|
|
211
|
+
|
|
212
|
+
/**
 * Format a Date as a "YYYY-MM" key using its local-time year and month.
 *
 * @param {*} d
 * @returns {string|null} null for non-Date input or an Invalid Date
 *                        (which would otherwise format as "NaN-NaN")
 */
function formatMonthKey(d) {
  if (!(d instanceof Date) || Number.isNaN(d.getTime())) return null;
  const y = d.getFullYear();
  const m = d.getMonth() + 1;
  return `${y}-${String(m).padStart(2, "0")}`;
}
|
|
218
|
+
|
|
219
|
+
/**
 * Coarse confidence score for a bill extraction: the share of the seven
 * tracked fields that are present (not null/undefined), rounded to two
 * decimals.
 *
 * @param {object} fields
 * @returns {number} value in [0, 1]
 */
function confidenceFor(fields) {
  const trackedKeys = [
    "amount",
    "dueAmount",
    "dueDate",
    "billingPeriod",
    "accountIdentifier",
    "institution",
    "billingMonth",
  ];

  let present = 0;
  for (const key of trackedKeys) {
    if (fields[key] != null) present += 1;
  }

  const ratio = present / trackedKeys.length;
  return Math.round(ratio * 100) / 100;
}
|
|
231
|
+
|
|
232
|
+
module.exports = { extractBill };
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* government template extractor — Phase 5.4.
|
|
3
|
+
*
|
|
4
|
+
* Fields:
|
|
5
|
+
* agencyName sender display or known *.gov.cn registrar
|
|
6
|
+
* documentType tax_declaration / social_security / housing_fund /
|
|
7
|
+
* real_estate / immigration / health_insurance / other
|
|
8
|
+
* period "YYYY-MM" when keyword-tagged month found
|
|
9
|
+
* amount { value, currency } if monetary content (e.g.
|
|
10
|
+
* tax declaration confirmations include a paid amount)
|
|
11
|
+
* referenceNumber receipt / 申报编号 / 文号
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
"use strict";
|
|
15
|
+
|
|
16
|
+
const {
|
|
17
|
+
extractAmounts,
|
|
18
|
+
selectPrimaryAmount,
|
|
19
|
+
extractOrderNumbers,
|
|
20
|
+
} = require("./utils");
|
|
21
|
+
|
|
22
|
+
// Document-type detection table. extractGovernment takes the FIRST entry
// with a matching pattern, so order matters.
// NOTE(review): social_security's 医疗保险 also matches text containing
// 医疗保险卡, so health_insurance is effectively only reached via 医保 —
// confirm whether that precedence is intended.
const DOC_TYPE_KEYWORDS = [
  { type: "tax_declaration", patterns: [/(完税|个税|纳税申报|增值税|enterprise\s*income\s*tax)/i] },
  { type: "social_security", patterns: [/(社保|社会保险|医疗保险|工伤|生育|失业)/i] },
  { type: "housing_fund", patterns: [/(住房公积金|公积金|housing\s*fund)/i] },
  { type: "real_estate", patterns: [/(不动产登记|房产证|产权登记|real\s*estate\s*registration)/i] },
  { type: "immigration", patterns: [/(出入境|护照|签证|户籍|户口|immigration)/i] },
  { type: "health_insurance", patterns: [/(医保|医疗保险卡)/i] },
];

// Keyword + optional separator + value. The separator may be an ASCII colon
// or the fullwidth colon (U+FF1A) common in Chinese text.
// Group 2 of REF_NUMBER_KEYWORDS is the reference number.
const REF_NUMBER_KEYWORDS = /(申报编号|文号|receipt\s*number|reference\s*number|案卷号|流水号)\s*[:\uFF1A]?\s*([A-Z0-9][-A-Z0-9]{4,30})/i;
// Groups 2 and 3 of PERIOD_KEYWORDS are the year and month.
const PERIOD_KEYWORDS = /(税期|缴费月份|缴款期间|属期|期间)\s*[:\uFF1A]?\s*(\d{4})(?:[-/.年]|\s)\s*(\d{1,2})/i;
|
|
33
|
+
|
|
34
|
+
/**
 * Extract fields from an email classified as `government`: document type,
 * agency, period ("YYYY-MM"), primary amount and a reference number.
 * Undetected optional fields are omitted; `documentType` is always present
 * and falls back to "other" (which also adds a warning).
 *
 * @param {object} email
 * @param {object} [_opts] currently unused by this extractor
 * @returns {Promise<{template:"government",fields:object,confidence:number,warnings:string[]}>}
 */
async function extractGovernment(email, _opts = {}) {
  const warnings = [];
  const combined = collectSearchableText(email);

  // ── documentType: first table entry with a matching pattern ──────
  const matchedType = DOC_TYPE_KEYWORDS.find((entry) =>
    entry.patterns.some((re) => re.test(combined))
  );
  const documentType = matchedType ? matchedType.type : "other";

  // ── agency: display name, else the lower-cased sender domain ─────
  let agencyName = null;
  const sender = Array.isArray(email.from) ? email.from[0] : null;
  if (sender) {
    if (sender.name) {
      agencyName = sender.name;
    } else {
      const address = (sender.address || "").toLowerCase();
      const atPos = address.lastIndexOf("@");
      if (atPos >= 0) agencyName = address.slice(atPos + 1);
    }
  }

  // ── period (YYYY-MM) ─────────────────────────────────────────────
  let period = null;
  const periodMatch = combined.match(PERIOD_KEYWORDS);
  if (periodMatch) {
    const year = Number(periodMatch[2]);
    const month = Number(periodMatch[3]);
    const yearOk = year >= 1970 && year <= 2099;
    const monthOk = month >= 1 && month <= 12;
    if (yearOk && monthOk) {
      period = `${year}-${String(month).padStart(2, "0")}`;
    }
  }

  // ── amount (e.g. tax paid / fee charged) ─────────────────────────
  const primary = selectPrimaryAmount(extractAmounts(combined));
  const amount = primary ? { value: primary.value, currency: primary.currency } : null;

  // ── reference number: keyworded match, else first order-number hit
  let referenceNumber = null;
  const refMatch = combined.match(REF_NUMBER_KEYWORDS);
  if (refMatch) {
    referenceNumber = refMatch[2];
  } else {
    const [firstHit] = extractOrderNumbers(combined);
    if (firstHit) referenceNumber = firstHit.orderNumber;
  }

  if (documentType === "other") warnings.push("documentType could not be narrowed");

  const fields = { documentType };
  if (agencyName) fields.agencyName = agencyName;
  if (period) fields.period = period;
  if (amount) fields.amount = amount;
  if (referenceNumber) fields.referenceNumber = referenceNumber;

  return {
    template: "government",
    fields,
    confidence: confidenceFor(fields),
    warnings,
  };
}
|
|
103
|
+
|
|
104
|
+
/**
 * Build the single string the government extractor searches: the subject
 * and the body joined by a newline. The plain-text body is preferred; the
 * HTML body is used only when there is no plain-text body.
 *
 * For HTML, <style>/<script> blocks are removed entirely (so CSS/JS numbers
 * are not mistaken for amounts or reference numbers), remaining tags become
 * spaces, and the non-breaking-space / ampersand entities are decoded so
 * keyword patterns with `\s*` separators can still match.
 *
 * @param {object} email
 * @returns {string}
 */
function collectSearchableText(email) {
  const parts = [];
  if (email.subject) parts.push(email.subject);

  if (email.textBody) {
    parts.push(email.textBody);
  } else if (email.htmlBody) {
    parts.push(
      String(email.htmlBody)
        .replace(/<style\b[^>]*>[\s\S]*?<\/style>/gi, "")
        .replace(/<script\b[^>]*>[\s\S]*?<\/script>/gi, "")
        .replace(/<[^>]+>/g, " ")
        .replace(/&(?:nbsp|#160);/g, " ")
        // Ampersand last, so an escaped entity is decoded only once.
        .replace(/&(?:amp|#38);/g, "&")
    );
  }
  return parts.join("\n");
}
|
|
113
|
+
|
|
114
|
+
/**
 * Coarse confidence score for a government extraction: the share of the
 * five tracked fields that carry a value, rounded to two decimals. A value
 * of "other" (the documentType fallback) does not count as populated.
 *
 * @param {object} fields
 * @returns {number} value in [0, 1]
 */
function confidenceFor(fields) {
  const trackedKeys = ["documentType", "agencyName", "period", "amount", "referenceNumber"];

  let present = 0;
  for (const key of trackedKeys) {
    const value = fields[key];
    if (value == null || value === "other") continue;
    present += 1;
  }

  const ratio = present / trackedKeys.length;
  return Math.round(ratio * 100) / 100;
}
|
|
119
|
+
|
|
120
|
+
module.exports = { extractGovernment };
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Phase 5.4 — template dispatcher.
|
|
3
|
+
*
|
|
4
|
+
* Routes a classified email through the appropriate field extractor:
|
|
5
|
+
*
|
|
6
|
+
* bill_bank / bill_credit → bill
|
|
7
|
+
* order → order
|
|
8
|
+
* travel → travel
|
|
9
|
+
* government → government
|
|
10
|
+
* register → register
|
|
11
|
+
* notify / other / null → other
|
|
12
|
+
*
|
|
13
|
+
* The dispatcher is the SINGLE entry point used by EmailAdapter.sync —
|
|
14
|
+
* adding a new category later means adding a row to CATEGORY_TO_EXTRACTOR
|
|
15
|
+
* and a new file, no other change.
|
|
16
|
+
*/
|
|
17
|
+
|
|
18
|
+
"use strict";
|
|
19
|
+
|
|
20
|
+
const { extractBill } = require("./bill");
|
|
21
|
+
const { extractOrder } = require("./order");
|
|
22
|
+
const { extractTravel } = require("./travel");
|
|
23
|
+
const { extractGovernment } = require("./government");
|
|
24
|
+
const { extractRegister } = require("./register");
|
|
25
|
+
const { extractOther } = require("./other");
|
|
26
|
+
|
|
27
|
+
// Frozen lookup from classification category to its field extractor.
// Both bill categories share the bill extractor; `notify` and `other`
// share the generic one.
const CATEGORY_TO_EXTRACTOR = Object.freeze(
  Object.fromEntries([
    ["bill_bank", extractBill],
    ["bill_credit", extractBill],
    ["order", extractOrder],
    ["travel", extractTravel],
    ["government", extractGovernment],
    ["register", extractRegister],
    ["notify", extractOther],
    ["other", extractOther],
  ])
);
|
|
37
|
+
|
|
38
|
+
/**
 * Dispatch an email to the field extractor for its classification category.
 * Always resolves to a result object:
 *   - a missing / non-object email yields an `other` result with a warning;
 *   - a missing or unknown category falls through to the `other` extractor;
 *   - an extractor that throws yields an empty result whose warning carries
 *     the error message.
 *
 * Only the table's OWN keys count as known categories, so a category such
 * as "constructor" cannot resolve to an inherited Object.prototype member.
 *
 * @param {object} email — same shape EmailAdapter uses internally:
 *                         { from, subject, textBody, htmlBody, headers, attachments }
 * @param {object} classification — Phase 5.3 result; `{category}` minimum
 * @param {object} [opts] passed through to the extractor unchanged
 * @param {{chat:Function}} [opts.llm]
 * @returns {Promise<{template:string,fields:object,confidence:number,warnings:string[]}>}
 */
async function extractFields(email, classification, opts = {}) {
  if (!email || typeof email !== "object") {
    return { template: "other", fields: {}, confidence: 0, warnings: ["email missing"] };
  }

  const category = classification && classification.category;
  const isKnown =
    typeof category === "string" &&
    Object.prototype.hasOwnProperty.call(CATEGORY_TO_EXTRACTOR, category);
  const extractor = isKnown ? CATEGORY_TO_EXTRACTOR[category] : extractOther;

  try {
    return await extractor(email, opts);
  } catch (err) {
    // Report the template the extractor would have produced: both bill
    // categories map to "bill", and the generic extractor to "other".
    let template = category;
    if (extractor === extractOther) template = "other";
    else if (extractor === extractBill) template = "bill";

    return {
      template,
      fields: {},
      confidence: 0,
      warnings: [`extractor threw: ${err && err.message ? err.message : err}`],
    };
  }
}
|
|
66
|
+
|
|
67
|
+
module.exports = {
|
|
68
|
+
extractFields,
|
|
69
|
+
CATEGORY_TO_EXTRACTOR,
|
|
70
|
+
// Direct re-exports so callers needing one specific extractor can
|
|
71
|
+
// bypass the dispatcher (handy in tests).
|
|
72
|
+
extractBill,
|
|
73
|
+
extractOrder,
|
|
74
|
+
extractTravel,
|
|
75
|
+
extractGovernment,
|
|
76
|
+
extractRegister,
|
|
77
|
+
extractOther,
|
|
78
|
+
};
|