@chainlesschain/personal-data-hub 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/__tests__/adapters/ai-chat-history.test.js +395 -0
- package/__tests__/adapters/ai-chat-http-client.test.js +242 -0
- package/__tests__/adapters/ai-chat-vendors.test.js +733 -0
- package/__tests__/adapters/alipay-bill-adapter.test.js +538 -0
- package/__tests__/adapters/email-adapter.test.js +138 -1
- package/__tests__/adapters/email-classifier.test.js +347 -0
- package/__tests__/adapters/email-pdf-extractor.test.js +529 -0
- package/__tests__/adapters/email-retry-progress.test.js +294 -0
- package/__tests__/adapters/email-templates.test.js +699 -0
- package/__tests__/adapters/system-data-adapter.test.js +440 -0
- package/__tests__/adapters/system-data-disclosure.test.js +153 -0
- package/__tests__/analysis-skills.test.js +409 -0
- package/__tests__/entity-resolver-ingest-hook.test.js +177 -0
- package/__tests__/entity-resolver-stages.test.js +411 -0
- package/__tests__/entity-resolver-vault.test.js +246 -0
- package/__tests__/entity-resolver.test.js +526 -0
- package/__tests__/fixtures/entity-resolver-200-mock.json +96 -0
- package/__tests__/longtail-adapters.test.js +217 -0
- package/__tests__/mobile-extractor.test.js +288 -0
- package/__tests__/shopping-adapters.test.js +296 -0
- package/__tests__/sidecar-contacts-cross-validate.test.js +163 -0
- package/__tests__/sidecar-supervisor.test.js +120 -0
- package/__tests__/social-adapters.test.js +206 -0
- package/__tests__/travel-adapters.test.js +325 -0
- package/__tests__/vault.test.js +3 -3
- package/__tests__/wechat-adapter.test.js +476 -0
- package/__tests__/whatsapp-adapter.test.js +135 -0
- package/lib/adapter-spec.js +12 -0
- package/lib/adapters/_python-sidecar-base.js +207 -0
- package/lib/adapters/ai-chat-history/ai-chat-adapter.js +335 -0
- package/lib/adapters/ai-chat-history/cookie-auth.js +109 -0
- package/lib/adapters/ai-chat-history/http-client.js +211 -0
- package/lib/adapters/ai-chat-history/index.js +28 -0
- package/lib/adapters/ai-chat-history/schema-map.js +221 -0
- package/lib/adapters/ai-chat-history/vendor-spec.js +85 -0
- package/lib/adapters/ai-chat-history/vendors/coze.js +179 -0
- package/lib/adapters/ai-chat-history/vendors/deepseek.js +199 -0
- package/lib/adapters/ai-chat-history/vendors/dreamina.js +174 -0
- package/lib/adapters/ai-chat-history/vendors/hunyuan.js +176 -0
- package/lib/adapters/ai-chat-history/vendors/kimi.js +182 -0
- package/lib/adapters/ai-chat-history/vendors/qianfan.js +160 -0
- package/lib/adapters/ai-chat-history/vendors/tongyi.js +193 -0
- package/lib/adapters/ai-chat-history/vendors/zhipu.js +202 -0
- package/lib/adapters/alipay-bill/alipay-bill-adapter.js +307 -0
- package/lib/adapters/alipay-bill/counterparty.js +129 -0
- package/lib/adapters/alipay-bill/csv-parser.js +217 -0
- package/lib/adapters/alipay-bill/index.js +41 -0
- package/lib/adapters/alipay-bill/zip-decryptor.js +111 -0
- package/lib/adapters/email-imap/classifier.js +495 -0
- package/lib/adapters/email-imap/email-adapter.js +419 -8
- package/lib/adapters/email-imap/index.js +42 -0
- package/lib/adapters/email-imap/pdf-extractor.js +192 -0
- package/lib/adapters/email-imap/templates/bill.js +232 -0
- package/lib/adapters/email-imap/templates/government.js +120 -0
- package/lib/adapters/email-imap/templates/index.js +78 -0
- package/lib/adapters/email-imap/templates/order.js +186 -0
- package/lib/adapters/email-imap/templates/other.js +114 -0
- package/lib/adapters/email-imap/templates/register.js +113 -0
- package/lib/adapters/email-imap/templates/travel.js +157 -0
- package/lib/adapters/email-imap/templates/utils.js +275 -0
- package/lib/adapters/email-imap/transactions.js +234 -0
- package/lib/adapters/messaging-qq/index.js +158 -0
- package/lib/adapters/messaging-telegram/index.js +142 -0
- package/lib/adapters/messaging-whatsapp/index.js +189 -0
- package/lib/adapters/shopping-base/index.js +208 -0
- package/lib/adapters/shopping-jd/index.js +150 -0
- package/lib/adapters/shopping-meituan/index.js +154 -0
- package/lib/adapters/shopping-taobao/index.js +176 -0
- package/lib/adapters/social-bilibili/index.js +171 -0
- package/lib/adapters/social-douyin/index.js +116 -0
- package/lib/adapters/social-weibo/index.js +164 -0
- package/lib/adapters/social-xiaohongshu/index.js +96 -0
- package/lib/adapters/system-data/disclosure.js +166 -0
- package/lib/adapters/system-data/index.js +34 -0
- package/lib/adapters/system-data/system-data-adapter.js +344 -0
- package/lib/adapters/travel-12306/index.js +151 -0
- package/lib/adapters/travel-amap/index.js +164 -0
- package/lib/adapters/travel-baidu-map/index.js +162 -0
- package/lib/adapters/travel-base/index.js +240 -0
- package/lib/adapters/travel-ctrip/index.js +151 -0
- package/lib/adapters/wechat/content-parser.js +326 -0
- package/lib/adapters/wechat/db-reader.js +209 -0
- package/lib/adapters/wechat/index.js +28 -0
- package/lib/adapters/wechat/key-extractor.js +158 -0
- package/lib/adapters/wechat/normalize.js +220 -0
- package/lib/adapters/wechat/wechat-adapter.js +205 -0
- package/lib/analysis-skills/base.js +113 -0
- package/lib/analysis-skills/footprint.js +167 -0
- package/lib/analysis-skills/index.js +58 -0
- package/lib/analysis-skills/interests.js +161 -0
- package/lib/analysis-skills/relations.js +226 -0
- package/lib/analysis-skills/spending.js +216 -0
- package/lib/analysis-skills/timeline.js +167 -0
- package/lib/entity-resolver/embedding-stage.js +198 -0
- package/lib/entity-resolver/entity-resolver.js +384 -0
- package/lib/entity-resolver/index.js +42 -0
- package/lib/entity-resolver/llm-stage.js +191 -0
- package/lib/entity-resolver/rule-stage.js +208 -0
- package/lib/entity-resolver/worker.js +149 -0
- package/lib/index.js +115 -0
- package/lib/migrations.js +73 -0
- package/lib/mobile-extractor/android.js +193 -0
- package/lib/mobile-extractor/index.js +9 -0
- package/lib/mobile-extractor/ios.js +223 -0
- package/lib/registry.js +42 -0
- package/lib/sidecar/index.js +15 -0
- package/lib/sidecar/supervisor.js +359 -0
- package/lib/vault.js +266 -0
- package/package.json +29 -3
- package/scripts/_make-fixture-all.js +126 -0
- package/scripts/_make-fixture-contacts.js +84 -0
- package/scripts/evaluate-entity-resolver.js +213 -0
- package/scripts/smoke-phase-5-5.js +196 -0
- package/scripts/smoke-phase-5-7.js +181 -0
- package/scripts/smoke-system-data-contacts.js +309 -0
- package/scripts/smoke-system-data.js +312 -0
|
@@ -0,0 +1,275 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared regex extraction helpers for the 6 Phase 5.4 template extractors.
|
|
3
|
+
*
|
|
4
|
+
* Design notes:
|
|
5
|
+
* - Regex-first. We aim to extract the high-value structured fields
|
|
6
|
+
* (金额 / 日期 / 卡尾 4 位 / 订单号 / 快递单号) without paying LLM
|
|
7
|
+
* tokens per email. Each helper returns ALL matches (an adapter can
|
|
8
|
+
* pick the most plausible one when context is needed) and degrades
|
|
9
|
+
* to [] / null for unrecognized input — never throws.
|
|
10
|
+
* - LLM enrichment lives in per-template files when callers pass a
|
|
11
|
+
* `llm` opt; this module stays free of LLM dependencies so it can
|
|
12
|
+
* be unit-tested without mocks.
|
|
13
|
+
* - Chinese-first patterns. ¥ and 元 are the primary currency
|
|
14
|
+
* markers; CNY / RMB and $/USD/EUR are also handled for emails
|
|
15
|
+
* from foreign vendors.
|
|
16
|
+
* - Numbers can have thousands separators (1,234.50) and Chinese
|
|
17
|
+
 *   decimals (1,234.50 元 / 1234元). Chinese magnitude suffixes (1.5万 / 8千) are NOT parsed yet.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
"use strict";
|
|
21
|
+
|
|
22
|
+
// ─── Currency / amount ──────────────────────────────────────────────────
|
|
23
|
+
|
|
24
|
+
/**
 * Find plausible monetary amounts in free text.
 *
 * Two shapes are recognized (case-insensitive):
 *   - marker before the number: ¥1,234.50 / RMB 1,234 / CNY 99.00 / $99.99
 *   - marker after the number:  100元 / 99 USD / 50 港币 / 20 美元 / 5 欧元
 * The number may contain comma thousands separators and up to 4 decimals.
 * Zero and non-finite values are dropped.
 *
 * Every hit is passed to `detectDirection`, which inspects the text around
 * the match (24 characters on each side at the time of writing) and returns
 * "in", "out" or undefined. `direction` is always present as a key and is
 * undefined when no marker was found, so the caller decides.
 *
 * Hits are returned in source order. Never throws; non-string or empty
 * input yields [].
 *
 * @param {string} text
 * @returns {Array<{value:number,currency:string,raw:string,direction?:string,index:number}>}
 */
function extractAmounts(text) {
  if (typeof text !== "string" || text.length === 0) return [];
  const results = [];

  // Group 1 = number that follows a leading marker, group 2 = number that
  // precedes a trailing marker (group 3). The regex is created per call
  // because the /g flag makes it stateful (lastIndex).
  const re = /(?:(?:[¥$€£]|RMB|CNY|USD|EUR|GBP|HKD)\s*([\d][\d,]*(?:\.\d{1,4})?)|([\d][\d,]*(?:\.\d{1,4})?)\s*(元|RMB|CNY|USD|EUR|HKD|港币|美元|欧元))/gi;
  let m;
  while ((m = re.exec(text)) !== null) {
    const numericStr = (m[1] || m[2] || "").replace(/,/g, "");
    const value = Number(numericStr);
    if (!Number.isFinite(value) || value <= 0) continue;

    const markerMatch = m[0].match(/[¥$€£]|RMB|CNY|USD|EUR|GBP|HKD|元|港币|美元|欧元/i);
    const currency = normalizeCurrency(markerMatch ? markerMatch[0] : "?");
    const direction = detectDirection(text, m.index, m[0].length);

    results.push({
      value,
      currency,
      // The number pattern is greedy about commas, so "¥1,234, then ..."
      // would otherwise report raw "¥1,234," — drop sentence punctuation.
      raw: m[0].replace(/,+$/, ""),
      direction,
      index: m.index,
    });
  }

  return results;
}
|
|
67
|
+
|
|
68
|
+
/**
 * Map a currency marker (symbol, ISO code or Chinese name) to an ISO 4217
 * code. Matching is case-insensitive for the Latin codes.
 *
 * Anything unrecognized — including non-string input — falls back to "CNY",
 * so this helper never throws.
 *
 * @param {string} marker e.g. "¥", "元", "rmb", "$", "美元", "HKD"
 * @returns {"CNY"|"USD"|"EUR"|"GBP"|"HKD"}
 */
function normalizeCurrency(marker) {
  if (typeof marker !== "string") return "CNY";
  switch (marker.toUpperCase()) {
    case "$":
    case "USD":
    case "美元":
      return "USD";
    case "€":
    case "EUR":
    case "欧元":
      return "EUR";
    case "£":
    case "GBP":
      return "GBP";
    case "HKD":
    case "港币":
      return "HKD";
    default:
      // "¥" / "元" / "RMB" / "CNY" and every unknown marker.
      return "CNY";
  }
}
|
|
77
|
+
|
|
78
|
+
/**
 * Guess whether the amount at text[idx, idx + len) is money coming in or
 * going out, from keywords in the 24 characters before and after it.
 *
 * Inbound markers are checked first, so a window containing both kinds
 * resolves to "in". "credit card(s)" is a payment instrument rather than a
 * credit entry and is deliberately not treated as an inbound marker.
 *
 * @param {string} text full source text
 * @param {number} idx  start offset of the amount match
 * @param {number} len  length of the amount match
 * @returns {"in"|"out"|undefined} undefined when no marker is nearby or
 *   `text` is not a string
 */
function detectDirection(text, idx, len) {
  if (typeof text !== "string") return undefined;
  const from = Math.max(0, idx - 24);
  const to = Math.min(text.length, idx + len + 24);
  const context = text.slice(from, to);
  if (/(退款|返还|到账|收入|入账|credit(?!\s*cards?)|refund|received)/i.test(context)) return "in";
  if (/(扣款|支付|支出|消费|应还|还款|账单|debit|charge|paid|due)/i.test(context)) return "out";
  return undefined;
}
|
|
86
|
+
|
|
87
|
+
// ─── Dates ──────────────────────────────────────────────────────────────
|
|
88
|
+
|
|
89
|
+
/**
 * Find date-like fragments. Recognizes:
 *   - 2026-05-19 / 2026/05/19 / 2026.05.19 / 2026年5月19日  (hasYear: true)
 *   - 5月19日                                               (hasYear: false)
 *   - 5/19/2026  (M/D/YYYY)                                 (hasYear: true)
 * Day-first forms such as 19-05-2026 are NOT recognized.
 *
 * Year-less dates get the year that places them closest to `nowMs`: the
 * current year by default, the previous year when the parsed month is more
 * than 6 months ahead of the current month (a "12月25日" read in January),
 * and the next year when it is more than 6 months behind (a "1月5日" read
 * in November). Callers needing strict semantics should rely on
 * `hasYear === true` results only.
 *
 * A fragment that overlaps an already-recognized date is skipped, so
 * "2026年5月19日" yields one full date and not an extra year-less "5月19日".
 *
 * Dates are built with the local-time `Date(y, m, d)` constructor. Results
 * are sorted by position. Never throws; non-string or empty input yields [].
 *
 * @param {string} text
 * @param {number} [nowMs] reference instant for year-less dates
 * @returns {Array<{raw:string,date:Date,index:number,hasYear:boolean}>}
 */
function extractDates(text, nowMs = Date.now()) {
  if (typeof text !== "string" || text.length === 0) return [];
  const out = [];
  const now = new Date(nowMs);
  const currentYear = now.getFullYear();
  const currentMonth = now.getMonth() + 1;

  // True when [start, end) intersects a fragment that is already recorded.
  const overlapsExisting = (start, end) =>
    out.some((o) => start < o.index + o.raw.length && o.index < end);

  let m;

  // YYYY-MM-DD / YYYY/MM/DD / YYYY.MM.DD / YYYY年M月D日
  const reIso = /(\d{4})[-/.年](\d{1,2})[-/.月](\d{1,2})日?/g;
  while ((m = reIso.exec(text)) !== null) {
    const y = +m[1];
    const mo = +m[2];
    const d = +m[3];
    if (isValidYMD(y, mo, d)) {
      out.push({ raw: m[0], date: new Date(y, mo - 1, d), index: m.index, hasYear: true });
    }
  }

  // M月D日 (Chinese, year-less)
  const reMd = /(\d{1,2})月(\d{1,2})日?/g;
  while ((m = reMd.exec(text)) !== null) {
    // The tail of a "YYYY年M月D日" match starts at a later index than the
    // full match, so compare spans rather than start offsets.
    if (overlapsExisting(m.index, m.index + m[0].length)) continue;
    const mo = +m[1];
    const d = +m[2];
    const monthsAhead = mo - currentMonth;
    let year = currentYear;
    if (monthsAhead < -6) year = currentYear + 1;
    else if (monthsAhead > 6) year = currentYear - 1;
    // Validate against the year actually used, so a day is judged in the
    // year it will be constructed with.
    if (isValidYMD(year, mo, d)) {
      out.push({ raw: m[0], date: new Date(year, mo - 1, d), index: m.index, hasYear: false });
    }
  }

  // M/D/YYYY (Western shorthand)
  const reUs = /\b(\d{1,2})\/(\d{1,2})\/(\d{4})\b/g;
  while ((m = reUs.exec(text)) !== null) {
    if (overlapsExisting(m.index, m.index + m[0].length)) continue;
    const mo = +m[1];
    const d = +m[2];
    const y = +m[3];
    if (isValidYMD(y, mo, d)) {
      out.push({ raw: m[0], date: new Date(y, mo - 1, d), index: m.index, hasYear: true });
    }
  }

  return out.sort((a, b) => a.index - b.index);
}
|
|
147
|
+
|
|
148
|
+
/**
 * True when (y, m, d) names a real calendar day between 1970 and 2099.
 *
 * The day is checked against the actual length of the month (leap years
 * included). Without that check `new Date(2026, 1, 30)` would silently roll
 * over to March 2 instead of being rejected.
 *
 * @param {number} y full year
 * @param {number} m month, 1-12
 * @param {number} d day of month, 1-31
 * @returns {boolean}
 */
function isValidYMD(y, m, d) {
  if (!Number.isInteger(y) || !Number.isInteger(m) || !Number.isInteger(d)) return false;
  if (y < 1970 || y > 2099 || m < 1 || m > 12 || d < 1) return false;
  // Day 0 of the following month is the last day of month `m`.
  const daysInMonth = new Date(y, m, 0).getDate();
  return d <= daysInMonth;
}
|
|
151
|
+
|
|
152
|
+
// ─── Account / card last-4 ──────────────────────────────────────────────
|
|
153
|
+
|
|
154
|
+
/**
 * Extract trailing 4-digit identifiers commonly used for card masking.
 * Recognizes:
 *   - 尾号 1234 / 尾四位 1234 / 卡号尾 1234 / 卡尾 1234 / 账号尾 1234,
 *     optionally followed by an ASCII or full-width colon
 *   - **** 1234 (two or more asterisks)
 *   - ending in 1234 / ending 1234 / last 4 digits 1234 (whole words only,
 *     so "spending in 2024" is not a hit)
 *
 * The digits must be exactly four: a longer digit run is not truncated, it
 * is rejected. Results are grouped by pattern in the order listed above,
 * not sorted by position. Never throws; non-string or empty input yields [].
 *
 * @param {string} text
 * @returns {Array<{raw:string,last4:string,index:number}>}
 */
function extractAccountTails(text) {
  if (typeof text !== "string" || text.length === 0) return [];
  const out = [];
  // Fresh literals per call: /g regexes carry lastIndex state.
  const patterns = [
    /(?:尾号|尾[四4]位|卡号尾|卡尾|账号尾)\s*[:\uFF1A]?\s*(\d{4})\b/g,
    /\*{2,}\s*(\d{4})\b/g,
    /\b(?:ending in|last\s*4\s*digits|ending\b)\s*(\d{4})\b/gi,
  ];
  for (const re of patterns) {
    let m;
    while ((m = re.exec(text)) !== null) {
      out.push({ raw: m[0], last4: m[1], index: m.index });
    }
  }
  return out;
}
|
|
176
|
+
|
|
177
|
+
// ─── Order / tracking numbers ──────────────────────────────────────────
|
|
178
|
+
|
|
179
|
+
/**
 * Extract order-number candidates. Recognizes (case-insensitive):
 *   订单号: 12345678 / 订单 12345678 / 订单编号: ABC1234567 / 订单号码 ...
 *   Order # 12345-67 / Order Number 12345 / Order ID A1B2C3
 * The separator may be an ASCII or a full-width colon.
 *
 * A candidate is 5-31 characters of letters, digits and hyphens, and must
 * contain at least one digit. Without that requirement the
 * case-insensitive pattern would report ordinary words ("Order
 * confirmation" → "confirmation") as order numbers.
 *
 * Never throws; non-string or empty input yields [].
 *
 * @param {string} text
 * @returns {Array<{raw:string,orderNumber:string,index:number}>}
 */
function extractOrderNumbers(text) {
  if (typeof text !== "string" || text.length === 0) return [];
  const out = [];
  // (?=[-A-Z0-9]*\d) — look ahead for a digit somewhere in the token.
  const re = /(?:订单(?:号|编号|号码)?|order(?:\s*number|\s*id|\s*#)?)\s*[:\uFF1A]?\s*(?=[-A-Z0-9]*\d)([A-Z0-9][-A-Z0-9]{4,30})\b/gi;
  let m;
  while ((m = re.exec(text)) !== null) {
    out.push({ raw: m[0], orderNumber: m[1], index: m.index });
  }
  return out;
}
|
|
199
|
+
|
|
200
|
+
/**
 * Tracking-number candidates. Recognizes Chinese express keywords
 * (快递单号 / 运单号 / 物流单号 / 物流号) and the generic markers
 * "tracking number", "tracking #", "track id" / "tracking id"
 * (case-insensitive), optionally followed by an ASCII or full-width colon.
 *
 * A candidate is 7-31 characters of letters, digits and hyphens and must
 * contain at least one digit, so phrases like "tracking number pending"
 * are not reported. No carrier-specific format validation is performed.
 *
 * Never throws; non-string or empty input yields [].
 *
 * @param {string} text
 * @returns {Array<{raw:string,trackingNumber:string,index:number}>}
 */
function extractTrackingNumbers(text) {
  if (typeof text !== "string" || text.length === 0) return [];
  const out = [];
  // (?=[-A-Z0-9]*\d) — look ahead for a digit somewhere in the token.
  const re = /(?:快递单号|运单号|物流单号|物流号|tracking\s*(?:number|#)|track(?:ing)?\s*id)\s*[:\uFF1A]?\s*(?=[-A-Z0-9]*\d)([A-Z0-9][-A-Z0-9]{6,30})\b/gi;
  let m;
  while ((m = re.exec(text)) !== null) {
    out.push({ raw: m[0], trackingNumber: m[1], index: m.index });
  }
  return out;
}
|
|
215
|
+
|
|
216
|
+
// ─── Verification codes (REDACTED in extra; only field captured is "yes") ──
|
|
217
|
+
|
|
218
|
+
/**
 * Detect OTP / verification-code phrases without ever exposing the code.
 *
 * A hit is a keyword (验证码 / verification code / OTP / 动态密码 / 安全码),
 * an optional connector (为 / 是 / is / ASCII or full-width colon) and a
 * run of 4-8 digits. "OTP" must start a word, so "footprint 2024" is not a
 * hit.
 *
 * Every digit in the returned `raw` is replaced by "*": per the original
 * design note, storing OTPs in the vault is a compliance red flag
 * (architecture-doc §9.2). Callers use `count` as a signal that the mail is
 * a registration / login message.
 *
 * Never throws; non-string or empty input yields { count: 0, hits: [] }.
 *
 * @param {string} text
 * @returns {{count:number, hits:Array<{raw:string,index:number}>}}
 */
function detectVerificationCodes(text) {
  if (typeof text !== "string" || text.length === 0) return { count: 0, hits: [] };
  const re = /(?:验证码|verification\s*code|\botp|动态密码|安全码)\s*(?:为|是|:|\uFF1A|is)?\s*\d{4,8}\b/gi;
  const hits = [];
  let m;
  while ((m = re.exec(text)) !== null) {
    // Mask before the value leaves this function.
    hits.push({ raw: m[0].replace(/\d/g, "*"), index: m.index });
  }
  return { count: hits.length, hits };
}
|
|
234
|
+
|
|
235
|
+
// ─── Helpers shared by templates ────────────────────────────────────────
|
|
236
|
+
|
|
237
|
+
/**
 * Pick the most plausible amount from a list (typically the output of
 * `extractAmounts`). Strategy:
 *   1. If any amount has a truthy `direction`, only those are considered.
 *   2. Larger `value` wins (statements tend to show the total prominently).
 *   3. Ties are broken by the smaller `index` (earliest position).
 *
 * Entries that are not objects or whose `value` is not a finite number are
 * ignored, so a malformed list degrades to null instead of throwing.
 * The input array is not mutated.
 *
 * @param {Array<{value:number,index:number,direction?:string}>} amounts
 * @returns {object|null} the chosen entry (same object reference), or null
 */
function selectPrimaryAmount(amounts) {
  if (!Array.isArray(amounts)) return null;
  const usable = amounts.filter(
    (a) => a !== null && typeof a === "object" && Number.isFinite(a.value)
  );
  if (usable.length === 0) return null;

  const directed = usable.filter((a) => a.direction);
  const pool = directed.length > 0 ? directed : usable;

  const byValueThenPosition = (a, b) => {
    if (b.value !== a.value) return b.value - a.value;
    return a.index - b.index;
  };
  // slice() so the caller's array order is left untouched.
  return pool.slice().sort(byValueThenPosition)[0];
}
|
|
252
|
+
|
|
253
|
+
/**
 * Turn a Date into its millisecond epoch value.
 *
 * Returns null for anything that is not a Date instance and for invalid
 * Dates (whose time value is NaN). Used by templates that emit dates into
 * Event extra, where the vault stores ms integers.
 *
 * @param {Date} date
 * @returns {number|null}
 */
function dateToMs(date) {
  const ms = date instanceof Date ? date.getTime() : NaN;
  if (Number.isFinite(ms)) {
    return ms;
  }
  return null;
}
|
|
262
|
+
|
|
263
|
+
module.exports = {
|
|
264
|
+
extractAmounts,
|
|
265
|
+
extractDates,
|
|
266
|
+
extractAccountTails,
|
|
267
|
+
extractOrderNumbers,
|
|
268
|
+
extractTrackingNumbers,
|
|
269
|
+
detectVerificationCodes,
|
|
270
|
+
selectPrimaryAmount,
|
|
271
|
+
dateToMs,
|
|
272
|
+
// exposed for tests
|
|
273
|
+
normalizeCurrency,
|
|
274
|
+
detectDirection,
|
|
275
|
+
};
|
|
@@ -0,0 +1,234 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Phase 5.5 — line-item transaction parser for Chinese bank-statement
|
|
3
|
+
* PDF text. Decrypted statements typically follow one of a few row
|
|
4
|
+
* formats; we go regex-first with a generic fallback.
|
|
5
|
+
*
|
|
6
|
+
* Output shape (per item):
|
|
7
|
+
* {
|
|
8
|
+
* occurredAtMs: number, // ms epoch of the transaction date
|
|
9
|
+
* description: string, // merchant + memo, trimmed
|
|
10
|
+
* amount: { value, currency, direction },
|
|
11
|
+
* balance?: { value, currency }, // running balance when present
|
|
12
|
+
* raw: string, // the original line for audit
|
|
13
|
+
* index: number, // line index within the body
|
|
14
|
+
* }
|
|
15
|
+
*
|
|
16
|
+
* Recognized formats (sample row patterns):
|
|
17
|
+
*
|
|
18
|
+
* 招商银行 / 中国银行 / 民生银行 (column-aligned with whitespace):
|
|
19
|
+
* `2026-04-15 星巴克 上海中山公园店 ¥-39.00 1,234.56`
|
|
20
|
+
*
|
|
21
|
+
* 工商银行 / 建设银行 (slash dates + 借/贷 prefix):
|
|
22
|
+
* `2026/04/15 借 39.00 星巴克 上海中山公园店 CNY 1,234.56`
|
|
23
|
+
*
|
|
24
|
+
* 交通银行 / 浦发银行 (Chinese date + tab/space):
|
|
25
|
+
* `2026年04月15日 星巴克 上海中山公园店 支出 39.00 余额 1234.56`
|
|
26
|
+
*
|
|
27
|
+
* Strategy:
|
|
28
|
+
* - One pass per line. Skip header / footer / legalese lines via
|
|
29
|
+
* a denylist.
|
|
30
|
+
* - Direction inferred from: explicit 借/贷 / 支出/收入 / + / - / 退款 keywords.
|
|
31
|
+
* - Amount currency defaults to CNY (Chinese statements rarely
|
|
32
|
+
* mix currencies); foreign-currency cards emit USD/EUR rows
|
|
33
|
+
* which we detect via prefix.
|
|
34
|
+
* - Returns an empty array when the text doesn't look like a
|
|
35
|
+
* statement (e.g. marketing PDF). Callers should not assume
|
|
36
|
+
* a non-empty list.
|
|
37
|
+
*/
|
|
38
|
+
|
|
39
|
+
"use strict";
|
|
40
|
+
|
|
41
|
+
// Lines that look like statement chrome / legalese — skip outright.
|
|
42
|
+
const SKIP_PATTERNS = [
|
|
43
|
+
/^[\s\d]*$/, // pure whitespace / numbers (page-number footers)
|
|
44
|
+
/^\s*第\s*\d+\s*页/, // 第 1 页 of 3
|
|
45
|
+
/^\s*page\s+\d+/i,
|
|
46
|
+
/声明|免责|提示|温馨提示|温馨提醒|请勿回复|本邮件由系统/,
|
|
47
|
+
/账单周期|账单日|还款日|信用额度|可用额度/,
|
|
48
|
+
/^\s*[-=]{3,}\s*$/,
|
|
49
|
+
];
|
|
50
|
+
|
|
51
|
+
// Direction keywords. extractTransactions tests DIRECTION_OUT before
// DIRECTION_IN, so a line carrying both kinds of keyword resolves to "out".
//
// Card-type words are excluded on purpose: 借记卡 / "debit card" and
// 贷记卡 / "credit card" name the payment instrument, not the direction of
// the entry, and must not decide the direction on their own.
const DIRECTION_OUT = /(支出|借方|借(?!记卡)|消费|扣款|paid|debit(?!\s*cards?)|charged?)/i;
const DIRECTION_IN = /(收入|贷方|贷(?!记卡)|退款|返还|到账|入账|credit(?!\s*cards?)|refund|received)/i;
|
|
54
|
+
|
|
55
|
+
// ── Row patterns (each returns a SHARED capture-group layout via .exec()):
|
|
56
|
+
// m.groups = { date, dateY, dateM, dateD, sign?, currency?, amount, balance?, desc }
|
|
57
|
+
// To keep regex sane we use a 2-pass approach: a "date-prefix" regex
|
|
58
|
+
// rooted at line start, then a follow-up amount/balance regex on the rest.
|
|
59
|
+
|
|
60
|
+
const DATE_PATTERNS = [
|
|
61
|
+
// 2026-04-15 / 2026/04/15 / 2026.04.15
|
|
62
|
+
/^\s*(?<dateY>\d{4})[-/.](?<dateM>\d{1,2})[-/.](?<dateD>\d{1,2})\s+/,
|
|
63
|
+
// 2026年04月15日
|
|
64
|
+
/^\s*(?<dateY>\d{4})年(?<dateM>\d{1,2})月(?<dateD>\d{1,2})日?\s+/,
|
|
65
|
+
// 04-15 (year-less, fall back to current year) — rare but BOC uses it
|
|
66
|
+
/^\s*(?<dateM>\d{1,2})[-/.](?<dateD>\d{1,2})\s+/,
|
|
67
|
+
];
|
|
68
|
+
|
|
69
|
+
// Amount: a signed (optional) currency-marked number. Negative or "-"
|
|
70
|
+
// prefix means OUT. Currency optional; defaults to CNY.
|
|
71
|
+
const AMOUNT_RE = /(?<sign>[+\-])?\s*(?:(?<cur>¥|RMB|CNY|USD|EUR|\$|€)\s*)?(?<amt>[\d][\d,]*(?:\.\d{1,4})?)\b/;
|
|
72
|
+
|
|
73
|
+
/**
 * Parse a bank-statement text body into transactions, one per line.
 *
 * Per line:
 *   1. Lines shorter than 6 characters or matching SKIP_PATTERNS are ignored.
 *   2. The line must start with one of DATE_PATTERNS. Year-less dates take
 *      the year of `nowMs`; if that would put the row more than two days
 *      after `nowMs`, the previous year is used instead, since a statement
 *      row cannot lie in the future (e.g. a "12-28" row in a statement read
 *      in January).
 *   3. Direction keywords are searched in the whole remainder of the line
 *      (OUT first, then IN). An explicit "-" / "+" sign on the chosen
 *      amount overrides the keyword result.
 *   4. With two or more numbers, the last one is taken as the running
 *      balance and the one before it as the amount; a single number is the
 *      amount.
 *   5. Rows whose description is empty after cleaning are dropped.
 *
 * Returns [] for non-string / empty input or when no line parses.
 *
 * @param {string} text decrypted PDF text (or any plain-text body)
 * @param {object} [opts]
 * @param {number} [opts.maxRows=500] cap to keep DoS-shaped statements bounded
 * @param {number} [opts.nowMs=Date.now()] reference for year-less dates
 * @returns {Array<object>} rows of { occurredAtMs, description,
 *   amount: { value, currency, direction? }, balance?: { value, currency },
 *   raw, index }
 */
function extractTransactions(text, opts = {}) {
  if (typeof text !== "string" || text.length === 0) return [];
  // A default parameter only covers `undefined`; tolerate an explicit null.
  const options = opts || {};
  const maxRows = Number.isFinite(options.maxRows) && options.maxRows > 0 ? options.maxRows : 500;
  const nowMs = Number.isFinite(options.nowMs) ? options.nowMs : Date.now();
  const now = new Date(nowMs);
  // Slack for time-zone skew between the statement and this machine.
  const FUTURE_TOLERANCE_MS = 2 * 24 * 60 * 60 * 1000;

  const lines = text.split(/\r?\n/);
  const out = [];

  for (let i = 0; i < lines.length && out.length < maxRows; i++) {
    const line = lines[i];
    if (!line || line.length < 6) continue;
    if (SKIP_PATTERNS.some((re) => re.test(line))) continue;

    // ── 1. find date prefix ──
    let dateMatch = null;
    for (const re of DATE_PATTERNS) {
      const m = re.exec(line);
      if (m) {
        dateMatch = m;
        break;
      }
    }
    if (!dateMatch) continue;

    const hasYear = Boolean(dateMatch.groups.dateY);
    let y = hasYear ? +dateMatch.groups.dateY : now.getFullYear();
    const mo = +dateMatch.groups.dateM;
    const d = +dateMatch.groups.dateD;
    if (!validYMD(y, mo, d)) continue;
    let occurredAtMs = new Date(y, mo - 1, d).getTime();
    if (!hasYear && occurredAtMs > nowMs + FUTURE_TOLERANCE_MS && validYMD(y - 1, mo, d)) {
      y -= 1;
      occurredAtMs = new Date(y, mo - 1, d).getTime();
    }

    // ── 2. rest of line — direction keywords and amount(s) ──
    const rest = line.slice(dateMatch[0].length);

    let direction;
    if (DIRECTION_OUT.test(rest)) direction = "out";
    else if (DIRECTION_IN.test(rest)) direction = "in";

    const amountMatches = collectAmounts(rest);
    if (amountMatches.length === 0) continue;

    // Last number = running balance, the one before it = amount.
    let amount;
    let balance;
    if (amountMatches.length >= 2) {
      amount = amountMatches[amountMatches.length - 2];
      balance = amountMatches[amountMatches.length - 1];
    } else {
      amount = amountMatches[0];
    }

    // An explicit sign beats keywords. Without either, direction stays
    // undefined and the caller may treat it as "out" for bills.
    if (amount.sign === "-") direction = "out";
    else if (amount.sign === "+") direction = "in";

    // ── 3. description = rest minus the amount tokens ──
    const description = cleanDescription(rest, amountMatches);
    if (description.length === 0) continue;

    out.push({
      occurredAtMs,
      description,
      amount: {
        value: amount.value,
        currency: amount.currency,
        ...(direction ? { direction } : {}),
      },
      ...(balance ? { balance: { value: balance.value, currency: balance.currency } } : {}),
      raw: line.trim(),
      index: i,
    });
  }

  return out;
}
|
|
167
|
+
|
|
168
|
+
// ─── helpers ─────────────────────────────────────────────────────────────
|
|
169
|
+
|
|
170
|
+
/**
 * True when (y, m, d) names a real calendar day between 1970 and 2099.
 *
 * The day is checked against the actual length of the month (leap years
 * included); otherwise `new Date(2026, 1, 30)` would silently roll over to
 * March 2 and the row would be booked on the wrong day.
 *
 * @param {number} y full year
 * @param {number} m month, 1-12
 * @param {number} d day of month, 1-31
 * @returns {boolean}
 */
function validYMD(y, m, d) {
  if (!Number.isInteger(y) || !Number.isInteger(m) || !Number.isInteger(d)) return false;
  if (y < 1970 || y > 2099 || m < 1 || m > 12 || d < 1) return false;
  // Day 0 of the following month is the last day of month `m`.
  return d <= new Date(y, m, 0).getDate();
}
|
|
173
|
+
|
|
174
|
+
/**
 * Collect every amount-like token in `text` using AMOUNT_RE (re-compiled
 * with the global + case-insensitive flags).
 *
 * Each hit carries the positive numeric value, the normalized currency, the
 * optional sign, the raw matched text and its span (index / length), so
 * callers can cut the token out of the line.
 *
 * Dropped: zero / non-finite values, and bare 4-digit integers between 1900
 * and 2099 with neither a sign nor a currency marker (they are almost
 * always years, not amounts).
 *
 * AMOUNT_RE expects the sign before the currency marker ("-¥39.00"). For
 * the marker-before-sign layout ("¥-39.00", "$-39.00") the match starts at
 * the sign and the marker would be lost, so the text immediately before a
 * signed, marker-less match is checked for a currency marker. The reported
 * span (index / length / raw) still covers only the AMOUNT_RE match.
 *
 * @param {string} text
 * @returns {Array<{value:number,currency:string,sign:(string|undefined),raw:string,index:number,length:number}>}
 */
function collectAmounts(text) {
  const out = [];
  const re = new RegExp(AMOUNT_RE.source, "gi");
  const leadingMarker = /(¥|RMB|CNY|USD|EUR|\$|€)\s*$/i;
  let m;
  while ((m = re.exec(text)) !== null) {
    const groups = m.groups || {};
    const numericStr = (groups.amt || "").replace(/,/g, "");
    const value = Number(numericStr);
    if (!Number.isFinite(value) || value <= 0) continue;
    // Year guard for date-only fragments such as a stray "2026".
    if (!groups.cur && !groups.sign && /^\d{4}$/.test(numericStr) && value >= 1900 && value <= 2099) continue;

    let marker = groups.cur;
    if (!marker && groups.sign) {
      // Longest marker is 3 chars; 8 leaves room for whitespace.
      const lead = leadingMarker.exec(text.slice(Math.max(0, m.index - 8), m.index));
      if (lead) marker = lead[1];
    }

    out.push({
      value,
      currency: normalizeCurrency(marker),
      sign: groups.sign,
      raw: m[0],
      index: m.index,
      length: m[0].length,
    });
  }
  return out;
}
|
|
198
|
+
|
|
199
|
+
/**
 * Translate the currency marker captured by AMOUNT_RE into an ISO code.
 * A missing marker, and any marker that is not a dollar or euro form,
 * resolves to "CNY".
 *
 * @param {string} [marker] e.g. "¥", "RMB", "$", "usd", "€"
 * @returns {"CNY"|"USD"|"EUR"}
 */
function normalizeCurrency(marker) {
  if (!marker) return "CNY";
  switch (marker.toUpperCase()) {
    case "$":
    case "USD":
      return "USD";
    case "€":
    case "EUR":
      return "EUR";
    default:
      // "¥" / "元" / "RMB" / "CNY" and anything unrecognized.
      return "CNY";
  }
}
|
|
207
|
+
|
|
208
|
+
/**
 * Build the human-readable description for a statement row.
 *
 * Steps:
 *   1. Cut every amount token (by its index / length span) out of `text`,
 *      working from the rightmost span so earlier offsets stay valid.
 *   2. Remove direction and balance keywords — they belong in
 *      `amount.direction` / `balance`, not in the label.
 *   3. Remove currency symbols and list punctuation (ASCII and full-width),
 *      then collapse whitespace.
 *
 * Keyword removal is deliberately narrow so merchant names survive:
 *   - 支出 / 收入 / 借方 / 贷方 are removed wherever they occur;
 *   - a lone 借 / 贷 is removed only when it is not attached to another
 *     Chinese character (so 借记卡 and 贷款 are kept);
 *   - English keywords are removed only as whole words (so "Prepaid" and
 *     "Creditas" are kept).
 *
 * @param {string} text the line remainder after the date prefix
 * @param {Array<{index:number,length:number}>} amountMatches spans to cut
 * @returns {string} trimmed description, possibly empty
 */
function cleanDescription(text, amountMatches) {
  const rightToLeft = amountMatches.slice().sort((a, b) => b.index - a.index);
  let s = text;
  for (const m of rightToLeft) {
    s = s.slice(0, m.index) + " " + s.slice(m.index + m.length);
  }
  return s
    .replace(/支出|收入|借方|贷方/g, " ")
    .replace(/(?<![\u4e00-\u9fff])[借贷](?![\u4e00-\u9fff])/g, " ")
    .replace(/\b(?:debit|credit|paid|charged?|refunded?)\b/gi, " ")
    .replace(/(余额|balance)\s*[:\uFF1A]?/gi, " ")
    .replace(/[¥$€]\s*/g, " ")
    .replace(/[,\uFF0C;\uFF1B|]+/g, " ")
    .replace(/\s+/g, " ")
    .trim();
}
|
|
231
|
+
|
|
232
|
+
module.exports = {
|
|
233
|
+
extractTransactions,
|
|
234
|
+
};
|
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Phase 13.5 — QQ adapter.
|
|
3
|
+
*
|
|
4
|
+
* Per sjqz/parsers/qq.py QQParser. QQ DBs (msg.db / messages.db) are
|
|
5
|
+
* SQLCipher-encrypted with a per-installation key — Phase 13.5b will
|
|
6
|
+
* port the QQ key extractor; v0.5 accepts a `keyProvider` like WeChat.
|
|
7
|
+
*
|
|
8
|
+
* Tables:
|
|
9
|
+
* - mr_friend friend contacts
|
|
10
|
+
* - mr_troop groups
|
|
11
|
+
* - mr_buddy_groupbuddy group members
|
|
12
|
+
* - msgcsr_friend_* friend messages (per-buddy table sharding)
|
|
13
|
+
* - msgcsr_troop_* group messages
|
|
14
|
+
*/
|
|
15
|
+
|
|
16
|
+
"use strict";
|
|
17
|
+
|
|
18
|
+
const fs = require("node:fs");
|
|
19
|
+
const { newId } = require("../../ids");
|
|
20
|
+
|
|
21
|
+
const NAME = "messaging-qq";
|
|
22
|
+
const VERSION = "0.5.0";
|
|
23
|
+
|
|
24
|
+
/**
 * Adapter that reads a pulled, SQLCipher-encrypted QQ database and turns its
 * rows into hub records (contacts, groups, messages).
 *
 * Constructor options:
 *   - `account.qq`       required; the constructor throws without it.
 *   - `dbPath`           path of the pulled database file.
 *   - `keyProvider`      object exposing `getKey()`; its (possibly async)
 *                        result is used as the database key.
 *   - `dbDriverFactory`  optional function returning a driver constructor.
 *                        When absent, `better-sqlite3-multiple-ciphers` is
 *                        required lazily inside `sync()`.
 */
class QQAdapter {
  /**
   * @param {object} [opts]
   * @throws {Error} when `opts.account.qq` is missing.
   */
  constructor(opts = {}) {
    if (!opts.account || !opts.account.qq) {
      throw new Error("QQAdapter: opts.account.qq required");
    }
    this.account = opts.account;
    this._dbPath = opts.dbPath || null;
    this._keyProvider = opts.keyProvider || null;
    this._dbDriverFactory = opts.dbDriverFactory || null;

    this.name = NAME;
    this.version = VERSION;
    this.capabilities = ["sync:sqlite", "parse:qq-messages", "decrypt:sqlcipher"];
    this.extractMode = "device-pull";
    this.rateLimits = {};
    this.dataDisclosure = {
      fields: [
        "qq:friends (uin / nickname / remark)",
        "qq:groups (troop_uin / name)",
        "qq:messages (peer / content / time / type)",
      ],
      sensitivity: "high",
      legalGate: true,
    };
  }

  /**
   * Check that the prerequisites for a sync are present. Does not open the
   * database.
   *
   * @returns {Promise<{ok: boolean, reason?: string, account?: *}>}
   *   `DB_NOT_PULLED` when the database file is missing, `NO_KEY_PROVIDER`
   *   when no provider with a `getKey` function was configured.
   */
  async authenticate() {
    if (!this._dbPath || !fs.existsSync(this._dbPath)) {
      return { ok: false, reason: "DB_NOT_PULLED" };
    }
    if (!this._keyProvider || typeof this._keyProvider.getKey !== "function") {
      return { ok: false, reason: "NO_KEY_PROVIDER" };
    }
    return { ok: true, account: this.account.qq };
  }

  /**
   * Same checks as `authenticate()`; on success reports the check time
   * instead of the account.
   *
   * @returns {Promise<object>}
   */
  async healthCheck() {
    const r = await this.authenticate();
    return r.ok ? { ok: true, lastChecked: Date.now() } : r;
  }

  /**
   * Open the database read-only, apply the key and yield raw records for
   * friends, groups and every `msgcsr_friend_*` / `msgcsr_troop_*` table.
   * Yields nothing when the file, the key provider or the key is missing.
   * Tables that cannot be queried are skipped (see `trySelect`).
   *
   * @param {object} [opts]
   * @param {string} [opts.dbPath] overrides the path given to the constructor.
   * @yields {{adapter: string, originalId: string, capturedAt: number, payload: object}}
   */
  async *sync(opts = {}) {
    const dbPath = opts.dbPath || this._dbPath;
    if (!dbPath || !fs.existsSync(dbPath)) return;
    // Mirror authenticate(): a provider without getKey() is treated as absent
    // instead of crashing with a TypeError.
    if (!this._keyProvider || typeof this._keyProvider.getKey !== "function") return;
    const key = await this._keyProvider.getKey();
    if (!key) return;

    const Driver = this._dbDriverFactory
      ? this._dbDriverFactory()
      : require("better-sqlite3-multiple-ciphers");
    const db = new Driver(dbPath, { readonly: true });
    try {
      // The key is embedded as a SQL string literal, so quotes must be doubled.
      db.pragma(`key = ${qqSqlStringLiteral(key)}`);

      // Friends
      const friends = trySelect(db, "SELECT * FROM mr_friend LIMIT 5000") || [];
      for (const row of friends) {
        yield { adapter: NAME, originalId: `friend-${row.uin}`, capturedAt: Date.now(), payload: { row, kind: "contact" } };
      }

      // Groups
      const groups = trySelect(db, "SELECT * FROM mr_troop LIMIT 1000") || [];
      for (const row of groups) {
        yield { adapter: NAME, originalId: `group-${row.troop_uin}`, capturedAt: Date.now(), payload: { row, kind: "group" } };
      }

      // Messages — QQ shards by buddy. Iterate any msgcsr_friend_* / msgcsr_troop_* table.
      const tables = trySelect(db, "SELECT name FROM sqlite_master WHERE type='table' AND (name LIKE 'msgcsr_friend_%' OR name LIKE 'msgcsr_troop_%')") || [];
      for (const t of tables) {
        // Table names come from the pulled database itself; quote them so an
        // unusual name cannot alter the statement.
        const msgs = trySelect(db, `SELECT * FROM ${qqSqlIdentifier(t.name)} ORDER BY time DESC LIMIT 1000`) || [];
        for (const row of msgs) {
          yield {
            adapter: NAME,
            originalId: `msg-${row.msgid || row._id}`,
            // Fall back to "now" when the row time cannot be parsed, matching normalize().
            capturedAt: parseTime(row.time) || Date.now(),
            payload: { row, kind: "message", table: t.name },
          };
        }
      }
    } finally {
      // Best-effort close: a failure here must not mask an error from the body.
      try { db.close(); } catch (_e) {}
    }
  }

  /**
   * Convert one raw record produced by `sync()` into hub entities.
   *
   * @param {{originalId: string, payload: {kind: string, row: object, table?: string}}} raw
   * @returns {{events: object[], persons: object[], places: object[], items: object[], topics: object[]}}
   *   A person for `kind === "contact"`, a topic for `kind === "group"`,
   *   otherwise a message event.
   */
  normalize(raw) {
    const { kind, row } = raw.payload;
    const now = Date.now();
    const occurredAt = parseTime(row.time) || now;
    const source = { adapter: NAME, adapterVersion: VERSION, originalId: raw.originalId, capturedAt: occurredAt, capturedBy: "sqlite" };

    if (kind === "contact") {
      return {
        events: [], places: [], items: [], topics: [],
        persons: [{
          id: `person-qq-${row.uin}`,
          type: "person", subtype: "contact",
          names: [row.remark, row.nickname, String(row.uin)].filter((x) => typeof x === "string" && x.length > 0),
          identifiers: { qqId: String(row.uin) },
          ingestedAt: now, source,
          extra: { fromAdapter: NAME, qq: row.uin },
        }],
      };
    }

    if (kind === "group") {
      return {
        events: [], places: [], items: [], persons: [],
        topics: [{
          id: `topic-qq-group-${row.troop_uin}`,
          type: "topic", name: row.troop_name || String(row.troop_uin),
          ingestedAt: now, source,
          extra: { fromAdapter: NAME, troopUin: row.troop_uin },
        }],
      };
    }

    // message
    const isGroup = (raw.payload.table || "").startsWith("msgcsr_troop_");
    // row.msg may be a non-string column value (e.g. a BLOB); coerce it once
    // so title and text are always strings.
    const text = qqMessageText(row.msg);
    return {
      events: [{
        id: newId(), type: "event", subtype: "message",
        occurredAt, actor: "person-self",
        content: { title: text.slice(0, 80) || "(空)", text },
        ingestedAt: now, source,
        extra: { peer: row.frienduin || row.troopuin, isGroup, msgType: row.msgtype },
      }],
      persons: [], places: [], items: [], topics: [],
    };
  }
}

/** Render a value as a single-quoted SQL string literal, doubling embedded quotes. */
function qqSqlStringLiteral(value) {
  return `'${String(value).replace(/'/g, "''")}'`;
}

/** Render a name as a double-quoted SQL identifier, doubling embedded quotes. */
function qqSqlIdentifier(name) {
  return `"${String(name).replace(/"/g, '""')}"`;
}

/** Coerce a message column value to a string; falsy values become "". */
function qqMessageText(value) {
  if (!value) return "";
  if (typeof value === "string") return value;
  if (typeof Buffer !== "undefined" && Buffer.isBuffer(value)) return value.toString("utf8");
  return String(value);
}
|
|
149
|
+
/**
 * Run a SELECT and return all rows, or `null` when preparing or executing the
 * statement throws (for example because the table does not exist).
 *
 * @param {object} db  driver handle exposing `prepare(sql).all()`.
 * @param {string} sql statement text.
 * @returns {Array|null}
 */
function trySelect(db, sql) {
  let rows;
  try {
    const statement = db.prepare(sql);
    rows = statement.all();
  } catch (_err) {
    rows = null;
  }
  return rows;
}
|
|
150
|
+
/**
 * Convert a timestamp-like value to epoch milliseconds.
 *
 * Finite numbers and all-digit strings are taken as milliseconds when greater
 * than 1e12, otherwise as seconds and multiplied by 1000. Other strings go
 * through `Date.parse`. Anything that cannot be interpreted yields `null`
 * (including a `Date.parse` result of 0).
 *
 * @param {*} v
 * @returns {number|null}
 */
function parseTime(v) {
  const scaleToMillis = (n) => (n > 1e12 ? n : n * 1000);

  if (typeof v === "string") {
    if (/^\d+$/.test(v)) {
      return scaleToMillis(parseInt(v, 10));
    }
    const parsed = Date.parse(v);
    return parsed ? parsed : null;
  }

  return Number.isFinite(v) ? scaleToMillis(v) : null;
}
|
|
158
|
+
// Public API: the adapter class plus its name and version constants.
module.exports = { QQAdapter, NAME, VERSION };
|