@chainlesschain/personal-data-hub 0.1.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (154) hide show
  1. package/__tests__/adapters/ai-chat-cookie-capture-spec.test.js +211 -0
  2. package/__tests__/adapters/ai-chat-health-checker.test.js +262 -0
  3. package/__tests__/adapters/ai-chat-history.test.js +396 -0
  4. package/__tests__/adapters/ai-chat-http-client.test.js +242 -0
  5. package/__tests__/adapters/ai-chat-vendors.test.js +874 -0
  6. package/__tests__/adapters/alipay-bill-adapter.test.js +538 -0
  7. package/__tests__/adapters/email-adapter.test.js +138 -1
  8. package/__tests__/adapters/email-classifier.test.js +347 -0
  9. package/__tests__/adapters/email-pdf-extractor.test.js +529 -0
  10. package/__tests__/adapters/email-retry-progress.test.js +294 -0
  11. package/__tests__/adapters/email-templates.test.js +699 -0
  12. package/__tests__/adapters/social-toutiao-kuaishou-scaffold.test.js +269 -0
  13. package/__tests__/adapters/system-data-adapter.test.js +440 -0
  14. package/__tests__/adapters/system-data-android-ingest.test.js +144 -0
  15. package/__tests__/adapters/system-data-android.test.js +387 -0
  16. package/__tests__/adapters/system-data-disclosure.test.js +153 -0
  17. package/__tests__/adapters/wechat-bootstrap.test.js +240 -0
  18. package/__tests__/adapters/wechat-env-probe.test.js +162 -0
  19. package/__tests__/adapters/wechat-frida-agent.test.js +191 -0
  20. package/__tests__/adapters/wechat-frida-integration.test.js +149 -0
  21. package/__tests__/adapters/wechat-frida-key-provider.test.js +188 -0
  22. package/__tests__/adapters/wechat-md5-key-provider.test.js +101 -0
  23. package/__tests__/analysis-skills.test.js +556 -0
  24. package/__tests__/analysis.test.js +329 -1
  25. package/__tests__/e2e/ai-chat-cross-source-journey.test.js +213 -0
  26. package/__tests__/e2e/full-user-journey.test.js +188 -0
  27. package/__tests__/entity-resolver-ingest-hook.test.js +177 -0
  28. package/__tests__/entity-resolver-stages.test.js +411 -0
  29. package/__tests__/entity-resolver-vault.test.js +246 -0
  30. package/__tests__/entity-resolver.test.js +526 -0
  31. package/__tests__/fixtures/entity-resolver-200-mock.json +96 -0
  32. package/__tests__/integration/ai-chat-history-registry.test.js +228 -0
  33. package/__tests__/integration/aichat-wizard-end-to-end.test.js +282 -0
  34. package/__tests__/integration/cross-adapter-pipelines.test.js +396 -0
  35. package/__tests__/integration/wechat-bootstrap-end-to-end.test.js +390 -0
  36. package/__tests__/longtail-adapters.test.js +217 -0
  37. package/__tests__/mobile-extractor.test.js +288 -0
  38. package/__tests__/registry.test.js +4 -2
  39. package/__tests__/shopping-adapters.test.js +296 -0
  40. package/__tests__/sidecar-contacts-cross-validate.test.js +163 -0
  41. package/__tests__/sidecar-supervisor.test.js +120 -0
  42. package/__tests__/social-adapters.test.js +206 -0
  43. package/__tests__/travel-adapters.test.js +325 -0
  44. package/__tests__/vault.test.js +3 -3
  45. package/__tests__/wechat-adapter.test.js +476 -0
  46. package/__tests__/whatsapp-adapter.test.js +135 -0
  47. package/lib/adapter-spec.js +12 -0
  48. package/lib/adapters/_python-sidecar-base.js +207 -0
  49. package/lib/adapters/ai-chat-history/ai-chat-adapter.js +374 -0
  50. package/lib/adapters/ai-chat-history/cookie-auth.js +109 -0
  51. package/lib/adapters/ai-chat-history/cookie-capture-spec.js +331 -0
  52. package/lib/adapters/ai-chat-history/health-checker.js +210 -0
  53. package/lib/adapters/ai-chat-history/http-client.js +211 -0
  54. package/lib/adapters/ai-chat-history/index.js +28 -0
  55. package/lib/adapters/ai-chat-history/schema-map.js +258 -0
  56. package/lib/adapters/ai-chat-history/vendor-spec.js +86 -0
  57. package/lib/adapters/ai-chat-history/vendors/coze.js +179 -0
  58. package/lib/adapters/ai-chat-history/vendors/deepseek.js +199 -0
  59. package/lib/adapters/ai-chat-history/vendors/doubao.js +255 -0
  60. package/lib/adapters/ai-chat-history/vendors/dreamina.js +174 -0
  61. package/lib/adapters/ai-chat-history/vendors/hunyuan.js +176 -0
  62. package/lib/adapters/ai-chat-history/vendors/kimi.js +182 -0
  63. package/lib/adapters/ai-chat-history/vendors/qianfan.js +160 -0
  64. package/lib/adapters/ai-chat-history/vendors/tongyi.js +193 -0
  65. package/lib/adapters/ai-chat-history/vendors/zhipu.js +202 -0
  66. package/lib/adapters/ai-chat-history/wizard-controller.js +473 -0
  67. package/lib/adapters/alipay-bill/alipay-bill-adapter.js +311 -0
  68. package/lib/adapters/alipay-bill/counterparty.js +129 -0
  69. package/lib/adapters/alipay-bill/csv-parser.js +217 -0
  70. package/lib/adapters/alipay-bill/index.js +41 -0
  71. package/lib/adapters/alipay-bill/zip-decryptor.js +111 -0
  72. package/lib/adapters/email-imap/classifier.js +495 -0
  73. package/lib/adapters/email-imap/email-adapter.js +419 -8
  74. package/lib/adapters/email-imap/index.js +42 -0
  75. package/lib/adapters/email-imap/pdf-extractor.js +192 -0
  76. package/lib/adapters/email-imap/templates/bill.js +232 -0
  77. package/lib/adapters/email-imap/templates/government.js +120 -0
  78. package/lib/adapters/email-imap/templates/index.js +78 -0
  79. package/lib/adapters/email-imap/templates/order.js +186 -0
  80. package/lib/adapters/email-imap/templates/other.js +114 -0
  81. package/lib/adapters/email-imap/templates/register.js +113 -0
  82. package/lib/adapters/email-imap/templates/travel.js +157 -0
  83. package/lib/adapters/email-imap/templates/utils.js +275 -0
  84. package/lib/adapters/email-imap/transactions.js +234 -0
  85. package/lib/adapters/messaging-qq/index.js +158 -0
  86. package/lib/adapters/messaging-telegram/index.js +142 -0
  87. package/lib/adapters/messaging-whatsapp/index.js +189 -0
  88. package/lib/adapters/shopping-base/index.js +208 -0
  89. package/lib/adapters/shopping-jd/index.js +150 -0
  90. package/lib/adapters/shopping-meituan/index.js +154 -0
  91. package/lib/adapters/shopping-taobao/index.js +176 -0
  92. package/lib/adapters/social-bilibili/index.js +171 -0
  93. package/lib/adapters/social-douyin/index.js +116 -0
  94. package/lib/adapters/social-kuaishou/index.js +237 -0
  95. package/lib/adapters/social-toutiao/index.js +236 -0
  96. package/lib/adapters/social-weibo/index.js +164 -0
  97. package/lib/adapters/social-xiaohongshu/index.js +96 -0
  98. package/lib/adapters/system-data/disclosure.js +166 -0
  99. package/lib/adapters/system-data/index.js +34 -0
  100. package/lib/adapters/system-data/system-data-adapter.js +344 -0
  101. package/lib/adapters/system-data-android/adapter.js +348 -0
  102. package/lib/adapters/system-data-android/index.js +76 -0
  103. package/lib/adapters/travel-12306/index.js +151 -0
  104. package/lib/adapters/travel-amap/index.js +164 -0
  105. package/lib/adapters/travel-baidu-map/index.js +162 -0
  106. package/lib/adapters/travel-base/index.js +240 -0
  107. package/lib/adapters/travel-ctrip/index.js +151 -0
  108. package/lib/adapters/wechat/bootstrap.js +146 -0
  109. package/lib/adapters/wechat/content-parser.js +326 -0
  110. package/lib/adapters/wechat/db-reader.js +209 -0
  111. package/lib/adapters/wechat/env-probe.js +218 -0
  112. package/lib/adapters/wechat/frida-agent/loader.js +67 -0
  113. package/lib/adapters/wechat/frida-agent/wechat-key-hook.js +126 -0
  114. package/lib/adapters/wechat/index.js +37 -0
  115. package/lib/adapters/wechat/key-extractor.js +158 -0
  116. package/lib/adapters/wechat/key-providers/frida-key-provider.js +244 -0
  117. package/lib/adapters/wechat/key-providers/index.js +22 -0
  118. package/lib/adapters/wechat/key-providers/key-provider-base.js +44 -0
  119. package/lib/adapters/wechat/key-providers/md5-key-provider.js +81 -0
  120. package/lib/adapters/wechat/normalize.js +220 -0
  121. package/lib/adapters/wechat/wechat-adapter.js +205 -0
  122. package/lib/analysis-skills/base.js +113 -0
  123. package/lib/analysis-skills/footprint.js +167 -0
  124. package/lib/analysis-skills/index.js +58 -0
  125. package/lib/analysis-skills/interests.js +161 -0
  126. package/lib/analysis-skills/relations.js +226 -0
  127. package/lib/analysis-skills/spending.js +219 -0
  128. package/lib/analysis-skills/timeline.js +167 -0
  129. package/lib/analysis.js +191 -2
  130. package/lib/entity-resolver/embedding-stage.js +198 -0
  131. package/lib/entity-resolver/entity-resolver.js +384 -0
  132. package/lib/entity-resolver/index.js +42 -0
  133. package/lib/entity-resolver/llm-stage.js +191 -0
  134. package/lib/entity-resolver/rule-stage.js +208 -0
  135. package/lib/entity-resolver/worker.js +149 -0
  136. package/lib/index.js +131 -0
  137. package/lib/migrations.js +73 -0
  138. package/lib/mobile-extractor/android.js +193 -0
  139. package/lib/mobile-extractor/index.js +9 -0
  140. package/lib/mobile-extractor/ios.js +223 -0
  141. package/lib/prompt-builder.js +11 -1
  142. package/lib/query-parser.js +7 -1
  143. package/lib/registry.js +42 -0
  144. package/lib/sidecar/index.js +15 -0
  145. package/lib/sidecar/supervisor.js +359 -0
  146. package/lib/vault.js +343 -0
  147. package/package.json +36 -3
  148. package/scripts/_make-fixture-all.js +126 -0
  149. package/scripts/_make-fixture-contacts.js +84 -0
  150. package/scripts/evaluate-entity-resolver.js +213 -0
  151. package/scripts/smoke-phase-5-5.js +196 -0
  152. package/scripts/smoke-phase-5-7.js +181 -0
  153. package/scripts/smoke-system-data-contacts.js +309 -0
  154. package/scripts/smoke-system-data.js +312 -0
@@ -0,0 +1,275 @@
1
+ /**
2
+ * Shared regex extraction helpers for the 6 Phase 5.4 template extractors.
3
+ *
4
+ * Design notes:
5
+ * - Regex-first. We aim to extract the high-value structured fields
6
+ * (金额 / 日期 / 卡尾 4 位 / 订单号 / 快递单号) without paying LLM
7
+ * tokens per email. Each helper returns ALL matches (an adapter can
8
+ * pick the most plausible one when context is needed) and degrades
9
+ * to [] / null for unrecognized input — never throws.
10
+ * - LLM enrichment lives in per-template files when callers pass a
11
+ * `llm` opt; this module stays free of LLM dependencies so it can
12
+ * be unit-tested without mocks.
13
+ * - Chinese-first patterns. ¥ and 元 are the primary currency
14
+ * markers; CNY / RMB and $/USD/EUR are also handled for emails
15
+ * from foreign vendors.
16
+ * - Numbers can have thousands separators (1,234.50) and Chinese
17
+ * decimals (1,234.50 元 / 1234元 / 1.5万 / 8千).
18
+ */
19
+
20
+ "use strict";
21
+
22
+ // ─── Currency / amount ──────────────────────────────────────────────────
23
+
24
/**
 * Find monetary amounts in free text and tag each with a currency and,
 * when nearby wording allows, a direction.
 *
 * Recognized shapes (the currency marker may precede OR follow the number):
 *   ¥1,234.50 / ¥99 (full-width yen sign) / RMB 1,234 / CNY 99.00 / $99.99
 *   100元 / 100 USD / 50港币
 *   ¥1.5万 / 8千元 — a 万 (x10,000) or 千 (x1,000) multiplier written
 *   directly after the digits is applied to `value`.
 *
 * A number without any currency marker is never reported.
 *
 * Direction is delegated to detectDirection(), which inspects the text
 * surrounding the match (24 characters on each side) and returns
 * "in", "out" or undefined (caller decides).
 *
 * @param {string} text
 * @returns {Array<{value:number,currency:string,raw:string,direction?:string,index:number}>}
 *   Matches in source order. Always an array; [] for non-string or empty input.
 */
function extractAmounts(text) {
  if (typeof text !== "string" || text.length === 0) return [];
  const results = [];

  // Two alternatives: marker-before-number and number-before-marker.
  // The number allows comma thousands separators and up to 4 decimals.
  const re = /(?:(?<preCur>[¥¥$€£]|RMB|CNY|USD|EUR|GBP|HKD)\s*(?<preNum>\d[\d,]*(?:\.\d{1,4})?)(?<preMul>[万千])?|(?<postNum>\d[\d,]*(?:\.\d{1,4})?)(?<postMul>[万千])?\s*(?<postCur>元|RMB|CNY|USD|EUR|HKD|港币|美元|欧元))/gi;
  const MULTIPLIERS = { "万": 10000, "千": 1000 };

  let m;
  while ((m = re.exec(text)) !== null) {
    const g = m.groups;
    const numericStr = (g.preNum || g.postNum || "").replace(/,/g, "");
    const base = Number(numericStr);
    if (!Number.isFinite(base) || base <= 0) continue;

    const multiplier = MULTIPLIERS[g.preMul || g.postMul] || 1;
    // Round away float noise introduced by the multiplication (e.g. 2.35 * 10000).
    const value = multiplier === 1 ? base : Math.round(base * multiplier * 1e4) / 1e4;

    const currency = normalizeCurrency(g.preCur || g.postCur || "?");
    const direction = detectDirection(text, m.index, m[0].length);
    results.push({
      value,
      currency,
      raw: m[0],
      direction,
      index: m.index,
    });
  }

  return results;
}
67
+
68
/**
 * Map a currency marker (symbol, ISO code or Chinese name) to an ISO code.
 *
 * Matching is case-insensitive and ignores surrounding whitespace.
 * Anything unrecognized — including non-string input — maps to "CNY",
 * since this module targets Chinese-language email first.
 *
 * @param {string} marker e.g. "¥", "元", "rmb", "$", "港币"
 * @returns {"CNY"|"USD"|"EUR"|"GBP"|"HKD"}
 */
function normalizeCurrency(marker) {
  // Exported for tests; must not throw on undefined/null/non-string input.
  if (typeof marker !== "string") return "CNY";
  const m = marker.trim().toUpperCase();
  if (m === "¥" || m === "¥" || m === "元" || m === "RMB" || m === "CNY") return "CNY";
  if (m === "$" || m === "USD" || m === "美元") return "USD";
  if (m === "€" || m === "EUR" || m === "欧元") return "EUR";
  if (m === "£" || m === "GBP") return "GBP";
  if (m === "HKD" || m === "港币") return "HKD";
  return "CNY"; // default — most Chinese emails
}
77
+
78
/**
 * Guess whether an amount is money coming in or going out, from the
 * wording around it.
 *
 * The window spans 24 characters before the match start and 24 after
 * its end. "In" keywords are tested first, so a window containing both
 * kinds resolves to "in".
 *
 * "credit" directly followed by card/limit/line/score is NOT treated as
 * an "in" signal: "your credit card was charged $50" is an outgoing
 * payment, not a credit to the account.
 *
 * @param {string} text full source text
 * @param {number} idx  start offset of the amount match within `text`
 * @param {number} len  length of the amount match
 * @returns {"in"|"out"|undefined} undefined when no keyword is nearby
 *   or the arguments are unusable
 */
function detectDirection(text, idx, len) {
  if (typeof text !== "string" || !Number.isFinite(idx) || !Number.isFinite(len)) {
    return undefined;
  }
  const winStart = Math.max(0, idx - 24);
  const winEnd = Math.min(text.length, idx + len + 24);
  const window = text.slice(winStart, winEnd);
  if (/(退款|返还|到账|收入|入账|credit(?![\s-]*(?:card|limit|line|score))|refund|received)/i.test(window)) return "in";
  if (/(扣款|支付|支出|消费|应还|还款|账单|debit|charge|paid|due)/i.test(window)) return "out";
  return undefined;
}
86
+
87
+ // ─── Dates ──────────────────────────────────────────────────────────────
88
+
89
/**
 * Find date-like fragments. Recognizes:
 *   - 2026-05-19 / 2026/05/19 / 2026.05.19 / 2026年5月19日
 *   - 5/19/2026 (M/D/YYYY)
 *   - 5月19日 (year inferred from `nowMs`)
 *
 * Year inference for year-less dates picks the occurrence closest to
 * `nowMs`: when the parsed month is more than 6 months AFTER the current
 * month the previous year is used (a "12月25日" read in January), when it
 * is more than 6 months BEFORE, the next year is used; otherwise the
 * current year. Callers needing strict semantics should rely on forms
 * that carry a year (`hasYear: true`).
 *
 * A fragment that lies inside an already-accepted match is not reported
 * again (the "5月19日" tail of "2026年5月19日" is not a second hit), and
 * impossible calendar days (Feb 31) are dropped instead of rolling over
 * into the next month.
 *
 * @param {string} text
 * @param {number} [nowMs] reference instant for year inference
 * @returns {Array<{raw:string,date:Date,index:number,hasYear:boolean}>}
 *   sorted by source position; [] for non-string or empty input
 */
function extractDates(text, nowMs = Date.now()) {
  if (typeof text !== "string" || text.length === 0) return [];
  const out = [];
  const now = new Date(nowMs);
  const currentYear = now.getFullYear();

  // True when [start, start+length) intersects a match already accepted.
  const overlapsAccepted = (start, length) =>
    out.some((o) => start < o.index + o.raw.length && o.index < start + length);

  // Build a local-time Date, or null when the day overflowed the month.
  const buildDate = (y, mo, d) => {
    const date = new Date(y, mo - 1, d);
    return date.getMonth() === mo - 1 && date.getDate() === d ? date : null;
  };

  const accept = (match, y, mo, d, hasYear) => {
    if (!isValidYMD(y, mo, d)) return;
    const date = buildDate(y, mo, d);
    if (date === null) return;
    out.push({ raw: match[0], date, index: match.index, hasYear });
  };

  let m;

  // YYYY-MM-DD / YYYY/MM/DD / YYYY.MM.DD / YYYY年M月D日
  const reIso = /(\d{4})[-/.年](\d{1,2})[-/.月](\d{1,2})日?/g;
  while ((m = reIso.exec(text)) !== null) {
    accept(m, +m[1], +m[2], +m[3], true);
  }

  // M月D日 (Chinese, year-less)
  const reMd = /(\d{1,2})月(\d{1,2})日?/g;
  while ((m = reMd.exec(text)) !== null) {
    if (overlapsAccepted(m.index, m[0].length)) continue; // tail of a full date
    const mo = +m[1];
    const d = +m[2];
    const monthsAhead = mo - (now.getMonth() + 1);
    let year = currentYear;
    if (monthsAhead < -6) year = currentYear + 1;
    else if (monthsAhead > 6) year = currentYear - 1;
    accept(m, year, mo, d, false);
  }

  // M/D/YYYY (Western shorthand)
  const reUs = /\b(\d{1,2})\/(\d{1,2})\/(\d{4})\b/g;
  while ((m = reUs.exec(text)) !== null) {
    if (overlapsAccepted(m.index, m[0].length)) continue;
    accept(m, +m[3], +m[1], +m[2], true);
  }

  return out.sort((a, b) => a.index - b.index);
}
147
+
148
/**
 * True when (y, m, d) names a real calendar day between 1970 and 2099.
 *
 * Besides range checks this verifies the day exists in that month, so
 * Feb 30 or Apr 31 are rejected rather than silently rolling over when
 * later passed to `new Date(y, m - 1, d)`.
 *
 * @param {number} y four-digit year
 * @param {number} m month, 1-12
 * @param {number} d day of month, 1-31
 * @returns {boolean}
 */
function isValidYMD(y, m, d) {
  if (!Number.isInteger(y) || !Number.isInteger(m) || !Number.isInteger(d)) return false;
  if (y < 1970 || y > 2099 || m < 1 || m > 12 || d < 1) return false;
  // Day 0 of the following month is the last day of month `m`.
  return d <= new Date(y, m, 0).getDate();
}
151
+
152
+ // ─── Account / card last-4 ──────────────────────────────────────────────
153
+
154
/**
 * Extract trailing 4-digit identifiers commonly used for card masking.
 * Recognizes:
 *   - 尾号 1234 / 尾四位 1234 / 卡号尾 1234 / 卡尾 1234 / 账号尾 1234
 *   - **** 1234 (two or more asterisks)
 *   - ending in 1234 / ending 1234 / last 4 digits 1234
 * An optional ASCII or full-width colon may sit between the keyword and
 * the digits. Exactly four digits are captured; longer digit runs are
 * not matched.
 *
 * @param {string} text
 * @returns {Array<{raw:string,last4:string,index:number}>}
 *   sorted by source position, like the other extractors in this module;
 *   [] for non-string or empty input
 */
function extractAccountTails(text) {
  if (typeof text !== "string" || text.length === 0) return [];
  const out = [];
  // Fresh literals on every call, so the /g lastIndex state never leaks.
  const patterns = [
    /(?:尾号|尾[四4]位|卡号尾|卡尾|账号尾)\s*[::]?\s*(\d{4})\b/g,
    /\*{2,}\s*(\d{4})\b/g,
    /(?:ending in|last\s*4\s*digits|\bending\b)\s*[::]?\s*(\d{4})\b/gi,
  ];
  for (const re of patterns) {
    let m;
    while ((m = re.exec(text)) !== null) {
      out.push({ raw: m[0], last4: m[1], index: m.index });
    }
  }
  // Patterns are scanned one after another; restore document order so
  // callers taking the first hit get the first occurrence in the text.
  return out.sort((a, b) => a.index - b.index);
}
176
+
177
+ // ─── Order / tracking numbers ──────────────────────────────────────────
178
+
179
/**
 * Extract order-number candidates. Recognizes:
 *   订单号: 12345678 / 订单 12345678 / 订单编号: ABC1234567
 *   Order # 12345-67 / Order Number 12345 / Order ID: #A1B2C3
 *
 * A candidate is 5-31 characters of letters, digits and hyphens and must
 * contain at least one digit. Without that requirement the
 * case-insensitive character class would accept ordinary words
 * ("order confirmation" → "confirmation"). The English keyword must
 * start at a word boundary, so "reorder" does not trigger.
 *
 * @param {string} text
 * @returns {Array<{raw:string,orderNumber:string,index:number}>}
 *   in source order; [] for non-string or empty input
 */
function extractOrderNumbers(text) {
  if (typeof text !== "string" || text.length === 0) return [];
  const out = [];
  // (?=[-A-Z0-9]*\d) — the run starting here must contain a digit.
  const re = /(?:订单(?:号码|编号|号)?|\border(?:\s*number|\s*id|\s*#)?)\s*[::]?\s*#?\s*(?=[-A-Z0-9]*\d)([A-Z0-9][-A-Z0-9]{4,30})\b/gi;
  let m;
  while ((m = re.exec(text)) !== null) {
    out.push({ raw: m[0], orderNumber: m[1], index: m.index });
  }
  return out;
}
199
+
200
/**
 * Tracking-number candidates. Recognizes Chinese express keywords
 * (快递单号 / 运单号 / 物流单号 / 物流号) and generic English markers
 * ("tracking number", "tracking #", "tracking id", "track id").
 *
 * A candidate is 7-31 characters of letters, digits and hyphens and must
 * contain at least one digit, so that plain words following the keyword
 * ("tracking number: unknown") are not reported. Carrier-specific format
 * validation is not attempted here.
 *
 * @param {string} text
 * @returns {Array<{raw:string,trackingNumber:string,index:number}>}
 *   in source order; [] for non-string or empty input
 */
function extractTrackingNumbers(text) {
  if (typeof text !== "string" || text.length === 0) return [];
  const out = [];
  // (?=[-A-Z0-9]*\d) — the run starting here must contain a digit.
  const re = /(?:快递单号|运单号|物流单号|物流号|\btracking\s*(?:number|#)|\btrack(?:ing)?\s*id)\s*[::]?\s*(?=[-A-Z0-9]*\d)([A-Z0-9][-A-Z0-9]{6,30})\b/gi;
  let m;
  while ((m = re.exec(text)) !== null) {
    out.push({ raw: m[0], trackingNumber: m[1], index: m.index });
  }
  return out;
}
215
+
216
+ // ─── Verification codes (REDACTED in extra; only field captured is "yes") ──
217
+
218
/**
 * Find OTP / verification-code mentions. Returns just the count and the
 * positions — the code itself is NEVER returned to the caller: every
 * digit in `raw` is replaced by "*". Callers use the count as a signal
 * that this is a registration / login email.
 *
 * Keywords: 验证码 / 动态密码 / 安全码 / "verification code" / "OTP",
 * followed by up to three connectors (为 / 是 / is / ASCII or full-width
 * colon, in any combination such as "为:" or "is:") and then 4-8 digits.
 *
 * @param {string} text
 * @returns {{count:number, hits:Array<{raw:string,index:number}>}}
 *   `{count: 0, hits: []}` for non-string or empty input
 */
function detectVerificationCodes(text) {
  if (typeof text !== "string" || text.length === 0) return { count: 0, hits: [] };
  // The connector group repeats so that "验证码为:123456" and
  // "verification code is: 123456" are both caught.
  const re = /(?:验证码|verification\s*code|\botp|动态密码|安全码)(?:\s*(?:为|是|:|:|is)){0,3}\s*\d{4,8}\b/gi;
  const hits = [];
  let m;
  while ((m = re.exec(text)) !== null) {
    // Redact before storing — the digits must not leave this function.
    hits.push({ raw: m[0].replace(/\d/g, "*"), index: m.index });
  }
  return { count: hits.length, hits };
}
234
+
235
+ // ─── Helpers shared by templates ────────────────────────────────────────
236
+
237
/**
 * Pick the most plausible amount from a list. Strategy:
 *   1. If any amount has a `direction`, only those are considered
 *   2. Prefer the larger `value` (statements tend to put the total prominently)
 *   3. Tie-break by earliest `index`
 *
 * Entries that are not objects, or whose `value` is not a finite number,
 * are ignored rather than causing a throw. The input array is not mutated.
 *
 * @param {Array<{value:number,index:number,direction?:string}>} amounts
 * @returns {object|null} the chosen entry (same object reference), or
 *   null when `amounts` is not an array or holds no usable entry
 */
function selectPrimaryAmount(amounts) {
  if (!Array.isArray(amounts)) return null;
  const usable = amounts.filter(
    (a) => a !== null && typeof a === "object" && Number.isFinite(a.value)
  );
  if (usable.length === 0) return null;

  const directed = usable.filter((a) => a.direction);
  const pool = directed.length > 0 ? directed : usable;

  // Entries without a usable position sort after positioned ones on ties.
  const position = (a) => (Number.isFinite(a.index) ? a.index : Number.MAX_SAFE_INTEGER);

  return pool.slice().sort((a, b) => {
    if (b.value !== a.value) return b.value - a.value;
    return position(a) - position(b);
  })[0];
}
252
+
253
/**
 * Turn a Date into its millisecond epoch value.
 *
 * @param {*} date value expected to be a Date
 * @returns {number|null} the epoch in ms, or null when `date` is not a
 *   Date instance or is an invalid Date
 */
function dateToMs(date) {
  const epoch = date instanceof Date ? date.getTime() : NaN;
  return Number.isFinite(epoch) ? epoch : null;
}
262
+
263
+ module.exports = {
264
+ extractAmounts,
265
+ extractDates,
266
+ extractAccountTails,
267
+ extractOrderNumbers,
268
+ extractTrackingNumbers,
269
+ detectVerificationCodes,
270
+ selectPrimaryAmount,
271
+ dateToMs,
272
+ // exposed for tests
273
+ normalizeCurrency,
274
+ detectDirection,
275
+ };
@@ -0,0 +1,234 @@
1
+ /**
2
+ * Phase 5.5 — line-item transaction parser for Chinese bank-statement
3
+ * PDF text. Decrypted statements typically follow one of a few row
4
+ * formats; we go regex-first with a generic fallback.
5
+ *
6
+ * Output shape (per item):
7
+ * {
8
+ * occurredAtMs: number, // ms epoch of the transaction date
9
+ * description: string, // merchant + memo, trimmed
10
+ * amount: { value, currency, direction },
11
+ * balance?: { value, currency }, // running balance when present
12
+ * raw: string, // the original line for audit
13
+ * index: number, // line index within the body
14
+ * }
15
+ *
16
+ * Recognized formats (sample row patterns):
17
+ *
18
+ * 招商银行 / 中国银行 / 民生银行 (column-aligned with whitespace):
19
+ * `2026-04-15 星巴克 上海中山公园店 ¥-39.00 1,234.56`
20
+ *
21
+ * 工商银行 / 建设银行 (slash dates + 借/贷 prefix):
22
+ * `2026/04/15 借 39.00 星巴克 上海中山公园店 CNY 1,234.56`
23
+ *
24
+ * 交通银行 / 浦发银行 (Chinese date + tab/space):
25
+ * `2026年04月15日 星巴克 上海中山公园店 支出 39.00 余额 1234.56`
26
+ *
27
+ * Strategy:
28
+ * - One pass per line. Skip header / footer / legalese lines via
29
+ * a denylist.
30
+ * - Direction inferred from: explicit 借/贷 / 支出/收入 / + / - / 退款 keywords.
31
+ * - Amount currency defaults to CNY (Chinese statements rarely
32
+ * mix currencies); foreign-currency cards emit USD/EUR rows
33
+ * which we detect via prefix.
34
+ * - Returns an empty array when the text doesn't look like a
35
+ * statement (e.g. marketing PDF). Callers should not assume
36
+ * a non-empty list.
37
+ */
38
+
39
+ "use strict";
40
+
41
+ // Lines that look like statement chrome / legalese; extractTransactions skips any line matching one of these.
42
+ const SKIP_PATTERNS = [
43
+ /^[\s\d]*$/, // only whitespace and digits (e.g. page-number footers)
44
+ /^\s*第\s*\d+\s*页/, // Chinese page marker such as "第 1 页"
45
+ /^\s*page\s+\d+/i,
46
+ /声明|免责|提示|温馨提示|温馨提醒|请勿回复|本邮件由系统/,
47
+ /账单周期|账单日|还款日|信用额度|可用额度/,
48
+ /^\s*[-=]{3,}\s*$/,
49
+ ];
50
+
51
+ // Direction keywords. extractTransactions tests DIRECTION_OUT before DIRECTION_IN, so a row carrying both kinds is tagged "out"; an explicit +/- sign on the amount overrides either.
52
+ const DIRECTION_OUT = /(支出|借方|借|消费|扣款|paid|debit|charged?)/i;
53
+ const DIRECTION_IN = /(收入|贷方|贷|退款|返还|到账|入账|credit|refund|received)/i;
54
+
55
+ // ── Row parsing is two-pass rather than one regex per bank format:
56
+ // pass 1: a DATE_PATTERNS entry anchored at line start (named groups dateY?, dateM, dateD);
57
+ // pass 2: AMOUNT_RE applied repeatedly to the remainder of the line
58
+ // (named groups sign?, cur?, amt). Description and balance are derived afterwards, not captured.
59
+
60
+ const DATE_PATTERNS = [
61
+ // 2026-04-15 / 2026/04/15 / 2026.04.15
62
+ /^\s*(?<dateY>\d{4})[-/.](?<dateM>\d{1,2})[-/.](?<dateD>\d{1,2})\s+/,
63
+ // 2026年04月15日 (Chinese year/month/day markers; the trailing 日 is optional)
64
+ /^\s*(?<dateY>\d{4})年(?<dateM>\d{1,2})月(?<dateD>\d{1,2})日?\s+/,
65
+ // 04-15 (year-less, no dateY group; the caller supplies the year) — rare but BOC uses it
66
+ /^\s*(?<dateM>\d{1,2})[-/.](?<dateD>\d{1,2})\s+/,
67
+ ];
68
+
69
+ // Amount: optional sign, optional currency marker, then a number with comma thousands and up to 4 decimals.
70
+ // The sign is only captured together with the marker when it comes first (-¥39.00); for ¥-39.00 the match starts at the sign and `cur` is empty.
71
+ const AMOUNT_RE = /(?<sign>[+\-])?\s*(?:(?<cur>¥|RMB|CNY|USD|EUR|\$|€)\s*)?(?<amt>[\d][\d,]*(?:\.\d{1,4})?)\b/;
72
+
73
/**
 * Parse a bank-statement text body into transaction rows.
 *
 * Each line is handled independently: lines shorter than 6 characters or
 * matching SKIP_PATTERNS are ignored; a line must start with a date
 * (DATE_PATTERNS) and contain at least one amount (collectAmounts) and a
 * non-empty description (cleanDescription) to produce a row.
 *
 * Amount selection: with two or more numbers on the line, the LAST is
 * taken as the running balance and the one before it as the transaction
 * amount; a single number is the amount with no balance.
 *
 * Direction: DIRECTION_OUT / DIRECTION_IN keywords anywhere after the
 * date (OUT tested first), overridden by an explicit "-" (out) or "+"
 * (in) sign on the amount. Left off the row when nothing indicates it.
 *
 * Dates: a year-less date takes the year of `nowMs`; if that would land
 * more than one day in the future it is moved back one year, since a
 * posted transaction cannot postdate the statement (a "12-28" row read
 * in January belongs to the previous December). Impossible calendar
 * days (Feb 31) are skipped instead of rolling into the next month.
 *
 * @param {string} text decrypted PDF text (or any plain-text body)
 * @param {object} [opts]
 * @param {number} [opts.maxRows=500] cap on returned rows
 * @param {number} [opts.nowMs=Date.now()] reference for year-less dates
 * @returns {Array<object>} rows shaped
 *   `{ occurredAtMs, description, amount: {value, currency, direction?},
 *      balance?: {value, currency}, raw, index }`;
 *   [] for non-string / empty text or when no line qualifies
 */
function extractTransactions(text, opts = {}) {
  if (typeof text !== "string" || text.length === 0) return [];
  // `opts = {}` only covers undefined; tolerate an explicit null as well.
  const options = opts !== null && typeof opts === "object" ? opts : {};
  const maxRows = Number.isFinite(options.maxRows) && options.maxRows > 0 ? options.maxRows : 500;
  const nowMs = Number.isFinite(options.nowMs) ? options.nowMs : Date.now();
  const now = new Date(nowMs);
  const DAY_MS = 24 * 60 * 60 * 1000;

  const lines = text.split(/\r?\n/);
  const out = [];

  for (let i = 0; i < lines.length && out.length < maxRows; i++) {
    const line = lines[i];
    if (!line || line.length < 6) continue;
    if (SKIP_PATTERNS.some((re) => re.test(line))) continue;

    // ── 1. date prefix: first pattern that matches wins ──
    let dateMatch = null;
    for (const re of DATE_PATTERNS) {
      dateMatch = re.exec(line);
      if (dateMatch) break;
    }
    if (!dateMatch) continue;

    const hasYear = Boolean(dateMatch.groups.dateY);
    let y = hasYear ? +dateMatch.groups.dateY : now.getFullYear();
    const mo = +dateMatch.groups.dateM;
    const d = +dateMatch.groups.dateD;
    if (!validYMD(y, mo, d)) continue;

    let when = new Date(y, mo - 1, d);
    if (!hasYear && when.getTime() > nowMs + DAY_MS) {
      y -= 1;
      when = new Date(y, mo - 1, d);
    }
    // Date() normalizes overflow (Feb 31 → Mar 3); treat that as invalid.
    if (when.getMonth() !== mo - 1 || when.getDate() !== d) continue;
    const occurredAtMs = when.getTime();

    // ── 2. rest of line: direction keywords and amount(s) ──
    const rest = line.slice(dateMatch[0].length);

    let direction;
    if (DIRECTION_OUT.test(rest)) direction = "out";
    else if (DIRECTION_IN.test(rest)) direction = "in";

    const amountMatches = collectAmounts(rest);
    if (amountMatches.length === 0) continue;

    let amount;
    let balance;
    if (amountMatches.length >= 2) {
      amount = amountMatches[amountMatches.length - 2];
      balance = amountMatches[amountMatches.length - 1];
    } else {
      amount = amountMatches[0];
    }

    // An explicit sign on the amount beats keyword detection.
    if (amount.sign === "-") direction = "out";
    else if (amount.sign === "+") direction = "in";

    // ── 3. description = rest minus the amount tokens ──
    const description = cleanDescription(rest, amountMatches);
    if (description.length === 0) continue;

    out.push({
      occurredAtMs,
      description,
      amount: {
        value: amount.value,
        currency: amount.currency,
        ...(direction ? { direction } : {}),
      },
      ...(balance ? { balance: { value: balance.value, currency: balance.currency } } : {}),
      raw: line.trim(),
      index: i,
    });
  }

  return out;
}
167
+
168
+ // ─── helpers ─────────────────────────────────────────────────────────────
169
+
170
/**
 * True when (y, m, d) names a real calendar day between 1970 and 2099.
 *
 * Checks that the day exists in the given month, so Feb 30 / Apr 31 are
 * rejected instead of rolling over when handed to `new Date(y, m - 1, d)`.
 *
 * @param {number} y four-digit year
 * @param {number} m month, 1-12
 * @param {number} d day of month
 * @returns {boolean}
 */
function validYMD(y, m, d) {
  if (!Number.isInteger(y) || !Number.isInteger(m) || !Number.isInteger(d)) return false;
  if (y < 1970 || y > 2099 || m < 1 || m > 12 || d < 1) return false;
  // Day 0 of the following month is the last day of month `m`.
  return d <= new Date(y, m, 0).getDate();
}
173
+
174
/**
 * Collect every amount-like token in `text` using AMOUNT_RE.
 *
 * Zero / non-finite values are dropped. A bare 4-digit integer between
 * 1900 and 2099 with neither sign nor currency is also dropped, as it is
 * more likely a year than an amount.
 *
 * AMOUNT_RE only captures a currency marker that comes AFTER the sign
 * ("-¥39.00"). For the equally common "¥-39.00" / "USD -39.00" layout
 * the regex match begins at the sign, so the marker sitting immediately
 * before it is recovered here; the reported `index` / `length` / `raw`
 * are widened to include it.
 *
 * @param {string} text the part of a statement line after the date
 * @returns {Array<{value:number,currency:string,sign:(string|undefined),raw:string,index:number,length:number}>}
 *   in source order
 */
function collectAmounts(text) {
  const out = [];
  const re = new RegExp(AMOUNT_RE.source, "gi");
  // Same marker set as AMOUNT_RE; codes must start at a word boundary.
  const markerBeforeSign = /(?:(¥|\$|€)|\b(RMB|CNY|USD|EUR))\s*$/i;
  let m;
  while ((m = re.exec(text)) !== null) {
    const groups = m.groups || {};
    const numericStr = (groups.amt || "").replace(/,/g, "");
    const value = Number(numericStr);
    if (!Number.isFinite(value) || value <= 0) continue;
    // False-positive guard for year numbers on date-only rows.
    if (!groups.cur && !groups.sign && /^\d{4}$/.test(numericStr) && value >= 1900 && value <= 2099) continue;

    let marker = groups.cur;
    let start = m.index;
    const end = m.index + m[0].length;
    if (!marker && groups.sign) {
      const before = markerBeforeSign.exec(text.slice(0, m.index));
      if (before) {
        marker = before[1] || before[2];
        start = m.index - before[0].length;
      }
    }

    out.push({
      value,
      currency: normalizeCurrency(marker),
      sign: groups.sign,
      raw: text.slice(start, end),
      index: start,
      length: end - start,
    });
  }
  return out;
}
198
+
199
/**
 * Map a currency marker captured by AMOUNT_RE to an ISO code.
 * Dollar markers give "USD", euro markers give "EUR"; everything else —
 * including a missing marker — gives "CNY".
 *
 * @param {string} [marker]
 * @returns {"CNY"|"USD"|"EUR"}
 */
function normalizeCurrency(marker) {
  if (!marker) return "CNY";
  switch (marker.toUpperCase()) {
    case "$":
    case "USD":
      return "USD";
    case "€":
    case "EUR":
      return "EUR";
    default:
      // ¥ / 元 / RMB / CNY and anything unrecognized.
      return "CNY";
  }
}
207
+
208
/**
 * Build the human-readable description of a statement row: cut the
 * amount tokens out of the line, drop direction / balance labels and
 * stray punctuation, and collapse whitespace.
 *
 * Direction words are removed only when they stand as labels, so that
 * merchant names survive intact:
 *   - 支出 / 收入 / 借方 / 贷方 are removed wherever they occur;
 *   - the single characters 借 / 贷 only when surrounded by whitespace
 *     or line edges (an amount cut out next to them counts as
 *     whitespace), so "借呗" is kept;
 *   - English words (debit, credit, paid, charge(d), refund(ed)) only
 *     as whole words, so "prepaid" is kept.
 *
 * @param {string} text the part of the line after the date
 * @param {Array<{index:number,length:number}>} amountMatches spans to
 *   remove, as offsets into `text`
 * @returns {string} trimmed description; may be empty
 */
function cleanDescription(text, amountMatches) {
  // Remove from the rightmost span first so earlier offsets stay valid.
  const sorted = amountMatches.slice().sort((a, b) => b.index - a.index);
  let s = text;
  for (const m of sorted) {
    s = s.slice(0, m.index) + " " + s.slice(m.index + m.length);
  }
  return s
    .replace(/(支出|收入|借方|贷方)/g, " ")
    .replace(/(^|\s)[借贷](?=\s|$)/g, "$1 ")
    .replace(/\b(debit(?:ed)?|credit(?:ed)?|paid|charged?|refunded?)\b/gi, " ")
    .replace(/(余额|\bbalance\b)\s*[::]?/gi, " ")
    .replace(/[¥$€]\s*/g, " ")
    .replace(/[,,;;|]+/g, " ")
    .replace(/\s+/g, " ")
    .trim();
}
231
+
232
+ module.exports = {
233
+ extractTransactions,
234
+ };
@@ -0,0 +1,158 @@
1
+ /**
2
+ * Phase 13.5 — QQ adapter.
3
+ *
4
+ * Per sjqz/parsers/qq.py QQParser. QQ DBs (msg.db / messages.db) are
5
+ * SQLCipher-encrypted with a per-installation key — Phase 13.5b will
6
+ * port the QQ key extractor; v0.5 accepts a `keyProvider` like WeChat.
7
+ *
8
+ * Tables:
9
+ * - mr_friend friend contacts
10
+ * - mr_troop groups
11
+ * - mr_buddy_groupbuddy group members
12
+ * - msgcsr_friend_* friend messages (per-buddy table sharding)
13
+ * - msgcsr_troop_* group messages
14
+ */
15
+
16
+ "use strict";
17
+
18
+ const fs = require("node:fs");
19
+ const { newId } = require("../../ids");
20
+
21
+ const NAME = "messaging-qq";
22
+ const VERSION = "0.5.0";
23
+
24
/**
 * Adapter that reads a pulled QQ message database (SQLCipher-encrypted
 * SQLite) and turns its rows into vault records.
 *
 * Construction requires `opts.account.qq`. The database path, the key
 * provider (an object with an async `getKey()`) and an optional driver
 * factory (returns a constructor compatible with
 * `new Driver(path, { readonly })` exposing `pragma`, `prepare`, `close`)
 * are injected so the adapter can be exercised without a real DB.
 */
class QQAdapter {
  /**
   * @param {object} opts
   * @param {{qq: string}} opts.account owning QQ account (required)
   * @param {string} [opts.dbPath] path to the pulled database file
   * @param {{getKey: function(): Promise<string>}} [opts.keyProvider]
   * @param {function(): Function} [opts.dbDriverFactory] returns the
   *   driver constructor; defaults to better-sqlite3-multiple-ciphers
   * @throws {Error} when `opts.account.qq` is missing
   */
  constructor(opts = {}) {
    if (!opts.account || !opts.account.qq) {
      throw new Error("QQAdapter: opts.account.qq required");
    }
    this.account = opts.account;
    this._dbPath = opts.dbPath || null;
    this._keyProvider = opts.keyProvider || null;
    this._dbDriverFactory = opts.dbDriverFactory || null;

    this.name = NAME;
    this.version = VERSION;
    this.capabilities = ["sync:sqlite", "parse:qq-messages", "decrypt:sqlcipher"];
    this.extractMode = "device-pull";
    this.rateLimits = {};
    this.dataDisclosure = {
      fields: [
        "qq:friends (uin / nickname / remark)",
        "qq:groups (troop_uin / name)",
        "qq:messages (peer / content / time / type)",
      ],
      sensitivity: "high",
      legalGate: true,
    };
  }

  /**
   * Check that the prerequisites for a sync are in place. Does not open
   * the database or fetch the key.
   *
   * @returns {Promise<{ok: true, account: string} | {ok: false, reason: string}>}
   *   reason is "DB_NOT_PULLED" or "NO_KEY_PROVIDER"
   */
  async authenticate() {
    if (!this._dbPath || !fs.existsSync(this._dbPath)) {
      return { ok: false, reason: "DB_NOT_PULLED" };
    }
    if (!this._keyProvider || typeof this._keyProvider.getKey !== "function") {
      return { ok: false, reason: "NO_KEY_PROVIDER" };
    }
    return { ok: true, account: this.account.qq };
  }

  /**
   * Same checks as authenticate(); on success reports the check time.
   *
   * @returns {Promise<{ok: true, lastChecked: number} | {ok: false, reason: string}>}
   */
  async healthCheck() {
    const r = await this.authenticate();
    return r.ok ? { ok: true, lastChecked: Date.now() } : r;
  }

  /**
   * Stream raw records from the database: friends (mr_friend, up to
   * 5000), groups (mr_troop, up to 1000), then the newest 1000 messages
   * of every msgcsr_friend_* / msgcsr_troop_* table.
   *
   * Yields nothing when the DB file, the key provider or the key is
   * missing. A table that cannot be read is skipped. The database handle
   * is always closed, also when the consumer stops iterating early.
   *
   * @param {object} [opts]
   * @param {string} [opts.dbPath] overrides the constructor's dbPath
   * @yields {{adapter:string, originalId:string, capturedAt:number, payload:object}}
   */
  async *sync(opts = {}) {
    const dbPath = opts.dbPath || this._dbPath;
    if (!dbPath || !fs.existsSync(dbPath)) return;
    if (!this._keyProvider) return;
    const key = await this._keyProvider.getKey();
    if (!key) return;

    const Driver = this._dbDriverFactory
      ? this._dbDriverFactory()
      : require("better-sqlite3-multiple-ciphers");
    const db = new Driver(dbPath, { readonly: true });
    try {
      // PRAGMA takes no bound parameters, so the key is embedded as a SQL
      // string literal; double any single quote so it cannot end the literal.
      const keyLiteral = String(key).replace(/'/g, "''");
      db.pragma(`key = '${keyLiteral}'`);

      // Friends
      const friends = trySelect(db, "SELECT * FROM mr_friend LIMIT 5000") || [];
      for (const row of friends) {
        yield { adapter: NAME, originalId: `friend-${row.uin}`, capturedAt: Date.now(), payload: { row, kind: "contact" } };
      }

      // Groups
      const groups = trySelect(db, "SELECT * FROM mr_troop LIMIT 1000") || [];
      for (const row of groups) {
        yield { adapter: NAME, originalId: `group-${row.troop_uin}`, capturedAt: Date.now(), payload: { row, kind: "group" } };
      }

      // Messages — QQ shards by buddy / group, one table each.
      const tables = trySelect(db, "SELECT name FROM sqlite_master WHERE type='table' AND (name LIKE 'msgcsr_friend_%' OR name LIKE 'msgcsr_troop_%')") || [];
      for (const t of tables) {
        // Table names come from the (untrusted) database itself; quote
        // them as identifiers before splicing into SQL.
        const quotedName = `"${String(t.name).replace(/"/g, '""')}"`;
        const msgs = trySelect(db, `SELECT * FROM ${quotedName} ORDER BY time DESC LIMIT 1000`) || [];
        for (const row of msgs) {
          yield {
            adapter: NAME,
            originalId: `msg-${row.msgid || row._id}`,
            // Fall back to "now" when the row's time cannot be parsed,
            // matching what normalize() does for occurredAt.
            capturedAt: parseTime(row.time) || Date.now(),
            payload: { row, kind: "message", table: t.name },
          };
        }
      }
    } finally {
      // Best-effort close; a failure here must not mask the real error.
      try { db.close(); } catch (_e) {}
    }
  }

  /**
   * Convert one raw record from sync() into vault entities.
   *
   * - kind "contact" → one person
   * - kind "group"   → one topic
   * - anything else  → one message event; `extra.isGroup` is derived
   *   from the source table name
   *
   * The timestamp is parsed from `row.time`, falling back to the current
   * time when absent or unparsable.
   *
   * @param {{originalId:string, payload:{kind:string,row:object,table?:string}}} raw
   * @returns {{events:Array, persons:Array, places:Array, items:Array, topics:Array}}
   */
  normalize(raw) {
    const { kind, row } = raw.payload;
    const now = Date.now();
    const occurredAt = parseTime(row.time) || now;
    const source = { adapter: NAME, adapterVersion: VERSION, originalId: raw.originalId, capturedAt: occurredAt, capturedBy: "sqlite" };
    if (kind === "contact") {
      return {
        events: [], places: [], items: [], topics: [],
        persons: [{
          id: `person-qq-${row.uin}`,
          type: "person", subtype: "contact",
          // Preferred display order: remark, nickname, then the number.
          names: [row.remark, row.nickname, String(row.uin)].filter((x) => typeof x === "string" && x.length > 0),
          identifiers: { qqId: String(row.uin) },
          ingestedAt: now, source,
          extra: { fromAdapter: NAME, qq: row.uin },
        }],
      };
    }
    if (kind === "group") {
      return {
        events: [], places: [], items: [], persons: [],
        topics: [{
          id: `topic-qq-group-${row.troop_uin}`,
          type: "topic", name: row.troop_name || String(row.troop_uin),
          ingestedAt: now, source,
          extra: { fromAdapter: NAME, troopUin: row.troop_uin },
        }],
      };
    }
    // message
    const isGroup = (raw.payload.table || "").startsWith("msgcsr_troop_");
    return {
      events: [{
        id: newId(), type: "event", subtype: "message",
        occurredAt, actor: "person-self",
        content: { title: (row.msg || "").slice(0, 80) || "(空)", text: row.msg || "" },
        ingestedAt: now, source,
        extra: { peer: row.frienduin || row.troopuin, isGroup, msgType: row.msgtype },
      }],
      persons: [], places: [], items: [], topics: [],
    };
  }
}
149
/**
 * Run a SELECT and return all rows, or null when preparing or executing
 * the statement throws (callers substitute an empty list).
 *
 * @param {{prepare: function(string): {all: function(): Array}}} db
 * @param {string} sql
 * @returns {Array|null}
 */
function trySelect(db, sql) {
  let rows = null;
  try {
    const statement = db.prepare(sql);
    rows = statement.all();
  } catch (_e) {
    rows = null;
  }
  return rows;
}
150
/**
 * Coerce a stored timestamp to a millisecond epoch.
 *
 * Numbers and all-digit strings above 1e12 are taken as milliseconds,
 * anything smaller as seconds. Other strings go through Date.parse.
 *
 * @param {*} v
 * @returns {number|null} null for unparsable strings and for any other
 *   type (a string that parses to epoch 0 also yields null)
 */
function parseTime(v) {
  const scale = (n) => (n > 1e12 ? n : n * 1000);
  if (Number.isFinite(v)) return scale(v);
  if (typeof v !== "string") return null;
  if (/^\d+$/.test(v)) return scale(parseInt(v, 10));
  return Date.parse(v) || null;
}
158
+ module.exports = { QQAdapter, NAME, VERSION };