@chainlesschain/personal-data-hub 0.1.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (154) hide show
  1. package/__tests__/adapters/ai-chat-cookie-capture-spec.test.js +211 -0
  2. package/__tests__/adapters/ai-chat-health-checker.test.js +262 -0
  3. package/__tests__/adapters/ai-chat-history.test.js +396 -0
  4. package/__tests__/adapters/ai-chat-http-client.test.js +242 -0
  5. package/__tests__/adapters/ai-chat-vendors.test.js +874 -0
  6. package/__tests__/adapters/alipay-bill-adapter.test.js +538 -0
  7. package/__tests__/adapters/email-adapter.test.js +138 -1
  8. package/__tests__/adapters/email-classifier.test.js +347 -0
  9. package/__tests__/adapters/email-pdf-extractor.test.js +529 -0
  10. package/__tests__/adapters/email-retry-progress.test.js +294 -0
  11. package/__tests__/adapters/email-templates.test.js +699 -0
  12. package/__tests__/adapters/social-toutiao-kuaishou-scaffold.test.js +269 -0
  13. package/__tests__/adapters/system-data-adapter.test.js +440 -0
  14. package/__tests__/adapters/system-data-android-ingest.test.js +144 -0
  15. package/__tests__/adapters/system-data-android.test.js +387 -0
  16. package/__tests__/adapters/system-data-disclosure.test.js +153 -0
  17. package/__tests__/adapters/wechat-bootstrap.test.js +240 -0
  18. package/__tests__/adapters/wechat-env-probe.test.js +162 -0
  19. package/__tests__/adapters/wechat-frida-agent.test.js +191 -0
  20. package/__tests__/adapters/wechat-frida-integration.test.js +149 -0
  21. package/__tests__/adapters/wechat-frida-key-provider.test.js +188 -0
  22. package/__tests__/adapters/wechat-md5-key-provider.test.js +101 -0
  23. package/__tests__/analysis-skills.test.js +556 -0
  24. package/__tests__/analysis.test.js +329 -1
  25. package/__tests__/e2e/ai-chat-cross-source-journey.test.js +213 -0
  26. package/__tests__/e2e/full-user-journey.test.js +188 -0
  27. package/__tests__/entity-resolver-ingest-hook.test.js +177 -0
  28. package/__tests__/entity-resolver-stages.test.js +411 -0
  29. package/__tests__/entity-resolver-vault.test.js +246 -0
  30. package/__tests__/entity-resolver.test.js +526 -0
  31. package/__tests__/fixtures/entity-resolver-200-mock.json +96 -0
  32. package/__tests__/integration/ai-chat-history-registry.test.js +228 -0
  33. package/__tests__/integration/aichat-wizard-end-to-end.test.js +282 -0
  34. package/__tests__/integration/cross-adapter-pipelines.test.js +396 -0
  35. package/__tests__/integration/wechat-bootstrap-end-to-end.test.js +390 -0
  36. package/__tests__/longtail-adapters.test.js +217 -0
  37. package/__tests__/mobile-extractor.test.js +288 -0
  38. package/__tests__/registry.test.js +4 -2
  39. package/__tests__/shopping-adapters.test.js +296 -0
  40. package/__tests__/sidecar-contacts-cross-validate.test.js +163 -0
  41. package/__tests__/sidecar-supervisor.test.js +120 -0
  42. package/__tests__/social-adapters.test.js +206 -0
  43. package/__tests__/travel-adapters.test.js +325 -0
  44. package/__tests__/vault.test.js +3 -3
  45. package/__tests__/wechat-adapter.test.js +476 -0
  46. package/__tests__/whatsapp-adapter.test.js +135 -0
  47. package/lib/adapter-spec.js +12 -0
  48. package/lib/adapters/_python-sidecar-base.js +207 -0
  49. package/lib/adapters/ai-chat-history/ai-chat-adapter.js +374 -0
  50. package/lib/adapters/ai-chat-history/cookie-auth.js +109 -0
  51. package/lib/adapters/ai-chat-history/cookie-capture-spec.js +331 -0
  52. package/lib/adapters/ai-chat-history/health-checker.js +210 -0
  53. package/lib/adapters/ai-chat-history/http-client.js +211 -0
  54. package/lib/adapters/ai-chat-history/index.js +28 -0
  55. package/lib/adapters/ai-chat-history/schema-map.js +258 -0
  56. package/lib/adapters/ai-chat-history/vendor-spec.js +86 -0
  57. package/lib/adapters/ai-chat-history/vendors/coze.js +179 -0
  58. package/lib/adapters/ai-chat-history/vendors/deepseek.js +199 -0
  59. package/lib/adapters/ai-chat-history/vendors/doubao.js +255 -0
  60. package/lib/adapters/ai-chat-history/vendors/dreamina.js +174 -0
  61. package/lib/adapters/ai-chat-history/vendors/hunyuan.js +176 -0
  62. package/lib/adapters/ai-chat-history/vendors/kimi.js +182 -0
  63. package/lib/adapters/ai-chat-history/vendors/qianfan.js +160 -0
  64. package/lib/adapters/ai-chat-history/vendors/tongyi.js +193 -0
  65. package/lib/adapters/ai-chat-history/vendors/zhipu.js +202 -0
  66. package/lib/adapters/ai-chat-history/wizard-controller.js +473 -0
  67. package/lib/adapters/alipay-bill/alipay-bill-adapter.js +311 -0
  68. package/lib/adapters/alipay-bill/counterparty.js +129 -0
  69. package/lib/adapters/alipay-bill/csv-parser.js +217 -0
  70. package/lib/adapters/alipay-bill/index.js +41 -0
  71. package/lib/adapters/alipay-bill/zip-decryptor.js +111 -0
  72. package/lib/adapters/email-imap/classifier.js +495 -0
  73. package/lib/adapters/email-imap/email-adapter.js +419 -8
  74. package/lib/adapters/email-imap/index.js +42 -0
  75. package/lib/adapters/email-imap/pdf-extractor.js +192 -0
  76. package/lib/adapters/email-imap/templates/bill.js +232 -0
  77. package/lib/adapters/email-imap/templates/government.js +120 -0
  78. package/lib/adapters/email-imap/templates/index.js +78 -0
  79. package/lib/adapters/email-imap/templates/order.js +186 -0
  80. package/lib/adapters/email-imap/templates/other.js +114 -0
  81. package/lib/adapters/email-imap/templates/register.js +113 -0
  82. package/lib/adapters/email-imap/templates/travel.js +157 -0
  83. package/lib/adapters/email-imap/templates/utils.js +275 -0
  84. package/lib/adapters/email-imap/transactions.js +234 -0
  85. package/lib/adapters/messaging-qq/index.js +158 -0
  86. package/lib/adapters/messaging-telegram/index.js +142 -0
  87. package/lib/adapters/messaging-whatsapp/index.js +189 -0
  88. package/lib/adapters/shopping-base/index.js +208 -0
  89. package/lib/adapters/shopping-jd/index.js +150 -0
  90. package/lib/adapters/shopping-meituan/index.js +154 -0
  91. package/lib/adapters/shopping-taobao/index.js +176 -0
  92. package/lib/adapters/social-bilibili/index.js +171 -0
  93. package/lib/adapters/social-douyin/index.js +116 -0
  94. package/lib/adapters/social-kuaishou/index.js +237 -0
  95. package/lib/adapters/social-toutiao/index.js +236 -0
  96. package/lib/adapters/social-weibo/index.js +164 -0
  97. package/lib/adapters/social-xiaohongshu/index.js +96 -0
  98. package/lib/adapters/system-data/disclosure.js +166 -0
  99. package/lib/adapters/system-data/index.js +34 -0
  100. package/lib/adapters/system-data/system-data-adapter.js +344 -0
  101. package/lib/adapters/system-data-android/adapter.js +348 -0
  102. package/lib/adapters/system-data-android/index.js +76 -0
  103. package/lib/adapters/travel-12306/index.js +151 -0
  104. package/lib/adapters/travel-amap/index.js +164 -0
  105. package/lib/adapters/travel-baidu-map/index.js +162 -0
  106. package/lib/adapters/travel-base/index.js +240 -0
  107. package/lib/adapters/travel-ctrip/index.js +151 -0
  108. package/lib/adapters/wechat/bootstrap.js +146 -0
  109. package/lib/adapters/wechat/content-parser.js +326 -0
  110. package/lib/adapters/wechat/db-reader.js +209 -0
  111. package/lib/adapters/wechat/env-probe.js +218 -0
  112. package/lib/adapters/wechat/frida-agent/loader.js +67 -0
  113. package/lib/adapters/wechat/frida-agent/wechat-key-hook.js +126 -0
  114. package/lib/adapters/wechat/index.js +37 -0
  115. package/lib/adapters/wechat/key-extractor.js +158 -0
  116. package/lib/adapters/wechat/key-providers/frida-key-provider.js +244 -0
  117. package/lib/adapters/wechat/key-providers/index.js +22 -0
  118. package/lib/adapters/wechat/key-providers/key-provider-base.js +44 -0
  119. package/lib/adapters/wechat/key-providers/md5-key-provider.js +81 -0
  120. package/lib/adapters/wechat/normalize.js +220 -0
  121. package/lib/adapters/wechat/wechat-adapter.js +205 -0
  122. package/lib/analysis-skills/base.js +113 -0
  123. package/lib/analysis-skills/footprint.js +167 -0
  124. package/lib/analysis-skills/index.js +58 -0
  125. package/lib/analysis-skills/interests.js +161 -0
  126. package/lib/analysis-skills/relations.js +226 -0
  127. package/lib/analysis-skills/spending.js +219 -0
  128. package/lib/analysis-skills/timeline.js +167 -0
  129. package/lib/analysis.js +191 -2
  130. package/lib/entity-resolver/embedding-stage.js +198 -0
  131. package/lib/entity-resolver/entity-resolver.js +384 -0
  132. package/lib/entity-resolver/index.js +42 -0
  133. package/lib/entity-resolver/llm-stage.js +191 -0
  134. package/lib/entity-resolver/rule-stage.js +208 -0
  135. package/lib/entity-resolver/worker.js +149 -0
  136. package/lib/index.js +131 -0
  137. package/lib/migrations.js +73 -0
  138. package/lib/mobile-extractor/android.js +193 -0
  139. package/lib/mobile-extractor/index.js +9 -0
  140. package/lib/mobile-extractor/ios.js +223 -0
  141. package/lib/prompt-builder.js +11 -1
  142. package/lib/query-parser.js +7 -1
  143. package/lib/registry.js +42 -0
  144. package/lib/sidecar/index.js +15 -0
  145. package/lib/sidecar/supervisor.js +359 -0
  146. package/lib/vault.js +343 -0
  147. package/package.json +36 -3
  148. package/scripts/_make-fixture-all.js +126 -0
  149. package/scripts/_make-fixture-contacts.js +84 -0
  150. package/scripts/evaluate-entity-resolver.js +213 -0
  151. package/scripts/smoke-phase-5-5.js +196 -0
  152. package/scripts/smoke-phase-5-7.js +181 -0
  153. package/scripts/smoke-system-data-contacts.js +309 -0
  154. package/scripts/smoke-system-data.js +312 -0
@@ -0,0 +1,311 @@
1
+ /**
2
+ * AlipayBillAdapter — Phase 6 of the Personal Data Hub.
3
+ *
4
+ * **Not a server-pull adapter** — Alipay has no public API. Users export
5
+ * a CSV bill from the Alipay app (我的 → 账单 → 开具交易流水证明 → 发到
6
+ * 邮箱), then drop the resulting `alipay_record_*.zip` into our UI.
7
+ *
8
+ * The adapter's `sync()` therefore takes an explicit `csvPath` (or
9
+ * `zipPath` + password) opt rather than auto-fetching. Registry calls
10
+ * with no opt → no-op (returns immediately). UI drives sync per-file.
11
+ *
12
+ * Watermark: Alipay CSVs are full-month exports; no incremental
13
+ * server-side state. We dedupe via `source.originalId = txId` so re-
14
+ * importing the same CSV produces 0 new events. Watermark is only
15
+ * informational ("last imported file hash + row count").
16
+ */
17
+
18
+ "use strict";
19
+
20
+ const fs = require("node:fs");
21
+ const crypto = require("node:crypto");
22
+
23
+ const { EVENT_SUBTYPES, PERSON_SUBTYPES, CAPTURED_BY } = require("../../constants");
24
+ const { newId } = require("../../ids");
25
+ const { parseAlipayCsvBuffer } = require("./csv-parser");
26
+ const { extractCsvFromZip } = require("./zip-decryptor");
27
+ const {
28
+ classifyCounterparty,
29
+ counterpartyToPersonId,
30
+ } = require("./counterparty");
31
+
32
// Adapter identifier. Stamped on progress events, raw events and on
// `source.adapter` of every normalized entity.
const NAME = "alipay-bill";
// Adapter version, exposed as `adapter.version` and recorded as
// `source.adapterVersion` on normalized entities.
const VERSION = "0.1.0"; // Phase 6 — initial CSV-import adapter
34
+
35
/**
 * Translate Alipay's `类型` label into a UnifiedSchema Event.subtype
 * (design doc §4.4).
 *
 * Keyword rules are checked in order and the first hit wins. When no rule
 * matches, the direction cell decides between "income" and "payment".
 *
 * @param {*} alipayType  raw type label; falsy values are treated as ""
 * @param {string} direction  raw direction cell; only "收入" maps to "income"
 * @returns {string} the event subtype
 */
function mapAlipayTypeToSubtype(alipayType, direction) {
  const label = String(alipayType || "");
  // [keywords, subtype] — order matters, earlier rules take precedence.
  const keywordRules = [
    [["转账"], "transfer"],
    [["退款"], "refund"],
    [["理财", "余额宝"], "investment"],
    [["红包"], "redenvelope"],
    [["缴费"], "utility"],
    [["交易关闭", "交易失败"], "cancelled"],
  ];
  for (const [keywords, subtype] of keywordRules) {
    if (keywords.some((keyword) => label.includes(keyword))) {
      return subtype;
    }
  }
  if (direction === "收入") {
    return "income";
  }
  return "payment";
}
49
+
50
/**
 * File-import adapter for Alipay bill exports.
 *
 * `sync()` reads a user-supplied CSV (or a ZIP containing one) and yields
 * one raw event per parsed transaction row; `normalize()` converts a raw
 * event into a batch of Event / Person / Item entities.
 */
class AlipayBillAdapter {
  /**
   * @param {object} opts
   * @param {object} opts.account  account descriptor; `account.email` must
   *   be a non-empty string
   * @param {string} [opts.zipPassword]  default password handed to the ZIP
   *   extractor; `sync()` may override it per call
   * @param {Function} [opts.csvParser]  test seam; defaults to
   *   parseAlipayCsvBuffer
   * @param {Function} [opts.zipExtractor]  test seam; defaults to
   *   extractCsvFromZip
   * @throws {Error} when opts, opts.account or account.email is missing
   */
  constructor(opts) {
    if (!opts || typeof opts !== "object") {
      throw new Error("AlipayBillAdapter: opts required");
    }
    const account = opts.account;
    if (!account || typeof account !== "object") {
      throw new Error("AlipayBillAdapter: opts.account required");
    }
    if (typeof account.email !== "string" || account.email.length === 0) {
      throw new Error("AlipayBillAdapter: account.email required (Alipay account identifier)");
    }
    this.account = account;
    // ZIP password (= 身份证后 6 位 by default). Optional — if the user's
    // export is unencrypted (rare) or they extract manually first, pass
    // csvPath at sync() time.
    this._zipPassword = typeof opts.zipPassword === "string" ? opts.zipPassword : null;
    // Test seams
    this._csvParser = typeof opts.csvParser === "function" ? opts.csvParser : parseAlipayCsvBuffer;
    this._zipExtractor = typeof opts.zipExtractor === "function" ? opts.zipExtractor : extractCsvFromZip;

    this.name = NAME;
    this.version = VERSION;
    this.capabilities = ["import:csv-zip", "parse:transactions"];
    this.rateLimits = {};
    this.dataDisclosure = {
      fields: [
        "alipay:txId, createdAt, paidAt, counterparty, itemName, amount, direction, status, note",
      ],
      sensitivity: "high",
      legalGate: false,
    };
  }

  /**
   * No server auth — the adapter is always "ok" once configured.
   *
   * @returns {Promise<{ok: boolean, account: string, provider: string}>}
   */
  async authenticate(_ctx = {}) {
    return { ok: true, account: this.account.email, provider: "alipay-bill" };
  }

  /**
   * Always reports healthy; no I/O is performed.
   *
   * @returns {Promise<{ok: boolean, lastChecked: number}>}
   */
  async healthCheck() {
    return { ok: true, lastChecked: Date.now() };
  }

  /**
   * Import one bill file and yield a raw event per transaction row.
   *
   * Driven by an explicit file path: when neither `zipPath` nor `csvPath`
   * is a string the generator finishes immediately without yielding, which
   * is what a periodic registry sync with no options will hit.
   * `zipPath` takes precedence when both are given.
   *
   * @param {object} opts
   * @param {string} [opts.zipPath]      full path to alipay_record_*.zip
   * @param {string} [opts.csvPath]      full path to a pre-extracted .csv
   * @param {string} [opts.zipPassword]  overrides constructor zipPassword
   * @param {Function} [opts.onProgress] receives `{ phase, adapter, ... }`
   *   for the phases opening / parsing / parsed / row / done
   * @yields {object} raw event as built by `_rowToRawEvent`
   */
  async *sync(opts = {}) {
    const zipPath = typeof opts.zipPath === "string" ? opts.zipPath : null;
    const csvPath = typeof opts.csvPath === "string" ? opts.csvPath : null;
    if (!zipPath && !csvPath) {
      // Idle — no file to import this run
      return;
    }

    const onProgress = typeof opts.onProgress === "function" ? opts.onProgress : null;
    const emit = (phase, payload = {}) => {
      if (!onProgress) return;
      try {
        onProgress({ phase, adapter: NAME, ...payload });
      } catch (_e) {
        // Progress reporting is best-effort: a throwing listener must not
        // abort the import.
      }
    };

    emit("opening", { zipPath, csvPath });

    let csvBuffer;
    let sourceFile;
    if (zipPath) {
      const password = typeof opts.zipPassword === "string" ? opts.zipPassword : this._zipPassword;
      const out = await this._zipExtractor(zipPath, { password });
      csvBuffer = out.buffer;
      sourceFile = `${zipPath}::${out.filename}`;
    } else {
      csvBuffer = fs.readFileSync(csvPath);
      sourceFile = csvPath;
    }
    const fileSha256 = crypto.createHash("sha256").update(csvBuffer).digest("hex");
    emit("parsing", { sourceFile, fileSha256, bytes: csvBuffer.length });

    const parsed = this._csvParser(csvBuffer);
    emit("parsed", {
      sourceFile,
      encoding: parsed.encoding,
      rows: parsed.rows.length,
      header: parsed.header,
      // Pass the parser's diagnostic through (e.g. header row not found) so
      // a malformed file does not look like a clean import of zero rows.
      ...(parsed.warning ? { warning: parsed.warning } : {}),
    });

    let yielded = 0;
    for (const row of parsed.rows) {
      emit("row", { current: yielded + 1, total: parsed.rows.length, txId: row.txId });
      yield this._rowToRawEvent(row, {
        sourceFile,
        fileSha256,
        accountEmail: this.account.email,
        importedAt: Date.now(),
        billPeriod: parsed.header,
      });
      yielded += 1;
    }

    emit("done", { yielded, sourceFile });
  }

  /**
   * normalize(raw) → NormalizedBatch (Event + Persons + Items).
   * Per design doc §5.4.
   *
   * @param {object} raw  raw event as yielded by `sync()`; must carry
   *   `payload.row`
   * @returns {{events: object[], persons: object[], places: object[], items: object[], topics: object[]}}
   * @throws {Error} when `raw`, `raw.payload` or `payload.row` is missing
   */
  normalize(raw) {
    if (!raw || typeof raw !== "object" || !raw.payload) {
      throw new Error("AlipayBillAdapter.normalize: missing raw or raw.payload");
    }
    const row = raw.payload.row;
    if (!row || typeof row !== "object") {
      throw new Error("AlipayBillAdapter.normalize: payload.row missing");
    }

    // Parse the amount once and fall back to 0 when it is not a finite
    // number, so the Event and the Item never disagree and NaN is never
    // written into a price.
    const parsedAmount = parseFloat(row.amount);
    const amount = Number.isFinite(parsedAmount) ? parsedAmount : 0;
    const occurredAt = parseAlipayDateTime(row.paidAt) || parseAlipayDateTime(row.createdAt) || raw.capturedAt || Date.now();

    // Counterparty → Person (with stable id for dedup)
    const counterpartyId = counterpartyToPersonId(row.counterparty);
    const counterpartyKind = classifyCounterparty(row.counterparty);
    const direction = row.direction === "收入" ? "in" : "out";

    const subtype = mapAlipayTypeToSubtype(row.alipayType, row.direction);
    const eventId = newId();

    // Closed / failed transactions are NOT dropped: they are kept but
    // forced to the "cancelled" subtype so they can be told apart from
    // transactions that actually happened.
    const isCancelled = subtype === "cancelled" || /关闭|失败/.test(row.status || "");

    const ingestedAt = Date.now();
    const source = {
      adapter: NAME,
      adapterVersion: VERSION,
      originalId: row.txId,
      capturedAt: raw.capturedAt || occurredAt,
      capturedBy: CAPTURED_BY ? (CAPTURED_BY.EXPORT || "export") : "export",
    };

    const event = {
      id: eventId,
      type: "event",
      subtype: isCancelled ? "cancelled" : subtype,
      occurredAt,
      actor: direction === "out" ? "person-self" : counterpartyId,
      participants: [counterpartyId, "person-self"].filter(Boolean),
      content: {
        title: row.itemName || row.alipayType || row.counterparty,
        ...(row.note ? { text: row.note } : {}),
        amount: {
          value: amount,
          currency: "CNY",
          direction,
        },
      },
      ingestedAt,
      source,
      extra: {
        alipayType: row.alipayType,
        sourceChannel: row.sourceChannel || undefined,
        merchantOrderNumber: row.merchantOrderNumber || undefined,
        txStatus: row.status,
        serviceFee: parseFloat(row.serviceFee || "0") || 0,
        refundedAmount: parseFloat(row.refundedAmount || "0") || 0,
        fundStatus: row.fundStatus || undefined,
        accountEmail: raw.payload.accountEmail,
        fileSha256: raw.payload.fileSha256,
        billPeriod: raw.payload.billPeriod || undefined,
        counterpartyKind,
        // Phase 11 SpendingSkill + Phase 8 EntityResolver both index on
        // extra.counterparty — surface the human-readable name here so
        // analysis skill breakdowns group by 商家 / 转账对方 correctly.
        counterparty: row.counterparty || undefined,
        ...(counterpartyKind === "unknown" ? { needsResolve: true } : {}),
      },
    };

    // Anything not classified as "contact" (including "unknown") is stored
    // with the merchant subtype; unknown ones are flagged via needsResolve.
    const persons = [{
      id: counterpartyId,
      type: "person",
      subtype: counterpartyKind === "contact"
        ? (PERSON_SUBTYPES ? (PERSON_SUBTYPES.CONTACT || "contact") : "contact")
        : (PERSON_SUBTYPES ? (PERSON_SUBTYPES.MERCHANT || "merchant") : "merchant"),
      names: [row.counterparty || "(unknown)"],
      identifiers: {},
      ingestedAt,
      source,
      extra: {
        ...(counterpartyKind === "unknown" ? { needsResolve: true } : {}),
        firstSeenAt: occurredAt,
      },
    }];

    // Item (only when an itemName is present and not just an alipayType)
    const items = [];
    if (row.itemName && row.itemName !== row.alipayType) {
      items.push({
        id: newId(),
        type: "item",
        subtype: "product",
        name: row.itemName,
        price: { value: amount, currency: "CNY" },
        merchant: counterpartyId,
        ingestedAt,
        source,
        extra: {
          sourceEventId: eventId,
        },
      });
    }

    return { events: [event], persons, places: [], items, topics: [] };
  }

  /**
   * Wrap one parsed CSV row into the raw-event envelope yielded by `sync()`.
   * `capturedAt` prefers the row's paid time, then its created time, then
   * the import time.
   *
   * @param {object} row  parsed transaction row
   * @param {object} ctx  `{ sourceFile, fileSha256, accountEmail, importedAt, billPeriod }`
   * @returns {object} raw event
   */
  _rowToRawEvent(row, ctx) {
    return {
      adapter: NAME,
      originalId: row.txId,
      capturedAt: parseAlipayDateTime(row.paidAt) || parseAlipayDateTime(row.createdAt) || ctx.importedAt,
      payload: {
        row,
        accountEmail: ctx.accountEmail,
        sourceFile: ctx.sourceFile,
        fileSha256: ctx.fileSha256,
        importedAt: ctx.importedAt,
        billPeriod: ctx.billPeriod,
      },
    };
  }
}
288
+
289
+ // ─── helpers ────────────────────────────────────────────────────────────
290
+
291
/**
 * Parse an Alipay timestamp such as "2024-04-01 09:23:13" into ms since
 * the epoch, interpreted as local time (Alipay strings carry no timezone
 * marker).
 *
 * Surrounding whitespace is ignored and the first run of whitespace is
 * taken as the date/time separator. A date-only value ("2024-04-01") is
 * pinned to local midnight: left as-is, `Date` would read it as UTC and
 * break the local-time contract.
 *
 * @param {*} s  candidate timestamp
 * @returns {number|null} epoch milliseconds, or null when `s` is not a
 *   non-blank string or cannot be parsed
 */
function parseAlipayDateTime(s) {
  if (typeof s !== "string") return null;
  const trimmed = s.trim();
  if (trimmed.length === 0) return null;
  // "YYYY-MM-DD HH:mm:ss" → "YYYY-MM-DDTHH:mm:ss" so Date can parse it.
  let iso = trimmed.replace(/\s+/, "T");
  if (/^\d{4}-\d{2}-\d{2}$/.test(iso)) {
    // Date-only ISO strings are UTC by spec; adding a time makes it local.
    iso += "T00:00:00";
  }
  const t = new Date(iso).getTime();
  return Number.isFinite(t) ? t : null;
}
304
+
305
// Public surface of the adapter module: the adapter class, its two pure
// helpers, and the adapter name/version constants.
module.exports = {
  AlipayBillAdapter,
  mapAlipayTypeToSubtype,
  parseAlipayDateTime,
  NAME,
  VERSION,
};
@@ -0,0 +1,129 @@
1
+ /**
2
+ * Phase 6 — counterparty (交易对方) classifier.
3
+ *
4
+ * Design doc §5.5 simplified resolver — full Phase 8 EntityResolver
5
+ * will replace this with the embedding+LLM pipeline. v0 strategy:
6
+ *
7
+ * 1. KNOWN_MERCHANTS membership / substring → "merchant"
8
+ * 2. Heuristic suffix (公司 / 店 / 服务 / etc.) → "merchant"
9
+ * 3. 2-4 字纯中文 → "contact" (likely a personal name)
10
+ * 4. Default → "unknown"
11
+ *
12
+ * The "unknown" bucket lets Phase 8 EntityResolver pick these up later.
13
+ * `needs_resolve: true` is stamped onto Person.extra so a future job can
14
+ * find them via WHERE clause.
15
+ */
16
+
17
+ "use strict";
18
+
19
/**
 * v1 well-known Chinese consumer merchant whitelist. Covers ~80% of
 * Alipay transaction counterparties for the typical urban user.
 * Maintained sorted-ish by category for human readability.
 *
 * classifyCounterparty() treats each entry as a substring to look for in
 * the cleaned counterparty name, so short entries also match longer names
 * that contain them.
 *
 * NOTE: "国家电网" is listed under both Telco / utility and Government;
 * the Set stores it once, so the duplicate has no runtime effect.
 */
const KNOWN_MERCHANTS = new Set([
  // ── E-commerce ───────────────────────────────────────────────────
  "淘宝", "天猫", "京东", "京东商城", "拼多多", "苏宁易购", "唯品会",
  "蘑菇街", "考拉海购", "网易严选", "得物", "小红书", "1号店",
  "Amazon", "亚马逊",
  // ── Food / delivery / dining ─────────────────────────────────────
  "美团", "美团外卖", "饿了么", "大众点评", "盒马", "肯德基", "麦当劳",
  "星巴克", "瑞幸咖啡", "蜜雪冰城", "海底捞", "Shake Shack",
  "Costa", "Tim Hortons", "汉堡王", "永和大王", "外婆家", "西贝",
  // ── Transport / travel ───────────────────────────────────────────
  "滴滴", "滴滴出行", "曹操出行", "T3 出行", "高德", "高德地图",
  "百度地图", "12306", "携程", "去哪儿", "同程", "飞猪", "途牛",
  "驴妈妈", "哈啰", "青桔", "美团单车", "摩拜",
  // ── Telco / utility ──────────────────────────────────────────────
  "国家电网", "中国移动", "中国联通", "中国电信", "中国铁通",
  "燃气公司", "水务局", "自来水公司", "燃气集团",
  "公积金", "社保",
  // ── Media / streaming ────────────────────────────────────────────
  "爱奇艺", "腾讯视频", "优酷", "B站", "哔哩哔哩", "芒果 TV",
  "网易云音乐", "QQ 音乐", "酷狗", "酷我音乐",
  // ── Finance / platforms ──────────────────────────────────────────
  "支付宝", "蚂蚁财富", "余额宝", "花呗", "借呗", "网商银行",
  "微信支付",
  // ── Health / pharmacy ────────────────────────────────────────────
  "京东健康", "阿里健康", "丁香医生", "平安好医生", "美年大健康",
  // ── Retail brick-and-mortar ──────────────────────────────────────
  "沃尔玛", "永辉超市", "华润万家", "家乐福", "大润发", "山姆会员店",
  "便利蜂", "全家", "罗森", "7-Eleven",
  // ── Apple / Google / SaaS ────────────────────────────────────────
  "App Store", "Apple", "iCloud", "Google Play",
  // ── Cosmetics / fashion ──────────────────────────────────────────
  "屈臣氏", "丝芙兰", "优衣库", "ZARA", "H&M", "Nike", "Adidas",
  // ── Education / digital ──────────────────────────────────────────
  "得到", "极客时间", "知乎", "在行", "腾讯课堂", "网易公开课",
  // ── Government ───────────────────────────────────────────────────
  "国家税务总局", "税务局", "国家电网", "公安局", "车管所", "民政局",
]);
61
+
62
// Regex for heuristic merchant matching (company / shop / service words).
// NOTE: despite the "suffix" name the pattern is not anchored, so it
// matches these words anywhere in the name, not only at the end.
const MERCHANT_SUFFIX_RE = /(公司|集团|有限|股份|店|超市|药房|药店|医院|诊所|学校|学院|大学|加油站|银行|证券|保险|基金|管理处|物业|餐厅|酒店|宾馆|快递|物流|科技)/;

// Person name heuristic: the whole string is 2-4 characters from the
// 一-龥 range (Chinese characters), with no other text mixed in.
const PERSONAL_NAME_RE = /^[一-龥]{2,4}$/;
67
+
68
/**
 * Clean an Alipay counterparty string before classification.
 *
 * Some counterparties look like "**先生(189****1234)" or
 * "***公司 北京分公司": any parenthesised tail (ASCII or fullwidth Chinese
 * parentheses) and every run of `*` mask characters are removed, then the
 * result is trimmed.
 *
 * @param {*} name  raw counterparty cell
 * @returns {string} cleaned name; "" when `name` is not a string
 */
function normalizeCounterpartyName(name) {
  if (typeof name !== "string") return "";
  return name
    .replace(/\([^)]*\)/g, "") // ASCII () with content
    // Fullwidth Chinese parens （…）. Written as \uFF08 / \uFF09 escapes so
    // the pattern cannot degrade into a bare capture group — which would
    // delete the whole name — if the file's fullwidth characters are ever
    // normalized to ASCII.
    .replace(/\uFF08[^\uFF09]*\uFF09/g, "")
    .replace(/\*+/g, "") // mask characters
    .trim();
}
78
+
79
/**
 * Decide whether a counterparty string denotes a merchant, a personal
 * contact, or something we cannot tell.
 *
 * The name is cleaned first; an empty result is "unknown". A name that
 * contains any KNOWN_MERCHANTS entry, or matches the merchant-word regex,
 * is a "merchant". Otherwise a name matching the personal-name regex is a
 * "contact", and anything else is "unknown".
 *
 * @param {string} rawName
 * @returns {"merchant"|"contact"|"unknown"}
 */
function classifyCounterparty(rawName) {
  const cleaned = normalizeCounterpartyName(rawName);
  if (cleaned.length === 0) {
    return "unknown";
  }

  const containsKnownMerchant = [...KNOWN_MERCHANTS].some((merchant) => cleaned.includes(merchant));
  if (containsKnownMerchant || MERCHANT_SUFFIX_RE.test(cleaned)) {
    return "merchant";
  }

  return PERSONAL_NAME_RE.test(cleaned) ? "contact" : "unknown";
}
102
+
103
/**
 * Get a stable Person.id for a counterparty so repeat imports dedup
 * by name. Phase 8 EntityResolver may later merge multiple ids into
 * one — but for v0 same-name → same-id is the right default.
 *
 * The id is the fixed prefix "person-alipay-" followed by the slugified,
 * cleaned name; no hashing is involved.
 *
 * NOTE(review): a name that cleans down to "" (non-string, blank, or fully
 * masked) yields the bare prefix, so all such counterparties share one id
 * — confirm this is intended.
 *
 * @param {string} rawName
 * @returns {string}
 */
function counterpartyToPersonId(rawName) {
  const name = normalizeCounterpartyName(rawName);
  // Slugifying lower-cases the name, turns whitespace runs into "-" and
  // drops characters outside the allowed set, so whitespace and case
  // variations of the same name collapse to one id.
  return `person-alipay-${slugify(name)}`;
}
115
+
116
/**
 * Turn a name into an id-safe slug: lower-case it, replace each run of
 * whitespace with "-", drop every character that is not a word character,
 * in the 一-鿿 range, or "-", and cap the result at 80 characters.
 *
 * @param {*} s  value to slugify; falsy values become ""
 * @returns {string}
 */
function slugify(s) {
  const lowered = String(s || "").toLowerCase();
  const dashed = lowered.replace(/\s+/g, "-");
  const safe = dashed.replace(/[^\w一-鿿-]/g, "");
  return safe.slice(0, 80);
}
123
+
124
// Public surface of the counterparty module. `slugify` and the two
// heuristic regexes stay module-private.
module.exports = {
  KNOWN_MERCHANTS,
  classifyCounterparty,
  counterpartyToPersonId,
  normalizeCounterpartyName,
};
@@ -0,0 +1,217 @@
1
+ /**
2
+ * Phase 6 — Alipay 账单 CSV 解析器
3
+ *
4
+ * 支付宝 "开具交易流水证明" 导出的 CSV 格式(GBK 默认,新版部分 UTF-8 BOM):
5
+ *
6
+ * 行 1 `支付宝交易记录明细查询`
7
+ * 行 2 `账号:[email@example.com / 13800001111]`
8
+ * 行 3 `起始日期:[2024-04-01 00:00:00] 终止日期:[2024-05-01 00:00:00]`
9
+ * 行 4 `-------------------交易记录明细列表-------------------`
10
+ * 行 5 `交易号,商家订单号,交易创建时间,付款时间,...` ← header
11
+ * 行 6+ 数据行
12
+ * 末尾 `-------------------交易记录明细列表结束-------------------`
13
+ * 再 汇总文本("导出时间"、"用户姓名" 等元数据)— 跳过
14
+ *
15
+ * 设计选择:
16
+ * 1. 手写 parser(不引 csv-parse)。Alipay CSV 字段都用半角逗号,
17
+ * 字段内不嵌逗号(商品名含逗号也会被 Alipay 转义为中文 , 或省略),
18
+ * Naive split 已足够,单测覆盖 50+ 真实样本。
19
+ * 2. 编码:先尝 UTF-8 decode 看是否含合理的中文 magic 字符串
20
+ * ("交易号" / "支付宝");含 → UTF-8;否则降级 GBK(via iconv-lite)。
21
+ * 3. 终止:碰到 "交易记录明细列表结束" 或下一个非数据行(不含逗号或
22
+ * 首字段不是 yyyy 开头)。
23
+ *
24
+ * 返回 `{ header: {...meta}, rows: [...] }`:
25
+ * - header.account `email@example.com` 或手机
26
+ * - header.startDate ISO-ish string
27
+ * - header.endDate
28
+ * - rows RawTransaction 数组(design doc §5.3 形状)
29
+ */
30
+
31
+ "use strict";
32
+
33
+ /** @typedef {import('./types').RawTransaction} RawTransaction */
34
+
35
// Column order of a data row. parseAlipayCsv() assigns the j-th CSV field
// of each row to the key FIELD_ORDER[j], so this list must stay aligned
// with the column header line of the export.
const FIELD_ORDER = [
  "txId",
  "merchantOrderNumber",
  "createdAt",
  "paidAt",
  "lastModifiedAt",
  "sourceChannel",
  "alipayType",
  "counterparty",
  "itemName",
  "amount",
  "direction",
  "status",
  "serviceFee",
  "refundedAmount",
  "note",
  "fundStatus",
];

const MAGIC_HEADER_ROW = "交易号"; // header line starts with this
55
+
56
/**
 * Decode CSV bytes to text: UTF-8 first, falling back to GBK.
 *
 * A leading UTF-8 BOM (EF BB BF) is dropped. The bytes are then decoded as
 * UTF-8 and accepted if the text contains one of the expected Alipay
 * tokens ("交易号" or "支付宝交易记录"). Otherwise they are handed to the
 * GBK decoder: `opts.iconvImpl` when supplied, else iconv-lite.
 *
 * @param {Buffer} buf
 * @param {{ iconvImpl?: Function }} [opts]  inject `(buffer, encoding) => string` for tests
 * @returns {{ text: string, encoding: string }}  encoding is "utf-8" or "gbk"
 * @throws {Error} when `buf` is not a Buffer, or when the GBK fallback is
 *   needed and iconv-lite cannot be loaded
 */
function decodeBuffer(buf, opts = {}) {
  if (!Buffer.isBuffer(buf)) {
    throw new Error("decodeBuffer: Buffer required");
  }
  // Strip BOM if present (UTF-8 BOM = EF BB BF). `subarray` is the
  // non-deprecated spelling of Buffer#slice and likewise returns a view
  // without copying.
  const hasBom = buf.length >= 3 && buf[0] === 0xef && buf[1] === 0xbb && buf[2] === 0xbf;
  const work = hasBom ? buf.subarray(3) : buf;

  const utf8 = work.toString("utf-8");
  // UTF-8 confidence check: does it contain expected Alipay header tokens?
  if (utf8.includes("交易号") || utf8.includes("支付宝交易记录")) {
    return { text: utf8, encoding: "utf-8" };
  }
  // Fall back to GBK
  const iconv = typeof opts.iconvImpl === "function" ? opts.iconvImpl : loadIconvLite();
  const decoded = iconv(work, "gbk");
  return { text: decoded, encoding: "gbk" };
}
82
+
83
// Memoized `(buffer, encoding) => string` decoder backed by iconv-lite;
// stays null until the first successful load.
let _iconvCache = null;

/**
 * Lazily load iconv-lite and return a decode function bound to it. The
 * function is created once and reused on later calls.
 *
 * @returns {Function} `(buf, enc) => string`
 * @throws {Error} when requiring iconv-lite fails
 */
function loadIconvLite() {
  if (!_iconvCache) {
    let iconvLite;
    try {
      // eslint-disable-next-line global-require
      iconvLite = require("iconv-lite");
    } catch (err) {
      throw new Error(
        `iconv-lite not installed — Alipay CSV needs it for GBK decode. ${err && err.message ? err.message : err}`,
      );
    }
    _iconvCache = (buf, enc) => iconvLite.decode(buf, enc);
  }
  return _iconvCache;
}
97
+
98
/**
 * Parse a decoded Alipay CSV text → { header, rows }.
 *
 * Step 1 scans the preamble for the account and the start/end dates and
 * stops at the column-header line (the first line that starts with
 * "交易号" once a leading BOM or whitespace is ignored).
 * Step 2 reads data rows after that line until a terminator is reached.
 *
 * @param {string} text
 * @returns {{ header: object, rows: RawTransaction[], warning?: string }}
 *   `header` may carry `account`, `startDate`, `endDate`; `warning` is set
 *   only when no column-header line was found (rows is then empty)
 */
function parseAlipayCsv(text) {
  if (typeof text !== "string" || text.length === 0) {
    return { header: {}, rows: [] };
  }
  const lines = text.split(/\r?\n/);
  const header = {};

  // ── Step 1: scan preamble for account + date range, then find header row idx
  // The label separator may be an ASCII ":" or a fullwidth "：" (\uFF1A).
  let headerIdx = -1;
  for (let i = 0; i < lines.length; i += 1) {
    const line = lines[i];
    // Match account: 账号:[email@... / phone]
    const acctMatch = line.match(/账号\s*[:\uFF1A]?\s*\[?([^\]\s]+@[^\]\s]+|\d{11})\]?/);
    if (acctMatch) header.account = acctMatch[1];
    // Match date range: 起始日期:[2024-04-01 00:00:00] 终止日期:[2024-05-01 00:00:00]
    const startMatch = line.match(/起始日期\s*[:\uFF1A]?\s*\[?([\d-]+\s+[\d:]+)\]?/);
    if (startMatch) header.startDate = startMatch[1];
    const endMatch = line.match(/终止日期\s*[:\uFF1A]?\s*\[?([\d-]+\s+[\d:]+)\]?/);
    if (endMatch) header.endDate = endMatch[1];
    // Detect the column-header line. trimStart() also removes a leading
    // U+FEFF, so text passed in with its BOM still intact is recognised.
    if (line.trimStart().startsWith(MAGIC_HEADER_ROW)) {
      headerIdx = i;
      break;
    }
  }
  if (headerIdx === -1) {
    // No "交易号" header line — file is malformed / empty / not an Alipay CSV
    return { header, rows: [], warning: "header row '交易号,...' not found" };
  }

  // ── Step 2: parse rows after headerIdx until the terminator line
  const rows = [];
  for (let i = headerIdx + 1; i < lines.length; i += 1) {
    const line = lines[i];
    if (!line) continue;
    // The list ends at the "…明细列表结束…" marker or at a dash rule. Only a
    // line that STARTS with dashes counts as a rule: a data row that merely
    // contains "---" (e.g. in an item name) must not end parsing and drop
    // every transaction after it.
    if (line.includes("交易记录明细列表结束") || /^\s*-{3,}/.test(line)) break;
    // A full data line has 16 fields (15 commas). Lines with fewer than 10
    // commas are treated as non-data (e.g. trailing metadata like
    // "导出时间:...") and skipped; short data lines are padded below.
    const commas = (line.match(/,/g) || []).length;
    if (commas < 10) continue;

    const fields = splitCsvLine(line);
    // Lenient: pad with empty strings to match the schema
    while (fields.length < FIELD_ORDER.length) fields.push("");

    const row = {};
    for (let j = 0; j < FIELD_ORDER.length; j += 1) {
      row[FIELD_ORDER[j]] = fields[j] != null ? fields[j].trim() : "";
    }
    // Skip empty-id rows
    if (!row.txId) continue;
    rows.push(row);
  }

  return { header, rows };
}
161
+
162
/**
 * Split one CSV line into its fields.
 *
 * Alipay rows are normally unquoted, so a line without any `"` is split on
 * plain commas. Lines that do contain a `"` go through a small scanner:
 * commas inside double quotes stay part of the field, a doubled `""`
 * inside quotes yields a literal `"`, and the quote characters themselves
 * are not kept.
 *
 * Exported for unit tests.
 *
 * @param {string} line
 * @returns {string[]}
 */
function splitCsvLine(line) {
  if (line.indexOf('"') === -1) {
    return line.split(",");
  }

  const fields = [];
  let buffer = "";
  let quoted = false;
  let pos = 0;
  while (pos < line.length) {
    const c = line.charAt(pos);
    if (c === '"' && quoted && line.charAt(pos + 1) === '"') {
      // Escaped quote inside a quoted field: emit one `"`, consume both.
      buffer += '"';
      pos += 2;
      continue;
    }
    if (c === '"') {
      quoted = !quoted;
    } else if (c === "," && !quoted) {
      fields.push(buffer);
      buffer = "";
    } else {
      buffer += c;
    }
    pos += 1;
  }
  fields.push(buffer);
  return fields;
}
196
+
197
/**
 * Top-level entry point: decode raw CSV bytes and parse the resulting
 * text, returning the parse result together with the detected encoding.
 *
 * @param {Buffer} buf  the CSV file bytes
 * @param {{ iconvImpl?: Function }} [opts]  forwarded to decodeBuffer
 * @returns {{ encoding: string, header: object, rows: RawTransaction[] }}
 */
function parseAlipayCsvBuffer(buf, opts = {}) {
  const decoded = decodeBuffer(buf, opts);
  return { encoding: decoded.encoding, ...parseAlipayCsv(decoded.text) };
}
210
+
211
// Public surface of the CSV parser module. `loadIconvLite` and
// `MAGIC_HEADER_ROW` stay module-private.
module.exports = {
  parseAlipayCsv,
  parseAlipayCsvBuffer,
  decodeBuffer,
  splitCsvLine,
  FIELD_ORDER,
};