@chainlesschain/personal-data-hub 0.1.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/__tests__/adapters/ai-chat-cookie-capture-spec.test.js +211 -0
- package/__tests__/adapters/ai-chat-health-checker.test.js +262 -0
- package/__tests__/adapters/ai-chat-history.test.js +396 -0
- package/__tests__/adapters/ai-chat-http-client.test.js +242 -0
- package/__tests__/adapters/ai-chat-vendors.test.js +874 -0
- package/__tests__/adapters/alipay-bill-adapter.test.js +538 -0
- package/__tests__/adapters/email-adapter.test.js +138 -1
- package/__tests__/adapters/email-classifier.test.js +347 -0
- package/__tests__/adapters/email-pdf-extractor.test.js +529 -0
- package/__tests__/adapters/email-retry-progress.test.js +294 -0
- package/__tests__/adapters/email-templates.test.js +699 -0
- package/__tests__/adapters/social-toutiao-kuaishou-scaffold.test.js +269 -0
- package/__tests__/adapters/system-data-adapter.test.js +440 -0
- package/__tests__/adapters/system-data-android-ingest.test.js +144 -0
- package/__tests__/adapters/system-data-android.test.js +387 -0
- package/__tests__/adapters/system-data-disclosure.test.js +153 -0
- package/__tests__/adapters/wechat-bootstrap.test.js +240 -0
- package/__tests__/adapters/wechat-env-probe.test.js +162 -0
- package/__tests__/adapters/wechat-frida-agent.test.js +191 -0
- package/__tests__/adapters/wechat-frida-integration.test.js +149 -0
- package/__tests__/adapters/wechat-frida-key-provider.test.js +188 -0
- package/__tests__/adapters/wechat-md5-key-provider.test.js +101 -0
- package/__tests__/analysis-skills.test.js +556 -0
- package/__tests__/analysis.test.js +329 -1
- package/__tests__/e2e/ai-chat-cross-source-journey.test.js +213 -0
- package/__tests__/e2e/full-user-journey.test.js +188 -0
- package/__tests__/entity-resolver-ingest-hook.test.js +177 -0
- package/__tests__/entity-resolver-stages.test.js +411 -0
- package/__tests__/entity-resolver-vault.test.js +246 -0
- package/__tests__/entity-resolver.test.js +526 -0
- package/__tests__/fixtures/entity-resolver-200-mock.json +96 -0
- package/__tests__/integration/ai-chat-history-registry.test.js +228 -0
- package/__tests__/integration/aichat-wizard-end-to-end.test.js +282 -0
- package/__tests__/integration/cross-adapter-pipelines.test.js +396 -0
- package/__tests__/integration/wechat-bootstrap-end-to-end.test.js +390 -0
- package/__tests__/longtail-adapters.test.js +217 -0
- package/__tests__/mobile-extractor.test.js +288 -0
- package/__tests__/registry.test.js +4 -2
- package/__tests__/shopping-adapters.test.js +296 -0
- package/__tests__/sidecar-contacts-cross-validate.test.js +163 -0
- package/__tests__/sidecar-supervisor.test.js +120 -0
- package/__tests__/social-adapters.test.js +206 -0
- package/__tests__/travel-adapters.test.js +325 -0
- package/__tests__/vault.test.js +3 -3
- package/__tests__/wechat-adapter.test.js +476 -0
- package/__tests__/whatsapp-adapter.test.js +135 -0
- package/lib/adapter-spec.js +12 -0
- package/lib/adapters/_python-sidecar-base.js +207 -0
- package/lib/adapters/ai-chat-history/ai-chat-adapter.js +374 -0
- package/lib/adapters/ai-chat-history/cookie-auth.js +109 -0
- package/lib/adapters/ai-chat-history/cookie-capture-spec.js +331 -0
- package/lib/adapters/ai-chat-history/health-checker.js +210 -0
- package/lib/adapters/ai-chat-history/http-client.js +211 -0
- package/lib/adapters/ai-chat-history/index.js +28 -0
- package/lib/adapters/ai-chat-history/schema-map.js +258 -0
- package/lib/adapters/ai-chat-history/vendor-spec.js +86 -0
- package/lib/adapters/ai-chat-history/vendors/coze.js +179 -0
- package/lib/adapters/ai-chat-history/vendors/deepseek.js +199 -0
- package/lib/adapters/ai-chat-history/vendors/doubao.js +255 -0
- package/lib/adapters/ai-chat-history/vendors/dreamina.js +174 -0
- package/lib/adapters/ai-chat-history/vendors/hunyuan.js +176 -0
- package/lib/adapters/ai-chat-history/vendors/kimi.js +182 -0
- package/lib/adapters/ai-chat-history/vendors/qianfan.js +160 -0
- package/lib/adapters/ai-chat-history/vendors/tongyi.js +193 -0
- package/lib/adapters/ai-chat-history/vendors/zhipu.js +202 -0
- package/lib/adapters/ai-chat-history/wizard-controller.js +473 -0
- package/lib/adapters/alipay-bill/alipay-bill-adapter.js +311 -0
- package/lib/adapters/alipay-bill/counterparty.js +129 -0
- package/lib/adapters/alipay-bill/csv-parser.js +217 -0
- package/lib/adapters/alipay-bill/index.js +41 -0
- package/lib/adapters/alipay-bill/zip-decryptor.js +111 -0
- package/lib/adapters/email-imap/classifier.js +495 -0
- package/lib/adapters/email-imap/email-adapter.js +419 -8
- package/lib/adapters/email-imap/index.js +42 -0
- package/lib/adapters/email-imap/pdf-extractor.js +192 -0
- package/lib/adapters/email-imap/templates/bill.js +232 -0
- package/lib/adapters/email-imap/templates/government.js +120 -0
- package/lib/adapters/email-imap/templates/index.js +78 -0
- package/lib/adapters/email-imap/templates/order.js +186 -0
- package/lib/adapters/email-imap/templates/other.js +114 -0
- package/lib/adapters/email-imap/templates/register.js +113 -0
- package/lib/adapters/email-imap/templates/travel.js +157 -0
- package/lib/adapters/email-imap/templates/utils.js +275 -0
- package/lib/adapters/email-imap/transactions.js +234 -0
- package/lib/adapters/messaging-qq/index.js +158 -0
- package/lib/adapters/messaging-telegram/index.js +142 -0
- package/lib/adapters/messaging-whatsapp/index.js +189 -0
- package/lib/adapters/shopping-base/index.js +208 -0
- package/lib/adapters/shopping-jd/index.js +150 -0
- package/lib/adapters/shopping-meituan/index.js +154 -0
- package/lib/adapters/shopping-taobao/index.js +176 -0
- package/lib/adapters/social-bilibili/index.js +171 -0
- package/lib/adapters/social-douyin/index.js +116 -0
- package/lib/adapters/social-kuaishou/index.js +237 -0
- package/lib/adapters/social-toutiao/index.js +236 -0
- package/lib/adapters/social-weibo/index.js +164 -0
- package/lib/adapters/social-xiaohongshu/index.js +96 -0
- package/lib/adapters/system-data/disclosure.js +166 -0
- package/lib/adapters/system-data/index.js +34 -0
- package/lib/adapters/system-data/system-data-adapter.js +344 -0
- package/lib/adapters/system-data-android/adapter.js +348 -0
- package/lib/adapters/system-data-android/index.js +76 -0
- package/lib/adapters/travel-12306/index.js +151 -0
- package/lib/adapters/travel-amap/index.js +164 -0
- package/lib/adapters/travel-baidu-map/index.js +162 -0
- package/lib/adapters/travel-base/index.js +240 -0
- package/lib/adapters/travel-ctrip/index.js +151 -0
- package/lib/adapters/wechat/bootstrap.js +146 -0
- package/lib/adapters/wechat/content-parser.js +326 -0
- package/lib/adapters/wechat/db-reader.js +209 -0
- package/lib/adapters/wechat/env-probe.js +218 -0
- package/lib/adapters/wechat/frida-agent/loader.js +67 -0
- package/lib/adapters/wechat/frida-agent/wechat-key-hook.js +126 -0
- package/lib/adapters/wechat/index.js +37 -0
- package/lib/adapters/wechat/key-extractor.js +158 -0
- package/lib/adapters/wechat/key-providers/frida-key-provider.js +244 -0
- package/lib/adapters/wechat/key-providers/index.js +22 -0
- package/lib/adapters/wechat/key-providers/key-provider-base.js +44 -0
- package/lib/adapters/wechat/key-providers/md5-key-provider.js +81 -0
- package/lib/adapters/wechat/normalize.js +220 -0
- package/lib/adapters/wechat/wechat-adapter.js +205 -0
- package/lib/analysis-skills/base.js +113 -0
- package/lib/analysis-skills/footprint.js +167 -0
- package/lib/analysis-skills/index.js +58 -0
- package/lib/analysis-skills/interests.js +161 -0
- package/lib/analysis-skills/relations.js +226 -0
- package/lib/analysis-skills/spending.js +219 -0
- package/lib/analysis-skills/timeline.js +167 -0
- package/lib/analysis.js +191 -2
- package/lib/entity-resolver/embedding-stage.js +198 -0
- package/lib/entity-resolver/entity-resolver.js +384 -0
- package/lib/entity-resolver/index.js +42 -0
- package/lib/entity-resolver/llm-stage.js +191 -0
- package/lib/entity-resolver/rule-stage.js +208 -0
- package/lib/entity-resolver/worker.js +149 -0
- package/lib/index.js +131 -0
- package/lib/migrations.js +73 -0
- package/lib/mobile-extractor/android.js +193 -0
- package/lib/mobile-extractor/index.js +9 -0
- package/lib/mobile-extractor/ios.js +223 -0
- package/lib/prompt-builder.js +11 -1
- package/lib/query-parser.js +7 -1
- package/lib/registry.js +42 -0
- package/lib/sidecar/index.js +15 -0
- package/lib/sidecar/supervisor.js +359 -0
- package/lib/vault.js +343 -0
- package/package.json +36 -3
- package/scripts/_make-fixture-all.js +126 -0
- package/scripts/_make-fixture-contacts.js +84 -0
- package/scripts/evaluate-entity-resolver.js +213 -0
- package/scripts/smoke-phase-5-5.js +196 -0
- package/scripts/smoke-phase-5-7.js +181 -0
- package/scripts/smoke-system-data-contacts.js +309 -0
- package/scripts/smoke-system-data.js +312 -0
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Phase 8.2 — EntityResolver rule stage.
|
|
3
|
+
*
|
|
4
|
+
* Per docs/design/Personal_Data_Hub_EntityResolver.md §4.1 — the synchronous
|
|
5
|
+
* fast-path. Returns "same" | "different" | "uncertain" by looking only at
|
|
6
|
+
* identifier overlap + naming + source provenance. No external calls, no
|
|
7
|
+
* Ollama, < 5ms p99 even for thousands of candidates.
|
|
8
|
+
*
|
|
9
|
+
* The intent is to cleanly handle the cases where we KNOW the answer:
|
|
10
|
+
* - same identifier (email/phone/wechatId/did) → same
|
|
11
|
+
* - completely disjoint fingerprint → different
|
|
12
|
+
* - same adapter, same name, different originalId → same (adapter
|
|
13
|
+
* produced two rows for one person — rare but happens with
|
|
14
|
+
* un-normalized export channels)
|
|
15
|
+
*
|
|
16
|
+
* Anything else falls through to `uncertain` and gets sent to the
|
|
17
|
+
* async embedding+LLM pipeline.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
"use strict";
|
|
21
|
+
|
|
22
|
+
/**
 * Identifier keys we treat as "if they overlap, they're definitively
 * the same person". These should be globally-unique-per-person values.
 * Keep this list conservative — adding `name` here would catch false
 * positives (different people sharing common Chinese names).
 *
 * Order is significant: findSharedIdentifier() walks the keys in this order
 * and reports the first key on which the two sides share a value.
 */
const STRONG_IDENTIFIER_KEYS = [
  "email",
  "phone",
  "wechatId",
  "alipayUid",
  "did",
  "idHash", // SHA-256 of national ID number (Phase 9+ contribution)
];
|
|
36
|
+
|
|
37
|
+
/**
 * Run the rule stage on a (pending, candidate) pair.
 *
 * Rules are evaluated in this order; the first one that fires wins:
 *   1. invalid input (either side missing / not an object) → "different"
 *   2. both rows carry the same, non-missing `id`           → "same"
 *   3. R1: a strong identifier value is shared               → "same"
 *   4. R3: same adapter, different originalId, shared name,
 *          and no conflicting strong identifier              → "same"
 *   5. R2: countFieldOverlap() is 0                          → "different"
 *   6. R4: anything else                                     → "uncertain"
 *
 * @param {object} a  Person row
 * @param {object} b  Person row
 * @returns {{ verdict: "same"|"different"|"uncertain", reason: string }}
 */
function ruleStage(a, b) {
  if (!a || !b || typeof a !== "object" || typeof b !== "object") {
    return { verdict: "different", reason: "invalid input" };
  }
  // Only a real id can prove identity. Without the null/undefined check two
  // rows that both LACK an id would compare equal (undefined === undefined)
  // and be reported as definitively the same person.
  if (a.id !== undefined && a.id !== null && a.id === b.id) {
    // Same Person row — vacuously "same" but caller should never pair a
    // person with itself. Surface clearly.
    return { verdict: "same", reason: "identical id" };
  }

  // R1. Strong identifier overlap → same
  const sharedKey = findSharedIdentifier(a.identifiers || {}, b.identifiers || {});
  if (sharedKey) {
    return { verdict: "same", reason: `identifier match: ${sharedKey.key}=${sharedKey.value}` };
  }

  // R3. Same adapter, different originalId, sharing a name → same
  //     (catches adapter-internal duplicates where an export contains the
  //     same person under two surface forms — rare but documented in design.)
  //     GUARD: if BOTH sides have a strong identifier of the SAME key but
  //     with DIFFERENT values, they're definitively different people sharing
  //     a common name (homonym-trap). Don't R3-merge.
  if (a.source && b.source
    && a.source.adapter === b.source.adapter
    && a.source.originalId !== b.source.originalId
    && sharesAnyName(a.names, b.names)
    && !hasConflictingIdentifier(a.identifiers || {}, b.identifiers || {})
  ) {
    return {
      verdict: "same",
      reason: `same-adapter (${a.source.adapter}) internal dup: shared name`,
    };
  }

  // R2. Zero overlap on any field → different
  const overlap = countFieldOverlap(a, b);
  if (overlap === 0) {
    return { verdict: "different", reason: "no field overlap" };
  }

  // R4. Otherwise uncertain — send to async pipeline
  return { verdict: "uncertain", reason: `overlap=${overlap}` };
}
|
|
87
|
+
|
|
88
|
+
/**
 * Look for a strong identifier value present on both sides.
 *
 * Keys are tried in STRONG_IDENTIFIER_KEYS order. For each key the values of
 * both maps (a string or an array of strings) are normalized with
 * normalizeIdValue() and compared; values that normalize to an empty string
 * never count as a match. The first hit is returned in normalized form.
 *
 * @param {object} idsA  identifier map of the first Person row
 * @param {object} idsB  identifier map of the second Person row
 * @returns {{key: string, value: string} | null}
 */
function findSharedIdentifier(idsA, idsB) {
  for (const key of STRONG_IDENTIFIER_KEYS) {
    const left = toArray(idsA[key]);
    const right = toArray(idsB[key]);
    if (left.length > 0 && right.length > 0) {
      // Normalize before compare — emails / phones often have surface variance
      const rightNormalized = right.map((raw) => normalizeIdValue(key, raw));
      const hit = left
        .map((raw) => normalizeIdValue(key, raw))
        .find((candidate) => candidate && rightNormalized.includes(candidate));
      if (hit) return { key, value: hit };
    }
  }
  return null;
}
|
|
108
|
+
|
|
109
|
+
/**
 * Coerce an identifier value into a list of non-empty strings.
 * A lone string becomes a one-element list; arrays keep only their
 * non-empty string members; anything else yields an empty list.
 *
 * @param {*} v
 * @returns {string[]}
 */
function toArray(v) {
  const candidates = Array.isArray(v) ? v : [v];
  return candidates.filter((item) => typeof item === "string" && item.length > 0);
}
|
|
114
|
+
|
|
115
|
+
/**
 * Returns true iff both sides have at least one strong identifier of
 * the SAME key but with DIFFERENT (normalized) values. This is the
 * "homonym-trap guard" for R3: if Alice@a.com and Alice@b.com share
 * the name "Alice" but have different emails, they're DIFFERENT people.
 *
 * Values that normalize to an empty string (whitespace-only entries, a
 * "phone" with no digits, ...) carry no identifying information and are
 * ignored, mirroring how findSharedIdentifier() skips empty values. Without
 * this, a blank placeholder on one side would be read as a conflict.
 *
 * @param {object} idsA  identifier map of the first Person row
 * @param {object} idsB  identifier map of the second Person row
 * @returns {boolean}
 */
function hasConflictingIdentifier(idsA, idsB) {
  for (const key of STRONG_IDENTIFIER_KEYS) {
    const av = toArray(idsA[key])
      .map((v) => normalizeIdValue(key, v))
      .filter((v) => v.length > 0);
    const bv = toArray(idsB[key])
      .map((v) => normalizeIdValue(key, v))
      .filter((v) => v.length > 0);
    // A key only counts when BOTH sides hold a usable value for it.
    if (av.length === 0 || bv.length === 0) continue;
    // Both have this identifier — overlap means SAME (handled by R1);
    // no overlap on the same key = conflict.
    const overlap = av.some((v) => bv.includes(v));
    if (!overlap) return true;
  }
  return false;
}
|
|
133
|
+
|
|
134
|
+
/**
 * Light normalization to avoid trivial misses:
 *   - email: lowercase + trim
 *   - phone: digits only (strips + - spaces parens), then drops a recognised
 *            country-code prefix so international and national spellings of
 *            the same number compare equal
 *   - others: trim only
 *
 * @param {string} key  identifier key ("email", "phone", ...)
 * @param {*} v         raw identifier value
 * @returns {string}    normalized value; "" when `v` is not a string
 */
function normalizeIdValue(key, v) {
  if (typeof v !== "string") return "";
  const trimmed = v.trim();
  if (key === "email") return trimmed.toLowerCase();
  if (key === "phone") {
    let digits = trimmed.replace(/[^0-9]/g, "");
    // Strip common country-code prefixes so "+86 138-0000 1111",
    // "0086 138 0000 1111" and "13800001111" collapse to the same value.
    if (digits.length === 15 && digits.startsWith("0086")) {
      digits = digits.slice(4);
    } else if (digits.length === 13 && digits.startsWith("86")) {
      digits = digits.slice(2);
    } else if (digits.length === 11 && /^\+\s*1/.test(trimmed)) {
      // "+1" followed by a 10-digit national number. The explicit "+" is
      // required: a bare 11-digit number starting with 1 is also the shape
      // of a mainland-China mobile number and must be left untouched.
      digits = digits.slice(1);
    }
    // Legacy rule kept for backward compatibility with the previous
    // behaviour (12 digits with a leading 1 lose that leading digit).
    if (digits.length === 12 && digits.startsWith("1")) digits = digits.slice(1);
    return digits;
  }
  return trimmed;
}
|
|
154
|
+
|
|
155
|
+
/**
 * Whether at least one name appears, character for character, in both lists.
 *
 * Matching is EXACT: no substring, case-folding or fuzzy comparison is
 * performed, so "陈XX" and "陈" do not match. Empty strings and non-string
 * entries are ignored. A lone string is accepted as a one-name list; any
 * other non-array input is treated as "no names" instead of throwing.
 *
 * @param {string[]|string|null|undefined} namesA
 * @param {string[]|string|null|undefined} namesB
 * @returns {boolean}
 */
function sharesAnyName(namesA, namesB) {
  const usable = (names) =>
    (Array.isArray(names) ? names : [names]).filter(
      (n) => typeof n === "string" && n.length > 0,
    );
  const a = usable(namesA);
  const b = new Set(usable(namesB));
  if (a.length === 0 || b.size === 0) return false;
  return a.some((name) => b.has(name));
}
|
|
170
|
+
|
|
171
|
+
/**
 * Tally how many independent signals A and B have in common. One point each
 * for: a shared name, every identifier key whose raw values intersect, and
 * both rows reporting the same `source.adapter`. The rule stage only cares
 * whether the total is zero (total disjointedness); the points are not
 * weighted.
 *
 * @param {object} a  Person row
 * @param {object} b  Person row
 * @returns {number}  number of overlapping signals (0 = fully disjoint)
 */
function countFieldOverlap(a, b) {
  // Name overlap
  const nameScore = sharesAnyName(a.names, b.names) ? 1 : 0;

  // Identifier overlap (counted per-key)
  const left = a.identifiers || {};
  const right = b.identifiers || {};
  const identifierScore = Object.keys(left).filter((key) => {
    if (!right[key]) return false;
    const theirs = toArray(right[key]);
    return toArray(left[key]).some((value) => theirs.includes(value));
  }).length;

  // Same primary source adapter — usually means same data origin
  const sameAdapter = Boolean(
    a.source && b.source && a.source.adapter === b.source.adapter,
  );

  return nameScore + identifierScore + (sameAdapter ? 1 : 0);
}
|
|
200
|
+
|
|
201
|
+
// Public surface of the rule stage. `toArray` and `hasConflictingIdentifier`
// are intentionally left out of this list and stay module-private.
module.exports = {
  ruleStage,
  findSharedIdentifier,
  countFieldOverlap,
  sharesAnyName,
  normalizeIdValue,
  STRONG_IDENTIFIER_KEYS,
};
|
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Phase 8.5 — Async resolve_queue worker.
|
|
3
|
+
*
|
|
4
|
+
* Long-running idle loop that polls the resolve_queue, calls
|
|
5
|
+
* `EntityResolver.drain(batch)`, sleeps when empty, exits cleanly on
|
|
6
|
+
* stop(). Designed to run alongside the hub process (desktop main /
|
|
7
|
+
* cc serve / Workflow runner).
|
|
8
|
+
*
|
|
9
|
+
* Lifecycle:
|
|
10
|
+
 *   const w = new EntityResolverWorker({ resolver, idleIntervalMs: 30_000 });
|
|
11
|
+
* w.start(); // returns immediately; loop runs async
|
|
12
|
+
* ...
|
|
13
|
+
* await w.stop(); // waits for current batch to finish, then exits
|
|
14
|
+
*
|
|
15
|
+
* The worker is dumb to where it runs — caller decides cadence (idle
|
|
16
|
+
* scheduler in Electron main, cron in cc serve, etc.).
|
|
17
|
+
*/
|
|
18
|
+
|
|
19
|
+
"use strict";
|
|
20
|
+
|
|
21
|
+
/**
 * Background poller that repeatedly calls `resolver.drain({ limit })` and
 * keeps cumulative statistics about the batches it has processed.
 *
 * Use start()/stop() for the long-running loop, or tick() to process a
 * single batch on demand.
 */
class EntityResolverWorker {
  /**
   * @param {object} opts
   * @param {{ drain: Function }} opts.resolver  object exposing an async
   *        `drain({ limit })`; required.
   * @param {number} [opts.batchSize=20]        `limit` passed to drain().
   * @param {number} [opts.idleIntervalMs=30000] pause after an empty or
   *        failed batch.
   * @param {number} [opts.batchSpacingMs=50]   pause after a non-empty batch.
   * @param {Function} [opts.onProgress]        called after every batch with
   *        `{ batch, totals }`.
   * @param {{ info: Function }} [opts.logger]  optional logger.
   * @throws {Error} when `opts` or `opts.resolver` is missing.
   */
  constructor(opts = {}) {
    if (!opts || typeof opts !== "object") {
      throw new Error("EntityResolverWorker: opts required");
    }
    if (!opts.resolver) {
      throw new Error("EntityResolverWorker: opts.resolver required");
    }
    this._resolver = opts.resolver;
    this._batchSize = Number.isFinite(opts.batchSize) ? opts.batchSize : 20;
    // Sleep when the queue is empty (poll cadence). 30s default is
    // enough to keep latency low on user-initiated sync while not
    // pegging CPU in steady state.
    this._idleIntervalMs = Number.isFinite(opts.idleIntervalMs) ? opts.idleIntervalMs : 30_000;
    // Between non-empty batches: small breather so cooperative scheduling
    // works with other adapter syncs.
    this._batchSpacingMs = Number.isFinite(opts.batchSpacingMs) ? opts.batchSpacingMs : 50;
    // Optional progress callback for UI + audit
    this._onProgress = typeof opts.onProgress === "function" ? opts.onProgress : null;
    this._logger = opts.logger || null;

    this._running = false;
    this._loopPromise = null;
    this._stopRequested = false;
    this._stats = {
      startedAt: 0,
      batchesProcessed: 0,
      itemsProcessed: 0,
      same: 0,
      different: 0,
      review: 0,
      error: 0,
      lastBatchAt: 0,
    };
  }

  /** @returns {boolean} true while the background loop is active. */
  isRunning() {
    return this._running;
  }

  /** @returns {object} a shallow copy of the cumulative counters. */
  stats() {
    return { ...this._stats };
  }

  /**
   * Start the background loop. Returns immediately; calling it while the
   * loop is already running is a no-op.
   */
  start() {
    if (this._running) return;
    this._running = true;
    this._stopRequested = false;
    this._stats.startedAt = Date.now();
    this._loopPromise = this._loop().finally(() => {
      this._running = false;
    });
  }

  /**
   * Ask the loop to stop and wait until it has exited. Safe to call when
   * the worker was never started.
   */
  async stop() {
    this._stopRequested = true;
    if (this._loopPromise) {
      // The loop's outcome is irrelevant here; stop() only waits for it to
      // finish, so a failure is deliberately not propagated.
      try { await this._loopPromise; } catch (_e) {}
    }
    this._running = false;
  }

  /**
   * Process exactly one batch (caller drives, no loop) and resolve with the
   * drain output. Useful for tests + on-demand "process N now" buttons.
   */
  async tick() {
    return await this._processBatch();
  }

  /** Poll until stop is requested, pacing itself between batches. */
  async _loop() {
    while (!this._stopRequested) {
      let batchResult;
      try {
        batchResult = await this._processBatch();
      } catch (err) {
        this._log("worker batch threw — sleeping then retrying", err && err.message);
        batchResult = null;
      }
      if (this._stopRequested) break;
      // A failed batch is paced like an empty one (idle interval).
      const empty = !batchResult || batchResult.processed === 0;
      const delay = empty ? this._idleIntervalMs : this._batchSpacingMs;
      await this._sleep(delay);
    }
  }

  /**
   * Drain one batch, fold its counters into the cumulative stats and notify
   * the progress listener. Resolves with whatever drain() returned.
   */
  async _processBatch() {
    const result = await this._resolver.drain({ limit: this._batchSize });
    // A missing or non-numeric counter contributes 0. Adding `undefined`
    // would turn the cumulative total into NaN for the rest of the
    // worker's lifetime.
    const count = (field) =>
      (result && Number.isFinite(result[field]) ? result[field] : 0);
    this._stats.batchesProcessed += 1;
    this._stats.itemsProcessed += count("processed");
    this._stats.same += count("same");
    this._stats.different += count("different");
    this._stats.review += count("review");
    this._stats.error += count("error");
    this._stats.lastBatchAt = Date.now();
    this._emitProgress({ batch: result, totals: { ...this._stats } });
    return result;
  }

  /** Invoke the progress listener, isolating the worker from its errors. */
  _emitProgress(payload) {
    if (!this._onProgress) return;
    try {
      this._onProgress(payload);
    } catch (_e) {
      // listener errors don't break the loop
    }
  }

  /**
   * Pause for `ms` milliseconds, waking early once stop has been requested.
   */
  async _sleep(ms) {
    if (ms <= 0) {
      // Still hand control back to the event loop once. Returning without
      // yielding would let a zero-delay configuration spin purely on
      // microtasks, starving timers and I/O (including the code that is
      // supposed to call stop()).
      await new Promise((resolve) => setImmediate(resolve));
      return;
    }
    // Interruptable sleep — checks _stopRequested every 100ms so stop()
    // can land within a sane bound.
    const step = 100;
    let remaining = ms;
    while (remaining > 0 && !this._stopRequested) {
      const next = Math.min(step, remaining);
      await new Promise((resolve) => setTimeout(resolve, next));
      remaining -= next;
    }
  }

  /** Forward a message to the optional logger's `info` method. */
  _log(...args) {
    if (this._logger && typeof this._logger.info === "function") {
      try {
        this._logger.info("[EntityResolverWorker]", ...args);
      } catch (_e) {
        // logger errors don't break the loop (same policy as the progress
        // listener); otherwise they would reject the un-awaited loop promise.
      }
    }
  }
}
|
|
148
|
+
|
|
149
|
+
// The worker class is this module's only export.
module.exports = { EntityResolverWorker };
|
package/lib/index.js
CHANGED
|
@@ -32,6 +32,30 @@ const { MockLLMClient, OllamaClient } = require("./llm-client");
|
|
|
32
32
|
const { AnalysisEngine, DEFAULT_MAX_FACTS, DEFAULT_MAX_QUERY_LIMIT } = require("./analysis");
|
|
33
33
|
const bridges = require("./bridges");
|
|
34
34
|
const emailImapAdapter = require("./adapters/email-imap");
|
|
35
|
+
const alipayBillAdapter = require("./adapters/alipay-bill");
|
|
36
|
+
const wechatAdapter = require("./adapters/wechat");
|
|
37
|
+
const travelBase = require("./adapters/travel-base");
|
|
38
|
+
const { Train12306Adapter } = require("./adapters/travel-12306");
|
|
39
|
+
const { CtripAdapter } = require("./adapters/travel-ctrip");
|
|
40
|
+
const { AmapAdapter } = require("./adapters/travel-amap");
|
|
41
|
+
const { BaiduMapAdapter } = require("./adapters/travel-baidu-map");
|
|
42
|
+
const shoppingBase = require("./adapters/shopping-base");
|
|
43
|
+
const { TaobaoAdapter } = require("./adapters/shopping-taobao");
|
|
44
|
+
const { JdAdapter } = require("./adapters/shopping-jd");
|
|
45
|
+
const { MeituanAdapter } = require("./adapters/shopping-meituan");
|
|
46
|
+
const { BilibiliAdapter } = require("./adapters/social-bilibili");
|
|
47
|
+
const { WeiboAdapter } = require("./adapters/social-weibo");
|
|
48
|
+
const { DouyinAdapter } = require("./adapters/social-douyin");
|
|
49
|
+
const { XiaohongshuAdapter } = require("./adapters/social-xiaohongshu");
|
|
50
|
+
const { ToutiaoAdapter } = require("./adapters/social-toutiao");
|
|
51
|
+
const { KuaishouAdapter } = require("./adapters/social-kuaishou");
|
|
52
|
+
const { QQAdapter } = require("./adapters/messaging-qq");
|
|
53
|
+
const { TelegramAdapter } = require("./adapters/messaging-telegram");
|
|
54
|
+
const { WhatsAppAdapter } = require("./adapters/messaging-whatsapp");
|
|
55
|
+
const entityResolver = require("./entity-resolver");
|
|
56
|
+
const analysisSkills = require("./analysis-skills");
|
|
57
|
+
const mobileExtractor = require("./mobile-extractor");
|
|
58
|
+
const systemDataAndroid = require("./adapters/system-data-android");
|
|
35
59
|
|
|
36
60
|
module.exports = {
|
|
37
61
|
// Constants / enums
|
|
@@ -138,4 +162,111 @@ module.exports = {
|
|
|
138
162
|
ImapConnectionFailedError: emailImapAdapter.ImapConnectionFailedError,
|
|
139
163
|
ImapMailboxNotFoundError: emailImapAdapter.ImapMailboxNotFoundError,
|
|
140
164
|
parseRawEmail: emailImapAdapter.parseRawEmail,
|
|
165
|
+
// Phase 5.3 — email classifier
|
|
166
|
+
EMAIL_CATEGORIES: emailImapAdapter.EMAIL_CATEGORIES,
|
|
167
|
+
EMAIL_LAYER1_RULES: emailImapAdapter.EMAIL_LAYER1_RULES,
|
|
168
|
+
classifyEmail: emailImapAdapter.classifyEmail,
|
|
169
|
+
classifyEmailLayer1: emailImapAdapter.classifyEmailLayer1,
|
|
170
|
+
classifyEmailLayer2: emailImapAdapter.classifyEmailLayer2,
|
|
171
|
+
// Phase 5.4 — 6 template extractors + dispatcher
|
|
172
|
+
extractEmailFields: emailImapAdapter.extractEmailFields,
|
|
173
|
+
EMAIL_CATEGORY_EXTRACTORS: emailImapAdapter.EMAIL_CATEGORY_EXTRACTORS,
|
|
174
|
+
extractEmailBill: emailImapAdapter.extractBill,
|
|
175
|
+
extractEmailOrder: emailImapAdapter.extractOrder,
|
|
176
|
+
extractEmailTravel: emailImapAdapter.extractTravel,
|
|
177
|
+
extractEmailGovernment: emailImapAdapter.extractGovernment,
|
|
178
|
+
extractEmailRegister: emailImapAdapter.extractRegister,
|
|
179
|
+
extractEmailOther: emailImapAdapter.extractOther,
|
|
180
|
+
// Phase 5.5 — PDF decryption + transactions
|
|
181
|
+
extractPdfText: emailImapAdapter.extractPdfText,
|
|
182
|
+
pdfPasswordsFromHints: emailImapAdapter.pdfPasswordsFromHints,
|
|
183
|
+
extractEmailTransactions: emailImapAdapter.extractTransactions,
|
|
184
|
+
|
|
185
|
+
// Phase 8 — EntityResolver
|
|
186
|
+
EntityResolver: entityResolver.EntityResolver,
|
|
187
|
+
entityResolverRuleStage: entityResolver.entityResolverRuleStage,
|
|
188
|
+
entityResolverSharedIdentifier: entityResolver.entityResolverSharedIdentifier,
|
|
189
|
+
ENTITY_RESOLVER_STRONG_IDENTIFIER_KEYS: entityResolver.ENTITY_RESOLVER_STRONG_IDENTIFIER_KEYS,
|
|
190
|
+
EntityResolverEmbeddingStage: entityResolver.EntityResolverEmbeddingStage,
|
|
191
|
+
entityResolverCosineSimilarity: entityResolver.entityResolverCosineSimilarity,
|
|
192
|
+
EntityResolverLLMStage: entityResolver.EntityResolverLLMStage,
|
|
193
|
+
ENTITY_RESOLVER_LLM_SYSTEM_PROMPT: entityResolver.ENTITY_RESOLVER_LLM_SYSTEM_PROMPT,
|
|
194
|
+
parseEntityResolverLLMResponse: entityResolver.parseEntityResolverLLMResponse,
|
|
195
|
+
EntityResolverWorker: entityResolver.EntityResolverWorker,
|
|
196
|
+
|
|
197
|
+
// Phase 11 — 5 内置 analysis skills
|
|
198
|
+
AnalysisSkill: analysisSkills.AnalysisSkill,
|
|
199
|
+
SpendingSkill: analysisSkills.SpendingSkill,
|
|
200
|
+
RelationsSkill: analysisSkills.RelationsSkill,
|
|
201
|
+
FootprintSkill: analysisSkills.FootprintSkill,
|
|
202
|
+
InterestsSkill: analysisSkills.InterestsSkill,
|
|
203
|
+
TimelineSkill: analysisSkills.TimelineSkill,
|
|
204
|
+
ANALYSIS_SKILL_NAMES: analysisSkills.ANALYSIS_SKILL_NAMES,
|
|
205
|
+
runAnalysisSkill: analysisSkills.runAnalysisSkill,
|
|
206
|
+
|
|
207
|
+
// Phase 7.5 — Mobile extraction layer (借 sjqz architecture)
|
|
208
|
+
AndroidExtractor: mobileExtractor.AndroidExtractor,
|
|
209
|
+
iOSBackupReader: mobileExtractor.iOSBackupReader,
|
|
210
|
+
|
|
211
|
+
// Phase 12 v0.5 — WeChat (frida-independent slice)
|
|
212
|
+
WechatAdapter: wechatAdapter.WechatAdapter,
|
|
213
|
+
WECHAT_NAME: wechatAdapter.WECHAT_NAME,
|
|
214
|
+
WECHAT_VERSION: wechatAdapter.WECHAT_VERSION,
|
|
215
|
+
parseWeChatContent: wechatAdapter.parseWeChatContent,
|
|
216
|
+
extractWeChatKey: wechatAdapter.extractWeChatKey,
|
|
217
|
+
deriveWeChatLegacyKey: wechatAdapter.deriveWeChatLegacyKey,
|
|
218
|
+
WeChatDBReader: wechatAdapter.WeChatDBReader,
|
|
219
|
+
normalizeWeChatMessage: wechatAdapter.normalizeWeChatMessage,
|
|
220
|
+
normalizeWeChatContact: wechatAdapter.normalizeWeChatContact,
|
|
221
|
+
wxidToWeChatPersonId: wechatAdapter.wxidToWeChatPersonId,
|
|
222
|
+
WECHAT_PRAGMA_PROFILES: wechatAdapter.WECHAT_PRAGMA_PROFILES,
|
|
223
|
+
|
|
224
|
+
// Phase 9 — Travel four-pack
|
|
225
|
+
normalizeTravelRecord: travelBase.normalizeTravelRecord,
|
|
226
|
+
parseChineseDateTime: travelBase.parseChineseDateTime,
|
|
227
|
+
Train12306Adapter,
|
|
228
|
+
CtripAdapter,
|
|
229
|
+
AmapAdapter,
|
|
230
|
+
BaiduMapAdapter,
|
|
231
|
+
|
|
232
|
+
// Phase 7 — Shopping three-pack
|
|
233
|
+
normalizeOrderRecord: shoppingBase.normalizeOrderRecord,
|
|
234
|
+
CookieAuth: shoppingBase.CookieAuth,
|
|
235
|
+
TaobaoAdapter,
|
|
236
|
+
JdAdapter,
|
|
237
|
+
MeituanAdapter,
|
|
238
|
+
|
|
239
|
+
// Phase 13+ — long-tail social + messaging (借 sjqz parsers)
|
|
240
|
+
BilibiliAdapter,
|
|
241
|
+
WeiboAdapter,
|
|
242
|
+
DouyinAdapter,
|
|
243
|
+
XiaohongshuAdapter,
|
|
244
|
+
ToutiaoAdapter,
|
|
245
|
+
KuaishouAdapter,
|
|
246
|
+
QQAdapter,
|
|
247
|
+
TelegramAdapter,
|
|
248
|
+
WhatsAppAdapter,
|
|
249
|
+
|
|
250
|
+
// Plan A v0.1 — Android on-device system-data adapter (no Python sidecar,
|
|
251
|
+
// UI-pushed snapshot via ContentResolver + PackageManager).
|
|
252
|
+
SystemDataAndroidAdapter: systemDataAndroid.SystemDataAndroidAdapter,
|
|
253
|
+
SYSTEM_DATA_ANDROID_NAME: systemDataAndroid.SYSTEM_DATA_ANDROID_NAME,
|
|
254
|
+
SYSTEM_DATA_ANDROID_VERSION: systemDataAndroid.SYSTEM_DATA_ANDROID_VERSION,
|
|
255
|
+
SYSTEM_DATA_ANDROID_SNAPSHOT_SCHEMA_VERSION:
|
|
256
|
+
systemDataAndroid.SNAPSHOT_SCHEMA_VERSION,
|
|
257
|
+
// Path C — staging + ingest helper shared by IPC / WS / mobile-route layers
|
|
258
|
+
ingestSystemDataAndroidSnapshot:
|
|
259
|
+
systemDataAndroid.ingestSystemDataAndroidSnapshot,
|
|
260
|
+
|
|
261
|
+
// Phase 6 — AlipayBillAdapter (CSV import)
|
|
262
|
+
AlipayBillAdapter: alipayBillAdapter.AlipayBillAdapter,
|
|
263
|
+
ALIPAY_BILL_NAME: alipayBillAdapter.ALIPAY_BILL_NAME,
|
|
264
|
+
ALIPAY_BILL_VERSION: alipayBillAdapter.ALIPAY_BILL_VERSION,
|
|
265
|
+
parseAlipayCsv: alipayBillAdapter.parseAlipayCsv,
|
|
266
|
+
parseAlipayCsvBuffer: alipayBillAdapter.parseAlipayCsvBuffer,
|
|
267
|
+
extractAlipayCsvFromZip: alipayBillAdapter.extractAlipayCsvFromZip,
|
|
268
|
+
classifyAlipayCounterparty: alipayBillAdapter.classifyAlipayCounterparty,
|
|
269
|
+
alipayCounterpartyToPersonId: alipayBillAdapter.alipayCounterpartyToPersonId,
|
|
270
|
+
ALIPAY_KNOWN_MERCHANTS: alipayBillAdapter.ALIPAY_KNOWN_MERCHANTS,
|
|
271
|
+
mapAlipayTypeToSubtype: alipayBillAdapter.mapAlipayTypeToSubtype,
|
|
141
272
|
};
|
package/lib/migrations.js
CHANGED
|
@@ -179,6 +179,72 @@ const INITIAL_DDL = [
|
|
|
179
179
|
`CREATE INDEX IF NOT EXISTS idx_raw_captured ON raw_events(captured_at)`,
|
|
180
180
|
];
|
|
181
181
|
|
|
182
|
+
// Phase 8 DDL — EntityResolver tables (5 new tables).
|
|
183
|
+
// Per docs/design/Personal_Data_Hub_EntityResolver.md §5.1.
|
|
184
|
+
const PHASE_8_DDL = [
|
|
185
|
+
// merge_groups: identifies clusters of Person rows that are the "same"
|
|
186
|
+
// real-world entity. Multiple Person ids in the same group_id ↔ same
|
|
187
|
+
// person. primary_id is the canonical (oldest) row for display.
|
|
188
|
+
`CREATE TABLE IF NOT EXISTS merge_groups (
|
|
189
|
+
id TEXT PRIMARY KEY,
|
|
190
|
+
primary_id TEXT NOT NULL,
|
|
191
|
+
member_count INTEGER NOT NULL DEFAULT 1,
|
|
192
|
+
created_at INTEGER NOT NULL,
|
|
193
|
+
last_updated INTEGER NOT NULL,
|
|
194
|
+
reviewed_by_user INTEGER NOT NULL DEFAULT 0
|
|
195
|
+
)`,
|
|
196
|
+
`CREATE TABLE IF NOT EXISTS merge_members (
|
|
197
|
+
group_id TEXT NOT NULL,
|
|
198
|
+
person_id TEXT NOT NULL,
|
|
199
|
+
joined_at INTEGER NOT NULL,
|
|
200
|
+
joined_by TEXT NOT NULL,
|
|
201
|
+
PRIMARY KEY (group_id, person_id),
|
|
202
|
+
FOREIGN KEY (group_id) REFERENCES merge_groups(id) ON DELETE CASCADE
|
|
203
|
+
)`,
|
|
204
|
+
`CREATE INDEX IF NOT EXISTS idx_merge_members_person ON merge_members(person_id)`,
|
|
205
|
+
|
|
206
|
+
// resolve_decisions: every yes/no verdict the pipeline (or user) has
|
|
207
|
+
// emitted for a pair. Lexicographic ordering on the two ids prevents
|
|
208
|
+
// both (A,B) and (B,A) ever existing (writers must order the pair; this DDL has no CHECK enforcing it).
|
|
209
|
+
`CREATE TABLE IF NOT EXISTS resolve_decisions (
|
|
210
|
+
a_person_id TEXT NOT NULL,
|
|
211
|
+
b_person_id TEXT NOT NULL,
|
|
212
|
+
verdict TEXT NOT NULL,
|
|
213
|
+
confidence REAL NOT NULL,
|
|
214
|
+
decided_at INTEGER NOT NULL,
|
|
215
|
+
decided_by TEXT NOT NULL,
|
|
216
|
+
reason TEXT,
|
|
217
|
+
PRIMARY KEY (a_person_id, b_person_id)
|
|
218
|
+
)`,
|
|
219
|
+
|
|
220
|
+
// resolve_queue: backlog of Person rows pending pipeline processing.
|
|
221
|
+
// Adapter ingest hook enqueues; async worker drains.
|
|
222
|
+
`CREATE TABLE IF NOT EXISTS resolve_queue (
|
|
223
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
224
|
+
person_id TEXT NOT NULL,
|
|
225
|
+
enqueued_at INTEGER NOT NULL,
|
|
226
|
+
status TEXT NOT NULL DEFAULT 'pending',
|
|
227
|
+
attempts INTEGER NOT NULL DEFAULT 0,
|
|
228
|
+
last_error TEXT
|
|
229
|
+
)`,
|
|
230
|
+
`CREATE INDEX IF NOT EXISTS idx_resolve_queue_status ON resolve_queue(status, enqueued_at)`,
|
|
231
|
+
|
|
232
|
+
// review_queue: pairs the pipeline can't decide; user reviews via UI.
|
|
233
|
+
`CREATE TABLE IF NOT EXISTS review_queue (
|
|
234
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
235
|
+
a_person_id TEXT NOT NULL,
|
|
236
|
+
b_person_id TEXT NOT NULL,
|
|
237
|
+
embed_sim REAL,
|
|
238
|
+
llm_verdict TEXT,
|
|
239
|
+
llm_reason TEXT,
|
|
240
|
+
llm_confidence REAL,
|
|
241
|
+
enqueued_at INTEGER NOT NULL,
|
|
242
|
+
reviewed_at INTEGER,
|
|
243
|
+
user_decision TEXT
|
|
244
|
+
)`,
|
|
245
|
+
`CREATE INDEX IF NOT EXISTS idx_review_queue_pending ON review_queue(reviewed_at, enqueued_at)`,
|
|
246
|
+
];
|
|
247
|
+
|
|
182
248
|
const MIGRATIONS = [
|
|
183
249
|
{
|
|
184
250
|
version: 1,
|
|
@@ -187,6 +253,13 @@ const MIGRATIONS = [
|
|
|
187
253
|
for (const sql of INITIAL_DDL) db.exec(sql);
|
|
188
254
|
},
|
|
189
255
|
},
|
|
256
|
+
{
|
|
257
|
+
version: 2,
|
|
258
|
+
description: "Phase 8 EntityResolver — merge_groups + merge_members + resolve_decisions + resolve_queue + review_queue",
|
|
259
|
+
up(db) {
|
|
260
|
+
for (const sql of PHASE_8_DDL) db.exec(sql);
|
|
261
|
+
},
|
|
262
|
+
},
|
|
190
263
|
];
|
|
191
264
|
|
|
192
265
|
const TARGET_VERSION = MIGRATIONS[MIGRATIONS.length - 1].version;
|