@chainlesschain/personal-data-hub 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/__tests__/adapters/ai-chat-history.test.js +395 -0
- package/__tests__/adapters/ai-chat-http-client.test.js +242 -0
- package/__tests__/adapters/ai-chat-vendors.test.js +733 -0
- package/__tests__/adapters/alipay-bill-adapter.test.js +538 -0
- package/__tests__/adapters/email-adapter.test.js +138 -1
- package/__tests__/adapters/email-classifier.test.js +347 -0
- package/__tests__/adapters/email-pdf-extractor.test.js +529 -0
- package/__tests__/adapters/email-retry-progress.test.js +294 -0
- package/__tests__/adapters/email-templates.test.js +699 -0
- package/__tests__/adapters/system-data-adapter.test.js +440 -0
- package/__tests__/adapters/system-data-disclosure.test.js +153 -0
- package/__tests__/analysis-skills.test.js +409 -0
- package/__tests__/entity-resolver-ingest-hook.test.js +177 -0
- package/__tests__/entity-resolver-stages.test.js +411 -0
- package/__tests__/entity-resolver-vault.test.js +246 -0
- package/__tests__/entity-resolver.test.js +526 -0
- package/__tests__/fixtures/entity-resolver-200-mock.json +96 -0
- package/__tests__/longtail-adapters.test.js +217 -0
- package/__tests__/mobile-extractor.test.js +288 -0
- package/__tests__/shopping-adapters.test.js +296 -0
- package/__tests__/sidecar-contacts-cross-validate.test.js +163 -0
- package/__tests__/sidecar-supervisor.test.js +120 -0
- package/__tests__/social-adapters.test.js +206 -0
- package/__tests__/travel-adapters.test.js +325 -0
- package/__tests__/vault.test.js +3 -3
- package/__tests__/wechat-adapter.test.js +476 -0
- package/__tests__/whatsapp-adapter.test.js +135 -0
- package/lib/adapter-spec.js +12 -0
- package/lib/adapters/_python-sidecar-base.js +207 -0
- package/lib/adapters/ai-chat-history/ai-chat-adapter.js +335 -0
- package/lib/adapters/ai-chat-history/cookie-auth.js +109 -0
- package/lib/adapters/ai-chat-history/http-client.js +211 -0
- package/lib/adapters/ai-chat-history/index.js +28 -0
- package/lib/adapters/ai-chat-history/schema-map.js +221 -0
- package/lib/adapters/ai-chat-history/vendor-spec.js +85 -0
- package/lib/adapters/ai-chat-history/vendors/coze.js +179 -0
- package/lib/adapters/ai-chat-history/vendors/deepseek.js +199 -0
- package/lib/adapters/ai-chat-history/vendors/dreamina.js +174 -0
- package/lib/adapters/ai-chat-history/vendors/hunyuan.js +176 -0
- package/lib/adapters/ai-chat-history/vendors/kimi.js +182 -0
- package/lib/adapters/ai-chat-history/vendors/qianfan.js +160 -0
- package/lib/adapters/ai-chat-history/vendors/tongyi.js +193 -0
- package/lib/adapters/ai-chat-history/vendors/zhipu.js +202 -0
- package/lib/adapters/alipay-bill/alipay-bill-adapter.js +307 -0
- package/lib/adapters/alipay-bill/counterparty.js +129 -0
- package/lib/adapters/alipay-bill/csv-parser.js +217 -0
- package/lib/adapters/alipay-bill/index.js +41 -0
- package/lib/adapters/alipay-bill/zip-decryptor.js +111 -0
- package/lib/adapters/email-imap/classifier.js +495 -0
- package/lib/adapters/email-imap/email-adapter.js +419 -8
- package/lib/adapters/email-imap/index.js +42 -0
- package/lib/adapters/email-imap/pdf-extractor.js +192 -0
- package/lib/adapters/email-imap/templates/bill.js +232 -0
- package/lib/adapters/email-imap/templates/government.js +120 -0
- package/lib/adapters/email-imap/templates/index.js +78 -0
- package/lib/adapters/email-imap/templates/order.js +186 -0
- package/lib/adapters/email-imap/templates/other.js +114 -0
- package/lib/adapters/email-imap/templates/register.js +113 -0
- package/lib/adapters/email-imap/templates/travel.js +157 -0
- package/lib/adapters/email-imap/templates/utils.js +275 -0
- package/lib/adapters/email-imap/transactions.js +234 -0
- package/lib/adapters/messaging-qq/index.js +158 -0
- package/lib/adapters/messaging-telegram/index.js +142 -0
- package/lib/adapters/messaging-whatsapp/index.js +189 -0
- package/lib/adapters/shopping-base/index.js +208 -0
- package/lib/adapters/shopping-jd/index.js +150 -0
- package/lib/adapters/shopping-meituan/index.js +154 -0
- package/lib/adapters/shopping-taobao/index.js +176 -0
- package/lib/adapters/social-bilibili/index.js +171 -0
- package/lib/adapters/social-douyin/index.js +116 -0
- package/lib/adapters/social-weibo/index.js +164 -0
- package/lib/adapters/social-xiaohongshu/index.js +96 -0
- package/lib/adapters/system-data/disclosure.js +166 -0
- package/lib/adapters/system-data/index.js +34 -0
- package/lib/adapters/system-data/system-data-adapter.js +344 -0
- package/lib/adapters/travel-12306/index.js +151 -0
- package/lib/adapters/travel-amap/index.js +164 -0
- package/lib/adapters/travel-baidu-map/index.js +162 -0
- package/lib/adapters/travel-base/index.js +240 -0
- package/lib/adapters/travel-ctrip/index.js +151 -0
- package/lib/adapters/wechat/content-parser.js +326 -0
- package/lib/adapters/wechat/db-reader.js +209 -0
- package/lib/adapters/wechat/index.js +28 -0
- package/lib/adapters/wechat/key-extractor.js +158 -0
- package/lib/adapters/wechat/normalize.js +220 -0
- package/lib/adapters/wechat/wechat-adapter.js +205 -0
- package/lib/analysis-skills/base.js +113 -0
- package/lib/analysis-skills/footprint.js +167 -0
- package/lib/analysis-skills/index.js +58 -0
- package/lib/analysis-skills/interests.js +161 -0
- package/lib/analysis-skills/relations.js +226 -0
- package/lib/analysis-skills/spending.js +216 -0
- package/lib/analysis-skills/timeline.js +167 -0
- package/lib/entity-resolver/embedding-stage.js +198 -0
- package/lib/entity-resolver/entity-resolver.js +384 -0
- package/lib/entity-resolver/index.js +42 -0
- package/lib/entity-resolver/llm-stage.js +191 -0
- package/lib/entity-resolver/rule-stage.js +208 -0
- package/lib/entity-resolver/worker.js +149 -0
- package/lib/index.js +115 -0
- package/lib/migrations.js +73 -0
- package/lib/mobile-extractor/android.js +193 -0
- package/lib/mobile-extractor/index.js +9 -0
- package/lib/mobile-extractor/ios.js +223 -0
- package/lib/registry.js +42 -0
- package/lib/sidecar/index.js +15 -0
- package/lib/sidecar/supervisor.js +359 -0
- package/lib/vault.js +266 -0
- package/package.json +29 -3
- package/scripts/_make-fixture-all.js +126 -0
- package/scripts/_make-fixture-contacts.js +84 -0
- package/scripts/evaluate-entity-resolver.js +213 -0
- package/scripts/smoke-phase-5-5.js +196 -0
- package/scripts/smoke-phase-5-7.js +181 -0
- package/scripts/smoke-system-data-contacts.js +309 -0
- package/scripts/smoke-system-data.js +312 -0
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* Build a complete fixture directory with all 4 system-data sources:
|
|
4
|
+
*
|
|
5
|
+
* <out>/
|
|
6
|
+
* contacts2.db (raw_contacts + data + mimetypes + calls)
|
|
7
|
+
* mmssms.db (sms)
|
|
8
|
+
* wifi/
|
|
9
|
+
* WifiConfigStore.xml
|
|
10
|
+
*
|
|
11
|
+
* Usage:
|
|
12
|
+
* node scripts/_make-fixture-all.js ./fixtures
|
|
13
|
+
*/
|
|
14
|
+
|
|
15
|
+
"use strict";
|
|
16
|
+
|
|
17
|
+
const fs = require("node:fs");
|
|
18
|
+
const path = require("node:path");
|
|
19
|
+
const Database = require("better-sqlite3-multiple-ciphers");
|
|
20
|
+
|
|
21
|
+
const outDir = path.resolve(process.argv[2] || "./fixtures");
fs.mkdirSync(path.join(outDir, "wifi"), { recursive: true });

/**
 * Create a fresh SQLite file at `dbPath` and fill it through `populate(db)`.
 *
 * A file left over from an earlier run is deleted first. The handle is closed
 * in a `finally` block so a failing statement cannot leave the database open,
 * and the "wrote:" line is only printed after a successful populate.
 *
 * @param {string} dbPath - Absolute path of the database file to create.
 * @param {(db: object) => void} populate - Callback that creates the schema and rows.
 */
function writeDatabase(dbPath, populate) {
  if (fs.existsSync(dbPath)) fs.unlinkSync(dbPath);
  const db = new Database(dbPath);
  try {
    populate(db);
  } finally {
    db.close();
  }
  console.log("wrote:", dbPath);
}

// ── contacts2.db ──────────────────────────────────────────────────────────
writeDatabase(path.join(outDir, "contacts2.db"), (contacts) => {
  contacts.exec(`
    CREATE TABLE raw_contacts (
      _id INTEGER PRIMARY KEY, display_name TEXT, starred INTEGER DEFAULT 0, deleted INTEGER DEFAULT 0
    );
    CREATE TABLE mimetypes (_id INTEGER PRIMARY KEY, mimetype TEXT NOT NULL UNIQUE);
    CREATE TABLE data (
      _id INTEGER PRIMARY KEY, raw_contact_id INTEGER NOT NULL, mimetype_id INTEGER NOT NULL, data1 TEXT
    );
    CREATE TABLE calls (
      _id INTEGER PRIMARY KEY, number TEXT, type INTEGER, duration INTEGER, date INTEGER, name TEXT, is_read INTEGER DEFAULT 1
    );
  `);

  // Row ids used for the mimetypes table and referenced by data.mimetype_id.
  const MT = { phone: 5, email: 1, org: 4, note: 10, photo: 14 };
  const mi = contacts.prepare("INSERT INTO mimetypes (_id, mimetype) VALUES (?, ?)");
  mi.run(MT.phone, "vnd.android.cursor.item/phone_v2");
  mi.run(MT.email, "vnd.android.cursor.item/email_v2");
  mi.run(MT.org, "vnd.android.cursor.item/organization");
  mi.run(MT.note, "vnd.android.cursor.item/note");
  mi.run(MT.photo, "vnd.android.cursor.item/photo");

  const ci = contacts.prepare(
    "INSERT INTO raw_contacts (_id, display_name, starred, deleted) VALUES (?, ?, ?, 0)",
  );
  ci.run(1, "妈妈", 1);
  ci.run(2, "张三", 0);
  ci.run(3, "李四 Manager", 0);
  ci.run(5, "工商银行客服", 0);

  const di = contacts.prepare(
    "INSERT INTO data (raw_contact_id, mimetype_id, data1) VALUES (?, ?, ?)",
  );
  di.run(1, MT.phone, "13800001111");
  di.run(1, MT.phone, "13900002222");
  di.run(1, MT.email, "mom@example.com");
  di.run(1, MT.note, "亲妈,过年回家");
  di.run(2, MT.phone, "13711112222");
  di.run(3, MT.phone, "13822223333");
  di.run(3, MT.email, "lisi@corp.example.com");
  di.run(3, MT.org, "Example Corp");
  di.run(5, MT.phone, "95588");

  // Calls table inside contacts2.db (pre-Android-11 location)
  const li = contacts.prepare(
    "INSERT INTO calls (_id, number, type, duration, date, name, is_read) VALUES (?, ?, ?, ?, ?, ?, ?)",
  );
  li.run(1, "13800001111", 1, 120, 1737000000000, "妈妈", 1);
  li.run(2, "13800001111", 2, 45, 1737010000000, "妈妈", 1);
  li.run(3, "13999998888", 3, 0, 1737020000000, "", 0);
  li.run(4, "10086", 1, 8, 1737030000000, "中国移动", 1);
});

// ── mmssms.db ─────────────────────────────────────────────────────────────
writeDatabase(path.join(outDir, "mmssms.db"), (sms) => {
  sms.exec(`
    CREATE TABLE sms (
      _id INTEGER PRIMARY KEY, thread_id INTEGER, address TEXT, body TEXT,
      type INTEGER, date INTEGER, read INTEGER
    );
  `);
  const si = sms.prepare(
    "INSERT INTO sms (_id, thread_id, address, body, type, date, read) VALUES (?, ?, ?, ?, ?, ?, ?)",
  );
  si.run(1, 100, "13800001111", "妈妈我到家了", 2, 1737000000000, 1);
  si.run(2, 100, "13800001111", "好的,注意安全", 1, 1737000010000, 1);
  si.run(3, 200, "10086", "【中国移动】您的话费余额为 ¥36.50", 1, 1737000020000, 1);
  si.run(4, 300, "95588", "【工商银行】您的验证码为 123456,3 分钟内有效", 1, 1737000030000, 0);
});

// ── wifi/WifiConfigStore.xml ──────────────────────────────────────────────
// Two networks: one WPA-PSK entry carrying a PreSharedKey and one open network.
const wifiXml = path.join(outDir, "wifi", "WifiConfigStore.xml");
fs.writeFileSync(
  wifiXml,
  `<?xml version='1.0' encoding='UTF-8'?>
<WifiConfigStoreData>
<NetworkList>
<Network>
<WifiConfiguration>
<string name="SSID">"Home_5G"</string>
<string name="PreSharedKey">"secret"</string>
<string name="KeyMgmt">WPA-PSK</string>
<boolean name="HiddenSSID">false</boolean>
</WifiConfiguration>
</Network>
<Network>
<WifiConfiguration>
<string name="SSID">"Starbucks Free"</string>
<string name="KeyMgmt">NONE</string>
<boolean name="HiddenSSID">false</boolean>
</WifiConfiguration>
</Network>
</NetworkList>
</WifiConfigStoreData>
`,
  "utf-8",
);
console.log("wrote:", wifiXml);

console.log("\nAll fixtures ready under:", outDir);
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* Build a synthetic Android contacts2.db at the given path.
|
|
4
|
+
*
|
|
5
|
+
* Only used by the smoke runner / docs walkthrough — production code never
|
|
6
|
+
* relies on this. Mirrors the fixture from
|
|
7
|
+
* packages/personal-data-hub-bridge/tests/test_parsers_system_contacts.py.
|
|
8
|
+
*
|
|
9
|
+
* Usage:
|
|
10
|
+
* node scripts/_make-fixture-contacts.js ./fixtures/contacts2.db
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
"use strict";
|
|
14
|
+
|
|
15
|
+
const fs = require("node:fs");
|
|
16
|
+
const path = require("node:path");
|
|
17
|
+
const Database = require("better-sqlite3-multiple-ciphers");
|
|
18
|
+
|
|
19
|
+
// Output location: first CLI argument, or the default fixture path.
const target = path.resolve(process.argv[2] || "./fixtures/contacts2.db");
fs.mkdirSync(path.dirname(target), { recursive: true });
if (fs.existsSync(target)) {
  // Start from an empty file so the CREATE TABLE statements cannot collide.
  fs.unlinkSync(target);
}

// Fixture rows, listed in insertion order.
// [key, mimetypes._id, mimetype string]
const MIME_ROWS = [
  ["phone", 5, "vnd.android.cursor.item/phone_v2"],
  ["email", 1, "vnd.android.cursor.item/email_v2"],
  ["org", 4, "vnd.android.cursor.item/organization"],
  ["note", 10, "vnd.android.cursor.item/note"],
  ["photo", 14, "vnd.android.cursor.item/photo"],
];
// [raw_contacts._id, display_name, starred]
const CONTACT_ROWS = [
  [1, "妈妈", 1],
  [2, "张三", 0],
  [3, "李四 Manager", 0],
  [4, "", 0], // nameless — skipped by parser
  [5, "工商银行客服", 0],
];
// [raw_contact_id, mime key, data1]
const DATA_ROWS = [
  [1, "phone", "13800001111"],
  [1, "phone", "13900002222"],
  [1, "email", "mom@example.com"],
  [1, "note", "亲妈,过年回家"],
  [2, "phone", "13711112222"],
  [3, "phone", "13822223333"],
  [3, "email", "lisi@corp.example.com"],
  [3, "org", "Example Corp"],
  [5, "phone", "95588"],
];

const db = new Database(target);
try {
  db.exec(`
CREATE TABLE raw_contacts (
_id INTEGER PRIMARY KEY,
display_name TEXT,
starred INTEGER DEFAULT 0,
deleted INTEGER DEFAULT 0
);
CREATE TABLE mimetypes (
_id INTEGER PRIMARY KEY,
mimetype TEXT NOT NULL UNIQUE
);
CREATE TABLE data (
_id INTEGER PRIMARY KEY,
raw_contact_id INTEGER NOT NULL,
mimetype_id INTEGER NOT NULL,
data1 TEXT
);
`);

  // Lookup from mime key to its numeric id, used for data.mimetype_id.
  const mimeIdByKey = {};
  const mimeStmt = db.prepare(
    "INSERT INTO mimetypes (_id, mimetype) VALUES (?, ?)",
  );
  for (const [key, id, mimetype] of MIME_ROWS) {
    mimeIdByKey[key] = id;
    mimeStmt.run(id, mimetype);
  }

  const contactStmt = db.prepare(
    "INSERT INTO raw_contacts (_id, display_name, starred, deleted) VALUES (?, ?, ?, 0)",
  );
  for (const [id, displayName, starred] of CONTACT_ROWS) {
    contactStmt.run(id, displayName, starred);
  }

  const dataStmt = db.prepare(
    "INSERT INTO data (raw_contact_id, mimetype_id, data1) VALUES (?, ?, ?)",
  );
  for (const [contactId, mimeKey, value] of DATA_ROWS) {
    dataStmt.run(contactId, mimeIdByKey[mimeKey], value);
  }
} finally {
  // Always release the handle, even when a statement above throws.
  db.close();
}
console.log("fixture written:", target);
|
|
@@ -0,0 +1,213 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* Phase 8.8 — EntityResolver evaluation runner.
|
|
4
|
+
*
|
|
5
|
+
* Reads a labeled pair fixture, runs each pair through the configured
|
|
6
|
+
* stages, computes recall / accuracy / per-stage breakdown, and exits
|
|
7
|
+
* non-zero if the CI gate (recall ≥ 80%, accuracy ≥ 90%) fails and --require-pass is set.
|
|
8
|
+
*
|
|
9
|
+
* Usage:
|
|
10
|
+
* node scripts/evaluate-entity-resolver.js \
|
|
11
|
+
* [--fixture <path>] [--use-embedding] [--use-llm] [--require-pass]
|
|
12
|
+
*
|
|
13
|
+
* Defaults:
|
|
14
|
+
* - fixture: __tests__/fixtures/entity-resolver-200-mock.json
|
|
15
|
+
* - --use-embedding: skipped unless flag set (needs Ollama running)
|
|
16
|
+
* - --use-llm: skipped unless flag set (needs Ollama + chat model)
|
|
17
|
+
* - --require-pass: exit 1 when gate fails (use in CI; otherwise warn-only)
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
"use strict";
|
|
21
|
+
|
|
22
|
+
const fs = require("node:fs");
|
|
23
|
+
const path = require("node:path");
|
|
24
|
+
|
|
25
|
+
const {
|
|
26
|
+
entityResolverRuleStage: ruleStage,
|
|
27
|
+
} = require("../lib/entity-resolver");
|
|
28
|
+
|
|
29
|
+
// Command-line switches; anything not supplied falls back to the defaults
// described in the usage notes at the top of this script.
const args = parseArgs(process.argv.slice(2));

const fixturePath = args.fixture
  ? args.fixture
  : path.join(__dirname, "..", "__tests__", "fixtures", "entity-resolver-200-mock.json");

// Optional stages and CI behaviour are opt-in flags.
const useEmbedding = Boolean(args["use-embedding"]);
const useLlm = Boolean(args["use-llm"]);
const requirePass = Boolean(args["require-pass"]);

// CI gate thresholds (fractions, not percentages).
const RECALL_GATE = 0.80;
const ACCURACY_GATE = 0.90;
|
|
39
|
+
|
|
40
|
+
/**
 * Run every labeled pair of the fixture through the rule → embedding → LLM
 * cascade, print per-stage counters, the confusion matrix, metrics and a
 * per-category breakdown, then apply the CI gate.
 *
 * The LLM stage is consulted for any pair that is still undecided, whether the
 * embedding stage left it in the uncertain band or was not wired at all, so
 * `--use-llm` works with or without `--use-embedding`.
 *
 * Exit behaviour: exits 2 when the fixture is missing or has no pairs, exits 1
 * when the gate fails and `--require-pass` was given; otherwise returns.
 *
 * @returns {Promise<void>}
 */
async function main() {
  // Decision thresholds used by this runner for the optional stages.
  const EMBEDDING_SAME_MIN = 0.85; // sim >= this → "same"
  const EMBEDDING_DIFFERENT_BELOW = 0.55; // sim < this → "different"
  const LLM_MIN_CONFIDENCE = 0.7; // yes/no below this goes to review

  console.log("== EntityResolver evaluation ==");
  console.log("fixture:", fixturePath);
  if (!fs.existsSync(fixturePath)) {
    console.error("\nFAIL: fixture not found");
    process.exit(2);
  }
  const data = JSON.parse(fs.readFileSync(fixturePath, "utf-8"));
  // `!data` covers a fixture whose JSON document is literally `null`.
  if (!data || !Array.isArray(data.pairs) || data.pairs.length === 0) {
    console.error("\nFAIL: fixture has no pairs");
    process.exit(2);
  }

  const stages = { embedding: null, llm: null };
  if (useEmbedding) {
    const { EntityResolverEmbeddingStage } = require("../lib/entity-resolver");
    stages.embedding = new EntityResolverEmbeddingStage({}).asStageFn();
    console.log("embedding stage: Ollama nomic-embed-text @ localhost:11434");
  }
  if (useLlm) {
    const { EntityResolverLLMStage } = require("../lib/entity-resolver");
    const { OllamaClient } = require("../lib/llm-client");
    const llm = new OllamaClient({ baseUrl: "http://localhost:11434", model: "qwen2.5:7b-instruct" });
    stages.llm = new EntityResolverLLMStage({ llm }).asStageFn();
    console.log("llm stage: Ollama qwen2.5:7b-instruct");
  }

  // Per-stage counters
  const counts = {
    ruleSame: 0, ruleDifferent: 0, ruleUncertain: 0,
    embeddingSame: 0, embeddingDifferent: 0, embeddingUncertain: 0,
    llmSame: 0, llmDifferent: 0, llmMaybe: 0,
  };

  // Confusion matrix
  const confusion = { tp: 0, fp: 0, fn: 0, tn: 0, unresolved: 0 };

  // Per-pair breakdown
  const results = [];

  for (const pair of data.pairs) {
    const truth = pair.groundTruth; // "same" | "different"
    const ruleVerdict = ruleStage(pair.a, pair.b).verdict;
    let finalVerdict = ruleVerdict;
    let stage = "rule";

    if (ruleVerdict === "same") counts.ruleSame += 1;
    else if (ruleVerdict === "different") counts.ruleDifferent += 1;
    else counts.ruleUncertain += 1;

    // Only a rule verdict of "uncertain" is escalated to the optional stages.
    let undecided = ruleVerdict === "uncertain";

    if (undecided && stages.embedding) {
      const e = await stages.embedding(pair.a, pair.b);
      if (e.sim >= EMBEDDING_SAME_MIN) {
        finalVerdict = "same";
        counts.embeddingSame += 1;
        stage = "embedding";
        undecided = false;
      } else if (e.sim < EMBEDDING_DIFFERENT_BELOW) {
        finalVerdict = "different";
        counts.embeddingDifferent += 1;
        stage = "embedding";
        undecided = false;
      } else {
        counts.embeddingUncertain += 1;
      }
    }

    if (undecided && stages.llm) {
      const v = await stages.llm(pair.a, pair.b);
      if (v.verdict === "yes" && v.confidence >= LLM_MIN_CONFIDENCE) {
        finalVerdict = "same";
        counts.llmSame += 1;
        stage = "llm";
      } else if (v.verdict === "no" && v.confidence >= LLM_MIN_CONFIDENCE) {
        finalVerdict = "different";
        counts.llmDifferent += 1;
        stage = "llm";
      } else {
        finalVerdict = "review";
        counts.llmMaybe += 1;
        stage = "llm-review";
      }
    }

    // Tally confusion (only counting decided verdicts)
    if (finalVerdict === "same" && truth === "same") confusion.tp += 1;
    else if (finalVerdict === "same" && truth === "different") confusion.fp += 1;
    else if (finalVerdict === "different" && truth === "same") confusion.fn += 1;
    else if (finalVerdict === "different" && truth === "different") confusion.tn += 1;
    else confusion.unresolved += 1;

    results.push({ id: pair.id, truth, finalVerdict, stage, category: pair.category });
  }

  // All ratios below are computed over decided pairs only; with nothing to
  // divide by, accuracy falls back to 0 and recall/precision fall back to 1.
  const total = data.pairs.length;
  const resolved = total - confusion.unresolved;
  const accuracy = resolved > 0 ? (confusion.tp + confusion.tn) / resolved : 0;
  const recall = (confusion.tp + confusion.fn) > 0
    ? confusion.tp / (confusion.tp + confusion.fn)
    : 1;
  const precision = (confusion.tp + confusion.fp) > 0
    ? confusion.tp / (confusion.tp + confusion.fp)
    : 1;
  const resolveRate = resolved / total;

  // ── Report ──
  console.log("\nPair counts:", { total, resolved, unresolved: confusion.unresolved });
  console.log("\nPipeline stages:");
  console.log(` Rule: same=${counts.ruleSame} different=${counts.ruleDifferent} uncertain=${counts.ruleUncertain}`);
  if (stages.embedding) console.log(` Embedding: same=${counts.embeddingSame} different=${counts.embeddingDifferent} uncertain=${counts.embeddingUncertain}`);
  if (stages.llm) console.log(` LLM: same=${counts.llmSame} different=${counts.llmDifferent} maybe=${counts.llmMaybe}`);

  console.log("\nConfusion (decided only):");
  console.log(` TP=${confusion.tp} FP=${confusion.fp} FN=${confusion.fn} TN=${confusion.tn}`);
  console.log(` unresolved=${confusion.unresolved} (rule "uncertain" with no embedding/llm wired)`);

  console.log("\nMetrics:");
  const recallStr = (recall * 100).toFixed(1) + "%";
  const accStr = (accuracy * 100).toFixed(1) + "%";
  const precStr = (precision * 100).toFixed(1) + "%";
  const resolveStr = (resolveRate * 100).toFixed(1) + "%";
  console.log(` Recall: ${recallStr} (target ≥ ${(RECALL_GATE * 100).toFixed(0)}%) ${recall >= RECALL_GATE ? "✓" : "✗"}`);
  console.log(` Accuracy: ${accStr} (target ≥ ${(ACCURACY_GATE * 100).toFixed(0)}%) ${accuracy >= ACCURACY_GATE ? "✓" : "✗"}`);
  console.log(` Precision: ${precStr}`);
  console.log(` Resolve rate: ${resolveStr}`);

  // Per-category breakdown (useful for spotting weak spots).
  // Prototype-less map: category names come from the fixture, so a name such
  // as "constructor" must not resolve to an inherited Object member.
  const byCategory = Object.create(null);
  for (const r of results) {
    const cat = r.category || "(uncat)";
    if (!byCategory[cat]) byCategory[cat] = { total: 0, correct: 0, unresolved: 0 };
    byCategory[cat].total += 1;
    if (r.finalVerdict === "review" || r.finalVerdict === "uncertain") {
      byCategory[cat].unresolved += 1;
    } else if (r.finalVerdict === r.truth) {
      byCategory[cat].correct += 1;
    }
  }
  console.log("\nPer-category:");
  for (const cat of Object.keys(byCategory)) {
    const s = byCategory[cat];
    const decided = s.total - s.unresolved;
    // Named differently from the overall accStr above so the gate line below
    // cannot be confused with a per-category figure.
    const categoryAccStr = decided > 0
      ? ((s.correct / decided) * 100).toFixed(0) + "%"
      : "N/A";
    console.log(` ${cat}: ${s.correct}/${decided} correct (${categoryAccStr}); ${s.unresolved} unresolved`);
  }

  // Gate decision
  const passed = recall >= RECALL_GATE && accuracy >= ACCURACY_GATE;
  console.log(`\n${passed ? "✓ PASS" : "✗ FAIL"} — recall ${recallStr} / accuracy ${accStr}`);

  if (!passed && requirePass) {
    process.exit(1);
  }
}
|
|
192
|
+
|
|
193
|
+
/**
 * Minimal long-option parser for this script.
 *
 * Recognised forms:
 *   --flag            → { flag: true }
 *   --key value       → { key: "value" }  (value must not itself start with "--")
 *   --key=value       → { key: "value" }
 *
 * Tokens that do not start with "--" and are not consumed as a value are
 * ignored. The `--key=value` form is split explicitly; otherwise it would be
 * stored as a boolean flag literally named "key=value" and the option the
 * caller meant to set would silently keep its default.
 *
 * @param {string[]} argv - Arguments after the script name.
 * @returns {Object<string, string|boolean>} Parsed options keyed without the leading dashes.
 */
function parseArgs(argv) {
  const out = {};
  for (let i = 0; i < argv.length; i += 1) {
    const token = argv[i];
    if (!token.startsWith("--")) continue;

    const name = token.slice(2);
    const eq = name.indexOf("=");
    if (eq > 0) {
      // Split on the first "=" only so values may contain "=" themselves.
      out[name.slice(0, eq)] = name.slice(eq + 1);
      continue;
    }

    const next = argv[i + 1];
    if (next && !next.startsWith("--")) {
      out[name] = next;
      i += 1; // the value token has been consumed
    } else {
      out[name] = true;
    }
  }
  return out;
}
|
|
209
|
+
|
|
210
|
+
// Anything main() did not handle itself is reported and mapped to exit code 2.
main().catch((failure) => {
  let detail = failure;
  if (failure && failure.message) {
    detail = failure.message;
  }
  console.error("\nFATAL:", detail);
  process.exit(2);
});
|
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* Phase 5.5 smoke — drives EmailAdapter end-to-end with a mock encrypted
|
|
4
|
+
* PDF attachment, exercising:
|
|
5
|
+
* - password-trial loop (3 wrong → 1 right)
|
|
6
|
+
* - text extraction
|
|
7
|
+
* - transactions regex (4 rows from a 招行-style statement)
|
|
8
|
+
* - merging transactions[] into bill template fields
|
|
9
|
+
* - per-attachment pdfExtraction summary
|
|
10
|
+
* - attachment buffer stripping before raw-event emission
|
|
11
|
+
*
|
|
12
|
+
* Uses an INJECTED pdfExtractor so the smoke runs without pulling the
|
|
13
|
+
* heavy pdfjs dep. The shape of the injected output matches what
|
|
14
|
+
* `extractPdfText` from pdf-extractor.js would return.
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
"use strict";
|
|
18
|
+
|
|
19
|
+
const { EmailAdapter } = require("../lib/adapters/email-imap/email-adapter");
|
|
20
|
+
const { extractTransactions } = require("../lib/adapters/email-imap/transactions");
|
|
21
|
+
|
|
22
|
+
const PDF_TEXT = [
|
|
23
|
+
"招商银行信用卡 11 月对账单",
|
|
24
|
+
"持卡人: 张三 尾号 1234",
|
|
25
|
+
"账单周期: 2026-10-26 至 2026-11-25",
|
|
26
|
+
"最后还款日: 2026-12-05 应还金额: ¥3,000.00",
|
|
27
|
+
"",
|
|
28
|
+
"交易明细:",
|
|
29
|
+
"2026-10-30 星巴克 上海中山公园店 -39.00 2,961.00",
|
|
30
|
+
"2026-11-05 京东自营 -899.00 2,062.00",
|
|
31
|
+
"2026-11-12 退款 淘宝 +50.00 2,112.00",
|
|
32
|
+
"2026-11-18 美团外卖 -85.00 2,027.00",
|
|
33
|
+
"",
|
|
34
|
+
"第 1 页 共 1 页",
|
|
35
|
+
].join("\n");
|
|
36
|
+
|
|
37
|
+
const PDF_PASSWORDS = ["wrong1", "wrong2", "wrong3", "987654"];
|
|
38
|
+
|
|
39
|
+
/**
 * Build a session factory for the smoke run.
 *
 * Every session produced by the returned factory serves the same single
 * message (uid 1): connect/close are no-ops, openMailbox reports a
 * one-message mailbox, and fetchFullSince yields the message only when its
 * uid is greater than the requested `sinceUid`.
 *
 * @returns {Function} Zero-argument factory returning a session object.
 */
function makeSession() {
  const receivedAt = "2026-11-26T10:00:00Z";
  const message = {
    uid: 1,
    internalDate: new Date(receivedAt),
    flags: ["\\Seen"],
    messageId: "<bill-cmb-11@x>",
    subject: "招商银行信用卡 11 月对账单",
    from: [{ name: "招商银行", address: "ebank@cmbchina.com" }],
    to: [{ address: "me@example.com" }],
    cc: [],
    date: new Date(receivedAt),
    size: 8192,
    source: Buffer.from("RAW", "utf8"),
  };

  return function createSession() {
    const session = {
      async connect() {},
      async openMailbox(_name) {
        const mailbox = { uidValidity: 1, uidNext: 9999, exists: 1 };
        return mailbox;
      },
      async *fetchFullSince(sinceUid = 0) {
        // Nothing newer than the caller's cursor → empty stream.
        if (message.uid <= sinceUid) return;
        yield message;
      },
      async close() {},
    };
    return session;
  };
}
|
|
64
|
+
|
|
65
|
+
/**
 * Injected stand-in for the real PDF extractor used by the smoke run.
 *
 * Tries the empty password first, then each entry of `opts.passwords` in
 * order. The attempt whose password equals "987654" succeeds and returns the
 * fixture text; if none matches, a failure record is returned instead.
 *
 * The attempt counter is local to each call, so calls cannot influence one
 * another, and a missing `opts` is treated as "no candidate passwords".
 *
 * @param {*} buffer - Attachment bytes; not inspected by this mock.
 * @param {{passwords?: string[]}} [opts] - Candidate passwords to try.
 * @returns {Promise<object>} Result with `decrypted`, `text`, `attempted`,
 *   `wasEncrypted`, `pageCount`, plus `password` on success or `error` on failure.
 */
async function mockPdfExtractor(buffer, opts) {
  const candidates = ["", ...((opts && opts.passwords) || [])];
  let attempted = 0;
  for (const pw of candidates) {
    attempted += 1;
    if (pw === "987654") {
      return {
        decrypted: true,
        text: PDF_TEXT,
        password: pw,
        attempted,
        wasEncrypted: true,
        pageCount: 1,
      };
    }
  }
  return {
    decrypted: false,
    text: "",
    attempted,
    wasEncrypted: true,
    pageCount: 0,
    error: "all passwords failed",
  };
}
|
|
90
|
+
|
|
91
|
+
/**
 * Phase 5.5 smoke entry point.
 *
 * 1. Runs the standalone transactions parser on the fixture text and expects
 *    exactly 4 rows.
 * 2. Drives an EmailAdapter (mock session, mock parser, injected PDF
 *    extractor) through sync() and, for each emitted raw event, prints the
 *    extraction summary and checks that neither the attachment bytes nor the
 *    real password appear in the serialized payload, and that normalize()
 *    carries the transactions into `extra.fields`.
 *
 * Failures are reported on the console and recorded via `process.exitCode = 1`;
 * the function does not call `process.exit`.
 *
 * @returns {Promise<void>}
 */
async function main() {
  console.log("== Phase 5.5 smoke ==");

  // First validate the standalone transactions parser on the fixture text
  console.log("\n— Standalone transactions regex —");
  const standaloneTxns = extractTransactions(PDF_TEXT);
  console.log(`extracted ${standaloneTxns.length} transactions:`);
  for (const t of standaloneTxns) {
    const dir = t.amount.direction || "?";
    const date = new Date(t.occurredAtMs).toISOString().slice(0, 10);
    console.log(` ${date} ${dir.padEnd(3)} ¥${t.amount.value.toFixed(2).padStart(8)} ${t.description}`);
  }
  if (standaloneTxns.length !== 4) {
    console.log(`FAIL: expected 4 transactions, got ${standaloneTxns.length}`);
    process.exitCode = 1;
    return;
  }

  // Now full pipeline
  console.log("\n— Full adapter pipeline —");

  const a = new EmailAdapter({
    account: { provider: "qq", email: "me@qq.com", authCode: "x", folders: ["INBOX"] },
    sessionFactory: makeSession(),
    parser: async () => ({
      textBody: "您的招商银行信用卡 11 月对账单已生成,详情见附件 PDF。",
      attachments: [{
        filename: "招行账单_11月.pdf",
        contentType: "application/pdf",
        contentDisposition: "attachment",
        size: 78_456,
        sha256: "abc123sha256deadbeef",
        isInline: false,
        isEncrypted: true,
        buffer: Buffer.from("FAKE-PDF-BYTES-DO-NOT-LEAK"),
      }],
    }),
    pdfExtractor: mockPdfExtractor,
    pdfPasswordHints: { idCardLast6: "987654", phoneLast6: "555000" },
    pdfPasswords: ["wrong1", "wrong2", "wrong3"], // tried before hints
  });

  console.log("adapter.version =", a.version);
  console.log("adapter.capabilities =", a.capabilities.join(", "));
  console.log("pdfPasswords (merged) =", a._pdfPasswords);

  let count = 0;
  for await (const raw of a.sync()) {
    count += 1;
    const ext = raw.payload.extraction;
    // One guarded view of the fields, used for both the key listing and the
    // transactions dump, so a payload without `fields` is reported rather than
    // crashing on a property access.
    const fields = ext.fields || {};
    console.log(`\nemail #${count} subject: ${raw.payload.subject}`);
    console.log(" classification.category:", raw.payload.classification.category);
    console.log(" extraction.template :", ext.template);
    console.log(" extraction.confidence :", ext.confidence);
    console.log(" extraction.fields keys :", Object.keys(fields).join(", "));
    if (fields.transactions) {
      console.log(` transactions[] count : ${fields.transactions.length}`);
      for (const t of fields.transactions) {
        const date = new Date(t.occurredAtMs).toISOString().slice(0, 10);
        console.log(` ${date} ¥${t.amount.value} ${t.amount.direction} ${t.description}`);
      }
    }
    console.log(" pdfExtraction[] :");
    for (const p of ext.pdfExtraction || []) {
      console.log(` ${p.filename}: decrypted=${p.decrypted} attempted=${p.attempted} txns=${p.transactionsExtracted ?? "-"}`);
    }

    // Buffer-leakage check
    const serialized = JSON.stringify(raw);
    if (serialized.includes("FAKE-PDF-BYTES-DO-NOT-LEAK")) {
      console.log("\nBUFFER LEAK ✗ — raw PDF bytes survived into payload!");
      process.exitCode = 1;
    } else {
      console.log(" buffer stripping : ✓ no PDF bytes in payload");
    }
    // Password-leakage check
    if (serialized.includes("987654")) {
      console.log(" PASSWORD LEAK ✗ — real password survived into payload");
      process.exitCode = 1;
    } else {
      console.log(" password redaction : ✓ real password not in payload");
    }

    // Normalize and confirm transactions land in extra.fields.
    // A missing event or missing `extra` counts as the same failure as
    // missing transactions instead of throwing a TypeError.
    const batch = a.normalize(raw);
    const ev = batch.events[0];
    const normalizedFields = ev?.extra?.fields;
    if (normalizedFields && Array.isArray(normalizedFields.transactions)) {
      console.log(` normalize → extra.fields.transactions: ${normalizedFields.transactions.length} rows ✓`);
    } else {
      console.log(" normalize MISSING transactions in extra.fields ✗");
      process.exitCode = 1;
    }
  }

  if (count === 1 && !process.exitCode) {
    console.log("\n== Phase 5.5 smoke PASSED ==");
  } else if (!process.exitCode) {
    console.log(`expected 1 email, got ${count}`);
    process.exitCode = 1;
  }
}
|
|
192
|
+
|
|
193
|
+
main().catch((err) => {
|
|
194
|
+
console.error("smoke failed:", err);
|
|
195
|
+
process.exitCode = 1;
|
|
196
|
+
});
|