@chainlesschain/personal-data-hub 0.3.6 → 0.3.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/__tests__/adapters/social-kuaishou-adb-api-client.test.js +432 -0
- package/__tests__/adapters/social-kuaishou-adb-collector.test.js +276 -0
- package/__tests__/adapters/social-kuaishou-adb-cookies-extension.test.js +141 -0
- package/__tests__/adapters/social-kuaishou-adb-snapshot-builder.test.js +178 -0
- package/__tests__/adapters/social-toutiao-adb-api-client.test.js +537 -0
- package/__tests__/adapters/social-toutiao-adb-collector.test.js +285 -0
- package/__tests__/adapters/social-toutiao-adb-cookies-extension.test.js +163 -0
- package/__tests__/adapters/social-toutiao-adb-snapshot-builder.test.js +196 -0
- package/__tests__/adapters/social-xiaohongshu-adb-sign-provider-injection.test.js +351 -0
- package/__tests__/analysis.test.js +239 -14
- package/__tests__/query-parser.test.js +86 -0
- package/__tests__/vault.test.js +88 -0
- package/lib/adapters/ai-chat-history/health-checker.js +11 -0
- package/lib/adapters/social-kuaishou-adb/api-client.js +397 -0
- package/lib/adapters/social-kuaishou-adb/collector.js +196 -0
- package/lib/adapters/social-kuaishou-adb/cookies-extension.js +261 -0
- package/lib/adapters/social-kuaishou-adb/index.js +53 -0
- package/lib/adapters/social-kuaishou-adb/snapshot-builder.js +145 -0
- package/lib/adapters/social-toutiao-adb/api-client.js +377 -0
- package/lib/adapters/social-toutiao-adb/collector.js +200 -0
- package/lib/adapters/social-toutiao-adb/cookies-extension.js +266 -0
- package/lib/adapters/social-toutiao-adb/index.js +52 -0
- package/lib/adapters/social-toutiao-adb/snapshot-builder.js +148 -0
- package/lib/adapters/social-xiaohongshu-adb/api-client.js +36 -5
- package/lib/adapters/social-xiaohongshu-adb/collector.js +102 -51
- package/lib/analysis.js +154 -17
- package/lib/query-parser.js +93 -0
- package/lib/vault.js +64 -0
- package/package.json +5 -1
package/lib/adapters/social-xiaohongshu-adb/collector.js
CHANGED
|
@@ -36,7 +36,12 @@ async function collect(bridge, opts = {}) {
|
|
|
36
36
|
);
|
|
37
37
|
}
|
|
38
38
|
const now = opts.now || Date.now;
|
|
39
|
-
|
|
39
|
+
// Phase 6b: signProvider opt — desktop wiring injects XhsSignBridge for
|
|
40
|
+
// ~100% X-S hit rate; cli wiring leaves undefined → client falls back
|
|
41
|
+
// to in-process best-effort md5 (~60% GET / <30% POST).
|
|
42
|
+
const signProvider = opts.signProvider || undefined;
|
|
43
|
+
const client =
|
|
44
|
+
opts.apiClient || new XhsApiClient({ now, signProvider });
|
|
40
45
|
const limits = opts.limits || {};
|
|
41
46
|
|
|
42
47
|
const cookieResult = await bridge.invoke("xhs.cookies");
|
|
@@ -54,67 +59,108 @@ async function collect(bridge, opts = {}) {
|
|
|
54
59
|
}
|
|
55
60
|
const { cookie, a1, diagnostic: cookieDiagnostic } = cookieResult;
|
|
56
61
|
|
|
57
|
-
//
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
+
// Phase 6b: warm up the sign bridge with the captured cookie BEFORE
|
|
63
|
+
// calling any X-S endpoint. warmUp is idempotent (no-op when already
|
|
64
|
+
// warm). NullSignProvider.warmUp doesn't exist (only on the abstract
|
|
65
|
+
// base + ElectronWebSignBridge), so we feature-detect.
|
|
66
|
+
if (signProvider && typeof signProvider.warmUp === "function") {
|
|
67
|
+
try {
|
|
68
|
+
await signProvider.warmUp(cookie);
|
|
69
|
+
} catch (e) {
|
|
70
|
+
// Bridge warm-up failed (timeout / xhs.com 403 / IPC error).
|
|
71
|
+
// Fall through — api-client will use in-process fallback. Surface
|
|
72
|
+
// the reason via lastErrorMessage so UI can hint "Electron bridge
|
|
73
|
+
// unavailable, command-line precision degraded".
|
|
74
|
+
client._setLastError(
|
|
75
|
+
-98,
|
|
76
|
+
`signProvider warm-up failed: ${e && e.message ? e.message : String(e)}`,
|
|
77
|
+
);
|
|
78
|
+
}
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
try {
|
|
82
|
+
// fetchMe — no X-S required
|
|
83
|
+
const me = await client.fetchMe(cookie);
|
|
84
|
+
if (!me) {
|
|
85
|
+
// Cookie expired or web_session missing — write empty snapshot
|
|
86
|
+
// (build requires userId, use sentinel "unknown-user" + emit 0 events).
|
|
87
|
+
const snapshot = buildSnapshot({
|
|
88
|
+
userId: "unknown-user",
|
|
89
|
+
nickname: opts.displayName,
|
|
90
|
+
snapshottedAt: now(),
|
|
91
|
+
});
|
|
92
|
+
const snapshotPath = writeSnapshotJson(snapshot, { dir: opts.stagingDir });
|
|
93
|
+
return {
|
|
94
|
+
snapshotPath,
|
|
95
|
+
userId: null,
|
|
96
|
+
nickname: null,
|
|
97
|
+
eventCounts: { note: 0, liked: 0, follow: 0, total: 0 },
|
|
98
|
+
lastErrorCode: client.lastErrorCode,
|
|
99
|
+
lastErrorMessage: client.lastErrorMessage,
|
|
100
|
+
cookieDiagnostic: cookieDiagnostic || null,
|
|
101
|
+
meFetchFailed: true,
|
|
102
|
+
signProviderUsed: signProvider
|
|
103
|
+
? signProvider.constructor.name
|
|
104
|
+
: "none",
|
|
105
|
+
signProviderHits: client._bridgeHits,
|
|
106
|
+
signProviderFallbacks: client._fallbackHits,
|
|
107
|
+
};
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
// Parallel 3 endpoints — partial failure tolerated; bridge-signed
|
|
111
|
+
// requests should hit ~100% while fallback hits ~60% GET / <30% POST.
|
|
112
|
+
const [notes, liked, follows] = await Promise.all([
|
|
113
|
+
client.fetchNotes(cookie, a1, me.userId, {
|
|
114
|
+
limit: Number.isInteger(limits.note) ? limits.note : undefined,
|
|
115
|
+
}),
|
|
116
|
+
client.fetchLiked(cookie, a1, {
|
|
117
|
+
limit: Number.isInteger(limits.liked) ? limits.liked : undefined,
|
|
118
|
+
}),
|
|
119
|
+
client.fetchFollows(cookie, a1, me.userId, {
|
|
120
|
+
limit: Number.isInteger(limits.follow) ? limits.follow : undefined,
|
|
121
|
+
}),
|
|
122
|
+
]);
|
|
123
|
+
|
|
62
124
|
const snapshot = buildSnapshot({
|
|
63
|
-
userId:
|
|
64
|
-
nickname: opts.displayName,
|
|
125
|
+
userId: me.userId,
|
|
126
|
+
nickname: opts.displayName || me.nickname,
|
|
127
|
+
notes,
|
|
128
|
+
liked,
|
|
129
|
+
follows,
|
|
65
130
|
snapshottedAt: now(),
|
|
66
131
|
});
|
|
67
132
|
const snapshotPath = writeSnapshotJson(snapshot, { dir: opts.stagingDir });
|
|
133
|
+
|
|
68
134
|
return {
|
|
69
135
|
snapshotPath,
|
|
70
|
-
userId:
|
|
71
|
-
nickname:
|
|
72
|
-
eventCounts: {
|
|
136
|
+
userId: me.userId,
|
|
137
|
+
nickname: me.nickname,
|
|
138
|
+
eventCounts: {
|
|
139
|
+
note: notes.length,
|
|
140
|
+
liked: liked.length,
|
|
141
|
+
follow: follows.length,
|
|
142
|
+
total: snapshot.events.length,
|
|
143
|
+
},
|
|
73
144
|
lastErrorCode: client.lastErrorCode,
|
|
74
145
|
lastErrorMessage: client.lastErrorMessage,
|
|
75
146
|
cookieDiagnostic: cookieDiagnostic || null,
|
|
76
|
-
meFetchFailed:
|
|
147
|
+
meFetchFailed: false,
|
|
148
|
+
signProviderUsed: signProvider ? signProvider.constructor.name : "none",
|
|
149
|
+
signProviderHits: client._bridgeHits,
|
|
150
|
+
signProviderFallbacks: client._fallbackHits,
|
|
77
151
|
};
|
|
152
|
+
} finally {
|
|
153
|
+
// Always release the WebContentsView heap (~30-50MB) — even on
|
|
154
|
+
// throw. shutdown is idempotent so collectAndSync's outer cleanup
|
|
155
|
+
// calling it again is safe.
|
|
156
|
+
if (signProvider && typeof signProvider.shutdown === "function") {
|
|
157
|
+
try {
|
|
158
|
+
await signProvider.shutdown();
|
|
159
|
+
} catch (_e) {
|
|
160
|
+
// Best-effort — shutdown errors don't block sync result.
|
|
161
|
+
}
|
|
162
|
+
}
|
|
78
163
|
}
|
|
79
|
-
|
|
80
|
-
// Parallel 3 endpoints — partial failure tolerated (~60% X-S hit rate)
|
|
81
|
-
const [notes, liked, follows] = await Promise.all([
|
|
82
|
-
client.fetchNotes(cookie, a1, me.userId, {
|
|
83
|
-
limit: Number.isInteger(limits.note) ? limits.note : undefined,
|
|
84
|
-
}),
|
|
85
|
-
client.fetchLiked(cookie, a1, {
|
|
86
|
-
limit: Number.isInteger(limits.liked) ? limits.liked : undefined,
|
|
87
|
-
}),
|
|
88
|
-
client.fetchFollows(cookie, a1, me.userId, {
|
|
89
|
-
limit: Number.isInteger(limits.follow) ? limits.follow : undefined,
|
|
90
|
-
}),
|
|
91
|
-
]);
|
|
92
|
-
|
|
93
|
-
const snapshot = buildSnapshot({
|
|
94
|
-
userId: me.userId,
|
|
95
|
-
nickname: opts.displayName || me.nickname,
|
|
96
|
-
notes,
|
|
97
|
-
liked,
|
|
98
|
-
follows,
|
|
99
|
-
snapshottedAt: now(),
|
|
100
|
-
});
|
|
101
|
-
const snapshotPath = writeSnapshotJson(snapshot, { dir: opts.stagingDir });
|
|
102
|
-
|
|
103
|
-
return {
|
|
104
|
-
snapshotPath,
|
|
105
|
-
userId: me.userId,
|
|
106
|
-
nickname: me.nickname,
|
|
107
|
-
eventCounts: {
|
|
108
|
-
note: notes.length,
|
|
109
|
-
liked: liked.length,
|
|
110
|
-
follow: follows.length,
|
|
111
|
-
total: snapshot.events.length,
|
|
112
|
-
},
|
|
113
|
-
lastErrorCode: client.lastErrorCode,
|
|
114
|
-
lastErrorMessage: client.lastErrorMessage,
|
|
115
|
-
cookieDiagnostic: cookieDiagnostic || null,
|
|
116
|
-
meFetchFailed: false,
|
|
117
|
-
};
|
|
118
164
|
}
|
|
119
165
|
|
|
120
166
|
async function collectAndSync(bridge, registry, opts = {}) {
|
|
@@ -147,6 +193,11 @@ async function collectAndSync(bridge, registry, opts = {}) {
|
|
|
147
193
|
lastErrorMessage: collectResult.lastErrorMessage,
|
|
148
194
|
cookieDiagnostic: collectResult.cookieDiagnostic,
|
|
149
195
|
meFetchFailed: collectResult.meFetchFailed,
|
|
196
|
+
// Phase 6b diagnostic — UI can highlight when bridge upgraded
|
|
197
|
+
// X-S signing from ~60% best-effort to ~100% bridge.
|
|
198
|
+
signProviderUsed: collectResult.signProviderUsed,
|
|
199
|
+
signProviderHits: collectResult.signProviderHits,
|
|
200
|
+
signProviderFallbacks: collectResult.signProviderFallbacks,
|
|
150
201
|
cleanupFailed,
|
|
151
202
|
},
|
|
152
203
|
};
|
package/lib/analysis.js
CHANGED
|
@@ -21,7 +21,7 @@
|
|
|
21
21
|
|
|
22
22
|
"use strict";
|
|
23
23
|
|
|
24
|
-
const { parseQuery, extractEntityTerm } = require("./query-parser");
|
|
24
|
+
const { parseQuery, extractEntityTerm, extractPersonNameCandidate } = require("./query-parser");
|
|
25
25
|
const {
|
|
26
26
|
buildPrompt,
|
|
27
27
|
parseCitations,
|
|
@@ -61,6 +61,27 @@ const SUM_AMOUNT_SUBTYPES = ["order", "payment", "transfer", "income"];
|
|
|
61
61
|
// 12) doesn't starve any single subtype.
|
|
62
62
|
const SUM_AMOUNT_MIN_PER_SUBTYPE = 20;
|
|
63
63
|
|
|
64
|
+
// entityFocus="persons" routing — explicit contact queries ("我有哪些联系人",
|
|
65
|
+
// "妈手机号"). When the user names the target table the engine MUST NOT
|
|
66
|
+
// compete persons against the events pool: small-model Android budgets
|
|
67
|
+
// (20 facts / 50 row cap) get drained by a few hundred Bilibili
|
|
68
|
+
// notifications and the contact slice ends up empty. parseEntityFocus
|
|
69
|
+
// surfaces the signal; we honor it by going persons-first.
|
|
70
|
+
//
|
|
71
|
+
// Keep a TINY events headroom (5%) so questions like "我最近跟妈打过电话吗"
|
|
72
|
+
// still surface 通话 event rows alongside the contact entry.
|
|
73
|
+
const PERSONS_FOCUS_EVENT_HEADROOM_RATIO = 0.05;
|
|
74
|
+
|
|
75
|
+
// Default-path budget split when no entityFocus signal. Pre-fix events
|
|
76
|
+
// got the entire effMaxFacts pool first and persons/items shared only the
|
|
77
|
+
// remainder; on a busy vault that meant 0 contacts in the prompt. Cap
|
|
78
|
+
// events at 70%, reserve 20% for persons and 10% for items so a generic
|
|
79
|
+
// "what's going on" question still sees the full data shape.
|
|
80
|
+
const DEFAULT_EVENT_BUDGET_RATIO = 0.7;
|
|
81
|
+
const DEFAULT_PERSON_BUDGET_RATIO = 0.2;
|
|
82
|
+
// Items take whatever remains; intent=count/list questions about contacts
|
|
83
|
+
// already short-circuit via entityFocus before reaching this branch.
|
|
84
|
+
|
|
64
85
|
class AnalysisEngine {
|
|
65
86
|
/**
|
|
66
87
|
* @param {object} opts
|
|
@@ -426,6 +447,88 @@ class AnalysisEngine {
|
|
|
426
447
|
// 0 results → fall through to default broader path below.
|
|
427
448
|
}
|
|
428
449
|
|
|
450
|
+
// entityFocus=persons routing — "我有哪些联系人", "妈手机号", "通讯录里
|
|
451
|
+
// 有多少人". Skip the events broad scan and put the entire fact budget
|
|
452
|
+
// on the persons table (with a 5% events headroom for adjacent rows
|
|
453
|
+
// like 通话/短信). Adapter / time window are NOT applied to persons:
|
|
454
|
+
// contacts are current-state snapshots, not time-stamped events.
|
|
455
|
+
//
|
|
456
|
+
// 0 hits → fall through to the default path. A user might say "联系人"
|
|
457
|
+
// colloquially when they mean "people I've messaged" — the default
|
|
458
|
+
// events+persons mix is the right safety net.
|
|
459
|
+
if (parsed.entityFocus === "persons") {
|
|
460
|
+
const personLimit = effMaxFacts > 1 ? effMaxFacts - 1 : effMaxFacts;
|
|
461
|
+
let persons = [];
|
|
462
|
+
// Name-search short-circuit — when the question carries a probable
|
|
463
|
+
// person-name candidate ("妈手机号", "张三的电话"), try LIKE-search
|
|
464
|
+
// against names / identifiers / notes / relation. Hits go straight
|
|
465
|
+
// to FACTS so the LLM sees the target contact even when the vault
|
|
466
|
+
// holds hundreds of others. Falls back to ingest-ordered queryPersons
|
|
467
|
+
// when 0 hits or no name candidate.
|
|
468
|
+
const nameCandidate = extractPersonNameCandidate(parsed.raw);
|
|
469
|
+
if (nameCandidate && typeof this.vault.searchPersons === "function") {
|
|
470
|
+
try {
|
|
471
|
+
persons = this.vault.searchPersons({ q: nameCandidate, limit: personLimit });
|
|
472
|
+
} catch (_e) { /* tolerate — try ingest-ordered fallback */ }
|
|
473
|
+
}
|
|
474
|
+
if (persons.length === 0) {
|
|
475
|
+
try {
|
|
476
|
+
persons = this.vault.queryPersons({ limit: personLimit });
|
|
477
|
+
} catch (_e) {
|
|
478
|
+
// legacy vault — fall through
|
|
479
|
+
}
|
|
480
|
+
}
|
|
481
|
+
if (persons.length > 0) {
|
|
482
|
+
const eventHeadroom = Math.max(
|
|
483
|
+
0,
|
|
484
|
+
Math.floor(effMaxFacts * PERSONS_FOCUS_EVENT_HEADROOM_RATIO)
|
|
485
|
+
);
|
|
486
|
+
let events = [];
|
|
487
|
+
if (eventHeadroom > 0) {
|
|
488
|
+
const eq = { limit: eventHeadroom };
|
|
489
|
+
if (parsed.filters && parsed.filters.adapter) eq.adapter = parsed.filters.adapter;
|
|
490
|
+
if (parsed.timeWindow) {
|
|
491
|
+
if (Number.isFinite(parsed.timeWindow.since)) eq.since = parsed.timeWindow.since;
|
|
492
|
+
if (Number.isFinite(parsed.timeWindow.until)) eq.until = parsed.timeWindow.until;
|
|
493
|
+
}
|
|
494
|
+
try {
|
|
495
|
+
events = this.vault.queryEvents(eq);
|
|
496
|
+
} catch (_e) { /* tolerate */ }
|
|
497
|
+
}
|
|
498
|
+
// persons-first ordering so the LLM reads the contact rows before
|
|
499
|
+
// the (sparse) event tail.
|
|
500
|
+
const combined = [...persons, ...events].slice(0, effMaxFacts);
|
|
501
|
+
return combined;
|
|
502
|
+
}
|
|
503
|
+
// 0 persons → fall through.
|
|
504
|
+
}
|
|
505
|
+
|
|
506
|
+
// entityFocus=items routing — "我装了哪些 app", "有哪些游戏". Mirror
|
|
507
|
+
// persons branch: skip events, query items table directly, keep a
|
|
508
|
+
// tiny events headroom for adjacent rows.
|
|
509
|
+
if (parsed.entityFocus === "items") {
|
|
510
|
+
const itemLimit = effMaxFacts > 1 ? effMaxFacts - 1 : effMaxFacts;
|
|
511
|
+
let items = [];
|
|
512
|
+
try {
|
|
513
|
+
items = this.vault.queryItems({ limit: itemLimit });
|
|
514
|
+
} catch (_e) { /* legacy */ }
|
|
515
|
+
if (items.length > 0) {
|
|
516
|
+
const eventHeadroom = Math.max(
|
|
517
|
+
0,
|
|
518
|
+
Math.floor(effMaxFacts * PERSONS_FOCUS_EVENT_HEADROOM_RATIO)
|
|
519
|
+
);
|
|
520
|
+
let events = [];
|
|
521
|
+
if (eventHeadroom > 0) {
|
|
522
|
+
const eq = { limit: eventHeadroom };
|
|
523
|
+
if (parsed.filters && parsed.filters.adapter) eq.adapter = parsed.filters.adapter;
|
|
524
|
+
try {
|
|
525
|
+
events = this.vault.queryEvents(eq);
|
|
526
|
+
} catch (_e) { /* tolerate */ }
|
|
527
|
+
}
|
|
528
|
+
return [...items, ...events].slice(0, effMaxFacts);
|
|
529
|
+
}
|
|
530
|
+
}
|
|
531
|
+
|
|
429
532
|
// intent=sum-amount routing — "总共花了多少" / "在淘宝花了多少钱"
|
|
430
533
|
// only needs events from amount-bearing subtypes (order/payment/
|
|
431
534
|
// transfer/income). Pulling messages / visits / browses wastes
|
|
@@ -551,22 +654,40 @@ class AnalysisEngine {
|
|
|
551
654
|
// - installed apps land in `items`, not `events`
|
|
552
655
|
// - places (visited locations) live in `places`
|
|
553
656
|
// Without these the LLM gets 0 facts for "我有几个联系人" style questions
|
|
554
|
-
// and hallucinates a count.
|
|
555
|
-
//
|
|
556
|
-
//
|
|
557
|
-
//
|
|
657
|
+
// and hallucinates a count.
|
|
658
|
+
//
|
|
659
|
+
// Sizing — two regimes:
|
|
660
|
+
// (a) Events fit (events.length < effMaxFacts): legacy behavior —
|
|
661
|
+
// events first, split the remainder evenly between persons + items.
|
|
662
|
+
// (b) Events would monopolize (events.length >= effMaxFacts): reserve
|
|
663
|
+
// DEFAULT_PERSON_BUDGET_RATIO (20%) + 10% for persons + items so a
|
|
664
|
+
// busy event timeline doesn't shove every contact out of the prompt.
|
|
665
|
+
// If persons + items tables BOTH return 0 rows, refill the reserve
|
|
666
|
+
// with events — no point starving the LLM of facts when the side
|
|
667
|
+
// tables are empty (small vaults / pre-Path-C ingest state).
|
|
558
668
|
//
|
|
559
|
-
//
|
|
560
|
-
//
|
|
561
|
-
//
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
|
|
669
|
+
// Time window + adapter filters don't apply to persons/items: they're
|
|
670
|
+
// current-state snapshots, not time-stamped events. A user asking
|
|
671
|
+
// "上个月联系人变化" is rare enough to leave for a future intent.
|
|
672
|
+
let cappedEvents = events;
|
|
673
|
+
let personBudget;
|
|
674
|
+
let itemBudget;
|
|
675
|
+
if (events.length >= effMaxFacts) {
|
|
676
|
+
const personReserve = Math.max(1, Math.floor(effMaxFacts * DEFAULT_PERSON_BUDGET_RATIO));
|
|
677
|
+
const itemReserve = Math.max(
|
|
678
|
+
1,
|
|
679
|
+
Math.floor(effMaxFacts * (1 - DEFAULT_EVENT_BUDGET_RATIO - DEFAULT_PERSON_BUDGET_RATIO))
|
|
680
|
+
);
|
|
681
|
+
const eventCap = Math.max(1, effMaxFacts - personReserve - itemReserve);
|
|
682
|
+
cappedEvents = events.slice(0, eventCap);
|
|
683
|
+
personBudget = personReserve;
|
|
684
|
+
itemBudget = itemReserve;
|
|
685
|
+
} else {
|
|
686
|
+
const remaining = effMaxFacts - events.length;
|
|
687
|
+
const sideBudget = Math.floor(remaining / 2);
|
|
688
|
+
personBudget = sideBudget > 0 ? sideBudget : 0;
|
|
689
|
+
itemBudget = remaining - personBudget;
|
|
690
|
+
}
|
|
570
691
|
|
|
571
692
|
let persons = [];
|
|
572
693
|
if (personBudget > 0) {
|
|
@@ -585,7 +706,20 @@ class AnalysisEngine {
|
|
|
585
706
|
}
|
|
586
707
|
}
|
|
587
708
|
|
|
588
|
-
|
|
709
|
+
// Refill backfill — when events overflowed (reservation branch) but
|
|
710
|
+
// persons + items both returned 0 rows, give the reserved slots back
|
|
711
|
+
// to events. Small vaults / pre-Path-C state would otherwise see fewer
|
|
712
|
+
// facts than the budget allowed.
|
|
713
|
+
if (
|
|
714
|
+
events.length >= effMaxFacts &&
|
|
715
|
+
persons.length === 0 &&
|
|
716
|
+
items.length === 0 &&
|
|
717
|
+
cappedEvents.length < effMaxFacts
|
|
718
|
+
) {
|
|
719
|
+
cappedEvents = events.slice(0, effMaxFacts);
|
|
720
|
+
}
|
|
721
|
+
|
|
722
|
+
return [...cappedEvents, ...persons, ...items];
|
|
589
723
|
}
|
|
590
724
|
|
|
591
725
|
/**
|
|
@@ -630,4 +764,7 @@ module.exports = {
|
|
|
630
764
|
LIST_INTENT_FTS_LIMIT,
|
|
631
765
|
SUM_AMOUNT_SUBTYPES,
|
|
632
766
|
SUM_AMOUNT_MIN_PER_SUBTYPE,
|
|
767
|
+
PERSONS_FOCUS_EVENT_HEADROOM_RATIO,
|
|
768
|
+
DEFAULT_EVENT_BUDGET_RATIO,
|
|
769
|
+
DEFAULT_PERSON_BUDGET_RATIO,
|
|
633
770
|
};
|
package/lib/query-parser.js
CHANGED
|
@@ -219,6 +219,42 @@ function parseIntent(text) {
|
|
|
219
219
|
return "list";
|
|
220
220
|
}
|
|
221
221
|
|
|
222
|
+
// ─── Entity-focus detection (persons / items routing) ────────────────────
|
|
223
|
+
//
|
|
224
|
+
// 2026-05-27 — Bug: user asked "我有哪些联系人" / "我妈手机号" several times;
|
|
225
|
+
// vault held real contacts but the LLM kept replying "没数据" because the
|
|
226
|
+
// default _gatherFacts pulled 200 row-cap of events first and the persons
|
|
227
|
+
// slice got squeezed out of the small-model 20-fact budget. parseIntent
|
|
228
|
+
// already catches "几个 X" as count, but that doesn't tell the engine WHICH
|
|
229
|
+
// table the user means. parseEntityFocus is the missing signal: when the
|
|
230
|
+
// question is explicitly about contacts/apps, the engine prioritizes that
|
|
231
|
+
// table instead of competing with events.
|
|
232
|
+
//
|
|
233
|
+
// Returns null when no focus signal — engine falls back to the existing
|
|
234
|
+
// events-majority + persons/items remainder behavior.
|
|
235
|
+
//
|
|
236
|
+
// Memory: pdh_analysis_engine_intent_routing.md.
|
|
237
|
+
|
|
238
|
+
const PERSON_FOCUS_PATTERNS = [
|
|
239
|
+
/(联系人|通讯录|电话簿|通信录|好友列表|朋友列表)/,
|
|
240
|
+
/(手机号|电话号|号码是|的电话|的手机)/,
|
|
241
|
+
/(谁是|是谁|是什么人)/,
|
|
242
|
+
/\b(contact|contacts|phonebook|address\s*book|phone\s*number)\b/i,
|
|
243
|
+
];
|
|
244
|
+
|
|
245
|
+
const ITEM_FOCUS_PATTERNS = [
|
|
246
|
+
/(装了|安装了|装过|下了什么|下载了什么|有哪些(app|应用|软件|游戏))/i,
|
|
247
|
+
/(我的(app|应用|软件)|哪些(app|应用|软件|游戏))/i,
|
|
248
|
+
/\b(installed\s+apps?|my\s+apps?|installed\s+packages?)\b/i,
|
|
249
|
+
];
|
|
250
|
+
|
|
251
|
+
function parseEntityFocus(text) {
|
|
252
|
+
if (typeof text !== "string" || text.length === 0) return null;
|
|
253
|
+
if (PERSON_FOCUS_PATTERNS.some((re) => re.test(text))) return "persons";
|
|
254
|
+
if (ITEM_FOCUS_PATTERNS.some((re) => re.test(text))) return "items";
|
|
255
|
+
return null;
|
|
256
|
+
}
|
|
257
|
+
|
|
222
258
|
// ─── Entity-name extraction (FTS5 fulltext routing) ────────────────────
|
|
223
259
|
//
|
|
224
260
|
// Pull a probable entity-name candidate out of the raw question so
|
|
@@ -291,6 +327,56 @@ function extractEntityTerm(text) {
|
|
|
291
327
|
return candidates[0];
|
|
292
328
|
}
|
|
293
329
|
|
|
330
|
+
// ─── Person-name extraction (entityFocus=persons routing) ────────────────
|
|
331
|
+
//
|
|
332
|
+
// Specialized extractor for the persons branch in AnalysisEngine. Differs
|
|
333
|
+
// from extractEntityTerm in two ways:
|
|
334
|
+
//
|
|
335
|
+
// 1. Strips person-FOCUS framing words first (联系人/手机号/电话/etc.) —
|
|
336
|
+
// they're question scaffolding, not the target name. extractEntityTerm
|
|
337
|
+
// left "妈手机号" intact because it doesn't know that phrase is framing.
|
|
338
|
+
//
|
|
339
|
+
// 2. Allows single-character names from a relation-word whitelist
|
|
340
|
+
// (妈/爸/姐/弟/...) — extractEntityTerm filtered every 1-char Chinese to
|
|
341
|
+
// suppress verb false positives, but that also dropped "妈" / "爸" which
|
|
342
|
+
// are the dominant contact-name shorthands on a personal phonebook.
|
|
343
|
+
//
|
|
344
|
+
// Multi-char candidates always win over single-char fallback so "张三的
|
|
345
|
+
// 手机号" returns "张三" not "三".
|
|
346
|
+
|
|
347
|
+
const PERSON_FRAMING_STOP_PATTERNS = [
|
|
348
|
+
/(联系人|通讯录|电话簿|通信录|好友列表|朋友列表)/g,
|
|
349
|
+
/(手机号|电话号|号码是|的电话|的手机|号码|电话)/g,
|
|
350
|
+
/(谁是|是谁|是什么人|是哪位)/g,
|
|
351
|
+
/\b(contact|contacts|phonebook|address\s*book|phone\s*number)\b/gi,
|
|
352
|
+
];
|
|
353
|
+
|
|
354
|
+
// Whitelisted single-character Chinese relation words. Single-char tokens
|
|
355
|
+
// outside this set are dropped to keep verb / particle false-positives from
|
|
356
|
+
// leaking through. Extend cautiously — every new char widens the LIKE
|
|
357
|
+
// surface area and could match unrelated rows.
|
|
358
|
+
const PERSON_RELATION_SINGLE_CHARS_RE =
|
|
359
|
+
/^[妈爸姐妹哥弟爹娘爷奶姥舅姑叔伯婶嫂嫁公婆]$/;
|
|
360
|
+
|
|
361
|
+
function extractPersonNameCandidate(text) {
|
|
362
|
+
if (typeof text !== "string" || text.length === 0) return null;
|
|
363
|
+
let s = text;
|
|
364
|
+
for (const re of PERSON_FRAMING_STOP_PATTERNS) {
|
|
365
|
+
s = s.replace(re, " ");
|
|
366
|
+
}
|
|
367
|
+
for (const re of ENTITY_STOP_PATTERNS) {
|
|
368
|
+
s = s.replace(re, " ");
|
|
369
|
+
}
|
|
370
|
+
const all = s.split(/\s+/).filter((t) => t.length >= 1 && t.length <= 10);
|
|
371
|
+
if (all.length === 0) return null;
|
|
372
|
+
const multi = all
|
|
373
|
+
.filter((t) => t.length >= 2)
|
|
374
|
+
.sort((a, b) => b.length - a.length);
|
|
375
|
+
if (multi.length > 0) return multi[0];
|
|
376
|
+
const single = all.find((t) => t.length === 1 && PERSON_RELATION_SINGLE_CHARS_RE.test(t));
|
|
377
|
+
return single || null;
|
|
378
|
+
}
|
|
379
|
+
|
|
294
380
|
// ─── Full parser ─────────────────────────────────────────────────────────
|
|
295
381
|
|
|
296
382
|
/**
|
|
@@ -314,6 +400,7 @@ function parseQuery(question, opts = {}) {
|
|
|
314
400
|
timeWindow: parseTimeWindow(raw, now),
|
|
315
401
|
filters: parseFilters(raw),
|
|
316
402
|
intent: parseIntent(raw),
|
|
403
|
+
entityFocus: parseEntityFocus(raw),
|
|
317
404
|
};
|
|
318
405
|
}
|
|
319
406
|
|
|
@@ -322,9 +409,15 @@ module.exports = {
|
|
|
322
409
|
parseTimeWindow,
|
|
323
410
|
parseFilters,
|
|
324
411
|
parseIntent,
|
|
412
|
+
parseEntityFocus,
|
|
325
413
|
extractEntityTerm,
|
|
414
|
+
extractPersonNameCandidate,
|
|
326
415
|
// exposed for tests
|
|
327
416
|
SUBTYPE_KEYWORDS,
|
|
328
417
|
ADAPTER_KEYWORDS,
|
|
418
|
+
PERSON_FOCUS_PATTERNS,
|
|
419
|
+
ITEM_FOCUS_PATTERNS,
|
|
329
420
|
ENTITY_STOP_PATTERNS,
|
|
421
|
+
PERSON_FRAMING_STOP_PATTERNS,
|
|
422
|
+
PERSON_RELATION_SINGLE_CHARS_RE,
|
|
330
423
|
};
|
package/lib/vault.js
CHANGED
|
@@ -865,6 +865,70 @@ class LocalVault {
|
|
|
865
865
|
.map((row) => this._rowToPerson(row));
|
|
866
866
|
}
|
|
867
867
|
|
|
868
|
+
/**
|
|
869
|
+
* searchPersons — LIKE-based name/identifier/notes search.
|
|
870
|
+
*
|
|
871
|
+
* 2026-05-27 — AnalysisEngine entityFocus="persons" path uses this when the
|
|
872
|
+
* question carries a probable person-name candidate ("妈手机号", "张三的电话").
|
|
873
|
+
* Pre-fix the engine dumped the first N contacts by ingested_at and let the
|
|
874
|
+
* LLM scan — but on small-model (Qwen 0.5B/1.5B, 20-fact budget) and large
|
|
875
|
+
* contact tables (100+), the target person rarely landed in the slice.
|
|
876
|
+
* Searching by LIKE %term% against the JSON-serialized `names` column +
|
|
877
|
+
* `identifiers` (phone numbers) + `notes` + `relation` gives the LLM the
|
|
878
|
+
* matching contact directly, eliminating that miss.
|
|
879
|
+
*
|
|
880
|
+
* No FTS5 schema migration: contact tables are small (typically <2000
|
|
881
|
+
* rows on Android), full LIKE scan stays sub-millisecond. Sticking with
|
|
882
|
+
* LIKE also avoids partial-index drift trap #25.
|
|
883
|
+
*
|
|
884
|
+
* @param {object} q
|
|
885
|
+
* @param {string} q.q term to match. Falls back to queryPersons when empty.
|
|
886
|
+
* @param {string} [q.subtype]
|
|
887
|
+
* @param {string} [q.adapter]
|
|
888
|
+
* @param {number} [q.limit=100]
|
|
889
|
+
* @param {number} [q.offset=0]
|
|
890
|
+
*/
|
|
891
|
+
searchPersons(q = {}) {
|
|
892
|
+
const term = typeof q.q === "string" ? q.q.trim() : "";
|
|
893
|
+
if (term.length === 0) {
|
|
894
|
+
return this.queryPersons(q);
|
|
895
|
+
}
|
|
896
|
+
const where = [];
|
|
897
|
+
const params = {};
|
|
898
|
+
// LIKE-escape % and _ in the user input so a name with literal % won't
|
|
899
|
+
// wildcard. SQLite LIKE ESCAPE clause handles this.
|
|
900
|
+
const escaped = term.replace(/([\\%_])/g, "\\$1");
|
|
901
|
+
params.qPat = "%" + escaped + "%";
|
|
902
|
+
where.push(
|
|
903
|
+
"(" +
|
|
904
|
+
"names LIKE @qPat ESCAPE '\\' OR " +
|
|
905
|
+
"identifiers LIKE @qPat ESCAPE '\\' OR " +
|
|
906
|
+
"notes LIKE @qPat ESCAPE '\\' OR " +
|
|
907
|
+
"relation LIKE @qPat ESCAPE '\\'" +
|
|
908
|
+
")"
|
|
909
|
+
);
|
|
910
|
+
if (q.subtype) {
|
|
911
|
+
where.push("subtype = @subtype");
|
|
912
|
+
params.subtype = q.subtype;
|
|
913
|
+
}
|
|
914
|
+
if (q.adapter) {
|
|
915
|
+
where.push("source_adapter = @adapter");
|
|
916
|
+
params.adapter = q.adapter;
|
|
917
|
+
}
|
|
918
|
+
const limit = Number.isInteger(q.limit) && q.limit > 0 ? Math.min(q.limit, 10000) : 100;
|
|
919
|
+
const offset = Number.isInteger(q.offset) && q.offset >= 0 ? q.offset : 0;
|
|
920
|
+
params.limit = limit;
|
|
921
|
+
params.offset = offset;
|
|
922
|
+
const sql =
|
|
923
|
+
"SELECT * FROM persons WHERE " + where.join(" AND ") +
|
|
924
|
+
" ORDER BY (confidence IS NULL) ASC, confidence DESC, ingested_at DESC" +
|
|
925
|
+
" LIMIT @limit OFFSET @offset";
|
|
926
|
+
return this._requireOpen()
|
|
927
|
+
.prepare(sql)
|
|
928
|
+
.all(params)
|
|
929
|
+
.map((row) => this._rowToPerson(row));
|
|
930
|
+
}
|
|
931
|
+
|
|
868
932
|
/**
|
|
869
933
|
* queryItems — list item entities (installed apps, purchases, media...).
|
|
870
934
|
* Pairs with queryPersons for AnalysisEngine fact gathering.
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@chainlesschain/personal-data-hub",
|
|
3
|
-
"version": "0.3.6",
|
|
3
|
+
"version": "0.3.8",
|
|
4
4
|
"description": "Personal Data Hub — UnifiedSchema + validators + KG ingest helpers for the data-back-to-the-individual middleware",
|
|
5
5
|
"type": "commonjs",
|
|
6
6
|
"main": "lib/index.js",
|
|
@@ -61,6 +61,10 @@
|
|
|
61
61
|
"./adapters/social-douyin-adb": "./lib/adapters/social-douyin-adb/index.js",
|
|
62
62
|
"./adapters/social-xiaohongshu": "./lib/adapters/social-xiaohongshu/index.js",
|
|
63
63
|
"./adapters/social-xiaohongshu-adb": "./lib/adapters/social-xiaohongshu-adb/index.js",
|
|
64
|
+
"./adapters/social-toutiao": "./lib/adapters/social-toutiao/index.js",
|
|
65
|
+
"./adapters/social-toutiao-adb": "./lib/adapters/social-toutiao-adb/index.js",
|
|
66
|
+
"./adapters/social-kuaishou": "./lib/adapters/social-kuaishou/index.js",
|
|
67
|
+
"./adapters/social-kuaishou-adb": "./lib/adapters/social-kuaishou-adb/index.js",
|
|
64
68
|
"./adapters/messaging-qq": "./lib/adapters/messaging-qq/index.js",
|
|
65
69
|
"./adapters/messaging-telegram": "./lib/adapters/messaging-telegram/index.js",
|
|
66
70
|
"./adapters/messaging-whatsapp": "./lib/adapters/messaging-whatsapp/index.js",
|