@chainlesschain/personal-data-hub 0.4.28 → 0.4.29

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,202 @@
1
+ /**
2
+ * Toutiao on-device article reader — recovers the user's feed/read articles
3
+ * from the app's local `news_article.db` (table `article`), a plaintext SQLite
4
+ * DB. No signing/encryption needed.
5
+ *
6
+ * Why this exists (real-device 2026-06-18, user's exported plaintext DB):
7
+ * - `article` rows are the local feed cache (48 rows on the test export). The
8
+ * title is NOT a column — it lives in the `share_info` JSON blob
9
+ * ({title, share_url, ...}); `ext_json` is a heavier fallback. `behot_time`
10
+ * is when the item surfaced; `read_timestamp>0` ⇒ actually opened;
11
+ * `is_user_digg`/`is_user_repin` ⇒ engagement.
12
+ * - Modest signal (feed-shown ≈ weak interest; digg/read ≈ strong), but
13
+ * titled + plaintext, so it's a usable "articles I browsed" stream.
14
+ *
15
+ * Emits BROWSE events under source.adapter `social-toutiao` (the canonical
16
+ * adapter name, so byApp aggregation attributes correctly). Stable originalId
17
+ * (`social-toutiao:article:<group_id>`) → re-ingest UPDATES, not duplicates.
18
+ *
19
+ * Authorization: only on your own device/account.
20
+ */
21
+ "use strict";
22
+
23
+ const { newId } = require("../../ids");
24
+ const {
25
+ _internals: { loadDatabaseClass },
26
+ } = require("../social-bilibili-adb/chromium-cookies-reader");
27
+
28
+ const ARTICLE_TABLE = "article";
29
+ const READER_VERSION = "toutiao-article-0.1";
30
+ const TITLE_SUFFIX = /\s*-\s*今日头条\s*$/;
31
+
32
/**
 * Normalise an epoch value that may be expressed in seconds or in
 * milliseconds to integer milliseconds.
 *
 * Heuristic: a value above 1e12 is taken to already be milliseconds; any
 * other positive value is treated as seconds and scaled by 1000.
 *
 * @param {*} v raw value; coerced with Number()
 * @returns {number|null} floored epoch milliseconds, or null when the
 *   coerced value is not a finite number greater than zero
 */
function toEpochMs(v) {
  const value = Number(v);
  const usable = Number.isFinite(value) && value > 0;
  if (!usable) {
    return null;
  }
  const millis = value > 1e12 ? value : value * 1000;
  return Math.floor(millis);
}
38
+
39
/**
 * Parse JSON text without ever throwing.
 *
 * @param {*} s candidate JSON text
 * @returns {*} the parsed value, or null when `s` is not a string, is
 *   shorter than two characters, or is not valid JSON
 */
function safeParse(s) {
  const isCandidate = typeof s === "string" && s.length >= 2;
  if (!isCandidate) {
    return null;
  }
  let parsed = null;
  try {
    parsed = JSON.parse(s);
  } catch (_err) {
    // Invalid JSON is an expected condition here; report it as null.
    parsed = null;
  }
  return parsed;
}
47
+
48
/**
 * Pull a display title out of an article row.
 *
 * Candidates are tried in order: `share_info.title`, `share_info.share_title`,
 * then (only if none of those is usable) `ext_json.title`,
 * `ext_json.share_title` and `ext_json.article.title`. A candidate is usable
 * when it is a string that is still non-empty after the brand suffix
 * (TITLE_SUFFIX) is removed and the result is trimmed.
 *
 * Unlike a plain `a || b` chain, a blank, whitespace-only, suffix-only or
 * non-string candidate does not stop the search, so a junk `share_info.title`
 * no longer hides a good `ext_json` title.
 *
 * @param {object} row article row; reads `share_info` and `ext_json`
 * @returns {string|null} cleaned title, or null when no usable title exists
 */
function extractTitle(row) {
  // Strip the brand suffix and whitespace; null when nothing usable remains.
  const clean = (candidate) => {
    if (typeof candidate !== "string") return null;
    const stripped = candidate.replace(TITLE_SUFFIX, "").trim();
    return stripped || null;
  };
  const firstUsable = (candidates) => {
    for (const candidate of candidates) {
      const cleaned = clean(candidate);
      if (cleaned) return cleaned;
    }
    return null;
  };

  const shareInfo = safeParse(row.share_info);
  if (shareInfo) {
    const fromShareInfo = firstUsable([shareInfo.title, shareInfo.share_title]);
    if (fromShareInfo) return fromShareInfo;
  }

  // ext_json is only parsed when share_info produced nothing usable.
  const ext = safeParse(row.ext_json);
  if (!ext) return null;
  return firstUsable([ext.title, ext.share_title, ext.article && ext.article.title]);
}
59
+
60
/**
 * Resolve the article's share URL with its query string removed.
 *
 * Prefers `share_url` inside the parsed `share_info` JSON and falls back to
 * the row's own `share_url` column.
 *
 * @param {object} row article row; reads `share_info` and `share_url`
 * @returns {string|null} URL up to (not including) the first "?", or null
 *   when no non-empty string URL is available
 */
function extractUrl(row) {
  const shareInfo = safeParse(row.share_info);
  let candidate = null;
  if (shareInfo && shareInfo.share_url) {
    candidate = shareInfo.share_url;
  } else if (row.share_url) {
    candidate = row.share_url;
  }
  if (typeof candidate !== "string" || candidate === "") {
    return null;
  }
  // Drop the share/tracking query so the same article yields the same url.
  const queryStart = candidate.indexOf("?");
  return queryStart === -1 ? candidate : candidate.slice(0, queryStart);
}
67
+
68
/**
 * Read the `category_new` query parameter from the row's `share_url`.
 *
 * @param {object} row article row; reads `share_url`
 * @returns {string|null} the percent-decoded category, the raw (undecoded)
 *   value when it is not valid percent-encoding, or null when `share_url` is
 *   not a string or carries no `category_new` parameter
 */
function extractCategory(row) {
  const shareUrl = row && typeof row.share_url === "string" ? row.share_url : "";
  const match = /[?&]category_new=([^&]+)/.exec(shareUrl);
  if (!match) return null;
  try {
    return decodeURIComponent(match[1]);
  } catch (_err) {
    // decodeURIComponent throws URIError on a malformed escape (e.g. a
    // truncated "%E0%A4%A"). One bad row must not abort the whole read, so
    // fall back to the raw value.
    return match[1];
  }
}
73
+
74
/**
 * Open the SQLite file at `dbPath` read-only and turn the rows of the
 * article table into structured records.
 *
 * Rows are ordered by `behot_time` descending when that column exists. Rows
 * without a group/item id or without an extractable title are skipped.
 * Optional columns (`behot_time`, `read_timestamp`, `is_user_digg`,
 * `is_user_repin`) are probed via PRAGMA table_info and default to
 * null / false when absent. The database handle is always closed.
 *
 * @param {string} dbPath path handed to the Database constructor
 * @param {object} [opts]
 * @param {Function} [opts._databaseClass] Database class to use instead of
 *   the one returned by loadDatabaseClass()
 * @param {number} [opts.limit] positive integer row cap; otherwise 5000
 * @returns {{articles: Array<{groupId,title,url,category,behotTime,readTimestamp,digg,repin}>}}
 */
function readToutiaoArticles(dbPath, opts = {}) {
  const Database = opts._databaseClass || loadDatabaseClass();
  const hasCustomLimit = Number.isInteger(opts.limit) && opts.limit > 0;
  const limit = hasCustomLimit ? opts.limit : 5000;
  const db = new Database(dbPath, { readonly: true });
  try {
    const tableRow = db
      .prepare("SELECT name FROM sqlite_master WHERE type='table' AND name=?")
      .get(ARTICLE_TABLE);
    if (!tableRow) {
      return { articles: [] };
    }

    const columnNames = new Set();
    for (const column of db.prepare(`PRAGMA table_info("${ARTICLE_TABLE}")`).all()) {
      columnNames.add(column.name);
    }
    // Column presence is the same for every row, so probe it once.
    const hasBehot = columnNames.has("behot_time");
    const hasReadTs = columnNames.has("read_timestamp");
    const hasDigg = columnNames.has("is_user_digg");
    const hasRepin = columnNames.has("is_user_repin");

    const orderClause = hasBehot ? " ORDER BY behot_time DESC" : "";
    const sql = `SELECT * FROM "${ARTICLE_TABLE}"${orderClause} LIMIT ${limit}`;
    const rows = db.prepare(sql).all();

    const articles = [];
    for (const row of rows) {
      let groupId = null;
      if (row.group_id != null) {
        groupId = String(row.group_id);
      } else if (row.item_id != null) {
        groupId = String(row.item_id);
      }
      if (!groupId) continue;

      const title = extractTitle(row);
      if (!title) continue; // untitled cache rows carry no signal

      const url = extractUrl(row);
      const category = extractCategory(row);
      const behotTime = hasBehot ? toEpochMs(row.behot_time) : null;
      const readTimestamp = hasReadTs ? toEpochMs(row.read_timestamp) : null;
      const digg = hasDigg ? Boolean(row.is_user_digg) : false;
      const repin = hasRepin ? Boolean(row.is_user_repin) : false;
      articles.push({ groupId, title, url, category, behotTime, readTimestamp, digg, repin });
    }
    return { articles };
  } finally {
    try {
      db.close();
    } catch (_err) {
      /* best-effort */
    }
  }
}
127
+
128
/**
 * Convert article records into BROWSE event objects tagged with the
 * `social-toutiao` adapter.
 *
 * Records lacking `groupId` or `title` are skipped. `occurredAt` is the
 * first finite, non-zero value of `readTimestamp`, then `behotTime`, and
 * otherwise `now`.
 *
 * @param {Iterable<object>} articles records as produced by readToutiaoArticles
 * @param {object} [opts]
 * @param {number} [opts.now] clock override in epoch ms; Date.now() when not finite
 * @returns {{events: object[]}}
 */
function buildArticleEvents(articles, opts = {}) {
  const now = Number.isFinite(opts.now) ? opts.now : Date.now();
  const events = [];
  for (const article of articles || []) {
    if (!article || !article.groupId || !article.title) continue;

    const { readTimestamp, behotTime } = article;
    let occurredAt = now;
    if (Number.isFinite(readTimestamp) && readTimestamp !== 0) {
      occurredAt = readTimestamp;
    } else if (Number.isFinite(behotTime) && behotTime !== 0) {
      occurredAt = behotTime;
    }

    const source = {
      adapter: "social-toutiao",
      adapterVersion: READER_VERSION,
      originalId: `social-toutiao:article:${article.groupId}`,
      capturedAt: occurredAt,
      capturedBy: "sqlite",
    };
    const extra = {
      platform: "toutiao",
      kind: "article",
      groupId: article.groupId,
      url: article.url || null,
      category: article.category || null,
      digg: article.digg,
      repin: article.repin,
      read: Number.isFinite(readTimestamp) && readTimestamp > 0,
    };
    events.push({
      id: newId(),
      type: "event",
      subtype: "browse",
      occurredAt,
      actor: "person-self",
      content: { title: article.title, text: article.title },
      ingestedAt: now,
      source,
      extra,
    });
  }
  return { events };
}
170
+
171
/**
 * Read the article database at `dbPath`, build BROWSE events from it and
 * hand them to `vault.putBatch` in a single call (skipped when there are no
 * events).
 *
 * @param {object} vault object exposing a `putBatch` function
 * @param {string} dbPath non-empty path to the article database
 * @param {object} [opts] forwarded to readToutiaoArticles and buildArticleEvents
 * @returns {{ingested: number, articles: number, digg: number, read: number}}
 *   `ingested` is the `events` field of the putBatch result (0 when falsy);
 *   the other fields are counts over the parsed articles
 * @throws {TypeError} when `vault` lacks putBatch or `dbPath` is not a
 *   non-empty string
 */
function articlesToVault(vault, dbPath, opts = {}) {
  const hasPutBatch = Boolean(vault) && typeof vault.putBatch === "function";
  if (!hasPutBatch) {
    throw new TypeError("articlesToVault: vault with putBatch required");
  }
  const hasPath = typeof dbPath === "string" && dbPath !== "";
  if (!hasPath) {
    throw new TypeError("articlesToVault: dbPath required");
  }

  const { articles } = readToutiaoArticles(dbPath, opts);
  const { events } = buildArticleEvents(articles, opts);

  let result = { events: 0 };
  if (events.length) {
    result = vault.putBatch({ events });
  }

  let diggCount = 0;
  let readCount = 0;
  for (const article of articles) {
    if (article.digg) diggCount += 1;
    if (Number.isFinite(article.readTimestamp) && article.readTimestamp > 0) readCount += 1;
  }

  return {
    ingested: result.events || 0,
    articles: articles.length,
    digg: diggCount,
    read: readCount,
  };
}
195
+
196
+ module.exports = {
197
+ ARTICLE_TABLE,
198
+ readToutiaoArticles,
199
+ buildArticleEvents,
200
+ articlesToVault,
201
+ _internals: { toEpochMs, extractTitle, extractUrl, extractCategory },
202
+ };
@@ -39,6 +39,18 @@ class OverviewSkill extends AnalysisSkill {
39
39
  if (until != null) q.until = until;
40
40
  const events = this.vault.queryEvents(q) || [];
41
41
 
42
+ // Accurate, uncapped app/type/total counts via SQL GROUP BY. queryEvents
43
+ // hard-caps at 10k rows, so deriving byApp/byType/total from `events`
44
+ // silently undercounts any app whose data is older than the recent-10k
45
+ // window — e.g. on a vault where one chat app dominates recent events,
46
+ // social-douyin showed 10 instead of its true 232. facetCounts honors the
47
+ // same since/until. (Row-derived spend/contacts/monthly stay sample-based —
48
+ // they need actual rows.)
49
+ const facets =
50
+ typeof this.vault.facetCounts === "function"
51
+ ? this.vault.facetCounts({ since, until })
52
+ : null;
53
+
42
54
  const byApp = new Map();
43
55
  const byType = new Map();
44
56
  const byMonth = new Map();
@@ -81,7 +93,11 @@ class OverviewSkill extends AnalysisSkill {
81
93
  if (citations.length < 50) citations.push(e.id);
82
94
  }
83
95
 
84
- const byAppArr = [...byApp.entries()].map(([app, count]) => ({ app, count })).sort((a, b) => b.count - a.count);
96
+ const byAppArr = (
97
+ facets
98
+ ? Object.entries(facets.byAdapter).map(([app, count]) => ({ app, count }))
99
+ : [...byApp.entries()].map(([app, count]) => ({ app, count }))
100
+ ).sort((a, b) => b.count - a.count);
85
101
  const topContacts = [...contacts.entries()]
86
102
  .map(([personId, v]) => ({
87
103
  personId,
@@ -93,8 +109,8 @@ class OverviewSkill extends AnalysisSkill {
93
109
  .slice(0, topN);
94
110
 
95
111
  const summary = {
96
- totalEvents: events.length,
97
- appsActive: byApp.size,
112
+ totalEvents: facets ? facets.total : events.length,
113
+ appsActive: facets ? Object.keys(facets.byAdapter).length : byApp.size,
98
114
  period: { since: since || null, until: until || null },
99
115
  topAppName: byAppArr.length ? byAppArr[0].app : null,
100
116
  };
@@ -103,7 +119,11 @@ class OverviewSkill extends AnalysisSkill {
103
119
  skill: "analysis.overview",
104
120
  summary,
105
121
  byApp: byAppArr,
106
- byType: [...byType.entries()].map(([type, count]) => ({ type, count })).sort((a, b) => b.count - a.count),
122
+ byType: (
123
+ facets
124
+ ? Object.entries(facets.bySubtype).map(([type, count]) => ({ type, count }))
125
+ : [...byType.entries()].map(([type, count]) => ({ type, count }))
126
+ ).sort((a, b) => b.count - a.count),
107
127
  monthlyActivity: [...byMonth.entries()].map(([monthKey, count]) => ({ monthKey, count })).sort((a, b) => a.monthKey.localeCompare(b.monthKey)),
108
128
  topContacts,
109
129
  spending: {
@@ -30,6 +30,13 @@ const { AnalysisSkill } = require("./base");
30
30
 
31
31
  const SUPPORTED_DIMENSIONS = new Set(["merchant", "category", "counterparty", "month"]);
32
32
 
33
+ // Event subtypes that carry content.amount (shared by the row fetch + the
34
+ // accurate SQL-sum path). Phase 7 shopping adapters emit "order".
35
+ const PAYMENT_SUBTYPES = [
36
+ "payment", "transfer", "refund", "utility",
37
+ "redenvelope", "investment", "income", "order",
38
+ ];
39
+
33
40
  class SpendingSkill extends AnalysisSkill {
34
41
  constructor(opts) {
35
42
  super({ ...opts, name: "analysis.spending" });
@@ -48,6 +55,19 @@ class SpendingSkill extends AnalysisSkill {
48
55
  const filtered = this._applyFilters(events, options);
49
56
 
50
57
  const summary = this._summarize(filtered, since, until);
58
+ // The row fetch caps at 5000 events PER subtype — a heavy alipay/wechat-pay
59
+ // user with >5000 payments would have their TOTAL silently undercounted.
60
+ // When no row-only filter is active (merchant text / personId / direction),
61
+ // recompute the headline totals from the uncapped SQL SUM. Breakdown / trend
62
+ // / citations stay row-sampled (they need actual rows).
63
+ const accurate = this._accurateTotals({ since, until }, options);
64
+ if (accurate) {
65
+ summary.totalSpend = accurate.totalSpend;
66
+ summary.totalIncome = accurate.totalIncome;
67
+ summary.netFlow = accurate.netFlow;
68
+ summary.eventCount = accurate.eventCount;
69
+ if (accurate.currency) summary.currency = accurate.currency;
70
+ }
51
71
  const breakdown = this._breakdown(filtered, dimension, topN);
52
72
  const trend = this._monthlyTrend(filtered);
53
73
  const citations = filtered.slice(0, 50).map((e) => e.id);
@@ -72,8 +92,7 @@ class SpendingSkill extends AnalysisSkill {
72
92
  // Phase 7 shopping adapters emit subtype="order" — must include so
73
93
  // spending aggregates cover Taobao/JD/Meituan along with Alipay
74
94
  // (payment/transfer) + Email (refund) etc.
75
- const subtypes = ["payment", "transfer", "refund", "utility", "redenvelope", "investment", "income", "order"];
76
- for (const subtype of subtypes) {
95
+ for (const subtype of PAYMENT_SUBTYPES) {
77
96
  const q = { subtype, limit: 5000 };
78
97
  if (since != null) q.since = since;
79
98
  if (until != null) q.until = until;
@@ -88,6 +107,48 @@ class SpendingSkill extends AnalysisSkill {
88
107
  return events;
89
108
  }
90
109
 
110
+ /**
111
+ * Accurate (uncapped) headline totals via vault.sumEventAmount — used only
112
+ * when the query has no filter SQL can't express. merchantFilter (text match
113
+ * on title/counterparty) and personId (participant expansion) need rows, and
114
+ * a direction filter changes which total/count is meaningful, so any of them
115
+ * → return null and fall back to the row-sampled summary. Returns null when
116
+ * the vault lacks sumEventAmount (older vault → original behavior).
117
+ */
118
+ _accurateTotals({ since, until }, options) {
119
+ if (
120
+ (typeof options.merchantFilter === "string" && options.merchantFilter.length > 0) ||
121
+ (typeof options.personId === "string" && options.personId.length > 0) ||
122
+ options.direction === "out" ||
123
+ options.direction === "in"
124
+ ) {
125
+ return null;
126
+ }
127
+ if (typeof this.vault.sumEventAmount !== "function") return null;
128
+ let totalSpend = 0;
129
+ let totalIncome = 0;
130
+ let eventCount = 0;
131
+ let currency = null;
132
+ for (const subtype of PAYMENT_SUBTYPES) {
133
+ const q = { subtype };
134
+ if (since != null) q.since = since;
135
+ if (until != null) q.until = until;
136
+ const r = this.vault.sumEventAmount(q);
137
+ if (!r) continue;
138
+ totalSpend += (r.byDirection && r.byDirection.out) || 0;
139
+ totalIncome += (r.byDirection && r.byDirection.in) || 0;
140
+ eventCount += r.count || 0;
141
+ if (!currency && r.count > 0 && r.currency) currency = r.currency;
142
+ }
143
+ return {
144
+ totalSpend: Math.round(totalSpend * 100) / 100,
145
+ totalIncome: Math.round(totalIncome * 100) / 100,
146
+ netFlow: Math.round((totalIncome - totalSpend) * 100) / 100,
147
+ eventCount,
148
+ currency,
149
+ };
150
+ }
151
+
91
152
  _applyFilters(events, options) {
92
153
  let out = events;
93
154
  if (typeof options.merchantFilter === "string" && options.merchantFilter.length > 0) {
@@ -63,12 +63,17 @@ class TimelineSkill extends AnalysisSkill {
63
63
  }
64
64
 
65
65
  _fetchEvents({ since, until }, limit) {
66
- // Exclude inventory-snapshot events (installed-app roster + contact
67
- // roster from system-data-android). They carry a synthetic
68
- // collection-time occurredAt — tens of thousands of them cluster at one
69
- // recent timestamp and would otherwise crowd out real activity from this
70
- // chronological narrative. They remain in the vault for facet counts.
71
- const q = { limit, excludeExtraKinds: ["app-snapshot", "contact-snapshot"] };
66
+ // Exclude inventory-snapshot + aggregate-baseline events. The snapshots
67
+ // (installed-app / contact roster from system-data-android) carry a
68
+ // synthetic collection-time occurredAt — tens of thousands cluster at one
69
+ // recent timestamp and would crowd out real activity. `app-usage-profile`
70
+ // is a single rolling aggregate (e.g. douyin "24天/108h" baseline), not a
71
+ // discrete activity, so it doesn't belong in a chronological narrative.
72
+ // All remain in the vault for facet counts / overview.
73
+ const q = {
74
+ limit,
75
+ excludeExtraKinds: ["app-snapshot", "contact-snapshot", "app-usage-profile"],
76
+ };
72
77
  if (since != null) q.since = since;
73
78
  if (until != null) q.until = until;
74
79
  const events = this.vault.queryEvents(q) || [];
@@ -127,8 +127,16 @@ function parseTimeWindow(text, now = Date.now()) {
127
127
  if (m) {
128
128
  const n = parseInt(m[1], 10);
129
129
  if (Number.isFinite(n) && n > 0) {
130
+ // Safe month subtraction. Naive `setMonth(getMonth()-n)` overflows on a
131
+ // month-end day into a shorter month (e.g. Mar 31 −1mo → "Feb 31" → Mar 3),
132
+ // which silently DROPS the whole previous month from the window. Pin to
133
+ // day 1 first, then clamp the day to the target month's length.
130
134
  const target = new Date(now);
135
+ const day = target.getDate();
136
+ target.setDate(1);
131
137
  target.setMonth(target.getMonth() - n);
138
+ const lastDay = new Date(target.getFullYear(), target.getMonth() + 1, 0).getDate();
139
+ target.setDate(Math.min(day, lastDay));
132
140
  return { since: target.getTime(), until: now };
133
141
  }
134
142
  }
@@ -152,7 +160,10 @@ const SUBTYPE_KEYWORDS = [
152
160
  { subtype: "order", patterns: [/(订单|下单|买了|购买|下了几单|下了多少单|order)/i] },
153
161
  { subtype: "payment", patterns: [/(支付|付款|花了|花费|消费|开销|payment|spent|spend)/i] },
154
162
  { subtype: "transfer", patterns: [/(转账|转给|转钱|transfer)/i] },
155
- { subtype: "income", patterns: [/(收入|工资|进账|收到|income)/i] },
163
+ // NOTE: bare 收到 ("receive") is deliberately NOT here — you 收到 messages /
164
+ // packages / 红包 too, so it stole "收到多少消息" → income (income is checked
165
+ // before message). 收到转账 still classifies as transfer (checked earlier).
166
+ { subtype: "income", patterns: [/(收入|工资|进账|入账|income)/i] },
156
167
  { subtype: "message", patterns: [/(聊天|消息|聊了|对话|message|chat)/i] },
157
168
  { subtype: "post", patterns: [/(朋友圈|发了|动态|moment|post)/i] },
158
169
  { subtype: "visit", patterns: [/(去过|到过|visited|去了|来到)/i] },
@@ -201,20 +212,39 @@ function parseFilters(text) {
201
212
 
202
213
  // ─── Intent detection (sum / count / list / latest / ...) ────────────────
203
214
 
215
// Amount/money words — BOTH the spend side (花/消费/开销/spent/spend/金额) and
// the income side (收入/income/进账/到账/入账/赚/挣). A question carrying one of
// these plus a "多少/how much" wants a SUM, not a row list. Case-insensitive
// so "Spent"/"SPEND" at the start of a sentence still match; bare 花 already
// covers 花了/花费.
const AMOUNT_HINT =
  /(花|消费|开销|spent|spend|金额|多少钱|amount|收入|income|进账|到账|入账|赚|挣)/i;
// Count quantifier: "多少X" or "几X" for a measure word. 钱 is deliberately
// EXCLUDED so "多少钱" routes to sum-amount, not count.
const COUNT_QUANTIFIER =
  /(多少|几)(次|条|单|个|家|人|张|部|篇|集|本|件|笔|顿|杯)|how\s+many|count\s+of/i;
const HOW_MUCH = /(多少钱|多少|how\s+much)/i;

/**
 * Classify what kind of answer a question is asking for.
 *
 * Rules, first match wins:
 *  1. an explicit total word (总共/共多少/加起来/合计, or the whole English
 *     words "sum"/"total") → "sum-amount" if an amount word is present,
 *     otherwise "count";
 *  2. a count quantifier (多少/几 + measure word, "how many", "count of")
 *     → "count" — checked before rule 3 so "消费了多少次" is a count;
 *  3. an amount word together with 多少/"how much" → "sum-amount";
 *  4. 最近/最新/latest/recent → "latest";
 *  5. anything else, including non-string input → "list".
 *
 * English "sum"/"total" are matched on word boundaries and without regard
 * to case, so "summarize my chats" is no longer mistaken for a total while
 * "Total spent on taobao" is recognised.
 *
 * @param {*} text the user's question
 * @returns {"sum-amount"|"count"|"latest"|"list"}
 */
function parseIntent(text) {
  if (typeof text !== "string") return "list";
  if (/(总共|共多少|加起来|合计|\bsum\b|\btotal\b)/i.test(text)) {
    // Distinguish amount vs count by presence of amount words (incl. income,
    // so "总共收入多少" is sum-amount, not count).
    return AMOUNT_HINT.test(text) ? "sum-amount" : "count";
  }
  if (COUNT_QUANTIFIER.test(text)) {
    return "count";
  }
  // Spend/income question without an explicit 总共/合计 — "花了多少钱" /
  // "这个月收入多少" / "how much did I spend".
  if (AMOUNT_HINT.test(text) && HOW_MUCH.test(text)) {
    return "sum-amount";
  }
  if (/(最近|最新|latest|recent)/i.test(text)) return "latest";
  return "list";
}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@chainlesschain/personal-data-hub",
3
- "version": "0.4.28",
3
+ "version": "0.4.29",
4
4
  "description": "Personal Data Hub — UnifiedSchema + validators + KG ingest helpers for the data-back-to-the-individual middleware",
5
5
  "type": "commonjs",
6
6
  "main": "lib/index.js",