@chainlesschain/personal-data-hub 0.2.4 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/__tests__/adapters/browser-history-chrome.test.js +377 -0
- package/__tests__/adapters/browser-history-edge.test.js +159 -0
- package/__tests__/adapters/git-activity.test.js +216 -0
- package/__tests__/adapters/local-files.test.js +264 -0
- package/__tests__/adapters/shell-history.test.js +180 -0
- package/__tests__/adapters/system-data-android.test.js +104 -3
- package/__tests__/adapters/vscode.test.js +299 -0
- package/__tests__/adapters/win-recent.test.js +192 -0
- package/__tests__/analysis.test.js +840 -1
- package/__tests__/categories.test.js +92 -0
- package/__tests__/e2e/local-data-adapters-cli.e2e.test.js +146 -0
- package/__tests__/entity-resolver-vault.test.js +5 -2
- package/__tests__/integration/local-data-adapters-pipeline.test.js +373 -0
- package/__tests__/query-parser.test.js +66 -0
- package/__tests__/registry.test.js +114 -0
- package/__tests__/sidecar-contacts-cross-validate.test.js +24 -1
- package/__tests__/sidecar-supervisor.test.js +9 -1
- package/__tests__/social-kuaishou-snapshot.test.js +55 -2
- package/__tests__/social-toutiao-snapshot.test.js +54 -2
- package/__tests__/travel-adapters.test.js +97 -5
- package/__tests__/vault-search-helpers.test.js +104 -0
- package/__tests__/vault-search.test.js +423 -0
- package/__tests__/vault.test.js +77 -3
- package/lib/adapters/browser-history-chrome/adapter.js +247 -0
- package/lib/adapters/browser-history-chrome/bookmarks-reader.js +79 -0
- package/lib/adapters/browser-history-chrome/chrome-db-reader.js +223 -0
- package/lib/adapters/browser-history-chrome/index.js +23 -0
- package/lib/adapters/browser-history-edge/adapter.js +34 -0
- package/lib/adapters/browser-history-edge/index.js +13 -0
- package/lib/adapters/git-activity/adapter.js +155 -0
- package/lib/adapters/git-activity/git-reader.js +125 -0
- package/lib/adapters/git-activity/index.js +17 -0
- package/lib/adapters/local-files/adapter.js +149 -0
- package/lib/adapters/local-files/file-walker.js +125 -0
- package/lib/adapters/local-files/index.js +18 -0
- package/lib/adapters/shell-history/adapter.js +137 -0
- package/lib/adapters/shell-history/index.js +17 -0
- package/lib/adapters/shell-history/shell-reader.js +100 -0
- package/lib/adapters/social-kuaishou/index.js +57 -1
- package/lib/adapters/social-toutiao/index.js +59 -1
- package/lib/adapters/system-data-android/adapter.js +220 -3
- package/lib/adapters/travel-12306/index.js +215 -29
- package/lib/adapters/vscode/adapter.js +285 -0
- package/lib/adapters/vscode/index.js +18 -0
- package/lib/adapters/vscode/vscode-reader.js +191 -0
- package/lib/adapters/win-recent/adapter.js +150 -0
- package/lib/adapters/win-recent/index.js +16 -0
- package/lib/adapters/win-recent/win-recent-reader.js +72 -0
- package/lib/analysis.js +227 -9
- package/lib/categories.js +101 -0
- package/lib/index.js +61 -0
- package/lib/migrations.js +146 -0
- package/lib/query-parser.js +74 -0
- package/lib/registry.js +162 -0
- package/lib/vault.js +363 -2
- package/package.json +2 -1
- package/scripts/run-native-tests-sandbox.sh +53 -0
package/lib/migrations.js
CHANGED
|
@@ -245,6 +245,90 @@ const PHASE_8_DDL = [
|
|
|
245
245
|
`CREATE INDEX IF NOT EXISTS idx_review_queue_pending ON review_queue(reviewed_at, enqueued_at)`,
|
|
246
246
|
];
|
|
247
247
|
|
|
248
|
+
// Phase 16 DDL — FTS5 full-text index over events for the Vault Browser UI.
|
|
249
|
+
//
|
|
250
|
+
// Uses FTS5 in **external-content** mode (content='events', content_rowid='rowid')
|
|
251
|
+
// so the events table remains the single source of truth — events_fts only
|
|
252
|
+
// stores the inverted index, kept in sync by 3 triggers.
|
|
253
|
+
//
|
|
254
|
+
// Tokenizer is `trigram` (SQLite ≥3.34) which gives substring matching for
|
|
255
|
+
// CJK (中文) — unicode61 (the FTS5 default) splits only on whitespace/punctuation, so an unspaced CJK run becomes one token and
|
|
256
|
+
// is unusable for Chinese. Trigram min query length is 3 chars (1-2 char
|
|
257
|
+
// queries match nothing); the UI surfaces a hint.
|
|
258
|
+
//
|
|
259
|
+
// If FTS5 or the trigram tokenizer is unavailable in the local SQLite build
|
|
260
|
+
// (rare with bs3mc which bundles SQLCipher 4 / SQLite 3.42+, but possible
|
|
261
|
+
// in custom builds), the probe sets `_meta.fts_mode = 'like'` and the
|
|
262
|
+
// migration skips the virtual table + triggers. vault.searchEvents falls
|
|
263
|
+
// back to LIKE-mode (slower, no ranking, ASCII-only correct).
|
|
264
|
+
//
|
|
265
|
+
// Backfill runs inline inside the migration transaction; on multi-100k-row
|
|
266
|
+
// vaults this can take ~5-15 seconds (one-time cost on upgrade).
|
|
267
|
+
function _hasFts5Trigram(db) {
  // Probe by trying to create a temp virtual table. We can't rely on
  // pragma_compile_options because (a) some bs3mc builds don't surface
  // ENABLE_FTS5 there, and (b) trigram is a tokenizer registration, not
  // a compile option. The CREATE is the ground truth.
  //
  // Cleanup is kept separate from detection: a failing DROP must not be
  // reported as "FTS5 unsupported", and a probe table left behind by an
  // earlier call must not make the CREATE below fail with "already exists".
  const dropProbe = () => {
    try {
      db.exec("DROP TABLE IF EXISTS temp._fts_probe");
    } catch (_err) {
      // Best-effort: the probe lives in the temp schema, so at worst it
      // lingers until this connection closes.
    }
  };

  dropProbe();
  try {
    db.exec(
      "CREATE VIRTUAL TABLE temp._fts_probe USING fts5(x, tokenize='trigram')"
    );
  } catch (_err) {
    // Any failure to create the table means the fts5 + trigram path is not usable.
    return false;
  }
  dropProbe();
  return true;
}
|
|
282
|
+
|
|
283
|
+
// SQL statements for the Phase 16 `events_fts` index. Migration version 3 runs
// them with db.exec in this order: createVirtualTable, triggerAi, triggerAd,
// triggerAu, backfill.
const PHASE_16_FTS_DDL = {
  // External-content FTS5: no row data copy, just the inverted index.
  // Columns mirror what's worth searching in events:
  //   content_text — the JSON payload, searched as-is (JSON braces tokenize
  //                  harmlessly with trigram; query "{key" never matches
  //                  anything users type)
  //   extra_text   — extra JSON tail (also worth searching: order numbers,
  //                  merchant names, message bodies live here for many
  //                  adapters)
  //   subtype/actor/place — short flat strings, indexed for keyword filter
  //
  // NOTE(review): with content='events', FTS5 fetches column values from the
  // content table using the FTS column names (content_text, extra_text), while
  // the triggers below read events.content / events.extra. If `events` has no
  // content_text / extra_text columns, anything that reads values back through
  // events_fts (snippet(), highlight(), the 'rebuild' command) would fail;
  // a plain MATCH returning rowid should be unaffected. Confirm against the
  // events schema and vault.searchEvents.
  createVirtualTable: `
    CREATE VIRTUAL TABLE events_fts USING fts5(
      subtype, content_text, actor, place, extra_text,
      content='events',
      content_rowid='rowid',
      tokenize='trigram'
    )
  `,
  // After INSERT — index the new row.
  triggerAi: `
    CREATE TRIGGER events_ai AFTER INSERT ON events BEGIN
      INSERT INTO events_fts(rowid, subtype, content_text, actor, place, extra_text)
      VALUES (new.rowid, new.subtype, new.content, new.actor, new.place, new.extra);
    END
  `,
  // After DELETE — remove from index. external-content delete uses the
  // sentinel ('delete', rowid, ...all-cols) pattern; the old column values
  // are passed so FTS5 knows which index entries to remove.
  triggerAd: `
    CREATE TRIGGER events_ad AFTER DELETE ON events BEGIN
      INSERT INTO events_fts(events_fts, rowid, subtype, content_text, actor, place, extra_text)
      VALUES('delete', old.rowid, old.subtype, old.content, old.actor, old.place, old.extra);
    END
  `,
  // After UPDATE — delete-then-insert (FTS5 external-content idiom): drop the
  // entries built from the old values, then index the new values.
  triggerAu: `
    CREATE TRIGGER events_au AFTER UPDATE ON events BEGIN
      INSERT INTO events_fts(events_fts, rowid, subtype, content_text, actor, place, extra_text)
      VALUES('delete', old.rowid, old.subtype, old.content, old.actor, old.place, old.extra);
      INSERT INTO events_fts(rowid, subtype, content_text, actor, place, extra_text)
      VALUES (new.rowid, new.subtype, new.content, new.actor, new.place, new.extra);
    END
  `,
  // One-shot backfill of all existing rows.
  backfill: `
    INSERT INTO events_fts(rowid, subtype, content_text, actor, place, extra_text)
    SELECT rowid, subtype, content, actor, place, extra FROM events
  `,
};
|
|
331
|
+
|
|
248
332
|
const MIGRATIONS = [
|
|
249
333
|
{
|
|
250
334
|
version: 1,
|
|
@@ -260,6 +344,52 @@ const MIGRATIONS = [
|
|
|
260
344
|
for (const sql of PHASE_8_DDL) db.exec(sql);
|
|
261
345
|
},
|
|
262
346
|
},
|
|
347
|
+
{
|
|
348
|
+
version: 3,
|
|
349
|
+
description:
|
|
350
|
+
"Phase 16 Vault Browser — events_fts FTS5 (trigram) virtual table + 3 sync triggers + backfill; LIKE fallback when FTS5 unavailable",
|
|
351
|
+
up(db) {
|
|
352
|
+
const supported = _hasFts5Trigram(db);
|
|
353
|
+
// Record the mode in _meta so the runtime can pick the right query path
|
|
354
|
+
// without re-probing every open. Set BEFORE creating tables so partial
|
|
355
|
+
// failures still leave a queryable mode marker.
|
|
356
|
+
const now = Date.now();
|
|
357
|
+
db.prepare(
|
|
358
|
+
`INSERT INTO _meta (key, value, updated_at) VALUES ('fts_mode', ?, ?)
|
|
359
|
+
ON CONFLICT(key) DO UPDATE SET value = excluded.value, updated_at = excluded.updated_at`
|
|
360
|
+
).run(supported ? "fts5" : "like", now);
|
|
361
|
+
if (!supported) return;
|
|
362
|
+
|
|
363
|
+
db.exec(PHASE_16_FTS_DDL.createVirtualTable);
|
|
364
|
+
db.exec(PHASE_16_FTS_DDL.triggerAi);
|
|
365
|
+
db.exec(PHASE_16_FTS_DDL.triggerAd);
|
|
366
|
+
db.exec(PHASE_16_FTS_DDL.triggerAu);
|
|
367
|
+
db.exec(PHASE_16_FTS_DDL.backfill);
|
|
368
|
+
},
|
|
369
|
+
},
|
|
370
|
+
{
|
|
371
|
+
version: 4,
|
|
372
|
+
description:
|
|
373
|
+
"Recreate uniq_{events,persons,places,items}_source as partial indices " +
|
|
374
|
+
"(WHERE source_original_id IS NOT NULL) to match putEvent/putPerson/etc " +
|
|
375
|
+
"UPSERT ON CONFLICT WHERE clauses. Older vaults (pre commit 44c4188a8) " +
|
|
376
|
+
"may have full unique indices created via CREATE UNIQUE INDEX IF NOT " +
|
|
377
|
+
"EXISTS without the WHERE clause; the IF-NOT-EXISTS hides the schema " +
|
|
378
|
+
"drift forever, manifesting only as runtime SQLite error '2nd ON CONFLICT " +
|
|
379
|
+
"clause does not match any PRIMARY KEY or UNIQUE constraint'. Symptom on " +
|
|
380
|
+
"Android: adapter.sync fails silently, vault events table stays at 1 row " +
|
|
381
|
+
"while raw_events accumulates 1000+. Idempotent on already-correct vaults.",
|
|
382
|
+
up(db) {
|
|
383
|
+
const tables = ["events", "persons", "places", "items"];
|
|
384
|
+
for (const t of tables) {
|
|
385
|
+
db.exec(`DROP INDEX IF EXISTS uniq_${t}_source`);
|
|
386
|
+
db.exec(
|
|
387
|
+
`CREATE UNIQUE INDEX uniq_${t}_source ON ${t}(source_adapter, source_original_id) ` +
|
|
388
|
+
`WHERE source_original_id IS NOT NULL`
|
|
389
|
+
);
|
|
390
|
+
}
|
|
391
|
+
},
|
|
392
|
+
},
|
|
263
393
|
];
|
|
264
394
|
|
|
265
395
|
const TARGET_VERSION = MIGRATIONS[MIGRATIONS.length - 1].version;
|
|
@@ -311,9 +441,25 @@ function getSchemaVersion(db) {
|
|
|
311
441
|
}
|
|
312
442
|
}
|
|
313
443
|
|
|
444
|
+
/**
|
|
445
|
+
* Returns 'fts5' or 'like' depending on what migration 3 recorded.
|
|
446
|
+
* Pre-migration-3 vaults return 'like' as the safe default.
|
|
447
|
+
*/
|
|
448
|
+
function getFtsMode(db) {
  // "like" is the answer for every situation other than an explicit "fts5"
  // marker: no row, an unrecognised value, or a failing query (e.g. no _meta).
  const FALLBACK_MODE = "like";
  let recorded;
  try {
    const metaRow = db
      .prepare("SELECT value FROM _meta WHERE key = 'fts_mode'")
      .get();
    recorded = metaRow ? metaRow.value : undefined;
  } catch (_err) {
    return FALLBACK_MODE;
  }
  if (recorded === "fts5") {
    return "fts5";
  }
  return FALLBACK_MODE;
}
|
|
456
|
+
|
|
314
457
|
// Public surface of the migrations module.
module.exports = {
  MIGRATIONS,
  TARGET_VERSION,
  applyMigrations,
  getSchemaVersion,
  // Reads the search mode ('fts5' | 'like') that migration 3 stored in _meta.
  getFtsMode,
  // Exported for tests + driver capability checks at vault open time.
  _hasFts5Trigram,
};
|
package/lib/query-parser.js
CHANGED
|
@@ -219,6 +219,78 @@ function parseIntent(text) {
|
|
|
219
219
|
return "list";
|
|
220
220
|
}
|
|
221
221
|
|
|
222
|
+
// ─── Entity-name extraction (FTS5 fulltext routing) ────────────────────
|
|
223
|
+
//
|
|
224
|
+
// Pull a probable entity-name candidate out of the raw question so
|
|
225
|
+
// `_gatherFacts` can augment intent=list results with `vault.searchEvents`
|
|
226
|
+
// (FTS5 + trigram CJK substring; LIKE fallback). Heuristic: strip every
|
|
227
|
+
// known stop-pattern (time / intent / subtype / adapter / list-trigger /
|
|
228
|
+
// pronoun / punct / digit) and pick the longest 2-10 char chunk that
|
|
229
|
+
// remains.
|
|
230
|
+
//
|
|
231
|
+
// Wrong extractions are SAFE: the engine treats this as an OPTIONAL
|
|
232
|
+
// augmentation — extracted-but-irrelevant terms just return 0 FTS rows
|
|
233
|
+
// (wasted budget, not lost facts). Single-character Chinese names like
|
|
234
|
+
// "妈" / "爸" are deliberately NOT picked up because single-char tokens
|
|
235
|
+
// false-positive heavily on residual verbs (说/看/买). That's a known
|
|
236
|
+
// limitation; first-pass acceptable.
|
|
237
|
+
//
|
|
238
|
+
// Stop-pattern order matters: multi-char compounds must run BEFORE
|
|
239
|
+
// shorter alternatives so "多少钱" doesn't decay to "多少" + leftover "钱".
|
|
240
|
+
|
|
241
|
+
// Ordered list of patterns that extractEntityTerm blanks out (each match is
// replaced by a space) before picking an entity candidate from what is left.
// ORDER MATTERS: a longer compound must run before any shorter pattern that
// would consume part of it, otherwise a stray character stays glued to the
// neighbouring entity name.
const ENTITY_STOP_PATTERNS = [
  // Compounds — multi-char specific tokens first.
  // "下了几单" / "下了多少单" have to precede "几单" and "多少" below; run later
  // they can never match and "张三下了几单" degrades to the candidate "张三下".
  /(下了几单|下了多少单|去旅游)/g,
  /(多少钱|多少次|多少个|多少家|多少人|多少张|多少部|加起来|共多少|总共)/g,
  /(几个|几次|几条|几单)/g,
  /(how\s+many|count\s+of)/gi,
  // Time
  /(今天|昨天|前天|明天|本周|这周|上周|这个礼拜|上个礼拜|这一周|上一周|本月|这月|上月|这个月|上个月|上一月|今年|去年|最近|最新)/g,
  // Full "YYYY年M月" first, then the generic "<n><unit>" form (which would
  // otherwise split it in two). "<n>个月" is matched as a whole so that no
  // stray "月" is left behind.
  /\d{4}\s*年\s*\d{1,2}\s*月/g,
  /\d+\s*(?:个\s*月|[天周月年个])/g,
  /(today|yesterday|past|recent|latest)/gi,
  // Intent (remaining shorter forms after compounds)
  /(多少|合计)/g,
  /(sum|total|count|amount)/gi,
  // Subtype keywords
  /(订单|下单|买了|购买|支付|付款|花了|花费|消费|开销|金额|转账|转给|转钱|收入|工资|进账|收到|聊天|消息|聊了|对话|朋友圈|动态|去过|到过|去了|来到|出差|旅行|浏览|看了|阅读|发了)/g,
  /(order|payment|transfer|income|message|chat|moment|post|visited|trip|browse|read|spent|spend)/gi,
  // Adapter keywords — compound forms first
  /(大众点评|百度地图|火车票)/g,
  /(支付宝|微信|邮箱|邮件|淘宝|天猫|京东|拼多多|美团|高德|高铁|携程)/g,
  /(alipay|wechat|email|imap|taobao|tmall|jingdong|jd|pdd|meituan|dianping|baidu\s*map|12306|ctrip)/gi,
  /(deepseek|kimi|通义|智谱|混元|千帆|扣子|chatgpt|claude)/gi,
  // List / search trigger
  /(查一下|找一找|帮我|给我|看下|看看|看一下)/g,
  /(列出|列表|查询|查找|查看|提到|发现)/g,
  /(list|show|find|search)/gi,
  // Pronouns / particles / prepositions (multi-char first, then single-char)
  /(我们|你们|他们|什么|哪个|哪些|怎么|为什么|是否)/g,
  /[的了吗啊呢在给到与和跟对从向是有我你他她它这那哪谁啥嘛]/g,
  // Punctuation + whitespace. The full-width forms (！（），：；？) are written as
  // \uFFxx escapes so they survive any width normalisation of this file.
  /[\s!?.,;:'"(),。!?;:、《》「」『』【】\uFF01\uFF08\uFF09\uFF0C\uFF1A\uFF1B\uFF1F]+/g,
  // Numbers
  /\d+/g,
];
|
|
275
|
+
|
|
276
|
+
/**
|
|
277
|
+
* Extract a probable entity-name candidate from raw question text.
|
|
278
|
+
*
|
|
279
|
+
* @param {string} text
|
|
280
|
+
* @returns {string|null} longest remaining 2-10 char chunk, or null
|
|
281
|
+
*/
|
|
282
|
+
function extractEntityTerm(text) {
  if (typeof text !== "string" || text.length === 0) return null;

  // Blank out every stop pattern, in list order, leaving only candidate chunks
  // separated by whitespace.
  const residue = ENTITY_STOP_PATTERNS.reduce(
    (remaining, pattern) => remaining.replace(pattern, " "),
    text
  );

  // Keep the longest chunk of 2-10 UTF-16 units; on a tie the earliest chunk
  // wins (strict ">" below).
  let longest = null;
  for (const chunk of residue.split(/\s+/)) {
    if (chunk.length < 2 || chunk.length > 10) continue;
    if (longest === null || chunk.length > longest.length) {
      longest = chunk;
    }
  }
  return longest;
}
|
|
293
|
+
|
|
222
294
|
// ─── Full parser ─────────────────────────────────────────────────────────
|
|
223
295
|
|
|
224
296
|
/**
|
|
@@ -250,7 +322,9 @@ module.exports = {
|
|
|
250
322
|
parseTimeWindow,
|
|
251
323
|
parseFilters,
|
|
252
324
|
parseIntent,
|
|
325
|
+
extractEntityTerm,
|
|
253
326
|
// exposed for tests
|
|
254
327
|
SUBTYPE_KEYWORDS,
|
|
255
328
|
ADAPTER_KEYWORDS,
|
|
329
|
+
ENTITY_STOP_PATTERNS,
|
|
256
330
|
};
|
package/lib/registry.js
CHANGED
|
@@ -267,6 +267,168 @@ class AdapterRegistry {
|
|
|
267
267
|
return this._finish(report, startedAt);
|
|
268
268
|
}
|
|
269
269
|
|
|
270
|
+
/**
|
|
271
|
+
* 2026-05-24 — re-derive canonical events from raw_events archive.
|
|
272
|
+
* Use case: a past sync wrote to raw_events (putRawEvent succeeded) but
|
|
273
|
+
* putBatch failed silently (e.g. partial-index drift trap #25) → events
|
|
274
|
+
* table stuck at 0 for those entities while raw piled up. After fixing
|
|
275
|
+
* the underlying schema/code bug, call this to promote the orphan raws
|
|
276
|
+
* to canonical events without re-fetching from the source.
|
|
277
|
+
*
|
|
278
|
+
* Behaviour:
|
|
279
|
+
* - Iterates raw_events filtered by [opts.adapter] (or all)
|
|
280
|
+
* - For each row: lookup registered adapter → normalize(raw) →
|
|
281
|
+
* partition valid/invalid → putBatch
|
|
282
|
+
* - Skips raws whose adapter is not currently registered (logs
|
|
283
|
+
* `adapter.rederive.adapter_missing` audit)
|
|
284
|
+
* - On adapter.normalize() throw, increments invalidCount + audit
|
|
285
|
+
*
|
|
286
|
+
* Does NOT re-fetch from the source, does NOT update watermarks (raw
|
|
287
|
+
* archive timestamp is what it was), does NOT run KG/RAG sinks (those
|
|
288
|
+
* are sync-time concerns — call them via syncAll if needed).
|
|
289
|
+
*
|
|
290
|
+
* @param {object} [opts]
|
|
291
|
+
* @param {string} [opts.adapter] Filter by adapter name; default = all
|
|
292
|
+
* @param {number} [opts.batchSize=100] Raws per partitionBatch+putBatch tx
|
|
293
|
+
* @returns {Promise<RederiveReport>}
|
|
294
|
+
*
|
|
295
|
+
* @typedef {object} RederiveReport
|
|
296
|
+
* @property {number} rawSeen Total raw_events iterated
|
|
297
|
+
* @property {number} invalidCount Normalize threw or partition rejected
|
|
298
|
+
* @property {number} adapterMissing Raws whose adapter not registered
|
|
299
|
+
* @property {object} entityCounts { events, persons, places, items, topics }
|
|
300
|
+
* @property {number} durationMs
|
|
301
|
+
* @property {Array<{adapter,error,sample?}>} errors Adapter-level errors
|
|
302
|
+
*/
|
|
303
|
+
async rederive(opts = {}) {
|
|
304
|
+
const startedAt = Date.now();
|
|
305
|
+
const report = {
|
|
306
|
+
rawSeen: 0,
|
|
307
|
+
invalidCount: 0,
|
|
308
|
+
adapterMissing: 0,
|
|
309
|
+
entityCounts: { events: 0, persons: 0, places: 0, items: 0, topics: 0 },
|
|
310
|
+
durationMs: 0,
|
|
311
|
+
errors: [],
|
|
312
|
+
};
|
|
313
|
+
const batchSize = Number.isInteger(opts.batchSize) && opts.batchSize > 0
|
|
314
|
+
? opts.batchSize
|
|
315
|
+
: 100;
|
|
316
|
+
|
|
317
|
+
// Page through raw_events in batches to avoid loading the whole table
|
|
318
|
+
// for large vaults (1000+ rows is normal). Group raws by adapter so a
|
|
319
|
+
// single putBatch tx commits per-adapter (mirrors syncAdapter shape).
|
|
320
|
+
let offset = 0;
|
|
321
|
+
let totalProcessed = 0;
|
|
322
|
+
/** @type {Map<string, Array<object>>} */
|
|
323
|
+
const buffersByAdapter = new Map();
|
|
324
|
+
|
|
325
|
+
const flushAdapter = async (adapterName) => {
|
|
326
|
+
const buffer = buffersByAdapter.get(adapterName);
|
|
327
|
+
if (!buffer || buffer.length === 0) return;
|
|
328
|
+
const adapter = this._adapters.get(adapterName);
|
|
329
|
+
if (!adapter) {
|
|
330
|
+
report.adapterMissing += buffer.length;
|
|
331
|
+
try {
|
|
332
|
+
this.vault.audit("adapter.rederive.adapter_missing", adapterName, {
|
|
333
|
+
droppedCount: buffer.length,
|
|
334
|
+
});
|
|
335
|
+
} catch (_e) { /* audit failure is non-fatal */ }
|
|
336
|
+
buffersByAdapter.set(adapterName, []);
|
|
337
|
+
return;
|
|
338
|
+
}
|
|
339
|
+
// normalize + collect into one merged batch (per adapter buffer)
|
|
340
|
+
const merged = { events: [], persons: [], places: [], items: [], topics: [] };
|
|
341
|
+
for (const raw of buffer) {
|
|
342
|
+
let normalized;
|
|
343
|
+
try {
|
|
344
|
+
normalized = adapter.normalize(raw);
|
|
345
|
+
} catch (err) {
|
|
346
|
+
report.invalidCount += 1;
|
|
347
|
+
try {
|
|
348
|
+
this.vault.audit("adapter.rederive.normalize_failed", adapterName, {
|
|
349
|
+
originalId: raw.originalId,
|
|
350
|
+
error: toError(err, "normalize").message,
|
|
351
|
+
});
|
|
352
|
+
} catch (_e) { /* audit non-fatal */ }
|
|
353
|
+
continue;
|
|
354
|
+
}
|
|
355
|
+
if (!normalized || typeof normalized !== "object") continue;
|
|
356
|
+
for (const k of ["events", "persons", "places", "items", "topics"]) {
|
|
357
|
+
if (Array.isArray(normalized[k])) merged[k].push(...normalized[k]);
|
|
358
|
+
}
|
|
359
|
+
}
|
|
360
|
+
const { valid, invalidReasons } = partitionBatch(merged);
|
|
361
|
+
if (invalidReasons.length > 0) {
|
|
362
|
+
report.invalidCount += invalidReasons.length;
|
|
363
|
+
try {
|
|
364
|
+
this.vault.audit("adapter.rederive.invalid_entities", adapterName, {
|
|
365
|
+
count: invalidReasons.length,
|
|
366
|
+
sample: invalidReasons.slice(0, 5),
|
|
367
|
+
});
|
|
368
|
+
} catch (_e) { /* audit non-fatal */ }
|
|
369
|
+
}
|
|
370
|
+
try {
|
|
371
|
+
const counts = this.vault.putBatch(valid);
|
|
372
|
+
for (const k of Object.keys(counts)) {
|
|
373
|
+
report.entityCounts[k] = (report.entityCounts[k] || 0) + counts[k];
|
|
374
|
+
}
|
|
375
|
+
} catch (err) {
|
|
376
|
+
report.errors.push({
|
|
377
|
+
adapter: adapterName,
|
|
378
|
+
error: toError(err, "putBatch").message,
|
|
379
|
+
sample: buffer.slice(0, 3).map((r) => r.originalId),
|
|
380
|
+
});
|
|
381
|
+
try {
|
|
382
|
+
this.vault.audit("adapter.rederive.put_batch_failed", adapterName, {
|
|
383
|
+
error: toError(err, "putBatch").message,
|
|
384
|
+
droppedCount: buffer.length,
|
|
385
|
+
});
|
|
386
|
+
} catch (_e) { /* audit non-fatal */ }
|
|
387
|
+
}
|
|
388
|
+
buffersByAdapter.set(adapterName, []);
|
|
389
|
+
};
|
|
390
|
+
|
|
391
|
+
while (true) {
|
|
392
|
+
const page = this.vault.queryRawEvents({
|
|
393
|
+
adapter: opts.adapter,
|
|
394
|
+
limit: batchSize,
|
|
395
|
+
offset,
|
|
396
|
+
});
|
|
397
|
+
if (page.length === 0) break;
|
|
398
|
+
offset += page.length;
|
|
399
|
+
report.rawSeen += page.length;
|
|
400
|
+
// Group by adapter into buffers, flush whenever a buffer hits batchSize.
|
|
401
|
+
for (const raw of page) {
|
|
402
|
+
let buf = buffersByAdapter.get(raw.adapter);
|
|
403
|
+
if (!buf) {
|
|
404
|
+
buf = [];
|
|
405
|
+
buffersByAdapter.set(raw.adapter, buf);
|
|
406
|
+
}
|
|
407
|
+
buf.push(raw);
|
|
408
|
+
if (buf.length >= batchSize) {
|
|
409
|
+
await flushAdapter(raw.adapter);
|
|
410
|
+
}
|
|
411
|
+
}
|
|
412
|
+
totalProcessed += page.length;
|
|
413
|
+
}
|
|
414
|
+
// Final flush of remaining per-adapter buffers
|
|
415
|
+
for (const name of Array.from(buffersByAdapter.keys())) {
|
|
416
|
+
await flushAdapter(name);
|
|
417
|
+
}
|
|
418
|
+
|
|
419
|
+
report.durationMs = Date.now() - startedAt;
|
|
420
|
+
try {
|
|
421
|
+
this.vault.audit("adapter.rederive.summary", opts.adapter || "*", {
|
|
422
|
+
rawSeen: report.rawSeen,
|
|
423
|
+
invalidCount: report.invalidCount,
|
|
424
|
+
adapterMissing: report.adapterMissing,
|
|
425
|
+
entityCounts: report.entityCounts,
|
|
426
|
+
durationMs: report.durationMs,
|
|
427
|
+
});
|
|
428
|
+
} catch (_e) { /* audit non-fatal */ }
|
|
429
|
+
return report;
|
|
430
|
+
}
|
|
431
|
+
|
|
270
432
|
/**
|
|
271
433
|
* Sync every registered adapter sequentially.
|
|
272
434
|
* Returns an array of SyncReports in registration order.
|