@chainlesschain/personal-data-hub 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +241 -0
- package/__tests__/adapter-spec.test.js +78 -0
- package/__tests__/adapters/email-adapter.test.js +605 -0
- package/__tests__/adapters/email-imap-session.test.js +334 -0
- package/__tests__/adapters/email-parser.test.js +244 -0
- package/__tests__/adapters/email-providers.test.js +84 -0
- package/__tests__/analysis.test.js +302 -0
- package/__tests__/batch.test.js +133 -0
- package/__tests__/bridges-cc-kg.test.js +231 -0
- package/__tests__/bridges-cc-llm.test.js +191 -0
- package/__tests__/bridges-cc-rag.test.js +162 -0
- package/__tests__/ids.test.js +45 -0
- package/__tests__/key-providers.test.js +126 -0
- package/__tests__/kg-derive.test.js +219 -0
- package/__tests__/llm-client.test.js +122 -0
- package/__tests__/mock-adapter.test.js +93 -0
- package/__tests__/prompt-builder.test.js +204 -0
- package/__tests__/query-parser.test.js +150 -0
- package/__tests__/rag-derive.test.js +169 -0
- package/__tests__/registry.test.js +304 -0
- package/__tests__/schemas.test.js +331 -0
- package/__tests__/vault.test.js +506 -0
- package/lib/adapter-spec.js +155 -0
- package/lib/adapters/email-imap/email-adapter.js +398 -0
- package/lib/adapters/email-imap/email-parser.js +177 -0
- package/lib/adapters/email-imap/imap-session.js +294 -0
- package/lib/adapters/email-imap/index.js +26 -0
- package/lib/adapters/email-imap/providers.js +111 -0
- package/lib/analysis.js +226 -0
- package/lib/batch.js +123 -0
- package/lib/bridges/cc-kg-sink.js +264 -0
- package/lib/bridges/cc-llm-adapter.js +169 -0
- package/lib/bridges/cc-rag-sink.js +118 -0
- package/lib/bridges/index.js +44 -0
- package/lib/constants.js +92 -0
- package/lib/ids.js +103 -0
- package/lib/index.js +141 -0
- package/lib/key-providers.js +146 -0
- package/lib/kg-derive.js +214 -0
- package/lib/llm-client.js +171 -0
- package/lib/migrations.js +246 -0
- package/lib/mock-adapter.js +199 -0
- package/lib/prompt-builder.js +205 -0
- package/lib/query-parser.js +250 -0
- package/lib/rag-derive.js +186 -0
- package/lib/registry.js +398 -0
- package/lib/schemas.js +379 -0
- package/lib/vault.js +883 -0
- package/package.json +63 -0
- package/vitest.config.js +10 -0
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* UnifiedSchema → RAG documents.
|
|
3
|
+
*
|
|
4
|
+
* Mirrors §8.3 of docs/design/Personal_Data_Hub_Architecture.md. The RAG
|
|
5
|
+
* layer needs (text, metadata) pairs to embed + index. Different entity
|
|
6
|
+
* types contribute different text:
|
|
7
|
+
*
|
|
8
|
+
* - Event text = content.title + content.text + "(amount/place/type prose)"
|
|
9
|
+
* - Person text = names + relation + notes
|
|
10
|
+
* - Place text = name + aliases + address
|
|
11
|
+
* - Item text = name + category
|
|
12
|
+
* - Topic text = name
|
|
13
|
+
*
|
|
14
|
+
* Metadata always includes (id, type, subtype, occurredAt where applicable,
|
|
15
|
+
* source.adapter). The RAG sink can then filter by adapter / time-window /
|
|
16
|
+
* subtype when retrieving — critical for "上个月的消费 Q&A" style queries.
|
|
17
|
+
*
|
|
18
|
+
* Like kg-derive, this module is engine-agnostic. The registry pipes
|
|
19
|
+
* {text, metadata} pairs to a `ragSink(doc)` callback wired up by the
|
|
20
|
+
* desktop main process to ChainlessChain's existing RAG pipeline (BM25 +
|
|
21
|
+
* Qdrant vector). In tests it just collects into an array.
|
|
22
|
+
*/
|
|
23
|
+
|
|
24
|
+
"use strict";
|
|
25
|
+
|
|
26
|
+
/**
|
|
27
|
+
* @typedef {object} RagDoc
|
|
28
|
+
* @property {string} id entity id (also serves as doc key in RAG)
|
|
29
|
+
* @property {string} type
|
|
30
|
+
* @property {string} text embedding input
|
|
31
|
+
* @property {object} metadata filter fields for retrieval
|
|
32
|
+
*/
|
|
33
|
+
|
|
34
|
+
/**
 * Join the usable entries of `parts` into one string.
 *
 * Entries that are `null`, `undefined` or the empty string are dropped;
 * everything else (including `0` and `false`) is kept and stringified by
 * `Array.prototype.join`.
 *
 * @param {Array<*>} parts candidate fragments
 * @param {string} [sep="\n"] separator placed between kept fragments
 * @returns {string}
 */
function joinNonEmpty(parts, sep = "\n") {
  const kept = [];
  for (const fragment of parts) {
    if (fragment === null || fragment === undefined || fragment === "") continue;
    kept.push(fragment);
  }
  return kept.join(sep);
}
|
|
37
|
+
|
|
38
|
+
/**
 * Build the RAG document for an event entity.
 *
 * The text is assembled, one fragment per line, from: content.title,
 * content.text, a signed amount ("+" when amount.direction is "in", "-"
 * otherwise), then "type: <subtype>" and "from: <adapter>" so the
 * embedding sees some structural context.
 *
 * @param {object} event
 * @returns {{id: *, type: "event", text: string, metadata: object}}
 */
function eventToRagDoc(event) {
  const { content, subtype, source } = event;
  const adapter = source && source.adapter;
  const lines = [];

  if (content) {
    const { title, text, amount } = content;
    if (title) lines.push(title);
    if (text) lines.push(text);
    if (amount) {
      const prefix = amount.direction === "in" ? "+" : "-";
      lines.push(`${prefix}${amount.value} ${amount.currency}`);
    }
  }

  // Structural prose: lets the embedding pick up type / origin context.
  if (subtype) lines.push(`type: ${subtype}`);
  if (adapter) lines.push(`from: ${adapter}`);

  const metadata = {
    subtype,
    occurredAt: event.occurredAt,
    actor: event.actor || null,
    place: event.place || null,
    adapter,
    originalId: source && source.originalId,
  };
  // `topics` is only present in metadata when the event carries it.
  if (event.topics) metadata.topics = event.topics;

  return {
    id: event.id,
    type: "event",
    text: joinNonEmpty(lines),
    metadata,
  };
}
|
|
69
|
+
|
|
70
|
+
/**
 * Build the RAG document for a person entity.
 *
 * Text lines, in order: every name, "relation: <relation>", free-form
 * notes, then one "<key>: <value>" line per non-null identifier (array
 * values are joined with ", ").
 *
 * `names` is handled defensively: a missing value contributes no names
 * (instead of throwing on spread) and a plain string is treated as one
 * name (instead of being split into single characters).
 *
 * @param {object} person
 * @returns {{id: *, type: "person", text: string, metadata: object}}
 */
function personToRagDoc(person) {
  const { names } = person;
  let parts;
  if (names == null) {
    parts = [];
  } else if (typeof names === "string") {
    // Spreading a string would yield one entry per character.
    parts = [names];
  } else {
    parts = [...names];
  }

  if (person.relation) parts.push(`relation: ${person.relation}`);
  if (person.notes) parts.push(person.notes);
  if (person.identifiers) {
    for (const [k, v] of Object.entries(person.identifiers)) {
      if (v == null) continue;
      const display = Array.isArray(v) ? v.join(", ") : v;
      parts.push(`${k}: ${display}`);
    }
  }

  return {
    id: person.id,
    type: "person",
    text: joinNonEmpty(parts),
    metadata: {
      subtype: person.subtype,
      adapter: person.source && person.source.adapter,
      originalId: person.source && person.source.originalId,
    },
  };
}
|
|
92
|
+
|
|
93
|
+
/**
 * Build the RAG document for a place entity.
 *
 * Text lines, in order: the name, every alias that differs from the name,
 * the address, then "category: <category>". A place without an `aliases`
 * list is accepted and simply contributes no alias lines.
 *
 * @param {object} place
 * @returns {{id: *, type: "place", text: string, metadata: object}}
 */
function placeToRagDoc(place) {
  const parts = [place.name];
  // Treat an absent alias list as empty rather than throwing on iteration.
  const aliases = place.aliases == null ? [] : place.aliases;
  for (const a of aliases) {
    if (a !== place.name) parts.push(a);
  }
  if (place.address) parts.push(place.address);
  if (place.category) parts.push(`category: ${place.category}`);

  return {
    id: place.id,
    type: "place",
    text: joinNonEmpty(parts),
    metadata: {
      adapter: place.source && place.source.adapter,
      originalId: place.source && place.source.originalId,
      // Coordinates are only attached when the place has them.
      ...(place.coordinates ? { coordinates: place.coordinates } : {}),
    },
  };
}
|
|
111
|
+
|
|
112
|
+
/**
 * Build the RAG document for an item entity.
 *
 * Text lines, in order: the name, "category: <category>", and
 * "<price.value> <price.currency>" when a price is present.
 *
 * @param {object} item
 * @returns {{id: *, type: "item", text: string, metadata: object}}
 */
function itemToRagDoc(item) {
  const { source, price } = item;
  const fragments = [item.name];

  if (item.category) fragments.push(`category: ${item.category}`);
  if (price) fragments.push(`${price.value} ${price.currency}`);

  const metadata = {
    subtype: item.subtype,
    adapter: source && source.adapter,
    originalId: source && source.originalId,
    merchant: item.merchant || null,
  };

  return {
    id: item.id,
    type: "item",
    text: joinNonEmpty(fragments),
    metadata,
  };
}
|
|
128
|
+
|
|
129
|
+
/**
 * Build the RAG document for a topic entity.
 *
 * The text is the topic name as-is; metadata carries the source adapter
 * and the parent topic (null when there is none).
 *
 * @param {object} topic
 * @returns {{id: *, type: "topic", text: *, metadata: object}}
 */
function topicToRagDoc(topic) {
  const origin = topic.source;
  const metadata = {
    adapter: origin && origin.adapter,
    parentTopic: topic.parentTopic || null,
  };
  return { id: topic.id, type: "topic", text: topic.name, metadata };
}
|
|
140
|
+
|
|
141
|
+
/**
 * Convert any single entity to its RAG document by dispatching on
 * `entity.type`.
 *
 * @param {*} entity
 * @returns {object|null} the document, or null when `entity` is not an
 *   object or its type is not one of event / person / place / item / topic
 */
function entityToRagDoc(entity) {
  if (!entity || typeof entity !== "object") return null;

  const kind = entity.type;
  if (kind === "event") return eventToRagDoc(entity);
  if (kind === "person") return personToRagDoc(entity);
  if (kind === "place") return placeToRagDoc(entity);
  if (kind === "item") return itemToRagDoc(entity);
  if (kind === "topic") return topicToRagDoc(entity);
  return null;
}
|
|
158
|
+
|
|
159
|
+
/**
 * Derive the RAG documents for a whole normalized batch.
 *
 * Collections are processed in the fixed order events, persons, places,
 * items, topics; a missing collection is treated as empty. Documents
 * whose text is empty (or missing) are left out, since there is nothing
 * to embed for them.
 *
 * @param {object} batch object with optional `events`, `persons`,
 *   `places`, `items` and `topics` collections
 * @returns {object[]} documents in processing order; an empty array when
 *   `batch` is not an object
 */
function deriveBatchDocs(batch) {
  if (!batch || typeof batch !== "object") return [];

  // Converters are wrapped so each one is only looked up when an entity of
  // that kind is actually converted.
  const converters = [
    ["events", (entity) => eventToRagDoc(entity)],
    ["persons", (entity) => personToRagDoc(entity)],
    ["places", (entity) => placeToRagDoc(entity)],
    ["items", (entity) => itemToRagDoc(entity)],
    ["topics", (entity) => topicToRagDoc(entity)],
  ];

  const docs = [];
  for (const [collection, convert] of converters) {
    for (const entity of batch[collection] || []) {
      const doc = convert(entity);
      if (!doc || !doc.text) continue;
      if (doc.text.length > 0) docs.push(doc);
    }
  }
  return docs;
}
|
|
177
|
+
|
|
178
|
+
module.exports = {
|
|
179
|
+
eventToRagDoc,
|
|
180
|
+
personToRagDoc,
|
|
181
|
+
placeToRagDoc,
|
|
182
|
+
itemToRagDoc,
|
|
183
|
+
topicToRagDoc,
|
|
184
|
+
entityToRagDoc,
|
|
185
|
+
deriveBatchDocs,
|
|
186
|
+
};
|
package/lib/registry.js
ADDED
|
@@ -0,0 +1,398 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* AdapterRegistry — runtime registry + sync orchestrator for adapters.
|
|
3
|
+
*
|
|
4
|
+
* Responsibilities:
|
|
5
|
+
* 1. Hold registered adapters by .name; reject double-register.
|
|
6
|
+
* 2. Run `syncAdapter(name, options)` end-to-end:
|
|
7
|
+
* adapter.healthCheck()
|
|
8
|
+
* → adapter.sync({ sinceWatermark }) AsyncIterable<RawEvent>
|
|
9
|
+
* → vault.putRawEvent(...) (archive verbatim payload)
|
|
10
|
+
* → adapter.normalize(raw) → NormalizedBatch
|
|
11
|
+
* → partitionBatch (valid vs invalid)
|
|
12
|
+
* → vault.putBatch(valid)
|
|
13
|
+
* → kgSink(triples) / ragSink(docs) (pluggable)
|
|
14
|
+
* → audit invalidReasons + sync stats
|
|
15
|
+
* → vault.setWatermark(...)
|
|
16
|
+
* 3. `syncAll()` runs every registered adapter sequentially (concurrency:
|
|
17
|
+
* one at a time; v1 is fine, parallel sync needs careful rate-limit
|
|
18
|
+
* coordination per architecture doc §10).
|
|
19
|
+
*
|
|
20
|
+
* Sinks (kgSink, ragSink) are intentionally PUSH callbacks rather than the
|
|
21
|
+
* registry pulling from existing engines. This keeps the hub package free
|
|
22
|
+
* of dependencies on KG / RAG / Ollama / IPC layers — desktop main wires
|
|
23
|
+
* them up. Tests inject in-memory collectors.
|
|
24
|
+
*
|
|
25
|
+
* Concurrency policy: one sync at a time per registry instance. Multiple
|
|
26
|
+
* registries with separate vaults can run in parallel (different processes).
|
|
27
|
+
*/
|
|
28
|
+
|
|
29
|
+
"use strict";
|
|
30
|
+
|
|
31
|
+
const { assertAdapter, toError } = require("./adapter-spec");
|
|
32
|
+
const { partitionBatch } = require("./batch");
|
|
33
|
+
const { deriveBatchTriples } = require("./kg-derive");
|
|
34
|
+
const { deriveBatchDocs } = require("./rag-derive");
|
|
35
|
+
|
|
36
|
+
const DEFAULT_BATCH_SIZE = 100;
|
|
37
|
+
|
|
38
|
+
/**
 * Holds registered adapters by name and runs their syncs, writing results
 * into the vault and forwarding derived triples / documents to the
 * optional `kgSink` / `ragSink` callbacks. Only one sync may be in flight
 * per registry instance.
 */
class AdapterRegistry {
  /**
   * @param {object} opts
   * @param {import("./vault").LocalVault} opts.vault   open LocalVault to write into
   * @param {(triples: object[]) => void|Promise<void>} [opts.kgSink]
   * @param {(docs: object[]) => void|Promise<void>} [opts.ragSink]
   * @param {number} [opts.batchSize=100]  raw events per ingest batch (commit size);
   *   anything that is not a positive integer falls back to the default
   * @param {(msg: object) => void} [opts.onSyncEvent]  optional progress callback
   * @throws {Error} when `opts` or `opts.vault` is missing
   */
  constructor(opts) {
    if (!opts || typeof opts !== "object") throw new Error("AdapterRegistry: opts required");
    if (!opts.vault) throw new Error("AdapterRegistry: opts.vault required");
    this.vault = opts.vault;
    // Non-function sinks / listeners are ignored rather than rejected.
    this.kgSink = typeof opts.kgSink === "function" ? opts.kgSink : null;
    this.ragSink = typeof opts.ragSink === "function" ? opts.ragSink : null;
    this.onSyncEvent = typeof opts.onSyncEvent === "function" ? opts.onSyncEvent : null;
    this.batchSize =
      Number.isInteger(opts.batchSize) && opts.batchSize > 0 ? opts.batchSize : DEFAULT_BATCH_SIZE;

    this._adapters = new Map();
    this._activeSync = null; // name of currently-running adapter, or null
  }

  // ─── Registration ────────────────────────────────────────────────────

  /**
   * Register an adapter under its `.name`.
   *
   * @param {object} adapter
   * @throws {Error} when `assertAdapter` rejects it or the name is taken
   */
  register(adapter) {
    const r = assertAdapter(adapter);
    if (!r.ok) {
      throw new Error(`AdapterRegistry.register: invalid adapter — ${r.errors.join("; ")}`);
    }
    if (this._adapters.has(adapter.name)) {
      throw new Error(`AdapterRegistry.register: adapter "${adapter.name}" already registered`);
    }
    this._adapters.set(adapter.name, adapter);
    this._emit({ kind: "registered", adapter: adapter.name });
  }

  /**
   * Remove an adapter.
   *
   * @param {string} name
   * @returns {boolean} false when no such adapter was registered
   * @throws {Error} when that adapter is currently syncing
   */
  unregister(name) {
    if (!this._adapters.has(name)) return false;
    if (this._activeSync === name) {
      throw new Error(`AdapterRegistry.unregister: cannot unregister "${name}" mid-sync`);
    }
    this._adapters.delete(name);
    this._emit({ kind: "unregistered", adapter: name });
    return true;
  }

  /**
   * @param {string} name
   * @returns {object|null} the registered adapter, or null
   */
  get(name) {
    return this._adapters.get(name) || null;
  }

  /**
   * Summarize every registered adapter in registration order.
   *
   * @returns {Array<{name: string, version: *, capabilities: Array, sensitivity: *, legalGate: boolean}>}
   */
  list() {
    return Array.from(this._adapters.values()).map((a) => ({
      name: a.name,
      version: a.version,
      capabilities: [...a.capabilities], // copy so callers cannot mutate the adapter
      sensitivity: a.dataDisclosure.sensitivity,
      legalGate: !!a.dataDisclosure.legalGate,
    }));
  }

  /**
   * @param {string} name
   * @returns {boolean}
   */
  has(name) {
    return this._adapters.has(name);
  }

  // ─── Sync orchestration ──────────────────────────────────────────────

  /**
   * Sync one adapter end-to-end: health check, resolve the watermark,
   * stream raw events in batches of `batchSize` through `_ingestRawBatch`,
   * then persist the new watermark and audit the outcome.
   *
   * Failures during the sync are reported through the returned report
   * (status "error") rather than thrown. This method itself only ever
   * sets the statuses "ok", "unhealthy" and "error".
   *
   * @param {string} name
   * @param {object} [options]
   * @param {string} [options.scope=""]
   * @param {number} [options.maxEvents]
   * @param {string} [options.sinceWatermark]  override stored watermark
   * @returns {Promise<SyncReport>}
   * @throws {Error} when the adapter is unknown or another sync is running
   *
   * @typedef {object} SyncReport
   * @property {string} adapter
   * @property {string} status  "ok" | "auth_expired" | "unhealthy" | "error"
   * @property {number} rawCount
   * @property {object} entityCounts  { events, persons, places, items, topics }
   * @property {number} invalidCount
   * @property {number} kgTripleCount
   * @property {number} ragDocCount
   * @property {number} durationMs
   * @property {string|null} error
   * @property {string|null} watermark
   */
  async syncAdapter(name, options = {}) {
    const adapter = this._adapters.get(name);
    if (!adapter) throw new Error(`AdapterRegistry.syncAdapter: no adapter "${name}"`);
    if (this._activeSync) {
      throw new Error(
        `AdapterRegistry.syncAdapter: already syncing "${this._activeSync}"; one at a time`
      );
    }
    this._activeSync = name;

    const startedAt = Date.now();
    const report = {
      adapter: name,
      status: "ok",
      rawCount: 0,
      entityCounts: { events: 0, persons: 0, places: 0, items: 0, topics: 0 },
      invalidCount: 0,
      kgTripleCount: 0,
      ragDocCount: 0,
      durationMs: 0,
      error: null,
      watermark: null,
    };
    const scope = typeof options.scope === "string" ? options.scope : "";

    try {
      // 1. Health check (gate)
      const health = await adapter.healthCheck();
      if (!health || !health.ok) {
        report.status = "unhealthy";
        report.error = (health && health.reason) || "healthCheck returned not ok";
        this.vault.audit("adapter.sync.unhealthy", name, {
          scope,
          reason: report.error,
        });
        // The `finally` below still clears _activeSync on this early return.
        return this._finish(report, startedAt);
      }

      // 2. Resolve watermark: an explicit option wins over the stored one.
      let sinceWatermark = options.sinceWatermark;
      if (sinceWatermark === undefined) {
        const stored = this.vault.getWatermark(name, scope);
        sinceWatermark = stored && stored.watermark != null
          ? this._parseStoredWatermark(stored.watermark)
          : undefined;
      }

      this._emit({ kind: "sync.start", adapter: name, scope, sinceWatermark });

      // 3. Iterate raw events, batch them, ingest each batch
      let buffer = [];
      const flush = async () => {
        if (buffer.length === 0) return;
        await this._ingestRawBatch(adapter, buffer, report);
        buffer = [];
      };

      const iter = adapter.sync({
        sinceWatermark,
        maxEvents: options.maxEvents,
        scope,
      });

      for await (const raw of iter) {
        // Non-object yields are counted as invalid and never buffered.
        if (!raw || typeof raw !== "object") {
          report.invalidCount += 1;
          continue;
        }
        buffer.push(raw);
        report.rawCount += 1;
        if (buffer.length >= this.batchSize) {
          await flush();
        }
      }
      await flush(); // trailing partial batch

      // 4. Persist final watermark (previous numeric watermark + raws seen)
      const newWatermark = report.rawCount + (this._parseStoredWatermark(sinceWatermark) || 0);
      report.watermark = String(newWatermark);
      this.vault.setWatermark(name, scope, {
        watermark: report.watermark,
        lastSyncedAt: Date.now(),
        lastStatus: "ok",
        lastError: null,
      });

      this.vault.audit("adapter.sync.ok", name, {
        scope,
        rawCount: report.rawCount,
        invalidCount: report.invalidCount,
        watermark: report.watermark,
      });
      this._emit({ kind: "sync.ok", adapter: name, ...report });
    } catch (err) {
      const error = toError(err, `sync ${name}`);
      report.status = "error";
      report.error = error.message;
      this.vault.audit("adapter.sync.error", name, {
        scope,
        message: error.message,
      });
      this._emit({ kind: "sync.error", adapter: name, error: error.message });
      // Update watermark with error status (preserve last successful watermark value)
      try {
        const prev = this.vault.getWatermark(name, scope);
        this.vault.setWatermark(name, scope, {
          watermark: prev ? prev.watermark : null,
          lastSyncedAt: Date.now(),
          lastStatus: "error",
          lastError: error.message,
        });
      } catch (_e) {
        // Watermark write failure is non-fatal in the error path.
      }
    } finally {
      this._activeSync = null;
    }

    return this._finish(report, startedAt);
  }

  /**
   * Sync every registered adapter sequentially.
   * Returns an array of SyncReports in registration order.
   *
   * @param {object} [options] forwarded unchanged to each `syncAdapter` call
   * @returns {Promise<SyncReport[]>}
   */
  async syncAll(options = {}) {
    const reports = [];
    for (const adapter of this._adapters.values()) {
      try {
        reports.push(await this.syncAdapter(adapter.name, options));
      } catch (err) {
        // syncAdapter reports most failures itself; anything that still
        // escapes is converted into an error report so the loop continues.
        reports.push({
          adapter: adapter.name,
          status: "error",
          error: toError(err, "syncAll").message,
          rawCount: 0,
          entityCounts: { events: 0, persons: 0, places: 0, items: 0, topics: 0 },
          invalidCount: 0,
          kgTripleCount: 0,
          ragDocCount: 0,
          durationMs: 0,
          watermark: null,
        });
      }
    }
    return reports;
  }

  // ─── Internals ───────────────────────────────────────────────────────

  /**
   * Ingest one batch of raw events: archive, normalize, validate, write
   * to the vault, then feed the KG / RAG sinks. Mutates `report` counters.
   *
   * @param {object} adapter
   * @param {object[]} rawBatch
   * @param {SyncReport} report
   * @returns {Promise<void>}
   */
  async _ingestRawBatch(adapter, rawBatch, report) {
    // 1. Archive raw payloads to vault.raw_events. Done first so even if
    //    normalize / KG / RAG fails, the raw is recoverable for re-derive.
    const archived = [];
    for (const raw of rawBatch) {
      try {
        this.vault.putRawEvent({
          adapter: adapter.name,
          originalId: raw.originalId,
          capturedAt: raw.capturedAt,
          payload: raw.payload,
        });
        archived.push(raw);
      } catch (err) {
        // Bad raw — record and skip. It is excluded from normalization so
        // no derived entity is stored without its archived raw, and the
        // raw is counted as invalid only once.
        report.invalidCount += 1;
        this.vault.audit("adapter.sync.invalid_raw", adapter.name, {
          originalId: raw && raw.originalId,
          error: toError(err, "putRawEvent").message,
        });
      }
    }

    // 2. Normalize each archived raw → merge into one batch for transactional commit.
    const merged = { events: [], persons: [], places: [], items: [], topics: [] };
    for (const raw of archived) {
      let normalized;
      try {
        normalized = adapter.normalize(raw);
      } catch (err) {
        report.invalidCount += 1;
        this.vault.audit("adapter.sync.normalize_failed", adapter.name, {
          originalId: raw.originalId,
          error: toError(err, "normalize").message,
        });
        continue;
      }
      if (!normalized || typeof normalized !== "object") continue;
      for (const key of ["events", "persons", "places", "items", "topics"]) {
        if (Array.isArray(normalized[key])) merged[key].push(...normalized[key]);
      }
    }

    // 3. Partition valid vs invalid (validators gate before vault write).
    const { valid, invalidReasons } = partitionBatch(merged);
    if (invalidReasons.length > 0) {
      report.invalidCount += invalidReasons.length;
      // Only audit a small sample — invalid rows can be high-cardinality.
      this.vault.audit("adapter.sync.invalid_entities", adapter.name, {
        count: invalidReasons.length,
        sample: invalidReasons.slice(0, 5),
      });
    }

    // 4. Transactional write to vault.
    const counts = this.vault.putBatch(valid);
    for (const k of Object.keys(counts)) {
      report.entityCounts[k] = (report.entityCounts[k] || 0) + counts[k];
    }

    // 5. KG sink (per-batch, not per-entity, so the sink can amortize work).
    //    A failing sink is audited but does not fail the sync.
    if (this.kgSink) {
      const triples = deriveBatchTriples(valid);
      report.kgTripleCount += triples.length;
      try {
        await this.kgSink(triples);
      } catch (err) {
        this.vault.audit("adapter.sync.kg_sink_failed", adapter.name, {
          error: toError(err, "kgSink").message,
        });
      }
    }

    // 6. RAG sink, with the same failure policy as the KG sink.
    if (this.ragSink) {
      const docs = deriveBatchDocs(valid);
      report.ragDocCount += docs.length;
      try {
        await this.ragSink(docs);
      } catch (err) {
        this.vault.audit("adapter.sync.rag_sink_failed", adapter.name, {
          error: toError(err, "ragSink").message,
        });
      }
    }

    this._emit({
      kind: "sync.batch",
      adapter: adapter.name,
      rawCount: report.rawCount,
      invalidCount: report.invalidCount,
    });
  }

  /**
   * Parse a watermark into an integer.
   *
   * @param {*} s stored watermark value
   * @returns {number|undefined} undefined for null / undefined / non-numeric input
   */
  _parseStoredWatermark(s) {
    if (s == null) return undefined;
    const n = parseInt(s, 10);
    return Number.isFinite(n) ? n : undefined;
  }

  /**
   * Deliver a progress message to `onSyncEvent`, if one was supplied.
   *
   * @param {object} msg
   */
  _emit(msg) {
    if (this.onSyncEvent) {
      try {
        this.onSyncEvent(msg);
      } catch (_err) {
        // Listener errors must NOT abort the sync.
      }
    }
  }

  /**
   * Stamp the elapsed time on the report and return it.
   *
   * @param {SyncReport} report
   * @param {number} startedAt epoch milliseconds captured at sync start
   * @returns {SyncReport}
   */
  _finish(report, startedAt) {
    report.durationMs = Date.now() - startedAt;
    return report;
  }
}
|
|
394
|
+
|
|
395
|
+
module.exports = {
|
|
396
|
+
AdapterRegistry,
|
|
397
|
+
DEFAULT_BATCH_SIZE,
|
|
398
|
+
};
|