@chainlesschain/personal-data-hub 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. package/README.md +241 -0
  2. package/__tests__/adapter-spec.test.js +78 -0
  3. package/__tests__/adapters/email-adapter.test.js +605 -0
  4. package/__tests__/adapters/email-imap-session.test.js +334 -0
  5. package/__tests__/adapters/email-parser.test.js +244 -0
  6. package/__tests__/adapters/email-providers.test.js +84 -0
  7. package/__tests__/analysis.test.js +302 -0
  8. package/__tests__/batch.test.js +133 -0
  9. package/__tests__/bridges-cc-kg.test.js +231 -0
  10. package/__tests__/bridges-cc-llm.test.js +191 -0
  11. package/__tests__/bridges-cc-rag.test.js +162 -0
  12. package/__tests__/ids.test.js +45 -0
  13. package/__tests__/key-providers.test.js +126 -0
  14. package/__tests__/kg-derive.test.js +219 -0
  15. package/__tests__/llm-client.test.js +122 -0
  16. package/__tests__/mock-adapter.test.js +93 -0
  17. package/__tests__/prompt-builder.test.js +204 -0
  18. package/__tests__/query-parser.test.js +150 -0
  19. package/__tests__/rag-derive.test.js +169 -0
  20. package/__tests__/registry.test.js +304 -0
  21. package/__tests__/schemas.test.js +331 -0
  22. package/__tests__/vault.test.js +506 -0
  23. package/lib/adapter-spec.js +155 -0
  24. package/lib/adapters/email-imap/email-adapter.js +398 -0
  25. package/lib/adapters/email-imap/email-parser.js +177 -0
  26. package/lib/adapters/email-imap/imap-session.js +294 -0
  27. package/lib/adapters/email-imap/index.js +26 -0
  28. package/lib/adapters/email-imap/providers.js +111 -0
  29. package/lib/analysis.js +226 -0
  30. package/lib/batch.js +123 -0
  31. package/lib/bridges/cc-kg-sink.js +264 -0
  32. package/lib/bridges/cc-llm-adapter.js +169 -0
  33. package/lib/bridges/cc-rag-sink.js +118 -0
  34. package/lib/bridges/index.js +44 -0
  35. package/lib/constants.js +92 -0
  36. package/lib/ids.js +103 -0
  37. package/lib/index.js +141 -0
  38. package/lib/key-providers.js +146 -0
  39. package/lib/kg-derive.js +214 -0
  40. package/lib/llm-client.js +171 -0
  41. package/lib/migrations.js +246 -0
  42. package/lib/mock-adapter.js +199 -0
  43. package/lib/prompt-builder.js +205 -0
  44. package/lib/query-parser.js +250 -0
  45. package/lib/rag-derive.js +186 -0
  46. package/lib/registry.js +398 -0
  47. package/lib/schemas.js +379 -0
  48. package/lib/vault.js +883 -0
  49. package/package.json +63 -0
  50. package/vitest.config.js +10 -0
package/lib/batch.js ADDED
@@ -0,0 +1,123 @@
1
+ /**
2
+ * NormalizedBatch — what every adapter.normalize() returns to the ingestor
3
+ *
4
+ * Mirrors §9.1 of docs/design/Personal_Data_Hub_Architecture.md:
5
+ * interface NormalizedBatch {
6
+ * events: Event[];
7
+ * persons: Person[];
8
+ * places: Place[];
9
+ * items: Item[];
10
+ * topics?: Topic[];
11
+ * }
12
+ *
13
+ * Helpers here let adapters build batches incrementally + ingestors validate
14
+ * a whole batch in one call (so a single bad row doesn't kill the whole sync).
15
+ */
16
+
17
+ "use strict";
18
+
19
+ const { validate } = require("./schemas");
20
+
21
/**
 * Create a fresh NormalizedBatch with an empty array for every entity kind.
 * Every call allocates new arrays, so callers may push into the result
 * without affecting other batches.
 *
 * @returns {{ events: Array, persons: Array, places: Array, items: Array, topics: Array }}
 */
function emptyBatch() {
  const batch = {};
  for (const kind of ["events", "persons", "places", "items", "topics"]) {
    batch[kind] = [];
  }
  return batch;
}
24
+
25
/**
 * Concatenate two batches kind-by-kind into a new batch (a's entities first).
 * Neither input is mutated.
 *
 * A missing kind array is treated as empty, and so is a missing batch:
 * passing null/undefined for `a` or `b` yields the other batch's entities
 * instead of throwing, which makes the function safe as a reduce() step.
 *
 * @param {object|null|undefined} a first batch
 * @param {object|null|undefined} b second batch
 * @returns {{ events: Array, persons: Array, places: Array, items: Array, topics: Array }}
 */
function mergeBatches(a, b) {
  const left = a || {};
  const right = b || {};
  const concat = (kind) => [...(left[kind] || []), ...(right[kind] || [])];
  return {
    events: concat("events"),
    persons: concat("persons"),
    places: concat("places"),
    items: concat("items"),
    topics: concat("topics"),
  };
}
34
+
35
/**
 * Validate every entity in a batch. Returns:
 *   {
 *     valid: boolean,
 *     entityCount: number,
 *     errorCount: number,
 *     errors: Array<{ kind, index, id?, errors: string[] }>,
 *   }
 *
 * Structural problems are reported with index -1: a batch that is not a
 * plain object (null, a primitive, or an array) yields a single "batch"
 * error, and a kind that is present but not an array yields one error for
 * that kind. A kind that is undefined is simply skipped.
 *
 * The ingestor's policy is: if errorCount > 0, log all bad rows but still
 * ingest the good ones (don't let a single corrupt row from a third-party
 * adapter abort the whole sync window).
 *
 * @param {object} batch NormalizedBatch candidate
 * @returns {{ valid: boolean, entityCount: number, errorCount: number, errors: Array }}
 */
function validateBatch(batch) {
  // Arrays are typeof "object" but have none of the kind keys; without the
  // explicit check an array would be reported as a valid, empty batch.
  if (batch == null || typeof batch !== "object" || Array.isArray(batch)) {
    return {
      valid: false,
      entityCount: 0,
      errorCount: 1,
      errors: [{ kind: "batch", index: -1, errors: ["batch must be a plain object"] }],
    };
  }

  const errors = [];
  let entityCount = 0;

  for (const kind of ["events", "persons", "places", "items", "topics"]) {
    const arr = batch[kind];
    if (arr === undefined) continue;
    if (!Array.isArray(arr)) {
      errors.push({ kind, index: -1, errors: [`${kind} must be an array when present`] });
      continue;
    }
    arr.forEach((entity, i) => {
      entityCount += 1;
      const result = validate(entity);
      if (result.valid) return;
      errors.push({
        kind,
        index: i,
        // entity may be null or a primitive; only read .id from real objects
        id: entity && typeof entity === "object" ? entity.id : undefined,
        errors: result.errors,
      });
    });
  }

  return {
    valid: errors.length === 0,
    entityCount,
    errorCount: errors.length,
    errors,
  };
}
90
+
91
/**
 * Partition a batch into "valid" and "invalid" sub-batches.
 * Lets the ingestor commit valids + spool invalids to a review queue.
 *
 * Structural problems never throw; they are reported in invalidReasons with
 * index -1, using the same messages validateBatch produces:
 *   - a null / non-object batch → one "batch" reason, both partitions empty
 *   - a kind that is truthy but not an array → one reason for that kind
 * A kind that is missing or falsy is treated as empty.
 *
 * @param {object} batch NormalizedBatch candidate
 * @returns {{ valid: object, invalid: object,
 *             invalidReasons: Array<{ kind, index, id?, errors: string[] }> }}
 */
function partitionBatch(batch) {
  const valid = emptyBatch();
  const invalid = emptyBatch();
  const invalidReasons = [];

  if (batch == null || typeof batch !== "object") {
    invalidReasons.push({ kind: "batch", index: -1, errors: ["batch must be a plain object"] });
    return { valid, invalid, invalidReasons };
  }

  for (const kind of ["events", "persons", "places", "items", "topics"]) {
    const arr = batch[kind] || [];
    if (!Array.isArray(arr)) {
      // Nothing can be partitioned, but record why this kind produced no rows
      // instead of dropping it silently.
      invalidReasons.push({ kind, index: -1, errors: [`${kind} must be an array when present`] });
      continue;
    }
    arr.forEach((entity, i) => {
      const result = validate(entity);
      if (result.valid) {
        valid[kind].push(entity);
        return;
      }
      invalid[kind].push(entity);
      invalidReasons.push({ kind, index: i, id: entity?.id, errors: result.errors });
    });
  }

  return { valid, invalid, invalidReasons };
}
117
+
118
+ module.exports = {
119
+ emptyBatch,
120
+ mergeBatches,
121
+ validateBatch,
122
+ partitionBatch,
123
+ };
@@ -0,0 +1,264 @@
1
+ /**
2
+ * CcKgSink — translates hub KG triples (subject/predicate/object|literal)
3
+ * into ChainlessChain's existing knowledge-graph addEntity + addRelation API.
4
+ *
5
+ * Hub triples come in two shapes:
6
+ *
7
+ * Object triples: { subject: "evt-x", predicate: "by", object: "person-y" }
8
+ * → addRelation(db, { sourceId, targetId, relationType })
9
+ *
10
+ * Literal triples: { subject: "evt-x", predicate: "subtype", literal: "order" }
11
+ * → accumulate as entity properties at entity creation time
12
+ *
13
+ * Special literal: { subject: "evt-x", predicate: "rdf:type", literal: "event" }
14
+ * → decides the cc entity type (Person / Event / Concept / ...)
15
+ *
16
+ * Hub's 5 entity kinds map onto cc's 7 with this convention:
17
+ *
18
+ * hub cc notes
19
+ * ───── ──────── ──────
20
+ * person Person direct
21
+ * event Event direct
22
+ * place Concept with properties.hubKind = "place"
23
+ * item Concept with properties.hubKind = "item"
24
+ * topic Concept with properties.hubKind = "topic"
25
+ *
26
+ * Concept is used as a catch-all for hub kinds the cc KG doesn't natively
27
+ * model. The original kind is preserved in properties.hubKind so future cc
28
+ * KG schema upgrades (adding Place / Item / Topic) can re-classify.
29
+ *
30
+ * Like CcLLMAdapter, this bridge uses dependency injection — caller passes
31
+ * addEntity + addRelation + db. The bridge has zero static knowledge of
32
+ * the cc KG module path or module system.
33
+ *
34
+ * Two-pass algorithm:
35
+ *
36
+ * Pass 1 — for every distinct subject:
37
+ * a. collect literal triples into a property bag + identify primary name
38
+ * (first `has-name` wins) + cc type from `rdf:type`
39
+ * b. addEntity(db, { id, name, type, properties }) — caller's addEntity
40
+ * must be idempotent on duplicate id (the cc impl throws "already
41
+ * exists"; we catch that and treat as upsert)
42
+ *
43
+ * Pass 2 — for every object triple, addRelation. Skip if either endpoint
44
+ * wasn't seen in pass 1 (avoids dangling-relation errors from cc KG).
45
+ *
46
+ * Returns { entitiesUpserted, relationsAdded, errors[] } so the registry
47
+ * can audit ingest stats.
48
+ */
49
+
50
+ "use strict";
51
+
52
// Hub entity kind → cc KG entity type. Only person and event have a direct
// cc counterpart; the remaining hub kinds are stored as "Concept".
const HUB_TO_CC_TYPE = Object.freeze({
  person: "Person",
  event: "Event",
  place: "Concept",
  item: "Concept",
  topic: "Concept",
});

// Literal-triple predicates that are stored as entity properties.
// The `id:<kind>` predicate is handled separately (variable suffix).
const PROPERTY_TRIPLE_PREDICATES = new Set([
  "subtype", "occurred-at", "source",
  "amount-value", "amount-currency", "amount-direction",
  "address", "category", "located-at", "priced-at",
  "has-alias", "relation",
]);

// Object-triple predicates that are forwarded as relations.
const OBJECT_TRIPLE_PREDICATES = new Set([
  "by", "involves", "happened-at", "about",
  "topic", "sold-by", "parent", "derived-from",
]);
86
+
87
+ class CcKgSink {
88
+ /**
89
+ * @param {object} deps
90
+ * @param {(db: object, config: object) => object} deps.addEntity cc addEntity
91
+ * @param {(db: object, config: object) => object} deps.addRelation cc addRelation
92
+ * @param {object} [deps.db] cc db handle (forwarded)
93
+ * @param {(label: string, ...args: any[]) => void} [deps.logger] optional logger for non-fatal errors
94
+ */
95
+ constructor(deps) {
96
+ if (!deps || typeof deps !== "object") {
97
+ throw new Error("CcKgSink: deps required");
98
+ }
99
+ if (typeof deps.addEntity !== "function") {
100
+ throw new Error("CcKgSink: deps.addEntity(db, config) required");
101
+ }
102
+ if (typeof deps.addRelation !== "function") {
103
+ throw new Error("CcKgSink: deps.addRelation(db, config) required");
104
+ }
105
+ this._addEntity = deps.addEntity;
106
+ this._addRelation = deps.addRelation;
107
+ this._db = deps.db || null;
108
+ this._log = typeof deps.logger === "function" ? deps.logger : null;
109
+ this._seenEntities = new Set(); // de-dup across calls within process lifetime
110
+ }
111
+
112
+ /**
113
+ * Bound method used as the registry kgSink callback:
114
+ * const sink = new CcKgSink({ ... });
115
+ * new AdapterRegistry({ vault, kgSink: sink.write.bind(sink) });
116
+ */
117
+ async write(triples) {
118
+ if (!Array.isArray(triples) || triples.length === 0) {
119
+ return { entitiesUpserted: 0, relationsAdded: 0, errors: [] };
120
+ }
121
+
122
+ // ─── Group triples by subject ─────────────────────────────────────
123
+ const bySubject = new Map();
124
+ const objectTriples = [];
125
+ for (const t of triples) {
126
+ if (!t || !t.subject || !t.predicate) continue;
127
+ if (typeof t.object === "string") {
128
+ objectTriples.push(t);
129
+ continue;
130
+ }
131
+ if (!bySubject.has(t.subject)) bySubject.set(t.subject, []);
132
+ bySubject.get(t.subject).push(t);
133
+ }
134
+
135
+ const errors = [];
136
+ let entitiesUpserted = 0;
137
+ let relationsAdded = 0;
138
+ const subjectsCreated = new Set();
139
+
140
+ // ─── Pass 1: upsert entities ──────────────────────────────────────
141
+ for (const [subject, subTriples] of bySubject.entries()) {
142
+ let hubType = null;
143
+ let primaryName = null;
144
+ const aliases = [];
145
+ const properties = {};
146
+
147
+ for (const t of subTriples) {
148
+ const pred = t.predicate;
149
+ const lit = t.literal;
150
+ if (pred === "rdf:type") {
151
+ hubType = typeof lit === "string" ? lit : null;
152
+ continue;
153
+ }
154
+ if (pred === "has-name") {
155
+ if (primaryName == null) primaryName = String(lit);
156
+ else aliases.push(String(lit));
157
+ continue;
158
+ }
159
+ if (pred === "has-alias") {
160
+ aliases.push(String(lit));
161
+ continue;
162
+ }
163
+ if (pred.startsWith("id:")) {
164
+ properties[pred] = lit;
165
+ continue;
166
+ }
167
+ if (PROPERTY_TRIPLE_PREDICATES.has(pred)) {
168
+ // Some predicates can repeat (e.g. multiple aliases via separate triples).
169
+ // Stash as array if duplicate.
170
+ if (properties[pred] === undefined) {
171
+ properties[pred] = lit;
172
+ } else if (Array.isArray(properties[pred])) {
173
+ properties[pred].push(lit);
174
+ } else {
175
+ properties[pred] = [properties[pred], lit];
176
+ }
177
+ continue;
178
+ }
179
+ // Unknown predicate — preserve under a `__extra` namespace.
180
+ if (!properties.__extra) properties.__extra = {};
181
+ properties.__extra[pred] = lit;
182
+ }
183
+
184
+ const ccType = HUB_TO_CC_TYPE[hubType] || "Concept";
185
+ properties.hubKind = hubType || "unknown";
186
+ if (aliases.length > 0) properties.aliases = aliases;
187
+ const name = primaryName || subject; // fallback: id as name
188
+
189
+ try {
190
+ this._addEntity(this._db, {
191
+ id: subject,
192
+ name,
193
+ type: ccType,
194
+ properties,
195
+ });
196
+ entitiesUpserted += 1;
197
+ this._seenEntities.add(subject);
198
+ subjectsCreated.add(subject);
199
+ } catch (err) {
200
+ const msg = err && err.message ? err.message : String(err);
201
+ // cc throws "Entity already exists" — treat as success (upsert semantics).
202
+ if (/already exists/i.test(msg)) {
203
+ this._seenEntities.add(subject);
204
+ subjectsCreated.add(subject);
205
+ continue;
206
+ }
207
+ errors.push({ kind: "entity", subject, error: msg });
208
+ if (this._log) this._log("CcKgSink.addEntity failed", subject, msg);
209
+ }
210
+ }
211
+
212
+ // ─── Pass 2: add relations ────────────────────────────────────────
213
+ for (const t of objectTriples) {
214
+ if (!OBJECT_TRIPLE_PREDICATES.has(t.predicate)) {
215
+ // Unknown predicate — record for telemetry; cc KG would reject anyway.
216
+ errors.push({
217
+ kind: "relation",
218
+ subject: t.subject,
219
+ predicate: t.predicate,
220
+ error: "unknown predicate",
221
+ });
222
+ continue;
223
+ }
224
+ // cc requires both endpoints already in the KG. Best-effort: if not
225
+ // in this batch's subjectsCreated AND not in the long-lived _seenEntities,
226
+ // skip with a warning. (Cross-batch references should be rare in
227
+ // practice — KG ingest is per-batch from same sync.)
228
+ if (!this._seenEntities.has(t.subject) || !this._seenEntities.has(t.object)) {
229
+ errors.push({
230
+ kind: "relation",
231
+ subject: t.subject,
232
+ target: t.object,
233
+ predicate: t.predicate,
234
+ error: "endpoint not in KG",
235
+ });
236
+ continue;
237
+ }
238
+ try {
239
+ this._addRelation(this._db, {
240
+ sourceId: t.subject,
241
+ targetId: t.object,
242
+ relationType: t.predicate,
243
+ });
244
+ relationsAdded += 1;
245
+ } catch (err) {
246
+ const msg = err && err.message ? err.message : String(err);
247
+ // Tolerate "already exists" if cc throws it.
248
+ if (/already exists|duplicate/i.test(msg)) continue;
249
+ errors.push({
250
+ kind: "relation",
251
+ subject: t.subject,
252
+ target: t.object,
253
+ predicate: t.predicate,
254
+ error: msg,
255
+ });
256
+ if (this._log) this._log("CcKgSink.addRelation failed", t.subject, t.object, msg);
257
+ }
258
+ }
259
+
260
+ return { entitiesUpserted, relationsAdded, errors };
261
+ }
262
+ }
263
+
264
+ module.exports = { CcKgSink, HUB_TO_CC_TYPE };
@@ -0,0 +1,169 @@
1
+ /**
2
+ * CcLLMAdapter — bridges the hub's LLMClient contract to ChainlessChain's
3
+ * existing llm-manager (or any compatible client).
4
+ *
5
+ * Hub package stays decoupled from cc by INJECTING the cc-specific bits:
6
+ *
7
+ * const llmManager = require("desktop-app-vue/src/main/llm/llm-manager");
8
+ * const adapter = new CcLLMAdapter({
9
+ * chat: (messages, opts) => llmManager.getInstance().chat(messages, opts),
10
+ * getActiveProvider: () => llmManager.getInstance().getActiveProvider(),
11
+ * getActiveModel: () => llmManager.getInstance().getActiveModel(),
12
+ * });
13
+ * // adapter satisfies LLMClient — drop into new AnalysisEngine({ vault, llm: adapter }).
14
+ *
15
+ * Why injection: the hub is CJS + workspace-portable. cli is ESM, desktop
16
+ * main is CJS, future iOS bridge is yet another runtime. Each caller knows
17
+ * how to obtain a `chat(messages, opts)` function in its own module system.
18
+ * The hub just adapts the response shape.
19
+ *
20
+ * Privacy: isLocal is computed from the caller-supplied getActiveProvider().
21
+ * Providers we consider local (no network egress on chat): ollama, llama-cpp,
22
+ * vllm-local, lm-studio. Everything else is non-local — AnalysisEngine will
23
+ * refuse to call unless caller explicitly opts in via acceptNonLocal: true.
24
+ *
25
+ * Response normalization: cc's llm-manager returns slightly different
26
+ * shapes per provider. We coerce to the hub's { text, model, usage }
27
+ * contract, preferring (in order):
28
+ * result.content // llm-manager wraps with .content
29
+ * result.text // some clients
30
+ * result.message.content // raw provider message
31
+ * result.choices[0].message.content // OpenAI-style
32
+ */
33
+
34
+ "use strict";
35
+
36
// Provider names treated as local (on-device) by default. Entries are
// lower-case and include both hyphenated and unhyphenated spellings.
const LOCAL_PROVIDERS = new Set([
  "ollama",
  "llama-cpp", "llamacpp",
  "vllm-local",
  "lm-studio", "lmstudio",
]);
44
+
45
/**
 * Pull the reply text out of a chat result, whatever shape it arrived in.
 * The first string found wins, checked in this order:
 *   result.content → result.text → result.message.content
 *   → result.choices[0].message.content
 *
 * @param {any} result raw value returned by the underlying chat client
 * @returns {string} the reply text, or "" when none of the shapes match
 */
function extractText(result) {
  if (!result || typeof result !== "object") return "";

  const { content, text, message, choices } = result;
  if (typeof content === "string") return content;
  if (typeof text === "string") return text;
  if (message && typeof message.content === "string") return message.content;

  const firstChoice = Array.isArray(choices) ? choices[0] : undefined;
  const choiceText = firstChoice && firstChoice.message ? firstChoice.message.content : undefined;
  return typeof choiceText === "string" ? choiceText : "";
}
62
+
63
/**
 * Normalize a chat result's token usage to the hub's camelCase contract.
 *
 * Accepted field spellings, first present wins:
 *   promptTokens     ← promptTokens | prompt_tokens | input_tokens
 *   completionTokens ← completionTokens | completion_tokens | output_tokens
 *   totalTokens      ← totalTokens | total_tokens | promptTokens + completionTokens
 *
 * The total falls back to the sum of the two NORMALIZED counts, so usage
 * reported only as input_tokens / output_tokens still yields a correct total.
 *
 * @param {any} result raw value returned by the underlying chat client
 * @returns {{ promptTokens: number, completionTokens: number, totalTokens: number }}
 *   all zeros when the result carries no usage object
 */
function extractUsage(result) {
  if (!result || typeof result !== "object" || !result.usage) {
    return { promptTokens: 0, completionTokens: 0, totalTokens: 0 };
  }
  const u = result.usage;
  const promptTokens = u.promptTokens ?? u.prompt_tokens ?? u.input_tokens ?? 0;
  const completionTokens = u.completionTokens ?? u.completion_tokens ?? u.output_tokens ?? 0;
  const totalTokens = u.totalTokens ?? u.total_tokens ?? promptTokens + completionTokens;
  return { promptTokens, completionTokens, totalTokens };
}
79
+
80
/**
 * Adapts an injected chat(messages, opts) function to the hub's LLMClient
 * surface: `name`, `isLocal` and `chat()` returning { text, model, usage, raw }.
 */
class CcLLMAdapter {
  /**
   * @param {object} deps
   * @param {(messages: Array, opts?: object) => Promise<object>} deps.chat
   * @param {() => string} [deps.getActiveProvider]
   * @param {() => string} [deps.getActiveModel]
   * @param {Set<string>|string[]} [deps.localProviders] override the default local-provider
   *   whitelist. Entries are compared against the LOWER-CASED provider name, so
   *   pass them in lower case.
   * @param {string} [deps.name] override the .name surface
   * @throws {Error} when deps or deps.chat is missing
   */
  constructor(deps) {
    if (!deps || typeof deps !== "object") {
      throw new Error("CcLLMAdapter: deps required");
    }
    if (typeof deps.chat !== "function") {
      throw new Error("CcLLMAdapter: deps.chat(messages, opts) required");
    }
    this._chat = deps.chat;
    this._getActiveProvider =
      typeof deps.getActiveProvider === "function" ? deps.getActiveProvider : null;
    this._getActiveModel = typeof deps.getActiveModel === "function" ? deps.getActiveModel : null;

    if (deps.localProviders instanceof Set) {
      this._localProviders = deps.localProviders;
    } else if (Array.isArray(deps.localProviders)) {
      this._localProviders = new Set(deps.localProviders);
    } else {
      this._localProviders = LOCAL_PROVIDERS;
    }
    this._name = deps.name || null;
  }

  /**
   * Display name: the explicit override if given, otherwise
   * "<provider>:<model>", whichever of the two is available, or "cc-llm".
   */
  get name() {
    if (this._name) return this._name;
    const model = this._getActiveModel ? this._tryCall(this._getActiveModel, "model") : null;
    const provider = this._getActiveProvider
      ? this._tryCall(this._getActiveProvider, "provider")
      : null;
    if (provider && model) return `${provider}:${model}`;
    if (model) return model;
    if (provider) return provider;
    return "cc-llm";
  }

  /**
   * Privacy invariant: report whether this LLM keeps data on-device. The
   * AnalysisEngine consults this BEFORE calling chat; non-local clients
   * are refused unless the caller explicitly passes acceptNonLocal: true.
   *
   * Conservative by construction: returns true only when getActiveProvider
   * was supplied, returned a value without throwing, and that value
   * (lower-cased) is in the local-provider set.
   */
  get isLocal() {
    if (!this._getActiveProvider) {
      // No provider accessor → nothing to look up in the whitelist, so assume
      // non-local. Supplying localProviders alone does not change this; the
      // caller must also supply getActiveProvider.
      return false;
    }
    const provider = this._tryCall(this._getActiveProvider, "provider");
    if (!provider) return false;
    return this._localProviders.has(String(provider).toLowerCase());
  }

  /**
   * Forward a chat request and normalize the response.
   *
   * @param {Array} messages chat messages, passed through unchanged
   * @param {object} [opts] passed through unchanged
   * @returns {Promise<{ text: string, model: any, usage: object, raw: any }>}
   *   `model` is the active model when getActiveModel yields one, otherwise
   *   the response's own `.model`; `raw` is the untouched response.
   * @throws {Error} when messages is not an array, or (with `.cause` set to
   *   the original error) when the underlying client fails
   */
  async chat(messages, opts = {}) {
    if (!Array.isArray(messages)) {
      throw new Error("CcLLMAdapter.chat: messages must be an array");
    }
    let result;
    try {
      result = await this._chat(messages, opts);
    } catch (err) {
      const wrapped = new Error(
        `CcLLMAdapter.chat: underlying client failed — ${err && err.message ? err.message : err}`
      );
      wrapped.cause = err;
      throw wrapped;
    }

    // _tryCall yields null when getActiveModel throws; fall back to the
    // model reported by the response rather than returning no model at all.
    const activeModel = this._getActiveModel
      ? this._tryCall(this._getActiveModel, "model")
      : null;
    const model = activeModel != null ? activeModel : result && result.model;

    return {
      text: extractText(result),
      model,
      usage: extractUsage(result),
      raw: result,
    };
  }

  /**
   * Call an injected accessor, mapping any exception to null so a faulty
   * getActiveProvider / getActiveModel cannot abort isLocal, name or chat.
   *
   * @param {() => any} fn accessor to call
   * @param {string} label what is being looked up (currently unused)
   * @returns {any} fn's return value, or null if it threw
   */
  _tryCall(fn, label) {
    try {
      return fn();
    } catch {
      return null;
    }
  }
}
168
+
169
+ module.exports = { CcLLMAdapter, LOCAL_PROVIDERS };
@@ -0,0 +1,118 @@
1
+ /**
2
+ * CcRagSink — feeds hub RagDocs into ChainlessChain's existing BM25 + (later)
3
+ * Qdrant vector store.
4
+ *
5
+ * Hub RagDoc shape: { id, type, text, metadata: { ... } }
6
+ * cc BM25.addDocument(doc) expects: { id, title?, content? } — concatenates
7
+ * title + " " + content for tokenization.
8
+ *
9
+ * We map:
10
+ * doc.id → doc.id
11
+ * doc.metadata.title || doc.metadata.subtype || doc.type → doc.title (short, BM25-prioritized via tokenization)
12
+ * doc.text → doc.content
13
+ *
14
+ * Metadata is also serialized into a `meta` property the BM25 originalDoc
15
+ * field preserves verbatim — that's how the downstream Q&A flow filters
16
+ * hits by adapter / time-window / subtype.
17
+ *
18
+ * Vector store wiring (Qdrant) is intentionally LEFT OUT of this v0 sink:
19
+ * the existing cc embed/qdrant integration is async and IPC-shaped; adding
20
+ * it here would couple us to its lifecycle. v1 sink: BM25 only. The hub
21
+ * caller can add a second sink in parallel for vector indexing once that
22
+ * surface stabilizes.
23
+ *
24
+ * Like the other bridges this is dependency-injected — caller passes
25
+ * the BM25 instance (or any object with .addDocument(doc)).
26
+ */
27
+
28
+ "use strict";
29
+
30
/**
 * Sink that writes hub RagDocs into an injected BM25-style index and,
 * when a vector adapter is supplied, forwards the indexed docs to it.
 */
class CcRagSink {
  /**
   * @param {object} deps
   * @param {{ addDocument: (doc: object) => void }} deps.bm25 cc BM25Search instance
   * @param {{ index: (docs: Array) => Promise<void> }} [deps.vector] optional vector adapter;
   *   when provided, every doc that was added to BM25 in a write() call is
   *   passed to vector.index() in one batch
   * @param {(label: string, ...args: any[]) => void} [deps.logger]
   * @param {(doc: object) => object} [deps.transformDoc] optional pre-write hook; replaces
   *   the default RagDoc → BM25 doc mapping
   * @throws {Error} when deps is missing or deps.bm25 has no addDocument()
   */
  constructor(deps) {
    if (!deps || typeof deps !== "object") {
      throw new Error("CcRagSink: deps required");
    }
    if (!deps.bm25 || typeof deps.bm25.addDocument !== "function") {
      throw new Error("CcRagSink: deps.bm25 with .addDocument(doc) required");
    }
    this._bm25 = deps.bm25;
    this._vector = deps.vector && typeof deps.vector.index === "function" ? deps.vector : null;
    this._log = typeof deps.logger === "function" ? deps.logger : null;
    this._transform = typeof deps.transformDoc === "function" ? deps.transformDoc : null;
    // Ids already added by this instance; used to skip re-ingested docs.
    this._writtenIds = new Set();
  }

  /**
   * Index a batch of RagDocs. Bound to .write(docs) for use as the
   * registry's ragSink callback.
   *
   * Docs without an id or with empty / non-string text are skipped, as are
   * ids this instance has already written. A failure while transforming or
   * adding a single doc is recorded in `errors` and does not stop the batch.
   *
   * @param {Array<{ id: string, type?: string, text: string, metadata?: object }>} docs
   * @returns {Promise<{ indexed: number, skipped: number, errors: Array<object> }>}
   */
  async write(docs) {
    if (!Array.isArray(docs) || docs.length === 0) {
      return { indexed: 0, skipped: 0, errors: [] };
    }
    let indexed = 0;
    let skipped = 0;
    const errors = [];
    const forVector = [];

    for (const d of docs) {
      if (!d || !d.id || typeof d.text !== "string" || d.text.length === 0) {
        skipped += 1;
        continue;
      }
      // De-dup within process lifetime — BM25.addDocument doesn't dedup
      // internally; a re-ingest would otherwise double-count term frequencies.
      if (this._writtenIds.has(d.id)) {
        skipped += 1;
        continue;
      }

      // The transform runs inside the try as well: transformDoc is
      // caller-supplied, and one doc it chokes on must not abort the batch.
      let stage = "transformDoc";
      try {
        const doc = this._transform ? this._transform(d) : this._toBm25Doc(d);
        stage = "addDocument";
        this._bm25.addDocument(doc);
        this._writtenIds.add(d.id);
        indexed += 1;
        if (this._vector) forVector.push(d);
      } catch (err) {
        const msg = err && err.message ? err.message : String(err);
        errors.push({ id: d.id, error: msg });
        if (this._log) this._log(`CcRagSink.${stage} failed`, d.id, msg);
      }
    }

    if (this._vector && forVector.length > 0) {
      try {
        await this._vector.index(forVector);
      } catch (err) {
        // Vector indexing is best-effort: the BM25 writes above stand.
        const msg = err && err.message ? err.message : String(err);
        errors.push({ phase: "vector", error: msg });
        if (this._log) this._log("CcRagSink.vector.index failed", msg);
      }
    }

    return { indexed, skipped, errors };
  }

  /**
   * Default RagDoc → BM25 document mapping.
   * Title is metadata.title, else metadata.subtype, else the doc type, else "".
   *
   * @param {{ id: string, type?: string, text: string, metadata?: object }} d
   * @returns {{ id: string, title: string, content: string, meta: object, hubType: any }}
   */
  _toBm25Doc(d) {
    const title =
      (d.metadata && (d.metadata.title || d.metadata.subtype)) ||
      d.type ||
      "";
    return {
      id: d.id,
      title: String(title),
      content: d.text,
      // Metadata travels on the indexed doc so downstream code can filter hits by it.
      meta: d.metadata || {},
      hubType: d.type,
    };
  }
}
117
+
118
+ module.exports = { CcRagSink };