@chainlesschain/personal-data-hub 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +241 -0
- package/__tests__/adapter-spec.test.js +78 -0
- package/__tests__/adapters/email-adapter.test.js +605 -0
- package/__tests__/adapters/email-imap-session.test.js +334 -0
- package/__tests__/adapters/email-parser.test.js +244 -0
- package/__tests__/adapters/email-providers.test.js +84 -0
- package/__tests__/analysis.test.js +302 -0
- package/__tests__/batch.test.js +133 -0
- package/__tests__/bridges-cc-kg.test.js +231 -0
- package/__tests__/bridges-cc-llm.test.js +191 -0
- package/__tests__/bridges-cc-rag.test.js +162 -0
- package/__tests__/ids.test.js +45 -0
- package/__tests__/key-providers.test.js +126 -0
- package/__tests__/kg-derive.test.js +219 -0
- package/__tests__/llm-client.test.js +122 -0
- package/__tests__/mock-adapter.test.js +93 -0
- package/__tests__/prompt-builder.test.js +204 -0
- package/__tests__/query-parser.test.js +150 -0
- package/__tests__/rag-derive.test.js +169 -0
- package/__tests__/registry.test.js +304 -0
- package/__tests__/schemas.test.js +331 -0
- package/__tests__/vault.test.js +506 -0
- package/lib/adapter-spec.js +155 -0
- package/lib/adapters/email-imap/email-adapter.js +398 -0
- package/lib/adapters/email-imap/email-parser.js +177 -0
- package/lib/adapters/email-imap/imap-session.js +294 -0
- package/lib/adapters/email-imap/index.js +26 -0
- package/lib/adapters/email-imap/providers.js +111 -0
- package/lib/analysis.js +226 -0
- package/lib/batch.js +123 -0
- package/lib/bridges/cc-kg-sink.js +264 -0
- package/lib/bridges/cc-llm-adapter.js +169 -0
- package/lib/bridges/cc-rag-sink.js +118 -0
- package/lib/bridges/index.js +44 -0
- package/lib/constants.js +92 -0
- package/lib/ids.js +103 -0
- package/lib/index.js +141 -0
- package/lib/key-providers.js +146 -0
- package/lib/kg-derive.js +214 -0
- package/lib/llm-client.js +171 -0
- package/lib/migrations.js +246 -0
- package/lib/mock-adapter.js +199 -0
- package/lib/prompt-builder.js +205 -0
- package/lib/query-parser.js +250 -0
- package/lib/rag-derive.js +186 -0
- package/lib/registry.js +398 -0
- package/lib/schemas.js +379 -0
- package/lib/vault.js +883 -0
- package/package.json +63 -0
- package/vitest.config.js +10 -0
package/lib/batch.js
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* NormalizedBatch — what every adapter.normalize() returns to the ingestor
|
|
3
|
+
*
|
|
4
|
+
* Mirrors §9.1 of docs/design/Personal_Data_Hub_Architecture.md:
|
|
5
|
+
* interface NormalizedBatch {
|
|
6
|
+
* events: Event[];
|
|
7
|
+
* persons: Person[];
|
|
8
|
+
* places: Place[];
|
|
9
|
+
* items: Item[];
|
|
10
|
+
* topics?: Topic[];
|
|
11
|
+
* }
|
|
12
|
+
*
|
|
13
|
+
* Helpers here let adapters build batches incrementally + ingestors validate
|
|
14
|
+
* a whole batch in one call (so a single bad row doesn't kill the whole sync).
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
"use strict";
|
|
18
|
+
|
|
19
|
+
const { validate } = require("./schemas");
|
|
20
|
+
|
|
21
|
+
/**
 * Create a fresh NormalizedBatch with an empty array for every entity kind.
 *
 * @returns {{ events: Array, persons: Array, places: Array, items: Array, topics: Array }}
 *   a new object (with new arrays) on every call, so callers can push into it freely
 */
function emptyBatch() {
  const batch = {};
  for (const kind of ["events", "persons", "places", "items", "topics"]) {
    batch[kind] = [];
  }
  return batch;
}
|
|
24
|
+
|
|
25
|
+
/**
 * Concatenate two batches kind-by-kind into a new batch (a's entities first).
 *
 * Neither input is mutated. A missing (null/undefined) batch, or a missing
 * per-kind array, is treated as empty so callers can fold over optional
 * adapter results without pre-checking them.
 *
 * @param {object|null|undefined} a first batch
 * @param {object|null|undefined} b second batch
 * @returns {{ events: Array, persons: Array, places: Array, items: Array, topics: Array }}
 */
function mergeBatches(a, b) {
  // Previously a null/undefined argument threw a TypeError on property access.
  const left = a ?? {};
  const right = b ?? {};
  return {
    events: [...(left.events || []), ...(right.events || [])],
    persons: [...(left.persons || []), ...(right.persons || [])],
    places: [...(left.places || []), ...(right.places || [])],
    items: [...(left.items || []), ...(right.items || [])],
    topics: [...(left.topics || []), ...(right.topics || [])],
  };
}
|
|
34
|
+
|
|
35
|
+
/**
 * Validate every entity in a batch without throwing. Returns:
 *   {
 *     valid: boolean,        // true only when no error was recorded
 *     entityCount: number,   // entities inspected across all kinds
 *     errorCount: number,    // number of entries in `errors`
 *     errors: Array<{ kind, index, id?, errors: string[] }>,
 *   }
 *
 * Structural problems (the batch itself, or a kind that is present but not an
 * array) are reported with index -1. Per-entity problems carry the entity's
 * position within its kind array and, when available, its id.
 *
 * Stated ingestor policy: if errorCount > 0, log the bad rows but still ingest
 * the good ones, so one corrupt row from a third-party adapter does not abort
 * the whole sync window.
 *
 * @param {object} batch candidate NormalizedBatch
 * @returns {{ valid: boolean, entityCount: number, errorCount: number, errors: Array }}
 */
function validateBatch(batch) {
  // Arrays satisfy `typeof === "object"`; without the explicit check an array
  // would be reported as a valid, empty batch.
  if (batch == null || typeof batch !== "object" || Array.isArray(batch)) {
    return {
      valid: false,
      entityCount: 0,
      errorCount: 1,
      errors: [{ kind: "batch", index: -1, errors: ["batch must be a plain object"] }],
    };
  }

  const errors = [];
  let entityCount = 0;

  const kinds = ["events", "persons", "places", "items", "topics"];
  for (const kind of kinds) {
    const arr = batch[kind];
    // An absent kind is fine; only `undefined` counts as absent here.
    if (arr === undefined) continue;
    if (!Array.isArray(arr)) {
      errors.push({ kind, index: -1, errors: [`${kind} must be an array when present`] });
      continue;
    }
    arr.forEach((entity, i) => {
      entityCount += 1;
      const result = validate(entity);
      if (!result.valid) {
        errors.push({
          kind,
          index: i,
          id: entity && typeof entity === "object" ? entity.id : undefined,
          errors: result.errors,
        });
      }
    });
  }

  return {
    valid: errors.length === 0,
    entityCount,
    errorCount: errors.length,
    errors,
  };
}
|
|
90
|
+
|
|
91
|
+
/**
 * Partition a batch into "valid" and "invalid" sub-batches.
 * Lets the ingestor commit valids + spool invalids to a review queue.
 *
 * Never throws on malformed input: a batch that is not a plain object, or a
 * kind that is present but not an array, is reported in `invalidReasons` with
 * index -1 (the same convention validateBatch uses) instead of crashing or
 * being dropped silently. Falsy kind values (undefined, null, ...) are treated
 * as "no entities of that kind".
 *
 * @param {object} batch candidate NormalizedBatch
 * @returns {{ valid: object, invalid: object,
 *             invalidReasons: Array<{ kind, index, id?, errors: string[] }> }}
 */
function partitionBatch(batch) {
  const valid = emptyBatch();
  const invalid = emptyBatch();
  const invalidReasons = [];

  if (batch == null || typeof batch !== "object" || Array.isArray(batch)) {
    invalidReasons.push({ kind: "batch", index: -1, errors: ["batch must be a plain object"] });
    return { valid, invalid, invalidReasons };
  }

  const kinds = ["events", "persons", "places", "items", "topics"];
  for (const kind of kinds) {
    const arr = batch[kind] || [];
    if (!Array.isArray(arr)) {
      // Nothing can be partitioned, but the caller should learn that data was skipped.
      invalidReasons.push({ kind, index: -1, errors: [`${kind} must be an array when present`] });
      continue;
    }
    arr.forEach((entity, i) => {
      const result = validate(entity);
      if (result.valid) {
        valid[kind].push(entity);
      } else {
        invalid[kind].push(entity);
        invalidReasons.push({ kind, index: i, id: entity?.id, errors: result.errors });
      }
    });
  }

  return { valid, invalid, invalidReasons };
}
|
|
117
|
+
|
|
118
|
+
module.exports = {
|
|
119
|
+
emptyBatch,
|
|
120
|
+
mergeBatches,
|
|
121
|
+
validateBatch,
|
|
122
|
+
partitionBatch,
|
|
123
|
+
};
|
|
@@ -0,0 +1,264 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* CcKgSink — translates hub KG triples (subject/predicate/object|literal)
|
|
3
|
+
* into ChainlessChain's existing knowledge-graph addEntity + addRelation API.
|
|
4
|
+
*
|
|
5
|
+
* Hub triples come in two shapes:
|
|
6
|
+
*
|
|
7
|
+
* Object triples: { subject: "evt-x", predicate: "by", object: "person-y" }
|
|
8
|
+
* → addRelation(db, { sourceId, targetId, relationType })
|
|
9
|
+
*
|
|
10
|
+
* Literal triples: { subject: "evt-x", predicate: "subtype", literal: "order" }
|
|
11
|
+
* → accumulate as entity properties at entity creation time
|
|
12
|
+
*
|
|
13
|
+
* Special literal: { subject: "evt-x", predicate: "rdf:type", literal: "event" }
|
|
14
|
+
* → decides the cc entity type (Person / Event / Concept / ...)
|
|
15
|
+
*
|
|
16
|
+
* Hub's 5 entity kinds map onto cc's 7 with this convention:
|
|
17
|
+
*
|
|
18
|
+
* hub cc notes
|
|
19
|
+
* ───── ──────── ──────
|
|
20
|
+
* person Person direct
|
|
21
|
+
* event Event direct
|
|
22
|
+
* place Concept with properties.hubKind = "place"
|
|
23
|
+
* item Concept with properties.hubKind = "item"
|
|
24
|
+
* topic Concept with properties.hubKind = "topic"
|
|
25
|
+
*
|
|
26
|
+
* Concept is used as a catch-all for hub kinds the cc KG doesn't natively
|
|
27
|
+
* model. The original kind is preserved in properties.hubKind so future cc
|
|
28
|
+
* KG schema upgrades (adding Place / Item / Topic) can re-classify.
|
|
29
|
+
*
|
|
30
|
+
* Like CcLLMAdapter, this bridge uses dependency injection — caller passes
|
|
31
|
+
* addEntity + addRelation + db. The bridge has zero static knowledge of
|
|
32
|
+
* the cc KG module path or module system.
|
|
33
|
+
*
|
|
34
|
+
* Two-pass algorithm:
|
|
35
|
+
*
|
|
36
|
+
* Pass 1 — for every distinct subject that has at least one literal triple:
|
|
37
|
+
* a. collect literal triples into a property bag + identify primary name
|
|
38
|
+
* (first `has-name` wins) + cc type from `rdf:type`
|
|
39
|
+
* b. addEntity(db, { id, name, type, properties }) — caller's addEntity
|
|
40
|
+
* must be idempotent on duplicate id (the cc impl throws "already
|
|
41
|
+
* exists"; we catch that and treat as upsert)
|
|
42
|
+
*
|
|
43
|
+
* Pass 2 — for every object triple, addRelation. Skip if either endpoint
|
|
44
|
+
* wasn't seen in pass 1 (avoids dangling-relation errors from cc KG).
|
|
45
|
+
*
|
|
46
|
+
* Returns { entitiesUpserted, relationsAdded, errors[] } so the registry
|
|
47
|
+
* can audit ingest stats.
|
|
48
|
+
*/
|
|
49
|
+
|
|
50
|
+
"use strict";
|
|
51
|
+
|
|
52
|
+
/**
 * Hub entity kind (the literal of an `rdf:type` triple) → cc KG entity type.
 * Kinds without a dedicated cc type are stored as "Concept".
 */
const HUB_TO_CC_TYPE = Object.freeze({
  person: "Person",
  event: "Event",
  place: "Concept",
  item: "Concept",
  topic: "Concept",
});

/**
 * Literal-triple predicates that are copied onto the entity's property bag.
 * `id:<kind>` predicates are not listed here because their suffix varies;
 * they are matched by prefix instead.
 */
const PROPERTY_TRIPLE_PREDICATES = new Set([
  "subtype",
  "occurred-at",
  "source",
  "amount-value",
  "amount-currency",
  "amount-direction",
  "address",
  "category",
  "located-at",
  "priced-at",
  "has-alias",
  "relation",
]);

/** Object-triple predicates that are forwarded as relations between two entities. */
const OBJECT_TRIPLE_PREDICATES = new Set([
  "by",
  "involves",
  "happened-at",
  "about",
  "topic",
  "sold-by",
  "parent",
  "derived-from",
]);
|
|
86
|
+
|
|
87
|
+
class CcKgSink {
  /**
   * @param {object} deps
   * @param {(db: object, config: object) => object} deps.addEntity  cc addEntity (sync or async)
   * @param {(db: object, config: object) => object} deps.addRelation  cc addRelation (sync or async)
   * @param {object} [deps.db]  cc db handle (forwarded as the first argument; null when omitted)
   * @param {(label: string, ...args: any[]) => void} [deps.logger]  optional logger for non-fatal errors
   * @throws {Error} when deps is missing, or addEntity / addRelation is not a function
   */
  constructor(deps) {
    if (!deps || typeof deps !== "object") {
      throw new Error("CcKgSink: deps required");
    }
    if (typeof deps.addEntity !== "function") {
      throw new Error("CcKgSink: deps.addEntity(db, config) required");
    }
    if (typeof deps.addRelation !== "function") {
      throw new Error("CcKgSink: deps.addRelation(db, config) required");
    }
    this._addEntity = deps.addEntity;
    this._addRelation = deps.addRelation;
    this._db = deps.db || null;
    this._log = typeof deps.logger === "function" ? deps.logger : null;
    // Ids of every entity this sink has created (or found already existing).
    // Kept for the lifetime of the instance so later batches can reference them.
    this._seenEntities = new Set();
  }

  /**
   * Write a batch of hub triples into the KG. Intended to be bound and used as
   * the registry kgSink callback:
   *   const sink = new CcKgSink({ ... });
   *   new AdapterRegistry({ vault, kgSink: sink.write.bind(sink) });
   *
   * Pass 1 creates one entity per subject that has at least one literal triple;
   * pass 2 adds a relation per object triple whose endpoints are both known.
   * Failures of individual entities/relations are collected, never thrown.
   *
   * @param {Array<{ subject: string, predicate: string, object?: string, literal?: any }>} triples
   * @returns {Promise<{ entitiesUpserted: number, relationsAdded: number, errors: Array<object> }>}
   *   entitiesUpserted counts only successful addEntity calls ("already exists"
   *   is tolerated but not counted).
   */
  async write(triples) {
    if (!Array.isArray(triples) || triples.length === 0) {
      return { entitiesUpserted: 0, relationsAdded: 0, errors: [] };
    }

    // ─── Split: object triples become relations, the rest group by subject ───
    const bySubject = new Map();
    const objectTriples = [];
    for (const t of triples) {
      if (!t || !t.subject || !t.predicate) continue;
      if (typeof t.object === "string") {
        objectTriples.push(t);
        continue;
      }
      if (!bySubject.has(t.subject)) bySubject.set(t.subject, []);
      bySubject.get(t.subject).push(t);
    }

    const errors = [];
    let entitiesUpserted = 0;
    let relationsAdded = 0;

    // ─── Pass 1: upsert entities ──────────────────────────────────────
    for (const [subject, subTriples] of bySubject.entries()) {
      const config = this._buildEntityConfig(subject, subTriples);
      try {
        // Awaited so that an async addEntity's rejection lands in this catch
        // instead of escaping as an unhandled rejection after being counted.
        await this._addEntity(this._db, config);
        entitiesUpserted += 1;
        this._seenEntities.add(subject);
      } catch (err) {
        const msg = err && err.message ? err.message : String(err);
        // cc throws "Entity already exists" — the entity is usable as a
        // relation endpoint, so remember it and move on.
        if (/already exists/i.test(msg)) {
          this._seenEntities.add(subject);
          continue;
        }
        errors.push({ kind: "entity", subject, error: msg });
        if (this._log) this._log("CcKgSink.addEntity failed", subject, msg);
      }
    }

    // ─── Pass 2: add relations ────────────────────────────────────────
    for (const t of objectTriples) {
      if (!OBJECT_TRIPLE_PREDICATES.has(t.predicate)) {
        // Unknown predicate — record for telemetry rather than forwarding it.
        errors.push({
          kind: "relation",
          subject: t.subject,
          predicate: t.predicate,
          error: "unknown predicate",
        });
        continue;
      }
      // Both endpoints must be known to this sink: created in pass 1 of this
      // call or remembered from an earlier call. Otherwise skip and report.
      if (!this._seenEntities.has(t.subject) || !this._seenEntities.has(t.object)) {
        errors.push({
          kind: "relation",
          subject: t.subject,
          target: t.object,
          predicate: t.predicate,
          error: "endpoint not in KG",
        });
        continue;
      }
      try {
        await this._addRelation(this._db, {
          sourceId: t.subject,
          targetId: t.object,
          relationType: t.predicate,
        });
        relationsAdded += 1;
      } catch (err) {
        const msg = err && err.message ? err.message : String(err);
        // Tolerate "already exists" / "duplicate" if cc throws it.
        if (/already exists|duplicate/i.test(msg)) continue;
        errors.push({
          kind: "relation",
          subject: t.subject,
          target: t.object,
          predicate: t.predicate,
          error: msg,
        });
        if (this._log) this._log("CcKgSink.addRelation failed", t.subject, t.object, msg);
      }
    }

    return { entitiesUpserted, relationsAdded, errors };
  }

  /**
   * Fold one subject's literal triples into the config object passed to addEntity.
   *
   * @param {string} subject entity id
   * @param {Array<{ predicate: any, literal?: any }>} subTriples literal triples of that subject
   * @returns {{ id: string, name: string, type: string, properties: object }}
   */
  _buildEntityConfig(subject, subTriples) {
    let hubType = null;
    let primaryName = null;
    const aliases = [];
    const properties = {};

    for (const t of subTriples) {
      const pred = t.predicate;
      const lit = t.literal;
      if (pred === "rdf:type") {
        hubType = typeof lit === "string" ? lit : null;
        continue;
      }
      if (pred === "has-name" || pred === "has-alias") {
        // A triple without a literal would otherwise become the string "undefined".
        if (lit == null) continue;
        // First has-name wins as the primary name; every other name is an alias.
        if (pred === "has-name" && primaryName == null) primaryName = String(lit);
        else aliases.push(String(lit));
        continue;
      }
      // typeof guard: a truthy non-string predicate must not throw on startsWith.
      if (typeof pred === "string" && pred.startsWith("id:")) {
        properties[pred] = lit;
        continue;
      }
      if (PROPERTY_TRIPLE_PREDICATES.has(pred)) {
        // A repeated predicate is promoted to an array of its literals.
        if (properties[pred] === undefined) {
          properties[pred] = lit;
        } else if (Array.isArray(properties[pred])) {
          properties[pred].push(lit);
        } else {
          properties[pred] = [properties[pred], lit];
        }
        continue;
      }
      // Unknown predicate — preserve under a `__extra` namespace.
      if (!properties.__extra) properties.__extra = {};
      properties.__extra[pred] = lit;
    }

    // Own-property check: a plain lookup would resolve inherited keys such as
    // "constructor" or "toString" to functions instead of a cc type name.
    const isKnownKind =
      typeof hubType === "string" && Object.prototype.hasOwnProperty.call(HUB_TO_CC_TYPE, hubType);
    const ccType = isKnownKind ? HUB_TO_CC_TYPE[hubType] : "Concept";
    properties.hubKind = hubType || "unknown";
    if (aliases.length > 0) properties.aliases = aliases;

    return {
      id: subject,
      name: primaryName || subject, // fallback: id as name
      type: ccType,
      properties,
    };
  }
}
|
|
263
|
+
|
|
264
|
+
module.exports = { CcKgSink, HUB_TO_CC_TYPE };
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* CcLLMAdapter — bridges the hub's LLMClient contract to ChainlessChain's
|
|
3
|
+
* existing llm-manager (or any compatible client).
|
|
4
|
+
*
|
|
5
|
+
* Hub package stays decoupled from cc by INJECTING the cc-specific bits:
|
|
6
|
+
*
|
|
7
|
+
* const llmManager = require("desktop-app-vue/src/main/llm/llm-manager");
|
|
8
|
+
* const adapter = new CcLLMAdapter({
|
|
9
|
+
* chat: (messages, opts) => llmManager.getInstance().chat(messages, opts),
|
|
10
|
+
* getActiveProvider: () => llmManager.getInstance().getActiveProvider(),
|
|
11
|
+
* getActiveModel: () => llmManager.getInstance().getActiveModel(),
|
|
12
|
+
* });
|
|
13
|
+
* // adapter satisfies LLMClient — drop into new AnalysisEngine({ vault, llm: adapter }).
|
|
14
|
+
*
|
|
15
|
+
* Why injection: the hub is CJS + workspace-portable. cli is ESM, desktop
|
|
16
|
+
* main is CJS, future iOS bridge is yet another runtime. Each caller knows
|
|
17
|
+
* how to obtain a `chat(messages, opts)` function in its own module system.
|
|
18
|
+
* The hub just adapts the response shape.
|
|
19
|
+
*
|
|
20
|
+
* Privacy: isLocal is computed from the caller-supplied getActiveProvider().
|
|
21
|
+
* Providers we consider local (no network egress on chat): ollama, llama-cpp,
|
|
22
|
+
* vllm-local, lm-studio. Everything else is non-local — AnalysisEngine will
|
|
23
|
+
* refuse to call unless caller explicitly opts in via acceptNonLocal: true.
|
|
24
|
+
*
|
|
25
|
+
* Response normalization: cc's llm-manager returns slightly different
|
|
26
|
+
* shapes per provider. We coerce to the hub's { text, model, usage }
|
|
27
|
+
* contract, preferring (in order):
|
|
28
|
+
* result.content // llm-manager wraps with .content
|
|
29
|
+
* result.text // some clients
|
|
30
|
+
* result.message.content // raw provider message
|
|
31
|
+
* result.choices[0].message.content // OpenAI-style
|
|
32
|
+
*/
|
|
33
|
+
|
|
34
|
+
"use strict";
|
|
35
|
+
|
|
36
|
+
/**
 * Default provider names (lowercase) that the adapter reports as local.
 * Hyphenated and unhyphenated spellings are both listed.
 */
const LOCAL_PROVIDERS = new Set([
  "ollama",
  "llama-cpp", "llamacpp",
  "vllm-local",
  "lm-studio", "lmstudio",
]);
|
|
44
|
+
|
|
45
|
+
/**
 * Pull the reply text out of a chat result, whichever shape it has.
 *
 * The first of these that is a string wins, in this order:
 *   result.content, result.text, result.message.content,
 *   result.choices[0].message.content
 *
 * @param {any} result raw value returned by the injected chat function
 * @returns {string} the reply text, or "" when no string is found
 */
function extractText(result) {
  if (!result || typeof result !== "object") return "";

  const firstChoice = Array.isArray(result.choices) ? result.choices[0] : undefined;
  const candidates = [
    result.content,
    result.text,
    result.message ? result.message.content : undefined,
    firstChoice && firstChoice.message ? firstChoice.message.content : undefined,
  ];

  const hit = candidates.find((candidate) => typeof candidate === "string");
  return hit === undefined ? "" : hit;
}
|
|
62
|
+
|
|
63
|
+
/**
 * Normalize a chat result's token usage to the hub's camelCase contract.
 *
 * Accepted spellings on result.usage, first non-nullish wins:
 *   prompt:     promptTokens | prompt_tokens | input_tokens
 *   completion: completionTokens | completion_tokens | output_tokens
 *   total:      totalTokens | total_tokens, else prompt + completion
 *
 * @param {any} result raw value returned by the injected chat function
 * @returns {{ promptTokens: number, completionTokens: number, totalTokens: number }}
 *   all zeros when the result carries no usage
 */
function extractUsage(result) {
  if (!result || typeof result !== "object" || !result.usage) {
    return { promptTokens: 0, completionTokens: 0, totalTokens: 0 };
  }
  const u = result.usage;
  const promptTokens = u.promptTokens ?? u.prompt_tokens ?? u.input_tokens ?? 0;
  const completionTokens = u.completionTokens ?? u.completion_tokens ?? u.output_tokens ?? 0;
  return {
    promptTokens,
    completionTokens,
    // Derive the total from the resolved counts so input_tokens / output_tokens
    // usage no longer yields a total of 0.
    totalTokens: u.totalTokens ?? u.total_tokens ?? (promptTokens + completionTokens),
  };
}
|
|
79
|
+
|
|
80
|
+
class CcLLMAdapter {
  /**
   * @param {object} deps
   * @param {(messages: Array, opts?: object) => Promise<object>} deps.chat  underlying chat call
   * @param {() => string} [deps.getActiveProvider]  reports the provider currently in use
   * @param {() => string} [deps.getActiveModel]  reports the model currently in use
   * @param {Set<string>|string[]} [deps.localProviders]  override the default local-provider
   *   whitelist; entries are matched case-insensitively
   * @param {string} [deps.name]  override the .name surface
   * @throws {Error} when deps is missing or deps.chat is not a function
   */
  constructor(deps) {
    if (!deps || typeof deps !== "object") {
      throw new Error("CcLLMAdapter: deps required");
    }
    if (typeof deps.chat !== "function") {
      throw new Error("CcLLMAdapter: deps.chat(messages, opts) required");
    }
    this._chat = deps.chat;
    this._getActiveProvider = typeof deps.getActiveProvider === "function" ? deps.getActiveProvider : null;
    this._getActiveModel = typeof deps.getActiveModel === "function" ? deps.getActiveModel : null;

    const custom =
      deps.localProviders instanceof Set || Array.isArray(deps.localProviders)
        ? deps.localProviders
        : null;
    // isLocal lowercases the active provider before the lookup, so a custom
    // list is lowercased as well; otherwise "Ollama" could never match.
    this._localProviders = custom
      ? new Set([...custom].map((provider) => String(provider).toLowerCase()))
      : LOCAL_PROVIDERS;
    this._name = deps.name || null;
  }

  /**
   * Display name: the explicit deps.name if given, otherwise built from the
   * active provider/model ("provider:model", or whichever one is available),
   * falling back to "cc-llm".
   *
   * @returns {string}
   */
  get name() {
    if (this._name) return this._name;
    const model = this._getActiveModel ? this._tryCall(this._getActiveModel, "model") : null;
    const provider = this._getActiveProvider
      ? this._tryCall(this._getActiveProvider, "provider")
      : null;
    if (provider && model) return `${provider}:${model}`;
    if (model) return model;
    if (provider) return provider;
    return "cc-llm";
  }

  /**
   * Privacy invariant: report whether the active provider is in the local
   * whitelist. Per the module's stated design, callers consult this before
   * calling chat and refuse non-local clients unless explicitly opted in.
   *
   * @returns {boolean} false whenever the provider cannot be determined
   */
  get isLocal() {
    if (!this._getActiveProvider) {
      // No provider info → conservative: assume non-local. Supplying
      // localProviders alone does not change this; getActiveProvider is
      // required for the adapter to ever report itself as local.
      return false;
    }
    const provider = this._tryCall(this._getActiveProvider, "provider");
    if (!provider) return false;
    return this._localProviders.has(String(provider).toLowerCase());
  }

  /**
   * Run a chat completion through the injected client and normalize the reply.
   *
   * @param {Array} messages chat messages, forwarded unchanged
   * @param {object} [opts] forwarded unchanged to the injected chat function
   * @returns {Promise<{ text: string, model: any, usage: object, raw: any }>}
   * @throws {Error} when messages is not an array, or when the underlying
   *   client fails (the original error is attached as `.cause`)
   */
  async chat(messages, opts = {}) {
    if (!Array.isArray(messages)) {
      throw new Error("CcLLMAdapter.chat: messages must be an array");
    }
    let result;
    try {
      result = await this._chat(messages, opts);
    } catch (err) {
      const wrapped = new Error(
        `CcLLMAdapter.chat: underlying client failed — ${err && err.message ? err.message : err}`
      );
      wrapped.cause = err;
      throw wrapped;
    }

    let model = result && result.model;
    if (this._getActiveModel) {
      const active = this._tryCall(this._getActiveModel, "model");
      // Prefer the caller-reported model, but when that lookup throws or
      // returns nothing, fall back to the model named in the response itself.
      model = active ?? (result && result.model != null ? result.model : active);
    }

    return {
      text: extractText(result),
      model,
      usage: extractUsage(result),
      raw: result,
    };
  }

  /**
   * Call an injected getter, converting a throw into null so that name /
   * isLocal / chat never fail because of it.
   *
   * @param {() => any} fn getter to invoke
   * @param {string} label what is being looked up (currently unused)
   * @returns {any} fn's return value, or null if it threw
   */
  _tryCall(fn, label) {
    try {
      return fn();
    } catch (err) {
      // Deliberate best-effort: callers fall through to their defaults.
      return null;
    }
  }
}
|
|
168
|
+
|
|
169
|
+
module.exports = { CcLLMAdapter, LOCAL_PROVIDERS };
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* CcRagSink — feeds hub RagDocs into ChainlessChain's existing BM25 + (later)
|
|
3
|
+
* Qdrant vector store.
|
|
4
|
+
*
|
|
5
|
+
* Hub RagDoc shape: { id, type, text, metadata: { ... } }
|
|
6
|
+
* cc BM25.addDocument(doc) expects: { id, title?, content? } — concatenates
|
|
7
|
+
* title + " " + content for tokenization.
|
|
8
|
+
*
|
|
9
|
+
* We map:
|
|
10
|
+
* doc.id → doc.id
|
|
11
|
+
* doc.metadata.title || doc.metadata.subtype || doc.type → doc.title (short, BM25-prioritized via tokenization)
|
|
12
|
+
* doc.text → doc.content
|
|
13
|
+
*
|
|
14
|
+
* Metadata is also serialized into a `meta` property the BM25 originalDoc
|
|
15
|
+
* field preserves verbatim — that's how the downstream Q&A flow filters
|
|
16
|
+
* hits by adapter / time-window / subtype.
|
|
17
|
+
*
|
|
18
|
+
* Vector store wiring (Qdrant) is intentionally LEFT OUT of this v0 sink:
|
|
19
|
+
* the existing cc embed/qdrant integration is async and IPC-shaped; adding
|
|
20
|
+
* it here would couple us to its lifecycle. v1 sink: BM25 only. The hub
|
|
21
|
+
* caller can add a second sink in parallel for vector indexing once that
|
|
22
|
+
* surface stabilizes.
|
|
23
|
+
*
|
|
24
|
+
* Like the other bridges this is dependency-injected — caller passes
|
|
25
|
+
* the BM25 instance (or any object with .addDocument(doc)).
|
|
26
|
+
*/
|
|
27
|
+
|
|
28
|
+
"use strict";
|
|
29
|
+
|
|
30
|
+
class CcRagSink {
  /**
   * @param {object} deps
   * @param {{ addDocument: (doc: object) => void }} deps.bm25  index to write to (sync or async addDocument)
   * @param {{ index: (docs: Array) => Promise<void> }} [deps.vector]  optional vector indexer; when
   *   present it receives the hub docs that were indexed successfully
   * @param {(label: string, ...args: any[]) => void} [deps.logger]  optional logger for non-fatal errors
   * @param {(doc: object) => object} [deps.transformDoc]  optional hook that replaces the default
   *   hub-doc → index-doc mapping
   * @throws {Error} when deps is missing or deps.bm25.addDocument is not a function
   */
  constructor(deps) {
    if (!deps || typeof deps !== "object") {
      throw new Error("CcRagSink: deps required");
    }
    if (!deps.bm25 || typeof deps.bm25.addDocument !== "function") {
      throw new Error("CcRagSink: deps.bm25 with .addDocument(doc) required");
    }
    this._bm25 = deps.bm25;
    this._vector = deps.vector && typeof deps.vector.index === "function" ? deps.vector : null;
    this._log = typeof deps.logger === "function" ? deps.logger : null;
    this._transform = typeof deps.transformDoc === "function" ? deps.transformDoc : null;
    // Ids already written by this instance; used to skip re-ingested docs.
    this._writtenIds = new Set();
  }

  /**
   * Index a batch of hub RagDocs. Intended to be bound and used as the
   * registry's ragSink callback. Per-document failures are collected in
   * `errors` and never abort the rest of the batch.
   *
   * @param {Array<{ id: any, type?: string, text: string, metadata?: object }>} docs
   * @returns {Promise<{ indexed: number, skipped: number, errors: Array<object> }>}
   *   skipped counts docs without an id / non-empty text and docs whose id was
   *   already written by this instance
   */
  async write(docs) {
    if (!Array.isArray(docs) || docs.length === 0) {
      return { indexed: 0, skipped: 0, errors: [] };
    }
    let indexed = 0;
    let skipped = 0;
    const errors = [];
    const forVector = [];

    for (const d of docs) {
      if (!d || !d.id || typeof d.text !== "string" || d.text.length === 0) {
        skipped += 1;
        continue;
      }
      // De-dup within the instance's lifetime. Per the original author's note,
      // BM25.addDocument does not dedup, so a re-ingest would double-count
      // term frequencies.
      if (this._writtenIds.has(d.id)) {
        skipped += 1;
        continue;
      }

      try {
        // The transform hook runs inside the try so a throwing hook costs one
        // document, not the whole batch.
        const doc = this._transform ? this._transform(d) : this._toBm25Doc(d);
        // Awaited so an async addDocument's rejection is recorded here.
        await this._bm25.addDocument(doc);
        this._writtenIds.add(d.id);
        indexed += 1;
        if (this._vector) forVector.push(d);
      } catch (err) {
        const msg = err && err.message ? err.message : String(err);
        errors.push({ id: d.id, error: msg });
        if (this._log) this._log("CcRagSink.addDocument failed", d.id, msg);
      }
    }

    if (this._vector && forVector.length > 0) {
      try {
        await this._vector.index(forVector);
      } catch (err) {
        const msg = err && err.message ? err.message : String(err);
        errors.push({ phase: "vector", error: msg });
        if (this._log) this._log("CcRagSink.vector.index failed", msg);
      }
    }

    return { indexed, skipped, errors };
  }

  /**
   * Default mapping from a hub RagDoc to the document handed to addDocument.
   * The title is metadata.title, else metadata.subtype, else the doc type,
   * else "".
   *
   * @param {{ id: any, type?: string, text: string, metadata?: object }} d
   * @returns {{ id: any, title: string, content: string, meta: object, hubType: any }}
   */
  _toBm25Doc(d) {
    const title =
      (d.metadata && (d.metadata.title || d.metadata.subtype)) ||
      d.type ||
      "";
    return {
      id: d.id,
      title: String(title),
      content: d.text,
      // Metadata travels with the indexed doc so downstream code can filter on it.
      meta: d.metadata || {},
      hubType: d.type,
    };
  }
}
|
|
117
|
+
|
|
118
|
+
module.exports = { CcRagSink };
|