@directory-builder/core 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +112 -0
  3. package/bin/cli.js +38 -0
  4. package/example/README.md +64 -0
  5. package/example/config/federation.ttl +136 -0
  6. package/example/config/match-knowledge.ttl +8 -0
  7. package/example/sources/cityopen/clean.sparql +17 -0
  8. package/example/sources/cityopen/fetch.js +14 -0
  9. package/example/sources/cityopen/static/libraries.json +32 -0
  10. package/example/sources/civichub/clean.sparql +34 -0
  11. package/example/sources/civichub/fetch.js +14 -0
  12. package/example/sources/civichub/static/libraries.json +38 -0
  13. package/package.json +38 -0
  14. package/src/federate.js +571 -0
  15. package/src/index.js +6 -0
  16. package/src/ingest.js +158 -0
  17. package/src/lift/html.sparql +12 -0
  18. package/src/lift/json.sparql +12 -0
  19. package/src/pipeline.js +16 -0
  20. package/src/utils.js +152 -0
  21. package/src/webapp.js +41 -0
  22. package/webapp/index.html +11 -0
  23. package/webapp/src/About.jsx +24 -0
  24. package/webapp/src/App.jsx +96 -0
  25. package/webapp/src/Card.jsx +32 -0
  26. package/webapp/src/ColumnGraph.jsx +290 -0
  27. package/webapp/src/Directory.jsx +15 -0
  28. package/webapp/src/Download.jsx +174 -0
  29. package/webapp/src/MapGraph.jsx +244 -0
  30. package/webapp/src/MatchGraph.jsx +137 -0
  31. package/webapp/src/MergeTables.jsx +61 -0
  32. package/webapp/src/OrgCard.jsx +126 -0
  33. package/webapp/src/Pipeline.jsx +41 -0
  34. package/webapp/src/Query.jsx +165 -0
  35. package/webapp/src/Sources.jsx +52 -0
  36. package/webapp/src/instanceData.js +35 -0
  37. package/webapp/src/loadMap.js +276 -0
  38. package/webapp/src/loadMatch.js +228 -0
  39. package/webapp/src/loadMerge.js +93 -0
  40. package/webapp/src/loadPipeline.js +130 -0
  41. package/webapp/src/loadSources.js +102 -0
  42. package/webapp/src/main.jsx +9 -0
  43. package/webapp/src/mergeOrgs.js +15 -0
  44. package/webapp/src/sourceMeta.js +81 -0
  45. package/webapp/src/styles.css +23 -0
  46. package/webapp/vite.config.js +14 -0
  47. package/webapp/vite.js +28 -0
@@ -0,0 +1,571 @@
1
+ import { newStore, parser as n3Parser, sparqlConstruct, sparqlInsertDelete, sparqlSelect, storeFromTurtles } from "@foerderfunke/sem-ops-utils"
2
+ import { buildPrefixBlock, CDP, objectsOf, parseTtl, PATHS, shrink, sourceGraph, sourceName, stepIri, stepJournal } from "./utils.js"
3
+ import { token_set_ratio } from "fuzzball"
4
+ import { DataFactory, Writer } from "n3"
5
+ import { createHash } from "crypto"
6
+ import path from "path"
7
+ import fs from "fs"
8
+
9
+ const df = DataFactory
10
+
11
// Serialise quads to a Turtle file at `filePath`.
//
// Every input quad is re-created as a graph-less triple (graph names are
// dropped) and routed through a fresh store, which is relied on to collapse
// duplicates. The stored triples are then sorted by subject value so the Writer
// receives all statements of one subject back to back and can emit grouped
// "subject p1 o1; p2 o2." blocks instead of repeating the subject.
//
//   filePath  destination file; missing parent directories are created
//   quads     iterable of terms exposing subject / predicate / object
//   prefixes  prefix map handed to the Writer (defaults to none)
//
// Returns a promise that resolves once the file is on disk. It rejects on a
// serialisation error and also on a file-system error: the directory/file
// calls run inside the Writer's completion callback, so they are guarded
// explicitly — a throw from an asynchronously invoked callback would otherwise
// surface as an uncaught exception and leave the promise pending forever.
const writeTurtleFile = (filePath, quads, prefixes = {}) => new Promise((resolve, reject) => {
  const store = newStore()
  for (const q of quads) store.addQuad(df.quad(q.subject, q.predicate, q.object))
  const dedup = store.getQuads(null, null, null, null)
    .sort((a, b) => a.subject.value.localeCompare(b.subject.value))
  const writer = new Writer({ prefixes })
  for (const q of dedup) writer.addQuad(q)
  writer.end((err, result) => {
    if (err) return reject(err)
    try {
      fs.mkdirSync(path.dirname(filePath), { recursive: true })
      fs.writeFileSync(filePath, result)
    } catch (ioErr) {
      return reject(ioErr)
    }
    resolve()
  })
})
28
+
29
+ // ---- Direct-mapping generator ------------------------------------------
30
+
31
// Namespace of the xyz: predicates the generated queries read from.
const XYZ = "http://sparql.xyz/facade-x/data/"

// Assemble the SPARQL update that copies a mapping's direct field mappings
// from its source graph into <urn:mapped>.
//
//   first argument   one mapping row: sourceGraph, source, targetClass, target
//   fields           rows with fieldPath, predicate and (for sub-fields) parentPath
//
// Returns the query text; nothing is executed here.
const buildDirectInsert = ({ sourceGraph, source, targetClass, target }, fields) => {
  const prefixes = {
    xyz: XYZ,
    cdp: CDP,
    cdf: "https://civic-data.de/federated-directory#",
    schema: "http://schema.org/",
    foaf: "http://xmlns.com/foaf/0.1/",
    dct: "http://purl.org/dc/terms/",
  }

  // shrink() hands the IRI back untouched when no prefix applies; those have
  // to be written in <…> form.
  const term = (iri) => {
    const compact = shrink(iri, prefixes)
    return compact === iri ? `<${iri}>` : compact
  }

  // Each field path doubles as the name of its SPARQL variable.
  const variable = (fieldPath) => `?${fieldPath}`

  // The emptiness guard goes through STR() so it holds for any literal
  // datatype — comparing e.g. an xsd:int directly against "" raises an error
  // and the field would be dropped silently.
  const optionalLiteral = (subject, fieldPath) => {
    const bound = variable(fieldPath)
    return `OPTIONAL { ${subject} xyz:${fieldPath} ${bound} . ` +
      `FILTER(isLiteral(${bound}) && STR(${bound}) != "") }`
  }

  // Subjects are picked via cdp:fromSource. When the mapping names a target
  // schema, subjects tagged with a different cdp:targetSchema are filtered
  // out, while untagged subjects still match.
  const wherePatterns = [`?org cdp:fromSource ${term(source)} .`]
  if (target) {
    wherePatterns.push(`OPTIONAL { ?org cdp:targetSchema ?_ts }`)
    wherePatterns.push(`FILTER(!bound(?_ts) || ?_ts = ${term(target)})`)
  }

  // One pass over the fields: every field contributes an INSERT line,
  // top-level fields contribute their OPTIONAL right away, sub-fields are
  // collected per parent and emitted afterwards.
  const insertLines = []
  const subFieldsByParent = new Map()
  for (const field of fields) {
    insertLines.push(` ?org ${term(field.predicate)} ${variable(field.fieldPath)} .`)
    if (!field.parentPath) {
      wherePatterns.push(optionalLiteral("?org", field.fieldPath))
      continue
    }
    const siblings = subFieldsByParent.get(field.parentPath)
    if (siblings) siblings.push(field)
    else subFieldsByParent.set(field.parentPath, [field])
  }

  // Each parent gets its own helper variable (?_p0, ?_p1, …) that the
  // sub-field OPTIONALs hang off.
  Array.from(subFieldsByParent).forEach(([parentPath, children], index) => {
    const parentVar = `?_p${index}`
    const inner = children
      .map(child => ` ${optionalLiteral(parentVar, child.fieldPath)}`)
      .join("\n")
    wherePatterns.push(`OPTIONAL {\n ?org xyz:${parentPath} ${parentVar} .\n${inner}\n }`)
  })

  // The target schema's class becomes the record's rdf:type in the mapped graph.
  const typeClause = targetClass ? `a ${term(targetClass)} ; ` : ""

  return [
    buildPrefixBlock(prefixes),
    "",
    "INSERT {",
    "GRAPH <urn:mapped> {",
    `?org ${typeClause}cdp:fromSource ${term(source)} .`,
    insertLines.join("\n"),
    "}",
    "} WHERE {",
    `GRAPH <${sourceGraph}> {`,
    wherePatterns.join("\n "),
    "}",
    "}",
  ].join("\n")
}
105
+
106
// Map step. For every :Mapping in the definition store this
//   1. generates, saves (under queriesDir) and runs the direct-mapping INSERT,
//      provided the mapping has direct field mappings and a :sourceGraph;
//   2. runs each :via transform script of the mapping's source;
// and finally turns every :hasRelationship into a link INSERT on <urn:mapped>.
// All updates are applied to `store`; `abs` resolves instance-relative paths.
const runMap = async ({ store, defStore, abs }, queriesDir) => {
  const select = (query) => sparqlSelect(query, [defStore])
  const localNameOf = (iri) => iri.split("#").pop()

  const mappings = await select(`
    PREFIX : <${CDP}>
    SELECT ?mapping ?source ?sourceGraph ?target ?targetClass WHERE {
      ?mapping a :Mapping ;
        :fromSource ?source .
      OPTIONAL { ?mapping :sourceGraph ?sourceGraph }
      OPTIONAL { ?mapping :toTarget ?target }
      OPTIONAL { ?mapping :toTarget/:targetClass ?targetClass }
    } ORDER BY ?mapping`)

  for (const mapping of mappings) {
    // Direct field mappings are the ones without a :via transform.
    const directFields = await select(`
      PREFIX : <${CDP}>
      SELECT ?fieldPath ?predicate ?parentPath WHERE {
        <${mapping.mapping}> :hasFieldMapping ?fm .
        ?fm :from ?src ; :to ?tgt .
        FILTER NOT EXISTS { ?fm :via ?_v }
        ?tgt :targetPredicate ?predicate .
        ?src :fieldPath ?fieldPath .
        OPTIONAL { ?parent :hasSubField ?src . ?parent :fieldPath ?parentPath }
      }`)

    if (directFields.length > 0 && mapping.sourceGraph) {
      const localName = localNameOf(mapping.mapping)
      const update = buildDirectInsert(mapping, directFields)
      const updatePath = abs(path.join(queriesDir, `${localName}.sparql`))
      // The generated query is kept on disk next to being executed.
      fs.mkdirSync(path.dirname(updatePath), { recursive: true })
      fs.writeFileSync(updatePath, update)
      console.log(`map ${localName} direct (${directFields.length} mappings) → ${updatePath}`)
      await sparqlInsertDelete(update, store)
    }

    // :via names a transform of the mapping's source; PATHS.transform derives
    // the script location from the source name and the :via value.
    const transforms = await select(`
      PREFIX : <${CDP}>
      SELECT DISTINCT ?via WHERE {
        <${mapping.mapping}> :hasFieldMapping/:via ?via .
      } ORDER BY ?via`)

    for (const { via } of transforms) {
      const script = PATHS.transform(sourceName(mapping.source), via)
      console.log(`map ${script}`)
      await sparqlInsertDelete(fs.readFileSync(abs(script), "utf8"), store)
    }
  }

  // Relationships: a source-level link predicate becomes a target predicate,
  // pairing the two ends by their cdp:targetSchema. Both ends are still source
  // IRIs at this point; the merge step rewrites them to cluster IRIs.
  const relationships = await select(`
    PREFIX : <${CDP}>
    SELECT ?mapping ?sourceGraph ?fromSchema ?sourcePredicate ?targetPredicate ?toSchema WHERE {
      ?mapping a :Mapping ;
        :sourceGraph ?sourceGraph ;
        :toTarget ?fromSchema ;
        :hasRelationship ?rel .
      ?rel :sourcePredicate ?sourcePredicate ;
        :toTargetField ?field ;
        :toTargetSchema ?toSchema .
      ?field :targetPredicate ?targetPredicate .
    } ORDER BY ?mapping`)

  const linkPrefixes = { cdp: CDP, schema: "http://schema.org/" }
  const term = (iri) => {
    const compact = shrink(iri, linkPrefixes)
    return compact === iri ? `<${iri}>` : compact
  }

  for (const link of relationships) {
    const update = `${buildPrefixBlock(linkPrefixes)}

      INSERT {
        GRAPH <urn:mapped> {
          ?from ${term(link.targetPredicate)} ?to .
        }
      } WHERE {
        GRAPH <${link.sourceGraph}> {
          ?from ${term(link.sourcePredicate)} ?to ;
            cdp:targetSchema ${term(link.fromSchema)} .
          ?to cdp:targetSchema ${term(link.toSchema)} .
        }
      }`
    console.log(`map ${localNameOf(link.mapping)} link (${term(link.targetPredicate)})`)
    await sparqlInsertDelete(update, store)
  }
}
191
+
192
+ // ---- Shared graphs and prefixes ----------------------------------------
193
+
194
// Named graphs the pipeline steps write into, one per stage.
const [MAPPED_GRAPH, MATCH_GRAPH, MERGED_GRAPH] =
  ["urn:mapped", "urn:matched", "urn:merged"].map(iri => df.namedNode(iri))

// Predicate linking a cluster node to each of its members.
const HAS_MEMBER = df.namedNode(`${CDP}hasMember`)

// Vocabulary prefixes offered to the Turtle writer for the step outputs.
const COMMON_PREFIXES = {
  schema: "http://schema.org/",
  foaf: "http://xmlns.com/foaf/0.1/",
  dct: "http://purl.org/dc/terms/",
}
205
+
206
+ // ---- Match -------------------------------------------------------------
207
+
208
const RDF_TYPE = df.namedNode("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")
const MATCH_CLUSTER = df.namedNode(`${CDP}MatchCluster`)

// Name of the scoring algorithm. It is written into the match evidence so
// recorded similarity numbers remain interpretable if the algorithm is ever
// swapped.
const SIMILARITY_ALGORITHM = "token_set_ratio"

// String similarity in the range 0–1. fuzzball's token_set_ratio scores on a
// 0–100 scale from the intersection of the two token sets, which tolerates
// legal-form noise ("gGmbH", "e.V."), sub-unit specifiers and word-order
// variations; the result is divided by 100. Missing (null/undefined) inputs
// are scored as empty strings.
const similarity = (a, b) => {
  const left = a ?? ""
  const right = b ?? ""
  return token_set_ratio(left, right) / 100
}
218
+
219
// Match step. For every :MatchRule in the definition store, the subjects of the
// rule's :targetClass in <urn:mapped> are clustered:
//   - owl:sameAs assertions whose two ends are both in the subject set are
//     unioned first (recorded as manual evidence);
//   - every remaining pair is scored: all hard criteria must be present and
//     identical, every weighted criterion must be present on both sides and
//     clear its optional :minSimilarity, and the weighted sum must clear the
//     rule's :minScore. A rule without :minScore applies no aggregate
//     threshold; a rule without any criteria unions nothing.
// Each cluster gets a minted IRI (namespace + prefix + first 12 hex chars of
// the sha1 over its sorted member IRIs) and is written, together with the
// per-pair evidence, into <urn:matched>. Finally the whole match graph is
// serialised to `outPath`.
//
// Throws when no :MatchRule is configured, or when a :minScore, :weight or
// :minSimilarity value is not a parseable number.
const runMatch = async ({ store, defStore, abs }, outPath) => {
  // Config numerals arrive as strings. parseFloat() yields NaN for anything it
  // cannot parse and every comparison against NaN is false, which would
  // switch the affected gate off without a trace — fail loudly instead.
  const numeric = (raw, what) => {
    const parsed = parseFloat(raw)
    if (Number.isNaN(parsed)) throw new Error(`Invalid ${what} value in match config: ${raw}`)
    return parsed
  }

  // One match rule per target schema; each rule scores its own fields, mints
  // with its own prefix, and clusters only subjects of its :targetClass.
  const rules = await sparqlSelect(`
    PREFIX : <${CDP}>
    SELECT ?match ?targetClass ?ns ?prefix ?minScore WHERE {
      ?match a :MatchRule ;
        :forTarget ?target ;
        :targetNamespace ?ns ;
        :mintedSubjectPrefix ?prefix .
      ?target :targetClass ?targetClass .
      OPTIONAL { ?match :minScore ?minScore }
    } ORDER BY ?match`, [defStore])
  if (!rules.length) throw new Error(":MatchRule config missing in federation.ttl")

  const criteriaRows = await sparqlSelect(`
    PREFIX : <${CDP}>
    SELECT ?match ?on ?weight ?minSim WHERE {
      ?match a :MatchRule ; :hasWeightedCriterion ?c .
      ?c :on ?on ; :weight ?weight .
      OPTIONAL { ?c :minSimilarity ?minSim }
    }`, [defStore])
  // Hard criteria: fields that must be identical in both records (pass/fail gates).
  const hardRows = await sparqlSelect(`
    PREFIX : <${CDP}>
    SELECT ?match ?on WHERE {
      ?match a :MatchRule ; :hasHardCriterion ?h . ?h :on ?on .
    }`, [defStore])

  // Criteria keyed by their owning rule, so each pass scores on its own fields.
  const criteriaByMatch = new Map()
  for (const r of criteriaRows) {
    if (!criteriaByMatch.has(r.match)) criteriaByMatch.set(r.match, [])
    criteriaByMatch.get(r.match).push({
      pred: df.namedNode(r.on),
      weight: numeric(r.weight, ":weight"),
      minSim: r.minSim != null ? numeric(r.minSim, ":minSimilarity") : null,
    })
  }
  const hardByMatch = new Map()
  for (const r of hardRows) {
    if (!hardByMatch.has(r.match)) hardByMatch.set(r.match, [])
    hardByMatch.get(r.match).push({ pred: df.namedNode(r.on) })
  }

  // owl:sameAs assertions are shared; each pass only acts on the pairs whose
  // endpoints are in its own subject set (gated by parent.has below).
  const sameAsRows = await sparqlSelect(`
    PREFIX owl: <http://www.w3.org/2002/07/owl#>
    SELECT ?a ?b WHERE { ?a owl:sameAs ?b }`, [defStore])

  // Evidence vocabulary.
  const cdp = (localName) => df.namedNode(CDP + localName)
  const MATCH_EVIDENCE = cdp("MatchEvidence")
  const HAS_MATCH_EVIDENCE = cdp("hasMatchEvidence")
  const PAIR = cdp("pair")
  const ON_CRITERION = cdp("onCriterion")
  const ON = cdp("on")
  const SIMILARITY = cdp("similarity")
  const SIM_ALGORITHM = cdp("similarityAlgorithm")
  const WEIGHT = cdp("weight")
  const VALUE_A = cdp("valueA")
  const VALUE_B = cdp("valueB")
  const AGGREGATE_SCORE = cdp("aggregateScore")
  const VIA_MANUAL_MATCH = cdp("viaManualMatch")
  const XSD_DECIMAL = df.namedNode("http://www.w3.org/2001/XMLSchema#decimal")
  const XSD_BOOLEAN = df.namedNode("http://www.w3.org/2001/XMLSchema#boolean")

  for (const rule of rules) {
    const namespace = rule.ns
    const mintedPrefix = rule.prefix
    // null = rule declares no :minScore, i.e. no aggregate threshold.
    const minScore = rule.minScore != null ? numeric(rule.minScore, ":minScore") : null
    const hard = hardByMatch.get(rule.match) ?? []
    const weighted = criteriaByMatch.get(rule.match) ?? []

    // Subjects of this rule's target class only — passes never cross types.
    const subjects = [...new Set(store.getQuads(null, RDF_TYPE, df.namedNode(rule.targetClass), MAPPED_GRAPH)
      .filter(qu => qu.subject.termType === "NamedNode")
      .map(qu => qu.subject.value))]

    // First value found for (subject, predicate) in the mapped graph, or null.
    const valOf = (s, pred) => {
      const qs = store.getQuads(df.namedNode(s), pred, null, MAPPED_GRAPH)
      return qs.length ? qs[0].object.value : null
    }
    // Criterion values are looked up once per subject, not once per pair.
    const hardVals = new Map(subjects.map(s => [s, hard.map(h => valOf(s, h.pred))]))
    const weightedVals = new Map(subjects.map(s => [s, weighted.map(c => valOf(s, c.pred))]))

    // Returns the per-criterion scores and their aggregate when the pair
    // matches, null otherwise.
    const matches = (a, b) => {
      if (!hard.length && !weighted.length) return null
      const ha = hardVals.get(a), hb = hardVals.get(b)
      for (let i = 0; i < hard.length; i++) {
        if (ha[i] == null || hb[i] == null || ha[i] !== hb[i]) return null
      }
      const va = weightedVals.get(a), vb = weightedVals.get(b)
      const scores = []
      let weightedSum = 0
      for (let i = 0; i < weighted.length; i++) {
        if (va[i] == null || vb[i] == null) return null
        const c = weighted[i]
        const sim = similarity(va[i], vb[i])
        if (c.minSim != null && sim < c.minSim) return null
        scores.push({ pred: c.pred, sim, weight: c.weight, valueA: va[i], valueB: vb[i] })
        weightedSum += sim * c.weight
      }
      if (weighted.length && minScore != null && weightedSum < minScore) return null
      return { scores, aggregate: weightedSum }
    }

    // Union-find over the subject IRIs; find() compresses the path it walked.
    const parent = new Map(subjects.map(s => [s, s]))
    const find = (x) => {
      let r = x
      while (parent.get(r) !== r) r = parent.get(r)
      let c = x
      while (parent.get(c) !== r) { const n = parent.get(c); parent.set(c, r); c = n }
      return r
    }
    const union = (a, b) => {
      const ra = find(a), rb = find(b)
      if (ra !== rb) parent.set(ra, rb)
    }

    const evidence = []
    let sameAsUnions = 0
    for (const { a, b } of sameAsRows) {
      if (parent.has(a) && parent.has(b)) {
        union(a, b)
        sameAsUnions++
        evidence.push({ a, b, manual: true })
      }
    }

    // Every unordered pair is scored.
    for (let i = 0; i < subjects.length; i++) {
      for (let j = i + 1; j < subjects.length; j++) {
        const m = matches(subjects[i], subjects[j])
        if (m) {
          union(subjects[i], subjects[j])
          evidence.push({ a: subjects[i], b: subjects[j], ...m })
        }
      }
    }

    const clusters = new Map()
    for (const s of subjects) {
      const root = find(s)
      if (!clusters.has(root)) clusters.set(root, [])
      clusters.get(root).push(s)
    }
    // Members sorted within a cluster, clusters ordered largest first and then
    // by first member, so ids and output order do not depend on scan order.
    const clusterMembers = [...clusters.values()]
      .map(m => [...m].sort())
      .sort((a, b) => b.length - a.length || a[0].localeCompare(b[0]))

    let multiSource = 0
    const clusterIriByRoot = new Map()
    for (const members of clusterMembers) {
      const id = createHash("sha1").update(members.join("|")).digest("hex").slice(0, 12)
      const minted = df.namedNode(namespace + mintedPrefix + id)
      clusterIriByRoot.set(find(members[0]), minted)
      if (members.length > 1) multiSource++
      store.addQuad(df.quad(minted, RDF_TYPE, MATCH_CLUSTER, MATCH_GRAPH))
      for (const s of members) {
        store.addQuad(df.quad(minted, HAS_MEMBER, df.namedNode(s), MATCH_GRAPH))
      }
    }

    // One evidence node per recorded pair, attached to the pair's cluster.
    for (const ev of evidence) {
      const evNode = df.blankNode()
      const cluster = clusterIriByRoot.get(find(ev.a))
      store.addQuad(df.quad(cluster, HAS_MATCH_EVIDENCE, evNode, MATCH_GRAPH))
      store.addQuad(df.quad(evNode, RDF_TYPE, MATCH_EVIDENCE, MATCH_GRAPH))
      store.addQuad(df.quad(evNode, PAIR, df.namedNode(ev.a), MATCH_GRAPH))
      store.addQuad(df.quad(evNode, PAIR, df.namedNode(ev.b), MATCH_GRAPH))
      if (ev.manual) {
        store.addQuad(df.quad(evNode, VIA_MANUAL_MATCH, df.literal("true", XSD_BOOLEAN), MATCH_GRAPH))
      } else {
        store.addQuad(df.quad(evNode, AGGREGATE_SCORE, df.literal(ev.aggregate.toFixed(3), XSD_DECIMAL), MATCH_GRAPH))
        store.addQuad(df.quad(evNode, SIM_ALGORITHM, df.literal(SIMILARITY_ALGORITHM), MATCH_GRAPH))
        for (const s of ev.scores) {
          const cNode = df.blankNode()
          store.addQuad(df.quad(evNode, ON_CRITERION, cNode, MATCH_GRAPH))
          store.addQuad(df.quad(cNode, ON, s.pred, MATCH_GRAPH))
          store.addQuad(df.quad(cNode, SIMILARITY, df.literal(s.sim.toFixed(3), XSD_DECIMAL), MATCH_GRAPH))
          store.addQuad(df.quad(cNode, WEIGHT, df.literal(s.weight.toFixed(2), XSD_DECIMAL), MATCH_GRAPH))
          store.addQuad(df.quad(cNode, VALUE_A, df.literal(s.valueA), MATCH_GRAPH))
          store.addQuad(df.quad(cNode, VALUE_B, df.literal(s.valueB), MATCH_GRAPH))
        }
      }
    }

    console.log(`match: ${rule.match.split("#").pop()} ${subjects.length} entities → ${clusters.size} clusters (${multiSource} multi-source, ${sameAsUnions} sameAs unions)`)
  }

  const matchQuads = store.getQuads(null, null, null, MATCH_GRAPH)
  await writeTurtleFile(abs(outPath), matchQuads, { cdp: CDP, cdf: rules[0].ns, ...COMMON_PREFIXES })
  console.log(`match: wrote cluster log → ${outPath}`)
}
407
+
408
+ // ---- Merge -------------------------------------------------------------
409
+
410
// Merge step. Every <urn:mapped> statement whose subject is a cluster member
// is re-stated about that member's cluster IRI in <urn:merged>; IRI objects
// that are cluster members themselves are rewritten the same way, so links
// between entities point at the merged entity. For each re-stated triple a
// provenance annotation (quoted triple, origin predicate, source subject) is
// collected. Writes the merged graph to `outPath` and the annotations to
// `provOutPath`. Throws when the :MatchRule / :MergeRule config is missing.
const runMerge = async ({ store, defStore, abs }, outPath, provOutPath) => {
  const [config] = await sparqlSelect(`
    PREFIX : <${CDP}>
    SELECT ?ns ?originPred WHERE {
      ?match a :MatchRule ; :targetNamespace ?ns .
      ?merge a :MergeRule ; :originPredicate ?originPred .
    }`, [defStore])
  if (!config) throw new Error(":MergeRule / :MatchRule config missing in federation.ttl")
  const namespace = config.ns
  const originPredicate = df.namedNode(config.originPred)

  // member IRI → cluster node, taken from the match graph.
  const clusterOf = new Map(
    store.getQuads(null, HAS_MEMBER, null, MATCH_GRAPH)
      .map(({ subject, object }) => [object.value, subject]),
  )
  const toCluster = (term) => {
    if (term.termType === "NamedNode" && clusterOf.has(term.value)) return clusterOf.get(term.value)
    return term
  }

  const provenance = []
  for (const source of store.getQuads(null, null, null, MAPPED_GRAPH)) {
    const cluster = clusterOf.get(source.subject.value)
    if (!cluster) continue // subject was never clustered
    const object = toCluster(source.object)
    store.addQuad(df.quad(cluster, source.predicate, object, MERGED_GRAPH))
    const restated = df.quad(cluster, source.predicate, object)
    provenance.push(df.quad(restated, originPredicate, source.subject))
  }

  const merged = store.getQuads(null, null, null, MERGED_GRAPH)
  const prefixes = { ...COMMON_PREFIXES, cdp: CDP, cdf: namespace }

  await writeTurtleFile(abs(outPath), merged, prefixes)
  console.log(`merge: wrote ${merged.length} triples → ${outPath}`)

  await writeTurtleFile(abs(provOutPath), provenance, { ...prefixes, prov: "http://www.w3.org/ns/prov#" })
  console.log(`merge: wrote ${provenance.length} provenance annotations → ${provOutPath}`)
}
451
+
452
+ // ---- Resolve -----------------------------------------------------------
453
+
454
// Resolve strategies: each receives all quads of one (subject, predicate)
// group and returns the single quad that survives.
const STRATEGIES = {
  // Keep the quad whose object value sorts first (localeCompare order).
  alphabeticFirst(quads) {
    const ordered = [...quads]
    ordered.sort((x, y) => x.object.value.localeCompare(y.object.value))
    return ordered[0]
  },
  // Join the distinct object values, sorted, into one ", "-separated literal.
  concatenateAll(quads) {
    const [{ subject, predicate }] = quads
    const distinct = [...new Set(quads.map(q => q.object.value))].sort()
    return df.quad(subject, predicate, df.literal(distinct.join(", ")))
  },
}

// Predicates left out of the resolved output: final.ttl is the consumer-facing
// artifact, source attribution lives in provenance.ttl.
const RESOLVE_EXCLUDE = new Set(["http://schema.org/identifier", `${CDP}fromSource`])
463
+
464
// Resolve a strategy IRI to its implementation in STRATEGIES, keyed by the
// part of the IRI after the last "#". Throws for an unknown strategy.
const lookupStrategy = (iri) => {
  const name = iri.split("#").pop()
  // Own-property check: a bare STRATEGIES[name] would also find members
  // inherited from Object.prototype ("toString", "constructor", …) and return
  // them as if they were strategies.
  if (!Object.prototype.hasOwnProperty.call(STRATEGIES, name)) {
    throw new Error(`Unknown resolve strategy ${iri}`)
  }
  return STRATEGIES[name]
}
469
+
470
// Resolve step. Groups the <urn:merged> quads by (subject, predicate), skips
// the excluded predicates, and reduces every group to one quad using the
// override strategy registered for its predicate or, failing that, the
// rule's default strategy. The result is written to `outPath`. Throws when
// the :ResolveRule config is missing or names an unknown strategy.
const runResolve = async ({ store, defStore, abs }, outPath) => {
  const [config] = await sparqlSelect(`
    PREFIX : <${CDP}>
    SELECT ?strategy ?ns WHERE {
      ?resolve a :ResolveRule ; :defaultStrategy ?strategy .
      ?match a :MatchRule ; :targetNamespace ?ns .
    }`, [defStore])
  if (!config) throw new Error(":ResolveRule config missing in federation.ttl")
  const fallback = lookupStrategy(config.strategy)

  const overrideRows = await sparqlSelect(`
    PREFIX : <${CDP}>
    SELECT ?on ?strategy WHERE {
      ?resolve a :ResolveRule ; :hasOverride [ :on ?on ; :strategy ?strategy ] .
    }`, [defStore])
  // predicate IRI → strategy function
  const strategyFor = new Map()
  for (const { on, strategy } of overrideRows) strategyFor.set(on, lookupStrategy(strategy))

  // Map keeps insertion order, so groups come out in first-seen order.
  const buckets = new Map()
  for (const quad of store.getQuads(null, null, null, MERGED_GRAPH)) {
    const predicate = quad.predicate.value
    if (RESOLVE_EXCLUDE.has(predicate)) continue
    const key = `${quad.subject.value}\t${predicate}`
    const bucket = buckets.get(key)
    if (bucket) bucket.push(quad)
    else buckets.set(key, [quad])
  }

  const finalQuads = []
  for (const bucket of buckets.values()) {
    const pick = strategyFor.get(bucket[0].predicate.value) ?? fallback
    finalQuads.push(pick(bucket))
  }

  await writeTurtleFile(abs(outPath), finalQuads, { ...COMMON_PREFIXES, cdf: config.ns })
  console.log(`resolve: wrote ${finalQuads.length} triples → ${outPath}`)
}
500
+
501
// ---- Federate engine -----------------------------------------------------
// Runs the federation for the instance rooted at `root` (default: the current
// working directory): clean every source, load the cleaned output, then
// map → match → merge → resolve. The step order is fixed by the engine; the
// config only declares the sources, which are read from :hasSource. All paths
// come from PATHS and are resolved against `root`. Every step goes through the
// journal, whose record is written out at the end (the webapp's Pipeline page
// renders it). Clean steps name the other engine's lift steps as predecessors,
// referenced by their conventional stepIri.

export async function federate(root = process.cwd()) {
  const abs = (p) => path.join(root, p)
  const readText = (relativePath) => fs.readFileSync(abs(relativePath), "utf8")

  const federationTtl = readText(PATHS.federation)
  const defStore = storeFromTurtles([federationTtl, readText(PATHS.matchKnowledge)])
  const sources = objectsOf(parseTtl(federationTtl), `${CDP}hasSource`)

  const store = newStore()
  const journal = stepJournal()
  const ctx = { store, defStore, abs }

  // Clean one source: run its clean CONSTRUCT over every lifted .ttl file and
  // write the combined result to the source's cleaned file.
  const cleanSource = async (name) => {
    const cleanQuery = readText(PATHS.cleanQuery(name))
    const inDir = PATHS.lifted(name)
    const outPath = PATHS.cleaned(name)
    const inAbs = abs(inDir)
    const files = fs.readdirSync(inAbs).filter(f => f.endsWith(".ttl")).sort()
    console.log(`clean ${inDir} (${files.length} files) → ${outPath}`)
    // One store per file keeps each lifted document isolated — the clean
    // query cannot cross-join across documents.
    const cleaned = []
    for (const file of files) {
      const fileStore = storeFromTurtles([fs.readFileSync(path.join(inAbs, file), "utf8")])
      cleaned.push(...await sparqlConstruct(cleanQuery, [fileStore]))
    }
    await writeTurtleFile(abs(outPath), cleaned, {
      xyz: "http://sparql.xyz/facade-x/data/",
      cdp: CDP,
    })
  }

  const cleanSteps = []
  for (const src of sources) {
    const name = sourceName(src)
    const step = await journal.step(
      "clean",
      { source: src, after: [stepIri("lift", name)] },
      () => cleanSource(name),
    )
    cleanSteps.push(step)
  }

  // Loading each source's cleaned TTL into its own graph is plain mechanics,
  // not a journalled pipeline step.
  for (const src of sources) {
    const name = sourceName(src)
    const cleanedPath = PATHS.cleaned(name)
    const graphIri = sourceGraph(name)
    console.log(`load ${cleanedPath} → <${graphIri}>`)
    const graph = df.namedNode(graphIri)
    for (const parsed of n3Parser.parse(readText(cleanedPath))) {
      store.addQuad(df.quad(parsed.subject, parsed.predicate, parsed.object, graph))
    }
  }

  const mapStep = await journal.step("map", { after: cleanSteps }, async () => {
    await runMap(ctx, PATHS.mappingQueries)
    const mappedQuads = store.getQuads(null, null, null, MAPPED_GRAPH)
    await writeTurtleFile(abs(PATHS.mapped), mappedQuads, { ...COMMON_PREFIXES, cdp: CDP })
    console.log(`map: wrote ${mappedQuads.length} triples → ${PATHS.mapped}`)
  })
  const matchStep = await journal.step("match", { after: [mapStep] }, () => runMatch(ctx, PATHS.matches))
  const mergeStep = await journal.step("merge", { after: [matchStep] }, () => runMerge(ctx, PATHS.merged, PATHS.provenance))
  await journal.step("resolve", { after: [mergeStep] }, () => runResolve(ctx, PATHS.final))

  // Journal output: two prefix declarations, a blank line, the journal's own
  // Turtle, and a trailing newline.
  const logLines = [
    `@prefix : <${CDP}> .`,
    "@prefix p-plan: <http://purl.org/net/p-plan#> .",
    "",
    journal.toTurtle(),
    "",
  ]
  fs.writeFileSync(abs(PATHS.federateLog), logLines.join("\n"))
  console.log(`log: wrote steps → ${PATHS.federateLog}`)
}
package/src/index.js ADDED
@@ -0,0 +1,6 @@
1
+ // Node entry of @directory-builder/core. Browser-safe helpers live in the
2
+ // "./utils" subpath export — import those from "@directory-builder/core/utils"
3
+ // so bundlers never see the engines' fs/child_process imports.
4
+ export { Pipeline } from "./pipeline.js"
5
+ export { ingest } from "./ingest.js"
6
+ export { federate } from "./federate.js"