@directory-builder/core 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +17 -2
- package/package.json +1 -1
- package/src/index.js +2 -2
- package/src/lift/xml.sparql +12 -0
- package/src/pipeline/federate.js +69 -0
- package/src/pipeline/ingest.js +97 -0
- package/src/pipeline/run.js +8 -0
- package/src/pipeline/steps/clean.js +27 -0
- package/src/pipeline/steps/fetch.js +26 -0
- package/src/pipeline/steps/lift.js +60 -0
- package/src/pipeline/steps/map.js +172 -0
- package/src/pipeline/steps/match.js +212 -0
- package/src/pipeline/steps/merge.js +59 -0
- package/src/pipeline/steps/resolve.js +54 -0
- package/src/pipeline/write-turtle.js +30 -0
- package/src/pipeline.js +2 -2
- package/src/utils.js +8 -5
- package/webapp/src/App.jsx +3 -1
- package/webapp/src/Card.jsx +3 -3
- package/webapp/src/ColumnGraph.jsx +1 -1
- package/webapp/src/Directory.jsx +6 -6
- package/webapp/src/{OrgCard.jsx → EntityCard.jsx} +12 -12
- package/webapp/src/MapGraph.jsx +22 -22
- package/webapp/src/MatchGraph.jsx +3 -3
- package/webapp/src/MergeTables.jsx +43 -36
- package/webapp/src/Query.jsx +4 -9
- package/webapp/src/instanceData.js +8 -4
- package/webapp/src/loadMap.js +22 -22
- package/webapp/src/loadMerge.js +30 -32
- package/webapp/src/loadSources.js +3 -3
- package/webapp/src/mergeEntities.js +15 -0
- package/webapp/src/sourceMeta.js +1 -1
- package/webapp/src/styles.css +6 -6
- package/webapp/vite.js +1 -1
- package/src/federate.js +0 -571
- package/src/ingest.js +0 -158
- package/webapp/src/mergeOrgs.js +0 -15
package/src/federate.js
DELETED
|
@@ -1,571 +0,0 @@
|
|
|
1
|
-
import { newStore, parser as n3Parser, sparqlConstruct, sparqlInsertDelete, sparqlSelect, storeFromTurtles } from "@foerderfunke/sem-ops-utils"
|
|
2
|
-
import { buildPrefixBlock, CDP, objectsOf, parseTtl, PATHS, shrink, sourceGraph, sourceName, stepIri, stepJournal } from "./utils.js"
|
|
3
|
-
import { token_set_ratio } from "fuzzball"
|
|
4
|
-
import { DataFactory, Writer } from "n3"
|
|
5
|
-
import { createHash } from "crypto"
|
|
6
|
-
import path from "path"
|
|
7
|
-
import fs from "fs"
|
|
8
|
-
|
|
9
|
-
// Short alias for n3's DataFactory; used throughout this file to build terms and quads.
const df = DataFactory
|
|
10
|
-
|
|
11
|
-
/**
 * Serialise quads to a Turtle file, creating the parent directory if needed.
 *
 * Quads are first routed through a store to dedupe them, with the graph name
 * stripped (triples are written, not quads), then sorted by subject value so
 * the Writer can emit grouped "subject p1 o1; p2 o2." blocks instead of
 * repeating subjects.
 *
 * @param {string} filePath - Destination file path.
 * @param {Iterable} quads - Quads to write; only subject/predicate/object are used.
 * @param {Object} [prefixes={}] - Prefix map handed to the Writer.
 * @returns {Promise<void>} Resolves once the file is written; rejects on a
 *   serialisation error or a file-system error.
 */
const writeTurtleFile = (filePath, quads, prefixes = {}) => new Promise((resolve, reject) => {
    const store = newStore()
    for (const q of quads) store.addQuad(df.quad(q.subject, q.predicate, q.object))
    const dedup = store.getQuads(null, null, null, null)
        .sort((a, b) => a.subject.value.localeCompare(b.subject.value))
    const writer = new Writer({ prefixes })
    for (const q of dedup) writer.addQuad(q)
    writer.end((err, result) => {
        if (err) return reject(err)
        // File-system failures must surface through the promise: a throw inside
        // this callback would otherwise escape as an uncaught exception whenever
        // the callback is invoked after the executor has returned.
        try {
            fs.mkdirSync(path.dirname(filePath), { recursive: true })
            fs.writeFileSync(filePath, result)
        } catch (ioErr) {
            return reject(ioErr)
        }
        resolve()
    })
})
|
|
28
|
-
|
|
29
|
-
// ---- Direct-mapping generator ------------------------------------------
|
|
30
|
-
|
|
31
|
-
// Namespace bound to the xyz: prefix; the generated mapping queries address
// source fields as xyz:<fieldPath>.
const XYZ = "http://sparql.xyz/facade-x/data/"
|
|
32
|
-
|
|
33
|
-
/**
 * Generate the SPARQL INSERT that copies directly-mapped fields of one source
 * into the <urn:mapped> graph.
 *
 * @param {Object} mapping - Mapping row: `sourceGraph` (graph IRI to read),
 *   `source` (source IRI), optional `targetClass` and `target` (schema IRI).
 * @param {Array<Object>} fields - Rows with `fieldPath`, `predicate` and an
 *   optional `parentPath` for fields nested under a parent field.
 * @returns {string} The complete update query, prefix block included.
 */
const buildDirectInsert = ({ sourceGraph: graphIri, source, targetClass, target }, fields) => {
    const prefixes = {
        xyz: XYZ,
        cdp: CDP,
        cdf: "https://civic-data.de/federated-directory#",
        schema: "http://schema.org/",
        foaf: "http://xmlns.com/foaf/0.1/",
        dct: "http://purl.org/dc/terms/",
    }

    // shrink() hands the IRI back untouched when no prefix applies; such IRIs
    // are written in angle brackets instead.
    const toTerm = (iri) => {
        const compact = shrink(iri, prefixes)
        if (compact === iri) return `<${iri}>`
        return compact
    }

    const varFor = (fieldPath) => `?${fieldPath}`

    // The emptiness guard goes through STR() so it works for any literal
    // datatype — a bare `?v != ""` errors on e.g. xsd:int and would silently
    // drop the field.
    const optionalLiteral = (subject, fieldPath) => {
        const variable = varFor(fieldPath)
        return `OPTIONAL { ${subject} xyz:${fieldPath} ${variable} . FILTER(isLiteral(${variable}) && STR(${variable}) != "") }`
    }

    const insertLines = fields.map(field => ` ?org ${toTerm(field.predicate)} ${varFor(field.fieldPath)} .`)

    // Subjects are selected via cdp:fromSource. When a target schema is given,
    // subjects tagged with a different cdp:targetSchema are filtered out, while
    // untagged subjects match unconditionally.
    const where = [`?org cdp:fromSource ${toTerm(source)} .`]
    if (target) {
        where.push(
            `OPTIONAL { ?org cdp:targetSchema ?_ts }`,
            `FILTER(!bound(?_ts) || ?_ts = ${toTerm(target)})`,
        )
    }
    for (const field of fields) {
        if (!field.parentPath) where.push(optionalLiteral("?org", field.fieldPath))
    }

    // Nested fields are grouped per parent path; each group shares one
    // intermediate variable (?_p0, ?_p1, …) in first-seen order.
    const grouped = new Map()
    for (const field of fields) {
        if (!field.parentPath) continue
        const bucket = grouped.get(field.parentPath) ?? []
        bucket.push(field)
        grouped.set(field.parentPath, bucket)
    }
    Array.from(grouped).forEach(([parent, children], idx) => {
        const parentVar = `?_p${idx}`
        const inner = children.map(child => ` ${optionalLiteral(parentVar, child.fieldPath)}`).join("\n")
        where.push(`OPTIONAL {\n ?org xyz:${parent} ${parentVar} .\n${inner}\n }`)
    })

    // The target class, when present, becomes the record's rdf:type in the
    // mapped graph.
    const typeClause = targetClass ? `a ${toTerm(targetClass)} ; ` : ""

    return [
        buildPrefixBlock(prefixes),
        "",
        "INSERT {",
        "GRAPH <urn:mapped> {",
        `?org ${typeClause}cdp:fromSource ${toTerm(source)} .`,
        insertLines.join("\n"),
        "}",
        "} WHERE {",
        `GRAPH <${graphIri}> {`,
        where.join("\n "),
        "}",
        "}",
    ].join("\n")
}
|
|
105
|
-
|
|
106
|
-
/**
 * Map step: populate <urn:mapped> from the per-source graphs.
 *
 * For every :Mapping in the definition store this
 *   1. generates, saves and runs a direct INSERT for field mappings without :via,
 *   2. runs the transform script named by each distinct :via value,
 * and afterwards translates each declared :hasRelationship into its target
 * predicate.
 *
 * @param {Object} ctx - `store` (working store), `defStore` (definitions),
 *   `abs` (resolves a relative path to an absolute one).
 * @param {string} queriesDir - Directory (relative) receiving the generated
 *   `<mapping>.sparql` files.
 * @returns {Promise<void>}
 */
const runMap = async ({ store, defStore, abs }, queriesDir) => {
    const mappings = await sparqlSelect(`
PREFIX : <${CDP}>
SELECT ?mapping ?source ?sourceGraph ?target ?targetClass WHERE {
?mapping a :Mapping ;
:fromSource ?source .
OPTIONAL { ?mapping :sourceGraph ?sourceGraph }
OPTIONAL { ?mapping :toTarget ?target }
OPTIONAL { ?mapping :toTarget/:targetClass ?targetClass }
} ORDER BY ?mapping`, [defStore])

    for (const m of mappings) {
        const localName = m.mapping.split("#").pop()
        const directRows = await sparqlSelect(`
PREFIX : <${CDP}>
SELECT ?fieldPath ?predicate ?parentPath WHERE {
<${m.mapping}> :hasFieldMapping ?fm .
?fm :from ?src ; :to ?tgt .
FILTER NOT EXISTS { ?fm :via ?_v }
?tgt :targetPredicate ?predicate .
?src :fieldPath ?fieldPath .
OPTIONAL { ?parent :hasSubField ?src . ?parent :fieldPath ?parentPath }
}`, [defStore])

        if (directRows.length && m.sourceGraph) {
            const query = buildDirectInsert(m, directRows)
            const queryPath = abs(path.join(queriesDir, `${localName}.sparql`))
            fs.mkdirSync(path.dirname(queryPath), { recursive: true })
            fs.writeFileSync(queryPath, query)
            console.log(`map ${localName} direct (${directRows.length} mappings) → ${queryPath}`)
            await sparqlInsertDelete(query, store)
        } else if (directRows.length) {
            // Direct mappings were declared but there is no graph to read them
            // from; report it instead of dropping them silently.
            console.warn(`map ${localName} skipped ${directRows.length} direct mappings: no :sourceGraph declared`)
        }

        // :via names a transform of the mapping's source; the script path is
        // derived from the source name and the :via value by PATHS.transform.
        const viaRows = await sparqlSelect(`
PREFIX : <${CDP}>
SELECT DISTINCT ?via WHERE {
<${m.mapping}> :hasFieldMapping/:via ?via .
} ORDER BY ?via`, [defStore])

        for (const v of viaRows) {
            const script = PATHS.transform(sourceName(m.source), v.via)
            console.log(`map ${script}`)
            await sparqlInsertDelete(fs.readFileSync(abs(script), "utf8"), store)
        }
    }

    // A mapping's :hasRelationship turns a source-level link predicate into a
    // target predicate, matching the two ends by their cdp:targetSchema. Both
    // ends are still source IRIs at this point.
    const linkRows = await sparqlSelect(`
PREFIX : <${CDP}>
SELECT ?mapping ?sourceGraph ?fromSchema ?sourcePredicate ?targetPredicate ?toSchema WHERE {
?mapping a :Mapping ;
:sourceGraph ?sourceGraph ;
:toTarget ?fromSchema ;
:hasRelationship ?rel .
?rel :sourcePredicate ?sourcePredicate ;
:toTargetField ?field ;
:toTargetSchema ?toSchema .
?field :targetPredicate ?targetPredicate .
} ORDER BY ?mapping`, [defStore])

    // Loop-invariant: the prefix map and the IRI shortener are the same for every link.
    const prefixes = { cdp: CDP, schema: "http://schema.org/" }
    const short = (iri) => {
        const s = shrink(iri, prefixes)
        return s === iri ? `<${iri}>` : s
    }

    for (const rel of linkRows) {
        const query = `${buildPrefixBlock(prefixes)}

INSERT {
GRAPH <urn:mapped> {
?from ${short(rel.targetPredicate)} ?to .
}
} WHERE {
GRAPH <${rel.sourceGraph}> {
?from ${short(rel.sourcePredicate)} ?to ;
cdp:targetSchema ${short(rel.fromSchema)} .
?to cdp:targetSchema ${short(rel.toSchema)} .
}
}`
        console.log(`map ${rel.mapping.split("#").pop()} link (${short(rel.targetPredicate)})`)
        await sparqlInsertDelete(query, store)
    }
}
|
|
191
|
-
|
|
192
|
-
// ---- Shared graphs and prefixes ----------------------------------------
|
|
193
|
-
|
|
194
|
-
// Named graphs written by the map, match and merge steps respectively.
const [MAPPED_GRAPH, MATCH_GRAPH, MERGED_GRAPH] =
    ["urn:mapped", "urn:matched", "urn:merged"].map(iri => df.namedNode(iri))

// Predicate linking a minted cluster IRI to each of its member subjects.
const HAS_MEMBER = df.namedNode(`${CDP}hasMember`)

// Vocabulary prefixes shared by the Turtle files the steps write.
const COMMON_PREFIXES = {
    schema: "http://schema.org/",
    foaf: "http://xmlns.com/foaf/0.1/",
    dct: "http://purl.org/dc/terms/",
}
|
|
205
|
-
|
|
206
|
-
// ---- Match -------------------------------------------------------------
|
|
207
|
-
|
|
208
|
-
const RDF_TYPE = df.namedNode("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")
const MATCH_CLUSTER = df.namedNode(`${CDP}MatchCluster`)

// The algorithm name is recorded in the evidence graph so stored similarity
// numbers stay interpretable if the algorithm is ever swapped.
const SIMILARITY_ALGORITHM = "token_set_ratio"

// token_set_ratio compares the intersection of the two token sets, which makes
// it robust to legal-form noise ("gGmbH", "e.V."), sub-unit specifiers and
// word-order variations. It returns 0–100; the result is normalised to 0–1.
// Nullish inputs are scored as empty strings.
const similarity = (a, b) => {
    const left = a ?? ""
    const right = b ?? ""
    return token_set_ratio(left, right) / 100
}
|
|
218
|
-
|
|
219
|
-
/**
 * Match step: cluster mapped subjects per :MatchRule and record the evidence.
 *
 * For every rule, the subjects of the rule's :targetClass in <urn:mapped> are
 * clustered with a union-find: first by owl:sameAs assertions from the
 * definition store, then by pairwise comparison against the rule's hard and
 * weighted criteria. Each cluster gets a minted IRI (namespace + prefix +
 * 12 hex chars of a SHA-1 over the sorted member IRIs) and is written, with
 * its members and match evidence, into <urn:matched>. Finally the whole match
 * graph is serialised to `outPath`.
 *
 * @param {Object} ctx - `store` (working store), `defStore` (definitions),
 *   `abs` (resolves a relative path to an absolute one).
 * @param {string} outPath - Relative path of the Turtle cluster log.
 * @returns {Promise<void>}
 * @throws {Error} If no :MatchRule is configured, or a rule's :minScore is
 *   present but not a number.
 */
const runMatch = async ({ store, defStore, abs }, outPath) => {
    // One match rule per target schema; each rule scores its own fields, mints
    // with its own prefix, and clusters only subjects of its :targetClass.
    const rules = await sparqlSelect(`
PREFIX : <${CDP}>
SELECT ?match ?targetClass ?ns ?prefix ?minScore WHERE {
?match a :MatchRule ;
:forTarget ?target ;
:targetNamespace ?ns ;
:mintedSubjectPrefix ?prefix .
?target :targetClass ?targetClass .
OPTIONAL { ?match :minScore ?minScore }
} ORDER BY ?match`, [defStore])
    if (!rules.length) throw new Error(":MatchRule config missing in federation.ttl")

    const criteriaRows = await sparqlSelect(`
PREFIX : <${CDP}>
SELECT ?match ?on ?weight ?minSim WHERE {
?match a :MatchRule ; :hasWeightedCriterion ?c .
?c :on ?on ; :weight ?weight .
OPTIONAL { ?c :minSimilarity ?minSim }
}`, [defStore])
    // Hard criteria: fields that must be identical in both records (pass/fail gates).
    const hardRows = await sparqlSelect(`
PREFIX : <${CDP}>
SELECT ?match ?on WHERE {
?match a :MatchRule ; :hasHardCriterion ?h . ?h :on ?on .
}`, [defStore])

    // Criteria keyed by their owning rule, so each pass scores on its own fields.
    const criteriaByMatch = new Map()
    for (const r of criteriaRows) {
        if (!criteriaByMatch.has(r.match)) criteriaByMatch.set(r.match, [])
        criteriaByMatch.get(r.match).push({
            pred: df.namedNode(r.on),
            weight: parseFloat(r.weight),
            minSim: r.minSim != null ? parseFloat(r.minSim) : null,
        })
    }
    const hardByMatch = new Map()
    for (const r of hardRows) {
        if (!hardByMatch.has(r.match)) hardByMatch.set(r.match, [])
        hardByMatch.get(r.match).push({ pred: df.namedNode(r.on) })
    }

    // owl:sameAs assertions are shared; each pass only acts on the pairs whose
    // endpoints are both in its own subject set (gated by parent.has below).
    const sameAsRows = await sparqlSelect(`
PREFIX owl: <http://www.w3.org/2002/07/owl#>
SELECT ?a ?b WHERE { ?a owl:sameAs ?b }`, [defStore])

    const MATCH_EVIDENCE = df.namedNode(CDP + "MatchEvidence")
    const HAS_MATCH_EVIDENCE = df.namedNode(CDP + "hasMatchEvidence")
    const PAIR = df.namedNode(CDP + "pair")
    const ON_CRITERION = df.namedNode(CDP + "onCriterion")
    const ON = df.namedNode(CDP + "on")
    const SIMILARITY = df.namedNode(CDP + "similarity")
    const SIM_ALGORITHM = df.namedNode(CDP + "similarityAlgorithm")
    const WEIGHT = df.namedNode(CDP + "weight")
    const VALUE_A = df.namedNode(CDP + "valueA")
    const VALUE_B = df.namedNode(CDP + "valueB")
    const AGGREGATE_SCORE = df.namedNode(CDP + "aggregateScore")
    const VIA_MANUAL_MATCH = df.namedNode(CDP + "viaManualMatch")
    const XSD_DECIMAL = df.namedNode("http://www.w3.org/2001/XMLSchema#decimal")
    const XSD_BOOLEAN = df.namedNode("http://www.w3.org/2001/XMLSchema#boolean")

    for (const rule of rules) {
        const namespace = rule.ns
        const mintedPrefix = rule.prefix
        // :minScore is optional. Absent means "no threshold" — made explicit
        // here rather than relying on comparisons against NaN being false. A
        // value that is present but unparsable is a config error, not a reason
        // to accept every pair.
        const minScore = rule.minScore != null ? parseFloat(rule.minScore) : Number.NEGATIVE_INFINITY
        if (Number.isNaN(minScore)) {
            throw new Error(`Invalid :minScore "${rule.minScore}" on ${rule.match}`)
        }
        const hard = hardByMatch.get(rule.match) ?? []
        const weighted = criteriaByMatch.get(rule.match) ?? []

        // Subjects of this rule's target class only — passes never cross types.
        const subjects = [...new Set(store.getQuads(null, RDF_TYPE, df.namedNode(rule.targetClass), MAPPED_GRAPH)
            .filter(qu => qu.subject.termType === "NamedNode")
            .map(qu => qu.subject.value))]

        // First value found for (subject, predicate) in the mapped graph, or null.
        const valOf = (s, pred) => {
            const qs = store.getQuads(df.namedNode(s), pred, null, MAPPED_GRAPH)
            return qs.length ? qs[0].object.value : null
        }
        const hardVals = new Map(subjects.map(s => [s, hard.map(h => valOf(s, h.pred))]))
        const weightedVals = new Map(subjects.map(s => [s, weighted.map(c => valOf(s, c.pred))]))

        // A pair matches when every hard criterion is present and identical in
        // both, every weighted criterion has a value on both sides, and the
        // aggregate (sum of sim·weight, each optionally floored by
        // :minSimilarity) clears :minScore. No criteria at all → every subject
        // stays its own cluster. Returns null for "no match".
        const matches = (a, b) => {
            if (!hard.length && !weighted.length) return null
            const ha = hardVals.get(a), hb = hardVals.get(b)
            for (let i = 0; i < hard.length; i++) {
                if (ha[i] == null || hb[i] == null || ha[i] !== hb[i]) return null
            }
            const va = weightedVals.get(a), vb = weightedVals.get(b)
            const scores = []
            let weightedSum = 0
            for (let i = 0; i < weighted.length; i++) {
                if (va[i] == null || vb[i] == null) return null
                const c = weighted[i]
                const sim = similarity(va[i], vb[i])
                if (c.minSim != null && sim < c.minSim) return null
                scores.push({ pred: c.pred, sim, weight: c.weight, valueA: va[i], valueB: vb[i] })
                weightedSum += sim * c.weight
            }
            if (weighted.length && weightedSum < minScore) return null
            return { scores, aggregate: weightedSum }
        }

        // Union-find over subject IRIs; find() compresses the path it walked.
        const parent = new Map(subjects.map(s => [s, s]))
        const find = (x) => {
            let r = x
            while (parent.get(r) !== r) r = parent.get(r)
            let c = x
            while (parent.get(c) !== r) { const n = parent.get(c); parent.set(c, r); c = n }
            return r
        }
        const union = (a, b) => {
            const ra = find(a), rb = find(b)
            if (ra !== rb) parent.set(ra, rb)
        }

        const evidence = []
        let sameAsUnions = 0
        for (const { a, b } of sameAsRows) {
            if (parent.has(a) && parent.has(b)) { union(a, b); sameAsUnions++; evidence.push({ a, b, manual: true }) }
        }

        for (let i = 0; i < subjects.length; i++) {
            for (let j = i + 1; j < subjects.length; j++) {
                const m = matches(subjects[i], subjects[j])
                if (m) { union(subjects[i], subjects[j]); evidence.push({ a: subjects[i], b: subjects[j], ...m }) }
            }
        }

        const clusters = new Map()
        for (const s of subjects) {
            const root = find(s)
            if (!clusters.has(root)) clusters.set(root, [])
            clusters.get(root).push(s)
        }
        // Members sorted within each cluster; clusters ordered largest first,
        // ties broken by first member.
        const clusterMembers = [...clusters.values()]
            .map(m => [...m].sort())
            .sort((a, b) => b.length - a.length || a[0].localeCompare(b[0]))

        let multiSource = 0
        const clusterIriByRoot = new Map()
        for (const members of clusterMembers) {
            const id = createHash("sha1").update(members.join("|")).digest("hex").slice(0, 12)
            const minted = df.namedNode(namespace + mintedPrefix + id)
            clusterIriByRoot.set(find(members[0]), minted)
            if (members.length > 1) multiSource++
            store.addQuad(df.quad(minted, RDF_TYPE, MATCH_CLUSTER, MATCH_GRAPH))
            for (const s of members) {
                store.addQuad(df.quad(minted, HAS_MEMBER, df.namedNode(s), MATCH_GRAPH))
            }
        }

        // One blank evidence node per recorded pair, attached to the pair's cluster.
        for (const ev of evidence) {
            const evNode = df.blankNode()
            const cluster = clusterIriByRoot.get(find(ev.a))
            store.addQuad(df.quad(cluster, HAS_MATCH_EVIDENCE, evNode, MATCH_GRAPH))
            store.addQuad(df.quad(evNode, RDF_TYPE, MATCH_EVIDENCE, MATCH_GRAPH))
            store.addQuad(df.quad(evNode, PAIR, df.namedNode(ev.a), MATCH_GRAPH))
            store.addQuad(df.quad(evNode, PAIR, df.namedNode(ev.b), MATCH_GRAPH))
            if (ev.manual) {
                store.addQuad(df.quad(evNode, VIA_MANUAL_MATCH, df.literal("true", XSD_BOOLEAN), MATCH_GRAPH))
            } else {
                store.addQuad(df.quad(evNode, AGGREGATE_SCORE, df.literal(ev.aggregate.toFixed(3), XSD_DECIMAL), MATCH_GRAPH))
                store.addQuad(df.quad(evNode, SIM_ALGORITHM, df.literal(SIMILARITY_ALGORITHM), MATCH_GRAPH))
                for (const s of ev.scores) {
                    const cNode = df.blankNode()
                    store.addQuad(df.quad(evNode, ON_CRITERION, cNode, MATCH_GRAPH))
                    store.addQuad(df.quad(cNode, ON, s.pred, MATCH_GRAPH))
                    store.addQuad(df.quad(cNode, SIMILARITY, df.literal(s.sim.toFixed(3), XSD_DECIMAL), MATCH_GRAPH))
                    store.addQuad(df.quad(cNode, WEIGHT, df.literal(s.weight.toFixed(2), XSD_DECIMAL), MATCH_GRAPH))
                    store.addQuad(df.quad(cNode, VALUE_A, df.literal(s.valueA), MATCH_GRAPH))
                    store.addQuad(df.quad(cNode, VALUE_B, df.literal(s.valueB), MATCH_GRAPH))
                }
            }
        }

        console.log(`match: ${rule.match.split("#").pop()} ${subjects.length} entities → ${clusters.size} clusters (${multiSource} multi-source, ${sameAsUnions} sameAs unions)`)
    }

    const matchQuads = store.getQuads(null, null, null, MATCH_GRAPH)
    await writeTurtleFile(abs(outPath), matchQuads, { cdp: CDP, cdf: rules[0].ns, ...COMMON_PREFIXES })
    console.log(`match: wrote cluster log → ${outPath}`)
}
|
|
407
|
-
|
|
408
|
-
// ---- Merge -------------------------------------------------------------
|
|
409
|
-
|
|
410
|
-
/**
 * Merge step: re-home every mapped triple of a clustered subject onto the
 * cluster's minted IRI, and record where each merged triple came from.
 *
 * @param {Object} ctx - `store` (working store), `defStore` (definitions),
 *   `abs` (resolves a relative path to an absolute one).
 * @param {string} outPath - Relative path for the merged Turtle file.
 * @param {string} provOutPath - Relative path for the provenance Turtle file.
 * @returns {Promise<void>}
 * @throws {Error} If the :MergeRule / :MatchRule configuration is missing.
 */
const runMerge = async ({ store, defStore, abs }, outPath, provOutPath) => {
    const configRows = await sparqlSelect(`
PREFIX : <${CDP}>
SELECT ?ns ?originPred WHERE {
?match a :MatchRule ; :targetNamespace ?ns .
?merge a :MergeRule ; :originPredicate ?originPred .
}`, [defStore])
    const cfg = configRows[0]
    if (!cfg) throw new Error(":MergeRule / :MatchRule config missing in federation.ttl")
    const namespace = cfg.ns
    const originPredNode = df.namedNode(cfg.originPred)

    // member IRI → minted cluster node, read back from the match graph.
    const mintedFor = new Map(
        store.getQuads(null, HAS_MEMBER, null, MATCH_GRAPH)
            .map(memberQuad => [memberQuad.object.value, memberQuad.subject]),
    )

    const provQuads = []
    for (const source of store.getQuads(null, null, null, MAPPED_GRAPH)) {
        const cluster = mintedFor.get(source.subject.value)
        if (!cluster) continue

        // IRI objects that are themselves clustered subjects are redirected to
        // their minted cluster IRI, so inter-entity links point at the merged
        // entity rather than the pre-merge source IRI.
        let object = source.object
        if (object.termType === "NamedNode" && mintedFor.has(object.value)) {
            object = mintedFor.get(object.value)
        }

        store.addQuad(df.quad(cluster, source.predicate, object, MERGED_GRAPH))
        // Provenance statement: the merged triple is the subject, the original
        // source subject is the object.
        const mergedTriple = df.quad(cluster, source.predicate, object)
        provQuads.push(df.quad(mergedTriple, originPredNode, source.subject))
    }

    const basePrefixes = { ...COMMON_PREFIXES, cdp: CDP, cdf: namespace }

    const mergedQuads = store.getQuads(null, null, null, MERGED_GRAPH)
    await writeTurtleFile(abs(outPath), mergedQuads, basePrefixes)
    console.log(`merge: wrote ${mergedQuads.length} triples → ${outPath}`)

    await writeTurtleFile(abs(provOutPath), provQuads, { ...basePrefixes, prov: "http://www.w3.org/ns/prov#" })
    console.log(`merge: wrote ${provQuads.length} provenance annotations → ${provOutPath}`)
}
|
|
451
|
-
|
|
452
|
-
// ---- Resolve -----------------------------------------------------------
|
|
453
|
-
|
|
454
|
-
// Resolve strategies: each receives all quads sharing one (subject, predicate)
// and returns the single quad to keep.
const STRATEGIES = {
    // Keep the quad whose object value sorts first.
    alphabeticFirst: (quads) => {
        const ordered = quads.slice()
        ordered.sort((a, b) => a.object.value.localeCompare(b.object.value))
        return ordered[0]
    },
    // Join the distinct object values, sorted, into one ", "-separated literal.
    concatenateAll: (quads) => {
        const [{ subject, predicate }] = quads
        const distinct = Array.from(new Set(quads.map(q => q.object.value)))
        distinct.sort()
        return df.quad(subject, predicate, df.literal(distinct.join(", ")))
    },
}

// Predicates left out of the resolved output: schema:identifier and
// cdp:fromSource. final.ttl is the consumer-facing artifact; source
// attribution lives in provenance.ttl.
const RESOLVE_EXCLUDE = new Set([
    "http://schema.org/identifier",
    `${CDP}fromSource`,
])
|
|
463
|
-
|
|
464
|
-
/**
 * Resolve a strategy IRI to its implementation in STRATEGIES.
 *
 * The strategy name is the part of the IRI after the last "#". Only own
 * properties of STRATEGIES count, so names such as "constructor" or
 * "toString" cannot resolve to members inherited from Object.prototype.
 *
 * @param {string} iri - Strategy IRI.
 * @returns {Function} The strategy function.
 * @throws {Error} If no strategy with that name exists.
 */
const lookupStrategy = (iri) => {
    const key = iri.split("#").pop()
    const fn = Object.prototype.hasOwnProperty.call(STRATEGIES, key) ? STRATEGIES[key] : undefined
    if (!fn) throw new Error(`Unknown resolve strategy ${iri}`)
    return fn
}
|
|
469
|
-
|
|
470
|
-
/**
 * Resolve step: reduce the merged graph to one value per (subject, predicate)
 * and write the result.
 *
 * Predicates in RESOLVE_EXCLUDE are skipped. Every remaining group of quads is
 * collapsed with the per-predicate override strategy if one is configured,
 * otherwise with the rule's default strategy.
 *
 * @param {Object} ctx - `store` (working store), `defStore` (definitions),
 *   `abs` (resolves a relative path to an absolute one).
 * @param {string} outPath - Relative path for the resolved Turtle file.
 * @returns {Promise<void>}
 * @throws {Error} If the :ResolveRule configuration is missing or names an
 *   unknown strategy.
 */
const runResolve = async ({ store, defStore, abs }, outPath) => {
    const configRows = await sparqlSelect(`
PREFIX : <${CDP}>
SELECT ?strategy ?ns WHERE {
?resolve a :ResolveRule ; :defaultStrategy ?strategy .
?match a :MatchRule ; :targetNamespace ?ns .
}`, [defStore])
    const cfg = configRows[0]
    if (!cfg) throw new Error(":ResolveRule config missing in federation.ttl")
    const defaultPick = lookupStrategy(cfg.strategy)

    const overrideRows = await sparqlSelect(`
PREFIX : <${CDP}>
SELECT ?on ?strategy WHERE {
?resolve a :ResolveRule ; :hasOverride [ :on ?on ; :strategy ?strategy ] .
}`, [defStore])
    // predicate IRI → strategy function
    const overrides = new Map()
    for (const row of overrideRows) overrides.set(row.on, lookupStrategy(row.strategy))

    // Bucket the merged quads by subject + predicate.
    const groups = new Map()
    for (const quad of store.getQuads(null, null, null, MERGED_GRAPH)) {
        if (RESOLVE_EXCLUDE.has(quad.predicate.value)) continue
        const key = `${quad.subject.value}\t${quad.predicate.value}`
        const bucket = groups.get(key)
        if (bucket) bucket.push(quad)
        else groups.set(key, [quad])
    }

    const finalQuads = []
    for (const candidates of groups.values()) {
        const pick = overrides.get(candidates[0].predicate.value) ?? defaultPick
        finalQuads.push(pick(candidates))
    }

    await writeTurtleFile(abs(outPath), finalQuads, { ...COMMON_PREFIXES, cdf: cfg.ns })
    console.log(`resolve: wrote ${finalQuads.length} triples → ${outPath}`)
}
|
|
500
|
-
|
|
501
|
-
// ---- Federate engine -----------------------------------------------------
|
|
502
|
-
/**
 * Federate engine: clean each source, load the cleaned data, then run
 * map → match → merge → resolve and write the step log.
 *
 * The step sequence is fixed by the engine; configuration only declares the
 * sources (the objects of cdp:hasSource in the federation file). All paths
 * come from PATHS and are resolved against `root`. Every step runs through the
 * journal, whose Turtle rendering is written to PATHS.federateLog. A clean
 * step's predecessor is referenced as stepIri("lift", <source name>).
 *
 * @param {string} [root=process.cwd()] - Instance root directory.
 * @returns {Promise<void>}
 */
export async function federate(root = process.cwd()) {
    const abs = (p) => path.join(root, p)
    const federationTtl = fs.readFileSync(abs(PATHS.federation), "utf8")
    const defStore = storeFromTurtles([federationTtl, fs.readFileSync(abs(PATHS.matchKnowledge), "utf8")])
    const sources = objectsOf(parseTtl(federationTtl), `${CDP}hasSource`)

    const store = newStore()
    const journal = stepJournal()
    const ctx = { store, defStore, abs }

    const cleanSteps = []
    for (const src of sources) {
        const name = sourceName(src)
        cleanSteps.push(await journal.step("clean", { source: src, after: [stepIri("lift", name)] }, async () => {
            const cleanQuery = fs.readFileSync(abs(PATHS.cleanQuery(name)), "utf8")
            const inDir = PATHS.lifted(name)
            const outPath = PATHS.cleaned(name)
            // Run CONSTRUCT per file so each lifted TTL stays isolated in its
            // own store — the clean SPARQL can't cross-join across documents.
            const inAbs = abs(inDir)
            const files = fs.readdirSync(inAbs).filter(f => f.endsWith(".ttl")).sort()
            console.log(`clean ${inDir} (${files.length} files) → ${outPath}`)
            const allQuads = []
            for (const f of files) {
                const fileStore = storeFromTurtles([fs.readFileSync(path.join(inAbs, f), "utf8")])
                // Append one by one: spreading a large result into push()
                // passes every quad as a call argument and can overflow the
                // engine's argument limit.
                for (const quad of await sparqlConstruct(cleanQuery, [fileStore])) allQuads.push(quad)
            }
            await writeTurtleFile(abs(outPath), allQuads, {
                xyz: XYZ,
                cdp: CDP,
            })
        }))
    }

    // Load each source's cleaned TTL into its own graph — plain mechanics, not
    // a pipeline step.
    for (const src of sources) {
        const name = sourceName(src)
        console.log(`load ${PATHS.cleaned(name)} → <${sourceGraph(name)}>`)
        const graph = df.namedNode(sourceGraph(name))
        for (const quad of n3Parser.parse(fs.readFileSync(abs(PATHS.cleaned(name)), "utf8"))) {
            store.addQuad(df.quad(quad.subject, quad.predicate, quad.object, graph))
        }
    }

    const mapStep = await journal.step("map", { after: cleanSteps }, async () => {
        await runMap(ctx, PATHS.mappingQueries)
        const mappedQuads = store.getQuads(null, null, null, MAPPED_GRAPH)
        await writeTurtleFile(abs(PATHS.mapped), mappedQuads, { ...COMMON_PREFIXES, cdp: CDP })
        console.log(`map: wrote ${mappedQuads.length} triples → ${PATHS.mapped}`)
    })
    const matchStep = await journal.step("match", { after: [mapStep] }, () => runMatch(ctx, PATHS.matches))
    const mergeStep = await journal.step("merge", { after: [matchStep] }, () => runMerge(ctx, PATHS.merged, PATHS.provenance))
    await journal.step("resolve", { after: [mergeStep] }, () => runResolve(ctx, PATHS.final))

    fs.writeFileSync(abs(PATHS.federateLog), `@prefix : <${CDP}> .
@prefix p-plan: <http://purl.org/net/p-plan#> .

${journal.toTurtle()}
`)
    console.log(`log: wrote steps → ${PATHS.federateLog}`)
}
|