@directory-builder/core 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +13 -2
- package/package.json +1 -1
- package/src/index.js +2 -2
- package/src/pipeline/federate.js +69 -0
- package/src/pipeline/ingest.js +97 -0
- package/src/pipeline/run.js +8 -0
- package/src/pipeline/steps/clean.js +27 -0
- package/src/pipeline/steps/fetch.js +24 -0
- package/src/pipeline/steps/lift.js +58 -0
- package/src/pipeline/steps/map.js +172 -0
- package/src/pipeline/steps/match.js +212 -0
- package/src/pipeline/steps/merge.js +59 -0
- package/src/pipeline/steps/resolve.js +54 -0
- package/src/pipeline/write-turtle.js +30 -0
- package/src/pipeline.js +2 -2
- package/src/utils.js +1 -5
- package/webapp/src/App.jsx +3 -1
- package/webapp/src/Card.jsx +3 -3
- package/webapp/src/ColumnGraph.jsx +1 -1
- package/webapp/src/Directory.jsx +6 -6
- package/webapp/src/{OrgCard.jsx → EntityCard.jsx} +12 -12
- package/webapp/src/MapGraph.jsx +22 -22
- package/webapp/src/MatchGraph.jsx +3 -3
- package/webapp/src/MergeTables.jsx +43 -36
- package/webapp/src/Query.jsx +4 -9
- package/webapp/src/instanceData.js +6 -2
- package/webapp/src/loadMap.js +22 -22
- package/webapp/src/loadMerge.js +30 -32
- package/webapp/src/loadSources.js +1 -1
- package/webapp/src/mergeEntities.js +15 -0
- package/webapp/src/sourceMeta.js +1 -1
- package/webapp/src/styles.css +6 -6
- package/webapp/vite.js +1 -1
- package/src/federate.js +0 -571
- package/src/ingest.js +0 -158
- package/webapp/src/mergeOrgs.js +0 -15
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
import { sparqlSelect } from "@foerderfunke/sem-ops-utils"
|
|
2
|
+
import { COMMON_PREFIXES, writeTurtleFile } from "../write-turtle.js"
|
|
3
|
+
import { MAPPED_GRAPH } from "./map.js"
|
|
4
|
+
import { CDP } from "../../utils.js"
|
|
5
|
+
import { token_set_ratio } from "fuzzball"
|
|
6
|
+
import { DataFactory } from "n3"
|
|
7
|
+
import { createHash } from "crypto"
|
|
8
|
+
|
|
9
|
+
// Short alias for n3's DataFactory; every term and quad in this module is built through it.
const df = DataFactory
|
|
10
|
+
|
|
11
|
+
// Named graph that receives this step's output: cluster nodes, their members
// and the per-pair match evidence.
export const MATCH_GRAPH = df.namedNode("urn:matched")
|
|
12
|
+
// Predicate linking a minted cluster IRI to each source subject grouped under it.
export const HAS_MEMBER = df.namedNode(CDP + "hasMember")
|
|
13
|
+
|
|
14
|
+
// rdf:type — used both to select subjects of a target class and to type the nodes written here.
const RDF_TYPE = df.namedNode("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")
|
|
15
|
+
// Class assigned to every minted cluster node in MATCH_GRAPH.
const MATCH_CLUSTER = df.namedNode(CDP + "MatchCluster")
|
|
16
|
+
|
|
17
|
+
// token_set_ratio computes a ratio over the intersection of token sets, which
|
|
18
|
+
// is robust to legal-form noise ("gGmbH", "e.V."), sub-unit specifiers, and
|
|
19
|
+
// word-order variations. Returns 0–100; we normalise to 0–1. The algorithm
|
|
20
|
+
// name is recorded in the evidence graph so old similarity numbers stay
|
|
21
|
+
// interpretable across algorithm swaps.
|
|
22
|
+
// Label stored on each scored evidence node (as cdp:similarityAlgorithm) naming
// the function that produced its similarity values.
const SIMILARITY_ALGORITHM = "token_set_ratio"
|
|
23
|
+
// Similarity of two field values on a 0–1 scale. Nullish inputs are compared
// as empty strings; the scorer's result is divided by 100.
const similarity = (a, b) => {
  const left = a ?? ""
  const right = b ?? ""
  const percent = token_set_ratio(left, right)
  return percent / 100
}
|
|
24
|
+
|
|
25
|
+
// Parses a numeric literal taken from the match configuration. parseFloat
// returns NaN for malformed input, and every comparison against NaN is false,
// so a typo would silently disable the threshold it feeds — fail loudly instead.
const parseConfigNumber = (raw, what) => {
  const parsed = parseFloat(raw)
  if (Number.isNaN(parsed)) throw new Error(`Invalid ${what} in federation.ttl: ${JSON.stringify(raw)}`)
  return parsed
}

/**
 * Match step: clusters the subjects in MAPPED_GRAPH, one pass per :MatchRule,
 * writes clusters and match evidence into MATCH_GRAPH and serialises that
 * graph to a Turtle file.
 *
 * @param {object} ctx
 * @param {object} ctx.store     quad store holding MAPPED_GRAPH; receives the MATCH_GRAPH quads
 * @param {object} ctx.defStore  store queried for :MatchRule config and owl:sameAs assertions
 * @param {Function} ctx.abs     maps outPath to the path handed to writeTurtleFile
 *                               (presumably resolves it against the instance root — confirm with caller)
 * @param {string} outPath       destination of the cluster log
 * @returns {Promise<void>}
 * @throws {Error} when no :MatchRule is configured, or when a :minScore,
 *                 :weight or :minSimilarity value is not a number
 */
export const runMatch = async ({ store, defStore, abs }, outPath) => {
  // One match rule per target schema; each rule scores its own fields, mints
  // with its own prefix, and clusters only subjects of its :targetClass.
  const rules = await sparqlSelect(`
    PREFIX : <${CDP}>
    SELECT ?match ?targetClass ?ns ?prefix ?minScore WHERE {
      ?match a :MatchRule ;
        :forTarget ?target ;
        :targetNamespace ?ns ;
        :mintedSubjectPrefix ?prefix .
      ?target :targetClass ?targetClass .
      OPTIONAL { ?match :minScore ?minScore }
    } ORDER BY ?match`, [defStore])
  if (!rules.length) throw new Error(":MatchRule config missing in federation.ttl")

  const criteriaRows = await sparqlSelect(`
    PREFIX : <${CDP}>
    SELECT ?match ?on ?weight ?minSim WHERE {
      ?match a :MatchRule ; :hasWeightedCriterion ?c .
      ?c :on ?on ; :weight ?weight .
      OPTIONAL { ?c :minSimilarity ?minSim }
    }`, [defStore])
  // Hard criteria: fields that must be identical in both records (pass/fail gates).
  const hardRows = await sparqlSelect(`
    PREFIX : <${CDP}>
    SELECT ?match ?on WHERE {
      ?match a :MatchRule ; :hasHardCriterion ?h . ?h :on ?on .
    }`, [defStore])

  // Validate every numeric setting before anything is written to the store, so
  // a bad config aborts the run without leaving a half-filled MATCH_GRAPH.
  // :minScore is OPTIONAL; when absent no aggregate threshold applies, which
  // -Infinity expresses explicitly.
  const minScores = rules.map(rule => rule.minScore != null
    ? parseConfigNumber(rule.minScore, `:minScore of ${rule.match}`)
    : Number.NEGATIVE_INFINITY)

  // Criteria keyed by their owning rule, so each pass scores on its own fields.
  const criteriaByMatch = new Map()
  for (const r of criteriaRows) {
    if (!criteriaByMatch.has(r.match)) criteriaByMatch.set(r.match, [])
    criteriaByMatch.get(r.match).push({
      pred: df.namedNode(r.on),
      weight: parseConfigNumber(r.weight, `:weight on ${r.on}`),
      minSim: r.minSim != null ? parseConfigNumber(r.minSim, `:minSimilarity on ${r.on}`) : null,
    })
  }
  const hardByMatch = new Map()
  for (const r of hardRows) {
    if (!hardByMatch.has(r.match)) hardByMatch.set(r.match, [])
    hardByMatch.get(r.match).push({ pred: df.namedNode(r.on) })
  }
  // owl:sameAs assertions are shared; each pass only acts on the pairs whose
  // endpoints are in its own subject set (gated by parent.has below).
  const sameAsRows = await sparqlSelect(`
    PREFIX owl: <http://www.w3.org/2002/07/owl#>
    SELECT ?a ?b WHERE { ?a owl:sameAs ?b }`, [defStore])

  // Vocabulary of the evidence graph.
  const MATCH_EVIDENCE = df.namedNode(CDP + "MatchEvidence")
  const HAS_MATCH_EVIDENCE = df.namedNode(CDP + "hasMatchEvidence")
  const PAIR = df.namedNode(CDP + "pair")
  const ON_CRITERION = df.namedNode(CDP + "onCriterion")
  const ON = df.namedNode(CDP + "on")
  const SIMILARITY = df.namedNode(CDP + "similarity")
  const SIM_ALGORITHM = df.namedNode(CDP + "similarityAlgorithm")
  const WEIGHT = df.namedNode(CDP + "weight")
  const VALUE_A = df.namedNode(CDP + "valueA")
  const VALUE_B = df.namedNode(CDP + "valueB")
  const AGGREGATE_SCORE = df.namedNode(CDP + "aggregateScore")
  const VIA_MANUAL_MATCH = df.namedNode(CDP + "viaManualMatch")
  const XSD_DECIMAL = df.namedNode("http://www.w3.org/2001/XMLSchema#decimal")
  const XSD_BOOLEAN = df.namedNode("http://www.w3.org/2001/XMLSchema#boolean")

  for (const [ruleIndex, rule] of rules.entries()) {
    const namespace = rule.ns
    const mintedPrefix = rule.prefix
    const minScore = minScores[ruleIndex]
    const hard = hardByMatch.get(rule.match) ?? []
    const weighted = criteriaByMatch.get(rule.match) ?? []

    // Subjects of this rule's target class only — passes never cross types.
    const subjects = [...new Set(store.getQuads(null, RDF_TYPE, df.namedNode(rule.targetClass), MAPPED_GRAPH)
      .filter(qu => qu.subject.termType === "NamedNode")
      .map(qu => qu.subject.value))]

    // First value found for (subject, predicate) in the mapped graph, or null.
    const valOf = (s, pred) => {
      const qs = store.getQuads(df.namedNode(s), pred, null, MAPPED_GRAPH)
      return qs.length ? qs[0].object.value : null
    }
    // Look every criterion value up once per subject rather than once per pair.
    const hardVals = new Map(subjects.map(s => [s, hard.map(h => valOf(s, h.pred))]))
    const weightedVals = new Map(subjects.map(s => [s, weighted.map(c => valOf(s, c.pred))]))

    // A pair matches when every hard criterion is present and identical in both,
    // and the weighted criteria's aggregate (sum of sim·weight, each optionally
    // floored by :minSimilarity) clears :minScore. No criteria at all → every
    // subject stays its own cluster. Returns null for "no match", otherwise the
    // per-criterion scores and their aggregate.
    const matches = (a, b) => {
      if (!hard.length && !weighted.length) return null
      const ha = hardVals.get(a)
      const hb = hardVals.get(b)
      for (let i = 0; i < hard.length; i++) {
        if (ha[i] == null || hb[i] == null || ha[i] !== hb[i]) return null
      }
      const va = weightedVals.get(a)
      const vb = weightedVals.get(b)
      const scores = []
      let weightedSum = 0
      for (let i = 0; i < weighted.length; i++) {
        // A value missing on either side disqualifies the pair outright.
        if (va[i] == null || vb[i] == null) return null
        const c = weighted[i]
        const sim = similarity(va[i], vb[i])
        if (c.minSim != null && sim < c.minSim) return null
        scores.push({ pred: c.pred, sim, weight: c.weight, valueA: va[i], valueB: vb[i] })
        weightedSum += sim * c.weight
      }
      if (weighted.length && weightedSum < minScore) return null
      return { scores, aggregate: weightedSum }
    }

    // Union-find over the subject IRIs; find() compresses the path it walked.
    const parent = new Map(subjects.map(s => [s, s]))
    const find = (x) => {
      let r = x
      while (parent.get(r) !== r) r = parent.get(r)
      let c = x
      while (parent.get(c) !== r) { const n = parent.get(c); parent.set(c, r); c = n }
      return r
    }
    const union = (a, b) => {
      const ra = find(a)
      const rb = find(b)
      if (ra !== rb) parent.set(ra, rb)
    }

    const evidence = []
    let sameAsUnions = 0
    // Manual owl:sameAs links first, limited to pairs fully inside this pass.
    for (const { a, b } of sameAsRows) {
      if (parent.has(a) && parent.has(b)) { union(a, b); sameAsUnions++; evidence.push({ a, b, manual: true }) }
    }

    // Then every unordered pair of subjects is scored against the rule.
    for (let i = 0; i < subjects.length; i++) {
      for (let j = i + 1; j < subjects.length; j++) {
        const m = matches(subjects[i], subjects[j])
        if (m) { union(subjects[i], subjects[j]); evidence.push({ a: subjects[i], b: subjects[j], ...m }) }
      }
    }

    const clusters = new Map()
    for (const s of subjects) {
      const root = find(s)
      if (!clusters.has(root)) clusters.set(root, [])
      clusters.get(root).push(s)
    }
    // Members sorted within each cluster, clusters ordered largest first and
    // then by first member, so the minted ids do not depend on store order.
    const clusterMembers = [...clusters.values()]
      .map(m => [...m].sort())
      .sort((a, b) => b.length - a.length || a[0].localeCompare(b[0]))

    let multiSource = 0
    const clusterIriByRoot = new Map()
    for (const members of clusterMembers) {
      // Cluster id: first 12 hex chars of the SHA-1 over the sorted member IRIs.
      const id = createHash("sha1").update(members.join("|")).digest("hex").slice(0, 12)
      const minted = df.namedNode(namespace + mintedPrefix + id)
      clusterIriByRoot.set(find(members[0]), minted)
      if (members.length > 1) multiSource++
      store.addQuad(df.quad(minted, RDF_TYPE, MATCH_CLUSTER, MATCH_GRAPH))
      for (const s of members) {
        store.addQuad(df.quad(minted, HAS_MEMBER, df.namedNode(s), MATCH_GRAPH))
      }
    }

    // One evidence node per union that was attempted, attached to the cluster
    // the pair ended up in.
    for (const ev of evidence) {
      const evNode = df.blankNode()
      const cluster = clusterIriByRoot.get(find(ev.a))
      store.addQuad(df.quad(cluster, HAS_MATCH_EVIDENCE, evNode, MATCH_GRAPH))
      store.addQuad(df.quad(evNode, RDF_TYPE, MATCH_EVIDENCE, MATCH_GRAPH))
      store.addQuad(df.quad(evNode, PAIR, df.namedNode(ev.a), MATCH_GRAPH))
      store.addQuad(df.quad(evNode, PAIR, df.namedNode(ev.b), MATCH_GRAPH))
      if (ev.manual) {
        store.addQuad(df.quad(evNode, VIA_MANUAL_MATCH, df.literal("true", XSD_BOOLEAN), MATCH_GRAPH))
      } else {
        store.addQuad(df.quad(evNode, AGGREGATE_SCORE, df.literal(ev.aggregate.toFixed(3), XSD_DECIMAL), MATCH_GRAPH))
        store.addQuad(df.quad(evNode, SIM_ALGORITHM, df.literal(SIMILARITY_ALGORITHM), MATCH_GRAPH))
        for (const s of ev.scores) {
          const cNode = df.blankNode()
          store.addQuad(df.quad(evNode, ON_CRITERION, cNode, MATCH_GRAPH))
          store.addQuad(df.quad(cNode, ON, s.pred, MATCH_GRAPH))
          store.addQuad(df.quad(cNode, SIMILARITY, df.literal(s.sim.toFixed(3), XSD_DECIMAL), MATCH_GRAPH))
          store.addQuad(df.quad(cNode, WEIGHT, df.literal(s.weight.toFixed(2), XSD_DECIMAL), MATCH_GRAPH))
          store.addQuad(df.quad(cNode, VALUE_A, df.literal(s.valueA), MATCH_GRAPH))
          store.addQuad(df.quad(cNode, VALUE_B, df.literal(s.valueB), MATCH_GRAPH))
        }
      }
    }

    console.log(`match: ${rule.match.split("#").pop()} ${subjects.length} entities → ${clusters.size} clusters (${multiSource} multi-source, ${sameAsUnions} sameAs unions)`)
  }

  const matchQuads = store.getQuads(null, null, null, MATCH_GRAPH)
  await writeTurtleFile(abs(outPath), matchQuads, { cdp: CDP, cdf: rules[0].ns, ...COMMON_PREFIXES })
  console.log(`match: wrote cluster log → ${outPath}`)
}
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
import { sparqlSelect } from "@foerderfunke/sem-ops-utils"
|
|
2
|
+
import { COMMON_PREFIXES, writeTurtleFile } from "../write-turtle.js"
|
|
3
|
+
import { HAS_MEMBER, MATCH_GRAPH } from "./match.js"
|
|
4
|
+
import { MAPPED_GRAPH } from "./map.js"
|
|
5
|
+
import { CDP } from "../../utils.js"
|
|
6
|
+
import { DataFactory } from "n3"
|
|
7
|
+
|
|
8
|
+
// Short alias for n3's DataFactory, used for every term and quad built in this module.
const df = DataFactory
|
|
9
|
+
|
|
10
|
+
// Named graph that receives the mapped triples re-subjected to their cluster IRIs.
export const MERGED_GRAPH = df.namedNode("urn:merged")
|
|
11
|
+
|
|
12
|
+
// rdf:reifies — connects a reifier node to the triple term it annotates.
const RDF_REIFIES = df.namedNode("http://www.w3.org/1999/02/22-rdf-syntax-ns#reifies")
|
|
13
|
+
|
|
14
|
+
/**
 * Merge step: copies every mapped triple whose subject belongs to a match
 * cluster into MERGED_GRAPH under the cluster's minted IRI, records which
 * source subject each merged triple came from, and writes both results out.
 *
 * @param {object} ctx
 * @param {object} ctx.store     quad store holding MAPPED_GRAPH and MATCH_GRAPH; receives MERGED_GRAPH
 * @param {object} ctx.defStore  store queried for the :MatchRule / :MergeRule config
 * @param {Function} ctx.abs     maps an output path to the path handed to writeTurtleFile
 * @param {string} outPath       destination of the merged triples
 * @param {string} provOutPath   destination of the provenance annotations
 * @returns {Promise<void>}
 * @throws {Error} when the config query returns no row
 */
export const runMerge = async ({ store, defStore, abs }, outPath, provOutPath) => {
  const configRows = await sparqlSelect(`
    PREFIX : <${CDP}>
    SELECT ?ns ?originPred WHERE {
      ?match a :MatchRule ; :targetNamespace ?ns .
      ?merge a :MergeRule ; :originPredicate ?originPred .
    }`, [defStore])
  const cfg = configRows[0]
  if (!cfg) throw new Error(":MergeRule / :MatchRule config missing in federation.ttl")
  const namespace = cfg.ns
  const originPredNode = df.namedNode(cfg.originPred)

  // Source subject IRI → minted cluster IRI, read back from the match graph.
  const mintedFor = new Map(
    store.getQuads(null, HAS_MEMBER, null, MATCH_GRAPH).map(link => [link.object.value, link.subject])
  )

  const provQuads = []
  for (const source of store.getQuads(null, null, null, MAPPED_GRAPH)) {
    const clusterIri = mintedFor.get(source.subject.value)
    // Subjects outside every cluster are not carried over.
    if (!clusterIri) continue

    // An IRI object that is itself a clustered subject is redirected to its
    // cluster IRI, so links between entities (e.g. schema:provider) point at
    // the merged entity instead of the pre-merge source IRI.
    let target = source.object
    if (target.termType === "NamedNode") {
      const redirected = mintedFor.get(target.value)
      if (redirected) target = redirected
    }
    store.addQuad(df.quad(clusterIri, source.predicate, target, MERGED_GRAPH))

    // A fresh reifier per copied triple (RDF 1.2 allows triple terms only in
    // object position, via rdf:reifies); the origin hangs off the reifier, which
    // also leaves room for further per-derivation metadata.
    const reifier = df.blankNode()
    provQuads.push(
      df.quad(reifier, RDF_REIFIES, df.quad(clusterIri, source.predicate, target)),
      df.quad(reifier, originPredNode, source.subject),
    )
  }

  const basePrefixes = { ...COMMON_PREFIXES, cdp: CDP, cdf: namespace }

  const mergedQuads = store.getQuads(null, null, null, MERGED_GRAPH)
  await writeTurtleFile(abs(outPath), mergedQuads, basePrefixes)
  console.log(`merge: wrote ${mergedQuads.length} triples → ${outPath}`)

  const provPrefixes = {
    ...basePrefixes,
    prov: "http://www.w3.org/ns/prov#",
    rdf: "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
  }
  await writeTurtleFile(abs(provOutPath), provQuads, provPrefixes)
  // Two quads are pushed per annotation, hence the division.
  console.log(`merge: wrote ${provQuads.length / 2} provenance annotations → ${provOutPath}`)
}
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
import { sparqlSelect } from "@foerderfunke/sem-ops-utils"
|
|
2
|
+
import { COMMON_PREFIXES, writeTurtleFile } from "../write-turtle.js"
|
|
3
|
+
import { MERGED_GRAPH } from "./merge.js"
|
|
4
|
+
import { CDP } from "../../utils.js"
|
|
5
|
+
import { DataFactory } from "n3"
|
|
6
|
+
|
|
7
|
+
// Short alias for n3's DataFactory, used by the strategies below to build quads and literals.
const df = DataFactory
|
|
8
|
+
|
|
9
|
+
// One value per (subject, predicate). schema:identifier and cdp:fromSource
|
|
10
|
+
// are dropped — final.ttl is the consumer-facing artifact, source attribution
|
|
11
|
+
// lives in provenance.ttl.
|
|
12
|
+
// Resolution strategies, keyed by the local name of their IRI. Each receives
// all quads that share one (subject, predicate) and returns the single quad to keep.
const STRATEGIES = {
  // Keeps the quad whose object value sorts first under localeCompare.
  alphabeticFirst: (quads) => {
    const ordered = quads.slice()
    ordered.sort((left, right) => left.object.value.localeCompare(right.object.value))
    return ordered[0]
  },
  // Builds one literal from the distinct object values, sorted and joined with ", ".
  concatenateAll: (quads) => {
    const head = quads[0]
    const distinct = Array.from(new Set(quads.map(q => q.object.value)))
    distinct.sort()
    return df.quad(head.subject, head.predicate, df.literal(distinct.join(", ")))
  },
}
|
|
17
|
+
// Predicate IRIs whose triples are skipped when grouping merged quads, so they
// never reach the resolved output.
const RESOLVE_EXCLUDE = new Set(["http://schema.org/identifier", `${CDP}fromSource`])
|
|
18
|
+
|
|
19
|
+
/**
 * Resolves a strategy IRI to its implementation in STRATEGIES, using the part
 * of the IRI after the last "#" as the key.
 *
 * @param {string} iri  strategy IRI from the resolve configuration
 * @returns {Function}  the strategy function
 * @throws {Error} when no strategy is registered under that name
 */
const lookupStrategy = (iri) => {
  const name = iri.split("#").pop()
  // Own-property check: a plain STRATEGIES[name] lookup would also find
  // inherited members such as "toString" or "constructor" and hand back an
  // Object.prototype function instead of reporting the unknown strategy.
  if (!Object.hasOwn(STRATEGIES, name)) throw new Error(`Unknown resolve strategy ${iri}`)
  return STRATEGIES[name]
}
|
|
24
|
+
|
|
25
|
+
/**
 * Resolve step: reduces MERGED_GRAPH to one quad per (subject, predicate) by
 * applying the configured strategy — a per-predicate override when present,
 * the rule's default otherwise — and writes the result as Turtle.
 *
 * @param {object} ctx
 * @param {object} ctx.store     quad store holding MERGED_GRAPH
 * @param {object} ctx.defStore  store queried for the :ResolveRule / :MatchRule config
 * @param {Function} ctx.abs     maps outPath to the path handed to writeTurtleFile
 * @param {string} outPath       destination of the resolved triples
 * @returns {Promise<void>}
 * @throws {Error} when the config query returns no row or names an unknown strategy
 */
export const runResolve = async ({ store, defStore, abs }, outPath) => {
  const configRows = await sparqlSelect(`
    PREFIX : <${CDP}>
    SELECT ?strategy ?ns WHERE {
      ?resolve a :ResolveRule ; :defaultStrategy ?strategy .
      ?match a :MatchRule ; :targetNamespace ?ns .
    }`, [defStore])
  const cfg = configRows[0]
  if (!cfg) throw new Error(":ResolveRule config missing in federation.ttl")
  const defaultPick = lookupStrategy(cfg.strategy)

  const overrideRows = await sparqlSelect(`
    PREFIX : <${CDP}>
    SELECT ?on ?strategy WHERE {
      ?resolve a :ResolveRule ; :hasOverride [ :on ?on ; :strategy ?strategy ] .
    }`, [defStore])
  // Predicate IRI → strategy replacing the default for that predicate.
  const overrides = new Map()
  for (const { on, strategy } of overrideRows) overrides.set(on, lookupStrategy(strategy))

  // Bucket the merged quads by subject + predicate, leaving out excluded predicates.
  const groups = new Map()
  for (const quad of store.getQuads(null, null, null, MERGED_GRAPH)) {
    const predicateIri = quad.predicate.value
    if (RESOLVE_EXCLUDE.has(predicateIri)) continue
    const key = `${quad.subject.value}\t${predicateIri}`
    const bucket = groups.get(key)
    if (bucket) bucket.push(quad)
    else groups.set(key, [quad])
  }

  // Each bucket collapses to the single quad its strategy returns.
  const finalQuads = []
  for (const bucket of groups.values()) {
    const pick = overrides.get(bucket[0].predicate.value) ?? defaultPick
    finalQuads.push(pick(bucket))
  }

  await writeTurtleFile(abs(outPath), finalQuads, { ...COMMON_PREFIXES, cdf: cfg.ns })
  console.log(`resolve: wrote ${finalQuads.length} triples → ${outPath}`)
}
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
import { newStore } from "@foerderfunke/sem-ops-utils"
|
|
2
|
+
import { DataFactory, Writer } from "n3"
|
|
3
|
+
import path from "path"
|
|
4
|
+
import fs from "fs"
|
|
5
|
+
|
|
6
|
+
// Short alias for n3's DataFactory, used to rebuild quads without their graph component.
const df = DataFactory
|
|
7
|
+
|
|
8
|
+
// Prefix → namespace pairs that the pipeline steps spread into the prefix map
// they hand to writeTurtleFile.
export const COMMON_PREFIXES = {
|
|
9
|
+
schema: "http://schema.org/",
|
|
10
|
+
foaf: "http://xmlns.com/foaf/0.1/",
|
|
11
|
+
dct: "http://purl.org/dc/terms/",
|
|
12
|
+
}
|
|
13
|
+
|
|
14
|
+
// Dedupe via a Store and sort by subject so the Writer can emit grouped
|
|
15
|
+
// "subject p1 o1; p2 o2." blocks instead of repeating subjects. Strips
|
|
16
|
+
// graph names (writes triples, not quads).
|
|
17
|
+
/**
 * Serialises quads to a Turtle file, creating the parent directory if needed.
 * Duplicate triples are collapsed and graph names are dropped.
 *
 * @param {string} filePath    destination file
 * @param {Iterable} quads     quads to write; only subject, predicate and object are used
 * @param {object} [prefixes]  prefix → namespace map passed to the n3 Writer
 * @returns {Promise<void>}    resolves once the file is on disk; rejects on a
 *                             serialisation error or a file-system error
 */
export const writeTurtleFile = (filePath, quads, prefixes = {}) => new Promise((resolve, reject) => {
  // Re-adding the quads without a graph term lets the store collapse duplicates.
  const store = newStore()
  for (const q of quads) store.addQuad(df.quad(q.subject, q.predicate, q.object))
  // Sorted by subject so triples of the same subject reach the Writer back to back.
  const dedup = store.getQuads(null, null, null, null)
    .sort((a, b) => a.subject.value.localeCompare(b.subject.value))
  const writer = new Writer({ prefixes })
  for (const q of dedup) writer.addQuad(q)
  writer.end((err, result) => {
    if (err) return reject(err)
    // These calls run inside the Writer's callback. Route their failures to
    // reject() explicitly instead of depending on how the Writer treats an
    // exception thrown by its callback — otherwise the promise could stay
    // pending forever on e.g. a permission error.
    try {
      fs.mkdirSync(path.dirname(filePath), { recursive: true })
      fs.writeFileSync(filePath, result)
    } catch (ioErr) {
      return reject(ioErr)
    }
    resolve()
  })
})
|
package/src/pipeline.js
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { ingest } from "./ingest.js"
|
|
2
|
-
import { federate } from "./federate.js"
|
|
1
|
+
import { ingest } from "./pipeline/ingest.js"
|
|
2
|
+
import { federate } from "./pipeline/federate.js"
|
|
3
3
|
|
|
4
4
|
// Programmatic entry: hold the instance root once, run the engines against it.
|
|
5
5
|
// The CLI (bin/cli.js) is this same class with defaults — root = cwd.
|
package/src/utils.js
CHANGED
|
@@ -57,6 +57,7 @@ export const PATHS = {
|
|
|
57
57
|
federation: "config/federation.ttl",
|
|
58
58
|
matchKnowledge: "config/match-knowledge.ttl",
|
|
59
59
|
about: "webapp/content/about.md",
|
|
60
|
+
query: "webapp/content/query.sparql",
|
|
60
61
|
fetchScript: (name) => `sources/${name}/fetch.js`,
|
|
61
62
|
exporter: (name) => `webapp/exporters/${name}.js`,
|
|
62
63
|
staticDir: (name) => `sources/${name}/static/`,
|
|
@@ -87,11 +88,6 @@ export const parseTtl = (turtle) => new Parser().parse(turtle)
|
|
|
87
88
|
export const prefixesOf = (turtle) =>
|
|
88
89
|
Object.fromEntries([...turtle.matchAll(/^\s*@?prefix\s+([\w-]*):\s*<([^>]*)>/gim)].map(([, p, ns]) => [p, ns]))
|
|
89
90
|
|
|
90
|
-
// Turtle with RDF-star triple terms in subject position (the engine's
|
|
91
|
-
// provenance annotations) — plain Turtle parsing disallows those, N3 mode
|
|
92
|
-
// accepts them.
|
|
93
|
-
export const parseTtlStar = (turtle) => new Parser({ format: "text/n3" }).parse(turtle)
|
|
94
|
-
|
|
95
91
|
// {prefix: namespace} → "PREFIX p1: <ns1>\nPREFIX p2: <ns2>"
|
|
96
92
|
export const buildPrefixBlock = (prefixMap) =>
|
|
97
93
|
Object.entries(prefixMap).map(([p, ns]) => `PREFIX ${p}: <${ns}>`).join("\n")
|
package/webapp/src/App.jsx
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { HashRouter, Routes, Route, NavLink } from "react-router-dom"
|
|
2
2
|
import "./styles.css"
|
|
3
|
-
import { repositoryUrl } from "./instanceData.js"
|
|
3
|
+
import { federationLabel, repositoryUrl } from "./instanceData.js"
|
|
4
4
|
import About from "./About.jsx"
|
|
5
5
|
import React, { lazy, Suspense, useState } from "react"
|
|
6
6
|
|
|
@@ -16,6 +16,8 @@ const MergeTables = lazy(() => import("./MergeTables.jsx"))
|
|
|
16
16
|
const Query = lazy(() => import("./Query.jsx"))
|
|
17
17
|
const Sources = lazy(() => import("./Sources.jsx"))
|
|
18
18
|
|
|
19
|
+
if (federationLabel) document.title = federationLabel
|
|
20
|
+
|
|
19
21
|
const STORAGE_KEY = "showFederation"
|
|
20
22
|
|
|
21
23
|
const initialShowFed = () => {
|
package/webapp/src/Card.jsx
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
// Presentational building blocks: <Card> (titled box) and <KeyValueTable>.
|
|
2
2
|
// Reads: props (title, children, rows)
|
|
3
|
-
// Does: renders DOM; used by
|
|
3
|
+
// Does: renders DOM; used by EntityCard and Sources
|
|
4
4
|
|
|
5
5
|
import React from "react"
|
|
6
6
|
|
|
7
7
|
export default function Card({ title, tag, children }) {
|
|
8
8
|
return (
|
|
9
|
-
<div className="
|
|
10
|
-
<div className="
|
|
9
|
+
<div className="entity-card">
|
|
10
|
+
<div className="entity-card-header">
|
|
11
11
|
<code>{title}</code>
|
|
12
12
|
{tag && <span style={{ marginLeft: "0.6rem", fontSize: 11, color: "#888", fontFamily: "monospace" }}>{tag}</span>}
|
|
13
13
|
</div>
|
|
@@ -246,7 +246,7 @@ export default function ColumnGraph({ nodes, edges, columns, colors, centerColum
|
|
|
246
246
|
const [draggingId, setDraggingId] = useState(null)
|
|
247
247
|
const [hoveredEdge, setHoveredEdge] = useState(null)
|
|
248
248
|
const hoverCtx = useMemo(() => ({ id: hoveredEdge, set: setHoveredEdge }), [hoveredEdge])
|
|
249
|
-
// Sync edges when value labels change (e.g. selecting a different
|
|
249
|
+
// Sync edges when value labels change (e.g. selecting a different entity) so
|
|
250
250
|
// the user keeps any node positions they've dragged.
|
|
251
251
|
useEffect(() => { setRfEdges(flowEdges) }, [flowEdges, setRfEdges])
|
|
252
252
|
|
package/webapp/src/Directory.jsx
CHANGED
|
@@ -1,15 +1,15 @@
|
|
|
1
|
-
// Consumer-facing directory: one compact card per resolved
|
|
2
|
-
// Reads:
|
|
3
|
-
// Does: renders the Directory page (list of compact <
|
|
1
|
+
// Consumer-facing directory: one compact card per resolved entity.
|
|
2
|
+
// Reads: finalEntities from mergeEntities.js (← data/pipeline/final.ttl)
|
|
3
|
+
// Does: renders the Directory page (list of compact <EntityCard>)
|
|
4
4
|
|
|
5
|
-
import
|
|
6
|
-
import {
|
|
5
|
+
import EntityCard from "./EntityCard.jsx"
|
|
6
|
+
import { finalEntities } from "./mergeEntities.js"
|
|
7
7
|
import React from "react"
|
|
8
8
|
|
|
9
9
|
export default function Directory() {
|
|
10
10
|
return (
|
|
11
11
|
<div className="page" style={{ overflowY: "auto", height: "100%" }}>
|
|
12
|
-
{
|
|
12
|
+
{finalEntities.map((entity) => <EntityCard key={entity.iri} entity={entity} compact={true} highlight={false} />)}
|
|
13
13
|
</div>
|
|
14
14
|
)
|
|
15
15
|
}
|
|
@@ -1,8 +1,8 @@
|
|
|
1
|
-
// Renders one
|
|
1
|
+
// Renders one entity as a card (narrow key/value, or wide per-source table)
|
|
2
2
|
// with source tags and conflict highlighting. Also exports the conflict helpers.
|
|
3
3
|
// Reads: config/federation.ttl, data/ingest/ingest-log.ttl (via sourceMeta.js);
|
|
4
|
-
//
|
|
5
|
-
// Does: renders <
|
|
4
|
+
// entity objects from loadMerge.js
|
|
5
|
+
// Does: renders <EntityCard>; exports EXPECTED_MULTI, isConflict (used by mergeEntities, MergeTables)
|
|
6
6
|
|
|
7
7
|
import { federationTtl, ingestLogTtl as logTtl } from "./instanceData.js"
|
|
8
8
|
import Card, { KeyValueTable } from "./Card.jsx"
|
|
@@ -10,7 +10,7 @@ import { loadHarvestBySource, loadSourceMeta } from "./sourceMeta.js"
|
|
|
10
10
|
import { CDP, parseTtl } from "@directory-builder/core/utils"
|
|
11
11
|
import React, { useState } from "react"
|
|
12
12
|
|
|
13
|
-
//
|
|
13
|
+
// entity.columns are one entry per contributing record (resolved in loadMerge); look
|
|
14
14
|
// up source display data in config (notation, label) and the harvest log (time).
|
|
15
15
|
const sourceMeta = loadSourceMeta(federationTtl)
|
|
16
16
|
const harvestBySource = loadHarvestBySource(logTtl)
|
|
@@ -81,12 +81,12 @@ function ValueCell({ values, highlight }) {
|
|
|
81
81
|
)
|
|
82
82
|
}
|
|
83
83
|
|
|
84
|
-
function
|
|
85
|
-
return <KeyValueTable rows={
|
|
84
|
+
function EntityCardNarrow({ entity, highlight }) {
|
|
85
|
+
return <KeyValueTable rows={entity.fields.map((f) => ({ key: f.predicate, label: f.predLabel, value: <ValueCell values={f.values} highlight={highlight && isConflict(f)} /> }))} />
|
|
86
86
|
}
|
|
87
87
|
|
|
88
|
-
function
|
|
89
|
-
const columns =
|
|
88
|
+
function EntityCardWide({ entity, highlight }) {
|
|
89
|
+
const columns = entity.columns
|
|
90
90
|
return (
|
|
91
91
|
<table>
|
|
92
92
|
<thead>
|
|
@@ -100,7 +100,7 @@ function OrgCardWide({ org, highlight }) {
|
|
|
100
100
|
</tr>
|
|
101
101
|
</thead>
|
|
102
102
|
<tbody>
|
|
103
|
-
{
|
|
103
|
+
{entity.fields.map((f) => {
|
|
104
104
|
const conflict = highlight && isConflict(f) ? conflictStyle(f.values.length) : undefined
|
|
105
105
|
return (
|
|
106
106
|
<tr key={f.predicate}>
|
|
@@ -117,10 +117,10 @@ function OrgCardWide({ org, highlight }) {
|
|
|
117
117
|
)
|
|
118
118
|
}
|
|
119
119
|
|
|
120
|
-
export default function
|
|
120
|
+
export default function EntityCard({ entity, compact, highlight }) {
|
|
121
121
|
return (
|
|
122
|
-
<Card title={
|
|
123
|
-
{compact ? <
|
|
122
|
+
<Card title={entity.label} tag={entity.type}>
|
|
123
|
+
{compact ? <EntityCardNarrow entity={entity} highlight={highlight} /> : <EntityCardWide entity={entity} highlight={highlight} />}
|
|
124
124
|
</Card>
|
|
125
125
|
)
|
|
126
126
|
}
|
package/webapp/src/MapGraph.jsx
CHANGED
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
// Map view: the source-schema → target-schema mapping graph, optionally animated
|
|
2
|
-
// with one
|
|
2
|
+
// with one entity's field values flowing through the transform nodes.
|
|
3
3
|
// Reads: config/federation.ttl, data/pipeline/mapped.ttl,
|
|
4
4
|
// data/pipeline/cleaned/*.ttl (via loadMap.js + sourceMeta.js)
|
|
5
5
|
// Does: renders the Map page (horizontal <ColumnGraph>)
|
|
6
6
|
|
|
7
7
|
import { federationTtl as ttl, mappedTtl, cleanedByPath } from "./instanceData.js"
|
|
8
|
-
import { loadMap, loadSources,
|
|
8
|
+
import { loadMap, loadSources, loadEntitiesBySource, loadFieldValuesByEntity } from "./loadMap.js"
|
|
9
9
|
import React, { useEffect, useMemo, useRef, useState } from "react"
|
|
10
10
|
import { loadCleanedBySource } from "./sourceMeta.js"
|
|
11
11
|
import { SkipBack, SkipForward } from "lucide-react"
|
|
@@ -27,10 +27,10 @@ const VALUE_LABEL_BG = {
|
|
|
27
27
|
}
|
|
28
28
|
|
|
29
29
|
const SOURCES = loadSources(ttl)
|
|
30
|
-
const
|
|
30
|
+
const ENTITIES_BY_SOURCE = loadEntitiesBySource(ttl, mappedTtl)
|
|
31
31
|
// Source-to-file mapping is resolved from config: instanceData enumerates the
|
|
32
32
|
// cleaned TTLs from :hasSource, so a new source needs no edit here.
|
|
33
|
-
const FIELD_VALUES =
|
|
33
|
+
const FIELD_VALUES = loadFieldValuesByEntity(ttl, mappedTtl, loadCleanedBySource(ttl, cleanedByPath))
|
|
34
34
|
|
|
35
35
|
function SourcesDropdown({ visible, onChange }) {
|
|
36
36
|
const [open, setOpen] = useState(false)
|
|
@@ -81,7 +81,7 @@ function SourcesDropdown({ visible, onChange }) {
|
|
|
81
81
|
)
|
|
82
82
|
}
|
|
83
83
|
|
|
84
|
-
function
|
|
84
|
+
function EntityCombobox({ entities, value, onChange, disabled }) {
|
|
85
85
|
const [open, setOpen] = useState(false)
|
|
86
86
|
const [filter, setFilter] = useState("")
|
|
87
87
|
const ref = useRef(null)
|
|
@@ -93,9 +93,9 @@ function OrgCombobox({ orgs, value, onChange, disabled }) {
|
|
|
93
93
|
return () => document.removeEventListener("mousedown", onDown)
|
|
94
94
|
}, [open])
|
|
95
95
|
|
|
96
|
-
const selected =
|
|
96
|
+
const selected = entities.find(o => o.iri === value)
|
|
97
97
|
const f = filter.toLowerCase()
|
|
98
|
-
const filtered = f ?
|
|
98
|
+
const filtered = f ? entities.filter(o => o.id.toLowerCase().includes(f) || o.name.toLowerCase().includes(f)) : entities
|
|
99
99
|
|
|
100
100
|
return (
|
|
101
101
|
<div ref={ref} style={{ position: "relative" }}>
|
|
@@ -103,7 +103,7 @@ function OrgCombobox({ orgs, value, onChange, disabled }) {
|
|
|
103
103
|
type="text"
|
|
104
104
|
disabled={disabled}
|
|
105
105
|
value={open ? filter : (selected?.name || selected?.id || "")}
|
|
106
|
-
placeholder={disabled ? "" : "Pick
|
|
106
|
+
placeholder={disabled ? "" : "Pick entity…"}
|
|
107
107
|
onChange={(e) => { setFilter(e.target.value); if (!open) setOpen(true) }}
|
|
108
108
|
onFocus={() => { setFilter(""); setOpen(true) }}
|
|
109
109
|
style={{
|
|
@@ -137,7 +137,7 @@ function OrgCombobox({ orgs, value, onChange, disabled }) {
|
|
|
137
137
|
|
|
138
138
|
export default function MapGraph() {
|
|
139
139
|
const [visible, setVisible] = useState(() => new Set(SOURCES.map(s => s.iri)))
|
|
140
|
-
const [
|
|
140
|
+
const [selectedEntity, setSelectedEntity] = useState(null)
|
|
141
141
|
const [dataFlow, setDataFlow] = useState(false)
|
|
142
142
|
const [showUnmapped, setShowUnmapped] = useState(false)
|
|
143
143
|
const [showAllTargets, setShowAllTargets] = useState(false)
|
|
@@ -150,7 +150,7 @@ export default function MapGraph() {
|
|
|
150
150
|
|
|
151
151
|
const oneActive = visible.size === 1
|
|
152
152
|
const enabled = dataFlow && oneActive
|
|
153
|
-
const valueByField = enabled &&
|
|
153
|
+
const valueByField = enabled && selectedEntity ? FIELD_VALUES.get(selectedEntity) : null
|
|
154
154
|
const edges = useMemo(() => {
|
|
155
155
|
if (!valueByField) return rawEdges
|
|
156
156
|
const typeOf = new Map(nodes.map(n => [n.id, n.type]))
|
|
@@ -171,29 +171,29 @@ export default function MapGraph() {
|
|
|
171
171
|
}, [rawEdges, nodes, valueByField, showDirectFlows])
|
|
172
172
|
|
|
173
173
|
// Remount when the visible node set changes (sources or unmapped-fields
|
|
174
|
-
// toggle).
|
|
174
|
+
// toggle). Entity / data-flow changes only update edge labels in place.
|
|
175
175
|
const graphKey = useMemo(() => `${[...visible].sort().join("|")}::${showUnmapped ? "all" : "mapped"}::${showAllTargets ? "allT" : "mappedT"}`, [visible, showUnmapped, showAllTargets])
|
|
176
176
|
|
|
177
177
|
const activeSource = oneActive ? [...visible][0] : null
|
|
178
|
-
const
|
|
178
|
+
const entities = activeSource ? (ENTITIES_BY_SOURCE.get(activeSource) ?? []) : []
|
|
179
179
|
|
|
180
180
|
useEffect(() => {
|
|
181
|
-
if (
|
|
182
|
-
if (!
|
|
183
|
-
} else if (
|
|
184
|
-
|
|
181
|
+
if (entities.length > 0) {
|
|
182
|
+
if (!entities.find(o => o.iri === selectedEntity)) setSelectedEntity(entities[0].iri)
|
|
183
|
+
} else if (selectedEntity !== null) {
|
|
184
|
+
setSelectedEntity(null)
|
|
185
185
|
}
|
|
186
|
-
}, [
|
|
186
|
+
}, [entities])
|
|
187
187
|
|
|
188
188
|
useEffect(() => {
|
|
189
189
|
if (!oneActive && dataFlow) setDataFlow(false)
|
|
190
190
|
}, [oneActive])
|
|
191
191
|
|
|
192
192
|
const cycle = (delta) => {
|
|
193
|
-
if (
|
|
194
|
-
const idx =
|
|
195
|
-
const next = ((idx < 0 ? 0 : idx + delta) +
|
|
196
|
-
|
|
193
|
+
if (entities.length === 0) return
|
|
194
|
+
const idx = entities.findIndex(o => o.iri === selectedEntity)
|
|
195
|
+
const next = ((idx < 0 ? 0 : idx + delta) + entities.length) % entities.length
|
|
196
|
+
setSelectedEntity(entities[next].iri)
|
|
197
197
|
}
|
|
198
198
|
|
|
199
199
|
const disabledHint = !dataFlow
|
|
@@ -232,7 +232,7 @@ export default function MapGraph() {
|
|
|
232
232
|
</label>
|
|
233
233
|
<div style={{ display: "flex", alignItems: "center", gap: "0.25rem" }}>
|
|
234
234
|
<button disabled={!enabled} onClick={() => cycle(-1)} title={enabled ? "Previous" : disabledHint} style={iconBtnStyle}><SkipBack size={13} fill="currentColor" /></button>
|
|
235
|
-
<
|
|
235
|
+
<EntityCombobox entities={entities} value={selectedEntity} onChange={setSelectedEntity} disabled={!enabled} />
|
|
236
236
|
<button disabled={!enabled} onClick={() => cycle(1)} title={enabled ? "Next" : disabledHint} style={iconBtnStyle}><SkipForward size={13} fill="currentColor" /></button>
|
|
237
237
|
</div>
|
|
238
238
|
</div>
|