@directory-builder/core 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +7 -2
- package/example/config/federation.ttl +1 -10
- package/package.json +2 -2
- package/src/clean/default.sparql +19 -0
- package/src/pipeline/federate.js +8 -6
- package/src/pipeline/steps/clean.js +26 -3
- package/src/pipeline/steps/fetch.js +12 -2
- package/src/pipeline/steps/map.js +9 -8
- package/src/pipeline/steps/merge.js +8 -8
- package/src/pipeline/steps/resolve.js +5 -4
- package/src/utils.js +13 -0
- package/src/validate.js +15 -11
- package/test/helpers/instance.js +24 -0
- package/test/pipeline.test.js +90 -0
- package/example/sources/cityopen/fetch.js +0 -14
- package/example/sources/civichub/fetch.js +0 -14
package/README.md
CHANGED
|
@@ -11,7 +11,7 @@ artefacts — no engine code:
|
|
|
11
11
|
config/
|
|
12
12
|
federation.ttl # the decisions: sources + facts, target schemas,
|
|
13
13
|
# field mappings, match/merge/resolve rules
|
|
14
|
-
match-knowledge.ttl # curated owl:sameAs pairs
|
|
14
|
+
match-knowledge.ttl # optional: curated owl:sameAs pairs
|
|
15
15
|
sources/<name>/
|
|
16
16
|
fetch.js # how to fetch this source
|
|
17
17
|
clean.sparql # how to clean its lifted RDF
|
|
@@ -64,7 +64,12 @@ fixture.
|
|
|
64
64
|
|
|
65
65
|
Each source's `fetch.js` is invoked as `node fetch.js <outDir> <fetchUrl-or-staticDir>
|
|
66
66
|
<runParamsJson>` — the JSON holds all `:hasRunParam` values grouped by name;
|
|
67
|
-
each fetcher picks the parameters it needs.
|
|
67
|
+
each fetcher picks the parameters it needs. For static-file sources `fetch.js`
|
|
68
|
+
is optional: without one, the default fetch copies `sources/<name>/static/`
|
|
69
|
+
verbatim. `clean.sparql` is likewise optional when the source maps a field to
|
|
70
|
+
`schema:identifier`: the engine derives a default clean from that mapping —
|
|
71
|
+
skolemise on the identifier field, copy the scalar fields — and puts the
|
|
72
|
+
resolved query on record under `data/pipeline/default-clean-queries/`.
|
|
68
73
|
|
|
69
74
|
A source declared with `:enabled false` stays in the config but is skipped by
|
|
70
75
|
the engines and hidden from the webapp's Sources page — e.g. while its files
|
|
@@ -2,13 +2,12 @@
|
|
|
2
2
|
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
|
|
3
3
|
@prefix schema: <http://schema.org/> .
|
|
4
4
|
@prefix foaf: <http://xmlns.com/foaf/0.1/> .
|
|
5
|
-
@prefix prov: <http://www.w3.org/ns/prov#> .
|
|
6
5
|
@prefix skos: <http://www.w3.org/2004/02/skos/core#> .
|
|
7
6
|
@prefix ft: <http://publications.europa.eu/resource/authority/file-type/> .
|
|
8
7
|
|
|
9
8
|
# A minimal single-entity federation: two directories of libraries, merged into
|
|
10
9
|
# one set of schema:Organization records. Config declares decisions only —
|
|
11
|
-
# sources and their facts, target schema, mappings, match/merge/resolve rules.
|
|
10
|
+
# sources and their facts, target schema, mappings, match/resolve rules.
|
|
12
11
|
# The engines own the step shape (fetch → lift, clean → map → match → merge →
|
|
13
12
|
# resolve) and resolve all file paths by convention from the source names.
|
|
14
13
|
# Add target schemas / mappings / match rules to model more entity types
|
|
@@ -18,7 +17,6 @@
|
|
|
18
17
|
:hasSource :cityopenSource, :civichubSource ;
|
|
19
18
|
:hasTargetSchema :organisationSchema ;
|
|
20
19
|
:hasMatchRule :organisationMatch ;
|
|
21
|
-
:hasMergeRule :merge ;
|
|
22
20
|
:hasResolveRule :resolve .
|
|
23
21
|
|
|
24
22
|
# ---- Target schema ------------------------------------------------------
|
|
@@ -87,7 +85,6 @@
|
|
|
87
85
|
:cityopen-mapping a :Mapping ;
|
|
88
86
|
:fromSource :cityopenSource ;
|
|
89
87
|
:toTarget :organisationSchema ;
|
|
90
|
-
:sourceGraph <urn:source:cityopen> ;
|
|
91
88
|
:hasFieldMapping
|
|
92
89
|
[ :from :co-id ; :to :t-identifier ] ,
|
|
93
90
|
[ :from :co-name ; :to :t-name ] ,
|
|
@@ -101,7 +98,6 @@
|
|
|
101
98
|
:civichub-mapping a :Mapping ;
|
|
102
99
|
:fromSource :civichubSource ;
|
|
103
100
|
:toTarget :organisationSchema ;
|
|
104
|
-
:sourceGraph <urn:source:civichub> ;
|
|
105
101
|
:hasFieldMapping
|
|
106
102
|
[ :from :ch-uid ; :to :t-identifier ] ,
|
|
107
103
|
[ :from :ch-bezeichnung ; :to :t-name ] ,
|
|
@@ -124,11 +120,6 @@
|
|
|
124
120
|
:minScore 0.5 ;
|
|
125
121
|
:hasWeightedCriterion [ :on schema:name ; :weight 1.0 ] .
|
|
126
122
|
|
|
127
|
-
# ---- Merge --------------------------------------------------------------
|
|
128
|
-
|
|
129
|
-
:merge a :MergeRule ;
|
|
130
|
-
:originPredicate prov:wasDerivedFrom .
|
|
131
|
-
|
|
132
123
|
# ---- Resolve ------------------------------------------------------------
|
|
133
124
|
# One value per predicate per merged record; alphabeticFirst is deterministic.
|
|
134
125
|
|
package/package.json
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@directory-builder/core",
|
|
3
|
-
"version": "0.1.3",
|
|
3
|
+
"version": "0.1.4",
|
|
4
4
|
"description": "Use-case-agnostic engine for config-driven federation pipelines",
|
|
5
5
|
"author": "Civic Data Lab",
|
|
6
6
|
"repository": "github:foederierter-datenpool/directory-builder-core",
|
|
7
7
|
"license": "MIT",
|
|
8
8
|
"type": "module",
|
|
9
9
|
"scripts": {
|
|
10
|
-
"test": "node --test",
|
|
10
|
+
"test": "node --test 'test/*.test.js'",
|
|
11
11
|
"example": "cd example && node ../bin/cli.js",
|
|
12
12
|
"webapp": "vite webapp",
|
|
13
13
|
"webapp:build": "vite build webapp"
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
# Default clean, applied when a source ships no clean.sparql: skolemise each
|
|
2
|
+
# record from its identifier field (the source field mapped to the target
|
|
3
|
+
# schema's schema:identifier) into a stable cdp:__name__-<id> IRI, copy its
|
|
4
|
+
# scalar fields verbatim, and tag the source. The engine fills __source__,
|
|
5
|
+
# __name__ and __idPath__ from federation.ttl and puts the resolved query on
|
|
6
|
+
# record under data/pipeline/default-clean-queries/.
|
|
7
|
+
|
|
8
|
+
PREFIX xyz: <http://sparql.xyz/facade-x/data/>
|
|
9
|
+
PREFIX cdp: <https://civic-data.de/pipeline#>
|
|
10
|
+
|
|
11
|
+
CONSTRUCT {
|
|
12
|
+
?record cdp:fromSource __source__ ;
|
|
13
|
+
?p ?o .
|
|
14
|
+
} WHERE {
|
|
15
|
+
?node xyz:__idPath__ ?id ;
|
|
16
|
+
?p ?o .
|
|
17
|
+
FILTER(isLiteral(?o))
|
|
18
|
+
BIND(IRI(CONCAT(STR(cdp:), "__name__-", STR(?id))) AS ?record)
|
|
19
|
+
}
|
package/src/pipeline/federate.js
CHANGED
|
@@ -25,18 +25,20 @@ const df = DataFactory
|
|
|
25
25
|
export async function federate(root = process.cwd()) {
|
|
26
26
|
const abs = (p) => path.join(root, p)
|
|
27
27
|
const federationTtl = fs.readFileSync(abs(PATHS.federation), "utf8")
|
|
28
|
-
|
|
29
|
-
const
|
|
28
|
+
// match-knowledge.ttl (curated owl:sameAs pairs) is optional — no file, no manual matches.
|
|
29
|
+
const matchKnowledge = fs.existsSync(abs(PATHS.matchKnowledge)) ? [fs.readFileSync(abs(PATHS.matchKnowledge), "utf8")] : []
|
|
30
|
+
const defStore = storeFromTurtles([federationTtl, ...matchKnowledge])
|
|
31
|
+
const federationQuads = parseTtl(federationTtl)
|
|
32
|
+
const sources = enabledSources(federationQuads)
|
|
30
33
|
|
|
31
34
|
const store = newStore()
|
|
32
35
|
const journal = stepJournal()
|
|
33
|
-
const ctx = { store, defStore, abs }
|
|
36
|
+
const ctx = { store, defStore, abs, quads: federationQuads }
|
|
34
37
|
|
|
35
38
|
const cleanSteps = []
|
|
36
39
|
for (const src of sources) {
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
() => runClean(ctx, name)))
|
|
40
|
+
cleanSteps.push(await journal.step("clean", { source: src, after: [stepIri("lift", sourceName(src))] },
|
|
41
|
+
() => runClean(ctx, src)))
|
|
40
42
|
}
|
|
41
43
|
|
|
42
44
|
// Load each source's cleaned TTL into its own graph — plain mechanics, not a
|
|
@@ -1,13 +1,21 @@
|
|
|
1
1
|
import { sparqlConstruct, storeFromTurtles } from "@foerderfunke/sem-ops-utils"
|
|
2
|
+
import { CDP, identifierFieldPath, PATHS, sourceName } from "../../utils.js"
|
|
2
3
|
import { writeTurtleFile } from "../write-turtle.js"
|
|
3
|
-
import { CDP, PATHS } from "../../utils.js"
|
|
4
4
|
import path from "path"
|
|
5
5
|
import fs from "fs"
|
|
6
6
|
|
|
7
|
+
// The default clean ships with the engine, like the lift queries.
|
|
8
|
+
const DEFAULT_CLEAN = path.join(import.meta.dirname, "../../clean/default.sparql")
|
|
9
|
+
|
|
7
10
|
// Clean step: the source's clean.sparql reshapes its lifted RDF into
|
|
8
11
|
// federation subjects (xyz:/cdp: vocabulary only — schema: enters at map).
|
|
9
|
-
|
|
10
|
-
|
|
12
|
+
// clean.sparql is optional when the source maps a field to schema:identifier:
|
|
13
|
+
// the engine then derives the default clean from that mapping.
|
|
14
|
+
export const runClean = async ({ abs, quads }, sourceIri) => {
|
|
15
|
+
const name = sourceName(sourceIri)
|
|
16
|
+
const cleanQuery = fs.existsSync(abs(PATHS.cleanQuery(name)))
|
|
17
|
+
? fs.readFileSync(abs(PATHS.cleanQuery(name)), "utf8")
|
|
18
|
+
: defaultClean({ abs, quads }, sourceIri, name)
|
|
11
19
|
const inDir = PATHS.lifted(name)
|
|
12
20
|
const outPath = PATHS.cleaned(name)
|
|
13
21
|
// Run CONSTRUCT per file so each lifted TTL stays isolated in its
|
|
@@ -25,3 +33,18 @@ export const runClean = async ({ abs }, name) => {
|
|
|
25
33
|
cdp: CDP,
|
|
26
34
|
})
|
|
27
35
|
}
|
|
36
|
+
|
|
37
|
+
// No clean.sparql given: resolve the engine's default template with the
|
|
38
|
+
// source's identifier field as skolem key, and put the applied query on
|
|
39
|
+
// record under data/ — no silent fallbacks.
|
|
40
|
+
const defaultClean = ({ abs, quads }, sourceIri, name) => {
|
|
41
|
+
const idPath = identifierFieldPath(quads, sourceIri)
|
|
42
|
+
if (!idPath) throw new Error(`${PATHS.cleanQuery(name)} missing and no schema:identifier mapping to derive the default clean from`)
|
|
43
|
+
const query = fs.readFileSync(DEFAULT_CLEAN, "utf8")
|
|
44
|
+
.replaceAll("__source__", `<${sourceIri}>`).replaceAll("__name__", name).replaceAll("__idPath__", idPath)
|
|
45
|
+
const outPath = abs(PATHS.defaultCleanQuery(name))
|
|
46
|
+
fs.mkdirSync(path.dirname(outPath), { recursive: true })
|
|
47
|
+
fs.writeFileSync(outPath, query)
|
|
48
|
+
console.log(`clean ${name} default (id field: ${idPath}) → ${PATHS.defaultCleanQuery(name)}`)
|
|
49
|
+
return query
|
|
50
|
+
}
|
|
@@ -6,7 +6,8 @@ import fs from "fs"
|
|
|
6
6
|
// Fetch step: run the source's fetch.js. Live sources pass their :fetchUrl;
|
|
7
7
|
// static-file sources pass the absolute static dir instead — the script gets
|
|
8
8
|
// whichever applies, plus the federation's run params as one JSON argument.
|
|
9
|
-
//
|
|
9
|
+
// fetch.js is optional for static sources: without it, the default fetch
|
|
10
|
+
// copies static/ verbatim. Returns the harvest record for the ingest log.
|
|
10
11
|
export const runFetch = ({ abs, root }, { name, fetchUrl, paramsJson }) => {
|
|
11
12
|
const outDir = PATHS.raw(name)
|
|
12
13
|
const origin = fetchUrl ?? abs(PATHS.staticDir(name))
|
|
@@ -14,7 +15,9 @@ export const runFetch = ({ abs, root }, { name, fetchUrl, paramsJson }) => {
|
|
|
14
15
|
// Clear any prior output first, so changed run params (or changed records) can't leave stale files behind
|
|
15
16
|
fs.rmSync(abs(outDir), { recursive: true, force: true })
|
|
16
17
|
fs.mkdirSync(abs(outDir), { recursive: true })
|
|
17
|
-
|
|
18
|
+
const script = abs(PATHS.fetchScript(name))
|
|
19
|
+
if (fs.existsSync(script)) run("node", [script, abs(outDir), origin, paramsJson])
|
|
20
|
+
else localCopyFallback({ name, fetchUrl, origin, outDir: abs(outDir) })
|
|
18
21
|
const harvest = { time: new Date().toISOString() }
|
|
19
22
|
// Static sources have no live harvest — record the files' git commit
|
|
20
23
|
// time instead (the freshness the Sources page shows for them).
|
|
@@ -24,3 +27,10 @@ export const runFetch = ({ abs, root }, { name, fetchUrl, paramsJson }) => {
|
|
|
24
27
|
} catch { /* not committed yet / no git → omit */ }
|
|
25
28
|
return harvest
|
|
26
29
|
}
|
|
30
|
+
|
|
31
|
+
// Fallback when a source ships no dedicated fetch.js: static sources get
|
|
32
|
+
// their static/ dir copied verbatim; live sources have no fallback yet.
|
|
33
|
+
const localCopyFallback = ({ name, fetchUrl, origin, outDir }) => {
|
|
34
|
+
if (fetchUrl) throw new Error(`${PATHS.fetchScript(name)} missing (no default fetch for live sources yet)`)
|
|
35
|
+
fs.cpSync(origin, outDir, { recursive: true })
|
|
36
|
+
}
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { sparqlInsertDelete, sparqlSelect } from "@foerderfunke/sem-ops-utils"
|
|
2
|
-
import { buildPrefixBlock, CDP, PATHS, shrink, sourceName } from "../../utils.js"
|
|
2
|
+
import { buildPrefixBlock, CDP, PATHS, shrink, sourceGraph, sourceName } from "../../utils.js"
|
|
3
3
|
import { DataFactory } from "n3"
|
|
4
4
|
import path from "path"
|
|
5
5
|
import fs from "fs"
|
|
@@ -88,10 +88,9 @@ ${insertBlock}
|
|
|
88
88
|
export const runMap = async ({ store, defStore, abs }, queriesDir) => {
|
|
89
89
|
const mappings = await sparqlSelect(`
|
|
90
90
|
PREFIX : <${CDP}>
|
|
91
|
-
SELECT ?mapping ?source ?sourceGraph ?target ?targetClass WHERE {
|
|
91
|
+
SELECT ?mapping ?source ?target ?targetClass WHERE {
|
|
92
92
|
?mapping a :Mapping ;
|
|
93
93
|
:fromSource ?source .
|
|
94
|
-
OPTIONAL { ?mapping :sourceGraph ?sourceGraph }
|
|
95
94
|
OPTIONAL { ?mapping :toTarget ?target }
|
|
96
95
|
OPTIONAL { ?mapping :toTarget/:targetClass ?targetClass }
|
|
97
96
|
} ORDER BY ?mapping`, [defStore])
|
|
@@ -108,9 +107,11 @@ export const runMap = async ({ store, defStore, abs }, queriesDir) => {
|
|
|
108
107
|
OPTIONAL { ?parent :hasSubField ?src . ?parent :fieldPath ?parentPath }
|
|
109
108
|
}`, [defStore])
|
|
110
109
|
|
|
111
|
-
if (directRows.length
|
|
110
|
+
if (directRows.length) {
|
|
112
111
|
const localName = m.mapping.split("#").pop()
|
|
113
|
-
|
|
112
|
+
// The mapping's source graph follows by convention from :fromSource —
|
|
113
|
+
// the load step names it the same way.
|
|
114
|
+
const query = buildDirectInsert({ ...m, sourceGraph: sourceGraph(sourceName(m.source)) }, directRows)
|
|
114
115
|
const queryPath = abs(path.join(queriesDir, `${localName}.sparql`))
|
|
115
116
|
fs.mkdirSync(path.dirname(queryPath), { recursive: true })
|
|
116
117
|
fs.writeFileSync(queryPath, query)
|
|
@@ -139,9 +140,9 @@ export const runMap = async ({ store, defStore, abs }, queriesDir) => {
|
|
|
139
140
|
// the merge step rewrites them to the minted cluster IRIs.
|
|
140
141
|
const linkRows = await sparqlSelect(`
|
|
141
142
|
PREFIX : <${CDP}>
|
|
142
|
-
SELECT ?mapping ?
|
|
143
|
+
SELECT ?mapping ?source ?fromSchema ?sourcePredicate ?targetPredicate ?toSchema WHERE {
|
|
143
144
|
?mapping a :Mapping ;
|
|
144
|
-
:
|
|
145
|
+
:fromSource ?source ;
|
|
145
146
|
:toTarget ?fromSchema ;
|
|
146
147
|
:hasRelationship ?rel .
|
|
147
148
|
?rel :sourcePredicate ?sourcePredicate ;
|
|
@@ -160,7 +161,7 @@ INSERT {
|
|
|
160
161
|
?from ${short(rel.targetPredicate)} ?to .
|
|
161
162
|
}
|
|
162
163
|
} WHERE {
|
|
163
|
-
GRAPH <${rel.sourceGraph}> {
|
|
164
|
+
GRAPH <${sourceGraph(sourceName(rel.source))}> {
|
|
164
165
|
?from ${short(rel.sourcePredicate)} ?to ;
|
|
165
166
|
cdp:targetSchema ${short(rel.fromSchema)} .
|
|
166
167
|
?to cdp:targetSchema ${short(rel.toSchema)} .
|
|
@@ -11,22 +11,22 @@ export const MERGED_GRAPH = df.namedNode("urn:merged")
|
|
|
11
11
|
|
|
12
12
|
const RDF_REIFIES = df.namedNode("http://www.w3.org/1999/02/22-rdf-syntax-ns#reifies")
|
|
13
13
|
|
|
14
|
+
// Engine invariant, mirrored by the webapp's loadMerge: each derivation's
|
|
15
|
+
// origin hangs off its reifier via prov:wasDerivedFrom.
|
|
16
|
+
const PROV_DERIVED_FROM = df.namedNode("http://www.w3.org/ns/prov#wasDerivedFrom")
|
|
17
|
+
|
|
14
18
|
export const runMerge = async ({ store, defStore, abs }, outPath, provOutPath) => {
|
|
15
19
|
const [cfg] = await sparqlSelect(`
|
|
16
20
|
PREFIX : <${CDP}>
|
|
17
|
-
SELECT ?ns ?
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
}`, [defStore])
|
|
21
|
-
if (!cfg) throw new Error(":MergeRule / :MatchRule config missing in federation.ttl")
|
|
22
|
-
const { ns: namespace, originPred } = cfg
|
|
21
|
+
SELECT ?ns WHERE { ?match a :MatchRule ; :targetNamespace ?ns . }`, [defStore])
|
|
22
|
+
if (!cfg) throw new Error(":MatchRule config missing in federation.ttl")
|
|
23
|
+
const namespace = cfg.ns
|
|
23
24
|
|
|
24
25
|
const memberQuads = store.getQuads(null, HAS_MEMBER, null, MATCH_GRAPH)
|
|
25
26
|
const mintedFor = new Map()
|
|
26
27
|
for (const mq of memberQuads) mintedFor.set(mq.object.value, mq.subject)
|
|
27
28
|
|
|
28
29
|
const fedQuads = store.getQuads(null, null, null, MAPPED_GRAPH)
|
|
29
|
-
const originPredNode = df.namedNode(originPred)
|
|
30
30
|
const provQuads = []
|
|
31
31
|
for (const qu of fedQuads) {
|
|
32
32
|
const minted = mintedFor.get(qu.subject.value)
|
|
@@ -43,7 +43,7 @@ export const runMerge = async ({ store, defStore, abs }, outPath, provOutPath) =
|
|
|
43
43
|
// per-derivation metadata (time, confidence) has a home when needed.
|
|
44
44
|
const reifier = df.blankNode()
|
|
45
45
|
provQuads.push(df.quad(reifier, RDF_REIFIES, df.quad(minted, qu.predicate, object)))
|
|
46
|
-
provQuads.push(df.quad(reifier, originPredNode, qu.subject))
|
|
46
|
+
provQuads.push(df.quad(reifier, PROV_DERIVED_FROM, qu.subject))
|
|
47
47
|
}
|
|
48
48
|
|
|
49
49
|
const mergedQuads = store.getQuads(null, null, null, MERGED_GRAPH)
|
|
@@ -26,11 +26,12 @@ export const runResolve = async ({ store, defStore, abs }, outPath) => {
|
|
|
26
26
|
const [cfg] = await sparqlSelect(`
|
|
27
27
|
PREFIX : <${CDP}>
|
|
28
28
|
SELECT ?strategy ?ns WHERE {
|
|
29
|
-
?
|
|
30
|
-
?
|
|
29
|
+
?match a :MatchRule ; :targetNamespace ?ns .
|
|
30
|
+
OPTIONAL { ?resolve a :ResolveRule ; :defaultStrategy ?strategy }
|
|
31
31
|
}`, [defStore])
|
|
32
|
-
if (!cfg) throw new Error(":
|
|
33
|
-
|
|
32
|
+
if (!cfg) throw new Error(":MatchRule config missing in federation.ttl")
|
|
33
|
+
// No :ResolveRule (or none with a :defaultStrategy) → alphabeticFirst.
|
|
34
|
+
const defaultPick = lookupStrategy(cfg.strategy ?? `${CDP}alphabeticFirst`)
|
|
34
35
|
|
|
35
36
|
const overrideRows = await sparqlSelect(`
|
|
36
37
|
PREFIX : <${CDP}>
|
package/src/utils.js
CHANGED
|
@@ -69,6 +69,7 @@ export const PATHS = {
|
|
|
69
69
|
ingestLog: "data/ingest/ingest-log.ttl",
|
|
70
70
|
federateLog: "data/pipeline/federate-log.ttl",
|
|
71
71
|
mappingQueries: "data/pipeline/direct-mapping-queries/",
|
|
72
|
+
defaultCleanQuery: (name) => `data/pipeline/default-clean-queries/${name}.sparql`,
|
|
72
73
|
mapped: "data/pipeline/mapped.ttl",
|
|
73
74
|
matches: "data/pipeline/matches.ttl",
|
|
74
75
|
merged: "data/pipeline/merged.ttl",
|
|
@@ -115,6 +116,18 @@ export const enabledSources = (quads) => {
|
|
|
115
116
|
return objectsOf(quads, `${CDP}hasSource`).filter((iri) => !disabled.has(iri))
|
|
116
117
|
}
|
|
117
118
|
|
|
119
|
+
// The source's skolem key for the default clean: the :fieldPath of the source
|
|
120
|
+
// field whose mapping points at the target field with :targetPredicate
|
|
121
|
+
// schema:identifier. Undefined when the source declares no such mapping.
|
|
122
|
+
export const identifierFieldPath = (quads, sourceIri) => {
|
|
123
|
+
const o = (s, p) => quads.filter((q) => q.subject.value === s && q.predicate.value === `${CDP}${p}`).map((q) => q.object.value)
|
|
124
|
+
for (const m of quads.filter((q) => q.predicate.value === `${CDP}fromSource` && q.object.value === sourceIri).map((q) => q.subject.value)) {
|
|
125
|
+
for (const fm of o(m, "hasFieldMapping")) {
|
|
126
|
+
if (o(o(fm, "to")[0], "targetPredicate")[0] === "http://schema.org/identifier") return o(o(fm, "from")[0], "fieldPath")[0]
|
|
127
|
+
}
|
|
128
|
+
}
|
|
129
|
+
}
|
|
130
|
+
|
|
118
131
|
// Set of subjects typed `rdf:type typeIri`. Iteration order = encounter order.
|
|
119
132
|
export function subjectsOfType(quads, typeIri) {
|
|
120
133
|
const out = new Set()
|
package/src/validate.js
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { buildValidator, turtleToDataset } from "@foerderfunke/sem-ops-utils"
|
|
2
|
-
import { CDP, objectsOf, parseTtl, PATHS, shrink, sourceName } from "./utils.js"
|
|
2
|
+
import { CDP, identifierFieldPath, objectsOf, parseTtl, PATHS, shrink, sourceName } from "./utils.js"
|
|
3
3
|
import path from "path"
|
|
4
4
|
import fs from "fs"
|
|
5
5
|
|
|
@@ -19,23 +19,27 @@ export async function validate(root = process.cwd()) {
|
|
|
19
19
|
return (await Promise.all(checks.map((check) => check(ctx)))).flat()
|
|
20
20
|
}
|
|
21
21
|
|
|
22
|
-
// Every :hasSource in federation.ttl has its
|
|
23
|
-
// fetch.js
|
|
24
|
-
//
|
|
25
|
-
//
|
|
22
|
+
// Every :hasSource in federation.ttl has what its engine steps need: a
|
|
23
|
+
// fetch.js or static/ to default to, a clean.sparql or a schema:identifier
|
|
24
|
+
// mapping to derive the default clean from - and no sources/ folder exists
|
|
25
|
+
// that the federation doesn't declare. Checks all declared sources, enabled
|
|
26
|
+
// or not: folder presence is a repo-layout contract.
|
|
26
27
|
function sourcesFoldersInSync({ abs, quads }) {
|
|
27
|
-
const declared = objectsOf(quads, `${CDP}hasSource`)
|
|
28
|
+
const declared = objectsOf(quads, `${CDP}hasSource`)
|
|
28
29
|
const problems = []
|
|
29
|
-
for (const
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
30
|
+
for (const iri of declared) {
|
|
31
|
+
const name = sourceName(iri)
|
|
32
|
+
if (![PATHS.fetchScript(name), PATHS.staticDir(name)].some((f) => fs.existsSync(abs(f))))
|
|
33
|
+
problems.push(`${PATHS.fetchScript(name)} missing and no ${PATHS.staticDir(name)} to default to`)
|
|
34
|
+
if (!fs.existsSync(abs(PATHS.cleanQuery(name))) && !identifierFieldPath(quads, iri))
|
|
35
|
+
problems.push(`${PATHS.cleanQuery(name)} missing and no schema:identifier mapping to derive the default clean from`)
|
|
33
36
|
}
|
|
37
|
+
const declaredNames = declared.map(sourceName)
|
|
34
38
|
const folders = fs.existsSync(abs("sources"))
|
|
35
39
|
? fs.readdirSync(abs("sources"), { withFileTypes: true }).filter((d) => d.isDirectory()).map((d) => d.name)
|
|
36
40
|
: []
|
|
37
41
|
for (const name of folders) {
|
|
38
|
-
if (!
|
|
42
|
+
if (!declaredNames.includes(name)) problems.push(`sources/${name}/ has no :hasSource declaration in ${PATHS.federation}`)
|
|
39
43
|
}
|
|
40
44
|
return problems
|
|
41
45
|
}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import { PATHS } from "@directory-builder/core/utils"
|
|
2
|
+
import path from "path"
|
|
3
|
+
import fs from "fs"
|
|
4
|
+
|
|
5
|
+
// SPARQL Anything cache shared with example/ — a fixture's tools/ symlinks
|
|
6
|
+
// here, so test runs never re-download the jar.
|
|
7
|
+
const TOOLS_CACHE = path.join(import.meta.dirname, "../../example/tools")
|
|
8
|
+
|
|
9
|
+
// Materialize an in-test instance definition (federation.ttl string + records
|
|
10
|
+
// per source) into test/tmp/<name>/ — a real instance folder the engines run
|
|
11
|
+
// against, wiped at setup and left in place afterwards for inspection.
|
|
12
|
+
export const makeInstance = (name, { federation, sources }) => {
|
|
13
|
+
const root = path.join(import.meta.dirname, "../tmp", name)
|
|
14
|
+
fs.rmSync(root, { recursive: true, force: true })
|
|
15
|
+
fs.mkdirSync(path.join(root, "config"), { recursive: true })
|
|
16
|
+
fs.writeFileSync(path.join(root, PATHS.federation), federation)
|
|
17
|
+
for (const [source, records] of Object.entries(sources)) {
|
|
18
|
+
fs.mkdirSync(path.join(root, PATHS.staticDir(source)), { recursive: true })
|
|
19
|
+
fs.writeFileSync(path.join(root, PATHS.staticDir(source), "data.json"), JSON.stringify(records, null, 4))
|
|
20
|
+
}
|
|
21
|
+
fs.mkdirSync(TOOLS_CACHE, { recursive: true })
|
|
22
|
+
fs.symlinkSync(TOOLS_CACHE, path.join(root, "tools"))
|
|
23
|
+
return root
|
|
24
|
+
}
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
import { parseTtl, PATHS } from "@directory-builder/core/utils"
|
|
2
|
+
import { Pipeline, validate } from "@directory-builder/core"
|
|
3
|
+
import { makeInstance } from "./helpers/instance.js"
|
|
4
|
+
import assert from "node:assert/strict"
|
|
5
|
+
import { test } from "node:test"
|
|
6
|
+
import path from "path"
|
|
7
|
+
import fs from "fs"
|
|
8
|
+
|
|
9
|
+
// The ultra-minimal instance: federation.ttl + two static JSON sources,
|
|
10
|
+
// nothing else — fetch, clean and resolve all run on engine defaults. The
|
|
11
|
+
// sources share one record by name ("Entry One"), so the pipeline should
|
|
12
|
+
// merge a1+b1 and leave a2 and b2 as their own entities.
|
|
13
|
+
|
|
14
|
+
const federation = `
|
|
15
|
+
@prefix : <https://civic-data.de/pipeline#> .
|
|
16
|
+
@prefix schema: <http://schema.org/> .
|
|
17
|
+
@prefix ft: <http://publications.europa.eu/resource/authority/file-type/> .
|
|
18
|
+
|
|
19
|
+
:federation a :Federation ;
|
|
20
|
+
:hasSource :alphaSource, :betaSource .
|
|
21
|
+
|
|
22
|
+
:thingSchema a :TargetSchema ;
|
|
23
|
+
:targetClass schema:Thing .
|
|
24
|
+
|
|
25
|
+
:t-id a :TargetField ; :targetPredicate schema:identifier .
|
|
26
|
+
:t-name a :TargetField ; :targetPredicate schema:name .
|
|
27
|
+
|
|
28
|
+
:alphaSource a :Source ; :format ft:JSON .
|
|
29
|
+
:betaSource a :Source ; :format ft:JSON .
|
|
30
|
+
|
|
31
|
+
:alpha-id a :SourceField ; :fieldPath "id" .
|
|
32
|
+
:alpha-name a :SourceField ; :fieldPath "name" .
|
|
33
|
+
:beta-id a :SourceField ; :fieldPath "id" .
|
|
34
|
+
:beta-label a :SourceField ; :fieldPath "label" .
|
|
35
|
+
|
|
36
|
+
:alpha-mapping a :Mapping ; :fromSource :alphaSource ; :toTarget :thingSchema ;
|
|
37
|
+
:hasFieldMapping [ :from :alpha-id ; :to :t-id ] , [ :from :alpha-name ; :to :t-name ] .
|
|
38
|
+
|
|
39
|
+
:beta-mapping a :Mapping ; :fromSource :betaSource ; :toTarget :thingSchema ;
|
|
40
|
+
:hasFieldMapping [ :from :beta-id ; :to :t-id ] , [ :from :beta-label ; :to :t-name ] .
|
|
41
|
+
|
|
42
|
+
:match a :MatchRule ;
|
|
43
|
+
:forTarget :thingSchema ;
|
|
44
|
+
:targetNamespace "urn:test:" ;
|
|
45
|
+
:mintedSubjectPrefix "thing-" ;
|
|
46
|
+
:minScore 1.0 ;
|
|
47
|
+
:hasWeightedCriterion [ :on schema:name ; :weight 1.0 ] .
|
|
48
|
+
`
|
|
49
|
+
|
|
50
|
+
const alpha = [
|
|
51
|
+
{ id: "a1", name: "Entry One" },
|
|
52
|
+
{ id: "a2", name: "Entry Two" },
|
|
53
|
+
]
|
|
54
|
+
const beta = [
|
|
55
|
+
{ id: "b1", label: "Entry One" },
|
|
56
|
+
{ id: "b2", label: "Entry Three" },
|
|
57
|
+
]
|
|
58
|
+
|
|
59
|
+
const root = makeInstance("tiny", { federation, sources: { alpha, beta } })
|
|
60
|
+
|
|
61
|
+
const expectedFinal = `@prefix schema: <http://schema.org/>.
|
|
62
|
+
@prefix foaf: <http://xmlns.com/foaf/0.1/>.
|
|
63
|
+
@prefix dct: <http://purl.org/dc/terms/>.
|
|
64
|
+
@prefix cdf: <urn:test:>.
|
|
65
|
+
|
|
66
|
+
cdf:thing-5a45645edb31 a schema:Thing;
|
|
67
|
+
schema:name "Entry Two".
|
|
68
|
+
cdf:thing-616feb993283 a schema:Thing;
|
|
69
|
+
schema:name "Entry One".
|
|
70
|
+
cdf:thing-d1583c098826 a schema:Thing;
|
|
71
|
+
schema:name "Entry Three".
|
|
72
|
+
`
|
|
73
|
+
|
|
74
|
+
test("the tiny fixture validates and runs the whole pipeline on defaults", async () => {
|
|
75
|
+
// the fixture satisfies the instance contract (folders, derivable defaults, shape)
|
|
76
|
+
assert.deepEqual(await validate(root), [])
|
|
77
|
+
await new Pipeline({ root }).run()
|
|
78
|
+
const finalTtl = fs.readFileSync(path.join(root, PATHS.final), "utf8")
|
|
79
|
+
const final = parseTtl(finalTtl)
|
|
80
|
+
// match merged a1+b1 on their identical name; a2 and b2 stay their own entities
|
|
81
|
+
const subjects = new Set(final.map((q) => q.subject.value))
|
|
82
|
+
assert.equal(subjects.size, 3, "a1+b1 merge, a2 and b2 stay alone")
|
|
83
|
+
// entity IRIs are minted from the match rule's :targetNamespace + :mintedSubjectPrefix
|
|
84
|
+
for (const s of subjects) assert.match(s, /^urn:test:thing-/)
|
|
85
|
+
// map carried both sources' name fields through, resolve kept one value per entity
|
|
86
|
+
const names = final.filter((q) => q.predicate.value === "http://schema.org/name").map((q) => q.object.value)
|
|
87
|
+
assert.deepEqual(names.toSorted(), ["Entry One", "Entry Three", "Entry Two"])
|
|
88
|
+
// and the consumer-facing artifact as a whole
|
|
89
|
+
assert.equal(finalTtl, expectedFinal)
|
|
90
|
+
})
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
import path from "path"
|
|
2
|
-
import fs from "fs"
|
|
3
|
-
|
|
4
|
-
// Static-file source: copy the committed JSON straight into the ingest area.
|
|
5
|
-
// A live source would instead call an API here and write the responses out.
|
|
6
|
-
// argv: [outDir, sourceDir, runParamsJson] — params unused for this static example.
|
|
7
|
-
const OUT_DIR = process.argv[2]
|
|
8
|
-
const SRC_DIR = process.argv[3]
|
|
9
|
-
|
|
10
|
-
fs.mkdirSync(OUT_DIR, { recursive: true })
|
|
11
|
-
for (const f of fs.readdirSync(SRC_DIR).filter((f) => f.endsWith(".json"))) {
|
|
12
|
-
fs.copyFileSync(path.join(SRC_DIR, f), path.join(OUT_DIR, f))
|
|
13
|
-
console.log(` ${f} → ${OUT_DIR}`)
|
|
14
|
-
}
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
import path from "path"
|
|
2
|
-
import fs from "fs"
|
|
3
|
-
|
|
4
|
-
// Static-file source: copy the committed JSON straight into the ingest area.
|
|
5
|
-
// A live source would instead call an API here and write the responses out.
|
|
6
|
-
// argv: [outDir, sourceDir, runParamsJson] — params unused for this static example.
|
|
7
|
-
const OUT_DIR = process.argv[2]
|
|
8
|
-
const SRC_DIR = process.argv[3]
|
|
9
|
-
|
|
10
|
-
fs.mkdirSync(OUT_DIR, { recursive: true })
|
|
11
|
-
for (const f of fs.readdirSync(SRC_DIR).filter((f) => f.endsWith(".json"))) {
|
|
12
|
-
fs.copyFileSync(path.join(SRC_DIR, f), path.join(OUT_DIR, f))
|
|
13
|
-
console.log(` ${f} → ${OUT_DIR}`)
|
|
14
|
-
}
|