@directory-builder/core 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +13 -2
- package/package.json +1 -1
- package/src/index.js +2 -2
- package/src/pipeline/federate.js +69 -0
- package/src/pipeline/ingest.js +97 -0
- package/src/pipeline/run.js +8 -0
- package/src/pipeline/steps/clean.js +27 -0
- package/src/pipeline/steps/fetch.js +24 -0
- package/src/pipeline/steps/lift.js +58 -0
- package/src/pipeline/steps/map.js +172 -0
- package/src/pipeline/steps/match.js +212 -0
- package/src/pipeline/steps/merge.js +59 -0
- package/src/pipeline/steps/resolve.js +54 -0
- package/src/pipeline/write-turtle.js +30 -0
- package/src/pipeline.js +2 -2
- package/src/utils.js +1 -5
- package/webapp/src/App.jsx +3 -1
- package/webapp/src/Card.jsx +3 -3
- package/webapp/src/ColumnGraph.jsx +1 -1
- package/webapp/src/Directory.jsx +6 -6
- package/webapp/src/{OrgCard.jsx → EntityCard.jsx} +12 -12
- package/webapp/src/MapGraph.jsx +22 -22
- package/webapp/src/MatchGraph.jsx +3 -3
- package/webapp/src/MergeTables.jsx +43 -36
- package/webapp/src/Query.jsx +4 -9
- package/webapp/src/instanceData.js +6 -2
- package/webapp/src/loadMap.js +22 -22
- package/webapp/src/loadMerge.js +30 -32
- package/webapp/src/loadSources.js +1 -1
- package/webapp/src/mergeEntities.js +15 -0
- package/webapp/src/sourceMeta.js +1 -1
- package/webapp/src/styles.css +6 -6
- package/webapp/vite.js +1 -1
- package/src/federate.js +0 -571
- package/src/ingest.js +0 -158
- package/webapp/src/mergeOrgs.js +0 -15
package/README.md
CHANGED
|
@@ -93,8 +93,10 @@ INSTANCE=../sosuse-directory-builder npm run webapp # any other instance dir
|
|
|
93
93
|
|
|
94
94
|
Instances own the About page by providing `webapp/content/about.md` (markdown,
|
|
95
95
|
served and deployed like config and data); without one, a generic default
|
|
96
|
-
renders
|
|
97
|
-
|
|
96
|
+
renders — and the Query page's starting query the same way, via
|
|
97
|
+
`webapp/content/query.sparql`. On the `:federation` node, `rdfs:label` sets
|
|
98
|
+
the page title and `:repository "https://github.com/…"` adds the GitHub links
|
|
99
|
+
(nav, static-source folders); both stay generic/hidden when absent.
|
|
98
100
|
|
|
99
101
|
Instances can inject **exporters** — output adapters mapping the directory
|
|
100
102
|
into an external schema. The federation declares them (`:federation
|
|
@@ -110,3 +112,12 @@ exported separately so bundlers never see the engines' Node imports:
|
|
|
110
112
|
```js
|
|
111
113
|
import { CDP, parseTtl, PATHS } from "@directory-builder/core/utils"
|
|
112
114
|
```
|
|
115
|
+
|
|
116
|
+
## Roadmap
|
|
117
|
+
|
|
118
|
+
- Testing
|
|
119
|
+
- Periodic harvesting
|
|
120
|
+
- `@directory-builder/create`: an npm initializer scaffolding a new use
|
|
121
|
+
case, plus a `validate` command checking an instance setup
|
|
122
|
+
- `@directory-builder/ui`: extract the webapp into its own package
|
|
123
|
+
- ...
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@directory-builder/core",
|
|
3
|
-
"version": "0.1.0",
|
|
3
|
+
"version": "0.1.1",
|
|
4
4
|
"description": "Use-case-agnostic engine for config-driven federation pipelines",
|
|
5
5
|
"author": "Civic Data Lab",
|
|
6
6
|
"repository": "github:foederierter-datenpool/directory-builder-core",
|
package/src/index.js
CHANGED
|
@@ -2,5 +2,5 @@
|
|
|
2
2
|
// "./utils" subpath export — import those from "@directory-builder/core/utils"
|
|
3
3
|
// so bundlers never see the engines' fs/child_process imports.
|
|
4
4
|
export { Pipeline } from "./pipeline.js"
|
|
5
|
-
export { ingest } from "./ingest.js"
|
|
6
|
-
export { federate } from "./federate.js"
|
|
5
|
+
export { ingest } from "./pipeline/ingest.js"
|
|
6
|
+
export { federate } from "./pipeline/federate.js"
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
import { newStore, parser as n3Parser, storeFromTurtles } from "@foerderfunke/sem-ops-utils"
|
|
2
|
+
import { CDP, objectsOf, parseTtl, PATHS, sourceGraph, sourceName, stepIri, stepJournal } from "../utils.js"
|
|
3
|
+
import { COMMON_PREFIXES, writeTurtleFile } from "./write-turtle.js"
|
|
4
|
+
import { MAPPED_GRAPH, runMap } from "./steps/map.js"
|
|
5
|
+
import { runClean } from "./steps/clean.js"
|
|
6
|
+
import { runMatch } from "./steps/match.js"
|
|
7
|
+
import { runMerge } from "./steps/merge.js"
|
|
8
|
+
import { runResolve } from "./steps/resolve.js"
|
|
9
|
+
import { DataFactory } from "n3"
|
|
10
|
+
import path from "path"
|
|
11
|
+
import fs from "fs"
|
|
12
|
+
|
|
13
|
+
// Short alias for n3's DataFactory (imported above); used below to build named nodes and quads.
const df = DataFactory
|
|
14
|
+
|
|
15
|
+
// ---- Federate engine -----------------------------------------------------
// Clean per source, load, then map → match → merge → resolve (one module per
// step under steps/, sharing the ctx of store + config + path resolver). The
// step sequence is the engine's own shape; config declares only the sources,
// processed in :hasSource declaration order. Paths follow from the source
// name (PATHS), resolved against the instance `root`. Each step runs through
// the journal, which records what executed and is rendered by the webapp's
// Pipeline page. The clean steps' predecessors are the other engine's lift
// steps, referenced by their conventional stepIri.

/**
 * Run the federate pipeline for one instance directory.
 *
 * @param {string} [root=process.cwd()] - Instance directory that every PATHS entry is joined onto.
 * @returns {Promise<void>} Resolves once all steps have run and the federate log has been written.
 */
export async function federate(root = process.cwd()) {
  const abs = (p) => path.join(root, p)
  const federationTtl = fs.readFileSync(abs(PATHS.federation), "utf8")
  // defStore holds the configuration (federation + match knowledge); `store` below holds the data.
  const defStore = storeFromTurtles([federationTtl, fs.readFileSync(abs(PATHS.matchKnowledge), "utf8")])
  const sources = objectsOf(parseTtl(federationTtl), `${CDP}hasSource`)

  const store = newStore()
  const journal = stepJournal()
  const ctx = { store, defStore, abs }

  const cleanSteps = []
  for (const src of sources) {
    const name = sourceName(src)
    cleanSteps.push(await journal.step("clean", { source: src, after: [stepIri("lift", name)] },
      () => runClean(ctx, name)))
  }

  // Load each source's cleaned TTL into its own graph — plain mechanics, not a
  // pipeline step.
  for (const src of sources) {
    const name = sourceName(src)
    console.log(`load ${PATHS.cleaned(name)} → <${sourceGraph(name)}>`)
    const graph = df.namedNode(sourceGraph(name))
    for (const quad of n3Parser.parse(fs.readFileSync(abs(PATHS.cleaned(name)), "utf8"))) {
      store.addQuad(df.quad(quad.subject, quad.predicate, quad.object, graph))
    }
  }

  const mapStep = await journal.step("map", { after: cleanSteps }, async () => {
    await runMap(ctx, PATHS.mappingQueries)
    const mappedQuads = store.getQuads(null, null, null, MAPPED_GRAPH)
    await writeTurtleFile(abs(PATHS.mapped), mappedQuads, { ...COMMON_PREFIXES, cdp: CDP })
    console.log(`map: wrote ${mappedQuads.length} triples → ${PATHS.mapped}`)
  })
  const matchStep = await journal.step("match", { after: [mapStep] }, () => runMatch(ctx, PATHS.matches))
  const mergeStep = await journal.step("merge", { after: [matchStep] }, () => runMerge(ctx, PATHS.merged, PATHS.provenance))
  await journal.step("resolve", { after: [mergeStep] }, () => runResolve(ctx, PATHS.final))

  // Create the log's parent directory first, as the ingest engine does for its
  // own log, so the write cannot fail on a missing directory.
  fs.mkdirSync(path.dirname(abs(PATHS.federateLog)), { recursive: true })
  fs.writeFileSync(abs(PATHS.federateLog), `@prefix : <${CDP}> .
@prefix p-plan: <http://purl.org/net/p-plan#> .

${journal.toTurtle()}
`)
  console.log(`log: wrote steps → ${PATHS.federateLog}`)
}
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
import { sparqlSelect, storeFromTurtles } from "@foerderfunke/sem-ops-utils"
|
|
2
|
+
import { CDP, objectsOf, parseTtl, PATHS, sourceName, stepJournal } from "../utils.js"
|
|
3
|
+
import { ensureJar, runLift } from "./steps/lift.js"
|
|
4
|
+
import { runFetch } from "./steps/fetch.js"
|
|
5
|
+
import path from "path"
|
|
6
|
+
import fs from "fs"
|
|
7
|
+
|
|
8
|
+
/**
 * Ingest engine: fetch + lift per source declared in the instance's
 * federation.ttl.
 *
 * @param {string} [root=process.cwd()] - Instance directory all PATHS resolve against.
 * @returns {Promise<void>} Resolves once every source has been fetched and lifted and the ingest log is written.
 * @throws {Error} If a declared source has no :format (needed to pick the lift query).
 */
export async function ingest(root = process.cwd()) {
  const abs = (p) => path.join(root, p)
  const federationTtl = fs.readFileSync(abs(PATHS.federation), "utf8")
  const defStore = storeFromTurtles([federationTtl])

  // ---- Read the sources ------------------------------------------------
  // The step graph (fetch → lift per source) is the engine's own shape;
  // config declares only the sources and their facts. Lift params are SPARQL
  // Anything variables declared per source. Sources run in :hasSource
  // declaration order.

  const facts = new Map()
  for (const r of await sparqlSelect(`
PREFIX : <${CDP}>
SELECT ?source ?fetchUrl ?format ?paramName ?paramValue WHERE {
:federation :hasSource ?source .
OPTIONAL { ?source :fetchUrl ?fetchUrl }
OPTIONAL { ?source :format ?format }
OPTIONAL { ?source :hasLiftParam [ :name ?paramName ; :value ?paramValue ] }
}`, [defStore])) {
    // One result row per lift param: the first row creates the entry, every row may add a param.
    if (!facts.has(r.source)) facts.set(r.source, { fetchUrl: r.fetchUrl, format: r.format, params: [] })
    if (r.paramName) facts.get(r.source).params.push([r.paramName, r.paramValue])
  }
  const sources = new Map(objectsOf(parseTtl(federationTtl), `${CDP}hasSource`).map((iri) => [iri, facts.get(iri)]))
  for (const [iri, s] of sources) {
    // `s` is undefined when the query above returned no row for this IRI; `?.`
    // lets that case raise the same readable error instead of a TypeError.
    if (!s?.format) throw new Error(`${iri} declares no :format (needed to pick the lift query)`)
  }

  const jar = await ensureJar(abs)

  // ---- Run steps ----------------------------------------------------------

  // All :hasRunParam values grouped by name, handed to every fetcher as one
  // JSON argument — each fetcher picks the parameters it needs.
  const runParams = {}
  for (const r of await sparqlSelect(`
PREFIX : <${CDP}>
SELECT ?name ?value WHERE { :federation :hasRunParam [ :name ?name ; :value ?value ] } ORDER BY ?name ?value`, [defStore])) {
    (runParams[r.name] ??= []).push(r.value)
  }
  const paramsJson = JSON.stringify(runParams)

  const runStart = new Date()
  const harvests = []
  const journal = stepJournal()
  const fetchStepOf = new Map()
  const ctx = { abs, root }

  for (const [iri, s] of sources) {
    const name = sourceName(iri)
    fetchStepOf.set(iri, await journal.step("fetch", { source: iri }, () => {
      harvests.push({ source: iri, ...runFetch(ctx, { name, fetchUrl: s.fetchUrl, paramsJson }) })
    }))
  }

  for (const [iri, s] of sources) {
    const name = sourceName(iri)
    // Each lift step is journaled as following its own source's fetch step.
    await journal.step("lift", { source: iri, after: [fetchStepOf.get(iri)] },
      () => runLift(ctx, { jar, name, format: s.format, params: s.params }))
  }

  const dt = (s) => `"${s}"^^xsd:dateTime`
  // Run id: "run" + the first 14 digits of the ISO start time (yyyymmddhhmmss).
  const runId = "run" + runStart.toISOString().replace(/\D/g, "").slice(0, 14)
  const harvestPart = harvests.length
    ? ` ;\n :harvested\n` + harvests.map((h) => {
        const local = h.source.split("#").pop()
        const committed = h.staticCommittedAt ? ` ; :staticCommittedAt ${dt(h.staticCommittedAt)}` : ""
        return ` [ :ofSource :${local} ; prov:atTime ${dt(h.time)}${committed} ]`
      }).join(" ,\n")
    : ""

  const block = `
${journal.toTurtle()}

:${runId} a :IngestRun ;
prov:startedAtTime ${dt(runStart.toISOString())} ;
prov:endedAtTime ${dt(new Date().toISOString())}${harvestPart} .
`

  const prefixes = `@prefix : <${CDP}> .
@prefix p-plan: <http://purl.org/net/p-plan#> .
@prefix prov: <http://www.w3.org/ns/prov#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
`
  fs.mkdirSync(path.dirname(abs(PATHS.ingestLog)), { recursive: true })
  fs.writeFileSync(abs(PATHS.ingestLog), prefixes + block)
  console.log(`log: wrote steps + IngestRun → ${PATHS.ingestLog}`)
}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
import { spawnSync } from "child_process"
|
|
2
|
+
|
|
3
|
+
/**
 * Run an external command (a fetcher's node process, SPARQL Anything's java),
 * inheriting stdio; any unsuccessful outcome aborts the step.
 *
 * @param {string} cmd - Executable to spawn.
 * @param {string[]} args - Arguments passed to the executable.
 * @throws {Error} If the process cannot be spawned (the underlying error is
 *   attached as `cause`), is terminated by a signal, or exits non-zero.
 */
export const run = (cmd, args) => {
  const r = spawnSync(cmd, args, { stdio: "inherit" })
  const invocation = `${cmd} ${args.join(" ")}`
  // A launch failure (e.g. command not found) is reported via `error` with a
  // null status — surface the real reason instead of "Exit null".
  if (r.error) throw new Error(`Failed to run ${invocation}: ${r.error.message}`, { cause: r.error })
  // A signal-terminated child also has a null status; name the signal.
  if (r.signal) throw new Error(`Killed by ${r.signal}: ${invocation}`)
  if (r.status !== 0) throw new Error(`Exit ${r.status}: ${invocation}`)
}
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
import { sparqlConstruct, storeFromTurtles } from "@foerderfunke/sem-ops-utils"
|
|
2
|
+
import { writeTurtleFile } from "../write-turtle.js"
|
|
3
|
+
import { CDP, PATHS } from "../../utils.js"
|
|
4
|
+
import path from "path"
|
|
5
|
+
import fs from "fs"
|
|
6
|
+
|
|
7
|
+
/**
 * Clean step: the source's clean.sparql reshapes its lifted RDF into
 * federation subjects (xyz:/cdp: vocabulary only — schema: enters at map).
 *
 * @param {{ abs: (p: string) => string }} ctx - Step context; `abs` resolves a PATHS entry to an absolute path.
 * @param {string} name - Source name used to derive the query, input and output paths.
 * @returns {Promise<void>} Resolves once the cleaned Turtle file has been written.
 */
export const runClean = async ({ abs }, name) => {
  const cleanQuery = fs.readFileSync(abs(PATHS.cleanQuery(name)), "utf8")
  const inDir = PATHS.lifted(name)
  const outPath = PATHS.cleaned(name)
  // Run CONSTRUCT per file so each lifted TTL stays isolated in its
  // own store — the clean SPARQL can't cross-join across documents.
  const inAbs = abs(inDir)
  const files = fs.readdirSync(inAbs).filter(f => f.endsWith(".ttl")).sort()
  console.log(`clean ${inDir} (${files.length} files) → ${outPath}`)
  const allQuads = []
  for (const f of files) {
    const fileStore = storeFromTurtles([fs.readFileSync(path.join(inAbs, f), "utf8")])
    // Append one quad at a time: push(...result) passes every quad as a call
    // argument and can throw a RangeError on very large CONSTRUCT results.
    for (const quad of await sparqlConstruct(cleanQuery, [fileStore])) allQuads.push(quad)
  }
  await writeTurtleFile(abs(outPath), allQuads, {
    xyz: "http://sparql.xyz/facade-x/data/",
    cdp: CDP,
  })
}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import { PATHS } from "../../utils.js"
|
|
2
|
+
import { execSync } from "child_process"
|
|
3
|
+
import { run } from "../run.js"
|
|
4
|
+
import fs from "fs"
|
|
5
|
+
|
|
6
|
+
/**
 * Fetch step: executes the source's fetch.js in a node child process.
 *
 * The script receives the absolute raw output directory, an origin and the
 * federation's run params as a single JSON string. The origin is the
 * source's :fetchUrl for live sources, or the absolute static directory for
 * static-file sources.
 *
 * @param {{ abs: Function, root: string }} ctx - Path resolver and instance root (used as git's cwd).
 * @param {{ name: string, fetchUrl?: string, paramsJson: string }} source - Source facts.
 * @returns {{ time: string, staticCommittedAt?: string }} Harvest record for the ingest log.
 */
export const runFetch = ({ abs, root }, { name, fetchUrl, paramsJson }) => {
  const staticDir = () => PATHS.staticDir(name)
  const rawDir = PATHS.raw(name)
  const origin = fetchUrl ?? abs(staticDir())

  console.log(`fetch ${fetchUrl ?? staticDir()} (params ${paramsJson}) → ${rawDir}`)
  fs.mkdirSync(abs(rawDir), { recursive: true })
  run("node", [abs(PATHS.fetchScript(name)), abs(rawDir), origin, paramsJson])

  const harvest = { time: new Date().toISOString() }
  if (fetchUrl) return harvest

  // Static sources have no live harvest, so the git commit time of their
  // files stands in (the freshness the Sources page shows for them).
  try {
    const committedAt = execSync(`git log -1 --format=%cI -- "${staticDir()}"`, { cwd: root, encoding: "utf8" }).trim()
    if (committedAt) harvest.staticCommittedAt = committedAt
  } catch { /* not committed yet / no git → omit */ }
  return harvest
}
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
import { localName, PATHS } from "../../utils.js"
|
|
2
|
+
import { run } from "../run.js"
|
|
3
|
+
import path from "path"
|
|
4
|
+
import fs from "fs"
|
|
5
|
+
|
|
6
|
+
// SPARQL Anything release tag: used to build the jar's download URL and
// compared against the cached version file to decide whether to re-download.
const SPARQL_ANYTHING_VERSION = "v1.1.0"
|
|
7
|
+
|
|
8
|
+
// Lift queries are bundled with the engine, so — unlike everything in PATHS —
// they are located relative to this module rather than the instance root.
// The file name is the lower-cased local name of the format IRI.
const liftQueryFor = (formatIri) => {
  const queryFile = `${localName(formatIri).toLowerCase()}.sparql`
  return path.join(import.meta.dirname, "../../lift", queryFile)
}
|
|
12
|
+
|
|
13
|
+
/**
 * Makes sure the SPARQL Anything jar (the lift tool) is available for this
 * instance. The jar is cached under tools/ next to a version marker file; it
 * is downloaded when either file is missing or the marker does not match
 * SPARQL_ANYTHING_VERSION.
 *
 * @param {(p: string) => string} abs - Resolves an instance-relative path.
 * @returns {Promise<string>} Absolute path of the jar.
 * @throws {Error} If the download responds with a non-OK status.
 */
export async function ensureJar(abs) {
  const jarPath = abs("tools/sparql-anything.jar")
  const versionPath = abs("tools/sparql-anything.version")

  const cachedIsCurrent = fs.existsSync(jarPath)
    && fs.existsSync(versionPath)
    && fs.readFileSync(versionPath, "utf8").trim() === SPARQL_ANYTHING_VERSION
  if (cachedIsCurrent) return jarPath

  const url = `https://github.com/SPARQL-Anything/sparql.anything/releases/download/${SPARQL_ANYTHING_VERSION}/sparql-anything-${SPARQL_ANYTHING_VERSION}.jar`
  console.log(`Downloading sparql-anything ${SPARQL_ANYTHING_VERSION}...`)
  fs.mkdirSync(path.dirname(jarPath), { recursive: true })

  const response = await fetch(url)
  if (!response.ok) throw new Error(`Failed to fetch ${url}: ${response.status}`)

  // The jar is written before the marker, so the marker only ever names a
  // version whose jar write has completed.
  const bytes = Buffer.from(await response.arrayBuffer())
  fs.writeFileSync(jarPath, bytes)
  fs.writeFileSync(versionPath, SPARQL_ANYTHING_VERSION)
  console.log(`Saved to ${jarPath}`)
  return jarPath
}
|
|
33
|
+
|
|
34
|
+
/**
 * Lift step: converts every raw file of a source to Turtle by invoking
 * SPARQL Anything (java) with the bundled query for the source's :format and
 * the source's :hasLiftParam variables.
 *
 * @param {{ abs: Function }} ctx - Path resolver.
 * @param {{ jar: string, name: string, format: string, params: Array<[string, string]> }} source
 *   Jar path, source name, format IRI and [name, value] lift params.
 */
export const runLift = ({ abs }, { jar, name, format, params }) => {
  // TODO: one JVM is spawned per file (~1s startup each). Fine at small N;
  // revisit if a source crosses ~50 items. SPARQL Anything accepts
  // VALUES ?_location { … } in the lift query, which would let a single
  // invocation handle the whole batch.
  const liftQuery = liftQueryFor(format)

  // Fixed flags first, then one `-v name=value` pair per lift param.
  const javaArgs = (location, outPath) => [
    "-jar", jar, "-q", liftQuery,
    "-v", `location=${location}`,
    "-f", "TTL", "-o", outPath,
    ...[...params].flatMap(([pName, value]) => ["-v", `${pName}=${value}`]),
  ]

  const rawRel = PATHS.raw(name)
  const liftedRel = PATHS.lifted(name)
  const rawAbs = abs(rawRel)
  const liftedAbs = abs(liftedRel)

  // Dotfiles are skipped; the remaining entries are processed in sorted order.
  const inputs = fs.readdirSync(rawAbs).filter((entry) => !entry.startsWith(".")).sort()
  fs.mkdirSync(liftedAbs, { recursive: true })
  console.log(`lift ${rawRel} (${inputs.length} files) → ${liftedRel}`)

  for (const entry of inputs) {
    const stem = path.basename(entry, path.extname(entry))
    const target = path.join(liftedAbs, `${stem}.ttl`)
    run("java", javaArgs(path.join(rawAbs, entry), target))
  }
}
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
import { sparqlInsertDelete, sparqlSelect } from "@foerderfunke/sem-ops-utils"
|
|
2
|
+
import { buildPrefixBlock, CDP, PATHS, shrink, sourceName } from "../../utils.js"
|
|
3
|
+
import { DataFactory } from "n3"
|
|
4
|
+
import path from "path"
|
|
5
|
+
import fs from "fs"
|
|
6
|
+
|
|
7
|
+
// Short alias for n3's DataFactory (imported above); used below to create the mapped-graph named node.
const df = DataFactory
|
|
8
|
+
|
|
9
|
+
// Named graph <urn:mapped> — the graph the INSERT queries generated in this module write into.
export const MAPPED_GRAPH = df.namedNode("urn:mapped")
|
|
10
|
+
|
|
11
|
+
// ---- Direct-mapping generator ------------------------------------------
|
|
12
|
+
|
|
13
|
+
// Namespace bound to the `xyz:` prefix in the generated direct-mapping queries.
const XYZ = "http://sparql.xyz/facade-x/data/"
|
|
14
|
+
|
|
15
|
+
/**
 * Generates the SPARQL INSERT that copies a mapping's directly-mapped fields
 * from its source graph into <urn:mapped>.
 *
 * @param {{ sourceGraph: string, source: string, targetClass?: string, target?: string }} mapping
 *   Mapping row: graph to read from, source IRI, and optionally the target schema and its class.
 * @param {Array<{ fieldPath: string, predicate: string, parentPath?: string }>} fields
 *   Field rows; rows with a `parentPath` are read from the nested node under that parent.
 * @returns {string} The complete update query, prefix block included.
 */
const buildDirectInsert = ({ sourceGraph, source, targetClass, target }, fields) => {
  const prefixes = {
    xyz: XYZ,
    cdp: CDP,
    cdf: "https://civic-data.de/federated-directory#",
    schema: "http://schema.org/",
    foaf: "http://xmlns.com/foaf/0.1/",
    dct: "http://purl.org/dc/terms/",
  }

  // shrink() hands the IRI back unchanged when no prefix applies; such IRIs
  // are emitted in <…> form instead.
  const short = (iri) => {
    const compact = shrink(iri, prefixes)
    return compact !== iri ? compact : `<${iri}>`
  }

  const varOf = (fieldPath) => `?${fieldPath}`

  // The emptiness check goes through STR() so it works for any literal
  // datatype — a bare `?v != ""` errors on e.g. xsd:int and would silently
  // drop the field (AWO's numeric ids hit exactly this).
  const optionalLiteral = (subject, fieldPath) => {
    const variable = varOf(fieldPath)
    return `OPTIONAL { ${subject} xyz:${fieldPath} ${variable} . FILTER(isLiteral(${variable}) && STR(${variable}) != "") }`
  }

  const insertLines = fields
    .map(({ predicate, fieldPath }) => ` ?entity ${short(predicate)} ${varOf(fieldPath)} .`)
    .join("\n")

  // After the clean step, source subjects are federation IRIs identified via
  // cdp:fromSource — nothing is minted from a key field. Where clean splits
  // one source into several entity kinds it tags each subject with
  // cdp:targetSchema, and only subjects tagged for this mapping's schema are
  // selected. Untagged subjects (single-entity sources like caritas/dhs)
  // match unconditionally.
  const patterns = [`?entity cdp:fromSource ${short(source)} .`]
  if (target) {
    patterns.push(
      `OPTIONAL { ?entity cdp:targetSchema ?_ts }`,
      `FILTER(!bound(?_ts) || ?_ts = ${short(target)})`,
    )
  }

  // Single pass: top-level fields become patterns right away, sub-fields are
  // grouped by parent (in first-seen order) and appended afterwards.
  const subFieldsByParent = new Map()
  for (const field of fields) {
    if (!field.parentPath) {
      patterns.push(optionalLiteral("?entity", field.fieldPath))
      continue
    }
    const siblings = subFieldsByParent.get(field.parentPath)
    if (siblings) siblings.push(field)
    else subFieldsByParent.set(field.parentPath, [field])
  }

  Array.from(subFieldsByParent).forEach(([parentPath, children], index) => {
    const parentVar = `?_p${index}`
    const nested = children.map((child) => ` ${optionalLiteral(parentVar, child.fieldPath)}`).join("\n")
    patterns.push(`OPTIONAL {\n ?entity xyz:${parentPath} ${parentVar} .\n${nested}\n }`)
  })

  // The target schema's :targetClass becomes the record's rdf:type in the
  // mapped graph — schema: vocabulary first enters here; the clean step
  // stays in xyz:/cdp: only.
  const typeClause = targetClass ? `a ${short(targetClass)} ; ` : ""

  return [
    `${buildPrefixBlock(prefixes)}`,
    "",
    "INSERT {",
    "GRAPH <urn:mapped> {",
    `?entity ${typeClause}cdp:fromSource ${short(source)} .`,
    insertLines,
    "}",
    "} WHERE {",
    `GRAPH <${sourceGraph}> {`,
    patterns.join("\n "),
    "}",
    "}",
  ].join("\n")
}
|
|
87
|
+
|
|
88
|
+
/**
 * Map step: for every :Mapping in the config, applies its generated
 * direct-mapping INSERT and its :via transform scripts to the data store,
 * then applies the relationship links declared via :hasRelationship.
 *
 * @param {{ store: object, defStore: object, abs: Function }} ctx - Data store, config store and path resolver.
 * @param {string} queriesDir - Instance-relative directory the generated direct-mapping queries are written to.
 * @returns {Promise<void>}
 */
export const runMap = async ({ store, defStore, abs }, queriesDir) => {
  const select = (lines) => sparqlSelect(lines.join("\n"), [defStore])
  const update = (text) => sparqlInsertDelete(text, store)
  const fragment = (iri) => iri.split("#").pop()

  const mappings = await select([
    "",
    `PREFIX : <${CDP}>`,
    "SELECT ?mapping ?source ?sourceGraph ?target ?targetClass WHERE {",
    "?mapping a :Mapping ;",
    ":fromSource ?source .",
    "OPTIONAL { ?mapping :sourceGraph ?sourceGraph }",
    "OPTIONAL { ?mapping :toTarget ?target }",
    "OPTIONAL { ?mapping :toTarget/:targetClass ?targetClass }",
    "} ORDER BY ?mapping",
  ])

  for (const mapping of mappings) {
    // Field mappings without :via are handled by the generated direct INSERT.
    const fieldRows = await select([
      "",
      `PREFIX : <${CDP}>`,
      "SELECT ?fieldPath ?predicate ?parentPath WHERE {",
      `<${mapping.mapping}> :hasFieldMapping ?fm .`,
      "?fm :from ?src ; :to ?tgt .",
      "FILTER NOT EXISTS { ?fm :via ?_v }",
      "?tgt :targetPredicate ?predicate .",
      "?src :fieldPath ?fieldPath .",
      "OPTIONAL { ?parent :hasSubField ?src . ?parent :fieldPath ?parentPath }",
      "}",
    ])

    if (fieldRows.length > 0 && mapping.sourceGraph) {
      const mappingName = fragment(mapping.mapping)
      const insertQuery = buildDirectInsert(mapping, fieldRows)
      // The generated query is also written to disk before it is executed.
      const queryFile = abs(path.join(queriesDir, `${mappingName}.sparql`))
      fs.mkdirSync(path.dirname(queryFile), { recursive: true })
      fs.writeFileSync(queryFile, insertQuery)
      console.log(`map ${mappingName} direct (${fieldRows.length} mappings) → ${queryFile}`)
      await update(insertQuery)
    }

    // :via names a transform of the mapping's source — the script path
    // follows by convention (sources/<source>/transform-<via>.sparql).
    const transforms = await select([
      "",
      `PREFIX : <${CDP}>`,
      "SELECT DISTINCT ?via WHERE {",
      `<${mapping.mapping}> :hasFieldMapping/:via ?via .`,
      "} ORDER BY ?via",
    ])

    for (const { via } of transforms) {
      const script = PATHS.transform(sourceName(mapping.source), via)
      console.log(`map ${script}`)
      await update(fs.readFileSync(abs(script), "utf8"))
    }
  }

  // A mapping's :hasRelationship turns the clean step's source-level link
  // (e.g. :providedBy) into a target predicate (schema:provider), matching the
  // two ends by their cdp:targetSchema. Both ends are still source IRIs here;
  // the merge step rewrites them to the minted cluster IRIs.
  const links = await select([
    "",
    `PREFIX : <${CDP}>`,
    "SELECT ?mapping ?sourceGraph ?fromSchema ?sourcePredicate ?targetPredicate ?toSchema WHERE {",
    "?mapping a :Mapping ;",
    ":sourceGraph ?sourceGraph ;",
    ":toTarget ?fromSchema ;",
    ":hasRelationship ?rel .",
    "?rel :sourcePredicate ?sourcePredicate ;",
    ":toTargetField ?field ;",
    ":toTargetSchema ?toSchema .",
    "?field :targetPredicate ?targetPredicate .",
    "} ORDER BY ?mapping",
  ])

  for (const link of links) {
    const prefixes = { cdp: CDP, schema: "http://schema.org/" }
    // IRIs no prefix covers come back from shrink() unchanged; emit those as <…>.
    const short = (iri) => {
      const compact = shrink(iri, prefixes)
      return compact !== iri ? compact : `<${iri}>`
    }
    const linkQuery = [
      `${buildPrefixBlock(prefixes)}`,
      "",
      "INSERT {",
      "GRAPH <urn:mapped> {",
      `?from ${short(link.targetPredicate)} ?to .`,
      "}",
      "} WHERE {",
      `GRAPH <${link.sourceGraph}> {`,
      `?from ${short(link.sourcePredicate)} ?to ;`,
      `cdp:targetSchema ${short(link.fromSchema)} .`,
      `?to cdp:targetSchema ${short(link.toSchema)} .`,
      "}",
      "}",
    ].join("\n")
    console.log(`map ${fragment(link.mapping)} link (${short(link.targetPredicate)})`)
    await update(linkQuery)
  }
}
|