@directory-builder/core 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +17 -2
- package/package.json +1 -1
- package/src/index.js +2 -2
- package/src/lift/xml.sparql +12 -0
- package/src/pipeline/federate.js +69 -0
- package/src/pipeline/ingest.js +97 -0
- package/src/pipeline/run.js +8 -0
- package/src/pipeline/steps/clean.js +27 -0
- package/src/pipeline/steps/fetch.js +26 -0
- package/src/pipeline/steps/lift.js +60 -0
- package/src/pipeline/steps/map.js +172 -0
- package/src/pipeline/steps/match.js +212 -0
- package/src/pipeline/steps/merge.js +59 -0
- package/src/pipeline/steps/resolve.js +54 -0
- package/src/pipeline/write-turtle.js +30 -0
- package/src/pipeline.js +2 -2
- package/src/utils.js +8 -5
- package/webapp/src/App.jsx +3 -1
- package/webapp/src/Card.jsx +3 -3
- package/webapp/src/ColumnGraph.jsx +1 -1
- package/webapp/src/Directory.jsx +6 -6
- package/webapp/src/{OrgCard.jsx → EntityCard.jsx} +12 -12
- package/webapp/src/MapGraph.jsx +22 -22
- package/webapp/src/MatchGraph.jsx +3 -3
- package/webapp/src/MergeTables.jsx +43 -36
- package/webapp/src/Query.jsx +4 -9
- package/webapp/src/instanceData.js +8 -4
- package/webapp/src/loadMap.js +22 -22
- package/webapp/src/loadMerge.js +30 -32
- package/webapp/src/loadSources.js +3 -3
- package/webapp/src/mergeEntities.js +15 -0
- package/webapp/src/sourceMeta.js +1 -1
- package/webapp/src/styles.css +6 -6
- package/webapp/vite.js +1 -1
- package/src/federate.js +0 -571
- package/src/ingest.js +0 -158
- package/webapp/src/mergeOrgs.js +0 -15
package/src/ingest.js
DELETED
|
@@ -1,158 +0,0 @@
|
|
|
1
|
-
import { sparqlSelect, storeFromTurtles } from "@foerderfunke/sem-ops-utils"
|
|
2
|
-
import { CDP, localName, objectsOf, parseTtl, PATHS, sourceName, stepJournal } from "./utils.js"
|
|
3
|
-
import { execSync, spawnSync } from "child_process"
|
|
4
|
-
import path from "path"
|
|
5
|
-
import fs from "fs"
|
|
6
|
-
|
|
7
|
-
const SPARQL_ANYTHING_VERSION = "v1.1.0"
|
|
8
|
-
|
|
9
|
-
/**
 * Run `cmd` synchronously with inherited stdio and fail loudly unless it
 * exits with status 0.
 *
 * @param {string} cmd Executable to spawn.
 * @param {string[]} args Arguments passed to the executable.
 * @throws {Error} If the process could not be started, was terminated by a
 *   signal, or exited with a non-zero status.
 */
const run = (cmd, args) => {
  const r = spawnSync(cmd, args, { stdio: "inherit" })
  const invocation = `${cmd} ${args.join(" ")}`
  // Launch failures (e.g. ENOENT) arrive as `error` with a null status; keep
  // the underlying error as `cause` instead of reporting "Exit null".
  if (r.error) throw new Error(`Failed to run ${invocation}: ${r.error.message}`, { cause: r.error })
  // A null status without an error means the child was terminated by a signal.
  if (r.status === null) throw new Error(`Killed by ${r.signal}: ${invocation}`)
  if (r.status !== 0) throw new Error(`Exit ${r.status}: ${invocation}`)
}
|
|
13
|
-
|
|
14
|
-
// Lift queries are bundled with the engine, so their location is derived from
// this module's directory rather than from the instance root used by PATHS.
// The file name is the lower-cased local name of the format IRI plus ".sparql".
const liftQueryFor = (formatIri) => {
  const queryFile = `${localName(formatIri).toLowerCase()}.sparql`
  return path.join(import.meta.dirname, "lift", queryFile)
}
|
|
18
|
-
|
|
19
|
-
/**
 * Ingest engine: fetch + lift per source declared in the instance's
 * federation.ttl, then write a Turtle log (journal steps + one :IngestRun)
 * to PATHS.ingestLog.
 *
 * @param {string} [root=process.cwd()] Instance directory all PATHS resolve against.
 * @returns {Promise<void>}
 * @throws {Error} If a source declares no :format or the sparql-anything
 *   download fails. `run` also throws when a fetch script or the java lift
 *   does not succeed.
 */
export async function ingest(root = process.cwd()) {
  const abs = (p) => path.join(root, p)
  const federationTtl = fs.readFileSync(abs(PATHS.federation), "utf8")
  const defStore = storeFromTurtles([federationTtl])

  // ---- Read the sources ------------------------------------------------
  // The step graph (fetch → lift per source) is the engine's own shape;
  // config declares only the sources and their facts. Lift params are SPARQL
  // Anything variables declared per source. Sources run in :hasSource
  // declaration order.

  const facts = new Map()
  for (const r of await sparqlSelect(`
    PREFIX : <${CDP}>
    SELECT ?source ?fetchUrl ?format ?paramName ?paramValue WHERE {
      :federation :hasSource ?source .
      OPTIONAL { ?source :fetchUrl ?fetchUrl }
      OPTIONAL { ?source :format ?format }
      OPTIONAL { ?source :hasLiftParam [ :name ?paramName ; :value ?paramValue ] }
    }`, [defStore])) {
    // One row per (source, lift param); the first row seen fixes fetchUrl/format.
    if (!facts.has(r.source)) facts.set(r.source, { fetchUrl: r.fetchUrl, format: r.format, params: [] })
    if (r.paramName) facts.get(r.source).params.push([r.paramName, r.paramValue])
  }
  // The parsed Turtle supplies the iteration order; the query supplies the facts.
  const sources = new Map(objectsOf(parseTtl(federationTtl), `${CDP}hasSource`).map((iri) => [iri, facts.get(iri)]))
  for (const [iri, s] of sources) {
    // `s?.` keeps the intended message even if no facts row matched this IRI.
    if (!s?.format) throw new Error(`${iri} declares no :format (needed to pick the lift query)`)
  }

  // ---- Ensure sparql-anything.jar ----------------------------------------

  const JAR = abs("tools/sparql-anything.jar")
  const VERSION_FILE = abs("tools/sparql-anything.version")
  // The jar is current only when it exists and the version marker matches.
  const haveCurrentJar = fs.existsSync(JAR) && fs.existsSync(VERSION_FILE)
    && fs.readFileSync(VERSION_FILE, "utf8").trim() === SPARQL_ANYTHING_VERSION

  if (!haveCurrentJar) {
    const url = `https://github.com/SPARQL-Anything/sparql.anything/releases/download/${SPARQL_ANYTHING_VERSION}/sparql-anything-${SPARQL_ANYTHING_VERSION}.jar`
    console.log(`Downloading sparql-anything ${SPARQL_ANYTHING_VERSION}...`)
    fs.mkdirSync(path.dirname(JAR), { recursive: true })
    const response = await fetch(url)
    if (!response.ok) throw new Error(`Failed to fetch ${url}: ${response.status}`)
    fs.writeFileSync(JAR, Buffer.from(await response.arrayBuffer()))
    fs.writeFileSync(VERSION_FILE, SPARQL_ANYTHING_VERSION)
    console.log(`Saved to ${JAR}`)
  }

  // ---- Run steps ----------------------------------------------------------

  // All :hasRunParam values grouped by name, handed to every fetcher as one
  // JSON argument — each fetcher picks the parameters it needs.
  const runParams = {}
  for (const r of await sparqlSelect(`
    PREFIX : <${CDP}>
    SELECT ?name ?value WHERE { :federation :hasRunParam [ :name ?name ; :value ?value ] } ORDER BY ?name ?value`, [defStore])) {
    (runParams[r.name] ??= []).push(r.value)
  }
  const paramsJson = JSON.stringify(runParams)

  const runStart = new Date()
  const harvests = []
  const journal = stepJournal()
  // Remembers each source's fetch step so its lift step can name it in `after`.
  const fetchStepOf = new Map()

  for (const [iri, s] of sources) {
    const name = sourceName(iri)
    fetchStepOf.set(iri, await journal.step("fetch", { source: iri }, () => {
      const outDir = PATHS.raw(name)
      // Live sources pass their :fetchUrl; static-file sources pass the
      // absolute static dir instead. The script gets whichever applies.
      const origin = s.fetchUrl ?? abs(PATHS.staticDir(name))
      console.log(`fetch ${s.fetchUrl ?? PATHS.staticDir(name)} (params ${paramsJson}) → ${outDir}`)
      fs.mkdirSync(abs(outDir), { recursive: true })
      run("node", [abs(PATHS.fetchScript(name)), abs(outDir), origin, paramsJson])
      const harvest = { source: iri, time: new Date().toISOString() }
      // Static sources have no live harvest — record the files' git commit
      // time instead (the freshness the Sources page shows for them).
      if (!s.fetchUrl) {
        // argv form (no shell): the path is passed verbatim, so quotes, `$` or
        // backticks in it cannot break or inject into the command line.
        const git = spawnSync("git", ["log", "-1", "--format=%cI", "--", PATHS.staticDir(name)],
          { cwd: root, encoding: "utf8", stdio: ["ignore", "pipe", "inherit"] })
        // Best effort: not committed yet / no git → omit the timestamp.
        const iso = git.status === 0 ? git.stdout.trim() : ""
        if (iso) harvest.staticCommittedAt = iso
      }
      harvests.push(harvest)
    }))
  }

  for (const [iri, s] of sources) {
    const name = sourceName(iri)
    await journal.step("lift", { source: iri, after: [fetchStepOf.get(iri)] }, () => {
      // TODO: directory mode spawns one JVM per file (~1s startup each).
      // Fine at small N; revisit if a source crosses ~50 items. SPARQL Anything
      // accepts VALUES ?_location { … } in the lift query, which would let one
      // invocation handle the whole batch.
      const liftQuery = liftQueryFor(s.format)
      const liftOne = (location, outPath) => {
        const args = ["-jar", JAR, "-q", liftQuery,
          "-v", `location=${location}`,
          "-f", "TTL", "-o", outPath]
        for (const [pName, value] of s.params) args.push("-v", `${pName}=${value}`)
        run("java", args)
      }
      const inAbs = abs(PATHS.raw(name))
      const outAbs = abs(PATHS.lifted(name))
      // Skip dotfiles; sort so files are lifted in a stable order.
      const files = fs.readdirSync(inAbs).filter(f => !f.startsWith(".")).sort()
      fs.mkdirSync(outAbs, { recursive: true })
      console.log(`lift ${PATHS.raw(name)} (${files.length} files) → ${PATHS.lifted(name)}`)
      for (const f of files) {
        const stem = path.basename(f, path.extname(f))
        liftOne(path.join(inAbs, f), path.join(outAbs, `${stem}.ttl`))
      }
    })
  }

  // ---- Write the ingest log ------------------------------------------------

  const dt = (s) => `"${s}"^^xsd:dateTime`
  // Run id: "run" + the first 14 digits of the start timestamp (YYYYMMDDhhmmss).
  const runId = "run" + runStart.toISOString().replace(/\D/g, "").slice(0, 14)
  const harvestPart = harvests.length
    ? ` ;\n :harvested\n` + harvests.map((h) => {
      const local = h.source.split("#").pop()
      const committed = h.staticCommittedAt ? ` ; :staticCommittedAt ${dt(h.staticCommittedAt)}` : ""
      return ` [ :ofSource :${local} ; prov:atTime ${dt(h.time)}${committed} ]`
    }).join(" ,\n")
    : ""

  const block = `
${journal.toTurtle()}

:${runId} a :IngestRun ;
prov:startedAtTime ${dt(runStart.toISOString())} ;
prov:endedAtTime ${dt(new Date().toISOString())}${harvestPart} .
`

  const prefixes = `@prefix : <${CDP}> .
@prefix p-plan: <http://purl.org/net/p-plan#> .
@prefix prov: <http://www.w3.org/ns/prov#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
`
  fs.mkdirSync(path.dirname(abs(PATHS.ingestLog)), { recursive: true })
  fs.writeFileSync(abs(PATHS.ingestLog), prefixes + block)
  console.log(`log: wrote steps + IngestRun → ${PATHS.ingestLog}`)
}
|
package/webapp/src/mergeOrgs.js
DELETED
|
@@ -1,15 +0,0 @@
|
|
|
1
|
-
// Builds the org lists for the Merge and Directory views, in one shared order.
|
|
2
|
-
// Reads: data/pipeline/{merged,provenance,final}.ttl, config/federation.ttl (via loadMerge.js)
|
|
3
|
-
// Does: exports mergedOrgs and finalOrgs (consumed by MergeTables, Directory)
|
|
4
|
-
|
|
5
|
-
import { loadMerge } from "./loadMerge.js"
|
|
6
|
-
import { isConflict } from "./OrgCard.jsx"
|
|
7
|
-
import { federationTtl, provenanceTtl as provTtl, mergedTtl, finalTtl } from "./instanceData.js"
|
|
8
|
-
|
|
9
|
-
// How many of an org's fields isConflict flags.
const conflictCount = (org) => org.fields.filter((field) => isConflict(field)).length

// Most conflicts first; ties are broken by IRI so the order is stable.
const byConflictsThenIri = (left, right) => {
  const delta = conflictCount(right) - conflictCount(left)
  return delta !== 0 ? delta : left.iri.localeCompare(right.iri)
}

// The Merge view uses this order; the Directory view reuses it below so the
// same org sits in the same visual slot across pages.
export const mergedOrgs = loadMerge(mergedTtl, provTtl, federationTtl).sort(byConflictsThenIri)

// Position of each org IRI in the merged order.
const orderIndex = new Map(mergedOrgs.map((org, position) => [org.iri, position]))

// Orgs with no slot in the merged order rank as Infinity.
const slotOf = (org) => orderIndex.get(org.iri) ?? Infinity

export const finalOrgs = loadMerge(finalTtl, "", federationTtl).sort((left, right) => slotOf(left) - slotOf(right))
|