@lde/pipeline-shacl-sampler 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +92 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +2 -0
- package/dist/pathExtractor.d.ts +38 -0
- package/dist/pathExtractor.d.ts.map +1 -0
- package/dist/pathExtractor.js +170 -0
- package/dist/sampleStages.d.ts +61 -0
- package/dist/sampleStages.d.ts.map +1 -0
- package/dist/sampleStages.js +86 -0
- package/package.json +37 -0
package/README.md
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
# @lde/pipeline-shacl-sampler
|
|
2
|
+
|
|
3
|
+
Per-class sampling stages for [`@lde/pipeline`](../pipeline),
|
|
4
|
+
derived from SHACL shapes.
|
|
5
|
+
|
|
6
|
+
Given a SHACL shapes file, this package builds one
|
|
7
|
+
[`Stage`](../pipeline/src/stage.ts) per `sh:targetClass`. Each stage
|
|
8
|
+
pairs an [`ItemSelector`](../pipeline/src/sparql/selector.ts) that picks
|
|
9
|
+
N instances of the target class from the distribution’s SPARQL endpoint
|
|
10
|
+
with a CONSTRUCT executor that, for every path chain the SHACL declares
|
|
11
|
+
(walked recursively through `sh:node`, `sh:class`,
|
|
12
|
+
`sh:qualifiedValueShape`, and `sh:or` branches, stopping at leaf
|
|
13
|
+
constraints or shape cycles), pulls in the triples reachable along that
|
|
14
|
+
chain’s terminal node. The resulting quads are a sample subgraph rich
|
|
15
|
+
enough for
|
|
16
|
+
[`@lde/pipeline-shacl-validator`](../pipeline-shacl-validator) to
|
|
17
|
+
validate without false-positive ‘missing nested node’ violations.
|
|
18
|
+
|
|
19
|
+
## Usage
|
|
20
|
+
|
|
21
|
+
```typescript
|
|
22
|
+
import { Pipeline } from '@lde/pipeline';
|
|
23
|
+
import { shaclSampleStages } from '@lde/pipeline-shacl-sampler';
|
|
24
|
+
import { ShaclValidator } from '@lde/pipeline-shacl-validator';
|
|
25
|
+
|
|
26
|
+
const shapesFile = 'https://docs.nde.nl/schema-profile/shacl.ttl';
|
|
27
|
+
const validator = new ShaclValidator({ shapesFile, reportDir: './validation' });
|
|
28
|
+
|
|
29
|
+
const stages = await shaclSampleStages({
|
|
30
|
+
shapesFile,
|
|
31
|
+
samplesPerClass: 50,
|
|
32
|
+
validator,
|
|
33
|
+
});
|
|
34
|
+
|
|
35
|
+
await new Pipeline({ /* other options … */ stages }).run();
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## Options
|
|
39
|
+
|
|
40
|
+
| Option | Default | Description |
|
|
41
|
+
| ----------------- | ----------------- | ---------------------------------------------------------------------------------------------------------------------- |
|
|
42
|
+
| `shapesFile` | — | URL or local path to the SHACL shapes file. Any format `rdf-dereference` accepts. |
|
|
43
|
+
| `samplesPerClass` | `50` | Number of top-level resources to sample per `sh:targetClass`. |
|
|
44
|
+
| `timeout` | `60000` | SPARQL query timeout in milliseconds. |
|
|
45
|
+
| `batchSize` | `samplesPerClass` | Maximum sampled subjects per executor call. Lower values spread work across multiple parallel queries. |
|
|
46
|
+
| `maxConcurrency` | `10` | Maximum concurrent in-flight executor batches per stage. |
|
|
47
|
+
| `validator` | — | Optional [`Validator`](../pipeline/src/validator.ts) attached to every generated stage (typically a `ShaclValidator`). |
|
|
48
|
+
| `onInvalid` | `'write'` | Behaviour when a sampled batch fails validation: `'write'` \| `'skip'` \| `'halt'`. Only used when `validator` is set. |
|
|
49
|
+
|
|
50
|
+
## Limitations
|
|
51
|
+
|
|
52
|
+
- Only plain-IRI `sh:path` values are supported. Sequence, alternative
|
|
53
|
+
and inverse paths throw at extraction time.
|
|
54
|
+
- `sh:targetClass` is the only target form recognised; `sh:targetNode`,
|
|
55
|
+
`sh:targetSubjectsOf`, `sh:targetObjectsOf` and `sh:sparqlTarget` are
|
|
56
|
+
not yet supported.
|
|
57
|
+
|
|
58
|
+
## Related work
|
|
59
|
+
|
|
60
|
+
### `extract-cbd-shape`
|
|
61
|
+
|
|
62
|
+
The TREEcg / W3C TREE-incubation
|
|
63
|
+
[`extract-cbd-shape`](https://github.com/TREEcg/extract-cbd-shape)
|
|
64
|
+
library implements a per-entity walk over an in-memory `RdfStore`,
|
|
65
|
+
falling back to an HTTP dereference of the focus node whenever a
|
|
66
|
+
required path is missing from the local store. It is the right tool
|
|
67
|
+
for streaming hypermedia consumers (e.g. LDES clients) that already
|
|
68
|
+
hold a current context and can fetch more of it over HTTP.
|
|
69
|
+
|
|
70
|
+
It is the wrong tool for this package’s setting. We assemble sample
|
|
71
|
+
subgraphs against a remote SPARQL endpoint with millions of triples;
|
|
72
|
+
the per-entity round-trip pattern would issue _N samples × M target
|
|
73
|
+
classes_ dereferences per dataset and assumes content-negotiable
|
|
74
|
+
entity IRIs that resolve to RDF — rarely true for the cultural
|
|
75
|
+
heritage datasets this package was built for.
|
|
76
|
+
|
|
77
|
+
### SHACL2SPARQL
|
|
78
|
+
|
|
79
|
+
The Corman, Reutter & Savković 2019
|
|
80
|
+
[translation](https://link.springer.com/chapter/10.1007/978-3-030-30796-7_27)
|
|
81
|
+
of SHACL constraints to SPARQL is aimed at validation, not sample
|
|
82
|
+
subgraph extraction. No production JavaScript implementation exists.
|
|
83
|
+
|
|
84
|
+
### Why batch CONSTRUCT per `sh:targetClass`
|
|
85
|
+
|
|
86
|
+
For each top-level shape, this package emits one `Stage` whose
|
|
87
|
+
CONSTRUCT executor receives a batch of sampled subjects and walks
|
|
88
|
+
every path chain the SHACL declares server-side. A capable SPARQL
|
|
89
|
+
endpoint (e.g. QLever) evaluates the property-path UNIONs in a
|
|
90
|
+
single round-trip per batch, regardless of chain count. The
|
|
91
|
+
alternative — extracting per entity in the client — would multiply
|
|
92
|
+
round-trips by the sample size.
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,mBAAmB,EAAE,KAAK,WAAW,EAAE,MAAM,oBAAoB,CAAC;AAC3E,OAAO,EACL,iBAAiB,EACjB,KAAK,wBAAwB,GAC9B,MAAM,mBAAmB,CAAC"}
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
import type { NamedNode } from '@rdfjs/types';
|
|
2
|
+
/**
 * What the sampler needs to know about one `sh:targetClass`: the class
 * itself and the property-path chains to follow from each sampled instance
 * so that the validator receives a closed sample subgraph.
 */
export interface TargetShape {
    /** Class IRI named by `sh:targetClass`. */
    targetClass: NamedNode;
    /**
     * Property-path chains, each starting at a sampled instance of
     * {@link targetClass}.
     *
     * A chain lists the `sh:path` IRIs that lead from the sampled subject to
     * a node whose own triples are required for validation, i.e. a node
     * reached through a property carrying a nested-shape constraint
     * (`sh:node`, `sh:class`, `sh:qualifiedValueShape`, or `sh:or` branches
     * referring to those). Extraction recurses into every shape referenced
     * this way and ends either at a leaf property shape (one that references
     * no further shape) or when a shape already on the current recursion
     * stack is reached again (a cycle).
     *
     * Chains are *additive*: a shorter prefix appears in the list as well
     * whenever that prefix itself ends at a nested-shape property.
     */
    pathChains: NamedNode[][];
}
|
|
26
|
+
/**
 * Read a SHACL shapes file and return one {@link TargetShape} per class
 * named by `sh:targetClass`.
 *
 * NodeShapes that target the same class are merged into a single entry.
 * Every `sh:path` must be a plain IRI; sequence, alternative and inverse
 * paths cause an error to be thrown.
 *
 * @param shapesFile URL or local path of the SHACL shapes file, in any
 *   format `rdf-dereference` supports (Turtle, JSON-LD, N-Triples, …).
 */
export declare function extractTargetShapes(shapesFile: string): Promise<TargetShape[]>;
|
|
38
|
+
//# sourceMappingURL=pathExtractor.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"pathExtractor.d.ts","sourceRoot":"","sources":["../src/pathExtractor.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,SAAS,EAAQ,MAAM,cAAc,CAAC;AAsBpD;;;GAGG;AACH,MAAM,WAAW,WAAW;IAC1B,8CAA8C;IAC9C,WAAW,EAAE,SAAS,CAAC;IACvB;;;;;;;;;;;;;;OAcG;IACH,UAAU,EAAE,SAAS,EAAE,EAAE,CAAC;CAC3B;AAED;;;;;;;;;;GAUG;AACH,wBAAsB,mBAAmB,CACvC,UAAU,EAAE,MAAM,GACjB,OAAO,CAAC,WAAW,EAAE,CAAC,CAkCxB"}
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
import { DataFactory, Store } from 'n3';
|
|
2
|
+
import { rdfDereferencer } from 'rdf-dereference';
|
|
3
|
+
const { namedNode } = DataFactory;
const SHACL = 'http://www.w3.org/ns/shacl#';
const RDF = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#';
// SHACL vocabulary terms queried by the extractor, keyed by local name.
const sh = Object.fromEntries(
  [
    'targetClass',
    'property',
    'path',
    'node',
    'class',
    'or',
    'qualifiedValueShape',
  ].map((local) => [local, namedNode(`${SHACL}${local}`)]),
);
// RDF collection terms used to walk `sh:or` lists.
const [rdfFirst, rdfRest, rdfNil] = ['first', 'rest', 'nil'].map((local) =>
  namedNode(`${RDF}${local}`),
);
|
|
18
|
+
/**
 * Read a SHACL shapes file and return one {@link TargetShape} per class
 * named by `sh:targetClass`.
 *
 * NodeShapes that target the same class are merged into a single entry whose
 * path chains are de-duplicated. Every `sh:path` must be a plain IRI;
 * sequence, alternative and inverse paths cause an error to be thrown.
 *
 * @param shapesFile URL or local path of the SHACL shapes file, in any
 *   format `rdf-dereference` supports (Turtle, JSON-LD, N-Triples, …).
 */
export async function extractTargetShapes(shapesFile) {
  const store = await loadShapes(shapesFile);

  // Group the shape nodes by the IRI of the class they target. Targets that
  // are not IRIs are ignored.
  const classToShapes = new Map();
  for (const { subject, object } of store.getQuads(null, sh.targetClass, null, null)) {
    if (object.termType !== 'NamedNode') {
      continue;
    }
    if (!classToShapes.has(object.value)) {
      classToShapes.set(object.value, []);
    }
    classToShapes.get(object.value).push(subject);
  }

  // The map must be complete before expansion starts: `sh:class` constraints
  // are resolved through it.
  return [...classToShapes].map(([classIri, shapeNodes]) => {
    const pathChains = [];
    const seenKeys = new Set();
    for (const shapeNode of shapeNodes) {
      // Each top-level shape gets its own, initially empty, recursion stack.
      for (const chain of expandShape(store, shapeNode, new Set(), classToShapes)) {
        addUnique(pathChains, seenKeys, chain);
      }
    }
    return { targetClass: namedNode(classIri), pathChains };
  });
}
|
|
57
|
+
/**
 * Collect every path chain, starting at `shape`, that ends at a property
 * with a nested-shape constraint.
 *
 * `stack` holds the keys of the shapes the current recursion has entered but
 * not yet left; re-entering one of them is a cycle and yields no chains.
 *
 * @param store Quad store holding the shapes graph.
 * @param shape Shape node whose `sh:property` values are walked.
 * @param stack Set of shape keys on the current recursion path (mutated).
 * @param classToShapes Map from class IRI to the shape nodes targeting it.
 * @returns De-duplicated list of chains (arrays of path terms).
 */
function expandShape(store, shape, stack, classToShapes) {
  const shapeKey = termKey(shape);
  if (stack.has(shapeKey)) {
    return [];
  }
  stack.add(shapeKey);

  const collected = [];
  const collectedKeys = new Set();
  const propertyShapes = store
    .getQuads(shape, sh.property, null, null)
    .map((quad) => quad.object);

  for (const propertyShape of propertyShapes) {
    const { emit, refs } = valueShapeAnalysis(store, propertyShape, classToShapes);
    if (emit) {
      // The path is only read (and validated) for properties that matter.
      const step = readPath(store, propertyShape);
      addUnique(collected, collectedKeys, [step]);
      for (const ref of refs) {
        for (const tail of expandShape(store, ref, stack, classToShapes)) {
          addUnique(collected, collectedKeys, [step, ...tail]);
        }
      }
    }
  }

  stack.delete(shapeKey);
  return collected;
}
|
|
85
|
+
/**
 * Inspect one constraint node for nested-shape constraints.
 *
 * `emit` is true when the node carries `sh:node`, `sh:qualifiedValueShape`
 * or `sh:class` (directly, or inside an `sh:or` branch). `refs` lists the
 * shapes to recurse into: the objects of `sh:node` and
 * `sh:qualifiedValueShape`, the shapes that target an IRI `sh:class`, and
 * the refs of every `sh:or` branch.
 *
 * @param store Quad store holding the shapes graph.
 * @param constraintShape Property shape or `sh:or` branch to inspect.
 * @param classToShapes Map from class IRI to the shape nodes targeting it.
 * @returns `{ emit, refs }` as described above.
 */
function valueShapeAnalysis(store, constraintShape, classToShapes) {
  const objectsOf = (predicate) =>
    store.getQuads(constraintShape, predicate, null, null).map((quad) => quad.object);

  const shapeRefs = [...objectsOf(sh.node), ...objectsOf(sh.qualifiedValueShape)];
  const classes = objectsOf(sh.class);

  let emit = shapeRefs.length > 0 || classes.length > 0;
  const refs = [...shapeRefs];

  for (const cls of classes) {
    if (cls.termType === 'NamedNode') {
      refs.push(...(classToShapes.get(cls.value) ?? []));
    }
  }

  for (const listHead of objectsOf(sh.or)) {
    for (const branch of orListBranches(store, listHead)) {
      const nested = valueShapeAnalysis(store, branch, classToShapes);
      emit = emit || nested.emit;
      refs.push(...nested.refs);
    }
  }

  return { emit, refs };
}
|
|
114
|
+
/**
 * Return the members of the RDF collection starting at `listHead`.
 *
 * The walk follows `rdf:first` / `rdf:rest` until it reaches `rdf:nil`. It
 * stops early, returning what it has collected so far, when the list is
 * malformed: a node that is neither an IRI nor a blank node, a missing
 * `rdf:first` or `rdf:rest`, or a node that was already visited (a cyclic
 * `rdf:rest`, which would otherwise never terminate).
 *
 * @param store Quad store holding the shapes graph.
 * @param listHead First node of the collection (the object of `sh:or`).
 * @returns The `rdf:first` objects in list order.
 */
function orListBranches(store, listHead) {
  const branches = [];
  const visited = new Set();
  let current = listHead;
  while (!(current.termType === 'NamedNode' && current.value === rdfNil.value)) {
    if (current.termType !== 'NamedNode' && current.termType !== 'BlankNode') {
      break;
    }
    // Guard against cyclic lists: each list node is processed at most once.
    const nodeKey = `${current.termType}:${current.value}`;
    if (visited.has(nodeKey)) {
      break;
    }
    visited.add(nodeKey);

    const firsts = store.getQuads(current, rdfFirst, null, null);
    if (firsts.length === 0) {
      break;
    }
    branches.push(firsts[0].object);

    const rests = store.getQuads(current, rdfRest, null, null);
    if (rests.length === 0) {
      break;
    }
    current = rests[0].object;
  }
  return branches;
}
|
|
132
|
+
/**
 * Dereference the shapes file (local files allowed) and load all of its
 * quads into a fresh in-memory store.
 *
 * @param shapesFile URL or local path handed to `rdf-dereference`.
 * @returns The populated store; rejects if the quad stream emits `error`.
 */
async function loadShapes(shapesFile) {
  const response = await rdfDereferencer.dereference(shapesFile, {
    localFiles: true,
  });
  const store = new Store();
  await new Promise((resolve, reject) => {
    const importing = store.import(response.data);
    importing.on('end', () => resolve()).on('error', reject);
  });
  return store;
}
|
|
143
|
+
/**
 * Return the `sh:path` of a property shape.
 *
 * @param store Quad store holding the shapes graph.
 * @param propShape Property shape node.
 * @returns The path term (always a NamedNode).
 * @throws {Error} When the shape does not have exactly one `sh:path`, or
 *   when the path is not a plain IRI.
 */
function readPath(store, propShape) {
  const candidates = store.getQuads(propShape, sh.path, null, null);
  if (candidates.length !== 1) {
    throw new Error(`Property shape ${termLabel(propShape)} must have exactly one sh:path; found ${candidates.length}`);
  }
  const [{ object: pathTerm }] = candidates;
  if (pathTerm.termType === 'NamedNode') {
    return pathTerm;
  }
  throw new Error(`Unsupported sh:path form on property shape ${termLabel(propShape)}: ` +
    `only plain IRI paths are supported (sequence, alternative and inverse paths are not).`);
}
|
|
155
|
+
/**
 * Append `chain` to `chains` unless a chain with the same key was added
 * before.
 *
 * @param chains Output list (mutated).
 * @param seen Set of keys of the chains already in `chains` (mutated).
 * @param chain Candidate chain.
 */
function addUnique(chains, seen, chain) {
  const signature = chainKey(chain);
  if (!seen.has(signature)) {
    seen.add(signature);
    chains.push(chain);
  }
}
|
|
162
|
+
/**
 * Build the de-duplication key for a path chain.
 *
 * The key is the JSON encoding of the chain's IRI strings. A plain
 * `join('/')` is ambiguous because `/` occurs inside IRIs, so two different
 * chains could share a key and one of them would be silently dropped.
 *
 * @param chain Array of terms; only each term's `value` is used.
 * @returns A string that is equal for two chains exactly when their
 *   sequences of values are equal.
 */
function chainKey(chain) {
  return JSON.stringify(chain.map(({ value }) => value));
}
|
|
165
|
+
/**
 * Key identifying a term by type and value, so that an IRI and a blank node
 * with the same `value` are kept apart.
 *
 * @param term Term with `termType` and `value`.
 * @returns `"<termType>:<value>"`.
 */
function termKey(term) {
  const { termType, value } = term;
  return [termType, value].join(':');
}
|
|
168
|
+
/**
 * Render a term for use in error messages: `_:label` for blank nodes,
 * `<value>` for everything else.
 *
 * @param term Term with `termType` and `value`.
 * @returns The rendered label.
 */
function termLabel(term) {
  if (term.termType === 'BlankNode') {
    return `_:${term.value}`;
  }
  return `<${term.value}>`;
}
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
import { Stage, type StageOptions, type Validator } from '@lde/pipeline';
|
|
2
|
+
import type { NamedNode } from '@rdfjs/types';
|
|
3
|
+
import { type TargetShape } from './pathExtractor.js';
|
|
4
|
+
/** The type of the `onInvalid` member of a stage's (non-nullable) `validation` option. */
type OnInvalid = NonNullable<StageOptions['validation']>['onInvalid'];
|
|
5
|
+
/** Configuration accepted by {@link shaclSampleStages}. */
export interface ShaclSampleStagesOptions {
    /** SHACL shapes file, given as a URL or a local path. */
    shapesFile: string;
    /**
     * How many top-level resources to sample for each `sh:targetClass`.
     * @default 50
     */
    samplesPerClass?: number;
    /**
     * Timeout for SPARQL queries, in milliseconds.
     * @default 60000
     */
    timeout?: number;
    /**
     * Upper bound on the number of sampled subjects handed to one executor
     * call. When omitted it equals {@link samplesPerClass}, so the whole
     * sample is fetched in a single CONSTRUCT round-trip; choose a lower
     * value to split the work over several parallel queries.
     */
    batchSize?: number;
    /**
     * Upper bound on executor batches in flight at once, per stage.
     * @default 10
     */
    maxConcurrency?: number;
    /**
     * Validator to attach to every generated stage. Typically a
     * {@link https://www.npmjs.com/package/@lde/pipeline-shacl-validator ShaclValidator}
     * set up with the same {@link shapesFile}.
     */
    validator?: Validator;
    /**
     * What to do when a sampled batch fails validation. Ignored unless
     * {@link validator} is set.
     * @default 'write'
     */
    onInvalid?: OnInvalid;
}
|
|
42
|
+
/**
 * Create one sampling {@link Stage} for each `sh:targetClass` found in the
 * SHACL shapes file.
 *
 * A stage combines a SELECT-based {@link ItemSelector}, which picks N
 * instances of the target class, with a CONSTRUCT executor. For every path
 * chain the SHACL declares (followed recursively until a leaf constraint or
 * a cycle is reached) the executor fetches the triples reachable along that
 * chain’s terminal node.
 *
 * To validate the samples, pass a {@link Validator}; it is attached to every
 * generated stage:
 *
 * ```ts
 * const validator = new ShaclValidator({ shapesFile, reportDir });
 * const stages = await shaclSampleStages({ shapesFile, validator });
 * ```
 */
export declare function shaclSampleStages(options: ShaclSampleStagesOptions): Promise<Stage[]>;
|
|
58
|
+
/**
 * Build the SPARQL `SELECT DISTINCT ?s` query that picks instances of
 * `targetClass`.
 *
 * @param targetClass Class whose instances are selected (`?s a <class>`).
 * @param limit Value written into the query's `LIMIT` clause.
 * @param subjectFilter Optional SPARQL fragment inserted verbatim at the
 *   start of the `WHERE` block.
 * @param namedGraph Optional graph IRI; when given, a `FROM <namedGraph>`
 *   clause is added.
 * @returns The query text.
 */
export declare function buildSubjectSelectorQuery(targetClass: NamedNode, limit: number, subjectFilter?: string, namedGraph?: string): string;
|
|
59
|
+
/**
 * Build the SPARQL CONSTRUCT query for one target shape. The query returns
 * the triples of each subject `?s` and, for every chain in
 * `shape.pathChains`, the triples of the nodes reached from `?s` along that
 * chain.
 *
 * @param shape Target shape whose path chains are turned into UNION branches.
 * @returns The query text.
 */
export declare function buildSampleQuery(shape: TargetShape): string;
|
|
60
|
+
export {};
|
|
61
|
+
//# sourceMappingURL=sampleStages.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"sampleStages.d.ts","sourceRoot":"","sources":["../src/sampleStages.ts"],"names":[],"mappings":"AAAA,OAAO,EACL,KAAK,EAIL,KAAK,YAAY,EACjB,KAAK,SAAS,EACf,MAAM,eAAe,CAAC;AAEvB,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,cAAc,CAAC;AAC9C,OAAO,EAAuB,KAAK,WAAW,EAAE,MAAM,oBAAoB,CAAC;AAE3E,KAAK,SAAS,GAAG,WAAW,CAAC,YAAY,CAAC,YAAY,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC;AAEtE,6CAA6C;AAC7C,MAAM,WAAW,wBAAwB;IACvC,kDAAkD;IAClD,UAAU,EAAE,MAAM,CAAC;IACnB;;;OAGG;IACH,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB;;;OAGG;IACH,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB;;;;OAIG;IACH,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB;;OAEG;IACH,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB;;;;OAIG;IACH,SAAS,CAAC,EAAE,SAAS,CAAC;IACtB;;;;OAIG;IACH,SAAS,CAAC,EAAE,SAAS,CAAC;CACvB;AAED;;;;;;;;;;;;;;GAcG;AACH,wBAAsB,iBAAiB,CACrC,OAAO,EAAE,wBAAwB,GAChC,OAAO,CAAC,KAAK,EAAE,CAAC,CAwBlB;AAiBD,wBAAgB,yBAAyB,CACvC,WAAW,EAAE,SAAS,EACtB,KAAK,EAAE,MAAM,EACb,aAAa,CAAC,EAAE,MAAM,EACtB,UAAU,CAAC,EAAE,MAAM,GAClB,MAAM,CAYR;AAED,wBAAgB,gBAAgB,CAAC,KAAK,EAAE,WAAW,GAAG,MAAM,CAuB3D"}
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
import { Stage, SparqlConstructExecutor, SparqlItemSelector, } from '@lde/pipeline';
|
|
2
|
+
import { assertSafeIri } from '@lde/dataset';
|
|
3
|
+
import { extractTargetShapes } from './pathExtractor.js';
|
|
4
|
+
/**
 * Create one sampling {@link Stage} for each `sh:targetClass` found in the
 * SHACL shapes file.
 *
 * A stage combines a SELECT-based {@link ItemSelector}, which picks N
 * instances of the target class, with a CONSTRUCT executor. For every path
 * chain the SHACL declares (followed recursively until a leaf constraint or
 * a cycle is reached) the executor fetches the triples reachable along that
 * chain’s terminal node.
 *
 * To validate the samples, pass a {@link Validator}; it is attached to every
 * generated stage:
 *
 * ```ts
 * const validator = new ShaclValidator({ shapesFile, reportDir });
 * const stages = await shaclSampleStages({ shapesFile, validator });
 * ```
 */
export async function shaclSampleStages(options) {
  const samplesPerClass = options.samplesPerClass ?? 50;
  const timeout = options.timeout ?? 60_000;
  // By default the whole sample goes into a single executor batch.
  const batchSize = options.batchSize ?? samplesPerClass;
  const { maxConcurrency } = options;

  // Validation settings exist only when a validator was supplied.
  let validation;
  if (options.validator) {
    validation = {
      validator: options.validator,
      onInvalid: options.onInvalid ?? 'write',
    };
  }

  const stages = [];
  for (const shape of await extractTargetShapes(options.shapesFile)) {
    const executor = new SparqlConstructExecutor({
      query: buildSampleQuery(shape),
      timeout,
    });
    stages.push(new Stage({
      name: `shacl-sample-${localName(shape.targetClass.value)}`,
      itemSelector: subjectSelector(shape.targetClass, samplesPerClass),
      executors: executor,
      batchSize,
      maxConcurrency,
      validation,
    }));
  }
  return stages;
}
|
|
40
|
+
/**
 * Create an item selector that samples up to `limit` instances of
 * `targetClass`.
 *
 * The class IRI is checked with `assertSafeIri` once, up front. The SELECT
 * query is built on each `select` call because it depends on the
 * distribution's `subjectFilter` and `namedGraph`.
 *
 * @param targetClass Class whose instances are selected.
 * @param limit Maximum number of subjects to select.
 * @returns An object with a `select(distribution, batchSize)` method.
 */
function subjectSelector(targetClass, limit) {
  assertSafeIri(targetClass.value);
  const select = (distribution, batchSize) => {
    const selector = new SparqlItemSelector({
      query: buildSubjectSelectorQuery(
        targetClass,
        limit,
        distribution.subjectFilter,
        distribution.namedGraph,
      ),
    });
    return selector.select(distribution, batchSize);
  };
  return { select };
}
|
|
49
|
+
/**
 * Build the SPARQL `SELECT DISTINCT ?s` query that picks instances of
 * `targetClass`.
 *
 * Everything interpolated into the query is checked first, because this
 * function is exported and may be called without going through the stage
 * factory: `limit` must be a non-negative integer, and both IRIs are passed
 * to `assertSafeIri` (presumably throwing on IRIs that are unsafe to embed
 * in SPARQL — confirm against `@lde/dataset`).
 *
 * @param targetClass Class whose instances are selected (`?s a <class>`).
 * @param limit Value written into the `LIMIT` clause.
 * @param subjectFilter Optional SPARQL fragment inserted verbatim at the
 *   start of the `WHERE` block; it is NOT validated here.
 * @param namedGraph Optional graph IRI; when given, a `FROM <namedGraph>`
 *   clause is added.
 * @returns The query text.
 * @throws {RangeError} When `limit` is not a non-negative safe integer.
 */
export function buildSubjectSelectorQuery(targetClass, limit, subjectFilter, namedGraph) {
  // Rejects NaN, fractions, negatives and non-number values (e.g. strings
  // that would otherwise be spliced into the query text).
  if (!Number.isSafeInteger(limit) || limit < 0) {
    throw new RangeError(`limit must be a non-negative integer; got ${String(limit)}`);
  }
  assertSafeIri(targetClass.value);

  let fromClause = '';
  if (namedGraph) {
    assertSafeIri(namedGraph);
    fromClause = `FROM <${namedGraph}>`;
  }
  return [
    'SELECT DISTINCT ?s',
    fromClause,
    `WHERE { ${subjectFilter ?? ''} ?s a <${targetClass.value}> . }`,
    `LIMIT ${limit}`,
  ].join('\n');
}
|
|
62
|
+
/**
 * Build the SPARQL CONSTRUCT query for one target shape.
 *
 * The query returns the triples of each subject `?s` and, for every chain in
 * `shape.pathChains`, the triples of the nodes (`?neighbour`) reached from
 * `?s` along that chain, written as a `/`-separated property path. Every
 * path IRI is passed to `assertSafeIri` before it is interpolated.
 *
 * @param shape Target shape; only `pathChains` is read.
 * @returns The query text.
 */
export function buildSampleQuery(shape) {
  shape.pathChains.flat().forEach((path) => assertSafeIri(path.value));

  const propertyPath = (chain) => chain.map((step) => `<${step.value}>`).join('/');
  // One UNION branch per chain, appended directly after the base pattern.
  const chainBranches = shape.pathChains
    .map((chain) =>
      [
        ' UNION {',
        `?s ${propertyPath(chain)} ?neighbour .`,
        '?neighbour ?np ?nv .',
        '}',
      ].join('\n'),
    )
    .join('');

  return [
    'CONSTRUCT {',
    '?s ?p ?o .',
    '?neighbour ?np ?nv .',
    '}',
    'WHERE {',
    '{',
    '?s ?p ?o .',
    `}${chainBranches}`,
    '}',
  ].join('\n');
}
|
|
83
|
+
/**
 * Derive a stage-name-friendly local name from an IRI.
 *
 * Takes the text after the last `#` or `/`; when there is no separator, or
 * nothing follows the last one, the whole IRI is used. Every character
 * outside `A-Z a-z 0-9 _ -` is then replaced by `_`.
 *
 * @param iri IRI string.
 * @returns The sanitised local name.
 */
function localName(iri) {
  const cut = Math.max(iri.lastIndexOf('#'), iri.lastIndexOf('/'));
  const hasTail = cut >= 0 && cut < iri.length - 1;
  const tail = hasTail ? iri.slice(cut + 1) : iri;
  return tail.replace(/[^A-Za-z0-9_-]/g, '_');
}
|
package/package.json
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@lde/pipeline-shacl-sampler",
|
|
3
|
+
"version": "0.2.0",
|
|
4
|
+
"description": "Per-class sampling stages for @lde/pipeline, derived from SHACL shapes",
|
|
5
|
+
"repository": {
|
|
6
|
+
"url": "git+https://github.com/ldelements/lde.git",
|
|
7
|
+
"directory": "packages/pipeline-shacl-sampler"
|
|
8
|
+
},
|
|
9
|
+
"license": "MIT",
|
|
10
|
+
"type": "module",
|
|
11
|
+
"exports": {
|
|
12
|
+
"./package.json": "./package.json",
|
|
13
|
+
".": {
|
|
14
|
+
"types": "./dist/index.d.ts",
|
|
15
|
+
"import": "./dist/index.js",
|
|
16
|
+
"development": "./src/index.ts",
|
|
17
|
+
"default": "./dist/index.js"
|
|
18
|
+
}
|
|
19
|
+
},
|
|
20
|
+
"main": "./dist/index.js",
|
|
21
|
+
"module": "./dist/index.js",
|
|
22
|
+
"types": "./dist/index.d.ts",
|
|
23
|
+
"files": [
|
|
24
|
+
"dist",
|
|
25
|
+
"!**/*.tsbuildinfo"
|
|
26
|
+
],
|
|
27
|
+
"dependencies": {
|
|
28
|
+
"@rdfjs/types": "^2.0.1",
|
|
29
|
+
"n3": "^2.0.3",
|
|
30
|
+
"rdf-dereference": "^5.0.0",
|
|
31
|
+
"tslib": "^2.3.0"
|
|
32
|
+
},
|
|
33
|
+
"peerDependencies": {
|
|
34
|
+
"@lde/dataset": "0.7.3",
|
|
35
|
+
"@lde/pipeline": "0.28.14"
|
|
36
|
+
}
|
|
37
|
+
}
|