@lde/pipeline-void 0.2.15 → 0.2.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +30 -5
- package/dist/index.d.ts +1 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +1 -0
- package/dist/perClassAnalyzer.d.ts +5 -50
- package/dist/perClassAnalyzer.d.ts.map +1 -1
- package/dist/perClassAnalyzer.js +36 -92
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -5,7 +5,16 @@ VOiD (Vocabulary of Interlinked Datasets) statistical analysis for RDF datasets.
|
|
|
5
5
|
## Analyzers
|
|
6
6
|
|
|
7
7
|
- **SparqlQueryAnalyzer** — Execute SPARQL CONSTRUCT queries with template substitution
|
|
8
|
-
|
|
8
|
+
|
|
9
|
+
## Per-class stages
|
|
10
|
+
|
|
11
|
+
Factory functions that create `Stage` instances for per-class analysis.
|
|
12
|
+
Each stage first selects classes from the endpoint, then runs a CONSTRUCT query
|
|
13
|
+
with `?class` bound via VALUES:
|
|
14
|
+
|
|
15
|
+
- `createDatatypeStage` — per-class datatype partitions
|
|
16
|
+
- `createLanguageStage` — per-class language tags
|
|
17
|
+
- `createObjectClassStage` — per-class object class partitions
|
|
9
18
|
|
|
10
19
|
## SPARQL Queries
|
|
11
20
|
|
|
@@ -33,14 +42,30 @@ Generic VOiD analysis queries included:
|
|
|
33
42
|
## Usage
|
|
34
43
|
|
|
35
44
|
```typescript
|
|
36
|
-
import {
|
|
45
|
+
import {
|
|
46
|
+
SparqlQueryAnalyzer,
|
|
47
|
+
Success,
|
|
48
|
+
createDatatypeStage,
|
|
49
|
+
} from '@lde/pipeline-void';
|
|
50
|
+
import { Distribution } from '@lde/dataset';
|
|
37
51
|
|
|
38
|
-
//
|
|
52
|
+
// Simple CONSTRUCT query analyzer
|
|
39
53
|
const analyzer = await SparqlQueryAnalyzer.fromFile('triples.rq');
|
|
40
|
-
|
|
41
|
-
// Execute against a dataset
|
|
42
54
|
const result = await analyzer.execute(dataset);
|
|
43
55
|
if (result instanceof Success) {
|
|
44
56
|
// result.data contains the VOiD statistics as RDF
|
|
45
57
|
}
|
|
58
|
+
|
|
59
|
+
// Per-class stage (streaming)
|
|
60
|
+
const distribution = Distribution.sparql(new URL('http://example.com/sparql'));
|
|
61
|
+
const stage = await createDatatypeStage(distribution);
|
|
62
|
+
const quads = await stage.run(dataset, distribution);
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
## Validation
|
|
66
|
+
|
|
67
|
+
```sh
|
|
68
|
+
npx nx test pipeline-void
|
|
69
|
+
npx nx lint pipeline-void
|
|
70
|
+
npx nx typecheck pipeline-void
|
|
46
71
|
```
|
package/dist/index.d.ts
CHANGED
package/dist/index.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EACL,KAAK,QAAQ,EACb,YAAY,EACZ,OAAO,EACP,OAAO,EACP,YAAY,GACb,MAAM,wBAAwB,CAAC;AAChC,cAAc,0BAA0B,CAAC;AACzC,cAAc,uBAAuB,CAAC;AACtC,cAAc,yBAAyB,CAAC;AACxC,cAAc,iBAAiB,CAAC"}
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,KAAK,EAAE,MAAM,eAAe,CAAC;AACtC,OAAO,EACL,KAAK,QAAQ,EACb,YAAY,EACZ,OAAO,EACP,OAAO,EACP,YAAY,GACb,MAAM,wBAAwB,CAAC;AAChC,cAAc,0BAA0B,CAAC;AACzC,cAAc,uBAAuB,CAAC;AACtC,cAAc,yBAAyB,CAAC;AACxC,cAAc,iBAAiB,CAAC"}
|
package/dist/index.js
CHANGED
|
@@ -1,51 +1,6 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import {
|
|
3
|
-
|
|
4
|
-
export
|
|
5
|
-
|
|
6
|
-
* Timeout for SPARQL queries in milliseconds.
|
|
7
|
-
* @default 300000 (5 minutes)
|
|
8
|
-
*/
|
|
9
|
-
timeout?: number;
|
|
10
|
-
/**
|
|
11
|
-
* Custom SparqlEndpointFetcher instance.
|
|
12
|
-
*/
|
|
13
|
-
fetcher?: SparqlEndpointFetcher;
|
|
14
|
-
/**
|
|
15
|
-
* Maximum number of classes to analyze.
|
|
16
|
-
* @default 1000
|
|
17
|
-
*/
|
|
18
|
-
maxClasses?: number;
|
|
19
|
-
}
|
|
20
|
-
/**
|
|
21
|
-
* Two-phase analyzer that first retrieves classes, then runs a query for each class.
|
|
22
|
-
*
|
|
23
|
-
* This approach prevents timeouts and OOM errors on large datasets by splitting
|
|
24
|
-
* the analysis into smaller queries per class.
|
|
25
|
-
*
|
|
26
|
-
* Supports legacy template substitution:
|
|
27
|
-
* - `#subjectFilter#` — replaced with the dataset's subject filter (if any)
|
|
28
|
-
* - `#namedGraph#` — replaced with `FROM <graph>` clause if the distribution has a named graph
|
|
29
|
-
* - `?dataset` — replaced with the dataset IRI
|
|
30
|
-
* - `<#class#>` — replaced with the current class IRI
|
|
31
|
-
*/
|
|
32
|
-
export declare class PerClassAnalyzer extends BaseAnalyzer {
|
|
33
|
-
readonly name: string;
|
|
34
|
-
private readonly fetcher;
|
|
35
|
-
private readonly query;
|
|
36
|
-
private readonly maxClasses;
|
|
37
|
-
constructor(name: string, query: string, options?: PerClassAnalyzerOptions);
|
|
38
|
-
/**
|
|
39
|
-
* Create an analyzer from a query file in the queries directory.
|
|
40
|
-
*
|
|
41
|
-
* @param filename Query filename (e.g., 'class-property-datatypes.rq')
|
|
42
|
-
* @param options Optional analyzer options
|
|
43
|
-
*/
|
|
44
|
-
static fromFile(filename: string, options?: PerClassAnalyzerOptions): Promise<PerClassAnalyzer>;
|
|
45
|
-
execute(dataset: Dataset): Promise<Success | Failure | NotSupported>;
|
|
46
|
-
private getClasses;
|
|
47
|
-
}
|
|
48
|
-
export declare function createDatatypeAnalyzer(options?: PerClassAnalyzerOptions): Promise<PerClassAnalyzer>;
|
|
49
|
-
export declare function createLanguageAnalyzer(options?: PerClassAnalyzerOptions): Promise<PerClassAnalyzer>;
|
|
50
|
-
export declare function createObjectClassAnalyzer(options?: PerClassAnalyzerOptions): Promise<PerClassAnalyzer>;
|
|
1
|
+
import { Distribution } from '@lde/dataset';
|
|
2
|
+
import { Stage } from '@lde/pipeline';
|
|
3
|
+
export declare function createDatatypeStage(distribution: Distribution): Promise<Stage>;
|
|
4
|
+
export declare function createLanguageStage(distribution: Distribution): Promise<Stage>;
|
|
5
|
+
export declare function createObjectClassStage(distribution: Distribution): Promise<Stage>;
|
|
51
6
|
//# sourceMappingURL=perClassAnalyzer.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"perClassAnalyzer.d.ts","sourceRoot":"","sources":["../src/perClassAnalyzer.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,
|
|
1
|
+
{"version":3,"file":"perClassAnalyzer.d.ts","sourceRoot":"","sources":["../src/perClassAnalyzer.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AAC5C,OAAO,EACL,KAAK,EAIN,MAAM,eAAe,CAAC;AAkDvB,wBAAgB,mBAAmB,CACjC,YAAY,EAAE,YAAY,GACzB,OAAO,CAAC,KAAK,CAAC,CAEhB;AAED,wBAAgB,mBAAmB,CACjC,YAAY,EAAE,YAAY,GACzB,OAAO,CAAC,KAAK,CAAC,CAEhB;AAED,wBAAgB,sBAAsB,CACpC,YAAY,EAAE,YAAY,GACzB,OAAO,CAAC,KAAK,CAAC,CAEhB"}
|
package/dist/perClassAnalyzer.js
CHANGED
|
@@ -1,102 +1,46 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import { Store } from 'n3';
|
|
3
|
-
import { SparqlEndpointFetcher } from 'fetch-sparql-endpoint';
|
|
1
|
+
import { Stage, SparqlSelector, SparqlConstructExecutor, readQueryFile, } from '@lde/pipeline';
|
|
4
2
|
import { resolve, dirname } from 'node:path';
|
|
5
3
|
import { fileURLToPath } from 'node:url';
|
|
6
|
-
import { BaseAnalyzer, Success, Failure, NotSupported, } from '@lde/pipeline/analyzer';
|
|
7
4
|
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
8
5
|
/**
|
|
9
|
-
*
|
|
6
|
+
* Create a Stage that first selects classes from the endpoint,
|
|
7
|
+
* then runs a per-class CONSTRUCT query with `?class` bound via VALUES.
|
|
10
8
|
*
|
|
11
|
-
*
|
|
12
|
-
* the analysis into smaller queries per class.
|
|
13
|
-
*
|
|
14
|
-
* Supports legacy template substitution:
|
|
15
|
-
* - `#subjectFilter#` — replaced with the dataset's subject filter (if any)
|
|
16
|
-
* - `#namedGraph#` — replaced with `FROM <graph>` clause if the distribution has a named graph
|
|
17
|
-
* - `?dataset` — replaced with the dataset IRI
|
|
18
|
-
* - `<#class#>` — replaced with the current class IRI
|
|
9
|
+
* Replaces the legacy `PerClassAnalyzer` two-phase loop with streaming.
|
|
19
10
|
*/
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
}
|
|
46
|
-
async execute(dataset) {
|
|
47
|
-
const sparqlDistribution = dataset.getSparqlDistribution();
|
|
48
|
-
if (sparqlDistribution === null) {
|
|
49
|
-
return new NotSupported('No SPARQL distribution available');
|
|
50
|
-
}
|
|
51
|
-
const store = new Store();
|
|
52
|
-
try {
|
|
53
|
-
// Phase 1: Get all classes.
|
|
54
|
-
const classes = await this.getClasses(sparqlDistribution, dataset);
|
|
55
|
-
// Phase 2: Run query for each class.
|
|
56
|
-
for (const classIri of classes) {
|
|
57
|
-
const substituted = substituteQueryTemplates(this.query.replaceAll('<#class#>', `<${classIri}>`), sparqlDistribution, dataset);
|
|
58
|
-
const executor = new SparqlConstructExecutor({
|
|
59
|
-
query: substituted,
|
|
60
|
-
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
61
|
-
fetcher: this.fetcher,
|
|
62
|
-
});
|
|
63
|
-
const stream = await executor.execute(dataset, sparqlDistribution);
|
|
64
|
-
store.addQuads([...(await collect(stream))]);
|
|
65
|
-
}
|
|
66
|
-
}
|
|
67
|
-
catch (e) {
|
|
68
|
-
const accessUrl = sparqlDistribution.accessUrl;
|
|
69
|
-
return new Failure(accessUrl ?? new URL('unknown://'), e instanceof Error ? e.message : undefined);
|
|
70
|
-
}
|
|
71
|
-
return new Success(store);
|
|
72
|
-
}
|
|
73
|
-
async getClasses(distribution, dataset) {
|
|
74
|
-
const classQuery = substituteQueryTemplates(`SELECT DISTINCT ?class
|
|
75
|
-
#namedGraph#
|
|
76
|
-
WHERE {
|
|
77
|
-
#subjectFilter#
|
|
78
|
-
?s a ?class .
|
|
79
|
-
}
|
|
80
|
-
LIMIT ${this.maxClasses}`, distribution, dataset);
|
|
81
|
-
const bindings = await this.fetcher.fetchBindings(distribution.accessUrl.toString(), classQuery);
|
|
82
|
-
const classes = [];
|
|
83
|
-
for await (const binding of bindings) {
|
|
84
|
-
// Bindings are Record<string, RDF.Term>.
|
|
85
|
-
const bindingRecord = binding;
|
|
86
|
-
const classValue = bindingRecord['class'];
|
|
87
|
-
if (classValue && classValue.termType === 'NamedNode') {
|
|
88
|
-
classes.push(classValue.value);
|
|
89
|
-
}
|
|
90
|
-
}
|
|
91
|
-
return classes;
|
|
92
|
-
}
|
|
11
|
+
async function createPerClassStage(queryFilename, distribution) {
|
|
12
|
+
const rawQuery = await readQueryFile(resolve(__dirname, 'queries', queryFilename));
|
|
13
|
+
// Pre-process #subjectFilter# before the query is parsed as SPARQL.
|
|
14
|
+
const subjectFilter = distribution.subjectFilter ?? '';
|
|
15
|
+
const query = rawQuery.replace('#subjectFilter#', subjectFilter);
|
|
16
|
+
// Build the selector SELECT query (same substitution for subjectFilter).
|
|
17
|
+
const fromClause = distribution.namedGraph
|
|
18
|
+
? `FROM <${distribution.namedGraph}>`
|
|
19
|
+
: '';
|
|
20
|
+
const selectorQuery = [
|
|
21
|
+
'SELECT DISTINCT ?class',
|
|
22
|
+
fromClause,
|
|
23
|
+
`WHERE { ${subjectFilter} ?s a ?class . }`,
|
|
24
|
+
'LIMIT 1000',
|
|
25
|
+
].join('\n');
|
|
26
|
+
const selector = new SparqlSelector({
|
|
27
|
+
query: selectorQuery,
|
|
28
|
+
endpoint: distribution.accessUrl,
|
|
29
|
+
pageSize: 1000,
|
|
30
|
+
});
|
|
31
|
+
const executor = new SparqlConstructExecutor({ query });
|
|
32
|
+
return new Stage({
|
|
33
|
+
name: queryFilename,
|
|
34
|
+
selector,
|
|
35
|
+
executors: executor,
|
|
36
|
+
});
|
|
93
37
|
}
|
|
94
|
-
export function
|
|
95
|
-
return
|
|
38
|
+
export function createDatatypeStage(distribution) {
|
|
39
|
+
return createPerClassStage('class-property-datatypes.rq', distribution);
|
|
96
40
|
}
|
|
97
|
-
export function
|
|
98
|
-
return
|
|
41
|
+
export function createLanguageStage(distribution) {
|
|
42
|
+
return createPerClassStage('class-property-languages.rq', distribution);
|
|
99
43
|
}
|
|
100
|
-
export function
|
|
101
|
-
return
|
|
44
|
+
export function createObjectClassStage(distribution) {
|
|
45
|
+
return createPerClassStage('class-property-object-classes.rq', distribution);
|
|
102
46
|
}
|
package/package.json
CHANGED