@lde/pipeline-void 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,46 @@
+ # Pipeline VOiD
+
+ VOiD (Vocabulary of Interlinked Datasets) statistical analysis for RDF datasets.
+
+ ## Analyzers
+
+ - **SparqlQueryAnalyzer** — Execute SPARQL CONSTRUCT queries with template substitution
+ - **PerClassAnalyzer** — Two-phase analyzer that iterates over classes to avoid timeouts (see the sketch below the Usage example)
+
+ ## SPARQL Queries
+
+ The following generic VOiD analysis queries are included:
+
+ | Query | Description |
+ | ---------------------------------- | ------------------------------------- |
+ | `triples.rq` | Total triple count |
+ | `subjects.rq` | Distinct subjects |
+ | `properties.rq` | Distinct properties |
+ | `class-partition.rq` | Classes with entity counts |
+ | `class-properties-subjects.rq` | Properties per class (subject counts) |
+ | `class-properties-objects.rq` | Properties per class (object counts) |
+ | `class-property-datatypes.rq` | Per-class datatype partitions |
+ | `class-property-languages.rq` | Per-class language tags |
+ | `class-property-object-classes.rq` | Per-class object class partitions |
+ | `object-literals.rq` | Literal object counts |
+ | `object-uris.rq` | URI object counts |
+ | `object-uri-space.rq` | Object URI namespaces |
+ | `subject-uri-space.rq` | Subject URI namespaces |
+ | `datatypes.rq` | Dataset-level datatypes |
+ | `entity-properties.rq` | Property statistics |
+ | `licenses.rq` | License detection |
+
+ ## Usage
+
+ ```typescript
+ import { SparqlQueryAnalyzer, Success } from '@lde/pipeline-void';
+
+ // Load a query from file
+ const analyzer = await SparqlQueryAnalyzer.fromFile('triples.rq');
+
+ // Execute against a dataset
+ const result = await analyzer.execute(dataset);
+ if (result instanceof Success) {
+   // result.data contains the VOiD statistics as RDF
+ }
+ ```
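+
+ A minimal sketch of the two-phase `PerClassAnalyzer` (assumes a `dataset` as above; the option values are illustrative):
+
+ ```typescript
+ import { PerClassAnalyzer, Success } from '@lde/pipeline-void';
+
+ // Phase 1 fetches the dataset's classes; phase 2 runs the query per class.
+ const perClass = await PerClassAnalyzer.fromFile('class-property-datatypes.rq', {
+   timeout: 60_000, // per-query timeout in milliseconds (default: 300000)
+   maxClasses: 100, // cap on the number of classes analyzed (default: 1000)
+ });
+
+ const perClassResult = await perClass.execute(dataset);
+ if (perClassResult instanceof Success) {
+   // perClassResult.data contains per-class datatype partitions as RDF.
+ }
+ ```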
package/dist/analyzer.d.ts ADDED
@@ -0,0 +1,41 @@
+ import { Dataset } from '@lde/dataset';
+ import type { DatasetCore } from '@rdfjs/types';
+ /**
+  * Result of a successful analysis.
+  */
+ export declare class Success {
+     readonly data: DatasetCore;
+     constructor(data: DatasetCore);
+ }
+ /**
+  * Analysis failed.
+  */
+ export declare class Failure {
+     readonly endpoint: URL;
+     readonly message?: string | undefined;
+     constructor(endpoint: URL, message?: string | undefined);
+ }
+ /**
+  * Analysis is not supported for this dataset (e.g., no SPARQL distribution).
+  */
+ export declare class NotSupported {
+     readonly message: string;
+     constructor(message: string);
+ }
+ /**
+  * Interface for VOiD analyzers.
+  */
+ export interface Analyzer {
+     readonly name: string;
+     execute(dataset: Dataset): Promise<Success | Failure | NotSupported>;
+     finish?(): Promise<void>;
+ }
+ /**
+  * Base class for analyzers with default implementations.
+  */
+ export declare abstract class BaseAnalyzer implements Analyzer {
+     abstract readonly name: string;
+     abstract execute(dataset: Dataset): Promise<Success | Failure | NotSupported>;
+     finish(): Promise<void>;
+ }
+ //# sourceMappingURL=analyzer.d.ts.map
package/dist/analyzer.d.ts.map ADDED
@@ -0,0 +1 @@
+ {"version":3,"file":"analyzer.d.ts","sourceRoot":"","sources":["../src/analyzer.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,cAAc,CAAC;AACvC,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,cAAc,CAAC;AAEhD;;GAEG;AACH,qBAAa,OAAO;aACU,IAAI,EAAE,WAAW;gBAAjB,IAAI,EAAE,WAAW;CAC9C;AAED;;GAEG;AACH,qBAAa,OAAO;aAEA,QAAQ,EAAE,GAAG;aACb,OAAO,CAAC,EAAE,MAAM;gBADhB,QAAQ,EAAE,GAAG,EACb,OAAO,CAAC,EAAE,MAAM,YAAA;CAEnC;AAED;;GAEG;AACH,qBAAa,YAAY;aACK,OAAO,EAAE,MAAM;gBAAf,OAAO,EAAE,MAAM;CAC5C;AAED;;GAEG;AACH,MAAM,WAAW,QAAQ;IACvB,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,OAAO,CAAC,OAAO,EAAE,OAAO,GAAG,OAAO,CAAC,OAAO,GAAG,OAAO,GAAG,YAAY,CAAC,CAAC;IACrE,MAAM,CAAC,IAAI,OAAO,CAAC,IAAI,CAAC,CAAC;CAC1B;AAED;;GAEG;AACH,8BAAsB,YAAa,YAAW,QAAQ;IACpD,QAAQ,CAAC,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IAC/B,QAAQ,CAAC,OAAO,CAAC,OAAO,EAAE,OAAO,GAAG,OAAO,CAAC,OAAO,GAAG,OAAO,GAAG,YAAY,CAAC;IAEvE,MAAM,IAAI,OAAO,CAAC,IAAI,CAAC;CAG9B"}
package/dist/analyzer.js ADDED
@@ -0,0 +1,37 @@
+ /**
+  * Result of a successful analysis.
+  */
+ export class Success {
+     data;
+     constructor(data) {
+         this.data = data;
+     }
+ }
+ /**
+  * Analysis failed.
+  */
+ export class Failure {
+     endpoint;
+     message;
+     constructor(endpoint, message) {
+         this.endpoint = endpoint;
+         this.message = message;
+     }
+ }
+ /**
+  * Analysis is not supported for this dataset (e.g., no SPARQL distribution).
+  */
+ export class NotSupported {
+     message;
+     constructor(message) {
+         this.message = message;
+     }
+ }
+ /**
+  * Base class for analyzers with default implementations.
+  */
+ export class BaseAnalyzer {
+     async finish() {
+         // Default no-op implementation.
+     }
+ }
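The `Analyzer` contract above can be implemented directly by extending `BaseAnalyzer`. A minimal sketch (the `NoopAnalyzer` name is hypothetical; it returns an empty result where a real analyzer would query the dataset):

```typescript
import { Store } from 'n3';
import { Dataset } from '@lde/dataset';
import {
  BaseAnalyzer,
  Success,
  Failure,
  NotSupported,
} from '@lde/pipeline-void';

// Hypothetical analyzer: satisfies the Analyzer interface but does no work.
class NoopAnalyzer extends BaseAnalyzer {
  readonly name = 'noop';

  async execute(dataset: Dataset): Promise<Success | Failure | NotSupported> {
    // A real analyzer would run queries against `dataset` here.
    return new Success(new Store());
  }
  // finish() is inherited from BaseAnalyzer as a no-op.
}
```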
package/dist/datatypeAnalyzer.d.ts ADDED
@@ -0,0 +1,13 @@
+ import { PerClassAnalyzer, type PerClassAnalyzerOptions } from './perClassAnalyzer.js';
+ /**
+  * Per-class analyzer for datatype partitions.
+  *
+  * Detects which datatypes are used for each property of each class.
+  */
+ export declare class DatatypeAnalyzer extends PerClassAnalyzer {
+     /**
+      * Create a DatatypeAnalyzer.
+      */
+     static create(options?: PerClassAnalyzerOptions): Promise<DatatypeAnalyzer>;
+ }
+ //# sourceMappingURL=datatypeAnalyzer.d.ts.map
package/dist/datatypeAnalyzer.d.ts.map ADDED
@@ -0,0 +1 @@
+ {"version":3,"file":"datatypeAnalyzer.d.ts","sourceRoot":"","sources":["../src/datatypeAnalyzer.ts"],"names":[],"mappings":"AAAA,OAAO,EACL,gBAAgB,EAChB,KAAK,uBAAuB,EAC7B,MAAM,uBAAuB,CAAC;AAI/B;;;;GAIG;AACH,qBAAa,gBAAiB,SAAQ,gBAAgB;IACpD;;OAEG;WACiB,MAAM,CACxB,OAAO,CAAC,EAAE,uBAAuB,GAChC,OAAO,CAAC,gBAAgB,CAAC;CAI7B"}
package/dist/datatypeAnalyzer.js ADDED
@@ -0,0 +1,16 @@
+ import { PerClassAnalyzer, } from './perClassAnalyzer.js';
+ const QUERY_FILE = 'class-property-datatypes.rq';
+ /**
+  * Per-class analyzer for datatype partitions.
+  *
+  * Detects which datatypes are used for each property of each class.
+  */
+ export class DatatypeAnalyzer extends PerClassAnalyzer {
+     /**
+      * Create a DatatypeAnalyzer.
+      */
+     static async create(options) {
+         const query = await PerClassAnalyzer.loadQuery(QUERY_FILE);
+         return new DatatypeAnalyzer(QUERY_FILE, query, options);
+     }
+ }
package/dist/index.d.ts ADDED
@@ -0,0 +1,6 @@
+ export { type Analyzer, BaseAnalyzer, Success, Failure, NotSupported, } from '@lde/pipeline/analyzer';
+ export * from './sparqlQueryAnalyzer.js';
+ export * from './perClassAnalyzer.js';
+ export * from './vocabularyAnalyzer.js';
+ export * from './provenance.js';
+ //# sourceMappingURL=index.d.ts.map
package/dist/index.d.ts.map ADDED
@@ -0,0 +1 @@
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EACL,KAAK,QAAQ,EACb,YAAY,EACZ,OAAO,EACP,OAAO,EACP,YAAY,GACb,MAAM,wBAAwB,CAAC;AAChC,cAAc,0BAA0B,CAAC;AACzC,cAAc,uBAAuB,CAAC;AACtC,cAAc,yBAAyB,CAAC;AACxC,cAAc,iBAAiB,CAAC"}
package/dist/index.js ADDED
@@ -0,0 +1,5 @@
+ export { BaseAnalyzer, Success, Failure, NotSupported, } from '@lde/pipeline/analyzer';
+ export * from './sparqlQueryAnalyzer.js';
+ export * from './perClassAnalyzer.js';
+ export * from './vocabularyAnalyzer.js';
+ export * from './provenance.js';
package/dist/languageAnalyzer.d.ts ADDED
@@ -0,0 +1,13 @@
+ import { PerClassAnalyzer, type PerClassAnalyzerOptions } from './perClassAnalyzer.js';
+ /**
+  * Per-class analyzer for language partitions.
+  *
+  * Detects which language tags are used for each property of each class.
+  */
+ export declare class LanguageAnalyzer extends PerClassAnalyzer {
+     /**
+      * Create a LanguageAnalyzer.
+      */
+     static create(options?: PerClassAnalyzerOptions): Promise<LanguageAnalyzer>;
+ }
+ //# sourceMappingURL=languageAnalyzer.d.ts.map
package/dist/languageAnalyzer.d.ts.map ADDED
@@ -0,0 +1 @@
+ {"version":3,"file":"languageAnalyzer.d.ts","sourceRoot":"","sources":["../src/languageAnalyzer.ts"],"names":[],"mappings":"AAAA,OAAO,EACL,gBAAgB,EAChB,KAAK,uBAAuB,EAC7B,MAAM,uBAAuB,CAAC;AAI/B;;;;GAIG;AACH,qBAAa,gBAAiB,SAAQ,gBAAgB;IACpD;;OAEG;WACiB,MAAM,CACxB,OAAO,CAAC,EAAE,uBAAuB,GAChC,OAAO,CAAC,gBAAgB,CAAC;CAI7B"}
package/dist/languageAnalyzer.js ADDED
@@ -0,0 +1,16 @@
+ import { PerClassAnalyzer, } from './perClassAnalyzer.js';
+ const QUERY_FILE = 'class-property-languages.rq';
+ /**
+  * Per-class analyzer for language partitions.
+  *
+  * Detects which language tags are used for each property of each class.
+  */
+ export class LanguageAnalyzer extends PerClassAnalyzer {
+     /**
+      * Create a LanguageAnalyzer.
+      */
+     static async create(options) {
+         const query = await PerClassAnalyzer.loadQuery(QUERY_FILE);
+         return new LanguageAnalyzer(QUERY_FILE, query, options);
+     }
+ }
package/dist/objectClassAnalyzer.d.ts ADDED
@@ -0,0 +1,13 @@
+ import { PerClassAnalyzer, type PerClassAnalyzerOptions } from './perClassAnalyzer.js';
+ /**
+  * Per-class analyzer for object class partitions.
+  *
+  * Detects which classes appear as objects for each property of each class.
+  */
+ export declare class ObjectClassAnalyzer extends PerClassAnalyzer {
+     /**
+      * Create an ObjectClassAnalyzer.
+      */
+     static create(options?: PerClassAnalyzerOptions): Promise<ObjectClassAnalyzer>;
+ }
+ //# sourceMappingURL=objectClassAnalyzer.d.ts.map
package/dist/objectClassAnalyzer.d.ts.map ADDED
@@ -0,0 +1 @@
+ {"version":3,"file":"objectClassAnalyzer.d.ts","sourceRoot":"","sources":["../src/objectClassAnalyzer.ts"],"names":[],"mappings":"AAAA,OAAO,EACL,gBAAgB,EAChB,KAAK,uBAAuB,EAC7B,MAAM,uBAAuB,CAAC;AAI/B;;;;GAIG;AACH,qBAAa,mBAAoB,SAAQ,gBAAgB;IACvD;;OAEG;WACiB,MAAM,CACxB,OAAO,CAAC,EAAE,uBAAuB,GAChC,OAAO,CAAC,mBAAmB,CAAC;CAIhC"}
package/dist/objectClassAnalyzer.js ADDED
@@ -0,0 +1,16 @@
+ import { PerClassAnalyzer, } from './perClassAnalyzer.js';
+ const QUERY_FILE = 'class-property-object-classes.rq';
+ /**
+  * Per-class analyzer for object class partitions.
+  *
+  * Detects which classes appear as objects for each property of each class.
+  */
+ export class ObjectClassAnalyzer extends PerClassAnalyzer {
+     /**
+      * Create an ObjectClassAnalyzer.
+      */
+     static async create(options) {
+         const query = await PerClassAnalyzer.loadQuery(QUERY_FILE);
+         return new ObjectClassAnalyzer(QUERY_FILE, query, options);
+     }
+ }
package/dist/perClassAnalyzer.d.ts ADDED
@@ -0,0 +1,51 @@
+ import { type ExecutableDataset } from '@lde/pipeline';
+ import { SparqlEndpointFetcher } from 'fetch-sparql-endpoint';
+ import { BaseAnalyzer, Success, Failure, NotSupported } from '@lde/pipeline/analyzer';
+ export interface PerClassAnalyzerOptions {
+     /**
+      * Timeout for SPARQL queries in milliseconds.
+      * @default 300000 (5 minutes)
+      */
+     timeout?: number;
+     /**
+      * Custom SparqlEndpointFetcher instance.
+      */
+     fetcher?: SparqlEndpointFetcher;
+     /**
+      * Maximum number of classes to analyze.
+      * @default 1000
+      */
+     maxClasses?: number;
+ }
+ /**
+  * Two-phase analyzer that first retrieves classes, then runs a query for each class.
+  *
+  * This approach prevents timeouts and OOM errors on large datasets by splitting
+  * the analysis into smaller queries per class.
+  *
+  * Supports template substitution:
+  * - `#subjectFilter#` — replaced with the dataset's subject filter (if any)
+  * - `#namedGraph#` — replaced with a `FROM <graph>` clause if the distribution has a named graph
+  * - `?dataset` — replaced with the dataset IRI
+  * - `<#class#>` — replaced with the current class IRI
+  */
+ export declare class PerClassAnalyzer extends BaseAnalyzer {
+     readonly name: string;
+     private readonly fetcher;
+     private readonly executor;
+     private readonly maxClasses;
+     constructor(name: string, query: string, options?: PerClassAnalyzerOptions);
+     /**
+      * Create an analyzer from a query file in the queries directory.
+      *
+      * @param filename Query filename (e.g., 'class-property-datatypes.rq')
+      * @param options Optional analyzer options
+      */
+     static fromFile(filename: string, options?: PerClassAnalyzerOptions): Promise<PerClassAnalyzer>;
+     execute(dataset: ExecutableDataset): Promise<Success | Failure | NotSupported>;
+     private getClasses;
+ }
+ export declare function createDatatypeAnalyzer(options?: PerClassAnalyzerOptions): Promise<PerClassAnalyzer>;
+ export declare function createLanguageAnalyzer(options?: PerClassAnalyzerOptions): Promise<PerClassAnalyzer>;
+ export declare function createObjectClassAnalyzer(options?: PerClassAnalyzerOptions): Promise<PerClassAnalyzer>;
+ //# sourceMappingURL=perClassAnalyzer.d.ts.map
package/dist/perClassAnalyzer.d.ts.map ADDED
@@ -0,0 +1 @@
+ {"version":3,"file":"perClassAnalyzer.d.ts","sourceRoot":"","sources":["../src/perClassAnalyzer.ts"],"names":[],"mappings":"AACA,OAAO,EAKL,KAAK,iBAAiB,EACvB,MAAM,eAAe,CAAC;AAEvB,OAAO,EAAE,qBAAqB,EAAE,MAAM,uBAAuB,CAAC;AAG9D,OAAO,EACL,YAAY,EACZ,OAAO,EACP,OAAO,EACP,YAAY,EACb,MAAM,wBAAwB,CAAC;AAIhC,MAAM,WAAW,uBAAuB;IACtC;;;OAGG;IACH,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB;;OAEG;IACH,OAAO,CAAC,EAAE,qBAAqB,CAAC;IAChC;;;OAGG;IACH,UAAU,CAAC,EAAE,MAAM,CAAC;CACrB;AAED;;;;;;;;;;;GAWG;AACH,qBAAa,gBAAiB,SAAQ,YAAY;aAM9B,IAAI,EAAE,MAAM;IAL9B,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAwB;IAChD,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAA0B;IACnD,OAAO,CAAC,QAAQ,CAAC,UAAU,CAAS;gBAGlB,IAAI,EAAE,MAAM,EAC5B,KAAK,EAAE,MAAM,EACb,OAAO,CAAC,EAAE,uBAAuB;IAgBnC;;;;;OAKG;WACiB,QAAQ,CAC1B,QAAQ,EAAE,MAAM,EAChB,OAAO,CAAC,EAAE,uBAAuB,GAChC,OAAO,CAAC,gBAAgB,CAAC;IAKf,OAAO,CAClB,OAAO,EAAE,iBAAiB,GACzB,OAAO,CAAC,OAAO,GAAG,OAAO,GAAG,YAAY,CAAC;YAiC9B,UAAU;CAkCzB;AAED,wBAAgB,sBAAsB,CACpC,OAAO,CAAC,EAAE,uBAAuB,GAChC,OAAO,CAAC,gBAAgB,CAAC,CAE3B;AAED,wBAAgB,sBAAsB,CACpC,OAAO,CAAC,EAAE,uBAAuB,GAChC,OAAO,CAAC,gBAAgB,CAAC,CAE3B;AAED,wBAAgB,yBAAyB,CACvC,OAAO,CAAC,EAAE,uBAAuB,GAChC,OAAO,CAAC,gBAAgB,CAAC,CAE3B"}
package/dist/perClassAnalyzer.js ADDED
@@ -0,0 +1,105 @@
+ import { SparqlConstructExecutor, substituteQueryTemplates, readQueryFile, collect, } from '@lde/pipeline';
+ import { Store } from 'n3';
+ import { SparqlEndpointFetcher } from 'fetch-sparql-endpoint';
+ import { resolve, dirname } from 'node:path';
+ import { fileURLToPath } from 'node:url';
+ import { BaseAnalyzer, Success, Failure, NotSupported, } from '@lde/pipeline/analyzer';
+ const __dirname = dirname(fileURLToPath(import.meta.url));
+ /**
+  * Two-phase analyzer that first retrieves classes, then runs a query for each class.
+  *
+  * This approach prevents timeouts and OOM errors on large datasets by splitting
+  * the analysis into smaller queries per class.
+  *
+  * Supports template substitution:
+  * - `#subjectFilter#` — replaced with the dataset's subject filter (if any)
+  * - `#namedGraph#` — replaced with a `FROM <graph>` clause if the distribution has a named graph
+  * - `?dataset` — replaced with the dataset IRI
+  * - `<#class#>` — replaced with the current class IRI
+  */
+ export class PerClassAnalyzer extends BaseAnalyzer {
+     name;
+     fetcher;
+     executor;
+     maxClasses;
+     constructor(name, query, options) {
+         super();
+         this.name = name;
+         this.fetcher =
+             options?.fetcher ??
+                 new SparqlEndpointFetcher({
+                     timeout: options?.timeout ?? 300_000,
+                 });
+         this.maxClasses = options?.maxClasses ?? 1000;
+         this.executor = new SparqlConstructExecutor({
+             query,
+             // eslint-disable-next-line @typescript-eslint/no-explicit-any
+             fetcher: this.fetcher,
+         });
+     }
+     /**
+      * Create an analyzer from a query file in the queries directory.
+      *
+      * @param filename Query filename (e.g., 'class-property-datatypes.rq')
+      * @param options Optional analyzer options
+      */
+     static async fromFile(filename, options) {
+         const query = await readQueryFile(resolve(__dirname, 'queries', filename));
+         return new PerClassAnalyzer(filename, query, options);
+     }
+     async execute(dataset) {
+         const sparqlDistribution = dataset.getSparqlDistribution();
+         if (sparqlDistribution === null) {
+             return new NotSupported('No SPARQL distribution available');
+         }
+         const store = new Store();
+         try {
+             // Phase 1: Get all classes.
+             const classes = await this.getClasses(sparqlDistribution, dataset);
+             // Phase 2: Run query for each class via SparqlConstructExecutor.
+             for (const classIri of classes) {
+                 const result = await this.executor.execute(dataset, {
+                     bindings: { '<#class#>': `<${classIri}>` },
+                 });
+                 if (result instanceof NotSupported) {
+                     return result;
+                 }
+                 store.addQuads([...(await collect(result))]);
+             }
+         }
+         catch (e) {
+             const accessUrl = sparqlDistribution.accessUrl;
+             return new Failure(accessUrl ?? new URL('unknown://'), e instanceof Error ? e.message : undefined);
+         }
+         return new Success(store);
+     }
+     async getClasses(distribution, dataset) {
+         const classQuery = substituteQueryTemplates(`SELECT DISTINCT ?class
+ #namedGraph#
+ WHERE {
+ #subjectFilter#
+ ?s a ?class .
+ }
+ LIMIT ${this.maxClasses}`, distribution, dataset);
+         const bindings = await this.fetcher.fetchBindings(distribution.accessUrl.toString(), classQuery);
+         const classes = [];
+         for await (const binding of bindings) {
+             // Bindings are Record<string, RDF.Term>.
+             const bindingRecord = binding;
+             const classValue = bindingRecord['class'];
+             if (classValue && classValue.termType === 'NamedNode') {
+                 classes.push(classValue.value);
+             }
+         }
+         return classes;
+     }
+ }
+ export function createDatatypeAnalyzer(options) {
+     return PerClassAnalyzer.fromFile('class-property-datatypes.rq', options);
+ }
+ export function createLanguageAnalyzer(options) {
+     return PerClassAnalyzer.fromFile('class-property-languages.rq', options);
+ }
+ export function createObjectClassAnalyzer(options) {
+     return PerClassAnalyzer.fromFile('class-property-object-classes.rq', options);
+ }
package/dist/provenance.d.ts ADDED
@@ -0,0 +1,18 @@
+ import type { DatasetCore } from '@rdfjs/types';
+ /**
+  * Add PROV-O provenance metadata to a dataset.
+  *
+  * Adds:
+  * - `<iri> a prov:Entity`
+  * - `<iri> prov:wasGeneratedBy _:activity`
+  * - `_:activity a prov:Activity`
+  * - `_:activity prov:startedAtTime "..."^^xsd:dateTime`
+  * - `_:activity prov:endedAtTime "..."^^xsd:dateTime`
+  *
+  * @param data The dataset to add provenance to
+  * @param iri The IRI of the entity
+  * @param startedAt Start time of the activity
+  * @param endedAt End time of the activity
+  */
+ export declare function withProvenance(data: DatasetCore, iri: string, startedAt: Date, endedAt: Date): DatasetCore;
+ //# sourceMappingURL=provenance.d.ts.map
package/dist/provenance.d.ts.map ADDED
@@ -0,0 +1 @@
+ {"version":3,"file":"provenance.d.ts","sourceRoot":"","sources":["../src/provenance.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,cAAc,CAAC;AAiBhD;;;;;;;;;;;;;;GAcG;AACH,wBAAgB,cAAc,CAC5B,IAAI,EAAE,WAAW,EACjB,GAAG,EAAE,MAAM,EACX,SAAS,EAAE,IAAI,EACf,OAAO,EAAE,IAAI,GACZ,WAAW,CAwBb"}
package/dist/provenance.js ADDED
@@ -0,0 +1,35 @@
+ import { DataFactory, Store } from 'n3';
+ const { namedNode, literal, blankNode, quad } = DataFactory;
+ const RDF_TYPE = namedNode('http://www.w3.org/1999/02/22-rdf-syntax-ns#type');
+ const PROV_ENTITY = namedNode('http://www.w3.org/ns/prov#Entity');
+ const PROV_ACTIVITY = namedNode('http://www.w3.org/ns/prov#Activity');
+ const PROV_WAS_GENERATED_BY = namedNode('http://www.w3.org/ns/prov#wasGeneratedBy');
+ const PROV_STARTED_AT_TIME = namedNode('http://www.w3.org/ns/prov#startedAtTime');
+ const PROV_ENDED_AT_TIME = namedNode('http://www.w3.org/ns/prov#endedAtTime');
+ const XSD_DATE_TIME = namedNode('http://www.w3.org/2001/XMLSchema#dateTime');
+ /**
+  * Add PROV-O provenance metadata to a dataset.
+  *
+  * Adds:
+  * - `<iri> a prov:Entity`
+  * - `<iri> prov:wasGeneratedBy _:activity`
+  * - `_:activity a prov:Activity`
+  * - `_:activity prov:startedAtTime "..."^^xsd:dateTime`
+  * - `_:activity prov:endedAtTime "..."^^xsd:dateTime`
+  *
+  * @param data The dataset to add provenance to
+  * @param iri The IRI of the entity
+  * @param startedAt Start time of the activity
+  * @param endedAt End time of the activity
+  */
+ export function withProvenance(data, iri, startedAt, endedAt) {
+     const store = new Store([...data]);
+     const subject = namedNode(iri);
+     const activity = blankNode();
+     store.addQuad(quad(subject, RDF_TYPE, PROV_ENTITY));
+     store.addQuad(quad(subject, PROV_WAS_GENERATED_BY, activity));
+     store.addQuad(quad(activity, RDF_TYPE, PROV_ACTIVITY));
+     store.addQuad(quad(activity, PROV_STARTED_AT_TIME, literal(startedAt.toISOString(), XSD_DATE_TIME)));
+     store.addQuad(quad(activity, PROV_ENDED_AT_TIME, literal(endedAt.toISOString(), XSD_DATE_TIME)));
+     return store;
+ }
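A hedged sketch of `withProvenance` wrapping an analysis run (the dataset IRI and the `result` variable, a `Success`, are assumptions for illustration):

```typescript
import { withProvenance } from '@lde/pipeline-void';

const startedAt = new Date();
// ... run an analyzer here, yielding `result` (a Success) ...
const endedAt = new Date();

// Returns a new store: the original triples plus prov:Entity and
// prov:Activity metadata describing when the statistics were generated.
const withProv = withProvenance(
  result.data,
  'https://example.org/my-dataset', // assumed dataset IRI
  startedAt,
  endedAt
);
```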
package/dist/sparqlQueryAnalyzer.d.ts ADDED
@@ -0,0 +1,38 @@
+ import { type ExecutableDataset } from '@lde/pipeline';
+ import { SparqlEndpointFetcher } from 'fetch-sparql-endpoint';
+ import { BaseAnalyzer, Success, Failure, NotSupported } from '@lde/pipeline/analyzer';
+ export interface SparqlQueryAnalyzerOptions {
+     /**
+      * Timeout for SPARQL queries in milliseconds.
+      * @default 300000 (5 minutes)
+      */
+     timeout?: number;
+     /**
+      * Custom SparqlEndpointFetcher instance.
+      */
+     fetcher?: SparqlEndpointFetcher;
+ }
+ /**
+  * Analyzer that executes a SPARQL CONSTRUCT query against a dataset's SPARQL endpoint.
+  *
+  * Supports template substitution:
+  * - `#subjectFilter#` — replaced with the dataset's subject filter (if any)
+  * - `#namedGraph#` — replaced with a `FROM <graph>` clause if the distribution has a named graph
+  * - `?dataset` — replaced with the dataset IRI
+  *
+  * This class wraps the SparqlConstructExecutor from @lde/pipeline.
+  */
+ export declare class SparqlQueryAnalyzer extends BaseAnalyzer {
+     readonly name: string;
+     private readonly executor;
+     constructor(name: string, query: string, options?: SparqlQueryAnalyzerOptions);
+     /**
+      * Create an analyzer from a query file in the queries directory.
+      *
+      * @param filename Query filename (e.g., 'triples.rq')
+      * @param options Optional analyzer options
+      */
+     static fromFile(filename: string, options?: SparqlQueryAnalyzerOptions): Promise<SparqlQueryAnalyzer>;
+     execute(dataset: ExecutableDataset): Promise<Success | Failure | NotSupported>;
+ }
+ //# sourceMappingURL=sparqlQueryAnalyzer.d.ts.map
package/dist/sparqlQueryAnalyzer.d.ts.map ADDED
@@ -0,0 +1 @@
+ {"version":3,"file":"sparqlQueryAnalyzer.d.ts","sourceRoot":"","sources":["../src/sparqlQueryAnalyzer.ts"],"names":[],"mappings":"AAAA,OAAO,EAIL,KAAK,iBAAiB,EACvB,MAAM,eAAe,CAAC;AACvB,OAAO,EAAE,qBAAqB,EAAE,MAAM,uBAAuB,CAAC;AAG9D,OAAO,EACL,YAAY,EACZ,OAAO,EACP,OAAO,EACP,YAAY,EACb,MAAM,wBAAwB,CAAC;AAIhC,MAAM,WAAW,0BAA0B;IACzC;;;OAGG;IACH,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB;;OAEG;IACH,OAAO,CAAC,EAAE,qBAAqB,CAAC;CACjC;AAED;;;;;;;;;GASG;AACH,qBAAa,mBAAoB,SAAQ,YAAY;aAIjC,IAAI,EAAE,MAAM;IAH9B,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAA0B;gBAGjC,IAAI,EAAE,MAAM,EAC5B,KAAK,EAAE,MAAM,EACb,OAAO,CAAC,EAAE,0BAA0B;IAiBtC;;;;;OAKG;WACiB,QAAQ,CAC1B,QAAQ,EAAE,MAAM,EAChB,OAAO,CAAC,EAAE,0BAA0B,GACnC,OAAO,CAAC,mBAAmB,CAAC;IAKlB,OAAO,CAClB,OAAO,EAAE,iBAAiB,GACzB,OAAO,CAAC,OAAO,GAAG,OAAO,GAAG,YAAY,CAAC;CAqB7C"}
package/dist/sparqlQueryAnalyzer.js ADDED
@@ -0,0 +1,60 @@
+ import { SparqlConstructExecutor, collect, readQueryFile, } from '@lde/pipeline';
+ import { SparqlEndpointFetcher } from 'fetch-sparql-endpoint';
+ import { resolve, dirname } from 'node:path';
+ import { fileURLToPath } from 'node:url';
+ import { BaseAnalyzer, Success, Failure, NotSupported, } from '@lde/pipeline/analyzer';
+ const __dirname = dirname(fileURLToPath(import.meta.url));
+ /**
+  * Analyzer that executes a SPARQL CONSTRUCT query against a dataset's SPARQL endpoint.
+  *
+  * Supports template substitution:
+  * - `#subjectFilter#` — replaced with the dataset's subject filter (if any)
+  * - `#namedGraph#` — replaced with a `FROM <graph>` clause if the distribution has a named graph
+  * - `?dataset` — replaced with the dataset IRI
+  *
+  * This class wraps the SparqlConstructExecutor from @lde/pipeline.
+  */
+ export class SparqlQueryAnalyzer extends BaseAnalyzer {
+     name;
+     executor;
+     constructor(name, query, options) {
+         super();
+         this.name = name;
+         const fetcher = options?.fetcher ??
+             new SparqlEndpointFetcher({
+                 timeout: options?.timeout ?? 300_000,
+             });
+         this.executor = new SparqlConstructExecutor({
+             query,
+             // eslint-disable-next-line @typescript-eslint/no-explicit-any
+             fetcher: fetcher, // Types differ between package instances
+         });
+     }
+     /**
+      * Create an analyzer from a query file in the queries directory.
+      *
+      * @param filename Query filename (e.g., 'triples.rq')
+      * @param options Optional analyzer options
+      */
+     static async fromFile(filename, options) {
+         const query = await readQueryFile(resolve(__dirname, 'queries', filename));
+         return new SparqlQueryAnalyzer(filename, query, options);
+     }
+     async execute(dataset) {
+         const sparqlDistribution = dataset.getSparqlDistribution();
+         if (sparqlDistribution === null) {
+             return new NotSupported('No SPARQL distribution available');
+         }
+         try {
+             const result = await this.executor.execute(dataset);
+             if (result instanceof NotSupported) {
+                 return result;
+             }
+             const store = await collect(result);
+             return new Success(store);
+         }
+         catch (e) {
+             return new Failure(sparqlDistribution.accessUrl ?? new URL('unknown://'), e instanceof Error ? e.message : undefined);
+         }
+     }
+ }
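A sketch of handling all three possible outcomes of `execute()` (assumes `dataset` as in the README; the query file is one of those shipped with the package):

```typescript
import {
  SparqlQueryAnalyzer,
  Success,
  Failure,
  NotSupported,
} from '@lde/pipeline-void';

const analyzer = await SparqlQueryAnalyzer.fromFile('class-partition.rq');
const result = await analyzer.execute(dataset);

if (result instanceof Success) {
  console.log(`Collected ${result.data.size} VOiD triples`);
} else if (result instanceof Failure) {
  console.error(`${result.endpoint} failed: ${result.message ?? 'unknown error'}`);
} else if (result instanceof NotSupported) {
  console.warn(result.message);
}
```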
package/dist/vocabularyAnalyzer.d.ts ADDED
@@ -0,0 +1,17 @@
+ import { Dataset } from '@lde/dataset';
+ import { type Analyzer, Success, type Failure, type NotSupported } from '@lde/pipeline/analyzer';
+ /**
+  * Decorator analyzer that enriches results with `void:vocabulary` triples.
+  *
+  * Wraps another analyzer, runs it, then inspects `void:property` triples
+  * to detect known vocabulary prefixes and add corresponding `void:vocabulary`
+  * triples to the result.
+  */
+ export declare class VocabularyAnalyzer implements Analyzer {
+     private readonly inner;
+     readonly name: string;
+     constructor(inner: Analyzer);
+     execute(dataset: Dataset): Promise<Success | Failure | NotSupported>;
+     finish(): Promise<void>;
+ }
+ //# sourceMappingURL=vocabularyAnalyzer.d.ts.map
package/dist/vocabularyAnalyzer.d.ts.map ADDED
@@ -0,0 +1 @@
+ {"version":3,"file":"vocabularyAnalyzer.d.ts","sourceRoot":"","sources":["../src/vocabularyAnalyzer.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,cAAc,CAAC;AAGvC,OAAO,EACL,KAAK,QAAQ,EACb,OAAO,EACP,KAAK,OAAO,EACZ,KAAK,YAAY,EAClB,MAAM,wBAAwB,CAAC;AAiChC;;;;;;GAMG;AACH,qBAAa,kBAAmB,YAAW,QAAQ;IAGrC,OAAO,CAAC,QAAQ,CAAC,KAAK;IAFlC,SAAgB,IAAI,EAAE,MAAM,CAAC;gBAEA,KAAK,EAAE,QAAQ;IAI/B,OAAO,CAClB,OAAO,EAAE,OAAO,GACf,OAAO,CAAC,OAAO,GAAG,OAAO,GAAG,YAAY,CAAC;IAU/B,MAAM,IAAI,OAAO,CAAC,IAAI,CAAC;CAGrC"}
package/dist/vocabularyAnalyzer.js ADDED
@@ -0,0 +1,75 @@
+ import { DataFactory, Store } from 'n3';
+ import { Success, } from '@lde/pipeline/analyzer';
+ const { namedNode, quad } = DataFactory;
+ const VOID = 'http://rdfs.org/ns/void#';
+ const voidProperty = namedNode(`${VOID}property`);
+ const voidVocabulary = namedNode(`${VOID}vocabulary`);
+ /**
+  * Known vocabulary namespace prefixes mapped to their canonical URIs.
+  */
+ const vocabularyPrefixes = new Map([
+     ['http://schema.org/', 'http://schema.org/'],
+     ['https://schema.org/', 'https://schema.org/'],
+     [
+         'https://www.ica.org/standards/RiC/ontology#',
+         'https://www.ica.org/standards/RiC/ontology#',
+     ],
+     [
+         'http://www.cidoc-crm.org/cidoc-crm/',
+         'http://www.cidoc-crm.org/cidoc-crm/',
+     ],
+     ['http://purl.org/ontology/bibo/', 'http://purl.org/ontology/bibo/'],
+     ['http://purl.org/dc/elements/1.1/', 'http://purl.org/dc/elements/1.1/'],
+     ['http://purl.org/dc/terms/', 'http://purl.org/dc/terms/'],
+     ['http://purl.org/dc/dcmitype/', 'http://purl.org/dc/dcmitype/'],
+     [
+         'http://www.w3.org/2004/02/skos/core#',
+         'http://www.w3.org/2004/02/skos/core#',
+     ],
+     ['http://xmlns.com/foaf/0.1/', 'http://xmlns.com/foaf/0.1/'],
+ ]);
+ /**
+  * Decorator analyzer that enriches results with `void:vocabulary` triples.
+  *
+  * Wraps another analyzer, runs it, then inspects `void:property` triples
+  * to detect known vocabulary prefixes and add corresponding `void:vocabulary`
+  * triples to the result.
+  */
+ export class VocabularyAnalyzer {
+     inner;
+     name;
+     constructor(inner) {
+         this.inner = inner;
+         this.name = inner.name;
+     }
+     async execute(dataset) {
+         const result = await this.inner.execute(dataset);
+         if (!(result instanceof Success)) {
+             return result;
+         }
+         const enriched = addVocabularyTriples(result.data, dataset.iri.toString());
+         return new Success(enriched);
+     }
+     async finish() {
+         await this.inner.finish?.();
+     }
+ }
+ function addVocabularyTriples(data, datasetIri) {
+     const store = new Store([...data]);
+     const datasetNode = namedNode(datasetIri);
+     // Collect unique vocabulary URIs from void:property triples.
+     const detectedVocabularies = new Set();
+     for (const q of store.match(null, voidProperty, null)) {
+         const propertyUri = q.object.value;
+         for (const [prefix, vocabUri] of vocabularyPrefixes) {
+             if (propertyUri.startsWith(prefix)) {
+                 detectedVocabularies.add(vocabUri);
+                 break;
+             }
+         }
+     }
+     for (const vocabUri of detectedVocabularies) {
+         store.addQuad(quad(datasetNode, voidVocabulary, namedNode(vocabUri)));
+     }
+     return store;
+ }
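A hedged sketch of the decorator pattern above: wrapping an analyzer so detected property namespaces surface as `void:vocabulary` triples (assumes `dataset` as in the README):

```typescript
import {
  SparqlQueryAnalyzer,
  VocabularyAnalyzer,
  Success,
} from '@lde/pipeline-void';

// entity-properties.rq computes property statistics, which the wrapper
// inspects for void:property triples matching known vocabulary prefixes.
const inner = await SparqlQueryAnalyzer.fromFile('entity-properties.rq');
const analyzer = new VocabularyAnalyzer(inner);

const result = await analyzer.execute(dataset);
if (result instanceof Success) {
  // result.data now also contains void:vocabulary triples for
  // recognized prefixes such as http://schema.org/.
}
await analyzer.finish(); // delegates to the inner analyzer
```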
package/package.json ADDED
@@ -0,0 +1,34 @@
+ {
+   "name": "@lde/pipeline-void",
+   "version": "0.2.2",
+   "description": "VOiD (Vocabulary of Interlinked Datasets) statistical analysis for RDF datasets",
+   "repository": {
+     "url": "git+https://github.com/ldengine/lde.git",
+     "directory": "packages/pipeline-void"
+   },
+   "type": "module",
+   "exports": {
+     "./package.json": "./package.json",
+     ".": {
+       "types": "./dist/index.d.ts",
+       "import": "./dist/index.js",
+       "development": "./src/index.ts",
+       "default": "./dist/index.js"
+     }
+   },
+   "main": "./dist/index.js",
+   "module": "./dist/index.js",
+   "types": "./dist/index.d.ts",
+   "files": [
+     "dist",
+     "!**/*.tsbuildinfo"
+   ],
+   "dependencies": {
+     "@lde/dataset": "0.6.1",
+     "@lde/pipeline": "0.6.2",
+     "@rdfjs/types": "^2.0.1",
+     "fetch-sparql-endpoint": "^6.0.0",
+     "n3": "^1.17.0",
+     "tslib": "^2.3.0"
+   }
+ }