@lde/distribution-probe 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,35 @@
1
+ # Distribution Probe
2
+
3
+ Probes a DCAT `Distribution` to check availability and gather metadata. Returns `SparqlProbeResult`, `DataDumpProbeResult`, or `NetworkError` – the probe never throws.
4
+
5
+ ```ts
6
+ import { Distribution } from '@lde/dataset';
7
+ import { probe } from '@lde/distribution-probe';
8
+
9
+ const distribution = new Distribution(
10
+ new URL('https://example.org/data.ttl'),
11
+ 'text/turtle',
12
+ );
13
+ const result = await probe(distribution);
14
+ ```
15
+
16
+ ## Behaviour
17
+
18
+ ### SPARQL endpoints
19
+
20
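+ By default, sends `POST` with `SELECT * { ?s ?p ?o } LIMIT 1` and `Accept: application/sparql-results+json` (a custom `sparqlQuery` option changes the `Accept` header and the validation according to its query type), then: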
21
+
22
+ - **Content-Type is enforced.** The response Content-Type must start with `application/sparql-results+json`; anything else fails the probe (`isSuccess() === false`). This rules out HTML error pages served with `200 OK`.
23
+ - The JSON body must parse and contain a `results` object. Empty bodies, invalid JSON, and missing `results` all fail the probe with a `failureReason`.
24
+
25
+ ### Data dumps
26
+
27
+ Sends `HEAD` with `Accept: <distribution.mimeType>` and `Accept-Encoding: identity`. If `Content-Length` is missing or ≤ 10 KB, retries with `GET` to validate the body – this also catches servers that return `0` from `HEAD`.
28
+
29
+ - **Content-Type is checked as a soft warning, not a hard failure.** If the server’s Content-Type disagrees with the distribution’s declared `mimeType`, a message is appended to `result.warnings` but `isSuccess()` stays `true`. Compression wrappers (`application/gzip`, `application/x-gzip`, `application/octet-stream`) are skipped so a gzipped Turtle file doesn’t trigger a warning.
30
+ - **Body is parse-validated only for Turtle, N-Triples, and N-Quads** (Content-Type starting with `text/turtle`, `application/n-triples`, or `application/n-quads`). Empty bodies and parse errors fail the probe. Other RDF serializations (RDF/XML, JSON-LD, TriG, …) are not parse-validated – only HTTP status and headers are checked.
31
+ - Bodies larger than 10 KB are not fetched; only `HEAD` metadata is inspected.
32
+
33
+ ### Network errors
34
+
35
+ Any thrown exception from `fetch` (DNS, connection refused, TLS, timeout after the configured `timeoutMs` – default 5 000 ms) is caught and returned as a `NetworkError` with the original message.
@@ -0,0 +1,2 @@
1
+ export { probe, NetworkError, SparqlProbeResult, DataDumpProbeResult, type ProbeResultType, } from './probe.js';
2
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EACL,KAAK,EACL,YAAY,EACZ,iBAAiB,EACjB,mBAAmB,EACnB,KAAK,eAAe,GACrB,MAAM,YAAY,CAAC"}
package/dist/index.js ADDED
@@ -0,0 +1 @@
1
+ export { probe, NetworkError, SparqlProbeResult, DataDumpProbeResult, } from './probe.js';
@@ -0,0 +1,76 @@
1
+ import { Distribution } from '@lde/dataset';
2
+ /**
3
+ * Options for {@link probe}.
4
+ */
5
+ export interface ProbeOptions {
6
+ /** Request timeout in milliseconds. Defaults to 5 000. */
7
+ timeoutMs?: number;
8
+ /**
9
+ * Extra HTTP headers to send with the request. Merged with probe-generated
10
+ * headers; caller-supplied values take precedence on conflict.
11
+ */
12
+ headers?: Headers;
13
+ /**
14
+ * SPARQL query to use when probing a SPARQL endpoint. The query’s type
15
+ * (`ASK` / `SELECT` / `CONSTRUCT` / `DESCRIBE`) determines the `Accept`
16
+ * header and the response validation strategy. Ignored for data-dump
17
+ * distributions. Defaults to `SELECT * { ?s ?p ?o } LIMIT 1`.
18
+ */
19
+ sparqlQuery?: string;
20
+ }
21
+ /**
22
+ * Result of a network error during probing.
23
+ */
24
+ export declare class NetworkError {
25
+ readonly url: string;
26
+ readonly message: string;
27
+ readonly responseTimeMs: number;
28
+ constructor(url: string, message: string, responseTimeMs: number);
29
+ }
30
+ /**
31
+ * Base class for successful probe results.
32
+ */
33
+ declare abstract class ProbeResult {
34
+ readonly url: string;
35
+ readonly statusCode: number;
36
+ readonly statusText: string;
37
+ readonly lastModified: Date | null;
38
+ readonly contentType: string | null;
39
+ readonly failureReason: string | null;
40
+ readonly warnings: string[];
41
+ readonly responseTimeMs: number;
42
+ constructor(url: string, response: Response, responseTimeMs: number, failureReason?: string | null);
43
+ isSuccess(): boolean;
44
+ }
45
+ /**
46
+ * Result of probing a SPARQL endpoint.
47
+ */
48
+ export declare class SparqlProbeResult extends ProbeResult {
49
+ readonly acceptedContentType: string;
50
+ constructor(url: string, response: Response, responseTimeMs: number, acceptedContentType: string, failureReason?: string | null);
51
+ isSuccess(): boolean;
52
+ }
53
+ /**
54
+ * Result of probing a data dump distribution.
55
+ */
56
+ export declare class DataDumpProbeResult extends ProbeResult {
57
+ readonly contentSize: number | null;
58
+ constructor(url: string, response: Response, responseTimeMs: number, failureReason?: string | null);
59
+ }
60
+ export type ProbeResultType = SparqlProbeResult | DataDumpProbeResult | NetworkError;
61
+ /**
62
+ * Probe a distribution to check availability and gather metadata.
63
+ *
64
+ * For SPARQL endpoints, issues the configured SPARQL query (default: a
65
+ * minimal `SELECT`). For data dumps, issues `HEAD` (with a `GET` fallback
66
+ * for small or unknown-size bodies).
67
+ *
68
+ * Returns a pure result object; never throws.
69
+ *
70
+ * @param distribution The distribution to probe.
71
+ * @param options Probe options. For back-compat, passing a `number` is
72
+ * equivalent to `{ timeoutMs: number }` and is deprecated.
73
+ */
74
+ export declare function probe(distribution: Distribution, options?: number | ProbeOptions): Promise<ProbeResultType>;
75
+ export {};
76
+ //# sourceMappingURL=probe.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"probe.d.ts","sourceRoot":"","sources":["../src/probe.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AAG5C;;GAEG;AACH,MAAM,WAAW,YAAY;IAC3B,0DAA0D;IAC1D,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB;;;OAGG;IACH,OAAO,CAAC,EAAE,OAAO,CAAC;IAClB;;;;;OAKG;IACH,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB;AAKD;;GAEG;AACH,qBAAa,YAAY;aAEL,GAAG,EAAE,MAAM;aACX,OAAO,EAAE,MAAM;aACf,cAAc,EAAE,MAAM;gBAFtB,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,MAAM,EACf,cAAc,EAAE,MAAM;CAEzC;AAED;;GAEG;AACH,uBAAe,WAAW;aAUN,GAAG,EAAE,MAAM;IAT7B,SAAgB,UAAU,EAAE,MAAM,CAAC;IACnC,SAAgB,UAAU,EAAE,MAAM,CAAC;IACnC,SAAgB,YAAY,EAAE,IAAI,GAAG,IAAI,CAAQ;IACjD,SAAgB,WAAW,EAAE,MAAM,GAAG,IAAI,CAAC;IAC3C,SAAgB,aAAa,EAAE,MAAM,GAAG,IAAI,CAAC;IAC7C,SAAgB,QAAQ,EAAE,MAAM,EAAE,CAAM;IACxC,SAAgB,cAAc,EAAE,MAAM,CAAC;gBAGrB,GAAG,EAAE,MAAM,EAC3B,QAAQ,EAAE,QAAQ,EAClB,cAAc,EAAE,MAAM,EACtB,aAAa,GAAE,MAAM,GAAG,IAAW;IAa9B,SAAS,IAAI,OAAO;CAO5B;AAKD;;GAEG;AACH,qBAAa,iBAAkB,SAAQ,WAAW;IAChD,SAAgB,mBAAmB,EAAE,MAAM,CAAC;gBAG1C,GAAG,EAAE,MAAM,EACX,QAAQ,EAAE,QAAQ,EAClB,cAAc,EAAE,MAAM,EACtB,mBAAmB,EAAE,MAAM,EAC3B,aAAa,GAAE,MAAM,GAAG,IAAW;IAM5B,SAAS,IAAI,OAAO;CAM9B;AAED;;GAEG;AACH,qBAAa,mBAAoB,SAAQ,WAAW;IAClD,SAAgB,WAAW,EAAE,MAAM,GAAG,IAAI,CAAQ;gBAGhD,GAAG,EAAE,MAAM,EACX,QAAQ,EAAE,QAAQ,EAClB,cAAc,EAAE,MAAM,EACtB,aAAa,GAAE,MAAM,GAAG,IAAW;CAQtC;AAED,MAAM,MAAM,eAAe,GACvB,iBAAiB,GACjB,mBAAmB,GACnB,YAAY,CAAC;AAIjB;;;;;;;;;;;;GAYG;AACH,wBAAsB,KAAK,CACzB,YAAY,EAAE,YAAY,EAC1B,OAAO,CAAC,EAAE,MAAM,GAAG,YAAY,GAC9B,OAAO,CAAC,eAAe,CAAC,CAkC1B"}
package/dist/probe.js ADDED
@@ -0,0 +1,296 @@
1
+ import { Parser } from 'n3';
2
+ const DEFAULT_SPARQL_QUERY = 'SELECT * { ?s ?p ?o } LIMIT 1';
3
+ const DEFAULT_TIMEOUT_MS = 5000;
4
+ /**
5
+ * Result of a network error during probing.
6
+ */
7
+ export class NetworkError {
8
+ url;
9
+ message;
10
+ responseTimeMs;
11
+ constructor(url, message, responseTimeMs) {
12
+ this.url = url;
13
+ this.message = message;
14
+ this.responseTimeMs = responseTimeMs;
15
+ }
16
+ }
17
+ /**
18
+ * Base class for successful probe results.
19
+ */
20
+ class ProbeResult {
21
+ url;
22
+ statusCode;
23
+ statusText;
24
+ lastModified = null;
25
+ contentType;
26
+ failureReason;
27
+ warnings = [];
28
+ responseTimeMs;
29
+ constructor(url, response, responseTimeMs, failureReason = null) {
30
+ this.url = url;
31
+ this.statusCode = response.status;
32
+ this.statusText = response.statusText;
33
+ this.contentType = response.headers.get('Content-Type');
34
+ this.failureReason = failureReason;
35
+ this.responseTimeMs = responseTimeMs;
36
+ const lastModifiedHeader = response.headers.get('Last-Modified');
37
+ if (lastModifiedHeader) {
38
+ this.lastModified = new Date(lastModifiedHeader);
39
+ }
40
+ }
41
+ isSuccess() {
42
+ return (this.statusCode >= 200 &&
43
+ this.statusCode < 400 &&
44
+ this.failureReason === null);
45
+ }
46
+ }
47
+ const SPARQL_RESULTS_JSON = 'application/sparql-results+json';
48
+ const SPARQL_RDF_RESULTS = 'application/n-triples';
49
+ /**
50
+ * Result of probing a SPARQL endpoint.
51
+ */
52
+ export class SparqlProbeResult extends ProbeResult {
53
+ acceptedContentType;
54
+ constructor(url, response, responseTimeMs, acceptedContentType, failureReason = null) {
55
+ super(url, response, responseTimeMs, failureReason);
56
+ this.acceptedContentType = acceptedContentType;
57
+ }
58
+ isSuccess() {
59
+ return (super.isSuccess() &&
60
+ (this.contentType?.startsWith(this.acceptedContentType) ?? false));
61
+ }
62
+ }
/**
 * Result of probing a data dump distribution.
 */
export class DataDumpProbeResult extends ProbeResult {
    /** Size from the `Content-Length` header, or `null` when absent or not numeric. */
    contentSize = null;
    /**
     * @param url The URL that was probed.
     * @param response The HTTP response (from `HEAD` or `GET`).
     * @param responseTimeMs Elapsed time of the probe in milliseconds.
     * @param failureReason Validation failure message, or `null` when none.
     */
    constructor(url, response, responseTimeMs, failureReason = null) {
        super(url, response, responseTimeMs, failureReason);
        const contentLengthHeader = response.headers.get('Content-Length');
        if (contentLengthHeader) {
            const parsed = parseInt(contentLengthHeader, 10);
            // Leave `contentSize` as `null` rather than exposing NaN for a
            // malformed header.
            if (!Number.isNaN(parsed)) {
                this.contentSize = parsed;
            }
        }
    }
}
/**
 * Probe a distribution to check availability and gather metadata.
 *
 * For SPARQL endpoints, issues the configured SPARQL query (default: a
 * minimal `SELECT`). For data dumps, issues `HEAD` (with a `GET` fallback
 * for small or unknown-size bodies).
 *
 * Returns a pure result object; never throws. Every failure that occurs
 * while preparing or executing the request (missing or invalid access URL,
 * malformed URL credentials, network errors, timeouts) is reported as a
 * `NetworkError`.
 *
 * @param distribution The distribution to probe.
 * @param options Probe options. For back-compat, passing a `number` is
 *   equivalent to `{ timeoutMs: number }` and is deprecated.
 * @returns A `SparqlProbeResult`, `DataDumpProbeResult` or `NetworkError`.
 */
export async function probe(distribution, options) {
    // NOTE(review): this is the access URL as given, so it may still contain
    // `user:pass@`, whereas result objects receive the credential-free URL.
    const url = distribution.accessUrl?.toString() ?? 'unknown';
    const start = performance.now();
    try {
        // Request preparation lives inside the `try` because it can throw too:
        // `new URL('unknown')` when there is no access URL, or header/credential
        // handling on malformed input.
        const resolved = resolveOptions(options);
        const [authUrl, authHeaders] = distribution.accessUrl !== undefined
            ? extractUrlCredentials(distribution.accessUrl, resolved.headers)
            : [new URL(url), new Headers(resolved.headers)];
        if (distribution.isSparql()) {
            return await probeSparqlEndpoint(authUrl.toString(), distribution, resolved, authHeaders, start);
        }
        return await probeDataDump(authUrl.toString(), distribution, resolved, authHeaders, start);
    }
    catch (e) {
        const responseTimeMs = Math.round(performance.now() - start);
        return new NetworkError(url, e instanceof Error ? e.message : String(e), responseTimeMs);
    }
}
107
+ function resolveOptions(options) {
108
+ if (typeof options === 'number') {
109
+ return {
110
+ timeoutMs: options,
111
+ headers: new Headers(),
112
+ sparqlQuery: DEFAULT_SPARQL_QUERY,
113
+ };
114
+ }
115
+ return {
116
+ timeoutMs: options?.timeoutMs ?? DEFAULT_TIMEOUT_MS,
117
+ headers: options?.headers ?? new Headers(),
118
+ sparqlQuery: options?.sparqlQuery ?? DEFAULT_SPARQL_QUERY,
119
+ };
120
+ }
121
+ /**
122
+ * Strip `user:pass@` from a URL and turn it into an `Authorization: Basic`
123
+ * header. Returns the cleaned URL and a merged Headers object that preserves
124
+ * any caller-supplied headers.
125
+ */
126
+ function extractUrlCredentials(url, baseHeaders) {
127
+ const headers = new Headers(baseHeaders);
128
+ if (url.username === '' && url.password === '') {
129
+ return [url, headers];
130
+ }
131
+ const credentials = `${decodeURIComponent(url.username)}:${decodeURIComponent(url.password)}`;
132
+ if (!headers.has('Authorization')) {
133
+ headers.set('Authorization', `Basic ${Buffer.from(credentials).toString('base64')}`);
134
+ }
135
+ const cleanUrl = new URL(url.toString());
136
+ cleanUrl.username = '';
137
+ cleanUrl.password = '';
138
+ return [cleanUrl, headers];
139
+ }
/**
 * Classify a SPARQL query as `ASK`, `SELECT`, `CONSTRUCT` or `DESCRIBE`.
 *
 * IRIs, comments and `PREFIX` labels are blanked out before matching, so
 * that a `#` inside an IRI does not swallow the rest of the line and a
 * keyword inside an IRI or prefix label is not mistaken for the query form.
 * The first remaining keyword wins. Falls back to `SELECT` when no keyword
 * is found – robust enough for availability probing but not a full SPARQL
 * parser.
 *
 * @param query The SPARQL query text.
 * @returns The upper-cased query form.
 */
function detectSparqlQueryType(query) {
    const stripped = query
        // IRIs first: they may legitimately contain `#` (e.g. `<http://ex.org/ns#>`).
        .replace(/<[^<>"{}|^`\\\s]*>/g, ' ')
        // Then comments, which run from `#` to the end of the line.
        .replace(/#[^\n\r]*/g, ' ')
        // Then prefix labels such as `PREFIX select:`.
        .replace(/\bPREFIX\s+[^\s:]*:/gi, ' ');
    const match = /\b(ASK|SELECT|CONSTRUCT|DESCRIBE)\b/i.exec(stripped);
    return (match?.[1].toUpperCase() ?? 'SELECT');
}
150
+ function acceptHeaderForQueryType(queryType) {
151
+ if (queryType === 'ASK' || queryType === 'SELECT') {
152
+ return SPARQL_RESULTS_JSON;
153
+ }
154
+ return SPARQL_RDF_RESULTS;
155
+ }
156
+ async function probeSparqlEndpoint(url, _distribution, options, authHeaders, start) {
157
+ const queryType = detectSparqlQueryType(options.sparqlQuery);
158
+ const accept = acceptHeaderForQueryType(queryType);
159
+ const headers = new Headers({
160
+ 'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
161
+ Accept: accept,
162
+ });
163
+ for (const [key, value] of authHeaders) {
164
+ headers.set(key, value);
165
+ }
166
+ const response = await fetch(url, {
167
+ signal: AbortSignal.timeout(options.timeoutMs),
168
+ method: 'POST',
169
+ headers,
170
+ body: `query=${encodeURIComponent(options.sparqlQuery)}`,
171
+ });
172
+ const actualContentType = response.headers.get('Content-Type');
173
+ const contentTypeMatches = actualContentType?.startsWith(accept) ?? false;
174
+ let failureReason = null;
175
+ if (response.ok && contentTypeMatches) {
176
+ failureReason = await validateSparqlResponse(response, queryType);
177
+ }
178
+ else {
179
+ // Drain unconsumed body to release the underlying connection.
180
+ await response.body?.cancel();
181
+ }
182
+ const responseTimeMs = Math.round(performance.now() - start);
183
+ return new SparqlProbeResult(url, response, responseTimeMs, accept, failureReason);
184
+ }
/**
 * Validate the body of a SPARQL response against the expected query form.
 *
 * Consumes the response body.
 *
 * @param response The HTTP response whose body is read as text.
 * @param queryType `ASK`, `SELECT`, `CONSTRUCT` or `DESCRIBE`.
 * @returns A failure message, or `null` when the body is acceptable.
 */
async function validateSparqlResponse(response, queryType) {
    const body = await response.text();
    if (body.length === 0) {
        return 'SPARQL endpoint returned an empty response';
    }
    if (queryType === 'CONSTRUCT' || queryType === 'DESCRIBE') {
        // Body should be RDF; a non-empty response is sufficient to confirm the
        // endpoint answered. Deep parse validation is the data-dump path’s job.
        return null;
    }
    let json;
    try {
        json = JSON.parse(body);
    }
    catch {
        return 'SPARQL endpoint returned invalid JSON';
    }
    // `JSON.parse` can return `null` (body `null`); optional chaining keeps
    // that a validation failure instead of a TypeError.
    if (queryType === 'ASK') {
        if (typeof json?.boolean !== 'boolean') {
            return 'SPARQL endpoint did not return a valid ASK result';
        }
        return null;
    }
    // SELECT
    if (!json?.results || typeof json.results !== 'object') {
        return 'SPARQL endpoint did not return a valid results object';
    }
    return null;
}
/**
 * Probe a data-dump distribution.
 *
 * Issues `HEAD` first. When the reported `Content-Length` is missing,
 * unparseable or at most 10 240 bytes, follows up with `GET` and validates
 * the body; otherwise only the `HEAD` metadata is used.
 *
 * @param url Credential-free URL to request.
 * @param distribution The distribution; its `mimeType` drives the `Accept`
 *   header and the Content-Type mismatch warning.
 * @param options Resolved probe options (`timeoutMs` is read here).
 * @param authHeaders Headers to merge over the probe-generated ones.
 * @param start `performance.now()` timestamp taken when probing began.
 * @returns The probe result for the `GET` or `HEAD` response.
 */
async function probeDataDump(url, distribution, options, authHeaders, start) {
    const headers = new Headers({
        Accept: distribution.mimeType ?? '*/*',
        'Accept-Encoding': 'identity',
    });
    for (const [key, value] of authHeaders) {
        headers.set(key, value);
    }
    // One signal is shared by HEAD and GET, so the timeout bounds both requests
    // together.
    const requestOptions = {
        signal: AbortSignal.timeout(options.timeoutMs),
        headers,
    };
    const headResponse = await fetch(url, {
        method: 'HEAD',
        ...requestOptions,
    });
    const contentLength = headResponse.headers.get('Content-Length');
    const parsedLength = contentLength ? parseInt(contentLength, 10) : 0;
    // An unparseable header means the size is unknown; treat it like a missing
    // header so that `NaN <= 10_240` (always false) cannot skip body validation.
    const contentLengthBytes = Number.isNaN(parsedLength) ? 0 : parsedLength;
    // For small or unknown-size files, do a GET to validate body content.
    // This also handles servers that incorrectly return 0 Content-Length for HEAD.
    if (contentLengthBytes <= 10_240) {
        const getResponse = await fetch(url, {
            method: 'GET',
            ...requestOptions,
        });
        const body = await getResponse.text();
        const isHttpSuccess = getResponse.status >= 200 && getResponse.status < 400;
        // Only validate bodies of successful responses; for HTTP errors the
        // status code already marks the result as failed.
        const failureReason = isHttpSuccess
            ? validateBody(body, getResponse.headers.get('Content-Type'))
            : null;
        const responseTimeMs = Math.round(performance.now() - start);
        const result = new DataDumpProbeResult(url, getResponse, responseTimeMs, failureReason);
        checkContentTypeMismatch(result, distribution.mimeType);
        return result;
    }
    const responseTimeMs = Math.round(performance.now() - start);
    const result = new DataDumpProbeResult(url, headResponse, responseTimeMs);
    checkContentTypeMismatch(result, distribution.mimeType);
    return result;
}
254
+ const rdfContentTypes = [
255
+ 'text/turtle',
256
+ 'application/n-triples',
257
+ 'application/n-quads',
258
+ ];
259
+ function validateBody(body, contentType) {
260
+ if (body.length === 0) {
261
+ return 'Distribution is empty';
262
+ }
263
+ if (contentType && rdfContentTypes.some((t) => contentType.startsWith(t))) {
264
+ try {
265
+ const parser = new Parser();
266
+ const quads = parser.parse(body);
267
+ if (quads.length === 0) {
268
+ return 'Distribution contains no RDF triples';
269
+ }
270
+ }
271
+ catch (e) {
272
+ return e instanceof Error ? e.message : String(e);
273
+ }
274
+ }
275
+ return null;
276
+ }
277
+ /** Content types that indicate compression, not the RDF serialization format. */
278
+ const compressionTypes = new Set([
279
+ 'application/gzip',
280
+ 'application/x-gzip',
281
+ 'application/octet-stream',
282
+ ]);
283
+ /**
284
+ * Compare the declared MIME type from the dataset registry against the
285
+ * server's Content-Type header. Adds a warning when they disagree.
286
+ */
287
+ function checkContentTypeMismatch(result, declaredMimeType) {
288
+ if (!result.isSuccess() || !declaredMimeType || !result.contentType)
289
+ return;
290
+ const actual = result.contentType.split(';')[0].trim();
291
+ if (compressionTypes.has(actual))
292
+ return;
293
+ if (actual !== declaredMimeType) {
294
+ result.warnings.push(`Server Content-Type ${actual} does not match declared media type ${declaredMimeType}`);
295
+ }
296
+ }
package/package.json ADDED
@@ -0,0 +1,31 @@
1
+ {
2
+ "name": "@lde/distribution-probe",
3
+ "version": "0.1.0",
4
+ "repository": {
5
+ "url": "git+https://github.com/ldelements/lde.git",
6
+ "directory": "packages/distribution-probe"
7
+ },
8
+ "license": "MIT",
9
+ "type": "module",
10
+ "exports": {
11
+ "./package.json": "./package.json",
12
+ ".": {
13
+ "types": "./dist/index.d.ts",
14
+ "import": "./dist/index.js",
15
+ "development": "./src/index.ts",
16
+ "default": "./dist/index.js"
17
+ }
18
+ },
19
+ "main": "./dist/index.js",
20
+ "module": "./dist/index.js",
21
+ "types": "./dist/index.d.ts",
22
+ "files": [
23
+ "dist",
24
+ "!**/*.tsbuildinfo"
25
+ ],
26
+ "dependencies": {
27
+ "@lde/dataset": "0.7.2",
28
+ "n3": "^2.0.1",
29
+ "tslib": "^2.3.0"
30
+ }
31
+ }