npm - @lde/distribution-probe - Versions diffs - 0.1.13 → 0.2.0 - Mend

@lde/distribution-probe 0.1.13 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

package/README.md CHANGED Viewed

@@ -24,11 +24,24 @@ Sends `POST` with the configured query (default `SELECT * { ?s ?p ?o } LIMIT 1`)
 ### Data dumps
-Sends `HEAD` with `Accept: <distribution.mimeType>` and `Accept-Encoding: identity`. If `Content-Length` is missing or ≤ 10 KB, retries with `GET` to validate the body – this also catches servers that return `0` from `HEAD`.
+#### Reachability (the default)
+Sends `HEAD` with `Accept: <distribution.mimeType>` and `Accept-Encoding: identity`. A successful `HEAD` settles reachability and gathers metadata (`Content-Length`, `Last-Modified`) **without reading the body**. If `HEAD` is unsuccessful — e.g. a server that returns `405`/`501` because it does not implement `HEAD` — the probe falls back to a body-less `GET` to confirm the endpoint is up. The body is never downloaded.
+This is deliberately cheap: reading a body forces a slow, generate-on-the-fly endpoint (a TriplyDB dump, a SPARQL `CONSTRUCT` export) to start producing its export, which a `HEAD` does not.
 - **Content-Type is checked as a soft warning, not a hard failure.** If the server’s Content-Type disagrees with the distribution’s declared `mimeType`, a message is appended to `result.warnings` but `isSuccess()` stays `true`. Compression wrappers (`application/gzip`, `application/x-gzip`, `application/octet-stream`) are skipped so a gzipped Turtle file doesn’t trigger a warning.
-- **Body is parse-validated only for Turtle, N-Triples, and N-Quads** (Content-Type starting with `text/turtle`, `application/n-triples`, or `application/n-quads`). Empty bodies and parse errors fail the probe. Other RDF serializations (RDF/XML, JSON-LD, TriG, …) are not parse-validated – only HTTP status and headers are checked.
-- Bodies larger than 10 KB are not fetched; only `HEAD` metadata is inspected.
+#### Content validation (opt-in)
+Set `validateRdfContent: true` to additionally confirm that a dump actually carries RDF. It applies only to distributions whose **declared** `mimeType` is an RDF serialization (`text/turtle`, `application/n-triples`, `application/n-quads`, `application/trig`, `text/n3`, `application/ld+json`, `application/rdf+xml`); non-RDF and undeclared-type distributions stay reachability-only.
+When on, the probe `GET`s the dump — **regardless of size** — and reads only a **bounded prefix** (256 KiB), never the whole body:
+- It settles on the **first triple** and stops, so a large dump is validated from its opening chunk. The line/statement-oriented serializations and RDF/XML stream a triple out of the prefix; **JSON-LD is not streamable** (its parser needs the whole document), so a JSON-LD dump is only validated when it fits the prefix in full — a larger one is reported reachable but unvalidated.
+- A gzip body that `fetch` did not decompress (a `.gz` dump, or one served with a non-standard `Content-Encoding`) is inflated in-place; a gzip that will not inflate when the **complete** compressed body was read fails as `Distribution is not valid gzip`.
+- Empty bodies (`Distribution is empty`) and bodies that parse to **zero** triples (`Distribution contains no RDF triples`) fail the probe. A deliberately truncated prefix is never mistaken for either — it is inconclusive.
+- **Reachability is settled by the response, so an inconclusive validation never turns a reachable dump into a failure.** If no triple surfaces within `rdfValidationBudgetMs` (default `min(timeoutMs, 2000)`, clamped to `timeoutMs`), the read is aborted and the distribution is reported reachable but unvalidated (no `failureReason`). This bounds the extra latency content validation adds on slow, generate-on-the-fly endpoints.
 ### Network errors

package/dist/probe.d.ts CHANGED Viewed

@@ -28,6 +28,29 @@ export interface ProbeOptions {
      * the default; negative values are clamped to `0`.
      */
     retries?: number;
+    /**
+     * Validate the body content of data-dump distributions whose declared media
+     * type is an RDF serialization, by reading a bounded prefix and confirming it
+     * carries at least one triple. When `false` (the default) a data dump is only
+     * checked for reachability (a `HEAD`, with a body-less `GET` fallback if `HEAD`
+     * is unsupported) and its body is never read. When `true`, every declared-RDF
+     * dump — regardless of size — is fetched and validated; non-RDF and
+     * undeclared-type distributions are still reachability-only. Validation is
+     * opt-in because reading a body forces a slow, generate-on-the-fly endpoint to
+     * start producing its export, which a `HEAD` does not.
+     */
+    validateRdfContent?: boolean;
+    /**
+     * Soft deadline, in milliseconds, for finding the first triple when
+     * {@link validateRdfContent} is on. Reachability is settled by the response
+     * itself; if no triple has surfaced within this budget the read is aborted and
+     * the distribution is reported reachable but unvalidated (no `failureReason`),
+     * never failed. This bounds the extra latency content validation adds on slow,
+     * generate-on-the-fly endpoints. Clamped to {@link timeoutMs} (a longer budget
+     * is meaningless — the request times out first). Defaults to
+     * `min(timeoutMs, 2000)`.
+     */
+    rdfValidationBudgetMs?: number;
 }
 /**
  * Result of a network error during probing.
@@ -80,7 +103,8 @@ export type ProbeResultType = SparqlProbeResult | DataDumpProbeResult | NetworkE
  *
  * For SPARQL endpoints, issues the configured SPARQL query (default: a
  * minimal `SELECT`). For data dumps, issues `HEAD` (with a `GET` fallback
- * for small or unknown-size bodies).
+ * when `HEAD` is unsuccessful; a body is read, as a bounded prefix only, when
+ * `validateRdfContent` is on for a dump that declares an RDF media type).
  *
  * Returns a pure result object; never throws.
  */

package/dist/probe.d.ts.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"file":"probe.d.ts","sourceRoot":"","sources":["../src/probe.ts"],"names":[],"mappings":"AAAA,OAAO,EAAyB,YAAY,EAAE,MAAM,cAAc,CAAC;~~AAInE~~;;GAEG;AACH,MAAM,WAAW,YAAY;IAC3B,0DAA0D;IAC1D,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB;;;OAGG;IACH,OAAO,CAAC,EAAE,OAAO,CAAC;IAClB;;;;;OAKG;IACH,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB;;;;;;;;;OASG;IACH,OAAO,CAAC,EAAE,MAAM,CAAC;~~CAClB~~;~~AASD~~;;GAEG;AACH,qBAAa,YAAY;aAEL,GAAG,EAAE,MAAM;aACX,OAAO,EAAE,MAAM;aACf,cAAc,EAAE,MAAM;gBAFtB,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,MAAM,EACf,cAAc,EAAE,MAAM;CAEzC;AAED;;GAEG;AACH,uBAAe,WAAW;aAUN,GAAG,EAAE,MAAM;IAT7B,SAAgB,UAAU,EAAE,MAAM,CAAC;IACnC,SAAgB,UAAU,EAAE,MAAM,CAAC;IACnC,SAAgB,YAAY,EAAE,IAAI,GAAG,IAAI,CAAQ;IACjD,SAAgB,WAAW,EAAE,MAAM,GAAG,IAAI,CAAC;IAC3C,SAAgB,aAAa,EAAE,MAAM,GAAG,IAAI,CAAC;IAC7C,SAAgB,QAAQ,EAAE,MAAM,EAAE,CAAM;IACxC,SAAgB,cAAc,EAAE,MAAM,CAAC;gBAGrB,GAAG,EAAE,MAAM,EAC3B,QAAQ,EAAE,QAAQ,EAClB,cAAc,EAAE,MAAM,EACtB,aAAa,GAAE,MAAM,GAAG,IAAW;IAa9B,SAAS,IAAI,OAAO;CAO5B;AAqBD;;GAEG;AACH,qBAAa,iBAAkB,SAAQ,WAAW;IAChD;;;;;OAKG;IACH,SAAgB,oBAAoB,EAAE,SAAS,MAAM,EAAE,CAAC;gBAGtD,GAAG,EAAE,MAAM,EACX,QAAQ,EAAE,QAAQ,EAClB,cAAc,EAAE,MAAM,EACtB,oBAAoB,EAAE,MAAM,GAAG,SAAS,MAAM,EAAE,EAChD,aAAa,GAAE,MAAM,GAAG,IAAW;IAS5B,SAAS,IAAI,OAAO;CAQ9B;AAED;;GAEG;AACH,qBAAa,mBAAoB,SAAQ,WAAW;IAClD,SAAgB,WAAW,EAAE,MAAM,GAAG,IAAI,CAAQ;gBAGhD,GAAG,EAAE,MAAM,EACX,QAAQ,EAAE,QAAQ,EAClB,cAAc,EAAE,MAAM,EACtB,aAAa,GAAE,MAAM,GAAG,IAAW;CAQtC;AAED,MAAM,MAAM,eAAe,GACvB,iBAAiB,GACjB,mBAAmB,GACnB,YAAY,CAAC;AAIjB~~;;;;;;;;GAQG~~;AACH,wBAAsB,KAAK,CACzB,YAAY,EAAE,YAAY,EAC1B,OAAO,CAAC,EAAE,YAAY,GACrB,OAAO,CAAC,eAAe,CAAC,CAqD1B"}
1	+ {"version":3,"file":"probe.d.ts","sourceRoot":"","sources":["../src/probe.ts"],"names":[],"mappings":"AAAA,OAAO,EAAyB,YAAY,EAAE,MAAM,cAAc,CAAC;AAKnE;;GAEG;AACH,MAAM,WAAW,YAAY;IAC3B,0DAA0D;IAC1D,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB;;;OAGG;IACH,OAAO,CAAC,EAAE,OAAO,CAAC;IAClB;;;;;OAKG;IACH,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB;;;;;;;;;OASG;IACH,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB;;;;;;;;;;OAUG;IACH,kBAAkB,CAAC,EAAE,OAAO,CAAC;IAC7B;;;;;;;;;OASG;IACH,qBAAqB,CAAC,EAAE,MAAM,CAAC;CAChC;AAgCD;;GAEG;AACH,qBAAa,YAAY;aAEL,GAAG,EAAE,MAAM;aACX,OAAO,EAAE,MAAM;aACf,cAAc,EAAE,MAAM;gBAFtB,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,MAAM,EACf,cAAc,EAAE,MAAM;CAEzC;AAED;;GAEG;AACH,uBAAe,WAAW;aAUN,GAAG,EAAE,MAAM;IAT7B,SAAgB,UAAU,EAAE,MAAM,CAAC;IACnC,SAAgB,UAAU,EAAE,MAAM,CAAC;IACnC,SAAgB,YAAY,EAAE,IAAI,GAAG,IAAI,CAAQ;IACjD,SAAgB,WAAW,EAAE,MAAM,GAAG,IAAI,CAAC;IAC3C,SAAgB,aAAa,EAAE,MAAM,GAAG,IAAI,CAAC;IAC7C,SAAgB,QAAQ,EAAE,MAAM,EAAE,CAAM;IACxC,SAAgB,cAAc,EAAE,MAAM,CAAC;gBAGrB,GAAG,EAAE,MAAM,EAC3B,QAAQ,EAAE,QAAQ,EAClB,cAAc,EAAE,MAAM,EACtB,aAAa,GAAE,MAAM,GAAG,IAAW;IAa9B,SAAS,IAAI,OAAO;CAO5B;AAqBD;;GAEG;AACH,qBAAa,iBAAkB,SAAQ,WAAW;IAChD;;;;;OAKG;IACH,SAAgB,oBAAoB,EAAE,SAAS,MAAM,EAAE,CAAC;gBAGtD,GAAG,EAAE,MAAM,EACX,QAAQ,EAAE,QAAQ,EAClB,cAAc,EAAE,MAAM,EACtB,oBAAoB,EAAE,MAAM,GAAG,SAAS,MAAM,EAAE,EAChD,aAAa,GAAE,MAAM,GAAG,IAAW;IAS5B,SAAS,IAAI,OAAO;CAQ9B;AAED;;GAEG;AACH,qBAAa,mBAAoB,SAAQ,WAAW;IAClD,SAAgB,WAAW,EAAE,MAAM,GAAG,IAAI,CAAQ;gBAGhD,GAAG,EAAE,MAAM,EACX,QAAQ,EAAE,QAAQ,EAClB,cAAc,EAAE,MAAM,EACtB,aAAa,GAAE,MAAM,GAAG,IAAW;CAQtC;AAED,MAAM,MAAM,eAAe,GACvB,iBAAiB,GACjB,mBAAmB,GACnB,YAAY,CAAC;AAIjB;;;;;;;;;GASG;AACH,wBAAsB,KAAK,CACzB,YAAY,EAAE,YAAY,EAC1B,OAAO,CAAC,EAAE,YAAY,GACrB,OAAO,CAAC,eAAe,CAAC,CAqD1B"}

package/dist/probe.js CHANGED Viewed

@@ -1,9 +1,30 @@
 import { compressionMediaTypes } from '@lde/dataset';
 import { rdfParser } from 'rdf-parse';
 import { Readable } from 'node:stream';
+import { createGunzip } from 'node:zlib';
 const DEFAULT_SPARQL_QUERY = 'SELECT * { ?s ?p ?o } LIMIT 1';
 const DEFAULT_TIMEOUT_MS = 5000;
 const DEFAULT_RETRIES = 2;
+/**
+ * Default soft deadline for finding the first triple when content validation is
+ * on (capped at `timeoutMs`). Two seconds comfortably covers a static file
+ * server's first chunk while keeping the extra wait bounded on a slow,
+ * generate-on-the-fly endpoint.
+ *
+ * Used only when the caller does not supply `rdfValidationBudgetMs`.
+ */
+const DEFAULT_RDF_VALIDATION_BUDGET_MS = 2000;
+/**
+ * Sentinel: the validation budget elapsed before a triple surfaced.
+ *
+ * The budget timer resolves with this symbol and the race outcome is compared
+ * against it by identity, so it can never collide with a failure-reason string
+ * or with `null`.
+ */
+const VALIDATION_TIMED_OUT = Symbol('rdf-validation-timed-out');
+/**
+ * Maximum number of body bytes the data-dump probe reads before it stops and
+ * releases the connection. Reachability needs only that the endpoint answered
+ * with a success status and produced bytes; a large dump must never be
+ * downloaded in full within the probe's timeout budget. 256 KiB comfortably
+ * surfaces the first RDF triple — the signal {@link validateBody} needs — while
+ * bounding the read regardless of the dump's true size, chunked transfer, or
+ * compression. Applied to both the raw read and, for a gzip body, the inflated
+ * output.
+ *
+ * The cap is a threshold checked after each chunk, so the bytes actually held
+ * can overshoot it by up to one chunk.
+ */
+const MAX_PROBE_BODY_BYTES = 256 * 1024;
 /** Base backoff between retries; the nth retry waits `n × base`. */
 const RETRY_BACKOFF_MS = 250;
 /**
@@ -107,7 +128,8 @@ export class DataDumpProbeResult extends ProbeResult {
  *
  * For SPARQL endpoints, issues the configured SPARQL query (default: a
  * minimal `SELECT`). For data dumps, issues `HEAD` (with a `GET` fallback
- * for small or unknown-size bodies).
+ * when `HEAD` is unsuccessful; a body is read, as a bounded prefix only, when
+ * `validateRdfContent` is on for a dump that declares an RDF media type).
  *
  * Returns a pure result object; never throws.
  */
@@ -186,6 +208,9 @@ function resolveOptions(options) {
         retries: retries === undefined || !Number.isInteger(retries)
             ? DEFAULT_RETRIES
             : Math.max(0, retries),
+        validateRdfContent: options?.validateRdfContent ?? false,
+        rdfValidationBudgetMs: options?.rdfValidationBudgetMs ??
+            Math.min(options?.timeoutMs ?? DEFAULT_TIMEOUT_MS, DEFAULT_RDF_VALIDATION_BUDGET_MS),
     };
 }
 /**
@@ -350,30 +375,201 @@ async function probeDataDump(url, distribution, options, authHeaders, start) {
         method: 'HEAD',
         ...requestOptions,
     });
-    const contentLength = headResponse.headers.get('Content-Length');
-    const contentLengthBytes = contentLength ? parseInt(contentLength) : 0;
-    // For small or unknown-size files, do a GET to validate body content.
-    // This also handles servers that incorrectly return 0 Content-Length for HEAD.
-    if (contentLengthBytes <= 10_240) {
-        const getResponse = await fetch(url, {
-            method: 'GET',
-            ...requestOptions,
-        });
-        const body = await getResponse.text();
-        const isHttpSuccess = getResponse.status >= 200 && getResponse.status < 400;
-        const failureReason = isHttpSuccess
-            ? await validateBody(body, getResponse.headers.get('Content-Type'), url, options.timeoutMs)
-            : null;
-        const responseTimeMs = Math.round(performance.now() - start);
-        const result = new DataDumpProbeResult(url, getResponse, responseTimeMs, failureReason);
-        checkContentTypeMismatch(result, distribution);
-        return result;
+    // Validate body content only when asked to and the distribution declares an
+    // RDF media type; otherwise the probe is reachability-only and never reads a
+    // body — which keeps it from forcing a slow, generate-on-the-fly endpoint to
+    // start producing its export.
+    if (options.validateRdfContent &&
+        isDeclaredRdf(distribution) &&
+        isHttpSuccess(headResponse)) {
+        const { response, failureReason } = await validateDumpBody(url, headers, options, headResponse);
+        return finalizeDataDump(url, distribution, response, start, failureReason);
+    }
+    // Reachability only. A successful HEAD is enough; otherwise confirm with a
+    // body-less GET, which rescues servers that reject or do not implement HEAD.
+    if (isHttpSuccess(headResponse)) {
+        return finalizeDataDump(url, distribution, headResponse, start, null);
     }
+    const getResponse = await fetch(url, { method: 'GET', ...requestOptions });
+    await getResponse.body?.cancel();
+    return finalizeDataDump(url, distribution, getResponse, start, null);
+}
+/**
+ * Whether an HTTP response carries a success (2xx/3xx) status.
+ *
+ * @param response Object exposing a numeric `status`.
+ * @returns `true` when the status is at least 200 and below 400; any other
+ * status (1xx, 4xx, 5xx) yields `false`.
+ */
+function isHttpSuccess(response) {
+    return response.status >= 200 && response.status < 400;
+}
+/**
+ * Whether the distribution declares an RDF serialization as its media type.
+ *
+ * The declared `mimeType` is lower-cased and then matched exactly against
+ * `rdfContentTypes`; a distribution without a `mimeType` is never RDF.
+ * NOTE(review): the match is exact, so a declared type that carries parameters
+ * (for example a charset suffix) would not match — confirm whether `mimeType`
+ * is normalised before it reaches this function.
+ *
+ * @param distribution Object whose optional `mimeType` string is inspected.
+ * @returns `true` only when a `mimeType` is present and listed in
+ * `rdfContentTypes`.
+ */
+function isDeclaredRdf(distribution) {
+    const declared = distribution.mimeType?.toLowerCase();
+    return declared !== undefined && rdfContentTypes.includes(declared);
+}
+/**
+ * Build a DataDumpProbeResult and attach any Content-Type-mismatch warning.
+ *
+ * The response time is computed here, as the rounded number of milliseconds
+ * between `start` and the current `performance.now()` reading, so it covers
+ * everything the caller did before finalizing (`start` is presumably an earlier
+ * `performance.now()` value — confirm at the call site).
+ *
+ * @param url Address that was probed.
+ * @param distribution Distribution handed to `checkContentTypeMismatch`.
+ * @param response Response the result is built from.
+ * @param start Timestamp the elapsed time is measured from.
+ * @param failureReason Failure reason passed through to the result, or `null`.
+ * @returns The constructed `DataDumpProbeResult`.
+ */
+function finalizeDataDump(url, distribution, response, start, failureReason) {
     const responseTimeMs = Math.round(performance.now() - start);
-    const result = new DataDumpProbeResult(url, headResponse, responseTimeMs);
+    const result = new DataDumpProbeResult(url, response, responseTimeMs, failureReason);
     checkContentTypeMismatch(result, distribution);
     return result;
 }
+/**
+ * GET the dump and validate that its body carries a triple, but only for as long
+ * as the validation budget allows. Reachability is already settled by the prior
+ * HEAD, so any shortfall — a budget that elapses before a triple, a read error,
+ * a GET that cannot start — yields a `null` failureReason (reachable,
+ * unvalidated), never a failure. Returns the response to draw metadata from
+ * (the GET, or the HEAD when the GET could not start) alongside that reason.
+ *
+ * The budget is `min(options.rdfValidationBudgetMs, options.timeoutMs)`. Its
+ * timer is armed only once the GET has returned headers, so the wait for those
+ * headers is bounded by `options.timeoutMs` alone; the budget covers reading,
+ * decoding and parsing the bounded prefix. The same budget is handed to
+ * `validateBody` as its timeout argument.
+ *
+ * A GET that answers with a non-success status has its body cancelled and is
+ * returned as the response with a `null` failureReason. A conclusive finding —
+ * `Distribution is not valid gzip`, or whatever `validateBody` reports — is
+ * returned as the failureReason.
+ *
+ * @param url Address of the dump to GET.
+ * @param headers Request headers forwarded unchanged to `fetch`.
+ * @param options Resolved probe options; `rdfValidationBudgetMs` and
+ * `timeoutMs` are read.
+ * @param headResponse Response of the preceding HEAD, returned when the GET
+ * rejects.
+ * @returns `{ response, failureReason }`; fetch and read errors are absorbed
+ * into a `null` failureReason rather than thrown.
+ */
+async function validateDumpBody(url, headers, options, headResponse) {
+    const budgetMs = Math.min(options.rdfValidationBudgetMs, options.timeoutMs);
+    // Aborting on budget expiry stops a slow endpoint from streaming on in the
+    // background once we have given up waiting for a triple.
+    const budgetController = new AbortController();
+    let getResponse;
+    try {
+        getResponse = await fetch(url, {
+            method: 'GET',
+            headers,
+            signal: AbortSignal.any([
+                AbortSignal.timeout(options.timeoutMs),
+                budgetController.signal,
+            ]),
+        });
+    }
+    catch {
+        // The GET could not even return headers; the HEAD already proved the
+        // distribution reachable, so report it unvalidated rather than down.
+        return { response: headResponse, failureReason: null };
+    }
+    if (!isHttpSuccess(getResponse)) {
+        await getResponse.body?.cancel();
+        return { response: getResponse, failureReason: null };
+    }
+    const validation = (async () => {
+        const bounded = await readBoundedBody(getResponse, MAX_PROBE_BODY_BYTES);
+        const { text, truncated, corrupt } = await decodeProbeBody(bounded);
+        return corrupt
+            ? 'Distribution is not valid gzip'
+            : await validateBody(text, getResponse.headers.get('Content-Type'), url, budgetMs, truncated);
+    })().catch(() => null);
+    let budgetTimer;
+    const budgetExpiry = new Promise((resolve) => {
+        budgetTimer = setTimeout(() => {
+            budgetController.abort();
+            resolve(VALIDATION_TIMED_OUT);
+        }, budgetMs);
+    });
+    try {
+        const outcome = await Promise.race([validation, budgetExpiry]);
+        return {
+            response: getResponse,
+            failureReason: outcome === VALIDATION_TIMED_OUT ? null : outcome,
+        };
+    }
+    finally {
+        clearTimeout(budgetTimer);
+    }
+}
+/**
+ * Read at most `maxBytes` from a response body, then cancel the stream to free
+ * the underlying connection. Returns the bytes read and whether the body was
+ * longer than the cap (`truncated`), so the caller can tell a complete, small
+ * body — whose emptiness or parse errors are meaningful — from a deliberately
+ * cut-off prefix of a large one, where only the presence of content is
+ * conclusive. This is what keeps the probe from downloading a multi-hundred-MB
+ * streamed dump in full just to confirm it is reachable.
+ *
+ * Chunks are kept whole and the cap is checked after each one, so the returned
+ * bytes can exceed `maxBytes` by up to one chunk, and `truncated` is also set
+ * when a complete body reaches or passes the cap with its final chunk. A
+ * response whose `body` is `null` yields zero bytes and `truncated: false`.
+ *
+ * @param response Response whose `body` stream is read.
+ * @param maxBytes Threshold, in bytes, at which reading stops.
+ * @returns `{ bytes, truncated }`, with `bytes` the concatenation of every
+ * chunk read.
+ */
+async function readBoundedBody(response, maxBytes) {
+    const stream = response.body;
+    if (stream === null) {
+        return { bytes: new Uint8Array(0), truncated: false };
+    }
+    const chunks = [];
+    let total = 0;
+    let truncated = false;
+    // Breaking out of `for await` cancels the stream, which stops any further
+    // download and releases the underlying connection — so a large dump is never
+    // pulled in full once we have the prefix we need.
+    for await (const chunk of stream) {
+        chunks.push(chunk);
+        total += chunk.length;
+        if (total >= maxBytes) {
+            truncated = true;
+            break;
+        }
+    }
+    return { bytes: Buffer.concat(chunks), truncated };
+}
+/**
+ * Decode a bounded body to text for RDF validation, inflating it first when it
+ * is a gzip stream that `fetch` did not transparently decompress — e.g. a `.gz`
+ * data dump served as-is, or one labelled with a non-standard Content-Encoding
+ * (`application/gzip`) that undici does not recognise as a content coding.
+ * Detection is by the gzip magic on the delivered bytes, so a body that `fetch`
+ * already inflated (a standard `Content-Encoding: gzip`) is passed through
+ * untouched. A truncated gzip tail is expected — we only read a prefix — and
+ * inflates cleanly up to the cut, so it is never mistaken for corruption.
+ *
+ * @param bounded `{ bytes, truncated }` as produced by the bounded read.
+ * @returns `{ text, truncated, corrupt }`. `truncated` is set when either the
+ * raw read or the inflated output was cut; `corrupt` is always `false` for a
+ * non-gzip body and otherwise comes from the inflate step.
+ */
+async function decodeProbeBody(bounded) {
+    if (!isGzip(bounded.bytes)) {
+        return {
+            text: decodeUtf8(bounded.bytes),
+            truncated: bounded.truncated,
+            corrupt: false,
+        };
+    }
+    // The compressed body is complete only when the raw read was not itself cut
+    // off: a gzip error on a complete body is genuine corruption, on a prefix we
+    // cut it is just the dropped tail.
+    const inflated = await gunzipPrefix(bounded.bytes, MAX_PROBE_BODY_BYTES, !bounded.truncated);
+    return {
+        text: decodeUtf8(inflated.bytes),
+        truncated: bounded.truncated || inflated.truncated,
+        corrupt: inflated.corrupt,
+    };
+}
+/**
+ * Whether the bytes begin with the gzip magic number (RFC 1952 §2.3.1).
+ *
+ * @param bytes Indexable byte sequence with a `length`.
+ * @returns `true` when at least two bytes are present and the first two are
+ * `0x1f` and `0x8b`; shorter input is never gzip.
+ */
+function isGzip(bytes) {
+    return bytes.length >= 2 && bytes[0] === 0x1f && bytes[1] === 0x8b;
+}
+/**
+ * Decode bytes as UTF-8 without throwing: an incomplete multi-byte sequence at
+ * the truncation boundary is replaced rather than fatal, since the RDF parser
+ * only needs the leading, intact portion to find the first triple.
+ *
+ * @param bytes Raw bytes to decode.
+ * @returns The decoded string; a new non-fatal `TextDecoder` is created for
+ * each call.
+ */
+function decodeUtf8(bytes) {
+    return new TextDecoder('utf-8', { fatal: false }).decode(bytes);
+}
+/**
+ * Inflate up to `maxBytes` of output from a gzip prefix, stopping once the cap
+ * is reached or the input runs out. `inputComplete` says whether the caller
+ * handed us the whole compressed body (true) or a prefix it had already cut
+ * (false). An inflate error therefore means different things: on a complete body
+ * the gzip is genuinely corrupt; on a cut prefix it is just the dropped tail, so
+ * whatever inflated cleanly is reported as a (truncated) partial inflate.
+ *
+ * The returned promise only ever resolves. Outcomes:
+ * - output reached the cap: `truncated: true`, `corrupt: false`;
+ * - inflate error: `truncated: !inputComplete`, `corrupt: inputComplete`;
+ * - clean end of stream: `truncated: false`, `corrupt: false`.
+ * The cap is checked after each output chunk, so `bytes` can exceed `maxBytes`
+ * by up to one chunk.
+ *
+ * @param bytes Compressed input, written to the inflater in one go.
+ * @param maxBytes Threshold on inflated output at which inflating stops.
+ * @param inputComplete Whether `bytes` is the entire compressed body.
+ * @returns Promise of `{ bytes, truncated, corrupt }`, with `bytes` the
+ * inflated output gathered so far.
+ */
+function gunzipPrefix(bytes, maxBytes, inputComplete) {
+    return new Promise((resolve) => {
+        const gunzip = createGunzip();
+        const chunks = [];
+        let total = 0;
+        // `resolve` and `destroy` are both idempotent, so the first outcome wins and
+        // any later event (e.g. a premature-close error emitted by `destroy`) is a
+        // harmless no-op — no `settled` guard needed.
+        function finish(outcome) {
+            gunzip.destroy();
+            resolve({ bytes: Buffer.concat(chunks), ...outcome });
+        }
+        gunzip.on('data', (chunk) => {
+            chunks.push(chunk);
+            total += chunk.length;
+            if (total >= maxBytes) {
+                finish({ truncated: true, corrupt: false });
+            }
+        });
+        gunzip.on('error', () => finish({ truncated: !inputComplete, corrupt: inputComplete }));
+        gunzip.on('end', () => finish({ truncated: false, corrupt: false }));
+        gunzip.end(bytes);
+    });
+}
 // The RDF serializations whose bodies we parse to confirm they carry triples. A
 // non-empty body in one of these formats that yields zero triples — an empty
 // graph such as a JSON-LD `{}`, an `<rdf:RDF/>`, or prefix-only Turtle — is a
@@ -389,9 +585,21 @@ const rdfContentTypes = [
     'application/ld+json',
     'application/rdf+xml',
 ];
-async function validateBody(body, contentType, baseIRI, timeoutMs) {
+// Serializations a streaming parser cannot validate from a truncated prefix.
+// The line/statement-oriented formats (N-Triples, N-Quads, Turtle, TriG, N3) and
+// SAX-based RDF/XML all yield their first triple from the opening chunk, but
+// JSON-LD is a single JSON value whose parser emits nothing until the whole
+// document closes — a truncated JSON-LD body parses to an ‘unclosed document’
+// error, never a triple. So a truncated body in one of these can only be
+// validated if it happened to fit the read cap in full; beyond that it is
+// inconclusive, and we must not download it in full to find out.
+const nonStreamableRdfContentTypes = ['application/ld+json'];
+async function validateBody(body, contentType, baseIRI, timeoutMs, truncated) {
     if (body.length === 0) {
-        return 'Distribution is empty';
+        // A complete, empty body is a faulty distribution; an empty *prefix* (a
+        // truncated read that yielded no bytes, e.g. a corrupt gzip header) is
+        // inconclusive — the endpoint answered, we just could not validate content.
+        return truncated ? null : 'Distribution is empty';
     }
     // Media types are case-insensitive (RFC 9110 §8.3.1), so normalise before
     // matching the lower-case allow-list — a server sending `Application/LD+JSON`
@@ -400,7 +608,13 @@ async function validateBody(body, contentType, baseIRI, timeoutMs) {
     if (!serialization || !rdfContentTypes.includes(serialization)) {
         return null;
     }
-    const outcome = await classifyRdfBody(body, serialization, baseIRI, timeoutMs);
+    if (truncated && nonStreamableRdfContentTypes.includes(serialization)) {
+        // A bounded prefix of a non-streamable serialization (JSON-LD) can never
+        // yield a triple, so skip the doomed parse and report it inconclusive — only
+        // a complete document, small enough to fit the read cap, can be validated.
+        return null;
+    }
+    const outcome = await classifyRdfBody(body, serialization, baseIRI, timeoutMs, truncated);
     switch (outcome.type) {
         case 'empty':
             return 'Distribution contains no RDF triples';
@@ -422,8 +636,13 @@ async function validateBody(body, contentType, baseIRI, timeoutMs) {
  * on expiry — and likewise when a remote `@context` is unreachable — the outcome
  * is 'inconclusive', so a valid distribution is never flagged faulty for a
  * context host's failure. `baseIRI` resolves any relative IRIs in the document.
+ *
+ * When `truncated` is true the body is only a bounded prefix of a larger one, so
+ * only finding a triple ('hasTriples') is conclusive: a parse error at the cut
+ * or a clean end with no triple yet means we did not read far enough, not that
+ * the distribution is empty or malformed, and is reported as 'inconclusive'.
  */
-function classifyRdfBody(body, contentType, baseIRI, timeoutMs) {
+function classifyRdfBody(body, contentType, baseIRI, timeoutMs, truncated) {
     return new Promise((resolve) => {
         const quads = rdfParser.parse(Readable.from([body]), {
             contentType,
@@ -441,10 +660,10 @@ function classifyRdfBody(body, contentType, baseIRI, timeoutMs) {
         }
         quads
             .on('data', () => settle({ type: 'hasTriples' }))
-            .on('error', (error) => settle(isRemoteContextError(error)
+            .on('error', (error) => settle(truncated || isRemoteContextError(error)
             ? { type: 'inconclusive' }
             : { type: 'parseError', message: error.message }))
-            .on('end', () => settle({ type: 'empty' }));
+            .on('end', () => settle(truncated ? { type: 'inconclusive' } : { type: 'empty' }));
     });
 }
 /**

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@lde/distribution-probe",
-  "version": "0.1.13",
+  "version": "0.2.0",
   "repository": {
     "url": "git+https://github.com/ldelements/lde.git",
     "directory": "packages/distribution-probe"