@lde/distribution-probe 0.2.0 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +21 -0
- package/dist/index.d.ts +1 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +1 -1
- package/dist/probe.d.ts +43 -0
- package/dist/probe.d.ts.map +1 -1
- package/dist/probe.js +103 -0
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -48,3 +48,24 @@ When on, the probe `GET`s the dump — **regardless of size** — and reads only
|
|
|
48
48
|
A thrown exception from `fetch` (DNS failure, connection refused, socket reset, TLS error, timeout after the configured `timeoutMs` – default 5 000 ms) is a connection-level failure. The probe retries these up to `retries` times (default 2) with a short backoff before giving up and returning a `NetworkError`. This turns a transient transport blip into a reliable single measurement without looking backward across checks. A genuine outage still resolves to a `NetworkError` on the current check – every attempt fails – but note each attempt gets its own `timeoutMs`, so an endpoint that fails only by timing out takes up to `(retries + 1) × timeoutMs` (plus backoff) to be reported down. HTTP error responses (4xx/5xx) and content-validation failures are real ‘down’ states and are **never** retried.
|
|
49
49
|
|
|
50
50
|
`NetworkError.message` includes the underlying `error.cause` (e.g. `ECONNRESET`, `UND_ERR_SOCKET “other side closed”`) when Node wraps one, so observations record what actually failed rather than a bare ‘fetch failed’.
|
|
51
|
+
|
|
52
|
+
## Probing many distributions
|
|
53
|
+
|
|
54
|
+
`probeMany` probes an array of distributions concurrently and returns one result per input, in input order. Each distribution is probed once with `probe`, so every behaviour above applies per distribution; like `probe`, `probeMany` never throws – a probe that fails is reported as a `NetworkError` in its slot.
|
|
55
|
+
|
|
56
|
+
```ts
|
|
57
|
+
import { probeMany } from '@lde/distribution-probe';
|
|
58
|
+
|
|
59
|
+
const results = await probeMany(distributions, {
|
|
60
|
+
concurrency: 20, // max probes in flight across all hosts (default 20)
|
|
61
|
+
perHostConcurrency: 4, // max probes in flight against one host (default 4)
|
|
62
|
+
validateRdfContent: true, // any ProbeOptions are forwarded to each probe
|
|
63
|
+
});
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
Two caps bound the batch:
|
|
67
|
+
|
|
68
|
+
- **`concurrency`** bounds the total fan-out, so a large catalogue does not exhaust sockets or buffer too many response bodies at once.
|
|
69
|
+
- **`perHostConcurrency`** bounds the burst any one server sees, keeping the batch a polite client: a catalogue that declares many distributions on a single host (e.g. a download endpoint per named graph) will not trip that server’s rate limiter (HTTP 429). Distributions sharing a host (by `accessUrl`) contend for the same budget; a probe whose host is saturated waits while probes for other hosts proceed, so one busy host never idles the global pool.
|
|
70
|
+
|
|
71
|
+
All other `ProbeOptions` (`timeoutMs`, `retries`, `validateRdfContent`, and the rest) are forwarded unchanged to every probe.
|
package/dist/index.d.ts
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
export { probe, NetworkError, SparqlProbeResult, DataDumpProbeResult, type ProbeOptions, type ProbeResultType, } from './probe.js';
|
|
1
|
+
export { probe, probeMany, NetworkError, SparqlProbeResult, DataDumpProbeResult, type ProbeOptions, type ProbeManyOptions, type ProbeResultType, } from './probe.js';
|
|
2
2
|
//# sourceMappingURL=index.d.ts.map
|
package/dist/index.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EACL,KAAK,EACL,YAAY,EACZ,iBAAiB,EACjB,mBAAmB,EACnB,KAAK,YAAY,EACjB,KAAK,eAAe,GACrB,MAAM,YAAY,CAAC"}
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EACL,KAAK,EACL,SAAS,EACT,YAAY,EACZ,iBAAiB,EACjB,mBAAmB,EACnB,KAAK,YAAY,EACjB,KAAK,gBAAgB,EACrB,KAAK,eAAe,GACrB,MAAM,YAAY,CAAC"}
|
package/dist/index.js
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
export { probe, NetworkError, SparqlProbeResult, DataDumpProbeResult, } from './probe.js';
|
|
1
|
+
export { probe, probeMany, NetworkError, SparqlProbeResult, DataDumpProbeResult, } from './probe.js';
|
package/dist/probe.d.ts
CHANGED
|
@@ -52,6 +52,35 @@ export interface ProbeOptions {
|
|
|
52
52
|
*/
|
|
53
53
|
rdfValidationBudgetMs?: number;
|
|
54
54
|
}
|
|
55
|
+
/**
|
|
56
|
+
* Options for {@link probeMany}: the per-probe {@link ProbeOptions} plus the
|
|
57
|
+
* concurrency budgets that bound the batch.
|
|
58
|
+
*/
|
|
59
|
+
export interface ProbeManyOptions extends ProbeOptions {
|
|
60
|
+
/**
|
|
61
|
+
* Maximum number of probes to run at once across all hosts. Bounds the batch’s
|
|
62
|
+
* total fan-out so a large catalogue does not exhaust sockets or buffer too many
|
|
63
|
+
* response bodies at once. Default 20.
|
|
64
|
+
*/
|
|
65
|
+
concurrency?: number;
|
|
66
|
+
/**
|
|
67
|
+
* Maximum number of probes to run at once against a single host. Bounds the
|
|
68
|
+
* burst any one server sees, so a catalogue that declares many distributions on
|
|
69
|
+
* one host (e.g. a download endpoint per named graph) does not trip its rate
|
|
70
|
+
* limiter (HTTP 429). A probe whose host is at this cap waits while probes for
|
|
71
|
+
* other hosts proceed, so this never idles the global pool. Default 4.
|
|
72
|
+
*/
|
|
73
|
+
perHostConcurrency?: number;
|
|
74
|
+
/**
|
|
75
|
+
* Called once after each probe settles, with the number of probes completed so
|
|
76
|
+
* far and the total to run (`distributions.length`). Lets a caller drive a
|
|
77
|
+
* determinate progress indicator while a large batch runs. Fires `total` times,
|
|
78
|
+
* ending at `(total, total)`; the order reflects completion, not input order.
|
|
79
|
+
* Never called for an empty batch. A throwing callback rejects the batch, so
|
|
80
|
+
* keep it cheap and side-effect-only.
|
|
81
|
+
*/
|
|
82
|
+
onProgress?: (completed: number, total: number) => void;
|
|
83
|
+
}
|
|
55
84
|
/**
|
|
56
85
|
* Result of a network error during probing.
|
|
57
86
|
*/
|
|
@@ -109,5 +138,19 @@ export type ProbeResultType = SparqlProbeResult | DataDumpProbeResult | NetworkE
|
|
|
109
138
|
* Returns a pure result object; never throws.
|
|
110
139
|
*/
|
|
111
140
|
export declare function probe(distribution: Distribution, options?: ProbeOptions): Promise<ProbeResultType>;
|
|
141
|
+
/**
|
|
142
|
+
* Probe many distributions concurrently, bounded by a global cap and a per-host
|
|
143
|
+
* cap, returning one result per input in input order. Like {@link probe}, this
|
|
144
|
+
* never throws: a probe that somehow fails is reported as a {@link NetworkError}
|
|
145
|
+
* in its slot.
|
|
146
|
+
*
|
|
147
|
+
* The per-host cap keeps the batch a polite client. Distributions sharing a host
|
|
148
|
+
* (by {@link Distribution.accessUrl}) contend for the same budget, so no single
|
|
149
|
+
* server is hit by the full global pool at once — the burst that trips a rate
|
|
150
|
+
* limiter (HTTP 429). When the next queued probe’s host is saturated it is
|
|
151
|
+
* skipped in favour of a later probe on a different host, so one busy host never
|
|
152
|
+
* idles the global pool (no head-of-line blocking).
|
|
153
|
+
*/
|
|
154
|
+
export declare function probeMany(distributions: readonly Distribution[], options?: ProbeManyOptions): Promise<ProbeResultType[]>;
|
|
112
155
|
export {};
|
|
113
156
|
//# sourceMappingURL=probe.d.ts.map
|
package/dist/probe.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"probe.d.ts","sourceRoot":"","sources":["../src/probe.ts"],"names":[],"mappings":"AAAA,OAAO,EAAyB,YAAY,EAAE,MAAM,cAAc,CAAC;AAKnE;;GAEG;AACH,MAAM,WAAW,YAAY;IAC3B,0DAA0D;IAC1D,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB;;;OAGG;IACH,OAAO,CAAC,EAAE,OAAO,CAAC;IAClB;;;;;OAKG;IACH,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB;;;;;;;;;OASG;IACH,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB;;;;;;;;;;OAUG;IACH,kBAAkB,CAAC,EAAE,OAAO,CAAC;IAC7B;;;;;;;;;OASG;IACH,qBAAqB,CAAC,EAAE,MAAM,CAAC;CAChC;
|
|
1
|
+
{"version":3,"file":"probe.d.ts","sourceRoot":"","sources":["../src/probe.ts"],"names":[],"mappings":"AAAA,OAAO,EAAyB,YAAY,EAAE,MAAM,cAAc,CAAC;AAKnE;;GAEG;AACH,MAAM,WAAW,YAAY;IAC3B,0DAA0D;IAC1D,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB;;;OAGG;IACH,OAAO,CAAC,EAAE,OAAO,CAAC;IAClB;;;;;OAKG;IACH,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB;;;;;;;;;OASG;IACH,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB;;;;;;;;;;OAUG;IACH,kBAAkB,CAAC,EAAE,OAAO,CAAC;IAC7B;;;;;;;;;OASG;IACH,qBAAqB,CAAC,EAAE,MAAM,CAAC;CAChC;AAED;;;GAGG;AACH,MAAM,WAAW,gBAAiB,SAAQ,YAAY;IACpD;;;;OAIG;IACH,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB;;;;;;OAMG;IACH,kBAAkB,CAAC,EAAE,MAAM,CAAC;IAC5B;;;;;;;OAOG;IACH,UAAU,CAAC,EAAE,CAAC,SAAS,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,KAAK,IAAI,CAAC;CACzD;AAkCD;;GAEG;AACH,qBAAa,YAAY;aAEL,GAAG,EAAE,MAAM;aACX,OAAO,EAAE,MAAM;aACf,cAAc,EAAE,MAAM;gBAFtB,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,MAAM,EACf,cAAc,EAAE,MAAM;CAEzC;AAED;;GAEG;AACH,uBAAe,WAAW;aAUN,GAAG,EAAE,MAAM;IAT7B,SAAgB,UAAU,EAAE,MAAM,CAAC;IACnC,SAAgB,UAAU,EAAE,MAAM,CAAC;IACnC,SAAgB,YAAY,EAAE,IAAI,GAAG,IAAI,CAAQ;IACjD,SAAgB,WAAW,EAAE,MAAM,GAAG,IAAI,CAAC;IAC3C,SAAgB,aAAa,EAAE,MAAM,GAAG,IAAI,CAAC;IAC7C,SAAgB,QAAQ,EAAE,MAAM,EAAE,CAAM;IACxC,SAAgB,cAAc,EAAE,MAAM,CAAC;gBAGrB,GAAG,EAAE,MAAM,EAC3B,QAAQ,EAAE,QAAQ,EAClB,cAAc,EAAE,MAAM,EACtB,aAAa,GAAE,MAAM,GAAG,IAAW;IAa9B,SAAS,IAAI,OAAO;CAO5B;AAqBD;;GAEG;AACH,qBAAa,iBAAkB,SAAQ,WAAW;IAChD;;;;;OAKG;IACH,SAAgB,oBAAoB,EAAE,SAAS,MAAM,EAAE,CAAC;gBAGtD,GAAG,EAAE,MAAM,EACX,QAAQ,EAAE,QAAQ,EAClB,cAAc,EAAE,MAAM,EACtB,oBAAoB,EAAE,MAAM,GAAG,SAAS,MAAM,EAAE,EAChD,aAAa,GAAE,MAAM,GAAG,IAAW;IAS5B,SAAS,IAAI,OAAO;CAQ9B;AAED;;GAEG;AACH,qBAAa,mBAAoB,SAAQ,WAAW;IAClD,SAAgB,WAAW,EAAE,MAAM,GAAG,IAAI,CAAQ;gBAGhD,GAAG,EAAE,MAAM,EACX,QAAQ,EAAE,QAAQ,EAClB,cAAc,EAAE,MAAM,EACtB,aAAa,GAAE,MAAM,GAAG,IAAW;CAQtC;AAED,MAAM,MAAM,eAAe,GACvB,iBAAiB,GACjB,mBAAmB,GACnB,YAAY,CAAC;AAIjB;;;;;;;;;GASG;AACH,wBAAsB,KAAK,CACzB,YAAY,EAAE,YAAY,EAC1B,OAAO,CAAC,EAAE,YAAY,GACrB,OAAO,CAAC,eAAe,CAAC,CAqD1B;AAED;;;;;;;;;;;;GAYG;AACH,wBAAsB,SAAS,CAC7B,aAAa,EAAE,SAAS,YAAY,EAAE,EACtC,OAAO,CAAC,EAAE,gBAAgB
,GACzB,OAAO,CAAC,eAAe,EAAE,CAAC,CAuC5B"}
|
package/dist/probe.js
CHANGED
|
@@ -5,6 +5,8 @@ import { createGunzip } from 'node:zlib';
|
|
|
5
5
|
const DEFAULT_SPARQL_QUERY = 'SELECT * { ?s ?p ?o } LIMIT 1';
|
|
6
6
|
const DEFAULT_TIMEOUT_MS = 5000;
|
|
7
7
|
const DEFAULT_RETRIES = 2;
|
|
8
|
+
const DEFAULT_PROBE_CONCURRENCY = 20;
|
|
9
|
+
const DEFAULT_PROBE_PER_HOST_CONCURRENCY = 4;
|
|
8
10
|
/**
|
|
9
11
|
* Default soft deadline for finding the first triple when content validation is
|
|
10
12
|
* on (capped at `timeoutMs`). Two seconds comfortably covers a static file
|
|
@@ -169,6 +171,107 @@ export async function probe(distribution, options) {
|
|
|
169
171
|
// real cost of a down endpoint.
|
|
170
172
|
return new NetworkError(url, describeNetworkError(lastError), Math.round(performance.now() - overallStart));
|
|
171
173
|
}
|
|
174
|
+
/**
 * Probe a batch of distributions concurrently and resolve to one result per
 * input, in input order.
 *
 * Two budgets bound the batch: `options.concurrency` caps the probes in flight
 * overall and `options.perHostConcurrency` caps the probes in flight against
 * one host. Either budget falls back to its default when it is not a positive
 * integer. The whole `options` object is also handed to every `probe` call.
 *
 * `options.onProgress`, when given, is invoked after each probe settles with
 * the running completion count and the batch size. It runs inside the
 * scheduled task, so an exception thrown by the callback makes that task
 * reject; keep the callback cheap and non-throwing.
 *
 * @param distributions Distributions to probe; each must expose `accessUrl`.
 * @param options Per-probe options plus the batch budgets and `onProgress`.
 * @returns Promise of the probe results, indexed like `distributions`.
 */
export async function probeMany(distributions, options) {
    // Invalid budgets (zero, negative, fractional, NaN, undefined) are replaced
    // by the defaults instead of being trusted, as a non-positive cap would
    // keep the scheduler from ever starting a task.
    const maxInFlight = positiveIntOrDefault(options?.concurrency, DEFAULT_PROBE_CONCURRENCY);
    const maxInFlightPerHost = positiveIntOrDefault(options?.perHostConcurrency, DEFAULT_PROBE_PER_HOST_CONCURRENCY);
    // One budget key per distribution: the URL's host, or the full href when
    // the host is empty (authority-less URLs such as urn: or file:), so that
    // unrelated host-less URLs never share a budget.
    const budgetKeys = distributions.map(({ accessUrl }) => accessUrl.host || accessUrl.href);
    const reportProgress = options?.onProgress;
    const batchSize = distributions.length;
    // Probes finish out of order, so completions are counted here rather than
    // derived from a result's position.
    let finished = 0;
    const probeAndReport = async (distribution) => {
        const outcome = await probe(distribution, options);
        finished++;
        if (reportProgress !== undefined && reportProgress !== null) {
            reportProgress(finished, batchSize);
        }
        return outcome;
    };
    return mapHostLimited(distributions, budgetKeys, maxInFlight, maxInFlightPerHost, probeAndReport);
}
|
|
212
|
+
/**
 * Normalise an optional concurrency budget.
 *
 * @param value Caller-supplied budget; may be undefined or otherwise invalid.
 * @param fallback Value to use when `value` is not a positive integer.
 * @returns `value` when it is an integer of at least 1, otherwise `fallback`
 *   (covers undefined, zero, negatives, fractions, NaN and non-numbers).
 */
function positiveIntOrDefault(value, fallback) {
    // Number.isInteger is false for undefined, NaN, fractions and non-numbers,
    // so a separate undefined check is not needed.
    if (!Number.isInteger(value) || value < 1) {
        return fallback;
    }
    return value;
}
|
|
222
|
+
/**
 * Run `task` over `items` under two concurrency caps — a global cap and a
 * per-host cap keyed by `hostKeys[index]` — resolving to the results in input
 * order.
 *
 * When the next queued item's host is at the per-host cap it is skipped in
 * favour of a later item on a different host, so a saturated host never idles
 * the global pool (no head-of-line blocking). A skipped host always has a task
 * in flight, and every completion re-runs the scheduler, so the queue drains.
 *
 * If a task rejects or throws synchronously, the returned promise rejects with
 * that error and no further tasks are started. Tasks already in flight are left
 * to finish and their results are discarded.
 *
 * @param items Inputs to process.
 * @param hostKeys Budget key for each item; `hostKeys[i]` belongs to `items[i]`.
 * @param globalLimit Maximum tasks in flight overall (positive integer).
 * @param perHostLimit Maximum tasks in flight per host key (positive integer).
 * @param task Function invoked with one item; its (awaited) return value is
 *   stored at the item's index.
 * @returns Promise of an array of task results, indexed like `items`.
 */
function mapHostLimited(items, hostKeys, globalLimit, perHostLimit, task) {
    const results = new Array(items.length);
    const perHostInFlight = new Map();
    // Indices still waiting to start, kept in input order.
    const pending = items.map((_unused, index) => index);
    let globalInFlight = 0;
    let settledCount = 0;
    let failed = false;
    const adjustHost = (host, delta) => {
        perHostInFlight.set(host, (perHostInFlight.get(host) ?? 0) + delta);
    };
    // The async wrapper still starts the task synchronously, but turns a
    // synchronous throw into a rejection so every failure takes the same path.
    const startTask = async (index) => task(items[index]);
    return new Promise((resolve, reject) => {
        const fail = (error) => {
            // Stop launching work once the batch is known to have failed.
            failed = true;
            reject(error);
        };
        const schedule = () => {
            let cursor = 0;
            while (!failed && cursor < pending.length && globalInFlight < globalLimit) {
                const index = pending[cursor];
                const host = hostKeys[index];
                if ((perHostInFlight.get(host) ?? 0) >= perHostLimit) {
                    cursor++; // Host saturated; leave it queued and try a later, different host.
                    continue;
                }
                pending.splice(cursor, 1);
                globalInFlight++;
                adjustHost(host, 1);
                // pending[cursor] now holds the next queued item; do not advance cursor.
                startTask(index)
                    .then((result) => {
                        results[index] = result;
                        globalInFlight--;
                        adjustHost(host, -1);
                        settledCount++;
                        if (settledCount === items.length) {
                            resolve(results);
                        }
                        else {
                            schedule();
                        }
                    })
                    // Without this handler a rejecting task would leave the
                    // batch pending forever and raise an unhandled rejection.
                    .catch(fail);
            }
        };
        schedule();
        // Resolve immediately when there is nothing to settle (empty input); a
        // non-empty run resolves via the task completion above.
        if (settledCount === items.length) {
            resolve(results);
        }
    });
}
|
|
172
275
|
/**
 * Wait for a number of milliseconds.
 *
 * @param milliseconds Duration passed to `setTimeout`.
 * @returns Promise that resolves (with no value) once the timer fires.
 */
function delay(milliseconds) {
    return new Promise((wake) => {
        setTimeout(wake, milliseconds);
    });
}
|