muaddib-scanner 2.10.81 → 2.10.82
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -17,7 +17,7 @@ const os = require('os');
|
|
|
17
17
|
|
|
18
18
|
const MIN_CONCURRENCY = 4;
|
|
19
19
|
const BASE_CONCURRENCY = Math.max(MIN_CONCURRENCY, parseInt(process.env.MUADDIB_SCAN_CONCURRENCY, 10) || 8);
|
|
20
|
-
const MAX_CONCURRENCY = Math.max(BASE_CONCURRENCY, parseInt(process.env.MUADDIB_MAX_CONCURRENCY, 10) ||
|
|
20
|
+
const MAX_CONCURRENCY = Math.max(BASE_CONCURRENCY, parseInt(process.env.MUADDIB_MAX_CONCURRENCY, 10) || 16);
|
|
21
21
|
const ADJUST_INTERVAL_MS = 30_000;
|
|
22
22
|
|
|
23
23
|
// Queue depth thresholds
|
|
@@ -37,6 +37,12 @@ const TIMEOUT_RATE_MIN_SAMPLES = 20;
|
|
|
37
37
|
let _prevScanned = 0;
|
|
38
38
|
let _prevTimeouts = 0;
|
|
39
39
|
|
|
40
|
+
// Throughput plateau detection: if we scaled up but throughput didn't increase,
|
|
41
|
+
// we've hit I/O saturation (npm registry rate limiting, disk contention).
|
|
42
|
+
// More workers would make it worse — scale back instead.
|
|
43
|
+
let _prevThroughput = 0;
|
|
44
|
+
let _lastScaleDirection = 0; // +1 = scaled up, -1 = scaled down, 0 = stable
|
|
45
|
+
|
|
40
46
|
/**
|
|
41
47
|
* Compute new target concurrency from system signals.
|
|
42
48
|
* Uses stats deltas (not cumulative) for timeout rate — avoids stale data.
|
|
@@ -74,21 +80,39 @@ function computeTarget(current, queueDepth, stats) {
|
|
|
74
80
|
// Priority 2: High timeout rate — system saturated, adding workers makes it worse
|
|
75
81
|
if (timeoutRate > TIMEOUT_RATE_THRESHOLD) {
|
|
76
82
|
const target = clamp(current - 2);
|
|
83
|
+
_prevThroughput = scannedDelta;
|
|
84
|
+
_lastScaleDirection = target < current ? -1 : 0;
|
|
77
85
|
return { target, reason: `high_timeout_rate (${(timeoutRate * 100).toFixed(0)}%, ${timeoutDelta}/${scannedDelta})` };
|
|
78
86
|
}
|
|
79
87
|
|
|
80
|
-
// Priority 3:
|
|
88
|
+
// Priority 3: Throughput plateau — scaled up last tick but throughput flat/down.
|
|
89
|
+
// This catches I/O saturation: more workers = more concurrent HTTP to npm registry
|
|
90
|
+
// = rate limiting + contention = scan times 10s→90s = throughput drops.
|
|
91
|
+
// Scale back instead of continuing to add workers.
|
|
92
|
+
if (_lastScaleDirection > 0 && _prevThroughput > 0 && scannedDelta > 0 && scannedDelta <= _prevThroughput) {
|
|
93
|
+
const prevTp = _prevThroughput;
|
|
94
|
+
_prevThroughput = scannedDelta;
|
|
95
|
+
_lastScaleDirection = -1;
|
|
96
|
+
return { target: clamp(current - 2), reason: `throughput_plateau (${prevTp}→${scannedDelta} scans/30s, more workers didn't help)` };
|
|
97
|
+
}
|
|
98
|
+
|
|
99
|
+
// Priority 4: Queue depth — scale up for backlog, down toward base when idle
|
|
81
100
|
if (queueDepth > QUEUE_BACKLOG_THRESHOLD) {
|
|
82
101
|
const target = clamp(current + 4);
|
|
102
|
+
// Record throughput at the point of scale-up — next tick compares against this
|
|
103
|
+
_prevThroughput = scannedDelta;
|
|
104
|
+
_lastScaleDirection = target > current ? 1 : 0;
|
|
83
105
|
return { target, reason: `backlog (queue=${queueDepth})` };
|
|
84
106
|
}
|
|
85
107
|
|
|
86
108
|
if (queueDepth < QUEUE_IDLE_THRESHOLD) {
|
|
87
109
|
// Converge toward BASE, not MIN — normal traffic needs BASE capacity
|
|
88
110
|
const target = Math.max(BASE_CONCURRENCY, clamp(current - 2));
|
|
111
|
+
_lastScaleDirection = target < current ? -1 : 0;
|
|
89
112
|
return { target, reason: `idle (queue=${queueDepth})` };
|
|
90
113
|
}
|
|
91
114
|
|
|
115
|
+
_lastScaleDirection = 0;
|
|
92
116
|
return { target: current, reason: 'stable' };
|
|
93
117
|
}
|
|
94
118
|
|
|
@@ -102,6 +126,8 @@ function clamp(n) {
|
|
|
102
126
|
/**
 * Reset the module-level bookkeeping that carries state from one
 * evaluation to the next back to its initial zero values.
 */
function resetDeltas() {
  // Cumulative counters remembered from the previous tick.
  _prevScanned = _prevTimeouts = 0;

  // Plateau-detection state: last recorded throughput and last scale direction (0 = stable).
  _prevThroughput = 0;
  _lastScaleDirection = 0;
}
|
|
106
132
|
|
|
107
133
|
module.exports = {
|
package/src/monitor/ingestion.js
CHANGED
|
@@ -643,13 +643,20 @@ async function pollPyPI(state, scanQueue) {
|
|
|
643
643
|
* @param {Array} scanQueue - Mutable scan queue array
|
|
644
644
|
* @param {Object} stats - Mutable stats object
|
|
645
645
|
*/
|
|
646
|
+
const SOFT_BACKPRESSURE_THRESHOLD = 10_000;
|
|
647
|
+
|
|
646
648
|
async function poll(state, scanQueue, stats) {
|
|
647
|
-
//
|
|
648
|
-
//
|
|
649
|
-
//
|
|
650
|
-
//
|
|
649
|
+
// Soft backpressure: skip poll when queue is very deep.
|
|
650
|
+
// Safe because: CouchDB seq is NOT advanced (stays in memory only, persisted
|
|
651
|
+
// by daemon.js AFTER poll returns) — next poll resumes from the same point.
|
|
652
|
+
// Combined with adaptive concurrency: workers scale up → queue drains → poll resumes.
|
|
653
|
+
// This prevents the queue from growing to 30-40K during catch-up (OOM risk).
|
|
654
|
+
if (scanQueue.length >= SOFT_BACKPRESSURE_THRESHOLD) {
|
|
655
|
+
console.log(`[MONITOR] BACKPRESSURE: skipping poll (queue ${scanQueue.length} >= ${SOFT_BACKPRESSURE_THRESHOLD}) — seq not advanced, 0 packages lost`);
|
|
656
|
+
return;
|
|
657
|
+
}
|
|
651
658
|
if (scanQueue.length > 5_000) {
|
|
652
|
-
console.log(`[MONITOR] QUEUE_DEPTH: ${scanQueue.length} items — polling continues
|
|
659
|
+
console.log(`[MONITOR] QUEUE_DEPTH: ${scanQueue.length} items — polling continues`);
|
|
653
660
|
}
|
|
654
661
|
|
|
655
662
|
const timestamp = new Date().toISOString().slice(0, 19).replace('T', ' ');
|
package/src/shared/download.js
CHANGED
|
@@ -171,6 +171,12 @@ function downloadToFile(url, destPath, timeoutMs = DOWNLOAD_TIMEOUT) {
|
|
|
171
171
|
}
|
|
172
172
|
return doRequest(absoluteLocation, redirectCount + 1);
|
|
173
173
|
}
|
|
174
|
+
if (res.statusCode === 429) {
|
|
175
|
+
res.resume();
|
|
176
|
+
// Signal rate limiter to back off — drains tokens, forces ~1s pause
|
|
177
|
+
try { require('./http-limiter.js').signal429(); } catch {}
|
|
178
|
+
return reject(new Error(`HTTP 429 rate limited for ${requestUrl}`));
|
|
179
|
+
}
|
|
174
180
|
if (res.statusCode < 200 || res.statusCode >= 300) {
|
|
175
181
|
res.resume();
|
|
176
182
|
return reject(new Error(`HTTP ${res.statusCode} for ${requestUrl}`));
|
|
package/src/shared/http-limiter.js
CHANGED
|
@@ -1,29 +1,35 @@
|
|
|
1
1
|
'use strict';
|
|
2
2
|
|
|
3
3
|
/**
|
|
4
|
-
* Centralized HTTP concurrency limiter for npm registry requests.
|
|
4
|
+
* Centralized HTTP concurrency + rate limiter for npm registry requests.
|
|
5
5
|
*
|
|
6
|
-
*
|
|
7
|
-
*
|
|
8
|
-
*
|
|
9
|
-
* so that no more than REGISTRY_SEMAPHORE_MAX requests are in-flight at once.
|
|
6
|
+
* Two layers of protection:
|
|
7
|
+
* 1. Concurrency semaphore (REGISTRY_SEMAPHORE_MAX = 10) — caps in-flight requests
|
|
8
|
+
* 2. Rate limiter (RATE_LIMIT_PER_SEC = 30) — caps requests/second via token bucket
|
|
10
9
|
*
|
|
11
|
-
*
|
|
12
|
-
*
|
|
10
|
+
* Without rate limiting, 10 concurrent slots × fast-completing requests = 100+ req/s
|
|
11
|
+
* bursts that trigger npm 429 responses → exponential backoff → scan times 10s→90s.
|
|
12
|
+
*
|
|
13
|
+
* Consumers: queue.js (downloadToFile), temporal-analysis.js, npm-registry.js.
|
|
13
14
|
* NOT covered: api.npmjs.org (different server), replicate.npmjs.com (CouchDB changes stream).
|
|
14
15
|
*/
|
|
15
16
|
|
|
16
17
|
const REGISTRY_SEMAPHORE_MAX = 10;
|
|
18
|
+
const RATE_LIMIT_PER_SEC = 30;
|
|
19
|
+
|
|
20
|
+
// --- Concurrency semaphore ---
|
|
17
21
|
|
|
18
22
|
const _semaphore = { active: 0, queue: [] };
|
|
19
23
|
|
|
20
24
|
/**
 * Reserve one of the REGISTRY_SEMAPHORE_MAX in-flight request slots, then
 * take a rate-limit token.
 *
 * When every slot is busy, a callback is parked on the semaphore queue; once
 * that callback is invoked it acquires a rate token and only then settles the
 * promise handed back to the caller.
 *
 * @returns {Promise<void>} Resolves when the caller holds a slot and a rate token.
 */
function acquireRegistrySlot() {
  const slotAvailable = _semaphore.active < REGISTRY_SEMAPHORE_MAX;

  if (!slotAvailable) {
    // No free slot: wait in line. The parked callback still has to pass the rate limiter.
    return new Promise((grant) => {
      const onSlotHandedOver = () => {
        _acquireRateToken().then(grant);
      };
      _semaphore.queue.push(onSlotHandedOver);
    });
  }

  // Free slot: claim it right away, then wait for a rate token.
  _semaphore.active++;
  return _acquireRateToken();
}
|
|
29
35
|
|
|
@@ -36,9 +42,64 @@ function releaseRegistrySlot() {
|
|
|
36
42
|
}
|
|
37
43
|
}
|
|
38
44
|
|
|
45
|
+
// --- Token bucket rate limiter ---
// The bucket holds at most RATE_LIMIT_PER_SEC tokens and is topped up (capped at
// RATE_LIMIT_PER_SEC) once at least one second has elapsed since the previous
// refill. Each request consumes 1 token. If no tokens are available, the caller
// waits until a refill actually provides one.

let _tokens = RATE_LIMIT_PER_SEC;
let _lastRefill = Date.now();

/**
 * Top up the bucket when at least 1000ms have passed since the last refill.
 * Does nothing otherwise.
 */
function _refillTokens() {
  const now = Date.now();
  const elapsed = now - _lastRefill;
  if (elapsed >= 1000) {
    _tokens = Math.min(RATE_LIMIT_PER_SEC, _tokens + Math.floor(elapsed / 1000) * RATE_LIMIT_PER_SEC);
    _lastRefill = now;
  }
}

/**
 * Milliseconds until the next refill is due, floored at 10ms so a retry
 * timer never spins.
 * @returns {number}
 */
function _msUntilRefill() {
  return Math.max(10, 1000 - (Date.now() - _lastRefill));
}

/**
 * Take one rate-limit token, waiting for a refill when the bucket is empty.
 *
 * A waiter is released only after it has consumed a token. If its timer fires
 * before a refill happened (for example because signal429() restarted the
 * window while it was sleeping), it re-arms instead of slipping through.
 *
 * @returns {Promise<void>} Resolves once a token has been consumed.
 */
function _acquireRateToken() {
  _refillTokens();
  if (_tokens > 0) {
    _tokens--;
    return Promise.resolve();
  }
  return new Promise(resolve => {
    const retry = () => {
      _refillTokens();
      if (_tokens > 0) {
        _tokens--;
        resolve();
        return;
      }
      // Still empty: wait for the (possibly postponed) next refill.
      setTimeout(retry, _msUntilRefill());
    };
    setTimeout(retry, _msUntilRefill());
  });
}

// --- 429 backoff helper ---
// Call this when a 429 response is received. Drains all tokens and restarts
// the refill window, forcing a ~1s pause on subsequent requests (the token
// bucket then refills naturally).

let _backoffCount = 0;

/**
 * Record a 429 response: empty the bucket and restart the refill window.
 * Logs a warning on the 1st, 11th, 21st, ... occurrence.
 */
function signal429() {
  _tokens = 0;
  // Refill is due 1000ms after _lastRefill, so "now" gives the intended 1s pause.
  // (A future timestamp here would stretch the pause to ~2s.)
  _lastRefill = Date.now();
  _backoffCount++;
  if (_backoffCount % 10 === 1) {
    console.warn(`[HTTP-LIMITER] 429 rate limited by npm registry (total: ${_backoffCount})`);
  }
}

/**
 * @returns {number} Number of 429 signals recorded since start or the last reset.
 */
function getBackoffCount() {
  return _backoffCount;
}
|
|
96
|
+
|
|
39
97
|
/**
 * Put the limiter back into its initial state: full token bucket, refill
 * window starting now, zero recorded 429s, no semaphore slots held and an
 * empty wait queue.
 *
 * Queued callbacks are discarded without being invoked, so promises still
 * waiting on a slot at reset time are never settled.
 */
function resetLimiter() {
  // Rate limiter and 429 bookkeeping.
  _backoffCount = 0;
  _lastRefill = Date.now();
  _tokens = RATE_LIMIT_PER_SEC;

  // Semaphore: empty the queue in place (same array object) and free every slot.
  _semaphore.queue.splice(0);
  _semaphore.active = 0;
}
|
|
43
104
|
|
|
44
105
|
function getActiveSemaphore() {
|
|
@@ -47,8 +108,11 @@ function getActiveSemaphore() {
|
|
|
47
108
|
|
|
48
109
|
module.exports = {
|
|
49
110
|
REGISTRY_SEMAPHORE_MAX,
|
|
111
|
+
RATE_LIMIT_PER_SEC,
|
|
50
112
|
acquireRegistrySlot,
|
|
51
113
|
releaseRegistrySlot,
|
|
114
|
+
signal429,
|
|
115
|
+
getBackoffCount,
|
|
52
116
|
resetLimiter,
|
|
53
117
|
getActiveSemaphore
|
|
54
118
|
};
|