@hotmeshio/hotmesh 0.19.4 → 0.20.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/modules/enums.d.ts +38 -0
- package/build/modules/enums.js +40 -1
- package/build/package.json +1 -1
- package/build/services/engine/index.d.ts +6 -0
- package/build/services/engine/index.js +8 -0
- package/build/services/hotmesh/index.js +14 -0
- package/build/services/quorum/index.js +13 -0
- package/build/services/router/config/index.d.ts +2 -2
- package/build/services/router/config/index.js +8 -1
- package/build/services/router/consumption/index.d.ts +6 -1
- package/build/services/router/consumption/index.js +35 -1
- package/build/services/router/duress/index.d.ts +91 -0
- package/build/services/router/duress/index.js +217 -0
- package/build/services/router/index.d.ts +7 -0
- package/build/services/router/index.js +18 -1
- package/build/services/router/throttling/index.d.ts +28 -0
- package/build/services/router/throttling/index.js +43 -4
- package/build/services/stream/providers/postgres/kvtables.js +74 -36
- package/build/services/stream/providers/postgres/messages.js +16 -6
- package/build/services/stream/providers/postgres/postgres.d.ts +7 -0
- package/build/services/stream/providers/postgres/postgres.js +26 -3
- package/build/services/stream/providers/postgres/procedures.js +10 -3
- package/build/types/quorum.d.ts +20 -1
- package/build/types/stream.d.ts +4 -0
- package/package.json +1 -1
package/build/modules/enums.d.ts
CHANGED
|
@@ -256,6 +256,44 @@ export declare const HMSH_GUID_SIZE: number;
|
|
|
256
256
|
* Default task queue name used when no task queue is specified
|
|
257
257
|
*/
|
|
258
258
|
export declare const DEFAULT_TASK_QUEUE = "default";
|
|
259
|
+
/**
|
|
260
|
+
* EMA smoothing factor for duress latency tracking.
|
|
261
|
+
* Higher = faster response to spikes, lower = more stable.
|
|
262
|
+
* @default 0.3
|
|
263
|
+
*/
|
|
264
|
+
export declare const HMSH_DURESS_ALPHA: number;
|
|
265
|
+
/**
|
|
266
|
+
* Number of messages between duress evaluations.
|
|
267
|
+
* @default 10
|
|
268
|
+
*/
|
|
269
|
+
export declare const HMSH_DURESS_EVAL_INTERVAL: number;
|
|
270
|
+
/**
|
|
271
|
+
* Max EMA (ms) below which the engine is considered healthy. No throttle applied.
|
|
272
|
+
* @default 200
|
|
273
|
+
*/
|
|
274
|
+
export declare const HMSH_DURESS_HEALTHY_CEILING_MS: number;
|
|
275
|
+
/**
|
|
276
|
+
* Max EMA (ms) below which duress is mild. Light throttle (100-500ms).
|
|
277
|
+
* @default 1000
|
|
278
|
+
*/
|
|
279
|
+
export declare const HMSH_DURESS_MILD_CEILING_MS: number;
|
|
280
|
+
/**
|
|
281
|
+
* Max EMA (ms) below which duress is moderate. Moderate throttle (500-2000ms).
|
|
282
|
+
* Above this threshold, duress is severe (2000-5000ms throttle).
|
|
283
|
+
* @default 5000
|
|
284
|
+
*/
|
|
285
|
+
export declare const HMSH_DURESS_MODERATE_CEILING_MS: number;
|
|
286
|
+
/**
|
|
287
|
+
* Minimum interval (ms) between quorum duress broadcasts.
|
|
288
|
+
* @default 5000
|
|
289
|
+
*/
|
|
290
|
+
export declare const HMSH_DURESS_BROADCAST_INTERVAL_MS: number;
|
|
291
|
+
/**
|
|
292
|
+
* Number of consecutive improving evaluations required before
|
|
293
|
+
* dropping a duress level. Prevents oscillation.
|
|
294
|
+
* @default 3
|
|
295
|
+
*/
|
|
296
|
+
export declare const HMSH_DURESS_HYSTERESIS_COUNT: number;
|
|
259
297
|
/**
|
|
260
298
|
* PostgreSQL NOTIFY payload limit. If a job message exceeds this size,
|
|
261
299
|
* a reference message is sent instead and the subscriber fetches via getState.
|
package/build/modules/enums.js
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
"use strict";
|
|
2
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
3
|
exports.HMSH_RESERVATION_TIMEOUT_S = exports.HMSH_ENGINE_CONCURRENCY = exports.HMSH_BATCH_SIZE_MIN = exports.HMSH_BATCH_SIZE = exports.HMSH_XPENDING_COUNT = exports.HMSH_XCLAIM_COUNT = exports.HMSH_XCLAIM_DELAY_MS = exports.HMSH_BLOCK_TIME_MS = exports.HMSH_DURABLE_INITIAL_INTERVAL = exports.HMSH_DURABLE_EXP_BACKOFF = exports.HMSH_DURABLE_MAX_INTERVAL = exports.HMSH_DURABLE_MAX_ATTEMPTS = exports.HMSH_GRADUATED_INTERVAL_MS = exports.HMSH_MAX_TIMEOUT_MS = exports.HMSH_MAX_CYCLES = exports.HMSH_POISON_MESSAGE_THRESHOLD = exports.HMSH_MAX_RETRIES = exports.MAX_DELAY = exports.MAX_STREAM_RETRIES = exports.INITIAL_STREAM_BACKOFF = exports.MAX_STREAM_BACKOFF = exports.HMSH_EXPIRE_JOB_SECONDS = exports.HMSH_OTT_WAIT_TIME = exports.HMSH_DEPLOYMENT_PAUSE = exports.HMSH_DEPLOYMENT_DELAY = exports.HMSH_ACTIVATION_MAX_RETRY = exports.HMSH_QUORUM_DELAY_MS = exports.HMSH_QUORUM_ROLLCALL_CYCLES = exports.HMSH_STATUS_UNKNOWN = exports.HMSH_CODE_DURABLE_RETRYABLE = exports.HMSH_CODE_DURABLE_FATAL = exports.HMSH_CODE_DURABLE_MAXED = exports.HMSH_CODE_DURABLE_TIMEOUT = exports.HMSH_CODE_DURABLE_WAIT = exports.HMSH_CODE_DURABLE_CONTINUE = exports.HMSH_CODE_DURABLE_PROXY = exports.HMSH_CODE_DURABLE_CHILD = exports.HMSH_CODE_DURABLE_ALL = exports.HMSH_CODE_DURABLE_SLEEP = exports.HMSH_CODE_UNACKED = exports.HMSH_CODE_TIMEOUT = exports.HMSH_CODE_UNKNOWN = exports.HMSH_CODE_INTERRUPT = exports.HMSH_CODE_NOTFOUND = exports.HMSH_CODE_PENDING = exports.HMSH_CODE_SUCCESS = exports.HMSH_PENDING_SIGNAL_EXPIRE = exports.HMSH_SIGNAL_EXPIRE = exports.HMSH_TELEMETRY = exports.HMSH_LOGLEVEL = void 0;
|
|
4
|
-
exports.HMSH_ROUTER_POLL_FALLBACK_INTERVAL = exports.HMSH_NOTIFY_PAYLOAD_LIMIT = exports.DEFAULT_TASK_QUEUE = exports.HMSH_GUID_SIZE = exports.HMSH_ROUTER_SCOUT_INTERVAL_MS = exports.HMSH_ROUTER_SCOUT_INTERVAL_SECONDS = exports.HMSH_SCOUT_INTERVAL_SECONDS = exports.HMSH_FIDELITY_SECONDS = exports.HMSH_EXPIRE_DURATION = exports.HMSH_RESERVATION_TIMEOUT_MAX_S = void 0;
|
|
4
|
+
exports.HMSH_ROUTER_POLL_FALLBACK_INTERVAL = exports.HMSH_NOTIFY_PAYLOAD_LIMIT = exports.HMSH_DURESS_HYSTERESIS_COUNT = exports.HMSH_DURESS_BROADCAST_INTERVAL_MS = exports.HMSH_DURESS_MODERATE_CEILING_MS = exports.HMSH_DURESS_MILD_CEILING_MS = exports.HMSH_DURESS_HEALTHY_CEILING_MS = exports.HMSH_DURESS_EVAL_INTERVAL = exports.HMSH_DURESS_ALPHA = exports.DEFAULT_TASK_QUEUE = exports.HMSH_GUID_SIZE = exports.HMSH_ROUTER_SCOUT_INTERVAL_MS = exports.HMSH_ROUTER_SCOUT_INTERVAL_SECONDS = exports.HMSH_SCOUT_INTERVAL_SECONDS = exports.HMSH_FIDELITY_SECONDS = exports.HMSH_EXPIRE_DURATION = exports.HMSH_RESERVATION_TIMEOUT_MAX_S = void 0;
|
|
5
5
|
/**
|
|
6
6
|
* Determines the log level for the application. The default is 'info'.
|
|
7
7
|
*/
|
|
@@ -288,6 +288,45 @@ exports.HMSH_GUID_SIZE = Math.min(parseInt(process.env.HMSH_GUID_SIZE, 10) || 22
|
|
|
288
288
|
* Default task queue name used when no task queue is specified
|
|
289
289
|
*/
|
|
290
290
|
exports.DEFAULT_TASK_QUEUE = 'default';
|
|
291
|
+
// DURESS DETECTION — adaptive engine throttling based on processing latency
|
|
292
|
+
/**
|
|
293
|
+
* EMA smoothing factor for duress latency tracking.
|
|
294
|
+
* Higher = faster response to spikes, lower = more stable.
|
|
295
|
+
* @default 0.3
|
|
296
|
+
*/
|
|
297
|
+
exports.HMSH_DURESS_ALPHA = parseFloat(process.env.HMSH_DURESS_ALPHA) || 0.3;
|
|
298
|
+
/**
|
|
299
|
+
* Number of messages between duress evaluations.
|
|
300
|
+
* @default 10
|
|
301
|
+
*/
|
|
302
|
+
exports.HMSH_DURESS_EVAL_INTERVAL = parseInt(process.env.HMSH_DURESS_EVAL_INTERVAL, 10) || 10;
|
|
303
|
+
/**
|
|
304
|
+
* Max EMA (ms) below which the engine is considered healthy. No throttle applied.
|
|
305
|
+
* @default 200
|
|
306
|
+
*/
|
|
307
|
+
exports.HMSH_DURESS_HEALTHY_CEILING_MS = parseInt(process.env.HMSH_DURESS_HEALTHY_CEILING_MS, 10) || 200;
|
|
308
|
+
/**
|
|
309
|
+
* Max EMA (ms) below which duress is mild. Light throttle (100-500ms).
|
|
310
|
+
* @default 1000
|
|
311
|
+
*/
|
|
312
|
+
exports.HMSH_DURESS_MILD_CEILING_MS = parseInt(process.env.HMSH_DURESS_MILD_CEILING_MS, 10) || 1000;
|
|
313
|
+
/**
|
|
314
|
+
* Max EMA (ms) below which duress is moderate. Moderate throttle (500-2000ms).
|
|
315
|
+
* Above this threshold, duress is severe (2000-5000ms throttle).
|
|
316
|
+
* @default 5000
|
|
317
|
+
*/
|
|
318
|
+
exports.HMSH_DURESS_MODERATE_CEILING_MS = parseInt(process.env.HMSH_DURESS_MODERATE_CEILING_MS, 10) || 5000;
|
|
319
|
+
/**
|
|
320
|
+
* Minimum interval (ms) between quorum duress broadcasts.
|
|
321
|
+
* @default 5000
|
|
322
|
+
*/
|
|
323
|
+
exports.HMSH_DURESS_BROADCAST_INTERVAL_MS = parseInt(process.env.HMSH_DURESS_BROADCAST_INTERVAL_MS, 10) || 5000;
|
|
324
|
+
/**
|
|
325
|
+
* Number of consecutive improving evaluations required before
|
|
326
|
+
* dropping a duress level. Prevents oscillation.
|
|
327
|
+
* @default 3
|
|
328
|
+
*/
|
|
329
|
+
exports.HMSH_DURESS_HYSTERESIS_COUNT = parseInt(process.env.HMSH_DURESS_HYSTERESIS_COUNT, 10) || 3;
|
|
291
330
|
/**
|
|
292
331
|
* PostgreSQL NOTIFY payload limit. If a job message exceeds this size,
|
|
293
332
|
* a reference message is sent instead and the subscriber fetches via getState.
|
package/build/package.json
CHANGED
|
@@ -167,6 +167,12 @@ declare class EngineService {
|
|
|
167
167
|
* @private
|
|
168
168
|
*/
|
|
169
169
|
throttle(delayInMillis: number): Promise<void>;
|
|
170
|
+
/**
|
|
171
|
+
* Apply a remote duress signal from the quorum.
|
|
172
|
+
* Delegates to the router's duress manager.
|
|
173
|
+
* @private
|
|
174
|
+
*/
|
|
175
|
+
applyRemoteDuress(throttleMs: number, level: string): void;
|
|
170
176
|
/**
|
|
171
177
|
* @private
|
|
172
178
|
*/
|
|
@@ -267,6 +267,14 @@ class EngineService {
|
|
|
267
267
|
async throttle(delayInMillis) {
|
|
268
268
|
return Signal.throttle(this, delayInMillis);
|
|
269
269
|
}
|
|
270
|
+
/**
|
|
271
|
+
* Apply a remote duress signal from the quorum.
|
|
272
|
+
* Delegates to the router's duress manager.
|
|
273
|
+
* @private
|
|
274
|
+
*/
|
|
275
|
+
applyRemoteDuress(throttleMs, level) {
|
|
276
|
+
this.router?.applyRemoteDuress(throttleMs, level);
|
|
277
|
+
}
|
|
270
278
|
// ═════════════════════════════════════════════════════════════════
|
|
271
279
|
// 9. PUB/SUB — topic messaging, subscriptions, callbacks
|
|
272
280
|
// → see pubsub.ts
|
|
@@ -181,6 +181,20 @@ class HotMesh {
|
|
|
181
181
|
instance.logger = new logger_1.LoggerService(config.appId, instance.guid, config.name || '', config.logLevel);
|
|
182
182
|
await Init.initEngine(instance, config, instance.logger);
|
|
183
183
|
await Init.initQuorum(instance, config, instance.engine, instance.logger);
|
|
184
|
+
// Register duress broadcast callback: engine router → quorum
|
|
185
|
+
if (instance.engine?.router && instance.quorum) {
|
|
186
|
+
const quorum = instance.quorum;
|
|
187
|
+
const engineGuid = instance.guid;
|
|
188
|
+
instance.engine.router.setDuressCallback((snapshot) => {
|
|
189
|
+
quorum.pub({
|
|
190
|
+
type: 'duress',
|
|
191
|
+
originator: engineGuid,
|
|
192
|
+
duress_score_ms: snapshot.score_ms,
|
|
193
|
+
throttle_ms: snapshot.throttle_ms,
|
|
194
|
+
level: snapshot.level,
|
|
195
|
+
});
|
|
196
|
+
});
|
|
197
|
+
}
|
|
184
198
|
await Init.doWork(instance, config, instance.logger);
|
|
185
199
|
return instance;
|
|
186
200
|
}
|
|
@@ -111,6 +111,12 @@ class QuorumService {
|
|
|
111
111
|
else if (message.type === 'cron') {
|
|
112
112
|
self.engine.processTimeHooks();
|
|
113
113
|
}
|
|
114
|
+
else if (message.type === 'duress') {
|
|
115
|
+
// Apply remote duress signal (skip our own broadcasts)
|
|
116
|
+
if (message.originator !== self.guid) {
|
|
117
|
+
self.engine.applyRemoteDuress(message.throttle_ms, message.level);
|
|
118
|
+
}
|
|
119
|
+
}
|
|
114
120
|
else if (message.type === 'rollcall') {
|
|
115
121
|
self.doRollCall(message);
|
|
116
122
|
}
|
|
@@ -147,6 +153,13 @@ class QuorumService {
|
|
|
147
153
|
reclaimCount: this.engine.router.reclaimCount,
|
|
148
154
|
system: await (0, utils_1.getSystemHealth)(),
|
|
149
155
|
};
|
|
156
|
+
// Include duress info if available (engine routers only)
|
|
157
|
+
const duressSnapshot = this.engine.router.getDuressSnapshot?.();
|
|
158
|
+
if (duressSnapshot) {
|
|
159
|
+
profile.duress_level = duressSnapshot.level;
|
|
160
|
+
profile.duress_score_ms = duressSnapshot.score_ms;
|
|
161
|
+
profile.duress_per_type = duressSnapshot.per_type;
|
|
162
|
+
}
|
|
150
163
|
}
|
|
151
164
|
this.subscribe.publish(hotmesh_1.KeyType.QUORUM, {
|
|
152
165
|
type: 'pong',
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { HMSH_BLOCK_TIME_MS, HMSH_MAX_RETRIES, HMSH_MAX_TIMEOUT_MS, HMSH_GRADUATED_INTERVAL_MS, HMSH_CODE_UNACKED, HMSH_CODE_UNKNOWN, HMSH_STATUS_UNKNOWN, HMSH_XCLAIM_COUNT, HMSH_XCLAIM_DELAY_MS, HMSH_XPENDING_COUNT, HMSH_BATCH_SIZE, HMSH_BATCH_SIZE_MIN, HMSH_RESERVATION_TIMEOUT_S, HMSH_RESERVATION_TIMEOUT_MAX_S, MAX_DELAY, MAX_STREAM_BACKOFF, INITIAL_STREAM_BACKOFF, MAX_STREAM_RETRIES, HMSH_POISON_MESSAGE_THRESHOLD } from '../../../modules/enums';
|
|
1
|
+
import { HMSH_BLOCK_TIME_MS, HMSH_MAX_RETRIES, HMSH_MAX_TIMEOUT_MS, HMSH_GRADUATED_INTERVAL_MS, HMSH_CODE_UNACKED, HMSH_CODE_UNKNOWN, HMSH_STATUS_UNKNOWN, HMSH_XCLAIM_COUNT, HMSH_XCLAIM_DELAY_MS, HMSH_XPENDING_COUNT, HMSH_BATCH_SIZE, HMSH_BATCH_SIZE_MIN, HMSH_RESERVATION_TIMEOUT_S, HMSH_RESERVATION_TIMEOUT_MAX_S, MAX_DELAY, MAX_STREAM_BACKOFF, INITIAL_STREAM_BACKOFF, MAX_STREAM_RETRIES, HMSH_POISON_MESSAGE_THRESHOLD, HMSH_DURESS_ALPHA, HMSH_DURESS_EVAL_INTERVAL, HMSH_DURESS_HEALTHY_CEILING_MS, HMSH_DURESS_MILD_CEILING_MS, HMSH_DURESS_MODERATE_CEILING_MS, HMSH_DURESS_BROADCAST_INTERVAL_MS, HMSH_DURESS_HYSTERESIS_COUNT } from '../../../modules/enums';
|
|
2
2
|
import { RouterConfig } from '../../../types/stream';
|
|
3
3
|
export declare class RouterConfigManager {
|
|
4
4
|
static validateThrottle(delayInMillis: number): void;
|
|
@@ -8,4 +8,4 @@ export declare class RouterConfigManager {
|
|
|
8
8
|
readonly: boolean;
|
|
9
9
|
};
|
|
10
10
|
}
|
|
11
|
-
export { HMSH_BLOCK_TIME_MS, HMSH_MAX_RETRIES, HMSH_MAX_TIMEOUT_MS, HMSH_GRADUATED_INTERVAL_MS, HMSH_CODE_UNACKED, HMSH_CODE_UNKNOWN, HMSH_STATUS_UNKNOWN, HMSH_XCLAIM_COUNT, HMSH_XCLAIM_DELAY_MS, HMSH_XPENDING_COUNT, HMSH_BATCH_SIZE, HMSH_BATCH_SIZE_MIN, HMSH_RESERVATION_TIMEOUT_S, HMSH_RESERVATION_TIMEOUT_MAX_S, MAX_DELAY, MAX_STREAM_BACKOFF, INITIAL_STREAM_BACKOFF, MAX_STREAM_RETRIES, HMSH_POISON_MESSAGE_THRESHOLD, };
|
|
11
|
+
export { HMSH_BLOCK_TIME_MS, HMSH_MAX_RETRIES, HMSH_MAX_TIMEOUT_MS, HMSH_GRADUATED_INTERVAL_MS, HMSH_CODE_UNACKED, HMSH_CODE_UNKNOWN, HMSH_STATUS_UNKNOWN, HMSH_XCLAIM_COUNT, HMSH_XCLAIM_DELAY_MS, HMSH_XPENDING_COUNT, HMSH_BATCH_SIZE, HMSH_BATCH_SIZE_MIN, HMSH_RESERVATION_TIMEOUT_S, HMSH_RESERVATION_TIMEOUT_MAX_S, MAX_DELAY, MAX_STREAM_BACKOFF, INITIAL_STREAM_BACKOFF, MAX_STREAM_RETRIES, HMSH_POISON_MESSAGE_THRESHOLD, HMSH_DURESS_ALPHA, HMSH_DURESS_EVAL_INTERVAL, HMSH_DURESS_HEALTHY_CEILING_MS, HMSH_DURESS_MILD_CEILING_MS, HMSH_DURESS_MODERATE_CEILING_MS, HMSH_DURESS_BROADCAST_INTERVAL_MS, HMSH_DURESS_HYSTERESIS_COUNT, };
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
"use strict";
|
|
2
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
-
exports.HMSH_POISON_MESSAGE_THRESHOLD = exports.MAX_STREAM_RETRIES = exports.INITIAL_STREAM_BACKOFF = exports.MAX_STREAM_BACKOFF = exports.MAX_DELAY = exports.HMSH_RESERVATION_TIMEOUT_MAX_S = exports.HMSH_RESERVATION_TIMEOUT_S = exports.HMSH_BATCH_SIZE_MIN = exports.HMSH_BATCH_SIZE = exports.HMSH_XPENDING_COUNT = exports.HMSH_XCLAIM_DELAY_MS = exports.HMSH_XCLAIM_COUNT = exports.HMSH_STATUS_UNKNOWN = exports.HMSH_CODE_UNKNOWN = exports.HMSH_CODE_UNACKED = exports.HMSH_GRADUATED_INTERVAL_MS = exports.HMSH_MAX_TIMEOUT_MS = exports.HMSH_MAX_RETRIES = exports.HMSH_BLOCK_TIME_MS = exports.RouterConfigManager = void 0;
|
|
3
|
+
exports.HMSH_DURESS_HYSTERESIS_COUNT = exports.HMSH_DURESS_BROADCAST_INTERVAL_MS = exports.HMSH_DURESS_MODERATE_CEILING_MS = exports.HMSH_DURESS_MILD_CEILING_MS = exports.HMSH_DURESS_HEALTHY_CEILING_MS = exports.HMSH_DURESS_EVAL_INTERVAL = exports.HMSH_DURESS_ALPHA = exports.HMSH_POISON_MESSAGE_THRESHOLD = exports.MAX_STREAM_RETRIES = exports.INITIAL_STREAM_BACKOFF = exports.MAX_STREAM_BACKOFF = exports.MAX_DELAY = exports.HMSH_RESERVATION_TIMEOUT_MAX_S = exports.HMSH_RESERVATION_TIMEOUT_S = exports.HMSH_BATCH_SIZE_MIN = exports.HMSH_BATCH_SIZE = exports.HMSH_XPENDING_COUNT = exports.HMSH_XCLAIM_DELAY_MS = exports.HMSH_XCLAIM_COUNT = exports.HMSH_STATUS_UNKNOWN = exports.HMSH_CODE_UNKNOWN = exports.HMSH_CODE_UNACKED = exports.HMSH_GRADUATED_INTERVAL_MS = exports.HMSH_MAX_TIMEOUT_MS = exports.HMSH_MAX_RETRIES = exports.HMSH_BLOCK_TIME_MS = exports.RouterConfigManager = void 0;
|
|
4
4
|
const enums_1 = require("../../../modules/enums");
|
|
5
5
|
Object.defineProperty(exports, "HMSH_BLOCK_TIME_MS", { enumerable: true, get: function () { return enums_1.HMSH_BLOCK_TIME_MS; } });
|
|
6
6
|
Object.defineProperty(exports, "HMSH_MAX_RETRIES", { enumerable: true, get: function () { return enums_1.HMSH_MAX_RETRIES; } });
|
|
@@ -21,6 +21,13 @@ Object.defineProperty(exports, "MAX_STREAM_BACKOFF", { enumerable: true, get: fu
|
|
|
21
21
|
Object.defineProperty(exports, "INITIAL_STREAM_BACKOFF", { enumerable: true, get: function () { return enums_1.INITIAL_STREAM_BACKOFF; } });
|
|
22
22
|
Object.defineProperty(exports, "MAX_STREAM_RETRIES", { enumerable: true, get: function () { return enums_1.MAX_STREAM_RETRIES; } });
|
|
23
23
|
Object.defineProperty(exports, "HMSH_POISON_MESSAGE_THRESHOLD", { enumerable: true, get: function () { return enums_1.HMSH_POISON_MESSAGE_THRESHOLD; } });
|
|
24
|
+
Object.defineProperty(exports, "HMSH_DURESS_ALPHA", { enumerable: true, get: function () { return enums_1.HMSH_DURESS_ALPHA; } });
|
|
25
|
+
Object.defineProperty(exports, "HMSH_DURESS_EVAL_INTERVAL", { enumerable: true, get: function () { return enums_1.HMSH_DURESS_EVAL_INTERVAL; } });
|
|
26
|
+
Object.defineProperty(exports, "HMSH_DURESS_HEALTHY_CEILING_MS", { enumerable: true, get: function () { return enums_1.HMSH_DURESS_HEALTHY_CEILING_MS; } });
|
|
27
|
+
Object.defineProperty(exports, "HMSH_DURESS_MILD_CEILING_MS", { enumerable: true, get: function () { return enums_1.HMSH_DURESS_MILD_CEILING_MS; } });
|
|
28
|
+
Object.defineProperty(exports, "HMSH_DURESS_MODERATE_CEILING_MS", { enumerable: true, get: function () { return enums_1.HMSH_DURESS_MODERATE_CEILING_MS; } });
|
|
29
|
+
Object.defineProperty(exports, "HMSH_DURESS_BROADCAST_INTERVAL_MS", { enumerable: true, get: function () { return enums_1.HMSH_DURESS_BROADCAST_INTERVAL_MS; } });
|
|
30
|
+
Object.defineProperty(exports, "HMSH_DURESS_HYSTERESIS_COUNT", { enumerable: true, get: function () { return enums_1.HMSH_DURESS_HYSTERESIS_COUNT; } });
|
|
24
31
|
class RouterConfigManager {
|
|
25
32
|
static validateThrottle(delayInMillis) {
|
|
26
33
|
if (!Number.isInteger(delayInMillis) ||
|
|
@@ -3,6 +3,7 @@ import { StreamService } from '../../stream';
|
|
|
3
3
|
import { ThrottleManager } from '../throttling';
|
|
4
4
|
import { ErrorHandler } from '../error-handling';
|
|
5
5
|
import { LifecycleManager } from '../lifecycle';
|
|
6
|
+
import { DuressManager, DuressSnapshot } from '../duress';
|
|
6
7
|
import { StreamData, StreamDataResponse } from '../../../types/stream';
|
|
7
8
|
import { ProviderClient, ProviderTransaction } from '../../../types/provider';
|
|
8
9
|
export declare class ConsumptionManager<S extends StreamService<ProviderClient, ProviderTransaction>> {
|
|
@@ -26,6 +27,9 @@ export declare class ConsumptionManager<S extends StreamService<ProviderClient,
|
|
|
26
27
|
private set hasReachedMaxBackoff(value);
|
|
27
28
|
private router;
|
|
28
29
|
private retry;
|
|
30
|
+
private duressManager?;
|
|
31
|
+
private onDuressChange?;
|
|
32
|
+
private messagesSinceLastEval;
|
|
29
33
|
private adaptiveReservationTimeout;
|
|
30
34
|
private adaptiveBatchSize;
|
|
31
35
|
private lastDepthCheckAt;
|
|
@@ -33,7 +37,8 @@ export declare class ConsumptionManager<S extends StreamService<ProviderClient,
|
|
|
33
37
|
private static readonly DEPTH_SCALE_UP_THRESHOLD;
|
|
34
38
|
private static readonly DEPTH_SCALE_DOWN_THRESHOLD;
|
|
35
39
|
private static readonly LEASE_BUFFER_S;
|
|
36
|
-
constructor(stream: S, logger: ILogger, throttleManager: ThrottleManager, errorHandler: ErrorHandler, lifecycleManager: LifecycleManager<S>, reclaimDelay: number, reclaimCount: number, appId: string, role: any, router: any, retry?: import('../../../types/stream').RetryPolicy);
|
|
40
|
+
constructor(stream: S, logger: ILogger, throttleManager: ThrottleManager, errorHandler: ErrorHandler, lifecycleManager: LifecycleManager<S>, reclaimDelay: number, reclaimCount: number, appId: string, role: any, router: any, retry?: import('../../../types/stream').RetryPolicy, duressManager?: DuressManager);
|
|
41
|
+
setDuressCallback(callback: (snapshot: DuressSnapshot) => void): void;
|
|
37
42
|
/**
|
|
38
43
|
* Adjusts reservation timeout based on stream depth. Called periodically
|
|
39
44
|
* from the consume loop. When depth is high:
|
|
@@ -17,7 +17,8 @@ class ConsumptionManager {
|
|
|
17
17
|
get counts() { return this.router.counts; }
|
|
18
18
|
get hasReachedMaxBackoff() { return this.router.hasReachedMaxBackoff; }
|
|
19
19
|
set hasReachedMaxBackoff(v) { this.router.hasReachedMaxBackoff = v; }
|
|
20
|
-
constructor(stream, logger, throttleManager, errorHandler, lifecycleManager, reclaimDelay, reclaimCount, appId, role, router, retry) {
|
|
20
|
+
constructor(stream, logger, throttleManager, errorHandler, lifecycleManager, reclaimDelay, reclaimCount, appId, role, router, retry, duressManager) {
|
|
21
|
+
this.messagesSinceLastEval = 0;
|
|
21
22
|
// Adaptive consumption pressure — scales reservation timeout AND batch
|
|
22
23
|
// size based on stream depth. Under load: timeout grows (prevents
|
|
23
24
|
// duplicate re-reservation) and batch size shrinks (reduces in-memory
|
|
@@ -37,6 +38,10 @@ class ConsumptionManager {
|
|
|
37
38
|
this.role = role;
|
|
38
39
|
this.router = router;
|
|
39
40
|
this.retry = retry;
|
|
41
|
+
this.duressManager = duressManager;
|
|
42
|
+
}
|
|
43
|
+
setDuressCallback(callback) {
|
|
44
|
+
this.onDuressChange = callback;
|
|
40
45
|
}
|
|
41
46
|
/**
|
|
42
47
|
* Adjusts reservation timeout based on stream depth. Called periodically
|
|
@@ -500,6 +505,7 @@ class ConsumptionManager {
|
|
|
500
505
|
const deadlineMs = this.adaptiveReservationTimeout * 1000;
|
|
501
506
|
let output;
|
|
502
507
|
const telemetry = new telemetry_1.RouterTelemetry(this.appId);
|
|
508
|
+
const processingStart = Date.now();
|
|
503
509
|
try {
|
|
504
510
|
telemetry.startStreamSpan(input, this.role);
|
|
505
511
|
let deadlineTimer;
|
|
@@ -549,6 +555,34 @@ class ConsumptionManager {
|
|
|
549
555
|
telemetry.setStreamErrorFromException(err);
|
|
550
556
|
output = this.errorHandler.structureUnhandledError(input, err instanceof Error ? err : new Error(String(err)));
|
|
551
557
|
}
|
|
558
|
+
// Record processing latency for duress detection (engine routers only).
|
|
559
|
+
// This measures the actual time spent in execStreamLeg — the causal
|
|
560
|
+
// signal. The prior depth-based mechanism (adjustConsumptionPressure)
|
|
561
|
+
// responds to queue backlog; this responds to *why* the backlog exists.
|
|
562
|
+
// Evaluation is amortized over HMSH_DURESS_EVAL_INTERVAL messages to
|
|
563
|
+
// avoid per-message overhead.
|
|
564
|
+
if (this.duressManager && input.type) {
|
|
565
|
+
const processingDuration = Date.now() - processingStart;
|
|
566
|
+
this.duressManager.recordLatency(input.type, processingDuration);
|
|
567
|
+
if (++this.messagesSinceLastEval >= config_1.HMSH_DURESS_EVAL_INTERVAL) {
|
|
568
|
+
this.messagesSinceLastEval = 0;
|
|
569
|
+
const snapshot = this.duressManager.evaluate();
|
|
570
|
+
this.throttleManager.setDuressFloor(snapshot.throttle_ms);
|
|
571
|
+
if (snapshot.level !== 'healthy') {
|
|
572
|
+
this.logger.info('stream-duress-detected', {
|
|
573
|
+
stream,
|
|
574
|
+
level: snapshot.level,
|
|
575
|
+
score_ms: snapshot.score_ms,
|
|
576
|
+
throttle_ms: snapshot.throttle_ms,
|
|
577
|
+
per_type: snapshot.per_type,
|
|
578
|
+
});
|
|
579
|
+
}
|
|
580
|
+
if (this.duressManager.shouldBroadcast() && this.onDuressChange) {
|
|
581
|
+
this.duressManager.markBroadcast();
|
|
582
|
+
this.onDuressChange(snapshot);
|
|
583
|
+
}
|
|
584
|
+
}
|
|
585
|
+
}
|
|
552
586
|
try {
|
|
553
587
|
// When the ENGINE encounters an infrastructure error (schema not found,
|
|
554
588
|
// subscription missing — code 598), the message is permanently unprocessable.
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
import { StreamDataType } from '../../../types/stream';
|
|
2
|
+
import { DuressLevel } from '../../../types/quorum';
|
|
3
|
+
export interface DuressSnapshot {
|
|
4
|
+
level: DuressLevel;
|
|
5
|
+
score_ms: number;
|
|
6
|
+
throttle_ms: number;
|
|
7
|
+
per_type: Record<string, number>;
|
|
8
|
+
}
|
|
9
|
+
/**
|
|
10
|
+
* Adaptive engine duress detection via processing latency.
|
|
11
|
+
*
|
|
12
|
+
* ## Why this exists
|
|
13
|
+
*
|
|
14
|
+
* Prior fixes responded to queue *depth* (a symptom) — doubling reservation
|
|
15
|
+
* timeouts and halving batch sizes when the stream backed up. A deep queue
|
|
16
|
+
* doesn't necessarily mean duress (it could be a burst of external triggers),
|
|
17
|
+
* and a shallow queue doesn't necessarily mean health. This module responds
|
|
18
|
+
* to the *cause*: actual processing latency per message type.
|
|
19
|
+
*
|
|
20
|
+
* ## How it works
|
|
21
|
+
*
|
|
22
|
+
* Each engine router tracks an exponential moving average (EMA) of how long
|
|
23
|
+
* each canonical message type (transition, timehook, webhook, worker response,
|
|
24
|
+
* etc.) takes to process. When healthy, these are sub-50ms. When the max EMA
|
|
25
|
+
* crosses configurable thresholds (200ms → mild, 1s → moderate, 5s → severe),
|
|
26
|
+
* the manager computes a proportional throttle delay that the ThrottleManager
|
|
27
|
+
* applies as a floor on engine consumption rate.
|
|
28
|
+
*
|
|
29
|
+
* ## Hysteresis (asymmetric by design)
|
|
30
|
+
*
|
|
31
|
+
* Escalation is immediate — if the engine suddenly enters duress, the throttle
|
|
32
|
+
* kicks in on the next evaluation. De-escalation requires `HYSTERESIS_COUNT`
|
|
33
|
+
* (default 3) consecutive improving evaluations before dropping a level. This
|
|
34
|
+
* prevents oscillation: throttle → drain → un-throttle → refill → throttle.
|
|
35
|
+
* The EMA already smooths individual outliers; hysteresis gates the recovery
|
|
36
|
+
* path specifically.
|
|
37
|
+
*
|
|
38
|
+
* ## Quorum coordination
|
|
39
|
+
*
|
|
40
|
+
* When a router detects a level change (or remains in duress), it broadcasts
|
|
41
|
+
* a `'duress'` message via the quorum. Peers adopt the signal only if it's
|
|
42
|
+
* worse than their local state, so the mesh converges on the worst-case
|
|
43
|
+
* throttle without coordination.
|
|
44
|
+
*
|
|
45
|
+
* ## What this does NOT do
|
|
46
|
+
*
|
|
47
|
+
* External messages (triggers, signalIn/webhooks from the outside world) are
|
|
48
|
+
* never throttled. They always enter `engine_streams`. Only the engine
|
|
49
|
+
* routers' pull rate slows down, giving the system breathing room.
|
|
50
|
+
*/
|
|
51
|
+
export declare class DuressManager {
|
|
52
|
+
private emas;
|
|
53
|
+
private sampleCounts;
|
|
54
|
+
private currentLevel;
|
|
55
|
+
private belowThresholdCount;
|
|
56
|
+
private duressThrottle;
|
|
57
|
+
private lastBroadcastAt;
|
|
58
|
+
private lastBroadcastLevel;
|
|
59
|
+
/**
|
|
60
|
+
* Record a processing duration for a message type.
|
|
61
|
+
* Updates the exponential moving average for that type.
|
|
62
|
+
*/
|
|
63
|
+
recordLatency(type: StreamDataType, durationMs: number): void;
|
|
64
|
+
/**
|
|
65
|
+
* Evaluate duress state from current EMAs.
|
|
66
|
+
* Returns a snapshot with level, score, recommended throttle,
|
|
67
|
+
* and per-type latencies.
|
|
68
|
+
*/
|
|
69
|
+
evaluate(): DuressSnapshot;
|
|
70
|
+
getDuressThrottle(): number;
|
|
71
|
+
getCurrentLevel(): DuressLevel;
|
|
72
|
+
/**
|
|
73
|
+
* Apply a duress snapshot received from another engine via quorum.
|
|
74
|
+
* Adopts the remote signal only if it indicates worse duress than local.
|
|
75
|
+
*/
|
|
76
|
+
applyRemoteDuress(throttleMs: number, level: DuressLevel): void;
|
|
77
|
+
/**
|
|
78
|
+
* Whether a quorum broadcast is warranted.
|
|
79
|
+
* Rate-limited and only fires when level changes or duress is active.
|
|
80
|
+
*/
|
|
81
|
+
shouldBroadcast(): boolean;
|
|
82
|
+
markBroadcast(): void;
|
|
83
|
+
/**
|
|
84
|
+
* Returns a snapshot for inclusion in quorum rollcall profiles.
|
|
85
|
+
*/
|
|
86
|
+
getSnapshot(): DuressSnapshot;
|
|
87
|
+
private scoreToLevel;
|
|
88
|
+
private scoreToThrottle;
|
|
89
|
+
private lerp;
|
|
90
|
+
private levelOrdinal;
|
|
91
|
+
}
|
|
@@ -0,0 +1,217 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.DuressManager = void 0;
|
|
4
|
+
const config_1 = require("../config");
|
|
5
|
+
// Throttle band boundaries (ms)
|
|
6
|
+
const MILD_THROTTLE_MIN = 100;
|
|
7
|
+
const MILD_THROTTLE_MAX = 500;
|
|
8
|
+
const MODERATE_THROTTLE_MIN = 500;
|
|
9
|
+
const MODERATE_THROTTLE_MAX = 2000;
|
|
10
|
+
const SEVERE_THROTTLE_MIN = 2000;
|
|
11
|
+
const SEVERE_THROTTLE_MAX = 5000;
|
|
12
|
+
/**
|
|
13
|
+
* Adaptive engine duress detection via processing latency.
|
|
14
|
+
*
|
|
15
|
+
* ## Why this exists
|
|
16
|
+
*
|
|
17
|
+
* Prior fixes responded to queue *depth* (a symptom) — doubling reservation
|
|
18
|
+
* timeouts and halving batch sizes when the stream backed up. A deep queue
|
|
19
|
+
* doesn't necessarily mean duress (it could be a burst of external triggers),
|
|
20
|
+
* and a shallow queue doesn't necessarily mean health. This module responds
|
|
21
|
+
* to the *cause*: actual processing latency per message type.
|
|
22
|
+
*
|
|
23
|
+
* ## How it works
|
|
24
|
+
*
|
|
25
|
+
* Each engine router tracks an exponential moving average (EMA) of how long
|
|
26
|
+
* each canonical message type (transition, timehook, webhook, worker response,
|
|
27
|
+
* etc.) takes to process. When healthy, these are sub-50ms. When the max EMA
|
|
28
|
+
* crosses configurable thresholds (200ms → mild, 1s → moderate, 5s → severe),
|
|
29
|
+
* the manager computes a proportional throttle delay that the ThrottleManager
|
|
30
|
+
* applies as a floor on engine consumption rate.
|
|
31
|
+
*
|
|
32
|
+
* ## Hysteresis (asymmetric by design)
|
|
33
|
+
*
|
|
34
|
+
* Escalation is immediate — if the engine suddenly enters duress, the throttle
|
|
35
|
+
* kicks in on the next evaluation. De-escalation requires `HYSTERESIS_COUNT`
|
|
36
|
+
* (default 3) consecutive improving evaluations before dropping a level. This
|
|
37
|
+
* prevents oscillation: throttle → drain → un-throttle → refill → throttle.
|
|
38
|
+
* The EMA already smooths individual outliers; hysteresis gates the recovery
|
|
39
|
+
* path specifically.
|
|
40
|
+
*
|
|
41
|
+
* ## Quorum coordination
|
|
42
|
+
*
|
|
43
|
+
* When a router detects a level change (or remains in duress), it broadcasts
|
|
44
|
+
* a `'duress'` message via the quorum. Peers adopt the signal only if it's
|
|
45
|
+
* worse than their local state, so the mesh converges on the worst-case
|
|
46
|
+
* throttle without coordination.
|
|
47
|
+
*
|
|
48
|
+
* ## What this does NOT do
|
|
49
|
+
*
|
|
50
|
+
* External messages (triggers, signalIn/webhooks from the outside world) are
|
|
51
|
+
* never throttled. They always enter `engine_streams`. Only the engine
|
|
52
|
+
* routers' pull rate slows down, giving the system breathing room.
|
|
53
|
+
*/
|
|
54
|
+
class DuressManager {
|
|
55
|
+
constructor() {
|
|
56
|
+
// Per-message-type exponential moving averages
|
|
57
|
+
this.emas = new Map();
|
|
58
|
+
this.sampleCounts = new Map();
|
|
59
|
+
// Hysteresis state
|
|
60
|
+
this.currentLevel = 'healthy';
|
|
61
|
+
this.belowThresholdCount = 0;
|
|
62
|
+
// Computed duress throttle floor
|
|
63
|
+
this.duressThrottle = 0;
|
|
64
|
+
// Broadcast rate limiting
|
|
65
|
+
this.lastBroadcastAt = 0;
|
|
66
|
+
this.lastBroadcastLevel = 'healthy';
|
|
67
|
+
}
|
|
68
|
+
/**
|
|
69
|
+
* Record a processing duration for a message type.
|
|
70
|
+
* Updates the exponential moving average for that type.
|
|
71
|
+
*/
|
|
72
|
+
recordLatency(type, durationMs) {
|
|
73
|
+
const key = type;
|
|
74
|
+
const count = this.sampleCounts.get(key) || 0;
|
|
75
|
+
if (count === 0) {
|
|
76
|
+
// First sample: seed the EMA directly
|
|
77
|
+
this.emas.set(key, durationMs);
|
|
78
|
+
}
|
|
79
|
+
else {
|
|
80
|
+
const prev = this.emas.get(key);
|
|
81
|
+
this.emas.set(key, config_1.HMSH_DURESS_ALPHA * durationMs + (1 - config_1.HMSH_DURESS_ALPHA) * prev);
|
|
82
|
+
}
|
|
83
|
+
this.sampleCounts.set(key, count + 1);
|
|
84
|
+
}
|
|
85
|
+
/**
|
|
86
|
+
* Evaluate duress state from current EMAs.
|
|
87
|
+
* Returns a snapshot with level, score, recommended throttle,
|
|
88
|
+
* and per-type latencies.
|
|
89
|
+
*/
|
|
90
|
+
evaluate() {
|
|
91
|
+
// Aggregate: max EMA across all tracked types
|
|
92
|
+
let maxEma = 0;
|
|
93
|
+
const perType = {};
|
|
94
|
+
for (const [type, ema] of this.emas) {
|
|
95
|
+
perType[type] = Math.round(ema);
|
|
96
|
+
if (ema > maxEma)
|
|
97
|
+
maxEma = ema;
|
|
98
|
+
}
|
|
99
|
+
const rawLevel = this.scoreToLevel(maxEma);
|
|
100
|
+
// Hysteresis: only drop level after sustained improvement
|
|
101
|
+
if (this.levelOrdinal(rawLevel) < this.levelOrdinal(this.currentLevel)) {
|
|
102
|
+
this.belowThresholdCount++;
|
|
103
|
+
if (this.belowThresholdCount >= config_1.HMSH_DURESS_HYSTERESIS_COUNT) {
|
|
104
|
+
this.currentLevel = rawLevel;
|
|
105
|
+
this.belowThresholdCount = 0;
|
|
106
|
+
}
|
|
107
|
+
// Keep current (higher) level until hysteresis clears
|
|
108
|
+
}
|
|
109
|
+
else {
|
|
110
|
+
// Same or worse: reset hysteresis counter, adopt immediately
|
|
111
|
+
this.belowThresholdCount = 0;
|
|
112
|
+
this.currentLevel = rawLevel;
|
|
113
|
+
}
|
|
114
|
+
this.duressThrottle =
|
|
115
|
+
this.currentLevel === 'healthy'
|
|
116
|
+
? 0
|
|
117
|
+
: this.scoreToThrottle(maxEma, this.currentLevel);
|
|
118
|
+
return {
|
|
119
|
+
level: this.currentLevel,
|
|
120
|
+
score_ms: Math.round(maxEma),
|
|
121
|
+
throttle_ms: this.duressThrottle,
|
|
122
|
+
per_type: perType,
|
|
123
|
+
};
|
|
124
|
+
}
|
|
125
|
+
getDuressThrottle() {
|
|
126
|
+
return this.duressThrottle;
|
|
127
|
+
}
|
|
128
|
+
getCurrentLevel() {
|
|
129
|
+
return this.currentLevel;
|
|
130
|
+
}
|
|
131
|
+
/**
|
|
132
|
+
* Apply a duress snapshot received from another engine via quorum.
|
|
133
|
+
* Adopts the remote signal only if it indicates worse duress than local.
|
|
134
|
+
*/
|
|
135
|
+
applyRemoteDuress(throttleMs, level) {
|
|
136
|
+
if (this.levelOrdinal(level) > this.levelOrdinal(this.currentLevel)) {
|
|
137
|
+
this.currentLevel = level;
|
|
138
|
+
this.duressThrottle = throttleMs;
|
|
139
|
+
this.belowThresholdCount = 0;
|
|
140
|
+
}
|
|
141
|
+
}
|
|
142
|
+
/**
|
|
143
|
+
* Whether a quorum broadcast is warranted.
|
|
144
|
+
* Rate-limited and only fires when level changes or duress is active.
|
|
145
|
+
*/
|
|
146
|
+
shouldBroadcast() {
|
|
147
|
+
const now = Date.now();
|
|
148
|
+
if (now - this.lastBroadcastAt < config_1.HMSH_DURESS_BROADCAST_INTERVAL_MS) {
|
|
149
|
+
return false;
|
|
150
|
+
}
|
|
151
|
+
return (this.currentLevel !== this.lastBroadcastLevel ||
|
|
152
|
+
this.currentLevel !== 'healthy');
|
|
153
|
+
}
|
|
154
|
+
markBroadcast() {
|
|
155
|
+
this.lastBroadcastAt = Date.now();
|
|
156
|
+
this.lastBroadcastLevel = this.currentLevel;
|
|
157
|
+
}
|
|
158
|
+
/**
|
|
159
|
+
* Returns a snapshot for inclusion in quorum rollcall profiles.
|
|
160
|
+
*/
|
|
161
|
+
getSnapshot() {
|
|
162
|
+
let maxEma = 0;
|
|
163
|
+
const perType = {};
|
|
164
|
+
for (const [type, ema] of this.emas) {
|
|
165
|
+
perType[type] = Math.round(ema);
|
|
166
|
+
if (ema > maxEma)
|
|
167
|
+
maxEma = ema;
|
|
168
|
+
}
|
|
169
|
+
return {
|
|
170
|
+
level: this.currentLevel,
|
|
171
|
+
score_ms: Math.round(maxEma),
|
|
172
|
+
throttle_ms: this.duressThrottle,
|
|
173
|
+
per_type: perType,
|
|
174
|
+
};
|
|
175
|
+
}
|
|
176
|
+
// --- Private helpers ---
|
|
177
|
+
scoreToLevel(ms) {
|
|
178
|
+
if (ms < config_1.HMSH_DURESS_HEALTHY_CEILING_MS)
|
|
179
|
+
return 'healthy';
|
|
180
|
+
if (ms < config_1.HMSH_DURESS_MILD_CEILING_MS)
|
|
181
|
+
return 'mild';
|
|
182
|
+
if (ms < config_1.HMSH_DURESS_MODERATE_CEILING_MS)
|
|
183
|
+
return 'moderate';
|
|
184
|
+
return 'severe';
|
|
185
|
+
}
|
|
186
|
+
scoreToThrottle(ms, level) {
|
|
187
|
+
// Linear interpolation within the band for the given level
|
|
188
|
+
switch (level) {
|
|
189
|
+
case 'healthy':
|
|
190
|
+
return 0;
|
|
191
|
+
case 'mild':
|
|
192
|
+
return this.lerp(ms, config_1.HMSH_DURESS_HEALTHY_CEILING_MS, config_1.HMSH_DURESS_MILD_CEILING_MS, MILD_THROTTLE_MIN, MILD_THROTTLE_MAX);
|
|
193
|
+
case 'moderate':
|
|
194
|
+
return this.lerp(ms, config_1.HMSH_DURESS_MILD_CEILING_MS, config_1.HMSH_DURESS_MODERATE_CEILING_MS, MODERATE_THROTTLE_MIN, MODERATE_THROTTLE_MAX);
|
|
195
|
+
case 'severe':
|
|
196
|
+
// Clamp to severe band max; beyond the ceiling is still severe
|
|
197
|
+
return this.lerp(ms, config_1.HMSH_DURESS_MODERATE_CEILING_MS, config_1.HMSH_DURESS_MODERATE_CEILING_MS * 2, SEVERE_THROTTLE_MIN, SEVERE_THROTTLE_MAX);
|
|
198
|
+
}
|
|
199
|
+
}
|
|
200
|
+
lerp(value, inMin, inMax, outMin, outMax) {
|
|
201
|
+
const t = Math.min(Math.max((value - inMin) / (inMax - inMin), 0), 1);
|
|
202
|
+
return Math.round(outMin + t * (outMax - outMin));
|
|
203
|
+
}
|
|
204
|
+
levelOrdinal(level) {
|
|
205
|
+
switch (level) {
|
|
206
|
+
case 'healthy':
|
|
207
|
+
return 0;
|
|
208
|
+
case 'mild':
|
|
209
|
+
return 1;
|
|
210
|
+
case 'moderate':
|
|
211
|
+
return 2;
|
|
212
|
+
case 'severe':
|
|
213
|
+
return 3;
|
|
214
|
+
}
|
|
215
|
+
}
|
|
216
|
+
}
|
|
217
|
+
exports.DuressManager = DuressManager;
|
|
@@ -2,7 +2,9 @@
|
|
|
2
2
|
import { ILogger } from '../logger';
|
|
3
3
|
import { StreamService } from '../stream';
|
|
4
4
|
import { RouterConfig, StreamData, StreamDataResponse, StreamRole } from '../../types/stream';
|
|
5
|
+
import { DuressLevel } from '../../types/quorum';
|
|
5
6
|
import { ProviderClient, ProviderTransaction } from '../../types/provider';
|
|
7
|
+
import { DuressSnapshot } from './duress';
|
|
6
8
|
declare class Router<S extends StreamService<ProviderClient, ProviderTransaction>> {
|
|
7
9
|
appId: string;
|
|
8
10
|
guid: string;
|
|
@@ -29,6 +31,8 @@ declare class Router<S extends StreamService<ProviderClient, ProviderTransaction
|
|
|
29
31
|
private errorHandler;
|
|
30
32
|
private lifecycleManager;
|
|
31
33
|
private consumptionManager;
|
|
34
|
+
private duressManager?;
|
|
35
|
+
private _pendingDuressSnapshot?;
|
|
32
36
|
constructor(config: RouterConfig, stream: S, logger: ILogger);
|
|
33
37
|
get throttle(): number;
|
|
34
38
|
get shouldConsume(): boolean;
|
|
@@ -49,6 +53,9 @@ declare class Router<S extends StreamService<ProviderClient, ProviderTransaction
|
|
|
49
53
|
structureUnhandledError(input: StreamData, err: Error): StreamDataResponse;
|
|
50
54
|
structureUnacknowledgedError(input: StreamData): StreamDataResponse;
|
|
51
55
|
structureError(input: StreamData, output: StreamDataResponse): StreamDataResponse;
|
|
56
|
+
setDuressCallback(callback: (snapshot: DuressSnapshot) => void): void;
|
|
57
|
+
applyRemoteDuress(throttleMs: number, level: DuressLevel): void;
|
|
58
|
+
getDuressSnapshot(): DuressSnapshot | undefined;
|
|
52
59
|
static stopConsuming(): Promise<void>;
|
|
53
60
|
stopConsuming(): Promise<void>;
|
|
54
61
|
cancelThrottle(): void;
|
|
@@ -1,12 +1,14 @@
|
|
|
1
1
|
"use strict";
|
|
2
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
3
|
exports.Router = void 0;
|
|
4
|
+
const stream_1 = require("../../types/stream");
|
|
4
5
|
// Import the new submodules
|
|
5
6
|
const config_1 = require("./config");
|
|
6
7
|
const throttling_1 = require("./throttling");
|
|
7
8
|
const error_handling_1 = require("./error-handling");
|
|
8
9
|
const lifecycle_1 = require("./lifecycle");
|
|
9
10
|
const consumption_1 = require("./consumption");
|
|
11
|
+
const duress_1 = require("./duress");
|
|
10
12
|
class Router {
|
|
11
13
|
constructor(config, stream, logger) {
|
|
12
14
|
// Legacy properties for backward compatibility
|
|
@@ -34,7 +36,11 @@ class Router {
|
|
|
34
36
|
this.throttleManager = new throttling_1.ThrottleManager(enhancedConfig.throttle);
|
|
35
37
|
this.errorHandler = new error_handling_1.ErrorHandler();
|
|
36
38
|
this.lifecycleManager = new lifecycle_1.LifecycleManager(this.readonly, this.topic, this.logger, this.stream);
|
|
37
|
-
|
|
39
|
+
// Engine routers get duress detection; workers do not
|
|
40
|
+
if (this.role === stream_1.StreamRole.ENGINE) {
|
|
41
|
+
this.duressManager = new duress_1.DuressManager();
|
|
42
|
+
}
|
|
43
|
+
this.consumptionManager = new consumption_1.ConsumptionManager(this.stream, this.logger, this.throttleManager, this.errorHandler, this.lifecycleManager, this.reclaimDelay, this.reclaimCount, this.appId, this.role, this, this.retry, this.duressManager);
|
|
38
44
|
this.resetThrottleState();
|
|
39
45
|
}
|
|
40
46
|
// Legacy compatibility methods
|
|
@@ -99,6 +105,17 @@ class Router {
|
|
|
99
105
|
structureError(input, output) {
|
|
100
106
|
return this.errorHandler.structureError(input, output);
|
|
101
107
|
}
|
|
108
|
+
// Duress detection methods (engine routers only)
|
|
109
|
+
setDuressCallback(callback) {
|
|
110
|
+
this.consumptionManager.setDuressCallback(callback);
|
|
111
|
+
}
|
|
112
|
+
applyRemoteDuress(throttleMs, level) {
|
|
113
|
+
this.duressManager?.applyRemoteDuress(throttleMs, level);
|
|
114
|
+
this.throttleManager.setDuressFloor(throttleMs);
|
|
115
|
+
}
|
|
116
|
+
getDuressSnapshot() {
|
|
117
|
+
return this.duressManager?.getSnapshot();
|
|
118
|
+
}
|
|
102
119
|
// Static methods for instance management
|
|
103
120
|
static async stopConsuming() {
|
|
104
121
|
return lifecycle_1.InstanceRegistry.stopAll();
|
|
@@ -1,11 +1,39 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Elastic throttle with two independent inputs:
|
|
3
|
+
*
|
|
4
|
+
* 1. **User throttle** — set explicitly via quorum `throttle` command.
|
|
5
|
+
* Absolute value: 0 = resume, >0 = delay per message, -1 = pause.
|
|
6
|
+
*
|
|
7
|
+
* 2. **Duress floor** — set automatically by the DuressManager based on
|
|
8
|
+
* processing latency. The effective throttle is `max(user, duress)`,
|
|
9
|
+
* so duress never reduces below what the user set, and pause always
|
|
10
|
+
* takes precedence. When duress clears (floor returns to 0), the
|
|
11
|
+
* user's original throttle remains in effect.
|
|
12
|
+
*
|
|
13
|
+
* `customSleep()` uses the effective throttle, supports dynamic
|
|
14
|
+
* interruption (if the throttle decreases mid-sleep, the router wakes
|
|
15
|
+
* early), and handles pause via a bare promise with no timer.
|
|
16
|
+
*/
|
|
1
17
|
export declare class ThrottleManager {
|
|
2
18
|
private throttle;
|
|
19
|
+
private duressFloor;
|
|
3
20
|
private isSleeping;
|
|
4
21
|
private sleepPromiseResolve;
|
|
5
22
|
private innerPromiseResolve;
|
|
6
23
|
private sleepTimeout;
|
|
7
24
|
constructor(initialThrottle?: number);
|
|
8
25
|
getThrottle(): number;
|
|
26
|
+
/**
|
|
27
|
+
* Set the duress-computed throttle floor. The effective throttle
|
|
28
|
+
* is max(userThrottle, duressFloor). Pause (throttle < 0) overrides.
|
|
29
|
+
*/
|
|
30
|
+
setDuressFloor(delayMs: number): void;
|
|
31
|
+
getDuressFloor(): number;
|
|
32
|
+
/**
|
|
33
|
+
* Returns the effective throttle: max of user-set throttle and
|
|
34
|
+
* duress floor. Pause (negative) always takes precedence.
|
|
35
|
+
*/
|
|
36
|
+
getEffectiveThrottle(): number;
|
|
9
37
|
setThrottle(delayInMillis: number): void;
|
|
10
38
|
isPaused(): boolean;
|
|
11
39
|
/**
|
|
@@ -1,9 +1,26 @@
|
|
|
1
1
|
"use strict";
|
|
2
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
3
|
exports.ThrottleManager = void 0;
|
|
4
|
+
/**
|
|
5
|
+
* Elastic throttle with two independent inputs:
|
|
6
|
+
*
|
|
7
|
+
* 1. **User throttle** — set explicitly via quorum `throttle` command.
|
|
8
|
+
* Absolute value: 0 = resume, >0 = delay per message, -1 = pause.
|
|
9
|
+
*
|
|
10
|
+
* 2. **Duress floor** — set automatically by the DuressManager based on
|
|
11
|
+
* processing latency. The effective throttle is `max(user, duress)`,
|
|
12
|
+
* so duress never reduces below what the user set, and pause always
|
|
13
|
+
* takes precedence. When duress clears (floor returns to 0), the
|
|
14
|
+
* user's original throttle remains in effect.
|
|
15
|
+
*
|
|
16
|
+
* `customSleep()` uses the effective throttle, supports dynamic
|
|
17
|
+
* interruption (if the throttle decreases mid-sleep, the router wakes
|
|
18
|
+
* early), and handles pause via a bare promise with no timer.
|
|
19
|
+
*/
|
|
4
20
|
class ThrottleManager {
|
|
5
21
|
constructor(initialThrottle = 0) {
|
|
6
22
|
this.throttle = 0;
|
|
23
|
+
this.duressFloor = 0;
|
|
7
24
|
this.isSleeping = false;
|
|
8
25
|
this.sleepPromiseResolve = null;
|
|
9
26
|
this.innerPromiseResolve = null;
|
|
@@ -13,6 +30,25 @@ class ThrottleManager {
|
|
|
13
30
|
getThrottle() {
|
|
14
31
|
return this.throttle;
|
|
15
32
|
}
|
|
33
|
+
/**
|
|
34
|
+
* Set the duress-computed throttle floor. The effective throttle
|
|
35
|
+
* is max(userThrottle, duressFloor). Pause (throttle < 0) overrides.
|
|
36
|
+
*/
|
|
37
|
+
setDuressFloor(delayMs) {
|
|
38
|
+
this.duressFloor = Math.max(0, delayMs);
|
|
39
|
+
}
|
|
40
|
+
getDuressFloor() {
|
|
41
|
+
return this.duressFloor;
|
|
42
|
+
}
|
|
43
|
+
/**
|
|
44
|
+
* Returns the effective throttle: max of user-set throttle and
|
|
45
|
+
* duress floor. Pause (negative) always takes precedence.
|
|
46
|
+
*/
|
|
47
|
+
getEffectiveThrottle() {
|
|
48
|
+
if (this.throttle < 0)
|
|
49
|
+
return this.throttle; // pause overrides
|
|
50
|
+
return Math.max(this.throttle, this.duressFloor);
|
|
51
|
+
}
|
|
16
52
|
setThrottle(delayInMillis) {
|
|
17
53
|
const wasPaused = this.throttle < 0;
|
|
18
54
|
const wasDecreased = delayInMillis < this.throttle;
|
|
@@ -45,12 +81,13 @@ class ThrottleManager {
|
|
|
45
81
|
* setThrottle() is called with a non-negative value.
|
|
46
82
|
*/
|
|
47
83
|
async customSleep() {
|
|
48
|
-
|
|
84
|
+
const effective = this.getEffectiveThrottle();
|
|
85
|
+
if (effective === 0)
|
|
49
86
|
return;
|
|
50
87
|
if (this.isSleeping)
|
|
51
88
|
return;
|
|
52
89
|
this.isSleeping = true;
|
|
53
|
-
if (
|
|
90
|
+
if (effective < 0) {
|
|
54
91
|
// Paused: wait indefinitely until setThrottle interrupts
|
|
55
92
|
await new Promise((resolve) => {
|
|
56
93
|
this.innerPromiseResolve = resolve;
|
|
@@ -62,12 +99,14 @@ class ThrottleManager {
|
|
|
62
99
|
await new Promise(async (outerResolve) => {
|
|
63
100
|
this.sleepPromiseResolve = outerResolve;
|
|
64
101
|
let elapsedTime = Date.now() - startTime;
|
|
65
|
-
|
|
102
|
+
let target = this.getEffectiveThrottle();
|
|
103
|
+
while (elapsedTime < target && target > 0) {
|
|
66
104
|
await new Promise((innerResolve) => {
|
|
67
105
|
this.innerPromiseResolve = innerResolve;
|
|
68
|
-
this.sleepTimeout = setTimeout(innerResolve,
|
|
106
|
+
this.sleepTimeout = setTimeout(innerResolve, target - elapsedTime);
|
|
69
107
|
});
|
|
70
108
|
elapsedTime = Date.now() - startTime;
|
|
109
|
+
target = this.getEffectiveThrottle();
|
|
71
110
|
}
|
|
72
111
|
this.resetThrottleState();
|
|
73
112
|
outerResolve();
|
|
@@ -31,8 +31,10 @@ async function deploySchema(streamClient, appId, logger) {
|
|
|
31
31
|
}
|
|
32
32
|
await client.query('COMMIT');
|
|
33
33
|
}
|
|
34
|
-
// Always run index migrations under the lock
|
|
34
|
+
// Always run index, procedure, and trigger migrations under the lock
|
|
35
35
|
await ensureIndexes(client, schemaName);
|
|
36
|
+
await ensureProcedures(client, schemaName);
|
|
37
|
+
await ensureStatementLevelTriggers(client, schemaName);
|
|
36
38
|
}
|
|
37
39
|
finally {
|
|
38
40
|
await client.query('SELECT pg_advisory_unlock($1)', [lockId]);
|
|
@@ -129,7 +131,12 @@ async function waitForTablesCreation(streamClient, lockId, schemaName, logger) {
|
|
|
129
131
|
async function ensureIndexes(client, schemaName) {
|
|
130
132
|
const engineTable = `${schemaName}.engine_streams`;
|
|
131
133
|
const workerTable = `${schemaName}.worker_streams`;
|
|
132
|
-
// Drop legacy indexes that don't include the priority column
|
|
134
|
+
// Drop legacy indexes that don't include the priority column, plus
|
|
135
|
+
// redundant ones: idx_*_expired_at duplicates the partial
|
|
136
|
+
// idx_*_processed_volume for the retention purge, and
|
|
137
|
+
// idx_*_stream_name_expired_at duplicates the leading column and
|
|
138
|
+
// predicate of idx_*_message_fetch. Every index here is maintained on
|
|
139
|
+
// each message's INSERT plus two non-HOT UPDATEs (reserve, ack).
|
|
133
140
|
for (const idx of [
|
|
134
141
|
'idx_engine_streams_dequeue',
|
|
135
142
|
'idx_engine_streams_stale_reservations',
|
|
@@ -139,6 +146,10 @@ async function ensureIndexes(client, schemaName) {
|
|
|
139
146
|
'idx_engine_streams_message_fetch',
|
|
140
147
|
'idx_worker_streams_active_messages',
|
|
141
148
|
'idx_worker_streams_message_fetch',
|
|
149
|
+
'idx_engine_streams_expired_at',
|
|
150
|
+
'idx_engine_stream_name_expired_at',
|
|
151
|
+
'idx_worker_streams_expired_at',
|
|
152
|
+
'idx_worker_stream_name_expired_at',
|
|
142
153
|
]) {
|
|
143
154
|
await client.query(`DROP INDEX IF EXISTS ${schemaName}.${idx}`);
|
|
144
155
|
}
|
|
@@ -148,9 +159,13 @@ async function ensureIndexes(client, schemaName) {
|
|
|
148
159
|
ON ${engineTable} (stream_name, priority DESC, visible_at, id)
|
|
149
160
|
WHERE reserved_at IS NULL AND expired_at IS NULL;
|
|
150
161
|
`);
|
|
162
|
+
// message_fetch must match the dequeue ORDER BY (priority DESC, id)
|
|
163
|
+
// exactly — placing visible_at between them forces the claim query to
|
|
164
|
+
// fetch and sort the entire pending backlog instead of stopping at
|
|
165
|
+
// LIMIT. visible_at and stale-reservation checks are scan filters.
|
|
151
166
|
await client.query(`
|
|
152
167
|
CREATE INDEX IF NOT EXISTS idx_engine_streams_message_fetch
|
|
153
|
-
ON ${engineTable} (stream_name, priority DESC,
|
|
168
|
+
ON ${engineTable} (stream_name, priority DESC, id)
|
|
154
169
|
WHERE expired_at IS NULL;
|
|
155
170
|
`);
|
|
156
171
|
await client.query(`
|
|
@@ -160,7 +175,7 @@ async function ensureIndexes(client, schemaName) {
|
|
|
160
175
|
`);
|
|
161
176
|
await client.query(`
|
|
162
177
|
CREATE INDEX IF NOT EXISTS idx_worker_streams_message_fetch
|
|
163
|
-
ON ${workerTable} (stream_name, priority DESC,
|
|
178
|
+
ON ${workerTable} (stream_name, priority DESC, id)
|
|
164
179
|
WHERE expired_at IS NULL;
|
|
165
180
|
`);
|
|
166
181
|
// v0.18.0: add jid column to engine_streams for job tracing
|
|
@@ -171,6 +186,35 @@ async function ensureIndexes(client, schemaName) {
|
|
|
171
186
|
WHERE jid != '';
|
|
172
187
|
`);
|
|
173
188
|
}
|
|
189
|
+
/**
|
|
190
|
+
* Re-deploy the SECURITY DEFINER stored procedures on existing
|
|
191
|
+
* databases so query changes (e.g., worker_dequeue) reach deployments
|
|
192
|
+
* created before the change. CREATE OR REPLACE preserves grants.
|
|
193
|
+
*/
|
|
194
|
+
async function ensureProcedures(client, schemaName) {
    // Run every statement produced for this schema, one at a time and in
    // the order given.
    const statements = (0, procedures_1.getCreateProceduresSQL)(schemaName);
    for (const statement of statements) {
        await client.query(statement);
    }
}
|
|
199
|
+
/**
|
|
200
|
+
* Migrate pre-existing row-level notification triggers to the
|
|
201
|
+
* statement-level form. Recreating a trigger takes an ACCESS EXCLUSIVE
|
|
202
|
+
* lock on the table, so only do it when the installed trigger is still
|
|
203
|
+
* row-level (tgtype bit 0 set); subsequent boots are a no-op.
|
|
204
|
+
*/
|
|
205
|
+
async function ensureStatementLevelTriggers(client, schemaName) {
    // Count the notification triggers in this schema that are still
    // row-level (tgtype bit 0 set).
    const rowLevelCountSql = `SELECT count(*) AS row_level
       FROM pg_trigger t
       JOIN pg_class c ON c.oid = t.tgrelid
       JOIN pg_namespace n ON n.oid = c.relnamespace
      WHERE n.nspname = $1
        AND c.relname IN ('engine_streams', 'worker_streams')
        AND t.tgname IN ('notify_engine_stream_insert', 'notify_worker_stream_insert')
        AND (t.tgtype & 1) = 1`;
    const { rows } = await client.query(rowLevelCountSql, [schemaName]);
    const rowLevelCount = parseInt(rows[0].row_level, 10);
    if (!(rowLevelCount > 0)) {
        // Nothing left to migrate.
        return;
    }
    await createNotificationTriggers(client, schemaName);
}
|
|
174
218
|
async function createTables(client, schemaName) {
|
|
175
219
|
await client.query(`CREATE SCHEMA IF NOT EXISTS ${schemaName};`);
|
|
176
220
|
// ---- ENGINE_STREAMS table ----
|
|
@@ -210,16 +254,7 @@ async function createTables(client, schemaName) {
|
|
|
210
254
|
`);
|
|
211
255
|
await client.query(`
|
|
212
256
|
CREATE INDEX IF NOT EXISTS idx_engine_streams_message_fetch
|
|
213
|
-
ON ${engineTable} (stream_name, priority DESC,
|
|
214
|
-
WHERE expired_at IS NULL;
|
|
215
|
-
`);
|
|
216
|
-
await client.query(`
|
|
217
|
-
CREATE INDEX IF NOT EXISTS idx_engine_streams_expired_at
|
|
218
|
-
ON ${engineTable} (expired_at);
|
|
219
|
-
`);
|
|
220
|
-
await client.query(`
|
|
221
|
-
CREATE INDEX IF NOT EXISTS idx_engine_stream_name_expired_at
|
|
222
|
-
ON ${engineTable} (stream_name)
|
|
257
|
+
ON ${engineTable} (stream_name, priority DESC, id)
|
|
223
258
|
WHERE expired_at IS NULL;
|
|
224
259
|
`);
|
|
225
260
|
await client.query(`
|
|
@@ -280,16 +315,7 @@ async function createTables(client, schemaName) {
|
|
|
280
315
|
`);
|
|
281
316
|
await client.query(`
|
|
282
317
|
CREATE INDEX IF NOT EXISTS idx_worker_streams_message_fetch
|
|
283
|
-
ON ${workerTable} (stream_name, priority DESC,
|
|
284
|
-
WHERE expired_at IS NULL;
|
|
285
|
-
`);
|
|
286
|
-
await client.query(`
|
|
287
|
-
CREATE INDEX IF NOT EXISTS idx_worker_streams_expired_at
|
|
288
|
-
ON ${workerTable} (expired_at);
|
|
289
|
-
`);
|
|
290
|
-
await client.query(`
|
|
291
|
-
CREATE INDEX IF NOT EXISTS idx_worker_stream_name_expired_at
|
|
292
|
-
ON ${workerTable} (stream_name)
|
|
318
|
+
ON ${workerTable} (stream_name, priority DESC, id)
|
|
293
319
|
WHERE expired_at IS NULL;
|
|
294
320
|
`);
|
|
295
321
|
await client.query(`
|
|
@@ -342,28 +368,35 @@ async function createNotificationTriggers(client, schemaName) {
|
|
|
342
368
|
const engineTable = `${schemaName}.engine_streams`;
|
|
343
369
|
const workerTable = `${schemaName}.worker_streams`;
|
|
344
370
|
// ---- ENGINE notification trigger ----
|
|
371
|
+
// Statement-level with a transition table: one pg_notify per distinct
|
|
372
|
+
// stream_name per INSERT statement. Row-level triggers fire pg_notify
|
|
373
|
+
// per message, which both multiplies trigger overhead and serializes
|
|
374
|
+
// commits on the global notification queue lock at high insert rates.
|
|
345
375
|
await client.query(`
|
|
346
376
|
CREATE OR REPLACE FUNCTION ${schemaName}.notify_new_engine_stream_message()
|
|
347
377
|
RETURNS TRIGGER AS $$
|
|
348
378
|
DECLARE
|
|
379
|
+
rec RECORD;
|
|
349
380
|
channel_name TEXT;
|
|
350
381
|
payload JSON;
|
|
351
382
|
BEGIN
|
|
352
|
-
|
|
353
|
-
|
|
383
|
+
FOR rec IN
|
|
384
|
+
SELECT DISTINCT stream_name FROM new_rows WHERE visible_at <= NOW()
|
|
385
|
+
LOOP
|
|
386
|
+
channel_name := 'eng_' || rec.stream_name;
|
|
354
387
|
IF length(channel_name) > 63 THEN
|
|
355
388
|
channel_name := left(channel_name, 63);
|
|
356
389
|
END IF;
|
|
357
390
|
|
|
358
391
|
payload := json_build_object(
|
|
359
|
-
'stream_name',
|
|
392
|
+
'stream_name', rec.stream_name,
|
|
360
393
|
'table_type', 'engine'
|
|
361
394
|
);
|
|
362
395
|
|
|
363
396
|
PERFORM pg_notify(channel_name, payload::text);
|
|
364
|
-
END
|
|
397
|
+
END LOOP;
|
|
365
398
|
|
|
366
|
-
RETURN
|
|
399
|
+
RETURN NULL;
|
|
367
400
|
END;
|
|
368
401
|
$$ LANGUAGE plpgsql;
|
|
369
402
|
`);
|
|
@@ -371,7 +404,8 @@ async function createNotificationTriggers(client, schemaName) {
|
|
|
371
404
|
DROP TRIGGER IF EXISTS notify_engine_stream_insert ON ${engineTable};
|
|
372
405
|
CREATE TRIGGER notify_engine_stream_insert
|
|
373
406
|
AFTER INSERT ON ${engineTable}
|
|
374
|
-
|
|
407
|
+
REFERENCING NEW TABLE AS new_rows
|
|
408
|
+
FOR EACH STATEMENT
|
|
375
409
|
EXECUTE FUNCTION ${schemaName}.notify_new_engine_stream_message();
|
|
376
410
|
`);
|
|
377
411
|
// ---- WORKER notification trigger ----
|
|
@@ -379,24 +413,27 @@ async function createNotificationTriggers(client, schemaName) {
|
|
|
379
413
|
CREATE OR REPLACE FUNCTION ${schemaName}.notify_new_worker_stream_message()
|
|
380
414
|
RETURNS TRIGGER AS $$
|
|
381
415
|
DECLARE
|
|
416
|
+
rec RECORD;
|
|
382
417
|
channel_name TEXT;
|
|
383
418
|
payload JSON;
|
|
384
419
|
BEGIN
|
|
385
|
-
|
|
386
|
-
|
|
420
|
+
FOR rec IN
|
|
421
|
+
SELECT DISTINCT stream_name FROM new_rows WHERE visible_at <= NOW()
|
|
422
|
+
LOOP
|
|
423
|
+
channel_name := 'wrk_' || rec.stream_name;
|
|
387
424
|
IF length(channel_name) > 63 THEN
|
|
388
425
|
channel_name := left(channel_name, 63);
|
|
389
426
|
END IF;
|
|
390
427
|
|
|
391
428
|
payload := json_build_object(
|
|
392
|
-
'stream_name',
|
|
429
|
+
'stream_name', rec.stream_name,
|
|
393
430
|
'table_type', 'worker'
|
|
394
431
|
);
|
|
395
432
|
|
|
396
433
|
PERFORM pg_notify(channel_name, payload::text);
|
|
397
|
-
END
|
|
434
|
+
END LOOP;
|
|
398
435
|
|
|
399
|
-
RETURN
|
|
436
|
+
RETURN NULL;
|
|
400
437
|
END;
|
|
401
438
|
$$ LANGUAGE plpgsql;
|
|
402
439
|
`);
|
|
@@ -404,7 +441,8 @@ async function createNotificationTriggers(client, schemaName) {
|
|
|
404
441
|
DROP TRIGGER IF EXISTS notify_worker_stream_insert ON ${workerTable};
|
|
405
442
|
CREATE TRIGGER notify_worker_stream_insert
|
|
406
443
|
AFTER INSERT ON ${workerTable}
|
|
407
|
-
|
|
444
|
+
REFERENCING NEW TABLE AS new_rows
|
|
445
|
+
FOR EACH STATEMENT
|
|
408
446
|
EXECUTE FUNCTION ${schemaName}.notify_new_worker_stream_message();
|
|
409
447
|
`);
|
|
410
448
|
// ---- Visibility timeout notification function (queries both tables) ----
|
|
@@ -215,18 +215,24 @@ async function fetchMessages(client, tableName, streamName, isEngine, consumerNa
|
|
|
215
215
|
const maxRetries = options?.maxRetries ?? 3;
|
|
216
216
|
let backoff = initialBackoff;
|
|
217
217
|
let retries = 0;
|
|
218
|
-
// Include workflow_name in RETURNING for worker streams
|
|
218
|
+
// Include workflow_name in RETURNING for worker streams. Columns are
|
|
219
|
+
// qualified with the update target's alias because the claim UPDATE
|
|
220
|
+
// joins a CTE that also exposes an id column.
|
|
219
221
|
const returningClause = isEngine
|
|
220
|
-
? 'id, message, max_retry_attempts, backoff_coefficient, maximum_interval_seconds, retry_attempt'
|
|
221
|
-
: 'id, message, workflow_name, max_retry_attempts, backoff_coefficient, maximum_interval_seconds, retry_attempt';
|
|
222
|
+
? 't.id, t.message, t.max_retry_attempts, t.backoff_coefficient, t.maximum_interval_seconds, t.retry_attempt'
|
|
223
|
+
: 't.id, t.message, t.workflow_name, t.max_retry_attempts, t.backoff_coefficient, t.maximum_interval_seconds, t.retry_attempt';
|
|
222
224
|
try {
|
|
223
225
|
while (retries < maxRetries) {
|
|
224
226
|
retries++;
|
|
225
227
|
const batchSize = options?.batchSize || 1;
|
|
226
228
|
const reservationTimeout = options?.reservationTimeout || (enums_1.HMSH_RESERVATION_TIMEOUT_S + 5);
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
229
|
+
// The locking SELECT must live in a MATERIALIZED CTE: as a plain IN
|
|
230
|
+
// subquery the planner may re-execute it per outer row (rows updated
|
|
231
|
+
// earlier in the same command are skipped as lock candidates), which
|
|
232
|
+
// reserves MORE rows than LIMIT. The UPDATE repeats stream_name so
|
|
233
|
+
// the planner prunes to a single hash partition and joins on the
|
|
234
|
+
// (stream_name, id) primary key.
|
|
235
|
+
const res = await client.query(`WITH candidates AS MATERIALIZED (
|
|
230
236
|
SELECT id FROM ${tableName}
|
|
231
237
|
WHERE stream_name = $1
|
|
232
238
|
AND (reserved_at IS NULL OR reserved_at < NOW() - INTERVAL '${reservationTimeout} seconds')
|
|
@@ -236,6 +242,10 @@ async function fetchMessages(client, tableName, streamName, isEngine, consumerNa
|
|
|
236
242
|
LIMIT $2
|
|
237
243
|
FOR UPDATE SKIP LOCKED
|
|
238
244
|
)
|
|
245
|
+
UPDATE ${tableName} t
|
|
246
|
+
SET reserved_at = NOW(), reserved_by = $3
|
|
247
|
+
FROM candidates
|
|
248
|
+
WHERE t.stream_name = $1 AND t.id = candidates.id
|
|
239
249
|
RETURNING ${returningClause}`, [streamName, batchSize, consumerName]);
|
|
240
250
|
const messages = res.rows.map((row) => {
|
|
241
251
|
const data = (0, utils_1.parseStreamMessage)(row.message);
|
|
@@ -36,6 +36,13 @@ declare class PostgresStreamService extends StreamService<PostgresClientType & P
|
|
|
36
36
|
init(namespace: string, appId: string, logger: ILogger): Promise<void>;
|
|
37
37
|
private isNotificationsEnabled;
|
|
38
38
|
private checkForMissedMessages;
|
|
39
|
+
/**
|
|
40
|
+
* Notification-driven fetch with coalescing. NOTIFYs that arrive while
|
|
41
|
+
* a fetch is in flight set fetchPending instead of issuing concurrent
|
|
42
|
+
* claim queries (a burst of N inserts otherwise triggers N claims per
|
|
43
|
+
* consumer, most returning empty). The drain loop re-fetches while the
|
|
44
|
+
* batch came back full or a NOTIFY arrived mid-fetch.
|
|
45
|
+
*/
|
|
39
46
|
private fetchAndDeliverMessages;
|
|
40
47
|
private getConsumerKey;
|
|
41
48
|
/**
|
|
@@ -82,11 +82,31 @@ class PostgresStreamService extends index_1.StreamService {
|
|
|
82
82
|
return await instance.fetchMessages(consumer.streamName, consumer.groupName, consumer.consumerName, { batchSize: 10, reservationTimeout: instance.reservationTimeout, enableBackoff: false, maxRetries: 1 });
|
|
83
83
|
});
|
|
84
84
|
}
|
|
85
|
+
/**
|
|
86
|
+
* Notification-driven fetch with coalescing. NOTIFYs that arrive while
|
|
87
|
+
* a fetch is in flight set fetchPending instead of issuing concurrent
|
|
88
|
+
* claim queries (a burst of N inserts otherwise triggers N claims per
|
|
89
|
+
* consumer, most returning empty). The drain loop re-fetches while the
|
|
90
|
+
* batch came back full or a NOTIFY arrived mid-fetch.
|
|
91
|
+
*/
|
|
85
92
|
async fetchAndDeliverMessages(consumer) {
|
|
93
|
+
if (consumer.fetchInFlight) {
|
|
94
|
+
consumer.fetchPending = true;
|
|
95
|
+
return;
|
|
96
|
+
}
|
|
97
|
+
consumer.fetchInFlight = true;
|
|
98
|
+
const batchSize = 10;
|
|
86
99
|
try {
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
consumer.
|
|
100
|
+
let drain = true;
|
|
101
|
+
while (drain && consumer.isListening !== false) {
|
|
102
|
+
consumer.fetchPending = false;
|
|
103
|
+
const messages = await this.fetchMessages(consumer.streamName, consumer.groupName, consumer.consumerName, { batchSize, reservationTimeout: this.reservationTimeout, enableBackoff: false, maxRetries: 1 });
|
|
104
|
+
if (messages.length > 0) {
|
|
105
|
+
consumer.callback(messages);
|
|
106
|
+
}
|
|
107
|
+
// Boolean() rather than === true: fetchPending is mutated by the
|
|
108
|
+
// notification handler across the await, which TS narrowing misses
|
|
109
|
+
drain = messages.length === batchSize || Boolean(consumer.fetchPending);
|
|
90
110
|
}
|
|
91
111
|
}
|
|
92
112
|
catch (error) {
|
|
@@ -96,6 +116,9 @@ class PostgresStreamService extends index_1.StreamService {
|
|
|
96
116
|
error,
|
|
97
117
|
});
|
|
98
118
|
}
|
|
119
|
+
finally {
|
|
120
|
+
consumer.fetchInFlight = false;
|
|
121
|
+
}
|
|
99
122
|
}
|
|
100
123
|
getConsumerKey(streamName, groupName) {
|
|
101
124
|
return `${streamName}:${groupName}`;
|
|
@@ -54,10 +54,12 @@ function getCreateProceduresSQL(schemaName) {
|
|
|
54
54
|
SET search_path = ${schemaName}, pg_temp
|
|
55
55
|
AS $$
|
|
56
56
|
${STREAM_ACCESS_CHECK}
|
|
57
|
+
-- The locking SELECT must live in a MATERIALIZED CTE: as a plain IN
|
|
58
|
+
-- subquery the planner may re-execute it per outer row, reserving
|
|
59
|
+
-- MORE rows than p_batch_size. stream_name on the UPDATE prunes to
|
|
60
|
+
-- a single hash partition and joins on the primary key.
|
|
57
61
|
RETURN QUERY
|
|
58
|
-
UPDATE ${workerTable} ws
|
|
59
|
-
SET reserved_at = NOW(), reserved_by = p_consumer_id
|
|
60
|
-
WHERE ws.id IN (
|
|
62
|
+
WITH candidates AS MATERIALIZED (
|
|
61
63
|
SELECT ws2.id FROM ${workerTable} ws2
|
|
62
64
|
WHERE ws2.stream_name = p_stream_name
|
|
63
65
|
AND (ws2.reserved_at IS NULL OR ws2.reserved_at < NOW() - (p_reservation_timeout_sec || ' seconds')::INTERVAL)
|
|
@@ -67,6 +69,11 @@ function getCreateProceduresSQL(schemaName) {
|
|
|
67
69
|
LIMIT p_batch_size
|
|
68
70
|
FOR UPDATE SKIP LOCKED
|
|
69
71
|
)
|
|
72
|
+
UPDATE ${workerTable} ws
|
|
73
|
+
SET reserved_at = NOW(), reserved_by = p_consumer_id
|
|
74
|
+
FROM candidates
|
|
75
|
+
WHERE ws.stream_name = p_stream_name
|
|
76
|
+
AND ws.id = candidates.id
|
|
70
77
|
RETURNING ws.id, ws.message, ws.workflow_name, ws.max_retry_attempts,
|
|
71
78
|
ws.backoff_coefficient, ws.maximum_interval_seconds, ws.retry_attempt;
|
|
72
79
|
END;
|
package/build/types/quorum.d.ts
CHANGED
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
import { JobOutput } from './job';
|
|
2
2
|
import { StringAnyType } from './serializer';
|
|
3
|
+
/** Duress severity level for adaptive engine throttling. */
|
|
4
|
+
export type DuressLevel = 'healthy' | 'mild' | 'moderate' | 'severe';
|
|
3
5
|
export interface CPULoad {
|
|
4
6
|
[cpu: string]: string;
|
|
5
7
|
}
|
|
@@ -86,6 +88,12 @@ export interface QuorumProfile {
|
|
|
86
88
|
system?: SystemHealth;
|
|
87
89
|
/** Stringified worker callback function (only if `signature: true` in rollcall). */
|
|
88
90
|
signature?: string;
|
|
91
|
+
/** Current duress level. Engine routers only. */
|
|
92
|
+
duress_level?: DuressLevel;
|
|
93
|
+
/** Current duress score in ms (max EMA across message types). Engine routers only. */
|
|
94
|
+
duress_score_ms?: number;
|
|
95
|
+
/** Per-message-type EMA latencies in ms. Engine routers only. */
|
|
96
|
+
duress_per_type?: Record<string, number>;
|
|
89
97
|
}
|
|
90
98
|
interface QuorumMessageBase {
|
|
91
99
|
entity?: string;
|
|
@@ -138,6 +146,17 @@ export interface ThrottleMessage extends QuorumMessageBase {
|
|
|
138
146
|
topic?: string;
|
|
139
147
|
throttle: number;
|
|
140
148
|
}
|
|
149
|
+
export interface DuressMessage extends QuorumMessageBase {
|
|
150
|
+
type: 'duress';
|
|
151
|
+
/** GUID of the engine that detected duress */
|
|
152
|
+
originator: string;
|
|
153
|
+
/** Aggregate duress score (max EMA across message types) in ms */
|
|
154
|
+
duress_score_ms: number;
|
|
155
|
+
/** Recommended throttle delay in ms */
|
|
156
|
+
throttle_ms: number;
|
|
157
|
+
/** Duress severity level */
|
|
158
|
+
level: DuressLevel;
|
|
159
|
+
}
|
|
141
160
|
export interface RollCallMessage extends QuorumMessageBase {
|
|
142
161
|
type: 'rollcall';
|
|
143
162
|
guid?: string;
|
|
@@ -169,5 +188,5 @@ export type SubscriptionOptions = {
|
|
|
169
188
|
* These messages serve to coordinate the cache invalidation and switch-over
|
|
170
189
|
* to the new version without any downtime and a coordinating parent server.
|
|
171
190
|
*/
|
|
172
|
-
export type QuorumMessage = PingMessage | PongMessage | ActivateMessage | WorkMessage | JobMessage | ThrottleMessage | RollCallMessage | CronMessage | UserMessage;
|
|
191
|
+
export type QuorumMessage = PingMessage | PongMessage | ActivateMessage | WorkMessage | JobMessage | ThrottleMessage | DuressMessage | RollCallMessage | CronMessage | UserMessage;
|
|
173
192
|
export {};
|
package/build/types/stream.d.ts
CHANGED
|
@@ -300,4 +300,8 @@ export interface NotificationConsumer {
|
|
|
300
300
|
lastFallbackCheck: number;
|
|
301
301
|
/** Service instance that owns this consumer (for fetchAndDeliverMessages dispatch) */
|
|
302
302
|
serviceInstance?: any;
|
|
303
|
+
/** True while a notification-driven fetch is in flight (coalesces concurrent NOTIFYs) */
|
|
304
|
+
fetchInFlight?: boolean;
|
|
305
|
+
/** Set when a NOTIFY arrives mid-fetch; triggers one follow-up fetch */
|
|
306
|
+
fetchPending?: boolean;
|
|
303
307
|
}
|