bulltrackers-module 1.0.338 → 1.0.339
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
--- a/computation-system/helpers/computation_dispatcher.js
+++ b/computation-system/helpers/computation_dispatcher.js
@@ -1,9 +1,8 @@
 /**
  * FILENAME: computation-system/helpers/computation_dispatcher.js
  * PURPOSE: Sequential Cursor-Based Dispatcher.
- *
- * UPDATED:
- * UPDATED: Added Safety Checks to permanently skip Deterministic Failures.
+ * UPDATED: Enforces Strict One-Shot Policy (Standard -> HighMem -> Dead Letter).
+ * UPDATED: Prevents infinite loops by permanently ignoring deterministic failures.
  */
 
 const { getExpectedDateStrings, getEarliestDataDates, normalizeName, DEFINITIVE_EARLIEST_DATES } = require('../utils/utils.js');
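Note: the "one-shot" policy named in the new header gives each task at most two executions: one on the standard tier, and one on high-mem if the first attempt failed for a non-deterministic reason. The sketch below is illustrative only and is not code from the package; the field names (status, resourceTier, error.stage) are the ledger fields used later in this diff, while the values are made up.

// Illustrative ledger snapshots for one task under the one-shot policy (hypothetical values).
const attempt1 = { status: 'FAILED', resourceTier: 'standard', error: { stage: 'FATAL' } }; // crash/OOM -> promoted once to high-mem
const attempt2 = { status: 'FAILED', resourceTier: 'high-mem', error: { stage: 'FATAL' } }; // still failing -> dead letter, never dispatched again
// Deterministic stages (QUALITY_CIRCUIT_BREAKER, SEMANTIC_GATE) skip the high-mem retry
// and are dead-lettered immediately.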
@@ -23,7 +22,7 @@ const STALE_LOCK_THRESHOLD_MS = 1000 * 60 * 15;
 // =============================================================================
 async function filterActiveTasks(db, date, pass, tasks, logger, forceRun = false) {
   if (!tasks || tasks.length === 0) return [];
-  if (forceRun) return tasks;
+  if (forceRun) return tasks;
 
   const checkPromises = tasks.map(async (t) => {
     const taskName = normalizeName(t.name);
@@ -49,21 +48,14 @@ async function filterActiveTasks(db, date, pass, tasks, logger, forceRun = false
         return null;
       }
 
-      // 2.
-
-        data.completedAt &&
-        (Date.now() - new Date(data.completedAt).getTime() < 60 * 1000);
-
-      if (isJustFinished) return null;
+      // 2. COMPLETED CHECK (Ignore)
+      if (data.status === 'COMPLETED') return null;
 
-      // 3.
-      //
+      // 3. FAILED CHECK (Pass through to Route Splitter)
+      // We do NOT filter FAILED tasks here. We pass them to splitRoutes()
+      // which decides if they get promoted to High-Mem or dropped forever.
       if (data.status === 'FAILED') {
-
-        if (['QUALITY_CIRCUIT_BREAKER', 'SEMANTIC_GATE', 'SHARDING_LIMIT_EXCEEDED'].includes(stage)) {
-          if (logger) logger.log('WARN', `[Dispatcher] 🛑 Skipping deterministic failure for ${taskName} (${stage}).`);
-          return null;
-        }
+        return t;
       }
     }
     return t;
@@ -202,7 +194,6 @@ async function handlePassVerification(config, dependencies, computationManifest,
   const pending = [...report.runnable, ...report.reRuns];
 
   if (pending.length > 0) {
-    // Calculate ETA
     const totalWeight = pending.reduce((sum, t) => sum + (weightMap.get(normalizeName(t.name)) || 1.0), 0);
     const eta = Math.max(30, Math.ceil(totalWeight * BASE_SECONDS_PER_WEIGHT_UNIT));
 
@@ -254,8 +245,8 @@ async function handleSweepDispatch(config, dependencies, computationManifest, re
     return { dispatched: 0 };
   }
 
-  // [
-  //
+  // [CRITICAL] FILTER FOR SWEEP:
+  // Only dispatch if it hasn't failed High-Mem or Quality checks.
   const validTasks = [];
   for (const task of pending) {
     const name = normalizeName(task.name);
@@ -264,20 +255,28 @@ async function handleSweepDispatch(config, dependencies, computationManifest, re
     if (doc.exists) {
       const data = doc.data();
       const stage = data.error?.stage;
-
+
+      // A. QUALITY CHECK: If it failed logic, DO NOT RETRY.
+      if (['QUALITY_CIRCUIT_BREAKER', 'SEMANTIC_GATE', 'SHARDING_LIMIT_EXCEEDED'].includes(stage)) {
         logger.log('WARN', `[Sweep] 🛑 Skipping deterministic failure for ${name} (${stage}).`);
         continue;
       }
+
+      // B. DEAD END CHECK: If it failed High-Mem already, DO NOT RETRY.
+      if (data.resourceTier === 'high-mem' && data.status === 'FAILED') {
+        logger.log('WARN', `[Sweep] 🛑 Skipping ${name} - Already failed on High-Mem.`);
+        continue;
+      }
     }
     validTasks.push(task);
   }
 
   if (validTasks.length === 0) {
-    logger.log('INFO', `[Sweep] ${date}
+    logger.log('INFO', `[Sweep] ${date} has no retryable tasks. Ignoring.`);
     return { dispatched: 0 };
   }
 
-  // 2. FORCE High Mem
+  // 2. FORCE High Mem for remaining valid tasks
   const currentDispatchId = crypto.randomUUID();
 
   const tasksPayload = validTasks.map(t => ({
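Note: the sweep filter above only reads three fields from each ledger document at computation_audit_ledger/<date>/passes/<pass>/tasks/<name>. A minimal illustrative shape (the field names are the ones read in the hunk above; the values are invented):

// Hypothetical ledger entry, shown only to illustrate the fields the sweep inspects.
const exampleLedgerDoc = {
  status: 'FAILED',
  resourceTier: 'high-mem',                                                     // checked by guard B (dead end)
  error: { stage: 'QUALITY_CIRCUIT_BREAKER', message: 'Quality gate tripped' }  // stage checked by guard A (deterministic)
};
// With this document the task is skipped by guard A; even with a non-deterministic stage,
// guard B would still skip it because it has already failed on the high-mem tier.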
@@ -360,8 +359,9 @@ async function handleStandardDispatch(config, dependencies, computationManifest,
     }
 
     if (selectedTasks.length > 0) {
-
-
+      // Split Logic: Moves OOMs to High-Mem, drops dead letters
+      const { standard, highMem } = await splitRoutes(db, selectedDate, passToRun, selectedTasks, logger);
+      selectedTasks = [...standard, ...highMem];
     }
   }
 }
@@ -421,21 +421,58 @@ async function handleStandardDispatch(config, dependencies, computationManifest,
   };
 }
 
-
-
+// =============================================================================
+// HELPER: Route Splitting (One-Shot Enforcement)
+// =============================================================================
+async function splitRoutes(db, date, pass, tasks, logger) {
+  const standard = [];
+  const highMem = [];
+
   for (const task of tasks) {
     const name = normalizeName(task.name);
     const ledgerPath = `computation_audit_ledger/${date}/passes/${pass}/tasks/${name}`;
     const doc = await db.doc(ledgerPath).get();
-
-
-
-
-
-
+
+    if (!doc.exists) {
+      // New task -> Standard
+      standard.push(task);
+      continue;
+    }
+
+    const data = doc.data();
+
+    // If it FAILED, we check if we can escalate it.
+    if (data.status === 'FAILED') {
+      const stage = data.error?.stage;
+
+      // 1. QUALITY / LOGIC FAIL: Dead Letter (Drop it)
+      if (['QUALITY_CIRCUIT_BREAKER', 'SEMANTIC_GATE'].includes(stage)) {
+        logger.log('WARN', `[Dispatcher] 🛑 Dropping ${name} - Deterministic Failure (${stage}).`);
+        continue;
+      }
+
+      // 2. PREVIOUSLY HIGH MEM FAIL: Dead Letter (Drop it)
+      if (data.resourceTier === 'high-mem') {
+        logger.log('WARN', `[Dispatcher] 🛑 Dropping ${name} - Failed on High-Mem already.`);
+        continue;
+      }
+
+      // 3. STANDARD FAIL (Crash/OOM): Promote to High Mem (Retry)
+      // If it failed standard, we give it ONE shot on high-mem.
+      // Note: Even if it was an "Unknown" error, we promote to High-Mem to cover OOMs that looked like crashes.
+      highMem.push({
+        ...task,
+        resources: 'high-mem',
+        reason: `Retry: ${data.error?.message || 'Standard Failure'}`
+      });
+
+    } else {
+      // If status is not FAILED (e.g. was Zombie and filterActiveTasks passed it), retry Standard.
+      standard.push(task);
     }
   }
-
+
+  return { standard, highMem };
 }
 
 module.exports = { dispatchComputationPass };
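Note: as a usage sketch of the new helper (illustrative only: db and logger stand for the Firestore client and structured logger used elsewhere in this file, and the date, pass number and task names are invented), this mirrors how handleStandardDispatch consumes splitRoutes() in the earlier hunk:

// Illustrative only - not part of the package.
async function exampleRouting(db, logger) {
  const tasks = [
    { name: 'task_a' },  // no ledger doc yet                                  -> standard
    { name: 'task_b' },  // FAILED on the standard tier (crash/OOM)            -> highMem, its single retry
    { name: 'task_c' }   // FAILED on high-mem, or with a deterministic stage  -> dropped (dead letter)
  ];
  const { standard, highMem } = await splitRoutes(db, '2024-01-01', 1, tasks, logger);
  // Dispatch 'standard' to the normal queue and 'highMem' to the high-memory queue;
  // anything routed to neither list is never dispatched again.
  return [...standard, ...highMem];
}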
--- a/computation-system/helpers/computation_worker.js
+++ b/computation-system/helpers/computation_worker.js
@@ -1,7 +1,10 @@
 /**
  * FILENAME: computation-system/helpers/computation_worker.js
- * UPDATED:
- *
+ * UPDATED: Implemented Strict Idempotency Gate (The "One-Shot" Policy).
+ * BEHAVIOR:
+ * 1. Checks Ledger via Transaction before execution.
+ * 2. If Status is COMPLETED or FAILED, immediately ACKs (returns) to stop Pub/Sub loops.
+ * 3. Preserves Error 'stage' to ensure logic errors are not retried.
  */
 
 const { executeDispatchTask } = require('../WorkflowOrchestrator.js');
@@ -28,6 +31,62 @@ function startMemoryHeartbeat(db, ledgerPath, intervalMs = 2000) {
   return { timer, getPeak: () => peakRss };
 }
 
+/**
+ * STRICT IDEMPOTENCY GATE
+ * Uses a transaction to verify this task hasn't already been processed.
+ * Returns { shouldRun: boolean, leaseData: object }
+ */
+async function checkIdempotencyAndClaimLease(db, ledgerPath, dispatchId, workerId) {
+  const docRef = db.doc(ledgerPath);
+
+  try {
+    return await db.runTransaction(async (t) => {
+      const doc = await t.get(docRef);
+
+      if (doc.exists) {
+        const data = doc.data();
+
+        // 1. TERMINAL STATE CHECK
+        // If the task is already finalized, we MUST NOT run it again.
+        // This stops the infinite loop if Pub/Sub redelivers a FAILED task.
+        if (['COMPLETED', 'FAILED', 'CRASH'].includes(data.status)) {
+          return { shouldRun: false, reason: `Task already in terminal state: ${data.status}` };
+        }
+
+        // 2. DUPLICATE DELIVERY CHECK
+        // If it's IN_PROGRESS with the SAME dispatchId, we are likely seeing a Pub/Sub redelivery
+        // while the code is actually running. Ignore it.
+        if (data.status === 'IN_PROGRESS' && data.dispatchId === dispatchId) {
+          return { shouldRun: false, reason: 'Duplicate delivery: Task already IN_PROGRESS with same ID.' };
+        }
+
+        // 3. ZOMBIE CHECK (Optional safety, usually handled by Dispatcher)
+        // If it's IN_PROGRESS but clearly stale (older than 15 mins), we might steal it,
+        // but generally we let the Dispatcher handle zombies. For now, we block collision.
+        if (data.status === 'IN_PROGRESS') {
+          return { shouldRun: false, reason: 'Collision: Task currently IN_PROGRESS by another worker.' };
+        }
+      }
+
+      // 4. CLAIM LEASE
+      // If we get here, the task is either new (doesn't exist) or PENDING.
+      const lease = {
+        status: 'IN_PROGRESS',
+        workerId: workerId,
+        dispatchId: dispatchId || 'unknown',
+        startedAt: new Date()
+      };
+
+      t.set(docRef, lease, { merge: true });
+      return { shouldRun: true, leaseData: lease };
+    });
+  } catch (e) {
+    console.error(`[Idempotency] Transaction failed: ${e.message}`);
+    // If transaction fails (contention), assume we shouldn't run
+    return { shouldRun: false, reason: `Transaction Error: ${e.message}` };
+  }
+}
+
 async function handleComputationTask(message, config, dependencies) {
   const logger = new StructuredLogger({ minLevel: config.minLevel || 'INFO', enableStructured: true, ...config });
   const runDeps = { ...dependencies, logger };
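Note: two behavioural points about the gate above. With the Firestore server SDKs, runTransaction retries the callback on contention before ultimately rejecting; the catch block maps that rejection to shouldRun: false, so a contended delivery is ACKed rather than redelivered. The checks also run in a fixed order, summarized by this illustrative helper (not code from the package; data is the ledger document, or undefined when no document exists):

// Condensed restatement of the gate's decision order, for reference only.
function expectedGateDecision(data, incomingDispatchId) {
  if (!data) return 'RUN';                                                                    // no ledger doc -> claim lease
  if (['COMPLETED', 'FAILED', 'CRASH'].includes(data.status)) return 'SKIP';                  // terminal state
  if (data.status === 'IN_PROGRESS' && data.dispatchId === incomingDispatchId) return 'SKIP'; // duplicate delivery
  if (data.status === 'IN_PROGRESS') return 'SKIP';                                           // collision with another worker
  return 'RUN';                                                                               // e.g. PENDING -> claim lease
}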
@@ -45,17 +104,19 @@ async function handleComputationTask(message, config, dependencies) {
   const resourceTier = resources || 'standard';
   const ledgerPath = `computation_audit_ledger/${date}/passes/${pass}/tasks/${computation}`;
 
-
+  const workerId = process.env.K_REVISION || os.hostname();
 
-  //
-  const
-
-
-
-
-
+  // --- STEP 1: IDEMPOTENCY CHECK ---
+  const gate = await checkIdempotencyAndClaimLease(db, ledgerPath, dispatchId, workerId);
+
+  if (!gate.shouldRun) {
+    // [CRITICAL] We return successfully (ACK) to remove the message from Pub/Sub.
+    // We do NOT throw an error, because that would cause a retry.
+    logger.log('WARN', `[Worker] 🛑 Idempotency Gate: Skipping ${computation}. Reason: ${gate.reason}`);
+    return;
+  }
 
-
+  logger.log('INFO', `[Worker] 📥 Task: ${computation} (${date}) [Tier: ${resourceTier}] [ID: ${dispatchId}]`);
 
   const heartbeat = startMemoryHeartbeat(db, ledgerPath);
 
@@ -72,14 +133,11 @@ async function handleComputationTask(message, config, dependencies) {
     const failureReport = result?.updates?.failureReport || [];
     const successUpdates = result?.updates?.successUpdates || {};
 
-    // [CRITICAL
-    // Previously, 'throw new Error(msg)' stripped the 'stage' property, causing the
-    // catch block to treat Deterministic errors (Quality/Logic) as System errors (Transient),
-    // triggering infinite Pub/Sub retries.
+    // [CRITICAL] Propagate Error Stage from inner logic
     if (failureReport.length > 0) {
       const reportedError = failureReport[0].error;
       const errorObj = new Error(reportedError.message);
-      errorObj.stage = reportedError.stage;
+      errorObj.stage = reportedError.stage;
       throw errorObj;
     }
 
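Note: the comment block removed above carried the rationale, which is still worth keeping in mind: rethrowing with a bare new Error(msg) would drop the custom stage property, so a deterministic failure would be classified as transient in the catch block and retried forever. A small illustration (values are hypothetical):

// Illustrative only: why the stage must be copied onto the rethrown error.
const reportedError = { message: 'Quality gate tripped', stage: 'QUALITY_CIRCUIT_BREAKER' };

const lossy = new Error(reportedError.message);   // 'stage' is lost -> looks transient -> Pub/Sub retry loop
const faithful = new Error(reportedError.message);
faithful.stage = reportedError.stage;             // 'stage' preserved -> marked FAILED once and ACKed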
@@ -99,18 +157,19 @@ async function handleComputationTask(message, config, dependencies) {
 
   } catch (err) {
     clearInterval(heartbeat.timer);
+
     const isDeterministic = ['SHARDING_LIMIT_EXCEEDED', 'QUALITY_CIRCUIT_BREAKER', 'SEMANTIC_GATE'].includes(err.stage);
-
-    //
-    //
+
+    // --- STEP 2: ERROR HANDLING ---
+    // If Logic Error OR Max Retries reached, mark FAILED and ACK.
     if (isDeterministic || (message.deliveryAttempt || 1) >= MAX_RETRIES) {
 
-      // Write structured error to Ledger so Dispatcher can see the 'stage' later
       const errorPayload = {
         message: err.message,
         stage: err.stage || 'FATAL'
       };
 
+      // This write ensures the Idempotency Gate blocks future retries
       await db.doc(ledgerPath).set({
         status: 'FAILED',
         error: errorPayload,
@@ -118,10 +177,10 @@ async function handleComputationTask(message, config, dependencies) {
       }, { merge: true });
 
       await recordRunAttempt(db, { date, computation, pass }, 'FAILURE', { message: err.message, stage: err.stage || 'FATAL' }, { peakMemoryMB: heartbeat.getPeak() }, triggerReason, resourceTier);
-      return;
+      return; // ACK
     }
 
-    //
+    // Only throw (NACK) for transient system errors (Network, etc)
    throw err;
   }
 }
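Note: taken together, the catch block implements a single ACK/NACK rule: write FAILED and ACK when the error is deterministic or the delivery budget is spent, otherwise rethrow so Pub/Sub redelivers. A condensed sketch of that rule (not code from the package; MAX_RETRIES and the stage list are the constants used above, and deliveryAttempt is the counter Pub/Sub attaches to each delivery):

// Illustrative summary of the worker's retry decision.
function shouldAckAsFailed(err, message, maxRetries) {
  const deterministic = ['SHARDING_LIMIT_EXCEEDED', 'QUALITY_CIRCUIT_BREAKER', 'SEMANTIC_GATE'].includes(err.stage);
  const exhausted = (message.deliveryAttempt || 1) >= maxRetries;
  return deterministic || exhausted;  // true  -> write FAILED to the ledger, then return (ACK)
}                                     // false -> throw err so the message is redelivered (NACK)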