bulltrackers-module 1.0.337 → 1.0.339
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -1,9 +1,8 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* FILENAME: computation-system/helpers/computation_dispatcher.js
|
|
3
3
|
* PURPOSE: Sequential Cursor-Based Dispatcher.
|
|
4
|
-
*
|
|
5
|
-
* UPDATED:
|
|
6
|
-
* UPDATED: Added checks to permanently skip Deterministic Failures (Quality Breakers).
|
|
4
|
+
* UPDATED: Enforces Strict One-Shot Policy (Standard -> HighMem -> Dead Letter).
|
|
5
|
+
* UPDATED: Prevents infinite loops by permanently ignoring deterministic failures.
|
|
7
6
|
*/
|
|
8
7
|
|
|
9
8
|
const { getExpectedDateStrings, getEarliestDataDates, normalizeName, DEFINITIVE_EARLIEST_DATES } = require('../utils/utils.js');
|
|
@@ -23,7 +22,7 @@ const STALE_LOCK_THRESHOLD_MS = 1000 * 60 * 15;
|
|
|
23
22
|
// =============================================================================
|
|
24
23
|
async function filterActiveTasks(db, date, pass, tasks, logger, forceRun = false) {
|
|
25
24
|
if (!tasks || tasks.length === 0) return [];
|
|
26
|
-
if (forceRun) return tasks;
|
|
25
|
+
if (forceRun) return tasks;
|
|
27
26
|
|
|
28
27
|
const checkPromises = tasks.map(async (t) => {
|
|
29
28
|
const taskName = normalizeName(t.name);
|
|
@@ -49,21 +48,14 @@ async function filterActiveTasks(db, date, pass, tasks, logger, forceRun = false
|
|
|
49
48
|
return null;
|
|
50
49
|
}
|
|
51
50
|
|
|
52
|
-
// 2.
|
|
53
|
-
|
|
54
|
-
data.completedAt &&
|
|
55
|
-
(Date.now() - new Date(data.completedAt).getTime() < 60 * 1000);
|
|
56
|
-
|
|
57
|
-
if (isJustFinished) return null;
|
|
51
|
+
// 2. COMPLETED CHECK (Ignore)
|
|
52
|
+
if (data.status === 'COMPLETED') return null;
|
|
58
53
|
|
|
59
|
-
// 3.
|
|
60
|
-
//
|
|
54
|
+
// 3. FAILED CHECK (Pass through to Route Splitter)
|
|
55
|
+
// We do NOT filter FAILED tasks here. We pass them to splitRoutes()
|
|
56
|
+
// which decides if they get promoted to High-Mem or dropped forever.
|
|
61
57
|
if (data.status === 'FAILED') {
|
|
62
|
-
|
|
63
|
-
if (['QUALITY_CIRCUIT_BREAKER', 'SEMANTIC_GATE', 'SHARDING_LIMIT_EXCEEDED'].includes(stage)) {
|
|
64
|
-
if (logger) logger.log('WARN', `[Dispatcher] 🛑 Skipping deterministic failure for ${taskName} (${stage}).`);
|
|
65
|
-
return null;
|
|
66
|
-
}
|
|
58
|
+
return t;
|
|
67
59
|
}
|
|
68
60
|
}
|
|
69
61
|
return t;
|
|
@@ -184,15 +176,12 @@ async function handlePassVerification(config, dependencies, computationManifest,
|
|
|
184
176
|
|
|
185
177
|
const missingTasks = [];
|
|
186
178
|
|
|
187
|
-
// Optimize: Batch fetch statuses if possible, but for now loop is safer for memory
|
|
188
|
-
// In production, we might want p-limit here.
|
|
189
179
|
for (const date of sessionDates) {
|
|
190
180
|
const [dailyStatus, availability] = await Promise.all([
|
|
191
181
|
fetchComputationStatus(date, config, dependencies),
|
|
192
182
|
checkRootDataAvailability(date, config, dependencies, DEFINITIVE_EARLIEST_DATES)
|
|
193
183
|
]);
|
|
194
184
|
|
|
195
|
-
// Need previous status for historical calcs
|
|
196
185
|
let prevDailyStatus = null;
|
|
197
186
|
if (calcsInPass.some(c => c.isHistorical)) {
|
|
198
187
|
const prevD = new Date(date + 'T00:00:00Z');
|
|
@@ -202,12 +191,9 @@ async function handlePassVerification(config, dependencies, computationManifest,
|
|
|
202
191
|
|
|
203
192
|
const report = analyzeDateExecution(date, calcsInPass, availability ? availability.status : {}, dailyStatus, manifestMap, prevDailyStatus);
|
|
204
193
|
|
|
205
|
-
// We only care about Runnable (New) or ReRuns (Changed/Failed)
|
|
206
|
-
// We ignore Blocked (impossible to run) and Impossible (permanent fail)
|
|
207
194
|
const pending = [...report.runnable, ...report.reRuns];
|
|
208
195
|
|
|
209
196
|
if (pending.length > 0) {
|
|
210
|
-
// Calculate ETA
|
|
211
197
|
const totalWeight = pending.reduce((sum, t) => sum + (weightMap.get(normalizeName(t.name)) || 1.0), 0);
|
|
212
198
|
const eta = Math.max(30, Math.ceil(totalWeight * BASE_SECONDS_PER_WEIGHT_UNIT));
|
|
213
199
|
|
|
@@ -244,7 +230,6 @@ async function handleSweepDispatch(config, dependencies, computationManifest, re
|
|
|
244
230
|
checkRootDataAvailability(date, config, dependencies, DEFINITIVE_EARLIEST_DATES)
|
|
245
231
|
]);
|
|
246
232
|
|
|
247
|
-
// Previous Status Fetch (simplified for brevity, assume historical dependency check works or fails safe)
|
|
248
233
|
let prevDailyStatus = null;
|
|
249
234
|
if (calcsInPass.some(c => c.isHistorical)) {
|
|
250
235
|
const prevD = new Date(date + 'T00:00:00Z');
|
|
@@ -260,8 +245,8 @@ async function handleSweepDispatch(config, dependencies, computationManifest, re
|
|
|
260
245
|
return { dispatched: 0 };
|
|
261
246
|
}
|
|
262
247
|
|
|
263
|
-
// [
|
|
264
|
-
//
|
|
248
|
+
// [CRITICAL] FILTER FOR SWEEP:
|
|
249
|
+
// Only dispatch if it hasn't failed High-Mem or Quality checks.
|
|
265
250
|
const validTasks = [];
|
|
266
251
|
for (const task of pending) {
|
|
267
252
|
const name = normalizeName(task.name);
|
|
@@ -270,21 +255,28 @@ async function handleSweepDispatch(config, dependencies, computationManifest, re
|
|
|
270
255
|
if (doc.exists) {
|
|
271
256
|
const data = doc.data();
|
|
272
257
|
const stage = data.error?.stage;
|
|
273
|
-
|
|
258
|
+
|
|
259
|
+
// A. QUALITY CHECK: If it failed logic, DO NOT RETRY.
|
|
260
|
+
if (['QUALITY_CIRCUIT_BREAKER', 'SEMANTIC_GATE', 'SHARDING_LIMIT_EXCEEDED'].includes(stage)) {
|
|
274
261
|
logger.log('WARN', `[Sweep] 🛑 Skipping deterministic failure for ${name} (${stage}).`);
|
|
275
262
|
continue;
|
|
276
263
|
}
|
|
264
|
+
|
|
265
|
+
// B. DEAD END CHECK: If it failed High-Mem already, DO NOT RETRY.
|
|
266
|
+
if (data.resourceTier === 'high-mem' && data.status === 'FAILED') {
|
|
267
|
+
logger.log('WARN', `[Sweep] 🛑 Skipping ${name} - Already failed on High-Mem.`);
|
|
268
|
+
continue;
|
|
269
|
+
}
|
|
277
270
|
}
|
|
278
271
|
validTasks.push(task);
|
|
279
272
|
}
|
|
280
273
|
|
|
281
274
|
if (validTasks.length === 0) {
|
|
282
|
-
logger.log('INFO', `[Sweep] ${date}
|
|
275
|
+
logger.log('INFO', `[Sweep] ${date} has no retryable tasks. Ignoring.`);
|
|
283
276
|
return { dispatched: 0 };
|
|
284
277
|
}
|
|
285
278
|
|
|
286
|
-
// 2. FORCE High Mem
|
|
287
|
-
// We use validTasks now
|
|
279
|
+
// 2. FORCE High Mem for remaining valid tasks
|
|
288
280
|
const currentDispatchId = crypto.randomUUID();
|
|
289
281
|
|
|
290
282
|
const tasksPayload = validTasks.map(t => ({
|
|
@@ -367,8 +359,9 @@ async function handleStandardDispatch(config, dependencies, computationManifest,
|
|
|
367
359
|
}
|
|
368
360
|
|
|
369
361
|
if (selectedTasks.length > 0) {
|
|
370
|
-
|
|
371
|
-
|
|
362
|
+
// Split Logic: Moves OOMs to High-Mem, drops dead letters
|
|
363
|
+
const { standard, highMem } = await splitRoutes(db, selectedDate, passToRun, selectedTasks, logger);
|
|
364
|
+
selectedTasks = [...standard, ...highMem];
|
|
372
365
|
}
|
|
373
366
|
}
|
|
374
367
|
}
|
|
@@ -428,21 +421,58 @@ async function handleStandardDispatch(config, dependencies, computationManifest,
|
|
|
428
421
|
};
|
|
429
422
|
}
|
|
430
423
|
|
|
431
|
-
|
|
432
|
-
|
|
424
|
+
// =============================================================================
|
|
425
|
+
// HELPER: Route Splitting (One-Shot Enforcement)
|
|
426
|
+
// =============================================================================
|
|
427
|
+
/**
 * Splits the selected tasks into Standard and High-Mem routes using each task's
 * ledger entry (One-Shot policy: Standard -> High-Mem -> Dead Letter).
 *
 * Routing rules, applied per task in input order:
 *  - No ledger document                      -> standard (first attempt).
 *  - status FAILED + deterministic stage     -> dropped (dead letter).
 *  - status FAILED + resourceTier 'high-mem' -> dropped (already used its High-Mem shot).
 *  - status FAILED otherwise                 -> highMem, tagged with `resources` and `reason`.
 *  - Any other status                        -> standard.
 *
 * @param {object} db - Store exposing `doc(path).get()` resolving to `{ exists, data() }`.
 * @param {string} date - Date segment of the ledger path.
 * @param {string|number} pass - Pass segment of the ledger path.
 * @param {Array<object>} tasks - Tasks to route; each must carry a `name`.
 * @param {object} [logger] - Logger exposing `log(level, message)`; optional.
 * @returns {Promise<{standard: Array<object>, highMem: Array<object>}>}
 */
async function splitRoutes(db, date, pass, tasks, logger) {
  // Same set the sweep dispatcher and the worker treat as non-retryable. Keeping
  // SHARDING_LIMIT_EXCEEDED here stops a deterministic failure from burning the High-Mem retry.
  const deterministicStages = ['QUALITY_CIRCUIT_BREAKER', 'SEMANTIC_GATE', 'SHARDING_LIMIT_EXCEEDED'];

  // Ledger reads are independent, so issue them together; results keep the input order.
  const entries = await Promise.all(
    tasks.map(async (task) => {
      const name = normalizeName(task.name);
      const ledgerPath = `computation_audit_ledger/${date}/passes/${pass}/tasks/${name}`;
      const doc = await db.doc(ledgerPath).get();
      return { task, name, doc };
    })
  );

  const standard = [];
  const highMem = [];

  for (const { task, name, doc } of entries) {
    if (!doc.exists) {
      // New task -> Standard
      standard.push(task);
      continue;
    }

    const data = doc.data();

    if (data.status !== 'FAILED') {
      // Not FAILED (e.g. a zombie that filterActiveTasks let through): retry on Standard.
      standard.push(task);
      continue;
    }

    const stage = data.error?.stage;

    // 1. QUALITY / LOGIC FAIL: Dead Letter (Drop it)
    if (deterministicStages.includes(stage)) {
      logger?.log('WARN', `[Dispatcher] 🛑 Dropping ${name} - Deterministic Failure (${stage}).`);
      continue;
    }

    // 2. PREVIOUSLY HIGH MEM FAIL: Dead Letter (Drop it)
    if (data.resourceTier === 'high-mem') {
      logger?.log('WARN', `[Dispatcher] 🛑 Dropping ${name} - Failed on High-Mem already.`);
      continue;
    }

    // 3. STANDARD FAIL (Crash/OOM): promote to High-Mem for exactly one retry.
    // Unknown errors are promoted too, to cover OOMs that surfaced as plain crashes.
    highMem.push({
      ...task,
      resources: 'high-mem',
      reason: `Retry: ${data.error?.message || 'Standard Failure'}`
    });
  }

  return { standard, highMem };
}
|
|
447
477
|
|
|
448
478
|
module.exports = { dispatchComputationPass };
|
|
@@ -1,7 +1,10 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* FILENAME: computation-system/helpers/computation_worker.js
|
|
3
|
-
* UPDATED:
|
|
4
|
-
*
|
|
3
|
+
* UPDATED: Implemented Strict Idempotency Gate (The "One-Shot" Policy).
|
|
4
|
+
* BEHAVIOR:
|
|
5
|
+
* 1. Checks Ledger via Transaction before execution.
|
|
6
|
+
* 2. If Status is COMPLETED or FAILED, immediately ACKs (returns) to stop Pub/Sub loops.
|
|
7
|
+
* 3. Preserves Error 'stage' to ensure logic errors are not retried.
|
|
5
8
|
*/
|
|
6
9
|
|
|
7
10
|
const { executeDispatchTask } = require('../WorkflowOrchestrator.js');
|
|
@@ -28,6 +31,62 @@ function startMemoryHeartbeat(db, ledgerPath, intervalMs = 2000) {
|
|
|
28
31
|
return { timer, getPeak: () => peakRss };
|
|
29
32
|
}
|
|
30
33
|
|
|
34
|
+
/**
 * STRICT IDEMPOTENCY GATE
 * Uses a transaction to verify this task hasn't already been processed and,
 * if it is free, claims it by writing an IN_PROGRESS lease to the ledger document.
 *
 * Never throws: a failed transaction is reported as `shouldRun: false`.
 *
 * @param {object} db - Store exposing `doc(path)` and `runTransaction(fn)`.
 * @param {string} ledgerPath - Path of the task's ledger document.
 * @param {string} [dispatchId] - Dispatch identifier; stored as 'unknown' when missing.
 * @param {string} workerId - Identifier of the worker claiming the lease.
 * @returns {Promise<{shouldRun: boolean, leaseData?: object, reason?: string}>}
 *   `leaseData` is set when `shouldRun` is true; `reason` is set when it is false.
 */
async function checkIdempotencyAndClaimLease(db, ledgerPath, dispatchId, workerId) {
  const docRef = db.doc(ledgerPath);

  // Normalize once: the lease stores this value, so the duplicate-delivery check
  // must compare against it rather than against the raw (possibly missing) dispatchId.
  const leaseDispatchId = dispatchId || 'unknown';

  try {
    return await db.runTransaction(async (t) => {
      const doc = await t.get(docRef);

      if (doc.exists) {
        const data = doc.data();

        // 1. TERMINAL STATE CHECK
        // A finalized task must not run again; this is what stops the loop
        // when a message for an already FAILED task is redelivered.
        if (['COMPLETED', 'FAILED', 'CRASH'].includes(data.status)) {
          return { shouldRun: false, reason: `Task already in terminal state: ${data.status}` };
        }

        if (data.status === 'IN_PROGRESS') {
          // 2. DUPLICATE DELIVERY CHECK
          // Same dispatch ID while IN_PROGRESS: most likely a redelivery of the
          // message whose execution is still running. Ignore it.
          if (data.dispatchId === leaseDispatchId) {
            return { shouldRun: false, reason: 'Duplicate delivery: Task already IN_PROGRESS with same ID.' };
          }

          // 3. ZOMBIE CHECK (Optional safety, usually handled by Dispatcher)
          // Stale leases are not stolen here; any other IN_PROGRESS lease blocks.
          return { shouldRun: false, reason: 'Collision: Task currently IN_PROGRESS by another worker.' };
        }
      }

      // 4. CLAIM LEASE
      // Reaching this point means the document is absent or in a non-blocking state.
      const lease = {
        status: 'IN_PROGRESS',
        workerId: workerId,
        dispatchId: leaseDispatchId,
        startedAt: new Date()
      };

      t.set(docRef, lease, { merge: true });
      return { shouldRun: true, leaseData: lease };
    });
  } catch (e) {
    console.error(`[Idempotency] Transaction failed: ${e.message}`);
    // If transaction fails (contention), assume we shouldn't run
    return { shouldRun: false, reason: `Transaction Error: ${e.message}` };
  }
}
|
|
89
|
+
|
|
31
90
|
async function handleComputationTask(message, config, dependencies) {
|
|
32
91
|
const logger = new StructuredLogger({ minLevel: config.minLevel || 'INFO', enableStructured: true, ...config });
|
|
33
92
|
const runDeps = { ...dependencies, logger };
|
|
@@ -45,17 +104,19 @@ async function handleComputationTask(message, config, dependencies) {
|
|
|
45
104
|
const resourceTier = resources || 'standard';
|
|
46
105
|
const ledgerPath = `computation_audit_ledger/${date}/passes/${pass}/tasks/${computation}`;
|
|
47
106
|
|
|
48
|
-
|
|
107
|
+
const workerId = process.env.K_REVISION || os.hostname();
|
|
49
108
|
|
|
50
|
-
//
|
|
51
|
-
const
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
109
|
+
// --- STEP 1: IDEMPOTENCY CHECK ---
|
|
110
|
+
const gate = await checkIdempotencyAndClaimLease(db, ledgerPath, dispatchId, workerId);
|
|
111
|
+
|
|
112
|
+
if (!gate.shouldRun) {
|
|
113
|
+
// [CRITICAL] We return successfully (ACK) to remove the message from Pub/Sub.
|
|
114
|
+
// We do NOT throw an error, because that would cause a retry.
|
|
115
|
+
logger.log('WARN', `[Worker] 🛑 Idempotency Gate: Skipping ${computation}. Reason: ${gate.reason}`);
|
|
116
|
+
return;
|
|
117
|
+
}
|
|
57
118
|
|
|
58
|
-
|
|
119
|
+
logger.log('INFO', `[Worker] 📥 Task: ${computation} (${date}) [Tier: ${resourceTier}] [ID: ${dispatchId}]`);
|
|
59
120
|
|
|
60
121
|
const heartbeat = startMemoryHeartbeat(db, ledgerPath);
|
|
61
122
|
|
|
@@ -72,7 +133,13 @@ async function handleComputationTask(message, config, dependencies) {
|
|
|
72
133
|
const failureReport = result?.updates?.failureReport || [];
|
|
73
134
|
const successUpdates = result?.updates?.successUpdates || {};
|
|
74
135
|
|
|
75
|
-
|
|
136
|
+
// [CRITICAL] Propagate Error Stage from inner logic
|
|
137
|
+
if (failureReport.length > 0) {
|
|
138
|
+
const reportedError = failureReport[0].error;
|
|
139
|
+
const errorObj = new Error(reportedError.message);
|
|
140
|
+
errorObj.stage = reportedError.stage;
|
|
141
|
+
throw errorObj;
|
|
142
|
+
}
|
|
76
143
|
|
|
77
144
|
const calcUpdate = successUpdates[normalizeName(computation)] || {};
|
|
78
145
|
const metrics = {
|
|
@@ -90,16 +157,19 @@ async function handleComputationTask(message, config, dependencies) {
|
|
|
90
157
|
|
|
91
158
|
} catch (err) {
|
|
92
159
|
clearInterval(heartbeat.timer);
|
|
160
|
+
|
|
93
161
|
const isDeterministic = ['SHARDING_LIMIT_EXCEEDED', 'QUALITY_CIRCUIT_BREAKER', 'SEMANTIC_GATE'].includes(err.stage);
|
|
94
|
-
|
|
162
|
+
|
|
163
|
+
// --- STEP 2: ERROR HANDLING ---
|
|
164
|
+
// If Logic Error OR Max Retries reached, mark FAILED and ACK.
|
|
95
165
|
if (isDeterministic || (message.deliveryAttempt || 1) >= MAX_RETRIES) {
|
|
96
|
-
|
|
97
|
-
// This prevents the Dispatcher from retrying Quality Broken tasks.
|
|
166
|
+
|
|
98
167
|
const errorPayload = {
|
|
99
168
|
message: err.message,
|
|
100
169
|
stage: err.stage || 'FATAL'
|
|
101
170
|
};
|
|
102
171
|
|
|
172
|
+
// This write ensures the Idempotency Gate blocks future retries
|
|
103
173
|
await db.doc(ledgerPath).set({
|
|
104
174
|
status: 'FAILED',
|
|
105
175
|
error: errorPayload,
|
|
@@ -107,8 +177,10 @@ async function handleComputationTask(message, config, dependencies) {
|
|
|
107
177
|
}, { merge: true });
|
|
108
178
|
|
|
109
179
|
await recordRunAttempt(db, { date, computation, pass }, 'FAILURE', { message: err.message, stage: err.stage || 'FATAL' }, { peakMemoryMB: heartbeat.getPeak() }, triggerReason, resourceTier);
|
|
110
|
-
return;
|
|
180
|
+
return; // ACK
|
|
111
181
|
}
|
|
182
|
+
|
|
183
|
+
// Only throw (NACK) for transient system errors (Network, etc)
|
|
112
184
|
throw err;
|
|
113
185
|
}
|
|
114
186
|
}
|