bulltrackers-module 1.0.744 → 1.0.746
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -43,6 +43,35 @@ class StorageManager {
|
|
|
43
43
|
}
|
|
44
44
|
return this._firestore;
|
|
45
45
|
}
|
|
46
|
+
|
|
47
|
+
/**
|
|
48
|
+
* Refreshes a zombie checkpoint's heartbeat (its last_updated timestamp) to "hide" it from detection
|
|
49
|
+
* while the recovery task is being queued.
|
|
50
|
+
*/
|
|
51
|
+
async claimZombie(checkpointId) {
|
|
52
|
+
if (!checkpointId) return;
|
|
53
|
+
|
|
54
|
+
// FIX: Access projectId and dataset from the config object
|
|
55
|
+
const { projectId, dataset } = this.config.bigquery; //
|
|
56
|
+
|
|
57
|
+
const query = `
|
|
58
|
+
UPDATE \`${projectId}.${dataset}.computation_checkpoints\`
|
|
59
|
+
SET last_updated = CURRENT_TIMESTAMP()
|
|
60
|
+
WHERE checkpoint_id = @checkpointId
|
|
61
|
+
`;
|
|
62
|
+
|
|
63
|
+
try {
|
|
64
|
+
await this.bigquery.query({
|
|
65
|
+
query,
|
|
66
|
+
params: { checkpointId }
|
|
67
|
+
});
|
|
68
|
+
} catch (e) {
|
|
69
|
+
// Best-effort: log a warning and continue. Claiming is an optimization, not critical.
|
|
70
|
+
console.warn(`[Storage] Failed to claim zombie ${checkpointId}: ${e.message}`);
|
|
71
|
+
}
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
|
|
46
75
|
|
|
47
76
|
// =========================================================================
|
|
48
77
|
// RESULT COMMITTING (Batch -> GCS Buffer)
|
|
@@ -318,15 +347,40 @@ class StorageManager {
|
|
|
318
347
|
const table = 'computation_checkpoints';
|
|
319
348
|
const fullTable = `\`${this.config.bigquery.projectId}.${this.config.bigquery.dataset}.${table}\``;
|
|
320
349
|
try {
|
|
350
|
+
// FIX: Use subquery with ROW_NUMBER to find the TRUE latest state per computation.
|
|
351
|
+
// We only count it as a zombie if the LATEST row is 'running'.
|
|
352
|
+
// This ignores 'running' rows that have a newer sibling of any status, or a same-time 'completed' or 'failed' sibling.
|
|
321
353
|
const query = `
|
|
322
354
|
SELECT computation_name, date, checkpoint_id, last_updated
|
|
323
|
-
FROM
|
|
324
|
-
|
|
355
|
+
FROM (
|
|
356
|
+
SELECT
|
|
357
|
+
computation_name,
|
|
358
|
+
date,
|
|
359
|
+
checkpoint_id,
|
|
360
|
+
last_updated,
|
|
361
|
+
status,
|
|
362
|
+
ROW_NUMBER() OVER (
|
|
363
|
+
PARTITION BY computation_name, date
|
|
364
|
+
ORDER BY last_updated DESC,
|
|
365
|
+
CASE status
|
|
366
|
+
WHEN 'completed' THEN 1
|
|
367
|
+
WHEN 'failed' THEN 2
|
|
368
|
+
ELSE 3
|
|
369
|
+
END ASC
|
|
370
|
+
) as rn
|
|
371
|
+
FROM ${fullTable}
|
|
372
|
+
)
|
|
373
|
+
WHERE rn = 1
|
|
374
|
+
AND status = 'running'
|
|
375
|
+
AND last_updated < TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL @minutes MINUTE)
|
|
325
376
|
LIMIT 50
|
|
326
377
|
`;
|
|
327
378
|
const [rows] = await this.bigquery.query({ query, params: { minutes: minutesThreshold }, location: this.config.bigquery.location });
|
|
328
379
|
return rows.map(r => ({ name: r.computation_name, date: r.date.value || r.date, checkpointId: r.checkpoint_id }));
|
|
329
|
-
} catch (e) {
|
|
380
|
+
} catch (e) {
|
|
381
|
+
console.error(`[Storage] findZombies failed: ${e.message}`);
|
|
382
|
+
return [];
|
|
383
|
+
}
|
|
330
384
|
}
|
|
331
385
|
|
|
332
386
|
async completeCheckpoint(dateStr, computationName, checkpointId) {
|
|
@@ -344,11 +398,21 @@ class StorageManager {
|
|
|
344
398
|
const table = 'computation_checkpoints';
|
|
345
399
|
const fullTable = `\`${this.config.bigquery.projectId}.${this.config.bigquery.dataset}.${table}\``;
|
|
346
400
|
try {
|
|
401
|
+
// FIX: Added Tie-Breaker logic to ORDER BY
|
|
402
|
+
// If timestamps are identical, 'completed' (1) comes before 'failed' (2) before 'running' (3).
|
|
403
|
+
// This ensures we never accidentally pick a "running" row when a "completed" one exists with the exact same last_updated timestamp.
|
|
347
404
|
const query = `
|
|
348
405
|
SELECT checkpoint_id, status, processed_count, last_entity_id, completed_batches, worker_instance_id, last_updated, attempts, code_hash, started_at
|
|
349
406
|
FROM ${fullTable}
|
|
350
407
|
WHERE date = @date AND computation_name = @computationName
|
|
351
|
-
ORDER BY
|
|
408
|
+
ORDER BY
|
|
409
|
+
last_updated DESC,
|
|
410
|
+
CASE status
|
|
411
|
+
WHEN 'completed' THEN 1
|
|
412
|
+
WHEN 'failed' THEN 2
|
|
413
|
+
ELSE 3
|
|
414
|
+
END ASC
|
|
415
|
+
LIMIT 1
|
|
352
416
|
`;
|
|
353
417
|
const [rows] = await this.bigquery.query({ query, params: { date: dateStr, computationName }, location: this.config.bigquery.location });
|
|
354
418
|
if (rows.length === 0) return null;
|
|
@@ -696,6 +760,7 @@ class StorageManager {
|
|
|
696
760
|
|
|
697
761
|
this._log('ERROR', `${context}: ${details}`);
|
|
698
762
|
}
|
|
763
|
+
|
|
699
764
|
}
|
|
700
765
|
|
|
701
766
|
module.exports = { StorageManager };
|
|
@@ -51,14 +51,21 @@ async function schedulerHandler(req, res) {
|
|
|
51
51
|
const dueComputations = findDueComputations(now);
|
|
52
52
|
|
|
53
53
|
// 2. ZOMBIE DETECTION
|
|
54
|
-
// Find tasks marked 'running' that haven't heartbeated in X mins
|
|
55
54
|
let zombies = [];
|
|
56
55
|
try {
|
|
57
56
|
zombies = await storageManager.findZombies(ZOMBIE_THRESHOLD_MINUTES);
|
|
58
|
-
|
|
57
|
+
|
|
59
58
|
if (zombies.length > 0) {
|
|
60
59
|
const zombieDetails = zombies.map(z => `${z.name} [${z.date}]`).join(', ');
|
|
61
60
|
console.log(`[Scheduler] DETECTED ${zombies.length} ZOMBIES: ${zombieDetails}`);
|
|
61
|
+
|
|
62
|
+
// --- NEW FIX: CLAIM ZOMBIES ---
|
|
63
|
+
// "Touch" these rows in the DB so they don't look like zombies again until another ZOMBIE_THRESHOLD_MINUTES have elapsed.
|
|
64
|
+
// This prevents re-dispatching the same task 15 times if the queue is slow.
|
|
65
|
+
await Promise.all(zombies.map(z =>
|
|
66
|
+
storageManager.claimZombie(z.checkpointId)
|
|
67
|
+
));
|
|
68
|
+
// ------------------------------
|
|
62
69
|
}
|
|
63
70
|
} catch (e) {
|
|
64
71
|
console.error(`[Scheduler] Zombie check failed: ${e.message}`);
|