bulltrackers-module 1.0.301 → 1.0.303
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/functions/computation-system/executors/StandardExecutor.js +1 -1
- package/functions/computation-system/helpers/computation_dispatcher.js +108 -262
- package/functions/computation-system/helpers/computation_worker.js +47 -16
- package/functions/computation-system/utils/data_loader.js +3 -3
- package/functions/computation-system/workflows/bulltrackers_pipeline.yaml +29 -124
- package/package.json +1 -1
|
@@ -114,7 +114,7 @@ class StandardExecutor {
|
|
|
114
114
|
|
|
115
115
|
usersSinceLastFlush += chunkSize;
|
|
116
116
|
const heapStats = v8.getHeapStatistics();
|
|
117
|
-
if (usersSinceLastFlush >=
|
|
117
|
+
if (usersSinceLastFlush >= 500 || (heapStats.used_heap_size / heapStats.heap_size_limit) > 0.70) {
|
|
118
118
|
const flushResult = await StandardExecutor.flushBuffer(state, dateStr, passName, config, deps, shardIndexMap, executionStats, 'INTERMEDIATE', true, !hasFlushed);
|
|
119
119
|
hasFlushed = true;
|
|
120
120
|
StandardExecutor.mergeReports(aggregatedSuccess, aggregatedFailures, flushResult);
|
|
@@ -1,303 +1,149 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* FILENAME: computation-system/helpers/computation_dispatcher.js
|
|
3
|
-
* PURPOSE:
|
|
4
|
-
*
|
|
3
|
+
* PURPOSE: Sequential Cursor-Based Dispatcher.
|
|
4
|
+
* IMPLEMENTS: Dirty-Date Discovery, Forensics Rerouting, and Satiation Sweeps.
|
|
5
5
|
*/
|
|
6
6
|
|
|
7
7
|
const { getExpectedDateStrings, getEarliestDataDates, normalizeName, DEFINITIVE_EARLIEST_DATES } = require('../utils/utils.js');
|
|
8
|
-
const { groupByPass, analyzeDateExecution }
|
|
9
|
-
const { PubSubUtils }
|
|
10
|
-
const { fetchComputationStatus
|
|
11
|
-
const { checkRootDataAvailability }
|
|
12
|
-
const { generateCodeHash } = require('../topology/HashManager');
|
|
13
|
-
const pLimit = require('p-limit');
|
|
14
|
-
const crypto = require('crypto');
|
|
8
|
+
const { groupByPass, analyzeDateExecution } = require('../WorkflowOrchestrator.js');
|
|
9
|
+
const { PubSubUtils } = require('../../core/utils/pubsub_utils');
|
|
10
|
+
const { fetchComputationStatus } = require('../persistence/StatusRepository');
|
|
11
|
+
const { checkRootDataAvailability } = require('../data/AvailabilityChecker');
|
|
15
12
|
|
|
16
|
-
const STATUS_IMPOSSIBLE = 'IMPOSSIBLE';
|
|
17
|
-
|
|
18
|
-
// Threshold to trigger high-mem routing (e.g., 1.5 GB for a 2GB worker)
|
|
19
13
|
const OOM_THRESHOLD_MB = 1500;
|
|
14
|
+
const SECONDS_PER_CALC_MARGIN = 25; // 20s base + safety margin
|
|
20
15
|
|
|
21
16
|
/**
|
|
22
|
-
*
|
|
23
|
-
*
|
|
17
|
+
* Checks if specific tasks on a date need a high-memory reroute.
|
|
18
|
+
* Returns only tasks that failed on 'standard' and haven't been tried on 'high-mem'.
|
|
24
19
|
*/
|
|
25
|
-
async function
|
|
26
|
-
|
|
27
|
-
|
|
20
|
+
async function getHighMemReroutes(db, date, pass, tasks) {
|
|
21
|
+
const reroutes = [];
|
|
22
|
+
for (const task of tasks) {
|
|
23
|
+
const name = normalizeName(task.name);
|
|
24
|
+
const ledgerPath = `computation_audit_ledger/${date}/passes/${pass}/tasks/${name}`;
|
|
28
25
|
const doc = await db.doc(ledgerPath).get();
|
|
29
|
-
|
|
30
|
-
// Default to standard
|
|
31
|
-
if (!doc.exists) return 'standard';
|
|
32
|
-
|
|
33
|
-
const data = doc.data();
|
|
34
26
|
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
27
|
+
if (doc.exists) {
|
|
28
|
+
const data = doc.data();
|
|
29
|
+
// Check if it failed due to memory and hasn't been attempted on high-mem yet
|
|
30
|
+
const isOOM = (data.status === 'FAILED' || data.status === 'CRASH') &&
|
|
31
|
+
(data.resourceTier !== 'high-mem') &&
|
|
32
|
+
((data.peakMemoryMB > OOM_THRESHOLD_MB) || (data.error && /memory/i.test(data.error.message)));
|
|
38
33
|
|
|
39
|
-
if (
|
|
40
|
-
|
|
41
|
-
return 'high-mem';
|
|
34
|
+
if (isOOM) {
|
|
35
|
+
reroutes.push({ ...task, resources: 'high-mem' });
|
|
42
36
|
}
|
|
43
37
|
}
|
|
44
|
-
|
|
45
|
-
// Also check if it's explicitly marked FAILED with 'Memory' in error
|
|
46
|
-
if (data.status === 'FAILED' && data.error && /memory/i.test(data.error)) {
|
|
47
|
-
return 'high-mem';
|
|
48
|
-
}
|
|
49
|
-
|
|
50
|
-
} catch (e) {
|
|
51
|
-
console.warn(`[Dispatcher] Forensics check failed for ${computationName}: ${e.message}`);
|
|
52
38
|
}
|
|
53
|
-
|
|
54
|
-
return 'standard';
|
|
39
|
+
return reroutes;
|
|
55
40
|
}
|
|
56
41
|
|
|
57
|
-
/**
|
|
58
|
-
* Dispatches computation tasks for a specific pass.
|
|
59
|
-
* @param {Object} config - System config (Injected with topics)
|
|
60
|
-
* @param {Object} dependencies - { db, logger, ... }
|
|
61
|
-
* @param {Array} computationManifest - List of calculations
|
|
62
|
-
* @param {Object} reqBody - (Optional) HTTP Body containing 'callbackUrl', 'date', and 'attempt'
|
|
63
|
-
*/
|
|
64
42
|
async function dispatchComputationPass(config, dependencies, computationManifest, reqBody = {}) {
|
|
65
43
|
const { logger, db } = dependencies;
|
|
66
|
-
const pubsubUtils
|
|
67
|
-
const passToRun = String(config.COMPUTATION_PASS_TO_RUN);
|
|
44
|
+
const pubsubUtils = new PubSubUtils(dependencies);
|
|
68
45
|
|
|
69
|
-
//
|
|
70
|
-
const
|
|
71
|
-
const
|
|
72
|
-
|
|
73
|
-
const attemptCount = reqBody.attempt ? parseInt(reqBody.attempt) : 1;
|
|
74
|
-
|
|
75
|
-
if (!passToRun) { return logger.log('ERROR', '[Dispatcher] No pass defined (COMPUTATION_PASS_TO_RUN). Aborting.'); }
|
|
76
|
-
if (!dateStr) { return logger.log('ERROR', '[Dispatcher] No date defined. Aborting.'); }
|
|
46
|
+
// Inputs from Workflow Cursor
|
|
47
|
+
const passToRun = String(reqBody.pass || config.COMPUTATION_PASS_TO_RUN);
|
|
48
|
+
const targetCursorN = parseInt(reqBody.cursorIndex || 1);
|
|
49
|
+
const dateLimitStr = reqBody.date || config.date;
|
|
77
50
|
|
|
78
|
-
const
|
|
79
|
-
|
|
80
|
-
const passes = groupByPass(computationManifest);
|
|
51
|
+
const manifestMap = new Map(computationManifest.map(c => [normalizeName(c.name), c]));
|
|
52
|
+
const passes = groupByPass(computationManifest);
|
|
81
53
|
const calcsInThisPass = passes[passToRun] || [];
|
|
82
54
|
|
|
83
|
-
if (!calcsInThisPass.length) {
|
|
84
|
-
|
|
85
|
-
logger.log('INFO', `🚀 [Dispatcher] Smart-Dispatching PASS ${passToRun} (Target: ${dateStr}) [Attempt ${attemptCount}]`);
|
|
86
|
-
|
|
87
|
-
// -- DATE ANALYSIS LOGIC (FIXED: RANGE SCAN) --
|
|
88
|
-
|
|
89
|
-
// 1. Determine the absolute start of data history
|
|
90
|
-
const earliestDates = await getEarliestDataDates(config, dependencies);
|
|
91
|
-
const startDate = earliestDates.absoluteEarliest;
|
|
92
|
-
const endDate = new Date(dateStr + 'T00:00:00Z');
|
|
93
|
-
|
|
94
|
-
// 2. Generate the full range of dates to check
|
|
95
|
-
let allExpectedDates = getExpectedDateStrings(startDate, endDate);
|
|
96
|
-
|
|
97
|
-
// Safety fallback
|
|
98
|
-
if (!allExpectedDates || allExpectedDates.length === 0) {
|
|
99
|
-
logger.log('WARN', `[Dispatcher] Date range calculation returned empty (Start: ${startDate.toISOString()} -> End: ${endDate.toISOString()}). Defaulting to single target date.`);
|
|
100
|
-
allExpectedDates = [dateStr];
|
|
101
|
-
} else {
|
|
102
|
-
logger.log('INFO', `[Dispatcher] 📅 Analysis Range: ${allExpectedDates.length} days (${allExpectedDates[0]} to ${allExpectedDates[allExpectedDates.length-1]})`);
|
|
55
|
+
if (!calcsInThisPass.length) {
|
|
56
|
+
return { status: 'MOVE_TO_NEXT_PASS', dispatched: 0 };
|
|
103
57
|
}
|
|
104
58
|
|
|
105
|
-
|
|
106
|
-
const
|
|
59
|
+
// 1. Discover all "Dirty" Dates (Dates needing work)
|
|
60
|
+
const earliestDates = await getEarliestDataDates(config, dependencies);
|
|
61
|
+
const allDates = getExpectedDateStrings(earliestDates.absoluteEarliest, new Date(dateLimitStr + 'T00:00:00Z'));
|
|
107
62
|
|
|
108
|
-
|
|
109
|
-
const
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
if (!shouldRunForensics) {
|
|
120
|
-
if (isBulkBackfill) logger.log('INFO', `[Dispatcher] ⏩ Bulk Backfill (${allExpectedDates.length} days). Skipping Forensics.`);
|
|
121
|
-
else logger.log('INFO', `[Dispatcher] ⏩ First Attempt. Skipping Forensics (Defaulting to Standard).`);
|
|
122
|
-
} else {
|
|
123
|
-
logger.log('WARN', `[Dispatcher] 🕵️♀️ Retry Detected (Attempt ${attemptCount}). Enabling Forensic Crash Analysis.`);
|
|
124
|
-
}
|
|
125
|
-
|
|
126
|
-
const analysisPromises = allExpectedDates.map(d => dateLimit(async () => {
|
|
127
|
-
try {
|
|
128
|
-
const fetchPromises = [
|
|
129
|
-
fetchComputationStatus(d, config, dependencies),
|
|
130
|
-
checkRootDataAvailability(d, config, dependencies, DEFINITIVE_EARLIEST_DATES)
|
|
131
|
-
];
|
|
132
|
-
|
|
133
|
-
let prevDateStr = null;
|
|
134
|
-
if (calcsInThisPass.some(c => c.isHistorical)) {
|
|
135
|
-
const prevDate = new Date(d + 'T00:00:00Z');
|
|
136
|
-
prevDate.setUTCDate(prevDate.getUTCDate() - 1);
|
|
137
|
-
prevDateStr = prevDate.toISOString().slice(0, 10);
|
|
138
|
-
if (prevDate >= DEFINITIVE_EARLIEST_DATES.absoluteEarliest) {
|
|
139
|
-
fetchPromises.push(fetchComputationStatus(prevDateStr, config, dependencies));
|
|
140
|
-
}
|
|
141
|
-
}
|
|
142
|
-
|
|
143
|
-
const results = await Promise.all(fetchPromises);
|
|
144
|
-
const dailyStatus = results[0];
|
|
145
|
-
const availability = results[1];
|
|
146
|
-
const prevDailyStatus = (prevDateStr && results[2]) ? results[2] : (prevDateStr ? {} : null);
|
|
147
|
-
|
|
148
|
-
const rootDataStatus = availability ? availability.status : {
|
|
149
|
-
hasPortfolio: false, hasHistory: false, hasSocial: false, hasInsights: false, hasPrices: false
|
|
150
|
-
};
|
|
151
|
-
|
|
152
|
-
const report = analyzeDateExecution(d, calcsInThisPass, rootDataStatus, dailyStatus, manifestMap, prevDailyStatus);
|
|
153
|
-
|
|
154
|
-
const statusUpdates = {};
|
|
155
|
-
report.impossible.forEach(item => {
|
|
156
|
-
if (dailyStatus[item.name]?.hash !== STATUS_IMPOSSIBLE) {
|
|
157
|
-
statusUpdates[item.name] = { hash: STATUS_IMPOSSIBLE, category: 'unknown', reason: item.reason };
|
|
158
|
-
}
|
|
159
|
-
});
|
|
160
|
-
report.blocked.forEach(item => {
|
|
161
|
-
statusUpdates[item.name] = { hash: false, category: 'unknown', reason: item.reason };
|
|
162
|
-
});
|
|
163
|
-
report.failedDependency.forEach(item => {
|
|
164
|
-
const missingStr = item.missing ? item.missing.join(', ') : 'unknown';
|
|
165
|
-
statusUpdates[item.name] = { hash: false, category: 'unknown', reason: `Dependency Missing: ${missingStr}` };
|
|
166
|
-
});
|
|
167
|
-
|
|
168
|
-
if (Object.keys(statusUpdates).length > 0) {
|
|
169
|
-
await updateComputationStatus(d, statusUpdates, config, dependencies);
|
|
170
|
-
}
|
|
171
|
-
|
|
172
|
-
const validToRun = [...report.runnable, ...report.reRuns];
|
|
173
|
-
|
|
174
|
-
await Promise.all(validToRun.map(item => forensicsLimit(async () => {
|
|
175
|
-
const compName = normalizeName(item.name);
|
|
176
|
-
|
|
177
|
-
// [UPDATED] Conditional Forensics
|
|
178
|
-
let requiredResource = 'standard';
|
|
179
|
-
if (shouldRunForensics) {
|
|
180
|
-
requiredResource = await checkCrashForensics(db, d, passToRun, compName);
|
|
181
|
-
}
|
|
182
|
-
|
|
183
|
-
const uniqueDispatchId = crypto.randomUUID();
|
|
184
|
-
tasksToDispatch.push({
|
|
185
|
-
action: 'RUN_COMPUTATION_DATE',
|
|
186
|
-
dispatchId: uniqueDispatchId,
|
|
187
|
-
date: d,
|
|
188
|
-
pass: passToRun,
|
|
189
|
-
computation: compName,
|
|
190
|
-
hash: item.hash || item.newHash,
|
|
191
|
-
previousCategory: item.previousCategory || null,
|
|
192
|
-
triggerReason: item.reason || "Unknown",
|
|
193
|
-
dependencyResultHashes: item.dependencyResultHashes || {},
|
|
194
|
-
timestamp: Date.now(),
|
|
195
|
-
resources: requiredResource
|
|
196
|
-
});
|
|
197
|
-
})));
|
|
198
|
-
|
|
199
|
-
logger.log('INFO', `[Dispatcher] Analyzed ${d}: ${validToRun.length} tasks (Cumulative: ${tasksToDispatch.length})`);
|
|
200
|
-
|
|
201
|
-
} catch (e) {
|
|
202
|
-
logger.log('ERROR', `[Dispatcher] Failed analysis for ${d}: ${e.message}`);
|
|
63
|
+
const dirtyDates = [];
|
|
64
|
+
for (const d of allDates) {
|
|
65
|
+
const dailyStatus = await fetchComputationStatus(d, config, dependencies);
|
|
66
|
+
const availability = await checkRootDataAvailability(d, config, dependencies, DEFINITIVE_EARLIEST_DATES);
|
|
67
|
+
|
|
68
|
+
const report = analyzeDateExecution(d, calcsInThisPass, availability.status, dailyStatus, manifestMap, null);
|
|
69
|
+
const tasks = [...report.runnable, ...report.reRuns];
|
|
70
|
+
|
|
71
|
+
if (tasks.length > 0) {
|
|
72
|
+
dirtyDates.push({ date: d, tasks });
|
|
203
73
|
}
|
|
204
|
-
}
|
|
205
|
-
|
|
206
|
-
await Promise.all(analysisPromises);
|
|
74
|
+
}
|
|
207
75
|
|
|
208
|
-
|
|
76
|
+
let selectedDate = null;
|
|
77
|
+
let selectedTasks = [];
|
|
78
|
+
let isReroute = false;
|
|
79
|
+
let isSweep = false;
|
|
209
80
|
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
const
|
|
213
|
-
const
|
|
81
|
+
// 2. Logic: Prioritize Reroute for N-1
|
|
82
|
+
if (targetCursorN > 1 && (targetCursorN - 2) < dirtyDates.length) {
|
|
83
|
+
const prevEntry = dirtyDates[targetCursorN - 2];
|
|
84
|
+
const reroutes = await getHighMemReroutes(db, prevEntry.date, passToRun, prevEntry.tasks);
|
|
214
85
|
|
|
215
|
-
if (
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
totalTasks: tasksToDispatch.length,
|
|
221
|
-
remainingTasks: tasksToDispatch.length,
|
|
222
|
-
callbackUrl: callbackUrl,
|
|
223
|
-
status: 'IN_PROGRESS'
|
|
224
|
-
});
|
|
225
|
-
logger.log('INFO', `[Dispatcher] 🏁 Run State Initialized: ${runId}. Tasks: ${tasksToDispatch.length}`);
|
|
86
|
+
if (reroutes.length > 0) {
|
|
87
|
+
selectedDate = prevEntry.date;
|
|
88
|
+
selectedTasks = reroutes;
|
|
89
|
+
isReroute = true;
|
|
90
|
+
logger.log('INFO', `[Dispatcher] Reroute detected for ${selectedDate}. Pausing N increment.`);
|
|
226
91
|
}
|
|
92
|
+
}
|
|
227
93
|
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
const doc = await t.get(ledgerRef);
|
|
241
|
-
if (doc.exists) {
|
|
242
|
-
const data = doc.data();
|
|
243
|
-
if (data.status === 'COMPLETED') return false;
|
|
244
|
-
}
|
|
245
|
-
t.set(ledgerRef, {
|
|
246
|
-
status: 'PENDING',
|
|
247
|
-
dispatchId: task.dispatchId,
|
|
248
|
-
runId: task.runId,
|
|
249
|
-
computation: task.computation,
|
|
250
|
-
expectedHash: task.hash || 'unknown',
|
|
251
|
-
createdAt: new Date(),
|
|
252
|
-
dispatcherHash: currentManifestHash,
|
|
253
|
-
triggerReason: task.triggerReason,
|
|
254
|
-
resources: task.resources,
|
|
255
|
-
retries: 0
|
|
256
|
-
}, { merge: true });
|
|
257
|
-
return true;
|
|
258
|
-
});
|
|
259
|
-
finalDispatched.push(task);
|
|
260
|
-
} catch (txnErr) {
|
|
261
|
-
logger.log('WARN', `[Dispatcher] Transaction failed for ${task.computation}: ${txnErr.message}`);
|
|
94
|
+
// 3. Logic: N-th Dirty Date or Final Sweep
|
|
95
|
+
if (!selectedDate) {
|
|
96
|
+
if (targetCursorN <= dirtyDates.length) {
|
|
97
|
+
const entry = dirtyDates[targetCursorN - 1];
|
|
98
|
+
selectedDate = entry.date;
|
|
99
|
+
selectedTasks = entry.tasks;
|
|
100
|
+
} else {
|
|
101
|
+
// Final Satiation Sweep: Check if anything was missed (recovery)
|
|
102
|
+
if (dirtyDates.length > 0) {
|
|
103
|
+
isSweep = true;
|
|
104
|
+
selectedDate = dirtyDates[0].date;
|
|
105
|
+
selectedTasks = dirtyDates[0].tasks;
|
|
262
106
|
}
|
|
263
|
-
}
|
|
264
|
-
|
|
265
|
-
await Promise.all(txnPromises);
|
|
107
|
+
}
|
|
108
|
+
}
|
|
266
109
|
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
110
|
+
// 4. Signal Pass Completion
|
|
111
|
+
if (!selectedDate) {
|
|
112
|
+
return { status: 'MOVE_TO_NEXT_PASS', dispatched: 0, etaSeconds: 0 };
|
|
113
|
+
}
|
|
270
114
|
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
topicName: config.computationTopicStandard || 'computation-tasks',
|
|
275
|
-
tasks: standardTasks,
|
|
276
|
-
taskType: `computation-pass-${passToRun}-std`,
|
|
277
|
-
maxPubsubBatchSize: 100
|
|
278
|
-
});
|
|
279
|
-
}
|
|
115
|
+
// 5. Dispatch to PubSub (Standard vs. High-Mem)
|
|
116
|
+
const standardTasks = selectedTasks.filter(t => t.resources !== 'high-mem').map(t => ({ ...t, date: selectedDate, pass: passToRun }));
|
|
117
|
+
const highMemTasks = selectedTasks.filter(t => t.resources === 'high-mem').map(t => ({ ...t, date: selectedDate, pass: passToRun }));
|
|
280
118
|
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
});
|
|
289
|
-
}
|
|
290
|
-
|
|
291
|
-
return { dispatched: finalDispatched.length, runId };
|
|
292
|
-
} else {
|
|
293
|
-
logger.log('INFO', `[Dispatcher] All tasks were already COMPLETED.`);
|
|
294
|
-
return { dispatched: 0 };
|
|
295
|
-
}
|
|
296
|
-
|
|
297
|
-
} else {
|
|
298
|
-
logger.log('INFO', `[Dispatcher] No valid tasks found (Up to date).`);
|
|
299
|
-
return { dispatched: 0 };
|
|
119
|
+
const pubPromises = [];
|
|
120
|
+
if (standardTasks.length > 0) {
|
|
121
|
+
pubPromises.push(pubsubUtils.batchPublishTasks(dependencies, {
|
|
122
|
+
topicName: config.computationTopicStandard || 'computation-tasks',
|
|
123
|
+
tasks: standardTasks,
|
|
124
|
+
taskType: `pass-${passToRun}-std`
|
|
125
|
+
}));
|
|
300
126
|
}
|
|
127
|
+
if (highMemTasks.length > 0) {
|
|
128
|
+
pubPromises.push(pubsubUtils.batchPublishTasks(dependencies, {
|
|
129
|
+
topicName: config.computationTopicHighMem || 'computation-tasks-highmem',
|
|
130
|
+
tasks: highMemTasks,
|
|
131
|
+
taskType: `pass-${passToRun}-high`
|
|
132
|
+
}));
|
|
133
|
+
}
|
|
134
|
+
await Promise.all(pubPromises);
|
|
135
|
+
|
|
136
|
+
const etaSeconds = Math.max(20, selectedTasks.length * SECONDS_PER_CALC_MARGIN);
|
|
137
|
+
|
|
138
|
+
logger.log('INFO', `[Dispatcher] ${isReroute ? 'Reroute' : (isSweep ? 'Sweep' : 'Standard')} Run: ${selectedDate}. Tasks: ${selectedTasks.length}. ETA: ${etaSeconds}s`);
|
|
139
|
+
|
|
140
|
+
return {
|
|
141
|
+
status: isSweep ? 'RECOVERY' : 'CONTINUE_PASS',
|
|
142
|
+
dateProcessed: selectedDate,
|
|
143
|
+
dispatched: selectedTasks.length,
|
|
144
|
+
n_cursor_ignored: isReroute, // Tell workflow to stay on same N
|
|
145
|
+
etaSeconds: etaSeconds
|
|
146
|
+
};
|
|
301
147
|
}
|
|
302
148
|
|
|
303
149
|
module.exports = { dispatchComputationPass };
|
|
@@ -1,8 +1,7 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* FILENAME: computation-system/helpers/computation_worker.js
|
|
3
3
|
* PURPOSE: Consumes tasks, executes logic, and signals Workflow upon Batch Completion.
|
|
4
|
-
* UPDATED:
|
|
5
|
-
* UPDATED: Implements Peak Memory Heartbeat and Resource Tier tracking.
|
|
4
|
+
* UPDATED: Added "Contention-Aware Retry" for the Batch Counter to fix ABORTED errors.
|
|
6
5
|
*/
|
|
7
6
|
|
|
8
7
|
const { executeDispatchTask } = require('../WorkflowOrchestrator.js');
|
|
@@ -99,24 +98,56 @@ async function triggerWorkflowCallback(url, status, logger) {
|
|
|
99
98
|
}
|
|
100
99
|
|
|
101
100
|
/**
|
|
102
|
-
* Helper: Decrements 'remainingTasks' in Firestore.
|
|
101
|
+
* [UPDATED] Helper: Decrements 'remainingTasks' in Firestore.
|
|
102
|
+
* NOW INCLUDES CONTENTION RETRY LOGIC (The "Sentinel" Fix)
|
|
103
103
|
*/
|
|
104
104
|
async function decrementAndCheck(db, metaStatePath, logger) {
|
|
105
105
|
if (!metaStatePath) return null;
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
const
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
106
|
+
|
|
107
|
+
const MAX_CONTENTION_RETRIES = 10;
|
|
108
|
+
let attempt = 0;
|
|
109
|
+
|
|
110
|
+
while (attempt < MAX_CONTENTION_RETRIES) {
|
|
111
|
+
try {
|
|
112
|
+
const result = await db.runTransaction(async (t) => {
|
|
113
|
+
const ref = db.doc(metaStatePath);
|
|
114
|
+
const doc = await t.get(ref);
|
|
115
|
+
if (!doc.exists) return null;
|
|
116
|
+
|
|
117
|
+
const data = doc.data();
|
|
118
|
+
// Safety: Don't decrement below zero
|
|
119
|
+
const currentRemaining = data.remainingTasks || 0;
|
|
120
|
+
if (currentRemaining <= 0) return { remaining: 0, callbackUrl: data.callbackUrl };
|
|
121
|
+
|
|
122
|
+
const newRemaining = currentRemaining - 1;
|
|
123
|
+
t.update(ref, { remainingTasks: newRemaining, lastUpdated: new Date() });
|
|
124
|
+
|
|
125
|
+
return { remaining: newRemaining, callbackUrl: data.callbackUrl };
|
|
126
|
+
});
|
|
127
|
+
|
|
128
|
+
// Success! Check if we are the "Sentinel" (the last one)
|
|
129
|
+
if (result && result.remaining <= 0) return result.callbackUrl;
|
|
130
|
+
return null; // We decremented successfully, but weren't the last one.
|
|
131
|
+
|
|
132
|
+
} catch (e) {
|
|
133
|
+
// Check if it's a contention error (ABORTED/10 or DEADLINE_EXCEEDED/4)
|
|
134
|
+
const isContention = e.code === 10 || e.code === 4 || (e.message && e.message.includes('contention'));
|
|
135
|
+
|
|
136
|
+
if (isContention) {
|
|
137
|
+
attempt++;
|
|
138
|
+
// JITTER: Random delay between 50ms and 500ms to desynchronize the herd
|
|
139
|
+
const delay = Math.floor(Math.random() * 450) + 50;
|
|
140
|
+
logger.log('WARN', `[Worker] Batch counter contention (Attempt ${attempt}/${MAX_CONTENTION_RETRIES}). Retrying in ${delay}ms...`);
|
|
141
|
+
await new Promise(r => setTimeout(r, delay));
|
|
142
|
+
} else {
|
|
143
|
+
// Fatal error (permission, etc)
|
|
144
|
+
logger.log('ERROR', `[Worker] Fatal error decrementing batch counter: ${e.message}`);
|
|
145
|
+
return null;
|
|
146
|
+
}
|
|
147
|
+
}
|
|
119
148
|
}
|
|
149
|
+
|
|
150
|
+
logger.log('ERROR', `[Worker] Failed to decrement batch counter after ${MAX_CONTENTION_RETRIES} attempts. The count will be inaccurate.`);
|
|
120
151
|
return null;
|
|
121
152
|
}
|
|
122
153
|
|
|
@@ -48,7 +48,7 @@ async function loadDataByRefs(config, deps, refs) {
|
|
|
48
48
|
const { withRetry } = calculationUtils;
|
|
49
49
|
if (!refs || !refs.length) return {};
|
|
50
50
|
const mergedPortfolios = {};
|
|
51
|
-
const batchSize = config.partRefBatchSize ||
|
|
51
|
+
const batchSize = config.partRefBatchSize || 10;
|
|
52
52
|
for (let i = 0; i < refs.length; i += batchSize) {
|
|
53
53
|
const batchRefs = refs.slice(i, i + batchSize);
|
|
54
54
|
const snapshots = await withRetry(() => db.getAll(...batchRefs), `getAll(batch ${Math.floor(i / batchSize)})`);
|
|
@@ -145,7 +145,7 @@ async function* streamPortfolioData(config, deps, dateString, providedRefs = nul
|
|
|
145
145
|
const { logger } = deps;
|
|
146
146
|
const refs = providedRefs || (await getPortfolioPartRefs(config, deps, dateString));
|
|
147
147
|
if (refs.length === 0) { logger.log('WARN', `[streamPortfolioData] No portfolio refs found for ${dateString}. Stream is empty.`); return; }
|
|
148
|
-
const batchSize = config.partRefBatchSize ||
|
|
148
|
+
const batchSize = config.partRefBatchSize || 10;
|
|
149
149
|
logger.log('INFO', `[streamPortfolioData] Streaming ${refs.length} portfolio parts in chunks of ${batchSize}...`);
|
|
150
150
|
for (let i = 0; i < refs.length; i += batchSize) {
|
|
151
151
|
const batchRefs = refs.slice(i, i + batchSize);
|
|
@@ -160,7 +160,7 @@ async function* streamHistoryData(config, deps, dateString, providedRefs = null)
|
|
|
160
160
|
const { logger } = deps;
|
|
161
161
|
const refs = providedRefs || (await getHistoryPartRefs(config, deps, dateString));
|
|
162
162
|
if (refs.length === 0) { logger.log('WARN', `[streamHistoryData] No history refs found for ${dateString}. Stream is empty.`); return; }
|
|
163
|
-
const batchSize = config.partRefBatchSize ||
|
|
163
|
+
const batchSize = config.partRefBatchSize || 10;
|
|
164
164
|
logger.log('INFO', `[streamHistoryData] Streaming ${refs.length} history parts in chunks of ${batchSize}...`);
|
|
165
165
|
for (let i = 0; i < refs.length; i += batchSize) {
|
|
166
166
|
const batchRefs = refs.slice(i, i + batchSize);
|
|
@@ -1,148 +1,53 @@
|
|
|
1
|
-
# Cloud Workflows
|
|
2
|
-
# Orchestrates 5 sequential passes using Event-Driven Callbacks (Zero Polling).
|
|
3
|
-
# UPDATED: Passes 'attempt' count to Dispatcher to trigger Smart Forensics on retries.
|
|
4
|
-
|
|
1
|
+
# Cloud Workflows: Precision Cursor-Based Orchestrator
|
|
5
2
|
main:
|
|
6
3
|
params: [input]
|
|
7
4
|
steps:
|
|
8
5
|
- init:
|
|
9
6
|
assign:
|
|
10
7
|
- project: ${sys.get_env("GOOGLE_CLOUD_PROJECT_ID")}
|
|
11
|
-
- location: "europe-west1"
|
|
12
|
-
|
|
13
|
-
# T-1 Date Logic (Process Yesterday)
|
|
14
|
-
- now: ${sys.now()}
|
|
15
|
-
- yesterday_timestamp: ${now - 86400}
|
|
16
|
-
- yesterday_str: ${text.substring(time.format(yesterday_timestamp), 0, 10)}
|
|
17
|
-
- date_to_run: ${default(map.get(input, "date"), yesterday_str)}
|
|
18
|
-
|
|
19
|
-
# Configuration Variables
|
|
20
8
|
- passes: ["1", "2", "3", "4", "5"]
|
|
21
|
-
-
|
|
9
|
+
- date_to_run: ${default(map.get(input, "date"), "2025-01-01")}
|
|
22
10
|
|
|
23
|
-
|
|
24
|
-
# MAIN LOOP: Iterate through Passes 1 to 5
|
|
25
|
-
# ======================================================
|
|
26
|
-
- run_passes:
|
|
11
|
+
- run_sequential_passes:
|
|
27
12
|
for:
|
|
28
13
|
value: pass_id
|
|
29
14
|
in: ${passes}
|
|
30
15
|
steps:
|
|
31
|
-
-
|
|
16
|
+
- init_cursor:
|
|
32
17
|
assign:
|
|
33
|
-
-
|
|
34
|
-
-
|
|
35
|
-
# URL of the Cloud Function acting as the Dispatcher
|
|
36
|
-
- dispatcher_url: ${"https://europe-west1-" + project + ".cloudfunctions.net/computation-pass-" + pass_id}
|
|
18
|
+
- n_cursor: 1
|
|
19
|
+
- pass_complete: false
|
|
37
20
|
|
|
38
|
-
|
|
39
|
-
# RETRY LOOP: Try to complete the pass up to 3 times
|
|
40
|
-
# -----------------------------------------------
|
|
41
|
-
- pass_retry_loop:
|
|
21
|
+
- sequential_date_loop:
|
|
42
22
|
switch:
|
|
43
|
-
- condition: ${
|
|
23
|
+
- condition: ${not pass_complete}
|
|
44
24
|
steps:
|
|
45
|
-
-
|
|
46
|
-
assign:
|
|
47
|
-
- attempt_count: ${attempt_count + 1}
|
|
48
|
-
|
|
49
|
-
# 1. GENERATE CALLBACK ENDPOINT
|
|
50
|
-
- create_callback:
|
|
51
|
-
call: events.create_callback_endpoint
|
|
52
|
-
args:
|
|
53
|
-
http_callback_method: "POST"
|
|
54
|
-
result: callback_details
|
|
55
|
-
|
|
56
|
-
- extract_callback_url:
|
|
57
|
-
assign:
|
|
58
|
-
- callback_url: ${callback_details.url}
|
|
59
|
-
|
|
60
|
-
- log_start:
|
|
61
|
-
call: sys.log
|
|
62
|
-
args:
|
|
63
|
-
text: ${"Starting Pass " + pass_id + " (Attempt " + attempt_count + ") for " + date_to_run + ". Waiting for signal at " + callback_url}
|
|
64
|
-
severity: "INFO"
|
|
65
|
-
|
|
66
|
-
# 2. TRIGGER DISPATCHER
|
|
67
|
-
- trigger_dispatcher:
|
|
25
|
+
- call_dispatcher:
|
|
68
26
|
call: http.post
|
|
69
27
|
args:
|
|
70
|
-
url: ${
|
|
28
|
+
url: ${"https://europe-west1-" + project + ".cloudfunctions.net/dispatch-pass-" + pass_id}
|
|
71
29
|
body:
|
|
30
|
+
pass: ${pass_id}
|
|
31
|
+
cursorIndex: ${n_cursor}
|
|
72
32
|
date: ${date_to_run}
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
auth:
|
|
76
|
-
type: OIDC
|
|
77
|
-
timeout: 1800 # 30 mins max for dispatch analysis
|
|
78
|
-
result: dispatch_response
|
|
33
|
+
auth: { type: OIDC }
|
|
34
|
+
result: dispatch_res
|
|
79
35
|
|
|
80
|
-
|
|
81
|
-
- check_immediate_completion:
|
|
36
|
+
- evaluate_dispatch:
|
|
82
37
|
switch:
|
|
83
|
-
|
|
38
|
+
# State 1: Dispatcher signals pass is fully completed
|
|
39
|
+
- condition: ${dispatch_res.body.status == "MOVE_TO_NEXT_PASS"}
|
|
40
|
+
assign: [pass_complete: true]
|
|
41
|
+
|
|
42
|
+
# State 2: Tasks dispatched (Standard, Reroute, or Recovery)
|
|
43
|
+
- condition: ${dispatch_res.body.dispatched > 0}
|
|
84
44
|
steps:
|
|
85
|
-
-
|
|
86
|
-
call: sys.
|
|
87
|
-
args:
|
|
88
|
-
|
|
89
|
-
- mark_success_empty:
|
|
45
|
+
- wait_for_eta:
|
|
46
|
+
call: sys.sleep
|
|
47
|
+
args: { seconds: ${dispatch_res.body.etaSeconds} }
|
|
48
|
+
- update_cursor:
|
|
90
49
|
assign:
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
# 4. WAIT FOR WORKER SIGNAL
|
|
96
|
-
- wait_for_completion:
|
|
97
|
-
call: events.await_callback
|
|
98
|
-
args:
|
|
99
|
-
callback: ${callback_details}
|
|
100
|
-
timeout: 10800 # Reduced to 3h to fail faster if stalled
|
|
101
|
-
result: callback_request
|
|
102
|
-
|
|
103
|
-
# 5. PROCESS SIGNAL
|
|
104
|
-
- parse_signal:
|
|
105
|
-
assign:
|
|
106
|
-
- signal_data: ${callback_request.http_request.body}
|
|
107
|
-
|
|
108
|
-
- evaluate_signal:
|
|
109
|
-
switch:
|
|
110
|
-
- condition: ${signal_data.status == "SUCCESS"}
|
|
111
|
-
steps:
|
|
112
|
-
- log_success:
|
|
113
|
-
call: sys.log
|
|
114
|
-
args:
|
|
115
|
-
text: ${"Pass " + pass_id + " signaled COMPLETION via Callback."}
|
|
116
|
-
severity: "INFO"
|
|
117
|
-
- mark_success:
|
|
118
|
-
assign:
|
|
119
|
-
- pass_success: true
|
|
120
|
-
- proceed:
|
|
121
|
-
next: pass_retry_loop
|
|
122
|
-
|
|
123
|
-
- condition: ${signal_data.status == "FAILURE"}
|
|
124
|
-
steps:
|
|
125
|
-
- log_failure:
|
|
126
|
-
call: sys.log
|
|
127
|
-
args:
|
|
128
|
-
text: ${"Pass " + pass_id + " signaled FAILURE (DLQ limit hit). Retrying pass."}
|
|
129
|
-
severity: "WARNING"
|
|
130
|
-
- retry_pass:
|
|
131
|
-
next: pass_retry_loop
|
|
132
|
-
|
|
133
|
-
# -----------------------------------------------
|
|
134
|
-
# END RETRY LOOP
|
|
135
|
-
# -----------------------------------------------
|
|
136
|
-
|
|
137
|
-
- check_final_status:
|
|
138
|
-
switch:
|
|
139
|
-
- condition: ${not pass_success}
|
|
140
|
-
steps:
|
|
141
|
-
- log_giving_up:
|
|
142
|
-
call: sys.log
|
|
143
|
-
args:
|
|
144
|
-
text: ${"Pass " + pass_id + " failed after " + max_retries + " attempts. Proceeding with potential gaps."}
|
|
145
|
-
severity: "ERROR"
|
|
146
|
-
|
|
147
|
-
- finish:
|
|
148
|
-
return: "Pipeline Execution Complete"
|
|
50
|
+
# If it was a reroute, stay on the same N to retry the sequence
|
|
51
|
+
- n_cursor: ${if(dispatch_res.body.n_cursor_ignored, n_cursor, n_cursor + 1)}
|
|
52
|
+
- next_step:
|
|
53
|
+
next: sequential_date_loop
|