bulltrackers-module 1.0.292 → 1.0.293
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/functions/computation-system/context/ManifestBuilder.js +5 -5
- package/functions/computation-system/helpers/computation_dispatcher.js +91 -37
- package/functions/computation-system/helpers/computation_worker.js +73 -75
- package/functions/computation-system/tools/BuildReporter.js +5 -5
- package/functions/computation-system/workflows/bulltrackers_pipeline.yaml +10 -8
- package/package.json +1 -1
package/functions/computation-system/context/ManifestBuilder.js:

@@ -89,12 +89,12 @@ function getDependencySet(endpoints, adjacencyList) {
  * Returns a string description of the first cycle found.
  */
 function detectCircularDependencies(manifestMap) {
-    let index
-    const stack
-    const indices
+    let index = 0;
+    const stack = [];
+    const indices = new Map();
     const lowLinks = new Map();
-    const onStack
-    const cycles
+    const onStack = new Set();
+    const cycles = [];
 
     function strongconnect(v) {
         indices.set(v, index);
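
The variables restored in this hunk (index, stack, indices, lowLinks, onStack) are the standard bookkeeping of Tarjan's strongly-connected-components algorithm, which the surrounding strongconnect function points to. As a reading aid, here is a minimal standalone sketch of how detectCircularDependencies plausibly uses them — an assumption based on this hunk only, with a hypothetical graph input (a Map of node name to dependency names), not the package's actual code:

    // Sketch: Tarjan's SCC over a dependency graph; any SCC with more than
    // one node contains a cycle (self-edges would need an extra check).
    function findCycles(graph) {
        let index = 0;
        const stack = [];
        const indices = new Map();
        const lowLinks = new Map();
        const onStack = new Set();
        const cycles = [];

        function strongconnect(v) {
            indices.set(v, index);
            lowLinks.set(v, index);
            index++;
            stack.push(v);
            onStack.add(v);

            for (const w of graph.get(v) || []) {
                if (!indices.has(w)) {
                    strongconnect(w);
                    lowLinks.set(v, Math.min(lowLinks.get(v), lowLinks.get(w)));
                } else if (onStack.has(w)) {
                    lowLinks.set(v, Math.min(lowLinks.get(v), indices.get(w)));
                }
            }

            if (lowLinks.get(v) === indices.get(v)) {
                const component = [];
                let w;
                do {
                    w = stack.pop();
                    onStack.delete(w);
                    component.push(w);
                } while (w !== v);
                if (component.length > 1) cycles.push(component);
            }
        }

        for (const v of graph.keys()) {
            if (!indices.has(v)) strongconnect(v);
        }
        return cycles; // e.g. [['a', 'b']] for a <-> b
    }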
package/functions/computation-system/helpers/computation_dispatcher.js:

@@ -2,6 +2,7 @@
  * FILENAME: computation-system/helpers/computation_dispatcher.js
  * PURPOSE: "Smart Dispatcher" - Analyzes state, initializes Run Counters, and dispatches tasks.
  * UPDATED: Implements Callback Pattern. Initializes 'computation_runs' doc for worker coordination.
+ * UPDATED: Implements Forensic Crash Analysis & Intelligent Resource Routing.
  */
 
 const { getExpectedDateStrings, normalizeName, DEFINITIVE_EARLIEST_DATES } = require('../utils/utils.js');

@@ -13,12 +14,49 @@ const { generateCodeHash } = require('../topology/HashManag
 const pLimit = require('p-limit');
 const crypto = require('crypto');
 
-const
-
+const STATUS_IMPOSSIBLE = 'IMPOSSIBLE';
+
+// Threshold to trigger high-mem routing (e.g., 1.5 GB for a 2GB worker)
+const OOM_THRESHOLD_MB = 1500;
+
+/**
+ * [NEW] Forensics: Checks if the calculation crashed previously due to Memory.
+ * Reads the 'telemetry.lastMemory' from the audit ledger.
+ */
+async function checkCrashForensics(db, date, pass, computationName) {
+    try {
+        const ledgerPath = `computation_audit_ledger/${date}/passes/${pass}/tasks/${computationName}`;
+        const doc = await db.doc(ledgerPath).get();
+
+        if (!doc.exists) return 'standard';
+
+        const data = doc.data();
+
+        // Check if we have telemetry from a previous run
+        if (data.telemetry && data.telemetry.lastMemory) {
+            const lastRSS = data.telemetry.lastMemory.rssMB || 0;
+
+            if (lastRSS > OOM_THRESHOLD_MB) {
+                console.log(`[Dispatcher] 🕵️♀️ Forensics: ${computationName} likely OOM'd at ${lastRSS}MB. Routing to HIGH-MEM.`);
+                return 'high-mem';
+            }
+        }
+
+        // Also check if it's explicitly marked FAILED with 'Memory' in error
+        if (data.status === 'FAILED' && data.error && /memory/i.test(data.error)) {
+            return 'high-mem';
+        }
+
+    } catch (e) {
+        console.warn(`[Dispatcher] Forensics check failed for ${computationName}: ${e.message}`);
+    }
+
+    return 'standard';
+}
 
 /**
  * Dispatches computation tasks for a specific pass.
- * @param {Object} config - System config
+ * @param {Object} config - System config (Injected with topics)
  * @param {Object} dependencies - { db, logger, ... }
  * @param {Array} computationManifest - List of calculations
 * @param {Object} reqBody - (Optional) HTTP Body containing 'callbackUrl' and 'date'
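
For orientation, the audit-ledger document that checkCrashForensics reads looks roughly like this after a crashed run. The shape is inferred from this diff (the worker's heartbeat writes telemetry.lastMemory, shown further below); the values are hypothetical:

    // computation_audit_ledger/{date}/passes/{pass}/tasks/{computation}
    // rssMB above OOM_THRESHOLD_MB (1500) => routed to 'high-mem';
    // a FAILED status whose error matches /memory/i also triggers the route.
    {
        status: 'FAILED',
        error: 'JavaScript heap out of memory',
        telemetry: {
            lastHeartbeat: '2024-01-15T09:30:02Z',
            lastMemory: { rssMB: 1720, heapUsedMB: 1540, timestamp: '2024-01-15T09:30:02Z' }
        }
    }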
@@ -28,7 +66,7 @@ async function dispatchComputationPass(config, dependencies, computationManifest
     const pubsubUtils = new PubSubUtils(dependencies);
     const passToRun = String(config.COMPUTATION_PASS_TO_RUN);
 
-    //
+    // Extract Date and Callback from request body (pushed by Workflow)
     const dateStr = reqBody.date || config.date;
     const callbackUrl = reqBody.callbackUrl || null;
 

@@ -44,18 +82,14 @@ async function dispatchComputationPass(config, dependencies, computationManifest
 
     if (!calcsInThisPass.length) { return logger.log('WARN', `[Dispatcher] No calcs for Pass ${passToRun}. Exiting.`); }
 
-    const calcNames = calcsInThisPass.map(c => c.name);
     logger.log('INFO', `🚀 [Dispatcher] Smart-Dispatching PASS ${passToRun} for ${dateStr}`);
 
-    // -- DATE ANALYSIS LOGIC
-    const passEarliestDate = Object.values(DEFINITIVE_EARLIEST_DATES).reduce((a, b) => a < b ? a : b);
-    const endDateUTC = new Date(Date.UTC(new Date().getUTCFullYear(), new Date().getUTCMonth(), new Date().getUTCDate() - 1));
-
-    // We only analyze the specific requested date to keep dispatch fast for the workflow
+    // -- DATE ANALYSIS LOGIC --
     const allExpectedDates = [dateStr];
-
     const manifestMap = new Map(computationManifest.map(c => [normalizeName(c.name), c]));
     const tasksToDispatch = [];
+
+    // Concurrency limit for analysis & forensics
     const limit = pLimit(20);
 
     const analysisPromises = allExpectedDates.map(d => limit(async () => {

@@ -87,18 +121,16 @@ async function dispatchComputationPass(config, dependencies, computationManifest
 
         const report = analyzeDateExecution(d, calcsInThisPass, rootDataStatus, dailyStatus, manifestMap, prevDailyStatus);
 
+        // Handle Status Updates (Impossible / Blocked)
         const statusUpdates = {};
-
         report.impossible.forEach(item => {
             if (dailyStatus[item.name]?.hash !== STATUS_IMPOSSIBLE) {
                 statusUpdates[item.name] = { hash: STATUS_IMPOSSIBLE, category: 'unknown', reason: item.reason };
             }
         });
-
         report.blocked.forEach(item => {
             statusUpdates[item.name] = { hash: false, category: 'unknown', reason: item.reason };
         });
-
         report.failedDependency.forEach(item => {
             const missingStr = item.missing ? item.missing.join(', ') : 'unknown';
             statusUpdates[item.name] = { hash: false, category: 'unknown', reason: `Dependency Missing: ${missingStr}` };

@@ -109,21 +141,29 @@ async function dispatchComputationPass(config, dependencies, computationManifest
         }
 
         const validToRun = [...report.runnable, ...report.reRuns];
-
+
+        // [NEW] Parallel Forensics Check
+        await Promise.all(validToRun.map(item => limit(async () => {
+            const compName = normalizeName(item.name);
+
+            // 1. Determine Resource Requirements
+            const requiredResource = await checkCrashForensics(db, d, passToRun, compName);
+
             const uniqueDispatchId = crypto.randomUUID();
             tasksToDispatch.push({
                 action: 'RUN_COMPUTATION_DATE',
                 dispatchId: uniqueDispatchId,
                 date: d,
                 pass: passToRun,
-                computation:
+                computation: compName,
                 hash: item.hash || item.newHash,
                 previousCategory: item.previousCategory || null,
                 triggerReason: item.reason || "Unknown",
                 dependencyResultHashes: item.dependencyResultHashes || {},
-                timestamp: Date.now()
+                timestamp: Date.now(),
+                resources: requiredResource // 'standard' or 'high-mem'
             });
-        });
+        })));
 
     } catch (e) {
         logger.log('ERROR', `[Dispatcher] Failed analysis for ${d}: ${e.message}`);

@@ -132,10 +172,9 @@ async function dispatchComputationPass(config, dependencies, computationManifest
 
     await Promise.all(analysisPromises);
 
-    // --
+    // -- CALLBACK & COUNTER INITIALIZATION --
 
     if (tasksToDispatch.length > 0) {
-        logger.log('INFO', `[Dispatcher] 📝 Preparing ${tasksToDispatch.length} tasks for execution...`);
 
         // 1. Initialize Shared State Document (The Counter)
         const runId = crypto.randomUUID();

@@ -147,14 +186,14 @@ async function dispatchComputationPass(config, dependencies, computationManifest
             date: dateStr,
             pass: passToRun,
             totalTasks: tasksToDispatch.length,
-            remainingTasks: tasksToDispatch.length,
-            callbackUrl: callbackUrl,
+            remainingTasks: tasksToDispatch.length,
+            callbackUrl: callbackUrl,
             status: 'IN_PROGRESS'
         });
-        logger.log('INFO', `[Dispatcher] 🏁 Run State Initialized: ${runId}`);
+        logger.log('INFO', `[Dispatcher] 🏁 Run State Initialized: ${runId}. Tasks: ${tasksToDispatch.length}`);
     }
 
-    // 2. Attach Run Metadata
+    // 2. Attach Run Metadata
     tasksToDispatch.forEach(task => {
         task.runId = runId;
         task.metaStatePath = callbackUrl ? metaStatePath : null;

@@ -180,12 +219,13 @@ async function dispatchComputationPass(config, dependencies, computationManifest
             t.set(ledgerRef, {
                 status: 'PENDING',
                 dispatchId: task.dispatchId,
-                runId: task.runId,
+                runId: task.runId,
                 computation: task.computation,
                 expectedHash: task.hash || 'unknown',
                 createdAt: new Date(),
                 dispatcherHash: currentManifestHash,
                 triggerReason: task.triggerReason,
+                resources: task.resources, // Log intended resource type
                 retries: 0
             }, { merge: true });
 

@@ -201,22 +241,36 @@ async function dispatchComputationPass(config, dependencies, computationManifest
 
     await Promise.all(txnPromises);
 
-    // 4. Publish to Pub/Sub
+    // 4. Publish to Pub/Sub (Segregated by Resources)
     if (finalDispatched.length > 0) {
-        logger.log('INFO', `[Dispatcher] ✅ Publishing ${finalDispatched.length} tasks to Pub/Sub...`);
 
-
-
-
-
-
-
+        const standardTasks = finalDispatched.filter(t => t.resources !== 'high-mem');
+        const highMemTasks = finalDispatched.filter(t => t.resources === 'high-mem');
+
+        // Publish Standard
+        if (standardTasks.length > 0) {
+            logger.log('INFO', `[Dispatcher] ✅ Publishing ${standardTasks.length} Standard tasks...`);
+            await pubsubUtils.batchPublishTasks(dependencies, {
+                topicName: config.computationTopicStandard || 'computation-tasks',
+                tasks: standardTasks,
+                taskType: `computation-pass-${passToRun}-std`,
+                maxPubsubBatchSize: 100
+            });
+        }
+
+        // Publish High-Mem
+        if (highMemTasks.length > 0) {
+            logger.log('INFO', `[Dispatcher] 🏋️♀️ Publishing ${highMemTasks.length} tasks to HIGH-MEM infrastructure.`);
+            await pubsubUtils.batchPublishTasks(dependencies, {
+                topicName: config.computationTopicHighMem || 'computation-tasks-highmem',
+                tasks: highMemTasks,
+                taskType: `computation-pass-${passToRun}-highmem`,
+                maxPubsubBatchSize: 100
+            });
+        }
 
-        // Return count so workflow knows to wait
         return { dispatched: finalDispatched.length, runId };
     } else {
-        // Edge Case: Analysis said "Run", but Ledger said "Already Done"
-        // We must update the state doc to 0 or delete it, OR return 0 so workflow doesn't wait.
         logger.log('INFO', `[Dispatcher] All tasks were already COMPLETED.`);
         return { dispatched: 0 };
     }
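
The routing above falls back to the topic names 'computation-tasks' and 'computation-tasks-highmem' when the config fields are absent. A sketch of the assumed config injection (field names taken from this hunk; presumably the high-mem topic is consumed by a worker deployment provisioned with more memory):

    // Hypothetical config passed into dispatchComputationPass; only these
    // fields are read by the publishing branch above.
    const config = {
        COMPUTATION_PASS_TO_RUN: '1',
        computationTopicStandard: 'computation-tasks',
        computationTopicHighMem: 'computation-tasks-highmem'
    };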
@@ -227,4 +281,4 @@ async function dispatchComputationPass(config, dependencies, computationManifest
     }
 }
 
-module.exports = { dispatchComputationPass };
+module.exports = { dispatchComputationPass };
package/functions/computation-system/helpers/computation_worker.js:

@@ -2,6 +2,7 @@
  * FILENAME: computation-system/helpers/computation_worker.js
  * PURPOSE: Consumes tasks, executes logic, and signals Workflow upon Batch Completion.
  * UPDATED: Implements IAM Auth for Workflow Callbacks.
+ * UPDATED: Implements Memory Heartbeat (Flight Recorder) for OOM detection.
  */
 
 const { executeDispatchTask } = require('../WorkflowOrchestrator.js');

@@ -9,7 +10,7 @@ const { getManifest } = require('../topology/ManifestLoader');
 const { StructuredLogger } = require('../logger/logger');
 const { recordRunAttempt } = require('../persistence/RunRecorder');
 const https = require('https');
-const { GoogleAuth } = require('google-auth-library');
+const { GoogleAuth } = require('google-auth-library');
 
 let calculationPackage;
 try { calculationPackage = require('aiden-shared-calculations-unified');

@@ -19,51 +20,68 @@ const calculations = calculationPackage.calculations;
 const MAX_RETRIES = 3;
 
 /**
- * [NEW] Helper:
- *
+ * [NEW] Helper: Starts a background heartbeat to track memory usage.
+ * This acts as a "Black Box Recorder". If the worker crashes (OOM),
+ * the last written value will remain in Firestore for the Dispatcher to analyze.
+ */
+function startMemoryHeartbeat(db, ledgerPath, intervalMs = 2000) {
+    const getMemStats = () => {
+        const mem = process.memoryUsage();
+        return {
+            rssMB: Math.round(mem.rss / 1024 / 1024), // Resident Set Size (OOM Killer Metric)
+            heapUsedMB: Math.round(mem.heapUsed / 1024 / 1024),
+            timestamp: new Date()
+        };
+    };
+
+    const timer = setInterval(async () => {
+        try {
+            const stats = getMemStats();
+            // Use update() to minimize payload size and avoid overwriting status
+            await db.doc(ledgerPath).update({
+                'telemetry.lastMemory': stats,
+                'telemetry.lastHeartbeat': new Date()
+            }).catch(() => {}); // Ignore write errors to prevent crashing the worker
+        } catch (e) {
+            // Silently fail on telemetry errors
+        }
+    }, intervalMs);
+
+    // Unref so this timer doesn't prevent the process from exiting naturally
+    timer.unref();
+
+    return timer;
+}
+
+/**
+ * Helper: Fires the webhook back to Google Cloud Workflows.
  */
 async function triggerWorkflowCallback(url, status, logger) {
     if (!url) return;
     logger.log('INFO', `[Worker] 🔔 BATCH COMPLETE! Triggering Workflow Callback: ${status}`);
 
     try {
-
-        const auth = new GoogleAuth({
-            scopes: ['https://www.googleapis.com/auth/cloud-platform']
-        });
+        const auth = new GoogleAuth({ scopes: ['https://www.googleapis.com/auth/cloud-platform'] });
         const client = await auth.getClient();
         const accessToken = await client.getAccessToken();
         const token = accessToken.token;
 
-        // 2. Send Authenticated Request
         return new Promise((resolve, reject) => {
-            const body = JSON.stringify({
-                status: status,
-                timestamp: new Date().toISOString()
-            });
+            const body = JSON.stringify({ status: status, timestamp: new Date().toISOString() });
 
             const req = https.request(url, {
                 method: 'POST',
                 headers: {
                     'Content-Type': 'application/json',
                     'Content-Length': Buffer.byteLength(body),
-                    'Authorization': `Bearer ${token}`
+                    'Authorization': `Bearer ${token}`
                 }
             }, (res) => {
-                if (res.statusCode >= 200 && res.statusCode < 300) {
-
-                } else {
-                    logger.log('WARN', `Callback responded with ${res.statusCode}`);
-                    // We resolve anyway to avoid crashing the worker logic
-                    resolve();
-                }
+                if (res.statusCode >= 200 && res.statusCode < 300) { resolve(); }
+                else { logger.log('WARN', `Callback responded with ${res.statusCode}`); resolve(); }
             });
 
-            req.on('error', (e) => {
-                logger.log('ERROR', `Failed to trigger callback: ${e.message}`);
-                resolve();
-            });
-
+            req.on('error', (e) => { logger.log('ERROR', `Failed to trigger callback: ${e.message}`); resolve(); });
             req.write(body);
             req.end();
         });
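
A detail worth noting in startMemoryHeartbeat above: the dotted-path update() writes only the two telemetry fields, so a concurrent status write is never clobbered, but update() also rejects if the ledger document does not yet exist, hence the .catch(() => {}). A merge-style alternative that tolerates a missing document would be (my sketch, not the package's code):

    // set() with { merge: true } creates the document if absent and
    // deep-merges the nested telemetry fields instead of failing.
    await db.doc(ledgerPath).set(
        { telemetry: { lastMemory: getMemStats(), lastHeartbeat: new Date() } },
        { merge: true }
    );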
@@ -73,37 +91,21 @@ async function triggerWorkflowCallback(url, status, logger) {
 }
 
 /**
- *
- * Returns the callbackUrl IF this was the last task.
+ * Helper: Decrements 'remainingTasks' in Firestore.
  */
 async function decrementAndCheck(db, metaStatePath, logger) {
     if (!metaStatePath) return null;
-
     try {
         const result = await db.runTransaction(async (t) => {
             const ref = db.doc(metaStatePath);
             const doc = await t.get(ref);
-
-            if (!doc.exists) return null; // State might have expired or been deleted
+            if (!doc.exists) return null;
             const data = doc.data();
-
             const newRemaining = (data.remainingTasks || 0) - 1;
-
-
-                remainingTasks: newRemaining,
-                lastUpdated: new Date()
-            });
-
-            // Return needed data only if we hit 0 (or lower, for safety)
-            return {
-                remaining: newRemaining,
-                callbackUrl: data.callbackUrl
-            };
+            t.update(ref, { remainingTasks: newRemaining, lastUpdated: new Date() });
+            return { remaining: newRemaining, callbackUrl: data.callbackUrl };
         });
-
-        if (result && result.remaining <= 0) {
-            return result.callbackUrl;
-        }
+        if (result && result.remaining <= 0) return result.callbackUrl;
     } catch (e) {
         logger.log('ERROR', `[Worker] Failed to decrement batch counter: ${e.message}`);
     }
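
The transactional decrement is what makes the workflow callback fire exactly once: only the task that drives remainingTasks to zero receives the callbackUrl. A hypothetical trace for a run with totalTasks = 3:

    // decrementAndCheck(...) per finishing task (order arbitrary):
    //   task A finishes -> remainingTasks 3 -> 2 -> returns null
    //   task B finishes -> remainingTasks 2 -> 1 -> returns null
    //   task C finishes -> remainingTasks 1 -> 0 -> returns callbackUrl,
    //   and that worker alone calls triggerWorkflowCallback.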
@@ -125,12 +127,12 @@ async function handleComputationTask(message, config, dependencies) {
 
     if (!data || data.action !== 'RUN_COMPUTATION_DATE') { return; }
 
-    // Extract fields including new metaStatePath
     const { date, pass, computation, previousCategory, triggerReason, dispatchId, dependencyResultHashes, metaStatePath } = data;
 
     if (!date || !pass || !computation) { logger.log('ERROR', `[Worker] Invalid payload.`, data); return; }
 
     const retryCount = message.deliveryAttempt || 1;
+    const ledgerPath = `computation_audit_ledger/${date}/passes/${pass}/tasks/${computation}`;
 
     // --- POISON MESSAGE HANDLING (DLQ) ---
     if (retryCount > MAX_RETRIES) {
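
One caveat on this retry gate: Pub/Sub only populates deliveryAttempt when the subscription has a dead-letter policy configured. Without one the field is absent, so the fallback pins every attempt to 1 and the poison-message branch below can never trip:

    // Without a dead-letter policy on the subscription:
    const retryCount = message.deliveryAttempt || 1; // always 1 -> MAX_RETRIES never exceeded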
@@ -144,36 +146,38 @@ async function handleComputationTask(message, config, dependencies) {
                 failureReason: 'MAX_RETRIES_EXCEEDED'
             });
 
-            await db.
+            await db.doc(ledgerPath).set({
                 status: 'FAILED',
                 error: 'Max Retries Exceeded (Poison Message)',
                 failedAt: new Date()
             }, { merge: true });
 
             const callbackUrl = await decrementAndCheck(db, metaStatePath, logger);
-            if (callbackUrl) {
-                await triggerWorkflowCallback(callbackUrl, 'SUCCESS', logger);
-            }
-
+            if (callbackUrl) { await triggerWorkflowCallback(callbackUrl, 'SUCCESS', logger); }
             return;
         } catch (dlqErr) { logger.log('FATAL', `[Worker] Failed to write to DLQ`, dlqErr); }
     }
 
     logger.log('INFO', `[Worker] 📥 Received Task: ${computation} (${date}) [Attempt ${retryCount}/${MAX_RETRIES}]`);
 
-    // Update Status to IN_PROGRESS
+    // 1. Update Status to IN_PROGRESS & Initialize Telemetry
     try {
-        await db.
+        await db.doc(ledgerPath).set({
             status: 'IN_PROGRESS',
             workerId: process.env.K_REVISION || 'unknown',
             startedAt: new Date(),
-            dispatchId: dispatchId
+            dispatchId: dispatchId,
+            telemetry: { startTime: new Date(), lastMemory: null } // Init for heartbeat
         }, { merge: true });
     } catch (leaseErr) {}
 
+    // 2. START HEARTBEAT (The Flight Recorder)
+    const heartbeatTimer = startMemoryHeartbeat(db, ledgerPath, 2000);
+
     let computationManifest;
     try { computationManifest = getManifest(config.activeProductLines || [], calculations, runDependencies);
     } catch (manifestError) {
+        clearInterval(heartbeatTimer); // Stop if we fail early
         logger.log('FATAL', `[Worker] Failed to load Manifest: ${manifestError.message}`);
         return;
     }

@@ -186,6 +190,9 @@ async function handleComputationTask(message, config, dependencies) {
         );
         const duration = Date.now() - startTime;
 
+        // STOP HEARTBEAT ON SUCCESS
+        clearInterval(heartbeatTimer);
+
         const failureReport = result?.updates?.failureReport || [];
         const successUpdates = result?.updates?.successUpdates || {};
 

@@ -194,26 +201,23 @@ async function handleComputationTask(message, config, dependencies) {
             throw new Error(failReason.error.message || 'Computation Logic Failed');
         }
         else {
-            if (Object.keys(successUpdates).length > 0) {
-
-            } else {
-                logger.log('WARN', `[Worker] ⚠️ Empty Result: ${computation}`);
-            }
+            if (Object.keys(successUpdates).length > 0) { logger.log('INFO', `[Worker] ✅ Stored: ${computation}`); }
+            else { logger.log('WARN', `[Worker] ⚠️ Empty Result: ${computation}`); }
 
-            await db.
+            await db.doc(ledgerPath).update({
                 status: 'COMPLETED',
                 completedAt: new Date()
             }).catch(() => {});
 
             await recordRunAttempt(db, { date, computation, pass }, 'SUCCESS', null, { durationMs: duration }, triggerReason);
 
-            // Decrement & Callback
             const callbackUrl = await decrementAndCheck(db, metaStatePath, logger);
-            if (callbackUrl) {
-                await triggerWorkflowCallback(callbackUrl, 'SUCCESS', logger);
-            }
+            if (callbackUrl) { await triggerWorkflowCallback(callbackUrl, 'SUCCESS', logger); }
         }
     } catch (err) {
+        // STOP HEARTBEAT ON ERROR
+        clearInterval(heartbeatTimer);
+
         // --- ERROR HANDLING ---
         const isDeterministicError = err.stage === 'SHARDING_LIMIT_EXCEEDED' ||
                                      err.stage === 'QUALITY_CIRCUIT_BREAKER' ||

@@ -231,7 +235,7 @@ async function handleComputationTask(message, config, dependencies) {
                     failureReason: 'PERMANENT_DETERMINISTIC_ERROR'
                 });
 
-                await db.
+                await db.doc(ledgerPath).set({
                     status: 'FAILED',
                     error: err.message || 'Permanent Deterministic Error',
                     failedAt: new Date()

@@ -240,23 +244,17 @@ async function handleComputationTask(message, config, dependencies) {
                 await recordRunAttempt(db, { date, computation, pass }, 'FAILURE', { message: err.message, stage: err.stage || 'PERMANENT_FAIL' }, { durationMs: 0 }, triggerReason);
 
                 const callbackUrl = await decrementAndCheck(db, metaStatePath, logger);
-                if (callbackUrl) {
-                    await triggerWorkflowCallback(callbackUrl, 'SUCCESS', logger);
-                }
-
+                if (callbackUrl) { await triggerWorkflowCallback(callbackUrl, 'SUCCESS', logger); }
                 return;
             } catch (dlqErr) { logger.log('FATAL', `[Worker] Failed to write to DLQ`, dlqErr); }
         }
 
-        if (retryCount >= MAX_RETRIES) {
-            throw err;
-        }
+        if (retryCount >= MAX_RETRIES) { throw err; }
 
         logger.log('ERROR', `[Worker] ❌ Crash: ${computation}: ${err.message}`);
         await recordRunAttempt(db, { date, computation, pass }, 'CRASH', { message: err.message, stack: err.stack, stage: 'SYSTEM_CRASH' }, { durationMs: 0 }, triggerReason);
-
         throw err;
     }
 }
 
-module.exports = { handleComputationTask };
+module.exports = { handleComputationTask };
package/functions/computation-system/tools/BuildReporter.js:

@@ -325,13 +325,13 @@ async function generateBuildReport(config, dependencies, manifest, daysBack = 90
         }
 
         // 3. BLOCKED / IMPOSSIBLE / UPTODATE
-        analysis.blocked.forEach(item => pushIfValid(dateSummary.blocked,
-        analysis.failedDependency.forEach(item => pushIfValid(dateSummary.blocked,
-        analysis.impossible.forEach(item => pushIfValid(dateSummary.impossible, item));
-        analysis.skipped.forEach(item => pushIfValid(dateSummary.uptodate,
+        analysis.blocked.forEach (item => pushIfValid(dateSummary.blocked, item));
+        analysis.failedDependency.forEach (item => pushIfValid(dateSummary.blocked, item, "Dependency Missing"));
+        analysis.impossible.forEach (item => pushIfValid(dateSummary.impossible, item));
+        analysis.skipped.forEach (item => pushIfValid(dateSummary.uptodate, item, "Up To Date"));
 
         // Meta stats
-        const includedCount = dateSummary.run.length
+        const includedCount = dateSummary.run.length + dateSummary.rerun.length + dateSummary.stable.length +
                               dateSummary.blocked.length + dateSummary.impossible.length + dateSummary.uptodate.length;
         dateSummary.meta.totalIncluded = includedCount;
         dateSummary.meta.match = (includedCount === expectedCount);
package/functions/computation-system/workflows/bulltrackers_pipeline.yaml:

@@ -1,7 +1,6 @@
 # Cloud Workflows Definition for BullTrackers Computation Pipeline
 # Orchestrates 5 sequential passes using Event-Driven Callbacks (Zero Polling).
-# FIXED:
-# FIXED: Proper extraction of 'callback_details.url' for the dispatcher.
+# FIXED: Restored 'passes' and 'max_retries' variables in init step.
 
 main:
   params: [input]

@@ -10,8 +9,14 @@ main:
       assign:
           - project: ${sys.get_env("GOOGLE_CLOUD_PROJECT_ID")}
           - location: "europe-west1"
-
-          -
+
+          # T-1 Date Logic (Process Yesterday)
+          - now: ${sys.now()}
+          - yesterday_timestamp: ${now - 86400}
+          - yesterday_str: ${text.substring(time.format(yesterday_timestamp), 0, 10)}
+          - date_to_run: ${default(map.get(input, "date"), yesterday_str)}
+
+          # Configuration Variables (Restored)
           - passes: ["1", "2", "3", "4", "5"]
           - max_retries: 3
 
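
The new init block derives a T-1 date: sys.now() yields seconds since the epoch, subtracting 86400 steps back one day, time.format renders an RFC 3339 timestamp, and the first 10 characters give YYYY-MM-DD; map.get(input, "date") lets a caller pin the date explicitly. A rough Node equivalent for sanity-checking the arithmetic (my sketch, not part of the package):

    // Same T-1 computation in JS: 86400 s = 24 h.
    const yesterday = new Date(Date.now() - 86400 * 1000);
    const dateToRun = process.argv[2] || yesterday.toISOString().substring(0, 10); // "YYYY-MM-DD"
    console.log(dateToRun);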
@@ -42,7 +47,6 @@
           - attempt_count: ${attempt_count + 1}
 
       # 1. GENERATE CALLBACK ENDPOINT
-      # We use the 'events' library. This returns an object containing the URL.
       - create_callback:
           call: events.create_callback_endpoint
           args:

@@ -60,7 +64,6 @@
               severity: "INFO"
 
       # 2. TRIGGER DISPATCHER
-      # We pass the extracted 'callback_url' string to the dispatcher.
      - trigger_dispatcher:
          call: http.post
          args:

@@ -89,12 +92,11 @@
          next: pass_retry_loop
 
      # 4. WAIT FOR WORKER SIGNAL
-      # We must pass the original 'callback_details' object here, not the URL string.
      - wait_for_completion:
          call: events.await_callback
          args:
              callback: ${callback_details}
-              timeout:
+              timeout: 10800 # UPDATED: Reduced from 86400 (24h) to 10800 (3h) to detect crashes faster
          result: callback_request
 
      # 5. PROCESS SIGNAL