bulltrackers-module 1.0.306 → 1.0.307
This diff shows the changes between publicly released versions of this package as they appear in their respective public registries. It is provided for informational purposes only.
- package/functions/computation-system/WorkflowOrchestrator.js +87 -213
- package/functions/computation-system/helpers/computation_worker.js +55 -267
- package/functions/computation-system/utils/utils.js +54 -171
- package/package.json +1 -1
- package/functions/computation-system/features.md +0 -395
- package/functions/computation-system/paper.md +0 -93
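
The headline change in computation_worker.js (full diff below) is the removal of the worker-side Workflows callback and Sentinel counter logic; batch completion tracking now belongs to the Dispatcher, and the worker keeps only a slimmed-down telemetry heartbeat. A minimal standalone sketch of that retained pattern, assuming a Firestore-style client; the fakeDb stub and the demo timeout are illustrative only, not part of the package:

// Sketch of the peak-memory "Black Box Recorder" pattern used by the new
// startMemoryHeartbeat(): an interval samples RSS, remembers the high-water
// mark in a closure, and best-effort writes the latest sample to a ledger doc.
function startMemoryHeartbeat(db, ledgerPath, intervalMs = 2000) {
  let peakRss = 0; // high-water mark, survives across ticks via the closure

  const timer = setInterval(async () => {
    const rssMB = Math.round(process.memoryUsage().rss / 1024 / 1024);
    if (rssMB > peakRss) peakRss = rssMB;
    // Telemetry writes are best-effort: a failed update must never kill the worker.
    await db.doc(ledgerPath).update({
      'telemetry.lastMemoryMB': rssMB,
      'telemetry.lastHeartbeat': new Date()
    }).catch(() => {});
  }, intervalMs);

  timer.unref(); // don't keep the process alive just for telemetry
  return { timer, getPeak: () => peakRss };
}

// Hypothetical in-memory stand-in for the Firestore client, for demo only.
const fakeDb = {
  doc: (path) => ({
    update: async (fields) => console.log(`update ${path}`, fields)
  })
};

const heartbeat = startMemoryHeartbeat(fakeDb, 'ledger/demo', 500);
setTimeout(() => {
  clearInterval(heartbeat.timer);
  console.log('peak RSS MB:', heartbeat.getPeak());
}, 2000);

Because the peak lives in a closure rather than in Firestore, the high-water mark survives even if every telemetry write fails.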
package/functions/computation-system/helpers/computation_worker.js
@@ -1,314 +1,102 @@
 /**
  * FILENAME: computation-system/helpers/computation_worker.js
- *
- *
+ * UPDATED: Removed redundant Callback and Sentinel logic.
+ * The system now relies on Dispatcher cursor satiation.
  */
 
 const { executeDispatchTask } = require('../WorkflowOrchestrator.js');
 const { getManifest } = require('../topology/ManifestLoader');
 const { StructuredLogger } = require('../logger/logger');
 const { recordRunAttempt } = require('../persistence/RunRecorder');
-const https = require('https');
-const { GoogleAuth } = require('google-auth-library');
 const { normalizeName } = require('../utils/utils');
+const os = require('os');
 
 let calculationPackage;
-try { calculationPackage = require('aiden-shared-calculations-unified');
-} catch (e) {console.error("FATAL: Could not load 'aiden-shared-calculations-unified'."); throw e; }
+try { calculationPackage = require('aiden-shared-calculations-unified'); } catch (e) { throw e; }
 const calculations = calculationPackage.calculations;
 
 const MAX_RETRIES = 3;
 
-/**
- * [UPDATED] Heartbeat now returns a closure to get the PEAK memory.
- * This acts as a "Black Box Recorder".
- */
+/** Black Box Recorder for Peak Memory. */
 function startMemoryHeartbeat(db, ledgerPath, intervalMs = 2000) {
   let peakRss = 0;
-
-  const getMemStats = () => {
-    const mem = process.memoryUsage();
-    const rssMB = Math.round(mem.rss / 1024 / 1024);
-    if (rssMB > peakRss) peakRss = rssMB;
-
-    return {
-      rssMB: rssMB,
-      heapUsedMB: Math.round(mem.heapUsed / 1024 / 1024),
-      timestamp: new Date()
-    };
-  };
-
   const timer = setInterval(async () => {
-
-
-
-    await db.doc(ledgerPath).update({
-      'telemetry.lastMemory': stats,
-      'telemetry.lastHeartbeat': new Date()
-    }).catch(() => {}); // Ignore write errors to prevent crashing the worker
-    } catch (e) {
-      // Silently fail on telemetry errors
-    }
+    const rssMB = Math.round(process.memoryUsage().rss / 1024 / 1024);
+    if (rssMB > peakRss) peakRss = rssMB;
+    await db.doc(ledgerPath).update({ 'telemetry.lastMemoryMB': rssMB, 'telemetry.lastHeartbeat': new Date() }).catch(() => {});
   }, intervalMs);
-
-  // Unref so this timer doesn't prevent the process from exiting naturally
   timer.unref();
-
-  return {
-    timer,
-    getPeak: () => peakRss
-  };
-}
-
-/**
- * Helper: Fires the webhook back to Google Cloud Workflows.
- */
-async function triggerWorkflowCallback(url, status, logger) {
-  if (!url) return;
-  logger.log('INFO', `[Worker] 🔔 BATCH COMPLETE! Triggering Workflow Callback: ${status}`);
-
-  try {
-    const auth = new GoogleAuth({ scopes: ['https://www.googleapis.com/auth/cloud-platform'] });
-    const client = await auth.getClient();
-    const accessToken = await client.getAccessToken();
-    const token = accessToken.token;
-
-    return new Promise((resolve, reject) => {
-      const body = JSON.stringify({ status: status, timestamp: new Date().toISOString() });
-
-      const req = https.request(url, {
-        method: 'POST',
-        headers: {
-          'Content-Type': 'application/json',
-          'Content-Length': Buffer.byteLength(body),
-          'Authorization': `Bearer ${token}`
-        }
-      }, (res) => {
-        if (res.statusCode >= 200 && res.statusCode < 300) { resolve(); }
-        else { logger.log('WARN', `Callback responded with ${res.statusCode}`); resolve(); }
-      });
-
-      req.on('error', (e) => { logger.log('ERROR', `Failed to trigger callback: ${e.message}`); resolve(); });
-      req.write(body);
-      req.end();
-    });
-  } catch (e) {
-    logger.log('ERROR', `Failed to generate auth token for callback: ${e.message}`);
-  }
-}
-
-/**
- * [UPDATED] Helper: Decrements 'remainingTasks' in Firestore.
- * NOW INCLUDES CONTENTION RETRY LOGIC (The "Sentinel" Fix)
- */
-async function decrementAndCheck(db, metaStatePath, logger) {
-  if (!metaStatePath) return null;
-
-  const MAX_CONTENTION_RETRIES = 10;
-  let attempt = 0;
-
-  while (attempt < MAX_CONTENTION_RETRIES) {
-    try {
-      const result = await db.runTransaction(async (t) => {
-        const ref = db.doc(metaStatePath);
-        const doc = await t.get(ref);
-        if (!doc.exists) return null;
-
-        const data = doc.data();
-        // Safety: Don't decrement below zero
-        const currentRemaining = data.remainingTasks || 0;
-        if (currentRemaining <= 0) return { remaining: 0, callbackUrl: data.callbackUrl };
-
-        const newRemaining = currentRemaining - 1;
-        t.update(ref, { remainingTasks: newRemaining, lastUpdated: new Date() });
-
-        return { remaining: newRemaining, callbackUrl: data.callbackUrl };
-      });
-
-      // Success! Check if we are the "Sentinel" (the last one)
-      if (result && result.remaining <= 0) return result.callbackUrl;
-      return null; // We decremented successfully, but weren't the last one.
-
-    } catch (e) {
-      // Check if it's a contention error (ABORTED/10 or DEADLINE_EXCEEDED/4)
-      const isContention = e.code === 10 || e.code === 4 || (e.message && e.message.includes('contention'));
-
-      if (isContention) {
-        attempt++;
-        // JITTER: Random delay between 50ms and 500ms to desynchronize the herd
-        const delay = Math.floor(Math.random() * 450) + 50;
-        logger.log('WARN', `[Worker] Batch counter contention (Attempt ${attempt}/${MAX_CONTENTION_RETRIES}). Retrying in ${delay}ms...`);
-        await new Promise(r => setTimeout(r, delay));
-      } else {
-        // Fatal error (permission, etc)
-        logger.log('ERROR', `[Worker] Fatal error decrementing batch counter: ${e.message}`);
-        return null;
-      }
-    }
-  }
-
-  logger.log('ERROR', `[Worker] Failed to decrement batch counter after ${MAX_CONTENTION_RETRIES} attempts. The count will be inaccurate.`);
-  return null;
+  return { timer, getPeak: () => peakRss };
 }
 
 async function handleComputationTask(message, config, dependencies) {
-  const
-  const
-  const
+  const logger = new StructuredLogger({ minLevel: config.minLevel || 'INFO', enableStructured: true, ...config });
+  const runDeps = { ...dependencies, logger };
+  const db = dependencies.db;
   let data;
 
   try {
-
-
-
-    } else { data = message; }
-  } catch (parseError) { logger.log('ERROR', `[Worker] Failed to parse Pub/Sub payload.`, { error: parseError.message }); return; }
+    const raw = message.data?.message?.data || message.data || message.json;
+    data = (typeof raw === 'string') ? JSON.parse(Buffer.from(raw, 'base64').toString()) : raw;
+  } catch (e) { return; }
 
-  if (!data || data.action !== 'RUN_COMPUTATION_DATE')
+  if (!data || data.action !== 'RUN_COMPUTATION_DATE') return;
 
-
-  const
-  const resourceTier = resources || 'standard'; // Default to standard
-
-  if (!date || !pass || !computation) { logger.log('ERROR', `[Worker] Invalid payload.`, data); return; }
-
-  const retryCount = message.deliveryAttempt || 1;
+  const { date, pass, computation, previousCategory, triggerReason, dispatchId, dependencyResultHashes, resources } = data;
+  const resourceTier = resources || 'standard';
   const ledgerPath = `computation_audit_ledger/${date}/passes/${pass}/tasks/${computation}`;
 
-
-  if (retryCount > MAX_RETRIES) {
-    logger.log('ERROR', `[Worker] ☠️ Task POISONED. Moved to DLQ: ${computation}`);
-    try {
-      await db.collection('computation_dead_letter_queue').add({
-        originalData: data,
-        dispatchId: dispatchId,
-        error: { message: 'Max Retries Exceeded', stack: 'PubSub delivery limit reached' },
-        finalAttemptAt: new Date(),
-        failureReason: 'MAX_RETRIES_EXCEEDED'
-      });
-
-      await db.doc(ledgerPath).set({
-        status: 'FAILED',
-        error: 'Max Retries Exceeded (Poison Message)',
-        failedAt: new Date()
-      }, { merge: true });
+  logger.log('INFO', `[Worker] 📥 Task: ${computation} (${date}) [Tier: ${resourceTier}]`);
 
-
-
-
-
-
+  // 1. Audit Lease
+  await db.doc(ledgerPath).set({
+    status: 'IN_PROGRESS',
+    workerId: process.env.K_REVISION || os.hostname(),
+    startedAt: new Date(),
+    dispatchId
+  }, { merge: true });
 
-
-
-  // 1. Update Status to IN_PROGRESS & Initialize Telemetry
-  try {
-    await db.doc(ledgerPath).set({
-      status: 'IN_PROGRESS',
-      workerId: process.env.K_REVISION || 'unknown',
-      startedAt: new Date(),
-      dispatchId: dispatchId,
-      telemetry: { startTime: new Date(), lastMemory: null } // Init for heartbeat
-    }, { merge: true });
-  } catch (leaseErr) {}
-
-  // 2. START HEARTBEAT (The Flight Recorder)
-  // [UPDATED] Using new logic to track peak
-  const heartbeatControl = startMemoryHeartbeat(db, ledgerPath, 2000);
-
-  let computationManifest;
-  try { computationManifest = getManifest(config.activeProductLines || [], calculations, runDependencies);
-  } catch (manifestError) {
-    clearInterval(heartbeatControl.timer); // Stop if we fail early
-    logger.log('FATAL', `[Worker] Failed to load Manifest: ${manifestError.message}`);
-    return;
-  }
+  const heartbeat = startMemoryHeartbeat(db, ledgerPath);
 
   try {
+    const manifest = getManifest(config.activeProductLines || [], calculations, runDeps);
    const startTime = Date.now();
-
-
-
+
+    const result = await executeDispatchTask(
+      date, pass, computation, config, runDeps,
+      manifest, previousCategory, dependencyResultHashes
    );
-    const duration = Date.now() - startTime;
-
-    // STOP HEARTBEAT ON SUCCESS
-    clearInterval(heartbeatControl.timer);
 
-
+    clearInterval(heartbeat.timer);
+    const failureReport = result?.updates?.failureReport || [];
     const successUpdates = result?.updates?.successUpdates || {};
 
-    if (failureReport.length > 0)
-
-
-
-
-
-
-
-
-
-
-      peakMemoryMB: heartbeatControl.getPeak(),
-      io: calcUpdate.metrics?.io,
-      storage: calcUpdate.metrics?.storage,
-      execution: calcUpdate.metrics?.execution,
-      validation: calcUpdate.metrics?.validation,
-      composition: calcUpdate.composition
-    };
-
-    await db.doc(ledgerPath).update({
-      status: 'COMPLETED',
-      completedAt: new Date()
-    }).catch(() => {});
+    if (failureReport.length > 0) throw new Error(failureReport[0].error.message);
+
+    const calcUpdate = successUpdates[normalizeName(computation)] || {};
+    const metrics = {
+      durationMs: Date.now() - startTime,
+      peakMemoryMB: heartbeat.getPeak(),
+      io: calcUpdate.metrics?.io,
+      storage: calcUpdate.metrics?.storage,
+      execution: calcUpdate.metrics?.execution,
+      validation: calcUpdate.metrics?.validation,
+      composition: calcUpdate.composition
+    };
 
-
-
+    await db.doc(ledgerPath).update({ status: 'COMPLETED', completedAt: new Date() });
+    await recordRunAttempt(db, { date, computation, pass }, 'SUCCESS', null, metrics, triggerReason, resourceTier);
 
-    const callbackUrl = await decrementAndCheck(db, metaStatePath, logger);
-    if (callbackUrl) { await triggerWorkflowCallback(callbackUrl, 'SUCCESS', logger); }
-    }
   } catch (err) {
-
-
-
-    // --- ERROR HANDLING ---
-    const isDeterministicError = err.stage === 'SHARDING_LIMIT_EXCEEDED' ||
-      err.stage === 'QUALITY_CIRCUIT_BREAKER' ||
-      err.stage === 'SEMANTIC_GATE' ||
-      (err.message && (err.message.includes('INVALID_ARGUMENT') || err.message.includes('Transaction too big')));
+    clearInterval(heartbeat.timer);
+    const isDeterministic = ['SHARDING_LIMIT_EXCEEDED', 'QUALITY_CIRCUIT_BREAKER', 'SEMANTIC_GATE'].includes(err.stage);
 
-    if (
-
-
-
-        originalData: data,
-        dispatchId: dispatchId,
-        error: { message: err.message, stack: err.stack, stage: err.stage || 'UNKNOWN' },
-        finalAttemptAt: new Date(),
-        failureReason: 'PERMANENT_DETERMINISTIC_ERROR'
-      });
-
-      await db.doc(ledgerPath).set({
-        status: 'FAILED',
-        error: err.message || 'Permanent Deterministic Error',
-        failedAt: new Date()
-      }, { merge: true });
-
-      await recordRunAttempt(db, { date, computation, pass }, 'FAILURE', { message: err.message, stage: err.stage || 'PERMANENT_FAIL' }, { durationMs: 0, peakMemoryMB: heartbeatControl.getPeak() }, triggerReason, resourceTier);
-
-      const callbackUrl = await decrementAndCheck(db, metaStatePath, logger);
-      if (callbackUrl) { await triggerWorkflowCallback(callbackUrl, 'SUCCESS', logger); }
-      return;
-    } catch (dlqErr) { logger.log('FATAL', `[Worker] Failed to write to DLQ`, dlqErr); }
+    if (isDeterministic || (message.deliveryAttempt || 1) >= MAX_RETRIES) {
+      await db.doc(ledgerPath).set({ status: 'FAILED', error: err.message, failedAt: new Date() }, { merge: true });
+      await recordRunAttempt(db, { date, computation, pass }, 'FAILURE', { message: err.message, stage: err.stage || 'FATAL' }, { peakMemoryMB: heartbeat.getPeak() }, triggerReason, resourceTier);
+      return; // Don't throw, we've handled the permanent failure
    }
-
-    if (retryCount >= MAX_RETRIES) { throw err; }
-
-    logger.log('ERROR', `[Worker] ❌ Crash: ${computation}: ${err.message}`);
-    await recordRunAttempt(db, { date, computation, pass }, 'CRASH', { message: err.message, stack: err.stack, stage: 'SYSTEM_CRASH' }, { durationMs: 0, peakMemoryMB: heartbeatControl.getPeak() }, triggerReason, resourceTier);
-    throw err;
+    throw err; // Trigger Pub/Sub retry
  }
 }
 
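
For context on the new parsing branch at the top of handleComputationTask: the worker now accepts either a Pub/Sub push envelope (base64-encoded JSON under message.data.message.data) or an already-decoded object. A sketch of both shapes, with illustrative field values (the computation name and IDs here are hypothetical):

// Illustrative payload; field values are made up, the shape follows the
// destructuring in handleComputationTask.
const payload = {
  action: 'RUN_COMPUTATION_DATE',
  date: '2024-01-01',
  pass: 1,
  computation: 'example-computation', // hypothetical name
  dispatchId: 'dispatch-123',
  resources: 'standard'
};

// A Pub/Sub push delivery wraps the JSON as base64 under message.data:
const pushEnvelope = {
  data: { message: { data: Buffer.from(JSON.stringify(payload)).toString('base64') } }
};

// The worker's decode line handles both shapes:
const raw = pushEnvelope.data?.message?.data || pushEnvelope.data || pushEnvelope.json;
const data = (typeof raw === 'string')
  ? JSON.parse(Buffer.from(raw, 'base64').toString())
  : raw;

console.log(data.action); // 'RUN_COMPUTATION_DATE'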
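
The rewritten catch block collapses the old poison-message and DLQ machinery into a single rule: failures from deterministic stages, or messages that have exhausted their delivery attempts, are marked FAILED and swallowed; everything else is rethrown so Pub/Sub redelivers. A condensed sketch of that decision, with hypothetical errors for illustration:

// Condensed sketch of the failure policy in handleComputationTask's catch.
const MAX_RETRIES = 3;

function shouldSwallow(err, deliveryAttempt) {
  // Deterministic stages fail identically on redelivery, so retrying is waste.
  const isDeterministic = [
    'SHARDING_LIMIT_EXCEEDED',
    'QUALITY_CIRCUIT_BREAKER',
    'SEMANTIC_GATE'
  ].includes(err.stage);
  // Swallow (mark FAILED, return) if deterministic or out of delivery attempts;
  // otherwise rethrow and let Pub/Sub redeliver the message.
  return isDeterministic || (deliveryAttempt || 1) >= MAX_RETRIES;
}

// Hypothetical errors:
const fatal = Object.assign(new Error('shard count over limit'), { stage: 'SHARDING_LIMIT_EXCEEDED' });
console.log(shouldSwallow(fatal, 1));                      // true: permanent, don't retry
console.log(shouldSwallow(new Error('socket hang up'), 1)); // false: transient, retry
console.log(shouldSwallow(new Error('socket hang up'), 3)); // true: retries exhausted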