bulltrackers-module 1.0.288 → 1.0.289
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,8 +1,7 @@
 /**
  * FILENAME: computation-system/helpers/computation_dispatcher.js
- * PURPOSE: "Smart Dispatcher" - Analyzes state and dispatches tasks.
- * UPDATED:
- * that is not explicitly COMPLETED, ensuring reliability for one-shot execution.
+ * PURPOSE: "Smart Dispatcher" - Analyzes state, initializes Run Counters, and dispatches tasks.
+ * UPDATED: Implements Callback Pattern. Initializes 'computation_runs' doc for worker coordination.
  */

 const { getExpectedDateStrings, normalizeName, DEFINITIVE_EARLIEST_DATES } = require('../utils/utils.js');
@@ -19,13 +18,22 @@ const STATUS_IMPOSSIBLE = 'IMPOSSIBLE';

 /**
  * Dispatches computation tasks for a specific pass.
+ * @param {Object} config - System config
+ * @param {Object} dependencies - { db, logger, ... }
+ * @param {Array} computationManifest - List of calculations
+ * @param {Object} reqBody - (Optional) HTTP Body containing 'callbackUrl' and 'date'
  */
-async function dispatchComputationPass(config, dependencies, computationManifest) {
+async function dispatchComputationPass(config, dependencies, computationManifest, reqBody = {}) {
   const { logger, db } = dependencies;
   const pubsubUtils = new PubSubUtils(dependencies);
   const passToRun = String(config.COMPUTATION_PASS_TO_RUN);

+  // [NEW] Extract Date and Callback from request body (pushed by Workflow)
+  const dateStr = reqBody.date || config.date;
+  const callbackUrl = reqBody.callbackUrl || null;
+
   if (!passToRun) { return logger.log('ERROR', '[Dispatcher] No pass defined (COMPUTATION_PASS_TO_RUN). Aborting.'); }
+  if (!dateStr) { return logger.log('ERROR', '[Dispatcher] No date defined. Aborting.'); }

   const currentManifestHash = generateCodeHash(
     computationManifest.map(c => c.hash).sort().join('|')
@@ -37,29 +45,29 @@ async function dispatchComputationPass(config, dependencies, computationManifest
   if (!calcsInThisPass.length) { return logger.log('WARN', `[Dispatcher] No calcs for Pass ${passToRun}. Exiting.`); }

   const calcNames = calcsInThisPass.map(c => c.name);
-  logger.log('INFO', `🚀 [Dispatcher] Smart-Dispatching PASS ${passToRun}`);
-
-
+  logger.log('INFO', `🚀 [Dispatcher] Smart-Dispatching PASS ${passToRun} for ${dateStr}`);
+
+  // -- DATE ANALYSIS LOGIC (Unchanged) --
   const passEarliestDate = Object.values(DEFINITIVE_EARLIEST_DATES).reduce((a, b) => a < b ? a : b);
   const endDateUTC = new Date(Date.UTC(new Date().getUTCFullYear(), new Date().getUTCMonth(), new Date().getUTCDate() - 1));
-
+
+  // We only analyze the specific requested date to keep dispatch fast for the workflow
+  const allExpectedDates = [dateStr];

   const manifestMap = new Map(computationManifest.map(c => [normalizeName(c.name), c]));
   const tasksToDispatch = [];
   const limit = pLimit(20);

-
-
-  const analysisPromises = allExpectedDates.map(dateStr => limit(async () => {
+  const analysisPromises = allExpectedDates.map(d => limit(async () => {
     try {
       const fetchPromises = [
-        fetchComputationStatus(
-        checkRootDataAvailability(
+        fetchComputationStatus(d, config, dependencies),
+        checkRootDataAvailability(d, config, dependencies, DEFINITIVE_EARLIEST_DATES)
       ];

       let prevDateStr = null;
       if (calcsInThisPass.some(c => c.isHistorical)) {
-        const prevDate = new Date(
+        const prevDate = new Date(d + 'T00:00:00Z');
         prevDate.setUTCDate(prevDate.getUTCDate() - 1);
         prevDateStr = prevDate.toISOString().slice(0, 10);

@@ -77,7 +85,7 @@ async function dispatchComputationPass(config, dependencies, computationManifest
         hasPortfolio: false, hasHistory: false, hasSocial: false, hasInsights: false, hasPrices: false
       };

-      const report = analyzeDateExecution(
+      const report = analyzeDateExecution(d, calcsInThisPass, rootDataStatus, dailyStatus, manifestMap, prevDailyStatus);

       const statusUpdates = {};

@@ -93,25 +101,20 @@ async function dispatchComputationPass(config, dependencies, computationManifest

       report.failedDependency.forEach(item => {
         const missingStr = item.missing ? item.missing.join(', ') : 'unknown';
-        statusUpdates[item.name] = {
-          hash: false,
-          category: 'unknown',
-          reason: `Dependency Missing: ${missingStr}`
-        };
+        statusUpdates[item.name] = { hash: false, category: 'unknown', reason: `Dependency Missing: ${missingStr}` };
       });

       if (Object.keys(statusUpdates).length > 0) {
-        await updateComputationStatus(
+        await updateComputationStatus(d, statusUpdates, config, dependencies);
       }

       const validToRun = [...report.runnable, ...report.reRuns];
       validToRun.forEach(item => {
         const uniqueDispatchId = crypto.randomUUID();
-
         tasksToDispatch.push({
           action: 'RUN_COMPUTATION_DATE',
           dispatchId: uniqueDispatchId,
-          date:
+          date: d,
           pass: passToRun,
           computation: normalizeName(item.name),
           hash: item.hash || item.newHash,
@@ -123,15 +126,41 @@ async function dispatchComputationPass(config, dependencies, computationManifest
         });

     } catch (e) {
-      logger.log('ERROR', `[Dispatcher] Failed analysis for ${
+      logger.log('ERROR', `[Dispatcher] Failed analysis for ${d}: ${e.message}`);
     }
   }));

   await Promise.all(analysisPromises);

+  // -- NEW: CALLBACK & COUNTER INITIALIZATION --
+
   if (tasksToDispatch.length > 0) {
-    logger.log('INFO', `[Dispatcher] 📝
+    logger.log('INFO', `[Dispatcher] 📝 Preparing ${tasksToDispatch.length} tasks for execution...`);

+    // 1. Initialize Shared State Document (The Counter)
+    const runId = crypto.randomUUID();
+    const metaStatePath = `computation_runs/${runId}`;
+
+    if (callbackUrl) {
+      await db.doc(metaStatePath).set({
+        createdAt: new Date(),
+        date: dateStr,
+        pass: passToRun,
+        totalTasks: tasksToDispatch.length,
+        remainingTasks: tasksToDispatch.length, // <--- The Countdown
+        callbackUrl: callbackUrl, // <--- The Workflow Hook
+        status: 'IN_PROGRESS'
+      });
+      logger.log('INFO', `[Dispatcher] 🏁 Run State Initialized: ${runId}`);
+    }
+
+    // 2. Attach Run Metadata to every task
+    tasksToDispatch.forEach(task => {
+      task.runId = runId;
+      task.metaStatePath = callbackUrl ? metaStatePath : null;
+    });
+
+    // 3. Create Audit Ledger Entries
     const finalDispatched = [];
     const txnLimit = pLimit(20);

@@ -142,34 +171,22 @@ async function dispatchComputationPass(config, dependencies, computationManifest
         await db.runTransaction(async (t) => {
           const doc = await t.get(ledgerRef);

-          // [UPDATED] Robust One-Shot Dispatch Logic
-          // We REMOVED the "Zombie Timeout" check.
-          // If the Dispatcher is running, we assume the user intends to ensure these tasks are dispatched.
-
           if (doc.exists) {
             const data = doc.data();
-
-
-            if (data.status === 'COMPLETED') {
-              return false;
-            }
-
-            // 2. If it is PENDING or IN_PROGRESS:
-            // Since the Dispatcher runs ONCE per day, seeing PENDING here means
-            // the *previous* run failed to complete, or the worker died.
-            // We overwrite it to force a restart.
+            // Strict Idempotency: If completed, don't run again.
+            if (data.status === 'COMPLETED') return false;
           }

-          // Create/Overwrite entry with PENDING to start the cycle
           t.set(ledgerRef, {
             status: 'PENDING',
             dispatchId: task.dispatchId,
+            runId: task.runId, // Track the batch ID
             computation: task.computation,
             expectedHash: task.hash || 'unknown',
             createdAt: new Date(),
             dispatcherHash: currentManifestHash,
             triggerReason: task.triggerReason,
-            retries: 0
+            retries: 0
           }, { merge: true });

           return true;
@@ -178,14 +195,15 @@ async function dispatchComputationPass(config, dependencies, computationManifest
           finalDispatched.push(task);

       } catch (txnErr) {
-        logger.log('WARN', `[Dispatcher] Transaction failed for ${task.computation}
+        logger.log('WARN', `[Dispatcher] Transaction failed for ${task.computation}: ${txnErr.message}`);
       }
     }));

     await Promise.all(txnPromises);

+    // 4. Publish to Pub/Sub
     if (finalDispatched.length > 0) {
-      logger.log('INFO', `[Dispatcher] ✅ Publishing ${finalDispatched.length}
+      logger.log('INFO', `[Dispatcher] ✅ Publishing ${finalDispatched.length} tasks to Pub/Sub...`);

       await pubsubUtils.batchPublishTasks(dependencies, {
         topicName: TOPIC_NAME,
@@ -194,14 +212,17 @@ async function dispatchComputationPass(config, dependencies, computationManifest
         maxPubsubBatchSize: 100
       });

-
+      // Return count so workflow knows to wait
+      return { dispatched: finalDispatched.length, runId };
     } else {
-
+      // Edge Case: Analysis said "Run", but Ledger said "Already Done"
+      // We must update the state doc to 0 or delete it, OR return 0 so workflow doesn't wait.
+      logger.log('INFO', `[Dispatcher] All tasks were already COMPLETED.`);
       return { dispatched: 0 };
     }

   } else {
-    logger.log('INFO', `[Dispatcher] No valid tasks found
+    logger.log('INFO', `[Dispatcher] No valid tasks found (Up to date).`);
     return { dispatched: 0 };
   }
 }
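The new `reqBody` parameter implies that each per-pass Cloud Function simply forwards its HTTP body to the dispatcher and echoes the result back to the Workflow. A minimal sketch of such an entry point, assuming the Functions Framework; only `dispatchComputationPass` and its `{ dispatched, runId }` return shape come from this diff, while the function name, export path, and config/dependency wiring are assumptions:

    // Hypothetical HTTP entry point for one pass (wiring assumed, not from this diff).
    const functions = require('@google-cloud/functions-framework');
    const { Firestore } = require('@google-cloud/firestore');
    const { dispatchComputationPass } = require('./computation-system/helpers/computation_dispatcher.js');

    // Assumed wiring: the real service builds these from its own config system.
    const config = { COMPUTATION_PASS_TO_RUN: process.env.PASS_ID || '1' };
    const dependencies = { db: new Firestore(), logger: { log: (lvl, ...m) => console.log(lvl, ...m) } };
    const computationManifest = []; // placeholder; the real list comes from the manifest loader

    functions.http('computationPass', async (req, res) => {
      // req.body carries { date, callbackUrl } exactly as POSTed by the Workflow step.
      const result = await dispatchComputationPass(config, dependencies, computationManifest, req.body || {});
      // The Workflow branches on dispatch_response.body.dispatched: 0 means "skip the callback wait".
      res.status(200).json(result || { dispatched: 0 });
    });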
@@ -1,24 +1,98 @@
 /**
  * FILENAME: computation-system/helpers/computation_worker.js
- * PURPOSE: Consumes
- * UPDATED:
- * Increased MAX_RETRIES and ensured Ledger is updated on poison messages.
+ * PURPOSE: Consumes tasks, executes logic, and signals Workflow upon Batch Completion.
+ * UPDATED: Implements "Last Worker" Callback Pattern.
  */

 const { executeDispatchTask } = require('../WorkflowOrchestrator.js');
 const { getManifest } = require('../topology/ManifestLoader');
 const { StructuredLogger } = require('../logger/logger');
 const { recordRunAttempt } = require('../persistence/RunRecorder');
+const https = require('https'); // [NEW] Required for callback

 let calculationPackage;
 try { calculationPackage = require('aiden-shared-calculations-unified');
 } catch (e) {console.error("FATAL: Could not load 'aiden-shared-calculations-unified'."); throw e; }
 const calculations = calculationPackage.calculations;

-// [FIX] Increased from 0 to 3.
-// 0 caused "retryCount >= MAX_RETRIES" to trigger immediately on the first run.
 const MAX_RETRIES = 3;

+/**
+ * [NEW] Helper: Fires the webhook back to Google Cloud Workflows.
+ */
+function triggerWorkflowCallback(url, status, logger) {
+  if (!url) return Promise.resolve();
+  logger.log('INFO', `[Worker] 🔔 BATCH COMPLETE! Triggering Workflow Callback: ${status}`);
+
+  return new Promise((resolve, reject) => {
+    const body = JSON.stringify({
+      status: status,
+      timestamp: new Date().toISOString()
+    });
+
+    const req = https.request(url, {
+      method: 'POST',
+      headers: {
+        'Content-Type': 'application/json',
+        'Content-Length': Buffer.byteLength(body)
+      }
+    }, (res) => {
+      if (res.statusCode >= 200 && res.statusCode < 300) {
+        resolve();
+      } else {
+        logger.log('WARN', `Callback responded with ${res.statusCode}`);
+        resolve(); // Don't crash the worker if callback fails, logic is done.
+      }
+    });
+
+    req.on('error', (e) => {
+      logger.log('ERROR', `Failed to trigger callback: ${e.message}`);
+      resolve();
+    });
+
+    req.write(body);
+    req.end();
+  });
+}
+
+/**
+ * [NEW] Helper: Decrements 'remainingTasks' in Firestore.
+ * Returns the callbackUrl IF this was the last task.
+ */
+async function decrementAndCheck(db, metaStatePath, logger) {
+  if (!metaStatePath) return null;
+
+  try {
+    const result = await db.runTransaction(async (t) => {
+      const ref = db.doc(metaStatePath);
+      const doc = await t.get(ref);
+
+      if (!doc.exists) return null; // State might have expired or been deleted
+      const data = doc.data();
+
+      const newRemaining = (data.remainingTasks || 0) - 1;
+
+      t.update(ref, {
+        remainingTasks: newRemaining,
+        lastUpdated: new Date()
+      });
+
+      // Return needed data only if we hit 0 (or lower, for safety)
+      return {
+        remaining: newRemaining,
+        callbackUrl: data.callbackUrl
+      };
+    });
+
+    if (result && result.remaining <= 0) {
+      return result.callbackUrl;
+    }
+  } catch (e) {
+    logger.log('ERROR', `[Worker] Failed to decrement batch counter: ${e.message}`);
+  }
+  return null;
+}
+
 async function handleComputationTask(message, config, dependencies) {
   const systemLogger = new StructuredLogger({ minLevel: config.minLevel || 'INFO', enableStructured: true, ...config });
   const runDependencies = { ...dependencies, logger: systemLogger };
@@ -34,15 +108,14 @@ async function handleComputationTask(message, config, dependencies) {

   if (!data || data.action !== 'RUN_COMPUTATION_DATE') { return; }

-  // Extract fields including new
-  const { date, pass, computation, previousCategory, triggerReason, dispatchId, dependencyResultHashes } = data;
+  // Extract fields including new metaStatePath
+  const { date, pass, computation, previousCategory, triggerReason, dispatchId, dependencyResultHashes, metaStatePath } = data;

   if (!date || !pass || !computation) { logger.log('ERROR', `[Worker] Invalid payload.`, data); return; }

-  // [FIX] Ensure retryCount defaults to 1 (PubSub usually sends 1 for the first attempt)
   const retryCount = message.deliveryAttempt || 1;

-  //
+  // --- POISON MESSAGE HANDLING (DLQ) ---
   if (retryCount > MAX_RETRIES) {
     logger.log('ERROR', `[Worker] ☠️ Task POISONED. Moved to DLQ: ${computation}`);
     try {
@@ -54,25 +127,28 @@ async function handleComputationTask(message, config, dependencies) {
         failureReason: 'MAX_RETRIES_EXCEEDED'
       });

-      // [FIX] CRITICAL: Update Ledger to FAILED.
-      // Previously, this returned without updating, leaving the Ledger stuck in 'PENDING'.
-      // Now we explicitly mark it FAILED so the pipeline knows it's dead.
       await db.collection(`computation_audit_ledger/${date}/passes/${pass}/tasks`).doc(computation).set({
         status: 'FAILED',
         error: 'Max Retries Exceeded (Poison Message)',
         failedAt: new Date()
       }, { merge: true });

+      // [CRITICAL] Even if it failed, we MUST decrement the counter.
+      // Otherwise the workflow waits 24h for a task that will never finish.
+      const callbackUrl = await decrementAndCheck(db, metaStatePath, logger);
+      if (callbackUrl) {
+        // We signal SUCCESS to the workflow because the *Batch* is finished processing (even if this task failed).
+        // The "monitor" or next pass can handle data gaps.
+        await triggerWorkflowCallback(callbackUrl, 'SUCCESS', logger);
+      }
+
       return;
     } catch (dlqErr) { logger.log('FATAL', `[Worker] Failed to write to DLQ`, dlqErr); }
   }

-  logger.log('INFO', `[Worker] 📥 Received Task: ${computation} (${date}) [Attempt ${retryCount}/${MAX_RETRIES}]
-    dispatchId: dispatchId || 'legacy',
-    reason: triggerReason
-  });
+  logger.log('INFO', `[Worker] 📥 Received Task: ${computation} (${date}) [Attempt ${retryCount}/${MAX_RETRIES}]`);

-  //
+  // Update Status to IN_PROGRESS
   try {
     await db.collection(`computation_audit_ledger/${date}/passes/${pass}/tasks`).doc(computation).set({
       status: 'IN_PROGRESS',
@@ -80,29 +156,21 @@ async function handleComputationTask(message, config, dependencies) {
       startedAt: new Date(),
       dispatchId: dispatchId
     }, { merge: true });
-  } catch (leaseErr) {
-    logger.log('WARN', `[Worker] Failed to update status to IN_PROGRESS for ${computation}. Continuing...`, leaseErr);
-  }
+  } catch (leaseErr) {}

   let computationManifest;
   try { computationManifest = getManifest(config.activeProductLines || [], calculations, runDependencies);
   } catch (manifestError) {
     logger.log('FATAL', `[Worker] Failed to load Manifest: ${manifestError.message}`);
-
+    // Do NOT decrement here, let PubSub retry the task.
     return;
   }

   try {
     const startTime = Date.now();
     const result = await executeDispatchTask(
-      date,
-
-      computation,
-      config,
-      runDependencies,
-      computationManifest,
-      previousCategory,
-      dependencyResultHashes
+      date, pass, computation, config, runDependencies,
+      computationManifest, previousCategory, dependencyResultHashes
     );
     const duration = Date.now() - startTime;

@@ -110,38 +178,44 @@ async function handleComputationTask(message, config, dependencies) {
     const successUpdates = result?.updates?.successUpdates || {};

     if (failureReport.length > 0) {
+      // Logic/Storage failure (handled internally by executor)
       const failReason = failureReport[0];
-      logger.log('ERROR', `[Worker] ❌ Failed logic/storage for ${computation}`, failReason.error);
-      const metrics = failReason.metrics || {};
-      metrics.durationMs = duration;
-      await recordRunAttempt(db, { date, computation, pass }, 'FAILURE', failReason.error, metrics, triggerReason);
       throw new Error(failReason.error.message || 'Computation Logic Failed');
     }
-    else
-
-
-
-
-
-
+    else {
+      if (Object.keys(successUpdates).length > 0) {
+        logger.log('INFO', `[Worker] ✅ Stored: ${computation}`);
+      } else {
+        logger.log('WARN', `[Worker] ⚠️ Empty Result: ${computation}`);
+      }
+
+      // 1. Mark Ledger as COMPLETED
       await db.collection(`computation_audit_ledger/${date}/passes/${pass}/tasks`).doc(computation).update({
         status: 'COMPLETED',
         completedAt: new Date()
       }).catch(() => {});

-      await recordRunAttempt(db, { date, computation, pass }, 'SUCCESS', null,
-
-
-
-
+      await recordRunAttempt(db, { date, computation, pass }, 'SUCCESS', null, { durationMs: duration }, triggerReason);
+
+      // 2. [NEW] Decrement Batch Counter & Check for Callback
+      const callbackUrl = await decrementAndCheck(db, metaStatePath, logger);
+
+      // 3. [NEW] If last one, fire callback
+      if (callbackUrl) {
+        await triggerWorkflowCallback(callbackUrl, 'SUCCESS', logger);
+      }
     }
   } catch (err) {
+    // --- ERROR HANDLING ---
+
+    // Check for Permanent/Deterministic Errors
     const isDeterministicError = err.stage === 'SHARDING_LIMIT_EXCEEDED' ||
       err.stage === 'QUALITY_CIRCUIT_BREAKER' ||
+      err.stage === 'SEMANTIC_GATE' ||
       (err.message && (err.message.includes('INVALID_ARGUMENT') || err.message.includes('Transaction too big')));

     if (isDeterministicError) {
-      logger.log('ERROR', `[Worker] 🛑 Permanent Failure
+      logger.log('ERROR', `[Worker] 🛑 Permanent Failure: ${err.message}`);
       try {
         await db.collection('computation_dead_letter_queue').add({
           originalData: data,
@@ -151,7 +225,6 @@ async function handleComputationTask(message, config, dependencies) {
           failureReason: 'PERMANENT_DETERMINISTIC_ERROR'
         });

-        // [FIX] Update Ledger to FAILED immediately for deterministic errors
         await db.collection(`computation_audit_ledger/${date}/passes/${pass}/tasks`).doc(computation).set({
           status: 'FAILED',
           error: err.message || 'Permanent Deterministic Error',
@@ -159,22 +232,29 @@ async function handleComputationTask(message, config, dependencies) {
         }, { merge: true });

         await recordRunAttempt(db, { date, computation, pass }, 'FAILURE', { message: err.message, stage: err.stage || 'PERMANENT_FAIL' }, { durationMs: 0 }, triggerReason);
-
+
+        // [CRITICAL] Permanent failure -> Must decrement so workflow doesn't hang
+        const callbackUrl = await decrementAndCheck(db, metaStatePath, logger);
+        if (callbackUrl) {
+          await triggerWorkflowCallback(callbackUrl, 'SUCCESS', logger);
+        }
+
+        return; // Do NOT throw, consume the message
       } catch (dlqErr) { logger.log('FATAL', `[Worker] Failed to write to DLQ`, dlqErr); }
     }

-    // Standard Retryable Error (
+    // Standard Retryable Error (Transient)
     if (retryCount >= MAX_RETRIES) {
-      //
-      //
+      // Let the top-level poison check handle the decrement on the *next* delivery (or handle here if you prefer).
+      // Standard practice: throw so PubSub handles the backoff and redelivery.
+      // The poison logic at the top of this function will catch it on attempt N+1.
       throw err;
     }

     logger.log('ERROR', `[Worker] ❌ Crash: ${computation}: ${err.message}`);
-
     await recordRunAttempt(db, { date, computation, pass }, 'CRASH', { message: err.message, stack: err.stack, stage: 'SYSTEM_CRASH' }, { durationMs: 0 }, triggerReason);
-
-    throw err;
+
+    throw err; // Trigger Pub/Sub retry
   }
 }

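Together, decrementAndCheck and triggerWorkflowCallback implement a countdown latch: the dispatcher seeds remainingTasks, every terminal outcome (success, poison message, permanent failure) decrements it exactly once, and only the worker that reaches zero receives the callbackUrl. A minimal in-memory sketch of that invariant (illustration only; in the package the counter lives in the computation_runs Firestore doc and is updated inside a transaction, as shown above):

    // Countdown-latch model of the "Last Worker" pattern. Names mirror the
    // diff; the in-memory object stands in for the Firestore document.
    function createRun(totalTasks, callbackUrl) {
      return { remainingTasks: totalTasks, callbackUrl };
    }

    function decrementAndCheckLocal(run) {
      run.remainingTasks -= 1; // the Firestore transaction makes this atomic in the real code
      // Only the caller that reaches zero (or below, for safety) gets the URL.
      return run.remainingTasks <= 0 ? run.callbackUrl : null;
    }

    const run = createRun(3, 'https://example.test/callback');
    console.log(decrementAndCheckLocal(run)); // null (2 remaining)
    console.log(decrementAndCheckLocal(run)); // null (1 remaining)
    console.log(decrementAndCheckLocal(run)); // 'https://example.test/callback' -> fire callback once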
@@ -1,5 +1,6 @@
 # Cloud Workflows Definition for BullTrackers Computation Pipeline
-# Orchestrates 5 sequential passes
+# Orchestrates 5 sequential passes using Event-Driven Callbacks (Zero Polling).
+# UPDATED: Implements "Callback Pattern" to eliminate arbitrary sleeps.

 main:
   params: [input]
@@ -12,9 +13,6 @@ main:
           - date_to_run: ${default(map.get(input, "date"), text.substring(time.format(sys.now()), 0, 10))}
           - passes: ["1", "2", "3", "4", "5"]
           - max_retries: 3
-          - propagation_wait_seconds: 300 # 5 Minutes
-          # URL of the new Monitor Function
-          - monitor_url: ${"https://europe-west1-" + project + ".cloudfunctions.net/computation-monitor"}

     # ======================================================
     # MAIN LOOP: Iterate through Passes 1 to 5
@@ -28,7 +26,7 @@ main:
           assign:
             - attempt_count: 0
             - pass_success: false
-            #
+            # URL of the Cloud Function acting as the Dispatcher
             - dispatcher_url: ${"https://europe-west1-" + project + ".cloudfunctions.net/computation-pass-" + pass_id}

           # -----------------------------------------------
@@ -42,85 +40,82 @@ main:
                 assign:
                   - attempt_count: ${attempt_count + 1}

+            # 1. GENERATE CALLBACK URL
+            # This creates a unique HTTP endpoint that points specifically to this step execution.
+            - create_callback:
+                call: sys.create_callback_url
+                result: callback_url
+
             - log_start:
                 call: sys.log
                 args:
-                  text: ${"Starting Pass " + pass_id + " (Attempt " + attempt_count + ") for " + date_to_run}
+                  text: ${"Starting Pass " + pass_id + " (Attempt " + attempt_count + ") for " + date_to_run + ". Waiting for signal at " + callback_url}
                   severity: "INFO"

-            #
+            # 2. TRIGGER DISPATCHER
+            # We pass the 'callback_url' to the dispatcher so it can hand it to the workers.
             - trigger_dispatcher:
-                call: http.
+                call: http.post
                 args:
                   url: ${dispatcher_url}
-
+                  body:
                     date: ${date_to_run}
+                    callbackUrl: ${callback_url} # <--- CRITICAL: Passing the token
                   auth:
                     type: OIDC
                   timeout: 1800 # 30 mins max for dispatch analysis
                 result: dispatch_response

-            #
-
-
-                args:
-                  text: ${"Pass " + pass_id + " dispatched. Waiting " + propagation_wait_seconds + "s for propagation..."}
-                next: sleep_propagation
-
-            - sleep_propagation:
-                call: sys.sleep
-                args:
-                  seconds: ${propagation_wait_seconds}
-
-            # 3. MONITORING LOOP
-            - monitor_loop:
-                call: http.get
-                args:
-                  url: ${monitor_url}
-                  query:
-                    date: ${date_to_run}
-                    pass: ${pass_id}
-                  auth:
-                    type: OIDC
-                result: status_resp
-
-            - evaluate_status:
+            # 3. CHECK FOR "NOTHING TO DO"
+            # If the dispatcher found 0 tasks, it returns immediate success. We skip waiting.
+            - check_immediate_completion:
                 switch:
-
-                  - condition: ${status_resp.body.state == "RUNNING"}
+                  - condition: ${dispatch_response.body.dispatched == 0}
                     steps:
-                      -
+                      - log_empty:
                           call: sys.log
                           args:
-                            text: ${"Pass " + pass_id + "
-                      -
-
-
-
-
-                          next: monitor_loop
+                            text: ${"Pass " + pass_id + " had 0 tasks. Moving to next pass immediately."}
+                      - mark_success_empty:
+                          assign:
+                            - pass_success: true
+                      - next_pass_empty:
+                          next: pass_retry_loop

-
-
+            # 4. WAIT FOR WORKER SIGNAL
+            # The workflow freezes here (sleeps) until a worker hits the callback_url.
+            # This eliminates the need for polling logic.
+            - wait_for_completion:
+                call: sys.await_callback
+                args:
+                  callback_url: ${callback_url}
+                  timeout: 86400 # Wait up to 24 hours for the batch to finish
+                result: callback_request
+
+            # 5. PROCESS SIGNAL
+            - evaluate_signal:
+                assign:
+                  - signal_data: ${callback_request.http_request.body}
+                switch:
+                  - condition: ${signal_data.status == "SUCCESS"}
                     steps:
                       - log_success:
                           call: sys.log
                           args:
-                            text: ${"Pass " + pass_id + "
+                            text: ${"Pass " + pass_id + " signaled COMPLETION via Callback."}
                           severity: "INFO"
                       - mark_success:
                           assign:
                             - pass_success: true
-                      -
-                          next: pass_retry_loop
+                      - proceed:
+                          next: pass_retry_loop

-
-                  - condition: ${status_resp.body.state == "HAS_FAILURES"}
+                  - condition: ${signal_data.status == "FAILURE"}
                     steps:
                       - log_failure:
                           call: sys.log
                           args:
-                            text: ${"Pass " + pass_id + "
+                            text: ${"Pass " + pass_id + " signaled FAILURE (DLQ limit hit). Retrying pass."}
                           severity: "WARNING"
                       - retry_pass:
                           next: pass_retry_loop
@@ -136,7 +131,7 @@ main:
             - log_giving_up:
                 call: sys.log
                 args:
-                  text: ${"Pass " + pass_id + " failed after " + max_retries + " attempts. Proceeding
+                  text: ${"Pass " + pass_id + " failed after " + max_retries + " attempts. Proceeding with potential gaps."}
                   severity: "ERROR"

         - finish:
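Because wait_for_completion blocks on a plain HTTP POST, the handshake can be exercised by hand without running any workers. A sketch of such a probe, assuming Node 18+ (built-in fetch, run as an ES module so top-level await works) and environment variables that are not part of the package; note that Cloud Workflows callback endpoints require the caller to hold the workflows.callbacks.send permission, so a bearer token is attached:

    // Hypothetical manual test of the callback handshake (names/env vars assumed).
    const callbackUrl = process.env.CALLBACK_URL; // printed by the workflow's log_start step
    const token = process.env.TOKEN;              // e.g. output of `gcloud auth print-access-token`

    const res = await fetch(callbackUrl, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json', 'Authorization': `Bearer ${token}` },
      // Same payload shape the worker sends; evaluate_signal reads it via
      // callback_request.http_request.body and matches on status == "SUCCESS".
      body: JSON.stringify({ status: 'SUCCESS', timestamp: new Date().toISOString() }),
    });
    console.log(res.status); // a 2xx response releases the wait_for_completion step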