bulltrackers-module 1.0.292 → 1.0.294
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/functions/computation-system/context/ManifestBuilder.js +5 -5
- package/functions/computation-system/executors/StandardExecutor.js +47 -7
- package/functions/computation-system/helpers/computation_dispatcher.js +111 -38
- package/functions/computation-system/helpers/computation_worker.js +73 -75
- package/functions/computation-system/paper.md +93 -0
- package/functions/computation-system/persistence/RunRecorder.js +7 -7
- package/functions/computation-system/tools/BuildReporter.js +5 -5
- package/functions/computation-system/workflows/bulltrackers_pipeline.yaml +10 -8
- package/package.json +1 -1
- package/functions/computation-system/onboarding.md +0 -210
|
@@ -89,12 +89,12 @@ function getDependencySet(endpoints, adjacencyList) {
|
|
|
89
89
|
* Returns a string description of the first cycle found.
|
|
90
90
|
*/
|
|
91
91
|
function detectCircularDependencies(manifestMap) {
|
|
92
|
-
let index
|
|
93
|
-
const stack
|
|
94
|
-
const indices
|
|
92
|
+
let index = 0;
|
|
93
|
+
const stack = [];
|
|
94
|
+
const indices = new Map();
|
|
95
95
|
const lowLinks = new Map();
|
|
96
|
-
const onStack
|
|
97
|
-
const cycles
|
|
96
|
+
const onStack = new Set();
|
|
97
|
+
const cycles = [];
|
|
98
98
|
|
|
99
99
|
function strongconnect(v) {
|
|
100
100
|
indices.set(v, index);
|
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* @fileoverview Executor for "Standard" (per-user) calculations.
|
|
3
3
|
* UPDATED: Implements Batch Flushing to prevent OOM on large datasets.
|
|
4
|
-
* UPDATED:
|
|
4
|
+
* UPDATED: Implements "Circuit Breaker" to fail fast on high error rates.
|
|
5
|
+
* UPDATED: Implements "Adaptive Flushing" based on V8 Heap usage.
|
|
5
6
|
* UPDATED: Manages incremental sharding states.
|
|
6
7
|
* UPDATED: Implements 'isInitialWrite' flag for robust cleanup.
|
|
7
8
|
*/
|
|
@@ -12,6 +13,7 @@ const { ContextFactory } = require
|
|
|
12
13
|
const { commitResults } = require('../persistence/ResultCommitter');
|
|
13
14
|
const mathLayer = require('../layers/index');
|
|
14
15
|
const { performance } = require('perf_hooks');
|
|
16
|
+
const v8 = require('v8'); // [NEW] For Memory introspection
|
|
15
17
|
|
|
16
18
|
class StandardExecutor {
|
|
17
19
|
static async run(date, calcs, passName, config, deps, rootData, fetchedDeps, previousFetchedDeps, skipStatusWrite = false) {
|
|
@@ -59,6 +61,9 @@ class StandardExecutor {
|
|
|
59
61
|
const aggregatedSuccess = {};
|
|
60
62
|
const aggregatedFailures = [];
|
|
61
63
|
|
|
64
|
+
// [NEW] Global Error Tracking for Circuit Breaker
|
|
65
|
+
const errorStats = { count: 0, total: 0 };
|
|
66
|
+
|
|
62
67
|
Object.keys(state).forEach(name => {
|
|
63
68
|
executionStats[name] = {
|
|
64
69
|
processedUsers: 0,
|
|
@@ -89,7 +94,7 @@ class StandardExecutor {
|
|
|
89
94
|
|
|
90
95
|
let yP_chunk = {}, tH_chunk = {};
|
|
91
96
|
|
|
92
|
-
const
|
|
97
|
+
const MIN_BATCH_SIZE = 1000; // Minimum to process before checking stats
|
|
93
98
|
let usersSinceLastFlush = 0;
|
|
94
99
|
|
|
95
100
|
try {
|
|
@@ -103,6 +108,8 @@ class StandardExecutor {
|
|
|
103
108
|
const chunkSize = Object.keys(tP_chunk).length;
|
|
104
109
|
|
|
105
110
|
const startProcessing = performance.now();
|
|
111
|
+
|
|
112
|
+
// [UPDATED] Collect execution results (success/failure counts)
|
|
106
113
|
const promises = streamingCalcs.map(calc =>
|
|
107
114
|
StandardExecutor.executePerUser(
|
|
108
115
|
calc, calc.manifest, dateStr, tP_chunk, yP_chunk, tH_chunk,
|
|
@@ -110,15 +117,37 @@ class StandardExecutor {
|
|
|
110
117
|
executionStats[normalizeName(calc.manifest.name)]
|
|
111
118
|
)
|
|
112
119
|
);
|
|
113
|
-
|
|
120
|
+
|
|
121
|
+
const batchResults = await Promise.all(promises);
|
|
114
122
|
const procDuration = performance.now() - startProcessing;
|
|
115
123
|
|
|
116
124
|
Object.keys(executionStats).forEach(name => executionStats[name].timings.processing += procDuration);
|
|
117
125
|
|
|
126
|
+
// [NEW] Update Error Stats
|
|
127
|
+
batchResults.forEach(r => {
|
|
128
|
+
errorStats.total += (r.success + r.failures);
|
|
129
|
+
errorStats.count += r.failures;
|
|
130
|
+
});
|
|
131
|
+
|
|
132
|
+
// [NEW] Circuit Breaker: Fail fast if error rate > 10% after processing 100+ items
|
|
133
|
+
// We check total > 100 to avoid failing on the very first user if they happen to be bad.
|
|
134
|
+
if (errorStats.total > 100 && (errorStats.count / errorStats.total) > 0.10) {
|
|
135
|
+
const failRate = (errorStats.count / errorStats.total * 100).toFixed(1);
|
|
136
|
+
throw new Error(`[Circuit Breaker] High failure rate detected (${failRate}%). Aborting batch to prevent silent data loss.`);
|
|
137
|
+
}
|
|
138
|
+
|
|
118
139
|
usersSinceLastFlush += chunkSize;
|
|
119
140
|
|
|
120
|
-
|
|
121
|
-
|
|
141
|
+
// [NEW] Adaptive Flushing (Memory Pressure Check)
|
|
142
|
+
const heapStats = v8.getHeapStatistics();
|
|
143
|
+
const heapUsedRatio = heapStats.used_heap_size / heapStats.heap_size_limit;
|
|
144
|
+
const MEMORY_THRESHOLD = 0.70; // 70% of available RAM
|
|
145
|
+
const COUNT_THRESHOLD = 5000;
|
|
146
|
+
|
|
147
|
+
if (usersSinceLastFlush >= COUNT_THRESHOLD || heapUsedRatio > MEMORY_THRESHOLD) {
|
|
148
|
+
const reason = heapUsedRatio > MEMORY_THRESHOLD ? `MEMORY_PRESSURE (${(heapUsedRatio*100).toFixed(0)}%)` : 'BATCH_LIMIT';
|
|
149
|
+
|
|
150
|
+
logger.log('INFO', `[${passName}] 🛁 Flushing buffer after ${usersSinceLastFlush} users. Reason: ${reason}`);
|
|
122
151
|
|
|
123
152
|
// [UPDATED] Pass isInitialWrite: true only on the first flush
|
|
124
153
|
const flushResult = await StandardExecutor.flushBuffer(state, dateStr, passName, config, deps, shardIndexMap, executionStats, 'INTERMEDIATE', true, !hasFlushed);
|
|
@@ -171,6 +200,7 @@ class StandardExecutor {
|
|
|
171
200
|
_executionStats: executionStats[name]
|
|
172
201
|
};
|
|
173
202
|
|
|
203
|
+
// Clear the memory immediately after preparing the commit
|
|
174
204
|
inst.results = {};
|
|
175
205
|
}
|
|
176
206
|
|
|
@@ -226,6 +256,10 @@ class StandardExecutor {
|
|
|
226
256
|
const insights = metadata.rootDataDependencies?.includes('insights') ? { today: await loader.loadInsights(dateStr) } : null;
|
|
227
257
|
const SCHEMAS = mathLayer.SCHEMAS;
|
|
228
258
|
|
|
259
|
+
// [NEW] Track local batch success/failure
|
|
260
|
+
let chunkSuccess = 0;
|
|
261
|
+
let chunkFailures = 0;
|
|
262
|
+
|
|
229
263
|
for (const [userId, todayPortfolio] of Object.entries(portfolioData)) {
|
|
230
264
|
const yesterdayPortfolio = yesterdayPortfolioData ? yesterdayPortfolioData[userId] : null;
|
|
231
265
|
const todayHistory = historyData ? historyData[userId] : null;
|
|
@@ -249,10 +283,16 @@ class StandardExecutor {
|
|
|
249
283
|
try {
|
|
250
284
|
await calcInstance.process(context);
|
|
251
285
|
if (stats) stats.processedUsers++;
|
|
286
|
+
chunkSuccess++;
|
|
252
287
|
}
|
|
253
|
-
catch (e) {
|
|
288
|
+
catch (e) {
|
|
289
|
+
logger.log('WARN', `Calc ${metadata.name} failed for user ${userId}: ${e.message}`);
|
|
290
|
+
chunkFailures++;
|
|
291
|
+
}
|
|
254
292
|
}
|
|
293
|
+
|
|
294
|
+
return { success: chunkSuccess, failures: chunkFailures };
|
|
255
295
|
}
|
|
256
296
|
}
|
|
257
297
|
|
|
258
|
-
module.exports = { StandardExecutor };
|
|
298
|
+
module.exports = { StandardExecutor };
|
|
@@ -2,9 +2,11 @@
|
|
|
2
2
|
* FILENAME: computation-system/helpers/computation_dispatcher.js
|
|
3
3
|
* PURPOSE: "Smart Dispatcher" - Analyzes state, initializes Run Counters, and dispatches tasks.
|
|
4
4
|
* UPDATED: Implements Callback Pattern. Initializes 'computation_runs' doc for worker coordination.
|
|
5
|
+
* UPDATED: Implements Forensic Crash Analysis & Intelligent Resource Routing.
|
|
6
|
+
* FIXED: Implemented "Catch-Up" logic to scan full history (Start -> Target Date) instead of just Target Date.
|
|
5
7
|
*/
|
|
6
8
|
|
|
7
|
-
const { getExpectedDateStrings, normalizeName, DEFINITIVE_EARLIEST_DATES } = require('../utils/utils.js');
|
|
9
|
+
const { getExpectedDateStrings, getEarliestDataDates, normalizeName, DEFINITIVE_EARLIEST_DATES } = require('../utils/utils.js');
|
|
8
10
|
const { groupByPass, analyzeDateExecution } = require('../WorkflowOrchestrator.js');
|
|
9
11
|
const { PubSubUtils } = require('../../core/utils/pubsub_utils');
|
|
10
12
|
const { fetchComputationStatus, updateComputationStatus } = require('../persistence/StatusRepository');
|
|
@@ -13,12 +15,49 @@ const { generateCodeHash } = require('../topology/HashManag
|
|
|
13
15
|
const pLimit = require('p-limit');
|
|
14
16
|
const crypto = require('crypto');
|
|
15
17
|
|
|
16
|
-
const
|
|
17
|
-
|
|
18
|
+
const STATUS_IMPOSSIBLE = 'IMPOSSIBLE';
|
|
19
|
+
|
|
20
|
+
// Threshold to trigger high-mem routing (e.g., 1.5 GB for a 2GB worker)
|
|
21
|
+
const OOM_THRESHOLD_MB = 1500;
|
|
22
|
+
|
|
23
|
+
/**
 * Forensics: decides which worker class a computation should be routed to,
 * based on what the audit ledger recorded for it.
 *
 * Reads `computation_audit_ledger/{date}/passes/{pass}/tasks/{computationName}`
 * and returns 'high-mem' when either:
 *   - `telemetry.lastMemory.rssMB` is above OOM_THRESHOLD_MB, or
 *   - the entry has status 'FAILED' and its error text mentions "memory"
 *     (case-insensitive).
 * Every other case, including a missing ledger entry or a failed read,
 * yields 'standard'.
 *
 * @param {Object} db - Firestore-style handle; only `db.doc(path).get()` is used.
 * @param {string} date - Date segment of the ledger path.
 * @param {string} pass - Pass segment of the ledger path.
 * @param {string} computationName - Task segment of the ledger path.
 * @returns {Promise<string>} 'high-mem' or 'standard'. Never rejects: read
 *   errors are logged with console.warn and treated as 'standard'.
 */
async function checkCrashForensics(db, date, pass, computationName) {
    try {
        const ledgerPath = `computation_audit_ledger/${date}/passes/${pass}/tasks/${computationName}`;
        const doc = await db.doc(ledgerPath).get();

        if (!doc.exists) return 'standard';

        // Guard against a snapshot that reports `exists` but yields no data.
        const data = doc.data() || {};

        // Check if we have telemetry from a previous run.
        const lastRSS = data.telemetry?.lastMemory?.rssMB || 0;
        if (lastRSS > OOM_THRESHOLD_MB) {
            console.log(`[Dispatcher] 🕵️♀️ Forensics: ${computationName} likely OOM'd at ${lastRSS}MB. Routing to HIGH-MEM.`);
            return 'high-mem';
        }

        // Also check if it's explicitly marked FAILED with 'memory' in the error.
        // The error may be a plain string or an object carrying a `message`;
        // testing the regex against an object directly would coerce it to
        // "[object Object]" and never match, so pull the text out first.
        const rawError = data.error;
        const errorText = typeof rawError === 'string'
            ? rawError
            : String(rawError?.message ?? rawError ?? '');

        if (data.status === 'FAILED' && /memory/i.test(errorText)) {
            return 'high-mem';
        }
    } catch (e) {
        // Forensics is advisory only: a failed lookup must not block dispatch.
        console.warn(`[Dispatcher] Forensics check failed for ${computationName}: ${e.message}`);
    }

    return 'standard';
}
|
|
18
57
|
|
|
19
58
|
/**
|
|
20
59
|
* Dispatches computation tasks for a specific pass.
|
|
21
|
-
* @param {Object} config - System config
|
|
60
|
+
* @param {Object} config - System config (Injected with topics)
|
|
22
61
|
* @param {Object} dependencies - { db, logger, ... }
|
|
23
62
|
* @param {Array} computationManifest - List of calculations
|
|
24
63
|
* @param {Object} reqBody - (Optional) HTTP Body containing 'callbackUrl' and 'date'
|
|
@@ -28,7 +67,8 @@ async function dispatchComputationPass(config, dependencies, computationManifest
|
|
|
28
67
|
const pubsubUtils = new PubSubUtils(dependencies);
|
|
29
68
|
const passToRun = String(config.COMPUTATION_PASS_TO_RUN);
|
|
30
69
|
|
|
31
|
-
//
|
|
70
|
+
// Extract Date and Callback from request body (pushed by Workflow)
|
|
71
|
+
// NOTE: 'dateStr' acts as the "Target Date" (Ceiling), usually T-1.
|
|
32
72
|
const dateStr = reqBody.date || config.date;
|
|
33
73
|
const callbackUrl = reqBody.callbackUrl || null;
|
|
34
74
|
|
|
@@ -44,18 +84,30 @@ async function dispatchComputationPass(config, dependencies, computationManifest
|
|
|
44
84
|
|
|
45
85
|
if (!calcsInThisPass.length) { return logger.log('WARN', `[Dispatcher] No calcs for Pass ${passToRun}. Exiting.`); }
|
|
46
86
|
|
|
47
|
-
|
|
48
|
-
logger.log('INFO', `🚀 [Dispatcher] Smart-Dispatching PASS ${passToRun} for ${dateStr}`);
|
|
87
|
+
logger.log('INFO', `🚀 [Dispatcher] Smart-Dispatching PASS ${passToRun} (Target: ${dateStr})`);
|
|
49
88
|
|
|
50
|
-
// -- DATE ANALYSIS LOGIC (
|
|
51
|
-
const passEarliestDate = Object.values(DEFINITIVE_EARLIEST_DATES).reduce((a, b) => a < b ? a : b);
|
|
52
|
-
const endDateUTC = new Date(Date.UTC(new Date().getUTCFullYear(), new Date().getUTCMonth(), new Date().getUTCDate() - 1));
|
|
89
|
+
// -- DATE ANALYSIS LOGIC (FIXED: RANGE SCAN) --
|
|
53
90
|
|
|
54
|
-
//
|
|
55
|
-
const
|
|
91
|
+
// 1. Determine the absolute start of data history
|
|
92
|
+
const earliestDates = await getEarliestDataDates(config, dependencies);
|
|
93
|
+
const startDate = earliestDates.absoluteEarliest;
|
|
94
|
+
const endDate = new Date(dateStr + 'T00:00:00Z');
|
|
95
|
+
|
|
96
|
+
// 2. Generate the full range of dates to check
|
|
97
|
+
let allExpectedDates = getExpectedDateStrings(startDate, endDate);
|
|
98
|
+
|
|
99
|
+
// Safety fallback: if range is invalid or empty, default to target date only
|
|
100
|
+
if (!allExpectedDates || allExpectedDates.length === 0) {
|
|
101
|
+
logger.log('WARN', `[Dispatcher] Date range calculation returned empty (Start: ${startDate.toISOString()} -> End: ${endDate.toISOString()}). Defaulting to single target date.`);
|
|
102
|
+
allExpectedDates = [dateStr];
|
|
103
|
+
} else {
|
|
104
|
+
logger.log('INFO', `[Dispatcher] 📅 Analysis Range: ${allExpectedDates.length} days (${allExpectedDates[0]} to ${allExpectedDates[allExpectedDates.length-1]})`);
|
|
105
|
+
}
|
|
56
106
|
|
|
57
107
|
const manifestMap = new Map(computationManifest.map(c => [normalizeName(c.name), c]));
|
|
58
108
|
const tasksToDispatch = [];
|
|
109
|
+
|
|
110
|
+
// Concurrency limit for analysis & forensics (Parallelize the historical scan)
|
|
59
111
|
const limit = pLimit(20);
|
|
60
112
|
|
|
61
113
|
const analysisPromises = allExpectedDates.map(d => limit(async () => {
|
|
@@ -71,6 +123,7 @@ async function dispatchComputationPass(config, dependencies, computationManifest
|
|
|
71
123
|
prevDate.setUTCDate(prevDate.getUTCDate() - 1);
|
|
72
124
|
prevDateStr = prevDate.toISOString().slice(0, 10);
|
|
73
125
|
|
|
126
|
+
// Only fetch previous status if it's within valid range
|
|
74
127
|
if (prevDate >= DEFINITIVE_EARLIEST_DATES.absoluteEarliest) {
|
|
75
128
|
fetchPromises.push(fetchComputationStatus(prevDateStr, config, dependencies));
|
|
76
129
|
}
|
|
@@ -87,18 +140,16 @@ async function dispatchComputationPass(config, dependencies, computationManifest
|
|
|
87
140
|
|
|
88
141
|
const report = analyzeDateExecution(d, calcsInThisPass, rootDataStatus, dailyStatus, manifestMap, prevDailyStatus);
|
|
89
142
|
|
|
143
|
+
// Handle Status Updates (Impossible / Blocked)
|
|
90
144
|
const statusUpdates = {};
|
|
91
|
-
|
|
92
145
|
report.impossible.forEach(item => {
|
|
93
146
|
if (dailyStatus[item.name]?.hash !== STATUS_IMPOSSIBLE) {
|
|
94
147
|
statusUpdates[item.name] = { hash: STATUS_IMPOSSIBLE, category: 'unknown', reason: item.reason };
|
|
95
148
|
}
|
|
96
149
|
});
|
|
97
|
-
|
|
98
150
|
report.blocked.forEach(item => {
|
|
99
151
|
statusUpdates[item.name] = { hash: false, category: 'unknown', reason: item.reason };
|
|
100
152
|
});
|
|
101
|
-
|
|
102
153
|
report.failedDependency.forEach(item => {
|
|
103
154
|
const missingStr = item.missing ? item.missing.join(', ') : 'unknown';
|
|
104
155
|
statusUpdates[item.name] = { hash: false, category: 'unknown', reason: `Dependency Missing: ${missingStr}` };
|
|
@@ -109,21 +160,29 @@ async function dispatchComputationPass(config, dependencies, computationManifest
|
|
|
109
160
|
}
|
|
110
161
|
|
|
111
162
|
const validToRun = [...report.runnable, ...report.reRuns];
|
|
112
|
-
|
|
163
|
+
|
|
164
|
+
// [NEW] Parallel Forensics Check
|
|
165
|
+
await Promise.all(validToRun.map(item => limit(async () => {
|
|
166
|
+
const compName = normalizeName(item.name);
|
|
167
|
+
|
|
168
|
+
// 1. Determine Resource Requirements
|
|
169
|
+
const requiredResource = await checkCrashForensics(db, d, passToRun, compName);
|
|
170
|
+
|
|
113
171
|
const uniqueDispatchId = crypto.randomUUID();
|
|
114
172
|
tasksToDispatch.push({
|
|
115
173
|
action: 'RUN_COMPUTATION_DATE',
|
|
116
174
|
dispatchId: uniqueDispatchId,
|
|
117
175
|
date: d,
|
|
118
176
|
pass: passToRun,
|
|
119
|
-
computation:
|
|
177
|
+
computation: compName,
|
|
120
178
|
hash: item.hash || item.newHash,
|
|
121
179
|
previousCategory: item.previousCategory || null,
|
|
122
180
|
triggerReason: item.reason || "Unknown",
|
|
123
181
|
dependencyResultHashes: item.dependencyResultHashes || {},
|
|
124
|
-
timestamp: Date.now()
|
|
182
|
+
timestamp: Date.now(),
|
|
183
|
+
resources: requiredResource // 'standard' or 'high-mem'
|
|
125
184
|
});
|
|
126
|
-
});
|
|
185
|
+
})));
|
|
127
186
|
|
|
128
187
|
} catch (e) {
|
|
129
188
|
logger.log('ERROR', `[Dispatcher] Failed analysis for ${d}: ${e.message}`);
|
|
@@ -132,10 +191,9 @@ async function dispatchComputationPass(config, dependencies, computationManifest
|
|
|
132
191
|
|
|
133
192
|
await Promise.all(analysisPromises);
|
|
134
193
|
|
|
135
|
-
// --
|
|
194
|
+
// -- CALLBACK & COUNTER INITIALIZATION --
|
|
136
195
|
|
|
137
196
|
if (tasksToDispatch.length > 0) {
|
|
138
|
-
logger.log('INFO', `[Dispatcher] 📝 Preparing ${tasksToDispatch.length} tasks for execution...`);
|
|
139
197
|
|
|
140
198
|
// 1. Initialize Shared State Document (The Counter)
|
|
141
199
|
const runId = crypto.randomUUID();
|
|
@@ -144,17 +202,17 @@ async function dispatchComputationPass(config, dependencies, computationManifest
|
|
|
144
202
|
if (callbackUrl) {
|
|
145
203
|
await db.doc(metaStatePath).set({
|
|
146
204
|
createdAt: new Date(),
|
|
147
|
-
date: dateStr,
|
|
205
|
+
date: dateStr, // Acts as the "Job Label" (target date)
|
|
148
206
|
pass: passToRun,
|
|
149
207
|
totalTasks: tasksToDispatch.length,
|
|
150
|
-
remainingTasks: tasksToDispatch.length,
|
|
151
|
-
callbackUrl: callbackUrl,
|
|
208
|
+
remainingTasks: tasksToDispatch.length,
|
|
209
|
+
callbackUrl: callbackUrl,
|
|
152
210
|
status: 'IN_PROGRESS'
|
|
153
211
|
});
|
|
154
|
-
logger.log('INFO', `[Dispatcher] 🏁 Run State Initialized: ${runId}`);
|
|
212
|
+
logger.log('INFO', `[Dispatcher] 🏁 Run State Initialized: ${runId}. Tasks: ${tasksToDispatch.length}`);
|
|
155
213
|
}
|
|
156
214
|
|
|
157
|
-
// 2. Attach Run Metadata
|
|
215
|
+
// 2. Attach Run Metadata
|
|
158
216
|
tasksToDispatch.forEach(task => {
|
|
159
217
|
task.runId = runId;
|
|
160
218
|
task.metaStatePath = callbackUrl ? metaStatePath : null;
|
|
@@ -180,12 +238,13 @@ async function dispatchComputationPass(config, dependencies, computationManifest
|
|
|
180
238
|
t.set(ledgerRef, {
|
|
181
239
|
status: 'PENDING',
|
|
182
240
|
dispatchId: task.dispatchId,
|
|
183
|
-
runId: task.runId,
|
|
241
|
+
runId: task.runId,
|
|
184
242
|
computation: task.computation,
|
|
185
243
|
expectedHash: task.hash || 'unknown',
|
|
186
244
|
createdAt: new Date(),
|
|
187
245
|
dispatcherHash: currentManifestHash,
|
|
188
246
|
triggerReason: task.triggerReason,
|
|
247
|
+
resources: task.resources, // Log intended resource type
|
|
189
248
|
retries: 0
|
|
190
249
|
}, { merge: true });
|
|
191
250
|
|
|
@@ -201,22 +260,36 @@ async function dispatchComputationPass(config, dependencies, computationManifest
|
|
|
201
260
|
|
|
202
261
|
await Promise.all(txnPromises);
|
|
203
262
|
|
|
204
|
-
// 4. Publish to Pub/Sub
|
|
263
|
+
// 4. Publish to Pub/Sub (Segregated by Resources)
|
|
205
264
|
if (finalDispatched.length > 0) {
|
|
206
|
-
logger.log('INFO', `[Dispatcher] ✅ Publishing ${finalDispatched.length} tasks to Pub/Sub...`);
|
|
207
265
|
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
266
|
+
const standardTasks = finalDispatched.filter(t => t.resources !== 'high-mem');
|
|
267
|
+
const highMemTasks = finalDispatched.filter(t => t.resources === 'high-mem');
|
|
268
|
+
|
|
269
|
+
// Publish Standard
|
|
270
|
+
if (standardTasks.length > 0) {
|
|
271
|
+
logger.log('INFO', `[Dispatcher] ✅ Publishing ${standardTasks.length} Standard tasks...`);
|
|
272
|
+
await pubsubUtils.batchPublishTasks(dependencies, {
|
|
273
|
+
topicName: config.computationTopicStandard || 'computation-tasks',
|
|
274
|
+
tasks: standardTasks,
|
|
275
|
+
taskType: `computation-pass-${passToRun}-std`,
|
|
276
|
+
maxPubsubBatchSize: 100
|
|
277
|
+
});
|
|
278
|
+
}
|
|
279
|
+
|
|
280
|
+
// Publish High-Mem
|
|
281
|
+
if (highMemTasks.length > 0) {
|
|
282
|
+
logger.log('INFO', `[Dispatcher] 🏋️♀️ Publishing ${highMemTasks.length} tasks to HIGH-MEM infrastructure.`);
|
|
283
|
+
await pubsubUtils.batchPublishTasks(dependencies, {
|
|
284
|
+
topicName: config.computationTopicHighMem || 'computation-tasks-highmem',
|
|
285
|
+
tasks: highMemTasks,
|
|
286
|
+
taskType: `computation-pass-${passToRun}-highmem`,
|
|
287
|
+
maxPubsubBatchSize: 100
|
|
288
|
+
});
|
|
289
|
+
}
|
|
214
290
|
|
|
215
|
-
// Return count so workflow knows to wait
|
|
216
291
|
return { dispatched: finalDispatched.length, runId };
|
|
217
292
|
} else {
|
|
218
|
-
// Edge Case: Analysis said "Run", but Ledger said "Already Done"
|
|
219
|
-
// We must update the state doc to 0 or delete it, OR return 0 so workflow doesn't wait.
|
|
220
293
|
logger.log('INFO', `[Dispatcher] All tasks were already COMPLETED.`);
|
|
221
294
|
return { dispatched: 0 };
|
|
222
295
|
}
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
* FILENAME: computation-system/helpers/computation_worker.js
|
|
3
3
|
* PURPOSE: Consumes tasks, executes logic, and signals Workflow upon Batch Completion.
|
|
4
4
|
* UPDATED: Implements IAM Auth for Workflow Callbacks.
|
|
5
|
+
* UPDATED: Implements Memory Heartbeat (Flight Recorder) for OOM detection.
|
|
5
6
|
*/
|
|
6
7
|
|
|
7
8
|
const { executeDispatchTask } = require('../WorkflowOrchestrator.js');
|
|
@@ -9,7 +10,7 @@ const { getManifest } = require('../topology/ManifestLoader');
|
|
|
9
10
|
const { StructuredLogger } = require('../logger/logger');
|
|
10
11
|
const { recordRunAttempt } = require('../persistence/RunRecorder');
|
|
11
12
|
const https = require('https');
|
|
12
|
-
const { GoogleAuth } = require('google-auth-library');
|
|
13
|
+
const { GoogleAuth } = require('google-auth-library');
|
|
13
14
|
|
|
14
15
|
let calculationPackage;
|
|
15
16
|
try { calculationPackage = require('aiden-shared-calculations-unified');
|
|
@@ -19,51 +20,68 @@ const calculations = calculationPackage.calculations;
|
|
|
19
20
|
const MAX_RETRIES = 3;
|
|
20
21
|
|
|
21
22
|
/**
|
|
22
|
-
* [NEW] Helper:
|
|
23
|
-
*
|
|
23
|
+
* [NEW] Helper: Starts a background heartbeat to track memory usage.
|
|
24
|
+
* This acts as a "Black Box Recorder". If the worker crashes (OOM),
|
|
25
|
+
* the last written value will remain in Firestore for the Dispatcher to analyze.
|
|
26
|
+
*/
|
|
27
|
+
function startMemoryHeartbeat(db, ledgerPath, intervalMs = 2000) {
    // Periodically writes this process's memory usage to the document at
    // `ledgerPath`, so the most recent sample stays in the ledger if the
    // process dies abruptly.
    //
    // db         - Firestore-style handle; only `db.doc(path).update()` is used.
    // ledgerPath - document path that receives the telemetry fields.
    // intervalMs - delay between samples, in milliseconds.
    // Returns the interval timer; callers stop the heartbeat with clearInterval().

    const toMB = (bytes) => Math.round(bytes / 1024 / 1024);

    // One sample: resident set size and used heap in whole megabytes, plus
    // the time the sample was taken.
    const snapshotMemory = () => {
        const { rss, heapUsed } = process.memoryUsage();
        return {
            rssMB: toMB(rss),
            heapUsedMB: toMB(heapUsed),
            timestamp: new Date()
        };
    };

    const recordHeartbeat = async () => {
        try {
            const lastMemory = snapshotMemory();
            // update() touches only the two telemetry fields, leaving the
            // rest of the ledger entry (e.g. status) as it is.
            await db.doc(ledgerPath).update({
                'telemetry.lastMemory': lastMemory,
                'telemetry.lastHeartbeat': new Date()
            });
        } catch (e) {
            // Telemetry is best-effort: a failed write must never take the
            // worker down, so the error is intentionally dropped.
        }
    };

    const timer = setInterval(recordHeartbeat, intervalMs);

    // An unref'd timer does not keep the event loop (and the process) alive.
    timer.unref();

    return timer;
}
|
|
55
|
+
|
|
56
|
+
/**
|
|
57
|
+
* Helper: Fires the webhook back to Google Cloud Workflows.
|
|
24
58
|
*/
|
|
25
59
|
async function triggerWorkflowCallback(url, status, logger) {
|
|
26
60
|
if (!url) return;
|
|
27
61
|
logger.log('INFO', `[Worker] 🔔 BATCH COMPLETE! Triggering Workflow Callback: ${status}`);
|
|
28
62
|
|
|
29
63
|
try {
|
|
30
|
-
|
|
31
|
-
const auth = new GoogleAuth({
|
|
32
|
-
scopes: ['https://www.googleapis.com/auth/cloud-platform']
|
|
33
|
-
});
|
|
64
|
+
const auth = new GoogleAuth({ scopes: ['https://www.googleapis.com/auth/cloud-platform'] });
|
|
34
65
|
const client = await auth.getClient();
|
|
35
66
|
const accessToken = await client.getAccessToken();
|
|
36
67
|
const token = accessToken.token;
|
|
37
68
|
|
|
38
|
-
// 2. Send Authenticated Request
|
|
39
69
|
return new Promise((resolve, reject) => {
|
|
40
|
-
const body = JSON.stringify({
|
|
41
|
-
status: status,
|
|
42
|
-
timestamp: new Date().toISOString()
|
|
43
|
-
});
|
|
70
|
+
const body = JSON.stringify({ status: status, timestamp: new Date().toISOString() });
|
|
44
71
|
|
|
45
72
|
const req = https.request(url, {
|
|
46
73
|
method: 'POST',
|
|
47
74
|
headers: {
|
|
48
75
|
'Content-Type': 'application/json',
|
|
49
76
|
'Content-Length': Buffer.byteLength(body),
|
|
50
|
-
'Authorization': `Bearer ${token}`
|
|
77
|
+
'Authorization': `Bearer ${token}`
|
|
51
78
|
}
|
|
52
79
|
}, (res) => {
|
|
53
|
-
if (res.statusCode >= 200 && res.statusCode < 300) {
|
|
54
|
-
|
|
55
|
-
} else {
|
|
56
|
-
logger.log('WARN', `Callback responded with ${res.statusCode}`);
|
|
57
|
-
// We resolve anyway to avoid crashing the worker logic
|
|
58
|
-
resolve();
|
|
59
|
-
}
|
|
80
|
+
if (res.statusCode >= 200 && res.statusCode < 300) { resolve(); }
|
|
81
|
+
else { logger.log('WARN', `Callback responded with ${res.statusCode}`); resolve(); }
|
|
60
82
|
});
|
|
61
83
|
|
|
62
|
-
req.on('error', (e) => {
|
|
63
|
-
logger.log('ERROR', `Failed to trigger callback: ${e.message}`);
|
|
64
|
-
resolve();
|
|
65
|
-
});
|
|
66
|
-
|
|
84
|
+
req.on('error', (e) => { logger.log('ERROR', `Failed to trigger callback: ${e.message}`); resolve(); });
|
|
67
85
|
req.write(body);
|
|
68
86
|
req.end();
|
|
69
87
|
});
|
|
@@ -73,37 +91,21 @@ async function triggerWorkflowCallback(url, status, logger) {
|
|
|
73
91
|
}
|
|
74
92
|
|
|
75
93
|
/**
|
|
76
|
-
*
|
|
77
|
-
* Returns the callbackUrl IF this was the last task.
|
|
94
|
+
* Helper: Decrements 'remainingTasks' in Firestore.
|
|
78
95
|
*/
|
|
79
96
|
async function decrementAndCheck(db, metaStatePath, logger) {
|
|
80
97
|
if (!metaStatePath) return null;
|
|
81
|
-
|
|
82
98
|
try {
|
|
83
99
|
const result = await db.runTransaction(async (t) => {
|
|
84
100
|
const ref = db.doc(metaStatePath);
|
|
85
101
|
const doc = await t.get(ref);
|
|
86
|
-
|
|
87
|
-
if (!doc.exists) return null; // State might have expired or been deleted
|
|
102
|
+
if (!doc.exists) return null;
|
|
88
103
|
const data = doc.data();
|
|
89
|
-
|
|
90
104
|
const newRemaining = (data.remainingTasks || 0) - 1;
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
remainingTasks: newRemaining,
|
|
94
|
-
lastUpdated: new Date()
|
|
95
|
-
});
|
|
96
|
-
|
|
97
|
-
// Return needed data only if we hit 0 (or lower, for safety)
|
|
98
|
-
return {
|
|
99
|
-
remaining: newRemaining,
|
|
100
|
-
callbackUrl: data.callbackUrl
|
|
101
|
-
};
|
|
105
|
+
t.update(ref, { remainingTasks: newRemaining, lastUpdated: new Date() });
|
|
106
|
+
return { remaining: newRemaining, callbackUrl: data.callbackUrl };
|
|
102
107
|
});
|
|
103
|
-
|
|
104
|
-
if (result && result.remaining <= 0) {
|
|
105
|
-
return result.callbackUrl;
|
|
106
|
-
}
|
|
108
|
+
if (result && result.remaining <= 0) return result.callbackUrl;
|
|
107
109
|
} catch (e) {
|
|
108
110
|
logger.log('ERROR', `[Worker] Failed to decrement batch counter: ${e.message}`);
|
|
109
111
|
}
|
|
@@ -125,12 +127,12 @@ async function handleComputationTask(message, config, dependencies) {
|
|
|
125
127
|
|
|
126
128
|
if (!data || data.action !== 'RUN_COMPUTATION_DATE') { return; }
|
|
127
129
|
|
|
128
|
-
// Extract fields including new metaStatePath
|
|
129
130
|
const { date, pass, computation, previousCategory, triggerReason, dispatchId, dependencyResultHashes, metaStatePath } = data;
|
|
130
131
|
|
|
131
132
|
if (!date || !pass || !computation) { logger.log('ERROR', `[Worker] Invalid payload.`, data); return; }
|
|
132
133
|
|
|
133
134
|
const retryCount = message.deliveryAttempt || 1;
|
|
135
|
+
const ledgerPath = `computation_audit_ledger/${date}/passes/${pass}/tasks/${computation}`;
|
|
134
136
|
|
|
135
137
|
// --- POISON MESSAGE HANDLING (DLQ) ---
|
|
136
138
|
if (retryCount > MAX_RETRIES) {
|
|
@@ -144,36 +146,38 @@ async function handleComputationTask(message, config, dependencies) {
|
|
|
144
146
|
failureReason: 'MAX_RETRIES_EXCEEDED'
|
|
145
147
|
});
|
|
146
148
|
|
|
147
|
-
await db.
|
|
149
|
+
await db.doc(ledgerPath).set({
|
|
148
150
|
status: 'FAILED',
|
|
149
151
|
error: 'Max Retries Exceeded (Poison Message)',
|
|
150
152
|
failedAt: new Date()
|
|
151
153
|
}, { merge: true });
|
|
152
154
|
|
|
153
155
|
const callbackUrl = await decrementAndCheck(db, metaStatePath, logger);
|
|
154
|
-
if (callbackUrl) {
|
|
155
|
-
await triggerWorkflowCallback(callbackUrl, 'SUCCESS', logger);
|
|
156
|
-
}
|
|
157
|
-
|
|
156
|
+
if (callbackUrl) { await triggerWorkflowCallback(callbackUrl, 'SUCCESS', logger); }
|
|
158
157
|
return;
|
|
159
158
|
} catch (dlqErr) { logger.log('FATAL', `[Worker] Failed to write to DLQ`, dlqErr); }
|
|
160
159
|
}
|
|
161
160
|
|
|
162
161
|
logger.log('INFO', `[Worker] 📥 Received Task: ${computation} (${date}) [Attempt ${retryCount}/${MAX_RETRIES}]`);
|
|
163
162
|
|
|
164
|
-
// Update Status to IN_PROGRESS
|
|
163
|
+
// 1. Update Status to IN_PROGRESS & Initialize Telemetry
|
|
165
164
|
try {
|
|
166
|
-
await db.
|
|
165
|
+
await db.doc(ledgerPath).set({
|
|
167
166
|
status: 'IN_PROGRESS',
|
|
168
167
|
workerId: process.env.K_REVISION || 'unknown',
|
|
169
168
|
startedAt: new Date(),
|
|
170
|
-
dispatchId: dispatchId
|
|
169
|
+
dispatchId: dispatchId,
|
|
170
|
+
telemetry: { startTime: new Date(), lastMemory: null } // Init for heartbeat
|
|
171
171
|
}, { merge: true });
|
|
172
172
|
} catch (leaseErr) {}
|
|
173
173
|
|
|
174
|
+
// 2. START HEARTBEAT (The Flight Recorder)
|
|
175
|
+
const heartbeatTimer = startMemoryHeartbeat(db, ledgerPath, 2000);
|
|
176
|
+
|
|
174
177
|
let computationManifest;
|
|
175
178
|
try { computationManifest = getManifest(config.activeProductLines || [], calculations, runDependencies);
|
|
176
179
|
} catch (manifestError) {
|
|
180
|
+
clearInterval(heartbeatTimer); // Stop if we fail early
|
|
177
181
|
logger.log('FATAL', `[Worker] Failed to load Manifest: ${manifestError.message}`);
|
|
178
182
|
return;
|
|
179
183
|
}
|
|
@@ -186,6 +190,9 @@ async function handleComputationTask(message, config, dependencies) {
|
|
|
186
190
|
);
|
|
187
191
|
const duration = Date.now() - startTime;
|
|
188
192
|
|
|
193
|
+
// STOP HEARTBEAT ON SUCCESS
|
|
194
|
+
clearInterval(heartbeatTimer);
|
|
195
|
+
|
|
189
196
|
const failureReport = result?.updates?.failureReport || [];
|
|
190
197
|
const successUpdates = result?.updates?.successUpdates || {};
|
|
191
198
|
|
|
@@ -194,26 +201,23 @@ async function handleComputationTask(message, config, dependencies) {
|
|
|
194
201
|
throw new Error(failReason.error.message || 'Computation Logic Failed');
|
|
195
202
|
}
|
|
196
203
|
else {
|
|
197
|
-
if (Object.keys(successUpdates).length > 0) {
|
|
198
|
-
|
|
199
|
-
} else {
|
|
200
|
-
logger.log('WARN', `[Worker] ⚠️ Empty Result: ${computation}`);
|
|
201
|
-
}
|
|
204
|
+
if (Object.keys(successUpdates).length > 0) { logger.log('INFO', `[Worker] ✅ Stored: ${computation}`); }
|
|
205
|
+
else { logger.log('WARN', `[Worker] ⚠️ Empty Result: ${computation}`); }
|
|
202
206
|
|
|
203
|
-
await db.
|
|
207
|
+
await db.doc(ledgerPath).update({
|
|
204
208
|
status: 'COMPLETED',
|
|
205
209
|
completedAt: new Date()
|
|
206
210
|
}).catch(() => {});
|
|
207
211
|
|
|
208
212
|
await recordRunAttempt(db, { date, computation, pass }, 'SUCCESS', null, { durationMs: duration }, triggerReason);
|
|
209
213
|
|
|
210
|
-
// Decrement & Callback
|
|
211
214
|
const callbackUrl = await decrementAndCheck(db, metaStatePath, logger);
|
|
212
|
-
if (callbackUrl) {
|
|
213
|
-
await triggerWorkflowCallback(callbackUrl, 'SUCCESS', logger);
|
|
214
|
-
}
|
|
215
|
+
if (callbackUrl) { await triggerWorkflowCallback(callbackUrl, 'SUCCESS', logger); }
|
|
215
216
|
}
|
|
216
217
|
} catch (err) {
|
|
218
|
+
// STOP HEARTBEAT ON ERROR
|
|
219
|
+
clearInterval(heartbeatTimer);
|
|
220
|
+
|
|
217
221
|
// --- ERROR HANDLING ---
|
|
218
222
|
const isDeterministicError = err.stage === 'SHARDING_LIMIT_EXCEEDED' ||
|
|
219
223
|
err.stage === 'QUALITY_CIRCUIT_BREAKER' ||
|
|
@@ -231,7 +235,7 @@ async function handleComputationTask(message, config, dependencies) {
|
|
|
231
235
|
failureReason: 'PERMANENT_DETERMINISTIC_ERROR'
|
|
232
236
|
});
|
|
233
237
|
|
|
234
|
-
await db.
|
|
238
|
+
await db.doc(ledgerPath).set({
|
|
235
239
|
status: 'FAILED',
|
|
236
240
|
error: err.message || 'Permanent Deterministic Error',
|
|
237
241
|
failedAt: new Date()
|
|
@@ -240,23 +244,17 @@ async function handleComputationTask(message, config, dependencies) {
|
|
|
240
244
|
await recordRunAttempt(db, { date, computation, pass }, 'FAILURE', { message: err.message, stage: err.stage || 'PERMANENT_FAIL' }, { durationMs: 0 }, triggerReason);
|
|
241
245
|
|
|
242
246
|
const callbackUrl = await decrementAndCheck(db, metaStatePath, logger);
|
|
243
|
-
if (callbackUrl) {
|
|
244
|
-
await triggerWorkflowCallback(callbackUrl, 'SUCCESS', logger);
|
|
245
|
-
}
|
|
246
|
-
|
|
247
|
+
if (callbackUrl) { await triggerWorkflowCallback(callbackUrl, 'SUCCESS', logger); }
|
|
247
248
|
return;
|
|
248
249
|
} catch (dlqErr) { logger.log('FATAL', `[Worker] Failed to write to DLQ`, dlqErr); }
|
|
249
250
|
}
|
|
250
251
|
|
|
251
|
-
if (retryCount >= MAX_RETRIES) {
|
|
252
|
-
throw err;
|
|
253
|
-
}
|
|
252
|
+
if (retryCount >= MAX_RETRIES) { throw err; }
|
|
254
253
|
|
|
255
254
|
logger.log('ERROR', `[Worker] ❌ Crash: ${computation}: ${err.message}`);
|
|
256
255
|
await recordRunAttempt(db, { date, computation, pass }, 'CRASH', { message: err.message, stack: err.stack, stage: 'SYSTEM_CRASH' }, { durationMs: 0 }, triggerReason);
|
|
257
|
-
|
|
258
256
|
throw err;
|
|
259
257
|
}
|
|
260
258
|
}
|
|
261
259
|
|
|
262
|
-
module.exports = { handleComputationTask };
|
|
260
|
+
module.exports = { handleComputationTask };
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
# The BullTrackers Computation System: An Advanced DAG-Based Architecture for High-Fidelity Financial Simulation
|
|
2
|
+
|
|
3
|
+
## Abstract
|
|
4
|
+
|
|
5
|
+
This paper details the design, implementation, and theoretical underpinnings of the BullTrackers Computation System, a proprietary high-performance execution engine designed for complex financial modeling and user behavior analysis. The system leverages a Directed Acyclic Graph (DAG) architecture to orchestrate interdependent calculations, employing Kahn’s Algorithm for topological sorting and Tarjan’s Algorithm for cycle detection. Key innovations include "Content-Based Dependency Short-Circuiting" for massive optimization, a "System Epoch" and "Infrastructure Hash" based auditing system for absolute reproducibility, and a batch-flushing execution model designed to mitigate Out-Of-Memory (OOM) errors during high-volume processing. We further explore the application of this system in running advanced psychometric and risk-geometry models ("Smart Money" scoring) and how the architecture supports self-healing workflows through granular state management.
|
|
6
|
+
|
|
7
|
+
## 1. Introduction
|
|
8
|
+
|
|
9
|
+
In modern financial analytics, derived data often depends on a complex web of varying input frequencies—real-time price ticks, daily portfolio snapshots, and historical trade logs. Traditional linear batch processing protocols fail to capture the nuances of these interdependencies, often leading to race conditions or redundant computations.
|
|
10
|
+
|
|
11
|
+
The BullTrackers Computation System was devised to solve this by treating the entire domain logic as a **Directed Acyclic Graph (DAG)**. Every calculation is a node, and every data requirement is an edge. By resolving the topology of this graph dynamically at runtime, the system ensures that:
|
|
12
|
+
1. Data is always available before it is consumed (referential integrity).
|
|
13
|
+
2. Only necessary computations are executed (efficiency).
|
|
14
|
+
3. Changes in code or infrastructure propagate deterministically through the graph (auditability).
|
|
15
|
+
|
|
16
|
+
## 2. Theoretical Foundations
|
|
17
|
+
|
|
18
|
+
The core utility of the system is its ability to turn a collection of loosely coupled JavaScript classes into a strictly ordered execution plan.
|
|
19
|
+
|
|
20
|
+
### 2.1 Directed Acyclic Graphs (DAGs)
|
|
21
|
+
We model the computation space as a DAG where $G = (V, E)$.
|
|
22
|
+
* **Vertices ($V$)**: Individual Calculation Units (e.g., `NetProfit`, [SmartMoneyScore](layers/profiling.js)).
|
|
23
|
+
* **Edges ($E$)**: Data dependencies, where an edge $(u, v)$ implies $v$ requires the output of $u$.
|
|
24
|
+
|
|
25
|
+
### 2.2 Topological Sorting (Kahn’s Algorithm)
|
|
26
|
+
To execute the graph, we must linearize it such that for every dependency $u \rightarrow v$, $u$ precedes $v$ in the execution order. We implement **Kahn’s Algorithm** within [ManifestBuilder.js](context/ManifestBuilder.js) to achieve this:
|
|
27
|
+
1. Calculate the **in-degree** (number of incoming edges) for all nodes.
|
|
28
|
+
2. Initialize a queue with all nodes having an in-degree of 0 (independent nodes).
|
|
29
|
+
3. While the queue is not empty:
|
|
30
|
+
* Dequeue node $N$ and add it to the `SortedManifest`.
|
|
31
|
+
* For each neighbor $M$ dependent on $N$, decrement $M$'s in-degree.
|
|
32
|
+
* If $M$'s in-degree becomes 0, enqueue $M$.
|
|
33
|
+
4. This generates a series of "Passes" or "Waves" of execution, allowing parallel processing of independent nodes within the same pass.
|
|
34
|
+
|
|
35
|
+
### 2.3 Cycle Detection (Tarjan’s Algorithm)
|
|
36
|
+
A critical failure mode in DAGs is the introduction of a cycle (e.g., A needs B, B needs A), effectively turning the DAG into a DCG (Directed Cyclic Graph), which is unresolvable.
|
|
37
|
+
If Kahn’s algorithm fails to visit all nodes (indicating a cycle exists), the system falls back to **Tarjan’s Strongly Connected Components (SCC) Algorithm**. This uses depth-first search to identify the exact cycle chain (e.g., `Calc A -> Calc B -> Calc C -> Calc A`), reporting the "First Cycle Found" to the developer for immediate remediation.
|
|
38
|
+
|
|
39
|
+
## 3. System Architecture & "Source of Truth"
|
|
40
|
+
|
|
41
|
+
The architecture is centered around the **Manifest**, a dynamic, immutable registry of all capabilities within the system.
|
|
42
|
+
|
|
43
|
+
### 3.1 The Dynamic Manifest
|
|
44
|
+
Unlike static build tools, the Manifest is built at runtime by [ManifestLoader.js](topology/ManifestLoader.js) and [ManifestBuilder.js](context/ManifestBuilder.js). It employs an **Auto-Discovery** mechanism that scans directories for calculation classes.
|
|
45
|
+
* **Static Metadata**: Each class exposes `getMetadata()` and `getDependencies()`.
|
|
46
|
+
* **Product Line Filtering**: The builder can slice the graph, generating a subgraph relevant only to specific product lines (e.g., "Crypto", "Stocks"), reducing overhead.
|
|
47
|
+
|
|
48
|
+
### 3.2 Granular Hashing & The Audit Chain
|
|
49
|
+
To ensure that "if the code hasn't changed, the result shouldn't change," the system implements a multi-layered hashing strategy ([HashManager.js](topology/HashManager.js)):
|
|
50
|
+
1. **Code Hash**: The raw string content of the calculation class.
|
|
51
|
+
2. **Layer Hash**: Hashes of shared utility layers (`mathematics`, `profiling`) used by the class.
|
|
52
|
+
3. **Dependency Hash**: A composite hash of all upstream dependencies.
|
|
53
|
+
4. **Infrastructure Hash**: A hash representing the underlying system environment.
|
|
54
|
+
5. **System Epoch**: A manual versioning flag to force global re-computation.
|
|
55
|
+
|
|
56
|
+
This results in a `Composite Hash`. If this hash matches the `storedHash` in the database, execution can be skipped entirely.
|
|
57
|
+
|
|
58
|
+
## 4. Execution Engine: Flow, Resilience & Optimization
|
|
59
|
+
|
|
60
|
+
The `WorkflowOrchestrator` acts as the runtime kernel, utilizing [StandardExecutor](executors/StandardExecutor.js) and [MetaExecutor](executors/MetaExecutor.js) for the heavy lifting.
|
|
61
|
+
|
|
62
|
+
### 4.1 Content-Based Dependency Short-Circuiting
|
|
63
|
+
A major optimization (O(n) gain) is the **Content-Based Short-Circuiting** logic found in [WorkflowOrchestrator.js](WorkflowOrchestrator.js):
|
|
64
|
+
Even if an upstream dependency *re-runs* (e.g., its timestamp changed), its *output* might be identical to the previous run.
|
|
65
|
+
1. The system tracks `ResultHash` (hash of the actual output data).
|
|
66
|
+
2. When checking dependencies for Node B (which depends on A), if A has re-run but its `ResultHash` is unchanged from what B used last time, B **does not need to re-run**.
|
|
67
|
+
3. This effectively stops "change propagation" dead in its tracks if the data change is semantically null.
|
|
68
|
+
|
|
69
|
+
### 4.2 Batch Flushing & OOM Prevention
|
|
70
|
+
Financial datasets (processing 100k+ users with daily portfolios) often exceed Node.js heap limits. The [StandardExecutor](executors/StandardExecutor.js) implements a **Streaming & Flushing** architecture:
|
|
71
|
+
* **Streams** inputs (Portfolio/History) using generators (`yield`), preventing loading all users into memory.
|
|
72
|
+
* **Buffers** results in a `state` object.
|
|
73
|
+
* **Flushes** to the database (Firestore/Storage) every $N$ users (e.g., 5000) and clears the internal buffer, which helps avoid Out-Of-Memory crashes.
|
|
74
|
+
* **Incremental Sharding**: It manages shard indices dynamically to split massive result sets into retrievable chunks.
|
|
75
|
+
|
|
76
|
+
### 4.3 Handling "Impossible" States
|
|
77
|
+
If a dependency fails or is missing critical data, the Orchestrator marks dependent nodes as `IMPOSSIBLE` rather than failing them. This allows the rest of the graph (independent branches) to continue execution, maximizing system throughput even in a partially degraded state.
|
|
78
|
+
|
|
79
|
+
## 5. Advanced Application: Psychometrics & Risk Geometry
|
|
80
|
+
|
|
81
|
+
The capabilities of this computation engine are best demonstrated by the [profiling.js](layers/profiling.js) layer it powers. Because the DAG ensures all historical and portfolio data is perfectly aligned, we can run sophisticated O(n^2) or O(n log n) algorithms on user data reliably.
|
|
82
|
+
|
|
83
|
+
### 5.1 "Smart Money" & Cognitive Profiling
|
|
84
|
+
The system executes a [UserClassifier](layers/profiling.js) that computes:
|
|
85
|
+
* **Risk Geometry**: Using the **Monotone Chain** algorithm to compute the Convex Hull of a user's risk/reward performance (Efficient Frontier analysis).
|
|
86
|
+
* **Psychometrics**: Detecting "Revenge Trading" (increasing risk after losses) and "Disposition Skew" (holding losers too long).
|
|
87
|
+
* **Attribution**: Separating "Luck" (market beta) from "Skill" (Alpha) by comparing performance against sector benchmarks.
|
|
88
|
+
|
|
89
|
+
These complex models depend on the *guarantee* provided by the DAG that all necessary history and price data is pre-computed and available in the [Context](simulation/Fabricator.js).
|
|
90
|
+
|
|
91
|
+
## 6. Conclusion
|
|
92
|
+
|
|
93
|
+
The BullTrackers Computation System represents a shift from "Action-Based" to "State-Based" architecture. By encoding the domain logic into a Directed Acyclic Graph, we achieve a system that is self-healing, massively scalable via short-circuiting and batching, and capable of supporting deep analytical models. It provides the robustness required for high-stakes financial simulation, ensuring that every decimal point is traceable, reproducible, and verifiable.
|
|
@@ -43,14 +43,14 @@ async function recordRunAttempt(db, context, status, error = null, detailedMetri
|
|
|
43
43
|
const timings = rawExecStats.timings || {};
|
|
44
44
|
|
|
45
45
|
const runEntry = {
|
|
46
|
-
runId:
|
|
46
|
+
runId: runId,
|
|
47
47
|
computationName: computation,
|
|
48
|
-
pass:
|
|
49
|
-
workerId:
|
|
50
|
-
targetDate:
|
|
51
|
-
triggerTime:
|
|
52
|
-
durationMs:
|
|
53
|
-
status:
|
|
48
|
+
pass: String(pass),
|
|
49
|
+
workerId: workerId,
|
|
50
|
+
targetDate: targetDate,
|
|
51
|
+
triggerTime: now.toISOString(),
|
|
52
|
+
durationMs: detailedMetrics.durationMs || 0,
|
|
53
|
+
status: status,
|
|
54
54
|
|
|
55
55
|
// [NEW] Trigger Context
|
|
56
56
|
trigger: {
|
|
@@ -325,13 +325,13 @@ async function generateBuildReport(config, dependencies, manifest, daysBack = 90
|
|
|
325
325
|
}
|
|
326
326
|
|
|
327
327
|
// 3. BLOCKED / IMPOSSIBLE / UPTODATE
|
|
328
|
-
analysis.blocked.forEach(item => pushIfValid(dateSummary.blocked,
|
|
329
|
-
analysis.failedDependency.forEach(item => pushIfValid(dateSummary.blocked,
|
|
330
|
-
analysis.impossible.forEach(item => pushIfValid(dateSummary.impossible, item));
|
|
331
|
-
analysis.skipped.forEach(item => pushIfValid(dateSummary.uptodate,
|
|
328
|
+
analysis.blocked.forEach (item => pushIfValid(dateSummary.blocked, item));
|
|
329
|
+
analysis.failedDependency.forEach (item => pushIfValid(dateSummary.blocked, item, "Dependency Missing"));
|
|
330
|
+
analysis.impossible.forEach (item => pushIfValid(dateSummary.impossible, item));
|
|
331
|
+
analysis.skipped.forEach (item => pushIfValid(dateSummary.uptodate, item, "Up To Date"));
|
|
332
332
|
|
|
333
333
|
// Meta stats
|
|
334
|
-
const includedCount = dateSummary.run.length
|
|
334
|
+
const includedCount = dateSummary.run.length + dateSummary.rerun.length + dateSummary.stable.length +
|
|
335
335
|
dateSummary.blocked.length + dateSummary.impossible.length + dateSummary.uptodate.length;
|
|
336
336
|
dateSummary.meta.totalIncluded = includedCount;
|
|
337
337
|
dateSummary.meta.match = (includedCount === expectedCount);
|
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
# Cloud Workflows Definition for BullTrackers Computation Pipeline
|
|
2
2
|
# Orchestrates 5 sequential passes using Event-Driven Callbacks (Zero Polling).
|
|
3
|
-
# FIXED:
|
|
4
|
-
# FIXED: Proper extraction of 'callback_details.url' for the dispatcher.
|
|
3
|
+
# FIXED: Restored 'passes' and 'max_retries' variables in init step.
|
|
5
4
|
|
|
6
5
|
main:
|
|
7
6
|
params: [input]
|
|
@@ -10,8 +9,14 @@ main:
|
|
|
10
9
|
assign:
|
|
11
10
|
- project: ${sys.get_env("GOOGLE_CLOUD_PROJECT_ID")}
|
|
12
11
|
- location: "europe-west1"
|
|
13
|
-
|
|
14
|
-
-
|
|
12
|
+
|
|
13
|
+
# T-1 Date Logic (Process Yesterday)
|
|
14
|
+
- now: ${sys.now()}
|
|
15
|
+
- yesterday_timestamp: ${now - 86400}
|
|
16
|
+
- yesterday_str: ${text.substring(time.format(yesterday_timestamp), 0, 10)}
|
|
17
|
+
- date_to_run: ${default(map.get(input, "date"), yesterday_str)}
|
|
18
|
+
|
|
19
|
+
# Configuration Variables (Restored)
|
|
15
20
|
- passes: ["1", "2", "3", "4", "5"]
|
|
16
21
|
- max_retries: 3
|
|
17
22
|
|
|
@@ -42,7 +47,6 @@ main:
|
|
|
42
47
|
- attempt_count: ${attempt_count + 1}
|
|
43
48
|
|
|
44
49
|
# 1. GENERATE CALLBACK ENDPOINT
|
|
45
|
-
# We use the 'events' library. This returns an object containing the URL.
|
|
46
50
|
- create_callback:
|
|
47
51
|
call: events.create_callback_endpoint
|
|
48
52
|
args:
|
|
@@ -60,7 +64,6 @@ main:
|
|
|
60
64
|
severity: "INFO"
|
|
61
65
|
|
|
62
66
|
# 2. TRIGGER DISPATCHER
|
|
63
|
-
# We pass the extracted 'callback_url' string to the dispatcher.
|
|
64
67
|
- trigger_dispatcher:
|
|
65
68
|
call: http.post
|
|
66
69
|
args:
|
|
@@ -89,12 +92,11 @@ main:
|
|
|
89
92
|
next: pass_retry_loop
|
|
90
93
|
|
|
91
94
|
# 4. WAIT FOR WORKER SIGNAL
|
|
92
|
-
# We must pass the original 'callback_details' object here, not the URL string.
|
|
93
95
|
- wait_for_completion:
|
|
94
96
|
call: events.await_callback
|
|
95
97
|
args:
|
|
96
98
|
callback: ${callback_details}
|
|
97
|
-
timeout:
|
|
99
|
+
timeout: 10800 # UPDATED: Reduced from 86400 (24h) to 10800 (3h) to detect crashes faster
|
|
98
100
|
result: callback_request
|
|
99
101
|
|
|
100
102
|
# 5. PROCESS SIGNAL
|
package/package.json
CHANGED
|
@@ -1,210 +0,0 @@
|
|
|
1
|
-
# BullTrackers Computation System: Architecture & Operational Manual
|
|
2
|
-
|
|
3
|
-
This document provides a comprehensive overview of the BullTrackers Computation System, a distributed, deterministic, and self-optimizing data pipeline. Unlike traditional task schedulers, this system operates on "Build System" principles, treating data calculations as compiled artifacts with strict versioning and dependency guarantees.
|
|
4
|
-
|
|
5
|
-
---
|
|
6
|
-
|
|
7
|
-
## 1. System Philosophy & Core Concepts
|
|
8
|
-
|
|
9
|
-
### The "Build System" Paradigm
|
|
10
|
-
We treat the computation pipeline like a large-scale software build system (e.g., Bazel or Make). Every data point is an "artifact" produced by a specific version of code (Code Hash) acting on specific versions of dependencies (Dependency Hashes).
|
|
11
|
-
* **Determinism**: If the input data and code haven't changed, the output *must* be identical. We verify this to skip unnecessary work.
|
|
12
|
-
* **Merkle Tree Structure**: The state of the system is a DAG (Directed Acyclic Graph) of hashes. A change in a root node propagates potential invalidation down the tree, but invalidation stops as soon as a node produces the same output as before (Short-Circuiting).
|
|
13
|
-
|
|
14
|
-
### Source-of-Truth Architecture
|
|
15
|
-
The **Root Data Index** is the absolute source of truth. No computation can start until the underlying raw data (prices, signals) is indexed and verified "Available" for the target date. This prevents partial runs and "garbage-in-garbage-out".
|
|
16
|
-
|
|
17
|
-
### The Three-Layer Hash Model
|
|
18
|
-
To optimize execution, we track three distinct hashes for every calculation:
|
|
19
|
-
1. **Code Hash (Static)**: A SHA-256 hash of the cleaned source code (comments and whitespace stripped). This tells us if the logic *might* have changed.
|
|
20
|
-
2. **SimHash (Behavioral)**: Generated by running the code against a deterministic "Fabricated" context. This tells us if the logic *actually* changed behavior (e.g., a refactor that changes variable names but not logic will have a different Code Hash but the same SimHash).
|
|
21
|
-
3. **ResultHash (Output)**: A hash of the actual production output from a run. This tells us if the data changed. Used for downstream short-circuiting.
|
|
22
|
-
|
|
23
|
-
---
|
|
24
|
-
|
|
25
|
-
## 2. Core Components Overview
|
|
26
|
-
|
|
27
|
-
### Root Data Indexer
|
|
28
|
-
A scheduled crawler that verifies the availability of raw external data (e.g., asset prices, global signals) for a given date. It produces an "Availability Manifest" that the Dispatcher consults before scheduling anything.
|
|
29
|
-
|
|
30
|
-
### Manifest Builder
|
|
31
|
-
* **Role**: Topology Discovery.
|
|
32
|
-
* **Mechanism**: It scans the `calculations/` directory, loads every module, and builds the global Dependency Graph (DAG) in memory.
|
|
33
|
-
* **Output**: A topological sort of all calculations assigned to "Passes" (Pass 0, Pass 1, etc.).
|
|
34
|
-
|
|
35
|
-
### The Dispatcher (`WorkflowOrchestrator.js`)
|
|
36
|
-
The "Brain" of the system. It runs largely stateless, analyzing the `StatusRepository` against the `Manifest`.
|
|
37
|
-
* **Responsibility**: For a given Grid (Date x Calculation), it determines if the state is `RUNNABLE`, `BLOCKED`, `SKIPPED`, or `IMPOSSIBLE`.
|
|
38
|
-
* **Key Logic**: It implements the "Short-Circuiting" and "Historical Continuity" checks.
|
|
39
|
-
|
|
40
|
-
### The Build Optimizer
|
|
41
|
-
A pre-flight tool that attempts to avoid running tasks by proving they are identical to previous versions.
|
|
42
|
-
* **Mechanism**: If a calculation's Code Hash changes, the Optimizer runs a **Simulation** (using `SimRunner`) to generate a SimHash. If the SimHash matches the registry, the system acts as if the code never changed, skipping the production re-run.
|
|
43
|
-
|
|
44
|
-
### The Worker (`StandardExecutor` / `MetaExecutor`)
|
|
45
|
-
The execution unit. It is unaware of the broader topology.
|
|
46
|
-
* **Input**: A target Calculation and Date.
|
|
47
|
-
* **Action**: Fetches inputs, runs `process()`, validates results, and writes to Firestore.
|
|
48
|
-
* **Output**: The computed data + the **ResultHash**.
|
|
49
|
-
|
|
50
|
-
---
|
|
51
|
-
|
|
52
|
-
## 3. The Daily Lifecycle (Chronological Process)
|
|
53
|
-
|
|
54
|
-
### Phase 1: Indexing
|
|
55
|
-
The system waits for the `SystemEpoch` to advance. The Root Data Indexer checks for "Canary Blocks" (indicators that external data providers have finished for the day). Once confirmed, the date is marked `OPEN`.
|
|
56
|
-
|
|
57
|
-
### Phase 2: Pre-Flight Optimization
|
|
58
|
-
Before dispatching workers:
|
|
59
|
-
1. The system identifies all calculations with new **Code Hashes**.
|
|
60
|
-
2. It runs `SimRunner` for these calculations to generate fresh **SimHashes**.
|
|
61
|
-
3. If `SimHash(New) == SimHash(Old)`, the system updates the Status Ledger to enable the new Code Hash without flagging it as "Changed".
|
|
62
|
-
|
|
63
|
-
### Phase 3: Dispatch Analysis
|
|
64
|
-
The Dispatcher iterates through the Topological Passes (0 -> N). For each calculation, it queries `calculateExecutionStatus`:
|
|
65
|
-
* Are dependencies done?
|
|
66
|
-
* Did dependencies change their output (`ResultHash`)?
|
|
67
|
-
* Is historical context available?
|
|
68
|
-
|
|
69
|
-
### Phase 4: Execution Waves
|
|
70
|
-
Workers are triggered via Pub/Sub or direct method invocation.
|
|
71
|
-
* **Pass 1**: Primitive conversions (e.g., Price Extractor).
|
|
72
|
-
* **Pass 2**: Technical Indicators that depend on Pass 1.
|
|
73
|
-
* **Pass 3**: Aggregations and Complex Metrics.
|
|
74
|
-
|
|
75
|
-
### Phase 5: Reconciliation
|
|
76
|
-
After all queues drain, the system performs a final sweep. Any tasks marked `FAILED` are retried (up to a limit). Impossible tasks are finalized as `IMPOSSIBLE`.
|
|
77
|
-
|
|
78
|
-
---
|
|
79
|
-
|
|
80
|
-
## 4. Deep Dive: Hashing & Dependency Logic
|
|
81
|
-
|
|
82
|
-
### Intrinsic Code Hashing
|
|
83
|
-
Located in `topology/HashManager.js`.
|
|
84
|
-
We generate a unique fingerprint for every calculation file:
|
|
85
|
-
```javascript
|
|
86
|
-
clean = codeString.replace(comments).replace(whitespace);
|
|
87
|
-
hash = sha256(clean);
|
|
88
|
-
```
|
|
89
|
-
This ensures that changes to comments or formatting do *not* trigger re-runs.
|
|
90
|
-
|
|
91
|
-
### Behavioral Hashing (SimHash)
|
|
92
|
-
Located in `simulation/SimRunner.js`.
|
|
93
|
-
When code changes, we can't be 100% sure it's safe just by looking at the source.
|
|
94
|
-
1. **The Fabricator**: Generates a deterministic mock `Context` (prices, previous results) based on the input schema.
|
|
95
|
-
2. **Simulation Run**: The calculation `process()` method is executed against this mock data.
|
|
96
|
-
3. **The Registry**: The hash of the *output* of this simulation is stored.
|
|
97
|
-
If a refactor results in the exact same Mock Output, the system considers the change "Cosmetic".
|
|
98
|
-
|
|
99
|
-
### Dependency Short-Circuiting
|
|
100
|
-
Implemented in `WorkflowOrchestrator.js` (`analyzeDateExecution`).
|
|
101
|
-
Even if an upstream calculation re-runs, downstream dependents might not need to.
|
|
102
|
-
* **Logic**:
|
|
103
|
-
* Calc A (Upstream) re-runs. Old Output Hash: `HashX`. New Output Hash: `HashX`.
|
|
104
|
-
* Calc B (Downstream) sees that Calc A "changed" (new timestamp), BUT the content hash `HashX` is identical to what Calc B used last time.
|
|
105
|
-
* **Result**: Calc B is `SKIPPED`.
|
|
106
|
-
|
|
107
|
-
---
|
|
108
|
-
|
|
109
|
-
## 5. Decision Logic & Edge Case Scenarios
|
|
110
|
-
|
|
111
|
-
### Scenario A: Standard Code Change (Logic)
|
|
112
|
-
* **Trigger**: You change the formula for `RSI`. Code Hash changes. SimHash changes.
|
|
113
|
-
* **Dispatcher**: Sees `storedHash !== currentHash`.
|
|
114
|
-
* **Result**: Marks as `RUNNABLE`. Worker runs.
|
|
115
|
-
|
|
116
|
-
### Scenario B: Cosmetic Code Change (Refactor)
|
|
117
|
-
* **Trigger**: You rename a variable in `RSI`. Code Hash changes. SimHash remains identical.
|
|
118
|
-
* **Optimizer**: Updates the centralized Status Ledger: "Version `Desc_v2` is equivalent to `Desc_v1`".
|
|
119
|
-
* **Dispatcher**: Sees the new hash in the ledger as "Verified".
|
|
120
|
-
* **Result**: Task is `SKIPPED`.
|
|
121
|
-
|
|
122
|
-
### Scenario C: Upstream Invalidation (The Cascade)
|
|
123
|
-
* **Condition**: `PriceExtractor` fixes a bug. `ResultHash` changes from `HashA` to `HashB`.
|
|
124
|
-
* **Downstream**: `RSI` checks detailed dependency report.
|
|
125
|
-
* **Check**: `LastRunDeps['PriceExtractor'] (HashA) !== CurrentDeps['PriceExtractor'] (HashB)`.
|
|
126
|
-
* **Result**: `RSI` is forced to re-run.
|
|
127
|
-
|
|
128
|
-
### Scenario D: Upstream Stability (The Firewall)
|
|
129
|
-
* **Condition**: `PriceExtractor` runs an optimization. Output is exact same data. `ResultHash` remains `HashA`.
|
|
130
|
-
* **Downstream**: `RSI` checks dependency report.
|
|
131
|
-
* **Check**: `LastRunDeps['PriceExtractor'] (HashA) === CurrentDeps['PriceExtractor'] (HashA)`.
|
|
132
|
-
* **Result**: `RSI` is `SKIPPED`. This firewall prevents massive re-calculation storms for non-functional upstream changes.
|
|
133
|
-
|
|
134
|
-
### Scenario E: The "Impossible" State
|
|
135
|
-
* **Condition**: Core market data is missing for `1990-01-01`.
|
|
136
|
-
* **Root Indexer**: Marks date as providing `[]` (empty) for critical inputs.
|
|
137
|
-
* **Dispatcher**: Marks `PriceExtractor` as `IMPOSSIBLE: NO_DATA`.
|
|
138
|
-
* **Propagation**: Any calculation depending on `PriceExtractor` sees the `IMPOSSIBLE` status and marks *itself* as `IMPOSSIBLE: UPSTREAM`.
|
|
139
|
-
* **Benefit**: The system doesn't waste cycles retrying calculations that can never succeed.
|
|
140
|
-
|
|
141
|
-
### Scenario F: Category Migration
|
|
142
|
-
* **Condition**: You change `getMetadata()` for a calculation, moving it from `signals` to `risk`.
|
|
143
|
-
* **Dispatcher**: Detects `storedCategory !== newCategory`.
|
|
144
|
-
* **Worker**:
|
|
145
|
-
1. Runs `process()` and writes to the *new* path (`risk/CalculateX`).
|
|
146
|
-
2. Detects the `previousCategory` flag.
|
|
147
|
-
3. Deletes the data at the *old* path (`signals/CalculateX`) to prevent orphan data.
|
|
148
|
-
|
|
149
|
-
---
|
|
150
|
-
|
|
151
|
-
## 6. Data Management & Storage
|
|
152
|
-
|
|
153
|
-
### Input Streaming
|
|
154
|
-
To handle large datasets without OOM (Out Of Memory) errors:
|
|
155
|
-
* `StandardExecutor` does not load all users/tickers at once.
|
|
156
|
-
* It utilizes wait-and-stream logic (e.g., batches of 50 ids) to process the `Context`.
|
|
157
|
-
|
|
158
|
-
### Transparent Auto-Sharding
|
|
159
|
-
Firestore has a 1MB document limit.
|
|
160
|
-
* **Write Path**: If a calculation result > 900KB, it is split into `DocID`, `DocID_shard1`, `DocID_shard2`.
|
|
161
|
-
* **Read Path**: The `DependencyFetcher` automatically detects sharding pointers and re-assembles (hydrates) the full object before passing it to `process()`.
|
|
162
|
-
|
|
163
|
-
### Compression Strategy
|
|
164
|
-
* Payloads are inspected before write.
|
|
165
|
-
* If efficient (high entropy text/JSON), Zlib compression is applied.
|
|
166
|
-
* Metadata is tagged `encoding: 'zlib'` so readers know to inflate.
|
|
167
|
-
|
|
168
|
-
---
|
|
169
|
-
|
|
170
|
-
## 7. Quality Assurance & Self-Healing
|
|
171
|
-
|
|
172
|
-
### The Heuristic Validator
|
|
173
|
-
Before saving *any* result, the Executor runs heuristics:
|
|
174
|
-
* **NaN Check**: Are there `NaN` or `Infinity` values in key fields?
|
|
175
|
-
* **Flatline Check**: Is the data variance 0.00 across a large timespan?
|
|
176
|
-
* **Null Density**: Is >50% of the dataset null?
|
|
177
|
-
* **Circuit Breaker**: If heuristics fail, the task throws an error. It is better to fail and alert than to persist corrupted data that pollutes the cache.
|
|
178
|
-
|
|
179
|
-
### Zombie Task Recovery
|
|
180
|
-
* **Lease Mechanism**: When a task starts, it sets a `startedAt` timestamp.
|
|
181
|
-
* **Detection**: The Dispatcher checks for tasks marked `RUNNING` where `startedAt` > 15 minutes ago.
|
|
182
|
-
* **Resolution**: These are assumed crashed (OOM/Timeout). They are reset to `PENDING` (or `FAILED` if retry count exceeded).
|
|
183
|
-
|
|
184
|
-
### Dead Letter Queue (DLQ)
|
|
185
|
-
Tasks that deterministically fail (crash every time) after N retries are moved to a special DLQ status. This prevents the system from getting stuck in an infinite retry loop.
|
|
186
|
-
|
|
187
|
-
---
|
|
188
|
-
|
|
189
|
-
## 8. Developer Workflows
|
|
190
|
-
|
|
191
|
-
### How to Add a New Calculation
|
|
192
|
-
1. Create `calculations/category/MyNewCalc.js`.
|
|
193
|
-
2. Implement `getMetadata()` to define dependencies.
|
|
194
|
-
3. Implement `process(context)`.
|
|
195
|
-
4. Run `npm run build-manifest` to register it in the topology.
|
|
196
|
-
|
|
197
|
-
### How to Force a Global Re-Run
|
|
198
|
-
* Change the `SYSTEM_EPOCH` constant in `system_epoch.js`.
|
|
199
|
-
* This changes the "Global Salt" for all hashes, processing every calculation as "New".
|
|
200
|
-
|
|
201
|
-
### How to Backfill History
|
|
202
|
-
* **Standard Dispatcher**: Good for recent history (last 30 days).
|
|
203
|
-
* **BatchPriceExecutor**: Specialized for massive historical backfills (e.g., 20 years of price data). It bypasses some topology checks for raw speed.
|
|
204
|
-
|
|
205
|
-
### Local Debugging
|
|
206
|
-
Run the orchestrator in "Dry Run" mode:
|
|
207
|
-
```bash
|
|
208
|
-
node scripts/run_orchestrator.js --date=2024-01-01 --dry-run
|
|
209
|
-
```
|
|
210
|
-
This prints the `Analysis Report` (Runnable/Blocked lists) without actually triggering workers.
|