bulltrackers-module 1.0.738 → 1.0.739
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/functions/computation-system-v2/docs/plans.md +588 -0
- package/functions/computation-system-v2/framework/core/Manifest.js +25 -1
- package/functions/computation-system-v2/framework/data/DataFetcher.js +34 -2
- package/functions/computation-system-v2/framework/data/SchemaRegistry.js +31 -7
- package/functions/computation-system-v2/framework/execution/Orchestrator.js +36 -21
- package/functions/computation-system-v2/framework/execution/RemoteTaskRunner.js +74 -13
- package/package.json +1 -1
package/functions/computation-system-v2/docs/plans.md
@@ -0,0 +1,588 @@
# Comprehensive Review & Improvement Suggestions

## 🚨 **CRITICAL ISSUES**

### 1. **Infinite Loop Risk in Conditional Dependencies**
```javascript
// framework/core/Manifest.js - Line ~50
if (config.conditionalDependencies) {
  for (const condDep of config.conditionalDependencies) {
    const shouldLoad = condDep.condition({ date: dateStr, config: this.config });
    if (shouldLoad) {
      // NO CYCLE DETECTION for conditional deps!
      const depStatus = dailyStatus.get(condDep.computation.toLowerCase());
      if (depStatus) {
        depResults[condDep.computation] = await this.stateRepository.getResult(dateStr, condDep.computation);
      }
    }
  }
}
```

**Problem**: Conditional dependencies bypass cycle detection in `Graph.detectCycle()`. If Comp A conditionally depends on Comp B, and Comp B conditionally depends on Comp A, you get an infinite loop.

**Fix**:
```javascript
// In ManifestBuilder._processComputation()
_processComputation(ComputationClass) {
  const config = ComputationClass.getConfig();

  // Merge conditional deps into regular deps for graph analysis
  const allDeps = [...(config.dependencies || [])];
  if (config.conditionalDependencies) {
    allDeps.push(...config.conditionalDependencies.map(cd => cd.computation));
  }

  return {
    // ...
    dependencies: allDeps.map(d => this._normalize(d)),
    conditionalDependencies: config.conditionalDependencies || [],
    // ...
  };
}
```
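Once the conditional dependencies are merged into the regular dependency list, a standard depth-first search over the adjacency map catches an A↔B conditional cycle. The sketch below is only illustrative: the real `Graph.detectCycle()` implementation is not part of this diff, and the `entries` shape is assumed.

```javascript
// Sketch: cycle detection over merged (strict + conditional) dependencies.
// Assumes `entries` is a Map<name, { dependencies: string[] }> as built by ManifestBuilder.
function detectCycle(entries) {
  const VISITING = 1, DONE = 2;
  const state = new Map();

  const visit = (name, path) => {
    if (state.get(name) === DONE) return null;
    if (state.get(name) === VISITING) return [...path, name]; // cycle found
    state.set(name, VISITING);
    for (const dep of entries.get(name)?.dependencies || []) {
      const cycle = visit(dep, [...path, name]);
      if (cycle) return cycle;
    }
    state.set(name, DONE);
    return null;
  };

  for (const name of entries.keys()) {
    const cycle = visit(name, []);
    if (cycle) return cycle; // e.g. ['compa', 'compb', 'compa']
  }
  return null;
}
```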
### 2. **Cost Spiral in Worker Pool Retries**
```javascript
// framework/execution/RemoteTaskRunner.js - Line ~200
async _invokeWorkerWithRetry(payload, attempt = 1) {
  try {
    return await this._invokeWorker(payload);
  } catch (e) {
    if (isRetryable && attempt < this.retries) {
      // NO COST TRACKING ON RETRIES!
      return this._invokeWorkerWithRetry(payload, attempt + 1);
    }
    throw e;
  }
}
```

**Problem**: Failed workers retry without tracking costs. If 1000 entities fail and each retries 2x, you pay 3x the expected cost with no warning.

**Fix**: Add cost multiplier tracking:
```javascript
class RemoteTaskRunner {
  constructor(config, logger) {
    this.costMultiplier = 1.0;
    this.totalRetries = 0;
  }

  async _invokeWorkerWithRetry(payload, attempt = 1) {
    try {
      return await this._invokeWorker(payload);
    } catch (e) {
      if (isRetryable && attempt < this.retries) {
        this.totalRetries++;
        this.costMultiplier = 1 + (this.totalRetries / this.stats.totalInvocations);

        if (this.costMultiplier > 2.0) {
          throw new Error(`🚨 RETRY COST SPIRAL: ${this.totalRetries} retries (${this.costMultiplier.toFixed(1)}x cost). Aborting.`);
        }

        await new Promise(r => setTimeout(r, delay));
        return this._invokeWorkerWithRetry(payload, attempt + 1);
      }
      throw e;
    }
  }
}
```
### 3. **Runaway Query Cost in Historical Computations**
```javascript
// framework/data/DataFetcher.js - Line ~80
async fetchForComputation(requires, targetDate, entities = null) {
  await Promise.all(Object.entries(requires).map(async ([tableName, spec]) => {
    const data = await this.fetch({
      lookback: spec.lookback || 0, // No upper limit!
      // ...
    });
  }));
}
```

**Problem**: A computation with `lookback: 365` on a 10TB table scans 365 days. If you have 100 computations with high lookbacks, costs explode.

**Fix**: Add lookback budget:
```javascript
// In config
execution: {
  maxLookbackDays: 90, // Hard limit
  maxLookbackBudgetGB: 100, // Cumulative budget per run
}

// In DataFetcher
async fetch(options) {
  if (options.lookback > this.config.execution.maxLookbackDays) {
    throw new Error(`Lookback ${options.lookback} exceeds limit of ${this.config.execution.maxLookbackDays} days`);
  }

  // Estimate scan size before executing
  const estimatedGB = await this._estimateScanSize(options);
  this.cumulativeScanGB += estimatedGB;

  if (this.cumulativeScanGB > this.config.execution.maxLookbackBudgetGB) {
    throw new Error(`Query budget exceeded: ${this.cumulativeScanGB}GB / ${this.config.execution.maxLookbackBudgetGB}GB`);
  }

  // Execute...
}
```

---
## ⚡ **PERFORMANCE BOTTLENECKS**

### 4. **N+1 Dependency Fetches**
```javascript
// framework/execution/Orchestrator.js - Line ~450
async _lazyLoadDependency(dateStr, depName, entityId, preloaded) {
  if (preloaded[depName] && entityId) return preloaded[depName][entityId];
  // FETCHES ONE ENTITY AT A TIME!
  return this.stateRepository.getEntityResult(dateStr, depName, entityId);
}
```

**Problem**: If a computation depends on `PopularInvestorProfileMetrics` and processes 10,000 users, this issues 10,000 individual BigQuery queries.

**Fix**: Already partially addressed with `getBatchEntityResults()`, but enforce it:
```javascript
async _executeStreaming(entry, dateStr, depResults, previousResult, options) {
  // ...
  for await (const batch of batchStream) {
    const { entityIds } = batch;

    // FORCE batch prefetch
    const batchDeps = await this._prefetchBatchDependencies(entry, dateStr, depResults, entityIds);

    // Remove lazy load fallback to catch bugs
    await Promise.all(entityIds.map(entityId => limit(async () => {
      const context = {
        getDependency: (depName, targetId) => {
          const id = targetId || entityId;
          if (!batchDeps[depName]?.has(id)) {
            throw new Error(`LAZY LOAD DETECTED: ${depName}:${id}. This is a bug - dependencies must be prefetched.`);
          }
          return batchDeps[depName].get(id);
        },
        // ...
      };
    })));
  }
}
```
### 5. **Redundant Schema Fetches**
```javascript
// framework/data/SchemaRegistry.js - Line ~70
async getSchema(tableName) {
  const cached = this.cache.get(tableName);
  if (cached && !this._isExpired(cached)) {
    return cached; // Good!
  }
  // But if 100 concurrent requests arrive during cold start...
  return await this._fetchAndCacheSchema(tableName); // 100 concurrent fetches!
}
```

**Fix**: Coalesce concurrent requests:
```javascript
constructor() {
  this.cache = new Map();
  this.pendingFetches = new Map(); // NEW
}

async getSchema(tableName) {
  const cached = this.cache.get(tableName);
  if (cached && !this._isExpired(cached)) return cached;

  // Coalesce concurrent requests
  if (this.pendingFetches.has(tableName)) {
    return this.pendingFetches.get(tableName);
  }

  const fetchPromise = this._fetchAndCacheSchema(tableName);
  this.pendingFetches.set(tableName, fetchPromise);

  try {
    return await fetchPromise;
  } finally {
    this.pendingFetches.delete(tableName);
  }
}
```
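A quick way to confirm the coalescing behaves as intended is to fire many concurrent `getSchema()` calls for the same table during a cold start and check that they all resolve to the same fetch. This is a usage sketch only; the `config` shape and table name are placeholders:

```javascript
// Sketch: 100 concurrent cold-start callers share a single in-flight fetch.
const registry = new SchemaRegistry(config);

const schemas = await Promise.all(
  Array.from({ length: 100 }, () => registry.getSchema('user_positions'))
);

// Only the first call reaches _fetchAndCacheSchema(); the other 99 await the same promise,
// so every caller receives the same schema object.
console.log(schemas.every(s => s === schemas[0])); // true
```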
### 6. **Inefficient Batch Splitting**
```javascript
// framework/data/DataFetcher.js - Line ~200
async *fetchBatched(options, batchSize = 1000) {
  let batch = [];
  let currentEntity = null;

  for await (const row of rowStream) {
    if (entityField) {
      const rowEntity = String(row[entityField]);

      // Splits batch even if at 999 rows (wastes 1 row)
      if (batch.length >= batchSize && rowEntity !== currentEntity && currentEntity !== null) {
        yield this._transform(batch, { lookback, dateField, entityField, dataField });
        batch = [];
      }
      currentEntity = rowEntity;
    }
    batch.push(row);
  }
}
```

**Problem**: If an entity has 5,000 rows and `batchSize=1000`, you get 5 batches for one entity. If processing is slow, this serializes work.

**Fix**: Add max batch size override:
```javascript
async *fetchBatched(options, batchSize = 1000) {
  const maxBatchSize = batchSize * 10; // Allow up to 10x for large entities

  for await (const row of rowStream) {
    if (entityField) {
      const rowEntity = String(row[entityField]);

      // Force split if exceeding max
      if (batch.length >= maxBatchSize) {
        yield this._transform(batch, ...);
        batch = [];
        currentEntity = null;
      }

      // Normal split logic
      if (batch.length >= batchSize && rowEntity !== currentEntity && currentEntity !== null) {
        yield this._transform(batch, ...);
        batch = [];
      }
      currentEntity = rowEntity;
    }
    batch.push(row);
  }
}
```

---
## 🔒 **RELIABILITY ISSUES**

### 7. **Zombie Detection Race Condition**
```javascript
// handlers/scheduler.js - Line ~80
let zombies = [];
try {
  zombies = await storageManager.findZombies(ZOMBIE_THRESHOLD_MINUTES);
  if (zombies.length > 0) {
    console.log(`DETECTED ${zombies.length} ZOMBIES`);
  }
} catch (e) {
  console.error(`Zombie check failed: ${e.message}`);
}

// Later: dispatch zombies
const zombieEntries = zombies.map(z => {...});
```

**Problem**: If `findZombies()` throws, scheduler continues but `zombieEntries` is empty. Zombies never recover.

**Fix**: Make zombie detection non-blocking:
```javascript
async function schedulerHandler(req, res) {
  // 1. Find due computations (CRITICAL PATH)
  const dueComputations = findDueComputations(now);

  // 2. Zombie detection (BEST EFFORT)
  let zombies = [];
  try {
    zombies = await Promise.race([
      storageManager.findZombies(ZOMBIE_THRESHOLD_MINUTES),
      new Promise((_, reject) => setTimeout(() => reject(new Error('Timeout')), 5000))
    ]);
  } catch (e) {
    // Log but don't block scheduler
    console.warn(`⚠️ Zombie detection failed (non-fatal): ${e.message}`);
  }

  // Continue with due + zombies...
}
```
### 8. **Silent Firestore Write Failures**
```javascript
// framework/storage/StorageManager.js - Line ~650
async _writeToFirestore(dateStr, entry, results, firestoreConfig) {
  try {
    // ... batching logic ...
    return Promise.all(batches.map(b => limit(() => b.commit()))).then(() => ({ docCount }));
  } catch (error) {
    if (error.message.includes('Could not load the default credentials')) {
      console.log(' Skipped: No GCP credentials');
      return 'skipped';
    }
    console.log(` Error: ${error.message}`);
    return false; // Returns false but doesn't throw!
  }
}
```

**Problem**: If Firestore writes fail, `commitResults()` returns `{ firestore: false }` but execution continues. Users think data is saved.

**Fix**:
```javascript
async commitResults(dateStr, entry, results, depResultHashes = {}) {
  const storageConfig = this._resolveStorageConfig(entry);

  const firestoreTask = async () => {
    if (!storageConfig.firestore?.enabled) return null;
    const result = await this._writeToFirestore(...);

    // ENFORCE: If enabled, must succeed
    if (result === false || result === 'skipped') {
      throw new Error(`Firestore write failed for ${entry.name} but was configured as enabled`);
    }
    return result;
  };

  const [bigqueryResult, firestoreResult] = await Promise.all([gcsTask(), firestoreTask()]);
  return { bigquery: bigqueryResult, firestore: firestoreResult };
}
```

---
## 🎯 **NEW FEATURE SUGGESTIONS**

### 9. **Adaptive Batch Sizing**
```javascript
// NEW: framework/execution/AdaptiveBatcher.js
class AdaptiveBatcher {
  constructor(initialSize = 1000) {
    this.currentSize = initialSize;
    this.performanceWindow = [];
  }

  recordBatch(entityCount, durationMs) {
    const throughput = entityCount / (durationMs / 1000); // entities/sec
    this.performanceWindow.push({ size: entityCount, throughput });

    if (this.performanceWindow.length > 10) {
      this.performanceWindow.shift();
    }

    // Adjust batch size based on throughput trend
    const avgThroughput = this.performanceWindow.reduce((sum, p) => sum + p.throughput, 0) / this.performanceWindow.length;

    if (avgThroughput < 10) {
      // Slow processing - reduce batch size for parallelism
      this.currentSize = Math.max(100, this.currentSize * 0.8);
    } else if (avgThroughput > 100) {
      // Fast processing - increase batch size
      this.currentSize = Math.min(10000, this.currentSize * 1.2);
    }
  }

  getSize() {
    return Math.round(this.currentSize);
  }
}

// In Orchestrator
this.adaptiveBatcher = new AdaptiveBatcher(config.execution.batchSize);

for await (const batch of batchStream) {
  const startTime = Date.now();
  // ... process batch ...
  this.adaptiveBatcher.recordBatch(batch.entityIds.length, Date.now() - startTime);

  // Use adaptive size for next batch
  batchSize = this.adaptiveBatcher.getSize();
}
```
### 10. **Query Plan Caching**
```javascript
// NEW: framework/data/QueryPlanCache.js
class QueryPlanCache {
  constructor(bigquery) {
    this.cache = new Map();
    this.bigquery = bigquery;
  }

  async getEstimate(query) {
    const queryHash = crypto.createHash('md5').update(query.sql).digest('hex');

    if (this.cache.has(queryHash)) {
      return this.cache.get(queryHash);
    }

    // Use BigQuery dry run for cost estimation
    const [job] = await this.bigquery.createQueryJob({
      query: query.sql,
      params: query.params,
      dryRun: true
    });

    const [metadata] = await job.getMetadata();
    const estimate = {
      bytesProcessed: parseInt(metadata.statistics.totalBytesProcessed || 0, 10),
      cacheHit: metadata.statistics.query.cacheHit,
      estimatedCost: (parseInt(metadata.statistics.totalBytesProcessed || 0, 10) / 1099511627776) * 5
    };

    this.cache.set(queryHash, estimate);
    return estimate;
  }
}

// In DataFetcher
async _execute(query) {
  const estimate = await this.queryPlanCache.getEstimate(query);

  if (estimate.estimatedCost > 1.0) { // $1+ queries
    this._log('WARN', `Expensive query detected: $${estimate.estimatedCost.toFixed(2)} - ${query.sql.substring(0, 100)}`);
  }

  // Execute as normal...
}
```
### 11. **Circuit Breaker for Failed Computations**
```javascript
// NEW: framework/resilience/CircuitBreaker.js
class CircuitBreaker {
  constructor(failureThreshold = 5, resetTimeout = 60000) {
    this.failures = new Map();
    this.failureThreshold = failureThreshold;
    this.resetTimeout = resetTimeout;
  }

  recordFailure(computationName) {
    const state = this.failures.get(computationName) || { count: 0, lastFailure: null };
    state.count++;
    state.lastFailure = Date.now();
    this.failures.set(computationName, state);

    if (state.count >= this.failureThreshold) {
      console.warn(`🔴 Circuit breaker OPEN for ${computationName} (${state.count} failures)`);
    }
  }

  recordSuccess(computationName) {
    this.failures.delete(computationName);
  }

  shouldSkip(computationName) {
    const state = this.failures.get(computationName);
    if (!state || state.count < this.failureThreshold) return false;

    // Auto-reset after timeout
    if (Date.now() - state.lastFailure > this.resetTimeout) {
      console.log(`🟡 Circuit breaker HALF-OPEN for ${computationName} (attempting reset)`);
      this.failures.delete(computationName);
      return false;
    }

    return true;
  }
}

// In Orchestrator._executeComputation
if (this.circuitBreaker.shouldSkip(entry.name)) {
  return { name: entry.name, status: 'circuit_open', reason: 'Too many consecutive failures' };
}

try {
  const result = await this._executeGlobal(...);
  this.circuitBreaker.recordSuccess(entry.name);
  return result;
} catch (e) {
  this.circuitBreaker.recordFailure(entry.name);
  throw e;
}
```

---
## 📊 **MONITORING ENHANCEMENTS**

### 12. **Real-Time Cost Dashboard**
```javascript
// NEW: Create a separate Cloud Function that queries cost tables
exports.getCostDashboard = async (req, res) => {
  const { startDate, endDate } = req.query;

  const query = `
    SELECT
      computation_name,
      SUM(estimated_cost_usd) as total_cost,
      COUNT(*) as runs,
      AVG(estimated_cost_usd) as avg_cost,
      MAX(estimated_cost_usd) as max_cost
    FROM \`${PROJECT}.${DATASET}.computation_costs\`
    WHERE date BETWEEN @start AND @end
    GROUP BY computation_name
    ORDER BY total_cost DESC
  `;

  const [rows] = await bigquery.query({ query, params: { start: startDate, end: endDate } });

  res.json({
    period: { start: startDate, end: endDate },
    totalCost: rows.reduce((sum, r) => sum + r.total_cost, 0),
    computations: rows,
    alert: rows.some(r => r.total_cost > 10) ? 'HIGH_COST_DETECTED' : null
  });
};
```

---
## 🎛️ **CONFIGURATION IMPROVEMENTS**

### 13. **Cost-Aware Scheduling**
```json
// In config/bulltrackers.config.js
{
  "computations": [
    {
      "class": "PopularInvestorRiskMetrics",
      "schedule": {
        "frequency": "daily",
        "time": "02:00",
        "costBudget": 5.0, // NEW: Skip if daily cost exceeded
        "priority": 1 // NEW: Lower priority runs only if budget allows
      }
    }
  ],
  "execution": {
    "dailyCostBudget": 50, // Global budget
    "costCheckInterval": 3600000 // Re-check every hour
  }
}
```
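The config above only declares budgets; the scheduler still has to enforce them. A minimal sketch of what that enforcement could look like (the `costRepository` helper and its `getSpendToday()` method are hypothetical, not part of the current codebase):

```javascript
// Sketch: filter due computations against the declared cost budgets before dispatching.
async function filterByCostBudget(dueComputations, executionConfig, costRepository) {
  // Assumed helper: sums estimated_cost_usd recorded so far today.
  const spentToday = await costRepository.getSpendToday();
  const remaining = executionConfig.dailyCostBudget - spentToday;

  return dueComputations.filter(entry => {
    const schedule = entry.schedule || {};

    // Global budget exhausted: run nothing.
    if (remaining <= 0) return false;

    // Per-computation cap: skip if its own budget no longer fits.
    if (schedule.costBudget && schedule.costBudget > remaining) return false;

    // Lower-priority work only runs while at least half the daily budget remains.
    if ((schedule.priority || 1) > 1 && remaining < executionConfig.dailyCostBudget / 2) return false;

    return true;
  });
}
```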
---

## Summary of Critical Fixes Needed

| Issue | Severity | Fix Complexity | Impact |
|-------|----------|----------------|--------|
| Conditional dependency cycles | 🔴 CRITICAL | Low | Infinite loops |
| Worker pool retry cost spiral | 🔴 CRITICAL | Medium | Cost overruns |
| Unbounded lookback queries | 🔴 CRITICAL | Low | Query cost explosion |
| N+1 dependency fetches | 🟡 HIGH | Medium | 100x slower execution |
| Schema fetch stampede | 🟡 HIGH | Low | Cold start failures |
| Silent Firestore failures | 🟡 HIGH | Low | Data loss |
| Zombie recovery failures | 🟡 HIGH | Low | Stuck computations |

**Immediate Actions:**
1. Add conditional dependency cycle detection (1 hour fix)
2. Add retry cost tracking in RemoteTaskRunner (2 hours)
3. Add lookback budget limits (1 hour)
4. Enforce batch dependency prefetching (already partially done, needs enforcement - 2 hours)
5. Add schema fetch coalescing (1 hour)
package/functions/computation-system-v2/framework/core/Manifest.js
@@ -38,7 +38,18 @@ class ManifestBuilder {
       const entry = this._processComputation(ComputationClass);
       if (entry) {
         manifestMap.set(entry.name, entry);
-
+
+        // CRITICAL FIX: Include conditional dependencies in the DAG for cycle detection and topological sort.
+        // Even if the dependency is conditional at runtime, the execution order (Pass) must respect it.
+        const graphDeps = [...entry.dependencies];
+        if (entry.conditionalDependencies) {
+          entry.conditionalDependencies.forEach(cd => {
+            // Ensure we use the normalized name for the graph
+            graphDeps.push(cd.computation);
+          });
+        }
+
+        adjacency.set(entry.name, graphDeps);
       }
     }

@@ -104,6 +115,13 @@ class ManifestBuilder {
     compositeHash += `|RULE:${mod}:${h}`;
   }

+  // Normalize conditional dependencies if they exist
+  // This ensures the Orchestrator can look them up by normalized name later
+  const conditionalDependencies = (config.conditionalDependencies || []).map(cd => ({
+    ...cd,
+    computation: this._normalize(cd.computation)
+  }));
+
   return {
     name,
     originalName: config.name,
@@ -112,6 +130,7 @@ class ManifestBuilder {
     type: config.type || 'global',
     requires: config.requires || {},
     dependencies: (config.dependencies || []).map(d => this._normalize(d)),
+    conditionalDependencies, // FIX: Pass this through to the manifest entry
     isHistorical: config.isHistorical || false,
     isTest: config.isTest || false,
     schedule: this.scheduleValidator.parseSchedule(config.schedule),
@@ -133,6 +152,7 @@ class ManifestBuilder {
   _computeFinalHashes(sorted, manifestMap) {
     for (const entry of sorted) {
       let hashInput = entry.hash;
+      // Includes strict dependencies in the hash chain
       if (entry.dependencies.length > 0) {
         const depHashes = entry.dependencies.sort().map(d => {
           const h = manifestMap.get(d)?.hash;
@@ -141,6 +161,10 @@ class ManifestBuilder {
         });
         hashInput += `|DEPS:${depHashes.join('|')}`;
       }
+      // Note: Conditional dependencies are currently excluded from the hash chain
+      // because they might not be loaded. If strict versioning is required for them,
+      // they should be added here too.
+
       entry.hash = this._hashCode(hashInput);
     }
   }
package/functions/computation-system-v2/framework/data/DataFetcher.js
@@ -6,10 +6,18 @@
  * * V2.3 FIX: "Insufficient History" bug.
  *   - fetchBatched now orders by Entity ID to keep historical rows together.
  *   - Implemented "Entity-Atomic Batching" to prevent splitting a user's history across batches.
+ * * V2.4 FIX: Runaway Query Cost Prevention [Fix #3].
+ * * V2.5 UPDATE: Super-Entity Monitoring [Safety Valve for Fix #6].
+ *   - Warns if a single entity exceeds reasonable batch limits (Memory Risk).
  */

 const { BigQuery } = require('@google-cloud/bigquery');

+// FIX #3: Hard limit to prevent cost spirals
+const MAX_LOOKBACK_DAYS = 30;
+// FIX #6 (Alternative): Warn if an entity is massive (e.g. > 5x batch size)
+const BATCH_GROWTH_WARNING_THRESHOLD = 5;
+
 class DataFetcher {
   constructor(config, queryBuilder, logger = null) {
     this.projectId = config.projectId;
@@ -151,6 +159,12 @@ class DataFetcher {

   async fetch(options) {
     const { table, targetDate, lookback = 0, filter = {}, fields = null, entities = null } = options;
+
+    // FIX #3: Prevent Runaway Costs
+    if (lookback > MAX_LOOKBACK_DAYS) {
+      throw new Error(`[DataFetcher] COST GUARD: Lookback of ${lookback} days exceeds limit of ${MAX_LOOKBACK_DAYS}. Table: ${table}`);
+    }
+
     const tableConfig = this.tables[table] || {};
     const { dateField, entityField, dataField } = tableConfig;

@@ -168,6 +182,12 @@ class DataFetcher {

   async *fetchBatched(options, batchSize = 1000) {
     const { table, targetDate, lookback = 0, filter = {}, fields = null, entities = null } = options;
+
+    // FIX #3: Prevent Runaway Costs
+    if (lookback > MAX_LOOKBACK_DAYS) {
+      throw new Error(`[DataFetcher] COST GUARD: Lookback of ${lookback} days exceeds limit of ${MAX_LOOKBACK_DAYS}. Table: ${table}`);
+    }
+
     const tableConfig = this.tables[table] || {};
     const { dateField, entityField, dataField } = tableConfig;

@@ -181,6 +201,7 @@ class DataFetcher {

     let batch = [];
     let currentEntity = null;
+    let batchHasWarned = false; // Flag to prevent log spam for a single massive batch

     for await (const row of rowStream) {
       // FIX #2: Entity-Atomic Batching
@@ -188,12 +209,23 @@ class DataFetcher {
       if (entityField) {
         const rowEntity = String(row[entityField]);

-        //
-        //
+        // Check if we should yield
+        // Condition: Batch is full AND we are on a NEW entity
         if (batch.length >= batchSize && rowEntity !== currentEntity && currentEntity !== null) {
           yield this._transform(batch, { lookback, dateField, entityField, dataField });
           batch = [];
+          batchHasWarned = false;
         }
+
+        // SAFETY VALVE (Fix #6 Alternative):
+        // If batch grows huge (Super Entity) and we CANNOT split (same entity), warn the admin.
+        if (batch.length > batchSize * BATCH_GROWTH_WARNING_THRESHOLD && !batchHasWarned) {
+          this._log('WARN', `SUPER ENTITY DETECTED: Entity '${currentEntity}' in table '${table}' has >${batch.length} rows. ` +
+            `This exceeds batch size ${batchSize} by ${BATCH_GROWTH_WARNING_THRESHOLD}x. ` +
+            `Risk of OOM or Timeouts. Consider filtering this entity.`);
+          batchHasWarned = true;
+        }
+
         currentEntity = rowEntity;
       } else {
         // Fallback for non-entity tables (strict count)
package/functions/computation-system-v2/framework/data/SchemaRegistry.js
@@ -1,12 +1,12 @@
 /**
  * @fileoverview Schema Registry - Dynamic schema discovery with caching
- *
- * Core innovation of v2: No hardcoded schemas. Instead, we:
+ * * Core innovation of v2: No hardcoded schemas. Instead, we:
  * 1. Fetch schemas from BigQuery INFORMATION_SCHEMA on first access
  * 2. Cache them with configurable TTL
  * 3. Validate all queries against cached schemas BEFORE sending to BigQuery
- *
- *
+ * * This prevents runtime query failures and eliminates schema maintenance burden.
+ * * UPDATE: Implemented Request Coalescing (Fix #5) to prevent "Thundering Herd"
+ *   on startup or cache expiry.
  */

 const { BigQuery } = require('@google-cloud/bigquery');
@@ -45,18 +45,21 @@ class SchemaRegistry {

     this.client = new BigQuery({ projectId: this.projectId });
     this.cache = new Map();
+    this.pendingFetches = new Map(); // FIX: Track in-flight requests

     // Track schema fetch stats for monitoring
     this.stats = {
       hits: 0,
       misses: 0,
       refreshes: 0,
-      errors: 0
+      errors: 0,
+      coalesced: 0 // New metric
     };
   }

   /**
    * Get schema for a table, fetching from BigQuery if not cached.
+   * Implements Request Coalescing to handle concurrent access.
    * @param {string} tableName - Table name (without dataset prefix)
    * @returns {Promise<TableSchema>}
    */
@@ -67,6 +70,13 @@ class SchemaRegistry {
       this.stats.hits++;
       return cached;
     }
+
+    // FIX: Check for pending fetch (Request Coalescing)
+    if (this.pendingFetches.has(tableName)) {
+      this.stats.coalesced++;
+      // this._log('DEBUG', `Coalescing request for ${tableName}`);
+      return this.pendingFetches.get(tableName);
+    }

     if (cached) {
       this.stats.refreshes++;
@@ -76,7 +86,16 @@ class SchemaRegistry {
       this._log('DEBUG', `Schema cache miss for ${tableName}, fetching...`);
     }

-
+    // Create the promise and store it
+    const fetchPromise = this._fetchAndCacheSchema(tableName);
+    this.pendingFetches.set(tableName, fetchPromise);
+
+    try {
+      return await fetchPromise;
+    } finally {
+      // Always clean up pending map, success or failure
+      this.pendingFetches.delete(tableName);
+    }
   }

   /**
@@ -158,6 +177,8 @@ class SchemaRegistry {
   async warmCache(tableNames) {
     const results = { success: [], failed: [] };

+    // With request coalescing, we can just map and wait.
+    // Simultaneous calls for the same table will automatically merge.
     await Promise.all(tableNames.map(async (tableName) => {
       try {
         await this.getSchema(tableName);
@@ -178,9 +199,11 @@ class SchemaRegistry {
   clearCache(tableName = null) {
     if (tableName) {
       this.cache.delete(tableName);
+      this.pendingFetches.delete(tableName); // Also clear pending if forced
       this._log('DEBUG', `Cleared schema cache for ${tableName}`);
     } else {
       this.cache.clear();
+      this.pendingFetches.clear();
       this._log('DEBUG', 'Cleared entire schema cache');
     }
   }
@@ -193,6 +216,7 @@ class SchemaRegistry {
     return {
       ...this.stats,
       cachedTables: this.cache.size,
+      pendingRequests: this.pendingFetches.size,
       cacheContents: Array.from(this.cache.keys())
     };
   }
@@ -284,4 +308,4 @@ class SchemaRegistry {
   }
 }

-module.exports = { SchemaRegistry };
+module.exports = { SchemaRegistry };
package/functions/computation-system-v2/framework/execution/Orchestrator.js
@@ -9,6 +9,8 @@
  * * * UPDATE: Includes Global vs Batch Data Split to fix "Identity Crisis".
  * * * UPDATE: Implemented FORCE logic to bypass "up-to-date" checks for testing.
  * * * UPDATE: Aggregates performance reporting to prevent log spam.
+ * * * FIX: Resolved N+1 Dependency Fetching (Strict Mode in Streaming).
+ * * * FIX: Added missing 'skipped' property to return types for type safety.
  */

 const crypto = require('crypto');
@@ -316,21 +318,41 @@ class Orchestrator {

       const { data: batchLocalData, entityIds } = batch;
       const combinedData = { ...batchLocalData, ...globalData };
+
+      // STRICT FIX: Prefetch dependencies for the batch.
       const batchDeps = await this._prefetchBatchDependencies(entry, dateStr, depResults, entityIds);
+
       const { rules } = this.ruleInjector.createContext();
       const batchResults = {};

       await Promise.all(entityIds.map(entityId => limit(async () => {
         const instance = new entry.class();
         const entityData = this._filterDataForEntity(combinedData, entityId, driverEntityField);
+
         const context = {
           computation: entry, date: dateStr, entityId, data: entityData,
+
+           // STRICT FIX: No fallback to _lazyLoadDependency.
           getDependency: (depName, targetId) => {
-
-
+            const id = targetId || entityId;
+
+            // 1. Look in Batch-Prefetched Dependencies (Priority)
+            if (batchDeps[depName] && batchDeps[depName].has(id)) {
+              return batchDeps[depName].get(id);
+            }
+
+            // 2. Look in Global/Preloaded Dependencies
+            if (depResults[depName]) {
+              if (depResults[depName][id] !== undefined) return depResults[depName][id];
             }
-
+
+            // 3. STRICT MODE: Throw Error
+            throw new Error(
+              `[Strict Dependency] Dependency '${depName}' (ID: ${id}) not found in batch context. ` +
+              `Ensure '${depName}' is listed in ${entry.name}.getConfig().dependencies.`
+            );
           },
+
           previousResult, rules, references: this.referenceDataCache,
           config: this.config, dataFetcher: this.dataFetcher
         };
@@ -357,22 +379,18 @@ class Orchestrator {
       if (cp) await checkpointer.complete(dateStr, entry.name, cp.id);
     }

-
+    // FIX: Return valid object shape including skipped: false
+    return { count: totalCount, hash: rollingHash.digest('hex').substring(0, 16), skipped: false };
   }

   /**
    * Determine if a computation should use remote workers
-   *
-   * @param {Object} entry - Manifest entry
+   * * @param {Object} entry - Manifest entry
    * @param {Object} options - Execution options
    * @param {boolean} [options.useWorkerPool] - Runtime override (true/false/undefined)
    * @param {boolean} [options.forceLocal] - Force local execution
    */
   _shouldUseRemoteWorkers(entry, options) {
-    // Runtime override takes precedence (for admin testing)
-    // useWorkerPool: true -> force use worker pool
-    // useWorkerPool: false -> force local execution
-    // useWorkerPool: undefined -> use config
     if (options.useWorkerPool === true) {
       if (!this.remoteRunner) {
         this._log('WARN', 'useWorkerPool=true but remoteRunner not initialized');
@@ -384,30 +402,22 @@ class Orchestrator {
       return false;
     }

-    // No remote runner configured
     if (!this.remoteRunner) return false;
-
-    // Force local execution via options
     if (options.forceLocal) return false;

     const poolConfig = this.config.workerPool || {};

-    // Exclusion list
     if (poolConfig.excludeComputations?.includes(entry.name) ||
         poolConfig.excludeComputations?.includes(entry.originalName)) {
       return false;
     }

-    // Force list (override threshold)
     if (poolConfig.forceOffloadComputations?.includes(entry.name) ||
         poolConfig.forceOffloadComputations?.includes(entry.originalName)) {
       return true;
     }

-    // Only per-entity computations can be offloaded
     if (entry.type !== 'per-entity') return false;
-
-    // Default: use remote if worker pool is enabled
     return true;
   }

@@ -492,7 +502,6 @@ class Orchestrator {
       this._log('WARN', `[Remote] Batch ${batchIndex}: ${errors.length} entities failed`);
       totalErrors += errors.length;

-      // Log first few errors for debugging
       errors.slice(0, 3).forEach(e => {
         this._log('DEBUG', ` - ${e.entityId}: ${e.error}`);
       });
@@ -524,7 +533,8 @@ class Orchestrator {
       this._log('WARN', `[Remote] Completed with ${totalErrors} total errors out of ${totalCount + totalErrors} entities`);
     }

-
+    // FIX: Return valid object shape including skipped: false
+    return { count: totalCount, hash: rollingHash.digest('hex').substring(0, 16), skipped: false };
   }

   async _executeGlobal(entry, dateStr, depResults, previousResult, options, forceEntities) {
@@ -569,7 +579,8 @@ class Orchestrator {
       await this.storageManager.finalizeResults(dateStr, entry);
     }

-
+    // FIX: Return valid object shape including skipped: false
+    return { count: Object.keys(results || {}).length, hash: finalHash, skipped: false };
   }

   _printExecutionSummary(summary) {
@@ -668,6 +679,10 @@ class Orchestrator {
   async _lazyLoadDependency(dateStr, depName, entityId, preloaded) {
     if (preloaded[depName] && !entityId) return preloaded[depName];
     if (preloaded[depName] && entityId) return preloaded[depName][entityId];
+
+    // WARN: This is the slow path that we removed from Streaming
+    this._log('WARN', `LAZY LOAD: Fetching single entity '${entityId}' for '${depName}'. This is slow.`);
+
     if (entityId) return this.stateRepository.getEntityResult(dateStr, depName, entityId);
     return this.stateRepository.getResult(dateStr, depName);
   }
package/functions/computation-system-v2/framework/execution/RemoteTaskRunner.js
@@ -1,20 +1,18 @@
 /**
  * @fileoverview Remote Task Runner (Serverless Worker Pool Client)
- *
- * RESPONSIBILITIES:
+ * * RESPONSIBILITIES:
  * 1. Package entity data and context into GCS files
  * 2. Invoke remote worker functions in parallel
  * 3. Collect results and errors
  * 4. Handle retries for transient failures
- *
- * DATA FLOW:
+ * * DATA FLOW:
  * Orchestrator calls runBatch() -> Upload to GCS -> Invoke Workers -> Collect Results
- *
- * DESIGN PRINCIPLES:
+ * * DESIGN PRINCIPLES:
  * - Workers are stateless - all context is passed via GCS
  * - High parallelism - hundreds of concurrent invocations
  * - Fault isolation - one entity failure doesn't affect others
  * - Cost efficient - workers scale to zero between runs
+ * - RESILIENCE: Implements Circuit Breaker to prevent Retry Cost Spirals [Fix #2]
  */

 const { Storage } = require('@google-cloud/storage');
@@ -36,6 +34,13 @@ class RemoteTaskRunner {
     this.timeout = poolConfig.timeout || 60000; // 60s default
     this.retries = poolConfig.retries || 2;

+    // Circuit Breaker Config [Fix #2]
+    this.cbConfig = {
+      minInvocations: 20, // Minimum calls before checking rate
+      failureThreshold: 0.30, // Trip if failure rate > 30%
+      ...poolConfig.circuitBreaker
+    };
+
     // Local mode for testing
     this.localMode = poolConfig.localMode || process.env.WORKER_LOCAL_MODE === 'true';

@@ -53,8 +58,7 @@ class RemoteTaskRunner {

   /**
    * Execute a batch of entities remotely (or locally for testing)
-   *
-   * @param {Object} entry - Manifest entry for the computation
+   * * @param {Object} entry - Manifest entry for the computation
    * @param {string} dateStr - Target date (YYYY-MM-DD)
    * @param {Object} baseContext - Shared context (references, config)
    * @param {string[]} entityIds - Entity IDs to process
@@ -126,11 +130,21 @@ class RemoteTaskRunner {
     const errors = [];
     const uploadedPaths = [];

+    // Circuit Breaker Stats (scoped to this batch)
+    const batchStats = {
+      invocations: 0,
+      failures: 0,
+      tripped: false
+    };
+
     // Phase 1: Upload context packages to GCS
     this._log('INFO', 'Uploading context packages to GCS...');
     const uploadStart = Date.now();

     const uploadTasks = entityIds.map(entityId => uploadLimit(async () => {
+      // Check tripped status early to save uploads if massive failure occurring
+      if (batchStats.tripped) return;
+
       const contextPackage = this._buildContextPackage(
         entry,
         entityId,
@@ -158,13 +172,19 @@ class RemoteTaskRunner {

     const invokeTasks = uploadedPaths.map(({ entityId, path }) =>
       invokeLimit(async () => {
+        // FAIL FAST: If circuit tripped, do not invoke worker
+        if (batchStats.tripped) {
+          errors.push({ entityId, error: 'Skipped: Circuit Breaker Tripped due to high failure rate' });
+          return;
+        }
+
         try {
           const response = await this._invokeWorkerWithRetry({
             computationName: entry.originalName || entry.name,
             entityId,
             date: dateStr,
             dataUri: { bucket: this.bucketName, path }
-          });
+          }, 1, batchStats); // Pass stats object to retry logic

           if (response.status === 'success' && response.result !== null) {
             results[entityId] = response.result;
@@ -174,12 +194,18 @@ class RemoteTaskRunner {
           // status === 'success' with result === null means skipped (filtered out)

         } catch (e) {
+          // Circuit Breaker errors are thrown here
           errors.push({ entityId, error: e.message });
         }
       })
     );

     await Promise.all(invokeTasks);
+
+    if (batchStats.tripped) {
+      this._log('ERROR', `Batch ABORTED by Circuit Breaker. Stats: ${batchStats.failures} failures / ${batchStats.invocations} invocations.`);
+    }
+
     this._log('INFO', `Invocations complete in ${Date.now() - invokeStart}ms`);

     // Phase 3: Cleanup GCS (fire and forget)
@@ -237,12 +263,21 @@ class RemoteTaskRunner {
   }

   /**
-   * Invoke a worker with retry logic
+   * Invoke a worker with retry logic and Circuit Breaker
    */
-  async _invokeWorkerWithRetry(payload, attempt = 1) {
+  async _invokeWorkerWithRetry(payload, attempt = 1, stats = null) {
+    // Track Invocation (Cost)
+    if (stats) stats.invocations++;
+
     try {
       return await this._invokeWorker(payload);
     } catch (e) {
+      // Track Failure
+      if (stats) {
+        stats.failures++;
+        this._checkCircuitBreaker(stats);
+      }
+
       const isRetryable = this._isRetryableError(e);

       if (isRetryable && attempt < this.retries) {
@@ -250,14 +285,40 @@ class RemoteTaskRunner {
         const delay = Math.min(1000 * Math.pow(2, attempt - 1), 10000);
         await new Promise(r => setTimeout(r, delay));

+        // Re-check circuit before retrying (another thread might have tripped it)
+        if (stats) this._checkCircuitBreaker(stats);
+
         this._log('DEBUG', `Retrying ${payload.entityId} (attempt ${attempt + 1})`);
-        return this._invokeWorkerWithRetry(payload, attempt + 1);
+        return this._invokeWorkerWithRetry(payload, attempt + 1, stats);
       }

       throw e;
     }
   }

+  /**
+   * Check circuit breaker status and throw if tripped
+   */
+  _checkCircuitBreaker(stats) {
+    if (stats.tripped) {
+      throw new Error('Circuit Breaker: Batch aborted due to high failure rate');
+    }
+
+    // Only check after minimum invocations (warmup)
+    if (stats.invocations >= this.cbConfig.minInvocations) {
+      const failureRate = stats.failures / stats.invocations;
+
+      if (failureRate > this.cbConfig.failureThreshold) {
+        stats.tripped = true;
+        const msg = `🚨 CIRCUIT BREAKER TRIPPED! Failure rate ${(failureRate * 100).toFixed(1)}% ` +
+          `(${stats.failures}/${stats.invocations}) exceeds threshold of ${(this.cbConfig.failureThreshold * 100)}%`;
+
+        this._log('ERROR', msg);
+        throw new Error(msg);
+      }
+    }
+  }
+
   /**
    * Invoke a single worker via HTTP
    */
@@ -324,4 +385,4 @@ class RemoteTaskRunner {
   }
 }

-module.exports = { RemoteTaskRunner };
+module.exports = { RemoteTaskRunner };