bulltrackers-module 1.0.737 → 1.0.739
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/functions/computation-system-v2/docs/admin.md +91 -0
- package/functions/computation-system-v2/docs/plans.md +588 -0
- package/functions/computation-system-v2/framework/core/Manifest.js +25 -1
- package/functions/computation-system-v2/framework/data/DataFetcher.js +34 -2
- package/functions/computation-system-v2/framework/data/SchemaRegistry.js +31 -7
- package/functions/computation-system-v2/framework/execution/Orchestrator.js +50 -15
- package/functions/computation-system-v2/framework/execution/RemoteTaskRunner.js +74 -13
- package/functions/computation-system-v2/handlers/adminTest.js +327 -0
- package/functions/computation-system-v2/handlers/index.js +4 -0
- package/functions/computation-system-v2/index.js +15 -1
- package/package.json +1 -1
@@ -0,0 +1,91 @@
# Admin Test Endpoint

## Deploy

```bash
node deploy.mjs ComputeAdminTest
```

## Usage Examples

### 1. Check System Status

```bash
TOKEN=$(gcloud auth print-identity-token --audiences="https://europe-west1-stocks-12345.cloudfunctions.net/compute-admin-test")

curl -X POST \
  "https://europe-west1-stocks-12345.cloudfunctions.net/compute-admin-test" \
  -H "Authorization: Bearer $TOKEN" \
  -H "Content-Type: application/json" \
  -d '{"action": "status"}'
```

### 2. Analyze What Would Run

```bash
curl -X POST \
  "https://europe-west1-stocks-12345.cloudfunctions.net/compute-admin-test" \
  -H "Authorization: Bearer $TOKEN" \
  -H "Content-Type: application/json" \
  -d '{"action": "analyze", "date": "2026-01-25"}'
```

### 3. Run Full Computation

```bash
curl -X POST \
  "https://europe-west1-stocks-12345.cloudfunctions.net/compute-admin-test" \
  -H "Authorization: Bearer $TOKEN" \
  -H "Content-Type: application/json" \
  -d '{"action": "run", "computation": "UserPortfolioSummary", "date": "2026-01-25", "force": true}'
```

### 4. Run Limited Test

```bash
curl -X POST \
  "https://europe-west1-stocks-12345.cloudfunctions.net/compute-admin-test" \
  -H "Authorization: Bearer $TOKEN" \
  -H "Content-Type: application/json" \
  -d '{"action": "run_limited", "computation": "UserPortfolioSummary", "date": "2026-01-25", "limit": 5}'
```

### 5. Test Specific Entities

```bash
curl -X POST \
  "https://europe-west1-stocks-12345.cloudfunctions.net/compute-admin-test" \
  -H "Authorization: Bearer $TOKEN" \
  -H "Content-Type: application/json" \
  -d '{"action": "run", "computation": "UserPortfolioSummary", "date": "2026-01-25", "entityIds": ["user-123", "user-456"], "force": true}'
```

### 6. Test Worker Directly

```bash
curl -X POST \
  "https://europe-west1-stocks-12345.cloudfunctions.net/compute-admin-test" \
  -H "Authorization: Bearer $TOKEN" \
  -H "Content-Type: application/json" \
  -d '{"action": "test_worker", "computation": "UserPortfolioSummary", "date": "2026-01-25", "entityIds": ["user-123"]}'
```

### 7. Test with Worker Pool Override

```bash
curl -X POST \
  "https://europe-west1-stocks-12345.cloudfunctions.net/compute-admin-test" \
  -H "Authorization: Bearer $TOKEN" \
  -H "Content-Type: application/json" \
  -d '{"action": "run", "computation": "UserPortfolioSummary", "date": "2026-01-25", "useWorkerPool": true, "force": true}'
```

## Available Actions

| Action | Description |
|--------|-------------|
| `status` | List all computations and system status |
| `analyze` | Check what would run for a given date |
| `run` | Execute a full computation |
| `run_limited` | Execute on N random entities (safer for testing) |
| `test_worker` | Direct test of worker function logic |

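The same calls can be scripted from Node. A minimal sketch using `google-auth-library` (not part of this diff; the dependency and the endpoint URL from the curl examples above are assumed):

```javascript
// Hypothetical helper — mints an identity token with the function URL as
// audience and POSTs an action, mirroring the curl examples above.
const { GoogleAuth } = require('google-auth-library');

const URL = 'https://europe-west1-stocks-12345.cloudfunctions.net/compute-admin-test';

async function callAdmin(body) {
  const auth = new GoogleAuth();
  const client = await auth.getIdTokenClient(URL);
  const res = await client.request({ url: URL, method: 'POST', data: body });
  return res.data;
}

callAdmin({ action: 'status' }).then(console.log).catch(console.error);
```
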
@@ -0,0 +1,588 @@
# Comprehensive Review & Improvement Suggestions

## 🚨 **CRITICAL ISSUES**

### 1. **Infinite Loop Risk in Conditional Dependencies**
```javascript
// framework/core/Manifest.js - Line ~50
if (config.conditionalDependencies) {
  for (const condDep of config.conditionalDependencies) {
    const shouldLoad = condDep.condition({ date: dateStr, config: this.config });
    if (shouldLoad) {
      // NO CYCLE DETECTION for conditional deps!
      const depStatus = dailyStatus.get(condDep.computation.toLowerCase());
      if (depStatus) {
        depResults[condDep.computation] = await this.stateRepository.getResult(dateStr, condDep.computation);
      }
    }
  }
}
```

**Problem**: Conditional dependencies bypass cycle detection in `Graph.detectCycle()`. If Comp A conditionally depends on Comp B, and Comp B conditionally depends on Comp A, you get an infinite loop.

**Fix**:
```javascript
// In ManifestBuilder._processComputation()
_processComputation(ComputationClass) {
  const config = ComputationClass.getConfig();

  // Merge conditional deps into regular deps for graph analysis
  const allDeps = [...(config.dependencies || [])];
  if (config.conditionalDependencies) {
    allDeps.push(...config.conditionalDependencies.map(cd => cd.computation));
  }

  return {
    // ...
    dependencies: allDeps.map(d => this._normalize(d)),
    conditionalDependencies: config.conditionalDependencies || [],
    // ...
  };
}
```

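`Graph.detectCycle()` itself is not part of this diff. For orientation, a minimal DFS sketch of the check that the merged `dependencies` list above would flow into — the names and the manifest shape (`Map<name, { dependencies: string[] }>`) are assumptions:

```javascript
// Minimal cycle check over the merged dependency map — a sketch only.
function detectCycle(manifest) {
  const visiting = new Set(), done = new Set();
  const visit = (name, path) => {
    if (done.has(name)) return;
    if (visiting.has(name)) {
      // A conditional A <-> B cycle now surfaces here instead of hanging at runtime.
      throw new Error(`Dependency cycle: ${[...path, name].join(' -> ')}`);
    }
    visiting.add(name);
    for (const dep of manifest.get(name)?.dependencies || []) visit(dep, [...path, name]);
    visiting.delete(name);
    done.add(name);
  };
  for (const name of manifest.keys()) visit(name, []);
}
```
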
### 2. **Cost Spiral in Worker Pool Retries**
```javascript
// framework/execution/RemoteTaskRunner.js - Line ~200
async _invokeWorkerWithRetry(payload, attempt = 1) {
  try {
    return await this._invokeWorker(payload);
  } catch (e) {
    if (isRetryable && attempt < this.retries) {
      // NO COST TRACKING ON RETRIES!
      return this._invokeWorkerWithRetry(payload, attempt + 1);
    }
    throw e;
  }
}
```

**Problem**: Failed workers retry without tracking costs. If 1000 entities fail and each retries 2x, you pay 3x the expected cost with no warning.

**Fix**: Add cost multiplier tracking:
```javascript
class RemoteTaskRunner {
  constructor(config, logger) {
    this.costMultiplier = 1.0;
    this.totalRetries = 0;
  }

  async _invokeWorkerWithRetry(payload, attempt = 1) {
    try {
      return await this._invokeWorker(payload);
    } catch (e) {
      if (isRetryable && attempt < this.retries) {
        this.totalRetries++;
        this.costMultiplier = 1 + (this.totalRetries / this.stats.totalInvocations);

        if (this.costMultiplier > 2.0) {
          throw new Error(`🚨 RETRY COST SPIRAL: ${this.totalRetries} retries (${this.costMultiplier.toFixed(1)}x cost). Aborting.`);
        }

        await new Promise(r => setTimeout(r, delay));
        return this._invokeWorkerWithRetry(payload, attempt + 1);
      }
      throw e;
    }
  }
}
```

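Both snippets reference `isRetryable` and `delay` without defining them (they are elided from the excerpt). A plausible shape, assuming HTTP-style worker errors and exponential backoff; the real classification in `RemoteTaskRunner` may differ:

```javascript
// Sketch only — retry throttling and transient transport failures,
// fail fast on logic errors (4xx).
function isRetryable(e) {
  if (e.code === 'ECONNRESET' || e.code === 'ETIMEDOUT') return true;
  const status = e.response?.status;
  return status === 429 || status === 500 || status === 503;
}

// Exponential backoff with jitter for the `delay` awaited before each retry.
function backoffMs(attempt, baseMs = 500) {
  return baseMs * 2 ** (attempt - 1) + Math.random() * 100;
}
```
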
### 3. **Runaway Query Cost in Historical Computations**
```javascript
// framework/data/DataFetcher.js - Line ~80
async fetchForComputation(requires, targetDate, entities = null) {
  await Promise.all(Object.entries(requires).map(async ([tableName, spec]) => {
    const data = await this.fetch({
      lookback: spec.lookback || 0, // No upper limit!
      // ...
    });
  }));
}
```

**Problem**: A computation with `lookback: 365` on a 10TB table scans 365 days. If you have 100 computations with high lookbacks, costs explode.

**Fix**: Add lookback budget:
```javascript
// In config
execution: {
  maxLookbackDays: 90, // Hard limit
  maxLookbackBudgetGB: 100, // Cumulative budget per run
}

// In DataFetcher
async fetch(options) {
  if (options.lookback > this.config.execution.maxLookbackDays) {
    throw new Error(`Lookback ${options.lookback} exceeds limit of ${this.config.execution.maxLookbackDays} days`);
  }

  // Estimate scan size before executing
  const estimatedGB = await this._estimateScanSize(options);
  this.cumulativeScanGB += estimatedGB;

  if (this.cumulativeScanGB > this.config.execution.maxLookbackBudgetGB) {
    throw new Error(`Query budget exceeded: ${this.cumulativeScanGB}GB / ${this.config.execution.maxLookbackBudgetGB}GB`);
  }

  // Execute...
}
```

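The fix calls `this._estimateScanSize(options)` and accumulates into `this.cumulativeScanGB`, neither of which appears in the diff; the counter presumably starts at 0 per run. A sketch of the estimator using BigQuery's dry-run mode, assuming a hypothetical `_buildQuery(options)` helper that returns `{ sql, params }`:

```javascript
// Sketch — a dry-run job is priced but never executed, so this learns
// totalBytesProcessed before committing to the query.
async _estimateScanSize(options) {
  const { sql, params } = this._buildQuery(options); // hypothetical helper
  const [job] = await this.bigquery.createQueryJob({ query: sql, params, dryRun: true });
  const bytes = parseInt(job.metadata.statistics.totalBytesProcessed || '0', 10);
  return bytes / 1e9; // GB, to match maxLookbackBudgetGB
}
```
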
---

## ⚡ **PERFORMANCE BOTTLENECKS**

### 4. **N+1 Dependency Fetches**
```javascript
// framework/execution/Orchestrator.js - Line ~450
async _lazyLoadDependency(dateStr, depName, entityId, preloaded) {
  if (preloaded[depName] && entityId) return preloaded[depName][entityId];
  // FETCHES ONE ENTITY AT A TIME!
  return this.stateRepository.getEntityResult(dateStr, depName, entityId);
}
```

**Problem**: If a computation depends on `PopularInvestorProfileMetrics` and processes 10,000 users, this issues 10,000 individual BigQuery queries.

**Fix**: Already partially addressed with `getBatchEntityResults()`, but enforce it:
```javascript
async _executeStreaming(entry, dateStr, depResults, previousResult, options) {
  // ...
  for await (const batch of batchStream) {
    const { entityIds } = batch;

    // FORCE batch prefetch
    const batchDeps = await this._prefetchBatchDependencies(entry, dateStr, depResults, entityIds);

    // Remove lazy load fallback to catch bugs
    await Promise.all(entityIds.map(entityId => limit(async () => {
      const context = {
        getDependency: (depName, targetId) => {
          const id = targetId || entityId;
          if (!batchDeps[depName]?.has(id)) {
            throw new Error(`LAZY LOAD DETECTED: ${depName}:${id}. This is a bug - dependencies must be prefetched.`);
          }
          return batchDeps[depName].get(id);
        },
        // ...
      };
    })));
  }
}
```

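`_prefetchBatchDependencies` is referenced above but not excerpted. A sketch of the batched lookup it implies, assuming `entry.dependencies` lists dependency names and that the `getBatchEntityResults()` mentioned in the fix returns a `Map<entityId, result>`:

```javascript
// Sketch — one repository call per dependency per batch, instead of one per entity.
async _prefetchBatchDependencies(entry, dateStr, depResults, entityIds) {
  const batchDeps = {};
  await Promise.all((entry.dependencies || []).map(async (depName) => {
    batchDeps[depName] = await this.stateRepository.getBatchEntityResults(dateStr, depName, entityIds);
  }));
  return batchDeps; // { depName: Map<entityId, result> } — matches the .has()/.get() use above
}
```
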
### 5. **Redundant Schema Fetches**
```javascript
// framework/data/SchemaRegistry.js - Line ~70
async getSchema(tableName) {
  const cached = this.cache.get(tableName);
  if (cached && !this._isExpired(cached)) {
    return cached; // Good!
  }
  // But if 100 concurrent requests arrive during cold start...
  return await this._fetchAndCacheSchema(tableName); // 100 concurrent fetches!
}
```

**Fix**: Coalesce concurrent requests:
```javascript
constructor() {
  this.cache = new Map();
  this.pendingFetches = new Map(); // NEW
}

async getSchema(tableName) {
  const cached = this.cache.get(tableName);
  if (cached && !this._isExpired(cached)) return cached;

  // Coalesce concurrent requests
  if (this.pendingFetches.has(tableName)) {
    return this.pendingFetches.get(tableName);
  }

  const fetchPromise = this._fetchAndCacheSchema(tableName);
  this.pendingFetches.set(tableName, fetchPromise);

  try {
    return await fetchPromise;
  } finally {
    this.pendingFetches.delete(tableName);
  }
}
```

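To sanity-check the coalescing, stub the fetch and hammer the registry concurrently (a test sketch against the class as excerpted above):

```javascript
// 100 concurrent lookups for the same table should produce exactly one fetch.
const registry = new SchemaRegistry();
let fetches = 0;
registry._fetchAndCacheSchema = async () => {
  fetches++;
  await new Promise(r => setTimeout(r, 50)); // simulate BigQuery metadata latency
  return { fields: [] };
};

Promise.all(Array.from({ length: 100 }, () => registry.getSchema('positions')))
  .then(() => console.log(`fetches = ${fetches}`)); // fetches = 1
```
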
### 6. **Inefficient Batch Splitting**
```javascript
// framework/data/DataFetcher.js - Line ~200
async *fetchBatched(options, batchSize = 1000) {
  let batch = [];
  let currentEntity = null;

  for await (const row of rowStream) {
    if (entityField) {
      const rowEntity = String(row[entityField]);

      // Splits batch even if at 999 rows (wastes 1 row)
      if (batch.length >= batchSize && rowEntity !== currentEntity && currentEntity !== null) {
        yield this._transform(batch, { lookback, dateField, entityField, dataField });
        batch = [];
      }
      currentEntity = rowEntity;
    }
    batch.push(row);
  }
}
```

**Problem**: Batches split only on entity boundaries, so an entity with 5,000 rows and `batchSize=1000` lands in one oversized 5,000-row batch. A single giant batch is processed as a unit, so work that could run as five parallel batches is serialized.

**Fix**: Add max batch size override:
```javascript
async *fetchBatched(options, batchSize = 1000) {
  const maxBatchSize = batchSize * 10; // Allow up to 10x for large entities

  for await (const row of rowStream) {
    if (entityField) {
      const rowEntity = String(row[entityField]);

      // Force split if exceeding max
      if (batch.length >= maxBatchSize) {
        yield this._transform(batch, ...);
        batch = [];
        currentEntity = null;
      }

      // Normal split logic
      if (batch.length >= batchSize && rowEntity !== currentEntity && currentEntity !== null) {
        yield this._transform(batch, ...);
        batch = [];
      }
      currentEntity = rowEntity;
    }
    batch.push(row);
  }
}
```

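Worked through: with `batchSize = 1000` and the 10x cap (`maxBatchSize = 10000`), a 5,000-row entity still ships as one coherent batch, while a 25,000-row entity is force-split into roughly 10,000 + 10,000 + 5,000. Note the forced splits can land mid-entity (`currentEntity` is reset to `null`), so downstream consumers must tolerate one entity spanning several batches.
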
---

## 🔒 **RELIABILITY ISSUES**

### 7. **Zombie Detection Race Condition**
```javascript
// handlers/scheduler.js - Line ~80
let zombies = [];
try {
  zombies = await storageManager.findZombies(ZOMBIE_THRESHOLD_MINUTES);
  if (zombies.length > 0) {
    console.log(`DETECTED ${zombies.length} ZOMBIES`);
  }
} catch (e) {
  console.error(`Zombie check failed: ${e.message}`);
}

// Later: dispatch zombies
const zombieEntries = zombies.map(z => {...});
```

**Problem**: If `findZombies()` throws, the scheduler continues but `zombieEntries` is empty. Zombies never recover.

**Fix**: Make zombie detection non-blocking:
```javascript
async function schedulerHandler(req, res) {
  // 1. Find due computations (CRITICAL PATH)
  const dueComputations = findDueComputations(now);

  // 2. Zombie detection (BEST EFFORT)
  let zombies = [];
  try {
    zombies = await Promise.race([
      storageManager.findZombies(ZOMBIE_THRESHOLD_MINUTES),
      new Promise((_, reject) => setTimeout(() => reject(new Error('Timeout')), 5000))
    ]);
  } catch (e) {
    // Log but don't block scheduler
    console.warn(`⚠️ Zombie detection failed (non-fatal): ${e.message}`);
  }

  // Continue with due + zombies...
}
```

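One caveat on the sketch above: `Promise.race` only abandons the result on timeout — it does not cancel the in-flight `findZombies()` call, which keeps running (and billing its reads) in the background. That is acceptable here because the scheduler simply proceeds without the result.
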
### 8. **Silent Firestore Write Failures**
```javascript
// framework/storage/StorageManager.js - Line ~650
async _writeToFirestore(dateStr, entry, results, firestoreConfig) {
  try {
    // ... batching logic ...
    return Promise.all(batches.map(b => limit(() => b.commit()))).then(() => ({ docCount }));
  } catch (error) {
    if (error.message.includes('Could not load the default credentials')) {
      console.log('  Skipped: No GCP credentials');
      return 'skipped';
    }
    console.log(`  Error: ${error.message}`);
    return false; // Returns false but doesn't throw!
  }
}
```

**Problem**: If Firestore writes fail, `commitResults()` returns `{ firestore: false }` but execution continues. Users think data is saved.

**Fix**:
```javascript
async commitResults(dateStr, entry, results, depResultHashes = {}) {
  const storageConfig = this._resolveStorageConfig(entry);

  const firestoreTask = async () => {
    if (!storageConfig.firestore?.enabled) return null;
    const result = await this._writeToFirestore(...);

    // ENFORCE: If enabled, must succeed
    if (result === false || result === 'skipped') {
      throw new Error(`Firestore write failed for ${entry.name} but was configured as enabled`);
    }
    return result;
  };

  const [bigqueryResult, firestoreResult] = await Promise.all([gcsTask(), firestoreTask()]);
  return { bigquery: bigqueryResult, firestore: firestoreResult };
}
```

---

## 🎯 **NEW FEATURE SUGGESTIONS**

### 9. **Adaptive Batch Sizing**
```javascript
// NEW: framework/execution/AdaptiveBatcher.js
class AdaptiveBatcher {
  constructor(initialSize = 1000) {
    this.currentSize = initialSize;
    this.performanceWindow = [];
  }

  recordBatch(entityCount, durationMs) {
    const throughput = entityCount / (durationMs / 1000); // entities/sec
    this.performanceWindow.push({ size: entityCount, throughput });

    if (this.performanceWindow.length > 10) {
      this.performanceWindow.shift();
    }

    // Adjust batch size based on throughput trend
    const avgThroughput = this.performanceWindow.reduce((sum, p) => sum + p.throughput, 0) / this.performanceWindow.length;

    if (avgThroughput < 10) {
      // Slow processing - reduce batch size for parallelism
      this.currentSize = Math.max(100, this.currentSize * 0.8);
    } else if (avgThroughput > 100) {
      // Fast processing - increase batch size
      this.currentSize = Math.min(10000, this.currentSize * 1.2);
    }
  }

  getSize() {
    return Math.round(this.currentSize);
  }
}

// In Orchestrator
this.adaptiveBatcher = new AdaptiveBatcher(config.execution.batchSize);

for await (const batch of batchStream) {
  const startTime = Date.now();
  // ... process batch ...
  this.adaptiveBatcher.recordBatch(batch.entityIds.length, Date.now() - startTime);

  // Use adaptive size for next batch
  batchSize = this.adaptiveBatcher.getSize();
}
```

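One wrinkle in the orchestrator snippet: reassigning a local `batchSize` after the `for await` loop has begun does not affect a generator that already captured its arguments. For the adaptive size to take effect, the producer would need to read it per batch — for example via a callback (a hypothetical signature change, not something in this diff):

```javascript
// Hypothetical variant of fetchBatched that polls for the current target size.
async *fetchBatched(rowStream, getBatchSize = () => 1000) {
  let batch = [];
  for await (const row of rowStream) {
    batch.push(row);
    if (batch.length >= getBatchSize()) { // re-evaluated as the batcher adapts
      yield batch;
      batch = [];
    }
  }
  if (batch.length) yield batch;
}

// Usage: fetchBatched(rows, () => this.adaptiveBatcher.getSize())
```
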
### 10. **Query Plan Caching**
```javascript
// NEW: framework/data/QueryPlanCache.js
const crypto = require('crypto');

class QueryPlanCache {
  constructor(bigquery) {
    this.cache = new Map();
    this.bigquery = bigquery;
  }

  async getEstimate(query) {
    const queryHash = crypto.createHash('md5').update(query.sql).digest('hex');

    if (this.cache.has(queryHash)) {
      return this.cache.get(queryHash);
    }

    // Use BigQuery dry run for cost estimation
    const [job] = await this.bigquery.createQueryJob({
      query: query.sql,
      params: query.params,
      dryRun: true
    });

    const [metadata] = await job.getMetadata();
    const estimate = {
      bytesProcessed: parseInt(metadata.statistics.totalBytesProcessed || 0, 10),
      cacheHit: metadata.statistics.query.cacheHit,
      estimatedCost: (parseInt(metadata.statistics.totalBytesProcessed || 0, 10) / 1099511627776) * 5 // assumes $5 per TiB scanned
    };

    this.cache.set(queryHash, estimate);
    return estimate;
  }
}

// In DataFetcher
async _execute(query) {
  const estimate = await this.queryPlanCache.getEstimate(query);

  if (estimate.estimatedCost > 1.0) { // $1+ queries
    this._log('WARN', `Expensive query detected: $${estimate.estimatedCost.toFixed(2)} - ${query.sql.substring(0, 100)}`);
  }

  // Execute as normal...
}
```

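Two notes on this cache: BigQuery dry runs are free, so the estimates themselves cost nothing; and because the key hashes `query.sql` alone, two runs with identical SQL but different `params` (say, different date ranges) share one cached estimate, which can misestimate when parameters change the amount of data scanned.
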
### 11. **Circuit Breaker for Failed Computations**
```javascript
// NEW: framework/resilience/CircuitBreaker.js
class CircuitBreaker {
  constructor(failureThreshold = 5, resetTimeout = 60000) {
    this.failures = new Map();
    this.failureThreshold = failureThreshold;
    this.resetTimeout = resetTimeout;
  }

  recordFailure(computationName) {
    const state = this.failures.get(computationName) || { count: 0, lastFailure: null };
    state.count++;
    state.lastFailure = Date.now();
    this.failures.set(computationName, state);

    if (state.count >= this.failureThreshold) {
      console.warn(`🔴 Circuit breaker OPEN for ${computationName} (${state.count} failures)`);
    }
  }

  recordSuccess(computationName) {
    this.failures.delete(computationName);
  }

  shouldSkip(computationName) {
    const state = this.failures.get(computationName);
    if (!state || state.count < this.failureThreshold) return false;

    // Auto-reset after timeout
    if (Date.now() - state.lastFailure > this.resetTimeout) {
      console.log(`🟡 Circuit breaker HALF-OPEN for ${computationName} (attempting reset)`);
      this.failures.delete(computationName);
      return false;
    }

    return true;
  }
}

// In Orchestrator._executeComputation
if (this.circuitBreaker.shouldSkip(entry.name)) {
  return { name: entry.name, status: 'circuit_open', reason: 'Too many consecutive failures' };
}

try {
  const result = await this._executeGlobal(...);
  this.circuitBreaker.recordSuccess(entry.name);
  return result;
} catch (e) {
  this.circuitBreaker.recordFailure(entry.name);
  throw e;
}
```

---

## 📊 **MONITORING ENHANCEMENTS**

### 12. **Real-Time Cost Dashboard**
```javascript
// NEW: Create a separate Cloud Function that queries cost tables
const { BigQuery } = require('@google-cloud/bigquery');
const bigquery = new BigQuery();

exports.getCostDashboard = async (req, res) => {
  const { startDate, endDate } = req.query;

  const query = `
    SELECT
      computation_name,
      SUM(estimated_cost_usd) as total_cost,
      COUNT(*) as runs,
      AVG(estimated_cost_usd) as avg_cost,
      MAX(estimated_cost_usd) as max_cost
    FROM \`${PROJECT}.${DATASET}.computation_costs\`
    WHERE date BETWEEN @start AND @end
    GROUP BY computation_name
    ORDER BY total_cost DESC
  `;

  const [rows] = await bigquery.query({ query, params: { start: startDate, end: endDate } });

  res.json({
    period: { start: startDate, end: endDate },
    totalCost: rows.reduce((sum, r) => sum + r.total_cost, 0),
    computations: rows,
    alert: rows.some(r => r.total_cost > 10) ? 'HIGH_COST_DETECTED' : null
  });
};
```

---

## 🎛️ **CONFIGURATION IMPROVEMENTS**

### 13. **Cost-Aware Scheduling**
```jsonc
// In config/bulltrackers.config.js
{
  "computations": [
    {
      "class": "PopularInvestorRiskMetrics",
      "schedule": {
        "frequency": "daily",
        "time": "02:00",
        "costBudget": 5.0, // NEW: Skip if daily cost exceeded
        "priority": 1 // NEW: Lower priority runs only if budget allows
      }
    }
  ],
  "execution": {
    "dailyCostBudget": 50, // Global budget
    "costCheckInterval": 3600000 // Re-check every hour
  }
}
```

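The config above only declares the budgets; nothing in this diff shows the scheduler consuming them. A sketch of what enforcement might look like (all names hypothetical), ordered so higher-priority computations claim budget first:

```javascript
// Hypothetical enforcement — assumes getDailySpend() reads today's total from
// the computation_costs table used by the dashboard in #12.
async function filterByBudget(dueComputations, config, getDailySpend) {
  let spent = await getDailySpend();
  const runnable = [];
  // Lower `priority` value = more important; it claims budget first.
  const byPriority = [...dueComputations].sort((a, b) => a.schedule.priority - b.schedule.priority);
  for (const comp of byPriority) {
    const estimated = comp.avgCost ?? 0; // e.g. rolling average from past runs
    if (spent + estimated > config.execution.dailyCostBudget) continue; // global cap
    if (estimated > (comp.schedule.costBudget ?? Infinity)) continue; // per-computation cap
    spent += estimated;
    runnable.push(comp);
  }
  return runnable;
}
```
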
---

## Summary of Critical Fixes Needed

| Issue | Severity | Fix Complexity | Impact |
|-------|----------|----------------|--------|
| Conditional dependency cycles | 🔴 CRITICAL | Low | Infinite loops |
| Worker pool retry cost spiral | 🔴 CRITICAL | Medium | Cost overruns |
| Unbounded lookback queries | 🔴 CRITICAL | Low | Query cost explosion |
| N+1 dependency fetches | 🟡 HIGH | Medium | 100x slower execution |
| Schema fetch stampede | 🟡 HIGH | Low | Cold start failures |
| Silent Firestore failures | 🟡 HIGH | Low | Data loss |
| Zombie recovery failures | 🟡 HIGH | Low | Stuck computations |

**Immediate Actions:**
1. Add conditional dependency cycle detection (1 hour fix)
2. Add retry cost tracking in RemoteTaskRunner (2 hours)
3. Add lookback budget limits (1 hour)
4. Enforce batch dependency prefetching (already partially done, needs enforcement - 2 hours)
5. Add schema fetch coalescing (1 hour)