bulltrackers-module 1.0.735 → 1.0.736
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/functions/computation-system-v2/config/bulltrackers.config.js +75 -5
- package/functions/computation-system-v2/framework/data/DataFetcher.js +107 -105
- package/functions/computation-system-v2/framework/execution/Orchestrator.js +357 -150
- package/functions/computation-system-v2/framework/execution/RemoteTaskRunner.js +327 -0
- package/functions/computation-system-v2/framework/execution/middleware/LineageMiddleware.js +9 -4
- package/functions/computation-system-v2/framework/execution/middleware/ProfilerMiddleware.js +9 -21
- package/functions/computation-system-v2/framework/index.js +10 -3
- package/functions/computation-system-v2/framework/lineage/LineageTracker.js +53 -57
- package/functions/computation-system-v2/framework/monitoring/Profiler.js +54 -52
- package/functions/computation-system-v2/framework/resilience/Checkpointer.js +173 -27
- package/functions/computation-system-v2/framework/storage/StorageManager.js +419 -187
- package/functions/computation-system-v2/handlers/index.js +10 -1
- package/functions/computation-system-v2/handlers/scheduler.js +85 -193
- package/functions/computation-system-v2/handlers/worker.js +242 -0
- package/functions/computation-system-v2/test/analyze-results.js +238 -0
- package/functions/computation-system-v2/test/{test-dispatcher.js → other/test-dispatcher.js} +6 -6
- package/functions/computation-system-v2/test/{test-framework.js → other/test-framework.js} +14 -14
- package/functions/computation-system-v2/test/{test-real-execution.js → other/test-real-execution.js} +1 -1
- package/functions/computation-system-v2/test/{test-real-integration.js → other/test-real-integration.js} +3 -3
- package/functions/computation-system-v2/test/{test-refactor-e2e.js → other/test-refactor-e2e.js} +3 -3
- package/functions/computation-system-v2/test/{test-risk-metrics-computation.js → other/test-risk-metrics-computation.js} +4 -4
- package/functions/computation-system-v2/test/{test-scheduler.js → other/test-scheduler.js} +1 -1
- package/functions/computation-system-v2/test/{test-storage.js → other/test-storage.js} +2 -2
- package/functions/computation-system-v2/test/run-pipeline-test.js +554 -0
- package/functions/computation-system-v2/test/test-worker-pool.js +494 -0
- package/package.json +1 -1
- package/functions/computation-system-v2/computations/TestComputation.js +0 -46
- /package/functions/computation-system-v2/test/{test-results.json → other/test-results.json} +0 -0
|
@@ -1,79 +1,81 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* @fileoverview Computation Profiler
|
|
3
3
|
* Tracks execution metrics (duration, memory) for computations.
|
|
4
|
+
* * UPDATE: Now aggregates metrics in memory to allow single-row reporting.
|
|
4
5
|
*/
|
|
5
6
|
|
|
6
7
|
class ComputationProfiler {
|
|
7
8
|
constructor() {
|
|
8
|
-
this.
|
|
9
|
+
this.activeProfiles = new Map();
|
|
10
|
+
this.completedStats = new Map(); // Stores arrays of durations/memory per computation
|
|
9
11
|
}
|
|
10
12
|
|
|
11
13
|
startProfile(computationName, entityId = null) {
|
|
12
|
-
const
|
|
14
|
+
const id = entityId ? `${computationName}:${entityId}` : `${computationName}:${Date.now()}:${Math.random()}`;
|
|
13
15
|
|
|
14
|
-
this.
|
|
16
|
+
this.activeProfiles.set(id, {
|
|
17
|
+
name: computationName,
|
|
15
18
|
startTime: Date.now(),
|
|
16
|
-
startMemory: process.memoryUsage().heapUsed
|
|
17
|
-
queryCount: 0,
|
|
18
|
-
bytesProcessed: 0
|
|
19
|
+
startMemory: process.memoryUsage().heapUsed
|
|
19
20
|
});
|
|
20
21
|
|
|
21
|
-
return
|
|
22
|
+
return id;
|
|
22
23
|
}
|
|
23
24
|
|
|
24
|
-
endProfile(
|
|
25
|
-
const profile = this.
|
|
26
|
-
if (!profile) return;
|
|
25
|
+
endProfile(id, metadata = {}) {
|
|
26
|
+
const profile = this.activeProfiles.get(id);
|
|
27
|
+
if (!profile) return null;
|
|
27
28
|
|
|
28
29
|
const endTime = Date.now();
|
|
29
30
|
const endMemory = process.memoryUsage().heapUsed;
|
|
30
|
-
|
|
31
|
+
const duration = endTime - profile.startTime;
|
|
32
|
+
const memoryDelta = endMemory - profile.startMemory;
|
|
33
|
+
|
|
34
|
+
// Store for aggregation
|
|
35
|
+
if (!this.completedStats.has(profile.name)) {
|
|
36
|
+
this.completedStats.set(profile.name, { durations: [], memories: [] });
|
|
37
|
+
}
|
|
38
|
+
const stats = this.completedStats.get(profile.name);
|
|
39
|
+
stats.durations.push(duration);
|
|
40
|
+
stats.memories.push(memoryDelta);
|
|
41
|
+
|
|
42
|
+
this.activeProfiles.delete(id);
|
|
43
|
+
|
|
31
44
|
return {
|
|
32
|
-
duration
|
|
33
|
-
memoryDelta
|
|
34
|
-
queriesExecuted: profile.queryCount,
|
|
35
|
-
bytesProcessed: profile.bytesProcessed,
|
|
45
|
+
duration,
|
|
46
|
+
memoryDelta,
|
|
36
47
|
...metadata
|
|
37
48
|
};
|
|
38
49
|
}
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
50
|
+
|
|
51
|
+
/**
|
|
52
|
+
* Retrieves aggregated statistics for a specific computation and clears them.
|
|
53
|
+
*/
|
|
54
|
+
getAndClearStats(computationName) {
|
|
55
|
+
const stats = this.completedStats.get(computationName);
|
|
56
|
+
if (!stats || stats.durations.length === 0) return null;
|
|
57
|
+
|
|
58
|
+
const count = stats.durations.length;
|
|
59
|
+
const totalDuration = stats.durations.reduce((a, b) => a + b, 0);
|
|
60
|
+
const avgDuration = totalDuration / count;
|
|
61
|
+
const avgMemory = stats.memories.reduce((a, b) => a + b, 0) / count;
|
|
62
|
+
|
|
63
|
+
// Calculate P95
|
|
64
|
+
const sorted = [...stats.durations].sort((a, b) => a - b);
|
|
65
|
+
const p95 = sorted[Math.ceil(count * 0.95) - 1];
|
|
66
|
+
|
|
67
|
+
// Clear memory
|
|
68
|
+
this.completedStats.delete(computationName);
|
|
69
|
+
|
|
70
|
+
return {
|
|
71
|
+
entityCount: count,
|
|
72
|
+
totalDuration,
|
|
73
|
+
avgDuration,
|
|
74
|
+
p95Duration: p95,
|
|
75
|
+
minDuration: sorted[0],
|
|
76
|
+
maxDuration: sorted[count - 1],
|
|
77
|
+
avgMemoryDelta: avgMemory
|
|
47
78
|
};
|
|
48
|
-
|
|
49
|
-
for (const entry of manifest) {
|
|
50
|
-
const compProfiles = Array.from(this.profiles.entries())
|
|
51
|
-
.filter(([k]) => k.startsWith(entry.name));
|
|
52
|
-
|
|
53
|
-
if (compProfiles.length === 0) continue;
|
|
54
|
-
|
|
55
|
-
const durations = compProfiles.map(([, p]) => Date.now() - p.startTime);
|
|
56
|
-
const memories = compProfiles.map(([, p]) => process.memoryUsage().heapUsed - p.startMemory);
|
|
57
|
-
|
|
58
|
-
report.computations.push({
|
|
59
|
-
name: entry.name,
|
|
60
|
-
entityCount: compProfiles.length,
|
|
61
|
-
avgDuration: durations.reduce((a, b) => a + b, 0) / durations.length,
|
|
62
|
-
maxDuration: Math.max(...durations),
|
|
63
|
-
p95Duration: this._percentile(durations, 0.95),
|
|
64
|
-
avgMemory: memories.reduce((a, b) => a + b, 0) / memories.length,
|
|
65
|
-
totalDuration: durations.reduce((a, b) => a + b, 0)
|
|
66
|
-
});
|
|
67
|
-
}
|
|
68
|
-
|
|
69
|
-
report.computations.sort((a, b) => b.totalDuration - a.totalDuration);
|
|
70
|
-
return report;
|
|
71
|
-
}
|
|
72
|
-
|
|
73
|
-
_percentile(arr, p) {
|
|
74
|
-
const sorted = [...arr].sort((a, b) => a - b);
|
|
75
|
-
const index = Math.ceil(sorted.length * p) - 1;
|
|
76
|
-
return sorted[index];
|
|
77
79
|
}
|
|
78
80
|
}
|
|
79
81
|
|
|
@@ -1,42 +1,145 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* @fileoverview Computation Checkpointer
|
|
3
|
-
* Manages save/resume states for long-running computations
|
|
2
|
+
* @fileoverview Computation Checkpointer (Append-Only Version)
|
|
3
|
+
* Manages save/resume states for long-running computations.
|
|
4
|
+
* Implements "Heartbeat & Steal" using an APPEND-ONLY pattern to avoid BigQuery Streaming Buffer locks.
|
|
5
|
+
* * UPDATE: Added 'force' parameter to initCheckpoint for local testing bypass.
|
|
4
6
|
*/
|
|
5
7
|
|
|
6
8
|
const crypto = require('crypto');
|
|
7
9
|
|
|
10
|
+
// Max number of times the Scheduler can revive a Zombie before we give up.
|
|
11
|
+
const MAX_ATTEMPTS = 3;
|
|
12
|
+
|
|
8
13
|
class Checkpointer {
|
|
9
14
|
constructor(config, storage) {
|
|
10
15
|
this.config = config;
|
|
11
16
|
this.storage = storage;
|
|
12
17
|
this.enabled = config.checkpointing?.enabled !== false;
|
|
18
|
+
|
|
19
|
+
// Identity of this specific process/worker
|
|
20
|
+
this.workerId = crypto.randomUUID();
|
|
21
|
+
this.heartbeatInterval = null;
|
|
22
|
+
|
|
23
|
+
// Local cache of the current state (for append-only writes)
|
|
24
|
+
this.currentState = null;
|
|
13
25
|
}
|
|
14
26
|
|
|
15
|
-
|
|
27
|
+
/**
|
|
28
|
+
* Initialize or Resume a checkpoint.
|
|
29
|
+
* Handles zombie detection, atomic lock stealing, and version-aware dead-letter logic.
|
|
30
|
+
* @param {string} dateStr - Target date
|
|
31
|
+
* @param {string} computationName - Name of the computation
|
|
32
|
+
* @param {number} totalEntities - Total entities to process (approx)
|
|
33
|
+
* @param {string} codeHash - Hash of the current running code
|
|
34
|
+
* @param {boolean} force - If true, bypasses lock checks (steals immediately). Useful for testing.
|
|
35
|
+
*/
|
|
36
|
+
async initCheckpoint(dateStr, computationName, totalEntities = 0, codeHash, force = false) {
|
|
16
37
|
if (!this.enabled) return null;
|
|
17
38
|
|
|
18
|
-
//
|
|
39
|
+
// 1. Get the LATEST checkpoint row (Snapshot)
|
|
40
|
+
// StorageManager must order by last_updated DESC limit 1 to handle append-only logs
|
|
19
41
|
const existing = await this.storage.getLatestCheckpoint(dateStr, computationName);
|
|
20
42
|
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
43
|
+
// FIX: Force Re-Run (Bypass everything and start fresh)
|
|
44
|
+
// This ensures even "completed" or "up-to-date" tasks are re-run when testing.
|
|
45
|
+
if (force) {
|
|
46
|
+
console.log(`[Checkpointer] ⚡ FORCE mode active. Resetting checkpoint for ${computationName}...`);
|
|
47
|
+
return this._createNew(dateStr, computationName, totalEntities, codeHash);
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
// LOGIC: Version Check
|
|
51
|
+
// If code changed, we generally want to invalidate previous partial progress
|
|
52
|
+
// unless it was fully completed.
|
|
53
|
+
if (existing && existing.code_hash && existing.code_hash !== codeHash && existing.status !== 'completed') {
|
|
54
|
+
console.log(`[Checkpointer] Code change detected (${existing.code_hash} -> ${codeHash}). Resetting checkpoint.`);
|
|
55
|
+
return this._createNew(dateStr, computationName, totalEntities, codeHash);
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
// CASE 1: No checkpoint exists -> Create New
|
|
59
|
+
if (!existing) {
|
|
60
|
+
return this._createNew(dateStr, computationName, totalEntities, codeHash);
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
// CASE 2: Already Completed -> Return cached
|
|
64
|
+
if (existing.status === 'completed') {
|
|
31
65
|
return { isCompleted: true };
|
|
32
66
|
}
|
|
33
67
|
|
|
34
|
-
//
|
|
35
|
-
|
|
36
|
-
|
|
68
|
+
// CASE 3: Zombie / Stale Lock Detection
|
|
69
|
+
// If status is 'running' but heartbeat is old (> 15 mins), it's a Zombie.
|
|
70
|
+
const lastUpdate = new Date(existing.last_updated).getTime();
|
|
71
|
+
const diffMinutes = (Date.now() - lastUpdate) / 1000 / 60;
|
|
72
|
+
const isStale = diffMinutes > 15; // 15 min timeout
|
|
73
|
+
|
|
74
|
+
// If it is running and valid (and not forced), we cannot touch it.
|
|
75
|
+
if (existing.status === 'running' && !isStale) {
|
|
76
|
+
return { isLocked: true, workerId: existing.worker_instance_id };
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
// CASE 4: Dead Letter Queue (Too many retries)
|
|
80
|
+
if (existing.attempts >= MAX_ATTEMPTS) {
|
|
81
|
+
console.warn(`[Checkpointer] ${computationName} exceeded max attempts (${existing.attempts}). Skipping.`);
|
|
82
|
+
return { skipped: true, reason: 'max_attempts_exceeded' };
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
// CASE 5: Resume / Steal Lock
|
|
86
|
+
// It's either 'failed', 'running' (stale), or 'running' (force). We steal it.
|
|
87
|
+
const reason = isStale ? 'ZOMBIE_DETECTED' : 'RESUME';
|
|
88
|
+
console.log(`[Checkpointer] Stealing lock for ${computationName} [${reason}] (Attempt ${existing.attempts + 1}/${MAX_ATTEMPTS})...`);
|
|
37
89
|
|
|
90
|
+
// Prepare new state based on previous
|
|
91
|
+
this.currentState = {
|
|
92
|
+
date: dateStr,
|
|
93
|
+
computation_name: computationName,
|
|
94
|
+
checkpoint_id: existing.checkpoint_id, // Keep ID to link history
|
|
95
|
+
worker_instance_id: this.workerId,
|
|
96
|
+
status: 'running',
|
|
97
|
+
processed_count: existing.processed_count || 0,
|
|
98
|
+
total_entities: totalEntities || existing.total_entities,
|
|
99
|
+
last_entity_id: existing.last_entity_id,
|
|
100
|
+
completed_batches: existing.completed_batches || [], // Keep previous progress
|
|
101
|
+
attempts: (existing.attempts || 1) + 1,
|
|
102
|
+
code_hash: codeHash,
|
|
103
|
+
started_at: existing.started_at,
|
|
104
|
+
last_updated: new Date().toISOString()
|
|
105
|
+
};
|
|
106
|
+
|
|
107
|
+
await this._persistState();
|
|
108
|
+
this._startHeartbeat();
|
|
109
|
+
|
|
38
110
|
return {
|
|
39
|
-
id:
|
|
111
|
+
id: this.currentState.checkpoint_id,
|
|
112
|
+
processedCount: this.currentState.processed_count,
|
|
113
|
+
completedBatches: new Set(this.currentState.completed_batches),
|
|
114
|
+
lastEntityId: this.currentState.last_entity_id,
|
|
115
|
+
isResumed: true
|
|
116
|
+
};
|
|
117
|
+
}
|
|
118
|
+
|
|
119
|
+
async _createNew(dateStr, computationName, totalEntities, codeHash) {
|
|
120
|
+
const id = crypto.randomUUID();
|
|
121
|
+
|
|
122
|
+
this.currentState = {
|
|
123
|
+
date: dateStr,
|
|
124
|
+
computation_name: computationName,
|
|
125
|
+
checkpoint_id: id,
|
|
126
|
+
worker_instance_id: this.workerId,
|
|
127
|
+
status: 'running',
|
|
128
|
+
processed_count: 0,
|
|
129
|
+
total_entities: totalEntities,
|
|
130
|
+
last_entity_id: null,
|
|
131
|
+
completed_batches: [],
|
|
132
|
+
attempts: 1,
|
|
133
|
+
code_hash: codeHash,
|
|
134
|
+
started_at: new Date().toISOString(),
|
|
135
|
+
last_updated: new Date().toISOString()
|
|
136
|
+
};
|
|
137
|
+
|
|
138
|
+
await this._persistState();
|
|
139
|
+
this._startHeartbeat();
|
|
140
|
+
|
|
141
|
+
return {
|
|
142
|
+
id,
|
|
40
143
|
processedCount: 0,
|
|
41
144
|
completedBatches: new Set(),
|
|
42
145
|
lastEntityId: null,
|
|
@@ -45,21 +148,64 @@ class Checkpointer {
|
|
|
45
148
|
}
|
|
46
149
|
|
|
47
150
|
async markBatchComplete(dateStr, computationName, checkpointId, batchIndex, batchSize, lastEntityId) {
|
|
48
|
-
if (!this.enabled || !
|
|
151
|
+
if (!this.enabled || !this.currentState) return;
|
|
49
152
|
|
|
50
|
-
//
|
|
153
|
+
// Update local state
|
|
51
154
|
const processedCount = (batchIndex + 1) * batchSize;
|
|
155
|
+
const batches = new Set(this.currentState.completed_batches || []);
|
|
156
|
+
batches.add(batchIndex);
|
|
157
|
+
|
|
158
|
+
this.currentState.processed_count = processedCount;
|
|
159
|
+
this.currentState.last_entity_id = lastEntityId;
|
|
160
|
+
this.currentState.completed_batches = Array.from(batches); // Store as Array for BigQuery
|
|
161
|
+
this.currentState.last_updated = new Date().toISOString();
|
|
52
162
|
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
lastEntityId,
|
|
56
|
-
batchIndex
|
|
57
|
-
});
|
|
163
|
+
// Append new state row
|
|
164
|
+
await this._persistState();
|
|
58
165
|
}
|
|
59
166
|
|
|
60
167
|
async complete(dateStr, computationName, checkpointId) {
|
|
61
|
-
|
|
62
|
-
|
|
168
|
+
this._stopHeartbeat();
|
|
169
|
+
if (!this.enabled || !this.currentState) return;
|
|
170
|
+
|
|
171
|
+
this.currentState.status = 'completed';
|
|
172
|
+
this.currentState.last_updated = new Date().toISOString();
|
|
173
|
+
|
|
174
|
+
await this._persistState();
|
|
175
|
+
this.currentState = null; // Done
|
|
176
|
+
}
|
|
177
|
+
|
|
178
|
+
async _persistState() {
|
|
179
|
+
if (!this.storage.saveCheckpoint) {
|
|
180
|
+
if (this.storage.createCheckpoint) {
|
|
181
|
+
return this.storage.createCheckpoint(this.currentState);
|
|
182
|
+
}
|
|
183
|
+
throw new Error("StorageManager is missing 'saveCheckpoint' or 'createCheckpoint' method for Append-Only logic.");
|
|
184
|
+
}
|
|
185
|
+
return this.storage.saveCheckpoint(this.currentState);
|
|
186
|
+
}
|
|
187
|
+
|
|
188
|
+
_startHeartbeat() {
|
|
189
|
+
if (this.heartbeatInterval) clearInterval(this.heartbeatInterval);
|
|
190
|
+
|
|
191
|
+
// Pulse every 2 minutes
|
|
192
|
+
this.heartbeatInterval = setInterval(async () => {
|
|
193
|
+
if (this.currentState) {
|
|
194
|
+
this.currentState.last_updated = new Date().toISOString();
|
|
195
|
+
try {
|
|
196
|
+
await this._persistState();
|
|
197
|
+
} catch (e) {
|
|
198
|
+
console.warn(`[Checkpointer] Heartbeat failed: ${e.message}`);
|
|
199
|
+
}
|
|
200
|
+
}
|
|
201
|
+
}, 2 * 60 * 1000);
|
|
202
|
+
}
|
|
203
|
+
|
|
204
|
+
_stopHeartbeat() {
|
|
205
|
+
if (this.heartbeatInterval) {
|
|
206
|
+
clearInterval(this.heartbeatInterval);
|
|
207
|
+
this.heartbeatInterval = null;
|
|
208
|
+
}
|
|
63
209
|
}
|
|
64
210
|
}
|
|
65
211
|
|