bulltrackers-module 1.0.735 → 1.0.737
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/functions/computation-system-v2/config/bulltrackers.config.js +80 -6
- package/functions/computation-system-v2/docs/architecture.md +59 -0
- package/functions/computation-system-v2/framework/data/DataFetcher.js +107 -105
- package/functions/computation-system-v2/framework/execution/Orchestrator.js +357 -150
- package/functions/computation-system-v2/framework/execution/RemoteTaskRunner.js +327 -0
- package/functions/computation-system-v2/framework/execution/middleware/LineageMiddleware.js +9 -4
- package/functions/computation-system-v2/framework/execution/middleware/ProfilerMiddleware.js +9 -21
- package/functions/computation-system-v2/framework/index.js +10 -3
- package/functions/computation-system-v2/framework/lineage/LineageTracker.js +53 -57
- package/functions/computation-system-v2/framework/monitoring/Profiler.js +54 -52
- package/functions/computation-system-v2/framework/resilience/Checkpointer.js +173 -27
- package/functions/computation-system-v2/framework/storage/StorageManager.js +419 -187
- package/functions/computation-system-v2/handlers/index.js +10 -1
- package/functions/computation-system-v2/handlers/scheduler.js +85 -193
- package/functions/computation-system-v2/handlers/worker.js +242 -0
- package/functions/computation-system-v2/index.js +2 -0
- package/functions/computation-system-v2/test/analyze-results.js +238 -0
- package/functions/computation-system-v2/test/{test-dispatcher.js → other/test-dispatcher.js} +6 -6
- package/functions/computation-system-v2/test/{test-framework.js → other/test-framework.js} +14 -14
- package/functions/computation-system-v2/test/{test-real-execution.js → other/test-real-execution.js} +1 -1
- package/functions/computation-system-v2/test/{test-real-integration.js → other/test-real-integration.js} +3 -3
- package/functions/computation-system-v2/test/{test-refactor-e2e.js → other/test-refactor-e2e.js} +3 -3
- package/functions/computation-system-v2/test/{test-risk-metrics-computation.js → other/test-risk-metrics-computation.js} +4 -4
- package/functions/computation-system-v2/test/{test-scheduler.js → other/test-scheduler.js} +1 -1
- package/functions/computation-system-v2/test/{test-storage.js → other/test-storage.js} +2 -2
- package/functions/computation-system-v2/test/run-pipeline-test.js +554 -0
- package/functions/computation-system-v2/test/test-full-pipeline.js +227 -0
- package/functions/computation-system-v2/test/test-worker-pool.js +266 -0
- package/package.json +1 -1
- package/functions/computation-system-v2/computations/TestComputation.js +0 -46
- package/functions/computation-system-v2/test/{test-results.json → other/test-results.json} +0 -0
package/functions/computation-system-v2/framework/execution/RemoteTaskRunner.js
ADDED (@@ -0,0 +1,327 @@)

```js
/**
 * @fileoverview Remote Task Runner (Serverless Worker Pool Client)
 *
 * RESPONSIBILITIES:
 * 1. Package entity data and context into GCS files
 * 2. Invoke remote worker functions in parallel
 * 3. Collect results and errors
 * 4. Handle retries for transient failures
 *
 * DATA FLOW:
 * Orchestrator calls runBatch() -> Upload to GCS -> Invoke Workers -> Collect Results
 *
 * DESIGN PRINCIPLES:
 * - Workers are stateless - all context is passed via GCS
 * - High parallelism - hundreds of concurrent invocations
 * - Fault isolation - one entity failure doesn't affect others
 * - Cost efficient - workers scale to zero between runs
 */

const { Storage } = require('@google-cloud/storage');
const pLimit = require('p-limit');

// For local testing
const { executeLocal } = require('../../handlers/worker');

class RemoteTaskRunner {
  constructor(config, logger = console) {
    this.config = config;
    this.logger = logger;

    // Worker pool configuration
    const poolConfig = config.workerPool || {};
    this.bucketName = poolConfig.tempBucket || 'bulltrackers-worker-staging';
    this.workerUrl = poolConfig.workerUrl;
    this.concurrency = poolConfig.concurrency || 100;
    this.timeout = poolConfig.timeout || 60000; // 60s default
    this.retries = poolConfig.retries || 2;

    // Local mode for testing
    this.localMode = poolConfig.localMode || process.env.WORKER_LOCAL_MODE === 'true';

    // Lazy-initialized clients
    this._storage = null;
    this._authClient = null;
  }

  get storage() {
    if (!this._storage) {
      this._storage = new Storage();
    }
    return this._storage;
  }

  /**
   * Execute a batch of entities remotely (or locally for testing)
   *
   * @param {Object} entry - Manifest entry for the computation
   * @param {string} dateStr - Target date (YYYY-MM-DD)
   * @param {Object} baseContext - Shared context (references, config)
   * @param {string[]} entityIds - Entity IDs to process
   * @param {Map<string, Object>} entityDataMap - Pre-filtered data per entity
   * @param {Object} depResults - Pre-loaded dependency results
   * @returns {Promise<{results: Object, errors: Array}>}
   */
  async runBatch(entry, dateStr, baseContext, entityIds, entityDataMap, depResults) {
    const startTime = Date.now();
    this._log('INFO', `Starting batch: ${entityIds.length} entities for ${entry.name}`);

    if (this.localMode) {
      return this._runBatchLocal(entry, dateStr, baseContext, entityIds, entityDataMap, depResults);
    }

    return this._runBatchRemote(entry, dateStr, baseContext, entityIds, entityDataMap, depResults);
  }

  /**
   * Local execution mode - runs workers in-process
   * Perfect for testing without GCS or network overhead
   */
  async _runBatchLocal(entry, dateStr, baseContext, entityIds, entityDataMap, depResults) {
    const limit = pLimit(this.concurrency);
    const results = {};
    const errors = [];

    const tasks = entityIds.map(entityId => limit(async () => {
      try {
        const contextPackage = this._buildContextPackage(
          entry,
          entityId,
          entityDataMap.get(entityId),
          baseContext,
          depResults
        );

        const { result } = await executeLocal({
          computationName: entry.originalName || entry.name,
          entityId,
          date: dateStr,
          contextPackage
        });

        if (result !== null && result !== undefined) {
          results[entityId] = result;
        }

      } catch (e) {
        this._log('WARN', `Local execution failed for ${entityId}: ${e.message}`);
        errors.push({ entityId, error: e.message });
      }
    }));

    await Promise.all(tasks);

    this._log('INFO', `Local batch complete: ${Object.keys(results).length} results, ${errors.length} errors`);
    return { results, errors };
  }

  /**
   * Remote execution mode - invokes Cloud Functions via HTTP
   */
  async _runBatchRemote(entry, dateStr, baseContext, entityIds, entityDataMap, depResults) {
    const uploadLimit = pLimit(50); // Concurrent uploads to GCS
    const invokeLimit = pLimit(this.concurrency); // Concurrent worker invocations

    const results = {};
    const errors = [];
    const uploadedPaths = [];

    // Phase 1: Upload context packages to GCS
    this._log('INFO', 'Uploading context packages to GCS...');
    const uploadStart = Date.now();

    const uploadTasks = entityIds.map(entityId => uploadLimit(async () => {
      const contextPackage = this._buildContextPackage(
        entry,
        entityId,
        entityDataMap.get(entityId),
        baseContext,
        depResults
      );

      const path = `${dateStr}/${entry.name}/${entityId}.json`;

      try {
        await this._uploadToGCS(path, contextPackage);
        uploadedPaths.push({ entityId, path });
      } catch (e) {
        errors.push({ entityId, error: `Upload failed: ${e.message}` });
      }
    }));

    await Promise.all(uploadTasks);
    this._log('INFO', `Uploaded ${uploadedPaths.length} packages in ${Date.now() - uploadStart}ms`);

    // Phase 2: Invoke workers in parallel
    this._log('INFO', 'Invoking workers...');
    const invokeStart = Date.now();

    const invokeTasks = uploadedPaths.map(({ entityId, path }) =>
      invokeLimit(async () => {
        try {
          const response = await this._invokeWorkerWithRetry({
            computationName: entry.originalName || entry.name,
            entityId,
            date: dateStr,
            dataUri: { bucket: this.bucketName, path }
          });

          if (response.status === 'success' && response.result !== null) {
            results[entityId] = response.result;
          } else if (response.status === 'error') {
            errors.push({ entityId, error: response.error });
          }
          // status === 'success' with result === null means skipped (filtered out)

        } catch (e) {
          errors.push({ entityId, error: e.message });
        }
      })
    );

    await Promise.all(invokeTasks);
    this._log('INFO', `Invocations complete in ${Date.now() - invokeStart}ms`);

    // Phase 3: Cleanup GCS (fire and forget)
    this._cleanupGCS(uploadedPaths.map(p => p.path)).catch(e => {
      this._log('WARN', `GCS cleanup failed: ${e.message}`);
    });

    return { results, errors };
  }

  /**
   * Build the context package for a single entity
   */
  _buildContextPackage(entry, entityId, entityData, baseContext, depResults) {
    // Extract only this entity's dependencies
    const entityDeps = {};
    for (const [depName, allResults] of Object.entries(depResults || {})) {
      if (allResults === null) continue; // Large dependency not preloaded

      if (typeof allResults === 'object') {
        // If it's a map of entity -> result, extract this entity's
        if (allResults[entityId]) {
          entityDeps[depName] = { [entityId]: allResults[entityId] };
        }
      }
    }

    return {
      entityData: entityData || {},
      references: baseContext.references || {},
      dependencies: entityDeps,
      computationMeta: {
        name: entry.name,
        originalName: entry.originalName,
        type: entry.type,
        hash: entry.hash
      },
      config: baseContext.config || {}
    };
  }

  /**
   * Upload a context package to GCS
   */
  async _uploadToGCS(path, data) {
    const file = this.storage.bucket(this.bucketName).file(path);

    await file.save(JSON.stringify(data), {
      contentType: 'application/json',
      resumable: false, // Faster for small files
      metadata: {
        cacheControl: 'no-cache' // Don't cache temp files
      }
    });
  }

  /**
   * Invoke a worker with retry logic
   */
  async _invokeWorkerWithRetry(payload, attempt = 1) {
    try {
      return await this._invokeWorker(payload);
    } catch (e) {
      const isRetryable = this._isRetryableError(e);

      if (isRetryable && attempt < this.retries) {
        // Exponential backoff
        const delay = Math.min(1000 * Math.pow(2, attempt - 1), 10000);
        await new Promise(r => setTimeout(r, delay));

        this._log('DEBUG', `Retrying ${payload.entityId} (attempt ${attempt + 1})`);
        return this._invokeWorkerWithRetry(payload, attempt + 1);
      }

      throw e;
    }
  }

  /**
   * Invoke a single worker via HTTP
   */
  async _invokeWorker(payload) {
    // Lazy-load auth client
    if (!this._authClient) {
      const { GoogleAuth } = require('google-auth-library');
      const auth = new GoogleAuth();
      this._authClient = await auth.getIdTokenClient(this.workerUrl);
    }

    const response = await this._authClient.request({
      url: this.workerUrl,
      method: 'POST',
      data: payload,
      timeout: this.timeout,
      headers: {
        'Content-Type': 'application/json'
      }
    });

    return response.data;
  }

  /**
   * Check if an error is retryable
   */
  _isRetryableError(error) {
    // Network errors
    if (error.code === 'ECONNRESET' || error.code === 'ETIMEDOUT') return true;

    // HTTP 5xx errors (server errors)
    if (error.response && error.response.status >= 500) return true;

    // Rate limiting
    if (error.response && error.response.status === 429) return true;

    return false;
  }

  /**
   * Cleanup uploaded files from GCS
   */
  async _cleanupGCS(paths) {
    // Batch delete
    const bucket = this.storage.bucket(this.bucketName);

    // GCS batch delete has limits, process in chunks
    const chunkSize = 100;
    for (let i = 0; i < paths.length; i += chunkSize) {
      const chunk = paths.slice(i, i + chunkSize);
      await Promise.all(chunk.map(path =>
        bucket.file(path).delete().catch(() => {})
      ));
    }
  }

  _log(level, message) {
    if (this.logger && this.logger.log) {
      this.logger.log(level, `[RemoteTaskRunner] ${message}`);
    } else {
      console.log(`[${level}] [RemoteTaskRunner] ${message}`);
    }
  }
}

module.exports = { RemoteTaskRunner };
```
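For orientation, here is a minimal usage sketch of the runner above. The worker URL, manifest entry, and entity data are illustrative placeholders, and `localMode: true` exercises the in-process path via `executeLocal()`, so no GCS bucket or deployed worker function is needed:

```js
// Hypothetical wiring; config shape mirrors the workerPool block read in the constructor.
const { RemoteTaskRunner } = require('./framework/execution/RemoteTaskRunner'); // path illustrative

const runner = new RemoteTaskRunner({
  workerPool: {
    workerUrl: 'https://example-region-project.cloudfunctions.net/worker', // placeholder
    tempBucket: 'bulltrackers-worker-staging',
    concurrency: 100,
    timeout: 60000,
    retries: 2,
    localMode: true // skip GCS + HTTP, run workers in-process
  }
});

// Illustrative manifest entry and per-entity data.
const entry = { name: 'risk-metrics', originalName: 'risk-metrics', type: 'entity', hash: 'abc123' };
const entityDataMap = new Map([['user-1', { trades: [] }], ['user-2', { trades: [] }]]);

runner
  .runBatch(entry, '2024-01-01', { references: {}, config: {} }, ['user-1', 'user-2'], entityDataMap, {})
  .then(({ results, errors }) => {
    console.log(`${Object.keys(results).length} results, ${errors.length} errors`);
  });
```

In remote mode the same call fans out through GCS uploads and authenticated HTTP invocations, collecting per-entity errors rather than throwing on the first failure.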
package/functions/computation-system-v2/framework/execution/middleware/LineageMiddleware.js
CHANGED

```diff
@@ -10,20 +10,25 @@ class LineageMiddleware extends Middleware {
   async execute(context, next) {
     const result = await next(context);
 
-    // Only track if we have a valid result
-
+    // Only track if we have a valid result
+    // We still call track() per entity, but the Tracker now just aggregates in RAM (very fast)
+    if (result && context.computation) {
+      // Fire and forget - don't await to avoid slowing down execution
       this.tracker.track({
         computation: context.computation.name,
         date: context.date,
         entityId: context.entityId,
-        sourceData: context.data,
+        sourceData: context.data,
         result: result
-      }).catch(e => console.error('Lineage tracking
+      }).catch(e => console.error('Lineage tracking error:', e.message));
     }
 
     return result;
   }
 
+  /**
+   * Called by Orchestrator at the very end of the computation
+   */
   async flush() {
     await this.tracker.flush();
   }
```

package/functions/computation-system-v2/framework/execution/middleware/ProfilerMiddleware.js
CHANGED
```diff
@@ -5,7 +5,7 @@ class ProfilerMiddleware extends Middleware {
   constructor(config) {
     super();
     this.profiler = new ComputationProfiler();
-    this.storageManager = null;
+    this.storageManager = null;
   }
 
   setStorage(storageManager) {
@@ -13,33 +13,21 @@ class ProfilerMiddleware extends Middleware {
   }
 
   async execute(context, next) {
-    const { computation, entityId
+    const { computation, entityId } = context;
 
     // Start Profile
     const key = this.profiler.startProfile(computation.name, entityId || 'global');
 
     try {
-
-      const result = await next(context);
-      return result;
+      return await next(context);
     } finally {
-      // End Profile (
-
-
-        entityId: entityId || 'global',
-        resultSize
+      // End Profile (Just tracks stats in memory now)
+      this.profiler.endProfile(key, {
+        entityId: entityId || 'global'
       });
-
-      //
-
-      // Async save (don't block)
-      this.storageManager.savePerformanceReport(date, {
-        computations: [{
-          name: computation.name,
-          ...profile
-        }]
-      }).catch(err => console.error('Failed to save profile', err));
-    }
+
+      // NOTE: We no longer save to storage here to prevent log spam/DB load.
+      // The Orchestrator handles saving the aggregated report.
     }
   }
 }
```
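Both middlewares rely on the same `execute(context, next)` contract. The `Middleware` base class and the code that composes the chain are not part of this diff; the sketch below is only a plausible minimal reconstruction of that pattern, showing how the profiler can wrap the lineage tracker, which wraps the computation itself:

```js
// Hypothetical minimal chain runner; the package's actual Middleware/Orchestrator
// wiring is not shown in this diff.
function runChain(middlewares, context, terminal) {
  const dispatch = (i) => (ctx) =>
    i < middlewares.length
      ? middlewares[i].execute(ctx, dispatch(i + 1)) // each middleware calls next(ctx)
      : terminal(ctx);                               // innermost: the computation itself
  return dispatch(0)(context);
}

// e.g. profiler outermost, lineage inside it, computation at the core:
// await runChain([profilerMiddleware, lineageMiddleware], context,
//   (ctx) => ctx.computation.compute(ctx));
```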
package/functions/computation-system-v2/framework/index.js
CHANGED

```diff
@@ -1,15 +1,20 @@
 /**
  * @fileoverview Framework exports
- *
+ *
+ * This is the public API of the computation framework.
  * Computations only need to import from here.
  */
 
 // Core
 const { Computation } = require('./core/Computation');
-const { Orchestrator } = require('./execution/Orchestrator');
+const { Orchestrator } = require('./execution/Orchestrator');
 const { ManifestBuilder } = require('./core/Manifest');
 const { RulesRegistry } = require('./core/Rules');
 
+// Execution
+const { TaskRunner } = require('./execution/TaskRunner');
+const { RemoteTaskRunner } = require('./execution/RemoteTaskRunner');
+
 // Scheduling
 const { ScheduleValidator } = require('./scheduling/ScheduleValidator');
 
@@ -26,7 +31,9 @@ module.exports = {
   Computation,
 
   // Execution engine
-  Orchestrator,
+  Orchestrator,
+  TaskRunner,
+  RemoteTaskRunner, // Serverless worker pool client
   ManifestBuilder,
 
   // Business rules
```
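With both runners re-exported, consumers keep a single entry point. An illustrative import (the in-process vs. serverless split is inferred from the names and the RemoteTaskRunner file above):

```js
// Illustrative consumer import through the framework's public API.
const {
  Orchestrator,
  TaskRunner,       // presumably in-process execution
  RemoteTaskRunner, // serverless worker pool (GCS + HTTP fan-out)
} = require('./functions/computation-system-v2/framework');
```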
package/functions/computation-system-v2/framework/lineage/LineageTracker.js
CHANGED

```diff
@@ -1,18 +1,14 @@
 /**
- * @fileoverview Data Lineage Tracker
+ * @fileoverview Data Lineage Tracker (Aggregated)
  * * Tracks the provenance of computation results.
- * *
- * *
+ * * UPDATE: Refactored to AGGREGATE lineage at the computation level.
+ * * Instead of 1 row per entity, it produces 1 row per computation run with total source counts.
  */
 
 const { BigQuery } = require('@google-cloud/bigquery');
 const crypto = require('crypto');
 
 class LineageTracker {
-  /**
-   * @param {Object} config - System configuration
-   * @param {Object} [logger] - Logger instance
-   */
   constructor(config, logger = null) {
     this.config = config;
     this.logger = logger || console;
@@ -25,48 +21,62 @@ class LineageTracker {
     this.datasetId = config.bigquery.dataset;
     this.tableName = 'data_lineage';
 
-
-    this.
+    // Aggregation State: Map<computationName, { date, sources: Map<table, count> }>
+    this.aggregationState = new Map();
     this._tableChecked = false;
   }
 
   /**
-   *
-   *
-   * @param {string} params.computation - Computation name
-   * @param {string} params.date - Execution date
-   * @param {string} params.entityId - Entity ID
-   * @param {Object} params.sourceData - The actual input data object used
-   * @param {Object} params.result - The result object produced
+   * Accumulate lineage stats for a specific entity result.
+   * Does NOT write to BigQuery immediately.
    */
   async track({ computation, date, entityId, sourceData, result }) {
     if (!sourceData) return;
 
-
-
-
-
-
-
-
-
-      result_hash: resultHash,
-      timestamp: new Date().toISOString()
-    });
-
-    if (this.buffer.length >= this.BUFFER_SIZE) {
-      await this.flush();
+    // Initialize aggregation bucket if needed
+    if (!this.aggregationState.has(computation)) {
+      this.aggregationState.set(computation, {
+        date,
+        computation,
+        sources: new Map(), // Table -> Count
+        firstTimestamp: new Date().toISOString()
+      });
     }
+
+    const state = this.aggregationState.get(computation);
+
+    // Summarize and merge sources
+    this._mergeSources(state.sources, sourceData);
   }
 
   /**
-   *
+   * Finalize and write aggregated lineage to BigQuery.
+   * Called by Orchestrator at the end of execution.
    */
   async flush() {
-    if (this.
+    if (this.aggregationState.size === 0) return;
 
-    const batch = [
-
+    const batch = [];
+
+    for (const [compName, state] of this.aggregationState.entries()) {
+      const sourcesSummary = Array.from(state.sources.entries()).map(([table, count]) => ({
+        table,
+        total_rows_used: count
+      }));
+
+      // Create a single summary row
+      batch.push({
+        date: state.date,
+        computation_name: compName,
+        entity_id: 'AGGREGATED_BATCH', // explicit marker
+        sources_json: JSON.stringify(sourcesSummary),
+        result_hash: 'AGGREGATED', // Individual hashes aren't useful in summary
+        timestamp: new Date().toISOString()
+      });
+    }
+
+    // Clear memory immediately
+    this.aggregationState.clear();
 
     try {
       await this._ensureTable();
@@ -76,43 +86,29 @@ class LineageTracker {
         .table(this.tableName)
         .insert(batch);
 
-      this._log('
+      this._log('INFO', `Saved aggregated lineage for ${batch.length} computations`);
     } catch (e) {
-      this._log('ERROR', `Failed to
-      // In a production system, we might want to retry or dump to a dead-letter file
+      this._log('ERROR', `Failed to save lineage: ${e.message}`);
     }
   }
 
   /**
-   *
-   * @param {Object} data - Input data map { tableName: data }
-   * @returns {Array} Summary of sources [{ table, count, ... }]
+   * Merges current entity's data usage into the global counter.
    */
-
-
+  _mergeSources(aggSources, currentData) {
+    Object.entries(currentData).forEach(([table, content]) => {
       let count = 0;
-      let meta = null;
-
       if (Array.isArray(content)) {
         count = content.length;
       } else if (content && typeof content === 'object') {
         count = Object.keys(content).length;
       }
 
-
-
-        meta = content._metadata;
-      }
-
-      return { table, count, meta };
+      const currentTotal = aggSources.get(table) || 0;
+      aggSources.set(table, currentTotal + count);
     });
   }
 
-  _hash(data) {
-    const str = typeof data === 'string' ? data : JSON.stringify(data);
-    return crypto.createHash('md5').update(str || '').digest('hex').substring(0, 16);
-  }
-
   async _ensureTable() {
     if (this._tableChecked) return;
 
@@ -127,12 +123,12 @@ class LineageTracker {
         { name: 'date', type: 'DATE', mode: 'REQUIRED' },
         { name: 'computation_name', type: 'STRING', mode: 'REQUIRED' },
         { name: 'entity_id', type: 'STRING', mode: 'REQUIRED' },
-        { name: 'sources_json', type: 'STRING', mode: 'REQUIRED' },
+        { name: 'sources_json', type: 'STRING', mode: 'REQUIRED' },
         { name: 'result_hash', type: 'STRING', mode: 'NULLABLE' },
         { name: 'timestamp', type: 'TIMESTAMP', mode: 'REQUIRED' }
       ],
       timePartitioning: { type: 'DAY', field: 'date' },
-      clustering: { fields: ['computation_name'
+      clustering: { fields: ['computation_name'] }
     });
   }
   this._tableChecked = true;
```