bulltrackers-module 1.0.735 → 1.0.736

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. package/functions/computation-system-v2/config/bulltrackers.config.js +75 -5
  2. package/functions/computation-system-v2/framework/data/DataFetcher.js +107 -105
  3. package/functions/computation-system-v2/framework/execution/Orchestrator.js +357 -150
  4. package/functions/computation-system-v2/framework/execution/RemoteTaskRunner.js +327 -0
  5. package/functions/computation-system-v2/framework/execution/middleware/LineageMiddleware.js +9 -4
  6. package/functions/computation-system-v2/framework/execution/middleware/ProfilerMiddleware.js +9 -21
  7. package/functions/computation-system-v2/framework/index.js +10 -3
  8. package/functions/computation-system-v2/framework/lineage/LineageTracker.js +53 -57
  9. package/functions/computation-system-v2/framework/monitoring/Profiler.js +54 -52
  10. package/functions/computation-system-v2/framework/resilience/Checkpointer.js +173 -27
  11. package/functions/computation-system-v2/framework/storage/StorageManager.js +419 -187
  12. package/functions/computation-system-v2/handlers/index.js +10 -1
  13. package/functions/computation-system-v2/handlers/scheduler.js +85 -193
  14. package/functions/computation-system-v2/handlers/worker.js +242 -0
  15. package/functions/computation-system-v2/test/analyze-results.js +238 -0
  16. package/functions/computation-system-v2/test/{test-dispatcher.js → other/test-dispatcher.js} +6 -6
  17. package/functions/computation-system-v2/test/{test-framework.js → other/test-framework.js} +14 -14
  18. package/functions/computation-system-v2/test/{test-real-execution.js → other/test-real-execution.js} +1 -1
  19. package/functions/computation-system-v2/test/{test-real-integration.js → other/test-real-integration.js} +3 -3
  20. package/functions/computation-system-v2/test/{test-refactor-e2e.js → other/test-refactor-e2e.js} +3 -3
  21. package/functions/computation-system-v2/test/{test-risk-metrics-computation.js → other/test-risk-metrics-computation.js} +4 -4
  22. package/functions/computation-system-v2/test/{test-scheduler.js → other/test-scheduler.js} +1 -1
  23. package/functions/computation-system-v2/test/{test-storage.js → other/test-storage.js} +2 -2
  24. package/functions/computation-system-v2/test/run-pipeline-test.js +554 -0
  25. package/functions/computation-system-v2/test/test-worker-pool.js +494 -0
  26. package/package.json +1 -1
  27. package/functions/computation-system-v2/computations/TestComputation.js +0 -46
  28. package/functions/computation-system-v2/test/{test-results.json → other/test-results.json} +0 -0
package/functions/computation-system-v2/framework/execution/RemoteTaskRunner.js
@@ -0,0 +1,327 @@
+ /**
+  * @fileoverview Remote Task Runner (Serverless Worker Pool Client)
+  *
+  * RESPONSIBILITIES:
+  * 1. Package entity data and context into GCS files
+  * 2. Invoke remote worker functions in parallel
+  * 3. Collect results and errors
+  * 4. Handle retries for transient failures
+  *
+  * DATA FLOW:
+  * Orchestrator calls runBatch() -> Upload to GCS -> Invoke Workers -> Collect Results
+  *
+  * DESIGN PRINCIPLES:
+  * - Workers are stateless - all context is passed via GCS
+  * - High parallelism - hundreds of concurrent invocations
+  * - Fault isolation - one entity failure doesn't affect others
+  * - Cost efficient - workers scale to zero between runs
+  */
+
+ const { Storage } = require('@google-cloud/storage');
+ const pLimit = require('p-limit');
+
+ // For local testing
+ const { executeLocal } = require('../../handlers/worker');
+
+ class RemoteTaskRunner {
+   constructor(config, logger = console) {
+     this.config = config;
+     this.logger = logger;
+
+     // Worker pool configuration
+     const poolConfig = config.workerPool || {};
+     this.bucketName = poolConfig.tempBucket || 'bulltrackers-worker-staging';
+     this.workerUrl = poolConfig.workerUrl;
+     this.concurrency = poolConfig.concurrency || 100;
+     this.timeout = poolConfig.timeout || 60000; // 60s default
+     this.retries = poolConfig.retries || 2;
+
+     // Local mode for testing
+     this.localMode = poolConfig.localMode || process.env.WORKER_LOCAL_MODE === 'true';
+
+     // Lazy-initialized clients
+     this._storage = null;
+     this._authClient = null;
+   }
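
For reference, a minimal sketch of the workerPool config block this constructor reads (the +75 -5 change to bulltrackers.config.js is not shown in this diff). Key names match the lookups above; the values are illustrative assumptions beyond the defaults visible in the constructor, and workerUrl is a placeholder, not a real endpoint.

// Hypothetical bulltrackers.config.js excerpt
module.exports = {
  workerPool: {
    tempBucket: 'bulltrackers-worker-staging', // GCS staging bucket (constructor default)
    workerUrl: 'https://REGION-PROJECT.cloudfunctions.net/worker', // placeholder
    concurrency: 100,  // max concurrent worker invocations
    timeout: 60000,    // per-invocation timeout (ms)
    retries: 2,        // attempts allowed for transient failures
    localMode: false   // true = run workers in-process via executeLocal()
  }
};
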
+
+   get storage() {
+     if (!this._storage) {
+       this._storage = new Storage();
+     }
+     return this._storage;
+   }
+
+   /**
+    * Execute a batch of entities remotely (or locally for testing)
+    *
+    * @param {Object} entry - Manifest entry for the computation
+    * @param {string} dateStr - Target date (YYYY-MM-DD)
+    * @param {Object} baseContext - Shared context (references, config)
+    * @param {string[]} entityIds - Entity IDs to process
+    * @param {Map<string, Object>} entityDataMap - Pre-filtered data per entity
+    * @param {Object} depResults - Pre-loaded dependency results
+    * @returns {Promise<{results: Object, errors: Array}>}
+    */
+   async runBatch(entry, dateStr, baseContext, entityIds, entityDataMap, depResults) {
+     const startTime = Date.now();
+     this._log('INFO', `Starting batch: ${entityIds.length} entities for ${entry.name}`);
+
+     if (this.localMode) {
+       return this._runBatchLocal(entry, dateStr, baseContext, entityIds, entityDataMap, depResults);
+     }
+
+     return this._runBatchRemote(entry, dateStr, baseContext, entityIds, entityDataMap, depResults);
+   }
+
+   /**
+    * Local execution mode - runs workers in-process
+    * Perfect for testing without GCS or network overhead
+    */
+   async _runBatchLocal(entry, dateStr, baseContext, entityIds, entityDataMap, depResults) {
+     const limit = pLimit(this.concurrency);
+     const results = {};
+     const errors = [];
+
+     const tasks = entityIds.map(entityId => limit(async () => {
+       try {
+         const contextPackage = this._buildContextPackage(
+           entry,
+           entityId,
+           entityDataMap.get(entityId),
+           baseContext,
+           depResults
+         );
+
+         const { result } = await executeLocal({
+           computationName: entry.originalName || entry.name,
+           entityId,
+           date: dateStr,
+           contextPackage
+         });
+
+         if (result !== null && result !== undefined) {
+           results[entityId] = result;
+         }
+
+       } catch (e) {
+         this._log('WARN', `Local execution failed for ${entityId}: ${e.message}`);
+         errors.push({ entityId, error: e.message });
+       }
+     }));
+
+     await Promise.all(tasks);
+
+     this._log('INFO', `Local batch complete: ${Object.keys(results).length} results, ${errors.length} errors`);
+     return { results, errors };
+   }
+
+   /**
+    * Remote execution mode - invokes Cloud Functions via HTTP
+    */
+   async _runBatchRemote(entry, dateStr, baseContext, entityIds, entityDataMap, depResults) {
+     const uploadLimit = pLimit(50); // Concurrent uploads to GCS
+     const invokeLimit = pLimit(this.concurrency); // Concurrent worker invocations
+
+     const results = {};
+     const errors = [];
+     const uploadedPaths = [];
+
+     // Phase 1: Upload context packages to GCS
+     this._log('INFO', 'Uploading context packages to GCS...');
+     const uploadStart = Date.now();
+
+     const uploadTasks = entityIds.map(entityId => uploadLimit(async () => {
+       const contextPackage = this._buildContextPackage(
+         entry,
+         entityId,
+         entityDataMap.get(entityId),
+         baseContext,
+         depResults
+       );
+
+       const path = `${dateStr}/${entry.name}/${entityId}.json`;
+
+       try {
+         await this._uploadToGCS(path, contextPackage);
+         uploadedPaths.push({ entityId, path });
+       } catch (e) {
+         errors.push({ entityId, error: `Upload failed: ${e.message}` });
+       }
+     }));
+
+     await Promise.all(uploadTasks);
+     this._log('INFO', `Uploaded ${uploadedPaths.length} packages in ${Date.now() - uploadStart}ms`);
+
+     // Phase 2: Invoke workers in parallel
+     this._log('INFO', 'Invoking workers...');
+     const invokeStart = Date.now();
+
+     const invokeTasks = uploadedPaths.map(({ entityId, path }) =>
+       invokeLimit(async () => {
+         try {
+           const response = await this._invokeWorkerWithRetry({
+             computationName: entry.originalName || entry.name,
+             entityId,
+             date: dateStr,
+             dataUri: { bucket: this.bucketName, path }
+           });
+
+           if (response.status === 'success' && response.result !== null) {
+             results[entityId] = response.result;
+           } else if (response.status === 'error') {
+             errors.push({ entityId, error: response.error });
+           }
+           // status === 'success' with result === null means skipped (filtered out)
+
+         } catch (e) {
+           errors.push({ entityId, error: e.message });
+         }
+       })
+     );
+
+     await Promise.all(invokeTasks);
+     this._log('INFO', `Invocations complete in ${Date.now() - invokeStart}ms`);
+
+     // Phase 3: Cleanup GCS (fire and forget)
+     this._cleanupGCS(uploadedPaths.map(p => p.path)).catch(e => {
+       this._log('WARN', `GCS cleanup failed: ${e.message}`);
+     });
+
+     return { results, errors };
+   }
+
+   /**
+    * Build the context package for a single entity
+    */
+   _buildContextPackage(entry, entityId, entityData, baseContext, depResults) {
+     // Extract only this entity's dependencies
+     const entityDeps = {};
+     for (const [depName, allResults] of Object.entries(depResults || {})) {
+       if (allResults === null) continue; // Large dependency not preloaded
+
+       if (typeof allResults === 'object') {
+         // If it's a map of entity -> result, extract this entity's slice
+         if (allResults[entityId]) {
+           entityDeps[depName] = { [entityId]: allResults[entityId] };
+         }
+       }
+     }
+
+     return {
+       entityData: entityData || {},
+       references: baseContext.references || {},
+       dependencies: entityDeps,
+       computationMeta: {
+         name: entry.name,
+         originalName: entry.originalName,
+         type: entry.type,
+         hash: entry.hash
+       },
+       config: baseContext.config || {}
+     };
+   }
+
+   /**
+    * Upload a context package to GCS
+    */
+   async _uploadToGCS(path, data) {
+     const file = this.storage.bucket(this.bucketName).file(path);
+
+     await file.save(JSON.stringify(data), {
+       contentType: 'application/json',
+       resumable: false, // Faster for small files
+       metadata: {
+         cacheControl: 'no-cache' // Don't cache temp files
+       }
+     });
+   }
+
+   /**
+    * Invoke a worker with retry logic
+    */
+   async _invokeWorkerWithRetry(payload, attempt = 1) {
+     try {
+       return await this._invokeWorker(payload);
+     } catch (e) {
+       const isRetryable = this._isRetryableError(e);
+
+       if (isRetryable && attempt < this.retries) {
+         // Exponential backoff
+         const delay = Math.min(1000 * Math.pow(2, attempt - 1), 10000);
+         await new Promise(r => setTimeout(r, delay));
+
+         this._log('DEBUG', `Retrying ${payload.entityId} (attempt ${attempt + 1})`);
+         return this._invokeWorkerWithRetry(payload, attempt + 1);
+       }
+
+       throw e;
+     }
+   }
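
Concretely: with the default retries = 2, a transient failure gets a single retry after a 1000 ms delay (the attempt < this.retries guard stops a second one). If retries were raised to, say, 5, the backoff sequence would be 1000, 2000, 4000, 8000 ms, with the Math.min cap holding any later delay at 10000 ms.
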
+
+   /**
+    * Invoke a single worker via HTTP
+    */
+   async _invokeWorker(payload) {
+     // Lazy-load auth client
+     if (!this._authClient) {
+       const { GoogleAuth } = require('google-auth-library');
+       const auth = new GoogleAuth();
+       this._authClient = await auth.getIdTokenClient(this.workerUrl);
+     }
+
+     const response = await this._authClient.request({
+       url: this.workerUrl,
+       method: 'POST',
+       data: payload,
+       timeout: this.timeout,
+       headers: {
+         'Content-Type': 'application/json'
+       }
+     });
+
+     return response.data;
+   }
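
The request body each worker receives therefore looks like the following sketch. Field names mirror the payload built in _runBatchRemote above; the computation name, entity, date, and path values are hypothetical.

// Illustrative worker request body (values are assumptions for illustration)
const examplePayload = {
  computationName: 'riskMetrics', // hypothetical computation
  entityId: 'AAPL',               // hypothetical entity
  date: '2024-01-15',
  dataUri: {
    bucket: 'bulltrackers-worker-staging',
    path: '2024-01-15/riskMetrics/AAPL.json'
  }
};
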
+
+   /**
+    * Check if an error is retryable
+    */
+   _isRetryableError(error) {
+     // Network errors
+     if (error.code === 'ECONNRESET' || error.code === 'ETIMEDOUT') return true;
+
+     // HTTP 5xx errors (server errors)
+     if (error.response && error.response.status >= 500) return true;
+
+     // Rate limiting
+     if (error.response && error.response.status === 429) return true;
+
+     return false;
+   }
+
+   /**
+    * Cleanup uploaded files from GCS
+    */
+   async _cleanupGCS(paths) {
+     // Batch delete
+     const bucket = this.storage.bucket(this.bucketName);
+
+     // GCS batch delete has limits, process in chunks
+     const chunkSize = 100;
+     for (let i = 0; i < paths.length; i += chunkSize) {
+       const chunk = paths.slice(i, i + chunkSize);
+       await Promise.all(chunk.map(path =>
+         bucket.file(path).delete().catch(() => {})
+       ));
+     }
+   }
+
+   _log(level, message) {
+     if (this.logger && this.logger.log) {
+       this.logger.log(level, `[RemoteTaskRunner] ${message}`);
+     } else {
+       console.log(`[${level}] [RemoteTaskRunner] ${message}`);
+     }
+   }
+ }
+
+ module.exports = { RemoteTaskRunner };
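
A minimal usage sketch, assuming local mode so no GCS bucket or deployed worker is required; the manifest entry, entity data, and computation name are hypothetical placeholders:

const { RemoteTaskRunner } = require('./framework/execution/RemoteTaskRunner');

const runner = new RemoteTaskRunner({ workerPool: { localMode: true, concurrency: 10 } });

const entry = { name: 'riskMetrics', type: 'per-entity' };  // hypothetical manifest entry
const entityData = new Map([['AAPL', { prices: [] }]]);     // hypothetical pre-filtered data

runner
  .runBatch(entry, '2024-01-15', { references: {}, config: {} }, ['AAPL'], entityData, {})
  .then(({ results, errors }) => console.log(results, errors));
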
package/functions/computation-system-v2/framework/execution/middleware/LineageMiddleware.js
@@ -10,20 +10,25 @@ class LineageMiddleware extends Middleware {
   async execute(context, next) {
     const result = await next(context);
 
-    // Only track if we have a valid result and entity
-    if (result && context.entityId) {
+    // Only track if we have a valid result
+    // We still call track() per entity, but the Tracker now just aggregates in RAM (very fast)
+    if (result && context.computation) {
+      // Fire and forget - don't await to avoid slowing down execution
       this.tracker.track({
         computation: context.computation.name,
         date: context.date,
         entityId: context.entityId,
-        sourceData: context.data, // The slice of data used
+        sourceData: context.data,
         result: result
-      }).catch(e => console.error('Lineage tracking failed', e));
+      }).catch(e => console.error('Lineage tracking error:', e.message));
     }
 
     return result;
   }
 
+  /**
+   * Called by Orchestrator at the very end of the computation
+   */
   async flush() {
     await this.tracker.flush();
   }
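
The new flush() hook implies a finalization pass in the Orchestrator; a hypothetical sketch of that call site (the Orchestrator's actual code lives in Orchestrator.js, whose +357 -150 change is not shown here):

// Hypothetical end-of-run hook - names are illustrative, not the package's API
async function finalizeRun(middlewares) {
  for (const mw of middlewares) {
    if (typeof mw.flush === 'function') {
      await mw.flush(); // LineageMiddleware delegates to LineageTracker.flush()
    }
  }
}
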
package/functions/computation-system-v2/framework/execution/middleware/ProfilerMiddleware.js
@@ -5,7 +5,7 @@ class ProfilerMiddleware extends Middleware {
   constructor(config) {
     super();
     this.profiler = new ComputationProfiler();
-    this.storageManager = null; // Injected by Orchestrator
+    this.storageManager = null;
   }
 
   setStorage(storageManager) {
@@ -13,33 +13,21 @@ class ProfilerMiddleware extends Middleware {
   }
 
   async execute(context, next) {
-    const { computation, entityId, date } = context;
+    const { computation, entityId } = context;
 
     // Start Profile
     const key = this.profiler.startProfile(computation.name, entityId || 'global');
 
     try {
-      // Run Next
-      const result = await next(context);
-      return result;
+      return await next(context);
     } finally {
-      // End Profile (runs even if error)
-      const resultSize = context.results ? JSON.stringify(context.results).length : 0;
-      const profile = this.profiler.endProfile(key, {
-        entityId: entityId || 'global',
-        resultSize
+      // End Profile (Just tracks stats in memory now)
+      this.profiler.endProfile(key, {
+        entityId: entityId || 'global'
       });
-
-      // Persist Profile if storage available
-      if (this.storageManager && profile) {
-        // Async save (don't block)
-        this.storageManager.savePerformanceReport(date, {
-          computations: [{
-            name: computation.name,
-            ...profile
-          }]
-        }).catch(err => console.error('Failed to save profile', err));
-      }
+
+      // NOTE: We no longer save to storage here to prevent log spam/DB load.
+      // The Orchestrator handles saving the aggregated report.
     }
   }
 }
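
A hypothetical sketch of the single aggregated save the NOTE refers to, presumably run once from the Orchestrator's finalization step; getSummary() is an assumed Profiler accessor, not a confirmed API, and the fallback shape is borrowed from the removed per-entity code:

// Hypothetical - one savePerformanceReport() call per run instead of one per entity
async function persistAggregatedProfile(profiler, storageManager, dateStr) {
  const report = typeof profiler.getSummary === 'function'
    ? profiler.getSummary()     // assumed accessor
    : { computations: [] };     // shape taken from the removed code above
  await storageManager
    .savePerformanceReport(dateStr, report)
    .catch(err => console.error('Failed to save aggregated profile:', err.message));
}
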
package/functions/computation-system-v2/framework/index.js
@@ -1,15 +1,20 @@
 /**
  * @fileoverview Framework exports
- * * This is the public API of the computation framework.
+ *
+ * This is the public API of the computation framework.
  * Computations only need to import from here.
  */
 
 // Core
 const { Computation } = require('./core/Computation');
-const { Orchestrator } = require('./execution/Orchestrator'); // <--- CHANGED
+const { Orchestrator } = require('./execution/Orchestrator');
 const { ManifestBuilder } = require('./core/Manifest');
 const { RulesRegistry } = require('./core/Rules');
 
+// Execution
+const { TaskRunner } = require('./execution/TaskRunner');
+const { RemoteTaskRunner } = require('./execution/RemoteTaskRunner');
+
 // Scheduling
 const { ScheduleValidator } = require('./scheduling/ScheduleValidator');
 
@@ -26,7 +31,9 @@ module.exports = {
   Computation,
 
   // Execution engine
-  Orchestrator, // <--- CHANGED
+  Orchestrator,
+  TaskRunner,
+  RemoteTaskRunner, // Serverless worker pool client
   ManifestBuilder,
 
   // Business rules
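
With TaskRunner and RemoteTaskRunner now exported, callers can pull the worker pool client from the framework's public API; a minimal sketch (the require path depends on where the caller lives):

const { Orchestrator, RemoteTaskRunner } = require('./framework');

const runner = new RemoteTaskRunner(config); // reads config.workerPool as sketched earlier
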
package/functions/computation-system-v2/framework/lineage/LineageTracker.js
@@ -1,18 +1,14 @@
 /**
- * @fileoverview Data Lineage Tracker
+ * @fileoverview Data Lineage Tracker (Aggregated)
  * * Tracks the provenance of computation results.
- * * Records which source data (tables, record counts) contributed to specific entity results.
- * * Uses internal buffering to handle high-volume writes efficiently.
+ * * UPDATE: Refactored to AGGREGATE lineage at the computation level.
+ * * Instead of 1 row per entity, it produces 1 row per computation run with total source counts.
  */
 
 const { BigQuery } = require('@google-cloud/bigquery');
 const crypto = require('crypto');
 
 class LineageTracker {
-  /**
-   * @param {Object} config - System configuration
-   * @param {Object} [logger] - Logger instance
-   */
   constructor(config, logger = null) {
     this.config = config;
     this.logger = logger || console;
@@ -25,48 +21,62 @@ class LineageTracker {
     this.datasetId = config.bigquery.dataset;
     this.tableName = 'data_lineage';
 
-    this.buffer = [];
-    this.BUFFER_SIZE = config.execution?.lineageBatchSize || 500;
+    // Aggregation State: Map<computationName, { date, sources: Map<table, count> }>
+    this.aggregationState = new Map();
     this._tableChecked = false;
   }
 
   /**
-   * Track lineage for a specific computation result.
-   * @param {Object} params
-   * @param {string} params.computation - Computation name
-   * @param {string} params.date - Execution date
-   * @param {string} params.entityId - Entity ID
-   * @param {Object} params.sourceData - The actual input data object used
-   * @param {Object} params.result - The result object produced
+   * Accumulate lineage stats for a specific entity result.
+   * Does NOT write to BigQuery immediately.
    */
   async track({ computation, date, entityId, sourceData, result }) {
     if (!sourceData) return;
 
-    const sourceSummary = this._summarizeSources(sourceData);
-    const resultHash = this._hash(result);
-
-    this.buffer.push({
-      date,
-      computation_name: computation,
-      entity_id: String(entityId),
-      sources_json: JSON.stringify(sourceSummary),
-      result_hash: resultHash,
-      timestamp: new Date().toISOString()
-    });
-
-    if (this.buffer.length >= this.BUFFER_SIZE) {
-      await this.flush();
+    // Initialize aggregation bucket if needed
+    if (!this.aggregationState.has(computation)) {
+      this.aggregationState.set(computation, {
+        date,
+        computation,
+        sources: new Map(), // Table -> Count
+        firstTimestamp: new Date().toISOString()
+      });
     }
+
+    const state = this.aggregationState.get(computation);
+
+    // Summarize and merge sources
+    this._mergeSources(state.sources, sourceData);
   }
 
   /**
-   * Force write any buffered records to BigQuery.
+   * Finalize and write aggregated lineage to BigQuery.
+   * Called by Orchestrator at the end of execution.
    */
   async flush() {
-    if (this.buffer.length === 0) return;
+    if (this.aggregationState.size === 0) return;
 
-    const batch = [...this.buffer];
-    this.buffer = []; // Clear immediately
+    const batch = [];
+
+    for (const [compName, state] of this.aggregationState.entries()) {
+      const sourcesSummary = Array.from(state.sources.entries()).map(([table, count]) => ({
+        table,
+        total_rows_used: count
+      }));
+
+      // Create a single summary row
+      batch.push({
+        date: state.date,
+        computation_name: compName,
+        entity_id: 'AGGREGATED_BATCH', // explicit marker
+        sources_json: JSON.stringify(sourcesSummary),
+        result_hash: 'AGGREGATED', // Individual hashes aren't useful in summary
+        timestamp: new Date().toISOString()
+      });
+    }
+
+    // Clear memory immediately
+    this.aggregationState.clear();
 
     try {
       await this._ensureTable();
@@ -76,43 +86,29 @@ class LineageTracker {
         .table(this.tableName)
         .insert(batch);
 
-      this._log('DEBUG', `Flushed ${batch.length} lineage records`);
+      this._log('INFO', `Saved aggregated lineage for ${batch.length} computations`);
     } catch (e) {
-      this._log('ERROR', `Failed to flush lineage buffer: ${e.message}`);
-      // In a production system, we might want to retry or dump to a dead-letter file
+      this._log('ERROR', `Failed to save lineage: ${e.message}`);
     }
   }
 
   /**
-   * Analyze source data to create a lightweight summary.
-   * @param {Object} data - Input data map { tableName: data }
-   * @returns {Array} Summary of sources [{ table, count, ... }]
+   * Merges current entity's data usage into the global counter.
    */
-  _summarizeSources(data) {
-    return Object.entries(data).map(([table, content]) => {
+  _mergeSources(aggSources, currentData) {
+    Object.entries(currentData).forEach(([table, content]) => {
      let count = 0;
-      let meta = null;
-
      if (Array.isArray(content)) {
        count = content.length;
      } else if (content && typeof content === 'object') {
        count = Object.keys(content).length;
      }
 
-      // Optional: If data has specific metadata fields (like 'version' or 'snapshot_id'), extract them
-      if (content && content._metadata) {
-        meta = content._metadata;
-      }
-
-      return { table, count, meta };
+      const currentTotal = aggSources.get(table) || 0;
+      aggSources.set(table, currentTotal + count);
    });
  }
 
-  _hash(data) {
-    const str = typeof data === 'string' ? data : JSON.stringify(data);
-    return crypto.createHash('md5').update(str || '').digest('hex').substring(0, 16);
-  }
-
  async _ensureTable() {
    if (this._tableChecked) return;
 
@@ -127,12 +123,12 @@ class LineageTracker {
      { name: 'date', type: 'DATE', mode: 'REQUIRED' },
      { name: 'computation_name', type: 'STRING', mode: 'REQUIRED' },
      { name: 'entity_id', type: 'STRING', mode: 'REQUIRED' },
-      { name: 'sources_json', type: 'STRING', mode: 'REQUIRED' }, // JSON array of source summaries
+      { name: 'sources_json', type: 'STRING', mode: 'REQUIRED' },
      { name: 'result_hash', type: 'STRING', mode: 'NULLABLE' },
      { name: 'timestamp', type: 'TIMESTAMP', mode: 'REQUIRED' }
    ],
    timePartitioning: { type: 'DAY', field: 'date' },
-    clustering: { fields: ['computation_name', 'entity_id'] }
+    clustering: { fields: ['computation_name'] }
  });
 }
 this._tableChecked = true;
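
For reference, the single summary row flush() now writes per computation looks like this sketch. The fixed markers and column names come from the code and schema above; the date, computation name, and source counts are illustrative:

// Illustrative row - one per computation per run, instead of one per entity
const exampleRow = {
  date: '2024-01-15',
  computation_name: 'riskMetrics', // hypothetical computation
  entity_id: 'AGGREGATED_BATCH',   // fixed marker set by flush()
  sources_json: '[{"table":"prices","total_rows_used":50000}]', // hypothetical counts
  result_hash: 'AGGREGATED',
  timestamp: '2024-01-15T23:59:59.000Z'
};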