bulltrackers-module 1.0.735 → 1.0.737
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/functions/computation-system-v2/config/bulltrackers.config.js +80 -6
- package/functions/computation-system-v2/docs/architecture.md +59 -0
- package/functions/computation-system-v2/framework/data/DataFetcher.js +107 -105
- package/functions/computation-system-v2/framework/execution/Orchestrator.js +357 -150
- package/functions/computation-system-v2/framework/execution/RemoteTaskRunner.js +327 -0
- package/functions/computation-system-v2/framework/execution/middleware/LineageMiddleware.js +9 -4
- package/functions/computation-system-v2/framework/execution/middleware/ProfilerMiddleware.js +9 -21
- package/functions/computation-system-v2/framework/index.js +10 -3
- package/functions/computation-system-v2/framework/lineage/LineageTracker.js +53 -57
- package/functions/computation-system-v2/framework/monitoring/Profiler.js +54 -52
- package/functions/computation-system-v2/framework/resilience/Checkpointer.js +173 -27
- package/functions/computation-system-v2/framework/storage/StorageManager.js +419 -187
- package/functions/computation-system-v2/handlers/index.js +10 -1
- package/functions/computation-system-v2/handlers/scheduler.js +85 -193
- package/functions/computation-system-v2/handlers/worker.js +242 -0
- package/functions/computation-system-v2/index.js +2 -0
- package/functions/computation-system-v2/test/analyze-results.js +238 -0
- package/functions/computation-system-v2/test/{test-dispatcher.js → other/test-dispatcher.js} +6 -6
- package/functions/computation-system-v2/test/{test-framework.js → other/test-framework.js} +14 -14
- package/functions/computation-system-v2/test/{test-real-execution.js → other/test-real-execution.js} +1 -1
- package/functions/computation-system-v2/test/{test-real-integration.js → other/test-real-integration.js} +3 -3
- package/functions/computation-system-v2/test/{test-refactor-e2e.js → other/test-refactor-e2e.js} +3 -3
- package/functions/computation-system-v2/test/{test-risk-metrics-computation.js → other/test-risk-metrics-computation.js} +4 -4
- package/functions/computation-system-v2/test/{test-scheduler.js → other/test-scheduler.js} +1 -1
- package/functions/computation-system-v2/test/{test-storage.js → other/test-storage.js} +2 -2
- package/functions/computation-system-v2/test/run-pipeline-test.js +554 -0
- package/functions/computation-system-v2/test/test-full-pipeline.js +227 -0
- package/functions/computation-system-v2/test/test-worker-pool.js +266 -0
- package/package.json +1 -1
- package/functions/computation-system-v2/computations/TestComputation.js +0 -46
- package/functions/computation-system-v2/test/{test-results.json → other/test-results.json} +0 -0
package/functions/computation-system-v2/framework/execution/RemoteTaskRunner.js
ADDED (@@ -0,0 +1,327 @@)

```js
/**
 * @fileoverview Remote Task Runner (Serverless Worker Pool Client)
 *
 * RESPONSIBILITIES:
 * 1. Package entity data and context into GCS files
 * 2. Invoke remote worker functions in parallel
 * 3. Collect results and errors
 * 4. Handle retries for transient failures
 *
 * DATA FLOW:
 * Orchestrator calls runBatch() -> Upload to GCS -> Invoke Workers -> Collect Results
 *
 * DESIGN PRINCIPLES:
 * - Workers are stateless - all context is passed via GCS
 * - High parallelism - hundreds of concurrent invocations
 * - Fault isolation - one entity failure doesn't affect others
 * - Cost efficient - workers scale to zero between runs
 */

const { Storage } = require('@google-cloud/storage');
const pLimit = require('p-limit');

// For local testing
const { executeLocal } = require('../../handlers/worker');

class RemoteTaskRunner {
  constructor(config, logger = console) {
    this.config = config;
    this.logger = logger;

    // Worker pool configuration
    const poolConfig = config.workerPool || {};
    this.bucketName = poolConfig.tempBucket || 'bulltrackers-worker-staging';
    this.workerUrl = poolConfig.workerUrl;
    this.concurrency = poolConfig.concurrency || 100;
    this.timeout = poolConfig.timeout || 60000; // 60s default
    this.retries = poolConfig.retries || 2;

    // Local mode for testing
    this.localMode = poolConfig.localMode || process.env.WORKER_LOCAL_MODE === 'true';

    // Lazy-initialized clients
    this._storage = null;
    this._authClient = null;
  }

  get storage() {
    if (!this._storage) {
      this._storage = new Storage();
    }
    return this._storage;
  }

  /**
   * Execute a batch of entities remotely (or locally for testing)
   *
   * @param {Object} entry - Manifest entry for the computation
   * @param {string} dateStr - Target date (YYYY-MM-DD)
   * @param {Object} baseContext - Shared context (references, config)
   * @param {string[]} entityIds - Entity IDs to process
   * @param {Map<string, Object>} entityDataMap - Pre-filtered data per entity
   * @param {Object} depResults - Pre-loaded dependency results
   * @returns {Promise<{results: Object, errors: Array}>}
   */
  async runBatch(entry, dateStr, baseContext, entityIds, entityDataMap, depResults) {
    const startTime = Date.now();
    this._log('INFO', `Starting batch: ${entityIds.length} entities for ${entry.name}`);

    if (this.localMode) {
      return this._runBatchLocal(entry, dateStr, baseContext, entityIds, entityDataMap, depResults);
    }

    return this._runBatchRemote(entry, dateStr, baseContext, entityIds, entityDataMap, depResults);
  }

  /**
   * Local execution mode - runs workers in-process
   * Perfect for testing without GCS or network overhead
   */
  async _runBatchLocal(entry, dateStr, baseContext, entityIds, entityDataMap, depResults) {
    const limit = pLimit(this.concurrency);
    const results = {};
    const errors = [];

    const tasks = entityIds.map(entityId => limit(async () => {
      try {
        const contextPackage = this._buildContextPackage(
          entry,
          entityId,
          entityDataMap.get(entityId),
          baseContext,
          depResults
        );

        const { result } = await executeLocal({
          computationName: entry.originalName || entry.name,
          entityId,
          date: dateStr,
          contextPackage
        });

        if (result !== null && result !== undefined) {
          results[entityId] = result;
        }

      } catch (e) {
        this._log('WARN', `Local execution failed for ${entityId}: ${e.message}`);
        errors.push({ entityId, error: e.message });
      }
    }));

    await Promise.all(tasks);

    this._log('INFO', `Local batch complete: ${Object.keys(results).length} results, ${errors.length} errors`);
    return { results, errors };
  }

  /**
   * Remote execution mode - invokes Cloud Functions via HTTP
   */
  async _runBatchRemote(entry, dateStr, baseContext, entityIds, entityDataMap, depResults) {
    const uploadLimit = pLimit(50); // Concurrent uploads to GCS
    const invokeLimit = pLimit(this.concurrency); // Concurrent worker invocations

    const results = {};
    const errors = [];
    const uploadedPaths = [];

    // Phase 1: Upload context packages to GCS
    this._log('INFO', 'Uploading context packages to GCS...');
    const uploadStart = Date.now();

    const uploadTasks = entityIds.map(entityId => uploadLimit(async () => {
      const contextPackage = this._buildContextPackage(
        entry,
        entityId,
        entityDataMap.get(entityId),
        baseContext,
        depResults
      );

      const path = `${dateStr}/${entry.name}/${entityId}.json`;

      try {
        await this._uploadToGCS(path, contextPackage);
        uploadedPaths.push({ entityId, path });
      } catch (e) {
        errors.push({ entityId, error: `Upload failed: ${e.message}` });
      }
    }));

    await Promise.all(uploadTasks);
    this._log('INFO', `Uploaded ${uploadedPaths.length} packages in ${Date.now() - uploadStart}ms`);

    // Phase 2: Invoke workers in parallel
    this._log('INFO', 'Invoking workers...');
    const invokeStart = Date.now();

    const invokeTasks = uploadedPaths.map(({ entityId, path }) =>
      invokeLimit(async () => {
        try {
          const response = await this._invokeWorkerWithRetry({
            computationName: entry.originalName || entry.name,
            entityId,
            date: dateStr,
            dataUri: { bucket: this.bucketName, path }
          });

          if (response.status === 'success' && response.result !== null) {
            results[entityId] = response.result;
          } else if (response.status === 'error') {
            errors.push({ entityId, error: response.error });
          }
          // status === 'success' with result === null means skipped (filtered out)

        } catch (e) {
          errors.push({ entityId, error: e.message });
        }
      })
    );

    await Promise.all(invokeTasks);
    this._log('INFO', `Invocations complete in ${Date.now() - invokeStart}ms`);

    // Phase 3: Cleanup GCS (fire and forget)
    this._cleanupGCS(uploadedPaths.map(p => p.path)).catch(e => {
      this._log('WARN', `GCS cleanup failed: ${e.message}`);
    });

    return { results, errors };
  }

  /**
   * Build the context package for a single entity
   */
  _buildContextPackage(entry, entityId, entityData, baseContext, depResults) {
    // Extract only this entity's dependencies
    const entityDeps = {};
    for (const [depName, allResults] of Object.entries(depResults || {})) {
      if (allResults === null) continue; // Large dependency not preloaded

      if (typeof allResults === 'object') {
        // If it's a map of entity -> result, extract this entity's
        if (allResults[entityId]) {
          entityDeps[depName] = { [entityId]: allResults[entityId] };
        }
      }
    }

    return {
      entityData: entityData || {},
      references: baseContext.references || {},
      dependencies: entityDeps,
      computationMeta: {
        name: entry.name,
        originalName: entry.originalName,
        type: entry.type,
        hash: entry.hash
      },
      config: baseContext.config || {}
    };
  }

  /**
   * Upload a context package to GCS
   */
  async _uploadToGCS(path, data) {
    const file = this.storage.bucket(this.bucketName).file(path);

    await file.save(JSON.stringify(data), {
      contentType: 'application/json',
      resumable: false, // Faster for small files
      metadata: {
        cacheControl: 'no-cache' // Don't cache temp files
      }
    });
  }

  /**
   * Invoke a worker with retry logic
   */
  async _invokeWorkerWithRetry(payload, attempt = 1) {
    try {
      return await this._invokeWorker(payload);
    } catch (e) {
      const isRetryable = this._isRetryableError(e);

      if (isRetryable && attempt < this.retries) {
        // Exponential backoff
        const delay = Math.min(1000 * Math.pow(2, attempt - 1), 10000);
        await new Promise(r => setTimeout(r, delay));

        this._log('DEBUG', `Retrying ${payload.entityId} (attempt ${attempt + 1})`);
        return this._invokeWorkerWithRetry(payload, attempt + 1);
      }

      throw e;
    }
  }

  /**
   * Invoke a single worker via HTTP
   */
  async _invokeWorker(payload) {
    // Lazy-load auth client
    if (!this._authClient) {
      const { GoogleAuth } = require('google-auth-library');
      const auth = new GoogleAuth();
      this._authClient = await auth.getIdTokenClient(this.workerUrl);
    }

    const response = await this._authClient.request({
      url: this.workerUrl,
      method: 'POST',
      data: payload,
      timeout: this.timeout,
      headers: {
        'Content-Type': 'application/json'
      }
    });

    return response.data;
  }

  /**
   * Check if an error is retryable
   */
  _isRetryableError(error) {
    // Network errors
    if (error.code === 'ECONNRESET' || error.code === 'ETIMEDOUT') return true;

    // HTTP 5xx errors (server errors)
    if (error.response && error.response.status >= 500) return true;

    // Rate limiting
    if (error.response && error.response.status === 429) return true;

    return false;
  }

  /**
   * Cleanup uploaded files from GCS
   */
  async _cleanupGCS(paths) {
    // Batch delete
    const bucket = this.storage.bucket(this.bucketName);

    // GCS batch delete has limits, process in chunks
    const chunkSize = 100;
    for (let i = 0; i < paths.length; i += chunkSize) {
      const chunk = paths.slice(i, i + chunkSize);
      await Promise.all(chunk.map(path =>
        bucket.file(path).delete().catch(() => {})
      ));
    }
  }

  _log(level, message) {
    if (this.logger && this.logger.log) {
      this.logger.log(level, `[RemoteTaskRunner] ${message}`);
    } else {
      console.log(`[${level}] [RemoteTaskRunner] ${message}`);
    }
  }
}

module.exports = { RemoteTaskRunner };
```
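For orientation, here is a minimal usage sketch of the runner above. The worker URL, manifest entry, and entity data are illustrative placeholders, and `localMode: true` exercises the in-process path via `executeLocal()`, so no GCS bucket or deployed worker function is needed:

```js
// Hypothetical wiring; config shape mirrors the workerPool block read in the constructor.
const { RemoteTaskRunner } = require('./framework/execution/RemoteTaskRunner'); // path illustrative

const runner = new RemoteTaskRunner({
  workerPool: {
    workerUrl: 'https://example-region-project.cloudfunctions.net/worker', // placeholder
    tempBucket: 'bulltrackers-worker-staging',
    concurrency: 100,
    timeout: 60000,
    retries: 2,
    localMode: true // skip GCS + HTTP, run workers in-process
  }
});

// Illustrative manifest entry and per-entity data.
const entry = { name: 'risk-metrics', originalName: 'risk-metrics', type: 'entity', hash: 'abc123' };
const entityDataMap = new Map([['user-1', { trades: [] }], ['user-2', { trades: [] }]]);

runner
  .runBatch(entry, '2024-01-01', { references: {}, config: {} }, ['user-1', 'user-2'], entityDataMap, {})
  .then(({ results, errors }) => {
    console.log(`${Object.keys(results).length} results, ${errors.length} errors`);
  });
```

In remote mode the same call fans out through GCS uploads and authenticated HTTP invocations, collecting per-entity errors rather than throwing on the first failure.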
package/functions/computation-system-v2/framework/execution/middleware/LineageMiddleware.js
CHANGED

```diff
@@ -10,20 +10,25 @@ class LineageMiddleware extends Middleware {
   async execute(context, next) {
     const result = await next(context);
 
-    // Only track if we have a valid result
-
+    // Only track if we have a valid result
+    // We still call track() per entity, but the Tracker now just aggregates in RAM (very fast)
+    if (result && context.computation) {
+      // Fire and forget - don't await to avoid slowing down execution
       this.tracker.track({
         computation: context.computation.name,
         date: context.date,
         entityId: context.entityId,
-        sourceData: context.data,
+        sourceData: context.data,
         result: result
-      }).catch(e => console.error('Lineage tracking
+      }).catch(e => console.error('Lineage tracking error:', e.message));
     }
 
     return result;
   }
 
+  /**
+   * Called by Orchestrator at the very end of the computation
+   */
   async flush() {
     await this.tracker.flush();
   }
```

package/functions/computation-system-v2/framework/execution/middleware/ProfilerMiddleware.js
CHANGED
```diff
@@ -5,7 +5,7 @@ class ProfilerMiddleware extends Middleware {
   constructor(config) {
     super();
     this.profiler = new ComputationProfiler();
-    this.storageManager = null;
+    this.storageManager = null;
   }
 
   setStorage(storageManager) {
@@ -13,33 +13,21 @@ class ProfilerMiddleware extends Middleware {
   }
 
   async execute(context, next) {
-    const { computation, entityId
+    const { computation, entityId } = context;
 
     // Start Profile
     const key = this.profiler.startProfile(computation.name, entityId || 'global');
 
     try {
-
-      const result = await next(context);
-      return result;
+      return await next(context);
     } finally {
-      // End Profile (
-
-
-        entityId: entityId || 'global',
-        resultSize
+      // End Profile (Just tracks stats in memory now)
+      this.profiler.endProfile(key, {
+        entityId: entityId || 'global'
       });
-
-      //
-
-      // Async save (don't block)
-      this.storageManager.savePerformanceReport(date, {
-        computations: [{
-          name: computation.name,
-          ...profile
-        }]
-      }).catch(err => console.error('Failed to save profile', err));
-    }
+
+      // NOTE: We no longer save to storage here to prevent log spam/DB load.
+      // The Orchestrator handles saving the aggregated report.
     }
   }
 }
```
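Both middlewares rely on the same `execute(context, next)` contract. The `Middleware` base class and the code that composes the chain are not part of this diff; the sketch below is only a plausible minimal reconstruction of that pattern, showing how the profiler can wrap the lineage tracker, which wraps the computation itself:

```js
// Hypothetical minimal chain runner; the package's actual Middleware/Orchestrator
// wiring is not shown in this diff.
function runChain(middlewares, context, terminal) {
  const dispatch = (i) => (ctx) =>
    i < middlewares.length
      ? middlewares[i].execute(ctx, dispatch(i + 1)) // each middleware calls next(ctx)
      : terminal(ctx);                               // innermost: the computation itself
  return dispatch(0)(context);
}

// e.g. profiler outermost, lineage inside it, computation at the core:
// await runChain([profilerMiddleware, lineageMiddleware], context,
//   (ctx) => ctx.computation.compute(ctx));
```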
package/functions/computation-system-v2/framework/index.js
CHANGED

```diff
@@ -1,15 +1,20 @@
 /**
  * @fileoverview Framework exports
- *
+ *
+ * This is the public API of the computation framework.
  * Computations only need to import from here.
  */
 
 // Core
 const { Computation } = require('./core/Computation');
-const { Orchestrator } = require('./execution/Orchestrator');
+const { Orchestrator } = require('./execution/Orchestrator');
 const { ManifestBuilder } = require('./core/Manifest');
 const { RulesRegistry } = require('./core/Rules');
 
+// Execution
+const { TaskRunner } = require('./execution/TaskRunner');
+const { RemoteTaskRunner } = require('./execution/RemoteTaskRunner');
+
 // Scheduling
 const { ScheduleValidator } = require('./scheduling/ScheduleValidator');
 
@@ -26,7 +31,9 @@ module.exports = {
   Computation,
 
   // Execution engine
-  Orchestrator,
+  Orchestrator,
+  TaskRunner,
+  RemoteTaskRunner, // Serverless worker pool client
   ManifestBuilder,
 
   // Business rules
```
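With both runners re-exported, consumers keep a single entry point. An illustrative import (the in-process vs. serverless split is inferred from the names and the RemoteTaskRunner file above):

```js
// Illustrative consumer import through the framework's public API.
const {
  Orchestrator,
  TaskRunner,       // presumably in-process execution
  RemoteTaskRunner, // serverless worker pool (GCS + HTTP fan-out)
} = require('./functions/computation-system-v2/framework');
```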
package/functions/computation-system-v2/framework/lineage/LineageTracker.js
CHANGED

```diff
@@ -1,18 +1,14 @@
 /**
- * @fileoverview Data Lineage Tracker
+ * @fileoverview Data Lineage Tracker (Aggregated)
  * * Tracks the provenance of computation results.
- * *
- * *
+ * * UPDATE: Refactored to AGGREGATE lineage at the computation level.
+ * * Instead of 1 row per entity, it produces 1 row per computation run with total source counts.
  */
 
 const { BigQuery } = require('@google-cloud/bigquery');
 const crypto = require('crypto');
 
 class LineageTracker {
-  /**
-   * @param {Object} config - System configuration
-   * @param {Object} [logger] - Logger instance
-   */
   constructor(config, logger = null) {
     this.config = config;
     this.logger = logger || console;
@@ -25,48 +21,62 @@ class LineageTracker {
     this.datasetId = config.bigquery.dataset;
     this.tableName = 'data_lineage';
 
-
-    this.
+    // Aggregation State: Map<computationName, { date, sources: Map<table, count> }>
+    this.aggregationState = new Map();
     this._tableChecked = false;
   }
 
   /**
-   *
-   *
-   * @param {string} params.computation - Computation name
-   * @param {string} params.date - Execution date
-   * @param {string} params.entityId - Entity ID
-   * @param {Object} params.sourceData - The actual input data object used
-   * @param {Object} params.result - The result object produced
+   * Accumulate lineage stats for a specific entity result.
+   * Does NOT write to BigQuery immediately.
    */
   async track({ computation, date, entityId, sourceData, result }) {
     if (!sourceData) return;
 
-
-
-
-
-
-
-
-
-      result_hash: resultHash,
-      timestamp: new Date().toISOString()
-    });
-
-    if (this.buffer.length >= this.BUFFER_SIZE) {
-      await this.flush();
+    // Initialize aggregation bucket if needed
+    if (!this.aggregationState.has(computation)) {
+      this.aggregationState.set(computation, {
+        date,
+        computation,
+        sources: new Map(), // Table -> Count
+        firstTimestamp: new Date().toISOString()
+      });
     }
+
+    const state = this.aggregationState.get(computation);
+
+    // Summarize and merge sources
+    this._mergeSources(state.sources, sourceData);
   }
 
   /**
-   *
+   * Finalize and write aggregated lineage to BigQuery.
+   * Called by Orchestrator at the end of execution.
    */
   async flush() {
-    if (this.
+    if (this.aggregationState.size === 0) return;
 
-    const batch = [
-
+    const batch = [];
+
+    for (const [compName, state] of this.aggregationState.entries()) {
+      const sourcesSummary = Array.from(state.sources.entries()).map(([table, count]) => ({
+        table,
+        total_rows_used: count
+      }));
+
+      // Create a single summary row
+      batch.push({
+        date: state.date,
+        computation_name: compName,
+        entity_id: 'AGGREGATED_BATCH', // explicit marker
+        sources_json: JSON.stringify(sourcesSummary),
+        result_hash: 'AGGREGATED', // Individual hashes aren't useful in summary
+        timestamp: new Date().toISOString()
+      });
+    }
+
+    // Clear memory immediately
+    this.aggregationState.clear();
 
     try {
       await this._ensureTable();
@@ -76,43 +86,29 @@ class LineageTracker {
         .table(this.tableName)
         .insert(batch);
 
-      this._log('
+      this._log('INFO', `Saved aggregated lineage for ${batch.length} computations`);
     } catch (e) {
-      this._log('ERROR', `Failed to
-      // In a production system, we might want to retry or dump to a dead-letter file
+      this._log('ERROR', `Failed to save lineage: ${e.message}`);
     }
   }
 
   /**
-   *
-   * @param {Object} data - Input data map { tableName: data }
-   * @returns {Array} Summary of sources [{ table, count, ... }]
+   * Merges current entity's data usage into the global counter.
    */
-
-
+  _mergeSources(aggSources, currentData) {
+    Object.entries(currentData).forEach(([table, content]) => {
       let count = 0;
-      let meta = null;
-
       if (Array.isArray(content)) {
         count = content.length;
       } else if (content && typeof content === 'object') {
         count = Object.keys(content).length;
       }
 
-
-
-        meta = content._metadata;
-      }
-
-      return { table, count, meta };
+      const currentTotal = aggSources.get(table) || 0;
+      aggSources.set(table, currentTotal + count);
     });
   }
 
-  _hash(data) {
-    const str = typeof data === 'string' ? data : JSON.stringify(data);
-    return crypto.createHash('md5').update(str || '').digest('hex').substring(0, 16);
-  }
-
   async _ensureTable() {
     if (this._tableChecked) return;
 
@@ -127,12 +123,12 @@ class LineageTracker {
         { name: 'date', type: 'DATE', mode: 'REQUIRED' },
         { name: 'computation_name', type: 'STRING', mode: 'REQUIRED' },
         { name: 'entity_id', type: 'STRING', mode: 'REQUIRED' },
-        { name: 'sources_json', type: 'STRING', mode: 'REQUIRED' },
+        { name: 'sources_json', type: 'STRING', mode: 'REQUIRED' },
         { name: 'result_hash', type: 'STRING', mode: 'NULLABLE' },
         { name: 'timestamp', type: 'TIMESTAMP', mode: 'REQUIRED' }
       ],
       timePartitioning: { type: 'DAY', field: 'date' },
-      clustering: { fields: ['computation_name'
+      clustering: { fields: ['computation_name'] }
     });
   }
   this._tableChecked = true;
```