bulltrackers-module 1.0.734 → 1.0.736
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/functions/computation-system-v2/config/bulltrackers.config.js +75 -5
- package/functions/computation-system-v2/framework/data/DataFetcher.js +107 -105
- package/functions/computation-system-v2/framework/execution/Orchestrator.js +357 -150
- package/functions/computation-system-v2/framework/execution/RemoteTaskRunner.js +327 -0
- package/functions/computation-system-v2/framework/execution/middleware/LineageMiddleware.js +9 -4
- package/functions/computation-system-v2/framework/execution/middleware/ProfilerMiddleware.js +9 -21
- package/functions/computation-system-v2/framework/index.js +10 -3
- package/functions/computation-system-v2/framework/lineage/LineageTracker.js +53 -57
- package/functions/computation-system-v2/framework/monitoring/Profiler.js +54 -52
- package/functions/computation-system-v2/framework/resilience/Checkpointer.js +173 -27
- package/functions/computation-system-v2/framework/storage/StorageManager.js +419 -187
- package/functions/computation-system-v2/handlers/index.js +10 -1
- package/functions/computation-system-v2/handlers/scheduler.js +85 -193
- package/functions/computation-system-v2/handlers/worker.js +242 -0
- package/functions/computation-system-v2/index.js +5 -0
- package/functions/computation-system-v2/test/analyze-results.js +238 -0
- package/functions/computation-system-v2/test/{test-dispatcher.js → other/test-dispatcher.js} +6 -6
- package/functions/computation-system-v2/test/{test-framework.js → other/test-framework.js} +14 -14
- package/functions/computation-system-v2/test/{test-real-execution.js → other/test-real-execution.js} +1 -1
- package/functions/computation-system-v2/test/{test-real-integration.js → other/test-real-integration.js} +3 -3
- package/functions/computation-system-v2/test/{test-refactor-e2e.js → other/test-refactor-e2e.js} +3 -3
- package/functions/computation-system-v2/test/{test-risk-metrics-computation.js → other/test-risk-metrics-computation.js} +4 -4
- package/functions/computation-system-v2/test/{test-scheduler.js → other/test-scheduler.js} +1 -1
- package/functions/computation-system-v2/test/{test-storage.js → other/test-storage.js} +2 -2
- package/functions/computation-system-v2/test/run-pipeline-test.js +554 -0
- package/functions/computation-system-v2/test/test-worker-pool.js +494 -0
- package/index.js +8 -39
- package/package.json +1 -1
- package/functions/computation-system-v2/computations/TestComputation.js +0 -46
- package/functions/computation-system-v2/test/{test-results.json → other/test-results.json} +0 -0
package/functions/computation-system-v2/config/bulltrackers.config.js (+75 -5)

@@ -1,10 +1,8 @@
 /**
  * @fileoverview BullTrackers Configuration for Computation System v2
- *
- * This is the ONLY file that contains BullTrackers-specific knowledge.
+ * * This is the ONLY file that contains BullTrackers-specific knowledge.
  * The framework itself is completely domain-agnostic.
- *
- * Business Rules:
+ * * Business Rules:
  * - Rules are injected into computations automatically
  * - When a rule changes, computations using it are re-run
  * - Computations should be "simple recipes" that call rules
@@ -37,6 +35,15 @@ module.exports = {
     location: 'europe-west1',
     cacheTTLMs: 3600000 // 1 hour schema cache
   },
+
+  // =========================================================================
+  // GCS CONFIGURATION (NEW: For Batch Loading)
+  // =========================================================================
+
+  gcs: {
+    bucket: process.env.GCS_BUCKET_ID || 'bulltrackers-computation-staging',
+    prefix: 'staging'
+  },
 
   // =========================================================================
   // TABLE DEFINITIONS
@@ -313,5 +320,68 @@ module.exports = {
     // null = all computations allowed
     // array = only listed computations allowed
     allowedComputations: null
+  },
+
+  // =========================================================================
+  // WORKER POOL CONFIGURATION (SERVERLESS WORKERS)
+  // =========================================================================
+  //
+  // Enables offloading per-entity computations to a serverless worker pool.
+  // Workers run as separate Cloud Functions with high concurrency.
+  //
+  // Benefits:
+  // - Massive parallelism (100s of concurrent entity computations)
+  // - Cost efficient (workers scale to zero, high concurrency per instance)
+  // - Fault isolation (one entity failure doesn't affect others)
+  // - Memory efficient (workers only load one entity's data at a time)
+  //
+  // Data Flow:
+  // 1. Orchestrator packages entity data → GCS
+  // 2. Orchestrator invokes workers in parallel
+  // 3. Workers load data from GCS, execute, return result
+  // 4. Orchestrator collects results, commits to storage
+  // =========================================================================
+
+  workerPool: {
+    // Master switch - set to true to enable worker pool
+    enabled: process.env.WORKER_POOL_ENABLED === 'true',
+
+    // Local mode for testing - runs workers in-process without GCS/HTTP
+    // Set WORKER_LOCAL_MODE=true or pass localMode: true in config
+    localMode: process.env.WORKER_LOCAL_MODE === 'true',
+
+    // Worker Cloud Function URL
+    workerUrl: process.env.WORKER_URL ||
+      'https://europe-west1-stocks-12345.cloudfunctions.net/computation-worker',
+
+    // GCS bucket for temporary context packages
+    // Should have lifecycle rule to auto-delete after 1 day
+    tempBucket: process.env.WORKER_TEMP_BUCKET || 'bulltrackers-worker-staging',
+
+    // Max concurrent worker invocations
+    // Higher = faster but more network/GCS load
+    // Recommended: 100-200 for production
+    concurrency: parseInt(process.env.WORKER_CONCURRENCY || '100', 10),
+
+    // Worker invocation timeout (ms)
+    // Should be slightly less than worker function timeout
+    timeout: parseInt(process.env.WORKER_TIMEOUT || '60000', 10),
+
+    // Retry count for transient failures
+    retries: 2,
+
+    // Minimum entities to trigger worker pool
+    // Below this threshold, run locally (overhead not worth it)
+    minEntitiesForOffload: parseInt(process.env.WORKER_MIN_ENTITIES || '50', 10),
+
+    // Computations that should NEVER use worker pool
+    // (e.g., need persistent state, special middleware)
+    excludeComputations: [],
+
+    // Computations that MUST use worker pool (override threshold)
+    // Useful for testing specific computations
+    forceOffloadComputations: process.env.WORKER_FORCE_COMPUTATIONS
+      ? process.env.WORKER_FORCE_COMPUTATIONS.split(',')
+      : []
   }
-};
+};
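The workerPool block above is controlled entirely through environment variables (WORKER_POOL_ENABLED, WORKER_LOCAL_MODE, WORKER_CONCURRENCY, WORKER_MIN_ENTITIES, WORKER_FORCE_COMPUTATIONS), so offloading can be toggled per deployment without a code change. A minimal sketch of the offload decision these settings describe — the shouldOffload helper and the computation name are hypothetical; only the config keys come from the package:

// Hypothetical helper; illustrates how the workerPool settings above combine
// into an "offload or run locally" decision. Not part of the package itself.
const config = require('./bulltrackers.config.js');

function shouldOffload(computationName, entityCount) {
  const pool = config.workerPool;
  if (!pool.enabled) return false;                                          // master switch off
  if (pool.excludeComputations.includes(computationName)) return false;     // never offload these
  if (pool.forceOffloadComputations.includes(computationName)) return true; // always offload these
  return entityCount >= pool.minEntitiesForOffload;                         // threshold check
}

// e.g. with WORKER_POOL_ENABLED=true and WORKER_MIN_ENTITIES=50:
//   shouldOffload('riskMetrics', 10)  -> false (run locally, overhead not worth it)
//   shouldOffload('riskMetrics', 500) -> true  (fan out to the worker pool)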
package/functions/computation-system-v2/framework/data/DataFetcher.js (+107 -105)

@@ -2,33 +2,15 @@
  * @fileoverview Data Fetcher - Executes queries and transforms results
  * * The single point of data access for computations.
  * Uses QueryBuilder for validation, executes against BigQuery, transforms results.
- * * V2.
- *
+ * * V2.2 FIX: "Identity Crisis" & "Date as ID" bugs.
+ * * V2.3 FIX: "Insufficient History" bug.
+ *   - fetchBatched now orders by Entity ID to keep historical rows together.
+ *   - Implemented "Entity-Atomic Batching" to prevent splitting a user's history across batches.
  */
 
 const { BigQuery } = require('@google-cloud/bigquery');
 
-/**
- * @typedef {Object} FetchOptions
- * @property {string} table - Table name
- * @property {string} targetDate - Target date (YYYY-MM-DD)
- * @property {number} [lookback=0] - Days to look back
- * @property {boolean} [mandatory=false] - If true, throws on no data
- * @property {Object} [filter] - Additional WHERE conditions
- * @property {string[]} [fields] - Specific fields to select
- * @property {string[]} [entities] - Specific entity IDs to fetch
- */
-
 class DataFetcher {
-  /**
-   * @param {Object} config - Configuration object
-   * @param {string} config.projectId - GCP project ID
-   * @param {string} config.dataset - BigQuery dataset name
-   * @param {string} [config.location='US'] - BigQuery location
-   * @param {Object} config.tables - Table configuration map
-   * @param {QueryBuilder} queryBuilder - Query builder instance
-   * @param {Object} [logger] - Logger instance
-   */
   constructor(config, queryBuilder, logger = null) {
     this.projectId = config.projectId;
     this.dataset = config.dataset;
@@ -39,7 +21,6 @@ class DataFetcher {
 
     this.client = new BigQuery({ projectId: this.projectId });
 
-    // Query stats
     this.stats = {
       queries: 0,
       rowsFetched: 0,
@@ -50,16 +31,11 @@ class DataFetcher {
 
   /**
    * Fetch data for a computation's requirements.
-   * @param {Object} requires - Computation's requires declaration
-   * @param {string} targetDate - Target date
-   * @param {string[]} [entities] - Optional entity IDs to filter
-   * @returns {Promise<Object>} Data keyed by table name
    */
   async fetchForComputation(requires, targetDate, entities = null) {
     const results = {};
     const errors = [];
 
-    // Fetch all tables in parallel
     await Promise.all(Object.entries(requires).map(async ([tableName, spec]) => {
       try {
         const data = await this.fetch({
@@ -88,7 +64,6 @@ class DataFetcher {
       }
     }));
 
-    // Throw if mandatory data is missing
     if (errors.length > 0) {
       const msg = errors.map(e => `${e.table}: ${e.reason}`).join(', ');
       throw new Error(`[DataFetcher] Missing mandatory data: ${msg}`);
@@ -99,28 +74,22 @@ class DataFetcher {
 
   /**
    * STREAMING: Fetch data for computation in batches.
-   * Uses a "Driver Table" (one with entityField) to stream entities,
-   * then fetches dependencies for just that batch.
-   * * @param {Object} requires - Computation requires
-   * @param {string} targetDate - Target Date
-   * @param {number} batchSize - Rows per batch
-   * @returns {AsyncGenerator<{data: Object, entityIds: string[]}>}
    */
   async *fetchComputationBatched(requires, targetDate, batchSize = 1000) {
-    // 1. Identify Driver Table (The one we iterate over)
     const driverTableName = this._getDriverTable(requires);
+    const driverConfig = this.tables[driverTableName] || {};
 
     if (!driverTableName) {
       this._log('WARN', 'No entity-keyed table found for batching. Falling back to full fetch.');
       const fullData = await this.fetchForComputation(requires, targetDate);
-
+      const allIds = fullData ? this._extractAllEntityIds(fullData) : [];
+      yield { data: fullData, entityIds: allIds };
       return;
     }
 
-    this._log('INFO', `Starting batched fetch driven by table: ${driverTableName}`);
+    this._log('INFO', `Starting batched fetch driven by table: ${driverTableName} (${driverConfig.entityField})`);
     const driverSpec = requires[driverTableName];
 
-    // 2. Stream Driver Table
     const driverStream = this.fetchBatched({
       table: driverTableName,
       targetDate,
@@ -130,20 +99,24 @@ class DataFetcher {
       mandatory: driverSpec.mandatory
     }, batchSize);
 
-    // 3. Process Streams
     for await (const batch of driverStream) {
-      //
-      // The batch is already transformed (e.g. Object by EntityID or Array)
+      // FIX: Robust ID Extraction
      const entityIds = this._extractEntityIdsFromBatch(batch, driverTableName);
 
-      if (entityIds.length === 0)
+      if (entityIds.length === 0) {
+        this._log('WARN', `Driver batch from ${driverTableName} yielded 0 entity IDs. Check config.entityField matches BigQuery column.`);
+        continue;
+      }
 
      const batchResults = { [driverTableName]: batch };
      const errors = [];
 
-      // 4. Fetch Dependencies for this SPECIFIC batch of entities
      await Promise.all(Object.entries(requires).map(async ([tableName, spec]) => {
-        if (tableName === driverTableName) return;
+        if (tableName === driverTableName) return;
+
+        // FIX: Identity Crisis Check
+        const depConfig = this.tables[tableName] || {};
+        const shouldFilterById = depConfig.entityField === driverConfig.entityField;
 
        try {
          const data = await this.fetch({
@@ -153,13 +126,13 @@ class DataFetcher {
            mandatory: spec.mandatory || false,
            filter: spec.filter || {},
            fields: spec.fields || null,
-            entities:
+            entities: shouldFilterById ? entityIds : null
          });
 
          batchResults[tableName] = data;
 
          if (spec.mandatory && this._isEmpty(data)) {
-
+            this._log('WARN', `Batch warning: Mandatory table ${tableName} returned 0 rows. (Filtered by ID: ${shouldFilterById})`);
          }
        } catch (e) {
          if (spec.mandatory) errors.push({ table: tableName, reason: e.message });
@@ -168,7 +141,7 @@ class DataFetcher {
      }));
 
      if (errors.length > 0) {
-        this._log('WARN', `Batch missing mandatory data: ${errors.map(e => e.table).join(', ')}. Skipping batch.`);
+        this._log('WARN', `Batch missing mandatory data due to errors: ${errors.map(e => e.table).join(', ')}. Skipping batch.`);
        continue;
      }
 
@@ -176,14 +149,8 @@ class DataFetcher {
     }
   }
 
-  /**
-   * Fetch data from a single table (Full Load).
-   * @param {FetchOptions} options - Fetch options
-   * @returns {Promise<Object|Array|null>} Transformed data
-   */
   async fetch(options) {
     const { table, targetDate, lookback = 0, filter = {}, fields = null, entities = null } = options;
-
     const tableConfig = this.tables[table] || {};
     const { dateField, entityField, dataField } = tableConfig;
 
@@ -199,32 +166,44 @@ class DataFetcher {
     return this._transform(rows, { lookback, dateField, entityField, dataField });
   }
 
-  /**
-   * STREAMING: Fetch data from a single table in batches.
-   * @param {FetchOptions} options
-   * @param {number} batchSize
-   * @returns {AsyncGenerator<Object|Array>}
-   */
   async *fetchBatched(options, batchSize = 1000) {
     const { table, targetDate, lookback = 0, filter = {}, fields = null, entities = null } = options;
     const tableConfig = this.tables[table] || {};
     const { dateField, entityField, dataField } = tableConfig;
 
+    // FIX #1: Prioritize ordering by Entity to keep historical rows together
     const query = await this.queryBuilder.build({
       table, select: fields, where: filter, dateField, targetDate, lookback, entityField, entities,
-      orderBy:
+      orderBy: entityField || dateField
     });
 
-    // Use Stream Executor
     const rowStream = this._executeStream(query);
 
     let batch = [];
+    let currentEntity = null;
+
     for await (const row of rowStream) {
-
-
-
-
+      // FIX #2: Entity-Atomic Batching
+      // If we have an entity field, verify we don't split an entity across batches
+      if (entityField) {
+        const rowEntity = String(row[entityField]);
+
+        // If batch is full AND we have moved to a new entity, yield the batch
+        // This ensures the current entity (which might have many rows) stays together
+        if (batch.length >= batchSize && rowEntity !== currentEntity && currentEntity !== null) {
+          yield this._transform(batch, { lookback, dateField, entityField, dataField });
+          batch = [];
+        }
+        currentEntity = rowEntity;
+      } else {
+        // Fallback for non-entity tables (strict count)
+        if (batch.length >= batchSize) {
+          yield this._transform(batch, { lookback, dateField, entityField, dataField });
+          batch = [];
+        }
      }
+
+      batch.push(row);
    }
 
    if (batch.length > 0) {
@@ -232,9 +211,6 @@ class DataFetcher {
     }
   }
 
-  /**
-   * Check if data exists for a table on a given date.
-   */
   async hasData(table, targetDate) {
     const tableConfig = this.tables[table] || {};
     const { dateField } = tableConfig;
@@ -248,9 +224,6 @@ class DataFetcher {
     }
   }
 
-  /**
-   * Check availability for multiple tables.
-   */
   async checkAvailability(requires, targetDate) {
     const available = [];
     const missing = [];
@@ -272,24 +245,16 @@ class DataFetcher {
   getStats() { return { ...this.stats }; }
   resetStats() { this.stats = { queries: 0, rowsFetched: 0, errors: 0, bytesProcessed: 0 }; }
 
-  // =========================================================================
-  // PRIVATE METHODS
-  // =========================================================================
-
   async _execute(query) {
     this.stats.queries++;
     try {
-      this._log('DEBUG', `Executing (Full): ${query.sql.substring(0, 100)}...`);
       const [job] = await this.client.createQueryJob({
         query: query.sql, params: query.params, location: this.location
       });
       const [rows] = await job.getQueryResults();
-
       const [metadata] = await job.getMetadata();
-      const bytesProcessed = parseInt(metadata.statistics?.totalBytesProcessed || 0, 10);
       this.stats.rowsFetched += rows.length;
-      this.stats.bytesProcessed +=
-
+      this.stats.bytesProcessed += parseInt(metadata.statistics?.totalBytesProcessed || 0, 10);
       return rows;
     } catch (e) {
       this.stats.errors++;
@@ -298,17 +263,13 @@ class DataFetcher {
     }
   }
 
-  // New Stream Executor using BigQuery Streams
   async *_executeStream(query) {
     this.stats.queries++;
     try {
-      this._log('DEBUG', `Executing (Stream): ${query.sql.substring(0, 100)}...`);
       const [job] = await this.client.createQueryJob({
         query: query.sql, params: query.params, location: this.location
       });
-
       const stream = job.getQueryResultsStream();
-
       for await (const row of stream) {
         this.stats.rowsFetched++;
         yield row;
@@ -320,13 +281,38 @@ class DataFetcher {
     }
   }
 
+  /**
+   * Transforms raw rows into a structured object.
+   * FIX: PRIORITIZE ENTITY FIELD.
+   * If entityField exists, we MUST return { [id]: [rows] } so IDs can be extracted.
+   */
   _transform(rows, config) {
     const { lookback, dateField, entityField, dataField } = config;
-
-    // Ensure rows is array
     const rowArray = Array.isArray(rows) ? rows : [rows];
 
-    //
+    // FIX: Primary Grouping = Entity
+    if (entityField) {
+      const byEntity = {};
+      for (const row of rowArray) {
+        const entityKey = String(row[entityField]);
+        if (!byEntity[entityKey]) {
+          // If simple fetch (no history), value is single object
+          // If history fetch (lookback), value is Array of rows
+          byEntity[entityKey] = lookback > 0 ? [] : null;
+        }
+
+        const value = dataField ? row[dataField] : row;
+
+        if (Array.isArray(byEntity[entityKey])) {
+          byEntity[entityKey].push(value);
+        } else {
+          byEntity[entityKey] = value;
+        }
+      }
+      return byEntity;
+    }
+
+    // Fallback: Date Grouping (Only if no Entity ID)
     if (lookback > 0 && dateField) {
       const byDate = {};
       for (const row of rowArray) {
@@ -338,17 +324,6 @@ class DataFetcher {
       return byDate;
     }
 
-    // For entity-keyed tables, return entity-keyed object
-    if (entityField) {
-      const byEntity = {};
-      for (const row of rowArray) {
-        const entityKey = String(row[entityField]);
-        byEntity[entityKey] = dataField ? row[dataField] : row;
-      }
-      return byEntity;
-    }
-
-    // Default: return array
     return rowArray;
   }
 
@@ -362,15 +337,42 @@ class DataFetcher {
 
   _extractEntityIdsFromBatch(batchData, tableName) {
     const config = this.tables[tableName] || {};
-
+    const field = config.entityField;
+
+    // Case 1: Transformed Object { "id1": data, "id2": data }
+    if (field && batchData && !Array.isArray(batchData)) {
      return Object.keys(batchData);
    }
-
-
-
+
+    // Case 2: Array of Rows (Only if _transform didn't group by entity)
+    if (Array.isArray(batchData) && field) {
+      const ids = [];
+      let undefinedCount = 0;
+      for (const r of batchData) {
+        const val = r[field];
+        if (val === undefined) {
+          undefinedCount++;
+        } else {
+          ids.push(String(val));
+        }
+      }
+      if (undefinedCount > 0) {
+        this._log('ERROR', `CRITICAL CONFIG ERROR: Found ${undefinedCount} rows in '${tableName}' where entityField '${field}' was UNDEFINED.`);
+      }
+      return ids;
    }
    return [];
  }
+
+  _extractAllEntityIds(fullData) {
+    const ids = new Set();
+    Object.values(fullData || {}).forEach(tableData => {
+      if (tableData && typeof tableData === 'object' && !Array.isArray(tableData)) {
+        Object.keys(tableData).forEach(k => ids.add(k));
+      }
+    });
+    return Array.from(ids);
+  }
 
   _formatDate(dateValue) {
     if (!dateValue) return null;
@@ -390,8 +392,8 @@ class DataFetcher {
   _log(level, message) {
     if (this.logger && typeof this.logger.log === 'function') {
       this.logger.log(level, `[DataFetcher] ${message}`);
-    } else if (level === 'ERROR') {
-      console.error(`[DataFetcher] ${message}`);
+    } else if (level === 'ERROR' || level === 'WARN') {
+      console.error(`[${level}] [DataFetcher] ${message}`);
     }
   }
 }