bulltrackers-module 1.0.709 → 1.0.712

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1655 @@
1
+ /**
2
+ * @fileoverview BigQuery utility functions for BullTrackers
3
+ * Handles table creation, data insertion, and querying with automatic schema management
4
+ */
5
+
6
+ const { BigQuery } = require('@google-cloud/bigquery');
7
+ const fs = require('fs');
8
+ const path = require('path');
9
+ const os = require('os');
10
+
11
+ // Singleton BigQuery client
12
+ let bigqueryClient = null;
13
+
14
+ /**
15
+ * Get or create BigQuery client
16
+ */
17
+ function getBigQueryClient() {
18
+ if (!bigqueryClient) {
19
+ const projectId = process.env.GCP_PROJECT_ID || 'stocks-12345';
20
+ bigqueryClient = new BigQuery({ projectId });
21
+ }
22
+ return bigqueryClient;
23
+ }
24
+
25
+ /**
26
+ * Get dataset reference, creating it if it doesn't exist
27
+ * @param {string} datasetId - Dataset ID (e.g., 'bulltrackers_data')
28
+ * @param {object} logger - Logger instance
29
+ * @returns {Promise<Dataset>}
30
+ */
31
+ async function getOrCreateDataset(datasetId, logger = null) {
32
+ const bigquery = getBigQueryClient();
33
+ const dataset = bigquery.dataset(datasetId);
34
+
35
+ try {
36
+ const [exists] = await dataset.exists();
37
+ if (!exists) {
38
+ if (logger) logger.log('INFO', `[BigQuery] Creating dataset: ${datasetId}`);
39
+ await dataset.create({
40
+ location: 'europe-west1', // Match your Cloud Functions region
41
+ description: 'BullTrackers analytical data'
42
+ });
43
+ if (logger) logger.log('INFO', `[BigQuery] Dataset ${datasetId} created successfully`);
44
+ }
45
+ return dataset;
46
+ } catch (error) {
47
+ if (logger) logger.log('ERROR', `[BigQuery] Error with dataset ${datasetId}: ${error.message}`);
48
+ throw error;
49
+ }
50
+ }
51
+
52
+ /**
53
+ * Ensure a table exists with the given schema, creating it if necessary
54
+ * @param {string} datasetId - Dataset ID
55
+ * @param {string} tableId - Table ID
56
+ * @param {Array} schema - BigQuery schema array
57
+ * @param {object} options - Additional options (partitionField, clusterFields)
58
+ * @param {object} logger - Logger instance
59
+ * @returns {Promise<Table>}
60
+ */
61
+ async function ensureTableExists(datasetId, tableId, schema, options = {}, logger = null) {
62
+ const dataset = await getOrCreateDataset(datasetId, logger);
63
+ const table = dataset.table(tableId);
64
+
65
+ try {
66
+ const [exists] = await table.exists();
67
+ if (!exists) {
68
+ if (logger) logger.log('INFO', `[BigQuery] Creating table: ${datasetId}.${tableId}`);
69
+
70
+ const tableOptions = {
71
+ schema: schema,
72
+ description: `Auto-created table for ${tableId}`
73
+ };
74
+
75
+ // Add partitioning if specified
76
+ if (options.partitionField) {
77
+ tableOptions.timePartitioning = {
78
+ field: options.partitionField,
79
+ type: 'DAY' // Partition by day
80
+ };
81
+ }
82
+
83
+ // Add clustering if specified
84
+ if (options.clusterFields && options.clusterFields.length > 0) {
85
+ tableOptions.clustering = {
86
+ fields: options.clusterFields
87
+ };
88
+ }
89
+
90
+ await table.create(tableOptions);
91
+ if (logger) logger.log('INFO', `[BigQuery] Table ${datasetId}.${tableId} created successfully`);
92
+ } else {
93
+ // Table exists - verify schema matches (optional, can be enhanced)
94
+ if (logger) logger.log('DEBUG', `[BigQuery] Table ${datasetId}.${tableId} already exists`);
95
+ }
96
+
97
+ return table;
98
+ } catch (error) {
99
+ if (logger) logger.log('ERROR', `[BigQuery] Error ensuring table ${datasetId}.${tableId}: ${error.message}`);
100
+ throw error;
101
+ }
102
+ }
103
+
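As a usage sketch for the helper above (the module path `./bigquery`, the `daily_events` table and its schema are assumptions for illustration, not part of the package):

```js
// Minimal sketch: ensure a partitioned, clustered table exists before writing to it
const bq = require('./bigquery'); // assumed export path for this file
const logger = { log: (level, ...args) => console.log(`[${level}]`, ...args) };

async function setupDailyEventsTable() {
  const schema = [
    { name: 'date',       type: 'DATE',      mode: 'REQUIRED' },
    { name: 'event_name', type: 'STRING',    mode: 'REQUIRED' },
    { name: 'payload',    type: 'JSON',      mode: 'NULLABLE' },
    { name: 'created_at', type: 'TIMESTAMP', mode: 'REQUIRED' }
  ];
  // Creates bulltrackers_data.daily_events if missing, partitioned by day and clustered by event_name
  return bq.ensureTableExists('bulltrackers_data', 'daily_events', schema,
    { partitionField: 'date', clusterFields: ['event_name'] }, logger);
}

setupDailyEventsTable().catch(console.error);
```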
104
+ /**
105
+ * Insert rows using BigQuery MERGE statement (handles duplicates natively via SQL)
106
+ * More efficient than checking then inserting - BigQuery handles deduplication in SQL
107
+ * Uses a temporary table and MERGE statement for atomic deduplication
108
+ * @param {string} datasetId - Dataset ID
109
+ * @param {string} tableId - Table ID
110
+ * @param {Array} rows - Array of row objects
111
+ * @param {Array} keyFields - Fields that form unique key (e.g., ['date', 'user_id', 'user_type'])
112
+ * @param {object} logger - Logger instance
113
+ * @returns {Promise<number>} Number of rows actually inserted (not duplicates)
114
+ */
115
+ async function insertRowsWithMerge(datasetId, tableId, rows, keyFields, logger = null) {
116
+ if (!rows || rows.length === 0) {
117
+ if (logger) logger.log('WARN', `[BigQuery] No rows to merge into ${datasetId}.${tableId}`);
118
+ return 0;
119
+ }
120
+
121
+ const MAX_ROW_SIZE = 9 * 1024 * 1024; // 9MB safety limit
122
+ const validRows = rows.filter(row => {
123
+ const rowSize = JSON.stringify(row).length;
124
+ return rowSize <= MAX_ROW_SIZE;
125
+ });
126
+
127
+ if (validRows.length === 0) {
128
+ if (logger) logger.log('WARN', `[BigQuery] All rows too large for MERGE into ${datasetId}.${tableId}`);
129
+ return 0;
130
+ }
131
+
132
+ let tempTable = null; // declared before the try block so the catch below can clean up the temp table
+ try {
133
+ const tablePath = `${datasetId}.${tableId}`;
134
+ const keyFieldsStr = keyFields.join(', ');
135
+ const tempTableId = `${tableId}_temp_${Date.now()}`;
136
+ const tempTablePath = `${datasetId}.${tempTableId}`;
137
+
138
+ // Get table schema
139
+ const dataset = await getOrCreateDataset(datasetId, logger);
140
+ const table = dataset.table(tableId);
141
+ const [tableMetadata] = await table.getMetadata();
142
+ const schema = tableMetadata.schema.fields;
143
+
144
+ // Create temp table with same schema
145
+ tempTable = dataset.table(tempTableId);
146
+ await tempTable.create({
147
+ schema: schema,
148
+ description: 'Temporary table for merge operation'
149
+ });
150
+
151
+ if (logger) {
152
+ logger.log('INFO', `[BigQuery] Created temp table ${tempTableId} for MERGE operation`);
153
+ }
154
+
155
+ // Insert all rows into temp table using LOAD JOB (free, not streaming)
156
+ // Write to temporary file (load jobs require a file, not a stream)
157
+ const tempFile = path.join(os.tmpdir(), `bigquery_merge_${Date.now()}_${Math.random().toString(36).substring(7)}.ndjson`);
158
+ const ndjson = validRows.map(r => JSON.stringify(r)).join('\n');
159
+
160
+ try {
161
+ fs.writeFileSync(tempFile, ndjson, 'utf8');
162
+
163
+ // Load into temp table using load job (FREE) from temp file
164
+ // Use createLoadJob to get a Job object we can wait on
165
+ const [loadJob] = await tempTable.createLoadJob(tempFile, {
166
+ sourceFormat: 'NEWLINE_DELIMITED_JSON',
167
+ writeDisposition: 'WRITE_APPEND',
168
+ autodetect: false // Use existing table schema
169
+ });
170
+
171
+ // [FIX] Use native job.promise() instead of custom polling
172
+ // This automatically polls and waits for completion
173
+ await loadJob.promise();
174
+
175
+ // Get job metadata to check for errors
176
+ const [jobMetadata] = await loadJob.getMetadata();
177
+
178
+ // Check for errors
179
+ if (jobMetadata.status?.errorResult) {
180
+ throw new Error(`Load job failed: ${jobMetadata.status.errorResult.message}`);
181
+ }
182
+ } finally {
183
+ // Clean up temp file
184
+ try {
185
+ if (fs.existsSync(tempFile)) {
186
+ fs.unlinkSync(tempFile);
187
+ }
188
+ } catch (cleanupError) {
189
+ if (logger) {
190
+ logger.log('WARN', `[BigQuery] Failed to delete temp file ${tempFile}: ${cleanupError.message}`);
191
+ }
192
+ }
193
+ }
194
+
195
+ if (logger) {
196
+ logger.log('INFO', `[BigQuery] Loaded ${validRows.length} rows into temp table ${tempTableId} using LOAD JOB (free)`);
197
+ }
198
+
199
+ // Use MERGE to insert only new rows (SQL-native deduplication)
200
+ // This is more efficient than checking in JavaScript
201
+ const mergeConditions = keyFields.map(f => `target.${f} = source.${f}`).join(' AND ');
202
+ const mergeQuery = `
203
+ MERGE \`${tablePath}\` AS target
204
+ USING \`${tempTablePath}\` AS source
205
+ ON ${mergeConditions}
206
+ WHEN NOT MATCHED THEN
207
+ INSERT ROW
208
+ `;
209
+
210
+ // Count how many temp-table rows are new BEFORE merging; MERGE itself does not
211
+ // report the number of inserted rows, and after the MERGE every row would already exist
212
+ const insertedCountRows = await query(`
213
+ SELECT COUNT(*) as inserted
214
+ FROM \`${tempTablePath}\` AS source
215
+ WHERE NOT EXISTS (
216
+ SELECT 1 FROM \`${tablePath}\` AS target
217
+ WHERE ${mergeConditions}
218
+ )
219
+ `, {}, logger);
220
+ const rowsInserted = insertedCountRows[0]?.inserted || 0;
221
+
222
+ await query(mergeQuery, {}, logger);
230
+
231
+ // Drop temp table
232
+ await tempTable.delete();
233
+
234
+ if (logger) {
235
+ logger.log('INFO', `[BigQuery] MERGE completed: ${rowsInserted} new rows inserted into ${tablePath} (${validRows.length - rowsInserted} duplicates skipped via SQL)`);
236
+ }
237
+
238
+ return rowsInserted;
239
+ } catch (error) {
240
+ const errorDetails = {
241
+ message: error.message,
242
+ code: error.code,
243
+ errors: error.errors
244
+ };
245
+ if (logger) {
246
+ logger.log('ERROR', `[BigQuery] MERGE failed for ${datasetId}.${tableId}:`, JSON.stringify(errorDetails, null, 2));
247
+ }
248
+ // Clean up the temp table created above, if it got that far
249
+ if (tempTable) {
250
+ await tempTable.delete().catch(() => {}); // Ignore cleanup errors
251
+ }
254
+ throw error;
255
+ }
256
+ }
257
+
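A possible call site for the MERGE-based path, using the `portfolio_snapshots` schema defined later in this file (the module path, dataset literal, and row values are illustrative; how the JSON column is serialized should match whatever the rest of the pipeline writes):

```js
// Sketch: write daily portfolio snapshots while letting MERGE skip rows already present
const bq = require('./bigquery'); // assumed export path
const logger = { log: (level, ...args) => console.log(`[${level}]`, ...args) };

async function savePortfolioSnapshots(snapshots) {
  // snapshots: [{ date: '2024-05-01', user_id: 123, user_type: 'POPULAR_INVESTOR',
  //               portfolio_data: /* JSON column */ {}, fetched_at: new Date().toISOString() }, ...]
  await bq.ensurePortfolioSnapshotsTable(logger);
  // Rows whose (date, user_id, user_type) already exist in the target are skipped by the MERGE
  const inserted = await bq.insertRowsWithMerge(
    'bulltrackers_data', 'portfolio_snapshots', snapshots,
    ['date', 'user_id', 'user_type'], logger);
  console.log(`${inserted} new snapshot rows written`);
}
```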
258
+ /**
259
+ * Insert rows into a BigQuery table using LOAD JOB (FREE, not streaming inserts)
260
+ * @param {string} datasetId - Dataset ID
261
+ * @param {string} tableId - Table ID
262
+ * @param {Array} rows - Array of row objects
263
+ * @param {object} logger - Logger instance
264
+ * @returns {Promise<void>}
265
+ */
266
+ async function insertRows(datasetId, tableId, rows, logger = null) {
267
+ if (!rows || rows.length === 0) {
268
+ if (logger) logger.log('WARN', `[BigQuery] No rows to insert into ${datasetId}.${tableId}`);
269
+ return;
270
+ }
271
+
272
+ const MAX_ROW_SIZE = 9 * 1024 * 1024; // 9MB safety limit (BigQuery limit is 10MB)
273
+ const MAX_LOAD_JOB_SIZE = 100 * 1024 * 1024; // 100MB per load job (BigQuery limit is 10GB, but we batch smaller)
274
+
275
+ try {
276
+ const dataset = await getOrCreateDataset(datasetId, logger);
277
+ const table = dataset.table(tableId);
278
+
279
+ // Filter out rows that are too large
280
+ const validRows = [];
281
+ const skippedRows = [];
282
+
283
+ for (const row of rows) {
284
+ const rowSize = JSON.stringify(row).length;
285
+ if (rowSize > MAX_ROW_SIZE) {
286
+ skippedRows.push({ size: rowSize, row: Object.keys(row) });
287
+ if (logger) {
288
+ logger.log('WARN', `[BigQuery] Skipping row in ${datasetId}.${tableId}: Row too large (${(rowSize/1024/1024).toFixed(2)}MB, limit: ${(MAX_ROW_SIZE/1024/1024).toFixed(2)}MB)`);
289
+ }
290
+ } else {
291
+ validRows.push(row);
292
+ }
293
+ }
294
+
295
+ if (skippedRows.length > 0 && logger) {
296
+ logger.log('WARN', `[BigQuery] Skipped ${skippedRows.length} rows in ${datasetId}.${tableId} due to size limits`);
297
+ }
298
+
299
+ if (validRows.length === 0) {
300
+ if (logger) logger.log('WARN', `[BigQuery] No valid rows to insert into ${datasetId}.${tableId} (all rows too large)`);
301
+ return;
302
+ }
303
+
304
+ // Use LOAD JOBS (free) instead of streaming inserts (expensive)
305
+ // Batch rows into load jobs of reasonable size
306
+ let totalInserted = 0;
307
+ let currentBatch = [];
308
+ let currentBatchSize = 0;
309
+
310
+ for (let i = 0; i < validRows.length; i++) {
311
+ const row = validRows[i];
312
+ const rowSize = JSON.stringify(row).length;
313
+
314
+ // If adding this row would exceed batch size, load current batch first
315
+ if (currentBatch.length > 0 && (currentBatchSize + rowSize) > MAX_LOAD_JOB_SIZE) {
316
+ // Write batch to temporary file (load jobs require a file, not a stream)
317
+ const tempFile = path.join(os.tmpdir(), `bigquery_load_${Date.now()}_${Math.random().toString(36).substring(7)}.ndjson`);
318
+ const ndjson = currentBatch.map(r => JSON.stringify(r)).join('\n');
319
+
320
+ try {
321
+ fs.writeFileSync(tempFile, ndjson, 'utf8');
322
+
323
+ // Load current batch using load job (FREE) from temp file
324
+ // Use createLoadJob to get a Job object we can wait on
325
+ const [job] = await table.createLoadJob(tempFile, {
326
+ sourceFormat: 'NEWLINE_DELIMITED_JSON',
327
+ writeDisposition: 'WRITE_APPEND',
328
+ autodetect: false // Use existing table schema
329
+ });
330
+
331
+ // [FIX] Use native job.promise() instead of custom polling
332
+ // This automatically polls and waits for completion
333
+ await job.promise();
334
+
335
+ // Get job metadata to check for errors and get row count
336
+ const [jobMetadata] = await job.getMetadata();
337
+
338
+ // Check for errors
339
+ if (jobMetadata.status?.errorResult) {
340
+ throw new Error(`Load job failed: ${jobMetadata.status.errorResult.message}`);
341
+ }
342
+
343
+ const rowsLoaded = jobMetadata.statistics?.load?.outputRows || currentBatch.length;
344
+ totalInserted += rowsLoaded;
345
+
346
+ if (logger) {
347
+ logger.log('INFO', `[BigQuery] Load job completed: ${rowsLoaded} rows loaded into ${datasetId}.${tableId}`);
348
+ }
349
+ } finally {
350
+ // Clean up temp file
351
+ try {
352
+ if (fs.existsSync(tempFile)) {
353
+ fs.unlinkSync(tempFile);
354
+ }
355
+ } catch (cleanupError) {
356
+ if (logger) {
357
+ logger.log('WARN', `[BigQuery] Failed to delete temp file ${tempFile}: ${cleanupError.message}`);
358
+ }
359
+ }
360
+ }
361
+
362
+ // Reset batch
363
+ currentBatch = [];
364
+ currentBatchSize = 0;
365
+ }
366
+
367
+ // Add row to current batch
368
+ currentBatch.push(row);
369
+ currentBatchSize += rowSize;
370
+ }
371
+
372
+ // Load remaining batch
373
+ if (currentBatch.length > 0) {
374
+ // Write batch to temporary file (load jobs require a file, not a stream)
375
+ const tempFile = path.join(os.tmpdir(), `bigquery_load_${Date.now()}_${Math.random().toString(36).substring(7)}.ndjson`);
376
+ const ndjson = currentBatch.map(r => JSON.stringify(r)).join('\n');
377
+
378
+ try {
379
+ fs.writeFileSync(tempFile, ndjson, 'utf8');
380
+
381
+ // Load using load job (FREE) from temp file
382
+ // Use createLoadJob to get a Job object we can wait on
383
+ const [job] = await table.createLoadJob(tempFile, {
384
+ sourceFormat: 'NEWLINE_DELIMITED_JSON',
385
+ writeDisposition: 'WRITE_APPEND',
386
+ autodetect: false // Use existing table schema
387
+ });
388
+
389
+ // Wait for the job to complete using the native job.promise() helper,
390
+ // matching the first batch path above (it polls until the job is DONE)
391
+ await job.promise();
392
+
393
+ // Get job metadata to check for errors and get row count
394
+ const [jobMetadata] = await job.getMetadata();
414
+
415
+ // Check for errors
416
+ if (jobMetadata.status?.errorResult) {
417
+ throw new Error(`Load job failed: ${jobMetadata.status.errorResult.message}`);
418
+ }
419
+
420
+ const rowsLoaded = jobMetadata.statistics?.load?.outputRows || currentBatch.length;
421
+ totalInserted += rowsLoaded;
422
+
423
+ if (logger) {
424
+ logger.log('INFO', `[BigQuery] Load job completed: ${rowsLoaded} rows loaded into ${datasetId}.${tableId}`);
425
+ }
426
+ } finally {
427
+ // Clean up temp file
428
+ try {
429
+ if (fs.existsSync(tempFile)) {
430
+ fs.unlinkSync(tempFile);
431
+ }
432
+ } catch (cleanupError) {
433
+ if (logger) {
434
+ logger.log('WARN', `[BigQuery] Failed to delete temp file ${tempFile}: ${cleanupError.message}`);
435
+ }
436
+ }
437
+ }
438
+ }
439
+
440
+ if (logger) {
441
+ logger.log('INFO', `[BigQuery] Loaded ${totalInserted}/${validRows.length} rows into ${datasetId}.${tableId} using LOAD JOBS (free)${skippedRows.length > 0 ? ` (${skippedRows.length} skipped due to size)` : ''}`);
442
+ }
443
+ } catch (error) {
444
+ const errorDetails = {
445
+ message: error.message,
446
+ code: error.code,
447
+ errors: error.errors,
448
+ stack: error.stack
449
+ };
450
+ if (logger) {
451
+ logger.log('ERROR', `[BigQuery] Error loading into ${datasetId}.${tableId}:`, JSON.stringify(errorDetails, null, 2));
452
+ }
453
+ throw error;
454
+ }
455
+ }
456
+
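A usage sketch for the load-job path above, suited to bulk, non-urgent writes (module path, dataset name, and sample values are assumptions):

```js
// Sketch: bulk-write price rows via the free load-job route
const bq = require('./bigquery');
const logger = { log: (level, ...args) => console.log(`[${level}]`, ...args) };

async function storeAssetPrices(priceRows) {
  // priceRows: [{ date: '2024-05-01', instrument_id: 1001, ticker: 'AAPL',
  //               price: 189.98, fetched_at: new Date().toISOString() }, ...]
  await bq.ensureAssetPricesTable(logger);
  // insertRows batches the rows into one or more load jobs and skips rows over ~9MB
  await bq.insertRows('bulltrackers_data', 'asset_prices', priceRows, logger);
}
```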
457
+ /**
458
+ * Insert rows into BigQuery using STREAMING INSERTS (immediate, costs ~$0.05/GB)
459
+ * Use this for time-sensitive data like alert computations
460
+ * @param {string} datasetId - Dataset ID
461
+ * @param {string} tableId - Table ID
462
+ * @param {Array} rows - Array of row objects
463
+ * @param {object} logger - Logger instance
464
+ * @returns {Promise<void>}
465
+ */
466
+ async function insertRowsStreaming(datasetId, tableId, rows, logger = null) {
467
+ if (!rows || rows.length === 0) {
468
+ if (logger) logger.log('WARN', `[BigQuery] No rows to stream into ${datasetId}.${tableId}`);
469
+ return;
470
+ }
471
+
472
+ const MAX_ROW_SIZE = 9 * 1024 * 1024; // 9MB safety limit (BigQuery limit is 10MB)
473
+ const MAX_BATCH_SIZE = 100; // Streaming insert batch size
474
+
475
+ try {
476
+ const dataset = await getOrCreateDataset(datasetId, logger);
477
+ const table = dataset.table(tableId);
478
+
479
+ // Filter out rows that are too large
480
+ const validRows = rows.filter(row => {
481
+ const rowSize = JSON.stringify(row).length;
482
+ return rowSize <= MAX_ROW_SIZE;
483
+ });
484
+
485
+ if (validRows.length === 0) {
486
+ if (logger) logger.log('WARN', `[BigQuery] No valid rows to stream into ${datasetId}.${tableId} (all rows too large)`);
487
+ return;
488
+ }
489
+
490
+ // Stream inserts in batches
491
+ let insertedCount = 0;
492
+ for (let i = 0; i < validRows.length; i += MAX_BATCH_SIZE) {
493
+ const batch = validRows.slice(i, i + MAX_BATCH_SIZE);
494
+
495
+ try {
496
+ const [result] = await table.insert(batch);
497
+
498
+ if (result.insertErrors && result.insertErrors.length > 0) {
499
+ const errors = result.insertErrors.map(e => e.errors).flat();
500
+ if (logger) logger.log('ERROR', `[BigQuery] Streaming insert errors for batch in ${datasetId}.${tableId}:`, errors);
501
+ // Continue with next batch
502
+ } else {
503
+ insertedCount += batch.length;
504
+ }
505
+ } catch (batchError) {
506
+ if (logger) {
507
+ logger.log('WARN', `[BigQuery] Streaming insert batch failed for ${datasetId}.${tableId}: ${batchError.message}`);
508
+ }
509
+ // Continue with next batch
510
+ }
511
+ }
512
+
513
+ if (logger) {
514
+ logger.log('INFO', `[BigQuery] Streamed ${insertedCount}/${validRows.length} rows into ${datasetId}.${tableId} using STREAMING INSERTS`);
515
+ }
516
+ } catch (error) {
517
+ const errorDetails = {
518
+ message: error.message,
519
+ code: error.code,
520
+ errors: error.errors
521
+ };
522
+ if (logger) {
523
+ logger.log('ERROR', `[BigQuery] Error streaming into ${datasetId}.${tableId}:`, JSON.stringify(errorDetails, null, 2));
524
+ }
525
+ throw error;
526
+ }
527
+ }
528
+
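For time-sensitive output such as alert computations, the streaming path trades cost for latency. A hedged sketch follows; the computation name is hypothetical, and the exact serialization expected for the JSON `result_data` column may need adjusting to match the rest of the pipeline:

```js
// Sketch: stream an alert result so it is queryable immediately
const bq = require('./bigquery');
const logger = { log: (level, ...args) => console.log(`[${level}]`, ...args) };

async function publishAlertResult(dateStr, resultData) {
  await bq.ensureComputationResultsTable(logger);
  await bq.insertRowsStreaming('bulltrackers_data', 'computation_results', [{
    date: dateStr,
    computation_name: 'price_spike_alert', // hypothetical computation name
    category: 'alerts',
    result_data: resultData,               // JSON column
    created_at: new Date().toISOString()
  }], logger);
}
```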
529
+ /**
530
+ * Query BigQuery and return results
531
+ * @param {string} query - SQL query string
532
+ * @param {object} options - Query options
533
+ * @param {object} logger - Logger instance
534
+ * @returns {Promise<Array>} Array of row objects
535
+ */
536
+ async function query(query, options = {}, logger = null) {
537
+ const bigquery = getBigQueryClient();
538
+
539
+ try {
540
+ const [rows] = await bigquery.query({
541
+ query: query,
542
+ location: 'europe-west1',
543
+ ...options
544
+ });
545
+
546
+ if (logger) logger.log('INFO', `[BigQuery] Query returned ${rows.length} rows`);
547
+ return rows;
548
+ } catch (error) {
549
+ if (logger) logger.log('ERROR', `[BigQuery] Query error: ${error.message}`);
550
+ throw error;
551
+ }
552
+ }
553
+
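Because query() spreads its options object straight into bigquery.query(), named parameters pass through unchanged; note also that it returns the rows array directly (no extra destructuring needed). A minimal sketch, assuming the module is exported as `./bigquery`:

```js
// Sketch: parameterized read through the shared query() helper
const bq = require('./bigquery');

async function countSnapshotsSince(dateStr) {
  const rows = await bq.query(
    `SELECT COUNT(*) AS cnt
       FROM \`bulltrackers_data.portfolio_snapshots\`
      WHERE date >= @since`,
    { params: { since: dateStr } } // forwarded to bigquery.query()
  );
  return rows[0]?.cnt || 0;
}
```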
554
+ /**
555
+ * Schema definitions for BullTrackers tables
556
+ */
557
+ const SCHEMAS = {
558
+ computation_results: [
559
+ { name: 'date', type: 'DATE', mode: 'REQUIRED' },
560
+ { name: 'computation_name', type: 'STRING', mode: 'REQUIRED' },
561
+ { name: 'category', type: 'STRING', mode: 'REQUIRED' },
562
+ { name: 'result_data', type: 'JSON', mode: 'NULLABLE' },
563
+ { name: 'metadata', type: 'JSON', mode: 'NULLABLE' },
564
+ { name: 'created_at', type: 'TIMESTAMP', mode: 'REQUIRED' }
565
+ ],
566
+ portfolio_snapshots: [
567
+ { name: 'date', type: 'DATE', mode: 'REQUIRED' },
568
+ { name: 'user_id', type: 'INT64', mode: 'REQUIRED' },
569
+ { name: 'user_type', type: 'STRING', mode: 'REQUIRED' },
570
+ { name: 'portfolio_data', type: 'JSON', mode: 'NULLABLE' },
571
+ { name: 'fetched_at', type: 'TIMESTAMP', mode: 'REQUIRED' }
572
+ ],
573
+ trade_history_snapshots: [
574
+ { name: 'date', type: 'DATE', mode: 'REQUIRED' },
575
+ { name: 'user_id', type: 'INT64', mode: 'REQUIRED' },
576
+ { name: 'user_type', type: 'STRING', mode: 'REQUIRED' },
577
+ { name: 'history_data', type: 'JSON', mode: 'NULLABLE' },
578
+ { name: 'fetched_at', type: 'TIMESTAMP', mode: 'REQUIRED' }
579
+ ],
580
+ social_post_snapshots: [
581
+ { name: 'date', type: 'DATE', mode: 'REQUIRED' },
582
+ { name: 'user_id', type: 'INT64', mode: 'REQUIRED' },
583
+ { name: 'user_type', type: 'STRING', mode: 'REQUIRED' },
584
+ { name: 'posts_data', type: 'JSON', mode: 'NULLABLE' },
585
+ { name: 'fetched_at', type: 'TIMESTAMP', mode: 'REQUIRED' }
586
+ ],
587
+ asset_prices: [
588
+ { name: 'date', type: 'DATE', mode: 'REQUIRED' },
589
+ { name: 'instrument_id', type: 'INT64', mode: 'REQUIRED' },
590
+ { name: 'ticker', type: 'STRING', mode: 'REQUIRED' },
591
+ { name: 'price', type: 'FLOAT64', mode: 'REQUIRED' },
592
+ { name: 'open', type: 'FLOAT64', mode: 'NULLABLE' },
593
+ { name: 'high', type: 'FLOAT64', mode: 'NULLABLE' },
594
+ { name: 'low', type: 'FLOAT64', mode: 'NULLABLE' },
595
+ { name: 'close', type: 'FLOAT64', mode: 'NULLABLE' },
596
+ { name: 'volume', type: 'FLOAT64', mode: 'NULLABLE' },
597
+ { name: 'fetched_at', type: 'TIMESTAMP', mode: 'REQUIRED' }
598
+ ],
599
+ pi_master_list: [
600
+ { name: 'cid', type: 'INT64', mode: 'REQUIRED' },
601
+ { name: 'username', type: 'STRING', mode: 'REQUIRED' },
602
+ { name: 'first_seen_at', type: 'TIMESTAMP', mode: 'REQUIRED' },
603
+ { name: 'last_seen_at', type: 'TIMESTAMP', mode: 'REQUIRED' },
604
+ { name: 'last_updated', type: 'TIMESTAMP', mode: 'REQUIRED' }
605
+ ],
606
+ pi_rankings: [
607
+ { name: 'date', type: 'DATE', mode: 'REQUIRED' },
608
+ { name: 'pi_id', type: 'INT64', mode: 'REQUIRED' },
609
+ { name: 'username', type: 'STRING', mode: 'REQUIRED' },
610
+ { name: 'rank', type: 'INT64', mode: 'NULLABLE' },
611
+ { name: 'category', type: 'STRING', mode: 'NULLABLE' },
612
+ { name: 'rankings_data', type: 'JSON', mode: 'NULLABLE' },
613
+ { name: 'fetched_at', type: 'TIMESTAMP', mode: 'REQUIRED' }
614
+ ],
615
+ instrument_insights: [
616
+ { name: 'date', type: 'DATE', mode: 'REQUIRED' },
617
+ { name: 'instrument_id', type: 'INT64', mode: 'REQUIRED' },
618
+ { name: 'insights_data', type: 'JSON', mode: 'REQUIRED' },
619
+ { name: 'fetched_at', type: 'TIMESTAMP', mode: 'REQUIRED' }
620
+ ]
621
+ };
622
+
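For reference, a row shaped to one of the schemas above looks like the following (all values are made up; `./bigquery` is an assumed export path):

```js
const { getSchema } = require('./bigquery'); // assumed export path

// A row conforming to the asset_prices schema defined above
const exampleAssetPriceRow = {
  date: '2024-05-01',
  instrument_id: 1001,
  ticker: 'AAPL',
  price: 189.98,
  open: 188.50,
  high: 190.20,
  low: 187.90,
  close: 189.98,
  volume: 51234567,
  fetched_at: new Date().toISOString()
};

// getSchema() simply looks a table up in SCHEMAS
const assetPriceSchema = getSchema('asset_prices'); // -> the schema array above, or null for unknown names
```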
623
+ /**
624
+ * Get schema for a table
625
+ * @param {string} tableName - Table name
626
+ * @returns {Array} Schema array
627
+ */
628
+ function getSchema(tableName) {
629
+ return SCHEMAS[tableName] || null;
630
+ }
631
+
632
+ /**
633
+ * Ensure computation_results table exists
634
+ * @param {object} logger - Logger instance
635
+ * @returns {Promise<Table>}
636
+ */
637
+ async function ensureComputationResultsTable(logger = null) {
638
+ const datasetId = process.env.BIGQUERY_DATASET_ID || 'bulltrackers_data';
639
+ const tableId = 'computation_results';
640
+ const schema = getSchema(tableId);
641
+
642
+ return await ensureTableExists(
643
+ datasetId,
644
+ tableId,
645
+ schema,
646
+ {
647
+ partitionField: 'date',
648
+ clusterFields: ['computation_name', 'category']
649
+ },
650
+ logger
651
+ );
652
+ }
653
+
654
+ /**
655
+ * Ensure portfolio_snapshots table exists
656
+ * @param {object} logger - Logger instance
657
+ * @returns {Promise<Table>}
658
+ */
659
+ async function ensurePortfolioSnapshotsTable(logger = null) {
660
+ const datasetId = process.env.BIGQUERY_DATASET_ID || 'bulltrackers_data';
661
+ const tableId = 'portfolio_snapshots';
662
+ const schema = getSchema(tableId);
663
+
664
+ return await ensureTableExists(
665
+ datasetId,
666
+ tableId,
667
+ schema,
668
+ {
669
+ partitionField: 'date',
670
+ clusterFields: ['user_type', 'user_id']
671
+ },
672
+ logger
673
+ );
674
+ }
675
+
676
+ /**
677
+ * Ensure trade_history_snapshots table exists
678
+ * @param {object} logger - Logger instance
679
+ * @returns {Promise<Table>}
680
+ */
681
+ async function ensureTradeHistorySnapshotsTable(logger = null) {
682
+ const datasetId = process.env.BIGQUERY_DATASET_ID || 'bulltrackers_data';
683
+ const tableId = 'trade_history_snapshots';
684
+ const schema = getSchema(tableId);
685
+
686
+ return await ensureTableExists(
687
+ datasetId,
688
+ tableId,
689
+ schema,
690
+ {
691
+ partitionField: 'date',
692
+ clusterFields: ['user_type', 'user_id']
693
+ },
694
+ logger
695
+ );
696
+ }
697
+
698
+ /**
699
+ * Ensure social_post_snapshots table exists
700
+ * @param {object} logger - Logger instance
701
+ * @returns {Promise<Table>}
702
+ */
703
+ async function ensureSocialPostSnapshotsTable(logger = null) {
704
+ const datasetId = process.env.BIGQUERY_DATASET_ID || 'bulltrackers_data';
705
+ const tableId = 'social_post_snapshots';
706
+ const schema = getSchema(tableId);
707
+
708
+ return await ensureTableExists(
709
+ datasetId,
710
+ tableId,
711
+ schema,
712
+ {
713
+ partitionField: 'date',
714
+ clusterFields: ['user_type', 'user_id']
715
+ },
716
+ logger
717
+ );
718
+ }
719
+
720
+ /**
721
+ * Ensure asset_prices table exists
722
+ * @param {object} logger - Logger instance
723
+ * @returns {Promise<Table>}
724
+ */
725
+ async function ensureAssetPricesTable(logger = null) {
726
+ const datasetId = process.env.BIGQUERY_DATASET_ID || 'bulltrackers_data';
727
+ const tableId = 'asset_prices';
728
+ const schema = getSchema(tableId);
729
+
730
+ return await ensureTableExists(
731
+ datasetId,
732
+ tableId,
733
+ schema,
734
+ {
735
+ partitionField: 'date',
736
+ clusterFields: ['ticker', 'instrument_id']
737
+ },
738
+ logger
739
+ );
740
+ }
741
+
742
+ /**
743
+ * Ensure pi_master_list table exists
744
+ * @param {object} logger - Logger instance
745
+ * @returns {Promise<Table>}
746
+ */
747
+ async function ensurePIMasterListTable(logger = null) {
748
+ const datasetId = process.env.BIGQUERY_DATASET_ID || 'bulltrackers_data';
749
+ const tableId = 'pi_master_list';
750
+ const schema = getSchema(tableId);
751
+
752
+ return await ensureTableExists(
753
+ datasetId,
754
+ tableId,
755
+ schema,
756
+ {
757
+ clusterFields: ['cid']
758
+ },
759
+ logger
760
+ );
761
+ }
762
+
763
+ /**
764
+ * Ensure pi_rankings table exists
765
+ * @param {object} logger - Logger instance
766
+ * @returns {Promise<Table>}
767
+ */
768
+ async function ensurePIRankingsTable(logger = null) {
769
+ const datasetId = process.env.BIGQUERY_DATASET_ID || 'bulltrackers_data';
770
+ const tableId = 'pi_rankings';
771
+ const schema = getSchema(tableId);
772
+
773
+ return await ensureTableExists(
774
+ datasetId,
775
+ tableId,
776
+ schema,
777
+ {
778
+ partitionField: 'date',
779
+ clusterFields: ['pi_id', 'category']
780
+ },
781
+ logger
782
+ );
783
+ }
784
+
785
+ /**
786
+ * Ensure instrument_insights table exists
787
+ * @param {object} logger - Logger instance
788
+ * @returns {Promise<Table>}
789
+ */
790
+ async function ensureInstrumentInsightsTable(logger = null) {
791
+ const datasetId = process.env.BIGQUERY_DATASET_ID || 'bulltrackers_data';
792
+ const tableId = 'instrument_insights';
793
+ const schema = getSchema(tableId);
794
+
795
+ return await ensureTableExists(
796
+ datasetId,
797
+ tableId,
798
+ schema,
799
+ {
800
+ partitionField: 'date',
801
+ clusterFields: ['instrument_id']
802
+ },
803
+ logger
804
+ );
805
+ }
806
+
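The per-table ensure helpers above can be run together at startup; a sketch (function name and call order are illustrative, and the loop is sequential so the first call can create the shared dataset before the others check it):

```js
// Sketch: provision every BullTrackers table once at service startup
const bq = require('./bigquery');

async function ensureAllTables(logger) {
  const ensurers = [
    bq.ensureComputationResultsTable,
    bq.ensurePortfolioSnapshotsTable,
    bq.ensureTradeHistorySnapshotsTable,
    bq.ensureSocialPostSnapshotsTable,
    bq.ensureAssetPricesTable,
    bq.ensurePIMasterListTable,
    bq.ensurePIRankingsTable,
    bq.ensureInstrumentInsightsTable
  ];
  for (const ensure of ensurers) {
    await ensure(logger);
  }
}
```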
807
+ /**
808
+ * Query portfolio data from BigQuery
809
+ * @param {string} dateStr - Date string (YYYY-MM-DD)
810
+ * @param {Array} userIds - Optional array of user IDs to filter
811
+ * @param {Array} userTypes - Optional array of user types to filter (e.g., ['POPULAR_INVESTOR', 'SIGNED_IN_USER'])
812
+ * @param {object} logger - Logger instance
813
+ * @returns {Promise<Object>} Map of user_id -> { portfolio_data, user_type, fetched_at }, or null if no data/error
814
+ */
815
+ async function queryPortfolioData(dateStr, userIds = null, userTypes = null, logger = null) {
816
+ if (process.env.BIGQUERY_ENABLED === 'false') {
817
+ if (logger) logger.log('DEBUG', '[BigQuery] Portfolio query skipped (BIGQUERY_ENABLED=false)');
818
+ return null;
819
+ }
820
+
821
+ const datasetId = process.env.BIGQUERY_DATASET_ID || 'bulltrackers_data';
822
+ const tablePath = `${datasetId}.portfolio_snapshots`;
823
+
824
+ try {
825
+ // Build WHERE clause
826
+ const conditions = [`date = '${dateStr}'`];
827
+
828
+ if (userIds && userIds.length > 0) {
829
+ const userIdList = userIds.map(id => String(id)).join(',');
830
+ conditions.push(`user_id IN (${userIdList})`);
831
+ }
832
+
833
+ if (userTypes && userTypes.length > 0) {
834
+ const typeList = userTypes.map(t => `'${t.toUpperCase()}'`).join(',');
835
+ conditions.push(`user_type IN (${typeList})`);
836
+ }
837
+
838
+ const whereClause = conditions.join(' AND ');
839
+
840
+ const sqlQuery = `
841
+ SELECT
842
+ user_id,
843
+ user_type,
844
+ portfolio_data,
845
+ fetched_at
846
+ FROM \`${tablePath}\`
847
+ WHERE ${whereClause}
848
+ `;
849
+
850
+ if (logger) {
851
+ logger.log('INFO', `[BigQuery] 🔍 Querying portfolio data from ${tablePath} for date ${dateStr}${userTypes ? ` (types: ${userTypes.join(',')})` : ''}${userIds ? ` (${userIds.length} users)` : ''}`);
852
+ }
853
+
854
+ const rows = await query(sqlQuery, {}, logger);
855
+
856
+ if (!rows || rows.length === 0) {
857
+ if (logger) logger.log('INFO', `[BigQuery] No portfolio data found in ${tablePath} for ${dateStr}`);
858
+ return null;
859
+ }
860
+
861
+ // Transform to map: user_id -> { portfolio_data, user_type, fetched_at }
862
+ const result = {};
863
+ for (const row of rows) {
864
+ result[String(row.user_id)] = {
865
+ portfolio_data: row.portfolio_data || {},
866
+ user_type: row.user_type,
867
+ fetched_at: row.fetched_at
868
+ };
869
+ }
870
+
871
+ if (logger) {
872
+ logger.log('INFO', `[BigQuery] ✅ Retrieved ${rows.length} portfolio records from ${tablePath} for ${dateStr}`);
873
+ }
874
+
875
+ return result;
876
+ } catch (error) {
877
+ if (logger) {
878
+ logger.log('WARN', `[BigQuery] Portfolio query failed for ${tablePath} (${dateStr}): ${error.message}`);
879
+ }
880
+ return null; // Return null to trigger Firestore fallback
881
+ }
882
+ }
883
+
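Since this function returns null on a miss or error specifically to trigger a fallback, a caller might look like the sketch below; `loadPortfoliosFromFirestore` is a hypothetical stand-in for the real Firestore read, stubbed here so the example is self-contained:

```js
// Sketch: BigQuery first, fall back to Firestore when the query returns null
const bq = require('./bigquery');

async function loadPortfoliosFromFirestore(dateStr, userIds) {
  // placeholder for the real Firestore read used elsewhere in the pipeline
  return {};
}

async function getPortfolios(dateStr, userIds, logger) {
  const fromBq = await bq.queryPortfolioData(dateStr, userIds, ['POPULAR_INVESTOR'], logger);
  if (fromBq) return fromBq; // { "<user_id>": { portfolio_data, user_type, fetched_at }, ... }
  return loadPortfoliosFromFirestore(dateStr, userIds);
}
```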
884
+ /**
885
+ * Query trade history data from BigQuery
886
+ * @param {string} dateStr - Date string (YYYY-MM-DD)
887
+ * @param {Array} userIds - Optional array of user IDs to filter
888
+ * @param {Array} userTypes - Optional array of user types to filter
889
+ * @param {object} logger - Logger instance
890
+ * @returns {Promise<Object>} Map of user_id -> { history_data, user_type, fetched_at }, or null if no data/error
891
+ */
892
+ async function queryHistoryData(dateStr, userIds = null, userTypes = null, logger = null) {
893
+ if (process.env.BIGQUERY_ENABLED === 'false') {
894
+ if (logger) logger.log('DEBUG', '[BigQuery] History query skipped (BIGQUERY_ENABLED=false)');
895
+ return null;
896
+ }
897
+
898
+ const datasetId = process.env.BIGQUERY_DATASET_ID || 'bulltrackers_data';
899
+ const tablePath = `${datasetId}.trade_history_snapshots`;
900
+
901
+ try {
902
+ const conditions = [`date = '${dateStr}'`];
903
+
904
+ if (userIds && userIds.length > 0) {
905
+ const userIdList = userIds.map(id => String(id)).join(',');
906
+ conditions.push(`user_id IN (${userIdList})`);
907
+ }
908
+
909
+ if (userTypes && userTypes.length > 0) {
910
+ const typeList = userTypes.map(t => `'${t.toUpperCase()}'`).join(',');
911
+ conditions.push(`user_type IN (${typeList})`);
912
+ }
913
+
914
+ const whereClause = conditions.join(' AND ');
915
+
916
+ const sqlQuery = `
917
+ SELECT
918
+ user_id,
919
+ user_type,
920
+ history_data,
921
+ fetched_at
922
+ FROM \`${tablePath}\`
923
+ WHERE ${whereClause}
924
+ `;
925
+
926
+ if (logger) {
927
+ logger.log('INFO', `[BigQuery] 🔍 Querying trade history from ${tablePath} for date ${dateStr}${userTypes ? ` (types: ${userTypes.join(',')})` : ''}${userIds ? ` (${userIds.length} users)` : ''}`);
928
+ }
929
+
930
+ const rows = await query(sqlQuery, {}, logger);
931
+
932
+ if (!rows || rows.length === 0) {
933
+ if (logger) logger.log('INFO', `[BigQuery] No history data found in ${tablePath} for ${dateStr}`);
934
+ return null;
935
+ }
936
+
937
+ const result = {};
938
+ for (const row of rows) {
939
+ result[String(row.user_id)] = {
940
+ history_data: row.history_data || {},
941
+ user_type: row.user_type,
942
+ fetched_at: row.fetched_at
943
+ };
944
+ }
945
+
946
+ if (logger) {
947
+ logger.log('INFO', `[BigQuery] ✅ Retrieved ${rows.length} history records from ${tablePath} for ${dateStr}`);
948
+ }
949
+
950
+ return result;
951
+ } catch (error) {
952
+ if (logger) {
953
+ logger.log('WARN', `[BigQuery] History query failed for ${tablePath} (${dateStr}): ${error.message}`);
954
+ }
955
+ return null;
956
+ }
957
+ }
958
+
959
+ /**
960
+ * Check which rows already exist in BigQuery (for deduplication)
961
+ * @param {string} datasetId - Dataset ID
962
+ * @param {string} tableId - Table ID
963
+ * @param {string} dateStr - Date string (YYYY-MM-DD)
964
+ * @param {Array} rows - Array of rows to check (must have user_id and user_type)
965
+ * @param {object} logger - Logger instance
966
+ * @returns {Promise<Set>} Set of existing keys as "user_id|user_type" (scoped to the given date)
967
+ */
968
+ async function checkExistingRows(datasetId, tableId, dateStr, rows, logger = null) {
969
+ if (process.env.BIGQUERY_ENABLED === 'false' || !rows || rows.length === 0) {
970
+ return new Set();
971
+ }
972
+
973
+ try {
974
+ const tablePath = `${datasetId}.${tableId}`;
975
+
976
+ // Extract unique user_id and user_type combinations
977
+ const userKeys = new Set();
978
+ for (const row of rows) {
979
+ if (row.user_id && row.user_type) {
980
+ userKeys.add(`${row.user_id}|${row.user_type}`);
981
+ }
982
+ }
983
+
984
+ if (userKeys.size === 0) {
985
+ return new Set();
986
+ }
987
+
988
+ // Build WHERE clause for user combinations
989
+ const conditions = [`date = '${dateStr}'`];
990
+ const userConditions = [];
991
+ for (const key of userKeys) {
992
+ const [userId, userType] = key.split('|');
993
+ userConditions.push(`(user_id = ${userId} AND user_type = '${userType}')`);
994
+ }
995
+ conditions.push(`(${userConditions.join(' OR ')})`);
996
+
997
+ const whereClause = conditions.join(' AND ');
998
+
999
+ const sqlQuery = `
1000
+ SELECT
1001
+ user_id,
1002
+ user_type
1003
+ FROM \`${tablePath}\`
1004
+ WHERE ${whereClause}
1005
+ `;
1006
+
1007
+ if (logger) {
1008
+ logger.log('DEBUG', `[BigQuery] Checking for existing rows in ${tablePath} for ${dateStr} (${userKeys.size} unique users)`);
1009
+ }
1010
+
1011
+ const existingRows = await query(sqlQuery, {}, logger);
1012
+
1013
+ // Build set of existing keys
1014
+ const existingKeys = new Set();
1015
+ for (const row of existingRows) {
1016
+ existingKeys.add(`${row.user_id}|${row.user_type}`);
1017
+ }
1018
+
1019
+ if (logger && existingKeys.size > 0) {
1020
+ logger.log('INFO', `[BigQuery] Found ${existingKeys.size} existing rows in ${tablePath} for ${dateStr}, will skip duplicates`);
1021
+ }
1022
+
1023
+ return existingKeys;
1024
+ } catch (error) {
1025
+ if (logger) {
1026
+ logger.log('WARN', `[BigQuery] Error checking existing rows in ${datasetId}.${tableId}: ${error.message}`);
1027
+ }
1028
+ // On error, return empty set (will attempt insert, might create duplicates but safer than skipping)
1029
+ return new Set();
1030
+ }
1031
+ }
1032
+
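A sketch of the intended pairing with the load-job writer: filter out rows whose key already exists for the day, then load only the remainder (dataset and table literals are illustrative):

```js
// Sketch: pre-filter against existing (user_id, user_type) keys for the day, then load the rest
const bq = require('./bigquery');

async function appendNewSnapshots(dateStr, rows, logger) {
  const existing = await bq.checkExistingRows(
    'bulltrackers_data', 'portfolio_snapshots', dateStr, rows, logger);
  const fresh = rows.filter(r => !existing.has(`${r.user_id}|${r.user_type}`));
  if (fresh.length > 0) {
    await bq.insertRows('bulltrackers_data', 'portfolio_snapshots', fresh, logger);
  }
  return fresh.length;
}
```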
1033
+ /**
1034
+ * Query social post data from BigQuery
1035
+ * @param {string} dateStr - Date string (YYYY-MM-DD)
1036
+ * @param {Array} userIds - Optional array of user IDs to filter
1037
+ * @param {Array} userTypes - Optional array of user types to filter
1038
+ * @param {object} logger - Logger instance
1039
+ * @returns {Promise<Object>} Map of user_id -> { posts_data, user_type, fetched_at }, or null if no data/error
1040
+ */
1041
+ async function querySocialData(dateStr, userIds = null, userTypes = null, logger = null) {
1042
+ if (process.env.BIGQUERY_ENABLED === 'false') {
1043
+ if (logger) logger.log('DEBUG', '[BigQuery] Social query skipped (BIGQUERY_ENABLED=false)');
1044
+ return null;
1045
+ }
1046
+
1047
+ const datasetId = process.env.BIGQUERY_DATASET_ID || 'bulltrackers_data';
1048
+ const tablePath = `${datasetId}.social_post_snapshots`;
1049
+
1050
+ try {
1051
+ const conditions = [`date = '${dateStr}'`];
1052
+
1053
+ if (userIds && userIds.length > 0) {
1054
+ const userIdList = userIds.map(id => String(id)).join(',');
1055
+ conditions.push(`user_id IN (${userIdList})`);
1056
+ }
1057
+
1058
+ if (userTypes && userTypes.length > 0) {
1059
+ const typeList = userTypes.map(t => `'${t.toUpperCase()}'`).join(',');
1060
+ conditions.push(`user_type IN (${typeList})`);
1061
+ }
1062
+
1063
+ const whereClause = conditions.join(' AND ');
1064
+
1065
+ const sqlQuery = `
1066
+ SELECT
1067
+ user_id,
1068
+ user_type,
1069
+ posts_data,
1070
+ fetched_at
1071
+ FROM \`${tablePath}\`
1072
+ WHERE ${whereClause}
1073
+ `;
1074
+
1075
+ if (logger) {
1076
+ logger.log('INFO', `[BigQuery] 🔍 Querying social posts from ${tablePath} for date ${dateStr}${userTypes ? ` (types: ${userTypes.join(',')})` : ''}${userIds ? ` (${userIds.length} users)` : ''}`);
1077
+ }
1078
+
1079
+ const rows = await query(sqlQuery, {}, logger);
1080
+
1081
+ if (!rows || rows.length === 0) {
1082
+ if (logger) logger.log('INFO', `[BigQuery] No social data found in ${tablePath} for ${dateStr}`);
1083
+ return null;
1084
+ }
1085
+
1086
+ const result = {};
1087
+ for (const row of rows) {
1088
+ result[String(row.user_id)] = {
1089
+ posts_data: row.posts_data || {},
1090
+ user_type: row.user_type,
1091
+ fetched_at: row.fetched_at
1092
+ };
1093
+ }
1094
+
1095
+ if (logger) {
1096
+ logger.log('INFO', `[BigQuery] ✅ Retrieved ${rows.length} social post records from ${tablePath} for ${dateStr}`);
1097
+ }
1098
+
1099
+ return result;
1100
+ } catch (error) {
1101
+ if (logger) {
1102
+ logger.log('WARN', `[BigQuery] Social query failed for ${tablePath} (${dateStr}): ${error.message}`);
1103
+ }
1104
+ return null;
1105
+ }
1106
+ }
1107
+
1108
+ /**
1109
+ * Query a single computation result from BigQuery for a specific date
1110
+ * @param {string} computationName - Computation name
1111
+ * @param {string} category - Category (e.g., 'popular-investor', 'alerts')
1112
+ * @param {string} dateStr - Date (YYYY-MM-DD)
1113
+ * @param {object} logger - Logger instance
1114
+ * @returns {Promise<object|null>} Result data object, or null if not found/error
1115
+ */
1116
+ async function queryComputationResult(computationName, category, dateStr, logger = null) {
1117
+ if (process.env.BIGQUERY_ENABLED === 'false') {
1118
+ if (logger) logger.log('DEBUG', '[BigQuery] Computation result query skipped (BIGQUERY_ENABLED=false)');
1119
+ return null;
1120
+ }
1121
+
1122
+ const datasetId = process.env.BIGQUERY_DATASET_ID || 'bulltrackers_data';
1123
+ const tablePath = `${datasetId}.computation_results`;
1124
+
1125
+ try {
1126
+ const sqlQuery = `
1127
+ SELECT result_data
1128
+ FROM \`${tablePath}\`
1129
+ WHERE date = '${dateStr}'
1130
+ AND computation_name = '${computationName}'
1131
+ AND category = '${category}'
1132
+ ORDER BY created_at DESC
1133
+ LIMIT 1
1134
+ `;
1135
+
1136
+ const rows = await query(sqlQuery, {}, logger);
1137
+
1138
+ if (!rows || rows.length === 0) {
1139
+ if (logger) logger.log('DEBUG', `[BigQuery] No computation result found for ${computationName} (${dateStr}, ${category})`);
1140
+ return null;
1141
+ }
1142
+
1143
+ const result = rows[0].result_data;
1144
+ if (logger) logger.log('INFO', `[BigQuery] ✅ Retrieved computation result for ${computationName} (${dateStr})`);
1145
+ return result;
1146
+ } catch (error) {
1147
+ if (logger) {
1148
+ logger.log('WARN', `[BigQuery] Computation result query failed for ${computationName} (${dateStr}): ${error.message}`);
1149
+ }
1150
+ return null;
1151
+ }
1152
+ }
1153
+
1154
+ /**
1155
+ * Query computation results from BigQuery for a date range
1156
+ * @param {string} computationName - Computation name
1157
+ * @param {string} category - Category (e.g., 'popular-investor', 'alerts')
1158
+ * @param {string} startDateStr - Start date (YYYY-MM-DD)
1159
+ * @param {string} endDateStr - End date (YYYY-MM-DD)
1160
+ * @param {object} logger - Logger instance
1161
+ * @returns {Promise<Array>} Array of {date, result_data} objects, or null if error
1162
+ */
1163
+ async function queryComputationResultsRange(computationName, category, startDateStr, endDateStr, logger = null) {
1164
+ if (process.env.BIGQUERY_ENABLED === 'false') {
1165
+ if (logger) logger.log('DEBUG', '[BigQuery] Computation results range query skipped (BIGQUERY_ENABLED=false)');
1166
+ return null;
1167
+ }
1168
+
1169
+ const datasetId = process.env.BIGQUERY_DATASET_ID || 'bulltrackers_data';
1170
+ const tablePath = `${datasetId}.computation_results`;
1171
+
1172
+ try {
1173
+ // [FIX] Use parameterized queries to prevent SQL injection
1174
+ const sqlQuery = `
1175
+ SELECT
1176
+ date,
1177
+ result_data,
1178
+ category
1179
+ FROM \`${tablePath}\`
1180
+ WHERE computation_name = @computationName
1181
+ AND category = @category
1182
+ AND date BETWEEN @startDate AND @endDate
1183
+ ORDER BY date DESC
1184
+ `;
1185
+
1186
+ if (logger) {
1187
+ logger.log('INFO', `[BigQuery] 🔍 Querying computation results from ${tablePath} for ${computationName} (${category}) from ${startDateStr} to ${endDateStr}`);
1188
+ }
1189
+
1190
+ const rows = await query(sqlQuery, {
1191
+ params: {
1192
+ computationName: computationName,
1193
+ category: category,
1194
+ startDate: startDateStr,
1195
+ endDate: endDateStr
1196
+ }
1197
+ }, logger);
1198
+
1199
+ if (!rows || rows.length === 0) {
1200
+ if (logger) logger.log('INFO', `[BigQuery] No computation results found in ${tablePath} for ${computationName} in date range`);
1201
+ return [];
1202
+ }
1203
+
1204
+ if (logger) {
1205
+ logger.log('INFO', `[BigQuery] ✅ Retrieved ${rows.length} computation result records from ${tablePath} for ${computationName}`);
1206
+ }
1207
+
1208
+ return rows.map(row => ({
1209
+ date: row.date,
1210
+ data: row.result_data || {},
1211
+ category: row.category
1212
+ }));
1213
+ } catch (error) {
1214
+ if (logger) {
1215
+ logger.log('WARN', `[BigQuery] Computation results range query failed for ${tablePath}: ${error.message}`);
1216
+ }
1217
+ return null;
1218
+ }
1219
+ }
1220
+
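A possible caller that pulls a trailing window of results (the computation name is hypothetical; date handling is a plain UTC sketch):

```js
// Sketch: read the last 7 days of one computation's output
const bq = require('./bigquery');

async function lastWeekOfResults(logger) {
  const end = new Date();
  const start = new Date(end.getTime() - 7 * 24 * 60 * 60 * 1000);
  const toYmd = d => d.toISOString().slice(0, 10);
  const results = await bq.queryComputationResultsRange(
    'copy_trader_stats',   // hypothetical computation_name
    'popular-investor',
    toYmd(start), toYmd(end), logger);
  // results: [{ date, data, category }, ...] newest first, [] when nothing matched, null on error
  return results;
}
```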
1221
+ /**
1222
+ * Remove duplicate rows from a BigQuery table (keeps the most recent row per unique key)
1223
+ * This is cheaper than checking duplicates before each insert for large backfills
1224
+ * Uses CREATE OR REPLACE to overwrite the table with deduplicated data
1225
+ * @param {string} datasetId - Dataset ID
1226
+ * @param {string} tableId - Table ID
1227
+ * @param {string} dateField - Date field name (e.g., 'date')
1228
+ * @param {Array} keyFields - Array of field names that form the unique key (e.g., ['user_id', 'user_type'])
1229
+ * @param {object} logger - Logger instance
1230
+ * @returns {Promise<number>} Number of duplicates removed
1231
+ */
1232
+ async function removeDuplicates(datasetId, tableId, dateField, keyFields, logger = null) {
1233
+ if (process.env.BIGQUERY_ENABLED === 'false') {
1234
+ if (logger) logger.log('DEBUG', '[BigQuery] Deduplication skipped (BIGQUERY_ENABLED=false)');
1235
+ return 0;
1236
+ }
1237
+
1238
+ try {
1239
+ const tablePath = `${datasetId}.${tableId}`;
1240
+ const keyFieldsStr = keyFields.join(', ');
1241
+
1242
+ if (logger) {
1243
+ logger.log('INFO', `[BigQuery] 🔄 Deduplicating ${tablePath} by (${keyFieldsStr})`);
1244
+ }
1245
+
1246
+ // Get original count
1247
+ const originalCountRows = await query(`SELECT COUNT(*) as cnt FROM \`${tablePath}\``, {}, logger);
1248
+ const originalCount = originalCountRows[0]?.cnt || 0;
1249
+
1250
+ // Check for duplicates before deduplication (for logging)
1251
+ // Use CONCAT to create a composite key for COUNT(DISTINCT)
1252
+ const keyConcat = `${dateField}, '-', ${keyFieldsStr.split(', ').join(", '-', ")}`;
1253
+ const duplicateCheckQuery = `
1254
+ SELECT
1255
+ COUNT(*) as total_rows,
1256
+ COUNT(DISTINCT CONCAT(${keyConcat})) as unique_keys
1257
+ FROM \`${tablePath}\`
1258
+ `;
1259
+ const [duplicateInfo] = await query(duplicateCheckQuery, {}, logger);
1260
+ const uniqueKeys = duplicateInfo[0]?.unique_keys || 0;
1261
+ const duplicateCount = originalCount - uniqueKeys;
1262
+
1263
+ if (logger) {
1264
+ logger.log('INFO', `[BigQuery] Before deduplication: ${originalCount} total rows, ${uniqueKeys} unique keys, ${duplicateCount} duplicates expected`);
1265
+ }
1266
+
1267
+ // Get table metadata to preserve partitioning and clustering
1268
+ const dataset = await getOrCreateDataset(datasetId, logger);
1269
+ const table = dataset.table(tableId);
1270
+ const [tableMetadata] = await table.getMetadata();
1271
+ const partitioning = tableMetadata.timePartitioning;
1272
+ const clustering = tableMetadata.clustering;
1273
+
1274
+ // Build partitioning clause
1275
+ let partitionClause = '';
1276
+ if (partitioning && partitioning.field) {
1277
+ partitionClause = `PARTITION BY ${partitioning.field}`;
1278
+ }
1279
+
1280
+ // Build clustering clause
1281
+ let clusterClause = '';
1282
+ if (clustering && clustering.fields && clustering.fields.length > 0) {
1283
+ clusterClause = `CLUSTER BY ${clustering.fields.join(', ')}`;
1284
+ }
1285
+
1286
+ // [FIX] Use CREATE OR REPLACE TABLE instead of DELETE + INSERT
1287
+ // This is an atomic, metadata-based swap that is cheap and effectively instant
1288
+ // Strategy:
1289
+ // 1. For rows with same (date, user_id, user_type), keep the one with latest fetched_at
1290
+ // 2. If multiple rows have same (date, user_id, user_type, fetched_at), keep one (they're effectively identical)
1291
+ // The ORDER BY with fetched_at DESC ensures we keep the most recent snapshot
1292
+ const createReplaceQuery = `
1293
+ CREATE OR REPLACE TABLE \`${tablePath}\`
1294
+ ${partitionClause}
1295
+ ${clusterClause}
1296
+ AS
1297
+ SELECT * EXCEPT(row_num)
1298
+ FROM (
1299
+ SELECT
1300
+ *,
1301
+ ROW_NUMBER() OVER (
1302
+ PARTITION BY ${dateField}, ${keyFieldsStr}
1303
+ ORDER BY fetched_at DESC
1304
+ ) AS row_num
1305
+ FROM \`${tablePath}\`
1306
+ )
1307
+ WHERE row_num = 1
1308
+ `;
1309
+
1310
+ await query(createReplaceQuery, {}, logger);
1311
+
1312
+ // Get deduplicated count
1313
+ const [dedupedCountResult] = await query(`SELECT COUNT(*) as cnt FROM \`${tablePath}\``, {}, logger);
1314
+ const dedupedCount = dedupedCountResult[0]?.cnt || 0;
1315
+
1316
+ const duplicatesRemoved = originalCount - dedupedCount;
1317
+
1318
+ if (logger) {
1319
+ if (duplicatesRemoved > 0) {
1320
+ logger.log('INFO', `[BigQuery] ✅ Removed ${duplicatesRemoved} duplicate rows from ${tablePath} (${originalCount} → ${dedupedCount})`);
1321
+ } else {
1322
+ logger.log('INFO', `[BigQuery] ✅ No duplicates found in ${tablePath}`);
1323
+ }
1324
+ }
1325
+
1326
+ return duplicatesRemoved;
1327
+ } catch (error) {
1328
+ if (logger) {
1329
+ logger.log('ERROR', `[BigQuery] Deduplication failed for ${datasetId}.${tableId}: ${error.message}`);
1330
+ }
1331
+ throw error;
1332
+ }
1333
+ }
1334
+
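Typical use is a one-off cleanup after a backfill. Note that the helper orders duplicates by `fetched_at`, so it fits the `*_snapshots` and `asset_prices` tables as defined above; a sketch with illustrative dataset/table names:

```js
// Sketch: drop duplicate snapshot rows left behind by a backfill
const bq = require('./bigquery');
const logger = { log: (level, ...args) => console.log(`[${level}]`, ...args) };

async function dedupeSnapshots() {
  // Keeps the newest row (by fetched_at) per (date, user_id, user_type)
  const removed = await bq.removeDuplicates(
    'bulltrackers_data', 'portfolio_snapshots', 'date', ['user_id', 'user_type'], logger);
  console.log(`removed ${removed} duplicate rows`);
}
```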
1335
+ /**
1336
+ * Query Popular Investor master list from BigQuery
1337
+ * Returns data in format: { cid: { cid, username, firstSeenAt, lastSeenAt } }
1338
+ * @param {object} logger - Logger instance
1339
+ * @returns {Promise<object>} Master list map in format { cid: { cid, username, firstSeenAt, lastSeenAt } }
1340
+ */
1341
+ async function queryPIMasterList(logger = null) {
1342
+ if (process.env.BIGQUERY_ENABLED === 'false') {
1343
+ if (logger) logger.log('DEBUG', '[BigQuery] PI master list query skipped (BIGQUERY_ENABLED=false)');
1344
+ return null;
1345
+ }
1346
+
1347
+ const datasetId = process.env.BIGQUERY_DATASET_ID || 'bulltrackers_data';
1348
+ const tablePath = `${datasetId}.pi_master_list`;
1349
+
1350
+ try {
1351
+ const sqlQuery = `
1352
+ SELECT
1353
+ cid,
1354
+ username,
1355
+ first_seen_at,
1356
+ last_seen_at
1357
+ FROM \`${tablePath}\`
1358
+ ORDER BY cid
1359
+ `;
1360
+
1361
+ if (logger) {
1362
+ logger.log('INFO', `[BigQuery] 🔍 Querying PI master list from ${tablePath}`);
1363
+ }
1364
+
1365
+ const rows = await query(sqlQuery, {}, logger);
1366
+
1367
+ if (!rows || rows.length === 0) {
1368
+ if (logger) logger.log('INFO', `[BigQuery] No PI master list found in ${tablePath}`);
1369
+ return {};
1370
+ }
1371
+
1372
+ // Transform to expected format: { cid: { cid, username, firstSeenAt, lastSeenAt } }
1373
+ const masterList = {};
1374
+ for (const row of rows) {
1375
+ const cid = String(row.cid);
1376
+ masterList[cid] = {
1377
+ cid: cid,
1378
+ username: row.username,
1379
+ firstSeenAt: row.first_seen_at,
1380
+ lastSeenAt: row.last_seen_at
1381
+ };
1382
+ }
1383
+
1384
+ if (logger) {
1385
+ logger.log('INFO', `[BigQuery] ✅ Retrieved ${Object.keys(masterList).length} PIs from master list`);
1386
+ }
1387
+
1388
+ return masterList;
1389
+ } catch (error) {
1390
+ if (logger) {
1391
+ logger.log('WARN', `[BigQuery] PI master list query failed for ${tablePath}: ${error.message}`);
1392
+ }
1393
+ return null;
1394
+ }
1395
+ }
1396
+
1397
+ /**
1398
+ * Query instrument insights from BigQuery for a specific date
1399
+ * Returns data in format: array of insights objects (same as Firestore)
1400
+ * @param {string} dateStr - Date (YYYY-MM-DD)
1401
+ * @param {object} logger - Logger instance
1402
+ * @returns {Promise<Array|null>} Array of insights objects, or null if not found/error
1403
+ */
1404
+ async function queryInstrumentInsights(dateStr, logger = null) {
1405
+ if (process.env.BIGQUERY_ENABLED === 'false') {
1406
+ if (logger) logger.log('DEBUG', '[BigQuery] Instrument insights query skipped (BIGQUERY_ENABLED=false)');
1407
+ return null;
1408
+ }
1409
+
1410
+ const datasetId = process.env.BIGQUERY_DATASET_ID || 'bulltrackers_data';
1411
+ const tablePath = `${datasetId}.instrument_insights`;
1412
+
1413
+ try {
1414
+ const sqlQuery = `
1415
+ SELECT
1416
+ instrument_id,
1417
+ insights_data
1418
+ FROM \`${tablePath}\`
1419
+ WHERE date = @dateStr
1420
+ ORDER BY instrument_id ASC
1421
+ `;
1422
+
1423
+ if (logger) {
1424
+ logger.log('INFO', `[BigQuery] 🔍 Querying instrument insights from ${tablePath} for ${dateStr}`);
1425
+ }
1426
+
1427
+ const rows = await query(sqlQuery, {
1428
+ params: {
1429
+ dateStr: dateStr
1430
+ }
1431
+ }, logger);
1432
+
1433
+ if (!rows || rows.length === 0) {
1434
+ if (logger) logger.log('INFO', `[BigQuery] No instrument insights found for ${dateStr}`);
1435
+ return null;
1436
+ }
1437
+
1438
+ // Transform to expected format: array of insights objects
1439
+ // insights_data is already a JSON object, so we can use it directly
1440
+ const insights = rows.map(row => row.insights_data);
1441
+
1442
+ if (logger) {
1443
+ logger.log('INFO', `[BigQuery] ✅ Retrieved ${insights.length} instrument insights for ${dateStr}`);
1444
+ }
1445
+
1446
+ return insights;
1447
+ } catch (error) {
1448
+ if (logger) {
1449
+ logger.log('WARN', `[BigQuery] Instrument insights query failed for ${dateStr}: ${error.message}`);
1450
+ }
1451
+ return null;
1452
+ }
1453
+ }
1454
+
1455
+ /**
1456
+ * Query Popular Investor rankings from BigQuery for a specific date
1457
+ * Returns data in format matching Firestore structure (Items array)
1458
+ * @param {string} dateStr - Date (YYYY-MM-DD)
1459
+ * @param {object} logger - Logger instance
1460
+ * @returns {Promise<object|null>} Rankings data with Items array, or null if not found/error
1461
+ */
1462
+ async function queryPIRankings(dateStr, logger = null) {
1463
+ if (process.env.BIGQUERY_ENABLED === 'false') {
1464
+ if (logger) logger.log('DEBUG', '[BigQuery] PI rankings query skipped (BIGQUERY_ENABLED=false)');
1465
+ return null;
1466
+ }
1467
+
1468
+ const datasetId = process.env.BIGQUERY_DATASET_ID || 'bulltrackers_data';
1469
+ const tablePath = `${datasetId}.pi_rankings`;
1470
+
1471
+ try {
1472
+ const sqlQuery = `
1473
+ SELECT
1474
+ pi_id,
1475
+ username,
1476
+ rank,
1477
+ category,
1478
+ rankings_data
1479
+ FROM \`${tablePath}\`
1480
+ WHERE date = @dateStr
1481
+ ORDER BY rank ASC
1482
+ `;
1483
+
1484
+ if (logger) {
1485
+ logger.log('INFO', `[BigQuery] 🔍 Querying PI rankings from ${tablePath} for ${dateStr}`);
1486
+ }
1487
+
1488
+ const rows = await query(sqlQuery, {
1489
+ params: {
1490
+ dateStr: dateStr
1491
+ }
1492
+ }, logger);
1493
+
1494
+ if (!rows || rows.length === 0) {
1495
+ if (logger) logger.log('INFO', `[BigQuery] No PI rankings found for ${dateStr}`);
1496
+ return null;
1497
+ }
1498
+
1499
+ // Transform to expected format: { Items: [...], TotalRows: N }
1500
+ // Use rankings_data if available (full item), otherwise reconstruct from fields
1501
+ const items = rows.map(row => {
1502
+ if (row.rankings_data) {
1503
+ return row.rankings_data; // Full item data stored as JSON
1504
+ } else {
1505
+ // Reconstruct item from individual fields
1506
+ return {
1507
+ CustomerId: row.pi_id,
1508
+ UserName: row.username,
1509
+ Rank: row.rank,
1510
+ Category: row.category
1511
+ };
1512
+ }
1513
+ });
1514
+
1515
+ const result = {
1516
+ Items: items,
1517
+ TotalRows: items.length
1518
+ };
1519
+
1520
+ if (logger) {
1521
+ logger.log('INFO', `[BigQuery] ✅ Retrieved ${items.length} PI rankings for ${dateStr}`);
1522
+ }
1523
+
1524
+ return result;
1525
+ } catch (error) {
1526
+ if (logger) {
1527
+ logger.log('WARN', `[BigQuery] PI rankings query failed for ${dateStr}: ${error.message}`);
1528
+ }
1529
+ return null;
1530
+ }
1531
+ }
1532
+
1533
+ /**
1534
+ * Query all asset prices from BigQuery
1535
+ * Returns data in format: { instrumentId: { "YYYY-MM-DD": price, ... } }
1536
+ * @param {string} startDateStr - Start date (YYYY-MM-DD), optional
1537
+ * @param {string} endDateStr - End date (YYYY-MM-DD), optional
1538
+ * @param {Array} instrumentIds - Optional array of instrument IDs to filter
1539
+ * @param {object} logger - Logger instance
1540
+ * @returns {Promise<object>} Price data map in format { instrumentId: { "YYYY-MM-DD": price } }
1541
+ */
1542
+ async function queryAssetPrices(startDateStr = null, endDateStr = null, instrumentIds = null, logger = null) {
1543
+ if (process.env.BIGQUERY_ENABLED === 'false') {
1544
+ if (logger) logger.log('DEBUG', '[BigQuery] Asset prices query skipped (BIGQUERY_ENABLED=false)');
1545
+ return null;
1546
+ }
1547
+
1548
+ const datasetId = process.env.BIGQUERY_DATASET_ID || 'bulltrackers_data';
1549
+ const tablePath = `${datasetId}.asset_prices`;
1550
+
1551
+ try {
1552
+ // Build WHERE clause
1553
+ const conditions = [];
1554
+
1555
+ if (startDateStr && endDateStr) {
1556
+ conditions.push(`date BETWEEN @startDate AND @endDate`);
1557
+ } else if (startDateStr) {
1558
+ conditions.push(`date >= @startDate`);
1559
+ } else if (endDateStr) {
1560
+ conditions.push(`date <= @endDate`);
1561
+ }
1562
+
1563
+ if (instrumentIds && instrumentIds.length > 0) {
1564
+ // For IN clause with parameters, we need to use UNNEST
1565
+ conditions.push(`instrument_id IN UNNEST(@instrumentIds)`);
1566
+ }
1567
+
1568
+ const whereClause = conditions.length > 0 ? `WHERE ${conditions.join(' AND ')}` : '';
1569
+
1570
+ const sqlQuery = `
1571
+ SELECT
1572
+ instrument_id,
1573
+ date,
1574
+ price,
1575
+ ticker
1576
+ FROM \`${tablePath}\`
1577
+ ${whereClause}
1578
+ ORDER BY instrument_id, date DESC
1579
+ `;
1580
+
1581
+ const params = {};
1582
+ if (startDateStr) params.startDate = startDateStr;
1583
+ if (endDateStr) params.endDate = endDateStr;
1584
+ if (instrumentIds && instrumentIds.length > 0) {
1585
+ params.instrumentIds = instrumentIds.map(id => parseInt(id, 10));
1586
+ }
1587
+
1588
+ if (logger) {
1589
+ logger.log('INFO', `[BigQuery] 🔍 Querying asset prices from ${tablePath}${startDateStr ? ` (${startDateStr} to ${endDateStr || 'latest'})` : ''}`);
1590
+ }
1591
+
1592
+ const rows = await query(sqlQuery, { params }, logger);
1593
+
1594
+ if (!rows || rows.length === 0) {
1595
+ if (logger) logger.log('INFO', `[BigQuery] No asset prices found in ${tablePath}`);
1596
+ return {};
1597
+ }
1598
+
1599
+ // Transform to expected format: { instrumentId: { "YYYY-MM-DD": price } }
1600
+ const priceMap = {};
1601
+ for (const row of rows) {
1602
+ const instrumentId = String(row.instrument_id);
1603
+ const dateStr = row.date?.value || row.date; // BigQuery DATE columns come back as BigQueryDate objects; .value holds the YYYY-MM-DD string
1604
+
1605
+ if (!priceMap[instrumentId]) {
1606
+ priceMap[instrumentId] = {};
1607
+ }
1608
+
1609
+ // Use the price column (open/high/low/close are kept as separate nullable columns in the schema)
1610
+ priceMap[instrumentId][dateStr] = row.price || null;
1611
+ }
1612
+
1613
+ if (logger) {
1614
+ logger.log('INFO', `[BigQuery] ✅ Retrieved prices for ${Object.keys(priceMap).length} instruments from ${tablePath}`);
1615
+ }
1616
+
1617
+ return priceMap;
1618
+ } catch (error) {
1619
+ if (logger) {
1620
+ logger.log('WARN', `[BigQuery] Asset prices query failed for ${tablePath}: ${error.message}`);
1621
+ }
1622
+ return null;
1623
+ }
1624
+ }
1625
+
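A closing usage sketch for the price query (instrument IDs and the date window are made up):

```js
// Sketch: fetch a month of prices for a couple of instruments
const bq = require('./bigquery');

async function loadPriceWindow(logger) {
  const prices = await bq.queryAssetPrices('2024-04-01', '2024-04-30', [1001, 2042], logger);
  // prices: { "1001": { "2024-04-30": 189.98, ... }, "2042": { ... } }, {} when empty, null on error
  return prices;
}
```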
1626
+ module.exports = {
1627
+ getBigQueryClient,
1628
+ getOrCreateDataset,
1629
+ ensureTableExists,
1630
+ insertRows,
1631
+ insertRowsStreaming,
1632
+ query,
1633
+ getSchema,
1634
+ ensureComputationResultsTable,
1635
+ ensurePortfolioSnapshotsTable,
1636
+ ensureTradeHistorySnapshotsTable,
1637
+ ensureSocialPostSnapshotsTable,
1638
+ ensureAssetPricesTable,
1639
+ ensurePIMasterListTable,
1640
+ ensurePIRankingsTable,
1641
+ ensureInstrumentInsightsTable,
1642
+ queryPortfolioData,
1643
+ queryHistoryData,
1644
+ querySocialData,
1645
+ queryAssetPrices,
1646
+ queryPIMasterList,
1647
+ queryPIRankings,
1648
+ queryInstrumentInsights,
1649
+ queryComputationResult,
1650
+ queryComputationResultsRange,
1651
+ checkExistingRows,
1652
+ removeDuplicates,
1653
+ insertRowsWithMerge,
1654
+ SCHEMAS
1655
+ };