bulltrackers-module 1.0.710 → 1.0.713
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- package/functions/api-v2/helpers/data-fetchers/firestore.js +119 -63
- package/functions/computation-system/data/CachedDataLoader.js +22 -1
- package/functions/computation-system/data/DependencyFetcher.js +118 -0
- package/functions/computation-system/persistence/ResultCommitter.js +94 -3
- package/functions/computation-system/utils/data_loader.js +244 -13
- package/functions/core/utils/bigquery_utils.js +1655 -0
- package/functions/core/utils/firestore_utils.js +99 -30
- package/functions/etoro-price-fetcher/helpers/handler_helpers.js +85 -13
- package/functions/fetch-insights/helpers/handler_helpers.js +26 -0
- package/functions/fetch-popular-investors/helpers/fetch_helpers.js +66 -0
- package/functions/maintenance/backfill-instrument-insights/index.js +180 -0
- package/functions/maintenance/backfill-pi-master-list-rankings/index.js +293 -0
- package/functions/maintenance/backfill-task-engine-data/README.md +72 -0
- package/functions/maintenance/backfill-task-engine-data/index.js +844 -0
- package/functions/price-backfill/helpers/handler_helpers.js +59 -10
- package/functions/root-data-indexer/index.js +79 -27
- package/functions/task-engine/helpers/data_storage_helpers.js +194 -102
- package/functions/task-engine/helpers/popular_investor_helpers.js +13 -7
- package/functions/task-engine/utils/bigquery_batch_manager.js +201 -0
- package/functions/task-engine/utils/firestore_batch_manager.js +21 -1
- package/index.js +34 -2
- package/package.json +7 -3
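The largest change is the new BigQuery utility module, `package/functions/core/utils/bigquery_utils.js`, whose diff is reproduced below. As a rough usage sketch (the export list is not visible in the truncated diff, so the names below assume the functions shown are exported, and `logger` is assumed to be any object exposing `log(level, message)`):

    const bq = require('./functions/core/utils/bigquery_utils');

    // Create the partitioned asset_prices table if needed, bulk-load rows
    // via a free load job, then read them back with a parameterized query.
    await bq.ensureAssetPricesTable(logger);
    await bq.insertRows('bulltrackers_data', 'asset_prices', priceRows, logger);
    const rows = await bq.query(
      'SELECT * FROM `bulltrackers_data.asset_prices` WHERE date = @d',
      { params: { d: '2024-01-01' } },
      logger
    );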
@@ -0,0 +1,1655 @@
/**
 * @fileoverview BigQuery utility functions for BullTrackers
 * Handles table creation, data insertion, and querying with automatic schema management
 */

const { BigQuery } = require('@google-cloud/bigquery');
const fs = require('fs');
const path = require('path');
const os = require('os');

// Singleton BigQuery client
let bigqueryClient = null;

/**
 * Get or create BigQuery client
 */
function getBigQueryClient() {
  if (!bigqueryClient) {
    const projectId = process.env.GCP_PROJECT_ID || 'stocks-12345';
    bigqueryClient = new BigQuery({ projectId });
  }
  return bigqueryClient;
}

/**
 * Get dataset reference, creating it if it doesn't exist
 * @param {string} datasetId - Dataset ID (e.g., 'bulltrackers_data')
 * @param {object} logger - Logger instance
 * @returns {Promise<Dataset>}
 */
async function getOrCreateDataset(datasetId, logger = null) {
  const bigquery = getBigQueryClient();
  const dataset = bigquery.dataset(datasetId);

  try {
    const [exists] = await dataset.exists();
    if (!exists) {
      if (logger) logger.log('INFO', `[BigQuery] Creating dataset: ${datasetId}`);
      await dataset.create({
        location: 'europe-west1', // Match your Cloud Functions region
        description: 'BullTrackers analytical data'
      });
      if (logger) logger.log('INFO', `[BigQuery] Dataset ${datasetId} created successfully`);
    }
    return dataset;
  } catch (error) {
    if (logger) logger.log('ERROR', `[BigQuery] Error with dataset ${datasetId}: ${error.message}`);
    throw error;
  }
}

/**
 * Ensure a table exists with the given schema, creating it if necessary
 * @param {string} datasetId - Dataset ID
 * @param {string} tableId - Table ID
 * @param {Array} schema - BigQuery schema array
 * @param {object} options - Additional options (partitionField, clusterFields)
 * @param {object} logger - Logger instance
 * @returns {Promise<Table>}
 */
async function ensureTableExists(datasetId, tableId, schema, options = {}, logger = null) {
  const dataset = await getOrCreateDataset(datasetId, logger);
  const table = dataset.table(tableId);

  try {
    const [exists] = await table.exists();
    if (!exists) {
      if (logger) logger.log('INFO', `[BigQuery] Creating table: ${datasetId}.${tableId}`);

      const tableOptions = {
        schema: schema,
        description: `Auto-created table for ${tableId}`
      };

      // Add partitioning if specified
      if (options.partitionField) {
        tableOptions.timePartitioning = {
          field: options.partitionField,
          type: 'DAY' // Partition by day
        };
      }

      // Add clustering if specified
      if (options.clusterFields && options.clusterFields.length > 0) {
        tableOptions.clustering = {
          fields: options.clusterFields
        };
      }

      await table.create(tableOptions);
      if (logger) logger.log('INFO', `[BigQuery] Table ${datasetId}.${tableId} created successfully`);
    } else {
      // Table exists - verify schema matches (optional, can be enhanced)
      if (logger) logger.log('DEBUG', `[BigQuery] Table ${datasetId}.${tableId} already exists`);
    }

    return table;
  } catch (error) {
    if (logger) logger.log('ERROR', `[BigQuery] Error ensuring table ${datasetId}.${tableId}: ${error.message}`);
    throw error;
  }
}
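
// Example (editorial sketch, not part of the file): creating a
// day-partitioned, clustered table with the helper above. The table and
// field names here are illustrative only.
//
//   await ensureTableExists('bulltrackers_data', 'example_events',
//     [{ name: 'date', type: 'DATE', mode: 'REQUIRED' },
//      { name: 'event_id', type: 'INT64', mode: 'REQUIRED' }],
//     { partitionField: 'date', clusterFields: ['event_id'] },
//     logger);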

/**
 * Insert rows using BigQuery MERGE statement (handles duplicates natively via SQL)
 * More efficient than checking then inserting - BigQuery handles deduplication in SQL
 * Uses a temporary table and MERGE statement for atomic deduplication
 * @param {string} datasetId - Dataset ID
 * @param {string} tableId - Table ID
 * @param {Array} rows - Array of row objects
 * @param {Array} keyFields - Fields that form unique key (e.g., ['date', 'user_id', 'user_type'])
 * @param {object} logger - Logger instance
 * @returns {Promise<number>} Number of rows actually inserted (not duplicates)
 */
async function insertRowsWithMerge(datasetId, tableId, rows, keyFields, logger = null) {
  if (!rows || rows.length === 0) {
    if (logger) logger.log('WARN', `[BigQuery] No rows to merge into ${datasetId}.${tableId}`);
    return 0;
  }

  const MAX_ROW_SIZE = 9 * 1024 * 1024; // 9MB safety limit
  const validRows = rows.filter(row => JSON.stringify(row).length <= MAX_ROW_SIZE);

  if (validRows.length === 0) {
    if (logger) logger.log('WARN', `[BigQuery] All rows too large for MERGE into ${datasetId}.${tableId}`);
    return 0;
  }

  // Declared outside the try block so the catch handler can drop the
  // actual temp table (recomputing Date.now() there would name a table
  // that never existed)
  const tablePath = `${datasetId}.${tableId}`;
  const tempTableId = `${tableId}_temp_${Date.now()}`;
  const tempTablePath = `${datasetId}.${tempTableId}`;
  const dataset = await getOrCreateDataset(datasetId, logger);
  const tempTable = dataset.table(tempTableId);

  try {
    // Get target table schema and create a temp table with the same schema
    const table = dataset.table(tableId);
    const [tableMetadata] = await table.getMetadata();
    const schema = tableMetadata.schema.fields;

    await tempTable.create({
      schema: schema,
      description: 'Temporary table for merge operation'
    });

    if (logger) {
      logger.log('INFO', `[BigQuery] Created temp table ${tempTableId} for MERGE operation`);
    }

    // Insert all rows into the temp table using a LOAD JOB (free, not streaming).
    // Load jobs require a file, not a stream, so write NDJSON to a temp file first.
    const tempFile = path.join(os.tmpdir(), `bigquery_merge_${Date.now()}_${Math.random().toString(36).substring(7)}.ndjson`);
    const ndjson = validRows.map(r => JSON.stringify(r)).join('\n');

    try {
      fs.writeFileSync(tempFile, ndjson, 'utf8');

      // Use createLoadJob to get a Job object we can wait on
      const [loadJob] = await tempTable.createLoadJob(tempFile, {
        sourceFormat: 'NEWLINE_DELIMITED_JSON',
        writeDisposition: 'WRITE_APPEND',
        autodetect: false // Use existing table schema
      });

      // [FIX] Use native job.promise() instead of custom polling;
      // it automatically polls and waits for completion
      await loadJob.promise();

      // Get job metadata to check for errors
      const [jobMetadata] = await loadJob.getMetadata();
      if (jobMetadata.status?.errorResult) {
        throw new Error(`Load job failed: ${jobMetadata.status.errorResult.message}`);
      }
    } finally {
      // Clean up temp file
      try {
        if (fs.existsSync(tempFile)) {
          fs.unlinkSync(tempFile);
        }
      } catch (cleanupError) {
        if (logger) {
          logger.log('WARN', `[BigQuery] Failed to delete temp file ${tempFile}: ${cleanupError.message}`);
        }
      }
    }

    if (logger) {
      logger.log('INFO', `[BigQuery] Loaded ${validRows.length} rows into temp table ${tempTableId} using LOAD JOB (free)`);
    }

    const mergeConditions = keyFields.map(f => `target.${f} = source.${f}`).join(' AND ');

    // MERGE does not report how many rows it inserted, so count the
    // not-yet-existing rows BEFORE the merge runs; after the merge this
    // NOT EXISTS query would always return 0. Note query() returns the
    // rows array, so index into it directly.
    const insertedCountRows = await query(`
      SELECT COUNT(*) as inserted
      FROM \`${tempTablePath}\` AS source
      WHERE NOT EXISTS (
        SELECT 1 FROM \`${tablePath}\` AS target
        WHERE ${mergeConditions}
      )
    `, {}, logger);
    const rowsInserted = insertedCountRows[0]?.inserted || 0;

    // Use MERGE to insert only new rows (SQL-native deduplication);
    // more efficient than checking row-by-row in JavaScript
    const mergeQuery = `
      MERGE \`${tablePath}\` AS target
      USING \`${tempTablePath}\` AS source
      ON ${mergeConditions}
      WHEN NOT MATCHED THEN
        INSERT ROW
    `;
    await query(mergeQuery, {}, logger);

    // Drop temp table
    await tempTable.delete();

    if (logger) {
      logger.log('INFO', `[BigQuery] MERGE completed: ${rowsInserted} new rows inserted into ${tablePath} (${validRows.length - rowsInserted} duplicates skipped via SQL)`);
    }

    return rowsInserted;
  } catch (error) {
    const errorDetails = {
      message: error.message,
      code: error.code,
      errors: error.errors
    };
    if (logger) {
      logger.log('ERROR', `[BigQuery] MERGE failed for ${datasetId}.${tableId}:`, JSON.stringify(errorDetails, null, 2));
    }
    // Try to clean up the temp table if it was created
    await tempTable.delete().catch(() => {}); // Ignore cleanup errors
    throw error;
  }
}
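
// Example (editorial sketch, not part of the file): a deduplicating write
// where (date, user_id, user_type) is the uniqueness key, matching the
// docstring's example. Rows already present for that key are skipped by
// the MERGE.
//
//   const inserted = await insertRowsWithMerge('bulltrackers_data',
//     'portfolio_snapshots', rows, ['date', 'user_id', 'user_type'], logger);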

/**
 * Insert rows into a BigQuery table using LOAD JOB (FREE, not streaming inserts)
 * @param {string} datasetId - Dataset ID
 * @param {string} tableId - Table ID
 * @param {Array} rows - Array of row objects
 * @param {object} logger - Logger instance
 * @returns {Promise<void>}
 */
async function insertRows(datasetId, tableId, rows, logger = null) {
  if (!rows || rows.length === 0) {
    if (logger) logger.log('WARN', `[BigQuery] No rows to insert into ${datasetId}.${tableId}`);
    return;
  }

  const MAX_ROW_SIZE = 9 * 1024 * 1024; // 9MB safety limit (BigQuery limit is 10MB)
  const MAX_LOAD_JOB_SIZE = 100 * 1024 * 1024; // 100MB per load job (BigQuery limit is 10GB, but we batch smaller)

  try {
    const dataset = await getOrCreateDataset(datasetId, logger);
    const table = dataset.table(tableId);

    // Filter out rows that are too large
    const validRows = [];
    const skippedRows = [];

    for (const row of rows) {
      const rowSize = JSON.stringify(row).length;
      if (rowSize > MAX_ROW_SIZE) {
        skippedRows.push({ size: rowSize, row: Object.keys(row) });
        if (logger) {
          logger.log('WARN', `[BigQuery] Skipping row in ${datasetId}.${tableId}: Row too large (${(rowSize / 1024 / 1024).toFixed(2)}MB, limit: ${(MAX_ROW_SIZE / 1024 / 1024).toFixed(2)}MB)`);
        }
      } else {
        validRows.push(row);
      }
    }

    if (skippedRows.length > 0 && logger) {
      logger.log('WARN', `[BigQuery] Skipped ${skippedRows.length} rows in ${datasetId}.${tableId} due to size limits`);
    }

    if (validRows.length === 0) {
      if (logger) logger.log('WARN', `[BigQuery] No valid rows to insert into ${datasetId}.${tableId} (all rows too large)`);
      return;
    }

    // Use LOAD JOBS (free) instead of streaming inserts (expensive),
    // batching rows into load jobs of reasonable size
    let totalInserted = 0;
    let currentBatch = [];
    let currentBatchSize = 0;

    // Flush one batch via a load job. Load jobs require a file, not a
    // stream, so the batch is written to a temp NDJSON file first. Both
    // the mid-loop flush and the final flush share this helper.
    const flushBatch = async (batch) => {
      const tempFile = path.join(os.tmpdir(), `bigquery_load_${Date.now()}_${Math.random().toString(36).substring(7)}.ndjson`);
      const ndjson = batch.map(r => JSON.stringify(r)).join('\n');

      try {
        fs.writeFileSync(tempFile, ndjson, 'utf8');

        // Use createLoadJob to get a Job object we can wait on
        const [job] = await table.createLoadJob(tempFile, {
          sourceFormat: 'NEWLINE_DELIMITED_JSON',
          writeDisposition: 'WRITE_APPEND',
          autodetect: false // Use existing table schema
        });

        // [FIX] Use native job.promise() instead of custom polling;
        // it automatically polls and waits for completion
        await job.promise();

        // Get job metadata to check for errors and get row count
        const [jobMetadata] = await job.getMetadata();
        if (jobMetadata.status?.errorResult) {
          throw new Error(`Load job failed: ${jobMetadata.status.errorResult.message}`);
        }

        // outputRows comes back as a string in job metadata, so cast
        // before accumulating
        const rowsLoaded = Number(jobMetadata.statistics?.load?.outputRows ?? batch.length);
        totalInserted += rowsLoaded;

        if (logger) {
          logger.log('INFO', `[BigQuery] Load job completed: ${rowsLoaded} rows loaded into ${datasetId}.${tableId}`);
        }
      } finally {
        // Clean up temp file
        try {
          if (fs.existsSync(tempFile)) {
            fs.unlinkSync(tempFile);
          }
        } catch (cleanupError) {
          if (logger) {
            logger.log('WARN', `[BigQuery] Failed to delete temp file ${tempFile}: ${cleanupError.message}`);
          }
        }
      }
    };

    for (const row of validRows) {
      const rowSize = JSON.stringify(row).length;

      // If adding this row would exceed the batch size, flush the current batch first
      if (currentBatch.length > 0 && (currentBatchSize + rowSize) > MAX_LOAD_JOB_SIZE) {
        await flushBatch(currentBatch);
        currentBatch = [];
        currentBatchSize = 0;
      }

      // Add row to current batch
      currentBatch.push(row);
      currentBatchSize += rowSize;
    }

    // Flush the remaining batch
    if (currentBatch.length > 0) {
      await flushBatch(currentBatch);
    }

    if (logger) {
      logger.log('INFO', `[BigQuery] Loaded ${totalInserted}/${validRows.length} rows into ${datasetId}.${tableId} using LOAD JOBS (free)${skippedRows.length > 0 ? ` (${skippedRows.length} skipped due to size)` : ''}`);
    }
  } catch (error) {
    const errorDetails = {
      message: error.message,
      code: error.code,
      errors: error.errors,
      stack: error.stack
    };
    if (logger) {
      logger.log('ERROR', `[BigQuery] Error loading into ${datasetId}.${tableId}:`, JSON.stringify(errorDetails, null, 2));
    }
    throw error;
  }
}
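
// Example (editorial sketch, not part of the file): bulk append via free
// load jobs. Rows over the ~9MB row limit are skipped, and batches are
// flushed once they approach ~100MB.
//
//   await insertRows('bulltrackers_data', 'asset_prices', priceRows, logger);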

/**
 * Insert rows into BigQuery using STREAMING INSERTS (immediate, costs ~$0.05/GB)
 * Use this for time-sensitive data like alert computations
 * @param {string} datasetId - Dataset ID
 * @param {string} tableId - Table ID
 * @param {Array} rows - Array of row objects
 * @param {object} logger - Logger instance
 * @returns {Promise<void>}
 */
async function insertRowsStreaming(datasetId, tableId, rows, logger = null) {
  if (!rows || rows.length === 0) {
    if (logger) logger.log('WARN', `[BigQuery] No rows to stream into ${datasetId}.${tableId}`);
    return;
  }

  const MAX_ROW_SIZE = 9 * 1024 * 1024; // 9MB safety limit (BigQuery limit is 10MB)
  const MAX_BATCH_SIZE = 100; // Streaming insert batch size

  try {
    const dataset = await getOrCreateDataset(datasetId, logger);
    const table = dataset.table(tableId);

    // Filter out rows that are too large
    const validRows = rows.filter(row => {
      const rowSize = JSON.stringify(row).length;
      return rowSize <= MAX_ROW_SIZE;
    });

    if (validRows.length === 0) {
      if (logger) logger.log('WARN', `[BigQuery] No valid rows to stream into ${datasetId}.${tableId} (all rows too large)`);
      return;
    }

    // Stream inserts in batches
    let insertedCount = 0;
    for (let i = 0; i < validRows.length; i += MAX_BATCH_SIZE) {
      const batch = validRows.slice(i, i + MAX_BATCH_SIZE);

      try {
        const [result] = await table.insert(batch);

        if (result.insertErrors && result.insertErrors.length > 0) {
          const errors = result.insertErrors.map(e => e.errors).flat();
          if (logger) logger.log('ERROR', `[BigQuery] Streaming insert errors for batch in ${datasetId}.${tableId}:`, errors);
          // Continue with next batch
        } else {
          insertedCount += batch.length;
        }
      } catch (batchError) {
        if (logger) {
          logger.log('WARN', `[BigQuery] Streaming insert batch failed for ${datasetId}.${tableId}: ${batchError.message}`);
        }
        // Continue with next batch
      }
    }

    if (logger) {
      logger.log('INFO', `[BigQuery] Streamed ${insertedCount}/${validRows.length} rows into ${datasetId}.${tableId} using STREAMING INSERTS`);
    }
  } catch (error) {
    const errorDetails = {
      message: error.message,
      code: error.code,
      errors: error.errors
    };
    if (logger) {
      logger.log('ERROR', `[BigQuery] Error streaming into ${datasetId}.${tableId}:`, JSON.stringify(errorDetails, null, 2));
    }
    throw error;
  }
}

/**
 * Query BigQuery and return results
 * @param {string} sqlQuery - SQL query string
 * @param {object} options - Query options
 * @param {object} logger - Logger instance
 * @returns {Promise<Array>} Array of row objects
 */
async function query(sqlQuery, options = {}, logger = null) {
  const bigquery = getBigQueryClient();

  try {
    const [rows] = await bigquery.query({
      query: sqlQuery,
      location: 'europe-west1',
      ...options
    });

    if (logger) logger.log('INFO', `[BigQuery] Query returned ${rows.length} rows`);
    return rows;
  } catch (error) {
    if (logger) logger.log('ERROR', `[BigQuery] Query error: ${error.message}`);
    throw error;
  }
}
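
// Example (editorial sketch, not part of the file): passing named query
// parameters through the options object, the same pattern
// queryComputationResultsRange uses further down.
//
//   const rows = await query(
//     'SELECT * FROM `bulltrackers_data.pi_rankings` WHERE date = @d',
//     { params: { d: '2024-01-01' } },
//     logger);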

/**
 * Schema definitions for BullTrackers tables
 */
const SCHEMAS = {
  computation_results: [
    { name: 'date', type: 'DATE', mode: 'REQUIRED' },
    { name: 'computation_name', type: 'STRING', mode: 'REQUIRED' },
    { name: 'category', type: 'STRING', mode: 'REQUIRED' },
    { name: 'result_data', type: 'JSON', mode: 'NULLABLE' },
    { name: 'metadata', type: 'JSON', mode: 'NULLABLE' },
    { name: 'created_at', type: 'TIMESTAMP', mode: 'REQUIRED' }
  ],
  portfolio_snapshots: [
    { name: 'date', type: 'DATE', mode: 'REQUIRED' },
    { name: 'user_id', type: 'INT64', mode: 'REQUIRED' },
    { name: 'user_type', type: 'STRING', mode: 'REQUIRED' },
    { name: 'portfolio_data', type: 'JSON', mode: 'NULLABLE' },
    { name: 'fetched_at', type: 'TIMESTAMP', mode: 'REQUIRED' }
  ],
  trade_history_snapshots: [
    { name: 'date', type: 'DATE', mode: 'REQUIRED' },
    { name: 'user_id', type: 'INT64', mode: 'REQUIRED' },
    { name: 'user_type', type: 'STRING', mode: 'REQUIRED' },
    { name: 'history_data', type: 'JSON', mode: 'NULLABLE' },
    { name: 'fetched_at', type: 'TIMESTAMP', mode: 'REQUIRED' }
  ],
  social_post_snapshots: [
    { name: 'date', type: 'DATE', mode: 'REQUIRED' },
    { name: 'user_id', type: 'INT64', mode: 'REQUIRED' },
    { name: 'user_type', type: 'STRING', mode: 'REQUIRED' },
    { name: 'posts_data', type: 'JSON', mode: 'NULLABLE' },
    { name: 'fetched_at', type: 'TIMESTAMP', mode: 'REQUIRED' }
  ],
  asset_prices: [
    { name: 'date', type: 'DATE', mode: 'REQUIRED' },
    { name: 'instrument_id', type: 'INT64', mode: 'REQUIRED' },
    { name: 'ticker', type: 'STRING', mode: 'REQUIRED' },
    { name: 'price', type: 'FLOAT64', mode: 'REQUIRED' },
    { name: 'open', type: 'FLOAT64', mode: 'NULLABLE' },
    { name: 'high', type: 'FLOAT64', mode: 'NULLABLE' },
    { name: 'low', type: 'FLOAT64', mode: 'NULLABLE' },
    { name: 'close', type: 'FLOAT64', mode: 'NULLABLE' },
    { name: 'volume', type: 'FLOAT64', mode: 'NULLABLE' },
    { name: 'fetched_at', type: 'TIMESTAMP', mode: 'REQUIRED' }
  ],
  pi_master_list: [
    { name: 'cid', type: 'INT64', mode: 'REQUIRED' },
    { name: 'username', type: 'STRING', mode: 'REQUIRED' },
    { name: 'first_seen_at', type: 'TIMESTAMP', mode: 'REQUIRED' },
    { name: 'last_seen_at', type: 'TIMESTAMP', mode: 'REQUIRED' },
    { name: 'last_updated', type: 'TIMESTAMP', mode: 'REQUIRED' }
  ],
  pi_rankings: [
    { name: 'date', type: 'DATE', mode: 'REQUIRED' },
    { name: 'pi_id', type: 'INT64', mode: 'REQUIRED' },
    { name: 'username', type: 'STRING', mode: 'REQUIRED' },
    { name: 'rank', type: 'INT64', mode: 'NULLABLE' },
    { name: 'category', type: 'STRING', mode: 'NULLABLE' },
    { name: 'rankings_data', type: 'JSON', mode: 'NULLABLE' },
    { name: 'fetched_at', type: 'TIMESTAMP', mode: 'REQUIRED' }
  ],
  instrument_insights: [
    { name: 'date', type: 'DATE', mode: 'REQUIRED' },
    { name: 'instrument_id', type: 'INT64', mode: 'REQUIRED' },
    { name: 'insights_data', type: 'JSON', mode: 'REQUIRED' },
    { name: 'fetched_at', type: 'TIMESTAMP', mode: 'REQUIRED' }
  ]
};
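
// Example (editorial sketch, not part of the file): a row matching the
// portfolio_snapshots schema above; the nested object for the JSON column
// and the ISO timestamp string are assumptions about how callers shape rows.
//
//   const row = {
//     date: '2024-01-01',
//     user_id: 12345,
//     user_type: 'POPULAR_INVESTOR',
//     portfolio_data: { positions: [] },
//     fetched_at: new Date().toISOString()
//   };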

/**
 * Get schema for a table
 * @param {string} tableName - Table name
 * @returns {Array} Schema array
 */
function getSchema(tableName) {
  return SCHEMAS[tableName] || null;
}

/**
 * Ensure computation_results table exists
 * @param {object} logger - Logger instance
 * @returns {Promise<Table>}
 */
async function ensureComputationResultsTable(logger = null) {
  const datasetId = process.env.BIGQUERY_DATASET_ID || 'bulltrackers_data';
  const tableId = 'computation_results';
  const schema = getSchema(tableId);

  return await ensureTableExists(
    datasetId,
    tableId,
    schema,
    {
      partitionField: 'date',
      clusterFields: ['computation_name', 'category']
    },
    logger
  );
}

/**
 * Ensure portfolio_snapshots table exists
 * @param {object} logger - Logger instance
 * @returns {Promise<Table>}
 */
async function ensurePortfolioSnapshotsTable(logger = null) {
  const datasetId = process.env.BIGQUERY_DATASET_ID || 'bulltrackers_data';
  const tableId = 'portfolio_snapshots';
  const schema = getSchema(tableId);

  return await ensureTableExists(
    datasetId,
    tableId,
    schema,
    {
      partitionField: 'date',
      clusterFields: ['user_type', 'user_id']
    },
    logger
  );
}

/**
 * Ensure trade_history_snapshots table exists
 * @param {object} logger - Logger instance
 * @returns {Promise<Table>}
 */
async function ensureTradeHistorySnapshotsTable(logger = null) {
  const datasetId = process.env.BIGQUERY_DATASET_ID || 'bulltrackers_data';
  const tableId = 'trade_history_snapshots';
  const schema = getSchema(tableId);

  return await ensureTableExists(
    datasetId,
    tableId,
    schema,
    {
      partitionField: 'date',
      clusterFields: ['user_type', 'user_id']
    },
    logger
  );
}

/**
 * Ensure social_post_snapshots table exists
 * @param {object} logger - Logger instance
 * @returns {Promise<Table>}
 */
async function ensureSocialPostSnapshotsTable(logger = null) {
  const datasetId = process.env.BIGQUERY_DATASET_ID || 'bulltrackers_data';
  const tableId = 'social_post_snapshots';
  const schema = getSchema(tableId);

  return await ensureTableExists(
    datasetId,
    tableId,
    schema,
    {
      partitionField: 'date',
      clusterFields: ['user_type', 'user_id']
    },
    logger
  );
}

/**
 * Ensure asset_prices table exists
 * @param {object} logger - Logger instance
 * @returns {Promise<Table>}
 */
async function ensureAssetPricesTable(logger = null) {
  const datasetId = process.env.BIGQUERY_DATASET_ID || 'bulltrackers_data';
  const tableId = 'asset_prices';
  const schema = getSchema(tableId);

  return await ensureTableExists(
    datasetId,
    tableId,
    schema,
    {
      partitionField: 'date',
      clusterFields: ['ticker', 'instrument_id']
    },
    logger
  );
}

/**
 * Ensure pi_master_list table exists
 * @param {object} logger - Logger instance
 * @returns {Promise<Table>}
 */
async function ensurePIMasterListTable(logger = null) {
  const datasetId = process.env.BIGQUERY_DATASET_ID || 'bulltrackers_data';
  const tableId = 'pi_master_list';
  const schema = getSchema(tableId);

  return await ensureTableExists(
    datasetId,
    tableId,
    schema,
    {
      clusterFields: ['cid']
    },
    logger
  );
}

/**
 * Ensure pi_rankings table exists
 * @param {object} logger - Logger instance
 * @returns {Promise<Table>}
 */
async function ensurePIRankingsTable(logger = null) {
  const datasetId = process.env.BIGQUERY_DATASET_ID || 'bulltrackers_data';
  const tableId = 'pi_rankings';
  const schema = getSchema(tableId);

  return await ensureTableExists(
    datasetId,
    tableId,
    schema,
    {
      partitionField: 'date',
      clusterFields: ['pi_id', 'category']
    },
    logger
  );
}

/**
 * Ensure instrument_insights table exists
 * @param {object} logger - Logger instance
 * @returns {Promise<Table>}
 */
async function ensureInstrumentInsightsTable(logger = null) {
  const datasetId = process.env.BIGQUERY_DATASET_ID || 'bulltrackers_data';
  const tableId = 'instrument_insights';
  const schema = getSchema(tableId);

  return await ensureTableExists(
    datasetId,
    tableId,
    schema,
    {
      partitionField: 'date',
      clusterFields: ['instrument_id']
    },
    logger
  );
}

/**
 * Query portfolio data from BigQuery
 * @param {string} dateStr - Date string (YYYY-MM-DD)
 * @param {Array} userIds - Optional array of user IDs to filter
 * @param {Array} userTypes - Optional array of user types to filter (e.g., ['POPULAR_INVESTOR', 'SIGNED_IN_USER'])
 * @param {object} logger - Logger instance
 * @returns {Promise<Object>} Map of user_id -> portfolio_data, or null if no data/error
 */
async function queryPortfolioData(dateStr, userIds = null, userTypes = null, logger = null) {
  if (process.env.BIGQUERY_ENABLED === 'false') {
    if (logger) logger.log('DEBUG', '[BigQuery] Portfolio query skipped (BIGQUERY_ENABLED=false)');
    return null;
  }

  const datasetId = process.env.BIGQUERY_DATASET_ID || 'bulltrackers_data';
  const tablePath = `${datasetId}.portfolio_snapshots`;

  try {
    // Build WHERE clause
    const conditions = [`date = '${dateStr}'`];

    if (userIds && userIds.length > 0) {
      const userIdList = userIds.map(id => String(id)).join(',');
      conditions.push(`user_id IN (${userIdList})`);
    }

    if (userTypes && userTypes.length > 0) {
      const typeList = userTypes.map(t => `'${t.toUpperCase()}'`).join(',');
      conditions.push(`user_type IN (${typeList})`);
    }

    const whereClause = conditions.join(' AND ');

    const sqlQuery = `
      SELECT
        user_id,
        user_type,
        portfolio_data,
        fetched_at
      FROM \`${tablePath}\`
      WHERE ${whereClause}
    `;

    if (logger) {
      logger.log('INFO', `[BigQuery] 🔍 Querying portfolio data from ${tablePath} for date ${dateStr}${userTypes ? ` (types: ${userTypes.join(',')})` : ''}${userIds ? ` (${userIds.length} users)` : ''}`);
    }

    const rows = await query(sqlQuery, {}, logger);

    if (!rows || rows.length === 0) {
      if (logger) logger.log('INFO', `[BigQuery] No portfolio data found in ${tablePath} for ${dateStr}`);
      return null;
    }

    // Transform to map: user_id -> { portfolio_data, user_type, fetched_at }
    const result = {};
    for (const row of rows) {
      result[String(row.user_id)] = {
        portfolio_data: row.portfolio_data || {},
        user_type: row.user_type,
        fetched_at: row.fetched_at
      };
    }

    if (logger) {
      logger.log('INFO', `[BigQuery] ✅ Retrieved ${rows.length} portfolio records from ${tablePath} for ${dateStr}`);
    }

    return result;
  } catch (error) {
    if (logger) {
      logger.log('WARN', `[BigQuery] Portfolio query failed for ${tablePath} (${dateStr}): ${error.message}`);
    }
    return null; // Return null to trigger Firestore fallback
  }
}

/**
 * Query trade history data from BigQuery
 * @param {string} dateStr - Date string (YYYY-MM-DD)
 * @param {Array} userIds - Optional array of user IDs to filter
 * @param {Array} userTypes - Optional array of user types to filter
 * @param {object} logger - Logger instance
 * @returns {Promise<Object>} Map of user_id -> history_data, or null if no data/error
 */
async function queryHistoryData(dateStr, userIds = null, userTypes = null, logger = null) {
  if (process.env.BIGQUERY_ENABLED === 'false') {
    if (logger) logger.log('DEBUG', '[BigQuery] History query skipped (BIGQUERY_ENABLED=false)');
    return null;
  }

  const datasetId = process.env.BIGQUERY_DATASET_ID || 'bulltrackers_data';
  const tablePath = `${datasetId}.trade_history_snapshots`;

  try {
    const conditions = [`date = '${dateStr}'`];

    if (userIds && userIds.length > 0) {
      const userIdList = userIds.map(id => String(id)).join(',');
      conditions.push(`user_id IN (${userIdList})`);
    }

    if (userTypes && userTypes.length > 0) {
      const typeList = userTypes.map(t => `'${t.toUpperCase()}'`).join(',');
      conditions.push(`user_type IN (${typeList})`);
    }

    const whereClause = conditions.join(' AND ');

    const sqlQuery = `
      SELECT
        user_id,
        user_type,
        history_data,
        fetched_at
      FROM \`${tablePath}\`
      WHERE ${whereClause}
    `;

    if (logger) {
      logger.log('INFO', `[BigQuery] 🔍 Querying trade history from ${tablePath} for date ${dateStr}${userTypes ? ` (types: ${userTypes.join(',')})` : ''}${userIds ? ` (${userIds.length} users)` : ''}`);
    }

    const rows = await query(sqlQuery, {}, logger);

    if (!rows || rows.length === 0) {
      if (logger) logger.log('INFO', `[BigQuery] No history data found in ${tablePath} for ${dateStr}`);
      return null;
    }

    const result = {};
    for (const row of rows) {
      result[String(row.user_id)] = {
        history_data: row.history_data || {},
        user_type: row.user_type,
        fetched_at: row.fetched_at
      };
    }

    if (logger) {
      logger.log('INFO', `[BigQuery] ✅ Retrieved ${rows.length} history records from ${tablePath} for ${dateStr}`);
    }

    return result;
  } catch (error) {
    if (logger) {
      logger.log('WARN', `[BigQuery] History query failed for ${tablePath} (${dateStr}): ${error.message}`);
    }
    return null;
  }
}

/**
 * Check which rows already exist in BigQuery (for deduplication)
 * @param {string} datasetId - Dataset ID
 * @param {string} tableId - Table ID
 * @param {string} dateStr - Date string (YYYY-MM-DD)
 * @param {Array} rows - Array of rows to check (must have user_id and user_type)
 * @param {object} logger - Logger instance
 * @returns {Promise<Set>} Set of existing keys as "user_id|user_type" (scoped to the given date)
 */
async function checkExistingRows(datasetId, tableId, dateStr, rows, logger = null) {
  if (process.env.BIGQUERY_ENABLED === 'false' || !rows || rows.length === 0) {
    return new Set();
  }

  try {
    const tablePath = `${datasetId}.${tableId}`;

    // Extract unique user_id and user_type combinations
    const userKeys = new Set();
    for (const row of rows) {
      if (row.user_id && row.user_type) {
        userKeys.add(`${row.user_id}|${row.user_type}`);
      }
    }

    if (userKeys.size === 0) {
      return new Set();
    }

    // Build WHERE clause for user combinations
    const conditions = [`date = '${dateStr}'`];
    const userConditions = [];
    for (const key of userKeys) {
      const [userId, userType] = key.split('|');
      userConditions.push(`(user_id = ${userId} AND user_type = '${userType}')`);
    }
    conditions.push(`(${userConditions.join(' OR ')})`);

    const whereClause = conditions.join(' AND ');

    const sqlQuery = `
      SELECT
        user_id,
        user_type
      FROM \`${tablePath}\`
      WHERE ${whereClause}
    `;

    if (logger) {
      logger.log('DEBUG', `[BigQuery] Checking for existing rows in ${tablePath} for ${dateStr} (${userKeys.size} unique users)`);
    }

    const existingRows = await query(sqlQuery, {}, logger);

    // Build set of existing keys
    const existingKeys = new Set();
    for (const row of existingRows) {
      existingKeys.add(`${row.user_id}|${row.user_type}`);
    }

    if (logger && existingKeys.size > 0) {
      logger.log('INFO', `[BigQuery] Found ${existingKeys.size} existing rows in ${tablePath} for ${dateStr}, will skip duplicates`);
    }

    return existingKeys;
  } catch (error) {
    if (logger) {
      logger.log('WARN', `[BigQuery] Error checking existing rows in ${datasetId}.${tableId}: ${error.message}`);
    }
    // On error, return empty set (will attempt insert, might create duplicates but safer than skipping)
    return new Set();
  }
}

/**
 * Query social post data from BigQuery
 * @param {string} dateStr - Date string (YYYY-MM-DD)
 * @param {Array} userIds - Optional array of user IDs to filter
 * @param {Array} userTypes - Optional array of user types to filter
 * @param {object} logger - Logger instance
 * @returns {Promise<Object>} Map of user_id -> posts_data, or null if no data/error
 */
async function querySocialData(dateStr, userIds = null, userTypes = null, logger = null) {
  if (process.env.BIGQUERY_ENABLED === 'false') {
    if (logger) logger.log('DEBUG', '[BigQuery] Social query skipped (BIGQUERY_ENABLED=false)');
    return null;
  }

  const datasetId = process.env.BIGQUERY_DATASET_ID || 'bulltrackers_data';
  const tablePath = `${datasetId}.social_post_snapshots`;

  try {
    const conditions = [`date = '${dateStr}'`];

    if (userIds && userIds.length > 0) {
      const userIdList = userIds.map(id => String(id)).join(',');
      conditions.push(`user_id IN (${userIdList})`);
    }

    if (userTypes && userTypes.length > 0) {
      const typeList = userTypes.map(t => `'${t.toUpperCase()}'`).join(',');
      conditions.push(`user_type IN (${typeList})`);
    }

    const whereClause = conditions.join(' AND ');

    const sqlQuery = `
      SELECT
        user_id,
        user_type,
        posts_data,
        fetched_at
      FROM \`${tablePath}\`
      WHERE ${whereClause}
    `;

    if (logger) {
      logger.log('INFO', `[BigQuery] 🔍 Querying social posts from ${tablePath} for date ${dateStr}${userTypes ? ` (types: ${userTypes.join(',')})` : ''}${userIds ? ` (${userIds.length} users)` : ''}`);
    }

    const rows = await query(sqlQuery, {}, logger);

    if (!rows || rows.length === 0) {
      if (logger) logger.log('INFO', `[BigQuery] No social data found in ${tablePath} for ${dateStr}`);
      return null;
    }

    const result = {};
    for (const row of rows) {
      result[String(row.user_id)] = {
        posts_data: row.posts_data || {},
        user_type: row.user_type,
        fetched_at: row.fetched_at
      };
    }

    if (logger) {
      logger.log('INFO', `[BigQuery] ✅ Retrieved ${rows.length} social post records from ${tablePath} for ${dateStr}`);
    }

    return result;
  } catch (error) {
    if (logger) {
      logger.log('WARN', `[BigQuery] Social query failed for ${tablePath} (${dateStr}): ${error.message}`);
    }
    return null;
  }
}

/**
 * Query a single computation result from BigQuery for a specific date
 * @param {string} computationName - Computation name
 * @param {string} category - Category (e.g., 'popular-investor', 'alerts')
 * @param {string} dateStr - Date (YYYY-MM-DD)
 * @param {object} logger - Logger instance
 * @returns {Promise<object|null>} Result data object, or null if not found/error
 */
async function queryComputationResult(computationName, category, dateStr, logger = null) {
  if (process.env.BIGQUERY_ENABLED === 'false') {
    if (logger) logger.log('DEBUG', '[BigQuery] Computation result query skipped (BIGQUERY_ENABLED=false)');
    return null;
  }

  const datasetId = process.env.BIGQUERY_DATASET_ID || 'bulltrackers_data';
  const tablePath = `${datasetId}.computation_results`;

  try {
    const sqlQuery = `
      SELECT result_data
      FROM \`${tablePath}\`
      WHERE date = '${dateStr}'
        AND computation_name = '${computationName}'
        AND category = '${category}'
      ORDER BY created_at DESC
      LIMIT 1
    `;

    const rows = await query(sqlQuery, {}, logger);

    if (!rows || rows.length === 0) {
      if (logger) logger.log('DEBUG', `[BigQuery] No computation result found for ${computationName} (${dateStr}, ${category})`);
      return null;
    }

    const result = rows[0].result_data;
    if (logger) logger.log('INFO', `[BigQuery] ✅ Retrieved computation result for ${computationName} (${dateStr})`);
    return result;
  } catch (error) {
    if (logger) {
      logger.log('WARN', `[BigQuery] Computation result query failed for ${computationName} (${dateStr}): ${error.message}`);
    }
    return null;
  }
}

/**
 * Query computation results from BigQuery for a date range
 * @param {string} computationName - Computation name
 * @param {string} category - Category (e.g., 'popular-investor', 'alerts')
 * @param {string} startDateStr - Start date (YYYY-MM-DD)
 * @param {string} endDateStr - End date (YYYY-MM-DD)
 * @param {object} logger - Logger instance
 * @returns {Promise<Array>} Array of {date, result_data} objects, or null if error
 */
async function queryComputationResultsRange(computationName, category, startDateStr, endDateStr, logger = null) {
  if (process.env.BIGQUERY_ENABLED === 'false') {
    if (logger) logger.log('DEBUG', '[BigQuery] Computation results range query skipped (BIGQUERY_ENABLED=false)');
    return null;
  }

  const datasetId = process.env.BIGQUERY_DATASET_ID || 'bulltrackers_data';
  const tablePath = `${datasetId}.computation_results`;

  try {
    // [FIX] Use parameterized queries to prevent SQL injection
    const sqlQuery = `
      SELECT
        date,
        result_data,
        category
      FROM \`${tablePath}\`
      WHERE computation_name = @computationName
        AND category = @category
        AND date BETWEEN @startDate AND @endDate
      ORDER BY date DESC
    `;

    if (logger) {
      logger.log('INFO', `[BigQuery] 🔍 Querying computation results from ${tablePath} for ${computationName} (${category}) from ${startDateStr} to ${endDateStr}`);
    }

    const rows = await query(sqlQuery, {
      params: {
        computationName: computationName,
        category: category,
        startDate: startDateStr,
        endDate: endDateStr
      }
    }, logger);

    if (!rows || rows.length === 0) {
      if (logger) logger.log('INFO', `[BigQuery] No computation results found in ${tablePath} for ${computationName} in date range`);
      return [];
    }

    if (logger) {
      logger.log('INFO', `[BigQuery] ✅ Retrieved ${rows.length} computation result records from ${tablePath} for ${computationName}`);
    }

    return rows.map(row => ({
      date: row.date,
      data: row.result_data || {},
      category: row.category
    }));
  } catch (error) {
    if (logger) {
      logger.log('WARN', `[BigQuery] Computation results range query failed for ${tablePath}: ${error.message}`);
    }
    return null;
  }
}

/**
 * Remove duplicate rows from a BigQuery table (keeps the most recent row per unique key)
 * This is cheaper than checking duplicates before each insert for large backfills
 * Uses CREATE OR REPLACE to overwrite the table with deduplicated data
 * @param {string} datasetId - Dataset ID
 * @param {string} tableId - Table ID
 * @param {string} dateField - Date field name (e.g., 'date')
 * @param {Array} keyFields - Array of field names that form the unique key (e.g., ['user_id', 'user_type'])
 * @param {object} logger - Logger instance
 * @returns {Promise<number>} Number of duplicates removed
 */
async function removeDuplicates(datasetId, tableId, dateField, keyFields, logger = null) {
  if (process.env.BIGQUERY_ENABLED === 'false') {
    if (logger) logger.log('DEBUG', '[BigQuery] Deduplication skipped (BIGQUERY_ENABLED=false)');
    return 0;
  }

  try {
    const tablePath = `${datasetId}.${tableId}`;
    const keyFieldsStr = keyFields.join(', ');

    if (logger) {
      logger.log('INFO', `[BigQuery] 🔄 Deduplicating ${tablePath} by (${keyFieldsStr})`);
    }

    // Get original count (query() returns the rows array, so index into
    // it directly rather than destructuring it a second time)
    const originalCountRows = await query(`SELECT COUNT(*) as cnt FROM \`${tablePath}\``, {}, logger);
    const originalCount = originalCountRows[0]?.cnt || 0;

    // Check for duplicates before deduplication (for logging)
    // Use CONCAT to create a composite key for COUNT(DISTINCT)
    const keyConcat = `${dateField}, '-', ${keyFieldsStr.split(', ').join(", '-', ")}`;
    const duplicateCheckQuery = `
      SELECT
        COUNT(*) as total_rows,
        COUNT(DISTINCT CONCAT(${keyConcat})) as unique_keys
      FROM \`${tablePath}\`
    `;
    const duplicateInfoRows = await query(duplicateCheckQuery, {}, logger);
    const uniqueKeys = duplicateInfoRows[0]?.unique_keys || 0;
    const duplicateCount = originalCount - uniqueKeys;

    if (logger) {
      logger.log('INFO', `[BigQuery] Before deduplication: ${originalCount} total rows, ${uniqueKeys} unique keys, ${duplicateCount} duplicates expected`);
    }

    // Get table metadata to preserve partitioning and clustering
    const dataset = await getOrCreateDataset(datasetId, logger);
    const table = dataset.table(tableId);
    const [tableMetadata] = await table.getMetadata();
    const partitioning = tableMetadata.timePartitioning;
    const clustering = tableMetadata.clustering;

    // Build partitioning clause
    let partitionClause = '';
    if (partitioning && partitioning.field) {
      partitionClause = `PARTITION BY ${partitioning.field}`;
    }

    // Build clustering clause
    let clusterClause = '';
    if (clustering && clustering.fields && clustering.fields.length > 0) {
      clusterClause = `CLUSTER BY ${clustering.fields.join(', ')}`;
    }

    // [FIX] Use CREATE OR REPLACE TABLE instead of DELETE + INSERT.
    // This is an atomic, metadata-based swap that is cheap and instant.
    // Strategy:
    // 1. For rows with the same (date, key fields), keep the one with the latest fetched_at
    // 2. If multiple rows share the same (date, key fields, fetched_at), keep one (they're effectively identical)
    // The ORDER BY fetched_at DESC ensures we keep the most recent snapshot
    const createReplaceQuery = `
      CREATE OR REPLACE TABLE \`${tablePath}\`
      ${partitionClause}
      ${clusterClause}
      AS
      SELECT * EXCEPT(row_num)
      FROM (
        SELECT
          *,
          ROW_NUMBER() OVER (
            PARTITION BY ${dateField}, ${keyFieldsStr}
            ORDER BY fetched_at DESC
          ) AS row_num
        FROM \`${tablePath}\`
      )
      WHERE row_num = 1
    `;

    await query(createReplaceQuery, {}, logger);

    // Get deduplicated count
    const dedupedCountRows = await query(`SELECT COUNT(*) as cnt FROM \`${tablePath}\``, {}, logger);
    const dedupedCount = dedupedCountRows[0]?.cnt || 0;

    const duplicatesRemoved = originalCount - dedupedCount;

    if (logger) {
      if (duplicatesRemoved > 0) {
        logger.log('INFO', `[BigQuery] ✅ Removed ${duplicatesRemoved} duplicate rows from ${tablePath} (${originalCount} → ${dedupedCount})`);
      } else {
        logger.log('INFO', `[BigQuery] ✅ No duplicates found in ${tablePath}`);
      }
    }

    return duplicatesRemoved;
  } catch (error) {
    if (logger) {
      logger.log('ERROR', `[BigQuery] Deduplication failed for ${datasetId}.${tableId}: ${error.message}`);
    }
    throw error;
  }
}
|
|
1334
|
+
|
|
1335
|
+
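// --- Example (editor's illustrative sketch, not part of the package) ---
// A possible call site for removeDuplicates, inside an async context. The
// dataset, table, and key-field names are assumptions for illustration; the
// target table must have a fetched_at column, since the swap orders by it,
// and console satisfies the logger shape used above (logger.log(level, msg)).
//
//   const removed = await removeDuplicates(
//     'bulltrackers_data',        // datasetId
//     'portfolio_snapshots',      // tableId (hypothetical)
//     'date',                     // dateField for the ROW_NUMBER() partition
//     ['user_id', 'user_type'],   // keyFields forming the unique key
//     console
//   );
//   // `removed` is the number of duplicate rows dropped by the table swap.
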
/**
 * Query Popular Investor master list from BigQuery
 * Returns data in format: { cid: { cid, username, firstSeenAt, lastSeenAt } }
 * @param {object} logger - Logger instance
 * @returns {Promise<object|null>} Master list map in format { cid: { cid, username, firstSeenAt, lastSeenAt } }, or null on error
 */
async function queryPIMasterList(logger = null) {
  if (process.env.BIGQUERY_ENABLED === 'false') {
    if (logger) logger.log('DEBUG', '[BigQuery] PI master list query skipped (BIGQUERY_ENABLED=false)');
    return null;
  }

  const datasetId = process.env.BIGQUERY_DATASET_ID || 'bulltrackers_data';
  const tablePath = `${datasetId}.pi_master_list`;

  try {
    const sqlQuery = `
      SELECT
        cid,
        username,
        first_seen_at,
        last_seen_at
      FROM \`${tablePath}\`
      ORDER BY cid
    `;

    if (logger) {
      logger.log('INFO', `[BigQuery] 🔍 Querying PI master list from ${tablePath}`);
    }

    const rows = await query(sqlQuery, {}, logger);

    if (!rows || rows.length === 0) {
      if (logger) logger.log('INFO', `[BigQuery] No PI master list found in ${tablePath}`);
      return {};
    }

    // Transform to expected format: { cid: { cid, username, firstSeenAt, lastSeenAt } }
    const masterList = {};
    for (const row of rows) {
      const cid = String(row.cid);
      masterList[cid] = {
        cid: cid,
        username: row.username,
        firstSeenAt: row.first_seen_at,
        lastSeenAt: row.last_seen_at
      };
    }

    if (logger) {
      logger.log('INFO', `[BigQuery] ✅ Retrieved ${Object.keys(masterList).length} PIs from master list`);
    }

    return masterList;
  } catch (error) {
    if (logger) {
      logger.log('WARN', `[BigQuery] PI master list query failed for ${tablePath}: ${error.message}`);
    }
    return null;
  }
}

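// --- Example (editor's illustrative sketch, not part of the package) ---
// Consuming the master-list map, inside an async context. Keys are
// stringified CIDs, so lookups should go through String(cid).
//
//   const masterList = await queryPIMasterList(console);
//   if (masterList) {
//     const entry = masterList[String(12345)]; // 12345 is a made-up CID
//     if (entry) {
//       const { username, firstSeenAt, lastSeenAt } = entry;
//     }
//   }
//   // null => BigQuery disabled or query failed; {} => table exists but is empty.
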
/**
 * Query instrument insights from BigQuery for a specific date
 * Returns data in format: array of insights objects (same as Firestore)
 * @param {string} dateStr - Date (YYYY-MM-DD)
 * @param {object} logger - Logger instance
 * @returns {Promise<Array|null>} Array of insights objects, or null if not found/error
 */
async function queryInstrumentInsights(dateStr, logger = null) {
  if (process.env.BIGQUERY_ENABLED === 'false') {
    if (logger) logger.log('DEBUG', '[BigQuery] Instrument insights query skipped (BIGQUERY_ENABLED=false)');
    return null;
  }

  const datasetId = process.env.BIGQUERY_DATASET_ID || 'bulltrackers_data';
  const tablePath = `${datasetId}.instrument_insights`;

  try {
    const sqlQuery = `
      SELECT
        instrument_id,
        insights_data
      FROM \`${tablePath}\`
      WHERE date = @dateStr
      ORDER BY instrument_id ASC
    `;

    if (logger) {
      logger.log('INFO', `[BigQuery] 🔍 Querying instrument insights from ${tablePath} for ${dateStr}`);
    }

    const rows = await query(sqlQuery, {
      params: {
        dateStr: dateStr
      }
    }, logger);

    if (!rows || rows.length === 0) {
      if (logger) logger.log('INFO', `[BigQuery] No instrument insights found for ${dateStr}`);
      return null;
    }

    // Transform to expected format: array of insights objects.
    // insights_data is already a JSON object, so we can use it directly.
    const insights = rows.map(row => row.insights_data);

    if (logger) {
      logger.log('INFO', `[BigQuery] ✅ Retrieved ${insights.length} instrument insights for ${dateStr}`);
    }

    return insights;
  } catch (error) {
    if (logger) {
      logger.log('WARN', `[BigQuery] Instrument insights query failed for ${dateStr}: ${error.message}`);
    }
    return null;
  }
}

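// --- Example (editor's illustrative sketch, not part of the package) ---
// Fetching one day's insights, inside an async context. A null result covers
// "no rows", "BigQuery disabled", and "query failed" alike, so callers
// typically fall back to the Firestore path whenever null comes back.
//
//   const insights = await queryInstrumentInsights('2024-01-15', console);
//   if (insights) {
//     for (const insight of insights) {
//       // each entry is the insights_data JSON object for one instrument
//     }
//   }
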
/**
 * Query Popular Investor rankings from BigQuery for a specific date
 * Returns data in format matching Firestore structure (Items array)
 * @param {string} dateStr - Date (YYYY-MM-DD)
 * @param {object} logger - Logger instance
 * @returns {Promise<object|null>} Rankings data with Items array, or null if not found/error
 */
async function queryPIRankings(dateStr, logger = null) {
  if (process.env.BIGQUERY_ENABLED === 'false') {
    if (logger) logger.log('DEBUG', '[BigQuery] PI rankings query skipped (BIGQUERY_ENABLED=false)');
    return null;
  }

  const datasetId = process.env.BIGQUERY_DATASET_ID || 'bulltrackers_data';
  const tablePath = `${datasetId}.pi_rankings`;

  try {
    const sqlQuery = `
      SELECT
        pi_id,
        username,
        rank,
        category,
        rankings_data
      FROM \`${tablePath}\`
      WHERE date = @dateStr
      ORDER BY rank ASC
    `;

    if (logger) {
      logger.log('INFO', `[BigQuery] 🔍 Querying PI rankings from ${tablePath} for ${dateStr}`);
    }

    const rows = await query(sqlQuery, {
      params: {
        dateStr: dateStr
      }
    }, logger);

    if (!rows || rows.length === 0) {
      if (logger) logger.log('INFO', `[BigQuery] No PI rankings found for ${dateStr}`);
      return null;
    }

    // Transform to expected format: { Items: [...], TotalRows: N }.
    // Use rankings_data if available (full item), otherwise reconstruct from fields.
    const items = rows.map(row => {
      if (row.rankings_data) {
        return row.rankings_data; // Full item data stored as JSON
      } else {
        // Reconstruct item from individual fields
        return {
          CustomerId: row.pi_id,
          UserName: row.username,
          Rank: row.rank,
          Category: row.category
        };
      }
    });

    const result = {
      Items: items,
      TotalRows: items.length
    };

    if (logger) {
      logger.log('INFO', `[BigQuery] ✅ Retrieved ${items.length} PI rankings for ${dateStr}`);
    }

    return result;
  } catch (error) {
    if (logger) {
      logger.log('WARN', `[BigQuery] PI rankings query failed for ${dateStr}: ${error.message}`);
    }
    return null;
  }
}

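// --- Example (editor's illustrative sketch, not part of the package) ---
// The returned shape mirrors the Firestore structure, so an existing consumer
// of { Items, TotalRows } needs no changes when reading from BigQuery:
//
//   const rankings = await queryPIRankings('2024-01-15', console);
//   if (rankings) {
//     for (const item of rankings.Items) {
//       // item is either the stored rankings_data JSON or the reconstructed
//       // { CustomerId, UserName, Rank, Category } fallback
//     }
//   }
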
/**
 * Query all asset prices from BigQuery
 * Returns data in format: { instrumentId: { "YYYY-MM-DD": price, ... } }
 * @param {string} startDateStr - Start date (YYYY-MM-DD), optional
 * @param {string} endDateStr - End date (YYYY-MM-DD), optional
 * @param {Array} instrumentIds - Optional array of instrument IDs to filter
 * @param {object} logger - Logger instance
 * @returns {Promise<object|null>} Price data map in format { instrumentId: { "YYYY-MM-DD": price } }, or null on error
 */
async function queryAssetPrices(startDateStr = null, endDateStr = null, instrumentIds = null, logger = null) {
  if (process.env.BIGQUERY_ENABLED === 'false') {
    if (logger) logger.log('DEBUG', '[BigQuery] Asset prices query skipped (BIGQUERY_ENABLED=false)');
    return null;
  }

  const datasetId = process.env.BIGQUERY_DATASET_ID || 'bulltrackers_data';
  const tablePath = `${datasetId}.asset_prices`;

  try {
    // Build WHERE clause
    const conditions = [];

    if (startDateStr && endDateStr) {
      conditions.push(`date BETWEEN @startDate AND @endDate`);
    } else if (startDateStr) {
      conditions.push(`date >= @startDate`);
    } else if (endDateStr) {
      conditions.push(`date <= @endDate`);
    }

    if (instrumentIds && instrumentIds.length > 0) {
      // For an IN clause with parameters, we need to use UNNEST
      conditions.push(`instrument_id IN UNNEST(@instrumentIds)`);
    }

    const whereClause = conditions.length > 0 ? `WHERE ${conditions.join(' AND ')}` : '';

    const sqlQuery = `
      SELECT
        instrument_id,
        date,
        price,
        ticker
      FROM \`${tablePath}\`
      ${whereClause}
      ORDER BY instrument_id, date DESC
    `;

    const params = {};
    if (startDateStr) params.startDate = startDateStr;
    if (endDateStr) params.endDate = endDateStr;
    if (instrumentIds && instrumentIds.length > 0) {
      params.instrumentIds = instrumentIds.map(id => parseInt(id, 10));
    }

    if (logger) {
      logger.log('INFO', `[BigQuery] 🔍 Querying asset prices from ${tablePath}${startDateStr ? ` (${startDateStr} to ${endDateStr || 'latest'})` : ''}`);
    }

    const rows = await query(sqlQuery, { params }, logger);

    if (!rows || rows.length === 0) {
      if (logger) logger.log('INFO', `[BigQuery] No asset prices found in ${tablePath}`);
      return {};
    }

    // Transform to expected format: { instrumentId: { "YYYY-MM-DD": price } }
    const priceMap = {};
    for (const row of rows) {
      const instrumentId = String(row.instrument_id);
      const dateStr = row.date; // Already in YYYY-MM-DD format from the BigQuery DATE type

      if (!priceMap[instrumentId]) {
        priceMap[instrumentId] = {};
      }

      // Store the price for this date; ?? preserves a legitimate 0 and maps missing values to null
      priceMap[instrumentId][dateStr] = row.price ?? null;
    }

    if (logger) {
      logger.log('INFO', `[BigQuery] ✅ Retrieved prices for ${Object.keys(priceMap).length} instruments from ${tablePath}`);
    }

    return priceMap;
  } catch (error) {
    if (logger) {
      logger.log('WARN', `[BigQuery] Asset prices query failed for ${tablePath}: ${error.message}`);
    }
    return null;
  }
}

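// --- Example (editor's illustrative sketch, not part of the package) ---
// Fetching a month of prices for two instruments, inside an async context.
// IDs may be passed as strings; they are coerced with parseInt before being
// bound to the UNNEST(@instrumentIds) parameter. The IDs and values below
// are made up for illustration.
//
//   const prices = await queryAssetPrices('2024-01-01', '2024-01-31', ['1001', '1002'], console);
//   // prices => { '1001': { '2024-01-31': 187.42, ... }, '1002': { ... } }
//   // {} => no rows matched; null => query failed or BigQuery is disabled.
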
module.exports = {
  getBigQueryClient,
  getOrCreateDataset,
  ensureTableExists,
  insertRows,
  insertRowsStreaming,
  query,
  getSchema,
  ensureComputationResultsTable,
  ensurePortfolioSnapshotsTable,
  ensureTradeHistorySnapshotsTable,
  ensureSocialPostSnapshotsTable,
  ensureAssetPricesTable,
  ensurePIMasterListTable,
  ensurePIRankingsTable,
  ensureInstrumentInsightsTable,
  queryPortfolioData,
  queryHistoryData,
  querySocialData,
  queryAssetPrices,
  queryPIMasterList,
  queryPIRankings,
  queryInstrumentInsights,
  queryComputationResult,
  queryComputationResultsRange,
  checkExistingRows,
  removeDuplicates,
  insertRowsWithMerge,
  SCHEMAS
};
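
// --- Example (editor's illustrative sketch, not part of the package) ---
// Every query helper above honours the BIGQUERY_ENABLED kill switch, so a
// caller (or a test) can turn BigQuery off wholesale without touching call
// sites. The require path follows the file's location in the diff header.
//
//   process.env.BIGQUERY_ENABLED = 'false';
//   const { queryAssetPrices } = require('./functions/core/utils/bigquery_utils');
//   const prices = await queryAssetPrices(null, null, null, console); // inside an async context
//   // prices === null immediately, with no BigQuery round trip.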