bulltrackers-module 1.0.710 → 1.0.713

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,844 @@
+ /**
+  * @fileoverview Backfill Task Engine Data from Firestore to BigQuery
+  *
+  * This function reads existing portfolio, trade history, and social post data
+  * from Firestore and writes it to BigQuery tables.
+  *
+  * Usage (Cloud Function HTTP trigger):
+  *   ?startDate=2024-01-01&endDate=2024-12-31&dataType=all
+  *   ?startDate=2024-01-01&endDate=2024-12-31&dataType=portfolio
+  *   ?startDate=2024-01-01&endDate=2024-12-31&dataType=history
+  *   ?startDate=2024-01-01&endDate=2024-12-31&dataType=social
+  *
+  * Usage (Local Node.js script):
+  *   node index.js --startDate=2024-01-01 --endDate=2024-12-31 --dataType=all
+  *   node index.js --startDate=2024-01-01 --endDate=2024-12-31 --dataType=portfolio
+  *   node index.js --startDate=2024-01-01 --endDate=2024-12-31 --dataType=history
+  *   node index.js --startDate=2024-01-01 --endDate=2024-12-31 --dataType=social
+  *
+  * Features:
+  * - Processes in batches to avoid timeouts
+  * - Tracks progress in Firestore (resume capability)
+  * - Reads the new Firestore structures (legacy structure support is still a TODO, see below)
+  * - Does NOT delete any Firestore data
+  * - Works both as a Cloud Function and as a local script
+  */
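+ 
+ /**
+  * Environment variables read by this script:
+  * - BIGQUERY_DATASET_ID    Target BigQuery dataset (defaults to 'bulltrackers_data')
+  * - USE_MERGE_FOR_DEDUP    'true' inserts via insertRowsWithMerge (BigQuery MERGE) instead of check-then-insert
+  * - SKIP_DUPLICATE_CHECK   'true' skips the existing-row check before plain inserts
+  * - DEDUPLICATE_AT_END     'true' runs a final removeDuplicates pass per table
+  */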
+ 
+ const { Firestore, FieldValue } = require('@google-cloud/firestore');
+ const pLimit = require('p-limit');
+ const {
+   ensurePortfolioSnapshotsTable,
+   ensureTradeHistorySnapshotsTable,
+   ensureSocialPostSnapshotsTable,
+   insertRows,
+   insertRowsWithMerge,
+   checkExistingRows,
+   removeDuplicates
+ } = require('../../core/utils/bigquery_utils');
+ 
+ // Reusable timestamp converter (created once, reused)
+ const timestampReplacer = (key, value) => {
+   if (value && typeof value === 'object' && value.toDate && typeof value.toDate === 'function') {
+     return value.toDate().toISOString();
+   }
+   return value;
+ };
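+ // Used below as JSON.parse(JSON.stringify(data, timestampReplacer)) to produce plain JSON
+ // with Firestore Timestamp fields rendered as ISO-8601 strings.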
+ 
+ // Firestore batch read size (Firestore getAll limit is 100)
+ const FIRESTORE_BATCH_SIZE = 100;
+ 
+ const db = new Firestore();
+ const datasetId = process.env.BIGQUERY_DATASET_ID || 'bulltrackers_data';
+ const PROGRESS_COLLECTION = 'backfill_progress';
+ 
+ /**
+  * Get all dates between start and end (inclusive)
+  */
+ function getDateRange(startDateStr, endDateStr) {
+   const dates = [];
+   const start = new Date(startDateStr);
+   const end = new Date(endDateStr);
+ 
+   for (let d = new Date(start); d <= end; d.setUTCDate(d.getUTCDate() + 1)) {
+     dates.push(d.toISOString().split('T')[0]);
+   }
+ 
+   return dates;
+ }
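+ // Example: getDateRange('2024-01-01', '2024-01-03') returns ['2024-01-01', '2024-01-02', '2024-01-03'].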
+ 
+ /**
+  * Check if a date has already been backfilled
+  */
+ async function isDateBackfilled(dataType, dateStr) {
+   const progressRef = db.collection(PROGRESS_COLLECTION)
+     .doc(dataType)
+     .collection('dates')
+     .doc(dateStr);
+ 
+   const snap = await progressRef.get();
+   return snap.exists && snap.data().completed === true;
+ }
+ 
+ /**
+  * Mark a date as backfilled
+  * @param {string} dataType - Data type ('portfolio', 'history', 'social')
+  * @param {string} dateStr - Date string (YYYY-MM-DD)
+  * @param {number} rowCount - Number of rows successfully inserted
+  * @param {number} expectedCount - Expected total rows (for verification)
+  */
+ async function markDateBackfilled(dataType, dateStr, rowCount, expectedCount = null) {
+   const progressRef = db.collection(PROGRESS_COLLECTION)
+     .doc(dataType)
+     .collection('dates')
+     .doc(dateStr);
+ 
+   await progressRef.set({
+     completed: true,
+     rowCount: rowCount,
+     expectedCount: expectedCount || rowCount, // Store expected for verification
+     completedAt: FieldValue.serverTimestamp()
+   }, { merge: true });
+ }
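+ // Progress documents live at backfill_progress/{dataType}/dates/{YYYY-MM-DD}; a re-run
+ // skips any date whose document has completed === true (see isDateBackfilled above).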
+ 
+ /**
+  * Backfill portfolio data for a single date
+  */
+ async function backfillPortfolioData(dateStr, logger) {
+   const rows = [];
+ 
+   try {
+     // Collect all document references first
+     const allDocRefs = [];
+ 
+     // NEW STRUCTURE: PopularInvestorPortfolioData/{date}/{cid}/{cid}
+     const piPortCollection = db.collection('PopularInvestorPortfolioData').doc(dateStr);
+     const piSubcollections = await piPortCollection.listCollections();
+     for (const subcol of piSubcollections) {
+       const cid = subcol.id;
+       allDocRefs.push({ ref: subcol.doc(cid), type: 'POPULAR_INVESTOR', cid });
+     }
+ 
+     // NEW STRUCTURE: SignedInUserPortfolioData/{date}/{cid}/{cid}
+     const signedInPortCollection = db.collection('SignedInUserPortfolioData').doc(dateStr);
+     const signedInSubcollections = await signedInPortCollection.listCollections();
+     for (const subcol of signedInSubcollections) {
+       const cid = subcol.id;
+       allDocRefs.push({ ref: subcol.doc(cid), type: 'SIGNED_IN_USER', cid });
+     }
+ 
+     if (allDocRefs.length === 0) {
+       logger.log('INFO', `[Backfill] Portfolio: No data found for ${dateStr}, skipping`);
+       return 0;
+     }
+ 
+     // Batch read all documents using getAll() (up to 100 per call, much faster)
+     for (let i = 0; i < allDocRefs.length; i += FIRESTORE_BATCH_SIZE) {
+       const batch = allDocRefs.slice(i, i + FIRESTORE_BATCH_SIZE);
+       const docRefs = batch.map(item => item.ref);
+ 
+       // Single RPC call for up to 100 documents
+       const docs = await db.getAll(...docRefs);
+ 
+       for (let j = 0; j < docs.length; j++) {
+         const doc = docs[j];
+         if (doc.exists) {
+           const data = doc.data();
+           const batchItem = batch[j];
+ 
+           // Convert Firestore Timestamps to ISO strings (reusable function)
+           const cleanData = JSON.parse(JSON.stringify(data, timestampReplacer));
+ 
+           rows.push({
+             date: dateStr,
+             user_id: Number(batchItem.cid) || 0,
+             user_type: batchItem.type,
+             portfolio_data: JSON.stringify(cleanData), // BigQuery JSON type requires a string
+             fetched_at: data.fetchedAt?.toDate?.()?.toISOString() || new Date().toISOString()
+           });
+         }
+       }
+     }
+ 
+     // LEGACY STRUCTURE: NormalUserPortfolios/{blockId}/snapshots/{date}/parts/{partId}
+     // Note: the legacy structure is more complex (parts must be read and merged),
+     // so it is skipped for now and only the new structure is backfilled.
+     // TODO: Add legacy structure support if needed
+ 
+     if (rows.length > 0) {
+       await ensurePortfolioSnapshotsTable(logger);
+ 
+       // Use BigQuery MERGE for SQL-native deduplication
+       // (MERGE handles duplicates itself, so no prior existence check is needed)
+       if (process.env.USE_MERGE_FOR_DEDUP === 'true') {
+         logger.log('INFO', `[Backfill] Portfolio: Using MERGE for ${dateStr} (${rows.length} rows, SQL-native deduplication)`);
+         try {
+           const rowsInserted = await insertRowsWithMerge(
+             datasetId,
+             'portfolio_snapshots',
+             rows,
+             ['date', 'user_id', 'user_type'], // Unique key fields
+             logger
+           );
+           logger.log('INFO', `[Backfill] Portfolio: MERGE completed for ${dateStr} - ${rowsInserted} new rows inserted`);
+           return rowsInserted;
+         } catch (mergeError) {
+           logger.log('WARN', `[Backfill] Portfolio MERGE failed for ${dateStr}, falling back to check-then-insert: ${mergeError.message}`);
+           // Fall through to check-then-insert method
+         }
+       }
+ 
+       // Fallback: Check for existing rows to prevent duplicates
+       // Can skip if SKIP_DUPLICATE_CHECK=true (for fresh backfills)
+       let newRows = rows;
+       if (process.env.SKIP_DUPLICATE_CHECK !== 'true') {
+         const existingKeys = await checkExistingRows(datasetId, 'portfolio_snapshots', dateStr, rows, logger);
+ 
+         // Filter out rows that already exist
+         newRows = rows.filter(row => {
+           const key = `${row.user_id}|${row.user_type}`;
+           return !existingKeys.has(key);
+         });
+ 
+         if (newRows.length < rows.length) {
+           logger.log('INFO', `[Backfill] Portfolio: Skipping ${rows.length - newRows.length} duplicate rows for ${dateStr}`);
+         }
+       } else {
+         logger.log('INFO', `[Backfill] Portfolio: Skipping duplicate check for ${dateStr} (SKIP_DUPLICATE_CHECK=true)`);
+       }
+ 
+       if (newRows.length > 0) {
+         // Insert in smaller batches to avoid request size limits
+         const BATCH_SIZE = 50; // Insert 50 rows at a time
+         let totalInserted = 0;
+         const failedBatches = [];
+         for (let i = 0; i < newRows.length; i += BATCH_SIZE) {
+           const batch = newRows.slice(i, i + BATCH_SIZE);
+           try {
+             await insertRows(datasetId, 'portfolio_snapshots', batch, logger);
+             totalInserted += batch.length;
+           } catch (batchError) {
+             const batchNum = Math.floor(i / BATCH_SIZE) + 1;
+             logger.log('WARN', `[Backfill] Portfolio batch insert failed for ${dateStr}, batch ${batchNum}: ${batchError.message}`);
+             failedBatches.push({ batchNum, startIndex: i, batchSize: batch.length });
+             // Continue with next batch
+           }
+         }
+         logger.log('INFO', `[Backfill] Portfolio: Inserted ${totalInserted}/${newRows.length} new rows for ${dateStr} (${rows.length - newRows.length} duplicates skipped)`);
+ 
+         // Return actual inserted count, and throw if partial failure
+         if (totalInserted < newRows.length) {
+           const missing = newRows.length - totalInserted;
+           throw new Error(`Partial failure: Only inserted ${totalInserted}/${newRows.length} rows. ${missing} rows failed. Failed batches: ${failedBatches.map(b => `batch ${b.batchNum}`).join(', ')}`);
+         }
+ 
+         return totalInserted + (rows.length - newRows.length); // Return total (new + existing)
+       } else {
+         logger.log('INFO', `[Backfill] Portfolio: All rows already exist for ${dateStr}, skipping`);
+         return rows.length; // All were duplicates, return count found
+       }
+     } else {
+       logger.log('INFO', `[Backfill] Portfolio: No data found for ${dateStr}, skipping`);
+       return 0;
+     }
+   } catch (error) {
+     logger.log('ERROR', `[Backfill] Portfolio error for ${dateStr}:`, {
+       message: error.message,
+       code: error.code,
+       errors: error.errors,
+       stack: error.stack
+     });
+     throw error;
+   }
+ }
+ 
+ /**
+  * Backfill trade history data for a single date
+  */
+ async function backfillHistoryData(dateStr, logger) {
+   const rows = [];
+ 
+   try {
+     // Collect all document references first
+     const allDocRefs = [];
+ 
+     // NEW STRUCTURE: PopularInvestorTradeHistoryData/{date}/{cid}/{cid}
+     const piHistCollection = db.collection('PopularInvestorTradeHistoryData').doc(dateStr);
+     const piSubcollections = await piHistCollection.listCollections();
+     for (const subcol of piSubcollections) {
+       const cid = subcol.id;
+       allDocRefs.push({ ref: subcol.doc(cid), type: 'POPULAR_INVESTOR', cid });
+     }
+ 
+     // NEW STRUCTURE: SignedInUserTradeHistoryData/{date}/{cid}/{cid}
+     const signedInHistCollection = db.collection('SignedInUserTradeHistoryData').doc(dateStr);
+     const signedInSubcollections = await signedInHistCollection.listCollections();
+     for (const subcol of signedInSubcollections) {
+       const cid = subcol.id;
+       allDocRefs.push({ ref: subcol.doc(cid), type: 'SIGNED_IN_USER', cid });
+     }
+ 
+     if (allDocRefs.length === 0) {
+       logger.log('INFO', `[Backfill] History: No data found for ${dateStr}, skipping`);
+       return 0;
+     }
+ 
+     // Batch read all documents using getAll() (up to 100 per call, much faster)
+     for (let i = 0; i < allDocRefs.length; i += FIRESTORE_BATCH_SIZE) {
+       const batch = allDocRefs.slice(i, i + FIRESTORE_BATCH_SIZE);
+       const docRefs = batch.map(item => item.ref);
+ 
+       // Single RPC call for up to 100 documents
+       const docs = await db.getAll(...docRefs);
+ 
+       for (let j = 0; j < docs.length; j++) {
+         const doc = docs[j];
+         if (doc.exists) {
+           const data = doc.data();
+           const batchItem = batch[j];
+ 
+           // Convert Firestore Timestamps to ISO strings (reusable function)
+           const cleanData = JSON.parse(JSON.stringify(data, timestampReplacer));
+ 
+           rows.push({
+             date: dateStr,
+             user_id: Number(batchItem.cid) || 0,
+             user_type: batchItem.type,
+             history_data: JSON.stringify(cleanData), // BigQuery JSON type requires a string
+             fetched_at: data.fetchedAt?.toDate?.()?.toISOString() || new Date().toISOString()
+           });
+         }
+       }
+     }
+ 
+     if (rows.length > 0) {
+       await ensureTradeHistorySnapshotsTable(logger);
+ 
+       // Use BigQuery MERGE for SQL-native deduplication
+       if (process.env.USE_MERGE_FOR_DEDUP === 'true') {
+         logger.log('INFO', `[Backfill] History: Using MERGE for ${dateStr} (${rows.length} rows, SQL-native deduplication)`);
+         try {
+           const rowsInserted = await insertRowsWithMerge(
+             datasetId,
+             'trade_history_snapshots',
+             rows,
+             ['date', 'user_id', 'user_type'], // Unique key fields
+             logger
+           );
+           logger.log('INFO', `[Backfill] History: MERGE completed for ${dateStr} - ${rowsInserted} new rows inserted`);
+           return rowsInserted;
+         } catch (mergeError) {
+           logger.log('WARN', `[Backfill] History MERGE failed for ${dateStr}, falling back to check-then-insert: ${mergeError.message}`);
+           // Fall through to check-then-insert method
+         }
+       }
+ 
+       // Fallback: Check for existing rows to prevent duplicates
+       let newRows = rows;
+       if (process.env.SKIP_DUPLICATE_CHECK !== 'true') {
+         const existingKeys = await checkExistingRows(datasetId, 'trade_history_snapshots', dateStr, rows, logger);
+ 
+         // Filter out rows that already exist
+         newRows = rows.filter(row => {
+           const key = `${row.user_id}|${row.user_type}`;
+           return !existingKeys.has(key);
+         });
+ 
+         if (newRows.length < rows.length) {
+           logger.log('INFO', `[Backfill] History: Skipping ${rows.length - newRows.length} duplicate rows for ${dateStr}`);
+         }
+       } else {
+         logger.log('INFO', `[Backfill] History: Skipping duplicate check for ${dateStr} (SKIP_DUPLICATE_CHECK=true)`);
+       }
+ 
+       if (newRows.length > 0) {
+         // Insert in smaller batches to avoid request size limits (trade history can be very large)
+         const BATCH_SIZE = 25; // Smaller batches for history (can be very large)
+         let totalInserted = 0;
+         const failedBatches = [];
+         for (let i = 0; i < newRows.length; i += BATCH_SIZE) {
+           const batch = newRows.slice(i, i + BATCH_SIZE);
+           try {
+             await insertRows(datasetId, 'trade_history_snapshots', batch, logger);
+             totalInserted += batch.length;
+           } catch (batchError) {
+             const batchNum = Math.floor(i / BATCH_SIZE) + 1;
+             logger.log('WARN', `[Backfill] History batch insert failed for ${dateStr}, batch ${batchNum}: ${batchError.message}`);
+             failedBatches.push({ batchNum, startIndex: i, batchSize: batch.length });
+             // Continue with next batch
+           }
+         }
+         logger.log('INFO', `[Backfill] History: Inserted ${totalInserted}/${newRows.length} new rows for ${dateStr} (${rows.length - newRows.length} duplicates skipped)`);
+ 
+         // Return actual inserted count, and throw if partial failure
+         if (totalInserted < newRows.length) {
+           const missing = newRows.length - totalInserted;
+           throw new Error(`Partial failure: Only inserted ${totalInserted}/${newRows.length} rows. ${missing} rows failed. Failed batches: ${failedBatches.map(b => `batch ${b.batchNum}`).join(', ')}`);
+         }
+ 
+         return totalInserted + (rows.length - newRows.length); // Return total (new + existing)
+       } else {
+         logger.log('INFO', `[Backfill] History: All rows already exist for ${dateStr}, skipping`);
+         return rows.length; // All were duplicates, return count found
+       }
+     } else {
+       logger.log('INFO', `[Backfill] History: No data found for ${dateStr}, skipping`);
+       return 0;
+     }
+   } catch (error) {
+     logger.log('ERROR', `[Backfill] History error for ${dateStr}:`, {
+       message: error.message,
+       code: error.code,
+       errors: error.errors,
+       stack: error.stack
+     });
+     throw error;
+   }
+ }
+ 
+ /**
+  * Backfill social post data for a single date
+  */
+ async function backfillSocialData(dateStr, logger) {
+   const rows = [];
+ 
+   try {
+     // Collect all document references first
+     const allDocRefs = [];
+ 
+     // NEW STRUCTURE: PopularInvestorSocialPostData/{date}/{cid}/{cid}
+     const piSocialCollection = db.collection('PopularInvestorSocialPostData').doc(dateStr);
+     const piSubcollections = await piSocialCollection.listCollections();
+     for (const subcol of piSubcollections) {
+       const cid = subcol.id;
+       allDocRefs.push({ ref: subcol.doc(cid), type: 'POPULAR_INVESTOR', cid });
+     }
+ 
+     // NEW STRUCTURE: SignedInUserSocialPostData/{date}/{cid}/{cid}
+     const signedInSocialCollection = db.collection('SignedInUserSocialPostData').doc(dateStr);
+     const signedInSubcollections = await signedInSocialCollection.listCollections();
+     for (const subcol of signedInSubcollections) {
+       const cid = subcol.id;
+       allDocRefs.push({ ref: subcol.doc(cid), type: 'SIGNED_IN_USER', cid });
+     }
+ 
+     if (allDocRefs.length === 0) {
+       logger.log('INFO', `[Backfill] Social: No data found for ${dateStr}, skipping`);
+       return 0;
+     }
+ 
+     // Batch read all documents using getAll() (up to 100 per call, much faster)
+     for (let i = 0; i < allDocRefs.length; i += FIRESTORE_BATCH_SIZE) {
+       const batch = allDocRefs.slice(i, i + FIRESTORE_BATCH_SIZE);
+       const docRefs = batch.map(item => item.ref);
+ 
+       // Single RPC call for up to 100 documents
+       const docs = await db.getAll(...docRefs);
+ 
+       for (let j = 0; j < docs.length; j++) {
+         const doc = docs[j];
+         if (doc.exists) {
+           const data = doc.data();
+           const batchItem = batch[j];
+ 
+           // Convert Firestore Timestamps to ISO strings (reusable function)
+           const cleanData = JSON.parse(JSON.stringify(data, timestampReplacer));
+ 
+           rows.push({
+             date: dateStr,
+             user_id: Number(batchItem.cid) || 0,
+             user_type: batchItem.type,
+             posts_data: JSON.stringify(cleanData), // BigQuery JSON type requires a string
+             fetched_at: data.fetchedAt?.toDate?.()?.toISOString() || new Date().toISOString()
+           });
+         }
+       }
+     }
+ 
+     if (rows.length > 0) {
+       await ensureSocialPostSnapshotsTable(logger);
+ 
+       // Use BigQuery MERGE for SQL-native deduplication
+       if (process.env.USE_MERGE_FOR_DEDUP === 'true') {
+         logger.log('INFO', `[Backfill] Social: Using MERGE for ${dateStr} (${rows.length} rows, SQL-native deduplication)`);
+         try {
+           const rowsInserted = await insertRowsWithMerge(
+             datasetId,
+             'social_post_snapshots',
+             rows,
+             ['date', 'user_id', 'user_type'], // Unique key fields
+             logger
+           );
+           logger.log('INFO', `[Backfill] Social: MERGE completed for ${dateStr} - ${rowsInserted} new rows inserted`);
+           return rowsInserted;
+         } catch (mergeError) {
+           logger.log('WARN', `[Backfill] Social MERGE failed for ${dateStr}, falling back to check-then-insert: ${mergeError.message}`);
+           // Fall through to check-then-insert method
+         }
+       }
+ 
+       // Fallback: Check for existing rows to prevent duplicates
+       let newRows = rows;
+       if (process.env.SKIP_DUPLICATE_CHECK !== 'true') {
+         const existingKeys = await checkExistingRows(datasetId, 'social_post_snapshots', dateStr, rows, logger);
+ 
+         // Filter out rows that already exist
+         newRows = rows.filter(row => {
+           const key = `${row.user_id}|${row.user_type}`;
+           return !existingKeys.has(key);
+         });
+ 
+         if (newRows.length < rows.length) {
+           logger.log('INFO', `[Backfill] Social: Skipping ${rows.length - newRows.length} duplicate rows for ${dateStr}`);
+         }
+       } else {
+         logger.log('INFO', `[Backfill] Social: Skipping duplicate check for ${dateStr} (SKIP_DUPLICATE_CHECK=true)`);
+       }
+ 
+       if (newRows.length > 0) {
+         // Insert in smaller batches to avoid request size limits
+         const BATCH_SIZE = 50; // Insert 50 rows at a time
+         let totalInserted = 0;
+         const failedBatches = [];
+         for (let i = 0; i < newRows.length; i += BATCH_SIZE) {
+           const batch = newRows.slice(i, i + BATCH_SIZE);
+           try {
+             await insertRows(datasetId, 'social_post_snapshots', batch, logger);
+             totalInserted += batch.length;
+           } catch (batchError) {
+             const batchNum = Math.floor(i / BATCH_SIZE) + 1;
+             logger.log('WARN', `[Backfill] Social batch insert failed for ${dateStr}, batch ${batchNum}: ${batchError.message}`);
+             failedBatches.push({ batchNum, startIndex: i, batchSize: batch.length });
+             // Continue with next batch
+           }
+         }
+         logger.log('INFO', `[Backfill] Social: Inserted ${totalInserted}/${newRows.length} new rows for ${dateStr} (${rows.length - newRows.length} duplicates skipped)`);
+ 
+         // Return actual inserted count, and throw if partial failure
+         if (totalInserted < newRows.length) {
+           const missing = newRows.length - totalInserted;
+           throw new Error(`Partial failure: Only inserted ${totalInserted}/${newRows.length} rows. ${missing} rows failed. Failed batches: ${failedBatches.map(b => `batch ${b.batchNum}`).join(', ')}`);
+         }
+ 
+         return totalInserted + (rows.length - newRows.length); // Return total (new + existing)
+       } else {
+         logger.log('INFO', `[Backfill] Social: All rows already exist for ${dateStr}, skipping`);
+         return rows.length; // All were duplicates, return count found
+       }
+     } else {
+       logger.log('INFO', `[Backfill] Social: No data found for ${dateStr}, skipping`);
+       return 0;
+     }
+   } catch (error) {
+     logger.log('ERROR', `[Backfill] Social error for ${dateStr}:`, {
+       message: error.message,
+       code: error.code,
+       errors: error.errors,
+       stack: error.stack
+     });
+     throw error;
+   }
+ }
+ 
+ /**
+  * Main backfill function
+  * Works both as Cloud Function (req/res) and local script (no req/res)
+  */
+ async function backfillTaskEngineData(req, res) {
+   const logger = {
+     log: (level, message, ...args) => {
+       const timestamp = new Date().toISOString();
+       console.log(`[${timestamp}] [${level}] ${message}`, ...args);
+     }
+   };
+ 
+   // Determine if running as Cloud Function or local script
+   const isCloudFunction = req && res;
+ 
+   // Extract parameters from either HTTP request or command line
+   let startDate, endDate, dataType;
+ 
+   if (isCloudFunction) {
+     // Cloud Function: get from query params
+     startDate = req.query.startDate || '2024-01-01';
+     endDate = req.query.endDate || new Date().toISOString().split('T')[0];
+     dataType = req.query.dataType || 'all';
+   } else {
+     // Local script: get from command line args
+     const args = process.argv.slice(2);
+     const parseArg = (key, defaultValue) => {
+       const arg = args.find(a => a.startsWith(`--${key}=`));
+       return arg ? arg.split('=')[1] : defaultValue;
+     };
+ 
+     startDate = parseArg('startDate', '2024-01-01');
+     endDate = parseArg('endDate', new Date().toISOString().split('T')[0]);
+     dataType = parseArg('dataType', 'all');
+   }
+ 
+   try {
+     logger.log('INFO', `[Backfill] Starting backfill: ${dataType} from ${startDate} to ${endDate}`);
+     logger.log('INFO', `[Backfill] Running in ${isCloudFunction ? 'Cloud Function' : 'local'} mode`);
+ 
+     const dates = getDateRange(startDate, endDate);
+     logger.log('INFO', `[Backfill] Processing ${dates.length} dates`);
+ 
+     const summary = {
+       totalDates: dates.length,
+       processedDates: 0,
+       skippedDates: 0,
+       portfolioRows: 0,
+       historyRows: 0,
+       socialRows: 0,
+       errors: []
+     };
+ 
+     // Process dates in parallel with concurrency limit
+     // Limit to 5 concurrent dates to avoid overwhelming BigQuery/Firestore
+     const dateLimit = pLimit(5);
+ 
+     // Helper function to retry operations with exponential backoff
+     async function retryWithBackoff(fn, maxRetries = 3, baseDelay = 1000) {
+       for (let attempt = 1; attempt <= maxRetries; attempt++) {
+         try {
+           return await fn();
+         } catch (error) {
+           // Check if error is retryable (timeout, rate limit, etc.)
+           const isRetryable = error.code === 4 ||  // DEADLINE_EXCEEDED (timeout)
+             error.code === 8 ||                    // RESOURCE_EXHAUSTED (rate limit)
+             error.code === 14 ||                   // UNAVAILABLE (service unavailable)
+             error.message?.includes('timeout') ||
+             error.message?.includes('rate limit') ||
+             error.message?.includes('exceeded');
+ 
+           if (!isRetryable || attempt === maxRetries) {
+             throw error; // Not retryable or out of retries
+           }
+ 
+           const delay = baseDelay * Math.pow(2, attempt - 1); // Exponential backoff
+           logger.log('WARN', `[Backfill] Retryable error (attempt ${attempt}/${maxRetries}), retrying in ${delay}ms: ${error.message}`);
+           await new Promise(resolve => setTimeout(resolve, delay));
+         }
+       }
+     }
+ 
+     // Process all dates in parallel (with limit)
+     const datePromises = dates.map(dateStr =>
+       dateLimit(async () => {
+         const dateResults = { date: dateStr, portfolio: null, history: null, social: null };
+ 
+         // Process each data type independently - if one fails, the others can still succeed (self-healing)
+         // Data types for a date are processed in parallel for better performance
+         const dataTypeTasks = [];
+ 
+         logger.log('INFO', `[Backfill] Processing date ${dateStr} with dataType=${dataType}`);
+ 
+         // Portfolio
+         if (dataType === 'all' || dataType === 'portfolio') {
+           logger.log('INFO', `[Backfill] Adding portfolio task for ${dateStr}`);
+           dataTypeTasks.push((async () => {
+             try {
+               logger.log('INFO', `[Backfill] Starting portfolio backfill for ${dateStr}`);
+               if (!(await isDateBackfilled('portfolio', dateStr))) {
+                 const count = await retryWithBackoff(
+                   () => backfillPortfolioData(dateStr, logger),
+                   3,    // 3 retries
+                   2000  // 2s base delay
+                 );
+                 summary.portfolioRows += count;
+                 // Only mark as backfilled if we actually inserted data (count > 0)
+                 if (count > 0) {
+                   await markDateBackfilled('portfolio', dateStr, count);
+                 }
+                 dateResults.portfolio = { success: true, count };
+                 logger.log('INFO', `[Backfill] ✅ Portfolio completed for ${dateStr}: ${count} rows`);
+               } else {
+                 logger.log('INFO', `[Backfill] Portfolio for ${dateStr} already backfilled, skipping`);
+                 summary.skippedDates++;
+                 dateResults.portfolio = { success: true, skipped: true };
+               }
+             } catch (portfolioError) {
+               const errorDetails = {
+                 message: portfolioError.message,
+                 code: portfolioError.code,
+                 errors: portfolioError.errors
+               };
+               logger.log('ERROR', `[Backfill] Portfolio failed for ${dateStr} (after retries):`, errorDetails);
+               summary.errors.push({ date: dateStr, dataType: 'portfolio', error: portfolioError.message });
+               dateResults.portfolio = { success: false, error: portfolioError.message };
+             }
+           })());
+         } else {
+           logger.log('INFO', `[Backfill] Skipping portfolio for ${dateStr} (dataType=${dataType})`);
+         }
+ 
+         // History
+         if (dataType === 'all' || dataType === 'history') {
+           logger.log('INFO', `[Backfill] Adding history task for ${dateStr}`);
+           dataTypeTasks.push((async () => {
+             try {
+               logger.log('INFO', `[Backfill] Starting history backfill for ${dateStr}`);
+               if (!(await isDateBackfilled('history', dateStr))) {
+                 const count = await retryWithBackoff(
+                   () => backfillHistoryData(dateStr, logger),
+                   3,    // 3 retries
+                   2000  // 2s base delay
+                 );
+                 summary.historyRows += count;
+                 if (count > 0) {
+                   await markDateBackfilled('history', dateStr, count);
+                 }
+                 dateResults.history = { success: true, count };
+                 logger.log('INFO', `[Backfill] ✅ History completed for ${dateStr}: ${count} rows`);
+               } else {
+                 logger.log('INFO', `[Backfill] History for ${dateStr} already backfilled, skipping`);
+                 dateResults.history = { success: true, skipped: true };
+               }
+             } catch (historyError) {
+               const errorDetails = {
+                 message: historyError.message,
+                 code: historyError.code,
+                 errors: historyError.errors
+               };
+               logger.log('ERROR', `[Backfill] History failed for ${dateStr} (after retries):`, errorDetails);
+               summary.errors.push({ date: dateStr, dataType: 'history', error: historyError.message });
+               dateResults.history = { success: false, error: historyError.message };
+             }
+           })());
+         } else {
+           logger.log('INFO', `[Backfill] Skipping history for ${dateStr} (dataType=${dataType})`);
+         }
+ 
+         // Social
+         if (dataType === 'all' || dataType === 'social') {
+           logger.log('INFO', `[Backfill] Adding social task for ${dateStr}`);
+           dataTypeTasks.push((async () => {
+             try {
+               logger.log('INFO', `[Backfill] Starting social backfill for ${dateStr}`);
+               if (!(await isDateBackfilled('social', dateStr))) {
+                 const count = await retryWithBackoff(
+                   () => backfillSocialData(dateStr, logger),
+                   3,    // 3 retries
+                   2000  // 2s base delay
+                 );
+                 summary.socialRows += count;
+                 if (count > 0) {
+                   await markDateBackfilled('social', dateStr, count);
+                 }
+                 dateResults.social = { success: true, count };
+                 logger.log('INFO', `[Backfill] ✅ Social completed for ${dateStr}: ${count} rows`);
+               } else {
+                 logger.log('INFO', `[Backfill] Social for ${dateStr} already backfilled, skipping`);
+                 dateResults.social = { success: true, skipped: true };
+               }
+             } catch (socialError) {
+               const errorDetails = {
+                 message: socialError.message,
+                 code: socialError.code,
+                 errors: socialError.errors
+               };
+               logger.log('ERROR', `[Backfill] Social failed for ${dateStr} (after retries):`, errorDetails);
+               summary.errors.push({ date: dateStr, dataType: 'social', error: socialError.message });
+               dateResults.social = { success: false, error: socialError.message };
+             }
+           })());
+         } else {
+           logger.log('INFO', `[Backfill] Skipping social for ${dateStr} (dataType=${dataType})`);
+         }
+ 
+         logger.log('INFO', `[Backfill] Executing ${dataTypeTasks.length} data type tasks in parallel for ${dateStr}`);
+ 
+         // Wait for all data types to complete in parallel
+         await Promise.all(dataTypeTasks);
+ 
+         logger.log('INFO', `[Backfill] All data type tasks completed for ${dateStr}`);
+ 
+         // Date is considered processed even if some data types failed
+         // Failed data types will be retried on the next run (self-healing)
+         summary.processedDates++;
+         // Only the three data-type results decide success; dateResults.date is just the date string
+         const allSuccess = ['portfolio', 'history', 'social'].every(k => dateResults[k] === null || dateResults[k].success);
+         return { date: dateStr, success: allSuccess, results: dateResults };
+       })
+     );
762
+
763
+ // Wait for all dates to complete, logging progress periodically
764
+ logger.log('INFO', `[Backfill] Processing ${dates.length} dates in parallel (max 5 concurrent)`);
765
+ const results = await Promise.all(datePromises);
766
+
767
+ const successful = results.filter(r => r.success).length;
768
+ const failed = results.filter(r => !r.success).length;
769
+ logger.log('INFO', `[Backfill] Completed: ${successful} successful, ${failed} failed`);
770
+
771
+ // Optional: Remove duplicates at the end (cheaper than checking each date)
772
+ // Only run if DEDUPLICATE_AT_END=true and we processed all data types
773
+ if (process.env.DEDUPLICATE_AT_END === 'true' && (dataType === 'all' || dataType === 'portfolio' || dataType === 'history' || dataType === 'social')) {
774
+ logger.log('INFO', `[Backfill] Running final deduplication pass...`);
775
+
776
+ const dedupTasks = [];
777
+ if (dataType === 'all' || dataType === 'portfolio') {
778
+ dedupTasks.push(
779
+ removeDuplicates(datasetId, 'portfolio_snapshots', 'date', ['user_id', 'user_type'], logger)
780
+ .then(count => logger.log('INFO', `[Backfill] Deduplication: Removed ${count} duplicate portfolio rows`))
781
+ .catch(err => logger.log('WARN', `[Backfill] Portfolio deduplication failed: ${err.message}`))
782
+ );
783
+ }
784
+ if (dataType === 'all' || dataType === 'history') {
785
+ dedupTasks.push(
786
+ removeDuplicates(datasetId, 'trade_history_snapshots', 'date', ['user_id', 'user_type'], logger)
787
+ .then(count => logger.log('INFO', `[Backfill] Deduplication: Removed ${count} duplicate history rows`))
788
+ .catch(err => logger.log('WARN', `[Backfill] History deduplication failed: ${err.message}`))
789
+ );
790
+ }
791
+ if (dataType === 'all' || dataType === 'social') {
792
+ dedupTasks.push(
793
+ removeDuplicates(datasetId, 'social_post_snapshots', 'date', ['user_id', 'user_type'], logger)
794
+ .then(count => logger.log('INFO', `[Backfill] Deduplication: Removed ${count} duplicate social rows`))
795
+ .catch(err => logger.log('WARN', `[Backfill] Social deduplication failed: ${err.message}`))
796
+ );
797
+ }
798
+
799
+ await Promise.all(dedupTasks);
800
+ logger.log('INFO', `[Backfill] Deduplication complete`);
801
+ }
802
+
803
+ logger.log('INFO', `[Backfill] Complete! Summary:`, JSON.stringify(summary, null, 2));
804
+
805
+ if (isCloudFunction) {
806
+ res.status(200).json({
807
+ success: true,
808
+ message: 'Backfill completed',
809
+ summary: summary
810
+ });
811
+ } else {
812
+ // Local script: just log and exit
813
+ console.log('\n✅ Backfill completed successfully!');
814
+ process.exit(0);
815
+ }
816
+
817
+ } catch (error) {
818
+ logger.log('ERROR', `[Backfill] Fatal error: ${error.message}`, error);
819
+
820
+ if (isCloudFunction) {
821
+ res.status(500).json({
822
+ success: false,
823
+ error: error.message
824
+ });
825
+ } else {
826
+ console.error('\n❌ Backfill failed:', error.message);
827
+ process.exit(1);
828
+ }
829
+ }
830
+ }
831
+
832
+ // If running as a script directly (not imported as module), execute immediately
833
+ if (require.main === module) {
834
+ // Running as local script
835
+ console.log('🚀 Starting local backfill script...\n');
836
+ console.log('Usage: node index.js --startDate=YYYY-MM-DD --endDate=YYYY-MM-DD --dataType=all|portfolio|history|social\n');
837
+
838
+ backfillTaskEngineData(null, null).catch(error => {
839
+ console.error('Fatal error:', error);
840
+ process.exit(1);
841
+ });
842
+ }
843
+
844
+ module.exports = { backfillTaskEngineData };
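
The file only exports the handler; how it is registered as an HTTP trigger is not shown in this diff. As a minimal sketch, assuming the Google Cloud Functions Framework is used to serve it locally (the @google-cloud/functions-framework dependency, the wrapper file name, and the entry-point name are illustrative assumptions, not taken from the package):

    // server.js: hypothetical wrapper, not part of the published package
    const functions = require('@google-cloud/functions-framework');
    const { backfillTaskEngineData } = require('./index');

    // Registers the handler under an HTTP entry point; it can then be invoked with e.g.
    //   curl "http://localhost:8080/?startDate=2024-01-01&endDate=2024-01-31&dataType=portfolio"
    functions.http('backfillTaskEngineData', backfillTaskEngineData);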