bulltrackers-module 1.0.712 → 1.0.714
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/functions/maintenance/backfill-instrument-insights/index.js +180 -0
- package/functions/maintenance/backfill-pi-master-list-rankings/index.js +293 -0
- package/functions/maintenance/backfill-task-engine-data/README.md +72 -0
- package/functions/maintenance/backfill-task-engine-data/index.js +844 -0
- package/functions/task-engine/helpers/data_storage_helpers.js +11 -10
- package/package.json +5 -2
@@ -0,0 +1,844 @@
/**
 * @fileoverview Backfill Task Engine Data from Firestore to BigQuery
 *
 * This function reads existing portfolio, trade history, and social post data
 * from Firestore and writes it to BigQuery tables.
 *
 * Usage (Cloud Function HTTP trigger):
 *   ?startDate=2024-01-01&endDate=2024-12-31&dataType=all
 *   ?startDate=2024-01-01&endDate=2024-12-31&dataType=portfolio
 *   ?startDate=2024-01-01&endDate=2024-12-31&dataType=history
 *   ?startDate=2024-01-01&endDate=2024-12-31&dataType=social
 *
 * Usage (Local Node.js script):
 *   node index.js --startDate=2024-01-01 --endDate=2024-12-31 --dataType=all
 *   node index.js --startDate=2024-01-01 --endDate=2024-12-31 --dataType=portfolio
 *   node index.js --startDate=2024-01-01 --endDate=2024-12-31 --dataType=history
 *   node index.js --startDate=2024-01-01 --endDate=2024-12-31 --dataType=social
 *
 * Features:
 * - Processes in batches to avoid timeouts
 * - Tracks progress in Firestore (resume capability)
 * - Handles both new and legacy Firestore structures
 * - Does NOT delete any Firestore data
 * - Works both as Cloud Function and local script
 */

const { Firestore } = require('@google-cloud/firestore');
const pLimit = require('p-limit');
const {
  ensurePortfolioSnapshotsTable,
  ensureTradeHistorySnapshotsTable,
  ensureSocialPostSnapshotsTable,
  insertRows,
  insertRowsWithMerge,
  checkExistingRows,
  removeDuplicates
} = require('../../core/utils/bigquery_utils');

// Reusable timestamp converter (created once, reused)
const timestampReplacer = (key, value) => {
  if (value && typeof value === 'object' && value.toDate && typeof value.toDate === 'function') {
    return value.toDate().toISOString();
  }
  return value;
};

// Firestore batch read size (Firestore getAll limit is 100)
const FIRESTORE_BATCH_SIZE = 100;

const db = new Firestore();
const datasetId = process.env.BIGQUERY_DATASET_ID || 'bulltrackers_data';
const PROGRESS_COLLECTION = 'backfill_progress';

/**
 * Get all dates between start and end (inclusive)
 */
function getDateRange(startDateStr, endDateStr) {
  const dates = [];
  const start = new Date(startDateStr);
  const end = new Date(endDateStr);

  for (let d = new Date(start); d <= end; d.setUTCDate(d.getUTCDate() + 1)) {
    dates.push(d.toISOString().split('T')[0]);
  }

  return dates;
}
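// Editor's example: the range above is inclusive on both ends, e.g.
//   getDateRange('2024-01-01', '2024-01-03')
//   // -> ['2024-01-01', '2024-01-02', '2024-01-03']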

/**
 * Check if a date has already been backfilled
 */
async function isDateBackfilled(dataType, dateStr) {
  const progressRef = db.collection(PROGRESS_COLLECTION)
    .doc(dataType)
    .collection('dates')
    .doc(dateStr);

  const snap = await progressRef.get();
  return snap.exists && snap.data().completed === true;
}

/**
 * Mark a date as backfilled
 * @param {string} dataType - Data type ('portfolio', 'history', 'social')
 * @param {string} dateStr - Date string (YYYY-MM-DD)
 * @param {number} rowCount - Number of rows successfully inserted
 * @param {number} expectedCount - Expected total rows (for verification)
 */
async function markDateBackfilled(dataType, dateStr, rowCount, expectedCount = null) {
  const progressRef = db.collection(PROGRESS_COLLECTION)
    .doc(dataType)
    .collection('dates')
    .doc(dateStr);

  await progressRef.set({
    completed: true,
    rowCount: rowCount,
    expectedCount: expectedCount || rowCount, // Store expected for verification
    completedAt: require('@google-cloud/firestore').FieldValue.serverTimestamp()
  }, { merge: true });
}
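
// --- Editor's sketch (illustrative, not part of the published file) ----------
// The progress documents written above live at
// backfill_progress/{dataType}/dates/{YYYY-MM-DD}, so a completed-date audit
// for one data type could look like this (the function name is hypothetical):
async function listCompletedDates(dataType) {
  const snap = await db.collection(PROGRESS_COLLECTION)
    .doc(dataType)
    .collection('dates')
    .where('completed', '==', true)
    .get();
  return snap.docs.map(d => ({ date: d.id, rows: d.data().rowCount }));
}
// ------------------------------------------------------------------------------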

/**
 * Backfill portfolio data for a single date
 */
async function backfillPortfolioData(dateStr, logger) {
  const rows = [];

  try {
    // Collect all document references first
    const allDocRefs = [];

    // NEW STRUCTURE: PopularInvestorPortfolioData/{date}/{cid}/{cid}
    const piPortCollection = db.collection('PopularInvestorPortfolioData').doc(dateStr);
    const piSubcollections = await piPortCollection.listCollections();
    for (const subcol of piSubcollections) {
      const cid = subcol.id;
      allDocRefs.push({ ref: subcol.doc(cid), type: 'POPULAR_INVESTOR', cid });
    }

    // NEW STRUCTURE: SignedInUserPortfolioData/{date}/{cid}/{cid}
    const signedInPortCollection = db.collection('SignedInUserPortfolioData').doc(dateStr);
    const signedInSubcollections = await signedInPortCollection.listCollections();
    for (const subcol of signedInSubcollections) {
      const cid = subcol.id;
      allDocRefs.push({ ref: subcol.doc(cid), type: 'SIGNED_IN_USER', cid });
    }

    if (allDocRefs.length === 0) {
      logger.log('INFO', `[Backfill] Portfolio: No data found for ${dateStr}, skipping`);
      return 0;
    }

    // Batch read all documents using getAll() (up to 100 per call, much faster)
    for (let i = 0; i < allDocRefs.length; i += FIRESTORE_BATCH_SIZE) {
      const batch = allDocRefs.slice(i, i + FIRESTORE_BATCH_SIZE);
      const docRefs = batch.map(item => item.ref);

      // Single RPC call for up to 100 documents
      const docs = await db.getAll(...docRefs);

      for (let j = 0; j < docs.length; j++) {
        const doc = docs[j];
        if (doc.exists) {
          const data = doc.data();
          const batchItem = batch[j];

          // Convert Firestore Timestamps to ISO strings (reusable function)
          const cleanData = JSON.parse(JSON.stringify(data, timestampReplacer));

          rows.push({
            date: dateStr,
            user_id: Number(batchItem.cid) || 0,
            user_type: batchItem.type,
            portfolio_data: JSON.stringify(cleanData), // BigQuery JSON type requires a string
            fetched_at: data.fetchedAt?.toDate?.()?.toISOString() || new Date().toISOString()
          });
        }
      }
    }

    // LEGACY STRUCTURE: NormalUserPortfolios/{blockId}/snapshots/{date}/parts/{partId}
    // Note: Legacy structure is more complex, would need to read parts and merge
    // For now, we'll skip legacy and focus on new structure
    // TODO: Add legacy structure support if needed

    if (rows.length > 0) {
      await ensurePortfolioSnapshotsTable(logger);

      // Use BigQuery MERGE for efficient deduplication (SQL-native, most efficient)
      // MERGE handles duplicates natively - no need to check first
      if (process.env.USE_MERGE_FOR_DEDUP === 'true') {
        logger.log('INFO', `[Backfill] Portfolio: Using MERGE for ${dateStr} (${rows.length} rows, SQL-native deduplication)`);
        try {
          const rowsInserted = await insertRowsWithMerge(
            datasetId,
            'portfolio_snapshots',
            rows,
            ['date', 'user_id', 'user_type'], // Unique key fields
            logger
          );
          logger.log('INFO', `[Backfill] Portfolio: MERGE completed for ${dateStr} - ${rowsInserted} new rows inserted`);
          return rowsInserted;
        } catch (mergeError) {
          logger.log('WARN', `[Backfill] Portfolio MERGE failed for ${dateStr}, falling back to check-then-insert: ${mergeError.message}`);
          // Fall through to check-then-insert method
        }
      }

      // Fallback: Check for existing rows to prevent duplicates
      // Can skip if SKIP_DUPLICATE_CHECK=true (for fresh backfills)
      let newRows = rows;
      if (process.env.SKIP_DUPLICATE_CHECK !== 'true') {
        const existingKeys = await checkExistingRows(datasetId, 'portfolio_snapshots', dateStr, rows, logger);

        // Filter out rows that already exist
        newRows = rows.filter(row => {
          const key = `${row.user_id}|${row.user_type}`;
          return !existingKeys.has(key);
        });

        if (newRows.length < rows.length) {
          logger.log('INFO', `[Backfill] Portfolio: Skipping ${rows.length - newRows.length} duplicate rows for ${dateStr}`);
        }
      } else {
        logger.log('INFO', `[Backfill] Portfolio: Skipping duplicate check for ${dateStr} (SKIP_DUPLICATE_CHECK=true)`);
      }

      if (newRows.length > 0) {
        // Insert in smaller batches to avoid request size limits
        const BATCH_SIZE = 50; // Insert 50 rows at a time
        let totalInserted = 0;
        const failedBatches = [];
        for (let i = 0; i < newRows.length; i += BATCH_SIZE) {
          const batch = newRows.slice(i, i + BATCH_SIZE);
          try {
            await insertRows(datasetId, 'portfolio_snapshots', batch, logger);
            totalInserted += batch.length;
          } catch (batchError) {
            const batchNum = Math.floor(i / BATCH_SIZE) + 1;
            logger.log('WARN', `[Backfill] Portfolio batch insert failed for ${dateStr}, batch ${batchNum}: ${batchError.message}`);
            failedBatches.push({ batchNum, startIndex: i, batchSize: batch.length });
            // Continue with next batch
          }
        }
        logger.log('INFO', `[Backfill] Portfolio: Inserted ${totalInserted}/${newRows.length} new rows for ${dateStr} (${rows.length - newRows.length} duplicates skipped)`);

        // Return actual inserted count, and throw if partial failure
        if (totalInserted < newRows.length) {
          const missing = newRows.length - totalInserted;
          throw new Error(`Partial failure: Only inserted ${totalInserted}/${newRows.length} rows. ${missing} rows failed. Failed batches: ${failedBatches.map(b => `batch ${b.batchNum}`).join(', ')}`);
        }

        return totalInserted + (rows.length - newRows.length); // Return total (new + existing)
      } else {
        logger.log('INFO', `[Backfill] Portfolio: All rows already exist for ${dateStr}, skipping`);
        return rows.length; // All were duplicates, return count found
      }
    } else {
      logger.log('INFO', `[Backfill] Portfolio: No data found for ${dateStr}, skipping`);
      return 0;
    }
  } catch (error) {
    logger.log('ERROR', `[Backfill] Portfolio error for ${dateStr}:`, {
      message: error.message,
      code: error.code,
      errors: error.errors,
      stack: error.stack
    });
    throw error;
  }
}
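
// --- Editor's sketch (assumption, not part of the published file) ------------
// insertRowsWithMerge comes from ../../core/utils/bigquery_utils, which is not
// shown in this diff; only its call signature (datasetId, tableId, rows,
// keyFields, logger) is visible above. One plausible shape of the "SQL-native
// deduplication" it is described as providing: stage the rows in a temporary
// table, MERGE into the target on the key fields, and report how many rows
// were actually inserted.
async function insertRowsWithMergeSketch(dsId, tableId, rows, keyFields, logger) {
  const { BigQuery } = require('@google-cloud/bigquery');
  const bigquery = new BigQuery();
  const dataset = bigquery.dataset(dsId);
  const stagingId = `${tableId}_staging_${Date.now()}`;

  // Clone the target table's schema for the staging table.
  const [targetMeta] = await dataset.table(tableId).getMetadata();
  const [staging] = await dataset.createTable(stagingId, { schema: targetMeta.schema });
  try {
    await staging.insert(rows);

    const joinOn = keyFields.map(f => `T.${f} = S.${f}`).join(' AND ');
    const [job] = await bigquery.createQueryJob({
      query: `MERGE \`${dsId}.${tableId}\` T
              USING \`${dsId}.${stagingId}\` S
              ON ${joinOn}
              WHEN NOT MATCHED THEN INSERT ROW`
    });
    await job.getQueryResults();
    const [meta] = await job.getMetadata();
    const inserted = Number(meta.statistics.query.numDmlAffectedRows || 0);
    logger.log('INFO', `[Backfill] MERGE sketch inserted ${inserted} rows into ${tableId}`);
    return inserted;
  } finally {
    await staging.delete().catch(() => {});
  }
}
// ------------------------------------------------------------------------------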

/**
 * Backfill trade history data for a single date
 */
async function backfillHistoryData(dateStr, logger) {
  const rows = [];

  try {
    // Collect all document references first
    const allDocRefs = [];

    // NEW STRUCTURE: PopularInvestorTradeHistoryData/{date}/{cid}/{cid}
    const piHistCollection = db.collection('PopularInvestorTradeHistoryData').doc(dateStr);
    const piSubcollections = await piHistCollection.listCollections();
    for (const subcol of piSubcollections) {
      const cid = subcol.id;
      allDocRefs.push({ ref: subcol.doc(cid), type: 'POPULAR_INVESTOR', cid });
    }

    // NEW STRUCTURE: SignedInUserTradeHistoryData/{date}/{cid}/{cid}
    const signedInHistCollection = db.collection('SignedInUserTradeHistoryData').doc(dateStr);
    const signedInSubcollections = await signedInHistCollection.listCollections();
    for (const subcol of signedInSubcollections) {
      const cid = subcol.id;
      allDocRefs.push({ ref: subcol.doc(cid), type: 'SIGNED_IN_USER', cid });
    }

    if (allDocRefs.length === 0) {
      logger.log('INFO', `[Backfill] History: No data found for ${dateStr}, skipping`);
      return 0;
    }

    // Batch read all documents using getAll() (up to 100 per call, much faster)
    for (let i = 0; i < allDocRefs.length; i += FIRESTORE_BATCH_SIZE) {
      const batch = allDocRefs.slice(i, i + FIRESTORE_BATCH_SIZE);
      const docRefs = batch.map(item => item.ref);

      // Single RPC call for up to 100 documents
      const docs = await db.getAll(...docRefs);

      for (let j = 0; j < docs.length; j++) {
        const doc = docs[j];
        if (doc.exists) {
          const data = doc.data();
          const batchItem = batch[j];

          // Convert Firestore Timestamps to ISO strings (reusable function)
          const cleanData = JSON.parse(JSON.stringify(data, timestampReplacer));

          rows.push({
            date: dateStr,
            user_id: Number(batchItem.cid) || 0,
            user_type: batchItem.type,
            history_data: JSON.stringify(cleanData), // BigQuery JSON type requires a string
            fetched_at: data.fetchedAt?.toDate?.()?.toISOString() || new Date().toISOString()
          });
        }
      }
    }

    if (rows.length > 0) {
      await ensureTradeHistorySnapshotsTable(logger);

      // Use BigQuery MERGE for efficient deduplication (SQL-native, most efficient)
      if (process.env.USE_MERGE_FOR_DEDUP === 'true') {
        logger.log('INFO', `[Backfill] History: Using MERGE for ${dateStr} (${rows.length} rows, SQL-native deduplication)`);
        try {
          const rowsInserted = await insertRowsWithMerge(
            datasetId,
            'trade_history_snapshots',
            rows,
            ['date', 'user_id', 'user_type'], // Unique key fields
            logger
          );
          logger.log('INFO', `[Backfill] History: MERGE completed for ${dateStr} - ${rowsInserted} new rows inserted`);
          return rowsInserted;
        } catch (mergeError) {
          logger.log('WARN', `[Backfill] History MERGE failed for ${dateStr}, falling back to check-then-insert: ${mergeError.message}`);
          // Fall through to check-then-insert method
        }
      }

      // Fallback: Check for existing rows to prevent duplicates
      let newRows = rows;
      if (process.env.SKIP_DUPLICATE_CHECK !== 'true') {
        const existingKeys = await checkExistingRows(datasetId, 'trade_history_snapshots', dateStr, rows, logger);

        // Filter out rows that already exist
        newRows = rows.filter(row => {
          const key = `${row.user_id}|${row.user_type}`;
          return !existingKeys.has(key);
        });

        if (newRows.length < rows.length) {
          logger.log('INFO', `[Backfill] History: Skipping ${rows.length - newRows.length} duplicate rows for ${dateStr}`);
        }
      } else {
        logger.log('INFO', `[Backfill] History: Skipping duplicate check for ${dateStr} (SKIP_DUPLICATE_CHECK=true)`);
      }

      if (newRows.length > 0) {
        // Insert in smaller batches to avoid request size limits (trade history can be very large)
        const BATCH_SIZE = 25; // Smaller batches for history (can be very large)
        let totalInserted = 0;
        const failedBatches = [];
        for (let i = 0; i < newRows.length; i += BATCH_SIZE) {
          const batch = newRows.slice(i, i + BATCH_SIZE);
          try {
            await insertRows(datasetId, 'trade_history_snapshots', batch, logger);
            totalInserted += batch.length;
          } catch (batchError) {
            const batchNum = Math.floor(i / BATCH_SIZE) + 1;
            logger.log('WARN', `[Backfill] History batch insert failed for ${dateStr}, batch ${batchNum}: ${batchError.message}`);
            failedBatches.push({ batchNum, startIndex: i, batchSize: batch.length });
            // Continue with next batch
          }
        }
        logger.log('INFO', `[Backfill] History: Inserted ${totalInserted}/${newRows.length} new rows for ${dateStr} (${rows.length - newRows.length} duplicates skipped)`);

        // Return actual inserted count, and throw if partial failure
        if (totalInserted < newRows.length) {
          const missing = newRows.length - totalInserted;
          throw new Error(`Partial failure: Only inserted ${totalInserted}/${newRows.length} rows. ${missing} rows failed. Failed batches: ${failedBatches.map(b => `batch ${b.batchNum}`).join(', ')}`);
        }

        return totalInserted + (rows.length - newRows.length); // Return total (new + existing)
      } else {
        logger.log('INFO', `[Backfill] History: All rows already exist for ${dateStr}, skipping`);
        return rows.length; // All were duplicates, return count found
      }
    } else {
      logger.log('INFO', `[Backfill] History: No data found for ${dateStr}, skipping`);
      return 0;
    }
  } catch (error) {
    logger.log('ERROR', `[Backfill] History error for ${dateStr}:`, {
      message: error.message,
      code: error.code,
      errors: error.errors,
      stack: error.stack
    });
    throw error;
  }
}
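
// --- Editor's sketch (assumption, not part of the published file) ------------
// checkExistingRows is also imported from ../../core/utils/bigquery_utils and
// is not shown in this diff. The callers above expect a Set keyed as
// "user_id|user_type" for the given date, so a minimal version could be:
async function checkExistingRowsSketch(dsId, tableId, dateStr, rows, logger) {
  const { BigQuery } = require('@google-cloud/bigquery');
  const bigquery = new BigQuery();
  const [existing] = await bigquery.query({
    query: `SELECT DISTINCT user_id, user_type
            FROM \`${dsId}.${tableId}\`
            WHERE date = DATE(@date)`,
    params: { date: dateStr }
  });
  logger.log('INFO', `[Backfill] Found ${existing.length} existing rows in ${tableId} for ${dateStr}`);
  return new Set(existing.map(r => `${r.user_id}|${r.user_type}`));
}
// ------------------------------------------------------------------------------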

/**
 * Backfill social post data for a single date
 */
async function backfillSocialData(dateStr, logger) {
  const rows = [];

  try {
    // Collect all document references first
    const allDocRefs = [];

    // NEW STRUCTURE: PopularInvestorSocialPostData/{date}/{cid}/{cid}
    const piSocialCollection = db.collection('PopularInvestorSocialPostData').doc(dateStr);
    const piSubcollections = await piSocialCollection.listCollections();
    for (const subcol of piSubcollections) {
      const cid = subcol.id;
      allDocRefs.push({ ref: subcol.doc(cid), type: 'POPULAR_INVESTOR', cid });
    }

    // NEW STRUCTURE: SignedInUserSocialPostData/{date}/{cid}/{cid}
    const signedInSocialCollection = db.collection('SignedInUserSocialPostData').doc(dateStr);
    const signedInSubcollections = await signedInSocialCollection.listCollections();
    for (const subcol of signedInSubcollections) {
      const cid = subcol.id;
      allDocRefs.push({ ref: subcol.doc(cid), type: 'SIGNED_IN_USER', cid });
    }

    if (allDocRefs.length === 0) {
      logger.log('INFO', `[Backfill] Social: No data found for ${dateStr}, skipping`);
      return 0;
    }

    // Batch read all documents using getAll() (up to 100 per call, much faster)
    for (let i = 0; i < allDocRefs.length; i += FIRESTORE_BATCH_SIZE) {
      const batch = allDocRefs.slice(i, i + FIRESTORE_BATCH_SIZE);
      const docRefs = batch.map(item => item.ref);

      // Single RPC call for up to 100 documents
      const docs = await db.getAll(...docRefs);

      for (let j = 0; j < docs.length; j++) {
        const doc = docs[j];
        if (doc.exists) {
          const data = doc.data();
          const batchItem = batch[j];

          // Convert Firestore Timestamps to ISO strings (reusable function)
          const cleanData = JSON.parse(JSON.stringify(data, timestampReplacer));

          rows.push({
            date: dateStr,
            user_id: Number(batchItem.cid) || 0,
            user_type: batchItem.type,
            posts_data: JSON.stringify(cleanData), // BigQuery JSON type requires a string
            fetched_at: data.fetchedAt?.toDate?.()?.toISOString() || new Date().toISOString()
          });
        }
      }
    }

    if (rows.length > 0) {
      await ensureSocialPostSnapshotsTable(logger);

      // Use BigQuery MERGE for efficient deduplication (SQL-native, most efficient)
      if (process.env.USE_MERGE_FOR_DEDUP === 'true') {
        logger.log('INFO', `[Backfill] Social: Using MERGE for ${dateStr} (${rows.length} rows, SQL-native deduplication)`);
        try {
          const rowsInserted = await insertRowsWithMerge(
            datasetId,
            'social_post_snapshots',
            rows,
            ['date', 'user_id', 'user_type'], // Unique key fields
            logger
          );
          logger.log('INFO', `[Backfill] Social: MERGE completed for ${dateStr} - ${rowsInserted} new rows inserted`);
          return rowsInserted;
        } catch (mergeError) {
          logger.log('WARN', `[Backfill] Social MERGE failed for ${dateStr}, falling back to check-then-insert: ${mergeError.message}`);
          // Fall through to check-then-insert method
        }
      }

      // Fallback: Check for existing rows to prevent duplicates
      let newRows = rows;
      if (process.env.SKIP_DUPLICATE_CHECK !== 'true') {
        const existingKeys = await checkExistingRows(datasetId, 'social_post_snapshots', dateStr, rows, logger);

        // Filter out rows that already exist
        newRows = rows.filter(row => {
          const key = `${row.user_id}|${row.user_type}`;
          return !existingKeys.has(key);
        });

        if (newRows.length < rows.length) {
          logger.log('INFO', `[Backfill] Social: Skipping ${rows.length - newRows.length} duplicate rows for ${dateStr}`);
        }
      } else {
        logger.log('INFO', `[Backfill] Social: Skipping duplicate check for ${dateStr} (SKIP_DUPLICATE_CHECK=true)`);
      }

      if (newRows.length > 0) {
        // Insert in smaller batches to avoid request size limits
        const BATCH_SIZE = 50; // Insert 50 rows at a time
        let totalInserted = 0;
        const failedBatches = [];
        for (let i = 0; i < newRows.length; i += BATCH_SIZE) {
          const batch = newRows.slice(i, i + BATCH_SIZE);
          try {
            await insertRows(datasetId, 'social_post_snapshots', batch, logger);
            totalInserted += batch.length;
          } catch (batchError) {
            const batchNum = Math.floor(i / BATCH_SIZE) + 1;
            logger.log('WARN', `[Backfill] Social batch insert failed for ${dateStr}, batch ${batchNum}: ${batchError.message}`);
            failedBatches.push({ batchNum, startIndex: i, batchSize: batch.length });
            // Continue with next batch
          }
        }
        logger.log('INFO', `[Backfill] Social: Inserted ${totalInserted}/${newRows.length} new rows for ${dateStr} (${rows.length - newRows.length} duplicates skipped)`);

        // Return actual inserted count, and throw if partial failure
        if (totalInserted < newRows.length) {
          const missing = newRows.length - totalInserted;
          throw new Error(`Partial failure: Only inserted ${totalInserted}/${newRows.length} rows. ${missing} rows failed. Failed batches: ${failedBatches.map(b => `batch ${b.batchNum}`).join(', ')}`);
        }

        return totalInserted + (rows.length - newRows.length); // Return total (new + existing)
      } else {
        logger.log('INFO', `[Backfill] Social: All rows already exist for ${dateStr}, skipping`);
        return rows.length; // All were duplicates, return count found
      }
    } else {
      logger.log('INFO', `[Backfill] Social: No data found for ${dateStr}, skipping`);
      return 0;
    }
  } catch (error) {
    logger.log('ERROR', `[Backfill] Social error for ${dateStr}:`, {
      message: error.message,
      code: error.code,
      errors: error.errors,
      stack: error.stack
    });
    throw error;
  }
}

/**
 * Main backfill function
 * Works both as Cloud Function (req/res) and local script (no req/res)
 */
async function backfillTaskEngineData(req, res) {
  const logger = {
    log: (level, message, ...args) => {
      const timestamp = new Date().toISOString();
      console.log(`[${timestamp}] [${level}] ${message}`, ...args);
    }
  };

  // Determine if running as Cloud Function or local script
  const isCloudFunction = req && res;

  // Extract parameters from either HTTP request or command line
  let startDate, endDate, dataType;

  if (isCloudFunction) {
    // Cloud Function: get from query params
    startDate = req.query.startDate || '2024-01-01';
    endDate = req.query.endDate || new Date().toISOString().split('T')[0];
    dataType = req.query.dataType || 'all';
  } else {
    // Local script: get from command line args
    const args = process.argv.slice(2);
    const parseArg = (key, defaultValue) => {
      const arg = args.find(a => a.startsWith(`--${key}=`));
      return arg ? arg.split('=')[1] : defaultValue;
    };

    startDate = parseArg('startDate', '2024-01-01');
    endDate = parseArg('endDate', new Date().toISOString().split('T')[0]);
    dataType = parseArg('dataType', 'all');
  }

  try {
    logger.log('INFO', `[Backfill] Starting backfill: ${dataType} from ${startDate} to ${endDate}`);
    logger.log('INFO', `[Backfill] Running in ${isCloudFunction ? 'Cloud Function' : 'local'} mode`);

    const dates = getDateRange(startDate, endDate);
    logger.log('INFO', `[Backfill] Processing ${dates.length} dates`);

    const summary = {
      totalDates: dates.length,
      processedDates: 0,
      skippedDates: 0,
      portfolioRows: 0,
      historyRows: 0,
      socialRows: 0,
      errors: []
    };

    // Process dates in parallel with concurrency limit
    // Limit to 5 concurrent dates to avoid overwhelming BigQuery/Firestore
    const dateLimit = pLimit(5);

    // Helper function to retry operations with exponential backoff
    async function retryWithBackoff(fn, maxRetries = 3, baseDelay = 1000) {
      for (let attempt = 1; attempt <= maxRetries; attempt++) {
        try {
          return await fn();
        } catch (error) {
          // Check if error is retryable (timeout, rate limit, etc.)
          const isRetryable = error.code === 4 ||  // DEADLINE_EXCEEDED (timeout)
                              error.code === 8 ||  // RESOURCE_EXHAUSTED (rate limit)
                              error.code === 14 || // UNAVAILABLE (service unavailable)
                              error.message?.includes('timeout') ||
                              error.message?.includes('rate limit') ||
                              error.message?.includes('exceeded');

          if (!isRetryable || attempt === maxRetries) {
            throw error; // Not retryable or out of retries
          }

          const delay = baseDelay * Math.pow(2, attempt - 1); // Exponential backoff
          logger.log('WARN', `[Backfill] Retryable error (attempt ${attempt}/${maxRetries}), retrying in ${delay}ms: ${error.message}`);
          await new Promise(resolve => setTimeout(resolve, delay));
        }
      }
    }

    // Process all dates in parallel (with limit)
    const datePromises = dates.map(dateStr =>
      dateLimit(async () => {
        const dateResults = { date: dateStr, portfolio: null, history: null, social: null };

        // Process each data type independently - if one fails, others can still succeed (self-healing)
        // Process data types in parallel for better performance
        const dataTypeTasks = [];

        logger.log('INFO', `[Backfill] Processing date ${dateStr} with dataType=${dataType}`);

        // Portfolio
        if (dataType === 'all' || dataType === 'portfolio') {
          logger.log('INFO', `[Backfill] Adding portfolio task for ${dateStr}`);
          dataTypeTasks.push((async () => {
            try {
              logger.log('INFO', `[Backfill] Starting portfolio backfill for ${dateStr}`);
              if (!(await isDateBackfilled('portfolio', dateStr))) {
                const count = await retryWithBackoff(
                  () => backfillPortfolioData(dateStr, logger),
                  3,    // 3 retries
                  2000  // 2s base delay
                );
                summary.portfolioRows += count;
                // Only mark as backfilled if we actually inserted data (count > 0)
                if (count > 0) {
                  await markDateBackfilled('portfolio', dateStr, count);
                }
                dateResults.portfolio = { success: true, count };
                logger.log('INFO', `[Backfill] ✅ Portfolio completed for ${dateStr}: ${count} rows`);
              } else {
                logger.log('INFO', `[Backfill] Portfolio for ${dateStr} already backfilled, skipping`);
                summary.skippedDates++;
                dateResults.portfolio = { success: true, skipped: true };
              }
            } catch (portfolioError) {
              const errorDetails = {
                message: portfolioError.message,
                code: portfolioError.code,
                errors: portfolioError.errors
              };
              logger.log('ERROR', `[Backfill] Portfolio failed for ${dateStr} (after retries):`, errorDetails);
              summary.errors.push({ date: dateStr, dataType: 'portfolio', error: portfolioError.message });
              dateResults.portfolio = { success: false, error: portfolioError.message };
            }
          })());
        } else {
          logger.log('INFO', `[Backfill] Skipping portfolio for ${dateStr} (dataType=${dataType})`);
        }

        // History
        if (dataType === 'all' || dataType === 'history') {
          logger.log('INFO', `[Backfill] Adding history task for ${dateStr}`);
          dataTypeTasks.push((async () => {
            try {
              logger.log('INFO', `[Backfill] Starting history backfill for ${dateStr}`);
              if (!(await isDateBackfilled('history', dateStr))) {
                const count = await retryWithBackoff(
                  () => backfillHistoryData(dateStr, logger),
                  3,    // 3 retries
                  2000  // 2s base delay
                );
                summary.historyRows += count;
                if (count > 0) {
                  await markDateBackfilled('history', dateStr, count);
                }
                dateResults.history = { success: true, count };
                logger.log('INFO', `[Backfill] ✅ History completed for ${dateStr}: ${count} rows`);
              } else {
                logger.log('INFO', `[Backfill] History for ${dateStr} already backfilled, skipping`);
                dateResults.history = { success: true, skipped: true };
              }
            } catch (historyError) {
              const errorDetails = {
                message: historyError.message,
                code: historyError.code,
                errors: historyError.errors
              };
              logger.log('ERROR', `[Backfill] History failed for ${dateStr} (after retries):`, errorDetails);
              summary.errors.push({ date: dateStr, dataType: 'history', error: historyError.message });
              dateResults.history = { success: false, error: historyError.message };
            }
          })());
        } else {
          logger.log('INFO', `[Backfill] Skipping history for ${dateStr} (dataType=${dataType})`);
        }

        // Social
        if (dataType === 'all' || dataType === 'social') {
          logger.log('INFO', `[Backfill] Adding social task for ${dateStr}`);
          dataTypeTasks.push((async () => {
            try {
              logger.log('INFO', `[Backfill] Starting social backfill for ${dateStr}`);
              if (!(await isDateBackfilled('social', dateStr))) {
                const count = await retryWithBackoff(
                  () => backfillSocialData(dateStr, logger),
                  3,    // 3 retries
                  2000  // 2s base delay
                );
                summary.socialRows += count;
                if (count > 0) {
                  await markDateBackfilled('social', dateStr, count);
                }
                dateResults.social = { success: true, count };
                logger.log('INFO', `[Backfill] ✅ Social completed for ${dateStr}: ${count} rows`);
              } else {
                logger.log('INFO', `[Backfill] Social for ${dateStr} already backfilled, skipping`);
                dateResults.social = { success: true, skipped: true };
              }
            } catch (socialError) {
              const errorDetails = {
                message: socialError.message,
                code: socialError.code,
                errors: socialError.errors
              };
              logger.log('ERROR', `[Backfill] Social failed for ${dateStr} (after retries):`, errorDetails);
              summary.errors.push({ date: dateStr, dataType: 'social', error: socialError.message });
              dateResults.social = { success: false, error: socialError.message };
            }
          })());
        } else {
          logger.log('INFO', `[Backfill] Skipping social for ${dateStr} (dataType=${dataType})`);
        }

        logger.log('INFO', `[Backfill] Executing ${dataTypeTasks.length} data type tasks in parallel for ${dateStr}`);

        // Wait for all data types to complete in parallel
        await Promise.all(dataTypeTasks);

        logger.log('INFO', `[Backfill] All data type tasks completed for ${dateStr}`);

        // Date is considered processed even if some data types failed
        // Failed data types will be retried on next run (self-healing)
        summary.processedDates++;
        const allSuccess = Object.values(dateResults).every(r => r === null || r.success);
        return { date: dateStr, success: allSuccess, results: dateResults };
      })
    );

    // Wait for all dates to complete, logging progress periodically
    logger.log('INFO', `[Backfill] Processing ${dates.length} dates in parallel (max 5 concurrent)`);
    const results = await Promise.all(datePromises);

    const successful = results.filter(r => r.success).length;
    const failed = results.filter(r => !r.success).length;
    logger.log('INFO', `[Backfill] Completed: ${successful} successful, ${failed} failed`);

    // Optional: Remove duplicates at the end (cheaper than checking each date)
    // Only run if DEDUPLICATE_AT_END=true and we processed all data types
    if (process.env.DEDUPLICATE_AT_END === 'true' && (dataType === 'all' || dataType === 'portfolio' || dataType === 'history' || dataType === 'social')) {
      logger.log('INFO', `[Backfill] Running final deduplication pass...`);

      const dedupTasks = [];
      if (dataType === 'all' || dataType === 'portfolio') {
        dedupTasks.push(
          removeDuplicates(datasetId, 'portfolio_snapshots', 'date', ['user_id', 'user_type'], logger)
            .then(count => logger.log('INFO', `[Backfill] Deduplication: Removed ${count} duplicate portfolio rows`))
            .catch(err => logger.log('WARN', `[Backfill] Portfolio deduplication failed: ${err.message}`))
        );
      }
      if (dataType === 'all' || dataType === 'history') {
        dedupTasks.push(
          removeDuplicates(datasetId, 'trade_history_snapshots', 'date', ['user_id', 'user_type'], logger)
            .then(count => logger.log('INFO', `[Backfill] Deduplication: Removed ${count} duplicate history rows`))
            .catch(err => logger.log('WARN', `[Backfill] History deduplication failed: ${err.message}`))
        );
      }
      if (dataType === 'all' || dataType === 'social') {
        dedupTasks.push(
          removeDuplicates(datasetId, 'social_post_snapshots', 'date', ['user_id', 'user_type'], logger)
            .then(count => logger.log('INFO', `[Backfill] Deduplication: Removed ${count} duplicate social rows`))
            .catch(err => logger.log('WARN', `[Backfill] Social deduplication failed: ${err.message}`))
        );
      }

      await Promise.all(dedupTasks);
      logger.log('INFO', `[Backfill] Deduplication complete`);
    }

    logger.log('INFO', `[Backfill] Complete! Summary:`, JSON.stringify(summary, null, 2));

    if (isCloudFunction) {
      res.status(200).json({
        success: true,
        message: 'Backfill completed',
        summary: summary
      });
    } else {
      // Local script: just log and exit
      console.log('\n✅ Backfill completed successfully!');
      process.exit(0);
    }

  } catch (error) {
    logger.log('ERROR', `[Backfill] Fatal error: ${error.message}`, error);

    if (isCloudFunction) {
      res.status(500).json({
        success: false,
        error: error.message
      });
    } else {
      console.error('\n❌ Backfill failed:', error.message);
      process.exit(1);
    }
  }
}
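
// Editor's note: four environment variables, all read in this file, steer the
// behaviour above; the boolean flags are off unless set to the literal string 'true':
//   BIGQUERY_DATASET_ID  - target dataset (defaults to 'bulltrackers_data')
//   USE_MERGE_FOR_DEDUP  - route inserts through insertRowsWithMerge
//   SKIP_DUPLICATE_CHECK - skip checkExistingRows (for fresh backfills)
//   DEDUPLICATE_AT_END   - run removeDuplicates once after all dates
// For example, a fresh local run that dedupes only at the end might be invoked as:
//   SKIP_DUPLICATE_CHECK=true DEDUPLICATE_AT_END=true \
//     node index.js --startDate=2024-01-01 --endDate=2024-03-31 --dataType=all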

// If running as a script directly (not imported as module), execute immediately
if (require.main === module) {
  // Running as local script
  console.log('🚀 Starting local backfill script...\n');
  console.log('Usage: node index.js --startDate=YYYY-MM-DD --endDate=YYYY-MM-DD --dataType=all|portfolio|history|social\n');

  backfillTaskEngineData(null, null).catch(error => {
    console.error('Fatal error:', error);
    process.exit(1);
  });
}

module.exports = { backfillTaskEngineData };
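
// --- Editor's sketch (assumption, not part of the published file) ------------
// The hunk above covers only index.js; how the exported handler is registered
// as an HTTP Cloud Function is not part of this diff. In a separate entry file
// using the Functions Framework, the wiring could be as small as:
const functions = require('@google-cloud/functions-framework');
const { backfillTaskEngineData } = require('./index');

functions.http('backfillTaskEngineData', (req, res) => backfillTaskEngineData(req, res));
// ------------------------------------------------------------------------------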