bulltrackers-module 1.0.721 → 1.0.722

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,697 +1,282 @@
1
1
  /**
2
- * @fileoverview Handles saving computation results with observability, Smart Cleanup, and GCS Support.
3
- * UPDATED: Added GCS Offloading logic (Hybrid Pointer System).
4
- * UPDATED: Preserved Legacy Sharding/Compression for backward compatibility.
5
- * UPDATED: Auto-cleanup of old Firestore shards when migrating a doc to GCS.
6
- * FIXED: Disabled "Single Doc Compression" strategy during intermediate flushes to ensure consistent sharding.
2
+ * @fileoverview Handles saving computation results.
3
+ * REFACTORED:
4
+ * 1. Writes ALL data to BigQuery (Source of Truth).
5
+ * 2. Writes to Firestore ONLY for 'Page' (Fan-out) and 'Alert' computations.
6
+ * 3. Removes GCS/Compression complexity for standard data (now BQ-only).
7
7
  */
8
8
  const { commitBatchInChunks, generateDataHash, FieldValue } = require('../utils/utils');
9
- const { updateComputationStatus } = require('./StatusRepository');
10
- const { batchStoreSchemas } = require('../utils/schema_capture');
9
+ const { updateComputationStatus } = require('./StatusRepository');
10
+ const { batchStoreSchemas } = require('../utils/schema_capture');
11
11
  const { generateProcessId, PROCESS_TYPES } = require('../logger/logger');
12
- const { HeuristicValidator } = require('./ResultsValidator');
13
- const { PubSubUtils } = require('../../core/utils/pubsub_utils');
14
- const ContractValidator = require('./ContractValidator');
15
- const validationOverrides = require('../config/validation_overrides');
16
- const pLimit = require('p-limit');
17
- const zlib = require('zlib');
18
- const { Storage } = require('@google-cloud/storage');
19
- const { ensureComputationResultsTable, insertRows } = require('../../core/utils/bigquery_utils');
20
-
21
- const storage = new Storage(); // Singleton GCS Client
22
- const NON_RETRYABLE_ERRORS = [ 'PERMISSION_DENIED', 'DATA_LOSS', 'FAILED_PRECONDITION' ];
12
+ const { HeuristicValidator } = require('./ResultsValidator');
13
+ const ContractValidator = require('./ContractValidator');
14
+ const validationOverrides = require('../config/validation_overrides');
15
+ const pLimit = require('p-limit');
16
+
17
+ const DEFAULT_TTL_DAYS = 90;
23
18
  const SIMHASH_REGISTRY_COLLECTION = 'system_simhash_registry';
24
- const CONTRACTS_COLLECTION = 'system_contracts';
25
- const DEFAULT_TTL_DAYS = 90;
19
+ const CONTRACTS_COLLECTION = 'system_contracts';
26
20
 
27
21
  async function commitResults(stateObj, dStr, passName, config, deps, skipStatusWrite = false, options = {}) {
28
22
  const successUpdates = {};
29
- const failureReport = [];
30
- const schemas = [];
31
- const cleanupTasks = [];
32
- const alertTriggers = [];
23
+ const failureReport = [];
24
+ const schemas = [];
25
+ const alertTriggers = [];
33
26
  const { logger, db, calculationUtils } = deps;
34
- const withRetry = calculationUtils?.withRetry || (fn => fn());
35
27
 
36
28
  const pid = generateProcessId(PROCESS_TYPES.STORAGE, passName, dStr);
37
29
  const flushMode = options.flushMode || 'STANDARD';
38
- const isInitialWrite = options.isInitialWrite === true;
39
- const shardIndexes = options.shardIndexes || {};
40
- const nextShardIndexes = {};
41
- const fanOutLimit = pLimit(10);
30
+ const isInitialWrite = options.isInitialWrite === true;
42
31
 
32
+ // Pre-fetch contracts and hashes
43
33
  const calcNames = Object.keys(stateObj);
44
- const hashKeys = calcNames.map(n => stateObj[n].manifest?.hash).filter(Boolean);
45
-
34
+ const hashKeys = calcNames.map(n => stateObj[n].manifest?.hash).filter(Boolean);
46
35
  const [contractMap, simHashMap] = await Promise.all([
47
36
  fetchContracts(db, calcNames),
48
37
  batchFetchSimHashes(db, hashKeys)
49
38
  ]);
50
39
 
51
40
  for (const name in stateObj) {
52
- const calc = stateObj[name];
41
+ const calc = stateObj[name];
53
42
  const execStats = calc._executionStats || { processedUsers: 0, skippedUsers: 0 };
54
- const currentShardIndex = shardIndexes[name] || 0;
55
43
 
56
44
  const runMetrics = {
57
- storage: { sizeBytes: 0, isSharded: false, shardCount: 1, keys: 0, location: 'FIRESTORE' },
45
+ storage: { sizeBytes: 0, location: 'BIGQUERY', keys: 0 },
58
46
  validation: { isValid: true, anomalies: [] },
59
47
  execution: execStats,
60
48
  io: { writes: 0, deletes: 0 }
61
49
  };
62
50
 
63
- const isAlertComputation = calc.manifest.isAlertComputation === true;
64
- const isPageComputation = calc.manifest.isPage === true;
65
- const ttlDays = calc.manifest.ttlDays !== undefined ? calc.manifest.ttlDays : DEFAULT_TTL_DAYS;
51
+ const manifest = calc.manifest;
52
+ const isAlert = manifest.isAlertComputation === true;
53
+ const isPage = manifest.isPage === true;
54
+ const ttlDays = manifest.ttlDays !== undefined ? manifest.ttlDays : DEFAULT_TTL_DAYS;
66
55
 
67
56
  try {
68
57
  const result = await calc.getResult();
69
- const configOverrides = validationOverrides[calc.manifest.name] || {};
70
58
 
71
- const dataDeps = calc.manifest.rootDataDependencies || [];
72
- const isPriceOnly = (dataDeps.length === 1 && dataDeps[0] === 'price');
73
- let effectiveOverrides = { ...configOverrides };
74
-
75
- if (isPriceOnly) {
76
- effectiveOverrides.maxZeroPct = 100;
77
- effectiveOverrides.maxFlatlinePct = 100;
78
- effectiveOverrides.maxNullPct = 100;
79
- effectiveOverrides.maxNanPct = 100;
80
- delete effectiveOverrides.weekend;
59
+ // --- 1. VALIDATION ---
60
+ const configOverrides = validationOverrides[manifest.name] || {};
61
+ const dataDeps = manifest.rootDataDependencies || [];
62
+ // Relax validation for price-only computations
63
+ if (dataDeps.length === 1 && dataDeps[0] === 'price') {
64
+ Object.assign(configOverrides, { maxZeroPct: 100, maxFlatlinePct: 100, maxNullPct: 100, maxNanPct: 100 });
65
+ delete configOverrides.weekend;
81
66
  }
82
67
 
68
+ // Contract Validation
83
69
  const contract = contractMap[name];
84
70
  if (contract) {
85
71
  const contractCheck = ContractValidator.validate(result, contract);
86
- if (!contractCheck.valid) {
87
- runMetrics.validation.isValid = false;
88
- runMetrics.validation.anomalies.push(contractCheck.reason);
89
- const semanticError = new Error(contractCheck.reason);
90
- semanticError.stage = 'SEMANTIC_GATE';
91
- throw semanticError;
92
- }
72
+ if (!contractCheck.valid) throw new Error(`[SEMANTIC_GATE] ${contractCheck.reason}`);
93
73
  }
94
74
 
75
+ // Heuristic Validation (Circuit Breaker)
95
76
  if (result && Object.keys(result).length > 0) {
96
- const healthCheck = HeuristicValidator.analyze(calc.manifest.name, result, dStr, effectiveOverrides);
77
+ const healthCheck = HeuristicValidator.analyze(manifest.name, result, dStr, configOverrides);
97
78
  if (!healthCheck.valid) {
98
79
  runMetrics.validation.isValid = false;
99
80
  runMetrics.validation.anomalies.push(healthCheck.reason);
100
- const validationError = new Error(healthCheck.reason);
101
- validationError.stage = 'QUALITY_CIRCUIT_BREAKER';
102
- throw validationError;
81
+ throw new Error(`[QUALITY_CIRCUIT_BREAKER] ${healthCheck.reason}`);
103
82
  }
104
83
  }
105
84
 
106
85
  const isEmpty = !result || (typeof result === 'object' && Object.keys(result).length === 0);
107
86
  const resultHash = isEmpty ? 'empty' : generateDataHash(result);
108
- const simHash = (flushMode !== 'INTERMEDIATE') ? (simHashMap[calc.manifest.hash] || null) : null;
109
-
110
- if (isEmpty) {
111
- if (flushMode === 'INTERMEDIATE') {
112
- nextShardIndexes[name] = currentShardIndex;
113
- continue;
114
- }
115
-
116
- if (isAlertComputation && flushMode === 'FINAL') {
117
- const docPath = `${config.resultsCollection}/${dStr}/${config.resultsSubcollection}/${calc.manifest.category}/${config.computationsSubcollection}/${name}`;
118
- alertTriggers.push({ date: dStr, computationName: name, documentPath: docPath });
87
+ const simHash = (flushMode !== 'INTERMEDIATE') ? (simHashMap[manifest.hash] || null) : null;
88
+
89
+ // --- 2. HANDLE EMPTY RESULTS ---
90
+ if (isEmpty) {
91
+ if (flushMode === 'INTERMEDIATE') continue;
92
+ if (manifest.hash) {
93
+ successUpdates[name] = {
94
+ hash: manifest.hash, simHash, resultHash,
95
+ dependencyResultHashes: manifest.dependencyResultHashes || {},
96
+ category: manifest.category, composition: manifest.composition, metrics: runMetrics
97
+ };
119
98
  }
99
+ continue;
100
+ }
120
101
 
121
- if (calc.manifest.hash) {
122
- successUpdates[name] = {
123
- hash: calc.manifest.hash, simHash: simHash, resultHash: resultHash,
124
- dependencyResultHashes: calc.manifest.dependencyResultHashes || {},
125
- category: calc.manifest.category, composition: calc.manifest.composition,
126
- metrics: runMetrics
127
- };
128
- }
129
- continue;
102
+ // --- 3. WRITE TO BIGQUERY (UNIVERSAL) ---
103
+ // ALL data goes to BigQuery first. This is the primary storage.
104
+ // This could be a fire-and-forget call, but we await it here
105
+ // so the data is safely persisted before reporting success.
106
+ await writeToBigQuery(result, name, dStr, manifest.category, logger, isAlert).catch(err => {
107
+ logger.log('WARN', `[BigQuery] Write warning for ${name}: ${err.message}`);
108
+ });
109
+
110
+ // If it's NOT Page or Alert, we are done (No Firestore write)
111
+ if (!isPage && !isAlert) {
112
+ if (manifest.hash) {
113
+ successUpdates[name] = {
114
+ hash: manifest.hash, simHash, resultHash,
115
+ dependencyResultHashes: manifest.dependencyResultHashes || {},
116
+ category: manifest.category, composition: manifest.composition, metrics: runMetrics
117
+ };
118
+ }
119
+ continue; // Skip Firestore logic
130
120
  }
131
121
 
132
- // [NEW] Page Computation Logic (Fan-Out) with TTL
133
- if (isPageComputation && !isEmpty) {
134
- const expireAt = calculateExpirationDate(dStr, ttlDays);
122
+ // --- 4. FIRESTORE WRITES (SELECTIVE) ---
123
+ const expireAt = calculateExpirationDate(dStr, ttlDays);
124
+
125
+ // A. PAGE COMPUTATIONS (Fan-Out)
126
+ if (isPage) {
135
127
  const mainDocRef = db.collection(config.resultsCollection).doc(dStr)
136
- .collection(config.resultsSubcollection).doc(calc.manifest.category)
128
+ .collection(config.resultsSubcollection).doc(manifest.category)
137
129
  .collection(config.computationsSubcollection).doc(name);
138
130
 
139
- // Optimization: Only attempt cleanup on the initial write to save reads
140
- if (isInitialWrite) {
141
- await cleanupOldShards(mainDocRef, name, config, deps, runMetrics);
142
- }
143
-
144
- // 1. Fan-out writes for each user
131
+ // Fan-out writes: One document per User ID
145
132
  const pageWrites = [];
146
133
  for (const [cid, userData] of Object.entries(result)) {
134
+ // Unique document for each user ID
147
135
  const userDocRef = mainDocRef.collection('pages').doc(cid);
148
136
 
149
137
  const payload = (typeof userData === 'object' && userData !== null)
150
138
  ? { ...userData, _expireAt: expireAt }
151
139
  : { value: userData, _expireAt: expireAt };
152
140
 
153
- pageWrites.push({
154
- ref: userDocRef,
155
- data: payload,
156
- options: { merge: false } // Overwrite specifically for this run
157
- });
141
+ pageWrites.push({ ref: userDocRef, data: payload, options: { merge: false } });
158
142
  }
159
143
 
160
- // 2. Commit the fan-out writes
161
144
  if (pageWrites.length > 0) {
162
145
  await commitBatchInChunks(config, deps, pageWrites, `${name}::PageFanOut`);
163
146
  runMetrics.io.writes += pageWrites.length;
164
- runMetrics.storage.keys = pageWrites.length;
165
- logger.log('INFO', `[PageMode] ${name}: Wrote ${pageWrites.length} user pages. TTL: ${ttlDays}d.`);
166
- }
167
-
168
- // 3. Write or Update the "Header" document
169
- const isFinalFlush = (flushMode !== 'INTERMEDIATE');
170
-
171
- let pageCountValue = pageWrites.length;
172
- if (!isInitialWrite) {
173
- pageCountValue = FieldValue.increment(pageWrites.length);
147
+ runMetrics.storage.location = 'FIRESTORE_PAGES';
174
148
  }
175
149
 
150
+ // Write Header Document (Metadata for frontend/indexing)
176
151
  const headerData = {
177
- _isPageMode: true,
178
- _pageCount: pageCountValue,
152
+ _isPageMode: true,
153
+ _pageCount: isInitialWrite ? pageWrites.length : FieldValue.increment(pageWrites.length),
179
154
  _lastUpdated: new Date().toISOString(),
180
155
  _expireAt: expireAt,
181
- _completed: isFinalFlush ? true : false // Always a boolean: true when final flush, false otherwise
156
+ _completed: flushMode !== 'INTERMEDIATE'
182
157
  };
183
-
184
158
  await mainDocRef.set(headerData, { merge: !isInitialWrite });
185
159
 
186
- runMetrics.io.writes += 1;
187
-
188
- // 4. Write to BigQuery (for analytics) - same structure as other computations
189
- // Page computations store the full result object { cid1: {...}, cid2: {...}, ... } in result_data
190
- await writeToBigQuery(result, name, dStr, calc.manifest.category, logger, false).catch(err => {
191
- logger.log('WARN', `[BigQuery] Failed to write page computation ${name} for ${dStr}: ${err.message}`);
192
- });
193
-
194
- if (isFinalFlush && calc.manifest.hash) {
195
- successUpdates[name] = {
196
- hash: calc.manifest.hash, simHash: simHash, resultHash: resultHash,
197
- category: calc.manifest.category, composition: calc.manifest.composition,
198
- metrics: runMetrics
199
- };
200
- }
201
-
202
- continue;
160
+ logger.log('INFO', `[ResultCommitter] ${name}: Wrote ${pageWrites.length} user pages to Firestore.`);
203
161
  }
204
-
205
- // Standard Computation Logic (GCS, Compression or Sharding) with TTL
206
- if (typeof result === 'object') runMetrics.storage.keys = Object.keys(result).length;
207
- const resultKeys = Object.keys(result || {});
208
- const isMultiDate = resultKeys.length > 0 && resultKeys.every(k => /^\d{4}-\d{2}-\d{2}$/.test(k));
209
-
210
- if (isMultiDate) {
211
- const datePromises = resultKeys.map((historicalDate) => fanOutLimit(async () => {
212
- const dailyData = result[historicalDate];
213
- if (!dailyData || Object.keys(dailyData).length === 0) return;
214
-
215
- const dailyExpireAt = calculateExpirationDate(historicalDate, ttlDays);
216
-
217
- const historicalDocRef = db.collection(config.resultsCollection).doc(historicalDate).collection(config.resultsSubcollection).doc(calc.manifest.category).collection(config.computationsSubcollection).doc(name);
218
-
219
- // Recursive call allows GCS logic to apply per-day
220
- const stats = await writeSingleResult(dailyData, historicalDocRef, name, historicalDate, calc.manifest.category, logger, config, deps, 0, 'STANDARD', false, dailyExpireAt, isAlertComputation, isPageComputation);
221
- runMetrics.io.writes += stats.opCounts.writes;
222
- runMetrics.io.deletes += stats.opCounts.deletes;
223
-
224
- if (isAlertComputation && flushMode !== 'INTERMEDIATE') {
225
- alertTriggers.push({ date: historicalDate, computationName: name, documentPath: historicalDocRef.path });
226
- }
227
- }));
228
- await Promise.all(datePromises);
229
-
230
- if (calc.manifest.hash) { successUpdates[name] = { hash: calc.manifest.hash, simHash, resultHash, dependencyResultHashes: calc.manifest.dependencyResultHashes || {}, category: calc.manifest.category, composition: calc.manifest.composition, metrics: runMetrics }; }
231
- } else {
232
- const runExpireAt = calculateExpirationDate(dStr, ttlDays);
233
162
 
234
- const mainDocRef = db.collection(config.resultsCollection).doc(dStr).collection(config.resultsSubcollection).doc(calc.manifest.category).collection(config.computationsSubcollection).doc(name);
235
- const writeStats = await writeSingleResult(result, mainDocRef, name, dStr, calc.manifest.category, logger, config, deps, currentShardIndex, flushMode, isInitialWrite, runExpireAt, isAlertComputation, isPageComputation);
236
-
237
- runMetrics.storage.sizeBytes = writeStats.totalSize;
238
- runMetrics.storage.isSharded = writeStats.isSharded;
239
- runMetrics.storage.shardCount = writeStats.shardCount;
240
- runMetrics.storage.location = writeStats.location;
241
- runMetrics.io.writes += writeStats.opCounts.writes;
242
- runMetrics.io.deletes += writeStats.opCounts.deletes;
243
-
244
- nextShardIndexes[name] = writeStats.nextShardIndex;
245
- if (calc.manifest.hash) { successUpdates[name] = { hash: calc.manifest.hash, simHash, resultHash, dependencyResultHashes: calc.manifest.dependencyResultHashes || {}, category: calc.manifest.category, composition: calc.manifest.composition, metrics: runMetrics }; }
163
+ // B. ALERT COMPUTATIONS (Single Doc for Triggers)
164
+ if (isAlert) {
165
+ // Alerts are written to a single document to trigger the listener
166
+ const mainDocRef = db.collection(config.resultsCollection).doc(dStr)
167
+ .collection(config.resultsSubcollection).doc(manifest.category)
168
+ .collection(config.computationsSubcollection).doc(name);
169
+
170
+ const alertPayload = {
171
+ ...result,
172
+ _isAlert: true,
173
+ _lastUpdated: new Date().toISOString(),
174
+ _expireAt: expireAt
175
+ };
246
176
 
247
- if (isAlertComputation && flushMode !== 'INTERMEDIATE') {
177
+ await mainDocRef.set(alertPayload);
178
+ runMetrics.io.writes += 1;
179
+ runMetrics.storage.location = 'FIRESTORE_ALERT';
180
+
181
+ // Add to triggers list for logging
182
+ if (flushMode !== 'INTERMEDIATE') {
248
183
  alertTriggers.push({ date: dStr, computationName: name, documentPath: mainDocRef.path });
249
184
  }
250
185
  }
251
186
 
252
- if (calc.manifest.class.getSchema && flushMode !== 'INTERMEDIATE') {
253
- const { class: _cls, ...safeMetadata } = calc.manifest;
254
- // Ensure ttlDays is set to the resolved value (defaults to 90 if undefined)
255
- safeMetadata.ttlDays = ttlDays;
256
- schemas.push({ name, category: calc.manifest.category, schema: calc.manifest.class.getSchema(), metadata: safeMetadata });
187
+ // --- 5. FINALIZE ---
188
+ if (manifest.hash) {
189
+ successUpdates[name] = {
190
+ hash: manifest.hash, simHash, resultHash,
191
+ dependencyResultHashes: manifest.dependencyResultHashes || {},
192
+ category: manifest.category, composition: manifest.composition, metrics: runMetrics
193
+ };
257
194
  }
258
- if (calc.manifest.previousCategory && calc.manifest.previousCategory !== calc.manifest.category && flushMode !== 'INTERMEDIATE') {
259
- cleanupTasks.push(deleteOldCalculationData(dStr, calc.manifest.previousCategory, name, config, deps));
195
+
196
+ // Store Schema
197
+ if (manifest.class.getSchema && flushMode !== 'INTERMEDIATE') {
198
+ const { class: _cls, ...safeMetadata } = manifest;
199
+ safeMetadata.ttlDays = ttlDays;
200
+ schemas.push({ name, category: manifest.category, schema: manifest.class.getSchema(), metadata: safeMetadata });
260
201
  }
261
202
 
262
203
  } catch (e) {
263
- const stage = e.stage || 'EXECUTION';
264
- if (logger && logger.log) { logger.log('ERROR', `Commit failed for ${name} [${stage}]`, { processId: pid, error: e }); }
265
- failureReport.push({ name, error: { message: e.message, stack: e.stack, stage }, metrics: runMetrics });
204
+ logger.log('ERROR', `Commit failed for ${name}`, { error: e });
205
+ failureReport.push({ name, error: { message: e.message, stack: e.stack }, metrics: runMetrics });
266
206
  }
267
207
  }
268
208
 
269
209
  if (schemas.length) batchStoreSchemas(deps, config, schemas).catch(() => {});
270
- if (cleanupTasks.length > 0) { await Promise.allSettled(cleanupTasks); }
271
- if (!skipStatusWrite && Object.keys(successUpdates).length > 0 && flushMode !== 'INTERMEDIATE') {
272
- await updateComputationStatus(dStr, successUpdates, config, deps);
210
+ if (!skipStatusWrite && Object.keys(successUpdates).length > 0 && flushMode !== 'INTERMEDIATE') {
211
+ await updateComputationStatus(dStr, successUpdates, config, deps);
273
212
  }
274
-
275
- if (alertTriggers.length > 0) {
276
- logger.log('INFO', `[Alert System] ${alertTriggers.length} alert computations written to Firestore - triggers will fire automatically`);
277
- }
278
-
279
- return { successUpdates, failureReport, shardIndexes: nextShardIndexes };
280
- }
281
-
282
- async function batchFetchSimHashes(db, hashes) {
283
- if (!hashes || hashes.length === 0) return {};
284
- const map = {};
285
- const refs = hashes.map(h => db.collection(SIMHASH_REGISTRY_COLLECTION).doc(h));
286
- try {
287
- const snaps = await db.getAll(...refs);
288
- snaps.forEach(snap => { if (snap.exists) map[snap.id] = snap.data().simHash; });
289
- } catch (e) {}
290
- return map;
291
- }
292
-
293
- async function fetchContracts(db, calcNames) {
294
- if (!calcNames || calcNames.length === 0) return {};
295
- const map = {};
296
- const refs = calcNames.map(name => db.collection(CONTRACTS_COLLECTION).doc(name));
297
- try {
298
- const snaps = await db.getAll(...refs);
299
- snaps.forEach(snap => { if (snap.exists) map[snap.id] = snap.data(); });
300
- } catch (e) {}
301
- return map;
302
- }
303
-
304
- async function writeSingleResult(result, docRef, name, dateContext, category, logger, config, deps, startShardIndex = 0, flushMode = 'STANDARD', isInitialWrite = false, expireAt = null, isAlertComputation = false) {
305
- const opCounts = { writes: 0, deletes: 0 };
306
213
 
307
- // Check if previously sharded (so we can clean up if moving to GCS or Compressed)
308
- let wasSharded = false;
309
- try {
310
- const currentSnap = await docRef.get();
311
- if (currentSnap.exists) {
312
- wasSharded = (currentSnap.data()._sharded === true);
313
- }
314
- } catch (e) {}
315
-
316
- const jsonString = JSON.stringify(result);
317
- const rawBuffer = Buffer.from(jsonString);
318
- const totalSize = rawBuffer.length;
319
-
320
- // --- STRATEGY 1: GCS OFFLOAD ---
321
- // Trigger if bucket defined AND (UseGCS config set OR size > 800KB)
322
- // This keeps small files in Firestore (faster/cheaper reads) but offloads dangerous sizes
323
- const GCS_THRESHOLD = 800 * 1024; // 800KB
324
- const bucketName = config.gcsBucketName || 'bulltrackers';
325
- const useGCS = config.forceGCS || totalSize > GCS_THRESHOLD;
326
-
327
- if (useGCS) {
328
- try {
329
- const bucket = storage.bucket(bucketName);
330
- const fileName = `${dateContext}/${category}/${name}.json.gz`;
331
- const file = bucket.file(fileName);
332
-
333
- // 1. Compress & Upload
334
- const compressedBuffer = zlib.gzipSync(rawBuffer);
335
- await file.save(compressedBuffer, {
336
- contentType: 'application/json',
337
- contentEncoding: 'gzip',
338
- metadata: {
339
- created: new Date().toISOString(),
340
- originalSize: totalSize,
341
- computation: name
342
- }
343
- });
344
-
345
- // 2. Clean up old Firestore shards (Crucial for cost/consistency)
346
- if (wasSharded) {
347
- await cleanupOldShards(docRef, name, config, deps, { io: opCounts });
348
- }
349
-
350
- // 3. Write Pointer Document
351
- const pointerPayload = {
352
- _completed: true,
353
- _gcs: true, // Flag for the Reader
354
- gcsUri: `gs://${bucketName}/${fileName}`,
355
- gcsBucket: bucketName,
356
- gcsPath: fileName,
357
- _lastUpdated: new Date().toISOString(),
358
- sizeBytes: totalSize
359
- };
360
- if (expireAt) pointerPayload._expireAt = expireAt;
361
-
362
- // Overwrite existing doc (merge: false ensures we clear old schema/data fields)
363
- await docRef.set(pointerPayload, { merge: false });
364
- opCounts.writes += 1;
365
-
366
- logger.log('INFO', `[GCS] ${name}: Offloaded ${(totalSize/1024).toFixed(0)}KB to ${fileName}`);
367
-
368
- // Write to BigQuery (await to ensure completion before function returns)
369
- // Errors are caught and logged but don't fail the operation
370
- // Pass isAlertComputation flag to use streaming for alerts, load jobs for others
371
- await writeToBigQuery(result, name, dateContext, category, logger, isAlertComputation).catch(err => {
372
- logger.log('WARN', `[BigQuery] Failed to write ${name} for ${dateContext}: ${err.message}`);
373
- });
374
-
375
- return { totalSize, isSharded: false, shardCount: 1, nextShardIndex: startShardIndex, opCounts, location: 'GCS' };
376
-
377
- } catch (gcsErr) {
378
- logger.log('ERROR', `[GCS] Upload failed for ${name}, falling back to Firestore: ${gcsErr.message}`);
379
- // Fallthrough to Standard Logic...
380
- }
381
- }
382
-
383
- // --- STRATEGY 2: FIRESTORE COMPRESSION ---
384
- // FIX: Only compress if this is a single, atomic write (not part of a stream).
385
- // Streaming relies on Strategy 3 (Sharding) to create distinct files (shard_0, shard_1...).
386
- // If flushMode is INTERMEDIATE or we are already at a high shard index, we MUST fall through to sharding.
387
- if (totalSize > 50 * 1024 && startShardIndex === 0 && flushMode !== 'INTERMEDIATE') {
388
- try {
389
- const compressedBuffer = zlib.gzipSync(rawBuffer);
390
- if (compressedBuffer.length < 900 * 1024) {
391
- const payloadBuffer = Buffer.from(compressedBuffer);
392
- const compressedPayload = {
393
- _compressed: true,
394
- _completed: true,
395
- _lastUpdated: new Date().toISOString(),
396
- payload: payloadBuffer
397
- };
398
- if (expireAt) compressedPayload._expireAt = expireAt;
399
-
400
- if (wasSharded) {
401
- await cleanupOldShards(docRef, name, config, deps, { io: opCounts });
402
- // Use merge: false (overwrite)
403
- await docRef.set(compressedPayload, { merge: false });
404
- } else {
405
- await docRef.set(compressedPayload, { merge: false });
406
- }
407
-
408
- opCounts.writes += 1;
409
- logger.log('INFO', `[Compression] ${name}: Compressed ${(totalSize/1024).toFixed(0)}KB -> ${(compressedBuffer.length/1024).toFixed(0)}KB.`);
410
-
411
- // Write to BigQuery (await to ensure completion before function returns)
412
- // Errors are caught and logged but don't fail the operation
413
- await writeToBigQuery(result, name, dateContext, category, logger).catch(err => {
414
- logger.log('WARN', `[BigQuery] Failed to write ${name} for ${dateContext}: ${err.message}`);
415
- });
416
-
417
- return { totalSize: compressedBuffer.length, isSharded: false, shardCount: 1, nextShardIndex: startShardIndex, opCounts, location: 'FIRESTORE' };
418
- }
419
- } catch (compErr) {
420
- logger.log('WARN', `[SelfHealing] Compression failed for ${name}, reverting to sharding. Error: ${compErr.message}`);
421
- }
214
+ if (alertTriggers.length > 0) {
215
+ logger.log('INFO', `[ResultCommitter] ${alertTriggers.length} alert computations updated in Firestore.`);
422
216
  }
423
217
 
424
- // --- STRATEGY 3: FIRESTORE SHARDING (Fallback) ---
425
- const strategies = [ { bytes: 900 * 1024, keys: null }, { bytes: 450 * 1024, keys: 10000 }, { bytes: 200 * 1024, keys: 2000 }, { bytes: 100 * 1024, keys: 50 } ];
426
- let committed = false; let lastError = null;
427
- let finalStats = { totalSize: 0, isSharded: false, shardCount: 1, nextShardIndex: startShardIndex, location: 'FIRESTORE' };
428
- let rootMergeOption = !isInitialWrite;
429
-
430
- // Only wipe existing shards if this is the INITIAL write for this batch run.
431
- let shouldWipeShards = wasSharded && isInitialWrite;
432
-
433
- for (let attempt = 0; attempt < strategies.length; attempt++) {
434
- if (committed) break;
435
- const constraints = strategies[attempt];
436
- try {
437
- const updates = await prepareAutoShardedWrites(result, docRef, logger, constraints.bytes, constraints.keys, startShardIndex, flushMode, expireAt);
438
-
439
- if (shouldWipeShards) {
440
- const shardCol = docRef.collection('_shards');
441
- const shardDocs = await shardCol.listDocuments();
442
- shardDocs.forEach(d => updates.unshift({ type: 'DELETE', ref: d }));
443
- shouldWipeShards = false;
444
- }
445
-
446
- const rootUpdate = updates.find(u => u.ref.path === docRef.path && u.type !== 'DELETE');
447
- // FIX: Always use merge: false to ensure old fields (like _compressed/payload) are wiped
448
- if (rootUpdate) { rootUpdate.options = { merge: false }; }
449
-
450
- const writes = updates.filter(u => u.type !== 'DELETE').length;
451
- const deletes = updates.filter(u => u.type === 'DELETE').length;
452
-
453
- await commitBatchInChunks(config, deps, updates, `${name}::${dateContext}`);
454
-
455
- opCounts.writes += writes;
456
- opCounts.deletes += deletes;
457
- finalStats.totalSize = updates.reduce((acc, u) => acc + (u.data ? JSON.stringify(u.data).length : 0), 0);
458
-
459
- // Determine shard count from updates
460
- let maxIndex = startShardIndex;
461
- updates.forEach(u => {
462
- if (u.type === 'DELETE') return;
463
- const segs = u.ref.path.split('/');
464
- const last = segs[segs.length - 1];
465
- if (last.startsWith('shard_')) {
466
- const idx = parseInt(last.split('_')[1]);
467
- if (!isNaN(idx) && idx > maxIndex) maxIndex = idx;
468
- finalStats.isSharded = true;
469
- }
470
- });
471
- const pointer = updates.find(u => u.data && u.data._shardCount !== undefined);
472
- if (pointer) {
473
- finalStats.shardCount = pointer.data._shardCount;
474
- finalStats.nextShardIndex = finalStats.shardCount;
475
- } else if (updates.length > 0) {
476
- finalStats.nextShardIndex = maxIndex + 1;
477
- }
478
-
479
- committed = true;
480
- } catch (commitErr) {
481
- lastError = commitErr;
482
- const msg = commitErr.message || '';
483
- const code = commitErr.code || '';
484
-
485
- if (NON_RETRYABLE_ERRORS.includes(code)) {
486
- logger.log('ERROR', `[SelfHealing] ${name} FATAL error: ${msg}.`);
487
- throw commitErr;
488
- }
489
- logger.log('WARN', `[SelfHealing] ${name} on ${dateContext} failed attempt ${attempt+1}. Error: ${msg}. Retrying...`);
490
- continue;
491
- }
492
- }
493
- if (!committed) {
494
- const shardingError = new Error(`Exhausted sharding strategies for ${name}. Last error: ${lastError?.message}`);
495
- shardingError.stage = 'SHARDING_LIMIT_EXCEEDED';
496
- throw shardingError;
497
- }
498
-
499
- // Write to BigQuery (await to ensure completion before function returns)
500
- // Errors are caught and logged but don't fail the operation
501
- await writeToBigQuery(result, name, dateContext, category, logger).catch(err => {
502
- logger.log('WARN', `[BigQuery] Failed to write ${name} for ${dateContext}: ${err.message}`);
503
- });
504
-
505
- finalStats.opCounts = opCounts;
506
- return finalStats;
218
+ return { successUpdates, failureReport };
507
219
  }
508
220
 
509
- // =============================================================================
510
- // HELPERS
511
- // =============================================================================
221
+ // --- HELPERS ---
512
222
 
513
- /**
514
- * Write computation result to BigQuery (errors are logged but don't fail Firestore writes)
515
- * @param {object} result - Computation result data
516
- * @param {string} name - Computation name
517
- * @param {string} dateContext - Date string (YYYY-MM-DD)
518
- * @param {string} category - Category (e.g., 'popular-investor', 'alerts')
519
- * @param {object} logger - Logger instance
520
- * @param {boolean} isAlertComputation - If true, uses streaming inserts (immediate). If false, uses load jobs (batched, free).
521
- */
522
223
  async function writeToBigQuery(result, name, dateContext, category, logger, isAlertComputation = false) {
523
- // Skip if BigQuery is disabled via environment variable
524
- if (process.env.BIGQUERY_ENABLED === 'false') {
525
- return;
526
- }
527
-
224
+ if (process.env.BIGQUERY_ENABLED === 'false') return;
225
+
528
226
  try {
529
- // Size check: BigQuery streaming inserts have a 10MB limit per row
530
- // Estimate size by stringifying the result
531
- const estimatedSize = JSON.stringify(result).length;
532
- const MAX_BIGQUERY_ROW_SIZE = 9 * 1024 * 1024; // 9MB safety limit (10MB is hard limit)
533
-
534
- if (estimatedSize > MAX_BIGQUERY_ROW_SIZE) {
535
- if (logger) {
536
- logger.log('WARN', `[BigQuery] Skipping ${name} (${dateContext}): Result too large for streaming (${(estimatedSize/1024/1024).toFixed(2)}MB). Data is in GCS/Firestore.`);
537
- }
538
- // Return early - don't attempt insert that will fail
539
- // The data is still available in Firestore/GCS, so this is acceptable
540
- return;
541
- }
542
-
543
- // Ensure table exists
227
+ const { ensureComputationResultsTable, insertRowsWithMerge } = require('../../core/utils/bigquery_utils');
544
228
  await ensureComputationResultsTable(logger);
545
-
546
- // Extract metadata (cids if present)
547
- const metadata = {};
548
- if (result.cids && Array.isArray(result.cids)) {
549
- metadata.cids = result.cids;
550
- }
551
-
552
- // Prepare row for BigQuery
229
+
230
+ // Simple metadata extraction
231
+ const metadata = result.cids && Array.isArray(result.cids) ? { cids: result.cids } : null;
232
+
553
233
  const row = {
554
234
  date: dateContext,
555
235
  computation_name: name,
556
236
  category: category,
557
- result_data: result, // Full result as JSON
558
- metadata: Object.keys(metadata).length > 0 ? metadata : null,
237
+ result_data: result, // BigQuery handles JSON wrapping
238
+ metadata: metadata,
559
239
  created_at: new Date().toISOString()
560
240
  };
561
-
241
+
562
242
  const datasetId = process.env.BIGQUERY_DATASET_ID || 'bulltrackers_data';
563
243
 
564
- // Use MERGE operation to overwrite existing results (by date + computation_name + category)
565
- // This ensures re-running a computation overwrites the old result
566
- // Key fields: date, computation_name, category (ignoring created_at)
567
- const { insertRowsWithMerge } = require('../../core/utils/bigquery_utils');
568
- const keyFields = ['date', 'computation_name', 'category'];
569
-
570
- // For alert computations, we still want to use MERGE but it will use load jobs (free)
571
- // This ensures overwrites work correctly for both alert and non-alert computations
572
- await insertRowsWithMerge(datasetId, 'computation_results', [row], keyFields, logger);
573
-
574
- } catch (error) {
575
- // Log but don't throw - BigQuery write failure shouldn't break Firestore writes
576
- if (logger) {
577
- logger.log('WARN', `[BigQuery] Write failed for ${name} (${dateContext}): ${error.message}`);
578
- }
579
- // Don't re-throw - ensure Firestore writes always succeed
580
- }
581
- }
582
-
583
- async function cleanupOldShards(docRef, name, config, deps, metrics) {
584
- const shardCol = docRef.collection('_shards');
585
- const shardDocs = await shardCol.listDocuments();
586
- if (shardDocs.length > 0) {
587
- const updates = shardDocs.map(d => ({ type: 'DELETE', ref: d }));
588
- await commitBatchInChunks(config, deps, updates, `${name}::CleanupOldShards`);
589
- if (metrics && metrics.io) metrics.io.deletes += updates.length;
590
- }
591
- }
592
-
593
- async function prepareAutoShardedWrites(result, docRef, logger, maxBytes = 900 * 1024, maxKeys = null, startShardIndex = 0, flushMode = 'STANDARD', expireAt = null) {
594
- const OVERHEAD_ALLOWANCE = 20 * 1024; const CHUNK_LIMIT = maxBytes - OVERHEAD_ALLOWANCE;
595
- const totalSize = calculateFirestoreBytes(result); const docPathSize = Buffer.byteLength(docRef.path, 'utf8') + 16;
596
- const writes = []; const shardCollection = docRef.collection('_shards');
597
- let currentChunk = {}; let currentChunkSize = 0; let currentKeyCount = 0;
598
- let shardIndex = startShardIndex;
599
-
600
- const injectTTL = (data) => expireAt ? { ...data, _expireAt: expireAt } : data;
244
+ // Always use merge to ensure idempotency (overwrite previous run for same date/calc)
245
+ await insertRowsWithMerge(datasetId, 'computation_results', [row], ['date', 'computation_name', 'category'], logger);
601
246
 
602
- if (!maxKeys && (totalSize + docPathSize) < CHUNK_LIMIT && flushMode === 'STANDARD' && startShardIndex === 0) {
603
- const data = { ...result, _completed: true, _sharded: false, _lastUpdated: new Date().toISOString() };
604
- return [{ ref: docRef, data: injectTTL(data), options: { merge: true } }];
605
- }
606
-
607
- for (const [key, value] of Object.entries(result)) {
608
- if (key.startsWith('_')) continue;
609
- const keySize = Buffer.byteLength(key, 'utf8') + 1; const valueSize = calculateFirestoreBytes(value); const itemSize = keySize + valueSize;
610
- const byteLimitReached = (currentChunkSize + itemSize > CHUNK_LIMIT); const keyLimitReached = (maxKeys && currentKeyCount + 1 >= maxKeys);
611
-
612
- if (byteLimitReached || keyLimitReached) {
613
- const chunkData = injectTTL(currentChunk);
614
- writes.push({ ref: shardCollection.doc(`shard_${shardIndex}`), data: chunkData, options: { merge: false } });
615
- shardIndex++; currentChunk = {}; currentChunkSize = 0; currentKeyCount = 0;
616
- }
617
- currentChunk[key] = value; currentChunkSize += itemSize; currentKeyCount++;
618
- }
619
-
620
- if (Object.keys(currentChunk).length > 0) {
621
- const chunkData = injectTTL(currentChunk);
622
- writes.push({ ref: shardCollection.doc(`shard_${shardIndex}`), data: chunkData, options: { merge: false } });
623
- shardIndex++;
624
- }
625
-
626
- if (flushMode !== 'INTERMEDIATE') {
627
- const pointerData = {
628
- _completed: true,
629
- _sharded: true,
630
- _shardCount: shardIndex,
631
- _lastUpdated: new Date().toISOString()
632
- };
633
- writes.push({ ref: docRef, data: injectTTL(pointerData), options: { merge: true } });
247
+ } catch (error) {
248
+ if (logger) logger.log('WARN', `[BigQuery] Write failed for ${name}: ${error.message}`);
249
+ // Do not throw; a failed BigQuery write should not crash the computation pipeline
634
250
  }
635
-
636
- return writes;
637
251
  }
638
252
 
639
- async function deleteOldCalculationData(dateStr, oldCategory, calcName, config, deps) {
640
- const { db, logger, calculationUtils } = deps;
641
- const { withRetry } = calculationUtils || { withRetry: (fn) => fn() };
253
+ async function batchFetchSimHashes(db, hashes) {
254
+ if (!hashes || hashes.length === 0) return {};
255
+ const map = {};
256
+ const refs = hashes.map(h => db.collection(SIMHASH_REGISTRY_COLLECTION).doc(h));
642
257
  try {
643
- const oldDocRef = db.collection(config.resultsCollection).doc(dateStr).collection(config.resultsSubcollection).doc(oldCategory).collection(config.computationsSubcollection).doc(calcName);
644
-
645
- const batch = db.batch(); let ops = 0;
646
-
647
- // Clean up 'pages' subcollection if it exists (for Page Mode)
648
- const pagesCol = oldDocRef.collection('pages');
649
- const pageDocs = await withRetry(() => pagesCol.listDocuments(), 'ListOldPages');
650
- for (const pDoc of pageDocs) { batch.delete(pDoc); ops++; }
651
-
652
- // Clean up '_shards' subcollection (for Standard Mode)
653
- const shardsCol = oldDocRef.collection('_shards');
654
- const shardsSnap = await withRetry(() => shardsCol.listDocuments(), 'ListOldShards');
655
-
656
- for (const shardDoc of shardsSnap) { batch.delete(shardDoc); ops++; }
657
-
658
- batch.delete(oldDocRef); ops++;
659
-
660
- await withRetry(() => batch.commit(), 'CleanupOldCategory');
661
- logger.log('INFO', `[Migration] Cleaned up ${ops} docs for ${calcName} in '${oldCategory}'`);
662
- } catch (e) { logger.log('WARN', `[Migration] Failed to clean up ${calcName}: ${e.message}`); }
258
+ const snaps = await db.getAll(...refs);
259
+ snaps.forEach(snap => { if (snap.exists) map[snap.id] = snap.data().simHash; });
260
+ } catch (e) {}
261
+ return map;
663
262
  }
664
263
 
665
- function calculateFirestoreBytes(value) {
666
- if (value === null) return 1; if (value === undefined) return 0; if (typeof value === 'boolean') return 1; if (typeof value === 'number') return 8; if (typeof value === 'string') return Buffer.byteLength(value, 'utf8') + 1; if (value instanceof Date) return 8; if (value.constructor && value.constructor.name === 'DocumentReference') { return Buffer.byteLength(value.path, 'utf8') + 16; }
667
- if (Array.isArray(value)) { let sum = 0; for (const item of value) sum += calculateFirestoreBytes(item); return sum; }
668
- if (typeof value === 'object') { let sum = 0; for (const k in value) { if (Object.prototype.hasOwnProperty.call(value, k)) { sum += (Buffer.byteLength(k, 'utf8') + 1) + calculateFirestoreBytes(value[k]); } } return sum; } return 0;
264
+ async function fetchContracts(db, calcNames) {
265
+ if (!calcNames || calcNames.length === 0) return {};
266
+ const map = {};
267
+ const refs = calcNames.map(name => db.collection(CONTRACTS_COLLECTION).doc(name));
268
+ try {
269
+ const snaps = await db.getAll(...refs);
270
+ snaps.forEach(snap => { if (snap.exists) map[snap.id] = snap.data(); });
271
+ } catch (e) {}
272
+ return map;
669
273
  }
670
274
 
671
275
  function calculateExpirationDate(dateStr, ttlDays) {
672
- // Validate inputs
673
- if (!dateStr || typeof dateStr !== 'string') {
674
- return null; // Invalid date string
675
- }
676
-
677
- if (ttlDays === undefined || ttlDays === null || isNaN(Number(ttlDays))) {
678
- return null; // Invalid TTL days
679
- }
680
-
276
+ if (!dateStr || !ttlDays || isNaN(Number(ttlDays))) return null;
681
277
  const base = new Date(dateStr);
682
-
683
- // Check if date is valid (invalid dates have NaN getTime())
684
- if (isNaN(base.getTime())) {
685
- return null; // Invalid date
686
- }
687
-
278
+ if (isNaN(base.getTime())) return null;
688
279
  base.setDate(base.getDate() + Number(ttlDays));
689
-
690
- // Double-check the result is still valid
691
- if (isNaN(base.getTime())) {
692
- return null; // Resulting date is invalid
693
- }
694
-
695
280
  return base;
696
281
  }
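
The net effect of this refactor is a much simpler routing rule inside `commitResults`: every non-empty result is written to BigQuery, and Firestore is touched only for page (fan-out) and alert computations, each stamped with a TTL. The sketch below is illustrative only and is not part of the package: `routeResult` and the injected `writers` stubs are hypothetical names invented for the example, while `calculateExpirationDate` mirrors the helper added in this version and the 90-day default TTL comes from `DEFAULT_TTL_DAYS`.

```js
// Minimal, self-contained sketch (not the module's actual code) of the new routing rule:
// every non-empty result goes to BigQuery; Firestore is written only for page/alert computations.

function calculateExpirationDate(dateStr, ttlDays) {
  if (!dateStr || !ttlDays || isNaN(Number(ttlDays))) return null;
  const base = new Date(dateStr);
  if (isNaN(base.getTime())) return null;
  base.setDate(base.getDate() + Number(ttlDays));
  return base;
}

async function routeResult(manifest, result, dStr, writers) {
  const isAlert = manifest.isAlertComputation === true;
  const isPage = manifest.isPage === true;
  const ttlDays = manifest.ttlDays !== undefined ? manifest.ttlDays : 90;

  // 1. BigQuery is always written first (source of truth); failures are logged, not thrown.
  await writers.bigQuery(result).catch(err => console.warn(`[BigQuery] ${err.message}`));

  // 2. Standard computations skip Firestore entirely.
  if (!isPage && !isAlert) return 'BIGQUERY';

  // 3. Page and alert computations additionally write to Firestore with a TTL.
  const expireAt = calculateExpirationDate(dStr, ttlDays);
  if (isPage) {
    await writers.firestorePages(result, expireAt); // one document per user id
    return 'FIRESTORE_PAGES';
  }
  await writers.firestoreAlert({ ...result, _isAlert: true, _expireAt: expireAt });
  return 'FIRESTORE_ALERT';
}

// Usage with no-op writers:
routeResult(
  { isPage: true, ttlDays: 30 },
  { user1: { score: 1 } },
  '2024-01-01',
  {
    bigQuery: async () => {},
    firestorePages: async () => {},
    firestoreAlert: async () => {},
  }
).then(location => console.log(location)); // -> FIRESTORE_PAGES
```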