bulltrackers-module 1.0.766 → 1.0.769

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. package/functions/computation-system-v2/UserPortfolioMetrics.js +50 -0
  2. package/functions/computation-system-v2/computations/BehavioralAnomaly.js +559 -227
  3. package/functions/computation-system-v2/computations/GlobalAumPerAsset30D.js +103 -0
  4. package/functions/computation-system-v2/computations/NewSectorExposure.js +82 -35
  5. package/functions/computation-system-v2/computations/NewSocialPost.js +52 -24
  6. package/functions/computation-system-v2/computations/PIDailyAssetAUM.js +134 -0
  7. package/functions/computation-system-v2/computations/PiFeatureVectors.js +227 -0
  8. package/functions/computation-system-v2/computations/PiRecommender.js +359 -0
  9. package/functions/computation-system-v2/computations/PopularInvestorProfileMetrics.js +354 -641
  10. package/functions/computation-system-v2/computations/SignedInUserList.js +51 -0
  11. package/functions/computation-system-v2/computations/SignedInUserMirrorHistory.js +138 -0
  12. package/functions/computation-system-v2/computations/SignedInUserPIProfileMetrics.js +106 -0
  13. package/functions/computation-system-v2/computations/SignedInUserProfileMetrics.js +324 -0
  14. package/functions/computation-system-v2/config/bulltrackers.config.js +40 -126
  15. package/functions/computation-system-v2/core-api.js +17 -9
  16. package/functions/computation-system-v2/data_schema_reference.MD +108 -0
  17. package/functions/computation-system-v2/devtools/builder/builder.js +362 -0
  18. package/functions/computation-system-v2/devtools/builder/examples/user-metrics.yaml +26 -0
  19. package/functions/computation-system-v2/devtools/index.js +36 -0
  20. package/functions/computation-system-v2/devtools/shared/MockDataFactory.js +235 -0
  21. package/functions/computation-system-v2/devtools/shared/SchemaTemplates.js +475 -0
  22. package/functions/computation-system-v2/devtools/shared/SystemIntrospector.js +517 -0
  23. package/functions/computation-system-v2/devtools/shared/index.js +16 -0
  24. package/functions/computation-system-v2/devtools/simulation/DAGAnalyzer.js +243 -0
  25. package/functions/computation-system-v2/devtools/simulation/MockDataFetcher.js +306 -0
  26. package/functions/computation-system-v2/devtools/simulation/MockStorageManager.js +336 -0
  27. package/functions/computation-system-v2/devtools/simulation/SimulationEngine.js +525 -0
  28. package/functions/computation-system-v2/devtools/simulation/SimulationServer.js +581 -0
  29. package/functions/computation-system-v2/devtools/simulation/index.js +17 -0
  30. package/functions/computation-system-v2/devtools/simulation/simulate.js +324 -0
  31. package/functions/computation-system-v2/devtools/vscode-computation/package.json +90 -0
  32. package/functions/computation-system-v2/devtools/vscode-computation/snippets/computation.json +128 -0
  33. package/functions/computation-system-v2/devtools/vscode-computation/src/extension.ts +401 -0
  34. package/functions/computation-system-v2/devtools/vscode-computation/src/providers/codeActions.ts +152 -0
  35. package/functions/computation-system-v2/devtools/vscode-computation/src/providers/completions.ts +207 -0
  36. package/functions/computation-system-v2/devtools/vscode-computation/src/providers/diagnostics.ts +205 -0
  37. package/functions/computation-system-v2/devtools/vscode-computation/src/providers/hover.ts +205 -0
  38. package/functions/computation-system-v2/devtools/vscode-computation/tsconfig.json +22 -0
  39. package/functions/computation-system-v2/docs/HowToCreateComputations.MD +602 -0
  40. package/functions/computation-system-v2/framework/core/Manifest.js +9 -16
  41. package/functions/computation-system-v2/framework/core/RunAnalyzer.js +2 -1
  42. package/functions/computation-system-v2/framework/data/DataFetcher.js +330 -126
  43. package/functions/computation-system-v2/framework/data/MaterializedViewManager.js +84 -0
  44. package/functions/computation-system-v2/framework/data/QueryBuilder.js +38 -38
  45. package/functions/computation-system-v2/framework/execution/Orchestrator.js +226 -153
  46. package/functions/computation-system-v2/framework/scheduling/ScheduleValidator.js +17 -19
  47. package/functions/computation-system-v2/framework/storage/StateRepository.js +32 -2
  48. package/functions/computation-system-v2/framework/storage/StorageManager.js +111 -83
  49. package/functions/computation-system-v2/framework/testing/ComputationTester.js +161 -66
  50. package/functions/computation-system-v2/handlers/dispatcher.js +57 -29
  51. package/functions/computation-system-v2/legacy/PiAssetRecommender.js.old +115 -0
  52. package/functions/computation-system-v2/legacy/PiSimilarityMatrix.js +104 -0
  53. package/functions/computation-system-v2/legacy/PiSimilarityVector.js +71 -0
  54. package/functions/computation-system-v2/scripts/debug_aggregation.js +25 -0
  55. package/functions/computation-system-v2/scripts/test-computation-dag.js +109 -0
  56. package/functions/computation-system-v2/scripts/test-invalidation-scenarios.js +234 -0
  57. package/functions/task-engine/helpers/data_storage_helpers.js +6 -6
  58. package/package.json +1 -1
  59. package/functions/computation-system-v2/computations/PopularInvestorRiskAssessment.js +0 -176
  60. package/functions/computation-system-v2/computations/PopularInvestorRiskMetrics.js +0 -294
  61. package/functions/computation-system-v2/computations/UserPortfolioSummary.js +0 -172
  62. package/functions/computation-system-v2/scripts/migrate-sectors.js +0 -73
  63. package/functions/computation-system-v2/test/analyze-results.js +0 -238
  64. package/functions/computation-system-v2/test/other/test-dependency-cascade.js +0 -150
  65. package/functions/computation-system-v2/test/other/test-dispatcher.js +0 -317
  66. package/functions/computation-system-v2/test/other/test-framework.js +0 -500
  67. package/functions/computation-system-v2/test/other/test-real-execution.js +0 -166
  68. package/functions/computation-system-v2/test/other/test-real-integration.js +0 -194
  69. package/functions/computation-system-v2/test/other/test-refactor-e2e.js +0 -131
  70. package/functions/computation-system-v2/test/other/test-results.json +0 -31
  71. package/functions/computation-system-v2/test/other/test-risk-metrics-computation.js +0 -329
  72. package/functions/computation-system-v2/test/other/test-scheduler.js +0 -204
  73. package/functions/computation-system-v2/test/other/test-storage.js +0 -449
  74. package/functions/computation-system-v2/test/run-pipeline-test.js +0 -554
  75. package/functions/computation-system-v2/test/test-full-pipeline.js +0 -227
  76. package/functions/computation-system-v2/test/test-worker-pool.js +0 -266
package/functions/computation-system-v2/framework/scheduling/ScheduleValidator.js

@@ -1,11 +1,12 @@
 /**
  * @fileoverview Schedule Validator
- *
- * Validates computation schedules and enforces timing rules:
+ * Validates computation schedules and enforces timing rules:
  * 1. Parses schedule declarations
  * 2. Validates schedule format
- * 3. Checks 15-minute rule between dependent computations
- * 4. Generates warnings/errors for problematic schedules
+ * 3. Checks logical ordering (a dependent must not be scheduled BEFORE its dependency)
+ * UPDATE: Removed the "Gap Warning" (race-condition check).
+ * The system's RunAnalyzer (blocking) and Orchestrator (cascading)
+ * handle overlapping schedules safely, so 0-minute gaps are valid.
  */
 
 /**
@@ -37,6 +38,8 @@ class ScheduleValidator {
       time: '02:00',
       timezone: 'UTC'
     };
+    // dependencyGapMinutes is no longer used for validation warnings, but is
+    // kept in case other scheduling logic (e.g. cloud task delays) needs it.
    this.dependencyGapMinutes = config.scheduling?.dependencyGapMinutes || 15;
  }
 
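
Note: a minimal sketch of the config shape this constructor reads. Only
`scheduling.dependencyGapMinutes` is visible in this hunk; the module wrapper
and everything else here is an assumption for illustration.

    // bulltrackers.config.js (hypothetical fragment)
    module.exports = {
      scheduling: {
        // No longer drives validation warnings; may still feed cloud task delays.
        dependencyGapMinutes: 15
      }
    };
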
@@ -261,7 +264,7 @@ class ScheduleValidator {
       const gap = this.calculateGap(depSchedule, entrySchedule);
 
       if (gap === null) {
-        // Different frequencies - can't directly compare
+        // Different frequencies - can't compare directly (a warning is still a useful heads-up)
         issues.push({
           severity: 'warning',
           computation: entry.name,
@@ -272,27 +275,22 @@
         continue;
       }
 
+      // STRICT CHECK: a dependent CANNOT run BEFORE its dependency
       if (gap < 0) {
-        // Dependent runs BEFORE its dependency
         issues.push({
           severity: 'error',
           computation: entry.name,
           dependency: depName,
           gap,
           message: `${entry.name} is scheduled BEFORE its dependency ${depName} (${Math.abs(gap)} minutes earlier)`,
-          suggestion: `Move ${entry.name} to at least ${this.dependencyGapMinutes} minutes after ${depName}`
+          suggestion: `Move ${entry.name} to after ${depName}, or use the default schedule.`
         });
-      } else if (gap < this.dependencyGapMinutes) {
-        // Too close - race condition risk
-        issues.push({
-          severity: 'warning',
-          computation: entry.name,
-          dependency: depName,
-          gap,
-          message: `${entry.name} scheduled only ${gap} minutes after dependency ${depName}`,
-          suggestion: `Increase gap to at least ${this.dependencyGapMinutes} minutes`
-        });
-      }
+      }
+
+      // REMOVED: the warning for (0 <= gap < 15).
+      // Reason: in this architecture, dependents are triggered via event cascade (Pass 1 -> Pass 2).
+      // A 0-minute gap (or the same default schedule) is safe because the RunAnalyzer will simply
+      // BLOCK the dependent until the dependency is ready, or the Orchestrator will trigger it.
     }
 
     return issues;
@@ -324,4 +322,4 @@
   }
 }
 
-module.exports = { ScheduleValidator };
+module.exports = { ScheduleValidator };
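
Note: a hedged sketch of the new gap semantics. The schedule shape
(frequency/time/timezone) is inferred from the defaults shown above and is an
assumption; calculateGap() is the method from the hunks.

    const { ScheduleValidator } = require('./framework/scheduling/ScheduleValidator');

    const validator = new ScheduleValidator({});
    const dependency = { frequency: 'daily', time: '02:00', timezone: 'UTC' };
    const dependent  = { frequency: 'daily', time: '02:00', timezone: 'UTC' };

    // validator.calculateGap(dependency, dependent) === 0: previously this
    // produced a "race condition" warning; now it is valid, since the
    // RunAnalyzer blocks the dependent until the dependency is ready.
    // Only a negative gap still raises an error.
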
package/functions/computation-system-v2/framework/storage/StateRepository.js

@@ -4,6 +4,7 @@
  * 1. Loading daily execution status (hashes, timestamps)
  * 2. Loading previous results (for dependencies and history)
  * 3. Caching results for performance
+ * UPDATE: Added getRunDates() to support automatic backfill fan-out.
  */
 
 const { BigQuery } = require('@google-cloud/bigquery');
@@ -87,6 +88,37 @@ class StateRepository {
     return statusMap;
   }
 
+  /**
+   * Fetch all dates on which a computation has previously run.
+   * Used for fan-out / backfill operations on code deployment.
+   * @param {string} computationName
+   * @returns {Promise<string[]>} List of YYYY-MM-DD strings
+   */
+  async getRunDates(computationName) {
+    try {
+      const table = this.config.resultStore?.table || 'computation_results';
+      const fullTable = `\`${this.config.bigquery.projectId}.${this.config.bigquery.dataset}.${table}\``;
+
+      const query = `
+        SELECT DISTINCT date
+        FROM ${fullTable}
+        WHERE computation_name = @compName
+        ORDER BY date DESC
+      `;
+
+      const [rows] = await this.bigquery.query({
+        query,
+        params: { compName: computationName.toLowerCase() },
+        location: this.config.bigquery.location
+      });
+
+      return rows.map(r => r.date.value || r.date);
+    } catch (e) {
+      this._log('WARN', `Failed to fetch run dates for ${computationName}: ${e.message}`);
+      return [];
+    }
+  }
+
   /**
    * Update the local status cache after a write.
    * @param {string} dateStr
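
Note: getRunDates() exists to support backfill fan-out on deploy; a minimal
usage sketch. enqueueRecompute() is a hypothetical stand-in for the actual
fan-out mechanism, which is not shown in this diff.

    // Re-run every historical date for a computation after a code change.
    async function backfillOnDeploy(stateRepo, computationName) {
      const dates = await stateRepo.getRunDates(computationName); // ['2025-06-01', ...]
      for (const date of dates) {
        await enqueueRecompute(computationName, date); // one task per run date
      }
    }
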
@@ -208,8 +240,6 @@ class StateRepository {
 
   /**
    * Get multiple entity results in a single query (batch lazy load)
-   * FIXED: This solves the N+1 problem by allowing the Executor to fetch dependencies
-   * for an entire processing batch in one go.
    */
   async getBatchEntityResults(dateStr, computationName, entityIds) {
     if (!entityIds || entityIds.length === 0) return {};
package/functions/computation-system-v2/framework/storage/StorageManager.js

@@ -10,7 +10,7 @@
  * FIX: Switched to bigquery.createJob for GCS imports to prevent local file path interpretation errors.
  * FIX: Improved error logging to catch swallowed BigQuery insert errors.
  * FIX: finalizeResults now checks for file existence to prevent "Not found" errors on empty results.
- * FIX: Added SAFE.PARSE_JSON to MERGE statement for BOTH result_data and dependency_result_hashes.
+ * FIX: Removed SAFE.PARSE_JSON from MERGE to match STRING schema types.
  */
 
 const { Firestore } = require('@google-cloud/firestore');
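
Note: with SAFE.PARSE_JSON removed, result_data and dependency_result_hashes
stay STRING end to end, so readers must parse them in application code. A
hedged sketch; the query text is illustrative, but the column names match the
MERGE statement further down.

    // Reading results back now that result_data is a STRING column.
    async function readResults(bigquery, fullTable) {
      const [rows] = await bigquery.query({
        query: `SELECT entity_id, result_data FROM ${fullTable} WHERE date = @date`,
        params: { date: '2025-06-01' }
      });
      return rows.map(r => ({ id: r.entity_id, data: JSON.parse(r.result_data) }));
    }
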
@@ -23,20 +23,20 @@ class StorageManager {
   constructor(config, logger = null) {
     this.config = config;
     this.logger = logger;
-
+
     this.bigquery = new BigQuery({
       projectId: config.bigquery?.projectId,
       location: config.bigquery?.location || 'EU'
     });
-
+
     this.storage = new Storage({
       projectId: config.bigquery?.projectId
     });
-
+
     this._firestore = null;
     this.tableExists = new Map();
   }
-
+
   get firestore() {
     if (!this._firestore) {
       this._firestore = new Firestore({
@@ -52,16 +52,15 @@
    */
   async claimZombie(checkpointId) {
     if (!checkpointId) return;
-
-    // FIX: Access projectId and dataset from the config object
-    const { projectId, dataset } = this.config.bigquery; //
+
+    const { projectId, dataset } = this.config.bigquery;
 
     const query = `
       UPDATE \`${projectId}.${dataset}.computation_checkpoints\`
       SET last_updated = CURRENT_TIMESTAMP()
       WHERE checkpoint_id = @checkpointId
     `;
-
+
     try {
       await this.bigquery.query({
         query,
@@ -73,8 +72,8 @@
     }
   }
 
-
-
+
+
   // =========================================================================
   // RESULT COMMITTING (Batch -> GCS Buffer)
   // =========================================================================
@@ -87,7 +86,7 @@
   async commitResults(dateStr, entry, results, depResultHashes = {}) {
     const storageConfig = this._resolveStorageConfig(entry);
     const startTime = Date.now();
-
+
     // Define GCS Task (Fatal on error)
     const gcsTask = async () => {
       if (storageConfig.bigquery === false) return null;
@@ -117,11 +116,11 @@
       firestoreTask()
     ]);
 
-    const writeResults = {
-      bigquery: bigqueryResult,
-      firestore: firestoreResult
+    const writeResults = {
+      bigquery: bigqueryResult,
+      firestore: firestoreResult
     };
-
+
     const duration = Date.now() - startTime;
     this._log('INFO', `Committed (Staged) ${entry.name} results in ${duration}ms`);
     return writeResults;
@@ -139,18 +138,18 @@
     const table = this.config.resultStore?.table || 'computation_results';
     const bucketName = this.config.gcs?.bucket;
     const prefix = this.config.gcs?.prefix || 'staging';
-
+
     // 1. Define GCS path pattern: gs://bucket/prefix/date/computation/*.json
     const filePrefix = `${prefix}/${dateStr}/${entry.name}/`;
     const gcsPath = `gs://${bucketName}/${filePrefix}*.json`;
-
+
     this._log('INFO', `Finalizing ${entry.name}...`);
 
     try {
       // FIX: Check if files actually exist before trying to load them.
       // If the computation produced 0 results, no files exist, and BQ will throw "Not Found".
       const [files] = await this.storage.bucket(bucketName).getFiles({ prefix: filePrefix });
-
+
       if (!files || files.length === 0) {
         this._log('INFO', `No staged files found for ${entry.name}. Skipping finalization (Empty Result).`);
         return;
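
Note: the two-phase write that commitResults/finalizeResults implement,
condensed into a hedged sketch (inside an async context). The method names
match the code above; the exact finalize signature is assumed from the
identifiers used in this hunk.

    // Phase 1: stage results as NDJSON in GCS (fast, append-only):
    //   gs://<bucket>/<prefix>/<date>/<computation>/<uuid>.json
    await storageManager.commitResults(dateStr, entry, results, depResultHashes);

    // Phase 2: load the staged files into a temp table and MERGE into the target.
    // Skips silently when the run produced zero results (no staged files).
    await storageManager.finalizeResults(dateStr, entry);
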
@@ -162,7 +161,7 @@
       // 2. Load GCS files into a temporary table.
       // We create the temp table with the exact schema we expect first.
       await this._createTempTableForLoad(tempTableId);
-
+
       // FIX: Use bigquery.createJob directly.
       const [job] = await this.bigquery.createJob({
         configuration: {
@@ -235,9 +234,9 @@
       await this.bigquery.dataset(dataset).table(table).insert([row]);
     } catch (error) {
       if (error.name === 'PartialFailureError' || error.errors) {
-        this._log('ERROR', `Checkpoint insert failed: ${JSON.stringify(error.errors)}`);
+        this._log('ERROR', `Checkpoint insert failed: ${JSON.stringify(error.errors)}`);
       } else {
-        this._log('ERROR', `Checkpoint insert failed: ${error.message}`);
+        this._log('ERROR', `Checkpoint insert failed: ${error.message}`);
       }
       throw error;
     }
@@ -349,10 +348,6 @@
     const table = 'computation_checkpoints';
     const fullTable = `\`${this.config.bigquery.projectId}.${this.config.bigquery.dataset}.${table}\``;
     try {
-      // FIX: Use subquery with ROW_NUMBER to find the TRUE latest state per computation.
-      // We only count it as a zombie if the LATEST row is 'running'.
-      // This ignores 'running' rows that have a newer (or same-time) 'completed' sibling.
-      // UPDATE: Added attempts to the selection
       const query = `
         SELECT computation_name, date, checkpoint_id, last_updated, attempts
         FROM (
@@ -380,15 +375,15 @@
         LIMIT 50
       `;
       const [rows] = await this.bigquery.query({ query, params: { minutes: minutesThreshold }, location: this.config.bigquery.location });
-      return rows.map(r => ({
-        name: r.computation_name,
-        date: r.date.value || r.date,
+      return rows.map(r => ({
+        name: r.computation_name,
+        date: r.date.value || r.date,
         checkpointId: r.checkpoint_id,
-        attempts: r.attempts
+        attempts: r.attempts
       }));
-    } catch (e) {
+    } catch (e) {
       console.error(`[Storage] findZombies failed: ${e.message}`);
-      return [];
+      return [];
     }
   }
 
@@ -407,9 +402,6 @@
     const table = 'computation_checkpoints';
     const fullTable = `\`${this.config.bigquery.projectId}.${this.config.bigquery.dataset}.${table}\``;
     try {
-      // FIX: Added tie-breaker logic to ORDER BY.
-      // If timestamps are identical, 'completed' (1) comes before 'failed' (2) before 'running' (3).
-      // This ensures we never accidentally pick a "running" row when a "completed" one exists at the exact same ms.
       const query = `
         SELECT checkpoint_id, status, processed_count, last_entity_id, completed_batches, worker_instance_id, last_updated, attempts, code_hash, started_at
         FROM ${fullTable}
@@ -425,7 +417,7 @@
       `;
       const [rows] = await this.bigquery.query({ query, params: { date: dateStr, computationName }, location: this.config.bigquery.location });
       if (rows.length === 0) return null;
-
+
       const r = rows[0];
       return {
         ...r,
@@ -446,17 +438,17 @@
   async savePerformanceReport(report) {
     const table = 'computation_performance';
     const dataset = this.config.bigquery.dataset;
-    await this._ensurePerformanceTable(table);
+    await this._ensurePerformanceTable(table);
 
     const row = {
-      run_id : report.runId || 'unknown',
-      computation_name : report.computationName,
+      run_id: report.runId || 'unknown',
+      computation_name: report.computationName,
       date: report.date,
       duration_ms: report.durationMs,
-      metrics: JSON.stringify (report.metrics || {}),
+      metrics: JSON.stringify(report.metrics || {}),
       entity_count: report.entityCount || 0,
-      status : report.status || 'completed',
-      created_at : this.bigquery.timestamp(new Date())
+      status: report.status || 'completed',
+      created_at: this.bigquery.timestamp(new Date())
     };
     try {
       await this.bigquery.dataset(dataset).table(table).insert([row]);
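
Note: the report object maps one-to-one onto the row built above; a minimal
call example (values invented for illustration, inside an async context):

    await storageManager.savePerformanceReport({
      runId: 'run-abc123',
      computationName: 'UserPortfolioMetrics',
      date: '2025-06-01',
      durationMs: 5321,
      metrics: { batches: 12 },   // stringified into the STRING `metrics` column
      entityCount: 1043,
      status: 'completed'
    });
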
@@ -486,29 +478,27 @@
   async _stageToGCS(dateStr, entry, results, depResultHashes) {
     const rows = this._buildBigQueryRows(dateStr, entry, results, depResultHashes);
     if (rows.length === 0) return { rowCount: 0 };
-
+
     const bucketName = this.config.gcs?.bucket;
     const prefix = this.config.gcs?.prefix || 'staging';
     const filename = `${prefix}/${dateStr}/${entry.name}/${crypto.randomUUID()}.json`;
-
+
     const file = this.storage.bucket(bucketName).file(filename);
-
+
     const ndjson = rows.map(r => JSON.stringify(r)).join('\n');
-
+
     await file.save(ndjson, {
       contentType: 'application/json',
-      resumable: false
+      resumable: false
     });
-
+
     return { rowCount: rows.length, gcsUri: `gs://${bucketName}/${filename}` };
   }
 
   async _createTempTableForLoad(tableName) {
     const dataset = this.bigquery.dataset(this.config.bigquery.dataset);
     const table = dataset.table(tableName);
-
-    // Note: result_data and dependency_result_hashes are loaded as STRING from the JSON file
-    // They will be parsed into JSON during the merge step.
+
     const schema = [
       { name: 'date', type: 'DATE', mode: 'REQUIRED' },
       { name: 'computation_name', type: 'STRING', mode: 'REQUIRED' },
@@ -528,9 +518,10 @@
   async _mergeStagedData(targetTable, tempTable) {
     const fullTarget = `\`${this.config.bigquery.projectId}.${this.config.bigquery.dataset}.${targetTable}\``;
     const fullTemp = `\`${this.config.bigquery.projectId}.${this.config.bigquery.dataset}.${tempTable}\``;
-
+
     await this._ensureBigQueryTable(targetTable);
 
+    // FIX: Removed SAFE.PARSE_JSON() because target columns are STRING.
     const mergeQuery = `
       MERGE INTO ${fullTarget} T
       USING (
@@ -544,31 +535,69 @@
       UPDATE SET
         code_hash = S.code_hash,
         result_hash = S.result_hash,
-        dependency_result_hashes = SAFE.PARSE_JSON(S.dependency_result_hashes),
+        dependency_result_hashes = S.dependency_result_hashes,
         entity_count = S.entity_count,
-        result_data = SAFE.PARSE_JSON(S.result_data),
+        result_data = S.result_data,
         updated_at = S.updated_at
       WHEN NOT MATCHED THEN
         INSERT (date, computation_name, category, entity_id, code_hash, result_hash,
                 dependency_result_hashes, entity_count, result_data, updated_at)
         VALUES (S.date, S.computation_name, S.category, S.entity_id, S.code_hash, S.result_hash,
-                SAFE.PARSE_JSON(S.dependency_result_hashes), S.entity_count, SAFE.PARSE_JSON(S.result_data), S.updated_at)
+                S.dependency_result_hashes, S.entity_count, S.result_data, S.updated_at)
     `;
-
+
     // UPDATE: Use createQueryJob to capture DML statistics
     try {
-      const [job] = await this.bigquery.createQueryJob({
-        query: mergeQuery,
-        location: this.config.bigquery.location
+      const [job] = await this.bigquery.createQueryJob({
+        query: mergeQuery,
+        location: this.config.bigquery.location
       });
-
+
       await job.getQueryResults(); // Wait for completion
-
+
       const metadata = await job.getMetadata();
       const stats = metadata[0]?.statistics?.query;
       const affectedRows = stats?.numDmlAffectedRows;
 
       this._log('INFO', `Merge complete on ${targetTable}. Rows affected (Inserted/Updated): ${affectedRows}`);
+
+      // =========================================================================
+      // CRITICAL FIX: Delete stale entity rows that were NOT in the new staging data.
+      // This prevents ghost entities from previous runs from polluting results.
+      // We identify the date and computation from the temp table and remove any
+      // rows in the target that don't have a matching entity_id in the new run.
+      // =========================================================================
+      const deleteQuery = `
+        DELETE FROM ${fullTarget} T
+        WHERE EXISTS (
+          -- Identify which (date, computation_name) pairs were just processed
+          SELECT 1 FROM ${fullTemp} S
+          WHERE S.date = T.date AND S.computation_name = T.computation_name
+        )
+        AND NOT EXISTS (
+          -- Keep only entity_ids that are in the new staging data
+          SELECT 1 FROM ${fullTemp} S
+          WHERE S.date = T.date
+            AND S.computation_name = T.computation_name
+            AND S.entity_id = T.entity_id
+        )
+      `;
+
+      const [deleteJob] = await this.bigquery.createQueryJob({
+        query: deleteQuery,
+        location: this.config.bigquery.location
+      });
+
+      await deleteJob.getQueryResults();
+
+      const deleteMeta = await deleteJob.getMetadata();
+      const deleteStats = deleteMeta[0]?.statistics?.query;
+      const deletedRows = deleteStats?.numDmlAffectedRows;
+
+      if (deletedRows && parseInt(deletedRows, 10) > 0) {
+        this._log('INFO', `Cleanup: Deleted ${deletedRows} stale entity rows from ${targetTable}`);
+      }
+
     } catch (e) {
       this._logError(`Merge Failed on ${targetTable}`, e);
       throw e;
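
Note: the cleanup DELETE above expresses a set difference. The same logic in
plain JavaScript, with hypothetical entity ids, for illustration:

    // Existing rows for one (date, computation_name) pair: A, B, C.
    // 'C' came from an earlier run and is absent from the new staging data.
    const targetIds = ['A', 'B', 'C'];
    const stagedIds = new Set(['A', 'B']);

    const stale = targetIds.filter(id => !stagedIds.has(id)); // ['C'] -> deleted
    // Rows for other dates/computations are untouched: the EXISTS clause scopes
    // the DELETE to pairs that appear in the staging table.
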
@@ -587,7 +616,7 @@
     const rows = [];
     const timestamp = new Date().toISOString();
     const depResultHashesJson = JSON.stringify(depResultHashes);
-
+
     if (entry.type === 'per-entity' && typeof results === 'object') {
       for (const [entityId, data] of Object.entries(results)) {
         rows.push({
@@ -659,7 +688,7 @@
       { name: 'computation_name', type: 'STRING', mode: 'REQUIRED' },
       { name: 'date', type: 'DATE', mode: 'REQUIRED' },
       { name: 'duration_ms', type: 'INTEGER', mode: 'NULLABLE' },
-      { name: 'metrics', type: 'STRING', mode: 'NULLABLE' }, // JSON string
+      { name: 'metrics', type: 'STRING', mode: 'NULLABLE' },
       { name: 'entity_count', type: 'INTEGER', mode: 'NULLABLE' },
       { name: 'status', type: 'STRING', mode: 'NULLABLE' },
       { name: 'created_at', type: 'TIMESTAMP', mode: 'REQUIRED' }
@@ -672,28 +701,28 @@
       });
       this._log('INFO', `Created table ${tableName}`);
     }
-
+
     this.tableExists.set(tableName, true);
   }
-
+
   async _ensureCheckpointTable(tableName) {
     if (this.tableExists.get(tableName)) return;
     const dataset = this.bigquery.dataset(this.config.bigquery.dataset);
     const table = dataset.table(tableName);
     const [exists] = await table.exists();
-
+
     const schema = [
       { name: 'date', type: 'DATE', mode: 'REQUIRED' },
       { name: 'computation_name', type: 'STRING', mode: 'REQUIRED' },
       { name: 'checkpoint_id', type: 'STRING', mode: 'REQUIRED' },
       { name: 'worker_instance_id', type: 'STRING', mode: 'NULLABLE' },
-      { name: 'code_hash', type: 'STRING', mode: 'NULLABLE' },
+      { name: 'code_hash', type: 'STRING', mode: 'NULLABLE' },
       { name: 'status', type: 'STRING', mode: 'REQUIRED' },
       { name: 'processed_count', type: 'INTEGER', mode: 'NULLABLE' },
       { name: 'total_entities', type: 'INTEGER', mode: 'NULLABLE' },
      { name: 'last_entity_id', type: 'STRING', mode: 'NULLABLE' },
      { name: 'completed_batches', type: 'INTEGER', mode: 'REPEATED' },
-      { name: 'attempts', type: 'INTEGER', mode: 'NULLABLE' },
+      { name: 'attempts', type: 'INTEGER', mode: 'NULLABLE' },
      { name: 'started_at', type: 'TIMESTAMP', mode: 'REQUIRED' },
      { name: 'last_updated', type: 'TIMESTAMP', mode: 'REQUIRED' }
    ];
@@ -704,38 +733,38 @@
         timePartitioning: { type: 'DAY', field: 'date' },
         clustering: { fields: ['computation_name', 'status'] }
       });
-    }
+    }
     this.tableExists.set(tableName, true);
   }
 
   _writeToFirestore(dateStr, entry, results, firestoreConfig) {
     const { path, merge, includeMetadata } = firestoreConfig;
     if (!path) throw new Error(`Firestore path not configured for ${entry.name}`);
-
+
     const timestamp = new Date();
     const metadata = includeMetadata ? {
-      _computedAt: timestamp, _computationDate: dateStr,
+      _computedAt: timestamp, _computationDate: dateStr,
       _computationName: entry.name, _codeHash: entry.hash
     } : {};
-
+
     let docCount = 0;
-
+
     if (entry.type === 'per-entity' && typeof results === 'object') {
       const batches = [];
       let currentBatch = this.firestore.batch();
       let batchCount = 0;
       const MAX_BATCH = 500;
-
+
       for (const [entityId, data] of Object.entries(results)) {
         const docPath = this._resolvePath(path, {
           entityId, date: dateStr, computationName: entry.name, category: entry.category || 'uncategorized'
         });
-
+
         const docRef = this.firestore.doc(docPath);
         const docData = { ...data, ...metadata };
-
+
         merge ? currentBatch.set(docRef, docData, { merge: true }) : currentBatch.set(docRef, docData);
-
+
         batchCount++; docCount++;
         if (batchCount >= MAX_BATCH) {
           batches.push(currentBatch);
@@ -744,10 +773,10 @@
         }
       }
       if (batchCount > 0) batches.push(currentBatch);
-
+
       const limit = pLimit(10);
       return Promise.all(batches.map(b => limit(() => b.commit()))).then(() => ({ docCount }));
-
+
     } else {
       const docPath = this._resolvePath(path, {
         entityId: '_global', date: dateStr, computationName: entry.name, category: entry.category || 'uncategorized'
@@ -772,23 +801,22 @@
     const str = typeof data === 'string' ? data : JSON.stringify(data);
     return crypto.createHash('md5').update(str).digest('hex').substring(0, 16);
   }
-
+
   _log(level, message) {
     this.logger?.log ? this.logger.log(level, `[StorageManager] ${message}`) : console.log(`[${level}] [StorageManager] ${message}`);
   }
 
   _logError(context, error) {
-    // Safe logging for BigQuery PartialFailureError which hides details in .errors
     let details = error.message;
     if (error.errors && Array.isArray(error.errors)) {
       details = JSON.stringify(error.errors, null, 2);
     } else if (error.response && error.response.insertErrors) {
       details = JSON.stringify(error.response.insertErrors, null, 2);
     }
-
+
     this._log('ERROR', `${context}: ${details}`);
   }
-
+
 }
 
 module.exports = { StorageManager };