bulltrackers-module 1.0.766 → 1.0.769
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/functions/computation-system-v2/UserPortfolioMetrics.js +50 -0
- package/functions/computation-system-v2/computations/BehavioralAnomaly.js +559 -227
- package/functions/computation-system-v2/computations/GlobalAumPerAsset30D.js +103 -0
- package/functions/computation-system-v2/computations/NewSectorExposure.js +82 -35
- package/functions/computation-system-v2/computations/NewSocialPost.js +52 -24
- package/functions/computation-system-v2/computations/PIDailyAssetAUM.js +134 -0
- package/functions/computation-system-v2/computations/PiFeatureVectors.js +227 -0
- package/functions/computation-system-v2/computations/PiRecommender.js +359 -0
- package/functions/computation-system-v2/computations/PopularInvestorProfileMetrics.js +354 -641
- package/functions/computation-system-v2/computations/SignedInUserList.js +51 -0
- package/functions/computation-system-v2/computations/SignedInUserMirrorHistory.js +138 -0
- package/functions/computation-system-v2/computations/SignedInUserPIProfileMetrics.js +106 -0
- package/functions/computation-system-v2/computations/SignedInUserProfileMetrics.js +324 -0
- package/functions/computation-system-v2/config/bulltrackers.config.js +40 -126
- package/functions/computation-system-v2/core-api.js +17 -9
- package/functions/computation-system-v2/data_schema_reference.MD +108 -0
- package/functions/computation-system-v2/devtools/builder/builder.js +362 -0
- package/functions/computation-system-v2/devtools/builder/examples/user-metrics.yaml +26 -0
- package/functions/computation-system-v2/devtools/index.js +36 -0
- package/functions/computation-system-v2/devtools/shared/MockDataFactory.js +235 -0
- package/functions/computation-system-v2/devtools/shared/SchemaTemplates.js +475 -0
- package/functions/computation-system-v2/devtools/shared/SystemIntrospector.js +517 -0
- package/functions/computation-system-v2/devtools/shared/index.js +16 -0
- package/functions/computation-system-v2/devtools/simulation/DAGAnalyzer.js +243 -0
- package/functions/computation-system-v2/devtools/simulation/MockDataFetcher.js +306 -0
- package/functions/computation-system-v2/devtools/simulation/MockStorageManager.js +336 -0
- package/functions/computation-system-v2/devtools/simulation/SimulationEngine.js +525 -0
- package/functions/computation-system-v2/devtools/simulation/SimulationServer.js +581 -0
- package/functions/computation-system-v2/devtools/simulation/index.js +17 -0
- package/functions/computation-system-v2/devtools/simulation/simulate.js +324 -0
- package/functions/computation-system-v2/devtools/vscode-computation/package.json +90 -0
- package/functions/computation-system-v2/devtools/vscode-computation/snippets/computation.json +128 -0
- package/functions/computation-system-v2/devtools/vscode-computation/src/extension.ts +401 -0
- package/functions/computation-system-v2/devtools/vscode-computation/src/providers/codeActions.ts +152 -0
- package/functions/computation-system-v2/devtools/vscode-computation/src/providers/completions.ts +207 -0
- package/functions/computation-system-v2/devtools/vscode-computation/src/providers/diagnostics.ts +205 -0
- package/functions/computation-system-v2/devtools/vscode-computation/src/providers/hover.ts +205 -0
- package/functions/computation-system-v2/devtools/vscode-computation/tsconfig.json +22 -0
- package/functions/computation-system-v2/docs/HowToCreateComputations.MD +602 -0
- package/functions/computation-system-v2/framework/core/Manifest.js +9 -16
- package/functions/computation-system-v2/framework/core/RunAnalyzer.js +2 -1
- package/functions/computation-system-v2/framework/data/DataFetcher.js +330 -126
- package/functions/computation-system-v2/framework/data/MaterializedViewManager.js +84 -0
- package/functions/computation-system-v2/framework/data/QueryBuilder.js +38 -38
- package/functions/computation-system-v2/framework/execution/Orchestrator.js +226 -153
- package/functions/computation-system-v2/framework/scheduling/ScheduleValidator.js +17 -19
- package/functions/computation-system-v2/framework/storage/StateRepository.js +32 -2
- package/functions/computation-system-v2/framework/storage/StorageManager.js +111 -83
- package/functions/computation-system-v2/framework/testing/ComputationTester.js +161 -66
- package/functions/computation-system-v2/handlers/dispatcher.js +57 -29
- package/functions/computation-system-v2/legacy/PiAssetRecommender.js.old +115 -0
- package/functions/computation-system-v2/legacy/PiSimilarityMatrix.js +104 -0
- package/functions/computation-system-v2/legacy/PiSimilarityVector.js +71 -0
- package/functions/computation-system-v2/scripts/debug_aggregation.js +25 -0
- package/functions/computation-system-v2/scripts/test-computation-dag.js +109 -0
- package/functions/computation-system-v2/scripts/test-invalidation-scenarios.js +234 -0
- package/functions/task-engine/helpers/data_storage_helpers.js +6 -6
- package/package.json +1 -1
- package/functions/computation-system-v2/computations/PopularInvestorRiskAssessment.js +0 -176
- package/functions/computation-system-v2/computations/PopularInvestorRiskMetrics.js +0 -294
- package/functions/computation-system-v2/computations/UserPortfolioSummary.js +0 -172
- package/functions/computation-system-v2/scripts/migrate-sectors.js +0 -73
- package/functions/computation-system-v2/test/analyze-results.js +0 -238
- package/functions/computation-system-v2/test/other/test-dependency-cascade.js +0 -150
- package/functions/computation-system-v2/test/other/test-dispatcher.js +0 -317
- package/functions/computation-system-v2/test/other/test-framework.js +0 -500
- package/functions/computation-system-v2/test/other/test-real-execution.js +0 -166
- package/functions/computation-system-v2/test/other/test-real-integration.js +0 -194
- package/functions/computation-system-v2/test/other/test-refactor-e2e.js +0 -131
- package/functions/computation-system-v2/test/other/test-results.json +0 -31
- package/functions/computation-system-v2/test/other/test-risk-metrics-computation.js +0 -329
- package/functions/computation-system-v2/test/other/test-scheduler.js +0 -204
- package/functions/computation-system-v2/test/other/test-storage.js +0 -449
- package/functions/computation-system-v2/test/run-pipeline-test.js +0 -554
- package/functions/computation-system-v2/test/test-full-pipeline.js +0 -227
- package/functions/computation-system-v2/test/test-worker-pool.js +0 -266
package/functions/computation-system-v2/framework/scheduling/ScheduleValidator.js +17 -19

@@ -1,11 +1,12 @@
 /**
  * @fileoverview Schedule Validator
- *
- * Validates computation schedules and enforces timing rules:
+ * * Validates computation schedules and enforces timing rules:
  * 1. Parses schedule declarations
  * 2. Validates schedule format
- * 3. Checks
- *
+ * 3. Checks logical ordering (Dependent must not be scheduled BEFORE Dependency)
+ * * * UPDATE: Removed "Gap Warning" (race condition check).
+ * * * The system's RunAnalyzer (Blocking) and Orchestrator (Cascading)
+ * * * handle overlapping schedules safely, so 0-minute gaps are valid.
  */
 
 /**
@@ -37,6 +38,8 @@ class ScheduleValidator {
       time: '02:00',
       timezone: 'UTC'
     };
+    // dependencyGapMinutes is no longer used for validation warnings,
+    // but kept if needed for other scheduling logic (e.g. cloud task delays)
     this.dependencyGapMinutes = config.scheduling?.dependencyGapMinutes || 15;
   }
 
@@ -261,7 +264,7 @@
       const gap = this.calculateGap(depSchedule, entrySchedule);
 
       if (gap === null) {
-        // Different frequencies - can't directly compare
+        // Different frequencies - can't directly compare (Warning is still useful here as a heads up)
         issues.push({
           severity: 'warning',
           computation: entry.name,
@@ -272,27 +275,22 @@
         continue;
       }
 
+      // STRICT CHECK: Dependent CANNOT run BEFORE dependency
       if (gap < 0) {
-        // Dependent runs BEFORE its dependency
         issues.push({
           severity: 'error',
           computation: entry.name,
           dependency: depName,
           gap,
           message: `${entry.name} is scheduled BEFORE its dependency ${depName} (${Math.abs(gap)} minutes earlier)`,
-          suggestion: `Move ${entry.name} to
+          suggestion: `Move ${entry.name} to after ${depName} or use the default schedule.`
         });
-      }
-
-
-
-
-
-          gap,
-          message: `${entry.name} scheduled only ${gap} minutes after dependency ${depName}`,
-          suggestion: `Increase gap to at least ${this.dependencyGapMinutes} minutes`
-        });
-      }
+      }
+
+      // REMOVED: Warning for (0 <= gap < 15).
+      // Reason: In this architecture, dependents are triggered via Event Cascade (Pass 1 -> Pass 2).
+      // A 0-minute gap (or same default schedule) is safe because the RunAnalyzer will simply BLOCK
+      // the dependent until the dependency is ready, or the Orchestrator will trigger it automatically.
     }
 
     return issues;
@@ -324,4 +322,4 @@
   }
 }
 
-module.exports = { ScheduleValidator };
+module.exports = { ScheduleValidator };
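For context on the gap semantics used above: calculateGap() itself does not appear in this diff, but the sign convention follows from the messages. A minimal sketch, assuming two daily schedules expressed as HH:MM strings in the same timezone (names here are illustrative, not the package's code):

    // Minimal sketch, NOT the package's calculateGap(): assumes two daily
    // schedules as 'HH:MM' strings in the same timezone.
    function gapMinutes(dependencyTime, dependentTime) {
      const toMinutes = (t) => {
        const [h, m] = t.split(':').map(Number);
        return h * 60 + m;
      };
      // Positive: dependent runs after its dependency. Negative: before it.
      return toMinutes(dependentTime) - toMinutes(dependencyTime);
    }

    gapMinutes('02:00', '03:30'); //  90 -> fine
    gapMinutes('02:00', '02:00'); //   0 -> now valid; no gap warning is emitted
    gapMinutes('02:00', '01:00'); // -60 -> 'error': scheduled BEFORE dependency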
package/functions/computation-system-v2/framework/storage/StateRepository.js +32 -2

@@ -4,6 +4,7 @@
  * 1. Loading daily execution status (hashes, timestamps)
  * 2. Loading previous results (for dependencies and history)
  * 3. Caching results for performance
+ * * * UPDATE: Added getRunDates() to support automatic backfill fan-out.
  */
 
 const { BigQuery } = require('@google-cloud/bigquery');
@@ -87,6 +88,37 @@ class StateRepository {
     return statusMap;
   }
 
+  /**
+   * Fetch all dates where a computation has previously run.
+   * Used for fan-out / backfill operations on code deployment.
+   * @param {string} computationName
+   * @returns {Promise<string[]>} List of YYYY-MM-DD strings
+   */
+  async getRunDates(computationName) {
+    try {
+      const table = this.config.resultStore?.table || 'computation_results';
+      const fullTable = `\`${this.config.bigquery.projectId}.${this.config.bigquery.dataset}.${table}\``;
+
+      const query = `
+        SELECT DISTINCT date
+        FROM ${fullTable}
+        WHERE computation_name = @compName
+        ORDER BY date DESC
+      `;
+
+      const [rows] = await this.bigquery.query({
+        query,
+        params: { compName: computationName.toLowerCase() },
+        location: this.config.bigquery.location
+      });
+
+      return rows.map(r => r.date.value || r.date);
+    } catch (e) {
+      this._log('WARN', `Failed to fetch run dates for ${computationName}: ${e.message}`);
+      return [];
+    }
+  }
+
   /**
    * Update the local status cache after a write.
    * @param {string} dateStr
@@ -208,8 +240,6 @@ class StateRepository {
 
   /**
    * Get multiple entity results in a single query (batch lazy load)
-   * FIXED: This solves the N+1 problem by allowing the Executor to fetch dependencies
-   * for an entire processing batch in one go.
   */
   async getBatchEntityResults(dateStr, computationName, entityIds) {
     if (!entityIds || entityIds.length === 0) return {};
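How the new getRunDates() is consumed for the backfill fan-out mentioned in the header is not shown in this diff. A hypothetical consumer sketch, where enqueueRun() and the orchestration wiring are assumptions:

    // Hypothetical sketch; enqueueRun() and the wiring are assumptions.
    async function backfillOnDeploy(stateRepo, computationName, enqueueRun) {
      // One run per historical date; downstream hash checks decide whether a
      // date is actually recomputed or skipped as unchanged.
      const dates = await stateRepo.getRunDates(computationName);
      for (const date of dates) {
        await enqueueRun(computationName, date);
      }
      return dates.length;
    }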
package/functions/computation-system-v2/framework/storage/StorageManager.js +111 -83

@@ -10,7 +10,7 @@
  * * FIX: Switched to bigquery.createJob for GCS imports to prevent local file path interpretation errors.
  * * FIX: Improved error logging to catch swallowed BigQuery insert errors.
  * * FIX: finalizeResults now checks for file existence to prevent "Not found" errors on empty results.
- * * FIX:
+ * * FIX: Removed SAFE.PARSE_JSON from MERGE to match STRING schema types.
  */
 
 const { Firestore } = require('@google-cloud/firestore');
@@ -23,20 +23,20 @@ class StorageManager {
   constructor(config, logger = null) {
     this.config = config;
     this.logger = logger;
-
+
     this.bigquery = new BigQuery({
       projectId: config.bigquery?.projectId,
       location: config.bigquery?.location || 'EU'
     });
-
+
     this.storage = new Storage({
       projectId: config.bigquery?.projectId
     });
-
+
     this._firestore = null;
     this.tableExists = new Map();
   }
-
+
   get firestore() {
     if (!this._firestore) {
       this._firestore = new Firestore({
@@ -52,16 +52,15 @@ class StorageManager {
   */
   async claimZombie(checkpointId) {
     if (!checkpointId) return;
-
-
-    const { projectId, dataset } = this.config.bigquery; //
+
+    const { projectId, dataset } = this.config.bigquery;
 
     const query = `
       UPDATE \`${projectId}.${dataset}.computation_checkpoints\`
       SET last_updated = CURRENT_TIMESTAMP()
       WHERE checkpoint_id = @checkpointId
     `;
-
+
     try {
       await this.bigquery.query({
         query,
@@ -73,8 +72,8 @@ class StorageManager {
     }
   }
 
-
-
+
+
   // =========================================================================
   // RESULT COMMITTING (Batch -> GCS Buffer)
   // =========================================================================
@@ -87,7 +86,7 @@ class StorageManager {
   async commitResults(dateStr, entry, results, depResultHashes = {}) {
     const storageConfig = this._resolveStorageConfig(entry);
     const startTime = Date.now();
-
+
     // Define GCS Task (Fatal on error)
     const gcsTask = async () => {
       if (storageConfig.bigquery === false) return null;
@@ -117,11 +116,11 @@ class StorageManager {
       firestoreTask()
     ]);
 
-    const writeResults = {
-      bigquery: bigqueryResult,
-      firestore: firestoreResult
+    const writeResults = {
+      bigquery: bigqueryResult,
+      firestore: firestoreResult
     };
-
+
     const duration = Date.now() - startTime;
     this._log('INFO', `Committed (Staged) ${entry.name} results in ${duration}ms`);
     return writeResults;
@@ -139,18 +138,18 @@ class StorageManager {
     const table = this.config.resultStore?.table || 'computation_results';
     const bucketName = this.config.gcs?.bucket;
     const prefix = this.config.gcs?.prefix || 'staging';
-
+
     // 1. Define GCS path pattern: gs://bucket/prefix/date/computation/*.json
     const filePrefix = `${prefix}/${dateStr}/${entry.name}/`;
     const gcsPath = `gs://${bucketName}/${filePrefix}*.json`;
-
+
     this._log('INFO', `Finalizing ${entry.name}...`);
 
     try {
       // FIX: Check if files actually exist before trying to load them
       // If the computation produced 0 results, no files exist, and BQ will throw "Not Found".
       const [files] = await this.storage.bucket(bucketName).getFiles({ prefix: filePrefix });
-
+
       if (!files || files.length === 0) {
         this._log('INFO', `No staged files found for ${entry.name}. Skipping finalization (Empty Result).`);
         return;
@@ -162,7 +161,7 @@ class StorageManager {
       // 2. Load GCS files into a Temporary Table
       // We create the temp table with the exact schema we expect first
       await this._createTempTableForLoad(tempTableId);
-
+
       // FIX: Use bigquery.createJob directly.
       const [job] = await this.bigquery.createJob({
         configuration: {
@@ -235,9 +234,9 @@ class StorageManager {
       await this.bigquery.dataset(dataset).table(table).insert([row]);
     } catch (error) {
       if (error.name === 'PartialFailureError' || error.errors) {
-
+        this._log('ERROR', `Checkpoint insert failed: ${JSON.stringify(error.errors)}`);
       } else {
-
+        this._log('ERROR', `Checkpoint insert failed: ${error.message}`);
       }
       throw error;
     }
@@ -349,10 +348,6 @@ class StorageManager {
     const table = 'computation_checkpoints';
     const fullTable = `\`${this.config.bigquery.projectId}.${this.config.bigquery.dataset}.${table}\``;
     try {
-      // FIX: Use subquery with ROW_NUMBER to find the TRUE latest state per computation.
-      // We only count it as a zombie if the LATEST row is 'running'.
-      // This ignores 'running' rows that have a newer (or same-time) 'completed' sibling.
-      // UPDATE: Added attempts to the selection
       const query = `
         SELECT computation_name, date, checkpoint_id, last_updated, attempts
         FROM (
@@ -380,15 +375,15 @@ class StorageManager {
         LIMIT 50
       `;
       const [rows] = await this.bigquery.query({ query, params: { minutes: minutesThreshold }, location: this.config.bigquery.location });
-      return rows.map(r => ({
-        name: r.computation_name,
-        date: r.date.value || r.date,
+      return rows.map(r => ({
+        name: r.computation_name,
+        date: r.date.value || r.date,
         checkpointId: r.checkpoint_id,
-        attempts: r.attempts
+        attempts: r.attempts
       }));
-    } catch (e) {
+    } catch (e) {
       console.error(`[Storage] findZombies failed: ${e.message}`);
-      return [];
+      return [];
     }
   }
 
@@ -407,9 +402,6 @@ class StorageManager {
     const table = 'computation_checkpoints';
     const fullTable = `\`${this.config.bigquery.projectId}.${this.config.bigquery.dataset}.${table}\``;
     try {
-      // FIX: Added Tie-Breaker logic to ORDER BY
-      // If timestamps are identical, 'completed' (1) comes before 'failed' (2) before 'running' (3).
-      // This ensures we never accidentally pick a "running" row when a "completed" one exists at the exact same ms.
       const query = `
         SELECT checkpoint_id, status, processed_count, last_entity_id, completed_batches, worker_instance_id, last_updated, attempts, code_hash, started_at
         FROM ${fullTable}
@@ -425,7 +417,7 @@ class StorageManager {
       `;
       const [rows] = await this.bigquery.query({ query, params: { date: dateStr, computationName }, location: this.config.bigquery.location });
       if (rows.length === 0) return null;
-
+
       const r = rows[0];
       return {
         ...r,
@@ -446,17 +438,17 @@ class StorageManager {
   async savePerformanceReport(report) {
     const table = 'computation_performance';
     const dataset = this.config.bigquery.dataset;
-    await this._ensurePerformanceTable(table);
+    await this._ensurePerformanceTable(table);
 
     const row = {
-      run_id
-      computation_name
+      run_id: report.runId || 'unknown',
+      computation_name: report.computationName,
       date: report.date,
       duration_ms: report.durationMs,
-      metrics: JSON.stringify
+      metrics: JSON.stringify(report.metrics || {}),
       entity_count: report.entityCount || 0,
-      status
-      created_at
+      status: report.status || 'completed',
+      created_at: this.bigquery.timestamp(new Date())
     };
     try {
       await this.bigquery.dataset(dataset).table(table).insert([row]);
@@ -486,29 +478,27 @@ class StorageManager {
   async _stageToGCS(dateStr, entry, results, depResultHashes) {
     const rows = this._buildBigQueryRows(dateStr, entry, results, depResultHashes);
     if (rows.length === 0) return { rowCount: 0 };
-
+
     const bucketName = this.config.gcs?.bucket;
     const prefix = this.config.gcs?.prefix || 'staging';
     const filename = `${prefix}/${dateStr}/${entry.name}/${crypto.randomUUID()}.json`;
-
+
     const file = this.storage.bucket(bucketName).file(filename);
-
+
     const ndjson = rows.map(r => JSON.stringify(r)).join('\n');
-
+
     await file.save(ndjson, {
       contentType: 'application/json',
-      resumable: false
+      resumable: false
     });
-
+
     return { rowCount: rows.length, gcsUri: `gs://${bucketName}/${filename}` };
   }
 
   async _createTempTableForLoad(tableName) {
     const dataset = this.bigquery.dataset(this.config.bigquery.dataset);
     const table = dataset.table(tableName);
-
-    // Note: result_data and dependency_result_hashes are loaded as STRING from the JSON file
-    // They will be parsed into JSON during the merge step.
+
     const schema = [
       { name: 'date', type: 'DATE', mode: 'REQUIRED' },
       { name: 'computation_name', type: 'STRING', mode: 'REQUIRED' },
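_stageToGCS writes newline-delimited JSON, which is the format the subsequent BigQuery load job ingests. An illustration of the staged payload, with invented field values:

    // Illustration of the staged NDJSON payload; values are invented.
    const rows = [
      { date: '2024-05-02', computation_name: 'usermetrics', entity_id: 'u1', result_data: '{"score":0.8}' },
      { date: '2024-05-02', computation_name: 'usermetrics', entity_id: 'u2', result_data: '{"score":0.4}' }
    ];
    const ndjson = rows.map(r => JSON.stringify(r)).join('\n');
    // One JSON object per line, no enclosing array:
    // {"date":"2024-05-02","computation_name":"usermetrics","entity_id":"u1",...}
    // {"date":"2024-05-02","computation_name":"usermetrics","entity_id":"u2",...}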
@@ -528,9 +518,10 @@ class StorageManager {
   async _mergeStagedData(targetTable, tempTable) {
     const fullTarget = `\`${this.config.bigquery.projectId}.${this.config.bigquery.dataset}.${targetTable}\``;
     const fullTemp = `\`${this.config.bigquery.projectId}.${this.config.bigquery.dataset}.${tempTable}\``;
-
+
     await this._ensureBigQueryTable(targetTable);
 
+    // FIX: Removed SAFE.PARSE_JSON() because target columns are STRING.
     const mergeQuery = `
       MERGE INTO ${fullTarget} T
       USING (
@@ -544,31 +535,69 @@ class StorageManager {
       UPDATE SET
         code_hash = S.code_hash,
         result_hash = S.result_hash,
-        dependency_result_hashes =
+        dependency_result_hashes = S.dependency_result_hashes,
         entity_count = S.entity_count,
-        result_data =
+        result_data = S.result_data,
         updated_at = S.updated_at
       WHEN NOT MATCHED THEN
         INSERT (date, computation_name, category, entity_id, code_hash, result_hash,
                 dependency_result_hashes, entity_count, result_data, updated_at)
         VALUES (S.date, S.computation_name, S.category, S.entity_id, S.code_hash, S.result_hash,
-
+                S.dependency_result_hashes, S.entity_count, S.result_data, S.updated_at)
     `;
-
+
     // UPDATE: Use createQueryJob to capture DML statistics
     try {
-      const [job] = await this.bigquery.createQueryJob({
-        query: mergeQuery,
-        location: this.config.bigquery.location
+      const [job] = await this.bigquery.createQueryJob({
+        query: mergeQuery,
+        location: this.config.bigquery.location
       });
-
+
       await job.getQueryResults(); // Wait for completion
-
+
       const metadata = await job.getMetadata();
       const stats = metadata[0]?.statistics?.query;
       const affectedRows = stats?.numDmlAffectedRows;
 
       this._log('INFO', `Merge complete on ${targetTable}. Rows affected (Inserted/Updated): ${affectedRows}`);
+
+      // =========================================================================
+      // CRITICAL FIX: Delete stale entity rows that were NOT in the new staging data.
+      // This prevents ghost entities from previous runs from polluting results.
+      // We identify the date and computation from the temp table and remove any
+      // rows in the target that don't have a matching entity_id in the new run.
+      // =========================================================================
+      const deleteQuery = `
+        DELETE FROM ${fullTarget} T
+        WHERE EXISTS (
+          -- Identify which (date, computation_name) pairs were just processed
+          SELECT 1 FROM ${fullTemp} S
+          WHERE S.date = T.date AND S.computation_name = T.computation_name
+        )
+        AND NOT EXISTS (
+          -- Keep only entity_ids that are in the new staging data
+          SELECT 1 FROM ${fullTemp} S
+          WHERE S.date = T.date
+            AND S.computation_name = T.computation_name
+            AND S.entity_id = T.entity_id
+        )
+      `;
+
+      const [deleteJob] = await this.bigquery.createQueryJob({
+        query: deleteQuery,
+        location: this.config.bigquery.location
+      });
+
+      await deleteJob.getQueryResults();
+
+      const deleteMeta = await deleteJob.getMetadata();
+      const deleteStats = deleteMeta[0]?.statistics?.query;
+      const deletedRows = deleteStats?.numDmlAffectedRows;
+
+      if (deletedRows && parseInt(deletedRows, 10) > 0) {
+        this._log('INFO', `Cleanup: Deleted ${deletedRows} stale entity rows from ${targetTable}`);
+      }
+
     } catch (e) {
       this._logError(`Merge Failed on ${targetTable}`, e);
       throw e;
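Worked example of the cleanup above, with invented entity IDs: if a previous run of (2024-05-02, usermetrics) wrote entity_ids {A, B, C} and a re-run stages only {A, B}, the MERGE updates A and B but never touches C; the DELETE then removes C, because its (date, computation_name) pair was just processed yet it has no matching entity_id in the staging table. Rows for computations or dates outside the staged pairs are untouched, since the first EXISTS clause scopes the delete.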
@@ -587,7 +616,7 @@ class StorageManager {
     const rows = [];
     const timestamp = new Date().toISOString();
     const depResultHashesJson = JSON.stringify(depResultHashes);
-
+
     if (entry.type === 'per-entity' && typeof results === 'object') {
       for (const [entityId, data] of Object.entries(results)) {
         rows.push({
@@ -659,7 +688,7 @@ class StorageManager {
       { name: 'computation_name', type: 'STRING', mode: 'REQUIRED' },
       { name: 'date', type: 'DATE', mode: 'REQUIRED' },
       { name: 'duration_ms', type: 'INTEGER', mode: 'NULLABLE' },
-      { name: 'metrics', type: 'STRING', mode: 'NULLABLE' },
+      { name: 'metrics', type: 'STRING', mode: 'NULLABLE' },
       { name: 'entity_count', type: 'INTEGER', mode: 'NULLABLE' },
       { name: 'status', type: 'STRING', mode: 'NULLABLE' },
       { name: 'created_at', type: 'TIMESTAMP', mode: 'REQUIRED' }
@@ -672,28 +701,28 @@ class StorageManager {
       });
       this._log('INFO', `Created table ${tableName}`);
     }
-
+
     this.tableExists.set(tableName, true);
   }
-
+
   async _ensureCheckpointTable(tableName) {
     if (this.tableExists.get(tableName)) return;
     const dataset = this.bigquery.dataset(this.config.bigquery.dataset);
     const table = dataset.table(tableName);
     const [exists] = await table.exists();
-
+
     const schema = [
       { name: 'date', type: 'DATE', mode: 'REQUIRED' },
       { name: 'computation_name', type: 'STRING', mode: 'REQUIRED' },
       { name: 'checkpoint_id', type: 'STRING', mode: 'REQUIRED' },
       { name: 'worker_instance_id', type: 'STRING', mode: 'NULLABLE' },
-      { name: 'code_hash', type: 'STRING', mode: 'NULLABLE' },
+      { name: 'code_hash', type: 'STRING', mode: 'NULLABLE' },
       { name: 'status', type: 'STRING', mode: 'REQUIRED' },
       { name: 'processed_count', type: 'INTEGER', mode: 'NULLABLE' },
       { name: 'total_entities', type: 'INTEGER', mode: 'NULLABLE' },
       { name: 'last_entity_id', type: 'STRING', mode: 'NULLABLE' },
       { name: 'completed_batches', type: 'INTEGER', mode: 'REPEATED' },
-      { name: 'attempts', type: 'INTEGER', mode: 'NULLABLE' },
+      { name: 'attempts', type: 'INTEGER', mode: 'NULLABLE' },
       { name: 'started_at', type: 'TIMESTAMP', mode: 'REQUIRED' },
       { name: 'last_updated', type: 'TIMESTAMP', mode: 'REQUIRED' }
     ];
@@ -704,38 +733,38 @@ class StorageManager {
         timePartitioning: { type: 'DAY', field: 'date' },
         clustering: { fields: ['computation_name', 'status'] }
       });
-    }
+    }
     this.tableExists.set(tableName, true);
   }
 
   _writeToFirestore(dateStr, entry, results, firestoreConfig) {
     const { path, merge, includeMetadata } = firestoreConfig;
     if (!path) throw new Error(`Firestore path not configured for ${entry.name}`);
-
+
     const timestamp = new Date();
     const metadata = includeMetadata ? {
-      _computedAt: timestamp, _computationDate: dateStr,
+      _computedAt: timestamp, _computationDate: dateStr,
       _computationName: entry.name, _codeHash: entry.hash
     } : {};
-
+
     let docCount = 0;
-
+
     if (entry.type === 'per-entity' && typeof results === 'object') {
       const batches = [];
       let currentBatch = this.firestore.batch();
       let batchCount = 0;
       const MAX_BATCH = 500;
-
+
       for (const [entityId, data] of Object.entries(results)) {
         const docPath = this._resolvePath(path, {
           entityId, date: dateStr, computationName: entry.name, category: entry.category || 'uncategorized'
         });
-
+
         const docRef = this.firestore.doc(docPath);
         const docData = { ...data, ...metadata };
-
+
         merge ? currentBatch.set(docRef, docData, { merge: true }) : currentBatch.set(docRef, docData);
-
+
         batchCount++; docCount++;
         if (batchCount >= MAX_BATCH) {
           batches.push(currentBatch);
@@ -744,10 +773,10 @@ class StorageManager {
         }
       }
       if (batchCount > 0) batches.push(currentBatch);
-
+
       const limit = pLimit(10);
       return Promise.all(batches.map(b => limit(() => b.commit()))).then(() => ({ docCount }));
-
+
     } else {
       const docPath = this._resolvePath(path, {
         entityId: '_global', date: dateStr, computationName: entry.name, category: entry.category || 'uncategorized'
@@ -772,23 +801,22 @@ class StorageManager {
     const str = typeof data === 'string' ? data : JSON.stringify(data);
     return crypto.createHash('md5').update(str).digest('hex').substring(0, 16);
   }
-
+
   _log(level, message) {
     this.logger?.log ? this.logger.log(level, `[StorageManager] ${message}`) : console.log(`[${level}] [StorageManager] ${message}`);
   }
 
   _logError(context, error) {
-    // Safe logging for BigQuery PartialFailureError which hides details in .errors
     let details = error.message;
     if (error.errors && Array.isArray(error.errors)) {
       details = JSON.stringify(error.errors, null, 2);
     } else if (error.response && error.response.insertErrors) {
      details = JSON.stringify(error.response.insertErrors, null, 2);
     }
-
+
     this._log('ERROR', `${context}: ${details}`);
   }
-
+
 }
 
 module.exports = { StorageManager };
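Taken together, the visible methods suggest a two-phase write path: commitResults stages NDJSON to GCS, and finalizeResults loads, merges, and cleans up. A usage sketch under assumptions — the config shape and the finalize signature are inferred from the hunks above, not confirmed by this diff:

    // Sketch only: argument order for finalizeResults is inferred from the
    // dateStr/entry variables used inside its hunk.
    const { StorageManager } = require('./framework/storage/StorageManager');

    async function persist(config, entry, dateStr, results, logger) {
      const storage = new StorageManager(config, logger);
      // Phase 1: stage results as NDJSON in GCS (and optionally Firestore).
      await storage.commitResults(dateStr, entry, results);
      // Phase 2: load staged files into a temp table, MERGE into the results
      // table, then delete stale entity rows from superseded runs.
      await storage.finalizeResults(dateStr, entry);
    }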