@cascade-flow/backend-postgres 0.2.17 → 0.2.19
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/db.d.ts +30 -4
- package/dist/db.d.ts.map +1 -1
- package/dist/index.d.ts +2 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +320 -224
- package/dist/index.js.map +5 -5
- package/dist/migrations.d.ts.map +1 -1
- package/package.json +2 -2
package/dist/index.js
CHANGED
|
@@ -5125,13 +5125,15 @@ class DatabaseClient {
|
|
|
5125
5125
|
client.release();
|
|
5126
5126
|
}
|
|
5127
5127
|
}
|
|
5128
|
-
async claimScheduledStep(workflowSlug, runId, stepId,
|
|
5128
|
+
async claimScheduledStep(workflowSlug, runId, stepId, eventToWrite) {
|
|
5129
5129
|
const client = await this.pool.connect();
|
|
5130
5130
|
try {
|
|
5131
5131
|
await client.query("BEGIN");
|
|
5132
|
+
const currentTimeUs = Date.now() * 1000;
|
|
5132
5133
|
const checkQuery = `
|
|
5133
|
-
SELECT event_data FROM ${this.schema}.step_events
|
|
5134
|
+
SELECT event_data, attempt_number, available_at_us FROM ${this.schema}.step_events
|
|
5134
5135
|
WHERE workflow_slug = $1 AND run_id = $2 AND step_id = $3
|
|
5136
|
+
AND type NOT IN ('LogEntry', 'StepCheckpoint', 'StepCheckpointFailed')
|
|
5135
5137
|
ORDER BY timestamp_us DESC, event_id DESC
|
|
5136
5138
|
LIMIT 1
|
|
5137
5139
|
FOR UPDATE SKIP LOCKED
|
|
@@ -5139,14 +5141,22 @@ class DatabaseClient {
|
|
|
5139
5141
|
const checkResult = await client.query(checkQuery, [workflowSlug, runId, stepId]);
|
|
5140
5142
|
if (checkResult.rows.length === 0) {
|
|
5141
5143
|
await client.query("ROLLBACK");
|
|
5142
|
-
return
|
|
5144
|
+
return null;
|
|
5143
5145
|
}
|
|
5144
|
-
const
|
|
5146
|
+
const latestRow = checkResult.rows[0];
|
|
5147
|
+
const latestEvent = latestRow.event_data;
|
|
5145
5148
|
if (latestEvent.type !== "StepScheduled" && latestEvent.type !== "StepReclaimed" && latestEvent.type !== "StepRetrying") {
|
|
5146
5149
|
await client.query("ROLLBACK");
|
|
5147
|
-
return
|
|
5150
|
+
return null;
|
|
5151
|
+
}
|
|
5152
|
+
const availableAtUs = latestRow.available_at_us ?? latestEvent.availableAtUs ?? null;
|
|
5153
|
+
if (availableAtUs !== null && availableAtUs > currentTimeUs) {
|
|
5154
|
+
await client.query("ROLLBACK");
|
|
5155
|
+
return null;
|
|
5148
5156
|
}
|
|
5149
|
-
|
|
5157
|
+
const resolvedAttemptNumber = latestRow.attempt_number ?? latestEvent.attemptNumber ?? (eventToWrite.type === "StepStarted" ? eventToWrite.attemptNumber : null) ?? 1;
|
|
5158
|
+
const eventPayload = eventToWrite.type === "StepStarted" ? { ...eventToWrite, attemptNumber: resolvedAttemptNumber } : eventToWrite;
|
|
5159
|
+
let eventWorkerId = null;
|
|
5150
5160
|
let attemptNumber = null;
|
|
5151
5161
|
let slotIndex = null;
|
|
5152
5162
|
let workerConcurrency = null;
|
|
@@ -5155,18 +5165,18 @@ class DatabaseClient {
|
|
|
5155
5165
|
let errorStackExactHash = "";
|
|
5156
5166
|
let errorStackNormalizedHash = "";
|
|
5157
5167
|
let errorStackPortableHash = "";
|
|
5158
|
-
if (
|
|
5159
|
-
|
|
5160
|
-
attemptNumber =
|
|
5161
|
-
slotIndex =
|
|
5162
|
-
workerConcurrency =
|
|
5163
|
-
}
|
|
5164
|
-
if (
|
|
5165
|
-
errorNameHash =
|
|
5166
|
-
errorMessageHash =
|
|
5167
|
-
errorStackExactHash =
|
|
5168
|
-
errorStackNormalizedHash =
|
|
5169
|
-
errorStackPortableHash =
|
|
5168
|
+
if (eventPayload.type === "StepStarted") {
|
|
5169
|
+
eventWorkerId = eventPayload.workerId;
|
|
5170
|
+
attemptNumber = eventPayload.attemptNumber;
|
|
5171
|
+
slotIndex = eventPayload.slotIndex ?? null;
|
|
5172
|
+
workerConcurrency = eventPayload.workerConcurrency ?? null;
|
|
5173
|
+
}
|
|
5174
|
+
if (eventPayload.type === "StepFailed") {
|
|
5175
|
+
errorNameHash = eventPayload.errorFingerprints.nameHash;
|
|
5176
|
+
errorMessageHash = eventPayload.errorFingerprints.messageHash;
|
|
5177
|
+
errorStackExactHash = eventPayload.errorFingerprints.stackExactHash;
|
|
5178
|
+
errorStackNormalizedHash = eventPayload.errorFingerprints.stackNormalizedHash;
|
|
5179
|
+
errorStackPortableHash = eventPayload.errorFingerprints.stackPortableHash;
|
|
5170
5180
|
}
|
|
5171
5181
|
const versionResult = await client.query(`SELECT version_id FROM ${this.schema}.workflow_events
|
|
5172
5182
|
WHERE workflow_slug = $1 AND run_id = $2
|
|
@@ -5183,15 +5193,15 @@ class DatabaseClient {
|
|
|
5183
5193
|
slot_index, worker_concurrency
|
|
5184
5194
|
)
|
|
5185
5195
|
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20)`, [
|
|
5186
|
-
|
|
5187
|
-
|
|
5188
|
-
|
|
5189
|
-
|
|
5190
|
-
|
|
5191
|
-
|
|
5192
|
-
|
|
5193
|
-
JSON.stringify(stripEventIdFromJson(
|
|
5194
|
-
|
|
5196
|
+
eventPayload.eventId,
|
|
5197
|
+
eventPayload.workflowSlug,
|
|
5198
|
+
eventPayload.runId,
|
|
5199
|
+
eventPayload.stepId,
|
|
5200
|
+
eventPayload.timestampUs,
|
|
5201
|
+
eventPayload.category,
|
|
5202
|
+
eventPayload.type,
|
|
5203
|
+
JSON.stringify(stripEventIdFromJson(eventPayload)),
|
|
5204
|
+
eventWorkerId,
|
|
5195
5205
|
attemptNumber,
|
|
5196
5206
|
null,
|
|
5197
5207
|
null,
|
|
@@ -5205,7 +5215,7 @@ class DatabaseClient {
|
|
|
5205
5215
|
workerConcurrency
|
|
5206
5216
|
]);
|
|
5207
5217
|
await client.query("COMMIT");
|
|
5208
|
-
return
|
|
5218
|
+
return resolvedAttemptNumber;
|
|
5209
5219
|
} catch (error) {
|
|
5210
5220
|
await client.query("ROLLBACK");
|
|
5211
5221
|
throw error;
|
|
@@ -5259,21 +5269,25 @@ class DatabaseClient {
|
|
|
5259
5269
|
const query = `
|
|
5260
5270
|
WITH latest_step_events AS (
|
|
5261
5271
|
SELECT DISTINCT ON (workflow_slug, run_id, step_id)
|
|
5262
|
-
workflow_slug, run_id, step_id, type, timestamp_us, worker_id
|
|
5272
|
+
workflow_slug, run_id, step_id, type, timestamp_us, worker_id, attempt_number
|
|
5263
5273
|
FROM ${this.schema}.step_events
|
|
5264
|
-
WHERE type IN ('
|
|
5274
|
+
WHERE type NOT IN ('LogEntry', 'StepCheckpoint', 'StepCheckpointFailed')
|
|
5265
5275
|
ORDER BY workflow_slug, run_id, step_id, timestamp_us DESC, event_id DESC
|
|
5266
5276
|
)
|
|
5267
|
-
SELECT workflow_slug, run_id, step_id, worker_id
|
|
5277
|
+
SELECT workflow_slug, run_id, step_id, worker_id, attempt_number, timestamp_us
|
|
5268
5278
|
FROM latest_step_events
|
|
5269
|
-
WHERE
|
|
5279
|
+
WHERE type IN ('StepStarted', 'StepHeartbeat')
|
|
5280
|
+
AND timestamp_us < $1
|
|
5281
|
+
AND worker_id IS NOT NULL
|
|
5270
5282
|
`;
|
|
5271
5283
|
const result = await client.query(query, [currentTimeUs - staleThresholdUs]);
|
|
5272
5284
|
return result.rows.map((row) => ({
|
|
5273
5285
|
workflowSlug: row.workflow_slug,
|
|
5274
5286
|
runId: row.run_id,
|
|
5275
5287
|
stepId: row.step_id,
|
|
5276
|
-
workerId: row.worker_id
|
|
5288
|
+
workerId: row.worker_id,
|
|
5289
|
+
attemptNumber: row.attempt_number ?? 1,
|
|
5290
|
+
lastHeartbeatUs: Number(row.timestamp_us)
|
|
5277
5291
|
}));
|
|
5278
5292
|
} finally {
|
|
5279
5293
|
client.release();
|
|
@@ -5630,6 +5644,15 @@ class DatabaseClient {
|
|
|
5630
5644
|
type
|
|
5631
5645
|
FROM ${this.schema}.workflow_events
|
|
5632
5646
|
WHERE ($1::text IS NULL OR workflow_slug = $1)
|
|
5647
|
+
AND type IN (
|
|
5648
|
+
'RunSubmitted',
|
|
5649
|
+
'WorkflowRetryStarted',
|
|
5650
|
+
'WorkflowStarted',
|
|
5651
|
+
'WorkflowResumed',
|
|
5652
|
+
'WorkflowCompleted',
|
|
5653
|
+
'WorkflowFailed',
|
|
5654
|
+
'WorkflowCancelled'
|
|
5655
|
+
)
|
|
5633
5656
|
ORDER BY workflow_slug, run_id, timestamp_us DESC, event_id DESC
|
|
5634
5657
|
)
|
|
5635
5658
|
SELECT
|
|
@@ -5767,90 +5790,33 @@ class DatabaseClient {
|
|
|
5767
5790
|
client.release();
|
|
5768
5791
|
}
|
|
5769
5792
|
}
|
|
5770
|
-
async getActiveWorkersAggregation(
|
|
5793
|
+
async getActiveWorkersAggregation(_options) {
|
|
5771
5794
|
const client = await this.pool.connect();
|
|
5772
5795
|
try {
|
|
5773
|
-
const
|
|
5774
|
-
|
|
5775
|
-
|
|
5776
|
-
|
|
5777
|
-
|
|
5778
|
-
|
|
5779
|
-
|
|
5780
|
-
|
|
5781
|
-
worker_id,
|
|
5782
|
-
MAX(timestamp_us) AS last_seen_us
|
|
5783
|
-
FROM ${this.schema}.step_events
|
|
5784
|
-
WHERE worker_id IS NOT NULL
|
|
5785
|
-
AND type IN ('StepStarted', 'StepHeartbeat')
|
|
5786
|
-
${options?.timeRange ? `AND timestamp_us >= $1 AND timestamp_us <= $2` : ""}
|
|
5787
|
-
GROUP BY worker_id
|
|
5788
|
-
),
|
|
5789
|
-
-- Steps started by each worker
|
|
5790
|
-
steps_started AS (
|
|
5791
|
-
SELECT DISTINCT
|
|
5792
|
-
se.worker_id,
|
|
5793
|
-
se.workflow_slug,
|
|
5794
|
-
se.run_id,
|
|
5795
|
-
se.step_id,
|
|
5796
|
-
se.attempt_number
|
|
5797
|
-
FROM ${this.schema}.step_events se
|
|
5798
|
-
WHERE se.type = 'StepStarted'
|
|
5799
|
-
AND se.worker_id IS NOT NULL
|
|
5800
|
-
),
|
|
5801
|
-
-- Steps completed
|
|
5802
|
-
completed_steps AS (
|
|
5803
|
-
SELECT
|
|
5804
|
-
ss.worker_id,
|
|
5805
|
-
COUNT(*) AS completed_count
|
|
5806
|
-
FROM steps_started ss
|
|
5807
|
-
INNER JOIN ${this.schema}.step_events se
|
|
5808
|
-
ON se.workflow_slug = ss.workflow_slug
|
|
5809
|
-
AND se.run_id = ss.run_id
|
|
5810
|
-
AND se.step_id = ss.step_id
|
|
5811
|
-
AND se.attempt_number = ss.attempt_number
|
|
5812
|
-
AND se.type = 'StepCompleted'
|
|
5813
|
-
GROUP BY ss.worker_id
|
|
5814
|
-
),
|
|
5815
|
-
-- Steps failed
|
|
5816
|
-
failed_steps AS (
|
|
5817
|
-
SELECT
|
|
5818
|
-
ss.worker_id,
|
|
5819
|
-
COUNT(*) AS failed_count
|
|
5820
|
-
FROM steps_started ss
|
|
5821
|
-
INNER JOIN ${this.schema}.step_events se
|
|
5822
|
-
ON se.workflow_slug = ss.workflow_slug
|
|
5823
|
-
AND se.run_id = ss.run_id
|
|
5824
|
-
AND se.step_id = ss.step_id
|
|
5825
|
-
AND se.attempt_number = ss.attempt_number
|
|
5826
|
-
AND se.type = 'StepFailed'
|
|
5827
|
-
GROUP BY ss.worker_id
|
|
5828
|
-
),
|
|
5829
|
-
-- Reclamation counts (times this worker's steps were reclaimed)
|
|
5830
|
-
reclaimed_counts AS (
|
|
5831
|
-
SELECT
|
|
5832
|
-
(event_data->>'originalWorkerId') AS worker_id,
|
|
5833
|
-
COUNT(*) AS reclaimed_count
|
|
5834
|
-
FROM ${this.schema}.step_events
|
|
5835
|
-
WHERE type = 'StepReclaimed'
|
|
5836
|
-
GROUP BY (event_data->>'originalWorkerId')
|
|
5796
|
+
const runningRunsResult = await client.query(`
|
|
5797
|
+
WITH latest_workflow_events AS (
|
|
5798
|
+
SELECT DISTINCT ON (workflow_slug, run_id)
|
|
5799
|
+
workflow_slug,
|
|
5800
|
+
run_id,
|
|
5801
|
+
type
|
|
5802
|
+
FROM ${this.schema}.workflow_events
|
|
5803
|
+
ORDER BY workflow_slug, run_id, timestamp_us DESC, event_id DESC
|
|
5837
5804
|
)
|
|
5838
|
-
SELECT
|
|
5839
|
-
|
|
5840
|
-
|
|
5841
|
-
|
|
5842
|
-
|
|
5843
|
-
|
|
5844
|
-
|
|
5845
|
-
|
|
5846
|
-
|
|
5847
|
-
|
|
5848
|
-
|
|
5849
|
-
|
|
5850
|
-
|
|
5805
|
+
SELECT workflow_slug, run_id
|
|
5806
|
+
FROM latest_workflow_events
|
|
5807
|
+
WHERE type IN ('WorkflowStarted', 'WorkflowResumed')
|
|
5808
|
+
LIMIT 500
|
|
5809
|
+
`);
|
|
5810
|
+
if (runningRunsResult.rows.length === 0) {
|
|
5811
|
+
return {
|
|
5812
|
+
workers: [],
|
|
5813
|
+
totalActiveWorkers: 0,
|
|
5814
|
+
totalRunningSteps: 0
|
|
5815
|
+
};
|
|
5816
|
+
}
|
|
5817
|
+
const runIds = runningRunsResult.rows.map((r) => r.run_id);
|
|
5851
5818
|
const runningStepsResult = await client.query(`
|
|
5852
5819
|
WITH latest_step_events AS (
|
|
5853
|
-
-- Get the latest event per step (excluding LogEntry)
|
|
5854
5820
|
SELECT DISTINCT ON (workflow_slug, run_id, step_id)
|
|
5855
5821
|
workflow_slug,
|
|
5856
5822
|
run_id,
|
|
@@ -5861,78 +5827,24 @@ class DatabaseClient {
|
|
|
5861
5827
|
slot_index,
|
|
5862
5828
|
worker_concurrency
|
|
5863
5829
|
FROM ${this.schema}.step_events
|
|
5864
|
-
WHERE
|
|
5830
|
+
WHERE run_id = ANY($1)
|
|
5831
|
+
AND type NOT IN ('LogEntry', 'StepCheckpoint', 'StepCheckpointFailed')
|
|
5865
5832
|
ORDER BY workflow_slug, run_id, step_id, timestamp_us DESC, event_id DESC
|
|
5866
|
-
),
|
|
5867
|
-
-- Steps currently running (latest event is StepStarted or StepHeartbeat)
|
|
5868
|
-
running_steps AS (
|
|
5869
|
-
SELECT
|
|
5870
|
-
workflow_slug,
|
|
5871
|
-
run_id,
|
|
5872
|
-
step_id,
|
|
5873
|
-
worker_id,
|
|
5874
|
-
slot_index,
|
|
5875
|
-
worker_concurrency
|
|
5876
|
-
FROM latest_step_events
|
|
5877
|
-
WHERE type IN ('StepStarted', 'StepHeartbeat')
|
|
5878
|
-
AND worker_id IS NOT NULL
|
|
5879
|
-
),
|
|
5880
|
-
-- Get start time and last heartbeat for each running step
|
|
5881
|
-
step_times AS (
|
|
5882
|
-
SELECT
|
|
5883
|
-
rs.workflow_slug,
|
|
5884
|
-
rs.run_id,
|
|
5885
|
-
rs.step_id,
|
|
5886
|
-
rs.worker_id,
|
|
5887
|
-
rs.slot_index,
|
|
5888
|
-
rs.worker_concurrency,
|
|
5889
|
-
MIN(se.timestamp_us) FILTER (WHERE se.type = 'StepStarted') AS started_at_us,
|
|
5890
|
-
MAX(se.timestamp_us) FILTER (WHERE se.type IN ('StepStarted', 'StepHeartbeat')) AS last_heartbeat_us
|
|
5891
|
-
FROM running_steps rs
|
|
5892
|
-
INNER JOIN ${this.schema}.step_events se
|
|
5893
|
-
ON se.workflow_slug = rs.workflow_slug
|
|
5894
|
-
AND se.run_id = rs.run_id
|
|
5895
|
-
AND se.step_id = rs.step_id
|
|
5896
|
-
AND se.type IN ('StepStarted', 'StepHeartbeat')
|
|
5897
|
-
GROUP BY rs.workflow_slug, rs.run_id, rs.step_id, rs.worker_id, rs.slot_index, rs.worker_concurrency
|
|
5898
5833
|
)
|
|
5899
5834
|
SELECT
|
|
5900
5835
|
worker_id,
|
|
5901
5836
|
workflow_slug,
|
|
5902
5837
|
run_id,
|
|
5903
5838
|
step_id,
|
|
5904
|
-
|
|
5905
|
-
last_heartbeat_us,
|
|
5839
|
+
timestamp_us as last_heartbeat_us,
|
|
5906
5840
|
slot_index,
|
|
5907
5841
|
worker_concurrency
|
|
5908
|
-
FROM
|
|
5909
|
-
|
|
5910
|
-
|
|
5911
|
-
|
|
5912
|
-
|
|
5913
|
-
worker_id,
|
|
5914
|
-
worker_concurrency
|
|
5915
|
-
FROM ${this.schema}.step_events
|
|
5916
|
-
WHERE worker_id IS NOT NULL
|
|
5917
|
-
AND worker_concurrency IS NOT NULL
|
|
5918
|
-
AND type IN ('StepStarted', 'StepHeartbeat')
|
|
5919
|
-
ORDER BY worker_id, timestamp_us DESC
|
|
5920
|
-
`);
|
|
5921
|
-
const workerConcurrencyMap = new Map;
|
|
5922
|
-
for (const row of workerConcurrencyResult.rows) {
|
|
5923
|
-
workerConcurrencyMap.set(row.worker_id, row.worker_concurrency);
|
|
5924
|
-
}
|
|
5842
|
+
FROM latest_step_events
|
|
5843
|
+
WHERE type IN ('StepStarted', 'StepHeartbeat')
|
|
5844
|
+
AND worker_id IS NOT NULL
|
|
5845
|
+
LIMIT 1000
|
|
5846
|
+
`, [runIds]);
|
|
5925
5847
|
const workerMap = new Map;
|
|
5926
|
-
for (const row of workerStatsResult.rows) {
|
|
5927
|
-
workerMap.set(row.worker_id, {
|
|
5928
|
-
workerId: row.worker_id,
|
|
5929
|
-
lastSeenUs: parseInt(row.last_seen_us, 10),
|
|
5930
|
-
totalStepsProcessed: parseInt(row.total_steps_processed, 10),
|
|
5931
|
-
failedSteps: parseInt(row.failed_steps, 10),
|
|
5932
|
-
reclaimedFromCount: parseInt(row.reclaimed_from_count, 10),
|
|
5933
|
-
activeSteps: []
|
|
5934
|
-
});
|
|
5935
|
-
}
|
|
5936
5848
|
for (const row of runningStepsResult.rows) {
|
|
5937
5849
|
let worker = workerMap.get(row.worker_id);
|
|
5938
5850
|
if (!worker) {
|
|
@@ -5942,10 +5854,15 @@ class DatabaseClient {
|
|
|
5942
5854
|
totalStepsProcessed: 0,
|
|
5943
5855
|
failedSteps: 0,
|
|
5944
5856
|
reclaimedFromCount: 0,
|
|
5857
|
+
workerConcurrency: row.worker_concurrency ?? undefined,
|
|
5945
5858
|
activeSteps: []
|
|
5946
5859
|
};
|
|
5947
5860
|
workerMap.set(row.worker_id, worker);
|
|
5948
5861
|
}
|
|
5862
|
+
const lastHeartbeatUs = parseInt(row.last_heartbeat_us, 10);
|
|
5863
|
+
if (lastHeartbeatUs > worker.lastSeenUs) {
|
|
5864
|
+
worker.lastSeenUs = lastHeartbeatUs;
|
|
5865
|
+
}
|
|
5949
5866
|
if (row.worker_concurrency != null && worker.workerConcurrency == null) {
|
|
5950
5867
|
worker.workerConcurrency = row.worker_concurrency;
|
|
5951
5868
|
}
|
|
@@ -5953,19 +5870,11 @@ class DatabaseClient {
|
|
|
5953
5870
|
workflowSlug: row.workflow_slug,
|
|
5954
5871
|
runId: row.run_id,
|
|
5955
5872
|
stepId: row.step_id,
|
|
5956
|
-
startedAtUs:
|
|
5957
|
-
lastHeartbeatUs
|
|
5873
|
+
startedAtUs: lastHeartbeatUs,
|
|
5874
|
+
lastHeartbeatUs,
|
|
5958
5875
|
slotIndex: row.slot_index ?? undefined
|
|
5959
5876
|
});
|
|
5960
5877
|
}
|
|
5961
|
-
for (const worker of workerMap.values()) {
|
|
5962
|
-
if (worker.workerConcurrency == null) {
|
|
5963
|
-
const historicalConcurrency = workerConcurrencyMap.get(worker.workerId);
|
|
5964
|
-
if (historicalConcurrency != null) {
|
|
5965
|
-
worker.workerConcurrency = historicalConcurrency;
|
|
5966
|
-
}
|
|
5967
|
-
}
|
|
5968
|
-
}
|
|
5969
5878
|
const workers = Array.from(workerMap.values()).sort((a, b) => {
|
|
5970
5879
|
if (b.activeSteps.length !== a.activeSteps.length) {
|
|
5971
5880
|
return b.activeSteps.length - a.activeSteps.length;
|
|
@@ -5973,7 +5882,7 @@ class DatabaseClient {
|
|
|
5973
5882
|
return b.lastSeenUs - a.lastSeenUs;
|
|
5974
5883
|
});
|
|
5975
5884
|
const totalRunningSteps = workers.reduce((sum, w) => sum + w.activeSteps.length, 0);
|
|
5976
|
-
const totalActiveWorkers = workers.
|
|
5885
|
+
const totalActiveWorkers = workers.length;
|
|
5977
5886
|
return {
|
|
5978
5887
|
workers,
|
|
5979
5888
|
totalActiveWorkers,
|
|
@@ -5983,6 +5892,112 @@ class DatabaseClient {
|
|
|
5983
5892
|
client.release();
|
|
5984
5893
|
}
|
|
5985
5894
|
}
|
|
5895
|
+
async getWorkerById(workerId) {
|
|
5896
|
+
const client = await this.pool.connect();
|
|
5897
|
+
try {
|
|
5898
|
+
const result = await client.query(`
|
|
5899
|
+
WITH worker_step_events AS (
|
|
5900
|
+
-- Get all step events for this worker
|
|
5901
|
+
SELECT
|
|
5902
|
+
workflow_slug,
|
|
5903
|
+
run_id,
|
|
5904
|
+
step_id,
|
|
5905
|
+
type,
|
|
5906
|
+
timestamp_us,
|
|
5907
|
+
slot_index,
|
|
5908
|
+
worker_concurrency
|
|
5909
|
+
FROM ${this.schema}.step_events
|
|
5910
|
+
WHERE worker_id = $1
|
|
5911
|
+
AND type IN ('StepStarted', 'StepHeartbeat')
|
|
5912
|
+
),
|
|
5913
|
+
latest_per_step AS (
|
|
5914
|
+
-- For each step this worker touched, get the latest event
|
|
5915
|
+
SELECT DISTINCT ON (workflow_slug, run_id, step_id)
|
|
5916
|
+
workflow_slug,
|
|
5917
|
+
run_id,
|
|
5918
|
+
step_id,
|
|
5919
|
+
timestamp_us,
|
|
5920
|
+
slot_index,
|
|
5921
|
+
worker_concurrency
|
|
5922
|
+
FROM worker_step_events
|
|
5923
|
+
ORDER BY workflow_slug, run_id, step_id, timestamp_us DESC
|
|
5924
|
+
),
|
|
5925
|
+
-- Check if these steps are still running (no completion/failure after our heartbeat)
|
|
5926
|
+
still_running AS (
|
|
5927
|
+
SELECT
|
|
5928
|
+
lps.workflow_slug,
|
|
5929
|
+
lps.run_id,
|
|
5930
|
+
lps.step_id,
|
|
5931
|
+
lps.timestamp_us as last_heartbeat_us,
|
|
5932
|
+
lps.slot_index,
|
|
5933
|
+
lps.worker_concurrency
|
|
5934
|
+
FROM latest_per_step lps
|
|
5935
|
+
WHERE NOT EXISTS (
|
|
5936
|
+
SELECT 1 FROM ${this.schema}.step_events se
|
|
5937
|
+
WHERE se.workflow_slug = lps.workflow_slug
|
|
5938
|
+
AND se.run_id = lps.run_id
|
|
5939
|
+
AND se.step_id = lps.step_id
|
|
5940
|
+
AND se.timestamp_us > lps.timestamp_us
|
|
5941
|
+
AND se.type IN ('StepCompleted', 'StepFailed', 'StepSkipped', 'StepReclaimed')
|
|
5942
|
+
)
|
|
5943
|
+
)
|
|
5944
|
+
SELECT * FROM still_running
|
|
5945
|
+
ORDER BY last_heartbeat_us DESC
|
|
5946
|
+
LIMIT 100
|
|
5947
|
+
`, [workerId]);
|
|
5948
|
+
if (result.rows.length === 0) {
|
|
5949
|
+
const lastSeenResult = await client.query(`
|
|
5950
|
+
SELECT MAX(timestamp_us) as last_seen_us, MAX(worker_concurrency) as worker_concurrency
|
|
5951
|
+
FROM ${this.schema}.step_events
|
|
5952
|
+
WHERE worker_id = $1
|
|
5953
|
+
AND type IN ('StepStarted', 'StepHeartbeat')
|
|
5954
|
+
`, [workerId]);
|
|
5955
|
+
if (!lastSeenResult.rows[0]?.last_seen_us) {
|
|
5956
|
+
return null;
|
|
5957
|
+
}
|
|
5958
|
+
return {
|
|
5959
|
+
workerId,
|
|
5960
|
+
lastSeenUs: parseInt(lastSeenResult.rows[0].last_seen_us, 10),
|
|
5961
|
+
totalStepsProcessed: 0,
|
|
5962
|
+
failedSteps: 0,
|
|
5963
|
+
reclaimedFromCount: 0,
|
|
5964
|
+
workerConcurrency: lastSeenResult.rows[0].worker_concurrency ?? undefined,
|
|
5965
|
+
activeSteps: []
|
|
5966
|
+
};
|
|
5967
|
+
}
|
|
5968
|
+
let lastSeenUs = 0;
|
|
5969
|
+
let workerConcurrency;
|
|
5970
|
+
const activeSteps = [];
|
|
5971
|
+
for (const row of result.rows) {
|
|
5972
|
+
const heartbeatUs = parseInt(row.last_heartbeat_us, 10);
|
|
5973
|
+
if (heartbeatUs > lastSeenUs) {
|
|
5974
|
+
lastSeenUs = heartbeatUs;
|
|
5975
|
+
}
|
|
5976
|
+
if (row.worker_concurrency != null && workerConcurrency == null) {
|
|
5977
|
+
workerConcurrency = row.worker_concurrency;
|
|
5978
|
+
}
|
|
5979
|
+
activeSteps.push({
|
|
5980
|
+
workflowSlug: row.workflow_slug,
|
|
5981
|
+
runId: row.run_id,
|
|
5982
|
+
stepId: row.step_id,
|
|
5983
|
+
startedAtUs: heartbeatUs,
|
|
5984
|
+
lastHeartbeatUs: heartbeatUs,
|
|
5985
|
+
slotIndex: row.slot_index ?? undefined
|
|
5986
|
+
});
|
|
5987
|
+
}
|
|
5988
|
+
return {
|
|
5989
|
+
workerId,
|
|
5990
|
+
lastSeenUs,
|
|
5991
|
+
totalStepsProcessed: 0,
|
|
5992
|
+
failedSteps: 0,
|
|
5993
|
+
reclaimedFromCount: 0,
|
|
5994
|
+
workerConcurrency,
|
|
5995
|
+
activeSteps
|
|
5996
|
+
};
|
|
5997
|
+
} finally {
|
|
5998
|
+
client.release();
|
|
5999
|
+
}
|
|
6000
|
+
}
|
|
5986
6001
|
}
|
|
5987
6002
|
function createPool(connectionString) {
|
|
5988
6003
|
return new Pool2({ connectionString });
|
|
@@ -6384,6 +6399,87 @@ async function migration011_addWorkerConcurrencyIndex(pool, schema) {
|
|
|
6384
6399
|
client.release();
|
|
6385
6400
|
}
|
|
6386
6401
|
}
|
|
6402
|
+
async function migration012_addWorkerAnalyticsIndexes(pool, schema) {
|
|
6403
|
+
const client = await pool.connect();
|
|
6404
|
+
try {
|
|
6405
|
+
await client.query(`
|
|
6406
|
+
CREATE INDEX IF NOT EXISTS idx_step_events_worker_activity
|
|
6407
|
+
ON ${schema}.step_events (timestamp_us, type, worker_id)
|
|
6408
|
+
WHERE worker_id IS NOT NULL
|
|
6409
|
+
AND type IN ('StepStarted', 'StepHeartbeat')
|
|
6410
|
+
`);
|
|
6411
|
+
await client.query(`
|
|
6412
|
+
CREATE INDEX IF NOT EXISTS idx_step_events_recent_by_step
|
|
6413
|
+
ON ${schema}.step_events (timestamp_us DESC, workflow_slug, run_id, step_id, type, event_id DESC)
|
|
6414
|
+
WHERE type NOT IN ('LogEntry', 'StepCheckpoint', 'StepCheckpointFailed')
|
|
6415
|
+
`);
|
|
6416
|
+
console.log("[Migration 012] Worker analytics indexes added successfully");
|
|
6417
|
+
} catch (error) {
|
|
6418
|
+
console.error("[Migration 012] Error adding worker analytics indexes:", error);
|
|
6419
|
+
throw error;
|
|
6420
|
+
} finally {
|
|
6421
|
+
client.release();
|
|
6422
|
+
}
|
|
6423
|
+
}
|
|
6424
|
+
async function migration013_addWorkersObservabilityIndexes(pool, schema) {
|
|
6425
|
+
const client = await pool.connect();
|
|
6426
|
+
try {
|
|
6427
|
+
await client.query(`
|
|
6428
|
+
CREATE INDEX IF NOT EXISTS idx_workflow_events_run_status
|
|
6429
|
+
ON ${schema}.workflow_events (run_id, timestamp_us DESC, event_id DESC)
|
|
6430
|
+
INCLUDE (workflow_slug, type)
|
|
6431
|
+
`);
|
|
6432
|
+
await client.query(`
|
|
6433
|
+
CREATE INDEX IF NOT EXISTS idx_step_events_by_run
|
|
6434
|
+
ON ${schema}.step_events (run_id, workflow_slug, step_id, timestamp_us DESC, event_id DESC)
|
|
6435
|
+
INCLUDE (type, worker_id, slot_index, worker_concurrency)
|
|
6436
|
+
WHERE type NOT IN ('LogEntry', 'StepCheckpoint', 'StepCheckpointFailed')
|
|
6437
|
+
`);
|
|
6438
|
+
console.log("[Migration 013] Workers observability indexes added successfully");
|
|
6439
|
+
} catch (error) {
|
|
6440
|
+
console.error("[Migration 013] Error adding workers observability indexes:", error);
|
|
6441
|
+
throw error;
|
|
6442
|
+
} finally {
|
|
6443
|
+
client.release();
|
|
6444
|
+
}
|
|
6445
|
+
}
|
|
6446
|
+
async function migration014_addWorkerHotPathIndexes(pool, schema) {
|
|
6447
|
+
const client = await pool.connect();
|
|
6448
|
+
try {
|
|
6449
|
+
await client.query(`
|
|
6450
|
+
CREATE INDEX IF NOT EXISTS idx_step_events_latest_cover
|
|
6451
|
+
ON ${schema}.step_events (workflow_slug, run_id, step_id, timestamp_us DESC, event_id DESC)
|
|
6452
|
+
INCLUDE (type, available_at_us, worker_id, attempt_number, priority)
|
|
6453
|
+
`);
|
|
6454
|
+
await client.query(`
|
|
6455
|
+
CREATE INDEX IF NOT EXISTS idx_step_events_terminal_latest
|
|
6456
|
+
ON ${schema}.step_events (workflow_slug, run_id, step_id, timestamp_us DESC, event_id DESC)
|
|
6457
|
+
WHERE type IN ('StepCompleted', 'StepFailed', 'StepSkipped', 'StepReclaimed')
|
|
6458
|
+
`);
|
|
6459
|
+
await client.query(`
|
|
6460
|
+
CREATE INDEX IF NOT EXISTS idx_workflow_events_active_latest
|
|
6461
|
+
ON ${schema}.workflow_events (type, workflow_slug, run_id, timestamp_us DESC, event_id DESC)
|
|
6462
|
+
WHERE type IN ('RunSubmitted', 'WorkflowRetryStarted', 'WorkflowStarted', 'WorkflowResumed')
|
|
6463
|
+
`);
|
|
6464
|
+
await client.query(`
|
|
6465
|
+
CREATE INDEX IF NOT EXISTS idx_workflow_events_run_chrono
|
|
6466
|
+
ON ${schema}.workflow_events (workflow_slug, run_id, timestamp_us ASC, event_id ASC)
|
|
6467
|
+
`);
|
|
6468
|
+
await client.query(`
|
|
6469
|
+
CREATE INDEX IF NOT EXISTS idx_workflow_events_version_lookup
|
|
6470
|
+
ON ${schema}.workflow_events (workflow_slug, run_id, timestamp_us DESC, event_id DESC)
|
|
6471
|
+
INCLUDE (version_id)
|
|
6472
|
+
WHERE type IN ('WorkflowStarted', 'RunSubmitted')
|
|
6473
|
+
AND version_id IS NOT NULL
|
|
6474
|
+
`);
|
|
6475
|
+
console.log("[Migration 014] Worker hot-path indexes added successfully");
|
|
6476
|
+
} catch (error) {
|
|
6477
|
+
console.error("[Migration 014] Error adding worker hot-path indexes:", error);
|
|
6478
|
+
throw error;
|
|
6479
|
+
} finally {
|
|
6480
|
+
client.release();
|
|
6481
|
+
}
|
|
6482
|
+
}
|
|
6387
6483
|
async function runMigrations(pool, schema = "cascadeflow") {
|
|
6388
6484
|
console.log(`[Migrations] Starting database migrations in schema '${schema}'...`);
|
|
6389
6485
|
try {
|
|
@@ -6399,6 +6495,9 @@ async function runMigrations(pool, schema = "cascadeflow") {
|
|
|
6399
6495
|
await migration009_addStepPriority(pool, schema);
|
|
6400
6496
|
await migration010_addSlotTracking(pool, schema);
|
|
6401
6497
|
await migration011_addWorkerConcurrencyIndex(pool, schema);
|
|
6498
|
+
await migration012_addWorkerAnalyticsIndexes(pool, schema);
|
|
6499
|
+
await migration013_addWorkersObservabilityIndexes(pool, schema);
|
|
6500
|
+
await migration014_addWorkerHotPathIndexes(pool, schema);
|
|
6402
6501
|
console.log("[Migrations] All migrations completed successfully");
|
|
6403
6502
|
} catch (error) {
|
|
6404
6503
|
console.error("[Migrations] Migration failed:", error);
|
|
@@ -7264,16 +7363,6 @@ class PostgresBackend extends Backend {
|
|
|
7264
7363
|
return !!(latestEvent && (latestEvent.type === "StepScheduled" || latestEvent.type === "StepReclaimed" || latestEvent.type === "StepRetrying"));
|
|
7265
7364
|
}
|
|
7266
7365
|
async claimScheduledStep(workflowSlug, runId, stepId, workerId, metadata) {
|
|
7267
|
-
const initialEvents = await this.loadEvents(workflowSlug, runId, { category: "step", stepId });
|
|
7268
|
-
if (initialEvents.length === 0) {
|
|
7269
|
-
return null;
|
|
7270
|
-
}
|
|
7271
|
-
const now = getMicrosecondTimestamp();
|
|
7272
|
-
const initialState = projectStepState(initialEvents, workflowSlug);
|
|
7273
|
-
if (initialState.status !== "scheduled" || initialState.availableAt === undefined || initialState.availableAt > now) {
|
|
7274
|
-
return null;
|
|
7275
|
-
}
|
|
7276
|
-
const attemptNumber = initialState.attemptNumber;
|
|
7277
7366
|
const timestamp = getMicrosecondTimestamp();
|
|
7278
7367
|
const event = {
|
|
7279
7368
|
category: "step",
|
|
@@ -7285,43 +7374,34 @@ class PostgresBackend extends Backend {
|
|
|
7285
7374
|
stepId,
|
|
7286
7375
|
workerId,
|
|
7287
7376
|
dependencies: metadata.dependencies,
|
|
7288
|
-
attemptNumber,
|
|
7377
|
+
attemptNumber: metadata.attemptNumber,
|
|
7289
7378
|
slotIndex: metadata.slotIndex,
|
|
7290
7379
|
workerConcurrency: metadata.workerConcurrency
|
|
7291
7380
|
};
|
|
7292
|
-
const
|
|
7293
|
-
return
|
|
7381
|
+
const claimedAttemptNumber = await this.db.claimScheduledStep(workflowSlug, runId, stepId, event);
|
|
7382
|
+
return claimedAttemptNumber !== null ? { attemptNumber: claimedAttemptNumber } : null;
|
|
7294
7383
|
}
|
|
7295
7384
|
async reclaimStaleSteps(staleThreshold, reclaimedBy) {
|
|
7296
7385
|
const reclaimed = [];
|
|
7297
7386
|
const now = getMicrosecondTimestamp();
|
|
7298
7387
|
const staleSteps = await this.db.findStaleSteps(staleThreshold);
|
|
7299
7388
|
for (const step of staleSteps) {
|
|
7300
|
-
const
|
|
7301
|
-
|
|
7302
|
-
|
|
7303
|
-
|
|
7304
|
-
|
|
7305
|
-
|
|
7306
|
-
|
|
7307
|
-
|
|
7308
|
-
|
|
7309
|
-
|
|
7310
|
-
|
|
7311
|
-
|
|
7312
|
-
|
|
7313
|
-
|
|
7314
|
-
|
|
7315
|
-
|
|
7316
|
-
});
|
|
7317
|
-
await this.saveStepScheduled(step.workflowSlug, step.runId, step.stepId, {
|
|
7318
|
-
availableAt: now,
|
|
7319
|
-
reason: "retry",
|
|
7320
|
-
attemptNumber: state.attemptNumber + 1,
|
|
7321
|
-
retryDelayMs: 0
|
|
7322
|
-
});
|
|
7323
|
-
reclaimed.push({ workflowSlug: step.workflowSlug, runId: step.runId, stepId: step.stepId });
|
|
7324
|
-
}
|
|
7389
|
+
const staleDuration = now - step.lastHeartbeatUs;
|
|
7390
|
+
await this.saveStepReclaimed(step.workflowSlug, step.runId, step.stepId, {
|
|
7391
|
+
originalWorkerId: step.workerId,
|
|
7392
|
+
reclaimedBy,
|
|
7393
|
+
lastHeartbeat: step.lastHeartbeatUs,
|
|
7394
|
+
staleThreshold,
|
|
7395
|
+
staleDuration,
|
|
7396
|
+
attemptNumber: step.attemptNumber
|
|
7397
|
+
});
|
|
7398
|
+
await this.saveStepScheduled(step.workflowSlug, step.runId, step.stepId, {
|
|
7399
|
+
availableAt: now,
|
|
7400
|
+
reason: "retry",
|
|
7401
|
+
attemptNumber: step.attemptNumber + 1,
|
|
7402
|
+
retryDelayMs: 0
|
|
7403
|
+
});
|
|
7404
|
+
reclaimed.push({ workflowSlug: step.workflowSlug, runId: step.runId, stepId: step.stepId });
|
|
7325
7405
|
}
|
|
7326
7406
|
return reclaimed;
|
|
7327
7407
|
}
|
|
@@ -7754,9 +7834,25 @@ class PostgresBackend extends Backend {
|
|
|
7754
7834
|
staleThresholdUs
|
|
7755
7835
|
};
|
|
7756
7836
|
}
|
|
7837
|
+
async getWorkerById(workerId) {
|
|
7838
|
+
const result = await this.db.getWorkerById(workerId);
|
|
7839
|
+
if (!result) {
|
|
7840
|
+
return null;
|
|
7841
|
+
}
|
|
7842
|
+
return {
|
|
7843
|
+
workerId: result.workerId,
|
|
7844
|
+
lastSeenUs: result.lastSeenUs,
|
|
7845
|
+
currentlyRunningSteps: result.activeSteps.length,
|
|
7846
|
+
totalStepsProcessed: result.totalStepsProcessed,
|
|
7847
|
+
failedSteps: result.failedSteps,
|
|
7848
|
+
reclaimedFromCount: result.reclaimedFromCount,
|
|
7849
|
+
workerConcurrency: result.workerConcurrency,
|
|
7850
|
+
activeSteps: result.activeSteps
|
|
7851
|
+
};
|
|
7852
|
+
}
|
|
7757
7853
|
}
|
|
7758
7854
|
export {
|
|
7759
7855
|
PostgresBackend
|
|
7760
7856
|
};
|
|
7761
7857
|
|
|
7762
|
-
//# debugId=
|
|
7858
|
+
//# debugId=43E020BB2B7D86DF64756E2164756E21
|