@cascade-flow/backend-postgres 0.2.16 → 0.2.19

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -4849,6 +4849,9 @@ class DatabaseClient {
4849
4849
  let attemptNumber = null;
4850
4850
  let availableAtUs = null;
4851
4851
  let exportOutput = null;
4852
+ let priority = null;
4853
+ let slotIndex = null;
4854
+ let workerConcurrency = null;
4852
4855
  let errorNameHash = "";
4853
4856
  let errorMessageHash = "";
4854
4857
  let errorStackExactHash = "";
@@ -4856,12 +4859,15 @@ class DatabaseClient {
4856
4859
  let errorStackPortableHash = "";
4857
4860
  if (se.type === "StepStarted" || se.type === "StepHeartbeat") {
4858
4861
  workerId = se.workerId;
4862
+ slotIndex = se.slotIndex ?? null;
4863
+ workerConcurrency = se.workerConcurrency ?? null;
4859
4864
  }
4860
4865
  if ("attemptNumber" in se) {
4861
4866
  attemptNumber = se.attemptNumber;
4862
4867
  }
4863
4868
  if (se.type === "StepScheduled") {
4864
4869
  availableAtUs = se.availableAtUs;
4870
+ priority = se.priority ?? null;
4865
4871
  }
4866
4872
  if (se.type === "StepCompleted") {
4867
4873
  exportOutput = se.exportOutput;
@@ -4884,9 +4890,10 @@ class DatabaseClient {
4884
4890
  event_id, workflow_slug, run_id, step_id, timestamp_us, category, type, event_data,
4885
4891
  worker_id, attempt_number, available_at_us, export_output,
4886
4892
  error_name_hash, error_message_hash, error_stack_exact_hash,
4887
- error_stack_normalized_hash, error_stack_portable_hash, version_id
4893
+ error_stack_normalized_hash, error_stack_portable_hash, version_id, priority,
4894
+ slot_index, worker_concurrency
4888
4895
  )
4889
- VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18)`, [
4896
+ VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21)`, [
4890
4897
  se.eventId,
4891
4898
  se.workflowSlug,
4892
4899
  se.runId,
@@ -4904,7 +4911,10 @@ class DatabaseClient {
4904
4911
  errorStackExactHash,
4905
4912
  errorStackNormalizedHash,
4906
4913
  errorStackPortableHash,
4907
- versionId
4914
+ versionId,
4915
+ priority,
4916
+ slotIndex,
4917
+ workerConcurrency
4908
4918
  ]);
4909
4919
  }
4910
4920
  } finally {
@@ -4966,6 +4976,9 @@ class DatabaseClient {
4966
4976
  let attemptNumber = null;
4967
4977
  let availableAtUs = null;
4968
4978
  let exportOutput = null;
4979
+ let priority = null;
4980
+ let slotIndex = null;
4981
+ let workerConcurrency = null;
4969
4982
  let errorNameHash = "";
4970
4983
  let errorMessageHash = "";
4971
4984
  let errorStackExactHash = "";
@@ -4973,12 +4986,15 @@ class DatabaseClient {
4973
4986
  let errorStackPortableHash = "";
4974
4987
  if (se.type === "StepStarted" || se.type === "StepHeartbeat") {
4975
4988
  workerId = se.workerId;
4989
+ slotIndex = se.slotIndex ?? null;
4990
+ workerConcurrency = se.workerConcurrency ?? null;
4976
4991
  }
4977
4992
  if ("attemptNumber" in se) {
4978
4993
  attemptNumber = se.attemptNumber;
4979
4994
  }
4980
4995
  if (se.type === "StepScheduled") {
4981
4996
  availableAtUs = se.availableAtUs;
4997
+ priority = se.priority ?? null;
4982
4998
  }
4983
4999
  if (se.type === "StepCompleted") {
4984
5000
  exportOutput = se.exportOutput;
@@ -5001,9 +5017,10 @@ class DatabaseClient {
5001
5017
  event_id, workflow_slug, run_id, step_id, timestamp_us, category, type, event_data,
5002
5018
  worker_id, attempt_number, available_at_us, export_output,
5003
5019
  error_name_hash, error_message_hash, error_stack_exact_hash,
5004
- error_stack_normalized_hash, error_stack_portable_hash, version_id
5020
+ error_stack_normalized_hash, error_stack_portable_hash, version_id, priority,
5021
+ slot_index, worker_concurrency
5005
5022
  )
5006
- VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18)`, [
5023
+ VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21)`, [
5007
5024
  se.eventId,
5008
5025
  se.workflowSlug,
5009
5026
  se.runId,
@@ -5021,7 +5038,10 @@ class DatabaseClient {
5021
5038
  errorStackExactHash,
5022
5039
  errorStackNormalizedHash,
5023
5040
  errorStackPortableHash,
5024
- versionId
5041
+ versionId,
5042
+ priority,
5043
+ slotIndex,
5044
+ workerConcurrency
5025
5045
  ]);
5026
5046
  }
5027
5047
  }
@@ -5105,13 +5125,15 @@ class DatabaseClient {
5105
5125
  client.release();
5106
5126
  }
5107
5127
  }
5108
- async claimScheduledStep(workflowSlug, runId, stepId, workerId, eventToWrite) {
5128
+ async claimScheduledStep(workflowSlug, runId, stepId, eventToWrite) {
5109
5129
  const client = await this.pool.connect();
5110
5130
  try {
5111
5131
  await client.query("BEGIN");
5132
+ const currentTimeUs = Date.now() * 1000;
5112
5133
  const checkQuery = `
5113
- SELECT event_data FROM ${this.schema}.step_events
5134
+ SELECT event_data, attempt_number, available_at_us FROM ${this.schema}.step_events
5114
5135
  WHERE workflow_slug = $1 AND run_id = $2 AND step_id = $3
5136
+ AND type NOT IN ('LogEntry', 'StepCheckpoint', 'StepCheckpointFailed')
5115
5137
  ORDER BY timestamp_us DESC, event_id DESC
5116
5138
  LIMIT 1
5117
5139
  FOR UPDATE SKIP LOCKED
@@ -5119,30 +5141,42 @@ class DatabaseClient {
5119
5141
  const checkResult = await client.query(checkQuery, [workflowSlug, runId, stepId]);
5120
5142
  if (checkResult.rows.length === 0) {
5121
5143
  await client.query("ROLLBACK");
5122
- return false;
5144
+ return null;
5123
5145
  }
5124
- const latestEvent = checkResult.rows[0].event_data;
5146
+ const latestRow = checkResult.rows[0];
5147
+ const latestEvent = latestRow.event_data;
5125
5148
  if (latestEvent.type !== "StepScheduled" && latestEvent.type !== "StepReclaimed" && latestEvent.type !== "StepRetrying") {
5126
5149
  await client.query("ROLLBACK");
5127
- return false;
5150
+ return null;
5128
5151
  }
5129
- let workerId2 = null;
5152
+ const availableAtUs = latestRow.available_at_us ?? latestEvent.availableAtUs ?? null;
5153
+ if (availableAtUs !== null && availableAtUs > currentTimeUs) {
5154
+ await client.query("ROLLBACK");
5155
+ return null;
5156
+ }
5157
+ const resolvedAttemptNumber = latestRow.attempt_number ?? latestEvent.attemptNumber ?? (eventToWrite.type === "StepStarted" ? eventToWrite.attemptNumber : null) ?? 1;
5158
+ const eventPayload = eventToWrite.type === "StepStarted" ? { ...eventToWrite, attemptNumber: resolvedAttemptNumber } : eventToWrite;
5159
+ let eventWorkerId = null;
5130
5160
  let attemptNumber = null;
5161
+ let slotIndex = null;
5162
+ let workerConcurrency = null;
5131
5163
  let errorNameHash = "";
5132
5164
  let errorMessageHash = "";
5133
5165
  let errorStackExactHash = "";
5134
5166
  let errorStackNormalizedHash = "";
5135
5167
  let errorStackPortableHash = "";
5136
- if (eventToWrite.type === "StepStarted") {
5137
- workerId2 = eventToWrite.workerId;
5138
- attemptNumber = eventToWrite.attemptNumber;
5139
- }
5140
- if (eventToWrite.type === "StepFailed") {
5141
- errorNameHash = eventToWrite.errorFingerprints.nameHash;
5142
- errorMessageHash = eventToWrite.errorFingerprints.messageHash;
5143
- errorStackExactHash = eventToWrite.errorFingerprints.stackExactHash;
5144
- errorStackNormalizedHash = eventToWrite.errorFingerprints.stackNormalizedHash;
5145
- errorStackPortableHash = eventToWrite.errorFingerprints.stackPortableHash;
5168
+ if (eventPayload.type === "StepStarted") {
5169
+ eventWorkerId = eventPayload.workerId;
5170
+ attemptNumber = eventPayload.attemptNumber;
5171
+ slotIndex = eventPayload.slotIndex ?? null;
5172
+ workerConcurrency = eventPayload.workerConcurrency ?? null;
5173
+ }
5174
+ if (eventPayload.type === "StepFailed") {
5175
+ errorNameHash = eventPayload.errorFingerprints.nameHash;
5176
+ errorMessageHash = eventPayload.errorFingerprints.messageHash;
5177
+ errorStackExactHash = eventPayload.errorFingerprints.stackExactHash;
5178
+ errorStackNormalizedHash = eventPayload.errorFingerprints.stackNormalizedHash;
5179
+ errorStackPortableHash = eventPayload.errorFingerprints.stackPortableHash;
5146
5180
  }
5147
5181
  const versionResult = await client.query(`SELECT version_id FROM ${this.schema}.workflow_events
5148
5182
  WHERE workflow_slug = $1 AND run_id = $2
@@ -5155,18 +5189,19 @@ class DatabaseClient {
5155
5189
  event_id, workflow_slug, run_id, step_id, timestamp_us, category, type, event_data,
5156
5190
  worker_id, attempt_number, available_at_us, export_output,
5157
5191
  error_name_hash, error_message_hash, error_stack_exact_hash,
5158
- error_stack_normalized_hash, error_stack_portable_hash, version_id
5192
+ error_stack_normalized_hash, error_stack_portable_hash, version_id,
5193
+ slot_index, worker_concurrency
5159
5194
  )
5160
- VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18)`, [
5161
- eventToWrite.eventId,
5162
- eventToWrite.workflowSlug,
5163
- eventToWrite.runId,
5164
- eventToWrite.stepId,
5165
- eventToWrite.timestampUs,
5166
- eventToWrite.category,
5167
- eventToWrite.type,
5168
- JSON.stringify(stripEventIdFromJson(eventToWrite)),
5169
- workerId2,
5195
+ VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20)`, [
5196
+ eventPayload.eventId,
5197
+ eventPayload.workflowSlug,
5198
+ eventPayload.runId,
5199
+ eventPayload.stepId,
5200
+ eventPayload.timestampUs,
5201
+ eventPayload.category,
5202
+ eventPayload.type,
5203
+ JSON.stringify(stripEventIdFromJson(eventPayload)),
5204
+ eventWorkerId,
5170
5205
  attemptNumber,
5171
5206
  null,
5172
5207
  null,
@@ -5175,10 +5210,12 @@ class DatabaseClient {
5175
5210
  errorStackExactHash,
5176
5211
  errorStackNormalizedHash,
5177
5212
  errorStackPortableHash,
5178
- versionId
5213
+ versionId,
5214
+ slotIndex,
5215
+ workerConcurrency
5179
5216
  ]);
5180
5217
  await client.query("COMMIT");
5181
- return true;
5218
+ return resolvedAttemptNumber;
5182
5219
  } catch (error) {
5183
5220
  await client.query("ROLLBACK");
5184
5221
  throw error;
@@ -5194,7 +5231,7 @@ class DatabaseClient {
5194
5231
  let query = `
5195
5232
  WITH latest_step_events AS (
5196
5233
  SELECT DISTINCT ON (workflow_slug, run_id, step_id)
5197
- workflow_slug, run_id, step_id, type, available_at_us
5234
+ workflow_slug, run_id, step_id, type, available_at_us, priority
5198
5235
  FROM ${this.schema}.step_events
5199
5236
  ${options?.workflowSlugs ? "WHERE workflow_slug = ANY($1)" : ""}
5200
5237
  ORDER BY workflow_slug, run_id, step_id, timestamp_us DESC, event_id DESC
@@ -5203,6 +5240,7 @@ class DatabaseClient {
5203
5240
  FROM latest_step_events
5204
5241
  WHERE type = ANY($${options?.workflowSlugs ? "2" : "1"})
5205
5242
  AND (available_at_us IS NULL OR available_at_us <= $${options?.workflowSlugs ? "3" : "2"})
5243
+ ORDER BY priority DESC NULLS LAST, available_at_us ASC
5206
5244
  ${options?.limit ? `LIMIT $${options?.workflowSlugs ? "4" : "3"}` : ""}
5207
5245
  `;
5208
5246
  const params = [];
@@ -5231,21 +5269,25 @@ class DatabaseClient {
5231
5269
  const query = `
5232
5270
  WITH latest_step_events AS (
5233
5271
  SELECT DISTINCT ON (workflow_slug, run_id, step_id)
5234
- workflow_slug, run_id, step_id, type, timestamp_us, worker_id
5272
+ workflow_slug, run_id, step_id, type, timestamp_us, worker_id, attempt_number
5235
5273
  FROM ${this.schema}.step_events
5236
- WHERE type IN ('StepStarted', 'StepHeartbeat')
5274
+ WHERE type NOT IN ('LogEntry', 'StepCheckpoint', 'StepCheckpointFailed')
5237
5275
  ORDER BY workflow_slug, run_id, step_id, timestamp_us DESC, event_id DESC
5238
5276
  )
5239
- SELECT workflow_slug, run_id, step_id, worker_id
5277
+ SELECT workflow_slug, run_id, step_id, worker_id, attempt_number, timestamp_us
5240
5278
  FROM latest_step_events
5241
- WHERE timestamp_us < $1 AND worker_id IS NOT NULL
5279
+ WHERE type IN ('StepStarted', 'StepHeartbeat')
5280
+ AND timestamp_us < $1
5281
+ AND worker_id IS NOT NULL
5242
5282
  `;
5243
5283
  const result = await client.query(query, [currentTimeUs - staleThresholdUs]);
5244
5284
  return result.rows.map((row) => ({
5245
5285
  workflowSlug: row.workflow_slug,
5246
5286
  runId: row.run_id,
5247
5287
  stepId: row.step_id,
5248
- workerId: row.worker_id
5288
+ workerId: row.worker_id,
5289
+ attemptNumber: row.attempt_number ?? 1,
5290
+ lastHeartbeatUs: Number(row.timestamp_us)
5249
5291
  }));
5250
5292
  } finally {
5251
5293
  client.release();
@@ -5602,6 +5644,15 @@ class DatabaseClient {
5602
5644
  type
5603
5645
  FROM ${this.schema}.workflow_events
5604
5646
  WHERE ($1::text IS NULL OR workflow_slug = $1)
5647
+ AND type IN (
5648
+ 'RunSubmitted',
5649
+ 'WorkflowRetryStarted',
5650
+ 'WorkflowStarted',
5651
+ 'WorkflowResumed',
5652
+ 'WorkflowCompleted',
5653
+ 'WorkflowFailed',
5654
+ 'WorkflowCancelled'
5655
+ )
5605
5656
  ORDER BY workflow_slug, run_id, timestamp_us DESC, event_id DESC
5606
5657
  )
5607
5658
  SELECT
@@ -5739,6 +5790,214 @@ class DatabaseClient {
5739
5790
  client.release();
5740
5791
  }
5741
5792
  }
5793
/**
 * Collects the steps currently held by workers and groups them per worker.
 *
 * Query 1 selects up to 500 (workflow_slug, run_id) pairs whose latest
 * workflow event is WorkflowStarted or WorkflowResumed.
 * Query 2 takes, for those run ids, the latest step event per step (ignoring
 * LogEntry, StepCheckpoint and StepCheckpointFailed) and keeps up to 1000
 * steps whose latest event is StepStarted or StepHeartbeat with a non-null
 * worker_id.
 *
 * @param _options Accepted but never read inside this method.
 * @returns {Promise<{workers: Array, totalActiveWorkers: number, totalRunningSteps: number}>}
 *   workers are sorted by number of active steps (descending), then by
 *   lastSeenUs (descending). totalStepsProcessed, failedSteps and
 *   reclaimedFromCount are always 0 in the records built here.
 *
 * NOTE(review): neither outer SELECT has an ORDER BY in front of its LIMIT, so
 * when more than 500 runs or 1000 steps match, which rows survive is not
 * defined by the query and the totals undercount.
 * NOTE(review): the step query filters on run_id only; the workflow_slug from
 * query 1 is not used. Presumably run ids are unique across workflows - confirm.
 * NOTE(review): startedAtUs is filled with the timestamp of the latest event
 * (the same value as lastHeartbeatUs), not with the StepStarted timestamp.
 */
+ async getActiveWorkersAggregation(_options) {
5794
+ const client = await this.pool.connect();
5795
+ try {
5796
+ const runningRunsResult = await client.query(`
5797
+ WITH latest_workflow_events AS (
5798
+ SELECT DISTINCT ON (workflow_slug, run_id)
5799
+ workflow_slug,
5800
+ run_id,
5801
+ type
5802
+ FROM ${this.schema}.workflow_events
5803
+ ORDER BY workflow_slug, run_id, timestamp_us DESC, event_id DESC
5804
+ )
5805
+ SELECT workflow_slug, run_id
5806
+ FROM latest_workflow_events
5807
+ WHERE type IN ('WorkflowStarted', 'WorkflowResumed')
5808
+ LIMIT 500
5809
+ `);
// No running runs: return an empty snapshot without issuing the step query.
5810
+ if (runningRunsResult.rows.length === 0) {
5811
+ return {
5812
+ workers: [],
5813
+ totalActiveWorkers: 0,
5814
+ totalRunningSteps: 0
5815
+ };
5816
+ }
// Only run_id is forwarded to the step query (bound as $1 for run_id = ANY($1)).
5817
+ const runIds = runningRunsResult.rows.map((r) => r.run_id);
5818
+ const runningStepsResult = await client.query(`
5819
+ WITH latest_step_events AS (
5820
+ SELECT DISTINCT ON (workflow_slug, run_id, step_id)
5821
+ workflow_slug,
5822
+ run_id,
5823
+ step_id,
5824
+ type,
5825
+ worker_id,
5826
+ timestamp_us,
5827
+ slot_index,
5828
+ worker_concurrency
5829
+ FROM ${this.schema}.step_events
5830
+ WHERE run_id = ANY($1)
5831
+ AND type NOT IN ('LogEntry', 'StepCheckpoint', 'StepCheckpointFailed')
5832
+ ORDER BY workflow_slug, run_id, step_id, timestamp_us DESC, event_id DESC
5833
+ )
5834
+ SELECT
5835
+ worker_id,
5836
+ workflow_slug,
5837
+ run_id,
5838
+ step_id,
5839
+ timestamp_us as last_heartbeat_us,
5840
+ slot_index,
5841
+ worker_concurrency
5842
+ FROM latest_step_events
5843
+ WHERE type IN ('StepStarted', 'StepHeartbeat')
5844
+ AND worker_id IS NOT NULL
5845
+ LIMIT 1000
5846
+ `, [runIds]);
// Group the step rows by worker_id; each entry accumulates its activeSteps.
5847
+ const workerMap = new Map;
5848
+ for (const row of runningStepsResult.rows) {
5849
+ let worker = workerMap.get(row.worker_id);
// First row seen for this worker: seed the entry from this row.
5850
+ if (!worker) {
5851
+ worker = {
5852
+ workerId: row.worker_id,
5853
+ lastSeenUs: parseInt(row.last_heartbeat_us, 10),
5854
+ totalStepsProcessed: 0,
5855
+ failedSteps: 0,
5856
+ reclaimedFromCount: 0,
5857
+ workerConcurrency: row.worker_concurrency ?? undefined,
5858
+ activeSteps: []
5859
+ };
5860
+ workerMap.set(row.worker_id, worker);
5861
+ }
// Keep the newest timestamp across all rows of this worker as lastSeenUs.
5862
+ const lastHeartbeatUs = parseInt(row.last_heartbeat_us, 10);
5863
+ if (lastHeartbeatUs > worker.lastSeenUs) {
5864
+ worker.lastSeenUs = lastHeartbeatUs;
5865
+ }
// Backfill workerConcurrency from the first row that carries a value.
5866
+ if (row.worker_concurrency != null && worker.workerConcurrency == null) {
5867
+ worker.workerConcurrency = row.worker_concurrency;
5868
+ }
5869
+ worker.activeSteps.push({
5870
+ workflowSlug: row.workflow_slug,
5871
+ runId: row.run_id,
5872
+ stepId: row.step_id,
5873
+ startedAtUs: lastHeartbeatUs,
5874
+ lastHeartbeatUs,
5875
+ slotIndex: row.slot_index ?? undefined
5876
+ });
5877
+ }
// Busiest workers first; ties are broken by the most recently seen worker.
5878
+ const workers = Array.from(workerMap.values()).sort((a, b) => {
5879
+ if (b.activeSteps.length !== a.activeSteps.length) {
5880
+ return b.activeSteps.length - a.activeSteps.length;
5881
+ }
5882
+ return b.lastSeenUs - a.lastSeenUs;
5883
+ });
5884
+ const totalRunningSteps = workers.reduce((sum, w) => sum + w.activeSteps.length, 0);
5885
+ const totalActiveWorkers = workers.length;
5886
+ return {
5887
+ workers,
5888
+ totalActiveWorkers,
5889
+ totalRunningSteps
5890
+ };
5891
+ } finally {
5892
+ client.release();
5893
+ }
5894
+ }
5895
/**
 * Loads the record of a single worker together with the steps it still
 * appears to be running.
 *
 * Main query: from the StepStarted / StepHeartbeat events of this worker it
 * takes the latest one per step and keeps the step only when no later
 * StepCompleted, StepFailed, StepSkipped or StepReclaimed event exists for it.
 * At most 100 steps are returned, newest heartbeat first.
 * Fallback (main query returned nothing): reads MAX(timestamp_us) and
 * MAX(worker_concurrency) over the StepStarted / StepHeartbeat events of the
 * worker and returns a record with an empty activeSteps list.
 *
 * @param workerId Bound as $1 in both queries.
 * @returns The worker record, or null when the fallback finds no
 *   StepStarted / StepHeartbeat event for workerId. totalStepsProcessed,
 *   failedSteps and reclaimedFromCount are always 0 in the returned record.
 *
 * NOTE(review): latest_per_step orders by timestamp_us DESC with no event_id
 * tie-breaker, unlike the DISTINCT ON queries elsewhere in this class.
 * NOTE(review): the still-running check uses a strict
 * se.timestamp_us > lps.timestamp_us, so a terminal event carrying the same
 * timestamp as the heartbeat is not counted as later.
 * NOTE(review): the fallback MAX(worker_concurrency) is the largest value ever
 * recorded for the worker, which is not necessarily the most recent one.
 * NOTE(review): startedAtUs is set to the latest heartbeat timestamp, not to
 * the StepStarted timestamp.
 */
+ async getWorkerById(workerId) {
5896
+ const client = await this.pool.connect();
5897
+ try {
5898
+ const result = await client.query(`
5899
+ WITH worker_step_events AS (
5900
+ -- Get all step events for this worker
5901
+ SELECT
5902
+ workflow_slug,
5903
+ run_id,
5904
+ step_id,
5905
+ type,
5906
+ timestamp_us,
5907
+ slot_index,
5908
+ worker_concurrency
5909
+ FROM ${this.schema}.step_events
5910
+ WHERE worker_id = $1
5911
+ AND type IN ('StepStarted', 'StepHeartbeat')
5912
+ ),
5913
+ latest_per_step AS (
5914
+ -- For each step this worker touched, get the latest event
5915
+ SELECT DISTINCT ON (workflow_slug, run_id, step_id)
5916
+ workflow_slug,
5917
+ run_id,
5918
+ step_id,
5919
+ timestamp_us,
5920
+ slot_index,
5921
+ worker_concurrency
5922
+ FROM worker_step_events
5923
+ ORDER BY workflow_slug, run_id, step_id, timestamp_us DESC
5924
+ ),
5925
+ -- Check if these steps are still running (no completion/failure after our heartbeat)
5926
+ still_running AS (
5927
+ SELECT
5928
+ lps.workflow_slug,
5929
+ lps.run_id,
5930
+ lps.step_id,
5931
+ lps.timestamp_us as last_heartbeat_us,
5932
+ lps.slot_index,
5933
+ lps.worker_concurrency
5934
+ FROM latest_per_step lps
5935
+ WHERE NOT EXISTS (
5936
+ SELECT 1 FROM ${this.schema}.step_events se
5937
+ WHERE se.workflow_slug = lps.workflow_slug
5938
+ AND se.run_id = lps.run_id
5939
+ AND se.step_id = lps.step_id
5940
+ AND se.timestamp_us > lps.timestamp_us
5941
+ AND se.type IN ('StepCompleted', 'StepFailed', 'StepSkipped', 'StepReclaimed')
5942
+ )
5943
+ )
5944
+ SELECT * FROM still_running
5945
+ ORDER BY last_heartbeat_us DESC
5946
+ LIMIT 100
5947
+ `, [workerId]);
// No running steps: fall back to an aggregate over the worker history so an
// idle worker can still be reported with an empty activeSteps list.
5948
+ if (result.rows.length === 0) {
5949
+ const lastSeenResult = await client.query(`
5950
+ SELECT MAX(timestamp_us) as last_seen_us, MAX(worker_concurrency) as worker_concurrency
5951
+ FROM ${this.schema}.step_events
5952
+ WHERE worker_id = $1
5953
+ AND type IN ('StepStarted', 'StepHeartbeat')
5954
+ `, [workerId]);
// MAX over zero rows yields a null last_seen_us: the worker is unknown.
5955
+ if (!lastSeenResult.rows[0]?.last_seen_us) {
5956
+ return null;
5957
+ }
5958
+ return {
5959
+ workerId,
5960
+ lastSeenUs: parseInt(lastSeenResult.rows[0].last_seen_us, 10),
5961
+ totalStepsProcessed: 0,
5962
+ failedSteps: 0,
5963
+ reclaimedFromCount: 0,
5964
+ workerConcurrency: lastSeenResult.rows[0].worker_concurrency ?? undefined,
5965
+ activeSteps: []
5966
+ };
5967
+ }
// Fold the running-step rows: newest heartbeat becomes lastSeenUs and the
// first non-null worker_concurrency (rows are newest first) is kept.
5968
+ let lastSeenUs = 0;
5969
+ let workerConcurrency;
5970
+ const activeSteps = [];
5971
+ for (const row of result.rows) {
5972
+ const heartbeatUs = parseInt(row.last_heartbeat_us, 10);
5973
+ if (heartbeatUs > lastSeenUs) {
5974
+ lastSeenUs = heartbeatUs;
5975
+ }
5976
+ if (row.worker_concurrency != null && workerConcurrency == null) {
5977
+ workerConcurrency = row.worker_concurrency;
5978
+ }
5979
+ activeSteps.push({
5980
+ workflowSlug: row.workflow_slug,
5981
+ runId: row.run_id,
5982
+ stepId: row.step_id,
5983
+ startedAtUs: heartbeatUs,
5984
+ lastHeartbeatUs: heartbeatUs,
5985
+ slotIndex: row.slot_index ?? undefined
5986
+ });
5987
+ }
5988
+ return {
5989
+ workerId,
5990
+ lastSeenUs,
5991
+ totalStepsProcessed: 0,
5992
+ failedSteps: 0,
5993
+ reclaimedFromCount: 0,
5994
+ workerConcurrency,
5995
+ activeSteps
5996
+ };
5997
+ } finally {
5998
+ client.release();
5999
+ }
6000
+ }
5742
6001
  }
5743
6002
  function createPool(connectionString) {
5744
6003
  return new Pool2({ connectionString });
@@ -6086,6 +6345,141 @@ async function migration008_addRunIdIndex(pool, schema) {
6086
6345
  client.release();
6087
6346
  }
6088
6347
  }
6348
/**
 * Migration 009: adds step priority support to step_events.
 *
 * Adds an INTEGER column named priority (no NOT NULL, no DEFAULT) and creates
 * the partial index idx_step_events_priority_queue on
 * (priority DESC NULLS LAST, available_at_us ASC, timestamp_us ASC) restricted
 * to StepScheduled, StepReclaimed and StepRetrying rows. Both statements use
 * IF NOT EXISTS, so re-running the migration does not fail on existing objects.
 *
 * @param pool Pool whose connect() yields a client exposing query() and release().
 * @param schema Schema name; interpolated into the SQL text as-is.
 * @throws Rethrows any query error after logging it; the client is always released.
 *
 * NOTE(review): schema is not quoted or escaped, so it must be a trusted identifier.
 * NOTE(review): the two statements are sent as separate queries, not in one transaction.
 */
+ async function migration009_addStepPriority(pool, schema) {
6349
+ const client = await pool.connect();
6350
+ try {
6351
+ await client.query(`
6352
+ ALTER TABLE ${schema}.step_events
6353
+ ADD COLUMN IF NOT EXISTS priority INTEGER
6354
+ `);
6355
+ await client.query(`
6356
+ CREATE INDEX IF NOT EXISTS idx_step_events_priority_queue
6357
+ ON ${schema}.step_events (priority DESC NULLS LAST, available_at_us ASC, timestamp_us ASC)
6358
+ WHERE type IN ('StepScheduled', 'StepReclaimed', 'StepRetrying')
6359
+ `);
6360
+ console.log("[Migration 009] Step priority column and index added successfully");
6361
+ } catch (error) {
6362
+ console.error("[Migration 009] Error adding step priority:", error);
6363
+ throw error;
6364
+ } finally {
6365
+ client.release();
6366
+ }
6367
+ }
6368
/**
 * Migration 010: adds slot tracking columns to step_events.
 *
 * A single ALTER TABLE adds two INTEGER columns, slot_index and
 * worker_concurrency, each guarded by IF NOT EXISTS and declared without
 * NOT NULL or DEFAULT.
 *
 * @param pool Pool whose connect() yields a client exposing query() and release().
 * @param schema Schema name; interpolated into the SQL text as-is.
 * @throws Rethrows any query error after logging it; the client is always released.
 *
 * NOTE(review): schema is not quoted or escaped, so it must be a trusted identifier.
 */
+ async function migration010_addSlotTracking(pool, schema) {
6369
+ const client = await pool.connect();
6370
+ try {
6371
+ await client.query(`
6372
+ ALTER TABLE ${schema}.step_events
6373
+ ADD COLUMN IF NOT EXISTS slot_index INTEGER,
6374
+ ADD COLUMN IF NOT EXISTS worker_concurrency INTEGER
6375
+ `);
6376
+ console.log("[Migration 010] Slot tracking columns added successfully");
6377
+ } catch (error) {
6378
+ console.error("[Migration 010] Error adding slot tracking columns:", error);
6379
+ throw error;
6380
+ } finally {
6381
+ client.release();
6382
+ }
6383
+ }
6384
/**
 * Migration 011: creates the partial index idx_step_events_worker_concurrency
 * on step_events (worker_id, timestamp_us DESC).
 *
 * The index only covers StepStarted / StepHeartbeat rows where both worker_id
 * and worker_concurrency are non-null. Its predicate references the
 * worker_concurrency column, which migration 010 adds; runMigrations awaits
 * 010 before 011.
 *
 * @param pool Pool whose connect() yields a client exposing query() and release().
 * @param schema Schema name; interpolated into the SQL text as-is.
 * @throws Rethrows any query error after logging it; the client is always released.
 *
 * NOTE(review): schema is not quoted or escaped, so it must be a trusted identifier.
 */
+ async function migration011_addWorkerConcurrencyIndex(pool, schema) {
6385
+ const client = await pool.connect();
6386
+ try {
6387
+ await client.query(`
6388
+ CREATE INDEX IF NOT EXISTS idx_step_events_worker_concurrency
6389
+ ON ${schema}.step_events (worker_id, timestamp_us DESC)
6390
+ WHERE worker_id IS NOT NULL
6391
+ AND worker_concurrency IS NOT NULL
6392
+ AND type IN ('StepStarted', 'StepHeartbeat')
6393
+ `);
6394
+ console.log("[Migration 011] Worker concurrency index added successfully");
6395
+ } catch (error) {
6396
+ console.error("[Migration 011] Error adding worker concurrency index:", error);
6397
+ throw error;
6398
+ } finally {
6399
+ client.release();
6400
+ }
6401
+ }
6402
/**
 * Migration 012: creates two partial indexes on step_events.
 *
 * - idx_step_events_worker_activity on (timestamp_us, type, worker_id) for
 *   StepStarted / StepHeartbeat rows with a non-null worker_id.
 * - idx_step_events_recent_by_step on
 *   (timestamp_us DESC, workflow_slug, run_id, step_id, type, event_id DESC)
 *   for every row whose type is not LogEntry, StepCheckpoint or
 *   StepCheckpointFailed.
 *
 * @param pool Pool whose connect() yields a client exposing query() and release().
 * @param schema Schema name; interpolated into the SQL text as-is.
 * @throws Rethrows any query error after logging it; the client is always released.
 *
 * NOTE(review): schema is not quoted or escaped, so it must be a trusted identifier.
 * NOTE(review): the indexes are built without CONCURRENTLY; per the PostgreSQL
 * documentation such a build blocks writes to the table while it runs -
 * confirm this is acceptable for the size of step_events.
 */
+ async function migration012_addWorkerAnalyticsIndexes(pool, schema) {
6403
+ const client = await pool.connect();
6404
+ try {
6405
+ await client.query(`
6406
+ CREATE INDEX IF NOT EXISTS idx_step_events_worker_activity
6407
+ ON ${schema}.step_events (timestamp_us, type, worker_id)
6408
+ WHERE worker_id IS NOT NULL
6409
+ AND type IN ('StepStarted', 'StepHeartbeat')
6410
+ `);
6411
+ await client.query(`
6412
+ CREATE INDEX IF NOT EXISTS idx_step_events_recent_by_step
6413
+ ON ${schema}.step_events (timestamp_us DESC, workflow_slug, run_id, step_id, type, event_id DESC)
6414
+ WHERE type NOT IN ('LogEntry', 'StepCheckpoint', 'StepCheckpointFailed')
6415
+ `);
6416
+ console.log("[Migration 012] Worker analytics indexes added successfully");
6417
+ } catch (error) {
6418
+ console.error("[Migration 012] Error adding worker analytics indexes:", error);
6419
+ throw error;
6420
+ } finally {
6421
+ client.release();
6422
+ }
6423
+ }
6424
/**
 * Migration 013: creates two covering indexes keyed by run_id.
 *
 * - idx_workflow_events_run_status on workflow_events
 *   (run_id, timestamp_us DESC, event_id DESC) INCLUDE (workflow_slug, type).
 * - idx_step_events_by_run on step_events
 *   (run_id, workflow_slug, step_id, timestamp_us DESC, event_id DESC)
 *   INCLUDE (type, worker_id, slot_index, worker_concurrency), limited to rows
 *   whose type is not LogEntry, StepCheckpoint or StepCheckpointFailed.
 *
 * The second index references slot_index and worker_concurrency, which
 * migration 010 adds; runMigrations awaits 010 before 013.
 *
 * @param pool Pool whose connect() yields a client exposing query() and release().
 * @param schema Schema name; interpolated into the SQL text as-is.
 * @throws Rethrows any query error after logging it; the client is always released.
 *
 * NOTE(review): schema is not quoted or escaped, so it must be a trusted identifier.
 * NOTE(review): INCLUDE on CREATE INDEX needs a PostgreSQL version that
 * supports covering indexes - confirm the minimum supported server version.
 */
+ async function migration013_addWorkersObservabilityIndexes(pool, schema) {
6425
+ const client = await pool.connect();
6426
+ try {
6427
+ await client.query(`
6428
+ CREATE INDEX IF NOT EXISTS idx_workflow_events_run_status
6429
+ ON ${schema}.workflow_events (run_id, timestamp_us DESC, event_id DESC)
6430
+ INCLUDE (workflow_slug, type)
6431
+ `);
6432
+ await client.query(`
6433
+ CREATE INDEX IF NOT EXISTS idx_step_events_by_run
6434
+ ON ${schema}.step_events (run_id, workflow_slug, step_id, timestamp_us DESC, event_id DESC)
6435
+ INCLUDE (type, worker_id, slot_index, worker_concurrency)
6436
+ WHERE type NOT IN ('LogEntry', 'StepCheckpoint', 'StepCheckpointFailed')
6437
+ `);
6438
+ console.log("[Migration 013] Workers observability indexes added successfully");
6439
+ } catch (error) {
6440
+ console.error("[Migration 013] Error adding workers observability indexes:", error);
6441
+ throw error;
6442
+ } finally {
6443
+ client.release();
6444
+ }
6445
+ }
6446
/**
 * Migration 014: creates five indexes, each in its own query and each guarded
 * by IF NOT EXISTS.
 *
 * On step_events:
 * - idx_step_events_latest_cover on
 *   (workflow_slug, run_id, step_id, timestamp_us DESC, event_id DESC)
 *   INCLUDE (type, available_at_us, worker_id, attempt_number, priority).
 * - idx_step_events_terminal_latest on the same key columns, limited to
 *   StepCompleted, StepFailed, StepSkipped and StepReclaimed rows.
 * On workflow_events:
 * - idx_workflow_events_active_latest on
 *   (type, workflow_slug, run_id, timestamp_us DESC, event_id DESC), limited to
 *   RunSubmitted, WorkflowRetryStarted, WorkflowStarted and WorkflowResumed rows.
 * - idx_workflow_events_run_chrono on
 *   (workflow_slug, run_id, timestamp_us ASC, event_id ASC).
 * - idx_workflow_events_version_lookup on
 *   (workflow_slug, run_id, timestamp_us DESC, event_id DESC)
 *   INCLUDE (version_id), limited to WorkflowStarted / RunSubmitted rows with a
 *   non-null version_id.
 *
 * @param pool Pool whose connect() yields a client exposing query() and release().
 * @param schema Schema name; interpolated into the SQL text as-is.
 * @throws Rethrows any query error after logging it; the client is always
 *   released. Indexes created before the failing statement remain in place
 *   because the statements are not wrapped in a transaction here.
 *
 * NOTE(review): schema is not quoted or escaped, so it must be a trusted identifier.
 * NOTE(review): the indexes are built without CONCURRENTLY; per the PostgreSQL
 * documentation such a build blocks writes to the table while it runs -
 * confirm this is acceptable for the size of both event tables.
 */
+ async function migration014_addWorkerHotPathIndexes(pool, schema) {
6447
+ const client = await pool.connect();
6448
+ try {
6449
+ await client.query(`
6450
+ CREATE INDEX IF NOT EXISTS idx_step_events_latest_cover
6451
+ ON ${schema}.step_events (workflow_slug, run_id, step_id, timestamp_us DESC, event_id DESC)
6452
+ INCLUDE (type, available_at_us, worker_id, attempt_number, priority)
6453
+ `);
6454
+ await client.query(`
6455
+ CREATE INDEX IF NOT EXISTS idx_step_events_terminal_latest
6456
+ ON ${schema}.step_events (workflow_slug, run_id, step_id, timestamp_us DESC, event_id DESC)
6457
+ WHERE type IN ('StepCompleted', 'StepFailed', 'StepSkipped', 'StepReclaimed')
6458
+ `);
6459
+ await client.query(`
6460
+ CREATE INDEX IF NOT EXISTS idx_workflow_events_active_latest
6461
+ ON ${schema}.workflow_events (type, workflow_slug, run_id, timestamp_us DESC, event_id DESC)
6462
+ WHERE type IN ('RunSubmitted', 'WorkflowRetryStarted', 'WorkflowStarted', 'WorkflowResumed')
6463
+ `);
6464
+ await client.query(`
6465
+ CREATE INDEX IF NOT EXISTS idx_workflow_events_run_chrono
6466
+ ON ${schema}.workflow_events (workflow_slug, run_id, timestamp_us ASC, event_id ASC)
6467
+ `);
6468
+ await client.query(`
6469
+ CREATE INDEX IF NOT EXISTS idx_workflow_events_version_lookup
6470
+ ON ${schema}.workflow_events (workflow_slug, run_id, timestamp_us DESC, event_id DESC)
6471
+ INCLUDE (version_id)
6472
+ WHERE type IN ('WorkflowStarted', 'RunSubmitted')
6473
+ AND version_id IS NOT NULL
6474
+ `);
6475
+ console.log("[Migration 014] Worker hot-path indexes added successfully");
6476
+ } catch (error) {
6477
+ console.error("[Migration 014] Error adding worker hot-path indexes:", error);
6478
+ throw error;
6479
+ } finally {
6480
+ client.release();
6481
+ }
6482
+ }
6089
6483
  async function runMigrations(pool, schema = "cascadeflow") {
6090
6484
  console.log(`[Migrations] Starting database migrations in schema '${schema}'...`);
6091
6485
  try {
@@ -6098,6 +6492,12 @@ async function runMigrations(pool, schema = "cascadeflow") {
6098
6492
  await migration006_addDescIndexes(pool, schema);
6099
6493
  await migration007_addWorkerIndexes(pool, schema);
6100
6494
  await migration008_addRunIdIndex(pool, schema);
6495
+ await migration009_addStepPriority(pool, schema);
6496
+ await migration010_addSlotTracking(pool, schema);
6497
+ await migration011_addWorkerConcurrencyIndex(pool, schema);
6498
+ await migration012_addWorkerAnalyticsIndexes(pool, schema);
6499
+ await migration013_addWorkersObservabilityIndexes(pool, schema);
6500
+ await migration014_addWorkerHotPathIndexes(pool, schema);
6101
6501
  console.log("[Migrations] All migrations completed successfully");
6102
6502
  } catch (error) {
6103
6503
  console.error("[Migrations] Migration failed:", error);
@@ -6408,7 +6808,8 @@ class PostgresBackend extends Backend {
6408
6808
  availableAtUs: metadata.availableAt,
6409
6809
  reason: metadata.reason,
6410
6810
  attemptNumber: metadata.attemptNumber,
6411
- retryDelayMs: metadata.retryDelayMs
6811
+ retryDelayMs: metadata.retryDelayMs,
6812
+ priority: metadata.priority
6412
6813
  };
6413
6814
  await this.db.appendEvent("step_events", event);
6414
6815
  }
@@ -6426,7 +6827,9 @@ class PostgresBackend extends Backend {
6426
6827
  stepId,
6427
6828
  attemptNumber,
6428
6829
  workerId,
6429
- dependencies: metadata.dependencies
6830
+ dependencies: metadata.dependencies,
6831
+ slotIndex: metadata.slotIndex,
6832
+ workerConcurrency: metadata.workerConcurrency
6430
6833
  };
6431
6834
  await this.db.appendEvent("step_events", event);
6432
6835
  }
@@ -6545,6 +6948,7 @@ class PostgresBackend extends Backend {
6545
6948
  reason: "retry",
6546
6949
  attemptNumber: scheduleMetadata.nextAttemptNumber,
6547
6950
  retryDelayMs: scheduleMetadata.retryDelayMs,
6951
+ priority: scheduleMetadata.priority,
6548
6952
  policyIndex: scheduleMetadata.policyIndex,
6549
6953
  attemptInPolicy: scheduleMetadata.attemptInPolicy
6550
6954
  };
@@ -6614,7 +7018,7 @@ class PostgresBackend extends Backend {
6614
7018
  };
6615
7019
  await this.db.appendEvent("step_events", event);
6616
7020
  }
6617
- async saveStepHeartbeat(workflowSlug, runId, stepId, workerId, attemptNumber) {
7021
+ async saveStepHeartbeat(workflowSlug, runId, stepId, workerId, attemptNumber, slotInfo) {
6618
7022
  const now = getMicrosecondTimestamp();
6619
7023
  const event = {
6620
7024
  category: "step",
@@ -6625,7 +7029,9 @@ class PostgresBackend extends Backend {
6625
7029
  runId,
6626
7030
  stepId,
6627
7031
  workerId,
6628
- attemptNumber
7032
+ attemptNumber,
7033
+ slotIndex: slotInfo?.slotIndex,
7034
+ workerConcurrency: slotInfo?.workerConcurrency
6629
7035
  };
6630
7036
  await this.db.appendEvent("step_events", event);
6631
7037
  }
@@ -6957,16 +7363,6 @@ class PostgresBackend extends Backend {
6957
7363
  return !!(latestEvent && (latestEvent.type === "StepScheduled" || latestEvent.type === "StepReclaimed" || latestEvent.type === "StepRetrying"));
6958
7364
  }
6959
7365
  async claimScheduledStep(workflowSlug, runId, stepId, workerId, metadata) {
6960
- const initialEvents = await this.loadEvents(workflowSlug, runId, { category: "step", stepId });
6961
- if (initialEvents.length === 0) {
6962
- return null;
6963
- }
6964
- const now = getMicrosecondTimestamp();
6965
- const initialState = projectStepState(initialEvents, workflowSlug);
6966
- if (initialState.status !== "scheduled" || initialState.availableAt === undefined || initialState.availableAt > now) {
6967
- return null;
6968
- }
6969
- const attemptNumber = initialState.attemptNumber;
6970
7366
  const timestamp = getMicrosecondTimestamp();
6971
7367
  const event = {
6972
7368
  category: "step",
@@ -6978,41 +7374,34 @@ class PostgresBackend extends Backend {
6978
7374
  stepId,
6979
7375
  workerId,
6980
7376
  dependencies: metadata.dependencies,
6981
- attemptNumber
7377
+ attemptNumber: metadata.attemptNumber,
7378
+ slotIndex: metadata.slotIndex,
7379
+ workerConcurrency: metadata.workerConcurrency
6982
7380
  };
6983
- const claimed = await this.db.claimScheduledStep(workflowSlug, runId, stepId, workerId, event);
6984
- return claimed ? { attemptNumber } : null;
7381
+ const claimedAttemptNumber = await this.db.claimScheduledStep(workflowSlug, runId, stepId, event);
7382
+ return claimedAttemptNumber !== null ? { attemptNumber: claimedAttemptNumber } : null;
6985
7383
  }
6986
7384
  /**
   * Reclaims every step reported stale by db.findStaleSteps and reschedules it.
   *
   * For each stale row the new body writes a StepReclaimed event (original
   * worker, last heartbeat, stale threshold, stale duration, attempt number)
   * followed by a StepScheduled event with reason "retry", attemptNumber + 1,
   * retryDelayMs 0 and availableAt set to now.
   *
   * In this hunk the "-" rows are the previous body and the "+" rows are the
   * new one. The previous body reloaded the events of each step and re-checked
   * its status and staleness before reclaiming; the new body uses the rows
   * returned by findStaleSteps as-is.
   *
   * @param staleThreshold Passed to db.findStaleSteps and recorded on the StepReclaimed event.
   * @param reclaimedBy Recorded on the StepReclaimed event.
   * @returns Array of { workflowSlug, runId, stepId }, one entry per reclaimed step.
   *
   * NOTE(review): now is captured before findStaleSteps is awaited and reused
   * for every step, so staleDuration and availableAt are relative to that instant.
   * NOTE(review): the metadata handed to saveStepScheduled has no priority
   * field. If the scheduled event copies metadata.priority (as a scheduling
   * hunk elsewhere in this diff suggests), reclaimed steps are presumably
   * rescheduled without their priority - confirm.
   * NOTE(review): the two event writes per step are separate awaits with no
   * try/catch; a failure between them leaves StepReclaimed as the latest event
   * for that step and aborts the loop for the remaining steps.
   */
  async reclaimStaleSteps(staleThreshold, reclaimedBy) {
6987
7385
  const reclaimed = [];
6988
7386
  const now = getMicrosecondTimestamp();
6989
7387
  const staleSteps = await this.db.findStaleSteps(staleThreshold);
6990
7388
  for (const step of staleSteps) {
6991
- const events = await this.loadEvents(step.workflowSlug, step.runId, { category: "step", stepId: step.stepId });
6992
- if (events.length === 0)
6993
- continue;
6994
- const state = projectStepState(events, step.workflowSlug);
6995
- if (state.status !== "running")
6996
- continue;
6997
- const lastHeartbeat = state.lastHeartbeat || state.startTime || 0;
6998
- const staleDuration = now - lastHeartbeat;
6999
- if (staleDuration > staleThreshold) {
7000
- await this.saveStepReclaimed(step.workflowSlug, step.runId, step.stepId, {
7001
- originalWorkerId: state.claimedBy || "unknown",
7002
- reclaimedBy,
7003
- lastHeartbeat,
7004
- staleThreshold,
7005
- staleDuration,
7006
- attemptNumber: state.attemptNumber
7007
- });
7008
- await this.saveStepScheduled(step.workflowSlug, step.runId, step.stepId, {
7009
- availableAt: now,
7010
- reason: "retry",
7011
- attemptNumber: state.attemptNumber + 1,
7012
- retryDelayMs: 0
7013
- });
7014
- reclaimed.push({ workflowSlug: step.workflowSlug, runId: step.runId, stepId: step.stepId });
7015
- }
  // New body: worker id, attempt number and last heartbeat come straight from
  // the findStaleSteps row instead of a projected step state.
7389
+ const staleDuration = now - step.lastHeartbeatUs;
7390
+ await this.saveStepReclaimed(step.workflowSlug, step.runId, step.stepId, {
7391
+ originalWorkerId: step.workerId,
7392
+ reclaimedBy,
7393
+ lastHeartbeat: step.lastHeartbeatUs,
7394
+ staleThreshold,
7395
+ staleDuration,
7396
+ attemptNumber: step.attemptNumber
7397
+ });
7398
+ await this.saveStepScheduled(step.workflowSlug, step.runId, step.stepId, {
7399
+ availableAt: now,
7400
+ reason: "retry",
7401
+ attemptNumber: step.attemptNumber + 1,
7402
+ retryDelayMs: 0
7403
+ });
7404
+ reclaimed.push({ workflowSlug: step.workflowSlug, runId: step.runId, stepId: step.stepId });
7016
7405
  }
7017
7406
  return reclaimed;
7018
7407
  }
@@ -7422,9 +7811,48 @@ class PostgresBackend extends Backend {
7422
7811
  successRate
7423
7812
  };
7424
7813
  }
7814
/**
 * Returns the active-worker snapshot produced by
 * db.getActiveWorkersAggregation, reshaped for callers.
 *
 * Each worker record is copied field by field and gains
 * currentlyRunningSteps, set to the length of its activeSteps list. The
 * totals are passed through unchanged and the effective staleThresholdUs is
 * echoed back in the result.
 *
 * @param options Optional; staleThresholdUs, includeInactive and timeRange are
 *   read from it. staleThresholdUs defaults to 30 * 1000 * 1000 (30 seconds,
 *   assuming the Us suffix means microseconds).
 * @returns {Promise<{workers: Array, totalActiveWorkers: number, totalRunningSteps: number, staleThresholdUs: number}>}
 *
 * NOTE(review): the DatabaseClient.getActiveWorkersAggregation shown in this
 * diff names its parameter _options and never reads it, so the three options
 * forwarded here do not appear to influence the result - confirm.
 */
+ async getActiveWorkers(options) {
7815
+ const staleThresholdUs = options?.staleThresholdUs ?? 30 * 1000 * 1000;
7816
+ const result = await this.db.getActiveWorkersAggregation({
7817
+ staleThresholdUs,
7818
+ includeInactive: options?.includeInactive,
7819
+ timeRange: options?.timeRange
7820
+ });
7821
+ return {
7822
+ workers: result.workers.map((w) => ({
7823
+ workerId: w.workerId,
7824
+ lastSeenUs: w.lastSeenUs,
7825
+ currentlyRunningSteps: w.activeSteps.length,
7826
+ totalStepsProcessed: w.totalStepsProcessed,
7827
+ failedSteps: w.failedSteps,
7828
+ reclaimedFromCount: w.reclaimedFromCount,
7829
+ workerConcurrency: w.workerConcurrency,
7830
+ activeSteps: w.activeSteps
7831
+ })),
7832
+ totalActiveWorkers: result.totalActiveWorkers,
7833
+ totalRunningSteps: result.totalRunningSteps,
7834
+ staleThresholdUs
7835
+ };
7836
+ }
7837
/**
 * Looks up one worker through db.getWorkerById and reshapes the record.
 *
 * @param workerId Forwarded unchanged to db.getWorkerById.
 * @returns null when the database layer returns a falsy result; otherwise a
 *   copy of the record with currentlyRunningSteps added, set to the length of
 *   its activeSteps list.
 */
+ async getWorkerById(workerId) {
7838
+ const result = await this.db.getWorkerById(workerId);
7839
+ if (!result) {
7840
+ return null;
7841
+ }
7842
+ return {
7843
+ workerId: result.workerId,
7844
+ lastSeenUs: result.lastSeenUs,
7845
+ currentlyRunningSteps: result.activeSteps.length,
7846
+ totalStepsProcessed: result.totalStepsProcessed,
7847
+ failedSteps: result.failedSteps,
7848
+ reclaimedFromCount: result.reclaimedFromCount,
7849
+ workerConcurrency: result.workerConcurrency,
7850
+ activeSteps: result.activeSteps
7851
+ };
7852
+ }
7425
7853
  }
7426
7854
  export {
7427
7855
  PostgresBackend
7428
7856
  };
7429
7857
 
7430
- //# debugId=DA7768AC03A0F9DC64756E2164756E21
7858
+ //# debugId=43E020BB2B7D86DF64756E2164756E21