@cascade-flow/backend-postgres 0.2.17 → 0.2.20

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -5125,13 +5125,15 @@ class DatabaseClient {
5125
5125
  client.release();
5126
5126
  }
5127
5127
  }
5128
- async claimScheduledStep(workflowSlug, runId, stepId, workerId, eventToWrite) {
5128
+ async claimScheduledStep(workflowSlug, runId, stepId, eventToWrite) {
5129
5129
  const client = await this.pool.connect();
5130
5130
  try {
5131
5131
  await client.query("BEGIN");
5132
+ const currentTimeUs = Date.now() * 1000;
5132
5133
  const checkQuery = `
5133
- SELECT event_data FROM ${this.schema}.step_events
5134
+ SELECT event_data, attempt_number, available_at_us FROM ${this.schema}.step_events
5134
5135
  WHERE workflow_slug = $1 AND run_id = $2 AND step_id = $3
5136
+ AND type NOT IN ('LogEntry', 'StepCheckpoint', 'StepCheckpointFailed')
5135
5137
  ORDER BY timestamp_us DESC, event_id DESC
5136
5138
  LIMIT 1
5137
5139
  FOR UPDATE SKIP LOCKED
@@ -5139,14 +5141,22 @@ class DatabaseClient {
5139
5141
  const checkResult = await client.query(checkQuery, [workflowSlug, runId, stepId]);
5140
5142
  if (checkResult.rows.length === 0) {
5141
5143
  await client.query("ROLLBACK");
5142
- return false;
5144
+ return null;
5143
5145
  }
5144
- const latestEvent = checkResult.rows[0].event_data;
5146
+ const latestRow = checkResult.rows[0];
5147
+ const latestEvent = latestRow.event_data;
5145
5148
  if (latestEvent.type !== "StepScheduled" && latestEvent.type !== "StepReclaimed" && latestEvent.type !== "StepRetrying") {
5146
5149
  await client.query("ROLLBACK");
5147
- return false;
5150
+ return null;
5151
+ }
5152
+ const availableAtUs = latestRow.available_at_us ?? latestEvent.availableAtUs ?? null;
5153
+ if (availableAtUs !== null && availableAtUs > currentTimeUs) {
5154
+ await client.query("ROLLBACK");
5155
+ return null;
5148
5156
  }
5149
- let workerId2 = null;
5157
+ const resolvedAttemptNumber = latestRow.attempt_number ?? latestEvent.attemptNumber ?? (eventToWrite.type === "StepStarted" ? eventToWrite.attemptNumber : null) ?? 1;
5158
+ const eventPayload = eventToWrite.type === "StepStarted" ? { ...eventToWrite, attemptNumber: resolvedAttemptNumber } : eventToWrite;
5159
+ let eventWorkerId = null;
5150
5160
  let attemptNumber = null;
5151
5161
  let slotIndex = null;
5152
5162
  let workerConcurrency = null;
@@ -5155,18 +5165,18 @@ class DatabaseClient {
5155
5165
  let errorStackExactHash = "";
5156
5166
  let errorStackNormalizedHash = "";
5157
5167
  let errorStackPortableHash = "";
5158
- if (eventToWrite.type === "StepStarted") {
5159
- workerId2 = eventToWrite.workerId;
5160
- attemptNumber = eventToWrite.attemptNumber;
5161
- slotIndex = eventToWrite.slotIndex ?? null;
5162
- workerConcurrency = eventToWrite.workerConcurrency ?? null;
5163
- }
5164
- if (eventToWrite.type === "StepFailed") {
5165
- errorNameHash = eventToWrite.errorFingerprints.nameHash;
5166
- errorMessageHash = eventToWrite.errorFingerprints.messageHash;
5167
- errorStackExactHash = eventToWrite.errorFingerprints.stackExactHash;
5168
- errorStackNormalizedHash = eventToWrite.errorFingerprints.stackNormalizedHash;
5169
- errorStackPortableHash = eventToWrite.errorFingerprints.stackPortableHash;
5168
+ if (eventPayload.type === "StepStarted") {
5169
+ eventWorkerId = eventPayload.workerId;
5170
+ attemptNumber = eventPayload.attemptNumber;
5171
+ slotIndex = eventPayload.slotIndex ?? null;
5172
+ workerConcurrency = eventPayload.workerConcurrency ?? null;
5173
+ }
5174
+ if (eventPayload.type === "StepFailed") {
5175
+ errorNameHash = eventPayload.errorFingerprints.nameHash;
5176
+ errorMessageHash = eventPayload.errorFingerprints.messageHash;
5177
+ errorStackExactHash = eventPayload.errorFingerprints.stackExactHash;
5178
+ errorStackNormalizedHash = eventPayload.errorFingerprints.stackNormalizedHash;
5179
+ errorStackPortableHash = eventPayload.errorFingerprints.stackPortableHash;
5170
5180
  }
5171
5181
  const versionResult = await client.query(`SELECT version_id FROM ${this.schema}.workflow_events
5172
5182
  WHERE workflow_slug = $1 AND run_id = $2
@@ -5183,15 +5193,15 @@ class DatabaseClient {
5183
5193
  slot_index, worker_concurrency
5184
5194
  )
5185
5195
  VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20)`, [
5186
- eventToWrite.eventId,
5187
- eventToWrite.workflowSlug,
5188
- eventToWrite.runId,
5189
- eventToWrite.stepId,
5190
- eventToWrite.timestampUs,
5191
- eventToWrite.category,
5192
- eventToWrite.type,
5193
- JSON.stringify(stripEventIdFromJson(eventToWrite)),
5194
- workerId2,
5196
+ eventPayload.eventId,
5197
+ eventPayload.workflowSlug,
5198
+ eventPayload.runId,
5199
+ eventPayload.stepId,
5200
+ eventPayload.timestampUs,
5201
+ eventPayload.category,
5202
+ eventPayload.type,
5203
+ JSON.stringify(stripEventIdFromJson(eventPayload)),
5204
+ eventWorkerId,
5195
5205
  attemptNumber,
5196
5206
  null,
5197
5207
  null,
@@ -5205,7 +5215,7 @@ class DatabaseClient {
5205
5215
  workerConcurrency
5206
5216
  ]);
5207
5217
  await client.query("COMMIT");
5208
- return true;
5218
+ return resolvedAttemptNumber;
5209
5219
  } catch (error) {
5210
5220
  await client.query("ROLLBACK");
5211
5221
  throw error;
@@ -5218,31 +5228,55 @@ class DatabaseClient {
5218
5228
  try {
5219
5229
  const currentTimeUs = Date.now() * 1000;
5220
5230
  const scheduledTypes = ["StepScheduled", "StepReclaimed", "StepRetrying"];
5221
- let query = `
5222
- WITH latest_step_events AS (
5223
- SELECT DISTINCT ON (workflow_slug, run_id, step_id)
5224
- workflow_slug, run_id, step_id, type, available_at_us, priority
5225
- FROM ${this.schema}.step_events
5226
- ${options?.workflowSlugs ? "WHERE workflow_slug = ANY($1)" : ""}
5227
- ORDER BY workflow_slug, run_id, step_id, timestamp_us DESC, event_id DESC
5228
- )
5229
- SELECT workflow_slug, run_id, step_id
5230
- FROM latest_step_events
5231
- WHERE type = ANY($${options?.workflowSlugs ? "2" : "1"})
5232
- AND (available_at_us IS NULL OR available_at_us <= $${options?.workflowSlugs ? "3" : "2"})
5233
- ORDER BY priority DESC NULLS LAST, available_at_us ASC
5234
- ${options?.limit ? `LIMIT $${options?.workflowSlugs ? "4" : "3"}` : ""}
5235
- `;
5236
- const params = [];
5237
- if (options?.workflowSlugs) {
5238
- params.push(options.workflowSlugs);
5239
- }
5240
- params.push(scheduledTypes);
5241
- params.push(currentTimeUs);
5242
- if (options?.limit) {
5243
- params.push(options.limit);
5244
- }
5245
- const result = await client.query(query, params);
5231
+ const result = await client.query(`WITH latest_workflow_events AS (
5232
+ SELECT DISTINCT ON (workflow_slug, run_id)
5233
+ workflow_slug,
5234
+ run_id,
5235
+ type
5236
+ FROM ${this.schema}.workflow_events
5237
+ WHERE ($1::text[] IS NULL OR workflow_slug = ANY($1))
5238
+ AND type IN (
5239
+ 'RunSubmitted',
5240
+ 'WorkflowRetryStarted',
5241
+ 'WorkflowStarted',
5242
+ 'WorkflowResumed',
5243
+ 'WorkflowCompleted',
5244
+ 'WorkflowFailed',
5245
+ 'WorkflowCancelled'
5246
+ )
5247
+ ORDER BY workflow_slug, run_id, timestamp_us DESC, event_id DESC
5248
+ ),
5249
+ active_runs AS (
5250
+ SELECT workflow_slug, run_id
5251
+ FROM latest_workflow_events
5252
+ WHERE type IN ('RunSubmitted', 'WorkflowRetryStarted', 'WorkflowStarted', 'WorkflowResumed')
5253
+ ),
5254
+ latest_step_events AS (
5255
+ SELECT DISTINCT ON (se.workflow_slug, se.run_id, se.step_id)
5256
+ se.workflow_slug,
5257
+ se.run_id,
5258
+ se.step_id,
5259
+ se.type,
5260
+ se.available_at_us,
5261
+ se.priority
5262
+ FROM ${this.schema}.step_events se
5263
+ INNER JOIN active_runs ar
5264
+ ON ar.workflow_slug = se.workflow_slug
5265
+ AND ar.run_id = se.run_id
5266
+ WHERE se.type NOT IN ('LogEntry', 'StepCheckpoint', 'StepCheckpointFailed')
5267
+ ORDER BY se.workflow_slug, se.run_id, se.step_id, se.timestamp_us DESC, se.event_id DESC
5268
+ )
5269
+ SELECT workflow_slug, run_id, step_id
5270
+ FROM latest_step_events
5271
+ WHERE type = ANY($2)
5272
+ AND (available_at_us IS NULL OR available_at_us <= $3)
5273
+ ORDER BY priority DESC NULLS LAST, available_at_us ASC
5274
+ LIMIT COALESCE($4::int, 2147483647)`, [
5275
+ options?.workflowSlugs ?? null,
5276
+ scheduledTypes,
5277
+ currentTimeUs,
5278
+ options?.limit ?? null
5279
+ ]);
5246
5280
  return result.rows.map((row) => ({
5247
5281
  workflowSlug: row.workflow_slug,
5248
5282
  runId: row.run_id,
@@ -5256,24 +5290,56 @@ class DatabaseClient {
5256
5290
  const client = await this.pool.connect();
5257
5291
  try {
5258
5292
  const currentTimeUs = Date.now() * 1000;
5259
- const query = `
5260
- WITH latest_step_events AS (
5261
- SELECT DISTINCT ON (workflow_slug, run_id, step_id)
5262
- workflow_slug, run_id, step_id, type, timestamp_us, worker_id
5263
- FROM ${this.schema}.step_events
5293
+ const result = await client.query(`WITH latest_workflow_events AS (
5294
+ SELECT DISTINCT ON (workflow_slug, run_id)
5295
+ workflow_slug,
5296
+ run_id,
5297
+ type
5298
+ FROM ${this.schema}.workflow_events
5299
+ WHERE type IN (
5300
+ 'RunSubmitted',
5301
+ 'WorkflowRetryStarted',
5302
+ 'WorkflowStarted',
5303
+ 'WorkflowResumed',
5304
+ 'WorkflowCompleted',
5305
+ 'WorkflowFailed',
5306
+ 'WorkflowCancelled'
5307
+ )
5308
+ ORDER BY workflow_slug, run_id, timestamp_us DESC, event_id DESC
5309
+ ),
5310
+ running_runs AS (
5311
+ SELECT workflow_slug, run_id
5312
+ FROM latest_workflow_events
5313
+ WHERE type IN ('WorkflowStarted', 'WorkflowResumed')
5314
+ ),
5315
+ latest_step_events AS (
5316
+ SELECT DISTINCT ON (se.workflow_slug, se.run_id, se.step_id)
5317
+ se.workflow_slug,
5318
+ se.run_id,
5319
+ se.step_id,
5320
+ se.type,
5321
+ se.timestamp_us,
5322
+ se.worker_id,
5323
+ se.attempt_number
5324
+ FROM ${this.schema}.step_events se
5325
+ INNER JOIN running_runs rr
5326
+ ON rr.workflow_slug = se.workflow_slug
5327
+ AND rr.run_id = se.run_id
5328
+ WHERE se.type NOT IN ('LogEntry', 'StepCheckpoint', 'StepCheckpointFailed')
5329
+ ORDER BY se.workflow_slug, se.run_id, se.step_id, se.timestamp_us DESC, se.event_id DESC
5330
+ )
5331
+ SELECT workflow_slug, run_id, step_id, worker_id, attempt_number, timestamp_us
5332
+ FROM latest_step_events
5264
5333
  WHERE type IN ('StepStarted', 'StepHeartbeat')
5265
- ORDER BY workflow_slug, run_id, step_id, timestamp_us DESC, event_id DESC
5266
- )
5267
- SELECT workflow_slug, run_id, step_id, worker_id
5268
- FROM latest_step_events
5269
- WHERE timestamp_us < $1 AND worker_id IS NOT NULL
5270
- `;
5271
- const result = await client.query(query, [currentTimeUs - staleThresholdUs]);
5334
+ AND timestamp_us < $1
5335
+ AND worker_id IS NOT NULL`, [currentTimeUs - staleThresholdUs]);
5272
5336
  return result.rows.map((row) => ({
5273
5337
  workflowSlug: row.workflow_slug,
5274
5338
  runId: row.run_id,
5275
5339
  stepId: row.step_id,
5276
- workerId: row.worker_id
5340
+ workerId: row.worker_id,
5341
+ attemptNumber: row.attempt_number ?? 1,
5342
+ lastHeartbeatUs: Number(row.timestamp_us)
5277
5343
  }));
5278
5344
  } finally {
5279
5345
  client.release();
@@ -5480,16 +5546,27 @@ class DatabaseClient {
5480
5546
  async listActiveWorkflows() {
5481
5547
  const client = await this.pool.connect();
5482
5548
  try {
5483
- const result = await client.query(`
5484
- SELECT DISTINCT workflow_slug FROM (
5485
- SELECT DISTINCT workflow_slug FROM ${this.schema}.workflow_events
5486
- WHERE type IN ('RunSubmitted', 'WorkflowStarted', 'WorkflowResumed')
5487
- UNION
5488
- SELECT DISTINCT workflow_slug FROM ${this.schema}.step_events
5489
- WHERE type IN ('StepScheduled', 'StepStarted', 'StepReclaimed', 'StepRetrying')
5490
- ) AS active
5491
- ORDER BY workflow_slug ASC
5492
- `);
5549
+ const result = await client.query(`WITH latest_workflow_events AS (
5550
+ SELECT DISTINCT ON (workflow_slug, run_id)
5551
+ workflow_slug,
5552
+ run_id,
5553
+ type
5554
+ FROM ${this.schema}.workflow_events
5555
+ WHERE type IN (
5556
+ 'RunSubmitted',
5557
+ 'WorkflowRetryStarted',
5558
+ 'WorkflowStarted',
5559
+ 'WorkflowResumed',
5560
+ 'WorkflowCompleted',
5561
+ 'WorkflowFailed',
5562
+ 'WorkflowCancelled'
5563
+ )
5564
+ ORDER BY workflow_slug, run_id, timestamp_us DESC, event_id DESC
5565
+ )
5566
+ SELECT DISTINCT workflow_slug
5567
+ FROM latest_workflow_events
5568
+ WHERE type IN ('RunSubmitted', 'WorkflowRetryStarted', 'WorkflowStarted', 'WorkflowResumed')
5569
+ ORDER BY workflow_slug ASC`);
5493
5570
  return result.rows.map((row) => row.workflow_slug);
5494
5571
  } finally {
5495
5572
  client.release();
@@ -5630,6 +5707,15 @@ class DatabaseClient {
5630
5707
  type
5631
5708
  FROM ${this.schema}.workflow_events
5632
5709
  WHERE ($1::text IS NULL OR workflow_slug = $1)
5710
+ AND type IN (
5711
+ 'RunSubmitted',
5712
+ 'WorkflowRetryStarted',
5713
+ 'WorkflowStarted',
5714
+ 'WorkflowResumed',
5715
+ 'WorkflowCompleted',
5716
+ 'WorkflowFailed',
5717
+ 'WorkflowCancelled'
5718
+ )
5633
5719
  ORDER BY workflow_slug, run_id, timestamp_us DESC, event_id DESC
5634
5720
  )
5635
5721
  SELECT
@@ -5767,90 +5853,33 @@ class DatabaseClient {
5767
5853
  client.release();
5768
5854
  }
5769
5855
  }
5770
- async getActiveWorkersAggregation(options) {
5856
+ async getActiveWorkersAggregation(_options) {
5771
5857
  const client = await this.pool.connect();
5772
5858
  try {
5773
- const nowUs = Date.now() * 1000;
5774
- const staleThresholdUs = options?.staleThresholdUs ?? 30 * 1000 * 1000;
5775
- const includeInactive = options?.includeInactive ?? false;
5776
- const staleTimestamp = nowUs - staleThresholdUs;
5777
- const workerStatsResult = await client.query(`
5778
- WITH worker_activity AS (
5779
- -- All worker activity from StepStarted and StepHeartbeat events
5780
- SELECT
5781
- worker_id,
5782
- MAX(timestamp_us) AS last_seen_us
5783
- FROM ${this.schema}.step_events
5784
- WHERE worker_id IS NOT NULL
5785
- AND type IN ('StepStarted', 'StepHeartbeat')
5786
- ${options?.timeRange ? `AND timestamp_us >= $1 AND timestamp_us <= $2` : ""}
5787
- GROUP BY worker_id
5788
- ),
5789
- -- Steps started by each worker
5790
- steps_started AS (
5791
- SELECT DISTINCT
5792
- se.worker_id,
5793
- se.workflow_slug,
5794
- se.run_id,
5795
- se.step_id,
5796
- se.attempt_number
5797
- FROM ${this.schema}.step_events se
5798
- WHERE se.type = 'StepStarted'
5799
- AND se.worker_id IS NOT NULL
5800
- ),
5801
- -- Steps completed
5802
- completed_steps AS (
5803
- SELECT
5804
- ss.worker_id,
5805
- COUNT(*) AS completed_count
5806
- FROM steps_started ss
5807
- INNER JOIN ${this.schema}.step_events se
5808
- ON se.workflow_slug = ss.workflow_slug
5809
- AND se.run_id = ss.run_id
5810
- AND se.step_id = ss.step_id
5811
- AND se.attempt_number = ss.attempt_number
5812
- AND se.type = 'StepCompleted'
5813
- GROUP BY ss.worker_id
5814
- ),
5815
- -- Steps failed
5816
- failed_steps AS (
5817
- SELECT
5818
- ss.worker_id,
5819
- COUNT(*) AS failed_count
5820
- FROM steps_started ss
5821
- INNER JOIN ${this.schema}.step_events se
5822
- ON se.workflow_slug = ss.workflow_slug
5823
- AND se.run_id = ss.run_id
5824
- AND se.step_id = ss.step_id
5825
- AND se.attempt_number = ss.attempt_number
5826
- AND se.type = 'StepFailed'
5827
- GROUP BY ss.worker_id
5828
- ),
5829
- -- Reclamation counts (times this worker's steps were reclaimed)
5830
- reclaimed_counts AS (
5831
- SELECT
5832
- (event_data->>'originalWorkerId') AS worker_id,
5833
- COUNT(*) AS reclaimed_count
5834
- FROM ${this.schema}.step_events
5835
- WHERE type = 'StepReclaimed'
5836
- GROUP BY (event_data->>'originalWorkerId')
5859
+ const runningRunsResult = await client.query(`
5860
+ WITH latest_workflow_events AS (
5861
+ SELECT DISTINCT ON (workflow_slug, run_id)
5862
+ workflow_slug,
5863
+ run_id,
5864
+ type
5865
+ FROM ${this.schema}.workflow_events
5866
+ ORDER BY workflow_slug, run_id, timestamp_us DESC, event_id DESC
5837
5867
  )
5838
- SELECT
5839
- wa.worker_id,
5840
- wa.last_seen_us,
5841
- COALESCE(cs.completed_count, 0) AS total_steps_processed,
5842
- COALESCE(fs.failed_count, 0) AS failed_steps,
5843
- COALESCE(rc.reclaimed_count, 0) AS reclaimed_from_count
5844
- FROM worker_activity wa
5845
- LEFT JOIN completed_steps cs ON wa.worker_id = cs.worker_id
5846
- LEFT JOIN failed_steps fs ON wa.worker_id = fs.worker_id
5847
- LEFT JOIN reclaimed_counts rc ON wa.worker_id = rc.worker_id
5848
- ${!includeInactive ? `WHERE wa.last_seen_us >= ${staleTimestamp}` : ""}
5849
- ORDER BY wa.last_seen_us DESC
5850
- `, options?.timeRange ? [options.timeRange.startUs, options.timeRange.endUs] : []);
5868
+ SELECT workflow_slug, run_id
5869
+ FROM latest_workflow_events
5870
+ WHERE type IN ('WorkflowStarted', 'WorkflowResumed')
5871
+ LIMIT 500
5872
+ `);
5873
+ if (runningRunsResult.rows.length === 0) {
5874
+ return {
5875
+ workers: [],
5876
+ totalActiveWorkers: 0,
5877
+ totalRunningSteps: 0
5878
+ };
5879
+ }
5880
+ const runIds = runningRunsResult.rows.map((r) => r.run_id);
5851
5881
  const runningStepsResult = await client.query(`
5852
5882
  WITH latest_step_events AS (
5853
- -- Get the latest event per step (excluding LogEntry)
5854
5883
  SELECT DISTINCT ON (workflow_slug, run_id, step_id)
5855
5884
  workflow_slug,
5856
5885
  run_id,
@@ -5861,78 +5890,24 @@ class DatabaseClient {
5861
5890
  slot_index,
5862
5891
  worker_concurrency
5863
5892
  FROM ${this.schema}.step_events
5864
- WHERE type NOT IN ('LogEntry', 'StepCheckpoint', 'StepCheckpointFailed')
5893
+ WHERE run_id = ANY($1)
5894
+ AND type NOT IN ('LogEntry', 'StepCheckpoint', 'StepCheckpointFailed')
5865
5895
  ORDER BY workflow_slug, run_id, step_id, timestamp_us DESC, event_id DESC
5866
- ),
5867
- -- Steps currently running (latest event is StepStarted or StepHeartbeat)
5868
- running_steps AS (
5869
- SELECT
5870
- workflow_slug,
5871
- run_id,
5872
- step_id,
5873
- worker_id,
5874
- slot_index,
5875
- worker_concurrency
5876
- FROM latest_step_events
5877
- WHERE type IN ('StepStarted', 'StepHeartbeat')
5878
- AND worker_id IS NOT NULL
5879
- ),
5880
- -- Get start time and last heartbeat for each running step
5881
- step_times AS (
5882
- SELECT
5883
- rs.workflow_slug,
5884
- rs.run_id,
5885
- rs.step_id,
5886
- rs.worker_id,
5887
- rs.slot_index,
5888
- rs.worker_concurrency,
5889
- MIN(se.timestamp_us) FILTER (WHERE se.type = 'StepStarted') AS started_at_us,
5890
- MAX(se.timestamp_us) FILTER (WHERE se.type IN ('StepStarted', 'StepHeartbeat')) AS last_heartbeat_us
5891
- FROM running_steps rs
5892
- INNER JOIN ${this.schema}.step_events se
5893
- ON se.workflow_slug = rs.workflow_slug
5894
- AND se.run_id = rs.run_id
5895
- AND se.step_id = rs.step_id
5896
- AND se.type IN ('StepStarted', 'StepHeartbeat')
5897
- GROUP BY rs.workflow_slug, rs.run_id, rs.step_id, rs.worker_id, rs.slot_index, rs.worker_concurrency
5898
5896
  )
5899
5897
  SELECT
5900
5898
  worker_id,
5901
5899
  workflow_slug,
5902
5900
  run_id,
5903
5901
  step_id,
5904
- started_at_us,
5905
- last_heartbeat_us,
5902
+ timestamp_us as last_heartbeat_us,
5906
5903
  slot_index,
5907
5904
  worker_concurrency
5908
- FROM step_times
5909
- ORDER BY worker_id, last_heartbeat_us DESC
5910
- `);
5911
- const workerConcurrencyResult = await client.query(`
5912
- SELECT DISTINCT ON (worker_id)
5913
- worker_id,
5914
- worker_concurrency
5915
- FROM ${this.schema}.step_events
5916
- WHERE worker_id IS NOT NULL
5917
- AND worker_concurrency IS NOT NULL
5918
- AND type IN ('StepStarted', 'StepHeartbeat')
5919
- ORDER BY worker_id, timestamp_us DESC
5920
- `);
5921
- const workerConcurrencyMap = new Map;
5922
- for (const row of workerConcurrencyResult.rows) {
5923
- workerConcurrencyMap.set(row.worker_id, row.worker_concurrency);
5924
- }
5905
+ FROM latest_step_events
5906
+ WHERE type IN ('StepStarted', 'StepHeartbeat')
5907
+ AND worker_id IS NOT NULL
5908
+ LIMIT 1000
5909
+ `, [runIds]);
5925
5910
  const workerMap = new Map;
5926
- for (const row of workerStatsResult.rows) {
5927
- workerMap.set(row.worker_id, {
5928
- workerId: row.worker_id,
5929
- lastSeenUs: parseInt(row.last_seen_us, 10),
5930
- totalStepsProcessed: parseInt(row.total_steps_processed, 10),
5931
- failedSteps: parseInt(row.failed_steps, 10),
5932
- reclaimedFromCount: parseInt(row.reclaimed_from_count, 10),
5933
- activeSteps: []
5934
- });
5935
- }
5936
5911
  for (const row of runningStepsResult.rows) {
5937
5912
  let worker = workerMap.get(row.worker_id);
5938
5913
  if (!worker) {
@@ -5942,10 +5917,15 @@ class DatabaseClient {
5942
5917
  totalStepsProcessed: 0,
5943
5918
  failedSteps: 0,
5944
5919
  reclaimedFromCount: 0,
5920
+ workerConcurrency: row.worker_concurrency ?? undefined,
5945
5921
  activeSteps: []
5946
5922
  };
5947
5923
  workerMap.set(row.worker_id, worker);
5948
5924
  }
5925
+ const lastHeartbeatUs = parseInt(row.last_heartbeat_us, 10);
5926
+ if (lastHeartbeatUs > worker.lastSeenUs) {
5927
+ worker.lastSeenUs = lastHeartbeatUs;
5928
+ }
5949
5929
  if (row.worker_concurrency != null && worker.workerConcurrency == null) {
5950
5930
  worker.workerConcurrency = row.worker_concurrency;
5951
5931
  }
@@ -5953,19 +5933,11 @@ class DatabaseClient {
5953
5933
  workflowSlug: row.workflow_slug,
5954
5934
  runId: row.run_id,
5955
5935
  stepId: row.step_id,
5956
- startedAtUs: parseInt(row.started_at_us, 10),
5957
- lastHeartbeatUs: parseInt(row.last_heartbeat_us, 10),
5936
+ startedAtUs: lastHeartbeatUs,
5937
+ lastHeartbeatUs,
5958
5938
  slotIndex: row.slot_index ?? undefined
5959
5939
  });
5960
5940
  }
5961
- for (const worker of workerMap.values()) {
5962
- if (worker.workerConcurrency == null) {
5963
- const historicalConcurrency = workerConcurrencyMap.get(worker.workerId);
5964
- if (historicalConcurrency != null) {
5965
- worker.workerConcurrency = historicalConcurrency;
5966
- }
5967
- }
5968
- }
5969
5941
  const workers = Array.from(workerMap.values()).sort((a, b) => {
5970
5942
  if (b.activeSteps.length !== a.activeSteps.length) {
5971
5943
  return b.activeSteps.length - a.activeSteps.length;
@@ -5973,7 +5945,7 @@ class DatabaseClient {
5973
5945
  return b.lastSeenUs - a.lastSeenUs;
5974
5946
  });
5975
5947
  const totalRunningSteps = workers.reduce((sum, w) => sum + w.activeSteps.length, 0);
5976
- const totalActiveWorkers = workers.filter((w) => w.activeSteps.length > 0).length;
5948
+ const totalActiveWorkers = workers.length;
5977
5949
  return {
5978
5950
  workers,
5979
5951
  totalActiveWorkers,
@@ -5983,6 +5955,112 @@ class DatabaseClient {
5983
5955
  client.release();
5984
5956
  }
5985
5957
  }
5958
+ async getWorkerById(workerId) {
5959
+ const client = await this.pool.connect();
5960
+ try {
5961
+ const result = await client.query(`
5962
+ WITH worker_step_events AS (
5963
+ -- Get all step events for this worker
5964
+ SELECT
5965
+ workflow_slug,
5966
+ run_id,
5967
+ step_id,
5968
+ type,
5969
+ timestamp_us,
5970
+ slot_index,
5971
+ worker_concurrency
5972
+ FROM ${this.schema}.step_events
5973
+ WHERE worker_id = $1
5974
+ AND type IN ('StepStarted', 'StepHeartbeat')
5975
+ ),
5976
+ latest_per_step AS (
5977
+ -- For each step this worker touched, get the latest event
5978
+ SELECT DISTINCT ON (workflow_slug, run_id, step_id)
5979
+ workflow_slug,
5980
+ run_id,
5981
+ step_id,
5982
+ timestamp_us,
5983
+ slot_index,
5984
+ worker_concurrency
5985
+ FROM worker_step_events
5986
+ ORDER BY workflow_slug, run_id, step_id, timestamp_us DESC
5987
+ ),
5988
+ -- Check if these steps are still running (no completion/failure after our heartbeat)
5989
+ still_running AS (
5990
+ SELECT
5991
+ lps.workflow_slug,
5992
+ lps.run_id,
5993
+ lps.step_id,
5994
+ lps.timestamp_us as last_heartbeat_us,
5995
+ lps.slot_index,
5996
+ lps.worker_concurrency
5997
+ FROM latest_per_step lps
5998
+ WHERE NOT EXISTS (
5999
+ SELECT 1 FROM ${this.schema}.step_events se
6000
+ WHERE se.workflow_slug = lps.workflow_slug
6001
+ AND se.run_id = lps.run_id
6002
+ AND se.step_id = lps.step_id
6003
+ AND se.timestamp_us > lps.timestamp_us
6004
+ AND se.type IN ('StepCompleted', 'StepFailed', 'StepSkipped', 'StepReclaimed')
6005
+ )
6006
+ )
6007
+ SELECT * FROM still_running
6008
+ ORDER BY last_heartbeat_us DESC
6009
+ LIMIT 100
6010
+ `, [workerId]);
6011
+ if (result.rows.length === 0) {
6012
+ const lastSeenResult = await client.query(`
6013
+ SELECT MAX(timestamp_us) as last_seen_us, MAX(worker_concurrency) as worker_concurrency
6014
+ FROM ${this.schema}.step_events
6015
+ WHERE worker_id = $1
6016
+ AND type IN ('StepStarted', 'StepHeartbeat')
6017
+ `, [workerId]);
6018
+ if (!lastSeenResult.rows[0]?.last_seen_us) {
6019
+ return null;
6020
+ }
6021
+ return {
6022
+ workerId,
6023
+ lastSeenUs: parseInt(lastSeenResult.rows[0].last_seen_us, 10),
6024
+ totalStepsProcessed: 0,
6025
+ failedSteps: 0,
6026
+ reclaimedFromCount: 0,
6027
+ workerConcurrency: lastSeenResult.rows[0].worker_concurrency ?? undefined,
6028
+ activeSteps: []
6029
+ };
6030
+ }
6031
+ let lastSeenUs = 0;
6032
+ let workerConcurrency;
6033
+ const activeSteps = [];
6034
+ for (const row of result.rows) {
6035
+ const heartbeatUs = parseInt(row.last_heartbeat_us, 10);
6036
+ if (heartbeatUs > lastSeenUs) {
6037
+ lastSeenUs = heartbeatUs;
6038
+ }
6039
+ if (row.worker_concurrency != null && workerConcurrency == null) {
6040
+ workerConcurrency = row.worker_concurrency;
6041
+ }
6042
+ activeSteps.push({
6043
+ workflowSlug: row.workflow_slug,
6044
+ runId: row.run_id,
6045
+ stepId: row.step_id,
6046
+ startedAtUs: heartbeatUs,
6047
+ lastHeartbeatUs: heartbeatUs,
6048
+ slotIndex: row.slot_index ?? undefined
6049
+ });
6050
+ }
6051
+ return {
6052
+ workerId,
6053
+ lastSeenUs,
6054
+ totalStepsProcessed: 0,
6055
+ failedSteps: 0,
6056
+ reclaimedFromCount: 0,
6057
+ workerConcurrency,
6058
+ activeSteps
6059
+ };
6060
+ } finally {
6061
+ client.release();
6062
+ }
6063
+ }
5986
6064
  }
5987
6065
  function createPool(connectionString) {
5988
6066
  return new Pool2({ connectionString });
@@ -6384,6 +6462,87 @@ async function migration011_addWorkerConcurrencyIndex(pool, schema) {
6384
6462
  client.release();
6385
6463
  }
6386
6464
  }
6465
+ async function migration012_addWorkerAnalyticsIndexes(pool, schema) {
6466
+ const client = await pool.connect();
6467
+ try {
6468
+ await client.query(`
6469
+ CREATE INDEX IF NOT EXISTS idx_step_events_worker_activity
6470
+ ON ${schema}.step_events (timestamp_us, type, worker_id)
6471
+ WHERE worker_id IS NOT NULL
6472
+ AND type IN ('StepStarted', 'StepHeartbeat')
6473
+ `);
6474
+ await client.query(`
6475
+ CREATE INDEX IF NOT EXISTS idx_step_events_recent_by_step
6476
+ ON ${schema}.step_events (timestamp_us DESC, workflow_slug, run_id, step_id, type, event_id DESC)
6477
+ WHERE type NOT IN ('LogEntry', 'StepCheckpoint', 'StepCheckpointFailed')
6478
+ `);
6479
+ console.log("[Migration 012] Worker analytics indexes added successfully");
6480
+ } catch (error) {
6481
+ console.error("[Migration 012] Error adding worker analytics indexes:", error);
6482
+ throw error;
6483
+ } finally {
6484
+ client.release();
6485
+ }
6486
+ }
6487
+ async function migration013_addWorkersObservabilityIndexes(pool, schema) {
6488
+ const client = await pool.connect();
6489
+ try {
6490
+ await client.query(`
6491
+ CREATE INDEX IF NOT EXISTS idx_workflow_events_run_status
6492
+ ON ${schema}.workflow_events (run_id, timestamp_us DESC, event_id DESC)
6493
+ INCLUDE (workflow_slug, type)
6494
+ `);
6495
+ await client.query(`
6496
+ CREATE INDEX IF NOT EXISTS idx_step_events_by_run
6497
+ ON ${schema}.step_events (run_id, workflow_slug, step_id, timestamp_us DESC, event_id DESC)
6498
+ INCLUDE (type, worker_id, slot_index, worker_concurrency)
6499
+ WHERE type NOT IN ('LogEntry', 'StepCheckpoint', 'StepCheckpointFailed')
6500
+ `);
6501
+ console.log("[Migration 013] Workers observability indexes added successfully");
6502
+ } catch (error) {
6503
+ console.error("[Migration 013] Error adding workers observability indexes:", error);
6504
+ throw error;
6505
+ } finally {
6506
+ client.release();
6507
+ }
6508
+ }
6509
+ async function migration014_addWorkerHotPathIndexes(pool, schema) {
6510
+ const client = await pool.connect();
6511
+ try {
6512
+ await client.query(`
6513
+ CREATE INDEX IF NOT EXISTS idx_step_events_latest_cover
6514
+ ON ${schema}.step_events (workflow_slug, run_id, step_id, timestamp_us DESC, event_id DESC)
6515
+ INCLUDE (type, available_at_us, worker_id, attempt_number, priority)
6516
+ `);
6517
+ await client.query(`
6518
+ CREATE INDEX IF NOT EXISTS idx_step_events_terminal_latest
6519
+ ON ${schema}.step_events (workflow_slug, run_id, step_id, timestamp_us DESC, event_id DESC)
6520
+ WHERE type IN ('StepCompleted', 'StepFailed', 'StepSkipped', 'StepReclaimed')
6521
+ `);
6522
+ await client.query(`
6523
+ CREATE INDEX IF NOT EXISTS idx_workflow_events_active_latest
6524
+ ON ${schema}.workflow_events (type, workflow_slug, run_id, timestamp_us DESC, event_id DESC)
6525
+ WHERE type IN ('RunSubmitted', 'WorkflowRetryStarted', 'WorkflowStarted', 'WorkflowResumed')
6526
+ `);
6527
+ await client.query(`
6528
+ CREATE INDEX IF NOT EXISTS idx_workflow_events_run_chrono
6529
+ ON ${schema}.workflow_events (workflow_slug, run_id, timestamp_us ASC, event_id ASC)
6530
+ `);
6531
+ await client.query(`
6532
+ CREATE INDEX IF NOT EXISTS idx_workflow_events_version_lookup
6533
+ ON ${schema}.workflow_events (workflow_slug, run_id, timestamp_us DESC, event_id DESC)
6534
+ INCLUDE (version_id)
6535
+ WHERE type IN ('WorkflowStarted', 'RunSubmitted')
6536
+ AND version_id IS NOT NULL
6537
+ `);
6538
+ console.log("[Migration 014] Worker hot-path indexes added successfully");
6539
+ } catch (error) {
6540
+ console.error("[Migration 014] Error adding worker hot-path indexes:", error);
6541
+ throw error;
6542
+ } finally {
6543
+ client.release();
6544
+ }
6545
+ }
6387
6546
  async function runMigrations(pool, schema = "cascadeflow") {
6388
6547
  console.log(`[Migrations] Starting database migrations in schema '${schema}'...`);
6389
6548
  try {
@@ -6399,6 +6558,9 @@ async function runMigrations(pool, schema = "cascadeflow") {
6399
6558
  await migration009_addStepPriority(pool, schema);
6400
6559
  await migration010_addSlotTracking(pool, schema);
6401
6560
  await migration011_addWorkerConcurrencyIndex(pool, schema);
6561
+ await migration012_addWorkerAnalyticsIndexes(pool, schema);
6562
+ await migration013_addWorkersObservabilityIndexes(pool, schema);
6563
+ await migration014_addWorkerHotPathIndexes(pool, schema);
6402
6564
  console.log("[Migrations] All migrations completed successfully");
6403
6565
  } catch (error) {
6404
6566
  console.error("[Migrations] Migration failed:", error);
@@ -7264,16 +7426,6 @@ class PostgresBackend extends Backend {
7264
7426
  return !!(latestEvent && (latestEvent.type === "StepScheduled" || latestEvent.type === "StepReclaimed" || latestEvent.type === "StepRetrying"));
7265
7427
  }
7266
7428
  async claimScheduledStep(workflowSlug, runId, stepId, workerId, metadata) {
7267
- const initialEvents = await this.loadEvents(workflowSlug, runId, { category: "step", stepId });
7268
- if (initialEvents.length === 0) {
7269
- return null;
7270
- }
7271
- const now = getMicrosecondTimestamp();
7272
- const initialState = projectStepState(initialEvents, workflowSlug);
7273
- if (initialState.status !== "scheduled" || initialState.availableAt === undefined || initialState.availableAt > now) {
7274
- return null;
7275
- }
7276
- const attemptNumber = initialState.attemptNumber;
7277
7429
  const timestamp = getMicrosecondTimestamp();
7278
7430
  const event = {
7279
7431
  category: "step",
@@ -7285,43 +7437,34 @@ class PostgresBackend extends Backend {
7285
7437
  stepId,
7286
7438
  workerId,
7287
7439
  dependencies: metadata.dependencies,
7288
- attemptNumber,
7440
+ attemptNumber: metadata.attemptNumber,
7289
7441
  slotIndex: metadata.slotIndex,
7290
7442
  workerConcurrency: metadata.workerConcurrency
7291
7443
  };
7292
- const claimed = await this.db.claimScheduledStep(workflowSlug, runId, stepId, workerId, event);
7293
- return claimed ? { attemptNumber } : null;
7444
+ const claimedAttemptNumber = await this.db.claimScheduledStep(workflowSlug, runId, stepId, event);
7445
+ return claimedAttemptNumber !== null ? { attemptNumber: claimedAttemptNumber } : null;
7294
7446
  }
7295
7447
  async reclaimStaleSteps(staleThreshold, reclaimedBy) {
7296
7448
  const reclaimed = [];
7297
7449
  const now = getMicrosecondTimestamp();
7298
7450
  const staleSteps = await this.db.findStaleSteps(staleThreshold);
7299
7451
  for (const step of staleSteps) {
7300
- const events = await this.loadEvents(step.workflowSlug, step.runId, { category: "step", stepId: step.stepId });
7301
- if (events.length === 0)
7302
- continue;
7303
- const state = projectStepState(events, step.workflowSlug);
7304
- if (state.status !== "running")
7305
- continue;
7306
- const lastHeartbeat = state.lastHeartbeat || state.startTime || 0;
7307
- const staleDuration = now - lastHeartbeat;
7308
- if (staleDuration > staleThreshold) {
7309
- await this.saveStepReclaimed(step.workflowSlug, step.runId, step.stepId, {
7310
- originalWorkerId: state.claimedBy || "unknown",
7311
- reclaimedBy,
7312
- lastHeartbeat,
7313
- staleThreshold,
7314
- staleDuration,
7315
- attemptNumber: state.attemptNumber
7316
- });
7317
- await this.saveStepScheduled(step.workflowSlug, step.runId, step.stepId, {
7318
- availableAt: now,
7319
- reason: "retry",
7320
- attemptNumber: state.attemptNumber + 1,
7321
- retryDelayMs: 0
7322
- });
7323
- reclaimed.push({ workflowSlug: step.workflowSlug, runId: step.runId, stepId: step.stepId });
7324
- }
7452
+ const staleDuration = now - step.lastHeartbeatUs;
7453
+ await this.saveStepReclaimed(step.workflowSlug, step.runId, step.stepId, {
7454
+ originalWorkerId: step.workerId,
7455
+ reclaimedBy,
7456
+ lastHeartbeat: step.lastHeartbeatUs,
7457
+ staleThreshold,
7458
+ staleDuration,
7459
+ attemptNumber: step.attemptNumber
7460
+ });
7461
+ await this.saveStepScheduled(step.workflowSlug, step.runId, step.stepId, {
7462
+ availableAt: now,
7463
+ reason: "retry",
7464
+ attemptNumber: step.attemptNumber + 1,
7465
+ retryDelayMs: 0
7466
+ });
7467
+ reclaimed.push({ workflowSlug: step.workflowSlug, runId: step.runId, stepId: step.stepId });
7325
7468
  }
7326
7469
  return reclaimed;
7327
7470
  }
@@ -7754,9 +7897,25 @@ class PostgresBackend extends Backend {
7754
7897
  staleThresholdUs
7755
7898
  };
7756
7899
  }
7900
+ async getWorkerById(workerId) {
7901
+ const result = await this.db.getWorkerById(workerId);
7902
+ if (!result) {
7903
+ return null;
7904
+ }
7905
+ return {
7906
+ workerId: result.workerId,
7907
+ lastSeenUs: result.lastSeenUs,
7908
+ currentlyRunningSteps: result.activeSteps.length,
7909
+ totalStepsProcessed: result.totalStepsProcessed,
7910
+ failedSteps: result.failedSteps,
7911
+ reclaimedFromCount: result.reclaimedFromCount,
7912
+ workerConcurrency: result.workerConcurrency,
7913
+ activeSteps: result.activeSteps
7914
+ };
7915
+ }
7757
7916
  }
7758
7917
  export {
7759
7918
  PostgresBackend
7760
7919
  };
7761
7920
 
7762
- //# debugId=9C62728A85A6463664756E2164756E21
7921
+ //# debugId=70613711DE018DE364756E2164756E21