screwdriver-queue-service 5.0.3 → 6.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/default.yaml +2 -2
- package/docs/ARCHITECTURE_REDESIGN.md +214 -0
- package/docs/QS-REDIS-ATOMIC-REDESIGN.png +0 -0
- package/package.json +2 -1
- package/plugins/queue/scheduler.js +2 -8
- package/plugins/worker/lib/BlockedBy.js +144 -330
- package/plugins/worker/lib/LuaScriptLoader.js +232 -0
- package/plugins/worker/lib/jobs.js +74 -26
- package/plugins/worker/lib/lua/checkTimeout.lua +166 -0
- package/plugins/worker/lib/lua/lib/CollapseDecider.lua +155 -0
- package/plugins/worker/lib/lua/lib/DependencyResolver.lua +109 -0
- package/plugins/worker/lib/lua/lib/StateValidator.lua +179 -0
- package/plugins/worker/lib/lua/lib/TimeoutDecider.lua +161 -0
- package/plugins/worker/lib/lua/startBuild.lua +217 -0
- package/plugins/worker/lib/lua/stopBuild.lua +133 -0
- package/plugins/worker/lib/timeout.js +123 -68
- package/plugins/worker/worker.js +10 -10
package/config/default.yaml
CHANGED
|
@@ -172,8 +172,8 @@ queue:
|
|
|
172
172
|
|
|
173
173
|
plugins:
|
|
174
174
|
blockedBy:
|
|
175
|
-
# re-enqueue in
|
|
176
|
-
reenqueueWaitTime:
|
|
175
|
+
# re-enqueue in 10 seconds if blocked (0.167 minutes)
|
|
176
|
+
reenqueueWaitTime: 0.167
|
|
177
177
|
# job is blocking for maximum 120 mins = build timeout
|
|
178
178
|
blockTimeout: 120
|
|
179
179
|
# job blocked by itself
|
|
@@ -0,0 +1,214 @@
|
|
|
1
|
+
# BlockedBy Architecture Redesign
|
|
2
|
+
|
|
3
|
+
**Purpose**: Propose a cleaner, more maintainable architecture for the build start/stop Queue system (incl. BlockedBy, Collapse, Timeout)
|
|
4
|
+
**Design Proposal:**
|
|
5
|
+
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
## Current Architecture - Problems
|
|
9
|
+
|
|
10
|
+
### 1. **Complexity Analysis**
|
|
11
|
+
|
|
12
|
+
#### **Current Issues**
|
|
13
|
+
|
|
14
|
+
**A. State Scattered Across Redis Keys**
|
|
15
|
+
```
|
|
16
|
+
running_job_{jobId} # Am I running?
|
|
17
|
+
last_running_job_{jobId} # Who ran last?
|
|
18
|
+
waiting_job_{jobId} # Who's waiting?
|
|
19
|
+
deleted_{jobId}_{buildId} # Am I aborted?
|
|
20
|
+
buildConfigs[buildId] # What's my config?
|
|
21
|
+
timeoutConfigs[buildId] # When do I timeout?
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
**Problem**: No single source of truth. State reconstruction requires 6+ Redis reads.
|
|
25
|
+
|
|
26
|
+
**B. Implicit State Machine**
|
|
27
|
+
```javascript
|
|
28
|
+
// State is implicit in Redis key presence/absence
|
|
29
|
+
if (runningKey exists) → RUNNING
|
|
30
|
+
if (in waitingKey) → BLOCKED
|
|
31
|
+
if (deleteKey exists) → ABORTED
|
|
32
|
+
if (buildId < lastRunning && collapse) → COLLAPSED
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
**Problem**: State transitions scattered across code. Hard to reason about.
|
|
36
|
+
|
|
37
|
+
**C. Race Conditions Everywhere**
|
|
38
|
+
```javascript
|
|
39
|
+
// Non-atomic check-then-act
|
|
40
|
+
const value = await redis.get(key); // Read
|
|
41
|
+
if (value) {
|
|
42
|
+
await redis.set(otherKey, ...); // Write
|
|
43
|
+
}
|
|
44
|
+
// Another worker can interleave here!
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
**Problem**: Locking was added as a band-aid. The real issue is non-atomic operations.
|
|
48
|
+
|
|
49
|
+
**D. Mixed Concerns**
|
|
50
|
+
```javascript
|
|
51
|
+
async beforePerform() {
|
|
52
|
+
// Concern 1: Filtering (job ownership)
|
|
53
|
+
// Concern 2: Abort checking
|
|
54
|
+
// Concern 3: Collapse logic
|
|
55
|
+
// Concern 4: Blocking logic
|
|
56
|
+
// Concern 5: Queue management
|
|
57
|
+
// Concern 6: Lock management
|
|
58
|
+
// Concern 7: Status updates
|
|
59
|
+
// All in one 300-line function!
|
|
60
|
+
}
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
**Problem**: Single Responsibility Principle violated. Hard to test, modify, understand.
|
|
64
|
+
|
|
65
|
+
**E. Implicit Dependencies**
|
|
66
|
+
```
|
|
67
|
+
beforePerform → checkBlockingJob
|
|
68
|
+
→ blockedBySelf
|
|
69
|
+
→ collapseBuilds
|
|
70
|
+
→ reEnqueue
|
|
71
|
+
→ helper.updateBuildStatus
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
**Problem**: Deep call stack. Side effects hidden. Hard to trace execution flow.
|
|
75
|
+
|
|
76
|
+
---
|
|
77
|
+
|
|
78
|
+
## Proposed Architecture - Principles
|
|
79
|
+
|
|
80
|
+
### **Core Principles**
|
|
81
|
+
|
|
82
|
+
1. **Single Source of Truth** - One place holds authoritative state
|
|
83
|
+
2. **Explicit State Machine** - States and transitions clearly defined
|
|
84
|
+
3. **Atomic Operations** - Use Lua scripts or transactions
|
|
85
|
+
4. **Separation of Concerns** - Each class has one responsibility
|
|
86
|
+
5. **Immutable Events** - Events log what happened, states derive from events
|
|
87
|
+
6. **Testability** - Pure functions, dependency injection, clear interfaces
|
|
88
|
+
|
|
89
|
+
---
|
|
90
|
+
|
|
91
|
+
## Redesign Proposal
|
|
92
|
+
|
|
93
|
+
### **State Machine**
|
|
94
|
+
|
|
95
|
+
#### **Architecture**
|
|
96
|
+
|
|
97
|
+
```
|
|
98
|
+
┌─────────────────────────────────────────────────────────────┐
|
|
99
|
+
│ Build Lifecycle │
|
|
100
|
+
├─────────────────────────────────────────────────────────────┤
|
|
101
|
+
│ │
|
|
102
|
+
│ Events (Immutable Log) States (Derived) │
|
|
103
|
+
│ ──────────────────────── ────────────────── │
|
|
104
|
+
│ BuildEnqueued QUEUED │
|
|
105
|
+
│ BuildBlocked BLOCKED │
|
|
106
|
+
│ BuildUnblocked READY │
|
|
107
|
+
│ BuildStarted RUNNING │
|
|
108
|
+
│ BuildCompleted SUCCESS/FAILURE │
|
|
109
|
+
│ BuildAborted ABORTED │
|
|
110
|
+
│ BuildCollapsed COLLAPSED │
|
|
111
|
+
│ │
|
|
112
|
+
└─────────────────────────────────────────────────────────────┘
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
#### **Components**
|
|
116
|
+
|
|
117
|
+
**1. LuaScriptLoader** - Loads and executes Lua scripts on Redis server
|
|
118
|
+
|
|
119
|
+
**2. startBuild.lua** - Main script handling blocking/collapse/abort logic atomically
|
|
120
|
+
|
|
121
|
+
**3. checkTimeout.lua** - Timeout detection and cleanup script
|
|
122
|
+
|
|
123
|
+
**4. stopBuild.lua** - Main script handling cleanup for a stopped build
|
|
124
|
+
|
|
125
|
+
**5. Helper Modules** (Pure Logic - No Redis calls):
|
|
126
|
+
- **CollapseDecider.lua** - Build collapse logic
|
|
127
|
+
- **DependencyResolver.lua** - Dependency blocking logic
|
|
128
|
+
- **StateValidator.lua** - State transition validation
|
|
129
|
+
- **TimeoutDecider.lua** - Timeout calculation logic
|
|
130
|
+
|
|
131
|
+
### **Future Phase: Introduce Event Log**
|
|
132
|
+
|
|
133
|
+
**Goal**: Add event sourcing for observability
|
|
134
|
+
|
|
135
|
+
**Changes**:
|
|
136
|
+
```
|
|
137
|
+
1. Add Redis Streams for events
|
|
138
|
+
build:events:{buildId} → stream
|
|
139
|
+
|
|
140
|
+
2. Record all state changes as events
|
|
141
|
+
- BuildEnqueued
|
|
142
|
+
- BuildBlocked
|
|
143
|
+
- BuildStarted
|
|
144
|
+
- BuildCompleted
|
|
145
|
+
|
|
146
|
+
3. Keep existing state keys (dual-write)
|
|
147
|
+
- Events for debugging
|
|
148
|
+
- Keys for fast reads
|
|
149
|
+
|
|
150
|
+
4. Build debugging tools (maybe)
|
|
151
|
+
- Event replay UI
|
|
152
|
+
- State reconstruction
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
## Trade-offs & Decisions
|
|
156
|
+
|
|
157
|
+
### **Lua Scripts**
|
|
158
|
+
|
|
159
|
+
**Pros**:
|
|
160
|
+
- Atomic operations
|
|
161
|
+
- No locks needed
|
|
162
|
+
- Better performance
|
|
163
|
+
- Simpler code
|
|
164
|
+
|
|
165
|
+
**Cons**:
|
|
166
|
+
- Lua learning curve
|
|
167
|
+
- Harder to debug
|
|
168
|
+
- Can't use debugger
|
|
169
|
+
|
|
170
|
+
**Decision**: YES
|
|
171
|
+
- Lua is simple (we write it once)
|
|
172
|
+
- Atomicity > debuggability
|
|
173
|
+
- Can test Lua separately
|
|
174
|
+
- Worth the complexity reduction
|
|
175
|
+
|
|
176
|
+
### **Event Sourcing**
|
|
177
|
+
|
|
178
|
+
**Pros**:
|
|
179
|
+
- Full audit trail
|
|
180
|
+
- Easy debugging
|
|
181
|
+
- Can replay state
|
|
182
|
+
- Append-only (fast)
|
|
183
|
+
|
|
184
|
+
**Cons**:
|
|
185
|
+
- More storage
|
|
186
|
+
- Event schema versioning
|
|
187
|
+
- Eventual consistency?
|
|
188
|
+
|
|
189
|
+
**Decision**: NOT NOW (we can add it in a later phase); the points below are why it is still worth revisiting:
|
|
190
|
+
- Redis Streams are cheap
|
|
191
|
+
- Debugging builds is easier
|
|
192
|
+
- Can keep short retention (7 days)
|
|
193
|
+
|
|
194
|
+
---
|
|
195
|
+
|
|
196
|
+
### **Architecture Diagram**
|
|
197
|
+
|
|
198
|
+

|
|
199
|
+
|
|
200
|
+
KEY BENEFITS:
|
|
201
|
+
1 Redis roundtrip (was 6+)
|
|
202
|
+
Zero race conditions (atomic execution)
|
|
203
|
+
No distributed locks (eliminated Redlock)
|
|
204
|
+
Modular design (reusable helper modules)
|
|
205
|
+
|
|
206
|
+
---
|
|
207
|
+
|
|
208
|
+
## Success Metrics
|
|
209
|
+
|
|
210
|
+
- Redis roundtrips: **6+ reduced to 1** (single Lua script execution)
|
|
211
|
+
- Lock contention: **Eliminated entirely** (no Redlock)
|
|
212
|
+
- Race conditions: **Zero** (Lua atomicity guarantees)
|
|
213
|
+
|
|
214
|
+
---
|
|
Binary file
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "screwdriver-queue-service",
|
|
3
|
-
"version": "5.0.3",
|
|
3
|
+
"version": "6.0.1",
|
|
4
4
|
"description": "Screwdriver Queue Service API",
|
|
5
5
|
"main": "app.js",
|
|
6
6
|
"directories": {
|
|
@@ -51,6 +51,7 @@
|
|
|
51
51
|
"mocha-sonarqube-reporter": "^1.0.2",
|
|
52
52
|
"mockery": "^2.1.0",
|
|
53
53
|
"nyc": "^15.1.0",
|
|
54
|
+
"redis-memory-server": "^0.13.0",
|
|
54
55
|
"sinon": "^15.0.0",
|
|
55
56
|
"snyk": "^1.814.0",
|
|
56
57
|
"util": "^0.12.5"
|
|
@@ -493,7 +493,7 @@ async function start(executor, config) {
|
|
|
493
493
|
{
|
|
494
494
|
buildId,
|
|
495
495
|
jobId,
|
|
496
|
-
blockedBy
|
|
496
|
+
blockedBy,
|
|
497
497
|
blockedBySameJob,
|
|
498
498
|
blockedBySameJobWaitTime
|
|
499
499
|
}
|
|
@@ -758,13 +758,7 @@ async function stopTimer(executor, config) {
|
|
|
758
758
|
async function stop(executor, config) {
|
|
759
759
|
await executor.connect();
|
|
760
760
|
|
|
761
|
-
const { buildId, jobId } = config; // in case config contains something else
|
|
762
|
-
|
|
763
|
-
let blockedBy;
|
|
764
|
-
|
|
765
|
-
if (config.blockedBy !== undefined) {
|
|
766
|
-
blockedBy = config.blockedBy.toString();
|
|
767
|
-
}
|
|
761
|
+
const { buildId, jobId, blockedBy } = config; // in case config contains something else
|
|
768
762
|
|
|
769
763
|
const numDeleted = await executor.queueBreaker.runCommand('del', executor.buildQueue, 'start', [
|
|
770
764
|
{
|