hatchet-sdk 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +5 -1
  3. data/CHANGELOG.md +24 -0
  4. data/lib/hatchet/clients/grpc/admin.rb +6 -6
  5. data/lib/hatchet/clients/grpc/dispatcher.rb +33 -8
  6. data/lib/hatchet/condition_converter.rb +20 -12
  7. data/lib/hatchet/contracts/dispatcher/dispatcher_pb.rb +3 -1
  8. data/lib/hatchet/contracts/dispatcher/dispatcher_services_pb.rb +1 -0
  9. data/lib/hatchet/contracts/v1/dispatcher_pb.rb +23 -1
  10. data/lib/hatchet/contracts/v1/dispatcher_services_pb.rb +2 -0
  11. data/lib/hatchet/contracts/v1/shared/condition_pb.rb +3 -1
  12. data/lib/hatchet/contracts/v1/shared/trigger_pb.rb +17 -0
  13. data/lib/hatchet/contracts/v1/workflows_pb.rb +4 -3
  14. data/lib/hatchet/contracts/v1/workflows_services_pb.rb +1 -0
  15. data/lib/hatchet/contracts/workflows/workflows_pb.rb +2 -4
  16. data/lib/hatchet/contracts/workflows/workflows_services_pb.rb +1 -1
  17. data/lib/hatchet/durable_context.rb +102 -33
  18. data/lib/hatchet/engine_version.rb +50 -0
  19. data/lib/hatchet/eviction_policy.rb +60 -0
  20. data/lib/hatchet/exceptions.rb +26 -0
  21. data/lib/hatchet/task.rb +7 -0
  22. data/lib/hatchet/version.rb +1 -1
  23. data/lib/hatchet/worker/durable_event_listener.rb +735 -0
  24. data/lib/hatchet/worker/durable_eviction/cache.rb +205 -0
  25. data/lib/hatchet/worker/durable_eviction/manager.rb +233 -0
  26. data/lib/hatchet/worker/runner.rb +278 -53
  27. data/lib/hatchet/worker_obj.rb +59 -3
  28. data/lib/hatchet/workflow.rb +8 -4
  29. data/lib/hatchet-sdk.rb +13 -3
  30. data/sig/hatchet/clients/grpc/dispatcher.rbs +2 -0
  31. data/sig/hatchet/durable_context.rbs +8 -2
  32. data/sig/hatchet/engine_version.rbs +12 -0
  33. data/sig/hatchet/eviction_policy.rbs +14 -0
  34. data/sig/hatchet/exceptions.rbs +12 -0
  35. data/sig/hatchet/task.rbs +2 -0
  36. data/sig/hatchet/worker/durable_event_listener.rbs +31 -0
  37. data/sig/hatchet/worker/durable_eviction/cache.rbs +41 -0
  38. data/sig/hatchet/worker/durable_eviction/manager.rbs +37 -0
  39. data/sig/hatchet/worker/runner.rbs +7 -1
  40. data/sig/hatchet/worker_obj.rbs +3 -0
  41. data/sig/hatchet/workflow.rbs +1 -1
  42. data/sig/hatchet-sdk.rbs +1 -1
  43. metadata +15 -4
@@ -0,0 +1,205 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "monitor"
4
+
5
+ module Hatchet
6
+ module WorkerRuntime
7
+ module DurableEviction
8
+ # Symbolic causes for an eviction decision; DurableEvictionCache.build_eviction_reason turns each one into a human-readable message.
9
+ module EvictionCause
10
+ TTL_EXCEEDED = :ttl_exceeded # used when a waiting run's wait time has reached its eviction policy's ttl
11
+ CAPACITY_PRESSURE = :capacity_pressure # used when the cache picks a run because the capacity-pressure check passed
12
+ WORKER_SHUTDOWN = :worker_shutdown # used when the manager evicts every waiting run (evict_all_waiting)
13
+ end
14
+
15
# Bookkeeping entry the cache keeps for one durable run invocation.
#
# Waits are reference-counted through +wait_count+: the record reports itself
# as waiting for as long as at least one wait is outstanding, so overlapping
# waits on the same run do not clear the waiting state when only one of them
# finishes.
class DurableRunRecord
  # Identity of the invocation; assigned once in the constructor.
  attr_reader :key, :step_run_id, :invocation_count, :eviction_policy, :registered_at

  # Wait bookkeeping; rewritten as the run enters and leaves waits.
  attr_accessor :wait_count, :waiting_since, :wait_kind, :wait_resource_id, :eviction_reason

  # Builds a record that is not waiting on anything yet.
  #
  # @param key [String] action key uniquely identifying this step run invocation
  # @param step_run_id [String]
  # @param invocation_count [Integer]
  # @param eviction_policy [Hatchet::EvictionPolicy, nil]
  # @param registered_at [Time]
  def initialize(key:, step_run_id:, invocation_count:, eviction_policy:, registered_at:)
    # Identity fields.
    @key = key
    @step_run_id = step_run_id
    @invocation_count = invocation_count
    @eviction_policy = eviction_policy
    @registered_at = registered_at

    # Wait state: no outstanding waits and no wait metadata.
    @wait_count = 0
    @waiting_since = nil
    @wait_kind = nil
    @wait_resource_id = nil
    @eviction_reason = nil
  end

  # @return [Boolean] true while at least one wait is outstanding
  def waiting?
    wait_count.positive?
  end
end
46
+
47
# Thread-safe, in-memory registry of durable run invocations and their wait
# state.
#
# Counterpart of the Python SDK's
# hatchet_sdk.worker.durable_eviction.cache.DurableEvictionCache. Every public
# instance method runs inside an internal Monitor. Records handed out by #get
# and #all_waiting are the live objects held by the cache, not copies.
class DurableEvictionCache
  def initialize
    @runs = {}
    @monitor = Monitor.new
  end

  # Register a new durable run invocation, replacing any record already
  # stored under +key+.
  #
  # @param key [String] action key for the invocation
  # @param step_run_id [String]
  # @param invocation_count [Integer]
  # @param now [Time] stored as the record's +registered_at+
  # @param eviction_policy [Hatchet::EvictionPolicy, nil]
  # @return [DurableRunRecord] the newly stored record
  def register_run(key, step_run_id:, invocation_count:, now:, eviction_policy:)
    @monitor.synchronize do
      @runs[key] = DurableRunRecord.new(
        key: key,
        step_run_id: step_run_id,
        invocation_count: invocation_count,
        eviction_policy: eviction_policy,
        registered_at: now,
      )
    end
  end

  # Unregister a durable run invocation.
  #
  # @return [DurableRunRecord, nil] the removed record, or nil if none existed
  def unregister_run(key)
    @monitor.synchronize { @runs.delete(key) }
  end

  # Fetch the record for a given key.
  #
  # @return [DurableRunRecord, nil]
  def get(key)
    @monitor.synchronize { @runs[key] }
  end

  # @return [Array<DurableRunRecord>] every record with an outstanding wait
  def all_waiting
    @monitor.synchronize { @runs.values.select(&:waiting?) }
  end

  # Look up the action key of the first record with the given step run id.
  #
  # @param step_run_id [String]
  # @return [String, nil] the action key for the matching record
  def find_key_by_step_run_id(step_run_id)
    @monitor.synchronize do
      match = @runs.find { |_key, rec| rec.step_run_id == step_run_id }
      match&.first
    end
  end

  # Mark the run as waiting (ref-counted). +waiting_since+ is only stamped by
  # the first outstanding wait; the wait kind and resource id are overwritten
  # on every call. Unknown keys are ignored.
  def mark_waiting(key, now:, wait_kind:, resource_id:)
    @monitor.synchronize do
      rec = @runs[key]
      return unless rec

      rec.wait_count += 1
      rec.waiting_since = now if rec.wait_count == 1
      rec.wait_kind = wait_kind
      rec.wait_resource_id = resource_id
    end
  end

  # Mark the run as active (decrement the wait counter). The counter floors at
  # zero so unmatched calls never underflow; wait metadata is cleared once no
  # wait remains. Unknown keys are ignored.
  #
  # +now+ is accepted for signature parity with #mark_waiting and is not used.
  def mark_active(key, now:)
    @monitor.synchronize do
      rec = @runs[key]
      return unless rec

      rec.wait_count = [rec.wait_count - 1, 0].max
      if rec.wait_count.zero?
        rec.waiting_since = nil
        rec.wait_kind = nil
        rec.wait_resource_id = nil
      end
    end
  end

  # Select an eviction candidate and stamp its +eviction_reason+.
  #
  # Only waiting runs that have an eviction policy are considered. Runs whose
  # wait has lasted at least their policy's ttl win first. Otherwise, and only
  # when the capacity-pressure check passes, runs whose policy allows capacity
  # eviction and that have waited at least +min_wait_for_capacity_eviction+
  # are considered. Within either group the lowest policy priority wins, ties
  # going to the earliest +waiting_since+.
  #
  # @param now [Time]
  # @param durable_slots [Integer]
  # @param reserve_slots [Integer]
  # @param min_wait_for_capacity_eviction [Numeric] seconds
  # @return [String, nil] The action key of the chosen candidate, or nil
  def select_eviction_candidate(now:, durable_slots:, reserve_slots:, min_wait_for_capacity_eviction:)
    @monitor.synchronize do
      # Every record past this filter has a non-nil policy, so the code below
      # reads the policy directly without further nil guards.
      waiting = @runs.values.select { |rec| rec.waiting? && !rec.eviction_policy.nil? }
      return nil if waiting.empty?

      ttl_eligible = waiting.select do |rec|
        ttl = rec.eviction_policy.ttl
        ttl && rec.waiting_since && (now - rec.waiting_since) >= ttl
      end

      unless ttl_eligible.empty?
        chosen = first_in_eviction_order(ttl_eligible, now)
        chosen.eviction_reason = DurableEvictionCache.build_eviction_reason(
          EvictionCause::TTL_EXCEEDED, chosen, ttl: chosen.eviction_policy.ttl,
        )
        return chosen.key
      end

      return nil unless capacity_pressure?(durable_slots, reserve_slots, waiting.length)

      capacity_candidates = waiting.select do |rec|
        rec.eviction_policy.allow_capacity_eviction &&
          rec.waiting_since &&
          (now - rec.waiting_since) >= min_wait_for_capacity_eviction
      end
      return nil if capacity_candidates.empty?

      chosen = first_in_eviction_order(capacity_candidates, now)
      chosen.eviction_reason = DurableEvictionCache.build_eviction_reason(
        EvictionCause::CAPACITY_PRESSURE, chosen,
      )
      chosen.key
    end
  end

  # Build a human-readable eviction reason string.
  #
  # @param cause [Symbol] one of the EvictionCause constants
  # @param rec [DurableRunRecord] supplies the wait kind and resource id
  # @param ttl [Numeric, nil] included in the TTL message when given
  # @return [String]
  # @raise [ArgumentError] if +cause+ is not a known eviction cause
  def self.build_eviction_reason(cause, rec, ttl: nil)
    wait_desc = rec.wait_kind || "unknown"
    wait_desc = "#{wait_desc}(#{rec.wait_resource_id})" if rec.wait_resource_id

    case cause
    when EvictionCause::TTL_EXCEEDED
      ttl_str = ttl ? " (#{ttl}s)" : ""
      "Wait TTL#{ttl_str} exceeded while waiting on #{wait_desc}"
    when EvictionCause::CAPACITY_PRESSURE
      "Worker at capacity while waiting on #{wait_desc}"
    when EvictionCause::WORKER_SHUTDOWN
      "Worker shutdown while waiting on #{wait_desc}"
    else
      raise ArgumentError, "Unknown eviction cause: #{cause}"
    end
  end

  private

  # Pick the record with the lowest policy priority, breaking ties by the
  # earliest waiting_since (falling back to +now+ when it is unset).
  def first_in_eviction_order(records, now)
    records.min_by { |rec| [rec.eviction_policy.priority, rec.waiting_since || now] }
  end

  # True when the number of waiting runs has reached the durable slot count
  # minus the reserved slots. Always false when either figure is non-positive.
  def capacity_pressure?(durable_slots, reserve_slots, waiting_count)
    return false if durable_slots <= 0

    max_waiting = durable_slots - reserve_slots
    return false if max_waiting <= 0

    waiting_count >= max_waiting
  end
end
203
+ end
204
+ end
205
+ end
@@ -0,0 +1,233 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "monitor"
4
+
5
+ require_relative "cache"
6
+
7
+ module Hatchet
8
+ module WorkerRuntime
9
+ module DurableEviction
10
# Tuning knobs for the background eviction loop. Instances freeze themselves
# at the end of construction.
#
# Readers:
# * +check_interval+ [Float] seconds between eviction checks.
# * +reserve_slots+ [Integer] slots to reserve from capacity-eviction
#   decisions.
# * +min_wait_for_capacity_eviction+ [Float] minimum seconds a run must have
#   been waiting before it becomes eligible for capacity-based eviction.
class DurableEvictionConfig
  attr_reader :check_interval, :reserve_slots, :min_wait_for_capacity_eviction

  # @param check_interval [Float] seconds between eviction checks
  # @param reserve_slots [Integer] slots held back from capacity eviction
  # @param min_wait_for_capacity_eviction [Float] minimum wait, in seconds,
  #   before capacity eviction may apply
  def initialize(check_interval: 1.0, reserve_slots: 0, min_wait_for_capacity_eviction: 10.0)
    @min_wait_for_capacity_eviction = min_wait_for_capacity_eviction
    @reserve_slots = reserve_slots
    @check_interval = check_interval
    freeze
  end
end

# Configuration built entirely from the constructor defaults.
DEFAULT_DURABLE_EVICTION_CONFIG = DurableEvictionConfig.new
29
+
30
# Drives eviction of durable tasks.
#
# A background thread wakes up every +check_interval+ seconds, asks the cache
# for eviction candidates, requests eviction from the server for each one and
# then cancels and unregisters the run locally.
#
# Counterpart of the Python SDK's
# hatchet_sdk.worker.durable_eviction.manager.DurableEvictionManager.
class DurableEvictionManager
  # @return [DurableEvictionCache]
  attr_reader :cache

  # @param durable_slots [Integer]
  # @param cancel_local [Proc] Called with the action key when the manager
  #   evicts a local run.
  # @param request_eviction_with_ack [Proc] Called with (action_key, DurableRunRecord)
  #   to send the eviction RPC to the server and block until acknowledged.
  # @param config [DurableEvictionConfig]
  # @param cache [DurableEvictionCache, nil] a fresh cache is created when nil
  # @param logger [Logger, nil]
  def initialize(
    durable_slots:,
    cancel_local:,
    request_eviction_with_ack:,
    config: DEFAULT_DURABLE_EVICTION_CONFIG,
    cache: nil,
    logger: nil
  )
    @logger = logger
    @config = config
    @durable_slots = durable_slots
    @cancel_local = cancel_local
    @request_eviction_with_ack = request_eviction_with_ack
    @cache = cache || DurableEvictionCache.new

    @stopped = false
    @tick_monitor = Monitor.new
    @thread = nil
  end

  # Start the background eviction ticker. Does nothing while a ticker thread
  # is still alive.
  def start
    return if @thread&.alive?

    @stopped = false
    @thread = Thread.new { run_loop }
  end

  # Signal the background thread to stop and wake it if it is alive. Does not
  # join. A ThreadError from the wakeup is swallowed.
  def stop
    @stopped = true
    ticker = @thread
    ticker.wakeup if ticker&.alive?
  rescue ThreadError
    nil
  end

  # Register a new durable run invocation, stamped with the current time.
  def register_run(key, step_run_id:, invocation_count:, eviction_policy:)
    @cache.register_run(
      key,
      step_run_id: step_run_id,
      invocation_count: invocation_count,
      now: now,
      eviction_policy: eviction_policy,
    )
  end

  # Unregister a durable run invocation.
  def unregister_run(key)
    @cache.unregister_run(key)
  end

  # Mark the run as waiting (increments the wait counter).
  def mark_waiting(key, wait_kind:, resource_id:)
    @cache.mark_waiting(key, now: now, wait_kind: wait_kind, resource_id: resource_id)
  end

  # Mark the run as active (decrements the wait counter).
  def mark_active(key)
    @cache.mark_active(key, now: now)
  end

  # Handle a server-initiated eviction notification. Nothing happens when no
  # run is registered for +step_run_id+, or when the registered record belongs
  # to a different invocation count.
  def handle_server_eviction(step_run_id, invocation_count)
    key = @cache.find_key_by_step_run_id(step_run_id)
    return unless key

    rec = @cache.get(key)
    other_invocation = rec && rec.invocation_count != invocation_count
    return if other_invocation

    @logger&.info(
      "DurableEvictionManager: server-initiated eviction for " \
      "step_run_id=#{step_run_id} invocation_count=#{invocation_count}",
    )
    evict_run(key)
  end

  # Stop the ticker and evict every currently-waiting durable run. Used during
  # graceful shutdown.
  #
  # @return [Integer] number of runs evicted
  def evict_all_waiting
    stop

    @cache.all_waiting.each { |rec| shutdown_evict(rec) }.length
  end

  private

  # Evict one waiting run during shutdown: stamp the reason, try the server
  # eviction request, then cancel locally.
  def shutdown_evict(rec)
    rec.eviction_reason = DurableEvictionCache.build_eviction_reason(
      EvictionCause::WORKER_SHUTDOWN, rec,
    )

    @logger&.debug(
      "DurableEvictionManager: shutdown-evicting durable run " \
      "step_run_id=#{rec.step_run_id} wait_kind=#{rec.wait_kind} " \
      "resource_id=#{rec.wait_resource_id}",
    )

    begin
      @request_eviction_with_ack.call(rec.key, rec)
    rescue StandardError => e
      @logger&.error(
        "DurableEvictionManager: failed to send eviction for " \
        "step_run_id=#{rec.step_run_id}: #{e.class}: #{e.message}",
      )
    end

    # The local cancel runs even when the server request failed, so the
    # future settles and shutdown doesn't hang.
    evict_run(rec.key)
  end

  # Cancel the run locally first, then drop it from the cache.
  def evict_run(key)
    @cancel_local.call(key)
    unregister_run(key)
  end

  # Body of the ticker thread: sleep, then tick, until stopped.
  def run_loop
    until @stopped
      sleep @config.check_interval
      # A stop may have arrived while sleeping; skip the tick in that case.
      tick_safe unless @stopped
    end
  rescue StandardError => e
    @logger&.error("DurableEvictionManager: run_loop exited: #{e.class}: #{e.message}")
  end

  # Run one tick, logging (rather than raising) any StandardError.
  def tick_safe
    tick
  rescue StandardError => e
    @logger&.error("DurableEvictionManager: error in eviction loop: #{e.class}: #{e.message}")
  end

  # Evict candidates one at a time until the cache offers none, or offers a
  # key this tick has already handled.
  def tick
    @tick_monitor.synchronize do
      handled = []

      loop do
        key = @cache.select_eviction_candidate(
          now: now,
          durable_slots: @durable_slots,
          reserve_slots: @config.reserve_slots,
          min_wait_for_capacity_eviction: @config.min_wait_for_capacity_eviction,
        )
        break if key.nil? || handled.include?(key)

        handled << key

        rec = @cache.get(key)
        next if rec.nil? || rec.eviction_policy.nil?

        @logger&.debug(
          "DurableEvictionManager: evicting durable run " \
          "step_run_id=#{rec.step_run_id} wait_kind=#{rec.wait_kind} " \
          "resource_id=#{rec.wait_resource_id} ttl=#{rec.eviction_policy.ttl} " \
          "capacity_allowed=#{rec.eviction_policy.allow_capacity_eviction}",
        )

        # Server request first; the local cancel only follows once it returns.
        @request_eviction_with_ack.call(key, rec)
        evict_run(key)
      end
    end
  end

  # Current time from the system clock.
  def now
    Time.now
  end
end
231
+ end
232
+ end
233
+ end