conductor_ruby 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +142 -0
- data/LICENSE +190 -0
- data/README.md +517 -0
- data/examples/agentic_workflows/llm_chat.rb +106 -0
- data/examples/dynamic_workflow.rb +177 -0
- data/examples/event_handler.rb +94 -0
- data/examples/event_listener_examples.rb +430 -0
- data/examples/helloworld/greetings_worker.rb +24 -0
- data/examples/helloworld/helloworld.rb +99 -0
- data/examples/kitchensink.rb +213 -0
- data/examples/metadata_journey.rb +189 -0
- data/examples/metrics_example.rb +284 -0
- data/examples/new_dsl_demo.rb +141 -0
- data/examples/orkes/http_poll.rb +83 -0
- data/examples/orkes/secrets_example.rb +69 -0
- data/examples/orkes/wait_for_webhook.rb +90 -0
- data/examples/prompt_journey.rb +245 -0
- data/examples/rag_workflow.rb +167 -0
- data/examples/schedule_journey.rb +244 -0
- data/examples/simple_worker.rb +125 -0
- data/examples/simple_workflow.rb +89 -0
- data/examples/task_context_example.rb +257 -0
- data/examples/task_listener_example.rb +192 -0
- data/examples/worker_configuration_example.rb +282 -0
- data/examples/workflow_dsl.rb +316 -0
- data/examples/workflow_ops.rb +305 -0
- data/lib/conductor/client/authorization_client.rb +238 -0
- data/lib/conductor/client/integration_client.rb +108 -0
- data/lib/conductor/client/metadata_client.rb +139 -0
- data/lib/conductor/client/prompt_client.rb +58 -0
- data/lib/conductor/client/scheduler_client.rb +132 -0
- data/lib/conductor/client/schema_client.rb +32 -0
- data/lib/conductor/client/secret_client.rb +48 -0
- data/lib/conductor/client/task_client.rb +168 -0
- data/lib/conductor/client/workflow_client.rb +242 -0
- data/lib/conductor/configuration/authentication_settings.rb +17 -0
- data/lib/conductor/configuration.rb +103 -0
- data/lib/conductor/exceptions.rb +86 -0
- data/lib/conductor/http/api/application_resource_api.rb +107 -0
- data/lib/conductor/http/api/authorization_resource_api.rb +56 -0
- data/lib/conductor/http/api/event_resource_api.rb +133 -0
- data/lib/conductor/http/api/gateway_auth_resource_api.rb +48 -0
- data/lib/conductor/http/api/group_resource_api.rb +76 -0
- data/lib/conductor/http/api/integration_resource_api.rb +145 -0
- data/lib/conductor/http/api/metadata_resource_api.rb +231 -0
- data/lib/conductor/http/api/prompt_resource_api.rb +81 -0
- data/lib/conductor/http/api/role_resource_api.rb +60 -0
- data/lib/conductor/http/api/scheduler_resource_api.rb +211 -0
- data/lib/conductor/http/api/schema_resource_api.rb +82 -0
- data/lib/conductor/http/api/secret_resource_api.rb +134 -0
- data/lib/conductor/http/api/task_resource_api.rb +321 -0
- data/lib/conductor/http/api/token_resource_api.rb +42 -0
- data/lib/conductor/http/api/user_resource_api.rb +59 -0
- data/lib/conductor/http/api/workflow_bulk_resource_api.rb +91 -0
- data/lib/conductor/http/api/workflow_resource_api.rb +451 -0
- data/lib/conductor/http/api_client.rb +437 -0
- data/lib/conductor/http/models/authentication_config.rb +67 -0
- data/lib/conductor/http/models/authorization_request.rb +39 -0
- data/lib/conductor/http/models/base_model.rb +162 -0
- data/lib/conductor/http/models/bulk_response.rb +39 -0
- data/lib/conductor/http/models/conductor_application.rb +39 -0
- data/lib/conductor/http/models/conductor_user.rb +53 -0
- data/lib/conductor/http/models/create_or_update_application_request.rb +24 -0
- data/lib/conductor/http/models/create_or_update_role_request.rb +27 -0
- data/lib/conductor/http/models/event_handler.rb +130 -0
- data/lib/conductor/http/models/generate_token_request.rb +27 -0
- data/lib/conductor/http/models/group.rb +36 -0
- data/lib/conductor/http/models/integration.rb +70 -0
- data/lib/conductor/http/models/integration_api.rb +53 -0
- data/lib/conductor/http/models/integration_api_update.rb +43 -0
- data/lib/conductor/http/models/integration_update.rb +36 -0
- data/lib/conductor/http/models/permission.rb +24 -0
- data/lib/conductor/http/models/poll_data.rb +33 -0
- data/lib/conductor/http/models/prompt_template.rb +59 -0
- data/lib/conductor/http/models/prompt_template_test_request.rb +43 -0
- data/lib/conductor/http/models/rerun_workflow_request.rb +37 -0
- data/lib/conductor/http/models/role.rb +27 -0
- data/lib/conductor/http/models/schema_def.rb +59 -0
- data/lib/conductor/http/models/search_result.rb +187 -0
- data/lib/conductor/http/models/skip_task_request.rb +27 -0
- data/lib/conductor/http/models/start_workflow_request.rb +68 -0
- data/lib/conductor/http/models/subject_ref.rb +35 -0
- data/lib/conductor/http/models/tag_object.rb +36 -0
- data/lib/conductor/http/models/target_ref.rb +39 -0
- data/lib/conductor/http/models/task.rb +156 -0
- data/lib/conductor/http/models/task_def.rb +95 -0
- data/lib/conductor/http/models/task_exec_log.rb +30 -0
- data/lib/conductor/http/models/task_result.rb +115 -0
- data/lib/conductor/http/models/task_result_status.rb +24 -0
- data/lib/conductor/http/models/token.rb +33 -0
- data/lib/conductor/http/models/upsert_group_request.rb +30 -0
- data/lib/conductor/http/models/upsert_user_request.rb +39 -0
- data/lib/conductor/http/models/workflow.rb +202 -0
- data/lib/conductor/http/models/workflow_def.rb +73 -0
- data/lib/conductor/http/models/workflow_schedule.rb +100 -0
- data/lib/conductor/http/models/workflow_state_update.rb +30 -0
- data/lib/conductor/http/models/workflow_status_constants.rb +57 -0
- data/lib/conductor/http/models/workflow_task.rb +169 -0
- data/lib/conductor/http/models/workflow_test_request.rb +67 -0
- data/lib/conductor/http/rest_client.rb +211 -0
- data/lib/conductor/orkes/models/access_key.rb +56 -0
- data/lib/conductor/orkes/models/granted_permission.rb +27 -0
- data/lib/conductor/orkes/models/metadata_tag.rb +15 -0
- data/lib/conductor/orkes/models/rate_limit_tag.rb +15 -0
- data/lib/conductor/orkes/orkes_clients.rb +69 -0
- data/lib/conductor/version.rb +5 -0
- data/lib/conductor/worker/events/conductor_event.rb +40 -0
- data/lib/conductor/worker/events/global_dispatcher.rb +37 -0
- data/lib/conductor/worker/events/http_events.rb +25 -0
- data/lib/conductor/worker/events/listener_registry.rb +40 -0
- data/lib/conductor/worker/events/listeners.rb +34 -0
- data/lib/conductor/worker/events/sync_event_dispatcher.rb +78 -0
- data/lib/conductor/worker/events/task_runner_events.rb +271 -0
- data/lib/conductor/worker/events/workflow_events.rb +49 -0
- data/lib/conductor/worker/fiber_executor.rb +532 -0
- data/lib/conductor/worker/ractor_task_runner.rb +501 -0
- data/lib/conductor/worker/task_context.rb +114 -0
- data/lib/conductor/worker/task_definition_registrar.rb +322 -0
- data/lib/conductor/worker/task_handler.rb +360 -0
- data/lib/conductor/worker/task_in_progress.rb +60 -0
- data/lib/conductor/worker/task_runner.rb +538 -0
- data/lib/conductor/worker/telemetry/metrics_collector.rb +196 -0
- data/lib/conductor/worker/telemetry/prometheus_backend.rb +224 -0
- data/lib/conductor/worker/worker.rb +355 -0
- data/lib/conductor/worker/worker_config.rb +154 -0
- data/lib/conductor/worker/worker_registry.rb +71 -0
- data/lib/conductor/workflow/dsl/input_ref.rb +37 -0
- data/lib/conductor/workflow/dsl/output_ref.rb +44 -0
- data/lib/conductor/workflow/dsl/parallel_builder.rb +49 -0
- data/lib/conductor/workflow/dsl/switch_builder.rb +74 -0
- data/lib/conductor/workflow/dsl/task_ref.rb +178 -0
- data/lib/conductor/workflow/dsl/workflow_builder.rb +1016 -0
- data/lib/conductor/workflow/dsl/workflow_definition.rb +150 -0
- data/lib/conductor/workflow/llm/chat_message.rb +47 -0
- data/lib/conductor/workflow/llm/embedding_model.rb +19 -0
- data/lib/conductor/workflow/llm/tool_call.rb +43 -0
- data/lib/conductor/workflow/llm/tool_spec.rb +46 -0
- data/lib/conductor/workflow/task_type.rb +68 -0
- data/lib/conductor/workflow/timeout_policy.rb +31 -0
- data/lib/conductor/workflow/workflow_executor.rb +373 -0
- data/lib/conductor.rb +192 -0
- metadata +359 -0
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Conductor
  module Worker
    # Result object a worker hands back while a task is still being processed.
    #
    # Returning a TaskInProgress is meant to leave the task in the IN_PROGRESS
    # state so that it is looked at again once callback_after_seconds have
    # elapsed. Intermediate data can be attached through +output+.
    #
    # @example Long-running task that is checked periodically
    #   def execute(task)
    #     ctx = TaskContext.current
    #
    #     # A non-zero poll count means the job was already started earlier
    #     if ctx.poll_count > 0
    #       if processing_complete?(task.input_data['job_id'])
    #         return { status: 'completed', result: get_result() }
    #       end
    #
    #       # Not finished yet - ask to be called back later
    #       return TaskInProgress.new(
    #         callback_after_seconds: 30,
    #         output: { status: 'processing', progress: get_progress() }
    #       )
    #     end
    #
    #     # First invocation - kick off the long-running job
    #     job_id = start_long_running_job(task.input_data)
    #
    #     TaskInProgress.new(
    #       callback_after_seconds: 60,
    #       output: { status: 'started', job_id: job_id }
    #     )
    #   end
    class TaskInProgress
      # @!attribute callback_after_seconds
      #   @return [Integer] seconds to wait before the task is polled again
      # @!attribute output
      #   @return [Hash, nil] intermediate output data
      attr_accessor :callback_after_seconds, :output

      # Build a TaskInProgress response.
      #
      # @param callback_after_seconds [Integer] delay before the next poll (default: 60)
      # @param output [Hash, nil] optional intermediate output data
      def initialize(callback_after_seconds: 60, output: nil)
        @callback_after_seconds, @output = callback_after_seconds, output
      end

      # Hash representation with the two attributes as symbol keys.
      #
      # @return [Hash]
      def to_h
        { callback_after_seconds: @callback_after_seconds, output: @output }
      end
    end
  end
end
|
|
@@ -0,0 +1,538 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'concurrent'
|
|
4
|
+
require 'logger'
|
|
5
|
+
require_relative '../client/task_client'
|
|
6
|
+
require_relative '../http/models/task'
|
|
7
|
+
require_relative '../http/models/task_result'
|
|
8
|
+
require_relative '../http/models/task_result_status'
|
|
9
|
+
require_relative '../exceptions'
|
|
10
|
+
require_relative 'task_context'
|
|
11
|
+
require_relative 'task_in_progress'
|
|
12
|
+
require_relative 'worker_config'
|
|
13
|
+
require_relative 'events/task_runner_events'
|
|
14
|
+
require_relative 'events/sync_event_dispatcher'
|
|
15
|
+
require_relative 'events/listener_registry'
|
|
16
|
+
|
|
17
|
+
module Conductor
|
|
18
|
+
module Worker
|
|
19
|
+
# TaskRunner - The core polling loop that runs in a dedicated thread
|
|
20
|
+
# Implements batch polling, adaptive backoff, capacity management, and event publishing
|
|
21
|
+
class TaskRunner
|
|
22
|
+
# Retry backoffs for task update (in seconds)
|
|
23
|
+
RETRY_BACKOFFS = [0, 10, 20, 30].freeze
|
|
24
|
+
|
|
25
|
+
# Maximum exponent for adaptive backoff to prevent overflow
|
|
26
|
+
MAX_BACKOFF_EXPONENT = 10
|
|
27
|
+
|
|
28
|
+
# Maximum auth failure backoff in seconds
|
|
29
|
+
MAX_AUTH_BACKOFF_SECONDS = 60
|
|
30
|
+
|
|
31
|
+
# @return [Worker] the worker this runner polls and executes tasks for
attr_reader :worker

# Reader kept for callers that use `runner.running`.
#
# No @running instance variable is assigned anywhere in this class, so a plain
# attr_reader would always answer nil. Report the shutdown flag instead, the
# same way #running? does.
#
# @return [Boolean] false once shutdown has been requested, true before that
def running
  !@shutdown.true?
end
|
|
32
|
+
|
|
33
|
+
# Build a runner bound to a single worker.
#
# @param worker [Worker] worker whose task type is polled and executed
# @param configuration [Configuration, nil] Conductor configuration; a new
#   Configuration is created when nil is given
# @param event_dispatcher [SyncEventDispatcher, nil] dispatcher used for runner
#   events; a new SyncEventDispatcher is created when omitted
# @param logger [Logger, nil] logger instance; falls back to #create_default_logger
def initialize(worker, configuration:, event_dispatcher: nil, logger: nil)
  @worker = worker
  @configuration = configuration || Configuration.new
  @event_dispatcher = event_dispatcher || Events::SyncEventDispatcher.new
  @logger = logger || create_default_logger

  # Client used for polling and updating tasks over the API
  @task_client = Client::TaskClient.new(@configuration)

  # Worker settings must be resolved first: the executor below is sized
  # from @max_workers, which apply_resolved_config assigns.
  apply_resolved_config(
    WorkerConfig.resolve(worker.task_definition_name, extract_worker_options(worker))
  )

  # Pool that runs the worker code; when the queue is full the submitting
  # thread runs the task itself (:caller_runs).
  pool_options = {
    min_threads: 1,
    max_threads: @max_workers,
    max_queue: @max_workers * 2,
    fallback_policy: :caller_runs
  }
  @executor = Concurrent::ThreadPoolExecutor.new(**pool_options)

  # Bookkeeping for in-flight work and backoff decisions
  @running_tasks = Concurrent::Set.new
  @consecutive_empty_polls = Concurrent::AtomicFixnum.new(0)
  @auth_failures = Concurrent::AtomicFixnum.new(0)
  @last_auth_failure_time = nil
  @last_poll_time = nil
  @poll_count = Concurrent::AtomicFixnum.new(0)
  @shutdown = Concurrent::AtomicBoolean.new(false)
  @mutex = Mutex.new
end
|
|
72
|
+
|
|
73
|
+
# Main polling loop; keeps calling #run_once until shutdown is requested.
#
# Errors raised by an iteration are logged, published as a
# ThreadUncaughtException event (publish_uncaught_exception swallows its own
# failures, so this cannot break the loop) and followed by a one second pause
# before the next attempt. After the loop ends, #cleanup is called.
def run
  @logger.info("Starting TaskRunner for '#{@worker.task_definition_name}' " \
               "(thread_count=#{@max_workers}, poll_interval=#{@poll_interval}ms)")

  # Register task definition if configured
  register_task_definition if @worker.register_task_def

  until @shutdown.true?
    begin
      run_once
    rescue StandardError => e
      @logger.error("Error in polling loop: #{e.message}")
      @logger.debug(e.backtrace.join("\n")) if e.backtrace
      # Surface the failure to listeners; previously this helper was never invoked.
      publish_uncaught_exception(e)
      sleep(1) # Brief pause before retrying
    end
  end

  cleanup
  @logger.info("TaskRunner for '#{@worker.task_definition_name}' stopped")
end
|
|
94
|
+
|
|
95
|
+
# Single iteration of the polling loop.
#
# Steps: drop finished futures, bail out briefly when every slot is busy,
# honour the adaptive backoff after empty polls, then batch-poll for as many
# tasks as there are free slots and submit each one for execution.
#
# Elapsed time since the previous poll is measured on the monotonic clock.
# With the wall clock, a backwards clock adjustment makes the elapsed value
# negative and the computed sleep correspondingly long.
def run_once
  # 1. Cleanup completed tasks
  cleanup_completed_tasks

  # 2. Check capacity
  current_capacity = @running_tasks.size
  if current_capacity >= @max_workers
    sleep(0.001) # 1ms sleep to prevent busy-waiting
    return
  end

  available_slots = @max_workers - current_capacity

  # 3. Adaptive backoff for empty polls
  if @consecutive_empty_polls.value.positive?
    backoff_ms = calculate_adaptive_backoff
    now = Process.clock_gettime(Process::CLOCK_MONOTONIC)
    # No previous poll recorded: treat the backoff as already served.
    elapsed_ms = @last_poll_time ? (now - @last_poll_time) * 1000 : backoff_ms

    if elapsed_ms < backoff_ms
      sleep_time = (backoff_ms - elapsed_ms) / 1000.0
      sleep([sleep_time, 0.001].max)
      return
    end
  end

  # 4. Batch poll for tasks
  @last_poll_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  tasks = batch_poll(available_slots)

  # 5. Submit tasks for execution
  if tasks.empty?
    @consecutive_empty_polls.increment
  else
    @consecutive_empty_polls.value = 0
    tasks.each do |task|
      submit_task(task)
    end
  end
end
|
|
135
|
+
|
|
136
|
+
# Request a stop by setting the shutdown flag; the loop in #run checks this
# flag before starting each iteration.
def shutdown
  @shutdown.make_true
end
|
|
140
|
+
|
|
141
|
+
# Whether the runner has not been asked to shut down yet.
#
# @return [Boolean] true until #shutdown has set the shutdown flag
def running?
  @shutdown.false?
end
|
|
146
|
+
|
|
147
|
+
private
|
|
148
|
+
|
|
149
|
+
# Build the fallback logger used when none is injected: writes to $stdout at
# INFO level, one "[timestamp] SEVERITY -- message" line per entry.
#
# @return [Logger]
def create_default_logger
  Logger.new($stdout).tap do |log|
    log.level = Logger::INFO
    log.formatter = proc do |severity, timestamp, _progname, message|
      "[#{timestamp.strftime('%Y-%m-%d %H:%M:%S')}] #{severity} -- #{message}\n"
    end
  end
end
|
|
159
|
+
|
|
160
|
+
# Collect the worker's current settings into a hash.
#
# Every key of Worker::DEFAULTS the worker responds to is read from the
# worker; keys it does not respond to are left out.
#
# @param worker [Worker] worker instance to read from
# @return [Hash] option name => value reported by the worker
def extract_worker_options(worker)
  Worker::DEFAULTS.each_key.with_object({}) do |option, collected|
    collected[option] = worker.send(option) if worker.respond_to?(option)
  end
end
|
|
170
|
+
|
|
171
|
+
# Copy resolved worker settings into instance variables.
#
# Reads :poll_interval, :thread_count (stored as @max_workers), :worker_id,
# :domain and :poll_timeout; a missing key leaves the matching variable nil.
#
# @param config [Hash] resolved configuration
def apply_resolved_config(config)
  @poll_interval, @max_workers, @worker_id, @domain =
    config.values_at(:poll_interval, :thread_count, :worker_id, :domain)
  @poll_timeout = config[:poll_timeout]
end
|
|
180
|
+
|
|
181
|
+
# Remove futures that have finished (fulfilled or rejected) from the set of
# running tasks and publish the new active-worker count when anything changed.
#
# The finished futures are collected from a snapshot first and deleted
# afterwards, so the set is never modified while it is being iterated.
def cleanup_completed_tasks
  finished = @running_tasks.to_a.select { |future| future.fulfilled? || future.rejected? }
  return if finished.empty?

  finished.each { |future| @running_tasks.delete(future) }
  publish_active_workers
end
|
|
192
|
+
|
|
193
|
+
# Backoff to apply after consecutive empty polls.
#
# Doubles with every empty poll (2**n, with n capped at
# MAX_BACKOFF_EXPONENT) and never exceeds @poll_interval.
#
# @return [Numeric] backoff in milliseconds
def calculate_adaptive_backoff
  empty_polls = @consecutive_empty_polls.value
  exponent = [empty_polls, MAX_BACKOFF_EXPONENT].min
  doubled = Float(2**exponent)
  [doubled, @poll_interval].min
end
|
|
199
|
+
|
|
200
|
+
# Poll the server for up to +count+ tasks of this worker's task type.
#
# Returns an empty array without contacting the server when the worker is
# paused (a TaskPaused event is published) or while the exponential backoff
# after authorization failures has not elapsed yet. Otherwise publishes
# PollStarted, performs the batch poll and publishes PollCompleted; errors are
# delegated to #handle_auth_failure / #handle_poll_failure and yield [].
#
# @param count [Integer] number of tasks to ask for
# @return [Array] polled tasks (empty when nothing was polled)
def batch_poll(count)
  if @worker.paused
    paused_event = Events::TaskPaused.new(task_type: @worker.task_definition_name)
    @event_dispatcher.publish(paused_event)
    return []
  end

  # Exponential backoff after authorization failures, capped at
  # MAX_AUTH_BACKOFF_SECONDS
  if @auth_failures.value.positive? && @last_auth_failure_time
    wait_seconds = [2**@auth_failures.value, MAX_AUTH_BACKOFF_SECONDS].min
    since_failure = Time.now - @last_auth_failure_time
    return [] if since_failure < wait_seconds
  end

  poll_started = Events::PollStarted.new(
    task_type: @worker.task_definition_name,
    worker_id: @worker_id,
    poll_count: @poll_count.value
  )
  @event_dispatcher.publish(poll_started)

  start_time = Time.now

  begin
    # Only pass the domain along when it is a non-empty string
    domain_param = @domain.to_s.empty? ? nil : @domain

    polled = @task_client.batch_poll_tasks(
      @worker.task_definition_name,
      count: count,
      timeout: @poll_timeout,
      worker_id: @worker_id,
      domain: domain_param
    )
    polled ||= []

    duration_ms = (Time.now - start_time) * 1000
    @poll_count.increment

    poll_completed = Events::PollCompleted.new(
      task_type: @worker.task_definition_name,
      duration_ms: duration_ms,
      tasks_received: polled.size
    )
    @event_dispatcher.publish(poll_completed)

    # A successful poll ends any auth-failure backoff
    @auth_failures.value = 0

    polled
  rescue AuthorizationError => e
    handle_auth_failure(e, start_time)
    []
  rescue StandardError => e
    handle_poll_failure(e, start_time)
    []
  end
end
|
|
260
|
+
|
|
261
|
+
# React to an authorization error raised while polling: bump the failure
# counter, remember when it happened, publish a PollFailure event and log the
# backoff that #batch_poll will apply.
#
# @param error [AuthorizationError] the error raised by the poll
# @param start_time [Time] when the poll started
def handle_auth_failure(error, start_time)
  @auth_failures.increment
  @last_auth_failure_time = Time.now
  elapsed_ms = (Time.now - start_time) * 1000

  failure_event = Events::PollFailure.new(
    task_type: @worker.task_definition_name,
    duration_ms: elapsed_ms,
    cause: error
  )
  @event_dispatcher.publish(failure_event)

  # Same formula #batch_poll uses to decide how long to stay quiet
  backoff = [2**@auth_failures.value, MAX_AUTH_BACKOFF_SECONDS].min
  @logger.warn("Auth failure ##{@auth_failures.value} for '#{@worker.task_definition_name}', " \
               "backing off #{backoff}s: #{error.message}")
end
|
|
279
|
+
|
|
280
|
+
# React to a non-authorization error raised while polling: publish a
# PollFailure event carrying the elapsed time and log the error.
#
# @param error [StandardError] the error raised by the poll
# @param start_time [Time] when the poll started
def handle_poll_failure(error, start_time)
  elapsed_ms = (Time.now - start_time) * 1000

  failure_event = Events::PollFailure.new(
    task_type: @worker.task_definition_name,
    duration_ms: elapsed_ms,
    cause: error
  )
  @event_dispatcher.publish(failure_event)

  @logger.error("Poll failed for '#{@worker.task_definition_name}': #{error.message}")
end
|
|
294
|
+
|
|
295
|
+
# Schedule a polled task on the executor and track its future so capacity can
# be computed; publishes the updated active-worker count afterwards.
#
# @param task [Object] task data as returned by the poll
def submit_task(task)
  @running_tasks << Concurrent::Future.execute(executor: @executor) { execute_and_update(task) }
  publish_active_workers
end
|
|
304
|
+
|
|
305
|
+
# Run a task through the worker and report its result back to the server.
#
# Nothing is sent when #execute_task yields nil, or when the result is
# IN_PROGRESS with a positive callback_after_seconds.
#
# NOTE(review): in the IN_PROGRESS case neither the callback delay nor any
# intermediate output is sent from here - confirm the server learns about the
# callback some other way, otherwise this early exit drops it.
#
# @param task [Object] task data as returned by the poll
def execute_and_update(task)
  result = execute_task(task)
  return if result.nil?

  deferred = result.status == Http::Models::TaskResultStatus::IN_PROGRESS &&
             result.callback_after_seconds&.positive?

  update_task_with_retry(result) unless deferred
end
|
|
319
|
+
|
|
320
|
+
# Execute a single task with the worker and build its TaskResult.
#
# A thread-local TaskContext is installed for the duration of the call and is
# always cleared afterwards, including when publishing the
# TaskExecutionStarted event raises. Errors raised by the worker are turned
# into failed results: NonRetryableError via #handle_non_retryable_error,
# any other StandardError via #handle_retryable_error.
#
# NOTE(review): if @worker.execute returns nil, merging the context data
# raises NoMethodError and the task is reported as a retryable failure -
# confirm whether nil is a legitimate worker result.
#
# @param task [Task, Hash] task from the poll; a Hash is converted with Task.from_hash
# @return [TaskResult] the worker's result or a failed result built from the error
def execute_task(task)
  task_obj = task.is_a?(Http::Models::Task) ? task : Http::Models::Task.from_hash(task)

  # Initial result that worker code can reach through the TaskContext
  initial_result = Http::Models::TaskResult.new
  initial_result.task_id = task_obj.task_id
  initial_result.workflow_instance_id = task_obj.workflow_instance_id
  initial_result.worker_id = @worker_id

  TaskContext.current = TaskContext.new(task_obj, initial_result)

  begin
    start_time = Time.now

    @event_dispatcher.publish(Events::TaskExecutionStarted.new(
                                task_type: @worker.task_definition_name,
                                task_id: task_obj.task_id,
                                worker_id: @worker_id,
                                workflow_instance_id: task_obj.workflow_instance_id
                              ))

    run_worker_for(task_obj, start_time)
  ensure
    # Pool threads are reused, so never leave a stale context behind
    TaskContext.clear
  end
end

# Runs the worker for one task, publishes TaskExecutionCompleted and maps
# raised errors to failed TaskResults.
def run_worker_for(task_obj, start_time)
  task_result = @worker.execute(task_obj)
  duration_ms = (Time.now - start_time) * 1000

  merge_context_into(task_result)
  output_size = calculate_output_size(task_result)

  @event_dispatcher.publish(Events::TaskExecutionCompleted.new(
                              task_type: @worker.task_definition_name,
                              task_id: task_obj.task_id,
                              worker_id: @worker_id,
                              workflow_instance_id: task_obj.workflow_instance_id,
                              duration_ms: duration_ms,
                              output_size_bytes: output_size
                            ))

  task_result
rescue NonRetryableError => e
  handle_non_retryable_error(task_obj, e, start_time)
rescue StandardError => e
  handle_retryable_error(task_obj, e, start_time)
end

# Copies logs and the callback delay recorded on the current TaskContext onto
# the worker's result.
def merge_context_into(task_result)
  ctx = TaskContext.current
  context_logs = ctx&.task_result&.logs

  if context_logs && !context_logs.empty?
    task_result.logs ||= []
    # Appending an array to itself would duplicate every entry, so skip the
    # merge when the result already shares the context's log array.
    task_result.logs.concat(context_logs) unless task_result.logs.equal?(context_logs)
  end

  task_result.callback_after_seconds ||= ctx&.callback_after_seconds
end
|
|
383
|
+
|
|
384
|
+
# Size of the result's output data once serialized to JSON.
#
# Yields 0 when there is no output data or when anything raises while
# reading or serializing it.
#
# @param task_result [TaskResult]
# @return [Integer] size in bytes
def calculate_output_size(task_result)
  payload = task_result.output_data
  payload ? payload.to_json.bytesize : 0
rescue StandardError
  0
end
|
|
394
|
+
|
|
395
|
+
# Build the result for a task whose worker raised NonRetryableError.
#
# Creates a terminal-failure TaskResult for the task, attaches a log line,
# publishes TaskExecutionFailure with is_retryable: false and logs a warning.
#
# @param task [Task] task that was being executed
# @param error [NonRetryableError] the raised error
# @param start_time [Time] when execution started
# @return [TaskResult] the failed result
def handle_non_retryable_error(task, error, start_time)
  elapsed_ms = (Time.now - start_time) * 1000

  failed = Http::Models::TaskResult.failed_with_terminal_error(error.message)
  failed.task_id = task.task_id
  failed.workflow_instance_id = task.workflow_instance_id
  failed.worker_id = @worker_id
  failed.log("NonRetryableError: #{error.class}: #{error.message}")

  failure_event = Events::TaskExecutionFailure.new(
    task_type: @worker.task_definition_name,
    task_id: task.task_id,
    worker_id: @worker_id,
    workflow_instance_id: task.workflow_instance_id,
    duration_ms: elapsed_ms,
    cause: error,
    is_retryable: false
  )
  @event_dispatcher.publish(failure_event)

  @logger.warn("Task #{task.task_id} failed with terminal error: #{error.message}")
  failed
end
|
|
422
|
+
|
|
423
|
+
# Build the result for a task whose worker raised an ordinary error.
#
# Creates a failed TaskResult for the task, logs the error class, message and
# the first five backtrace lines onto it, publishes TaskExecutionFailure with
# is_retryable: true and logs the failure.
#
# @param task [Task] task that was being executed
# @param error [StandardError] the raised error
# @param start_time [Time] when execution started
# @return [TaskResult] the failed result
def handle_retryable_error(task, error, start_time)
  elapsed_ms = (Time.now - start_time) * 1000

  failed = Http::Models::TaskResult.failed(error.message)
  failed.task_id = task.task_id
  failed.workflow_instance_id = task.workflow_instance_id
  failed.worker_id = @worker_id

  # An error without a backtrace contributes an empty trace
  trace = Array(error.backtrace).first(5).join("\n")
  failed.log("Error: #{error.class}: #{error.message}\n#{trace}")

  failure_event = Events::TaskExecutionFailure.new(
    task_type: @worker.task_definition_name,
    task_id: task.task_id,
    worker_id: @worker_id,
    workflow_instance_id: task.workflow_instance_id,
    duration_ms: elapsed_ms,
    cause: error,
    is_retryable: true
  )
  @event_dispatcher.publish(failure_event)

  @logger.error("Task #{task.task_id} failed: #{error.message}")
  failed
end
|
|
452
|
+
|
|
453
|
+
# Send a task result to the server, retrying with the delays listed in
# RETRY_BACKOFFS.
#
# Only the update call itself is retried. Publishing the TaskUpdateCompleted
# event happens outside the retried section: a listener error after a
# successful update is logged at debug level instead of causing the same
# result to be sent again and finally reported as lost.
#
# After the last failed attempt a fatal message is logged and a
# TaskUpdateFailure event is published.
#
# @param task_result [TaskResult] the result to send
def update_task_with_retry(task_result)
  last_attempt = RETRY_BACKOFFS.size - 1

  RETRY_BACKOFFS.each_with_index do |backoff, attempt|
    sleep(backoff) if backoff.positive?

    start_time = Time.now
    begin
      @task_client.update_task(task_result)
    rescue StandardError => e
      duration_ms = (Time.now - start_time) * 1000
      @logger.error("Task update failed (attempt #{attempt + 1}/#{RETRY_BACKOFFS.size}): #{e.message}")

      if attempt == last_attempt
        @logger.fatal("CRITICAL: Task update failed after #{RETRY_BACKOFFS.size} attempts. " \
                      "Task #{task_result.task_id} result is LOST.")
        publish_task_update_failure(task_result, e, duration_ms)
      end
      next
    end

    duration_ms = (Time.now - start_time) * 1000
    begin
      publish_task_update_completed(task_result, duration_ms)
    rescue StandardError => e
      # The update already succeeded; a telemetry problem must not trigger a resend
      @logger.debug { "Telemetry error (non-fatal): #{e.class}: #{e.message}" }
    end
    return # Success
  end
end
|
|
478
|
+
|
|
479
|
+
# Publish a TaskUpdateCompleted event for a result that was sent successfully.
#
# @param task_result [TaskResult] the result that was sent
# @param duration_ms [Numeric] how long the update call took, in milliseconds
def publish_task_update_completed(task_result, duration_ms)
  completed = Events::TaskUpdateCompleted.new(
    task_type: @worker.task_definition_name,
    task_id: task_result.task_id,
    worker_id: @worker_id,
    workflow_instance_id: task_result.workflow_instance_id,
    duration_ms: duration_ms
  )
  @event_dispatcher.publish(completed)
end
|
|
488
|
+
|
|
489
|
+
# Publish a TaskUpdateFailure event once every update attempt has failed.
# The event carries the result itself and reports RETRY_BACKOFFS.size as the
# retry count.
#
# @param task_result [TaskResult] the result that could not be sent
# @param error [StandardError] error raised by the last attempt
# @param duration_ms [Numeric] duration of the last attempt, in milliseconds
def publish_task_update_failure(task_result, error, duration_ms)
  failure = Events::TaskUpdateFailure.new(
    task_type: @worker.task_definition_name,
    task_id: task_result.task_id,
    worker_id: @worker_id,
    workflow_instance_id: task_result.workflow_instance_id,
    cause: error,
    retry_count: RETRY_BACKOFFS.size,
    task_result: task_result,
    duration_ms: duration_ms
  )
  @event_dispatcher.publish(failure)
end
|
|
501
|
+
|
|
502
|
+
# Publish an ActiveWorkersChanged event with the current number of tracked
# futures. Any error raised while doing so is logged at debug level and
# otherwise ignored.
def publish_active_workers
  changed = Events::ActiveWorkersChanged.new(
    task_type: @worker.task_definition_name,
    count: @running_tasks.size
  )
  @event_dispatcher.publish(changed)
rescue StandardError => e
  @logger.debug { "Telemetry error (non-fatal): #{e.class}: #{e.message}" }
end
|
|
510
|
+
|
|
511
|
+
# Publish a ThreadUncaughtException event for the given error. Any error
# raised while doing so is logged at debug level and otherwise ignored.
#
# @param error [StandardError] the uncaught error to report
def publish_uncaught_exception(error)
  uncaught = Events::ThreadUncaughtException.new(
    cause: error,
    task_type: @worker&.task_definition_name
  )
  @event_dispatcher.publish(uncaught)
rescue StandardError => e
  @logger.debug { "Telemetry error (non-fatal): #{e.class}: #{e.message}" }
end
|
|
519
|
+
|
|
520
|
+
# Placeholder for task definition registration; called from #run when the
# worker asks for it. Currently it only logs that the feature is missing.
def register_task_definition
  # TODO: Implement task definition registration
  @logger.info('Task definition registration not yet implemented')
end
|
|
525
|
+
|
|
526
|
+
# Release resources after the polling loop has ended.
#
# Shuts the executor down, waits up to five seconds for it to terminate,
# kills it if it is still not shut down, and then clears the event
# dispatcher. Any error raised along the way is logged as a warning.
#
# NOTE(review): the dispatcher may be passed in from outside and shared with
# other runners - confirm that clearing it here is intended.
def cleanup
  @executor.shutdown
  @executor.wait_for_termination(5)
  still_running = !@executor.shutdown?
  @executor.kill if still_running

  @event_dispatcher.clear
rescue StandardError => e
  @logger.warn("Error during cleanup: #{e.message}")
end
|
|
536
|
+
end
|
|
537
|
+
end
|
|
538
|
+
end
|