horsies 0.1.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- horsies/__init__.py +115 -0
- horsies/core/__init__.py +0 -0
- horsies/core/app.py +552 -0
- horsies/core/banner.py +144 -0
- horsies/core/brokers/__init__.py +5 -0
- horsies/core/brokers/listener.py +444 -0
- horsies/core/brokers/postgres.py +864 -0
- horsies/core/cli.py +624 -0
- horsies/core/codec/serde.py +575 -0
- horsies/core/errors.py +535 -0
- horsies/core/logging.py +90 -0
- horsies/core/models/__init__.py +0 -0
- horsies/core/models/app.py +268 -0
- horsies/core/models/broker.py +79 -0
- horsies/core/models/queues.py +23 -0
- horsies/core/models/recovery.py +101 -0
- horsies/core/models/schedule.py +229 -0
- horsies/core/models/task_pg.py +307 -0
- horsies/core/models/tasks.py +332 -0
- horsies/core/models/workflow.py +1988 -0
- horsies/core/models/workflow_pg.py +245 -0
- horsies/core/registry/tasks.py +101 -0
- horsies/core/scheduler/__init__.py +26 -0
- horsies/core/scheduler/calculator.py +267 -0
- horsies/core/scheduler/service.py +569 -0
- horsies/core/scheduler/state.py +260 -0
- horsies/core/task_decorator.py +615 -0
- horsies/core/types/status.py +38 -0
- horsies/core/utils/imports.py +203 -0
- horsies/core/utils/loop_runner.py +44 -0
- horsies/core/worker/current.py +17 -0
- horsies/core/worker/worker.py +1967 -0
- horsies/core/workflows/__init__.py +23 -0
- horsies/core/workflows/engine.py +2344 -0
- horsies/core/workflows/recovery.py +501 -0
- horsies/core/workflows/registry.py +97 -0
- horsies/py.typed +0 -0
- horsies-0.1.0a1.dist-info/METADATA +31 -0
- horsies-0.1.0a1.dist-info/RECORD +42 -0
- horsies-0.1.0a1.dist-info/WHEEL +5 -0
- horsies-0.1.0a1.dist-info/entry_points.txt +2 -0
- horsies-0.1.0a1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,268 @@
|
|
|
1
|
+
# app/core/models/app.py
|
|
2
|
+
from typing import List, Optional, Any
|
|
3
|
+
from pydantic import BaseModel, model_validator, Field
|
|
4
|
+
from horsies.core.models.queues import QueueMode, CustomQueueConfig
|
|
5
|
+
from horsies.core.models.broker import PostgresConfig
|
|
6
|
+
from horsies.core.models.recovery import RecoveryConfig
|
|
7
|
+
from horsies.core.models.schedule import ScheduleConfig
|
|
8
|
+
from horsies.core.errors import ConfigurationError, ErrorCode, ValidationReport, raise_collected
|
|
9
|
+
from urllib.parse import urlparse, urlunparse
|
|
10
|
+
import logging
|
|
11
|
+
import os
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class AppConfig(BaseModel):
    """Top-level application configuration.

    Bundles queue topology, broker connection, crash-recovery, and scheduler
    settings, and cross-validates the fields that interact (queue mode vs.
    custom queues, prefetch buffer vs. claim lease, cluster cap vs. prefetch).
    """

    queue_mode: QueueMode = QueueMode.DEFAULT
    custom_queues: Optional[List[CustomQueueConfig]] = None
    broker: PostgresConfig
    # Cluster-wide cap on concurrently RUNNING tasks across all queues. None = unlimited.
    cluster_wide_cap: Optional[int] = None
    # Prefetch buffer: 0 = hard cap mode (count RUNNING + CLAIMED), >0 = soft cap with lease
    prefetch_buffer: int = 0
    # Claim lease duration in ms. Required when prefetch_buffer > 0.
    # Prefetched claims expire after this duration and can be reclaimed by other workers.
    claim_lease_ms: Optional[int] = None
    recovery: RecoveryConfig = Field(default_factory=RecoveryConfig)
    schedule: Optional[ScheduleConfig] = Field(
        default=None, description='Scheduler configuration'
    )

    @model_validator(mode='after')
    def validate_queue_configuration(self):
        """Validate queue configuration based on queue mode.

        Collects all independent errors into a ValidationReport and raises
        them together via raise_collected(), so the user sees every problem
        in one pass instead of fixing them one at a time.
        """
        report = ValidationReport('config')

        if self.queue_mode == QueueMode.DEFAULT:
            if self.custom_queues is not None:
                report.add(ConfigurationError(
                    message='custom_queues must be None in DEFAULT mode',
                    code=ErrorCode.CONFIG_INVALID_QUEUE_MODE,
                    notes=['queue_mode=DEFAULT but custom_queues was provided'],
                    help_text='either remove custom_queues or set queue_mode=CUSTOM',
                ))
        elif self.queue_mode == QueueMode.CUSTOM:
            if self.custom_queues is None or len(self.custom_queues) == 0:
                report.add(ConfigurationError(
                    message='custom_queues required in CUSTOM mode',
                    code=ErrorCode.CONFIG_INVALID_QUEUE_MODE,
                    notes=['queue_mode=CUSTOM but custom_queues is empty or None'],
                    help_text='provide at least one CustomQueueConfig in custom_queues',
                ))
            else:
                # Validate unique queue names (only if queues exist)
                queue_names = [q.name for q in self.custom_queues]
                if len(queue_names) != len(set(queue_names)):
                    report.add(ConfigurationError(
                        message='duplicate queue names in custom_queues',
                        code=ErrorCode.CONFIG_INVALID_QUEUE_MODE,
                        notes=[f'queue names: {queue_names}'],
                        help_text='each queue name must be unique',
                    ))

        # Validate cluster_wide_cap if provided
        if self.cluster_wide_cap is not None and self.cluster_wide_cap <= 0:
            report.add(ConfigurationError(
                message='cluster_wide_cap must be positive',
                code=ErrorCode.CONFIG_INVALID_CLUSTER_CAP,
                notes=[f'got cluster_wide_cap={self.cluster_wide_cap}'],
                help_text='use a positive integer or None for unlimited',
            ))

        # Validate prefetch_buffer
        if self.prefetch_buffer < 0:
            report.add(ConfigurationError(
                message='prefetch_buffer must be non-negative',
                code=ErrorCode.CONFIG_INVALID_PREFETCH,
                notes=[f'got prefetch_buffer={self.prefetch_buffer}'],
                help_text='use 0 for hard cap mode or positive integer for soft cap',
            ))

        # Validate claim_lease_ms: required in soft cap mode, must be positive
        if self.prefetch_buffer > 0 and self.claim_lease_ms is None:
            report.add(ConfigurationError(
                message='claim_lease_ms required when prefetch_buffer > 0',
                code=ErrorCode.CONFIG_INVALID_PREFETCH,
                notes=[
                    f'prefetch_buffer={self.prefetch_buffer} but claim_lease_ms is None',
                ],
                help_text='set claim_lease_ms (e.g., 30000 for 30 seconds)',
            ))
        if self.claim_lease_ms is not None and self.claim_lease_ms <= 0:
            report.add(ConfigurationError(
                message='claim_lease_ms must be positive',
                code=ErrorCode.CONFIG_INVALID_PREFETCH,
                notes=[f'got claim_lease_ms={self.claim_lease_ms}'],
                help_text='use a positive integer in milliseconds',
            ))

        # Forbid claim_lease_ms in hard cap mode
        if self.prefetch_buffer == 0 and self.claim_lease_ms is not None:
            report.add(ConfigurationError(
                message='claim_lease_ms incompatible with hard cap mode',
                code=ErrorCode.CONFIG_INVALID_PREFETCH,
                notes=[
                    'prefetch_buffer=0 (hard cap mode)',
                    f'but claim_lease_ms={self.claim_lease_ms} was set',
                ],
                help_text='remove claim_lease_ms or set prefetch_buffer > 0',
            ))

        # Validate prefetch_buffer vs cluster_wide_cap conflict
        if self.cluster_wide_cap is not None and self.prefetch_buffer > 0:
            report.add(ConfigurationError(
                message='cluster_wide_cap incompatible with prefetch mode',
                code=ErrorCode.CONFIG_INVALID_CLUSTER_CAP,
                notes=[
                    f'cluster_wide_cap={self.cluster_wide_cap}',
                    f'prefetch_buffer={self.prefetch_buffer}',
                    'cluster_wide_cap requires hard cap mode (prefetch_buffer=0)',
                ],
                help_text='set prefetch_buffer=0 when using cluster_wide_cap',
            ))

        raise_collected(report)
        return self

    def log_config(self, logger: Optional[logging.Logger] = None) -> None:
        """
        Log the AppConfig in a human-readable format.
        Masks sensitive data like database passwords.

        Args:
            logger: Logger instance to use. If None, uses root logger.
        """
        # Skip in child processes so the config is not logged once per worker.
        if os.getenv('HORSIES_CHILD_PROCESS') == '1':
            return
        if logger is None:
            logger = logging.getLogger()

        formatted = self._format_for_logging()
        logger.info('AppConfig:\n%s', formatted)

    def _format_for_logging(self) -> str:
        """Internal helper to format the AppConfig for human-readable logging."""
        lines: list[str] = []

        # Queue mode and queues
        lines.append(f' queue_mode: {self.queue_mode.value.upper()}')
        if self.queue_mode == QueueMode.CUSTOM and self.custom_queues:
            lines.append(' custom_queues:')
            for queue in self.custom_queues:
                lines.append(
                    f' - {queue.name} (priority={queue.priority}, max_concurrency={queue.max_concurrency})'
                )

        # Cluster-wide cap and prefetch settings
        if self.cluster_wide_cap is not None:
            lines.append(f' cluster_wide_cap: {self.cluster_wide_cap}')
        lines.append(f' prefetch_buffer: {self.prefetch_buffer}')
        if self.claim_lease_ms is not None:
            lines.append(f' claim_lease_ms: {self.claim_lease_ms}ms')

        # Broker config with masked password
        masked_url = self._mask_database_url(self.broker.database_url)
        lines.append(' broker:')
        lines.append(f' database_url: {masked_url}')
        lines.append(f' pool_size: {self.broker.pool_size}')
        lines.append(f' max_overflow: {self.broker.max_overflow}')

        # Recovery config
        lines.append(' recovery:')
        lines.append(
            f' auto_requeue_stale_claimed: {self.recovery.auto_requeue_stale_claimed}'
        )
        lines.append(
            f' claimed_stale_threshold: {self.recovery.claimed_stale_threshold_ms}ms'
        )
        lines.append(
            f' auto_fail_stale_running: {self.recovery.auto_fail_stale_running}'
        )
        lines.append(
            f' running_stale_threshold: {self.recovery.running_stale_threshold_ms}ms'
        )
        lines.append(f' check_interval: {self.recovery.check_interval_ms}ms')
        lines.append(
            f' heartbeat_intervals: runner={self.recovery.runner_heartbeat_interval_ms}ms, claimer={self.recovery.claimer_heartbeat_interval_ms}ms'
        )

        # Schedule config
        if self.schedule is not None:
            lines.append(' schedule:')
            lines.append(f' enabled: {self.schedule.enabled}')
            lines.append(f' schedules: {len(self.schedule.schedules)} schedule(s)')
            if self.schedule.schedules:
                for sched in self.schedule.schedules:
                    pattern_desc = self._format_schedule_pattern(sched.pattern)
                    lines.append(
                        f' - {sched.name}: {sched.task_name} {pattern_desc}'
                    )
            lines.append(f' check_interval: {self.schedule.check_interval_seconds}s')

        return '\n'.join(lines)

    @staticmethod
    def _mask_database_url(url: str) -> str:
        """Mask password in database URL for secure logging."""
        try:
            parsed = urlparse(url)
            if parsed.password:
                # Replace password with asterisks
                netloc = parsed.netloc.replace(f':{parsed.password}@', ':***@')
                return urlunparse(
                    (
                        parsed.scheme,
                        netloc,
                        parsed.path,
                        parsed.params,
                        parsed.query,
                        parsed.fragment,
                    )
                )
            return url
        except Exception:
            # If parsing fails, fall back to a generic mask. Split on the FIRST
            # '@' only, so any later '@' in the path/query is preserved
            # (the old `url.split('@')[1]` silently dropped it).
            if '@' not in url:
                return url
            credentials, _, remainder = url.partition('@')
            return credentials.rsplit(':', 1)[0] + ':***@' + remainder

    @staticmethod
    def _format_schedule_pattern(pattern: Any) -> str:
        """Format schedule pattern for concise logging."""
        # NOTE(review): local import — presumably avoids a circular import with
        # the schedule module; confirm before hoisting to module level.
        from horsies.core.models.schedule import (
            IntervalSchedule,
            HourlySchedule,
            DailySchedule,
            WeeklySchedule,
            MonthlySchedule,
        )

        if isinstance(pattern, IntervalSchedule):
            parts: list[str] = []
            if pattern.days:
                parts.append(f'{pattern.days}d')
            if pattern.hours:
                parts.append(f'{pattern.hours}h')
            if pattern.minutes:
                parts.append(f'{pattern.minutes}m')
            if pattern.seconds:
                parts.append(f'{pattern.seconds}s')
            return f"every {' '.join(parts)}"
        elif isinstance(pattern, HourlySchedule):
            return f'hourly at :{pattern.minute:02d}:{pattern.second:02d}'
        elif isinstance(pattern, DailySchedule):
            return f"daily at {pattern.time.strftime('%H:%M:%S')}"
        elif isinstance(pattern, WeeklySchedule):
            days = ', '.join(d.value for d in pattern.days)
            return f"weekly on {days} at {pattern.time.strftime('%H:%M:%S')}"
        elif isinstance(pattern, MonthlySchedule):
            return (
                f"monthly on day {pattern.day} at {pattern.time.strftime('%H:%M:%S')}"
            )
        else:
            return str(pattern)
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
import warnings
|
|
2
|
+
from pydantic import BaseModel, Field, field_validator
|
|
3
|
+
from horsies.core.errors import ConfigurationError, ErrorCode
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class PostgresConfig(BaseModel):
    """Connection and pool settings for the PostgreSQL broker.

    Only the psycopg3 driver scheme ('postgresql+psycopg://') is accepted;
    any other scheme is rejected at validation time.
    """

    database_url: str = Field(..., description='The URL of the PostgreSQL database')
    pool_pre_ping: bool = Field(
        default=True, description='Whether to pre-ping the database connection pool'
    )
    pool_size: int = Field(
        default=30, description='The size of the database connection pool'
    )
    max_overflow: int = Field(
        default=30, description='The maximum number of connections to allow in the pool'
    )
    pool_timeout: int = Field(
        default=30, description='The timeout for acquiring a connection from the pool'
    )
    pool_recycle: int = Field(
        default=1800, description='The number of seconds to recycle connections'
    )
    echo: bool = Field(default=False, description='Whether to echo the SQL statements')

    @field_validator('database_url')
    @classmethod  # pydantic v2 convention: field validators are classmethods
    def validate_database_url(cls, v: str) -> str:
        """Reject database URLs that do not use the psycopg3 driver scheme.

        Raises:
            ConfigurationError: if the scheme is not 'postgresql+psycopg'.
        """
        if not v.startswith('postgresql+psycopg'):
            raise ConfigurationError(
                message='invalid database URL scheme',
                code=ErrorCode.BROKER_INVALID_URL,
                notes=[
                    f"got: {v.split('://')[0] if '://' in v else v[:20]}://...",
                    'horsies only supports psycopg3 (async PostgreSQL driver)',
                ],
                help_text="use 'postgresql+psycopg://user:pass@host/db'",
            )
        return v
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
# =============================================================================
|
|
41
|
+
# DEPRECATED: These exceptions are no longer raised by the broker.
|
|
42
|
+
# Use TaskResult with LibraryErrorCode.TASK_NOT_FOUND and LibraryErrorCode.WAIT_TIMEOUT instead.
|
|
43
|
+
# =============================================================================
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class TaskNotFoundError(Exception):
    """
    DEPRECATED: No longer raised by broker.get_result().

    The broker now returns TaskResult(err=TaskError(error_code=LibraryErrorCode.TASK_NOT_FOUND)).
    Check result.is_err() and result.err.error_code instead of catching this exception.
    """

    def __init__(self, *args: object) -> None:
        # Steer callers toward the TaskResult-based API before delegating
        # to the normal Exception constructor.
        deprecation_message = (
            'TaskNotFoundError is deprecated. '
            'Use TaskResult with LibraryErrorCode.TASK_NOT_FOUND instead.'
        )
        warnings.warn(deprecation_message, DeprecationWarning, stacklevel=2)
        super().__init__(*args)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class TaskTimeoutError(Exception):
    """
    DEPRECATED: No longer raised by broker.get_result().

    The broker now returns TaskResult(err=TaskError(error_code=LibraryErrorCode.WAIT_TIMEOUT)).
    Check result.is_err() and result.err.error_code instead of catching this exception.
    """

    def __init__(self, *args: object) -> None:
        # Steer callers toward the TaskResult-based API before delegating
        # to the normal Exception constructor.
        deprecation_message = (
            'TaskTimeoutError is deprecated. '
            'Use TaskResult with LibraryErrorCode.WAIT_TIMEOUT instead.'
        )
        warnings.warn(deprecation_message, DeprecationWarning, stacklevel=2)
        super().__init__(*args)
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
# app/core/models/queues.py
|
|
2
|
+
from enum import Enum
|
|
3
|
+
from pydantic import BaseModel, Field
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class QueueMode(Enum):
    """Queue topology selector: user-defined queues (CUSTOM) or the built-in default."""

    CUSTOM = 'custom'
    DEFAULT = 'default'
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class CustomQueueConfig(BaseModel):
    """Configuration for one user-defined task queue.

    Attributes:
        name: Name of the queue. Usage: `@task(queue="name")`.
        priority: 1 means first to be executed, 100 means last to be executed.
        max_concurrency: Max number of tasks that can run at the same time for
            this queue. App-level concurrency is still respected: with app-level
            concurrency 5 and queue-level 10, execution is still limited to 5.
    """

    name: str
    priority: int = Field(default=1, ge=1, le=100)
    max_concurrency: int = 5
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
# horsies/core/models/recovery.py
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
from typing import Annotated, Self
|
|
4
|
+
from pydantic import BaseModel, Field, model_validator
|
|
5
|
+
from horsies.core.errors import ConfigurationError, ErrorCode, ValidationReport, raise_collected
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class RecoveryConfig(BaseModel):
    """
    Configuration for automatic stale task handling and crash recovery.

    Stale task detection and recovery:
    - CLAIMED tasks that never start executing: safe to auto-requeue
    - RUNNING tasks that go stale: mark as FAILED (may not be idempotent)

    All time values are in milliseconds for consistency with timeout_ms.

    Fields:
    - auto_requeue_stale_claimed: If True, automatically requeue tasks stuck in CLAIMED
    - claimed_stale_threshold_ms: Milliseconds without heartbeat before CLAIMED task is stale
    - auto_fail_stale_running: If True, automatically mark stale RUNNING tasks as FAILED
    - running_stale_threshold_ms: Milliseconds without heartbeat before RUNNING task is stale
    - check_interval_ms: How often the reaper checks for stale tasks
    - runner_heartbeat_interval_ms: How often RUNNING tasks send heartbeats from inside the task process
    - claimer_heartbeat_interval_ms: How often CLAIMED tasks send heartbeats
    """

    auto_requeue_stale_claimed: bool = Field(
        default=True,
        description='Automatically requeue tasks stuck in CLAIMED (safe - user code never ran)',
    )
    claimed_stale_threshold_ms: Annotated[int, Field(ge=1_000, le=3_600_000)] = Field(
        default=120_000,  # 2 minutes
        description='Milliseconds without claimer heartbeat before CLAIMED task is considered stale (1s-1hr)',
    )

    auto_fail_stale_running: bool = Field(
        default=True,
        description='Automatically mark stale RUNNING tasks as FAILED (not safe to requeue)',
    )
    running_stale_threshold_ms: Annotated[int, Field(ge=1_000, le=7_200_000)] = Field(
        default=300_000,  # 5 minutes
        description='Milliseconds without runner heartbeat before RUNNING task is considered stale (1s-2hr)',
    )

    check_interval_ms: Annotated[int, Field(ge=1_000, le=600_000)] = Field(
        default=30_000,  # 30 seconds
        description='How often the reaper checks for stale tasks in milliseconds (1s-10min)',
    )

    # Descriptions below say "(1s-2min)" to match the actual ge=1_000 constraint;
    # the previous "(5s-2min)" text contradicted the enforced lower bound.
    runner_heartbeat_interval_ms: Annotated[int, Field(ge=1_000, le=120_000)] = Field(
        default=30_000,  # 30 seconds
        description=(
            'How often RUNNING tasks send heartbeats from inside the task process in milliseconds (1s-2min); '
            'increase stale thresholds for CPU/GIL-heavy tasks to avoid false positives'
        ),
    )

    claimer_heartbeat_interval_ms: Annotated[int, Field(ge=1_000, le=120_000)] = Field(
        default=30_000,  # 30 seconds
        description='How often worker sends heartbeats for CLAIMED tasks in milliseconds (1s-2min)',
    )

    @model_validator(mode='after')
    def validate_heartbeat_thresholds(self) -> Self:
        """Ensure stale thresholds are at least 2x heartbeat intervals for reliability.

        Collects both errors (if present) and raises them together.
        """
        report = ValidationReport('recovery')
        min_running = self.runner_heartbeat_interval_ms * 2
        min_claimed = self.claimer_heartbeat_interval_ms * 2

        # Validate runner heartbeat vs running stale threshold
        if self.running_stale_threshold_ms < min_running:
            report.add(ConfigurationError(
                message='running_stale_threshold_ms too low',
                code=ErrorCode.CONFIG_INVALID_RECOVERY,
                notes=[
                    f'running_stale_threshold_ms={self.running_stale_threshold_ms}ms ({self.running_stale_threshold_ms/1000:.1f}s)',
                    f'runner_heartbeat_interval_ms={self.runner_heartbeat_interval_ms}ms ({self.runner_heartbeat_interval_ms/1000:.1f}s)',
                    'threshold must be at least 2x heartbeat interval',
                ],
                help_text=f'set running_stale_threshold_ms >= {min_running}ms ({min_running/1000:.1f}s)',
            ))

        # Validate claimer heartbeat vs claimed stale threshold
        if self.claimed_stale_threshold_ms < min_claimed:
            report.add(ConfigurationError(
                message='claimed_stale_threshold_ms too low',
                code=ErrorCode.CONFIG_INVALID_RECOVERY,
                notes=[
                    f'claimed_stale_threshold_ms={self.claimed_stale_threshold_ms}ms ({self.claimed_stale_threshold_ms/1000:.1f}s)',
                    f'claimer_heartbeat_interval_ms={self.claimer_heartbeat_interval_ms}ms ({self.claimer_heartbeat_interval_ms/1000:.1f}s)',
                    'threshold must be at least 2x heartbeat interval',
                ],
                help_text=f'set claimed_stale_threshold_ms >= {min_claimed}ms ({min_claimed/1000:.1f}s)',
            ))

        raise_collected(report)
        return self
|