django-nativemojo 0.1.15__py3-none-any.whl → 0.1.16__py3-none-any.whl

Files changed (221)
  1. {django_nativemojo-0.1.15.dist-info → django_nativemojo-0.1.16.dist-info}/METADATA +3 -1
  2. django_nativemojo-0.1.16.dist-info/RECORD +302 -0
  3. mojo/__init__.py +1 -1
  4. mojo/apps/account/management/commands/serializer_admin.py +121 -1
  5. mojo/apps/account/migrations/0006_add_device_tracking_models.py +72 -0
  6. mojo/apps/account/migrations/0007_delete_userdevicelocation.py +16 -0
  7. mojo/apps/account/migrations/0008_userdevicelocation.py +33 -0
  8. mojo/apps/account/migrations/0009_geolocatedip_subnet.py +18 -0
  9. mojo/apps/account/migrations/0010_group_avatar.py +20 -0
  10. mojo/apps/account/migrations/0011_user_org_registereddevice_pushconfig_and_more.py +118 -0
  11. mojo/apps/account/migrations/0012_remove_pushconfig_apns_key_file_and_more.py +21 -0
  12. mojo/apps/account/migrations/0013_pushconfig_test_mode_alter_pushconfig_apns_enabled_and_more.py +28 -0
  13. mojo/apps/account/migrations/0014_notificationdelivery_data_payload_and_more.py +48 -0
  14. mojo/apps/account/models/__init__.py +2 -0
  15. mojo/apps/account/models/device.py +281 -0
  16. mojo/apps/account/models/group.py +294 -8
  17. mojo/apps/account/models/member.py +14 -1
  18. mojo/apps/account/models/push/__init__.py +4 -0
  19. mojo/apps/account/models/push/config.py +112 -0
  20. mojo/apps/account/models/push/delivery.py +93 -0
  21. mojo/apps/account/models/push/device.py +66 -0
  22. mojo/apps/account/models/push/template.py +99 -0
  23. mojo/apps/account/models/user.py +190 -17
  24. mojo/apps/account/rest/__init__.py +2 -0
  25. mojo/apps/account/rest/device.py +39 -0
  26. mojo/apps/account/rest/group.py +8 -0
  27. mojo/apps/account/rest/push.py +187 -0
  28. mojo/apps/account/rest/user.py +95 -5
  29. mojo/apps/account/services/__init__.py +1 -0
  30. mojo/apps/account/services/push.py +363 -0
  31. mojo/apps/aws/migrations/0001_initial.py +206 -0
  32. mojo/apps/aws/migrations/0002_emaildomain_can_recv_emaildomain_can_send_and_more.py +28 -0
  33. mojo/apps/aws/migrations/0003_mailbox_is_domain_default_mailbox_is_system_default_and_more.py +31 -0
  34. mojo/apps/aws/migrations/0004_s3bucket.py +39 -0
  35. mojo/apps/aws/migrations/0005_alter_emaildomain_region_delete_s3bucket.py +21 -0
  36. mojo/apps/aws/models/__init__.py +19 -0
  37. mojo/apps/aws/models/email_attachment.py +99 -0
  38. mojo/apps/aws/models/email_domain.py +218 -0
  39. mojo/apps/aws/models/email_template.py +132 -0
  40. mojo/apps/aws/models/incoming_email.py +197 -0
  41. mojo/apps/aws/models/mailbox.py +288 -0
  42. mojo/apps/aws/models/sent_message.py +175 -0
  43. mojo/apps/aws/rest/__init__.py +6 -0
  44. mojo/apps/aws/rest/email.py +33 -0
  45. mojo/apps/aws/rest/email_ops.py +183 -0
  46. mojo/apps/aws/rest/messages.py +32 -0
  47. mojo/apps/aws/rest/send.py +101 -0
  48. mojo/apps/aws/rest/sns.py +403 -0
  49. mojo/apps/aws/rest/templates.py +19 -0
  50. mojo/apps/aws/services/__init__.py +32 -0
  51. mojo/apps/aws/services/email.py +390 -0
  52. mojo/apps/aws/services/email_ops.py +548 -0
  53. mojo/apps/docit/__init__.py +6 -0
  54. mojo/apps/docit/markdown_plugins/syntax_highlight.py +25 -0
  55. mojo/apps/docit/markdown_plugins/toc.py +12 -0
  56. mojo/apps/docit/migrations/0001_initial.py +113 -0
  57. mojo/apps/docit/migrations/0002_alter_book_modified_by_alter_page_modified_by.py +26 -0
  58. mojo/apps/docit/migrations/0003_alter_book_group.py +20 -0
  59. mojo/apps/docit/models/__init__.py +17 -0
  60. mojo/apps/docit/models/asset.py +231 -0
  61. mojo/apps/docit/models/book.py +227 -0
  62. mojo/apps/docit/models/page.py +319 -0
  63. mojo/apps/docit/models/page_revision.py +203 -0
  64. mojo/apps/docit/rest/__init__.py +10 -0
  65. mojo/apps/docit/rest/asset.py +17 -0
  66. mojo/apps/docit/rest/book.py +22 -0
  67. mojo/apps/docit/rest/page.py +22 -0
  68. mojo/apps/docit/rest/page_revision.py +17 -0
  69. mojo/apps/docit/services/__init__.py +11 -0
  70. mojo/apps/docit/services/docit.py +315 -0
  71. mojo/apps/docit/services/markdown.py +44 -0
  72. mojo/apps/fileman/backends/s3.py +209 -0
  73. mojo/apps/fileman/models/file.py +45 -9
  74. mojo/apps/fileman/models/manager.py +269 -3
  75. mojo/apps/incident/migrations/0007_event_uid.py +18 -0
  76. mojo/apps/incident/migrations/0008_ticket_ticketnote.py +55 -0
  77. mojo/apps/incident/migrations/0009_incident_status.py +18 -0
  78. mojo/apps/incident/migrations/0010_event_country_code.py +18 -0
  79. mojo/apps/incident/migrations/0011_incident_country_code.py +18 -0
  80. mojo/apps/incident/migrations/0012_alter_incident_status.py +18 -0
  81. mojo/apps/incident/models/__init__.py +1 -0
  82. mojo/apps/incident/models/event.py +35 -0
  83. mojo/apps/incident/models/incident.py +2 -0
  84. mojo/apps/incident/models/ticket.py +62 -0
  85. mojo/apps/incident/reporter.py +21 -3
  86. mojo/apps/incident/rest/__init__.py +1 -0
  87. mojo/apps/incident/rest/ticket.py +43 -0
  88. mojo/apps/jobs/__init__.py +489 -0
  89. mojo/apps/jobs/adapters.py +24 -0
  90. mojo/apps/jobs/cli.py +616 -0
  91. mojo/apps/jobs/daemon.py +370 -0
  92. mojo/apps/jobs/examples/sample_jobs.py +376 -0
  93. mojo/apps/jobs/examples/webhook_examples.py +203 -0
  94. mojo/apps/jobs/handlers/__init__.py +5 -0
  95. mojo/apps/jobs/handlers/webhook.py +317 -0
  96. mojo/apps/jobs/job_engine.py +734 -0
  97. mojo/apps/jobs/keys.py +203 -0
  98. mojo/apps/jobs/local_queue.py +363 -0
  99. mojo/apps/jobs/management/__init__.py +3 -0
  100. mojo/apps/jobs/management/commands/__init__.py +3 -0
  101. mojo/apps/jobs/manager.py +1327 -0
  102. mojo/apps/jobs/migrations/0001_initial.py +97 -0
  103. mojo/apps/jobs/migrations/0002_alter_job_max_retries_joblog.py +39 -0
  104. mojo/apps/jobs/models/__init__.py +6 -0
  105. mojo/apps/jobs/models/job.py +441 -0
  106. mojo/apps/jobs/rest/__init__.py +2 -0
  107. mojo/apps/jobs/rest/control.py +466 -0
  108. mojo/apps/jobs/rest/jobs.py +421 -0
  109. mojo/apps/jobs/scheduler.py +571 -0
  110. mojo/apps/jobs/services/__init__.py +6 -0
  111. mojo/apps/jobs/services/job_actions.py +465 -0
  112. mojo/apps/jobs/settings.py +209 -0
  113. mojo/apps/logit/models/log.py +3 -0
  114. mojo/apps/metrics/__init__.py +8 -1
  115. mojo/apps/metrics/redis_metrics.py +198 -0
  116. mojo/apps/metrics/rest/__init__.py +3 -0
  117. mojo/apps/metrics/rest/categories.py +266 -0
  118. mojo/apps/metrics/rest/helpers.py +48 -0
  119. mojo/apps/metrics/rest/permissions.py +99 -0
  120. mojo/apps/metrics/rest/values.py +277 -0
  121. mojo/apps/metrics/utils.py +17 -0
  122. mojo/decorators/http.py +40 -1
  123. mojo/helpers/aws/__init__.py +11 -7
  124. mojo/helpers/aws/inbound_email.py +309 -0
  125. mojo/helpers/aws/kms.py +413 -0
  126. mojo/helpers/aws/ses_domain.py +959 -0
  127. mojo/helpers/crypto/__init__.py +1 -1
  128. mojo/helpers/crypto/utils.py +15 -0
  129. mojo/helpers/location/__init__.py +2 -0
  130. mojo/helpers/location/countries.py +262 -0
  131. mojo/helpers/location/geolocation.py +196 -0
  132. mojo/helpers/logit.py +37 -0
  133. mojo/helpers/redis/__init__.py +2 -0
  134. mojo/helpers/redis/adapter.py +606 -0
  135. mojo/helpers/redis/client.py +48 -0
  136. mojo/helpers/redis/pool.py +225 -0
  137. mojo/helpers/request.py +8 -0
  138. mojo/helpers/response.py +8 -0
  139. mojo/middleware/auth.py +1 -1
  140. mojo/middleware/cors.py +40 -0
  141. mojo/middleware/logging.py +131 -12
  142. mojo/middleware/mojo.py +5 -0
  143. mojo/models/rest.py +271 -57
  144. mojo/models/secrets.py +86 -0
  145. mojo/serializers/__init__.py +16 -10
  146. mojo/serializers/core/__init__.py +90 -0
  147. mojo/serializers/core/cache/__init__.py +121 -0
  148. mojo/serializers/core/cache/backends.py +518 -0
  149. mojo/serializers/core/cache/base.py +102 -0
  150. mojo/serializers/core/cache/disabled.py +181 -0
  151. mojo/serializers/core/cache/memory.py +287 -0
  152. mojo/serializers/core/cache/redis.py +533 -0
  153. mojo/serializers/core/cache/utils.py +454 -0
  154. mojo/serializers/{manager.py → core/manager.py} +53 -4
  155. mojo/serializers/core/serializer.py +475 -0
  156. mojo/serializers/{advanced/formats → formats}/csv.py +116 -139
  157. mojo/serializers/suggested_improvements.md +388 -0
  158. testit/client.py +1 -1
  159. testit/helpers.py +14 -0
  160. testit/runner.py +23 -6
  161. django_nativemojo-0.1.15.dist-info/RECORD +0 -234
  162. mojo/apps/notify/README.md +0 -91
  163. mojo/apps/notify/README_NOTIFICATIONS.md +0 -566
  164. mojo/apps/notify/admin.py +0 -52
  165. mojo/apps/notify/handlers/example_handlers.py +0 -516
  166. mojo/apps/notify/handlers/ses/__init__.py +0 -25
  167. mojo/apps/notify/handlers/ses/complaint.py +0 -25
  168. mojo/apps/notify/handlers/ses/message.py +0 -86
  169. mojo/apps/notify/management/commands/__init__.py +0 -1
  170. mojo/apps/notify/management/commands/process_notifications.py +0 -370
  171. mojo/apps/notify/mod +0 -0
  172. mojo/apps/notify/models/__init__.py +0 -12
  173. mojo/apps/notify/models/account.py +0 -128
  174. mojo/apps/notify/models/attachment.py +0 -24
  175. mojo/apps/notify/models/bounce.py +0 -68
  176. mojo/apps/notify/models/complaint.py +0 -40
  177. mojo/apps/notify/models/inbox.py +0 -113
  178. mojo/apps/notify/models/inbox_message.py +0 -173
  179. mojo/apps/notify/models/outbox.py +0 -129
  180. mojo/apps/notify/models/outbox_message.py +0 -288
  181. mojo/apps/notify/models/template.py +0 -30
  182. mojo/apps/notify/providers/aws.py +0 -73
  183. mojo/apps/notify/rest/ses.py +0 -0
  184. mojo/apps/notify/utils/__init__.py +0 -2
  185. mojo/apps/notify/utils/notifications.py +0 -404
  186. mojo/apps/notify/utils/parsing.py +0 -202
  187. mojo/apps/notify/utils/render.py +0 -144
  188. mojo/apps/tasks/README.md +0 -118
  189. mojo/apps/tasks/__init__.py +0 -44
  190. mojo/apps/tasks/manager.py +0 -644
  191. mojo/apps/tasks/rest/__init__.py +0 -2
  192. mojo/apps/tasks/rest/hooks.py +0 -0
  193. mojo/apps/tasks/rest/tasks.py +0 -76
  194. mojo/apps/tasks/runner.py +0 -439
  195. mojo/apps/tasks/task.py +0 -99
  196. mojo/apps/tasks/tq_handlers.py +0 -132
  197. mojo/helpers/crypto/__pycache__/hash.cpython-310.pyc +0 -0
  198. mojo/helpers/crypto/__pycache__/sign.cpython-310.pyc +0 -0
  199. mojo/helpers/crypto/__pycache__/utils.cpython-310.pyc +0 -0
  200. mojo/helpers/redis.py +0 -10
  201. mojo/models/meta.py +0 -262
  202. mojo/serializers/advanced/README.md +0 -363
  203. mojo/serializers/advanced/__init__.py +0 -247
  204. mojo/serializers/advanced/formats/__init__.py +0 -28
  205. mojo/serializers/advanced/formats/excel.py +0 -516
  206. mojo/serializers/advanced/formats/json.py +0 -239
  207. mojo/serializers/advanced/formats/response.py +0 -485
  208. mojo/serializers/advanced/serializer.py +0 -568
  209. mojo/serializers/optimized.py +0 -618
  210. {django_nativemojo-0.1.15.dist-info → django_nativemojo-0.1.16.dist-info}/LICENSE +0 -0
  211. {django_nativemojo-0.1.15.dist-info → django_nativemojo-0.1.16.dist-info}/NOTICE +0 -0
  212. {django_nativemojo-0.1.15.dist-info → django_nativemojo-0.1.16.dist-info}/WHEEL +0 -0
  213. /mojo/apps/{notify → aws/migrations}/__init__.py +0 -0
  214. /mojo/apps/{notify/handlers → docit/markdown_plugins}/__init__.py +0 -0
  215. /mojo/apps/{notify/management → docit/migrations}/__init__.py +0 -0
  216. /mojo/apps/{notify/providers → jobs/examples}/__init__.py +0 -0
  217. /mojo/apps/{notify/rest → jobs/migrations}/__init__.py +0 -0
  218. /mojo/{serializers → rest}/openapi.py +0 -0
  219. /mojo/serializers/{settings_example.py → examples/settings.py} +0 -0
  220. /mojo/{apps/notify/handlers/ses/bounce.py → serializers/formats/__init__.py} +0 -0
  221. /mojo/serializers/{advanced/formats → formats}/localizers.py +0 -0
@@ -0,0 +1,1327 @@
+"""
+JobManager for control and inspection of the jobs system.
+
+Provides high-level management operations for monitoring and controlling
+job runners, queues, and individual jobs.
+"""
+import json
+import uuid
+import time
+from typing import Any, Dict, List, Optional
+from datetime import datetime, timedelta
+
+from django.conf import settings
+from django.utils import timezone
+
+from mojo.helpers import logit
+from .keys import JobKeys
+from .adapters import get_adapter
+from .models import Job, JobEvent
+
+
+class JobManager:
+    """
+    Management interface for the jobs system.
+
+    Provides methods for inspecting queue state, controlling runners,
+    and managing jobs.
+    """
+
+    def __init__(self):
+        """Initialize the JobManager."""
+        self.redis = get_adapter()
+        self.keys = JobKeys()
+
+    def get_runners(self, channel: Optional[str] = None) -> List[Dict[str, Any]]:
+        """
+        Get the list of active runners.
+
+        Args:
+            channel: Filter by channel (None for all runners)
+
+        Returns:
+            List of runner info dicts with keys:
+            - runner_id: Runner identifier
+            - channels: List of channels served
+            - jobs_processed: Number of jobs completed
+            - jobs_failed: Number of jobs failed
+            - started: When the runner started
+            - last_heartbeat: Last heartbeat time
+            - alive: Whether the runner is considered alive
+        """
+        runners = []
+
+        try:
+            # Find all runner heartbeat keys using a non-blocking SCAN
+            # (avoids KEYS, which blocks Redis on large keyspaces)
+            pattern = self.keys.runner_hb('*')
+            all_keys = []
+            cursor = 0
+            while True:
+                cursor, keys = self.redis.get_client().scan(
+                    cursor, match=pattern, count=100
+                )
+                all_keys.extend(keys)
+                if cursor == 0:
+                    break
+
+            # Check each runner
+            for key in all_keys:
+                try:
+                    # Get heartbeat data
+                    data = self.redis.get(key.decode('utf-8') if isinstance(key, bytes) else key)
+                    if not data:
+                        continue
+
+                    runner_info = json.loads(data)
+
+                    # Filter by channel if specified
+                    if channel and channel not in runner_info.get('channels', []):
+                        continue
+
+                    # Check if alive (heartbeat within 3x the heartbeat interval)
+                    last_hb = runner_info.get('last_heartbeat')
+                    if last_hb:
+                        last_hb_time = datetime.fromisoformat(last_hb)
+                        if timezone.is_naive(last_hb_time):
+                            last_hb_time = timezone.make_aware(last_hb_time)
+
+                        age = (timezone.now() - last_hb_time).total_seconds()
+                        alive = age < (getattr(settings, 'JOBS_RUNNER_HEARTBEAT_SEC', 5) * 3)
+                    else:
+                        alive = False
+
+                    runner_info['alive'] = alive
+                    runners.append(runner_info)
+
+                except Exception as e:
+                    logit.warn(f"Failed to parse runner heartbeat: {e}")
+
+        except Exception as e:
+            logit.error(f"Failed to get runners: {e}")
+
+        # Sort by runner_id for consistency
+        runners.sort(key=lambda r: r.get('runner_id', ''))
+
+        return runners
+
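The methods in this class are normally reached through the module-level get_manager() singleton defined at the end of this file. A minimal shell sketch (output fields follow the docstring above; the channel name is an example):

    from mojo.apps.jobs.manager import get_manager

    mgr = get_manager()
    for runner in mgr.get_runners(channel='email'):
        state = 'alive' if runner.get('alive') else 'stale'
        print(f"{runner.get('runner_id')}: {state}, processed={runner.get('jobs_processed', 0)}")
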
+    def get_queue_state(self, channel: str) -> Dict[str, Any]:
+        """
+        Get queue state for a channel.
+
+        Args:
+            channel: Channel name
+
+        Returns:
+            Dict with queue statistics (Plan B):
+            - queued_count: Number of messages waiting in the list queue (LLEN)
+            - inflight_count: Number of in-flight messages (ZCARD of processing)
+            - scheduled_count: Number of scheduled/delayed jobs (ZCARD of sched + sched_broadcast)
+            - runners: Number of active runners
+        """
+        state = {
+            'channel': channel,
+            'queued_count': 0,
+            'inflight_count': 0,
+            'scheduled_count': 0,
+            'runners': 0,
+        }
+
+        try:
+            # Plan B counts: list queue + ZSETs
+            queue_key = self.keys.queue(channel)
+            processing_key = self.keys.processing(channel)
+            sched_key = self.keys.sched(channel)
+            sched_b_key = self.keys.sched_broadcast(channel)
+            # Exact counts
+            state['queued_count'] = self.redis.llen(queue_key) or 0
+            state['inflight_count'] = self.redis.zcard(processing_key) or 0
+            state['scheduled_count'] = (self.redis.zcard(sched_key) or 0) + (self.redis.zcard(sched_b_key) or 0)
+            # Active runners for this channel
+            runners = self.get_runners(channel)
+            state['runners'] = len([r for r in runners if r.get('alive')])
+            # Add metrics (DB-derived)
+            state['metrics'] = self._get_channel_metrics(channel)
+        except Exception as e:
+            logit.error(f"Failed to get queue state for {channel}: {e}")
+        return state
+
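The exact key strings come from JobKeys, but the Plan B shape can be illustrated directly with redis-py. The key names below are assumptions for a channel named 'email', not the library's guaranteed layout:

    import redis

    r = redis.Redis()
    # Illustrative key names; the real ones come from JobKeys
    queued = r.llen('mojo:jobs:queue:email')            # ready-to-run jobs
    inflight = r.zcard('mojo:jobs:processing:email')    # claimed, not yet acked
    scheduled = (r.zcard('mojo:jobs:sched:email')
                 + r.zcard('mojo:jobs:sched:email:broadcast'))
    print(f"queued={queued} inflight={inflight} scheduled={scheduled}")
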
+    def get_channel_health(self, channel: str) -> Dict[str, Any]:
+        """
+        Get comprehensive health metrics for a channel.
+
+        Args:
+            channel: Channel name
+
+        Returns:
+            Dict with health status including unclaimed jobs, stuck jobs, and alerts
+        """
+        # Get basic queue state (Plan B counts)
+        state = self.get_queue_state(channel)
+
+        # Unclaimed jobs are those still waiting in the list queue;
+        # in-flight jobs have been claimed but not yet acknowledged.
+        unclaimed = state['queued_count']
+        pending_count = state['inflight_count']
+        total_messages = unclaimed + pending_count
+
+        # Find stuck jobs
+        stuck = self._find_stuck_jobs(channel)
+
+        # Get active runners
+        runners = self.get_runners(channel)
+        active_runners = [r for r in runners if r.get('alive')]
+
+        # Build health status
+        health = {
+            'channel': channel,
+            'status': 'healthy',  # Will update based on checks
+            'messages': {
+                'total': total_messages,
+                'unclaimed': unclaimed,
+                'pending': pending_count,
+                'scheduled': state['scheduled_count'],
+                'stuck': len(stuck)
+            },
+            'runners': {
+                'active': len(active_runners),
+                'total': len(runners)
+            },
+            'stuck_jobs': stuck[:10],  # First 10 stuck jobs
+            'alerts': []
+        }
+
+        # Health checks
+        if unclaimed > 100:
+            health['alerts'].append(f"High unclaimed count: {unclaimed}")
+            health['status'] = 'warning'
+
+        if unclaimed > 500:
+            health['status'] = 'critical'
+
+        if len(stuck) > 0:
+            health['alerts'].append(f"Stuck jobs detected: {len(stuck)}")
+            health['status'] = 'warning'
+
+        if len(stuck) > 10:
+            health['status'] = 'critical'
+
+        if len(active_runners) == 0 and total_messages > 0:
+            health['alerts'].append("No active runners for channel with pending jobs")
+            health['status'] = 'critical'
+
+        # Add metrics if available
+        if 'metrics' in state:
+            health['metrics'] = state['metrics']
+
+        return health
+
+    def _find_stuck_jobs(self, channel: str, idle_threshold_ms: int = 60000) -> List[Dict]:
+        """
+        Plan B: Find jobs that have been in flight (processing ZSET) longer than the idle threshold.
+
+        Args:
+            channel: Channel name
+            idle_threshold_ms: Consider stuck if idle longer than this (default 1 minute)
+
+        Returns:
+            List of stuck job details
+        """
+        stuck: List[Dict] = []
+        try:
+            now_ms = int(time.time() * 1000)
+            cutoff = now_ms - max(0, int(idle_threshold_ms))
+            processing_key = self.keys.processing(channel)
+
+            if idle_threshold_ms <= 0:
+                # Return all in-flight entries
+                ids = self.redis.zrangebyscore(processing_key, float("-inf"), float("inf"))
+                for jid in ids:
+                    stuck.append({'job_id': jid, 'idle_ms': None})
+                return stuck
+
+            # Return entries older than the cutoff
+            ids = self.redis.zrangebyscore(processing_key, float("-inf"), cutoff)
+            for jid in ids:
+                # The claim timestamp is not stored in the member by default,
+                # so the idle_ms value is approximate
+                stuck.append({'job_id': jid, 'idle_ms': idle_threshold_ms})
+        except Exception as e:
+            logit.error(f"Failed to check stuck jobs for channel {channel}: {e}")
+
+        return stuck
+
+    def clear_stuck_jobs(self, channel: str, idle_threshold_ms: int = 60000) -> Dict[str, Any]:
+        """
+        Plan B: Clear stuck in-flight jobs from a channel by re-queueing or removing
+        entries from the processing ZSET based on an idle threshold.
+
+        Args:
+            channel: Channel name to clear
+            idle_threshold_ms: Consider stuck if older than this many ms (0 to clear all)
+
+        Returns:
+            Dict with results: {'cleared': int, 'details': [...], 'errors': [...]}
+        """
+        results = {
+            'channel': channel,
+            'cleared': 0,
+            'details': [],
+            'errors': []
+        }
+
+        try:
+            now_ms = int(time.time() * 1000)
+            processing_key = self.keys.processing(channel)
+            queue_key = self.keys.queue(channel)
+
+            # Determine the score range to clear
+            if idle_threshold_ms and idle_threshold_ms > 0:
+                cutoff = now_ms - int(idle_threshold_ms)
+                candidates = self.redis.zrangebyscore(processing_key, float("-inf"), cutoff)
+            else:
+                candidates = self.redis.zrangebyscore(processing_key, float("-inf"), float("inf"))
+
+            if not candidates:
+                results['message'] = f"No in-flight jobs found in {channel} matching threshold"
+                return results
+
+            for jid in candidates:
+                try:
+                    # Remove from processing and requeue
+                    self.redis.zrem(processing_key, jid)
+                    self.redis.rpush(queue_key, jid)
+                    results['cleared'] += 1
+                    results['details'].append({'job_id': jid, 'requeued': True})
+                    # Write event trail (best effort)
+                    try:
+                        job = Job.objects.get(id=jid)
+                        JobEvent.objects.create(
+                            job=job,
+                            channel=channel,
+                            event='retry',
+                            details={'reason': 'manual_clear_stuck'}
+                        )
+                    except Exception:
+                        pass
+                except Exception as e:
+                    results['errors'].append(f"{jid}: {e}")
+
+            results['message'] = f"Requeued {results['cleared']} in-flight jobs from {channel}"
+
+        except Exception as e:
+            import traceback
+            results['errors'].append(str(e))
+            results['stack_trace'] = traceback.format_exc()
+            logit.error(f"Failed to clear stuck jobs from {channel}: {e}")
+
+        return results
+
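get_channel_health() and clear_stuck_jobs() pair naturally in a periodic watchdog; a minimal sketch (the channel list and the 5-minute threshold are arbitrary choices, not library defaults):

    from mojo.apps.jobs.manager import get_manager

    def watchdog(channels=('default', 'email')):
        mgr = get_manager()
        for ch in channels:
            health = mgr.get_channel_health(ch)
            for alert in health['alerts']:
                print(f"[{ch}] {health['status']}: {alert}")
            if health['messages']['stuck']:
                # Requeue anything in flight for more than 5 minutes
                print(mgr.clear_stuck_jobs(ch, idle_threshold_ms=300_000))
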
+    def broadcast_command(self, command: str, data: Dict = None,
+                          timeout: float = 2.0) -> List[Dict]:
+        """
+        Send a command to all runners and collect responses.
+
+        Args:
+            command: Command to send (status, shutdown, pause, resume)
+            data: Additional command data
+            timeout: Time to wait for responses (seconds)
+
+        Returns:
+            List of responses from runners
+        """
+        reply_channel = f"mojo:jobs:replies:{uuid.uuid4().hex[:8]}"
+
+        # Subscribe to replies before sending
+        pubsub = self.redis.pubsub()
+        pubsub.subscribe(reply_channel)
+
+        # Send broadcast command
+        message = {
+            'command': command,
+            'data': data or {},
+            'reply_channel': reply_channel,
+            'timestamp': timezone.now().isoformat()
+        }
+
+        self.redis.publish("mojo:jobs:runners:broadcast", json.dumps(message))
+
+        # Collect responses
+        responses = []
+        start_time = time.time()
+
+        while time.time() - start_time < timeout:
+            msg = pubsub.get_message(timeout=0.1)
+            if msg and msg['type'] == 'message':
+                try:
+                    response_data = msg['data']
+                    if isinstance(response_data, bytes):
+                        response_data = response_data.decode('utf-8')
+                    response = json.loads(response_data)
+                    responses.append(response)
+                except Exception as e:
+                    logit.debug(f"Failed to parse response: {e}")
+
+        pubsub.close()
+        return responses
+
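The runner half of this request/reply exchange lives in the runner daemon; a simplified sketch of what a responder loop might look like, assuming only the message fields built above (the runner internals here are illustrative):

    import json
    import redis

    r = redis.Redis()
    pubsub = r.pubsub()
    pubsub.subscribe('mojo:jobs:runners:broadcast')

    for msg in pubsub.listen():
        if msg['type'] != 'message':
            continue
        request = json.loads(msg['data'])
        reply = {'runner_id': 'runner-1', 'command': request['command'], 'ok': True}
        # Answer on the per-request reply channel chosen by the manager
        r.publish(request['reply_channel'], json.dumps(reply))
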
+    def ping(self, runner_id: str, timeout: float = 2.0) -> bool:
+        """
+        Ping a runner to check whether it is responsive.
+
+        Args:
+            runner_id: Runner identifier
+            timeout: Maximum time to wait for a response (seconds)
+
+        Returns:
+            True if the runner responded, False otherwise
+        """
+        try:
+            # Create a unique response key
+            response_key = f"{self.keys.runner_ctl(runner_id)}:response:{uuid.uuid4().hex[:8]}"
+
+            # Send ping command
+            control_key = self.keys.runner_ctl(runner_id)
+            message = json.dumps({
+                'command': 'ping',
+                'response_key': response_key
+            })
+
+            self.redis.publish(control_key, message)
+
+            # Wait for response (the adapter may return bytes)
+            start_time = time.time()
+            while time.time() - start_time < timeout:
+                response = self.redis.get(response_key)
+                if response in ('pong', b'pong'):
+                    self.redis.delete(response_key)
+                    return True
+                time.sleep(0.1)
+
+            # Timeout
+            self.redis.delete(response_key)
+            return False
+
+        except Exception as e:
+            logit.error(f"Failed to ping runner {runner_id}: {e}")
+            return False
+
+    def shutdown(self, runner_id: str, graceful: bool = True) -> None:
+        """
+        Request a runner to shut down.
+
+        Args:
+            runner_id: Runner identifier
+            graceful: If True, wait for the current job to finish
+        """
+        try:
+            control_key = self.keys.runner_ctl(runner_id)
+            message = json.dumps({
+                'command': 'shutdown',
+                'graceful': graceful
+            })
+
+            self.redis.publish(control_key, message)
+            logit.info(f"Sent shutdown command to runner {runner_id} (graceful={graceful})")
+
+        except Exception as e:
+            logit.error(f"Failed to shutdown runner {runner_id}: {e}")
+
+    def broadcast(self, channel: str, func: str, payload: Dict[str, Any],
+                  **options) -> str:
+        """
+        Publish a broadcast job to a channel.
+
+        Args:
+            channel: Channel to broadcast on
+            func: Job function module path
+            payload: Job payload
+            **options: Additional job options
+
+        Returns:
+            Job ID
+        """
+        from . import publish
+
+        return publish(
+            func=func,
+            payload=payload,
+            channel=channel,
+            broadcast=True,
+            **options
+        )
+
+    def job_status(self, job_id: str) -> Optional[Dict[str, Any]]:
+        """
+        Get detailed status of a job.
+
+        Args:
+            job_id: Job identifier
+
+        Returns:
+            Job status dict or None if not found
+        """
+        from . import status
+
+        # Get basic status
+        job_info = status(job_id)
+        if not job_info:
+            return None
+
+        # Enhance with additional info
+        try:
+            # Add events timeline
+            job = Job.objects.get(id=job_id)
+            events = JobEvent.objects.filter(job=job).order_by('at')[:20]
+
+            job_info['events'] = [
+                {
+                    'event': e.event,
+                    'at': e.at.isoformat(),
+                    'runner_id': e.runner_id,
+                    'attempt': e.attempt,
+                    'details': e.details
+                }
+                for e in events
+            ]
+
+            # Add queue position if pending
+            if job_info['status'] == 'pending' and job.run_at:
+                # Check position in the scheduled queue
+                sched_key = self.keys.sched(job.channel)
+                rank = self.redis.get_client().zrank(sched_key, job_id)
+                if rank is not None:
+                    job_info['queue_position'] = rank + 1
+
+        except Exception as e:
+            logit.debug(f"Failed to enhance job status: {e}")
+
+        return job_info
+
+    def cancel_job(self, job_id: str) -> bool:
+        """
+        Cancel a job.
+
+        Args:
+            job_id: Job identifier
+
+        Returns:
+            True if cancelled, False otherwise
+        """
+        from . import cancel
+        return cancel(job_id)
+
+    def retry_job(self, job_id: str, delay: Optional[int] = None) -> bool:
+        """
+        Retry a failed job.
+
+        Args:
+            job_id: Job identifier
+            delay: Delay in seconds before the retry (default: immediate)
+
+        Returns:
+            True if the retry was scheduled, False otherwise
+        """
+        try:
+            job = Job.objects.get(id=job_id)
+
+            if job.status not in ('failed', 'canceled'):
+                logit.warn(f"Cannot retry job {job_id} in status {job.status}")
+                return False
+
+            # Reset job for retry
+            job.status = 'pending'
+            job.attempt = 0
+            job.last_error = ''
+            job.stack_trace = ''
+
+            if delay:
+                job.run_at = timezone.now() + timedelta(seconds=delay)
+            else:
+                job.run_at = None
+
+            job.save()
+
+            # Re-publish to Redis
+            from . import publish
+
+            return bool(publish(
+                func=job.func,
+                payload=job.payload,
+                channel=job.channel,
+                run_at=job.run_at,
+                broadcast=job.broadcast,
+                max_retries=job.max_retries,
+                expires_at=job.expires_at,
+                max_exec_seconds=job.max_exec_seconds
+            ))
+
+        except Job.DoesNotExist:
+            logit.error(f"Job {job_id} not found")
+            return False
+        except Exception as e:
+            logit.error(f"Failed to retry job {job_id}: {e}")
+            return False
+
+    def _get_channel_metrics(self, channel: str) -> Dict[str, Any]:
+        """Get recent metrics for a channel."""
+        metrics = {
+            'jobs_per_minute': 0,
+            'success_rate': 0,
+            'avg_duration_ms': 0
+        }
+
+        try:
+            # Get recent job counts from the database
+            now = timezone.now()
+            last_hour = now - timedelta(hours=1)
+
+            # Jobs completed in the last hour
+            completed = Job.objects.filter(
+                channel=channel,
+                status='completed',
+                finished_at__gte=last_hour
+            ).count()
+
+            # Jobs failed in the last hour
+            failed = Job.objects.filter(
+                channel=channel,
+                status='failed',
+                finished_at__gte=last_hour
+            ).count()
+
+            total = completed + failed
+            if total > 0:
+                metrics['jobs_per_minute'] = round(total / 60, 2)
+                metrics['success_rate'] = round(completed / total * 100, 1)
+
+            # Average duration of recently completed jobs
+            from django.db.models import Avg, F
+            avg_duration = Job.objects.filter(
+                channel=channel,
+                status='completed',
+                finished_at__gte=last_hour,
+                started_at__isnull=False
+            ).aggregate(
+                avg_ms=Avg(F('finished_at') - F('started_at'))
+            )
+
+            if avg_duration['avg_ms']:
+                metrics['avg_duration_ms'] = int(avg_duration['avg_ms'].total_seconds() * 1000)
+
+        except Exception as e:
+            logit.debug(f"Failed to get channel metrics: {e}")
+
+        return metrics
+
+    def get_stats(self) -> Dict[str, Any]:
+        """
+        Get overall system statistics.
+
+        Returns:
+            System-wide statistics
+        """
+        stats = {
+            'channels': {},
+            'runners': [],
+            'totals': {
+                'pending': 0,
+                'queued': 0,
+                'inflight': 0,
+                'running': 0,
+                'completed': 0,
+                'failed': 0,
+                'scheduled': 0,
+                'runners_active': 0
+            },
+            'scheduler': {
+                'active': False,
+                'lock_holder': None
+            }
+        }
+
+        try:
+            # Get stats for each configured channel
+            channels = getattr(settings, 'JOBS_CHANNELS', ['default', 'email', 'webhooks', 'priority'])
+            for channel in channels:
+                state = self.get_queue_state(channel)
+
+                # Include the DB running count per channel for better visibility
+                try:
+                    state['db_running'] = Job.objects.filter(channel=channel, status='running').count()
+                except Exception:
+                    state['db_running'] = 0
+
+                stats['channels'][channel] = state
+
+                # Aggregate totals (fall back to legacy field names if present)
+                stats['totals']['scheduled'] += state['scheduled_count']
+                queued = state.get('queued_count', state.get('unclaimed_count', 0))
+                inflight = state.get('inflight_count', state.get('pending_count', 0))
+                stats['totals']['queued'] += queued
+                stats['totals']['inflight'] += inflight
+                # Keep 'pending' as an alias for queued for backward compatibility
+                stats['totals']['pending'] += queued
+
+            # Get all runners
+            all_runners = self.get_runners()
+            stats['runners'] = all_runners
+            alive_runners = [r for r in all_runners if r.get('alive')]
+            alive_ids = [r.get('runner_id') for r in alive_runners if r.get('runner_id')]
+            stats['totals']['runners_active'] = len(alive_runners)
+
+            # Database totals with an active vs stale running split
+            running_total = Job.objects.filter(status='running').count()
+            if alive_ids:
+                running_active = Job.objects.filter(status='running', runner_id__in=alive_ids).count()
+            else:
+                running_active = 0
+            running_stale = max(0, running_total - running_active)
+
+            stats['totals']['running'] = running_total
+            stats['totals']['running_active'] = running_active
+            stats['totals']['running_stale'] = running_stale
+            stats['totals']['completed'] = Job.objects.filter(status='completed').count()
+            stats['totals']['failed'] = Job.objects.filter(status='failed').count()
+
+            # Check the scheduler lock
+            lock_value = self.redis.get(self.keys.scheduler_lock())
+            if lock_value:
+                stats['scheduler']['active'] = True
+                stats['scheduler']['lock_holder'] = lock_value
+
+        except Exception as e:
+            logit.error(f"Failed to get system stats: {e}")
+
+        return stats
+
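Because get_stats() merges the Redis counts with DB truth, it is enough for a one-line health summary; for example:

    from mojo.apps.jobs.manager import get_manager

    totals = get_manager().get_stats()['totals']
    print(f"queued={totals['queued']} inflight={totals['inflight']} "
          f"running={totals['running']} (stale={totals.get('running_stale', 0)}) "
          f"active runners={totals['runners_active']}")
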
+    def pause_channel(self, channel: str) -> bool:
+        """
+        Pause a channel by setting a pause flag in Redis.
+        Runners and the scheduler should respect this flag.
+        """
+        try:
+            self.redis.set(self.keys.channel_pause(channel), '1')
+            logit.info(f"Paused channel {channel}")
+            return True
+        except Exception as e:
+            logit.error(f"Failed to pause channel {channel}: {e}")
+            return False
+
+    def resume_channel(self, channel: str) -> bool:
+        """
+        Resume a channel by clearing the pause flag in Redis.
+        """
+        try:
+            self.redis.delete(self.keys.channel_pause(channel))
+            logit.info(f"Resumed channel {channel}")
+            return True
+        except Exception as e:
+            logit.error(f"Failed to resume channel {channel}: {e}")
+            return False
+
+    def clear_channel(self, channel: str, cancel_db_pending: bool = True) -> Dict[str, Any]:
+        """
+        Completely clear a channel's Redis queues and optionally cancel DB-pending jobs.
+
+        Steps:
+        1) Pause the channel
+        2) Delete the main stream, broadcast stream, scheduled and scheduled_broadcast ZSETs
+        3) Optionally mark DB pending jobs as canceled
+        4) Resume the channel
+        """
+        result: Dict[str, Any] = {
+            'channel': channel,
+            'deleted': {},
+            'db_pending_canceled': 0,
+            'status': True,
+            'errors': []
+        }
+        try:
+            self.pause_channel(channel)
+
+            # Delete Plan B keys and legacy streams
+            stream_key = self.keys.stream(channel)  # legacy
+            broadcast_key = self.keys.stream_broadcast(channel)  # legacy
+            sched_key = self.keys.sched(channel)
+            sched_b_key = self.keys.sched_broadcast(channel)
+            queue_key = self.keys.queue(channel)
+            processing_key = self.keys.processing(channel)
+
+            deleted_stream = self.redis.delete(stream_key)
+            deleted_broadcast = self.redis.delete(broadcast_key)
+            deleted_sched = self.redis.delete(sched_key)
+            deleted_sched_b = self.redis.delete(sched_b_key)
+            deleted_queue = self.redis.delete(queue_key)
+            deleted_processing = self.redis.delete(processing_key)
+
+            result['deleted'] = {
+                'stream': bool(deleted_stream),
+                'broadcast': bool(deleted_broadcast),
+                'scheduled': bool(deleted_sched),
+                'scheduled_broadcast': bool(deleted_sched_b),
+                'queue': bool(deleted_queue),
+                'processing': bool(deleted_processing),
+            }
+
+            if cancel_db_pending:
+                try:
+                    count = Job.objects.filter(
+                        channel=channel,
+                        status='pending'
+                    ).update(
+                        status='canceled',
+                        finished_at=timezone.now()
+                    )
+                    result['db_pending_canceled'] = count
+                except Exception as e:
+                    result['errors'].append(f"DB cancel pending failed: {e}")
+                    result['status'] = False
+
+        except Exception as e:
+            result['errors'].append(str(e))
+            result['status'] = False
+        finally:
+            # Always attempt to resume to avoid leaving the channel paused
+            self.resume_channel(channel)
+
+        return result
+
+    def requeue_db_pending(self, channel: str, limit: Optional[int] = None) -> Dict[str, Any]:
+        """
+        Requeue DB 'pending' jobs for a channel back into the Redis streams.
+        Useful after a clear to rebuild the stream from DB truth.
+        """
+        try:
+            qs = Job.objects.filter(channel=channel, status='pending').order_by('created')
+            if limit is not None:
+                qs = qs[:int(limit)]
+
+            requeued = 0
+            for job in qs:
+                stream_key = self.keys.stream_broadcast(channel) if job.broadcast else self.keys.stream(channel)
+                try:
+                    self.redis.xadd(stream_key, {
+                        'job_id': job.id,
+                        'func': job.func,
+                        'created': timezone.now().isoformat()
+                    })
+                    try:
+                        JobEvent.objects.create(
+                            job=job,
+                            channel=channel,
+                            event='queued',
+                            details={'requeued': True}
+                        )
+                    except Exception:
+                        pass
+                    requeued += 1
+                except Exception as e:
+                    logit.warn(f"Failed to requeue job {job.id} on {channel}: {e}")
+
+            return {'status': True, 'requeued': requeued, 'channel': channel}
+        except Exception as e:
+            return {'status': False, 'error': str(e), 'channel': channel}
+
+    def purge_old_jobs(self, days_old: int, status: Optional[str] = None, dry_run: bool = False) -> Dict[str, Any]:
+        """
+        Purge old jobs (and their events via cascade) from the database.
+
+        Args:
+            days_old: Delete jobs older than this many days
+            status: Optional status filter to narrow the deletion
+            dry_run: If true, only count and do not delete
+
+        Returns:
+            Dict with status and either a count (dry_run) or delete details
+        """
+        try:
+            cutoff = timezone.now() - timedelta(days=int(days_old))
+            from django.db.models import Q
+            query = Q(created__lt=cutoff)
+            if status:
+                query &= Q(status=status)
+            qs = Job.objects.filter(query)
+            count = qs.count()
+            if dry_run:
+                return {
+                    'status': True,
+                    'dry_run': True,
+                    'count': count,
+                    'cutoff': cutoff.isoformat(),
+                    'status_filter': status
+                }
+            deleted, details = qs.delete()
+            return {
+                'status': True,
+                'deleted': deleted,
+                'details': details,
+                'cutoff': cutoff.isoformat(),
+                'status_filter': status
+            }
+        except Exception as e:
+            return {'status': False, 'error': str(e)}
+
+    def get_registered_channels(self) -> List[str]:
+        """
+        Discover registered channels by scanning Redis for main stream keys.
+        Returns a sorted, de-duplicated list of channel names.
+        """
+        channels: List[str] = []
+        try:
+            pattern = f"{self.keys.prefix}:stream:*"
+            client = self.redis.get_client()
+            cursor = 0
+            found = set()
+            while True:
+                cursor, keys = client.scan(cursor, match=pattern, count=200)
+                for k in keys or []:
+                    key_str = k.decode('utf-8') if isinstance(k, (bytes, bytearray)) else k
+                    parts = key_str.split(":stream:")
+                    if len(parts) == 2 and parts[1]:
+                        channel = parts[1]
+                        # Ignore the broadcast suffix if present
+                        if channel.endswith(":broadcast"):
+                            channel = channel.rsplit(":broadcast", 1)[0]
+                        if channel:
+                            found.add(channel)
+                if cursor == 0:
+                    break
+            channels = sorted(found)
+        except Exception as e:
+            logit.debug(f"Failed to discover channels via Redis scan: {e}")
+            channels = []
+        return channels
+
+    def get_queue_sizes(self, channels: Optional[List[str]] = None) -> Dict[str, Any]:
+        """
+        Get current queue sizes for channels, including DB status counts.
+
+        Args:
+            channels: Optional list of channels. Defaults to discovered streams or settings.JOBS_CHANNELS
+
+        Returns:
+            Dict with per-channel sizes and DB status counts
+        """
+        try:
+            from django.conf import settings as dj_settings
+            channels = channels or self.get_registered_channels() or getattr(dj_settings, 'JOBS_CHANNELS', ['default'])
+            sizes: Dict[str, Any] = {}
+            for channel in channels:
+                stream_key = self.keys.stream(channel)
+                sched_key = self.keys.sched(channel)
+                sched_b_key = self.keys.sched_broadcast(channel)
+
+                # Stream length
+                try:
+                    info = self.redis.xinfo_stream(stream_key)
+                    stream_len = info.get('length', 0)
+                except Exception:
+                    stream_len = 0
+
+                # Scheduled counts (both ZSETs)
+                scheduled = (self.redis.zcard(sched_key) or 0) + (self.redis.zcard(sched_b_key) or 0)
+
+                # DB status counts
+                from django.db.models import Count
+                db_counts_qs = Job.objects.filter(channel=channel).values('status').annotate(count=Count('id'))
+                status_counts = {row['status']: row['count'] for row in db_counts_qs}
+
+                sizes[channel] = {
+                    'stream': stream_len,
+                    'scheduled': scheduled,
+                    'db_pending': status_counts.get('pending', 0),
+                    'db_running': status_counts.get('running', 0),
+                    'db_completed': status_counts.get('completed', 0),
+                    'db_failed': status_counts.get('failed', 0),
+                    'db_canceled': status_counts.get('canceled', 0),
+                    'db_expired': status_counts.get('expired', 0),
+                }
+
+            return {'status': True, 'data': sizes}
+        except Exception as e:
+            return {'status': False, 'error': str(e)}
+
+
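purge_old_jobs() supports a dry run, so a retention task can report what it would delete before committing; for example:

    from mojo.apps.jobs.manager import get_manager

    mgr = get_manager()
    preview = mgr.purge_old_jobs(days_old=30, status='completed', dry_run=True)
    print(f"would delete {preview['count']} jobs created before {preview['cutoff']}")
    if preview['count']:
        print(mgr.purge_old_jobs(days_old=30, status='completed'))
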
+def _jobmanager_cleanup_consumer_groups(self, channel: Optional[str] = None, destroy_empty_groups: bool = True) -> Dict[str, Any]:
+    """
+    Clean up Redis Stream consumer groups and consumers.
+
+    - If channel is provided, operates on that channel only.
+    - Otherwise, iterates discovered channels (or the settings fallback).
+    - Removes consumers with no pending messages.
+    - Optionally destroys empty groups after consumer cleanup.
+
+    Returns:
+        Dict with per-channel cleanup results and any errors.
+    """
+    results: Dict[str, Any] = {'status': True, 'channels': {}, 'errors': []}
+    try:
+        # Determine channels to process
+        try:
+            from django.conf import settings as dj_settings
+        except Exception:
+            dj_settings = None
+
+        if channel:
+            channels = [channel]
+        else:
+            channels = self.get_registered_channels()
+            if not channels and dj_settings:
+                channels = getattr(dj_settings, 'JOBS_CHANNELS', ['default'])
+
+        client = self.redis.get_client()
+
+        for ch in channels:
+            channel_result: Dict[str, Any] = {
+                'stream': self.keys.stream(ch),
+                'groups_processed': 0,
+                'consumers_removed': 0,
+                'groups_destroyed': 0,
+                'errors': []
+            }
+            stream_key = self.keys.stream(ch)
+
+            # Fetch groups for this stream
+            try:
+                groups = client.xinfo_groups(stream_key)
+            except Exception as e:
+                # If the stream doesn't exist, there is nothing to clean
+                channel_result['errors'].append(f"xinfo_groups failed: {e}")
+                results['channels'][ch] = channel_result
+                continue
+
+            # Normalize groups to dicts with string keys
+            norm_groups = []
+            try:
+                for g in groups or []:
+                    if isinstance(g, dict):
+                        name = g.get('name')
+                        if isinstance(name, bytes):
+                            name = name.decode('utf-8')
+                        consumers_count = g.get('consumers', 0)
+                        pending = g.get('pending', 0)
+                        last_id = g.get('last-delivered-id')
+                        if isinstance(last_id, bytes):
+                            last_id = last_id.decode('utf-8')
+                        norm_groups.append({
+                            'name': name,
+                            'consumers': int(consumers_count or 0),
+                            'pending': int(pending or 0),
+                            'last_delivered_id': last_id or ''
+                        })
+            except Exception as e:
+                channel_result['errors'].append(f"group normalization failed: {e}")
+                results['channels'][ch] = channel_result
+                continue
+
+            # Process each group
+            for g in norm_groups:
+                group_name = g['name']
+                channel_result['groups_processed'] += 1
+                try:
+                    consumers = client.xinfo_consumers(stream_key, group_name)
+                except Exception as e:
+                    channel_result['errors'].append(f"xinfo_consumers({group_name}) failed: {e}")
+                    consumers = []
+
+                # Remove consumers with no pending messages
+                removed = 0
+                try:
+                    for c in consumers or []:
+                        cname = c.get('name')
+                        if isinstance(cname, bytes):
+                            cname = cname.decode('utf-8')
+                        pending_c = int(c.get('pending', 0) or 0)
+                        if pending_c == 0 and cname:
+                            try:
+                                client.execute_command('XGROUP', 'DELCONSUMER', stream_key, group_name, cname)
+                                removed += 1
+                            except Exception as e:
+                                channel_result['errors'].append(f"DELCONSUMER {group_name}/{cname} failed: {e}")
+                    channel_result['consumers_removed'] += removed
+                except Exception as e:
+                    channel_result['errors'].append(f"consumer removal loop failed for {group_name}: {e}")
+
+                # Optionally destroy the group if it is now empty
+                if destroy_empty_groups:
+                    try:
+                        # Refresh group info to check whether any consumers remain
+                        refreshed_groups = client.xinfo_groups(stream_key)
+                        grp = None
+                        for rg in refreshed_groups or []:
+                            nm = rg.get('name')
+                            if isinstance(nm, bytes):
+                                nm = nm.decode('utf-8')
+                            if nm == group_name:
+                                grp = rg
+                                break
+                        remaining = int(grp.get('consumers', 0) or 0) if grp else 0
+                        if remaining == 0:
+                            try:
+                                client.execute_command('XGROUP', 'DESTROY', stream_key, group_name)
+                                channel_result['groups_destroyed'] += 1
+                            except Exception as e:
+                                channel_result['errors'].append(f"XGROUP DESTROY {group_name} failed: {e}")
+                    except Exception as e:
+                        channel_result['errors'].append(f"post-clean xinfo_groups failed: {e}")
+
+            results['channels'][ch] = channel_result
+
+    except Exception as e:
+        results['status'] = False
+        results['errors'].append(str(e))
+
+    return results
+
+# Attach as a method on JobManager for runtime use
+JobManager.cleanup_consumer_groups = _jobmanager_cleanup_consumer_groups
+
+def _jobmanager_rebuild_scheduled(self, channel: Optional[str] = None, limit: Optional[int] = None) -> Dict[str, Any]:
+    """
+    Rebuild scheduled ZSETs from DB truth for pending jobs with a future run_at.
+    Useful if the ZSETs were not populated during publish or after outages.
+
+    Args:
+        channel: Optional channel to restrict the rebuild
+        limit: Optional max number of jobs per channel
+
+    Returns:
+        Dict with per-channel counts and errors.
+    """
+    results: Dict[str, Any] = {'status': True, 'channels': {}, 'errors': []}
+    try:
+        now = timezone.now()
+
+        # Determine channels
+        if channel:
+            channels = [channel]
+        else:
+            channels = self.get_registered_channels()
+            if not channels:
+                try:
+                    from django.conf import settings as dj_settings
+                    channels = getattr(dj_settings, 'JOBS_CHANNELS', ['default'])
+                except Exception:
+                    channels = ['default']
+
+        for ch in channels:
+            ch_result = {'scheduled_added': 0, 'broadcast_added': 0, 'errors': []}
+            try:
+                # Query pending jobs with a future run_at
+                qs = Job.objects.filter(channel=ch, status='pending', run_at__gt=now).order_by('run_at')
+                if limit is not None:
+                    qs = qs[:int(limit)]
+
+                sched_key = self.keys.sched(ch)
+                sched_b_key = self.keys.sched_broadcast(ch)
+
+                for job in qs:
+                    try:
+                        score = job.run_at.timestamp() * 1000.0
+                        # Skip if already present
+                        exists = self.redis.zscore(sched_b_key if job.broadcast else sched_key, job.id)
+                        if exists is not None:
+                            continue
+                        # Insert into the appropriate ZSET
+                        if job.broadcast:
+                            self.redis.zadd(sched_b_key, {job.id: score})
+                            ch_result['broadcast_added'] += 1
+                        else:
+                            self.redis.zadd(sched_key, {job.id: score})
+                            ch_result['scheduled_added'] += 1
+                    except Exception as ie:
+                        ch_result['errors'].append(f"{job.id}: {ie}")
+
+            except Exception as ce:
+                ch_result['errors'].append(str(ce))
+
+            results['channels'][ch] = ch_result
+
+    except Exception as e:
+        results['status'] = False
+        results['errors'].append(str(e))
+
+    return results
+
+# Attach as a method on JobManager for runtime use
+JobManager.rebuild_scheduled = _jobmanager_rebuild_scheduled
+
+def _jobmanager_recover_stale_running(self, channel: Optional[str] = None, max_age_seconds: Optional[int] = None) -> Dict[str, Any]:
+    """
+    Recover stale running jobs (the DB shows status='running' but Redis has no in-flight messages).
+    For each channel (or a specific channel), if inflight_count == 0, reset DB running jobs
+    to pending and requeue them to the stream immediately.
+
+    Args:
+        channel: Optional channel to restrict recovery
+        max_age_seconds: Optional age threshold (only recover jobs started before now - max_age_seconds)
+
+    Returns:
+        Dict with per-channel recovery results and any errors:
+        {
+            'status': True/False,
+            'channels': {
+                channel: {'examined': N, 'recovered': M, 'errors': [...]},
+                ...
+            },
+            'errors': [...]
+        }
+    """
+    results: Dict[str, Any] = {'status': True, 'channels': {}, 'errors': []}
+    try:
+        # Determine channels
+        if channel:
+            channels = [channel]
+        else:
+            try:
+                channels = self.get_registered_channels()
+            except Exception:
+                channels = []
+            if not channels:
+                try:
+                    from django.conf import settings as dj_settings
+                    channels = getattr(dj_settings, 'JOBS_CHANNELS', ['default'])
+                except Exception:
+                    channels = ['default']
+
+        now = timezone.now()
+        for ch in channels:
+            ch_result: Dict[str, Any] = {'examined': 0, 'recovered': 0, 'errors': []}
+            try:
+                # Check in-flight messages for this channel
+                state = self.get_queue_state(ch)
+                inflight = int(state.get('inflight_count', 0) or 0)
+
+                # Only recover when nothing is in flight (avoid racing real running work)
+                if inflight > 0:
+                    results['channels'][ch] = ch_result
+                    continue
+
+                # Build query for DB running jobs
+                from django.db.models import Q
+                q = Q(channel=ch, status='running')
+                if max_age_seconds is not None and max_age_seconds > 0:
+                    cutoff = now - timedelta(seconds=int(max_age_seconds))
+                    q &= Q(started_at__lt=cutoff)
+
+                running_qs = Job.objects.filter(q).order_by('started_at')
+                ch_result['examined'] = running_qs.count()
+
+                # Requeue each recovered job
+                for job in running_qs:
+                    try:
+                        # Reset DB status to pending
+                        job.status = 'pending'
+                        job.runner_id = None
+                        job.cancel_requested = False
+                        job.started_at = None
+                        job.finished_at = None
+                        job.last_error = job.last_error or ''
+                        job.stack_trace = job.stack_trace or ''
+                        job.save(update_fields=['status', 'runner_id', 'cancel_requested', 'started_at', 'finished_at', 'last_error', 'stack_trace', 'modified'])
+
+                        # Push to the stream immediately
+                        stream_key = self.keys.stream_broadcast(job.channel) if job.broadcast else self.keys.stream(job.channel)
+                        try:
+                            self.redis.xadd(stream_key, {
+                                'job_id': job.id,
+                                'func': job.func,
+                                'recovered': now.isoformat()
+                            })
+                        except Exception as xe:
+                            ch_result['errors'].append(f"xadd failed for {job.id}: {xe}")
+
+                        # Event trail (best effort)
+                        try:
+                            JobEvent.objects.create(
+                                job=job,
+                                channel=job.channel,
+                                event='retry',
+                                details={'reason': 'recover_stale_running'}
+                            )
+                        except Exception as ee:
+                            ch_result['errors'].append(f"event failed for {job.id}: {ee}")
+
+                        ch_result['recovered'] += 1
+                    except Exception as je:
+                        ch_result['errors'].append(f"{job.id}: {je}")
+
+            except Exception as ce:
+                ch_result['errors'].append(str(ce))
+
+            results['channels'][ch] = ch_result
+
+    except Exception as e:
+        results['status'] = False
+        results['errors'].append(str(e))
+
+    return results
+
+# Attach as a method on JobManager for runtime use
+JobManager.recover_stale_running = _jobmanager_recover_stale_running
+
+# Module-level singleton
+_manager = None
+
+
+def get_manager() -> JobManager:
+    """
+    Get the JobManager singleton instance.
+
+    Returns:
+        JobManager instance
+    """
+    global _manager
+    if not _manager:
+        _manager = JobManager()
+    return _manager
+
+
+# Convenience functions for the Django shell
+def clear_stuck_jobs(channel: str, idle_threshold_ms: int = 60000) -> Dict[str, Any]:
+    """
+    Convenience function to clear stuck jobs from the Django shell.
+
+    Usage:
+        from mojo.apps.jobs.manager import clear_stuck_jobs
+        result = clear_stuck_jobs('email', idle_threshold_ms=60000)
+        print(result)
+
+    Args:
+        channel: Channel name to clear
+        idle_threshold_ms: Consider stuck if idle longer than this (0 to clear all)
+
+    Returns:
+        Dict with results
+    """
+    return get_manager().clear_stuck_jobs(channel, idle_threshold_ms=idle_threshold_ms)
+
+
+def get_channel_health(channel: str) -> Dict[str, Any]:
+    """
+    Convenience function to check channel health from the Django shell.
+
+    Usage:
+        from mojo.apps.jobs.manager import get_channel_health
+        health = get_channel_health('email')
+        print(f"Pending: {health['messages']['pending']}")
+
+    Args:
+        channel: Channel name to check
+
+    Returns:
+        Channel health dict
+    """
+    return get_manager().get_channel_health(channel)
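
Taken together with the methods attached above, a recovery pass after an outage might look like this from the Django shell (the channel name is an example):

    from mojo.apps.jobs.manager import get_manager, get_channel_health

    mgr = get_manager()
    print(get_channel_health('email'))          # inspect first
    print(mgr.recover_stale_running('email'))   # reset DB 'running' rows with nothing in flight
    print(mgr.rebuild_scheduled('email'))       # repopulate scheduled ZSETs from DB truth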