more-compute 0.4.3__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- frontend/app/globals.css +734 -27
- frontend/app/layout.tsx +13 -3
- frontend/components/Notebook.tsx +2 -14
- frontend/components/cell/MonacoCell.tsx +99 -5
- frontend/components/layout/Sidebar.tsx +39 -4
- frontend/components/panels/ClaudePanel.tsx +461 -0
- frontend/components/popups/ComputePopup.tsx +739 -418
- frontend/components/popups/FilterPopup.tsx +305 -189
- frontend/components/popups/MetricsPopup.tsx +20 -1
- frontend/components/popups/ProviderConfigModal.tsx +322 -0
- frontend/components/popups/ProviderDropdown.tsx +398 -0
- frontend/components/popups/SettingsPopup.tsx +1 -1
- frontend/contexts/ClaudeContext.tsx +392 -0
- frontend/contexts/PodWebSocketContext.tsx +16 -21
- frontend/hooks/useInlineDiff.ts +269 -0
- frontend/lib/api.ts +323 -12
- frontend/lib/settings.ts +5 -0
- frontend/lib/websocket-native.ts +4 -8
- frontend/lib/websocket.ts +1 -2
- frontend/package-lock.json +733 -36
- frontend/package.json +2 -0
- frontend/public/assets/icons/providers/lambda_labs.svg +22 -0
- frontend/public/assets/icons/providers/prime_intellect.svg +18 -0
- frontend/public/assets/icons/providers/runpod.svg +9 -0
- frontend/public/assets/icons/providers/vastai.svg +1 -0
- frontend/settings.md +54 -0
- frontend/tsconfig.tsbuildinfo +1 -0
- frontend/types/claude.ts +194 -0
- kernel_run.py +13 -0
- {more_compute-0.4.3.dist-info → more_compute-0.5.0.dist-info}/METADATA +53 -11
- {more_compute-0.4.3.dist-info → more_compute-0.5.0.dist-info}/RECORD +56 -37
- {more_compute-0.4.3.dist-info → more_compute-0.5.0.dist-info}/WHEEL +1 -1
- morecompute/__init__.py +1 -1
- morecompute/__version__.py +1 -1
- morecompute/execution/executor.py +24 -67
- morecompute/execution/worker.py +6 -72
- morecompute/models/api_models.py +62 -0
- morecompute/notebook.py +11 -0
- morecompute/server.py +641 -133
- morecompute/services/claude_service.py +392 -0
- morecompute/services/pod_manager.py +168 -67
- morecompute/services/pod_monitor.py +67 -39
- morecompute/services/prime_intellect.py +0 -4
- morecompute/services/providers/__init__.py +92 -0
- morecompute/services/providers/base_provider.py +336 -0
- morecompute/services/providers/lambda_labs_provider.py +394 -0
- morecompute/services/providers/provider_factory.py +194 -0
- morecompute/services/providers/runpod_provider.py +504 -0
- morecompute/services/providers/vastai_provider.py +407 -0
- morecompute/utils/cell_magics.py +0 -3
- morecompute/utils/config_util.py +93 -3
- morecompute/utils/special_commands.py +5 -32
- morecompute/utils/version_check.py +117 -0
- frontend/styling_README.md +0 -23
- {more_compute-0.4.3.dist-info/licenses → more_compute-0.5.0.dist-info}/LICENSE +0 -0
- {more_compute-0.4.3.dist-info → more_compute-0.5.0.dist-info}/entry_points.txt +0 -0
- {more_compute-0.4.3.dist-info → more_compute-0.5.0.dist-info}/top_level.txt +0 -0
@@ -157,10 +157,8 @@ class NextZmqExecutor:
                 normalized_source, result, start_time, execution_count, websocket, cell_index
             )
             result['execution_time'] = f"{(time.time()-start_time)*1000:.1f}ms"
-            print(f"[EXECUTOR] Sending execution_complete for cell {cell_index}, status={result.get('status')}, has_error={result.get('error') is not None}", file=sys.stderr, flush=True)
             if websocket:
                 await websocket.send_json({'type': 'execution_complete', 'data': {'cell_index': cell_index, 'result': result}})
-            print(f"[EXECUTOR] Sent execution_complete successfully", file=sys.stderr, flush=True)
             return result
         # For remote execution OR mixed commands, fall through to send via ZMQ
 
@@ -178,14 +176,12 @@ class NextZmqExecutor:
         while True:
             # Check if this cell was interrupted
             if self.interrupted_cell == cell_index and interrupted_time is None:
-                print(f"[EXECUTE] Cell {cell_index} was interrupted, waiting for subprocess to be killed...", file=sys.stderr, flush=True)
                 interrupted_time = time.time()
                 # Don't break immediately - wait for execution_complete from worker
                 # Give worker 5 seconds to kill subprocess and send completion
 
            # If interrupted and waited long enough, force break
            if interrupted_time and (time.time() - interrupted_time > 5.0):
-                print(f"[EXECUTE] Cell {cell_index} interrupt timeout, breaking out", file=sys.stderr, flush=True)
                self.interrupted_cell = None  # Clear the flag
                result.update({
                    'status': 'error',
@@ -200,7 +196,6 @@ class NextZmqExecutor:
 
            # Timeout check for stuck operations
            if time.time() - start_time > max_wait:
-                print(f"[EXECUTE] Cell {cell_index} exceeded max wait time, timing out", file=sys.stderr, flush=True)
                result.update({
                    'status': 'error',
                    'error': {
@@ -236,7 +231,6 @@ class NextZmqExecutor:
                result.setdefault('execution_count', execution_count)
                # Clear interrupted flag if this was interrupted
                if self.interrupted_cell == cell_index:
-                    print(f"[EXECUTE] Cell {cell_index} completed after interrupt", file=sys.stderr, flush=True)
                    self.interrupted_cell = None
                break
 
@@ -248,16 +242,14 @@ class NextZmqExecutor:
            self.req.setsockopt(zmq.RCVTIMEO, -1)  # type: ignore[reportAttributeAccessIssue]
        except zmq.Again:
            # Timeout - worker didn't reply (probably killed), need to reset socket
-            print(f"[EXECUTE] Worker didn't reply, resetting REQ socket", file=sys.stderr, flush=True)
            try:
                self.req.close(0)  # type: ignore[reportAttributeAccessIssue]
                self.req = self.ctx.socket(zmq.REQ)  # type: ignore[reportUnknownMemberType, reportAttributeAccessIssue]
                self.req.connect(self.cmd_addr)  # type: ignore[reportAttributeAccessIssue]
-            except Exception
-
-        except Exception
+            except Exception:
+                pass
+        except Exception:
            # Some other error, also reset socket to be safe
-            print(f"[EXECUTE] Error receiving reply: {e}, resetting socket", file=sys.stderr, flush=True)
            try:
                self.req.setsockopt(zmq.RCVTIMEO, -1)  # type: ignore[reportAttributeAccessIssue]
                self.req.close(0)  # type: ignore[reportAttributeAccessIssue]
@@ -272,13 +264,9 @@ class NextZmqExecutor:
 
    async def interrupt_kernel(self, cell_index: int | None = None) -> None:
        """Interrupt the kernel using the control socket"""
-        import sys
-        print(f"[INTERRUPT] Starting interrupt for cell {cell_index}", file=sys.stderr, flush=True)
-
        # Mark this cell as interrupted so execute_cell can break out
        if isinstance(cell_index, int):
            self.interrupted_cell = cell_index
-            print(f"[INTERRUPT] Marked cell {cell_index} as interrupted", file=sys.stderr, flush=True)
 
        payload: dict[str, object] = {'type': 'interrupt'}
        if isinstance(cell_index, int):
@@ -290,11 +278,8 @@ class NextZmqExecutor:
            self.ctrl.setsockopt(zmq.RCVTIMEO, 1000)  # type: ignore[reportAttributeAccessIssue]
            self.ctrl.send_json(payload)  # type: ignore[reportAttributeAccessIssue]
            _ = cast(dict[str, object], self.ctrl.recv_json())  # type: ignore[reportAttributeAccessIssue]
-
-        except Exception as e:
-            print(f"[INTERRUPT] Could not send interrupt signal: {e}", file=sys.stderr, flush=True)
+        except Exception:
            # If control socket fails, try force-kill immediately
-            print(f"[INTERRUPT] Force killing worker immediately...", file=sys.stderr, flush=True)
            await self._force_kill_worker()
        finally:
            # Reset timeouts
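For context on the hunks above: a ZMQ REQ socket that has sent a request to a peer that died cannot send again until it is closed (linger 0) and recreated, which is the pattern the executor keeps after dropping its debug prints. A minimal pyzmq sketch of that reset; the helper name and signature are illustrative, not from this package:

import zmq

def reset_req_socket(ctx: zmq.Context, sock: zmq.Socket, addr: str) -> zmq.Socket:
    """Drop a REQ socket stuck waiting on a dead peer and return a fresh one."""
    try:
        sock.close(0)                 # linger=0: discard any pending outbound message
    except Exception:
        pass
    new_sock = ctx.socket(zmq.REQ)    # recreate in the same context
    new_sock.connect(addr)            # reconnect to the worker's command address
    return new_sock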
@@ -308,46 +293,37 @@ class NextZmqExecutor:
            except Exception:
                pass
 
-        print(f"[INTERRUPT] Interrupt complete", file=sys.stderr, flush=True)
-
    async def _force_kill_worker(self) -> None:
        """Force kill the worker process and respawn"""
-        import sys
-        print(f"[FORCE_KILL] Killing worker PID={self.worker_pid}", file=sys.stderr, flush=True)
-
        if self.worker_pid:
            try:
                # For blocking I/O, SIGKILL immediately - no mercy
-                print(f"[FORCE_KILL] Sending SIGKILL to {self.worker_pid}", file=sys.stderr, flush=True)
                os.kill(self.worker_pid, signal.SIGKILL)
                await asyncio.sleep(0.1)  # Brief wait for process to die
            except ProcessLookupError:
-
-            except Exception
-
+                pass
+            except Exception:
+                pass
 
        # Also try via Popen object if available
        if self.worker_proc:
            try:
-                print(f"[FORCE_KILL] Killing via Popen object", file=sys.stderr, flush=True)
                self.worker_proc.kill()  # SIGKILL directly
                await asyncio.sleep(0.1)
-            except Exception
-
+            except Exception:
+                pass
 
        # CRITICAL: Reset socket state - close and recreate
        # The REQ socket may be waiting for a reply from the dead worker
        try:
-            print(f"[FORCE_KILL] Resetting REQ and CTRL sockets", file=sys.stderr, flush=True)
            self.req.close(0)  # type: ignore[reportAttributeAccessIssue]
            self.req = self.ctx.socket(zmq.REQ)  # type: ignore[reportUnknownMemberType, reportAttributeAccessIssue]
            self.req.connect(self.cmd_addr)  # type: ignore[reportAttributeAccessIssue]
            self.ctrl.close(0)  # type: ignore[reportAttributeAccessIssue]
            self.ctrl = self.ctx.socket(zmq.DEALER)  # type: ignore[reportUnknownMemberType, reportAttributeAccessIssue]
            self.ctrl.connect(self.ctrl_addr)  # type: ignore[reportAttributeAccessIssue]
-
-
-            print(f"[FORCE_KILL] Error resetting sockets: {e}", file=sys.stderr, flush=True)
+        except Exception:
+            pass
 
        # Respawn worker
        try:
@@ -357,27 +333,21 @@ class NextZmqExecutor:
 
    def reset_kernel(self) -> None:
        """Reset the kernel by shutting down worker and restarting"""
-        import sys
-        print(f"[RESET] Starting kernel reset, worker_pid={self.worker_pid}, is_remote={self.is_remote}", file=sys.stderr, flush=True)
-
        # If connected to remote GPU, DON'T kill the worker - just send shutdown message
        if self.is_remote:
-            print(f"[RESET] Remote worker - sending shutdown message only", file=sys.stderr, flush=True)
            try:
                self.req.setsockopt(zmq.SNDTIMEO, 2000)  # type: ignore[reportAttributeAccessIssue]
                self.req.setsockopt(zmq.RCVTIMEO, 2000)  # type: ignore[reportAttributeAccessIssue]
                self.req.send_json({'type': 'shutdown'})  # type: ignore[reportAttributeAccessIssue]
                _ = cast(dict[str, object], self.req.recv_json())  # type: ignore[reportAttributeAccessIssue]
-
-
-                print(f"[RESET] Remote worker shutdown failed: {e}", file=sys.stderr, flush=True)
+            except Exception:
+                pass
            finally:
                self.req.setsockopt(zmq.SNDTIMEO, -1)  # type: ignore[reportAttributeAccessIssue]
                self.req.setsockopt(zmq.RCVTIMEO, -1)  # type: ignore[reportAttributeAccessIssue]
 
            # Reset execution count but don't respawn worker
            self.execution_count = 0
-            print(f"[RESET] Remote kernel reset complete", file=sys.stderr, flush=True)
            return
 
        # Local worker mode - try graceful shutdown first
@@ -386,9 +356,8 @@ class NextZmqExecutor:
            self.req.setsockopt(zmq.RCVTIMEO, 500)  # type: ignore[reportAttributeAccessIssue]
            self.req.send_json({'type': 'shutdown'})  # type: ignore[reportAttributeAccessIssue]
            _ = cast(dict[str, object], self.req.recv_json())  # type: ignore[reportAttributeAccessIssue]
-
-
-            print(f"[RESET] Graceful shutdown failed: {e}", file=sys.stderr, flush=True)
+        except Exception:
+            pass
        finally:
            self.req.setsockopt(zmq.SNDTIMEO, -1)  # type: ignore[reportAttributeAccessIssue]
            self.req.setsockopt(zmq.RCVTIMEO, -1)  # type: ignore[reportAttributeAccessIssue]
@@ -396,35 +365,30 @@ class NextZmqExecutor:
        # Force kill local worker if needed
        if self.worker_pid:
            try:
-                print(f"[RESET] Sending SIGTERM to worker PID {self.worker_pid}", file=sys.stderr, flush=True)
                os.kill(self.worker_pid, signal.SIGTERM)
                time.sleep(0.3)  # Give it time to shutdown gracefully
                try:
                    # Check if still alive
                    os.kill(self.worker_pid, 0)
                    # Still alive, force kill
-                    print(f"[RESET] Worker still alive, sending SIGKILL", file=sys.stderr, flush=True)
                    os.kill(self.worker_pid, signal.SIGKILL)
                    time.sleep(0.2)  # Wait for SIGKILL to complete
                except ProcessLookupError:
-
-            except Exception
-
+                    pass
+            except Exception:
+                pass
 
        if self.worker_proc:
            try:
                self.worker_proc.terminate()
                self.worker_proc.wait(timeout=1)
-                print(f"[RESET] Worker process terminated via Popen", file=sys.stderr, flush=True)
            except Exception:
                try:
                    self.worker_proc.kill()
-
-
-                    print(f"[RESET] Error killing via Popen: {e}", file=sys.stderr, flush=True)
+                except Exception:
+                    pass
 
        # Close sockets first, BEFORE recreating them
-        print(f"[RESET] Closing old sockets", file=sys.stderr, flush=True)
        try:
            self.req.close(0)  # type: ignore[reportAttributeAccessIssue]
        except Exception:
@@ -436,7 +400,6 @@ class NextZmqExecutor:
 
        # Wait for ZMQ to release the sockets (critical!)
        time.sleep(0.5)
-        print(f"[RESET] Sockets closed, waited for cleanup", file=sys.stderr, flush=True)
 
        # Reset state
        self.execution_count = 0
@@ -445,14 +408,12 @@ class NextZmqExecutor:
 
        # Recreate sockets
        try:
-            print(f"[RESET] Creating new sockets", file=sys.stderr, flush=True)
            self.req = self.ctx.socket(zmq.REQ)  # type: ignore[reportUnknownMemberType, reportAttributeAccessIssue]
            self.req.connect(self.cmd_addr)  # type: ignore[reportAttributeAccessIssue]
            self.ctrl = self.ctx.socket(zmq.DEALER)  # type: ignore[reportUnknownMemberType, reportAttributeAccessIssue]
            self.ctrl.connect(self.ctrl_addr)  # type: ignore[reportAttributeAccessIssue]
-
-
-            print(f"[RESET] Error creating sockets: {e}", file=sys.stderr, flush=True)
+        except Exception:
+            pass
 
        # Reset special handler
        if self.special_handler is not None:
@@ -460,10 +421,6 @@ class NextZmqExecutor:
 
        # Respawn worker
        try:
-            print(f"[RESET] Respawning new worker", file=sys.stderr, flush=True)
            self._ensure_worker()
-
-
-            print(f"[RESET] Error respawning worker: {e}", file=sys.stderr, flush=True)
-            import traceback
-            traceback.print_exc()
+        except Exception:
+            pass
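The reset_kernel hunks above keep the 0.4.3 shutdown escalation, minus the logging: SIGTERM first, probe with signal 0, then SIGKILL. A standalone sketch of that escalation (the pid argument and function name are illustrative, not part of the package API):

import os
import signal
import time

def stop_process(pid: int) -> None:
    """Ask nicely with SIGTERM, then force-kill if the process is still alive."""
    try:
        os.kill(pid, signal.SIGTERM)
        time.sleep(0.3)                  # give it a moment to exit gracefully
        os.kill(pid, 0)                  # signal 0 only checks that the pid still exists
        os.kill(pid, signal.SIGKILL)     # still alive: force kill
    except ProcessLookupError:
        pass                             # already gone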
morecompute/execution/worker.py
CHANGED
@@ -58,7 +58,6 @@ def _inject_shell_command_function(globals_dict: dict):
 
        # Check if already interrupted before starting new command
        if _interrupt_requested:
-            print(f"[WORKER] Shell command skipped due to previous interrupt", file=sys.stderr, flush=True)
            raise KeyboardInterrupt("Execution was interrupted")
 
        # Prepare command and environment (using shared utilities)
@@ -80,12 +79,6 @@ def _inject_shell_command_function(globals_dict: dict):
        _current_subprocess = process
        if os.name != 'nt':
            _current_process_group = os.getpgid(process.pid)
-            # Also create a new process group for clean killing
-            print(f"[WORKER] Started subprocess PID={process.pid}, PGID={_current_process_group}", file=sys.stderr, flush=True)
-        else:
-            print(f"[WORKER] Started subprocess PID={process.pid}", file=sys.stderr, flush=True)
-
-        sys.stderr.flush()
 
        try:
            # Stream output line by line
@@ -120,7 +113,6 @@ def _inject_shell_command_function(globals_dict: dict):
            except subprocess.TimeoutExpired:
                # Check if interrupted
                if _interrupt_requested:
-                    print(f"[WORKER] Interrupt detected, killing subprocess", file=sys.stderr, flush=True)
                    try:
                        process.kill()
                    except Exception:
@@ -135,7 +127,6 @@ def _inject_shell_command_function(globals_dict: dict):
                    except Exception:
                        pass
                    # Don't wait for process or threads - raise immediately
-                    print(f"[WORKER] Raising KeyboardInterrupt immediately", file=sys.stderr, flush=True)
                    raise KeyboardInterrupt("Execution interrupted by user")
 
        # Normal completion - join threads briefly
@@ -235,63 +226,34 @@ def control_thread_main(ctrl, current_cell_ref):
    """Run control channel in separate thread (Jupyter pattern)"""
    global _interrupt_requested, _current_subprocess, _current_process_group
 
-    print(f"[CONTROL] Control thread started", file=sys.stderr, flush=True)
-
    while True:
        try:
            # Block waiting for control messages
            identity = ctrl.recv()
            msg = ctrl.recv_json()
 
-            print(f"[CONTROL] Received: {msg}", file=sys.stderr, flush=True)
-
            mtype = msg.get('type')
            if mtype == 'interrupt':
                requested_cell = msg.get('cell_index')
                current_cell = current_cell_ref[0]
 
-                print(f"[CONTROL] Interrupt check: requested={requested_cell}, current={current_cell}, subprocess={_current_subprocess}, pgid={_current_process_group}", file=sys.stderr)
-                sys.stderr.flush()
-
                if requested_cell is None or requested_cell == current_cell:
-                    print(f"[CONTROL] ✓ Match! Processing interrupt for cell {requested_cell}", file=sys.stderr)
-                    sys.stderr.flush()
-
                    # Set global flag
                    _interrupt_requested = True
 
                    # Send SIGINT to process group (Jupyter pattern)
                    if _current_process_group and os.name != 'nt':
                        try:
-                            print(f"[CONTROL] Sending SIGINT to process group {_current_process_group}", file=sys.stderr)
-                            sys.stderr.flush()
                            os.killpg(_current_process_group, signal.SIGINT)
-
-
-                        except Exception as e:
-                            print(f"[CONTROL] Failed to kill process group: {e}", file=sys.stderr)
-                            sys.stderr.flush()
+                        except Exception:
+                            pass
 
                    # Also kill subprocess directly
                    if _current_subprocess:
                        try:
-                            print(f"[CONTROL] Killing subprocess PID={_current_subprocess.pid}", file=sys.stderr)
-                            sys.stderr.flush()
                            _current_subprocess.kill()
-
-
-                        except Exception as e:
-                            print(f"[CONTROL] Failed to kill subprocess: {e}", file=sys.stderr)
-                            sys.stderr.flush()
-
-                    # Don't send SIGINT to self - let the execution thread finish gracefully
-                    # Sending SIGINT here can interrupt the execution thread before it sends
-                    # completion messages, leaving the frontend in a confused state
-                    print(f"[CONTROL] Interrupt signal sent, waiting for execution thread to finish", file=sys.stderr)
-                    sys.stderr.flush()
-                else:
-                    print(f"[CONTROL] ✗ NO MATCH! Ignoring interrupt (requested cell {requested_cell} != current cell {current_cell})", file=sys.stderr)
-                    sys.stderr.flush()
+                        except Exception:
+                            pass
 
            # Reply
            ctrl.send(identity, zmq.SNDMORE)
@@ -302,27 +264,18 @@ def control_thread_main(ctrl, current_cell_ref):
            ctrl.send_json({'ok': True, 'pid': os.getpid()})
            break
 
-        except Exception
-
-            import traceback
-            traceback.print_exc()
+        except Exception:
+            pass
 
 
def worker_main():
    global _current_subprocess, _interrupt_requested, _current_process_group
 
-    print(f"[WORKER] ========================================", file=sys.stderr, flush=True)
-    print(f"[WORKER] Starting THREADED worker (new code!)", file=sys.stderr, flush=True)
-    print(f"[WORKER] PID: {os.getpid()}", file=sys.stderr, flush=True)
-    print(f"[WORKER] ========================================", file=sys.stderr, flush=True)
-
    _setup_signals()
    cmd_addr = os.environ['MC_ZMQ_CMD_ADDR']
    pub_addr = os.environ['MC_ZMQ_PUB_ADDR']
    ctrl_addr = os.environ.get('MC_ZMQ_CTRL_ADDR', cmd_addr.replace('5555', '5557'))
 
-    print(f"[WORKER] Binding to control socket: {ctrl_addr}", file=sys.stderr, flush=True)
-
    ctx = zmq.Context.instance()
    rep = ctx.socket(zmq.REP)
    rep.bind(cmd_addr)
@@ -341,7 +294,6 @@ def worker_main():
    # Start control thread (Jupyter pattern)
    ctrl_thread = threading.Thread(target=control_thread_main, args=(ctrl, current_cell_ref), daemon=True)
    ctrl_thread.start()
-    print(f"[WORKER] Started control thread", file=sys.stderr, flush=True)
 
    # Persistent REPL state
    g = {"__name__": "__main__"}
@@ -380,9 +332,7 @@ def worker_main():
            cell_index = msg.get('cell_index')
            requested_count = msg.get('execution_count')
 
-            print(f"[WORKER] Setting current_cell_ref[0] = {cell_index}", file=sys.stderr, flush=True)
            current_cell_ref[0] = cell_index  # Update for control thread
-            print(f"[WORKER] Confirmed current_cell_ref[0] = {current_cell_ref[0]}", file=sys.stderr, flush=True)
 
            if isinstance(requested_count, int):
                exec_count = requested_count - 1
@@ -404,7 +354,6 @@ def worker_main():
 
                try:
                    shell_cmd = code.strip()[1:].strip()
-                    print(f"[WORKER] Executing shell: {shell_cmd[:50]}...", file=sys.stderr, flush=True)
 
                    # Run shell command with streaming
                    process = subprocess.Popen(
@@ -453,7 +402,6 @@ def worker_main():
                            break
                        except subprocess.TimeoutExpired:
                            if _interrupt_requested:
-                                print(f"[WORKER] Interrupt detected, killing shell process", file=sys.stderr, flush=True)
                                try:
                                    process.kill()
                                except Exception:
@@ -468,7 +416,6 @@ def worker_main():
                                except Exception:
                                    pass
                                # Set interrupted status immediately
-                                print(f"[WORKER] Setting error status for interrupted shell command", file=sys.stderr, flush=True)
                                status = 'error'
                                error_payload = {
                                    'ename': 'KeyboardInterrupt',
@@ -484,8 +431,6 @@ def worker_main():
                    stdout_thread.join(timeout=0.1)
                    stderr_thread.join(timeout=0.1)
 
-                    print(f"[WORKER] Shell process finished: return_code={return_code}", file=sys.stderr, flush=True)
-
                    # Check return code
                    if return_code != 0:
                        status = 'error'
@@ -494,7 +439,6 @@ def worker_main():
                            'evalue': f'Command failed with return code {return_code}',
                            'traceback': [f'Shell command failed: {shell_cmd}']
                        }
-                        print(f"[WORKER] Set error_payload to ShellCommandError", file=sys.stderr, flush=True)
                except KeyboardInterrupt:
                    status = 'error'
                    error_payload = {
@@ -511,17 +455,12 @@ def worker_main():
                    error_payload = {'ename': type(exc).__name__, 'evalue': str(exc), 'traceback': traceback.format_exc().split('\n')}
 
                duration_ms = f"{(time.time()-start)*1000:.1f}ms"
-                print(f"[WORKER] Sending completion messages: status={status}, error={error_payload is not None}", file=sys.stderr, flush=True)
                if error_payload:
                    pub.send_json({'type': 'execution_error', 'cell_index': cell_index, 'error': error_payload})
-                    print(f"[WORKER] Sent execution_error", file=sys.stderr, flush=True)
                pub.send_json({'type': 'execution_complete', 'cell_index': cell_index, 'result': {'status': status, 'execution_count': exec_count, 'execution_time': duration_ms, 'outputs': [], 'error': error_payload}})
-                print(f"[WORKER] Sent execution_complete", file=sys.stderr, flush=True)
                rep.send_json({'ok': True, 'pid': os.getpid()})
 
-                print(f"[WORKER] Clearing current_cell_ref[0] (was {current_cell_ref[0]})", file=sys.stderr, flush=True)
                current_cell_ref[0] = None
-                print(f"[WORKER] Confirmed current_cell_ref[0] = {current_cell_ref[0]}", file=sys.stderr, flush=True)
                continue
 
            # Regular Python code execution
@@ -590,17 +529,12 @@ def worker_main():
            sys.stdout, sys.stderr = old_out, old_err
            exec_count += 1
            duration_ms = f"{(time.time()-start)*1000:.1f}ms"
-            print(f"[WORKER] Sending completion messages (Python): status={status}, error={error_payload is not None}", file=sys.stderr, flush=True)
            if error_payload:
                pub.send_json({'type': 'execution_error', 'cell_index': cell_index, 'error': error_payload})
-                print(f"[WORKER] Sent execution_error", file=sys.stderr, flush=True)
            pub.send_json({'type': 'execution_complete', 'cell_index': cell_index, 'result': {'status': status, 'execution_count': exec_count, 'execution_time': duration_ms, 'outputs': [], 'error': error_payload}})
-            print(f"[WORKER] Sent execution_complete", file=sys.stderr, flush=True)
            rep.send_json({'ok': True, 'pid': os.getpid()})
 
-            print(f"[WORKER] Clearing current_cell_ref[0] (was {current_cell_ref[0]})", file=sys.stderr, flush=True)
            current_cell_ref[0] = None
-            print(f"[WORKER] Confirmed current_cell_ref[0] = {current_cell_ref[0]}", file=sys.stderr, flush=True)
 
    try:
        rep.close(0); pub.close(0); ctrl.close(0)
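The control-thread hunks above retain the Jupyter-style interrupt: the shell subprocess is started in its own process group so SIGINT can be delivered to the whole group without signalling the worker itself. A minimal POSIX-only sketch of that pattern; the sleep command is a stand-in, not something the package runs:

import os
import signal
import subprocess

# Start the child in a new session/process group (POSIX only).
proc = subprocess.Popen(["sleep", "60"], preexec_fn=os.setsid)
pgid = os.getpgid(proc.pid)

# Interrupt the whole group; fall back to killing the child directly.
try:
    os.killpg(pgid, signal.SIGINT)
except Exception:
    proc.kill()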
morecompute/models/api_models.py
CHANGED
@@ -195,3 +195,65 @@ class AvailabilityQuery(BaseModel):
    gpu_count: int | None = None
    gpu_type: str | None = None
    security: str | None = None
+
+
+# ============================================================================
+# Multi-Provider API Models
+# ============================================================================
+
+class ProviderInfo(BaseModel):
+    """Information about a GPU cloud provider."""
+    name: str  # Internal name (e.g., "runpod")
+    display_name: str  # Human-readable name (e.g., "RunPod")
+    api_key_env_name: str  # Environment variable name
+    supports_ssh: bool  # Whether provider supports SSH connections
+    dashboard_url: str  # URL to get API key
+    configured: bool = False  # Whether API key is configured
+    is_active: bool = False  # Whether this is the currently active provider
+
+
+class ProviderListResponse(BaseModel):
+    """Response model for listing providers."""
+    providers: list[ProviderInfo]
+    active_provider: str | None = None
+
+
+class ProviderConfigRequest(BaseModel):
+    """Request model for configuring a provider API key."""
+    api_key: str
+    token_secret: str | None = None  # For Modal which requires two tokens
+    make_active: bool = False  # Whether to make this the active provider
+
+
+class SetActiveProviderRequest(BaseModel):
+    """Request model for setting the active provider."""
+    provider: str
+
+
+class GpuAvailabilityResponse(BaseModel):
+    """Response model for GPU availability."""
+    data: list[dict]
+    total_count: int
+    provider: str
+    note: str | None = None
+
+
+class PodListResponse(BaseModel):
+    """Response model for listing pods."""
+    data: list[dict]
+    total_count: int
+    offset: int
+    limit: int
+    provider: str
+
+
+class PodWithProvider(PodResponse):
+    """Pod response with provider information."""
+    provider: str = "prime_intellect"
+
+
+class CreatePodWithProviderRequest(BaseModel):
+    """Request to create a pod with explicit provider selection."""
+    pod: PodConfig
+    provider_name: str  # Provider to use (e.g., "runpod", "lambda_labs")
+    team: TeamConfig | None = None
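As a quick illustration of how the new models compose, a hedged sketch; the field values below are placeholders for illustration only, not defaults or URLs shipped by the package:

from morecompute.models.api_models import ProviderInfo, ProviderListResponse

runpod = ProviderInfo(
    name="runpod",
    display_name="RunPod",
    api_key_env_name="RUNPOD_API_KEY",              # placeholder value
    supports_ssh=True,                              # placeholder value
    dashboard_url="https://example.com/api-keys",   # placeholder value
    configured=False,
)
listing = ProviderListResponse(providers=[runpod], active_provider="runpod")
print(listing)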
morecompute/notebook.py
CHANGED
@@ -127,6 +127,17 @@ class Notebook:
            cell['id'] = self._generate_cell_id()
            self.cells.append(cell)
 
+        # Ensure at least one empty cell exists
+        if not self.cells:
+            self.cells.append({
+                'id': self._generate_cell_id(),
+                'cell_type': 'code',
+                'source': '',
+                'metadata': {},
+                'outputs': [],
+                'execution_count': None
+            })
+
        self.metadata = data.get('metadata', {})
        self.file_path = file_path
 