omi-watcher 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- omi_watcher-0.1.0/PKG-INFO +5 -0
- omi_watcher-0.1.0/README.md +114 -0
- omi_watcher-0.1.0/pyproject.toml +15 -0
- omi_watcher-0.1.0/setup.cfg +4 -0
- omi_watcher-0.1.0/src/omi_watcher/__init__.py +1 -0
- omi_watcher-0.1.0/src/omi_watcher/cli.py +233 -0
- omi_watcher-0.1.0/src/omi_watcher/daemon.py +319 -0
- omi_watcher-0.1.0/src/omi_watcher/db.py +128 -0
- omi_watcher-0.1.0/src/omi_watcher/models.py +75 -0
- omi_watcher-0.1.0/src/omi_watcher/protocol.py +54 -0
- omi_watcher-0.1.0/src/omi_watcher/scheduler.py +44 -0
- omi_watcher-0.1.0/src/omi_watcher.egg-info/PKG-INFO +5 -0
- omi_watcher-0.1.0/src/omi_watcher.egg-info/SOURCES.txt +17 -0
- omi_watcher-0.1.0/src/omi_watcher.egg-info/dependency_links.txt +1 -0
- omi_watcher-0.1.0/src/omi_watcher.egg-info/entry_points.txt +2 -0
- omi_watcher-0.1.0/src/omi_watcher.egg-info/top_level.txt +1 -0
- omi_watcher-0.1.0/tests/test_db.py +86 -0
- omi_watcher-0.1.0/tests/test_integration.py +105 -0
- omi_watcher-0.1.0/tests/test_scheduler.py +82 -0
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
# omi-watcher
|
|
2
|
+
|
|
3
|
+
A CLI utility for managing background jobs with priority-based scheduling and eviction.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- **Queue background jobs** — run any bash command as a managed background job
|
|
8
|
+
- **Priority scheduling** — jobs have priorities: `low`, `medium`, `high`, `critical`
|
|
9
|
+
- **Max 10 concurrent jobs** — higher-priority jobs evict lower-priority ones when full
|
|
10
|
+
- **Eviction with re-queue** — evicted jobs are killed and automatically re-queued
|
|
11
|
+
- **Job monitoring** — view status, logs (stdout/stderr), and exit codes
|
|
12
|
+
- **Persistent state** — SQLite database survives daemon restarts
|
|
13
|
+
|
|
14
|
+
## Installation
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
cd omi-watcher
|
|
18
|
+
pip install -e .
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
## Usage
|
|
22
|
+
|
|
23
|
+
### Start the daemon
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
omi-watcher start
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
### Submit a job
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
omi-watcher submit "make build" -p high
|
|
33
|
+
omi-watcher submit "python train.py" -p critical
|
|
34
|
+
omi-watcher submit "sleep 60" # default: medium priority
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
### Check job status
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
omi-watcher status # all jobs
|
|
41
|
+
omi-watcher status abc123 # specific job
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
Example output:
|
|
45
|
+
```
|
|
46
|
+
ID STATUS PRIORITY PID EXIT COMMAND
|
|
47
|
+
------------------------------------------------------------------------------
|
|
48
|
+
a1b2c3d4 running high 12345 - make build
|
|
49
|
+
e5f6g7h8 queued medium - - sleep 60
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
### View job logs
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
omi-watcher logs a1b2c3d4
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
### Cancel a job
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
omi-watcher cancel a1b2c3d4
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
### Purge completed/failed jobs
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
omi-watcher purge
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
### Stop the daemon
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
omi-watcher stop
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
## Architecture
|
|
77
|
+
|
|
78
|
+
```
|
|
79
|
+
┌──────────────┐ Unix Socket ┌──────────────────┐
|
|
80
|
+
│ CLI Client │ ◄───────────────────► │ Daemon │
|
|
81
|
+
│ (omi-watcher)│ │ │
|
|
82
|
+
└──────────────┘ │ ┌──────────────┐ │
|
|
83
|
+
│ │ Scheduler │ │
|
|
84
|
+
│ └──────────────┘ │
|
|
85
|
+
│ ┌──────────────┐ │
|
|
86
|
+
│ │ SQLite DB │ │
|
|
87
|
+
│ └──────────────┘ │
|
|
88
|
+
│ ┌──────────────┐ │
|
|
89
|
+
│ │ Subprocesses │ │
|
|
90
|
+
│ │ (max 10) │ │
|
|
91
|
+
│ └──────────────┘ │
|
|
92
|
+
└──────────────────┘
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
- **Daemon** runs in the background, listening on `~/.omi-watcher/omi-watcher.sock`
|
|
96
|
+
- **Scheduler** picks the highest-priority queued job and launches it
|
|
97
|
+
- When all 10 slots are full and a higher-priority job arrives, the lowest-priority running job is killed and re-queued
|
|
98
|
+
- Job stdout/stderr are captured to `~/.omi-watcher/logs/<job_id>.out` and `.err`
|
|
99
|
+
- State is persisted in `~/.omi-watcher/omi-watcher.db`
|
|
100
|
+
|
|
101
|
+
## Priority & Eviction
|
|
102
|
+
|
|
103
|
+
| Priority | Value |
|
|
104
|
+
|----------|-------|
|
|
105
|
+
| critical | 3 |
|
|
106
|
+
| high | 2 |
|
|
107
|
+
| medium | 1 |
|
|
108
|
+
| low | 0 |
|
|
109
|
+
|
|
110
|
+
When all 10 slots are occupied and a higher-priority job is submitted:
|
|
111
|
+
1. The lowest-priority running job is selected (ties broken by most recently started)
|
|
112
|
+
2. It receives SIGTERM
|
|
113
|
+
3. It is re-queued with its original priority and an incremented retry count
|
|
114
|
+
4. The new higher-priority job takes its slot
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=64"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "omi-watcher"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "A CLI utility for managing background jobs with priority-based scheduling"
|
|
9
|
+
requires-python = ">=3.9"
|
|
10
|
+
|
|
11
|
+
[project.scripts]
|
|
12
|
+
omi-watcher = "omi_watcher.cli:main"
|
|
13
|
+
|
|
14
|
+
[tool.setuptools.packages.find]
|
|
15
|
+
where = ["src"]
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""omi-watcher: Background job queue with priority-based scheduling."""
|
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import os
|
|
3
|
+
import signal
|
|
4
|
+
import subprocess
|
|
5
|
+
import sys
|
|
6
|
+
import time
|
|
7
|
+
|
|
8
|
+
from .models import APP_DIR, PID_FILE, SOCKET_PATH
|
|
9
|
+
from .protocol import client_request
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _build_parser():
    """Create the argument parser with one sub-parser per CLI command."""
    parser = argparse.ArgumentParser(
        prog="omi-watcher",
        description="Background job queue with priority-based scheduling",
    )
    commands = parser.add_subparsers(dest="command", required=True)

    # Commands without arguments of their own.
    commands.add_parser("start", help="Start the daemon")
    commands.add_parser("stop", help="Stop the daemon")

    submit = commands.add_parser("submit", help="Submit a job")
    submit.add_argument("cmd", help="Bash command to run")
    submit.add_argument(
        "-p",
        "--priority",
        default="medium",
        choices=["low", "medium", "high", "critical"],
        help="Job priority (default: medium)",
    )

    status = commands.add_parser("status", help="Show job status")
    status.add_argument("job_id", nargs="?", default=None, help="Job ID (omit for all)")

    cancel = commands.add_parser("cancel", help="Cancel a job")
    cancel.add_argument("job_id", help="Job ID to cancel")

    logs = commands.add_parser("logs", help="View job logs")
    logs.add_argument("job_id", help="Job ID")

    commands.add_parser("purge", help="Remove completed/failed jobs")
    return parser


def main():
    """Parse the command line and dispatch to the matching ``cmd_*`` handler."""
    args = _build_parser().parse_args()

    dispatch = {
        "start": cmd_start,
        "stop": cmd_stop,
        "submit": cmd_submit,
        "status": cmd_status,
        "cancel": cmd_cancel,
        "logs": cmd_logs,
        "purge": cmd_purge,
    }
    handler = dispatch[args.command]
    handler(args)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _daemon_running() -> bool:
|
|
63
|
+
if not os.path.exists(PID_FILE):
|
|
64
|
+
return False
|
|
65
|
+
with open(PID_FILE) as f:
|
|
66
|
+
pid = int(f.read().strip())
|
|
67
|
+
try:
|
|
68
|
+
os.kill(pid, 0)
|
|
69
|
+
return True
|
|
70
|
+
except ProcessLookupError:
|
|
71
|
+
return False
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _require_daemon():
    """Exit with status 1 and a hint on stderr unless the daemon is running."""
    if _daemon_running():
        return
    print("Error: Daemon is not running. Start it with: omi-watcher start", file=sys.stderr)
    sys.exit(1)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def cmd_start(args):
    """Start the daemon as a detached background process (double fork).

    The calling process reaps the intermediate child, then polls for up to
    two seconds until the daemon's PID file names a live process, and reports
    the result. The detached grandchild runs ``run_daemon`` and always leaves
    through ``os._exit`` so it can never fall back into the CLI's ``main``.
    """
    if _daemon_running():
        print("Daemon is already running.")
        return

    os.makedirs(APP_DIR, exist_ok=True)

    def _is_up() -> bool:
        # The PID file may be observed while the daemon is still writing it.
        try:
            return _daemon_running()
        except ValueError:
            return False

    pid = os.fork()
    if pid > 0:
        # Parent: the intermediate child exits right after the second fork;
        # collect it so it does not linger as a zombie.
        try:
            os.waitpid(pid, 0)
        except ChildProcessError:
            pass

        # Poll rather than sleeping a fixed interval: returns as soon as the
        # daemon is up and tolerates a slow start.
        deadline = time.monotonic() + 2.0
        started = _is_up()
        while not started and time.monotonic() < deadline:
            time.sleep(0.1)
            started = _is_up()

        if started:
            print(f"Daemon started (PID file: {PID_FILE})")
        else:
            print("Warning: Daemon may have failed to start. Check ~/.omi-watcher/daemon.log")
        return

    # Child: become session leader
    os.setsid()

    # Second fork to fully detach
    if os.fork() > 0:
        os._exit(0)

    # Grandchild: point stdin at /dev/null and run the daemon.
    exit_code = 1
    try:
        devnull_fd = os.open(os.devnull, os.O_RDONLY)
        os.dup2(devnull_fd, 0)
        if devnull_fd != 0:
            os.close(devnull_fd)

        from .daemon import run_daemon
        run_daemon()
        exit_code = 0
    except Exception:
        import traceback
        traceback.print_exc()
        sys.stderr.flush()  # os._exit does not flush stdio buffers
    finally:
        os._exit(exit_code)
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def cmd_stop(args):
    """Ask the daemon to stop and wait up to about three seconds for it to exit.

    Exits with status 1 when the daemon cannot be reached or rejects the
    request, matching the other commands' error handling.
    """
    _require_daemon()

    try:
        resp = client_request(SOCKET_PATH, "stop")
    except (ConnectionRefusedError, FileNotFoundError):
        # FileNotFoundError: the PID is alive but the socket file is gone.
        print("Error: Could not connect to daemon.", file=sys.stderr)
        sys.exit(1)

    if not resp.get("ok"):
        print(f"Error: {resp.get('error', 'Unknown error')}", file=sys.stderr)
        sys.exit(1)

    print("Daemon stopping...")
    # Wait briefly for clean shutdown
    for _ in range(10):
        time.sleep(0.3)
        if not _daemon_running():
            print("Daemon stopped.")
            return
    print("Daemon is still shutting down.")
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def cmd_submit(args):
    """Send a ``submit`` request for ``args.cmd`` at ``args.priority`` and report the new job."""
    _require_daemon()

    request = {"command": args.cmd, "priority": args.priority}
    resp = client_request(SOCKET_PATH, "submit", request)

    if not resp.get("ok"):
        print(f"Error: {resp.get('error', 'Unknown error')}", file=sys.stderr)
        sys.exit(1)

    job = resp["data"]
    print(f"Job submitted: {job['id']} (priority: {job['priority']})")
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def cmd_status(args):
    """Show one job in detail (when ``args.job_id`` is set) or a table of all jobs."""
    _require_daemon()

    payload = {"job_id": args.job_id} if args.job_id else {}
    resp = client_request(SOCKET_PATH, "status", payload)
    if not resp.get("ok"):
        print(f"Error: {resp.get('error', 'Unknown error')}", file=sys.stderr)
        sys.exit(1)

    data = resp["data"]

    # A dict is a single job; a list is the full job table.
    if isinstance(data, dict):
        _print_job(data)
        return
    if not isinstance(data, list):
        return
    if not data:
        print("No jobs found.")
        return

    # Table header
    print(f"{'ID':<10} {'STATUS':<12} {'PRIORITY':<10} {'PID':<8} {'EXIT':<6} {'COMMAND'}")
    print("-" * 78)
    for j in data:
        pid = str(j.get("pid") or "-")
        raw_exit = j.get("exit_code")
        exit_code = "-" if raw_exit is None else str(raw_exit)
        cmd_display = j["command"][:40]  # keep rows on one line
        print(f"{j['id']:<10} {j['status']:<12} {j['priority']:<10} {pid:<8} {exit_code:<6} {cmd_display}")
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def _print_job(j: dict):
|
|
178
|
+
print(f"Job ID: {j['id']}")
|
|
179
|
+
print(f"Command: {j['command']}")
|
|
180
|
+
print(f"Status: {j['status']}")
|
|
181
|
+
print(f"Priority: {j['priority']}")
|
|
182
|
+
print(f"PID: {j.get('pid') or '-'}")
|
|
183
|
+
print(f"Exit Code: {j['exit_code'] if j.get('exit_code') is not None else '-'}")
|
|
184
|
+
print(f"Created: {j.get('created_at', '-')}")
|
|
185
|
+
print(f"Started: {j.get('started_at') or '-'}")
|
|
186
|
+
print(f"Finished: {j.get('finished_at') or '-'}")
|
|
187
|
+
print(f"Retries: {j.get('retry_count', 0)}")
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def cmd_cancel(args):
    """Send a ``cancel`` request for ``args.job_id``; exit 1 if the daemon refuses."""
    _require_daemon()

    resp = client_request(SOCKET_PATH, "cancel", {"job_id": args.job_id})
    if not resp.get("ok"):
        print(f"Error: {resp.get('error', 'Unknown error')}", file=sys.stderr)
        sys.exit(1)

    print(f"Job {args.job_id} cancelled.")
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def cmd_logs(args):
    """Fetch and print the captured stdout/stderr of ``args.job_id``."""
    _require_daemon()

    resp = client_request(SOCKET_PATH, "logs", {"job_id": args.job_id})
    if not resp.get("ok"):
        print(f"Error: {resp.get('error', 'Unknown error')}", file=sys.stderr)
        sys.exit(1)

    data = resp["data"]
    stdout = data.get("stdout", "")
    stderr = data.get("stderr", "")

    if not (stdout or stderr):
        print("No logs available for this job.")
        return

    # Only non-empty streams get a section.
    for title, text in (("=== STDOUT ===", stdout), ("=== STDERR ===", stderr)):
        if text:
            print(title)
            print(text)
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
def cmd_purge(args):
    """Ask the daemon to delete finished jobs and print how many were removed."""
    _require_daemon()

    resp = client_request(SOCKET_PATH, "purge")
    if not resp.get("ok"):
        print(f"Error: {resp.get('error', 'Unknown error')}", file=sys.stderr)
        sys.exit(1)

    count = resp["data"]["purged"]
    print(f"Purged {count} completed/failed job(s).")
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
if __name__ == "__main__":
|
|
233
|
+
main()
|
|
@@ -0,0 +1,319 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import os
|
|
3
|
+
import select
|
|
4
|
+
import signal
|
|
5
|
+
import socket
|
|
6
|
+
import subprocess
|
|
7
|
+
import sys
|
|
8
|
+
import threading
|
|
9
|
+
import time
|
|
10
|
+
from typing import Dict, Optional
|
|
11
|
+
|
|
12
|
+
from .db import (
|
|
13
|
+
get_connection,
|
|
14
|
+
get_job,
|
|
15
|
+
get_running_jobs,
|
|
16
|
+
init_db,
|
|
17
|
+
insert_job,
|
|
18
|
+
list_jobs,
|
|
19
|
+
purge_completed,
|
|
20
|
+
update_job_status,
|
|
21
|
+
)
|
|
22
|
+
from .models import (
|
|
23
|
+
APP_DIR,
|
|
24
|
+
LOG_DIR,
|
|
25
|
+
PID_FILE,
|
|
26
|
+
SOCKET_PATH,
|
|
27
|
+
JobStatus,
|
|
28
|
+
Priority,
|
|
29
|
+
)
|
|
30
|
+
from .protocol import make_response, recv_message, send_message
|
|
31
|
+
from .scheduler import evict_job, pick_next_job, should_evict
|
|
32
|
+
|
|
33
|
+
logger = logging.getLogger("omi-watcher-daemon")
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class Daemon:
    """Job daemon: answers CLI requests on a Unix socket and supervises job subprocesses.

    A single-threaded loop alternates between serving at most one client
    request, reaping finished subprocesses, and launching (or evicting for)
    queued jobs. Job state is stored through the SQLite connection opened in
    ``start``; live ``Popen`` handles are kept in ``_processes`` by job id.
    """

    # Seconds a connected client may take to send its request / read the reply.
    _CLIENT_TIMEOUT = 5.0

    def __init__(self):
        self._running = False
        self._processes: Dict[str, subprocess.Popen] = {}  # job_id -> Popen
        self._conn = None
        self._server_sock = None

    def start(self):
        """Initialise directories, logging, PID file, DB and socket, then run the main loop."""
        os.makedirs(APP_DIR, exist_ok=True)
        os.makedirs(LOG_DIR, exist_ok=True)

        self._setup_logging()
        self._write_pid()
        self._conn = get_connection()
        init_db(self._conn)

        # Mark any previously "running" jobs as failed (stale from prior crash)
        for job in get_running_jobs(self._conn):
            update_job_status(self._conn, job.id, JobStatus.FAILED, exit_code=-1)

        self._setup_socket()
        self._running = True

        signal.signal(signal.SIGTERM, self._handle_signal)
        signal.signal(signal.SIGINT, self._handle_signal)

        logger.info("Daemon started (PID %d)", os.getpid())
        self._main_loop()

    def _setup_logging(self):
        """Send log records to ``daemon.log`` inside the application directory."""
        log_file = os.path.join(APP_DIR, "daemon.log")
        logging.basicConfig(
            filename=log_file,
            level=logging.INFO,
            format="%(asctime)s %(levelname)s %(message)s",
        )

    def _write_pid(self):
        """Record this process's PID in ``PID_FILE``."""
        with open(PID_FILE, "w") as f:
            f.write(str(os.getpid()))

    def _setup_socket(self):
        """Create the non-blocking listening Unix socket, replacing any stale socket file."""
        if os.path.exists(SOCKET_PATH):
            os.unlink(SOCKET_PATH)
        self._server_sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
        self._server_sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        self._server_sock.bind(SOCKET_PATH)
        self._server_sock.listen(5)
        self._server_sock.setblocking(False)

    def _handle_signal(self, signum, frame):
        """Signal handler: request a clean exit from the main loop."""
        logger.info("Received signal %d, shutting down", signum)
        self._running = False

    def _main_loop(self):
        """Serve clients, reap processes and schedule jobs until asked to stop.

        ``_shutdown`` always runs on the way out, including when an unexpected
        exception escapes the loop, so the socket and PID file are removed.
        """
        try:
            while self._running:
                client = None
                try:
                    readable, _, _ = select.select([self._server_sock], [], [], 0.5)
                    if readable:
                        try:
                            client, _ = self._server_sock.accept()
                        except (BlockingIOError, InterruptedError, ConnectionAbortedError):
                            # The peer went away between select() and accept();
                            # this is transient and must not stop the daemon.
                            client = None
                except OSError:
                    break

                if client is not None:
                    try:
                        # The listener is non-blocking and the accepted socket's
                        # mode is platform-dependent; an explicit timeout makes
                        # it blocking and bounds how long one client can stall
                        # the loop.
                        client.settimeout(self._CLIENT_TIMEOUT)
                        self._handle_client(client)
                    except Exception as e:
                        logger.error("Error handling client: %s", e)
                    finally:
                        client.close()

                # Reap completed processes
                self._reap_processes()

                # Schedule queued jobs
                self._schedule_jobs()
        except Exception:
            logger.exception("Unexpected error in main loop; shutting down")
            raise
        finally:
            self._shutdown()

    def _handle_client(self, client: socket.socket):
        """Read one request from ``client``, dispatch on its action, and send the response."""
        msg = recv_message(client)
        if not msg:
            return

        action = msg.get("action", "")
        payload = msg.get("payload") or {}  # tolerate an explicit null payload

        handler = {
            "submit": self._cmd_submit,
            "status": self._cmd_status,
            "cancel": self._cmd_cancel,
            "logs": self._cmd_logs,
            "purge": self._cmd_purge,
            "stop": self._cmd_stop,
        }.get(action)

        if handler:
            response = handler(payload)
        else:
            response = make_response(False, error=f"Unknown action: {action}")

        send_message(client, response)

    def _cmd_submit(self, payload: dict) -> dict:
        """Queue a new job from ``payload`` and try to schedule it immediately."""
        command = payload.get("command")
        priority_str = payload.get("priority", "medium")
        if not command:
            return make_response(False, error="No command provided")

        try:
            priority = Priority.from_str(priority_str)
        except (KeyError, ValueError, AttributeError):
            # AttributeError: the client sent a non-string priority.
            return make_response(False, error=f"Invalid priority: {priority_str}")

        job = insert_job(self._conn, command, priority)
        logger.info("Job %s submitted: %s (priority=%s)", job.id, command, priority.name)

        # Immediately try to schedule
        self._schedule_jobs()

        return make_response(True, data=job.to_dict())

    def _cmd_status(self, payload: dict) -> dict:
        """Return one job (when ``job_id`` is given) or the list of all jobs."""
        job_id = payload.get("job_id")
        if job_id:
            job = get_job(self._conn, job_id)
            if not job:
                return make_response(False, error=f"Job {job_id} not found")
            return make_response(True, data=job.to_dict())

        jobs = list_jobs(self._conn)
        return make_response(True, data=[j.to_dict() for j in jobs])

    def _cmd_cancel(self, payload: dict) -> dict:
        """Cancel a queued or running job, terminating its process if there is one."""
        job_id = payload.get("job_id")
        if not job_id:
            return make_response(False, error="No job_id provided")

        job = get_job(self._conn, job_id)
        if not job:
            return make_response(False, error=f"Job {job_id} not found")

        if job.status == JobStatus.RUNNING and job.pid:
            # The handle stays in _processes so the reaper still collects the
            # exit status; the reaper leaves the CANCELLED status untouched.
            proc = self._processes.get(job_id)
            if proc:
                self._terminate(proc)

        if job.status in (JobStatus.QUEUED, JobStatus.RUNNING):
            update_job_status(self._conn, job_id, JobStatus.CANCELLED)
            logger.info("Job %s cancelled", job_id)
            return make_response(True, data={"job_id": job_id, "status": "cancelled"})

        return make_response(False, error=f"Job {job_id} is {job.status.value}, cannot cancel")

    def _cmd_logs(self, payload: dict) -> dict:
        """Return the captured stdout/stderr text of a job (empty strings when absent)."""
        job_id = payload.get("job_id")
        if not job_id:
            return make_response(False, error="No job_id provided")

        # job_id becomes part of a file name; refuse anything that could
        # point outside LOG_DIR.
        if not isinstance(job_id, str) or os.path.basename(job_id) != job_id:
            return make_response(False, error=f"Invalid job_id: {job_id}")

        result = {
            "stdout": self._read_log(os.path.join(LOG_DIR, f"{job_id}.out")),
            "stderr": self._read_log(os.path.join(LOG_DIR, f"{job_id}.err")),
        }
        return make_response(True, data=result)

    @staticmethod
    def _read_log(path: str) -> str:
        """Return the text of ``path``, or ``""`` if it does not exist.

        Undecodable bytes are replaced so binary job output cannot make the
        request fail.
        """
        try:
            with open(path, "r", errors="replace") as f:
                return f.read()
        except FileNotFoundError:
            return ""

    def _cmd_purge(self, payload: dict) -> dict:
        """Delete finished jobs from the database and report how many were removed."""
        count = purge_completed(self._conn)
        return make_response(True, data={"purged": count})

    def _cmd_stop(self, payload: dict) -> dict:
        """Ask the main loop to exit after the current iteration."""
        self._running = False
        return make_response(True, data={"message": "Daemon stopping"})

    @staticmethod
    def _terminate(proc: subprocess.Popen) -> None:
        """Send SIGTERM to a job's whole process group; no-op if it already exited.

        Jobs are started with ``start_new_session=True``, so the shell's PID is
        also its process-group id. Signalling the group reaches the commands
        the shell spawned, not just the shell itself.
        """
        if proc.poll() is not None:
            return
        try:
            os.killpg(proc.pid, signal.SIGTERM)
        except OSError:
            # Fall back to signalling the shell alone.
            try:
                proc.terminate()
            except ProcessLookupError:
                pass

    def _reap_processes(self):
        """Collect exited subprocesses and record their final status."""
        finished = []
        for job_id, proc in self._processes.items():
            ret = proc.poll()
            if ret is not None:
                finished.append((job_id, ret))

        for job_id, ret in finished:
            del self._processes[job_id]

            # A job cancelled while its process was still exiting must keep
            # its CANCELLED status instead of being overwritten as FAILED.
            job = get_job(self._conn, job_id)
            if job is None or job.status != JobStatus.RUNNING:
                logger.info("Job %s exited with code %d; recorded status left unchanged", job_id, ret)
                continue

            status = JobStatus.COMPLETED if ret == 0 else JobStatus.FAILED
            update_job_status(self._conn, job_id, status, exit_code=ret)
            logger.info("Job %s finished with exit code %d (%s)", job_id, ret, status.value)

    def _schedule_jobs(self):
        """Launch queued jobs while slots are free; otherwise evict for higher priority.

        When every slot is taken, ``should_evict`` decides whether a running
        job gives way to the next queued one; scheduling stops as soon as it
        names no victim.
        """
        from .models import MAX_CONCURRENT_JOBS

        running_jobs = get_running_jobs(self._conn)
        running_count = len(running_jobs)

        while True:
            next_job = pick_next_job(self._conn)
            if not next_job:
                break

            if running_count < MAX_CONCURRENT_JOBS:
                # Only a successful launch occupies a slot.
                if self._launch_job(next_job):
                    running_count += 1
                continue

            victim = should_evict(next_job, running_jobs)
            if not victim:
                break

            logger.info("Evicting job %s (pri=%s) for job %s (pri=%s)",
                        victim.id, victim.priority.name, next_job.id, next_job.priority.name)
            proc = self._processes.pop(victim.id, None)
            if proc:
                self._terminate(proc)
            evict_job(victim, self._conn)

            if not self._launch_job(next_job):
                # The victim's slot was freed but nothing took it.
                running_count -= 1

            # Refresh running jobs list after eviction
            running_jobs = get_running_jobs(self._conn)

    def _launch_job(self, job):
        """Start ``job.command`` in a shell with output redirected to its log files.

        Returns:
            True if the process was started, False if launching failed (the
            job is then marked FAILED).
        """
        stdout_path = os.path.join(LOG_DIR, f"{job.id}.out")
        stderr_path = os.path.join(LOG_DIR, f"{job.id}.err")

        try:
            # The child receives its own copies of the descriptors, so the
            # daemon's handles are closed as soon as Popen returns.
            with open(stdout_path, "w") as stdout_f, open(stderr_path, "w") as stderr_f:
                proc = subprocess.Popen(
                    job.command,
                    shell=True,
                    stdout=stdout_f,
                    stderr=stderr_f,
                    start_new_session=True,
                )
        except Exception as e:
            update_job_status(self._conn, job.id, JobStatus.FAILED)
            logger.error("Failed to launch job %s: %s", job.id, e)
            return False

        self._processes[job.id] = proc
        update_job_status(self._conn, job.id, JobStatus.RUNNING, pid=proc.pid)
        logger.info("Launched job %s (PID %d): %s", job.id, proc.pid, job.command)
        return True

    def _shutdown(self):
        """Terminate remaining jobs and remove the socket, PID file and DB connection."""
        logger.info("Shutting down daemon")

        # Terminate all running jobs
        for job_id, proc in self._processes.items():
            self._terminate(proc)
            # Leave already-cancelled jobs as CANCELLED.
            job = get_job(self._conn, job_id)
            if job is not None and job.status == JobStatus.RUNNING:
                update_job_status(self._conn, job_id, JobStatus.FAILED, exit_code=-15)

        self._processes.clear()

        if self._server_sock:
            self._server_sock.close()
        if os.path.exists(SOCKET_PATH):
            os.unlink(SOCKET_PATH)
        if os.path.exists(PID_FILE):
            os.unlink(PID_FILE)
        if self._conn:
            self._conn.close()

        logger.info("Daemon stopped")
|
|
314
|
+
|
|
315
|
+
|
|
316
|
+
def run_daemon():
    """Create a ``Daemon`` and run it; intended to be called once the process is detached."""
    Daemon().start()
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import sqlite3
|
|
3
|
+
from datetime import datetime, timezone
|
|
4
|
+
from typing import List, Optional
|
|
5
|
+
|
|
6
|
+
from .models import APP_DIR, DB_PATH, Job, JobStatus, Priority
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def _ensure_dir():
    """Create the application directory if it does not exist yet."""
    os.makedirs(APP_DIR, exist_ok=True)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def get_connection() -> sqlite3.Connection:
    """Open the jobs database at ``DB_PATH`` with name-addressable rows and WAL journaling."""
    _ensure_dir()
    connection = sqlite3.connect(DB_PATH)
    connection.row_factory = sqlite3.Row  # rows are indexable by column name
    connection.execute("PRAGMA journal_mode=WAL")
    return connection
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def init_db(conn: sqlite3.Connection):
    """Create the ``jobs`` table if it is missing and commit."""
    schema = """
        CREATE TABLE IF NOT EXISTS jobs (
            id TEXT PRIMARY KEY,
            command TEXT NOT NULL,
            priority INTEGER NOT NULL,
            status TEXT NOT NULL DEFAULT 'queued',
            pid INTEGER,
            created_at TEXT NOT NULL,
            started_at TEXT,
            finished_at TEXT,
            exit_code INTEGER,
            retry_count INTEGER NOT NULL DEFAULT 0
        )
    """
    conn.execute(schema)
    conn.commit()
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _now() -> str:
|
|
40
|
+
return datetime.now(timezone.utc).isoformat()
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _row_to_job(row: sqlite3.Row) -> Job:
    """Convert a ``jobs`` table row into a ``Job``, decoding priority and status into enums."""
    # Columns copied into the Job unchanged.
    plain_columns = (
        "id",
        "command",
        "pid",
        "created_at",
        "started_at",
        "finished_at",
        "exit_code",
        "retry_count",
    )
    values = {name: row[name] for name in plain_columns}
    values["priority"] = Priority(row["priority"])
    values["status"] = JobStatus(row["status"])
    return Job(**values)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def insert_job(conn: sqlite3.Connection, command: str, priority: Priority) -> Job:
    """Insert a new queued job and return it.

    Job ids are the first 8 hex digits of a random UUID, so two jobs can
    occasionally draw the same id. A primary-key clash is retried with a
    fresh id a few times before the error is re-raised.

    Raises:
        sqlite3.IntegrityError: if the row still cannot be inserted after
            the retries (including constraint failures unrelated to the id).
    """
    import uuid

    now = _now()
    last_error = None
    for _ in range(5):
        job_id = uuid.uuid4().hex[:8]
        try:
            conn.execute(
                "INSERT INTO jobs (id, command, priority, status, created_at, retry_count) VALUES (?, ?, ?, ?, ?, ?)",
                (job_id, command, priority.value, JobStatus.QUEUED.value, now, 0),
            )
        except sqlite3.IntegrityError as exc:
            conn.rollback()
            last_error = exc
            continue
        conn.commit()
        return Job(id=job_id, command=command, priority=priority, status=JobStatus.QUEUED, created_at=now)

    raise last_error
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def requeue_job(conn: sqlite3.Connection, job: Job):
    """Put ``job`` back in the queue: clear its run fields and bump its retry count."""
    next_retry = job.retry_count + 1
    sql = "UPDATE jobs SET status = ?, pid = NULL, started_at = NULL, finished_at = NULL, exit_code = NULL, retry_count = ? WHERE id = ?"
    conn.execute(sql, (JobStatus.QUEUED.value, next_retry, job.id))
    conn.commit()
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def update_job_status(conn: sqlite3.Connection, job_id: str, status: JobStatus,
                      pid: Optional[int] = None, exit_code: Optional[int] = None):
    """Set a job's status and the timestamp/fields that go with that status.

    RUNNING stores ``pid`` and ``started_at``; the finished states
    (completed, failed, cancelled, evicted) store ``finished_at`` and
    ``exit_code``; any other status updates the status column only.
    """
    now = _now()
    finished_states = (
        JobStatus.COMPLETED,
        JobStatus.FAILED,
        JobStatus.CANCELLED,
        JobStatus.EVICTED,
    )

    if status == JobStatus.RUNNING:
        sql = "UPDATE jobs SET status = ?, pid = ?, started_at = ? WHERE id = ?"
        params = (status.value, pid, now, job_id)
    elif status in finished_states:
        sql = "UPDATE jobs SET status = ?, finished_at = ?, exit_code = ? WHERE id = ?"
        params = (status.value, now, exit_code, job_id)
    else:
        sql = "UPDATE jobs SET status = ? WHERE id = ?"
        params = (status.value, job_id)

    conn.execute(sql, params)
    conn.commit()
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def get_job(conn: sqlite3.Connection, job_id: str) -> Optional[Job]:
    """Return the job with ``job_id``, or ``None`` if there is no such row."""
    cursor = conn.execute("SELECT * FROM jobs WHERE id = ?", (job_id,))
    row = cursor.fetchone()
    if not row:
        return None
    return _row_to_job(row)
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def list_jobs(conn: sqlite3.Connection, status: Optional[JobStatus] = None) -> List[Job]:
    """List jobs, optionally restricted to one status.

    Without a status, all jobs are returned newest first. With a status,
    matching jobs are ordered by descending priority, then oldest first.
    """
    if not status:
        cursor = conn.execute("SELECT * FROM jobs ORDER BY created_at DESC")
    else:
        cursor = conn.execute(
            "SELECT * FROM jobs WHERE status = ? ORDER BY priority DESC, created_at ASC",
            (status.value,),
        )
    return [_row_to_job(record) for record in cursor.fetchall()]
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def get_queued_jobs(conn: sqlite3.Connection) -> List[Job]:
    """Return queued jobs, highest priority first and oldest first within a priority."""
    query = "SELECT * FROM jobs WHERE status = ? ORDER BY priority DESC, created_at ASC"
    records = conn.execute(query, (JobStatus.QUEUED.value,)).fetchall()
    return [_row_to_job(record) for record in records]
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def get_running_jobs(conn: sqlite3.Connection) -> List[Job]:
    """Return running jobs, lowest priority first and most recently started first within a priority."""
    query = "SELECT * FROM jobs WHERE status = ? ORDER BY priority ASC, started_at DESC"
    records = conn.execute(query, (JobStatus.RUNNING.value,)).fetchall()
    return [_row_to_job(record) for record in records]
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def purge_completed(conn: sqlite3.Connection) -> int:
    """Delete all completed and failed jobs; return the number of rows removed."""
    purgeable = (JobStatus.COMPLETED.value, JobStatus.FAILED.value)
    cursor = conn.execute("DELETE FROM jobs WHERE status IN (?, ?)", purgeable)
    conn.commit()
    return cursor.rowcount
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
import enum
|
|
2
|
+
import os
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from typing import Optional
|
|
5
|
+
|
|
6
|
+
# Constants
|
|
7
|
+
APP_DIR = os.path.join(os.path.expanduser("~"), ".omi-watcher")
|
|
8
|
+
DB_PATH = os.path.join(APP_DIR, "omi-watcher.db")
|
|
9
|
+
SOCKET_PATH = os.path.join(APP_DIR, "omi-watcher.sock")
|
|
10
|
+
PID_FILE = os.path.join(APP_DIR, "omi-watcher.pid")
|
|
11
|
+
LOG_DIR = os.path.join(APP_DIR, "logs")
|
|
12
|
+
MAX_CONCURRENT_JOBS = 10
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class Priority(enum.IntEnum):
    """Job priority; a larger value means a more important job."""

    LOW = 0
    MEDIUM = 1
    HIGH = 2
    CRITICAL = 3

    @classmethod
    def from_str(cls, s: str) -> "Priority":
        """Look up a priority by name, ignoring case and surrounding whitespace.

        Raises:
            ValueError: if ``s`` is not a string (for example ``None`` or a
                number taken from a decoded request).
            KeyError: if ``s`` does not name a priority.
        """
        # Without this check a non-string fails with AttributeError on
        # .upper(), which callers catching KeyError/ValueError would miss.
        if not isinstance(s, str):
            raise ValueError(f"priority must be a string, got {type(s).__name__}")
        return cls[s.strip().upper()]
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class JobStatus(str, enum.Enum):
    """Lifecycle state of a job; each member's value is the lowercase string form."""

    # Not finished yet
    QUEUED = "queued"
    RUNNING = "running"

    # No longer running
    COMPLETED = "completed"
    FAILED = "failed"
    CANCELLED = "cancelled"
    EVICTED = "evicted"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dataclass
class Job:
    """A job record: the command to run plus its scheduling and run state."""

    id: str
    command: str
    priority: Priority
    status: JobStatus = JobStatus.QUEUED
    pid: Optional[int] = None
    created_at: Optional[str] = None
    started_at: Optional[str] = None
    finished_at: Optional[str] = None
    exit_code: Optional[int] = None
    retry_count: int = 0

    def to_dict(self) -> dict:
        """Return a plain dict; priority becomes its lowercase name, status its string value."""
        encoded = {
            "id": self.id,
            "command": self.command,
            "priority": self.priority.name.lower(),
            "status": self.status.value,
        }
        # The remaining fields are copied through unchanged.
        for name in ("pid", "created_at", "started_at", "finished_at", "exit_code", "retry_count"):
            encoded[name] = getattr(self, name)
        return encoded

    @classmethod
    def from_dict(cls, d: dict) -> "Job":
        """Build a ``Job`` from a dict; ``priority`` may be a name string or a numeric value."""
        raw_priority = d["priority"]
        if isinstance(raw_priority, str):
            priority = Priority[raw_priority.upper()]
        else:
            priority = Priority(raw_priority)

        return cls(
            id=d["id"],
            command=d["command"],
            priority=priority,
            status=JobStatus(d["status"]),
            pid=d.get("pid"),
            created_at=d.get("created_at"),
            started_at=d.get("started_at"),
            finished_at=d.get("finished_at"),
            exit_code=d.get("exit_code"),
            retry_count=d.get("retry_count", 0),
        )
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import socket
|
|
3
|
+
import struct
|
|
4
|
+
from typing import Any, Dict, Optional
|
|
5
|
+
|
|
6
|
+
# Each message is preceded by its payload length, packed as a 4-byte
# unsigned int in network byte order.
HEADER_FORMAT = "!I"
HEADER_SIZE = struct.Struct(HEADER_FORMAT).size
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def make_request(action: str, payload: Optional[Dict] = None) -> Dict:
    """Build a request envelope with ``action`` and ``payload`` keys.

    A missing or empty ``payload`` is replaced by a new empty dict.
    """
    body = payload if payload else {}
    return dict(action=action, payload=body)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def make_response(ok: bool, data: Any = None, error: Optional[str] = None) -> Dict:
    """Build a response envelope with ``ok``, ``data`` and ``error`` keys."""
    return dict(ok=ok, data=data, error=error)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def send_message(sock: socket.socket, msg: Dict):
    """Send ``msg`` over ``sock`` as a length-prefixed JSON frame.

    The message is serialised to JSON, encoded as UTF-8, and preceded by
    its byte length packed with ``HEADER_FORMAT``.
    """
    body = json.dumps(msg).encode("utf-8")
    frame = b"".join((struct.pack(HEADER_FORMAT, len(body)), body))
    sock.sendall(frame)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def recv_message(sock: socket.socket) -> Optional[Dict]:
    """Read one length-prefixed JSON frame from ``sock``.

    Returns:
        The decoded message, or ``None`` when the header or the body could
        not be read in full (a zero-length body also yields ``None``).
    """
    raw_header = _recv_exact(sock, HEADER_SIZE)
    if not raw_header:
        return None
    body_length = struct.unpack(HEADER_FORMAT, raw_header)[0]
    body = _recv_exact(sock, body_length)
    if body:
        return json.loads(body.decode("utf-8"))
    return None
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _recv_exact(sock: socket.socket, n: int) -> Optional[bytes]:
|
|
36
|
+
buf = bytearray()
|
|
37
|
+
while len(buf) < n:
|
|
38
|
+
chunk = sock.recv(n - len(buf))
|
|
39
|
+
if not chunk:
|
|
40
|
+
return None
|
|
41
|
+
buf.extend(chunk)
|
|
42
|
+
return bytes(buf)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def client_request(socket_path: str, action: str, payload: Optional[Dict] = None) -> Dict:
    """Send one request to the daemon and return its response.

    Opens a Unix stream socket to ``socket_path``, sends the request, reads
    a single reply, and closes the socket whether or not an error occurred.

    Returns:
        The daemon's response, or a failed response with the error
        "No response from daemon" when no reply could be read.
    """
    with socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) as conn:
        conn.connect(socket_path)
        send_message(conn, make_request(action, payload))
        reply = recv_message(conn)
        if reply:
            return reply
        return make_response(False, error="No response from daemon")
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import signal
|
|
3
|
+
import sqlite3
|
|
4
|
+
from typing import Optional
|
|
5
|
+
|
|
6
|
+
from .db import get_queued_jobs, get_running_jobs, requeue_job, update_job_status
|
|
7
|
+
from .models import Job, JobStatus, MAX_CONCURRENT_JOBS
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def pick_next_job(conn: sqlite3.Connection) -> Optional[Job]:
    """Return the first job from ``get_queued_jobs``, or None if there is none."""
    queued = get_queued_jobs(conn)
    if not queued:
        return None
    return queued[0]
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def should_evict(new_job: Job, running_jobs: list[Job]) -> Optional[Job]:
    """Choose a running job to evict in favour of ``new_job``, if any.

    Eviction is only considered when at least ``MAX_CONCURRENT_JOBS`` jobs
    are running. The candidate is the lowest-priority running job, and it
    is returned only when ``new_job`` has a strictly higher priority.

    Args:
        new_job: The job that wants a slot.
        running_jobs: The currently running jobs, in any order.

    Returns:
        The job to evict, or ``None`` when a slot is free or no running job
        has a lower priority than ``new_job``.
    """
    if not running_jobs or len(running_jobs) < MAX_CONCURRENT_JOBS:
        return None
    # min() returns the first minimal element. For a list ordered by
    # priority ASC, started_at DESC (what get_running_jobs produces) that is
    # running_jobs[0], i.e. the most recently started lowest-priority job.
    # For any other ordering it still finds a lowest-priority job instead
    # of trusting the caller to have sorted the list.
    lowest = min(running_jobs, key=lambda job: job.priority)
    if new_job.priority > lowest.priority:
        return lowest
    return None
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def evict_job(job: Job, conn: sqlite3.Connection) -> bool:
    """Terminate a running job, mark it evicted, then re-queue it.

    SIGTERM is sent to ``job.pid`` when a pid is set; a process that no
    longer exists is ignored. The job's status is then set to ``EVICTED``
    and the job is handed to ``requeue_job``.

    Returns:
        Always ``True``.
    """
    target_pid = job.pid
    if target_pid:
        try:
            os.kill(target_pid, signal.SIGTERM)
        except ProcessLookupError:
            # The process has already exited; nothing left to signal.
            pass
    update_job_status(conn, job.id, JobStatus.EVICTED)
    requeue_job(conn, job)
    return True
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def get_slot_count(conn: sqlite3.Connection) -> int:
    """Return how many job slots are free.

    Computed as ``MAX_CONCURRENT_JOBS`` minus the number of running jobs,
    so the result is negative if more jobs than the limit are running.
    """
    running_count = len(get_running_jobs(conn))
    free_slots = MAX_CONCURRENT_JOBS - running_count
    return free_slots
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
src/omi_watcher/__init__.py
|
|
4
|
+
src/omi_watcher/cli.py
|
|
5
|
+
src/omi_watcher/daemon.py
|
|
6
|
+
src/omi_watcher/db.py
|
|
7
|
+
src/omi_watcher/models.py
|
|
8
|
+
src/omi_watcher/protocol.py
|
|
9
|
+
src/omi_watcher/scheduler.py
|
|
10
|
+
src/omi_watcher.egg-info/PKG-INFO
|
|
11
|
+
src/omi_watcher.egg-info/SOURCES.txt
|
|
12
|
+
src/omi_watcher.egg-info/dependency_links.txt
|
|
13
|
+
src/omi_watcher.egg-info/entry_points.txt
|
|
14
|
+
src/omi_watcher.egg-info/top_level.txt
|
|
15
|
+
tests/test_db.py
|
|
16
|
+
tests/test_integration.py
|
|
17
|
+
tests/test_scheduler.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
omi_watcher
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import sqlite3
|
|
3
|
+
import tempfile
|
|
4
|
+
import unittest
|
|
5
|
+
|
|
6
|
+
from omi_watcher.db import (
|
|
7
|
+
get_connection,
|
|
8
|
+
get_job,
|
|
9
|
+
get_queued_jobs,
|
|
10
|
+
get_running_jobs,
|
|
11
|
+
init_db,
|
|
12
|
+
insert_job,
|
|
13
|
+
list_jobs,
|
|
14
|
+
purge_completed,
|
|
15
|
+
requeue_job,
|
|
16
|
+
update_job_status,
|
|
17
|
+
)
|
|
18
|
+
from omi_watcher.models import JobStatus, Priority
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class TestDB(unittest.TestCase):
    """Exercise the db helper functions against an in-memory SQLite database."""

    def setUp(self):
        # Every test gets its own fresh in-memory database.
        self.conn = sqlite3.connect(":memory:")
        self.conn.row_factory = sqlite3.Row
        init_db(self.conn)

    def tearDown(self):
        self.conn.close()

    def test_insert_and_get(self):
        """An inserted job has the given fields and can be fetched by id."""
        created = insert_job(self.conn, "echo hello", Priority.HIGH)
        self.assertIsNotNone(created.id)
        self.assertEqual(created.command, "echo hello")
        self.assertEqual(created.priority, Priority.HIGH)
        self.assertEqual(created.status, JobStatus.QUEUED)

        loaded = get_job(self.conn, created.id)
        self.assertIsNotNone(loaded)
        self.assertEqual(loaded.command, "echo hello")

    def test_list_jobs(self):
        """list_jobs returns every inserted job."""
        for command, priority in (("cmd1", Priority.LOW), ("cmd2", Priority.HIGH)):
            insert_job(self.conn, command, priority)
        self.assertEqual(len(list_jobs(self.conn)), 2)

    def test_update_status(self):
        """update_job_status stores both the new status and the pid."""
        created = insert_job(self.conn, "echo test", Priority.MEDIUM)
        update_job_status(self.conn, created.id, JobStatus.RUNNING, pid=1234)
        loaded = get_job(self.conn, created.id)
        self.assertEqual(loaded.status, JobStatus.RUNNING)
        self.assertEqual(loaded.pid, 1234)

    def test_queued_jobs_ordered_by_priority(self):
        """Queued jobs come back highest priority first."""
        for command, priority in (
            ("low", Priority.LOW),
            ("critical", Priority.CRITICAL),
            ("medium", Priority.MEDIUM),
        ):
            insert_job(self.conn, command, priority)
        ordered = get_queued_jobs(self.conn)
        self.assertEqual(ordered[0].command, "critical")
        self.assertEqual(ordered[1].command, "medium")
        self.assertEqual(ordered[2].command, "low")

    def test_purge_completed(self):
        """Purging removes completed and failed jobs but keeps running ones."""
        done = insert_job(self.conn, "done", Priority.LOW)
        failed = insert_job(self.conn, "fail", Priority.LOW)
        active = insert_job(self.conn, "still running", Priority.LOW)
        update_job_status(self.conn, done.id, JobStatus.COMPLETED)
        update_job_status(self.conn, failed.id, JobStatus.FAILED)
        update_job_status(self.conn, active.id, JobStatus.RUNNING, pid=999)
        removed = purge_completed(self.conn)
        self.assertEqual(removed, 2)
        self.assertIsNone(get_job(self.conn, done.id))
        self.assertIsNotNone(get_job(self.conn, active.id))

    def test_requeue_job(self):
        """Re-queueing resets status and pid and bumps the retry counter."""
        created = insert_job(self.conn, "requeue me", Priority.HIGH)
        update_job_status(self.conn, created.id, JobStatus.RUNNING, pid=100)
        running = get_job(self.conn, created.id)
        requeue_job(self.conn, running)
        loaded = get_job(self.conn, created.id)
        self.assertEqual(loaded.status, JobStatus.QUEUED)
        self.assertEqual(loaded.retry_count, 1)
        self.assertIsNone(loaded.pid)
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
if __name__ == "__main__":
|
|
86
|
+
unittest.main()
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import subprocess
|
|
3
|
+
import sys
|
|
4
|
+
import time
|
|
5
|
+
import unittest
|
|
6
|
+
|
|
7
|
+
from omi_watcher.models import APP_DIR, PID_FILE, SOCKET_PATH
|
|
8
|
+
from omi_watcher.protocol import client_request
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class TestIntegration(unittest.TestCase):
    """Integration tests that start a real daemon and interact with it.

    The test methods are numbered because unittest runs them in name order
    and later tests reuse the job id that ``test_01_submit_job`` stores on
    the class.
    """

    @classmethod
    def setUpClass(cls):
        # Start daemon via subprocess (fork-based, parent returns quickly).
        # Output is sent to DEVNULL: nothing here reads it, and unread PIPE
        # handles would stay open in this process for the whole run.
        subprocess.run(
            [sys.executable, "-m", "omi_watcher.cli", "start"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            timeout=10,
        )
        # Wait (up to ~5 seconds) for the daemon's socket to appear.
        for _ in range(20):
            if os.path.exists(SOCKET_PATH):
                break
            time.sleep(0.25)
        else:
            # Fail here with a clear message instead of letting every test
            # fail later with a connection error.
            raise RuntimeError(
                f"daemon did not create its socket at {SOCKET_PATH}"
            )

    @classmethod
    def tearDownClass(cls):
        # Ask the daemon to stop. This is best effort: the daemon may
        # already be gone, in which case the request simply fails.
        try:
            client_request(SOCKET_PATH, "stop")
            time.sleep(1)
        except Exception:
            pass
        # If a PID file is still present, force-kill the process it names.
        try:
            with open(PID_FILE) as f:
                pid = int(f.read().strip())
        except (FileNotFoundError, ValueError):
            # No PID file left, or it does not contain a number.
            return
        try:
            os.kill(pid, 9)  # 9 is SIGKILL
        except ProcessLookupError:
            pass

    def test_01_submit_job(self):
        """Submitting a job succeeds and returns an id, kept for later tests."""
        resp = client_request(SOCKET_PATH, "submit", {
            "command": "echo hello world",
            "priority": "medium",
        })
        self.assertTrue(resp["ok"])
        self.assertIn("id", resp["data"])
        self.__class__._job_id = resp["data"]["id"]

    def test_02_status_all(self):
        """A status request without a job id returns a non-empty list."""
        time.sleep(1)  # Let job complete
        resp = client_request(SOCKET_PATH, "status")
        self.assertTrue(resp["ok"])
        self.assertIsInstance(resp["data"], list)
        self.assertGreater(len(resp["data"]), 0)

    def test_03_status_single(self):
        """A status request for one job id returns that job."""
        job_id = getattr(self.__class__, "_job_id", None)
        if not job_id:
            self.skipTest("No job ID from previous test")
        time.sleep(1)
        resp = client_request(SOCKET_PATH, "status", {"job_id": job_id})
        self.assertTrue(resp["ok"])
        self.assertEqual(resp["data"]["id"], job_id)

    def test_04_logs(self):
        """A logs request for a known job id succeeds."""
        job_id = getattr(self.__class__, "_job_id", None)
        if not job_id:
            self.skipTest("No job ID from previous test")
        time.sleep(1)
        resp = client_request(SOCKET_PATH, "logs", {"job_id": job_id})
        self.assertTrue(resp["ok"])

    def test_05_submit_with_priority(self):
        """The submitted priority is echoed back in the response."""
        resp = client_request(SOCKET_PATH, "submit", {
            "command": "sleep 0.1",
            "priority": "critical",
        })
        self.assertTrue(resp["ok"])
        self.assertEqual(resp["data"]["priority"], "critical")

    def test_06_cancel_queued_job(self):
        """A long-running job can be cancelled shortly after submission."""
        resp = client_request(SOCKET_PATH, "submit", {
            "command": "sleep 3600",
            "priority": "low",
        })
        self.assertTrue(resp["ok"])
        job_id = resp["data"]["id"]
        time.sleep(0.5)
        resp = client_request(SOCKET_PATH, "cancel", {"job_id": job_id})
        self.assertTrue(resp["ok"])

    def test_07_purge(self):
        """A purge request succeeds and reports a ``purged`` entry."""
        time.sleep(2)  # Let jobs finish
        resp = client_request(SOCKET_PATH, "purge")
        self.assertTrue(resp["ok"])
        self.assertIn("purged", resp["data"])
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
if __name__ == "__main__":
|
|
105
|
+
unittest.main()
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
import sqlite3
|
|
2
|
+
import unittest
|
|
3
|
+
|
|
4
|
+
from omi_watcher.db import (
|
|
5
|
+
get_queued_jobs,
|
|
6
|
+
get_running_jobs,
|
|
7
|
+
init_db,
|
|
8
|
+
insert_job,
|
|
9
|
+
update_job_status,
|
|
10
|
+
)
|
|
11
|
+
from omi_watcher.models import Job, JobStatus, Priority
|
|
12
|
+
from omi_watcher.scheduler import pick_next_job, should_evict
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class TestScheduler(unittest.TestCase):
    """Unit tests for pick_next_job and should_evict on an in-memory database."""

    def setUp(self):
        self.conn = sqlite3.connect(":memory:")
        self.conn.row_factory = sqlite3.Row
        init_db(self.conn)

    def tearDown(self):
        self.conn.close()

    def _start_jobs(self, count, priority):
        """Insert ``count`` jobs at ``priority``, mark them running, and
        return the result of get_running_jobs."""
        for i in range(count):
            j = insert_job(self.conn, f"running {i}", priority)
            update_job_status(self.conn, j.id, JobStatus.RUNNING, pid=1000 + i)
        return get_running_jobs(self.conn)

    def test_pick_next_job_highest_priority(self):
        """The highest-priority queued job is picked first."""
        insert_job(self.conn, "low cmd", Priority.LOW)
        insert_job(self.conn, "high cmd", Priority.HIGH)
        insert_job(self.conn, "medium cmd", Priority.MEDIUM)
        job = pick_next_job(self.conn)
        self.assertEqual(job.command, "high cmd")

    def test_pick_next_job_none_when_empty(self):
        """With nothing queued, pick_next_job returns None."""
        job = pick_next_job(self.conn)
        self.assertIsNone(job)

    def test_should_evict_when_slots_full(self):
        """A critical job evicts a low-priority job when 10 jobs are running."""
        running_jobs = self._start_jobs(10, Priority.LOW)
        new_job = Job(id="new", command="important", priority=Priority.CRITICAL)
        victim = should_evict(new_job, running_jobs)
        self.assertIsNotNone(victim)
        self.assertEqual(victim.priority, Priority.LOW)

    def test_no_evict_when_slots_available(self):
        """Nothing is evicted while only 5 jobs are running."""
        running_jobs = self._start_jobs(5, Priority.LOW)
        new_job = Job(id="new", command="important", priority=Priority.CRITICAL)
        victim = should_evict(new_job, running_jobs)
        self.assertIsNone(victim)

    def test_no_evict_when_same_priority(self):
        """A job does not evict running jobs of equal priority."""
        running_jobs = self._start_jobs(10, Priority.MEDIUM)
        new_job = Job(id="new", command="same pri", priority=Priority.MEDIUM)
        victim = should_evict(new_job, running_jobs)
        self.assertIsNone(victim)

    def test_no_evict_when_lower_priority(self):
        """A low-priority job does not evict higher-priority running jobs."""
        running_jobs = self._start_jobs(10, Priority.HIGH)
        new_job = Job(id="new", command="low pri", priority=Priority.LOW)
        victim = should_evict(new_job, running_jobs)
        self.assertIsNone(victim)
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
if __name__ == "__main__":
|
|
82
|
+
unittest.main()
|