@smilintux/skcapstone 0.10.0 → 0.12.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +10 -4
- package/.github/workflows/ci.yml +2 -2
- package/.github/workflows/publish.yml +9 -2
- package/.openclaw-workspace.json +2 -2
- package/CLAUDE.md +37 -0
- package/MISSION.md +17 -2
- package/README.md +282 -3
- package/docker/Dockerfile +7 -7
- package/docker/compose-templates/dev-team.yml +12 -12
- package/docker/compose-templates/mini-team.yml +9 -9
- package/docker/compose-templates/ops-team.yml +10 -10
- package/docker/compose-templates/research-team.yml +10 -10
- package/docker/entrypoint.sh +4 -4
- package/docs/ADR-optional-integration-backbone.md +181 -0
- package/docs/ARCHITECTURE.md +186 -43
- package/docs/BOND_WITH_GROK.md +6 -6
- package/docs/CUSTOM_AGENT.md +123 -30
- package/docs/DREAMING.md +70 -0
- package/docs/GETTING_STARTED.md +7 -7
- package/docs/QUICKSTART.md +10 -6
- package/docs/SKJOULE_ARCHITECTURE.md +3 -3
- package/docs/SOUL_SWAPPER.md +5 -5
- package/docs/hammertime-audit.md +402 -0
- package/docs/sk-integration-HANDOFF.md +117 -0
- package/docs/skscheduler.md +155 -0
- package/docs/superpowers/examples/jobs.yaml +31 -0
- package/docs/superpowers/plans/2026-06-08-skscheduler.md +1265 -0
- package/docs/superpowers/specs/2026-06-08-skscheduler-design.md +186 -0
- package/examples/custom-bond-template.json +1 -1
- package/examples/grok-feb.json +1 -1
- package/examples/queen-ava-feb.json +1 -1
- package/launchd/{com.skcapstone.skcomm-heartbeat.plist → com.skcapstone.skcomms-heartbeat.plist} +4 -4
- package/launchd/{com.skcapstone.skcomm-queue-drain.plist → com.skcapstone.skcomms-queue-drain.plist} +4 -4
- package/launchd/install-launchd.sh +6 -6
- package/{openclaw-plugin → openclaw-plugin.archived-2026-04-23}/src/index.ts +3 -2
- package/package.json +1 -1
- package/pyproject.toml +16 -10
- package/scripts/archive-sessions.sh +7 -0
- package/scripts/check-updates.py +4 -4
- package/scripts/install-bundle.sh +8 -8
- package/scripts/install.ps1 +12 -11
- package/scripts/install.sh +159 -5
- package/scripts/model-fallback-monitor.sh +102 -0
- package/scripts/nvidia-proxy.mjs +78 -26
- package/scripts/refresh-anthropic-token.sh +172 -0
- package/scripts/release.sh +98 -0
- package/scripts/session-to-memory.py +219 -0
- package/scripts/skgateway.mjs +3 -3
- package/scripts/telegram-catchup-all.sh +12 -1
- package/scripts/verify_install.sh +2 -2
- package/scripts/wargov-ufo-capture/README.md +43 -0
- package/scripts/wargov-ufo-capture/cdp_capture_release2.py +273 -0
- package/scripts/wargov-ufo-capture/cdp_capture_splc_doj.py +246 -0
- package/scripts/wargov-ufo-capture/cdp_finish.py +271 -0
- package/scripts/wargov-ufo-capture/cdp_probe.py +188 -0
- package/scripts/wargov-ufo-capture/cdp_splc_pressrelease.py +101 -0
- package/scripts/wargov-ufo-capture/parse_csv.py +95 -0
- package/scripts/wargov-ufo-capture/pull_dvids.sh +107 -0
- package/scripts/watch-anthropic-token.sh +212 -0
- package/scripts/windows/install-tasks.ps1 +7 -7
- package/scripts/windows/skcapstone-task.xml +1 -1
- package/src/skcapstone/__init__.py +45 -3
- package/src/skcapstone/_cli_monolith.py +20 -15
- package/src/skcapstone/activity.py +5 -1
- package/src/skcapstone/agent_card.py +3 -2
- package/src/skcapstone/api.py +41 -40
- package/src/skcapstone/auction.py +14 -11
- package/src/skcapstone/backup.py +2 -1
- package/src/skcapstone/blueprint_registry.py +4 -3
- package/src/skcapstone/brain_first.py +238 -0
- package/src/skcapstone/changelog.py +1 -1
- package/src/skcapstone/chat.py +22 -17
- package/src/skcapstone/cli/__init__.py +9 -1
- package/src/skcapstone/cli/_common.py +1 -0
- package/src/skcapstone/cli/agents_spawner.py +5 -2
- package/src/skcapstone/cli/alerts.py +25 -4
- package/src/skcapstone/cli/bench.py +15 -15
- package/src/skcapstone/cli/chat.py +7 -4
- package/src/skcapstone/cli/consciousness.py +5 -2
- package/src/skcapstone/cli/context_cmd.py +18 -4
- package/src/skcapstone/cli/daemon.py +11 -7
- package/src/skcapstone/cli/gtd.py +26 -1
- package/src/skcapstone/cli/housekeeping.py +3 -3
- package/src/skcapstone/cli/identity_cmd.py +378 -0
- package/src/skcapstone/cli/joule_cmd.py +7 -3
- package/src/skcapstone/cli/memory.py +8 -6
- package/src/skcapstone/cli/peers_dir.py +1 -1
- package/src/skcapstone/cli/register_cmd.py +29 -3
- package/src/skcapstone/cli/scheduler_cmd.py +167 -0
- package/src/skcapstone/cli/session.py +25 -0
- package/src/skcapstone/cli/setup.py +96 -29
- package/src/skcapstone/cli/shell_cmd.py +53 -1
- package/src/skcapstone/cli/skills_cmd.py +2 -2
- package/src/skcapstone/cli/soul.py +8 -5
- package/src/skcapstone/cli/status.py +37 -11
- package/src/skcapstone/cli/telegram.py +21 -0
- package/src/skcapstone/cli/test_cmd.py +5 -5
- package/src/skcapstone/cli/test_connection.py +2 -2
- package/src/skcapstone/cli/upgrade_cmd.py +23 -14
- package/src/skcapstone/cli/version_cmd.py +1 -1
- package/src/skcapstone/cli/watch_cmd.py +9 -6
- package/src/skcapstone/cloud9_bridge.py +14 -14
- package/src/skcapstone/codex_setup.py +255 -0
- package/src/skcapstone/config_validator.py +7 -4
- package/src/skcapstone/consciousness_config.py +5 -1
- package/src/skcapstone/consciousness_loop.py +313 -273
- package/src/skcapstone/context_loader.py +121 -0
- package/src/skcapstone/coord_federation.py +2 -1
- package/src/skcapstone/coordination.py +23 -6
- package/src/skcapstone/crush_integration.py +2 -1
- package/src/skcapstone/daemon.py +132 -77
- package/src/skcapstone/dashboard.py +10 -10
- package/src/skcapstone/data/sk-agent-picker.sh +421 -0
- package/src/skcapstone/data/systemd/skcapstone-api.socket +9 -0
- package/src/skcapstone/data/systemd/skcapstone-memory-compress.service +18 -0
- package/src/skcapstone/data/systemd/skcapstone-memory-compress.timer +11 -0
- package/src/skcapstone/data/systemd/skcapstone.service +37 -0
- package/src/skcapstone/data/systemd/skcapstone@.service +50 -0
- package/src/skcapstone/data/systemd/skcomms-heartbeat.service +18 -0
- package/{systemd/skcomm-heartbeat.timer → src/skcapstone/data/systemd/skcomms-heartbeat.timer} +2 -2
- package/src/skcapstone/data/systemd/skcomms-queue-drain.service +17 -0
- package/{systemd/skcomm-queue-drain.timer → src/skcapstone/data/systemd/skcomms-queue-drain.timer} +2 -2
- package/src/skcapstone/defaults/claude/CLAUDE.md +67 -0
- package/src/skcapstone/defaults/claude/settings.json +74 -0
- package/src/skcapstone/defaults/lumina/config/claude-hooks.md +57 -0
- package/src/skcapstone/defaults/lumina/config/skgraph.yaml +55 -10
- package/src/skcapstone/defaults/lumina/config/skmemory.yaml +79 -13
- package/src/skcapstone/defaults/lumina/config/skvector.yaml +60 -9
- package/src/skcapstone/defaults/lumina/memory/long-term/18b9c0d1e2f3-cloud9-protocol.json +2 -2
- package/src/skcapstone/defaults/lumina/memory/long-term/a1b2c3d4e5f6-ecosystem-overview.json +2 -2
- package/src/skcapstone/defaults/lumina/memory/long-term/b2c3d4e5f6a7-five-pillars.json +9 -9
- package/src/skcapstone/defaults/lumina/memory/long-term/d4e5f6a7b8c9-site-directory.json +2 -2
- package/src/skcapstone/defaults/unhinged.json +13 -0
- package/src/skcapstone/discovery.py +43 -20
- package/src/skcapstone/doctor.py +941 -22
- package/src/skcapstone/dreaming.py +1183 -109
- package/src/skcapstone/emotion_tracker.py +2 -2
- package/src/skcapstone/export.py +4 -3
- package/src/skcapstone/fuse_mount.py +14 -12
- package/src/skcapstone/gui_installer.py +2 -2
- package/src/skcapstone/heartbeat.py +1 -1
- package/src/skcapstone/housekeeping.py +14 -14
- package/src/skcapstone/install_wizard.py +209 -7
- package/src/skcapstone/itil.py +13 -4
- package/src/skcapstone/kms_scheduler.py +10 -8
- package/src/skcapstone/launchd.py +19 -19
- package/src/skcapstone/mcp_launcher.py +15 -1
- package/src/skcapstone/mcp_server.py +83 -49
- package/src/skcapstone/mcp_tools/__init__.py +2 -0
- package/src/skcapstone/mcp_tools/_helpers.py +2 -2
- package/src/skcapstone/mcp_tools/ansible_tools.py +7 -4
- package/src/skcapstone/mcp_tools/brain_first_tools.py +90 -0
- package/src/skcapstone/mcp_tools/capauth_tools.py +7 -4
- package/src/skcapstone/mcp_tools/comm_tools.py +10 -10
- package/src/skcapstone/mcp_tools/coord_tools.py +8 -4
- package/src/skcapstone/mcp_tools/did_tools.py +11 -8
- package/src/skcapstone/mcp_tools/gtd_tools.py +4 -4
- package/src/skcapstone/mcp_tools/memory_tools.py +6 -2
- package/src/skcapstone/mcp_tools/notification_tools.py +22 -6
- package/src/skcapstone/mcp_tools/{skcomm_tools.py → skcomms_tools.py} +14 -14
- package/src/skcapstone/mcp_tools/soul_tools.py +8 -2
- package/src/skcapstone/mdns_discovery.py +2 -2
- package/src/skcapstone/memory_curator.py +1 -1
- package/src/skcapstone/memory_engine.py +10 -3
- package/src/skcapstone/metrics.py +30 -16
- package/src/skcapstone/migrate_memories.py +4 -3
- package/src/skcapstone/migrate_multi_agent.py +8 -7
- package/src/skcapstone/models.py +47 -5
- package/src/skcapstone/notifications.py +42 -18
- package/src/skcapstone/onboard.py +875 -121
- package/src/skcapstone/operator_link.py +170 -0
- package/src/skcapstone/peer_directory.py +4 -4
- package/src/skcapstone/peers.py +19 -19
- package/src/skcapstone/pillars/__init__.py +7 -5
- package/src/skcapstone/pillars/consciousness.py +191 -0
- package/src/skcapstone/pillars/identity.py +51 -7
- package/src/skcapstone/pillars/memory.py +9 -3
- package/src/skcapstone/pillars/sync.py +2 -2
- package/src/skcapstone/preflight.py +3 -3
- package/src/skcapstone/providers/docker.py +28 -28
- package/src/skcapstone/register.py +6 -6
- package/src/skcapstone/registry_client.py +5 -4
- package/src/skcapstone/runtime.py +14 -3
- package/src/skcapstone/scheduled_tasks.py +254 -19
- package/src/skcapstone/scheduler_jobs.py +456 -0
- package/src/skcapstone/scheduler_runner.py +239 -0
- package/src/skcapstone/scheduler_state.py +162 -0
- package/src/skcapstone/sdk.py +310 -0
- package/src/skcapstone/service_health.py +279 -39
- package/src/skcapstone/session_briefing.py +108 -0
- package/src/skcapstone/session_capture.py +1 -1
- package/src/skcapstone/shell.py +7 -1
- package/src/skcapstone/soul.py +3 -1
- package/src/skcapstone/soul_switch.py +3 -1
- package/src/skcapstone/summary.py +6 -6
- package/src/skcapstone/sync_engine.py +15 -15
- package/src/skcapstone/sync_watcher.py +2 -2
- package/src/skcapstone/systemd.py +55 -21
- package/src/skcapstone/team_comms.py +8 -8
- package/src/skcapstone/team_engine.py +1 -1
- package/src/skcapstone/testrunner.py +3 -3
- package/src/skcapstone/trust_graph.py +40 -5
- package/src/skcapstone/unified_search.py +15 -6
- package/src/skcapstone/uninstall_wizard.py +11 -3
- package/src/skcapstone/version_check.py +8 -4
- package/src/skcapstone/warmth_anchor.py +4 -2
- package/src/skcapstone/whoami.py +4 -4
- package/systemd/skcapstone.service +4 -6
- package/systemd/skcapstone@.service +7 -8
- package/systemd/skcomms-heartbeat.service +21 -0
- package/systemd/skcomms-heartbeat.timer +12 -0
- package/systemd/skcomms-queue-drain.service +17 -0
- package/systemd/skcomms-queue-drain.timer +12 -0
- package/tests/conftest.py +39 -0
- package/tests/integration/test_consciousness_e2e.py +39 -39
- package/tests/test_agent_card.py +1 -1
- package/tests/test_agent_home_scaffold.py +34 -0
- package/tests/test_alerts_consumer_topics.py +27 -0
- package/tests/test_backup.py +2 -1
- package/tests/test_chat.py +6 -6
- package/tests/test_claude_md.py +2 -2
- package/tests/test_cli_skills.py +10 -10
- package/tests/test_cli_test_cmd.py +4 -4
- package/tests/test_cli_test_connection.py +1 -1
- package/tests/test_cloud9_bridge.py +6 -6
- package/tests/test_consciousness_e2e.py +1 -1
- package/tests/test_consciousness_loop.py +10 -10
- package/tests/test_coordination.py +25 -0
- package/tests/test_cross_package.py +21 -21
- package/tests/test_daemon.py +4 -4
- package/tests/test_daemon_shutdown.py +1 -1
- package/tests/test_docker_provider.py +29 -29
- package/tests/test_doctor.py +400 -0
- package/tests/test_doctor_skscheduler.py +50 -0
- package/tests/test_dreaming_engine.py +147 -0
- package/tests/test_dreaming_gtd_capture.py +35 -0
- package/tests/test_e2e_automated.py +8 -5
- package/tests/test_fuse_mount.py +10 -10
- package/tests/test_gtd_brief.py +46 -0
- package/tests/test_gtd_malformed_tolerance.py +31 -0
- package/tests/test_housekeeping.py +15 -15
- package/tests/test_identity_migrate.py +251 -0
- package/tests/test_integration_backbone.py +598 -0
- package/tests/test_itil_gtd_lifecycle.py +37 -0
- package/tests/test_jobs_dropins.py +84 -0
- package/tests/test_mcp_server.py +82 -37
- package/tests/test_models.py +48 -4
- package/tests/test_multi_agent.py +31 -29
- package/tests/test_notifications.py +122 -32
- package/tests/test_onboard.py +63 -75
- package/tests/test_operator_link.py +78 -0
- package/tests/test_peers.py +14 -14
- package/tests/test_pillars.py +98 -0
- package/tests/test_preflight.py +3 -3
- package/tests/test_runtime.py +21 -0
- package/tests/test_scheduled_tasks.py +11 -6
- package/tests/test_scheduler_cli.py +47 -0
- package/tests/test_scheduler_features.py +133 -0
- package/tests/test_scheduler_integration.py +87 -0
- package/tests/test_scheduler_jobs.py +155 -0
- package/tests/test_scheduler_runner.py +64 -0
- package/tests/test_scheduler_state.py +57 -0
- package/tests/test_sdk.py +70 -0
- package/tests/test_service_health_incidents.py +34 -0
- package/tests/test_service_registry.py +52 -0
- package/tests/test_session_briefing.py +130 -0
- package/tests/test_snapshots.py +4 -4
- package/tests/test_sync_pipeline.py +26 -26
- package/tests/test_team_comms.py +2 -2
- package/tests/test_testrunner.py +2 -2
- package/tests/test_trust_graph.py +18 -0
- package/tests/test_unified_search.py +2 -2
- package/tests/test_version_check.py +10 -0
- package/tests/test_version_cmd.py +8 -8
- package/tests/test_whoami.py +1 -1
- package/systemd/skcomm-heartbeat.service +0 -18
- package/systemd/skcomm-queue-drain.service +0 -17
- /package/{openclaw-plugin → openclaw-plugin.archived-2026-04-23}/package.json +0 -0
- /package/{openclaw-plugin → openclaw-plugin.archived-2026-04-23}/src/openclaw.plugin.json +0 -0
|
@@ -0,0 +1,1265 @@
|
|
|
1
|
+
# skscheduler Implementation Plan
|
|
2
|
+
|
|
3
|
+
> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
|
|
4
|
+
|
|
5
|
+
**Goal:** Turn skcapstone's interval-only `TaskScheduler` into a unified, config-driven fleet job scheduler ("skscheduler") with cron schedules, python/shell/agent job types, per-node affinity, node-local state, a CLI, and a daily GTD-triage agent job on .41.
|
|
6
|
+
|
|
7
|
+
**Architecture:** Extend the existing `src/skcapstone/scheduled_tasks.py`. A synced `~/.skcapstone/config/jobs.yaml` is the single registry; new focused modules handle the job spec/config (`scheduler_jobs.py`), node-local state + locks (`scheduler_state.py`), and execution (`scheduler_runner.py`). The `TaskScheduler` tick loop gains a second pass that fires due config jobs whose node-affinity includes this host. State/logs live under `~/.skcapstone/scheduler/<hostname>/` (never synced).
|
|
8
|
+
|
|
9
|
+
**Tech Stack:** Python 3.11+, `croniter` (new dep), `pyyaml` (already used), `click` (CLI), `pytest`. Spec: `docs/superpowers/specs/2026-06-08-skscheduler-design.md`.
|
|
10
|
+
|
|
11
|
+
**Branch:** `feat/skscheduler`. Run tests with `~/.skenv/bin/python -m pytest`.
|
|
12
|
+
|
|
13
|
+
---
|
|
14
|
+
|
|
15
|
+
## Quick wins first (independent of the scheduler)
|
|
16
|
+
|
|
17
|
+
### Task 1: Fix ITIL problem-close → GTD project completion leak
|
|
18
|
+
|
|
19
|
+
A GTD project is created for each problem (`itil.py:510`), but its id is never stored on the problem, and closing/resolving the problem never completes that project, which leaves stale projects behind (e.g. `prb-1c7ae152` at 82 days). Mirror the incident-side behavior.
|
|
20
|
+
|
|
21
|
+
**Files:**
|
|
22
|
+
- Modify: `src/skcapstone/itil.py` (`create_problem` ~510, `update_problem` ~536)
|
|
23
|
+
- Test: `tests/test_itil_gtd_lifecycle.py`
|
|
24
|
+
|
|
25
|
+
- [ ] **Step 1: Write the failing test**
|
|
26
|
+
|
|
27
|
+
```python
|
|
28
|
+
# tests/test_itil_gtd_lifecycle.py
|
|
29
|
+
from pathlib import Path
|
|
30
|
+
from skcapstone.itil import ITILManager
|
|
31
|
+
from skcapstone.mcp_tools.gtd_tools import _load_list, _load_archive
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def test_resolving_problem_completes_its_gtd_project(tmp_path: Path, monkeypatch):
    """Resolving a problem moves its auto-created GTD project to the archive as done."""
    # gtd_tools resolves paths from SHARED_ROOT/coordination/gtd — point it at tmp
    monkeypatch.setenv("SKCAPSTONE_HOME", str(tmp_path))
    manager = ITILManager(str(tmp_path))

    problem = manager.create_problem(title="Flaky widget", managed_by="opus")

    def is_tracked(item) -> bool:
        """True when the item's id is one the problem recorded."""
        return item["id"] in problem.gtd_item_ids

    # The created problem must track its auto-created GTD project id
    assert problem.gtd_item_ids, "problem should store its GTD project id"
    assert any(is_tracked(project) for project in _load_list("projects"))

    # Walk the problem through its lifecycle up to "resolved".
    for status in ("analyzing", "resolved"):
        manager.update_problem(problem.id, agent="opus", new_status=status)

    # Project should be gone from active projects and present in archive as done
    assert not any(is_tracked(project) for project in _load_list("projects"))
    archive = _load_archive()
    assert any(is_tracked(entry) and entry["status"] == "done" for entry in archive)
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
- [ ] **Step 2: Run test to verify it fails**
|
|
54
|
+
|
|
55
|
+
Run: `~/.skenv/bin/python -m pytest tests/test_itil_gtd_lifecycle.py -v`
|
|
56
|
+
Expected: FAIL — `prb.gtd_item_ids` is empty (project id not stored).
|
|
57
|
+
|
|
58
|
+
- [ ] **Step 3: Store the project id in `create_problem`**
|
|
59
|
+
|
|
60
|
+
In `src/skcapstone/itil.py`, change the auto-create call inside `create_problem` (currently line ~510):
|
|
61
|
+
|
|
62
|
+
```python
|
|
63
|
+
# Auto-create GTD project (and track its id so we can close it later)
|
|
64
|
+
project_id = self._create_gtd_project_for_problem(problem)
|
|
65
|
+
if project_id:
|
|
66
|
+
problem.gtd_item_ids.append(project_id)
|
|
67
|
+
self._update_record(
|
|
68
|
+
self.problems_dir, problem.id, problem.title, problem.model_dump()
|
|
69
|
+
)
|
|
70
|
+
|
|
71
|
+
return problem
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
- [ ] **Step 4: Complete the project on resolve in `update_problem`**
|
|
75
|
+
|
|
76
|
+
In `update_problem`, inside the `if new_status:` block (after `prb.timeline.append(...)` at ~539), add:
|
|
77
|
+
|
|
78
|
+
```python
|
|
79
|
+
if new_status == ProblemStatus.RESOLVED.value:
|
|
80
|
+
self._complete_gtd_items(prb.gtd_item_ids)
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
- [ ] **Step 5: Run test to verify it passes**
|
|
84
|
+
|
|
85
|
+
Run: `~/.skenv/bin/python -m pytest tests/test_itil_gtd_lifecycle.py -v`
|
|
86
|
+
Expected: PASS
|
|
87
|
+
|
|
88
|
+
- [ ] **Step 6: Commit**
|
|
89
|
+
|
|
90
|
+
```bash
|
|
91
|
+
git add src/skcapstone/itil.py tests/test_itil_gtd_lifecycle.py
|
|
92
|
+
git commit -m "fix(itil): complete GTD project when problem resolves (lifecycle leak)"
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
---
|
|
96
|
+
|
|
97
|
+
## Phase 1 — Job spec, config loader, node affinity, due-check
|
|
98
|
+
|
|
99
|
+
### Task 2: Add `croniter` dependency
|
|
100
|
+
|
|
101
|
+
**Files:**
|
|
102
|
+
- Modify: `pyproject.toml` (`dependencies` array)
|
|
103
|
+
|
|
104
|
+
- [ ] **Step 1: Add the dependency**
|
|
105
|
+
|
|
106
|
+
Add `"croniter>=2.0"` to the `[project].dependencies` list in `pyproject.toml`.
|
|
107
|
+
|
|
108
|
+
- [ ] **Step 2: Install into the venv**
|
|
109
|
+
|
|
110
|
+
Run: `~/.skenv/bin/pip install 'croniter>=2.0' -q && ~/.skenv/bin/python -c "import croniter; print(croniter.__name__)"`
|
|
111
|
+
Expected: prints `croniter`
|
|
112
|
+
|
|
113
|
+
- [ ] **Step 3: Commit**
|
|
114
|
+
|
|
115
|
+
```bash
|
|
116
|
+
git add pyproject.toml
|
|
117
|
+
git commit -m "build: add croniter dependency for skscheduler cron schedules"
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
### Task 3: `JobSpec` + config loader
|
|
121
|
+
|
|
122
|
+
**Files:**
|
|
123
|
+
- Create: `src/skcapstone/scheduler_jobs.py`
|
|
124
|
+
- Test: `tests/test_scheduler_jobs.py`
|
|
125
|
+
|
|
126
|
+
- [ ] **Step 1: Write the failing test**
|
|
127
|
+
|
|
128
|
+
```python
|
|
129
|
+
# tests/test_scheduler_jobs.py
|
|
130
|
+
from pathlib import Path
|
|
131
|
+
from skcapstone.scheduler_jobs import JobSpec, load_jobs
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def test_load_jobs_parses_yaml(tmp_path: Path):
    """load_jobs turns each entry under ``jobs:`` into a JobSpec.

    The YAML nesting is significant: job names sit two spaces under
    ``jobs:`` and their fields four spaces in. With flatter indentation the
    fields would parse as siblings of the job names rather than as their
    children.
    """
    cfg = tmp_path / "jobs.yaml"
    cfg.write_text(
        "jobs:\n"
        "  gtd-triage:\n"
        "    schedule: '0 6 * * *'\n"
        "    type: agent\n"
        "    nodes: ['.41']\n"
        "    agent: lumina\n"
        "    prompt: 'triage inbox'\n"
        "    timeout: 900\n"
        "  health:\n"
        "    every: 300s\n"
        "    type: python\n"
        "    nodes: all\n"
        "    callback: skcapstone.service_health:run_once\n",
        encoding="utf-8",
    )
    jobs = load_jobs(cfg)
    by_name = {j.name: j for j in jobs}

    # Cron-style agent job pinned to a single node.
    triage = by_name["gtd-triage"]
    assert triage.schedule == "0 6 * * *"
    assert triage.every_seconds is None
    assert triage.type == "agent"
    assert triage.nodes == [".41"]

    # Interval-style job that runs everywhere.
    health = by_name["health"]
    assert health.every_seconds == 300.0
    assert health.nodes == "all"
    assert health.enabled is True  # default
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def test_load_jobs_missing_file_returns_empty(tmp_path: Path):
    """A config path that does not exist yields an empty job list."""
    missing = tmp_path / "nope.yaml"
    loaded = load_jobs(missing)
    assert loaded == []
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
- [ ] **Step 2: Run test to verify it fails**
|
|
168
|
+
|
|
169
|
+
Run: `~/.skenv/bin/python -m pytest tests/test_scheduler_jobs.py -v`
|
|
170
|
+
Expected: FAIL — `No module named 'skcapstone.scheduler_jobs'`
|
|
171
|
+
|
|
172
|
+
- [ ] **Step 3: Write minimal implementation**
|
|
173
|
+
|
|
174
|
+
```python
|
|
175
|
+
# src/skcapstone/scheduler_jobs.py
|
|
176
|
+
"""Declarative job specs for the skscheduler, loaded from jobs.yaml."""
|
|
177
|
+
from __future__ import annotations
|
|
178
|
+
|
|
179
|
+
import logging
|
|
180
|
+
import re
|
|
181
|
+
from dataclasses import dataclass, field
|
|
182
|
+
from pathlib import Path
|
|
183
|
+
from typing import Optional, Union
|
|
184
|
+
|
|
185
|
+
logger = logging.getLogger("skcapstone.scheduler_jobs")
|
|
186
|
+
|
|
187
|
+
# <number><optional unit>, e.g. "300s", "5m", "1.5h", "2d" or a bare "90".
_DURATION_RE = re.compile(r"^\s*(\d+(?:\.\d+)?)\s*([smhd]?)\s*$")
# Seconds per unit suffix; a missing suffix means seconds.
_UNIT_SECONDS = {"": 1, "s": 1, "m": 60, "h": 3600, "d": 86400}


def _parse_duration(value: Union[str, int, float]) -> float:
    """Parse '300s' / '5m' / '1h' / 90 into seconds.

    Args:
        value: A number of seconds, or a string made of a non-negative
            decimal number and an optional unit suffix (``s``, ``m``, ``h``
            or ``d``; no suffix means seconds).

    Returns:
        The duration in seconds as a float.

    Raises:
        ValueError: If the value is a bool, a negative or non-finite
            number, or a string that does not match the duration pattern.
    """
    # bool is a subclass of int; a config value like `every: true` is a
    # mistake, not a one-second interval.
    if isinstance(value, bool):
        raise ValueError(f"invalid duration: {value!r}")
    if isinstance(value, (int, float)):
        seconds = float(value)
        # The string form cannot express negatives, NaN or infinity, so hold
        # numeric input to the same rule (NaN fails both comparisons).
        if not (0 <= seconds < float("inf")):
            raise ValueError(f"invalid duration: {value!r}")
        return seconds
    m = _DURATION_RE.match(str(value))
    if not m:
        raise ValueError(f"invalid duration: {value!r}")
    return float(m.group(1)) * _UNIT_SECONDS[m.group(2)]
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
@dataclass
class JobSpec:
    """Declarative description of one scheduler job.

    Only ``name`` is required; every other field falls back to its default.
    """

    # Identifier of the job.
    name: str
    # Job kind: python | shell | agent.
    type: str = "python"
    # Cron expression, for cron-style jobs.
    schedule: str | None = None
    # Interval form: period between runs, in seconds.
    every_seconds: float | None = None
    # Either the string "all" or a list of host aliases.
    nodes: str | list[str] = "all"
    agent: str | None = None
    prompt: str | None = None
    command: str | None = None
    # Dotted "module:fn" reference, for python jobs.
    callback: str | None = None
    # NOTE(review): presumably seconds — confirm against the job runner.
    timeout: float = 900.0
    enabled: bool = True
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def load_jobs(config_path: Path) -> list[JobSpec]:
    """Load JobSpecs from jobs.yaml. Missing file -> [].

    Malformed content is tolerated so that one bad entry cannot take the
    other jobs down with it: a top level or ``jobs`` node that is not a
    mapping yields ``[]``, and an entry that is not a mapping or carries an
    unparsable ``every`` / ``timeout`` is skipped. Each case is logged as a
    warning. YAML syntax errors are not caught and still propagate.

    Args:
        config_path: Location of the jobs.yaml file.

    Returns:
        One JobSpec per valid entry under ``jobs:``, in file order.
    """
    # EAFP: read directly instead of exists()-then-read, which can race with
    # the file disappearing in between.
    try:
        text = config_path.read_text(encoding="utf-8")
    except FileNotFoundError:
        return []

    import yaml  # imported only once there is a file to parse

    data = yaml.safe_load(text) or {}
    if not isinstance(data, dict):
        logger.warning(
            "%s: expected a mapping at top level, got %s; no jobs loaded",
            config_path,
            type(data).__name__,
        )
        return []

    entries = data.get("jobs") or {}
    if not isinstance(entries, dict):
        logger.warning(
            "%s: 'jobs' must be a mapping, got %s; no jobs loaded",
            config_path,
            type(entries).__name__,
        )
        return []

    out: list[JobSpec] = []
    for name, raw in entries.items():
        raw = raw or {}  # a bare `name:` entry means "all defaults"
        if not isinstance(raw, dict):
            logger.warning(
                "%s: skipping job %r: expected a mapping, got %s",
                config_path,
                name,
                type(raw).__name__,
            )
            continue
        raw = dict(raw)  # copy so the pop below never touches parsed data
        every = raw.pop("every", None)
        try:
            spec = JobSpec(
                name=name,
                type=raw.get("type", "python"),
                schedule=raw.get("schedule"),
                every_seconds=_parse_duration(every) if every is not None else None,
                nodes=raw.get("nodes", "all"),
                agent=raw.get("agent"),
                prompt=raw.get("prompt"),
                command=raw.get("command"),
                callback=raw.get("callback"),
                timeout=float(raw.get("timeout", 900.0)),
                enabled=bool(raw.get("enabled", True)),
            )
        except (TypeError, ValueError) as exc:
            # Bad duration or timeout: drop this job only.
            logger.warning("%s: skipping job %r: %s", config_path, name, exc)
            continue
        out.append(spec)
    return out
|
|
243
|
+
```
|
|
244
|
+
|
|
245
|
+
- [ ] **Step 4: Run test to verify it passes**
|
|
246
|
+
|
|
247
|
+
Run: `~/.skenv/bin/python -m pytest tests/test_scheduler_jobs.py -v`
|
|
248
|
+
Expected: PASS
|
|
249
|
+
|
|
250
|
+
- [ ] **Step 5: Commit**
|
|
251
|
+
|
|
252
|
+
```bash
|
|
253
|
+
git add src/skcapstone/scheduler_jobs.py tests/test_scheduler_jobs.py
|
|
254
|
+
git commit -m "feat(scheduler): JobSpec + jobs.yaml loader"
|
|
255
|
+
```
|
|
256
|
+
|
|
257
|
+
### Task 4: Node affinity resolution
|
|
258
|
+
|
|
259
|
+
**Files:**
|
|
260
|
+
- Modify: `src/skcapstone/scheduler_jobs.py`
|
|
261
|
+
- Test: `tests/test_scheduler_jobs.py`
|
|
262
|
+
|
|
263
|
+
- [ ] **Step 1: Write the failing test**
|
|
264
|
+
|
|
265
|
+
```python
|
|
266
|
+
# append to tests/test_scheduler_jobs.py
|
|
267
|
+
from skcapstone.scheduler_jobs import job_runs_here, JobSpec
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
def test_job_runs_here_all():
    """A job whose affinity is "all" runs whatever the host's aliases are."""
    everywhere = JobSpec(name="x", nodes="all")
    aliases = {"cbrd21-laptop12thgenintelcore", ".41"}
    assert job_runs_here(everywhere, host_aliases=aliases)
|
|
273
|
+
|
|
274
|
+
|
|
275
|
+
def test_job_runs_here_match_and_miss():
    """A node-pinned job runs only on a host carrying one of the listed aliases."""
    pinned = JobSpec(name="x", nodes=[".41"])
    matching_host = {".41"}
    other_host = {".158", "noroc2027"}
    assert job_runs_here(pinned, host_aliases=matching_host)
    assert not job_runs_here(pinned, host_aliases=other_host)
|
|
279
|
+
```
|
|
280
|
+
|
|
281
|
+
- [ ] **Step 2: Run test to verify it fails**
|
|
282
|
+
|
|
283
|
+
Run: `~/.skenv/bin/python -m pytest tests/test_scheduler_jobs.py -k job_runs_here -v`
|
|
284
|
+
Expected: FAIL — `cannot import name 'job_runs_here'`
|
|
285
|
+
|
|
286
|
+
- [ ] **Step 3: Add the function**
|
|
287
|
+
|
|
288
|
+
```python
|
|
289
|
+
# add to src/skcapstone/scheduler_jobs.py
|
|
290
|
+
def job_runs_here(job: JobSpec, host_aliases: set[str]) -> bool:
    """True if this host (any of its aliases) is in the job's affinity.

    Args:
        job: The job whose ``nodes`` affinity is checked.
        host_aliases: Every name this host is known by.

    Returns:
        True when ``job.nodes`` is ``"all"``, is a single alias contained in
        ``host_aliases``, or is a list/tuple/set with at least one member
        in ``host_aliases``. Anything else (e.g. ``None``) gives False.
    """
    nodes = job.nodes
    if nodes == "all":
        return True
    # A scalar such as `nodes: hostA` in the config arrives here as a plain
    # string; treat it as a one-element affinity list instead of silently
    # never matching.
    if isinstance(nodes, str):
        return nodes in host_aliases
    if isinstance(nodes, (list, tuple, set, frozenset)):
        return any(n in host_aliases for n in nodes)
    return False
|
|
297
|
+
```
|
|
298
|
+
|
|
299
|
+
- [ ] **Step 4: Run test to verify it passes**
|
|
300
|
+
|
|
301
|
+
Run: `~/.skenv/bin/python -m pytest tests/test_scheduler_jobs.py -k job_runs_here -v`
|
|
302
|
+
Expected: PASS
|
|
303
|
+
|
|
304
|
+
- [ ] **Step 5: Commit**
|
|
305
|
+
|
|
306
|
+
```bash
|
|
307
|
+
git add src/skcapstone/scheduler_jobs.py tests/test_scheduler_jobs.py
|
|
308
|
+
git commit -m "feat(scheduler): per-job node-affinity resolution"
|
|
309
|
+
```
|
|
310
|
+
|
|
311
|
+
### Task 5: Due-check (cron + interval, with misfire catch-up)
|
|
312
|
+
|
|
313
|
+
**Files:**
|
|
314
|
+
- Modify: `src/skcapstone/scheduler_jobs.py`
|
|
315
|
+
- Test: `tests/test_scheduler_jobs.py`
|
|
316
|
+
|
|
317
|
+
- [ ] **Step 1: Write the failing test**
|
|
318
|
+
|
|
319
|
+
```python
|
|
320
|
+
# append to tests/test_scheduler_jobs.py
|
|
321
|
+
from datetime import datetime, timedelta, timezone
|
|
322
|
+
from skcapstone.scheduler_jobs import is_due, JobSpec
|
|
323
|
+
|
|
324
|
+
|
|
325
|
+
def test_interval_due():
    """Interval jobs fire when never run or once the interval has elapsed."""
    job = JobSpec(name="x", every_seconds=300)
    moment = datetime(2026, 6, 8, 12, 0, 0, tzinfo=timezone.utc)
    ran_recently = moment - timedelta(seconds=100)
    ran_long_ago = moment - timedelta(seconds=301)

    assert is_due(job, last_run=None, now=moment)  # never run
    assert not is_due(job, last_run=ran_recently, now=moment)
    assert is_due(job, last_run=ran_long_ago, now=moment)
|
|
331
|
+
|
|
332
|
+
|
|
333
|
+
def test_cron_due_at_scheduled_minute():
    """Cron jobs fire once per slot, including a catch-up for a missed slot."""
    daily = JobSpec(name="x", schedule="0 6 * * *")  # daily 06:00
    tick = datetime(2026, 6, 8, 6, 0, 30, tzinfo=timezone.utc)
    five_minutes_on = tick + timedelta(minutes=5)
    previous_day = tick - timedelta(days=1)

    # never run, and we are at/after today's 06:00 slot -> due (catch-up)
    assert is_due(daily, last_run=None, now=tick)
    # already ran after today's slot -> not due again
    assert not is_due(daily, last_run=tick, now=five_minutes_on)
    # ran yesterday, now past today's slot -> due
    assert is_due(daily, last_run=previous_day, now=tick)
|
|
342
|
+
```
|
|
343
|
+
|
|
344
|
+
- [ ] **Step 2: Run test to verify it fails**
|
|
345
|
+
|
|
346
|
+
Run: `~/.skenv/bin/python -m pytest tests/test_scheduler_jobs.py -k is_due -v`
|
|
347
|
+
Expected: FAIL — `cannot import name 'is_due'`
|
|
348
|
+
|
|
349
|
+
- [ ] **Step 3: Implement due-check**
|
|
350
|
+
|
|
351
|
+
```python
|
|
352
|
+
# add to src/skcapstone/scheduler_jobs.py
|
|
353
|
+
from datetime import datetime, timezone # ensure imported at top
|
|
354
|
+
|
|
355
|
+
|
|
356
|
+
def is_due(job: JobSpec, last_run: Optional[datetime], now: Optional[datetime] = None) -> bool:
    """Return True if the job should fire now.

    Interval jobs: due when elapsed >= every_seconds (or never run). When a
    job sets both ``every_seconds`` and ``schedule``, the interval wins.
    Cron jobs: due when the most recent scheduled slot that croniter's
    ``get_prev`` reports for ``now`` is later than last_run (gives a single
    catch-up after downtime), or when the job has never run.

    Args:
        job: The job to evaluate.
        last_run: When the job last ran, or None if it never has. A naive
            datetime is interpreted as UTC.
        now: Evaluation time; defaults to the current UTC time. A naive
            datetime is interpreted as UTC.

    Returns:
        True when the job should fire; False otherwise, including for a job
        with neither ``every_seconds`` nor ``schedule``.
    """
    now = now or datetime.now(timezone.utc)
    # Treat naive datetimes as UTC — the same assumption made for croniter's
    # result below — so aware and naive values are never mixed in the
    # subtraction/comparison (which would raise TypeError).
    if now.tzinfo is None:
        now = now.replace(tzinfo=timezone.utc)
    if last_run is not None and last_run.tzinfo is None:
        last_run = last_run.replace(tzinfo=timezone.utc)

    if job.every_seconds is not None:
        if last_run is None:
            return True
        return (now - last_run).total_seconds() >= job.every_seconds

    if job.schedule:
        from croniter import croniter

        # Built before the never-run check so an invalid cron expression
        # still raises instead of being reported as due.
        prev_slot = croniter(job.schedule, now).get_prev(datetime)
        if prev_slot.tzinfo is None:
            prev_slot = prev_slot.replace(tzinfo=timezone.utc)
        if last_run is None:
            return True
        return last_run < prev_slot

    return False
|
|
382
|
+
```
|
|
383
|
+
|
|
384
|
+
- [ ] **Step 4: Run test to verify it passes**
|
|
385
|
+
|
|
386
|
+
Run: `~/.skenv/bin/python -m pytest tests/test_scheduler_jobs.py -k is_due -v`
|
|
387
|
+
Expected: PASS
|
|
388
|
+
|
|
389
|
+
- [ ] **Step 5: Commit**
|
|
390
|
+
|
|
391
|
+
```bash
|
|
392
|
+
git add src/skcapstone/scheduler_jobs.py tests/test_scheduler_jobs.py
|
|
393
|
+
git commit -m "feat(scheduler): cron + interval due-check with misfire catch-up"
|
|
394
|
+
```
|
|
395
|
+
|
|
396
|
+
---
|
|
397
|
+
|
|
398
|
+
## Phase 2 — Node-local state, locks, and the job runner
|
|
399
|
+
|
|
400
|
+
### Task 6: Node-local state store
|
|
401
|
+
|
|
402
|
+
**Files:**
|
|
403
|
+
- Create: `src/skcapstone/scheduler_state.py`
|
|
404
|
+
- Test: `tests/test_scheduler_state.py`
|
|
405
|
+
|
|
406
|
+
- [ ] **Step 1: Write the failing test**
|
|
407
|
+
|
|
408
|
+
```python
|
|
409
|
+
# tests/test_scheduler_state.py
|
|
410
|
+
from datetime import datetime, timezone
|
|
411
|
+
from pathlib import Path
|
|
412
|
+
from skcapstone.scheduler_state import SchedulerState
|
|
413
|
+
|
|
414
|
+
|
|
415
|
+
def test_state_roundtrip(tmp_path: Path):
|
|
416
|
+
st = SchedulerState(root=tmp_path, hostname="hostA")
|
|
417
|
+
assert st.last_run("job1") is None
|
|
418
|
+
now = datetime(2026, 6, 8, 6, 0, tzinfo=timezone.utc)
|
|
419
|
+
st.record_run("job1", now=now, ok=True)
|
|
420
|
+
# New instance reads persisted state
|
|
421
|
+
st2 = SchedulerState(root=tmp_path, hostname="hostA")
|
|
422
|
+
assert st2.last_run("job1") == now
|
|
423
|
+
rec = st2.get("job1")
|
|
424
|
+
assert rec["run_count"] == 1 and rec["error_count"] == 0
|
|
425
|
+
|
|
426
|
+
|
|
427
|
+
def test_state_path_is_host_scoped(tmp_path: Path):
|
|
428
|
+
st = SchedulerState(root=tmp_path, hostname="hostA")
|
|
429
|
+
assert st.state_file == tmp_path / "scheduler" / "hostA" / "state.json"
|
|
430
|
+
```
|
|
431
|
+
|
|
432
|
+
- [ ] **Step 2: Run test to verify it fails**
|
|
433
|
+
|
|
434
|
+
Run: `~/.skenv/bin/python -m pytest tests/test_scheduler_state.py -v`
|
|
435
|
+
Expected: FAIL — `No module named 'skcapstone.scheduler_state'`
|
|
436
|
+
|
|
437
|
+
- [ ] **Step 3: Write implementation**
|
|
438
|
+
|
|
439
|
+
```python
|
|
440
|
+
# src/skcapstone/scheduler_state.py
|
|
441
|
+
"""Node-local (never-synced) state for the skscheduler."""
|
|
442
|
+
from __future__ import annotations
|
|
443
|
+
|
|
444
|
+
import json
|
|
445
|
+
import logging
|
|
446
|
+
from datetime import datetime, timezone
|
|
447
|
+
from pathlib import Path
|
|
448
|
+
from typing import Optional
|
|
449
|
+
|
|
450
|
+
logger = logging.getLogger("skcapstone.scheduler_state")
|
|
451
|
+
|
|
452
|
+
|
|
453
|
+
class SchedulerState:
    """Per-host job state at <root>/scheduler/<hostname>/state.json.

    The file maps job name -> record with the keys ``last_run`` (ISO-8601
    string), ``last_status`` ("ok" / "error"), ``last_error``, ``run_count``
    and ``error_count``. An unreadable or malformed file is treated as
    "no history" rather than as a fatal error.
    """

    def __init__(self, root: Path, hostname: str) -> None:
        """Load existing state for ``hostname`` under ``root`` if present."""
        self.state_file = Path(root) / "scheduler" / hostname / "state.json"
        self._data: dict = {}
        if self.state_file.exists():
            try:
                loaded = json.loads(self.state_file.read_text(encoding="utf-8"))
            except (OSError, ValueError):
                # ValueError covers JSONDecodeError and undecodable bytes.
                loaded = None
            if isinstance(loaded, dict):
                self._data = loaded
            else:
                # Valid JSON of the wrong shape (e.g. a list) would otherwise
                # blow up later on `.get`; start from an empty history instead.
                logging.getLogger("skcapstone.scheduler_state").warning(
                    "ignoring unreadable scheduler state at %s", self.state_file
                )

    def get(self, job: str) -> dict:
        """Return a copy of the record for ``job`` (zeroed defaults if unknown)."""
        rec = self._data.get(job)
        if not isinstance(rec, dict):
            return {"run_count": 0, "error_count": 0, "last_run": None}
        # Copy so callers cannot mutate persisted state behind our back.
        return dict(rec)

    def last_run(self, job: str) -> Optional[datetime]:
        """Return when ``job`` last ran, or None if never / not parseable."""
        raw = self.get(job).get("last_run")
        if not raw:
            return None
        try:
            return datetime.fromisoformat(raw)
        except (TypeError, ValueError):
            # A corrupt timestamp is treated like "never ran".
            return None

    def record_run(self, job: str, now: Optional[datetime] = None, ok: bool = True,
                   error: str = "") -> None:
        """Record one execution of ``job`` and persist the state file.

        ``run_count`` counts successful runs only; failed runs increment
        ``error_count`` instead. ``now`` defaults to the current UTC time.
        """
        now = now or datetime.now(timezone.utc)
        rec = self.get(job)
        rec["last_run"] = now.isoformat()
        rec["last_status"] = "ok" if ok else "error"
        rec["last_error"] = "" if ok else error
        rec["run_count"] = rec.get("run_count", 0) + (1 if ok else 0)
        rec["error_count"] = rec.get("error_count", 0) + (0 if ok else 1)
        self._data[job] = rec
        self._flush()

    def all(self) -> dict:
        """Return a shallow copy of the job-name -> record mapping."""
        return dict(self._data)

    def _flush(self) -> None:
        """Write the state file atomically (temp file + rename)."""
        self.state_file.parent.mkdir(parents=True, exist_ok=True)
        # Writing in place could leave a truncated file if the process dies
        # mid-write, which would silently reset every job's history.
        tmp = self.state_file.with_name(self.state_file.name + ".tmp")
        tmp.write_text(json.dumps(self._data, indent=2) + "\n", encoding="utf-8")
        tmp.replace(self.state_file)
|
|
490
|
+
```
|
|
491
|
+
|
|
492
|
+
- [ ] **Step 4: Run test to verify it passes**
|
|
493
|
+
|
|
494
|
+
Run: `~/.skenv/bin/python -m pytest tests/test_scheduler_state.py -v`
|
|
495
|
+
Expected: PASS
|
|
496
|
+
|
|
497
|
+
- [ ] **Step 5: Commit**
|
|
498
|
+
|
|
499
|
+
```bash
|
|
500
|
+
git add src/skcapstone/scheduler_state.py tests/test_scheduler_state.py
|
|
501
|
+
git commit -m "feat(scheduler): node-local (never-synced) state store"
|
|
502
|
+
```
|
|
503
|
+
|
|
504
|
+
### Task 7: Job runner (dispatch + overlap lock + logs)
|
|
505
|
+
|
|
506
|
+
**Files:**
|
|
507
|
+
- Create: `src/skcapstone/scheduler_runner.py`
|
|
508
|
+
- Test: `tests/test_scheduler_runner.py`
|
|
509
|
+
|
|
510
|
+
- [ ] **Step 1: Write the failing test**
|
|
511
|
+
|
|
512
|
+
```python
|
|
513
|
+
# tests/test_scheduler_runner.py
|
|
514
|
+
from pathlib import Path
|
|
515
|
+
from skcapstone.scheduler_jobs import JobSpec
|
|
516
|
+
from skcapstone.scheduler_runner import JobRunner
|
|
517
|
+
|
|
518
|
+
|
|
519
|
+
def test_python_job_calls_callback(tmp_path: Path):
|
|
520
|
+
called = {}
|
|
521
|
+
import skcapstone.scheduler_runner as sr
|
|
522
|
+
sr._TEST_HOOK = lambda: called.setdefault("hit", True) # type: ignore
|
|
523
|
+
job = JobSpec(name="t", type="python", callback="skcapstone.scheduler_runner:_TEST_HOOK")
|
|
524
|
+
runner = JobRunner(log_dir=tmp_path)
|
|
525
|
+
result = runner.run(job)
|
|
526
|
+
assert result.ok and called.get("hit") is True
|
|
527
|
+
|
|
528
|
+
|
|
529
|
+
def test_shell_job_runs_command(tmp_path: Path):
|
|
530
|
+
job = JobSpec(name="echo", type="shell", command="echo hello", timeout=10)
|
|
531
|
+
result = JobRunner(log_dir=tmp_path).run(job)
|
|
532
|
+
assert result.ok
|
|
533
|
+
assert "hello" in result.output
|
|
534
|
+
|
|
535
|
+
|
|
536
|
+
def test_shell_job_nonzero_is_error(tmp_path: Path):
|
|
537
|
+
job = JobSpec(name="fail", type="shell", command="exit 3", timeout=10)
|
|
538
|
+
result = JobRunner(log_dir=tmp_path).run(job)
|
|
539
|
+
assert not result.ok and result.exit_code == 3
|
|
540
|
+
|
|
541
|
+
|
|
542
|
+
def test_overlap_lock_blocks_second_run(tmp_path: Path):
|
|
543
|
+
runner = JobRunner(log_dir=tmp_path)
|
|
544
|
+
job = JobSpec(name="locked", type="shell", command="echo x", timeout=10)
|
|
545
|
+
with runner.lock(job) as got:
|
|
546
|
+
assert got
|
|
547
|
+
with runner.lock(job) as second:
|
|
548
|
+
assert not second
|
|
549
|
+
```
|
|
550
|
+
|
|
551
|
+
- [ ] **Step 2: Run test to verify it fails**
|
|
552
|
+
|
|
553
|
+
Run: `~/.skenv/bin/python -m pytest tests/test_scheduler_runner.py -v`
|
|
554
|
+
Expected: FAIL — `No module named 'skcapstone.scheduler_runner'`
|
|
555
|
+
|
|
556
|
+
- [ ] **Step 3: Write implementation**
|
|
557
|
+
|
|
558
|
+
```python
|
|
559
|
+
# src/skcapstone/scheduler_runner.py
|
|
560
|
+
"""Executes JobSpecs by type (python | shell | agent) with overlap locking."""
|
|
561
|
+
from __future__ import annotations
|
|
562
|
+
|
|
563
|
+
import contextlib
|
|
564
|
+
import importlib
|
|
565
|
+
import logging
|
|
566
|
+
import os
|
|
567
|
+
import shlex
|
|
568
|
+
import subprocess
|
|
569
|
+
from dataclasses import dataclass
|
|
570
|
+
from datetime import datetime, timezone
|
|
571
|
+
from pathlib import Path
|
|
572
|
+
|
|
573
|
+
from .scheduler_jobs import JobSpec
|
|
574
|
+
|
|
575
|
+
logger = logging.getLogger("skcapstone.scheduler_runner")
|
|
576
|
+
|
|
577
|
+
|
|
578
|
+
@dataclass
class JobResult:
    """Outcome of a single job execution."""

    ok: bool  # True when the job finished successfully
    exit_code: int = 0  # subprocess return code; -1 if it could not run or timed out
    output: str = ""  # combined stdout + stderr of subprocess-based jobs
    error: str = ""  # failure description; empty on success


class JobRunner:
    """Execute jobs by type (python | shell | agent) with a per-job overlap lock.

    Lock files and subprocess logs are both kept in ``log_dir``.
    """

    _log = logging.getLogger("skcapstone.scheduler_runner")

    def __init__(self, log_dir: Path) -> None:
        self.log_dir = Path(log_dir)

    @staticmethod
    def _lock_is_stale(lock_path: Path) -> bool:
        """Return True only if the lock's recorded PID is provably dead.

        Anything uncertain (unreadable / half-written file, non-POSIX
        platform, permission error while probing) counts as "still held".
        """
        if os.name != "posix":
            # Signal 0 is not a harmless liveness probe on non-POSIX platforms.
            return False
        try:
            pid = int(lock_path.read_text(encoding="utf-8").strip())
        except (OSError, ValueError):
            return False
        if pid <= 0:
            return False
        try:
            os.kill(pid, 0)  # signal 0: existence check only
        except ProcessLookupError:
            return True
        except (OSError, OverflowError):
            return False
        return False

    @contextlib.contextmanager
    def lock(self, job: JobSpec):
        """Per-job lockfile; yields False if already held (overlap guard).

        Yields True when the lock was acquired; the lock file is removed on
        exit. A lock left behind by a process that no longer exists is
        reclaimed so a crash cannot block the job forever.
        """
        self.log_dir.mkdir(parents=True, exist_ok=True)
        lock_path = self.log_dir / f"{job.name}.lock"
        flags = os.O_CREAT | os.O_EXCL | os.O_WRONLY
        fd = None
        try:
            fd = os.open(str(lock_path), flags)
        except FileExistsError:
            if self._lock_is_stale(lock_path):
                self._log.warning("reclaiming stale lock %s", lock_path)
                with contextlib.suppress(OSError):
                    lock_path.unlink()
                # Someone else may win the race to re-create it; that is fine.
                with contextlib.suppress(FileExistsError):
                    fd = os.open(str(lock_path), flags)
        if fd is None:
            yield False
            return
        try:
            try:
                os.write(fd, str(os.getpid()).encode())
            finally:
                os.close(fd)  # close even if the write fails
            yield True
        finally:
            with contextlib.suppress(OSError):
                lock_path.unlink()

    def run(self, job: JobSpec) -> JobResult:
        """Dispatch ``job`` by ``job.type`` and return its result.

        Unknown types and shell jobs without a command yield a failed
        ``JobResult`` instead of raising.
        """
        if job.type == "python":
            return self._run_python(job)
        if job.type == "shell":
            command = (job.command or "").strip()
            if not command:
                return JobResult(ok=False, exit_code=-1, error="shell job has no command")
            # Run through a real shell: builtins (`exit 3`), pipes and
            # redirections do not work when the string is merely tokenised
            # and exec'd. The command comes from the operator's jobs.yaml.
            return self._run_subprocess(job, ["sh", "-c", command])
        if job.type == "agent":
            cmd = ["claude", "-p", job.prompt or ""]
            if job.agent:
                cmd += ["--agent", job.agent]
            return self._run_subprocess(job, cmd)
        return JobResult(ok=False, error=f"unknown job type: {job.type}")

    def _run_python(self, job: JobSpec) -> JobResult:
        """Import ``module:function`` from ``job.callback`` and call it with no args."""
        try:
            mod_name, sep, fn_name = (job.callback or "").partition(":")
            if not (mod_name and sep and fn_name):
                raise ValueError(
                    f"callback must look like 'package.module:function', got {job.callback!r}"
                )
            fn = getattr(importlib.import_module(mod_name), fn_name)
            fn()
            return JobResult(ok=True)
        except Exception as exc:  # noqa: BLE001 - jobs must never crash the loop
            self._log.error("python job '%s' failed: %s", job.name, exc)
            return JobResult(ok=False, error=str(exc))

    def _run_subprocess(self, job: JobSpec, cmd: list[str]) -> JobResult:
        """Run ``cmd`` with ``job.timeout``, writing combined output to a log file.

        The log is ``<log_dir>/<job.name>-<UTC timestamp>.log``. A non-zero
        return code is reported as failure with the last 500 characters of
        output as the error text.
        """
        self.log_dir.mkdir(parents=True, exist_ok=True)
        ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
        log_path = self.log_dir / f"{job.name}-{ts}.log"
        try:
            proc = subprocess.run(
                cmd, capture_output=True, text=True, timeout=job.timeout
            )
            out = (proc.stdout or "") + (proc.stderr or "")
            log_path.write_text(out, encoding="utf-8")
            return JobResult(
                ok=proc.returncode == 0,
                exit_code=proc.returncode,
                output=out,
                error="" if proc.returncode == 0 else out[-500:],
            )
        except subprocess.TimeoutExpired:
            return JobResult(ok=False, exit_code=-1, error=f"timeout after {job.timeout}s")
        except (OSError, ValueError) as exc:
            return JobResult(ok=False, exit_code=-1, error=str(exc))
|
|
650
|
+
```
|
|
651
|
+
|
|
652
|
+
- [ ] **Step 4: Run test to verify it passes**
|
|
653
|
+
|
|
654
|
+
Run: `~/.skenv/bin/python -m pytest tests/test_scheduler_runner.py -v`
|
|
655
|
+
Expected: PASS
|
|
656
|
+
|
|
657
|
+
- [ ] **Step 5: Commit**
|
|
658
|
+
|
|
659
|
+
```bash
|
|
660
|
+
git add src/skcapstone/scheduler_runner.py tests/test_scheduler_runner.py
|
|
661
|
+
git commit -m "feat(scheduler): job runner (python/shell/agent) with overlap lock"
|
|
662
|
+
```
|
|
663
|
+
|
|
664
|
+
---
|
|
665
|
+
|
|
666
|
+
## Phase 3 — Wire config jobs into TaskScheduler
|
|
667
|
+
|
|
668
|
+
### Task 8: Tick-loop integration of config jobs
|
|
669
|
+
|
|
670
|
+
**Files:**
|
|
671
|
+
- Modify: `src/skcapstone/scheduled_tasks.py` (`TaskScheduler`)
|
|
672
|
+
- Test: `tests/test_scheduler_integration.py`
|
|
673
|
+
|
|
674
|
+
- [ ] **Step 1: Write the failing test**
|
|
675
|
+
|
|
676
|
+
```python
|
|
677
|
+
# tests/test_scheduler_integration.py
|
|
678
|
+
import threading
|
|
679
|
+
from datetime import datetime, timezone
|
|
680
|
+
from pathlib import Path
|
|
681
|
+
from skcapstone.scheduler_jobs import JobSpec
|
|
682
|
+
from skcapstone.scheduled_tasks import TaskScheduler
|
|
683
|
+
|
|
684
|
+
|
|
685
|
+
def test_due_config_job_for_this_host_fires(tmp_path: Path):
|
|
686
|
+
sched = TaskScheduler(home=tmp_path, stop_event=threading.Event())
|
|
687
|
+
fired = []
|
|
688
|
+
job = JobSpec(name="j", type="shell", command="true", every_seconds=1, nodes=["hostA"])
|
|
689
|
+
sched.load_config_jobs(
|
|
690
|
+
jobs=[job], hostname="hostA", host_aliases={"hostA"}, state_root=tmp_path
|
|
691
|
+
)
|
|
692
|
+
# patch the runner to record instead of subprocess
|
|
693
|
+
sched._job_runner.run = lambda j: fired.append(j.name) or _ok() # type: ignore
|
|
694
|
+
sched.tick_config_jobs(now=datetime(2026, 6, 8, 12, 0, tzinfo=timezone.utc))
|
|
695
|
+
assert fired == ["j"]
|
|
696
|
+
|
|
697
|
+
|
|
698
|
+
def test_job_not_for_this_host_skipped(tmp_path: Path):
|
|
699
|
+
sched = TaskScheduler(home=tmp_path, stop_event=threading.Event())
|
|
700
|
+
fired = []
|
|
701
|
+
job = JobSpec(name="j", type="shell", command="true", every_seconds=1, nodes=[".41"])
|
|
702
|
+
sched.load_config_jobs(jobs=[job], hostname="hostB", host_aliases={"hostB"}, state_root=tmp_path)
|
|
703
|
+
sched._job_runner.run = lambda j: fired.append(j.name) # type: ignore
|
|
704
|
+
sched.tick_config_jobs()
|
|
705
|
+
assert fired == []
|
|
706
|
+
|
|
707
|
+
|
|
708
|
+
def _ok():
|
|
709
|
+
from skcapstone.scheduler_runner import JobResult
|
|
710
|
+
return JobResult(ok=True)
|
|
711
|
+
```
|
|
712
|
+
|
|
713
|
+
- [ ] **Step 2: Run test to verify it fails**
|
|
714
|
+
|
|
715
|
+
Run: `~/.skenv/bin/python -m pytest tests/test_scheduler_integration.py -v`
|
|
716
|
+
Expected: FAIL — `TaskScheduler` has no `load_config_jobs`.
|
|
717
|
+
|
|
718
|
+
- [ ] **Step 3: Extend `TaskScheduler`**
|
|
719
|
+
|
|
720
|
+
Add imports near the top of `scheduled_tasks.py`:
|
|
721
|
+
|
|
722
|
+
```python
|
|
723
|
+
from .scheduler_jobs import JobSpec, is_due, job_runs_here
|
|
724
|
+
from .scheduler_runner import JobRunner
|
|
725
|
+
from .scheduler_state import SchedulerState
|
|
726
|
+
```
|
|
727
|
+
|
|
728
|
+
Add to `TaskScheduler.__init__` (after `self._thread = None`):
|
|
729
|
+
|
|
730
|
+
```python
|
|
731
|
+
self._config_jobs: list[JobSpec] = []
|
|
732
|
+
self._host_aliases: set[str] = set()
|
|
733
|
+
self._state: Optional[SchedulerState] = None
|
|
734
|
+
self._job_runner: Optional[JobRunner] = None
|
|
735
|
+
```
|
|
736
|
+
|
|
737
|
+
Add methods to `TaskScheduler`:
|
|
738
|
+
|
|
739
|
+
```python
|
|
740
|
+
def load_config_jobs(
    self,
    jobs: list[JobSpec],
    hostname: str,
    host_aliases: set[str],
    state_root: Path,
) -> None:
    """Attach the config-driven jobs that apply to this host.

    Creates the per-host state store and job runner under
    ``<state_root>/scheduler/<hostname>/`` and keeps only jobs that are
    enabled and whose node affinity matches ``host_aliases``.
    """
    self._host_aliases = host_aliases
    self._state = SchedulerState(root=state_root, hostname=hostname)

    logs_dir = state_root / "scheduler" / hostname / "logs"
    self._job_runner = JobRunner(log_dir=logs_dir)

    selected = []
    for candidate in jobs:
        if candidate.enabled and job_runs_here(candidate, host_aliases):
            selected.append(candidate)
    self._config_jobs = selected

    logger.info("Loaded %d config job(s) for host %s", len(selected), hostname)
|
|
755
|
+
|
|
756
|
+
def tick_config_jobs(self, now: Optional[datetime] = None) -> None:
    """Fire due config jobs once (called each scheduler tick).

    Does nothing until ``load_config_jobs`` has attached jobs, state and a
    runner. For each due job the overlap lock is taken, the job is run, and
    the outcome is recorded with ``now`` as the run time. A job whose lock is
    already held is skipped for this tick.

    Args:
        now: Reference time for due-checks and state records; defaults to
            the current UTC time.
    """
    if not self._config_jobs or self._state is None or self._job_runner is None:
        return
    now = now or datetime.now(timezone.utc)
    for job in self._config_jobs:
        if not is_due(job, self._state.last_run(job.name), now):
            continue
        with self._job_runner.lock(job) as got:
            if not got:
                logger.debug("job '%s' still running — skip", job.name)
                continue
            try:
                result = self._job_runner.run(job)
                ok, error = result.ok, result.error
            except Exception as exc:  # noqa: BLE001 - one job must not abort the tick
                # An unexpected runner failure is recorded like any other
                # job failure so the remaining jobs still get their turn.
                logger.exception("job '%s' raised unexpectedly", job.name)
                ok, error = False, str(exc)
            self._state.record_run(job.name, now=now, ok=ok, error=error)
            if not ok:
                logger.warning("job '%s' failed: %s", job.name, error)
|
|
774
|
+
```
|
|
775
|
+
|
|
776
|
+
Then call it from the loop — in `_run`, after the existing `task.run()` loop and before `self._stop_event.wait(...)`:
|
|
777
|
+
|
|
778
|
+
```python
|
|
779
|
+
self.tick_config_jobs(now)
|
|
780
|
+
```
|
|
781
|
+
|
|
782
|
+
- [ ] **Step 4: Run test to verify it passes**
|
|
783
|
+
|
|
784
|
+
Run: `~/.skenv/bin/python -m pytest tests/test_scheduler_integration.py -v`
|
|
785
|
+
Expected: PASS
|
|
786
|
+
|
|
787
|
+
- [ ] **Step 5: Commit**
|
|
788
|
+
|
|
789
|
+
```bash
|
|
790
|
+
git add src/skcapstone/scheduled_tasks.py tests/test_scheduler_integration.py
|
|
791
|
+
git commit -m "feat(scheduler): fire config jobs from the TaskScheduler tick loop"
|
|
792
|
+
```
|
|
793
|
+
|
|
794
|
+
### Task 9: Load jobs.yaml in `build_scheduler`
|
|
795
|
+
|
|
796
|
+
**Files:**
|
|
797
|
+
- Modify: `src/skcapstone/scheduled_tasks.py` (`build_scheduler`)
|
|
798
|
+
- Test: `tests/test_scheduler_integration.py`
|
|
799
|
+
|
|
800
|
+
- [ ] **Step 1: Write the failing test**
|
|
801
|
+
|
|
802
|
+
```python
|
|
803
|
+
# append to tests/test_scheduler_integration.py
|
|
804
|
+
import socket as _socket
|
|
805
|
+
from skcapstone.scheduled_tasks import build_scheduler
|
|
806
|
+
|
|
807
|
+
|
|
808
|
+
def test_build_scheduler_loads_jobs_yaml(tmp_path, monkeypatch):
|
|
809
|
+
cfg_dir = tmp_path / "config"
|
|
810
|
+
cfg_dir.mkdir()
|
|
811
|
+
(cfg_dir / "jobs.yaml").write_text(
|
|
812
|
+
"jobs:\n noop:\n every: 60s\n type: shell\n command: 'true'\n nodes: all\n",
|
|
813
|
+
encoding="utf-8",
|
|
814
|
+
)
|
|
815
|
+
sched = build_scheduler(home=tmp_path, stop_event=threading.Event())
|
|
816
|
+
assert any(j.name == "noop" for j in sched._config_jobs)
|
|
817
|
+
```
|
|
818
|
+
|
|
819
|
+
- [ ] **Step 2: Run test to verify it fails**
|
|
820
|
+
|
|
821
|
+
Run: `~/.skenv/bin/python -m pytest tests/test_scheduler_integration.py -k build_scheduler -v`
|
|
822
|
+
Expected: FAIL — `_config_jobs` is empty (no loading yet).
|
|
823
|
+
|
|
824
|
+
- [ ] **Step 3: Implement loading + alias resolution helper**
|
|
825
|
+
|
|
826
|
+
Add a helper to `scheduler_jobs.py`:
|
|
827
|
+
|
|
828
|
+
```python
|
|
829
|
+
# src/skcapstone/scheduler_jobs.py
|
|
830
|
+
import socket
|
|
831
|
+
|
|
832
|
+
|
|
833
|
+
def current_host_aliases() -> set[str]:
    """Return every name this host answers to for job affinity.

    The set always contains ``socket.gethostname()``. Extra aliases come from
    the comma-separated ``SK_NODE_ALIAS`` environment variable, so jobs.yaml
    can use friendly names such as ".41"; surrounding whitespace is stripped
    and empty entries are ignored.
    """
    import os

    configured = os.environ.get("SK_NODE_ALIAS", "").split(",")
    extras = {token for token in map(str.strip, configured) if token}
    return {socket.gethostname()} | extras
|
|
841
|
+
```
|
|
842
|
+
|
|
843
|
+
In `build_scheduler` (after the built-in `scheduler.register(...)` calls, before `return scheduler`):
|
|
844
|
+
|
|
845
|
+
```python
|
|
846
|
+
# Config-driven jobs (jobs.yaml) — the unified registry
|
|
847
|
+
from .scheduler_jobs import load_jobs, current_host_aliases
|
|
848
|
+
|
|
849
|
+
jobs_path = Path(home) / "config" / "jobs.yaml"
|
|
850
|
+
jobs = load_jobs(jobs_path)
|
|
851
|
+
if jobs:
|
|
852
|
+
aliases = current_host_aliases()
|
|
853
|
+
scheduler.load_config_jobs(
|
|
854
|
+
jobs=jobs,
|
|
855
|
+
hostname=socket.gethostname(),
|
|
856
|
+
host_aliases=aliases,
|
|
857
|
+
state_root=Path(home),
|
|
858
|
+
)
|
|
859
|
+
```
|
|
860
|
+
|
|
861
|
+
Add `import socket` at the top of `scheduled_tasks.py` if not present.
|
|
862
|
+
|
|
863
|
+
- [ ] **Step 4: Run test to verify it passes**
|
|
864
|
+
|
|
865
|
+
Run: `~/.skenv/bin/python -m pytest tests/test_scheduler_integration.py -k build_scheduler -v`
|
|
866
|
+
Expected: PASS
|
|
867
|
+
|
|
868
|
+
- [ ] **Step 5: Commit**
|
|
869
|
+
|
|
870
|
+
```bash
|
|
871
|
+
git add src/skcapstone/scheduled_tasks.py src/skcapstone/scheduler_jobs.py tests/test_scheduler_integration.py
|
|
872
|
+
git commit -m "feat(scheduler): load jobs.yaml into build_scheduler with host-alias resolution"
|
|
873
|
+
```
|
|
874
|
+
|
|
875
|
+
---
|
|
876
|
+
|
|
877
|
+
## Phase 4 — CLI
|
|
878
|
+
|
|
879
|
+
### Task 10: `skcapstone scheduler` command group
|
|
880
|
+
|
|
881
|
+
**Files:**
|
|
882
|
+
- Create: `src/skcapstone/cli/scheduler_cmd.py`
|
|
883
|
+
- Modify: `src/skcapstone/cli/__init__.py`
|
|
884
|
+
- Test: `tests/test_scheduler_cli.py`
|
|
885
|
+
|
|
886
|
+
- [ ] **Step 1: Write the failing test**
|
|
887
|
+
|
|
888
|
+
```python
|
|
889
|
+
# tests/test_scheduler_cli.py
|
|
890
|
+
import click
|
|
891
|
+
from click.testing import CliRunner
|
|
892
|
+
from skcapstone.cli.scheduler_cmd import register_scheduler_commands
|
|
893
|
+
|
|
894
|
+
|
|
895
|
+
def _app(tmp_path, monkeypatch):
|
|
896
|
+
monkeypatch.setenv("SKCAPSTONE_HOME", str(tmp_path))
|
|
897
|
+
(tmp_path / "config").mkdir(parents=True, exist_ok=True)
|
|
898
|
+
(tmp_path / "config" / "jobs.yaml").write_text(
|
|
899
|
+
"jobs:\n demo:\n every: 60s\n type: shell\n command: 'echo hi'\n nodes: all\n",
|
|
900
|
+
encoding="utf-8",
|
|
901
|
+
)
|
|
902
|
+
|
|
903
|
+
@click.group()
|
|
904
|
+
def main():
|
|
905
|
+
pass
|
|
906
|
+
|
|
907
|
+
register_scheduler_commands(main)
|
|
908
|
+
return main
|
|
909
|
+
|
|
910
|
+
|
|
911
|
+
def test_scheduler_list(tmp_path, monkeypatch):
|
|
912
|
+
main = _app(tmp_path, monkeypatch)
|
|
913
|
+
res = CliRunner().invoke(main, ["scheduler", "list"])
|
|
914
|
+
assert res.exit_code == 0 and "demo" in res.output
|
|
915
|
+
|
|
916
|
+
|
|
917
|
+
def test_scheduler_run_now(tmp_path, monkeypatch):
|
|
918
|
+
main = _app(tmp_path, monkeypatch)
|
|
919
|
+
res = CliRunner().invoke(main, ["scheduler", "run", "demo"])
|
|
920
|
+
assert res.exit_code == 0 and "hi" in res.output
|
|
921
|
+
```
|
|
922
|
+
|
|
923
|
+
- [ ] **Step 2: Run test to verify it fails**
|
|
924
|
+
|
|
925
|
+
Run: `~/.skenv/bin/python -m pytest tests/test_scheduler_cli.py -v`
|
|
926
|
+
Expected: FAIL — `No module named 'skcapstone.cli.scheduler_cmd'`
|
|
927
|
+
|
|
928
|
+
- [ ] **Step 3: Write the CLI module**
|
|
929
|
+
|
|
930
|
+
```python
|
|
931
|
+
# src/skcapstone/cli/scheduler_cmd.py
|
|
932
|
+
"""`skcapstone scheduler` — manage the unified job scheduler."""
|
|
933
|
+
from __future__ import annotations
|
|
934
|
+
|
|
935
|
+
import json
|
|
936
|
+
import os
|
|
937
|
+
import socket
|
|
938
|
+
from pathlib import Path
|
|
939
|
+
|
|
940
|
+
import click
|
|
941
|
+
|
|
942
|
+
from .. import AGENT_HOME
|
|
943
|
+
from ..scheduler_jobs import load_jobs, current_host_aliases, job_runs_here
|
|
944
|
+
from ..scheduler_runner import JobRunner
|
|
945
|
+
from ..scheduler_state import SchedulerState
|
|
946
|
+
|
|
947
|
+
|
|
948
|
+
def _state_root() -> Path:
    """Return the skcapstone home: ``$SKCAPSTONE_HOME`` if set, else ``AGENT_HOME``."""
    home = os.environ.get("SKCAPSTONE_HOME", AGENT_HOME)
    return Path(home)


def _jobs_path() -> Path:
    """Return the location of ``config/jobs.yaml`` under the skcapstone home."""
    return _state_root() / "config" / "jobs.yaml"
|
|
954
|
+
|
|
955
|
+
|
|
956
|
+
def register_scheduler_commands(main: click.Group) -> None:
    """Attach the ``scheduler`` command group and its subcommands to ``main``.

    Subcommands: ``list``, ``status``, ``run``, ``logs``, ``enable``,
    ``disable``. State and logs are read from
    ``<home>/scheduler/<hostname>/`` for the local hostname.
    """

    @main.group("scheduler")
    def scheduler() -> None:
        """Manage the unified job scheduler (skscheduler)."""

    @scheduler.command("list")
    def list_jobs() -> None:
        """List all configured jobs and where they run."""
        jobs = load_jobs(_jobs_path())
        if not jobs:
            click.echo("No jobs configured.")
            return
        here = current_host_aliases()
        for j in jobs:
            sched = j.schedule or (f"every {int(j.every_seconds)}s" if j.every_seconds else "—")
            # A check mark means: enabled AND assigned to this node.
            mark = "✓" if (j.enabled and job_runs_here(j, here)) else " "
            click.echo(f"[{mark}] {j.name:24s} {j.type:6s} {sched:18s} nodes={j.nodes}")

    @scheduler.command("status")
    @click.option("--json", "as_json", is_flag=True)
    def status(as_json: bool) -> None:
        """Show last-run status for this node."""
        st = SchedulerState(root=_state_root(), hostname=socket.gethostname())
        data = st.all()
        if as_json:
            click.echo(json.dumps(data, indent=2))
            return
        if not data:
            click.echo("No run history on this node yet.")
            return
        for name, rec in data.items():
            click.echo(f"{name:24s} last={rec.get('last_run')} "
                       f"status={rec.get('last_status')} runs={rec.get('run_count')} "
                       f"errors={rec.get('error_count')}")

    @scheduler.command("run")
    @click.argument("job_name")
    @click.option("--force", is_flag=True,
                  help="Run even if the job is not assigned to this node.")
    def run_now(job_name: str, force: bool) -> None:
        """Run a job now (respects node affinity)."""
        jobs = {j.name: j for j in load_jobs(_jobs_path())}
        job = jobs.get(job_name)
        if not job:
            raise click.ClickException(f"Unknown job: {job_name}")
        # Enforce what the help text promises: a job pinned to other nodes
        # must not be started here by accident.
        if not force and not job_runs_here(job, current_host_aliases()):
            raise click.ClickException(
                f"Job '{job_name}' is not assigned to this node (nodes={job.nodes}); "
                "use --force to run it anyway."
            )
        runner = JobRunner(log_dir=_state_root() / "scheduler" / socket.gethostname() / "logs")
        result = runner.run(job)
        if result.output:
            click.echo(result.output.strip())
        if not result.ok:
            raise click.ClickException(f"Job failed: {result.error}")
        click.echo(f"✓ {job_name} done")

    @scheduler.command("logs")
    @click.argument("job_name")
    @click.option("--tail", default=40, show_default=True, type=click.IntRange(min=1))
    def logs(job_name: str, tail: int) -> None:
        """Show the latest log for a job on this node."""
        import re

        log_dir = _state_root() / "scheduler" / socket.gethostname() / "logs"
        # Match "<job>-<YYYYmmddTHHMMSSZ>.log" exactly. A plain "<job>-*.log"
        # glob would also pick up logs of jobs whose name merely starts with
        # this one (e.g. "gtd" vs "gtd-inbox-triage").
        wanted = re.compile(re.escape(job_name) + r"-\d{8}T\d{6}Z\.log")
        matches = (
            sorted(p for p in log_dir.iterdir() if wanted.fullmatch(p.name))
            if log_dir.is_dir()
            else []
        )
        if not matches:
            click.echo(f"No logs for '{job_name}'.")
            return
        # Timestamps sort lexicographically, so the last match is the newest.
        lines = matches[-1].read_text(encoding="utf-8").splitlines()
        click.echo("\n".join(lines[-tail:]))

    @scheduler.command("enable")
    @click.argument("job_name")
    def enable(job_name: str) -> None:
        """Enable a job (sets enabled: true in jobs.yaml)."""
        _set_enabled(job_name, True)
        click.echo(f"✓ enabled {job_name}")

    @scheduler.command("disable")
    @click.argument("job_name")
    def disable(job_name: str) -> None:
        """Disable a job (sets enabled: false in jobs.yaml)."""
        _set_enabled(job_name, False)
        click.echo(f"✓ disabled {job_name}")
|
|
1033
|
+
|
|
1034
|
+
|
|
1035
|
+
def _set_enabled(job_name: str, value: bool) -> None:
    """Set ``enabled`` for ``job_name`` in jobs.yaml and rewrite the file.

    Raises:
        click.ClickException: if jobs.yaml is missing, is not a mapping with
            a ``jobs`` section containing ``job_name``, or the job entry is
            not a mapping.
    """
    import yaml

    path = _jobs_path()
    try:
        raw = path.read_text(encoding="utf-8")
    except FileNotFoundError:
        # Report a clean CLI error instead of a traceback.
        raise click.ClickException(f"No jobs file at {path}") from None

    data = yaml.safe_load(raw) or {}
    jobs = data.get("jobs") if isinstance(data, dict) else None
    if not isinstance(jobs, dict) or job_name not in jobs:
        raise click.ClickException(f"Unknown job: {job_name}")

    entry = jobs[job_name]
    if entry is None:
        # "name:" with an empty body parses as None; give it a mapping.
        entry = jobs[job_name] = {}
    elif not isinstance(entry, dict):
        raise click.ClickException(f"Job '{job_name}' in {path} is not a mapping")

    entry["enabled"] = value
    # NOTE: safe_dump re-serialises the document, so comments and custom
    # formatting in jobs.yaml are not preserved.
    path.write_text(yaml.safe_dump(data, sort_keys=False), encoding="utf-8")
|
|
1045
|
+
```
|
|
1046
|
+
|
|
1047
|
+
- [ ] **Step 4: Wire into the CLI**
|
|
1048
|
+
|
|
1049
|
+
In `src/skcapstone/cli/__init__.py`, add an import alongside the others (after line ~61):
|
|
1050
|
+
|
|
1051
|
+
```python
|
|
1052
|
+
from .scheduler_cmd import register_scheduler_commands
|
|
1053
|
+
```
|
|
1054
|
+
|
|
1055
|
+
and call it where the other `register_*_commands(main)` calls live (search for `register_daemon_commands(main)` and add below it):
|
|
1056
|
+
|
|
1057
|
+
```python
|
|
1058
|
+
register_scheduler_commands(main)
|
|
1059
|
+
```
|
|
1060
|
+
|
|
1061
|
+
- [ ] **Step 5: Run tests to verify they pass**
|
|
1062
|
+
|
|
1063
|
+
Run: `~/.skenv/bin/python -m pytest tests/test_scheduler_cli.py -v`
|
|
1064
|
+
Expected: PASS
|
|
1065
|
+
|
|
1066
|
+
- [ ] **Step 6: Verify the command is wired**
|
|
1067
|
+
|
|
1068
|
+
Run: `~/.skenv/bin/skcapstone scheduler list`
|
|
1069
|
+
Expected: prints configured jobs (or "No jobs configured.")
|
|
1070
|
+
|
|
1071
|
+
- [ ] **Step 7: Commit**
|
|
1072
|
+
|
|
1073
|
+
```bash
|
|
1074
|
+
git add src/skcapstone/cli/scheduler_cmd.py src/skcapstone/cli/__init__.py tests/test_scheduler_cli.py
|
|
1075
|
+
git commit -m "feat(scheduler): skcapstone scheduler CLI (list/status/run/logs/enable/disable)"
|
|
1076
|
+
```
|
|
1077
|
+
|
|
1078
|
+
---
|
|
1079
|
+
|
|
1080
|
+
## Phase 5 — Rollout on .41
|
|
1081
|
+
|
|
1082
|
+
### Task 11: `.stignore` for node-local scheduler state
|
|
1083
|
+
|
|
1084
|
+
**Files:**
|
|
1085
|
+
- Modify: `~/.skcapstone/.stignore` (live config on .41 — not the repo)
|
|
1086
|
+
|
|
1087
|
+
- [ ] **Step 1: Add the ignore rule**
|
|
1088
|
+
|
|
1089
|
+
Ensure `~/.skcapstone/.stignore` contains (append if missing):
|
|
1090
|
+
|
|
1091
|
+
```
|
|
1092
|
+
// skscheduler node-local state/logs must never sync (syncing them would cause the very conflicts skscheduler is meant to prevent)
|
|
1093
|
+
scheduler
|
|
1094
|
+
```
|
|
1095
|
+
|
|
1096
|
+
- [ ] **Step 2: Verify the rule is present so Syncthing picks it up**
|
|
1097
|
+
|
|
1098
|
+
Run: `grep -n "^scheduler" ~/.skcapstone/.stignore`
|
|
1099
|
+
Expected: prints the `scheduler` line.
|
|
1100
|
+
|
|
1101
|
+
- [ ] **Step 3: (no commit — live config, not repo)**
|
|
1102
|
+
|
|
1103
|
+
### Task 12: Author `jobs.yaml` with the GTD-triage job (on .41)
|
|
1104
|
+
|
|
1105
|
+
**Files:**
|
|
1106
|
+
- Create: `~/.skcapstone/config/jobs.yaml` (live, synced)
|
|
1107
|
+
- Create (repo): `docs/superpowers/examples/jobs.yaml` (committed reference copy)
|
|
1108
|
+
|
|
1109
|
+
- [ ] **Step 1: Write the reference example into the repo**
|
|
1110
|
+
|
|
1111
|
+
```yaml
|
|
1112
|
+
# docs/superpowers/examples/jobs.yaml — reference for ~/.skcapstone/config/jobs.yaml
|
|
1113
|
+
jobs:
|
|
1114
|
+
gtd-inbox-triage:
|
|
1115
|
+
schedule: "0 6 * * *" # daily 06:00
|
|
1116
|
+
type: agent
|
|
1117
|
+
nodes: [".41"]
|
|
1118
|
+
agent: lumina
|
|
1119
|
+
prompt: >
|
|
1120
|
+
Triage the GTD inbox: for each item, clarify into next-action / project /
|
|
1121
|
+
someday-maybe, or archive noise; move resolved-ITIL items to done; surface
|
|
1122
|
+
stale projects. Use the gtd_* and itil_* MCP tools. Keep it concise.
|
|
1123
|
+
timeout: 900
|
|
1124
|
+
enabled: true
|
|
1125
|
+
```
|
|
1126
|
+
|
|
1127
|
+
- [ ] **Step 2: Install it as the live config on .41**
|
|
1128
|
+
|
|
1129
|
+
Run:
|
|
1130
|
+
```bash
|
|
1131
|
+
mkdir -p ~/.skcapstone/config
|
|
1132
|
+
cp docs/superpowers/examples/jobs.yaml ~/.skcapstone/config/jobs.yaml
|
|
1133
|
+
SK_NODE_ALIAS=.41 ~/.skenv/bin/skcapstone scheduler list
|
|
1134
|
+
```
|
|
1135
|
+
Expected: `gtd-inbox-triage` listed with a `✓` (it runs here because the node alias is `.41`).
|
|
1136
|
+
|
|
1137
|
+
- [ ] **Step 3: Smoke-test that a manual run is wired (an actual run would invoke `claude -p`)**
|
|
1138
|
+
|
|
1139
|
+
Run: `~/.skenv/bin/skcapstone scheduler list`
|
|
1140
|
+
Expected: job present. (Do NOT auto-run the agent in CI; manual `scheduler run gtd-inbox-triage` is a human-initiated check.)
|
|
1141
|
+
|
|
1142
|
+
- [ ] **Step 4: Commit the repo reference**
|
|
1143
|
+
|
|
1144
|
+
```bash
|
|
1145
|
+
git add docs/superpowers/examples/jobs.yaml
|
|
1146
|
+
git commit -m "docs(scheduler): reference jobs.yaml with daily gtd-inbox-triage job"
|
|
1147
|
+
```
|
|
1148
|
+
|
|
1149
|
+
### Task 13: Activate the skcapstone daemon (systemd user) on .41
|
|
1150
|
+
|
|
1151
|
+
**Files:**
|
|
1152
|
+
- Live: `~/.config/systemd/user/skcapstone.service` (from repo `systemd/skcapstone.service`)
|
|
1153
|
+
|
|
1154
|
+
- [ ] **Step 1: Confirm the daemon runs the scheduler**
|
|
1155
|
+
|
|
1156
|
+
Run: `grep -n "build_scheduler\|self._scheduler.start" src/skcapstone/daemon.py`
|
|
1157
|
+
Expected: shows the daemon builds + starts the scheduler (it does — lines ~981, ~798).
|
|
1158
|
+
|
|
1159
|
+
- [ ] **Step 2: Set the node alias for the user service**
|
|
1160
|
+
|
|
1161
|
+
Add `Environment=SK_NODE_ALIAS=.41` to `~/.config/systemd/user/skcapstone.service` under `[Service]` (so that `nodes: [".41"]` in `jobs.yaml` matches). Then:
|
|
1162
|
+
|
|
1163
|
+
```bash
|
|
1164
|
+
systemctl --user daemon-reload
|
|
1165
|
+
systemctl --user enable --now skcapstone.service
|
|
1166
|
+
systemctl --user status skcapstone.service --no-pager | head -15
|
|
1167
|
+
```
|
|
1168
|
+
Expected: `active (running)`.
|
|
1169
|
+
|
|
1170
|
+
- [ ] **Step 3: Verify the scheduler loaded the config job**
|
|
1171
|
+
|
|
1172
|
+
Run: `journalctl --user -u skcapstone.service --no-pager | grep -i "config job" | tail`
|
|
1173
|
+
Expected: a line like `Loaded 1 config job(s) for host cbrd21-laptop12thgenintelcore`.
|
|
1174
|
+
|
|
1175
|
+
- [ ] **Step 4: (no commit — live system change)**
|
|
1176
|
+
|
|
1177
|
+
---
|
|
1178
|
+
|
|
1179
|
+
## Phase 6 — Migration (after confirmation)
|
|
1180
|
+
|
|
1181
|
+
### Task 14: Migrate legacy crontab + dead timer (confirm each first)
|
|
1182
|
+
|
|
1183
|
+
**Files:**
|
|
1184
|
+
- Live: user `crontab`, `~/.config/systemd/user/`
|
|
1185
|
+
- Live: `~/.skcapstone/config/jobs.yaml`
|
|
1186
|
+
|
|
1187
|
+
- [ ] **Step 1: List what would migrate**
|
|
1188
|
+
|
|
1189
|
+
Run:
|
|
1190
|
+
```bash
|
|
1191
|
+
crontab -l
|
|
1192
|
+
systemctl --user list-timers --all --no-pager
|
|
1193
|
+
```
|
|
1194
|
+
For EACH legacy crontab entry, confirm with Chef whether it is still wanted (the `~/dkloud.douno.it/...` path predates skcapstone and may be dead). Do not migrate blindly.
|
|
1195
|
+
|
|
1196
|
+
- [ ] **Step 2: Add confirmed jobs to jobs.yaml as `shell` type**
|
|
1197
|
+
|
|
1198
|
+
For each kept job, add an entry (example shape):
|
|
1199
|
+
|
|
1200
|
+
```yaml
|
|
1201
|
+
memory-eod-rollup:
|
|
1202
|
+
schedule: "55 23 * * *"
|
|
1203
|
+
type: shell
|
|
1204
|
+
nodes: [".41"]
|
|
1205
|
+
command: "/home/cbrd21/dkloud.douno.it/p/gentistrust/skstack01/docs/memory/memory/scripts/memory-eod-rollup.sh"
|
|
1206
|
+
enabled: true
|
|
1207
|
+
```
|
|
1208
|
+
|
|
1209
|
+
- [ ] **Step 3: Remove migrated entries from crontab; retire dead `skcomms-heartbeat`**
|
|
1210
|
+
|
|
1211
|
+
```bash
|
|
1212
|
+
crontab -l | grep -v 'memory-eod-rollup.sh' | crontab - # repeat per migrated line
|
|
1213
|
+
systemctl --user disable skcomms-heartbeat.timer 2>/dev/null || true
|
|
1214
|
+
```
|
|
1215
|
+
|
|
1216
|
+
- [ ] **Step 4: Verify**
|
|
1217
|
+
|
|
1218
|
+
Run: `~/.skenv/bin/skcapstone scheduler list`
|
|
1219
|
+
Expected: migrated jobs appear; `scheduler status` shows them running over time.
|
|
1220
|
+
|
|
1221
|
+
- [ ] **Step 5: (no commit — live config)**
|
|
1222
|
+
|
|
1223
|
+
---
|
|
1224
|
+
|
|
1225
|
+
## Final verification
|
|
1226
|
+
|
|
1227
|
+
- [ ] Run the full new test set:
|
|
1228
|
+
|
|
1229
|
+
```bash
|
|
1230
|
+
~/.skenv/bin/python -m pytest tests/test_scheduler_jobs.py tests/test_scheduler_state.py \
|
|
1231
|
+
tests/test_scheduler_runner.py tests/test_scheduler_integration.py \
|
|
1232
|
+
tests/test_scheduler_cli.py tests/test_itil_gtd_lifecycle.py -v
|
|
1233
|
+
```
|
|
1234
|
+
Expected: all PASS.
|
|
1235
|
+
|
|
1236
|
+
- [ ] Confirm no regressions in existing scheduler/itil tests:
|
|
1237
|
+
|
|
1238
|
+
```bash
|
|
1239
|
+
~/.skenv/bin/python -m pytest tests/ -k "scheduled or itil or daemon" -q
|
|
1240
|
+
```
|
|
1241
|
+
|
|
1242
|
+
- [ ] Push branch and open PR:
|
|
1243
|
+
|
|
1244
|
+
```bash
|
|
1245
|
+
git push -u origin feat/skscheduler
|
|
1246
|
+
gh pr create --base main --title "skscheduler: unified fleet job scheduler" --body "Implements docs/superpowers/specs/2026-06-08-skscheduler-design.md. Refs prb-7810b08e, inc-455b1a64."
|
|
1247
|
+
```
|
|
1248
|
+
|
|
1249
|
+
---
|
|
1250
|
+
|
|
1251
|
+
## Self-review notes (coverage vs spec)
|
|
1252
|
+
|
|
1253
|
+
- Config registry (jobs.yaml) → Tasks 3, 9, 12. ✓
|
|
1254
|
+
- cron + interval schedules (croniter) → Tasks 2, 5. ✓
|
|
1255
|
+
- three job types (python/shell/agent, agent via `claude -p`) → Task 7. ✓
|
|
1256
|
+
- per-node affinity → Tasks 4, 8, 9. ✓
|
|
1257
|
+
- node-local non-synced state + .stignore → Tasks 6, 11. ✓
|
|
1258
|
+
- overlap-guard lockfiles → Task 7. ✓
|
|
1259
|
+
- misfire catch-up → Task 5. ✓
|
|
1260
|
+
- CLI list/status/run/enable/disable/logs → Task 10. ✓
|
|
1261
|
+
- daemon activation on .41 → Task 13. ✓
|
|
1262
|
+
- gtd-inbox-triage job → Task 12. ✓
|
|
1263
|
+
- migration of crontab/timers → Task 14. ✓
|
|
1264
|
+
- ITIL problem→project lifecycle fix (quick win) → Task 1. ✓
|
|
1265
|
+
- `service_health` multi-write fix (prb-7810b08e): **out of scope here** — tracked separately; this plan only makes affinity available for it.
|