clustermesh-0.9.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- clustermesh-0.9.0/.gitignore +19 -0
- clustermesh-0.9.0/LICENSE +21 -0
- clustermesh-0.9.0/PKG-INFO +200 -0
- clustermesh-0.9.0/README.md +150 -0
- clustermesh-0.9.0/Sparkpool +592 -0
- clustermesh-0.9.0/config/sites.example.yaml +21 -0
- clustermesh-0.9.0/docs/api-spec.md +206 -0
- clustermesh-0.9.0/docs/architecture.md +188 -0
- clustermesh-0.9.0/docs/fault-tolerance.md +249 -0
- clustermesh-0.9.0/docs/join-mesh.md +84 -0
- clustermesh-0.9.0/docs/publish-git.md +132 -0
- clustermesh-0.9.0/docs/publish-pypi.md +256 -0
- clustermesh-0.9.0/docs/roadmap.md +186 -0
- clustermesh-0.9.0/docs/testing-strategy.md +172 -0
- clustermesh-0.9.0/examples/checkpoint_recovery.py +53 -0
- clustermesh-0.9.0/examples/phase3_demo.py +9 -0
- clustermesh-0.9.0/examples/run_agent.py +6 -0
- clustermesh-0.9.0/examples/run_driver.py +6 -0
- clustermesh-0.9.0/examples/run_remote_job.py +30 -0
- clustermesh-0.9.0/frontend/.gitignore +4 -0
- clustermesh-0.9.0/frontend/index.html +19 -0
- clustermesh-0.9.0/frontend/package-lock.json +2860 -0
- clustermesh-0.9.0/frontend/package.json +29 -0
- clustermesh-0.9.0/frontend/postcss.config.js +6 -0
- clustermesh-0.9.0/frontend/public/favicon.svg +4 -0
- clustermesh-0.9.0/frontend/src/App.tsx +33 -0
- clustermesh-0.9.0/frontend/src/api/client.ts +305 -0
- clustermesh-0.9.0/frontend/src/api/useStream.ts +79 -0
- clustermesh-0.9.0/frontend/src/components/Badge.tsx +20 -0
- clustermesh-0.9.0/frontend/src/components/Header.tsx +61 -0
- clustermesh-0.9.0/frontend/src/components/HostMetricsPanel.tsx +139 -0
- clustermesh-0.9.0/frontend/src/components/Layout.tsx +38 -0
- clustermesh-0.9.0/frontend/src/components/NodeTerminal.tsx +106 -0
- clustermesh-0.9.0/frontend/src/components/ProgressBar.tsx +22 -0
- clustermesh-0.9.0/frontend/src/components/Sidebar.tsx +66 -0
- clustermesh-0.9.0/frontend/src/components/StatCard.tsx +36 -0
- clustermesh-0.9.0/frontend/src/index.css +44 -0
- clustermesh-0.9.0/frontend/src/lib/utils.ts +47 -0
- clustermesh-0.9.0/frontend/src/main.tsx +10 -0
- clustermesh-0.9.0/frontend/src/pages/Cluster.tsx +113 -0
- clustermesh-0.9.0/frontend/src/pages/JobDetail.tsx +96 -0
- clustermesh-0.9.0/frontend/src/pages/Jobs.tsx +109 -0
- clustermesh-0.9.0/frontend/src/pages/Libraries.tsx +277 -0
- clustermesh-0.9.0/frontend/src/pages/Logs.tsx +163 -0
- clustermesh-0.9.0/frontend/src/pages/Memory.tsx +145 -0
- clustermesh-0.9.0/frontend/src/pages/Mesh.tsx +174 -0
- clustermesh-0.9.0/frontend/src/pages/Nodes.tsx +174 -0
- clustermesh-0.9.0/frontend/src/pages/Notebook.tsx +253 -0
- clustermesh-0.9.0/frontend/src/pages/Overview.tsx +180 -0
- clustermesh-0.9.0/frontend/src/vite-env.d.ts +1 -0
- clustermesh-0.9.0/frontend/tailwind.config.js +32 -0
- clustermesh-0.9.0/frontend/tsconfig.app.json +22 -0
- clustermesh-0.9.0/frontend/tsconfig.json +7 -0
- clustermesh-0.9.0/frontend/tsconfig.node.json +10 -0
- clustermesh-0.9.0/frontend/vite.config.js +19 -0
- clustermesh-0.9.0/frontend/vite.config.ts +20 -0
- clustermesh-0.9.0/mesh/__init__.py +7 -0
- clustermesh-0.9.0/mesh/agent/__init__.py +7 -0
- clustermesh-0.9.0/mesh/agent/client.py +120 -0
- clustermesh-0.9.0/mesh/agent/config.py +28 -0
- clustermesh-0.9.0/mesh/agent/daemon.py +192 -0
- clustermesh-0.9.0/mesh/agent/executor.py +119 -0
- clustermesh-0.9.0/mesh/agent/host_metrics.py +232 -0
- clustermesh-0.9.0/mesh/agent/library.py +5 -0
- clustermesh-0.9.0/mesh/agent/monitor.py +197 -0
- clustermesh-0.9.0/mesh/agent/preemption.py +39 -0
- clustermesh-0.9.0/mesh/agent/server.py +108 -0
- clustermesh-0.9.0/mesh/agent/shell.py +84 -0
- clustermesh-0.9.0/mesh/api/__init__.py +6 -0
- clustermesh-0.9.0/mesh/api/app.py +401 -0
- clustermesh-0.9.0/mesh/api/auth.py +72 -0
- clustermesh-0.9.0/mesh/api/context.py +250 -0
- clustermesh-0.9.0/mesh/api/events.py +121 -0
- clustermesh-0.9.0/mesh/api/server.py +189 -0
- clustermesh-0.9.0/mesh/cli.py +196 -0
- clustermesh-0.9.0/mesh/discovery/__init__.py +5 -0
- clustermesh-0.9.0/mesh/discovery/mdns.py +160 -0
- clustermesh-0.9.0/mesh/driver/__init__.py +14 -0
- clustermesh-0.9.0/mesh/driver/cluster.py +170 -0
- clustermesh-0.9.0/mesh/driver/ha/__init__.py +6 -0
- clustermesh-0.9.0/mesh/driver/ha/coordinator.py +44 -0
- clustermesh-0.9.0/mesh/driver/ha/election.py +80 -0
- clustermesh-0.9.0/mesh/driver/job_manager.py +554 -0
- clustermesh-0.9.0/mesh/driver/library_installer.py +218 -0
- clustermesh-0.9.0/mesh/driver/server.py +246 -0
- clustermesh-0.9.0/mesh/execution/__init__.py +5 -0
- clustermesh-0.9.0/mesh/execution/executor.py +160 -0
- clustermesh-0.9.0/mesh/health/__init__.py +5 -0
- clustermesh-0.9.0/mesh/health/heartbeat.py +141 -0
- clustermesh-0.9.0/mesh/libraries/__init__.py +0 -0
- clustermesh-0.9.0/mesh/libraries/manager.py +87 -0
- clustermesh-0.9.0/mesh/memory/__init__.py +5 -0
- clustermesh-0.9.0/mesh/memory/fabric.py +174 -0
- clustermesh-0.9.0/mesh/meshvpn/__init__.py +6 -0
- clustermesh-0.9.0/mesh/meshvpn/coordinator.py +103 -0
- clustermesh-0.9.0/mesh/meshvpn/relay.py +128 -0
- clustermesh-0.9.0/mesh/meshvpn/site.py +68 -0
- clustermesh-0.9.0/mesh/models/__init__.py +15 -0
- clustermesh-0.9.0/mesh/models/enums.py +26 -0
- clustermesh-0.9.0/mesh/models/job.py +31 -0
- clustermesh-0.9.0/mesh/models/node.py +85 -0
- clustermesh-0.9.0/mesh/models/task.py +54 -0
- clustermesh-0.9.0/mesh/net/__init__.py +1 -0
- clustermesh-0.9.0/mesh/net/address.py +52 -0
- clustermesh-0.9.0/mesh/notebook/__init__.py +5 -0
- clustermesh-0.9.0/mesh/notebook/runner.py +100 -0
- clustermesh-0.9.0/mesh/proto/__init__.py +5 -0
- clustermesh-0.9.0/mesh/proto/mesh.proto +136 -0
- clustermesh-0.9.0/mesh/proto/mesh_pb2.py +68 -0
- clustermesh-0.9.0/mesh/proto/mesh_pb2_grpc.py +562 -0
- clustermesh-0.9.0/mesh/recovery/__init__.py +14 -0
- clustermesh-0.9.0/mesh/recovery/checkpoint.py +45 -0
- clustermesh-0.9.0/mesh/recovery/replication.py +102 -0
- clustermesh-0.9.0/mesh/recovery/speculation.py +59 -0
- clustermesh-0.9.0/mesh/recovery/work_stealing.py +66 -0
- clustermesh-0.9.0/mesh/scheduler/__init__.py +13 -0
- clustermesh-0.9.0/mesh/scheduler/benchmark.py +112 -0
- clustermesh-0.9.0/mesh/scheduler/placement.py +99 -0
- clustermesh-0.9.0/mesh/scheduler/pools.py +76 -0
- clustermesh-0.9.0/mesh/scheduler/rebalancing.py +68 -0
- clustermesh-0.9.0/mesh/scheduler/scoring.py +53 -0
- clustermesh-0.9.0/mesh/sdk/__init__.py +59 -0
- clustermesh-0.9.0/mesh/sdk/decorator.py +94 -0
- clustermesh-0.9.0/mesh/sdk/units.py +61 -0
- clustermesh-0.9.0/mesh/sim/__init__.py +8 -0
- clustermesh-0.9.0/mesh/sim/agent.py +72 -0
- clustermesh-0.9.0/mesh/sim/chaos.py +55 -0
- clustermesh-0.9.0/mesh/sim/clock.py +22 -0
- clustermesh-0.9.0/mesh/sim/cluster.py +186 -0
- clustermesh-0.9.0/mesh/sim/demo.py +76 -0
- clustermesh-0.9.0/mesh/sim/soak.py +171 -0
- clustermesh-0.9.0/mesh/state/__init__.py +7 -0
- clustermesh-0.9.0/mesh/state/factory.py +45 -0
- clustermesh-0.9.0/mesh/state/postgres_store.py +215 -0
- clustermesh-0.9.0/mesh/state/redis_store.py +134 -0
- clustermesh-0.9.0/mesh/state/serialize.py +125 -0
- clustermesh-0.9.0/mesh/state/sqlite_store.py +193 -0
- clustermesh-0.9.0/mesh/state/store.py +31 -0
- clustermesh-0.9.0/mesh/tasks/__init__.py +5 -0
- clustermesh-0.9.0/mesh/tasks/builtins.py +34 -0
- clustermesh-0.9.0/mesh/tasks/registry.py +30 -0
- clustermesh-0.9.0/mesh/worker/__init__.py +6 -0
- clustermesh-0.9.0/mesh/worker/runtime.py +63 -0
- clustermesh-0.9.0/mesh/worker/server.py +42 -0
- clustermesh-0.9.0/mesh/worker/state.py +114 -0
- clustermesh-0.9.0/mesh/worker/static/index.html +209 -0
- clustermesh-0.9.0/pyproject.toml +86 -0
- clustermesh-0.9.0/scripts/dogfood.sh +47 -0
- clustermesh-0.9.0/scripts/generate_proto.sh +13 -0
- clustermesh-0.9.0/scripts/install.sh +36 -0
- clustermesh-0.9.0/scripts/publish-pypi.sh +36 -0
- clustermesh-0.9.0/tests/__init__.py +0 -0
- clustermesh-0.9.0/tests/test_agent_address.py +25 -0
- clustermesh-0.9.0/tests/test_agent_monitor.py +62 -0
- clustermesh-0.9.0/tests/test_api.py +136 -0
- clustermesh-0.9.0/tests/test_execution.py +96 -0
- clustermesh-0.9.0/tests/test_grpc.py +120 -0
- clustermesh-0.9.0/tests/test_ha.py +73 -0
- clustermesh-0.9.0/tests/test_heartbeat.py +90 -0
- clustermesh-0.9.0/tests/test_host_metrics.py +20 -0
- clustermesh-0.9.0/tests/test_integration.py +131 -0
- clustermesh-0.9.0/tests/test_library.py +21 -0
- clustermesh-0.9.0/tests/test_library_installer.py +56 -0
- clustermesh-0.9.0/tests/test_notebook.py +42 -0
- clustermesh-0.9.0/tests/test_phase4.py +58 -0
- clustermesh-0.9.0/tests/test_phase6.py +152 -0
- clustermesh-0.9.0/tests/test_phase7.py +116 -0
- clustermesh-0.9.0/tests/test_phase8.py +100 -0
- clustermesh-0.9.0/tests/test_preemption.py +46 -0
- clustermesh-0.9.0/tests/test_scheduler.py +159 -0
- clustermesh-0.9.0/tests/test_sdk.py +96 -0
- clustermesh-0.9.0/tests/test_shell.py +77 -0
- clustermesh-0.9.0/tests/test_sim.py +135 -0
- clustermesh-0.9.0/tests/test_state_store.py +55 -0
- clustermesh-0.9.0/tests/test_worker.py +66 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 ClusterMesh Team
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: clustermesh
|
|
3
|
+
Version: 0.9.0
|
|
4
|
+
Summary: Enterprise compute fabric — pip install on any machine to join a compute cluster
|
|
5
|
+
Project-URL: Homepage, https://github.com/neetishsingh/ClusterMesh
|
|
6
|
+
Project-URL: Documentation, https://github.com/neetishsingh/ClusterMesh/blob/main/docs/join-mesh.md
|
|
7
|
+
Project-URL: Repository, https://github.com/neetishsingh/ClusterMesh
|
|
8
|
+
Author: ClusterMesh Team
|
|
9
|
+
License-Expression: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: cluster,compute,distributed,grpc,mesh,scheduler
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Environment :: Console
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: Intended Audience :: System Administrators
|
|
16
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
17
|
+
Classifier: Operating System :: OS Independent
|
|
18
|
+
Classifier: Programming Language :: Python :: 3
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
22
|
+
Classifier: Topic :: System :: Clustering
|
|
23
|
+
Classifier: Topic :: System :: Distributed Computing
|
|
24
|
+
Requires-Python: >=3.11
|
|
25
|
+
Requires-Dist: fastapi>=0.110
|
|
26
|
+
Requires-Dist: grpcio>=1.60
|
|
27
|
+
Requires-Dist: protobuf>=4.25
|
|
28
|
+
Requires-Dist: psutil>=5.9
|
|
29
|
+
Requires-Dist: pyyaml>=6.0
|
|
30
|
+
Requires-Dist: uvicorn[standard]>=0.27
|
|
31
|
+
Requires-Dist: websockets>=12.0
|
|
32
|
+
Requires-Dist: zeroconf>=0.131
|
|
33
|
+
Provides-Extra: dev
|
|
34
|
+
Requires-Dist: grpcio-tools>=1.60; extra == 'dev'
|
|
35
|
+
Requires-Dist: httpx>=0.27; extra == 'dev'
|
|
36
|
+
Requires-Dist: hypothesis>=6.100; extra == 'dev'
|
|
37
|
+
Requires-Dist: pytest-cov>=5.0; extra == 'dev'
|
|
38
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
39
|
+
Provides-Extra: discovery
|
|
40
|
+
Requires-Dist: zeroconf>=0.131; extra == 'discovery'
|
|
41
|
+
Provides-Extra: phase6
|
|
42
|
+
Requires-Dist: psycopg[binary]>=3.1; extra == 'phase6'
|
|
43
|
+
Requires-Dist: redis>=5.0; extra == 'phase6'
|
|
44
|
+
Requires-Dist: zeroconf>=0.131; extra == 'phase6'
|
|
45
|
+
Provides-Extra: postgres
|
|
46
|
+
Requires-Dist: psycopg[binary]>=3.1; extra == 'postgres'
|
|
47
|
+
Provides-Extra: redis
|
|
48
|
+
Requires-Dist: redis>=5.0; extra == 'redis'
|
|
49
|
+
Description-Content-Type: text/markdown
|
|
50
|
+
|
|
51
|
+
# ClusterMesh (ComputeMesh)
|
|
52
|
+
|
|
53
|
+
**An operating system for enterprise compute** — turn every laptop, desktop, VM, and GPU workstation into a single elastic, fault-tolerant compute cloud.
|
|
54
|
+
|
|
55
|
+
> Full vision: [Sparkpool](./Sparkpool) · Architecture: [docs/architecture.md](./docs/architecture.md) · Roadmap: [docs/roadmap.md](./docs/roadmap.md)
|
|
56
|
+
|
|
57
|
+
## The Problem
|
|
58
|
+
|
|
59
|
+
Organizations sit on thousands of idle cores:
|
|
60
|
+
|
|
61
|
+
| Resource | Typical utilization |
|
|
62
|
+
|----------|---------------------|
|
|
63
|
+
| CPU | 10–20% |
|
|
64
|
+
| RAM | 30–50% |
|
|
65
|
+
| GPU | 5–10% |
|
|
66
|
+
|
|
67
|
+
Databricks, Kubernetes, Spark, and Ray all require **dedicated** compute. Nobody fully solves:
|
|
68
|
+
|
|
69
|
+
> *"Use all idle enterprise hardware automatically and safely."*
|
|
70
|
+
|
|
71
|
+
ClusterMesh does.
|
|
72
|
+
|
|
73
|
+
## What We're Building
|
|
74
|
+
|
|
75
|
+
```
|
|
76
|
+
Control Plane
|
|
77
|
+
│
|
|
78
|
+
┌─────────────────┼─────────────────┐
|
|
79
|
+
│ │ │
|
|
80
|
+
Metadata Service Scheduler Service Auth Service
|
|
81
|
+
│ │ │
|
|
82
|
+
└─────────────────┼─────────────────┘
|
|
83
|
+
│
|
|
84
|
+
Driver Cluster (Raft HA)
|
|
85
|
+
│
|
|
86
|
+
┌────────────────────┼────────────────────┐
|
|
87
|
+
│ │ │
|
|
88
|
+
Agent-1 Agent-2 Agent-3
|
|
89
|
+
Laptop Desktop VM
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
**Killer features:** idle compute harvesting · GPU sharing · live discovery · fault-tolerant scheduling · work stealing · preemption handling · checkpoint recovery · multi-office clustering
|
|
93
|
+
|
|
94
|
+
## Join a worker (any Python machine)
|
|
95
|
+
|
|
96
|
+
```bash
|
|
97
|
+
pip install clustermesh
|
|
98
|
+
clustermesh join DRIVER_IP:50050 --open # local worker UI on :50052
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
See [docs/join-mesh.md](./docs/join-mesh.md) for full details.
|
|
102
|
+
|
|
103
|
+
## Quick Start (development)
|
|
104
|
+
|
|
105
|
+
```bash
|
|
106
|
+
# Install in development mode
|
|
107
|
+
python -m venv .venv
|
|
108
|
+
source .venv/bin/activate
|
|
109
|
+
pip install -e ".[dev]"
|
|
110
|
+
|
|
111
|
+
# Run tests
|
|
112
|
+
pytest
|
|
113
|
+
|
|
114
|
+
# Run a simulated 50-node cluster demo
|
|
115
|
+
python -m mesh.sim.demo
|
|
116
|
+
|
|
117
|
+
# Phase 5: platform with React dashboard (build UI first)
|
|
118
|
+
cd frontend && npm install && npm run build && cd ..
|
|
119
|
+
mesh-platform --port 8080 --db clustermesh.db # driver + API + UI
|
|
120
|
+
# Phase 6 options: mDNS discovery, external state store, API key auth
|
|
121
|
+
mesh-platform --port 8080 --mdns --site bangalore # advertise via mDNS
|
|
122
|
+
mesh-platform --store-url postgres://user:pass@localhost/clustermesh
|
|
123
|
+
mesh-platform --api-key your-secret-key # require auth on API
|
|
124
|
+
mesh-agent --discover # auto-find driver on LAN
|
|
125
|
+
|
|
126
|
+
# Phase 7: multi-site mesh VPN
|
|
127
|
+
mesh-platform --mesh-config config/sites.example.yaml --site bangalore
|
|
128
|
+
mesh-relay --listen 0.0.0.0:6000 --target 127.0.0.1:50050 # standalone relay
|
|
129
|
+
mesh-soak --hours 24 --nodes 50 # accelerated 24h chaos test
|
|
130
|
+
mesh-bench --nodes 1000 # placement SLA benchmark
|
|
131
|
+
./scripts/dogfood.sh # local dogfood run
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
## Project Structure
|
|
135
|
+
|
|
136
|
+
```
|
|
137
|
+
ClusterMesh/
|
|
138
|
+
├── docs/ # Architecture, testing strategy, roadmap
|
|
139
|
+
├── mesh/ # Core Python package
|
|
140
|
+
│ ├── models/ # Node, Task, Job, Resource types
|
|
141
|
+
│ ├── health/ # Heartbeat FSM, node health tracking
|
|
142
|
+
│ ├── scheduler/ # Scoring, placement, pool routing
|
|
143
|
+
│ ├── execution/ # TaskExecutor, TaskContext
|
|
144
|
+
│ ├── recovery/ # Checkpointing, work stealing, replication
|
|
145
|
+
│ ├── driver/ # JobManager, DriverCluster, gRPC server
|
|
146
|
+
│ ├── agent/ # Daemon, monitor, preemption, library
|
|
147
|
+
│ ├── proto/ # gRPC protobuf definitions
|
|
148
|
+
│ ├── tasks/ # Task registry + built-ins
|
|
149
|
+
│ ├── sdk/ # @task decorator, submit() API
|
|
150
|
+
│ └── sim/ # SimAgent, SimCluster, chaos injection
|
|
151
|
+
├── tests/ # Unit + integration tests
|
|
152
|
+
├── frontend/ # React dashboard (Vite + Tailwind)
|
|
153
|
+
└── Sparkpool # Original product vision document
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
## Current Status (Phase 8) ✅
|
|
157
|
+
|
|
158
|
+
| Component | Status |
|
|
159
|
+
|-----------|--------|
|
|
160
|
+
| Phases 0–7 (full platform + mesh VPN) | ✅ Done |
|
|
161
|
+
| Distributed memory fabric | ✅ Done |
|
|
162
|
+
| 1000-node placement SLA (`mesh-bench`) | ✅ Done |
|
|
163
|
+
| Memory dashboard + dogfood script | ✅ Done |
|
|
164
|
+
|
|
165
|
+
## Developer SDK
|
|
166
|
+
|
|
167
|
+
```python
|
|
168
|
+
from mesh import task, submit, TaskContext
|
|
169
|
+
|
|
170
|
+
@task(cpu=4, ram="8GB", checkpoint=True, total_work=1_000_000)
|
|
171
|
+
def process_records(ctx: TaskContext):
|
|
172
|
+
for i in range(int(ctx.progress), 1_000_000):
|
|
173
|
+
ctx.set_progress(i + 1, records=i + 1)
|
|
174
|
+
return "done"
|
|
175
|
+
|
|
176
|
+
# Sync submit — blocks until complete
|
|
177
|
+
result = submit(process_records)
|
|
178
|
+
|
|
179
|
+
# Async submit — returns JobHandle
|
|
180
|
+
job = submit(process_records, async_=True)
|
|
181
|
+
result = job.wait(timeout=3600)
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
See [docs/api-spec.md](./docs/api-spec.md) for the full SDK specification.
|
|
185
|
+
|
|
186
|
+
## Documentation
|
|
187
|
+
|
|
188
|
+
| Document | Description |
|
|
189
|
+
|----------|-------------|
|
|
190
|
+
| [Architecture](./docs/architecture.md) | System design, components, data flows |
|
|
191
|
+
| [Fault Tolerance](./docs/fault-tolerance.md) | All 10 recovery mechanisms in detail |
|
|
192
|
+
| [Testing Strategy](./docs/testing-strategy.md) | Test pyramid, scenarios, SLAs |
|
|
193
|
+
| [Roadmap](./docs/roadmap.md) | Phased build plan with milestones |
|
|
194
|
+
| [API Spec](./docs/api-spec.md) | Developer SDK and internal APIs |
|
|
195
|
+
| [Join mesh](./docs/join-mesh.md) | Installing with `pip install clustermesh` and using the worker CLI |
|
|
196
|
+
| [Publish to PyPI](./docs/publish-pypi.md) | Build, token setup, and upload guide |
|
|
197
|
+
|
|
198
|
+
## License
|
|
199
|
+
|
|
200
|
+
MIT — see [LICENSE](./LICENSE).
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
# ClusterMesh (ComputeMesh)
|
|
2
|
+
|
|
3
|
+
**An operating system for enterprise compute** — turn every laptop, desktop, VM, and GPU workstation into a single elastic, fault-tolerant compute cloud.
|
|
4
|
+
|
|
5
|
+
> Full vision: [Sparkpool](./Sparkpool) · Architecture: [docs/architecture.md](./docs/architecture.md) · Roadmap: [docs/roadmap.md](./docs/roadmap.md)
|
|
6
|
+
|
|
7
|
+
## The Problem
|
|
8
|
+
|
|
9
|
+
Organizations sit on thousands of idle cores:
|
|
10
|
+
|
|
11
|
+
| Resource | Typical utilization |
|
|
12
|
+
|----------|---------------------|
|
|
13
|
+
| CPU | 10–20% |
|
|
14
|
+
| RAM | 30–50% |
|
|
15
|
+
| GPU | 5–10% |
|
|
16
|
+
|
|
17
|
+
Databricks, Kubernetes, Spark, and Ray all require **dedicated** compute. Nobody fully solves:
|
|
18
|
+
|
|
19
|
+
> *"Use all idle enterprise hardware automatically and safely."*
|
|
20
|
+
|
|
21
|
+
ClusterMesh does.
|
|
22
|
+
|
|
23
|
+
## What We're Building
|
|
24
|
+
|
|
25
|
+
```
|
|
26
|
+
Control Plane
|
|
27
|
+
│
|
|
28
|
+
┌─────────────────┼─────────────────┐
|
|
29
|
+
│ │ │
|
|
30
|
+
Metadata Service Scheduler Service Auth Service
|
|
31
|
+
│ │ │
|
|
32
|
+
└─────────────────┼─────────────────┘
|
|
33
|
+
│
|
|
34
|
+
Driver Cluster (Raft HA)
|
|
35
|
+
│
|
|
36
|
+
┌────────────────────┼────────────────────┐
|
|
37
|
+
│ │ │
|
|
38
|
+
Agent-1 Agent-2 Agent-3
|
|
39
|
+
Laptop Desktop VM
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
**Killer features:** idle compute harvesting · GPU sharing · live discovery · fault-tolerant scheduling · work stealing · preemption handling · checkpoint recovery · multi-office clustering
|
|
43
|
+
|
|
44
|
+
## Join a worker (any Python machine)
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
pip install clustermesh
|
|
48
|
+
clustermesh join DRIVER_IP:50050 --open # local worker UI on :50052
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
See [docs/join-mesh.md](./docs/join-mesh.md) for full details.
|
|
52
|
+
|
|
53
|
+
## Quick Start (development)
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
# Install in development mode
|
|
57
|
+
python -m venv .venv
|
|
58
|
+
source .venv/bin/activate
|
|
59
|
+
pip install -e ".[dev]"
|
|
60
|
+
|
|
61
|
+
# Run tests
|
|
62
|
+
pytest
|
|
63
|
+
|
|
64
|
+
# Run a simulated 50-node cluster demo
|
|
65
|
+
python -m mesh.sim.demo
|
|
66
|
+
|
|
67
|
+
# Phase 5: platform with React dashboard (build UI first)
|
|
68
|
+
cd frontend && npm install && npm run build && cd ..
|
|
69
|
+
mesh-platform --port 8080 --db clustermesh.db # driver + API + UI
|
|
70
|
+
# Phase 6 options: mDNS discovery, external state store, API key auth
|
|
71
|
+
mesh-platform --port 8080 --mdns --site bangalore # advertise via mDNS
|
|
72
|
+
mesh-platform --store-url postgres://user:pass@localhost/clustermesh
|
|
73
|
+
mesh-platform --api-key your-secret-key # require auth on API
|
|
74
|
+
mesh-agent --discover # auto-find driver on LAN
|
|
75
|
+
|
|
76
|
+
# Phase 7: multi-site mesh VPN
|
|
77
|
+
mesh-platform --mesh-config config/sites.example.yaml --site bangalore
|
|
78
|
+
mesh-relay --listen 0.0.0.0:6000 --target 127.0.0.1:50050 # standalone relay
|
|
79
|
+
mesh-soak --hours 24 --nodes 50 # accelerated 24h chaos test
|
|
80
|
+
mesh-bench --nodes 1000 # placement SLA benchmark
|
|
81
|
+
./scripts/dogfood.sh # local dogfood run
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
## Project Structure
|
|
85
|
+
|
|
86
|
+
```
|
|
87
|
+
ClusterMesh/
|
|
88
|
+
├── docs/ # Architecture, testing strategy, roadmap
|
|
89
|
+
├── mesh/ # Core Python package
|
|
90
|
+
│ ├── models/ # Node, Task, Job, Resource types
|
|
91
|
+
│ ├── health/ # Heartbeat FSM, node health tracking
|
|
92
|
+
│ ├── scheduler/ # Scoring, placement, pool routing
|
|
93
|
+
│ ├── execution/ # TaskExecutor, TaskContext
|
|
94
|
+
│ ├── recovery/ # Checkpointing, work stealing, replication
|
|
95
|
+
│ ├── driver/ # JobManager, DriverCluster, gRPC server
|
|
96
|
+
│ ├── agent/ # Daemon, monitor, preemption, library
|
|
97
|
+
│ ├── proto/ # gRPC protobuf definitions
|
|
98
|
+
│ ├── tasks/ # Task registry + built-ins
|
|
99
|
+
│ ├── sdk/ # @task decorator, submit() API
|
|
100
|
+
│ └── sim/ # SimAgent, SimCluster, chaos injection
|
|
101
|
+
├── tests/ # Unit + integration tests
|
|
102
|
+
├── frontend/ # React dashboard (Vite + Tailwind)
|
|
103
|
+
└── Sparkpool # Original product vision document
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
## Current Status (Phase 8) ✅
|
|
107
|
+
|
|
108
|
+
| Component | Status |
|
|
109
|
+
|-----------|--------|
|
|
110
|
+
| Phases 0–7 (full platform + mesh VPN) | ✅ Done |
|
|
111
|
+
| Distributed memory fabric | ✅ Done |
|
|
112
|
+
| 1000-node placement SLA (`mesh-bench`) | ✅ Done |
|
|
113
|
+
| Memory dashboard + dogfood script | ✅ Done |
|
|
114
|
+
|
|
115
|
+
## Developer SDK
|
|
116
|
+
|
|
117
|
+
```python
|
|
118
|
+
from mesh import task, submit, TaskContext
|
|
119
|
+
|
|
120
|
+
@task(cpu=4, ram="8GB", checkpoint=True, total_work=1_000_000)
|
|
121
|
+
def process_records(ctx: TaskContext):
|
|
122
|
+
for i in range(int(ctx.progress), 1_000_000):
|
|
123
|
+
ctx.set_progress(i + 1, records=i + 1)
|
|
124
|
+
return "done"
|
|
125
|
+
|
|
126
|
+
# Sync submit — blocks until complete
|
|
127
|
+
result = submit(process_records)
|
|
128
|
+
|
|
129
|
+
# Async submit — returns JobHandle
|
|
130
|
+
job = submit(process_records, async_=True)
|
|
131
|
+
result = job.wait(timeout=3600)
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
See [docs/api-spec.md](./docs/api-spec.md) for the full SDK specification.
|
|
135
|
+
|
|
136
|
+
## Documentation
|
|
137
|
+
|
|
138
|
+
| Document | Description |
|
|
139
|
+
|----------|-------------|
|
|
140
|
+
| [Architecture](./docs/architecture.md) | System design, components, data flows |
|
|
141
|
+
| [Fault Tolerance](./docs/fault-tolerance.md) | All 10 recovery mechanisms in detail |
|
|
142
|
+
| [Testing Strategy](./docs/testing-strategy.md) | Test pyramid, scenarios, SLAs |
|
|
143
|
+
| [Roadmap](./docs/roadmap.md) | Phased build plan with milestones |
|
|
144
|
+
| [API Spec](./docs/api-spec.md) | Developer SDK and internal APIs |
|
|
145
|
+
| [Join mesh](./docs/join-mesh.md) | Installing with `pip install clustermesh` and using the worker CLI |
|
|
146
|
+
| [Publish to PyPI](./docs/publish-pypi.md) | Build, token setup, and upload guide |
|
|
147
|
+
|
|
148
|
+
## License
|
|
149
|
+
|
|
150
|
+
MIT — see [LICENSE](./LICENSE).
|