clustermesh 0.9.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (175) hide show
  1. clustermesh-0.9.0/.gitignore +19 -0
  2. clustermesh-0.9.0/LICENSE +21 -0
  3. clustermesh-0.9.0/PKG-INFO +200 -0
  4. clustermesh-0.9.0/README.md +150 -0
  5. clustermesh-0.9.0/Sparkpool +592 -0
  6. clustermesh-0.9.0/config/sites.example.yaml +21 -0
  7. clustermesh-0.9.0/docs/api-spec.md +206 -0
  8. clustermesh-0.9.0/docs/architecture.md +188 -0
  9. clustermesh-0.9.0/docs/fault-tolerance.md +249 -0
  10. clustermesh-0.9.0/docs/join-mesh.md +84 -0
  11. clustermesh-0.9.0/docs/publish-git.md +132 -0
  12. clustermesh-0.9.0/docs/publish-pypi.md +256 -0
  13. clustermesh-0.9.0/docs/roadmap.md +186 -0
  14. clustermesh-0.9.0/docs/testing-strategy.md +172 -0
  15. clustermesh-0.9.0/examples/checkpoint_recovery.py +53 -0
  16. clustermesh-0.9.0/examples/phase3_demo.py +9 -0
  17. clustermesh-0.9.0/examples/run_agent.py +6 -0
  18. clustermesh-0.9.0/examples/run_driver.py +6 -0
  19. clustermesh-0.9.0/examples/run_remote_job.py +30 -0
  20. clustermesh-0.9.0/frontend/.gitignore +4 -0
  21. clustermesh-0.9.0/frontend/index.html +19 -0
  22. clustermesh-0.9.0/frontend/package-lock.json +2860 -0
  23. clustermesh-0.9.0/frontend/package.json +29 -0
  24. clustermesh-0.9.0/frontend/postcss.config.js +6 -0
  25. clustermesh-0.9.0/frontend/public/favicon.svg +4 -0
  26. clustermesh-0.9.0/frontend/src/App.tsx +33 -0
  27. clustermesh-0.9.0/frontend/src/api/client.ts +305 -0
  28. clustermesh-0.9.0/frontend/src/api/useStream.ts +79 -0
  29. clustermesh-0.9.0/frontend/src/components/Badge.tsx +20 -0
  30. clustermesh-0.9.0/frontend/src/components/Header.tsx +61 -0
  31. clustermesh-0.9.0/frontend/src/components/HostMetricsPanel.tsx +139 -0
  32. clustermesh-0.9.0/frontend/src/components/Layout.tsx +38 -0
  33. clustermesh-0.9.0/frontend/src/components/NodeTerminal.tsx +106 -0
  34. clustermesh-0.9.0/frontend/src/components/ProgressBar.tsx +22 -0
  35. clustermesh-0.9.0/frontend/src/components/Sidebar.tsx +66 -0
  36. clustermesh-0.9.0/frontend/src/components/StatCard.tsx +36 -0
  37. clustermesh-0.9.0/frontend/src/index.css +44 -0
  38. clustermesh-0.9.0/frontend/src/lib/utils.ts +47 -0
  39. clustermesh-0.9.0/frontend/src/main.tsx +10 -0
  40. clustermesh-0.9.0/frontend/src/pages/Cluster.tsx +113 -0
  41. clustermesh-0.9.0/frontend/src/pages/JobDetail.tsx +96 -0
  42. clustermesh-0.9.0/frontend/src/pages/Jobs.tsx +109 -0
  43. clustermesh-0.9.0/frontend/src/pages/Libraries.tsx +277 -0
  44. clustermesh-0.9.0/frontend/src/pages/Logs.tsx +163 -0
  45. clustermesh-0.9.0/frontend/src/pages/Memory.tsx +145 -0
  46. clustermesh-0.9.0/frontend/src/pages/Mesh.tsx +174 -0
  47. clustermesh-0.9.0/frontend/src/pages/Nodes.tsx +174 -0
  48. clustermesh-0.9.0/frontend/src/pages/Notebook.tsx +253 -0
  49. clustermesh-0.9.0/frontend/src/pages/Overview.tsx +180 -0
  50. clustermesh-0.9.0/frontend/src/vite-env.d.ts +1 -0
  51. clustermesh-0.9.0/frontend/tailwind.config.js +32 -0
  52. clustermesh-0.9.0/frontend/tsconfig.app.json +22 -0
  53. clustermesh-0.9.0/frontend/tsconfig.json +7 -0
  54. clustermesh-0.9.0/frontend/tsconfig.node.json +10 -0
  55. clustermesh-0.9.0/frontend/vite.config.js +19 -0
  56. clustermesh-0.9.0/frontend/vite.config.ts +20 -0
  57. clustermesh-0.9.0/mesh/__init__.py +7 -0
  58. clustermesh-0.9.0/mesh/agent/__init__.py +7 -0
  59. clustermesh-0.9.0/mesh/agent/client.py +120 -0
  60. clustermesh-0.9.0/mesh/agent/config.py +28 -0
  61. clustermesh-0.9.0/mesh/agent/daemon.py +192 -0
  62. clustermesh-0.9.0/mesh/agent/executor.py +119 -0
  63. clustermesh-0.9.0/mesh/agent/host_metrics.py +232 -0
  64. clustermesh-0.9.0/mesh/agent/library.py +5 -0
  65. clustermesh-0.9.0/mesh/agent/monitor.py +197 -0
  66. clustermesh-0.9.0/mesh/agent/preemption.py +39 -0
  67. clustermesh-0.9.0/mesh/agent/server.py +108 -0
  68. clustermesh-0.9.0/mesh/agent/shell.py +84 -0
  69. clustermesh-0.9.0/mesh/api/__init__.py +6 -0
  70. clustermesh-0.9.0/mesh/api/app.py +401 -0
  71. clustermesh-0.9.0/mesh/api/auth.py +72 -0
  72. clustermesh-0.9.0/mesh/api/context.py +250 -0
  73. clustermesh-0.9.0/mesh/api/events.py +121 -0
  74. clustermesh-0.9.0/mesh/api/server.py +189 -0
  75. clustermesh-0.9.0/mesh/cli.py +196 -0
  76. clustermesh-0.9.0/mesh/discovery/__init__.py +5 -0
  77. clustermesh-0.9.0/mesh/discovery/mdns.py +160 -0
  78. clustermesh-0.9.0/mesh/driver/__init__.py +14 -0
  79. clustermesh-0.9.0/mesh/driver/cluster.py +170 -0
  80. clustermesh-0.9.0/mesh/driver/ha/__init__.py +6 -0
  81. clustermesh-0.9.0/mesh/driver/ha/coordinator.py +44 -0
  82. clustermesh-0.9.0/mesh/driver/ha/election.py +80 -0
  83. clustermesh-0.9.0/mesh/driver/job_manager.py +554 -0
  84. clustermesh-0.9.0/mesh/driver/library_installer.py +218 -0
  85. clustermesh-0.9.0/mesh/driver/server.py +246 -0
  86. clustermesh-0.9.0/mesh/execution/__init__.py +5 -0
  87. clustermesh-0.9.0/mesh/execution/executor.py +160 -0
  88. clustermesh-0.9.0/mesh/health/__init__.py +5 -0
  89. clustermesh-0.9.0/mesh/health/heartbeat.py +141 -0
  90. clustermesh-0.9.0/mesh/libraries/__init__.py +0 -0
  91. clustermesh-0.9.0/mesh/libraries/manager.py +87 -0
  92. clustermesh-0.9.0/mesh/memory/__init__.py +5 -0
  93. clustermesh-0.9.0/mesh/memory/fabric.py +174 -0
  94. clustermesh-0.9.0/mesh/meshvpn/__init__.py +6 -0
  95. clustermesh-0.9.0/mesh/meshvpn/coordinator.py +103 -0
  96. clustermesh-0.9.0/mesh/meshvpn/relay.py +128 -0
  97. clustermesh-0.9.0/mesh/meshvpn/site.py +68 -0
  98. clustermesh-0.9.0/mesh/models/__init__.py +15 -0
  99. clustermesh-0.9.0/mesh/models/enums.py +26 -0
  100. clustermesh-0.9.0/mesh/models/job.py +31 -0
  101. clustermesh-0.9.0/mesh/models/node.py +85 -0
  102. clustermesh-0.9.0/mesh/models/task.py +54 -0
  103. clustermesh-0.9.0/mesh/net/__init__.py +1 -0
  104. clustermesh-0.9.0/mesh/net/address.py +52 -0
  105. clustermesh-0.9.0/mesh/notebook/__init__.py +5 -0
  106. clustermesh-0.9.0/mesh/notebook/runner.py +100 -0
  107. clustermesh-0.9.0/mesh/proto/__init__.py +5 -0
  108. clustermesh-0.9.0/mesh/proto/mesh.proto +136 -0
  109. clustermesh-0.9.0/mesh/proto/mesh_pb2.py +68 -0
  110. clustermesh-0.9.0/mesh/proto/mesh_pb2_grpc.py +562 -0
  111. clustermesh-0.9.0/mesh/recovery/__init__.py +14 -0
  112. clustermesh-0.9.0/mesh/recovery/checkpoint.py +45 -0
  113. clustermesh-0.9.0/mesh/recovery/replication.py +102 -0
  114. clustermesh-0.9.0/mesh/recovery/speculation.py +59 -0
  115. clustermesh-0.9.0/mesh/recovery/work_stealing.py +66 -0
  116. clustermesh-0.9.0/mesh/scheduler/__init__.py +13 -0
  117. clustermesh-0.9.0/mesh/scheduler/benchmark.py +112 -0
  118. clustermesh-0.9.0/mesh/scheduler/placement.py +99 -0
  119. clustermesh-0.9.0/mesh/scheduler/pools.py +76 -0
  120. clustermesh-0.9.0/mesh/scheduler/rebalancing.py +68 -0
  121. clustermesh-0.9.0/mesh/scheduler/scoring.py +53 -0
  122. clustermesh-0.9.0/mesh/sdk/__init__.py +59 -0
  123. clustermesh-0.9.0/mesh/sdk/decorator.py +94 -0
  124. clustermesh-0.9.0/mesh/sdk/units.py +61 -0
  125. clustermesh-0.9.0/mesh/sim/__init__.py +8 -0
  126. clustermesh-0.9.0/mesh/sim/agent.py +72 -0
  127. clustermesh-0.9.0/mesh/sim/chaos.py +55 -0
  128. clustermesh-0.9.0/mesh/sim/clock.py +22 -0
  129. clustermesh-0.9.0/mesh/sim/cluster.py +186 -0
  130. clustermesh-0.9.0/mesh/sim/demo.py +76 -0
  131. clustermesh-0.9.0/mesh/sim/soak.py +171 -0
  132. clustermesh-0.9.0/mesh/state/__init__.py +7 -0
  133. clustermesh-0.9.0/mesh/state/factory.py +45 -0
  134. clustermesh-0.9.0/mesh/state/postgres_store.py +215 -0
  135. clustermesh-0.9.0/mesh/state/redis_store.py +134 -0
  136. clustermesh-0.9.0/mesh/state/serialize.py +125 -0
  137. clustermesh-0.9.0/mesh/state/sqlite_store.py +193 -0
  138. clustermesh-0.9.0/mesh/state/store.py +31 -0
  139. clustermesh-0.9.0/mesh/tasks/__init__.py +5 -0
  140. clustermesh-0.9.0/mesh/tasks/builtins.py +34 -0
  141. clustermesh-0.9.0/mesh/tasks/registry.py +30 -0
  142. clustermesh-0.9.0/mesh/worker/__init__.py +6 -0
  143. clustermesh-0.9.0/mesh/worker/runtime.py +63 -0
  144. clustermesh-0.9.0/mesh/worker/server.py +42 -0
  145. clustermesh-0.9.0/mesh/worker/state.py +114 -0
  146. clustermesh-0.9.0/mesh/worker/static/index.html +209 -0
  147. clustermesh-0.9.0/pyproject.toml +86 -0
  148. clustermesh-0.9.0/scripts/dogfood.sh +47 -0
  149. clustermesh-0.9.0/scripts/generate_proto.sh +13 -0
  150. clustermesh-0.9.0/scripts/install.sh +36 -0
  151. clustermesh-0.9.0/scripts/publish-pypi.sh +36 -0
  152. clustermesh-0.9.0/tests/__init__.py +0 -0
  153. clustermesh-0.9.0/tests/test_agent_address.py +25 -0
  154. clustermesh-0.9.0/tests/test_agent_monitor.py +62 -0
  155. clustermesh-0.9.0/tests/test_api.py +136 -0
  156. clustermesh-0.9.0/tests/test_execution.py +96 -0
  157. clustermesh-0.9.0/tests/test_grpc.py +120 -0
  158. clustermesh-0.9.0/tests/test_ha.py +73 -0
  159. clustermesh-0.9.0/tests/test_heartbeat.py +90 -0
  160. clustermesh-0.9.0/tests/test_host_metrics.py +20 -0
  161. clustermesh-0.9.0/tests/test_integration.py +131 -0
  162. clustermesh-0.9.0/tests/test_library.py +21 -0
  163. clustermesh-0.9.0/tests/test_library_installer.py +56 -0
  164. clustermesh-0.9.0/tests/test_notebook.py +42 -0
  165. clustermesh-0.9.0/tests/test_phase4.py +58 -0
  166. clustermesh-0.9.0/tests/test_phase6.py +152 -0
  167. clustermesh-0.9.0/tests/test_phase7.py +116 -0
  168. clustermesh-0.9.0/tests/test_phase8.py +100 -0
  169. clustermesh-0.9.0/tests/test_preemption.py +46 -0
  170. clustermesh-0.9.0/tests/test_scheduler.py +159 -0
  171. clustermesh-0.9.0/tests/test_sdk.py +96 -0
  172. clustermesh-0.9.0/tests/test_shell.py +77 -0
  173. clustermesh-0.9.0/tests/test_sim.py +135 -0
  174. clustermesh-0.9.0/tests/test_state_store.py +55 -0
  175. clustermesh-0.9.0/tests/test_worker.py +66 -0
@@ -0,0 +1,19 @@
1
+ *.pyc
2
+ __pycache__/
3
+ .venv/
4
+ *.egg-info/
5
+ dist/
6
+ build/
7
+ .coverage
8
+ htmlcov/
9
+ .pytest_cache/
10
+ .mypy_cache/
11
+ .DS_Store
12
+ node_modules/
13
+ frontend/dist/
14
+ *.local
15
+ .env
16
+ .env.*
17
+ !.env.example
18
+ *.db
19
+ *.tsbuildinfo
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 ClusterMesh Team
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,200 @@
1
+ Metadata-Version: 2.4
2
+ Name: clustermesh
3
+ Version: 0.9.0
4
+ Summary: Enterprise compute fabric — pip install on any machine to join a compute cluster
5
+ Project-URL: Homepage, https://github.com/neetishsingh/ClusterMesh
6
+ Project-URL: Documentation, https://github.com/neetishsingh/ClusterMesh/blob/main/docs/join-mesh.md
7
+ Project-URL: Repository, https://github.com/neetishsingh/ClusterMesh
8
+ Author: ClusterMesh Team
9
+ License-Expression: MIT
10
+ License-File: LICENSE
11
+ Keywords: cluster,compute,distributed,grpc,mesh,scheduler
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Environment :: Console
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: Intended Audience :: System Administrators
16
+ Classifier: License :: OSI Approved :: MIT License
17
+ Classifier: Operating System :: OS Independent
18
+ Classifier: Programming Language :: Python :: 3
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Programming Language :: Python :: 3.13
22
+ Classifier: Topic :: System :: Clustering
23
+ Classifier: Topic :: System :: Distributed Computing
24
+ Requires-Python: >=3.11
25
+ Requires-Dist: fastapi>=0.110
26
+ Requires-Dist: grpcio>=1.60
27
+ Requires-Dist: protobuf>=4.25
28
+ Requires-Dist: psutil>=5.9
29
+ Requires-Dist: pyyaml>=6.0
30
+ Requires-Dist: uvicorn[standard]>=0.27
31
+ Requires-Dist: websockets>=12.0
32
+ Requires-Dist: zeroconf>=0.131
33
+ Provides-Extra: dev
34
+ Requires-Dist: grpcio-tools>=1.60; extra == 'dev'
35
+ Requires-Dist: httpx>=0.27; extra == 'dev'
36
+ Requires-Dist: hypothesis>=6.100; extra == 'dev'
37
+ Requires-Dist: pytest-cov>=5.0; extra == 'dev'
38
+ Requires-Dist: pytest>=8.0; extra == 'dev'
39
+ Provides-Extra: discovery
40
+ Requires-Dist: zeroconf>=0.131; extra == 'discovery'
41
+ Provides-Extra: phase6
42
+ Requires-Dist: psycopg[binary]>=3.1; extra == 'phase6'
43
+ Requires-Dist: redis>=5.0; extra == 'phase6'
44
+ Requires-Dist: zeroconf>=0.131; extra == 'phase6'
45
+ Provides-Extra: postgres
46
+ Requires-Dist: psycopg[binary]>=3.1; extra == 'postgres'
47
+ Provides-Extra: redis
48
+ Requires-Dist: redis>=5.0; extra == 'redis'
49
+ Description-Content-Type: text/markdown
50
+
51
+ # ClusterMesh (ComputeMesh)
52
+
53
+ **An operating system for enterprise compute** — turn every laptop, desktop, VM, and GPU workstation into a single elastic, fault-tolerant compute cloud.
54
+
55
+ > Full vision: [Sparkpool](./Sparkpool) · Architecture: [docs/architecture.md](./docs/architecture.md) · Roadmap: [docs/roadmap.md](./docs/roadmap.md)
56
+
57
+ ## The Problem
58
+
59
+ Organizations sit on thousands of idle cores:
60
+
61
+ | Resource | Typical utilization |
62
+ |----------|---------------------|
63
+ | CPU | 10–20% |
64
+ | RAM | 30–50% |
65
+ | GPU | 5–10% |
66
+
67
+ Databricks, Kubernetes, Spark, and Ray all require **dedicated** compute. None of them fully solves the core problem:
68
+
69
+ > *"Use all idle enterprise hardware automatically and safely."*
70
+
71
+ ClusterMesh does.
72
+
73
+ ## What We're Building
74
+
75
+ ```
76
+                       Control Plane
77
+                             │
78
+           ┌─────────────────┼─────────────────┐
79
+           │                 │                 │
80
+   Metadata Service  Scheduler Service   Auth Service
81
+           │                 │                 │
82
+           └─────────────────┼─────────────────┘
83
+                             │
84
+                 Driver Cluster (Raft HA)
85
+                             │
86
+        ┌────────────────────┼────────────────────┐
87
+        │                    │                    │
88
+     Agent-1              Agent-2              Agent-3
89
+     Laptop               Desktop                VM
90
+ ```
91
+
92
+ **Killer features:** idle compute harvesting · GPU sharing · live discovery · fault-tolerant scheduling · work stealing · preemption handling · checkpoint recovery · multi-office clustering
93
+
94
+ ## Join as a worker (any Python machine)
95
+
96
+ ```bash
97
+ pip install clustermesh
98
+ clustermesh join DRIVER_IP:50050 --open # local worker UI on :50052
99
+ ```
100
+
101
+ See [docs/join-mesh.md](./docs/join-mesh.md) for full details.
102
+
103
+ ## Quick Start (development)
104
+
105
+ ```bash
106
+ # Install in development mode
107
+ python -m venv .venv
108
+ source .venv/bin/activate
109
+ pip install -e ".[dev]"
110
+
111
+ # Run tests
112
+ pytest
113
+
114
+ # Run a simulated 50-node cluster demo
115
+ python -m mesh.sim.demo
116
+
117
+ # Phase 5: platform with React dashboard (build UI first)
118
+ cd frontend && npm install && npm run build && cd ..
119
+ mesh-platform --port 8080 --db clustermesh.db # driver + API + UI
120
+ # Phase 6 options
121
+ mesh-platform --port 8080 --mdns --site bangalore # advertise via mDNS
122
+ mesh-platform --store-url postgres://user:pass@localhost/clustermesh
123
+ mesh-platform --api-key your-secret-key # require auth on API
124
+ mesh-agent --discover # auto-find driver on LAN
125
+
126
+ # Phase 7: multi-site mesh VPN
127
+ mesh-platform --mesh-config config/sites.example.yaml --site bangalore
128
+ mesh-relay --listen 0.0.0.0:6000 --target 127.0.0.1:50050 # standalone relay
129
+ mesh-soak --hours 24 --nodes 50 # accelerated 24h chaos test
130
+ mesh-bench --nodes 1000 # placement SLA benchmark
131
+ ./scripts/dogfood.sh # local dogfood run
132
+ ```
133
+
134
+ ## Project Structure
135
+
136
+ ```
137
+ ClusterMesh/
138
+ ├── docs/ # Architecture, testing strategy, roadmap
139
+ ├── mesh/ # Core Python package
140
+ │ ├── models/ # Node, Task, Job, Resource types
141
+ │ ├── health/ # Heartbeat FSM, node health tracking
142
+ │ ├── scheduler/ # Scoring, placement, pool routing
143
+ │ ├── execution/ # TaskExecutor, TaskContext
144
+ │ ├── recovery/ # Checkpointing, work stealing, replication
145
+ │ ├── driver/ # JobManager, DriverCluster, gRPC server
146
+ │ ├── agent/ # Daemon, monitor, preemption, library
147
+ │ ├── proto/ # gRPC protobuf definitions
148
+ │ ├── tasks/ # Task registry + built-ins
149
+ │ ├── sdk/ # @task decorator, submit() API
150
+ │ └── sim/ # SimAgent, SimCluster, chaos injection
151
+ ├── tests/ # Unit + integration tests
152
+ ├── frontend/ # React dashboard (Vite + Tailwind)
153
+ └── Sparkpool # Original product vision document
154
+ ```
155
+
156
+ ## Current Status (Phase 8) ✅
157
+
158
+ | Component | Status |
159
+ |-----------|--------|
160
+ | Phases 0–7 (full platform + mesh VPN) | ✅ Done |
161
+ | Distributed memory fabric | ✅ Done |
162
+ | 1000-node placement SLA (`mesh-bench`) | ✅ Done |
163
+ | Memory dashboard + dogfood script | ✅ Done |
164
+
165
+ ## Developer SDK
166
+
167
+ ```python
168
+ from mesh import task, submit, TaskContext
169
+
170
+ @task(cpu=4, ram="8GB", checkpoint=True, total_work=1_000_000)
171
+ def process_records(ctx: TaskContext):
171
+     for i in range(int(ctx.progress), 1_000_000):
172
+         ctx.set_progress(i + 1, records=i + 1)
173
+     return "done"
175
+
176
+ # Sync submit — blocks until complete
177
+ result = submit(process_records)
178
+
179
+ # Async submit — returns JobHandle
180
+ job = submit(process_records, async_=True)
181
+ result = job.wait(timeout=3600)
182
+ ```
183
+
184
+ See [docs/api-spec.md](./docs/api-spec.md) for the full SDK specification.
185
+
186
+ ## Documentation
187
+
188
+ | Document | Description |
189
+ |----------|-------------|
190
+ | [Architecture](./docs/architecture.md) | System design, components, data flows |
191
+ | [Fault Tolerance](./docs/fault-tolerance.md) | All 10 recovery mechanisms in detail |
192
+ | [Testing Strategy](./docs/testing-strategy.md) | Test pyramid, scenarios, SLAs |
193
+ | [Roadmap](./docs/roadmap.md) | Phased build plan with milestones |
194
+ | [API Spec](./docs/api-spec.md) | Developer SDK and internal APIs |
195
+ | [Join mesh](./docs/join-mesh.md) | `pip install clustermesh` and worker CLI |
196
+ | [Publish to PyPI](./docs/publish-pypi.md) | Build, token setup, and upload guide |
197
+
198
+ ## License
199
+
200
+ MIT — see [LICENSE](./LICENSE).
@@ -0,0 +1,150 @@
1
+ # ClusterMesh (ComputeMesh)
2
+
3
+ **An operating system for enterprise compute** — turn every laptop, desktop, VM, and GPU workstation into a single elastic, fault-tolerant compute cloud.
4
+
5
+ > Full vision: [Sparkpool](./Sparkpool) · Architecture: [docs/architecture.md](./docs/architecture.md) · Roadmap: [docs/roadmap.md](./docs/roadmap.md)
6
+
7
+ ## The Problem
8
+
9
+ Organizations sit on thousands of idle cores:
10
+
11
+ | Resource | Typical utilization |
12
+ |----------|---------------------|
13
+ | CPU | 10–20% |
14
+ | RAM | 30–50% |
15
+ | GPU | 5–10% |
16
+
17
+ Databricks, Kubernetes, Spark, and Ray all require **dedicated** compute. None of them fully solves the core problem:
18
+
19
+ > *"Use all idle enterprise hardware automatically and safely."*
20
+
21
+ ClusterMesh does.
22
+
23
+ ## What We're Building
24
+
25
+ ```
26
+                       Control Plane
27
+                             │
28
+           ┌─────────────────┼─────────────────┐
29
+           │                 │                 │
30
+   Metadata Service  Scheduler Service   Auth Service
31
+           │                 │                 │
32
+           └─────────────────┼─────────────────┘
33
+                             │
34
+                 Driver Cluster (Raft HA)
35
+                             │
36
+        ┌────────────────────┼────────────────────┐
37
+        │                    │                    │
38
+     Agent-1              Agent-2              Agent-3
39
+     Laptop               Desktop                VM
40
+ ```
41
+
42
+ **Killer features:** idle compute harvesting · GPU sharing · live discovery · fault-tolerant scheduling · work stealing · preemption handling · checkpoint recovery · multi-office clustering
43
+
44
+ ## Join as a worker (any Python machine)
45
+
46
+ ```bash
47
+ pip install clustermesh
48
+ clustermesh join DRIVER_IP:50050 --open # local worker UI on :50052
49
+ ```
50
+
51
+ See [docs/join-mesh.md](./docs/join-mesh.md) for full details.
52
+
53
+ ## Quick Start (development)
54
+
55
+ ```bash
56
+ # Install in development mode
57
+ python -m venv .venv
58
+ source .venv/bin/activate
59
+ pip install -e ".[dev]"
60
+
61
+ # Run tests
62
+ pytest
63
+
64
+ # Run a simulated 50-node cluster demo
65
+ python -m mesh.sim.demo
66
+
67
+ # Phase 5: platform with React dashboard (build UI first)
68
+ cd frontend && npm install && npm run build && cd ..
69
+ mesh-platform --port 8080 --db clustermesh.db # driver + API + UI
70
+ # Phase 6 options
71
+ mesh-platform --port 8080 --mdns --site bangalore # advertise via mDNS
72
+ mesh-platform --store-url postgres://user:pass@localhost/clustermesh
73
+ mesh-platform --api-key your-secret-key # require auth on API
74
+ mesh-agent --discover # auto-find driver on LAN
75
+
76
+ # Phase 7: multi-site mesh VPN
77
+ mesh-platform --mesh-config config/sites.example.yaml --site bangalore
78
+ mesh-relay --listen 0.0.0.0:6000 --target 127.0.0.1:50050 # standalone relay
79
+ mesh-soak --hours 24 --nodes 50 # accelerated 24h chaos test
80
+ mesh-bench --nodes 1000 # placement SLA benchmark
81
+ ./scripts/dogfood.sh # local dogfood run
82
+ ```
83
+
84
+ ## Project Structure
85
+
86
+ ```
87
+ ClusterMesh/
88
+ ├── docs/ # Architecture, testing strategy, roadmap
89
+ ├── mesh/ # Core Python package
90
+ │ ├── models/ # Node, Task, Job, Resource types
91
+ │ ├── health/ # Heartbeat FSM, node health tracking
92
+ │ ├── scheduler/ # Scoring, placement, pool routing
93
+ │ ├── execution/ # TaskExecutor, TaskContext
94
+ │ ├── recovery/ # Checkpointing, work stealing, replication
95
+ │ ├── driver/ # JobManager, DriverCluster, gRPC server
96
+ │ ├── agent/ # Daemon, monitor, preemption, library
97
+ │ ├── proto/ # gRPC protobuf definitions
98
+ │ ├── tasks/ # Task registry + built-ins
99
+ │ ├── sdk/ # @task decorator, submit() API
100
+ │ └── sim/ # SimAgent, SimCluster, chaos injection
101
+ ├── tests/ # Unit + integration tests
102
+ ├── frontend/ # React dashboard (Vite + Tailwind)
103
+ └── Sparkpool # Original product vision document
104
+ ```
105
+
106
+ ## Current Status (Phase 8) ✅
107
+
108
+ | Component | Status |
109
+ |-----------|--------|
110
+ | Phases 0–7 (full platform + mesh VPN) | ✅ Done |
111
+ | Distributed memory fabric | ✅ Done |
112
+ | 1000-node placement SLA (`mesh-bench`) | ✅ Done |
113
+ | Memory dashboard + dogfood script | ✅ Done |
114
+
115
+ ## Developer SDK
116
+
117
+ ```python
118
+ from mesh import task, submit, TaskContext
119
+
120
+ @task(cpu=4, ram="8GB", checkpoint=True, total_work=1_000_000)
121
+ def process_records(ctx: TaskContext):
122
+     for i in range(int(ctx.progress), 1_000_000):
123
+         ctx.set_progress(i + 1, records=i + 1)
124
+     return "done"
125
+
126
+ # Sync submit — blocks until complete
127
+ result = submit(process_records)
128
+
129
+ # Async submit — returns JobHandle
130
+ job = submit(process_records, async_=True)
131
+ result = job.wait(timeout=3600)
132
+ ```
133
+
134
+ See [docs/api-spec.md](./docs/api-spec.md) for the full SDK specification.
135
+
136
+ ## Documentation
137
+
138
+ | Document | Description |
139
+ |----------|-------------|
140
+ | [Architecture](./docs/architecture.md) | System design, components, data flows |
141
+ | [Fault Tolerance](./docs/fault-tolerance.md) | All 10 recovery mechanisms in detail |
142
+ | [Testing Strategy](./docs/testing-strategy.md) | Test pyramid, scenarios, SLAs |
143
+ | [Roadmap](./docs/roadmap.md) | Phased build plan with milestones |
144
+ | [API Spec](./docs/api-spec.md) | Developer SDK and internal APIs |
145
+ | [Join mesh](./docs/join-mesh.md) | `pip install clustermesh` and worker CLI |
146
+ | [Publish to PyPI](./docs/publish-pypi.md) | Build, token setup, and upload guide |
147
+
148
+ ## License
149
+
150
+ MIT — see [LICENSE](./LICENSE).