vaultlayer 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (246) hide show
  1. vaultlayer-0.1.0/PKG-INFO +258 -0
  2. vaultlayer-0.1.0/README.md +221 -0
  3. vaultlayer-0.1.0/pyproject.toml +83 -0
  4. vaultlayer-0.1.0/setup.cfg +4 -0
  5. vaultlayer-0.1.0/src/agents/__init__.py +0 -0
  6. vaultlayer-0.1.0/src/agents/broker/__init__.py +0 -0
  7. vaultlayer-0.1.0/src/agents/broker/agent.py +1086 -0
  8. vaultlayer-0.1.0/src/agents/broker/billing.py +154 -0
  9. vaultlayer-0.1.0/src/agents/broker/migration.py +961 -0
  10. vaultlayer-0.1.0/src/agents/broker/provider_startup.py +125 -0
  11. vaultlayer-0.1.0/src/agents/broker/providers/__init__.py +77 -0
  12. vaultlayer-0.1.0/src/agents/broker/providers/aws.py +914 -0
  13. vaultlayer-0.1.0/src/agents/broker/providers/aws_capacity.py +186 -0
  14. vaultlayer-0.1.0/src/agents/broker/providers/azure.py +247 -0
  15. vaultlayer-0.1.0/src/agents/broker/providers/base.py +32 -0
  16. vaultlayer-0.1.0/src/agents/broker/providers/coreweave.py +127 -0
  17. vaultlayer-0.1.0/src/agents/broker/providers/crusoe.py +214 -0
  18. vaultlayer-0.1.0/src/agents/broker/providers/gcp.py +626 -0
  19. vaultlayer-0.1.0/src/agents/broker/providers/gcp_capacity.py +177 -0
  20. vaultlayer-0.1.0/src/agents/broker/providers/hyperstack.py +156 -0
  21. vaultlayer-0.1.0/src/agents/broker/providers/lambda_labs.py +202 -0
  22. vaultlayer-0.1.0/src/agents/broker/providers/nebius.py +189 -0
  23. vaultlayer-0.1.0/src/agents/broker/providers/region_planner.py +134 -0
  24. vaultlayer-0.1.0/src/agents/broker/providers/runpod.py +259 -0
  25. vaultlayer-0.1.0/src/agents/broker/providers/vast_ai.py +282 -0
  26. vaultlayer-0.1.0/src/agents/broker/providers/voltage_park.py +130 -0
  27. vaultlayer-0.1.0/src/agents/broker/ssh.py +453 -0
  28. vaultlayer-0.1.0/src/agents/broker/startup_scripts.py +1054 -0
  29. vaultlayer-0.1.0/src/agents/broker/warm_pool.py +351 -0
  30. vaultlayer-0.1.0/src/agents/finops/__init__.py +0 -0
  31. vaultlayer-0.1.0/src/agents/finops/agent.py +267 -0
  32. vaultlayer-0.1.0/src/agents/namespace/__init__.py +16 -0
  33. vaultlayer-0.1.0/src/agents/namespace/agent.py +308 -0
  34. vaultlayer-0.1.0/src/agents/namespace/ingestion.py +208 -0
  35. vaultlayer-0.1.0/src/agents/namespace/inject.py +74 -0
  36. vaultlayer-0.1.0/src/agents/namespace/mount.py +177 -0
  37. vaultlayer-0.1.0/src/agents/namespace/prefetch.py +307 -0
  38. vaultlayer-0.1.0/src/agents/orchestration/__init__.py +0 -0
  39. vaultlayer-0.1.0/src/agents/orchestration/agent.py +882 -0
  40. vaultlayer-0.1.0/src/agents/orchestration/decisions.py +507 -0
  41. vaultlayer-0.1.0/src/agents/orchestration/egress.py +409 -0
  42. vaultlayer-0.1.0/src/agents/orchestration/prompts.py +65 -0
  43. vaultlayer-0.1.0/src/agents/pricing/__init__.py +0 -0
  44. vaultlayer-0.1.0/src/agents/pricing/agent.py +741 -0
  45. vaultlayer-0.1.0/src/agents/pricing/pollers.py +507 -0
  46. vaultlayer-0.1.0/src/agents/tester/__init__.py +0 -0
  47. vaultlayer-0.1.0/src/agents/tester/agent.py +360 -0
  48. vaultlayer-0.1.0/src/agents/vault/__init__.py +0 -0
  49. vaultlayer-0.1.0/src/agents/vault/agent.py +605 -0
  50. vaultlayer-0.1.0/src/agents/vault/delta.py +353 -0
  51. vaultlayer-0.1.0/src/agents/vault/multipart.py +160 -0
  52. vaultlayer-0.1.0/src/agents/vault/r2.py +848 -0
  53. vaultlayer-0.1.0/src/agents/watchdog/__init__.py +0 -0
  54. vaultlayer-0.1.0/src/agents/watchdog/agent.py +715 -0
  55. vaultlayer-0.1.0/src/agents/watchdog/signals.py +240 -0
  56. vaultlayer-0.1.0/src/api/__init__.py +1 -0
  57. vaultlayer-0.1.0/src/api/admin_dashboard.py +685 -0
  58. vaultlayer-0.1.0/src/api/admin_routes.py +686 -0
  59. vaultlayer-0.1.0/src/api/auth.py +305 -0
  60. vaultlayer-0.1.0/src/api/bill_reconcile.py +372 -0
  61. vaultlayer-0.1.0/src/api/credentials.py +152 -0
  62. vaultlayer-0.1.0/src/api/credits.py +517 -0
  63. vaultlayer-0.1.0/src/api/db.py +319 -0
  64. vaultlayer-0.1.0/src/api/email.py +226 -0
  65. vaultlayer-0.1.0/src/api/feedback_routes.py +195 -0
  66. vaultlayer-0.1.0/src/api/flags.py +75 -0
  67. vaultlayer-0.1.0/src/api/fraud.py +115 -0
  68. vaultlayer-0.1.0/src/api/invite_routes.py +553 -0
  69. vaultlayer-0.1.0/src/api/job_routes.py +1105 -0
  70. vaultlayer-0.1.0/src/api/job_worker.py +1817 -0
  71. vaultlayer-0.1.0/src/api/main.py +483 -0
  72. vaultlayer-0.1.0/src/api/models.py +131 -0
  73. vaultlayer-0.1.0/src/api/pricing_routes.py +597 -0
  74. vaultlayer-0.1.0/src/api/smoke_routes.py +156 -0
  75. vaultlayer-0.1.0/src/api/stripe_routes.py +206 -0
  76. vaultlayer-0.1.0/src/api/unified_queue.py +247 -0
  77. vaultlayer-0.1.0/src/api/unified_queue_persist.py +222 -0
  78. vaultlayer-0.1.0/src/cli/__init__.py +0 -0
  79. vaultlayer-0.1.0/src/cli/_dataset.py +207 -0
  80. vaultlayer-0.1.0/src/cli/_preflight.py +314 -0
  81. vaultlayer-0.1.0/src/cli/_remote.py +1105 -0
  82. vaultlayer-0.1.0/src/cli/_subprocess.py +233 -0
  83. vaultlayer-0.1.0/src/cli/admin_cmd.py +219 -0
  84. vaultlayer-0.1.0/src/cli/checkpoint_template.py +600 -0
  85. vaultlayer-0.1.0/src/cli/checkpoint_template_deepspeed.py +228 -0
  86. vaultlayer-0.1.0/src/cli/checkpoint_template_jax.py +260 -0
  87. vaultlayer-0.1.0/src/cli/connect.py +796 -0
  88. vaultlayer-0.1.0/src/cli/crash_reporter.py +139 -0
  89. vaultlayer-0.1.0/src/cli/data.py +202 -0
  90. vaultlayer-0.1.0/src/cli/delete.py +216 -0
  91. vaultlayer-0.1.0/src/cli/download.py +156 -0
  92. vaultlayer-0.1.0/src/cli/estimate.py +108 -0
  93. vaultlayer-0.1.0/src/cli/examples.py +298 -0
  94. vaultlayer-0.1.0/src/cli/feedback.py +231 -0
  95. vaultlayer-0.1.0/src/cli/inject.py +731 -0
  96. vaultlayer-0.1.0/src/cli/login_cmd.py +132 -0
  97. vaultlayer-0.1.0/src/cli/main.py +794 -0
  98. vaultlayer-0.1.0/src/cli/migrate_keys.py +123 -0
  99. vaultlayer-0.1.0/src/cli/notify.py +130 -0
  100. vaultlayer-0.1.0/src/cli/ps.py +135 -0
  101. vaultlayer-0.1.0/src/cli/remote_providers.py +142 -0
  102. vaultlayer-0.1.0/src/cli/restart.py +293 -0
  103. vaultlayer-0.1.0/src/cli/run.py +503 -0
  104. vaultlayer-0.1.0/src/cli/signup.py +134 -0
  105. vaultlayer-0.1.0/src/cli/smoke_history.py +150 -0
  106. vaultlayer-0.1.0/src/cli/sync.py +939 -0
  107. vaultlayer-0.1.0/src/shared/__init__.py +0 -0
  108. vaultlayer-0.1.0/src/shared/api_utils.py +34 -0
  109. vaultlayer-0.1.0/src/shared/bill_auditor.py +154 -0
  110. vaultlayer-0.1.0/src/shared/compute_tiers.py +198 -0
  111. vaultlayer-0.1.0/src/shared/config.py +424 -0
  112. vaultlayer-0.1.0/src/shared/credential_vault.py +140 -0
  113. vaultlayer-0.1.0/src/shared/credits.py +331 -0
  114. vaultlayer-0.1.0/src/shared/data_loader.py +462 -0
  115. vaultlayer-0.1.0/src/shared/email.py +184 -0
  116. vaultlayer-0.1.0/src/shared/env_utils.py +15 -0
  117. vaultlayer-0.1.0/src/shared/failure_codes.py +93 -0
  118. vaultlayer-0.1.0/src/shared/gpu_resolver.py +372 -0
  119. vaultlayer-0.1.0/src/shared/idempotency.py +69 -0
  120. vaultlayer-0.1.0/src/shared/invoice.py +500 -0
  121. vaultlayer-0.1.0/src/shared/job_logger.py +191 -0
  122. vaultlayer-0.1.0/src/shared/job_run_logger.py +446 -0
  123. vaultlayer-0.1.0/src/shared/logging_utils.py +17 -0
  124. vaultlayer-0.1.0/src/shared/manifest.py +126 -0
  125. vaultlayer-0.1.0/src/shared/models.py +349 -0
  126. vaultlayer-0.1.0/src/shared/models_additions.py +114 -0
  127. vaultlayer-0.1.0/src/shared/price_guard.py +449 -0
  128. vaultlayer-0.1.0/src/shared/provider_billing.py +171 -0
  129. vaultlayer-0.1.0/src/shared/provider_capacity.py +357 -0
  130. vaultlayer-0.1.0/src/shared/queue.py +404 -0
  131. vaultlayer-0.1.0/src/shared/r2_credentials.py +253 -0
  132. vaultlayer-0.1.0/src/shared/r2_utils.py +23 -0
  133. vaultlayer-0.1.0/src/shared/recovery_tiers.py +81 -0
  134. vaultlayer-0.1.0/src/shared/retry.py +169 -0
  135. vaultlayer-0.1.0/src/shared/script_upload.py +219 -0
  136. vaultlayer-0.1.0/src/shared/security.py +417 -0
  137. vaultlayer-0.1.0/src/shared/spend.py +316 -0
  138. vaultlayer-0.1.0/src/shared/storage_billing.py +292 -0
  139. vaultlayer-0.1.0/src/shared/sts.py +180 -0
  140. vaultlayer-0.1.0/src/shared/suggestions.py +105 -0
  141. vaultlayer-0.1.0/src/shared/telemetry.py +320 -0
  142. vaultlayer-0.1.0/src/shared/usage_tracker.py +224 -0
  143. vaultlayer-0.1.0/src/vaultlayer/__init__.py +1 -0
  144. vaultlayer-0.1.0/src/vaultlayer/_resume_hook.py +192 -0
  145. vaultlayer-0.1.0/src/vaultlayer/_resume_state.py +366 -0
  146. vaultlayer-0.1.0/src/vaultlayer/_resume_torch.py +759 -0
  147. vaultlayer-0.1.0/src/vaultlayer.egg-info/PKG-INFO +258 -0
  148. vaultlayer-0.1.0/src/vaultlayer.egg-info/SOURCES.txt +244 -0
  149. vaultlayer-0.1.0/src/vaultlayer.egg-info/dependency_links.txt +1 -0
  150. vaultlayer-0.1.0/src/vaultlayer.egg-info/entry_points.txt +2 -0
  151. vaultlayer-0.1.0/src/vaultlayer.egg-info/requires.txt +17 -0
  152. vaultlayer-0.1.0/src/vaultlayer.egg-info/top_level.txt +6 -0
  153. vaultlayer-0.1.0/src/workers/__init__.py +1 -0
  154. vaultlayer-0.1.0/src/workers/claude_bug_fixer.py +727 -0
  155. vaultlayer-0.1.0/src/workers/feedback_worker.py +70 -0
  156. vaultlayer-0.1.0/tests/test_admin_dashboard.py +82 -0
  157. vaultlayer-0.1.0/tests/test_admin_routes.py +577 -0
  158. vaultlayer-0.1.0/tests/test_agents_coverage.py +1714 -0
  159. vaultlayer-0.1.0/tests/test_agents_misc_coverage.py +655 -0
  160. vaultlayer-0.1.0/tests/test_api_coverage.py +1521 -0
  161. vaultlayer-0.1.0/tests/test_auth_coverage.py +264 -0
  162. vaultlayer-0.1.0/tests/test_auth_credits.py +294 -0
  163. vaultlayer-0.1.0/tests/test_auth_onboarding.py +505 -0
  164. vaultlayer-0.1.0/tests/test_aws_backoff_split.py +134 -0
  165. vaultlayer-0.1.0/tests/test_aws_ranker_fallback.py +398 -0
  166. vaultlayer-0.1.0/tests/test_aws_termination_retry.py +170 -0
  167. vaultlayer-0.1.0/tests/test_bill_reconcile.py +316 -0
  168. vaultlayer-0.1.0/tests/test_broker_agent.py +777 -0
  169. vaultlayer-0.1.0/tests/test_byoc_phase2.py +882 -0
  170. vaultlayer-0.1.0/tests/test_checkpoint_helper.py +530 -0
  171. vaultlayer-0.1.0/tests/test_cli_coverage.py +1041 -0
  172. vaultlayer-0.1.0/tests/test_cold_start_race.py +409 -0
  173. vaultlayer-0.1.0/tests/test_constraint_satisfaction.py +625 -0
  174. vaultlayer-0.1.0/tests/test_constraint_satisfaction_integration.py +460 -0
  175. vaultlayer-0.1.0/tests/test_coverage_gaps.py +1031 -0
  176. vaultlayer-0.1.0/tests/test_credentials_api.py +210 -0
  177. vaultlayer-0.1.0/tests/test_credits_coverage.py +414 -0
  178. vaultlayer-0.1.0/tests/test_critical_gaps_batch1.py +929 -0
  179. vaultlayer-0.1.0/tests/test_data_loader_ttl.py +132 -0
  180. vaultlayer-0.1.0/tests/test_e2e_aws_down.py +558 -0
  181. vaultlayer-0.1.0/tests/test_e2e_smoke.py +863 -0
  182. vaultlayer-0.1.0/tests/test_ec4_weight_storm.py +286 -0
  183. vaultlayer-0.1.0/tests/test_egress_engine.py +1091 -0
  184. vaultlayer-0.1.0/tests/test_exit_code_routing.py +80 -0
  185. vaultlayer-0.1.0/tests/test_feedback_system.py +289 -0
  186. vaultlayer-0.1.0/tests/test_find_checkpoint.py +109 -0
  187. vaultlayer-0.1.0/tests/test_finops_agent.py +110 -0
  188. vaultlayer-0.1.0/tests/test_flow_abc_coverage.py +747 -0
  189. vaultlayer-0.1.0/tests/test_flow_abc_smoke.py +481 -0
  190. vaultlayer-0.1.0/tests/test_gainshare.py +224 -0
  191. vaultlayer-0.1.0/tests/test_gap27_shard_sha256.py +122 -0
  192. vaultlayer-0.1.0/tests/test_gaps_batch2.py +1042 -0
  193. vaultlayer-0.1.0/tests/test_gcp_image_and_serial.py +126 -0
  194. vaultlayer-0.1.0/tests/test_gcp_zone_fallback.py +400 -0
  195. vaultlayer-0.1.0/tests/test_gpu_resolver.py +170 -0
  196. vaultlayer-0.1.0/tests/test_gtm_gaps.py +484 -0
  197. vaultlayer-0.1.0/tests/test_handle_provision_result.py +285 -0
  198. vaultlayer-0.1.0/tests/test_idem3_vault_orphan_gc.py +189 -0
  199. vaultlayer-0.1.0/tests/test_idempotency.py +172 -0
  200. vaultlayer-0.1.0/tests/test_included_providers.py +108 -0
  201. vaultlayer-0.1.0/tests/test_inject_coverage.py +493 -0
  202. vaultlayer-0.1.0/tests/test_integration.py +177 -0
  203. vaultlayer-0.1.0/tests/test_integration_fence_resume.py +246 -0
  204. vaultlayer-0.1.0/tests/test_invite_tokens.py +405 -0
  205. vaultlayer-0.1.0/tests/test_job_contract.py +96 -0
  206. vaultlayer-0.1.0/tests/test_log_dedup_final_post.py +137 -0
  207. vaultlayer-0.1.0/tests/test_matrix_health.py +292 -0
  208. vaultlayer-0.1.0/tests/test_migration_leg_reset.py +120 -0
  209. vaultlayer-0.1.0/tests/test_migration_nxn.py +634 -0
  210. vaultlayer-0.1.0/tests/test_multi_node_guard.py +115 -0
  211. vaultlayer-0.1.0/tests/test_namespace_agent.py +392 -0
  212. vaultlayer-0.1.0/tests/test_new_features.py +224 -0
  213. vaultlayer-0.1.0/tests/test_orchestration_agent.py +496 -0
  214. vaultlayer-0.1.0/tests/test_pricing_agent.py +345 -0
  215. vaultlayer-0.1.0/tests/test_provider_billing.py +136 -0
  216. vaultlayer-0.1.0/tests/test_provider_matrix.py +526 -0
  217. vaultlayer-0.1.0/tests/test_provider_smoke.py +616 -0
  218. vaultlayer-0.1.0/tests/test_provider_smoke_prod.py +831 -0
  219. vaultlayer-0.1.0/tests/test_provider_startup.py +212 -0
  220. vaultlayer-0.1.0/tests/test_provider_testing_state.py +216 -0
  221. vaultlayer-0.1.0/tests/test_queue_coverage.py +429 -0
  222. vaultlayer-0.1.0/tests/test_r2_credentials.py +437 -0
  223. vaultlayer-0.1.0/tests/test_rbac.py +141 -0
  224. vaultlayer-0.1.0/tests/test_resume_hook.py +852 -0
  225. vaultlayer-0.1.0/tests/test_run_data_flag.py +268 -0
  226. vaultlayer-0.1.0/tests/test_scenarios.py +557 -0
  227. vaultlayer-0.1.0/tests/test_script_upload.py +282 -0
  228. vaultlayer-0.1.0/tests/test_security.py +251 -0
  229. vaultlayer-0.1.0/tests/test_server_lifecycle.py +695 -0
  230. vaultlayer-0.1.0/tests/test_shared_coverage.py +2282 -0
  231. vaultlayer-0.1.0/tests/test_shared_utils.py +395 -0
  232. vaultlayer-0.1.0/tests/test_sla_latency.py +548 -0
  233. vaultlayer-0.1.0/tests/test_startup_scripts.py +225 -0
  234. vaultlayer-0.1.0/tests/test_storage_billing_coverage.py +368 -0
  235. vaultlayer-0.1.0/tests/test_sts_concurrent_cache.py +108 -0
  236. vaultlayer-0.1.0/tests/test_sync_cloud_mirror.py +431 -0
  237. vaultlayer-0.1.0/tests/test_sync_s3_mirror.py +261 -0
  238. vaultlayer-0.1.0/tests/test_technical_cracks.py +328 -0
  239. vaultlayer-0.1.0/tests/test_telemetry.py +185 -0
  240. vaultlayer-0.1.0/tests/test_unified_queue.py +161 -0
  241. vaultlayer-0.1.0/tests/test_unified_queue_integration.py +310 -0
  242. vaultlayer-0.1.0/tests/test_unified_queue_persist.py +170 -0
  243. vaultlayer-0.1.0/tests/test_v0_gaps.py +931 -0
  244. vaultlayer-0.1.0/tests/test_vault_agent.py +465 -0
  245. vaultlayer-0.1.0/tests/test_watchdog_agent.py +178 -0
  246. vaultlayer-0.1.0/tests/test_worker_loop_per_provider_caps.py +168 -0
@@ -0,0 +1,258 @@
1
+ Metadata-Version: 2.4
2
+ Name: vaultlayer
3
+ Version: 0.1.0
4
+ Summary: AI compute arbitrage CLI — move GPU training jobs between clouds automatically
5
+ Author: VaultLayer
6
+ License: MIT
7
+ Keywords: gpu,cloud,training,arbitrage,mlops,ai
8
+ Classifier: Development Status :: 4 - Beta
9
+ Classifier: Environment :: Console
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
18
+ Classifier: Topic :: System :: Distributed Computing
19
+ Requires-Python: >=3.10
20
+ Description-Content-Type: text/markdown
21
+ Requires-Dist: click>=8.1.0
22
+ Requires-Dist: httpx>=0.27.0
23
+ Requires-Dist: pydantic>=2.0.0
24
+ Requires-Dist: python-dotenv>=1.0.0
25
+ Requires-Dist: redis>=5.0.0
26
+ Requires-Dist: aiohttp>=3.9.0
27
+ Requires-Dist: boto3>=1.35.0
28
+ Requires-Dist: anthropic>=0.40.0
29
+ Provides-Extra: server
30
+ Requires-Dist: fastapi>=0.115.0; extra == "server"
31
+ Requires-Dist: uvicorn[standard]>=0.29.0; extra == "server"
32
+ Requires-Dist: supabase>=2.4.0; extra == "server"
33
+ Requires-Dist: resend>=2.0.0; extra == "server"
34
+ Requires-Dist: stripe>=9.0.0; extra == "server"
35
+ Requires-Dist: pydantic[email]>=2.0.0; extra == "server"
36
+ Requires-Dist: python-multipart>=0.0.9; extra == "server"
37
+
38
+ # VaultLayer
39
+
40
+ > Run AI training jobs across 11 GPU cloud providers — 60–70% cheaper than AWS on-demand, with 99.9% job completion SLA, using your existing commands unchanged. 93% GPU cloud market coverage (~$39B addressable).
41
+
42
+ ```bash
43
+ pip install vaultlayer
44
+ vaultlayer run python train.py --model llama-3-7b --epochs 10
45
+ ```
46
+
47
+ ```
48
+ ✓ Job completed in 4h 32m
49
+ ✓ 1 interruption recovered automatically (AWS Spot → Lambda H100)
50
+ ✓ Saved $142.40 vs AWS On-Demand
51
+ → View full report: https://vaultlayer.pages.dev/jobs/j-0042
52
+ ```
53
+
54
+ ---
55
+
56
+ ## What It Does
57
+
58
+ VaultLayer sits between your training script and the cloud. It:
59
+
60
+ - **Checkpoints automatically** — syncs model weights + optimizer state to a zero-egress R2 Vault on every save
61
+ - **Detects interruptions** — intercepts AWS/GCP/Azure termination signals before your job dies
62
+ - **Migrates instantly** — provisions a replacement node on the cheapest available provider and resumes from last checkpoint
63
+ - **Tracks savings** — shows real-time cost vs what you would have paid on AWS On-Demand
64
+
65
+ No changes to your PyTorch or JAX code. No YAML configs. No PhD-level infra knowledge required.
66
+
67
+ ### Commands
68
+
69
+ ```bash
70
+ # Training
71
+ vaultlayer run python train.py # run with full protection
72
+ vaultlayer run --data s3://bucket/prefix python train.py # mirror S3→R2 then run
73
+ vaultlayer run --data r2://my-dataset python train.py # use dataset already in R2
74
+ vaultlayer run --regions eu-central-1,eu-west-1 python train.py # GDPR-only regions
75
+ vaultlayer run --excluded-regions cn-north-1 python train.py # never use China region
76
+ vaultlayer stop <job-id> # graceful stop + checkpoint
77
+ vaultlayer logs <job-id> [--tail N] [--follow] # stream logs from R2
78
+
79
+ # Dataset storage (no S3 required)
80
+ vaultlayer sync ./data --dataset-id my-dataset # upload local data → R2
81
+ vaultlayer sync s3://bucket/prefix --dataset-id my-dataset # mirror S3 → R2 (one-time egress)
82
+ vaultlayer datasets # list datasets + storage costs
83
+ vaultlayer datasets --delete my-dataset # delete + stop billing
84
+
85
+ # Region discovery
86
+ vaultlayer regions list-all # list all valid regions + compliance notes
87
+ vaultlayer regions current # show current provisioning region
88
+ ```
89
+
90
+ ---
91
+
92
+ ## Supported Providers
93
+
94
+ | Provider | Type | Status |
95
+ |---|---|---|
96
+ | AWS EC2 Spot | Hyperscaler | ✅ Live |
97
+ | Lambda Labs | Neocloud | ✅ Live |
98
+ | CoreWeave | Neocloud | ✅ Live |
99
+ | RunPod | Neocloud | ✅ Live |
100
+ | Vast.ai | Neocloud | ✅ Live |
101
+ | Voltage Park | Neocloud | ✅ Live |
102
+ | Crusoe | Neocloud | ✅ Live |
103
+ | Nebius | Neocloud | ✅ Live |
104
+ | Hyperstack | Neocloud | ✅ Live |
105
+ | GCP | Hyperscaler | ✅ Live |
106
+ | Azure | Hyperscaler | ✅ Live |
107
+ | AWS On-Demand | Hyperscaler | ✅ Last-resort fallback |
108
+
109
+ 11 providers live — 93% GPU cloud market coverage (~$39B addressable, ~$21B migratable training).
110
+
111
+ ---
112
+
113
+ ## Model Size Support
114
+
115
+ | Model Size | Method | Checkpoint Size | Status |
116
+ |---|---|---|---|
117
+ | 7B | Full fine-tune | ~69 GB | ✅ MVP |
118
+ | 13B | Full fine-tune | ~125 GB | ✅ MVP |
119
+ | 30B | Full fine-tune | ~288 GB | ✅ MVP |
120
+ | 70B | QLoRA (4-bit) | ~46 GB | ✅ MVP |
121
+ | 70B | Full fine-tune | ~782 GB | 🔜 Phase 2 |
122
+
123
+ ---
124
+
125
+ ## Tech Stack
126
+
127
+ | Layer | Technology | Cost |
128
+ |---|---|---|
129
+ | Code + Docs | GitHub (this repo) | Free |
130
+ | CI/CD | GitHub Actions | Free (2k min/mo) |
131
+ | Vault / Storage | Cloudflare R2 | Free up to 10GB |
132
+ | Agent Runtime | Railway | Free $5/mo credit |
133
+ | Webhooks | Cloudflare Workers | Free 100k req/day |
134
+ | Agent Message Queue | Upstash Redis | Free 10k cmd/day |
135
+
136
+ ---
137
+
138
+ ## Repository Structure
139
+
140
+ ```
141
+ vaultlayer/
142
+ ├── README.md
143
+ ├── docs/
144
+ │ ├── PRD.md # Full product requirements
145
+ │ ├── ARCHITECTURE.md # System design + agent topology
146
+ │ └── AGENTS.md # Agent specs + build order
147
+ ├── dashboard/
148
+ │ └── index.html # Savings dashboard prototype
149
+ └── src/
150
+ ├── cli/
151
+ │ ├── main.py
152
+ │ ├── run.py
153
+ │ ├── checkpoint_template.py
154
+ │ └── init.py
155
+ ├── vaultlayer/
156
+ │ └── _resume_hook.py
157
+ ├── agents/
158
+ │ ├── orchestration/
159
+ │ ├── pricing/
160
+ │ ├── watchdog/
161
+ │ │ └── signals.py
162
+ │ ├── vault/
163
+ │ ├── broker/
164
+ │ ├── finops/
165
+ │ └── namespace/
166
+ └── shared/
167
+ ```
168
+
169
+ ---
170
+
171
+ ## SLA
172
+
173
+ **99.9% job completion rate** — not node uptime. Jobs survive infrastructure failures.
174
+ Recovery SLA: interrupted job resumes within 10 minutes from last checkpoint.
175
+
176
+ ---
177
+
178
+ ## Dataset Storage (No S3 Required)
179
+
180
+ VaultLayer's Neutral Zone (Cloudflare R2) is a first-class storage provider. Users with no AWS or
181
+ cloud storage account can upload training data directly and train from it on any provider.
182
+
183
+ ```bash
184
+ # Upload from your laptop / on-prem server
185
+ vaultlayer sync ./training-data --dataset-id my-dataset
186
+
187
+ # Train — data is mounted at /mnt/vaultlayer on every provisioned node
188
+ vaultlayer run --data r2://my-dataset python train.py
189
+
190
+ # See what you're storing and the monthly cost
191
+ vaultlayer datasets
192
+ ```
193
+
194
+ **Pricing:**
195
+
196
+ | Action | Cost |
197
+ |--------|------|
198
+ | Upload (local → R2) | Free |
199
+ | Storage | $0.020 / GB / month (rounded from $0.0195 — a 30% markup over Cloudflare R2's $0.015 / GB / month base rate) |
200
+ | Read (R2 → training node) | $0.00 (zero egress within Cloudflare network) |
201
+ | S3 mirror (one-time) | AWS egress charge (~$0.09/GB, first 100 GB/month free) |
202
+
203
+ **Storage quotas by plan:**
204
+
205
+ | Plan | Storage limit |
206
+ |------|--------------|
207
+ | Free | 10 GB |
208
+ | Pro | 500 GB |
209
+ | Enterprise | Unlimited |
210
+
211
+ Datasets are soft-deleted with `vaultlayer datasets --delete <id>` — billing stops immediately,
212
+ R2 objects are purged within 24 hours.
213
+
214
+ ---
215
+
216
+ ## Region Control
217
+
218
+ VaultLayer can provision nodes in any AWS region that has GPU capacity. By default it uses
219
+ the region from your `vaultlayer init` configuration.
220
+
221
+ **Restrict to specific regions** (e.g. GDPR compliance — EU data stays in EU):
222
+ ```bash
223
+ vaultlayer run --regions eu-central-1,eu-west-1 python train.py
224
+ ```
225
+
226
+ **Exclude regions** (e.g. avoid China, GovCloud, sanctioned territories):
227
+ ```bash
228
+ vaultlayer run --excluded-regions cn-north-1,cn-northwest-1 python train.py
229
+ ```
230
+
231
+ If both flags are given, `--excluded-regions` takes priority. If neither is given, any region is allowed except those blocked by default (see the compliance note below).
232
+
233
+ **Discover regions:**
234
+ ```bash
235
+ vaultlayer regions list-all # all GPU-capable regions with compliance notes
236
+ vaultlayer regions current # show which region your credentials point to
237
+ ```
238
+
239
+ > **Compliance note:** H100/A100 exports to certain regions (China, Russia, some Middle East countries)
240
+ > may require a US Bureau of Industry and Security (BIS) export license. VaultLayer blocks
241
+ > `cn-north-1`, `cn-northwest-1`, and `ru-central-1` by default via OFAC screening.
242
+ > Use `--regions` to limit jobs to GDPR-compliant EU regions.
243
+
244
+ ---
245
+
246
+ ## Getting Started
247
+
248
+ ```bash
249
+ pip install vaultlayer
250
+ vaultlayer init
251
+ vaultlayer run python train.py
252
+ ```
253
+
254
+ ---
255
+
256
+ ## License
257
+
258
+ MIT — © 2026 VaultLayer
@@ -0,0 +1,221 @@
1
+ # VaultLayer
2
+
3
+ > Run AI training jobs across 11 GPU cloud providers — 60–70% cheaper than AWS on-demand, with 99.9% job completion SLA, using your existing commands unchanged. 93% GPU cloud market coverage (~$39B addressable).
4
+
5
+ ```bash
6
+ pip install vaultlayer
7
+ vaultlayer run python train.py --model llama-3-7b --epochs 10
8
+ ```
9
+
10
+ ```
11
+ ✓ Job completed in 4h 32m
12
+ ✓ 1 interruption recovered automatically (AWS Spot → Lambda H100)
13
+ ✓ Saved $142.40 vs AWS On-Demand
14
+ → View full report: https://vaultlayer.pages.dev/jobs/j-0042
15
+ ```
16
+
17
+ ---
18
+
19
+ ## What It Does
20
+
21
+ VaultLayer sits between your training script and the cloud. It:
22
+
23
+ - **Checkpoints automatically** — syncs model weights + optimizer state to a zero-egress R2 Vault on every save
24
+ - **Detects interruptions** — intercepts AWS/GCP/Azure termination signals before your job dies
25
+ - **Migrates instantly** — provisions a replacement node on the cheapest available provider and resumes from last checkpoint
26
+ - **Tracks savings** — shows real-time cost vs what you would have paid on AWS On-Demand
27
+
28
+ No changes to your PyTorch or JAX code. No YAML configs. No PhD-level infra knowledge required.
29
+
30
+ ### Commands
31
+
32
+ ```bash
33
+ # Training
34
+ vaultlayer run python train.py # run with full protection
35
+ vaultlayer run --data s3://bucket/prefix python train.py # mirror S3→R2 then run
36
+ vaultlayer run --data r2://my-dataset python train.py # use dataset already in R2
37
+ vaultlayer run --regions eu-central-1,eu-west-1 python train.py # GDPR-only regions
38
+ vaultlayer run --excluded-regions cn-north-1 python train.py # never use China region
39
+ vaultlayer stop <job-id> # graceful stop + checkpoint
40
+ vaultlayer logs <job-id> [--tail N] [--follow] # stream logs from R2
41
+
42
+ # Dataset storage (no S3 required)
43
+ vaultlayer sync ./data --dataset-id my-dataset # upload local data → R2
44
+ vaultlayer sync s3://bucket/prefix --dataset-id my-dataset # mirror S3 → R2 (one-time egress)
45
+ vaultlayer datasets # list datasets + storage costs
46
+ vaultlayer datasets --delete my-dataset # delete + stop billing
47
+
48
+ # Region discovery
49
+ vaultlayer regions list-all # list all valid regions + compliance notes
50
+ vaultlayer regions current # show current provisioning region
51
+ ```
52
+
53
+ ---
54
+
55
+ ## Supported Providers
56
+
57
+ | Provider | Type | Status |
58
+ |---|---|---|
59
+ | AWS EC2 Spot | Hyperscaler | ✅ Live |
60
+ | Lambda Labs | Neocloud | ✅ Live |
61
+ | CoreWeave | Neocloud | ✅ Live |
62
+ | RunPod | Neocloud | ✅ Live |
63
+ | Vast.ai | Neocloud | ✅ Live |
64
+ | Voltage Park | Neocloud | ✅ Live |
65
+ | Crusoe | Neocloud | ✅ Live |
66
+ | Nebius | Neocloud | ✅ Live |
67
+ | Hyperstack | Neocloud | ✅ Live |
68
+ | GCP | Hyperscaler | ✅ Live |
69
+ | Azure | Hyperscaler | ✅ Live |
70
+ | AWS On-Demand | Hyperscaler | ✅ Last-resort fallback |
71
+
72
+ 11 providers live — 93% GPU cloud market coverage (~$39B addressable, ~$21B migratable training).
73
+
74
+ ---
75
+
76
+ ## Model Size Support
77
+
78
+ | Model Size | Method | Checkpoint Size | Status |
79
+ |---|---|---|---|
80
+ | 7B | Full fine-tune | ~69 GB | ✅ MVP |
81
+ | 13B | Full fine-tune | ~125 GB | ✅ MVP |
82
+ | 30B | Full fine-tune | ~288 GB | ✅ MVP |
83
+ | 70B | QLoRA (4-bit) | ~46 GB | ✅ MVP |
84
+ | 70B | Full fine-tune | ~782 GB | 🔜 Phase 2 |
85
+
86
+ ---
87
+
88
+ ## Tech Stack
89
+
90
+ | Layer | Technology | Cost |
91
+ |---|---|---|
92
+ | Code + Docs | GitHub (this repo) | Free |
93
+ | CI/CD | GitHub Actions | Free (2k min/mo) |
94
+ | Vault / Storage | Cloudflare R2 | Free up to 10GB |
95
+ | Agent Runtime | Railway | Free $5/mo credit |
96
+ | Webhooks | Cloudflare Workers | Free 100k req/day |
97
+ | Agent Message Queue | Upstash Redis | Free 10k cmd/day |
98
+
99
+ ---
100
+
101
+ ## Repository Structure
102
+
103
+ ```
104
+ vaultlayer/
105
+ ├── README.md
106
+ ├── docs/
107
+ │ ├── PRD.md # Full product requirements
108
+ │ ├── ARCHITECTURE.md # System design + agent topology
109
+ │ └── AGENTS.md # Agent specs + build order
110
+ ├── dashboard/
111
+ │ └── index.html # Savings dashboard prototype
112
+ └── src/
113
+ ├── cli/
114
+ │ ├── main.py
115
+ │ ├── run.py
116
+ │ ├── checkpoint_template.py
117
+ │ └── init.py
118
+ ├── vaultlayer/
119
+ │ └── _resume_hook.py
120
+ ├── agents/
121
+ │ ├── orchestration/
122
+ │ ├── pricing/
123
+ │ ├── watchdog/
124
+ │ │ └── signals.py
125
+ │ ├── vault/
126
+ │ ├── broker/
127
+ │ ├── finops/
128
+ │ └── namespace/
129
+ └── shared/
130
+ ```
131
+
132
+ ---
133
+
134
+ ## SLA
135
+
136
+ **99.9% job completion rate** — not node uptime. Jobs survive infrastructure failures.
137
+ Recovery SLA: interrupted job resumes within 10 minutes from last checkpoint.
138
+
139
+ ---
140
+
141
+ ## Dataset Storage (No S3 Required)
142
+
143
+ VaultLayer's Neutral Zone (Cloudflare R2) is a first-class storage provider. Users with no AWS or
144
+ cloud storage account can upload training data directly and train from it on any provider.
145
+
146
+ ```bash
147
+ # Upload from your laptop / on-prem server
148
+ vaultlayer sync ./training-data --dataset-id my-dataset
149
+
150
+ # Train — data is mounted at /mnt/vaultlayer on every provisioned node
151
+ vaultlayer run --data r2://my-dataset python train.py
152
+
153
+ # See what you're storing and the monthly cost
154
+ vaultlayer datasets
155
+ ```
156
+
157
+ **Pricing:**
158
+
159
+ | Action | Cost |
160
+ |--------|------|
161
+ | Upload (local → R2) | Free |
162
+ | Storage | $0.020 / GB / month (rounded from $0.0195 — a 30% markup over Cloudflare R2's $0.015 / GB / month base rate) |
163
+ | Read (R2 → training node) | $0.00 (zero egress within Cloudflare network) |
164
+ | S3 mirror (one-time) | AWS egress charge (~$0.09/GB, first 100 GB/month free) |
165
+
166
+ **Storage quotas by plan:**
167
+
168
+ | Plan | Storage limit |
169
+ |------|--------------|
170
+ | Free | 10 GB |
171
+ | Pro | 500 GB |
172
+ | Enterprise | Unlimited |
173
+
174
+ Datasets are soft-deleted with `vaultlayer datasets --delete <id>` — billing stops immediately,
175
+ R2 objects are purged within 24 hours.
176
+
177
+ ---
178
+
179
+ ## Region Control
180
+
181
+ VaultLayer can provision nodes in any AWS region that has GPU capacity. By default it uses
182
+ the region from your `vaultlayer init` configuration.
183
+
184
+ **Restrict to specific regions** (e.g. GDPR compliance — EU data stays in EU):
185
+ ```bash
186
+ vaultlayer run --regions eu-central-1,eu-west-1 python train.py
187
+ ```
188
+
189
+ **Exclude regions** (e.g. avoid China, GovCloud, sanctioned territories):
190
+ ```bash
191
+ vaultlayer run --excluded-regions cn-north-1,cn-northwest-1 python train.py
192
+ ```
193
+
194
+ If both flags are given, `--excluded-regions` takes priority. If neither is given, any region is allowed except those blocked by default (see the compliance note below).
195
+
196
+ **Discover regions:**
197
+ ```bash
198
+ vaultlayer regions list-all # all GPU-capable regions with compliance notes
199
+ vaultlayer regions current # show which region your credentials point to
200
+ ```
201
+
202
+ > **Compliance note:** H100/A100 exports to certain regions (China, Russia, some Middle East countries)
203
+ > may require a US Bureau of Industry and Security (BIS) export license. VaultLayer blocks
204
+ > `cn-north-1`, `cn-northwest-1`, and `ru-central-1` by default via OFAC screening.
205
+ > Use `--regions` to limit jobs to GDPR-compliant EU regions.
206
+
207
+ ---
208
+
209
+ ## Getting Started
210
+
211
+ ```bash
212
+ pip install vaultlayer
213
+ vaultlayer init
214
+ vaultlayer run python train.py
215
+ ```
216
+
217
+ ---
218
+
219
+ ## License
220
+
221
+ MIT — © 2026 VaultLayer
@@ -0,0 +1,83 @@
1
+ [build-system]
2
+ requires = ["setuptools>=42"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "vaultlayer"
7
+ version = "0.1.0"
8
+ description = "AI compute arbitrage CLI — move GPU training jobs between clouds automatically"
9
+ readme = "README.md"
10
+ license = { text = "MIT" }
11
+ requires-python = ">=3.10"
12
+ authors = [{ name = "VaultLayer" }]
13
+ keywords = ["gpu", "cloud", "training", "arbitrage", "mlops", "ai"]
14
+ classifiers = [
15
+ "Development Status :: 4 - Beta",
16
+ "Environment :: Console",
17
+ "Intended Audience :: Developers",
18
+ "Intended Audience :: Science/Research",
19
+ "License :: OSI Approved :: MIT License",
20
+ "Programming Language :: Python :: 3",
21
+ "Programming Language :: Python :: 3.10",
22
+ "Programming Language :: Python :: 3.11",
23
+ "Programming Language :: Python :: 3.12",
24
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
25
+ "Topic :: System :: Distributed Computing",
26
+ ]
27
+
28
+ dependencies = [
29
+ "click>=8.1.0",
30
+ "httpx>=0.27.0",
31
+ "pydantic>=2.0.0",
32
+ "python-dotenv>=1.0.0",
33
+ "redis>=5.0.0",
34
+ "aiohttp>=3.9.0",
35
+ "boto3>=1.35.0",
36
+ "anthropic>=0.40.0",
37
+ ]
38
+
39
+ [project.optional-dependencies]
40
+ server = [
41
+ "fastapi>=0.115.0",
42
+ "uvicorn[standard]>=0.29.0",
43
+ "supabase>=2.4.0",
44
+ "resend>=2.0.0",
45
+ "stripe>=9.0.0",
46
+ "pydantic[email]>=2.0.0",
47
+ "python-multipart>=0.0.9",
48
+ ]
49
+
50
+ [project.scripts]
51
+ vaultlayer = "cli.main:cli"
52
+
53
+ [tool.setuptools.packages.find]
54
+ where = ["src"]
55
+
56
+ [tool.pytest.ini_options]
57
+ asyncio_mode = "strict"
58
+ testpaths = ["tests"]
59
+ filterwarnings = [
60
+ "ignore::DeprecationWarning:supabase",
61
+ "ignore::DeprecationWarning:postgrest",
62
+ "ignore::DeprecationWarning:pyiceberg",
63
+ "ignore::RuntimeWarning:asyncio",
64
+ ]
65
+
66
+ [tool.ruff]
67
+ line-length = 120
68
+ target-version = "py311"
69
+
70
+ [tool.ruff.lint]
71
+ ignore = [
72
+ "E401",
73
+ "E501",
74
+ "E402",
75
+ "E701",
76
+ "E702",
77
+ "E741",
78
+ "F401",
79
+ "F541",
80
+ "F811",
81
+ "F821",
82
+ "F841",
83
+ ]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
File without changes
File without changes