marin-iris 0.99__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (240) hide show
  1. marin_iris-0.99/.gitignore +243 -0
  2. marin_iris-0.99/PKG-INFO +30 -0
  3. marin_iris-0.99/examples/coreweave-ci.yaml +92 -0
  4. marin_iris-0.99/examples/coreweave-rno2a.yaml +99 -0
  5. marin_iris-0.99/examples/coreweave-usw09b.yaml +98 -0
  6. marin_iris-0.99/examples/coreweave.yaml +116 -0
  7. marin_iris-0.99/examples/local-auth-gcp.yaml +35 -0
  8. marin_iris-0.99/examples/local-auth-static.yaml +36 -0
  9. marin_iris-0.99/examples/local.yaml +29 -0
  10. marin_iris-0.99/examples/marin-dev.yaml +145 -0
  11. marin_iris-0.99/examples/marin.yaml +223 -0
  12. marin_iris-0.99/examples/smoke-gcp.yaml +71 -0
  13. marin_iris-0.99/examples/test.yaml +165 -0
  14. marin_iris-0.99/examples/tpu-demo.ipynb +461 -0
  15. marin_iris-0.99/iris/__init__.py +2 -0
  16. marin_iris-0.99/iris/_build_info.py +3 -0
  17. marin_iris-0.99/iris/actor/__init__.py +35 -0
  18. marin_iris-0.99/iris/actor/client.py +223 -0
  19. marin_iris-0.99/iris/actor/pool.py +281 -0
  20. marin_iris-0.99/iris/actor/resolver.py +108 -0
  21. marin_iris-0.99/iris/actor/server.py +355 -0
  22. marin_iris-0.99/iris/chaos.py +98 -0
  23. marin_iris-0.99/iris/cli/__init__.py +12 -0
  24. marin_iris-0.99/iris/cli/actor.py +69 -0
  25. marin_iris-0.99/iris/cli/bug_report.py +528 -0
  26. marin_iris-0.99/iris/cli/build.py +493 -0
  27. marin_iris-0.99/iris/cli/cluster.py +1142 -0
  28. marin_iris-0.99/iris/cli/job.py +1261 -0
  29. marin_iris-0.99/iris/cli/main.py +486 -0
  30. marin_iris-0.99/iris/cli/process_status.py +194 -0
  31. marin_iris-0.99/iris/cli/query.py +82 -0
  32. marin_iris-0.99/iris/cli/rpc.py +327 -0
  33. marin_iris-0.99/iris/cli/task.py +70 -0
  34. marin_iris-0.99/iris/cli/token_store.py +125 -0
  35. marin_iris-0.99/iris/client/__init__.py +49 -0
  36. marin_iris-0.99/iris/client/client.py +1081 -0
  37. marin_iris-0.99/iris/client/resolver.py +102 -0
  38. marin_iris-0.99/iris/client/worker_pool.py +595 -0
  39. marin_iris-0.99/iris/cluster/__init__.py +2 -0
  40. marin_iris-0.99/iris/cluster/bundle.py +185 -0
  41. marin_iris-0.99/iris/cluster/client/__init__.py +22 -0
  42. marin_iris-0.99/iris/cluster/client/bundle.py +213 -0
  43. marin_iris-0.99/iris/cluster/client/job_info.py +167 -0
  44. marin_iris-0.99/iris/cluster/client/protocol.py +108 -0
  45. marin_iris-0.99/iris/cluster/client/remote_client.py +501 -0
  46. marin_iris-0.99/iris/cluster/config.py +1331 -0
  47. marin_iris-0.99/iris/cluster/constraints.py +1169 -0
  48. marin_iris-0.99/iris/cluster/controller/__init__.py +2 -0
  49. marin_iris-0.99/iris/cluster/controller/actor_proxy.py +104 -0
  50. marin_iris-0.99/iris/cluster/controller/auth.py +424 -0
  51. marin_iris-0.99/iris/cluster/controller/autoscaler/__init__.py +6 -0
  52. marin_iris-0.99/iris/cluster/controller/autoscaler/models.py +75 -0
  53. marin_iris-0.99/iris/cluster/controller/autoscaler/operations.py +176 -0
  54. marin_iris-0.99/iris/cluster/controller/autoscaler/planning.py +135 -0
  55. marin_iris-0.99/iris/cluster/controller/autoscaler/recovery.py +136 -0
  56. marin_iris-0.99/iris/cluster/controller/autoscaler/routing.py +597 -0
  57. marin_iris-0.99/iris/cluster/controller/autoscaler/runtime.py +641 -0
  58. marin_iris-0.99/iris/cluster/controller/autoscaler/scaling_group.py +1340 -0
  59. marin_iris-0.99/iris/cluster/controller/autoscaler/status.py +169 -0
  60. marin_iris-0.99/iris/cluster/controller/autoscaler/worker_registry.py +175 -0
  61. marin_iris-0.99/iris/cluster/controller/budget.py +222 -0
  62. marin_iris-0.99/iris/cluster/controller/checkpoint.py +421 -0
  63. marin_iris-0.99/iris/cluster/controller/codec.py +117 -0
  64. marin_iris-0.99/iris/cluster/controller/controller.py +2671 -0
  65. marin_iris-0.99/iris/cluster/controller/dashboard.py +801 -0
  66. marin_iris-0.99/iris/cluster/controller/db.py +993 -0
  67. marin_iris-0.99/iris/cluster/controller/endpoint_proxy.py +288 -0
  68. marin_iris-0.99/iris/cluster/controller/main.py +358 -0
  69. marin_iris-0.99/iris/cluster/controller/migrations/0001_init.py +10 -0
  70. marin_iris-0.99/iris/cluster/controller/migrations/0002_read_indexes.py +8 -0
  71. marin_iris-0.99/iris/cluster/controller/migrations/0003_normalize_scaling_groups.py +39 -0
  72. marin_iris-0.99/iris/cluster/controller/migrations/0004_api_keys.py +38 -0
  73. marin_iris-0.99/iris/cluster/controller/migrations/0004_worker_indexes.py +15 -0
  74. marin_iris-0.99/iris/cluster/controller/migrations/0005_task_profiles.py +32 -0
  75. marin_iris-0.99/iris/cluster/controller/migrations/0006_jwt_signing_key.py +16 -0
  76. marin_iris-0.99/iris/cluster/controller/migrations/0007_perf_indexes.py +19 -0
  77. marin_iris-0.99/iris/cluster/controller/migrations/0008_jobs_name.py +38 -0
  78. marin_iris-0.99/iris/cluster/controller/migrations/0009_query_indexes.py +24 -0
  79. marin_iris-0.99/iris/cluster/controller/migrations/0010_dashboard_indexes.py +42 -0
  80. marin_iris-0.99/iris/cluster/controller/migrations/0010_purge_orphaned_endpoints.py +15 -0
  81. marin_iris-0.99/iris/cluster/controller/migrations/0011_direct_provider.py +33 -0
  82. marin_iris-0.99/iris/cluster/controller/migrations/0012_container_name.py +16 -0
  83. marin_iris-0.99/iris/cluster/controller/migrations/0012_separate_auth_db.py +53 -0
  84. marin_iris-0.99/iris/cluster/controller/migrations/0013_has_reservation.py +46 -0
  85. marin_iris-0.99/iris/cluster/controller/migrations/0014_profile_kind.py +36 -0
  86. marin_iris-0.99/iris/cluster/controller/migrations/0015_drop_redundant_index.py +11 -0
  87. marin_iris-0.99/iris/cluster/controller/migrations/0016_worker_scheduling_fields.py +57 -0
  88. marin_iris-0.99/iris/cluster/controller/migrations/0017_job_scheduling_fields.py +72 -0
  89. marin_iris-0.99/iris/cluster/controller/migrations/0018_task_assignment_fields.py +30 -0
  90. marin_iris-0.99/iris/cluster/controller/migrations/0019_worker_fk_cascade.py +73 -0
  91. marin_iris-0.99/iris/cluster/controller/migrations/0020_perf_indices_and_profiles_fk.py +53 -0
  92. marin_iris-0.99/iris/cluster/controller/migrations/0021_budgets.py +38 -0
  93. marin_iris-0.99/iris/cluster/controller/migrations/0022_workers_slice_and_group.py +28 -0
  94. marin_iris-0.99/iris/cluster/controller/migrations/0023_separate_profiles_db.py +63 -0
  95. marin_iris-0.99/iris/cluster/controller/migrations/0024_normalize_resource_usage.py +47 -0
  96. marin_iris-0.99/iris/cluster/controller/migrations/0024_task_resource_history.py +25 -0
  97. marin_iris-0.99/iris/cluster/controller/migrations/0025_normalize_resource_snapshots.py +105 -0
  98. marin_iris-0.99/iris/cluster/controller/migrations/0026_normalize_worker_metadata.py +95 -0
  99. marin_iris-0.99/iris/cluster/controller/migrations/0027_normalize_job_resources.py +101 -0
  100. marin_iris-0.99/iris/cluster/controller/migrations/0028_job_config_table.py +273 -0
  101. marin_iris-0.99/iris/cluster/controller/migrations/0029_drop_task_resource_usage_columns.py +30 -0
  102. marin_iris-0.99/iris/cluster/controller/migrations/0030_backfill_worker_region.py +57 -0
  103. marin_iris-0.99/iris/cluster/controller/migrations/0030_job_submit_argv.py +13 -0
  104. marin_iris-0.99/iris/cluster/controller/migrations/0031_auto_vacuum_incremental.py +24 -0
  105. marin_iris-0.99/iris/cluster/controller/migrations/0032_backfill_attempt_finished_at.py +52 -0
  106. marin_iris-0.99/iris/cluster/controller/migrations/0033_worker_task_history_fk_cascade.py +60 -0
  107. marin_iris-0.99/iris/cluster/controller/migrations/0034_task_summaries_covering_index.py +24 -0
  108. marin_iris-0.99/iris/cluster/controller/migrations/0035_drop_dead_logs_table.py +17 -0
  109. marin_iris-0.99/iris/cluster/controller/migrations/0036_reconcile_reservation_holder_attempt_ids.py +39 -0
  110. marin_iris-0.99/iris/cluster/controller/migrations/0037_drop_txn_log_and_txn_actions.py +19 -0
  111. marin_iris-0.99/iris/cluster/controller/migrations/0037_user_budget_default.py +18 -0
  112. marin_iris-0.99/iris/cluster/controller/migrations/0038_finalize_orphan_attempts.py +103 -0
  113. marin_iris-0.99/iris/cluster/controller/migrations/0039_requeue_split_coscheduled_jobs.py +214 -0
  114. marin_iris-0.99/iris/cluster/controller/migrations/0040_drop_resource_history_tables.py +35 -0
  115. marin_iris-0.99/iris/cluster/controller/migrations/0041_drop_worker_task_history.py +23 -0
  116. marin_iris-0.99/iris/cluster/controller/migrations/0042_drop_workers_dormant_columns.py +31 -0
  117. marin_iris-0.99/iris/cluster/controller/migrations/0043_drop_workers_committed_columns.py +33 -0
  118. marin_iris-0.99/iris/cluster/controller/migrations/0044_drop_dispatch_queue.py +45 -0
  119. marin_iris-0.99/iris/cluster/controller/migrations/0045_index_task_attempts_live_workerbound.py +40 -0
  120. marin_iris-0.99/iris/cluster/controller/migrations/0046_drop_slices_last_active_ms.py +22 -0
  121. marin_iris-0.99/iris/cluster/controller/provider.py +55 -0
  122. marin_iris-0.99/iris/cluster/controller/query.py +80 -0
  123. marin_iris-0.99/iris/cluster/controller/scheduler.py +940 -0
  124. marin_iris-0.99/iris/cluster/controller/schema.py +1710 -0
  125. marin_iris-0.99/iris/cluster/controller/service.py +2629 -0
  126. marin_iris-0.99/iris/cluster/controller/stores.py +2205 -0
  127. marin_iris-0.99/iris/cluster/controller/transitions.py +2764 -0
  128. marin_iris-0.99/iris/cluster/controller/vm_lifecycle.py +452 -0
  129. marin_iris-0.99/iris/cluster/controller/worker_health.py +199 -0
  130. marin_iris-0.99/iris/cluster/controller/worker_provider.py +289 -0
  131. marin_iris-0.99/iris/cluster/dashboard_common.py +181 -0
  132. marin_iris-0.99/iris/cluster/endpoints.py +187 -0
  133. marin_iris-0.99/iris/cluster/log_store_helpers.py +46 -0
  134. marin_iris-0.99/iris/cluster/process_status.py +105 -0
  135. marin_iris-0.99/iris/cluster/providers/__init__.py +30 -0
  136. marin_iris-0.99/iris/cluster/providers/_worker_base.py +116 -0
  137. marin_iris-0.99/iris/cluster/providers/factory.py +105 -0
  138. marin_iris-0.99/iris/cluster/providers/gcp/__init__.py +11 -0
  139. marin_iris-0.99/iris/cluster/providers/gcp/bootstrap.py +496 -0
  140. marin_iris-0.99/iris/cluster/providers/gcp/controller.py +378 -0
  141. marin_iris-0.99/iris/cluster/providers/gcp/fake.py +560 -0
  142. marin_iris-0.99/iris/cluster/providers/gcp/handles.py +492 -0
  143. marin_iris-0.99/iris/cluster/providers/gcp/local.py +171 -0
  144. marin_iris-0.99/iris/cluster/providers/gcp/service.py +948 -0
  145. marin_iris-0.99/iris/cluster/providers/gcp/ssh.py +158 -0
  146. marin_iris-0.99/iris/cluster/providers/gcp/workers.py +1029 -0
  147. marin_iris-0.99/iris/cluster/providers/k8s/__init__.py +4 -0
  148. marin_iris-0.99/iris/cluster/providers/k8s/bundle_fetch.py +84 -0
  149. marin_iris-0.99/iris/cluster/providers/k8s/constants.py +12 -0
  150. marin_iris-0.99/iris/cluster/providers/k8s/controller.py +919 -0
  151. marin_iris-0.99/iris/cluster/providers/k8s/fake.py +830 -0
  152. marin_iris-0.99/iris/cluster/providers/k8s/service.py +782 -0
  153. marin_iris-0.99/iris/cluster/providers/k8s/tasks.py +1680 -0
  154. marin_iris-0.99/iris/cluster/providers/k8s/types.py +146 -0
  155. marin_iris-0.99/iris/cluster/providers/local/__init__.py +2 -0
  156. marin_iris-0.99/iris/cluster/providers/local/cluster.py +338 -0
  157. marin_iris-0.99/iris/cluster/providers/manual/__init__.py +2 -0
  158. marin_iris-0.99/iris/cluster/providers/manual/provider.py +547 -0
  159. marin_iris-0.99/iris/cluster/providers/protocols.py +140 -0
  160. marin_iris-0.99/iris/cluster/providers/remote_exec.py +426 -0
  161. marin_iris-0.99/iris/cluster/providers/types.py +432 -0
  162. marin_iris-0.99/iris/cluster/redaction.py +93 -0
  163. marin_iris-0.99/iris/cluster/runtime/__init__.py +39 -0
  164. marin_iris-0.99/iris/cluster/runtime/docker.py +1182 -0
  165. marin_iris-0.99/iris/cluster/runtime/entrypoint.py +122 -0
  166. marin_iris-0.99/iris/cluster/runtime/env.py +134 -0
  167. marin_iris-0.99/iris/cluster/runtime/process.py +713 -0
  168. marin_iris-0.99/iris/cluster/runtime/profile.py +290 -0
  169. marin_iris-0.99/iris/cluster/runtime/types.py +385 -0
  170. marin_iris-0.99/iris/cluster/service_mode.py +10 -0
  171. marin_iris-0.99/iris/cluster/types.py +842 -0
  172. marin_iris-0.99/iris/cluster/worker/__init__.py +4 -0
  173. marin_iris-0.99/iris/cluster/worker/dashboard.py +61 -0
  174. marin_iris-0.99/iris/cluster/worker/env_probe.py +651 -0
  175. marin_iris-0.99/iris/cluster/worker/main.py +95 -0
  176. marin_iris-0.99/iris/cluster/worker/port_allocator.py +50 -0
  177. marin_iris-0.99/iris/cluster/worker/service.py +171 -0
  178. marin_iris-0.99/iris/cluster/worker/stats.py +151 -0
  179. marin_iris-0.99/iris/cluster/worker/task_attempt.py +1011 -0
  180. marin_iris-0.99/iris/cluster/worker/tpu_health.py +26 -0
  181. marin_iris-0.99/iris/cluster/worker/worker.py +1107 -0
  182. marin_iris-0.99/iris/cluster/worker/worker_types.py +70 -0
  183. marin_iris-0.99/iris/dev_tpu.py +87 -0
  184. marin_iris-0.99/iris/env_resources.py +174 -0
  185. marin_iris-0.99/iris/logging.py +12 -0
  186. marin_iris-0.99/iris/managed_thread.py +370 -0
  187. marin_iris-0.99/iris/rpc/__init__.py +12 -0
  188. marin_iris-0.99/iris/rpc/actor.proto +118 -0
  189. marin_iris-0.99/iris/rpc/actor_connect.py +513 -0
  190. marin_iris-0.99/iris/rpc/actor_pb2.py +70 -0
  191. marin_iris-0.99/iris/rpc/actor_pb2.pyi +134 -0
  192. marin_iris-0.99/iris/rpc/async_adapter.py +75 -0
  193. marin_iris-0.99/iris/rpc/auth.py +397 -0
  194. marin_iris-0.99/iris/rpc/codecs.py +62 -0
  195. marin_iris-0.99/iris/rpc/compression.py +23 -0
  196. marin_iris-0.99/iris/rpc/config.proto +534 -0
  197. marin_iris-0.99/iris/rpc/config_pb2.py +173 -0
  198. marin_iris-0.99/iris/rpc/config_pb2.pyi +581 -0
  199. marin_iris-0.99/iris/rpc/controller.proto +670 -0
  200. marin_iris-0.99/iris/rpc/controller_connect.py +2400 -0
  201. marin_iris-0.99/iris/rpc/controller_pb2.py +202 -0
  202. marin_iris-0.99/iris/rpc/controller_pb2.pyi +705 -0
  203. marin_iris-0.99/iris/rpc/errors.proto +28 -0
  204. marin_iris-0.99/iris/rpc/errors.py +301 -0
  205. marin_iris-0.99/iris/rpc/errors_pb2.py +38 -0
  206. marin_iris-0.99/iris/rpc/errors_pb2.pyi +19 -0
  207. marin_iris-0.99/iris/rpc/interceptors.py +190 -0
  208. marin_iris-0.99/iris/rpc/iris_logging.proto +46 -0
  209. marin_iris-0.99/iris/rpc/iris_logging_pb2.py +40 -0
  210. marin_iris-0.99/iris/rpc/iris_logging_pb2.pyi +39 -0
  211. marin_iris-0.99/iris/rpc/job.proto +621 -0
  212. marin_iris-0.99/iris/rpc/job_pb2.py +177 -0
  213. marin_iris-0.99/iris/rpc/job_pb2.pyi +768 -0
  214. marin_iris-0.99/iris/rpc/logging_pb2.py +9 -0
  215. marin_iris-0.99/iris/rpc/proto_utils.py +130 -0
  216. marin_iris-0.99/iris/rpc/query.proto +36 -0
  217. marin_iris-0.99/iris/rpc/query_pb2.py +41 -0
  218. marin_iris-0.99/iris/rpc/query_pb2.pyi +29 -0
  219. marin_iris-0.99/iris/rpc/stats.proto +70 -0
  220. marin_iris-0.99/iris/rpc/stats.py +289 -0
  221. marin_iris-0.99/iris/rpc/stats_connect.py +123 -0
  222. marin_iris-0.99/iris/rpc/stats_pb2.py +46 -0
  223. marin_iris-0.99/iris/rpc/stats_pb2.pyi +72 -0
  224. marin_iris-0.99/iris/rpc/stats_service.py +29 -0
  225. marin_iris-0.99/iris/rpc/time.proto +47 -0
  226. marin_iris-0.99/iris/rpc/time_pb2.py +39 -0
  227. marin_iris-0.99/iris/rpc/time_pb2.pyi +17 -0
  228. marin_iris-0.99/iris/rpc/vm.proto +189 -0
  229. marin_iris-0.99/iris/rpc/vm_pb2.py +89 -0
  230. marin_iris-0.99/iris/rpc/vm_pb2.pyi +288 -0
  231. marin_iris-0.99/iris/rpc/worker.proto +124 -0
  232. marin_iris-0.99/iris/rpc/worker_connect.py +709 -0
  233. marin_iris-0.99/iris/rpc/worker_pb2.py +73 -0
  234. marin_iris-0.99/iris/rpc/worker_pb2.pyi +109 -0
  235. marin_iris-0.99/iris/runtime/__init__.py +2 -0
  236. marin_iris-0.99/iris/runtime/jax_init.py +170 -0
  237. marin_iris-0.99/iris/test_util.py +65 -0
  238. marin_iris-0.99/iris/time_proto.py +28 -0
  239. marin_iris-0.99/iris/version.py +47 -0
  240. marin_iris-0.99/pyproject.toml +106 -0
@@ -0,0 +1,243 @@
1
+ # redundant, but Ray looks for this otherwise.
2
+ .git
3
+
4
+ logs/
5
+
6
+ # CPU profiles
7
+ prof/
8
+
9
+ # Downloaded build tools (zig, etc.)
10
+ .tools/
11
+
12
+ tests/snapshots/outputs
13
+ tests/snapshots/diffs
14
+
15
+ # don't commit data/MD outputs to git
16
+ data/*
17
+ output/*
18
+ outputs/*
19
+
20
+ # Snapshot diffs and outputs
21
+ tests/snapshots/*/outputs/*
22
+ tests/snapshots/*/diffs/*
23
+
24
+ # Mainly for Ray and for submodule use: ignore nested .git entries
25
+ */**/.git
26
+
27
+ ### Python template
28
+ # Byte-compiled / optimized / DLL files
29
+ __pycache__/
30
+ *.py[cod]
31
+ *$py.class
32
+
33
+ # C extensions
34
+ *.so
35
+
36
+ # pypa/gh-action-pypi-publish caches its Docker action manifest here.
37
+ .github/.tmp/
38
+
39
+ # Distribution / packaging
40
+ .Python
41
+ build/
42
+ develop-eggs/
43
+ dist/
44
+ downloads/
45
+ eggs/
46
+ .eggs/
47
+ lib64/
48
+ parts/
49
+ sdist/
50
+ local_store/
51
+ wheels/
52
+ share/python-wheels/
53
+ *.egg-info/
54
+ .installed.cfg
55
+ *.egg
56
+ MANIFEST
57
+
58
+ # PyInstaller
59
+ # Usually these files are written by a python script from a template
60
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
61
+ *.manifest
62
+ *.spec
63
+
64
+ # Installer logs
65
+ pip-log.txt
66
+ pip-delete-this-directory.txt
67
+
68
+ # Unit test / coverage reports
69
+ htmlcov/
70
+ .tox/
71
+ .nox/
72
+ .coverage
73
+ .coverage.*
74
+ .cache
75
+ nosetests.xml
76
+ coverage.xml
77
+ *.cover
78
+ *.py,cover
79
+ .hypothesis/
80
+ .pytest_cache/
81
+ cover/
82
+
83
+ # Translations
84
+ *.mo
85
+ *.pot
86
+
87
+ # Django stuff:
88
+ *.log
89
+ local_settings.py
90
+ db.sqlite3
91
+ db.sqlite3-journal
92
+
93
+ # Flask stuff:
94
+ instance/
95
+ .webassets-cache
96
+
97
+ # Scrapy stuff:
98
+ .scrapy
99
+
100
+ # Sphinx documentation
101
+ docs/_build/
102
+
103
+ # PyBuilder
104
+ .pybuilder/
105
+ target/
106
+
107
+ # Jupyter Notebook
108
+ .ipynb_checkpoints
109
+
110
+ # IPython
111
+ profile_default/
112
+ ipython_config.py
113
+
114
+ # pyenv
115
+ # For a library or package, you might want to ignore these files since the code is
116
+ # intended to run in multiple environments; otherwise, check them in:
117
+ # .python-version
118
+
119
+ # pipenv
120
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
121
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
122
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
123
+ # install all needed dependencies.
124
+ #Pipfile.lock
125
+
126
+ # poetry
127
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
128
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
129
+ # commonly ignored for libraries.
130
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
131
+ #poetry.lock
132
+
133
+ # pdm
134
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
135
+ #pdm.lock
136
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
137
+ # in version control.
138
+ # https://pdm.fming.dev/#use-with-ide
139
+ .pdm.toml
140
+
141
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
142
+ __pypackages__/
143
+
144
+ # Celery stuff
145
+ celerybeat-schedule
146
+ celerybeat.pid
147
+
148
+ # SageMath parsed files
149
+ *.sage.py
150
+
151
+ # Environments
152
+ .env
153
+ .venv
154
+ env/
155
+ venv/
156
+ ENV/
157
+ env.bak/
158
+ venv.bak/
159
+
160
+ # Spyder project settings
161
+ .spyderproject
162
+ .spyproject
163
+
164
+ # Rope project settings
165
+ .ropeproject
166
+
167
+ # mkdocs documentation
168
+ /site
169
+
170
+ # mypy
171
+ .mypy_cache/
172
+ .dmypy.json
173
+ dmypy.json
174
+
175
+ # Pyre type checker
176
+ .pyre/
177
+
178
+ # Ruff
179
+ .ruff_cache/
180
+
181
+ # pytype static type analyzer
182
+ .pytype/
183
+
184
+ # Cython debug symbols
185
+ cython_debug/
186
+
187
+ # PyCharm
188
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
189
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
190
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
191
+ # option (not recommended), the entries below ignore the entire .idea folder.
192
+ .idea/
193
+ *.iml
194
+
195
+ # IDE Config
196
+ .vscode/
197
+
198
+ # Mac OS
199
+ .DS_Store
200
+
201
+ # Secrets
202
+ credentials.json
203
+ marin/crawl/bigquery-gcs-key.json
204
+
205
+ # Archive
206
+ archive/
207
+
208
+ # Caches and Outputs
209
+ !/scripts/web/output/
210
+ !/output/
211
+
212
+ # csv
213
+ *.csv
214
+
215
+ # wandb logs
216
+ wandb
217
+
218
+ # Ignore generated credentials from google-github-actions/auth
219
+ gha-creds-*.json
220
+
221
+ .aider*
222
+ .git/*
223
+
224
+ *.jsonl
225
+ **/*.jsonl
226
+ scr/*
227
+ .weaver/
228
+
229
+ # Local host Marin config
230
+ .marin.yaml
231
+
232
+ /scratch
233
+
234
+ .forge
235
+ .claude
236
+ !.claude/skills
237
+ .agents/tmp/
238
+ .codex
239
+ .entire
240
+
241
+ .worktrees
242
+ .obsidian
243
+ .cw_env
@@ -0,0 +1,30 @@
1
+ Metadata-Version: 2.4
2
+ Name: marin-iris
3
+ Version: 0.99
4
+ Requires-Python: <3.13,>=3.11
5
+ Requires-Dist: click>=8.3.1
6
+ Requires-Dist: cloudpickle>=3.1.2
7
+ Requires-Dist: connect-python>=0.9.0
8
+ Requires-Dist: fsspec>=2024.0.0
9
+ Requires-Dist: gcsfs>=2024.0.0
10
+ Requires-Dist: google-auth>=2.0
11
+ Requires-Dist: google-cloud-tpu>=1.18.0
12
+ Requires-Dist: grpcio>=1.76.0
13
+ Requires-Dist: httpx>=0.28.1
14
+ Requires-Dist: humanfriendly>=10.0
15
+ Requires-Dist: marin-finelog==0.99
16
+ Requires-Dist: marin-rigging==0.99
17
+ Requires-Dist: pydantic>=2.0
18
+ Requires-Dist: pyjwt>=2.12.0
19
+ Requires-Dist: pyyaml>=6.0
20
+ Requires-Dist: s3fs>=2024.0.0
21
+ Requires-Dist: starlette>=0.50.0
22
+ Requires-Dist: tabulate>=0.9.0
23
+ Requires-Dist: typing-extensions>=4.0
24
+ Requires-Dist: uvicorn[standard]>=0.23.0
25
+ Requires-Dist: zstandard>=0.22.0
26
+ Provides-Extra: controller
27
+ Requires-Dist: duckdb>=1.0.0; extra == 'controller'
28
+ Requires-Dist: kubernetes>=31.0.0; extra == 'controller'
29
+ Requires-Dist: pyarrow>=19.0.0; extra == 'controller'
30
+ Provides-Extra: worker
@@ -0,0 +1,92 @@
1
+ # Persistent CoreWeave CI cluster. The CPU pool and the first H100 node stay
2
+ # warm between runs; the H100 pool may autoscale to a second node for manual
3
+ # multi-host canary runs.
4
+
5
+ platform:
6
+ label_prefix: iris-ci
7
+ coreweave:
8
+ region: US-WEST-04A
9
+ namespace: iris-ci
10
+ kubeconfig_path: ~/.kube/coreweave-iris
11
+ object_storage_endpoint: https://74981a43be0de7712369306c7b19133d.r2.cloudflarestorage.com
12
+
13
+ storage:
14
+ remote_state_dir: s3://marin-na/iris/state/ci
15
+
16
+ kubernetes_provider:
17
+ namespace: iris-ci
18
+ default_image: ghcr.io/marin-community/iris-task:latest
19
+ host_network: true
20
+ cache_dir: /mnt/local/iris-cache
21
+ controller_address: http://iris-ci-controller-svc.iris-ci.svc.cluster.local:10000
22
+
23
+ controller:
24
+ image: ghcr.io/marin-community/iris-controller:latest
25
+ coreweave:
26
+ port: 10000
27
+ service_name: iris-ci-controller-svc
28
+ scale_group: cpu-erapids
29
+
30
+ defaults:
31
+ autoscaler:
32
+ evaluation_interval:
33
+ milliseconds: 10000
34
+ scale_up_delay:
35
+ milliseconds: 60000
36
+ scale_down_delay:
37
+ milliseconds: 300000
38
+ startup_grace_period:
39
+ milliseconds: 1200000 # 20 min — nodes are pinned warm so this rarely fires
40
+ task_env:
41
+ MARIN_PREFIX: s3://marin-na/marin
42
+ # H100 hostNetwork pods also see IB and SR-IOV link-local interfaces, so pin NCCL's socket interface by exact name (a leading "=" means exact match).
43
+ NCCL_SOCKET_IFNAME: =enp157s0np0
44
+ worker:
45
+ docker_image: ghcr.io/marin-community/iris-worker:latest
46
+ port: 10001
47
+ cache_dir: /mnt/local/iris-cache
48
+ runtime: kubernetes
49
+ default_task_image: ghcr.io/marin-community/iris-task:latest
50
+
51
+ scale_groups:
52
+ cpu-erapids:
53
+ num_vms: 1
54
+ resources:
55
+ cpu: 64
56
+ ram: 256GB
57
+ disk: 1TB
58
+ device_type: cpu
59
+ capacity_type: on-demand
60
+ worker:
61
+ attributes:
62
+ pool: cpu-erapids
63
+ buffer_slices: 1
64
+ max_slices: 1
65
+ priority: 50
66
+ slice_template:
67
+ num_vms: 1
68
+ coreweave:
69
+ region: US-WEST-04A
70
+ instance_type: cd-gp-i64-erapids
71
+
72
+ h100-8x:
73
+ num_vms: 1
74
+ resources:
75
+ cpu: 128
76
+ ram: 2048GB
77
+ disk: 1TB
78
+ device_type: gpu
79
+ device_variant: H100
80
+ device_count: 8
81
+ capacity_type: on-demand
82
+ worker:
83
+ attributes:
84
+ pool: h100-8x
85
+ buffer_slices: 1
86
+ max_slices: 2
87
+ priority: 100
88
+ slice_template:
89
+ num_vms: 1
90
+ coreweave:
91
+ region: US-WEST-04A
92
+ instance_type: gd-8xh100ib-i128
@@ -0,0 +1,99 @@
1
+ # Iris on CoreWeave RNO2A (GH200, 1 GPU per node).
2
+ #
3
+ # Single GPU NodePool: gd-1xgh200. The controller is pinned onto this same pool
4
+ # because the cluster has no CPU nodes.
5
+ #
6
+ # Setup:
7
+ # 1. Split the multi-context kubeconfig (~/.kube/coreweave-rno2a holds both
8
+ # rno2a and usw09b) into a single-context file:
9
+ # KUBECONFIG=~/.kube/coreweave-rno2a \
10
+ # kubectl config view --minify --flatten --context=rno2a \
11
+ # > ~/.kube/cw-rno2a.yaml
12
+ # chmod 600 ~/.kube/cw-rno2a.yaml
13
+ # 2. Export CoreWeave object storage credentials as env vars (consumed by `iris cluster start`):
14
+ # export R2_ACCESS_KEY_ID=<...>
15
+ # export R2_SECRET_ACCESS_KEY=<...>
16
+ # 3. Start the cluster:
17
+ # cd lib/iris && uv run --group dev iris \
18
+ # --config=examples/coreweave-rno2a.yaml cluster start
19
+ #
20
+ # The controller Deployment includes the standard NVIDIA GPU toleration, so it
21
+ # can schedule onto this GPU-only cluster.
22
+
23
+ platform:
24
+ label_prefix: iris-rno2a
25
+ coreweave:
26
+ region: RNO2A
27
+ namespace: iris
28
+ kubeconfig_path: ~/.kube/cw-rno2a.yaml
29
+ # CoreWeave native object storage: virtual-hosted-style addressing
30
+ # (bucket goes in the subdomain). iris auto-detects cwobject.com domains
31
+ # and sets s3 addressing_style=virtual — see _needs_virtual_host_addressing.
32
+ # Bucket-less base endpoint. Virtual-host addressing prepends the bucket
33
+ # name as a subdomain at request time (s3://marin-poc/foo →
34
+ # https://marin-poc.cwobject.com/foo). Including the bucket here would
35
+ # double it (https://marin-poc.marin-poc.cwobject.com/...).
36
+ object_storage_endpoint: https://cwobject.com
37
+
38
+ storage:
39
+ remote_state_dir: s3://marin-poc/iris/state/rno2a
40
+
41
+ kubernetes_provider:
42
+ namespace: iris
43
+ default_image: ghcr.io/marin-community/iris-task:latest
44
+ # RNO2A GH200 nodes do not advertise rdma/ib devices. Keep host networking off
45
+ # so GPU task pods request only nvidia.com/gpu and can schedule.
46
+ host_network: false
47
+ cache_dir: /mnt/local/iris-cache
48
+ controller_address: http://iris-controller-svc.iris.svc.cluster.local:10000
49
+
50
+ controller:
51
+ image: ghcr.io/marin-community/iris-controller:latest
52
+ coreweave:
53
+ port: 10000
54
+ service_name: iris-controller-svc
55
+ scale_group: gh200-1x
56
+
57
+ defaults:
58
+ autoscaler:
59
+ evaluation_interval:
60
+ milliseconds: 10000
61
+ scale_up_delay:
62
+ milliseconds: 60000
63
+ scale_down_delay:
64
+ milliseconds: 300000
65
+ startup_grace_period:
66
+ milliseconds: 2400000 # 40 min — covers bare-metal node provisioning + Pod startup
67
+ task_env:
68
+ MARIN_PREFIX: s3://marin-na/marin
69
+ NCCL_SOCKET_IFNAME: =eth0
70
+ worker:
71
+ docker_image: ghcr.io/marin-community/iris-worker:latest
72
+ port: 10001
73
+ cache_dir: /mnt/local/iris-cache
74
+ runtime: kubernetes
75
+ default_task_image: ghcr.io/marin-community/iris-task:latest
76
+
77
+ scale_groups:
78
+ gh200-1x:
79
+ num_vms: 1
80
+ resources:
81
+ cpu: 64
82
+ ram: 256GB
83
+ disk: 1TB
84
+ device_type: gpu
85
+ device_variant: GH200
86
+ device_count: 1
87
+ capacity_type: on-demand
88
+ worker:
89
+ attributes:
90
+ pool: gh200-1x
91
+ # buffer_slices keeps one node warm so the controller always has a home.
92
+ buffer_slices: 1
93
+ max_slices: 2
94
+ priority: 100
95
+ slice_template:
96
+ num_vms: 1
97
+ coreweave:
98
+ region: RNO2A
99
+ instance_type: gd-1xgh200
@@ -0,0 +1,98 @@
1
+ # Iris on CoreWeave US-WEST-09B (B200, 8 GPUs per node).
2
+ #
3
+ # Single GPU NodePool: b200-8x. The controller is pinned onto this same pool
4
+ # because the cluster has no CPU nodes.
5
+ #
6
+ # Setup:
7
+ # 1. Split the multi-context kubeconfig (~/.kube/coreweave-rno2a holds both
8
+ # rno2a and usw09b) into a single-context file:
9
+ # KUBECONFIG=~/.kube/coreweave-rno2a \
10
+ # kubectl config view --minify --flatten --context=usw09b \
11
+ # > ~/.kube/cw-usw09b.yaml
12
+ # chmod 600 ~/.kube/cw-usw09b.yaml
13
+ # 2. Export CoreWeave object storage credentials as env vars (consumed by `iris cluster start`):
14
+ # export R2_ACCESS_KEY_ID=<...>
15
+ # export R2_SECRET_ACCESS_KEY=<...>
16
+ # 3. Start the cluster:
17
+ # cd lib/iris && uv run --group dev iris \
18
+ # --config=examples/coreweave-usw09b.yaml cluster start
19
+ #
20
+ # The controller Deployment includes the standard NVIDIA GPU toleration, so it
21
+ # can schedule onto this GPU-only cluster.
22
+
23
+ platform:
24
+ label_prefix: iris-usw09b
25
+ coreweave:
26
+ region: US-WEST-09B
27
+ namespace: iris
28
+ kubeconfig_path: ~/.kube/cw-usw09b.yaml
29
+ # CoreWeave native object storage. The marin-poc bucket lives in RNO2A;
30
+ # accessing it from US-WEST-09B is cross-region but works (just slower).
31
+ # iris auto-detects cwobject.com and switches addressing_style=virtual.
32
+ # Bucket-less base endpoint. Virtual-host addressing prepends the bucket
33
+ # name as a subdomain at request time (s3://marin-poc/foo →
34
+ # https://marin-poc.cwobject.com/foo). Including the bucket here would
35
+ # double it (https://marin-poc.marin-poc.cwobject.com/...).
36
+ object_storage_endpoint: https://cwobject.com
37
+
38
+ storage:
39
+ remote_state_dir: s3://marin-poc/iris/state/usw09b
40
+
41
+ kubernetes_provider:
42
+ namespace: iris
43
+ default_image: ghcr.io/marin-community/iris-task:latest
44
+ host_network: true
45
+ cache_dir: /mnt/local/iris-cache
46
+ controller_address: http://iris-controller-svc.iris.svc.cluster.local:10000
47
+
48
+ controller:
49
+ image: ghcr.io/marin-community/iris-controller:latest
50
+ coreweave:
51
+ port: 10000
52
+ service_name: iris-controller-svc
53
+ scale_group: b200-8x
54
+
55
+ defaults:
56
+ autoscaler:
57
+ evaluation_interval:
58
+ milliseconds: 10000
59
+ scale_up_delay:
60
+ milliseconds: 60000
61
+ scale_down_delay:
62
+ milliseconds: 300000
63
+ startup_grace_period:
64
+ milliseconds: 2400000 # 40 min — covers bare-metal node provisioning + Pod startup
65
+ task_env:
66
+ MARIN_PREFIX: s3://marin-na/marin
67
+ # B200 hostNetwork pods also see IB and SR-IOV link-local interfaces, so pin NCCL's socket interface by exact name (a leading "=" means exact match).
68
+ NCCL_SOCKET_IFNAME: =enp44s0np0
69
+ worker:
70
+ docker_image: ghcr.io/marin-community/iris-worker:latest
71
+ port: 10001
72
+ cache_dir: /mnt/local/iris-cache
73
+ runtime: kubernetes
74
+ default_task_image: ghcr.io/marin-community/iris-task:latest
75
+
76
+ scale_groups:
77
+ b200-8x:
78
+ num_vms: 1
79
+ resources:
80
+ cpu: 128
81
+ ram: 2048GB
82
+ disk: 1TB
83
+ device_type: gpu
84
+ device_variant: B200
85
+ device_count: 8
86
+ capacity_type: on-demand
87
+ worker:
88
+ attributes:
89
+ pool: b200-8x
90
+ # buffer_slices keeps one node warm so the controller always has a home.
91
+ buffer_slices: 1
92
+ max_slices: 2
93
+ priority: 100
94
+ slice_template:
95
+ num_vms: 1
96
+ coreweave:
97
+ region: US-WEST-09B
98
+ instance_type: b200-8x
@@ -0,0 +1,116 @@
1
+ # Iris development cluster configuration for CoreWeave CKS.
2
+ #
3
+ # Architecture: KubernetesProvider dispatches task pods directly to the k8s API.
4
+ # No worker daemons are needed. CoreWeave NodePool autoscaling provisions nodes
5
+ # on demand; when pods are deleted, NodePools scale to zero automatically.
6
+ # CPU nodes are kept always-on (buffer_slices=1 on cpu-erapids) for monitoring software.
7
+ #
8
+ # Workflow:
9
+ #
10
+ # 1. Set S3 object storage credentials (required for s3:// storage URIs):
11
+ # export R2_ACCESS_KEY_ID=<your-r2-access-key-id>
12
+ # export R2_SECRET_ACCESS_KEY=<your-r2-secret-access-key>
13
+ # These are created in the Cloudflare dashboard under R2 > Manage R2 API Tokens.
14
+ # `iris cluster start` creates a K8s Secret from these env vars automatically.
15
+ #
16
+ # 2. Start the cluster (creates RBAC, shared NodePools, ConfigMap, Deployment, Service):
17
+ # iris --config=lib/iris/examples/coreweave.yaml cluster start
18
+ #
19
+ # 3. Use the Iris CLI:
20
+ # iris --config=lib/iris/examples/coreweave.yaml cluster status
21
+ # iris --config=lib/iris/examples/coreweave.yaml cluster dashboard
22
+ #
23
+ # This config file is used by:
24
+ # - The CLI on the operator's laptop (for `cluster start`, `cluster status`, job submission)
25
+ # - The controller and workers inside the cluster (mounted as ConfigMap at /etc/iris/config.yaml)
26
+ #
27
+ # To use a local kubeconfig (e.g. from CoreWeave Console > Tokens > Download):
28
+ # Set platform.coreweave.kubeconfig_path below, or:
29
+ # export KUBECONFIG=~/.kube/coreweave-iris
30
+
31
+ platform:
32
+ label_prefix: coreweave
33
+ coreweave:
34
+ region: US-WEST-04A
35
+ namespace: iris
36
+ kubeconfig_path: ~/.kube/coreweave-iris
37
+ object_storage_endpoint: https://74981a43be0de7712369306c7b19133d.r2.cloudflarestorage.com
38
+
39
+ storage:
40
+ remote_state_dir: s3://marin-na/iris/coreweave/state
41
+
42
+ kubernetes_provider:
43
+ namespace: iris
44
+ default_image: ghcr.io/marin-community/iris-task:latest
45
+ host_network: true # Required for NCCL/IB multi-host traffic
46
+ cache_dir: /mnt/local/iris-cache # NVMe; default /cache hits the 15GB ramdisk on bare-metal GPU nodes
47
+ controller_address: http://iris-controller-svc.iris.svc.cluster.local:10000
48
+
49
+ controller:
50
+ image: ghcr.io/marin-community/iris-controller:latest
51
+ coreweave:
52
+ port: 10000
53
+ service_name: iris-controller-svc
54
+ scale_group: cpu-erapids
55
+
56
+ defaults:
57
+ autoscaler:
58
+ evaluation_interval:
59
+ milliseconds: 10000
60
+ scale_up_delay:
61
+ milliseconds: 60000
62
+ scale_down_delay:
63
+ milliseconds: 300000
64
+ startup_grace_period:
65
+ milliseconds: 2400000 # 40 min — covers autoscaler node provisioning + Pod startup
66
+ task_env:
67
+ MARIN_PREFIX: s3://marin-na/marin
68
+ worker:
69
+ docker_image: ghcr.io/marin-community/iris-worker:latest
70
+ port: 10001
71
+ cache_dir: /mnt/local/iris-cache
72
+ runtime: kubernetes
73
+ default_task_image: ghcr.io/marin-community/iris-task:latest
74
+
75
+ scale_groups:
76
+ cpu-erapids:
77
+ num_vms: 1
78
+ resources:
79
+ cpu: 64
80
+ ram: 256GB
81
+ disk: 1TB
82
+ device_type: cpu
83
+ capacity_type: on-demand
84
+ worker:
85
+ attributes:
86
+ pool: cpu-erapids
87
+ buffer_slices: 1
88
+ max_slices: 4
89
+ priority: 50
90
+ slice_template:
91
+ num_vms: 1
92
+ coreweave:
93
+ region: US-WEST-04A
94
+ instance_type: cd-gp-i64-erapids
95
+
96
+ h100-8x:
97
+ num_vms: 1
98
+ resources:
99
+ cpu: 128
100
+ ram: 2048GB
101
+ disk: 1TB
102
+ device_type: gpu
103
+ device_variant: H100
104
+ device_count: 8
105
+ capacity_type: on-demand
106
+ worker:
107
+ attributes:
108
+ pool: h100-8x
109
+ buffer_slices: 0
110
+ max_slices: 8
111
+ priority: 100
112
+ slice_template:
113
+ num_vms: 1
114
+ coreweave:
115
+ region: US-WEST-04A
116
+ instance_type: gd-8xh100ib-i128
@@ -0,0 +1,35 @@
1
+ # Iris local configuration with GCP OAuth2 access token authentication
2
+ # Usage: iris --cluster=local-auth-gcp cluster start --local
3
+ # Requires: gcloud auth login && gcloud auth application-default login
4
+
5
+ platform:
6
+ local:
7
+
8
+ controller:
9
+ local:
10
+ port: 0 # auto-assign
11
+
12
+ worker_provider: {}
13
+
14
+ auth:
15
+ gcp:
16
+ project_id: hai-gcp-models # GCP project ID — users must have access to this project to log in
17
+ admin_users:
18
+ - russell.power@gmail.com # Replace with actual admin email
19
+ optional: true
20
+
21
+ scale_groups:
22
+ cpu:
23
+ num_vms: 1
24
+ resources:
25
+ cpu: 16
26
+ ram: 32GB
27
+ disk: 100GB
28
+ device_type: cpu
29
+ device_variant: cpu
30
+ capacity_type: on-demand
31
+ buffer_slices: 1
32
+ max_slices: 4
33
+ slice_template:
34
+ num_vms: 1
35
+ local: