marin-iris 0.99__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- marin_iris-0.99/.gitignore +243 -0
- marin_iris-0.99/PKG-INFO +30 -0
- marin_iris-0.99/examples/coreweave-ci.yaml +92 -0
- marin_iris-0.99/examples/coreweave-rno2a.yaml +99 -0
- marin_iris-0.99/examples/coreweave-usw09b.yaml +98 -0
- marin_iris-0.99/examples/coreweave.yaml +116 -0
- marin_iris-0.99/examples/local-auth-gcp.yaml +35 -0
- marin_iris-0.99/examples/local-auth-static.yaml +36 -0
- marin_iris-0.99/examples/local.yaml +29 -0
- marin_iris-0.99/examples/marin-dev.yaml +145 -0
- marin_iris-0.99/examples/marin.yaml +223 -0
- marin_iris-0.99/examples/smoke-gcp.yaml +71 -0
- marin_iris-0.99/examples/test.yaml +165 -0
- marin_iris-0.99/examples/tpu-demo.ipynb +461 -0
- marin_iris-0.99/iris/__init__.py +2 -0
- marin_iris-0.99/iris/_build_info.py +3 -0
- marin_iris-0.99/iris/actor/__init__.py +35 -0
- marin_iris-0.99/iris/actor/client.py +223 -0
- marin_iris-0.99/iris/actor/pool.py +281 -0
- marin_iris-0.99/iris/actor/resolver.py +108 -0
- marin_iris-0.99/iris/actor/server.py +355 -0
- marin_iris-0.99/iris/chaos.py +98 -0
- marin_iris-0.99/iris/cli/__init__.py +12 -0
- marin_iris-0.99/iris/cli/actor.py +69 -0
- marin_iris-0.99/iris/cli/bug_report.py +528 -0
- marin_iris-0.99/iris/cli/build.py +493 -0
- marin_iris-0.99/iris/cli/cluster.py +1142 -0
- marin_iris-0.99/iris/cli/job.py +1261 -0
- marin_iris-0.99/iris/cli/main.py +486 -0
- marin_iris-0.99/iris/cli/process_status.py +194 -0
- marin_iris-0.99/iris/cli/query.py +82 -0
- marin_iris-0.99/iris/cli/rpc.py +327 -0
- marin_iris-0.99/iris/cli/task.py +70 -0
- marin_iris-0.99/iris/cli/token_store.py +125 -0
- marin_iris-0.99/iris/client/__init__.py +49 -0
- marin_iris-0.99/iris/client/client.py +1081 -0
- marin_iris-0.99/iris/client/resolver.py +102 -0
- marin_iris-0.99/iris/client/worker_pool.py +595 -0
- marin_iris-0.99/iris/cluster/__init__.py +2 -0
- marin_iris-0.99/iris/cluster/bundle.py +185 -0
- marin_iris-0.99/iris/cluster/client/__init__.py +22 -0
- marin_iris-0.99/iris/cluster/client/bundle.py +213 -0
- marin_iris-0.99/iris/cluster/client/job_info.py +167 -0
- marin_iris-0.99/iris/cluster/client/protocol.py +108 -0
- marin_iris-0.99/iris/cluster/client/remote_client.py +501 -0
- marin_iris-0.99/iris/cluster/config.py +1331 -0
- marin_iris-0.99/iris/cluster/constraints.py +1169 -0
- marin_iris-0.99/iris/cluster/controller/__init__.py +2 -0
- marin_iris-0.99/iris/cluster/controller/actor_proxy.py +104 -0
- marin_iris-0.99/iris/cluster/controller/auth.py +424 -0
- marin_iris-0.99/iris/cluster/controller/autoscaler/__init__.py +6 -0
- marin_iris-0.99/iris/cluster/controller/autoscaler/models.py +75 -0
- marin_iris-0.99/iris/cluster/controller/autoscaler/operations.py +176 -0
- marin_iris-0.99/iris/cluster/controller/autoscaler/planning.py +135 -0
- marin_iris-0.99/iris/cluster/controller/autoscaler/recovery.py +136 -0
- marin_iris-0.99/iris/cluster/controller/autoscaler/routing.py +597 -0
- marin_iris-0.99/iris/cluster/controller/autoscaler/runtime.py +641 -0
- marin_iris-0.99/iris/cluster/controller/autoscaler/scaling_group.py +1340 -0
- marin_iris-0.99/iris/cluster/controller/autoscaler/status.py +169 -0
- marin_iris-0.99/iris/cluster/controller/autoscaler/worker_registry.py +175 -0
- marin_iris-0.99/iris/cluster/controller/budget.py +222 -0
- marin_iris-0.99/iris/cluster/controller/checkpoint.py +421 -0
- marin_iris-0.99/iris/cluster/controller/codec.py +117 -0
- marin_iris-0.99/iris/cluster/controller/controller.py +2671 -0
- marin_iris-0.99/iris/cluster/controller/dashboard.py +801 -0
- marin_iris-0.99/iris/cluster/controller/db.py +993 -0
- marin_iris-0.99/iris/cluster/controller/endpoint_proxy.py +288 -0
- marin_iris-0.99/iris/cluster/controller/main.py +358 -0
- marin_iris-0.99/iris/cluster/controller/migrations/0001_init.py +10 -0
- marin_iris-0.99/iris/cluster/controller/migrations/0002_read_indexes.py +8 -0
- marin_iris-0.99/iris/cluster/controller/migrations/0003_normalize_scaling_groups.py +39 -0
- marin_iris-0.99/iris/cluster/controller/migrations/0004_api_keys.py +38 -0
- marin_iris-0.99/iris/cluster/controller/migrations/0004_worker_indexes.py +15 -0
- marin_iris-0.99/iris/cluster/controller/migrations/0005_task_profiles.py +32 -0
- marin_iris-0.99/iris/cluster/controller/migrations/0006_jwt_signing_key.py +16 -0
- marin_iris-0.99/iris/cluster/controller/migrations/0007_perf_indexes.py +19 -0
- marin_iris-0.99/iris/cluster/controller/migrations/0008_jobs_name.py +38 -0
- marin_iris-0.99/iris/cluster/controller/migrations/0009_query_indexes.py +24 -0
- marin_iris-0.99/iris/cluster/controller/migrations/0010_dashboard_indexes.py +42 -0
- marin_iris-0.99/iris/cluster/controller/migrations/0010_purge_orphaned_endpoints.py +15 -0
- marin_iris-0.99/iris/cluster/controller/migrations/0011_direct_provider.py +33 -0
- marin_iris-0.99/iris/cluster/controller/migrations/0012_container_name.py +16 -0
- marin_iris-0.99/iris/cluster/controller/migrations/0012_separate_auth_db.py +53 -0
- marin_iris-0.99/iris/cluster/controller/migrations/0013_has_reservation.py +46 -0
- marin_iris-0.99/iris/cluster/controller/migrations/0014_profile_kind.py +36 -0
- marin_iris-0.99/iris/cluster/controller/migrations/0015_drop_redundant_index.py +11 -0
- marin_iris-0.99/iris/cluster/controller/migrations/0016_worker_scheduling_fields.py +57 -0
- marin_iris-0.99/iris/cluster/controller/migrations/0017_job_scheduling_fields.py +72 -0
- marin_iris-0.99/iris/cluster/controller/migrations/0018_task_assignment_fields.py +30 -0
- marin_iris-0.99/iris/cluster/controller/migrations/0019_worker_fk_cascade.py +73 -0
- marin_iris-0.99/iris/cluster/controller/migrations/0020_perf_indices_and_profiles_fk.py +53 -0
- marin_iris-0.99/iris/cluster/controller/migrations/0021_budgets.py +38 -0
- marin_iris-0.99/iris/cluster/controller/migrations/0022_workers_slice_and_group.py +28 -0
- marin_iris-0.99/iris/cluster/controller/migrations/0023_separate_profiles_db.py +63 -0
- marin_iris-0.99/iris/cluster/controller/migrations/0024_normalize_resource_usage.py +47 -0
- marin_iris-0.99/iris/cluster/controller/migrations/0024_task_resource_history.py +25 -0
- marin_iris-0.99/iris/cluster/controller/migrations/0025_normalize_resource_snapshots.py +105 -0
- marin_iris-0.99/iris/cluster/controller/migrations/0026_normalize_worker_metadata.py +95 -0
- marin_iris-0.99/iris/cluster/controller/migrations/0027_normalize_job_resources.py +101 -0
- marin_iris-0.99/iris/cluster/controller/migrations/0028_job_config_table.py +273 -0
- marin_iris-0.99/iris/cluster/controller/migrations/0029_drop_task_resource_usage_columns.py +30 -0
- marin_iris-0.99/iris/cluster/controller/migrations/0030_backfill_worker_region.py +57 -0
- marin_iris-0.99/iris/cluster/controller/migrations/0030_job_submit_argv.py +13 -0
- marin_iris-0.99/iris/cluster/controller/migrations/0031_auto_vacuum_incremental.py +24 -0
- marin_iris-0.99/iris/cluster/controller/migrations/0032_backfill_attempt_finished_at.py +52 -0
- marin_iris-0.99/iris/cluster/controller/migrations/0033_worker_task_history_fk_cascade.py +60 -0
- marin_iris-0.99/iris/cluster/controller/migrations/0034_task_summaries_covering_index.py +24 -0
- marin_iris-0.99/iris/cluster/controller/migrations/0035_drop_dead_logs_table.py +17 -0
- marin_iris-0.99/iris/cluster/controller/migrations/0036_reconcile_reservation_holder_attempt_ids.py +39 -0
- marin_iris-0.99/iris/cluster/controller/migrations/0037_drop_txn_log_and_txn_actions.py +19 -0
- marin_iris-0.99/iris/cluster/controller/migrations/0037_user_budget_default.py +18 -0
- marin_iris-0.99/iris/cluster/controller/migrations/0038_finalize_orphan_attempts.py +103 -0
- marin_iris-0.99/iris/cluster/controller/migrations/0039_requeue_split_coscheduled_jobs.py +214 -0
- marin_iris-0.99/iris/cluster/controller/migrations/0040_drop_resource_history_tables.py +35 -0
- marin_iris-0.99/iris/cluster/controller/migrations/0041_drop_worker_task_history.py +23 -0
- marin_iris-0.99/iris/cluster/controller/migrations/0042_drop_workers_dormant_columns.py +31 -0
- marin_iris-0.99/iris/cluster/controller/migrations/0043_drop_workers_committed_columns.py +33 -0
- marin_iris-0.99/iris/cluster/controller/migrations/0044_drop_dispatch_queue.py +45 -0
- marin_iris-0.99/iris/cluster/controller/migrations/0045_index_task_attempts_live_workerbound.py +40 -0
- marin_iris-0.99/iris/cluster/controller/migrations/0046_drop_slices_last_active_ms.py +22 -0
- marin_iris-0.99/iris/cluster/controller/provider.py +55 -0
- marin_iris-0.99/iris/cluster/controller/query.py +80 -0
- marin_iris-0.99/iris/cluster/controller/scheduler.py +940 -0
- marin_iris-0.99/iris/cluster/controller/schema.py +1710 -0
- marin_iris-0.99/iris/cluster/controller/service.py +2629 -0
- marin_iris-0.99/iris/cluster/controller/stores.py +2205 -0
- marin_iris-0.99/iris/cluster/controller/transitions.py +2764 -0
- marin_iris-0.99/iris/cluster/controller/vm_lifecycle.py +452 -0
- marin_iris-0.99/iris/cluster/controller/worker_health.py +199 -0
- marin_iris-0.99/iris/cluster/controller/worker_provider.py +289 -0
- marin_iris-0.99/iris/cluster/dashboard_common.py +181 -0
- marin_iris-0.99/iris/cluster/endpoints.py +187 -0
- marin_iris-0.99/iris/cluster/log_store_helpers.py +46 -0
- marin_iris-0.99/iris/cluster/process_status.py +105 -0
- marin_iris-0.99/iris/cluster/providers/__init__.py +30 -0
- marin_iris-0.99/iris/cluster/providers/_worker_base.py +116 -0
- marin_iris-0.99/iris/cluster/providers/factory.py +105 -0
- marin_iris-0.99/iris/cluster/providers/gcp/__init__.py +11 -0
- marin_iris-0.99/iris/cluster/providers/gcp/bootstrap.py +496 -0
- marin_iris-0.99/iris/cluster/providers/gcp/controller.py +378 -0
- marin_iris-0.99/iris/cluster/providers/gcp/fake.py +560 -0
- marin_iris-0.99/iris/cluster/providers/gcp/handles.py +492 -0
- marin_iris-0.99/iris/cluster/providers/gcp/local.py +171 -0
- marin_iris-0.99/iris/cluster/providers/gcp/service.py +948 -0
- marin_iris-0.99/iris/cluster/providers/gcp/ssh.py +158 -0
- marin_iris-0.99/iris/cluster/providers/gcp/workers.py +1029 -0
- marin_iris-0.99/iris/cluster/providers/k8s/__init__.py +4 -0
- marin_iris-0.99/iris/cluster/providers/k8s/bundle_fetch.py +84 -0
- marin_iris-0.99/iris/cluster/providers/k8s/constants.py +12 -0
- marin_iris-0.99/iris/cluster/providers/k8s/controller.py +919 -0
- marin_iris-0.99/iris/cluster/providers/k8s/fake.py +830 -0
- marin_iris-0.99/iris/cluster/providers/k8s/service.py +782 -0
- marin_iris-0.99/iris/cluster/providers/k8s/tasks.py +1680 -0
- marin_iris-0.99/iris/cluster/providers/k8s/types.py +146 -0
- marin_iris-0.99/iris/cluster/providers/local/__init__.py +2 -0
- marin_iris-0.99/iris/cluster/providers/local/cluster.py +338 -0
- marin_iris-0.99/iris/cluster/providers/manual/__init__.py +2 -0
- marin_iris-0.99/iris/cluster/providers/manual/provider.py +547 -0
- marin_iris-0.99/iris/cluster/providers/protocols.py +140 -0
- marin_iris-0.99/iris/cluster/providers/remote_exec.py +426 -0
- marin_iris-0.99/iris/cluster/providers/types.py +432 -0
- marin_iris-0.99/iris/cluster/redaction.py +93 -0
- marin_iris-0.99/iris/cluster/runtime/__init__.py +39 -0
- marin_iris-0.99/iris/cluster/runtime/docker.py +1182 -0
- marin_iris-0.99/iris/cluster/runtime/entrypoint.py +122 -0
- marin_iris-0.99/iris/cluster/runtime/env.py +134 -0
- marin_iris-0.99/iris/cluster/runtime/process.py +713 -0
- marin_iris-0.99/iris/cluster/runtime/profile.py +290 -0
- marin_iris-0.99/iris/cluster/runtime/types.py +385 -0
- marin_iris-0.99/iris/cluster/service_mode.py +10 -0
- marin_iris-0.99/iris/cluster/types.py +842 -0
- marin_iris-0.99/iris/cluster/worker/__init__.py +4 -0
- marin_iris-0.99/iris/cluster/worker/dashboard.py +61 -0
- marin_iris-0.99/iris/cluster/worker/env_probe.py +651 -0
- marin_iris-0.99/iris/cluster/worker/main.py +95 -0
- marin_iris-0.99/iris/cluster/worker/port_allocator.py +50 -0
- marin_iris-0.99/iris/cluster/worker/service.py +171 -0
- marin_iris-0.99/iris/cluster/worker/stats.py +151 -0
- marin_iris-0.99/iris/cluster/worker/task_attempt.py +1011 -0
- marin_iris-0.99/iris/cluster/worker/tpu_health.py +26 -0
- marin_iris-0.99/iris/cluster/worker/worker.py +1107 -0
- marin_iris-0.99/iris/cluster/worker/worker_types.py +70 -0
- marin_iris-0.99/iris/dev_tpu.py +87 -0
- marin_iris-0.99/iris/env_resources.py +174 -0
- marin_iris-0.99/iris/logging.py +12 -0
- marin_iris-0.99/iris/managed_thread.py +370 -0
- marin_iris-0.99/iris/rpc/__init__.py +12 -0
- marin_iris-0.99/iris/rpc/actor.proto +118 -0
- marin_iris-0.99/iris/rpc/actor_connect.py +513 -0
- marin_iris-0.99/iris/rpc/actor_pb2.py +70 -0
- marin_iris-0.99/iris/rpc/actor_pb2.pyi +134 -0
- marin_iris-0.99/iris/rpc/async_adapter.py +75 -0
- marin_iris-0.99/iris/rpc/auth.py +397 -0
- marin_iris-0.99/iris/rpc/codecs.py +62 -0
- marin_iris-0.99/iris/rpc/compression.py +23 -0
- marin_iris-0.99/iris/rpc/config.proto +534 -0
- marin_iris-0.99/iris/rpc/config_pb2.py +173 -0
- marin_iris-0.99/iris/rpc/config_pb2.pyi +581 -0
- marin_iris-0.99/iris/rpc/controller.proto +670 -0
- marin_iris-0.99/iris/rpc/controller_connect.py +2400 -0
- marin_iris-0.99/iris/rpc/controller_pb2.py +202 -0
- marin_iris-0.99/iris/rpc/controller_pb2.pyi +705 -0
- marin_iris-0.99/iris/rpc/errors.proto +28 -0
- marin_iris-0.99/iris/rpc/errors.py +301 -0
- marin_iris-0.99/iris/rpc/errors_pb2.py +38 -0
- marin_iris-0.99/iris/rpc/errors_pb2.pyi +19 -0
- marin_iris-0.99/iris/rpc/interceptors.py +190 -0
- marin_iris-0.99/iris/rpc/iris_logging.proto +46 -0
- marin_iris-0.99/iris/rpc/iris_logging_pb2.py +40 -0
- marin_iris-0.99/iris/rpc/iris_logging_pb2.pyi +39 -0
- marin_iris-0.99/iris/rpc/job.proto +621 -0
- marin_iris-0.99/iris/rpc/job_pb2.py +177 -0
- marin_iris-0.99/iris/rpc/job_pb2.pyi +768 -0
- marin_iris-0.99/iris/rpc/logging_pb2.py +9 -0
- marin_iris-0.99/iris/rpc/proto_utils.py +130 -0
- marin_iris-0.99/iris/rpc/query.proto +36 -0
- marin_iris-0.99/iris/rpc/query_pb2.py +41 -0
- marin_iris-0.99/iris/rpc/query_pb2.pyi +29 -0
- marin_iris-0.99/iris/rpc/stats.proto +70 -0
- marin_iris-0.99/iris/rpc/stats.py +289 -0
- marin_iris-0.99/iris/rpc/stats_connect.py +123 -0
- marin_iris-0.99/iris/rpc/stats_pb2.py +46 -0
- marin_iris-0.99/iris/rpc/stats_pb2.pyi +72 -0
- marin_iris-0.99/iris/rpc/stats_service.py +29 -0
- marin_iris-0.99/iris/rpc/time.proto +47 -0
- marin_iris-0.99/iris/rpc/time_pb2.py +39 -0
- marin_iris-0.99/iris/rpc/time_pb2.pyi +17 -0
- marin_iris-0.99/iris/rpc/vm.proto +189 -0
- marin_iris-0.99/iris/rpc/vm_pb2.py +89 -0
- marin_iris-0.99/iris/rpc/vm_pb2.pyi +288 -0
- marin_iris-0.99/iris/rpc/worker.proto +124 -0
- marin_iris-0.99/iris/rpc/worker_connect.py +709 -0
- marin_iris-0.99/iris/rpc/worker_pb2.py +73 -0
- marin_iris-0.99/iris/rpc/worker_pb2.pyi +109 -0
- marin_iris-0.99/iris/runtime/__init__.py +2 -0
- marin_iris-0.99/iris/runtime/jax_init.py +170 -0
- marin_iris-0.99/iris/test_util.py +65 -0
- marin_iris-0.99/iris/time_proto.py +28 -0
- marin_iris-0.99/iris/version.py +47 -0
- marin_iris-0.99/pyproject.toml +106 -0
|
@@ -0,0 +1,243 @@
|
|
|
1
|
+
# redundant, but Ray looks for this otherwise.
|
|
2
|
+
.git
|
|
3
|
+
|
|
4
|
+
logs/
|
|
5
|
+
|
|
6
|
+
# CPU profiles
|
|
7
|
+
prof/
|
|
8
|
+
|
|
9
|
+
# Downloaded build tools (zig, etc.)
|
|
10
|
+
.tools/
|
|
11
|
+
|
|
12
|
+
tests/snapshots/outputs
|
|
13
|
+
tests/snapshots/diffs
|
|
14
|
+
|
|
15
|
+
# Don't commit data/MD outputs to git
|
|
16
|
+
data/*
|
|
17
|
+
output/*
|
|
18
|
+
outputs/*
|
|
19
|
+
|
|
20
|
+
# Snapshot diffs and outputs
|
|
21
|
+
tests/snapshots/*/outputs/*
|
|
22
|
+
tests/snapshots/*/diffs/*
|
|
23
|
+
|
|
24
|
+
# This is mainly for Ray and for using submodules
|
|
25
|
+
*/**/.git
|
|
26
|
+
|
|
27
|
+
### Python template
|
|
28
|
+
# Byte-compiled / optimized / DLL files
|
|
29
|
+
__pycache__/
|
|
30
|
+
*.py[cod]
|
|
31
|
+
*$py.class
|
|
32
|
+
|
|
33
|
+
# C extensions
|
|
34
|
+
*.so
|
|
35
|
+
|
|
36
|
+
# pypa/gh-action-pypi-publish caches its Docker action manifest here.
|
|
37
|
+
.github/.tmp/
|
|
38
|
+
|
|
39
|
+
# Distribution / packaging
|
|
40
|
+
.Python
|
|
41
|
+
build/
|
|
42
|
+
develop-eggs/
|
|
43
|
+
dist/
|
|
44
|
+
downloads/
|
|
45
|
+
eggs/
|
|
46
|
+
.eggs/
|
|
47
|
+
lib64/
|
|
48
|
+
parts/
|
|
49
|
+
sdist/
|
|
50
|
+
local_store/
|
|
51
|
+
wheels/
|
|
52
|
+
share/python-wheels/
|
|
53
|
+
*.egg-info/
|
|
54
|
+
.installed.cfg
|
|
55
|
+
*.egg
|
|
56
|
+
MANIFEST
|
|
57
|
+
|
|
58
|
+
# PyInstaller
|
|
59
|
+
# Usually these files are written by a python script from a template
|
|
60
|
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
|
61
|
+
*.manifest
|
|
62
|
+
*.spec
|
|
63
|
+
|
|
64
|
+
# Installer logs
|
|
65
|
+
pip-log.txt
|
|
66
|
+
pip-delete-this-directory.txt
|
|
67
|
+
|
|
68
|
+
# Unit test / coverage reports
|
|
69
|
+
htmlcov/
|
|
70
|
+
.tox/
|
|
71
|
+
.nox/
|
|
72
|
+
.coverage
|
|
73
|
+
.coverage.*
|
|
74
|
+
.cache
|
|
75
|
+
nosetests.xml
|
|
76
|
+
coverage.xml
|
|
77
|
+
*.cover
|
|
78
|
+
*.py,cover
|
|
79
|
+
.hypothesis/
|
|
80
|
+
.pytest_cache/
|
|
81
|
+
cover/
|
|
82
|
+
|
|
83
|
+
# Translations
|
|
84
|
+
*.mo
|
|
85
|
+
*.pot
|
|
86
|
+
|
|
87
|
+
# Django stuff:
|
|
88
|
+
*.log
|
|
89
|
+
local_settings.py
|
|
90
|
+
db.sqlite3
|
|
91
|
+
db.sqlite3-journal
|
|
92
|
+
|
|
93
|
+
# Flask stuff:
|
|
94
|
+
instance/
|
|
95
|
+
.webassets-cache
|
|
96
|
+
|
|
97
|
+
# Scrapy stuff:
|
|
98
|
+
.scrapy
|
|
99
|
+
|
|
100
|
+
# Sphinx documentation
|
|
101
|
+
docs/_build/
|
|
102
|
+
|
|
103
|
+
# PyBuilder
|
|
104
|
+
.pybuilder/
|
|
105
|
+
target/
|
|
106
|
+
|
|
107
|
+
# Jupyter Notebook
|
|
108
|
+
.ipynb_checkpoints
|
|
109
|
+
|
|
110
|
+
# IPython
|
|
111
|
+
profile_default/
|
|
112
|
+
ipython_config.py
|
|
113
|
+
|
|
114
|
+
# pyenv
|
|
115
|
+
# For a library or package, you might want to ignore these files since the code is
|
|
116
|
+
# intended to run in multiple environments; otherwise, check them in:
|
|
117
|
+
# .python-version
|
|
118
|
+
|
|
119
|
+
# pipenv
|
|
120
|
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
|
121
|
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
|
122
|
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
|
123
|
+
# install all needed dependencies.
|
|
124
|
+
#Pipfile.lock
|
|
125
|
+
|
|
126
|
+
# poetry
|
|
127
|
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
|
128
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
129
|
+
# commonly ignored for libraries.
|
|
130
|
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
|
131
|
+
#poetry.lock
|
|
132
|
+
|
|
133
|
+
# pdm
|
|
134
|
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
|
135
|
+
#pdm.lock
|
|
136
|
+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
|
137
|
+
# in version control.
|
|
138
|
+
# https://pdm.fming.dev/#use-with-ide
|
|
139
|
+
.pdm.toml
|
|
140
|
+
|
|
141
|
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
|
142
|
+
__pypackages__/
|
|
143
|
+
|
|
144
|
+
# Celery stuff
|
|
145
|
+
celerybeat-schedule
|
|
146
|
+
celerybeat.pid
|
|
147
|
+
|
|
148
|
+
# SageMath parsed files
|
|
149
|
+
*.sage.py
|
|
150
|
+
|
|
151
|
+
# Environments
|
|
152
|
+
.env
|
|
153
|
+
.venv
|
|
154
|
+
env/
|
|
155
|
+
venv/
|
|
156
|
+
ENV/
|
|
157
|
+
env.bak/
|
|
158
|
+
venv.bak/
|
|
159
|
+
|
|
160
|
+
# Spyder project settings
|
|
161
|
+
.spyderproject
|
|
162
|
+
.spyproject
|
|
163
|
+
|
|
164
|
+
# Rope project settings
|
|
165
|
+
.ropeproject
|
|
166
|
+
|
|
167
|
+
# mkdocs documentation
|
|
168
|
+
/site
|
|
169
|
+
|
|
170
|
+
# mypy
|
|
171
|
+
.mypy_cache/
|
|
172
|
+
.dmypy.json
|
|
173
|
+
dmypy.json
|
|
174
|
+
|
|
175
|
+
# Pyre type checker
|
|
176
|
+
.pyre/
|
|
177
|
+
|
|
178
|
+
# Ruff
|
|
179
|
+
.ruff_cache/
|
|
180
|
+
|
|
181
|
+
# pytype static type analyzer
|
|
182
|
+
.pytype/
|
|
183
|
+
|
|
184
|
+
# Cython debug symbols
|
|
185
|
+
cython_debug/
|
|
186
|
+
|
|
187
|
+
# PyCharm
|
|
188
|
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
|
189
|
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
|
190
|
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
|
191
|
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
|
192
|
+
.idea/
|
|
193
|
+
*.iml
|
|
194
|
+
|
|
195
|
+
# IDE Config
|
|
196
|
+
.vscode/
|
|
197
|
+
|
|
198
|
+
# Mac OS
|
|
199
|
+
.DS_Store
|
|
200
|
+
|
|
201
|
+
# Secrets
|
|
202
|
+
credentials.json
|
|
203
|
+
marin/crawl/bigquery-gcs-key.json
|
|
204
|
+
|
|
205
|
+
# Archive
|
|
206
|
+
archive/
|
|
207
|
+
|
|
208
|
+
# Caches and Outputs
|
|
209
|
+
!/scripts/web/output/
|
|
210
|
+
!/output/
|
|
211
|
+
|
|
212
|
+
# csv
|
|
213
|
+
*.csv
|
|
214
|
+
|
|
215
|
+
# wandb logs
|
|
216
|
+
wandb
|
|
217
|
+
|
|
218
|
+
# Ignore generated credentials from google-github-actions/auth
|
|
219
|
+
gha-creds-*.json
|
|
220
|
+
|
|
221
|
+
.aider*
|
|
222
|
+
.git/*
|
|
223
|
+
|
|
224
|
+
*.jsonl
|
|
225
|
+
**/*.jsonl
|
|
226
|
+
scr/*
|
|
227
|
+
.weaver/
|
|
228
|
+
|
|
229
|
+
# Local host Marin config
|
|
230
|
+
.marin.yaml
|
|
231
|
+
|
|
232
|
+
/scratch
|
|
233
|
+
|
|
234
|
+
.forge
|
|
235
|
+
.claude
|
|
236
|
+
!.claude/skills
|
|
237
|
+
.agents/tmp/
|
|
238
|
+
.codex
|
|
239
|
+
.entire
|
|
240
|
+
|
|
241
|
+
.worktrees
|
|
242
|
+
.obsidian
|
|
243
|
+
.cw_env
|
marin_iris-0.99/PKG-INFO
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: marin-iris
|
|
3
|
+
Version: 0.99
|
|
4
|
+
Requires-Python: <3.13,>=3.11
|
|
5
|
+
Requires-Dist: click>=8.3.1
|
|
6
|
+
Requires-Dist: cloudpickle>=3.1.2
|
|
7
|
+
Requires-Dist: connect-python>=0.9.0
|
|
8
|
+
Requires-Dist: fsspec>=2024.0.0
|
|
9
|
+
Requires-Dist: gcsfs>=2024.0.0
|
|
10
|
+
Requires-Dist: google-auth>=2.0
|
|
11
|
+
Requires-Dist: google-cloud-tpu>=1.18.0
|
|
12
|
+
Requires-Dist: grpcio>=1.76.0
|
|
13
|
+
Requires-Dist: httpx>=0.28.1
|
|
14
|
+
Requires-Dist: humanfriendly>=10.0
|
|
15
|
+
Requires-Dist: marin-finelog==0.99
|
|
16
|
+
Requires-Dist: marin-rigging==0.99
|
|
17
|
+
Requires-Dist: pydantic>=2.0
|
|
18
|
+
Requires-Dist: pyjwt>=2.12.0
|
|
19
|
+
Requires-Dist: pyyaml>=6.0
|
|
20
|
+
Requires-Dist: s3fs>=2024.0.0
|
|
21
|
+
Requires-Dist: starlette>=0.50.0
|
|
22
|
+
Requires-Dist: tabulate>=0.9.0
|
|
23
|
+
Requires-Dist: typing-extensions>=4.0
|
|
24
|
+
Requires-Dist: uvicorn[standard]>=0.23.0
|
|
25
|
+
Requires-Dist: zstandard>=0.22.0
|
|
26
|
+
Provides-Extra: controller
|
|
27
|
+
Requires-Dist: duckdb>=1.0.0; extra == 'controller'
|
|
28
|
+
Requires-Dist: kubernetes>=31.0.0; extra == 'controller'
|
|
29
|
+
Requires-Dist: pyarrow>=19.0.0; extra == 'controller'
|
|
30
|
+
Provides-Extra: worker
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
# Persistent CoreWeave CI cluster. The CPU pool and the first H100 node stay
|
|
2
|
+
# warm between runs; the H100 pool may autoscale to a second node for manual
|
|
3
|
+
# multi-host canary runs.
|
|
4
|
+
|
|
5
|
+
platform:
|
|
6
|
+
label_prefix: iris-ci
|
|
7
|
+
coreweave:
|
|
8
|
+
region: US-WEST-04A
|
|
9
|
+
namespace: iris-ci
|
|
10
|
+
kubeconfig_path: ~/.kube/coreweave-iris
|
|
11
|
+
object_storage_endpoint: https://74981a43be0de7712369306c7b19133d.r2.cloudflarestorage.com
|
|
12
|
+
|
|
13
|
+
storage:
|
|
14
|
+
remote_state_dir: s3://marin-na/iris/state/ci
|
|
15
|
+
|
|
16
|
+
kubernetes_provider:
|
|
17
|
+
namespace: iris-ci
|
|
18
|
+
default_image: ghcr.io/marin-community/iris-task:latest
|
|
19
|
+
host_network: true
|
|
20
|
+
cache_dir: /mnt/local/iris-cache
|
|
21
|
+
controller_address: http://iris-ci-controller-svc.iris-ci.svc.cluster.local:10000
|
|
22
|
+
|
|
23
|
+
controller:
|
|
24
|
+
image: ghcr.io/marin-community/iris-controller:latest
|
|
25
|
+
coreweave:
|
|
26
|
+
port: 10000
|
|
27
|
+
service_name: iris-ci-controller-svc
|
|
28
|
+
scale_group: cpu-erapids
|
|
29
|
+
|
|
30
|
+
defaults:
|
|
31
|
+
autoscaler:
|
|
32
|
+
evaluation_interval:
|
|
33
|
+
milliseconds: 10000
|
|
34
|
+
scale_up_delay:
|
|
35
|
+
milliseconds: 60000
|
|
36
|
+
scale_down_delay:
|
|
37
|
+
milliseconds: 300000
|
|
38
|
+
startup_grace_period:
|
|
39
|
+
milliseconds: 1200000 # 20 min — nodes are pinned warm so this rarely fires
|
|
40
|
+
task_env:
|
|
41
|
+
MARIN_PREFIX: s3://marin-na/marin
|
|
42
|
+
# H100 hostNetwork pods also see IB and SR-IOV link-local interfaces, so pin NCCL to one exact interface (leading "=" means exact match).
|
|
43
|
+
NCCL_SOCKET_IFNAME: =enp157s0np0
|
|
44
|
+
worker:
|
|
45
|
+
docker_image: ghcr.io/marin-community/iris-worker:latest
|
|
46
|
+
port: 10001
|
|
47
|
+
cache_dir: /mnt/local/iris-cache
|
|
48
|
+
runtime: kubernetes
|
|
49
|
+
default_task_image: ghcr.io/marin-community/iris-task:latest
|
|
50
|
+
|
|
51
|
+
scale_groups:
|
|
52
|
+
cpu-erapids:
|
|
53
|
+
num_vms: 1
|
|
54
|
+
resources:
|
|
55
|
+
cpu: 64
|
|
56
|
+
ram: 256GB
|
|
57
|
+
disk: 1TB
|
|
58
|
+
device_type: cpu
|
|
59
|
+
capacity_type: on-demand
|
|
60
|
+
worker:
|
|
61
|
+
attributes:
|
|
62
|
+
pool: cpu-erapids
|
|
63
|
+
buffer_slices: 1
|
|
64
|
+
max_slices: 1
|
|
65
|
+
priority: 50
|
|
66
|
+
slice_template:
|
|
67
|
+
num_vms: 1
|
|
68
|
+
coreweave:
|
|
69
|
+
region: US-WEST-04A
|
|
70
|
+
instance_type: cd-gp-i64-erapids
|
|
71
|
+
|
|
72
|
+
h100-8x:
|
|
73
|
+
num_vms: 1
|
|
74
|
+
resources:
|
|
75
|
+
cpu: 128
|
|
76
|
+
ram: 2048GB
|
|
77
|
+
disk: 1TB
|
|
78
|
+
device_type: gpu
|
|
79
|
+
device_variant: H100
|
|
80
|
+
device_count: 8
|
|
81
|
+
capacity_type: on-demand
|
|
82
|
+
worker:
|
|
83
|
+
attributes:
|
|
84
|
+
pool: h100-8x
|
|
85
|
+
buffer_slices: 1
|
|
86
|
+
max_slices: 2
|
|
87
|
+
priority: 100
|
|
88
|
+
slice_template:
|
|
89
|
+
num_vms: 1
|
|
90
|
+
coreweave:
|
|
91
|
+
region: US-WEST-04A
|
|
92
|
+
instance_type: gd-8xh100ib-i128
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
# Iris on CoreWeave RNO2A (GH200, 1 GPU per node).
|
|
2
|
+
#
|
|
3
|
+
# Single GPU NodePool: gd-1xgh200. The controller is pinned onto this same pool
|
|
4
|
+
# because the cluster has no CPU nodes.
|
|
5
|
+
#
|
|
6
|
+
# Setup:
|
|
7
|
+
# 1. Split the multi-context kubeconfig (~/.kube/coreweave-rno2a holds both
|
|
8
|
+
# rno2a and usw09b) into a single-context file:
|
|
9
|
+
# KUBECONFIG=~/.kube/coreweave-rno2a \
|
|
10
|
+
# kubectl config view --minify --flatten --context=rno2a \
|
|
11
|
+
# > ~/.kube/cw-rno2a.yaml
|
|
12
|
+
# chmod 600 ~/.kube/cw-rno2a.yaml
|
|
13
|
+
# 2. CoreWeave object storage credentials in env (consumed by `iris cluster start`):
|
|
14
|
+
# export R2_ACCESS_KEY_ID=<...>
|
|
15
|
+
# export R2_SECRET_ACCESS_KEY=<...>
|
|
16
|
+
# 3. Start the cluster:
|
|
17
|
+
# cd lib/iris && uv run --group dev iris \
|
|
18
|
+
# --config=examples/coreweave-rno2a.yaml cluster start
|
|
19
|
+
#
|
|
20
|
+
# The controller Deployment includes the standard NVIDIA GPU toleration, so it
|
|
21
|
+
# can schedule onto this GPU-only cluster.
|
|
22
|
+
|
|
23
|
+
platform:
|
|
24
|
+
label_prefix: iris-rno2a
|
|
25
|
+
coreweave:
|
|
26
|
+
region: RNO2A
|
|
27
|
+
namespace: iris
|
|
28
|
+
kubeconfig_path: ~/.kube/cw-rno2a.yaml
|
|
29
|
+
# CoreWeave native object storage: virtual-hosted-style addressing
|
|
30
|
+
# (bucket goes in the subdomain). iris auto-detects cwobject.com domains
|
|
31
|
+
# and sets s3 addressing_style=virtual — see _needs_virtual_host_addressing.
|
|
32
|
+
# Use the bucket-less base endpoint: virtual-host addressing prepends the bucket
|
|
33
|
+
# name as a subdomain at request time (s3://marin-poc/foo →
|
|
34
|
+
# https://marin-poc.cwobject.com/foo). Including the bucket here would
|
|
35
|
+
# double it (https://marin-poc.marin-poc.cwobject.com/...).
|
|
36
|
+
object_storage_endpoint: https://cwobject.com
|
|
37
|
+
|
|
38
|
+
storage:
|
|
39
|
+
remote_state_dir: s3://marin-poc/iris/state/rno2a
|
|
40
|
+
|
|
41
|
+
kubernetes_provider:
|
|
42
|
+
namespace: iris
|
|
43
|
+
default_image: ghcr.io/marin-community/iris-task:latest
|
|
44
|
+
# RNO2A GH200 nodes do not advertise rdma/ib devices. Keep host networking off
|
|
45
|
+
# so GPU task pods request only nvidia.com/gpu and can schedule.
|
|
46
|
+
host_network: false
|
|
47
|
+
cache_dir: /mnt/local/iris-cache
|
|
48
|
+
controller_address: http://iris-controller-svc.iris.svc.cluster.local:10000
|
|
49
|
+
|
|
50
|
+
controller:
|
|
51
|
+
image: ghcr.io/marin-community/iris-controller:latest
|
|
52
|
+
coreweave:
|
|
53
|
+
port: 10000
|
|
54
|
+
service_name: iris-controller-svc
|
|
55
|
+
scale_group: gh200-1x
|
|
56
|
+
|
|
57
|
+
defaults:
|
|
58
|
+
autoscaler:
|
|
59
|
+
evaluation_interval:
|
|
60
|
+
milliseconds: 10000
|
|
61
|
+
scale_up_delay:
|
|
62
|
+
milliseconds: 60000
|
|
63
|
+
scale_down_delay:
|
|
64
|
+
milliseconds: 300000
|
|
65
|
+
startup_grace_period:
|
|
66
|
+
milliseconds: 2400000 # 40 min — covers bare-metal node provisioning + Pod startup
|
|
67
|
+
task_env:
|
|
68
|
+
MARIN_PREFIX: s3://marin-na/marin
|
|
69
|
+
NCCL_SOCKET_IFNAME: =eth0
|
|
70
|
+
worker:
|
|
71
|
+
docker_image: ghcr.io/marin-community/iris-worker:latest
|
|
72
|
+
port: 10001
|
|
73
|
+
cache_dir: /mnt/local/iris-cache
|
|
74
|
+
runtime: kubernetes
|
|
75
|
+
default_task_image: ghcr.io/marin-community/iris-task:latest
|
|
76
|
+
|
|
77
|
+
scale_groups:
|
|
78
|
+
gh200-1x:
|
|
79
|
+
num_vms: 1
|
|
80
|
+
resources:
|
|
81
|
+
cpu: 64
|
|
82
|
+
ram: 256GB
|
|
83
|
+
disk: 1TB
|
|
84
|
+
device_type: gpu
|
|
85
|
+
device_variant: GH200
|
|
86
|
+
device_count: 1
|
|
87
|
+
capacity_type: on-demand
|
|
88
|
+
worker:
|
|
89
|
+
attributes:
|
|
90
|
+
pool: gh200-1x
|
|
91
|
+
# buffer_slices keeps one node warm so the controller always has a home.
|
|
92
|
+
buffer_slices: 1
|
|
93
|
+
max_slices: 2
|
|
94
|
+
priority: 100
|
|
95
|
+
slice_template:
|
|
96
|
+
num_vms: 1
|
|
97
|
+
coreweave:
|
|
98
|
+
region: RNO2A
|
|
99
|
+
instance_type: gd-1xgh200
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
# Iris on CoreWeave US-WEST-09B (B200, 8 GPUs per node).
|
|
2
|
+
#
|
|
3
|
+
# Single GPU NodePool: b200-8x. The controller is pinned onto this same pool
|
|
4
|
+
# because the cluster has no CPU nodes.
|
|
5
|
+
#
|
|
6
|
+
# Setup:
|
|
7
|
+
# 1. Split the multi-context kubeconfig (~/.kube/coreweave-rno2a holds both
|
|
8
|
+
# rno2a and usw09b) into a single-context file:
|
|
9
|
+
# KUBECONFIG=~/.kube/coreweave-rno2a \
|
|
10
|
+
# kubectl config view --minify --flatten --context=usw09b \
|
|
11
|
+
# > ~/.kube/cw-usw09b.yaml
|
|
12
|
+
# chmod 600 ~/.kube/cw-usw09b.yaml
|
|
13
|
+
# 2. CoreWeave object storage credentials in env (consumed by `iris cluster start`):
|
|
14
|
+
# export R2_ACCESS_KEY_ID=<...>
|
|
15
|
+
# export R2_SECRET_ACCESS_KEY=<...>
|
|
16
|
+
# 3. Start the cluster:
|
|
17
|
+
# cd lib/iris && uv run --group dev iris \
|
|
18
|
+
# --config=examples/coreweave-usw09b.yaml cluster start
|
|
19
|
+
#
|
|
20
|
+
# The controller Deployment includes the standard NVIDIA GPU toleration, so it
|
|
21
|
+
# can schedule onto this GPU-only cluster.
|
|
22
|
+
|
|
23
|
+
platform:
|
|
24
|
+
label_prefix: iris-usw09b
|
|
25
|
+
coreweave:
|
|
26
|
+
region: US-WEST-09B
|
|
27
|
+
namespace: iris
|
|
28
|
+
kubeconfig_path: ~/.kube/cw-usw09b.yaml
|
|
29
|
+
# CoreWeave native object storage. The marin-poc bucket lives in RNO2A;
|
|
30
|
+
# accessing it from US-WEST-09B is cross-region but works (just slower).
|
|
31
|
+
# iris auto-detects cwobject.com and switches addressing_style=virtual.
|
|
32
|
+
# Use a bucket-less base endpoint: virtual-host addressing prepends the bucket
|
|
33
|
+
# name as a subdomain at request time (s3://marin-poc/foo →
|
|
34
|
+
# https://marin-poc.cwobject.com/foo). Including the bucket here would
|
|
35
|
+
# double it (https://marin-poc.marin-poc.cwobject.com/...).
|
|
36
|
+
object_storage_endpoint: https://cwobject.com
|
|
37
|
+
|
|
38
|
+
storage:
|
|
39
|
+
remote_state_dir: s3://marin-poc/iris/state/usw09b
|
|
40
|
+
|
|
41
|
+
kubernetes_provider:
|
|
42
|
+
namespace: iris
|
|
43
|
+
default_image: ghcr.io/marin-community/iris-task:latest
|
|
44
|
+
host_network: true
|
|
45
|
+
cache_dir: /mnt/local/iris-cache
|
|
46
|
+
controller_address: http://iris-controller-svc.iris.svc.cluster.local:10000
|
|
47
|
+
|
|
48
|
+
controller:
|
|
49
|
+
image: ghcr.io/marin-community/iris-controller:latest
|
|
50
|
+
coreweave:
|
|
51
|
+
port: 10000
|
|
52
|
+
service_name: iris-controller-svc
|
|
53
|
+
scale_group: b200-8x
|
|
54
|
+
|
|
55
|
+
defaults:
|
|
56
|
+
autoscaler:
|
|
57
|
+
evaluation_interval:
|
|
58
|
+
milliseconds: 10000
|
|
59
|
+
scale_up_delay:
|
|
60
|
+
milliseconds: 60000
|
|
61
|
+
scale_down_delay:
|
|
62
|
+
milliseconds: 300000
|
|
63
|
+
startup_grace_period:
|
|
64
|
+
milliseconds: 2400000 # 40 min — covers bare-metal node provisioning + Pod startup
|
|
65
|
+
task_env:
|
|
66
|
+
MARIN_PREFIX: s3://marin-na/marin
|
|
67
|
+
# B200 hostNetwork pods also see IB and SR-IOV link-local interfaces.
|
|
68
|
+
NCCL_SOCKET_IFNAME: =enp44s0np0
|
|
69
|
+
worker:
|
|
70
|
+
docker_image: ghcr.io/marin-community/iris-worker:latest
|
|
71
|
+
port: 10001
|
|
72
|
+
cache_dir: /mnt/local/iris-cache
|
|
73
|
+
runtime: kubernetes
|
|
74
|
+
default_task_image: ghcr.io/marin-community/iris-task:latest
|
|
75
|
+
|
|
76
|
+
scale_groups:
|
|
77
|
+
b200-8x:
|
|
78
|
+
num_vms: 1
|
|
79
|
+
resources:
|
|
80
|
+
cpu: 128
|
|
81
|
+
ram: 2048GB
|
|
82
|
+
disk: 1TB
|
|
83
|
+
device_type: gpu
|
|
84
|
+
device_variant: B200
|
|
85
|
+
device_count: 8
|
|
86
|
+
capacity_type: on-demand
|
|
87
|
+
worker:
|
|
88
|
+
attributes:
|
|
89
|
+
pool: b200-8x
|
|
90
|
+
# buffer_slices keeps one node warm so the controller always has a home.
|
|
91
|
+
buffer_slices: 1
|
|
92
|
+
max_slices: 2
|
|
93
|
+
priority: 100
|
|
94
|
+
slice_template:
|
|
95
|
+
num_vms: 1
|
|
96
|
+
coreweave:
|
|
97
|
+
region: US-WEST-09B
|
|
98
|
+
instance_type: b200-8x
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
# Iris development cluster configuration for CoreWeave CKS.
|
|
2
|
+
#
|
|
3
|
+
# Architecture: KubernetesProvider dispatches task pods directly to the k8s API.
|
|
4
|
+
# No worker daemons are needed. CoreWeave NodePool autoscaling provisions nodes
|
|
5
|
+
# on demand; when pods are deleted, NodePools scale to zero automatically.
|
|
6
|
+
# CPU nodes are kept always-on (buffer_slices=1) for monitoring software.
|
|
7
|
+
#
|
|
8
|
+
# Workflow:
|
|
9
|
+
#
|
|
10
|
+
# 1. Set S3 object storage credentials (required for s3:// storage URIs):
|
|
11
|
+
# export R2_ACCESS_KEY_ID=<your-r2-access-key-id>
|
|
12
|
+
# export R2_SECRET_ACCESS_KEY=<your-r2-secret-access-key>
|
|
13
|
+
# These are created in the Cloudflare dashboard under R2 > Manage R2 API Tokens.
|
|
14
|
+
# `iris cluster start` creates a K8s Secret from these env vars automatically.
|
|
15
|
+
#
|
|
16
|
+
# 2. Start the cluster (creates RBAC, shared NodePools, ConfigMap, Deployment, Service):
|
|
17
|
+
# iris --config=lib/iris/examples/coreweave.yaml cluster start
|
|
18
|
+
#
|
|
19
|
+
# 3. Use the Iris CLI:
|
|
20
|
+
# iris --config=lib/iris/examples/coreweave.yaml cluster status
|
|
21
|
+
# iris --config=lib/iris/examples/coreweave.yaml cluster dashboard
|
|
22
|
+
#
|
|
23
|
+
# This config file is used by:
|
|
24
|
+
# - The CLI on the operator's laptop (for `cluster start`, `cluster status`, job submission)
|
|
25
|
+
# - The controller and workers inside the cluster (mounted as ConfigMap at /etc/iris/config.yaml)
|
|
26
|
+
#
|
|
27
|
+
# To use a local kubeconfig (e.g. from CoreWeave Console > Tokens > Download):
|
|
28
|
+
# Set platform.coreweave.kubeconfig_path below, or:
|
|
29
|
+
# export KUBECONFIG=~/.kube/coreweave-iris
|
|
30
|
+
|
|
31
|
+
platform:
|
|
32
|
+
label_prefix: coreweave
|
|
33
|
+
coreweave:
|
|
34
|
+
region: US-WEST-04A
|
|
35
|
+
namespace: iris
|
|
36
|
+
kubeconfig_path: ~/.kube/coreweave-iris
|
|
37
|
+
object_storage_endpoint: https://74981a43be0de7712369306c7b19133d.r2.cloudflarestorage.com
|
|
38
|
+
|
|
39
|
+
storage:
|
|
40
|
+
remote_state_dir: s3://marin-na/iris/coreweave/state
|
|
41
|
+
|
|
42
|
+
kubernetes_provider:
|
|
43
|
+
namespace: iris
|
|
44
|
+
default_image: ghcr.io/marin-community/iris-task:latest
|
|
45
|
+
host_network: true # Required for NCCL/IB multi-host traffic
|
|
46
|
+
cache_dir: /mnt/local/iris-cache # NVMe; default /cache hits the 15GB ramdisk on bare-metal GPU nodes
|
|
47
|
+
controller_address: http://iris-controller-svc.iris.svc.cluster.local:10000
|
|
48
|
+
|
|
49
|
+
controller:
|
|
50
|
+
image: ghcr.io/marin-community/iris-controller:latest
|
|
51
|
+
coreweave:
|
|
52
|
+
port: 10000
|
|
53
|
+
service_name: iris-controller-svc
|
|
54
|
+
scale_group: cpu-erapids
|
|
55
|
+
|
|
56
|
+
defaults:
|
|
57
|
+
autoscaler:
|
|
58
|
+
evaluation_interval:
|
|
59
|
+
milliseconds: 10000
|
|
60
|
+
scale_up_delay:
|
|
61
|
+
milliseconds: 60000
|
|
62
|
+
scale_down_delay:
|
|
63
|
+
milliseconds: 300000
|
|
64
|
+
startup_grace_period:
|
|
65
|
+
milliseconds: 2400000 # 40 min — covers autoscaler node provisioning + Pod startup
|
|
66
|
+
task_env:
|
|
67
|
+
MARIN_PREFIX: s3://marin-na/marin
|
|
68
|
+
worker:
|
|
69
|
+
docker_image: ghcr.io/marin-community/iris-worker:latest
|
|
70
|
+
port: 10001
|
|
71
|
+
cache_dir: /mnt/local/iris-cache
|
|
72
|
+
runtime: kubernetes
|
|
73
|
+
default_task_image: ghcr.io/marin-community/iris-task:latest
|
|
74
|
+
|
|
75
|
+
scale_groups:
|
|
76
|
+
cpu-erapids:
|
|
77
|
+
num_vms: 1
|
|
78
|
+
resources:
|
|
79
|
+
cpu: 64
|
|
80
|
+
ram: 256GB
|
|
81
|
+
disk: 1TB
|
|
82
|
+
device_type: cpu
|
|
83
|
+
capacity_type: on-demand
|
|
84
|
+
worker:
|
|
85
|
+
attributes:
|
|
86
|
+
pool: cpu-erapids
|
|
87
|
+
buffer_slices: 1
|
|
88
|
+
max_slices: 4
|
|
89
|
+
priority: 50
|
|
90
|
+
slice_template:
|
|
91
|
+
num_vms: 1
|
|
92
|
+
coreweave:
|
|
93
|
+
region: US-WEST-04A
|
|
94
|
+
instance_type: cd-gp-i64-erapids
|
|
95
|
+
|
|
96
|
+
h100-8x:
|
|
97
|
+
num_vms: 1
|
|
98
|
+
resources:
|
|
99
|
+
cpu: 128
|
|
100
|
+
ram: 2048GB
|
|
101
|
+
disk: 1TB
|
|
102
|
+
device_type: gpu
|
|
103
|
+
device_variant: H100
|
|
104
|
+
device_count: 8
|
|
105
|
+
capacity_type: on-demand
|
|
106
|
+
worker:
|
|
107
|
+
attributes:
|
|
108
|
+
pool: h100-8x
|
|
109
|
+
buffer_slices: 0
|
|
110
|
+
max_slices: 8
|
|
111
|
+
priority: 100
|
|
112
|
+
slice_template:
|
|
113
|
+
num_vms: 1
|
|
114
|
+
coreweave:
|
|
115
|
+
region: US-WEST-04A
|
|
116
|
+
instance_type: gd-8xh100ib-i128
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
# Iris local configuration with GCP OAuth2 access token authentication
|
|
2
|
+
# Usage: iris --cluster=local-auth-gcp cluster start --local
|
|
3
|
+
# Requires: gcloud auth login && gcloud auth application-default login
|
|
4
|
+
|
|
5
|
+
platform:
|
|
6
|
+
local:
|
|
7
|
+
|
|
8
|
+
controller:
|
|
9
|
+
local:
|
|
10
|
+
port: 0 # auto-assign
|
|
11
|
+
|
|
12
|
+
worker_provider: {}
|
|
13
|
+
|
|
14
|
+
auth:
|
|
15
|
+
gcp:
|
|
16
|
+
project_id: hai-gcp-models # GCP project ID — users must have access to log in
|
|
17
|
+
admin_users:
|
|
18
|
+
- russell.power@gmail.com # Replace with actual admin email
|
|
19
|
+
optional: true
|
|
20
|
+
|
|
21
|
+
scale_groups:
|
|
22
|
+
cpu:
|
|
23
|
+
num_vms: 1
|
|
24
|
+
resources:
|
|
25
|
+
cpu: 16
|
|
26
|
+
ram: 32GB
|
|
27
|
+
disk: 100GB
|
|
28
|
+
device_type: cpu
|
|
29
|
+
device_variant: cpu
|
|
30
|
+
capacity_type: on-demand
|
|
31
|
+
buffer_slices: 1
|
|
32
|
+
max_slices: 4
|
|
33
|
+
slice_template:
|
|
34
|
+
num_vms: 1
|
|
35
|
+
local:
|