ramjetio 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. ramjetio-0.4.0/.gitignore +150 -0
  2. ramjetio-0.4.0/CHANGELOG.md +38 -0
  3. ramjetio-0.4.0/LICENSE +141 -0
  4. ramjetio-0.4.0/Makefile +89 -0
  5. ramjetio-0.4.0/PKG-INFO +247 -0
  6. ramjetio-0.4.0/README.md +210 -0
  7. ramjetio-0.4.0/docs/API.md +283 -0
  8. ramjetio-0.4.0/docs/INTEGRATION.md +473 -0
  9. ramjetio-0.4.0/docs/TROUBLESHOOTING.md +227 -0
  10. ramjetio-0.4.0/examples/accelerate_example.py +85 -0
  11. ramjetio-0.4.0/examples/deepspeed_example.py +99 -0
  12. ramjetio-0.4.0/examples/simple.py +58 -0
  13. ramjetio-0.4.0/examples/torchrun_ddp.py +187 -0
  14. ramjetio-0.4.0/pyproject.toml +106 -0
  15. ramjetio-0.4.0/ramjetio/__init__.py +798 -0
  16. ramjetio-0.4.0/ramjetio/backend_client.py +554 -0
  17. ramjetio-0.4.0/ramjetio/cache.py +313 -0
  18. ramjetio-0.4.0/ramjetio/cli/__init__.py +1 -0
  19. ramjetio-0.4.0/ramjetio/cli/client.py +135 -0
  20. ramjetio-0.4.0/ramjetio/cli/server.py +160 -0
  21. ramjetio-0.4.0/ramjetio/config.py +63 -0
  22. ramjetio-0.4.0/ramjetio/consistent_hash.py +203 -0
  23. ramjetio-0.4.0/ramjetio/datasets.py +1629 -0
  24. ramjetio-0.4.0/ramjetio/helpers.py +244 -0
  25. ramjetio-0.4.0/ramjetio/proto/__init__.py +112 -0
  26. ramjetio-0.4.0/ramjetio/proto/ramjet.proto +419 -0
  27. ramjetio-0.4.0/ramjetio/pytorch.py +251 -0
  28. ramjetio-0.4.0/ramjetio/serialization.py +102 -0
  29. ramjetio-0.4.0/ramjetio/server.py +689 -0
  30. ramjetio-0.4.0/ramjetio.egg-info/PKG-INFO +247 -0
  31. ramjetio-0.4.0/ramjetio.egg-info/SOURCES.txt +39 -0
  32. ramjetio-0.4.0/ramjetio.egg-info/dependency_links.txt +1 -0
  33. ramjetio-0.4.0/ramjetio.egg-info/entry_points.txt +3 -0
  34. ramjetio-0.4.0/ramjetio.egg-info/requires.txt +13 -0
  35. ramjetio-0.4.0/ramjetio.egg-info/top_level.txt +1 -0
  36. ramjetio-0.4.0/requirements.txt +24 -0
  37. ramjetio-0.4.0/setup.cfg +4 -0
  38. ramjetio-0.4.0/tests/__init__.py +1 -0
  39. ramjetio-0.4.0/tests/test_cache.py +99 -0
  40. ramjetio-0.4.0/tests/test_consistent_hash.py +88 -0
  41. ramjetio-0.4.0/tests/test_pytorch.py +146 -0
@@ -0,0 +1,150 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ pip-wheel-metadata/
24
+ share/python-wheels/
25
+ *.egg-info/
26
+ .installed.cfg
27
+ *.egg
28
+ MANIFEST
29
+
30
+ # PyInstaller
31
+ *.manifest
32
+ *.spec
33
+
34
+ # Installer logs
35
+ pip-log.txt
36
+ pip-delete-this-directory.txt
37
+
38
+ # Unit test / coverage reports
39
+ htmlcov/
40
+ .tox/
41
+ .nox/
42
+ .coverage
43
+ .coverage.*
44
+ .cache
45
+ nosetests.xml
46
+ coverage.xml
47
+ *.cover
48
+ *.py,cover
49
+ .hypothesis/
50
+ .pytest_cache/
51
+
52
+ # Translations
53
+ *.mo
54
+ *.pot
55
+
56
+ # Django stuff:
57
+ *.log
58
+ local_settings.py
59
+ db.sqlite3
60
+ db.sqlite3-journal
61
+
62
+ # Flask stuff:
63
+ instance/
64
+ .webassets-cache
65
+
66
+ # Scrapy stuff:
67
+ .scrapy
68
+
69
+ # Sphinx documentation
70
+ docs/_build/
71
+
72
+ # PyBuilder
73
+ target/
74
+
75
+ # Jupyter Notebook
76
+ .ipynb_checkpoints
77
+
78
+ # IPython
79
+ profile_default/
80
+ ipython_config.py
81
+
82
+ # pyenv
83
+ .python-version
84
+
85
+ # pipenv
86
+ Pipfile.lock
87
+
88
+ # PEP 582
89
+ __pypackages__/
90
+
91
+ # Celery stuff
92
+ celerybeat-schedule
93
+ celerybeat.pid
94
+
95
+ # SageMath parsed files
96
+ *.sage.py
97
+
98
+ # Environments
99
+ .env
100
+ .venv
101
+ env/
102
+ venv/
103
+ ENV/
104
+ env.bak/
105
+ venv.bak/
106
+
107
+ # Spyder project settings
108
+ .spyderproject
109
+ .spyproject
110
+
111
+ # Rope project settings
112
+ .ropeproject
113
+
114
+ # mkdocs documentation
115
+ /site
116
+
117
+ # mypy
118
+ .mypy_cache/
119
+ .dmypy.json
120
+ dmypy.json
121
+
122
+ # Pyre type checker
123
+ .pyre/
124
+
125
+ # IDEs
126
+ .vscode/
127
+ .idea/
128
+ *.swp
129
+ *.swo
130
+ *~
131
+
132
+ # OS
133
+ .DS_Store
134
+ Thumbs.db
135
+
136
+ # Project specific
137
+ cache_data/
138
+ *.cache
139
+ *.tmp
140
+
141
+ # Development and testing scripts (not needed for users)
142
+ quick_start_testing.sh
143
+ test_*.sh
144
+ run_*.sh
145
+ *_test_local.py
146
+ *_local_test.py
147
+ DEPLOYMENT_GUIDE.md
148
+
149
+ # Build artifacts (regenerated on pip install)
150
+ ramjet.egg-info/
@@ -0,0 +1,38 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [Unreleased]
9
+
10
+ ## [0.1.0] - 2025-10-22
11
+
12
+ ### Added
13
+ - Initial release of RAMJET distributed cache
14
+ - Consistent hashing implementation for key distribution
15
+ - HTTP-based cache server with disk storage
16
+ - Distributed cache client with retry logic
17
+ - PyTorch integration utilities:
18
+ - `CachedDataset` for caching dataset items
19
+ - `CacheCheckpoint` for model checkpoint management
20
+ - Forward hooks for activation caching
21
+ - Command-line interfaces:
22
+ - `ramjet-server` for starting cache servers
23
+ - `ramjet-client` for command-line cache operations
24
+ - Comprehensive test suite
25
+ - Example scripts demonstrating usage
26
+ - Documentation and README
27
+
28
+ ### Features
29
+ - Scalable to N servers with M TB each
30
+ - Automatic node addition/removal with minimal redistribution
31
+ - Optional data compression
32
+ - Configurable replication factor
33
+ - TTL support for cache entries
34
+ - PyTorch tensor serialization
35
+ - Statistics and monitoring endpoints
36
+
37
+ [Unreleased]: https://github.com/ramjetinc/ramjet/compare/v0.1.0...HEAD
38
+ [0.1.0]: https://github.com/ramjetinc/ramjet/releases/tag/v0.1.0
ramjetio-0.4.0/LICENSE ADDED
@@ -0,0 +1,141 @@
1
+ # PolyForm Noncommercial License 1.0.0
2
+
3
+ <https://polyformproject.org/licenses/noncommercial/1.0.0>
4
+
5
+ ## Acceptance
6
+
7
+ In order to get any license under these terms, you must agree
8
+ to them as both strict obligations and conditions to all
9
+ your licenses.
10
+
11
+ ## Copyright License
12
+
13
+ The licensor grants you a copyright license for the
14
+ software to do everything you might do with the software
15
+ that would otherwise infringe the licensor's copyright
16
+ in it for any permitted purpose. However, you may
17
+ only distribute the software according to [Distribution
18
+ License](#distribution-license) and make changes or new works
19
+ based on the software according to [Changes and New Works
20
+ License](#changes-and-new-works-license).
21
+
22
+ ## Distribution License
23
+
24
+ The licensor grants you an additional copyright license
25
+ to distribute copies of the software. Your license
26
+ to distribute covers distributing the software with
27
+ changes and new works permitted by [Changes and New Works
28
+ License](#changes-and-new-works-license).
29
+
30
+ ## Notices
31
+
32
+ You must ensure that anyone who gets a copy of any part of
33
+ the software from you also gets a copy of these terms or the
34
+ URL for them above, as well as copies of any plain-text lines
35
+ beginning with `Required Notice:` that the licensor provided
36
+ with the software. For example:
37
+
38
+ > Required Notice: Copyright NxGen Inc.
39
+
40
+ ## Changes and New Works License
41
+
42
+ The licensor grants you an additional copyright license to
43
+ make changes and new works based on the software for any
44
+ permitted purpose.
45
+
46
+ ## Patent License
47
+
48
+ The licensor grants you a patent license for the software that
49
+ covers patent claims the licensor can license, or becomes able
50
+ to license, that you would infringe by using the software.
51
+
52
+ ## Noncommercial Purposes
53
+
54
+ Any noncommercial purpose is a permitted purpose.
55
+
56
+ ## Personal Uses
57
+
58
+ Personal use for research, experiment, and testing for
59
+ the benefit of public knowledge, personal study, private
60
+ entertainment, hobby projects, amateur pursuits, or religious
61
+ observance, without any anticipated commercial application,
62
+ is use for a permitted purpose.
63
+
64
+ ## Noncommercial Organizations
65
+
66
+ Use by any charitable organization, educational institution,
67
+ public research organization, public safety or health
68
+ organization, environmental protection organization,
69
+ or government institution is use for a permitted purpose
70
+ regardless of the source of funding or obligations resulting
71
+ from the funding.
72
+
73
+ ## Fair Use
74
+
75
+ You may have "fair use" rights for the software under the
76
+ law. These terms do not limit them.
77
+
78
+ ## No Other Rights
79
+
80
+ These terms do not allow you to sublicense or transfer any of
81
+ your licenses to anyone else, or prevent the licensor from
82
+ granting licenses to anyone else. These terms do not imply
83
+ any other licenses.
84
+
85
+ ## Patent Defense
86
+
87
+ If you make any written claim that the software infringes or
88
+ contributes to infringement of any patent, your patent license
89
+ for the software granted under these terms ends immediately. If
90
+ your company makes such a claim, your patent license ends
91
+ immediately for work on behalf of your company.
92
+
93
+ ## Violations
94
+
95
+ The first time you are notified in writing that you have
96
+ violated any of these terms, or done anything with the software
97
+ not covered by your licenses, your licenses can nonetheless
98
+ continue if you come into full compliance with these terms,
99
+ and take practical steps to correct past violations, within
100
+ 32 days of receiving notice. Otherwise, all your licenses
101
+ end immediately.
102
+
103
+ ## No Liability
104
+
105
+ ***As far as the law allows, the software comes as is, without
106
+ any warranty or condition, and the licensor will not be liable
107
+ to you for any damages arising out of these terms or the use
108
+ or nature of the software, under any kind of legal claim.***
109
+
110
+ ## Definitions
111
+
112
+ The **licensor** is the entity offering these terms, and the
113
+ **software** is the software the licensor makes available under
114
+ these terms.
115
+
116
+ **You** refers to the individual or entity agreeing to these
117
+ terms.
118
+
119
+ **Your company** is any legal entity, sole proprietorship,
120
+ or other kind of organization that you work for, plus all
121
+ organizations that have control over, are under the control of,
122
+ or are under common control with that organization. **Control**
123
+ means ownership of substantially all the assets of an entity,
124
+ or the power to direct its management and policies by vote,
125
+ contract, or otherwise. Control can be direct or indirect.
126
+
127
+ **Your licenses** are all the licenses granted to you for the
128
+ software under these terms.
129
+
130
+ **Use** means anything you do with the software requiring one
131
+ of your licenses.
132
+
133
+ ---
134
+
135
+ Required Notice: Copyright 2025 NxGen Inc.
136
+
137
+ ---
138
+
139
+ ## Commercial Licensing
140
+
141
+ For commercial use, please contact jogrms@gmail.com
@@ -0,0 +1,89 @@
1
+ .PHONY: help install install-dev test test-cov format lint type-check clean build upload docs run-server run-examples
2
+
3
+ help:
4
+ @echo "RAMJET - Distributed Cache for PyTorch Training"
5
+ @echo ""
6
+ @echo "Available targets:"
7
+ @echo " install - Install package"
8
+ @echo " install-dev - Install package with dev dependencies"
9
+ @echo " test - Run tests"
10
+ @echo " test-cov - Run tests with coverage"
11
+ @echo " format - Format code with black and isort"
12
+ @echo " lint - Run linters (flake8)"
13
+ @echo " type-check - Run type checker (mypy)"
14
+ @echo " clean - Clean build artifacts"
15
+ @echo " build - Build distribution packages"
16
+ @echo " upload - Upload to PyPI (requires credentials)"
17
+ @echo " docs - Generate documentation"
18
+ @echo " run-server - Start test cache servers (3 nodes)"
19
+ @echo " run-examples - Run all example scripts"
20
+
21
+ install:
22
+ pip install -e .
23
+
24
+ install-dev:
25
+ pip install -e ".[dev]"
26
+ pre-commit install
27
+
28
+ test:
29
+ pytest tests/
30
+
31
+ test-cov:
32
+ pytest --cov=ramjet --cov-report=html --cov-report=term tests/
33
+
34
+ format:
35
+ black ramjet tests examples
36
+ isort ramjet tests examples
37
+
38
+ lint:
39
+ flake8 ramjet tests examples --max-line-length=120 --extend-ignore=E203,W503
40
+
41
+ type-check:
42
+ mypy ramjet --ignore-missing-imports
43
+
44
+ clean:
45
+ rm -rf build/
46
+ rm -rf dist/
47
+ rm -rf *.egg-info
48
+ rm -rf htmlcov/
49
+ rm -rf .pytest_cache/
50
+ rm -rf .mypy_cache/
51
+ find . -type d -name __pycache__ -exec rm -rf {} +
52
+ find . -type f -name "*.pyc" -delete
53
+
54
+ build: clean
55
+ python -m build
56
+
57
+ upload: build
58
+ python -m twine upload dist/*
59
+
60
+ docs:
61
+ @echo "Documentation is in markdown files:"
62
+ @echo " - README.md"
63
+ @echo " - QUICKSTART.md"
64
+ @echo " - INSTALLATION.md"
65
+ @echo " - PROJECT_STRUCTURE.md"
66
+ @echo " - CONTRIBUTING.md"
67
+
68
+ run-server:
69
+ @echo "Starting 3 cache server nodes..."
70
+ @echo "Node 1: http://localhost:9000"
71
+ @echo "Node 2: http://localhost:9001"
72
+ @echo "Node 3: http://localhost:9002"
73
+ @echo ""
74
+ @echo "Press Ctrl+C to stop all servers"
75
+ @trap 'kill 0' SIGINT; \
76
+ ramjet-server --port 9000 --storage-path /tmp/ramjet_cache_0 --capacity 1GB & \
77
+ ramjet-server --port 9001 --storage-path /tmp/ramjet_cache_1 --capacity 1GB & \
78
+ ramjet-server --port 9002 --storage-path /tmp/ramjet_cache_2 --capacity 1GB & \
79
+ wait
80
+
81
+ run-examples:
82
+ @echo "Running basic usage example..."
83
+ python examples/basic_usage.py
84
+ @echo ""
85
+ @echo "Running PyTorch DataLoader example..."
86
+ python examples/pytorch_dataloader.py
87
+ @echo ""
88
+ @echo "Running checkpoint caching example..."
89
+ python examples/checkpoint_caching.py
@@ -0,0 +1,247 @@
1
+ Metadata-Version: 2.4
2
+ Name: ramjetio
3
+ Version: 0.4.0
4
+ Summary: Distributed cache system for PyTorch training
5
+ Author-email: RAMJET <support@ramjet.io>
6
+ License-Expression: LicenseRef-PolyForm-Noncommercial-1.0.0
7
+ Project-URL: Homepage, https://ramjet.io
8
+ Keywords: distributed,cache,pytorch,deep-learning,machine-learning,training
9
+ Classifier: Development Status :: 4 - Beta
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.8
14
+ Classifier: Programming Language :: Python :: 3.9
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
19
+ Classifier: Operating System :: POSIX :: Linux
20
+ Requires-Python: >=3.8
21
+ Description-Content-Type: text/markdown
22
+ License-File: LICENSE
23
+ Requires-Dist: torch>=1.9.0
24
+ Requires-Dist: numpy>=1.19.0
25
+ Requires-Dist: grpcio>=1.50.0
26
+ Requires-Dist: grpcio-tools>=1.50.0
27
+ Requires-Dist: protobuf>=4.0.0
28
+ Requires-Dist: aiohttp>=3.8.0
29
+ Requires-Dist: msgpack>=1.0.0
30
+ Requires-Dist: pyyaml>=5.4.0
31
+ Requires-Dist: requests>=2.25.0
32
+ Requires-Dist: mmh3>=3.0.0
33
+ Requires-Dist: diskcache>=5.4.0
34
+ Requires-Dist: psutil>=5.8.0
35
+ Requires-Dist: boto3>=1.26.0
36
+ Dynamic: license-file
37
+
38
+ # RAMJET — Distributed Data Cache for PyTorch Training
39
+
40
+ [![Python](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/)
41
+ [![PyTorch](https://img.shields.io/badge/PyTorch-1.9+-red.svg)](https://pytorch.org/)
42
+ [![PyPI](https://img.shields.io/pypi/v/ramjetio.svg)](https://pypi.org/project/ramjetio/)
43
+ [![License](https://img.shields.io/badge/License-PolyForm%20NC-green.svg)](LICENSE)
44
+
45
+ **RAMJET** accelerates PyTorch distributed training by caching preprocessed data across your cluster. Works with any DDP setup — `torchrun`, DeepSpeed, Accelerate, or custom launchers.
46
+
47
+ ## Why RAMJET?
48
+
49
+ | Problem | Solution |
50
+ |---------|----------|
51
+ | Slow data preprocessing | Cache preprocessed samples across nodes |
52
+ | Network bottleneck from shared storage | Local SSD cache on each node |
53
+ | Repeated data loading across epochs | First epoch caches, next epochs are instant |
54
+ | No visibility into training | Real-time metrics dashboard |
55
+
56
+ ## Quick Start
57
+
58
+ ### 1. Install
59
+
60
+ ```bash
61
+ pip install ramjetio
62
+ ```
63
+
64
+ ### 2. Get API Key
65
+
66
+ 1. Go to [app.ramjet.io](https://app.ramjet.io)
67
+ 2. Create a cluster
68
+ 3. Copy your API key
69
+
70
+ ### 3. Add to Your Training Script
71
+
72
+ ```python
73
+ import ramjetio
74
+ from torch.utils.data import DataLoader
75
+
76
+ # Initialize RAMJET (connects to dashboard, starts local cache server)
77
+ ramjetio.init()
78
+
79
+ # Wrap your dataset
80
+ dataset = ramjetio.CachedDataset(your_dataset)
81
+
82
+ # Use with DataLoader as usual
83
+ loader = DataLoader(dataset, batch_size=32)
84
+
85
+ for batch in loader:
86
+ train_step(batch)
87
+ ```
88
+
89
+ ### 4. Run Training
90
+
91
+ ```bash
92
+ export RAMJET_API_KEY="your_api_key_here"
93
+
94
+ # Works with any launcher
95
+ torchrun --nproc_per_node=2 train.py
96
+ ```
97
+
98
+ That's it! Your nodes will appear in the dashboard within seconds.
99
+
100
+ ## How It Works
101
+
102
+ ```
103
+ ┌─────────────────────────────────────────────────────────────┐
104
+ │ Your Training Cluster │
105
+ ├─────────────────────────────────────────────────────────────┤
106
+ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │
107
+ │ │ Node 0 │ │ Node 1 │ │ Node 2 │ │
108
+ │ │ ┌───────┐ │ │ ┌───────┐ │ │ ┌───────┐ │ │
109
+ │ │ │ Train │ │ │ │ Train │ │ │ │ Train │ │ │
110
+ │ │ └───┬───┘ │ │ └───┬───┘ │ │ └───┬───┘ │ │
111
+ │ │ │ │ │ │ │ │ │ │ │
112
+ │ │ ┌───▼───┐ │ │ ┌───▼───┐ │ │ ┌───▼───┐ │ │
113
+ │ │ │ RAMJET │◄─┼────┼──┤ RAMJET │◄─┼────┼──┤ RAMJET │ │ │
114
+ │ │ │ Cache │──┼────┼──► Cache │──┼────┼──► Cache │ │ │
115
+ │ │ └───────┘ │ │ └───────┘ │ │ └───────┘ │ │
116
+ │ │ 500GB SSD │ │ 500GB SSD │ │ 500GB SSD │ │
117
+ │ └─────────────┘ └─────────────┘ └─────────────┘ │
118
+ │ │ │
119
+ │ ▼ │
120
+ │ ┌─────────────────┐ │
121
+ │ │ RAMJET Dashboard │ │
122
+ │ │ (Metrics UI) │ │
123
+ │ └─────────────────┘ │
124
+ └─────────────────────────────────────────────────────────────┘
125
+ ```
126
+
127
+ ## Features
128
+
129
+ - 🚀 **Zero-config caching** — `ramjetio.init()` handles everything
130
+ - 📊 **Real-time dashboard** — monitor cache hits, throughput, GPU utilization
131
+ - 🔄 **Consistent hashing** — data distributed evenly across nodes
132
+ - 💾 **Disk-backed cache** — survives restarts, uses NVMe SSDs efficiently
133
+ - 🔌 **Works with any setup** — torchrun, DeepSpeed, Accelerate, custom launchers
134
+ - ☁️ **S3/MinIO integration** — configure data source in dashboard, not in code
135
+
136
+ ## Integration Examples
137
+
138
+ See [docs/INTEGRATION.md](docs/INTEGRATION.md) for detailed examples with:
139
+ - PyTorch DDP with `torchrun`
140
+ - DeepSpeed
141
+ - HuggingFace Accelerate
142
+ - Custom training loops
143
+ - Multi-node clusters
144
+
145
+ ## Configuration
146
+
147
+ ### Environment Variables
148
+
149
+ | Variable | Description | Default |
150
+ |----------|-------------|---------|
151
+ | `RAMJET_API_KEY` | Your API key (required) | — |
152
+ | `RAMJET_CACHE_PATH` | Local cache directory | `/tmp/ramjet_cache` |
153
+ | `RAMJET_CACHE_SIZE` | Max cache size | `100GB` |
154
+ | `RAMJET_PORT` | Cache server port | `9000` |
155
+
156
+ ### Dashboard Settings
157
+
158
+ Configure in the web dashboard (no code changes needed):
159
+ - **Data Source**: S3/MinIO endpoint, bucket, credentials
160
+ - **Cache Settings**: TTL, replication factor, eviction policy
161
+
162
+ ## Distributed Training (DDP)
163
+
164
+ RAMJET automatically detects `torchrun` and DDP environments:
165
+
166
+ ### Single Machine, Multiple GPUs (torchrun)
167
+
168
+ ```bash
169
+ # 4 GPUs on one machine
170
+ torchrun --nproc_per_node=4 train.py
171
+ ```
172
+
173
+ ```python
174
+ import ramjetio
175
+ import torch.distributed as dist
176
+
177
+ # Only LOCAL_RANK=0 starts the cache server; the other local ranks wait for it and share it
178
+ ramjetio.init()
179
+
180
+ # All ranks use the same cache
181
+ dataset = ramjetio.CachedDataset(your_dataset)
182
+ ```
183
+
184
+ ### Multi-Node Training
185
+
186
+ ```bash
187
+ # Node 0 (master)
188
+ torchrun --nnodes=2 --node_rank=0 --nproc_per_node=4 \
189
+ --master_addr=node0 --master_port=29500 train.py
190
+
191
+ # Node 1
192
+ torchrun --nnodes=2 --node_rank=1 --nproc_per_node=4 \
193
+ --master_addr=node0 --master_port=29500 train.py
194
+ ```
195
+
196
+ Each node runs one cache server (on LOCAL_RANK=0), and all nodes share data via consistent hashing.
197
+
198
+ ### Separate Processes per Rank
199
+
200
+ ```bash
201
+ # If you are not using torchrun, set the environment variables manually:
202
+ export RANK=0 WORLD_SIZE=4 LOCAL_RANK=0
203
+ python train.py
204
+
205
+ # In another terminal or on another machine:
206
+ export RANK=1 WORLD_SIZE=4 LOCAL_RANK=0
207
+ python train.py
208
+ ```
209
+
210
+ RAMJET reads `LOCAL_RANK`, `RANK`, and `WORLD_SIZE` from the environment to coordinate.
211
+
212
+ ## CLI Tools
213
+
214
+ ```bash
215
+ # Start cache server manually (usually not needed — ramjetio.init() does this)
216
+ ramjetio-server --port 9000 --capacity 100GB
217
+
218
+ # Check cache status
219
+ ramjetio-client stats
220
+
221
+ # Clear cache
222
+ ramjetio-client clear
223
+ ```
224
+
225
+ ## Requirements
226
+
227
+ - Python 3.8+
228
+ - PyTorch 1.9+
229
+ - Linux (recommended for production)
230
+ - SSD storage for cache (recommended)
231
+
232
+ ## Documentation
233
+
234
+ - [Integration Guide](docs/INTEGRATION.md) — detailed examples for all frameworks
235
+ - [API Reference](docs/API.md) — full API documentation
236
+ - [Troubleshooting](docs/TROUBLESHOOTING.md) — common issues and solutions
237
+
238
+ ## License
239
+
240
+ PolyForm Noncommercial License 1.0.0 — free for personal and non-commercial use.
241
+ For commercial licensing, contact licensing@ramjet.dev. See [LICENSE](LICENSE) for details.
242
+
243
+ ## Support
244
+
245
+ - 📧 Email: support@ramjet.io
246
+ - 💬 Discord: [discord.gg/ramjet](https://discord.gg/ramjet)
247
+ - 📖 Docs: [docs.ramjet.io](https://docs.ramjet.io)