ramjetio 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ramjetio-0.4.0/.gitignore +150 -0
- ramjetio-0.4.0/CHANGELOG.md +38 -0
- ramjetio-0.4.0/LICENSE +141 -0
- ramjetio-0.4.0/Makefile +89 -0
- ramjetio-0.4.0/PKG-INFO +247 -0
- ramjetio-0.4.0/README.md +210 -0
- ramjetio-0.4.0/docs/API.md +283 -0
- ramjetio-0.4.0/docs/INTEGRATION.md +473 -0
- ramjetio-0.4.0/docs/TROUBLESHOOTING.md +227 -0
- ramjetio-0.4.0/examples/accelerate_example.py +85 -0
- ramjetio-0.4.0/examples/deepspeed_example.py +99 -0
- ramjetio-0.4.0/examples/simple.py +58 -0
- ramjetio-0.4.0/examples/torchrun_ddp.py +187 -0
- ramjetio-0.4.0/pyproject.toml +106 -0
- ramjetio-0.4.0/ramjetio/__init__.py +798 -0
- ramjetio-0.4.0/ramjetio/backend_client.py +554 -0
- ramjetio-0.4.0/ramjetio/cache.py +313 -0
- ramjetio-0.4.0/ramjetio/cli/__init__.py +1 -0
- ramjetio-0.4.0/ramjetio/cli/client.py +135 -0
- ramjetio-0.4.0/ramjetio/cli/server.py +160 -0
- ramjetio-0.4.0/ramjetio/config.py +63 -0
- ramjetio-0.4.0/ramjetio/consistent_hash.py +203 -0
- ramjetio-0.4.0/ramjetio/datasets.py +1629 -0
- ramjetio-0.4.0/ramjetio/helpers.py +244 -0
- ramjetio-0.4.0/ramjetio/proto/__init__.py +112 -0
- ramjetio-0.4.0/ramjetio/proto/ramjet.proto +419 -0
- ramjetio-0.4.0/ramjetio/pytorch.py +251 -0
- ramjetio-0.4.0/ramjetio/serialization.py +102 -0
- ramjetio-0.4.0/ramjetio/server.py +689 -0
- ramjetio-0.4.0/ramjetio.egg-info/PKG-INFO +247 -0
- ramjetio-0.4.0/ramjetio.egg-info/SOURCES.txt +39 -0
- ramjetio-0.4.0/ramjetio.egg-info/dependency_links.txt +1 -0
- ramjetio-0.4.0/ramjetio.egg-info/entry_points.txt +3 -0
- ramjetio-0.4.0/ramjetio.egg-info/requires.txt +13 -0
- ramjetio-0.4.0/ramjetio.egg-info/top_level.txt +1 -0
- ramjetio-0.4.0/requirements.txt +24 -0
- ramjetio-0.4.0/setup.cfg +4 -0
- ramjetio-0.4.0/tests/__init__.py +1 -0
- ramjetio-0.4.0/tests/test_cache.py +99 -0
- ramjetio-0.4.0/tests/test_consistent_hash.py +88 -0
- ramjetio-0.4.0/tests/test_pytorch.py +146 -0
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# C extensions
|
|
7
|
+
*.so
|
|
8
|
+
|
|
9
|
+
# Distribution / packaging
|
|
10
|
+
.Python
|
|
11
|
+
build/
|
|
12
|
+
develop-eggs/
|
|
13
|
+
dist/
|
|
14
|
+
downloads/
|
|
15
|
+
eggs/
|
|
16
|
+
.eggs/
|
|
17
|
+
lib/
|
|
18
|
+
lib64/
|
|
19
|
+
parts/
|
|
20
|
+
sdist/
|
|
21
|
+
var/
|
|
22
|
+
wheels/
|
|
23
|
+
pip-wheel-metadata/
|
|
24
|
+
share/python-wheels/
|
|
25
|
+
*.egg-info/
|
|
26
|
+
.installed.cfg
|
|
27
|
+
*.egg
|
|
28
|
+
MANIFEST
|
|
29
|
+
|
|
30
|
+
# PyInstaller
|
|
31
|
+
*.manifest
|
|
32
|
+
*.spec
|
|
33
|
+
|
|
34
|
+
# Installer logs
|
|
35
|
+
pip-log.txt
|
|
36
|
+
pip-delete-this-directory.txt
|
|
37
|
+
|
|
38
|
+
# Unit test / coverage reports
|
|
39
|
+
htmlcov/
|
|
40
|
+
.tox/
|
|
41
|
+
.nox/
|
|
42
|
+
.coverage
|
|
43
|
+
.coverage.*
|
|
44
|
+
.cache
|
|
45
|
+
nosetests.xml
|
|
46
|
+
coverage.xml
|
|
47
|
+
*.cover
|
|
48
|
+
*.py,cover
|
|
49
|
+
.hypothesis/
|
|
50
|
+
.pytest_cache/
|
|
51
|
+
|
|
52
|
+
# Translations
|
|
53
|
+
*.mo
|
|
54
|
+
*.pot
|
|
55
|
+
|
|
56
|
+
# Django stuff:
|
|
57
|
+
*.log
|
|
58
|
+
local_settings.py
|
|
59
|
+
db.sqlite3
|
|
60
|
+
db.sqlite3-journal
|
|
61
|
+
|
|
62
|
+
# Flask stuff:
|
|
63
|
+
instance/
|
|
64
|
+
.webassets-cache
|
|
65
|
+
|
|
66
|
+
# Scrapy stuff:
|
|
67
|
+
.scrapy
|
|
68
|
+
|
|
69
|
+
# Sphinx documentation
|
|
70
|
+
docs/_build/
|
|
71
|
+
|
|
72
|
+
# PyBuilder
|
|
73
|
+
target/
|
|
74
|
+
|
|
75
|
+
# Jupyter Notebook
|
|
76
|
+
.ipynb_checkpoints
|
|
77
|
+
|
|
78
|
+
# IPython
|
|
79
|
+
profile_default/
|
|
80
|
+
ipython_config.py
|
|
81
|
+
|
|
82
|
+
# pyenv
|
|
83
|
+
.python-version
|
|
84
|
+
|
|
85
|
+
# pipenv
|
|
86
|
+
Pipfile.lock
|
|
87
|
+
|
|
88
|
+
# PEP 582
|
|
89
|
+
__pypackages__/
|
|
90
|
+
|
|
91
|
+
# Celery stuff
|
|
92
|
+
celerybeat-schedule
|
|
93
|
+
celerybeat.pid
|
|
94
|
+
|
|
95
|
+
# SageMath parsed files
|
|
96
|
+
*.sage.py
|
|
97
|
+
|
|
98
|
+
# Environments
|
|
99
|
+
.env
|
|
100
|
+
.venv
|
|
101
|
+
env/
|
|
102
|
+
venv/
|
|
103
|
+
ENV/
|
|
104
|
+
env.bak/
|
|
105
|
+
venv.bak/
|
|
106
|
+
|
|
107
|
+
# Spyder project settings
|
|
108
|
+
.spyderproject
|
|
109
|
+
.spyproject
|
|
110
|
+
|
|
111
|
+
# Rope project settings
|
|
112
|
+
.ropeproject
|
|
113
|
+
|
|
114
|
+
# mkdocs documentation
|
|
115
|
+
/site
|
|
116
|
+
|
|
117
|
+
# mypy
|
|
118
|
+
.mypy_cache/
|
|
119
|
+
.dmypy.json
|
|
120
|
+
dmypy.json
|
|
121
|
+
|
|
122
|
+
# Pyre type checker
|
|
123
|
+
.pyre/
|
|
124
|
+
|
|
125
|
+
# IDEs
|
|
126
|
+
.vscode/
|
|
127
|
+
.idea/
|
|
128
|
+
*.swp
|
|
129
|
+
*.swo
|
|
130
|
+
*~
|
|
131
|
+
|
|
132
|
+
# OS
|
|
133
|
+
.DS_Store
|
|
134
|
+
Thumbs.db
|
|
135
|
+
|
|
136
|
+
# Project specific
|
|
137
|
+
cache_data/
|
|
138
|
+
*.cache
|
|
139
|
+
*.tmp
|
|
140
|
+
|
|
141
|
+
# Development and testing scripts (not needed for users)
|
|
142
|
+
quick_start_testing.sh
|
|
143
|
+
test_*.sh
|
|
144
|
+
run_*.sh
|
|
145
|
+
*_test_local.py
|
|
146
|
+
*_local_test.py
|
|
147
|
+
DEPLOYMENT_GUIDE.md
|
|
148
|
+
|
|
149
|
+
# Build artifacts (regenerated on pip install)
|
|
150
|
+
ramjetio.egg-info/
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [Unreleased]
|
|
9
|
+
|
|
10
|
+
## [0.1.0] - 2025-10-22
|
|
11
|
+
|
|
12
|
+
### Added
|
|
13
|
+
- Initial release of RAMJET distributed cache
|
|
14
|
+
- Consistent hashing implementation for key distribution
|
|
15
|
+
- HTTP-based cache server with disk storage
|
|
16
|
+
- Distributed cache client with retry logic
|
|
17
|
+
- PyTorch integration utilities:
|
|
18
|
+
- `CachedDataset` for caching dataset items
|
|
19
|
+
- `CacheCheckpoint` for model checkpoint management
|
|
20
|
+
- Forward hooks for activation caching
|
|
21
|
+
- Command-line interfaces:
|
|
22
|
+
- `ramjet-server` for starting cache servers
|
|
23
|
+
- `ramjet-client` for command-line cache operations
|
|
24
|
+
- Comprehensive test suite
|
|
25
|
+
- Example scripts demonstrating usage
|
|
26
|
+
- Documentation and README
|
|
27
|
+
|
|
28
|
+
### Features
|
|
29
|
+
- Scalable to N servers with M TB each
|
|
30
|
+
- Automatic node addition/removal with minimal redistribution
|
|
31
|
+
- Optional data compression
|
|
32
|
+
- Configurable replication factor
|
|
33
|
+
- TTL support for cache entries
|
|
34
|
+
- PyTorch tensor serialization
|
|
35
|
+
- Statistics and monitoring endpoints
|
|
36
|
+
|
|
37
|
+
[Unreleased]: https://github.com/ramjetinc/ramjet/compare/v0.1.0...HEAD
|
|
38
|
+
[0.1.0]: https://github.com/ramjetinc/ramjet/releases/tag/v0.1.0
|
ramjetio-0.4.0/LICENSE
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
# PolyForm Noncommercial License 1.0.0
|
|
2
|
+
|
|
3
|
+
<https://polyformproject.org/licenses/noncommercial/1.0.0>
|
|
4
|
+
|
|
5
|
+
## Acceptance
|
|
6
|
+
|
|
7
|
+
In order to get any license under these terms, you must agree
|
|
8
|
+
to them as both strict obligations and conditions to all
|
|
9
|
+
your licenses.
|
|
10
|
+
|
|
11
|
+
## Copyright License
|
|
12
|
+
|
|
13
|
+
The licensor grants you a copyright license for the
|
|
14
|
+
software to do everything you might do with the software
|
|
15
|
+
that would otherwise infringe the licensor's copyright
|
|
16
|
+
in it for any permitted purpose. However, you may
|
|
17
|
+
only distribute the software according to [Distribution
|
|
18
|
+
License](#distribution-license) and make changes or new works
|
|
19
|
+
based on the software according to [Changes and New Works
|
|
20
|
+
License](#changes-and-new-works-license).
|
|
21
|
+
|
|
22
|
+
## Distribution License
|
|
23
|
+
|
|
24
|
+
The licensor grants you an additional copyright license
|
|
25
|
+
to distribute copies of the software. Your license
|
|
26
|
+
to distribute covers distributing the software with
|
|
27
|
+
changes and new works permitted by [Changes and New Works
|
|
28
|
+
License](#changes-and-new-works-license).
|
|
29
|
+
|
|
30
|
+
## Notices
|
|
31
|
+
|
|
32
|
+
You must ensure that anyone who gets a copy of any part of
|
|
33
|
+
the software from you also gets a copy of these terms or the
|
|
34
|
+
URL for them above, as well as copies of any plain-text lines
|
|
35
|
+
beginning with `Required Notice:` that the licensor provided
|
|
36
|
+
with the software. For example:
|
|
37
|
+
|
|
38
|
+
> Required Notice: Copyright NxGen Inc.
|
|
39
|
+
|
|
40
|
+
## Changes and New Works License
|
|
41
|
+
|
|
42
|
+
The licensor grants you an additional copyright license to
|
|
43
|
+
make changes and new works based on the software for any
|
|
44
|
+
permitted purpose.
|
|
45
|
+
|
|
46
|
+
## Patent License
|
|
47
|
+
|
|
48
|
+
The licensor grants you a patent license for the software that
|
|
49
|
+
covers patent claims the licensor can license, or becomes able
|
|
50
|
+
to license, that you would infringe by using the software.
|
|
51
|
+
|
|
52
|
+
## Noncommercial Purposes
|
|
53
|
+
|
|
54
|
+
Any noncommercial purpose is a permitted purpose.
|
|
55
|
+
|
|
56
|
+
## Personal Uses
|
|
57
|
+
|
|
58
|
+
Personal use for research, experiment, and testing for
|
|
59
|
+
the benefit of public knowledge, personal study, private
|
|
60
|
+
entertainment, hobby projects, amateur pursuits, or religious
|
|
61
|
+
observance, without any anticipated commercial application,
|
|
62
|
+
is use for a permitted purpose.
|
|
63
|
+
|
|
64
|
+
## Noncommercial Organizations
|
|
65
|
+
|
|
66
|
+
Use by any charitable organization, educational institution,
|
|
67
|
+
public research organization, public safety or health
|
|
68
|
+
organization, environmental protection organization,
|
|
69
|
+
or government institution is use for a permitted purpose
|
|
70
|
+
regardless of the source of funding or obligations resulting
|
|
71
|
+
from the funding.
|
|
72
|
+
|
|
73
|
+
## Fair Use
|
|
74
|
+
|
|
75
|
+
You may have "fair use" rights for the software under the
|
|
76
|
+
law. These terms do not limit them.
|
|
77
|
+
|
|
78
|
+
## No Other Rights
|
|
79
|
+
|
|
80
|
+
These terms do not allow you to sublicense or transfer any of
|
|
81
|
+
your licenses to anyone else, or prevent the licensor from
|
|
82
|
+
granting licenses to anyone else. These terms do not imply
|
|
83
|
+
any other licenses.
|
|
84
|
+
|
|
85
|
+
## Patent Defense
|
|
86
|
+
|
|
87
|
+
If you make any written claim that the software infringes or
|
|
88
|
+
contributes to infringement of any patent, your patent license
|
|
89
|
+
for the software granted under these terms ends immediately. If
|
|
90
|
+
your company makes such a claim, your patent license ends
|
|
91
|
+
immediately for work on behalf of your company.
|
|
92
|
+
|
|
93
|
+
## Violations
|
|
94
|
+
|
|
95
|
+
The first time you are notified in writing that you have
|
|
96
|
+
violated any of these terms, or done anything with the software
|
|
97
|
+
not covered by your licenses, your licenses can nonetheless
|
|
98
|
+
continue if you come into full compliance with these terms,
|
|
99
|
+
and take practical steps to correct past violations, within
|
|
100
|
+
32 days of receiving notice. Otherwise, all your licenses
|
|
101
|
+
end immediately.
|
|
102
|
+
|
|
103
|
+
## No Liability
|
|
104
|
+
|
|
105
|
+
***As far as the law allows, the software comes as is, without
|
|
106
|
+
any warranty or condition, and the licensor will not be liable
|
|
107
|
+
to you for any damages arising out of these terms or the use
|
|
108
|
+
or nature of the software, under any kind of legal claim.***
|
|
109
|
+
|
|
110
|
+
## Definitions
|
|
111
|
+
|
|
112
|
+
The **licensor** is the entity offering these terms, and the
|
|
113
|
+
**software** is the software the licensor makes available under
|
|
114
|
+
these terms.
|
|
115
|
+
|
|
116
|
+
**You** refers to the individual or entity agreeing to these
|
|
117
|
+
terms.
|
|
118
|
+
|
|
119
|
+
**Your company** is any legal entity, sole proprietorship,
|
|
120
|
+
or other kind of organization that you work for, plus all
|
|
121
|
+
organizations that have control over, are under the control of,
|
|
122
|
+
or are under common control with that organization. **Control**
|
|
123
|
+
means ownership of substantially all the assets of an entity,
|
|
124
|
+
or the power to direct its management and policies by vote,
|
|
125
|
+
contract, or otherwise. Control can be direct or indirect.
|
|
126
|
+
|
|
127
|
+
**Your licenses** are all the licenses granted to you for the
|
|
128
|
+
software under these terms.
|
|
129
|
+
|
|
130
|
+
**Use** means anything you do with the software requiring one
|
|
131
|
+
of your licenses.
|
|
132
|
+
|
|
133
|
+
---
|
|
134
|
+
|
|
135
|
+
Required Notice: Copyright 2025 NxGen Inc.
|
|
136
|
+
|
|
137
|
+
---
|
|
138
|
+
|
|
139
|
+
## Commercial Licensing
|
|
140
|
+
|
|
141
|
+
For commercial use, please contact jogrms@gmail.com
|
ramjetio-0.4.0/Makefile
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
.PHONY: help install install-dev test test-cov format lint type-check clean build upload docs run-server run-examples
|
|
2
|
+
|
|
3
|
+
help:
|
|
4
|
+
@echo "RAMJET - Distributed Cache for PyTorch Training"
|
|
5
|
+
@echo ""
|
|
6
|
+
@echo "Available targets:"
|
|
7
|
+
@echo " install - Install package"
|
|
8
|
+
@echo " install-dev - Install package with dev dependencies"
|
|
9
|
+
@echo " test - Run tests"
|
|
10
|
+
@echo " test-cov - Run tests with coverage"
|
|
11
|
+
@echo " format - Format code with black and isort"
|
|
12
|
+
@echo " lint - Run linters (flake8)"
|
|
13
|
+
@echo " type-check - Run type checker (mypy)"
|
|
14
|
+
@echo " clean - Clean build artifacts"
|
|
15
|
+
@echo " build - Build distribution packages"
|
|
16
|
+
@echo " upload - Upload to PyPI (requires credentials)"
|
|
17
|
+
@echo " docs - Generate documentation"
|
|
18
|
+
@echo " run-server - Start test cache servers (3 nodes)"
|
|
19
|
+
@echo " run-examples - Run all example scripts"
|
|
20
|
+
|
|
21
|
+
install:
|
|
22
|
+
pip install -e .
|
|
23
|
+
|
|
24
|
+
install-dev:
|
|
25
|
+
pip install -e ".[dev]"
|
|
26
|
+
pre-commit install
|
|
27
|
+
|
|
28
|
+
test:
|
|
29
|
+
pytest tests/
|
|
30
|
+
|
|
31
|
+
test-cov:
|
|
32
|
+
pytest --cov=ramjetio --cov-report=html --cov-report=term tests/
|
|
33
|
+
|
|
34
|
+
format:
|
|
35
|
+
black ramjetio tests examples
|
|
36
|
+
isort ramjetio tests examples
|
|
37
|
+
|
|
38
|
+
lint:
|
|
39
|
+
flake8 ramjetio tests examples --max-line-length=120 --extend-ignore=E203,W503
|
|
40
|
+
|
|
41
|
+
type-check:
|
|
42
|
+
mypy ramjetio --ignore-missing-imports
|
|
43
|
+
|
|
44
|
+
clean:
|
|
45
|
+
rm -rf build/
|
|
46
|
+
rm -rf dist/
|
|
47
|
+
rm -rf *.egg-info
|
|
48
|
+
rm -rf htmlcov/
|
|
49
|
+
rm -rf .pytest_cache/
|
|
50
|
+
rm -rf .mypy_cache/
|
|
51
|
+
find . -type d -name __pycache__ -exec rm -rf {} +
|
|
52
|
+
find . -type f -name "*.pyc" -delete
|
|
53
|
+
|
|
54
|
+
build: clean
|
|
55
|
+
python -m build
|
|
56
|
+
|
|
57
|
+
upload: build
|
|
58
|
+
python -m twine upload dist/*
|
|
59
|
+
|
|
60
|
+
docs:
|
|
61
|
+
@echo "Documentation is in markdown files:"
|
|
62
|
+
@echo " - README.md"
|
|
63
|
+
@echo " - docs/API.md"
|
|
64
|
+
@echo " - docs/INTEGRATION.md"
|
|
65
|
+
@echo " - docs/TROUBLESHOOTING.md"
|
|
66
|
+
@echo " - CHANGELOG.md"
|
|
67
|
+
|
|
68
|
+
run-server:
|
|
69
|
+
@echo "Starting 3 cache server nodes..."
|
|
70
|
+
@echo "Node 1: http://localhost:9000"
|
|
71
|
+
@echo "Node 2: http://localhost:9001"
|
|
72
|
+
@echo "Node 3: http://localhost:9002"
|
|
73
|
+
@echo ""
|
|
74
|
+
@echo "Press Ctrl+C to stop all servers"
|
|
75
|
+
@trap 'kill 0' INT; \
|
|
76
|
+
ramjetio-server --port 9000 --storage-path /tmp/ramjet_cache_0 --capacity 1GB & \
|
|
77
|
+
ramjetio-server --port 9001 --storage-path /tmp/ramjet_cache_1 --capacity 1GB & \
|
|
78
|
+
ramjetio-server --port 9002 --storage-path /tmp/ramjet_cache_2 --capacity 1GB & \
|
|
79
|
+
wait
|
|
80
|
+
|
|
81
|
+
run-examples:
|
|
82
|
+
@echo "Running basic usage example..."
|
|
83
|
+
python examples/simple.py
|
|
84
|
+
@echo ""
|
|
85
|
+
@echo "Running PyTorch DataLoader example..."
|
|
86
|
+
python examples/pytorch_dataloader.py
|
|
87
|
+
@echo ""
|
|
88
|
+
@echo "Running checkpoint caching example..."
|
|
89
|
+
python examples/checkpoint_caching.py
|
ramjetio-0.4.0/PKG-INFO
ADDED
|
@@ -0,0 +1,247 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ramjetio
|
|
3
|
+
Version: 0.4.0
|
|
4
|
+
Summary: Distributed cache system for PyTorch training
|
|
5
|
+
Author-email: RAMJET <support@ramjet.io>
|
|
6
|
+
License-Expression: LicenseRef-PolyForm-Noncommercial-1.0.0
|
|
7
|
+
Project-URL: Homepage, https://ramjet.io
|
|
8
|
+
Keywords: distributed,cache,pytorch,deep-learning,machine-learning,training
|
|
9
|
+
Classifier: Development Status :: 4 - Beta
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: Intended Audience :: Science/Research
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
19
|
+
Classifier: Operating System :: POSIX :: Linux
|
|
20
|
+
Requires-Python: >=3.8
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
License-File: LICENSE
|
|
23
|
+
Requires-Dist: torch>=1.9.0
|
|
24
|
+
Requires-Dist: numpy>=1.19.0
|
|
25
|
+
Requires-Dist: grpcio>=1.50.0
|
|
26
|
+
Requires-Dist: grpcio-tools>=1.50.0
|
|
27
|
+
Requires-Dist: protobuf>=4.0.0
|
|
28
|
+
Requires-Dist: aiohttp>=3.8.0
|
|
29
|
+
Requires-Dist: msgpack>=1.0.0
|
|
30
|
+
Requires-Dist: pyyaml>=5.4.0
|
|
31
|
+
Requires-Dist: requests>=2.25.0
|
|
32
|
+
Requires-Dist: mmh3>=3.0.0
|
|
33
|
+
Requires-Dist: diskcache>=5.4.0
|
|
34
|
+
Requires-Dist: psutil>=5.8.0
|
|
35
|
+
Requires-Dist: boto3>=1.26.0
|
|
36
|
+
Dynamic: license-file
|
|
37
|
+
|
|
38
|
+
# RAMJET — Distributed Data Cache for PyTorch Training
|
|
39
|
+
|
|
40
|
+
[Python 3.8+](https://www.python.org/downloads/)
|
|
41
|
+
[PyTorch 1.9+](https://pytorch.org/)
|
|
42
|
+
[PyPI: ramjetio](https://pypi.org/project/ramjetio/)
|
|
43
|
+
[License: PolyForm Noncommercial 1.0.0](LICENSE)
|
|
44
|
+
|
|
45
|
+
**RAMJET** accelerates PyTorch distributed training by caching preprocessed data across your cluster. Works with any DDP setup — `torchrun`, DeepSpeed, Accelerate, or custom launchers.
|
|
46
|
+
|
|
47
|
+
## Why RAMJET?
|
|
48
|
+
|
|
49
|
+
| Problem | Solution |
|
|
50
|
+
|---------|----------|
|
|
51
|
+
| Slow data preprocessing | Cache preprocessed samples across nodes |
|
|
52
|
+
| Network bottleneck from shared storage | Local SSD cache on each node |
|
|
53
|
+
| Repeated data loading across epochs | First epoch caches, next epochs are instant |
|
|
54
|
+
| No visibility into training | Real-time metrics dashboard |
|
|
55
|
+
|
|
56
|
+
## Quick Start
|
|
57
|
+
|
|
58
|
+
### 1. Install
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
pip install ramjetio
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
### 2. Get API Key
|
|
65
|
+
|
|
66
|
+
1. Go to [app.ramjet.io](https://app.ramjet.io)
|
|
67
|
+
2. Create a cluster
|
|
68
|
+
3. Copy your API key
|
|
69
|
+
|
|
70
|
+
### 3. Add to Your Training Script
|
|
71
|
+
|
|
72
|
+
```python
|
|
73
|
+
import ramjetio
|
|
74
|
+
from torch.utils.data import DataLoader
|
|
75
|
+
|
|
76
|
+
# Initialize RAMJET (connects to dashboard, starts local cache server)
|
|
77
|
+
ramjetio.init()
|
|
78
|
+
|
|
79
|
+
# Wrap your dataset
|
|
80
|
+
dataset = ramjetio.CachedDataset(your_dataset)
|
|
81
|
+
|
|
82
|
+
# Use with DataLoader as usual
|
|
83
|
+
loader = DataLoader(dataset, batch_size=32)
|
|
84
|
+
|
|
85
|
+
for batch in loader:
|
|
86
|
+
train_step(batch)
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
### 4. Run Training
|
|
90
|
+
|
|
91
|
+
```bash
|
|
92
|
+
export RAMJET_API_KEY="your_api_key_here"
|
|
93
|
+
|
|
94
|
+
# Works with any launcher
|
|
95
|
+
torchrun --nproc_per_node=2 train.py
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
That's it! Your nodes will appear in the dashboard within seconds.
|
|
99
|
+
|
|
100
|
+
## How It Works
|
|
101
|
+
|
|
102
|
+
```
|
|
103
|
+
┌─────────────────────────────────────────────────────────────┐
|
|
104
|
+
│ Your Training Cluster │
|
|
105
|
+
├─────────────────────────────────────────────────────────────┤
|
|
106
|
+
│ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │
|
|
107
|
+
│ │ Node 0 │ │ Node 1 │ │ Node 2 │ │
|
|
108
|
+
│ │ ┌───────┐ │ │ ┌───────┐ │ │ ┌───────┐ │ │
|
|
109
|
+
│ │ │ Train │ │ │ │ Train │ │ │ │ Train │ │ │
|
|
110
|
+
│ │ └───┬───┘ │ │ └───┬───┘ │ │ └───┬───┘ │ │
|
|
111
|
+
│ │ │ │ │ │ │ │ │ │ │
|
|
112
|
+
│ │ ┌───▼───┐ │ │ ┌───▼───┐ │ │ ┌───▼───┐ │ │
|
|
113
|
+
│ │ │ RAMJET │◄─┼────┼──┤ RAMJET │◄─┼────┼──┤ RAMJET │ │ │
|
|
114
|
+
│ │ │ Cache │──┼────┼──► Cache │──┼────┼──► Cache │ │ │
|
|
115
|
+
│ │ └───────┘ │ │ └───────┘ │ │ └───────┘ │ │
|
|
116
|
+
│ │ 500GB SSD │ │ 500GB SSD │ │ 500GB SSD │ │
|
|
117
|
+
│ └─────────────┘ └─────────────┘ └─────────────┘ │
|
|
118
|
+
│ │ │
|
|
119
|
+
│ ▼ │
|
|
120
|
+
│ ┌─────────────────┐ │
|
|
121
|
+
│ │ RAMJET Dashboard │ │
|
|
122
|
+
│ │ (Metrics UI) │ │
|
|
123
|
+
│ └─────────────────┘ │
|
|
124
|
+
└─────────────────────────────────────────────────────────────┘
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
## Features
|
|
128
|
+
|
|
129
|
+
- 🚀 **Zero-config caching** — `ramjetio.init()` handles everything
|
|
130
|
+
- 📊 **Real-time dashboard** — monitor cache hits, throughput, GPU utilization
|
|
131
|
+
- 🔄 **Consistent hashing** — data distributed evenly across nodes
|
|
132
|
+
- 💾 **Disk-backed cache** — survives restarts, uses NVMe SSDs efficiently
|
|
133
|
+
- 🔌 **Works with any setup** — torchrun, DeepSpeed, Accelerate, custom launchers
|
|
134
|
+
- ☁️ **S3/MinIO integration** — configure data source in dashboard, not in code
|
|
135
|
+
|
|
136
|
+
## Integration Examples
|
|
137
|
+
|
|
138
|
+
See [docs/INTEGRATION.md](docs/INTEGRATION.md) for detailed examples with:
|
|
139
|
+
- PyTorch DDP with `torchrun`
|
|
140
|
+
- DeepSpeed
|
|
141
|
+
- HuggingFace Accelerate
|
|
142
|
+
- Custom training loops
|
|
143
|
+
- Multi-node clusters
|
|
144
|
+
|
|
145
|
+
## Configuration
|
|
146
|
+
|
|
147
|
+
### Environment Variables
|
|
148
|
+
|
|
149
|
+
| Variable | Description | Default |
|
|
150
|
+
|----------|-------------|---------|
|
|
151
|
+
| `RAMJET_API_KEY` | Your API key (required) | — |
|
|
152
|
+
| `RAMJET_CACHE_PATH` | Local cache directory | `/tmp/ramjet_cache` |
|
|
153
|
+
| `RAMJET_CACHE_SIZE` | Max cache size | `100GB` |
|
|
154
|
+
| `RAMJET_PORT` | Cache server port | `9000` |
|
|
155
|
+
|
|
156
|
+
### Dashboard Settings
|
|
157
|
+
|
|
158
|
+
Configure in the web dashboard (no code changes needed):
|
|
159
|
+
- **Data Source**: S3/MinIO endpoint, bucket, credentials
|
|
160
|
+
- **Cache Settings**: TTL, replication factor, eviction policy
|
|
161
|
+
|
|
162
|
+
## Distributed Training (DDP)
|
|
163
|
+
|
|
164
|
+
RAMJET automatically detects `torchrun` and DDP environments:
|
|
165
|
+
|
|
166
|
+
### Single Machine, Multiple GPUs (torchrun)
|
|
167
|
+
|
|
168
|
+
```bash
|
|
169
|
+
# 4 GPUs on one machine
|
|
170
|
+
torchrun --nproc_per_node=4 train.py
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
```python
|
|
174
|
+
import ramjetio
|
|
175
|
+
import torch.distributed as dist
|
|
176
|
+
|
|
177
|
+
# Only LOCAL_RANK=0 starts cache server - others wait and share it
|
|
178
|
+
ramjetio.init()
|
|
179
|
+
|
|
180
|
+
# All ranks use the same cache
|
|
181
|
+
dataset = ramjetio.CachedDataset(your_dataset)
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
### Multi-Node Training
|
|
185
|
+
|
|
186
|
+
```bash
|
|
187
|
+
# Node 0 (master)
|
|
188
|
+
torchrun --nnodes=2 --node_rank=0 --nproc_per_node=4 \
|
|
189
|
+
--master_addr=node0 --master_port=29500 train.py
|
|
190
|
+
|
|
191
|
+
# Node 1
|
|
192
|
+
torchrun --nnodes=2 --node_rank=1 --nproc_per_node=4 \
|
|
193
|
+
--master_addr=node0 --master_port=29500 train.py
|
|
194
|
+
```
|
|
195
|
+
|
|
196
|
+
Each node runs one cache server (on LOCAL_RANK=0), and all nodes share data via consistent hashing.
|
|
197
|
+
|
|
198
|
+
### Separate Processes per Rank
|
|
199
|
+
|
|
200
|
+
```bash
|
|
201
|
+
# If not using torchrun, set env vars manually:
|
|
202
|
+
export RANK=0 WORLD_SIZE=4 LOCAL_RANK=0
|
|
203
|
+
python train.py
|
|
204
|
+
|
|
205
|
+
# On another terminal/machine:
|
|
206
|
+
export RANK=1 WORLD_SIZE=4 LOCAL_RANK=0
|
|
207
|
+
python train.py
|
|
208
|
+
```
|
|
209
|
+
|
|
210
|
+
RAMJET reads `LOCAL_RANK`, `RANK`, `WORLD_SIZE` from environment to coordinate.
|
|
211
|
+
|
|
212
|
+
## CLI Tools
|
|
213
|
+
|
|
214
|
+
```bash
|
|
215
|
+
# Start cache server manually (usually not needed — ramjetio.init() does this)
|
|
216
|
+
ramjetio-server --port 9000 --capacity 100GB
|
|
217
|
+
|
|
218
|
+
# Check cache status
|
|
219
|
+
ramjetio-client stats
|
|
220
|
+
|
|
221
|
+
# Clear cache
|
|
222
|
+
ramjetio-client clear
|
|
223
|
+
```
|
|
224
|
+
|
|
225
|
+
## Requirements
|
|
226
|
+
|
|
227
|
+
- Python 3.8+
|
|
228
|
+
- PyTorch 1.9+
|
|
229
|
+
- Linux (recommended for production)
|
|
230
|
+
- SSD storage for cache (recommended)
|
|
231
|
+
|
|
232
|
+
## Documentation
|
|
233
|
+
|
|
234
|
+
- [Integration Guide](docs/INTEGRATION.md) — detailed examples for all frameworks
|
|
235
|
+
- [API Reference](docs/API.md) — full API documentation
|
|
236
|
+
- [Troubleshooting](docs/TROUBLESHOOTING.md) — common issues and solutions
|
|
237
|
+
|
|
238
|
+
## License
|
|
239
|
+
|
|
240
|
+
PolyForm Noncommercial License 1.0.0 — free for personal and non-commercial use.
|
|
241
|
+
For commercial licensing, contact licensing@ramjet.dev. See [LICENSE](LICENSE) for details.
|
|
242
|
+
|
|
243
|
+
## Support
|
|
244
|
+
|
|
245
|
+
- 📧 Email: support@ramjet.io
|
|
246
|
+
- 💬 Discord: [discord.gg/ramjet](https://discord.gg/ramjet)
|
|
247
|
+
- 📖 Docs: [docs.ramjet.io](https://docs.ramjet.io)
|