purlin 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- purlin-0.1.0/LICENSE +28 -0
- purlin-0.1.0/PKG-INFO +17 -0
- purlin-0.1.0/README.md +3 -0
- purlin-0.1.0/purlin/CMakeLists.txt +102 -0
- purlin-0.1.0/purlin/CPM.cmake +24 -0
- purlin-0.1.0/purlin/__init__.py +107 -0
- purlin-0.1.0/purlin/bindings.py +161 -0
- purlin-0.1.0/purlin/jit.py +160 -0
- purlin-0.1.0/purlin.egg-info/PKG-INFO +17 -0
- purlin-0.1.0/purlin.egg-info/SOURCES.txt +13 -0
- purlin-0.1.0/purlin.egg-info/dependency_links.txt +1 -0
- purlin-0.1.0/purlin.egg-info/requires.txt +1 -0
- purlin-0.1.0/purlin.egg-info/top_level.txt +1 -0
- purlin-0.1.0/pyproject.toml +31 -0
- purlin-0.1.0/setup.cfg +4 -0
purlin-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
BSD 3-Clause License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026, Osayamen Jonathan Aimuyo
|
|
4
|
+
|
|
5
|
+
Redistribution and use in source and binary forms, with or without
|
|
6
|
+
modification, are permitted provided that the following conditions are met:
|
|
7
|
+
|
|
8
|
+
1. Redistributions of source code must retain the above copyright notice, this
|
|
9
|
+
list of conditions and the following disclaimer.
|
|
10
|
+
|
|
11
|
+
2. Redistributions in binary form must reproduce the above copyright notice,
|
|
12
|
+
this list of conditions and the following disclaimer in the documentation
|
|
13
|
+
and/or other materials provided with the distribution.
|
|
14
|
+
|
|
15
|
+
3. Neither the name of the copyright holder nor the names of its
|
|
16
|
+
contributors may be used to endorse or promote products derived from
|
|
17
|
+
this software without specific prior written permission.
|
|
18
|
+
|
|
19
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
20
|
+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
21
|
+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
22
|
+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
|
23
|
+
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
24
|
+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
25
|
+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
26
|
+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
27
|
+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
28
|
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
purlin-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: purlin
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Adaptable, template-based substrate for high-performance inter-GPU communication
|
|
5
|
+
Author-email: Osayamen Jonathan Aimuyo <osayamen@stanford.edu>
|
|
6
|
+
License-Expression: BSD-3-Clause
|
|
7
|
+
Project-URL: Homepage, https://github.com/osayamenja/purlin
|
|
8
|
+
Project-URL: Repository, https://github.com/osayamenja/purlin
|
|
9
|
+
Requires-Python: >=3.10
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Requires-Dist: torch>=2.8.0
|
|
13
|
+
Dynamic: license-file
|
|
14
|
+
|
|
15
|
+
# Purlin
|
|
16
|
+
Library that provides composable primitives for constructing high-performance and _adaptable_ device-initiated
|
|
17
|
+
communication operations.
|
purlin-0.1.0/README.md
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
cmake_minimum_required(VERSION 3.27)
project(purlin_v010 LANGUAGES CXX CUDA)

# Project-wide language level: C++20 for both host (CXX) and CUDA sources,
# required (no silent fallback) and without compiler-specific extensions.
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

set(CMAKE_CUDA_STANDARD 20)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
set(CMAKE_CUDA_EXTENSIONS OFF)

# -------------------------------------------------------
# flags
# -------------------------------------------------------
# Reusable warning set, expressed as usage requirements of an INTERFACE target.
# NOTE(review): nothing in this file links `purlin_warnings`, so these flags are
# currently not applied to the module target -- confirm whether that is intended.
add_library(purlin_warnings INTERFACE)

# Each generator expression is quoted so that the `;`-separated flag list stays
# inside a single argument.
target_compile_options(purlin_warnings INTERFACE
  "$<$<COMPILE_LANGUAGE:CXX>:-Wall;-Wextra;-fno-strict-aliasing;-Wno-unknown-pragmas;-Wnull-dereference;-Wnarrowing;-Wno-switch;-Wduplicated-branches;-Wformat=2;-Wno-unused-but-set-parameter;-Wno-sign-compare>"
  # Host-compiler warnings are forwarded through nvcc with -Xcompiler.
  "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=-Wall,-Wextra;-Xcompiler=-fno-strict-aliasing;-Xcompiler=-Wno-unknown-pragmas,-Wnull-dereference,-Wnarrowing;-Xcompiler=-Wno-switch,-Wduplicated-branches,-Wformat=2;-Xcompiler=-Wno-unused-but-set-parameter;-Xcudafe;--display_error_number;-Xcompiler=-Wno-sign-compare>"
)

find_package(CUDAToolkit REQUIRED)

# -------------------------------------------------------
# required inputs (supplied on the command line with -D)
# -------------------------------------------------------
if(NOT DEFINED GENERATED_SRC)
  message(FATAL_ERROR "Need -DGENERATED_SRC")
endif()

if(NOT DEFINED TARGET_MODULE_NAME)
  message(FATAL_ERROR "Need -DTARGET_MODULE_NAME")
endif()

if(NOT DEFINED ARCH)
  message(FATAL_ERROR "Need -DARCH")
endif()

# ARCH feeds integer comparisons and math(EXPR) below; reject empty or
# non-numeric values here with a clear message instead of failing later with a
# parse error.
if(NOT "${ARCH}" MATCHES "^[0-9]+$")
  message(FATAL_ERROR "ARCH must be an integer compute capability such as 80 or 90 (got '${ARCH}')")
endif()

if(ARCH LESS 70)
  message(FATAL_ERROR "Unsupported ARCH")
endif()

# GPU_ARCH is ARCH scaled by 10 (e.g. 90 -> 900); it is exported to the
# sources as the ARCH preprocessor definition at the end of this file.
math(EXPR GPU_ARCH "${ARCH} * 10" OUTPUT_FORMAT DECIMAL)
set(ARCH_TAG "${ARCH}")
if(ARCH GREATER_EQUAL 90)
  string(APPEND ARCH_TAG "a") # accelerated arch-specific instruction set
endif()

# ---- CPM ----
include("${CMAKE_CURRENT_LIST_DIR}/CPM.cmake")

# pybind11
set(PYBIND11_FINDPYTHON ON)
CPMAddPackage(
  NAME pybind11
  GITHUB_REPOSITORY pybind/pybind11
  VERSION 3.0.4
)
if(NOT COMMAND pybind11_add_module)
  message(FATAL_ERROR "pybind11 not found")
endif()
pybind11_add_module(${TARGET_MODULE_NAME} MODULE "${GENERATED_SRC}")

# -------------------------------------------------------
# dependencies
# -------------------------------------------------------
# NOTE(review): `main` looks like a branch name; if so, the fetched sources are
# not pinned to a fixed revision -- consider a tag or commit hash.
CPMAddPackage(
  NAME purlin
  GITHUB_REPOSITORY osayamenja/purlin
  GIT_TAG main
)
# An explicitly supplied -DPURLIN_SOURCE_DIR with an include/ directory takes
# precedence; otherwise link the purlin::purlin target.
if(DEFINED PURLIN_SOURCE_DIR AND EXISTS "${PURLIN_SOURCE_DIR}/include")
  target_include_directories(${TARGET_MODULE_NAME} PRIVATE "${PURLIN_SOURCE_DIR}/include")
else()
  target_link_libraries(${TARGET_MODULE_NAME} PRIVATE purlin::purlin)
endif()

# Options that take a separate value use the SHELL: prefix inside a quoted
# argument so the option/value pair is kept together and exempt from
# de-duplication.
target_compile_options(${TARGET_MODULE_NAME} PRIVATE
  "$<$<COMPILE_LANGUAGE:CXX>:-O3>"
  "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xfatbin -compress-all>"
  "$<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>"
  "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xptxas -v>"
  "$<$<COMPILE_LANGUAGE:CUDA>:-t0>"
  "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-gencode=arch=compute_${ARCH_TAG},code=sm_${ARCH_TAG}>"
)
target_compile_definitions(${TARGET_MODULE_NAME} PRIVATE
  ARCH=${GPU_ARCH}
  PURLIN_NVTX=1
)
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# SPDX-License-Identifier: MIT
|
|
2
|
+
#
|
|
3
|
+
# SPDX-FileCopyrightText: Copyright (c) 2019-2023 Lars Melchior and contributors
|
|
4
|
+
|
|
5
|
+
# Version of CPM.cmake to fetch and the SHA256 the downloaded file must match.
set(CPM_DOWNLOAD_VERSION 0.42.1)
set(CPM_HASH_SUM "f3a6dcc6a04ce9e7f51a127307fa4f699fb2bade357a8eb4c5b45df76e1dc6a5")

# Where the script is stored: the build tree by default, overridden by the
# CPM_SOURCE_CACHE CMake variable, or failing that the CPM_SOURCE_CACHE
# environment variable.
set(CPM_DOWNLOAD_LOCATION "${CMAKE_BINARY_DIR}/cmake/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
if(CPM_SOURCE_CACHE)
  set(CPM_DOWNLOAD_LOCATION "${CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
elseif(DEFINED ENV{CPM_SOURCE_CACHE})
  set(CPM_DOWNLOAD_LOCATION "$ENV{CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
endif()

# Expand relative path. This is important if the provided path contains a tilde (~)
get_filename_component(CPM_DOWNLOAD_LOCATION ${CPM_DOWNLOAD_LOCATION} ABSOLUTE)

# Fetch the release script; EXPECTED_HASH makes the configure step fail if the
# file does not match the recorded checksum.
file(
  DOWNLOAD
  https://github.com/cpm-cmake/CPM.cmake/releases/download/v${CPM_DOWNLOAD_VERSION}/CPM.cmake
  ${CPM_DOWNLOAD_LOCATION}
  EXPECTED_HASH SHA256=${CPM_HASH_SUM}
)

include(${CPM_DOWNLOAD_LOCATION})
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
from enum import IntEnum
|
|
2
|
+
import torch
|
|
3
|
+
|
|
4
|
+
class ContextHandle:
    """Bundle of everything `initialize` returns and `finalize` later clears.

    Attributes (fixed by ``__slots__``):
        mod:      the compiled extension module the collectives are called on.
        ctx:      the value returned by ``mod.initialize``; set to ``None`` by ``finalize``.
        buf:      the staging tensor allocated in ``initialize``.
        hdl:      the rendezvous handle for ``buf``.
        sig_buf:  the signal tensor allocated in ``initialize``.
        sig_hdl:  the rendezvous handle for ``sig_buf``.
    """

    __slots__ = ("mod", "ctx", "buf", "hdl", "sig_buf", "sig_hdl")

    def __init__(self, mod, ctx, sym_buf, sym_hdl, sig_buf, sig_hdl):
        # The constructor arguments are positionally aligned with __slots__.
        values = (mod, ctx, sym_buf, sym_hdl, sig_buf, sig_hdl)
        for slot, value in zip(ContextHandle.__slots__, values):
            setattr(self, slot, value)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# Sizes in bytes; `initialize` combines them to size its uint8 staging tensor.
# NOTE(review): presumably these must agree with the constants compiled into
# the native library -- confirm.
STAGING_BUFFER_SIZE = 256 << 20  # 256 MiB
PACKET_BUFFER_SIZE = 2 * (512 << 10)  # 2 x 512 KiB
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class DataType(IntEnum):
    """Integer tag for a tensor's element type, passed to the reduction bindings.

    NOTE(review): the numeric values presumably have to match the native
    enum the bindings switch on -- confirm before renumbering.
    """

    BF16, FP16, FP32, FP64 = 0, 1, 2, 3
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def normalize_arch(arch: int):
    """Bucket a compute-capability number into 1000, 900, 800 or 700.

    Values of 100 and above map to 1000, 90-99 to 900, 80-89 to 800, and
    everything below 80 (including values under 70) to 700.
    """
    # Thresholds are checked from highest to lowest; the first one reached wins.
    for floor in (100, 90, 80):
        if arch >= floor:
            return floor * 10
    return 700
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def buffer_type(t: torch.dtype):
    """Translate a torch dtype into the matching `DataType` tag.

    Args:
        t: one of torch.float16, torch.bfloat16, torch.float32, torch.float64.

    Returns:
        The corresponding `DataType` member.

    Raises:
        AssertionError: if `t` is none of the supported dtypes. The error is
            raised explicitly rather than via `assert False`, so it still
            fires when Python runs with `-O` (which strips assert statements
            and would otherwise let this function return None).
    """
    supported = (
        (torch.float16, DataType.FP16),
        (torch.bfloat16, DataType.BF16),
        (torch.float32, DataType.FP32),
        (torch.float64, DataType.FP64),
    )
    for dtype, tag in supported:
        if t == dtype:
            return tag
    raise AssertionError("invalid type")
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def initialize(group, device: torch.device, arch: int, stream_ptr: int):
    """Compile/load the extension, allocate symmetric buffers and build a context.

    Args:
        group: process group used for rank/world-size lookup and rendezvous.
        device: device on which the symmetric-memory tensors are allocated.
        arch: compute-capability number; used verbatim for the JIT build and
            bucketed through `normalize_arch` for the module name.
        stream_ptr: integer forwarded unchanged to the extension's `initialize`.

    Returns:
        A `ContextHandle` holding the module, the native context and both
        tensors with their rendezvous handles.
    """
    import torch.distributed._symmetric_memory as sym_mem
    from . import jit
    from .bindings import purlin_bindings

    rank = torch.distributed.get_rank(group)
    world = torch.distributed.get_world_size(group)

    # Render the binding source for this arch bucket and obtain the built module.
    module_name = "purlin_{}".format(normalize_arch(arch))
    source = purlin_bindings.substitute(mod_name=module_name)
    module = jit.get_compiled(arch, source, "purlin", module_name)

    # staging buffers
    staging_bytes = 2 * (STAGING_BUFFER_SIZE + (world * PACKET_BUFFER_SIZE))
    staging = sym_mem.empty(staging_bytes, dtype=torch.uint8, device=device)
    staging.zero_()

    # signal pads
    signals = sym_mem.empty(2 * world, dtype=torch.uint64, device=device)
    signals.zero_()

    # Both tensors are zeroed before either rendezvous, as in the original order.
    staging_hdl = sym_mem.rendezvous(staging, group)
    signals_hdl = sym_mem.rendezvous(signals, group)

    native_ctx = module.initialize(
        rank, world, staging_hdl.buffer_ptrs, signals_hdl.buffer_ptrs, stream_ptr
    )
    return ContextHandle(module, native_ctx, staging, staging_hdl, signals, signals_hdl)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def finalize(handle: ContextHandle, stream_ptr: int):
    """Tear down the native context and drop the handle's buffer references.

    Does nothing when `handle.ctx` is already None, so calling it twice on
    the same handle is harmless. `handle.mod` is left in place.
    """
    if handle.ctx is not None:
        handle.mod.finalize(handle.ctx, stream_ptr)
        # Clear the context first, then the rendezvous handles and tensors.
        for field in ("ctx", "hdl", "buf", "sig_buf", "sig_hdl"):
            setattr(handle, field, None)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def all_gather(in_tensor: torch.Tensor, out_tensor: torch.Tensor, handle: ContextHandle, stream_ptr: int):
    """Invoke the extension's `all_gather` on two contiguous tensors.

    Passes the raw data pointers of both tensors, the byte size of
    `in_tensor`, the native context and `stream_ptr` to the module.
    """
    assert in_tensor.is_contiguous()
    assert out_tensor.is_contiguous()
    call = handle.mod.all_gather
    call(
        in_tensor.data_ptr(),
        out_tensor.data_ptr(),
        in_tensor.nbytes,
        handle.ctx,
        stream_ptr,
    )
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def all_reduce(in_tensor: torch.Tensor, out_tensor: torch.Tensor, handle: ContextHandle, stream_ptr: int):
    """Invoke the extension's `all_reduce` on two contiguous tensors.

    The element type tag is derived from `in_tensor.dtype` via `buffer_type`
    and passed along with the data pointers, the byte size of `in_tensor`,
    the native context and `stream_ptr`.
    """
    assert in_tensor.is_contiguous()
    assert out_tensor.is_contiguous()
    dtype_tag = buffer_type(in_tensor.dtype)
    call = handle.mod.all_reduce
    call(
        in_tensor.data_ptr(),
        out_tensor.data_ptr(),
        in_tensor.nbytes,
        dtype_tag,
        handle.ctx,
        stream_ptr,
    )
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def all_to_all(in_tensor: torch.Tensor, out_tensor: torch.Tensor, handle: ContextHandle, stream_ptr: int):
    """Invoke the extension's `all_to_all` on two contiguous tensors.

    Passes the raw data pointers of both tensors, the byte size of
    `in_tensor`, the native context and `stream_ptr` to the module.
    """
    assert in_tensor.is_contiguous()
    assert out_tensor.is_contiguous()
    call = handle.mod.all_to_all
    call(
        in_tensor.data_ptr(),
        out_tensor.data_ptr(),
        in_tensor.nbytes,
        handle.ctx,
        stream_ptr,
    )
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def reduce_scatter(in_tensor: torch.Tensor, out_tensor: torch.Tensor, handle: ContextHandle, stream_ptr: int):
    """Invoke the extension's `reduce_scatter` on two contiguous tensors.

    The element type tag is derived from `in_tensor.dtype` via `buffer_type`.
    The byte count passed is that of the whole `in_tensor`.
    """
    assert in_tensor.is_contiguous()
    assert out_tensor.is_contiguous()
    dtype_tag = buffer_type(in_tensor.dtype)
    call = handle.mod.reduce_scatter
    call(
        in_tensor.data_ptr(),
        out_tensor.data_ptr(),
        in_tensor.nbytes,
        dtype_tag,
        handle.ctx,
        stream_ptr,
    )
|
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
from string import Template
|
|
2
|
+
# CUDA/C++ source of the pybind11 extension, rendered with
# `purlin_bindings.substitute(mod_name=...)`. `mod_name` is the only
# placeholder; it becomes the name passed to PYBIND11_MODULE.
purlin_bindings = Template(r"""
//
// Created by Osayamen on 6/2/26.
//
#include <cstdint>
#include <cstdio>
#include <cuda_runtime.h>

#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <vector>

#include <purlin/host.cuh>

namespace py = pybind11;

static std::uintptr_t purlin_initialize(const int& rank,
                                        const int& world,
                                        const std::vector<std::uintptr_t>& staging_table,
                                        const std::vector<std::uintptr_t>& signal_table,
                                        const std::uintptr_t& stream_ptr) {
    auto stream = reinterpret_cast<cudaStream_t>(stream_ptr);
    // allocate pointer tables
    void* stagingTR = nullptr;
    void* stagingLR = nullptr;
    void* signals = nullptr;
    void* gatherSignals = nullptr;

    CHECK_CUDA(cudaMallocAsync(&stagingTR, sizeof(cuda::std::byte*) * world, stream));
    static_assert(sizeof(uintptr_t) == sizeof(cuda::std::byte*));
    CHECK_CUDA(cudaMemcpyAsync(stagingTR, staging_table.data(), sizeof(uintptr_t) * world,
                               cudaMemcpyHostToDevice, stream));
    CHECK_CUDA(cudaMallocAsync(&stagingLR, sizeof(cuda::std::byte*) * world, stream));
    CHECK_CUDA(cudaMallocAsync(&signals, sizeof(uint64_t*) * world, stream));
    CHECK_CUDA(cudaMemcpyAsync(signals, signal_table.data(), sizeof(uintptr_t) * world,
                               cudaMemcpyHostToDevice, stream));
    CHECK_CUDA(cudaMallocAsync(&gatherSignals, sizeof(uint64_t*) * world, stream));

    // second pointer table: every staging base advanced by a fixed byte offset
    std::vector<uintptr_t> stagingStash(world);
    constexpr auto offsetTR = 2 * purlin::STAGING_BUFFER_SIZE_;
    for (int i = 0; i < world; i++) {
        stagingStash[i] = reinterpret_cast<uintptr_t>(reinterpret_cast<cuda::std::byte*>(staging_table[i]) + offsetTR);
    }
    CHECK_CUDA(cudaMemcpyAsync(stagingLR, stagingStash.data(), sizeof(uintptr_t) * world,
                               cudaMemcpyHostToDevice, stream));
    // second signal table: every signal base advanced by `world` 64-bit slots
    std::vector<uintptr_t> signalStash(world);
    const auto offsetSig = world;
    for (int i = 0; i < world; i++) {
        signalStash[i] = reinterpret_cast<uintptr_t>(reinterpret_cast<uint64_t*>(signal_table[i]) + offsetSig);
    }
    CHECK_CUDA(cudaMemcpyAsync(gatherSignals, signalStash.data(), sizeof(uintptr_t) * world,
                               cudaMemcpyHostToDevice, stream));

    const auto ctx = purlin::initialize(rank, world,
                                        static_cast<cuda::std::byte**>(stagingLR),
                                        static_cast<cuda::std::byte**>(stagingTR),
                                        static_cast<uint64_t**>(signals),
                                        static_cast<uint64_t**>(gatherSignals), stream);
    // the host-side stash vectors must outlive the async copies queued above
    CHECK_CUDA(cudaStreamSynchronize(stream));
    auto* pyCtx = new purlin::Context(ctx);
    return reinterpret_cast<uintptr_t>(pyCtx);
}

static void purlin_finalize(const uintptr_t& raw_ctx, const uintptr_t& stream_ptr) {
    auto stream = reinterpret_cast<cudaStream_t>(stream_ptr);
    const auto* ctx = reinterpret_cast<purlin::Context*>(raw_ctx);
    if (!ctx) return;
    purlin::finalize(*ctx, stream);
    CHECK_CUDA(cudaFreeAsync(ctx->staging, stream));
    CHECK_CUDA(cudaFreeAsync(ctx->stagingLR, stream));
    CHECK_CUDA(cudaFreeAsync(ctx->signals, stream));
    CHECK_CUDA(cudaFreeAsync(ctx->gatherSignals, stream));
    CHECK_CUDA(cudaStreamSynchronize(stream));
    delete ctx;
}

static void all_gather(const uintptr_t& src, const uintptr_t& dst, const size_t& bytes,
                       const uintptr_t& raw_ctx, const uintptr_t& stream_ptr) {
    purlin::allGather(
        reinterpret_cast<cuda::std::byte*>(src),
        reinterpret_cast<cuda::std::byte*>(dst),
        bytes, *reinterpret_cast<purlin::Context*>(raw_ctx), reinterpret_cast<cudaStream_t>(stream_ptr));
}

static void all_reduce(const uintptr_t& src, const uintptr_t& dst, const size_t& bytes,
                       const int& buffer_type, const uintptr_t& raw_ctx, const uintptr_t& stream_ptr) {
    switch (buffer_type) {
        case purlin::TensorType::fp16: {
            purlin::allReduce<__half>(reinterpret_cast<cuda::std::byte*>(src),
                                      reinterpret_cast<cuda::std::byte*>(dst),
                                      bytes, *reinterpret_cast<purlin::Context*>(raw_ctx), reinterpret_cast<cudaStream_t>(stream_ptr));
        }
        break;
        case purlin::TensorType::bf16: {
            purlin::allReduce<__nv_bfloat16>(reinterpret_cast<cuda::std::byte*>(src),
                                             reinterpret_cast<cuda::std::byte*>(dst),
                                             bytes, *reinterpret_cast<purlin::Context*>(raw_ctx), reinterpret_cast<cudaStream_t>(stream_ptr));
        }
        break;
        case purlin::TensorType::fp32: {
            purlin::allReduce<float>(reinterpret_cast<cuda::std::byte*>(src),
                                     reinterpret_cast<cuda::std::byte*>(dst),
                                     bytes, *reinterpret_cast<purlin::Context*>(raw_ctx), reinterpret_cast<cudaStream_t>(stream_ptr));
        }
        break;
        default: {
            // fp64
            purlin::allReduce<double>(reinterpret_cast<cuda::std::byte*>(src),
                                      reinterpret_cast<cuda::std::byte*>(dst),
                                      bytes, *reinterpret_cast<purlin::Context*>(raw_ctx), reinterpret_cast<cudaStream_t>(stream_ptr));
        }
    }
}

static void all_to_all(const uintptr_t& src, const uintptr_t& dst, const size_t& bytes,
                       const uintptr_t& raw_ctx, const uintptr_t& stream_ptr) {
    purlin::all2all(
        reinterpret_cast<cuda::std::byte*>(src),
        reinterpret_cast<cuda::std::byte*>(dst),
        bytes, *reinterpret_cast<purlin::Context*>(raw_ctx), reinterpret_cast<cudaStream_t>(stream_ptr));
}

static void reduce_scatter(const uintptr_t& src, const uintptr_t& dst, const size_t& bytes,
                           const int& buffer_type, const uintptr_t& raw_ctx, const uintptr_t& stream_ptr) {
    const auto ctx = *reinterpret_cast<purlin::Context*>(raw_ctx);
    auto stream = reinterpret_cast<cudaStream_t>(stream_ptr);
    // the library call receives the per-rank share of the input byte count
    const auto localBytes = bytes / ctx.world_l;
    switch (buffer_type) {
        case purlin::TensorType::fp16: {
            purlin::reduceScatter<__half>(reinterpret_cast<cuda::std::byte*>(src),
                                          reinterpret_cast<cuda::std::byte*>(dst), localBytes, ctx, stream);
        }
        break;
        case purlin::TensorType::bf16: {
            purlin::reduceScatter<__nv_bfloat16>(reinterpret_cast<cuda::std::byte*>(src),
                                                 reinterpret_cast<cuda::std::byte*>(dst), localBytes, ctx, stream);
        }
        break;
        case purlin::TensorType::fp32: {
            purlin::reduceScatter<float>(reinterpret_cast<cuda::std::byte*>(src),
                                         reinterpret_cast<cuda::std::byte*>(dst), localBytes, ctx, stream);
        }
        break;
        default: {
            // fp64
            purlin::reduceScatter<double>(reinterpret_cast<cuda::std::byte*>(src),
                                          reinterpret_cast<cuda::std::byte*>(dst), localBytes, ctx, stream);
        }
    }
}

PYBIND11_MODULE($mod_name, m) {
    m.def("initialize", &purlin_initialize);
    m.def("finalize", &purlin_finalize);
    m.def("all_gather", &all_gather);
    m.def("all_reduce", &all_reduce);
    m.def("all_to_all", &all_to_all);
    m.def("reduce_scatter", &reduce_scatter);
}
""")
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import threading
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
def _verify_dirs() -> None:
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
root = Path(__file__).resolve().parent
|
|
9
|
+
|
|
10
|
+
if not (root / "CMakeLists.txt").exists():
|
|
11
|
+
raise RuntimeError("JIT CMakeLists.txt not found at package root")
|
|
12
|
+
|
|
13
|
+
def _load_ext(mod_name: str, so_path: Path):
|
|
14
|
+
import importlib.util
|
|
15
|
+
spec = importlib.util.spec_from_file_location(mod_name, so_path)
|
|
16
|
+
if spec is None or spec.loader is None:
|
|
17
|
+
raise ImportError(f"Could not load {mod_name} from {so_path}")
|
|
18
|
+
mod = importlib.util.module_from_spec(spec)
|
|
19
|
+
spec.loader.exec_module(mod)
|
|
20
|
+
return mod
|
|
21
|
+
|
|
22
|
+
def get_compiled(arch: int, src: str, mod_prefix: str, mod_name: str):
    """Build (or reuse) the extension for `src` and return the imported module.

    The built `.so` is cached under `$PURLIN_CACHE_DIR` (default
    `~/.cache/purlin_jit`) in a directory named after a hash of the module
    name, the target arch, the Python major/minor version and the source.
    A lock file in that directory elects a single builder; every other caller
    polls until the final `.so` appears.

    Args:
        arch: value passed to CMake as both `CMAKE_CUDA_ARCHITECTURES` and `ARCH`.
        src: full source text of the binding file to compile.
        mod_prefix: prefix of the generated source file name (`<prefix>_bindings.cu`).
        mod_name: name of the extension module / CMake target.

    Returns:
        The loaded extension module.

    Raises:
        RuntimeError: if the JIT `CMakeLists.txt` is missing (from `_verify_dirs`).
        subprocess.CalledProcessError: if the CMake configure or build step fails.
        FileNotFoundError: if the build succeeds but produces no `<mod_name>*.so`.
        TimeoutError: if another process holds the lock and no artifact appears
            within the wait timeout.
        ImportError: if the artifact cannot be loaded.
    """
    import hashlib
    import os
    import shutil
    import socket
    import subprocess
    import sys
    import time

    _verify_dirs()

    cache = Path(os.environ.get("PURLIN_CACHE_DIR", str(Path.home() / ".cache" / "purlin_jit")))
    cache.mkdir(parents=True, exist_ok=True)

    # `arch` is part of the key because the build is specialised on it
    # (-DARCH / -DCMAKE_CUDA_ARCHITECTURES) while `mod_name` may be shared by
    # several arch values; without it, different GPUs would reuse each other's
    # binaries from a shared cache directory.
    key = hashlib.sha256(
        f"{mod_name}|arch{arch}|py{sys.version_info[:2]}|{src}".encode()
    ).hexdigest()[:16]

    build_root = cache / f"{key}"
    build_root.mkdir(parents=True, exist_ok=True)

    so_path = build_root / f"{mod_name}.so"
    lock_path = build_root / ".build.lock"

    # Fast path
    if so_path.exists():
        return _load_ext(mod_name, so_path)

    # Process-unique tag for temp dirs
    host = socket.gethostname()
    pid = os.getpid()
    tid = threading.get_ident()
    uniq = f"{host}_tid{tid}_pid{pid}"

    # Paths only; the directories are created once this caller is the builder.
    gen_dir = build_root / f"gen_{uniq}"
    bdir = build_root / f"build_{uniq}"
    generated = gen_dir / f"{mod_prefix}_bindings.cu"

    cmake_source_dir = Path(__file__).resolve().parent

    def _try_acquire_lock() -> bool:
        """Create the lock file exclusively; return False if it already exists."""
        try:
            fd = os.open(lock_path, os.O_CREAT | os.O_EXCL | os.O_WRONLY)
            with os.fdopen(fd, "w") as f:
                f.write(f"host={host}\npid={pid}\ntid={tid}\ntime={time.time()}\n")
            return True
        except FileExistsError:
            return False

    def _lock_is_stale() -> bool:
        """Decide whether the existing lock belongs to a dead local process."""
        try:
            fields = dict(
                line.split("=", 1)
                for line in lock_path.read_text().splitlines()
                if "=" in line
            )
        except OSError:
            return False

        lock_host = fields.get("host")
        lock_pid = fields.get("pid")
        if not lock_host or not lock_pid:
            # Incomplete lock contents are treated as stale.
            return True
        if lock_host != host:
            # A process on another host cannot be probed from here.
            return False
        try:
            # Signal 0 performs only the existence/permission check.
            os.kill(int(lock_pid), 0)
        except ProcessLookupError:
            return True
        except (PermissionError, ValueError):
            return False
        return False

    def _release_lock() -> None:
        """Remove the lock file, ignoring the case where it is already gone."""
        try:
            lock_path.unlink()
        except FileNotFoundError:
            pass

    def _wait_for_artifact(timeout_s: float = 1800.0, poll_s: float = 0.1):
        """Poll for the final `.so` and load it, or time out."""
        start = time.time()
        while True:
            if so_path.exists():
                return _load_ext(mod_name, so_path)

            if time.time() - start > timeout_s:
                raise TimeoutError(
                    f"Timed out waiting for JIT artifact {so_path} while another process was building it."
                )

            time.sleep(poll_s)

    # Try to become the builder
    have_lock = _try_acquire_lock()

    if not have_lock and _lock_is_stale():
        _release_lock()
        have_lock = _try_acquire_lock()

    if not have_lock:
        # Another process is building. Wait for the final .so to appear.
        return _wait_for_artifact()

    try:
        # Double-check after lock acquisition in case another process finished just before us
        if so_path.exists():
            return _load_ext(mod_name, so_path)

        # Only the builder materialises the scratch directories and source file.
        gen_dir.mkdir(exist_ok=True)
        bdir.mkdir(exist_ok=True)
        generated.write_text(src)

        subprocess.run([
            "cmake", "-S", str(cmake_source_dir), "-B", str(bdir), "-G", "Ninja",
            f"-DGENERATED_SRC={generated}",
            f"-DTARGET_MODULE_NAME={mod_name}",
            f"-DCMAKE_CUDA_ARCHITECTURES={arch}",
            f"-DCPM_SOURCE_CACHE={Path.home() / '.cache' / 'cpm'}",
            "-DCMAKE_BUILD_TYPE=Release",
            f"-DARCH={arch}"
        ], check=True)

        subprocess.run([
            "cmake", "--build", str(bdir), "--parallel"
        ], check=True)

        built = next(bdir.glob(mod_name + "*.so"), None)
        if built is None:
            raise FileNotFoundError(
                f"JIT build finished but no {mod_name}*.so was produced in {bdir}"
            )

        # Copy into a temp path in build_root, then atomically replace final path
        tmp_so = build_root / f".{mod_name}.{uniq}.tmp.so"
        shutil.copy2(built, tmp_so)
        tmp_so.replace(so_path)

    finally:
        _release_lock()

    return _load_ext(mod_name, so_path)
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: purlin
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Adaptable, template-based substrate for high-performance inter-GPU communication
|
|
5
|
+
Author-email: Osayamen Jonathan Aimuyo <osayamen@stanford.edu>
|
|
6
|
+
License-Expression: BSD-3-Clause
|
|
7
|
+
Project-URL: Homepage, https://github.com/osayamenja/purlin
|
|
8
|
+
Project-URL: Repository, https://github.com/osayamenja/purlin
|
|
9
|
+
Requires-Python: >=3.10
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Requires-Dist: torch>=2.8.0
|
|
13
|
+
Dynamic: license-file
|
|
14
|
+
|
|
15
|
+
# Purlin
|
|
16
|
+
Library that provides composable primitives for constructing high-performance and _adaptable_ device-initiated
|
|
17
|
+
communication operations.
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
purlin/CMakeLists.txt
|
|
5
|
+
purlin/CPM.cmake
|
|
6
|
+
purlin/__init__.py
|
|
7
|
+
purlin/bindings.py
|
|
8
|
+
purlin/jit.py
|
|
9
|
+
purlin.egg-info/PKG-INFO
|
|
10
|
+
purlin.egg-info/SOURCES.txt
|
|
11
|
+
purlin.egg-info/dependency_links.txt
|
|
12
|
+
purlin.egg-info/requires.txt
|
|
13
|
+
purlin.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
torch>=2.8.0
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
purlin
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=77"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "purlin"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
readme = "README.md"
|
|
9
|
+
license = "BSD-3-Clause"
|
|
10
|
+
license-files = ["LICENSE"]
|
|
11
|
+
description = "Adaptable, template-based substrate for high-performance inter-GPU communication"
|
|
12
|
+
requires-python = ">=3.10"
|
|
13
|
+
authors = [
|
|
14
|
+
{ name = "Osayamen Jonathan Aimuyo", email = "osayamen@stanford.edu" },
|
|
15
|
+
]
|
|
16
|
+
dependencies = [
|
|
17
|
+
"torch>=2.8.0",
|
|
18
|
+
]
|
|
19
|
+
|
|
20
|
+
[project.urls]
|
|
21
|
+
Homepage = "https://github.com/osayamenja/purlin"
|
|
22
|
+
Repository = "https://github.com/osayamenja/purlin"
|
|
23
|
+
|
|
24
|
+
[tool.setuptools]
|
|
25
|
+
packages = ["purlin"]
|
|
26
|
+
|
|
27
|
+
[tool.setuptools.package-data]
|
|
28
|
+
purlin = [
|
|
29
|
+
"CMakeLists.txt",
|
|
30
|
+
"CPM.cmake"
|
|
31
|
+
]
|
purlin-0.1.0/setup.cfg
ADDED