purlin 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
purlin/CMakeLists.txt ADDED
@@ -0,0 +1,102 @@
1
# Build script for the JIT-compiled purlin Python extension.
cmake_minimum_required(VERSION 3.27)
# Only host C++ and CUDA sources are compiled by this project.
project(purlin_v010 LANGUAGES CXX CUDA)

# Require C++20 with compiler extensions disabled, for host and CUDA
# translation units alike.
foreach(purlin_lang IN ITEMS CXX CUDA)
  set(CMAKE_${purlin_lang}_STANDARD 20)
  set(CMAKE_${purlin_lang}_STANDARD_REQUIRED ON)
  set(CMAKE_${purlin_lang}_EXTENSIONS OFF)
endforeach()
11
+
12
# flags
# Interface-only target that carries the warning/diagnostic options; a target
# receives them by linking against purlin_warnings.
add_library(purlin_warnings INTERFACE)

# Options applied to C++ translation units.
set(purlin_host_warning_flags
  -Wall
  -Wextra
  -fno-strict-aliasing
  -Wno-unknown-pragmas
  -Wnull-dereference
  -Wnarrowing
  -Wno-switch
  -Wduplicated-branches
  -Wformat=2
  -Wno-unused-but-set-parameter
  -Wno-sign-compare
)

# Options applied to CUDA translation units: the same set, forwarded through
# -Xcompiler, plus a -Xcudafe diagnostic switch.
set(purlin_cuda_warning_flags
  -Xcompiler=-Wall,-Wextra
  -Xcompiler=-fno-strict-aliasing
  -Xcompiler=-Wno-unknown-pragmas,-Wnull-dereference,-Wnarrowing
  -Xcompiler=-Wno-switch,-Wduplicated-branches,-Wformat=2
  -Xcompiler=-Wno-unused-but-set-parameter
  -Xcudafe
  --display_error_number
  -Xcompiler=-Wno-sign-compare
)

# Quoted so each generator expression reaches CMake as a single argument.
target_compile_options(purlin_warnings INTERFACE
  "$<$<COMPILE_LANGUAGE:CXX>:${purlin_host_warning_flags}>"
  "$<$<COMPILE_LANGUAGE:CUDA>:${purlin_cuda_warning_flags}>"
)

# The helper lists are only needed to assemble the expressions above.
unset(purlin_host_warning_flags)
unset(purlin_cuda_warning_flags)
39
+
40
find_package(CUDAToolkit REQUIRED)

# These cache entries must be supplied on the cmake command line (-D...).
foreach(purlin_required_var IN ITEMS GENERATED_SRC TARGET_MODULE_NAME ARCH)
  if(NOT DEFINED ${purlin_required_var})
    message(FATAL_ERROR "Need -D${purlin_required_var}")
  endif()
endforeach()

# ARCH is used arithmetically below and spliced into the -gencode flag, so
# reject anything that is not a plain non-negative integer up front. Without
# this an empty value breaks the if() syntax and a non-numeric value only fails
# later inside math().
if(NOT "${ARCH}" MATCHES "^[0-9]+$")
  message(FATAL_ERROR "ARCH must be a non-negative integer such as 80 or 90, got '${ARCH}'")
endif()

# if() dereferences the bare variable name itself; no ${} expansion is needed.
if(ARCH LESS 70)
  message(FATAL_ERROR "Unsupported ARCH")
endif()

# GPU_ARCH is ARCH scaled by ten (e.g. 90 -> 900); it is exported to the
# sources as the ARCH preprocessor definition further down.
math(EXPR GPU_ARCH "${ARCH} * 10" OUTPUT_FORMAT DECIMAL)

# ARCH_TAG is the suffix used for compute_<tag>/sm_<tag> in the -gencode flag.
set(ARCH_TAG "${ARCH}")
if(ARCH GREATER_EQUAL 90)
  string(APPEND ARCH_TAG "a") # accelerated arch-specific instruction set
endif()
63
+
64
# ---- CPM ----
# Bootstrap the CPM package manager via the helper script next to this file.
include("${CMAKE_CURRENT_LIST_DIR}/CPM.cmake")

#pybind11
# Ask pybind11 to locate Python through CMake's FindPython machinery.
set(PYBIND11_FINDPYTHON ON)
CPMAddPackage(
  NAME pybind11
  GITHUB_REPOSITORY pybind/pybind11
  VERSION 3.0.4
)
if(NOT COMMAND pybind11_add_module)
  message(FATAL_ERROR "pybind11 not found")
endif()

# The extension module is built from the single generated bindings source.
pybind11_add_module(${TARGET_MODULE_NAME} MODULE ${GENERATED_SRC})

# purlin_warnings is an INTERFACE target: its flags only take effect on targets
# that link it. Without this line the warning set defined above is never used.
target_link_libraries(${TARGET_MODULE_NAME} PRIVATE purlin_warnings)
78
+
79
# -------------------------------------------------------
# dependencies
# -------------------------------------------------------
# NOTE(review): GIT_TAG main looks like a branch name rather than a fixed
# revision, so builds may not be reproducible - confirm and consider pinning.
CPMAddPackage(
  NAME purlin
  GITHUB_REPOSITORY osayamenja/purlin
  GIT_TAG main
)

# Link the purlin::purlin target unless the caller supplied PURLIN_SOURCE_DIR
# and it contains an include/ directory; in that case only add that directory
# to the include path.
# NOTE(review): this checks the upper-case PURLIN_SOURCE_DIR; presumably it is
# meant as a user-provided override - verify against the JIT driver.
if(NOT DEFINED PURLIN_SOURCE_DIR OR NOT EXISTS "${PURLIN_SOURCE_DIR}/include")
  target_link_libraries(${TARGET_MODULE_NAME} PRIVATE purlin::purlin)
else()
  target_include_directories(${TARGET_MODULE_NAME} PRIVATE "${PURLIN_SOURCE_DIR}/include")
endif()
92
+
93
# Per-language build flags for the extension module. Each generator expression
# is quoted so it reaches CMake as one argument; SHELL: keeps an option and its
# value together as a single group.
target_compile_options(${TARGET_MODULE_NAME} PRIVATE
  "$<$<COMPILE_LANGUAGE:CXX>:-O3>"
  "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xfatbin -compress-all>"
  "$<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr;-Xptxas;-v;-t0>"
  "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-gencode=arch=compute_${ARCH_TAG},code=sm_${ARCH_TAG}>"
)

# ARCH carries the scaled architecture number computed earlier (GPU_ARCH);
# PURLIN_NVTX is defined to 1.
target_compile_definitions(${TARGET_MODULE_NAME} PRIVATE
  "ARCH=${GPU_ARCH}"
  "PURLIN_NVTX=1"
)
purlin/CPM.cmake ADDED
@@ -0,0 +1,24 @@
1
+ # SPDX-License-Identifier: MIT
2
+ #
3
+ # SPDX-FileCopyrightText: Copyright (c) 2019-2023 Lars Melchior and contributors
4
+
5
# Release of CPM.cmake to fetch and the SHA256 the download must match.
set(CPM_DOWNLOAD_VERSION 0.42.1)
set(CPM_HASH_SUM "f3a6dcc6a04ce9e7f51a127307fa4f699fb2bade357a8eb4c5b45df76e1dc6a5")

# The script is stored under a versioned file name. Location precedence:
# the CPM_SOURCE_CACHE variable, then the CPM_SOURCE_CACHE environment
# variable, then the build tree.
set(cpm_script_name "CPM_${CPM_DOWNLOAD_VERSION}.cmake")
if(CPM_SOURCE_CACHE)
  set(CPM_DOWNLOAD_LOCATION "${CPM_SOURCE_CACHE}/cpm/${cpm_script_name}")
elseif(DEFINED ENV{CPM_SOURCE_CACHE})
  set(CPM_DOWNLOAD_LOCATION "$ENV{CPM_SOURCE_CACHE}/cpm/${cpm_script_name}")
else()
  set(CPM_DOWNLOAD_LOCATION "${CMAKE_BINARY_DIR}/cmake/${cpm_script_name}")
endif()
unset(cpm_script_name)

# Expand relative path. This is important if the provided path contains a tilde (~)
get_filename_component(CPM_DOWNLOAD_LOCATION ${CPM_DOWNLOAD_LOCATION} ABSOLUTE)

# Fetch the release asset and verify it against the expected hash.
file(DOWNLOAD
  https://github.com/cpm-cmake/CPM.cmake/releases/download/v${CPM_DOWNLOAD_VERSION}/CPM.cmake
  ${CPM_DOWNLOAD_LOCATION}
  EXPECTED_HASH SHA256=${CPM_HASH_SUM}
)

include(${CPM_DOWNLOAD_LOCATION})
purlin/__init__.py ADDED
@@ -0,0 +1,107 @@
1
+ from enum import IntEnum
2
+ import torch
3
+
4
class ContextHandle:
    """Plain record of the objects produced by ``initialize``.

    Attributes (fixed by ``__slots__``):
        mod: the compiled extension module.
        ctx: the value returned by ``mod.initialize``; ``finalize`` resets it to None.
        buf / hdl: staging buffer tensor and its rendezvous handle.
        sig_buf / sig_hdl: signal-pad tensor and its rendezvous handle.
    """

    __slots__ = ("mod", "ctx", "buf", "hdl", "sig_buf", "sig_hdl")

    def __init__(self, mod, ctx, sym_buf, sym_hdl, sig_buf, sig_hdl):
        # Positional arguments line up one-to-one with the slot names.
        values = (mod, ctx, sym_buf, sym_hdl, sig_buf, sig_hdl)
        for slot, value in zip(self.__slots__, values):
            setattr(self, slot, value)
14
+
15
+
16
# Buffer sizes in bytes (they size a uint8 allocation in ``initialize``).
STAGING_BUFFER_SIZE = 256 << 20  # 256 MiB
PACKET_BUFFER_SIZE = 2 * (512 << 10)  # 2 x 512 KiB
18
+
19
+
20
class DataType(IntEnum):
    """Integer element-type codes handed to the compiled module's reductions."""

    # Values are consecutive starting at zero: BF16=0, FP16=1, FP32=2, FP64=3.
    BF16, FP16, FP32, FP64 = range(4)
25
+
26
+
27
def normalize_arch(arch: int):
    """Map an architecture number onto one of 700, 800, 900 or 1000.

    The result is the highest of the floors 100, 90, 80 that ``arch`` reaches,
    multiplied by ten; anything below 80 maps to 700.
    """
    for floor in (100, 90, 80):
        if arch >= floor:
            return floor * 10
    return 700
35
+
36
+
37
def buffer_type(t: torch.dtype):
    """Translate a torch floating-point dtype into its ``DataType`` code.

    Args:
        t: one of torch.float16, torch.bfloat16, torch.float32, torch.float64.

    Returns:
        The matching ``DataType`` member.

    Raises:
        AssertionError: for any other dtype. The error is raised explicitly
            rather than through an ``assert`` statement, because assert
            statements are removed under ``python -O`` and the function would
            then fall through and return None.
    """
    if t == torch.float16:
        return DataType.FP16
    if t == torch.bfloat16:
        return DataType.BF16
    if t == torch.float32:
        return DataType.FP32
    if t == torch.float64:
        return DataType.FP64
    raise AssertionError(f"invalid type: {t}")
47
+
48
+
49
def initialize(group, device: torch.device, arch: int, stream_ptr: int):
    """Compile/load the extension for ``arch`` and create a communication context.

    Args:
        group: process group passed to torch.distributed and to the rendezvous.
        device: device on which the symmetric buffers are allocated.
        arch: architecture number; forwarded to the JIT build, and normalized
            to choose the module name.
        stream_ptr: integer stream handle forwarded to the module's ``initialize``.

    Returns:
        A ``ContextHandle`` holding the module, its context and both symmetric
        allocations with their rendezvous handles.
    """
    import torch.distributed._symmetric_memory as sym_mem
    from . import jit
    from .bindings import purlin_bindings

    rank = torch.distributed.get_rank(group)
    world = torch.distributed.get_world_size(group)

    # The module name embeds the normalized architecture; the build itself
    # receives the raw ``arch`` value.
    module_name = "purlin_{}".format(normalize_arch(arch))
    source = purlin_bindings.substitute(mod_name=module_name)
    module = jit.get_compiled(arch, source, "purlin", module_name)

    # staging buffers
    staging_bytes = 2 * (STAGING_BUFFER_SIZE + (world * PACKET_BUFFER_SIZE))
    staging = sym_mem.empty(staging_bytes, dtype=torch.uint8, device=device)
    staging.zero_()

    # signal pads
    signals = sym_mem.empty(2 * world, dtype=torch.uint64, device=device)
    signals.zero_()

    staging_handle = sym_mem.rendezvous(staging, group)
    signal_handle = sym_mem.rendezvous(signals, group)

    context = module.initialize(
        rank, world, staging_handle.buffer_ptrs, signal_handle.buffer_ptrs, stream_ptr
    )
    return ContextHandle(module, context, staging, staging_handle, signals, signal_handle)
71
+
72
+
73
def finalize(handle: ContextHandle, stream_ptr: int):
    """Tear down the context held by ``handle``; a second call is a no-op.

    Calls the module's ``finalize`` with the stored context and ``stream_ptr``,
    then clears the context and buffer references on the handle. The ``mod``
    attribute is left in place.
    """
    if handle.ctx is not None:
        handle.mod.finalize(handle.ctx, stream_ptr)
        # Drop the context first, then the buffers and their handles.
        for attr in ("ctx", "hdl", "buf", "sig_buf", "sig_hdl"):
            setattr(handle, attr, None)
82
+
83
+
84
+ def all_gather(in_tensor: torch.Tensor, out_tensor: torch.Tensor, handle: ContextHandle, stream_ptr: int):
85
+ assert in_tensor.is_contiguous()
86
+ assert out_tensor.is_contiguous()
87
+ handle.mod.all_gather(in_tensor.data_ptr(), out_tensor.data_ptr(), in_tensor.nbytes, handle.ctx, stream_ptr)
88
+
89
+
90
def all_reduce(in_tensor: torch.Tensor, out_tensor: torch.Tensor, handle: ContextHandle, stream_ptr: int):
    """Invoke the module's ``all_reduce`` on two contiguous tensors.

    Besides the data pointers, the byte size of ``in_tensor``, the context and
    ``stream_ptr``, the module also receives the element-type code derived from
    ``in_tensor.dtype``. Both tensors must be contiguous.
    """
    for tensor in (in_tensor, out_tensor):
        assert tensor.is_contiguous()
    dtype_code = buffer_type(in_tensor.dtype)
    src_ptr = in_tensor.data_ptr()
    dst_ptr = out_tensor.data_ptr()
    handle.mod.all_reduce(src_ptr, dst_ptr, in_tensor.nbytes, dtype_code, handle.ctx, stream_ptr)
95
+
96
+
97
+ def all_to_all(in_tensor: torch.Tensor, out_tensor: torch.Tensor, handle: ContextHandle, stream_ptr: int):
98
+ assert in_tensor.is_contiguous()
99
+ assert out_tensor.is_contiguous()
100
+ handle.mod.all_to_all(in_tensor.data_ptr(), out_tensor.data_ptr(), in_tensor.nbytes, handle.ctx, stream_ptr)
101
+
102
+
103
def reduce_scatter(in_tensor: torch.Tensor, out_tensor: torch.Tensor, handle: ContextHandle, stream_ptr: int):
    """Invoke the module's ``reduce_scatter`` on two contiguous tensors.

    Besides the data pointers, the byte size of ``in_tensor``, the context and
    ``stream_ptr``, the module also receives the element-type code derived from
    ``in_tensor.dtype``. Both tensors must be contiguous.
    """
    for tensor in (in_tensor, out_tensor):
        assert tensor.is_contiguous()
    dtype_code = buffer_type(in_tensor.dtype)
    src_ptr = in_tensor.data_ptr()
    dst_ptr = out_tensor.data_ptr()
    handle.mod.reduce_scatter(src_ptr, dst_ptr, in_tensor.nbytes, dtype_code, handle.ctx, stream_ptr)
purlin/bindings.py ADDED
@@ -0,0 +1,161 @@
1
+ from string import Template
2
+ purlin_bindings = Template(r"""
3
+ //
4
+ // Created by Osayamen on 6/2/26.
5
+ //
6
+ #include <cstdint>
7
+ #include <cstdio>
8
+ #include <cuda_runtime.h>
9
+
10
+ #include <pybind11/pybind11.h>
11
+ #include <pybind11/stl.h>
12
+ #include <vector>
13
+
14
+ #include <purlin/host.cuh>
15
+
16
+ namespace py = pybind11;
17
+
18
+ static std::uintptr_t purlin_initialize(const int& rank,
19
+ const int& world,
20
+ const std::vector<std::uintptr_t>& staging_table,
21
+ const std::vector<std::uintptr_t>& signal_table,
22
+ const std::uintptr_t& stream_ptr) {
23
+ auto stream = reinterpret_cast<cudaStream_t>(stream_ptr);
24
+ // allocate pointer tables
25
+ void* stagingTR = nullptr;
26
+ void* stagingLR = nullptr;
27
+ void* signals = nullptr;
28
+ void* gatherSignals = nullptr;
29
+
30
+ CHECK_CUDA(cudaMallocAsync(&stagingTR, sizeof(cuda::std::byte*) * world, stream));
31
+ static_assert(sizeof(uintptr_t) == sizeof(cuda::std::byte*));
32
+ CHECK_CUDA(cudaMemcpyAsync(stagingTR, staging_table.data(), sizeof(uintptr_t) * world,
33
+ cudaMemcpyHostToDevice, stream));
34
+ CHECK_CUDA(cudaMallocAsync(&stagingLR, sizeof(cuda::std::byte*) * world, stream));
35
+ CHECK_CUDA(cudaMallocAsync(&signals, sizeof(uint64_t*) * world, stream));
36
+ CHECK_CUDA(cudaMemcpyAsync(signals, signal_table.data(), sizeof(uintptr_t) * world,
37
+ cudaMemcpyHostToDevice, stream))
38
+ CHECK_CUDA(cudaMallocAsync(&gatherSignals, sizeof(uint64_t*) * world, stream));
39
+
40
+ std::vector<uintptr_t> stagingStash(world);
41
+ constexpr auto offsetTR = 2 * purlin::STAGING_BUFFER_SIZE_;
42
+ for (int i = 0; i < world; i++) {
43
+ stagingStash[i] = reinterpret_cast<uintptr_t>(reinterpret_cast<cuda::std::byte*>(staging_table[i]) + offsetTR);
44
+ }
45
+ CHECK_CUDA(cudaMemcpyAsync(stagingLR, stagingStash.data(), sizeof(uintptr_t) * world,
46
+ cudaMemcpyHostToDevice, stream));
47
+ std::vector<uintptr_t> signalStash(world);
48
+ const auto offsetSig = world;
49
+ for (int i = 0; i < world; i++) {
50
+ signalStash[i] = reinterpret_cast<uintptr_t>(reinterpret_cast<uint64_t*>(signal_table[i]) + offsetSig);
51
+ }
52
+ CHECK_CUDA(cudaMemcpyAsync(gatherSignals, signalStash.data(), sizeof(uintptr_t) * world,
53
+ cudaMemcpyHostToDevice, stream));
54
+
55
+ const auto ctx = purlin::initialize(rank, world,
56
+ static_cast<cuda::std::byte**>(stagingLR),
57
+ static_cast<cuda::std::byte**>(stagingTR),
58
+ static_cast<uint64_t**>(signals),
59
+ static_cast<uint64_t**>(gatherSignals), stream);
60
+ CHECK_CUDA(cudaStreamSynchronize(stream));
61
+ auto* pyCtx = new purlin::Context(ctx);
62
+ return reinterpret_cast<uintptr_t>(pyCtx);
63
+ }
64
+
65
+ static void purlin_finalize(const uintptr_t& raw_ctx, const uintptr_t& stream_ptr) {
66
+ auto stream = reinterpret_cast<cudaStream_t>(stream_ptr);
67
+ const auto* ctx = reinterpret_cast<purlin::Context*>(raw_ctx);
68
+ if (!ctx) return;
69
+ purlin::finalize(*ctx, stream);
70
+ CHECK_CUDA(cudaFreeAsync(ctx->staging, stream));
71
+ CHECK_CUDA(cudaFreeAsync(ctx->stagingLR, stream));
72
+ CHECK_CUDA(cudaFreeAsync(ctx->signals, stream));
73
+ CHECK_CUDA(cudaFreeAsync(ctx->gatherSignals, stream));
74
+ CHECK_CUDA(cudaStreamSynchronize(stream));
75
+ delete ctx;
76
+ }
77
+
78
+ static void all_gather(const uintptr_t& src, const uintptr_t& dst, const size_t& bytes,
79
+ const uintptr_t& raw_ctx, const uintptr_t& stream_ptr) {
80
+ purlin::allGather(
81
+ reinterpret_cast<cuda::std::byte*>(src),
82
+ reinterpret_cast<cuda::std::byte*>(dst),
83
+ bytes, *reinterpret_cast<purlin::Context*>(raw_ctx), reinterpret_cast<cudaStream_t>(stream_ptr));
84
+ }
85
+
86
+ static void all_reduce(const uintptr_t& src, const uintptr_t& dst, const size_t& bytes,
87
+ const int& buffer_type, const uintptr_t& raw_ctx, const uintptr_t& stream_ptr) {
88
+ switch (buffer_type) {
89
+ case purlin::TensorType::fp16: {
90
+ purlin::allReduce<__half>(reinterpret_cast<cuda::std::byte*>(src),
91
+ reinterpret_cast<cuda::std::byte*>(dst),
92
+ bytes, *reinterpret_cast<purlin::Context*>(raw_ctx), reinterpret_cast<cudaStream_t>(stream_ptr));
93
+ }
94
+ break;
95
+ case purlin::TensorType::bf16: {
96
+ purlin::allReduce<__nv_bfloat16>(reinterpret_cast<cuda::std::byte*>(src),
97
+ reinterpret_cast<cuda::std::byte*>(dst),
98
+ bytes, *reinterpret_cast<purlin::Context*>(raw_ctx), reinterpret_cast<cudaStream_t>(stream_ptr));
99
+ }
100
+ break;
101
+ case purlin::TensorType::fp32: {
102
+ purlin::allReduce<float>(reinterpret_cast<cuda::std::byte*>(src),
103
+ reinterpret_cast<cuda::std::byte*>(dst),
104
+ bytes, *reinterpret_cast<purlin::Context*>(raw_ctx), reinterpret_cast<cudaStream_t>(stream_ptr));
105
+ }
106
+ break;
107
+ default: {
108
+ // fp64
109
+ purlin::allReduce<double>(reinterpret_cast<cuda::std::byte*>(src),
110
+ reinterpret_cast<cuda::std::byte*>(dst),
111
+ bytes, *reinterpret_cast<purlin::Context*>(raw_ctx), reinterpret_cast<cudaStream_t>(stream_ptr));
112
+ }
113
+ }
114
+ }
115
+
116
+ static void all_to_all(const uintptr_t& src, const uintptr_t& dst, const size_t& bytes,
117
+ const uintptr_t& raw_ctx, const uintptr_t& stream_ptr) {
118
+ purlin::all2all(
119
+ reinterpret_cast<cuda::std::byte*>(src),
120
+ reinterpret_cast<cuda::std::byte*>(dst),
121
+ bytes, *reinterpret_cast<purlin::Context*>(raw_ctx), reinterpret_cast<cudaStream_t>(stream_ptr));
122
+ }
123
+
124
+ static void reduce_scatter(const uintptr_t& src, const uintptr_t& dst, const size_t& bytes,
125
+ const int& buffer_type, const uintptr_t& raw_ctx, const uintptr_t& stream_ptr) {
126
+ const auto ctx = *reinterpret_cast<purlin::Context*>(raw_ctx);
127
+ auto stream = reinterpret_cast<cudaStream_t>(stream_ptr);
128
+ const auto localBytes = bytes / ctx.world_l;
129
+ switch (buffer_type) {
130
+ case purlin::TensorType::fp16: {
131
+ purlin::reduceScatter<__half>(reinterpret_cast<cuda::std::byte*>(src),
132
+ reinterpret_cast<cuda::std::byte*>(dst), localBytes, ctx, stream);
133
+ }
134
+ break;
135
+ case purlin::TensorType::bf16: {
136
+ purlin::reduceScatter<__nv_bfloat16>(reinterpret_cast<cuda::std::byte*>(src),
137
+ reinterpret_cast<cuda::std::byte*>(dst), localBytes, ctx, stream);
138
+ }
139
+ break;
140
+ case purlin::TensorType::fp32: {
141
+ purlin::reduceScatter<float>(reinterpret_cast<cuda::std::byte*>(src),
142
+ reinterpret_cast<cuda::std::byte*>(dst), localBytes, ctx, stream);
143
+ }
144
+ break;
145
+ default: {
146
+ // fp64
147
+ purlin::reduceScatter<double>(reinterpret_cast<cuda::std::byte*>(src),
148
+ reinterpret_cast<cuda::std::byte*>(dst), localBytes, ctx, stream);
149
+ }
150
+ }
151
+ }
152
+
153
+ PYBIND11_MODULE($mod_name, m) {
154
+ m.def("initialize", &purlin_initialize);
155
+ m.def("finalize", &purlin_finalize);
156
+ m.def("all_gather", &all_gather);
157
+ m.def("all_reduce", &all_reduce);
158
+ m.def("all_to_all", &all_to_all);
159
+ m.def("reduce_scatter", &reduce_scatter);
160
+ }
161
+ """)
purlin/jit.py ADDED
@@ -0,0 +1,160 @@
1
+ from __future__ import annotations
2
+
3
+ import threading
4
+ from pathlib import Path
5
+
6
def _verify_dirs() -> None:
    """Ensure a CMakeLists.txt sits in the same directory as this module.

    Raises:
        RuntimeError: if the file is missing.
    """
    cmake_lists = Path(__file__).resolve().parent / "CMakeLists.txt"
    if not cmake_lists.exists():
        raise RuntimeError("JIT CMakeLists.txt not found at package root")
12
+
13
def _load_ext(mod_name: str, so_path: Path):
    """Import the file at ``so_path`` as a module named ``mod_name``.

    Returns:
        The freshly executed module object.

    Raises:
        ImportError: if no import spec or loader can be derived from the path.
    """
    from importlib import util as importlib_util

    spec = importlib_util.spec_from_file_location(mod_name, so_path)
    loader = None if spec is None else spec.loader
    if loader is None:
        raise ImportError(f"Could not load {mod_name} from {so_path}")

    module = importlib_util.module_from_spec(spec)
    loader.exec_module(module)
    return module
21
+
22
def get_compiled(arch: int, src: str, mod_prefix: str, mod_name: str):
    """Return the extension module built from ``src``, compiling it on first use.

    Built artifacts are cached under ``$PURLIN_CACHE_DIR`` (default
    ``~/.cache/purlin_jit``) in a directory keyed by the module name, the
    target architecture, the Python major/minor version and the source text.
    A lock file in that directory elects a single builder; other callers poll
    until the final ``.so`` appears, and take over the build if the lock
    disappears without an artifact (i.e. the previous builder failed).

    Args:
        arch: architecture number passed to CMake as ``ARCH`` and
            ``CMAKE_CUDA_ARCHITECTURES``.
        src: complete source text of the bindings file to compile.
        mod_prefix: prefix of the generated source file name.
        mod_name: name of the resulting Python module / CMake target.

    Returns:
        The imported extension module.

    Raises:
        RuntimeError: if the package has no CMakeLists.txt, or the build
            finishes without producing a ``.so``.
        subprocess.CalledProcessError: if configuring or building fails.
        TimeoutError: if another process holds the build lock for too long.
    """
    import hashlib
    import os
    import shutil
    import socket
    import subprocess
    import sys
    import time

    _verify_dirs()

    cache = Path(os.environ.get("PURLIN_CACHE_DIR", str(Path.home() / ".cache" / "purlin_jit")))
    cache.mkdir(parents=True, exist_ok=True)

    # ``arch`` selects the CUDA target passed to CMake, so it must be part of
    # the key: otherwise two architectures that share a module name would also
    # share one cached binary.
    key_material = f"{mod_name}|arch{arch}|py{sys.version_info[:2]}|{src}"
    key = hashlib.sha256(key_material.encode()).hexdigest()[:16]

    build_root = cache / f"{key}"
    build_root.mkdir(parents=True, exist_ok=True)

    so_path = build_root / f"{mod_name}.so"
    lock_path = build_root / ".build.lock"

    # Fast path
    if so_path.exists():
        return _load_ext(mod_name, so_path)

    # Process-unique tag for temp dirs
    host = socket.gethostname()
    pid = os.getpid()
    tid = threading.get_ident()
    uniq = f"{host}_tid{tid}_pid{pid}"

    def _try_acquire_lock() -> bool:
        # O_EXCL makes creation fail if the lock file already exists.
        try:
            fd = os.open(lock_path, os.O_CREAT | os.O_EXCL | os.O_WRONLY)
            with os.fdopen(fd, "w") as f:
                f.write(f"host={host}\npid={pid}\ntid={tid}\ntime={time.time()}\n")
            return True
        except FileExistsError:
            return False

    def _lock_is_stale() -> bool:
        # A lock is stale when it is malformed, or when it was written on this
        # host by a process that no longer exists. Locks from other hosts
        # cannot be checked and are treated as live.
        try:
            fields = dict(
                line.split("=", 1)
                for line in lock_path.read_text().splitlines()
                if "=" in line
            )
        except OSError:
            return False

        lock_host = fields.get("host")
        lock_pid = fields.get("pid")
        if not lock_host or not lock_pid:
            return True
        if lock_host != host:
            return False
        try:
            os.kill(int(lock_pid), 0)  # signal 0: existence check only
        except ProcessLookupError:
            return True
        except (PermissionError, ValueError):
            return False
        return False

    def _release_lock() -> None:
        try:
            lock_path.unlink()
        except FileNotFoundError:
            pass

    def _build() -> None:
        # Scratch directories are created only by the process that builds.
        gen_dir = build_root / f"gen_{uniq}"
        bdir = build_root / f"build_{uniq}"
        gen_dir.mkdir(exist_ok=True)
        bdir.mkdir(exist_ok=True)

        generated = gen_dir / f"{mod_prefix}_bindings.cu"
        generated.write_text(src)

        cmake_source_dir = Path(__file__).resolve().parent

        subprocess.run([
            "cmake", "-S", str(cmake_source_dir), "-B", str(bdir), "-G", "Ninja",
            f"-DGENERATED_SRC={generated}",
            f"-DTARGET_MODULE_NAME={mod_name}",
            f"-DCMAKE_CUDA_ARCHITECTURES={arch}",
            f"-DCPM_SOURCE_CACHE={Path.home() / '.cache' / 'cpm'}",
            "-DCMAKE_BUILD_TYPE=Release",
            f"-DARCH={arch}"
        ], check=True)

        subprocess.run([
            "cmake", "--build", str(bdir), "--parallel"
        ], check=True)

        built = next(bdir.glob(mod_name + "*.so"), None)
        if built is None:
            raise RuntimeError(f"JIT build finished but produced no {mod_name}*.so in {bdir}")

        # Copy into a temp path in build_root, then atomically replace final path
        tmp_so = build_root / f".{mod_name}.{uniq}.tmp.so"
        shutil.copy2(built, tmp_so)
        tmp_so.replace(so_path)

    # Try to become the builder
    have_lock = _try_acquire_lock()

    if not have_lock and _lock_is_stale():
        _release_lock()
        have_lock = _try_acquire_lock()

    # Another process is building. Wait for the final .so to appear; if the
    # lock vanishes first (the builder failed), take over the build.
    timeout_s = 1800.0
    poll_s = 0.1
    start = time.time()
    while not have_lock:
        if so_path.exists():
            return _load_ext(mod_name, so_path)

        if time.time() - start > timeout_s:
            raise TimeoutError(
                f"Timed out waiting for JIT artifact {so_path} while another process was building it."
            )

        time.sleep(poll_s)
        have_lock = _try_acquire_lock()

    try:
        # Double-check after lock acquisition in case another process finished just before us
        if not so_path.exists():
            _build()
    finally:
        _release_lock()

    return _load_ext(mod_name, so_path)
@@ -0,0 +1,17 @@
1
+ Metadata-Version: 2.4
2
+ Name: purlin
3
+ Version: 0.1.0
4
+ Summary: Adaptable, template-based substrate for high-performance inter-GPU communication
5
+ Author-email: Osayamen Jonathan Aimuyo <osayamen@stanford.edu>
6
+ License-Expression: BSD-3-Clause
7
+ Project-URL: Homepage, https://github.com/osayamenja/purlin
8
+ Project-URL: Repository, https://github.com/osayamenja/purlin
9
+ Requires-Python: >=3.10
10
+ Description-Content-Type: text/markdown
11
+ License-File: LICENSE
12
+ Requires-Dist: torch>=2.8.0
13
+ Dynamic: license-file
14
+
15
+ # Purlin
16
+ Library that provides composable primitives for constructing high-performance and _adaptable_ device-initiated
17
+ communication operations.
@@ -0,0 +1,10 @@
1
+ purlin/CMakeLists.txt,sha256=Zsy2nNRByAfg2vsYM9-DsdHSHcBKMFO0vcipWlhiF6E,2884
2
+ purlin/CPM.cmake,sha256=eZFDu5wfvRTcmlE-r_A-4acJxu4W8yRWtQRD1m3Xr_o,961
3
+ purlin/__init__.py,sha256=1Sg9NsRYqW5nIRH5A_0VMZdIrhWy6e6-0IaDA53gY2U,3532
4
+ purlin/bindings.py,sha256=LCE4k4rYgjxiawPbQ3ekU18bqq1bZ-C4lKDi01-bPA0,6522
5
+ purlin/jit.py,sha256=rejsh8_0VGe6PzekzLUG-i-oQzEolIqDxu7rs8KR1MI,4913
6
+ purlin-0.1.0.dist-info/licenses/LICENSE,sha256=yEPBv9wFy-gTwrwew31PkGJu_UwaIpo44E83tsM6hkA,1510
7
+ purlin-0.1.0.dist-info/METADATA,sha256=6wXnrIrTE60z60D47hdFksiNYtcTFvnPvz0AMFFn00Q,642
8
+ purlin-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
9
+ purlin-0.1.0.dist-info/top_level.txt,sha256=RWTjmy96Ezv7R9SugklojuYjwkyYrYKrSmk__yVXNjQ,7
10
+ purlin-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,28 @@
1
+ BSD 3-Clause License
2
+
3
+ Copyright (c) 2026, Osayamen Jonathan Aimuyo
4
+
5
+ Redistribution and use in source and binary forms, with or without
6
+ modification, are permitted provided that the following conditions are met:
7
+
8
+ 1. Redistributions of source code must retain the above copyright notice, this
9
+ list of conditions and the following disclaimer.
10
+
11
+ 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ this list of conditions and the following disclaimer in the documentation
13
+ and/or other materials provided with the distribution.
14
+
15
+ 3. Neither the name of the copyright holder nor the names of its
16
+ contributors may be used to endorse or promote products derived from
17
+ this software without specific prior written permission.
18
+
19
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE
@@ -0,0 +1 @@
1
+ purlin