buildz-gpu 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- buildz_gpu-0.1.0/MANIFEST.in +2 -0
- buildz_gpu-0.1.0/PKG-INFO +21 -0
- buildz_gpu-0.1.0/README.md +9 -0
- buildz_gpu-0.1.0/buildz/gpu/az/__init__.py +13 -0
- buildz_gpu-0.1.0/buildz/gpu/az/attrn.py +42 -0
- buildz_gpu-0.1.0/buildz/gpu/az/conv.py +50 -0
- buildz_gpu-0.1.0/buildz/gpu/az/ln.py +32 -0
- buildz_gpu-0.1.0/buildz/gpu/az/nets.py +25 -0
- buildz_gpu-0.1.0/buildz/gpu/az/nreshape.py +12 -0
- buildz_gpu-0.1.0/buildz/gpu/az/reshape.py +16 -0
- buildz_gpu-0.1.0/buildz/gpu/az/ups.py +40 -0
- buildz_gpu-0.1.0/buildz/gpu/az/util.py +45 -0
- buildz_gpu-0.1.0/buildz/gpu/demo/az.js +22 -0
- buildz_gpu-0.1.0/buildz/gpu/demo/az.py +72 -0
- buildz_gpu-0.1.0/buildz/gpu/demo/cal_cpu.py +44 -0
- buildz_gpu-0.1.0/buildz/gpu/demo/cal_gpu.py +44 -0
- buildz_gpu-0.1.0/buildz/gpu/demo/trans.py +34 -0
- buildz_gpu-0.1.0/buildz/gpu/demo/trans2.py +32 -0
- buildz_gpu-0.1.0/buildz/gpu/test/demo.py +168 -0
- buildz_gpu-0.1.0/buildz/gpu/test/test_recal.py +109 -0
- buildz_gpu-0.1.0/buildz/gpu/torch/__init__.py +9 -0
- buildz_gpu-0.1.0/buildz/gpu/torch/dv.py +121 -0
- buildz_gpu-0.1.0/buildz/gpu/torch/middle_base.py +102 -0
- buildz_gpu-0.1.0/buildz/gpu/torch/middle_cache.py +184 -0
- buildz_gpu-0.1.0/buildz/gpu/torch/recal.py +124 -0
- buildz_gpu-0.1.0/buildz/none +0 -0
- buildz_gpu-0.1.0/buildz_gpu.egg-info/PKG-INFO +21 -0
- buildz_gpu-0.1.0/buildz_gpu.egg-info/SOURCES.txt +31 -0
- buildz_gpu-0.1.0/buildz_gpu.egg-info/dependency_links.txt +1 -0
- buildz_gpu-0.1.0/buildz_gpu.egg-info/requires.txt +1 -0
- buildz_gpu-0.1.0/buildz_gpu.egg-info/top_level.txt +1 -0
- buildz_gpu-0.1.0/setup.cfg +4 -0
- buildz_gpu-0.1.0/setup.py +23 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: buildz_gpu
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: 用python写的gpu模型训练相关工具, buildz包的一部分(buildz.gpu)
|
|
5
|
+
Home-page: https://github.com/buildCodeZ/buildz
|
|
6
|
+
Author: Zzz
|
|
7
|
+
Author-email: 1309458652@qq.com
|
|
8
|
+
License: Apache License 2.0
|
|
9
|
+
Keywords: buildz
|
|
10
|
+
Platform: any
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
|
|
13
|
+
# buildz
|
|
14
|
+
声明:
|
|
15
|
+
禁止将本项目代码用于ai训练
|
|
16
|
+
declaration:
|
|
17
|
+
Codes of this project are not allowed to be used for AI training or any other form of machine learning processes.
|
|
18
|
+
|
|
19
|
+
```
|
|
20
|
+
用python写的gpu模型训练相关工具
|
|
21
|
+
```
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
|
|
2
|
+
from buildz.base import Base
|
|
3
|
+
from .util import *
|
|
4
|
+
class MultiAttrn(Base):
    """Analytic cost model of a multi-head attention layer (no real weights).

    Reports parameter size, activation-cache size and FLOP counts for a
    [batch, seq, din] input.  Assumes `Base` routes __init__ to init() --
    TODO confirm against buildz.base.
    """
    def init(self, din, dout, num_heads, bias=True):
        # din/dout: input/output feature widths; dout must be divisible by num_heads.
        self.din=din
        self.dout=dout
        self.num_heads=num_heads
        self.dout_per_head = dout//num_heads
        self.bias=bias
    def fetch(self, shape):
        # Normalize shape to 3 dims [batch, seq, din], left-padding with 1s.
        if type(shape)==int:
            shape = [shape]
        shape = list(shape)
        while len(shape)<3:
            shape = [1]+shape
        din = shape[2]
        seq_n = shape[1]
        batch = shape[0]
        return batch, seq_n, din
    def cache(self, shape, unit=1):
        # Activation elements kept for backward: Q/K/V tensors (3*b*s*din)
        # plus two seq x seq attention maps per head; `unit` = bytes/element.
        batch, seq_n, din = self.fetch(shape)
        n_input = 3*batch*seq_n*din
        n_w = 2*batch*self.num_heads*seq_n*seq_n
        n = n_input+n_w
        return n*unit
    def cal(self, shape):
        # Forward FLOPs: three input projections (6*b*s*din*dout), a
        # seq x seq matmul term, and the output projection (2*b*s*dout^2).
        # NOTE(review): only one 2*b*h*s*s*d_head term is counted; QK^T and
        # attn@V would normally contribute one each -- confirm intended.
        batch, seq_n, din = self.fetch(shape)
        n = 6*batch*seq_n*din*self.dout+2*batch*self.num_heads*seq_n*seq_n*self.dout_per_head+2*batch*seq_n*self.dout*self.dout
        return n
    def backcal(self, shape):
        # Backward pass approximated as twice the forward FLOPs.
        return 2*self.cal(shape)
    def size(self, unit=1):
        # Parameter count: K/Q/V projection matrices (+bias when enabled,
        # `self.bias` being a bool used as 0/1) and the output projection.
        kqv = 3*(self.din*self.dout+self.dout*self.bias)
        out = self.dout*self.dout+self.dout*self.bias
        n = kqv+out
        return n*unit
    def out(self, shape):
        # Output shape mirrors the input with the feature dim replaced by dout.
        batch, seq_n, din = self.fetch(shape)
        return [batch, seq_n, self.dout]

pass
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
|
|
2
|
+
from buildz.base import Base
|
|
3
|
+
from .util import *
|
|
4
|
+
class Conv(Base):
    """Analytic cost model of an N-d convolution over
    [batch, channel, dim1, dim2, ...].

    Reports parameter size, activation-cache size and FLOP counts; it does
    not perform a real convolution.
    """
    def init(self, dims, ch_in, ch_out, kernel, bias=True, stride=1, padding=0):
        # dims: number of spatial dimensions; kernel/stride/padding may be
        # scalars (broadcast to every spatial dim via exp()) or sequences.
        self.ch_in = ch_in
        self.ch_out = ch_out
        self.dims = dims
        self.bias=bias
        self.kernel = exp(kernel, dims)
        self.stride=exp(stride, dims)
        self.padding=exp(padding, dims)
    def fetch(self, shape):
        # Normalize shape to dims+2 entries, left-padding with 1s, and split
        # it into (batch, channels, spatial sizes).
        if type(shape)==int:
            shape = [shape]
        shape = list(shape)
        while len(shape)<self.dims+2:
            shape = [1]+shape
        batch = shape[0]
        ch = shape[1]
        return batch, ch, shape[2:]
    def out(self, shape):
        # Standard conv arithmetic: floor((in - k + 2p)/s) + 1 per spatial dim.
        batch, ch, szs = self.fetch(shape)
        outs = [batch, self.ch_out]
        for kernel_size, stride, padding, dim_in in zip(self.kernel, self.stride, self.padding, szs):
            val = int((dim_in - kernel_size + 2*padding)/stride)+1
            outs.append(val)
        return outs
    def cache(self, shape, unit=1):
        # Input activations plus the weight tensor, `unit` bytes per element.
        # Fixed: use the normalized spatial sizes returned by fetch() instead
        # of raw shape[2:], so under-ranked shapes are handled consistently.
        batch, ch, szs = self.fetch(shape)
        n_input = batch*ch*mul(szs)
        n_w = self.ch_out*self.ch_in*mul(self.kernel)
        n = n_input+n_w
        return n*unit
    def cal(self, shape):
        # Forward FLOPs: 2 * batch * output-spatial * ch_out * ch_in * kernel
        # (2 FLOPs per multiply-accumulate).
        # Fixed: the original multiplied by the *input* spatial sizes and
        # dropped the batch factor (Linear.cal includes batch).
        out = self.out(shape)
        batch = out[0]
        out_szs = out[2:]
        n = batch*mul(out_szs)*self.ch_out*self.ch_in*mul(self.kernel)*2
        return n
    def backcal(self, shape):
        # Backward approximated as twice the forward cost.  Parameter renamed
        # from the misleading `batch=1` to `shape` for consistency with the
        # sibling layers; callers (e.g. Nets.call) pass it positionally.
        return 2*self.cal(shape)
    def size(self, unit=1):
        # Weight tensor plus optional per-channel bias (bool used as 0/1).
        n = self.ch_in*self.ch_out*mul(self.kernel)+self.bias*self.ch_out
        return n*unit
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
|
|
2
|
+
from buildz.base import Base
|
|
3
|
+
from .util import *
|
|
4
|
+
class Linear(Base):
    """Analytic cost model of a fully-connected layer (din -> dout)."""
    def init(self, din, dout, bias=True):
        self.din=din
        self.dout=dout
        self.bias=bias
    def fetch(self, shape):
        # Collapse all leading dims into one row count; return (rows, last_dim).
        dims = [shape] if type(shape) is int else list(shape)
        if len(dims) < 2:
            dims = [1] + dims
        return mul(dims[:-1]), dims[-1]
    def cache(self, shape, unit=1):
        # Input activations plus the weight matrix, `unit` bytes per element.
        rows, _ = self.fetch(shape)
        return (rows*self.din + self.din*self.dout)*unit
    def cal(self, shape):
        # Forward FLOPs for one matmul: 2 FLOPs per multiply-accumulate.
        rows, _ = self.fetch(shape)
        return 2*rows*self.din*self.dout
    def backcal(self, shape):
        # Backward pass approximated as twice the forward cost.
        return self.cal(shape)*2
    def size(self, unit=1):
        # Weight matrix plus optional bias vector (bool used as 0/1).
        params = self.din*self.dout + self.bias*self.dout
        return params*unit
    def out(self, shape):
        rows, _ = self.fetch(shape)
        return [rows, self.dout]
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
|
|
2
|
+
from buildz.base import Base
|
|
3
|
+
from .util import *
|
|
4
|
+
class Nets(Base):
    """Sequential container of cost-model layers.

    Each metric is summed over the layers while the shape is threaded
    through every layer's out().
    """
    def init(self, *nets):
        self.nets = nets
    def call(self, fc, shape, *a, **b):
        # Invoke method `fc` on every layer, feeding each layer the output
        # shape of the previous one, and total the results.
        total = 0
        for layer in self.nets:
            total += getattr(layer, fc)(shape, *a, **b)
            shape = layer.out(shape)
        return total
    def cache(self, shape, unit=1):
        return self.call("cache", shape, unit)
    def cal(self, shape):
        return self.call("cal", shape)
    def backcal(self, shape):
        return self.call("backcal", shape)
    def size(self, unit=1):
        # Parameter sizes do not depend on the input shape, so no threading.
        return sum(layer.size(unit) for layer in self.nets)
    def out(self, shape):
        for layer in self.nets:
            shape = layer.out(shape)
        return shape
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
|
|
2
|
+
from torch import nn
|
|
3
|
+
class ReshapeModule(nn.Module):
    """nn.Module that reshapes its input to a fixed target shape."""
    def __init__(self, shape):
        super().__init__()
        self.shape = shape
    def forward(self, inputs):
        # Tensor.reshape returns a view when possible, a copy otherwise.
        return inputs.reshape(self.shape)
|
|
10
|
+
|
|
11
|
+
def reshape(shape):
    """Factory returning a ReshapeModule bound to *shape* (functional-style helper)."""
    return ReshapeModule(shape)
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
|
|
2
|
+
from buildz.base import Base
|
|
3
|
+
from .util import *
|
|
4
|
+
class Reshape(Base):
    """Cost-model placeholder for a reshape.

    A reshape has no parameters and (approximately) no compute or cache
    cost, so every metric reports 0; only the output shape changes.
    """
    def init(self, shape):
        # Target shape returned by out() regardless of the input shape.
        self.shape = shape
    def cache(self, shape, unit=1):
        return 0
    def cal(self, shape):
        return 0
    def backcal(self, shape):
        return 0
    def size(self, unit=1):
        return 0
    def out(self, shape):
        return self.shape
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
|
|
2
|
+
from buildz.base import Base
|
|
3
|
+
from .util import *
|
|
4
|
+
class Upsample(Base):
    """Analytic cost model of an upsampling layer over [batch, ch, dim1, ...]."""
    def init(self, dims, size=None, scale_factor=None, mode='nearest'):
        # One of `size` (absolute output sizes) or `scale_factor` must be
        # given, mirroring torch.nn.Upsample.
        self.dims = dims
        assert size is not None or scale_factor is not None
        self.size= exp(size, dims)
        self.scale_factor=exp(scale_factor, dims)
        self.mode=mode
    def fetch(self, shape):
        # Normalize to dims+2 entries [batch, ch, spatial...], left-padded with 1s.
        if type(shape)==int:
            shape = [shape]
        shape = list(shape)
        while len(shape)<self.dims+2:
            shape = [1]+shape
        batch = shape[0]
        ch = shape[1]
        return batch, ch, shape[2:]
    def cache(self, shape, unit=1):
        # Nearest-neighbour keeps nothing for backward; other modes keep the
        # input tensor.  NOTE(review): uses the raw shape rather than the
        # normalized one from fetch() -- confirm callers always pass full rank.
        if self.mode=='nearest':
            return 0
        n = mul(shape)
        return n*unit
    def cal(self, shape):
        # Rough FLOP estimate: ~4 ops per output element (interpolation).
        oshape = self.out(shape)
        batch, ch, oszs = self.fetch(oshape)
        n = 4*batch*ch*mul(oszs)
        return n
    def backcal(self, shape):
        # Fixed: the original returned self.cal(batch) where `batch` is an
        # undefined name (NameError).  Backward approximated as forward cost.
        return self.cal(shape)
    def size(self, unit=1):
        # Upsampling has no learnable parameters.
        return 0
    def out(self, shape):
        batch, ch, szs = self.fetch(shape)
        if self.size:
            osz = list(self.size)
        else:
            # Fixed: the original zipped over self.size (None on this branch,
            # a TypeError) and divided by the factor; upsampling multiplies
            # the input spatial size by scale_factor (as torch.nn.Upsample).
            osz = [int(sz*sf) for sz, sf in zip(szs, self.scale_factor)]
        return [batch, ch]+osz
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
|
|
2
|
+
|
|
3
|
+
def mul(arr):
    """Return the product of a list/tuple's entries; scalars pass through.

    An empty sequence yields 1 (the multiplicative identity).
    """
    if type(arr) not in (list, tuple):
        return arr
    product = 1
    for factor in arr:
        product *= factor
    return product
|
|
10
|
+
|
|
11
|
+
def exp(val, dims):
    """Broadcast a scalar to a list of length *dims*.

    None passes through as None; lists/tuples are returned unchanged.
    """
    if val is None:
        return None
    return val if type(val) in (list, tuple) else [val]*dims
|
17
|
+
|
|
18
|
+
def format_size(n, unit=1024):
    """Render *n* as a human-readable string such as '1.500K' or '2.000G'."""
    suffixes = ["", "K", "M", "G", "T", "P", "E", "Z", "Y"]
    idx = 0
    # Divide down until n fits under one unit or we run out of suffixes.
    while n >= unit and idx < len(suffixes) - 1:
        n = n / unit
        idx += 1
    return "%.3f%s" % (n, suffixes[idx])
|
|
26
|
+
|
|
27
|
+
def nsize(fmt, unit=1024):
    """Parse a size string such as '1.5K' or '10' back into a number.

    Inverse of format_size for suffixed strings; an unknown trailing
    character is treated as part of the number itself.
    """
    text = fmt.strip()
    scale = {}
    factor = 1
    for suffix in ["", "K", "M", "G", "T", "P", "E", "Z", "Y"]:
        scale[suffix] = factor
        factor = factor * unit
    tail = text[-1]
    if tail not in scale:
        tail = ""
    return float(text[:len(text)-len(tail)]) * scale[tail]
|
|
40
|
+
|
|
41
|
+
pass
# Short aliases for the helpers above, used by the demo scripts.
ns=nsize

fmt_sz = format_size
fmt_size=format_size
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
|
|
2
|
+
cal:{
|
|
3
|
+
// theoretical peak values
|
|
4
|
+
// RTX4060: 15T
|
|
5
|
+
// i7-13700H: 657G
|
|
6
|
+
// measured values
|
|
7
|
+
RTX4060: 12T
|
|
8
|
+
gpu: 12T
|
|
9
|
+
i7-13700H: 500G
|
|
10
|
+
cpu: 500G
|
|
11
|
+
}
|
|
12
|
+
trans: {
|
|
13
|
+
gpu: 272G
|
|
14
|
+
// PCIE4 x8: 2G*8, theoretical value
|
|
15
|
+
// gpu_mem: 16G
|
|
16
|
+
// achievable value; requires the pin_memory=True flag (e.g. torch.randn(..., pin_memory=True)) so the host allocation is contiguous/page-locked
|
|
17
|
+
//
|
|
18
|
+
gpu2mem: 10G
|
|
19
|
+
// real training data is scattered across host memory, so copying it to GPU memory is slower
|
|
20
|
+
gpu2mem: 1.6G
|
|
21
|
+
wifi6: 1.2G
|
|
22
|
+
}
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
"""Estimate data-transfer and compute time for a toy conv+attention stack.

Reads measured hardware speeds from az.js and prints how long data
transfer, weight transfer, activation caching and compute should take.
"""
from buildz.gpuz.az import *
# from buildz.gpuz.az.nreshape import reshape

from buildz import xf, fz

import os
dp = os.path.dirname(__file__)
fp = os.path.join(dp, "az.js")
conf = xf.loadf(fp)
# Hardware characteristics (FLOP/s, host->GPU bandwidth) with fallbacks.
speed = conf.get("cal", {})
speed_gpu = nsize(speed.get("gpu", "10T"))
speed_cpu = nsize(speed.get("cpu", "500G"))
trans = conf.get("trans", {})
trans_mem = nsize(trans.get("gpu2mem", "1G"))

def unit(din):
    # One attention block: multi-head attention followed by a linear layer.
    nets = []
    nets.append(MultiAttrn(din, din, 8))
    nets.append(Linear(din,din))
    return nets

def conv_unit(ksize, ch_in, ch_out, reshape):
    # One conv block: reshape to [batch, ch, w, h], 2-d same-padded conv,
    # reshape back, then a linear layer.  NOTE(review): the linear layer
    # uses the module-level `din`, independent of the conv output -- confirm
    # intended.
    nets = []
    nets.append(Reshape(reshape))
    # Fixed: the original wrote `type(ksize)!={list, tuple}` which compares a
    # type object to a set and is always True, double-wrapping list kernels.
    if type(ksize) not in (list, tuple):
        ksize = [ksize, ksize]
    nets.append(Conv(2, ch_in, ch_out, ksize, 1, 1, [k//2 for k in ksize]))
    batch, ch, w, h = reshape
    outshape = [batch, ch_out, w, h]
    nets.append(Reshape(outshape))
    nets.append(Linear(din,din))
    return nets

din =4096
batch=1
seq_n=1024
az_nets = []
for i in range(10):
    az_nets+=conv_unit(3, 4, 4, [batch, 4, 32, 32])
    az_nets+=unit(din)

az_nets = Nets(*az_nets)
data_shape = [batch, din]

# 4 bytes per float32 element throughout.
data_size = mul(data_shape)*4
data_trans = data_size/trans_mem
print(f"data: {fmt_sz(data_size)}, data_trans: {data_trans:.3f} sec")
net_size = az_nets.size(4)
net_trans = net_size/trans_mem
print(f"net size: {fmt_sz(net_size)}, net_trans: {net_trans:.3f} sec")
cache_size = az_nets.cache(data_shape,4)
cache_trans = cache_size/trans_mem
print(f"cache: {fmt_sz(cache_size)}, cache_trans: {cache_trans:.3f} sec")
cal_amount = az_nets.cal(data_shape)
cal_gpu = cal_amount/speed_gpu
cal_cpu = cal_amount/speed_cpu
print(f"cal: {fmt_sz(cal_amount)}, cal_gpu: {cal_gpu:.3f} sec, cal_cpu: {cal_cpu:.3f} sec")
print("done")
"""
python -m buildz.gpuz.demo.az
"""
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
from buildz.gpuz.az import *


from buildz import xf, fz

from torch import nn
import torch
# Benchmark: forward a stack of 5 large Linear layers on the CPU and compare
# the achieved FLOP/s against the analytic cost model (Nets/Linear from az).
din =4096*2
dout = 4096*2
batch=10240
az_nets = []
nets = []
for i in range(5):
    az_nets.append(Linear(din,dout))
    nets.append(nn.Linear(din,dout))
az_nets = Nets(*az_nets)
nets = nn.Sequential(*nets)
data = torch.randn(batch, din)
print(f"data: {fmt_sz(batch*din)}")
print(f"net size: {fmt_sz(az_nets.size(4))}")
print(f"cache: {fmt_sz(az_nets.cache(data.size(),4))}")
print(f"cal: {fmt_sz(az_nets.cal(data.size()))}")
print("done")
import time
# CPU variant: the .cuda() moves stay commented out (see cal_gpu.py).
# nets=nets.cuda()
# data=data.cuda()
loop=5
out = data
curr=time.time()
with torch.no_grad():
    for i in range(loop):
        out = nets(out)
sec = time.time()-curr
print(f"time cost: {sec}/{loop}")
assert sec>0
# Achieved FLOP/s = model-predicted FLOPs * loops / wall time.
speed = az_nets.cal(data.size())*loop/(sec+1e-10)
print(f"speed: {fmt_sz(speed)}")

"""
python -m buildz.gpuz.demo.test_cal
"""
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
from buildz.gpuz.az import *


from buildz import xf, fz

from torch import nn
import torch
# Benchmark: forward a stack of 5 large Linear layers on the GPU and compare
# the achieved FLOP/s against the analytic cost model (Nets/Linear from az).
din =4096*2
dout = 4096*2
batch=10240
az_nets = []
nets = []
for i in range(5):
    az_nets.append(Linear(din,dout))
    nets.append(nn.Linear(din,dout))
az_nets = Nets(*az_nets)
nets = nn.Sequential(*nets)
data = torch.randn(batch, din)
print(f"data: {fmt_sz(batch*din)}")
print(f"net size: {fmt_sz(az_nets.size(4))}")
print(f"cache: {fmt_sz(az_nets.cache(data.size(),4))}")
print(f"cal: {fmt_sz(az_nets.cal(data.size()))}")
print("done")
import time
nets=nets.cuda()
data=data.cuda()
loop=200
out = data
curr=time.time()
with torch.no_grad():
    for i in range(loop):
        out = nets(out)
sec = time.time()-curr
print(f"time cost: {sec}/{loop}")
assert sec>0
# Achieved FLOP/s = model-predicted FLOPs * loops / wall time.
# NOTE(review): CUDA kernel launches are asynchronous; without a
# torch.cuda.synchronize() before reading the clock this may
# under-measure the elapsed time -- confirm.
speed = az_nets.cal(data.size())*loop/(sec+1e-10)
print(f"speed: {fmt_sz(speed)}")

"""
python -m buildz.gpuz.demo.test_cal
"""
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
from buildz.gpuz.az import *


from buildz import xf, fz

from torch import nn
import torch
# Benchmark host->GPU transfer bandwidth using pinned (page-locked) buffers,
# one fresh buffer per iteration so nothing is cached on the device.
din =4096*4
batch=10240
loop=6
# data = torch.randn(batch, din, pin_memory=True)  # requires_grad=False
datas = [torch.randn(batch, din, pin_memory=True) for i in range(loop)]
n = batch*din
# 4 bytes per float32 element.
size = n*4
print(f"data: {fmt_sz(batch*din*loop)}")
import time
curr=time.time()
for i in range(loop):
    data = datas[i]
    data=data.cuda()
    # Drop the device copy immediately; only the upload is being timed.
    del data
    # data=data.cpu()
sec = time.time()-curr
print(f"time cost: {sec}/{loop}")
assert sec>0
# Bytes moved per second across all iterations.
speed = size*loop/(sec+1e-10)
print(f"speed: {fmt_sz(speed)}")

"""
python -m buildz.gpuz.demo.trans
"""
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
from buildz.gpuz.az import *


from buildz import xf, fz

from torch import nn
import torch
# Benchmark round-trip transfer bandwidth: the same pinned buffer is moved
# host -> GPU -> host (made contiguous) on every iteration.
din =4096*4
batch=10240
loop=10
data = torch.randn(batch, din, pin_memory=True)
n = batch*din
# 4 bytes per float32 element.
size = n*4
print(f"data: {fmt_sz(batch*din*loop)}")
import time
curr=time.time()
for i in range(loop):
    data = data
    data=data.cuda()
    data=data.cpu().contiguous()
sec = time.time()-curr
print(f"time cost: {sec}/{loop}")
assert sec>0
# Bytes moved per second (upload only) across all iterations.
speed = size*loop/(sec+1e-10)
print(f"speed: {fmt_sz(speed)}")

"""
python -m buildz.gpuz.demo.trans2
"""
|