buildz-gpu 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. buildz_gpu-0.1.0/MANIFEST.in +2 -0
  2. buildz_gpu-0.1.0/PKG-INFO +21 -0
  3. buildz_gpu-0.1.0/README.md +9 -0
  4. buildz_gpu-0.1.0/buildz/gpu/az/__init__.py +13 -0
  5. buildz_gpu-0.1.0/buildz/gpu/az/attrn.py +42 -0
  6. buildz_gpu-0.1.0/buildz/gpu/az/conv.py +50 -0
  7. buildz_gpu-0.1.0/buildz/gpu/az/ln.py +32 -0
  8. buildz_gpu-0.1.0/buildz/gpu/az/nets.py +25 -0
  9. buildz_gpu-0.1.0/buildz/gpu/az/nreshape.py +12 -0
  10. buildz_gpu-0.1.0/buildz/gpu/az/reshape.py +16 -0
  11. buildz_gpu-0.1.0/buildz/gpu/az/ups.py +40 -0
  12. buildz_gpu-0.1.0/buildz/gpu/az/util.py +45 -0
  13. buildz_gpu-0.1.0/buildz/gpu/demo/az.js +22 -0
  14. buildz_gpu-0.1.0/buildz/gpu/demo/az.py +72 -0
  15. buildz_gpu-0.1.0/buildz/gpu/demo/cal_cpu.py +44 -0
  16. buildz_gpu-0.1.0/buildz/gpu/demo/cal_gpu.py +44 -0
  17. buildz_gpu-0.1.0/buildz/gpu/demo/trans.py +34 -0
  18. buildz_gpu-0.1.0/buildz/gpu/demo/trans2.py +32 -0
  19. buildz_gpu-0.1.0/buildz/gpu/test/demo.py +168 -0
  20. buildz_gpu-0.1.0/buildz/gpu/test/test_recal.py +109 -0
  21. buildz_gpu-0.1.0/buildz/gpu/torch/__init__.py +9 -0
  22. buildz_gpu-0.1.0/buildz/gpu/torch/dv.py +121 -0
  23. buildz_gpu-0.1.0/buildz/gpu/torch/middle_base.py +102 -0
  24. buildz_gpu-0.1.0/buildz/gpu/torch/middle_cache.py +184 -0
  25. buildz_gpu-0.1.0/buildz/gpu/torch/recal.py +124 -0
  26. buildz_gpu-0.1.0/buildz/none +0 -0
  27. buildz_gpu-0.1.0/buildz_gpu.egg-info/PKG-INFO +21 -0
  28. buildz_gpu-0.1.0/buildz_gpu.egg-info/SOURCES.txt +31 -0
  29. buildz_gpu-0.1.0/buildz_gpu.egg-info/dependency_links.txt +1 -0
  30. buildz_gpu-0.1.0/buildz_gpu.egg-info/requires.txt +1 -0
  31. buildz_gpu-0.1.0/buildz_gpu.egg-info/top_level.txt +1 -0
  32. buildz_gpu-0.1.0/setup.cfg +4 -0
  33. buildz_gpu-0.1.0/setup.py +23 -0
@@ -0,0 +1,2 @@
1
+ include README.md
2
+ recursive-include buildz *
@@ -0,0 +1,21 @@
1
+ Metadata-Version: 2.1
2
+ Name: buildz_gpu
3
+ Version: 0.1.0
4
+ Summary: 用python写的gpu模型训练相关工具, buildz包的一部分(buildz.gpu)
5
+ Home-page: https://github.com/buildCodeZ/buildz
6
+ Author: Zzz
7
+ Author-email: 1309458652@qq.com
8
+ License: Apache License 2.0
9
+ Keywords: buildz
10
+ Platform: any
11
+ Description-Content-Type: text/markdown
12
+
13
+ # buildz
14
+ 声明:
15
+ 禁止将本项目代码用于ai训练
16
+ declaration:
17
+ Codes of this project are not allowed to be used for AI training or any other form of machine learning processes.
18
+
19
+ ```
20
+ 用python写的gpu模型训练相关工具
21
+ ```
@@ -0,0 +1,9 @@
1
+ # buildz
2
+ 声明:
3
+ 禁止将本项目代码用于ai训练
4
+ declaration:
5
+ Codes of this project are not allowed to be used for AI training or any other form of machine learning processes.
6
+
7
+ ```
8
+ 用python写的gpu模型训练相关工具
9
+ ```
@@ -0,0 +1,13 @@
1
# Public API of buildz.gpu.az: analytical cost models for common layers.

from .attrn import MultiAttrn
from .conv import Conv
from .ups import Upsample
from .ln import Linear
from .reshape import Reshape
from .nets import Nets
from .util import *

__doc__="""
Analysis tools: analytical size / FLOP / activation-cache models.
"""
@@ -0,0 +1,42 @@
1
+
2
from buildz.base import Base
from .util import *
class MultiAttrn(Base):
    """Analytical cost model for one multi-head attention layer.

    Estimates parameter count, activation-cache size and FLOPs for an
    attention block mapping `din` -> `dout` with `num_heads` heads.
    NOTE(review): assumes `Base` routes __init__ args to `init()` --
    confirm against buildz.base.
    """
    def init(self, din, dout, num_heads, bias=True):
        self.din=din
        self.dout=dout
        self.num_heads=num_heads
        # Per-head width; assumes dout is divisible by num_heads.
        self.dout_per_head = dout//num_heads
        self.bias=bias
    def fetch(self, shape):
        # Normalize `shape` to (batch, seq_n, din); a bare int or a
        # short sequence is left-padded with 1s.
        if type(shape)==int:
            shape = [shape]
        shape = list(shape)
        while len(shape)<3:
            shape = [1]+shape
        din = shape[2]
        seq_n = shape[1]
        batch = shape[0]
        return batch, seq_n, din
    def cache(self, shape, unit=1):
        # Cached activation elements (times `unit` bytes/element):
        # the three q/k/v projections plus two seq x seq weight maps.
        batch, seq_n, din = self.fetch(shape)
        n_input = 3*batch*seq_n*din
        n_w = 2*batch*self.num_heads*seq_n*seq_n
        n = n_input+n_w
        return n*unit
    def cal(self, shape):
        # Forward FLOPs: q/k/v projections (3 matmuls, 2 ops each),
        # attention scores + weighted sum, and the output projection.
        batch, seq_n, din = self.fetch(shape)
        n = 6*batch*seq_n*din*self.dout+2*batch*self.num_heads*seq_n*seq_n*self.dout_per_head+2*batch*seq_n*self.dout*self.dout
        return n
    def backcal(self, shape):
        # Backward pass is modelled as twice the forward cost.
        return 2*self.cal(shape)
    def size(self, unit=1):
        # Parameter count: 3 projection matrices (+bias) plus one
        # output matrix (+bias); `bias` (bool) contributes 0 or 1.
        kqv = 3*(self.din*self.dout+self.dout*self.bias)
        out = self.dout*self.dout+self.dout*self.bias
        n = kqv+out
        return n*unit
    def out(self, shape):
        # Shape produced by the layer: last dim becomes dout.
        batch, seq_n, din = self.fetch(shape)
        return [batch, seq_n, self.dout]

    pass
@@ -0,0 +1,50 @@
1
+
2
from buildz.base import Base
from .util import *
class Conv(Base):
    """Analytical cost model for an N-dimensional convolution.

    Shapes are [batch, channel, dim1, dim2, ...].
    """
    def init(self, dims, ch_in, ch_out, kernel, bias=True, stride=1, padding=0):
        self.ch_in = ch_in
        self.ch_out = ch_out
        self.dims = dims
        self.bias = bias
        # exp() broadcasts a scalar to one value per spatial dimension.
        self.kernel = exp(kernel, dims)
        self.stride = exp(stride, dims)
        self.padding = exp(padding, dims)
    def fetch(self, shape):
        # Normalize to [batch, channel, *spatial]; left-pad with 1s.
        if type(shape) == int:
            shape = [shape]
        shape = list(shape)
        while len(shape) < self.dims+2:
            shape = [1]+shape
        batch = shape[0]
        ch = shape[1]
        return batch, ch, shape[2:]
    def out(self, shape):
        # Output shape via the standard conv size formula:
        # floor((in - k + 2p) / s) + 1 per spatial dim.
        batch, ch, szs = self.fetch(shape)
        outs = [batch, self.ch_out]
        for kernel_size, stride, padding, dim_in in zip(self.kernel, self.stride, self.padding, szs):
            val = int((dim_in - kernel_size + 2*padding)/stride)+1
            outs.append(val)
        return outs
    def cache(self, shape, unit=1):
        # Cached elements: input activations plus the weight tensor.
        batch, ch, szs = self.fetch(shape)
        n_input = batch*ch*mul(szs)
        n_w = self.ch_out*self.ch_in*mul(self.kernel)
        n = n_input+n_w
        return n*unit
    def cal(self, shape):
        # Forward FLOPs: 2 * batch * out_spatial * ch_out * ch_in * kernel.
        # Fixed: the original computed self.out(shape) but then read the
        # *input* spatial sizes (shape[2:]) and dropped the batch factor.
        out = self.out(shape)
        batch = out[0]
        out_szs = out[2:]
        n = 2*batch*mul(out_szs)*self.ch_out*self.ch_in*mul(self.kernel)
        return n
    def backcal(self, shape):
        # Backward modelled as twice forward.  Signature aligned with
        # the sibling classes (was `backcal(self, batch=1)`).
        return 2*self.cal(shape)
    def size(self, unit=1):
        # Weights plus optional per-output-channel bias (bool -> 0/1).
        n = self.ch_in*self.ch_out*mul(self.kernel)+self.bias*self.ch_out
        return n*unit
@@ -0,0 +1,32 @@
1
+
2
from buildz.base import Base
from .util import *
class Linear(Base):
    """Analytical cost model for a fully connected (linear) layer."""
    def init(self, din, dout, bias=True):
        self.din = din
        self.dout = dout
        self.bias = bias
    def fetch(self, shape):
        # Collapse all leading dimensions into a single batch size.
        if type(shape) == int:
            shape = [shape]
        shape = list(shape)
        if len(shape) < 2:
            shape = [1] + shape
        return mul(shape[:-1]), shape[-1]
    def cache(self, shape, unit=1):
        # Cached elements: input activations plus the weight matrix.
        batch, din = self.fetch(shape)
        elems = batch*self.din + self.din*self.dout
        return elems * unit
    def cal(self, shape):
        # One matmul: 2 ops per multiply-accumulate.
        batch, din = self.fetch(shape)
        return 2 * batch * self.din * self.dout
    def backcal(self, shape):
        # Backward modelled as twice forward.
        return self.cal(shape) * 2
    def size(self, unit=1):
        # Weights plus optional bias vector (bool -> 0/1).
        params = self.din*self.dout + self.bias*self.dout
        return params * unit
    def out(self, shape):
        # Leading dims are flattened into one batch dim.
        batch, din = self.fetch(shape)
        return [batch, self.dout]
@@ -0,0 +1,25 @@
1
+
2
from buildz.base import Base
from .util import *
class Nets(Base):
    """Chains cost models: sums a metric over the layers while feeding
    each layer's output shape into the next layer."""
    def init(self, *nets):
        self.nets = nets
    def call(self, fc, shape, *a, **b):
        # Accumulate `fc` over all layers, threading the shape through.
        total = 0
        for net in self.nets:
            total += getattr(net, fc)(shape, *a, **b)
            shape = net.out(shape)
        return total
    def cache(self, shape, unit=1):
        return self.call("cache", shape, unit)
    def cal(self, shape):
        return self.call("cal", shape)
    def backcal(self, shape):
        return self.call("backcal", shape)
    def size(self, unit=1):
        # Parameter size is shape-independent, so no threading needed.
        return sum(net.size(unit) for net in self.nets)
    def out(self, shape):
        # Shape after passing through the whole chain.
        for net in self.nets:
            shape = net.out(shape)
        return shape
@@ -0,0 +1,12 @@
1
+
2
from torch import nn


class ReshapeModule(nn.Module):
    """nn.Module that reshapes its input to a fixed target shape."""

    def __init__(self, shape):
        super().__init__()
        self.shape = shape

    def forward(self, inputs):
        return inputs.reshape(self.shape)


def reshape(shape):
    """Factory: build a ReshapeModule targeting `shape`."""
    return ReshapeModule(shape)
@@ -0,0 +1,16 @@
1
+
2
from buildz.base import Base
from .util import *
class Reshape(Base):
    """Cost model for a reshape: free in compute, cache and params."""
    def init(self, shape):
        self.shape = shape
    def cache(self, shape, unit=1):
        # A reshape caches nothing.
        return 0
    def cal(self, shape):
        # No arithmetic is performed.
        return 0
    def backcal(self, shape):
        return 0
    def size(self, unit=1):
        # No learnable parameters.
        return 0
    def out(self, shape):
        # Always yields the configured target shape.
        return self.shape
@@ -0,0 +1,40 @@
1
+
2
from buildz.base import Base
from .util import *
class Upsample(Base):
    """Cost model for an upsampling layer ([batch, ch, *spatial])."""
    def init(self, dims, size=None, scale_factor=None, mode='nearest'):
        self.dims = dims
        # Like torch.nn.Upsample: either a target size or a scale
        # factor must be provided.
        assert size is not None or scale_factor is not None
        self.size = exp(size, dims)
        self.scale_factor = exp(scale_factor, dims)
        self.mode = mode
    def fetch(self, shape):
        # Normalize to [batch, channel, *spatial]; left-pad with 1s.
        if type(shape) == int:
            shape = [shape]
        shape = list(shape)
        while len(shape) < self.dims+2:
            shape = [1]+shape
        batch = shape[0]
        ch = shape[1]
        return batch, ch, shape[2:]
    def cache(self, shape, unit=1):
        # Nearest-neighbour needs no cached activations; other modes
        # are modelled as caching the full input.
        if self.mode == 'nearest':
            return 0
        n = mul(shape)
        return n*unit
    def cal(self, shape):
        # ~4 ops per produced output element.
        oshape = self.out(shape)
        batch, ch, oszs = self.fetch(oshape)
        n = 4*batch*ch*mul(oszs)
        return n
    def backcal(self, shape):
        # Fixed: the original called self.cal(batch) with an undefined
        # name `batch` (NameError).  Backward is modelled as forward.
        return self.cal(shape)
    def size(self, unit=1):
        # Upsampling has no learnable parameters.
        return 0
    def out(self, shape):
        batch, ch, szs = self.fetch(shape)
        if self.size:
            osz = list(self.size)
        else:
            # Fixed: the original zipped over self.size (None on this
            # branch) and *divided* by the factor; upsampling scales
            # each input size UP by scale_factor (floored, as torch
            # nn.Upsample does).
            osz = [int(sz*sf) for sz, sf in zip(szs, self.scale_factor)]
        return [batch, ch]+osz
@@ -0,0 +1,45 @@
1
+
2
+
3
def mul(arr):
    """Product of the elements of a list/tuple; any other value is
    returned unchanged (a scalar is its own product)."""
    if type(arr) not in {list, tuple}:
        return arr
    product = 1
    for factor in arr:
        product *= factor
    return product
10
+
11
def exp(val, dims):
    """Broadcast a scalar to a list of length `dims`.

    Lists and tuples pass through untouched; None stays None.
    """
    if val is None:
        return None
    if type(val) in {list, tuple}:
        return val
    return [val]*dims
17
+
18
def format_size(n, unit=1024):
    """Render a count as a human-readable string with 3 decimals,
    e.g. 1536 -> '1.500K' (suffixes step by `unit`)."""
    suffixes = ",K,M,G,T,P,E,Z,Y".split(",")
    idx = 0
    # Divide down until n fits under one unit or suffixes run out.
    while n >= unit and idx < len(suffixes)-1:
        n = n/unit
        idx += 1
    return f"{n:.3f}{suffixes[idx]}"
26
+
27
def nsize(fmt, unit=1024):
    """Parse a human-readable size such as '10T' or '1.5K' into a number.

    An optional one-letter suffix (K/M/G/T/P/E/Z/Y) multiplies the
    numeric prefix by the matching power of `unit`.  Generalized: the
    suffix is now accepted case-insensitively ('10k' == '10K'); a bare
    number parses as-is.  Returns a float.
    """
    fmt = fmt.strip()
    units = ",K,M,G,T,P,E,Z,Y".split(",")
    # Map suffix -> multiplier ("" -> 1, "K" -> unit, "M" -> unit**2, ...).
    maps = {}
    val = 1
    for k in units:
        maps[k] = val
        val = val*unit
    k = fmt[-1].upper()
    if k not in maps:
        # No recognized suffix: the whole string is the number.
        k = ""
    val = float(fmt[:len(fmt)-len(k)])
    return val*maps[k]
40
+
41
# Short public aliases for the helpers above.
ns = nsize
fmt_sz = format_size
fmt_size = format_size
@@ -0,0 +1,22 @@
1
+
2
+ cal:{
3
+ // theoretical peak values
4
+ // RTX4060: 15T
5
+ // i7-13700H: 657G
6
+ // measured values
7
+ RTX4060: 12T
8
+ gpu: 12T
9
+ i7-13700H: 500G
10
+ cpu: 500G
11
+ }
12
+ trans: {
13
+ gpu: 272G
14
+ // PCIe 4.0 x8 = 2G*8, theoretical value
15
+ // gpu_mem: 16G
16
+ // measured value; requires pin_memory=True (e.g. torch.randn(..., pin_memory=True)) so host memory is page-locked and transfers can reach full speed
17
+ //
18
+ gpu2mem: 10G
19
+ // real training data is scattered across host memory, so copies to VRAM are slower
20
+ gpu2mem: 1.6G
21
+ wifi6: 1.2G
22
+ }
@@ -0,0 +1,72 @@
1
+
2
+
3
+
4
+ from buildz.gpuz.az import *
5
+ # from buildz.gpuz.az.nreshape import reshape
6
+
7
+ from buildz import xf, fz
8
+
9
+ # from torch import nn
10
+ # import torch
11
import os
# Resolve az.js, which ships next to this module, and load the
# measured hardware numbers from it (xf.loadf parses the config).
dp = os.path.dirname(__file__)
fp = os.path.join(dp, "az.js")
conf = xf.loadf(fp)
speed = conf.get("cal", {})
# Compute throughput (ops/sec); defaults used when a key is absent.
speed_gpu = nsize(speed.get("gpu", "10T"))
speed_cpu = nsize(speed.get("cpu", "500G"))
trans = conf.get("trans", {})
# Host-to-GPU copy bandwidth (bytes/sec) for real training data.
trans_mem = nsize(trans.get("gpu2mem", "1G"))
20
def unit(din):
    """One transformer-style block for the cost model: multi-head
    attention (8 heads) followed by a linear layer, both din -> din."""
    return [
        MultiAttrn(din, din, 8),
        Linear(din, din),
    ]
29
+
30
def conv_unit(ksize, ch_in, ch_out, reshape):
    """Cost-model layers for: reshape to an image, run a 'same'-padded
    2-d conv, reshape back, then a linear layer of the module-level
    width `din`.  Returns the list of layers."""
    nets = []
    nets.append(Reshape(reshape))
    # Fixed: the original tested `type(ksize) != {list, tuple}`, which
    # is always True (a type never equals a set), so list kernels were
    # wrapped into [[k,k],[k,k]].
    if type(ksize) not in (list, tuple):
        ksize = [ksize, ksize]
    # Positional args: bias=1 (truthy), stride=1, padding=k//2 ('same').
    nets.append(Conv(2, ch_in, ch_out, ksize, 1, 1, [k//2 for k in ksize]))
    batch, ch, w, h = reshape
    outshape = [batch, ch_out, w, h]
    nets.append(Reshape(outshape))
    # NOTE(review): `din` is the module-level global, not derived from
    # this unit's shapes -- confirm that is intended.
    nets.append(Linear(din, din))
    return nets
43
+
44
din =4096
# din=256
batch=1
seq_n=1024
az_nets = []
# Stack 10 conv+attention units into one analytical model.
for i in range(10):
    az_nets+=conv_unit(3, 4, 4, [batch, 4, 32, 32])
    az_nets+=unit(din)

az_nets = Nets(*az_nets)
data_shape = [batch, din]

# All byte sizes below assume 4 bytes per element (float32).
data_size = mul(data_shape)*4
data_trans = data_size/trans_mem
print(f"data: {fmt_sz(data_size)}, data_trans: {data_trans:.3f} sec")
net_size = az_nets.size(4)
net_trans = net_size/trans_mem
print(f"net size: {fmt_sz(net_size)}, net_trans: {net_trans:.3f} sec")
cache_size = az_nets.cache(data_shape,4)
cache_trans = cache_size/trans_mem
print(f"cache: {fmt_sz(cache_size)}, cache_trans: {cache_trans:.3f} sec")
# Predicted compute time on GPU vs CPU from the configured speeds.
cal_amount = az_nets.cal(data_shape)
cal_gpu = cal_amount/speed_gpu
cal_cpu = cal_amount/speed_cpu
print(f"cal: {fmt_sz(cal_amount)}, cal_gpu: {cal_gpu:.3f} sec, cal_cpu: {cal_cpu:.3f} sec")
print("done")
"""
python -m buildz.gpuz.demo.az
"""
@@ -0,0 +1,44 @@
1
+
2
+
3
+
4
# CPU matmul throughput benchmark: run a stack of large Linear layers
# on the CPU and compare the measured speed against the analytical
# FLOP count from the az cost models.
from buildz.gpuz.az import *


from buildz import xf, fz

from torch import nn
import torch
din =4096*2
dout = 4096*2
batch=10240
az_nets = []
nets = []
# Build the cost model and the real torch network in parallel.
for i in range(5):
    az_nets.append(Linear(din,dout))
    nets.append(nn.Linear(din,dout))
az_nets = Nets(*az_nets)
nets = nn.Sequential(*nets)
data = torch.randn(batch, din)
print(f"data: {fmt_sz(batch*din)}")
print(f"net size: {fmt_sz(az_nets.size(4))}")
print(f"cache: {fmt_sz(az_nets.cache(data.size(),4))}")
print(f"cal: {fmt_sz(az_nets.cal(data.size()))}")
print("done")
import time
# nets=nets.cuda()
# data=data.cuda()
loop=5
out = data
curr=time.time()
# Inference only: no_grad avoids autograd bookkeeping in the timing.
with torch.no_grad():
    for i in range(loop):
        out = nets(out)
sec = time.time()-curr
print(f"time cost: {sec}/{loop}")
assert sec>0
speed = az_nets.cal(data.size())*loop/(sec+1e-10)
print(f"speed: {fmt_sz(speed)}")

"""
python -m buildz.gpuz.demo.cal_cpu
"""
@@ -0,0 +1,44 @@
1
+
2
+
3
+
4
# GPU matmul throughput benchmark: same stack as cal_cpu.py but moved
# to CUDA, with more iterations since the GPU is far faster.
from buildz.gpuz.az import *


from buildz import xf, fz

from torch import nn
import torch
din =4096*2
dout = 4096*2
batch=10240
az_nets = []
nets = []
# Build the cost model and the real torch network in parallel.
for i in range(5):
    az_nets.append(Linear(din,dout))
    nets.append(nn.Linear(din,dout))
az_nets = Nets(*az_nets)
nets = nn.Sequential(*nets)
data = torch.randn(batch, din)
print(f"data: {fmt_sz(batch*din)}")
print(f"net size: {fmt_sz(az_nets.size(4))}")
print(f"cache: {fmt_sz(az_nets.cache(data.size(),4))}")
print(f"cal: {fmt_sz(az_nets.cal(data.size()))}")
print("done")
import time
nets=nets.cuda()
data=data.cuda()
loop=200
out = data
curr=time.time()
# Inference only: no_grad avoids autograd bookkeeping in the timing.
with torch.no_grad():
    for i in range(loop):
        out = nets(out)
sec = time.time()-curr
print(f"time cost: {sec}/{loop}")
assert sec>0
speed = az_nets.cal(data.size())*loop/(sec+1e-10)
print(f"speed: {fmt_sz(speed)}")

"""
python -m buildz.gpuz.demo.cal_gpu
"""
@@ -0,0 +1,34 @@
1
+
2
+
3
+
4
# Host-to-device transfer benchmark: copy `loop` distinct pinned
# tensors to the GPU and report the achieved bandwidth.
from buildz.gpuz.az import *


from buildz import xf, fz

from torch import nn
import torch
din =4096*4
batch=10240
loop=6
# data = torch.randn(batch, din, pin_memory=True)requires_grad=False
# Pinned (page-locked) host memory is needed to approach PCIe peak.
datas = [torch.randn(batch, din, pin_memory=True) for i in range(loop)]
n = batch*din
# Bytes per tensor: 4 bytes/element (torch default float32).
size = n*4
print(f"data: {fmt_sz(batch*din*loop)}")
import time
curr=time.time()
for i in range(loop):
    data = datas[i]
    data=data.cuda()
    # Drop the device copy so each iteration transfers a fresh tensor.
    del data
    # data=data.cpu()
sec = time.time()-curr
print(f"time cost: {sec}/{loop}")
assert sec>0
speed = size*loop/(sec+1e-10)
print(f"speed: {fmt_sz(speed)}")

"""
python -m buildz.gpuz.demo.trans
"""
@@ -0,0 +1,32 @@
1
+
2
+
3
+
4
# Round-trip transfer benchmark: repeatedly copy one pinned tensor to
# the GPU and back, measuring the effective one-way bandwidth.
from buildz.gpuz.az import *


from buildz import xf, fz

from torch import nn
import torch
din =4096*4
batch=10240
loop=10
# pin_memory gives page-locked host memory for fast DMA transfers.
data = torch.randn(batch, din, pin_memory=True)
n = batch*din
# Bytes per tensor: 4 bytes/element (torch default float32).
size = n*4
print(f"data: {fmt_sz(batch*din*loop)}")
import time
curr=time.time()
for i in range(loop):
    data = data
    data=data.cuda()
    # .contiguous() forces the copy back into one dense host buffer.
    data=data.cpu().contiguous()
sec = time.time()-curr
print(f"time cost: {sec}/{loop}")
assert sec>0
speed = size*loop/(sec+1e-10)
print(f"speed: {fmt_sz(speed)}")

"""
python -m buildz.gpuz.demo.trans2
"""