l0n0lacl 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- l0n0lacl-0.0.1/PKG-INFO +193 -0
- l0n0lacl-0.0.1/README.md +171 -0
- l0n0lacl-0.0.1/l0n0lacl/AclNDTensor.py +93 -0
- l0n0lacl-0.0.1/l0n0lacl/AclStream.py +24 -0
- l0n0lacl-0.0.1/l0n0lacl/OpRunner.py +97 -0
- l0n0lacl-0.0.1/l0n0lacl/__init__.py +20 -0
- l0n0lacl-0.0.1/l0n0lacl/utils.py +222 -0
- l0n0lacl-0.0.1/l0n0lacl.egg-info/PKG-INFO +193 -0
- l0n0lacl-0.0.1/l0n0lacl.egg-info/SOURCES.txt +12 -0
- l0n0lacl-0.0.1/l0n0lacl.egg-info/dependency_links.txt +1 -0
- l0n0lacl-0.0.1/l0n0lacl.egg-info/requires.txt +2 -0
- l0n0lacl-0.0.1/l0n0lacl.egg-info/top_level.txt +1 -0
- l0n0lacl-0.0.1/setup.cfg +4 -0
- l0n0lacl-0.0.1/setup.py +166 -0
l0n0lacl-0.0.1/PKG-INFO
ADDED
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: l0n0lacl
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: 用于调用ascendc编写的算子
|
|
5
|
+
Author: l0n0l
|
|
6
|
+
Author-email: 1038352856@qq.com
|
|
7
|
+
Keywords: acl,ascendc,算子,算子开发
|
|
8
|
+
Classifier: Development Status :: 3 - Alpha
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: Topic :: Software Development :: Build Tools
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.7
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
18
|
+
Requires-Python: >=3.7, <4
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
Requires-Dist: colorama
|
|
21
|
+
Requires-Dist: numpy
|
|
22
|
+
|
|
23
|
+
# 1 功能描述
|
|
24
|
+
由于在ascendc算子开发过程中运行算子比较复杂,为了简化算子的运行,将运行算子变成可以用python直接调用的函数。所以编写了此代码。
|
|
25
|
+
|
|
26
|
+
# 2 安装
|
|
27
|
+
```
|
|
28
|
+
pip install l0n0lacl
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
# 3 运行算子实例
|
|
32
|
+
## 3.1 先切换到cann环境,比如我的环境是:
|
|
33
|
+
```
|
|
34
|
+
source /home/HwHiAiUser/Ascend/ascend-toolkit/set_env.sh
|
|
35
|
+
```
|
|
36
|
+
## 3.2 先安装我们编写的算子
|
|
37
|
+
```
|
|
38
|
+
bash custom_opp_xxx_aarch64.run
|
|
39
|
+
```
|
|
40
|
+
## 3.3 创建算子运行器
|
|
41
|
+
```python
|
|
42
|
+
from l0n0lacl import *
|
|
43
|
+
ascendc_gelu = OpRunner("Gelu", op_path_prefix='customize')
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
## 3.4 调用算子
|
|
47
|
+
### 3.4.1 先看调用传参顺序
|
|
48
|
+
在算子工程编译后,会有代码生成,在算子工程目录:
|
|
49
|
+
`${算子目录}/build_out/autogen/aclnn_xxx.h`中可以找到`aclnnXXXGetWorkspaceSize`函数。以Gelu为例:
|
|
50
|
+
```c++
|
|
51
|
+
__attribute__((visibility("default")))
|
|
52
|
+
aclnnStatus aclnnGeluGetWorkspaceSize(
|
|
53
|
+
const aclTensor *x,
|
|
54
|
+
const aclTensor *out,
|
|
55
|
+
uint64_t *workspaceSize,
|
|
56
|
+
aclOpExecutor **executor);
|
|
57
|
+
```
|
|
58
|
+
可以看到参数为 `x`, `out`, `workspaceSize`, `executor`。其中 `workspaceSize`, `executor`不需要管。
|
|
59
|
+
* `aclTensor*`对应`numpy.ndarray`
|
|
60
|
+
* 其他参考: <a href = "https://docs.python.org/zh-cn/3/library/ctypes.html#fundamental-data-types">ctypes类型</a>
|
|
61
|
+
### 3.4.2 调用算子
|
|
62
|
+
```python
|
|
63
|
+
import torch
|
|
64
|
+
from l0n0lacl import *
|
|
65
|
+
ascendc_gelu = OpRunner("Gelu", op_path_prefix='customize')
|
|
66
|
+
target_dtype = torch.float
|
|
67
|
+
x = torch.empty(shape, dtype=target_dtype).uniform_(-1, 1)
|
|
68
|
+
y = torch.empty(shape, dtype=target_dtype).zero_()
|
|
69
|
+
out = ascendc_gelu(x.numpy(), y.numpy()).to_cpu()
|
|
70
|
+
print(out)
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
# 4. api参考
|
|
74
|
+
## 4.1 AclNDTensor
|
|
75
|
+
```python
|
|
76
|
+
class AclNDTensor:
|
|
77
|
+
def __init__(self, np_array: np.ndarray):
|
|
78
|
+
pass
|
|
79
|
+
def to_cpu(self):
|
|
80
|
+
pass
|
|
81
|
+
```
|
|
82
|
+
numpy ndarray与ascend nd tensor间的桥梁
|
|
83
|
+
### 4.1.1 `__init__`
|
|
84
|
+
* `np_array`: numpy的tensor
|
|
85
|
+
### 4.1.2 `to_cpu`
|
|
86
|
+
将运算结果从npu拷贝到cpu
|
|
87
|
+
## 4.2 OpRunner
|
|
88
|
+
```python
|
|
89
|
+
class OpRunner:
|
|
90
|
+
def __init__(self, name, op_path_prefix='customize', op_path=None, device_id=0) -> None:
|
|
91
|
+
pass
|
|
92
|
+
def __call__(self, *args, outCout=1, argtypes=None, stream=None) -> Union[AclNDTensor, List[AclNDTensor]]:
|
|
93
|
+
pass
|
|
94
|
+
def sync_stream(self)->None:
|
|
95
|
+
pass
|
|
96
|
+
```
|
|
97
|
+
### 4.2.1 `__init__`
|
|
98
|
+
* `name`:算子名称,
|
|
99
|
+
* `op_path_prefix`: 算子工程中**CMakePresets.json**文件中**vendor_name**的值。默认是`customize`,可以不传
|
|
100
|
+
```json
|
|
101
|
+
"vendor_name": {
|
|
102
|
+
"type": "STRING",
|
|
103
|
+
"value": "customize"
|
|
104
|
+
},
|
|
105
|
+
```
|
|
106
|
+
* `op_path`: 算子`libcust_opapi.so`库的绝对路径。可以不传;不传时根据`op_path_prefix`自动拼接。
|
|
107
|
+
* `device_id`: 设备ID。默认`0`
|
|
108
|
+
|
|
109
|
+
### 4.2.2 `__call__`
|
|
110
|
+
* `args`: 表示传给`aclnnXXXGetWorkspaceSize`除了`workspaceSize`, `executor`的参数
|
|
111
|
+
* `outCout` : 表示算子的输出个数。如果输出个数为`1`,返回一个`AclNDTensor`。如果输出个数大于1,返回`List[AclNDTensor]`
|
|
112
|
+
* `argtypes`: 表示`aclnnXXXGetWorkspaceSize`的参数`ctypes`参数类型,对于特别复杂的算子,如果发现调用异常,可以手动指定类型。
|
|
113
|
+
比如(**仅用于举例,其实可以不传,自动推导就可运行。但是当发现运行异常的情况下,可以自己指定**),对于:
|
|
114
|
+
```c++
|
|
115
|
+
__attribute__((visibility("default")))
|
|
116
|
+
aclnnStatus aclnnCumsumGetWorkspaceSize(
|
|
117
|
+
const aclTensor *x,
|
|
118
|
+
const aclTensor *axis,
|
|
119
|
+
bool exclusiveOptional,
|
|
120
|
+
bool reverseOptional,
|
|
121
|
+
const aclTensor *out,
|
|
122
|
+
uint64_t *workspaceSize,
|
|
123
|
+
aclOpExecutor **executor);
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
```python
|
|
127
|
+
import ctypes
|
|
128
|
+
from l0n0lacl import *
|
|
129
|
+
ascendc_cumsum = OpRunner("Cumsum")
|
|
130
|
+
target_dtype = np.float32
|
|
131
|
+
data_range = (-10, 10)
|
|
132
|
+
shape = [100, 3, 2304]
|
|
133
|
+
axis_py = 1
|
|
134
|
+
exclusive = True
|
|
135
|
+
reverse = False
|
|
136
|
+
x = np.random.uniform(*data_range, shape).astype(target_dtype)
|
|
137
|
+
axis = np.array([axis_py]).astype(np.int32)
|
|
138
|
+
golden: np.ndarray = tf.cumsum(x, axis_py, exclusive, reverse).numpy()
|
|
139
|
+
y = np.ones_like(golden, golden.dtype) * 123
|
|
140
|
+
ascendc_cumsum(x, axis, exclusive, reverse, y, argtypes=[
|
|
141
|
+
ctypes.c_void_p, # x
|
|
142
|
+
ctypes.c_void_p, # axis
|
|
143
|
+
ctypes.c_bool, # exclusiveOptional
|
|
144
|
+
ctypes.c_bool, # reverseOptional
|
|
145
|
+
ctypes.c_void_p, # out
|
|
146
|
+
ctypes.c_void_p, # workspaceSize
|
|
147
|
+
ctypes.c_void_p, # executor
|
|
148
|
+
]).to_cpu()
|
|
149
|
+
print(y)
|
|
150
|
+
```
|
|
151
|
+
* `stream` 如果是多stream的情况下,可以自己指定stream:
|
|
152
|
+
例如:
|
|
153
|
+
```python
|
|
154
|
+
import ctypes
|
|
155
|
+
import tensorflow as tf
|
|
156
|
+
from l0n0lacl import *
|
|
157
|
+
ascendc_cumsum = OpRunner("Cumsum")
|
|
158
|
+
target_dtype = np.float32
|
|
159
|
+
data_range = (-10, 10)
|
|
160
|
+
shape = [100, 3, 2304]
|
|
161
|
+
axis_py = 1
|
|
162
|
+
exclusive = True
|
|
163
|
+
reverse = False
|
|
164
|
+
x = np.random.uniform(*data_range, shape).astype(target_dtype)
|
|
165
|
+
axis = np.array([axis_py]).astype(np.int32)
|
|
166
|
+
golden: np.ndarray = tf.cumsum(x, axis_py, exclusive, reverse).numpy()
|
|
167
|
+
y = np.ones_like(golden, golden.dtype) * 123
|
|
168
|
+
ascendc_cumsum(x, axis, exclusive, reverse, y, argtypes=[
|
|
169
|
+
ctypes.c_void_p, # x
|
|
170
|
+
ctypes.c_void_p, # axis
|
|
171
|
+
ctypes.c_bool, # exclusiveOptional
|
|
172
|
+
ctypes.c_bool, # reverseOptional
|
|
173
|
+
ctypes.c_void_p, # out
|
|
174
|
+
ctypes.c_void_p, # workspaceSize
|
|
175
|
+
ctypes.c_void_p, # executor
|
|
176
|
+
]).to_cpu()
|
|
177
|
+
verify_result(y, golden)
|
|
178
|
+
print(y)
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
### 4.2.3 `sync_stream`
|
|
182
|
+
用于同步stream
|
|
183
|
+
|
|
184
|
+
## 4.3 verify_result
|
|
185
|
+
参考自:https://gitee.com/ascend/samples/blob/master/operator/AddCustomSample/KernelLaunch/AddKernelInvocationNeo/scripts/verify_result.py
|
|
186
|
+
```python
|
|
187
|
+
def verify_result(real_result:numpy.ndarray, golden:numpy.ndarray):
|
|
188
|
+
pass
|
|
189
|
+
```
|
|
190
|
+
判断精度是否符合
|
|
191
|
+
float16: 千分之一
|
|
192
|
+
float32: 万分之一
|
|
193
|
+
int16,int32,int8: 0
|
l0n0lacl-0.0.1/README.md
ADDED
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
# 1 功能描述
|
|
2
|
+
由于在ascendc算子开发过程中运行算子比较复杂,为了简化算子的运行,将运行算子变成可以用python直接调用的函数。所以编写了此代码。
|
|
3
|
+
|
|
4
|
+
# 2 安装
|
|
5
|
+
```
|
|
6
|
+
pip install l0n0lacl
|
|
7
|
+
```
|
|
8
|
+
|
|
9
|
+
# 3 运行算子实例
|
|
10
|
+
## 3.1 先切换到cann环境,比如我的环境是:
|
|
11
|
+
```
|
|
12
|
+
source /home/HwHiAiUser/Ascend/ascend-toolkit/set_env.sh
|
|
13
|
+
```
|
|
14
|
+
## 3.2 先安装我们编写的算子
|
|
15
|
+
```
|
|
16
|
+
bash custom_opp_xxx_aarch64.run
|
|
17
|
+
```
|
|
18
|
+
## 3.3 创建算子运行器
|
|
19
|
+
```python
|
|
20
|
+
from l0n0lacl import *
|
|
21
|
+
ascendc_gelu = OpRunner("Gelu", op_path_prefix='customize')
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
## 3.4 调用算子
|
|
25
|
+
### 3.4.1 先看调用传参顺序
|
|
26
|
+
在算子工程编译后,会有代码生成,在算子工程目录:
|
|
27
|
+
`${算子目录}/build_out/autogen/aclnn_xxx.h`中可以找到`aclnnXXXGetWorkspaceSize`函数。以Gelu为例:
|
|
28
|
+
```c++
|
|
29
|
+
__attribute__((visibility("default")))
|
|
30
|
+
aclnnStatus aclnnGeluGetWorkspaceSize(
|
|
31
|
+
const aclTensor *x,
|
|
32
|
+
const aclTensor *out,
|
|
33
|
+
uint64_t *workspaceSize,
|
|
34
|
+
aclOpExecutor **executor);
|
|
35
|
+
```
|
|
36
|
+
可以看到参数为 `x`, `out`, `workspaceSize`, `executor`。其中 `workspaceSize`, `executor`不需要管。
|
|
37
|
+
* `aclTensor*`对应`numpy.ndarray`
|
|
38
|
+
* 其他参考: <a href = "https://docs.python.org/zh-cn/3/library/ctypes.html#fundamental-data-types">ctypes类型</a>
|
|
39
|
+
### 3.4.2 调用算子
|
|
40
|
+
```python
|
|
41
|
+
import torch
|
|
42
|
+
from l0n0lacl import *
|
|
43
|
+
ascendc_gelu = OpRunner("Gelu", op_path_prefix='customize')
|
|
44
|
+
target_dtype = torch.float
|
|
45
|
+
x = torch.empty(shape, dtype=target_dtype).uniform_(-1, 1)
|
|
46
|
+
y = torch.empty(shape, dtype=target_dtype).zero_()
|
|
47
|
+
out = ascendc_gelu(x.numpy(), y.numpy()).to_cpu()
|
|
48
|
+
print(out)
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
# 4. api参考
|
|
52
|
+
## 4.1 AclNDTensor
|
|
53
|
+
```python
|
|
54
|
+
class AclNDTensor:
|
|
55
|
+
def __init__(self, np_array: np.ndarray):
|
|
56
|
+
pass
|
|
57
|
+
def to_cpu(self):
|
|
58
|
+
pass
|
|
59
|
+
```
|
|
60
|
+
numpy ndarray与ascend nd tensor间的桥梁
|
|
61
|
+
### 4.1.1 `__init__`
|
|
62
|
+
* `np_array`: numpy的tensor
|
|
63
|
+
### 4.1.2 `to_cpu`
|
|
64
|
+
将运算结果从npu拷贝到cpu
|
|
65
|
+
## 4.2 OpRunner
|
|
66
|
+
```python
|
|
67
|
+
class OpRunner:
|
|
68
|
+
def __init__(self, name, op_path_prefix='customize', op_path=None, device_id=0) -> None:
|
|
69
|
+
pass
|
|
70
|
+
def __call__(self, *args, outCout=1, argtypes=None, stream=None) -> Union[AclNDTensor, List[AclNDTensor]]:
|
|
71
|
+
pass
|
|
72
|
+
def sync_stream(self)->None:
|
|
73
|
+
pass
|
|
74
|
+
```
|
|
75
|
+
### 4.2.1 `__init__`
|
|
76
|
+
* `name`:算子名称,
|
|
77
|
+
* `op_path_prefix`: 算子工程中**CMakePresets.json**文件中**vendor_name**的值。默认是`customize`,可以不传
|
|
78
|
+
```json
|
|
79
|
+
"vendor_name": {
|
|
80
|
+
"type": "STRING",
|
|
81
|
+
"value": "customize"
|
|
82
|
+
},
|
|
83
|
+
```
|
|
84
|
+
* `op_path`: 算子`libcust_opapi.so`库的绝对路径。可以不传;不传时根据`op_path_prefix`自动拼接。
|
|
85
|
+
* `device_id`: 设备ID。默认`0`
|
|
86
|
+
|
|
87
|
+
### 4.2.2 `__call__`
|
|
88
|
+
* `args`: 表示传给`aclnnXXXGetWorkspaceSize`除了`workspaceSize`, `executor`的参数
|
|
89
|
+
* `outCout` : 表示算子的输出个数。如果输出个数为`1`,返回一个`AclNDTensor`。如果输出个数大于1,返回`List[AclNDTensor]`
|
|
90
|
+
* `argtypes`: 表示`aclnnXXXGetWorkspaceSize`的参数`ctypes`参数类型,对于特别复杂的算子,如果发现调用异常,可以手动指定类型。
|
|
91
|
+
比如(**仅用于举例,其实可以不传,自动推导就可运行。但是当发现运行异常的情况下,可以自己指定**),对于:
|
|
92
|
+
```c++
|
|
93
|
+
__attribute__((visibility("default")))
|
|
94
|
+
aclnnStatus aclnnCumsumGetWorkspaceSize(
|
|
95
|
+
const aclTensor *x,
|
|
96
|
+
const aclTensor *axis,
|
|
97
|
+
bool exclusiveOptional,
|
|
98
|
+
bool reverseOptional,
|
|
99
|
+
const aclTensor *out,
|
|
100
|
+
uint64_t *workspaceSize,
|
|
101
|
+
aclOpExecutor **executor);
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
```python
|
|
105
|
+
import ctypes
|
|
106
|
+
from l0n0lacl import *
|
|
107
|
+
ascendc_cumsum = OpRunner("Cumsum")
|
|
108
|
+
target_dtype = np.float32
|
|
109
|
+
data_range = (-10, 10)
|
|
110
|
+
shape = [100, 3, 2304]
|
|
111
|
+
axis_py = 1
|
|
112
|
+
exclusive = True
|
|
113
|
+
reverse = False
|
|
114
|
+
x = np.random.uniform(*data_range, shape).astype(target_dtype)
|
|
115
|
+
axis = np.array([axis_py]).astype(np.int32)
|
|
116
|
+
golden: np.ndarray = tf.cumsum(x, axis_py, exclusive, reverse).numpy()
|
|
117
|
+
y = np.ones_like(golden, golden.dtype) * 123
|
|
118
|
+
ascendc_cumsum(x, axis, exclusive, reverse, y, argtypes=[
|
|
119
|
+
ctypes.c_void_p, # x
|
|
120
|
+
ctypes.c_void_p, # axis
|
|
121
|
+
ctypes.c_bool, # exclusiveOptional
|
|
122
|
+
ctypes.c_bool, # reverseOptional
|
|
123
|
+
ctypes.c_void_p, # out
|
|
124
|
+
ctypes.c_void_p, # workspaceSize
|
|
125
|
+
ctypes.c_void_p, # executor
|
|
126
|
+
]).to_cpu()
|
|
127
|
+
print(y)
|
|
128
|
+
```
|
|
129
|
+
* `stream` 如果是多stream的情况下,可以自己指定stream:
|
|
130
|
+
例如:
|
|
131
|
+
```python
|
|
132
|
+
import ctypes
|
|
133
|
+
import tensorflow as tf
|
|
134
|
+
from l0n0lacl import *
|
|
135
|
+
ascendc_cumsum = OpRunner("Cumsum")
|
|
136
|
+
target_dtype = np.float32
|
|
137
|
+
data_range = (-10, 10)
|
|
138
|
+
shape = [100, 3, 2304]
|
|
139
|
+
axis_py = 1
|
|
140
|
+
exclusive = True
|
|
141
|
+
reverse = False
|
|
142
|
+
x = np.random.uniform(*data_range, shape).astype(target_dtype)
|
|
143
|
+
axis = np.array([axis_py]).astype(np.int32)
|
|
144
|
+
golden: np.ndarray = tf.cumsum(x, axis_py, exclusive, reverse).numpy()
|
|
145
|
+
y = np.ones_like(golden, golden.dtype) * 123
|
|
146
|
+
ascendc_cumsum(x, axis, exclusive, reverse, y, argtypes=[
|
|
147
|
+
ctypes.c_void_p, # x
|
|
148
|
+
ctypes.c_void_p, # axis
|
|
149
|
+
ctypes.c_bool, # exclusiveOptional
|
|
150
|
+
ctypes.c_bool, # reverseOptional
|
|
151
|
+
ctypes.c_void_p, # out
|
|
152
|
+
ctypes.c_void_p, # workspaceSize
|
|
153
|
+
ctypes.c_void_p, # executor
|
|
154
|
+
]).to_cpu()
|
|
155
|
+
verify_result(y, golden)
|
|
156
|
+
print(y)
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
### 4.2.3 `sync_stream`
|
|
160
|
+
用于同步stream
|
|
161
|
+
|
|
162
|
+
## 4.3 verify_result
|
|
163
|
+
参考自:https://gitee.com/ascend/samples/blob/master/operator/AddCustomSample/KernelLaunch/AddKernelInvocationNeo/scripts/verify_result.py
|
|
164
|
+
```python
|
|
165
|
+
def verify_result(real_result:numpy.ndarray, golden:numpy.ndarray):
|
|
166
|
+
pass
|
|
167
|
+
```
|
|
168
|
+
判断精度是否符合
|
|
169
|
+
float16: 千分之一
|
|
170
|
+
float32: 万分之一
|
|
171
|
+
int16,int32,int8: 0
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
from .utils import *
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def build_acl_libnnopbase_path():
    """Return the path of ``libnnopbase.so`` inside the CANN toolkit.

    The toolkit root is read from the ``ASCEND_HOME_PATH`` environment
    variable. The per-architecture sub-directory (``<arch>-linux``) is derived
    from the host machine type instead of being fixed to ``aarch64``, so the
    same code also resolves the library on other hosts (e.g. ``x86_64``).

    Raises:
        KeyError: if ``ASCEND_HOME_PATH`` is not set.
    """
    import platform

    ascend_home_path = os.environ["ASCEND_HOME_PATH"]
    # Fall back to the previously hard-coded architecture when the platform
    # cannot be identified.
    arch = platform.machine() or "aarch64"
    return f"{ascend_home_path}/{arch}-linux/lib64/libnnopbase.so"
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
# Load the nnopbase shared library; it provides aclCreateTensor and
# aclDestroyTensor used by AclNDTensor below.
libnnopbase = ctypes.CDLL(build_acl_libnnopbase_path())

# C prototype being declared:
#   ACL_FUNC_VISIBILITY aclTensor *aclCreateTensor(
#       const int64_t *viewDims, uint64_t viewDimsNum, aclDataType dataType,
#       const int64_t *stride, int64_t offset, aclFormat format,
#       const int64_t *storageDims, uint64_t storageDimsNum, void *tensorData)
# The handle is returned as an opaque pointer.
libnnopbase.aclCreateTensor.restype = ctypes.c_void_p
libnnopbase.aclCreateTensor.argtypes = [
    ctypes.c_void_p,  # viewDims
    ctypes.c_uint64,  # viewDimsNum
    ctypes.c_int,  # dataType
    ctypes.c_void_p,  # stride
    ctypes.c_int64,  # offset
    ctypes.c_int,  # format
    ctypes.c_void_p,  # storageDims
    ctypes.c_uint64,  # storageDimsNum
    ctypes.c_void_p,  # tensorData
]

# aclDestroyTensor takes the opaque tensor handle.
libnnopbase.aclDestroyTensor.argtypes = [ctypes.c_void_p]
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class AclNDTensor:
    """Bridge between a host ``numpy.ndarray`` and a device-side ``aclTensor``.

    Construction allocates device memory (size rounded up to a multiple of
    256 bytes), copies the array to the device and creates an ``aclTensor``
    handle (``self.ptr``) describing that buffer. ``to_cpu`` copies the device
    buffer back into the original array.

    The device buffer is NOT freed automatically when the object is garbage
    collected; call ``release()`` once the tensor is no longer needed.
    """

    def __init__(self, np_array: np.ndarray):
        # Initialise every handle first so that __del__ / release() are safe
        # even if construction fails part-way through.
        self.ptr = 0
        self.device_ptr = 0
        self.op_runner = None
        self.need_copy_to_cpu = False
        self.np_array = np_array
        # The copies below work on a raw pointer plus a byte count, which is
        # only valid for a C-contiguous buffer. Non-contiguous inputs are
        # staged through a contiguous copy and written back in to_cpu().
        if np_array.flags["C_CONTIGUOUS"]:
            self._host_array = np_array
        else:
            self._host_array = np.ascontiguousarray(np_array)
        self.data_bytes_size = np_array.size * np_array.itemsize
        self.mem_size = int(math.ceil(self.data_bytes_size / 256) * 256)
        # Kept as an attribute so the dims buffer outlives the C call.
        self.shape = np.array(np_array.shape, dtype=np.int64)
        self.shape_size = len(np_array.shape)
        # Resolve the dtype before allocating so an unsupported dtype does not
        # leave device memory behind.
        self.acl_dtype = numpy_dtype_2_acl_dtype(np_array.dtype)
        if self.acl_dtype is None:
            raise TypeError(
                f"unsupported numpy dtype for aclTensor: {np_array.dtype}")
        if self.mem_size > 0:
            device_ptr, ret = acl.rt.malloc(
                self.mem_size, AclAllocPolicy.ACL_MEM_MALLOC_HUGE_FIRST)
            self._require(ret == 0, f"acl.rt.malloc failed: {ret}")
            self.device_ptr = device_ptr
            ret = acl.rt.memcpy(
                self.device_ptr,
                self.mem_size,
                self._host_array.ctypes.data,
                self.data_bytes_size,
                AclMemcopyKind.ACL_MEMCPY_HOST_TO_DEVICE,
            )
            if ret != 0:
                self._free_device()
            self._require(ret == 0, f"acl.rt.memcpy (host->device) failed: {ret}")
        handle = libnnopbase.aclCreateTensor(
            self.shape.ctypes.data,  # viewDims
            self.shape_size,  # viewDimsNum
            self.acl_dtype,  # dataType
            0,  # stride (NULL)
            0,  # offset
            2,  # format (value used by the original code)
            self.shape.ctypes.data,  # storageDims
            self.shape_size,  # storageDimsNum
            self.device_ptr,  # tensorData
        )
        # With restype c_void_p a NULL pointer comes back as None, so test
        # truthiness rather than ``!= 0``.
        if not handle:
            self._free_device()
        self._require(bool(handle), "aclCreateTensor returned NULL")
        self.ptr = handle

    @staticmethod
    def _require(ok, message):
        """Raise ``AssertionError(message)`` when *ok* is false.

        The original code used ``assert`` statements; raising explicitly keeps
        the same exception type while still validating under ``python -O``.
        """
        if not ok:
            raise AssertionError(message)

    def _free_device(self):
        """Free the device buffer if one is held."""
        if self.device_ptr:
            acl.rt.free(self.device_ptr)
            self.device_ptr = 0

    def _destroy_handle(self):
        """Destroy the aclTensor handle if one is held."""
        ptr = getattr(self, "ptr", 0)
        if ptr:
            libnnopbase.aclDestroyTensor(ptr)
            self.ptr = 0

    def __str__(self) -> str:
        return str(self.to_cpu())

    def __del__(self):
        # Only the tensor handle is destroyed here, as before. The device
        # buffer may still be in use by a kernel queued on a stream, so it is
        # freed only through the explicit, synchronising release().
        self._destroy_handle()

    def release(self):
        """Copy back any pending result, then free all device resources.

        Safe to call more than once.
        """
        if self.device_ptr:
            self.to_cpu()
        self._free_device()
        self._destroy_handle()

    def to_cpu(self):
        """Synchronise with the owning runner and return the host array.

        If the tensor was passed to an ``OpRunner`` since the last copy, the
        device buffer is copied back into the original ``numpy`` array first.
        """
        if self.op_runner is not None:
            self.op_runner.sync_stream()
        if self.need_copy_to_cpu:
            # Empty arrays (or released tensors) have no device buffer.
            if self.device_ptr:
                ret = acl.rt.memcpy(
                    self._host_array.ctypes.data,
                    self.data_bytes_size,
                    self.device_ptr,
                    self.data_bytes_size,
                    AclMemcopyKind.ACL_MEMCPY_DEVICE_TO_HOST,
                )
                self._require(
                    ret == 0, f"acl.rt.memcpy (device->host) failed: {ret}")
                if self._host_array is not self.np_array:
                    # Propagate the result into the caller's non-contiguous array.
                    np.copyto(self.np_array, self._host_array)
            self.need_copy_to_cpu = False
        return self.np_array
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
from .utils import *
|
|
2
|
+
|
|
3
|
+
class AclStream:
    """Context manager that creates an ACL stream on entry and destroys it on exit.

    Usage: ``with AclStream(device_id) as stream: ...`` — the value bound by
    ``as`` is the raw stream handle returned by ``acl.rt.create_stream``.
    """

    def __init__(self, device_id=0):
        self.stream = None
        self.device_id = device_id
        self.set_device(device_id)

    def set_device(self, device_id):
        """Select *device_id* and report a failure instead of ignoring it."""
        ret = acl.rt.set_device(device_id)
        print_ret("设置device失败", ret)
        self.device_id = device_id

    def __enter__(self):
        stream, ret = acl.rt.create_stream()
        print_ret("创建stream失败", ret)
        # Remember the handle for cleanup only if creation succeeded, so that
        # __exit__ never tries to sync or destroy an invalid handle.
        self.stream = stream if ret == 0 else None
        return stream

    def __exit__(self, *args, **kwargs):
        if self.stream is None or self.stream == 0:
            return
        try_sync_stream(self.stream)
        ret = acl.rt.destroy_stream(self.stream)
        # Drop the handle so a repeated __exit__ cannot destroy it twice.
        self.stream = None
        print_ret("销毁stream错误!", ret)
|
|
24
|
+
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
from .AclNDTensor import *
|
|
2
|
+
|
|
3
|
+
def build_op_api_lib_path(prefix='customize'):
    """Return the path of the custom-operator API library for vendor *prefix*.

    The toolkit root comes from the ``ASCEND_HOME_PATH`` environment variable;
    a ``KeyError`` is raised if it is not set.
    """
    toolkit_root = os.environ["ASCEND_HOME_PATH"]
    return f"{toolkit_root}/opp/vendors/{prefix}/op_api/lib/libcust_opapi.so"
|
|
7
|
+
|
|
8
|
+
class OpRunner:
    """Callable wrapper around an operator's two-phase ``aclnn`` C API.

    ``aclnn<Name>GetWorkspaceSize`` is called with the user arguments plus the
    ``workspaceSize`` / ``executor`` out-pointers, then ``aclnn<Name>`` is
    launched on a stream. ``numpy`` arrays are wrapped in ``AclNDTensor``
    automatically; results are fetched with ``AclNDTensor.to_cpu()``.
    """

    # Stream used when __call__ is not given one.
    default_stream = None

    def __init__(self, name, op_path_prefix='customize', op_path=None, device_id=0) -> None:
        """Load the operator library and look up both entry points.

        ``device_id`` is accepted but not used by this class.
        """
        op_path = op_path or build_op_api_lib_path(op_path_prefix)
        self.op_lib = ctypes.CDLL(op_path)
        self.get_workspace_size = getattr(
            self.op_lib, f"aclnn{name}GetWorkspaceSize")
        self.run = getattr(self.op_lib, f"aclnn{name}")
        self.run.argtypes = [
            ctypes.c_void_p,  # workspace
            ctypes.c_uint64,  # workspaceSize
            ctypes.c_void_p,  # executor
            ctypes.c_void_p,  # stream
        ]
        self.executor = np.array([0], dtype=np.uint64)
        # Per-call state, initialised here so sync_stream() is safe to call
        # before the first __call__.
        self.stream = None
        self.workspace_ptr = 0
        self.workspace_size = 0
        # Workspaces of launched-but-not-yet-synchronised calls.
        self._pending_workspaces = []

    @staticmethod
    def _check(msg, ret):
        """Report and raise when an ACL/aclnn call returns a non-zero status."""
        if ret != 0:
            print_ret(msg, ret)
            raise RuntimeError(f"{msg}: {ret}")

    @staticmethod
    def _to_c_arg(arg, infer):
        """Return ``(value, ctype)`` for one user argument.

        ``ctype`` is only meaningful when *infer* is true (no explicit
        ``argtypes`` were supplied). Unsupported types raise ``TypeError``
        when inferring instead of being silently dropped, which used to shift
        every following argument of the C call.
        """
        if isinstance(arg, AclNDTensor):
            return arg.ptr, ctypes.c_void_p
        if arg is None:
            # NULL pointer.
            return None, ctypes.c_void_p
        if isinstance(arg, np.generic):
            # numpy scalar -> plain Python scalar.
            arg = arg.item()
        if isinstance(arg, bool):  # must precede int: bool is an int subclass
            return arg, ctypes.c_bool
        if isinstance(arg, int):
            int32_info = np.iinfo(np.int32)
            if arg > int32_info.max or arg < int32_info.min:
                return arg, ctypes.c_int64
            return arg, ctypes.c_int32
        if isinstance(arg, float):
            float32_info = np.finfo(np.float32)
            if arg > float32_info.max or arg < float32_info.min:
                return arg, ctypes.c_double
            return arg, ctypes.c_float
        if isinstance(arg, bytes):
            return arg, ctypes.c_char_p
        if infer:
            raise TypeError(
                f"cannot infer a ctypes type for argument of type "
                f"{type(arg).__name__}; pass argtypes explicitly")
        # Explicit argtypes: let ctypes convert the value itself.
        return arg, None

    def __call__(self, *args, outCout=1, argtypes=None, stream=None) -> Union[AclNDTensor, List[AclNDTensor]]:
        """Run the operator.

        Args:
            *args: arguments of ``aclnn<Name>GetWorkspaceSize`` except the
                trailing ``workspaceSize`` and ``executor``.
            outCout: number of trailing arguments that are outputs.
            argtypes: optional full ctypes signature (including the two
                trailing out-pointers); inferred from the values when omitted.
            stream: stream to launch on; defaults to ``OpRunner.default_stream``.

        Returns:
            The last argument when ``outCout == 1``, otherwise the list of the
            last ``outCout`` arguments.

        Raises:
            TypeError: an argument type cannot be inferred.
            RuntimeError: an ACL/aclnn call reported a non-zero status.
        """
        stream = stream or OpRunner.default_stream
        if self._pending_workspaces and stream != self.stream:
            # Drain the previous stream before switching so its workspaces
            # are released against the stream they were used on.
            self.sync_stream()
        self.stream = stream

        temp_args = []
        for arg in args:
            if isinstance(arg, np.ndarray):
                arg = AclNDTensor(arg)
            if isinstance(arg, AclNDTensor):
                arg.op_runner = self
                arg.need_copy_to_cpu = True
            temp_args.append(arg)

        infer = argtypes is None
        real_args = []
        run_argtypes = [] if infer else list(argtypes)
        for arg in temp_args:
            value, ctype = self._to_c_arg(arg, infer)
            real_args.append(value)
            if infer:
                run_argtypes.append(ctype)

        # The two out-parameters are always pointers; type them explicitly
        # rather than guessing from the magnitude of the address.
        workspace = np.zeros([1], dtype=np.uint64)
        real_args.append(workspace.ctypes.data)
        real_args.append(self.executor.ctypes.data)
        if infer:
            run_argtypes.extend([ctypes.c_void_p, ctypes.c_void_p])

        self.get_workspace_size.argtypes = run_argtypes
        ret = self.get_workspace_size(*real_args)
        self._check("获取workspace大小失败", ret)

        workspace_ptr = 0
        workspace_size = int(workspace[0])
        if workspace_size > 0:
            print("需要workspace大小为:", workspace_size, flush=True)
            workspace_ptr, ret = acl.rt.malloc(workspace_size, 0)
            self._check("分配workspace失败", ret)
            self._pending_workspaces.append(workspace_ptr)
        self.workspace_ptr = workspace_ptr
        self.workspace_size = workspace_size
        ret = self.run(workspace_ptr, workspace_size,
                       int(self.executor[0]), stream)
        self._check("执行算子失败", ret)

        if outCout == 1:
            return temp_args[-1]
        return temp_args[len(temp_args) - outCout:]

    def sync_stream(self):
        """Wait for the current stream and free workspaces of finished calls."""
        if self.stream is None:
            return
        # Synchronising an idle stream returns immediately. Freeing only after
        # a successful sync also releases workspaces when the stream had
        # already finished, which the previous early return skipped.
        if not try_sync_stream(self.stream):
            return
        for ptr in self._pending_workspaces:
            acl.rt.free(ptr)
        self._pending_workspaces.clear()
        self.workspace_ptr = 0
        self.workspace_size = 0
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
from .utils import *
|
|
2
|
+
from .AclNDTensor import AclNDTensor
|
|
3
|
+
from .OpRunner import OpRunner
|
|
4
|
+
from .AclStream import AclStream
|
|
5
|
+
|
|
6
|
+
# Initialise ACL once at import time, select device 0 and create the stream
# that OpRunner uses by default. Every status code is reported via print_ret
# instead of being discarded.
print_ret("acl初始化失败", acl.init())
print_ret("设置device失败", acl.rt.set_device(0))
OpRunner.default_stream, ret = acl.rt.create_stream()
print_ret("创建_defualt_stream失败", ret)
|
|
11
|
+
|
|
12
|
+
@atexit.register
def finalize():
    """Destroy the default stream (if any) and finalize ACL at interpreter exit."""
    stream = OpRunner.default_stream
    if stream is not None and stream != 0:
        try_sync_stream(stream)
        # Report the status of this destroy call. The previous code passed the
        # module-level ``ret`` left over from stream creation, and there is no
        # module-level ``default_stream`` for a ``global`` statement to bind.
        destroy_ret = acl.rt.destroy_stream(stream)
        OpRunner.default_stream = None
        print_ret("销毁stream错误!", destroy_ret)
    acl.finalize()
|
|
@@ -0,0 +1,222 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import acl
|
|
3
|
+
import time
|
|
4
|
+
import math
|
|
5
|
+
import ctypes
|
|
6
|
+
import colorama
|
|
7
|
+
import numpy as np
|
|
8
|
+
import colorama
|
|
9
|
+
import atexit
|
|
10
|
+
from typing import List, Union
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def print_ret(msg, ret):
    """Print *msg* and the status code *ret* in red when *ret* is non-zero.

    A zero status prints nothing. The colour reset sequence is printed on its
    own line afterwards.
    """
    if ret != 0:
        print(colorama.Fore.RED, "[错误]", msg, ret, flush=True)
        print(colorama.Style.RESET_ALL, flush=True)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class AclStreamStatus:
    """Status values compared against the result of ``acl.rt.stream_query``."""

    ACL_STREAM_STATUS_COMPLETE = 0  # all tasks on the stream have completed
    ACL_STREAM_STATUS_NOT_READY = 1  # at least one task has not completed
    ACL_STREAM_STATUS_RESERVED = 0xFFFF  # reserved
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def stream_need_sync(stream: int):
    """Return True when *stream* still has unfinished tasks.

    A failed status query is reported through ``print_ret`` and treated as
    "no sync needed" (returns False).
    """
    status, ret = acl.rt.stream_query(stream)
    if ret == 0:
        return status == AclStreamStatus.ACL_STREAM_STATUS_NOT_READY
    print_ret("获取stream状态错误", ret)
    return False
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def try_sync_stream(stream: int):
    """Synchronise *stream*; return True on success, False (after reporting) on error."""
    ret = acl.rt.synchronize_stream(stream)
    if ret == 0:
        return True
    print_ret("同步stream错误", ret)
    return False
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def get_loss_by_type(dtype):
    """Return the comparison tolerance used for *dtype*.

    float16 -> 1/1000, float32 -> 1/10000, anything else -> 0 (exact match).
    """
    if dtype == np.float16:
        return 1 / 1000
    if dtype == np.float32:
        return 1 / 10000
    return 0
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _compare(v1: np.ndarray, v2: np.ndarray):
    """Element-wise check that ``|v1 - v2|`` is within the tolerance for ``v1.dtype``."""
    tolerance = get_loss_by_type(v1.dtype)
    abs_diff = np.abs(v1 - v2)
    return abs_diff <= tolerance
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def compare(v1: np.ndarray, v2: np.ndarray):
    """Return whether every element of *v1* matches *v2* within tolerance."""
    within_tolerance = _compare(v1, v2)
    return within_tolerance.all()
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def right_rate(v1: np.ndarray, v2: np.ndarray):
    """Return the fraction of elements of *v1* that match *v2* within tolerance."""
    matches = _compare(v1, v2).astype(np.int32)
    matched_count = matches.sum()
    return matched_count / v1.size
|
|
66
|
+
|
|
67
|
+
# 参考自:https://gitee.com/ascend/samples/blob/master/operator/AddCustomSample/KernelLaunch/AddKernelInvocationNeo/scripts/verify_result.py
|
|
68
|
+
def verify_result(real_result, golden):
    """Compare an operator result against the expected values.

    The tolerance comes from ``get_loss_by_type(real_result.dtype)``. The check
    fails only when both the absolute-error and the relative-error tests have
    mismatches AND both mismatch counts exceed ``real_result.size * loss``.
    Prints a coloured verdict and returns True (pass) or False (fail).
    """
    loss = get_loss_by_type(real_result.dtype)
    minimum = 10e-10  # keeps the relative-error denominator non-zero
    abs_err = np.abs(real_result - golden)  # deviation from the expected values
    magnitude = np.maximum(np.abs(real_result), np.abs(golden))
    atol_ok = np.less_equal(abs_err, loss)  # absolute-error test
    rtol_ok = np.less_equal(abs_err / np.add(magnitude, minimum), loss)  # relative-error test

    allowed_mismatches = real_result.size * loss
    failed = (
        not rtol_ok.all()
        and not atol_ok.all()
        and np.sum(np.logical_not(rtol_ok)) > allowed_mismatches
        and np.sum(np.logical_not(atol_ok)) > allowed_mismatches
    )
    if failed:
        # Error exceeds the allowed budget: report and signal failure.
        print(
            colorama.Fore.RED, real_result.dtype, "[ERROR] result error", flush=True
        )
        print(colorama.Style.RESET_ALL, flush=True)
        return False
    print(colorama.Fore.GREEN, real_result.dtype, "test pass", flush=True)
    print(colorama.Style.RESET_ALL, flush=True)
    return True
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
class AclDtype:
    """Integer codes passed as the ``dataType`` argument of ``aclCreateTensor``."""

    # Floating point
    ACL_FLOAT = 0
    ACL_FLOAT16 = 1
    ACL_DOUBLE = 11
    ACL_BF16 = 27

    # Signed integers
    ACL_INT8 = 2
    ACL_INT32 = 3
    ACL_INT16 = 6
    ACL_INT64 = 9
    ACL_INT4 = 29

    # Unsigned integers
    ACL_UINT8 = 4
    ACL_UINT16 = 7
    ACL_UINT32 = 8
    ACL_UINT64 = 10
    ACL_UINT1 = 30

    # Other
    ACL_BOOL = 12
    ACL_STRING = 13
    ACL_COMPLEX64 = 16
    ACL_COMPLEX128 = 17
    ACL_COMPLEX32 = 33
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
class AclRunMode:
    """Run-mode constants: device (0) or host (1)."""

    ACL_DEVICE = 0
    ACL_HOST = 1
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
class AclAllocPolicy:
    """Memory allocation policies.

    0: ACL_MEM_MALLOC_HUGE_FIRST - requests of 1M or less use normal pages even
       with this policy; larger requests prefer huge pages and fall back to
       normal pages when huge-page memory is insufficient.
    1: ACL_MEM_MALLOC_HUGE_ONLY - huge pages only; an error is returned when
       huge-page memory is insufficient.
    2: ACL_MEM_MALLOC_NORMAL_ONLY - normal pages only.
    3: ACL_MEM_MALLOC_HUGE_FIRST_P2P - only for device-to-device copy
       scenarios; prefers huge pages, falls back to normal pages. Reserved.
    4: ACL_MEM_MALLOC_HUGE_ONLY_P2P - only for device-to-device copy
       scenarios; huge pages only, error when insufficient. Reserved.
    5: ACL_MEM_MALLOC_NORMAL_ONLY_P2P - only for device-to-device copy
       scenarios; normal pages only. Reserved.
    """

    ACL_MEM_MALLOC_HUGE_FIRST = 0
    ACL_MEM_MALLOC_HUGE_ONLY = 1
    ACL_MEM_MALLOC_NORMAL_ONLY = 2
    ACL_MEM_MALLOC_HUGE_FIRST_P2P = 3
    ACL_MEM_MALLOC_HUGE_ONLY_P2P = 4
    ACL_MEM_MALLOC_NORMAL_ONLY_P2P = 5
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
class AclMemcopyKind:
    """Direction of a memory copy.

    0: ACL_MEMCPY_HOST_TO_HOST - copy within host memory.
    1: ACL_MEMCPY_HOST_TO_DEVICE - copy from host to device.
    2: ACL_MEMCPY_DEVICE_TO_HOST - copy from device to host.
    3: ACL_MEMCPY_DEVICE_TO_DEVICE - copy within device memory.
    """

    ACL_MEMCPY_HOST_TO_HOST = 0
    ACL_MEMCPY_HOST_TO_DEVICE = 1
    ACL_MEMCPY_DEVICE_TO_HOST = 2
    ACL_MEMCPY_DEVICE_TO_DEVICE = 3
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def numpy_dtype_2_acl_dtype(numpy_dtype):
    """Map a numpy dtype (or scalar type) to its ``AclDtype`` code.

    Returns ``None`` when there is no mapping, as before.

    Changes from the previous if-chain:
    * ``np.string_`` is spelled ``np.bytes_``; the old alias was removed in
      NumPy 2.0 and raised ``AttributeError`` for any dtype reaching it.
    * The ``np.complex_`` branch is dropped: the alias (also removed in NumPy
      2.0) meant complex128, so the branch was unreachable and its
      ``ACL_COMPLEX32`` result was wrong. numpy has no complex32 dtype.
    """
    if numpy_dtype is None:
        # np.dtype(None) would silently mean float64.
        return None
    try:
        dtype = np.dtype(numpy_dtype)
    except TypeError:
        return None
    mapping = {
        np.dtype(np.float32): AclDtype.ACL_FLOAT,
        np.dtype(np.float16): AclDtype.ACL_FLOAT16,
        np.dtype(np.int8): AclDtype.ACL_INT8,
        np.dtype(np.int32): AclDtype.ACL_INT32,
        np.dtype(np.uint8): AclDtype.ACL_UINT8,
        np.dtype(np.int16): AclDtype.ACL_INT16,
        np.dtype(np.uint16): AclDtype.ACL_UINT16,
        np.dtype(np.uint32): AclDtype.ACL_UINT32,
        np.dtype(np.int64): AclDtype.ACL_INT64,
        np.dtype(np.uint64): AclDtype.ACL_UINT64,
        np.dtype(np.double): AclDtype.ACL_DOUBLE,
        np.dtype(np.bool_): AclDtype.ACL_BOOL,
        np.dtype(np.bytes_): AclDtype.ACL_STRING,
        np.dtype(np.complex64): AclDtype.ACL_COMPLEX64,
        np.dtype(np.complex128): AclDtype.ACL_COMPLEX128,
    }
    # TODO: how to support bf16
    return mapping.get(dtype)
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
def numpy_dtype_2_torch_dtype(numpy_dtype):
    """Map a numpy dtype (or scalar type) to a ``torch`` dtype.

    Unsigned 16/32/64-bit integers map to the signed torch dtype of the same
    width, as in the original table. Returns ``None`` when there is no mapping.

    The former ``np.complex_`` branch is dropped: the alias was removed in
    NumPy 2.0 (raising ``AttributeError`` for unmatched dtypes) and, being
    complex128, was unreachable behind the ``np.complex128`` check.
    """
    import torch

    if numpy_dtype is None:
        # np.dtype(None) would silently mean float64.
        return None
    try:
        dtype = np.dtype(numpy_dtype)
    except TypeError:
        return None
    mapping = {
        np.dtype(np.float32): torch.float32,
        np.dtype(np.float16): torch.float16,
        np.dtype(np.int8): torch.int8,
        np.dtype(np.int32): torch.int32,
        np.dtype(np.uint8): torch.uint8,
        np.dtype(np.int16): torch.int16,
        np.dtype(np.uint16): torch.int16,
        np.dtype(np.uint32): torch.int32,
        np.dtype(np.int64): torch.int64,
        np.dtype(np.uint64): torch.int64,
        np.dtype(np.double): torch.double,
        np.dtype(np.bool_): torch.bool,
        np.dtype(np.complex64): torch.complex64,
        np.dtype(np.complex128): torch.complex128,
    }
    return mapping.get(dtype)
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
if __name__ == "__main__":
    # Manual smoke test; run as a module (``python -m l0n0lacl.utils``) so the
    # relative import resolves. The previous version referenced ``AclEnv``,
    # which is not defined anywhere, and used ``AclNDTensor`` without
    # importing it. Importing the package already initialises ACL.
    from .AclNDTensor import AclNDTensor

    a = np.zeros((3, 3), dtype=np.float16)
    nd_tensor = AclNDTensor(a)
|
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: l0n0lacl
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: 用于调用ascendc编写的算子
|
|
5
|
+
Author: l0n0l
|
|
6
|
+
Author-email: 1038352856@qq.com
|
|
7
|
+
Keywords: acl,ascendc,算子,算子开发
|
|
8
|
+
Classifier: Development Status :: 3 - Alpha
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: Topic :: Software Development :: Build Tools
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.7
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
18
|
+
Requires-Python: >=3.7, <4
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
Requires-Dist: colorama
|
|
21
|
+
Requires-Dist: numpy
|
|
22
|
+
|
|
23
|
+
# 1 功能描述
|
|
24
|
+
由于在ascendc算子开发过程中运行算子比较复杂,为了简化算子的运行,将运行算子变成可以用python直接调用的函数。所以编写了此代码。
|
|
25
|
+
|
|
26
|
+
# 2 安装
|
|
27
|
+
```
|
|
28
|
+
pip install l0n0lacl
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
# 3 运行算子实例
|
|
32
|
+
## 3.1 先切换到cann环境,比如我的环境是:
|
|
33
|
+
```
|
|
34
|
+
source /home/HwHiAiUser/Ascend/ascend-toolkit/set_env.sh
|
|
35
|
+
```
|
|
36
|
+
## 3.2 先安装我们编写的算子
|
|
37
|
+
```
|
|
38
|
+
bash custom_opp_xxx_aarch64.run
|
|
39
|
+
```
|
|
40
|
+
## 3.3 创建算子运行器
|
|
41
|
+
```python
|
|
42
|
+
from l0n0lacl import *
|
|
43
|
+
ascendc_gelu = OpRunner("Gelu", op_path_prefix='customize')
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
## 3.4 调用算子
|
|
47
|
+
### 3.4.1 先看调用传参顺序
|
|
48
|
+
在算子工程编译后,会有代码生成,在算子工程目录:
|
|
49
|
+
`${算子目录}/build_out/autogen/aclnn_xxx.h`中可以找到`aclnnXXXGetWorkspaceSize`函数。以Gelu为例:
|
|
50
|
+
```c++
|
|
51
|
+
__attribute__((visibility("default")))
|
|
52
|
+
aclnnStatus aclnnGeluGetWorkspaceSize(
|
|
53
|
+
const aclTensor *x,
|
|
54
|
+
const aclTensor *out,
|
|
55
|
+
uint64_t *workspaceSize,
|
|
56
|
+
aclOpExecutor **executor);
|
|
57
|
+
```
|
|
58
|
+
可以看到参数为 `x`, `out`, `workspaceSize`, `executor`。其中 `workspaceSize`, `executor`不需要管。
|
|
59
|
+
* `aclTensor*`对应`numpy.ndarray`
|
|
60
|
+
* 其他参考: <a href = "https://docs.python.org/zh-cn/3/library/ctypes.html#fundamental-data-types">ctypes类型</a>
|
|
61
|
+
### 3.4.2 调用算子
|
|
62
|
+
```python
|
|
63
|
+
import torch
|
|
64
|
+
from l0n0lacl import *
|
|
65
|
+
ascendc_gelu = OpRunner("Gelu", op_path_prefix='customize')
|
|
66
|
+
target_dtype = torch.float
|
|
67
|
+
x = torch.empty(shape, dtype=target_dtype).uniform_(-1, 1)
|
|
68
|
+
y = torch.empty(shape, dtype=target_dtype).zero_()
|
|
69
|
+
out = ascendc_gelu(x.numpy(), y.numpy()).to_cpu()
|
|
70
|
+
print(out)
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
# 4. api参考
|
|
74
|
+
## 4.1 AclNDTensor
|
|
75
|
+
```python
|
|
76
|
+
class AclNDTensor:
|
|
77
|
+
def __init__(self, np_array: np.ndarray):
|
|
78
|
+
pass
|
|
79
|
+
def to_cpu(self):
|
|
80
|
+
pass
|
|
81
|
+
```
|
|
82
|
+
numpy ndarray与ascend nd tensor间的桥梁
|
|
83
|
+
### 4.1.1 `__init__`
|
|
84
|
+
* `np_array`: numpy的tensor
|
|
85
|
+
### 4.1.2 `to_cpu`
|
|
86
|
+
将运算结果从npu拷贝到cpu
|
|
87
|
+
## 4.2 OpRunner
|
|
88
|
+
```python
|
|
89
|
+
class OpRunner:
|
|
90
|
+
def __init__(self, name, op_path_prefix='customize', op_path=None, device_id=0) -> None:
|
|
91
|
+
pass
|
|
92
|
+
def __call__(self, *args, outCout=1, argtypes=None, stream=None) -> Union[AclNDTensor, List[AclNDTensor]]:
|
|
93
|
+
pass
|
|
94
|
+
def sync_stream(self)->None:
|
|
95
|
+
pass
|
|
96
|
+
```
|
|
97
|
+
### 4.2.1 `__init__`
|
|
98
|
+
* `name`:算子名称,
|
|
99
|
+
* `op_path_prefix`: 算子工程中**CMakePresets.json**文件中**vendor_name**的值。默认是`customize`,可以不传
|
|
100
|
+
```json
|
|
101
|
+
"vendor_name": {
|
|
102
|
+
"type": "STRING",
|
|
103
|
+
"value": "customize"
|
|
104
|
+
},
|
|
105
|
+
```
|
|
106
|
+
* `op_path`: 算子`libcust_opapi.so`库的绝对路径。默认为`None`,可以不传。
|
|
107
|
+
* `device_id`: 设备ID。默认`0`
|
|
108
|
+
|
|
109
|
+
### 4.2.2 `__call__`
|
|
110
|
+
* `args`: 表示传给`aclnnXXXGetWorkspaceSize`除了`workspaceSize`, `executor`的参数
|
|
111
|
+
* `outCout` : 表示算子的输出个数。如果输出个数为`1`,返回一个`AclNDTensor`。如果输出个数大于1,返回`List[AclNDTensor]`
|
|
112
|
+
* `argtypes`: 表示`aclnnXXXGetWorkspaceSize`的参数`ctypes`参数类型,对于特别复杂的算子,如果发现调用异常,可以手动指定类型。
|
|
113
|
+
比如(**仅用于举例,其实可以不传,自动推导就可运行。但是当发现运行异常的情况下,可以自己指定**),对于:
|
|
114
|
+
```c++
|
|
115
|
+
__attribute__((visibility("default")))
|
|
116
|
+
aclnnStatus aclnnCumsumGetWorkspaceSize(
|
|
117
|
+
const aclTensor *x,
|
|
118
|
+
const aclTensor *axis,
|
|
119
|
+
bool exclusiveOptional,
|
|
120
|
+
bool reverseOptional,
|
|
121
|
+
const aclTensor *out,
|
|
122
|
+
uint64_t *workspaceSize,
|
|
123
|
+
aclOpExecutor **executor);
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
```python
|
|
127
|
+
import ctypes
|
|
128
|
+
from l0n0lacl import *
|
|
129
|
+
ascendc_cumsum = OpRunner("Cumsum")
|
|
130
|
+
target_dtype = np.float32
|
|
131
|
+
data_range = (-10, 10)
|
|
132
|
+
shape = [100, 3, 2304]
|
|
133
|
+
axis_py = 1
|
|
134
|
+
exclusive = True
|
|
135
|
+
reverse = False
|
|
136
|
+
x = np.random.uniform(*data_range, shape).astype(target_dtype)
|
|
137
|
+
axis = np.array([axis_py]).astype(np.int32)
|
|
138
|
+
golden: np.ndarray = tf.cumsum(x, axis_py, exclusive, reverse, argtypes=[
|
|
139
|
+
ctypes.c_void_p, # x
|
|
140
|
+
ctypes.c_void_p, # axis
|
|
141
|
+
ctypes.c_bool, # exclusiveOptional
|
|
142
|
+
ctypes.c_bool, # reverseOptional
|
|
143
|
+
ctypes.c_void_p, # out
|
|
144
|
+
ctypes.c_void_p, # workspaceSize
|
|
145
|
+
ctypes.c_void_p, # executor
|
|
146
|
+
]).numpy()
|
|
147
|
+
y = np.ones_like(golden, golden.dtype) * 123
|
|
148
|
+
ascendc_cumsum(x, axis, exclusive, reverse, y).to_cpu()
|
|
149
|
+
print(y)
|
|
150
|
+
```
|
|
151
|
+
* `stream` 如果是多stream的情况下,可以自己指定stream:
|
|
152
|
+
例如:
|
|
153
|
+
```python
|
|
154
|
+
import ctypes
|
|
155
|
+
import tensorflow as tf
|
|
156
|
+
from l0n0lacl import *
|
|
157
|
+
ascendc_cumsum = OpRunner("Cumsum")
|
|
158
|
+
target_dtype = np.float32
|
|
159
|
+
data_range = (-10, 10)
|
|
160
|
+
shape = [100, 3, 2304]
|
|
161
|
+
axis_py = 1
|
|
162
|
+
exclusive = True
|
|
163
|
+
reverse = False
|
|
164
|
+
x = np.random.uniform(*data_range, shape).astype(target_dtype)
|
|
165
|
+
axis = np.array([axis_py]).astype(np.int32)
|
|
166
|
+
golden: np.ndarray = tf.cumsum(x, axis_py, exclusive, reverse).numpy()
|
|
167
|
+
y = np.ones_like(golden, golden.dtype) * 123
|
|
168
|
+
ascendc_cumsum(x, axis, exclusive, reverse, y, argtypes=[
|
|
169
|
+
ctypes.c_void_p, # x
|
|
170
|
+
ctypes.c_void_p, # axis
|
|
171
|
+
ctypes.c_bool, # exclusiveOptional
|
|
172
|
+
ctypes.c_bool, # reverseOptional
|
|
173
|
+
ctypes.c_void_p, # out
|
|
174
|
+
ctypes.c_void_p, # workspaceSize
|
|
175
|
+
ctypes.c_void_p, # executor
|
|
176
|
+
]).to_cpu()
|
|
177
|
+
verify_result(y, golden)
|
|
178
|
+
print(y)
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
### 4.2.3 `sync_stream`
|
|
182
|
+
用于同步stream
|
|
183
|
+
|
|
184
|
+
## 4.3 verify_result
|
|
185
|
+
参考自:https://gitee.com/ascend/samples/blob/master/operator/AddCustomSample/KernelLaunch/AddKernelInvocationNeo/scripts/verify_result.py
|
|
186
|
+
```python
|
|
187
|
+
def verify_result(real_result:numpy.ndarray, golden:numpy.ndarray):
|
|
188
|
+
pass
|
|
189
|
+
```
|
|
190
|
+
判断精度是否符合
|
|
191
|
+
float16: 千分之一
|
|
192
|
+
float32: 万分之一
|
|
193
|
+
int16,int32,int8: 0
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
setup.py
|
|
3
|
+
./l0n0lacl/AclNDTensor.py
|
|
4
|
+
./l0n0lacl/AclStream.py
|
|
5
|
+
./l0n0lacl/OpRunner.py
|
|
6
|
+
./l0n0lacl/__init__.py
|
|
7
|
+
./l0n0lacl/utils.py
|
|
8
|
+
./l0n0lacl.egg-info/PKG-INFO
|
|
9
|
+
./l0n0lacl.egg-info/SOURCES.txt
|
|
10
|
+
./l0n0lacl.egg-info/dependency_links.txt
|
|
11
|
+
./l0n0lacl.egg-info/requires.txt
|
|
12
|
+
./l0n0lacl.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
l0n0lacl
|
l0n0lacl-0.0.1/setup.cfg
ADDED
l0n0lacl-0.0.1/setup.py
ADDED
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
"""A setuptools based setup module.
|
|
2
|
+
|
|
3
|
+
See:
|
|
4
|
+
https://packaging.python.org/guides/distributing-packages-using-setuptools/
|
|
5
|
+
https://github.com/pypa/sampleproject
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
# Always prefer setuptools over distutils
|
|
9
|
+
from setuptools import setup, find_packages
|
|
10
|
+
import pathlib
|
|
11
|
+
|
|
12
|
+
here = pathlib.Path(__file__).parent.resolve()
|
|
13
|
+
|
|
14
|
+
# Get the long description from the README file
|
|
15
|
+
long_description = (here / "README.md").read_text(encoding="utf-8")
|
|
16
|
+
# Arguments marked as "Required" below must be included for upload to PyPI.
|
|
17
|
+
# Fields marked as "Optional" may be commented out.
|
|
18
|
+
|
|
19
|
+
setup(
|
|
20
|
+
# This is the name of your project. The first time you publish this
|
|
21
|
+
# package, this name will be registered for you. It will determine how
|
|
22
|
+
# users can install this project, e.g.:
|
|
23
|
+
#
|
|
24
|
+
# $ pip install sampleproject
|
|
25
|
+
#
|
|
26
|
+
# And where it will live on PyPI: https://pypi.org/project/sampleproject/
|
|
27
|
+
#
|
|
28
|
+
# There are some restrictions on what makes a valid project name
|
|
29
|
+
# specification here:
|
|
30
|
+
# https://packaging.python.org/specifications/core-metadata/#name
|
|
31
|
+
name="l0n0lacl", # Required
|
|
32
|
+
# Versions should comply with PEP 440:
|
|
33
|
+
# https://www.python.org/dev/peps/pep-0440/
|
|
34
|
+
#
|
|
35
|
+
# For a discussion on single-sourcing the version across setup.py and the
|
|
36
|
+
# project code, see
|
|
37
|
+
# https://packaging.python.org/guides/single-sourcing-package-version/
|
|
38
|
+
version="0.0.1", # Required
|
|
39
|
+
# This is a one-line description or tagline of what your project does. This
|
|
40
|
+
# corresponds to the "Summary" metadata field:
|
|
41
|
+
# https://packaging.python.org/specifications/core-metadata/#summary
|
|
42
|
+
description="用于调用ascendc编写的算子", # Optional
|
|
43
|
+
# This is an optional longer description of your project that represents
|
|
44
|
+
# the body of text which users will see when they visit PyPI.
|
|
45
|
+
#
|
|
46
|
+
# Often, this is the same as your README, so you can just read it in from
|
|
47
|
+
# that file directly (as we have already done above)
|
|
48
|
+
#
|
|
49
|
+
# This field corresponds to the "Description" metadata field:
|
|
50
|
+
# https://packaging.python.org/specifications/core-metadata/#description-optional
|
|
51
|
+
long_description=long_description, # Optional
|
|
52
|
+
# Denotes that our long_description is in Markdown; valid values are
|
|
53
|
+
# text/plain, text/x-rst, and text/markdown
|
|
54
|
+
#
|
|
55
|
+
# Optional if long_description is written in reStructuredText (rst) but
|
|
56
|
+
# required for plain-text or Markdown; if unspecified, "applications should
|
|
57
|
+
# attempt to render [the long_description] as text/x-rst; charset=UTF-8 and
|
|
58
|
+
# fall back to text/plain if it is not valid rst" (see link below)
|
|
59
|
+
#
|
|
60
|
+
# This field corresponds to the "Description-Content-Type" metadata field:
|
|
61
|
+
# https://packaging.python.org/specifications/core-metadata/#description-content-type-optional
|
|
62
|
+
long_description_content_type="text/markdown", # Optional (see note above)
|
|
63
|
+
# This should be a valid link to your project's main homepage.
|
|
64
|
+
#
|
|
65
|
+
# This field corresponds to the "Home-Page" metadata field:
|
|
66
|
+
# https://packaging.python.org/specifications/core-metadata/#home-page-optional
|
|
67
|
+
# url="https://github.com/pypa/sampleproject", # Optional
|
|
68
|
+
# This should be your name or the name of the organization which owns the
|
|
69
|
+
# project.
|
|
70
|
+
author="l0n0l", # Optional
|
|
71
|
+
# This should be a valid email address corresponding to the author listed
|
|
72
|
+
# above.
|
|
73
|
+
author_email="1038352856@qq.com", # Optional
|
|
74
|
+
# Classifiers help users find your project by categorizing it.
|
|
75
|
+
#
|
|
76
|
+
# For a list of valid classifiers, see https://pypi.org/classifiers/
|
|
77
|
+
classifiers=[ # Optional
|
|
78
|
+
# How mature is this project? Common values are
|
|
79
|
+
# 3 - Alpha
|
|
80
|
+
# 4 - Beta
|
|
81
|
+
# 5 - Production/Stable
|
|
82
|
+
"Development Status :: 3 - Alpha",
|
|
83
|
+
# Indicate who your project is intended for
|
|
84
|
+
"Intended Audience :: Developers",
|
|
85
|
+
"Topic :: Software Development :: Build Tools",
|
|
86
|
+
# Pick your license as you wish
|
|
87
|
+
"License :: OSI Approved :: MIT License",
|
|
88
|
+
# Specify the Python versions you support here. In particular, ensure
|
|
89
|
+
# that you indicate you support Python 3. These classifiers are *not*
|
|
90
|
+
# checked by 'pip install'. See instead 'python_requires' below.
|
|
91
|
+
"Programming Language :: Python :: 3",
|
|
92
|
+
"Programming Language :: Python :: 3.7",
|
|
93
|
+
"Programming Language :: Python :: 3.8",
|
|
94
|
+
"Programming Language :: Python :: 3.9",
|
|
95
|
+
"Programming Language :: Python :: 3.10",
|
|
96
|
+
"Programming Language :: Python :: 3 :: Only",
|
|
97
|
+
],
|
|
98
|
+
# This field adds keywords for your project which will appear on the
|
|
99
|
+
# project page. What does your project relate to?
|
|
100
|
+
#
|
|
101
|
+
# Note that this is a list of additional keywords, separated
|
|
102
|
+
# by commas, to be used to assist searching for the distribution in a
|
|
103
|
+
# larger catalog.
|
|
104
|
+
keywords="acl, ascendc, 算子, 算子开发", # Optional
|
|
105
|
+
# When your source code is in a subdirectory under the project root, e.g.
|
|
106
|
+
# `src/`, it is necessary to specify the `package_dir` argument.
|
|
107
|
+
package_dir={"": "./"}, # Optional
|
|
108
|
+
# You can just specify package directories manually here if your project is
|
|
109
|
+
# simple. Or you can use find_packages().
|
|
110
|
+
#
|
|
111
|
+
# Alternatively, if you just want to distribute a single Python file, use
|
|
112
|
+
# the `py_modules` argument instead as follows, which will expect a file
|
|
113
|
+
# called `my_module.py` to exist:
|
|
114
|
+
#
|
|
115
|
+
# py_modules=["my_module"],
|
|
116
|
+
#
|
|
117
|
+
packages=find_packages(where="./"), # Required
|
|
118
|
+
# Specify which Python versions you support. In contrast to the
|
|
119
|
+
# 'Programming Language' classifiers above, 'pip install' will check this
|
|
120
|
+
# and refuse to install the project if the version does not match. See
|
|
121
|
+
# https://packaging.python.org/guides/distributing-packages-using-setuptools/#python-requires
|
|
122
|
+
python_requires=">=3.7, <4",
|
|
123
|
+
# This field lists other packages that your project depends on to run.
|
|
124
|
+
# Any package you put here will be installed by pip when your project is
|
|
125
|
+
# installed, so they must be valid existing projects.
|
|
126
|
+
#
|
|
127
|
+
# For an analysis of "install_requires" vs pip's requirements files see:
|
|
128
|
+
# https://packaging.python.org/discussions/install-requires-vs-requirements/
|
|
129
|
+
install_requires=["colorama", "numpy"], # Optional
|
|
130
|
+
# List additional groups of dependencies here (e.g. development
|
|
131
|
+
# dependencies). Users will be able to install these using the "extras"
|
|
132
|
+
# syntax, for example:
|
|
133
|
+
#
|
|
134
|
+
# $ pip install sampleproject[dev]
|
|
135
|
+
#
|
|
136
|
+
# Similar to `install_requires` above, these must be valid existing
|
|
137
|
+
# projects.
|
|
138
|
+
extras_require={ # Optional
|
|
139
|
+
},
|
|
140
|
+
# If there are data files included in your packages that need to be
|
|
141
|
+
# installed, specify them here.
|
|
142
|
+
package_data={ # Optional
|
|
143
|
+
},
|
|
144
|
+
# Entry points. The following would provide a command called `sample` which
|
|
145
|
+
# executes the function `main` from this package when invoked:
|
|
146
|
+
entry_points={ # Optional
|
|
147
|
+
# "console_scripts": [
|
|
148
|
+
# "sample=sample:main",
|
|
149
|
+
# ],
|
|
150
|
+
},
|
|
151
|
+
# List additional URLs that are relevant to your project as a dict.
|
|
152
|
+
#
|
|
153
|
+
# This field corresponds to the "Project-URL" metadata fields:
|
|
154
|
+
# https://packaging.python.org/specifications/core-metadata/#project-url-multiple-use
|
|
155
|
+
#
|
|
156
|
+
# Examples listed include a pattern for specifying where the package tracks
|
|
157
|
+
# issues, where the source is hosted, where to say thanks to the package
|
|
158
|
+
# maintainers, and where to support the project financially. The key is
|
|
159
|
+
# what's used to render the link text on PyPI.
|
|
160
|
+
project_urls={ # Optional
|
|
161
|
+
# "Bug Reports": "https://github.com/pypa/sampleproject/issues",
|
|
162
|
+
# "Funding": "https://donate.pypi.org",
|
|
163
|
+
# "Say Thanks!": "http://saythanks.io/to/example",
|
|
164
|
+
# "Source": "https://github.com/pypa/sampleproject/",
|
|
165
|
+
},
|
|
166
|
+
)
|