da4ml-0.3.0.post1-py3-none-any.whl → da4ml-0.3.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- da4ml/_version.py +16 -3
- da4ml/cmvm/types.py +12 -2
- da4ml/codegen/cpp/cpp_codegen.py +4 -1
- da4ml/codegen/verilog/comb.py +19 -11
- da4ml/codegen/verilog/source/binder_util.hh +8 -6
- da4ml/codegen/verilog/source/build_prj.tcl +6 -8
- da4ml/codegen/verilog/source/ioutil.hh +2 -1
- da4ml/codegen/verilog/source/multiplier.v +37 -0
- da4ml/codegen/verilog/verilog_model.py +4 -5
- da4ml/converter/__init__.py +3 -0
- da4ml/converter/hgq2/__init__.py +3 -0
- da4ml/converter/hgq2/parser.py +60 -10
- da4ml/converter/hgq2/replica.py +125 -35
- da4ml/trace/fixed_variable.py +133 -20
- da4ml/trace/fixed_variable_array.py +55 -7
- da4ml/trace/ops/__init__.py +4 -4
- da4ml/trace/ops/einsum_utils.py +5 -2
- da4ml/trace/ops/reduce_utils.py +4 -2
- da4ml/trace/pipeline.py +6 -4
- da4ml/trace/tracer.py +27 -13
- da4ml-0.3.2.dist-info/METADATA +66 -0
- {da4ml-0.3.0.post1.dist-info → da4ml-0.3.2.dist-info}/RECORD +25 -23
- da4ml-0.3.0.post1.dist-info/METADATA +0 -107
- {da4ml-0.3.0.post1.dist-info → da4ml-0.3.2.dist-info}/WHEEL +0 -0
- {da4ml-0.3.0.post1.dist-info → da4ml-0.3.2.dist-info}/licenses/LICENSE +0 -0
- {da4ml-0.3.0.post1.dist-info → da4ml-0.3.2.dist-info}/top_level.txt +0 -0
da4ml/trace/fixed_variable_array.py
CHANGED

@@ -86,6 +86,21 @@ class FixedVariableArray:
             assert bind.arguments.get('out', None) is None, 'Output argument is not supported'
             return einsum(eq, *operands)
 
+        if func in (np.dot, np.matmul):
+            assert len(args) in (2, 3), 'Dot function requires exactly two or three arguments'
+
+            assert len(args) == 2
+            a, b = args
+            if not isinstance(a, FixedVariableArray):
+                a = np.array(a)
+            if not isinstance(b, FixedVariableArray):
+                b = np.array(b)
+            if a.shape[-1] == b.shape[0]:
+                return a @ b
+
+            assert a.size == 1 or b.size == 1, f'Error in dot product: {a.shape} @ {b.shape}'
+            return a * b
+
         args, kwargs = to_raw_arr(args), to_raw_arr(kwargs)
         return FixedVariableArray(
             func(*args, **kwargs),
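With this branch in `__array_function__`, plain `np.dot`/`np.matmul` calls on traced arrays now dispatch to the fixed-point `@` operator instead of failing. A minimal usage sketch, assuming the `from_kif` API shown in the 0.3.0.post1 README near the end of this diff (shapes and bitwidths are illustrative):

```python
import numpy as np

from da4ml.trace import FixedVariableArray, HWConfig

# k/i/f per element: keep_negative, integer bits (excl. sign), fractional bits
k, i, f = np.ones(4, np.int8), np.full(4, 3, np.int8), np.full(4, 2, np.int8)
inp = FixedVariableArray.from_kif(k, i, f, HWConfig(1, -1, -1))

kernel = np.arange(8).reshape(4, 2) - 4  # constant weights, illustrative

out = np.matmul(inp, kernel)  # now routed to FixedVariableArray.__matmul__
assert out.shape == (2,)
```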
@@ -155,7 +170,23 @@ class FixedVariableArray:
         return cls.from_lhs(low, high, step, hwconf, latency, solver_options)
 
     def __matmul__(self, other):
-
+        if isinstance(other, FixedVariableArray):
+            other = other._vars
+        if not isinstance(other, np.ndarray):
+            other = np.array(other)
+        if any(isinstance(x, FixedVariable) for x in other.ravel()):
+            mat0, mat1 = self._vars, other
+            shape = mat0.shape[:-1] + mat1.shape[1:]
+            mat0, mat1 = mat0.reshape((-1, mat0.shape[-1])), mat1.reshape((mat1.shape[0], -1))
+            _shape = (mat0.shape[0], mat1.shape[1])
+            _vars = np.empty(_shape, dtype=object)
+            for i in range(mat0.shape[0]):
+                for j in range(mat1.shape[1]):
+                    vec0 = mat0[i]
+                    vec1 = mat1[:, j]
+                    _vars[i, j] = reduce(lambda x, y: x + y, vec0 * vec1)
+            return FixedVariableArray(_vars.reshape(shape), self.solver_options)
+
         kwargs = (self.solver_options or {}).copy()
         shape0, shape1 = self.shape, other.shape
         assert shape0[-1] == shape1[0], f'Matrix shapes do not match: {shape0} @ {shape1}'
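The rewritten `__matmul__` adds a slow path for a right operand that itself holds traced `FixedVariable`s: each output element is built as an explicit multiply-accumulate chain (the `vmul` opcode introduced in `tracer.py` further down). A hedged sketch of what this enables, with illustrative shapes and the `from_kif` API from the README:

```python
import numpy as np

from da4ml.trace import FixedVariableArray, HWConfig

kif = [np.ones((2, 2), np.int8), np.full((2, 2), 3, np.int8), np.zeros((2, 2), np.int8)]
a = FixedVariableArray.from_kif(*kif, HWConfig(1, -1, -1))
b = FixedVariableArray.from_kif(*kif, HWConfig(1, -1, -1))

# Both operands traced: out[i, j] = sum_k a[i, k] * b[k, j], with every
# product and adder recorded as an op in the traced graph.
c = a @ b
```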
@@ -180,9 +211,9 @@ class FixedVariableArray:
 
     def __rmatmul__(self, other):
         mat1 = np.moveaxis(other, -1, 0)
-        mat0 = np.moveaxis(self
+        mat0 = np.moveaxis(self, 0, -1)  # type: ignore
         ndim0, ndim1 = mat0.ndim, mat1.ndim
-        r =
+        r = mat0 @ mat1
 
         _axes = tuple(range(0, ndim0 + ndim1 - 2))
         axes = _axes[ndim0 - 1 :] + _axes[: ndim0 - 1]
@@ -213,6 +244,8 @@ class FixedVariableArray:
         return FixedVariableArray(self._vars - other, self.solver_options)
 
     def __mul__(self, other):
+        if isinstance(other, FixedVariableArray):
+            return FixedVariableArray(self._vars * other._vars, self.solver_options)
         return FixedVariableArray(self._vars * other, self.solver_options)
 
     def __truediv__(self, other):
@@ -230,6 +263,11 @@ class FixedVariableArray:
         max_lat = max(v.latency for v in self._vars.ravel())
         return f'FixedVariableArray(shape={shape}, hwconf={hwconf_str}, latency={max_lat})'
 
+    def __pow__(self, power: int | float):
+        _power = int(power)
+        assert _power == power, 'Power must be an integer'
+        return FixedVariableArray(self._vars**_power, self.solver_options)
+
     def relu(self, i: NDArray[np.integer] | None = None, f: NDArray[np.integer] | None = None, round_mode: str = 'TRN'):
         shape = self._vars.shape
         i = np.broadcast_to(i, shape) if i is not None else np.full(shape, None)
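Together with the `__mul__` branch above, `__pow__` makes small polynomial nonlinearities expressible directly on traced arrays, with non-integer powers rejected at trace time. A sketch under the same illustrative setup as earlier:

```python
import numpy as np

from da4ml.trace import FixedVariableArray, HWConfig

k, i, f = np.ones(4, np.int8), np.full(4, 3, np.int8), np.full(4, 3, np.int8)
x = FixedVariableArray.from_kif(k, i, f, HWConfig(1, -1, -1))

y = x * x  # element-wise product of two traced arrays (new __mul__ branch)
z = x**2   # same via __pow__; 2.0 also passes, 0.5 trips the assertion
```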
@@ -241,9 +279,9 @@ class FixedVariableArray:
 
     def quantize(
         self,
-        k: NDArray[np.integer] | None = None,
-        i: NDArray[np.integer] | None = None,
-        f: NDArray[np.integer] | None = None,
+        k: NDArray[np.integer] | np.integer | int | None = None,
+        i: NDArray[np.integer] | np.integer | int | None = None,
+        f: NDArray[np.integer] | np.integer | int | None = None,
         overflow_mode: str = 'WRAP',
         round_mode: str = 'TRN',
     ):
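The relaxed annotations reflect that `k`, `i`, and `f` may now be plain ints or NumPy scalars broadcast over the whole array, instead of requiring pre-shaped integer arrays. A hedged sketch:

```python
import numpy as np

from da4ml.trace import FixedVariableArray, HWConfig

k, i, f = np.ones(4, np.int8), np.full(4, 5, np.int8), np.full(4, 5, np.int8)
x = FixedVariableArray.from_kif(k, i, f, HWConfig(1, -1, -1))

# Scalars broadcast to every element; previously these had to be ndarrays.
xq = x.quantize(k=1, i=3, f=2)  # default WRAP overflow, TRN rounding
```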
@@ -276,6 +314,10 @@ class FixedVariableArray:
     def size(self):
         return self._vars.size
 
+    @property
+    def ndim(self):
+        return self._vars.ndim
+
     @property
     def kif(self):
         shape = self._vars.shape
@@ -284,7 +326,13 @@ class FixedVariableArray:
 
 
 class FixedVariableArrayInput(FixedVariableArray):
-    def __init__(
+    def __init__(
+        self,
+        shape: tuple[int, ...] | int,
+        hwconf: HWConfig = HWConfig(1, -1, -1),
+        solver_options: dict[str, Any] | None = None,
+        latency=0.0,
+    ):
         _vars = np.empty(shape, dtype=object)
         _vars_f = _vars.ravel()
         for i in range(_vars.size):
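For reference, a construction sketch matching the new signature; the import path is an assumption (the class may also be re-exported elsewhere):

```python
from da4ml.trace import HWConfig
from da4ml.trace.fixed_variable_array import FixedVariableArrayInput  # path assumed

# shape may be an int or a tuple; each element becomes a fresh traced input
inp = FixedVariableArrayInput((4, 4), hwconf=HWConfig(1, -1, -1), latency=0.0)
```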
da4ml/trace/ops/__init__.py
CHANGED

@@ -35,9 +35,9 @@ def relu(x: T, i: NDArray[np.integer] | None = None, f: NDArray[np.integer] | No
 
 def quantize(
     x: T,
-    k: NDArray[np.integer],
-    i: NDArray[np.integer],
-    f: NDArray[np.integer],
+    k: NDArray[np.integer] | np.integer | int,
+    i: NDArray[np.integer] | np.integer | int,
+    f: NDArray[np.integer] | np.integer | int,
     overflow_mode: str = 'WRAP',
     round_mode: str = 'TRN',
 ) -> T:

@@ -47,7 +47,7 @@ def quantize(
         return x.quantize(k=k, i=i, f=f, overflow_mode=overflow_mode, round_mode=round_mode)
     else:
         x = x.copy()
-        if overflow_mode in ('SAT', '
+        if overflow_mode in ('SAT', 'SAT_SYM'):
            step = 2.0**-f
            _high = 2.0**i
            high = _high - step
da4ml/trace/ops/einsum_utils.py
CHANGED

@@ -271,6 +271,10 @@ def _einsum(fn: str, input0, input1) -> np.ndarray:
     return _exec_einsum(recipe, input0, input1)
 
 
+@overload
+def einsum(fn: str, input0: 'FixedVariableArray', input1: 'FixedVariableArray') -> 'FixedVariableArray': ...
+
+
 @overload
 def einsum(fn: str, input0: 'FixedVariableArray', input1: NDArray[np.integer | np.floating]) -> 'FixedVariableArray': ...
 

@@ -290,10 +294,9 @@ def einsum(fn: str, input0, input1):
 
     fg0 = isinstance(input0, FixedVariableArray)
     fg1 = isinstance(input1, FixedVariableArray)
-    if fg0 and fg1:
-        raise ValueError('Einsum does not support two FixedVariableArray inputs')
 
     r = _einsum(fn, input0, input1)
+
     if fg0:
         return FixedVariableArray(r, input0.solver_options)
     elif fg1:
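The dropped `ValueError` plus the new overload mean `einsum` now accepts two traced operands, consistent with the `__matmul__` change in `fixed_variable_array.py` above. A sketch with an illustrative equation and shapes:

```python
import numpy as np

from da4ml.trace import FixedVariableArray, HWConfig
from da4ml.trace.ops import einsum

kif = [np.ones((2, 3), np.int8), np.full((2, 3), 3, np.int8), np.zeros((2, 3), np.int8)]
a = FixedVariableArray.from_kif(*kif, HWConfig(1, -1, -1))
b = FixedVariableArray.from_kif(*[m.T.copy() for m in kif], HWConfig(1, -1, -1))

c = einsum('ij,jk->ik', a, b)  # no longer raises on two traced inputs
```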
da4ml/trace/ops/reduce_utils.py
CHANGED

@@ -99,5 +99,7 @@ def reduce(operator: Callable[[T, T], T], x: TA, axis: int | Sequence[int] | Non
     r = _arr.reshape(target_shape)  # type: ignore
 
     if isinstance(x, FixedVariableArray):
-
-
+        ret = FixedVariableArray(r, solver_config)
+        if ret.size == 1 and not keepdims:
+            return ret.ravel()[0]  # type: ignore
+    return r if r.size > 1 or keepdims else r.ravel()[0]  # type: ignore
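`reduce` on a traced array now unwraps a size-1 result to a single `FixedVariable` unless `keepdims` is set, matching the raw-array path below it. A hedged sketch; the `reduce` import path is an assumption:

```python
import operator

import numpy as np

from da4ml.trace import FixedVariableArray, HWConfig
from da4ml.trace.ops.reduce_utils import reduce  # path assumed

k, i, f = np.ones(4, np.int8), np.full(4, 3, np.int8), np.zeros(4, np.int8)
x = FixedVariableArray.from_kif(k, i, f, HWConfig(1, -1, -1))

total = reduce(operator.add, x, axis=None)                  # scalar FixedVariable
kept = reduce(operator.add, x, axis=None, keepdims=True)    # still an array
```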
da4ml/trace/pipeline.py
CHANGED

@@ -38,7 +38,7 @@ def _get_new_idx(
     out_idxd: dict[int, list[int]],
     ops: list[Op],
     stage: int,
-    latency_cutoff:
+    latency_cutoff: float,
 ):
     if idx < 0:
         return idx

@@ -60,7 +60,7 @@ def _get_new_idx(
     return p0_idx
 
 
-def to_pipeline(sol: Solution, latency_cutoff:
+def to_pipeline(sol: Solution, latency_cutoff: float, retiming=True, verbose=True) -> CascadedSolution:
     """Split the record into multiple stages based on the latency of the operations.
     Only useful for HDL generation.
 

@@ -68,7 +68,7 @@ def to_pipeline(sol: Solution, latency_cutoff: int, retiming=True, verbose=True)
     ----------
     sol : Solution
         The solution to be split into multiple stages.
-    latency_cutoff :
+    latency_cutoff : float
         The latency cutoff for splitting the operations.
     retiming : bool
         Whether to retime the solution after splitting. Default is True.

@@ -126,10 +126,10 @@ def to_pipeline(sol: Solution, latency_cutoff: int, retiming=True, verbose=True)
         locator.append({stage: len(opd[stage]) - 1})
     sols = []
     max_stage = max(opd.keys())
+    n_in = sol.shape[0]
     for i, stage in enumerate(opd.keys()):
         _ops = opd[stage]
         _out_idx = out_idxd[stage]
-        n_in = sum(op.opcode == -1 for op in _ops)
         n_out = len(_out_idx)
 
         if i == max_stage:

@@ -150,6 +150,8 @@ def to_pipeline(sol: Solution, latency_cutoff: int, retiming=True, verbose=True)
             adder_size=sol.adder_size,
         )
         sols.append(_sol)
+
+        n_in = n_out
     csol = CascadedSolution(tuple(sols))
 
     if retiming:
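`latency_cutoff` is now typed as `float`, and each stage's input width is carried over from the previous stage (`n_in = n_out`) instead of being recounted from `opcode == -1` ops. A hedged call sketch; the `to_pipeline` import path is an assumption, while `comb_trace` and `from_kif` follow the README examples at the end of this diff:

```python
import numpy as np

from da4ml.trace import FixedVariableArray, HWConfig, comb_trace
from da4ml.trace.pipeline import to_pipeline  # path assumed

k, i, f = np.ones(4, np.int8), np.full(4, 3, np.int8), np.zeros(4, np.int8)
inp = FixedVariableArray.from_kif(k, i, f, HWConfig(1, -1, -1))
out = inp @ (np.arange(8).reshape(4, 2) - 4)

sol = comb_trace(inp, out)
# latency_cutoff is now typed float, so fractional stage budgets are valid
csol = to_pipeline(sol, latency_cutoff=2.5, retiming=True, verbose=False)
```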
da4ml/trace/tracer.py
CHANGED

@@ -17,8 +17,7 @@ def _recursive_gather(v: FixedVariable, gathered: dict[UUID, FixedVariable]):
         return
     assert v._from is not None
     for _v in v._from:
-
-        _recursive_gather(_v, gathered)
+        _recursive_gather(_v, gathered)
     gathered[v.id] = v
 
 

@@ -26,13 +25,24 @@ def gather_variables(inputs: Sequence[FixedVariable], outputs: Sequence[FixedVar
     gathered = {v.id: v for v in inputs}
     for o in outputs:
         _recursive_gather(o, gathered)
-
     variables = list(gathered.values())
 
     N = len(variables)
     _index = sorted(list(range(N)), key=lambda i: variables[i].latency * N + i)
     variables = [variables[i] for i in _index]
-
+
+    # Remove variables with 0 refcount
+    refcount = {v.id: 0 for v in variables}
+    for v in variables:
+        if v in inputs:
+            continue
+        for _v in v._from:
+            refcount[_v.id] += 1
+    for v in outputs:
+        refcount[v.id] += 1
+
+    variables = [v for v in variables if refcount[v.id] > 0]
+    index = {variables[i].id: i for i in range(len(variables))}
 
     return variables, index
 

@@ -44,7 +54,7 @@ def _comb_trace(inputs: Sequence[FixedVariable], outputs: Sequence[FixedVariable
     for i, v in enumerate(variables):
         if v.id in inp_uuids and v.opr != 'const':
             id0 = inp_uuids[v.id]
-            ops.append(Op(id0, -1, -1, 0, v.unscaled.qint, v.latency,
+            ops.append(Op(id0, -1, -1, 0, v.unscaled.qint, v.latency, 0.0))
             continue
         if v.opr == 'new':
             raise NotImplementedError('Operation "new" is only expected in the input list')

@@ -56,7 +66,7 @@ def _comb_trace(inputs: Sequence[FixedVariable], outputs: Sequence[FixedVariable
                 sub = int(f1 < 0)
                 data = int(log2(abs(f1 / f0)))
                 assert id0 < i and id1 < i, f'{id0} {id1} {i} {v.id}'
-
+                op = Op(id0, id1, sub, data, v.unscaled.qint, v.latency, v.cost)
             case 'cadd':
                 v0 = v._from[0]
                 f0 = v0._factor

@@ -65,19 +75,19 @@ def _comb_trace(inputs: Sequence[FixedVariable], outputs: Sequence[FixedVariable
                 qint = v.unscaled.qint
                 data = int(v._data / Decimal(qint.step))
                 assert id0 < i, f'{id0} {i} {v.id}'
-
+                op = Op(id0, -1, 4, data, qint, v.latency, v.cost)
             case 'wrap':
                 v0 = v._from[0]
                 id0 = index[v0.id]
                 assert id0 < i, f'{id0} {i} {v.id}'
                 opcode = -3 if v._from[0]._factor < 0 else 3
-
+                op = Op(id0, -1, opcode, 0, v.unscaled.qint, v.latency, v.cost)
             case 'relu':
                 v0 = v._from[0]
                 id0 = index[v0.id]
                 assert id0 < i, f'{id0} {i} {v.id}'
                 opcode = -2 if v._from[0]._factor < 0 else 2
-
+                op = Op(id0, -1, opcode, 0, v.unscaled.qint, v.latency, v.cost)
             case 'const':
                 qint = v.unscaled.qint
                 assert qint.min == qint.max, f'const {v.id} {qint.min} {qint.max}'

@@ -85,7 +95,7 @@ def _comb_trace(inputs: Sequence[FixedVariable], outputs: Sequence[FixedVariable
                 step = 2.0**-f
                 qint = QInterval(qint.min, qint.min, step)
                 data = qint.min / step
-
+                op = Op(-1, -1, 5, int(data), qint, v.latency, v.cost)
             case 'msb_mux':
                 qint = v.unscaled.qint
                 key, in0, in1 = v._from

@@ -97,10 +107,14 @@ def _comb_trace(inputs: Sequence[FixedVariable], outputs: Sequence[FixedVariable
                 assert idk < i and id0 < i and id1 < i
                 assert key._factor > 0, f'Cannot mux on v{key.id} with negative factor {key._factor}'
                 op = Op(id0, id1, opcode, data, qint, v.latency, v.cost)
-
-
+            case 'vmul':
+                v0, v1 = v._from
+                id0, id1 = index[v0.id], index[v1.id]
+                op = Op(id0, id1, 7, 0, v.unscaled.qint, v.latency, v.cost)
             case _:
                 raise NotImplementedError(f'Operation "{v.opr}" is not supported in tracing')
+
+        ops.append(op)
     out_index = [index[v.id] for v in outputs]
     return ops, out_index
 

@@ -147,6 +161,6 @@ def comb_trace(inputs, outputs):
     for i in range(len(ops)):
         if ref_count[i] == 0:
             op = ops[i]
-            sol.ops[i] = Op(-1, -1,
+            sol.ops[i] = Op(-1, -1, 5, 0, QInterval(0, 0, 1), op[5], 0.0)
 
     return sol
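Taken together, these hunks route every `match` arm through a single trailing `ops.append(op)` and introduce opcode 7 for the new `vmul` (variable-times-variable) operation. The field layout below is inferred from the `Op(...)` calls in this diff, not from an authoritative definition:

```python
# Op(id0, id1, opcode, data, qint, latency, cost)   # inferred layout
#   id0, id1 : indices of the source ops (-1 when unused)
#   opcode   : -1 input passthrough, 0/1 add/sub, ±2 relu, ±3 wrap,
#              4 cadd, 5 const, 7 vmul; for relu/wrap the sign mirrors
#              the operand's factor sign
#   data     : opcode-specific immediate (shift amount, constant value, ...)
#   qint     : QInterval of the unscaled result
#   latency, cost : timing and resource bookkeeping
```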
da4ml-0.3.2.dist-info/METADATA
ADDED

@@ -0,0 +1,66 @@
+Metadata-Version: 2.4
+Name: da4ml
+Version: 0.3.2
+Summary: Digital Arithmetic for Machine Learning
+Author-email: Chang Sun <chsun@cern.ch>
+License: GNU Lesser General Public License v3 (LGPLv3)
+Project-URL: repository, https://github.com/calad0i/da4ml
+Keywords: CMVM,distributed arithmetic,hls4ml,MCM,subexpression elimination
+Classifier: Development Status :: 4 - Beta
+Classifier: License :: OSI Approved :: GNU Lesser General Public License v3 (LGPLv3)
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3 :: Only
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Requires-Python: >=3.10
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: llvmlite>=0.44
+Requires-Dist: numba>=0.61
+Provides-Extra: docs
+Requires-Dist: hgq2; extra == "docs"
+Requires-Dist: myst-parser; extra == "docs"
+Requires-Dist: pyparsing; extra == "docs"
+Requires-Dist: sphinx; extra == "docs"
+Requires-Dist: sphinx-rtd-theme; extra == "docs"
+Dynamic: license-file
+
+# da4ml: Distributed Arithmetic for Machine Learning
+
+[License](https://www.gnu.org/licenses/lgpl-3.0)
+[Documentation](https://calad0i.github.io/da4ml/)
+[PyPI](https://badge.fury.io/py/da4ml)
+[arXiv](https://arxiv.org/abs/2507.04535)
+
+da4ml is a library for implementing distributed arithmetic (DA) based algorithms for ultra-low latency machine learning (ML) applications on FPGAs. It has two major components:
+- A fast and performant constant-matrix-vector multiplication (CMVM) optimizer that implements CMVMs as
+efficient adder trees. Common subexpression elimination (CSE) with graph-based pre-optimization is
+performed to reduce the firmware footprint and improve the performance.
+- A low-level symbolic tracing framework for generating combinational/fully pipelined logic in HDL or HLS
+code. For fully pipelined networks, da4ml can generate the firmware for the whole network standalone.
+Alternatively, da4ml can be used as a plugin in hls4ml to optimize the CMVM operations in the network.
+
+
+Key Features
+------------
+
+- **Optimized Algorithms**: Compared to hls4ml's latency strategy, da4ml's CMVM implementation uses no DSPs and consumes up to 50% fewer LUTs.
+- **Fast code generation**: da4ml can generate HDL for a fully pipelined network in seconds. For the same models, high-level synthesis tools like Vivado/Vitis HLS can take up to days to generate the HDL code.
+- **Low-level symbolic tracing**: As long as an operation can be expressed as a combination of the supported low-level operations, adding it is straightforward: "replay" it on the provided symbolic tensor. In most cases, adding support for a new operation/layer takes just a few lines of numpy-flavored code.
+- **Automatic model conversion**: da4ml can automatically convert models trained in [HGQ2](https://github.com/calad0i/hgq2).
+- **Bit-accurate Simulation**: All operations in da4ml are bit-accurate, meaning the generated HDL code produces the same outputs as the original model. da4ml's computation is converted to a RISC-like, instruction-set-level intermediate representation, the distributed arithmetic instruction set (DAIS), which can be easily simulated in multiple ways.
+- **hls4ml integration**: da4ml can be used as a plugin in hls4ml to optimize the CMVM operations in the network by setting `strategy='distributed_arithmetic'` as the strategy of Dense, EinsumDense, or Conv1D/2D layers.
+
+Installation
+------------
+
+```bash
+pip install da4ml
+```
+
+Getting Started
+---------------
+
+See the [Getting Started](https://calad0i.github.io/da4ml/getting_started.html) guide for a quick introduction to using da4ml.
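The "hls4ml integration" bullet corresponds to the configuration that the 0.3.0.post1 README (deleted at the end of this diff) spelled out; in short, with `model` a trained Keras model:

```python
import hls4ml

model_hls = hls4ml.converters.convert_from_keras_model(
    model,
    hls_config={
        'Model': {
            # ...
            'Strategy': 'distributed_arithmetic',
        },
        # ...
    },
)
```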
{da4ml-0.3.0.post1.dist-info → da4ml-0.3.2.dist-info}/RECORD
CHANGED

@@ -1,8 +1,8 @@
 da4ml/__init__.py,sha256=IETRRvzsJvPMLu1kzzi8UN5FYaM5MhNaXH2A_ZKr2_w,469
-da4ml/_version.py,sha256=
+da4ml/_version.py,sha256=e8NqPtZ8fggRgk3GPrqZ_U_BDV8aSULw1u_Gn9NNbnk,704
 da4ml/cmvm/__init__.py,sha256=4Tbt913k9zP0w8R1p6Oss06v5jrManbUhskyHl6e-U0,154
 da4ml/cmvm/api.py,sha256=JpecMt6g8zutGh_uWT61_0iX8TuXct7-jq7N7HMIsgA,9626
-da4ml/cmvm/types.py,sha256=
+da4ml/cmvm/types.py,sha256=O8BuBZ2SyucxoXt_KbulAuHNgim7Ls3M6Ovw8prLgXM,21340
 da4ml/cmvm/core/__init__.py,sha256=bp2CXI4EOVOQSho1qwfusNs0RliZRt2dV0hZ33W_Kjo,7703
 da4ml/cmvm/core/indexers.py,sha256=QjXgvExS-B2abHTJPDG4NufMdMEflo1i6cUhFOgJpH4,2945
 da4ml/cmvm/core/state_opr.py,sha256=wLqO8qVuM2-qCE5LDeYJDNkUruIPHy63obsv4-x-aR8,8661

@@ -11,7 +11,7 @@ da4ml/cmvm/util/bit_decompose.py,sha256=SUco70HRYf4r1JU6BXwcgabDrhm_yAmucae5FC67
 da4ml/cmvm/util/mat_decompose.py,sha256=eSJNlXwx_jxgqt5vLJrSLQaeq2ZXu8j9mC4d-eq883M,4094
 da4ml/codegen/__init__.py,sha256=Chdh3oO_vLR4saLbT9VxBPz_0wlEzxJldFSZaVUJo7U,331
 da4ml/codegen/cpp/__init__.py,sha256=SIePoi_T4iJph50OQUosAnaVuLCckukYjLxp91Y8xQs,134
-da4ml/codegen/cpp/cpp_codegen.py,sha256=
+da4ml/codegen/cpp/cpp_codegen.py,sha256=ot293c8aHBx7wy1R7hnB9IVI22jYMO0476ghYKD8ECA,6162
 da4ml/codegen/cpp/hls_model.py,sha256=J5lnB8sAvMy0Bo5MSJOpgyUm1tzEJqBxgPTlOd38Gbg,8978
 da4ml/codegen/cpp/source/binder_util.hh,sha256=pBVmhXIDvdCr8n2wwYehc3Fpp60sWYrrZaDoP3x9JZE,1880
 da4ml/codegen/cpp/source/build_binder.mk,sha256=RLu4TP28aJsveyMOHxuDRGEJVoIPMo9T8WyPtqnmtbQ,584

@@ -33,32 +33,34 @@ da4ml/codegen/cpp/source/ap_types/hls_stream.h,sha256=NTkVfbE48c6XnMIfR9WzJbDwUn
 da4ml/codegen/cpp/source/ap_types/etc/ap_private.h,sha256=TDdxGIX0r3D6Ql8KeXoceRmHhdlwFA3Akr3-vvMVAtk,261465
 da4ml/codegen/cpp/source/ap_types/utils/x_hls_utils.h,sha256=x24cf1HyZKv0J8YQIoUvYE3uw6SNL7vWetRGIiFm2Jw,2227
 da4ml/codegen/verilog/__init__.py,sha256=rXmW2V9sDp2RYMDAWlhj_gfMXH3G5lPNmLrFtsJjn_A,298
-da4ml/codegen/verilog/comb.py,sha256=
+da4ml/codegen/verilog/comb.py,sha256=AnrfJxJXe3hytXiX00VGbdW91AAJDF-dLdsSSWBivdU,7961
 da4ml/codegen/verilog/io_wrapper.py,sha256=SSs-ZRhBVLR6tpFso8GNGk-FH6JDe-p7LPvVPjTspxo,5002
 da4ml/codegen/verilog/pipeline.py,sha256=YsPRTLp04Aofg33QMw6_ga3fNX9LeCD7Pq2PnERLWOg,2377
-da4ml/codegen/verilog/verilog_model.py,sha256=
-da4ml/codegen/verilog/source/binder_util.hh,sha256=
+da4ml/codegen/verilog/verilog_model.py,sha256=2uyrpQN_f1cdF5fz0fBR5nh6idHlzhh_JneLkJAruQs,12172
+da4ml/codegen/verilog/source/binder_util.hh,sha256=2sab9M0vYBsaimzJ8tWJ9LsxYKMe3xTqdFSGO7YRPbk,2521
 da4ml/codegen/verilog/source/build_binder.mk,sha256=rQbI98itE_b1wIQ_0uCXfBzNmGK2XT4vWmRyCJNnPKk,960
-da4ml/codegen/verilog/source/build_prj.tcl,sha256=
-da4ml/codegen/verilog/source/ioutil.hh,sha256=
+da4ml/codegen/verilog/source/build_prj.tcl,sha256=JA-zLl7fd2PV-BFaX22-MTex04QTi0urWUXNAEUDTy0,3003
+da4ml/codegen/verilog/source/ioutil.hh,sha256=QXiHbOfkprOL6b-gBQGwcEOQ39uO-bRxKxwObluiK44,3967
+da4ml/codegen/verilog/source/multiplier.v,sha256=MfgRYi7jYPp4W94KLKWpc2MPu2Dg9CDiQ3lJizSIlIQ,1122
 da4ml/codegen/verilog/source/mux.v,sha256=1PMSQKGR_Cku1EQnePBVCuX6we_dqYBXW54WBEURvs0,1928
 da4ml/codegen/verilog/source/negative.v,sha256=YphTCLnYslktsnCPq1xjbYgIFavani5NBbqs20uwhBI,688
 da4ml/codegen/verilog/source/shift_adder.v,sha256=qrpXBX9bhHI-o75v5zshOfq0giEATvbeGgTir20_S3Q,1915
 da4ml/codegen/verilog/source/template.xdc,sha256=GlSRy8tw_orohSuUwUSNEYJLLkAAHttGTfLTcQqRQDg,1262
-da4ml/converter/__init__.py,sha256=
-da4ml/converter/hgq2/
-da4ml/converter/hgq2/
+da4ml/converter/__init__.py,sha256=x7J2PEXYZsVWffRAkucLxbwzzU404eaijMdLwdhBxtY,57
+da4ml/converter/hgq2/__init__.py,sha256=-gnT_7zXY-KQtPLxsqngwDKZ2TUIynn996pUjjB03B8,59
+da4ml/converter/hgq2/parser.py,sha256=O55QTrlkev0lvxiIweXlTGG9RPcfjdrJgpkZc-rwetg,5472
+da4ml/converter/hgq2/replica.py,sha256=aKi6BF2x4s3VUF1Q-__GE4-is9eSC3H8TGFDT05vTWc,16292
 da4ml/trace/__init__.py,sha256=dv-rti3t8iE0RqeThfOb40mAg8FZB2WkkGQq3enJft0,282
-da4ml/trace/fixed_variable.py,sha256=
-da4ml/trace/fixed_variable_array.py,sha256=
-da4ml/trace/pipeline.py,sha256=
-da4ml/trace/tracer.py,sha256=
-da4ml/trace/ops/__init__.py,sha256=
+da4ml/trace/fixed_variable.py,sha256=samW_xChnERsMaXVQz7aKUQJsIrnSHu2ox4x9dMzhR0,20918
+da4ml/trace/fixed_variable_array.py,sha256=1gGSc-ZmRG59sUXvgdN7pulG4XhacAGmgSmzq7nAhJ4,12846
+da4ml/trace/pipeline.py,sha256=AVeO9BNpQlo_WO6S1nQl7RxiHs5VFRR10tWMg_36C2o,5354
+da4ml/trace/tracer.py,sha256=xnaVO4oTWwasfiEBqqeY9o60Lek3eX65IIbvB7JtVKQ,6099
+da4ml/trace/ops/__init__.py,sha256=fz5Cg7ZQqPkZlUj4bIOKY6aaoA1fX_G22TeA8I1n4qY,2166
 da4ml/trace/ops/conv_utils.py,sha256=Yn73t4F6Tcs1hBwK08L1DPOin2HYVcng4PSkU4vuZFo,8245
-da4ml/trace/ops/einsum_utils.py,sha256=
-da4ml/trace/ops/reduce_utils.py,sha256=
-da4ml-0.3.
-da4ml-0.3.
-da4ml-0.3.
-da4ml-0.3.
-da4ml-0.3.
+da4ml/trace/ops/einsum_utils.py,sha256=ODofbvR98FwKBTDZsJ0ObbMjU9_GjPu5AbGuWX6sdCY,11453
+da4ml/trace/ops/reduce_utils.py,sha256=9bi-fizhl1BPy9quQzaWMs83eCDSRMFag2PuvqlVFgI,3500
+da4ml-0.3.2.dist-info/licenses/LICENSE,sha256=46mU2C5kSwOnkqkw9XQAJlhBL2JAf1_uCD8lVcXyMRg,7652
+da4ml-0.3.2.dist-info/METADATA,sha256=zZnCaLH3ndDuURdIXAZD37A06L0ommMlBzfuL93lG-E,4055
+da4ml-0.3.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+da4ml-0.3.2.dist-info/top_level.txt,sha256=N0tnKVwRqFiffFdeAzCgFq71hUNySh5-ITbNd6-R58Q,6
+da4ml-0.3.2.dist-info/RECORD,,
da4ml-0.3.0.post1.dist-info/METADATA
DELETED

@@ -1,107 +0,0 @@
-Metadata-Version: 2.4
-Name: da4ml
-Version: 0.3.0.post1
-Summary: Digital Arithmetic for Machine Learning
-Author-email: Chang Sun <chsun@cern.ch>
-License: GNU Lesser General Public License v3 (LGPLv3)
-Project-URL: repository, https://github.com/calad0i/da4ml
-Keywords: CMVM,distributed arithmetic,hls4ml,MCM,subexpression elimination
-Classifier: Development Status :: 4 - Beta
-Classifier: License :: OSI Approved :: GNU Lesser General Public License v3 (LGPLv3)
-Classifier: Operating System :: OS Independent
-Classifier: Programming Language :: Python :: 3 :: Only
-Classifier: Programming Language :: Python :: 3.10
-Classifier: Programming Language :: Python :: 3.11
-Classifier: Programming Language :: Python :: 3.12
-Classifier: Programming Language :: Python :: 3.13
-Requires-Python: >=3.10
-Description-Content-Type: text/markdown
-License-File: LICENSE
-Requires-Dist: llvmlite>=0.44
-Requires-Dist: numba>=0.61
-Dynamic: license-file
-
-# da4ml: Distributed Arithmetic for Machine Learning
-
-This project performs Constant Matrix-Vector Multiplication (CMVM) with Distributed Arithmetic (DA) for Machine Learning (ML) on Field Programmable Gate Arrays (FPGAs).
-
-CMVM optimization is done through greedy CSE of two-term subexpressions, with possible Delay Constraints (DC). The optimization is done in jitted Python (Numba), and a list of optimized operations is generated as traced Python code.
-
-The project generates Verilog or Vitis HLS code for the optimized CMVM operations. This project can be used in conjunction with [`hls4ml`](https://github.com/fastmachinelearning/hls4ml/) for optimizing the neural networks deployed on FPGAs. For a subset of neural networks, the full design can be generated standalone in Verilog or Vitis HLS.
-
-
-## Installation
-
-The project is available on PyPI and can be installed with pip:
-
-```bash
-pip install da4ml
-```
-
-Notice that `numba>=6.0.0` is required for the project to work. The project does not work with `python<3.10`. If the project fails to compile, try upgrading `numba` and `llvmlite` to the latest versions.
-
-## `hls4ml`
-
-The major use of this project is through the `distributed_arithmetic` strategy in `hls4ml`:
-
-```python
-model_hls = hls4ml.converters.convert_from_keras_model(
-    model,
-    hls_config={
-        'Model': {
-            ...
-            'Strategy': 'distributed_arithmetic',
-        },
-        ...
-    },
-    ...
-)
-```
-
-Currently, `Dense/Conv1D/Conv2D` layers are supported for both `io_parallel` and `io_stream` dataflows. However, notice that distributed arithmetic implies `reuse_factor=1`, as the whole kernel is implemented in combinational logic.
-
-## Standalone usage
-
-### `HGQ2`
-
-For some models trained with `HGQ2`, `da4ml` can be used to generate the whole model in Verilog or Vitis HLS:
-
-```python
-from da4ml.codegen import HLSModel, VerilogModel
-from da4ml.converter.hgq2.parser import trace_model
-from da4ml.trace import comb_trace
-
-inp, out = trace_model(hgq2_model)
-comb_logic = comb_trace(inp[0], out[0])  # Currently, only models with 1 input and 1 output are supported
-
-# Pipelined Verilog model generation
-# `latency_cutoff` is used to control auto pipelining behavior. To disable pipelining, set it to -1.
-verilog_model = VerilogModel(sol, prj_name='barbar', path='/tmp/barbar', latency_cutoff=5)
-verilog_model.compile()  # write and verilator binding
-verilog_model.predict(inputs)
-
-vitis_hls_model = HLSModel(sol, prj_name='foo', path='/tmp/foo', flavor='vitis')  # Only vitis is supported for now
-vitis_hls_model.compile()  # write and hls binding
-vitis_hls_model.predict(inputs)
-```
-
-### Functional Definition
-For generic operations, one can define a combinational logic with the functional API:
-
-```python
-from da4ml.trace import FixedVariableArray, HWConfig, comb_trace
-from da4ml.trace.ops import einsum, relu, quantize, conv, pool
-
-# k, i, f are numpy arrays of integers: keep_negative (0/1), integer bits (excl. sign), fractional bits
-inp = FixedVariableArray.from_kif(k, i, f, HWConfig(1, -1, -1), solver_options={'hard_dc':2})
-out = inp @ kernel
-out = relu(out)
-out = einsum(equation, out, weights)
-...
-
-comb = comb_trace(inp, out)
-```
-
-`+`, `-`, `@` are supported as well as `einsum`, `relu`, `quantize` (WRAP, with TRN or RND), `conv`, `pool` (average only). For multiplications, only power-of-two multipliers are supported; otherwise use the `einsum` or `@` operators.
-
-`comb_trace` returns a `Solution` object that contains a list of low-level operations implementing the combinational logic, which in turn can be used to generate Verilog or Vitis HLS code.
{da4ml-0.3.0.post1.dist-info → da4ml-0.3.2.dist-info}/WHEEL
File without changes

{da4ml-0.3.0.post1.dist-info → da4ml-0.3.2.dist-info}/licenses/LICENSE
File without changes

{da4ml-0.3.0.post1.dist-info → da4ml-0.3.2.dist-info}/top_level.txt
File without changes