da4ml 0.3.0.post1__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -86,6 +86,21 @@ class FixedVariableArray:
        assert bind.arguments.get('out', None) is None, 'Output argument is not supported'
        return einsum(eq, *operands)
 
+        if func in (np.dot, np.matmul):
+            assert len(args) in (2, 3), 'Dot function requires exactly two or three arguments'
+
+            assert len(args) == 2
+            a, b = args
+            if not isinstance(a, FixedVariableArray):
+                a = np.array(a)
+            if not isinstance(b, FixedVariableArray):
+                b = np.array(b)
+            if a.shape[-1] == b.shape[0]:
+                return a @ b
+
+            assert a.size == 1 or b.size == 1, f'Error in dot product: {a.shape} @ {b.shape}'
+            return a * b
+
        args, kwargs = to_raw_arr(args), to_raw_arr(kwargs)
        return FixedVariableArray(
            func(*args, **kwargs),
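With this change, plain `np.dot`/`np.matmul` calls on a traced array are routed into the fixed-point tracer instead of failing. A minimal sketch of what that enables (the handler appears to live in the `FixedVariableArray` NumPy dispatch hook; `from_kif` and `HWConfig` are used as shown in the package README, and the weight matrix is purely illustrative):

```python
import numpy as np
from da4ml.trace import FixedVariableArray, HWConfig

# keep_negative / integer / fractional bit arrays for a length-4 input (README convention)
k, i, f = np.ones(4, dtype=np.int8), np.full(4, 3), np.full(4, 4)
x = FixedVariableArray.from_kif(k, i, f, HWConfig(1, -1, -1))

w = np.array([[0.5, -1.0], [1.5, 0.25], [-0.75, 2.0], [1.0, 0.5]])  # illustrative constants
y = np.matmul(x, w)  # dispatched to the traced matmul rather than raising on the object array
```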
@@ -155,7 +170,23 @@ class FixedVariableArray:
        return cls.from_lhs(low, high, step, hwconf, latency, solver_options)
 
    def __matmul__(self, other):
-        assert isinstance(other, np.ndarray)
+        if isinstance(other, FixedVariableArray):
+            other = other._vars
+        if not isinstance(other, np.ndarray):
+            other = np.array(other)
+        if any(isinstance(x, FixedVariable) for x in other.ravel()):
+            mat0, mat1 = self._vars, other
+            shape = mat0.shape[:-1] + mat1.shape[1:]
+            mat0, mat1 = mat0.reshape((-1, mat0.shape[-1])), mat1.reshape((mat1.shape[0], -1))
+            _shape = (mat0.shape[0], mat1.shape[1])
+            _vars = np.empty(_shape, dtype=object)
+            for i in range(mat0.shape[0]):
+                for j in range(mat1.shape[1]):
+                    vec0 = mat0[i]
+                    vec1 = mat1[:, j]
+                    _vars[i, j] = reduce(lambda x, y: x + y, vec0 * vec1)
+            return FixedVariableArray(_vars.reshape(shape), self.solver_options)
+
        kwargs = (self.solver_options or {}).copy()
        shape0, shape1 = self.shape, other.shape
        assert shape0[-1] == shape1[0], f'Matrix shapes do not match: {shape0} @ {shape1}'
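`__matmul__` now also accepts a right-hand side that itself contains traced variables, building each output entry as a sum of element products. A small sketch (shapes and bit widths are arbitrary; `from_kif`/`HWConfig` follow the package README):

```python
import numpy as np
from da4ml.trace import FixedVariableArray, HWConfig

def traced(shape):
    # helper for this sketch only: signed values with 3 integer / 3 fractional bits
    k, i, f = np.ones(shape), np.full(shape, 3), np.full(shape, 3)
    return FixedVariableArray.from_kif(k, i, f, HWConfig(1, -1, -1))

a = traced((3, 4))
b = traced((4, 2))
c = a @ b       # traced @ traced; previously only constant ndarrays were accepted
print(c.shape)  # (3, 2)
```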
@@ -180,9 +211,9 @@ class FixedVariableArray:
 
    def __rmatmul__(self, other):
        mat1 = np.moveaxis(other, -1, 0)
-        mat0 = np.moveaxis(self._vars, 0, -1)
+        mat0 = np.moveaxis(self, 0, -1)  # type: ignore
        ndim0, ndim1 = mat0.ndim, mat1.ndim
-        r = FixedVariableArray(mat0, self.solver_options) @ mat1
+        r = mat0 @ mat1
 
        _axes = tuple(range(0, ndim0 + ndim1 - 2))
        axes = _axes[ndim0 - 1 :] + _axes[: ndim0 - 1]
@@ -213,6 +244,8 @@ class FixedVariableArray:
        return FixedVariableArray(self._vars - other, self.solver_options)
 
    def __mul__(self, other):
+        if isinstance(other, FixedVariableArray):
+            return FixedVariableArray(self._vars * other._vars, self.solver_options)
        return FixedVariableArray(self._vars * other, self.solver_options)
 
    def __truediv__(self, other):
@@ -230,6 +263,11 @@ class FixedVariableArray:
        max_lat = max(v.latency for v in self._vars.ravel())
        return f'FixedVariableArray(shape={shape}, hwconf={hwconf_str}, latency={max_lat})'
 
+    def __pow__(self, power: int | float):
+        _power = int(power)
+        assert _power == power, 'Power must be an integer'
+        return FixedVariableArray(self._vars**_power, self.solver_options)
+
    def relu(self, i: NDArray[np.integer] | None = None, f: NDArray[np.integer] | None = None, round_mode: str = 'TRN'):
        shape = self._vars.shape
        i = np.broadcast_to(i, shape) if i is not None else np.full(shape, None)
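Together, the new `__mul__` branch and `__pow__` make elementwise variable-times-variable products and small integer powers expressible directly on traced arrays. A minimal sketch (bit widths are arbitrary; the constructor follows the README):

```python
import numpy as np
from da4ml.trace import FixedVariableArray, HWConfig

k, i, f = np.ones(4), np.full(4, 3), np.full(4, 3)
a = FixedVariableArray.from_kif(k, i, f, HWConfig(1, -1, -1))
b = FixedVariableArray.from_kif(k, i, f, HWConfig(1, -1, -1))

prod = a * b   # elementwise traced-by-traced product (new in this release)
sq = a ** 2    # integer powers only; a ** 2.5 would trip the integrality assert
```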
@@ -241,9 +279,9 @@ class FixedVariableArray:
 
    def quantize(
        self,
-        k: NDArray[np.integer] | None = None,
-        i: NDArray[np.integer] | None = None,
-        f: NDArray[np.integer] | None = None,
+        k: NDArray[np.integer] | np.integer | int | None = None,
+        i: NDArray[np.integer] | np.integer | int | None = None,
+        f: NDArray[np.integer] | np.integer | int | None = None,
        overflow_mode: str = 'WRAP',
        round_mode: str = 'TRN',
    ):
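The widened signature means scalar bit specifications are accepted and broadcast over the array, instead of requiring pre-broadcast NumPy arrays. A fragment continuing the earlier `from_kif` sketches (scalar values here are illustrative):

```python
# x built via FixedVariableArray.from_kif as in the earlier sketches
y = x.quantize(k=1, i=3, f=4)                    # scalars now broadcast over the whole array
z = x.quantize(k=1, i=3, f=np.full(x.shape, 4))  # per-element arrays still work as before
```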
@@ -276,6 +314,10 @@ class FixedVariableArray:
    def size(self):
        return self._vars.size
 
+    @property
+    def ndim(self):
+        return self._vars.ndim
+
    @property
    def kif(self):
        shape = self._vars.shape
@@ -284,7 +326,13 @@ class FixedVariableArray:
 
 
 class FixedVariableArrayInput(FixedVariableArray):
-    def __init__(self, shape: tuple[int, ...] | int, hwconf: HWConfig, solver_options: dict[str, Any] | None = None, latency=0.0):
+    def __init__(
+        self,
+        shape: tuple[int, ...] | int,
+        hwconf: HWConfig = HWConfig(1, -1, -1),
+        solver_options: dict[str, Any] | None = None,
+        latency=0.0,
+    ):
        _vars = np.empty(shape, dtype=object)
        _vars_f = _vars.ravel()
        for i in range(_vars.size):
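`hwconf` now defaults to `HWConfig(1, -1, -1)`, so an input array can be declared from a shape alone. A minimal sketch (the import path is an assumption based on the module layout in RECORD; the class may also be re-exported elsewhere):

```python
from da4ml.trace.fixed_variable_array import FixedVariableArrayInput  # path assumed from RECORD

inp = FixedVariableArrayInput((16,))  # hwconf now defaults to HWConfig(1, -1, -1)
```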
@@ -35,9 +35,9 @@ def relu(x: T, i: NDArray[np.integer] | None = None, f: NDArray[np.integer] | No
 
 def quantize(
     x: T,
-    k: NDArray[np.integer],
-    i: NDArray[np.integer],
-    f: NDArray[np.integer],
+    k: NDArray[np.integer] | np.integer | int,
+    i: NDArray[np.integer] | np.integer | int,
+    f: NDArray[np.integer] | np.integer | int,
     overflow_mode: str = 'WRAP',
     round_mode: str = 'TRN',
 ) -> T:
@@ -47,7 +47,7 @@ def quantize(
         return x.quantize(k=k, i=i, f=f, overflow_mode=overflow_mode, round_mode=round_mode)
     else:
         x = x.copy()
-        if overflow_mode in ('SAT', 'SAT_SM'):
+        if overflow_mode in ('SAT', 'SAT_SYM'):
             step = 2.0**-f
             _high = 2.0**i
             high = _high - step
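The functional `quantize` now also accepts scalar `k`/`i`/`f`, and the saturating symmetric mode is spelled `'SAT_SYM'` (the old code matched `'SAT_SM'`). A numeric sketch on a plain array (import path as in the package README):

```python
import numpy as np
from da4ml.trace.ops import quantize  # import path as in the package README

x = np.array([1.9, -2.6, 0.3125])
# 1 sign bit, 2 integer bits, 2 fractional bits; saturate symmetrically, then truncate
q = quantize(x, 1, 2, 2, overflow_mode='SAT_SYM', round_mode='TRN')
```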
@@ -271,6 +271,10 @@ def _einsum(fn: str, input0, input1) -> np.ndarray:
     return _exec_einsum(recipe, input0, input1)
 
 
+@overload
+def einsum(fn: str, input0: 'FixedVariableArray', input1: 'FixedVariableArray') -> 'FixedVariableArray': ...
+
+
 @overload
 def einsum(fn: str, input0: 'FixedVariableArray', input1: NDArray[np.integer | np.floating]) -> 'FixedVariableArray': ...
 
@@ -290,10 +294,9 @@ def einsum(fn: str, input0, input1):
 
     fg0 = isinstance(input0, FixedVariableArray)
     fg1 = isinstance(input1, FixedVariableArray)
-    if fg0 and fg1:
-        raise ValueError('Einsum does not support two FixedVariableArray inputs')
 
     r = _einsum(fn, input0, input1)
+
     if fg0:
         return FixedVariableArray(r, input0.solver_options)
     elif fg1:
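Dropping the two-traced-inputs guard, together with the new overload, means `einsum` can now contract two traced arrays. A sketch (equation and shapes are arbitrary; `from_kif` as in the README):

```python
import numpy as np
from da4ml.trace import FixedVariableArray, HWConfig
from da4ml.trace.ops import einsum

def traced(shape):
    k, i, f = np.ones(shape), np.full(shape, 3), np.full(shape, 3)
    return FixedVariableArray.from_kif(k, i, f, HWConfig(1, -1, -1))

a, b = traced((2, 3)), traced((3, 4))
c = einsum('ij,jk->ik', a, b)  # raised ValueError in 0.3.0.post1
```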
@@ -99,5 +99,7 @@ def reduce(operator: Callable[[T, T], T], x: TA, axis: int | Sequence[int] | Non
     r = _arr.reshape(target_shape)  # type: ignore
 
     if isinstance(x, FixedVariableArray):
-        return FixedVariableArray(r, solver_config)
-    return r
+        ret = FixedVariableArray(r, solver_config)
+        if ret.size == 1 and not keepdims:
+            return ret.ravel()[0]  # type: ignore
+    return r if r.size > 1 or keepdims else r.ravel()[0]  # type: ignore
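The practical effect is that a full reduction no longer hands back a one-element array: when the result has a single element and `keepdims` is not set, the lone element is returned directly. A sketch (the helper is defined in `da4ml/trace/ops/reduce_utils.py` per RECORD; the exact import path and the default-axis behaviour are assumptions):

```python
import numpy as np
from da4ml.trace import FixedVariableArray, HWConfig
from da4ml.trace.ops.reduce_utils import reduce  # import path assumed from RECORD

k, i, f = np.ones(8), np.full(8, 3), np.full(8, 3)
x = FixedVariableArray.from_kif(k, i, f, HWConfig(1, -1, -1))

total = reduce(lambda a, b: a + b, x)  # full reduction: a single element, not a 1-element array
```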
da4ml/trace/pipeline.py CHANGED
@@ -38,7 +38,7 @@ def _get_new_idx(
     out_idxd: dict[int, list[int]],
     ops: list[Op],
     stage: int,
-    latency_cutoff: int,
+    latency_cutoff: float,
 ):
     if idx < 0:
         return idx
@@ -60,7 +60,7 @@ def _get_new_idx(
     return p0_idx
 
 
-def to_pipeline(sol: Solution, latency_cutoff: int, retiming=True, verbose=True) -> CascadedSolution:
+def to_pipeline(sol: Solution, latency_cutoff: float, retiming=True, verbose=True) -> CascadedSolution:
     """Split the record into multiple stages based on the latency of the operations.
     Only useful for HDL generation.
 
@@ -68,7 +68,7 @@ def to_pipeline(sol: Solution, latency_cutoff: int, retiming=True, verbose=True)
     ----------
     sol : Solution
         The solution to be split into multiple stages.
-    latency_cutoff : int
+    latency_cutoff : float
         The latency cutoff for splitting the operations.
     retiming : bool
         Whether to retime the solution after splitting. Default is True.
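Since the cutoff is now typed as a float, fractional latency budgets can be passed straight through. A minimal sketch (the `comb_trace` usage follows the package README; the cutoff value is illustrative):

```python
from da4ml.trace import comb_trace
from da4ml.trace.pipeline import to_pipeline

sol = comb_trace(inp, out)                   # inp/out built with the tracing API, as in the README
csol = to_pipeline(sol, latency_cutoff=2.5)  # fractional cutoffs are now accepted
```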
@@ -126,10 +126,10 @@ def to_pipeline(sol: Solution, latency_cutoff: int, retiming=True, verbose=True)
         locator.append({stage: len(opd[stage]) - 1})
     sols = []
     max_stage = max(opd.keys())
+    n_in = sol.shape[0]
     for i, stage in enumerate(opd.keys()):
         _ops = opd[stage]
         _out_idx = out_idxd[stage]
-        n_in = sum(op.opcode == -1 for op in _ops)
         n_out = len(_out_idx)
 
         if i == max_stage:
@@ -150,6 +150,8 @@ def to_pipeline(sol: Solution, latency_cutoff: int, retiming=True, verbose=True)
             adder_size=sol.adder_size,
         )
         sols.append(_sol)
+
+        n_in = n_out
     csol = CascadedSolution(tuple(sols))
 
     if retiming:
da4ml/trace/tracer.py CHANGED
@@ -17,8 +17,7 @@ def _recursive_gather(v: FixedVariable, gathered: dict[UUID, FixedVariable]):
         return
     assert v._from is not None
     for _v in v._from:
-        if _v.id not in gathered:
-            _recursive_gather(_v, gathered)
+        _recursive_gather(_v, gathered)
     gathered[v.id] = v
 
 
@@ -26,13 +25,24 @@ def gather_variables(inputs: Sequence[FixedVariable], outputs: Sequence[FixedVar
     gathered = {v.id: v for v in inputs}
     for o in outputs:
         _recursive_gather(o, gathered)
-
     variables = list(gathered.values())
 
     N = len(variables)
     _index = sorted(list(range(N)), key=lambda i: variables[i].latency * N + i)
     variables = [variables[i] for i in _index]
-    index = {variables[i].id: i for i in range(N)}
+
+    # Remove variables with 0 refcount
+    refcount = {v.id: 0 for v in variables}
+    for v in variables:
+        if v in inputs:
+            continue
+        for _v in v._from:
+            refcount[_v.id] += 1
+    for v in outputs:
+        refcount[v.id] += 1
+
+    variables = [v for v in variables if refcount[v.id] > 0]
+    index = {variables[i].id: i for i in range(len(variables))}
 
     return variables, index
 
@@ -44,7 +54,7 @@ def _comb_trace(inputs: Sequence[FixedVariable], outputs: Sequence[FixedVariable
     for i, v in enumerate(variables):
         if v.id in inp_uuids and v.opr != 'const':
             id0 = inp_uuids[v.id]
-            ops.append(Op(id0, -1, -1, 0, v.unscaled.qint, v.latency, v.cost))
+            ops.append(Op(id0, -1, -1, 0, v.unscaled.qint, v.latency, 0.0))
             continue
         if v.opr == 'new':
             raise NotImplementedError('Operation "new" is only expected in the input list')
@@ -56,7 +66,7 @@ def _comb_trace(inputs: Sequence[FixedVariable], outputs: Sequence[FixedVariable
                 sub = int(f1 < 0)
                 data = int(log2(abs(f1 / f0)))
                 assert id0 < i and id1 < i, f'{id0} {id1} {i} {v.id}'
-                ops.append(Op(id0, id1, sub, data, v.unscaled.qint, v.latency, v.cost))
+                op = Op(id0, id1, sub, data, v.unscaled.qint, v.latency, v.cost)
             case 'cadd':
                 v0 = v._from[0]
                 f0 = v0._factor
@@ -65,19 +75,19 @@ def _comb_trace(inputs: Sequence[FixedVariable], outputs: Sequence[FixedVariable
                 qint = v.unscaled.qint
                 data = int(v._data / Decimal(qint.step))
                 assert id0 < i, f'{id0} {i} {v.id}'
-                ops.append(Op(id0, -1, 4, data, qint, v.latency, v.cost))
+                op = Op(id0, -1, 4, data, qint, v.latency, v.cost)
             case 'wrap':
                 v0 = v._from[0]
                 id0 = index[v0.id]
                 assert id0 < i, f'{id0} {i} {v.id}'
                 opcode = -3 if v._from[0]._factor < 0 else 3
-                ops.append(Op(id0, -1, opcode, 0, v.unscaled.qint, v.latency, v.cost))
+                op = Op(id0, -1, opcode, 0, v.unscaled.qint, v.latency, v.cost)
             case 'relu':
                 v0 = v._from[0]
                 id0 = index[v0.id]
                 assert id0 < i, f'{id0} {i} {v.id}'
                 opcode = -2 if v._from[0]._factor < 0 else 2
-                ops.append(Op(id0, -1, opcode, 0, v.unscaled.qint, v.latency, v.cost))
+                op = Op(id0, -1, opcode, 0, v.unscaled.qint, v.latency, v.cost)
             case 'const':
                 qint = v.unscaled.qint
                 assert qint.min == qint.max, f'const {v.id} {qint.min} {qint.max}'
@@ -85,7 +95,7 @@ def _comb_trace(inputs: Sequence[FixedVariable], outputs: Sequence[FixedVariable
                 step = 2.0**-f
                 qint = QInterval(qint.min, qint.min, step)
                 data = qint.min / step
-                ops.append(Op(-1, -1, 5, int(data), qint, v.latency, v.cost))
+                op = Op(-1, -1, 5, int(data), qint, v.latency, v.cost)
             case 'msb_mux':
                 qint = v.unscaled.qint
                 key, in0, in1 = v._from
@@ -97,10 +107,14 @@ def _comb_trace(inputs: Sequence[FixedVariable], outputs: Sequence[FixedVariable
                 assert idk < i and id0 < i and id1 < i
                 assert key._factor > 0, f'Cannot mux on v{key.id} with negative factor {key._factor}'
                 op = Op(id0, id1, opcode, data, qint, v.latency, v.cost)
-                ops.append(op)
-
+            case 'vmul':
+                v0, v1 = v._from
+                id0, id1 = index[v0.id], index[v1.id]
+                op = Op(id0, id1, 7, 0, v.unscaled.qint, v.latency, v.cost)
             case _:
                 raise NotImplementedError(f'Operation "{v.opr}" is not supported in tracing')
+
+        ops.append(op)
     out_index = [index[v.id] for v in outputs]
     return ops, out_index
 
@@ -147,6 +161,6 @@ def comb_trace(inputs, outputs):
     for i in range(len(ops)):
         if ref_count[i] == 0:
             op = ops[i]
-            sol.ops[i] = Op(-1, -1, op[2], 0, QInterval(0, 0, 1), op[5], op[6])
+            sol.ops[i] = Op(-1, -1, 5, 0, QInterval(0, 0, 1), op[5], 0.0)
 
     return sol
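The new `'vmul'` case in the tracer is what elementwise variable products lower to: each traced multiplication becomes an opcode-7 Op in the generated DAIS instruction list. A minimal end-to-end sketch (API usage follows the package README; the claim that the resulting ops use opcode 7 is inferred from the hunk above):

```python
import numpy as np
from da4ml.trace import FixedVariableArray, HWConfig, comb_trace

k, i, f = np.ones(4), np.full(4, 3), np.full(4, 3)
x = FixedVariableArray.from_kif(k, i, f, HWConfig(1, -1, -1))

out = x * x               # traced-by-traced products become 'vmul' nodes
sol = comb_trace(x, out)  # the resulting Solution should contain opcode-7 (vmul) Ops
```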
@@ -0,0 +1,66 @@
+ Metadata-Version: 2.4
+ Name: da4ml
+ Version: 0.3.2
+ Summary: Digital Arithmetic for Machine Learning
+ Author-email: Chang Sun <chsun@cern.ch>
+ License: GNU Lesser General Public License v3 (LGPLv3)
+ Project-URL: repository, https://github.com/calad0i/da4ml
+ Keywords: CMVM,distributed arithmetic,hls4ml,MCM,subexpression elimination
+ Classifier: Development Status :: 4 - Beta
+ Classifier: License :: OSI Approved :: GNU Lesser General Public License v3 (LGPLv3)
+ Classifier: Operating System :: OS Independent
+ Classifier: Programming Language :: Python :: 3 :: Only
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Requires-Python: >=3.10
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: llvmlite>=0.44
+ Requires-Dist: numba>=0.61
+ Provides-Extra: docs
+ Requires-Dist: hgq2; extra == "docs"
+ Requires-Dist: myst-parser; extra == "docs"
+ Requires-Dist: pyparsing; extra == "docs"
+ Requires-Dist: sphinx; extra == "docs"
+ Requires-Dist: sphinx-rtd-theme; extra == "docs"
+ Dynamic: license-file
+
+ # da4ml: Distributed Arithmetic for Machine Learning
+
+ [![LGPLv3](https://img.shields.io/badge/License-LGPLv3-blue.svg)](https://www.gnu.org/licenses/lgpl-3.0)
+ [![Documentation](https://github.com/calad0i/da4ml/actions/workflows/sphinx-build.yml/badge.svg)](https://calad0i.github.io/da4ml/)
+ [![PyPI version](https://badge.fury.io/py/da4ml.svg)](https://badge.fury.io/py/da4ml)
+ [![ArXiv](https://img.shields.io/badge/arXiv-2507.04535-b31b1b.svg)](https://arxiv.org/abs/2507.04535)
+
+ da4ml is a library for implementing distributed arithmetic (DA) based algorithms for ultra-low latency machine learning (ML) applications on FPGAs. It has two major components:
+ - A fast and performant constant-matrix-vector multiplication (CMVM) optimizer that implements the multiplications as
+ efficient adder trees. Common subexpression elimination (CSE) with graph-based pre-optimization is
+ performed to reduce the firmware footprint and improve the performance.
+ - A low-level symbolic tracing framework for generating combinational/fully pipelined logic in HDL or HLS
+ code. For fully pipelined networks, da4ml can generate the firmware for the whole network standalone.
+ Alternatively, da4ml can be used as a plugin in hls4ml to optimize the CMVM operations in the network.
+
+
+ Key Features
+ ------------
+
+ - **Optimized Algorithms**: Compared to hls4ml's latency strategy, da4ml's CMVM implementation uses no DSPs and consumes up to 50% fewer LUTs.
+ - **Fast code generation**: da4ml can generate HDL for a fully pipelined network in seconds. For the same models, high-level synthesis tools like Vivado/Vitis HLS can take up to days to generate the HDL code.
+ - **Low-level symbolic tracing**: As long as an operation can be expressed as a combination of the supported low-level operations, adding it is straightforward by "replaying" the operation on the symbolic tensor provided. In most cases, adding support for a new operation/layer takes just a few lines of code in numpy flavor.
+ - **Automatic model conversion**: da4ml can automatically convert models trained in [HGQ2](https://github.com/calad0i/hgq2).
+ - **Bit-accurate Simulation**: All operations in da4ml are bit-accurate, meaning the generated HDL code will produce the same output as the original model. da4ml's computation is converted to a RISC-like, instruction-set-level intermediate representation, the distributed arithmetic instruction set (DAIS), which can be easily simulated in multiple ways.
+ - **hls4ml integration**: da4ml can be used as a plugin in hls4ml to optimize the CMVM operations in the network by setting `strategy='distributed_arithmetic'` for Dense, EinsumDense, or Conv1/2D layers.
+
+ Installation
+ ------------
+
+ ```bash
+ pip install da4ml
+ ```
+
+ Getting Started
+ ---------------
+
+ See the [Getting Started](https://calad0i.github.io/da4ml/getting_started.html) guide for a quick introduction to using da4ml.
@@ -1,8 +1,8 @@
  da4ml/__init__.py,sha256=IETRRvzsJvPMLu1kzzi8UN5FYaM5MhNaXH2A_ZKr2_w,469
- da4ml/_version.py,sha256=uYHHQtYrsf_vg1G4qaENpapNqr41eUiWJdo-mm-U-PM,526
+ da4ml/_version.py,sha256=e8NqPtZ8fggRgk3GPrqZ_U_BDV8aSULw1u_Gn9NNbnk,704
  da4ml/cmvm/__init__.py,sha256=4Tbt913k9zP0w8R1p6Oss06v5jrManbUhskyHl6e-U0,154
  da4ml/cmvm/api.py,sha256=JpecMt6g8zutGh_uWT61_0iX8TuXct7-jq7N7HMIsgA,9626
- da4ml/cmvm/types.py,sha256=hdthYdP5muIQ-9qFE0CjObGT7lCxB1-udXU16LxtuBI,20959
+ da4ml/cmvm/types.py,sha256=O8BuBZ2SyucxoXt_KbulAuHNgim7Ls3M6Ovw8prLgXM,21340
  da4ml/cmvm/core/__init__.py,sha256=bp2CXI4EOVOQSho1qwfusNs0RliZRt2dV0hZ33W_Kjo,7703
  da4ml/cmvm/core/indexers.py,sha256=QjXgvExS-B2abHTJPDG4NufMdMEflo1i6cUhFOgJpH4,2945
  da4ml/cmvm/core/state_opr.py,sha256=wLqO8qVuM2-qCE5LDeYJDNkUruIPHy63obsv4-x-aR8,8661
@@ -11,7 +11,7 @@ da4ml/cmvm/util/bit_decompose.py,sha256=SUco70HRYf4r1JU6BXwcgabDrhm_yAmucae5FC67
  da4ml/cmvm/util/mat_decompose.py,sha256=eSJNlXwx_jxgqt5vLJrSLQaeq2ZXu8j9mC4d-eq883M,4094
  da4ml/codegen/__init__.py,sha256=Chdh3oO_vLR4saLbT9VxBPz_0wlEzxJldFSZaVUJo7U,331
  da4ml/codegen/cpp/__init__.py,sha256=SIePoi_T4iJph50OQUosAnaVuLCckukYjLxp91Y8xQs,134
- da4ml/codegen/cpp/cpp_codegen.py,sha256=6lBF1I-xXdIABEWF60owBmQiISuI6mrITCqLqhsEHrQ,6033
+ da4ml/codegen/cpp/cpp_codegen.py,sha256=ot293c8aHBx7wy1R7hnB9IVI22jYMO0476ghYKD8ECA,6162
  da4ml/codegen/cpp/hls_model.py,sha256=J5lnB8sAvMy0Bo5MSJOpgyUm1tzEJqBxgPTlOd38Gbg,8978
  da4ml/codegen/cpp/source/binder_util.hh,sha256=pBVmhXIDvdCr8n2wwYehc3Fpp60sWYrrZaDoP3x9JZE,1880
  da4ml/codegen/cpp/source/build_binder.mk,sha256=RLu4TP28aJsveyMOHxuDRGEJVoIPMo9T8WyPtqnmtbQ,584
@@ -33,32 +33,34 @@ da4ml/codegen/cpp/source/ap_types/hls_stream.h,sha256=NTkVfbE48c6XnMIfR9WzJbDwUn
  da4ml/codegen/cpp/source/ap_types/etc/ap_private.h,sha256=TDdxGIX0r3D6Ql8KeXoceRmHhdlwFA3Akr3-vvMVAtk,261465
  da4ml/codegen/cpp/source/ap_types/utils/x_hls_utils.h,sha256=x24cf1HyZKv0J8YQIoUvYE3uw6SNL7vWetRGIiFm2Jw,2227
  da4ml/codegen/verilog/__init__.py,sha256=rXmW2V9sDp2RYMDAWlhj_gfMXH3G5lPNmLrFtsJjn_A,298
- da4ml/codegen/verilog/comb.py,sha256=CmCwiddeiT4TCZV088lF2ENlAXx3vjZKszTz1sYXEao,7614
+ da4ml/codegen/verilog/comb.py,sha256=AnrfJxJXe3hytXiX00VGbdW91AAJDF-dLdsSSWBivdU,7961
  da4ml/codegen/verilog/io_wrapper.py,sha256=SSs-ZRhBVLR6tpFso8GNGk-FH6JDe-p7LPvVPjTspxo,5002
  da4ml/codegen/verilog/pipeline.py,sha256=YsPRTLp04Aofg33QMw6_ga3fNX9LeCD7Pq2PnERLWOg,2377
- da4ml/codegen/verilog/verilog_model.py,sha256=_50dggtH24xMdI0beuyvdsv8G8dlB4MWa1m8KWZQdNE,12295
- da4ml/codegen/verilog/source/binder_util.hh,sha256=Dn9ysUdonw0HR8bxom8YfQF7vc1LEvT_B1V_o8Gw1rY,2503
+ da4ml/codegen/verilog/verilog_model.py,sha256=2uyrpQN_f1cdF5fz0fBR5nh6idHlzhh_JneLkJAruQs,12172
+ da4ml/codegen/verilog/source/binder_util.hh,sha256=2sab9M0vYBsaimzJ8tWJ9LsxYKMe3xTqdFSGO7YRPbk,2521
  da4ml/codegen/verilog/source/build_binder.mk,sha256=rQbI98itE_b1wIQ_0uCXfBzNmGK2XT4vWmRyCJNnPKk,960
- da4ml/codegen/verilog/source/build_prj.tcl,sha256=bcFCpcHR26TJGOQZEpUx0eM1SEiJOCoH-9EPpIvqWu0,3124
- da4ml/codegen/verilog/source/ioutil.hh,sha256=1o1-oIyQyYc9CU91bBxuitVzzcrNT8p4MTarFKiJoG4,3967
+ da4ml/codegen/verilog/source/build_prj.tcl,sha256=JA-zLl7fd2PV-BFaX22-MTex04QTi0urWUXNAEUDTy0,3003
+ da4ml/codegen/verilog/source/ioutil.hh,sha256=QXiHbOfkprOL6b-gBQGwcEOQ39uO-bRxKxwObluiK44,3967
+ da4ml/codegen/verilog/source/multiplier.v,sha256=MfgRYi7jYPp4W94KLKWpc2MPu2Dg9CDiQ3lJizSIlIQ,1122
  da4ml/codegen/verilog/source/mux.v,sha256=1PMSQKGR_Cku1EQnePBVCuX6we_dqYBXW54WBEURvs0,1928
  da4ml/codegen/verilog/source/negative.v,sha256=YphTCLnYslktsnCPq1xjbYgIFavani5NBbqs20uwhBI,688
  da4ml/codegen/verilog/source/shift_adder.v,sha256=qrpXBX9bhHI-o75v5zshOfq0giEATvbeGgTir20_S3Q,1915
  da4ml/codegen/verilog/source/template.xdc,sha256=GlSRy8tw_orohSuUwUSNEYJLLkAAHttGTfLTcQqRQDg,1262
- da4ml/converter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- da4ml/converter/hgq2/parser.py,sha256=bAtnEXQxRKU9a1HFJWTy-e_HDzZY_wXOBVdyYG3ndsM,3826
- da4ml/converter/hgq2/replica.py,sha256=9ICJGfK2Q2C_glwE0KMcvXttuWvJYRblkO7RLmalzss,13829
+ da4ml/converter/__init__.py,sha256=x7J2PEXYZsVWffRAkucLxbwzzU404eaijMdLwdhBxtY,57
+ da4ml/converter/hgq2/__init__.py,sha256=-gnT_7zXY-KQtPLxsqngwDKZ2TUIynn996pUjjB03B8,59
+ da4ml/converter/hgq2/parser.py,sha256=O55QTrlkev0lvxiIweXlTGG9RPcfjdrJgpkZc-rwetg,5472
+ da4ml/converter/hgq2/replica.py,sha256=aKi6BF2x4s3VUF1Q-__GE4-is9eSC3H8TGFDT05vTWc,16292
  da4ml/trace/__init__.py,sha256=dv-rti3t8iE0RqeThfOb40mAg8FZB2WkkGQq3enJft0,282
- da4ml/trace/fixed_variable.py,sha256=6dfMHBN1NfqYIbPZ79GCPCXj2JFQUKTyDZu6xDaG3rg,17082
- da4ml/trace/fixed_variable_array.py,sha256=A0ApTvZxpkr7kHrUQkyhrGJuuPe4kDgLFyD_1CW7lBk,10985
- da4ml/trace/pipeline.py,sha256=_R2uqWgnpuQ4tD7VKz2eu8CF9Air2RtYH2o03Vfg0Mk,5353
- da4ml/trace/tracer.py,sha256=NqPEH9hyVlGQOf9_kJL3A7SujCcxkT-z28bk0Ael5jE,5664
- da4ml/trace/ops/__init__.py,sha256=I4VqB43lVkFlLtkoWxiSDHBFGvxKwutNbAJw5aLVeAI,2108
+ da4ml/trace/fixed_variable.py,sha256=samW_xChnERsMaXVQz7aKUQJsIrnSHu2ox4x9dMzhR0,20918
+ da4ml/trace/fixed_variable_array.py,sha256=1gGSc-ZmRG59sUXvgdN7pulG4XhacAGmgSmzq7nAhJ4,12846
+ da4ml/trace/pipeline.py,sha256=AVeO9BNpQlo_WO6S1nQl7RxiHs5VFRR10tWMg_36C2o,5354
+ da4ml/trace/tracer.py,sha256=xnaVO4oTWwasfiEBqqeY9o60Lek3eX65IIbvB7JtVKQ,6099
+ da4ml/trace/ops/__init__.py,sha256=fz5Cg7ZQqPkZlUj4bIOKY6aaoA1fX_G22TeA8I1n4qY,2166
  da4ml/trace/ops/conv_utils.py,sha256=Yn73t4F6Tcs1hBwK08L1DPOin2HYVcng4PSkU4vuZFo,8245
- da4ml/trace/ops/einsum_utils.py,sha256=MoWvOfvtVjXGwqEhXEzZ3uGrgSmLTHngV8I1eLyANGE,11433
- da4ml/trace/ops/reduce_utils.py,sha256=8gohGQRVr8Bn5rfyrGsnE8EDxUXAObv521qu4mJrX9I,3348
- da4ml-0.3.0.post1.dist-info/licenses/LICENSE,sha256=46mU2C5kSwOnkqkw9XQAJlhBL2JAf1_uCD8lVcXyMRg,7652
- da4ml-0.3.0.post1.dist-info/METADATA,sha256=PTn1XMH7eBRfw3nLUqD1OzktCsH6V9SzxqXw3wK5ShE,4575
- da4ml-0.3.0.post1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- da4ml-0.3.0.post1.dist-info/top_level.txt,sha256=N0tnKVwRqFiffFdeAzCgFq71hUNySh5-ITbNd6-R58Q,6
- da4ml-0.3.0.post1.dist-info/RECORD,,
+ da4ml/trace/ops/einsum_utils.py,sha256=ODofbvR98FwKBTDZsJ0ObbMjU9_GjPu5AbGuWX6sdCY,11453
+ da4ml/trace/ops/reduce_utils.py,sha256=9bi-fizhl1BPy9quQzaWMs83eCDSRMFag2PuvqlVFgI,3500
+ da4ml-0.3.2.dist-info/licenses/LICENSE,sha256=46mU2C5kSwOnkqkw9XQAJlhBL2JAf1_uCD8lVcXyMRg,7652
+ da4ml-0.3.2.dist-info/METADATA,sha256=zZnCaLH3ndDuURdIXAZD37A06L0ommMlBzfuL93lG-E,4055
+ da4ml-0.3.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ da4ml-0.3.2.dist-info/top_level.txt,sha256=N0tnKVwRqFiffFdeAzCgFq71hUNySh5-ITbNd6-R58Q,6
+ da4ml-0.3.2.dist-info/RECORD,,
@@ -1,107 +0,0 @@
- Metadata-Version: 2.4
- Name: da4ml
- Version: 0.3.0.post1
- Summary: Digital Arithmetic for Machine Learning
- Author-email: Chang Sun <chsun@cern.ch>
- License: GNU Lesser General Public License v3 (LGPLv3)
- Project-URL: repository, https://github.com/calad0i/da4ml
- Keywords: CMVM,distributed arithmetic,hls4ml,MCM,subexpression elimination
- Classifier: Development Status :: 4 - Beta
- Classifier: License :: OSI Approved :: GNU Lesser General Public License v3 (LGPLv3)
- Classifier: Operating System :: OS Independent
- Classifier: Programming Language :: Python :: 3 :: Only
- Classifier: Programming Language :: Python :: 3.10
- Classifier: Programming Language :: Python :: 3.11
- Classifier: Programming Language :: Python :: 3.12
- Classifier: Programming Language :: Python :: 3.13
- Requires-Python: >=3.10
- Description-Content-Type: text/markdown
- License-File: LICENSE
- Requires-Dist: llvmlite>=0.44
- Requires-Dist: numba>=0.61
- Dynamic: license-file
-
- # da4ml: Distributed Arithmetic for Machine Learning
-
- This project performs Constant Matrix-Vector Multiplication (CMVM) with Distributed Arithmetic (DA) for Machine Learning (ML) on a Field Programmable Gate Arrays (FPGAs).
-
- CMVM optimization is done through greedy CSE of two-term subexpressions, with possible Delay Constraints (DC). The optimization is done in jitted Python (Numba), and a list of optimized operations is generated as traced Python code.
-
- The project generates Verilog or Vitis HLS code for the optimized CMVM operations. This project can be used in conjunction with [`hls4ml`](https://github.com/fastmachinelearning/hls4ml/) for optimizing the neural networks deployed on FPGAs. For a subset of neural networks, the full design can be generated standalone in Verilog or Vitis HLS.
-
-
- ## Installation
-
- The project is available on PyPI and can be installed with pip:
-
- ```bash
- pip install da4ml
- ```
-
- Notice that `numba>=6.0.0` is required for the project to work. The project does not work with `python<3.10`. If the project fails to compile, try upgrading `numba` and `llvmlite` to the latest versions.
-
- ## `hls4ml`
-
- The major use of this project is through the `distributed_arithmetic` strategy in the `hls4ml`:
-
- ```python
- model_hls = hls4ml.converters.convert_from_keras_model(
- model,
- hls_config={
- 'Model': {
- ...
- 'Strategy': 'distributed_arithmetic',
- },
- ...
- },
- ...
- )
- ```
-
- Currently, `Dense/Conv1D/Conv2D` layers are supported for both `io_parallel` and `io_stream` dataflows. However, notice that distributed arithmetic implies `reuse_factor=1`, as the whole kernel is implemented in combinational logic.
-
- ## Standalone usage
-
- ### `HGQ2`
-
- For some models trained with `HGQ2`, the `da4ml` can be used to generate the whole model in Verilog or Vitis HLS:
-
- ```python
- from da4ml.codegen import HLSModel, VerilogModel
- from da4ml.converter.hgq2.parser import trace_model
- from da4ml.trace import comb_trace
-
- inp, out = trace_model(hgq2_model)
- comb_logic = comb_trace(inp[0], out[0]) # Currently, only models with 1 input and 1 output are supported
-
- # Pipelined Verilog model generation
- # `latency_cutoff` is used to control auto piplining behavior. To disable pipelining, set it to -1.
- verilog_model = VerilogModel(sol, prj_name='barbar', path='/tmp/barbar', latency_cutoff=5)
- verilog_model.compile() # write and verilator binding
- verilog_model.predict(inputs)
-
- vitis_hls_model = HLSModel(sol, prj_name='foo', path='/tmp/foo', flavor='vitis') # Only vitis is supported for now
- vitis_hls_model.compile() # write and hls binding
- vitis_hls_model.predict(inputs)
- ```
-
- ### Functional Definition
- For generic operations, one can define a combinational logic with the functional API:
-
- ```python
- from da4ml.trace import FixedVariableArray, HWConfig, comb_trace
- from da4ml.trace.ops import einsum, relu, quantize, conv, pool
-
- # k, i, f are numpy arrays of integers: keep_negative (0/1), integer bits (excl. sign), fractional bits
- inp = FixedVariableArray.from_kif(k, i, f, HWConfig(1, -1, -1), solver_options={'hard_dc':2})
- out = inp @ kernel
- out = relu(out)
- out = einsum(equation, out, weights)
- ...
-
- comb = comb_trace(inp, out)
- ```
-
- `+`, `-`, `@` are supported as well as `einsum`, `relu`, `quantize` (WRAP, with TRN or RND), `conv`, `pool` (average only). For multiplications, only power-of-two multipliers are supported, otherwise use `einsum` or `@` operators.
-
- The `comb_trace` returns a `Solution` objects that contains a list of low-level operations that are used to implement the combinational logic, which in turn can be used to generate Verilog or Vitis HLS code.