da4ml 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of da4ml has been flagged as potentially problematic.

da4ml/trace/fixed_variable_array.py CHANGED
@@ -86,6 +86,21 @@ class FixedVariableArray:
             assert bind.arguments.get('out', None) is None, 'Output argument is not supported'
             return einsum(eq, *operands)
 
+        if func in (np.dot, np.matmul):
+            assert len(args) in (2, 3), 'Dot function requires exactly two or three arguments'
+
+            assert len(args) == 2
+            a, b = args
+            if not isinstance(a, FixedVariableArray):
+                a = np.array(a)
+            if not isinstance(b, FixedVariableArray):
+                b = np.array(b)
+            if a.shape[-1] == b.shape[0]:
+                return a @ b
+
+            assert a.size == 1 or b.size == 1, f'Error in dot product: {a.shape} @ {b.shape}'
+            return a * b
+
         args, kwargs = to_raw_arr(args), to_raw_arr(kwargs)
         return FixedVariableArray(
             func(*args, **kwargs),
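The hunk above routes `np.dot` and `np.matmul` through the array-function protocol: matching inner dimensions dispatch to `@`, while a size-1 operand falls back to elementwise multiplication. A minimal sketch of the resulting behavior, assuming a 0.3.3 install; the shapes and k/i/f values are illustrative, while `from_kif` and `HWConfig(1, -1, -1)` come from the package README:

```python
import numpy as np

from da4ml.trace import FixedVariableArray, HWConfig

# Illustrative fixed-point spec: keep_negative, integer bits (excl. sign), fractional bits
k, i, f = np.ones((3, 4)), np.full((3, 4), 3), np.full((3, 4), 4)
x = FixedVariableArray.from_kif(k, i, f, HWConfig(1, -1, -1))

w = np.arange(8).reshape(4, 2) / 4.0  # constant kernel
y = np.matmul(x, w)                   # inner dims match -> dispatches to x @ w
z = np.dot(x, np.array([2.0]))        # size-1 operand -> elementwise multiply fallback
```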
@@ -149,13 +164,33 @@ class FixedVariableArray:
         latency: NDArray[np.floating] | float = 0.0,
         solver_options: dict[str, Any] | None = None,
     ):
+        mask = k + i + f <= 0
+        k = np.where(mask, 0, k)
+        i = np.where(mask, 0, i)
+        f = np.where(mask, 0, f)
         step = 2.0**-f
         _high = 2.0**i
         high, low = _high - step, -_high * k
         return cls.from_lhs(low, high, step, hwconf, latency, solver_options)
 
     def __matmul__(self, other):
-        assert isinstance(other, np.ndarray)
+        if isinstance(other, FixedVariableArray):
+            other = other._vars
+        if not isinstance(other, np.ndarray):
+            other = np.array(other)
+        if any(isinstance(x, FixedVariable) for x in other.ravel()):
+            mat0, mat1 = self._vars, other
+            shape = mat0.shape[:-1] + mat1.shape[1:]
+            mat0, mat1 = mat0.reshape((-1, mat0.shape[-1])), mat1.reshape((mat1.shape[0], -1))
+            _shape = (mat0.shape[0], mat1.shape[1])
+            _vars = np.empty(_shape, dtype=object)
+            for i in range(mat0.shape[0]):
+                for j in range(mat1.shape[1]):
+                    vec0 = mat0[i]
+                    vec1 = mat1[:, j]
+                    _vars[i, j] = reduce(lambda x, y: x + y, vec0 * vec1)
+            return FixedVariableArray(_vars.reshape(shape), self.solver_options)
+
         kwargs = (self.solver_options or {}).copy()
         shape0, shape1 = self.shape, other.shape
         assert shape0[-1] == shape1[0], f'Matrix shapes do not match: {shape0} @ {shape1}'
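With this branch, `@` between two symbolic arrays expands into elementwise variable products accumulated through `reduce(lambda x, y: x + y, vec0 * vec1)`, which the tracer lowers to the new 'vmul' opcode (see the tracer.py hunks below). A hedged sketch with illustrative shapes; `FixedVariableArrayInput` is the input class whose signature is reformatted later in this diff:

```python
from da4ml.trace.fixed_variable_array import FixedVariableArrayInput

a = FixedVariableArrayInput((3, 4))
b = FixedVariableArrayInput((4, 2))
c = a @ b  # 0.3.1 asserted `other` to be an ndarray; now a (3, 2) symbolic result
```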
@@ -180,9 +215,9 @@ class FixedVariableArray:
 
     def __rmatmul__(self, other):
         mat1 = np.moveaxis(other, -1, 0)
-        mat0 = np.moveaxis(self._vars, 0, -1)
+        mat0 = np.moveaxis(self, 0, -1)  # type: ignore
         ndim0, ndim1 = mat0.ndim, mat1.ndim
-        r = FixedVariableArray(mat0, self.solver_options) @ mat1
+        r = mat0 @ mat1
 
         _axes = tuple(range(0, ndim0 + ndim1 - 2))
         axes = _axes[ndim0 - 1 :] + _axes[: ndim0 - 1]
@@ -213,6 +248,8 @@ class FixedVariableArray:
         return FixedVariableArray(self._vars - other, self.solver_options)
 
     def __mul__(self, other):
+        if isinstance(other, FixedVariableArray):
+            return FixedVariableArray(self._vars * other._vars, self.solver_options)
         return FixedVariableArray(self._vars * other, self.solver_options)
 
     def __truediv__(self, other):
@@ -230,6 +267,11 @@ class FixedVariableArray:
         max_lat = max(v.latency for v in self._vars.ravel())
         return f'FixedVariableArray(shape={shape}, hwconf={hwconf_str}, latency={max_lat})'
 
+    def __pow__(self, power: int | float):
+        _power = int(power)
+        assert _power == power, 'Power must be an integer'
+        return FixedVariableArray(self._vars**_power, self.solver_options)
+
     def relu(self, i: NDArray[np.integer] | None = None, f: NDArray[np.integer] | None = None, round_mode: str = 'TRN'):
         shape = self._vars.shape
         i = np.broadcast_to(i, shape) if i is not None else np.full(shape, None)
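Together with the `__mul__` branch above, products of symbolic values are now representable, and `__pow__` builds on that: the exponent must equal its integer cast, and the power is applied elementwise on the underlying object array. A sketch under the assumption that the scalar `FixedVariable` supports `**` with an integer exponent (implied by the enlarged fixed_variable.py in the RECORD below):

```python
x = FixedVariableArrayInput(8)
y = x**2     # elementwise square of the symbolic variables (assumed FixedVariable.__pow__)
y3 = x**3.0  # accepted: 3.0 equals int(3.0)
x**0.5       # AssertionError: 'Power must be an integer'
```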
@@ -241,9 +283,9 @@ class FixedVariableArray:
 
     def quantize(
         self,
-        k: NDArray[np.integer] | None = None,
-        i: NDArray[np.integer] | None = None,
-        f: NDArray[np.integer] | None = None,
+        k: NDArray[np.integer] | np.integer | int | None = None,
+        i: NDArray[np.integer] | np.integer | int | None = None,
+        f: NDArray[np.integer] | np.integer | int | None = None,
         overflow_mode: str = 'WRAP',
         round_mode: str = 'TRN',
     ):
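Since `k`, `i`, and `f` now also admit plain Python or numpy scalars, per-tensor quantization no longer requires pre-broadcast arrays. A hedged usage sketch (modes per the README: WRAP overflow with TRN or RND rounding):

```python
x = FixedVariableArrayInput((4, 4))
xq = x.quantize(k=1, i=3, f=4)                    # scalars instead of shape-(4, 4) arrays
xr = x.quantize(k=1, i=3, f=4, round_mode='RND')  # round-to-nearest variant
```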
@@ -276,6 +318,10 @@ class FixedVariableArray:
     def size(self):
         return self._vars.size
 
+    @property
+    def ndim(self):
+        return self._vars.ndim
+
     @property
     def kif(self):
         shape = self._vars.shape
@@ -284,7 +330,13 @@ class FixedVariableArray:
 
 
 class FixedVariableArrayInput(FixedVariableArray):
-    def __init__(self, shape: tuple[int, ...] | int, hwconf: HWConfig, solver_options: dict[str, Any] | None = None, latency=0.0):
+    def __init__(
+        self,
+        shape: tuple[int, ...] | int,
+        hwconf: HWConfig = HWConfig(1, -1, -1),
+        solver_options: dict[str, Any] | None = None,
+        latency=0.0,
+    ):
         _vars = np.empty(shape, dtype=object)
         _vars_f = _vars.ravel()
         for i in range(_vars.size):
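Besides the multi-line reformat, the signature change gives `hwconf` a default of `HWConfig(1, -1, -1)`, the same configuration the README examples pass explicitly, so a symbolic input can now be declared from a shape alone:

```python
from da4ml.trace import HWConfig
from da4ml.trace.fixed_variable_array import FixedVariableArrayInput

inp = FixedVariableArrayInput((16,))                        # uses the new default hwconf
same = FixedVariableArrayInput((16,), HWConfig(1, -1, -1))  # equivalent explicit form
```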
da4ml/trace/ops/__init__.py CHANGED
@@ -35,9 +35,9 @@ def relu(x: T, i: NDArray[np.integer] | None = None, f: NDArray[np.integer] | No
 
 def quantize(
     x: T,
-    k: NDArray[np.integer],
-    i: NDArray[np.integer],
-    f: NDArray[np.integer],
+    k: NDArray[np.integer] | np.integer | int,
+    i: NDArray[np.integer] | np.integer | int,
+    f: NDArray[np.integer] | np.integer | int,
     overflow_mode: str = 'WRAP',
     round_mode: str = 'TRN',
 ) -> T:
@@ -47,7 +47,7 @@ def quantize(
         return x.quantize(k=k, i=i, f=f, overflow_mode=overflow_mode, round_mode=round_mode)
     else:
         x = x.copy()
-        if overflow_mode in ('SAT', 'SAT_SM'):
+        if overflow_mode in ('SAT', 'SAT_SYM'):
            step = 2.0**-f
            _high = 2.0**i
            high = _high - step
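The string fix matters for the numeric path: 'SAT_SYM' was previously misspelled as 'SAT_SM' in the membership test, so symmetric saturation silently fell through to wrapping for plain arrays. A sketch of the now-working call, with illustrative values:

```python
import numpy as np

from da4ml.trace.ops import quantize

arr = np.array([3.9, -2.7, 1.2])
q = quantize(arr, k=1, i=2, f=1, overflow_mode='SAT_SYM')  # saturates instead of wrapping
```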
da4ml/trace/ops/einsum_utils.py CHANGED
@@ -271,6 +271,10 @@ def _einsum(fn: str, input0, input1) -> np.ndarray:
     return _exec_einsum(recipe, input0, input1)
 
 
+@overload
+def einsum(fn: str, input0: 'FixedVariableArray', input1: 'FixedVariableArray') -> 'FixedVariableArray': ...
+
+
 @overload
 def einsum(fn: str, input0: 'FixedVariableArray', input1: NDArray[np.integer | np.floating]) -> 'FixedVariableArray': ...
 
@@ -290,10 +294,9 @@ def einsum(fn: str, input0, input1):
 
     fg0 = isinstance(input0, FixedVariableArray)
     fg1 = isinstance(input1, FixedVariableArray)
-    if fg0 and fg1:
-        raise ValueError('Einsum does not support two FixedVariableArray inputs')
 
     r = _einsum(fn, input0, input1)
+
     if fg0:
         return FixedVariableArray(r, input0.solver_options)
     elif fg1:
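With the extra overload above and this guard removed, both einsum operands may now be symbolic. A short sketch (equation and shapes illustrative):

```python
from da4ml.trace.fixed_variable_array import FixedVariableArrayInput
from da4ml.trace.ops import einsum

a = FixedVariableArrayInput((4, 3))
b = FixedVariableArrayInput((3, 5))
c = einsum('ij,jk->ik', a, b)  # 0.3.1 raised ValueError for two FixedVariableArray inputs
```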
da4ml/trace/ops/reduce_utils.py CHANGED
@@ -99,5 +99,7 @@ def reduce(operator: Callable[[T, T], T], x: TA, axis: int | Sequence[int] | Non
     r = _arr.reshape(target_shape)  # type: ignore
 
     if isinstance(x, FixedVariableArray):
-        return FixedVariableArray(r, solver_config)
-    return r
+        r = FixedVariableArray(r, solver_config)
+        if r.size == 1 and not keepdims:
+            return r.ravel()[0]  # type: ignore
+    return r if r.size > 1 or keepdims else r.ravel()[0]  # type: ignore
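A behavioral note on this change: a full reduction over a symbolic array now yields the single `FixedVariable` itself rather than a size-1 `FixedVariableArray` when `keepdims` is falsy. A hedged sketch, assuming `axis=None` reduces over all axes (module path taken from the RECORD below):

```python
from operator import add

from da4ml.trace.fixed_variable_array import FixedVariableArrayInput
from da4ml.trace.ops.reduce_utils import reduce

v = FixedVariableArrayInput(8)
s = reduce(add, v, axis=None)  # now a scalar FixedVariable, not a (1,)-shaped array
```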
da4ml/trace/pipeline.py CHANGED
@@ -38,7 +38,7 @@ def _get_new_idx(
     out_idxd: dict[int, list[int]],
     ops: list[Op],
     stage: int,
-    latency_cutoff: int,
+    latency_cutoff: float,
 ):
     if idx < 0:
         return idx
@@ -60,7 +60,7 @@
     return p0_idx
 
 
-def to_pipeline(sol: Solution, latency_cutoff: int, retiming=True, verbose=True) -> CascadedSolution:
+def to_pipeline(sol: Solution, latency_cutoff: float, retiming=True, verbose=True) -> CascadedSolution:
     """Split the record into multiple stages based on the latency of the operations.
     Only useful for HDL generation.
 
@@ -68,7 +68,7 @@ def to_pipeline(sol: Solution, latency_cutoff: int, retiming=True, verbose=True)
     ----------
     sol : Solution
         The solution to be split into multiple stages.
-    latency_cutoff : int
+    latency_cutoff : float
         The latency cutoff for splitting the operations.
     retiming : bool
         Whether to retime the solution after splitting. Default is True.
@@ -126,10 +126,10 @@ def to_pipeline(sol: Solution, latency_cutoff: int, retiming=True, verbose=True)
             locator.append({stage: len(opd[stage]) - 1})
     sols = []
     max_stage = max(opd.keys())
+    n_in = sol.shape[0]
     for i, stage in enumerate(opd.keys()):
         _ops = opd[stage]
         _out_idx = out_idxd[stage]
-        n_in = sum(op.opcode == -1 for op in _ops)
         n_out = len(_out_idx)
 
         if i == max_stage:
@@ -150,6 +150,8 @@ def to_pipeline(sol: Solution, latency_cutoff: int, retiming=True, verbose=True)
             adder_size=sol.adder_size,
         )
         sols.append(_sol)
+
+        n_in = n_out
     csol = CascadedSolution(tuple(sols))
 
     if retiming:
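Two effects of this pair of hunks: fractional latency cutoffs now type-check, and each stage's input width is carried over from the previous stage's output width instead of being recounted from opcodes. A hedged end-to-end sketch built on the README's tracing flow:

```python
from da4ml.trace import comb_trace
from da4ml.trace.fixed_variable_array import FixedVariableArrayInput
from da4ml.trace.pipeline import to_pipeline

inp = FixedVariableArrayInput(4)
out = inp + inp                              # any traced computation
sol = comb_trace(inp, out)                   # a Solution, as in the README example
csol = to_pipeline(sol, latency_cutoff=2.5)  # float cutoff; returns a CascadedSolution
```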
da4ml/trace/tracer.py CHANGED
@@ -17,8 +17,7 @@ def _recursive_gather(v: FixedVariable, gathered: dict[UUID, FixedVariable]):
         return
     assert v._from is not None
     for _v in v._from:
-        if _v.id not in gathered:
-            _recursive_gather(_v, gathered)
+        _recursive_gather(_v, gathered)
     gathered[v.id] = v
 
 
@@ -26,13 +25,24 @@ def gather_variables(inputs: Sequence[FixedVariable], outputs: Sequence[FixedVar
     gathered = {v.id: v for v in inputs}
     for o in outputs:
         _recursive_gather(o, gathered)
-
     variables = list(gathered.values())
 
     N = len(variables)
     _index = sorted(list(range(N)), key=lambda i: variables[i].latency * N + i)
     variables = [variables[i] for i in _index]
-    index = {variables[i].id: i for i in range(N)}
+
+    # Remove variables with 0 refcount
+    refcount = {v.id: 0 for v in variables}
+    for v in variables:
+        if v in inputs:
+            continue
+        for _v in v._from:
+            refcount[_v.id] += 1
+    for v in outputs:
+        refcount[v.id] += 1
+
+    variables = [v for v in variables if refcount[v.id] > 0]
+    index = {variables[i].id: i for i in range(len(variables))}
 
     return variables, index
 
@@ -44,7 +54,7 @@ def _comb_trace(inputs: Sequence[FixedVariable], outputs: Sequence[FixedVariable
     for i, v in enumerate(variables):
         if v.id in inp_uuids and v.opr != 'const':
             id0 = inp_uuids[v.id]
-            ops.append(Op(id0, -1, -1, 0, v.unscaled.qint, v.latency, v.cost))
+            ops.append(Op(id0, -1, -1, 0, v.unscaled.qint, v.latency, 0.0))
             continue
         if v.opr == 'new':
             raise NotImplementedError('Operation "new" is only expected in the input list')
@@ -56,7 +66,7 @@
                 sub = int(f1 < 0)
                 data = int(log2(abs(f1 / f0)))
                 assert id0 < i and id1 < i, f'{id0} {id1} {i} {v.id}'
-                ops.append(Op(id0, id1, sub, data, v.unscaled.qint, v.latency, v.cost))
+                op = Op(id0, id1, sub, data, v.unscaled.qint, v.latency, v.cost)
             case 'cadd':
                 v0 = v._from[0]
                 f0 = v0._factor
@@ -65,19 +75,19 @@
                 qint = v.unscaled.qint
                 data = int(v._data / Decimal(qint.step))
                 assert id0 < i, f'{id0} {i} {v.id}'
-                ops.append(Op(id0, -1, 4, data, qint, v.latency, v.cost))
+                op = Op(id0, -1, 4, data, qint, v.latency, v.cost)
             case 'wrap':
                 v0 = v._from[0]
                 id0 = index[v0.id]
                 assert id0 < i, f'{id0} {i} {v.id}'
                 opcode = -3 if v._from[0]._factor < 0 else 3
-                ops.append(Op(id0, -1, opcode, 0, v.unscaled.qint, v.latency, v.cost))
+                op = Op(id0, -1, opcode, 0, v.unscaled.qint, v.latency, v.cost)
             case 'relu':
                 v0 = v._from[0]
                 id0 = index[v0.id]
                 assert id0 < i, f'{id0} {i} {v.id}'
                 opcode = -2 if v._from[0]._factor < 0 else 2
-                ops.append(Op(id0, -1, opcode, 0, v.unscaled.qint, v.latency, v.cost))
+                op = Op(id0, -1, opcode, 0, v.unscaled.qint, v.latency, v.cost)
             case 'const':
                 qint = v.unscaled.qint
                 assert qint.min == qint.max, f'const {v.id} {qint.min} {qint.max}'
@@ -85,7 +95,7 @@
                 step = 2.0**-f
                 qint = QInterval(qint.min, qint.min, step)
                 data = qint.min / step
-                ops.append(Op(-1, -1, 5, int(data), qint, v.latency, v.cost))
+                op = Op(-1, -1, 5, int(data), qint, v.latency, v.cost)
             case 'msb_mux':
                 qint = v.unscaled.qint
                 key, in0, in1 = v._from
@@ -97,10 +107,14 @@
                 assert idk < i and id0 < i and id1 < i
                 assert key._factor > 0, f'Cannot mux on v{key.id} with negative factor {key._factor}'
                 op = Op(id0, id1, opcode, data, qint, v.latency, v.cost)
-                ops.append(op)
-
+            case 'vmul':
+                v0, v1 = v._from
+                id0, id1 = index[v0.id], index[v1.id]
+                op = Op(id0, id1, 7, 0, v.unscaled.qint, v.latency, v.cost)
             case _:
                 raise NotImplementedError(f'Operation "{v.opr}" is not supported in tracing')
+
+        ops.append(op)
     out_index = [index[v.id] for v in outputs]
     return ops, out_index
 
@@ -147,6 +161,6 @@ def comb_trace(inputs, outputs):
     for i in range(len(ops)):
         if ref_count[i] == 0:
             op = ops[i]
-            sol.ops[i] = Op(-1, -1, op[2], 0, QInterval(0, 0, 1), op[5], op[6])
+            sol.ops[i] = Op(-1, -1, 5, 0, QInterval(0, 0, 1), op[5], 0.0)
 
     return sol
da4ml-0.3.3.dist-info/METADATA ADDED
@@ -0,0 +1,66 @@
+Metadata-Version: 2.4
+Name: da4ml
+Version: 0.3.3
+Summary: Digital Arithmetic for Machine Learning
+Author-email: Chang Sun <chsun@cern.ch>
+License: GNU Lesser General Public License v3 (LGPLv3)
+Project-URL: repository, https://github.com/calad0i/da4ml
+Keywords: CMVM,distributed arithmetic,hls4ml,MCM,subexpression elimination
+Classifier: Development Status :: 4 - Beta
+Classifier: License :: OSI Approved :: GNU Lesser General Public License v3 (LGPLv3)
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3 :: Only
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Requires-Python: >=3.10
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: llvmlite>=0.44
+Requires-Dist: numba>=0.61
+Provides-Extra: docs
+Requires-Dist: hgq2; extra == "docs"
+Requires-Dist: myst-parser; extra == "docs"
+Requires-Dist: pyparsing; extra == "docs"
+Requires-Dist: sphinx; extra == "docs"
+Requires-Dist: sphinx-rtd-theme; extra == "docs"
+Dynamic: license-file
+
+# da4ml: Distributed Arithmetic for Machine Learning
+
+[![LGPLv3](https://img.shields.io/badge/License-LGPLv3-blue.svg)](https://www.gnu.org/licenses/lgpl-3.0)
+[![Documentation](https://github.com/calad0i/da4ml/actions/workflows/sphinx-build.yml/badge.svg)](https://calad0i.github.io/da4ml/)
+[![PyPI version](https://badge.fury.io/py/da4ml.svg)](https://badge.fury.io/py/da4ml)
+[![ArXiv](https://img.shields.io/badge/arXiv-2507.04535-b31b1b.svg)](https://arxiv.org/abs/2507.04535)
+
+da4ml is a library for implementing distributed arithmetic (DA) based algorithms for ultra-low latency machine learning (ML) applications on FPGAs. It has two major components:
+- A fast and performant optimizer for constant-matrix-vector multiplications (CMVM) that implements them as efficient adder trees. Common subexpression elimination (CSE) with graph-based pre-optimization is performed to reduce the firmware footprint and improve performance.
+- A low-level symbolic tracing framework for generating combinational or fully pipelined logic in HDL or HLS code. For fully pipelined networks, da4ml can generate the firmware for the whole network standalone. Alternatively, da4ml can be used as a plugin in hls4ml to optimize the CMVM operations in the network.
+
+
+Key Features
+------------
+
+- **Optimized Algorithms**: Compared to hls4ml's latency strategy, da4ml's CMVM implementation uses no DSPs and consumes up to 50% fewer LUTs.
+- **Fast code generation**: da4ml can generate HDL for a fully pipelined network in seconds. For the same models, high-level synthesis tools like Vivado/Vitis HLS can take up to days to generate the HDL code.
+- **Low-level symbolic tracing**: As long as an operation can be expressed as a combination of the supported low-level operations, adding it is straightforward: "replay" the operation on the symbolic tensor provided. In most cases, adding support for a new operation/layer takes just a few lines of numpy-flavored code.
+- **Automatic model conversion**: da4ml can automatically convert models trained in [HGQ2](https://github.com/calad0i/hgq2).
+- **Bit-accurate Simulation**: All operations in da4ml are bit-accurate, meaning the generated HDL code will produce the same output as the original model. da4ml's computation is converted to a RISC-like, instruction-set-level intermediate representation, the distributed arithmetic instruction set (DAIS), which can be easily simulated in multiple ways.
+- **hls4ml integration**: da4ml can be used as a plugin in hls4ml to optimize the CMVM operations in the network by setting `strategy='distributed_arithmetic'` for the Dense, EinsumDense, or Conv1/2D layers.
+
+Installation
+------------
+
+```bash
+pip install da4ml
+```
+
+Getting Started
+---------------
+
+See the [Getting Started](https://calad0i.github.io/da4ml/getting_started.html) guide for a quick introduction to using da4ml.
da4ml-0.3.1.dist-info/RECORD → da4ml-0.3.3.dist-info/RECORD RENAMED
@@ -1,8 +1,8 @@
 da4ml/__init__.py,sha256=IETRRvzsJvPMLu1kzzi8UN5FYaM5MhNaXH2A_ZKr2_w,469
-da4ml/_version.py,sha256=lOWWIGJeBi0KkFopWU_n3GH71C1PsaZ-ZYDfxFkne6c,511
+da4ml/_version.py,sha256=lemL_4Kl75FgrO6lVuFrrtw6-Dcf9wtXBalKkXuzkO4,704
 da4ml/cmvm/__init__.py,sha256=4Tbt913k9zP0w8R1p6Oss06v5jrManbUhskyHl6e-U0,154
 da4ml/cmvm/api.py,sha256=JpecMt6g8zutGh_uWT61_0iX8TuXct7-jq7N7HMIsgA,9626
-da4ml/cmvm/types.py,sha256=hdthYdP5muIQ-9qFE0CjObGT7lCxB1-udXU16LxtuBI,20959
+da4ml/cmvm/types.py,sha256=O8BuBZ2SyucxoXt_KbulAuHNgim7Ls3M6Ovw8prLgXM,21340
 da4ml/cmvm/core/__init__.py,sha256=bp2CXI4EOVOQSho1qwfusNs0RliZRt2dV0hZ33W_Kjo,7703
 da4ml/cmvm/core/indexers.py,sha256=QjXgvExS-B2abHTJPDG4NufMdMEflo1i6cUhFOgJpH4,2945
 da4ml/cmvm/core/state_opr.py,sha256=wLqO8qVuM2-qCE5LDeYJDNkUruIPHy63obsv4-x-aR8,8661
@@ -11,11 +11,11 @@ da4ml/cmvm/util/bit_decompose.py,sha256=SUco70HRYf4r1JU6BXwcgabDrhm_yAmucae5FC67
 da4ml/cmvm/util/mat_decompose.py,sha256=eSJNlXwx_jxgqt5vLJrSLQaeq2ZXu8j9mC4d-eq883M,4094
 da4ml/codegen/__init__.py,sha256=Chdh3oO_vLR4saLbT9VxBPz_0wlEzxJldFSZaVUJo7U,331
 da4ml/codegen/cpp/__init__.py,sha256=SIePoi_T4iJph50OQUosAnaVuLCckukYjLxp91Y8xQs,134
-da4ml/codegen/cpp/cpp_codegen.py,sha256=6lBF1I-xXdIABEWF60owBmQiISuI6mrITCqLqhsEHrQ,6033
+da4ml/codegen/cpp/cpp_codegen.py,sha256=I3YcxK524_oJ7jebxOlRGuYbN2uCY5mpKACoQShqZxs,6153
 da4ml/codegen/cpp/hls_model.py,sha256=J5lnB8sAvMy0Bo5MSJOpgyUm1tzEJqBxgPTlOd38Gbg,8978
-da4ml/codegen/cpp/source/binder_util.hh,sha256=pBVmhXIDvdCr8n2wwYehc3Fpp60sWYrrZaDoP3x9JZE,1880
+da4ml/codegen/cpp/source/binder_util.hh,sha256=ClECVxcEynE_9i4jWCV4y1dnadG3wFqLZfjxg4qHFQQ,1752
 da4ml/codegen/cpp/source/build_binder.mk,sha256=RLu4TP28aJsveyMOHxuDRGEJVoIPMo9T8WyPtqnmtbQ,584
-da4ml/codegen/cpp/source/vitis_bitshift.hh,sha256=yFpYCVJ8gof-EzPjkIWWZYmdFh_wk133Pxzs7f61IQo,774
+da4ml/codegen/cpp/source/vitis_bitshift.hh,sha256=u8wjT_cRn7bXcbC5pH3-rS76ekRbwv-VWAAdaP52-dw,765
 da4ml/codegen/cpp/source/ap_types/ap_binary.h,sha256=yOcafu2IofstDqxn0wDq8vY3JIwZQ9H5z6IY1dEqMr0,2764
 da4ml/codegen/cpp/source/ap_types/ap_common.h,sha256=1hJY9uvKOdwRSSll5uehUISZR4tsSsQ1z4PNRUc44KU,10180
 da4ml/codegen/cpp/source/ap_types/ap_decl.h,sha256=z1HsH-2RSvSoofTZR7RHeqIfAnEYVuHcIu_ute9gjEg,6473
@@ -33,32 +33,34 @@ da4ml/codegen/cpp/source/ap_types/hls_stream.h,sha256=NTkVfbE48c6XnMIfR9WzJbDwUn
 da4ml/codegen/cpp/source/ap_types/etc/ap_private.h,sha256=TDdxGIX0r3D6Ql8KeXoceRmHhdlwFA3Akr3-vvMVAtk,261465
 da4ml/codegen/cpp/source/ap_types/utils/x_hls_utils.h,sha256=x24cf1HyZKv0J8YQIoUvYE3uw6SNL7vWetRGIiFm2Jw,2227
 da4ml/codegen/verilog/__init__.py,sha256=rXmW2V9sDp2RYMDAWlhj_gfMXH3G5lPNmLrFtsJjn_A,298
-da4ml/codegen/verilog/comb.py,sha256=CmCwiddeiT4TCZV088lF2ENlAXx3vjZKszTz1sYXEao,7614
+da4ml/codegen/verilog/comb.py,sha256=AnrfJxJXe3hytXiX00VGbdW91AAJDF-dLdsSSWBivdU,7961
 da4ml/codegen/verilog/io_wrapper.py,sha256=SSs-ZRhBVLR6tpFso8GNGk-FH6JDe-p7LPvVPjTspxo,5002
 da4ml/codegen/verilog/pipeline.py,sha256=YsPRTLp04Aofg33QMw6_ga3fNX9LeCD7Pq2PnERLWOg,2377
-da4ml/codegen/verilog/verilog_model.py,sha256=3ZFaHqx1ONX3uxDKsbzLPxy3D7dehveRmdBfBiiS64o,12299
-da4ml/codegen/verilog/source/binder_util.hh,sha256=Dn9ysUdonw0HR8bxom8YfQF7vc1LEvT_B1V_o8Gw1rY,2503
+da4ml/codegen/verilog/verilog_model.py,sha256=2uyrpQN_f1cdF5fz0fBR5nh6idHlzhh_JneLkJAruQs,12172
+da4ml/codegen/verilog/source/binder_util.hh,sha256=2sab9M0vYBsaimzJ8tWJ9LsxYKMe3xTqdFSGO7YRPbk,2521
 da4ml/codegen/verilog/source/build_binder.mk,sha256=rQbI98itE_b1wIQ_0uCXfBzNmGK2XT4vWmRyCJNnPKk,960
 da4ml/codegen/verilog/source/build_prj.tcl,sha256=JA-zLl7fd2PV-BFaX22-MTex04QTi0urWUXNAEUDTy0,3003
-da4ml/codegen/verilog/source/ioutil.hh,sha256=1o1-oIyQyYc9CU91bBxuitVzzcrNT8p4MTarFKiJoG4,3967
+da4ml/codegen/verilog/source/ioutil.hh,sha256=QXiHbOfkprOL6b-gBQGwcEOQ39uO-bRxKxwObluiK44,3967
+da4ml/codegen/verilog/source/multiplier.v,sha256=MfgRYi7jYPp4W94KLKWpc2MPu2Dg9CDiQ3lJizSIlIQ,1122
 da4ml/codegen/verilog/source/mux.v,sha256=1PMSQKGR_Cku1EQnePBVCuX6we_dqYBXW54WBEURvs0,1928
 da4ml/codegen/verilog/source/negative.v,sha256=YphTCLnYslktsnCPq1xjbYgIFavani5NBbqs20uwhBI,688
 da4ml/codegen/verilog/source/shift_adder.v,sha256=qrpXBX9bhHI-o75v5zshOfq0giEATvbeGgTir20_S3Q,1915
 da4ml/codegen/verilog/source/template.xdc,sha256=GlSRy8tw_orohSuUwUSNEYJLLkAAHttGTfLTcQqRQDg,1262
-da4ml/converter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-da4ml/converter/hgq2/parser.py,sha256=bAtnEXQxRKU9a1HFJWTy-e_HDzZY_wXOBVdyYG3ndsM,3826
-da4ml/converter/hgq2/replica.py,sha256=VzasMOasU73ZQr0xH2e8uOeKVFX_Is0n9aZFscGKcik,13864
+da4ml/converter/__init__.py,sha256=x7J2PEXYZsVWffRAkucLxbwzzU404eaijMdLwdhBxtY,57
+da4ml/converter/hgq2/__init__.py,sha256=-gnT_7zXY-KQtPLxsqngwDKZ2TUIynn996pUjjB03B8,59
+da4ml/converter/hgq2/parser.py,sha256=Yc5V-B_aEslqIXXJihRi3GMjF9vMkmUQ2_yHMGHMPVo,5573
+da4ml/converter/hgq2/replica.py,sha256=aKi6BF2x4s3VUF1Q-__GE4-is9eSC3H8TGFDT05vTWc,16292
 da4ml/trace/__init__.py,sha256=dv-rti3t8iE0RqeThfOb40mAg8FZB2WkkGQq3enJft0,282
-da4ml/trace/fixed_variable.py,sha256=6dfMHBN1NfqYIbPZ79GCPCXj2JFQUKTyDZu6xDaG3rg,17082
-da4ml/trace/fixed_variable_array.py,sha256=A0ApTvZxpkr7kHrUQkyhrGJuuPe4kDgLFyD_1CW7lBk,10985
-da4ml/trace/pipeline.py,sha256=_R2uqWgnpuQ4tD7VKz2eu8CF9Air2RtYH2o03Vfg0Mk,5353
-da4ml/trace/tracer.py,sha256=NqPEH9hyVlGQOf9_kJL3A7SujCcxkT-z28bk0Ael5jE,5664
-da4ml/trace/ops/__init__.py,sha256=I4VqB43lVkFlLtkoWxiSDHBFGvxKwutNbAJw5aLVeAI,2108
+da4ml/trace/fixed_variable.py,sha256=7vaXFZToCVzPtUZcHv4aoqpqJp46SHUzSWTQijVT0os,21101
+da4ml/trace/fixed_variable_array.py,sha256=mJj9aU-jLCPVkFXrTbcRQndtUKEuhVwiFUGVSGX7PHE,12975
+da4ml/trace/pipeline.py,sha256=AVeO9BNpQlo_WO6S1nQl7RxiHs5VFRR10tWMg_36C2o,5354
+da4ml/trace/tracer.py,sha256=xnaVO4oTWwasfiEBqqeY9o60Lek3eX65IIbvB7JtVKQ,6099
+da4ml/trace/ops/__init__.py,sha256=fz5Cg7ZQqPkZlUj4bIOKY6aaoA1fX_G22TeA8I1n4qY,2166
 da4ml/trace/ops/conv_utils.py,sha256=Yn73t4F6Tcs1hBwK08L1DPOin2HYVcng4PSkU4vuZFo,8245
-da4ml/trace/ops/einsum_utils.py,sha256=MoWvOfvtVjXGwqEhXEzZ3uGrgSmLTHngV8I1eLyANGE,11433
-da4ml/trace/ops/reduce_utils.py,sha256=8gohGQRVr8Bn5rfyrGsnE8EDxUXAObv521qu4mJrX9I,3348
-da4ml-0.3.1.dist-info/licenses/LICENSE,sha256=46mU2C5kSwOnkqkw9XQAJlhBL2JAf1_uCD8lVcXyMRg,7652
-da4ml-0.3.1.dist-info/METADATA,sha256=3H1yt5sKqrIncAGok6NqE27O66_yD7hPUN6jFmCdMqQ,4569
-da4ml-0.3.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-da4ml-0.3.1.dist-info/top_level.txt,sha256=N0tnKVwRqFiffFdeAzCgFq71hUNySh5-ITbNd6-R58Q,6
-da4ml-0.3.1.dist-info/RECORD,,
+da4ml/trace/ops/einsum_utils.py,sha256=ODofbvR98FwKBTDZsJ0ObbMjU9_GjPu5AbGuWX6sdCY,11453
+da4ml/trace/ops/reduce_utils.py,sha256=vQjEUUbvnW8inAYJWHDzgy-PbgwIdHlH-uzPzSEvrSc,3494
+da4ml-0.3.3.dist-info/licenses/LICENSE,sha256=46mU2C5kSwOnkqkw9XQAJlhBL2JAf1_uCD8lVcXyMRg,7652
+da4ml-0.3.3.dist-info/METADATA,sha256=C3NAvObpQ5xNOmQQ-cE77AJMFevKJ0gCCO-BrlQpAeA,4055
+da4ml-0.3.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+da4ml-0.3.3.dist-info/top_level.txt,sha256=N0tnKVwRqFiffFdeAzCgFq71hUNySh5-ITbNd6-R58Q,6
+da4ml-0.3.3.dist-info/RECORD,,
da4ml-0.3.1.dist-info/METADATA DELETED
@@ -1,107 +0,0 @@
-Metadata-Version: 2.4
-Name: da4ml
-Version: 0.3.1
-Summary: Digital Arithmetic for Machine Learning
-Author-email: Chang Sun <chsun@cern.ch>
-License: GNU Lesser General Public License v3 (LGPLv3)
-Project-URL: repository, https://github.com/calad0i/da4ml
-Keywords: CMVM,distributed arithmetic,hls4ml,MCM,subexpression elimination
-Classifier: Development Status :: 4 - Beta
-Classifier: License :: OSI Approved :: GNU Lesser General Public License v3 (LGPLv3)
-Classifier: Operating System :: OS Independent
-Classifier: Programming Language :: Python :: 3 :: Only
-Classifier: Programming Language :: Python :: 3.10
-Classifier: Programming Language :: Python :: 3.11
-Classifier: Programming Language :: Python :: 3.12
-Classifier: Programming Language :: Python :: 3.13
-Requires-Python: >=3.10
-Description-Content-Type: text/markdown
-License-File: LICENSE
-Requires-Dist: llvmlite>=0.44
-Requires-Dist: numba>=0.61
-Dynamic: license-file
-
-# da4ml: Distributed Arithmetic for Machine Learning
-
-This project performs Constant Matrix-Vector Multiplication (CMVM) with Distributed Arithmetic (DA) for Machine Learning (ML) on Field Programmable Gate Arrays (FPGAs).
-
-CMVM optimization is done through greedy CSE of two-term subexpressions, with possible Delay Constraints (DC). The optimization is done in jitted Python (Numba), and a list of optimized operations is generated as traced Python code.
-
-The project generates Verilog or Vitis HLS code for the optimized CMVM operations. This project can be used in conjunction with [`hls4ml`](https://github.com/fastmachinelearning/hls4ml/) for optimizing the neural networks deployed on FPGAs. For a subset of neural networks, the full design can be generated standalone in Verilog or Vitis HLS.
-
-
-## Installation
-
-The project is available on PyPI and can be installed with pip:
-
-```bash
-pip install da4ml
-```
-
-Notice that `numba>=6.0.0` is required for the project to work. The project does not work with `python<3.10`. If the project fails to compile, try upgrading `numba` and `llvmlite` to the latest versions.
-
-## `hls4ml`
-
-The major use of this project is through the `distributed_arithmetic` strategy in `hls4ml`:
-
-```python
-model_hls = hls4ml.converters.convert_from_keras_model(
-    model,
-    hls_config={
-        'Model': {
-            ...
-            'Strategy': 'distributed_arithmetic',
-        },
-        ...
-    },
-    ...
-)
-```
-
-Currently, `Dense/Conv1D/Conv2D` layers are supported for both `io_parallel` and `io_stream` dataflows. However, notice that distributed arithmetic implies `reuse_factor=1`, as the whole kernel is implemented in combinational logic.
-
-## Standalone usage
-
-### `HGQ2`
-
-For some models trained with `HGQ2`, da4ml can be used to generate the whole model in Verilog or Vitis HLS:
-
-```python
-from da4ml.codegen import HLSModel, VerilogModel
-from da4ml.converter.hgq2.parser import trace_model
-from da4ml.trace import comb_trace
-
-inp, out = trace_model(hgq2_model)
-comb_logic = comb_trace(inp[0], out[0])  # Currently, only models with 1 input and 1 output are supported
-
-# Pipelined Verilog model generation
-# `latency_cutoff` is used to control auto pipelining behavior. To disable pipelining, set it to -1.
-verilog_model = VerilogModel(comb_logic, prj_name='barbar', path='/tmp/barbar', latency_cutoff=5)
-verilog_model.compile()  # write and verilator binding
-verilog_model.predict(inputs)
-
-vitis_hls_model = HLSModel(comb_logic, prj_name='foo', path='/tmp/foo', flavor='vitis')  # Only vitis is supported for now
-vitis_hls_model.compile()  # write and hls binding
-vitis_hls_model.predict(inputs)
-```
-
-### Functional Definition
-For generic operations, one can define combinational logic with the functional API:
-
-```python
-from da4ml.trace import FixedVariableArray, HWConfig, comb_trace
-from da4ml.trace.ops import einsum, relu, quantize, conv, pool
-
-# k, i, f are numpy arrays of integers: keep_negative (0/1), integer bits (excl. sign), fractional bits
-inp = FixedVariableArray.from_kif(k, i, f, HWConfig(1, -1, -1), solver_options={'hard_dc':2})
-out = inp @ kernel
-out = relu(out)
-out = einsum(equation, out, weights)
-...
-
-comb = comb_trace(inp, out)
-```
-
-`+`, `-`, and `@` are supported, as well as `einsum`, `relu`, `quantize` (WRAP, with TRN or RND), `conv`, and `pool` (average only). For multiplications, only power-of-two multipliers are supported; otherwise use the `einsum` or `@` operators.
-
-`comb_trace` returns a `Solution` object containing the list of low-level operations that implement the combinational logic, which in turn can be used to generate Verilog or Vitis HLS code.