da4ml 0.2.0__tar.gz → 0.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of da4ml might be problematic. Click here for more details.

Files changed (48):
  1. {da4ml-0.2.0/src/da4ml.egg-info → da4ml-0.2.1}/PKG-INFO +1 -1
  2. {da4ml-0.2.0 → da4ml-0.2.1}/src/da4ml/_version.py +2 -2
  3. {da4ml-0.2.0 → da4ml-0.2.1}/src/da4ml/cmvm/api.py +2 -6
  4. {da4ml-0.2.0 → da4ml-0.2.1}/src/da4ml/cmvm/core/__init__.py +0 -1
  5. {da4ml-0.2.0 → da4ml-0.2.1}/src/da4ml/cmvm/types.py +4 -4
  6. {da4ml-0.2.0 → da4ml-0.2.1}/src/da4ml/codegen/cpp/cpp_codegen.py +2 -2
  7. {da4ml-0.2.0 → da4ml-0.2.1}/src/da4ml/codegen/verilog/io_wrapper.py +3 -3
  8. {da4ml-0.2.0 → da4ml-0.2.1}/src/da4ml/codegen/verilog/pipeline.py +21 -3
  9. {da4ml-0.2.0 → da4ml-0.2.1}/src/da4ml/codegen/verilog/source/build_prj.tcl +0 -1
  10. {da4ml-0.2.0 → da4ml-0.2.1}/src/da4ml/codegen/verilog/verilog_model.py +7 -4
  11. {da4ml-0.2.0 → da4ml-0.2.1}/src/da4ml/trace/fixed_variable.py +2 -2
  12. {da4ml-0.2.0 → da4ml-0.2.1}/src/da4ml/trace/fixed_variable_array.py +15 -5
  13. {da4ml-0.2.0 → da4ml-0.2.1}/src/da4ml/trace/tracer.py +2 -0
  14. {da4ml-0.2.0 → da4ml-0.2.1/src/da4ml.egg-info}/PKG-INFO +1 -1
  15. {da4ml-0.2.0 → da4ml-0.2.1}/.clang-format +0 -0
  16. {da4ml-0.2.0 → da4ml-0.2.1}/.github/workflows/python-publish.yml +0 -0
  17. {da4ml-0.2.0 → da4ml-0.2.1}/.gitignore +0 -0
  18. {da4ml-0.2.0 → da4ml-0.2.1}/.pre-commit-config.yaml +0 -0
  19. {da4ml-0.2.0 → da4ml-0.2.1}/LICENSE +0 -0
  20. {da4ml-0.2.0 → da4ml-0.2.1}/README.md +0 -0
  21. {da4ml-0.2.0 → da4ml-0.2.1}/pyproject.toml +0 -0
  22. {da4ml-0.2.0 → da4ml-0.2.1}/setup.cfg +0 -0
  23. {da4ml-0.2.0 → da4ml-0.2.1}/src/da4ml/__init__.py +0 -0
  24. {da4ml-0.2.0 → da4ml-0.2.1}/src/da4ml/cmvm/__init__.py +0 -0
  25. {da4ml-0.2.0 → da4ml-0.2.1}/src/da4ml/cmvm/core/indexers.py +0 -0
  26. {da4ml-0.2.0 → da4ml-0.2.1}/src/da4ml/cmvm/core/state_opr.py +0 -0
  27. {da4ml-0.2.0 → da4ml-0.2.1}/src/da4ml/cmvm/util/__init__.py +0 -0
  28. {da4ml-0.2.0 → da4ml-0.2.1}/src/da4ml/cmvm/util/bit_decompose.py +0 -0
  29. {da4ml-0.2.0 → da4ml-0.2.1}/src/da4ml/cmvm/util/mat_decompose.py +0 -0
  30. {da4ml-0.2.0 → da4ml-0.2.1}/src/da4ml/codegen/__init__.py +0 -0
  31. {da4ml-0.2.0 → da4ml-0.2.1}/src/da4ml/codegen/cpp/__init__.py +0 -0
  32. {da4ml-0.2.0 → da4ml-0.2.1}/src/da4ml/codegen/cpp/source/vitis.h +0 -0
  33. {da4ml-0.2.0 → da4ml-0.2.1}/src/da4ml/codegen/cpp/source/vitis_bridge.h +0 -0
  34. {da4ml-0.2.0 → da4ml-0.2.1}/src/da4ml/codegen/verilog/__init__.py +0 -0
  35. {da4ml-0.2.0 → da4ml-0.2.1}/src/da4ml/codegen/verilog/comb.py +0 -0
  36. {da4ml-0.2.0 → da4ml-0.2.1}/src/da4ml/codegen/verilog/source/build_binder.mk +0 -0
  37. {da4ml-0.2.0 → da4ml-0.2.1}/src/da4ml/codegen/verilog/source/ioutils.hh +0 -0
  38. {da4ml-0.2.0 → da4ml-0.2.1}/src/da4ml/codegen/verilog/source/shift_adder.v +0 -0
  39. {da4ml-0.2.0 → da4ml-0.2.1}/src/da4ml/codegen/verilog/source/template.xdc +0 -0
  40. {da4ml-0.2.0 → da4ml-0.2.1}/src/da4ml/trace/__init__.py +0 -0
  41. {da4ml-0.2.0 → da4ml-0.2.1}/src/da4ml/trace/ops/__init__.py +0 -0
  42. {da4ml-0.2.0 → da4ml-0.2.1}/src/da4ml/trace/ops/conv_utils.py +0 -0
  43. {da4ml-0.2.0 → da4ml-0.2.1}/src/da4ml/trace/ops/einsum_utils.py +0 -0
  44. {da4ml-0.2.0 → da4ml-0.2.1}/src/da4ml/trace/pipeline.py +0 -0
  45. {da4ml-0.2.0 → da4ml-0.2.1}/src/da4ml.egg-info/SOURCES.txt +0 -0
  46. {da4ml-0.2.0 → da4ml-0.2.1}/src/da4ml.egg-info/dependency_links.txt +0 -0
  47. {da4ml-0.2.0 → da4ml-0.2.1}/src/da4ml.egg-info/requires.txt +0 -0
  48. {da4ml-0.2.0 → da4ml-0.2.1}/src/da4ml.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: da4ml
3
- Version: 0.2.0
3
+ Version: 0.2.1
4
4
  Summary: Digital Arithmetic for Machine Learning
5
5
  Author-email: Chang Sun <chsun@cern.ch>
6
6
  License: GNU Lesser General Public License v3 (LGPLv3)
@@ -17,5 +17,5 @@ __version__: str
17
17
  __version_tuple__: VERSION_TUPLE
18
18
  version_tuple: VERSION_TUPLE
19
19
 
20
- __version__ = version = '0.2.0'
21
- __version_tuple__ = version_tuple = (0, 2, 0)
20
+ __version__ = version = '0.2.1'
21
+ __version_tuple__ = version_tuple = (0, 2, 1)
@@ -140,10 +140,6 @@ def jit_solve(
140
140
  if not method0 == method1 == 'wmc-dc' or decompose_dc >= 0:
141
141
  decompose_dc -= 1
142
142
  continue
143
- if sum([op.cost for op in sol1.ops]) * 4 > sum([op.cost for op in sol0.ops]) and decompose_dc > 0:
144
- # If the second stage is too expensive, the decomposition usually doesn't worth it
145
- decompose_dc -= 1
146
- continue
147
143
  break
148
144
  if max(latencies1) > latency_allowed:
149
145
  # When latency depends on the bw, may happen
@@ -158,8 +154,8 @@ def solve(
158
154
  method1: str = 'auto',
159
155
  hard_dc: int = -1,
160
156
  decompose_dc: int = -2,
161
- qintervals: tuple[QInterval, ...] | None = None,
162
- latencies: tuple[float, ...] | None = None,
157
+ qintervals: list[QInterval] | None = None,
158
+ latencies: list[float] | None = None,
163
159
  adder_size: int = -1,
164
160
  carry_size: int = -1,
165
161
  search_all_decompose_dc: bool = True,
@@ -131,7 +131,6 @@ def to_solution(
131
131
 
132
132
  _global_id = len(ops)
133
133
  for i_out in range(n_out):
134
- heap = []
135
134
  idx, shifts = np.where(expr[:, i_out] != 0)
136
135
  sub = np.empty(len(idx), dtype=np.int64)
137
136
  for i, (i_in, shift) in enumerate(zip(idx, shifts)):
@@ -159,6 +159,8 @@ def _relu(v: 'T', i: int | None = None, f: int | None = None, inv: bool = False,
159
159
  from ..trace.fixed_variable import FixedVariable
160
160
 
161
161
  assert isinstance(v, FixedVariable), f'Unknown type {type(v)} for symbolic relu'
162
+ if inv:
163
+ v = -v
162
164
  return v.relu(i, f, round_mode=round_mode)
163
165
 
164
166
 
@@ -295,9 +297,7 @@ class Solution(NamedTuple):
295
297
  inp_qint = [op.qint for op in self.ops if op.opcode == -1]
296
298
  if quantize: # TRN and WRAP
297
299
  k, i, f = map(np.array, zip(*map(minimal_kif, inp_qint)))
298
- eps = 2.0**-f
299
- _low, _high = -(2.0 ** (i + f)) * k, 2.0 ** (i + f) - 1
300
- inp = eps * ((np.floor(inp / eps) - _low) % 2.0 ** (k + i + f) + _low)
300
+ inp = [_quantize(*x, round_mode='TRN') for x in zip(inp, k, i, f)]
301
301
 
302
302
  inp = inp * (2.0 ** np.array(self.inp_shift))
303
303
  for i, op in enumerate(self.ops):
@@ -561,7 +561,7 @@ class CascadedSolution(NamedTuple):
561
561
  @property
562
562
  def reg_bits(self):
563
563
  """The number of bits used for the register in the solution."""
564
- bits = 0
564
+ bits = sum(map(sum, (_minimal_kif(qint) for qint in self.inp_qint)))
565
565
  for _sol in self.solutions:
566
566
  kifs = [_minimal_kif(qint) for qint in _sol.out_qint]
567
567
  _bits = sum(map(sum, kifs))
@@ -4,13 +4,13 @@ from ...cmvm.types import Op, QInterval, Solution, _minimal_kif
4
4
  from ...trace.fixed_variable import _const_f
5
5
 
6
6
 
7
- def kif_to_vitis_type(k: bool | int, i: int, f: int):
7
+ def kif_to_vitis_type(k: bool | int = 1, i: int = 0, f: int = 0):
8
8
  if k == i == f == 0:
9
9
  f = 1
10
10
  return f'ap_{"" if k else "u"}fixed<{k+i+f},{k+i}>'
11
11
 
12
12
 
13
- def kif_to_hlslib_type(k: bool | int, i: int, f: int):
13
+ def kif_to_hlslib_type(k: bool | int = 1, i: int = 0, f: int = 0):
14
14
  if k == i == f == 0:
15
15
  f = 1
16
16
  return f'ac_fixed<{int(k)},{k+i+f},{k+i}>'
@@ -171,13 +171,13 @@ void inference(int32_t *c_inp, int32_t *c_out, size_t n_samples) {{
171
171
  }}"""
172
172
 
173
173
 
174
- def pipeline_binder_gen(csol: CascadedSolution, module_name: str, II: int = 1):
174
+ def pipeline_binder_gen(csol: CascadedSolution, module_name: str, II: int = 1, latency_multiplier: int = 1):
175
175
  k_in, i_in, f_in = zip(*map(_minimal_kif, csol.inp_qint))
176
176
  k_out, i_out, f_out = zip(*map(_minimal_kif, csol.out_qint))
177
177
  max_inp_bw = max(k + i for k, i in zip(k_in, i_in)) + max(f_in)
178
178
  max_out_bw = max(k + i for k, i in zip(k_out, i_out)) + max(f_out)
179
179
 
180
- n_stage = len(csol.solutions)
180
+ latency = len(csol.solutions) * latency_multiplier
181
181
 
182
182
  n_in, n_out = csol.shape
183
183
  return f"""#include "V{module_name}.h"
@@ -196,7 +196,7 @@ constexpr size_t N_out = {n_out};
196
196
  constexpr size_t max_inp_bw = {max_inp_bw};
197
197
  constexpr size_t max_out_bw = {max_out_bw};
198
198
  constexpr size_t II = {II};
199
- constexpr size_t latency = {n_stage};
199
+ constexpr size_t latency = {latency};
200
200
  typedef V{module_name} dut_t;
201
201
 
202
202
  extern "C" {{
@@ -3,19 +3,37 @@ from .comb import comb_logic_gen
3
3
 
4
4
 
5
5
  def pipeline_logic_gen(
6
- csol: CascadedSolution, name: str, print_latency=False, timescale: str | None = '`timescale 1 ns / 1 ps', reset_high=True
6
+ csol: CascadedSolution,
7
+ name: str,
8
+ print_latency=False,
9
+ timescale: str | None = '`timescale 1 ns / 1 ps',
10
+ register_layers: int = 1,
7
11
  ):
8
12
  N = len(csol.solutions)
9
13
  inp_bits = [sum(map(sum, map(_minimal_kif, sol.inp_qint))) for sol in csol.solutions]
10
14
  out_bits = inp_bits[1:] + [sum(map(sum, map(_minimal_kif, csol.out_qint)))]
11
15
 
12
16
  registers = [f'reg [{width}-1:0] stage{i}_inp;' for i, width in enumerate(inp_bits)]
17
+ for i in range(0, register_layers - 1):
18
+ registers += [f'reg [{width}-1:0] stage{j}_inp_copy{i};' for j, width in enumerate(inp_bits)]
13
19
  wires = [f'wire [{width}-1:0] stage{i}_out;' for i, width in enumerate(out_bits)]
14
20
 
15
21
  comb_logic = [f'{name}_stage{i} stage{i} (.inp(stage{i}_inp), .out(stage{i}_out));' for i in range(N)]
16
22
 
17
- serial_logic = ['stage0_inp <= inp;']
18
- serial_logic += [f'stage{i}_inp <= stage{i-1}_out;' for i in range(1, N)]
23
+ if register_layers == 1:
24
+ serial_logic = ['stage0_inp <= inp;']
25
+ serial_logic += [f'stage{i}_inp <= stage{i-1}_out;' for i in range(1, N)]
26
+ else:
27
+ serial_logic = ['stage0_inp_copy0 <= inp;']
28
+ for j in range(1, register_layers - 1):
29
+ serial_logic.append(f'stage0_inp_copy{j} <= stage0_inp_copy{j-1};')
30
+ serial_logic.append(f'stage0_inp <= stage0_inp_copy{register_layers - 2};')
31
+ for i in range(1, N):
32
+ serial_logic.append(f'stage{i}_inp_copy0 <= stage{i-1}_out;')
33
+ for j in range(1, register_layers - 1):
34
+ serial_logic.append(f'stage{i}_inp_copy{j} <= stage{i}_inp_copy{j-1};')
35
+ serial_logic.append(f'stage{i}_inp <= stage{i}_inp_copy{register_layers - 2};')
36
+
19
37
  serial_logic += [f'out <= stage{N-1}_out;']
20
38
 
21
39
  sep0 = '\n '
@@ -26,7 +26,6 @@ file mkdir "${output_dir}/reports"
26
26
  # synth
27
27
  synth_design -top $top_module -mode out_of_context -retiming \
28
28
  -flatten_hierarchy rebuilt -resource_sharing auto \
29
- -keep_equivalent_registers -shreg_min_size 8 \
30
29
  -directive AlternateRoutability
31
30
 
32
31
  write_checkpoint -force "${output_dir}/${project_name}_post_synth.dcp"
@@ -34,6 +34,7 @@ class VerilogModel:
34
34
  clock_period: int = 5,
35
35
  clock_uncertainty: float = 0.1,
36
36
  io_delay_minmax: tuple[float, float] = (0.2, 0.4),
37
+ register_layers: int = 1,
37
38
  ):
38
39
  self._solution = solution
39
40
  self._path = Path(path)
@@ -45,6 +46,7 @@ class VerilogModel:
45
46
  self._clock_period = clock_period
46
47
  self._clock_uncertainty = clock_uncertainty
47
48
  self._io_delay_minmax = io_delay_minmax
49
+ self._register_layers = register_layers
48
50
 
49
51
  self._pipe = solution if isinstance(solution, CascadedSolution) else None
50
52
  if latency_cutoff > 0 and self._pipe is None:
@@ -62,7 +64,7 @@ class VerilogModel:
62
64
  self._path.mkdir(parents=True, exist_ok=True)
63
65
  if self._pipe is not None: # Pipeline
64
66
  # Main logic
65
- codes = pipeline_logic_gen(self._pipe, self._prj_name, self._print_latency)
67
+ codes = pipeline_logic_gen(self._pipe, self._prj_name, self._print_latency, register_layers=self._register_layers)
66
68
  for k, v in codes.items():
67
69
  with open(self._path / f'{k}.v', 'w') as f:
68
70
  f.write(v)
@@ -86,8 +88,8 @@ class VerilogModel:
86
88
  with open(self._path / f'{self._prj_name}.xdc', 'w') as f:
87
89
  f.write(xdc)
88
90
 
89
- # C++ binder w/
90
- binder = pipeline_binder_gen(self._pipe, f'{self._prj_name}_wrapper', 1)
91
+ # C++ binder w/ verilog wrapper for uniform bw
92
+ binder = pipeline_binder_gen(self._pipe, f'{self._prj_name}_wrapper', 1, self._register_layers)
91
93
 
92
94
  # Verilog IO wrapper (non-uniform bw to uniform one, clk passthrough)
93
95
  io_wrapper = generate_io_wrapper(self._pipe, self._prj_name, True)
@@ -243,11 +245,12 @@ class VerilogModel:
243
245
  in_bits, out_bits = np.sum(kifs_in), np.sum(kifs_out)
244
246
  if self._pipe is not None:
245
247
  n_stage = len(self._pipe[0])
248
+ delay_suffix = '' if self._register_layers == 1 else f'x {self._register_layers} '
246
249
  lat_cutoff = self._latency_cutoff
247
250
  reg_bits = self._pipe.reg_bits
248
251
  spec = f"""Top Module: {self._prj_name}\n====================
249
252
  {inp_size} ({in_bits} bits) -> {out_size} ({out_bits} bits)
250
- {n_stage} stages @ max_delay={lat_cutoff}
253
+ {n_stage} {delay_suffix}stages @ max_delay={lat_cutoff}
251
254
  Estimated cost: {cost} LUTs, {reg_bits} FFs"""
252
255
 
253
256
  else:
@@ -266,7 +266,7 @@ class FixedVariable:
266
266
  step = Decimal(2) ** -f
267
267
  i = ceil(log2(val + step)) if not i else i
268
268
  eps = step / 2 if round_mode == 'RND' else 0
269
- val = floor(val / step + eps) % Decimal(2) ** i * step
269
+ val = (floor(val / step + eps) * step) % (Decimal(2) ** i)
270
270
  return FixedVariable(val, val, step, hwconf=self.hwconf)
271
271
 
272
272
  step = max(Decimal(2) ** -f, self.step) if f is not None else self.step
@@ -324,7 +324,7 @@ class FixedVariable:
324
324
  # bit-exactness will be lost in these cases, but they should never happen (quantizers are used in a weird way)
325
325
  # Keeping this for now; change if absolutely necessary
326
326
  f = min(f, _f)
327
- k = min(k, _k)
327
+ k = min(k, _k) if i >= _i else k
328
328
  i = min(i, _i)
329
329
 
330
330
  step = max(Decimal(2) ** -f, self.step)
@@ -1,6 +1,8 @@
1
+ from inspect import signature
1
2
  from typing import Any
2
3
 
3
4
  import numpy as np
5
+ from numba.typed import List as NumbaList
4
6
  from numpy.typing import NDArray
5
7
 
6
8
  from ..cmvm import solve
@@ -14,7 +16,13 @@ class FixedVariableArray:
14
16
  solver_options: dict[str, Any] | None = None,
15
17
  ):
16
18
  self._vars = np.array(vars)
17
- self.solver_options = solver_options
19
+ _solver_options = signature(solve).parameters
20
+ _solver_options = {k: v.default for k, v in _solver_options.items() if v.default is not v.empty}
21
+ if solver_options is not None:
22
+ _solver_options.update(solver_options)
23
+ _solver_options.pop('qintervals', None)
24
+ _solver_options.pop('latencies', None)
25
+ self.solver_options = _solver_options
18
26
 
19
27
  @classmethod
20
28
  def from_lhs(
@@ -75,8 +83,10 @@ class FixedVariableArray:
75
83
  r = []
76
84
  for i in range(mat0.shape[0]):
77
85
  vec = mat0[i]
78
- qintervals = tuple([QInterval(float(v.low), float(v.high), float(v.step)) for v in vec._vars])
79
- latencies = tuple([float(v.latency) for v in vec._vars])
86
+ _qintervals = [QInterval(float(v.low), float(v.high), float(v.step)) for v in vec._vars]
87
+ _latencies = [float(v.latency) for v in vec._vars]
88
+ qintervals = NumbaList(_qintervals) # type: ignore
89
+ latencies = NumbaList(_latencies) # type: ignore
80
90
  hwconf = self._vars.ravel()[0].hwconf
81
91
  kwargs.update(adder_size=hwconf.adder_size, carry_size=hwconf.carry_size)
82
92
  _mat = np.ascontiguousarray(mat1.astype(np.float32))
@@ -96,8 +106,8 @@ class FixedVariableArray:
96
106
  axes = _axes[ndim0 - 1 :] + _axes[: ndim0 - 1]
97
107
  return r.transpose(axes)
98
108
 
99
- def __getitem__(self, *item):
100
- vars = self._vars[*item]
109
+ def __getitem__(self, item):
110
+ vars = self._vars[item]
101
111
  if isinstance(vars, np.ndarray):
102
112
  return FixedVariableArray(vars, self.solver_options)
103
113
  else:
@@ -101,6 +101,8 @@ def comb_trace(inputs: FixedVariableArray, outputs: FixedVariableArray) -> Solut
101
101
 
102
102
  def comb_trace(inputs, outputs):
103
103
  inputs, outputs = list(np.ravel(inputs)), list(np.ravel(outputs))
104
+ # latency = max(v.latency if isinstance(v, FixedVariable) else 0 for v in outputs)
105
+ # outputs = [v if isinstance(v, FixedVariable) else FixedVariable(v,v,0, latency=latency, opr='const') for v in outputs]
104
106
  ops, out_index = _comb_trace(inputs, outputs)
105
107
  shape = len(inputs), len(outputs)
106
108
  inp_shift = [0] * shape[0]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: da4ml
3
- Version: 0.2.0
3
+ Version: 0.2.1
4
4
  Summary: Digital Arithmetic for Machine Learning
5
5
  Author-email: Chang Sun <chsun@cern.ch>
6
6
  License: GNU Lesser General Public License v3 (LGPLv3)
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes