da4ml 0.3.2__tar.gz → 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of da4ml might be problematic. Click here for more details.

Files changed (106) hide show
  1. {da4ml-0.3.2/src/da4ml.egg-info → da4ml-0.4.0}/PKG-INFO +2 -2
  2. {da4ml-0.3.2 → da4ml-0.4.0}/docs/faq.md +3 -0
  3. {da4ml-0.3.2 → da4ml-0.4.0}/docs/getting_started.md +1 -1
  4. {da4ml-0.3.2 → da4ml-0.4.0}/pyproject.toml +1 -1
  5. {da4ml-0.3.2 → da4ml-0.4.0}/src/da4ml/_version.py +3 -3
  6. da4ml-0.4.0/src/da4ml/codegen/__init__.py +9 -0
  7. da4ml-0.4.0/src/da4ml/codegen/hls/__init__.py +4 -0
  8. da4ml-0.3.2/src/da4ml/codegen/cpp/cpp_codegen.py → da4ml-0.4.0/src/da4ml/codegen/hls/hls_codegen.py +19 -12
  9. {da4ml-0.3.2/src/da4ml/codegen/cpp → da4ml-0.4.0/src/da4ml/codegen/hls}/hls_model.py +7 -7
  10. da4ml-0.4.0/src/da4ml/codegen/hls/source/binder_util.hh +50 -0
  11. {da4ml-0.3.2/src/da4ml/codegen/cpp → da4ml-0.4.0/src/da4ml/codegen/hls}/source/vitis_bitshift.hh +5 -3
  12. da4ml-0.4.0/src/da4ml/codegen/rtl/__init__.py +15 -0
  13. {da4ml-0.3.2/src/da4ml/codegen/verilog/source → da4ml-0.4.0/src/da4ml/codegen/rtl/common_source}/binder_util.hh +4 -4
  14. {da4ml-0.3.2/src/da4ml/codegen/verilog/source → da4ml-0.4.0/src/da4ml/codegen/rtl/common_source}/build_binder.mk +7 -1
  15. {da4ml-0.3.2/src/da4ml/codegen/verilog/source → da4ml-0.4.0/src/da4ml/codegen/rtl/common_source}/build_prj.tcl +28 -7
  16. da4ml-0.3.2/src/da4ml/codegen/verilog/verilog_model.py → da4ml-0.4.0/src/da4ml/codegen/rtl/rtl_model.py +87 -16
  17. {da4ml-0.3.2/src/da4ml/codegen → da4ml-0.4.0/src/da4ml/codegen/rtl}/verilog/__init__.py +0 -2
  18. {da4ml-0.3.2/src/da4ml/codegen → da4ml-0.4.0/src/da4ml/codegen/rtl}/verilog/comb.py +32 -34
  19. {da4ml-0.3.2/src/da4ml/codegen → da4ml-0.4.0/src/da4ml/codegen/rtl}/verilog/io_wrapper.py +8 -8
  20. {da4ml-0.3.2/src/da4ml/codegen → da4ml-0.4.0/src/da4ml/codegen/rtl}/verilog/pipeline.py +10 -10
  21. {da4ml-0.3.2/src/da4ml/codegen → da4ml-0.4.0/src/da4ml/codegen/rtl}/verilog/source/negative.v +2 -1
  22. da4ml-0.4.0/src/da4ml/codegen/rtl/vhdl/__init__.py +10 -0
  23. da4ml-0.4.0/src/da4ml/codegen/rtl/vhdl/comb.py +192 -0
  24. da4ml-0.4.0/src/da4ml/codegen/rtl/vhdl/io_wrapper.py +157 -0
  25. da4ml-0.4.0/src/da4ml/codegen/rtl/vhdl/pipeline.py +71 -0
  26. da4ml-0.4.0/src/da4ml/codegen/rtl/vhdl/source/multiplier.vhd +40 -0
  27. da4ml-0.4.0/src/da4ml/codegen/rtl/vhdl/source/mux.vhd +102 -0
  28. da4ml-0.4.0/src/da4ml/codegen/rtl/vhdl/source/negative.vhd +35 -0
  29. da4ml-0.4.0/src/da4ml/codegen/rtl/vhdl/source/shift_adder.vhd +101 -0
  30. da4ml-0.4.0/src/da4ml/codegen/rtl/vhdl/source/template.xdc +32 -0
  31. {da4ml-0.3.2 → da4ml-0.4.0}/src/da4ml/converter/hgq2/parser.py +4 -2
  32. {da4ml-0.3.2 → da4ml-0.4.0}/src/da4ml/trace/fixed_variable.py +4 -0
  33. {da4ml-0.3.2 → da4ml-0.4.0}/src/da4ml/trace/fixed_variable_array.py +4 -0
  34. {da4ml-0.3.2 → da4ml-0.4.0}/src/da4ml/trace/ops/reduce_utils.py +3 -3
  35. {da4ml-0.3.2 → da4ml-0.4.0/src/da4ml.egg-info}/PKG-INFO +2 -2
  36. da4ml-0.4.0/src/da4ml.egg-info/SOURCES.txt +100 -0
  37. da4ml-0.3.2/src/da4ml/codegen/__init__.py +0 -12
  38. da4ml-0.3.2/src/da4ml/codegen/cpp/__init__.py +0 -4
  39. da4ml-0.3.2/src/da4ml/codegen/cpp/source/binder_util.hh +0 -56
  40. da4ml-0.3.2/src/da4ml.egg-info/SOURCES.txt +0 -90
  41. {da4ml-0.3.2 → da4ml-0.4.0}/.clang-format +0 -0
  42. {da4ml-0.3.2 → da4ml-0.4.0}/.github/workflows/python-publish.yml +0 -0
  43. {da4ml-0.3.2 → da4ml-0.4.0}/.github/workflows/sphinx-build.yml +0 -0
  44. {da4ml-0.3.2 → da4ml-0.4.0}/.gitignore +0 -0
  45. {da4ml-0.3.2 → da4ml-0.4.0}/.pre-commit-config.yaml +0 -0
  46. {da4ml-0.3.2 → da4ml-0.4.0}/LICENSE +0 -0
  47. {da4ml-0.3.2 → da4ml-0.4.0}/README.md +0 -0
  48. {da4ml-0.3.2 → da4ml-0.4.0}/docs/Makefile +0 -0
  49. {da4ml-0.3.2 → da4ml-0.4.0}/docs/_static/example.svg +0 -0
  50. {da4ml-0.3.2 → da4ml-0.4.0}/docs/_static/icon.svg +0 -0
  51. {da4ml-0.3.2 → da4ml-0.4.0}/docs/_static/stage1.svg +0 -0
  52. {da4ml-0.3.2 → da4ml-0.4.0}/docs/_static/stage2.svg +0 -0
  53. {da4ml-0.3.2 → da4ml-0.4.0}/docs/_static/workflow.svg +0 -0
  54. {da4ml-0.3.2 → da4ml-0.4.0}/docs/cmvm.md +0 -0
  55. {da4ml-0.3.2 → da4ml-0.4.0}/docs/conf.py +0 -0
  56. {da4ml-0.3.2 → da4ml-0.4.0}/docs/dais.md +0 -0
  57. {da4ml-0.3.2 → da4ml-0.4.0}/docs/index.rst +0 -0
  58. {da4ml-0.3.2 → da4ml-0.4.0}/docs/install.md +0 -0
  59. {da4ml-0.3.2 → da4ml-0.4.0}/docs/status.md +0 -0
  60. {da4ml-0.3.2 → da4ml-0.4.0}/interperter/DAISInterpreter.cc +0 -0
  61. {da4ml-0.3.2 → da4ml-0.4.0}/interperter/DAISInterpreter.hh +0 -0
  62. {da4ml-0.3.2 → da4ml-0.4.0}/setup.cfg +0 -0
  63. {da4ml-0.3.2 → da4ml-0.4.0}/src/da4ml/__init__.py +0 -0
  64. {da4ml-0.3.2 → da4ml-0.4.0}/src/da4ml/cmvm/__init__.py +0 -0
  65. {da4ml-0.3.2 → da4ml-0.4.0}/src/da4ml/cmvm/api.py +0 -0
  66. {da4ml-0.3.2 → da4ml-0.4.0}/src/da4ml/cmvm/core/__init__.py +0 -0
  67. {da4ml-0.3.2 → da4ml-0.4.0}/src/da4ml/cmvm/core/indexers.py +0 -0
  68. {da4ml-0.3.2 → da4ml-0.4.0}/src/da4ml/cmvm/core/state_opr.py +0 -0
  69. {da4ml-0.3.2 → da4ml-0.4.0}/src/da4ml/cmvm/types.py +0 -0
  70. {da4ml-0.3.2 → da4ml-0.4.0}/src/da4ml/cmvm/util/__init__.py +0 -0
  71. {da4ml-0.3.2 → da4ml-0.4.0}/src/da4ml/cmvm/util/bit_decompose.py +0 -0
  72. {da4ml-0.3.2 → da4ml-0.4.0}/src/da4ml/cmvm/util/mat_decompose.py +0 -0
  73. {da4ml-0.3.2/src/da4ml/codegen/cpp → da4ml-0.4.0/src/da4ml/codegen/hls}/source/ap_types/ap_binary.h +0 -0
  74. {da4ml-0.3.2/src/da4ml/codegen/cpp → da4ml-0.4.0/src/da4ml/codegen/hls}/source/ap_types/ap_common.h +0 -0
  75. {da4ml-0.3.2/src/da4ml/codegen/cpp → da4ml-0.4.0/src/da4ml/codegen/hls}/source/ap_types/ap_decl.h +0 -0
  76. {da4ml-0.3.2/src/da4ml/codegen/cpp → da4ml-0.4.0/src/da4ml/codegen/hls}/source/ap_types/ap_fixed.h +0 -0
  77. {da4ml-0.3.2/src/da4ml/codegen/cpp → da4ml-0.4.0/src/da4ml/codegen/hls}/source/ap_types/ap_fixed_base.h +0 -0
  78. {da4ml-0.3.2/src/da4ml/codegen/cpp → da4ml-0.4.0/src/da4ml/codegen/hls}/source/ap_types/ap_fixed_ref.h +0 -0
  79. {da4ml-0.3.2/src/da4ml/codegen/cpp → da4ml-0.4.0/src/da4ml/codegen/hls}/source/ap_types/ap_fixed_special.h +0 -0
  80. {da4ml-0.3.2/src/da4ml/codegen/cpp → da4ml-0.4.0/src/da4ml/codegen/hls}/source/ap_types/ap_int.h +0 -0
  81. {da4ml-0.3.2/src/da4ml/codegen/cpp → da4ml-0.4.0/src/da4ml/codegen/hls}/source/ap_types/ap_int_base.h +0 -0
  82. {da4ml-0.3.2/src/da4ml/codegen/cpp → da4ml-0.4.0/src/da4ml/codegen/hls}/source/ap_types/ap_int_ref.h +0 -0
  83. {da4ml-0.3.2/src/da4ml/codegen/cpp → da4ml-0.4.0/src/da4ml/codegen/hls}/source/ap_types/ap_int_special.h +0 -0
  84. {da4ml-0.3.2/src/da4ml/codegen/cpp → da4ml-0.4.0/src/da4ml/codegen/hls}/source/ap_types/ap_shift_reg.h +0 -0
  85. {da4ml-0.3.2/src/da4ml/codegen/cpp → da4ml-0.4.0/src/da4ml/codegen/hls}/source/ap_types/etc/ap_private.h +0 -0
  86. {da4ml-0.3.2/src/da4ml/codegen/cpp → da4ml-0.4.0/src/da4ml/codegen/hls}/source/ap_types/hls_math.h +0 -0
  87. {da4ml-0.3.2/src/da4ml/codegen/cpp → da4ml-0.4.0/src/da4ml/codegen/hls}/source/ap_types/hls_stream.h +0 -0
  88. {da4ml-0.3.2/src/da4ml/codegen/cpp → da4ml-0.4.0/src/da4ml/codegen/hls}/source/ap_types/utils/x_hls_utils.h +0 -0
  89. {da4ml-0.3.2/src/da4ml/codegen/cpp → da4ml-0.4.0/src/da4ml/codegen/hls}/source/build_binder.mk +0 -0
  90. {da4ml-0.3.2/src/da4ml/codegen/verilog/source → da4ml-0.4.0/src/da4ml/codegen/rtl/common_source}/ioutil.hh +0 -0
  91. {da4ml-0.3.2/src/da4ml/codegen/verilog/source → da4ml-0.4.0/src/da4ml/codegen/rtl/common_source}/template.xdc +0 -0
  92. {da4ml-0.3.2/src/da4ml/codegen → da4ml-0.4.0/src/da4ml/codegen/rtl}/verilog/source/multiplier.v +0 -0
  93. {da4ml-0.3.2/src/da4ml/codegen → da4ml-0.4.0/src/da4ml/codegen/rtl}/verilog/source/mux.v +0 -0
  94. {da4ml-0.3.2/src/da4ml/codegen → da4ml-0.4.0/src/da4ml/codegen/rtl}/verilog/source/shift_adder.v +0 -0
  95. {da4ml-0.3.2 → da4ml-0.4.0}/src/da4ml/converter/__init__.py +0 -0
  96. {da4ml-0.3.2 → da4ml-0.4.0}/src/da4ml/converter/hgq2/__init__.py +0 -0
  97. {da4ml-0.3.2 → da4ml-0.4.0}/src/da4ml/converter/hgq2/replica.py +0 -0
  98. {da4ml-0.3.2 → da4ml-0.4.0}/src/da4ml/trace/__init__.py +0 -0
  99. {da4ml-0.3.2 → da4ml-0.4.0}/src/da4ml/trace/ops/__init__.py +0 -0
  100. {da4ml-0.3.2 → da4ml-0.4.0}/src/da4ml/trace/ops/conv_utils.py +0 -0
  101. {da4ml-0.3.2 → da4ml-0.4.0}/src/da4ml/trace/ops/einsum_utils.py +0 -0
  102. {da4ml-0.3.2 → da4ml-0.4.0}/src/da4ml/trace/pipeline.py +0 -0
  103. {da4ml-0.3.2 → da4ml-0.4.0}/src/da4ml/trace/tracer.py +0 -0
  104. {da4ml-0.3.2 → da4ml-0.4.0}/src/da4ml.egg-info/dependency_links.txt +0 -0
  105. {da4ml-0.3.2 → da4ml-0.4.0}/src/da4ml.egg-info/requires.txt +0 -0
  106. {da4ml-0.3.2 → da4ml-0.4.0}/src/da4ml.egg-info/top_level.txt +0 -0
@@ -1,7 +1,7 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: da4ml
3
- Version: 0.3.2
4
- Summary: Digital Arithmetic for Machine Learning
3
+ Version: 0.4.0
4
+ Summary: Distributed Arithmetic for Machine Learning
5
5
  Author-email: Chang Sun <chsun@cern.ch>
6
6
  License: GNU Lesser General Public License v3 (LGPLv3)
7
7
  Project-URL: repository, https://github.com/calad0i/da4ml
@@ -5,6 +5,9 @@ Two things:
5
5
  1. Converting constant-matrix-vector multiplications (CMVMs) into optimized adder graphs with distributed arithmetic for FPGA implementation.
6
6
  2. Converting (a part of) neural networks to fully parallel HDL or HLS with the CMVM optimization above.
7
7
 
8
+ ## Should I use the standalone flow or the hls4ml-integrated flow?
9
+ If the network is supported by da4ml standalone, it is **recommended to use the standalone flow**. In most cases, the standalone flow gives better latency and timing, and is orders of magnitude faster in synthesis time. However, on some occasions, the hls4ml-integrated flow could provide better timing when the routing is highly challenging for the standalone flow. If the network is not supported by da4ml standalone (e.g., it contains unsupported layers or operations), then the hls4ml-integrated flow is the only option.
10
+
8
11
  ## So does da4ml only work with neural networks with II=1?
9
12
  No. When integrated with hls4ml, da4ml only requires that **each CMVM operation is unrolled (II=1)**. This is different from unrolling the whole model, e.g., convolution layers can still have II>1 by reusing the same CMVM kernel for different input windows.
10
13
 
@@ -1,6 +1,6 @@
1
1
  # Getting Started with da4ml
2
2
 
3
- da4ml can be used in three different ways:
3
+ da4ml can be used in three different ways. For standalone code generation, it is recommended to use the functional API or the HGQ2 integration. See the [FAQ](./faq.html) for more details on when to use which flow.
4
4
 
5
5
  ## functional API:
6
6
 
@@ -5,7 +5,7 @@ requires = [ "setuptools>=67.8", "setuptools-scm>=8" ]
5
5
 
6
6
  [project]
7
7
  name = "da4ml"
8
- description = "Digital Arithmetic for Machine Learning"
8
+ description = "Distributed Arithmetic for Machine Learning"
9
9
  readme = "README.md"
10
10
  keywords = [
11
11
  "CMVM",
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
28
28
  commit_id: COMMIT_ID
29
29
  __commit_id__: COMMIT_ID
30
30
 
31
- __version__ = version = '0.3.2'
32
- __version_tuple__ = version_tuple = (0, 3, 2)
31
+ __version__ = version = '0.4.0'
32
+ __version_tuple__ = version_tuple = (0, 4, 0)
33
33
 
34
- __commit_id__ = commit_id = 'g01e84ad19'
34
+ __commit_id__ = commit_id = 'gb2796d8af'
@@ -0,0 +1,9 @@
1
+ from .hls import HLSModel
2
+ from .rtl import RTLModel, VerilogModel, VHDLModel
3
+
4
+ __all__ = [
5
+ 'HLSModel',
6
+ 'VerilogModel',
7
+ 'VHDLModel',
8
+ 'RTLModel',
9
+ ]
@@ -0,0 +1,4 @@
1
+ from .hls_codegen import hls_logic_and_bridge_gen
2
+ from .hls_model import HLSModel
3
+
4
+ __all__ = ['hls_logic_and_bridge_gen', 'HLSModel']
@@ -16,12 +16,19 @@ def kif_to_hlslib_type(k: bool | int = 1, i: int = 0, f: int = 0):
16
16
  return f'ac_fixed<{int(k)},{k + i + f},{k + i}>'
17
17
 
18
18
 
19
+ def kif_to_oneapi_type(k: bool | int = 1, i: int = 0, f: int = 0):
20
+ # OneAPI requires at least 2 bits for all ac_fixed as of 2025.1
21
+ return f'ac_fixed<{int(k)},{max(k + i + f, 2)},{k + i}>'
22
+
23
+
19
24
  def get_typestr_fn(flavor: str):
20
25
  match flavor.lower():
21
26
  case 'vitis':
22
27
  typestr_fn = kif_to_vitis_type
23
28
  case 'hlslib':
24
29
  typestr_fn = kif_to_hlslib_type
30
+ case 'oneapi':
31
+ typestr_fn = kif_to_oneapi_type
25
32
  case _:
26
33
  raise ValueError(f'Unsupported flavor: {flavor}')
27
34
  return typestr_fn
@@ -46,18 +53,18 @@ def ssa_gen(sol: Solution, print_latency: bool, typestr_fn: Callable[[bool | int
46
53
  match op.opcode:
47
54
  case -1:
48
55
  # Input marker
49
- val = f'inp[{ops[op.id0].id0}]'
56
+ val = f'model_inp[{op.id0}]'
50
57
  case 0 | 1:
51
58
  # Common a+/-b<<shift op
52
59
  ref1 = f'bit_shift<{op.data}>(v{op.id1})' if op.data != 0 else f'v{op.id1}'
53
60
  val = f'{ref0} {"-" if op.opcode == 1 else "+"} {ref1}'
54
61
  case 2 | -2:
55
- if op.opcode == 2: # relu(inp)
62
+ if op.opcode == 2: # relu(model_inp)
56
63
  if ops[op.id0].qint.min < 0:
57
64
  val = f'{ref0} > 0 ? {_type}({ref0}) : {_type}(0)'
58
65
  else:
59
66
  val = ref0
60
- else: # relu(-inp)
67
+ else: # relu(-model_inp)
61
68
  if ops[op.id0].qint.max > 0:
62
69
  val = f'{ref0} > 0 ? {_type}(0) : {_type}(-{ref0})'
63
70
  else:
@@ -105,15 +112,15 @@ def output_gen(sol: Solution, typestr_fn: Callable[[bool | int, int, int], str])
105
112
  lines = []
106
113
  for i, idx in enumerate(sol.out_idxs):
107
114
  if idx < 0:
108
- lines.append(f'out[{i}] = 0;')
115
+ lines.append(f'model_out[{i}] = 0;')
109
116
  continue
110
117
  _type = typestr_fn(*_minimal_kif(sol.out_qint[i]))
111
118
  shift = sol.out_shifts[i]
112
119
  neg_str = '-' if sol.out_negs[i] else ''
113
120
  if shift == 0:
114
- lines.append(f'out[{i}] = {_type}({neg_str}v{idx});')
121
+ lines.append(f'model_out[{i}] = {_type}({neg_str}v{idx});')
115
122
  else:
116
- lines.append(f'out[{i}] = {_type}({neg_str}bit_shift<{shift}>(v{idx}));')
123
+ lines.append(f'model_out[{i}] = {_type}({neg_str}bit_shift<{shift}>(v{idx}));')
117
124
  return lines
118
125
 
119
126
 
@@ -126,7 +133,7 @@ def get_io_types(sol: Solution, flavor: str):
126
133
  return inp_type, out_type
127
134
 
128
135
 
129
- def cpp_logic_and_bridge_gen(
136
+ def hls_logic_and_bridge_gen(
130
137
  sol: Solution,
131
138
  fn_name: str,
132
139
  flavor: str,
@@ -140,7 +147,7 @@ def cpp_logic_and_bridge_gen(
140
147
 
141
148
  n_in, n_out = sol.shape
142
149
  template_def = 'template <typename inp_t, typename out_t>'
143
- fn_signature = f'void {fn_name}(inp_t inp[{n_in}], out_t out[{n_out}])'
150
+ fn_signature = f'void {fn_name}(inp_t model_inp[{n_in}], out_t model_out[{n_out}])'
144
151
  pragmas = pragmas or []
145
152
 
146
153
  ssa_lines = ssa_gen(sol, print_latency=print_latency, typestr_fn=typestr_fn)
@@ -173,12 +180,12 @@ bool openmp_enabled() {{
173
180
  return _openmp;
174
181
  }}
175
182
 
176
- void inference_f64(double *inp, double *out, size_t size) {{
177
- batch_inference<{fn_name}_config, double>(inp, out, size);
183
+ void inference_f64(double *model_inp, double *model_out, size_t size) {{
184
+ batch_inference<{fn_name}_config, double>(model_inp, model_out, size);
178
185
  }}
179
186
 
180
- void inference_f32(float *inp, float *out, size_t size) {{
181
- batch_inference<{fn_name}_config, float>(inp, out, size);
187
+ void inference_f32(float *model_inp, float *model_out, size_t size) {{
188
+ batch_inference<{fn_name}_config, float>(model_inp, model_out, size);
182
189
  }}
183
190
  }}"""
184
191
  return code, bridge
@@ -13,7 +13,7 @@ import numpy as np
13
13
  from numpy.typing import NDArray
14
14
 
15
15
  from da4ml.cmvm.types import Solution
16
- from da4ml.codegen.cpp.cpp_codegen import cpp_logic_and_bridge_gen, get_io_types
16
+ from da4ml.codegen.hls.hls_codegen import get_io_types, hls_logic_and_bridge_gen
17
17
 
18
18
  from ... import codegen
19
19
  from ...cmvm.types import _minimal_kif
@@ -39,7 +39,7 @@ class HLSModel:
39
39
  self._prj_name = prj_name
40
40
  self._path = Path(path)
41
41
  self._flavor = flavor.lower()
42
- assert self._flavor in ('vitis', 'hlslib'), f'Unsupported HLS flavor: {self._flavor}'
42
+ assert self._flavor in ('vitis', 'hlslib', 'oneapi'), f'Unsupported HLS flavor: {self._flavor}'
43
43
  self._print_latency = print_latency
44
44
  self._part_name = part_name
45
45
  self._clock_period = clock_period
@@ -64,7 +64,7 @@ class HLSModel:
64
64
  def write(self):
65
65
  if not self._path.exists():
66
66
  self._path.mkdir(parents=True, exist_ok=True)
67
- template_def, bridge = cpp_logic_and_bridge_gen(
67
+ template_def, bridge = hls_logic_and_bridge_gen(
68
68
  self._solution,
69
69
  self._prj_name,
70
70
  self._flavor,
@@ -104,11 +104,11 @@ class HLSModel:
104
104
  with open(self._path / f'{self._prj_name}_bridge.cc', 'w') as f:
105
105
  f.write(bridge)
106
106
 
107
- shutil.copy(self.__src_root / 'cpp/source/binder_util.hh', self._path)
108
- shutil.copy(self.__src_root / f'cpp/source/{self._flavor}_bitshift.hh', self._path / 'bitshift.hh')
109
- shutil.copy(self.__src_root / 'cpp/source/build_binder.mk', self._path)
107
+ shutil.copy(self.__src_root / 'hls/source/binder_util.hh', self._path)
108
+ shutil.copy(self.__src_root / f'hls/source/{self._flavor}_bitshift.hh', self._path / 'bitshift.hh')
109
+ shutil.copy(self.__src_root / 'hls/source/build_binder.mk', self._path)
110
110
  if self._flavor == 'vitis':
111
- shutil.copytree(self.__src_root / 'cpp/source/ap_types', self._path / 'ap_types', dirs_exist_ok=True)
111
+ shutil.copytree(self.__src_root / 'hls/source/ap_types', self._path / 'ap_types', dirs_exist_ok=True)
112
112
  else:
113
113
  pass
114
114
 
@@ -0,0 +1,50 @@
1
+ #pragma once
2
+ #include <cstddef>
3
+
4
+ #ifdef _OPENMP
5
+ #include <algorithm>
6
+ #include <omp.h>
7
+ constexpr bool _openmp = true;
8
+ #else
9
+ constexpr bool _openmp = false;
10
+ #endif
11
+
12
+ template <typename CONFIG_T, typename T> void _inference(T *c_inp, T *c_out, size_t n_samples) {
13
+ typename CONFIG_T::inp_t in_fixed_buf[CONFIG_T::N_inp];
14
+ typename CONFIG_T::out_t out_fixed_buf[CONFIG_T::N_out];
15
+
16
+ for (size_t i = 0; i < n_samples; ++i) {
17
+ size_t offset_in = i * CONFIG_T::N_inp;
18
+ size_t offset_out = i * CONFIG_T::N_out;
19
+ for (size_t j = 0; j < CONFIG_T::N_inp; ++j) {
20
+ in_fixed_buf[j] = c_inp[offset_in + j];
21
+ }
22
+
23
+ CONFIG_T::f(in_fixed_buf, out_fixed_buf);
24
+
25
+ for (size_t j = 0; j < CONFIG_T::N_out; ++j) {
26
+ c_out[offset_out + j] = out_fixed_buf[j];
27
+ }
28
+ }
29
+ }
30
+
31
+ template <typename CONFIG_T, typename T> void batch_inference(T *c_inp, T *c_out, size_t n_samples) {
32
+ #ifdef _OPENMP
33
+ size_t n_max_threads = omp_get_max_threads();
34
+ size_t n_samples_per_thread = std::max<size_t>(n_samples / n_max_threads, 32);
35
+ size_t n_thread = n_samples / n_samples_per_thread;
36
+ n_thread += (n_samples % n_samples_per_thread) ? 1 : 0;
37
+
38
+ #pragma omp parallel for num_threads(n_thread) schedule(static)
39
+ for (size_t i = 0; i < n_thread; ++i) {
40
+ size_t start = i * n_samples_per_thread;
41
+ size_t end = std::min<size_t>(start + n_samples_per_thread, n_samples);
42
+ size_t n_samples_this_thread = end - start;
43
+ size_t offset_in = start * CONFIG_T::N_inp;
44
+ size_t offset_out = start * CONFIG_T::N_out;
45
+ _inference<CONFIG_T, T>(&c_inp[offset_in], &c_out[offset_out], n_samples_this_thread);
46
+ }
47
+ #else
48
+ _inference<CONFIG_T, T>(c_inp, c_out, n_samples);
49
+ #endif
50
+ }
@@ -1,14 +1,16 @@
1
1
  #pragma once
2
- #include "ap_types/ap_fixed.h"
2
+ #include "ap_fixed.h"
3
3
 
4
- template <int s, int b, int i, ap_q_mode Q, ap_o_mode O, int N> ap_fixed<b, i + s> bit_shift(ap_fixed<b, i, Q, O, N> x) {
4
+ template <int s, int b, int i, ap_q_mode Q, ap_o_mode O, int N>
5
+ ap_fixed<b, i + s> bit_shift(ap_fixed<b, i, Q, O, N> x) {
5
6
  #pragma HLS INLINE
6
7
  ap_fixed<b, i + s> r;
7
8
  r.range() = x.range();
8
9
  return r;
9
10
  };
10
11
 
11
- template <int s, int b, int i, ap_q_mode Q, ap_o_mode O, int N> ap_ufixed<b, i + s> bit_shift(ap_ufixed<b, i, Q, O, N> x) {
12
+ template <int s, int b, int i, ap_q_mode Q, ap_o_mode O, int N>
13
+ ap_ufixed<b, i + s> bit_shift(ap_ufixed<b, i, Q, O, N> x) {
12
14
  #pragma HLS INLINE
13
15
  ap_ufixed<b, i + s> r;
14
16
  r.range() = x.range();
@@ -0,0 +1,15 @@
1
+ from .rtl_model import RTLModel, VerilogModel, VHDLModel
2
+ from .verilog import comb_logic_gen as verilog_comb_logic_gen
3
+ from .verilog import generate_io_wrapper as verilog_generate_io_wrapper
4
+ from .vhdl import comb_logic_gen as vhdl_comb_logic_gen
5
+ from .vhdl import generate_io_wrapper as vhdl_generate_io_wrapper
6
+
7
+ __all__ = [
8
+ 'RTLModel',
9
+ 'VerilogModel',
10
+ 'VHDLModel',
11
+ 'verilog_comb_logic_gen',
12
+ 'verilog_generate_io_wrapper',
13
+ 'vhdl_comb_logic_gen',
14
+ 'vhdl_generate_io_wrapper',
15
+ ]
@@ -19,7 +19,7 @@ std::enable_if_t<CONFIG_T::II != 0> _inference(int32_t *c_inp, int32_t *c_out, s
19
19
 
20
20
  if (t_inp < n_samples * CONFIG_T::II && t_inp % CONFIG_T::II == 0) {
21
21
  write_input<CONFIG_T::N_inp, CONFIG_T::max_inp_bw>(
22
- dut->inp, &c_inp[t_inp / CONFIG_T::II * CONFIG_T::N_inp]
22
+ dut->model_inp, &c_inp[t_inp / CONFIG_T::II * CONFIG_T::N_inp]
23
23
  );
24
24
  }
25
25
 
@@ -28,7 +28,7 @@ std::enable_if_t<CONFIG_T::II != 0> _inference(int32_t *c_inp, int32_t *c_out, s
28
28
 
29
29
  if (t_inp > CONFIG_T::latency && t_out % CONFIG_T::II == 0) {
30
30
  read_output<CONFIG_T::N_out, CONFIG_T::max_out_bw>(
31
- dut->out, &c_out[t_out / CONFIG_T::II * CONFIG_T::N_out]
31
+ dut->model_out, &c_out[t_out / CONFIG_T::II * CONFIG_T::N_out]
32
32
  );
33
33
  }
34
34
 
@@ -44,9 +44,9 @@ std::enable_if_t<CONFIG_T::II == 0> _inference(int32_t *c_inp, int32_t *c_out, s
44
44
  auto dut = std::make_unique<typename CONFIG_T::dut_t>();
45
45
 
46
46
  for (size_t i = 0; i < n_samples; ++i) {
47
- write_input<CONFIG_T::N_inp, CONFIG_T::max_inp_bw>(dut->inp, &c_inp[i * CONFIG_T::N_inp]);
47
+ write_input<CONFIG_T::N_inp, CONFIG_T::max_inp_bw>(dut->model_inp, &c_inp[i * CONFIG_T::N_inp]);
48
48
  dut->eval();
49
- read_output<CONFIG_T::N_out, CONFIG_T::max_out_bw>(dut->out, &c_out[i * CONFIG_T::N_out]);
49
+ read_output<CONFIG_T::N_out, CONFIG_T::max_out_bw>(dut->model_out, &c_out[i * CONFIG_T::N_out]);
50
50
  }
51
51
 
52
52
  dut->final();
@@ -7,10 +7,16 @@ CFLAGS = -std=c++17 -fPIC
7
7
  LINKFLAGS = $(INCLUDES) $(WARNINGS)
8
8
  LIBNAME = lib$(VM_PREFIX)_$(STAMP).so
9
9
  N_JOBS ?= $(shell nproc)
10
+ VERILATOR_FLAGS ?=
10
11
 
12
+ $(VM_PREFIX).v: $(wildcard $(VM_PREFIX).vhd)
13
+ # vhdl specific - convert to verilog first for verilating
14
+ mkdir -p obj_dir
15
+ ghdl -a --std=08 --workdir=obj_dir multiplier.vhd mux.vhd negative.vhd shift_adder.vhd $(wildcard $(VM_PREFIX:_wrapper=)_stage*.vhd) $(wildcard $(VM_PREFIX:_wrapper=).vhd) $(VM_PREFIX).vhd
16
+ ghdl synth --std=08 --workdir=obj_dir --out=verilog $(VM_PREFIX) > $(VM_PREFIX).v
11
17
 
12
18
  ./obj_dir/libV$(VM_PREFIX).a ./obj_dir/libverilated.a ./obj_dir/V$(VM_PREFIX)__ALL.a: $(VM_PREFIX).v
13
- verilator --cc -j $(N_JOBS) -Wall -build $(VM_PREFIX).v --prefix V$(VM_PREFIX) -CFLAGS "$(CFLAGS)"
19
+ verilator --cc -j $(N_JOBS) -build $(VM_PREFIX).v --prefix V$(VM_PREFIX) $(VERILATOR_FLAGS) -CFLAGS "$(CFLAGS)"
14
20
 
15
21
  $(LIBNAME): ./obj_dir/libV$(VM_PREFIX).a ./obj_dir/libverilated.a ./obj_dir/V$(VM_PREFIX)__ALL.a $(VM_PREFIX)_binder.cc
16
22
  $(CXX) $(CFLAGS) $(LINKFLAGS) $(CXXFLAGS2) -pthread -shared -o $(LIBNAME) $(VM_PREFIX)_binder.cc ./obj_dir/libV$(VM_PREFIX).a ./obj_dir/libverilated.a ./obj_dir/V$(VM_PREFIX)__ALL.a $(EXTRA_CXXFLAGS)
@@ -1,20 +1,41 @@
1
1
  set project_name "${PROJECT_NAME}"
2
2
  set device "${DEVICE}"
3
+ set source_type "${SOURCE_TYPE}"
3
4
 
4
5
  set top_module "${project_name}"
5
6
  set output_dir "./output_${project_name}"
6
7
 
7
8
  create_project $project_name "${output_dir}/$project_name" -force -part $device
8
9
 
9
- set_property TARGET_LANGUAGE Verilog [current_project]
10
10
  set_property DEFAULT_LIB work [current_project]
11
11
 
12
- read_verilog "${project_name}.v"
13
- read_verilog "shift_adder.v"
14
- read_verilog "negative.v"
15
- read_verilog "mux.v"
16
- foreach file [glob -nocomplain "${project_name}_stage*.v"] {
17
- read_verilog $file
12
+ if { $source_type != "vhdl" && $source_type != "verilog" } {
13
+ puts "Error: SOURCE_TYPE must be either 'vhdl' or 'verilog'."
14
+ exit 1
15
+ }
16
+
17
+ if { $source_type == "vhdl" } {
18
+ set_property TARGET_LANGUAGE VHDL [current_project]
19
+
20
+ read_vhdl -vhdl2008 "${project_name}.vhd"
21
+ read_vhdl -vhdl2008 "shift_adder.vhd"
22
+ read_vhdl -vhdl2008 "negative.vhd"
23
+ read_vhdl -vhdl2008 "mux.vhd"
24
+ read_vhdl -vhdl2008 "multiplier.vhd"
25
+ foreach file [glob -nocomplain "${project_name}_stage*.vhd"] {
26
+ read_vhdl -vhdl2008 $file
27
+ }
28
+ } else {
29
+ set_property TARGET_LANGUAGE Verilog [current_project]
30
+
31
+ read_verilog "${project_name}.v"
32
+ read_verilog "shift_adder.v"
33
+ read_verilog "negative.v"
34
+ read_verilog "mux.v"
35
+ read_verilog "multiplier.v"
36
+ foreach file [glob -nocomplain "${project_name}_stage*.v"] {
37
+ read_verilog $file
38
+ }
18
39
  }
19
40
 
20
41
  read_xdc "${project_name}.xdc" -mode out_of_context
@@ -10,10 +10,9 @@ from uuid import uuid4
10
10
  import numpy as np
11
11
  from numpy.typing import NDArray
12
12
 
13
- from ... import codegen
14
13
  from ...cmvm.types import CascadedSolution, Solution, _minimal_kif
15
14
  from ...trace.pipeline import to_pipeline
16
- from . import binder_gen, comb_logic_gen, generate_io_wrapper, pipeline_logic_gen
15
+ from .. import rtl
17
16
 
18
17
 
19
18
  def get_io_kifs(sol: Solution | CascadedSolution):
@@ -22,12 +21,13 @@ def get_io_kifs(sol: Solution | CascadedSolution):
22
21
  return np.array(inp_kifs, np.int8), np.array(out_kifs, np.int8)
23
22
 
24
23
 
25
- class VerilogModel:
24
+ class RTLModel:
26
25
  def __init__(
27
26
  self,
28
27
  solution: Solution | CascadedSolution,
29
28
  prj_name: str,
30
29
  path: str | Path,
30
+ flavor: str = 'verilog',
31
31
  latency_cutoff: float = -1,
32
32
  print_latency: bool = True,
33
33
  part_name: str = 'xcvu13p-flga2577-2-e',
@@ -36,18 +36,21 @@ class VerilogModel:
36
36
  io_delay_minmax: tuple[float, float] = (0.2, 0.4),
37
37
  register_layers: int = 1,
38
38
  ):
39
+ self._flavor = flavor.lower()
39
40
  self._solution = solution
40
41
  self._path = Path(path)
41
42
  self._prj_name = prj_name
42
43
  self._latency_cutoff = latency_cutoff
43
44
  self._print_latency = print_latency
44
- self.__src_root = Path(codegen.__file__).parent
45
+ self.__src_root = Path(rtl.__file__).parent
45
46
  self._part_name = part_name
46
47
  self._clock_period = clock_period
47
48
  self._clock_uncertainty = clock_uncertainty
48
49
  self._io_delay_minmax = io_delay_minmax
49
50
  self._register_layers = register_layers
50
51
 
52
+ assert self._flavor in ('vhdl', 'verilog'), f'Unsupported flavor {flavor}, only vhdl and verilog are supported.'
53
+
51
54
  self._pipe = solution if isinstance(solution, CascadedSolution) else None
52
55
  if latency_cutoff > 0 and self._pipe is None:
53
56
  assert isinstance(solution, Solution)
@@ -62,16 +65,23 @@ class VerilogModel:
62
65
  self._uuid = None
63
66
 
64
67
  def write(self):
68
+ flavor = self._flavor
69
+ suffix = 'v' if flavor == 'verilog' else 'vhd'
70
+ if flavor == 'vhdl':
71
+ from .vhdl import binder_gen, comb_logic_gen, generate_io_wrapper, pipeline_logic_gen
72
+ else: # verilog
73
+ from .verilog import binder_gen, comb_logic_gen, generate_io_wrapper, pipeline_logic_gen
74
+
65
75
  self._path.mkdir(parents=True, exist_ok=True)
66
76
  if self._pipe is not None: # Pipeline
67
77
  # Main logic
68
78
  codes = pipeline_logic_gen(self._pipe, self._prj_name, self._print_latency, register_layers=self._register_layers)
69
79
  for k, v in codes.items():
70
- with open(self._path / f'{k}.v', 'w') as f:
80
+ with open(self._path / f'{k}.{suffix}', 'w') as f:
71
81
  f.write(v)
72
82
 
73
83
  # Build script
74
- with open(self.__src_root / 'verilog/source/build_prj.tcl') as f:
84
+ with open(self.__src_root / 'common_source/build_prj.tcl') as f:
75
85
  tcl = f.read()
76
86
  tcl = tcl.replace('${DEVICE}', self._part_name)
77
87
  tcl = tcl.replace('${PROJECT_NAME}', self._prj_name)
@@ -79,7 +89,7 @@ class VerilogModel:
79
89
  f.write(tcl)
80
90
 
81
91
  # XDC
82
- with open(self.__src_root / 'verilog/source/template.xdc') as f:
92
+ with open(self.__src_root / 'common_source/template.xdc') as f:
83
93
  xdc = f.read()
84
94
  xdc = xdc.replace('${CLOCK_PERIOD}', str(self._clock_period))
85
95
  xdc = xdc.replace('${UNCERTAINITY_SETUP}', str(self._clock_uncertainty))
@@ -89,7 +99,7 @@ class VerilogModel:
89
99
  with open(self._path / f'{self._prj_name}.xdc', 'w') as f:
90
100
  f.write(xdc)
91
101
 
92
- # C++ binder w/ verilog wrapper for uniform bw
102
+ # C++ binder w/ HDL wrapper for uniform bw
93
103
  binder = binder_gen(self._pipe, f'{self._prj_name}_wrapper', 1, self._register_layers)
94
104
 
95
105
  # Verilog IO wrapper (non-uniform bw to uniform one, clk passthrough)
@@ -101,24 +111,25 @@ class VerilogModel:
101
111
 
102
112
  # Main logic
103
113
  code = comb_logic_gen(self._solution, self._prj_name, self._print_latency, '`timescale 1ns/1ps')
104
- with open(self._path / f'{self._prj_name}.v', 'w') as f:
114
+ with open(self._path / f'{self._prj_name}.{suffix}', 'w') as f:
105
115
  f.write(code)
106
116
 
107
117
  # Verilog IO wrapper (non-uniform bw to uniform one, no clk)
108
118
  io_wrapper = generate_io_wrapper(self._solution, self._prj_name, False)
109
119
  binder = binder_gen(self._solution, f'{self._prj_name}_wrapper')
110
120
 
111
- with open(self._path / f'{self._prj_name}_wrapper.v', 'w') as f:
121
+ with open(self._path / f'{self._prj_name}_wrapper.{suffix}', 'w') as f:
112
122
  f.write(io_wrapper)
113
123
  with open(self._path / f'{self._prj_name}_wrapper_binder.cc', 'w') as f:
114
124
  f.write(binder)
115
125
 
116
126
  # Common resource copy
117
- for fname in self.__src_root.glob('verilog/source/*.v'):
127
+ for fname in self.__src_root.glob(f'{flavor}/source/*.{suffix}'):
118
128
  shutil.copy(fname, self._path)
119
- shutil.copy(self.__src_root / 'verilog/source/build_binder.mk', self._path)
120
- shutil.copy(self.__src_root / 'verilog/source/ioutil.hh', self._path)
121
- shutil.copy(self.__src_root / 'verilog/source/binder_util.hh', self._path)
129
+
130
+ shutil.copy(self.__src_root / 'common_source/build_binder.mk', self._path)
131
+ shutil.copy(self.__src_root / 'common_source/ioutil.hh', self._path)
132
+ shutil.copy(self.__src_root / 'common_source/binder_util.hh', self._path)
122
133
  self._solution.save(self._path / 'model.json')
123
134
  with open(self._path / 'misc.json', 'w') as f:
124
135
  f.write(f'{{"cost": {self._solution.cost}}}')
@@ -152,6 +163,7 @@ class VerilogModel:
152
163
  env['VM_PREFIX'] = f'{self._prj_name}_wrapper'
153
164
  env['STAMP'] = self._uuid
154
165
  env['EXTRA_CXXFLAGS'] = '-fopenmp' if openmp else ''
166
+ env['VERILATOR_FLAGS'] = '-Wall' if self._flavor == 'verilog' else ''
155
167
  if nproc is not None:
156
168
  env['N_JOBS'] = str(nproc)
157
169
  if o3:
@@ -219,7 +231,7 @@ class VerilogModel:
219
231
  self.write()
220
232
  self._compile(verbose=verbose, openmp=openmp, nproc=nproc, o3=o3, clean=clean)
221
233
 
222
- def predict(self, data: NDArray[np.floating]):
234
+ def predict(self, data: NDArray[np.floating]) -> NDArray[np.float32]:
223
235
  """Run the model on the input data.
224
236
 
225
237
  Parameters
@@ -233,6 +245,7 @@ class VerilogModel:
233
245
  NDArray[np.float64]
234
246
  Output of the model in shape (n_samples, output_size).
235
247
  """
248
+
236
249
  assert self._lib is not None, 'Library not loaded, call .compile() first.'
237
250
  inp_size, out_size = self._solution.shape
238
251
 
@@ -258,7 +271,7 @@ class VerilogModel:
258
271
  # Unscale the output int32 to recover fp values
259
272
  k, i, f = np.max(k_out), np.max(i_out), np.max(f_out)
260
273
  a, b, c = 2.0 ** (k + i + f), k * 2.0 ** (i + f), 2.0**-f
261
- return ((out_data.reshape(n_sample, out_size) + b) % a - b) * c
274
+ return ((out_data.reshape(n_sample, out_size) + b) % a - b) * c.astype(np.float32)
262
275
 
263
276
  def __repr__(self):
264
277
  inp_size, out_size = self._solution.shape
@@ -289,3 +302,61 @@ Estimated cost: {cost} LUTs"""
289
302
  else:
290
303
  spec += '\nEmulator is **not compiled**'
291
304
  return spec
305
+
306
+
307
+ class VerilogModel(RTLModel):
308
+ def __init__(
309
+ self,
310
+ solution: Solution | CascadedSolution,
311
+ prj_name: str,
312
+ path: str | Path,
313
+ latency_cutoff: float = -1,
314
+ print_latency: bool = True,
315
+ part_name: str = 'xcvu13p-flga2577-2-e',
316
+ clock_period: float = 5,
317
+ clock_uncertainty: float = 0.1,
318
+ io_delay_minmax: tuple[float, float] = (0.2, 0.4),
319
+ register_layers: int = 1,
320
+ ):
321
+ self._hdl_model = super().__init__(
322
+ solution,
323
+ prj_name,
324
+ path,
325
+ 'verilog',
326
+ latency_cutoff,
327
+ print_latency,
328
+ part_name,
329
+ clock_period,
330
+ clock_uncertainty,
331
+ io_delay_minmax,
332
+ register_layers,
333
+ )
334
+
335
+
336
+ class VHDLModel(RTLModel):
337
+ def __init__(
338
+ self,
339
+ solution: Solution | CascadedSolution,
340
+ prj_name: str,
341
+ path: str | Path,
342
+ latency_cutoff: float = -1,
343
+ print_latency: bool = True,
344
+ part_name: str = 'xcvu13p-flga2577-2-e',
345
+ clock_period: float = 5,
346
+ clock_uncertainty: float = 0.1,
347
+ io_delay_minmax: tuple[float, float] = (0.2, 0.4),
348
+ register_layers: int = 1,
349
+ ):
350
+ self._hdl_model = super().__init__(
351
+ solution,
352
+ prj_name,
353
+ path,
354
+ 'vhdl',
355
+ latency_cutoff,
356
+ print_latency,
357
+ part_name,
358
+ clock_period,
359
+ clock_uncertainty,
360
+ io_delay_minmax,
361
+ register_layers,
362
+ )
@@ -1,12 +1,10 @@
1
1
  from .comb import comb_logic_gen
2
2
  from .io_wrapper import binder_gen, generate_io_wrapper
3
3
  from .pipeline import pipeline_logic_gen
4
- from .verilog_model import VerilogModel
5
4
 
6
5
  __all__ = [
7
6
  'comb_logic_gen',
8
7
  'generate_io_wrapper',
9
8
  'pipeline_logic_gen',
10
9
  'binder_gen',
11
- 'VerilogModel',
12
10
  ]