PyPI - da4ml - Versions diffs - 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl - Mend

da4ml 0.2.0-py3-none-any.whl → 0.3.0-py3-none-any.whl

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of da4ml might be problematic. Click here for more details.

Files changed (59)

da4ml/_version.py +2 -2
da4ml/cmvm/api.py +2 -6
da4ml/cmvm/core/__init__.py +0 -1
da4ml/cmvm/types.py +99 -19
da4ml/codegen/__init__.py +5 -4
da4ml/codegen/cpp/__init__.py +2 -1
da4ml/codegen/cpp/cpp_codegen.py +58 -25
da4ml/codegen/cpp/hls_model.py +252 -0
da4ml/codegen/cpp/source/ap_types/ap_binary.h +78 -0
da4ml/codegen/cpp/source/ap_types/ap_common.h +376 -0
da4ml/codegen/cpp/source/ap_types/ap_decl.h +212 -0
da4ml/codegen/cpp/source/ap_types/ap_fixed.h +360 -0
da4ml/codegen/cpp/source/ap_types/ap_fixed_base.h +2354 -0
da4ml/codegen/cpp/source/ap_types/ap_fixed_ref.h +718 -0
da4ml/codegen/cpp/source/ap_types/ap_fixed_special.h +230 -0
da4ml/codegen/cpp/source/ap_types/ap_int.h +330 -0
da4ml/codegen/cpp/source/ap_types/ap_int_base.h +1885 -0
da4ml/codegen/cpp/source/ap_types/ap_int_ref.h +1346 -0
da4ml/codegen/cpp/source/ap_types/ap_int_special.h +223 -0
da4ml/codegen/cpp/source/ap_types/ap_shift_reg.h +138 -0
da4ml/codegen/cpp/source/ap_types/etc/ap_private.h +7199 -0
da4ml/codegen/cpp/source/ap_types/hls_math.h +27 -0
da4ml/codegen/cpp/source/ap_types/hls_stream.h +263 -0
da4ml/codegen/cpp/source/ap_types/utils/x_hls_utils.h +80 -0
da4ml/codegen/cpp/source/binder_util.hh +56 -0
da4ml/codegen/cpp/source/build_binder.mk +24 -0
da4ml/codegen/cpp/source/{vitis.h → vitis_bitshift.hh} +1 -1
da4ml/codegen/verilog/__init__.py +2 -3
da4ml/codegen/verilog/comb.py +65 -24
da4ml/codegen/verilog/io_wrapper.py +36 -141
da4ml/codegen/verilog/pipeline.py +21 -3
da4ml/codegen/verilog/source/binder_util.hh +72 -0
da4ml/codegen/verilog/source/build_prj.tcl +0 -1
da4ml/codegen/verilog/source/mux.v +58 -0
da4ml/codegen/verilog/source/negative.v +28 -0
da4ml/codegen/verilog/source/shift_adder.v +4 -1
da4ml/codegen/verilog/source/template.xdc +3 -0
da4ml/codegen/verilog/verilog_model.py +42 -15
da4ml/converter/__init__.py +0 -0
da4ml/converter/hgq2/parser.py +105 -0
da4ml/converter/hgq2/replica.py +383 -0
da4ml/trace/__init__.py +2 -2
da4ml/trace/fixed_variable.py +177 -18
da4ml/trace/fixed_variable_array.py +124 -9
da4ml/trace/ops/__init__.py +22 -6
da4ml/trace/ops/conv_utils.py +146 -14
da4ml/trace/ops/einsum_utils.py +9 -6
da4ml/trace/ops/reduce_utils.py +103 -0
da4ml/trace/pipeline.py +36 -34
da4ml/trace/tracer.py +37 -5
da4ml-0.3.0.dist-info/METADATA +107 -0
da4ml-0.3.0.dist-info/RECORD +64 -0
da4ml/codegen/cpp/source/vitis_bridge.h +0 -17
da4ml-0.2.0.dist-info/METADATA +0 -65
da4ml-0.2.0.dist-info/RECORD +0 -39
da4ml/codegen/verilog/source/{ioutils.hh → ioutil.hh} +0 -0
{da4ml-0.2.0.dist-info → da4ml-0.3.0.dist-info}/WHEEL +0 -0
{da4ml-0.2.0.dist-info → da4ml-0.3.0.dist-info}/licenses/LICENSE +0 -0
{da4ml-0.2.0.dist-info → da4ml-0.3.0.dist-info}/top_level.txt +0 -0

da4ml/codegen/verilog/io_wrapper.py CHANGED Viewed

@@ -7,7 +7,7 @@ def hetero_io_map(qints: list[QInterval], merge: bool = False):
     N = len(qints)
     ks, _is, fs = zip(*map(_minimal_kif, qints))
     Is = [_i + _k for _i, _k in zip(_is, ks)]
-    max_I, max_f = max(Is), max(fs)
+    max_I, max_f = max(_is) + max(ks), max(fs)
     max_bw = max_I + max_f
     width_regular, width_packed = max_bw * N, sum(Is) + sum(fs)
@@ -32,11 +32,16 @@ def hetero_io_map(qints: list[QInterval], merge: bool = False):
             copy_from = hetero[i][0] if ks[i] else -1
             pads.append((base + max_bw - 1, base + max_bw - bias_high, copy_from))
+    mask = list(high < low for high, low in hetero)
+    regular = [r for r, m in zip(regular, mask) if not m]
+    hetero = [h for h, m in zip(hetero, mask) if not m]
     if not merge:
         return regular, hetero, pads, (width_regular, width_packed)
     # Merging consecutive intervals when possible
-    for i in range(N - 2, -1, -1):
+    NN = len(regular) - 2
+    for i in range(NN, -1, -1):
         this_high = regular[i][0]
         next_low = regular[i + 1][1]
         if next_low - this_high != 1:
@@ -65,6 +70,8 @@ def generate_io_wrapper(sol: Solution | CascadedSolution, module_name: str, pipe
     _out_assignment: list[tuple[int, str]] = []
     for i, ((ih, jh), (ir, jr)) in enumerate(zip(het_out, reg_out)):
+        if ih == jh - 1:
+            continue
         _out_assignment.append((ih, f'assign out[{ir}:{jr}] = packed_out[{ih}:{jh}];'))
     for i, (i, j, copy_from) in enumerate(pad_out):
@@ -86,12 +93,12 @@ def generate_io_wrapper(sol: Solution | CascadedSolution, module_name: str, pipe
 module {module_name}_wrapper ({clk_and_rst_inp}
     // verilator lint_off UNUSEDSIGNAL
-    input [{w_reg_in-1}:0] inp,
+    input [{w_reg_in - 1}:0] inp,
     // verilator lint_on UNUSEDSIGNAL
-    output [{w_reg_out-1}:0] out
+    output [{w_reg_out - 1}:0] out
 );
-    wire [{w_het_in-1}:0] packed_inp;
-    wire [{w_het_out-1}:0] packed_out;
+    wire [{w_het_in - 1}:0] packed_inp;
+    wire [{w_het_out - 1}:0] packed_out;
     {inp_assignment_str}
@@ -106,150 +113,38 @@ endmodule
 """
-def comb_binder_gen(sol: Solution, module_name: str):
-    k_in, i_in, f_in = zip(*map(_minimal_kif, sol.inp_qint))
-    k_out, i_out, f_out = zip(*map(_minimal_kif, sol.out_qint))
-    max_inp_bw = max(k + i for k, i in zip(k_in, i_in)) + max(f_in)
-    max_out_bw = max(k + i for k, i in zip(k_out, i_out)) + max(f_out)
-    n_in, n_out = sol.shape
-    return f"""#include "V{module_name}.h"
-#include "ioutils.hh"
-#include <verilated.h>
-#ifdef _OPENMP
-#include <omp.h>
-constexpr bool _openmp = true;
-#else
-constexpr bool _openmp = false;
-#endif
-constexpr size_t N_inp = {n_in};
-constexpr size_t N_out = {n_out};
-constexpr size_t max_inp_bw = {max_inp_bw};
-constexpr size_t max_out_bw = {max_out_bw};
-typedef V{module_name} dut_t;
-extern "C" {{
-bool openmp_enabled() {{
-    return _openmp;
-}}
-void _inference(int32_t *c_inp, int32_t *c_out, size_t n_samples) {{
-    dut_t *dut = new dut_t;
-    for (size_t i = 0; i < n_samples; ++i) {{
-        write_input<N_inp, max_inp_bw>(dut->inp, &c_inp[i * N_inp]);
-        dut->eval();
-        read_output<N_out, max_out_bw>(dut->out, &c_out[i * N_out]);
-    }}
-    dut->final();
-    delete dut;
-}}
-void inference(int32_t *c_inp, int32_t *c_out, size_t n_samples) {{
-    size_t n_max_threads = omp_get_max_threads();
-    size_t n_samples_per_thread = std::max<size_t>(n_samples / n_max_threads, 32);
-    size_t n_thread = n_samples / n_samples_per_thread;
-    n_thread += (n_samples % n_samples_per_thread) ? 1 : 0;
-    #ifdef _OPENMP
-    #pragma omp parallel for num_threads(n_thread) schedule(static)
-    for (size_t i = 0; i < n_thread; ++i) {{
-        size_t start = i * n_samples_per_thread;
-        size_t end = std::min<size_t>(start + n_samples_per_thread, n_samples);
-        size_t n_samples_this_thread = end - start;
-        _inference(&c_inp[start * N_inp], &c_out[start * N_out], n_samples_this_thread);
-    }}
-    #else
-    _inference(c_inp, c_out, n_samples);
-    #endif
-}}
-}}"""
-def pipeline_binder_gen(csol: CascadedSolution, module_name: str, II: int = 1):
+def binder_gen(csol: CascadedSolution | Solution, module_name: str, II: int = 1, latency_multiplier: int = 1):
     k_in, i_in, f_in = zip(*map(_minimal_kif, csol.inp_qint))
     k_out, i_out, f_out = zip(*map(_minimal_kif, csol.out_qint))
-    max_inp_bw = max(k + i for k, i in zip(k_in, i_in)) + max(f_in)
-    max_out_bw = max(k + i for k, i in zip(k_out, i_out)) + max(f_out)
-    n_stage = len(csol.solutions)
+    max_inp_bw = max(k_in) + max(i_in) + max(f_in)
+    max_out_bw = max(k_out) + max(i_out) + max(f_out)
+    if isinstance(csol, Solution):
+        II = latency = 0
+    else:
+        latency = len(csol.solutions) * latency_multiplier
     n_in, n_out = csol.shape
-    return f"""#include "V{module_name}.h"
-#include "ioutils.hh"
-#include <verilated.h>
-#ifdef _OPENMP
-#include <omp.h>
-constexpr bool _openmp = true;
-#else
-constexpr bool _openmp = false;
-#endif
-constexpr size_t N_inp = {n_in};
-constexpr size_t N_out = {n_out};
-constexpr size_t max_inp_bw = {max_inp_bw};
-constexpr size_t max_out_bw = {max_out_bw};
-constexpr size_t II = {II};
-constexpr size_t latency = {n_stage};
-typedef V{module_name} dut_t;
+    return f"""#include <cstddef>
+#include "binder_util.hh"
+#include "V{module_name}.h"
+struct {module_name}_config {{
+    static const size_t N_inp = {n_in};
+    static const size_t N_out = {n_out};
+    static const size_t max_inp_bw = {max_inp_bw};
+    static const size_t max_out_bw = {max_out_bw};
+    static const size_t II = {II};
+    static const size_t latency = {latency};
+    typedef V{module_name} dut_t;
+}};
 extern "C" {{
 bool openmp_enabled() {{
     return _openmp;
 }}
-void _inference(int32_t *c_inp, int32_t *c_out, size_t n_samples) {{
-    dut_t *dut = new dut_t;
-    size_t clk_req = n_samples * II + latency + 1;
-    for (size_t t_inp = 0; t_inp < clk_req; ++t_inp) {{
-        size_t t_out = t_inp - latency - 1;
-        if (t_inp < n_samples * II && t_inp % II == 0) {{
-            write_input<N_inp, max_inp_bw>(dut->inp, &c_inp[t_inp / II * N_inp]);
-        }}
-        dut->clk = 0;
-        dut->eval();
-        if (t_inp > latency && t_out % II == 0) {{
-            read_output<N_out, max_out_bw>(dut->out, &c_out[t_out / II * N_out]);
-        }}
-        dut->clk = 1;
-        dut->eval();
-    }}
-    dut->final();
-    delete dut;
-}}
 void inference(int32_t *c_inp, int32_t *c_out, size_t n_samples) {{
-#ifdef _OPENMP
-    size_t n_max_threads = omp_get_max_threads();
-    size_t n_samples_per_thread = std::max<size_t>(n_samples / n_max_threads, 32);
-    size_t n_thread = n_samples / n_samples_per_thread;
-    n_thread += (n_samples % n_samples_per_thread) ? 1 : 0;
-    #pragma omp parallel for num_threads(n_thread) schedule(static)
-    for (size_t i = 0; i < n_thread; ++i) {{
-        size_t start = i * n_samples_per_thread;
-        size_t end = std::min<size_t>(start + n_samples_per_thread, n_samples);
-        size_t n_samples_this_thread = end - start;
-        _inference(&c_inp[start * N_inp], &c_out[start * N_out], n_samples_this_thread);
-    }}
-#else
-    _inference(c_inp, c_out, n_samples);
-#endif
+    batch_inference<{module_name}_config>(c_inp, c_out, n_samples);
 }}
-}}"""
+}}
+"""

da4ml/codegen/verilog/pipeline.py CHANGED Viewed

@@ -3,19 +3,37 @@ from .comb import comb_logic_gen
 def pipeline_logic_gen(
-    csol: CascadedSolution, name: str, print_latency=False, timescale: str | None = '`timescale 1 ns / 1 ps', reset_high=True
+    csol: CascadedSolution,
+    name: str,
+    print_latency=False,
+    timescale: str | None = '`timescale 1 ns / 1 ps',
+    register_layers: int = 1,
 ):
     N = len(csol.solutions)
     inp_bits = [sum(map(sum, map(_minimal_kif, sol.inp_qint))) for sol in csol.solutions]
     out_bits = inp_bits[1:] + [sum(map(sum, map(_minimal_kif, csol.out_qint)))]
     registers = [f'reg [{width}-1:0] stage{i}_inp;' for i, width in enumerate(inp_bits)]
+    for i in range(0, register_layers - 1):
+        registers += [f'reg [{width}-1:0] stage{j}_inp_copy{i};' for j, width in enumerate(inp_bits)]
     wires = [f'wire [{width}-1:0] stage{i}_out;' for i, width in enumerate(out_bits)]
     comb_logic = [f'{name}_stage{i} stage{i} (.inp(stage{i}_inp), .out(stage{i}_out));' for i in range(N)]
-    serial_logic = ['stage0_inp <= inp;']
-    serial_logic += [f'stage{i}_inp <= stage{i-1}_out;' for i in range(1, N)]
+    if register_layers == 1:
+        serial_logic = ['stage0_inp <= inp;']
+        serial_logic += [f'stage{i}_inp <= stage{i-1}_out;' for i in range(1, N)]
+    else:
+        serial_logic = ['stage0_inp_copy0 <= inp;']
+        for j in range(1, register_layers - 1):
+            serial_logic.append(f'stage0_inp_copy{j} <= stage0_inp_copy{j-1};')
+        serial_logic.append(f'stage0_inp <= stage0_inp_copy{register_layers - 2};')
+        for i in range(1, N):
+            serial_logic.append(f'stage{i}_inp_copy0 <= stage{i-1}_out;')
+            for j in range(1, register_layers - 1):
+                serial_logic.append(f'stage{i}_inp_copy{j} <= stage{i}_inp_copy{j-1};')
+            serial_logic.append(f'stage{i}_inp <= stage{i}_inp_copy{register_layers - 2};')
     serial_logic += [f'out <= stage{N-1}_out;']
     sep0 = '\n    '

da4ml/codegen/verilog/source/binder_util.hh ADDED Viewed

@@ -0,0 +1,72 @@
+#include "ioutil.hh"
+#include <verilated.h>
+#ifdef _OPENMP
+#include <omp.h>
+constexpr bool _openmp = true;
+#else
+constexpr bool _openmp = false;
+#endif
+template <typename CONFIG_T>
+std::enable_if_t<CONFIG_T::II != 0> _inference(int32_t *c_inp, int32_t *c_out, size_t n_samples) {
+    typename CONFIG_T::dut_t *dut = new typename CONFIG_T::dut_t;
+    size_t clk_req = n_samples * CONFIG_T::II + CONFIG_T::latency + 1;
+    for (size_t t_inp = 0; t_inp < clk_req; ++t_inp) {
+        size_t t_out = t_inp - CONFIG_T::latency - 1;
+        if (t_inp < n_samples * CONFIG_T::II && t_inp % CONFIG_T::II == 0) {
+            write_input<CONFIG_T::N_inp, CONFIG_T::max_inp_bw>(dut->inp, &c_inp[t_inp / CONFIG_T::II * CONFIG_T::N_inp]);
+        }
+        dut->clk = 0;
+        dut->eval();
+        if (t_inp > CONFIG_T::latency && t_out % CONFIG_T::II == 0) {
+            read_output<CONFIG_T::N_out, CONFIG_T::max_out_bw>(dut->out, &c_out[t_out / CONFIG_T::II * CONFIG_T::N_out]);
+        }
+        dut->clk = 1;
+        dut->eval();
+    }
+    dut->final();
+    delete dut;
+}
+template <typename CONFIG_T>
+std::enable_if_t<CONFIG_T::II == 0> _inference(int32_t *c_inp, int32_t *c_out, size_t n_samples) {
+    typename CONFIG_T::dut_t *dut = new typename CONFIG_T::dut_t;
+    for (size_t i = 0; i < n_samples; ++i) {
+        write_input<CONFIG_T::N_inp, CONFIG_T::max_inp_bw>(dut->inp, &c_inp[i * CONFIG_T::N_inp]);
+        dut->eval();
+        read_output<CONFIG_T::N_out, CONFIG_T::max_out_bw>(dut->out, &c_out[i * CONFIG_T::N_out]);
+    }
+    dut->final();
+    delete dut;
+}
+template <typename CONFIG_T> void batch_inference(int32_t *c_inp, int32_t *c_out, size_t n_samples) {
+#ifdef _OPENMP
+    size_t n_max_threads = omp_get_max_threads();
+    size_t n_samples_per_thread = std::max<size_t>(n_samples / n_max_threads, 32);
+    size_t n_thread = n_samples / n_samples_per_thread;
+    n_thread += (n_samples % n_samples_per_thread) ? 1 : 0;
+#pragma omp parallel for num_threads(n_thread) schedule(static)
+    for (size_t i = 0; i < n_thread; ++i) {
+        size_t start = i * n_samples_per_thread;
+        size_t end = std::min<size_t>(start + n_samples_per_thread, n_samples);
+        size_t n_samples_this_thread = end - start;
+        size_t offset_in = start * CONFIG_T::N_inp;
+        size_t offset_out = start * CONFIG_T::N_out;
+        _inference<CONFIG_T>(&c_inp[offset_in], &c_out[offset_out], n_samples_this_thread);
+    }
+#else
+    _inference<CONFIG_T>(c_inp, c_out, n_samples);
+#endif
+}

da4ml/codegen/verilog/source/build_prj.tcl CHANGED Viewed

@@ -26,7 +26,6 @@ file mkdir "${output_dir}/reports"
 # synth
 synth_design -top $top_module -mode out_of_context -retiming \
     -flatten_hierarchy rebuilt -resource_sharing auto \
-    -keep_equivalent_registers -shreg_min_size 8 \
     -directive AlternateRoutability
 write_checkpoint -force "${output_dir}/${project_name}_post_synth.dcp"

da4ml/codegen/verilog/source/mux.v ADDED Viewed

@@ -0,0 +1,58 @@
+`timescale 1ns / 1ps
+module mux #(
+    parameter BW_INPUT0 = 32,
+    parameter BW_INPUT1 = 32,
+    parameter SIGNED0 = 0,
+    parameter SIGNED1 = 0,
+    parameter BW_OUT = 32,
+    parameter SHIFT1 = 0,
+    parameter INVERT1 = 0
+) (
+    input key,
+    input [BW_INPUT0-1:0] in0,
+    input [BW_INPUT1-1:0] in1,
+    output [BW_OUT-1:0] out
+);
+  localparam IN0_NEED_BITS = (SHIFT1 < 0) ? BW_INPUT0 - SHIFT1 : BW_INPUT0;
+  localparam IN1_NEED_BITS = (SHIFT1 > 0) ? BW_INPUT1 + SHIFT1 : BW_INPUT1;
+  localparam EXTRA_PAD = (SIGNED0 != SIGNED1) ? INVERT1 + 1 : INVERT1 + 0;
+  localparam BW_BUF = (IN0_NEED_BITS > IN1_NEED_BITS) ? IN0_NEED_BITS + EXTRA_PAD : IN1_NEED_BITS + EXTRA_PAD;
+  localparam IN0_PAD_LEFT = (SHIFT1 < 0) ? BW_BUF - BW_INPUT0 + SHIFT1 : BW_BUF - BW_INPUT0;
+  localparam IN0_PAD_RIGHT = (SHIFT1 < 0) ? -SHIFT1 : 0;
+  localparam IN1_PAD_LEFT = (SHIFT1 > 0) ? BW_BUF - BW_INPUT1 - SHIFT1 : BW_BUF - BW_INPUT1;
+  localparam IN1_PAD_RIGHT = (SHIFT1 > 0) ? SHIFT1 : 0;
+  // verilator lint_off UNUSEDSIGNAL
+  wire [BW_BUF-1:0] in0_ext;
+  wire [BW_BUF-1:0] in1_ext;
+  // verilator lint_on UNUSEDSIGNAL
+  generate
+    if (SIGNED0 == 1) begin : in0_is_signed
+      assign in0_ext = {{IN0_PAD_LEFT{in0[BW_INPUT0-1]}}, in0, {IN0_PAD_RIGHT{1'b0}}};
+    end else begin : in0_is_unsigned
+      assign in0_ext = {{IN0_PAD_LEFT{1'b0}}, in0, {IN0_PAD_RIGHT{1'b0}}};
+    end
+  endgenerate
+  generate
+    if (SIGNED1 == 1) begin : in1_is_signed
+      assign in1_ext = {{IN1_PAD_LEFT{in1[BW_INPUT1-1]}}, in1, {IN1_PAD_RIGHT{1'b0}}};
+    end else begin : in1_is_unsigned
+      assign in1_ext = {{IN1_PAD_LEFT{1'b0}}, in1, {IN1_PAD_RIGHT{1'b0}}};
+    end
+  endgenerate
+  generate
+    if (INVERT1 == 1) begin : is_invert
+      assign out = (key) ? in0_ext[BW_OUT-1:0] : -in1_ext[BW_OUT-1:0];
+    end else begin : is_not_invert
+      assign out = (key) ? in0_ext[BW_OUT-1:0] : in1_ext[BW_OUT-1:0];
+    end
+  endgenerate
+endmodule

da4ml/codegen/verilog/source/negative.v ADDED Viewed

@@ -0,0 +1,28 @@
+`timescale 1ns / 1ps
+module negative #(
+    parameter BW_IN = 32,
+    parameter BW_OUT = 32,
+    parameter IN_SIGNED = 0
+) (
+    // verilator lint_off UNUSEDSIGNAL
+    input  [ BW_IN-1:0] in,
+    // verilator lint_off UNUSEDSIGNAL
+    output [BW_OUT-1:0] out
+);
+  generate
+    if (BW_IN < BW_OUT) begin : in_is_smaller
+      wire [BW_OUT-1:0] in_ext;
+      if (IN_SIGNED == 1) begin : is_signed
+        assign in_ext = {{BW_OUT - BW_IN{in[BW_IN-1]}}, in};
+      end else begin : is_unsigned
+        assign in_ext = {{BW_OUT - BW_IN{1'b0}}, in};
+      end
+      assign out = -in_ext;
+    end else begin : in_is_bigger
+      assign out = -in[BW_OUT-1:0];
+    end
+  endgenerate
+endmodule

da4ml/codegen/verilog/source/shift_adder.v CHANGED Viewed

@@ -17,7 +17,7 @@ module shift_adder #(
   localparam IN0_NEED_BITS = (SHIFT1 < 0) ? BW_INPUT0 - SHIFT1 : BW_INPUT0;
   localparam IN1_NEED_BITS = (SHIFT1 > 0) ? BW_INPUT1 + SHIFT1 : BW_INPUT1;
-  localparam EXTRA_PAD = (SIGNED0 != SIGNED1) ? IS_SUB+1 : IS_SUB+0;
+  localparam EXTRA_PAD = (SIGNED0 != SIGNED1) ? IS_SUB + 1 : IS_SUB + 0;
   localparam BW_ADD = (IN0_NEED_BITS > IN1_NEED_BITS) ? IN0_NEED_BITS + EXTRA_PAD + 1 : IN1_NEED_BITS + EXTRA_PAD + 1;
   localparam IN0_PAD_LEFT = (SHIFT1 < 0) ? BW_ADD - BW_INPUT0 + SHIFT1 : BW_ADD - BW_INPUT0;
   localparam IN0_PAD_RIGHT = (SHIFT1 < 0) ? -SHIFT1 : 0;
@@ -37,6 +37,9 @@ module shift_adder #(
     end else begin : in0_is_unsigned
       assign in0_ext = {{IN0_PAD_LEFT{1'b0}}, in0, {IN0_PAD_RIGHT{1'b0}}};
     end
+  endgenerate
+  generate
     if (SIGNED1 == 1) begin : in1_is_signed
       assign in1_ext = {{IN1_PAD_LEFT{in1[BW_INPUT1-1]}}, in1, {IN1_PAD_RIGHT{1'b0}}};
     end else begin : in1_is_unsigned

da4ml/codegen/verilog/source/template.xdc CHANGED Viewed

@@ -27,3 +27,6 @@ set_clock_uncertainty -setup $uncertainty_setup [get_clocks sys_clk]
 set_clock_uncertainty -hold $uncertainty_hold [get_clocks sys_clk]
 set_property HD.CLK_SRC BUFG_X0Y0 [get_ports clk]
+set_property retiming_forward 1 [get_cells {stage[*]_inp}]
+set_property retiming_backward 1 [get_cells {stage[*]_inp}]

da4ml/codegen/verilog/verilog_model.py CHANGED Viewed

@@ -13,7 +13,7 @@ from numpy.typing import NDArray
 from ... import codegen
 from ...cmvm.types import CascadedSolution, Solution, _minimal_kif
 from ...trace.pipeline import to_pipeline
-from . import comb_binder_gen, comb_logic_gen, generate_io_wrapper, pipeline_binder_gen, pipeline_logic_gen
+from . import binder_gen, comb_logic_gen, generate_io_wrapper, pipeline_logic_gen
 def get_io_kifs(sol: Solution | CascadedSolution):
@@ -34,6 +34,7 @@ class VerilogModel:
         clock_period: int = 5,
         clock_uncertainty: float = 0.1,
         io_delay_minmax: tuple[float, float] = (0.2, 0.4),
+        register_layers: int = 1,
     ):
         self._solution = solution
         self._path = Path(path)
@@ -45,6 +46,7 @@ class VerilogModel:
         self._clock_period = clock_period
         self._clock_uncertainty = clock_uncertainty
         self._io_delay_minmax = io_delay_minmax
+        self._register_layers = register_layers
         self._pipe = solution if isinstance(solution, CascadedSolution) else None
         if latency_cutoff > 0 and self._pipe is None:
@@ -57,12 +59,13 @@ class VerilogModel:
             self._latency_cutoff = latency_cutoff
         self._lib = None
+        self._uuid = None
     def write(self):
         self._path.mkdir(parents=True, exist_ok=True)
         if self._pipe is not None:  # Pipeline
             # Main logic
-            codes = pipeline_logic_gen(self._pipe, self._prj_name, self._print_latency)
+            codes = pipeline_logic_gen(self._pipe, self._prj_name, self._print_latency, register_layers=self._register_layers)
             for k, v in codes.items():
                 with open(self._path / f'{k}.v', 'w') as f:
                     f.write(v)
@@ -86,8 +89,8 @@ class VerilogModel:
             with open(self._path / f'{self._prj_name}.xdc', 'w') as f:
                 f.write(xdc)
-            # C++ binder w/
-            binder = pipeline_binder_gen(self._pipe, f'{self._prj_name}_wrapper', 1)
+            # C++ binder w/ verilog wrapper for uniform bw
+            binder = binder_gen(self._pipe, f'{self._prj_name}_wrapper', 1, self._register_layers)
             # Verilog IO wrapper (non-uniform bw to uniform one, clk passthrough)
             io_wrapper = generate_io_wrapper(self._pipe, self._prj_name, True)
@@ -103,7 +106,7 @@ class VerilogModel:
             # Verilog IO wrapper (non-uniform bw to uniform one, no clk)
             io_wrapper = generate_io_wrapper(self._solution, self._prj_name, False)
-            binder = comb_binder_gen(self._solution, f'{self._prj_name}_wrapper')
+            binder = binder_gen(self._solution, f'{self._prj_name}_wrapper')
         with open(self._path / f'{self._prj_name}_wrapper.v', 'w') as f:
             f.write(io_wrapper)
@@ -112,13 +115,16 @@ class VerilogModel:
         # Common resource copy
         shutil.copy(self.__src_root / 'verilog/source/shift_adder.v', self._path)
+        shutil.copy(self.__src_root / 'verilog/source/mux.v', self._path)
+        shutil.copy(self.__src_root / 'verilog/source/negative.v', self._path)
         shutil.copy(self.__src_root / 'verilog/source/build_binder.mk', self._path)
-        shutil.copy(self.__src_root / 'verilog/source/ioutils.hh', self._path)
+        shutil.copy(self.__src_root / 'verilog/source/ioutil.hh', self._path)
+        shutil.copy(self.__src_root / 'verilog/source/binder_util.hh', self._path)
         self._solution.save(self._path / 'model.json')
         with open(self._path / 'misc.json', 'w') as f:
             f.write(f'{{"cost": {self._solution.cost}}}')
-    def _compile(self, verbose=False, openmp=True, o3: bool = False, clean=True):
+    def _compile(self, verbose=False, openmp=True, nproc=None, o3: bool = False, clean=True):
         """Same as compile, but will not write to the library
         Parameters
@@ -127,6 +133,9 @@ class VerilogModel:
             Verbose output, by default False
         openmp : bool, optional
             Enable openmp, by default True
+        nproc : int | None, optional
+            Number of processes to use for compilation, by default None
+            If None, will use the number of CPU cores, but not more than 32.
         o3 : bool | None, optional
             Turn on -O3 flag, by default False
         clean : bool, optional
@@ -144,14 +153,20 @@ class VerilogModel:
         env['VM_PREFIX'] = f'{self._prj_name}_wrapper'
         env['STAMP'] = self._uuid
         env['EXTRA_CXXFLAGS'] = '-fopenmp' if openmp else ''
+        if nproc is not None:
+            env['N_JOBS'] = str(nproc)
         if o3:
             args.append('fast')
-        if clean:
+        if clean is not False:
             m = re.compile(r'^lib.*[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\.so$')
             for p in self._path.iterdir():
                 if not p.is_dir() and m.match(p.name):
                     p.unlink()
+        if clean:
+            subprocess.run(
+                ['make', '-f', 'build_binder.mk', 'clean'], env=env, cwd=self._path, check=True, capture_output=not verbose
+            )
         try:
             r = subprocess.run(args, env=env, check=True, cwd=self._path, capture_output=not verbose)
@@ -168,13 +183,19 @@ class VerilogModel:
     def _load_lib(self, uuid: str | None = None):
         uuid = uuid if uuid is not None else self._uuid
+        if uuid is None:
+            # load .so if there is only one, otherwise raise an error
+            libs = list(self._path.glob(f'lib{self._prj_name}_wrapper_*.so'))
+            if len(libs) == 0:
+                raise RuntimeError(f'Cannot load library, found {len(libs)} libraries in {self._path}')
+            uuid = libs[0].name.split('_')[-1].split('.', 1)[0]
         self._uuid = uuid
         lib_path = self._path / f'lib{self._prj_name}_wrapper_{uuid}.so'
         if not lib_path.exists():
             raise RuntimeError(f'Library {lib_path} does not exist')
         self._lib = ctypes.CDLL(str(lib_path))
-    def compile(self, verbose=False, openmp=True, o3: bool = False):
+    def compile(self, verbose=False, openmp=True, nproc: int | None = None, o3: bool = False, clean=True):
         """Compile the generated code to a emulator for logic simulation.
         Parameters
@@ -183,8 +204,13 @@ class VerilogModel:
             Verbose output, by default False
         openmp : bool, optional
             Enable openmp, by default True
+        nproc : int | None, optional
+            Number of processes to use for compilation, by default None
+            If None, will use the number of CPU cores, but not more than 32.
         o3 : bool | None, optional
             Turn on -O3 flag, by default False
+        clean : bool, optional
+            Remove obsolete shared object files, by default True
         Raises
         ------
@@ -192,8 +218,7 @@ class VerilogModel:
             If compilation fails
         """
         self.write()
-        self._compile(verbose=verbose, openmp=openmp, o3=o3)
-        self._load_lib()
+        self._compile(verbose=verbose, openmp=openmp, nproc=nproc, o3=o3, clean=clean)
     def predict(self, data: NDArray[np.floating]):
         """Run the model on the input data.
@@ -225,7 +250,7 @@ class VerilogModel:
         out_data = np.empty(n_sample * out_size, dtype=np.int32)
         # Convert to int32 matching the LSB position
-        inp_data[:] = data.ravel() * 2.0 ** np.max(f_in)
+        inp_data[:] = np.floor(data.ravel() * 2.0**f_in)
         inp_buf = inp_data.ctypes.data_as(ctypes.POINTER(ctypes.c_int32))
         out_buf = out_data.ctypes.data_as(ctypes.POINTER(ctypes.c_int32))
@@ -233,7 +258,7 @@ class VerilogModel:
         # Unscale the output int32 to recover fp values
         k, i, f = np.max(k_out), np.max(i_out), np.max(f_out)
-        a, b, c = 2.0 ** (k + i + f), 2.0 ** (i + f), 2.0**-f
+        a, b, c = 2.0 ** (k + i + f), k * 2.0 ** (i + f), 2.0**-f
         return ((out_data.reshape(n_sample, out_size) + b) % a - b) * c
     def __repr__(self):
@@ -243,11 +268,12 @@ class VerilogModel:
         in_bits, out_bits = np.sum(kifs_in), np.sum(kifs_out)
         if self._pipe is not None:
             n_stage = len(self._pipe[0])
+            delay_suffix = '' if self._register_layers == 1 else f'x {self._register_layers} '
             lat_cutoff = self._latency_cutoff
             reg_bits = self._pipe.reg_bits
             spec = f"""Top Module: {self._prj_name}\n====================
 {inp_size} ({in_bits} bits) -> {out_size} ({out_bits} bits)
-{n_stage} stages @ max_delay={lat_cutoff}
+{n_stage} {delay_suffix}stages @ max_delay={lat_cutoff}
 Estimated cost: {cost} LUTs, {reg_bits} FFs"""
         else:
@@ -258,7 +284,8 @@ Estimated cost: {cost} LUTs"""
         is_compiled = self._lib is not None
         if is_compiled:
-            openmp = 'with OpenMP' if self._lib.openmp_enabled else ''  # type: ignore
+            assert self._uuid is not None
+            openmp = 'with OpenMP' if self._lib.openmp_enabled() else ''  # type: ignore
             spec += f'\nEmulator is compiled {openmp} ({self._uuid[-12:]})'
         else:
             spec += '\nEmulator is **not compiled**'

da4ml/converter/__init__.py ADDED Viewed

File without changes

da4ml 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

Potentially problematic release.

da4ml 0.2.0-py3-none-any.whl → 0.3.0-py3-none-any.whl