da4ml 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- da4ml/_version.py +2 -2
- da4ml/cmvm/api.py +2 -6
- da4ml/cmvm/core/__init__.py +0 -1
- da4ml/cmvm/types.py +99 -19
- da4ml/codegen/__init__.py +5 -4
- da4ml/codegen/cpp/__init__.py +2 -1
- da4ml/codegen/cpp/cpp_codegen.py +58 -25
- da4ml/codegen/cpp/hls_model.py +252 -0
- da4ml/codegen/cpp/source/ap_types/ap_binary.h +78 -0
- da4ml/codegen/cpp/source/ap_types/ap_common.h +376 -0
- da4ml/codegen/cpp/source/ap_types/ap_decl.h +212 -0
- da4ml/codegen/cpp/source/ap_types/ap_fixed.h +360 -0
- da4ml/codegen/cpp/source/ap_types/ap_fixed_base.h +2354 -0
- da4ml/codegen/cpp/source/ap_types/ap_fixed_ref.h +718 -0
- da4ml/codegen/cpp/source/ap_types/ap_fixed_special.h +230 -0
- da4ml/codegen/cpp/source/ap_types/ap_int.h +330 -0
- da4ml/codegen/cpp/source/ap_types/ap_int_base.h +1885 -0
- da4ml/codegen/cpp/source/ap_types/ap_int_ref.h +1346 -0
- da4ml/codegen/cpp/source/ap_types/ap_int_special.h +223 -0
- da4ml/codegen/cpp/source/ap_types/ap_shift_reg.h +138 -0
- da4ml/codegen/cpp/source/ap_types/etc/ap_private.h +7199 -0
- da4ml/codegen/cpp/source/ap_types/hls_math.h +27 -0
- da4ml/codegen/cpp/source/ap_types/hls_stream.h +263 -0
- da4ml/codegen/cpp/source/ap_types/utils/x_hls_utils.h +80 -0
- da4ml/codegen/cpp/source/binder_util.hh +56 -0
- da4ml/codegen/cpp/source/build_binder.mk +24 -0
- da4ml/codegen/cpp/source/{vitis.h → vitis_bitshift.hh} +1 -1
- da4ml/codegen/verilog/__init__.py +2 -3
- da4ml/codegen/verilog/comb.py +65 -24
- da4ml/codegen/verilog/io_wrapper.py +36 -141
- da4ml/codegen/verilog/pipeline.py +21 -3
- da4ml/codegen/verilog/source/binder_util.hh +72 -0
- da4ml/codegen/verilog/source/build_prj.tcl +0 -1
- da4ml/codegen/verilog/source/mux.v +58 -0
- da4ml/codegen/verilog/source/negative.v +28 -0
- da4ml/codegen/verilog/source/shift_adder.v +4 -1
- da4ml/codegen/verilog/source/template.xdc +3 -0
- da4ml/codegen/verilog/verilog_model.py +42 -15
- da4ml/converter/__init__.py +0 -0
- da4ml/converter/hgq2/parser.py +105 -0
- da4ml/converter/hgq2/replica.py +383 -0
- da4ml/trace/__init__.py +2 -2
- da4ml/trace/fixed_variable.py +177 -18
- da4ml/trace/fixed_variable_array.py +124 -9
- da4ml/trace/ops/__init__.py +22 -6
- da4ml/trace/ops/conv_utils.py +146 -14
- da4ml/trace/ops/einsum_utils.py +9 -6
- da4ml/trace/ops/reduce_utils.py +103 -0
- da4ml/trace/pipeline.py +36 -34
- da4ml/trace/tracer.py +37 -5
- da4ml-0.3.0.dist-info/METADATA +107 -0
- da4ml-0.3.0.dist-info/RECORD +64 -0
- da4ml/codegen/cpp/source/vitis_bridge.h +0 -17
- da4ml-0.2.0.dist-info/METADATA +0 -65
- da4ml-0.2.0.dist-info/RECORD +0 -39
- /da4ml/codegen/verilog/source/{ioutils.hh → ioutil.hh} +0 -0
- {da4ml-0.2.0.dist-info → da4ml-0.3.0.dist-info}/WHEEL +0 -0
- {da4ml-0.2.0.dist-info → da4ml-0.3.0.dist-info}/licenses/LICENSE +0 -0
- {da4ml-0.2.0.dist-info → da4ml-0.3.0.dist-info}/top_level.txt +0 -0
da4ml/_version.py
CHANGED
da4ml/cmvm/api.py
CHANGED
```diff
@@ -140,10 +140,6 @@ def jit_solve(
             if not method0 == method1 == 'wmc-dc' or decompose_dc >= 0:
                 decompose_dc -= 1
                 continue
-            if sum([op.cost for op in sol1.ops]) * 4 > sum([op.cost for op in sol0.ops]) and decompose_dc > 0:
-                # If the second stage is too expensive, the decomposition usually doesn't worth it
-                decompose_dc -= 1
-                continue
             break
         if max(latencies1) > latency_allowed:
             # When latency depends on the bw, may happen
@@ -158,8 +154,8 @@ def solve(
     method1: str = 'auto',
     hard_dc: int = -1,
     decompose_dc: int = -2,
-    qintervals:
-    latencies:
+    qintervals: list[QInterval] | None = None,
+    latencies: list[float] | None = None,
     adder_size: int = -1,
     carry_size: int = -1,
     search_all_decompose_dc: bool = True,
```
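For reference, a minimal sketch of how the two new optional keywords might be populated. It assumes `QInterval` is the `(min, max, step)` tuple from `da4ml.cmvm.types`; the remaining `solve` arguments are not shown in this hunk and are omitted here, and the numbers are illustrative only.

```python
from da4ml.cmvm.types import QInterval

# One QInterval(min, max, step) and one arrival latency per input (assumed convention).
# QInterval(-8.0, 7.75, 0.25) describes a signed fixed-point input with 2 fractional bits.
qintervals = [QInterval(-8.0, 7.75, 0.25) for _ in range(4)]
latencies = [0.0] * 4

# solve(..., qintervals=qintervals, latencies=latencies)  # other arguments omitted
```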
da4ml/cmvm/core/__init__.py
CHANGED
da4ml/cmvm/types.py
CHANGED
```diff
@@ -159,6 +159,8 @@ def _relu(v: 'T', i: int | None = None, f: int | None = None, inv: bool = False,
     from ..trace.fixed_variable import FixedVariable

     assert isinstance(v, FixedVariable), f'Unknown type {type(v)} for symbolic relu'
+    if inv:
+        v = -v
     return v.relu(i, f, round_mode=round_mode)


@@ -289,15 +291,16 @@ class Solution(NamedTuple):
            The output data after applying the operations defined in the solution.

        """
+
+        from ..trace.fixed_variable import FixedVariable
+
        buf = np.empty(len(self.ops), dtype=object)
        inp = np.asarray(inp)

        inp_qint = [op.qint for op in self.ops if op.opcode == -1]
        if quantize:  # TRN and WRAP
            k, i, f = map(np.array, zip(*map(minimal_kif, inp_qint)))
-
-            _low, _high = -(2.0 ** (i + f)) * k, 2.0 ** (i + f) - 1
-            inp = eps * ((np.floor(inp / eps) - _low) % 2.0 ** (k + i + f) + _low)
+            inp = [_quantize(*x, round_mode='TRN') for x in zip(inp, k, i, f)]

        inp = inp * (2.0 ** np.array(self.inp_shift))
        for i, op in enumerate(self.ops):
@@ -320,39 +323,61 @@ class Solution(NamedTuple):
                    buf[i] = buf[op.id0] + bias
                case 5:
                    buf[i] = op.data * op.qint.step  # const definition
+                case 6 | -6:  # MSB Mux
+                    id_c = op.data & 0xFFFFFFFF
+                    k, v0, v1 = buf[id_c], buf[op.id0], buf[op.id1]
+                    shift = (op.data >> 32) & 0xFFFFFFFF
+                    shift = shift if shift < 0x80000000 else shift - 0x100000000
+                    if op.opcode == -6:
+                        v1 = -v1
+
+                    if isinstance(k, FixedVariable):
+                        buf[i] = k.msb_mux(v0, v1 * 2**shift)
+                    else:
+                        qint_k = self.ops[id_c].qint
+                        if qint_k.min < 0:
+                            buf[i] = v0 if k < 0 else v1 * 2.0**shift
+                        else:
+                            _k, _i, _f = _minimal_kif(qint_k)
+                            buf[i] = v0 if k >= 2.0 ** (_i - 1) else v1 * 2.0**shift
                case _:
                    raise ValueError(f'Unknown opcode {op.opcode} in {op}')

-        sf = 2.0 ** np.array(self.out_shifts)
+        sf = 2.0 ** np.array(self.out_shifts, dtype=np.float64)
        sign = np.where(self.out_negs, -1, 1)
-        out_idx = np.array(self.out_idxs)
+        out_idx = np.array(self.out_idxs, dtype=np.int32)
        mask = np.where(out_idx < 0, 0, 1)
        if debug:
+            operands = []
            for i, v in enumerate(buf):
                op = self.ops[i]
                match op.opcode:
                    case -1:
                        op_str = 'inp'
-                    case 0:
-
-
-
-
-                        op_str = f'relu(buf[{op.id0}])'
-                    case -
-
-
-                        op_str = f'quantize(buf[{op.id0}])'
-                    case -3:
-                        op_str = f'quantize(-buf[{op.id0}])'
+                    case 0 | 1:
+                        _sign = '-' if op.opcode == 1 else '+'
+                        op_str = f'buf[{op.id0}] {_sign} buf[{op.id1}]<<{op.data}'
+                    case 2 | -2:
+                        _sign = '' if op.opcode == 2 else '-'
+                        op_str = f'relu({_sign}buf[{op.id0}])'
+                    case 3 | -3:
+                        _sign = '' if op.opcode == 3 else '-'
+                        op_str = f'quantize({_sign}buf[{op.id0}])'
                    case 4:
                        op_str = f'buf[{op.id0}] + {op.data * op.qint.step}'
                    case 5:
                        op_str = f'const {op.data * op.qint.step}'
+                    case 6 | -6:
+                        _sign = '-' if op.opcode == -6 else ''
+                        op_str = f'msb(buf[{op.data}]) ? buf[{op.id0}] : {_sign}buf[{op.id1}]'
                    case _:
                        raise ValueError(f'Unknown opcode {op.opcode} in {op}')

-
+                result = f'|-> buf[{i}] = {v}'
+                operands.append((op_str, result))
+            max_len = max(len(op[0]) for op in operands)
+            for op_str, result in operands:
+                print(f'{op_str:<{max_len}} {result}')

        if dump:
            return buf
```
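The new opcode 6/-6 packs two fields into `op.data`: the id of the operation whose MSB selects the branch in the low 32 bits, and a signed shift in the high 32 bits. A small sketch of that unpacking, mirroring the lines above (the packed value here is made up):

```python
# Made-up packed value: shift = 3 in the high 32 bits, select id = 17 in the low 32 bits.
packed = (3 << 32) | 17

id_c = packed & 0xFFFFFFFF                                    # operation whose MSB drives the mux
shift = (packed >> 32) & 0xFFFFFFFF
shift = shift if shift < 0x80000000 else shift - 0x100000000  # reinterpret as signed 32-bit

assert (id_c, shift) == (17, 3)
```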
```diff
@@ -443,6 +468,61 @@ class Solution(NamedTuple):
            data = json.load(f)
        return cls.deserialize(data)

+    @property
+    def ref_count(self) -> np.ndarray:
+        """The number of references to the output elements in the solution."""
+        ref_count = np.zeros(len(self.ops), dtype=np.uint64)
+        for op in self.ops:
+            if op.opcode == -1:
+                continue
+            id0, id1 = op.id0, op.id1
+            if id0 != -1:
+                ref_count[id0] += 1
+            if id1 != -1:
+                ref_count[id1] += 1
+            if op.opcode in (6, -6):
+                # msb_mux operation
+                ref_count[op.data & 0xFFFFFFFF] += 1
+        for i in self.out_idxs:
+            if i < 0:
+                continue
+            ref_count[i] += 1
+        return ref_count
+
+    def to_binary(self):
+        n_in, n_out = self.shape
+        header_size_i32 = 2 + n_in + n_out * 3 + 1
+
+        header = np.concatenate(
+            [
+                [n_in, n_out, len(self.ops)],
+                self.inp_shift,
+                self.out_idxs,
+                self.out_shifts,
+                self.out_negs,
+            ],
+            axis=0,
+            dtype=np.int32,
+        )
+        assert len(header) == header_size_i32, f'Header size mismatch: {len(header)} != {header_size_i32}'
+        code = np.empty((len(self.ops), 8), dtype=np.int32)
+        for i, op in enumerate(self.ops):
+            buf = code[i]
+            buf[0] = op.opcode
+            buf[1] = op.id0
+            buf[2] = op.id1
+            buf[5:] = _minimal_kif(op.qint)
+            buf_i64 = buf[3:5].view(np.int64)
+            buf_i64[0] = op.data
+        data = np.concatenate([header, code.flatten()])
+        return data
+
+    def save_binary(self, path: str | Path):
+        """Dump the solution to a binary file."""
+        data = self.to_binary()
+        with open(path, 'wb') as f:
+            data.tofile(f)
+

 class CascadedSolution(NamedTuple):
    """A solution that implements cascaded matrix-vector multiplications through multiple CMVM stages.
@@ -561,7 +641,7 @@ class CascadedSolution(NamedTuple):
    @property
    def reg_bits(self):
        """The number of bits used for the register in the solution."""
-        bits =
+        bits = sum(map(sum, (_minimal_kif(qint) for qint in self.inp_qint)))
        for _sol in self.solutions:
            kifs = [_minimal_kif(qint) for qint in _sol.out_qint]
            _bits = sum(map(sum, kifs))
```
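The flat int32 layout written by `to_binary` (header: counts, input shifts, output indices/shifts/negations; then eight int32 per op with `data` stored as an int64 in slots 3:5) can be parsed back with plain NumPy. A hypothetical reader, not part of da4ml, that only mirrors the layout shown above:

```python
import numpy as np


def read_solution_binary(path):
    """Parse the header and per-op records written by Solution.to_binary()."""
    raw = np.fromfile(path, dtype=np.int32)
    n_in, n_out, n_ops = raw[:3]
    off = 3
    inp_shift = raw[off:off + n_in]; off += n_in
    out_idxs = raw[off:off + n_out]; off += n_out
    out_shifts = raw[off:off + n_out]; off += n_out
    out_negs = raw[off:off + n_out]; off += n_out
    # Each op record: opcode, id0, id1, data (int64 stored in slots 3:5), k, i, f.
    code = raw[off:].reshape(n_ops, 8)
    opcodes, id0, id1 = code[:, 0], code[:, 1], code[:, 2]
    data = code[:, 3:5].copy().view(np.int64).ravel()
    kif = code[:, 5:8]
    return dict(
        n_in=n_in, n_out=n_out, inp_shift=inp_shift, out_idxs=out_idxs,
        out_shifts=out_shifts, out_negs=out_negs,
        opcodes=opcodes, id0=id0, id1=id1, data=data, kif=kif,
    )
```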
da4ml/codegen/__init__.py
CHANGED
```diff
@@ -1,11 +1,12 @@
-from .cpp import cpp_logic_and_bridge_gen
-from .verilog import
+from .cpp import HLSModel, cpp_logic_and_bridge_gen
+from .verilog import VerilogModel, binder_gen, comb_logic_gen, generate_io_wrapper, pipeline_logic_gen

 __all__ = [
     'cpp_logic_and_bridge_gen',
     'comb_logic_gen',
     'generate_io_wrapper',
-    'comb_binder_gen',
     'pipeline_logic_gen',
-    '
+    'binder_gen',
+    'HLSModel',
+    'VerilogModel',
 ]
```
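With the updated `__all__`, the 0.3.0 top-level codegen imports look like this (note that `comb_binder_gen` is gone and `binder_gen`, `HLSModel`, and `VerilogModel` are new):

```python
from da4ml.codegen import (
    HLSModel,
    VerilogModel,
    binder_gen,
    comb_logic_gen,
    cpp_logic_and_bridge_gen,
    generate_io_wrapper,
    pipeline_logic_gen,
)
```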
da4ml/codegen/cpp/__init__.py
CHANGED
da4ml/codegen/cpp/cpp_codegen.py
CHANGED
```diff
@@ -1,19 +1,19 @@
 from collections.abc import Callable

-from ...cmvm.types import
+from ...cmvm.types import QInterval, Solution, _minimal_kif
 from ...trace.fixed_variable import _const_f


-def kif_to_vitis_type(k: bool | int, i: int, f: int):
+def kif_to_vitis_type(k: bool | int = 1, i: int = 0, f: int = 0):
     if k == i == f == 0:
         f = 1
-    return f'ap_{"" if k else "u"}fixed<{k+i+f},{k+i}>'
+    return f'ap_{"" if k else "u"}fixed<{k + i + f},{k + i}>'


-def kif_to_hlslib_type(k: bool | int, i: int, f: int):
+def kif_to_hlslib_type(k: bool | int = 1, i: int = 0, f: int = 0):
     if k == i == f == 0:
         f = 1
-    return f'ac_fixed<{int(k)},{k+i+f},{k+i}>'
+    return f'ac_fixed<{int(k)},{k + i + f},{k + i}>'


 def get_typestr_fn(flavor: str):
@@ -27,13 +27,18 @@ def get_typestr_fn(flavor: str):
     return typestr_fn


-def ssa_gen(
-
+def ssa_gen(sol: Solution, print_latency: bool, typestr_fn: Callable[[bool | int, int, int], str]):
+    ops = sol.ops
+    all_kifs = list(map(_minimal_kif, (op.qint for op in ops)))
     all_types = list(map(lambda x: typestr_fn(*x), all_kifs))

     lines = []
-
+    ref_count = sol.ref_count
     for i, op in enumerate(ops):
+        if ref_count[i] == 0:
+            # Skip unused ops
+            continue
+
         _type = all_types[i]

         ref0 = f'v{op.id0}'
@@ -42,12 +47,10 @@ def ssa_gen(ops: list[Op], print_latency: bool, typestr_fn: Callable[[bool | int
            case -1:
                # Input marker
                val = f'inp[{ops[op.id0].id0}]'
-
            case 0 | 1:
                # Common a+/-b<<shift op
                ref1 = f'bit_shift<{op.data}>(v{op.id1})' if op.data != 0 else f'v{op.id1}'
                val = f'{ref0} {"-" if op.opcode == 1 else "+"} {ref1}'
-
            case 2 | -2:
                if op.opcode == 2:  # relu(inp)
                    if ops[op.id0].qint.min < 0:
@@ -59,11 +62,9 @@ def ssa_gen(ops: list[Op], print_latency: bool, typestr_fn: Callable[[bool | int
                        val = f'{ref0} > 0 ? {_type}(0) : {_type}(-{ref0})'
                    else:
                        val = f'-{ref0}'
-
            case 3 | -3:
                # Explicit quantization op, done implicitly via assignment
                val = ref0 if op.opcode == 3 else f'-{ref0}'
-
            case 4:
                # Constant addition
                _number = op.data * op.qint.step
@@ -71,10 +72,20 @@ def ssa_gen(ops: list[Op], print_latency: bool, typestr_fn: Callable[[bool | int
                f = _const_f(mag)
                const_type_str = typestr_fn(*_minimal_kif(QInterval(mag, mag, 2.0**-f)))
                val = f'{ref0} {sign} {const_type_str}({mag})'
-
            case 5:
+                # Define constant
                _number = op.data * op.qint.step
                val = f'{_number}'
+            case 6 | -6:
+                # MSB Mux
+                id_c = op.data & 0xFFFFFFFF
+                bw_k = sum(all_kifs[id_c])
+                shift = (op.data >> 32) & 0xFFFFFFFF
+                shift = shift if shift < 0x80000000 else shift - 0x100000000
+                ref_k = f'v{id_c}[{bw_k - 1}]'
+                sign = '-' if op.opcode == -6 else ''
+                ref1 = f'v{op.id1}' if shift == 0 else f'bit_shift<{shift}>(v{op.id1})'
+                val = f'{ref_k} ? {_type}({ref0}) : {_type}({sign}{ref1})'

            case _:
                raise ValueError(f'Unsupported opcode: {op.opcode}')
@@ -103,6 +114,15 @@ def output_gen(sol: Solution, typestr_fn: Callable[[bool | int, int, int], str])
     return lines


+def get_io_types(sol: Solution, flavor: str):
+    typestr_fn = get_typestr_fn(flavor)
+    in_kif = map(max, zip(*map(_minimal_kif, sol.inp_qint)))
+    inp_type = typestr_fn(*in_kif)
+    out_kif = map(max, zip(*map(_minimal_kif, sol.out_qint)))
+    out_type = typestr_fn(*out_kif)
+    return inp_type, out_type
+
+
 def cpp_logic_and_bridge_gen(
     sol: Solution,
     fn_name: str,
@@ -113,36 +133,49 @@
     print_latency: bool = False,
 ):
     typestr_fn = get_typestr_fn(flavor)
-
-    inp_type = typestr_fn(*in_kif)
-    out_kif = map(max, zip(*map(_minimal_kif, sol.out_qint)))
-    out_type = typestr_fn(*out_kif)
+    inp_t, out_t = get_io_types(sol, flavor)

     n_in, n_out = sol.shape
     template_def = 'template <typename inp_t, typename out_t>'
     fn_signature = f'void {fn_name}(inp_t inp[{n_in}], out_t out[{n_out}])'
     pragmas = pragmas or []

-    ssa_lines = ssa_gen(sol
+    ssa_lines = ssa_gen(sol, print_latency=print_latency, typestr_fn=typestr_fn)
     output_lines = output_gen(sol, typestr_fn=typestr_fn)

     indent = ' ' * n_indent
     base_indent = indent * n_base_indent
     body_indent = '\n' + base_indent + indent
     code = f"""{base_indent}{template_def}
-{base_indent}{fn_signature} {{ // {
-{
+{base_indent}{fn_signature} {{ // {inp_t} -> {out_t}
+{base_indent + indent}{body_indent.join(pragmas)}
 {body_indent}{body_indent.join(ssa_lines)}
 {body_indent}{body_indent.join(output_lines)}
 {base_indent}}}
 """
-    bridge = f"""#include "
-#include "
+    bridge = f"""#include "binder_util.hh"
+#include "{fn_name}.hh"
+
+struct {fn_name}_config {{
+    static const size_t N_inp = {n_in};
+    static const size_t N_out = {n_out};
+    typedef {inp_t} inp_t;
+    typedef {out_t} out_t;
+    constexpr static auto f = {fn_name}<inp_t, out_t>;
+}};

 extern "C" {{
-
-
-
+
+bool openmp_enabled() {{
+    return _openmp;
+}}
+
+void inference_f64(double *inp, double *out, size_t size) {{
+    batch_inference<{fn_name}_config, double>(inp, out, size);
+}}
+
+void inference_f32(float *inp, float *out, size_t size) {{
+    batch_inference<{fn_name}_config, float>(inp, out, size);
 }}
 }}"""
     return code, bridge
```
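A quick sanity check of the type-string helpers, with expected outputs worked out from the f-strings above; the module path matches the import used by `hls_model.py`:

```python
from da4ml.codegen.cpp.cpp_codegen import kif_to_hlslib_type, kif_to_vitis_type

assert kif_to_vitis_type(1, 3, 4) == 'ap_fixed<8,4>'    # signed, 3 integer + 4 fractional bits
assert kif_to_vitis_type(0, 3, 4) == 'ap_ufixed<7,3>'   # unsigned variant
assert kif_to_hlslib_type(1, 3, 4) == 'ac_fixed<1,8,4>'
assert kif_to_vitis_type(0, 0, 0) == 'ap_ufixed<1,0>'   # zero-width inputs are padded to 1 fractional bit
```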
da4ml/codegen/cpp/hls_model.py
ADDED
@@ -0,0 +1,252 @@
```python
import ctypes
import os
import re
import shutil
import subprocess
import sys
from collections.abc import Sequence
from pathlib import Path
from typing import TypeVar
from uuid import uuid4

import numpy as np
from numpy.typing import NDArray

from da4ml.cmvm.types import Solution
from da4ml.codegen.cpp.cpp_codegen import cpp_logic_and_bridge_gen, get_io_types

from ... import codegen
from ...cmvm.types import _minimal_kif

T = TypeVar('T', bound=np.floating)


class HLSModel:
    def __init__(
        self,
        solution: Solution,
        prj_name: str,
        path: str | Path,
        flavor: str = 'vitis',
        print_latency: bool = True,
        part_name: str = 'xcvu13p-flga2577-2-e',
        pragma: Sequence[str] | None = None,
        clock_period: int = 5,
        clock_uncertainty: float = 0.1,
        io_delay_minmax: tuple[float, float] = (0.2, 0.4),
    ):
        self._solution = solution
        self._prj_name = prj_name
        self._path = Path(path)
        self._flavor = flavor.lower()
        assert self._flavor in ('vitis', 'hlslib'), f'Unsupported HLS flavor: {self._flavor}'
        self._print_latency = print_latency
        self._part_name = part_name
        self._clock_period = clock_period
        self._clock_uncertainty = clock_uncertainty
        self._io_delay_minmax = io_delay_minmax
        self.__src_root = Path(codegen.__file__).parent
        self._lib = None
        self._uuid = None

        if pragma is None:
            if self._flavor == 'vitis':
                self._pragma = (
                    '#pragma HLS ARRAY_PARTITION variable=inp complete',
                    '#pragma HLS ARRAY_PARTITION variable=out complete',
                    '#pragma HLS PIPELINE II=1',
                )
            else:
                self._pragma = ()
        else:
            self._pragma = tuple(pragma)

    def write(self):
        if not self._path.exists():
            self._path.mkdir(parents=True, exist_ok=True)
        template_def, bridge = cpp_logic_and_bridge_gen(
            self._solution,
            self._prj_name,
            self._flavor,
            ['#pragma HLS INLINE'],
            4,
            0,
            self._print_latency,
        )

        headers = ['#pragma once', '#include "bitshift.hh"']

        inp_type, out_type = get_io_types(self._solution, self._flavor)
        n_in, n_out = len(self._solution.inp_qint), len(self._solution.out_qint)
        template_signature = (
            f'template <typename inp_t, typename out_t>\nvoid {self._prj_name}(inp_t inp[{n_in}], out_t out[{n_out}]);'
        )
        fn_signature = f'void {self._prj_name}_fn({inp_type} inp[{n_in}], {out_type} out[{n_out}])'

        with open(self._path / f'{self._prj_name}.hh', 'w') as f:
            f.write('\n'.join(headers) + '\n\n')
            f.write(f'{template_signature}\n\n{fn_signature};\n')

        pragma_str = '\n'.join(self._pragma)
        cpp_def = f"""
#include "{self._prj_name}.hh"

{template_def}

{fn_signature} {{
{pragma_str}
{self._prj_name}<{inp_type}, {out_type}>(inp, out);
}}
"""
        with open(self._path / f'{self._prj_name}.cc', 'w') as f:
            f.write(cpp_def)

        with open(self._path / f'{self._prj_name}_bridge.cc', 'w') as f:
            f.write(bridge)

        shutil.copy(self.__src_root / 'cpp/source/binder_util.hh', self._path)
        shutil.copy(self.__src_root / f'cpp/source/{self._flavor}_bitshift.hh', self._path / 'bitshift.hh')
        shutil.copy(self.__src_root / 'cpp/source/build_binder.mk', self._path)
        if self._flavor == 'vitis':
            shutil.copytree(self.__src_root / 'cpp/source/ap_types', self._path / 'ap_types', dirs_exist_ok=True)
        else:
            pass

        self._solution.save(self._path / 'project.json')

    def _compile(self, verbose=False, openmp=True, o3: bool = False, clean=True):
        """Same as compile, but will not write to the library

        Parameters
        ----------
        verbose : bool, optional
            Verbose output, by default False
        openmp : bool, optional
            Enable openmp, by default True
        o3 : bool | None, optional
            Turn on -O3 flag, by default False
        clean : bool, optional
            Remove obsolete shared object files, by default True

        Raises
        ------
        RuntimeError
            If compilation fails
        """

        self._uuid = str(uuid4())
        args = ['make', '-f', 'build_binder.mk']
        env = os.environ.copy()
        env['PRJ_NAME'] = self._prj_name
        env['STAMP'] = self._uuid
        env['EXTRA_CXXFLAGS'] = '-fopenmp' if openmp else ''
        if o3:
            args.append('fast')

        if clean:
            m = re.compile(r'^lib.*[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\.so$')
            for p in self._path.iterdir():
                if not p.is_dir() and m.match(p.name):
                    p.unlink()

        try:
            r = subprocess.run(args, env=env, check=True, cwd=self._path, capture_output=not verbose)
        except subprocess.CalledProcessError as e:
            print(e.stderr.decode(), file=sys.stderr)
            print(e.stdout.decode(), file=sys.stdout)
            raise RuntimeError('Compilation failed!!') from e
        if r.returncode != 0:
            print(r.stderr.decode(), file=sys.stderr)
            print(r.stdout.decode(), file=sys.stderr)
            raise RuntimeError('Compilation failed!!')

        self._load_lib(self._uuid)

    def _load_lib(self, uuid: str | None = None):
        uuid = uuid if uuid is not None else self._uuid
        self._uuid = uuid
        lib_path = self._path / f'lib{self._prj_name}_{uuid}.so'
        if not lib_path.exists():
            raise RuntimeError(f'Library {lib_path} does not exist')
        self._lib = ctypes.CDLL(str(lib_path))

    def compile(self, verbose=False, openmp=True, o3: bool = False, clean=True):
        """Compile the model to a shared object file

        Parameters
        ----------
        verbose : bool, optional
            Verbose output, by default False
        openmp : bool, optional
            Enable openmp, by default True
        o3 : bool | None, optional
            Turn on -O3 flag, by default False
        clean : bool, optional
            Remove obsolete shared object files, by default True

        Raises
        ------
        RuntimeError
            If compilation fails
        """
        self.write()
        self._compile(verbose, openmp, o3, clean)

    def predict(self, data: NDArray[T]) -> NDArray[T]:
        """Run the model on the input data.

        Parameters
        ----------
        data : NDArray[np.floating]
            Input data to the model. The shape is ignored, and the number of samples is
            determined by the size of the data.

        Returns
        -------
        NDArray[np.floating]
            Output of the model in shape (n_samples, output_size).
        """
        assert self._lib is not None, 'Library not loaded, call .compile() first.'
        inp_size, out_size = self._solution.shape

        dtype = data.dtype
        if dtype not in (np.float32, np.float64):
            raise TypeError(f'Unsupported input data type: {dtype}. Expected float32 or float64.')
        c_dtype = ctypes.c_float if dtype == np.float32 else ctypes.c_double

        assert data.size % inp_size == 0, f'Input size {data.size} is not divisible by {inp_size}'
        n_sample = data.size // inp_size

        inp_data = np.ascontiguousarray(data)
        out_data = np.empty(n_sample * out_size, dtype=dtype)

        inp_buf = inp_data.ctypes.data_as(ctypes.POINTER(c_dtype))
        out_buf = out_data.ctypes.data_as(ctypes.POINTER(c_dtype))
        if dtype == np.float32:
            self._lib.inference_f32(inp_buf, out_buf, n_sample)
        else:
            self._lib.inference_f64(inp_buf, out_buf, n_sample)

        return out_data.reshape(n_sample, out_size)  # type: ignore

    def __repr__(self):
        inp_size, out_size = self._solution.shape
        cost = round(self._solution.cost)
        inp_kifs = tuple(zip(*map(_minimal_kif, self._solution.inp_qint)))
        out_kifs = tuple(zip(*map(_minimal_kif, self._solution.out_qint)))
        in_bits, out_bits = np.sum(inp_kifs), np.sum(out_kifs)

        spec = f"""Top Function: {self._prj_name}\n====================
{inp_size} ({in_bits} bits) -> {out_size} ({out_bits} bits)
combinational @ delay={self._solution.latency}
Estimated cost: {cost} LUTs"""

        is_compiled = self._lib is not None
        if is_compiled:
            assert self._uuid is not None
            openmp = 'with OpenMP' if self._lib.openmp_enabled() else ''  # type: ignore
            spec += f'\nEmulator is compiled {openmp} ({self._uuid[-12:]})'
        else:
            spec += '\nEmulator is **not compiled**'
        return spec
```
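A minimal usage sketch of the new `HLSModel` wrapper, assuming `sol` is a `da4ml.cmvm.types.Solution` produced elsewhere by the CMVM solver, and that a C++ compiler and `make` are available for the ctypes emulator build:

```python
import numpy as np
from da4ml.codegen import HLSModel

# sol: a Solution obtained from da4ml's CMVM solver (not constructed here).
model = HLSModel(sol, prj_name='mvm', path='build/mvm', flavor='vitis')
model.compile(verbose=False, openmp=True)   # writes the HLS project, then builds the shared-object emulator
print(model)                                # reports I/O widths, delay, LUT estimate, and compile status

x = np.random.rand(128, sol.shape[0]).astype(np.float32)
y = model.predict(x)                        # shape (128, sol.shape[1])
```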