PyPI - da4ml - Versions diffs - 0.1.2__py3-none-any.whl → 0.2.1__py3-none-any.whl - Mend

da4ml 0.1.2__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of da4ml might be problematic. Click here for more details.

Files changed (50) hide show

da4ml/__init__.py +16 -16
da4ml/_version.py +2 -2
da4ml/cmvm/__init__.py +3 -34
da4ml/cmvm/api.py +235 -73
da4ml/cmvm/core/__init__.py +221 -0
da4ml/cmvm/core/indexers.py +83 -0
da4ml/cmvm/core/state_opr.py +284 -0
da4ml/cmvm/types.py +569 -0
da4ml/cmvm/util/__init__.py +7 -0
da4ml/cmvm/util/bit_decompose.py +86 -0
da4ml/cmvm/util/mat_decompose.py +121 -0
da4ml/codegen/__init__.py +11 -0
da4ml/codegen/cpp/__init__.py +3 -0
da4ml/codegen/cpp/cpp_codegen.py +148 -0
da4ml/codegen/cpp/source/vitis.h +30 -0
da4ml/codegen/cpp/source/vitis_bridge.h +17 -0
da4ml/codegen/verilog/__init__.py +13 -0
da4ml/codegen/verilog/comb.py +146 -0
da4ml/codegen/verilog/io_wrapper.py +255 -0
da4ml/codegen/verilog/pipeline.py +67 -0
da4ml/codegen/verilog/source/build_binder.mk +27 -0
da4ml/codegen/verilog/source/build_prj.tcl +74 -0
da4ml/codegen/verilog/source/ioutils.hh +117 -0
da4ml/codegen/verilog/source/shift_adder.v +56 -0
da4ml/codegen/verilog/source/template.xdc +29 -0
da4ml/codegen/verilog/verilog_model.py +268 -0
da4ml/trace/__init__.py +6 -0
da4ml/trace/fixed_variable.py +358 -0
da4ml/trace/fixed_variable_array.py +187 -0
da4ml/trace/ops/__init__.py +55 -0
da4ml/trace/ops/conv_utils.py +104 -0
da4ml/trace/ops/einsum_utils.py +299 -0
da4ml/trace/pipeline.py +155 -0
da4ml/trace/tracer.py +122 -0
da4ml-0.2.1.dist-info/METADATA +65 -0
da4ml-0.2.1.dist-info/RECORD +39 -0
{da4ml-0.1.2.dist-info → da4ml-0.2.1.dist-info}/WHEEL +1 -1
da4ml/cmvm/balanced_reduction.py +0 -46
da4ml/cmvm/cmvm.py +0 -328
da4ml/cmvm/codegen.py +0 -159
da4ml/cmvm/csd.py +0 -73
da4ml/cmvm/fixed_variable.py +0 -205
da4ml/cmvm/graph_compile.py +0 -85
da4ml/cmvm/nb_fixed_precision.py +0 -98
da4ml/cmvm/scoring.py +0 -55
da4ml/cmvm/utils.py +0 -5
da4ml-0.1.2.dist-info/METADATA +0 -122
da4ml-0.1.2.dist-info/RECORD +0 -18
{da4ml-0.1.2.dist-info → da4ml-0.2.1.dist-info}/licenses/LICENSE +0 -0
{da4ml-0.1.2.dist-info → da4ml-0.2.1.dist-info}/top_level.txt +0 -0

da4ml/cmvm/util/mat_decompose.py ADDED Viewed

@@ -0,0 +1,121 @@
+from math import ceil, log2
+import numpy as np
+from numba import jit
+from .bit_decompose import _center, _volatile_int_arr_to_csd
+@jit
+def prim_mst_dc(cost_mat: np.ndarray, dc: int = -1):
+    """Minimum Spanning Tree (MST) using Prim's algorithm with a delay constraint. May not be optimal.
+    Always start from the root node (0).
+    Parameters
+    ----------
+    cost_mat : np.ndarray
+        The adjacency matrix of the graph, where cost_mat[i, j] is the cost of the edge between i and j.
+    dc : int, optional
+        The delay constraint, by default -1
+        If negative, no delay constraint is applied.
+        Delay of each edge is ceiling(log2(cost_mat[i, j])), with costs below 1 clamped to 1.
+        Delay from the root node to any node is the **maximum** latency of each edge connecting in between,
+        plus ceiling(log2(#number of connection edges)).
+        Latency is **NOT** the sum of the latencies.
+    Returns
+    -------
+    np.ndarray
+        The adjacency list of the MST with shape (N - 1, 2) and dtype int32,
+        where each row is a pair of nodes (parent, child).
+    """
+    N = len(cost_mat)
+    # Per-edge latency; costs are clamped to >= 1 so log2 never goes negative.
+    lat_mat = np.ceil(np.log2(np.maximum(cost_mat, 1)))
+    parent = np.full(N, -2, dtype=np.int32)  # -2: not visited, -1: root
+    parent[0] = -1
+    idxs = np.arange(N)
+    mapping = np.empty((N - 1, 2), dtype=np.int32)
+    latency = np.zeros((N,), dtype=np.int32)  # accumulated latency of every attached node
+    if dc >= 0:
+        # Effective latency budget: 2**dc - 1 on top of the latency of the most expensive edge in the root row.
+        _dc = (2**dc - 1) + ceil(log2(np.max(cost_mat[0]) + 1e-32))
+    else:
+        _dc = -1
+    # Each iteration attaches exactly one node, so n_impl equals the number of attached nodes.
+    for n_impl in range(1, N):
+        implemented = parent != -2
+        # Rows: nodes not attached yet; columns: nodes already in the tree.
+        _cost = cost_mat[~implemented][:, implemented]
+        if dc >= 0:
+            _lat = lat_mat[~implemented][:, implemented]
+            # Edges that would exceed the budget get a huge cost so argmin avoids them whenever an
+            # admissible edge exists (np.iinfo implies an integer cost dtype).
+            _cost = np.where(np.maximum(_lat, latency[implemented]) + 1 <= _dc, _cost, np.iinfo(_cost.dtype).max // 2)
+        # argmin works on the flattened matrix; recover (row, column) using the n_impl columns.
+        _idx = int(np.argmin(_cost))
+        _i, _j = _idx // n_impl, _idx % n_impl
+        # Translate the sub-matrix position back to global node ids: i is the new child, j its parent.
+        i, j = idxs[~implemented][_i], idxs[implemented][_j]
+        parent[i] = j
+        mapping[n_impl - 1, 0] = j
+        mapping[n_impl - 1, 1] = i
+        # Latency is the max of the edge latency and the parent's latency, plus one for the new hop.
+        latency[i] = max(lat_mat[i, j], latency[j]) + 1  # type: ignore
+    return mapping
+@jit
+def kernel_decompose(kernel: np.ndarray, dc: int = -2):
+    """Decompose a 2D kernel matrix into two matrices with the delay-constrained approx MST.
+    Parameters
+    ----------
+    kernel : np.ndarray
+        The input kernel matrix to decompose.
+    dc : int, optional
+        Delay constraint, by default -1
+        If -2, no delay constraint is applied.
+        If -1, return trivial decomposition (m0 = kernel, m1 = I).
+        The delay constraint limits the maximum latency (hops) of the decomposed
+        multiplication structure.
+    Returns
+    -------
+    tuple[np.ndarray, np.ndarray]
+        The decomposed matrices (m0, m1): kernel = m0 @ m1
+    """
+    kernel, shift0, shift1 = _center(kernel)
+    scale0, scale1 = 2.0**shift0, 2.0**shift1
+    m, n = kernel.shape[0], kernel.shape[1] + 1
+    mat_aug = np.zeros((m, n), dtype=kernel.dtype)
+    mat_aug[:, 1:] = kernel
+    diff0 = mat_aug[:, :, None] - mat_aug[:, None, :]
+    diff1 = mat_aug[:, :, None] + mat_aug[:, None, :]
+    dist0 = np.sum(np.sum(_volatile_int_arr_to_csd(diff0) != 0, axis=3), axis=0)
+    dist1 = np.sum(np.sum(_volatile_int_arr_to_csd(diff1) != 0, axis=3), axis=0)
+    sign = np.where(dist1 - dist0 < 0, -1, 1)
+    dist = np.minimum(dist0, dist1)
+    mapping = prim_mst_dc(dist, dc=dc)
+    n_in, n_out = kernel.shape
+    m0, m1 = np.zeros((n_in, n_out), dtype=kernel.dtype), np.zeros((n_out, n_out), dtype=kernel.dtype)
+    if dc == -1:
+        m0[:] = kernel
+        m1[:] = np.eye(n_out, dtype=kernel.dtype)
+        return m0 * scale0[:, None], m1 * scale1
+    cnt = 0
+    for _from, _to in mapping:
+        col0 = mat_aug[:, _to] - mat_aug[:, _from] * sign[_to, _from]
+        if _from != 0:
+            col1 = m1[:, _from - 1].copy() * sign[_to, _from]
+        else:
+            col1 = np.zeros(n_out, dtype=kernel.dtype)
+        if np.any(col0 != 0):
+            col1[cnt] = 1
+            m0[:, cnt] = col0
+            cnt += 1
+        m1[:, _to - 1] = col1
+    return m0 * scale0[:, None], m1 * scale1

da4ml/codegen/__init__.py ADDED Viewed

@@ -0,0 +1,11 @@
+from .cpp import cpp_logic_and_bridge_gen
+from .verilog import comb_binder_gen, comb_logic_gen, generate_io_wrapper, pipeline_binder_gen, pipeline_logic_gen
+__all__ = [
+    'cpp_logic_and_bridge_gen',
+    'comb_logic_gen',
+    'generate_io_wrapper',
+    'comb_binder_gen',
+    'pipeline_logic_gen',
+    'pipeline_binder_gen',
+]

da4ml/codegen/cpp/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+from .cpp_codegen import cpp_logic_and_bridge_gen
+__all__ = ['cpp_logic_and_bridge_gen']

da4ml/codegen/cpp/cpp_codegen.py ADDED Viewed

@@ -0,0 +1,148 @@
+from collections.abc import Callable
+from ...cmvm.types import Op, QInterval, Solution, _minimal_kif
+from ...trace.fixed_variable import _const_f
+def kif_to_vitis_type(k: bool | int = 1, i: int = 0, f: int = 0):
+    if k == i == f == 0:
+        f = 1
+    return f'ap_{"" if k else "u"}fixed<{k+i+f},{k+i}>'
+def kif_to_hlslib_type(k: bool | int = 1, i: int = 0, f: int = 0):
+    if k == i == f == 0:
+        f = 1
+    return f'ac_fixed<{int(k)},{k+i+f},{k+i}>'
+def get_typestr_fn(flavor: str):
+    match flavor.lower():
+        case 'vitis':
+            typestr_fn = kif_to_vitis_type
+        case 'hlslib':
+            typestr_fn = kif_to_hlslib_type
+        case _:
+            raise ValueError(f'Unsupported flavor: {flavor}')
+    return typestr_fn
+def ssa_gen(ops: list[Op], print_latency: bool, typestr_fn: Callable[[bool | int, int, int], str]):
+    """Generate one C++ statement per op, in single-assignment form (`<type> v<i> = <expr>;`).
+
+    Each variable is typed with the minimal (k, i, f) layout of the op's quantized interval,
+    rendered through `typestr_fn`. When `print_latency` is set, the op latency is appended as
+    a trailing `//` comment. Returns the list of statement strings; raises ValueError on an
+    opcode this generator does not handle.
+    """
+    all_kifs = map(_minimal_kif, (op.qint for op in ops))
+    all_types = list(map(lambda x: typestr_fn(*x), all_kifs))
+    lines = []
+    for i, op in enumerate(ops):
+        _type = all_types[i]
+        ref0 = f'v{op.id0}'
+        match op.opcode:
+            case -1:
+                # Input marker
+                # NOTE(review): the input index is read through ops[op.id0].id0 rather than op.id0;
+                # the two agree only if that op's id0 equals its own position - confirm intended.
+                val = f'inp[{ops[op.id0].id0}]'
+            case 0 | 1:
+                # Common a+/-b<<shift op
+                ref1 = f'bit_shift<{op.data}>(v{op.id1})' if op.data != 0 else f'v{op.id1}'
+                val = f'{ref0} {"-" if op.opcode == 1 else "+"} {ref1}'
+            case 2 | -2:
+                # The comparison is emitted only when the operand's range can actually cross zero.
+                if op.opcode == 2:  # relu(inp)
+                    if ops[op.id0].qint.min < 0:
+                        val = f'{ref0} > 0 ? {_type}({ref0}) : {_type}(0)'
+                    else:
+                        val = ref0
+                else:  # relu(-inp)
+                    if ops[op.id0].qint.max > 0:
+                        val = f'{ref0} > 0 ? {_type}(0) : {_type}(-{ref0})'
+                    else:
+                        val = f'-{ref0}'
+            case 3 | -3:
+                # Explicit quantization op, done implicitly via assignment
+                val = ref0 if op.opcode == 3 else f'-{ref0}'
+            case 4:
+                # Constant addition
+                # op.data is the constant in units of the result's step.
+                _number = op.data * op.qint.step
+                sign, mag = ('-' if _number < 0 else '+'), abs(_number)
+                f = _const_f(mag)
+                # The literal is wrapped in the minimal type that holds its magnitude.
+                const_type_str = typestr_fn(*_minimal_kif(QInterval(mag, mag, 2.0**-f)))
+                val = f'{ref0} {sign} {const_type_str}({mag})'
+            case 5:
+                # Plain constant, emitted as a literal
+                _number = op.data * op.qint.step
+                val = f'{_number}'
+            case _:
+                raise ValueError(f'Unsupported opcode: {op.opcode}')
+        line = f'{_type} v{i} = {val};'
+        if print_latency:
+            line += f' // {op.latency}'
+        lines.append(line)
+    return lines
+def output_gen(sol: Solution, typestr_fn: Callable[[bool | int, int, int], str]):
+    lines = []
+    for i, idx in enumerate(sol.out_idxs):
+        if idx < 0:
+            lines.append(f'out[{i}] = 0;')
+            continue
+        _type = typestr_fn(*_minimal_kif(sol.out_qint[i]))
+        shift = sol.out_shifts[i]
+        neg_str = '-' if sol.out_negs[i] else ''
+        if shift == 0:
+            lines.append(f'out[{i}] = {_type}({neg_str}v{idx});')
+        else:
+            lines.append(f'out[{i}] = {_type}({neg_str}bit_shift<{shift}>(v{idx}));')
+    return lines
+def cpp_logic_and_bridge_gen(
+    sol: Solution,
+    fn_name: str,
+    flavor: str,
+    pragmas: list[str] | None = None,
+    n_indent: int = 4,
+    n_base_indent: int = 0,
+    print_latency: bool = False,
+):
+    """Generate the templated C++ function for `sol` and an `extern "C"` bridge that calls it.
+
+    Parameters: `fn_name` names the generated function; `flavor` selects the type-string
+    function (see get_typestr_fn); `pragmas` are emitted first in the function body;
+    `n_indent` is the number of spaces per indent level and `n_base_indent` the number of
+    levels applied to the whole function; `print_latency` is forwarded to ssa_gen.
+
+    Returns a tuple (code, bridge) of source strings.
+    """
+    typestr_fn = get_typestr_fn(flavor)
+    # One common input/output type: element-wise maximum of k, i and f over all ports.
+    in_kif = map(max, zip(*map(_minimal_kif, sol.inp_qint)))
+    inp_type = typestr_fn(*in_kif)
+    out_kif = map(max, zip(*map(_minimal_kif, sol.out_qint)))
+    out_type = typestr_fn(*out_kif)
+    n_in, n_out = sol.shape
+    template_def = 'template <typename inp_t, typename out_t>'
+    fn_signature = f'void {fn_name}(inp_t inp[{n_in}], out_t out[{n_out}])'
+    pragmas = pragmas or []
+    ssa_lines = ssa_gen(sol.ops, print_latency=print_latency, typestr_fn=typestr_fn)
+    output_lines = output_gen(sol, typestr_fn=typestr_fn)
+    indent = ' ' * n_indent
+    base_indent = indent * n_base_indent
+    # body_indent starts with a newline, so it works both as a line prefix and as a join separator.
+    body_indent = '\n' + base_indent + indent
+    # The computed port types appear only in the trailing comment; the function itself is templated.
+    code = f"""{base_indent}{template_def}
+{base_indent}{fn_signature} {{ // {inp_type} -> {out_type}
+{body_indent}{body_indent.join(pragmas)}
+{body_indent}{body_indent.join(ssa_lines)}
+{body_indent}{body_indent.join(output_lines)}
+{base_indent}}}
+"""
+    # The bridge instantiates the template with the computed types and hands it to vitis_bridge.
+    bridge = f"""#include "bridge.h"
+#include "fn.h"
+extern "C" {{
+void bridge(double *inp, double *out, int size) {{
+    auto fn = {fn_name}<{inp_type}, {out_type}>;
+    vitis_bridge<{inp_type}, {out_type}, {n_in}, {n_out}>(fn, inp, out, size);
+}}
+}}"""
+    return code, bridge

da4ml/codegen/cpp/source/vitis.h ADDED Viewed

@@ -0,0 +1,30 @@
+#pragma once
+#include "ap_fixed.h"
+// Shift a signed fixed-point value by s bit positions without touching its bits:
+// the b raw bits are copied into a type whose integer width is i + s.
+// The result type uses the default quantization/overflow modes rather than Q, O, N.
+template <int s, int b, int i, ap_q_mode Q, ap_o_mode O, int N> ap_fixed<b, i + s> bit_shift(ap_fixed<b, i, Q, O, N> x) {
+#pragma HLS INLINE
+    ap_fixed<b, i + s> r;
+    r.range() = x.range(); // raw bit copy, no rounding or saturation
+    return r;
+};
+// Unsigned counterpart: copy the b raw bits of x into an ap_ufixed whose integer width is i + s.
+// The result type uses the default quantization/overflow modes rather than Q, O, N.
+template <int s, int b, int i, ap_q_mode Q, ap_o_mode O, int N> ap_ufixed<b, i + s> bit_shift(ap_ufixed<b, i, Q, O, N> x) {
+#pragma HLS INLINE
+    ap_ufixed<b, i + s> r;
+    r.range() = x.range(); // raw bit copy, no rounding or saturation
+    return r;
+};
+// Overload for signed integers: copy the b raw bits of x into an ap_fixed with s integer bits.
+// NOTE(review): the ap_fixed overload yields i + s integer bits; with i = b that would be
+// b + s here, not s - confirm which scaling is intended.
+template <int s, int b> ap_fixed<b, s> bit_shift(ap_int<b> x) {
+#pragma HLS INLINE
+    ap_fixed<b, s> r;
+    r.range() = x.range(); // raw bit copy, no rounding or saturation
+    return r;
+};
+// Overload for unsigned integers: copy the b raw bits of x into an ap_ufixed with s integer bits.
+// NOTE(review): the ap_ufixed overload yields i + s integer bits; with i = b that would be
+// b + s here, not s - confirm which scaling is intended.
+template <int s, int b> ap_ufixed<b, s> bit_shift(ap_uint<b> x) {
+#pragma HLS INLINE
+    ap_ufixed<b, s> r;
+    r.range() = x.range(); // raw bit copy, no rounding or saturation
+    return r;
+};

da4ml/codegen/cpp/source/vitis_bridge.h ADDED Viewed

@@ -0,0 +1,17 @@
+#pragma once
+#include "ap_fixed.h"
+// Run f on `size` consecutive samples stored as doubles.
+// inp holds size * SIZE_IN values and out receives size * SIZE_OUT values, both sample-major.
+// Each sample is converted to inp_t, passed through f(in_buffer, out_buffer), and the results
+// are converted back to double.
+template <typename inp_t, typename out_t, size_t SIZE_IN, size_t SIZE_OUT, typename F>
+void vitis_bridge(F f, double *inp, double *out, int size) {
+    inp_t in_fixed_buf[SIZE_IN];
+    out_t out_fixed_buf[SIZE_OUT];
+    for (int i = 0; i < size; i++) {
+        // Offsets and inner indices use size_t to match SIZE_IN / SIZE_OUT (no signed/unsigned mix).
+        const size_t in_base = static_cast<size_t>(i) * SIZE_IN;
+        const size_t out_base = static_cast<size_t>(i) * SIZE_OUT;
+        for (size_t j = 0; j < SIZE_IN; j++) {
+            in_fixed_buf[j] = inp_t(inp[in_base + j]);
+        }
+        f(in_fixed_buf, out_fixed_buf);
+        for (size_t j = 0; j < SIZE_OUT; j++) {
+            out[out_base + j] = double(out_fixed_buf[j]);
+        }
+    }
+}

da4ml/codegen/verilog/__init__.py ADDED Viewed

@@ -0,0 +1,13 @@
+from .comb import comb_logic_gen
+from .io_wrapper import comb_binder_gen, generate_io_wrapper, pipeline_binder_gen
+from .pipeline import pipeline_logic_gen
+from .verilog_model import VerilogModel
+__all__ = [
+    'comb_logic_gen',
+    'generate_io_wrapper',
+    'comb_binder_gen',
+    'pipeline_logic_gen',
+    'pipeline_binder_gen',
+    'VerilogModel',
+]

da4ml/codegen/verilog/comb.py ADDED Viewed

@@ -0,0 +1,146 @@
+from math import ceil, log2
+import numpy as np
+from da4ml.cmvm.types import Op, QInterval, Solution, _minimal_kif
+def ssa_gen(ops: list[Op], print_latency: bool = False):
+    kifs = list(map(_minimal_kif, (op.qint for op in ops)))
+    widths = list(map(sum, kifs))
+    inp_kifs = [_minimal_kif(op.qint) for op in ops if op.opcode == -1]
+    inp_widths = list(map(sum, inp_kifs))
+    _inp_widths = np.cumsum([0] + inp_widths)
+    inp_idxs = np.stack([_inp_widths[1:] - 1, _inp_widths[:-1]], axis=1)
+    lines = []
+    for i, op in enumerate(ops):
+        bw = widths[i]
+        v = f'v{i}[{bw-1}:0]'
+        _def = f'wire [{bw-1}:0] v{i};'
+        match op.opcode:
+            case -1:  # Input marker
+                i0, i1 = inp_idxs[op.id0]
+                line = f'{_def} assign {v} = inp[{i0}:{i1}];'
+            case 2 | -2:  # ReLU
+                lsb_bias = kifs[op.id0][2] - kifs[i][2]
+                i0, i1 = bw + lsb_bias - 1, lsb_bias
+                v0_name = f'v{op.id0}'
+                bw0 = widths[op.id0]
+                if op.opcode == -2:
+                    _min, _max, step = ops[op.id0].qint
+                    bw_neg = max(sum(_minimal_kif(QInterval(-_max, -_min, step))), bw0)
+                    lines.append(
+                        f'wire [{bw_neg-1}:0] v{op.id0}_neg; assign v{op.id0}_neg[{bw_neg-1}:0] = -{v0_name}[{bw0-1}:0];'
+                    )
+                    v0_name = f'v{op.id0}_neg'
+                if ops[op.id0].qint.min < 0:
+                    line = f'{_def} assign {v} = {v0_name}[{i0}:{i1}] & {{{bw}{{~{v0_name}[{bw0-1}]}}}};'
+                else:
+                    line = f'{_def} assign {v} = {v0_name}[{i0}:{i1}];'
+            case 3 | -3:  # Explicit quantization
+                lsb_bias = kifs[op.id0][2] - kifs[i][2]
+                i0, i1 = bw + lsb_bias - 1, lsb_bias
+                v0_name = f'v{op.id0}'
+                bw0 = widths[op.id0]
+                if op.opcode == -3:
+                    _min, _max, step = ops[op.id0].qint
+                    bw_neg = max(sum(_minimal_kif(QInterval(-_max, -_min, step))), bw0)
+                    lines.append(
+                        f'wire [{bw_neg-1}:0] v{op.id0}_neg; assign v{op.id0}_neg[{bw_neg-1}:0] = -{v0_name}[{bw0-1}:0];'
+                    )
+                    v0_name = f'v{op.id0}_neg'
+                line = f'{_def} assign {v} = {v0_name}[{i0}:{i1}];'
+            case 4:  # constant addition
+                num = op.data
+                sign, mag = int(num < 0), abs(num)
+                line = f"{_def} assign {v} = '{bin(mag)[1:]};"
+                bw1 = ceil(log2(mag + 1))
+                bw0 = widths[op.id0]
+                s0 = int(kifs[op.id0][0])
+                v0 = f'v{op.id0}[{bw0-1}:0]'
+                v1 = f"'{bin(mag)[1:]}"
+                shift = int(log2(op.qint.step / ops[op.id0].qint.step))
+                line = f'{_def} shift_adder #({bw0}, {bw1}, {s0}, 0, {bw}, {shift}, {sign}) op_{i} ({v0}, {v1}, {v});'
+            case 5:  # constant
+                num = op.data
+                if num < 0:
+                    num = 2**bw + num
+                line = f"{_def} assign {v} = '{bin(num)[1:]};"
+            case 0 | 1:  # Common a+/-b<<shift oprs
+                p0, p1 = kifs[op.id0], kifs[op.id1]  # precision -> keep_neg, integers (no sign), fractional
+                bw0, bw1 = widths[op.id0], widths[op.id1]  # width
+                s0, f0, s1, f1 = int(p0[0]), p0[2], int(p1[0]), p1[2]
+                shift = op.data + f0 - f1
+                v0, v1 = f'v{op.id0}[{bw0-1}:0]', f'v{op.id1}[{bw1-1}:0]'
+                line = f'{_def} shift_adder #({bw0}, {bw1}, {s0}, {s1}, {bw}, {shift}, {op.opcode}) op_{i} ({v0}, {v1}, {v});'
+            case _:
+                raise ValueError(f'Unknown opcode {op.opcode} for operation {i} ({op})')
+        if print_latency:
+            line += f' // {op.latency}'
+        lines.append(line)
+    return lines
+def output_gen(sol: Solution):
+    lines = []
+    widths = list(map(sum, map(_minimal_kif, sol.out_qint)))
+    _widths = np.cumsum([0] + widths)
+    out_idxs = np.stack([_widths[1:] - 1, _widths[:-1]], axis=1)
+    for i, idx in enumerate(sol.out_idxs):
+        if idx < 0:
+            continue
+        i0, i1 = out_idxs[i]
+        bw = widths[i]
+        bw0 = sum(_minimal_kif(sol.ops[idx].qint))
+        if sol.out_negs[i]:
+            lines.append(f'wire [{bw-1}:0] out_neg{i}; assign out_neg{i} = -v{idx}[{bw0-1}:0];')
+            lines.append(f'assign out[{i0}:{i1}] = out_neg{i}[{bw-1}:0];')
+        else:
+            lines.append(f'assign out[{i0}:{i1}] = v{idx}[{bw-1}:0];')
+    return lines
+def comb_logic_gen(sol: Solution, fn_name: str, print_latency: bool = False, timescale: str | None = None):
+    """Generate a combinational Verilog module named `fn_name` that implements `sol`.
+
+    The module has one flat `inp` bus and one flat `out` bus whose widths are the sums of the
+    minimal bit widths of all inputs and outputs. `print_latency` is forwarded to ssa_gen.
+    If `timescale` is given, it is placed before the module, followed by a blank line.
+    Returns the module source as a string.
+    """
+    inp_bits = sum(map(sum, map(_minimal_kif, sol.inp_qint)))
+    out_bits = sum(map(sum, map(_minimal_kif, sol.out_qint)))
+    fn_signature = [
+        f'module {fn_name} (',
+        f'    input [{inp_bits-1}:0] inp,',
+        f'    output [{out_bits-1}:0] out',
+        ');',
+    ]
+    ssa_lines = ssa_gen(sol.ops, print_latency=print_latency)
+    output_lines = output_gen(sol)
+    indent = '    '
+    base_indent = '\n'
+    # Both indents start with a newline, so they double as join separators.
+    body_indent = base_indent + indent
+    # base_indent[1:] is empty; the literal four-space prefixes in the template indent the
+    # first line of each joined section, and the separator indents the remaining lines.
+    code = f"""{base_indent[1:]}{base_indent.join(fn_signature)}
+    // verilator lint_off UNUSEDSIGNAL
+    // Explicit quantization operation will drop bits if exists
+    {body_indent.join(ssa_lines)}
+    // verilator lint_on UNUSEDSIGNAL
+    {body_indent.join(output_lines)}
+    endmodule
+"""
+    if timescale is not None:
+        code = f'{timescale}\n\n{code}'
+    return code

da4ml 0.1.2__py3-none-any.whl → 0.2.1__py3-none-any.whl

Potentially problematic release.

da4ml 0.1.2__py3-none-any.whl → 0.2.1__py3-none-any.whl