PyPI - da4ml - Versions diffs - 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl - Mend

da4ml 0.2.0py3-none-any.whl → 0.3.0py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of da4ml might be problematic. Click here for more details.

Files changed (59) hide show

da4ml/_version.py +2 -2
da4ml/cmvm/api.py +2 -6
da4ml/cmvm/core/__init__.py +0 -1
da4ml/cmvm/types.py +99 -19
da4ml/codegen/__init__.py +5 -4
da4ml/codegen/cpp/__init__.py +2 -1
da4ml/codegen/cpp/cpp_codegen.py +58 -25
da4ml/codegen/cpp/hls_model.py +252 -0
da4ml/codegen/cpp/source/ap_types/ap_binary.h +78 -0
da4ml/codegen/cpp/source/ap_types/ap_common.h +376 -0
da4ml/codegen/cpp/source/ap_types/ap_decl.h +212 -0
da4ml/codegen/cpp/source/ap_types/ap_fixed.h +360 -0
da4ml/codegen/cpp/source/ap_types/ap_fixed_base.h +2354 -0
da4ml/codegen/cpp/source/ap_types/ap_fixed_ref.h +718 -0
da4ml/codegen/cpp/source/ap_types/ap_fixed_special.h +230 -0
da4ml/codegen/cpp/source/ap_types/ap_int.h +330 -0
da4ml/codegen/cpp/source/ap_types/ap_int_base.h +1885 -0
da4ml/codegen/cpp/source/ap_types/ap_int_ref.h +1346 -0
da4ml/codegen/cpp/source/ap_types/ap_int_special.h +223 -0
da4ml/codegen/cpp/source/ap_types/ap_shift_reg.h +138 -0
da4ml/codegen/cpp/source/ap_types/etc/ap_private.h +7199 -0
da4ml/codegen/cpp/source/ap_types/hls_math.h +27 -0
da4ml/codegen/cpp/source/ap_types/hls_stream.h +263 -0
da4ml/codegen/cpp/source/ap_types/utils/x_hls_utils.h +80 -0
da4ml/codegen/cpp/source/binder_util.hh +56 -0
da4ml/codegen/cpp/source/build_binder.mk +24 -0
da4ml/codegen/cpp/source/{vitis.h → vitis_bitshift.hh} +1 -1
da4ml/codegen/verilog/__init__.py +2 -3
da4ml/codegen/verilog/comb.py +65 -24
da4ml/codegen/verilog/io_wrapper.py +36 -141
da4ml/codegen/verilog/pipeline.py +21 -3
da4ml/codegen/verilog/source/binder_util.hh +72 -0
da4ml/codegen/verilog/source/build_prj.tcl +0 -1
da4ml/codegen/verilog/source/mux.v +58 -0
da4ml/codegen/verilog/source/negative.v +28 -0
da4ml/codegen/verilog/source/shift_adder.v +4 -1
da4ml/codegen/verilog/source/template.xdc +3 -0
da4ml/codegen/verilog/verilog_model.py +42 -15
da4ml/converter/__init__.py +0 -0
da4ml/converter/hgq2/parser.py +105 -0
da4ml/converter/hgq2/replica.py +383 -0
da4ml/trace/__init__.py +2 -2
da4ml/trace/fixed_variable.py +177 -18
da4ml/trace/fixed_variable_array.py +124 -9
da4ml/trace/ops/__init__.py +22 -6
da4ml/trace/ops/conv_utils.py +146 -14
da4ml/trace/ops/einsum_utils.py +9 -6
da4ml/trace/ops/reduce_utils.py +103 -0
da4ml/trace/pipeline.py +36 -34
da4ml/trace/tracer.py +37 -5
da4ml-0.3.0.dist-info/METADATA +107 -0
da4ml-0.3.0.dist-info/RECORD +64 -0
da4ml/codegen/cpp/source/vitis_bridge.h +0 -17
da4ml-0.2.0.dist-info/METADATA +0 -65
da4ml-0.2.0.dist-info/RECORD +0 -39
/da4ml/codegen/verilog/source/{ioutils.hh → ioutil.hh} +0 -0
{da4ml-0.2.0.dist-info → da4ml-0.3.0.dist-info}/WHEEL +0 -0
{da4ml-0.2.0.dist-info → da4ml-0.3.0.dist-info}/licenses/LICENSE +0 -0
{da4ml-0.2.0.dist-info → da4ml-0.3.0.dist-info}/top_level.txt +0 -0

da4ml/trace/ops/conv_utils.py CHANGED Viewed

@@ -1,10 +1,15 @@
+import typing
 from collections.abc import Sequence
+from math import ceil, prod
 from typing import TypeVar
 import numpy as np
 from numpy.typing import NDArray
-from ..fixed_variable_array import FixedVariableArray
+from .reduce_utils import reduce
+if typing.TYPE_CHECKING:
+    from ..fixed_variable_array import FixedVariableArray
 def r_im2col(kernel_size: Sequence[int], arr: np.ndarray, buffer: np.ndarray, axis: int):
@@ -33,23 +38,23 @@ def stride_arr(stride: int | tuple[int, ...], arr: np.ndarray):
     ndim = arr.ndim
     if isinstance(stride, int):
         stride = (stride,) * (ndim - 1)
-    assert len(stride) == ndim - 1, f'Invalid stride {stride} for array with {ndim} dimensions'
     _idx = tuple(slice(None, None, st) for st in stride)
     return arr[*_idx]
-T = TypeVar('T', FixedVariableArray, NDArray[np.integer | np.floating])
+TA = TypeVar('TA', 'FixedVariableArray', NDArray[np.integer | np.floating])
-def conv(
-    x: T,
+def _conv(
+    x: TA,
     kernel: NDArray[np.integer | np.floating],
     bias: NDArray[np.integer | np.floating] | None = None,
     strides: int | tuple[int, ...] = 1,
     padding: tuple[tuple[int, int], ...] | str = 'VALID',
-    format: str = 'channels_last',
-):
+) -> TA:
+    from ..fixed_variable_array import FixedVariableArray
     if isinstance(x, FixedVariableArray):
         solver_options = x.solver_options
         data = x._vars
@@ -63,10 +68,10 @@ def conv(
     ch_in, ch_out = kernel.shape[-2:]
     _ch_in = data.shape[-1]
     assert ch_in == _ch_in, f'Invalid input shape {data.shape} for kernel {kernel.shape}'
-    assert kernel.ndim == ndim + 1
-    assert format in ('channels_last', 'channels_first'), f'Invalid format {format}'
+    if kernel.ndim != ndim + 1:
+        if kernel.ndim == ndim:
+            raise ValueError('Inputs should not contain batch dimension')
+        raise ValueError(f'Invalid kernel shape {kernel.shape} for input with {ndim} dimensions')
     if isinstance(strides, int):
         strides = (strides,) * (ndim - 1)
     assert len(strides) == ndim - 1, f'Invalid stride {strides} for array with {ndim} dimensions'
@@ -89,16 +94,143 @@ def conv(
     data = np.pad(data, padding + ((0, 0),), mode='constant', constant_values=0.0)
     data = _im2col(kernel.shape, data)
+    data = stride_arr(strides, data)
     if is_symbolic:
         _data = FixedVariableArray(data, solver_options) @ kernel.reshape(-1, ch_out)
         data = _data._vars
     else:
         data = data @ kernel.reshape(-1, ch_out)
-    data = stride_arr(strides, data)
     if bias is not None:
         data = data + bias
+    if isinstance(x, FixedVariableArray):
+        return FixedVariableArray(data, solver_options)
+    return data
+def conv(
+    x: TA,
+    kernel: NDArray[np.integer | np.floating],
+    bias: NDArray[np.integer | np.floating] | None = None,
+    strides: int | tuple[int, ...] = 1,
+    padding: tuple[tuple[int, int], ...] | str = 'VALID',
+    format: str = 'channels_last',
+    groups: int | None = None,
+) -> TA:
+    from ..fixed_variable_array import FixedVariableArray
+    assert format in ('channels_last', 'channels_first'), f'Invalid format {format}'
+    if format == 'channels_first':
+        x = np.moveaxis(x, 0, -1)  # type: ignore
+    *_, _ch_in, ch_out = kernel.shape
+    ch_in = x.shape[-1]
+    assert ch_in % _ch_in == 0, f'groups is not integer (total_ch_in={ch_in}, kernel_ch_in={_ch_in})'
+    if groups is None:
+        groups = ch_in // _ch_in
+    else:
+        assert (
+            groups == ch_in // _ch_in
+        ), f'groups {groups} does not match input channels {ch_in} and kernel input channels {_ch_in}'
+    assert ch_out % groups == 0, f'groups is not integer (total_ch_out={ch_out}, groups={groups})'
+    _ch_out = ch_out // groups
+    buf: list[TA] = []
+    for gp in range(groups):
+        _kernel = kernel[..., gp * _ch_out : (gp + 1) * _ch_out]
+        _x = x[..., gp * _ch_in : (gp + 1) * _ch_in]
+        _buf = _conv(
+            _x,
+            _kernel,
+            strides=strides,
+            padding=padding,
+        )
+        buf.append(_buf)  # type: ignore
+    if isinstance(x, FixedVariableArray):
+        data = np.concatenate([b._vars for b in buf], axis=-1)  # type: ignore
+    else:
+        data = np.concatenate(buf, axis=-1)  # type: ignore
+    data = data + bias if bias is not None else data
     if format == 'channels_first':
-        data = np.moveaxis(data, -1, 1)
-    if solver_options is not None:
+        return np.moveaxis(data, -1, 0)  # type: ignore
+    if isinstance(x, FixedVariableArray):
+        return FixedVariableArray(data, x.solver_options)
+    return data
+def pool(
+    x: TA,
+    pool_size: Sequence[int],
+    strides: int | Sequence[int] | None = None,
+    padding: tuple[tuple[int, int], ...] | str = 'VALID',
+    pool_type: str = 'avg',
+    format: str = 'channels_last',
+) -> TA:
+    from ..fixed_variable import FixedVariable
+    from ..fixed_variable_array import FixedVariableArray
+    if isinstance(x, FixedVariableArray):
+        solver_options = x.solver_options
+        data = x._vars
+    else:
+        solver_options = None
+        data = x
+    if format == 'channels_first':
+        data = np.moveaxis(data, 0, -1)
+    strides = strides or pool_size
+    assert pool_type in ('avg', 'max'), f'Invalid pool type {pool_type}'
+    ndim = data.ndim
+    if isinstance(strides, int):
+        strides = (strides,) * (ndim - 1)
+    assert len(strides) == ndim - 1, f'Invalid stride {strides} for array with {ndim} dimensions'
+    if isinstance(padding, str):
+        padding = padding.upper()
+        if padding == 'VALID':
+            padding = ((0, 0),) * (ndim - 1)
+        elif padding == 'SAME':
+            _padding = []
+            for i in range(ndim - 1):
+                n_pad = ceil(data.shape[i] / strides[i]) * strides[i] + (pool_size[i] - strides[i]) - data.shape[i]
+                pad0 = n_pad // 2
+                pad1 = n_pad - pad0
+                _padding.append((pad0, pad1))
+            padding = tuple(_padding)
+        else:
+            raise ValueError(f'Invalid padding {padding}')
+    assert len(padding) == ndim - 1, f'Invalid padding {padding} for array with {ndim} dimensions'
+    assert all(len(p) == 2 for p in padding), f'Invalid padding {padding} for array with {ndim} dimensions'
+    data = np.pad(data, padding + ((0, 0),), mode='constant', constant_values=-np.inf)
+    ch_in = data.shape[-1]
+    fake_kernel_shape = tuple(pool_size) + (ch_in, ch_in)
+    data = _im2col(fake_kernel_shape, data)
+    data = data.reshape(*data.shape[:-1], prod(pool_size), ch_in)
+    data = stride_arr(tuple(strides), data)
+    if pool_type == 'avg':
+        div = np.sum(data != -np.inf, axis=-2)
+        data = np.where(data == -np.inf, 0, data)
+        data = reduce(lambda x, y: x + y, data, axis=-2) * (1 / div)
+    else:
+        def max_of(a, b):
+            if isinstance(a, FixedVariable):
+                return a.max_of(b)
+            if isinstance(b, FixedVariable):
+                return b.max_of(a)
+            return max(a, b)
+        data = reduce(lambda x, y: max_of(x, y), data, axis=-2)
+    if format == 'channels_first':
+        data = np.moveaxis(data, -1, 0)
+    if isinstance(x, FixedVariableArray):
         return FixedVariableArray(data, solver_options)
     return data

da4ml/trace/ops/einsum_utils.py CHANGED Viewed

@@ -1,10 +1,11 @@
 from math import prod
-from typing import TypedDict, overload
+from typing import TYPE_CHECKING, TypedDict, overload
 import numpy as np
 from numpy.typing import NDArray
-from ..fixed_variable_array import FixedVariableArray
+if TYPE_CHECKING:
+    from ..fixed_variable_array import FixedVariableArray
 class EinsumRecipe(TypedDict):
@@ -105,7 +106,7 @@ def _validate_einsum_expr(fn: str, shape0: tuple[int, ...], shape1: tuple[int, .
         # Axes expansion in input0 or input1 only
         if '0' in sax_in0:
             if len(sax_in0) - 1 > len(shape0):
-                raise ValueError(f'Input0 requires at least {len(sax_in0)-1} dimensions, but only {len(shape0)} given')
+                raise ValueError(f'Input0 requires at least {len(sax_in0) - 1} dimensions, but only {len(shape0)} given')
             # Replace auto expansion indices with free indices
             n_broadcast = len(shape0) - len(sax_in0) + 1
             in0 = in0.replace('0', free_indices[:n_broadcast])
@@ -118,7 +119,7 @@ def _validate_einsum_expr(fn: str, shape0: tuple[int, ...], shape1: tuple[int, .
         if '0' in sax_in1:
             if len(sax_in1) - 1 > len(shape1):
-                raise ValueError(f'Input1 requires at least {len(sax_in1)-1} dimensions, but only {len(shape1)} given')
+                raise ValueError(f'Input1 requires at least {len(sax_in1) - 1} dimensions, but only {len(shape1)} given')
             # Replace expansion indices with free indices
             n_broadcast = len(shape1) - len(sax_in1) + 1
             in1 = in1.replace('0', free_indices[:n_broadcast])
@@ -271,11 +272,11 @@ def _einsum(fn: str, input0, input1) -> np.ndarray:
 @overload
-def einsum(fn: str, input0: FixedVariableArray, input1: NDArray[np.integer | np.floating]) -> FixedVariableArray: ...
+def einsum(fn: str, input0: 'FixedVariableArray', input1: NDArray[np.integer | np.floating]) -> 'FixedVariableArray': ...
 @overload
-def einsum(fn: str, input0: NDArray[np.integer | np.floating], input1: FixedVariableArray) -> FixedVariableArray: ...
+def einsum(fn: str, input0: NDArray[np.integer | np.floating], input1: 'FixedVariableArray') -> 'FixedVariableArray': ...
 @overload
@@ -285,6 +286,8 @@ def einsum(
 def einsum(fn: str, input0, input1):
+    from ..fixed_variable_array import FixedVariableArray
     fg0 = isinstance(input0, FixedVariableArray)
     fg1 = isinstance(input1, FixedVariableArray)
     if fg0 and fg1:

da4ml/trace/ops/reduce_utils.py ADDED Viewed

@@ -0,0 +1,103 @@
+import heapq
+import typing
+from collections.abc import Callable, Sequence
+from math import prod
+from typing import TypeVar
+import numpy as np
+from numpy.typing import NDArray
+if typing.TYPE_CHECKING:
+    from ..fixed_variable import FixedVariable
+    from ..fixed_variable_array import FixedVariableArray
+T = typing.TypeVar('T', 'FixedVariable', float, np.floating)
+TA = TypeVar('TA', 'FixedVariableArray', NDArray[np.integer | np.floating])
+class Packet:
+    def __init__(self, v):
+        self.value = v
+    def __gt__(self, other: 'Packet') -> bool:  # type: ignore
+        from ..fixed_variable_array import FixedVariable
+        a, b = self.value, other.value
+        if isinstance(a, FixedVariable):
+            if isinstance(b, FixedVariable):
+                if b.latency > a.latency:
+                    return False
+                if b.latency < a.latency:
+                    return True
+                if b._factor > 0 and a._factor < 0:
+                    return False
+                if b._factor < 0 and a._factor > 0:
+                    return True
+                return sum(a.kif[:2]) > sum(b.kif[:2])
+            return True
+        return False
+    def __lt__(self, other: 'Packet') -> bool:  # type: ignore
+        return not self.__gt__(other)
+def _reduce(operator: Callable[[T, T], T], arr: Sequence[T]) -> T:
+    from ..fixed_variable_array import FixedVariable
+    if isinstance(arr, np.ndarray):
+        arr = list(arr.ravel())
+    assert len(arr) > 0, 'Array must not be empty'
+    if len(arr) == 1:
+        return arr[0]
+    dtype = arr[0].__class__
+    if not issubclass(dtype, FixedVariable):
+        r = operator(arr[0], arr[1])
+        for i in range(2, len(arr)):
+            r = operator(r, arr[i])
+        return r
+    heap = [Packet(v) for v in arr]  # type: ignore
+    heapq.heapify(heap)
+    while len(heap) > 1:
+        v1 = heapq.heappop(heap).value
+        v2 = heapq.heappop(heap).value
+        v = operator(v1, v2)
+        heapq.heappush(heap, Packet(v))  # type: ignore
+    return heap[0].value
+def reduce(operator: Callable[[T, T], T], x: TA, axis: int | Sequence[int] | None = None, keepdims: bool = False) -> TA:
+    """
+    Reduce the array by summing over the specified axis.
+    """
+    from ..fixed_variable_array import FixedVariableArray
+    if isinstance(x, FixedVariableArray):
+        solver_config = x.solver_options
+        arr = x._vars
+    else:
+        solver_config = None
+        arr = x
+    all_axis = tuple(range(arr.ndim))
+    axis = axis if axis is not None else all_axis
+    axis = (axis,) if isinstance(axis, int) else tuple(axis)
+    axis = tuple(a if a >= 0 else a + arr.ndim for a in axis)
+    xpose_axis = sorted(all_axis, key=lambda a: (a in axis) * 1000 + a)
+    if keepdims:
+        target_shape = tuple(d if ax not in axis else 1 for ax, d in enumerate(arr.shape))
+    else:
+        target_shape = tuple(d for ax, d in enumerate(arr.shape) if ax not in axis)
+    dim_contract = prod(arr.shape[a] for a in axis)
+    arr = np.transpose(arr, xpose_axis)  # type: ignore
+    _arr = arr.reshape(-1, dim_contract)
+    _arr = np.array([_reduce(operator, _arr[i]) for i in range(_arr.shape[0])])
+    r = _arr.reshape(target_shape)  # type: ignore
+    if isinstance(x, FixedVariableArray):
+        return FixedVariableArray(r, solver_config)
+    return r

da4ml/trace/pipeline.py CHANGED Viewed

@@ -31,6 +31,35 @@ def retime_pipeline(csol: CascadedSolution, verbose=True):
     return best
+def _get_new_idx(
+    idx: int,
+    locator: list[dict[int, int]],
+    opd: dict[int, list[Op]],
+    out_idxd: dict[int, list[int]],
+    ops: list[Op],
+    stage: int,
+    latency_cutoff: int,
+):
+    if idx < 0:
+        return idx
+    p0_stages = locator[idx].keys()
+    if stage not in p0_stages:
+        # Need to copy parent to later states
+        p0_stage = max(p0_stages)
+        p0_idx = locator[idx][p0_stage]
+        for j in range(p0_stage, stage):
+            op0 = ops[idx]
+            latency = float(latency_cutoff * (j + 1))
+            out_idxd.setdefault(j, []).append(locator[idx][j])
+            _copy_op = Op(len(out_idxd[j]) - 1, -1, -1, 0, op0.qint, latency, 0.0)
+            opd.setdefault(j + 1, []).append(_copy_op)
+            p0_idx = len(opd[j + 1]) - 1
+            locator[idx][j + 1] = p0_idx
+    else:
+        p0_idx = locator[idx][stage]
+    return p0_idx
 def to_pipeline(sol: Solution, latency_cutoff: int, retiming=True, verbose=True) -> CascadedSolution:
     """Split the record into multiple stages based on the latency of the operations.
     Only useful for HDL generation.
@@ -80,46 +109,19 @@ def to_pipeline(sol: Solution, latency_cutoff: int, retiming=True, verbose=True)
             opd.setdefault(stage, []).append(op)
             locator.append({stage: len(opd[stage]) - 1})
             continue
-        p0_stages = locator[op.id0].keys()
-        if stage not in p0_stages:
-            # Need to copy parent to later states
-            p0_stage = max(p0_stages)
-            p0_idx = locator[op.id0][p0_stage]
-            for j in range(p0_stage, stage):
-                op0 = ops[op.id0]
-                latency = float(latency_cutoff * (j + 1))
-                out_idxd.setdefault(j, []).append(locator[op.id0][j])
-                _copy_op = Op(len(out_idxd[j]) - 1, -1, -1, 0, op0.qint, latency, 0.0)
-                opd.setdefault(j + 1, []).append(_copy_op)
-                p0_idx = len(opd[j + 1]) - 1
-                locator[op.id0][j + 1] = p0_idx
-        else:
-            p0_idx = locator[op.id0][stage]
-        if op.opcode in (0, 1):
-            p1_stages = locator[op.id1].keys()
-            if stage not in p1_stages:
-                # Need to copy parent to later states
-                p1_stage = max(p1_stages)
-                p1_idx = locator[op.id1][p1_stage]
-                for j in range(p1_stage, stage):
-                    op1 = ops[op.id1]
-                    latency = float(latency_cutoff * (j + 1))
-                    out_idxd.setdefault(j, []).append(locator[op.id1][j])
-                    _copy_op = Op(len(out_idxd[j]) - 1, -1, -1, 0, op1.qint, latency, 0.0)
-                    opd.setdefault(j + 1, []).append(_copy_op)
-                    p1_idx = len(opd[j + 1]) - 1
-                    locator[op.id1][j + 1] = p1_idx
-            else:
-                p1_idx = locator[op.id1][stage]
+        p0_idx = _get_new_idx(op.id0, locator, opd, out_idxd, ops, stage, latency_cutoff)
+        p1_idx = _get_new_idx(op.id1, locator, opd, out_idxd, ops, stage, latency_cutoff)
+        if op.opcode in (6, -6):
+            data = _get_new_idx(op.data, locator, opd, out_idxd, ops, stage, latency_cutoff)
         else:
-            p1_idx = op.id1
+            data = op.data
         if p1_idx == -1001:
             # Output to external buffer
             out_idxd.setdefault(stage, []).append(p0_idx)
         else:
-            _Op = Op(p0_idx, p1_idx, op.opcode, op.data, op.qint, op.latency, op.cost)
+            _Op = Op(p0_idx, p1_idx, op.opcode, data, op.qint, op.latency, op.cost)
             opd.setdefault(stage, []).append(_Op)
             locator.append({stage: len(opd[stage]) - 1})
     sols = []

da4ml/trace/tracer.py CHANGED Viewed

@@ -1,5 +1,6 @@
 from collections.abc import Sequence
 from decimal import Decimal
+from itertools import chain
 from math import log2
 from typing import overload
 from uuid import UUID
@@ -11,20 +12,20 @@ from .fixed_variable import FixedVariable, _const_f
 from .fixed_variable_array import FixedVariableArray
-def _recursive_trace(v: FixedVariable, gathered: dict[UUID, FixedVariable]):
-    if v in gathered:
+def _recursive_gather(v: FixedVariable, gathered: dict[UUID, FixedVariable]):
+    if v.id in gathered:
         return
     assert v._from is not None
     for _v in v._from:
         if _v.id not in gathered:
-            _recursive_trace(_v, gathered)
+            _recursive_gather(_v, gathered)
     gathered[v.id] = v
 def gather_variables(inputs: Sequence[FixedVariable], outputs: Sequence[FixedVariable]):
     gathered = {v.id: v for v in inputs}
     for o in outputs:
-        _recursive_trace(o, gathered)
+        _recursive_gather(o, gathered)
     variables = list(gathered.values())
@@ -85,6 +86,19 @@ def _comb_trace(inputs: Sequence[FixedVariable], outputs: Sequence[FixedVariable
                 qint = QInterval(qint.min, qint.min, step)
                 data = qint.min / step
                 ops.append(Op(-1, -1, 5, int(data), qint, v.latency, v.cost))
+            case 'msb_mux':
+                qint = v.unscaled.qint
+                key, in0, in1 = v._from
+                opcode = 6 if in1._factor > 0 else -6
+                idk, id0, id1 = index[key.id], index[in0.id], index[in1.id]
+                f0, f1 = in0._factor, in1._factor
+                shift = int(log2(abs(f1 / f0)))
+                data = idk + (shift << 32)
+                assert idk < i and id0 < i and id1 < i
+                assert key._factor > 0, f'Cannot mux on v{key.id} with negative factor {key._factor}'
+                op = Op(id0, id1, opcode, data, qint, v.latency, v.cost)
+                ops.append(op)
             case _:
                 raise NotImplementedError(f'Operation "{v.opr}" is not supported in tracing')
     out_index = [index[v.id] for v in outputs]
@@ -101,6 +115,15 @@ def comb_trace(inputs: FixedVariableArray, outputs: FixedVariableArray) -> Solut
 def comb_trace(inputs, outputs):
     inputs, outputs = list(np.ravel(inputs)), list(np.ravel(outputs))
+    if any(not isinstance(v, FixedVariable) for v in outputs):
+        hwconf = inputs[0].hwconf
+        latency = max(v.latency for v in chain(inputs, outputs) if isinstance(v, FixedVariable))
+        outputs = list(outputs)
+        for i, v in enumerate(outputs):
+            if not isinstance(v, FixedVariable):
+                outputs[i] = FixedVariable.from_const(v, hwconf, latency, 1)
     ops, out_index = _comb_trace(inputs, outputs)
     shape = len(inputs), len(outputs)
     inp_shift = [0] * shape[0]
@@ -108,7 +131,7 @@ def comb_trace(inputs, outputs):
     out_shift = [int(log2(abs(sf))) for sf in out_sf]
     out_neg = [sf < 0 for sf in out_sf]
-    return Solution(
+    sol = Solution(
         shape,
         inp_shift,
         out_index,
@@ -118,3 +141,12 @@ def comb_trace(inputs, outputs):
         outputs[0].hwconf.carry_size,
         outputs[0].hwconf.adder_size,
     )
+    ref_count = sol.ref_count
+    for i in range(len(ops)):
+        if ref_count[i] == 0:
+            op = ops[i]
+            sol.ops[i] = Op(-1, -1, op[2], 0, QInterval(0, 0, 1), op[5], op[6])
+    return sol

da4ml-0.3.0.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,107 @@
+Metadata-Version: 2.4
+Name: da4ml
+Version: 0.3.0
+Summary: Digital Arithmetic for Machine Learning
+Author-email: Chang Sun <chsun@cern.ch>
+License: GNU Lesser General Public License v3 (LGPLv3)
+Project-URL: repository, https://github.com/calad0i/da4ml
+Keywords: CMVM,distributed arithmetic,hls4ml,MCM,subexpression elimination
+Classifier: Development Status :: 4 - Beta
+Classifier: License :: OSI Approved :: GNU Lesser General Public License v3 (LGPLv3)
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3 :: Only
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Requires-Python: >=3.10
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: llvmlite>=0.44
+Requires-Dist: numba>=0.61
+Dynamic: license-file
+# da4ml: Distributed Arithmetic for Machine Learning
+This project performs Constant Matrix-Vector Multiplication (CMVM) with Distributed Arithmetic (DA) for Machine Learning (ML) on Field Programmable Gate Arrays (FPGAs).
+CMVM optimization is done through greedy CSE of two-term subexpressions, with possible Delay Constraints (DC). The optimization is done in jitted Python (Numba), and a list of optimized operations is generated as traced Python code.
+The project generates Verilog or Vitis HLS code for the optimized CMVM operations. This project can be used in conjunction with [`hls4ml`](https://github.com/fastmachinelearning/hls4ml/) for optimizing the neural networks deployed on FPGAs. For a subset of neural networks, the full design can be generated standalone in Verilog or Vitis HLS.
+## Installation
+The project is available on PyPI and can be installed with pip:
+```bash
+pip install da4ml
+```
+Note that `numba>=0.61` is required for the project to work. The project does not work with `python<3.10`. If the project fails to compile, try upgrading `numba` and `llvmlite` to the latest versions.
+## `hls4ml`
+The major use of this project is through the `distributed_arithmetic` strategy in `hls4ml`:
+```python
+model_hls = hls4ml.converters.convert_from_keras_model(
+    model,
+    hls_config={
+        'Model': {
+            ...
+            'Strategy': 'distributed_arithmetic',
+        },
+        ...
+    },
+    ...
+)
+```
+Currently, `Dense/Conv1D/Conv2D` layers are supported for both `io_parallel` and `io_stream` dataflows. However, notice that distributed arithmetic implies `reuse_factor=1`, as the whole kernel is implemented in combinational logic.
+## Standalone usage
+### `HGQ2`
+For some models trained with `HGQ2`, `da4ml` can be used to generate the whole model in Verilog or Vitis HLS:
+```python
+from da4ml.codegen import HLSModel, VerilogModel
+from da4ml.converter.hgq2.parser import trace_model
+from da4ml.trace import comb_trace
+inp, out = trace_model(hgq2_model)
+sol = comb_trace(inp[0], out[0]) # Currently, only models with 1 input and 1 output are supported
+# Pipelined Verilog model generation
+# `latency_cutoff` is used to control auto pipelining behavior. To disable pipelining, set it to -1.
+verilog_model = VerilogModel(sol, prj_name='barbar', path='/tmp/barbar', latency_cutoff=5)
+verilog_model.compile() # write and verilator binding
+verilog_model.predict(inputs)
+vitis_hls_model = HLSModel(sol, prj_name='foo', path='/tmp/foo', flavor='vitis') # Only vitis is supported for now
+vitis_hls_model.compile() # write and hls binding
+vitis_hls_model.predict(inputs)
+```
+### Functional Definition
+For generic operations, one can define a combinational logic with the functional API:
+```python
+from da4ml.trace import FixedVariableArray, HWConfig, comb_trace
+from da4ml.trace.ops import einsum, relu, quantize, conv, pool
+# k, i, f are numpy arrays of integers: keep_negative (0/1), integer bits (excl. sign), fractional bits
+inp = FixedVariableArray.from_kif(k, i, f, HWConfig(1, -1, -1), solver_options={'hard_dc':2})
+out = inp @ kernel
+out = relu(out)
+out = einsum(equation, out, weights)
+...
+comb = comb_trace(inp, out)
+```
+`+`, `-`, `@` are supported as well as `einsum`, `relu`, `quantize` (WRAP, with TRN or RND), `conv`, `pool` (average only). For multiplications, only power-of-two multipliers are supported, otherwise use `einsum` or `@` operators.
+The `comb_trace` function returns a `Solution` object that contains a list of low-level operations used to implement the combinational logic, which in turn can be used to generate Verilog or Vitis HLS code.

da4ml 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

Potentially problematic release.

da4ml 0.2.0py3-none-any.whl → 0.3.0py3-none-any.whl