causal-conv1d 1.3.0.post1.tar.gz → 1.5.0.post5.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: causal_conv1d
-Version: 1.3.0.post1
+Version: 1.5.0.post5
 Summary: Causal depthwise conv1d in CUDA, with a PyTorch interface
 Home-page: https://github.com/Dao-AILab/causal-conv1d
 Author: Tri Dao
@@ -8,7 +8,7 @@ Author-email: tri@tridao.me
 Classifier: Programming Language :: Python :: 3
 Classifier: License :: OSI Approved :: BSD License
 Classifier: Operating System :: Unix
-Requires-Python: >=3.7
+Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
 License-File: AUTHORS
@@ -21,11 +21,11 @@ Features:
 
 ## How to use
 
-```
+```python
 from causal_conv1d import causal_conv1d_fn
 ```
 
-```
+```python
 def causal_conv1d_fn(x, weight, bias=None, activation=None):
     """
     x: (batch, dim, seqlen)
@@ -38,7 +38,7 @@ def causal_conv1d_fn(x, weight, bias=None, activation=None):
 ```
 
 Equivalent to:
-```
+```python
 import torch.nn.functional as F
 
 F.conv1d(x, weight.unsqueeze(1), bias, padding=width - 1, groups=dim)[..., :seqlen]
@@ -6,11 +6,11 @@ Features:
 
 ## How to use
 
-```
+```python
 from causal_conv1d import causal_conv1d_fn
 ```
 
-```
+```python
 def causal_conv1d_fn(x, weight, bias=None, activation=None):
     """
     x: (batch, dim, seqlen)
@@ -23,7 +23,7 @@ def causal_conv1d_fn(x, weight, bias=None, activation=None):
 ```
 
 Equivalent to:
-```
+```python
 import torch.nn.functional as F
 
 F.conv1d(x, weight.unsqueeze(1), bias, padding=width - 1, groups=dim)[..., :seqlen]
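The README's "Equivalent to" claim is easy to sanity-check. A minimal sketch, assuming a CUDA build of the package is installed (shapes below are illustrative, not prescribed by the package):

```python
import torch
import torch.nn.functional as F
from causal_conv1d import causal_conv1d_fn

batch, dim, seqlen, width = 2, 64, 128, 4
x = torch.randn(batch, dim, seqlen, device="cuda")
weight = torch.randn(dim, width, device="cuda")
bias = torch.randn(dim, device="cuda")

out = causal_conv1d_fn(x, weight, bias, activation=None)
# Pure-PyTorch reference: depthwise conv with left padding, trimmed back to seqlen
ref = F.conv1d(x, weight.unsqueeze(1), bias, padding=width - 1, groups=dim)[..., :seqlen]
assert torch.allclose(out, ref, atol=1e-3)
```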
@@ -1,3 +1,3 @@
-__version__ = "1.3.0.post1"
+__version__ = "1.5.0.post5"
 
 from causal_conv1d.causal_conv1d_interface import causal_conv1d_fn, causal_conv1d_update
@@ -172,42 +172,72 @@ def causal_conv1d_ref(
     return out if not return_final_states else (out, final_states_out)
 
 
-def causal_conv1d_update(x, conv_state, weight, bias=None, activation=None):
+def causal_conv1d_update(x, conv_state, weight, bias=None, activation=None, cache_seqlens=None, conv_state_indices=None):
     """
-    x: (batch, dim)
-    conv_state: (batch, dim, width)
+    x: (batch, dim) or (batch, dim, seqlen)
+    conv_state: (batch, dim, state_len), where state_len >= width - 1
     weight: (dim, width)
     bias: (dim,)
-
-    out: (batch, dim)
+    cache_seqlens: (batch,), dtype int32.
+        If not None, the conv_state is treated as a circular buffer.
+        The conv_state will be updated by copying x to the conv_state starting at the index
+        @cache_seqlens % state_len.
+    conv_state_indices: (batch,), dtype int32
+        If not None, the conv_state is a larger tensor along the batch dim,
+        and we are selecting the batch coords specified by conv_state_indices.
+        Useful for a continuous batching scenario.
+
+    out: (batch, dim) or (batch, dim, seqlen)
     """
     if activation not in [None, "silu", "swish"]:
         raise NotImplementedError("activation must be None, silu, or swish")
     activation = activation in ["silu", "swish"]
-    return causal_conv1d_cuda.causal_conv1d_update(
-        x, conv_state, weight, bias, activation
+    unsqueeze = x.dim() == 2
+    if unsqueeze:
+        x = x.unsqueeze(-1)
+    out = causal_conv1d_cuda.causal_conv1d_update(
+        x, conv_state, weight, bias, activation, cache_seqlens, conv_state_indices
     )
+    if unsqueeze:
+        out = out.squeeze(-1)
+    return out
 
 
-def causal_conv1d_update_ref(x, conv_state, weight, bias=None, activation=None):
+def causal_conv1d_update_ref(x, conv_state, weight, bias=None, activation=None, cache_seqlens=None):
     """
-    x: (batch, dim)
-    conv_state: (batch, dim, width)
+    x: (batch, dim) or (batch, dim, seqlen)
+    conv_state: (batch, dim, state_len), where state_len >= width - 1
     weight: (dim, width)
     bias: (dim,)
+    cache_seqlens: (batch,), dtype int32.
+        If not None, the conv_state is treated as a circular buffer.
+        The conv_state will be updated by copying x to the conv_state starting at the index
+        @cache_seqlens % state_len before performing the convolution.
 
-    out: (batch, dim)
+    out: (batch, dim) or (batch, dim, seqlen)
     """
     if activation not in [None, "silu", "swish"]:
         raise NotImplementedError("activation must be None, silu, or swish")
     dtype_in = x.dtype
-    batch, dim = x.shape
+    unsqueeze = x.dim() == 2
+    if unsqueeze:
+        x = x.unsqueeze(-1)
+    batch, dim, seqlen = x.shape
     width = weight.shape[1]
-    assert conv_state.shape == (batch, dim, width)
+    state_len = conv_state.shape[-1]
+    assert conv_state.shape == (batch, dim, state_len)
     assert weight.shape == (dim, width)
-    conv_state.copy_(torch.roll(conv_state, shifts=-1, dims=-1))  # Update state (B D W)
-    conv_state[:, :, -1] = x
-    out = torch.sum(conv_state * weight, dim=-1)  # (B D)
-    if bias is not None:
-        out += bias
+    if cache_seqlens is None:
+        x_new = torch.cat([conv_state, x], dim=-1).to(weight.dtype)  # (batch, dim, state_len + seqlen)
+        conv_state.copy_(x_new[:, :, -state_len:])
+    else:
+        width_idx = torch.arange(-(width - 1), 0, dtype=torch.long, device=x.device).unsqueeze(0) + cache_seqlens.unsqueeze(1)
+        width_idx = torch.remainder(width_idx, state_len).unsqueeze(1).expand(-1, dim, -1)
+        x_new = torch.cat([conv_state.gather(2, width_idx), x], dim=-1).to(weight.dtype)
+        copy_idx = torch.arange(seqlen, dtype=torch.long, device=x.device).unsqueeze(0) + cache_seqlens.unsqueeze(1)
+        copy_idx = torch.remainder(copy_idx, state_len).unsqueeze(1).expand(-1, dim, -1)
+        conv_state.scatter_(2, copy_idx, x)
+    out = F.conv1d(x_new, weight.unsqueeze(1), bias, padding=0, groups=dim)[:, :, -seqlen:]
+    if unsqueeze:
+        out = out.squeeze(-1)
     return (out if activation is None else F.silu(out)).to(dtype=dtype_in)
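The new `cache_seqlens` path turns `conv_state` into a ring buffer, which avoids shifting the state on every decoding step. A minimal sketch of single-token decoding against the pure-PyTorch `causal_conv1d_update_ref` above (shapes illustrative; the CUDA `causal_conv1d_update` takes the same arguments plus `conv_state_indices`):

```python
import torch
from causal_conv1d.causal_conv1d_interface import causal_conv1d_update_ref

batch, dim, width, state_len = 2, 16, 4, 8  # state_len >= width - 1
weight = torch.randn(dim, width)
bias = torch.randn(dim)
conv_state = torch.zeros(batch, dim, state_len)  # circular buffer, updated in place
cache_seqlens = torch.zeros(batch, dtype=torch.int32)

for _ in range(10):  # autoregressive decoding, one token per step
    x = torch.randn(batch, dim)  # 2D input -> 2D output
    out = causal_conv1d_update_ref(
        x, conv_state, weight, bias, activation="silu", cache_seqlens=cache_seqlens
    )
    cache_seqlens += 1  # advance the write position in the ring buffer
```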
@@ -0,0 +1,86 @@
+import torch
+from torch import Tensor
+
+import triton
+import triton.language as tl
+
+
+@triton.jit
+def _causal_conv1d_varlen_states(
+    X,
+    CU_SEQLENS,
+    STATES,
+    state_len,
+    dim,
+    stride_x_seqlen, stride_x_dim,
+    stride_states_batch, stride_states_seqlen, stride_states_dim,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr
+):
+    batch_idx = tl.program_id(2)
+    STATES += batch_idx * stride_states_batch
+    end_idx = tl.load(CU_SEQLENS + batch_idx + 1)
+    start_idx = tl.maximum(tl.load(CU_SEQLENS + batch_idx), end_idx - state_len)
+    rows = end_idx - (tl.program_id(1) + 1) * BLOCK_M + tl.arange(0, BLOCK_M)
+    cols = tl.program_id(0) * BLOCK_N + tl.arange(0, BLOCK_N)
+    x = tl.load(X + rows[:, None] * stride_x_seqlen + cols[None, :] * stride_x_dim,
+                mask=(rows[:, None] >= start_idx) & (cols[None, :] < dim),
+                other=0)
+    rows_states = state_len - (tl.program_id(1) + 1) * BLOCK_M + tl.arange(0, BLOCK_M)
+    tl.store(STATES + rows_states[:, None] * stride_states_seqlen + cols[None, :] * stride_states_dim,
+             x,
+             mask=(rows_states[:, None] >= 0) & (cols[None, :] < dim))
+
+
+def causal_conv1d_varlen_states(x: Tensor, cu_seqlens: Tensor, state_len: int) -> Tensor:
+    """
+    Forward pass only, does not support backward pass.
+    Parameters:
+        x: (total_tokens, dim)
+        cu_seqlens: (batch + 1), must already be sorted. The cumulative sum of the sequence lengths, starting from 0.
+        state_len: int. For each cu_seqlens, how many elements from x should be copied to the state.
+            If some of those elements belong to a different sequence, the value of the states will be zero.
+    Return:
+        states: (batch, dim, state_len)
+    """
+    _, dim = x.shape
+    batch = cu_seqlens.shape[0] - 1
+    cu_seqlens = cu_seqlens.contiguous()
+    states = torch.empty(batch, state_len, dim, dtype=x.dtype, device=x.device).transpose(1, 2)
+    BLOCK_M = min(triton.next_power_of_2(state_len), 16)
+    BLOCK_N = min(triton.next_power_of_2(dim), 256)
+    grid = (triton.cdiv(dim, BLOCK_N), triton.cdiv(state_len, BLOCK_M), batch)
+    with torch.cuda.device(x.device.index):
+        _causal_conv1d_varlen_states[grid](
+            x,
+            cu_seqlens,
+            states,
+            state_len,
+            dim,
+            x.stride(0), x.stride(1),
+            states.stride(0), states.stride(2), states.stride(1),
+            BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N
+        )
+    return states
+
+
+def causal_conv1d_varlen_states_ref(x: Tensor, cu_seqlens: Tensor, state_len: int) -> Tensor:
+    """
+    Forward pass only, does not support backward pass.
+    Parameters:
+        x: (total_tokens, dim)
+        cu_seqlens: (batch + 1), must already be sorted. The cumulative sum of the sequence lengths, starting from 0.
+        state_len: int. For each cu_seqlens, how many elements from x should be copied to the state.
+            If some of those elements belong to a different sequence, the value of the states will be zero.
+    Return:
+        states: (batch, dim, state_len)
+    """
+    _, dim = x.shape
+    batch = cu_seqlens.shape[0] - 1
+    cu_seqlens = cu_seqlens.contiguous()
+    states = torch.zeros(batch, state_len, dim, dtype=x.dtype, device=x.device).transpose(1, 2)
+    for i in range(batch):
+        end_idx = cu_seqlens[i + 1]
+        start_idx = torch.maximum(cu_seqlens[i], end_idx - state_len)
+        states[i, :, -(end_idx - start_idx):] = x[start_idx:end_idx].T
+    return states
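This new module packs the trailing tokens of each sequence in a varlen (packed) batch into per-sequence conv states. A small usage sketch against the reference implementation above (the Triton kernel behaves the same on GPU; the concrete sizes are illustrative):

```python
import torch
from causal_conv1d.causal_conv1d_varlen import causal_conv1d_varlen_states_ref

dim, state_len = 8, 4
cu_seqlens = torch.tensor([0, 5, 7, 14])  # three sequences of lengths 5, 2, 7
x = torch.randn(14, dim)  # (total_tokens, dim), sequences packed back to back

states = causal_conv1d_varlen_states_ref(x, cu_seqlens, state_len)
print(states.shape)  # torch.Size([3, 8, 4])
# The length-2 sequence fills only the last 2 slots; the rest stay zero
assert torch.equal(states[1, :, :2], torch.zeros(dim, 2))
```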
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: causal-conv1d
-Version: 1.3.0.post1
+Version: 1.5.0.post5
 Summary: Causal depthwise conv1d in CUDA, with a PyTorch interface
 Home-page: https://github.com/Dao-AILab/causal-conv1d
 Author: Tri Dao
@@ -8,7 +8,7 @@ Author-email: tri@tridao.me
 Classifier: Programming Language :: Python :: 3
 Classifier: License :: OSI Approved :: BSD License
 Classifier: Operating System :: Unix
-Requires-Python: >=3.7
+Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
 License-File: AUTHORS
@@ -21,11 +21,11 @@ Features:
 
 ## How to use
 
-```
+```python
 from causal_conv1d import causal_conv1d_fn
 ```
 
-```
+```python
 def causal_conv1d_fn(x, weight, bias=None, activation=None):
     """
     x: (batch, dim, seqlen)
@@ -38,7 +38,7 @@ def causal_conv1d_fn(x, weight, bias=None, activation=None):
 ```
 
 Equivalent to:
-```
+```python
 import torch.nn.functional as F
 
 F.conv1d(x, weight.unsqueeze(1), bias, padding=width - 1, groups=dim)[..., :seqlen]
@@ -4,6 +4,7 @@ README.md
 setup.py
 causal_conv1d/__init__.py
 causal_conv1d/causal_conv1d_interface.py
+causal_conv1d/causal_conv1d_varlen.py
 causal_conv1d.egg-info/PKG-INFO
 causal_conv1d.egg-info/SOURCES.txt
 causal_conv1d.egg-info/dependency_links.txt
@@ -202,7 +202,6 @@ if not SKIP_CUDA_BUILD:
                     f"--offload-arch={os.getenv('HIP_ARCHITECTURES', 'native')}",
                     "-U__CUDA_NO_HALF_OPERATORS__",
                     "-U__CUDA_NO_HALF_CONVERSIONS__",
-                    "-DCK_FMHA_FWD_FAST_EXP2=1",
                     "-fgpu-flush-denormals-to-zero",
                 ]
                 + cc_flag,
@@ -268,10 +267,10 @@ def get_wheel_url():
     # We're using the CUDA version used to build torch, not the one currently installed
     # _, cuda_version_raw = get_cuda_bare_metal_version(CUDA_HOME)
     torch_cuda_version = parse(torch.version.cuda)
-    # For CUDA 11, we only compile for CUDA 11.8, and for CUDA 12 we only compile for CUDA 12.2
+    # For CUDA 11, we only compile for CUDA 11.8, and for CUDA 12 we only compile for CUDA 12.4
     # to save CI time. Minor versions should be compatible.
-    torch_cuda_version = parse("11.8") if torch_cuda_version.major == 11 else parse("12.2")
-    cuda_version = f"{torch_cuda_version.major}{torch_cuda_version.minor}"
+    torch_cuda_version = parse("11.8") if torch_cuda_version.major == 11 else parse("12.4")
+    cuda_version = f"{torch_cuda_version.major}"
 
     gpu_compute_version = hip_version if HIP_BUILD else cuda_version
     cuda_or_hip = "hip" if HIP_BUILD else "cu"
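The net effect of this hunk is that prebuilt wheels are now keyed by CUDA major version only, rather than major plus minor. A quick illustration, assuming a hypothetical `torch.version.cuda` of "12.1":

```python
from packaging.version import parse

torch_cuda_version = parse("12.1")  # hypothetical torch.version.cuda
torch_cuda_version = parse("11.8") if torch_cuda_version.major == 11 else parse("12.4")
cuda_version = f"{torch_cuda_version.major}"
print(cuda_version)  # "12" (the old f"{major}{minor}" after pinning to 12.2 gave "122")
```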
@@ -359,7 +358,7 @@ setup(
     else {
         "bdist_wheel": CachedWheelsCommand,
     },
-    python_requires=">=3.7",
+    python_requires=">=3.9",
    install_requires=[
        "torch",
        "packaging",