triton-windows 3.2.0.post12__cp312-cp312-win_amd64.whl → 3.3.0a0.post12__cp312-cp312-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of triton-windows has been flagged by the registry.

Files changed (68)
  1. triton/_C/libtriton.pyd +0 -0
  2. triton/__init__.py +3 -3
  3. triton/_internal_testing.py +59 -4
  4. triton/_utils.py +35 -0
  5. triton/backends/amd/compiler.py +121 -74
  6. triton/backends/amd/driver.py +77 -43
  7. triton/backends/amd/include/hip/amd_detail/amd_device_functions.h +28 -49
  8. triton/backends/amd/include/hip/amd_detail/amd_hip_atomic.h +35 -9
  9. triton/backends/amd/include/hip/amd_detail/amd_hip_bf16.h +761 -284
  10. triton/backends/amd/include/hip/amd_detail/amd_hip_cooperative_groups.h +9 -3
  11. triton/backends/amd/include/hip/amd_detail/amd_hip_fp8.h +1391 -0
  12. triton/backends/amd/include/hip/amd_detail/amd_hip_gl_interop.h +3 -3
  13. triton/backends/amd/include/hip/amd_detail/amd_warp_functions.h +44 -0
  14. triton/backends/amd/include/hip/amd_detail/amd_warp_sync_functions.h +288 -0
  15. triton/backends/amd/include/hip/amd_detail/hip_api_trace.hpp +110 -14
  16. triton/backends/amd/include/hip/amd_detail/hip_prof_str.h +504 -103
  17. triton/backends/amd/include/hip/amd_detail/hip_runtime_prof.h +2 -1
  18. triton/backends/amd/include/hip/amd_detail/host_defines.h +4 -0
  19. triton/backends/amd/include/hip/hip_ext.h +4 -2
  20. triton/backends/amd/include/hip/hip_fp8.h +33 -0
  21. triton/backends/amd/include/hip/hip_runtime_api.h +375 -33
  22. triton/backends/amd/include/hip/hip_version.h +3 -3
  23. triton/backends/amd/include/hip/hiprtc.h +25 -25
  24. triton/backends/amd/include/hsa/amd_hsa_elf.h +40 -14
  25. triton/backends/amd/include/hsa/hsa.h +11 -2
  26. triton/backends/amd/include/hsa/hsa_api_trace.h +30 -17
  27. triton/backends/amd/include/hsa/hsa_api_trace_version.h +68 -0
  28. triton/backends/amd/include/hsa/hsa_ext_amd.h +83 -27
  29. triton/backends/amd/include/hsa/hsa_ven_amd_aqlprofile.h +46 -46
  30. triton/backends/amd/include/hsa/hsa_ven_amd_pc_sampling.h +416 -0
  31. triton/backends/amd/include/roctracer/hip_ostream_ops.h +84 -4
  32. triton/backends/amd/include/roctracer/hsa_ostream_ops.h +260 -0
  33. triton/backends/amd/include/roctracer/hsa_prof_str.h +51 -19
  34. triton/backends/amd/lib/asanrtl.bc +0 -0
  35. triton/backends/compiler.py +25 -225
  36. triton/backends/driver.py +7 -2
  37. triton/backends/nvidia/bin/ptxas.exe +0 -0
  38. triton/backends/nvidia/compiler.py +135 -90
  39. triton/backends/nvidia/driver.c +0 -1
  40. triton/backends/nvidia/driver.py +135 -49
  41. triton/backends/nvidia/include/cuda.h +2162 -241
  42. triton/backends/nvidia/lib/x64/cuda.lib +0 -0
  43. triton/compiler/__init__.py +2 -2
  44. triton/compiler/code_generator.py +334 -231
  45. triton/compiler/compiler.py +77 -66
  46. triton/language/__init__.py +22 -5
  47. triton/language/core.py +448 -74
  48. triton/language/extra/cuda/_experimental_tma.py +3 -5
  49. triton/language/math.py +1 -1
  50. triton/language/random.py +2 -1
  51. triton/language/semantic.py +206 -52
  52. triton/language/standard.py +35 -18
  53. triton/runtime/_allocation.py +32 -0
  54. triton/runtime/autotuner.py +27 -32
  55. triton/runtime/build.py +1 -48
  56. triton/runtime/cache.py +6 -6
  57. triton/runtime/errors.py +10 -0
  58. triton/runtime/interpreter.py +179 -45
  59. triton/runtime/jit.py +149 -190
  60. triton/testing.py +39 -11
  61. triton/tools/compile.py +27 -20
  62. triton/tools/{compile.c → extra/cuda/compile.c} +1 -0
  63. triton/tools/mxfp.py +301 -0
  64. {triton_windows-3.2.0.post12.dist-info → triton_windows-3.3.0a0.post12.dist-info}/METADATA +5 -2
  65. {triton_windows-3.2.0.post12.dist-info → triton_windows-3.3.0a0.post12.dist-info}/RECORD +68 -59
  66. {triton_windows-3.2.0.post12.dist-info → triton_windows-3.3.0a0.post12.dist-info}/top_level.txt +2 -0
  67. /triton/tools/{compile.h → extra/cuda/compile.h} +0 -0
  68. {triton_windows-3.2.0.post12.dist-info → triton_windows-3.3.0a0.post12.dist-info}/WHEEL +0 -0
@@ -29,7 +29,7 @@ def experimental_device_tensormap_create1d(
     load_size: core.tensor,
     global_size: core.tensor,
     element_ty: core.dtype,
-    _builder: ir.builder,
+    _builder: ir.builder = None,
 ):
     load_size = core._constexpr_to_value(load_size)
     global_size = semantic.to_tensor(global_size, _builder)
@@ -58,7 +58,7 @@ def experimental_device_tensormap_create2d(
     load_size: Sequence[core.constexpr],
     global_size: Sequence[core.tensor],
     element_ty: core.dtype,
-    _builder: ir.builder,
+    _builder: ir.builder = None,
 ):
     assert len(load_size) == 2
     assert len(global_size) == 2
@@ -68,8 +68,6 @@ def experimental_device_tensormap_create2d(
     element_size = element_ty.primitive_bitwidth // 8
     element_size_t = core.full([], element_size, core.int64, _builder=_builder)
     global_stride = semantic.mul(element_size_t, global_size[-1], True, _builder)
-    # Undocumented, but global_stride seems to be divided by 16
-    global_stride = semantic.ashr(global_stride, semantic.to_tensor(4, _builder), _builder)
 
     contig_dim_size_in_bytes = element_size * load_size[-1]
     if contig_dim_size_in_bytes > 128:
@@ -104,5 +102,5 @@ def _determine_swizzle_mode_2d(contig_dim_size_in_bytes, load_size):
 
 
 @core.builtin
-def experimental_tensormap_fenceproxy_acquire(desc_ptr: core.tensor, _builder: ir.builder):
+def experimental_tensormap_fenceproxy_acquire(desc_ptr: core.tensor, _builder: ir.builder = None):
     semantic.tensormap_fenceproxy_acquire(desc_ptr, _builder)
triton/language/math.py CHANGED
@@ -173,9 +173,9 @@ def rsqrt(x, _builder=None):
     return core.tensor(_builder.create_rsqrt(x.handle), x.type)
 
 
+@core._tensor_member_fn
 @core.builtin
 @_add_math_1arg_docstr("absolute value")
-@core._tensor_member_fn
 def abs(x, _builder=None):
     x = semantic.to_tensor(x, _builder)
     dtype = x.dtype
triton/language/random.py CHANGED
@@ -45,11 +45,12 @@ def philox_impl(c0, c1, c2, c3, k0, k1, n_rounds: tl.constexpr = N_ROUNDS_DEFAUL
 @jit
 def philox(seed, c0, c1, c2, c3, n_rounds: tl.constexpr = N_ROUNDS_DEFAULT):
     seed = tl.to_tensor(seed)
+    tl.static_assert(seed.dtype.is_int())
+    seed = seed.to(tl.uint64)
     c0 = tl.to_tensor(c0)
     c1 = tl.to_tensor(c1)
     c2 = tl.to_tensor(c2)
     c3 = tl.to_tensor(c3)
-    seed = seed.to(tl.uint64)
     if tl.constexpr(c0.dtype.primitive_bitwidth) == 32:
         int_dtype = tl.uint32
         seed_hi = ((seed >> 32) & 0xffffffff).to(tl.uint32)
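
The philox change above statically asserts that the seed is an integer before widening it to tl.uint64, so a floating-point seed now fails at compile time instead of being silently converted. A minimal illustrative kernel (not part of the diff) using tl.rand, which is built on philox:

import triton
import triton.language as tl

@triton.jit
def rand_kernel(out_ptr, seed, N, BLOCK: tl.constexpr):
    # seed must be an integer scalar; philox now rejects float seeds at compile time
    offs = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK)
    r = tl.rand(seed, offs)
    tl.store(out_ptr + offs, r, mask=offs < N)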
triton/language/semantic.py CHANGED
@@ -6,7 +6,6 @@ import numbers
 
 from .._C.libtriton import ir
 from . import core as tl
-from . import math
 
 T = TypeVar('T')
 
@@ -62,7 +61,7 @@ def computation_type_impl(a_ty: tl.dtype, a_is_scalar: bool, b_ty: tl.dtype, b_i
                           div_or_mod: bool) -> tl.dtype:
     # 0) For scalars we follow semantics similar to PyTorch, namely:
     # - If the scalar is of a lower or equal kind (bool < uint < int < fp),
-    #   it doesn't participate in the pomotion
+    #   it doesn't participate in the promotion
     if a_is_scalar != b_is_scalar:
         scalar_ty, tensor_ty = (a_ty, b_ty) if a_is_scalar else (b_ty, a_ty)
         if scalar_ty.kind().value <= tensor_ty.kind().value:
@@ -88,11 +87,12 @@ def computation_type_impl(a_ty: tl.dtype, a_is_scalar: bool, b_ty: tl.dtype, b_i
         else:
             return tl.float16
     # 4) return bf16 only if both operands are of bf16
-    if a_ty.is_bf16() or b_ty.is_bf16():
+    if a_ty.is_bf16() and b_ty.is_bf16():
         if div_or_mod:
             return tl.float32
-        if a_ty.is_bf16() and b_ty.is_bf16():
+        else:
             return tl.bfloat16
+    if a_ty.is_bf16() or b_ty.is_bf16():
         return tl.float32
     # 5) return fp16 if operands are different fp8
     if a_ty.is_fp8() and b_ty.is_fp8():
@@ -186,6 +186,11 @@ def binary_op_type_checking_impl(lhs: tl.tensor | numbers.Number, rhs: tl.tensor
             or rhs_is_scalar and rhs_scalar < 0 and ret_sca_ty.is_int_unsigned()):
         raise ValueError("Cannot perform a binary operation between an unsigned tensor and a negative scalar. "
                          "Perform a explicit cast on one of them.")
+    if ret_sca_ty.is_int():
+        if lhs_is_scalar and not (ret_sca_ty.get_int_min_value() <= lhs_scalar <= ret_sca_ty.get_int_max_value()):
+            raise ValueError(f"Scalar {lhs_scalar} is out of range for type {ret_sca_ty}")
+        if rhs_is_scalar and not (ret_sca_ty.get_int_min_value() <= rhs_scalar <= ret_sca_ty.get_int_max_value()):
+            raise ValueError(f"Scalar {rhs_scalar} is out of range for type {ret_sca_ty}")
     lhs = full(
         (), lhs_scalar, dtype=ret_sca_ty, builder=builder) if lhs_is_scalar else cast(lhs, ret_sca_ty, builder)
     rhs = full(
@@ -230,7 +235,15 @@ def add(input: tl.tensor | numbers.Number, other: tl.tensor | numbers.Number, sa
     input_scalar_ty = input.type.scalar
     other_scalar_ty = other.type.scalar
     if input_scalar_ty.is_ptr():
-        return tl.tensor(builder.create_addptr(input.handle, other.handle), input.type)
+        other_handle = other.handle
+        if other.dtype.is_int_unsigned() and other.dtype.int_bitwidth < 64:
+            # addptr treats offset as signed. Zero-extend unsigned offsets to ensure they're positive
+            if other.type.is_block():
+                i64_ty = tl.block_type(tl.int64, other.type.get_block_shapes()).to_ir(builder)
+            else:
+                i64_ty = tl.int64.to_ir(builder)
+            other_handle = builder.create_int_cast(other.handle, i64_ty, False)
+        return tl.tensor(builder.create_addptr(input.handle, other_handle), input.type)
     # float + float
     elif input_scalar_ty.is_floating():
         return tl.tensor(builder.create_fadd(input.handle, other.handle), input.type)
@@ -333,10 +346,7 @@ def mod(input: tl.tensor | numbers.Number, other: tl.tensor | numbers.Number, bu
     other_scalar_ty = other.type.scalar
     # float % float
     if scalar_ty.is_floating():
-        # input - input.div(other, rounding_mode="floor") * other
-        floor = math.floor(fdiv(input, other, False, builder), _builder=builder)
-        ret = sub(input, mul(floor, other, True, builder), True, builder)
-        return ret
+        return tl.tensor(builder.create_frem(input.handle, other.handle), input.type)
     # % int
     elif scalar_ty.is_int():
         if scalar_ty.int_signedness != other_scalar_ty.int_signedness:
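
A note on the float modulo hunk above: the removed lines computed a floor-style remainder, while create_frem presumably lowers to an LLVM-style frem, which follows C fmod semantics (the result takes the sign of the dividend). Assuming that lowering, the numerical difference is easy to check in plain Python:

import math

x, y = -7.0, 3.0
print(math.fmod(x, y))            # -1.0: truncated remainder, sign follows the dividend
print(x - math.floor(x / y) * y)  #  2.0: the floor-based formulation that was removed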
@@ -762,14 +772,14 @@ def broadcast_impl_value(lhs: tl.tensor, rhs: tl.tensor, builder: ir.builder) ->
         # Add new axes to lhs
         for _ in range(len(lhs_shape), len(rhs_shape)):
             lhs = tl.tensor(builder.create_expand_dims(lhs.handle, 0),
-                            tl.block_type(lhs_ty.scalar, [1] + lhs_shape))
+                            tl.block_type(lhs_ty.scalar, [1] + lhs_shape.values))
             lhs_ty = lhs.type
             lhs_shape = lhs_ty.get_block_shapes()
     elif len(rhs_shape) < len(lhs_shape):
         # Add new axes to rhs
         for _ in range(len(rhs_shape), len(lhs_shape)):
             rhs = tl.tensor(builder.create_expand_dims(rhs.handle, 0),
-                            tl.block_type(rhs_ty.scalar, [1] + rhs_shape))
+                            tl.block_type(rhs_ty.scalar, [1] + rhs_shape.values))
             rhs_ty = rhs.type
             rhs_shape = rhs_ty.get_block_shapes()
     assert len(rhs_shape) == len(lhs_shape)
@@ -831,10 +841,6 @@ def bitcast(input: tl.tensor, dst_ty: tl.dtype, builder: ir.builder) -> tl.tenso
 def cast(input: tl.tensor, dst_ty: tl.dtype, builder: ir.builder,
          fp_downcast_rounding: Optional[str] = None) -> tl.tensor:
     src_ty = input.type
-    if isinstance(dst_ty, tl.constexpr):
-        dst_ty = dst_ty.value
-    if isinstance(fp_downcast_rounding, tl.constexpr):
-        fp_downcast_rounding = fp_downcast_rounding.value
     if src_ty.is_block():
         dst_ty = tl.block_type(dst_ty.scalar, input.type.get_block_shapes())
     if src_ty == dst_ty:
@@ -1048,7 +1054,7 @@ def _load_block_pointer(ptr, mask, other, boundary_check, padding, cache, evicti
         raise ValueError("`mask` and `other` arguments cannot be specified for loading block pointers")
 
     elt_ty = ptr.type.element_ty.element_ty
-    assert elt_ty != tl.int1, "`tl.int1` should be rewrited in `tl.make_block_ptr`"
+    assert elt_ty != tl.int1, "`tl.int1` should be rewritten in `tl.make_block_ptr`"
     if elt_ty.is_int() and padding == ir.PADDING_OPTION.PAD_NAN:
         raise ValueError("Padding option `nan` is not supported for integer block pointers")
 
@@ -1141,18 +1147,93 @@ def load(ptr: tl.tensor, mask: Optional[tl.tensor], other: Optional[tl.tensor],
     return _load_legacy(ptr, mask, other, boundary_check, padding, cache, eviction, is_volatile, builder)
 
 
-def descriptor_load(desc_ptr: tl.tensor, offsets, cache_modifier: str, eviction_policy: str, type,
+def reinterpret_tensor_descriptor(desc_ptr: tl.tensor, block_ty: tl.block_type, builder: ir.builder):
+    handle = builder.create_reinterpret_tensor_descriptor(desc_ptr.handle, block_ty.to_ir(builder))
+    return tl._experimental_tensor_descriptor_base(handle, block_ty)
+
+
+def validate_descriptor_block(shape, dtype):
+    if len(shape) != 2:
+        return
+    # Due to limitations of the shared memory encoding, the TMA bounding box has
+    # to be at least as big as the swizzle tile.
+    assert shape[0] >= 8, f"tensor descriptor block shape must have at least 8 rows, but got {shape[0]}"
+    min_cols = 32 // dtype.primitive_bitwidth * 8
+    assert shape[
+        1] >= min_cols, f"{dtype} tensor descriptor block shape must have at least {min_cols} columns, but got {shape[1]}"
+
+
+def descriptor_load(desc: tl._experimental_tensor_desciptor_base, offsets, cache_modifier: str, eviction_policy: str,
                     builder: ir.builder) -> tl.tensor:
+    assert isinstance(desc, tl._experimental_tensor_descriptor_base)
+    validate_descriptor_block(desc.block_shape, desc.dtype)
+    ndim = len(desc.block_shape)
+    assert len(offsets) == ndim, f"expected {ndim} offsets, but got {len(offsets)}"
+
     offsets = _convert_to_ir_values(builder, offsets, require_i64=False)
-    x = builder.create_descriptor_load(desc_ptr.handle, offsets, type.to_ir(builder),
-                                       _str_to_load_cache_modifier(cache_modifier),
+    x = builder.create_descriptor_load(desc.handle, offsets, _str_to_load_cache_modifier(cache_modifier),
                                        _str_to_eviction_policy(eviction_policy))
-    return tl.tensor(x, type)
+    return tl.tensor(x, desc.block_type)
+
 
+def descriptor_store(desc: tl._experimental_tensor_descriptor_base, value: tl.tensor, offsets,
+                     builder: ir.builder) -> tl.tensor:
+    assert isinstance(desc, tl._experimental_tensor_descriptor_base)
+    validate_descriptor_block(desc.block_shape, desc.dtype)
+    ndim = len(desc.block_shape)
+    assert len(offsets) == ndim, f"expected {ndim} offsets, but got {len(offsets)}"
+    assert value.shape == desc.block_shape
 
-def descriptor_store(desc_ptr: tl.tensor, value: tl.tensor, offsets, builder: ir.builder) -> tl.tensor:
     offsets = _convert_to_ir_values(builder, offsets, require_i64=False)
-    return tl.tensor(builder.create_descriptor_store(desc_ptr.handle, value.handle, offsets), tl.void)
+    return tl.tensor(builder.create_descriptor_store(desc.handle, value.handle, offsets), tl.void)
+
+
+def descriptor_gather(desc, x_offsets, y_offset, cache_modifier: str, eviction_policy: str,
+                      builder: ir.builder) -> tl.tensor:
+    assert isinstance(desc, tl._experimental_tensor_descriptor_base)
+    assert cache_modifier == "", "cache modifier is not supported yet"
+    assert eviction_policy == "", "eviction policy is not supported yet"
+
+    # Validate descriptor.
+    assert len(desc.block_shape) == 2, f"descriptor must be 2D, but got {desc.block_shape}"
+    assert desc.block_shape[0] == 1, f"descriptor block must have 1 row, but got {desc.block_shape}"
+
+    # Validate offsets.
+    assert len(x_offsets.shape) == 1, f"x offsets must be 1D, but got {x_offsets.shape}"
+
+    # Validate minimum block size.
+    assert x_offsets.shape[0] >= 8, f"descriptor gather must have at least 8 rows, but got {x_offsets.shape}"
+    dtype = desc.dtype
+    min_cols = 32 // dtype.primitive_bitwidth * 8
+    assert desc.block_shape[
+        1] >= min_cols, f"descriptor gather of {dtype} must have at least {min_cols} columns, but got {desc.block_shape[1]}"
+
+    type = tl.block_type(desc.dtype, [x_offsets.shape[0], desc.block_shape[1]])
+    y_offset = _convert_to_ir_values(builder, (y_offset, ), require_i64=False)[0]
+    x = builder.create_descriptor_gather(desc.handle, x_offsets.handle, y_offset, type.to_ir(builder))
+    return tl.tensor(x, type)
+
+
+def descriptor_scatter(desc, value: tl.tensor, x_offsets, y_offset, builder: ir.builder) -> tl.tensor:
+    assert isinstance(desc, tl._experimental_tensor_descriptor_base)
+
+    # Validate descriptor.
+    assert len(desc.block_shape) == 2, f"descriptor must be 2D, but got {desc.block_shape}"
+    assert desc.block_shape[0] == 1, f"descriptor block must have 1 row, but got {desc.block_shape}"
+
+    # Validate offsets.
+    assert len(x_offsets.shape) == 1, f"x offsets must be 1D, but got {x_offsets.shapae}"
+
+    # Validate minimum block size.
+    assert x_offsets.shape[0] >= 8, f"descriptor scatter must have at least 8 rows, but got {x_offsets.shape}"
+    dtype = desc.dtype
+    min_cols = 32 // dtype.primitive_bitwidth * 8
+    assert desc.block_shape[
+        1] >= min_cols, f"descriptor scatter of {dtype} must have at least {min_cols} columns, but got {desc.block_shape[1]}"
+
+    y_offset = _convert_to_ir_values(builder, (y_offset, ), require_i64=False)[0]
+    builder.create_descriptor_scatter(desc.handle, value.handle, x_offsets.handle, y_offset)
+    return tl.tensor(None, tl.void)
 
 
 def tensormap_create(
@@ -1206,7 +1287,7 @@ def _store_block_pointer(ptr, val, mask, boundary_check, cache, eviction, builde
     assert ptr.type.element_ty.element_ty == val.type.element_ty, f"Block element type({ptr.type.element_ty.element_ty}) and value element type({val.type.element_ty}) mismatch"
 
     elt_ty = ptr.type.element_ty.element_ty
-    assert elt_ty != tl.int1, "`tl.int1` should be rewrited in `tl.make_block_ptr`"
+    assert elt_ty != tl.int1, "`tl.int1` should be rewritten in `tl.make_block_ptr`"
 
     # Check `boundary_check` argument
     boundary_check = _canonicalize_boundary_check(boundary_check, block_shape)
@@ -1256,7 +1337,7 @@ def _store_legacy(ptr, val, mask, boundary_check, cache, eviction, builder):
         val = cast(val, elt_ty, builder)
 
     # Build IR
-    if not mask:
+    if mask is None:
         return tl.tensor(builder.create_store(ptr.handle, val.handle, cache, eviction), tl.void)
     if not mask.type.scalar.is_bool():
         raise ValueError("Mask must have boolean scalar type")
@@ -1311,7 +1392,7 @@ def atom_red_typechecking_impl(ptr: tl.tensor, val: tl.tensor, mask: tl.tensor,
     if val is not None:
         val = broadcast_impl_shape(val, ptr.type.get_block_shapes(), builder)
         val = cast(val, ptr.type.scalar.element_ty, builder)
-    if not mask:
+    if mask is None:
         mask_ir = builder.get_int1(True)
         mask_ty = tl.int1
         if ptr.type.is_block():
@@ -1470,6 +1551,7 @@ def dot(lhs: tl.tensor, rhs: tl.tensor, acc: tl.tensor, input_precision: Optiona
     assert lhs.dtype == rhs.dtype, f"Both operands must be same dtype. Got {lhs.dtype} and {rhs.dtype}"
 
     if lhs.dtype.is_fp8e4b15() or rhs.dtype.is_fp8e4b15():
+        # We upcast because there's no fp8e4b15 type in MLIR
         lhs = cast(lhs, tl.float16, builder)
         rhs = cast(rhs, tl.float16, builder)
 
@@ -1527,40 +1609,58 @@ def dot(lhs: tl.tensor, rhs: tl.tensor, acc: tl.tensor, input_precision: Optiona
         ret_ty)
 
 
-def _str_to_fp_type(float_format: Optional[str]):
-    if float_format == 'e4m3':
-        return ir.F8F6F4TY.E4M3
-    if float_format == 'e5m2':
-        return ir.F8F6F4TY.E5M2
-    if float_format == 'e2m3':
-        return ir.F8F6F4TY.E2M3
-    if float_format == 'e3m2':
-        return ir.F8F6F4TY.E3M2
-    if float_format == 'e2m1':
-        return ir.F8F6F4TY.E2M1
-    raise ValueError(f"Invalid float format: {float_format}.")
+def _str_to_fp_type(float_format: str):
+    ty_enum = getattr(ir.ScaleDotElemTypeTY, float_format.upper(), None)
+    if ty_enum is None:
+        raise ValueError(f"Invalid float format: {float_format}.")
+    return ty_enum
+
+
+def _bitcast_to_fp_type(val: tl.tensor, float_format: str, builder: ir.builder):
+    """
+    If float_format is subbyte, make sure it's packed as uint8 and return it.
+    Otherwise, return a tensor (perhaps bitcasting) of the specified float format.
+    """
+    triton_ty = {"e5m2": tl.float8e5, "e4m3": tl.float8e4nv, "bf16": tl.bfloat16, "fp16": tl.float16}.get(float_format)
+    if triton_ty is None:
+        assert float_format == "e2m1", f"Internal Error: Unexpected float format: {float_format}"
+        assert val.dtype == tl.uint8, f"e2m1 format must be packed as uint8. Got {val.dtype}"
+        return val
+    if val.dtype == triton_ty:
+        return val
+    else:
+        unsigned_ty = {"e5m2": tl.uint8, "e4m3": tl.uint8, "bf16": tl.uint16, "fp16": tl.uint16}[float_format]
+        assert val.dtype == unsigned_ty, f"Unexpected dtype for {float_format}. Got {val.dtype}"
+        return bitcast(val, triton_ty, builder)
 
 
-def dot_scaled(lhs: tl.tensor, lhs_scale: tl.tensor, lhs_format, rhs: tl.tensor, rhs_scale: Optional[tl.tensor],
-               rhs_format, acc: tl.tensor | None, out_dtype: tl.dtype, builder: ir.builder) -> tl.tensor:
+def dot_scaled(lhs: tl.tensor, lhs_scale: tl.tensor, lhs_format: str, rhs: tl.tensor, rhs_scale: Optional[tl.tensor],
+               rhs_format: str, acc: tl.tensor | None, fast_math: bool, out_dtype: tl.dtype,
+               builder: ir.builder) -> tl.tensor:
     assert lhs.type.is_block() and rhs.type.is_block()
     #TODO: validate types.
     lhs_rank = len(lhs.shape)
    rhs_rank = len(rhs.shape)
     assert lhs_rank == rhs_rank == 2 or lhs_rank == rhs_rank == 3, f"Both inputs must be either 2D or 3D; (lhs: {lhs.shape} vs rhs: {rhs.shape})"
+    lhs_format: str = lhs_format.value
+    rhs_format: str = rhs_format.value
     lhs_format_enum = _str_to_fp_type(lhs_format)
     rhs_format_enum = _str_to_fp_type(rhs_format)
-    assert lhs_format in ("e2m1", "e4m3", "e5m2"), f"NYI: lhs_format {lhs_format}"
-    assert rhs_format in ("e4m3", "e5m2"), f"NYI: rhs_format {rhs_format}"
-    rhs_scale_is_none = isinstance(rhs_scale, tl.constexpr) and rhs_scale.value is None
-    assert rhs_scale_is_none, "NYI: rhs_scale not supported"
+    allowed_formats = {"e2m1", "e4m3", "e5m2", "bf16", "fp16"}
+    assert lhs_format in allowed_formats, f"NYI: lhs_format {lhs_format}"
+    assert rhs_format in allowed_formats, f"NYI: rhs_format {rhs_format}"
+    rhs_scale_is_none = rhs_scale is None or (isinstance(rhs_scale, tl.constexpr) and rhs_scale.value is None)
+    lhs_scale_is_none = lhs_scale is None or (isinstance(lhs_scale, tl.constexpr) and lhs_scale.value is None)
+    lhs = _bitcast_to_fp_type(lhs, lhs_format, builder)
+    rhs = _bitcast_to_fp_type(rhs, rhs_format, builder)
 
     M = lhs.type.shape[-2]
     K, N = rhs.type.shape[-2:]
-    PACKED = 2 if lhs_format == "e2m1" else 1
-    assert K == PACKED * lhs.type.shape[
+    PACKED_A = 2 if lhs_format == "e2m1" else 1
+    PACKED_B = 2 if rhs_format == "e2m1" else 1
+    assert K * PACKED_B == PACKED_A * lhs.type.shape[
         -1], f"Reduction dimension should pack the same number of elements; (lhs: {lhs.shape} vs rhs: {rhs.shape})"
-    assert K >= 64, f"scaled_dot NYI for K < 64. Got {K=}"
+    #assert K * PACKED_B >= 64, f"scaled_dot NYI for K < 64. Got {K=}"
     B = lhs.type.shape[0] if lhs_rank == 3 else None
 
     ret_ty = tl.block_type(out_dtype, [B, M, N] if B else [M, N])
@@ -1571,9 +1671,10 @@ def dot_scaled(lhs: tl.tensor, lhs_scale: tl.tensor, lhs_format, rhs: tl.tensor,
         acc_handle = acc.handle
         assert acc.type == ret_ty
     rhs_scale_handle = None if rhs_scale_is_none else rhs_scale.handle
+    lhs_scale_handle = None if lhs_scale_is_none else lhs_scale.handle
     return tl.tensor(
-        builder.create_dot_scaled(lhs.handle, lhs_scale.handle, lhs_format_enum, rhs.handle, rhs_scale_handle,
-                                  rhs_format_enum, acc_handle), ret_ty)
+        builder.create_dot_scaled(lhs.handle, lhs_scale_handle, lhs_format_enum, rhs.handle, rhs_scale_handle,
+                                  rhs_format_enum, fast_math, acc_handle), ret_ty)
 
 
 # ===----------------------------------------------------------------------===//
@@ -1655,6 +1756,30 @@ def associative_scan(inputs: Sequence[tl.tensor], axis: int, region_builder_fn,
     return tuple(wrap_tensor(scan_op.get_result(i), inputs[i].type.scalar, shape) for i in range(len(inputs)))
 
 
+# ===----------------------------------------------------------------------===
+# Gather
+# ===----------------------------------------------------------------------===
+
+
+def gather(src: tl.tensor, index: tl.tensor, axis: int, builder: ir.builder) -> tl.tensor:
+    assert index.dtype.is_int(), "index must be an integer tensor"
+
+    rank = len(src.type.shape)
+    assert len(index.type.shape) == rank, "source and index tensors must have the same rank"
+
+    assert -rank <= axis < rank, f"gather axis {axis} must be < source rank ({rank})"
+    if axis < 0:
+        axis += rank
+
+    for d in range(rank):
+        if d == axis:
+            continue
+        assert index.type.shape[d] == src.type.shape[d], f"index dim {axis} must match the corresponding source dim"
+
+    gather = builder.create_gather(src.handle, index.handle, axis)
+    return wrap_tensor(gather, src.type.scalar, index.type.shape)
+
+
 
 # ===----------------------------------------------------------------------===
 # Histogram
@@ -1663,10 +1788,7 @@ def associative_scan(inputs: Sequence[tl.tensor], axis: int, region_builder_fn,
 def histogram(input: tl.tensor, num_bins: int, builder: ir.builder) -> tl.tensor:
     assert len(input.shape) == 1, "histogram only supports 1D input"
     assert input.dtype.is_int(), "histogram only supports integer input"
-    return tl.tensor(builder.create_histogram(input.handle, num_bins), tl.block_type(tl.int32, (num_bins, )))
-
-
-##
+    return tl.tensor(builder.create_histogram(input.handle, num_bins), tl.block_type(tl.int32, [num_bins]))
 
 
 def multiple_of(x: tl.tensor, values: List[int]) -> tl.tensor:
@@ -1794,3 +1916,35 @@ def advance(base: tl.tensor, offsets, builder: ir.builder) -> tl.tensor:
 
     # Advanced block pointer type is the same as before
     return tl.tensor(builder.create_advance(base.handle, offsets), base.type)
+
+
+def make_tensor_descriptor(
+    base: tl.tensor,
+    shape: List[tl.tensor],
+    strides: List[tl.tensor],
+    block_shape: List[tl.constexpr],
+    builder: ir.builder,
+) -> tl._experimental_tensor_descriptor:
+    ndim = len(shape)
+    if not (2 <= ndim <= 5):
+        raise ValueError(f"Expected 2 <= ndim <= 5 but got {ndim} dimensions")
+    if len(strides) != ndim:
+        raise ValueError(f"Expected {ndim} strides but got {len(strides)}")
+    if len(block_shape) != ndim:
+        raise ValueError(f"Expected block_shape to have {ndim} dimensions but got {len(strides)}")
+
+    strides[-1] = tl._constexpr_to_value(strides[-1])
+    if strides[-1] != 1:
+        raise ValueError(f"Tensor descriptor last dim must be 1 but got {strides[-1]}")
+
+    shape = [to_tensor(x, builder) for x in shape]
+    strides = [to_tensor(x, builder).to(tl.int64, _builder=builder) for x in strides]
+
+    # Check whether `block_shape` is static
+    block_shape = tl._unwrap_shape(block_shape)
+
+    assert isinstance(base.type, tl.pointer_type)
+    type = tl.block_type(base.type.element_ty, block_shape)
+    handle = builder.create_make_tensor_descriptor(base.handle, [s.handle for s in shape], [s.handle for s in strides],
+                                                   block_shape)
+    return tl._experimental_tensor_descriptor(handle, shape, strides, type)
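
The new semantic.make_tensor_descriptor above builds device-side tensor descriptors. A rough sketch of the intended kernel-side usage, under two assumptions that are not confirmed by this diff: that the language-level wrapper is exposed as tl._experimental_make_tensor_descriptor and that descriptor objects expose load/store methods. Such kernels also appear to need a runtime allocator installed on the host (see triton/runtime/_allocation.py at the end of this diff):

import triton
import triton.language as tl

@triton.jit
def copy_2d_kernel(in_ptr, out_ptr, M, N, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr):
    pid_m = tl.program_id(0)
    pid_n = tl.program_id(1)
    # Assumed API: the last stride must be 1 and block_shape must be constexpr.
    in_desc = tl._experimental_make_tensor_descriptor(
        in_ptr, shape=[M, N], strides=[N, 1], block_shape=[BLOCK_M, BLOCK_N])
    out_desc = tl._experimental_make_tensor_descriptor(
        out_ptr, shape=[M, N], strides=[N, 1], block_shape=[BLOCK_M, BLOCK_N])
    tile = in_desc.load([pid_m * BLOCK_M, pid_n * BLOCK_N])
    out_desc.store([pid_m * BLOCK_M, pid_n * BLOCK_N], tile)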
triton/language/standard.py CHANGED
@@ -59,14 +59,14 @@ def softmax(x, ieee_rounding=False):
 
 @core._tensor_member_fn
 @jit
-def ravel(x):
+def ravel(x, can_reorder=False):
     """
     Returns a contiguous flattened view of :code:`x`.
 
     :param x: the input tensor
     :type x: Block
     """
-    return core.reshape(x, [x.numel], can_reorder=True)
+    return core.reshape(x, [x.numel], can_reorder=can_reorder)
 
 
 @jit
@@ -259,11 +259,30 @@ def _sum_combine(a, b):
 # sum
 
 
+def _pick_sum_dtype(in_dtype: core.constexpr, dtype: core.constexpr):
+    dtype = core._unwrap_if_constexpr(dtype)
+    if dtype is not None:
+        return dtype
+
+    # For integer bitwidths less than 32, pick int32 with the same sign to
+    # avoid overflow.
+    out_dtype = None
+    if in_dtype.is_int_signed():
+        out_dtype = core.int32 if in_dtype.int_bitwidth < 32 else None
+    elif in_dtype.is_int_unsigned():
+        out_dtype = core.uint32 if in_dtype.int_bitwidth < 32 else None
+    return out_dtype
+
+
 @core._tensor_member_fn
 @jit
-@core._add_reduction_docstr("sum")
-def sum(input, axis=None, keep_dims=False):
-    input = core._promote_bfloat16_to_float32(input)
+@core._add_reduction_docstr("sum", dtype_arg="dtype")
+def sum(input, axis=None, keep_dims=False, dtype: core.constexpr = None):
+    # Pick a default dtype for the reduction if one was not specified.
+    out_dtype: core.constexpr = _pick_sum_dtype(input.dtype, dtype)
+
+    if out_dtype is not None:
+        input = input.to(out_dtype)
     return core.reduce(input, axis, _sum_combine, keep_dims=keep_dims)
 
 
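With the change above, sums over sub-32-bit integers accumulate in int32/uint32 by default, and the new dtype argument lets callers pick the accumulator explicitly. A small illustrative kernel (not part of the diff):

import triton
import triton.language as tl

@triton.jit
def byte_sum_kernel(x_ptr, out_ptr, BLOCK: tl.constexpr):
    offs = tl.arange(0, BLOCK)
    x = tl.load(x_ptr + offs)          # e.g. int8 values: now accumulated in int32 by default
    total = tl.sum(x, dtype=tl.int64)  # the new dtype argument selects the accumulator explicitly
    tl.store(out_ptr, total)
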
@@ -276,15 +295,11 @@ def _xor_combine(a, b):
 
 
 @core._tensor_member_fn
-@core.builtin
+@jit
 @core._add_reduction_docstr("xor sum")
-def xor_sum(input, axis=None, keep_dims=False, _builder=None, _generator=None):
-    scalar_ty = input.type.scalar
-    if not scalar_ty.is_int():
-        raise ValueError("xor_sum only supported for integers")
-
-    input = core._promote_bfloat16_to_float32(input, _builder=_builder)
-    return core.reduce(input, axis, _xor_combine, keep_dims=keep_dims, _builder=_builder, _generator=_generator)
+def xor_sum(input, axis=None, keep_dims=False):
+    core.static_assert(input.type.scalar.is_int(), "xor_sum only supported for integers")
+    return core.reduce(input, axis, _xor_combine, keep_dims=keep_dims)
 
 
 # cumsum
@@ -412,11 +427,13 @@ def flip(x, dim=None):
     """
     core.static_assert(_is_power_of_two(x.shape[_get_flip_dim(dim, x.shape)]))
     core.static_assert(_is_power_of_two(x.numel))
-    # # reshape the tensor to have all dimensions be 2.
-    # # TODO: We shouldn't have to change the dimensions not sorted.
+    # reshape the tensor to have all dimensions be 2.
+    # TODO: We shouldn't have to change the dimensions not sorted.
     steps: core.constexpr = _log2(x.numel)
     start: core.constexpr = _log2(x.numel) - _log2(x.shape[_get_flip_dim(dim, x.shape)])
-    y = core.reshape(x, [2] * steps)
+
+    idtype = core.get_int_dtype(bitwidth=x.dtype.primitive_bitwidth, signed=True)
+    y = core.reshape(x.to(idtype, bitcast=True), [2] * steps)
     y = core.expand_dims(y, start)
     flip = (core.arange(0, 2)[:, None] == 1 - core.arange(0, 2))
     for i in core.static_range(start, steps):
@@ -424,8 +441,8 @@
         for j in core.static_range(0, steps + 1):
             if j != i and j != i + 1:
                 flip2 = core.expand_dims(flip2, j)
-        y = sum(y * flip2, i + 1, keep_dims=True)
-    x = core.reshape(y, x.shape)
+        y = sum(y * flip2, i + 1, keep_dims=True, dtype=y.dtype)
+    x = core.reshape(y, x.shape).to(x.dtype, bitcast=True)
     return x
 
 
triton/runtime/_allocation.py ADDED
@@ -0,0 +1,32 @@
+from typing import Optional, Protocol
+
+
+class Buffer(Protocol):
+
+    def data_ptr(self) -> int:
+        ...
+
+
+class Allocator(Protocol):
+
+    def __call__(self, size: int, alignment: int, stream: Optional[int]) -> Buffer:
+        ...
+
+
+class NullAllocator:
+
+    def __call__(self, size: int, alignment: int, stream: Optional[int]) -> Buffer:
+        raise RuntimeError("Kernel requires a runtime memory allocation, but no allocator was set. " +
+                           "Use triton.set_allocator to specify an allocator.")
+
+
+_allocator: Allocator = NullAllocator()
+
+
+def set_allocator(allocator: Allocator):
+    """
+    The allocator function is called during kernel launch for kernels that
+    require additional global memory workspace.
+    """
+    global _allocator
+    _allocator = allocator
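
The new runtime allocator hook above is invoked at launch time for kernels that need extra global-memory workspace. A sketch of installing a PyTorch-backed allocator; the torch-based helper is illustrative, and it assumes set_allocator is re-exported at the top level as triton.set_allocator (triton/__init__.py also changes in this release):

import torch
import triton


def torch_alloc(size: int, alignment: int, stream):
    # Return any object with a data_ptr() method, per the Buffer protocol above.
    # Fresh CUDA allocations are typically sufficiently aligned; a stricter
    # allocator could over-allocate and round the pointer up instead.
    return torch.empty(size, dtype=torch.int8, device="cuda")


triton.set_allocator(torch_alloc)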