triton-windows 3.3.1.post21__cp311-cp311-win_amd64.whl → 3.4.0.post21__cp311-cp311-win_amd64.whl
This diff covers the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
Potentially problematic release: this version of triton-windows has been flagged as potentially problematic.
- triton/_C/libtriton.pyd +0 -0
- triton/__init__.py +4 -1
- triton/_filecheck.py +87 -0
- triton/_internal_testing.py +26 -15
- triton/_utils.py +110 -21
- triton/backends/__init__.py +20 -23
- triton/backends/amd/__init__.py +0 -0
- triton/backends/amd/compiler.py +112 -78
- triton/backends/amd/driver.c +5 -2
- triton/backends/amd/driver.py +143 -46
- triton/backends/compiler.py +7 -21
- triton/backends/nvidia/bin/ptxas.exe +0 -0
- triton/backends/nvidia/compiler.py +94 -94
- triton/backends/nvidia/driver.c +90 -98
- triton/backends/nvidia/driver.py +296 -125
- triton/compiler/code_generator.py +212 -111
- triton/compiler/compiler.py +110 -25
- triton/experimental/__init__.py +0 -0
- triton/experimental/gluon/__init__.py +4 -0
- triton/experimental/gluon/_compiler.py +0 -0
- triton/experimental/gluon/_runtime.py +99 -0
- triton/experimental/gluon/language/__init__.py +18 -0
- triton/experimental/gluon/language/_core.py +312 -0
- triton/experimental/gluon/language/_layouts.py +230 -0
- triton/experimental/gluon/language/_math.py +12 -0
- triton/experimental/gluon/language/_semantic.py +287 -0
- triton/experimental/gluon/language/_standard.py +47 -0
- triton/experimental/gluon/language/nvidia/__init__.py +4 -0
- triton/experimental/gluon/language/nvidia/blackwell/__init__.py +202 -0
- triton/experimental/gluon/language/nvidia/blackwell/tma.py +32 -0
- triton/experimental/gluon/language/nvidia/hopper/__init__.py +11 -0
- triton/experimental/gluon/language/nvidia/hopper/mbarrier.py +51 -0
- triton/experimental/gluon/language/nvidia/hopper/tma.py +96 -0
- triton/experimental/gluon/nvidia/__init__.py +4 -0
- triton/experimental/gluon/nvidia/blackwell.py +3 -0
- triton/experimental/gluon/nvidia/hopper.py +40 -0
- triton/knobs.py +481 -0
- triton/language/__init__.py +39 -14
- triton/language/core.py +794 -537
- triton/language/extra/cuda/__init__.py +10 -7
- triton/language/extra/cuda/gdc.py +42 -0
- triton/language/extra/cuda/libdevice.py +394 -394
- triton/language/extra/cuda/utils.py +21 -21
- triton/language/extra/hip/libdevice.py +113 -104
- triton/language/math.py +65 -66
- triton/language/random.py +12 -2
- triton/language/semantic.py +1706 -1770
- triton/language/standard.py +116 -51
- triton/runtime/autotuner.py +117 -59
- triton/runtime/build.py +73 -9
- triton/runtime/cache.py +18 -47
- triton/runtime/driver.py +32 -29
- triton/runtime/interpreter.py +72 -35
- triton/runtime/jit.py +146 -110
- triton/testing.py +16 -12
- triton/tools/disasm.py +3 -4
- triton/tools/tensor_descriptor.py +36 -0
- triton/windows_utils.py +47 -83
- {triton_windows-3.3.1.post21.dist-info → triton_windows-3.4.0.post21.dist-info}/METADATA +7 -2
- {triton_windows-3.3.1.post21.dist-info → triton_windows-3.4.0.post21.dist-info}/RECORD +64 -41
- triton_windows-3.4.0.post21.dist-info/entry_points.txt +3 -0
- triton_windows-3.4.0.post21.dist-info/licenses/LICENSE +23 -0
- triton_windows-3.4.0.post21.dist-info/top_level.txt +1 -0
- triton/language/_utils.py +0 -21
- triton/language/extra/cuda/_experimental_tma.py +0 -106
- triton/tools/experimental_descriptor.py +0 -32
- triton_windows-3.3.1.post21.dist-info/top_level.txt +0 -14
- {triton_windows-3.3.1.post21.dist-info → triton_windows-3.4.0.post21.dist-info}/WHEEL +0 -0
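The removals of triton/tools/experimental_descriptor.py and triton/language/extra/cuda/_experimental_tma.py, together with the new triton/tools/tensor_descriptor.py, track upstream Triton 3.4 promoting the TMA tensor-descriptor API out of its experimental namespace (see the __init__.py diff below). A minimal sketch of a kernel written against the renamed API follows; only the exported names (make_tensor_descriptor, load_tensor_descriptor, store_tensor_descriptor) are confirmed by this diff, while the signatures and the host-side allocator requirement are assumptions based on upstream Triton 3.4:

import triton
import triton.language as tl

@triton.jit
def copy_tile_kernel(in_ptr, out_ptr, M, N,
                     BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr):
    # Triton 3.3.x spelled these tl._experimental_make_tensor_descriptor and
    # tl._experimental_descriptor_load / _store; 3.4.0 drops the prefix.
    in_desc = tl.make_tensor_descriptor(in_ptr, shape=[M, N], strides=[N, 1],
                                        block_shape=[BLOCK_M, BLOCK_N])
    out_desc = tl.make_tensor_descriptor(out_ptr, shape=[M, N], strides=[N, 1],
                                         block_shape=[BLOCK_M, BLOCK_N])
    pid_m, pid_n = tl.program_id(0), tl.program_id(1)
    # Load one block at a (row, col) offset, then store it back out.
    tile = tl.load_tensor_descriptor(in_desc, [pid_m * BLOCK_M, pid_n * BLOCK_N])
    tl.store_tensor_descriptor(out_desc, [pid_m * BLOCK_M, pid_n * BLOCK_N], tile)

Note that on NVIDIA backends, device-side descriptor creation is expected to require a host-side scratch allocator registered via triton.set_allocator(...) before launch (again an assumption from upstream 3.4, not something this diff shows).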
triton/language/__init__.py
CHANGED
@@ -6,6 +6,7 @@ from . import extra
 from .standard import (
     argmax,
     argmin,
+    bitonic_merge,
     cdiv,
     cumprod,
     cumsum,
@@ -14,11 +15,13 @@ from .standard import (
     max,
     min,
     ravel,
+    reduce_or,
     sigmoid,
     softmax,
     sort,
     sum,
     swizzle2d,
+    topk,
     xor_sum,
     zeros,
     zeros_like,
@@ -26,16 +29,17 @@ from .standard import (
 from .core import (
     PropagateNan,
     TRITON_MAX_TENSOR_NUMEL,
-    _experimental_descriptor_load,
-    _experimental_descriptor_store,
-    _experimental_make_tensor_descriptor,
-    _experimental_reinterpret_tensor_descriptor,
-    _experimental_tensor_descriptor,
+    load_tensor_descriptor,
+    store_tensor_descriptor,
+    make_tensor_descriptor,
+    tensor_descriptor,
+    tensor_descriptor_type,
     add,
     advance,
     arange,
     associative_scan,
     assume,
+    async_task,
     atomic_add,
     atomic_and,
     atomic_cas,
@@ -53,6 +57,7 @@ from .core import (
     clamp,
     const,
     constexpr,
+    constexpr_function,
     debug_barrier,
     device_assert,
     device_print,
@@ -89,7 +94,6 @@ from .core import (
     permute,
     pi32_t,
     pointer_type,
-    nv_tma_desc_type,
     program_id,
     range,
     reduce,
@@ -130,11 +134,10 @@ from .random import (
 __all__ = [
     "PropagateNan",
     "TRITON_MAX_TENSOR_NUMEL",
-    "_experimental_descriptor_load",
-    "_experimental_descriptor_store",
-    "_experimental_make_tensor_descriptor",
-    "_experimental_reinterpret_tensor_descriptor",
-    "_experimental_tensor_descriptor",
+    "load_tensor_descriptor",
+    "store_tensor_descriptor",
+    "make_tensor_descriptor",
+    "tensor_descriptor",
     "abs",
     "add",
     "advance",
@@ -143,6 +146,7 @@ __all__ = [
     "argmin",
     "associative_scan",
     "assume",
+    "async_task",
     "atomic_add",
     "atomic_and",
     "atomic_cas",
@@ -152,6 +156,7 @@ __all__ = [
     "atomic_xchg",
     "atomic_xor",
     "bfloat16",
+    "bitonic_merge",
     "block_type",
     "broadcast",
     "broadcast_to",
@@ -162,6 +167,7 @@ __all__ = [
     "clamp",
     "const",
     "constexpr",
+    "constexpr_function",
     "cos",
     "cumprod",
     "cumsum",
@@ -219,7 +225,6 @@ __all__ = [
     "philox_impl",
     "pi32_t",
     "pointer_type",
-    "nv_tma_desc_type",
     "program_id",
     "rand",
     "rand4x",
@@ -230,6 +235,7 @@ __all__ = [
     "range",
     "ravel",
     "reduce",
+    "reduce_or",
     "reshape",
     "rsqrt",
     "slice",
@@ -247,6 +253,7 @@ __all__ = [
     "sum",
     "swizzle2d",
     "tensor",
+    "topk",
     "trans",
     "tuple",
     "uint16",
@@ -280,8 +287,26 @@ def str_to_ty(name):
         ty = str_to_ty(name)
         return pointer_type(element_ty=ty, const=const)

-    if name == "nvTmaDesc":
-        return nv_tma_desc_type()
+    if name.startswith("tensordesc"):
+        inner = name.split("<")[1].rstrip(">")
+        dtype, rest = inner.split("[", maxsplit=2)
+        block_shape, rest = rest.split("]", maxsplit=2)
+        block_shape = [int(s.strip()) for s in block_shape.rstrip("]").split(",")]
+        layout = rest.lstrip(",")
+        is_gluon = len(layout)
+        dtype = str_to_ty(dtype)
+        ndim = len(block_shape)
+        shape_type = tuple_type([int32] * ndim)
+        # FIXME: Last dim stride should be constexpr(1)
+        stride_type = tuple_type(([int64] * ndim))
+        block = block_type(dtype, block_shape)
+        if is_gluon:
+            from triton.experimental.gluon.language._layouts import NVMMASharedLayout
+            from triton.experimental.gluon.language.nvidia.hopper.tma import tensor_descriptor_type as gluon_tensor_descriptor_type
+            layout = eval(layout, dict(NVMMASharedLayout=NVMMASharedLayout))
+            assert isinstance(layout, NVMMASharedLayout)
+            return gluon_tensor_descriptor_type(block, shape_type, stride_type, layout)
+        return tensor_descriptor_type(block, shape_type, stride_type)

     if name == "constexpr":
         return constexpr
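
For reference, a hedged walkthrough of what the new tensordesc branch above computes for a representative kernel-signature string (the input is illustrative, not taken from the diff):

# Mirrors the parsing steps in the str_to_ty hunk above for a
# hypothetical input name = "tensordesc<fp16[64, 64]>".
name = "tensordesc<fp16[64, 64]>"
inner = name.split("<")[1].rstrip(">")           # "fp16[64, 64]"
dtype, rest = inner.split("[", maxsplit=2)       # "fp16", "64, 64]"
block_shape, rest = rest.split("]", maxsplit=2)  # "64, 64", ""
block_shape = [int(s.strip()) for s in block_shape.rstrip("]").split(",")]  # [64, 64]
layout = rest.lstrip(",")                        # "" -> is_gluon is falsy
# Result: tensor_descriptor_type(block_type(fp16, [64, 64]),
#     tuple_type([int32, int32]), tuple_type([int64, int64])).
# A trailing ", NVMMASharedLayout(...)" after the shape would instead take
# the Gluon path and produce the Gluon tensor_descriptor_type with that layout.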
|