PyPI - warp-lang - Versions diffs - 1.8.0__py3-none-win_amd64.whl → 1.9.0__py3-none-win_amd64.whl - Mend

warp-lang 1.8.0__py3-none-win_amd64.whl → 1.9.0__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of warp-lang might be problematic. More details are available from the link on the original diff page.

Files changed (153)

warp/__init__.py +282 -103
warp/__init__.pyi +482 -110
warp/bin/warp-clang.dll +0 -0
warp/bin/warp.dll +0 -0
warp/build.py +93 -30
warp/build_dll.py +48 -63
warp/builtins.py +955 -137
warp/codegen.py +327 -209
warp/config.py +1 -1
warp/context.py +1363 -800
warp/examples/core/example_marching_cubes.py +1 -0
warp/examples/core/example_render_opengl.py +100 -3
warp/examples/fem/example_apic_fluid.py +98 -52
warp/examples/fem/example_convection_diffusion_dg.py +25 -4
warp/examples/fem/example_diffusion_mgpu.py +8 -3
warp/examples/fem/utils.py +68 -22
warp/examples/interop/example_jax_callable.py +34 -4
warp/examples/interop/example_jax_kernel.py +27 -1
warp/fabric.py +1 -1
warp/fem/cache.py +27 -19
warp/fem/domain.py +2 -2
warp/fem/field/nodal_field.py +2 -2
warp/fem/field/virtual.py +266 -166
warp/fem/geometry/geometry.py +5 -5
warp/fem/integrate.py +200 -91
warp/fem/space/restriction.py +4 -0
warp/fem/space/shape/tet_shape_function.py +3 -10
warp/jax_experimental/custom_call.py +1 -1
warp/jax_experimental/ffi.py +203 -54
warp/marching_cubes.py +708 -0
warp/native/array.h +103 -8
warp/native/builtin.h +90 -9
warp/native/bvh.cpp +64 -28
warp/native/bvh.cu +58 -58
warp/native/bvh.h +2 -2
warp/native/clang/clang.cpp +7 -7
warp/native/coloring.cpp +13 -3
warp/native/crt.cpp +2 -2
warp/native/crt.h +3 -5
warp/native/cuda_util.cpp +42 -11
warp/native/cuda_util.h +10 -4
warp/native/exports.h +1842 -1908
warp/native/fabric.h +2 -1
warp/native/hashgrid.cpp +37 -37
warp/native/hashgrid.cu +2 -2
warp/native/initializer_array.h +1 -1
warp/native/intersect.h +4 -4
warp/native/mat.h +1913 -119
warp/native/mathdx.cpp +43 -43
warp/native/mesh.cpp +24 -24
warp/native/mesh.cu +26 -26
warp/native/mesh.h +5 -3
warp/native/nanovdb/GridHandle.h +179 -12
warp/native/nanovdb/HostBuffer.h +8 -7
warp/native/nanovdb/NanoVDB.h +517 -895
warp/native/nanovdb/NodeManager.h +323 -0
warp/native/nanovdb/PNanoVDB.h +2 -2
warp/native/quat.h +337 -16
warp/native/rand.h +7 -7
warp/native/range.h +7 -1
warp/native/reduce.cpp +10 -10
warp/native/reduce.cu +13 -14
warp/native/runlength_encode.cpp +2 -2
warp/native/runlength_encode.cu +5 -5
warp/native/scan.cpp +3 -3
warp/native/scan.cu +4 -4
warp/native/sort.cpp +10 -10
warp/native/sort.cu +22 -22
warp/native/sparse.cpp +8 -8
warp/native/sparse.cu +14 -14
warp/native/spatial.h +366 -17
warp/native/svd.h +23 -8
warp/native/temp_buffer.h +2 -2
warp/native/tile.h +303 -70
warp/native/tile_radix_sort.h +5 -1
warp/native/tile_reduce.h +16 -25
warp/native/tuple.h +2 -2
warp/native/vec.h +385 -18
warp/native/volume.cpp +54 -54
warp/native/volume.cu +1 -1
warp/native/volume.h +2 -1
warp/native/volume_builder.cu +30 -37
warp/native/warp.cpp +150 -149
warp/native/warp.cu +337 -193
warp/native/warp.h +227 -226
warp/optim/linear.py +736 -271
warp/render/imgui_manager.py +289 -0
warp/render/render_opengl.py +137 -57
warp/render/render_usd.py +0 -1
warp/sim/collide.py +1 -2
warp/sim/graph_coloring.py +2 -2
warp/sim/integrator_vbd.py +10 -2
warp/sparse.py +559 -176
warp/tape.py +2 -0
warp/tests/aux_test_module_aot.py +7 -0
warp/tests/cuda/test_async.py +3 -3
warp/tests/cuda/test_conditional_captures.py +101 -0
warp/tests/geometry/test_marching_cubes.py +233 -12
warp/tests/sim/test_cloth.py +89 -6
warp/tests/sim/test_coloring.py +82 -7
warp/tests/test_array.py +56 -5
warp/tests/test_assert.py +53 -0
warp/tests/test_atomic_cas.py +127 -114
warp/tests/test_codegen.py +3 -2
warp/tests/test_context.py +8 -15
warp/tests/test_enum.py +136 -0
warp/tests/test_examples.py +2 -2
warp/tests/test_fem.py +45 -2
warp/tests/test_fixedarray.py +229 -0
warp/tests/test_func.py +18 -15
warp/tests/test_future_annotations.py +7 -5
warp/tests/test_linear_solvers.py +30 -0
warp/tests/test_map.py +1 -1
warp/tests/test_mat.py +1540 -378
warp/tests/test_mat_assign_copy.py +178 -0
warp/tests/test_mat_constructors.py +574 -0
warp/tests/test_module_aot.py +287 -0
warp/tests/test_print.py +69 -0
warp/tests/test_quat.py +162 -34
warp/tests/test_quat_assign_copy.py +145 -0
warp/tests/test_reload.py +2 -1
warp/tests/test_sparse.py +103 -0
warp/tests/test_spatial.py +140 -34
warp/tests/test_spatial_assign_copy.py +160 -0
warp/tests/test_static.py +48 -0
warp/tests/test_struct.py +43 -3
warp/tests/test_tape.py +38 -0
warp/tests/test_types.py +0 -20
warp/tests/test_vec.py +216 -441
warp/tests/test_vec_assign_copy.py +143 -0
warp/tests/test_vec_constructors.py +325 -0
warp/tests/tile/test_tile.py +206 -152
warp/tests/tile/test_tile_cholesky.py +605 -0
warp/tests/tile/test_tile_load.py +169 -0
warp/tests/tile/test_tile_mathdx.py +2 -558
warp/tests/tile/test_tile_matmul.py +179 -0
warp/tests/tile/test_tile_mlp.py +1 -1
warp/tests/tile/test_tile_reduce.py +100 -11
warp/tests/tile/test_tile_shared_memory.py +16 -16
warp/tests/tile/test_tile_sort.py +59 -55
warp/tests/unittest_suites.py +16 -0
warp/tests/walkthrough_debug.py +1 -1
warp/thirdparty/unittest_parallel.py +108 -9
warp/types.py +554 -264
warp/utils.py +68 -86
{warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/METADATA +28 -65
{warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/RECORD +150 -138
warp/native/marching.cpp +0 -19
warp/native/marching.cu +0 -514
warp/native/marching.h +0 -19
{warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/WHEEL +0 -0
{warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/licenses/LICENSE.md +0 -0
{warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/top_level.txt +0 -0

warp/__init__.pyi CHANGED Viewed

@@ -36,120 +36,299 @@ FabricArray = Generic[DType]
 IndexedFabricArray = Generic[DType]
 Tile = Generic[DType, Shape]
-from warp.types import array, array1d, array2d, array3d, array4d, constant, from_ptr
-from warp.types import indexedarray, indexedarray1d, indexedarray2d, indexedarray3d, indexedarray4d
-from warp.fabric import fabricarray, fabricarrayarray, indexedfabricarray, indexedfabricarrayarray
-from warp.types import tile
-from warp.types import bool, int8, uint8, int16, uint16, int32, uint32, int64, uint64, float16, float32, float64
-from warp.types import vec2, vec2b, vec2ub, vec2s, vec2us, vec2i, vec2ui, vec2l, vec2ul, vec2h, vec2f, vec2d
-from warp.types import vec3, vec3b, vec3ub, vec3s, vec3us, vec3i, vec3ui, vec3l, vec3ul, vec3h, vec3f, vec3d
-from warp.types import vec4, vec4b, vec4ub, vec4s, vec4us, vec4i, vec4ui, vec4l, vec4ul, vec4h, vec4f, vec4d
-from warp.types import mat22, mat22h, mat22f, mat22d
-from warp.types import mat33, mat33h, mat33f, mat33d
-from warp.types import mat44, mat44h, mat44f, mat44d
-from warp.types import quat, quath, quatf, quatd
-from warp.types import transform, transformh, transformf, transformd
-from warp.types import spatial_vector, spatial_vectorh, spatial_vectorf, spatial_vectord
-from warp.types import spatial_matrix, spatial_matrixh, spatial_matrixf, spatial_matrixd
-from warp.types import Int, Float, Scalar
-from warp.types import Bvh, Mesh, HashGrid, Volume, MarchingCubes
-from warp.types import BvhQuery, HashGridQuery, MeshQueryAABB, MeshQueryPoint, MeshQueryRay
-from warp.types import matmul, adj_matmul, batched_matmul, adj_batched_matmul
+from warp.types import array as array
+from warp.types import array1d as array1d
+from warp.types import array2d as array2d
+from warp.types import array3d as array3d
+from warp.types import array4d as array4d
+from warp.types import constant as constant
+from warp.types import from_ptr as from_ptr
+from warp.types import fixedarray as fixedarray
+from warp.types import indexedarray as indexedarray
+from warp.types import indexedarray1d as indexedarray1d
+from warp.types import indexedarray2d as indexedarray2d
+from warp.types import indexedarray3d as indexedarray3d
+from warp.types import indexedarray4d as indexedarray4d
+from warp.fabric import fabricarray as fabricarray
+from warp.fabric import fabricarrayarray as fabricarrayarray
+from warp.fabric import indexedfabricarray as indexedfabricarray
+from warp.fabric import indexedfabricarrayarray as indexedfabricarrayarray
+from warp.types import tile as tile
+from warp.types import bool as bool
+from warp.types import int8 as int8
+from warp.types import uint8 as uint8
+from warp.types import int16 as int16
+from warp.types import uint16 as uint16
+from warp.types import int32 as int32
+from warp.types import uint32 as uint32
+from warp.types import int64 as int64
+from warp.types import uint64 as uint64
+from warp.types import float16 as float16
+from warp.types import float32 as float32
+from warp.types import float64 as float64
+from warp.types import vec2 as vec2
+from warp.types import vec2b as vec2b
+from warp.types import vec2ub as vec2ub
+from warp.types import vec2s as vec2s
+from warp.types import vec2us as vec2us
+from warp.types import vec2i as vec2i
+from warp.types import vec2ui as vec2ui
+from warp.types import vec2l as vec2l
+from warp.types import vec2ul as vec2ul
+from warp.types import vec2h as vec2h
+from warp.types import vec2f as vec2f
+from warp.types import vec2d as vec2d
+from warp.types import vec3 as vec3
+from warp.types import vec3b as vec3b
+from warp.types import vec3ub as vec3ub
+from warp.types import vec3s as vec3s
+from warp.types import vec3us as vec3us
+from warp.types import vec3i as vec3i
+from warp.types import vec3ui as vec3ui
+from warp.types import vec3l as vec3l
+from warp.types import vec3ul as vec3ul
+from warp.types import vec3h as vec3h
+from warp.types import vec3f as vec3f
+from warp.types import vec3d as vec3d
+from warp.types import vec4 as vec4
+from warp.types import vec4b as vec4b
+from warp.types import vec4ub as vec4ub
+from warp.types import vec4s as vec4s
+from warp.types import vec4us as vec4us
+from warp.types import vec4i as vec4i
+from warp.types import vec4ui as vec4ui
+from warp.types import vec4l as vec4l
+from warp.types import vec4ul as vec4ul
+from warp.types import vec4h as vec4h
+from warp.types import vec4f as vec4f
+from warp.types import vec4d as vec4d
+from warp.types import mat22 as mat22
+from warp.types import mat22h as mat22h
+from warp.types import mat22f as mat22f
+from warp.types import mat22d as mat22d
+from warp.types import mat33 as mat33
+from warp.types import mat33h as mat33h
+from warp.types import mat33f as mat33f
+from warp.types import mat33d as mat33d
+from warp.types import mat44 as mat44
+from warp.types import mat44h as mat44h
+from warp.types import mat44f as mat44f
+from warp.types import mat44d as mat44d
+from warp.types import quat as quat
+from warp.types import quath as quath
+from warp.types import quatf as quatf
+from warp.types import quatd as quatd
+from warp.types import transform as transform
+from warp.types import transformh as transformh
+from warp.types import transformf as transformf
+from warp.types import transformd as transformd
+from warp.types import spatial_vector as spatial_vector
+from warp.types import spatial_vectorh as spatial_vectorh
+from warp.types import spatial_vectorf as spatial_vectorf
+from warp.types import spatial_vectord as spatial_vectord
+from warp.types import spatial_matrix as spatial_matrix
+from warp.types import spatial_matrixh as spatial_matrixh
+from warp.types import spatial_matrixf as spatial_matrixf
+from warp.types import spatial_matrixd as spatial_matrixd
+from warp.types import Int as Int
+from warp.types import Float as Float
+from warp.types import Scalar as Scalar
+from warp.types import Bvh as Bvh
+from warp.types import Mesh as Mesh
+from warp.types import HashGrid as HashGrid
+from warp.types import Volume as Volume
+from warp.types import BvhQuery as BvhQuery
+from warp.types import HashGridQuery as HashGridQuery
+from warp.types import MeshQueryAABB as MeshQueryAABB
+from warp.types import MeshQueryPoint as MeshQueryPoint
+from warp.types import MeshQueryRay as MeshQueryRay
+from warp.types import matmul as matmul
+from warp.types import adj_matmul as adj_matmul
+from warp.types import batched_matmul as batched_matmul
+from warp.types import adj_batched_matmul as adj_batched_matmul
 from warp.types import vector as vec
 from warp.types import matrix as mat
-from warp.types import dtype_from_numpy, dtype_to_numpy
-from warp.types import from_ipc_handle
-from warp.context import init, func, func_grad, func_replay, func_native, kernel, struct, overload
-from warp.context import is_cpu_available, is_cuda_available, is_device_available
-from warp.context import get_devices, get_preferred_device
-from warp.context import get_cuda_devices, get_cuda_device_count, get_cuda_device, map_cuda_device, unmap_cuda_device
-from warp.context import get_device, set_device, synchronize_device
-from warp.context import (
-    zeros,
-    zeros_like,
-    ones,
-    ones_like,
-    full,
-    full_like,
-    clone,
-    empty,
-    empty_like,
-    copy,
-    from_numpy,
-    launch,
-    launch_tiled,
-    synchronize,
-    force_load,
-    load_module,
-    event_from_ipc_handle,
-)
-from warp.context import set_module_options, get_module_options, get_module
-from warp.context import capture_begin, capture_end, capture_launch, capture_if, capture_while, capture_debug_dot_print
-from warp.context import Kernel, Function, Launch
-from warp.context import Stream, get_stream, set_stream, wait_stream, synchronize_stream
-from warp.context import Event, record_event, wait_event, synchronize_event, get_event_elapsed_time
-from warp.context import RegisteredGLBuffer
-from warp.context import is_mempool_supported, is_mempool_enabled, set_mempool_enabled
-from warp.context import (
-    set_mempool_release_threshold,
-    get_mempool_release_threshold,
-    get_mempool_used_mem_current,
-    get_mempool_used_mem_high,
-)
-from warp.context import is_mempool_access_supported, is_mempool_access_enabled, set_mempool_access_enabled
-from warp.context import is_peer_access_supported, is_peer_access_enabled, set_peer_access_enabled
-from warp.tape import Tape
-from warp.utils import ScopedTimer, ScopedDevice, ScopedStream
-from warp.utils import ScopedMempool, ScopedMempoolAccess, ScopedPeerAccess
-from warp.utils import ScopedCapture
-from warp.utils import transform_expand, quat_between_vectors
-from warp.utils import TimingResult, timing_begin, timing_end, timing_print
-from warp.utils import (
-    TIMING_KERNEL,
-    TIMING_KERNEL_BUILTIN,
-    TIMING_MEMCPY,
-    TIMING_MEMSET,
-    TIMING_GRAPH,
-    TIMING_ALL,
-)
-from warp.utils import map
-from warp.torch import from_torch, to_torch
-from warp.torch import dtype_from_torch, dtype_to_torch
-from warp.torch import device_from_torch, device_to_torch
-from warp.torch import stream_from_torch, stream_to_torch
-from warp.jax import from_jax, to_jax
-from warp.jax import dtype_from_jax, dtype_to_jax
-from warp.jax import device_from_jax, device_to_jax
-from warp.dlpack import from_dlpack, to_dlpack
-from warp.paddle import from_paddle, to_paddle
-from warp.paddle import dtype_from_paddle, dtype_to_paddle
-from warp.paddle import device_from_paddle, device_to_paddle
-from warp.paddle import stream_from_paddle
-from warp.build import clear_kernel_cache
-from warp.build import clear_lto_cache
+from warp.types import matrix_from_cols as matrix_from_cols
+from warp.types import matrix_from_rows as matrix_from_rows
+from warp.types import dtype_from_numpy as dtype_from_numpy
+from warp.types import dtype_to_numpy as dtype_to_numpy
+from warp.types import from_ipc_handle as from_ipc_handle
+from warp.context import init as init
+from warp.context import func as func
+from warp.context import func_grad as func_grad
+from warp.context import func_replay as func_replay
+from warp.context import func_native as func_native
+from warp.context import kernel as kernel
+from warp.context import struct as struct
+from warp.context import overload as overload
+from warp.context import is_cpu_available as is_cpu_available
+from warp.context import is_cuda_available as is_cuda_available
+from warp.context import is_device_available as is_device_available
+from warp.context import get_devices as get_devices
+from warp.context import get_preferred_device as get_preferred_device
+from warp.context import get_cuda_devices as get_cuda_devices
+from warp.context import get_cuda_device_count as get_cuda_device_count
+from warp.context import get_cuda_device as get_cuda_device
+from warp.context import map_cuda_device as map_cuda_device
+from warp.context import unmap_cuda_device as unmap_cuda_device
+from warp.context import get_device as get_device
+from warp.context import set_device as set_device
+from warp.context import synchronize_device as synchronize_device
+from warp.context import zeros as zeros
+from warp.context import zeros_like as zeros_like
+from warp.context import ones as ones
+from warp.context import ones_like as ones_like
+from warp.context import full as full
+from warp.context import full_like as full_like
+from warp.context import clone as clone
+from warp.context import empty as empty
+from warp.context import empty_like as empty_like
+from warp.context import copy as copy
+from warp.context import from_numpy as from_numpy
+from warp.context import launch as launch
+from warp.context import launch_tiled as launch_tiled
+from warp.context import synchronize as synchronize
+from warp.context import compile_aot_module as compile_aot_module
+from warp.context import force_load as force_load
+from warp.context import load_module as load_module
+from warp.context import load_aot_module as load_aot_module
+from warp.context import event_from_ipc_handle as event_from_ipc_handle
+from warp.context import set_module_options as set_module_options
+from warp.context import get_module_options as get_module_options
+from warp.context import get_module as get_module
+from warp.context import capture_begin as capture_begin
+from warp.context import capture_end as capture_end
+from warp.context import capture_launch as capture_launch
+from warp.context import capture_if as capture_if
+from warp.context import capture_while as capture_while
+from warp.context import capture_debug_dot_print as capture_debug_dot_print
+from warp.context import Kernel as Kernel
+from warp.context import Function as Function
+from warp.context import Launch as Launch
+from warp.context import Stream as Stream
+from warp.context import get_stream as get_stream
+from warp.context import set_stream as set_stream
+from warp.context import wait_stream as wait_stream
+from warp.context import synchronize_stream as synchronize_stream
+from warp.context import Event as Event
+from warp.context import record_event as record_event
+from warp.context import wait_event as wait_event
+from warp.context import synchronize_event as synchronize_event
+from warp.context import get_event_elapsed_time as get_event_elapsed_time
+from warp.context import RegisteredGLBuffer as RegisteredGLBuffer
+from warp.context import is_mempool_supported as is_mempool_supported
+from warp.context import is_mempool_enabled as is_mempool_enabled
+from warp.context import set_mempool_enabled as set_mempool_enabled
+from warp.context import set_mempool_release_threshold as set_mempool_release_threshold
+from warp.context import get_mempool_release_threshold as get_mempool_release_threshold
+from warp.context import get_mempool_used_mem_current as get_mempool_used_mem_current
+from warp.context import get_mempool_used_mem_high as get_mempool_used_mem_high
+from warp.context import is_mempool_access_supported as is_mempool_access_supported
+from warp.context import is_mempool_access_enabled as is_mempool_access_enabled
+from warp.context import set_mempool_access_enabled as set_mempool_access_enabled
+from warp.context import is_peer_access_supported as is_peer_access_supported
+from warp.context import is_peer_access_enabled as is_peer_access_enabled
+from warp.context import set_peer_access_enabled as set_peer_access_enabled
+from warp.tape import Tape as Tape
+from warp.utils import ScopedTimer as ScopedTimer
+from warp.utils import ScopedDevice as ScopedDevice
+from warp.utils import ScopedStream as ScopedStream
+from warp.utils import ScopedMempool as ScopedMempool
+from warp.utils import ScopedMempoolAccess as ScopedMempoolAccess
+from warp.utils import ScopedPeerAccess as ScopedPeerAccess
+from warp.utils import ScopedCapture as ScopedCapture
+from warp.utils import transform_expand as transform_expand
+from warp.utils import quat_between_vectors as quat_between_vectors
+from warp.utils import TimingResult as TimingResult
+from warp.utils import timing_begin as timing_begin
+from warp.utils import timing_end as timing_end
+from warp.utils import timing_print as timing_print
+from warp.utils import TIMING_KERNEL as TIMING_KERNEL
+from warp.utils import TIMING_KERNEL_BUILTIN as TIMING_KERNEL_BUILTIN
+from warp.utils import TIMING_MEMCPY as TIMING_MEMCPY
+from warp.utils import TIMING_MEMSET as TIMING_MEMSET
+from warp.utils import TIMING_GRAPH as TIMING_GRAPH
+from warp.utils import TIMING_ALL as TIMING_ALL
+from warp.utils import map as map
+from warp.marching_cubes import MarchingCubes as MarchingCubes
+from warp.torch import from_torch as from_torch
+from warp.torch import to_torch as to_torch
+from warp.torch import dtype_from_torch as dtype_from_torch
+from warp.torch import dtype_to_torch as dtype_to_torch
+from warp.torch import device_from_torch as device_from_torch
+from warp.torch import device_to_torch as device_to_torch
+from warp.torch import stream_from_torch as stream_from_torch
+from warp.torch import stream_to_torch as stream_to_torch
+from warp.jax import from_jax as from_jax
+from warp.jax import to_jax as to_jax
+from warp.jax import dtype_from_jax as dtype_from_jax
+from warp.jax import dtype_to_jax as dtype_to_jax
+from warp.jax import device_from_jax as device_from_jax
+from warp.jax import device_to_jax as device_to_jax
+from warp.dlpack import from_dlpack as from_dlpack
+from warp.dlpack import to_dlpack as to_dlpack
+from warp.paddle import from_paddle as from_paddle
+from warp.paddle import to_paddle as to_paddle
+from warp.paddle import dtype_from_paddle as dtype_from_paddle
+from warp.paddle import dtype_to_paddle as dtype_to_paddle
+from warp.paddle import device_from_paddle as device_from_paddle
+from warp.paddle import device_to_paddle as device_to_paddle
+from warp.paddle import stream_from_paddle as stream_from_paddle
+from warp.build import clear_kernel_cache as clear_kernel_cache
+from warp.build import clear_lto_cache as clear_lto_cache
 from warp.constants import *
 from . import builtins
-from warp.builtins import static
+from warp.builtins import static as static
 from warp.math import *
-import warp.config as config
+from . import config as config
 __version__ = config.version
@@ -924,7 +1103,7 @@ def tile_arange(*args: Scalar, dtype: Scalar, storage: str) -> Tile[Scalar, Tupl
 @over
 def tile_load(
-    a: Array[Any], shape: Tuple[int, ...], offset: Tuple[int, ...], storage: str
+    a: Array[Any], shape: Tuple[int, ...], offset: Tuple[int, ...], storage: str, bounds_check: bool
 ) -> Tile[Any, Tuple[int, ...]]:
     """Loads a tile from a global memory array.
@@ -935,12 +1114,80 @@ def tile_load(
     :param offset: Offset in the source array to begin reading from (optional)
     :param storage: The storage location for the tile: ``"register"`` for registers
       (default) or ``"shared"`` for shared memory.
+    :param bounds_check: Needed for unaligned tiles, but can disable for memory-aligned tiles for faster load times
     :returns: A tile with shape as specified and data type the same as the source array
     """
     ...
 @over
-def tile_store(a: Array[Any], t: Tile[Any, Tuple[int, ...]], offset: Tuple[int, ...]):
+def tile_load_indexed(
+    a: Array[Any],
+    indices: Tile[int32, Tuple[int]],
+    shape: Tuple[int, ...],
+    offset: Tuple[int, ...],
+    axis: int32,
+    storage: str,
+) -> Tile[Any, Tuple[int, ...]]:
+    """Loads a tile from a global memory array, with loads along a specified axis mapped according to a 1D tile of indices.
+    :param a: The source array in global memory
+    :param indices: A 1D tile of integer indices mapping to elements in ``a``.
+    :param shape: Shape of the tile to load, must have the same number of dimensions as ``a``, and along ``axis``, it must have the same number of elements as the ``indices`` tile.
+    :param offset: Offset in the source array to begin reading from (optional)
+    :param axis: Axis of ``a`` that indices refer to
+    :param storage: The storage location for the tile: ``"register"`` for registers (default) or ``"shared"`` for shared memory.
+    :returns: A tile with shape as specified and data type the same as the source array
+    This example shows how to select and store the even indexed rows from a 2D array:
+    .. code-block:: python
+        TILE_M = wp.constant(2)
+        TILE_N = wp.constant(2)
+        HALF_M = wp.constant(TILE_M // 2)
+        HALF_N = wp.constant(TILE_N // 2)
+        @wp.kernel
+        def compute(x: wp.array2d(dtype=float), y: wp.array2d(dtype=float)):
+            i, j = wp.tid()
+            evens = wp.tile_arange(HALF_M, dtype=int, storage="shared") * 2
+            t0 = wp.tile_load_indexed(
+                x, indices=evens, shape=(HALF_M, TILE_N), offset=(i * TILE_M, j * TILE_N), axis=0, storage="register"
+            )
+            wp.tile_store(y, t0, offset=(i * HALF_M, j * TILE_N))
+        M = TILE_M * 2
+        N = TILE_N * 2
+        arr = np.arange(M * N).reshape(M, N)
+        x = wp.array(arr, dtype=float)
+        y = wp.zeros((M // 2, N), dtype=float)
+        wp.launch_tiled(compute, dim=[2, 2], inputs=[x], outputs=[y], block_dim=32, device=device)
+        print(x.numpy())
+        print(y.numpy())
+    Prints:
+    .. code-block:: text
+        [[ 0.  1.  2.  3.]
+         [ 4.  5.  6.  7.]
+         [ 8.  9. 10. 11.]
+         [12. 13. 14. 15.]]
+        [[ 0.  1.  2.  3.]
+         [ 8.  9. 10. 11.]]
+    """
+    ...
+@over
+def tile_store(a: Array[Any], t: Tile[Any, Tuple[int, ...]], offset: Tuple[int, ...], bounds_check: bool):
     """Store a tile to a global memory array.
     This method will cooperatively store a tile to global memory using all threads in the block.
@@ -948,22 +1195,147 @@ def tile_store(a: Array[Any], t: Tile[Any, Tuple[int, ...]], offset: Tuple[int,
     :param a: The destination array in global memory
     :param t: The source tile to store data from, must have the same data type and number of dimensions as the destination array
     :param offset: Offset in the destination array (optional)
+    :param bounds_check: Needed for unaligned tiles, but can disable for memory-aligned tiles for faster write times
+    """
+    ...
+@over
+def tile_store_indexed(
+    a: Array[Any], indices: Tile[int32, Tuple[int]], t: Tile[Any, Tuple[int, ...]], offset: Tuple[int, ...], axis: int32
+):
+    """Store a tile to a global memory array, with storage along a specified axis mapped according to a 1D tile of indices.
+    :param a: The destination array in global memory
+    :param indices: A 1D tile of integer indices mapping to elements in ``a``.
+    :param t: The source tile to store data from, must have the same data type and number of dimensions as the destination array, and along ``axis``, it must have the same number of elements as the ``indices`` tile.
+    :param offset: Offset in the destination array (optional)
+    :param axis: Axis of ``a`` that indices refer to
+    This example shows how to map tile rows to the even rows of a 2D array:
+    .. code-block:: python
+        TILE_M = wp.constant(2)
+        TILE_N = wp.constant(2)
+        TWO_M = wp.constant(TILE_M * 2)
+        TWO_N = wp.constant(TILE_N * 2)
+        @wp.kernel
+        def compute(x: wp.array2d(dtype=float), y: wp.array2d(dtype=float)):
+            i, j = wp.tid()
+            t = wp.tile_load(x, shape=(TILE_M, TILE_N), offset=(i * TILE_M, j * TILE_N), storage="register")
+            evens_M = wp.tile_arange(TILE_M, dtype=int, storage="shared") * 2
+            wp.tile_store_indexed(y, indices=evens_M, t=t, offset=(i * TWO_M, j * TILE_N), axis=0)
+        M = TILE_M * 2
+        N = TILE_N * 2
+        arr = np.arange(M * N, dtype=float).reshape(M, N)
+        x = wp.array(arr, dtype=float, requires_grad=True, device=device)
+        y = wp.zeros((M * 2, N), dtype=float, requires_grad=True, device=device)
+        wp.launch_tiled(compute, dim=[2, 2], inputs=[x], outputs=[y], block_dim=32, device=device)
+        print(x.numpy())
+        print(y.numpy())
+    Prints:
+    .. code-block:: text
+        [[ 0.  1.  2.  3.]
+         [ 4.  5.  6.  7.]
+         [ 8.  9. 10. 11.]
+         [12. 13. 14. 15.]]
+        [[ 0.  1.  2.  3.]
+         [ 0.  0.  0.  0.]
+         [ 4.  5.  6.  7.]
+         [ 0.  0.  0.  0.]
+         [ 8.  9. 10. 11.]
+         [ 0.  0.  0.  0.]
+         [12. 13. 14. 15.]
+         [ 0.  0.  0.  0.]]
     """
     ...
 @over
 def tile_atomic_add(
-    a: Array[Any], t: Tile[Any, Tuple[int, ...]], offset: Tuple[int, ...]
+    a: Array[Any], t: Tile[Any, Tuple[int, ...]], offset: Tuple[int, ...], bounds_check: bool
 ) -> Tile[Any, Tuple[int, ...]]:
     """Atomically add a tile onto the array `a`, each element will be updated atomically.
     :param a: Array in global memory, should have the same ``dtype`` as the input tile
     :param t: Source tile to add to the destination array
     :param offset: Offset in the destination array (optional)
+    :param bounds_check: Needed for unaligned tiles, but can disable for memory-aligned tiles for faster write times
     :returns: A tile with the same dimensions and data type as the source tile, holding the original value of the destination elements
     """
     ...
+@over
+def tile_atomic_add_indexed(
+    a: Array[Any], indices: Tile[int32, Tuple[int]], t: Tile[Any, Tuple[int, ...]], offset: Tuple[int, ...], axis: int32
+) -> Tile[Any, Tuple[int, ...]]:
+    """Atomically add a tile to a global memory array, with storage along a specified axis mapped according to a 1D tile of indices.
+    :param a: The destination array in global memory
+    :param indices: A 1D tile of integer indices mapping to elements in ``a``.
+    :param t: The source tile to extract data from, must have the same data type and number of dimensions as the destination array, and along ``axis``, it must have the same number of elements as the ``indices`` tile.
+    :param offset: Offset in the destination array (optional)
+    :param axis: Axis of ``a`` that indices refer to
+    This example shows how to compute a blocked, row-wise reduction:
+    .. code-block:: python
+        TILE_M = wp.constant(2)
+        TILE_N = wp.constant(2)
+        @wp.kernel
+        def tile_atomic_add_indexed(x: wp.array2d(dtype=float), y: wp.array2d(dtype=float)):
+            i, j = wp.tid()
+            t = wp.tile_load(x, shape=(TILE_M, TILE_N), offset=(i * TILE_M, j * TILE_N), storage="register")
+            zeros = wp.tile_zeros(TILE_M, dtype=int, storage="shared")
+            wp.tile_atomic_add_indexed(y, indices=zeros, t=t, offset=(i, j * TILE_N), axis=0)
+        M = TILE_M * 2
+        N = TILE_N * 2
+        arr = np.arange(M * N, dtype=float).reshape(M, N)
+        x = wp.array(arr, dtype=float, requires_grad=True, device=device)
+        y = wp.zeros((2, N), dtype=float, requires_grad=True, device=device)
+        wp.launch_tiled(tile_atomic_add_indexed, dim=[2, 2], inputs=[x], outputs=[y], block_dim=32, device=device)
+        print(x.numpy())
+        print(y.numpy())
+    Prints:
+    .. code-block:: text
+        [[ 0.  1.  2.  3.]
+         [ 4.  5.  6.  7.]
+         [ 8.  9. 10. 11.]
+         [12. 13. 14. 15.]]
+        [[ 4.  6.  8. 10.]
+         [20. 22. 24. 26.]]
+    """
+    ...
 @over
 def tile_view(
     t: Tile[Any, Tuple[int, ...]], offset: Tuple[int, ...], shape: Tuple[int, ...]
@@ -1370,7 +1742,7 @@ def tile_map(op: Callable, a: Tile[Scalar, Tuple[int, ...]]) -> Tile[Scalar, Tup
     :param op: A callable function that accepts one argument and returns one argument, may be a user function or builtin
     :param a: The input tile, the operator (or one of its overloads) must be able to accept the tile's data type
-    :returns: A tile with the same dimensions and data type as the input tile.
+    :returns: A tile with the same dimensions as the input tile. Its datatype is specified by the return type of op
     Example:
@@ -1401,12 +1773,12 @@ def tile_map(
     """Apply a binary function onto the tile.
     This function cooperatively applies a binary function to each element of the tiles using all threads in the block.
-    Both input tiles must have the same dimensions and datatype.
+    Both input tiles must have the same dimensions, and if using a builtin op, the same datatypes.
     :param op: A callable function that accepts two arguments and returns one argument, all of the same type, may be a user function or builtin
     :param a: The first input tile, the operator (or one of its overloads) must be able to accept the tile's dtype
     :param b: The second input tile, the operator (or one of its overloads) must be able to accept the tile's dtype
-    :returns: A tile with the same dimensions and datatype as the input tiles.
+    :returns: A tile with the same dimensions as the input tiles. Its datatype is specified by the return type of op
     Example:
@@ -2971,7 +3343,7 @@ def mod(a: Scalar, b: Scalar) -> Scalar:
     ...
 @over
-def mod(a: Vector[Any, Scalar], b: Vector[Any, Scalar]) -> Scalar:
+def mod(a: Vector[Any, Scalar], b: Vector[Any, Scalar]) -> Vector[Any, Scalar]:
     """Modulo operation using truncated division."""
     ...

warp/bin/warp-clang.dll CHANGED Viewed

Binary file

warp/bin/warp.dll CHANGED Viewed

Binary file