PyPI - triton-windows - Versions diffs - 3.4.0.post20__cp313-cp313-win_amd64.whl → 3.5.0.post21__cp313-cp313-win_amd64.whl - Mend

triton-windows 3.4.0.post20__cp313-cp313-win_amd64.whl → 3.5.0.post21__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of triton-windows might be problematic. Click here for more details.

Files changed (107) hide show

triton/_C/libtriton.pyd +0 -0
triton/__init__.py +8 -2
triton/_filecheck.py +24 -14
triton/_internal_testing.py +70 -4
triton/_utils.py +3 -1
triton/backends/amd/compiler.py +68 -60
triton/backends/amd/driver.c +113 -44
triton/backends/amd/driver.py +133 -57
triton/backends/driver.py +13 -0
triton/backends/nvidia/compiler.py +80 -22
triton/backends/nvidia/driver.c +88 -15
triton/backends/nvidia/driver.py +130 -123
triton/compiler/__init__.py +5 -2
triton/compiler/code_generator.py +270 -163
triton/compiler/compiler.py +45 -62
triton/experimental/gluon/__init__.py +3 -2
triton/experimental/gluon/_runtime.py +9 -6
triton/experimental/gluon/language/__init__.py +117 -16
triton/experimental/gluon/language/_core.py +246 -68
triton/experimental/gluon/language/_layouts.py +398 -45
triton/experimental/gluon/language/_math.py +17 -9
triton/experimental/gluon/language/_semantic.py +130 -37
triton/experimental/gluon/language/_standard.py +55 -22
triton/experimental/gluon/language/amd/__init__.py +4 -0
triton/experimental/gluon/language/amd/_layouts.py +96 -0
triton/experimental/gluon/language/amd/cdna3/__init__.py +100 -0
triton/experimental/gluon/language/amd/cdna4/__init__.py +48 -0
triton/experimental/gluon/language/amd/cdna4/async_copy.py +151 -0
triton/experimental/gluon/language/extra/__init__.py +3 -0
triton/experimental/gluon/language/nvidia/ampere/__init__.py +3 -0
triton/experimental/gluon/language/nvidia/ampere/async_copy.py +74 -0
triton/experimental/gluon/language/nvidia/ampere/mbarrier.py +80 -0
triton/experimental/gluon/language/nvidia/blackwell/__init__.py +192 -7
triton/experimental/gluon/language/nvidia/blackwell/tma.py +20 -0
triton/experimental/gluon/language/nvidia/hopper/__init__.py +124 -3
triton/experimental/gluon/language/nvidia/hopper/mbarrier.py +20 -37
triton/experimental/gluon/language/nvidia/hopper/tma.py +4 -3
triton/experimental/gluon/nvidia/hopper.py +6 -1
triton/knobs.py +132 -67
triton/language/__init__.py +16 -10
triton/language/core.py +163 -83
triton/language/extra/cuda/gdc.py +6 -6
triton/language/extra/hip/__init__.py +3 -1
triton/language/extra/hip/libdevice.py +7 -0
triton/language/extra/hip/utils.py +35 -0
triton/language/extra/libdevice.py +4 -0
triton/language/semantic.py +76 -23
triton/language/standard.py +14 -14
triton/language/target_info.py +54 -0
triton/runtime/_allocation.py +15 -3
triton/runtime/_async_compile.py +55 -0
triton/runtime/autotuner.py +4 -5
triton/runtime/build.py +11 -9
triton/runtime/cache.py +44 -1
triton/runtime/driver.py +16 -41
triton/runtime/interpreter.py +31 -23
triton/runtime/jit.py +318 -157
triton/runtime/tcc/include/_mingw.h +8 -10
triton/runtime/tcc/include/assert.h +5 -0
triton/runtime/tcc/include/errno.h +1 -1
triton/runtime/tcc/include/float.h +21 -3
triton/runtime/tcc/include/iso646.h +36 -0
triton/runtime/tcc/include/limits.h +5 -0
triton/runtime/tcc/include/malloc.h +2 -2
triton/runtime/tcc/include/math.h +21 -261
triton/runtime/tcc/include/stdalign.h +16 -0
triton/runtime/tcc/include/stdarg.h +5 -70
triton/runtime/tcc/include/stdatomic.h +171 -0
triton/runtime/tcc/include/stddef.h +7 -19
triton/runtime/tcc/include/stdlib.h +15 -4
triton/runtime/tcc/include/stdnoreturn.h +7 -0
triton/runtime/tcc/include/sys/stat.h +2 -2
triton/runtime/tcc/include/sys/types.h +5 -0
triton/runtime/tcc/include/tcc/tcc_libm.h +444 -27
triton/runtime/tcc/include/tccdefs.h +342 -0
triton/runtime/tcc/include/tgmath.h +89 -0
triton/runtime/tcc/include/uchar.h +33 -0
triton/runtime/tcc/include/unistd.h +1 -0
triton/runtime/tcc/include/winapi/qos.h +72 -0
triton/runtime/tcc/include/winapi/shellapi.h +59 -0
triton/runtime/tcc/include/winapi/winbase.h +9 -2
triton/runtime/tcc/include/winapi/wincon.h +8 -0
triton/runtime/tcc/include/winapi/windows.h +1 -1
triton/runtime/tcc/include/winapi/winnls.h +778 -0
triton/runtime/tcc/include/winapi/winnt.h +9 -7
triton/runtime/tcc/include/winapi/winsock2.h +1474 -0
triton/runtime/tcc/include/winapi/ws2ipdef.h +21 -0
triton/runtime/tcc/include/winapi/ws2tcpip.h +391 -0
triton/runtime/tcc/lib/libtcc1.a +0 -0
triton/runtime/tcc/lib/python314.def +1800 -0
triton/runtime/tcc/lib/python314t.def +1809 -0
triton/runtime/tcc/libtcc.dll +0 -0
triton/runtime/tcc/tcc.exe +0 -0
triton/tools/compile.py +62 -14
triton/tools/extra/cuda/compile.c +1 -0
triton/tools/extra/hip/compile.cpp +66 -0
triton/tools/extra/hip/compile.h +13 -0
triton/tools/ragged_tma.py +92 -0
triton/tools/tensor_descriptor.py +7 -9
triton/windows_utils.py +42 -79
{triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/METADATA +3 -4
{triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/RECORD +106 -75
triton/runtime/tcc/lib/libtcc1-64.a +0 -0
{triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/WHEEL +0 -0
{triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/entry_points.txt +0 -0
{triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/licenses/LICENSE +0 -0
{triton_windows-3.4.0.post20.dist-info → triton_windows-3.5.0.post21.dist-info}/top_level.txt +0 -0

triton/runtime/tcc/include/tcc/tcc_libm.h CHANGED Viewed

@@ -2,6 +2,7 @@
 #define _TCC_LIBM_H_
 #include "../math.h"
+#include "../stdint.h"
 /* TCC uses 8 bytes for double and long double, so effectively the l variants
  * are never used. For now, they just run the normal (double) variant.
@@ -46,7 +47,7 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 /* fpclassify */
 __CRT_INLINE int __cdecl __fpclassify (double x) {
-  union {double f; uint64_t i;} u = {x};
+  union {double f; uint64_t i;} u = {.f = x};
   int e = u.i>>52 & 0x7ff;
   if (!e) return u.i<<1 ? FP_SUBNORMAL : FP_ZERO;
   if (e==0x7ff) return u.i<<12 ? FP_NAN : FP_INFINITE;
@@ -54,7 +55,7 @@ __CRT_INLINE int __cdecl __fpclassify (double x) {
 }
 __CRT_INLINE int __cdecl __fpclassifyf (float x) {
-  union {float f; uint32_t i;} u = {x};
+  union {float f; uint32_t i;} u = {.f = x};
   int e = u.i>>23 & 0xff;
   if (!e) return u.i<<1 ? FP_SUBNORMAL : FP_ZERO;
   if (e==0xff) return u.i<<9 ? FP_NAN : FP_INFINITE;
@@ -69,13 +70,13 @@ __CRT_INLINE int __cdecl __fpclassifyl (long double x) {
 /* signbit */
 __CRT_INLINE int __cdecl __signbit (double x) {
-  union {double d; uint64_t i;} y = { x };
-  return y.i>>63;
+  union {double f; uint64_t i;} u = {.f = x};
+  return u.i>>63;
 }
 __CRT_INLINE int __cdecl __signbitf (float x) {
-  union {float f; uint32_t i; } y = { x };
-  return y.i>>31;
+  union {float f; uint32_t i; } u = {.f = x};
+  return u.i>>31;
 }
 __CRT_INLINE int __cdecl __signbitl (long double x) {
@@ -122,21 +123,13 @@ __CRT_INLINE long double __cdecl fmaxl (long double x, long double y) {
 /* *round* */
-#define TCCFP_FORCE_EVAL(x) do {            \
-if (sizeof(x) == sizeof(float)) {           \
-  volatile float __x;                       \
-  __x = (x);                                \
-} else if (sizeof(x) == sizeof(double)) {   \
-  volatile double __x;                      \
-  __x = (x);                                \
-} else {                                    \
-  volatile long double __x;                 \
-  __x = (x);                                \
-}                                           \
+#define TCCFP_FORCE_EVAL(x) do {  \
+  volatile typeof(x) __x;         \
+  __x = (x);                      \
 } while(0)
 __CRT_INLINE double __cdecl round (double x) {
-  union {double f; uint64_t i;} u = {x};
+  union {double f; uint64_t i;} u = {.f = x};
   int e = u.i >> 52 & 0x7ff;
   double y;
@@ -150,15 +143,8 @@ __CRT_INLINE double __cdecl round (double x) {
     return 0*u.f;
   }
   y = (double)(x + 0x1p52) - 0x1p52 - x;
-  if (y > 0.5)
-    y = y + x - 1;
-  else if (y <= -0.5)
-    y = y + x + 1;
-  else
-    y = y + x;
-  if (u.i >> 63)
-    y = -y;
-  return y;
+  y = y + x - (y > 0.5) + (y <= -0.5); /* branchless */
+  return (u.i >> 63) ? -y : y;
 }
 __CRT_INLINE long __cdecl lround (double x) {
@@ -194,8 +180,439 @@ __CRT_INLINE long long __cdecl llroundl (long double x) {
 }
+/* MUSL asinh, acosh, atanh */
+__CRT_INLINE double __cdecl asinh(double x) {
+  union {double f; uint64_t i;} u = {.f = x};
+  unsigned e = u.i >> 52 & 0x7ff, s = u.i >> 63;
+  u.i &= -1ull / 2, x = u.f;
+  if (e >= 0x3ff + 26) x = log(x) + 0.693147180559945309;
+  else if (e >= 0x3ff + 1) x = log(2*x + 1 / (sqrt(x*x + 1) + x)); /* |x|>=2 */
+  else if (e >= 0x3ff - 26) x = log1p(x + x*x / (sqrt(x*x + 1) + 1));
+  else TCCFP_FORCE_EVAL(x + 0x1p120f);
+  return s ? -x : x;
+}
+__CRT_INLINE double __cdecl acosh(double x) {
+  union {double f; uint64_t i;} u = {.f = x};
+  unsigned e = u.i >> 52 & 0x7ff;
+  if (e < 0x3ff + 1) return --x, log1p(x + sqrt(x*x + 2*x)); /* |x|<2 */
+  if (e < 0x3ff + 26) return log(2*x - 1 / (x + sqrt(x*x - 1)));
+  return log(x) + 0.693147180559945309;
+}
+__CRT_INLINE double __cdecl atanh(double x) {
+  union {double f; uint64_t i;} u = {.f = x};
+  unsigned e = u.i >> 52 & 0x7ff, s = u.i >> 63;
+  u.i &= -1ull / 2, x = u.f;
+  if (e < 0x3ff - 1) {
+    if (e < 0x3ff - 32) { if (e == 0) TCCFP_FORCE_EVAL((float)x); }
+    else x = 0.5 * log1p(2*x + 2*x*x / (1 - x)); /* |x| < 0.5 */
+  } else x = 0.5 * log1p(2*(x / (1 - x))); /* avoid overflow */
+  return s ? -x : x;
+}
+/* MUSL scalbn */
+__CRT_INLINE double __cdecl scalbn(double x, int n) {
+  union {double f; uint64_t i;} u;
+  if (n > 1023) {
+    x *= 0x1p1023, n -= 1023;
+    if (n > 1023) {
+      x *= 0x1p1023, n -= 1023;
+      if (n > 1023) n = 1023;
+    }
+  } else if (n < -1022) {
+    x *= 0x1p-1022 * 0x1p53, n += 1022 - 53;
+    if (n < -1022) {
+      x *= 0x1p-1022 * 0x1p53, n += 1022 - 53;
+      if (n < -1022) n = -1022;
+    }
+  }
+  u.i = (0x3ffull + n) << 52;
+  return x * u.f;
+}
+/* MUSL: Override msvcrt frexp(): 4.5x speedup! */
+__CRT_INLINE double __cdecl frexp(double x, int *e) {
+  union {double f; uint64_t i;} u = {.f = x};
+  int ee = u.i>>52 & 0x7ff;
+  if (!ee) {
+    if (x) x = frexp(x*0x1p64, e), *e -= 64;
+    else *e = 0;
+    return x;
+  } else if (ee == 0x7ff)
+    return x;
+  *e = ee - 0x3fe;
+  u.i &= 0x800fffffffffffffull;
+  u.i |= 0x3fe0000000000000ull;
+  return u.f;
+}
+/* MUSL nan */
+__CRT_INLINE double __cdecl nan(const char* s) {
+  return NAN;
+}
+__CRT_INLINE float __cdecl nanf(const char* s) {
+  return NAN;
+}
+__CRT_INLINE long double __cdecl nanl(const char* s) {
+  return NAN;
+}
 /*******************************************************************************
   End of code based on MUSL
 *******************************************************************************/
+/* Following are math functions missing from msvcrt.dll, and not defined
+ * in math.h or above. Functions still remaining:
+ * remquo(), remainder(), fma(), erf(), erfc(), nearbyint().
+ * In <stdlib.h>: lldiv().
+ */
+__CRT_INLINE float __cdecl scalbnf(float x, int n) {
+  return scalbn(x, n);
+}
+__CRT_INLINE long double __cdecl scalbnl(long double x, int n) {
+  return scalbn(x, n);
+}
+__CRT_INLINE double __cdecl scalbln(double x, long n) {
+  return scalbn(x, n);
+}
+__CRT_INLINE float __cdecl scalblnf(float x, long n) {
+  return scalbn(x, n);
+}
+__CRT_INLINE long double __cdecl scalblnl(long double x, long n) {
+  return scalbn(x, n);
+}
+/* Override msvcrt ldexp(): 7.3x speedup! */
+__CRT_INLINE double __cdecl ldexp(double x, int expn) {
+  return scalbn(x, expn);
+}
+__CRT_INLINE float __cdecl ldexpf(float x, int expn) {
+  return scalbn(x, expn);
+}
+__CRT_INLINE long double __cdecl ldexpl(long double x, int expn) {
+  return scalbn(x, expn);
+}
+__CRT_INLINE float __cdecl frexpf(float x, int *y) {
+  return frexp(x, y);
+}
+__CRT_INLINE long double __cdecl frexpl (long double x, int* y) {
+  return frexp(x, y);
+}
+__CRT_INLINE double __cdecl rint(double x) {
+double retval;
+  __asm__ (
+    "fldl    %1\n"
+    "frndint   \n"
+    "fstpl    %0\n" : "=m" (retval) : "m" (x));
+  return retval;
+}
+__CRT_INLINE float __cdecl rintf(float x) {
+  float retval;
+  __asm__ (
+    "flds    %1\n"
+    "frndint   \n"
+    "fstps    %0\n" : "=m" (retval) : "m" (x));
+   return retval;
+}
+__CRT_INLINE long double __cdecl rintl (long double x) {
+  return rint(x);
+}
+/* 7.12.9.5 */
+__CRT_INLINE long __cdecl lrint(double x) {
+  long retval;
+  __asm__ __volatile__
+    ("fldl   %1\n"
+     "fistpl %0"  : "=m" (retval) : "m" (x));
+  return retval;
+}
+__CRT_INLINE long __cdecl lrintf(float x) {
+  long retval;
+  __asm__ __volatile__
+    ("flds   %1\n"
+     "fistpl %0"  : "=m" (retval) : "m" (x));
+  return retval;
+}
+__CRT_INLINE long __cdecl lrintl (long double x) {
+  return lrint(x);
+}
+__CRT_INLINE long long __cdecl llrint(double x) {
+long long retval;
+  __asm__ __volatile__
+    ("fldl    %1\n"
+     "fistpll %0"  : "=m" (retval) : "m" (x));
+  return retval;
+}
+__CRT_INLINE long long __cdecl llrintf(float x) {
+  long long retval;
+  __asm__ __volatile__
+    ("flds   %1\n"
+     "fistpll %0"  : "=m" (retval) : "m" (x));
+  return retval;
+}
+__CRT_INLINE long long __cdecl llrintl (long double x) {
+  return llrint(x);
+}
+__CRT_INLINE double __cdecl trunc(double _x) {
+  double retval;
+  unsigned short saved_cw;
+  unsigned short tmp_cw;
+  __asm__ ("fnstcw %0;" : "=m" (saved_cw)); /* save FPU control word */
+  tmp_cw = (saved_cw & ~(FE_TONEAREST | FE_DOWNWARD | FE_UPWARD | FE_TOWARDZERO))
+    | FE_TOWARDZERO;
+  __asm__ ("fldcw %0;" : : "m" (tmp_cw));
+  __asm__ ("fldl  %1;"
+           "frndint;"
+           "fstpl  %0;" : "=m" (retval)  : "m" (_x)); /* round towards zero */
+  __asm__ ("fldcw %0;" : : "m" (saved_cw) ); /* restore saved control word */
+  return retval;
+}
+__CRT_INLINE float __cdecl truncf(float x) {
+  return (float) ((intptr_t) x);
+}
+__CRT_INLINE long double __cdecl truncl(long double x) {
+  return trunc(x);
+}
+__CRT_INLINE long double __cdecl nextafterl(long double x, long double to) {
+  return nextafter(x, to);
+}
+__CRT_INLINE double __cdecl nexttoward(double x, long double to) {
+  return nextafter(x, to);
+}
+__CRT_INLINE float __cdecl nexttowardf(float x, long double to) {
+  return nextafterf(x, to);
+}
+__CRT_INLINE long double __cdecl nexttowardl(long double x, long double to) {
+  return nextafter(x, to);
+}
+/* Override msvcrt fabs(): 6.3x speedup! */
+__CRT_INLINE double __cdecl fabs(double x) {
+  return x < 0 ? -x : x;
+}
+__CRT_INLINE float __cdecl fabsf(float x) {
+  return x < 0 ? -x : x;
+}
+__CRT_INLINE long double __cdecl fabsl(long double x) {
+  return x < 0 ? -x : x;
+}
+#if defined(_WIN32) && !defined(_WIN64) && !defined(__ia64__)
+  __CRT_INLINE float acosf(float x) { return acos(x); }
+  __CRT_INLINE float asinf(float x) { return asin(x); }
+  __CRT_INLINE float atanf(float x) { return atan(x); }
+  __CRT_INLINE float atan2f(float x, float y) { return atan2(x, y); }
+  __CRT_INLINE float ceilf(float x) { return ceil(x); }
+  __CRT_INLINE float cosf(float x) { return cos(x); }
+  __CRT_INLINE float coshf(float x) { return cosh(x); }
+  __CRT_INLINE float expf(float x) { return exp(x); }
+  __CRT_INLINE float floorf(float x) { return floor(x); }
+  __CRT_INLINE float fmodf(float x, float y) { return fmod(x, y); }
+  __CRT_INLINE float logf(float x) { return log(x); }
+  __CRT_INLINE float logbf(float x) { return logb(x); }
+  __CRT_INLINE float log10f(float x) { return log10(x); }
+  __CRT_INLINE float modff(float x, float *y) { double di, df = modf(x, &di); *y = di; return df; }
+  __CRT_INLINE float powf(float x, float y) { return pow(x, y); }
+  __CRT_INLINE float sinf(float x) { return sin(x); }
+  __CRT_INLINE float sinhf(float x) { return sinh(x); }
+  __CRT_INLINE float sqrtf(float x) { return sqrt(x); }
+  __CRT_INLINE float tanf(float x) { return tan(x); }
+  __CRT_INLINE float tanhf(float x) { return tanh(x); }
+#endif
+__CRT_INLINE float __cdecl asinhf(float x) { return asinh(x); }
+__CRT_INLINE float __cdecl acoshf(float x) { return acosh(x); }
+__CRT_INLINE float __cdecl atanhf(float x) { return atanh(x); }
+__CRT_INLINE long double __cdecl asinhl(long double x) { return asinh(x); }
+__CRT_INLINE long double __cdecl acoshl(long double x) { return acosh(x); }
+__CRT_INLINE long double __cdecl atanhl(long double x) { return atanh(x); }
+__CRT_INLINE long double __cdecl asinl(long double x) { return asin(x); }
+__CRT_INLINE long double __cdecl acosl(long double x) { return acos(x); }
+__CRT_INLINE long double __cdecl atanl(long double x) { return atan(x); }
+__CRT_INLINE long double __cdecl ceill(long double x) { return ceil(x); }
+__CRT_INLINE long double __cdecl coshl(long double x) { return cosh(x); }
+__CRT_INLINE long double __cdecl cosl(long double x) { return cos(x); }
+__CRT_INLINE long double __cdecl expl(long double x) { return exp(x); }
+__CRT_INLINE long double __cdecl floorl(long double x) { return floor(x); }
+__CRT_INLINE long double __cdecl fmodl(long double x, long double y) { return fmod(x, y); }
+__CRT_INLINE long double __cdecl hypotl(long double x, long double y) { return hypot(x, y); }
+__CRT_INLINE long double __cdecl logl(long double x) { return log(x); }
+__CRT_INLINE long double __cdecl logbl(long double x) { return logb(x); }
+__CRT_INLINE long double __cdecl log10l(long double x) { return log10(x); }
+__CRT_INLINE long double __cdecl modfl(long double x, long double* y) { double y1 = *y; x = modf(x, &y1); *y = y1; return x; }
+__CRT_INLINE long double __cdecl powl(long double x, long double y) { return pow(x, y); }
+__CRT_INLINE long double __cdecl sinhl(long double x) { return sinh(x); }
+__CRT_INLINE long double __cdecl sinl(long double x) { return sin(x); }
+__CRT_INLINE long double __cdecl sqrtl(long double x) { return sqrt(x); }
+__CRT_INLINE long double __cdecl tanhl(long double x) { return tanh(x); }
+__CRT_INLINE long double __cdecl tanl(long double x) { return tan(x); }
+/* Following are accurate, but much shorter implementations than MUSL lib. */
+__CRT_INLINE double __cdecl log1p(double x) {
+  double u = 1.0 + x;
+  return u == 1.0 ? x : log(u)*(x / (u - 1.0));
+}
+__CRT_INLINE float __cdecl log1pf(float x) {
+  float u = 1.0f + x;
+  return u == 1.0f ? x : logf(u)*(x / (u - 1.0f));
+}
+__CRT_INLINE long double __cdecl log1pl(long double x) {
+  return log1p(x);
+}
+__CRT_INLINE double __cdecl expm1(double x) {
+  if (x > 0.0024 || x < -0.0024) return exp(x) - 1.0;
+  return x*(1.0 + 0.5*x*(1.0 + (1/3.0)*x*(1.0 + 0.25*x*(1.0 + 0.2*x))));
+}
+__CRT_INLINE float __cdecl expm1f(float x) {
+  if (x > 0.085f || x < -0.085f) return expf(x) - 1.0f;
+  return x*(1.0f + 0.5f*x*(1.0f + (1/3.0f)*x*(1.0f + 0.25f*x)));
+}
+__CRT_INLINE long double __cdecl expm1l(long double x) {
+  return expm1(x);
+}
+__CRT_INLINE double __cdecl cbrt(double x) {
+  return x < 0.0 ? -pow(-x, 1/3.0) : pow(x, 1/3.0);
+}
+__CRT_INLINE float __cdecl cbrtf(float x) {
+  return x < 0.0f ? -pow(-x, 1/3.0) : pow(x, 1/3.0);
+}
+__CRT_INLINE long double __cdecl cbrtl(long double x) {
+  return cbrt(x);
+}
+__CRT_INLINE double __cdecl log2(double x) {
+  return log(x) * 1.442695040888963407;
+}
+__CRT_INLINE float __cdecl log2f(float x) {
+  return log(x) * 1.442695040888963407;
+}
+__CRT_INLINE long double __cdecl log2l(long double x) {
+  return log(x) * 1.442695040888963407;
+}
+__CRT_INLINE double __cdecl exp2(double x) {
+  return exp(x * 0.693147180559945309);
+}
+__CRT_INLINE float __cdecl exp2f(float x) {
+  return exp(x * 0.693147180559945309);
+}
+__CRT_INLINE long double __cdecl exp2l(long double x) {
+  return exp(x * 0.693147180559945309);
+}
+__CRT_INLINE int __cdecl ilogb(double x) {
+  return (int) logb(x);
+}
+__CRT_INLINE int __cdecl ilogbf(float x) {
+  return (int) logbf(x);
+}
+__CRT_INLINE int __cdecl ilogbl(long double x) {
+  return (int) logb(x);
+}
+__CRT_INLINE double __cdecl fdim(double x, double y) {
+  if (isnan(x) || isnan(y)) return NAN;
+  return x > y ? x - y : 0;
+}
+__CRT_INLINE float __cdecl fdimf(float x, float y) {
+  if (isnan(x) || isnan(y)) return NAN;
+  return x > y ? x - y : 0;
+}
+__CRT_INLINE long double __cdecl fdiml(long double x, long double y) {
+  if (isnan(x) || isnan(y)) return NAN;
+  return x > y ? x - y : 0;
+}
+/* tgamma and lgamma: Lanczos approximation
+ * https://rosettacode.org/wiki/Gamma_function
+ * https://www.johndcook.com/blog/cpp_gamma
+ */
+__CRT_INLINE double __cdecl tgamma(double x) {
+  double m = 1.0, t = 3.14159265358979323;
+  if (x == floor(x)) {
+    if (x == 0) return INFINITY;
+    if (x < 0) return NAN;
+    if (x < 26) { for (double k = 2; k < x; ++k) m *= k; return m; }
+  }
+  if (x < 0.5)
+    return t / (sin(t*x)*tgamma(1.0 - x));
+  if (x > 12.0)
+    return exp(lgamma(x));
+  static const double c[8] = {676.5203681218851, -1259.1392167224028,
+                              771.32342877765313, -176.61502916214059,
+                              12.507343278686905, -0.13857109526572012,
+                              9.9843695780195716e-6, 1.5056327351493116e-7};
+  m = 0.99999999999980993, t = x + 6.5; /* x-1+8-.5 */
+  for (int k = 0; k < 8; ++k) m += c[k] / (x + k);
+  return 2.50662827463100050 * pow(t, x - 0.5)*exp(-t)*m; /* C=sqrt(2pi) */
+}
+__CRT_INLINE double __cdecl lgamma(double x) {
+  if (x < 12.0) {
+    if (x <= 0.0 && x == floor(x)) return INFINITY;
+    x = tgamma(x);
+    return log(x < 0.0 ? -x : x);
+  }
+  static const double c[7] = {1.0/12.0, -1.0/360.0, 1.0/1260.0, -1.0/1680.0,
+                              1.0/1188.0, -691.0/360360.0, 1.0/156.0};
+  double m = -3617.0/122400.0, t = 1.0 / (x*x);
+  for (int k = 6; k >= 0; --k) m = m*t + c[k];
+  return (x - 0.5)*log(x) - x + 0.918938533204672742 + m / x; /* C=log(2pi)/2 */
+}
+__CRT_INLINE float __cdecl tgammaf(float x) {
+  return tgamma(x);
+}
+__CRT_INLINE float __cdecl lgammaf(float x) {
+  return lgamma(x);
+}
+__CRT_INLINE long double __cdecl tgammal(long double x) {
+  return tgamma(x);
+}
+__CRT_INLINE long double __cdecl lgammal(long double x) {
+  return lgamma(x);
+}
 #endif /* _TCC_LIBM_H_ */