react-native-executorch 0.5.6 → 0.5.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139)
  1. package/android/libs/classes.jar +0 -0
  2. package/android/src/main/cpp/CMakeLists.txt +23 -14
  3. package/common/rnexecutorch/RnExecutorchInstaller.cpp +4 -21
  4. package/common/rnexecutorch/host_objects/ModelHostObject.h +67 -51
  5. package/common/rnexecutorch/models/llm/LLM.cpp +24 -1
  6. package/common/rnexecutorch/models/llm/LLM.h +4 -1
  7. package/common/rnexecutorch/threads/GlobalThreadPool.h +79 -0
  8. package/common/rnexecutorch/threads/HighPerformanceThreadPool.h +364 -0
  9. package/common/rnexecutorch/threads/utils/ThreadUtils.h +29 -0
  10. package/common/runner/runner.cpp +9 -3
  11. package/common/runner/runner.h +4 -3
  12. package/common/runner/text_token_generator.h +28 -10
  13. package/lib/module/controllers/LLMController.js +21 -2
  14. package/lib/module/controllers/LLMController.js.map +1 -1
  15. package/lib/module/hooks/natural_language_processing/useLLM.js +6 -2
  16. package/lib/module/hooks/natural_language_processing/useLLM.js.map +1 -1
  17. package/lib/module/modules/natural_language_processing/LLMModule.js +4 -2
  18. package/lib/module/modules/natural_language_processing/LLMModule.js.map +1 -1
  19. package/lib/module/types/llm.js.map +1 -1
  20. package/lib/typescript/controllers/LLMController.d.ts +4 -2
  21. package/lib/typescript/controllers/LLMController.d.ts.map +1 -1
  22. package/lib/typescript/hooks/natural_language_processing/useLLM.d.ts.map +1 -1
  23. package/lib/typescript/modules/natural_language_processing/LLMModule.d.ts +3 -2
  24. package/lib/typescript/modules/natural_language_processing/LLMModule.d.ts.map +1 -1
  25. package/lib/typescript/types/llm.d.ts +7 -1
  26. package/lib/typescript/types/llm.d.ts.map +1 -1
  27. package/package.json +3 -1
  28. package/react-native-executorch.podspec +12 -31
  29. package/src/controllers/LLMController.ts +29 -5
  30. package/src/hooks/natural_language_processing/useLLM.ts +15 -1
  31. package/src/modules/natural_language_processing/LLMModule.ts +10 -2
  32. package/src/types/llm.ts +8 -0
  33. package/third-party/android/libs/cpuinfo/arm64-v8a/libcpuinfo.so +0 -0
  34. package/third-party/android/libs/executorch/arm64-v8a/libexecutorch.so +0 -0
  35. package/third-party/android/libs/executorch/x86_64/libexecutorch.so +0 -0
  36. package/third-party/android/libs/pthreadpool/arm64-v8a/libpthreadpool.so +0 -0
  37. package/third-party/android/libs/tokenizers-cpp/arm64-v8a/libsentencepiece.a +0 -0
  38. package/third-party/android/libs/tokenizers-cpp/arm64-v8a/libtokenizers_c.a +0 -0
  39. package/third-party/android/libs/tokenizers-cpp/arm64-v8a/libtokenizers_cpp.a +0 -0
  40. package/third-party/android/libs/tokenizers-cpp/x86_64/libsentencepiece.a +0 -0
  41. package/third-party/android/libs/tokenizers-cpp/x86_64/libtokenizers_c.a +0 -0
  42. package/third-party/android/libs/tokenizers-cpp/x86_64/libtokenizers_cpp.a +0 -0
  43. package/third-party/include/c10/macros/Export.h +2 -86
  44. package/third-party/include/c10/macros/Macros.h +28 -5
  45. package/third-party/include/c10/util/BFloat16-inl.h +1 -4
  46. package/third-party/include/c10/util/BFloat16.h +5 -8
  47. package/third-party/include/c10/util/Half.h +5 -0
  48. package/third-party/include/c10/util/bit_cast.h +1 -1
  49. package/third-party/include/c10/util/complex.h +639 -0
  50. package/third-party/include/c10/util/complex_math.h +399 -0
  51. package/third-party/include/c10/util/complex_utils.h +41 -0
  52. package/third-party/include/c10/util/irange.h +2 -2
  53. package/third-party/include/c10/util/overflows.h +95 -0
  54. package/third-party/include/executorch/ExecuTorchError.h +75 -0
  55. package/third-party/include/executorch/ExecuTorchModule.h +115 -11
  56. package/third-party/include/executorch/ExecuTorchTensor.h +731 -51
  57. package/third-party/include/executorch/ExecuTorchValue.h +61 -9
  58. package/third-party/include/executorch/extension/kernel_util/make_boxed_from_unboxed_functor.h +181 -0
  59. package/third-party/include/executorch/extension/kernel_util/meta_programming.h +108 -0
  60. package/third-party/include/executorch/extension/kernel_util/type_list.h +137 -0
  61. package/third-party/include/executorch/extension/module/bundled_module.h +131 -0
  62. package/third-party/include/executorch/extension/module/module.h +46 -20
  63. package/third-party/include/executorch/extension/threadpool/cpuinfo_utils.h +1 -3
  64. package/third-party/include/executorch/extension/threadpool/threadpool.h +1 -3
  65. package/third-party/include/executorch/extension/threadpool/threadpool_guard.h +35 -0
  66. package/third-party/include/executorch/runtime/backend/backend_execution_context.h +3 -3
  67. package/third-party/include/executorch/runtime/backend/backend_init_context.h +12 -6
  68. package/third-party/include/executorch/runtime/backend/backend_option_context.h +34 -0
  69. package/third-party/include/executorch/runtime/backend/interface.h +70 -9
  70. package/third-party/include/executorch/runtime/backend/options.h +206 -0
  71. package/third-party/include/executorch/runtime/core/evalue.h +19 -25
  72. package/third-party/include/executorch/runtime/core/event_tracer.h +32 -17
  73. package/third-party/include/executorch/runtime/core/event_tracer_hooks.h +23 -14
  74. package/third-party/include/executorch/runtime/core/exec_aten/exec_aten.h +32 -9
  75. package/third-party/include/executorch/runtime/core/exec_aten/util/dim_order_util.h +3 -2
  76. package/third-party/include/executorch/runtime/core/exec_aten/util/scalar_type_util.h +43 -75
  77. package/third-party/include/executorch/runtime/core/exec_aten/util/tensor_util.h +88 -87
  78. package/third-party/include/executorch/runtime/core/function_ref.h +100 -0
  79. package/third-party/include/executorch/runtime/core/named_data_map.h +14 -14
  80. package/third-party/include/executorch/runtime/core/portable_type/c10/c10/macros/Export.h +2 -86
  81. package/third-party/include/executorch/runtime/core/portable_type/c10/c10/macros/Macros.h +28 -5
  82. package/third-party/include/executorch/runtime/core/portable_type/c10/c10/util/BFloat16-inl.h +1 -4
  83. package/third-party/include/executorch/runtime/core/portable_type/c10/c10/util/BFloat16.h +5 -8
  84. package/third-party/include/executorch/runtime/core/portable_type/c10/c10/util/Half.h +5 -0
  85. package/third-party/include/executorch/runtime/core/portable_type/c10/c10/util/bit_cast.h +1 -1
  86. package/third-party/include/executorch/runtime/core/portable_type/c10/c10/util/complex.h +639 -0
  87. package/third-party/include/executorch/runtime/core/portable_type/c10/c10/util/complex_math.h +399 -0
  88. package/third-party/include/executorch/runtime/core/portable_type/c10/c10/util/complex_utils.h +41 -0
  89. package/third-party/include/executorch/runtime/core/portable_type/c10/c10/util/irange.h +2 -2
  90. package/third-party/include/executorch/runtime/core/portable_type/c10/c10/util/overflows.h +95 -0
  91. package/third-party/include/executorch/runtime/core/portable_type/c10/torch/headeronly/macros/Export.h +88 -0
  92. package/third-party/include/executorch/runtime/core/portable_type/complex.h +6 -29
  93. package/third-party/include/executorch/runtime/core/portable_type/tensor_impl.h +20 -0
  94. package/third-party/include/executorch/runtime/core/span.h +4 -0
  95. package/third-party/include/executorch/runtime/core/tag.h +19 -0
  96. package/third-party/include/executorch/runtime/core/tensor_layout.h +2 -2
  97. package/third-party/include/executorch/runtime/executor/method.h +15 -3
  98. package/third-party/include/executorch/runtime/executor/method_meta.h +34 -5
  99. package/third-party/include/executorch/runtime/executor/program.h +3 -4
  100. package/third-party/include/executorch/runtime/executor/pte_data_map.h +9 -8
  101. package/third-party/include/executorch/runtime/executor/tensor_parser.h +14 -13
  102. package/third-party/include/executorch/runtime/kernel/kernel_runtime_context.h +5 -5
  103. package/third-party/include/executorch/runtime/kernel/operator_registry.h +21 -19
  104. package/third-party/include/executorch/runtime/platform/compiler.h +8 -0
  105. package/third-party/include/executorch/runtime/platform/platform.h +126 -0
  106. package/third-party/include/headeronly/macros/Export.h +88 -0
  107. package/third-party/include/tokenizers-cpp/tokenizers_c.h +61 -0
  108. package/third-party/include/torch/headeronly/macros/Export.h +88 -0
  109. package/third-party/ios/ExecutorchLib.xcframework/Info.plist +43 -0
  110. package/third-party/ios/ExecutorchLib.xcframework/ios-arm64/ExecutorchLib.framework/ExecutorchLib +0 -0
  111. package/third-party/ios/ExecutorchLib.xcframework/ios-arm64/ExecutorchLib.framework/Info.plist +0 -0
  112. package/third-party/ios/ExecutorchLib.xcframework/ios-arm64-simulator/ExecutorchLib.framework/ExecutorchLib +0 -0
  113. package/third-party/ios/ExecutorchLib.xcframework/ios-arm64-simulator/ExecutorchLib.framework/Info.plist +0 -0
  114. package/third-party/ios/libs/cpuinfo/libcpuinfo.a +0 -0
  115. package/third-party/ios/libs/pthreadpool/physical-arm64-release/libpthreadpool.a +0 -0
  116. package/third-party/ios/libs/pthreadpool/simulator-arm64-debug/libpthreadpool.a +0 -0
  117. package/ios/libs/executorch/libbackend_coreml_ios.a +0 -0
  118. package/ios/libs/executorch/libbackend_coreml_simulator.a +0 -0
  119. package/ios/libs/executorch/libbackend_mps_ios.a +0 -0
  120. package/ios/libs/executorch/libbackend_mps_simulator.a +0 -0
  121. package/ios/libs/executorch/libbackend_xnnpack_ios.a +0 -0
  122. package/ios/libs/executorch/libbackend_xnnpack_simulator.a +0 -0
  123. package/ios/libs/executorch/libexecutorch_ios.a +0 -0
  124. package/ios/libs/executorch/libexecutorch_simulator.a +0 -0
  125. package/ios/libs/executorch/libkernels_custom_ios.a +0 -0
  126. package/ios/libs/executorch/libkernels_custom_simulator.a +0 -0
  127. package/ios/libs/executorch/libkernels_optimized_ios.a +0 -0
  128. package/ios/libs/executorch/libkernels_optimized_simulator.a +0 -0
  129. package/ios/libs/executorch/libkernels_portable_ios.a +0 -0
  130. package/ios/libs/executorch/libkernels_portable_simulator.a +0 -0
  131. package/ios/libs/executorch/libkernels_quantized_ios.a +0 -0
  132. package/ios/libs/executorch/libkernels_quantized_simulator.a +0 -0
  133. package/third-party/ios/ios.toolchain.cmake +0 -1122
  134. /package/{ios → third-party/ios}/libs/tokenizers-cpp/physical-arm64-release/libsentencepiece.a +0 -0
  135. /package/{ios → third-party/ios}/libs/tokenizers-cpp/physical-arm64-release/libtokenizers_c.a +0 -0
  136. /package/{ios → third-party/ios}/libs/tokenizers-cpp/physical-arm64-release/libtokenizers_cpp.a +0 -0
  137. /package/{ios → third-party/ios}/libs/tokenizers-cpp/simulator-arm64-debug/libsentencepiece.a +0 -0
  138. /package/{ios → third-party/ios}/libs/tokenizers-cpp/simulator-arm64-debug/libtokenizers_c.a +0 -0
  139. /package/{ios → third-party/ios}/libs/tokenizers-cpp/simulator-arm64-debug/libtokenizers_cpp.a +0 -0
@@ -22,8 +22,7 @@
22
22
  #include <executorch/runtime/platform/compiler.h>
23
23
 
24
24
  namespace executorch {
25
- namespace runtime {
26
-
25
+ namespace ET_RUNTIME_NAMESPACE {
27
26
  /**
28
27
  * Interface to access and retrieve data via name.
29
28
  * See executorch/extension/flat_tensor/ for an example.
@@ -32,33 +31,34 @@ class ET_EXPERIMENTAL NamedDataMap {
32
31
  public:
33
32
  virtual ~NamedDataMap() = default;
34
33
  /**
35
- * Get metadata by key.
34
+ * Get tensor_layout by key.
36
35
  *
37
36
  * @param key The name of the tensor.
38
- * @return Result containing TensorLayout with tensor metadata.
37
+ * @return Result containing TensorLayout.
39
38
  */
40
- ET_NODISCARD virtual Result<const executorch::runtime::TensorLayout>
41
- get_metadata(const char *key) const = 0;
39
+ ET_NODISCARD virtual Result<const TensorLayout>
40
+ get_tensor_layout(executorch::aten::string_view key) const = 0;
42
41
  /**
43
42
  * Get data by key.
44
43
  *
45
44
  * @param key Name of the data.
46
- * @return Result containing a FreeableBuffer with the tensor data.
45
+ * @return Result containing a FreeableBuffer.
47
46
  */
48
47
  ET_NODISCARD virtual Result<FreeableBuffer>
49
- get_data(const char *key) const = 0;
48
+ get_data(executorch::aten::string_view key) const = 0;
50
49
 
51
50
  /**
52
51
  * Loads data corresponding to the key into the provided buffer.
53
52
  *
54
53
  * @param key The name of the data.
55
- * @param size The number of bytes to load. Use `get_metadata` to retrieve the
56
- * size of the data for a given key.
54
+ * @param size The number of bytes to load. Use `get_tensor_layout` to
55
+ * retrieve the size of the data for a given key.
57
56
  * @param buffer The buffer to load the data into. Must point to at least
58
57
  * `size` bytes of memory.
59
58
  * @returns an Error indicating if the load was successful.
60
59
  */
61
- ET_NODISCARD virtual Error load_data_into(const char *key, void *buffer,
60
+ ET_NODISCARD virtual Error load_data_into(executorch::aten::string_view key,
61
+ void *buffer,
62
62
  size_t size) const = 0;
63
63
 
64
64
  /**
@@ -66,7 +66,7 @@ public:
66
66
  *
67
67
  * @return Result containing the number of keys.
68
68
  */
69
- ET_NODISCARD virtual Result<size_t> get_num_keys() const = 0;
69
+ ET_NODISCARD virtual Result<uint32_t> get_num_keys() const = 0;
70
70
 
71
71
  /**
72
72
  * Get the key at the given index.
@@ -75,10 +75,10 @@ public:
75
75
  * @return Result containing the key at the given index. Note: the returned
76
76
  * pointer is only valid for the lifetime of the DataMap.
77
77
  */
78
- ET_NODISCARD virtual Result<const char *> get_key(size_t index) const = 0;
78
+ ET_NODISCARD virtual Result<const char *> get_key(uint32_t index) const = 0;
79
79
  };
80
80
 
81
- } // namespace runtime
81
+ } // namespace ET_RUNTIME_NAMESPACE
82
82
  } // namespace executorch
83
83
 
84
84
  #ifdef __GNUC__
@@ -2,95 +2,11 @@
2
2
  #ifndef C10_MACROS_EXPORT_H_
3
3
  #define C10_MACROS_EXPORT_H_
4
4
 
5
- /* Header file to define the common scaffolding for exported symbols.
6
- *
7
- * Export is by itself a quite tricky situation to deal with, and if you are
8
- * hitting this file, make sure you start with the background here:
9
- * - Linux: https://gcc.gnu.org/wiki/Visibility
10
- * - Windows:
11
- * https://docs.microsoft.com/en-us/cpp/cpp/dllexport-dllimport?view=vs-2017
12
- *
13
- * Do NOT include this file directly. Instead, use c10/macros/Macros.h
14
- */
15
-
16
- // You do not need to edit this part of file unless you are changing the core
17
- // pytorch export abstractions.
18
- //
19
- // This part defines the C10 core export and import macros. This is controlled
20
- // by whether we are building shared libraries or not, which is determined
21
- // during build time and codified in c10/core/cmake_macros.h.
22
- // When the library is built as a shared lib, EXPORT and IMPORT will contain
23
- // visibility attributes. If it is being built as a static lib, then EXPORT
24
- // and IMPORT basically have no effect.
25
-
26
- // As a rule of thumb, you should almost NEVER mix static and shared builds for
27
- // libraries that depend on c10. AKA, if c10 is built as a static library, we
28
- // recommend everything dependent on c10 to be built statically. If c10 is built
29
- // as a shared library, everything dependent on it should be built as shared. In
30
- // the PyTorch project, all native libraries shall use the macro
31
- // C10_BUILD_SHARED_LIB to check whether pytorch is building shared or static
32
- // libraries.
33
-
34
- // For build systems that do not directly depend on CMake and directly build
35
- // from the source directory (such as Buck), one may not have a cmake_macros.h
36
- // file at all. In this case, the build system is responsible for providing
37
- // correct macro definitions corresponding to the cmake_macros.h.in file.
38
- //
39
- // In such scenarios, one should define the macro
40
- // C10_USING_CUSTOM_GENERATED_MACROS
41
- // to inform this header that it does not need to include the cmake_macros.h
42
- // file.
43
-
44
5
  #ifndef C10_USING_CUSTOM_GENERATED_MACROS
45
6
  #include <c10/macros/cmake_macros.h>
46
7
  #endif // C10_USING_CUSTOM_GENERATED_MACROS
47
8
 
48
- #ifdef _WIN32
49
- #define C10_HIDDEN
50
- #if defined(C10_BUILD_SHARED_LIBS)
51
- #define C10_EXPORT __declspec(dllexport)
52
- #define C10_IMPORT __declspec(dllimport)
53
- #else
54
- #define C10_EXPORT
55
- #define C10_IMPORT
56
- #endif
57
- #else // _WIN32
58
- #if defined(__GNUC__)
59
- #define C10_EXPORT __attribute__((__visibility__("default")))
60
- #define C10_HIDDEN __attribute__((__visibility__("hidden")))
61
- #else // defined(__GNUC__)
62
- #define C10_EXPORT
63
- #define C10_HIDDEN
64
- #endif // defined(__GNUC__)
65
- #define C10_IMPORT C10_EXPORT
66
- #endif // _WIN32
67
-
68
- #ifdef NO_EXPORT
69
- #undef C10_EXPORT
70
- #define C10_EXPORT
71
- #endif
72
-
73
- // Definition of an adaptive XX_API macro, that depends on whether you are
74
- // building the library itself or not, routes to XX_EXPORT and XX_IMPORT.
75
- // Basically, you will need to do this for each shared library that you are
76
- // building, and the instruction is as follows: assuming that you are building
77
- // a library called libawesome.so. You should:
78
- // (1) for your cmake target (usually done by "add_library(awesome, ...)"),
79
- // define a macro called AWESOME_BUILD_MAIN_LIB using
80
- // target_compile_options.
81
- // (2) define the AWESOME_API macro similar to the one below.
82
- // And in the source file of your awesome library, use AWESOME_API to
83
- // annotate public symbols.
84
-
85
- // Here, for the C10 library, we will define the macro C10_API for both import
86
- // and export.
87
-
88
- // This one is being used by libc10.so
89
- #ifdef C10_BUILD_MAIN_LIB
90
- #define C10_API C10_EXPORT
91
- #else
92
- #define C10_API C10_IMPORT
93
- #endif
9
+ #include <torch/headeronly/macros/Export.h>
94
10
 
95
11
  // This one is being used by libtorch.so
96
12
  #ifdef CAFFE2_BUILD_MAIN_LIB
@@ -160,4 +76,4 @@
160
76
  #define C10_API_ENUM
161
77
  #endif
162
78
 
163
- #endif // C10_MACROS_MACROS_H_
79
+ #endif // C10_MACROS_EXPORT_H_
@@ -242,7 +242,7 @@ using namespace c10::xpu;
242
242
  #ifdef __HIPCC__
243
243
  // Unlike CUDA, HIP requires a HIP header to be included for __host__ to work.
244
244
  // We do this #include here so that C10_HOST_DEVICE and friends will Just Work.
245
- // See https://github.com/ROCm-Developer-Tools/HIP/issues/441
245
+ // See https://github.com/ROCm/hip/issues/441
246
246
  #include <hip/hip_runtime.h>
247
247
  #endif
248
248
 
@@ -390,11 +390,24 @@ __host__ __device__
390
390
  #endif // __SYCL_DEVICE_ONLY__
391
391
  }
392
392
  #endif // NDEBUG
393
- // ROCm disable kernel assert by default
393
+ // ROCm disables kernel assert by default for performance considerations.
394
+ // Though ROCm supports __assert_fail, it uses kernel printf which has
395
+ // a non-negligible performance impact even if the assert condition is
396
+ // never triggered. We choose to use abort() instead which will still
397
+ // terminate the application but without a more useful error message.
394
398
  #if !defined(C10_USE_ROCM_KERNEL_ASSERT) and defined(USE_ROCM)
395
- #define CUDA_KERNEL_ASSERT(cond)
396
- #define CUDA_KERNEL_ASSERT_MSG(cond, msg)
397
- #define SYCL_KERNEL_ASSERT(cond)
399
+ #define CUDA_KERNEL_ASSERT(cond) \
400
+ if C10_UNLIKELY (!(cond)) { \
401
+ abort(); \
402
+ }
403
+ #define CUDA_KERNEL_ASSERT_MSG(cond, msg) \
404
+ if C10_UNLIKELY (!(cond)) { \
405
+ abort(); \
406
+ }
407
+ #define SYCL_KERNEL_ASSERT(cond) \
408
+ if C10_UNLIKELY (!(cond)) { \
409
+ abort(); \
410
+ }
398
411
  #else
399
412
  #define CUDA_KERNEL_ASSERT(cond) \
400
413
  if (C10_UNLIKELY(!(cond))) { \
@@ -494,4 +507,14 @@ __host__ __device__
494
507
 
495
508
  #endif
496
509
 
510
+ // This macro is used to find older C++ compilers
511
+ // that don't support move optimization for return values.
512
+
513
+ #if (defined(__GNUC__) && __GNUC__ < 13) || \
514
+ (defined(__clang_major__) && __clang_major__ < 13)
515
+ #define C10_RETURN_MOVE_IF_OLD_COMPILER 1
516
+ #else
517
+ #define C10_RETURN_MOVE_IF_OLD_COMPILER 0
518
+ #endif
519
+
497
520
  #endif // C10_MACROS_MACROS_H_
@@ -10,14 +10,11 @@ C10_CLANG_DIAGNOSTIC_PUSH()
10
10
  C10_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-int-float-conversion")
11
11
  #endif
12
12
 
13
- #if defined(SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS)
14
13
  #if defined(CL_SYCL_LANGUAGE_VERSION)
15
14
  #include <CL/sycl.hpp> // for SYCL 1.2.1
16
- #else
15
+ #elif defined(SYCL_LANGUAGE_VERSION)
17
16
  #include <sycl/sycl.hpp> // for SYCL 2020
18
17
  #endif
19
- #include <ext/oneapi/bfloat16.hpp>
20
- #endif
21
18
 
22
19
  namespace c10 {
23
20
 
@@ -14,14 +14,11 @@
14
14
  #include <cuda_bf16.h>
15
15
  #endif
16
16
 
17
- #if defined(SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS)
18
17
  #if defined(CL_SYCL_LANGUAGE_VERSION)
19
18
  #include <CL/sycl.hpp> // for SYCL 1.2.1
20
- #else
19
+ #elif defined(SYCL_LANGUAGE_VERSION)
21
20
  #include <sycl/sycl.hpp> // for SYCL 2020
22
21
  #endif
23
- #include <ext/oneapi/bfloat16.hpp>
24
- #endif
25
22
 
26
23
  namespace c10 {
27
24
 
@@ -31,7 +28,7 @@ inline C10_HOST_DEVICE float f32_from_bits(uint16_t src) {
31
28
  uint32_t tmp = src;
32
29
  tmp <<= 16;
33
30
 
34
- #if defined(USE_ROCM)
31
+ #if defined(USE_ROCM) && defined(__HIPCC__)
35
32
  float *tempRes;
36
33
 
37
34
  // We should be using memcpy in order to respect the strict aliasing rule
@@ -48,7 +45,7 @@ inline C10_HOST_DEVICE float f32_from_bits(uint16_t src) {
48
45
  inline C10_HOST_DEVICE uint16_t bits_from_f32(float src) {
49
46
  uint32_t res = 0;
50
47
 
51
- #if defined(USE_ROCM)
48
+ #if defined(USE_ROCM) && defined(__HIPCC__)
52
49
  // We should be using memcpy in order to respect the strict aliasing rule
53
50
  // but it fails in the HIP environment.
54
51
  uint32_t *tempRes = reinterpret_cast<uint32_t *>(&src);
@@ -61,7 +58,7 @@ inline C10_HOST_DEVICE uint16_t bits_from_f32(float src) {
61
58
  }
62
59
 
63
60
  inline C10_HOST_DEVICE uint16_t round_to_nearest_even(float src) {
64
- #if defined(USE_ROCM)
61
+ #if defined(USE_ROCM) && defined(__HIPCC__)
65
62
  if (src != src) {
66
63
  #elif defined(_MSC_VER)
67
64
  if (isnan(src)) {
@@ -87,7 +84,7 @@ struct alignas(2) BFloat16 {
87
84
  uint16_t x;
88
85
 
89
86
  // HIP wants __host__ __device__ tag, CUDA does not
90
- #if defined(USE_ROCM)
87
+ #if defined(USE_ROCM) && defined(__HIPCC__)
91
88
  C10_HOST_DEVICE BFloat16() = default;
92
89
  #else
93
90
  BFloat16() = default;
@@ -242,7 +242,12 @@ C10_HOST_DEVICE inline float fp16_ieee_to_fp32_value(uint16_t h) {
242
242
  // const float exp_scale = 0x1.0p-112f;
243
243
  constexpr uint32_t scale_bits = (uint32_t)15 << 23;
244
244
  float exp_scale_val = 0;
245
+ #if defined(_MSC_VER) && defined(__clang__)
246
+ __builtin_memcpy(&exp_scale_val, &scale_bits, sizeof(exp_scale_val));
247
+ #else
245
248
  std::memcpy(&exp_scale_val, &scale_bits, sizeof(exp_scale_val));
249
+ #endif
250
+
246
251
  const float exp_scale = exp_scale_val;
247
252
  const float normalized_value =
248
253
  fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
@@ -3,7 +3,7 @@
3
3
  #include <cstring>
4
4
  #include <type_traits>
5
5
 
6
- #if __has_include(<bit>) && (__cplusplus >= 202002L || (defined(__cpp_lib_bit_cast) && __cpp_lib_bit_cast >= 201806L))
6
+ #if __has_include(<bit>) && (defined(__cpp_lib_bit_cast) && __cpp_lib_bit_cast >= 201806L)
7
7
  #include <bit>
8
8
  #define C10_HAVE_STD_BIT_CAST 1
9
9
  #else