nvfuser-cu121-torch25 0.2.25.dev20250201__cp310-cp310-manylinux_2_28_x86_64.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (242) hide show
  1. nvfuser/_C.cpython-310-x86_64-linux-gnu.so +0 -0
  2. nvfuser/__init__.py +618 -0
  3. nvfuser/__init__.pyi +4 -0
  4. nvfuser/contrib/__init__.py +9 -0
  5. nvfuser/contrib/nn/__init__.py +13 -0
  6. nvfuser/contrib/nn/normalization.py +725 -0
  7. nvfuser/include/nvfuser/alias_analysis.h +116 -0
  8. nvfuser/include/nvfuser/bfs.h +929 -0
  9. nvfuser/include/nvfuser/codegen.h +26 -0
  10. nvfuser/include/nvfuser/compute_at.h +28 -0
  11. nvfuser/include/nvfuser/compute_at_map.h +394 -0
  12. nvfuser/include/nvfuser/contiguity.h +351 -0
  13. nvfuser/include/nvfuser/cuda_utils.h +50 -0
  14. nvfuser/include/nvfuser/debug.h +50 -0
  15. nvfuser/include/nvfuser/device_lower/analysis/bank_conflict.h +53 -0
  16. nvfuser/include/nvfuser/device_lower/analysis/circular_buffer.h +109 -0
  17. nvfuser/include/nvfuser/device_lower/analysis/device_version.h +65 -0
  18. nvfuser/include/nvfuser/device_lower/analysis/divisible_split.h +28 -0
  19. nvfuser/include/nvfuser/device_lower/analysis/fused_reduction.h +36 -0
  20. nvfuser/include/nvfuser/device_lower/analysis/index_compute.h +322 -0
  21. nvfuser/include/nvfuser/device_lower/analysis/predicate_elimination.h +71 -0
  22. nvfuser/include/nvfuser/device_lower/analysis/sync_information.h +47 -0
  23. nvfuser/include/nvfuser/device_lower/analysis/tensor_memory.h +65 -0
  24. nvfuser/include/nvfuser/device_lower/analysis/thread_predicate.h +158 -0
  25. nvfuser/include/nvfuser/device_lower/analysis/tma.h +93 -0
  26. nvfuser/include/nvfuser/device_lower/analysis/trivial_broadcast.h +75 -0
  27. nvfuser/include/nvfuser/device_lower/id_model_options.h +135 -0
  28. nvfuser/include/nvfuser/device_lower/lower2device.h +391 -0
  29. nvfuser/include/nvfuser/device_lower/pass/alias_memory.h +37 -0
  30. nvfuser/include/nvfuser/device_lower/pass/allocation.h +32 -0
  31. nvfuser/include/nvfuser/device_lower/pass/circular_buffer.h +191 -0
  32. nvfuser/include/nvfuser/device_lower/pass/expr_sort.h +17 -0
  33. nvfuser/include/nvfuser/device_lower/pass/fusion_simplifier.h +21 -0
  34. nvfuser/include/nvfuser/device_lower/pass/grid_serialization.h +26 -0
  35. nvfuser/include/nvfuser/device_lower/pass/index.h +200 -0
  36. nvfuser/include/nvfuser/device_lower/pass/inline_ptx.h +16 -0
  37. nvfuser/include/nvfuser/device_lower/pass/insert_syncs.h +39 -0
  38. nvfuser/include/nvfuser/device_lower/pass/instrument.h +24 -0
  39. nvfuser/include/nvfuser/device_lower/pass/loop_rotation.h +150 -0
  40. nvfuser/include/nvfuser/device_lower/pass/loops.h +68 -0
  41. nvfuser/include/nvfuser/device_lower/pass/magic_zero.h +86 -0
  42. nvfuser/include/nvfuser/device_lower/pass/misaligned_vectorization.h +118 -0
  43. nvfuser/include/nvfuser/device_lower/pass/predicate.h +23 -0
  44. nvfuser/include/nvfuser/device_lower/pass/replace_size.h +24 -0
  45. nvfuser/include/nvfuser/device_lower/pass/scalar_hoist.h +115 -0
  46. nvfuser/include/nvfuser/device_lower/pass/unroll.h +98 -0
  47. nvfuser/include/nvfuser/device_lower/pass/vectorize_welford.h +45 -0
  48. nvfuser/include/nvfuser/device_lower/pass/warp_reduce.h +23 -0
  49. nvfuser/include/nvfuser/device_lower/utils.h +382 -0
  50. nvfuser/include/nvfuser/device_lower/validation.h +74 -0
  51. nvfuser/include/nvfuser/disjoint_set.h +556 -0
  52. nvfuser/include/nvfuser/dispatch.h +334 -0
  53. nvfuser/include/nvfuser/driver_api.h +49 -0
  54. nvfuser/include/nvfuser/dynamic_transform.h +316 -0
  55. nvfuser/include/nvfuser/dynamic_type/C++20/type_traits +37 -0
  56. nvfuser/include/nvfuser/dynamic_type/dynamic_type.h +969 -0
  57. nvfuser/include/nvfuser/dynamic_type/error.h +24 -0
  58. nvfuser/include/nvfuser/dynamic_type/type_traits.h +703 -0
  59. nvfuser/include/nvfuser/evaluator_common.h +295 -0
  60. nvfuser/include/nvfuser/exceptions.h +283 -0
  61. nvfuser/include/nvfuser/expr_evaluator.h +125 -0
  62. nvfuser/include/nvfuser/expr_simplifier.h +218 -0
  63. nvfuser/include/nvfuser/flatbuffers/allocator.h +68 -0
  64. nvfuser/include/nvfuser/flatbuffers/array.h +253 -0
  65. nvfuser/include/nvfuser/flatbuffers/base.h +486 -0
  66. nvfuser/include/nvfuser/flatbuffers/buffer.h +154 -0
  67. nvfuser/include/nvfuser/flatbuffers/buffer_ref.h +53 -0
  68. nvfuser/include/nvfuser/flatbuffers/code_generator.h +80 -0
  69. nvfuser/include/nvfuser/flatbuffers/code_generators.h +234 -0
  70. nvfuser/include/nvfuser/flatbuffers/default_allocator.h +64 -0
  71. nvfuser/include/nvfuser/flatbuffers/detached_buffer.h +114 -0
  72. nvfuser/include/nvfuser/flatbuffers/flatbuffer_builder.h +1225 -0
  73. nvfuser/include/nvfuser/flatbuffers/flatbuffers.h +272 -0
  74. nvfuser/include/nvfuser/flatbuffers/flatc.h +130 -0
  75. nvfuser/include/nvfuser/flatbuffers/flex_flat_util.h +36 -0
  76. nvfuser/include/nvfuser/flatbuffers/flexbuffers.h +1889 -0
  77. nvfuser/include/nvfuser/flatbuffers/grpc.h +300 -0
  78. nvfuser/include/nvfuser/flatbuffers/hash.h +127 -0
  79. nvfuser/include/nvfuser/flatbuffers/idl.h +1359 -0
  80. nvfuser/include/nvfuser/flatbuffers/minireflect.h +420 -0
  81. nvfuser/include/nvfuser/flatbuffers/reflection.h +522 -0
  82. nvfuser/include/nvfuser/flatbuffers/reflection_generated.h +1471 -0
  83. nvfuser/include/nvfuser/flatbuffers/registry.h +128 -0
  84. nvfuser/include/nvfuser/flatbuffers/stl_emulation.h +513 -0
  85. nvfuser/include/nvfuser/flatbuffers/string.h +64 -0
  86. nvfuser/include/nvfuser/flatbuffers/struct.h +53 -0
  87. nvfuser/include/nvfuser/flatbuffers/table.h +168 -0
  88. nvfuser/include/nvfuser/flatbuffers/util.h +731 -0
  89. nvfuser/include/nvfuser/flatbuffers/vector.h +393 -0
  90. nvfuser/include/nvfuser/flatbuffers/vector_downward.h +273 -0
  91. nvfuser/include/nvfuser/flatbuffers/verifier.h +317 -0
  92. nvfuser/include/nvfuser/fusion.h +511 -0
  93. nvfuser/include/nvfuser/fusion_guard.h +37 -0
  94. nvfuser/include/nvfuser/fusion_profiler.h +311 -0
  95. nvfuser/include/nvfuser/fusion_segmenter.h +751 -0
  96. nvfuser/include/nvfuser/global_allocator.h +27 -0
  97. nvfuser/include/nvfuser/grouped_reduction.h +47 -0
  98. nvfuser/include/nvfuser/host_ir/container.h +60 -0
  99. nvfuser/include/nvfuser/host_ir/executor.h +152 -0
  100. nvfuser/include/nvfuser/host_ir/host_ir.h +320 -0
  101. nvfuser/include/nvfuser/host_ir/lower.h +35 -0
  102. nvfuser/include/nvfuser/id_model/circular_buffer_indexing.h +56 -0
  103. nvfuser/include/nvfuser/id_model/contiguity.h +166 -0
  104. nvfuser/include/nvfuser/id_model/id_model.h +359 -0
  105. nvfuser/include/nvfuser/id_model/id_model_index_compute.h +81 -0
  106. nvfuser/include/nvfuser/id_model/indexing.h +208 -0
  107. nvfuser/include/nvfuser/id_model/indexing_traversal.h +72 -0
  108. nvfuser/include/nvfuser/id_model/indexing_utils.h +62 -0
  109. nvfuser/include/nvfuser/id_model/loop_promotion.h +180 -0
  110. nvfuser/include/nvfuser/id_model/predicate_indexing.h +104 -0
  111. nvfuser/include/nvfuser/id_model/schedule.h +54 -0
  112. nvfuser/include/nvfuser/id_model/to_string.h +87 -0
  113. nvfuser/include/nvfuser/id_model/transform_replay.h +58 -0
  114. nvfuser/include/nvfuser/id_model/utils.h +176 -0
  115. nvfuser/include/nvfuser/id_model/validation_utils.h +55 -0
  116. nvfuser/include/nvfuser/index_compute.h +651 -0
  117. nvfuser/include/nvfuser/instrumentation.h +107 -0
  118. nvfuser/include/nvfuser/ir/all_nodes.h +14 -0
  119. nvfuser/include/nvfuser/ir/base_nodes.h +687 -0
  120. nvfuser/include/nvfuser/ir/builder.h +215 -0
  121. nvfuser/include/nvfuser/ir/builder_passkey.h +29 -0
  122. nvfuser/include/nvfuser/ir/cloner.h +185 -0
  123. nvfuser/include/nvfuser/ir/container.h +226 -0
  124. nvfuser/include/nvfuser/ir/graphviz.h +119 -0
  125. nvfuser/include/nvfuser/ir/interface_nodes.h +957 -0
  126. nvfuser/include/nvfuser/ir/internal_base_nodes.h +744 -0
  127. nvfuser/include/nvfuser/ir/internal_nodes.h +2792 -0
  128. nvfuser/include/nvfuser/ir/iostream.h +98 -0
  129. nvfuser/include/nvfuser/ir/printer.h +57 -0
  130. nvfuser/include/nvfuser/ir/utils.h +801 -0
  131. nvfuser/include/nvfuser/iter_visitor.h +661 -0
  132. nvfuser/include/nvfuser/kernel.h +299 -0
  133. nvfuser/include/nvfuser/kernel_db/kernel_db.h +109 -0
  134. nvfuser/include/nvfuser/kernel_db/utils.h +37 -0
  135. nvfuser/include/nvfuser/kernel_ir.h +1457 -0
  136. nvfuser/include/nvfuser/kernel_ir_dispatch.h +147 -0
  137. nvfuser/include/nvfuser/linked_hash_map.h +97 -0
  138. nvfuser/include/nvfuser/logical_domain_map.h +577 -0
  139. nvfuser/include/nvfuser/macros.h +23 -0
  140. nvfuser/include/nvfuser/mma_type.h +257 -0
  141. nvfuser/include/nvfuser/multidevice/c10d_mock.h +175 -0
  142. nvfuser/include/nvfuser/multidevice/communication.h +232 -0
  143. nvfuser/include/nvfuser/multidevice/communicator.h +179 -0
  144. nvfuser/include/nvfuser/multidevice/device_mesh.h +95 -0
  145. nvfuser/include/nvfuser/multidevice/executor.h +107 -0
  146. nvfuser/include/nvfuser/multidevice/multidevice.h +18 -0
  147. nvfuser/include/nvfuser/multidevice/utils.h +187 -0
  148. nvfuser/include/nvfuser/non_divisible_split.h +86 -0
  149. nvfuser/include/nvfuser/opaque_type.h +129 -0
  150. nvfuser/include/nvfuser/ops/alias.h +192 -0
  151. nvfuser/include/nvfuser/ops/all_ops.h +13 -0
  152. nvfuser/include/nvfuser/ops/arith.h +712 -0
  153. nvfuser/include/nvfuser/ops/composite.h +130 -0
  154. nvfuser/include/nvfuser/ops/indexing.h +55 -0
  155. nvfuser/include/nvfuser/ops/normalization.h +263 -0
  156. nvfuser/include/nvfuser/ops/utils.h +127 -0
  157. nvfuser/include/nvfuser/options.h +313 -0
  158. nvfuser/include/nvfuser/parallel_dimension_map.h +95 -0
  159. nvfuser/include/nvfuser/parallel_type_bitmap.h +365 -0
  160. nvfuser/include/nvfuser/polymorphic_value.h +432 -0
  161. nvfuser/include/nvfuser/predicate_compute.h +213 -0
  162. nvfuser/include/nvfuser/python_frontend/distributed_tensor.h +50 -0
  163. nvfuser/include/nvfuser/python_frontend/fusion_cache.h +298 -0
  164. nvfuser/include/nvfuser/python_frontend/fusion_definition.h +372 -0
  165. nvfuser/include/nvfuser/python_frontend/fusion_record.h +3124 -0
  166. nvfuser/include/nvfuser/python_frontend/fusion_state.h +143 -0
  167. nvfuser/include/nvfuser/python_frontend/python_bindings.h +27 -0
  168. nvfuser/include/nvfuser/python_frontend/segmentation.h +246 -0
  169. nvfuser/include/nvfuser/python_frontend/translation.h +20 -0
  170. nvfuser/include/nvfuser/python_frontend/translation_utils.h +308 -0
  171. nvfuser/include/nvfuser/scheduler/all_schedulers.h +17 -0
  172. nvfuser/include/nvfuser/scheduler/ampere_multi_matmul.h +206 -0
  173. nvfuser/include/nvfuser/scheduler/cache_policy_refiner.h +19 -0
  174. nvfuser/include/nvfuser/scheduler/compile_time_info.h +322 -0
  175. nvfuser/include/nvfuser/scheduler/debug_utils.h +68 -0
  176. nvfuser/include/nvfuser/scheduler/expr_eval_sched.h +45 -0
  177. nvfuser/include/nvfuser/scheduler/heuristic.h +113 -0
  178. nvfuser/include/nvfuser/scheduler/hopper_multi_matmul.h +204 -0
  179. nvfuser/include/nvfuser/scheduler/mark_aliases.h +19 -0
  180. nvfuser/include/nvfuser/scheduler/matmul.h +40 -0
  181. nvfuser/include/nvfuser/scheduler/matmul_heuristic.h +293 -0
  182. nvfuser/include/nvfuser/scheduler/matmul_heuristic_plugin.h +65 -0
  183. nvfuser/include/nvfuser/scheduler/matmul_heuristic_plugin_api.h +99 -0
  184. nvfuser/include/nvfuser/scheduler/matmul_utils.h +54 -0
  185. nvfuser/include/nvfuser/scheduler/mma_utils.h +500 -0
  186. nvfuser/include/nvfuser/scheduler/multi_matmul.h +74 -0
  187. nvfuser/include/nvfuser/scheduler/no_op.h +48 -0
  188. nvfuser/include/nvfuser/scheduler/normalization_inner.h +49 -0
  189. nvfuser/include/nvfuser/scheduler/normalization_inner_outer.h +51 -0
  190. nvfuser/include/nvfuser/scheduler/normalization_outer.h +48 -0
  191. nvfuser/include/nvfuser/scheduler/normalization_utils.h +379 -0
  192. nvfuser/include/nvfuser/scheduler/pointwise.h +183 -0
  193. nvfuser/include/nvfuser/scheduler/pointwise_heuristic.h +118 -0
  194. nvfuser/include/nvfuser/scheduler/pointwise_utils.h +24 -0
  195. nvfuser/include/nvfuser/scheduler/reduction.h +43 -0
  196. nvfuser/include/nvfuser/scheduler/reduction_heuristic.h +339 -0
  197. nvfuser/include/nvfuser/scheduler/reduction_utils.h +159 -0
  198. nvfuser/include/nvfuser/scheduler/registry.h +97 -0
  199. nvfuser/include/nvfuser/scheduler/registry_utils.h +111 -0
  200. nvfuser/include/nvfuser/scheduler/resize.h +41 -0
  201. nvfuser/include/nvfuser/scheduler/resize_heuristic.h +67 -0
  202. nvfuser/include/nvfuser/scheduler/runtime_info.h +166 -0
  203. nvfuser/include/nvfuser/scheduler/scheduler_types.h +80 -0
  204. nvfuser/include/nvfuser/scheduler/transpose.h +114 -0
  205. nvfuser/include/nvfuser/scheduler/transpose_heuristic.h +164 -0
  206. nvfuser/include/nvfuser/scheduler/utils.h +771 -0
  207. nvfuser/include/nvfuser/scheduler/vectorize_helper.h +349 -0
  208. nvfuser/include/nvfuser/serde/factory.h +55 -0
  209. nvfuser/include/nvfuser/serde/fusion_cache_generated.h +4319 -0
  210. nvfuser/include/nvfuser/serde/fusion_record.h +124 -0
  211. nvfuser/include/nvfuser/serde/polymorphic_value.h +52 -0
  212. nvfuser/include/nvfuser/serde/utils.h +34 -0
  213. nvfuser/include/nvfuser/struct.inl +127 -0
  214. nvfuser/include/nvfuser/swizzle.h +54 -0
  215. nvfuser/include/nvfuser/sys_utils.h +40 -0
  216. nvfuser/include/nvfuser/tensor_metadata.h +118 -0
  217. nvfuser/include/nvfuser/tma.h +124 -0
  218. nvfuser/include/nvfuser/transform_iter.h +522 -0
  219. nvfuser/include/nvfuser/transform_replay.h +297 -0
  220. nvfuser/include/nvfuser/transform_rfactor.h +33 -0
  221. nvfuser/include/nvfuser/transform_view.h +136 -0
  222. nvfuser/include/nvfuser/type.h +1125 -0
  223. nvfuser/include/nvfuser/type_promotion.h +61 -0
  224. nvfuser/include/nvfuser/utils.h +619 -0
  225. nvfuser/include/nvfuser/val_graph.h +446 -0
  226. nvfuser/include/nvfuser/val_graph_visitor.h +259 -0
  227. nvfuser/include/nvfuser/validator_utils.h +92 -0
  228. nvfuser/include/nvfuser/vectorization_info.h +31 -0
  229. nvfuser/include/nvfuser/visibility.h +21 -0
  230. nvfuser/lib/libnvfuser_codegen.so +0 -0
  231. nvfuser/nvfuser_version.py +69 -0
  232. nvfuser/pytorch_utils.py +184 -0
  233. nvfuser/share/cmake/nvfuser/NvfuserConfig-release.cmake +20 -0
  234. nvfuser/share/cmake/nvfuser/NvfuserConfig.cmake +106 -0
  235. nvfuser/utils.py +18 -0
  236. nvfuser/version.py +1 -0
  237. nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/LICENSE +976 -0
  238. nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/METADATA +20 -0
  239. nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/RECORD +242 -0
  240. nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/WHEEL +5 -0
  241. nvfuser_cu121_torch25-0.2.25.dev20250201.dist-info/top_level.txt +1 -0
  242. nvfuser_cu121_torch25.libs/libnvToolsExt-847d78f2.so.1.0.0 +0 -0
@@ -0,0 +1,218 @@
1
+ // clang-format off
2
+ /*
3
+ * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
4
+ * All rights reserved.
5
+ * SPDX-License-Identifier: BSD-3-Clause
6
+ */
7
+ // clang-format on
8
+ #pragma once
9
+
10
#include <exceptions.h>
#include <ir/all_nodes.h>
#include <visibility.h>

#include <list>
#include <vector>
15
+
16
+ // Note: [The Mathematics of Integer Arithmetic]
17
+ //
18
+ // We learnt arithmetic from as early as elementary school, and have been used
19
+ // to simplify expressions using rules like (a+b)/c = a/c+b/c. However, when we
20
+ // are dealing with integer arithmetic, which is the case for index and
21
+ // predicate simplification, lots of rules we learnt in elementary school no
22
+ // longer hold. For example, (1+1)/2 != 1/2+1/2 because the left hand side is 1
23
+ // and the right hand side is 0 + 0 = 0. So when considering adding a new
24
+ // simplification rule, we need to be very careful to make sure the rule is
25
+ // mathematically correct.
26
+ //
27
+ // Suggested reading materials:
28
+ // - doc/math/abstract-algebra.md reviews abstract algebra, a theory that tells
29
+ // us which of the rules we are used to are still valid, and which are not.
30
+ // - doc/math/integer-division.md reviews the definitions and properties of div
31
+ // and mod in textbooks, it also describes some theorems that we proved
32
+ // ourselves that are useful for simplifying integer expressions.
33
+ // - doc/math/monotonic-function.md reviews the definition and properties of
34
+ // monotonic function.
35
+ //
36
+ // We can use the following rules to simplify integer expressions:
37
+ //
38
+ // A) Associativity of +: a + (b + c) = (a + b) + c
39
+ // B) Associativity of *: a * (b * c) = (a * b) * c
40
+ // C) Commutativity of +: a + b = b + a
41
+ // D) Commutativity of *: a * b = b * a
42
+ // E) Distributivity of * over +: a * (b + c) = (a * b) + (a * c)
43
+ // F) Distributivity of * over +: (a + b) * c = (a * c) + (b * c)
44
+ // G) (-a) / b = -(a / b) = a / (-b)
45
+ // H) (-a) % b = -(a % b) = a % (-b)
46
+ // I) If -|a| < r < |a|, then r % a = r, r / a = 0
47
+ // J) Distributivity of % over +:
48
+ // If compatible_sign(a, b), then (a + b) % c = (a % c + b % c) % c
49
+ // J.1) If compatible_sign(a, b) and a % c = 0, then (a + b) % c = b % c
50
+ // J.2) Let g = gcd(a, c). If compatible_sign(a, b), and -|g| < b < |g|
51
+ // then (a + b) % c = a % c + b
52
+ // K) Distributivity of % over *:
53
+ // If compatible_sign(a, b), then (a * b) % c = (a % c * b % c) % c
54
+ // L) If a is a multiple of b, then a % b = 0
55
+ // M) If b is a multiple of c, then we have: a*(b/c) = (a*b)/c
56
+ // N) a / (b * c) = (a / b) / c
57
+ // O) If d divides a and b, then a % b = ((a / d) % (b / d)) * d
58
+ // P) If b is a multiple of c, then a/(b/c) = (a*c)/b
59
+ // Q) If compatible_sign(a, b) and -|c| < a % c + b % c < |c|, then
60
+ // (a+b)/c = a/c + b/c
61
+ // Q.1) If compatible_sign(a, b) and a % c = 0, then (a+b)/c = a/c + b/c
62
+ // Q.2) Let g = gcd(a, c). If compatible_sign(a, b), and -|g| < b < |g|
63
+ // then (a + b) / c = a/c
64
+ //
65
+ // See doc/math/integer-division.md for proofs of these rules.
66
+ //
67
+ // Some examples on applying the above rules to simplify expressions:
68
+ //
69
+ // Example 7.1: Given that a >= 0 and b >= 0, simplify (a*4 + b) % 4
70
+ // Answer: (a*4 + b) % 4 = ((a*4)%4 + b%4) % 4 (Rule J)
71
+ // = (0 + b%4) % 4 (Rule L)
72
+ // = b % 4 % 4 (Basic math)
73
+ // = b % 4 (Rule I)
74
+ //
75
+ // Example 7.2: Given that 0 <= a < 3, simplify a % 4
76
+ // Answer: a % 4 = a (Rule I)
77
+ //
78
+ // Example 7.3: Simplify (a * 256) / 4
79
+ // Answer: (a * 256) / 4 = a * (256 / 4) (Rule M)
80
+ // = a * 64 (Basic math)
81
+ //
82
+ // Example 7.4: Simplify (a / 4) / 64
83
+ // Answer: (a / 4) / 64 = a / (4 * 64) (Rule N)
84
+ // = a / 256 (Basic math)
85
+ //
86
+ // Example 7.5: Simplify (a * 64) % 256 / 4
87
+ // Answer: (a * 64) % 256 / 4 = ((a % 4) * 64) / 4 (Rule O)
88
+ // = (a % 4) * (64 / 4) (Rule M)
89
+ // = (a % 4) * 16 (Basic math)
90
+ //
91
+ // Example 7.6: Simplify (a * 4) / 256
92
+ // Answer: (a * 4) / 256 = a / (256 / 4) (Rule P)
93
+ // = a / 64 (Basic math)
94
+ //
95
+ // Example 7.7: Given that a >= 0 and b >= 0, simplify (a * 256 + b) / 4
96
+ // Answer: because (a * 256) % 4 = 0, we have
97
+ // (a * 256 + b) / 4 = a * 256 / 4 + b / 4 (Rule Q)
98
+ // = a * (256 / 4) + b / 4 (Rule M)
99
+ // = a * 64 + b / 4 (Basic math)
100
+ //
101
+ // Example 7.8: Given that a >= 0 and 0 <= b < 4, simplify (a * 4 + b) / 4
102
+ // Answer: Similar to above, we have
103
+ // (a * 4 + b) / 4 = a + b / 4
104
+ // = a + 0 (Rule I)
105
+ // = a
106
+
107
+ namespace nvfuser {
108
+
109
+ // Information for a single variable. Possible values that this variable can
110
+ // take is: start, start + step, start + 2 * step, ... (< stop), which is
111
+ // similar to the loop variable of for loop:
112
+ // for variable in range(start, stop, step)
113
+ struct VarInfo {
114
+ Val* variable = nullptr;
115
+ // If this variable is an unrolled loop index. It is important to know this
116
+ // because unrolled loop index is compile constant to nvRTC. Note that a
117
+ // constant to nvRTC might not be a constant to nvFuser. For example, if I
118
+ // have loop
119
+ // #pragma unroll
120
+ // FOR i1 in ...:
121
+ // ...
122
+ // Then `i1` is a compile constant to nvRTC, but not a compile time constant
123
+ // to nvFuser.
124
+ bool is_unrolled_loop_index = false;
125
+ };
126
+
127
+ // Analyze expression register usage
128
+ enum class RegisterType { GeneralPurpose, Uniform, Immediate, Unknown };
129
+ RegisterType getRegisterType(Val* value);
130
+
131
+ // Simplify expressions with the given information of variables.
132
+ //
133
+ // The argument `variables` specifies which scalar are considered variable and
134
+ // some information about these variables. Any scalar not contained in
135
+ // `variables` are considered constants. Tensors are always considered as
136
+ // variables, regardless of if it is specified in `variables`.
137
+ //
138
+ // Note that in `variables`, the order matters. This order specifies how we
139
+ // should organize associative and commutative expressions. For example, if the
140
+ // `variables` is {a, b, c, d}, then we will simplify (a + d) + (c + b) as
141
+ // ((a + b) + c) + d. Tensors are always considered as at the right of all
142
+ // scalars, regardless of if it is inside `variables` or not.
143
+ // See note [Reordering associative and commutative operators] for detailed
144
+ // information about this reordering.
145
+ //
146
+ // Some simplifications like a*b/b -> a is always correct in valid case, but
147
+ // when there is an error (e.g. division-by-zero), these simplifications could
148
+ // potentially hide the error. The argument `preserve_error` specifies whether
149
+ // we should disable these optimization, unless we can prove there won't be an
150
+ // error.
151
+ NVF_API Val* simplifyExpr(
152
+ Val* value,
153
+ const std::list<VarInfo>& variables = {},
154
+ std::vector<Val*> assumptions = {},
155
+ bool preserve_error = false);
156
+
157
+ class Context;
158
+ namespace assoc_comm {
159
+ // The expression type that represents the flattened ops. For example, if I have
160
+ // out = a + b + 3 + c + 5, then I will have:
161
+ // FlattenedAssocCommOp {
162
+ // inputs: [a, b, 3, c, 5]
163
+ // outputs: [out]
164
+ // }
165
+ class FlattenedAssocCommOp : public Expr {
166
+ public:
167
+ using Expr::Expr;
168
+
169
+ FlattenedAssocCommOp(
170
+ IrBuilderPasskey passkey,
171
+ BinaryOpType op,
172
+ Val* out,
173
+ std::vector<Val*> terms);
174
+
175
+ NVFUSER_DECLARE_CLONE_AND_CREATE
176
+
177
+ const char* getOpString() const override;
178
+
179
+ // FlattenedAssocCommOp is unordered, so we should have
180
+ // FlattenedAdd(a, b)->sameAs(FlattenedAdd(b, a))
181
+ bool sameAs(const Statement* other) const override;
182
+
183
+ std::string toString(int indent_size = 0) const override;
184
+
185
+ std::string toInlineString(int indent_size = 0) const override;
186
+
187
+ DataType dtype() const {
188
+ return *output(0)->getDataType();
189
+ }
190
+
191
+ BinaryOpType getOpType() const {
192
+ return attribute<BinaryOpType>(0);
193
+ }
194
+
195
+ // Get a vector of inputs, sorted as the order given by `variables`. Note that
196
+ // the sorting key is the rightmost variable that an input depends on. For
197
+ // example, if I have two inputs.
198
+ // v1 = a * c
199
+ // v2 = b
200
+ // and variables is [a, b, c], then v2 < v1 because the rightmost depending
201
+ // variable of v2 is b, and the rightmost depending variable of v1 is c,
202
+ // and b < c. So in this example, this function will return [v2, v1].
203
+ // Tensors are always considered as variables and they are always considered
204
+ // as the rightmost.
205
+ std::vector<Val*> sortedInputs(const Context& context);
206
+
207
+ bool isTrivial() const {
208
+ return inputs().size() == 1;
209
+ }
210
+
211
+ std::vector<PolymorphicValue> evaluate(
212
+ const ExpressionEvaluator& ee,
213
+ const std::vector<PolymorphicValue>& inputs) const override;
214
+ };
215
+
216
+ } // namespace assoc_comm
217
+
218
+ } // namespace nvfuser
@@ -0,0 +1,68 @@
1
+ /*
2
+ * Copyright 2021 Google Inc. All rights reserved.
3
+ *
4
+ * Licensed under the Apache License, Version 2.0 (the "License");
5
+ * you may not use this file except in compliance with the License.
6
+ * You may obtain a copy of the License at
7
+ *
8
+ * http://www.apache.org/licenses/LICENSE-2.0
9
+ *
10
+ * Unless required by applicable law or agreed to in writing, software
11
+ * distributed under the License is distributed on an "AS IS" BASIS,
12
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ * See the License for the specific language governing permissions and
14
+ * limitations under the License.
15
+ */
16
+
17
+ #ifndef FLATBUFFERS_ALLOCATOR_H_
18
+ #define FLATBUFFERS_ALLOCATOR_H_
19
+
20
+ #include "flatbuffers/base.h"
21
+
22
+ namespace flatbuffers {
23
+
24
+ // Allocator interface. This is flatbuffers-specific and meant only for
25
+ // `vector_downward` usage.
26
+ class Allocator {
27
+ public:
28
+ virtual ~Allocator() {}
29
+
30
+ // Allocate `size` bytes of memory.
31
+ virtual uint8_t *allocate(size_t size) = 0;
32
+
33
+ // Deallocate `size` bytes of memory at `p` allocated by this allocator.
34
+ virtual void deallocate(uint8_t *p, size_t size) = 0;
35
+
36
+ // Reallocate `new_size` bytes of memory, replacing the old region of size
37
+ // `old_size` at `p`. In contrast to a normal realloc, this grows downwards,
38
+ // and is intended specifcally for `vector_downward` use.
39
+ // `in_use_back` and `in_use_front` indicate how much of `old_size` is
40
+ // actually in use at each end, and needs to be copied.
41
+ virtual uint8_t *reallocate_downward(uint8_t *old_p, size_t old_size,
42
+ size_t new_size, size_t in_use_back,
43
+ size_t in_use_front) {
44
+ FLATBUFFERS_ASSERT(new_size > old_size); // vector_downward only grows
45
+ uint8_t *new_p = allocate(new_size);
46
+ memcpy_downward(old_p, old_size, new_p, new_size, in_use_back,
47
+ in_use_front);
48
+ deallocate(old_p, old_size);
49
+ return new_p;
50
+ }
51
+
52
+ protected:
53
+ // Called by `reallocate_downward` to copy memory from `old_p` of `old_size`
54
+ // to `new_p` of `new_size`. Only memory of size `in_use_front` and
55
+ // `in_use_back` will be copied from the front and back of the old memory
56
+ // allocation.
57
+ void memcpy_downward(uint8_t *old_p, size_t old_size, uint8_t *new_p,
58
+ size_t new_size, size_t in_use_back,
59
+ size_t in_use_front) {
60
+ memcpy(new_p + new_size - in_use_back, old_p + old_size - in_use_back,
61
+ in_use_back);
62
+ memcpy(new_p, old_p, in_use_front);
63
+ }
64
+ };
65
+
66
+ } // namespace flatbuffers
67
+
68
+ #endif // FLATBUFFERS_ALLOCATOR_H_
@@ -0,0 +1,253 @@
1
+ /*
2
+ * Copyright 2021 Google Inc. All rights reserved.
3
+ *
4
+ * Licensed under the Apache License, Version 2.0 (the "License");
5
+ * you may not use this file except in compliance with the License.
6
+ * You may obtain a copy of the License at
7
+ *
8
+ * http://www.apache.org/licenses/LICENSE-2.0
9
+ *
10
+ * Unless required by applicable law or agreed to in writing, software
11
+ * distributed under the License is distributed on an "AS IS" BASIS,
12
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ * See the License for the specific language governing permissions and
14
+ * limitations under the License.
15
+ */
16
+
17
+ #ifndef FLATBUFFERS_ARRAY_H_
18
+ #define FLATBUFFERS_ARRAY_H_
19
+
20
+ #include <memory>
21
+
22
+ #include "flatbuffers/base.h"
23
+ #include "flatbuffers/stl_emulation.h"
24
+ #include "flatbuffers/vector.h"
25
+
26
+ namespace flatbuffers {
27
+
28
+ // This is used as a helper type for accessing arrays.
29
+ template<typename T, uint16_t length> class Array {
30
+ // Array<T> can carry only POD data types (scalars or structs).
31
+ typedef typename flatbuffers::bool_constant<flatbuffers::is_scalar<T>::value>
32
+ scalar_tag;
33
+ typedef
34
+ typename flatbuffers::conditional<scalar_tag::value, T, const T *>::type
35
+ IndirectHelperType;
36
+
37
+ public:
38
+ typedef uint16_t size_type;
39
+ typedef typename IndirectHelper<IndirectHelperType>::return_type return_type;
40
+ typedef VectorConstIterator<T, return_type> const_iterator;
41
+ typedef VectorReverseIterator<const_iterator> const_reverse_iterator;
42
+
43
+ // If T is a LE-scalar or a struct (!scalar_tag::value).
44
+ static FLATBUFFERS_CONSTEXPR bool is_span_observable =
45
+ (scalar_tag::value && (FLATBUFFERS_LITTLEENDIAN || sizeof(T) == 1)) ||
46
+ !scalar_tag::value;
47
+
48
+ FLATBUFFERS_CONSTEXPR uint16_t size() const { return length; }
49
+
50
+ return_type Get(uoffset_t i) const {
51
+ FLATBUFFERS_ASSERT(i < size());
52
+ return IndirectHelper<IndirectHelperType>::Read(Data(), i);
53
+ }
54
+
55
+ return_type operator[](uoffset_t i) const { return Get(i); }
56
+
57
+ // If this is a Vector of enums, T will be its storage type, not the enum
58
+ // type. This function makes it convenient to retrieve value with enum
59
+ // type E.
60
+ template<typename E> E GetEnum(uoffset_t i) const {
61
+ return static_cast<E>(Get(i));
62
+ }
63
+
64
+ const_iterator begin() const { return const_iterator(Data(), 0); }
65
+ const_iterator end() const { return const_iterator(Data(), size()); }
66
+
67
+ const_reverse_iterator rbegin() const {
68
+ return const_reverse_iterator(end());
69
+ }
70
+ const_reverse_iterator rend() const {
71
+ return const_reverse_iterator(begin());
72
+ }
73
+
74
+ const_iterator cbegin() const { return begin(); }
75
+ const_iterator cend() const { return end(); }
76
+
77
+ const_reverse_iterator crbegin() const { return rbegin(); }
78
+ const_reverse_iterator crend() const { return rend(); }
79
+
80
+ // Get a mutable pointer to elements inside this array.
81
+ // This method used to mutate arrays of structs followed by a @p Mutate
82
+ // operation. For primitive types use @p Mutate directly.
83
+ // @warning Assignments and reads to/from the dereferenced pointer are not
84
+ // automatically converted to the correct endianness.
85
+ typename flatbuffers::conditional<scalar_tag::value, void, T *>::type
86
+ GetMutablePointer(uoffset_t i) const {
87
+ FLATBUFFERS_ASSERT(i < size());
88
+ return const_cast<T *>(&data()[i]);
89
+ }
90
+
91
+ // Change elements if you have a non-const pointer to this object.
92
+ void Mutate(uoffset_t i, const T &val) { MutateImpl(scalar_tag(), i, val); }
93
+
94
+ // The raw data in little endian format. Use with care.
95
+ const uint8_t *Data() const { return data_; }
96
+
97
+ uint8_t *Data() { return data_; }
98
+
99
+ // Similarly, but typed, much like std::vector::data
100
+ const T *data() const { return reinterpret_cast<const T *>(Data()); }
101
+ T *data() { return reinterpret_cast<T *>(Data()); }
102
+
103
+ // Copy data from a span with endian conversion.
104
+ // If this Array and the span overlap, the behavior is undefined.
105
+ void CopyFromSpan(flatbuffers::span<const T, length> src) {
106
+ const auto p1 = reinterpret_cast<const uint8_t *>(src.data());
107
+ const auto p2 = Data();
108
+ FLATBUFFERS_ASSERT(!(p1 >= p2 && p1 < (p2 + length)) &&
109
+ !(p2 >= p1 && p2 < (p1 + length)));
110
+ (void)p1;
111
+ (void)p2;
112
+ CopyFromSpanImpl(flatbuffers::bool_constant<is_span_observable>(), src);
113
+ }
114
+
115
+ protected:
116
+ void MutateImpl(flatbuffers::true_type, uoffset_t i, const T &val) {
117
+ FLATBUFFERS_ASSERT(i < size());
118
+ WriteScalar(data() + i, val);
119
+ }
120
+
121
+ void MutateImpl(flatbuffers::false_type, uoffset_t i, const T &val) {
122
+ *(GetMutablePointer(i)) = val;
123
+ }
124
+
125
+ void CopyFromSpanImpl(flatbuffers::true_type,
126
+ flatbuffers::span<const T, length> src) {
127
+ // Use std::memcpy() instead of std::copy() to avoid performance degradation
128
+ // due to aliasing if T is char or unsigned char.
129
+ // The size is known at compile time, so memcpy would be inlined.
130
+ std::memcpy(data(), src.data(), length * sizeof(T));
131
+ }
132
+
133
+ // Copy data from flatbuffers::span with endian conversion.
134
+ void CopyFromSpanImpl(flatbuffers::false_type,
135
+ flatbuffers::span<const T, length> src) {
136
+ for (size_type k = 0; k < length; k++) { Mutate(k, src[k]); }
137
+ }
138
+
139
// This class is only used to access pre-existing data. Don't ever
// try to construct these manually.
// 'constexpr' allows us to use 'size()' at compile time.
// @note Must not use 'FLATBUFFERS_CONSTEXPR' here, as const is not allowed on
// a constructor.
#if defined(__cpp_constexpr)
constexpr Array();
#else
Array();
#endif

// Backing storage: the elements in their serialized byte representation.
uint8_t data_[length * sizeof(T)];

private:
// This class is a pointer. Copying will therefore create an invalid object.
// Private and unimplemented copy constructor.
Array(const Array &);
Array &operator=(const Array &);
};
158
+
159
// Specialization for Array[struct] with access using Offset<void> pointer.
// This specialization used by idl_gen_text.cpp.
template<typename T, uint16_t length> class Array<Offset<T>, length> {
  // Only Offset<void> is supported here.
  static_assert(flatbuffers::is_same<T, void>::value, "unexpected type T");

 public:
  typedef const void *return_type;

  // Raw serialized bytes of the array.
  const uint8_t *Data() const { return data_; }

  // Make idl_gen_text.cpp::PrintContainer happy.
  // Element access is not actually supported on this specialization:
  // it asserts in debug builds and returns nullptr otherwise.
  return_type operator[](uoffset_t) const {
    FLATBUFFERS_ASSERT(false);
    return nullptr;
  }

 private:
  // This class is only used to access pre-existing data.
  Array();
  Array(const Array &);
  Array &operator=(const Array &);

  // NOTE(review): sized [1] as a placeholder — instances overlay
  // pre-existing serialized data rather than owning storage.
  uint8_t data_[1];
};
183
+
184
+ template<class U, uint16_t N>
185
+ FLATBUFFERS_CONSTEXPR_CPP11 flatbuffers::span<U, N> make_span(Array<U, N> &arr)
186
+ FLATBUFFERS_NOEXCEPT {
187
+ static_assert(
188
+ Array<U, N>::is_span_observable,
189
+ "wrong type U, only plain struct, LE-scalar, or byte types are allowed");
190
+ return span<U, N>(arr.data(), N);
191
+ }
192
+
193
+ template<class U, uint16_t N>
194
+ FLATBUFFERS_CONSTEXPR_CPP11 flatbuffers::span<const U, N> make_span(
195
+ const Array<U, N> &arr) FLATBUFFERS_NOEXCEPT {
196
+ static_assert(
197
+ Array<U, N>::is_span_observable,
198
+ "wrong type U, only plain struct, LE-scalar, or byte types are allowed");
199
+ return span<const U, N>(arr.data(), N);
200
+ }
201
+
202
+ template<class U, uint16_t N>
203
+ FLATBUFFERS_CONSTEXPR_CPP11 flatbuffers::span<uint8_t, sizeof(U) * N>
204
+ make_bytes_span(Array<U, N> &arr) FLATBUFFERS_NOEXCEPT {
205
+ static_assert(Array<U, N>::is_span_observable,
206
+ "internal error, Array<T> might hold only scalars or structs");
207
+ return span<uint8_t, sizeof(U) * N>(arr.Data(), sizeof(U) * N);
208
+ }
209
+
210
+ template<class U, uint16_t N>
211
+ FLATBUFFERS_CONSTEXPR_CPP11 flatbuffers::span<const uint8_t, sizeof(U) * N>
212
+ make_bytes_span(const Array<U, N> &arr) FLATBUFFERS_NOEXCEPT {
213
+ static_assert(Array<U, N>::is_span_observable,
214
+ "internal error, Array<T> might hold only scalars or structs");
215
+ return span<const uint8_t, sizeof(U) * N>(arr.Data(), sizeof(U) * N);
216
+ }
217
+
218
+ // Cast a raw T[length] to a raw flatbuffers::Array<T, length>
219
+ // without endian conversion. Use with care.
220
+ // TODO: move these Cast-methods to `internal` namespace.
221
+ template<typename T, uint16_t length>
222
+ Array<T, length> &CastToArray(T (&arr)[length]) {
223
+ return *reinterpret_cast<Array<T, length> *>(arr);
224
+ }
225
+
226
+ template<typename T, uint16_t length>
227
+ const Array<T, length> &CastToArray(const T (&arr)[length]) {
228
+ return *reinterpret_cast<const Array<T, length> *>(arr);
229
+ }
230
+
231
+ template<typename E, typename T, uint16_t length>
232
+ Array<E, length> &CastToArrayOfEnum(T (&arr)[length]) {
233
+ static_assert(sizeof(E) == sizeof(T), "invalid enum type E");
234
+ return *reinterpret_cast<Array<E, length> *>(arr);
235
+ }
236
+
237
+ template<typename E, typename T, uint16_t length>
238
+ const Array<E, length> &CastToArrayOfEnum(const T (&arr)[length]) {
239
+ static_assert(sizeof(E) == sizeof(T), "invalid enum type E");
240
+ return *reinterpret_cast<const Array<E, length> *>(arr);
241
+ }
242
+
243
+ template<typename T, uint16_t length>
244
+ bool operator==(const Array<T, length> &lhs,
245
+ const Array<T, length> &rhs) noexcept {
246
+ return std::addressof(lhs) == std::addressof(rhs) ||
247
+ (lhs.size() == rhs.size() &&
248
+ std::memcmp(lhs.Data(), rhs.Data(), rhs.size() * sizeof(T)) == 0);
249
+ }
250
+
251
+ } // namespace flatbuffers
252
+
253
+ #endif // FLATBUFFERS_ARRAY_H_