asmjit 0.2.0 → 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (201) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +1 -1
  3. data/asmjit.gemspec +1 -1
  4. data/ext/asmjit/asmjit/.editorconfig +10 -0
  5. data/ext/asmjit/asmjit/.github/FUNDING.yml +1 -0
  6. data/ext/asmjit/asmjit/.github/workflows/build-config.json +47 -0
  7. data/ext/asmjit/asmjit/.github/workflows/build.yml +156 -0
  8. data/ext/asmjit/asmjit/.gitignore +6 -0
  9. data/ext/asmjit/asmjit/CMakeLists.txt +611 -0
  10. data/ext/asmjit/asmjit/LICENSE.md +17 -0
  11. data/ext/asmjit/asmjit/README.md +69 -0
  12. data/ext/asmjit/asmjit/src/asmjit/a64.h +62 -0
  13. data/ext/asmjit/asmjit/src/asmjit/arm/a64archtraits_p.h +81 -0
  14. data/ext/asmjit/asmjit/src/asmjit/arm/a64assembler.cpp +5115 -0
  15. data/ext/asmjit/asmjit/src/asmjit/arm/a64assembler.h +72 -0
  16. data/ext/asmjit/asmjit/src/asmjit/arm/a64builder.cpp +51 -0
  17. data/ext/asmjit/asmjit/src/asmjit/arm/a64builder.h +57 -0
  18. data/ext/asmjit/asmjit/src/asmjit/arm/a64compiler.cpp +60 -0
  19. data/ext/asmjit/asmjit/src/asmjit/arm/a64compiler.h +247 -0
  20. data/ext/asmjit/asmjit/src/asmjit/arm/a64emithelper.cpp +464 -0
  21. data/ext/asmjit/asmjit/src/asmjit/arm/a64emithelper_p.h +50 -0
  22. data/ext/asmjit/asmjit/src/asmjit/arm/a64emitter.h +1228 -0
  23. data/ext/asmjit/asmjit/src/asmjit/arm/a64formatter.cpp +298 -0
  24. data/ext/asmjit/asmjit/src/asmjit/arm/a64formatter_p.h +59 -0
  25. data/ext/asmjit/asmjit/src/asmjit/arm/a64func.cpp +189 -0
  26. data/ext/asmjit/asmjit/src/asmjit/arm/a64func_p.h +33 -0
  27. data/ext/asmjit/asmjit/src/asmjit/arm/a64globals.h +1894 -0
  28. data/ext/asmjit/asmjit/src/asmjit/arm/a64instapi.cpp +278 -0
  29. data/ext/asmjit/asmjit/src/asmjit/arm/a64instapi_p.h +41 -0
  30. data/ext/asmjit/asmjit/src/asmjit/arm/a64instdb.cpp +1957 -0
  31. data/ext/asmjit/asmjit/src/asmjit/arm/a64instdb.h +74 -0
  32. data/ext/asmjit/asmjit/src/asmjit/arm/a64instdb_p.h +876 -0
  33. data/ext/asmjit/asmjit/src/asmjit/arm/a64operand.cpp +85 -0
  34. data/ext/asmjit/asmjit/src/asmjit/arm/a64operand.h +312 -0
  35. data/ext/asmjit/asmjit/src/asmjit/arm/a64rapass.cpp +852 -0
  36. data/ext/asmjit/asmjit/src/asmjit/arm/a64rapass_p.h +105 -0
  37. data/ext/asmjit/asmjit/src/asmjit/arm/a64utils.h +179 -0
  38. data/ext/asmjit/asmjit/src/asmjit/arm/armformatter.cpp +143 -0
  39. data/ext/asmjit/asmjit/src/asmjit/arm/armformatter_p.h +44 -0
  40. data/ext/asmjit/asmjit/src/asmjit/arm/armglobals.h +21 -0
  41. data/ext/asmjit/asmjit/src/asmjit/arm/armoperand.h +621 -0
  42. data/ext/asmjit/asmjit/src/asmjit/arm.h +62 -0
  43. data/ext/asmjit/asmjit/src/asmjit/asmjit-scope-begin.h +17 -0
  44. data/ext/asmjit/asmjit/src/asmjit/asmjit-scope-end.h +9 -0
  45. data/ext/asmjit/asmjit/src/asmjit/asmjit.h +33 -0
  46. data/ext/asmjit/asmjit/src/asmjit/core/api-build_p.h +55 -0
  47. data/ext/asmjit/asmjit/src/asmjit/core/api-config.h +613 -0
  48. data/ext/asmjit/asmjit/src/asmjit/core/archcommons.h +229 -0
  49. data/ext/asmjit/asmjit/src/asmjit/core/archtraits.cpp +160 -0
  50. data/ext/asmjit/asmjit/src/asmjit/core/archtraits.h +290 -0
  51. data/ext/asmjit/asmjit/src/asmjit/core/assembler.cpp +406 -0
  52. data/ext/asmjit/asmjit/src/asmjit/core/assembler.h +129 -0
  53. data/ext/asmjit/asmjit/src/asmjit/core/builder.cpp +889 -0
  54. data/ext/asmjit/asmjit/src/asmjit/core/builder.h +1391 -0
  55. data/ext/asmjit/asmjit/src/asmjit/core/codebuffer.h +113 -0
  56. data/ext/asmjit/asmjit/src/asmjit/core/codeholder.cpp +1149 -0
  57. data/ext/asmjit/asmjit/src/asmjit/core/codeholder.h +1035 -0
  58. data/ext/asmjit/asmjit/src/asmjit/core/codewriter.cpp +175 -0
  59. data/ext/asmjit/asmjit/src/asmjit/core/codewriter_p.h +179 -0
  60. data/ext/asmjit/asmjit/src/asmjit/core/compiler.cpp +582 -0
  61. data/ext/asmjit/asmjit/src/asmjit/core/compiler.h +737 -0
  62. data/ext/asmjit/asmjit/src/asmjit/core/compilerdefs.h +173 -0
  63. data/ext/asmjit/asmjit/src/asmjit/core/constpool.cpp +363 -0
  64. data/ext/asmjit/asmjit/src/asmjit/core/constpool.h +250 -0
  65. data/ext/asmjit/asmjit/src/asmjit/core/cpuinfo.cpp +1162 -0
  66. data/ext/asmjit/asmjit/src/asmjit/core/cpuinfo.h +813 -0
  67. data/ext/asmjit/asmjit/src/asmjit/core/emithelper.cpp +323 -0
  68. data/ext/asmjit/asmjit/src/asmjit/core/emithelper_p.h +58 -0
  69. data/ext/asmjit/asmjit/src/asmjit/core/emitter.cpp +333 -0
  70. data/ext/asmjit/asmjit/src/asmjit/core/emitter.h +741 -0
  71. data/ext/asmjit/asmjit/src/asmjit/core/emitterutils.cpp +129 -0
  72. data/ext/asmjit/asmjit/src/asmjit/core/emitterutils_p.h +89 -0
  73. data/ext/asmjit/asmjit/src/asmjit/core/environment.cpp +46 -0
  74. data/ext/asmjit/asmjit/src/asmjit/core/environment.h +508 -0
  75. data/ext/asmjit/asmjit/src/asmjit/core/errorhandler.cpp +14 -0
  76. data/ext/asmjit/asmjit/src/asmjit/core/errorhandler.h +228 -0
  77. data/ext/asmjit/asmjit/src/asmjit/core/formatter.cpp +584 -0
  78. data/ext/asmjit/asmjit/src/asmjit/core/formatter.h +247 -0
  79. data/ext/asmjit/asmjit/src/asmjit/core/formatter_p.h +34 -0
  80. data/ext/asmjit/asmjit/src/asmjit/core/func.cpp +286 -0
  81. data/ext/asmjit/asmjit/src/asmjit/core/func.h +1445 -0
  82. data/ext/asmjit/asmjit/src/asmjit/core/funcargscontext.cpp +293 -0
  83. data/ext/asmjit/asmjit/src/asmjit/core/funcargscontext_p.h +199 -0
  84. data/ext/asmjit/asmjit/src/asmjit/core/globals.cpp +133 -0
  85. data/ext/asmjit/asmjit/src/asmjit/core/globals.h +393 -0
  86. data/ext/asmjit/asmjit/src/asmjit/core/inst.cpp +113 -0
  87. data/ext/asmjit/asmjit/src/asmjit/core/inst.h +772 -0
  88. data/ext/asmjit/asmjit/src/asmjit/core/jitallocator.cpp +1242 -0
  89. data/ext/asmjit/asmjit/src/asmjit/core/jitallocator.h +261 -0
  90. data/ext/asmjit/asmjit/src/asmjit/core/jitruntime.cpp +80 -0
  91. data/ext/asmjit/asmjit/src/asmjit/core/jitruntime.h +89 -0
  92. data/ext/asmjit/asmjit/src/asmjit/core/logger.cpp +69 -0
  93. data/ext/asmjit/asmjit/src/asmjit/core/logger.h +198 -0
  94. data/ext/asmjit/asmjit/src/asmjit/core/misc_p.h +33 -0
  95. data/ext/asmjit/asmjit/src/asmjit/core/operand.cpp +132 -0
  96. data/ext/asmjit/asmjit/src/asmjit/core/operand.h +1611 -0
  97. data/ext/asmjit/asmjit/src/asmjit/core/osutils.cpp +84 -0
  98. data/ext/asmjit/asmjit/src/asmjit/core/osutils.h +61 -0
  99. data/ext/asmjit/asmjit/src/asmjit/core/osutils_p.h +68 -0
  100. data/ext/asmjit/asmjit/src/asmjit/core/raassignment_p.h +418 -0
  101. data/ext/asmjit/asmjit/src/asmjit/core/rabuilders_p.h +612 -0
  102. data/ext/asmjit/asmjit/src/asmjit/core/radefs_p.h +1204 -0
  103. data/ext/asmjit/asmjit/src/asmjit/core/ralocal.cpp +1166 -0
  104. data/ext/asmjit/asmjit/src/asmjit/core/ralocal_p.h +254 -0
  105. data/ext/asmjit/asmjit/src/asmjit/core/rapass.cpp +1969 -0
  106. data/ext/asmjit/asmjit/src/asmjit/core/rapass_p.h +1183 -0
  107. data/ext/asmjit/asmjit/src/asmjit/core/rastack.cpp +184 -0
  108. data/ext/asmjit/asmjit/src/asmjit/core/rastack_p.h +171 -0
  109. data/ext/asmjit/asmjit/src/asmjit/core/string.cpp +559 -0
  110. data/ext/asmjit/asmjit/src/asmjit/core/string.h +372 -0
  111. data/ext/asmjit/asmjit/src/asmjit/core/support.cpp +494 -0
  112. data/ext/asmjit/asmjit/src/asmjit/core/support.h +1773 -0
  113. data/ext/asmjit/asmjit/src/asmjit/core/target.cpp +14 -0
  114. data/ext/asmjit/asmjit/src/asmjit/core/target.h +53 -0
  115. data/ext/asmjit/asmjit/src/asmjit/core/type.cpp +74 -0
  116. data/ext/asmjit/asmjit/src/asmjit/core/type.h +419 -0
  117. data/ext/asmjit/asmjit/src/asmjit/core/virtmem.cpp +722 -0
  118. data/ext/asmjit/asmjit/src/asmjit/core/virtmem.h +242 -0
  119. data/ext/asmjit/asmjit/src/asmjit/core/zone.cpp +353 -0
  120. data/ext/asmjit/asmjit/src/asmjit/core/zone.h +615 -0
  121. data/ext/asmjit/asmjit/src/asmjit/core/zonehash.cpp +309 -0
  122. data/ext/asmjit/asmjit/src/asmjit/core/zonehash.h +186 -0
  123. data/ext/asmjit/asmjit/src/asmjit/core/zonelist.cpp +163 -0
  124. data/ext/asmjit/asmjit/src/asmjit/core/zonelist.h +209 -0
  125. data/ext/asmjit/asmjit/src/asmjit/core/zonestack.cpp +176 -0
  126. data/ext/asmjit/asmjit/src/asmjit/core/zonestack.h +239 -0
  127. data/ext/asmjit/asmjit/src/asmjit/core/zonestring.h +120 -0
  128. data/ext/asmjit/asmjit/src/asmjit/core/zonetree.cpp +99 -0
  129. data/ext/asmjit/asmjit/src/asmjit/core/zonetree.h +380 -0
  130. data/ext/asmjit/asmjit/src/asmjit/core/zonevector.cpp +356 -0
  131. data/ext/asmjit/asmjit/src/asmjit/core/zonevector.h +690 -0
  132. data/ext/asmjit/asmjit/src/asmjit/core.h +1861 -0
  133. data/ext/asmjit/asmjit/src/asmjit/x86/x86archtraits_p.h +148 -0
  134. data/ext/asmjit/asmjit/src/asmjit/x86/x86assembler.cpp +5110 -0
  135. data/ext/asmjit/asmjit/src/asmjit/x86/x86assembler.h +685 -0
  136. data/ext/asmjit/asmjit/src/asmjit/x86/x86builder.cpp +52 -0
  137. data/ext/asmjit/asmjit/src/asmjit/x86/x86builder.h +351 -0
  138. data/ext/asmjit/asmjit/src/asmjit/x86/x86compiler.cpp +61 -0
  139. data/ext/asmjit/asmjit/src/asmjit/x86/x86compiler.h +721 -0
  140. data/ext/asmjit/asmjit/src/asmjit/x86/x86emithelper.cpp +619 -0
  141. data/ext/asmjit/asmjit/src/asmjit/x86/x86emithelper_p.h +60 -0
  142. data/ext/asmjit/asmjit/src/asmjit/x86/x86emitter.h +4315 -0
  143. data/ext/asmjit/asmjit/src/asmjit/x86/x86formatter.cpp +944 -0
  144. data/ext/asmjit/asmjit/src/asmjit/x86/x86formatter_p.h +58 -0
  145. data/ext/asmjit/asmjit/src/asmjit/x86/x86func.cpp +503 -0
  146. data/ext/asmjit/asmjit/src/asmjit/x86/x86func_p.h +33 -0
  147. data/ext/asmjit/asmjit/src/asmjit/x86/x86globals.h +2169 -0
  148. data/ext/asmjit/asmjit/src/asmjit/x86/x86instapi.cpp +1732 -0
  149. data/ext/asmjit/asmjit/src/asmjit/x86/x86instapi_p.h +41 -0
  150. data/ext/asmjit/asmjit/src/asmjit/x86/x86instdb.cpp +4427 -0
  151. data/ext/asmjit/asmjit/src/asmjit/x86/x86instdb.h +563 -0
  152. data/ext/asmjit/asmjit/src/asmjit/x86/x86instdb_p.h +311 -0
  153. data/ext/asmjit/asmjit/src/asmjit/x86/x86opcode_p.h +436 -0
  154. data/ext/asmjit/asmjit/src/asmjit/x86/x86operand.cpp +231 -0
  155. data/ext/asmjit/asmjit/src/asmjit/x86/x86operand.h +1085 -0
  156. data/ext/asmjit/asmjit/src/asmjit/x86/x86rapass.cpp +1509 -0
  157. data/ext/asmjit/asmjit/src/asmjit/x86/x86rapass_p.h +94 -0
  158. data/ext/asmjit/asmjit/src/asmjit/x86.h +93 -0
  159. data/ext/asmjit/asmjit/src/asmjit.natvis +245 -0
  160. data/ext/asmjit/asmjit/test/asmjit_test_assembler.cpp +84 -0
  161. data/ext/asmjit/asmjit/test/asmjit_test_assembler.h +85 -0
  162. data/ext/asmjit/asmjit/test/asmjit_test_assembler_a64.cpp +4006 -0
  163. data/ext/asmjit/asmjit/test/asmjit_test_assembler_x64.cpp +17833 -0
  164. data/ext/asmjit/asmjit/test/asmjit_test_assembler_x86.cpp +8300 -0
  165. data/ext/asmjit/asmjit/test/asmjit_test_compiler.cpp +253 -0
  166. data/ext/asmjit/asmjit/test/asmjit_test_compiler.h +73 -0
  167. data/ext/asmjit/asmjit/test/asmjit_test_compiler_a64.cpp +690 -0
  168. data/ext/asmjit/asmjit/test/asmjit_test_compiler_x86.cpp +4317 -0
  169. data/ext/asmjit/asmjit/test/asmjit_test_emitters.cpp +197 -0
  170. data/ext/asmjit/asmjit/test/asmjit_test_instinfo.cpp +181 -0
  171. data/ext/asmjit/asmjit/test/asmjit_test_misc.h +257 -0
  172. data/ext/asmjit/asmjit/test/asmjit_test_perf.cpp +62 -0
  173. data/ext/asmjit/asmjit/test/asmjit_test_perf.h +61 -0
  174. data/ext/asmjit/asmjit/test/asmjit_test_perf_a64.cpp +699 -0
  175. data/ext/asmjit/asmjit/test/asmjit_test_perf_x86.cpp +5032 -0
  176. data/ext/asmjit/asmjit/test/asmjit_test_unit.cpp +172 -0
  177. data/ext/asmjit/asmjit/test/asmjit_test_x86_sections.cpp +172 -0
  178. data/ext/asmjit/asmjit/test/asmjitutils.h +38 -0
  179. data/ext/asmjit/asmjit/test/broken.cpp +312 -0
  180. data/ext/asmjit/asmjit/test/broken.h +148 -0
  181. data/ext/asmjit/asmjit/test/cmdline.h +61 -0
  182. data/ext/asmjit/asmjit/test/performancetimer.h +41 -0
  183. data/ext/asmjit/asmjit/tools/configure-makefiles.sh +13 -0
  184. data/ext/asmjit/asmjit/tools/configure-ninja.sh +13 -0
  185. data/ext/asmjit/asmjit/tools/configure-sanitizers.sh +13 -0
  186. data/ext/asmjit/asmjit/tools/configure-vs2019-x64.bat +2 -0
  187. data/ext/asmjit/asmjit/tools/configure-vs2019-x86.bat +2 -0
  188. data/ext/asmjit/asmjit/tools/configure-vs2022-x64.bat +2 -0
  189. data/ext/asmjit/asmjit/tools/configure-vs2022-x86.bat +2 -0
  190. data/ext/asmjit/asmjit/tools/configure-xcode.sh +8 -0
  191. data/ext/asmjit/asmjit/tools/enumgen.js +417 -0
  192. data/ext/asmjit/asmjit/tools/enumgen.sh +3 -0
  193. data/ext/asmjit/asmjit/tools/tablegen-arm.js +365 -0
  194. data/ext/asmjit/asmjit/tools/tablegen-arm.sh +3 -0
  195. data/ext/asmjit/asmjit/tools/tablegen-x86.js +2638 -0
  196. data/ext/asmjit/asmjit/tools/tablegen-x86.sh +3 -0
  197. data/ext/asmjit/asmjit/tools/tablegen.js +947 -0
  198. data/ext/asmjit/asmjit/tools/tablegen.sh +4 -0
  199. data/ext/asmjit/asmjit.cc +18 -0
  200. data/lib/asmjit/version.rb +1 -1
  201. metadata +197 -2
@@ -0,0 +1,5032 @@
1
+ // This file is part of AsmJit project <https://asmjit.com>
2
+ //
3
+ // See asmjit.h or LICENSE.md for license and copyright information
4
+ // SPDX-License-Identifier: Zlib
5
+
6
+ #include <asmjit/core.h>
7
+
8
+ #if !defined(ASMJIT_NO_X86)
9
+ #include <asmjit/x86.h>
10
+
11
+ #include <limits>
12
+ #include <stdio.h>
13
+ #include <string.h>
14
+
15
+ #include "asmjit_test_misc.h"
16
+ #include "asmjit_test_perf.h"
17
+
18
+ using namespace asmjit;
19
+
20
+ enum class InstForm { // Operand form that the sequence generators in this file are asked to emit.
21
+ kReg, // Register forms: the generators emit the variants whose operands are registers (plus immediates).
22
+ kMem // Memory forms: the generators emit the variants that take a memory operand.
23
+ };
24
+
25
+ // Generates a long sequence of GP instructions on `a`, `b`, `c` and `d`; `form` selects the register or the memory variant.
26
+ template<typename Emitter>
27
+ static void generateGpSequenceInternal(
28
+ Emitter& cc,
29
+ InstForm form,
30
+ const x86::Gp& a, const x86::Gp& b, const x86::Gp& c, const x86::Gp& d) {
31
+ // Load a constant into each of the four registers before the sequence starts.
32
+ cc.mov(a, 0xAAAAAAAA);
33
+ cc.mov(b, 0xBBBBBBBB);
34
+ cc.mov(c, 0xCCCCCCCC);
35
+ cc.mov(d, 0xFFFFFFFF);
36
+
37
+ if (form == InstForm::kReg) { // Register form.
38
+ cc.adc(a, b);
39
+ cc.adc(b, c);
40
+ cc.adc(c, d);
41
+ cc.add(a, b);
42
+ cc.add(b, c);
43
+ cc.add(c, d);
44
+ cc.and_(a, b);
45
+ cc.and_(b, c);
46
+ cc.and_(c, d);
47
+ cc.bsf(a, b);
48
+ cc.bsf(b, c);
49
+ cc.bsf(c, d);
50
+ cc.bsr(a, b);
51
+ cc.bsr(b, c);
52
+ cc.bsr(c, d);
53
+ cc.bswap(a);
54
+ cc.bswap(b);
55
+ cc.bswap(c);
56
+ cc.bt(a, b);
57
+ cc.bt(b, c);
58
+ cc.bt(c, d);
59
+ cc.btc(a, b);
60
+ cc.btc(b, c);
61
+ cc.btc(c, d);
62
+ cc.btr(a, b);
63
+ cc.btr(b, c);
64
+ cc.btr(c, d);
65
+ cc.bts(a, b);
66
+ cc.bts(b, c);
67
+ cc.bts(c, d);
68
+ cc.cmp(a, b);
69
+ cc.cmovc(a, b);
70
+ cc.cmp(b, c);
71
+ cc.cmovc(b, c);
72
+ cc.cmp(c, d);
73
+ cc.cmovc(c, d);
74
+ cc.dec(a);
75
+ cc.dec(b);
76
+ cc.dec(c);
77
+ cc.imul(a, b);
78
+ cc.imul(b, c);
79
+ cc.imul(c, d);
80
+ cc.movsx(a, b.r8Lo());
81
+ cc.movsx(b, c.r8Lo());
82
+ cc.movsx(c, d.r8Lo());
83
+ cc.movzx(a, b.r8Lo());
84
+ cc.movzx(b, c.r8Lo());
85
+ cc.movzx(c, d.r8Lo());
86
+ cc.neg(a);
87
+ cc.neg(b);
88
+ cc.neg(c);
89
+ cc.not_(a);
90
+ cc.not_(b);
91
+ cc.not_(c);
92
+ cc.or_(a, b);
93
+ cc.or_(b, c);
94
+ cc.or_(c, d);
95
+ cc.sbb(a, b);
96
+ cc.sbb(b, c);
97
+ cc.sbb(c, d);
98
+ cc.sub(a, b);
99
+ cc.sub(b, c);
100
+ cc.sub(c, d);
101
+ cc.test(a, b);
102
+ cc.test(b, c);
103
+ cc.test(c, d);
104
+ cc.xchg(a, b);
105
+ cc.xchg(b, c);
106
+ cc.xchg(c, d);
107
+ cc.xor_(a, b);
108
+ cc.xor_(b, c);
109
+ cc.xor_(c, d);
110
+ // Shifts and rotates; the count operand is always `c.r8Lo()`.
111
+ cc.rcl(a, c.r8Lo());
112
+ cc.rcl(b, c.r8Lo());
113
+ cc.rcl(d, c.r8Lo());
114
+ cc.rcr(a, c.r8Lo());
115
+ cc.rcr(b, c.r8Lo());
116
+ cc.rcr(d, c.r8Lo());
117
+ cc.rol(a, c.r8Lo());
118
+ cc.rol(b, c.r8Lo());
119
+ cc.rol(d, c.r8Lo());
120
+ cc.ror(a, c.r8Lo());
121
+ cc.ror(b, c.r8Lo());
122
+ cc.ror(d, c.r8Lo());
123
+ cc.shl(a, c.r8Lo());
124
+ cc.shl(b, c.r8Lo());
125
+ cc.shl(d, c.r8Lo());
126
+ cc.shr(a, c.r8Lo());
127
+ cc.shr(b, c.r8Lo());
128
+ cc.shr(d, c.r8Lo());
129
+ cc.sar(a, c.r8Lo());
130
+ cc.sar(b, c.r8Lo());
131
+ cc.sar(d, c.r8Lo());
132
+ cc.shld(a, b, c.r8Lo());
133
+ cc.shld(b, d, c.r8Lo());
134
+ cc.shld(d, a, c.r8Lo());
135
+ cc.shrd(a, b, c.r8Lo());
136
+ cc.shrd(b, d, c.r8Lo());
137
+ cc.shrd(d, a, c.r8Lo());
138
+ // ADX, BMI/BMI2, LZCNT and POPCNT instructions.
139
+ cc.adcx(a, b);
140
+ cc.adox(a, b);
141
+ cc.adcx(b, c);
142
+ cc.adox(b, c);
143
+ cc.adcx(c, d);
144
+ cc.adox(c, d);
145
+ cc.andn(a, b, c);
146
+ cc.andn(b, c, d);
147
+ cc.andn(c, d, a);
148
+ cc.bextr(a, b, c);
149
+ cc.bextr(b, c, d);
150
+ cc.bextr(c, d, a);
151
+ cc.blsi(a, b);
152
+ cc.blsi(b, c);
153
+ cc.blsi(c, d);
154
+ cc.blsmsk(a, b);
155
+ cc.blsmsk(b, c);
156
+ cc.blsmsk(c, d);
157
+ cc.blsr(a, b);
158
+ cc.blsr(b, c);
159
+ cc.blsr(c, d);
160
+ cc.bzhi(a, b, c);
161
+ cc.bzhi(b, c, d);
162
+ cc.bzhi(c, d, a);
163
+ cc.lzcnt(a, b);
164
+ cc.lzcnt(b, c);
165
+ cc.lzcnt(c, d);
166
+ cc.pdep(a, b, c);
167
+ cc.pdep(b, c, d);
168
+ cc.pdep(c, d, a);
169
+ cc.pext(a, b, c);
170
+ cc.pext(b, c, d);
171
+ cc.pext(c, d, a);
172
+ cc.popcnt(a, b);
173
+ cc.popcnt(b, c);
174
+ cc.popcnt(c, d);
175
+ cc.rorx(a, b, 8);
176
+ cc.rorx(b, c, 8);
177
+ cc.rorx(c, d, 8);
178
+ cc.sarx(a, b, c);
179
+ cc.sarx(b, c, d);
180
+ cc.sarx(c, d, a);
181
+ cc.shlx(a, b, c);
182
+ cc.shlx(b, c, d);
183
+ cc.shlx(c, d, a);
184
+ cc.shrx(a, b, c);
185
+ cc.shrx(b, c, d);
186
+ cc.shrx(c, d, a);
187
+ cc.tzcnt(a, b);
188
+ cc.tzcnt(b, c);
189
+ cc.tzcnt(c, d);
190
+ }
191
+ else { // Memory form.
192
+ uint32_t regSize = cc.registerSize();
193
+ x86::Mem m = x86::ptr(c, 0, regSize);
194
+ x86::Mem m8 = x86::byte_ptr(c);
195
+ // `m` is a `regSize`-sized memory operand and `m8` a byte-sized one; both use `c` as their base.
196
+ cc.adc(a, m);
197
+ cc.adc(b, m);
198
+ cc.adc(c, m);
199
+ cc.add(a, m);
200
+ cc.add(b, m);
201
+ cc.add(c, m);
202
+ cc.and_(a, m);
203
+ cc.and_(b, m);
204
+ cc.and_(c, m);
205
+ cc.bsf(a, m);
206
+ cc.bsf(b, m);
207
+ cc.bsf(c, m);
208
+ cc.bsr(a, m);
209
+ cc.bsr(b, m);
210
+ cc.bsr(c, m);
211
+ cc.bt(m, a);
212
+ cc.bt(m, b);
213
+ cc.bt(m, c);
214
+ cc.btc(m, a);
215
+ cc.btc(m, b);
216
+ cc.btc(m, c);
217
+ cc.btr(m, a);
218
+ cc.btr(m, b);
219
+ cc.btr(m, c);
220
+ cc.bts(m, a);
221
+ cc.bts(m, b);
222
+ cc.bts(m, c);
223
+ cc.cmp(a, m);
224
+ cc.cmovc(a, m);
225
+ cc.cmp(b, m);
226
+ cc.cmovc(b, m);
227
+ cc.cmp(c, m);
228
+ cc.cmovc(c, m);
229
+ cc.dec(m);
230
+ cc.movsx(a, m8);
231
+ cc.movsx(b, m8);
232
+ cc.movsx(c, m8);
233
+ cc.movzx(a, m8);
234
+ cc.movzx(b, m8);
235
+ cc.movzx(c, m8);
236
+ cc.neg(m);
237
+ cc.not_(m);
238
+ cc.or_(a, m);
239
+ cc.or_(b, m);
240
+ cc.or_(c, m);
241
+ cc.sbb(a, m);
242
+ cc.sbb(b, m);
243
+ cc.sbb(c, m);
244
+ cc.sub(a, m);
245
+ cc.sub(b, m);
246
+ cc.sub(c, m);
247
+ cc.test(m, a);
248
+ cc.test(m, b);
249
+ cc.test(m, c);
250
+ cc.xchg(a, m);
251
+ cc.xchg(b, m);
252
+ cc.xchg(c, m);
253
+ cc.xor_(a, m);
254
+ cc.xor_(b, m);
255
+ cc.xor_(c, m);
256
+ // Shifts and rotates applied to `m`; the count operand is always `c.r8Lo()`.
257
+ cc.rcl(m, c.r8Lo());
258
+ cc.rcr(m, c.r8Lo());
259
+ cc.rol(m, c.r8Lo());
260
+ cc.ror(m, c.r8Lo());
261
+ cc.shl(m, c.r8Lo());
262
+ cc.shr(m, c.r8Lo());
263
+ cc.sar(m, c.r8Lo());
264
+ cc.shld(m, b, c.r8Lo());
265
+ cc.shld(m, d, c.r8Lo());
266
+ cc.shld(m, a, c.r8Lo());
267
+ cc.shrd(m, b, c.r8Lo());
268
+ cc.shrd(m, d, c.r8Lo());
269
+ cc.shrd(m, a, c.r8Lo());
270
+ // ADX, BMI/BMI2, LZCNT and POPCNT instructions with `m` as one of the operands.
271
+ cc.adcx(a, m);
272
+ cc.adox(a, m);
273
+ cc.adcx(b, m);
274
+ cc.adox(b, m);
275
+ cc.adcx(c, m);
276
+ cc.adox(c, m);
277
+ cc.andn(a, b, m);
278
+ cc.andn(b, c, m);
279
+ cc.andn(c, d, m);
280
+ cc.bextr(a, m, c);
281
+ cc.bextr(b, m, d);
282
+ cc.bextr(c, m, a);
283
+ cc.blsi(a, m);
284
+ cc.blsi(b, m);
285
+ cc.blsi(c, m);
286
+ cc.blsmsk(a, m);
287
+ cc.blsmsk(b, m);
288
+ cc.blsmsk(c, m);
289
+ cc.blsr(a, m);
290
+ cc.blsr(b, m);
291
+ cc.blsr(c, m);
292
+ cc.bzhi(a, m, c);
293
+ cc.bzhi(b, m, d);
294
+ cc.bzhi(c, m, a);
295
+ cc.lzcnt(a, m);
296
+ cc.lzcnt(b, m);
297
+ cc.lzcnt(c, m);
298
+ cc.pdep(a, b, m);
299
+ cc.pdep(b, c, m);
300
+ cc.pdep(c, d, m);
301
+ cc.pext(a, b, m);
302
+ cc.pext(b, c, m);
303
+ cc.pext(c, d, m);
304
+ cc.popcnt(a, m);
305
+ cc.popcnt(b, m);
306
+ cc.popcnt(c, m);
307
+ cc.rorx(a, m, 8);
308
+ cc.rorx(b, m, 8);
309
+ cc.rorx(c, m, 8);
310
+ cc.sarx(a, m, c);
311
+ cc.sarx(b, m, d);
312
+ cc.sarx(c, m, a);
313
+ cc.shlx(a, m, c);
314
+ cc.shlx(b, m, d);
315
+ cc.shlx(c, m, a);
316
+ cc.shrx(a, m, c);
317
+ cc.shrx(b, m, d);
318
+ cc.shrx(c, m, a);
319
+ cc.tzcnt(a, m);
320
+ cc.tzcnt(b, m);
321
+ cc.tzcnt(c, m);
322
+ }
323
+ }
324
+
325
+ static void generateGpSequence(BaseEmitter& emitter, InstForm form, bool emitPrologEpilog) { // Emits the GP sequence through the concrete emitter behind `emitter`.
326
+ using namespace asmjit::x86;
327
+ // The branch taken depends on whether `emitter` is an Assembler, a Builder or a Compiler; any other emitter emits nothing.
328
+ if (emitter.isAssembler()) {
329
+ Assembler& cc = *emitter.as<Assembler>();
330
+ // The four registers are taken from the emitter via zax(), zbx(), zcx() and zdx().
331
+ x86::Gp a = cc.zax();
332
+ x86::Gp b = cc.zbx();
333
+ x86::Gp c = cc.zcx();
334
+ x86::Gp d = cc.zdx();
335
+ // With `emitPrologEpilog` set, the sequence is wrapped in a prolog/epilog built from a FuncFrame.
336
+ if (emitPrologEpilog) {
337
+ FuncDetail func;
338
+ func.init(FuncSignatureT<void, void*, const void*, size_t>(CallConvId::kHost), cc.environment());
339
+ // a, b, c and d are registered as dirty on the frame before it is finalized.
340
+ FuncFrame frame;
341
+ frame.init(func);
342
+ frame.addDirtyRegs(a, b, c, d);
343
+ frame.finalize();
344
+
345
+ cc.emitProlog(frame);
346
+ generateGpSequenceInternal(cc, form, a, b, c, d);
347
+ cc.emitEpilog(frame);
348
+ }
349
+ else {
350
+ generateGpSequenceInternal(cc, form, a, b, c, d);
351
+ }
352
+ }
353
+ #ifndef ASMJIT_NO_BUILDER
354
+ else if (emitter.isBuilder()) {
355
+ Builder& cc = *emitter.as<Builder>();
356
+ // Same steps as the Assembler branch, issued through the Builder.
357
+ x86::Gp a = cc.zax();
358
+ x86::Gp b = cc.zbx();
359
+ x86::Gp c = cc.zcx();
360
+ x86::Gp d = cc.zdx();
361
+
362
+ if (emitPrologEpilog) {
363
+ FuncDetail func;
364
+ func.init(FuncSignatureT<void, void*, const void*, size_t>(CallConvId::kHost), cc.environment());
365
+
366
+ FuncFrame frame;
367
+ frame.init(func);
368
+ frame.addDirtyRegs(a, b, c, d);
369
+ frame.finalize();
370
+
371
+ cc.emitProlog(frame);
372
+ generateGpSequenceInternal(cc, form, a, b, c, d);
373
+ cc.emitEpilog(frame);
374
+ }
375
+ else {
376
+ generateGpSequenceInternal(cc, form, a, b, c, d);
377
+ }
378
+ }
379
+ #endif
380
+ #ifndef ASMJIT_NO_COMPILER
381
+ else if (emitter.isCompiler()) {
382
+ Compiler& cc = *emitter.as<Compiler>();
383
+ // Registers are created with newIntPtr() here rather than taken from zax()..zdx().
384
+ Gp a = cc.newIntPtr("a");
385
+ Gp b = cc.newIntPtr("b");
386
+ Gp c = cc.newIntPtr("c");
387
+ Gp d = cc.newIntPtr("d");
388
+ // The sequence is placed between addFunc() and endFunc(); `emitPrologEpilog` is not read in this branch.
389
+ cc.addFunc(FuncSignatureT<void>(CallConvId::kHost));
390
+ generateGpSequenceInternal(cc, form, a, b, c, d);
391
+ cc.endFunc();
392
+ }
393
+ #endif
394
+ }
395
+
396
+ // Generates a long sequence of SSE instructions using either register or memory operands, as selected by `form`.
397
+ template<typename Emitter>
398
+ static void generateSseSequenceInternal(
399
+ Emitter& cc,
400
+ InstForm form,
401
+ const x86::Gp& gp,
402
+ const x86::Xmm& xmmA, const x86::Xmm& xmmB, const x86::Xmm& xmmC, const x86::Xmm& xmmD) {
403
+
404
+ x86::Gp gpd = gp.r32();
405
+ x86::Gp gpq = gp.r64();
406
+ x86::Gp gpz = cc.is32Bit() ? gpd : gpq;
407
+
408
+ cc.xor_(gpd, gpd);
409
+ cc.xorps(xmmA, xmmA);
410
+ cc.xorps(xmmB, xmmB);
411
+ cc.xorps(xmmC, xmmC);
412
+ cc.xorps(xmmD, xmmD);
413
+
414
+ if (form == InstForm::kReg) {
415
+ // SSE.
416
+ cc.addps(xmmA, xmmB);
417
+ cc.addss(xmmA, xmmB);
418
+ cc.andnps(xmmA, xmmB);
419
+ cc.andps(xmmA, xmmB);
420
+ cc.cmpps(xmmA, xmmB, 0);
421
+ cc.cmpss(xmmA, xmmB, 0);
422
+ cc.comiss(xmmA, xmmB);
423
+ cc.cvtsi2ss(xmmA, gpd);
424
+ cc.cvtsi2ss(xmmA, gpz);
425
+ cc.cvtss2si(gpd, xmmB);
426
+ cc.cvtss2si(gpz, xmmB);
427
+ cc.cvttss2si(gpd, xmmB);
428
+ cc.cvttss2si(gpz, xmmB);
429
+ cc.divps(xmmA, xmmB);
430
+ cc.divss(xmmA, xmmB);
431
+ cc.maxps(xmmA, xmmB);
432
+ cc.maxss(xmmA, xmmB);
433
+ cc.minps(xmmA, xmmB);
434
+ cc.minss(xmmA, xmmB);
435
+ cc.movaps(xmmA, xmmB);
436
+ cc.movd(gpd, xmmB);
437
+ cc.movd(xmmA, gpd);
438
+ cc.movq(xmmA, xmmB);
439
+ cc.movhlps(xmmA, xmmB);
440
+ cc.movlhps(xmmA, xmmB);
441
+ cc.movups(xmmA, xmmB);
442
+ cc.mulps(xmmA, xmmB);
443
+ cc.mulss(xmmA, xmmB);
444
+ cc.orps(xmmA, xmmB);
445
+ cc.rcpps(xmmA, xmmB);
446
+ cc.rcpss(xmmA, xmmB);
447
+ cc.psadbw(xmmA, xmmB);
448
+ cc.rsqrtps(xmmA, xmmB);
449
+ cc.rsqrtss(xmmA, xmmB);
450
+ cc.sfence();
451
+ cc.shufps(xmmA, xmmB, 0);
452
+ cc.sqrtps(xmmA, xmmB);
453
+ cc.sqrtss(xmmA, xmmB);
454
+ cc.subps(xmmA, xmmB);
455
+ cc.subss(xmmA, xmmB);
456
+ cc.ucomiss(xmmA, xmmB);
457
+ cc.unpckhps(xmmA, xmmB);
458
+ cc.unpcklps(xmmA, xmmB);
459
+ cc.xorps(xmmA, xmmB);
460
+
461
+ // SSE2.
462
+ cc.addpd(xmmA, xmmB);
463
+ cc.addsd(xmmA, xmmB);
464
+ cc.andnpd(xmmA, xmmB);
465
+ cc.andpd(xmmA, xmmB);
466
+ cc.cmppd(xmmA, xmmB, 0);
467
+ cc.cmpsd(xmmA, xmmB, 0);
468
+ cc.comisd(xmmA, xmmB);
469
+ cc.cvtdq2pd(xmmA, xmmB);
470
+ cc.cvtdq2ps(xmmA, xmmB);
471
+ cc.cvtpd2dq(xmmA, xmmB);
472
+ cc.cvtpd2ps(xmmA, xmmB);
473
+ cc.cvtps2dq(xmmA, xmmB);
474
+ cc.cvtps2pd(xmmA, xmmB);
475
+ cc.cvtsd2si(gpd, xmmB);
476
+ cc.cvtsd2si(gpz, xmmB);
477
+ cc.cvtsd2ss(xmmA, xmmB);
478
+ cc.cvtsi2sd(xmmA, gpd);
479
+ cc.cvtsi2sd(xmmA, gpz);
480
+ cc.cvtss2sd(xmmA, xmmB);
481
+ cc.cvtss2si(gpd, xmmB);
482
+ cc.cvtss2si(gpz, xmmB);
483
+ cc.cvttpd2dq(xmmA, xmmB);
484
+ cc.cvttps2dq(xmmA, xmmB);
485
+ cc.cvttsd2si(gpd, xmmB);
486
+ cc.cvttsd2si(gpz, xmmB);
487
+ cc.divpd(xmmA, xmmB);
488
+ cc.divsd(xmmA, xmmB);
489
+ cc.maxpd(xmmA, xmmB);
490
+ cc.maxsd(xmmA, xmmB);
491
+ cc.minpd(xmmA, xmmB);
492
+ cc.minsd(xmmA, xmmB);
493
+ cc.movdqa(xmmA, xmmB);
494
+ cc.movdqu(xmmA, xmmB);
495
+ cc.movmskps(gpd, xmmB);
496
+ cc.movmskpd(gpd, xmmB);
497
+ cc.movsd(xmmA, xmmB);
498
+ cc.mulpd(xmmA, xmmB);
499
+ cc.mulsd(xmmA, xmmB);
500
+ cc.orpd(xmmA, xmmB);
501
+ cc.packsswb(xmmA, xmmB);
502
+ cc.packssdw(xmmA, xmmB);
503
+ cc.packuswb(xmmA, xmmB);
504
+ cc.paddb(xmmA, xmmB);
505
+ cc.paddw(xmmA, xmmB);
506
+ cc.paddd(xmmA, xmmB);
507
+ cc.paddq(xmmA, xmmB);
508
+ cc.paddsb(xmmA, xmmB);
509
+ cc.paddsw(xmmA, xmmB);
510
+ cc.paddusb(xmmA, xmmB);
511
+ cc.paddusw(xmmA, xmmB);
512
+ cc.pand(xmmA, xmmB);
513
+ cc.pandn(xmmA, xmmB);
514
+ cc.pavgb(xmmA, xmmB);
515
+ cc.pavgw(xmmA, xmmB);
516
+ cc.pcmpeqb(xmmA, xmmB);
517
+ cc.pcmpeqw(xmmA, xmmB);
518
+ cc.pcmpeqd(xmmA, xmmB);
519
+ cc.pcmpgtb(xmmA, xmmB);
520
+ cc.pcmpgtw(xmmA, xmmB);
521
+ cc.pcmpgtd(xmmA, xmmB);
522
+ cc.pmaxsw(xmmA, xmmB);
523
+ cc.pmaxub(xmmA, xmmB);
524
+ cc.pminsw(xmmA, xmmB);
525
+ cc.pminub(xmmA, xmmB);
526
+ cc.pmovmskb(gpd, xmmB);
527
+ cc.pmulhw(xmmA, xmmB);
528
+ cc.pmulhuw(xmmA, xmmB);
529
+ cc.pmullw(xmmA, xmmB);
530
+ cc.pmuludq(xmmA, xmmB);
531
+ cc.por(xmmA, xmmB);
532
+ cc.pslld(xmmA, xmmB);
533
+ cc.pslld(xmmA, 0);
534
+ cc.psllq(xmmA, xmmB);
535
+ cc.psllq(xmmA, 0);
536
+ cc.psllw(xmmA, xmmB);
537
+ cc.psllw(xmmA, 0);
538
+ cc.pslldq(xmmA, 0);
539
+ cc.psrad(xmmA, xmmB);
540
+ cc.psrad(xmmA, 0);
541
+ cc.psraw(xmmA, xmmB);
542
+ cc.psraw(xmmA, 0);
543
+ cc.psubb(xmmA, xmmB);
544
+ cc.psubw(xmmA, xmmB);
545
+ cc.psubd(xmmA, xmmB);
546
+ cc.psubq(xmmA, xmmB);
547
+ cc.pmaddwd(xmmA, xmmB);
548
+ cc.pshufd(xmmA, xmmB, 0);
549
+ cc.pshufhw(xmmA, xmmB, 0);
550
+ cc.pshuflw(xmmA, xmmB, 0);
551
+ cc.psrld(xmmA, xmmB);
552
+ cc.psrld(xmmA, 0);
553
+ cc.psrlq(xmmA, xmmB);
554
+ cc.psrlq(xmmA, 0);
555
+ cc.psrldq(xmmA, 0);
556
+ cc.psrlw(xmmA, xmmB);
557
+ cc.psrlw(xmmA, 0);
558
+ cc.psubsb(xmmA, xmmB);
559
+ cc.psubsw(xmmA, xmmB);
560
+ cc.psubusb(xmmA, xmmB);
561
+ cc.psubusw(xmmA, xmmB);
562
+ cc.punpckhbw(xmmA, xmmB);
563
+ cc.punpckhwd(xmmA, xmmB);
564
+ cc.punpckhdq(xmmA, xmmB);
565
+ cc.punpckhqdq(xmmA, xmmB);
566
+ cc.punpcklbw(xmmA, xmmB);
567
+ cc.punpcklwd(xmmA, xmmB);
568
+ cc.punpckldq(xmmA, xmmB);
569
+ cc.punpcklqdq(xmmA, xmmB);
570
+ cc.pxor(xmmA, xmmB);
571
+ cc.sqrtpd(xmmA, xmmB);
572
+ cc.sqrtsd(xmmA, xmmB);
573
+ cc.subpd(xmmA, xmmB);
574
+ cc.subsd(xmmA, xmmB);
575
+ cc.ucomisd(xmmA, xmmB);
576
+ cc.unpckhpd(xmmA, xmmB);
577
+ cc.unpcklpd(xmmA, xmmB);
578
+ cc.xorpd(xmmA, xmmB);
579
+
580
+ // SSE3.
581
+ cc.addsubpd(xmmA, xmmB);
582
+ cc.addsubps(xmmA, xmmB);
583
+ cc.haddpd(xmmA, xmmB);
584
+ cc.haddps(xmmA, xmmB);
585
+ cc.hsubpd(xmmA, xmmB);
586
+ cc.hsubps(xmmA, xmmB);
587
+ cc.movddup(xmmA, xmmB);
588
+ cc.movshdup(xmmA, xmmB);
589
+ cc.movsldup(xmmA, xmmB);
590
+
591
+ // SSSE3.
592
+ cc.psignb(xmmA, xmmB);
593
+ cc.psignw(xmmA, xmmB);
594
+ cc.psignd(xmmA, xmmB);
595
+ cc.phaddw(xmmA, xmmB);
596
+ cc.phaddd(xmmA, xmmB);
597
+ cc.phaddsw(xmmA, xmmB);
598
+ cc.phsubw(xmmA, xmmB);
599
+ cc.phsubd(xmmA, xmmB);
600
+ cc.phsubsw(xmmA, xmmB);
601
+ cc.pmaddubsw(xmmA, xmmB);
602
+ cc.pabsb(xmmA, xmmB);
603
+ cc.pabsw(xmmA, xmmB);
604
+ cc.pabsd(xmmA, xmmB);
605
+ cc.pmulhrsw(xmmA, xmmB);
606
+ cc.pshufb(xmmA, xmmB);
607
+ cc.palignr(xmmA, xmmB, 0);
608
+
609
+ // SSE4.1.
610
+ cc.blendpd(xmmA, xmmB, 0);
611
+ cc.blendps(xmmA, xmmB, 0);
612
+ cc.blendvpd(xmmA, xmmB, xmmA);
613
+ cc.blendvps(xmmA, xmmB, xmmA);
614
+
615
+ cc.dppd(xmmA, xmmB, 0);
616
+ cc.dpps(xmmA, xmmB, 0);
617
+ cc.extractps(gpd, xmmB, 0);
618
+ cc.insertps(xmmA, xmmB, 0);
619
+ cc.mpsadbw(xmmA, xmmB, 0);
620
+ cc.packusdw(xmmA, xmmB);
621
+ cc.pblendvb(xmmA, xmmB, xmmA);
622
+ cc.pblendw(xmmA, xmmB, 0);
623
+ cc.pcmpeqq(xmmA, xmmB);
624
+ cc.pextrb(gpd, xmmB, 0);
625
+ cc.pextrd(gpd, xmmB, 0);
626
+ if (cc.is64Bit()) cc.pextrq(gpq, xmmB, 0);
627
+ cc.pextrw(gpd, xmmB, 0);
628
+ cc.phminposuw(xmmA, xmmB);
629
+ cc.pinsrb(xmmA, gpd, 0);
630
+ cc.pinsrd(xmmA, gpd, 0);
631
+ cc.pinsrw(xmmA, gpd, 0);
632
+ cc.pmaxuw(xmmA, xmmB);
633
+ cc.pmaxsb(xmmA, xmmB);
634
+ cc.pmaxsd(xmmA, xmmB);
635
+ cc.pmaxud(xmmA, xmmB);
636
+ cc.pminsb(xmmA, xmmB);
637
+ cc.pminuw(xmmA, xmmB);
638
+ cc.pminud(xmmA, xmmB);
639
+ cc.pminsd(xmmA, xmmB);
640
+ cc.pmovsxbw(xmmA, xmmB);
641
+ cc.pmovsxbd(xmmA, xmmB);
642
+ cc.pmovsxbq(xmmA, xmmB);
643
+ cc.pmovsxwd(xmmA, xmmB);
644
+ cc.pmovsxwq(xmmA, xmmB);
645
+ cc.pmovsxdq(xmmA, xmmB);
646
+ cc.pmovzxbw(xmmA, xmmB);
647
+ cc.pmovzxbd(xmmA, xmmB);
648
+ cc.pmovzxbq(xmmA, xmmB);
649
+ cc.pmovzxwd(xmmA, xmmB);
650
+ cc.pmovzxwq(xmmA, xmmB);
651
+ cc.pmovzxdq(xmmA, xmmB);
652
+ cc.pmuldq(xmmA, xmmB);
653
+ cc.pmulld(xmmA, xmmB);
654
+ cc.ptest(xmmA, xmmB);
655
+ cc.roundps(xmmA, xmmB, 0);
656
+ cc.roundss(xmmA, xmmB, 0);
657
+ cc.roundpd(xmmA, xmmB, 0);
658
+ cc.roundsd(xmmA, xmmB, 0);
659
+ }
660
+ else {
661
+ x86::Mem m = x86::ptr(gpz);
662
+
663
+ cc.addps(xmmA, m);
664
+ cc.addss(xmmA, m);
665
+ cc.andnps(xmmA, m);
666
+ cc.andps(xmmA, m);
667
+ cc.cmpps(xmmA, m, 0);
668
+ cc.cmpss(xmmA, m, 0);
669
+ cc.comiss(xmmA, m);
670
+ cc.cvtpi2ps(xmmA, m);
671
+ cc.cvtsi2ss(xmmA, m);
672
+ cc.cvtss2si(gpd, m);
673
+ if (cc.is64Bit()) cc.cvtss2si(gpq, m);
674
+ cc.cvttss2si(gpd, m);
675
+ if (cc.is64Bit()) cc.cvttss2si(gpq, m);
676
+ cc.divps(xmmA, m);
677
+ cc.divss(xmmA, m);
678
+ cc.maxps(xmmA, m);
679
+ cc.maxss(xmmA, m);
680
+ cc.minps(xmmA, m);
681
+ cc.minss(xmmA, m);
682
+ cc.movaps(xmmA, m);
683
+ cc.movaps(m, xmmB);
684
+ cc.movd(m, xmmB);
685
+ cc.movd(xmmA, m);
686
+ cc.movq(m, xmmB);
687
+ cc.movq(xmmA, m);
688
+ cc.movhps(xmmA, m);
689
+ cc.movhps(m, xmmB);
690
+ cc.movlps(xmmA, m);
691
+ cc.movlps(m, xmmB);
692
+ cc.movntps(m, xmmB);
693
+ cc.movss(xmmA, m);
694
+ cc.movss(m, xmmB);
695
+ cc.movups(xmmA, m);
696
+ cc.movups(m, xmmB);
697
+ cc.mulps(xmmA, m);
698
+ cc.mulss(xmmA, m);
699
+ cc.orps(xmmA, m);
700
+ cc.rcpps(xmmA, m);
701
+ cc.rcpss(xmmA, m);
702
+ cc.psadbw(xmmA, m);
703
+ cc.rsqrtps(xmmA, m);
704
+ cc.rsqrtss(xmmA, m);
705
+ cc.shufps(xmmA, m, 0);
706
+ cc.sqrtps(xmmA, m);
707
+ cc.sqrtss(xmmA, m);
708
+ cc.stmxcsr(m);
709
+ cc.subps(xmmA, m);
710
+ cc.subss(xmmA, m);
711
+ cc.ucomiss(xmmA, m);
712
+ cc.unpckhps(xmmA, m);
713
+ cc.unpcklps(xmmA, m);
714
+ cc.xorps(xmmA, m);
715
+
716
+ // SSE2.
717
+ cc.addpd(xmmA, m);
718
+ cc.addsd(xmmA, m);
719
+ cc.andnpd(xmmA, m);
720
+ cc.andpd(xmmA, m);
721
+ cc.cmppd(xmmA, m, 0);
722
+ cc.cmpsd(xmmA, m, 0);
723
+ cc.comisd(xmmA, m);
724
+ cc.cvtdq2pd(xmmA, m);
725
+ cc.cvtdq2ps(xmmA, m);
726
+ cc.cvtpd2dq(xmmA, m);
727
+ cc.cvtpd2ps(xmmA, m);
728
+ cc.cvtpi2pd(xmmA, m);
729
+ cc.cvtps2dq(xmmA, m);
730
+ cc.cvtps2pd(xmmA, m);
731
+ cc.cvtsd2si(gpd, m);
732
+ if (cc.is64Bit()) cc.cvtsd2si(gpq, m);
733
+ cc.cvtsd2ss(xmmA, m);
734
+ cc.cvtsi2sd(xmmA, m);
735
+ cc.cvtss2sd(xmmA, m);
736
+ cc.cvtss2si(gpd, m);
737
+ if (cc.is64Bit()) cc.cvtss2si(gpq, m);
738
+ cc.cvttpd2dq(xmmA, m);
739
+ cc.cvttps2dq(xmmA, m);
740
+ cc.cvttsd2si(gpd, m);
741
+ if (cc.is64Bit()) cc.cvttsd2si(gpq, m);
742
+ cc.divpd(xmmA, m);
743
+ cc.divsd(xmmA, m);
744
+ cc.maxpd(xmmA, m);
745
+ cc.maxsd(xmmA, m);
746
+ cc.minpd(xmmA, m);
747
+ cc.minsd(xmmA, m);
748
+ cc.movdqa(xmmA, m);
749
+ cc.movdqa(m, xmmB);
750
+ cc.movdqu(xmmA, m);
751
+ cc.movdqu(m, xmmB);
752
+ cc.movsd(xmmA, m);
753
+ cc.movsd(m, xmmB);
754
+ cc.movapd(xmmA, m);
755
+ cc.movapd(m, xmmB);
756
+ cc.movhpd(xmmA, m);
757
+ cc.movhpd(m, xmmB);
758
+ cc.movlpd(xmmA, m);
759
+ cc.movlpd(m, xmmB);
760
+ cc.movntdq(m, xmmB);
761
+ cc.movntpd(m, xmmB);
762
+ cc.movupd(xmmA, m);
763
+ cc.movupd(m, xmmB);
764
+ cc.mulpd(xmmA, m);
765
+ cc.mulsd(xmmA, m);
766
+ cc.orpd(xmmA, m);
767
+ cc.packsswb(xmmA, m);
768
+ cc.packssdw(xmmA, m);
769
+ cc.packuswb(xmmA, m);
770
+ cc.paddb(xmmA, m);
771
+ cc.paddw(xmmA, m);
772
+ cc.paddd(xmmA, m);
773
+ cc.paddq(xmmA, m);
774
+ cc.paddsb(xmmA, m);
775
+ cc.paddsw(xmmA, m);
776
+ cc.paddusb(xmmA, m);
777
+ cc.paddusw(xmmA, m);
778
+ cc.pand(xmmA, m);
779
+ cc.pandn(xmmA, m);
780
+ cc.pavgb(xmmA, m);
781
+ cc.pavgw(xmmA, m);
782
+ cc.pcmpeqb(xmmA, m);
783
+ cc.pcmpeqw(xmmA, m);
784
+ cc.pcmpeqd(xmmA, m);
785
+ cc.pcmpgtb(xmmA, m);
786
+ cc.pcmpgtw(xmmA, m);
787
+ cc.pcmpgtd(xmmA, m);
788
+ cc.pmaxsw(xmmA, m);
789
+ cc.pmaxub(xmmA, m);
790
+ cc.pminsw(xmmA, m);
791
+ cc.pminub(xmmA, m);
792
+ cc.pmulhw(xmmA, m);
793
+ cc.pmulhuw(xmmA, m);
794
+ cc.pmullw(xmmA, m);
795
+ cc.pmuludq(xmmA, m);
796
+ cc.por(xmmA, m);
797
+ cc.pslld(xmmA, m);
798
+ cc.psllq(xmmA, m);
799
+ cc.psllw(xmmA, m);
800
+ cc.psrad(xmmA, m);
801
+ cc.psraw(xmmA, m);
802
+ cc.psubb(xmmA, m);
803
+ cc.psubw(xmmA, m);
804
+ cc.psubd(xmmA, m);
805
+ cc.psubq(xmmA, m);
806
+ cc.pmaddwd(xmmA, m);
807
+ cc.pshufd(xmmA, m, 0);
808
+ cc.pshufhw(xmmA, m, 0);
809
+ cc.pshuflw(xmmA, m, 0);
810
+ cc.psrld(xmmA, m);
811
+ cc.psrlq(xmmA, m);
812
+ cc.psrlw(xmmA, m);
813
+ cc.psubsb(xmmA, m);
814
+ cc.psubsw(xmmA, m);
815
+ cc.psubusb(xmmA, m);
816
+ cc.psubusw(xmmA, m);
817
+ cc.punpckhbw(xmmA, m);
818
+ cc.punpckhwd(xmmA, m);
819
+ cc.punpckhdq(xmmA, m);
820
+ cc.punpckhqdq(xmmA, m);
821
+ cc.punpcklbw(xmmA, m);
822
+ cc.punpcklwd(xmmA, m);
823
+ cc.punpckldq(xmmA, m);
824
+ cc.punpcklqdq(xmmA, m);
825
+ cc.pxor(xmmA, m);
826
+ cc.sqrtpd(xmmA, m);
827
+ cc.sqrtsd(xmmA, m);
828
+ cc.subpd(xmmA, m);
829
+ cc.subsd(xmmA, m);
830
+ cc.ucomisd(xmmA, m);
831
+ cc.unpckhpd(xmmA, m);
832
+ cc.unpcklpd(xmmA, m);
833
+ cc.xorpd(xmmA, m);
834
+
835
+ // SSE3.
836
+ cc.addsubpd(xmmA, m);
837
+ cc.addsubps(xmmA, m);
838
+ cc.haddpd(xmmA, m);
839
+ cc.haddps(xmmA, m);
840
+ cc.hsubpd(xmmA, m);
841
+ cc.hsubps(xmmA, m);
842
+ cc.lddqu(xmmA, m);
843
+ cc.movddup(xmmA, m);
844
+ cc.movshdup(xmmA, m);
845
+ cc.movsldup(xmmA, m);
846
+
847
+ // SSSE3.
848
+ cc.psignb(xmmA, m);
849
+ cc.psignw(xmmA, m);
850
+ cc.psignd(xmmA, m);
851
+ cc.phaddw(xmmA, m);
852
+ cc.phaddd(xmmA, m);
853
+ cc.phaddsw(xmmA, m);
854
+ cc.phsubw(xmmA, m);
855
+ cc.phsubd(xmmA, m);
856
+ cc.phsubsw(xmmA, m);
857
+ cc.pmaddubsw(xmmA, m);
858
+ cc.pabsb(xmmA, m);
859
+ cc.pabsw(xmmA, m);
860
+ cc.pabsd(xmmA, m);
861
+ cc.pmulhrsw(xmmA, m);
862
+ cc.pshufb(xmmA, m);
863
+ cc.palignr(xmmA, m, 0);
864
+
865
+ // SSE4.1.
866
+ cc.blendpd(xmmA, m, 0);
867
+ cc.blendps(xmmA, m, 0);
868
+ cc.blendvpd(xmmA, m, xmmA);
869
+ cc.blendvps(xmmA, m, xmmA);
870
+
871
+ cc.dppd(xmmA, m, 0);
872
+ cc.dpps(xmmA, m, 0);
873
+ cc.extractps(m, xmmB, 0);
874
+ cc.insertps(xmmA, m, 0);
875
+ cc.movntdqa(xmmA, m);
876
+ cc.mpsadbw(xmmA, m, 0);
877
+ cc.packusdw(xmmA, m);
878
+ cc.pblendvb(xmmA, m, xmmA);
879
+ cc.pblendw(xmmA, m, 0);
880
+ cc.pcmpeqq(xmmA, m);
881
+ cc.pextrb(m, xmmB, 0);
882
+ cc.pextrd(m, xmmB, 0);
883
+ if (cc.is64Bit()) cc.pextrq(m, xmmB, 0);
884
+ cc.pextrw(m, xmmB, 0);
885
+ cc.phminposuw(xmmA, m);
886
+ cc.pinsrb(xmmA, m, 0);
887
+ cc.pinsrd(xmmA, m, 0);
888
+ cc.pinsrw(xmmA, m, 0);
889
+ cc.pmaxuw(xmmA, m);
890
+ cc.pmaxsb(xmmA, m);
891
+ cc.pmaxsd(xmmA, m);
892
+ cc.pmaxud(xmmA, m);
893
+ cc.pminsb(xmmA, m);
894
+ cc.pminuw(xmmA, m);
895
+ cc.pminud(xmmA, m);
896
+ cc.pminsd(xmmA, m);
897
+ cc.pmovsxbw(xmmA, m);
898
+ cc.pmovsxbd(xmmA, m);
899
+ cc.pmovsxbq(xmmA, m);
900
+ cc.pmovsxwd(xmmA, m);
901
+ cc.pmovsxwq(xmmA, m);
902
+ cc.pmovsxdq(xmmA, m);
903
+ cc.pmovzxbw(xmmA, m);
904
+ cc.pmovzxbd(xmmA, m);
905
+ cc.pmovzxbq(xmmA, m);
906
+ cc.pmovzxwd(xmmA, m);
907
+ cc.pmovzxwq(xmmA, m);
908
+ cc.pmovzxdq(xmmA, m);
909
+ cc.pmuldq(xmmA, m);
910
+ cc.pmulld(xmmA, m);
911
+ cc.ptest(xmmA, m);
912
+ cc.roundps(xmmA, m, 0);
913
+ cc.roundss(xmmA, m, 0);
914
+ cc.roundpd(xmmA, m, 0);
915
+ cc.roundsd(xmmA, m, 0);
916
+
917
+ // SSE4.2.
918
+ cc.pcmpgtq(xmmA, m);
919
+ }
920
+ }
921
+
922
+ static void generateSseSequence(BaseEmitter& emitter, InstForm form, bool emitPrologEpilog) {
923
+ using namespace asmjit::x86;
924
+
925
+ if (emitter.isAssembler()) {
926
+ Assembler& cc = *emitter.as<Assembler>();
927
+
928
+ if (emitPrologEpilog) {
929
+ FuncDetail func;
930
+ func.init(FuncSignatureT<void, void*, const void*, size_t>(CallConvId::kHost), cc.environment());
931
+
932
+ FuncFrame frame;
933
+ frame.init(func);
934
+ frame.addDirtyRegs(eax, xmm0, xmm1, xmm2, xmm3);
935
+ frame.finalize();
936
+
937
+ cc.emitProlog(frame);
938
+ generateSseSequenceInternal(cc, form, eax, xmm0, xmm1, xmm2, xmm3);
939
+ cc.emitEpilog(frame);
940
+ }
941
+ else {
942
+ generateSseSequenceInternal(cc, form, eax, xmm0, xmm1, xmm2, xmm3);
943
+ }
944
+ }
945
+ #ifndef ASMJIT_NO_BUILDER
946
+ else if (emitter.isBuilder()) {
947
+ Builder& cc = *emitter.as<Builder>();
948
+
949
+ if (emitPrologEpilog) {
950
+ FuncDetail func;
951
+ func.init(FuncSignatureT<void, void*, const void*, size_t>(CallConvId::kHost), cc.environment());
952
+
953
+ FuncFrame frame;
954
+ frame.init(func);
955
+ frame.addDirtyRegs(eax, xmm0, xmm1, xmm2, xmm3);
956
+ frame.finalize();
957
+
958
+ cc.emitProlog(frame);
959
+ generateSseSequenceInternal(cc, form, eax, xmm0, xmm1, xmm2, xmm3);
960
+ cc.emitEpilog(frame);
961
+ }
962
+ else {
963
+ generateSseSequenceInternal(cc, form, eax, xmm0, xmm1, xmm2, xmm3);
964
+ }
965
+ }
966
+ #endif
967
+ #ifndef ASMJIT_NO_COMPILER
968
+ else if (emitter.isCompiler()) {
969
+ Compiler& cc = *emitter.as<Compiler>();
970
+
971
+ Gp gp = cc.newGpz("gp");
972
+ Xmm a = cc.newXmm("a");
973
+ Xmm b = cc.newXmm("b");
974
+ Xmm c = cc.newXmm("c");
975
+ Xmm d = cc.newXmm("d");
976
+
977
+ cc.addFunc(FuncSignatureT<void>(CallConvId::kHost));
978
+ generateSseSequenceInternal(cc, form, gp, a, b, c, d);
979
+ cc.endFunc();
980
+ }
981
+ #endif
982
+ }
983
+
984
+ // Generates a long sequence of AVX instructions.
985
+ template<typename Emitter>
986
+ static void generateAvxSequenceInternal(
987
+ Emitter& cc,
988
+ InstForm form,
989
+ const x86::Gp& gp,
990
+ const x86::Vec& vecA, const x86::Vec& vecB, const x86::Vec& vecC, const x86::Vec& vecD) {
991
+
992
+ x86::Gp gpd = gp.r32();
993
+ x86::Gp gpq = gp.r64();
994
+ x86::Gp gpz = cc.is32Bit() ? gpd : gpq;
995
+
996
+ x86::Xmm xmmA = vecA.xmm();
997
+ x86::Xmm xmmB = vecB.xmm();
998
+ x86::Xmm xmmC = vecC.xmm();
999
+ x86::Xmm xmmD = vecD.xmm();
1000
+
1001
+ x86::Ymm ymmA = vecA.ymm();
1002
+ x86::Ymm ymmB = vecB.ymm();
1003
+ x86::Ymm ymmC = vecC.ymm();
1004
+ x86::Ymm ymmD = vecD.ymm();
1005
+
1006
+ cc.xor_(gpd, gpd);
1007
+ cc.vxorps(xmmA, xmmA, xmmA);
1008
+ cc.vxorps(xmmB, xmmB, xmmB);
1009
+ cc.vxorps(xmmC, xmmC, xmmC);
1010
+ cc.vxorps(xmmD, xmmD, xmmD);
1011
+
1012
+ if (form == InstForm::kReg) {
1013
+ cc.vaddpd(xmmA, xmmB, xmmC);
1014
+ cc.vaddpd(ymmA, ymmB, ymmC);
1015
+ cc.vaddps(xmmA, xmmB, xmmC);
1016
+ cc.vaddps(ymmA, ymmB, ymmC);
1017
+ cc.vaddsd(xmmA, xmmB, xmmC);
1018
+ cc.vaddss(xmmA, xmmB, xmmC);
1019
+ cc.vaddsubpd(xmmA, xmmB, xmmC);
1020
+ cc.vaddsubpd(ymmA, ymmB, ymmC);
1021
+ cc.vaddsubps(xmmA, xmmB, xmmC);
1022
+ cc.vaddsubps(ymmA, ymmB, ymmC);
1023
+ cc.vandpd(xmmA, xmmB, xmmC);
1024
+ cc.vandpd(ymmA, ymmB, ymmC);
1025
+ cc.vandps(xmmA, xmmB, xmmC);
1026
+ cc.vandps(ymmA, ymmB, ymmC);
1027
+ cc.vandnpd(xmmA, xmmB, xmmC);
1028
+ cc.vandnpd(ymmA, ymmB, ymmC);
1029
+ cc.vandnps(xmmA, xmmB, xmmC);
1030
+ cc.vandnps(ymmA, ymmB, ymmC);
1031
+ cc.vblendpd(xmmA, xmmB, xmmC, 0);
1032
+ cc.vblendpd(ymmA, ymmB, ymmC, 0);
1033
+ cc.vblendps(xmmA, xmmB, xmmC, 0);
1034
+ cc.vblendps(ymmA, ymmB, ymmC, 0);
1035
+ cc.vblendvpd(xmmA, xmmB, xmmC, xmmA);
1036
+ cc.vblendvpd(ymmA, ymmB, ymmC, ymmA);
1037
+ cc.vcmppd(xmmA, xmmB, xmmC, 0);
1038
+ cc.vcmppd(ymmA, ymmB, ymmC, 0);
1039
+ cc.vcmpps(xmmA, xmmB, xmmC, 0);
1040
+ cc.vcmpps(ymmA, ymmB, ymmC, 0);
1041
+ cc.vcmpsd(xmmA, xmmB, xmmC, 0);
1042
+ cc.vcmpss(xmmA, xmmB, xmmC, 0);
1043
+ cc.vcomisd(xmmA, xmmB);
1044
+ cc.vcomiss(xmmA, xmmB);
1045
+ cc.vcvtdq2pd(xmmA, xmmB);
1046
+ cc.vcvtdq2pd(ymmA, xmmB);
1047
+ cc.vcvtdq2ps(xmmA, xmmB);
1048
+ cc.vcvtdq2ps(ymmA, ymmB);
1049
+ cc.vcvtpd2dq(xmmA, xmmB);
1050
+ cc.vcvtpd2dq(xmmA, ymmB);
1051
+ cc.vcvtpd2ps(xmmA, xmmB);
1052
+ cc.vcvtpd2ps(xmmA, ymmB);
1053
+ cc.vcvtps2dq(xmmA, xmmB);
1054
+ cc.vcvtps2dq(ymmA, ymmB);
1055
+ cc.vcvtps2pd(xmmA, xmmB);
1056
+ cc.vcvtps2pd(ymmA, xmmB);
1057
+ cc.vcvtsd2si(gpd, xmmB);
1058
+ cc.vcvtsd2si(gpz, xmmB);
1059
+ cc.vcvtsd2ss(xmmA, xmmB, xmmC);
1060
+ cc.vcvtsi2sd(xmmA, xmmB, gpd);
1061
+ cc.vcvtsi2sd(xmmA, xmmB, gpz);
1062
+ cc.vcvtsi2ss(xmmA, xmmB, gpd);
1063
+ cc.vcvtsi2ss(xmmA, xmmB, gpz);
1064
+ cc.vcvtss2sd(xmmA, xmmB, xmmC);
1065
+ cc.vcvtss2si(gpd, xmmB);
1066
+ cc.vcvttpd2dq(xmmA, xmmB);
1067
+ cc.vcvttpd2dq(xmmA, ymmB);
1068
+ cc.vcvttps2dq(xmmA, xmmB);
1069
+ cc.vcvttps2dq(ymmA, ymmB);
1070
+ cc.vcvttsd2si(gpd, xmmB);
1071
+ cc.vcvttss2si(gpz, xmmB);
1072
+ cc.vdivpd(xmmA, xmmB, xmmC);
1073
+ cc.vdivpd(ymmA, ymmB, ymmC);
1074
+ cc.vdivps(xmmA, xmmB, xmmC);
1075
+ cc.vdivps(ymmA, ymmB, ymmC);
1076
+ cc.vdivsd(xmmA, xmmB, xmmC);
1077
+ cc.vdivss(xmmA, xmmB, xmmC);
1078
+ cc.vdppd(xmmA, xmmB, xmmC, 0);
1079
+ cc.vdpps(xmmA, xmmB, xmmC, 0);
1080
+ cc.vdpps(ymmA, ymmB, ymmC, 0);
1081
+ cc.vextractf128(xmmA, ymmB, 0);
1082
+ cc.vextractps(gpd, xmmB, 0);
1083
+ cc.vhaddpd(xmmA, xmmB, xmmC);
1084
+ cc.vhaddpd(ymmA, ymmB, ymmC);
1085
+ cc.vhaddps(xmmA, xmmB, xmmC);
1086
+ cc.vhaddps(ymmA, ymmB, ymmC);
1087
+ cc.vhsubpd(xmmA, xmmB, xmmC);
1088
+ cc.vhsubpd(ymmA, ymmB, ymmC);
1089
+ cc.vhsubps(xmmA, xmmB, xmmC);
1090
+ cc.vhsubps(ymmA, ymmB, ymmC);
1091
+ cc.vinsertf128(ymmA, ymmB, xmmC, 0);
1092
+ cc.vinsertps(xmmA, xmmB, xmmC, 0);
1093
+ cc.vmaxpd(xmmA, xmmB, xmmC);
1094
+ cc.vmaxpd(ymmA, ymmB, ymmC);
1095
+ cc.vmaxps(xmmA, xmmB, xmmC);
1096
+ cc.vmaxps(ymmA, ymmB, ymmC);
1097
+ cc.vmaxsd(xmmA, xmmB, xmmC);
1098
+ cc.vmaxss(xmmA, xmmB, xmmC);
1099
+ cc.vminpd(xmmA, xmmB, xmmC);
1100
+ cc.vminpd(ymmA, ymmB, ymmC);
1101
+ cc.vminps(xmmA, xmmB, xmmC);
1102
+ cc.vminps(ymmA, ymmB, ymmC);
1103
+ cc.vminsd(xmmA, xmmB, xmmC);
1104
+ cc.vminss(xmmA, xmmB, xmmC);
1105
+ cc.vmovapd(xmmA, xmmB);
1106
+ cc.vmovapd(ymmA, ymmB);
1107
+ cc.vmovaps(xmmA, xmmB);
1108
+ cc.vmovaps(ymmA, ymmB);
1109
+ cc.vmovd(xmmA, gpd);
1110
+ cc.vmovd(gpd, xmmB);
1111
+ cc.vmovddup(xmmA, xmmB);
1112
+ cc.vmovddup(ymmA, ymmB);
1113
+ cc.vmovdqa(xmmA, xmmB);
1114
+ cc.vmovdqa(ymmA, ymmB);
1115
+ cc.vmovdqu(xmmA, xmmB);
1116
+ cc.vmovdqu(ymmA, ymmB);
1117
+ cc.vmovhlps(xmmA, xmmB, xmmC);
1118
+ cc.vmovlhps(xmmA, xmmB, xmmC);
1119
+ cc.vmovmskpd(gpd, xmmB);
1120
+ cc.vmovmskpd(gpd, ymmB);
1121
+ cc.vmovmskps(gpd, xmmB);
1122
+ cc.vmovmskps(gpd, ymmB);
1123
+ cc.vmovsd(xmmA, xmmB, xmmC);
1124
+ cc.vmovshdup(xmmA, xmmB);
1125
+ cc.vmovshdup(ymmA, ymmB);
1126
+ cc.vmovsldup(xmmA, xmmB);
1127
+ cc.vmovsldup(ymmA, ymmB);
1128
+ cc.vmovss(xmmA, xmmB, xmmC);
1129
+ cc.vmovupd(xmmA, xmmB);
1130
+ cc.vmovupd(ymmA, ymmB);
1131
+ cc.vmovups(xmmA, xmmB);
1132
+ cc.vmovups(ymmA, ymmB);
1133
+ cc.vmpsadbw(xmmA, xmmB, xmmC, 0);
1134
+ cc.vmulpd(xmmA, xmmB, xmmC);
1135
+ cc.vmulpd(ymmA, ymmB, ymmC);
1136
+ cc.vmulps(xmmA, xmmB, xmmC);
1137
+ cc.vmulps(ymmA, ymmB, ymmC);
1138
+ cc.vmulsd(xmmA, xmmB, xmmC);
1139
+ cc.vmulss(xmmA, xmmB, xmmC);
1140
+ cc.vorpd(xmmA, xmmB, xmmC);
1141
+ cc.vorpd(ymmA, ymmB, ymmC);
1142
+ cc.vorps(xmmA, xmmB, xmmC);
1143
+ cc.vorps(ymmA, ymmB, ymmC);
1144
+ cc.vpabsb(xmmA, xmmB);
1145
+ cc.vpabsd(xmmA, xmmB);
1146
+ cc.vpabsw(xmmA, xmmB);
1147
+ cc.vpackssdw(xmmA, xmmB, xmmC);
1148
+ cc.vpacksswb(xmmA, xmmB, xmmC);
1149
+ cc.vpackusdw(xmmA, xmmB, xmmC);
1150
+ cc.vpackuswb(xmmA, xmmB, xmmC);
1151
+ cc.vpaddb(xmmA, xmmB, xmmC);
1152
+ cc.vpaddd(xmmA, xmmB, xmmC);
1153
+ cc.vpaddq(xmmA, xmmB, xmmC);
1154
+ cc.vpaddw(xmmA, xmmB, xmmC);
1155
+ cc.vpaddsb(xmmA, xmmB, xmmC);
1156
+ cc.vpaddsw(xmmA, xmmB, xmmC);
1157
+ cc.vpaddusb(xmmA, xmmB, xmmC);
1158
+ cc.vpaddusw(xmmA, xmmB, xmmC);
1159
+ cc.vpalignr(xmmA, xmmB, xmmC, 0);
1160
+ cc.vpand(xmmA, xmmB, xmmC);
1161
+ cc.vpandn(xmmA, xmmB, xmmC);
1162
+ cc.vpavgb(xmmA, xmmB, xmmC);
1163
+ cc.vpavgw(xmmA, xmmB, xmmC);
1164
+ cc.vpblendvb(xmmA, xmmB, xmmC, xmmA);
1165
+ cc.vpblendw(xmmA, xmmB, xmmC, 0);
1166
+ cc.vpcmpeqb(xmmA, xmmB, xmmC);
1167
+ cc.vpcmpeqd(xmmA, xmmB, xmmC);
1168
+ cc.vpcmpeqq(xmmA, xmmB, xmmC);
1169
+ cc.vpcmpeqw(xmmA, xmmB, xmmC);
1170
+ cc.vpcmpgtb(xmmA, xmmB, xmmC);
1171
+ cc.vpcmpgtd(xmmA, xmmB, xmmC);
1172
+ cc.vpcmpgtq(xmmA, xmmB, xmmC);
1173
+ cc.vpcmpgtw(xmmA, xmmB, xmmC);
1174
+ cc.vpermilpd(xmmA, xmmB, xmmC);
1175
+ cc.vpermilpd(ymmA, ymmB, ymmC);
1176
+ cc.vpermilpd(xmmA, xmmB, 0);
1177
+ cc.vpermilpd(ymmA, ymmB, 0);
1178
+ cc.vpermilps(xmmA, xmmB, xmmC);
1179
+ cc.vpermilps(ymmA, ymmB, ymmC);
1180
+ cc.vpermilps(xmmA, xmmB, 0);
1181
+ cc.vpermilps(ymmA, ymmB, 0);
1182
+ cc.vperm2f128(ymmA, ymmB, ymmC, 0);
1183
+ cc.vpextrb(gpd, xmmB, 0);
1184
+ cc.vpextrd(gpd, xmmB, 0);
1185
+ if (cc.is64Bit()) cc.vpextrq(gpq, xmmB, 0);
1186
+ cc.vpextrw(gpd, xmmB, 0);
1187
+ cc.vphaddd(xmmA, xmmB, xmmC);
1188
+ cc.vphaddsw(xmmA, xmmB, xmmC);
1189
+ cc.vphaddw(xmmA, xmmB, xmmC);
1190
+ cc.vphminposuw(xmmA, xmmB);
1191
+ cc.vphsubd(xmmA, xmmB, xmmC);
1192
+ cc.vphsubsw(xmmA, xmmB, xmmC);
1193
+ cc.vphsubw(xmmA, xmmB, xmmC);
1194
+ cc.vpinsrb(xmmA, xmmB, gpd, 0);
1195
+ cc.vpinsrd(xmmA, xmmB, gpd, 0);
1196
+ cc.vpinsrw(xmmA, xmmB, gpd, 0);
1197
+ cc.vpmaddubsw(xmmA, xmmB, xmmC);
1198
+ cc.vpmaddwd(xmmA, xmmB, xmmC);
1199
+ cc.vpmaxsb(xmmA, xmmB, xmmC);
1200
+ cc.vpmaxsd(xmmA, xmmB, xmmC);
1201
+ cc.vpmaxsw(xmmA, xmmB, xmmC);
1202
+ cc.vpmaxub(xmmA, xmmB, xmmC);
1203
+ cc.vpmaxud(xmmA, xmmB, xmmC);
1204
+ cc.vpmaxuw(xmmA, xmmB, xmmC);
1205
+ cc.vpminsb(xmmA, xmmB, xmmC);
1206
+ cc.vpminsd(xmmA, xmmB, xmmC);
1207
+ cc.vpminsw(xmmA, xmmB, xmmC);
1208
+ cc.vpminub(xmmA, xmmB, xmmC);
1209
+ cc.vpminud(xmmA, xmmB, xmmC);
1210
+ cc.vpminuw(xmmA, xmmB, xmmC);
1211
+ cc.vpmovmskb(gpd, xmmB);
1212
+ cc.vpmovsxbd(xmmA, xmmB);
1213
+ cc.vpmovsxbq(xmmA, xmmB);
1214
+ cc.vpmovsxbw(xmmA, xmmB);
1215
+ cc.vpmovsxdq(xmmA, xmmB);
1216
+ cc.vpmovsxwd(xmmA, xmmB);
1217
+ cc.vpmovsxwq(xmmA, xmmB);
1218
+ cc.vpmovzxbd(xmmA, xmmB);
1219
+ cc.vpmovzxbq(xmmA, xmmB);
1220
+ cc.vpmovzxbw(xmmA, xmmB);
1221
+ cc.vpmovzxdq(xmmA, xmmB);
1222
+ cc.vpmovzxwd(xmmA, xmmB);
1223
+ cc.vpmovzxwq(xmmA, xmmB);
1224
+ cc.vpmuldq(xmmA, xmmB, xmmC);
1225
+ cc.vpmulhrsw(xmmA, xmmB, xmmC);
1226
+ cc.vpmulhuw(xmmA, xmmB, xmmC);
1227
+ cc.vpmulhw(xmmA, xmmB, xmmC);
1228
+ cc.vpmulld(xmmA, xmmB, xmmC);
1229
+ cc.vpmullw(xmmA, xmmB, xmmC);
1230
+ cc.vpmuludq(xmmA, xmmB, xmmC);
1231
+ cc.vpor(xmmA, xmmB, xmmC);
1232
+ cc.vpsadbw(xmmA, xmmB, xmmC);
1233
+ cc.vpshufb(xmmA, xmmB, xmmC);
1234
+ cc.vpshufd(xmmA, xmmB, 0);
1235
+ cc.vpshufhw(xmmA, xmmB, 0);
1236
+ cc.vpshuflw(xmmA, xmmB, 0);
1237
+ cc.vpsignb(xmmA, xmmB, xmmC);
1238
+ cc.vpsignd(xmmA, xmmB, xmmC);
1239
+ cc.vpsignw(xmmA, xmmB, xmmC);
1240
+ cc.vpslld(xmmA, xmmB, xmmC);
1241
+ cc.vpslld(xmmA, xmmB, 0);
1242
+ cc.vpslldq(xmmA, xmmB, 0);
1243
+ cc.vpsllq(xmmA, xmmB, xmmC);
1244
+ cc.vpsllq(xmmA, xmmB, 0);
1245
+ cc.vpsllw(xmmA, xmmB, xmmC);
1246
+ cc.vpsllw(xmmA, xmmB, 0);
1247
+ cc.vpsrad(xmmA, xmmB, xmmC);
1248
+ cc.vpsrad(xmmA, xmmB, 0);
1249
+ cc.vpsraw(xmmA, xmmB, xmmC);
1250
+ cc.vpsraw(xmmA, xmmB, 0);
1251
+ cc.vpsrld(xmmA, xmmB, xmmC);
1252
+ cc.vpsrld(xmmA, xmmB, 0);
1253
+ cc.vpsrldq(xmmA, xmmB, 0);
1254
+ cc.vpsrlq(xmmA, xmmB, xmmC);
1255
+ cc.vpsrlq(xmmA, xmmB, 0);
1256
+ cc.vpsrlw(xmmA, xmmB, xmmC);
1257
+ cc.vpsrlw(xmmA, xmmB, 0);
1258
+ cc.vpsubb(xmmA, xmmB, xmmC);
1259
+ cc.vpsubd(xmmA, xmmB, xmmC);
1260
+ cc.vpsubq(xmmA, xmmB, xmmC);
1261
+ cc.vpsubw(xmmA, xmmB, xmmC);
1262
+ cc.vpsubsb(xmmA, xmmB, xmmC);
1263
+ cc.vpsubsw(xmmA, xmmB, xmmC);
1264
+ cc.vpsubusb(xmmA, xmmB, xmmC);
1265
+ cc.vpsubusw(xmmA, xmmB, xmmC);
1266
+ cc.vptest(xmmA, xmmB);
1267
+ cc.vptest(ymmA, ymmB);
1268
+ cc.vpunpckhbw(xmmA, xmmB, xmmC);
1269
+ cc.vpunpckhdq(xmmA, xmmB, xmmC);
1270
+ cc.vpunpckhqdq(xmmA, xmmB, xmmC);
1271
+ cc.vpunpckhwd(xmmA, xmmB, xmmC);
1272
+ cc.vpunpcklbw(xmmA, xmmB, xmmC);
1273
+ cc.vpunpckldq(xmmA, xmmB, xmmC);
1274
+ cc.vpunpcklqdq(xmmA, xmmB, xmmC);
1275
+ cc.vpunpcklwd(xmmA, xmmB, xmmC);
1276
+ cc.vpxor(xmmA, xmmB, xmmC);
1277
+ cc.vrcpps(xmmA, xmmB);
1278
+ cc.vrcpps(ymmA, ymmB);
1279
+ cc.vrcpss(xmmA, xmmB, xmmC);
1280
+ cc.vrsqrtps(xmmA, xmmB);
1281
+ cc.vrsqrtps(ymmA, ymmB);
1282
+ cc.vrsqrtss(xmmA, xmmB, xmmC);
1283
+ cc.vroundpd(xmmA, xmmB, 0);
1284
+ cc.vroundpd(ymmA, ymmB, 0);
1285
+ cc.vroundps(xmmA, xmmB, 0);
1286
+ cc.vroundps(ymmA, ymmB, 0);
1287
+ cc.vroundsd(xmmA, xmmB, xmmC, 0);
1288
+ cc.vroundss(xmmA, xmmB, xmmC, 0);
1289
+ cc.vshufpd(xmmA, xmmB, xmmC, 0);
1290
+ cc.vshufpd(ymmA, ymmB, ymmC, 0);
1291
+ cc.vshufps(xmmA, xmmB, xmmC, 0);
1292
+ cc.vshufps(ymmA, ymmB, ymmC, 0);
1293
+ cc.vsqrtpd(xmmA, xmmB);
1294
+ cc.vsqrtpd(ymmA, ymmB);
1295
+ cc.vsqrtps(xmmA, xmmB);
1296
+ cc.vsqrtps(ymmA, ymmB);
1297
+ cc.vsqrtsd(xmmA, xmmB, xmmC);
1298
+ cc.vsqrtss(xmmA, xmmB, xmmC);
1299
+ cc.vsubpd(xmmA, xmmB, xmmC);
1300
+ cc.vsubpd(ymmA, ymmB, ymmC);
1301
+ cc.vsubps(xmmA, xmmB, xmmC);
1302
+ cc.vsubps(ymmA, ymmB, ymmC);
1303
+ cc.vsubsd(xmmA, xmmB, xmmC);
1304
+ cc.vsubss(xmmA, xmmB, xmmC);
1305
+ cc.vtestps(xmmA, xmmB);
1306
+ cc.vtestps(ymmA, ymmB);
1307
+ cc.vtestpd(xmmA, xmmB);
1308
+ cc.vtestpd(ymmA, ymmB);
1309
+ cc.vucomisd(xmmA, xmmB);
1310
+ cc.vucomiss(xmmA, xmmB);
1311
+ cc.vunpckhpd(xmmA, xmmB, xmmC);
1312
+ cc.vunpckhpd(ymmA, ymmB, ymmC);
1313
+ cc.vunpckhps(xmmA, xmmB, xmmC);
1314
+ cc.vunpckhps(ymmA, ymmB, ymmC);
1315
+ cc.vunpcklpd(xmmA, xmmB, xmmC);
1316
+ cc.vunpcklpd(ymmA, ymmB, ymmC);
1317
+ cc.vunpcklps(xmmA, xmmB, xmmC);
1318
+ cc.vunpcklps(ymmA, ymmB, ymmC);
1319
+ cc.vxorpd(xmmA, xmmB, xmmC);
1320
+ cc.vxorpd(ymmA, ymmB, ymmC);
1321
+ cc.vxorps(xmmA, xmmB, xmmC);
1322
+ cc.vxorps(ymmA, ymmB, ymmC);
1323
+
1324
+ // AVX+AESNI.
1325
+ cc.vaesdec(xmmA, xmmB, xmmC);
1326
+ cc.vaesdeclast(xmmA, xmmB, xmmC);
1327
+ cc.vaesenc(xmmA, xmmB, xmmC);
1328
+ cc.vaesenclast(xmmA, xmmB, xmmC);
1329
+ cc.vaesimc(xmmA, xmmB);
1330
+ cc.vaeskeygenassist(xmmA, xmmB, 0);
1331
+
1332
+ // AVX+PCLMULQDQ.
1333
+ cc.vpclmulqdq(xmmA, xmmB, xmmC, 0);
1334
+
1335
+ // AVX2.
1336
+ cc.vbroadcastsd(ymmA, xmmB);
1337
+ cc.vbroadcastss(xmmA, xmmB);
1338
+ cc.vbroadcastss(ymmA, xmmB);
1339
+ cc.vextracti128(xmmA, ymmB, 0);
1340
+ cc.vinserti128(ymmA, ymmB, xmmC, 0);
1341
+ cc.vmpsadbw(ymmA, ymmB, ymmC, 0);
1342
+ cc.vpabsb(ymmA, ymmB);
1343
+ cc.vpabsd(ymmA, ymmB);
1344
+ cc.vpabsw(ymmA, ymmB);
1345
+ cc.vpackssdw(ymmA, ymmB, ymmC);
1346
+ cc.vpacksswb(ymmA, ymmB, ymmC);
1347
+ cc.vpackusdw(ymmA, ymmB, ymmC);
1348
+ cc.vpackuswb(ymmA, ymmB, ymmC);
1349
+ cc.vpaddb(ymmA, ymmB, ymmC);
1350
+ cc.vpaddd(ymmA, ymmB, ymmC);
1351
+ cc.vpaddq(ymmA, ymmB, ymmC);
1352
+ cc.vpaddw(ymmA, ymmB, ymmC);
1353
+ cc.vpaddsb(ymmA, ymmB, ymmC);
1354
+ cc.vpaddsw(ymmA, ymmB, ymmC);
1355
+ cc.vpaddusb(ymmA, ymmB, ymmC);
1356
+ cc.vpaddusw(ymmA, ymmB, ymmC);
1357
+ cc.vpalignr(ymmA, ymmB, ymmC, 0);
1358
+ cc.vpand(ymmA, ymmB, ymmC);
1359
+ cc.vpandn(ymmA, ymmB, ymmC);
1360
+ cc.vpavgb(ymmA, ymmB, ymmC);
1361
+ cc.vpavgw(ymmA, ymmB, ymmC);
1362
+ cc.vpblendd(xmmA, xmmB, xmmC, 0);
1363
+ cc.vpblendd(ymmA, ymmB, ymmC, 0);
1364
+ cc.vpblendvb(ymmA, ymmB, ymmC, ymmA);
1365
+ cc.vpblendw(ymmA, ymmB, ymmC, 0);
1366
+ cc.vpbroadcastb(xmmA, xmmB);
1367
+ cc.vpbroadcastb(ymmA, xmmB);
1368
+ cc.vpbroadcastd(xmmA, xmmB);
1369
+ cc.vpbroadcastd(ymmA, xmmB);
1370
+ cc.vpbroadcastq(xmmA, xmmB);
1371
+ cc.vpbroadcastq(ymmA, xmmB);
1372
+ cc.vpbroadcastw(xmmA, xmmB);
1373
+ cc.vpbroadcastw(ymmA, xmmB);
1374
+ cc.vpcmpeqb(ymmA, ymmB, ymmC);
1375
+ cc.vpcmpeqd(ymmA, ymmB, ymmC);
1376
+ cc.vpcmpeqq(ymmA, ymmB, ymmC);
1377
+ cc.vpcmpeqw(ymmA, ymmB, ymmC);
1378
+ cc.vpcmpgtb(ymmA, ymmB, ymmC);
1379
+ cc.vpcmpgtd(ymmA, ymmB, ymmC);
1380
+ cc.vpcmpgtq(ymmA, ymmB, ymmC);
1381
+ cc.vpcmpgtw(ymmA, ymmB, ymmC);
1382
+ cc.vperm2i128(ymmA, ymmB, ymmC, 0);
1383
+ cc.vpermd(ymmA, ymmB, ymmC);
1384
+ cc.vpermps(ymmA, ymmB, ymmC);
1385
+ cc.vpermpd(ymmA, ymmB, 0);
1386
+ cc.vpermq(ymmA, ymmB, 0);
1387
+ cc.vpmovmskb(gpd, ymmB);
1388
+ cc.vpmovsxbd(ymmA, xmmB);
1389
+ cc.vpmovsxbq(ymmA, xmmB);
1390
+ cc.vpmovsxbw(ymmA, xmmB);
1391
+ cc.vpmovsxdq(ymmA, xmmB);
1392
+ cc.vpmovsxwd(ymmA, xmmB);
1393
+ cc.vpmovsxwq(ymmA, xmmB);
1394
+ cc.vpmovzxbd(ymmA, xmmB);
1395
+ cc.vpmovzxbq(ymmA, xmmB);
1396
+ cc.vpmovzxbw(ymmA, xmmB);
1397
+ cc.vpmovzxdq(ymmA, xmmB);
1398
+ cc.vpmovzxwd(ymmA, xmmB);
1399
+ cc.vpmovzxwq(ymmA, xmmB);
1400
+ cc.vpshufd(ymmA, ymmB, 0);
1401
+ cc.vpshufhw(ymmA, ymmB, 0);
1402
+ cc.vpshuflw(ymmA, ymmB, 0);
1403
+ cc.vpslld(ymmA, ymmB, 0);
1404
+ cc.vpslldq(ymmA, ymmB, 0);
1405
+ cc.vpsllq(ymmA, ymmB, 0);
1406
+ cc.vpsllw(ymmA, ymmB, 0);
1407
+ cc.vpsrad(ymmA, ymmB, 0);
1408
+ cc.vpsraw(ymmA, ymmB, 0);
1409
+ cc.vpsrld(ymmA, ymmB, 0);
1410
+ cc.vpsrldq(ymmA, ymmB, 0);
1411
+ cc.vpsrlq(ymmA, ymmB, 0);
1412
+ cc.vpsrlw(ymmA, ymmB, 0);
1413
+ cc.vphaddd(ymmA, ymmB, ymmC);
1414
+ cc.vphaddsw(ymmA, ymmB, ymmC);
1415
+ cc.vphaddw(ymmA, ymmB, ymmC);
1416
+ cc.vphsubd(ymmA, ymmB, ymmC);
1417
+ cc.vphsubsw(ymmA, ymmB, ymmC);
1418
+ cc.vphsubw(ymmA, ymmB, ymmC);
1419
+ cc.vpmaddubsw(ymmA, ymmB, ymmC);
1420
+ cc.vpmaddwd(ymmA, ymmB, ymmC);
1421
+ cc.vpmaxsb(ymmA, ymmB, ymmC);
1422
+ cc.vpmaxsd(ymmA, ymmB, ymmC);
1423
+ cc.vpmaxsw(ymmA, ymmB, ymmC);
1424
+ cc.vpmaxub(ymmA, ymmB, ymmC);
1425
+ cc.vpmaxud(ymmA, ymmB, ymmC);
1426
+ cc.vpmaxuw(ymmA, ymmB, ymmC);
1427
+ cc.vpminsb(ymmA, ymmB, ymmC);
1428
+ cc.vpminsd(ymmA, ymmB, ymmC);
1429
+ cc.vpminsw(ymmA, ymmB, ymmC);
1430
+ cc.vpminub(ymmA, ymmB, ymmC);
1431
+ cc.vpminud(ymmA, ymmB, ymmC);
1432
+ cc.vpminuw(ymmA, ymmB, ymmC);
1433
+ cc.vpmuldq(ymmA, ymmB, ymmC);
1434
+ cc.vpmulhrsw(ymmA, ymmB, ymmC);
1435
+ cc.vpmulhuw(ymmA, ymmB, ymmC);
1436
+ cc.vpmulhw(ymmA, ymmB, ymmC);
1437
+ cc.vpmulld(ymmA, ymmB, ymmC);
1438
+ cc.vpmullw(ymmA, ymmB, ymmC);
1439
+ cc.vpmuludq(ymmA, ymmB, ymmC);
1440
+ cc.vpor(ymmA, ymmB, ymmC);
1441
+ cc.vpsadbw(ymmA, ymmB, ymmC);
1442
+ cc.vpshufb(ymmA, ymmB, ymmC);
1443
+ cc.vpsignb(ymmA, ymmB, ymmC);
1444
+ cc.vpsignd(ymmA, ymmB, ymmC);
1445
+ cc.vpsignw(ymmA, ymmB, ymmC);
1446
+ cc.vpslld(ymmA, ymmB, xmmC);
1447
+ cc.vpsllq(ymmA, ymmB, xmmC);
1448
+ cc.vpsllvd(xmmA, xmmB, xmmC);
1449
+ cc.vpsllvd(ymmA, ymmB, ymmC);
1450
+ cc.vpsllvq(xmmA, xmmB, xmmC);
1451
+ cc.vpsllvq(ymmA, ymmB, ymmC);
1452
+ cc.vpsllw(ymmA, ymmB, xmmC);
1453
+ cc.vpsrad(ymmA, ymmB, xmmC);
1454
+ cc.vpsravd(xmmA, xmmB, xmmC);
1455
+ cc.vpsravd(ymmA, ymmB, ymmC);
1456
+ cc.vpsraw(ymmA, ymmB, xmmC);
1457
+ cc.vpsrld(ymmA, ymmB, xmmC);
1458
+ cc.vpsrlq(ymmA, ymmB, xmmC);
1459
+ cc.vpsrlvd(xmmA, xmmB, xmmC);
1460
+ cc.vpsrlvd(ymmA, ymmB, ymmC);
1461
+ cc.vpsrlvq(xmmA, xmmB, xmmC);
1462
+ cc.vpsrlvq(ymmA, ymmB, ymmC);
1463
+ cc.vpsrlw(ymmA, ymmB, xmmC);
1464
+ cc.vpsubb(ymmA, ymmB, ymmC);
1465
+ cc.vpsubd(ymmA, ymmB, ymmC);
1466
+ cc.vpsubq(ymmA, ymmB, ymmC);
1467
+ cc.vpsubsb(ymmA, ymmB, ymmC);
1468
+ cc.vpsubsw(ymmA, ymmB, ymmC);
1469
+ cc.vpsubusb(ymmA, ymmB, ymmC);
1470
+ cc.vpsubusw(ymmA, ymmB, ymmC);
1471
+ cc.vpsubw(ymmA, ymmB, ymmC);
1472
+ cc.vpunpckhbw(ymmA, ymmB, ymmC);
1473
+ cc.vpunpckhdq(ymmA, ymmB, ymmC);
1474
+ cc.vpunpckhqdq(ymmA, ymmB, ymmC);
1475
+ cc.vpunpckhwd(ymmA, ymmB, ymmC);
1476
+ cc.vpunpcklbw(ymmA, ymmB, ymmC);
1477
+ cc.vpunpckldq(ymmA, ymmB, ymmC);
1478
+ cc.vpunpcklqdq(ymmA, ymmB, ymmC);
1479
+ cc.vpunpcklwd(ymmA, ymmB, ymmC);
1480
+ cc.vpxor(ymmA, ymmB, ymmC);
1481
+
1482
+ // FMA.
1483
+ cc.vfmadd132pd(xmmA, xmmB, xmmC);
1484
+ cc.vfmadd132pd(ymmA, ymmB, ymmC);
1485
+ cc.vfmadd132ps(xmmA, xmmB, xmmC);
1486
+ cc.vfmadd132ps(ymmA, ymmB, ymmC);
1487
+ cc.vfmadd132sd(xmmA, xmmB, xmmC);
1488
+ cc.vfmadd132ss(xmmA, xmmB, xmmC);
1489
+ cc.vfmadd213pd(xmmA, xmmB, xmmC);
1490
+ cc.vfmadd213pd(ymmA, ymmB, ymmC);
1491
+ cc.vfmadd213ps(xmmA, xmmB, xmmC);
1492
+ cc.vfmadd213ps(ymmA, ymmB, ymmC);
1493
+ cc.vfmadd213sd(xmmA, xmmB, xmmC);
1494
+ cc.vfmadd213ss(xmmA, xmmB, xmmC);
1495
+ cc.vfmadd231pd(xmmA, xmmB, xmmC);
1496
+ cc.vfmadd231pd(ymmA, ymmB, ymmC);
1497
+ cc.vfmadd231ps(xmmA, xmmB, xmmC);
1498
+ cc.vfmadd231ps(ymmA, ymmB, ymmC);
1499
+ cc.vfmadd231sd(xmmA, xmmB, xmmC);
1500
+ cc.vfmadd231ss(xmmA, xmmB, xmmC);
1501
+ cc.vfmaddsub132pd(xmmA, xmmB, xmmC);
1502
+ cc.vfmaddsub132pd(ymmA, ymmB, ymmC);
1503
+ cc.vfmaddsub132ps(xmmA, xmmB, xmmC);
1504
+ cc.vfmaddsub132ps(ymmA, ymmB, ymmC);
1505
+ cc.vfmaddsub213pd(xmmA, xmmB, xmmC);
1506
+ cc.vfmaddsub213pd(ymmA, ymmB, ymmC);
1507
+ cc.vfmaddsub213ps(xmmA, xmmB, xmmC);
1508
+ cc.vfmaddsub213ps(ymmA, ymmB, ymmC);
1509
+ cc.vfmaddsub231pd(xmmA, xmmB, xmmC);
1510
+ cc.vfmaddsub231pd(ymmA, ymmB, ymmC);
1511
+ cc.vfmaddsub231ps(xmmA, xmmB, xmmC);
1512
+ cc.vfmaddsub231ps(ymmA, ymmB, ymmC);
1513
+ cc.vfmsub132pd(xmmA, xmmB, xmmC);
1514
+ cc.vfmsub132pd(ymmA, ymmB, ymmC);
1515
+ cc.vfmsub132ps(xmmA, xmmB, xmmC);
1516
+ cc.vfmsub132ps(ymmA, ymmB, ymmC);
1517
+ cc.vfmsub132sd(xmmA, xmmB, xmmC);
1518
+ cc.vfmsub132ss(xmmA, xmmB, xmmC);
1519
+ cc.vfmsub213pd(xmmA, xmmB, xmmC);
1520
+ cc.vfmsub213pd(ymmA, ymmB, ymmC);
1521
+ cc.vfmsub213ps(xmmA, xmmB, xmmC);
1522
+ cc.vfmsub213ps(ymmA, ymmB, ymmC);
1523
+ cc.vfmsub213sd(xmmA, xmmB, xmmC);
1524
+ cc.vfmsub213ss(xmmA, xmmB, xmmC);
1525
+ cc.vfmsub231pd(xmmA, xmmB, xmmC);
1526
+ cc.vfmsub231pd(ymmA, ymmB, ymmC);
1527
+ cc.vfmsub231ps(xmmA, xmmB, xmmC);
1528
+ cc.vfmsub231ps(ymmA, ymmB, ymmC);
1529
+ cc.vfmsub231sd(xmmA, xmmB, xmmC);
1530
+ cc.vfmsub231ss(xmmA, xmmB, xmmC);
1531
+ cc.vfmsubadd132pd(xmmA, xmmB, xmmC);
1532
+ cc.vfmsubadd132pd(ymmA, ymmB, ymmC);
1533
+ cc.vfmsubadd132ps(xmmA, xmmB, xmmC);
1534
+ cc.vfmsubadd132ps(ymmA, ymmB, ymmC);
1535
+ cc.vfmsubadd213pd(xmmA, xmmB, xmmC);
1536
+ cc.vfmsubadd213pd(ymmA, ymmB, ymmC);
1537
+ cc.vfmsubadd213ps(xmmA, xmmB, xmmC);
1538
+ cc.vfmsubadd213ps(ymmA, ymmB, ymmC);
1539
+ cc.vfmsubadd231pd(xmmA, xmmB, xmmC);
1540
+ cc.vfmsubadd231pd(ymmA, ymmB, ymmC);
1541
+ cc.vfmsubadd231ps(xmmA, xmmB, xmmC);
1542
+ cc.vfmsubadd231ps(ymmA, ymmB, ymmC);
1543
+ cc.vfnmadd132pd(xmmA, xmmB, xmmC);
1544
+ cc.vfnmadd132pd(ymmA, ymmB, ymmC);
1545
+ cc.vfnmadd132ps(xmmA, xmmB, xmmC);
1546
+ cc.vfnmadd132ps(ymmA, ymmB, ymmC);
1547
+ cc.vfnmadd132sd(xmmA, xmmB, xmmC);
1548
+ cc.vfnmadd132ss(xmmA, xmmB, xmmC);
1549
+ cc.vfnmadd213pd(xmmA, xmmB, xmmC);
1550
+ cc.vfnmadd213pd(ymmA, ymmB, ymmC);
1551
+ cc.vfnmadd213ps(xmmA, xmmB, xmmC);
1552
+ cc.vfnmadd213ps(ymmA, ymmB, ymmC);
1553
+ cc.vfnmadd213sd(xmmA, xmmB, xmmC);
1554
+ cc.vfnmadd213ss(xmmA, xmmB, xmmC);
1555
+ cc.vfnmadd231pd(xmmA, xmmB, xmmC);
1556
+ cc.vfnmadd231pd(ymmA, ymmB, ymmC);
1557
+ cc.vfnmadd231ps(xmmA, xmmB, xmmC);
1558
+ cc.vfnmadd231ps(ymmA, ymmB, ymmC);
1559
+ cc.vfnmadd231sd(xmmA, xmmB, xmmC);
1560
+ cc.vfnmadd231ss(xmmA, xmmB, xmmC);
1561
+ cc.vfnmsub132pd(xmmA, xmmB, xmmC);
1562
+ cc.vfnmsub132pd(ymmA, ymmB, ymmC);
1563
+ cc.vfnmsub132ps(xmmA, xmmB, xmmC);
1564
+ cc.vfnmsub132ps(ymmA, ymmB, ymmC);
1565
+ cc.vfnmsub132sd(xmmA, xmmB, xmmC);
1566
+ cc.vfnmsub132ss(xmmA, xmmB, xmmC);
1567
+ cc.vfnmsub213pd(xmmA, xmmB, xmmC);
1568
+ cc.vfnmsub213pd(ymmA, ymmB, ymmC);
1569
+ cc.vfnmsub213ps(xmmA, xmmB, xmmC);
1570
+ cc.vfnmsub213ps(ymmA, ymmB, ymmC);
1571
+ cc.vfnmsub213sd(xmmA, xmmB, xmmC);
1572
+ cc.vfnmsub213ss(xmmA, xmmB, xmmC);
1573
+ cc.vfnmsub231pd(xmmA, xmmB, xmmC);
1574
+ cc.vfnmsub231pd(ymmA, ymmB, ymmC);
1575
+ cc.vfnmsub231ps(xmmA, xmmB, xmmC);
1576
+ cc.vfnmsub231ps(ymmA, ymmB, ymmC);
1577
+ cc.vfnmsub231sd(xmmA, xmmB, xmmC);
1578
+ cc.vfnmsub231ss(xmmA, xmmB, xmmC);
1579
+ }
1580
+ else {
1581
+ x86::Mem m = x86::ptr(gpz);
1582
+ x86::Mem m128 = x86::xmmword_ptr(gpz);
1583
+ x86::Mem m256 = x86::xmmword_ptr(gpz);
1584
+ x86::Mem vx_ptr = x86::ptr(gpz, xmmD);
1585
+ x86::Mem vy_ptr = x86::ptr(gpz, ymmD);
1586
+
1587
+ cc.vaddpd(xmmA, xmmB, m);
1588
+ cc.vaddpd(ymmA, ymmB, m);
1589
+ cc.vaddps(xmmA, xmmB, m);
1590
+ cc.vaddps(ymmA, ymmB, m);
1591
+ cc.vaddsd(xmmA, xmmB, m);
1592
+ cc.vaddss(xmmA, xmmB, m);
1593
+ cc.vaddsubpd(xmmA, xmmB, m);
1594
+ cc.vaddsubpd(ymmA, ymmB, m);
1595
+ cc.vaddsubps(xmmA, xmmB, m);
1596
+ cc.vaddsubps(ymmA, ymmB, m);
1597
+ cc.vandpd(xmmA, xmmB, m);
1598
+ cc.vandpd(ymmA, ymmB, m);
1599
+ cc.vandps(xmmA, xmmB, m);
1600
+ cc.vandps(ymmA, ymmB, m);
1601
+ cc.vandnpd(xmmA, xmmB, m);
1602
+ cc.vandnpd(ymmA, ymmB, m);
1603
+ cc.vandnps(xmmA, xmmB, m);
1604
+ cc.vandnps(ymmA, ymmB, m);
1605
+ cc.vblendpd(xmmA, xmmB, m, 0);
1606
+ cc.vblendpd(ymmA, ymmB, m, 0);
1607
+ cc.vblendps(xmmA, xmmB, m, 0);
1608
+ cc.vblendps(ymmA, ymmB, m, 0);
1609
+ cc.vblendvpd(xmmA, xmmB, m, xmmA);
1610
+ cc.vblendvpd(ymmA, ymmB, m, ymmA);
1611
+ cc.vbroadcastf128(ymmA, m);
1612
+ cc.vbroadcastsd(ymmA, m);
1613
+ cc.vbroadcastss(xmmA, m);
1614
+ cc.vbroadcastss(ymmA, m);
1615
+ cc.vcmppd(xmmA, xmmB, m, 0);
1616
+ cc.vcmppd(ymmA, ymmB, m, 0);
1617
+ cc.vcmpps(xmmA, xmmB, m, 0);
1618
+ cc.vcmpps(ymmA, ymmB, m, 0);
1619
+ cc.vcmpsd(xmmA, xmmB, m, 0);
1620
+ cc.vcmpss(xmmA, xmmB, m, 0);
1621
+ cc.vcomisd(xmmA, m);
1622
+ cc.vcomiss(xmmA, m);
1623
+ cc.vcvtdq2pd(xmmA, m);
1624
+ cc.vcvtdq2pd(ymmA, m);
1625
+ cc.vcvtdq2ps(xmmA, m);
1626
+ cc.vcvtdq2ps(ymmA, m);
1627
+ cc.vcvtpd2dq(xmmA, m128);
1628
+ cc.vcvtpd2dq(xmmA, m256);
1629
+ cc.vcvtpd2ps(xmmA, m128);
1630
+ cc.vcvtpd2ps(xmmA, m256);
1631
+ cc.vcvtps2dq(xmmA, m);
1632
+ cc.vcvtps2dq(ymmA, m);
1633
+ cc.vcvtps2pd(xmmA, m);
1634
+ cc.vcvtps2pd(ymmA, m);
1635
+ cc.vcvtsd2si(gpd, m);
1636
+ cc.vcvtsd2ss(xmmA, xmmB, m);
1637
+ cc.vcvtsi2sd(xmmA, xmmB, m);
1638
+ cc.vcvtsi2ss(xmmA, xmmB, m);
1639
+ cc.vcvtss2sd(xmmA, xmmB, m);
1640
+ cc.vcvtss2si(gpd, m);
1641
+ cc.vcvttpd2dq(xmmA, m128);
1642
+ cc.vcvttpd2dq(xmmA, m256);
1643
+ cc.vcvttps2dq(xmmA, m);
1644
+ cc.vcvttps2dq(ymmA, m);
1645
+ cc.vcvttsd2si(gpd, m);
1646
+ cc.vcvttss2si(gpd, m);
1647
+ cc.vdivpd(xmmA, xmmB, m);
1648
+ cc.vdivpd(ymmA, ymmB, m);
1649
+ cc.vdivps(xmmA, xmmB, m);
1650
+ cc.vdivps(ymmA, ymmB, m);
1651
+ cc.vdivsd(xmmA, xmmB, m);
1652
+ cc.vdivss(xmmA, xmmB, m);
1653
+ cc.vdppd(xmmA, xmmB, m, 0);
1654
+ cc.vdpps(xmmA, xmmB, m, 0);
1655
+ cc.vdpps(ymmA, ymmB, m, 0);
1656
+ cc.vextractf128(m, ymmB, 0);
1657
+ cc.vextractps(m, xmmB, 0);
1658
+ cc.vhaddpd(xmmA, xmmB, m);
1659
+ cc.vhaddpd(ymmA, ymmB, m);
1660
+ cc.vhaddps(xmmA, xmmB, m);
1661
+ cc.vhaddps(ymmA, ymmB, m);
1662
+ cc.vhsubpd(xmmA, xmmB, m);
1663
+ cc.vhsubpd(ymmA, ymmB, m);
1664
+ cc.vhsubps(xmmA, xmmB, m);
1665
+ cc.vhsubps(ymmA, ymmB, m);
1666
+ cc.vinsertf128(ymmA, ymmB, m, 0);
1667
+ cc.vinsertps(xmmA, xmmB, m, 0);
1668
+ cc.vlddqu(xmmA, m);
1669
+ cc.vlddqu(ymmA, m);
1670
+ cc.vmaskmovps(xmmA, xmmB, m);
1671
+ cc.vmaskmovps(ymmA, ymmB, m);
1672
+ cc.vmaskmovps(m, xmmB, xmmC);
1673
+ cc.vmaskmovps(m, ymmB, ymmC);
1674
+ cc.vmaskmovpd(xmmA, xmmB, m);
1675
+ cc.vmaskmovpd(ymmA, ymmB, m);
1676
+ cc.vmaskmovpd(m, xmmB, xmmC);
1677
+ cc.vmaskmovpd(m, ymmB, ymmC);
1678
+ cc.vmaxpd(xmmA, xmmB, m);
1679
+ cc.vmaxpd(ymmA, ymmB, m);
1680
+ cc.vmaxps(xmmA, xmmB, m);
1681
+ cc.vmaxps(ymmA, ymmB, m);
1682
+ cc.vmaxsd(xmmA, xmmB, m);
1683
+ cc.vmaxss(xmmA, xmmB, m);
1684
+ cc.vminpd(xmmA, xmmB, m);
1685
+ cc.vminpd(ymmA, ymmB, m);
1686
+ cc.vminps(xmmA, xmmB, m);
1687
+ cc.vminps(ymmA, ymmB, m);
1688
+ cc.vminsd(xmmA, xmmB, m);
1689
+ cc.vminss(xmmA, xmmB, m);
1690
+ cc.vmovapd(xmmA, m);
1691
+ cc.vmovapd(m, xmmB);
1692
+ cc.vmovapd(ymmA, m);
1693
+ cc.vmovapd(m, ymmB);
1694
+ cc.vmovaps(xmmA, m);
1695
+ cc.vmovaps(m, xmmB);
1696
+ cc.vmovaps(ymmA, m);
1697
+ cc.vmovaps(m, ymmB);
1698
+ cc.vmovd(xmmA, m);
1699
+ cc.vmovd(m, xmmB);
1700
+ cc.vmovddup(xmmA, m);
1701
+ cc.vmovddup(ymmA, m);
1702
+ cc.vmovdqa(xmmA, m);
1703
+ cc.vmovdqa(m, xmmB);
1704
+ cc.vmovdqa(ymmA, m);
1705
+ cc.vmovdqa(m, ymmB);
1706
+ cc.vmovdqu(xmmA, m);
1707
+ cc.vmovdqu(m, xmmB);
1708
+ cc.vmovdqu(ymmA, m);
1709
+ cc.vmovdqu(m, ymmB);
1710
+ cc.vmovhpd(xmmA, xmmB, m);
1711
+ cc.vmovhps(xmmA, xmmB, m);
1712
+ cc.vmovhps(m, xmmB);
1713
+ cc.vmovlpd(xmmA, xmmB, m);
1714
+ cc.vmovlpd(m, xmmB);
1715
+ cc.vmovlps(xmmA, xmmB, m);
1716
+ cc.vmovlps(m, xmmB);
1717
+ cc.vmovntdq(m, xmmB);
1718
+ cc.vmovntdq(m, ymmB);
1719
+ cc.vmovntdqa(xmmA, m);
1720
+ cc.vmovntpd(m, xmmB);
1721
+ cc.vmovntpd(m, ymmB);
1722
+ cc.vmovntps(m, xmmB);
1723
+ cc.vmovntps(m, ymmB);
1724
+ cc.vmovsd(xmmA, m);
1725
+ cc.vmovsd(m, xmmB);
1726
+ cc.vmovshdup(xmmA, m);
1727
+ cc.vmovshdup(ymmA, m);
1728
+ cc.vmovsldup(xmmA, m);
1729
+ cc.vmovsldup(ymmA, m);
1730
+ cc.vmovss(xmmA, m);
1731
+ cc.vmovss(m, xmmB);
1732
+ cc.vmovupd(xmmA, m);
1733
+ cc.vmovupd(m, xmmB);
1734
+ cc.vmovupd(ymmA, m);
1735
+ cc.vmovupd(m, ymmB);
1736
+ cc.vmovups(xmmA, m);
1737
+ cc.vmovups(m, xmmB);
1738
+ cc.vmovups(ymmA, m);
1739
+ cc.vmovups(m, ymmB);
1740
+ cc.vmpsadbw(xmmA, xmmB, m, 0);
1741
+ cc.vmulpd(xmmA, xmmB, m);
1742
+ cc.vmulpd(ymmA, ymmB, m);
1743
+ cc.vmulps(xmmA, xmmB, m);
1744
+ cc.vmulps(ymmA, ymmB, m);
1745
+ cc.vmulsd(xmmA, xmmB, m);
1746
+ cc.vmulss(xmmA, xmmB, m);
1747
+ cc.vorpd(xmmA, xmmB, m);
1748
+ cc.vorpd(ymmA, ymmB, m);
1749
+ cc.vorps(xmmA, xmmB, m);
1750
+ cc.vorps(ymmA, ymmB, m);
1751
+ cc.vpabsb(xmmA, m);
1752
+ cc.vpabsd(xmmA, m);
1753
+ cc.vpabsw(xmmA, m);
1754
+ cc.vpackssdw(xmmA, xmmB, m);
1755
+ cc.vpacksswb(xmmA, xmmB, m);
1756
+ cc.vpackusdw(xmmA, xmmB, m);
1757
+ cc.vpackuswb(xmmA, xmmB, m);
1758
+ cc.vpaddb(xmmA, xmmB, m);
1759
+ cc.vpaddd(xmmA, xmmB, m);
1760
+ cc.vpaddq(xmmA, xmmB, m);
1761
+ cc.vpaddw(xmmA, xmmB, m);
1762
+ cc.vpaddsb(xmmA, xmmB, m);
1763
+ cc.vpaddsw(xmmA, xmmB, m);
1764
+ cc.vpaddusb(xmmA, xmmB, m);
1765
+ cc.vpaddusw(xmmA, xmmB, m);
1766
+ cc.vpalignr(xmmA, xmmB, m, 0);
1767
+ cc.vpand(xmmA, xmmB, m);
1768
+ cc.vpandn(xmmA, xmmB, m);
1769
+ cc.vpavgb(xmmA, xmmB, m);
1770
+ cc.vpavgw(xmmA, xmmB, m);
1771
+ cc.vpblendvb(xmmA, xmmB, m, xmmA);
1772
+ cc.vpblendw(xmmA, xmmB, m, 0);
1773
+ cc.vpcmpeqb(xmmA, xmmB, m);
1774
+ cc.vpcmpeqd(xmmA, xmmB, m);
1775
+ cc.vpcmpeqq(xmmA, xmmB, m);
1776
+ cc.vpcmpeqw(xmmA, xmmB, m);
1777
+ cc.vpcmpgtb(xmmA, xmmB, m);
1778
+ cc.vpcmpgtd(xmmA, xmmB, m);
1779
+ cc.vpcmpgtq(xmmA, xmmB, m);
1780
+ cc.vpcmpgtw(xmmA, xmmB, m);
1781
+ cc.vpermilpd(xmmA, xmmB, m);
1782
+ cc.vpermilpd(ymmA, ymmB, m);
1783
+ cc.vpermilpd(xmmA, m, 0);
1784
+ cc.vpermilpd(ymmA, m, 0);
1785
+ cc.vpermilps(xmmA, xmmB, m);
1786
+ cc.vpermilps(ymmA, ymmB, m);
1787
+ cc.vpermilps(xmmA, m, 0);
1788
+ cc.vpermilps(ymmA, m, 0);
1789
+ cc.vperm2f128(ymmA, ymmB, m, 0);
1790
+ cc.vpextrb(m, xmmB, 0);
1791
+ cc.vpextrd(m, xmmB, 0);
1792
+ if (cc.is64Bit()) cc.vpextrq(m, xmmB, 0);
1793
+ cc.vpextrw(m, xmmB, 0);
1794
+ cc.vphaddd(xmmA, xmmB, m);
1795
+ cc.vphaddsw(xmmA, xmmB, m);
1796
+ cc.vphaddw(xmmA, xmmB, m);
1797
+ cc.vphminposuw(xmmA, m);
1798
+ cc.vphsubd(xmmA, xmmB, m);
1799
+ cc.vphsubsw(xmmA, xmmB, m);
1800
+ cc.vphsubw(xmmA, xmmB, m);
1801
+ cc.vpinsrb(xmmA, xmmB, m, 0);
1802
+ cc.vpinsrd(xmmA, xmmB, m, 0);
1803
+ cc.vpinsrw(xmmA, xmmB, m, 0);
1804
+ cc.vpmaddubsw(xmmA, xmmB, m);
1805
+ cc.vpmaddwd(xmmA, xmmB, m);
1806
+ cc.vpmaxsb(xmmA, xmmB, m);
1807
+ cc.vpmaxsd(xmmA, xmmB, m);
1808
+ cc.vpmaxsw(xmmA, xmmB, m);
1809
+ cc.vpmaxub(xmmA, xmmB, m);
1810
+ cc.vpmaxud(xmmA, xmmB, m);
1811
+ cc.vpmaxuw(xmmA, xmmB, m);
1812
+ cc.vpminsb(xmmA, xmmB, m);
1813
+ cc.vpminsd(xmmA, xmmB, m);
1814
+ cc.vpminsw(xmmA, xmmB, m);
1815
+ cc.vpminub(xmmA, xmmB, m);
1816
+ cc.vpminud(xmmA, xmmB, m);
1817
+ cc.vpminuw(xmmA, xmmB, m);
1818
+ cc.vpmovsxbd(xmmA, m);
1819
+ cc.vpmovsxbq(xmmA, m);
1820
+ cc.vpmovsxbw(xmmA, m);
1821
+ cc.vpmovsxdq(xmmA, m);
1822
+ cc.vpmovsxwd(xmmA, m);
1823
+ cc.vpmovsxwq(xmmA, m);
1824
+ cc.vpmovzxbd(xmmA, m);
1825
+ cc.vpmovzxbq(xmmA, m);
1826
+ cc.vpmovzxbw(xmmA, m);
1827
+ cc.vpmovzxdq(xmmA, m);
1828
+ cc.vpmovzxwd(xmmA, m);
1829
+ cc.vpmovzxwq(xmmA, m);
1830
+ cc.vpmuldq(xmmA, xmmB, m);
1831
+ cc.vpmulhrsw(xmmA, xmmB, m);
1832
+ cc.vpmulhuw(xmmA, xmmB, m);
1833
+ cc.vpmulhw(xmmA, xmmB, m);
1834
+ cc.vpmulld(xmmA, xmmB, m);
1835
+ cc.vpmullw(xmmA, xmmB, m);
1836
+ cc.vpmuludq(xmmA, xmmB, m);
1837
+ cc.vpor(xmmA, xmmB, m);
1838
+ cc.vpsadbw(xmmA, xmmB, m);
1839
+ cc.vpshufb(xmmA, xmmB, m);
1840
+ cc.vpshufd(xmmA, m, 0);
1841
+ cc.vpshufhw(xmmA, m, 0);
1842
+ cc.vpshuflw(xmmA, m, 0);
1843
+ cc.vpsignb(xmmA, xmmB, m);
1844
+ cc.vpsignd(xmmA, xmmB, m);
1845
+ cc.vpsignw(xmmA, xmmB, m);
1846
+ cc.vpslld(xmmA, xmmB, m);
1847
+ cc.vpsllq(xmmA, xmmB, m);
1848
+ cc.vpsllw(xmmA, xmmB, m);
1849
+ cc.vpsrad(xmmA, xmmB, m);
1850
+ cc.vpsraw(xmmA, xmmB, m);
1851
+ cc.vpsrld(xmmA, xmmB, m);
1852
+ cc.vpsrlq(xmmA, xmmB, m);
1853
+ cc.vpsrlw(xmmA, xmmB, m);
1854
+ cc.vpsubb(xmmA, xmmB, m);
1855
+ cc.vpsubd(xmmA, xmmB, m);
1856
+ cc.vpsubq(xmmA, xmmB, m);
1857
+ cc.vpsubw(xmmA, xmmB, m);
1858
+ cc.vpsubsb(xmmA, xmmB, m);
1859
+ cc.vpsubsw(xmmA, xmmB, m);
1860
+ cc.vpsubusb(xmmA, xmmB, m);
1861
+ cc.vpsubusw(xmmA, xmmB, m);
1862
+ cc.vptest(xmmA, m);
1863
+ cc.vptest(ymmA, m);
1864
+ cc.vpunpckhbw(xmmA, xmmB, m);
1865
+ cc.vpunpckhdq(xmmA, xmmB, m);
1866
+ cc.vpunpckhqdq(xmmA, xmmB, m);
1867
+ cc.vpunpckhwd(xmmA, xmmB, m);
1868
+ cc.vpunpcklbw(xmmA, xmmB, m);
1869
+ cc.vpunpckldq(xmmA, xmmB, m);
1870
+ cc.vpunpcklqdq(xmmA, xmmB, m);
1871
+ cc.vpunpcklwd(xmmA, xmmB, m);
1872
+ cc.vpxor(xmmA, xmmB, m);
1873
+ cc.vrcpps(xmmA, m);
1874
+ cc.vrcpps(ymmA, m);
1875
+ cc.vrcpss(xmmA, xmmB, m);
1876
+ cc.vrsqrtps(xmmA, m);
1877
+ cc.vrsqrtps(ymmA, m);
1878
+ cc.vrsqrtss(xmmA, xmmB, m);
1879
+ cc.vroundpd(xmmA, m, 0);
1880
+ cc.vroundpd(ymmA, m, 0);
1881
+ cc.vroundps(xmmA, m, 0);
1882
+ cc.vroundps(ymmA, m, 0);
1883
+ cc.vroundsd(xmmA, xmmB, m, 0);
1884
+ cc.vroundss(xmmA, xmmB, m, 0);
1885
+ cc.vshufpd(xmmA, xmmB, m, 0);
1886
+ cc.vshufpd(ymmA, ymmB, m, 0);
1887
+ cc.vshufps(xmmA, xmmB, m, 0);
1888
+ cc.vshufps(ymmA, ymmB, m, 0);
1889
+ cc.vsqrtpd(xmmA, m);
1890
+ cc.vsqrtpd(ymmA, m);
1891
+ cc.vsqrtps(xmmA, m);
1892
+ cc.vsqrtps(ymmA, m);
1893
+ cc.vsqrtsd(xmmA, xmmB, m);
1894
+ cc.vsqrtss(xmmA, xmmB, m);
1895
+ cc.vsubpd(xmmA, xmmB, m);
1896
+ cc.vsubpd(ymmA, ymmB, m);
1897
+ cc.vsubps(xmmA, xmmB, m);
1898
+ cc.vsubps(ymmA, ymmB, m);
1899
+ cc.vsubsd(xmmA, xmmB, m);
1900
+ cc.vsubss(xmmA, xmmB, m);
1901
+ cc.vtestps(xmmA, m);
1902
+ cc.vtestps(ymmA, m);
1903
+ cc.vtestpd(xmmA, m);
1904
+ cc.vtestpd(ymmA, m);
1905
+ cc.vucomisd(xmmA, m);
1906
+ cc.vucomiss(xmmA, m);
1907
+ cc.vunpckhpd(xmmA, xmmB, m);
1908
+ cc.vunpckhpd(ymmA, ymmB, m);
1909
+ cc.vunpckhps(xmmA, xmmB, m);
1910
+ cc.vunpckhps(ymmA, ymmB, m);
1911
+ cc.vunpcklpd(xmmA, xmmB, m);
1912
+ cc.vunpcklpd(ymmA, ymmB, m);
1913
+ cc.vunpcklps(xmmA, xmmB, m);
1914
+ cc.vunpcklps(ymmA, ymmB, m);
1915
+ cc.vxorpd(xmmA, xmmB, m);
1916
+ cc.vxorpd(ymmA, ymmB, m);
1917
+ cc.vxorps(xmmA, xmmB, m);
1918
+ cc.vxorps(ymmA, ymmB, m);
1919
+
1920
+ // AVX+AESNI.
1921
+ cc.vaesdec(xmmA, xmmB, m);
1922
+ cc.vaesdeclast(xmmA, xmmB, m);
1923
+ cc.vaesenc(xmmA, xmmB, m);
1924
+ cc.vaesenclast(xmmA, xmmB, m);
1925
+ cc.vaesimc(xmmA, m);
1926
+ cc.vaeskeygenassist(xmmA, m, 0);
1927
+
1928
+ // AVX+PCLMULQDQ.
1929
+ cc.vpclmulqdq(xmmA, xmmB, m, 0);
1930
+
1931
+ // AVX2.
1932
+ cc.vbroadcasti128(ymmA, m);
1933
+ cc.vextracti128(m, ymmB, 0);
1934
+ cc.vgatherdpd(xmmA, vx_ptr, xmmC);
1935
+ cc.vgatherdpd(ymmA, vx_ptr, ymmC);
1936
+ cc.vgatherdps(xmmA, vx_ptr, xmmC);
1937
+ cc.vgatherdps(ymmA, vy_ptr, ymmC);
1938
+ cc.vgatherqpd(xmmA, vx_ptr, xmmC);
1939
+ cc.vgatherqpd(ymmA, vy_ptr, ymmC);
1940
+ cc.vgatherqps(xmmA, vx_ptr, xmmC);
1941
+ cc.vgatherqps(xmmA, vy_ptr, xmmC);
1942
+ cc.vinserti128(ymmA, ymmB, m, 0);
1943
+ cc.vmovntdqa(ymmA, m);
1944
+ cc.vmpsadbw(ymmA, ymmB, m, 0);
1945
+ cc.vpabsb(ymmA, m);
1946
+ cc.vpabsd(ymmA, m);
1947
+ cc.vpabsw(ymmA, m);
1948
+ cc.vpackssdw(ymmA, ymmB, m);
1949
+ cc.vpacksswb(ymmA, ymmB, m);
1950
+ cc.vpackusdw(ymmA, ymmB, m);
1951
+ cc.vpackuswb(ymmA, ymmB, m);
1952
+ cc.vpaddb(ymmA, ymmB, m);
1953
+ cc.vpaddd(ymmA, ymmB, m);
1954
+ cc.vpaddq(ymmA, ymmB, m);
1955
+ cc.vpaddw(ymmA, ymmB, m);
1956
+ cc.vpaddsb(ymmA, ymmB, m);
1957
+ cc.vpaddsw(ymmA, ymmB, m);
1958
+ cc.vpaddusb(ymmA, ymmB, m);
1959
+ cc.vpaddusw(ymmA, ymmB, m);
1960
+ cc.vpalignr(ymmA, ymmB, m, 0);
1961
+ cc.vpand(ymmA, ymmB, m);
1962
+ cc.vpandn(ymmA, ymmB, m);
1963
+ cc.vpavgb(ymmA, ymmB, m);
1964
+ cc.vpavgw(ymmA, ymmB, m);
1965
+ cc.vpblendd(xmmA, xmmB, m, 0);
1966
+ cc.vpblendd(ymmA, ymmB, m, 0);
1967
+ cc.vpblendvb(ymmA, ymmB, m, ymmA);
1968
+ cc.vpblendw(ymmA, ymmB, m, 0);
1969
+ cc.vpbroadcastb(xmmA, m);
1970
+ cc.vpbroadcastb(ymmA, m);
1971
+ cc.vpbroadcastd(xmmA, m);
1972
+ cc.vpbroadcastd(ymmA, m);
1973
+ cc.vpbroadcastq(xmmA, m);
1974
+ cc.vpbroadcastq(ymmA, m);
1975
+ cc.vpbroadcastw(xmmA, m);
1976
+ cc.vpbroadcastw(ymmA, m);
1977
+ cc.vpcmpeqb(ymmA, ymmB, m);
1978
+ cc.vpcmpeqd(ymmA, ymmB, m);
1979
+ cc.vpcmpeqq(ymmA, ymmB, m);
1980
+ cc.vpcmpeqw(ymmA, ymmB, m);
1981
+ cc.vpcmpgtb(ymmA, ymmB, m);
1982
+ cc.vpcmpgtd(ymmA, ymmB, m);
1983
+ cc.vpcmpgtq(ymmA, ymmB, m);
1984
+ cc.vpcmpgtw(ymmA, ymmB, m);
1985
+ cc.vperm2i128(ymmA, ymmB, m, 0);
1986
+ cc.vpermd(ymmA, ymmB, m);
1987
+ cc.vpermps(ymmA, ymmB, m);
1988
+ cc.vpermpd(ymmA, m, 0);
1989
+ cc.vpermq(ymmA, m, 0);
1990
+ cc.vpgatherdd(xmmA, vx_ptr, xmmC);
1991
+ cc.vpgatherdd(ymmA, vy_ptr, ymmC);
1992
+ cc.vpgatherdq(xmmA, vx_ptr, xmmC);
1993
+ cc.vpgatherdq(ymmA, vx_ptr, ymmC);
1994
+ cc.vpgatherqd(xmmA, vx_ptr, xmmC);
1995
+ cc.vpgatherqd(xmmA, vy_ptr, xmmC);
1996
+ cc.vpgatherqq(xmmA, vx_ptr, xmmC);
1997
+ cc.vpgatherqq(ymmA, vy_ptr, ymmC);
1998
+ cc.vpmovsxbd(ymmA, m);
1999
+ cc.vpmovsxbq(ymmA, m);
2000
+ cc.vpmovsxbw(ymmA, m);
2001
+ cc.vpmovsxdq(ymmA, m);
2002
+ cc.vpmovsxwd(ymmA, m);
2003
+ cc.vpmovsxwq(ymmA, m);
2004
+ cc.vpmovzxbd(ymmA, m);
2005
+ cc.vpmovzxbq(ymmA, m);
2006
+ cc.vpmovzxbw(ymmA, m);
2007
+ cc.vpmovzxdq(ymmA, m);
2008
+ cc.vpmovzxwd(ymmA, m);
2009
+ cc.vpmovzxwq(ymmA, m);
2010
+ cc.vpshufd(ymmA, m, 0);
2011
+ cc.vpshufhw(ymmA, m, 0);
2012
+ cc.vpshuflw(ymmA, m, 0);
2013
+ cc.vphaddd(ymmA, ymmB, m);
2014
+ cc.vphaddsw(ymmA, ymmB, m);
2015
+ cc.vphaddw(ymmA, ymmB, m);
2016
+ cc.vphsubd(ymmA, ymmB, m);
2017
+ cc.vphsubsw(ymmA, ymmB, m);
2018
+ cc.vphsubw(ymmA, ymmB, m);
2019
+ cc.vpmaddubsw(ymmA, ymmB, m);
2020
+ cc.vpmaddwd(ymmA, ymmB, m);
2021
+ cc.vpmaskmovd(m, xmmB, xmmC);
2022
+ cc.vpmaskmovd(m, ymmB, ymmC);
2023
+ cc.vpmaskmovd(xmmA, xmmB, m);
2024
+ cc.vpmaskmovd(ymmA, ymmB, m);
2025
+ cc.vpmaskmovq(m, xmmB, xmmC);
2026
+ cc.vpmaskmovq(m, ymmB, ymmC);
2027
+ cc.vpmaskmovq(xmmA, xmmB, m);
2028
+ cc.vpmaskmovq(ymmA, ymmB, m);
2029
+ cc.vpmaxsb(ymmA, ymmB, m);
2030
+ cc.vpmaxsd(ymmA, ymmB, m);
2031
+ cc.vpmaxsw(ymmA, ymmB, m);
2032
+ cc.vpmaxub(ymmA, ymmB, m);
2033
+ cc.vpmaxud(ymmA, ymmB, m);
2034
+ cc.vpmaxuw(ymmA, ymmB, m);
2035
+ cc.vpminsb(ymmA, ymmB, m);
2036
+ cc.vpminsd(ymmA, ymmB, m);
2037
+ cc.vpminsw(ymmA, ymmB, m);
2038
+ cc.vpminub(ymmA, ymmB, m);
2039
+ cc.vpminud(ymmA, ymmB, m);
2040
+ cc.vpminuw(ymmA, ymmB, m);
2041
+ cc.vpmuldq(ymmA, ymmB, m);
2042
+ cc.vpmulhrsw(ymmA, ymmB, m);
2043
+ cc.vpmulhuw(ymmA, ymmB, m);
2044
+ cc.vpmulhw(ymmA, ymmB, m);
2045
+ cc.vpmulld(ymmA, ymmB, m);
2046
+ cc.vpmullw(ymmA, ymmB, m);
2047
+ cc.vpmuludq(ymmA, ymmB, m);
2048
+ cc.vpor(ymmA, ymmB, m);
2049
+ cc.vpsadbw(ymmA, ymmB, m);
2050
+ cc.vpshufb(ymmA, ymmB, m);
2051
+ cc.vpsignb(ymmA, ymmB, m);
2052
+ cc.vpsignd(ymmA, ymmB, m);
2053
+ cc.vpsignw(ymmA, ymmB, m);
2054
+ cc.vpslld(ymmA, ymmB, m);
2055
+ cc.vpsllq(ymmA, ymmB, m);
2056
+ cc.vpsllvd(xmmA, xmmB, m);
2057
+ cc.vpsllvd(ymmA, ymmB, m);
2058
+ cc.vpsllvq(xmmA, xmmB, m);
2059
+ cc.vpsllvq(ymmA, ymmB, m);
2060
+ cc.vpsllw(ymmA, ymmB, m);
2061
+ cc.vpsrad(ymmA, ymmB, m);
2062
+ cc.vpsravd(xmmA, xmmB, m);
2063
+ cc.vpsravd(ymmA, ymmB, m);
2064
+ cc.vpsraw(ymmA, ymmB, m);
2065
+ cc.vpsrld(ymmA, ymmB, m);
2066
+ cc.vpsrlq(ymmA, ymmB, m);
2067
+ cc.vpsrlvd(xmmA, xmmB, m);
2068
+ cc.vpsrlvd(ymmA, ymmB, m);
2069
+ cc.vpsrlvq(xmmA, xmmB, m);
2070
+ cc.vpsrlvq(ymmA, ymmB, m);
2071
+ cc.vpsrlw(ymmA, ymmB, m);
2072
+ cc.vpsubb(ymmA, ymmB, m);
2073
+ cc.vpsubd(ymmA, ymmB, m);
2074
+ cc.vpsubq(ymmA, ymmB, m);
2075
+ cc.vpsubsb(ymmA, ymmB, m);
2076
+ cc.vpsubsw(ymmA, ymmB, m);
2077
+ cc.vpsubusb(ymmA, ymmB, m);
2078
+ cc.vpsubusw(ymmA, ymmB, m);
2079
+ cc.vpsubw(ymmA, ymmB, m);
2080
+ cc.vpunpckhbw(ymmA, ymmB, m);
2081
+ cc.vpunpckhdq(ymmA, ymmB, m);
2082
+ cc.vpunpckhqdq(ymmA, ymmB, m);
2083
+ cc.vpunpckhwd(ymmA, ymmB, m);
2084
+ cc.vpunpcklbw(ymmA, ymmB, m);
2085
+ cc.vpunpckldq(ymmA, ymmB, m);
2086
+ cc.vpunpcklqdq(ymmA, ymmB, m);
2087
+ cc.vpunpcklwd(ymmA, ymmB, m);
2088
+ cc.vpxor(ymmA, ymmB, m);
2089
+ }
2090
+ }
2091
+
2092
+ static void generateAvxSequence(BaseEmitter& emitter, InstForm form, bool emitPrologEpilog) {
2093
+ using namespace asmjit::x86;
2094
+
2095
+ if (emitter.isAssembler()) {
2096
+ Assembler& cc = *emitter.as<Assembler>();
2097
+
2098
+ if (emitPrologEpilog) {
2099
+ FuncDetail func;
2100
+ func.init(FuncSignatureT<void, void*, const void*, size_t>(CallConvId::kHost), cc.environment());
2101
+
2102
+ FuncFrame frame;
2103
+ frame.init(func);
2104
+ frame.addDirtyRegs(eax, ymm0, ymm1, ymm2, ymm3);
2105
+ frame.finalize();
2106
+
2107
+ cc.emitProlog(frame);
2108
+ generateAvxSequenceInternal(cc, form, eax, ymm0, ymm1, ymm2, ymm3);
2109
+ cc.emitEpilog(frame);
2110
+ }
2111
+ else {
2112
+ generateAvxSequenceInternal(cc, form, eax, ymm0, ymm1, ymm2, ymm3);
2113
+ }
2114
+ }
2115
+ #ifndef ASMJIT_NO_BUILDER
2116
+ else if (emitter.isBuilder()) {
2117
+ Builder& cc = *emitter.as<Builder>();
2118
+
2119
+ if (emitPrologEpilog) {
2120
+ FuncDetail func;
2121
+ func.init(FuncSignatureT<void, void*, const void*, size_t>(CallConvId::kHost), cc.environment());
2122
+
2123
+ FuncFrame frame;
2124
+ frame.init(func);
2125
+ frame.addDirtyRegs(eax, ymm0, ymm1, ymm2, ymm3);
2126
+ frame.finalize();
2127
+
2128
+ cc.emitProlog(frame);
2129
+ generateAvxSequenceInternal(cc, form, eax, ymm0, ymm1, ymm2, ymm3);
2130
+ cc.emitEpilog(frame);
2131
+ }
2132
+ else {
2133
+ generateAvxSequenceInternal(cc, form, eax, ymm0, ymm1, ymm2, ymm3);
2134
+ }
2135
+ }
2136
+ #endif
2137
+ #ifndef ASMJIT_NO_COMPILER
2138
+ else if (emitter.isCompiler()) {
2139
+ Compiler& cc = *emitter.as<Compiler>();
2140
+
2141
+ Gp gp = cc.newGpz("gp");
2142
+ Ymm a = cc.newYmm("a");
2143
+ Ymm b = cc.newYmm("b");
2144
+ Ymm c = cc.newYmm("c");
2145
+ Ymm d = cc.newYmm("d");
2146
+
2147
+ cc.addFunc(FuncSignatureT<void>(CallConvId::kHost));
2148
+ generateAvxSequenceInternal(cc, form, gp, a, b, c, d);
2149
+ cc.endFunc();
2150
+ }
2151
+ #endif
2152
+ }
2153
+
2154
+ // Generates a long sequence of AVX512 instructions.
2155
+ template<typename Emitter>
2156
+ static void generateAvx512SequenceInternal(
2157
+ Emitter& cc,
2158
+ InstForm form,
2159
+ const x86::Gp& gp,
2160
+ const x86::KReg& kA, const x86::KReg& kB, const x86::KReg& kC,
2161
+ const x86::Vec& vecA, const x86::Vec& vecB, const x86::Vec& vecC, const x86::Vec& vecD) {
2162
+
2163
+ x86::Gp gpd = gp.r32();
2164
+ x86::Gp gpq = gp.r64();
2165
+ x86::Gp gpz = cc.is32Bit() ? gpd : gpq;
2166
+
2167
+ x86::Xmm xmmA = vecA.xmm();
2168
+ x86::Xmm xmmB = vecB.xmm();
2169
+ x86::Xmm xmmC = vecC.xmm();
2170
+ x86::Xmm xmmD = vecD.xmm();
2171
+
2172
+ x86::Ymm ymmA = vecA.ymm();
2173
+ x86::Ymm ymmB = vecB.ymm();
2174
+ x86::Ymm ymmC = vecC.ymm();
2175
+ x86::Ymm ymmD = vecD.ymm();
2176
+
2177
+ x86::Zmm zmmA = vecA.zmm();
2178
+ x86::Zmm zmmB = vecB.zmm();
2179
+ x86::Zmm zmmC = vecC.zmm();
2180
+ x86::Zmm zmmD = vecD.zmm();
2181
+
2182
+ cc.xor_(gpd, gpd);
2183
+ cc.vxorps(xmmA, xmmA, xmmA);
2184
+ cc.vxorps(xmmB, xmmB, xmmB);
2185
+ cc.vxorps(xmmC, xmmC, xmmC);
2186
+ cc.vxorps(xmmD, xmmD, xmmD);
2187
+
2188
+ if (form == InstForm::kReg) {
2189
+ cc.kaddb(kA, kB, kC);
2190
+ cc.kaddd(kA, kB, kC);
2191
+ cc.kaddq(kA, kB, kC);
2192
+ cc.kaddw(kA, kB, kC);
2193
+ cc.kandb(kA, kB, kC);
2194
+ cc.kandd(kA, kB, kC);
2195
+ cc.kandnb(kA, kB, kC);
2196
+ cc.kandnd(kA, kB, kC);
2197
+ cc.kandnq(kA, kB, kC);
2198
+ cc.kandnw(kA, kB, kC);
2199
+ cc.kandq(kA, kB, kC);
2200
+ cc.kandw(kA, kB, kC);
2201
+ cc.kmovb(kA, kB);
2202
+ cc.kmovb(kA, gpd);
2203
+ cc.kmovb(gpd, kB);
2204
+ cc.kmovd(kA, kB);
2205
+ cc.kmovd(kA, gpd);
2206
+ cc.kmovd(gpd, kB);
2207
+ cc.kmovq(kA, kB);
2208
+ if (cc.is64Bit()) cc.kmovq(kA, gpq);
2209
+ if (cc.is64Bit()) cc.kmovq(gpq, kB);
2210
+ cc.kmovw(kA, kB);
2211
+ cc.kmovw(kA, gpd);
2212
+ cc.kmovw(gpd, kB);
2213
+ cc.knotb(kA, kB);
2214
+ cc.knotd(kA, kB);
2215
+ cc.knotq(kA, kB);
2216
+ cc.knotw(kA, kB);
2217
+ cc.korb(kA, kB, kC);
2218
+ cc.kord(kA, kB, kC);
2219
+ cc.korq(kA, kB, kC);
2220
+ cc.kortestb(kA, kB);
2221
+ cc.kortestd(kA, kB);
2222
+ cc.kortestq(kA, kB);
2223
+ cc.kortestw(kA, kB);
2224
+ cc.korw(kA, kB, kC);
2225
+ cc.kshiftlb(kA, kB, 0);
2226
+ cc.kshiftld(kA, kB, 0);
2227
+ cc.kshiftlq(kA, kB, 0);
2228
+ cc.kshiftlw(kA, kB, 0);
2229
+ cc.kshiftrb(kA, kB, 0);
2230
+ cc.kshiftrd(kA, kB, 0);
2231
+ cc.kshiftrq(kA, kB, 0);
2232
+ cc.kshiftrw(kA, kB, 0);
2233
+ cc.ktestb(kA, kB);
2234
+ cc.ktestd(kA, kB);
2235
+ cc.ktestq(kA, kB);
2236
+ cc.ktestw(kA, kB);
2237
+ cc.kunpckbw(kA, kB, kC);
2238
+ cc.kunpckdq(kA, kB, kC);
2239
+ cc.kunpckwd(kA, kB, kC);
2240
+ cc.kxnorb(kA, kB, kC);
2241
+ cc.kxnord(kA, kB, kC);
2242
+ cc.kxnorq(kA, kB, kC);
2243
+ cc.kxnorw(kA, kB, kC);
2244
+ cc.kxorb(kA, kB, kC);
2245
+ cc.kxord(kA, kB, kC);
2246
+ cc.kxorq(kA, kB, kC);
2247
+ cc.kxorw(kA, kB, kC);
2248
+ cc.nop();
2249
+
2250
+ cc.evex().vaddpd(xmmA, xmmB, xmmC);
2251
+ cc.evex().vaddpd(ymmA, ymmB, ymmC);
2252
+ cc.evex().vaddpd(zmmA, zmmB, zmmC);
2253
+ cc.evex().vaddps(xmmA, xmmB, xmmC);
2254
+ cc.evex().vaddps(ymmA, ymmB, ymmC);
2255
+ cc.evex().vaddps(zmmA, zmmB, zmmC);
2256
+ cc.evex().vaddsd(xmmA, xmmB, xmmC);
2257
+ cc.evex().vaddss(xmmA, xmmB, xmmC);
2258
+ cc.evex().valignd(xmmA, xmmB, xmmC, 0);
2259
+ cc.evex().valignd(ymmA, ymmB, ymmC, 0);
2260
+ cc.evex().valignd(zmmA, zmmB, zmmC, 0);
2261
+ cc.evex().valignq(xmmA, xmmB, xmmC, 0);
2262
+ cc.evex().valignq(ymmA, ymmB, ymmC, 0);
2263
+ cc.evex().valignq(zmmA, zmmB, zmmC, 0);
2264
+ cc.evex().vandnpd(xmmA, xmmB, xmmC);
2265
+ cc.evex().vandnpd(ymmA, ymmB, ymmC);
2266
+ cc.evex().vandnpd(zmmA, zmmB, zmmC);
2267
+ cc.evex().vandnps(xmmA, xmmB, xmmC);
2268
+ cc.evex().vandnps(ymmA, ymmB, ymmC);
2269
+ cc.evex().vandnps(zmmA, zmmB, zmmC);
2270
+ cc.evex().vandpd(xmmA, xmmB, xmmC);
2271
+ cc.evex().vandpd(ymmA, ymmB, ymmC);
2272
+ cc.evex().vandpd(zmmA, zmmB, zmmC);
2273
+ cc.evex().vandps(xmmA, xmmB, xmmC);
2274
+ cc.evex().vandps(ymmA, ymmB, ymmC);
2275
+ cc.evex().vandps(zmmA, zmmB, zmmC);
2276
+ cc.evex().vblendmpd(xmmA, xmmB, xmmC);
2277
+ cc.evex().vblendmpd(ymmA, ymmB, ymmC);
2278
+ cc.evex().vblendmpd(zmmA, zmmB, zmmC);
2279
+ cc.evex().vblendmps(xmmA, xmmB, xmmC);
2280
+ cc.evex().vblendmps(ymmA, ymmB, ymmC);
2281
+ cc.evex().vblendmps(zmmA, zmmB, zmmC);
2282
+ cc.evex().vbroadcastf32x2(ymmA, xmmB);
2283
+ cc.evex().vbroadcastf32x2(zmmA, xmmB);
2284
+ cc.evex().vbroadcasti32x2(xmmA, xmmB);
2285
+ cc.evex().vbroadcasti32x2(ymmA, xmmB);
2286
+ cc.evex().vbroadcasti32x2(zmmA, xmmB);
2287
+ cc.evex().vbroadcastsd(ymmA, xmmB);
2288
+ cc.evex().vbroadcastsd(zmmA, xmmB);
2289
+ cc.evex().vbroadcastss(xmmA, xmmB);
2290
+ cc.evex().vbroadcastss(ymmA, xmmB);
2291
+ cc.evex().vbroadcastss(zmmA, xmmB);
2292
+ cc.evex().vcmppd(kA, xmmB, xmmC, 0);
2293
+ cc.evex().vcmppd(kA, ymmB, ymmC, 0);
2294
+ cc.evex().vcmppd(kA, zmmB, zmmC, 0);
2295
+ cc.evex().vcmpps(kA, xmmB, xmmC, 0);
2296
+ cc.evex().vcmpps(kA, ymmB, ymmC, 0);
2297
+ cc.evex().vcmpps(kA, zmmB, zmmC, 0);
2298
+ cc.evex().vcmpsd(kA, xmmB, xmmC, 0);
2299
+ cc.evex().vcmpss(kA, xmmB, xmmC, 0);
2300
+ cc.evex().vcomisd(xmmA, xmmB);
2301
+ cc.evex().vcomiss(xmmA, xmmB);
2302
+ cc.evex().vcompresspd(xmmA, xmmB);
2303
+ cc.evex().vcompresspd(ymmA, ymmB);
2304
+ cc.evex().vcompresspd(zmmA, zmmB);
2305
+ cc.evex().vcompressps(xmmA, xmmB);
2306
+ cc.evex().vcompressps(ymmA, ymmB);
2307
+ cc.evex().vcompressps(zmmA, zmmB);
2308
+ cc.evex().vcvtdq2pd(xmmA, xmmB);
2309
+ cc.evex().vcvtdq2pd(ymmA, xmmB);
2310
+ cc.evex().vcvtdq2pd(zmmA, ymmB);
2311
+ cc.evex().vcvtdq2ps(xmmA, xmmB);
2312
+ cc.evex().vcvtdq2ps(ymmA, ymmB);
2313
+ cc.evex().vcvtdq2ps(zmmA, zmmB);
2314
+ cc.evex().vcvtpd2dq(xmmA, xmmB);
2315
+ cc.evex().vcvtpd2dq(xmmA, ymmB);
2316
+ cc.evex().vcvtpd2dq(ymmA, zmmB);
2317
+ cc.evex().vcvtpd2qq(xmmA, xmmB);
2318
+ cc.evex().vcvtpd2qq(ymmA, ymmB);
2319
+ cc.evex().vcvtpd2qq(zmmA, zmmB);
2320
+ cc.evex().vcvtpd2udq(xmmA, xmmB);
2321
+ cc.evex().vcvtpd2udq(xmmA, ymmB);
2322
+ cc.evex().vcvtpd2udq(ymmA, zmmB);
2323
+ cc.evex().vcvtpd2uqq(xmmA, xmmB);
2324
+ cc.evex().vcvtpd2uqq(ymmA, ymmB);
2325
+ cc.evex().vcvtpd2uqq(zmmA, zmmB);
2326
+ cc.evex().vcvtph2ps(xmmA, xmmB);
2327
+ cc.evex().vcvtph2ps(ymmA, xmmB);
2328
+ cc.evex().vcvtph2ps(zmmA, ymmB);
2329
+ cc.evex().vcvtps2dq(xmmA, xmmB);
2330
+ cc.evex().vcvtps2dq(ymmA, ymmB);
2331
+ cc.evex().vcvtps2dq(zmmA, zmmB);
2332
+ cc.evex().vcvtps2pd(xmmA, xmmB);
2333
+ cc.evex().vcvtps2pd(ymmA, xmmB);
2334
+ cc.evex().vcvtps2pd(zmmA, ymmB);
2335
+ cc.evex().vcvtps2ph(xmmA, xmmB, 0);
2336
+ cc.evex().vcvtps2ph(xmmA, ymmB, 0);
2337
+ cc.evex().vcvtps2ph(ymmA, zmmB, 0);
2338
+ cc.evex().vcvtps2qq(xmmA, xmmB);
2339
+ cc.evex().vcvtps2qq(ymmA, xmmB);
2340
+ cc.evex().vcvtps2qq(zmmA, ymmB);
2341
+ cc.evex().vcvtps2udq(xmmA, xmmB);
2342
+ cc.evex().vcvtps2udq(ymmA, ymmB);
2343
+ cc.evex().vcvtps2udq(zmmA, zmmB);
2344
+ cc.evex().vcvtps2uqq(xmmA, xmmB);
2345
+ cc.evex().vcvtps2uqq(ymmA, xmmB);
2346
+ cc.evex().vcvtps2uqq(zmmA, ymmB);
2347
+ cc.evex().vcvtqq2pd(xmmA, xmmB);
2348
+ cc.evex().vcvtqq2pd(ymmA, ymmB);
2349
+ cc.evex().vcvtqq2pd(zmmA, zmmB);
2350
+ cc.evex().vcvtqq2ps(xmmA, xmmB);
2351
+ cc.evex().vcvtqq2ps(xmmA, ymmB);
2352
+ cc.evex().vcvtqq2ps(ymmA, zmmB);
2353
+ cc.evex().vcvtsd2si(gpd, xmmB);
2354
+ if (cc.is64Bit()) cc.evex().vcvtsd2si(gpq, xmmB);
2355
+ cc.evex().vcvtsd2ss(xmmA, xmmB, xmmC);
2356
+ cc.evex().vcvtsd2usi(gpd, xmmB);
2357
+ if (cc.is64Bit()) cc.evex().vcvtsd2usi(gpq, xmmB);
2358
+ cc.evex().vcvtsi2sd(xmmA, xmmB, gpd);
2359
+ if (cc.is64Bit()) cc.evex().vcvtsi2sd(xmmA, xmmB, gpq);
2360
+ cc.evex().vcvtsi2ss(xmmA, xmmB, gpd);
2361
+ if (cc.is64Bit()) cc.evex().vcvtsi2ss(xmmA, xmmB, gpq);
2362
+ cc.evex().vcvtss2sd(xmmA, xmmB, xmmC);
2363
+ cc.evex().vcvtss2si(gpd, xmmB);
2364
+ if (cc.is64Bit()) cc.evex().vcvtss2si(gpq, xmmB);
2365
+ cc.evex().vcvtss2usi(gpd, xmmB);
2366
+ if (cc.is64Bit()) cc.evex().vcvtss2usi(gpq, xmmB);
2367
+ cc.evex().vcvttpd2dq(xmmA, xmmB);
2368
+ cc.evex().vcvttpd2dq(xmmA, ymmB);
2369
+ cc.evex().vcvttpd2dq(ymmA, zmmB);
2370
+ cc.evex().vcvttpd2qq(xmmA, xmmB);
2371
+ cc.evex().vcvttpd2qq(ymmA, ymmB);
2372
+ cc.evex().vcvttpd2qq(zmmA, zmmB);
2373
+ cc.evex().vcvttpd2udq(xmmA, xmmB);
2374
+ cc.evex().vcvttpd2udq(xmmA, ymmB);
2375
+ cc.evex().vcvttpd2udq(ymmA, zmmB);
2376
+ cc.evex().vcvttpd2uqq(xmmA, xmmB);
2377
+ cc.evex().vcvttpd2uqq(ymmA, ymmB);
2378
+ cc.evex().vcvttpd2uqq(zmmA, zmmB);
2379
+ cc.evex().vcvttps2dq(xmmA, xmmB);
2380
+ cc.evex().vcvttps2dq(ymmA, ymmB);
2381
+ cc.evex().vcvttps2dq(zmmA, zmmB);
2382
+ cc.evex().vcvttps2qq(xmmA, xmmB);
2383
+ cc.evex().vcvttps2qq(ymmA, xmmB);
2384
+ cc.evex().vcvttps2qq(zmmA, ymmB);
2385
+ cc.evex().vcvttps2udq(xmmA, xmmB);
2386
+ cc.evex().vcvttps2udq(ymmA, ymmB);
2387
+ cc.evex().vcvttps2udq(zmmA, zmmB);
2388
+ cc.evex().vcvttps2uqq(xmmA, xmmB);
2389
+ cc.evex().vcvttps2uqq(ymmA, xmmB);
2390
+ cc.evex().vcvttps2uqq(zmmA, ymmB);
2391
+ cc.evex().vcvttsd2si(gpd, xmmB);
2392
+ if (cc.is64Bit()) cc.evex().vcvttsd2si(gpq, xmmB);
2393
+ cc.evex().vcvttsd2usi(gpd, xmmB);
2394
+ if (cc.is64Bit()) cc.evex().vcvttsd2usi(gpq, xmmB);
2395
+ cc.evex().vcvttss2si(gpd, xmmB);
2396
+ if (cc.is64Bit()) cc.evex().vcvttss2si(gpq, xmmB);
2397
+ cc.evex().vcvttss2usi(gpd, xmmB);
2398
+ if (cc.is64Bit()) cc.evex().vcvttss2usi(gpq, xmmB);
2399
+ cc.evex().vcvtudq2pd(xmmA, xmmB);
2400
+ cc.evex().vcvtudq2pd(ymmA, xmmB);
2401
+ cc.evex().vcvtudq2pd(zmmA, ymmB);
2402
+ cc.evex().vcvtudq2ps(xmmA, xmmB);
2403
+ cc.evex().vcvtudq2ps(ymmA, ymmB);
2404
+ cc.evex().vcvtudq2ps(zmmA, zmmB);
2405
+ cc.evex().vcvtuqq2pd(xmmA, xmmB);
2406
+ cc.evex().vcvtuqq2pd(ymmA, ymmB);
2407
+ cc.evex().vcvtuqq2pd(zmmA, zmmB);
2408
+ cc.evex().vcvtuqq2ps(xmmA, xmmB);
2409
+ cc.evex().vcvtuqq2ps(xmmA, ymmB);
2410
+ cc.evex().vcvtuqq2ps(ymmA, zmmB);
2411
+ cc.evex().vcvtusi2sd(xmmA, xmmB, gpd);
2412
+ if (cc.is64Bit()) cc.evex().vcvtusi2sd(xmmA, xmmB, gpq);
2413
+ cc.evex().vcvtusi2ss(xmmA, xmmB, gpd);
2414
+ if (cc.is64Bit()) cc.evex().vcvtusi2ss(xmmA, xmmB, gpq);
2415
+ cc.evex().vdbpsadbw(xmmA, xmmB, xmmC, 0);
2416
+ cc.evex().vdbpsadbw(ymmA, ymmB, ymmC, 0);
2417
+ cc.evex().vdbpsadbw(zmmA, zmmB, zmmC, 0);
2418
+ cc.evex().vdivpd(xmmA, xmmB, xmmC);
2419
+ cc.evex().vdivpd(ymmA, ymmB, ymmC);
2420
+ cc.evex().vdivpd(zmmA, zmmB, zmmC);
2421
+ cc.evex().vdivps(xmmA, xmmB, xmmC);
2422
+ cc.evex().vdivps(ymmA, ymmB, ymmC);
2423
+ cc.evex().vdivps(zmmA, zmmB, zmmC);
2424
+ cc.evex().vdivsd(xmmA, xmmB, xmmC);
2425
+ cc.evex().vdivss(xmmA, xmmB, xmmC);
2426
+ cc.evex().vexp2pd(zmmA, zmmB);
2427
+ cc.evex().vexp2ps(zmmA, zmmB);
2428
+ cc.evex().vexpandpd(xmmA, xmmB);
2429
+ cc.evex().vexpandpd(ymmA, ymmB);
2430
+ cc.evex().vexpandpd(zmmA, zmmB);
2431
+ cc.evex().vexpandps(xmmA, xmmB);
2432
+ cc.evex().vexpandps(ymmA, ymmB);
2433
+ cc.evex().vexpandps(zmmA, zmmB);
2434
+ cc.evex().vextractf32x4(xmmA, ymmB, 0);
2435
+ cc.evex().vextractf32x4(xmmA, zmmB, 0);
2436
+ cc.evex().vextractf32x8(ymmA, zmmB, 0);
2437
+ cc.evex().vextractf64x2(xmmA, ymmB, 0);
2438
+ cc.evex().vextractf64x2(xmmA, zmmB, 0);
2439
+ cc.evex().vextractf64x4(ymmA, zmmB, 0);
2440
+ cc.evex().vextracti32x4(xmmA, ymmB, 0);
2441
+ cc.evex().vextracti32x4(xmmA, zmmB, 0);
2442
+ cc.evex().vextracti32x8(ymmA, zmmB, 0);
2443
+ cc.evex().vextracti64x2(xmmA, ymmB, 0);
2444
+ cc.evex().vextracti64x2(xmmA, zmmB, 0);
2445
+ cc.evex().vextracti64x4(ymmA, zmmB, 0);
2446
+ cc.evex().vextractps(gpd, xmmB, 0);
2447
+ cc.evex().vfixupimmpd(xmmA, xmmB, xmmC, 0);
2448
+ cc.evex().vfixupimmpd(ymmA, ymmB, ymmC, 0);
2449
+ cc.evex().vfixupimmpd(zmmA, zmmB, zmmC, 0);
2450
+ cc.evex().vfixupimmps(xmmA, xmmB, xmmC, 0);
2451
+ cc.evex().vfixupimmps(ymmA, ymmB, ymmC, 0);
2452
+ cc.evex().vfixupimmps(zmmA, zmmB, zmmC, 0);
2453
+ cc.evex().vfixupimmsd(xmmA, xmmB, xmmC, 0);
2454
+ cc.evex().vfixupimmss(xmmA, xmmB, xmmC, 0);
2455
+ cc.evex().vfmadd132pd(xmmA, xmmB, xmmC);
2456
+ cc.evex().vfmadd132pd(ymmA, ymmB, ymmC);
2457
+ cc.evex().vfmadd132pd(zmmA, zmmB, zmmC);
2458
+ cc.evex().vfmadd132ps(xmmA, xmmB, xmmC);
2459
+ cc.evex().vfmadd132ps(ymmA, ymmB, ymmC);
2460
+ cc.evex().vfmadd132ps(zmmA, zmmB, zmmC);
2461
+ cc.evex().vfmadd132sd(xmmA, xmmB, xmmC);
2462
+ cc.evex().vfmadd132ss(xmmA, xmmB, xmmC);
2463
+ cc.evex().vfmadd213pd(xmmA, xmmB, xmmC);
2464
+ cc.evex().vfmadd213pd(ymmA, ymmB, ymmC);
2465
+ cc.evex().vfmadd213pd(zmmA, zmmB, zmmC);
2466
+ cc.evex().vfmadd213ps(xmmA, xmmB, xmmC);
2467
+ cc.evex().vfmadd213ps(ymmA, ymmB, ymmC);
2468
+ cc.evex().vfmadd213ps(zmmA, zmmB, zmmC);
2469
+ cc.evex().vfmadd213sd(xmmA, xmmB, xmmC);
2470
+ cc.evex().vfmadd213ss(xmmA, xmmB, xmmC);
2471
+ cc.evex().vfmadd231pd(xmmA, xmmB, xmmC);
2472
+ cc.evex().vfmadd231pd(ymmA, ymmB, ymmC);
2473
+ cc.evex().vfmadd231pd(zmmA, zmmB, zmmC);
2474
+ cc.evex().vfmadd231ps(xmmA, xmmB, xmmC);
2475
+ cc.evex().vfmadd231ps(ymmA, ymmB, ymmC);
2476
+ cc.evex().vfmadd231ps(zmmA, zmmB, zmmC);
2477
+ cc.evex().vfmadd231sd(xmmA, xmmB, xmmC);
2478
+ cc.evex().vfmadd231ss(xmmA, xmmB, xmmC);
2479
+ cc.evex().vfmaddsub132pd(xmmA, xmmB, xmmC);
2480
+ cc.evex().vfmaddsub132pd(ymmA, ymmB, ymmC);
2481
+ cc.evex().vfmaddsub132pd(zmmA, zmmB, zmmC);
2482
+ cc.evex().vfmaddsub132ps(xmmA, xmmB, xmmC);
2483
+ cc.evex().vfmaddsub132ps(ymmA, ymmB, ymmC);
2484
+ cc.evex().vfmaddsub132ps(zmmA, zmmB, zmmC);
2485
+ cc.evex().vfmaddsub213pd(xmmA, xmmB, xmmC);
2486
+ cc.evex().vfmaddsub213pd(ymmA, ymmB, ymmC);
2487
+ cc.evex().vfmaddsub213pd(zmmA, zmmB, zmmC);
2488
+ cc.evex().vfmaddsub213ps(xmmA, xmmB, xmmC);
2489
+ cc.evex().vfmaddsub213ps(ymmA, ymmB, ymmC);
2490
+ cc.evex().vfmaddsub213ps(zmmA, zmmB, zmmC);
2491
+ cc.evex().vfmaddsub231pd(xmmA, xmmB, xmmC);
2492
+ cc.evex().vfmaddsub231pd(ymmA, ymmB, ymmC);
2493
+ cc.evex().vfmaddsub231pd(zmmA, zmmB, zmmC);
2494
+ cc.evex().vfmaddsub231ps(xmmA, xmmB, xmmC);
2495
+ cc.evex().vfmaddsub231ps(ymmA, ymmB, ymmC);
2496
+ cc.evex().vfmaddsub231ps(zmmA, zmmB, zmmC);
2497
+ cc.evex().vfmsub132pd(xmmA, xmmB, xmmC);
2498
+ cc.evex().vfmsub132pd(ymmA, ymmB, ymmC);
2499
+ cc.evex().vfmsub132pd(zmmA, zmmB, zmmC);
2500
+ cc.evex().vfmsub132ps(xmmA, xmmB, xmmC);
2501
+ cc.evex().vfmsub132ps(ymmA, ymmB, ymmC);
2502
+ cc.evex().vfmsub132ps(zmmA, zmmB, zmmC);
2503
+ cc.evex().vfmsub132sd(xmmA, xmmB, xmmC);
2504
+ cc.evex().vfmsub132ss(xmmA, xmmB, xmmC);
2505
+ cc.evex().vfmsub213pd(xmmA, xmmB, xmmC);
2506
+ cc.evex().vfmsub213pd(ymmA, ymmB, ymmC);
2507
+ cc.evex().vfmsub213pd(zmmA, zmmB, zmmC);
2508
+ cc.evex().vfmsub213ps(xmmA, xmmB, xmmC);
2509
+ cc.evex().vfmsub213ps(ymmA, ymmB, ymmC);
2510
+ cc.evex().vfmsub213ps(zmmA, zmmB, zmmC);
2511
+ cc.evex().vfmsub213sd(xmmA, xmmB, xmmC);
2512
+ cc.evex().vfmsub213ss(xmmA, xmmB, xmmC);
2513
+ cc.evex().vfmsub231pd(xmmA, xmmB, xmmC);
2514
+ cc.evex().vfmsub231pd(ymmA, ymmB, ymmC);
2515
+ cc.evex().vfmsub231pd(zmmA, zmmB, zmmC);
2516
+ cc.evex().vfmsub231ps(xmmA, xmmB, xmmC);
2517
+ cc.evex().vfmsub231ps(ymmA, ymmB, ymmC);
2518
+ cc.evex().vfmsub231ps(zmmA, zmmB, zmmC);
2519
+ cc.evex().vfmsub231sd(xmmA, xmmB, xmmC);
2520
+ cc.evex().vfmsub231ss(xmmA, xmmB, xmmC);
2521
+ cc.evex().vfmsubadd132pd(xmmA, xmmB, xmmC);
2522
+ cc.evex().vfmsubadd132pd(ymmA, ymmB, ymmC);
2523
+ cc.evex().vfmsubadd132pd(zmmA, zmmB, zmmC);
2524
+ cc.evex().vfmsubadd132ps(xmmA, xmmB, xmmC);
2525
+ cc.evex().vfmsubadd132ps(ymmA, ymmB, ymmC);
2526
+ cc.evex().vfmsubadd132ps(zmmA, zmmB, zmmC);
2527
+ cc.evex().vfmsubadd213pd(xmmA, xmmB, xmmC);
2528
+ cc.evex().vfmsubadd213pd(ymmA, ymmB, ymmC);
2529
+ cc.evex().vfmsubadd213pd(zmmA, zmmB, zmmC);
2530
+ cc.evex().vfmsubadd213ps(xmmA, xmmB, xmmC);
2531
+ cc.evex().vfmsubadd213ps(ymmA, ymmB, ymmC);
2532
+ cc.evex().vfmsubadd213ps(zmmA, zmmB, zmmC);
2533
+ cc.evex().vfmsubadd231pd(xmmA, xmmB, xmmC);
2534
+ cc.evex().vfmsubadd231pd(ymmA, ymmB, ymmC);
2535
+ cc.evex().vfmsubadd231pd(zmmA, zmmB, zmmC);
2536
+ cc.evex().vfmsubadd231ps(xmmA, xmmB, xmmC);
2537
+ cc.evex().vfmsubadd231ps(ymmA, ymmB, ymmC);
2538
+ cc.evex().vfmsubadd231ps(zmmA, zmmB, zmmC);
2539
+ cc.evex().vfnmadd132pd(xmmA, xmmB, xmmC);
2540
+ cc.evex().vfnmadd132pd(ymmA, ymmB, ymmC);
2541
+ cc.evex().vfnmadd132pd(zmmA, zmmB, zmmC);
2542
+ cc.evex().vfnmadd132ps(xmmA, xmmB, xmmC);
2543
+ cc.evex().vfnmadd132ps(ymmA, ymmB, ymmC);
2544
+ cc.evex().vfnmadd132ps(zmmA, zmmB, zmmC);
2545
+ cc.evex().vfnmadd132sd(xmmA, xmmB, xmmC);
2546
+ cc.evex().vfnmadd132ss(xmmA, xmmB, xmmC);
2547
+ cc.evex().vfnmadd213pd(xmmA, xmmB, xmmC);
2548
+ cc.evex().vfnmadd213pd(ymmA, ymmB, ymmC);
2549
+ cc.evex().vfnmadd213pd(zmmA, zmmB, zmmC);
2550
+ cc.evex().vfnmadd213ps(xmmA, xmmB, xmmC);
2551
+ cc.evex().vfnmadd213ps(ymmA, ymmB, ymmC);
2552
+ cc.evex().vfnmadd213ps(zmmA, zmmB, zmmC);
2553
+ cc.evex().vfnmadd213sd(xmmA, xmmB, xmmC);
2554
+ cc.evex().vfnmadd213ss(xmmA, xmmB, xmmC);
2555
+ cc.evex().vfnmadd231pd(xmmA, xmmB, xmmC);
2556
+ cc.evex().vfnmadd231pd(ymmA, ymmB, ymmC);
2557
+ cc.evex().vfnmadd231pd(zmmA, zmmB, zmmC);
2558
+ cc.evex().vfnmadd231ps(xmmA, xmmB, xmmC);
2559
+ cc.evex().vfnmadd231ps(ymmA, ymmB, ymmC);
2560
+ cc.evex().vfnmadd231ps(zmmA, zmmB, zmmC);
2561
+ cc.evex().vfnmadd231sd(xmmA, xmmB, xmmC);
2562
+ cc.evex().vfnmadd231ss(xmmA, xmmB, xmmC);
2563
+ cc.evex().vfnmsub132pd(xmmA, xmmB, xmmC);
2564
+ cc.evex().vfnmsub132pd(ymmA, ymmB, ymmC);
2565
+ cc.evex().vfnmsub132pd(zmmA, zmmB, zmmC);
2566
+ cc.evex().vfnmsub132ps(xmmA, xmmB, xmmC);
2567
+ cc.evex().vfnmsub132ps(ymmA, ymmB, ymmC);
2568
+ cc.evex().vfnmsub132ps(zmmA, zmmB, zmmC);
2569
+ cc.evex().vfnmsub132sd(xmmA, xmmB, xmmC);
2570
+ cc.evex().vfnmsub132ss(xmmA, xmmB, xmmC);
2571
+ cc.evex().vfnmsub213pd(xmmA, xmmB, xmmC);
2572
+ cc.evex().vfnmsub213pd(ymmA, ymmB, ymmC);
2573
+ cc.evex().vfnmsub213pd(zmmA, zmmB, zmmC);
2574
+ cc.evex().vfnmsub213ps(xmmA, xmmB, xmmC);
2575
+ cc.evex().vfnmsub213ps(ymmA, ymmB, ymmC);
2576
+ cc.evex().vfnmsub213ps(zmmA, zmmB, zmmC);
2577
+ cc.evex().vfnmsub213sd(xmmA, xmmB, xmmC);
2578
+ cc.evex().vfnmsub213ss(xmmA, xmmB, xmmC);
2579
+ cc.evex().vfnmsub231pd(xmmA, xmmB, xmmC);
2580
+ cc.evex().vfnmsub231pd(ymmA, ymmB, ymmC);
2581
+ cc.evex().vfnmsub231pd(zmmA, zmmB, zmmC);
2582
+ cc.evex().vfnmsub231ps(xmmA, xmmB, xmmC);
2583
+ cc.evex().vfnmsub231ps(ymmA, ymmB, ymmC);
2584
+ cc.evex().vfnmsub231ps(zmmA, zmmB, zmmC);
2585
+ cc.evex().vfnmsub231sd(xmmA, xmmB, xmmC);
2586
+ cc.evex().vfnmsub231ss(xmmA, xmmB, xmmC);
2587
+ cc.evex().vfpclasspd(kA, xmmB, 0);
2588
+ cc.evex().vfpclasspd(kA, ymmB, 0);
2589
+ cc.evex().vfpclasspd(kA, zmmB, 0);
2590
+ cc.evex().vfpclassps(kA, xmmB, 0);
2591
+ cc.evex().vfpclassps(kA, ymmB, 0);
2592
+ cc.evex().vfpclassps(kA, zmmB, 0);
2593
+ cc.evex().vfpclasssd(kA, xmmB, 0);
2594
+ cc.evex().vfpclassss(kA, xmmB, 0);
2595
+ cc.evex().vgetexppd(xmmA, xmmB);
2596
+ cc.evex().vgetexppd(ymmA, ymmB);
2597
+ cc.evex().vgetexppd(zmmA, zmmB);
2598
+ cc.evex().vgetexpps(xmmA, xmmB);
2599
+ cc.evex().vgetexpps(ymmA, ymmB);
2600
+ cc.evex().vgetexpps(zmmA, zmmB);
2601
+ cc.evex().vgetexpsd(xmmA, xmmB, xmmC);
2602
+ cc.evex().vgetexpss(xmmA, xmmB, xmmC);
2603
+ cc.evex().vgetmantpd(xmmA, xmmB, 0);
2604
+ cc.evex().vgetmantpd(ymmA, ymmB, 0);
2605
+ cc.evex().vgetmantpd(zmmA, zmmB, 0);
2606
+ cc.evex().vgetmantps(xmmA, xmmB, 0);
2607
+ cc.evex().vgetmantps(ymmA, ymmB, 0);
2608
+ cc.evex().vgetmantps(zmmA, zmmB, 0);
2609
+ cc.evex().vgetmantsd(xmmA, xmmB, xmmC, 0);
2610
+ cc.evex().vgetmantss(xmmA, xmmB, xmmC, 0);
2611
+ cc.evex().vinsertf32x4(ymmA, ymmB, xmmC, 0);
2612
+ cc.evex().vinsertf32x4(zmmA, zmmB, xmmC, 0);
2613
+ cc.evex().vinsertf32x8(zmmA, zmmB, ymmC, 0);
2614
+ cc.evex().vinsertf64x2(ymmA, ymmB, xmmC, 0);
2615
+ cc.evex().vinsertf64x2(zmmA, zmmB, xmmC, 0);
2616
+ cc.evex().vinsertf64x4(zmmA, zmmB, ymmC, 0);
2617
+ cc.evex().vinserti32x4(ymmA, ymmB, xmmC, 0);
2618
+ cc.evex().vinserti32x4(zmmA, zmmB, xmmC, 0);
2619
+ cc.evex().vinserti32x8(zmmA, zmmB, ymmC, 0);
2620
+ cc.evex().vinserti64x2(ymmA, ymmB, xmmC, 0);
2621
+ cc.evex().vinserti64x2(zmmA, zmmB, xmmC, 0);
2622
+ cc.evex().vinserti64x4(zmmA, zmmB, ymmC, 0);
2623
+ cc.evex().vinsertps(xmmA, xmmB, xmmC, 0);
2624
+ cc.evex().vmaxpd(xmmA, xmmB, xmmC);
2625
+ cc.evex().vmaxpd(ymmA, ymmB, ymmC);
2626
+ cc.evex().vmaxpd(zmmA, zmmB, zmmC);
2627
+ cc.evex().vmaxps(xmmA, xmmB, xmmC);
2628
+ cc.evex().vmaxps(ymmA, ymmB, ymmC);
2629
+ cc.evex().vmaxps(zmmA, zmmB, zmmC);
2630
+ cc.evex().vmaxsd(xmmA, xmmB, xmmC);
2631
+ cc.evex().vmaxss(xmmA, xmmB, xmmC);
2632
+ cc.evex().vminpd(xmmA, xmmB, xmmC);
2633
+ cc.evex().vminpd(ymmA, ymmB, ymmC);
2634
+ cc.evex().vminpd(zmmA, zmmB, zmmC);
2635
+ cc.evex().vminps(xmmA, xmmB, xmmC);
2636
+ cc.evex().vminps(ymmA, ymmB, ymmC);
2637
+ cc.evex().vminps(zmmA, zmmB, zmmC);
2638
+ cc.evex().vminsd(xmmA, xmmB, xmmC);
2639
+ cc.evex().vminss(xmmA, xmmB, xmmC);
2640
+ cc.evex().vmovapd(xmmA, xmmB);
2641
+ cc.evex().vmovapd(xmmA, xmmB);
2642
+ cc.evex().vmovapd(ymmA, ymmB);
2643
+ cc.evex().vmovapd(ymmA, ymmB);
2644
+ cc.evex().vmovapd(zmmA, zmmB);
2645
+ cc.evex().vmovapd(zmmA, zmmB);
2646
+ cc.evex().vmovaps(xmmA, xmmB);
2647
+ cc.evex().vmovaps(xmmA, xmmB);
2648
+ cc.evex().vmovaps(ymmA, ymmB);
2649
+ cc.evex().vmovaps(ymmA, ymmB);
2650
+ cc.evex().vmovaps(zmmA, zmmB);
2651
+ cc.evex().vmovaps(zmmA, zmmB);
2652
+ cc.evex().vmovd(gpd, xmmB);
2653
+ cc.evex().vmovd(xmmA, gpd);
2654
+ cc.evex().vmovddup(xmmA, xmmB);
2655
+ cc.evex().vmovddup(ymmA, ymmB);
2656
+ cc.evex().vmovddup(zmmA, zmmB);
2657
+ cc.evex().vmovdqa32(xmmA, xmmB);
2658
+ cc.evex().vmovdqa32(xmmA, xmmB);
2659
+ cc.evex().vmovdqa32(ymmA, ymmB);
2660
+ cc.evex().vmovdqa32(ymmA, ymmB);
2661
+ cc.evex().vmovdqa32(zmmA, zmmB);
2662
+ cc.evex().vmovdqa32(zmmA, zmmB);
2663
+ cc.evex().vmovdqa64(xmmA, xmmB);
2664
+ cc.evex().vmovdqa64(xmmA, xmmB);
2665
+ cc.evex().vmovdqa64(ymmA, ymmB);
2666
+ cc.evex().vmovdqa64(ymmA, ymmB);
2667
+ cc.evex().vmovdqa64(zmmA, zmmB);
2668
+ cc.evex().vmovdqa64(zmmA, zmmB);
2669
+ cc.evex().vmovdqu16(xmmA, xmmB);
2670
+ cc.evex().vmovdqu16(xmmA, xmmB);
2671
+ cc.evex().vmovdqu16(ymmA, ymmB);
2672
+ cc.evex().vmovdqu16(ymmA, ymmB);
2673
+ cc.evex().vmovdqu16(zmmA, zmmB);
2674
+ cc.evex().vmovdqu16(zmmA, zmmB);
2675
+ cc.evex().vmovdqu32(xmmA, xmmB);
2676
+ cc.evex().vmovdqu32(xmmA, xmmB);
2677
+ cc.evex().vmovdqu32(ymmA, ymmB);
2678
+ cc.evex().vmovdqu32(ymmA, ymmB);
2679
+ cc.evex().vmovdqu32(zmmA, zmmB);
2680
+ cc.evex().vmovdqu32(zmmA, zmmB);
2681
+ cc.evex().vmovdqu64(xmmA, xmmB);
2682
+ cc.evex().vmovdqu64(xmmA, xmmB);
2683
+ cc.evex().vmovdqu64(ymmA, ymmB);
2684
+ cc.evex().vmovdqu64(ymmA, ymmB);
2685
+ cc.evex().vmovdqu64(zmmA, zmmB);
2686
+ cc.evex().vmovdqu64(zmmA, zmmB);
2687
+ cc.evex().vmovdqu8(xmmA, xmmB);
2688
+ cc.evex().vmovdqu8(xmmA, xmmB);
2689
+ cc.evex().vmovdqu8(ymmA, ymmB);
2690
+ cc.evex().vmovdqu8(ymmA, ymmB);
2691
+ cc.evex().vmovdqu8(zmmA, zmmB);
2692
+ cc.evex().vmovdqu8(zmmA, zmmB);
2693
+ cc.evex().vmovhlps(xmmA, xmmB, xmmC);
2694
+ if (cc.is64Bit()) cc.evex().vmovq(gpq, xmmB);
2695
+ if (cc.is64Bit()) cc.evex().vmovq(xmmA, gpq);
2696
+ cc.evex().vmovq(xmmA, xmmB);
2697
+ cc.evex().vmovsd(xmmA, xmmB, xmmC);
2698
+ cc.evex().vmovshdup(xmmA, xmmB);
2699
+ cc.evex().vmovshdup(ymmA, ymmB);
2700
+ cc.evex().vmovshdup(zmmA, zmmB);
2701
+ cc.evex().vmovsldup(xmmA, xmmB);
2702
+ cc.evex().vmovsldup(ymmA, ymmB);
2703
+ cc.evex().vmovsldup(zmmA, zmmB);
2704
+ cc.evex().vmovss(xmmA, xmmB, xmmC);
2705
+ cc.evex().vmovupd(xmmA, xmmB);
2706
+ cc.evex().vmovupd(xmmA, xmmB);
2707
+ cc.evex().vmovupd(ymmA, ymmB);
2708
+ cc.evex().vmovupd(ymmA, ymmB);
2709
+ cc.evex().vmovupd(zmmA, zmmB);
2710
+ cc.evex().vmovupd(zmmA, zmmB);
2711
+ cc.evex().vmovups(xmmA, xmmB);
2712
+ cc.evex().vmovups(xmmA, xmmB);
2713
+ cc.evex().vmovups(ymmA, ymmB);
2714
+ cc.evex().vmovups(ymmA, ymmB);
2715
+ cc.evex().vmovups(zmmA, zmmB);
2716
+ cc.evex().vmovups(zmmA, zmmB);
2717
+ cc.evex().vmulpd(xmmA, xmmB, xmmC);
2718
+ cc.evex().vmulpd(ymmA, ymmB, ymmC);
2719
+ cc.evex().vmulpd(zmmA, zmmB, zmmC);
2720
+ cc.evex().vmulps(xmmA, xmmB, xmmC);
2721
+ cc.evex().vmulps(ymmA, ymmB, ymmC);
2722
+ cc.evex().vmulps(zmmA, zmmB, zmmC);
2723
+ cc.evex().vmulsd(xmmA, xmmB, xmmC);
2724
+ cc.evex().vmulss(xmmA, xmmB, xmmC);
2725
+ cc.evex().vorpd(xmmA, xmmB, xmmC);
2726
+ cc.evex().vorpd(ymmA, ymmB, ymmC);
2727
+ cc.evex().vorpd(zmmA, zmmB, zmmC);
2728
+ cc.evex().vorps(xmmA, xmmB, xmmC);
2729
+ cc.evex().vorps(ymmA, ymmB, ymmC);
2730
+ cc.evex().vorps(zmmA, zmmB, zmmC);
2731
+ cc.evex().vpabsb(xmmA, xmmB);
2732
+ cc.evex().vpabsb(ymmA, ymmB);
2733
+ cc.evex().vpabsb(zmmA, zmmB);
2734
+ cc.evex().vpabsd(xmmA, xmmB);
2735
+ cc.evex().vpabsd(ymmA, ymmB);
2736
+ cc.evex().vpabsd(zmmA, zmmB);
2737
+ cc.evex().vpabsq(xmmA, xmmB);
2738
+ cc.evex().vpabsq(ymmA, ymmB);
2739
+ cc.evex().vpabsq(zmmA, zmmB);
2740
+ cc.evex().vpabsw(xmmA, xmmB);
2741
+ cc.evex().vpabsw(ymmA, ymmB);
2742
+ cc.evex().vpabsw(zmmA, zmmB);
2743
+ cc.evex().vpackssdw(xmmA, xmmB, xmmC);
2744
+ cc.evex().vpackssdw(ymmA, ymmB, ymmC);
2745
+ cc.evex().vpackssdw(zmmA, zmmB, zmmC);
2746
+ cc.evex().vpacksswb(xmmA, xmmB, xmmC);
2747
+ cc.evex().vpacksswb(ymmA, ymmB, ymmC);
2748
+ cc.evex().vpacksswb(zmmA, zmmB, zmmC);
2749
+ cc.evex().vpackusdw(xmmA, xmmB, xmmC);
2750
+ cc.evex().vpackusdw(ymmA, ymmB, ymmC);
2751
+ cc.evex().vpackusdw(zmmA, zmmB, zmmC);
2752
+ cc.evex().vpackuswb(xmmA, xmmB, xmmC);
2753
+ cc.evex().vpackuswb(ymmA, ymmB, ymmC);
2754
+ cc.evex().vpackuswb(zmmA, zmmB, zmmC);
2755
+ cc.evex().vpaddb(xmmA, xmmB, xmmC);
2756
+ cc.evex().vpaddb(ymmA, ymmB, ymmC);
2757
+ cc.evex().vpaddb(zmmA, zmmB, zmmC);
2758
+ cc.evex().vpaddd(xmmA, xmmB, xmmC);
2759
+ cc.evex().vpaddd(ymmA, ymmB, ymmC);
2760
+ cc.evex().vpaddd(zmmA, zmmB, zmmC);
2761
+ cc.evex().vpaddq(xmmA, xmmB, xmmC);
2762
+ cc.evex().vpaddq(ymmA, ymmB, ymmC);
2763
+ cc.evex().vpaddq(zmmA, zmmB, zmmC);
2764
+ cc.evex().vpaddsb(xmmA, xmmB, xmmC);
2765
+ cc.evex().vpaddsb(ymmA, ymmB, ymmC);
2766
+ cc.evex().vpaddsb(zmmA, zmmB, zmmC);
2767
+ cc.evex().vpaddsw(xmmA, xmmB, xmmC);
2768
+ cc.evex().vpaddsw(ymmA, ymmB, ymmC);
2769
+ cc.evex().vpaddsw(zmmA, zmmB, zmmC);
2770
+ cc.evex().vpaddusb(xmmA, xmmB, xmmC);
2771
+ cc.evex().vpaddusb(ymmA, ymmB, ymmC);
2772
+ cc.evex().vpaddusb(zmmA, zmmB, zmmC);
2773
+ cc.evex().vpaddusw(xmmA, xmmB, xmmC);
2774
+ cc.evex().vpaddusw(ymmA, ymmB, ymmC);
2775
+ cc.evex().vpaddusw(zmmA, zmmB, zmmC);
2776
+ cc.evex().vpaddw(xmmA, xmmB, xmmC);
2777
+ cc.evex().vpaddw(ymmA, ymmB, ymmC);
2778
+ cc.evex().vpaddw(zmmA, zmmB, zmmC);
2779
+ cc.evex().vpalignr(xmmA, xmmB, xmmC, 0);
2780
+ cc.evex().vpalignr(ymmA, ymmB, ymmC, 0);
2781
+ cc.evex().vpalignr(zmmA, zmmB, zmmC, 0);
2782
+ cc.evex().vpandd(xmmA, xmmB, xmmC);
2783
+ cc.evex().vpandd(ymmA, ymmB, ymmC);
2784
+ cc.evex().vpandd(zmmA, zmmB, zmmC);
2785
+ cc.evex().vpandnd(xmmA, xmmB, xmmC);
2786
+ cc.evex().vpandnd(ymmA, ymmB, ymmC);
2787
+ cc.evex().vpandnd(zmmA, zmmB, zmmC);
2788
+ cc.evex().vpandnq(xmmA, xmmB, xmmC);
2789
+ cc.evex().vpandnq(ymmA, ymmB, ymmC);
2790
+ cc.evex().vpandnq(zmmA, zmmB, zmmC);
2791
+ cc.evex().vpandq(xmmA, xmmB, xmmC);
2792
+ cc.evex().vpandq(ymmA, ymmB, ymmC);
2793
+ cc.evex().vpandq(zmmA, zmmB, zmmC);
2794
+ cc.evex().vpavgb(xmmA, xmmB, xmmC);
2795
+ cc.evex().vpavgb(ymmA, ymmB, ymmC);
2796
+ cc.evex().vpavgb(zmmA, zmmB, zmmC);
2797
+ cc.evex().vpavgw(xmmA, xmmB, xmmC);
2798
+ cc.evex().vpavgw(ymmA, ymmB, ymmC);
2799
+ cc.evex().vpavgw(zmmA, zmmB, zmmC);
2800
+ cc.evex().vpblendmb(xmmA, xmmB, xmmC);
2801
+ cc.evex().vpblendmb(ymmA, ymmB, ymmC);
2802
+ cc.evex().vpblendmb(zmmA, zmmB, zmmC);
2803
+ cc.evex().vpblendmd(xmmA, xmmB, xmmC);
2804
+ cc.evex().vpblendmd(ymmA, ymmB, ymmC);
2805
+ cc.evex().vpblendmd(zmmA, zmmB, zmmC);
2806
+ cc.evex().vpblendmq(xmmA, xmmB, xmmC);
2807
+ cc.evex().vpblendmq(ymmA, ymmB, ymmC);
2808
+ cc.evex().vpblendmq(zmmA, zmmB, zmmC);
2809
+ cc.evex().vpblendmw(xmmA, xmmB, xmmC);
2810
+ cc.evex().vpblendmw(ymmA, ymmB, ymmC);
2811
+ cc.evex().vpblendmw(zmmA, zmmB, zmmC);
2812
+ cc.evex().vpbroadcastb(xmmA, gpd);
2813
+ cc.evex().vpbroadcastb(xmmA, xmmB);
2814
+ cc.evex().vpbroadcastb(ymmA, gpd);
2815
+ cc.evex().vpbroadcastb(ymmA, xmmB);
2816
+ cc.evex().vpbroadcastb(zmmA, gpd);
2817
+ cc.evex().vpbroadcastb(zmmA, xmmB);
2818
+ cc.evex().vpbroadcastd(xmmA, gpd);
2819
+ cc.evex().vpbroadcastd(xmmA, xmmB);
2820
+ cc.evex().vpbroadcastd(ymmA, gpd);
2821
+ cc.evex().vpbroadcastd(ymmA, xmmB);
2822
+ cc.evex().vpbroadcastd(zmmA, gpd);
2823
+ cc.evex().vpbroadcastd(zmmA, xmmB);
2824
+ cc.evex().vpbroadcastmb2q(xmmA, kB);
2825
+ cc.evex().vpbroadcastmb2q(ymmA, kB);
2826
+ cc.evex().vpbroadcastmb2q(zmmA, kB);
2827
+ cc.evex().vpbroadcastmw2d(xmmA, kB);
2828
+ cc.evex().vpbroadcastmw2d(ymmA, kB);
2829
+ cc.evex().vpbroadcastmw2d(zmmA, kB);
2830
+ if (cc.is64Bit()) cc.evex().vpbroadcastq(xmmA, gpq);
2831
+ cc.evex().vpbroadcastq(xmmA, xmmB);
2832
+ if (cc.is64Bit()) cc.evex().vpbroadcastq(ymmA, gpq);
2833
+ cc.evex().vpbroadcastq(ymmA, xmmB);
2834
+ if (cc.is64Bit()) cc.evex().vpbroadcastq(zmmA, gpq);
2835
+ cc.evex().vpbroadcastq(zmmA, xmmB);
2836
+ cc.evex().vpbroadcastw(xmmA, gpd);
2837
+ cc.evex().vpbroadcastw(xmmA, xmmB);
2838
+ cc.evex().vpbroadcastw(ymmA, gpd);
2839
+ cc.evex().vpbroadcastw(ymmA, xmmB);
2840
+ cc.evex().vpbroadcastw(zmmA, gpd);
2841
+ cc.evex().vpbroadcastw(zmmA, xmmB);
2842
+ cc.evex().vpcmpb(kA, xmmB, xmmC, 0);
2843
+ cc.evex().vpcmpb(kA, ymmB, ymmC, 0);
2844
+ cc.evex().vpcmpb(kA, zmmB, zmmC, 0);
2845
+ cc.evex().vpcmpd(kA, xmmB, xmmC, 0);
2846
+ cc.evex().vpcmpd(kA, ymmB, ymmC, 0);
2847
+ cc.evex().vpcmpd(kA, zmmB, zmmC, 0);
2848
+ cc.evex().vpcmpeqb(kA, xmmB, xmmC);
2849
+ cc.evex().vpcmpeqb(kA, ymmB, ymmC);
2850
+ cc.evex().vpcmpeqb(kA, zmmB, zmmC);
2851
+ cc.evex().vpcmpeqd(kA, xmmB, xmmC);
2852
+ cc.evex().vpcmpeqd(kA, ymmB, ymmC);
2853
+ cc.evex().vpcmpeqd(kA, zmmB, zmmC);
2854
+ cc.evex().vpcmpeqq(kA, xmmB, xmmC);
2855
+ cc.evex().vpcmpeqq(kA, ymmB, ymmC);
2856
+ cc.evex().vpcmpeqq(kA, zmmB, zmmC);
2857
+ cc.evex().vpcmpeqw(kA, xmmB, xmmC);
2858
+ cc.evex().vpcmpeqw(kA, ymmB, ymmC);
2859
+ cc.evex().vpcmpeqw(kA, zmmB, zmmC);
2860
+ cc.evex().vpcmpgtb(kA, xmmB, xmmC);
2861
+ cc.evex().vpcmpgtb(kA, ymmB, ymmC);
2862
+ cc.evex().vpcmpgtb(kA, zmmB, zmmC);
2863
+ cc.evex().vpcmpgtd(kA, xmmB, xmmC);
2864
+ cc.evex().vpcmpgtd(kA, ymmB, ymmC);
2865
+ cc.evex().vpcmpgtd(kA, zmmB, zmmC);
2866
+ cc.evex().vpcmpgtq(kA, xmmB, xmmC);
2867
+ cc.evex().vpcmpgtq(kA, ymmB, ymmC);
2868
+ cc.evex().vpcmpgtq(kA, zmmB, zmmC);
2869
+ cc.evex().vpcmpgtw(kA, xmmB, xmmC);
2870
+ cc.evex().vpcmpgtw(kA, ymmB, ymmC);
2871
+ cc.evex().vpcmpgtw(kA, zmmB, zmmC);
2872
+ cc.evex().vpcmpq(kA, xmmB, xmmC, 0);
2873
+ cc.evex().vpcmpq(kA, ymmB, ymmC, 0);
2874
+ cc.evex().vpcmpq(kA, zmmB, zmmC, 0);
2875
+ cc.evex().vpcmpub(kA, xmmB, xmmC, 0);
2876
+ cc.evex().vpcmpub(kA, ymmB, ymmC, 0);
2877
+ cc.evex().vpcmpub(kA, zmmB, zmmC, 0);
2878
+ cc.evex().vpcmpud(kA, xmmB, xmmC, 0);
2879
+ cc.evex().vpcmpud(kA, ymmB, ymmC, 0);
2880
+ cc.evex().vpcmpud(kA, zmmB, zmmC, 0);
2881
+ cc.evex().vpcmpuq(kA, xmmB, xmmC, 0);
2882
+ cc.evex().vpcmpuq(kA, ymmB, ymmC, 0);
2883
+ cc.evex().vpcmpuq(kA, zmmB, zmmC, 0);
2884
+ cc.evex().vpcmpuw(kA, xmmB, xmmC, 0);
2885
+ cc.evex().vpcmpuw(kA, ymmB, ymmC, 0);
2886
+ cc.evex().vpcmpuw(kA, zmmB, zmmC, 0);
2887
+ cc.evex().vpcmpw(kA, xmmB, xmmC, 0);
2888
+ cc.evex().vpcmpw(kA, ymmB, ymmC, 0);
2889
+ cc.evex().vpcmpw(kA, zmmB, zmmC, 0);
2890
+ cc.evex().vpcompressd(xmmA, xmmB);
2891
+ cc.evex().vpcompressd(ymmA, ymmB);
2892
+ cc.evex().vpcompressd(zmmA, zmmB);
2893
+ cc.evex().vpcompressq(xmmA, xmmB);
2894
+ cc.evex().vpcompressq(ymmA, ymmB);
2895
+ cc.evex().vpcompressq(zmmA, zmmB);
2896
+ cc.evex().vpconflictd(xmmA, xmmB);
2897
+ cc.evex().vpconflictd(ymmA, ymmB);
2898
+ cc.evex().vpconflictd(zmmA, zmmB);
2899
+ cc.evex().vpconflictq(xmmA, xmmB);
2900
+ cc.evex().vpconflictq(ymmA, ymmB);
2901
+ cc.evex().vpconflictq(zmmA, zmmB);
2902
+ cc.evex().vpermb(xmmA, xmmB, xmmC);
2903
+ cc.evex().vpermb(ymmA, ymmB, ymmC);
2904
+ cc.evex().vpermb(zmmA, zmmB, zmmC);
2905
+ cc.evex().vpermd(ymmA, ymmB, ymmC);
2906
+ cc.evex().vpermd(zmmA, zmmB, zmmC);
2907
+ cc.evex().vpermi2b(xmmA, xmmB, xmmC);
2908
+ cc.evex().vpermi2b(ymmA, ymmB, ymmC);
2909
+ cc.evex().vpermi2b(zmmA, zmmB, zmmC);
2910
+ cc.evex().vpermi2d(xmmA, xmmB, xmmC);
2911
+ cc.evex().vpermi2d(ymmA, ymmB, ymmC);
2912
+ cc.evex().vpermi2d(zmmA, zmmB, zmmC);
2913
+ cc.evex().vpermi2pd(xmmA, xmmB, xmmC);
2914
+ cc.evex().vpermi2pd(ymmA, ymmB, ymmC);
2915
+ cc.evex().vpermi2pd(zmmA, zmmB, zmmC);
2916
+ cc.evex().vpermi2ps(xmmA, xmmB, xmmC);
2917
+ cc.evex().vpermi2ps(ymmA, ymmB, ymmC);
2918
+ cc.evex().vpermi2ps(zmmA, zmmB, zmmC);
2919
+ cc.evex().vpermi2q(xmmA, xmmB, xmmC);
2920
+ cc.evex().vpermi2q(ymmA, ymmB, ymmC);
2921
+ cc.evex().vpermi2q(zmmA, zmmB, zmmC);
2922
+ cc.evex().vpermi2w(xmmA, xmmB, xmmC);
2923
+ cc.evex().vpermi2w(ymmA, ymmB, ymmC);
2924
+ cc.evex().vpermi2w(zmmA, zmmB, zmmC);
2925
+ cc.evex().vpermilpd(xmmA, xmmB, xmmC);
2926
+ cc.evex().vpermilpd(ymmA, ymmB, ymmC);
2927
+ cc.evex().vpermilpd(zmmA, zmmB, zmmC);
2928
+ cc.evex().vpermilpd(xmmA, xmmB, 0);
2929
+ cc.evex().vpermilpd(ymmA, ymmB, 0);
2930
+ cc.evex().vpermilpd(zmmA, zmmB, 0);
2931
+ cc.evex().vpermilps(xmmA, xmmB, xmmC);
2932
+ cc.evex().vpermilps(ymmA, ymmB, ymmC);
2933
+ cc.evex().vpermilps(zmmA, zmmB, zmmC);
2934
+ cc.evex().vpermilps(xmmA, xmmB, 0);
2935
+ cc.evex().vpermilps(ymmA, ymmB, 0);
2936
+ cc.evex().vpermilps(zmmA, zmmB, 0);
2937
+ cc.evex().vpermq(ymmA, ymmB, ymmC);
2938
+ cc.evex().vpermq(zmmA, zmmB, zmmC);
2939
+ cc.evex().vpermq(ymmA, ymmB, 0);
2940
+ cc.evex().vpermq(zmmA, zmmB, 0);
2941
+ cc.evex().vpermt2b(xmmA, xmmB, xmmC);
2942
+ cc.evex().vpermt2b(ymmA, ymmB, ymmC);
2943
+ cc.evex().vpermt2b(zmmA, zmmB, zmmC);
2944
+ cc.evex().vpermt2d(xmmA, xmmB, xmmC);
2945
+ cc.evex().vpermt2d(ymmA, ymmB, ymmC);
2946
+ cc.evex().vpermt2d(zmmA, zmmB, zmmC);
2947
+ cc.evex().vpermt2pd(xmmA, xmmB, xmmC);
2948
+ cc.evex().vpermt2pd(ymmA, ymmB, ymmC);
2949
+ cc.evex().vpermt2pd(zmmA, zmmB, zmmC);
2950
+ cc.evex().vpermt2ps(xmmA, xmmB, xmmC);
2951
+ cc.evex().vpermt2ps(ymmA, ymmB, ymmC);
2952
+ cc.evex().vpermt2ps(zmmA, zmmB, zmmC);
2953
+ cc.evex().vpermt2q(xmmA, xmmB, xmmC);
2954
+ cc.evex().vpermt2q(ymmA, ymmB, ymmC);
2955
+ cc.evex().vpermt2q(zmmA, zmmB, zmmC);
2956
+ cc.evex().vpermt2w(xmmA, xmmB, xmmC);
2957
+ cc.evex().vpermt2w(ymmA, ymmB, ymmC);
2958
+ cc.evex().vpermt2w(zmmA, zmmB, zmmC);
2959
+ cc.evex().vpermw(xmmA, xmmB, xmmC);
2960
+ cc.evex().vpermw(ymmA, ymmB, ymmC);
2961
+ cc.evex().vpermw(zmmA, zmmB, zmmC);
2962
+ cc.evex().vpexpandd(xmmA, xmmB);
2963
+ cc.evex().vpexpandd(ymmA, ymmB);
2964
+ cc.evex().vpexpandd(zmmA, zmmB);
2965
+ cc.evex().vpexpandq(xmmA, xmmB);
2966
+ cc.evex().vpexpandq(ymmA, ymmB);
2967
+ cc.evex().vpexpandq(zmmA, zmmB);
2968
+ cc.evex().vpextrb(gpd, xmmB, 0);
2969
+ cc.evex().vpextrd(gpd, xmmB, 0);
2970
+ if (cc.is64Bit()) cc.evex().vpextrq(gpq, xmmB, 0);
2971
+ cc.evex().vpextrw(gpd, xmmB, 0);
2972
+ cc.evex().vpinsrb(xmmA, xmmB, gpd, 0);
2973
+ cc.evex().vpinsrd(xmmA, xmmB, gpd, 0);
2974
+ if (cc.is64Bit()) cc.evex().vpinsrq(xmmA, xmmB, gpq, 0);
2975
+ cc.evex().vpinsrw(xmmA, xmmB, gpd, 0);
2976
+ cc.evex().vplzcntd(xmmA, xmmB);
2977
+ cc.evex().vplzcntd(ymmA, ymmB);
2978
+ cc.evex().vplzcntd(zmmA, zmmB);
2979
+ cc.evex().vplzcntq(xmmA, xmmB);
2980
+ cc.evex().vplzcntq(ymmA, ymmB);
2981
+ cc.evex().vplzcntq(zmmA, zmmB);
2982
+ cc.evex().vpmadd52huq(xmmA, xmmB, xmmC);
2983
+ cc.evex().vpmadd52huq(ymmA, ymmB, ymmC);
2984
+ cc.evex().vpmadd52huq(zmmA, zmmB, zmmC);
2985
+ cc.evex().vpmadd52luq(xmmA, xmmB, xmmC);
2986
+ cc.evex().vpmadd52luq(ymmA, ymmB, ymmC);
2987
+ cc.evex().vpmadd52luq(zmmA, zmmB, zmmC);
2988
+ cc.evex().vpmaddubsw(xmmA, xmmB, xmmC);
2989
+ cc.evex().vpmaddubsw(ymmA, ymmB, ymmC);
2990
+ cc.evex().vpmaddubsw(zmmA, zmmB, zmmC);
2991
+ cc.evex().vpmaddwd(xmmA, xmmB, xmmC);
2992
+ cc.evex().vpmaddwd(ymmA, ymmB, ymmC);
2993
+ cc.evex().vpmaddwd(zmmA, zmmB, zmmC);
2994
+ cc.evex().vpmaxsb(xmmA, xmmB, xmmC);
2995
+ cc.evex().vpmaxsb(ymmA, ymmB, ymmC);
2996
+ cc.evex().vpmaxsb(zmmA, zmmB, zmmC);
2997
+ cc.evex().vpmaxsd(xmmA, xmmB, xmmC);
2998
+ cc.evex().vpmaxsd(ymmA, ymmB, ymmC);
2999
+ cc.evex().vpmaxsd(zmmA, zmmB, zmmC);
3000
+ cc.evex().vpmaxsq(xmmA, xmmB, xmmC);
3001
+ cc.evex().vpmaxsq(ymmA, ymmB, ymmC);
3002
+ cc.evex().vpmaxsq(zmmA, zmmB, zmmC);
3003
+ cc.evex().vpmaxsw(xmmA, xmmB, xmmC);
3004
+ cc.evex().vpmaxsw(ymmA, ymmB, ymmC);
3005
+ cc.evex().vpmaxsw(zmmA, zmmB, zmmC);
3006
+ cc.evex().vpmaxub(xmmA, xmmB, xmmC);
3007
+ cc.evex().vpmaxub(ymmA, ymmB, ymmC);
3008
+ cc.evex().vpmaxub(zmmA, zmmB, zmmC);
3009
+ cc.evex().vpmaxud(xmmA, xmmB, xmmC);
3010
+ cc.evex().vpmaxud(ymmA, ymmB, ymmC);
3011
+ cc.evex().vpmaxud(zmmA, zmmB, zmmC);
3012
+ cc.evex().vpmaxuq(xmmA, xmmB, xmmC);
3013
+ cc.evex().vpmaxuq(ymmA, ymmB, ymmC);
3014
+ cc.evex().vpmaxuq(zmmA, zmmB, zmmC);
3015
+ cc.evex().vpmaxuw(xmmA, xmmB, xmmC);
3016
+ cc.evex().vpmaxuw(ymmA, ymmB, ymmC);
3017
+ cc.evex().vpmaxuw(zmmA, zmmB, zmmC);
3018
+ cc.evex().vpminsb(xmmA, xmmB, xmmC);
3019
+ cc.evex().vpminsb(ymmA, ymmB, ymmC);
3020
+ cc.evex().vpminsb(zmmA, zmmB, zmmC);
3021
+ cc.evex().vpminsd(xmmA, xmmB, xmmC);
3022
+ cc.evex().vpminsd(ymmA, ymmB, ymmC);
3023
+ cc.evex().vpminsd(zmmA, zmmB, zmmC);
3024
+ cc.evex().vpminsq(xmmA, xmmB, xmmC);
3025
+ cc.evex().vpminsq(ymmA, ymmB, ymmC);
3026
+ cc.evex().vpminsq(zmmA, zmmB, zmmC);
3027
+ cc.evex().vpminsw(xmmA, xmmB, xmmC);
3028
+ cc.evex().vpminsw(ymmA, ymmB, ymmC);
3029
+ cc.evex().vpminsw(zmmA, zmmB, zmmC);
3030
+ cc.evex().vpminub(xmmA, xmmB, xmmC);
3031
+ cc.evex().vpminub(ymmA, ymmB, ymmC);
3032
+ cc.evex().vpminub(zmmA, zmmB, zmmC);
3033
+ cc.evex().vpminud(xmmA, xmmB, xmmC);
3034
+ cc.evex().vpminud(ymmA, ymmB, ymmC);
3035
+ cc.evex().vpminud(zmmA, zmmB, zmmC);
3036
+ cc.evex().vpminuq(xmmA, xmmB, xmmC);
3037
+ cc.evex().vpminuq(ymmA, ymmB, ymmC);
3038
+ cc.evex().vpminuq(zmmA, zmmB, zmmC);
3039
+ cc.evex().vpminuw(xmmA, xmmB, xmmC);
3040
+ cc.evex().vpminuw(ymmA, ymmB, ymmC);
3041
+ cc.evex().vpminuw(zmmA, zmmB, zmmC);
3042
+ cc.evex().vpmovb2m(kA, xmmB);
3043
+ cc.evex().vpmovb2m(kA, ymmB);
3044
+ cc.evex().vpmovb2m(kA, zmmB);
3045
+ cc.evex().vpmovd2m(kA, xmmB);
3046
+ cc.evex().vpmovd2m(kA, ymmB);
3047
+ cc.evex().vpmovd2m(kA, zmmB);
3048
+ cc.evex().vpmovdb(xmmA, xmmB);
3049
+ cc.evex().vpmovdb(xmmA, ymmB);
3050
+ cc.evex().vpmovdb(xmmA, zmmB);
3051
+ cc.evex().vpmovdw(xmmA, xmmB);
3052
+ cc.evex().vpmovdw(xmmA, ymmB);
3053
+ cc.evex().vpmovdw(ymmA, zmmB);
3054
+ cc.evex().vpmovm2b(xmmA, kB);
3055
+ cc.evex().vpmovm2b(ymmA, kB);
3056
+ cc.evex().vpmovm2b(zmmA, kB);
3057
+ cc.evex().vpmovm2d(xmmA, kB);
3058
+ cc.evex().vpmovm2d(ymmA, kB);
3059
+ cc.evex().vpmovm2d(zmmA, kB);
3060
+ cc.evex().vpmovm2q(xmmA, kB);
3061
+ cc.evex().vpmovm2q(ymmA, kB);
3062
+ cc.evex().vpmovm2q(zmmA, kB);
3063
+ cc.evex().vpmovm2w(xmmA, kB);
3064
+ cc.evex().vpmovm2w(ymmA, kB);
3065
+ cc.evex().vpmovm2w(zmmA, kB);
3066
+ cc.evex().vpmovq2m(kA, xmmB);
3067
+ cc.evex().vpmovq2m(kA, ymmB);
3068
+ cc.evex().vpmovq2m(kA, zmmB);
3069
+ cc.evex().vpmovqb(xmmA, xmmB);
3070
+ cc.evex().vpmovqb(xmmA, ymmB);
3071
+ cc.evex().vpmovqb(xmmA, zmmB);
3072
+ cc.evex().vpmovqd(xmmA, xmmB);
3073
+ cc.evex().vpmovqd(xmmA, ymmB);
3074
+ cc.evex().vpmovqd(ymmA, zmmB);
3075
+ cc.evex().vpmovqw(xmmA, xmmB);
3076
+ cc.evex().vpmovqw(xmmA, ymmB);
3077
+ cc.evex().vpmovqw(xmmA, zmmB);
3078
+ cc.evex().vpmovsdb(xmmA, xmmB);
3079
+ cc.evex().vpmovsdb(xmmA, ymmB);
3080
+ cc.evex().vpmovsdb(xmmA, zmmB);
3081
+ cc.evex().vpmovsdw(xmmA, xmmB);
3082
+ cc.evex().vpmovsdw(xmmA, ymmB);
3083
+ cc.evex().vpmovsdw(ymmA, zmmB);
3084
+ cc.evex().vpmovsqb(xmmA, xmmB);
3085
+ cc.evex().vpmovsqb(xmmA, ymmB);
3086
+ cc.evex().vpmovsqb(xmmA, zmmB);
3087
+ cc.evex().vpmovsqd(xmmA, xmmB);
3088
+ cc.evex().vpmovsqd(xmmA, ymmB);
3089
+ cc.evex().vpmovsqd(ymmA, zmmB);
3090
+ cc.evex().vpmovsqw(xmmA, xmmB);
3091
+ cc.evex().vpmovsqw(xmmA, ymmB);
3092
+ cc.evex().vpmovsqw(xmmA, zmmB);
3093
+ cc.evex().vpmovswb(xmmA, xmmB);
3094
+ cc.evex().vpmovswb(xmmA, ymmB);
3095
+ cc.evex().vpmovswb(ymmA, zmmB);
3096
+ cc.evex().vpmovsxbd(xmmA, xmmB);
3097
+ cc.evex().vpmovsxbd(ymmA, xmmB);
3098
+ cc.evex().vpmovsxbd(zmmA, xmmB);
3099
+ cc.evex().vpmovsxbq(xmmA, xmmB);
3100
+ cc.evex().vpmovsxbq(ymmA, xmmB);
3101
+ cc.evex().vpmovsxbq(zmmA, xmmB);
3102
+ cc.evex().vpmovsxbw(xmmA, xmmB);
3103
+ cc.evex().vpmovsxbw(ymmA, xmmB);
3104
+ cc.evex().vpmovsxbw(zmmA, ymmB);
3105
+ cc.evex().vpmovsxdq(xmmA, xmmB);
3106
+ cc.evex().vpmovsxdq(ymmA, xmmB);
3107
+ cc.evex().vpmovsxdq(zmmA, ymmB);
3108
+ cc.evex().vpmovsxwd(xmmA, xmmB);
3109
+ cc.evex().vpmovsxwd(ymmA, xmmB);
3110
+ cc.evex().vpmovsxwd(zmmA, ymmB);
3111
+ cc.evex().vpmovsxwq(xmmA, xmmB);
3112
+ cc.evex().vpmovsxwq(ymmA, xmmB);
3113
+ cc.evex().vpmovsxwq(zmmA, xmmB);
3114
+ cc.evex().vpmovusdb(xmmA, xmmB);
3115
+ cc.evex().vpmovusdb(xmmA, ymmB);
3116
+ cc.evex().vpmovusdb(xmmA, zmmB);
3117
+ cc.evex().vpmovusdw(xmmA, xmmB);
3118
+ cc.evex().vpmovusdw(xmmA, ymmB);
3119
+ cc.evex().vpmovusdw(ymmA, zmmB);
3120
+ cc.evex().vpmovusqb(xmmA, xmmB);
3121
+ cc.evex().vpmovusqb(xmmA, ymmB);
3122
+ cc.evex().vpmovusqb(xmmA, zmmB);
3123
+ cc.evex().vpmovusqd(xmmA, xmmB);
3124
+ cc.evex().vpmovusqd(xmmA, ymmB);
3125
+ cc.evex().vpmovusqd(ymmA, zmmB);
3126
+ cc.evex().vpmovusqw(xmmA, xmmB);
3127
+ cc.evex().vpmovusqw(xmmA, ymmB);
3128
+ cc.evex().vpmovusqw(xmmA, zmmB);
3129
+ cc.evex().vpmovuswb(xmmA, xmmB);
3130
+ cc.evex().vpmovuswb(xmmA, ymmB);
3131
+ cc.evex().vpmovuswb(ymmA, zmmB);
3132
+ cc.evex().vpmovw2m(kA, xmmB);
3133
+ cc.evex().vpmovw2m(kA, ymmB);
3134
+ cc.evex().vpmovw2m(kA, zmmB);
3135
+ cc.evex().vpmovwb(xmmA, xmmB);
3136
+ cc.evex().vpmovwb(xmmA, ymmB);
3137
+ cc.evex().vpmovwb(ymmA, zmmB);
3138
+ cc.evex().vpmovzxbd(xmmA, xmmB);
3139
+ cc.evex().vpmovzxbd(ymmA, xmmB);
3140
+ cc.evex().vpmovzxbd(zmmA, xmmB);
3141
+ cc.evex().vpmovzxbq(xmmA, xmmB);
3142
+ cc.evex().vpmovzxbq(ymmA, xmmB);
3143
+ cc.evex().vpmovzxbq(zmmA, xmmB);
3144
+ cc.evex().vpmovzxbw(xmmA, xmmB);
3145
+ cc.evex().vpmovzxbw(ymmA, xmmB);
3146
+ cc.evex().vpmovzxbw(zmmA, ymmB);
3147
+ cc.evex().vpmovzxdq(xmmA, xmmB);
3148
+ cc.evex().vpmovzxdq(ymmA, xmmB);
3149
+ cc.evex().vpmovzxdq(zmmA, ymmB);
3150
+ cc.evex().vpmovzxwd(xmmA, xmmB);
3151
+ cc.evex().vpmovzxwd(ymmA, xmmB);
3152
+ cc.evex().vpmovzxwd(zmmA, ymmB);
3153
+ cc.evex().vpmovzxwq(xmmA, xmmB);
3154
+ cc.evex().vpmovzxwq(ymmA, xmmB);
3155
+ cc.evex().vpmovzxwq(zmmA, xmmB);
3156
+ cc.evex().vpmuldq(xmmA, xmmB, xmmC);
3157
+ cc.evex().vpmuldq(ymmA, ymmB, ymmC);
3158
+ cc.evex().vpmuldq(zmmA, zmmB, zmmC);
3159
+ cc.evex().vpmulhrsw(xmmA, xmmB, xmmC);
3160
+ cc.evex().vpmulhrsw(ymmA, ymmB, ymmC);
3161
+ cc.evex().vpmulhrsw(zmmA, zmmB, zmmC);
3162
+ cc.evex().vpmulhuw(xmmA, xmmB, xmmC);
3163
+ cc.evex().vpmulhuw(ymmA, ymmB, ymmC);
3164
+ cc.evex().vpmulhuw(zmmA, zmmB, zmmC);
3165
+ cc.evex().vpmulhw(xmmA, xmmB, xmmC);
3166
+ cc.evex().vpmulhw(ymmA, ymmB, ymmC);
3167
+ cc.evex().vpmulhw(zmmA, zmmB, zmmC);
3168
+ cc.evex().vpmulld(xmmA, xmmB, xmmC);
3169
+ cc.evex().vpmulld(ymmA, ymmB, ymmC);
3170
+ cc.evex().vpmulld(zmmA, zmmB, zmmC);
3171
+ cc.evex().vpmullq(xmmA, xmmB, xmmC);
3172
+ cc.evex().vpmullq(ymmA, ymmB, ymmC);
3173
+ cc.evex().vpmullq(zmmA, zmmB, zmmC);
3174
+ cc.evex().vpmullw(xmmA, xmmB, xmmC);
3175
+ cc.evex().vpmullw(ymmA, ymmB, ymmC);
3176
+ cc.evex().vpmullw(zmmA, zmmB, zmmC);
3177
+ cc.evex().vpmultishiftqb(xmmA, xmmB, xmmC);
3178
+ cc.evex().vpmultishiftqb(ymmA, ymmB, ymmC);
3179
+ cc.evex().vpmultishiftqb(zmmA, zmmB, zmmC);
3180
+ cc.evex().vpmuludq(xmmA, xmmB, xmmC);
3181
+ cc.evex().vpmuludq(ymmA, ymmB, ymmC);
3182
+ cc.evex().vpmuludq(zmmA, zmmB, zmmC);
3183
+ cc.evex().vpopcntd(zmmA, zmmB);
3184
+ cc.evex().vpopcntq(zmmA, zmmB);
3185
+ cc.evex().vpord(xmmA, xmmB, xmmC);
3186
+ cc.evex().vpord(ymmA, ymmB, ymmC);
3187
+ cc.evex().vpord(zmmA, zmmB, zmmC);
3188
+ cc.evex().vporq(xmmA, xmmB, xmmC);
3189
+ cc.evex().vporq(ymmA, ymmB, ymmC);
3190
+ cc.evex().vporq(zmmA, zmmB, zmmC);
3191
+ cc.evex().vprold(xmmA, xmmB, 0);
3192
+ cc.evex().vprold(ymmA, ymmB, 0);
3193
+ cc.evex().vprold(zmmA, zmmB, 0);
3194
+ cc.evex().vprolq(xmmA, xmmB, 0);
3195
+ cc.evex().vprolq(ymmA, ymmB, 0);
3196
+ cc.evex().vprolq(zmmA, zmmB, 0);
3197
+ cc.evex().vprolvd(xmmA, xmmB, xmmC);
3198
+ cc.evex().vprolvd(ymmA, ymmB, ymmC);
3199
+ cc.evex().vprolvd(zmmA, zmmB, zmmC);
3200
+ cc.evex().vprolvq(xmmA, xmmB, xmmC);
3201
+ cc.evex().vprolvq(ymmA, ymmB, ymmC);
3202
+ cc.evex().vprolvq(zmmA, zmmB, zmmC);
3203
+ cc.evex().vprord(xmmA, xmmB, 0);
3204
+ cc.evex().vprord(ymmA, ymmB, 0);
3205
+ cc.evex().vprord(zmmA, zmmB, 0);
3206
+ cc.evex().vprorq(xmmA, xmmB, 0);
3207
+ cc.evex().vprorq(ymmA, ymmB, 0);
3208
+ cc.evex().vprorq(zmmA, zmmB, 0);
3209
+ cc.evex().vprorvd(xmmA, xmmB, xmmC);
3210
+ cc.evex().vprorvd(ymmA, ymmB, ymmC);
3211
+ cc.evex().vprorvd(zmmA, zmmB, zmmC);
3212
+ cc.evex().vprorvq(xmmA, xmmB, xmmC);
3213
+ cc.evex().vprorvq(ymmA, ymmB, ymmC);
3214
+ cc.evex().vprorvq(zmmA, zmmB, zmmC);
3215
+ cc.evex().vpsadbw(xmmA, xmmB, xmmC);
3216
+ cc.evex().vpsadbw(ymmA, ymmB, ymmC);
3217
+ cc.evex().vpsadbw(zmmA, zmmB, zmmC);
3218
+ cc.evex().vpshufb(xmmA, xmmB, xmmC);
3219
+ cc.evex().vpshufb(ymmA, ymmB, ymmC);
3220
+ cc.evex().vpshufb(zmmA, zmmB, zmmC);
3221
+ cc.evex().vpshufd(xmmA, xmmB, 0);
3222
+ cc.evex().vpshufd(ymmA, ymmB, 0);
3223
+ cc.evex().vpshufd(zmmA, zmmB, 0);
3224
+ cc.evex().vpshufhw(xmmA, xmmB, 0);
3225
+ cc.evex().vpshufhw(ymmA, ymmB, 0);
3226
+ cc.evex().vpshufhw(zmmA, zmmB, 0);
3227
+ cc.evex().vpshuflw(xmmA, xmmB, 0);
3228
+ cc.evex().vpshuflw(ymmA, ymmB, 0);
3229
+ cc.evex().vpshuflw(zmmA, zmmB, 0);
3230
+ cc.evex().vpslld(xmmA, xmmB, xmmC);
3231
+ cc.evex().vpslld(xmmA, xmmB, 0);
3232
+ cc.evex().vpslld(ymmA, ymmB, xmmC);
3233
+ cc.evex().vpslld(ymmA, ymmB, 0);
3234
+ cc.evex().vpslld(zmmA, zmmB, xmmC);
3235
+ cc.evex().vpslld(zmmA, zmmB, 0);
3236
+ cc.evex().vpslldq(xmmA, xmmB, 0);
3237
+ cc.evex().vpslldq(ymmA, ymmB, 0);
3238
+ cc.evex().vpslldq(zmmA, zmmB, 0);
3239
+ cc.evex().vpsllq(xmmA, xmmB, xmmC);
3240
+ cc.evex().vpsllq(xmmA, xmmB, 0);
3241
+ cc.evex().vpsllq(ymmA, ymmB, xmmC);
3242
+ cc.evex().vpsllq(ymmA, ymmB, 0);
3243
+ cc.evex().vpsllq(zmmA, zmmB, xmmC);
3244
+ cc.evex().vpsllq(zmmA, zmmB, 0);
3245
+ cc.evex().vpsllvd(xmmA, xmmB, xmmC);
3246
+ cc.evex().vpsllvd(ymmA, ymmB, ymmC);
3247
+ cc.evex().vpsllvd(zmmA, zmmB, zmmC);
3248
+ cc.evex().vpsllvq(xmmA, xmmB, xmmC);
3249
+ cc.evex().vpsllvq(ymmA, ymmB, ymmC);
3250
+ cc.evex().vpsllvq(zmmA, zmmB, zmmC);
3251
+ cc.evex().vpsllvw(xmmA, xmmB, xmmC);
3252
+ cc.evex().vpsllvw(ymmA, ymmB, ymmC);
3253
+ cc.evex().vpsllvw(zmmA, zmmB, zmmC);
3254
+ cc.evex().vpsllw(xmmA, xmmB, xmmC);
3255
+ cc.evex().vpsllw(xmmA, xmmB, 0);
3256
+ cc.evex().vpsllw(ymmA, ymmB, xmmC);
3257
+ cc.evex().vpsllw(ymmA, ymmB, 0);
3258
+ cc.evex().vpsllw(zmmA, zmmB, xmmC);
3259
+ cc.evex().vpsllw(zmmA, zmmB, 0);
3260
+ cc.evex().vpsrad(xmmA, xmmB, xmmC);
3261
+ cc.evex().vpsrad(xmmA, xmmB, 0);
3262
+ cc.evex().vpsrad(ymmA, ymmB, xmmC);
3263
+ cc.evex().vpsrad(ymmA, ymmB, 0);
3264
+ cc.evex().vpsrad(zmmA, zmmB, xmmC);
3265
+ cc.evex().vpsrad(zmmA, zmmB, 0);
3266
+ cc.evex().vpsraq(xmmA, xmmB, xmmC);
3267
+ cc.evex().vpsraq(xmmA, xmmB, 0);
3268
+ cc.evex().vpsraq(ymmA, ymmB, xmmC);
3269
+ cc.evex().vpsraq(ymmA, ymmB, 0);
3270
+ cc.evex().vpsraq(zmmA, zmmB, xmmC);
3271
+ cc.evex().vpsraq(zmmA, zmmB, 0);
3272
+ cc.evex().vpsravd(xmmA, xmmB, xmmC);
3273
+ cc.evex().vpsravd(ymmA, ymmB, ymmC);
3274
+ cc.evex().vpsravd(zmmA, zmmB, zmmC);
3275
+ cc.evex().vpsravq(xmmA, xmmB, xmmC);
3276
+ cc.evex().vpsravq(ymmA, ymmB, ymmC);
3277
+ cc.evex().vpsravq(zmmA, zmmB, zmmC);
3278
+ cc.evex().vpsravw(xmmA, xmmB, xmmC);
3279
+ cc.evex().vpsravw(ymmA, ymmB, ymmC);
3280
+ cc.evex().vpsravw(zmmA, zmmB, zmmC);
3281
+ cc.evex().vpsraw(xmmA, xmmB, xmmC);
3282
+ cc.evex().vpsraw(xmmA, xmmB, 0);
3283
+ cc.evex().vpsraw(ymmA, ymmB, xmmC);
3284
+ cc.evex().vpsraw(ymmA, ymmB, 0);
3285
+ cc.evex().vpsraw(zmmA, zmmB, xmmC);
3286
+ cc.evex().vpsraw(zmmA, zmmB, 0);
3287
+ cc.evex().vpsrld(xmmA, xmmB, xmmC);
3288
+ cc.evex().vpsrld(xmmA, xmmB, 0);
3289
+ cc.evex().vpsrld(ymmA, ymmB, xmmC);
3290
+ cc.evex().vpsrld(ymmA, ymmB, 0);
3291
+ cc.evex().vpsrld(zmmA, zmmB, xmmC);
3292
+ cc.evex().vpsrld(zmmA, zmmB, 0);
3293
+ cc.evex().vpsrldq(xmmA, xmmB, 0);
3294
+ cc.evex().vpsrldq(ymmA, ymmB, 0);
3295
+ cc.evex().vpsrldq(zmmA, zmmB, 0);
3296
+ cc.evex().vpsrlq(xmmA, xmmB, xmmC);
3297
+ cc.evex().vpsrlq(xmmA, xmmB, 0);
3298
+ cc.evex().vpsrlq(ymmA, ymmB, xmmC);
3299
+ cc.evex().vpsrlq(ymmA, ymmB, 0);
3300
+ cc.evex().vpsrlq(zmmA, zmmB, xmmC);
3301
+ cc.evex().vpsrlq(zmmA, zmmB, 0);
3302
+ cc.evex().vpsrlvd(xmmA, xmmB, xmmC);
3303
+ cc.evex().vpsrlvd(ymmA, ymmB, ymmC);
3304
+ cc.evex().vpsrlvd(zmmA, zmmB, zmmC);
3305
+ cc.evex().vpsrlvq(xmmA, xmmB, xmmC);
3306
+ cc.evex().vpsrlvq(ymmA, ymmB, ymmC);
3307
+ cc.evex().vpsrlvq(zmmA, zmmB, zmmC);
3308
+ cc.evex().vpsrlvw(xmmA, xmmB, xmmC);
3309
+ cc.evex().vpsrlvw(ymmA, ymmB, ymmC);
3310
+ cc.evex().vpsrlvw(zmmA, zmmB, zmmC);
3311
+ cc.evex().vpsrlw(xmmA, xmmB, xmmC);
3312
+ cc.evex().vpsrlw(xmmA, xmmB, 0);
3313
+ cc.evex().vpsrlw(ymmA, ymmB, xmmC);
3314
+ cc.evex().vpsrlw(ymmA, ymmB, 0);
3315
+ cc.evex().vpsrlw(zmmA, zmmB, xmmC);
3316
+ cc.evex().vpsrlw(zmmA, zmmB, 0);
3317
+ cc.evex().vpsubb(xmmA, xmmB, xmmC);
3318
+ cc.evex().vpsubb(ymmA, ymmB, ymmC);
3319
+ cc.evex().vpsubb(zmmA, zmmB, zmmC);
3320
+ cc.evex().vpsubd(xmmA, xmmB, xmmC);
3321
+ cc.evex().vpsubd(ymmA, ymmB, ymmC);
3322
+ cc.evex().vpsubd(zmmA, zmmB, zmmC);
3323
+ cc.evex().vpsubq(xmmA, xmmB, xmmC);
3324
+ cc.evex().vpsubq(ymmA, ymmB, ymmC);
3325
+ cc.evex().vpsubq(zmmA, zmmB, zmmC);
3326
+ cc.evex().vpsubsb(xmmA, xmmB, xmmC);
3327
+ cc.evex().vpsubsb(ymmA, ymmB, ymmC);
3328
+ cc.evex().vpsubsb(zmmA, zmmB, zmmC);
3329
+ cc.evex().vpsubsw(xmmA, xmmB, xmmC);
3330
+ cc.evex().vpsubsw(ymmA, ymmB, ymmC);
3331
+ cc.evex().vpsubsw(zmmA, zmmB, zmmC);
3332
+ cc.evex().vpsubusb(xmmA, xmmB, xmmC);
3333
+ cc.evex().vpsubusb(ymmA, ymmB, ymmC);
3334
+ cc.evex().vpsubusb(zmmA, zmmB, zmmC);
3335
+ cc.evex().vpsubusw(xmmA, xmmB, xmmC);
3336
+ cc.evex().vpsubusw(ymmA, ymmB, ymmC);
3337
+ cc.evex().vpsubusw(zmmA, zmmB, zmmC);
3338
+ cc.evex().vpsubw(xmmA, xmmB, xmmC);
3339
+ cc.evex().vpsubw(ymmA, ymmB, ymmC);
3340
+ cc.evex().vpsubw(zmmA, zmmB, zmmC);
3341
+ cc.evex().vpternlogd(xmmA, xmmB, xmmC, 0);
3342
+ cc.evex().vpternlogd(ymmA, ymmB, ymmC, 0);
3343
+ cc.evex().vpternlogd(zmmA, zmmB, zmmC, 0);
3344
+ cc.evex().vpternlogq(xmmA, xmmB, xmmC, 0);
3345
+ cc.evex().vpternlogq(ymmA, ymmB, ymmC, 0);
3346
+ cc.evex().vpternlogq(zmmA, zmmB, zmmC, 0);
3347
+ cc.evex().vptestmb(kA, xmmB, xmmC);
3348
+ cc.evex().vptestmb(kA, ymmB, ymmC);
3349
+ cc.evex().vptestmb(kA, zmmB, zmmC);
3350
+ cc.evex().vptestmd(kA, xmmB, xmmC);
3351
+ cc.evex().vptestmd(kA, ymmB, ymmC);
3352
+ cc.evex().vptestmd(kA, zmmB, zmmC);
3353
+ cc.evex().vptestmq(kA, xmmB, xmmC);
3354
+ cc.evex().vptestmq(kA, ymmB, ymmC);
3355
+ cc.evex().vptestmq(kA, zmmB, zmmC);
3356
+ cc.evex().vptestmw(kA, xmmB, xmmC);
3357
+ cc.evex().vptestmw(kA, ymmB, ymmC);
3358
+ cc.evex().vptestmw(kA, zmmB, zmmC);
3359
+ cc.evex().vptestnmb(kA, xmmB, xmmC);
3360
+ cc.evex().vptestnmb(kA, ymmB, ymmC);
3361
+ cc.evex().vptestnmb(kA, zmmB, zmmC);
3362
+ cc.evex().vptestnmd(kA, xmmB, xmmC);
3363
+ cc.evex().vptestnmd(kA, ymmB, ymmC);
3364
+ cc.evex().vptestnmd(kA, zmmB, zmmC);
3365
+ cc.evex().vptestnmq(kA, xmmB, xmmC);
3366
+ cc.evex().vptestnmq(kA, ymmB, ymmC);
3367
+ cc.evex().vptestnmq(kA, zmmB, zmmC);
3368
+ cc.evex().vptestnmw(kA, xmmB, xmmC);
3369
+ cc.evex().vptestnmw(kA, ymmB, ymmC);
3370
+ cc.evex().vptestnmw(kA, zmmB, zmmC);
3371
+ cc.evex().vpunpckhbw(xmmA, xmmB, xmmC);
3372
+ cc.evex().vpunpckhbw(ymmA, ymmB, ymmC);
3373
+ cc.evex().vpunpckhbw(zmmA, zmmB, zmmC);
3374
+ cc.evex().vpunpckhdq(xmmA, xmmB, xmmC);
3375
+ cc.evex().vpunpckhdq(ymmA, ymmB, ymmC);
3376
+ cc.evex().vpunpckhdq(zmmA, zmmB, zmmC);
3377
+ cc.evex().vpunpckhqdq(xmmA, xmmB, xmmC);
3378
+ cc.evex().vpunpckhqdq(ymmA, ymmB, ymmC);
3379
+ cc.evex().vpunpckhqdq(zmmA, zmmB, zmmC);
3380
+ cc.evex().vpunpckhwd(xmmA, xmmB, xmmC);
3381
+ cc.evex().vpunpckhwd(ymmA, ymmB, ymmC);
3382
+ cc.evex().vpunpckhwd(zmmA, zmmB, zmmC);
3383
+ cc.evex().vpunpcklbw(xmmA, xmmB, xmmC);
3384
+ cc.evex().vpunpcklbw(ymmA, ymmB, ymmC);
3385
+ cc.evex().vpunpcklbw(zmmA, zmmB, zmmC);
3386
+ cc.evex().vpunpckldq(xmmA, xmmB, xmmC);
3387
+ cc.evex().vpunpckldq(ymmA, ymmB, ymmC);
3388
+ cc.evex().vpunpckldq(zmmA, zmmB, zmmC);
3389
+ cc.evex().vpunpcklqdq(xmmA, xmmB, xmmC);
3390
+ cc.evex().vpunpcklqdq(ymmA, ymmB, ymmC);
3391
+ cc.evex().vpunpcklqdq(zmmA, zmmB, zmmC);
3392
+ cc.evex().vpunpcklwd(xmmA, xmmB, xmmC);
3393
+ cc.evex().vpunpcklwd(ymmA, ymmB, ymmC);
3394
+ cc.evex().vpunpcklwd(zmmA, zmmB, zmmC);
3395
+ cc.evex().vpxord(xmmA, xmmB, xmmC);
3396
+ cc.evex().vpxord(ymmA, ymmB, ymmC);
3397
+ cc.evex().vpxord(zmmA, zmmB, zmmC);
3398
+ cc.evex().vpxorq(xmmA, xmmB, xmmC);
3399
+ cc.evex().vpxorq(ymmA, ymmB, ymmC);
3400
+ cc.evex().vpxorq(zmmA, zmmB, zmmC);
3401
+ cc.evex().vrangepd(xmmA, xmmB, xmmC, 0);
3402
+ cc.evex().vrangepd(ymmA, ymmB, ymmC, 0);
3403
+ cc.evex().vrangepd(zmmA, zmmB, zmmC, 0);
3404
+ cc.evex().vrangeps(xmmA, xmmB, xmmC, 0);
3405
+ cc.evex().vrangeps(ymmA, ymmB, ymmC, 0);
3406
+ cc.evex().vrangeps(zmmA, zmmB, zmmC, 0);
3407
+ cc.evex().vrangesd(xmmA, xmmB, xmmC, 0);
3408
+ cc.evex().vrangess(xmmA, xmmB, xmmC, 0);
3409
+ cc.evex().vrcp14pd(xmmA, xmmB);
3410
+ cc.evex().vrcp14pd(ymmA, ymmB);
3411
+ cc.evex().vrcp14pd(zmmA, zmmB);
3412
+ cc.evex().vrcp14ps(xmmA, xmmB);
3413
+ cc.evex().vrcp14ps(ymmA, ymmB);
3414
+ cc.evex().vrcp14ps(zmmA, zmmB);
3415
+ cc.evex().vrcp14sd(xmmA, xmmB, xmmC);
3416
+ cc.evex().vrcp14ss(xmmA, xmmB, xmmC);
3417
+ cc.evex().vrcp28pd(zmmA, zmmB);
3418
+ cc.evex().vrcp28ps(zmmA, zmmB);
3419
+ cc.evex().vrcp28sd(xmmA, xmmB, xmmC);
3420
+ cc.evex().vrcp28ss(xmmA, xmmB, xmmC);
3421
+ cc.evex().vreducepd(xmmA, xmmB, 0);
3422
+ cc.evex().vreducepd(ymmA, ymmB, 0);
3423
+ cc.evex().vreducepd(zmmA, zmmB, 0);
3424
+ cc.evex().vreduceps(xmmA, xmmB, 0);
3425
+ cc.evex().vreduceps(ymmA, ymmB, 0);
3426
+ cc.evex().vreduceps(zmmA, zmmB, 0);
3427
+ cc.evex().vreducesd(xmmA, xmmB, xmmC, 0);
3428
+ cc.evex().vreducess(xmmA, xmmB, xmmC, 0);
3429
+ cc.evex().vrndscalepd(xmmA, xmmB, 0);
3430
+ cc.evex().vrndscalepd(ymmA, ymmB, 0);
3431
+ cc.evex().vrndscalepd(zmmA, zmmB, 0);
3432
+ cc.evex().vrndscaleps(xmmA, xmmB, 0);
3433
+ cc.evex().vrndscaleps(ymmA, ymmB, 0);
3434
+ cc.evex().vrndscaleps(zmmA, zmmB, 0);
3435
+ cc.evex().vrndscalesd(xmmA, xmmB, xmmC, 0);
3436
+ cc.evex().vrndscaless(xmmA, xmmB, xmmC, 0);
3437
+ cc.evex().vrsqrt14pd(xmmA, xmmB);
3438
+ cc.evex().vrsqrt14pd(ymmA, ymmB);
3439
+ cc.evex().vrsqrt14pd(zmmA, zmmB);
3440
+ cc.evex().vrsqrt14ps(xmmA, xmmB);
3441
+ cc.evex().vrsqrt14ps(ymmA, ymmB);
3442
+ cc.evex().vrsqrt14ps(zmmA, zmmB);
3443
+ cc.evex().vrsqrt14sd(xmmA, xmmB, xmmC);
3444
+ cc.evex().vrsqrt14ss(xmmA, xmmB, xmmC);
3445
+ cc.evex().vrsqrt28pd(zmmA, zmmB);
3446
+ cc.evex().vrsqrt28ps(zmmA, zmmB);
3447
+ cc.evex().vrsqrt28sd(xmmA, xmmB, xmmC);
3448
+ cc.evex().vrsqrt28ss(xmmA, xmmB, xmmC);
3449
+ cc.evex().vscalefpd(xmmA, xmmB, xmmC);
3450
+ cc.evex().vscalefpd(ymmA, ymmB, ymmC);
3451
+ cc.evex().vscalefpd(zmmA, zmmB, zmmC);
3452
+ cc.evex().vscalefps(xmmA, xmmB, xmmC);
3453
+ cc.evex().vscalefps(ymmA, ymmB, ymmC);
3454
+ cc.evex().vscalefps(zmmA, zmmB, zmmC);
3455
+ cc.evex().vscalefsd(xmmA, xmmB, xmmC);
3456
+ cc.evex().vscalefss(xmmA, xmmB, xmmC);
3457
+ cc.evex().vshuff32x4(ymmA, ymmB, ymmC, 0);
3458
+ cc.evex().vshuff32x4(zmmA, zmmB, zmmC, 0);
3459
+ cc.evex().vshuff64x2(ymmA, ymmB, ymmC, 0);
3460
+ cc.evex().vshuff64x2(zmmA, zmmB, zmmC, 0);
3461
+ cc.evex().vshufi32x4(ymmA, ymmB, ymmC, 0);
3462
+ cc.evex().vshufi32x4(zmmA, zmmB, zmmC, 0);
3463
+ cc.evex().vshufi64x2(ymmA, ymmB, ymmC, 0);
3464
+ cc.evex().vshufi64x2(zmmA, zmmB, zmmC, 0);
3465
+ cc.evex().vshufpd(xmmA, xmmB, xmmC, 0);
3466
+ cc.evex().vshufpd(ymmA, ymmB, ymmC, 0);
3467
+ cc.evex().vshufpd(zmmA, zmmB, zmmC, 0);
3468
+ cc.evex().vshufps(xmmA, xmmB, xmmC, 0);
3469
+ cc.evex().vshufps(ymmA, ymmB, ymmC, 0);
3470
+ cc.evex().vshufps(zmmA, zmmB, zmmC, 0);
3471
+ cc.evex().vsqrtpd(xmmA, xmmB);
3472
+ cc.evex().vsqrtpd(ymmA, ymmB);
3473
+ cc.evex().vsqrtpd(zmmA, zmmB);
3474
+ cc.evex().vsqrtps(xmmA, xmmB);
3475
+ cc.evex().vsqrtps(ymmA, ymmB);
3476
+ cc.evex().vsqrtps(zmmA, zmmB);
3477
+ cc.evex().vsqrtsd(xmmA, xmmB, xmmC);
3478
+ cc.evex().vsqrtss(xmmA, xmmB, xmmC);
3479
+ cc.evex().vsubpd(xmmA, xmmB, xmmC);
3480
+ cc.evex().vsubpd(ymmA, ymmB, ymmC);
3481
+ cc.evex().vsubpd(zmmA, zmmB, zmmC);
3482
+ cc.evex().vsubps(xmmA, xmmB, xmmC);
3483
+ cc.evex().vsubps(ymmA, ymmB, ymmC);
3484
+ cc.evex().vsubps(zmmA, zmmB, zmmC);
3485
+ cc.evex().vsubsd(xmmA, xmmB, xmmC);
3486
+ cc.evex().vsubss(xmmA, xmmB, xmmC);
3487
+ cc.evex().vucomisd(xmmA, xmmB);
3488
+ cc.evex().vucomiss(xmmA, xmmB);
3489
+ cc.evex().vunpckhpd(xmmA, xmmB, xmmC);
3490
+ cc.evex().vunpckhpd(ymmA, ymmB, ymmC);
3491
+ cc.evex().vunpckhpd(zmmA, zmmB, zmmC);
3492
+ cc.evex().vunpckhps(xmmA, xmmB, xmmC);
3493
+ cc.evex().vunpckhps(ymmA, ymmB, ymmC);
3494
+ cc.evex().vunpckhps(zmmA, zmmB, zmmC);
3495
+ cc.evex().vunpcklpd(xmmA, xmmB, xmmC);
3496
+ cc.evex().vunpcklpd(ymmA, ymmB, ymmC);
3497
+ cc.evex().vunpcklpd(zmmA, zmmB, zmmC);
3498
+ cc.evex().vunpcklps(xmmA, xmmB, xmmC);
3499
+ cc.evex().vunpcklps(ymmA, ymmB, ymmC);
3500
+ cc.evex().vunpcklps(zmmA, zmmB, zmmC);
3501
+ cc.evex().vxorpd(xmmA, xmmB, xmmC);
3502
+ cc.evex().vxorpd(ymmA, ymmB, ymmC);
3503
+ cc.evex().vxorpd(zmmA, zmmB, zmmC);
3504
+ cc.evex().vxorps(xmmA, xmmB, xmmC);
3505
+ cc.evex().vxorps(ymmA, ymmB, ymmC);
3506
+ cc.evex().vxorps(zmmA, zmmB, zmmC);
3507
+ }
3508
+ else {
3509
+ x86::Mem m = x86::ptr(gpz);
3510
+ x86::Mem m32 = x86::dword_ptr(gpz);
3511
+ x86::Mem m64 = x86::qword_ptr(gpz);
3512
+ x86::Mem m128 = x86::xmmword_ptr(gpz);
3513
+ x86::Mem m256 = x86::ymmword_ptr(gpz);
3514
+ x86::Mem m512 = x86::zmmword_ptr(gpz);
3515
+ x86::Mem vx_ptr = x86::ptr(gpz, xmmD);
3516
+ x86::Mem vy_ptr = x86::ptr(gpz, ymmD);
3517
+ x86::Mem vz_ptr = x86::ptr(gpz, zmmD);
3518
+
3519
+ cc.kmovb(kA, m);
3520
+ cc.kmovb(m, kB);
3521
+ cc.kmovd(kA, m);
3522
+ cc.kmovd(m, kB);
3523
+ cc.kmovq(kA, m);
3524
+ cc.kmovq(m, kB);
3525
+ cc.kmovw(kA, m);
3526
+ cc.kmovw(m, kB);
3527
+
3528
+ cc.evex().vaddpd(xmmA, xmmB, m);
3529
+ cc.evex().vaddpd(ymmA, ymmB, m);
3530
+ cc.evex().vaddpd(zmmA, zmmB, m);
3531
+ cc.evex().vaddps(xmmA, xmmB, m);
3532
+ cc.evex().vaddps(ymmA, ymmB, m);
3533
+ cc.evex().vaddps(zmmA, zmmB, m);
3534
+ cc.evex().vaddsd(xmmA, xmmB, m);
3535
+ cc.evex().vaddss(xmmA, xmmB, m);
3536
+ cc.evex().valignd(xmmA, xmmB, m, 0);
3537
+ cc.evex().valignd(ymmA, ymmB, m, 0);
3538
+ cc.evex().valignd(zmmA, zmmB, m, 0);
3539
+ cc.evex().valignq(xmmA, xmmB, m, 0);
3540
+ cc.evex().valignq(ymmA, ymmB, m, 0);
3541
+ cc.evex().valignq(zmmA, zmmB, m, 0);
3542
+ cc.evex().vandnpd(xmmA, xmmB, m);
3543
+ cc.evex().vandnpd(ymmA, ymmB, m);
3544
+ cc.evex().vandnpd(zmmA, zmmB, m);
3545
+ cc.evex().vandnps(xmmA, xmmB, m);
3546
+ cc.evex().vandnps(ymmA, ymmB, m);
3547
+ cc.evex().vandnps(zmmA, zmmB, m);
3548
+ cc.evex().vandpd(xmmA, xmmB, m);
3549
+ cc.evex().vandpd(ymmA, ymmB, m);
3550
+ cc.evex().vandpd(zmmA, zmmB, m);
3551
+ cc.evex().vandps(xmmA, xmmB, m);
3552
+ cc.evex().vandps(ymmA, ymmB, m);
3553
+ cc.evex().vandps(zmmA, zmmB, m);
3554
+ cc.evex().vblendmpd(xmmA, xmmB, m);
3555
+ cc.evex().vblendmpd(ymmA, ymmB, m);
3556
+ cc.evex().vblendmpd(zmmA, zmmB, m);
3557
+ cc.evex().vblendmps(xmmA, xmmB, m);
3558
+ cc.evex().vblendmps(ymmA, ymmB, m);
3559
+ cc.evex().vblendmps(zmmA, zmmB, m);
3560
+ cc.evex().vbroadcastf32x2(ymmA, m);
3561
+ cc.evex().vbroadcastf32x2(zmmA, m);
3562
+ cc.evex().vbroadcastf32x4(ymmA, m);
3563
+ cc.evex().vbroadcastf32x4(zmmA, m);
3564
+ cc.evex().vbroadcastf32x8(zmmA, m);
3565
+ cc.evex().vbroadcastf64x2(ymmA, m);
3566
+ cc.evex().vbroadcastf64x2(zmmA, m);
3567
+ cc.evex().vbroadcastf64x4(zmmA, m);
3568
+ cc.evex().vbroadcasti32x2(xmmA, m);
3569
+ cc.evex().vbroadcasti32x2(ymmA, m);
3570
+ cc.evex().vbroadcasti32x2(zmmA, m);
3571
+ cc.evex().vbroadcasti32x4(ymmA, m);
3572
+ cc.evex().vbroadcasti32x4(zmmA, m);
3573
+ cc.evex().vbroadcasti32x8(zmmA, m);
3574
+ cc.evex().vbroadcasti64x2(ymmA, m);
3575
+ cc.evex().vbroadcasti64x2(zmmA, m);
3576
+ cc.evex().vbroadcasti64x4(zmmA, m);
3577
+ cc.evex().vbroadcastsd(ymmA, m);
3578
+ cc.evex().vbroadcastsd(zmmA, m);
3579
+ cc.evex().vbroadcastss(xmmA, m);
3580
+ cc.evex().vbroadcastss(ymmA, m);
3581
+ cc.evex().vbroadcastss(zmmA, m);
3582
+ cc.evex().vcmppd(kA, xmmB, m, 0);
3583
+ cc.evex().vcmppd(kA, ymmB, m, 0);
3584
+ cc.evex().vcmppd(kA, zmmB, m, 0);
3585
+ cc.evex().vcmpps(kA, xmmB, m, 0);
3586
+ cc.evex().vcmpps(kA, ymmB, m, 0);
3587
+ cc.evex().vcmpps(kA, zmmB, m, 0);
3588
+ cc.evex().vcmpsd(kA, xmmB, m, 0);
3589
+ cc.evex().vcmpss(kA, xmmB, m, 0);
3590
+ cc.evex().vcomisd(xmmA, m);
3591
+ cc.evex().vcomiss(xmmA, m);
3592
+ cc.evex().vcompresspd(m, xmmB);
3593
+ cc.evex().vcompresspd(m, ymmB);
3594
+ cc.evex().vcompresspd(m, zmmB);
3595
+ cc.evex().vcompressps(m, xmmB);
3596
+ cc.evex().vcompressps(m, ymmB);
3597
+ cc.evex().vcompressps(m, zmmB);
3598
+ cc.evex().vcvtdq2pd(xmmA, m);
3599
+ cc.evex().vcvtdq2pd(ymmA, m);
3600
+ cc.evex().vcvtdq2pd(zmmA, m);
3601
+ cc.evex().vcvtdq2ps(xmmA, m);
3602
+ cc.evex().vcvtdq2ps(ymmA, m);
3603
+ cc.evex().vcvtdq2ps(zmmA, m);
3604
+ cc.evex().vcvtpd2dq(xmmA, m128);
3605
+ cc.evex().vcvtpd2dq(xmmA, m256);
3606
+ cc.evex().vcvtpd2dq(ymmA, m512);
3607
+ cc.evex().vcvtpd2qq(xmmA, m);
3608
+ cc.evex().vcvtpd2qq(ymmA, m);
3609
+ cc.evex().vcvtpd2qq(zmmA, m);
3610
+ cc.evex().vcvtpd2udq(xmmA, m128);
3611
+ cc.evex().vcvtpd2udq(xmmA, m256);
3612
+ cc.evex().vcvtpd2udq(ymmA, m512);
3613
+ cc.evex().vcvtpd2uqq(xmmA, m);
3614
+ cc.evex().vcvtpd2uqq(ymmA, m);
3615
+ cc.evex().vcvtpd2uqq(zmmA, m);
3616
+ cc.evex().vcvtph2ps(xmmA, m);
3617
+ cc.evex().vcvtph2ps(ymmA, m);
3618
+ cc.evex().vcvtph2ps(zmmA, m);
3619
+ cc.evex().vcvtps2dq(xmmA, m);
3620
+ cc.evex().vcvtps2dq(ymmA, m);
3621
+ cc.evex().vcvtps2dq(zmmA, m);
3622
+ cc.evex().vcvtps2pd(xmmA, m);
3623
+ cc.evex().vcvtps2pd(ymmA, m);
3624
+ cc.evex().vcvtps2pd(zmmA, m);
3625
+ cc.evex().vcvtps2ph(m, xmmB, 0);
3626
+ cc.evex().vcvtps2ph(m, ymmB, 0);
3627
+ cc.evex().vcvtps2ph(m, zmmB, 0);
3628
+ cc.evex().vcvtps2qq(xmmA, m);
3629
+ cc.evex().vcvtps2qq(ymmA, m);
3630
+ cc.evex().vcvtps2qq(zmmA, m);
3631
+ cc.evex().vcvtps2udq(xmmA, m);
3632
+ cc.evex().vcvtps2udq(ymmA, m);
3633
+ cc.evex().vcvtps2udq(zmmA, m);
3634
+ cc.evex().vcvtps2uqq(xmmA, m);
3635
+ cc.evex().vcvtps2uqq(ymmA, m);
3636
+ cc.evex().vcvtps2uqq(zmmA, m);
3637
+ cc.evex().vcvtqq2pd(xmmA, m);
3638
+ cc.evex().vcvtqq2pd(ymmA, m);
3639
+ cc.evex().vcvtqq2pd(zmmA, m);
3640
+ cc.evex().vcvtqq2ps(xmmA, m128);
3641
+ cc.evex().vcvtqq2ps(xmmA, m256);
3642
+ cc.evex().vcvtqq2ps(ymmA, m512);
3643
+ cc.evex().vcvtsd2si(gpd, m);
3644
+ if (cc.is64Bit()) cc.evex().vcvtsd2si(gpq, m);
3645
+ cc.evex().vcvtsd2ss(xmmA, xmmB, m);
3646
+ cc.evex().vcvtsd2usi(gpd, m);
3647
+ if (cc.is64Bit()) cc.evex().vcvtsd2usi(gpq, m);
3648
+ cc.evex().vcvtsi2sd(xmmA, xmmB, m32);
3649
+ if (cc.is64Bit()) cc.evex().vcvtsi2sd(xmmA, xmmB, m64);
3650
+ cc.evex().vcvtsi2ss(xmmA, xmmB, m32);
3651
+ if (cc.is64Bit()) cc.evex().vcvtsi2ss(xmmA, xmmB, m64);
3652
+ cc.evex().vcvtss2sd(xmmA, xmmB, m);
3653
+ cc.evex().vcvtss2si(gpd, m);
3654
+ if (cc.is64Bit()) cc.evex().vcvtss2si(gpq, m);
3655
+ cc.evex().vcvtss2usi(gpd, m);
3656
+ if (cc.is64Bit()) cc.evex().vcvtss2usi(gpq, m);
3657
+ cc.evex().vcvttpd2dq(xmmA, m128);
3658
+ cc.evex().vcvttpd2dq(xmmA, m256);
3659
+ cc.evex().vcvttpd2dq(ymmA, m512);
3660
+ cc.evex().vcvttpd2qq(xmmA, m);
3661
+ cc.evex().vcvttpd2qq(ymmA, m);
3662
+ cc.evex().vcvttpd2qq(zmmA, m);
3663
+ cc.evex().vcvttpd2udq(xmmA, m128);
3664
+ cc.evex().vcvttpd2udq(xmmA, m256);
3665
+ cc.evex().vcvttpd2udq(ymmA, m512);
3666
+ cc.evex().vcvttpd2uqq(xmmA, m);
3667
+ cc.evex().vcvttpd2uqq(ymmA, m);
3668
+ cc.evex().vcvttpd2uqq(zmmA, m);
3669
+ cc.evex().vcvttps2dq(xmmA, m);
3670
+ cc.evex().vcvttps2dq(ymmA, m);
3671
+ cc.evex().vcvttps2dq(zmmA, m);
3672
+ cc.evex().vcvttps2qq(xmmA, m);
3673
+ cc.evex().vcvttps2qq(ymmA, m);
3674
+ cc.evex().vcvttps2qq(zmmA, m);
3675
+ cc.evex().vcvttps2udq(xmmA, m);
3676
+ cc.evex().vcvttps2udq(ymmA, m);
3677
+ cc.evex().vcvttps2udq(zmmA, m);
3678
+ cc.evex().vcvttps2uqq(xmmA, m);
3679
+ cc.evex().vcvttps2uqq(ymmA, m);
3680
+ cc.evex().vcvttps2uqq(zmmA, m);
3681
+ cc.evex().vcvttsd2si(gpd, m);
3682
+ if (cc.is64Bit()) cc.evex().vcvttsd2si(gpq, m);
3683
+ cc.evex().vcvttsd2usi(gpd, m);
3684
+ if (cc.is64Bit()) cc.evex().vcvttsd2usi(gpq, m);
3685
+ cc.evex().vcvttss2si(gpd, m);
3686
+ if (cc.is64Bit()) cc.evex().vcvttss2si(gpq, m);
3687
+ cc.evex().vcvttss2usi(gpd, m);
3688
+ if (cc.is64Bit()) cc.evex().vcvttss2usi(gpq, m);
3689
+ cc.evex().vcvtudq2pd(xmmA, m);
3690
+ cc.evex().vcvtudq2pd(ymmA, m);
3691
+ cc.evex().vcvtudq2pd(zmmA, m);
3692
+ cc.evex().vcvtudq2ps(xmmA, m);
3693
+ cc.evex().vcvtudq2ps(ymmA, m);
3694
+ cc.evex().vcvtudq2ps(zmmA, m);
3695
+ cc.evex().vcvtuqq2pd(xmmA, m);
3696
+ cc.evex().vcvtuqq2pd(ymmA, m);
3697
+ cc.evex().vcvtuqq2pd(zmmA, m);
3698
+ cc.evex().vcvtuqq2ps(xmmA, m128);
3699
+ cc.evex().vcvtuqq2ps(xmmA, m256);
3700
+ cc.evex().vcvtuqq2ps(ymmA, m512);
3701
+ cc.evex().vcvtusi2sd(xmmA, xmmB, m32);
3702
+ if (cc.is64Bit()) cc.evex().vcvtusi2sd(xmmA, xmmB, m64);
3703
+ cc.evex().vcvtusi2ss(xmmA, xmmB, m32);
3704
+ if (cc.is64Bit()) cc.evex().vcvtusi2ss(xmmA, xmmB, m64);
3705
+ cc.evex().vdbpsadbw(xmmA, xmmB, m, 0);
3706
+ cc.evex().vdbpsadbw(ymmA, ymmB, m, 0);
3707
+ cc.evex().vdbpsadbw(zmmA, zmmB, m, 0);
3708
+ cc.evex().vdivpd(xmmA, xmmB, m);
3709
+ cc.evex().vdivpd(ymmA, ymmB, m);
3710
+ cc.evex().vdivpd(zmmA, zmmB, m);
3711
+ cc.evex().vdivps(xmmA, xmmB, m);
3712
+ cc.evex().vdivps(ymmA, ymmB, m);
3713
+ cc.evex().vdivps(zmmA, zmmB, m);
3714
+ cc.evex().vdivsd(xmmA, xmmB, m);
3715
+ cc.evex().vdivss(xmmA, xmmB, m);
3716
+ cc.evex().vexp2pd(zmmA, m);
3717
+ cc.evex().vexp2ps(zmmA, m);
3718
+ cc.evex().vexpandpd(xmmA, m);
3719
+ cc.evex().vexpandpd(ymmA, m);
3720
+ cc.evex().vexpandpd(zmmA, m);
3721
+ cc.evex().vexpandps(xmmA, m);
3722
+ cc.evex().vexpandps(ymmA, m);
3723
+ cc.evex().vexpandps(zmmA, m);
3724
+ cc.evex().vextractf32x4(m, ymmB, 0);
3725
+ cc.evex().vextractf32x4(m, zmmB, 0);
3726
+ cc.evex().vextractf32x8(m, zmmB, 0);
3727
+ cc.evex().vextractf64x2(m, ymmB, 0);
3728
+ cc.evex().vextractf64x2(m, zmmB, 0);
3729
+ cc.evex().vextractf64x4(m, zmmB, 0);
3730
+ cc.evex().vextracti32x4(m, ymmB, 0);
3731
+ cc.evex().vextracti32x4(m, zmmB, 0);
3732
+ cc.evex().vextracti32x8(m, zmmB, 0);
3733
+ cc.evex().vextracti64x2(m, ymmB, 0);
3734
+ cc.evex().vextracti64x2(m, zmmB, 0);
3735
+ cc.evex().vextracti64x4(m, zmmB, 0);
3736
+ cc.evex().vextractps(m, xmmB, 0);
3737
+ cc.evex().vfixupimmpd(xmmA, xmmB, m, 0);
3738
+ cc.evex().vfixupimmpd(ymmA, ymmB, m, 0);
3739
+ cc.evex().vfixupimmpd(zmmA, zmmB, m, 0);
3740
+ cc.evex().vfixupimmps(xmmA, xmmB, m, 0);
3741
+ cc.evex().vfixupimmps(ymmA, ymmB, m, 0);
3742
+ cc.evex().vfixupimmps(zmmA, zmmB, m, 0);
3743
+ cc.evex().vfixupimmsd(xmmA, xmmB, m, 0);
3744
+ cc.evex().vfixupimmss(xmmA, xmmB, m, 0);
3745
+ cc.evex().vfmadd132pd(xmmA, xmmB, m);
3746
+ cc.evex().vfmadd132pd(ymmA, ymmB, m);
3747
+ cc.evex().vfmadd132pd(zmmA, zmmB, m);
3748
+ cc.evex().vfmadd132ps(xmmA, xmmB, m);
3749
+ cc.evex().vfmadd132ps(ymmA, ymmB, m);
3750
+ cc.evex().vfmadd132ps(zmmA, zmmB, m);
3751
+ cc.evex().vfmadd132sd(xmmA, xmmB, m);
3752
+ cc.evex().vfmadd132ss(xmmA, xmmB, m);
3753
+ cc.evex().vfmadd213pd(xmmA, xmmB, m);
3754
+ cc.evex().vfmadd213pd(ymmA, ymmB, m);
3755
+ cc.evex().vfmadd213pd(zmmA, zmmB, m);
3756
+ cc.evex().vfmadd213ps(xmmA, xmmB, m);
3757
+ cc.evex().vfmadd213ps(ymmA, ymmB, m);
3758
+ cc.evex().vfmadd213ps(zmmA, zmmB, m);
3759
+ cc.evex().vfmadd213sd(xmmA, xmmB, m);
3760
+ cc.evex().vfmadd213ss(xmmA, xmmB, m);
3761
+ cc.evex().vfmadd231pd(xmmA, xmmB, m);
3762
+ cc.evex().vfmadd231pd(ymmA, ymmB, m);
3763
+ cc.evex().vfmadd231pd(zmmA, zmmB, m);
3764
+ cc.evex().vfmadd231ps(xmmA, xmmB, m);
3765
+ cc.evex().vfmadd231ps(ymmA, ymmB, m);
3766
+ cc.evex().vfmadd231ps(zmmA, zmmB, m);
3767
+ cc.evex().vfmadd231sd(xmmA, xmmB, m);
3768
+ cc.evex().vfmadd231ss(xmmA, xmmB, m);
3769
+ cc.evex().vfmaddsub132pd(xmmA, xmmB, m);
3770
+ cc.evex().vfmaddsub132pd(ymmA, ymmB, m);
3771
+ cc.evex().vfmaddsub132pd(zmmA, zmmB, m);
3772
+ cc.evex().vfmaddsub132ps(xmmA, xmmB, m);
3773
+ cc.evex().vfmaddsub132ps(ymmA, ymmB, m);
3774
+ cc.evex().vfmaddsub132ps(zmmA, zmmB, m);
3775
+ cc.evex().vfmaddsub213pd(xmmA, xmmB, m);
3776
+ cc.evex().vfmaddsub213pd(ymmA, ymmB, m);
3777
+ cc.evex().vfmaddsub213pd(zmmA, zmmB, m);
3778
+ cc.evex().vfmaddsub213ps(xmmA, xmmB, m);
3779
+ cc.evex().vfmaddsub213ps(ymmA, ymmB, m);
3780
+ cc.evex().vfmaddsub213ps(zmmA, zmmB, m);
3781
+ cc.evex().vfmaddsub231pd(xmmA, xmmB, m);
3782
+ cc.evex().vfmaddsub231pd(ymmA, ymmB, m);
3783
+ cc.evex().vfmaddsub231pd(zmmA, zmmB, m);
3784
+ cc.evex().vfmaddsub231ps(xmmA, xmmB, m);
3785
+ cc.evex().vfmaddsub231ps(ymmA, ymmB, m);
3786
+ cc.evex().vfmaddsub231ps(zmmA, zmmB, m);
3787
+ cc.evex().vfmsub132pd(xmmA, xmmB, m);
3788
+ cc.evex().vfmsub132pd(ymmA, ymmB, m);
3789
+ cc.evex().vfmsub132pd(zmmA, zmmB, m);
3790
+ cc.evex().vfmsub132ps(xmmA, xmmB, m);
3791
+ cc.evex().vfmsub132ps(ymmA, ymmB, m);
3792
+ cc.evex().vfmsub132ps(zmmA, zmmB, m);
3793
+ cc.evex().vfmsub132sd(xmmA, xmmB, m);
3794
+ cc.evex().vfmsub132ss(xmmA, xmmB, m);
3795
+ cc.evex().vfmsub213pd(xmmA, xmmB, m);
3796
+ cc.evex().vfmsub213pd(ymmA, ymmB, m);
3797
+ cc.evex().vfmsub213pd(zmmA, zmmB, m);
3798
+ cc.evex().vfmsub213ps(xmmA, xmmB, m);
3799
+ cc.evex().vfmsub213ps(ymmA, ymmB, m);
3800
+ cc.evex().vfmsub213ps(zmmA, zmmB, m);
3801
+ cc.evex().vfmsub213sd(xmmA, xmmB, m);
3802
+ cc.evex().vfmsub213ss(xmmA, xmmB, m);
3803
+ cc.evex().vfmsub231pd(xmmA, xmmB, m);
3804
+ cc.evex().vfmsub231pd(ymmA, ymmB, m);
3805
+ cc.evex().vfmsub231pd(zmmA, zmmB, m);
3806
+ cc.evex().vfmsub231ps(xmmA, xmmB, m);
3807
+ cc.evex().vfmsub231ps(ymmA, ymmB, m);
3808
+ cc.evex().vfmsub231ps(zmmA, zmmB, m);
3809
+ cc.evex().vfmsub231sd(xmmA, xmmB, m);
3810
+ cc.evex().vfmsub231ss(xmmA, xmmB, m);
3811
+ cc.evex().vfmsubadd132pd(xmmA, xmmB, m);
3812
+ cc.evex().vfmsubadd132pd(ymmA, ymmB, m);
3813
+ cc.evex().vfmsubadd132pd(zmmA, zmmB, m);
3814
+ cc.evex().vfmsubadd132ps(xmmA, xmmB, m);
3815
+ cc.evex().vfmsubadd132ps(ymmA, ymmB, m);
3816
+ cc.evex().vfmsubadd132ps(zmmA, zmmB, m);
3817
+ cc.evex().vfmsubadd213pd(xmmA, xmmB, m);
3818
+ cc.evex().vfmsubadd213pd(ymmA, ymmB, m);
3819
+ cc.evex().vfmsubadd213pd(zmmA, zmmB, m);
3820
+ cc.evex().vfmsubadd213ps(xmmA, xmmB, m);
3821
+ cc.evex().vfmsubadd213ps(ymmA, ymmB, m);
3822
+ cc.evex().vfmsubadd213ps(zmmA, zmmB, m);
3823
+ cc.evex().vfmsubadd231pd(xmmA, xmmB, m);
3824
+ cc.evex().vfmsubadd231pd(ymmA, ymmB, m);
3825
+ cc.evex().vfmsubadd231pd(zmmA, zmmB, m);
3826
+ cc.evex().vfmsubadd231ps(xmmA, xmmB, m);
3827
+ cc.evex().vfmsubadd231ps(ymmA, ymmB, m);
3828
+ cc.evex().vfmsubadd231ps(zmmA, zmmB, m);
3829
+ cc.evex().vfnmadd132pd(xmmA, xmmB, m);
3830
+ cc.evex().vfnmadd132pd(ymmA, ymmB, m);
3831
+ cc.evex().vfnmadd132pd(zmmA, zmmB, m);
3832
+ cc.evex().vfnmadd132ps(xmmA, xmmB, m);
3833
+ cc.evex().vfnmadd132ps(ymmA, ymmB, m);
3834
+ cc.evex().vfnmadd132ps(zmmA, zmmB, m);
3835
+ cc.evex().vfnmadd132sd(xmmA, xmmB, m);
3836
+ cc.evex().vfnmadd132ss(xmmA, xmmB, m);
3837
+ cc.evex().vfnmadd213pd(xmmA, xmmB, m);
3838
+ cc.evex().vfnmadd213pd(ymmA, ymmB, m);
3839
+ cc.evex().vfnmadd213pd(zmmA, zmmB, m);
3840
+ cc.evex().vfnmadd213ps(xmmA, xmmB, m);
3841
+ cc.evex().vfnmadd213ps(ymmA, ymmB, m);
3842
+ cc.evex().vfnmadd213ps(zmmA, zmmB, m);
3843
+ cc.evex().vfnmadd213sd(xmmA, xmmB, m);
3844
+ cc.evex().vfnmadd213ss(xmmA, xmmB, m);
3845
+ cc.evex().vfnmadd231pd(xmmA, xmmB, m);
3846
+ cc.evex().vfnmadd231pd(ymmA, ymmB, m);
3847
+ cc.evex().vfnmadd231pd(zmmA, zmmB, m);
3848
+ cc.evex().vfnmadd231ps(xmmA, xmmB, m);
3849
+ cc.evex().vfnmadd231ps(ymmA, ymmB, m);
3850
+ cc.evex().vfnmadd231ps(zmmA, zmmB, m);
3851
+ cc.evex().vfnmadd231sd(xmmA, xmmB, m);
3852
+ cc.evex().vfnmadd231ss(xmmA, xmmB, m);
3853
+ cc.evex().vfnmsub132pd(xmmA, xmmB, m);
3854
+ cc.evex().vfnmsub132pd(ymmA, ymmB, m);
3855
+ cc.evex().vfnmsub132pd(zmmA, zmmB, m);
3856
+ cc.evex().vfnmsub132ps(xmmA, xmmB, m);
3857
+ cc.evex().vfnmsub132ps(ymmA, ymmB, m);
3858
+ cc.evex().vfnmsub132ps(zmmA, zmmB, m);
3859
+ cc.evex().vfnmsub132sd(xmmA, xmmB, m);
3860
+ cc.evex().vfnmsub132ss(xmmA, xmmB, m);
3861
+ cc.evex().vfnmsub213pd(xmmA, xmmB, m);
3862
+ cc.evex().vfnmsub213pd(ymmA, ymmB, m);
3863
+ cc.evex().vfnmsub213pd(zmmA, zmmB, m);
3864
+ cc.evex().vfnmsub213ps(xmmA, xmmB, m);
3865
+ cc.evex().vfnmsub213ps(ymmA, ymmB, m);
3866
+ cc.evex().vfnmsub213ps(zmmA, zmmB, m);
3867
+ cc.evex().vfnmsub213sd(xmmA, xmmB, m);
3868
+ cc.evex().vfnmsub213ss(xmmA, xmmB, m);
3869
+ cc.evex().vfnmsub231pd(xmmA, xmmB, m);
3870
+ cc.evex().vfnmsub231pd(ymmA, ymmB, m);
3871
+ cc.evex().vfnmsub231pd(zmmA, zmmB, m);
3872
+ cc.evex().vfnmsub231ps(xmmA, xmmB, m);
3873
+ cc.evex().vfnmsub231ps(ymmA, ymmB, m);
3874
+ cc.evex().vfnmsub231ps(zmmA, zmmB, m);
3875
+ cc.evex().vfnmsub231sd(xmmA, xmmB, m);
3876
+ cc.evex().vfnmsub231ss(xmmA, xmmB, m);
3877
+ cc.evex().vfpclasspd(kA, m128, 0);
3878
+ cc.evex().vfpclasspd(kA, m256, 0);
3879
+ cc.evex().vfpclasspd(kA, m512, 0);
3880
+ cc.evex().vfpclassps(kA, m128, 0);
3881
+ cc.evex().vfpclassps(kA, m256, 0);
3882
+ cc.evex().vfpclassps(kA, m512, 0);
3883
+ cc.evex().vfpclasssd(kA, m, 0);
3884
+ cc.evex().vfpclassss(kA, m, 0);
3885
+ cc.evex().k(kA).vgatherdpd(xmmA, vx_ptr);
3886
+ cc.evex().k(kA).vgatherdpd(ymmA, vx_ptr);
3887
+ cc.evex().k(kA).vgatherdpd(zmmA, vy_ptr);
3888
+ cc.evex().k(kA).vgatherdps(xmmA, vx_ptr);
3889
+ cc.evex().k(kA).vgatherdps(ymmA, vy_ptr);
3890
+ cc.evex().k(kA).vgatherdps(zmmA, vz_ptr);
3891
+ cc.evex().k(kA).vgatherpf0dpd(vy_ptr);
3892
+ cc.evex().k(kA).vgatherpf0dps(vz_ptr);
3893
+ cc.evex().k(kA).vgatherpf0qpd(vz_ptr);
3894
+ cc.evex().k(kA).vgatherpf0qps(vz_ptr);
3895
+ cc.evex().k(kA).vgatherpf1dpd(vy_ptr);
3896
+ cc.evex().k(kA).vgatherpf1dps(vz_ptr);
3897
+ cc.evex().k(kA).vgatherpf1qpd(vz_ptr);
3898
+ cc.evex().k(kA).vgatherpf1qps(vz_ptr);
3899
+ cc.evex().k(kA).vgatherqpd(xmmA, vx_ptr);
3900
+ cc.evex().k(kA).vgatherqpd(ymmA, vy_ptr);
3901
+ cc.evex().k(kA).vgatherqpd(zmmA, vz_ptr);
3902
+ cc.evex().k(kA).vgatherqps(xmmA, vx_ptr);
3903
+ cc.evex().k(kA).vgatherqps(xmmA, vy_ptr);
3904
+ cc.evex().k(kA).vgatherqps(ymmA, vz_ptr);
3905
+ cc.evex().vgetexppd(xmmA, m);
3906
+ cc.evex().vgetexppd(ymmA, m);
3907
+ cc.evex().vgetexppd(zmmA, m);
3908
+ cc.evex().vgetexpps(xmmA, m);
3909
+ cc.evex().vgetexpps(ymmA, m);
3910
+ cc.evex().vgetexpps(zmmA, m);
3911
+ cc.evex().vgetexpsd(xmmA, xmmB, m);
3912
+ cc.evex().vgetexpss(xmmA, xmmB, m);
3913
+ cc.evex().vgetmantpd(xmmA, m, 0);
3914
+ cc.evex().vgetmantpd(ymmA, m, 0);
3915
+ cc.evex().vgetmantpd(zmmA, m, 0);
3916
+ cc.evex().vgetmantps(xmmA, m, 0);
3917
+ cc.evex().vgetmantps(ymmA, m, 0);
3918
+ cc.evex().vgetmantps(zmmA, m, 0);
3919
+ cc.evex().vgetmantsd(xmmA, xmmB, m, 0);
3920
+ cc.evex().vgetmantss(xmmA, xmmB, m, 0);
3921
+ cc.evex().vinsertf32x4(ymmA, ymmB, m, 0);
3922
+ cc.evex().vinsertf32x4(zmmA, zmmB, m, 0);
3923
+ cc.evex().vinsertf32x8(zmmA, zmmB, m, 0);
3924
+ cc.evex().vinsertf64x2(ymmA, ymmB, m, 0);
3925
+ cc.evex().vinsertf64x2(zmmA, zmmB, m, 0);
3926
+ cc.evex().vinsertf64x4(zmmA, zmmB, m, 0);
3927
+ cc.evex().vinserti32x4(ymmA, ymmB, m, 0);
3928
+ cc.evex().vinserti32x4(zmmA, zmmB, m, 0);
3929
+ cc.evex().vinserti32x8(zmmA, zmmB, m, 0);
3930
+ cc.evex().vinserti64x2(ymmA, ymmB, m, 0);
3931
+ cc.evex().vinserti64x2(zmmA, zmmB, m, 0);
3932
+ cc.evex().vinserti64x4(zmmA, zmmB, m, 0);
3933
+ cc.evex().vinsertps(xmmA, xmmB, m, 0);
3934
+ cc.evex().vmaxpd(xmmA, xmmB, m);
3935
+ cc.evex().vmaxpd(ymmA, ymmB, m);
3936
+ cc.evex().vmaxpd(zmmA, zmmB, m);
3937
+ cc.evex().vmaxps(xmmA, xmmB, m);
3938
+ cc.evex().vmaxps(ymmA, ymmB, m);
3939
+ cc.evex().vmaxps(zmmA, zmmB, m);
3940
+ cc.evex().vmaxsd(xmmA, xmmB, m);
3941
+ cc.evex().vmaxss(xmmA, xmmB, m);
3942
+ cc.evex().vminpd(xmmA, xmmB, m);
3943
+ cc.evex().vminpd(ymmA, ymmB, m);
3944
+ cc.evex().vminpd(zmmA, zmmB, m);
3945
+ cc.evex().vminps(xmmA, xmmB, m);
3946
+ cc.evex().vminps(ymmA, ymmB, m);
3947
+ cc.evex().vminps(zmmA, zmmB, m);
3948
+ cc.evex().vminsd(xmmA, xmmB, m);
3949
+ cc.evex().vminss(xmmA, xmmB, m);
3950
+ cc.evex().vmovapd(xmmA, m);
3951
+ cc.evex().vmovapd(m, xmmB);
3952
+ cc.evex().vmovapd(ymmA, m);
3953
+ cc.evex().vmovapd(m, ymmB);
3954
+ cc.evex().vmovapd(zmmA, m);
3955
+ cc.evex().vmovapd(m, zmmB);
3956
+ cc.evex().vmovaps(xmmA, m);
3957
+ cc.evex().vmovaps(m, xmmB);
3958
+ cc.evex().vmovaps(ymmA, m);
3959
+ cc.evex().vmovaps(m, ymmB);
3960
+ cc.evex().vmovaps(zmmA, m);
3961
+ cc.evex().vmovaps(m, zmmB);
3962
+ cc.evex().vmovd(m, xmmB);
3963
+ cc.evex().vmovd(xmmA, m);
3964
+ cc.evex().vmovddup(xmmA, m);
3965
+ cc.evex().vmovddup(ymmA, m);
3966
+ cc.evex().vmovddup(zmmA, m);
3967
+ cc.evex().vmovdqa32(xmmA, m);
3968
+ cc.evex().vmovdqa32(m, xmmB);
3969
+ cc.evex().vmovdqa32(ymmA, m);
3970
+ cc.evex().vmovdqa32(m, ymmB);
3971
+ cc.evex().vmovdqa32(zmmA, m);
3972
+ cc.evex().vmovdqa32(m, zmmB);
3973
+ cc.evex().vmovdqa64(xmmA, m);
3974
+ cc.evex().vmovdqa64(m, xmmB);
3975
+ cc.evex().vmovdqa64(ymmA, m);
3976
+ cc.evex().vmovdqa64(m, ymmB);
3977
+ cc.evex().vmovdqa64(zmmA, m);
3978
+ cc.evex().vmovdqa64(m, zmmB);
3979
+ cc.evex().vmovdqu16(xmmA, m);
3980
+ cc.evex().vmovdqu16(m, xmmB);
3981
+ cc.evex().vmovdqu16(ymmA, m);
3982
+ cc.evex().vmovdqu16(m, ymmB);
3983
+ cc.evex().vmovdqu16(zmmA, m);
3984
+ cc.evex().vmovdqu16(m, zmmB);
3985
+ cc.evex().vmovdqu32(xmmA, m);
3986
+ cc.evex().vmovdqu32(m, xmmB);
3987
+ cc.evex().vmovdqu32(ymmA, m);
3988
+ cc.evex().vmovdqu32(m, ymmB);
3989
+ cc.evex().vmovdqu32(zmmA, m);
3990
+ cc.evex().vmovdqu32(m, zmmB);
3991
+ cc.evex().vmovdqu64(xmmA, m);
3992
+ cc.evex().vmovdqu64(m, xmmB);
3993
+ cc.evex().vmovdqu64(ymmA, m);
3994
+ cc.evex().vmovdqu64(m, ymmB);
3995
+ cc.evex().vmovdqu64(zmmA, m);
3996
+ cc.evex().vmovdqu64(m, zmmB);
3997
+ cc.evex().vmovdqu8(xmmA, m);
3998
+ cc.evex().vmovdqu8(m, xmmB);
3999
+ cc.evex().vmovdqu8(ymmA, m);
4000
+ cc.evex().vmovdqu8(m, ymmB);
4001
+ cc.evex().vmovdqu8(zmmA, m);
4002
+ cc.evex().vmovdqu8(m, zmmB);
4003
+ cc.evex().vmovhpd(m, xmmB);
4004
+ cc.evex().vmovhpd(xmmA, xmmB, m);
4005
+ cc.evex().vmovhps(m, xmmB);
4006
+ cc.evex().vmovhps(xmmA, xmmB, m);
4007
+ cc.evex().vmovlpd(m, xmmB);
4008
+ cc.evex().vmovlpd(xmmA, xmmB, m);
4009
+ cc.evex().vmovlps(m, xmmB);
4010
+ cc.evex().vmovlps(xmmA, xmmB, m);
4011
+ cc.evex().vmovntdq(m, xmmB);
4012
+ cc.evex().vmovntdq(m, ymmB);
4013
+ cc.evex().vmovntdq(m, zmmB);
4014
+ cc.evex().vmovntdqa(xmmA, m);
4015
+ cc.evex().vmovntdqa(ymmA, m);
4016
+ cc.evex().vmovntdqa(zmmA, m);
4017
+ cc.evex().vmovntpd(m, xmmB);
4018
+ cc.evex().vmovntpd(m, ymmB);
4019
+ cc.evex().vmovntpd(m, zmmB);
4020
+ cc.evex().vmovntps(m, xmmB);
4021
+ cc.evex().vmovntps(m, ymmB);
4022
+ cc.evex().vmovntps(m, zmmB);
4023
+ cc.evex().vmovq(m, xmmB);
4024
+ cc.evex().vmovq(xmmA, m);
4025
+ cc.evex().vmovq(xmmA, m);
4026
+ cc.evex().vmovq(m, xmmB);
4027
+ cc.evex().vmovsd(m, xmmB);
4028
+ cc.evex().vmovsd(xmmA, m);
4029
+ cc.evex().vmovshdup(xmmA, m);
4030
+ cc.evex().vmovshdup(ymmA, m);
4031
+ cc.evex().vmovshdup(zmmA, m);
4032
+ cc.evex().vmovsldup(xmmA, m);
4033
+ cc.evex().vmovsldup(ymmA, m);
4034
+ cc.evex().vmovsldup(zmmA, m);
4035
+ cc.evex().vmovss(m, xmmB);
4036
+ cc.evex().vmovss(xmmA, m);
4037
+ cc.evex().vmovupd(xmmA, m);
4038
+ cc.evex().vmovupd(m, xmmB);
4039
+ cc.evex().vmovupd(ymmA, m);
4040
+ cc.evex().vmovupd(m, ymmB);
4041
+ cc.evex().vmovupd(zmmA, m);
4042
+ cc.evex().vmovupd(m, zmmB);
4043
+ cc.evex().vmovups(xmmA, m);
4044
+ cc.evex().vmovups(m, xmmB);
4045
+ cc.evex().vmovups(ymmA, m);
4046
+ cc.evex().vmovups(m, ymmB);
4047
+ cc.evex().vmovups(zmmA, m);
4048
+ cc.evex().vmovups(m, zmmB);
4049
+ cc.evex().vmulpd(xmmA, xmmB, m);
4050
+ cc.evex().vmulpd(ymmA, ymmB, m);
4051
+ cc.evex().vmulpd(zmmA, zmmB, m);
4052
+ cc.evex().vmulps(xmmA, xmmB, m);
4053
+ cc.evex().vmulps(ymmA, ymmB, m);
4054
+ cc.evex().vmulps(zmmA, zmmB, m);
4055
+ cc.evex().vmulsd(xmmA, xmmB, m);
4056
+ cc.evex().vmulss(xmmA, xmmB, m);
4057
+ cc.evex().vorpd(xmmA, xmmB, m);
4058
+ cc.evex().vorpd(ymmA, ymmB, m);
4059
+ cc.evex().vorpd(zmmA, zmmB, m);
4060
+ cc.evex().vorps(xmmA, xmmB, m);
4061
+ cc.evex().vorps(ymmA, ymmB, m);
4062
+ cc.evex().vorps(zmmA, zmmB, m);
4063
+ cc.evex().vpabsb(xmmA, m);
4064
+ cc.evex().vpabsb(ymmA, m);
4065
+ cc.evex().vpabsb(zmmA, m);
4066
+ cc.evex().vpabsd(xmmA, m);
4067
+ cc.evex().vpabsd(ymmA, m);
4068
+ cc.evex().vpabsd(zmmA, m);
4069
+ cc.evex().vpabsq(xmmA, m);
4070
+ cc.evex().vpabsq(ymmA, m);
4071
+ cc.evex().vpabsq(zmmA, m);
4072
+ cc.evex().vpabsw(xmmA, m);
4073
+ cc.evex().vpabsw(ymmA, m);
4074
+ cc.evex().vpabsw(zmmA, m);
4075
+ cc.evex().vpackssdw(xmmA, xmmB, m);
4076
+ cc.evex().vpackssdw(ymmA, ymmB, m);
4077
+ cc.evex().vpackssdw(zmmA, zmmB, m);
4078
+ cc.evex().vpacksswb(xmmA, xmmB, m);
4079
+ cc.evex().vpacksswb(ymmA, ymmB, m);
4080
+ cc.evex().vpacksswb(zmmA, zmmB, m);
4081
+ cc.evex().vpackusdw(xmmA, xmmB, m);
4082
+ cc.evex().vpackusdw(ymmA, ymmB, m);
4083
+ cc.evex().vpackusdw(zmmA, zmmB, m);
4084
+ cc.evex().vpackuswb(xmmA, xmmB, m);
4085
+ cc.evex().vpackuswb(ymmA, ymmB, m);
4086
+ cc.evex().vpackuswb(zmmA, zmmB, m);
4087
+ cc.evex().vpaddb(xmmA, xmmB, m);
4088
+ cc.evex().vpaddb(ymmA, ymmB, m);
4089
+ cc.evex().vpaddb(zmmA, zmmB, m);
4090
+ cc.evex().vpaddd(xmmA, xmmB, m);
4091
+ cc.evex().vpaddd(ymmA, ymmB, m);
4092
+ cc.evex().vpaddd(zmmA, zmmB, m);
4093
+ cc.evex().vpaddq(xmmA, xmmB, m);
4094
+ cc.evex().vpaddq(ymmA, ymmB, m);
4095
+ cc.evex().vpaddq(zmmA, zmmB, m);
4096
+ cc.evex().vpaddsb(xmmA, xmmB, m);
4097
+ cc.evex().vpaddsb(ymmA, ymmB, m);
4098
+ cc.evex().vpaddsb(zmmA, zmmB, m);
4099
+ cc.evex().vpaddsw(xmmA, xmmB, m);
4100
+ cc.evex().vpaddsw(ymmA, ymmB, m);
4101
+ cc.evex().vpaddsw(zmmA, zmmB, m);
4102
+ cc.evex().vpaddusb(xmmA, xmmB, m);
4103
+ cc.evex().vpaddusb(ymmA, ymmB, m);
4104
+ cc.evex().vpaddusb(zmmA, zmmB, m);
4105
+ cc.evex().vpaddusw(xmmA, xmmB, m);
4106
+ cc.evex().vpaddusw(ymmA, ymmB, m);
4107
+ cc.evex().vpaddusw(zmmA, zmmB, m);
4108
+ cc.evex().vpaddw(xmmA, xmmB, m);
4109
+ cc.evex().vpaddw(ymmA, ymmB, m);
4110
+ cc.evex().vpaddw(zmmA, zmmB, m);
4111
+ cc.evex().vpalignr(xmmA, xmmB, m, 0);
4112
+ cc.evex().vpalignr(ymmA, ymmB, m, 0);
4113
+ cc.evex().vpalignr(zmmA, zmmB, m, 0);
4114
+ cc.evex().vpandd(xmmA, xmmB, m);
4115
+ cc.evex().vpandd(ymmA, ymmB, m);
4116
+ cc.evex().vpandd(zmmA, zmmB, m);
4117
+ cc.evex().vpandnd(xmmA, xmmB, m);
4118
+ cc.evex().vpandnd(ymmA, ymmB, m);
4119
+ cc.evex().vpandnd(zmmA, zmmB, m);
4120
+ cc.evex().vpandnq(xmmA, xmmB, m);
4121
+ cc.evex().vpandnq(ymmA, ymmB, m);
4122
+ cc.evex().vpandnq(zmmA, zmmB, m);
4123
+ cc.evex().vpandq(xmmA, xmmB, m);
4124
+ cc.evex().vpandq(ymmA, ymmB, m);
4125
+ cc.evex().vpandq(zmmA, zmmB, m);
4126
+ cc.evex().vpavgb(xmmA, xmmB, m);
4127
+ cc.evex().vpavgb(ymmA, ymmB, m);
4128
+ cc.evex().vpavgb(zmmA, zmmB, m);
4129
+ cc.evex().vpavgw(xmmA, xmmB, m);
4130
+ cc.evex().vpavgw(ymmA, ymmB, m);
4131
+ cc.evex().vpavgw(zmmA, zmmB, m);
4132
+ cc.evex().vpblendmb(xmmA, xmmB, m);
4133
+ cc.evex().vpblendmb(ymmA, ymmB, m);
4134
+ cc.evex().vpblendmb(zmmA, zmmB, m);
4135
+ cc.evex().vpblendmd(xmmA, xmmB, m);
4136
+ cc.evex().vpblendmd(ymmA, ymmB, m);
4137
+ cc.evex().vpblendmd(zmmA, zmmB, m);
4138
+ cc.evex().vpblendmq(xmmA, xmmB, m);
4139
+ cc.evex().vpblendmq(ymmA, ymmB, m);
4140
+ cc.evex().vpblendmq(zmmA, zmmB, m);
4141
+ cc.evex().vpblendmw(xmmA, xmmB, m);
4142
+ cc.evex().vpblendmw(ymmA, ymmB, m);
4143
+ cc.evex().vpblendmw(zmmA, zmmB, m);
4144
+ cc.evex().vpbroadcastb(xmmA, m);
4145
+ cc.evex().vpbroadcastb(ymmA, m);
4146
+ cc.evex().vpbroadcastb(zmmA, m);
4147
+ cc.evex().vpbroadcastd(xmmA, m);
4148
+ cc.evex().vpbroadcastd(ymmA, m);
4149
+ cc.evex().vpbroadcastd(zmmA, m);
4150
+ cc.evex().vpbroadcastq(xmmA, m);
4151
+ cc.evex().vpbroadcastq(ymmA, m);
4152
+ cc.evex().vpbroadcastq(zmmA, m);
4153
+ cc.evex().vpbroadcastw(xmmA, m);
4154
+ cc.evex().vpbroadcastw(ymmA, m);
4155
+ cc.evex().vpbroadcastw(zmmA, m);
4156
+ cc.evex().vpcmpb(kA, xmmB, m, 0);
4157
+ cc.evex().vpcmpb(kA, ymmB, m, 0);
4158
+ cc.evex().vpcmpb(kA, zmmB, m, 0);
4159
+ cc.evex().vpcmpd(kA, xmmB, m, 0);
4160
+ cc.evex().vpcmpd(kA, ymmB, m, 0);
4161
+ cc.evex().vpcmpd(kA, zmmB, m, 0);
4162
+ cc.evex().vpcmpeqb(kA, xmmB, m);
4163
+ cc.evex().vpcmpeqb(kA, ymmB, m);
4164
+ cc.evex().vpcmpeqb(kA, zmmB, m);
4165
+ cc.evex().vpcmpeqd(kA, xmmB, m);
4166
+ cc.evex().vpcmpeqd(kA, ymmB, m);
4167
+ cc.evex().vpcmpeqd(kA, zmmB, m);
4168
+ cc.evex().vpcmpeqq(kA, xmmB, m);
4169
+ cc.evex().vpcmpeqq(kA, ymmB, m);
4170
+ cc.evex().vpcmpeqq(kA, zmmB, m);
4171
+ cc.evex().vpcmpeqw(kA, xmmB, m);
4172
+ cc.evex().vpcmpeqw(kA, ymmB, m);
4173
+ cc.evex().vpcmpeqw(kA, zmmB, m);
4174
+ cc.evex().vpcmpgtb(kA, xmmB, m);
4175
+ cc.evex().vpcmpgtb(kA, ymmB, m);
4176
+ cc.evex().vpcmpgtb(kA, zmmB, m);
4177
+ cc.evex().vpcmpgtd(kA, xmmB, m);
4178
+ cc.evex().vpcmpgtd(kA, ymmB, m);
4179
+ cc.evex().vpcmpgtd(kA, zmmB, m);
4180
+ cc.evex().vpcmpgtq(kA, xmmB, m);
4181
+ cc.evex().vpcmpgtq(kA, ymmB, m);
4182
+ cc.evex().vpcmpgtq(kA, zmmB, m);
4183
+ cc.evex().vpcmpgtw(kA, xmmB, m);
4184
+ cc.evex().vpcmpgtw(kA, ymmB, m);
4185
+ cc.evex().vpcmpgtw(kA, zmmB, m);
4186
+ cc.evex().vpcmpq(kA, xmmB, m, 0);
4187
+ cc.evex().vpcmpq(kA, ymmB, m, 0);
4188
+ cc.evex().vpcmpq(kA, zmmB, m, 0);
4189
+ cc.evex().vpcmpub(kA, xmmB, m, 0);
4190
+ cc.evex().vpcmpub(kA, ymmB, m, 0);
4191
+ cc.evex().vpcmpub(kA, zmmB, m, 0);
4192
+ cc.evex().vpcmpud(kA, xmmB, m, 0);
4193
+ cc.evex().vpcmpud(kA, ymmB, m, 0);
4194
+ cc.evex().vpcmpud(kA, zmmB, m, 0);
4195
+ cc.evex().vpcmpuq(kA, xmmB, m, 0);
4196
+ cc.evex().vpcmpuq(kA, ymmB, m, 0);
4197
+ cc.evex().vpcmpuq(kA, zmmB, m, 0);
4198
+ cc.evex().vpcmpuw(kA, xmmB, m, 0);
4199
+ cc.evex().vpcmpuw(kA, ymmB, m, 0);
4200
+ cc.evex().vpcmpuw(kA, zmmB, m, 0);
4201
+ cc.evex().vpcmpw(kA, xmmB, m, 0);
4202
+ cc.evex().vpcmpw(kA, ymmB, m, 0);
4203
+ cc.evex().vpcmpw(kA, zmmB, m, 0);
4204
+ cc.evex().vpcompressd(m, xmmB);
4205
+ cc.evex().vpcompressd(m, ymmB);
4206
+ cc.evex().vpcompressd(m, zmmB);
4207
+ cc.evex().vpcompressq(m, xmmB);
4208
+ cc.evex().vpcompressq(m, ymmB);
4209
+ cc.evex().vpcompressq(m, zmmB);
4210
+ cc.evex().vpconflictd(xmmA, m);
4211
+ cc.evex().vpconflictd(ymmA, m);
4212
+ cc.evex().vpconflictd(zmmA, m);
4213
+ cc.evex().vpconflictq(xmmA, m);
4214
+ cc.evex().vpconflictq(ymmA, m);
4215
+ cc.evex().vpconflictq(zmmA, m);
4216
+ cc.evex().vpermb(xmmA, xmmB, m);
4217
+ cc.evex().vpermb(ymmA, ymmB, m);
4218
+ cc.evex().vpermb(zmmA, zmmB, m);
4219
+ cc.evex().vpermd(ymmA, ymmB, m);
4220
+ cc.evex().vpermd(zmmA, zmmB, m);
4221
+ cc.evex().vpermi2b(xmmA, xmmB, m);
4222
+ cc.evex().vpermi2b(ymmA, ymmB, m);
4223
+ cc.evex().vpermi2b(zmmA, zmmB, m);
4224
+ cc.evex().vpermi2d(xmmA, xmmB, m);
4225
+ cc.evex().vpermi2d(ymmA, ymmB, m);
4226
+ cc.evex().vpermi2d(zmmA, zmmB, m);
4227
+ cc.evex().vpermi2pd(xmmA, xmmB, m);
4228
+ cc.evex().vpermi2pd(ymmA, ymmB, m);
4229
+ cc.evex().vpermi2pd(zmmA, zmmB, m);
4230
+ cc.evex().vpermi2ps(xmmA, xmmB, m);
4231
+ cc.evex().vpermi2ps(ymmA, ymmB, m);
4232
+ cc.evex().vpermi2ps(zmmA, zmmB, m);
4233
+ cc.evex().vpermi2q(xmmA, xmmB, m);
4234
+ cc.evex().vpermi2q(ymmA, ymmB, m);
4235
+ cc.evex().vpermi2q(zmmA, zmmB, m);
4236
+ cc.evex().vpermi2w(xmmA, xmmB, m);
4237
+ cc.evex().vpermi2w(ymmA, ymmB, m);
4238
+ cc.evex().vpermi2w(zmmA, zmmB, m);
4239
+ cc.evex().vpermilpd(xmmA, xmmB, m);
4240
+ cc.evex().vpermilpd(ymmA, ymmB, m);
4241
+ cc.evex().vpermilpd(zmmA, zmmB, m);
4242
+ cc.evex().vpermilpd(xmmA, m, 0);
4243
+ cc.evex().vpermilpd(ymmA, m, 0);
4244
+ cc.evex().vpermilpd(zmmA, m, 0);
4245
+ cc.evex().vpermilps(xmmA, xmmB, m);
4246
+ cc.evex().vpermilps(ymmA, ymmB, m);
4247
+ cc.evex().vpermilps(zmmA, zmmB, m);
4248
+ cc.evex().vpermilps(xmmA, m, 0);
4249
+ cc.evex().vpermilps(ymmA, m, 0);
4250
+ cc.evex().vpermilps(zmmA, m, 0);
4251
+ cc.evex().vpermq(ymmA, ymmB, m);
4252
+ cc.evex().vpermq(zmmA, zmmB, m);
4253
+ cc.evex().vpermq(ymmA, m, 0);
4254
+ cc.evex().vpermq(zmmA, m, 0);
4255
+ cc.evex().vpermt2b(xmmA, xmmB, m);
4256
+ cc.evex().vpermt2b(ymmA, ymmB, m);
4257
+ cc.evex().vpermt2b(zmmA, zmmB, m);
4258
+ cc.evex().vpermt2d(xmmA, xmmB, m);
4259
+ cc.evex().vpermt2d(ymmA, ymmB, m);
4260
+ cc.evex().vpermt2d(zmmA, zmmB, m);
4261
+ cc.evex().vpermt2pd(xmmA, xmmB, m);
4262
+ cc.evex().vpermt2pd(ymmA, ymmB, m);
4263
+ cc.evex().vpermt2pd(zmmA, zmmB, m);
4264
+ cc.evex().vpermt2ps(xmmA, xmmB, m);
4265
+ cc.evex().vpermt2ps(ymmA, ymmB, m);
4266
+ cc.evex().vpermt2ps(zmmA, zmmB, m);
4267
+ cc.evex().vpermt2q(xmmA, xmmB, m);
4268
+ cc.evex().vpermt2q(ymmA, ymmB, m);
4269
+ cc.evex().vpermt2q(zmmA, zmmB, m);
4270
+ cc.evex().vpermt2w(xmmA, xmmB, m);
4271
+ cc.evex().vpermt2w(ymmA, ymmB, m);
4272
+ cc.evex().vpermt2w(zmmA, zmmB, m);
4273
+ cc.evex().vpermw(xmmA, xmmB, m);
4274
+ cc.evex().vpermw(ymmA, ymmB, m);
4275
+ cc.evex().vpermw(zmmA, zmmB, m);
4276
+ cc.evex().vpexpandd(xmmA, m);
4277
+ cc.evex().vpexpandd(ymmA, m);
4278
+ cc.evex().vpexpandd(zmmA, m);
4279
+ cc.evex().vpexpandq(xmmA, m);
4280
+ cc.evex().vpexpandq(ymmA, m);
4281
+ cc.evex().vpexpandq(zmmA, m);
4282
+ cc.evex().vpextrb(m, xmmB, 0);
4283
+ cc.evex().vpextrd(m, xmmB, 0);
4284
+ if (cc.is64Bit()) cc.evex().vpextrq(m, xmmB, 0);
4285
+ cc.evex().vpextrw(m, xmmB, 0);
4286
+ cc.evex().k(kA).vpgatherdd(xmmA, vx_ptr);
4287
+ cc.evex().k(kA).vpgatherdd(ymmA, vy_ptr);
4288
+ cc.evex().k(kA).vpgatherdd(zmmA, vz_ptr);
4289
+ cc.evex().k(kA).vpgatherdq(xmmA, vx_ptr);
4290
+ cc.evex().k(kA).vpgatherdq(ymmA, vx_ptr);
4291
+ cc.evex().k(kA).vpgatherdq(zmmA, vy_ptr);
4292
+ cc.evex().k(kA).vpgatherqd(xmmA, vx_ptr);
4293
+ cc.evex().k(kA).vpgatherqd(xmmA, vy_ptr);
4294
+ cc.evex().k(kA).vpgatherqd(ymmA, vz_ptr);
4295
+ cc.evex().k(kA).vpgatherqq(xmmA, vx_ptr);
4296
+ cc.evex().k(kA).vpgatherqq(ymmA, vy_ptr);
4297
+ cc.evex().k(kA).vpgatherqq(zmmA, vz_ptr);
4298
+ cc.evex().vpinsrb(xmmA, xmmB, m, 0);
4299
+ cc.evex().vpinsrd(xmmA, xmmB, m, 0);
4300
+ if (cc.is64Bit()) cc.evex().vpinsrq(xmmA, xmmB, m, 0);
4301
+ cc.evex().vpinsrw(xmmA, xmmB, m, 0);
4302
+ cc.evex().vplzcntd(xmmA, m);
4303
+ cc.evex().vplzcntd(ymmA, m);
4304
+ cc.evex().vplzcntd(zmmA, m);
4305
+ cc.evex().vplzcntq(xmmA, m);
4306
+ cc.evex().vplzcntq(ymmA, m);
4307
+ cc.evex().vplzcntq(zmmA, m);
4308
+ cc.evex().vpmadd52huq(xmmA, xmmB, m);
4309
+ cc.evex().vpmadd52huq(ymmA, ymmB, m);
4310
+ cc.evex().vpmadd52huq(zmmA, zmmB, m);
4311
+ cc.evex().vpmadd52luq(xmmA, xmmB, m);
4312
+ cc.evex().vpmadd52luq(ymmA, ymmB, m);
4313
+ cc.evex().vpmadd52luq(zmmA, zmmB, m);
4314
+ cc.evex().vpmaddubsw(xmmA, xmmB, m);
4315
+ cc.evex().vpmaddubsw(ymmA, ymmB, m);
4316
+ cc.evex().vpmaddubsw(zmmA, zmmB, m);
4317
+ cc.evex().vpmaddwd(xmmA, xmmB, m);
4318
+ cc.evex().vpmaddwd(ymmA, ymmB, m);
4319
+ cc.evex().vpmaddwd(zmmA, zmmB, m);
4320
+ cc.evex().vpmaxsb(xmmA, xmmB, m);
4321
+ cc.evex().vpmaxsb(ymmA, ymmB, m);
4322
+ cc.evex().vpmaxsb(zmmA, zmmB, m);
4323
+ cc.evex().vpmaxsd(xmmA, xmmB, m);
4324
+ cc.evex().vpmaxsd(ymmA, ymmB, m);
4325
+ cc.evex().vpmaxsd(zmmA, zmmB, m);
4326
+ cc.evex().vpmaxsq(xmmA, xmmB, m);
4327
+ cc.evex().vpmaxsq(ymmA, ymmB, m);
4328
+ cc.evex().vpmaxsq(zmmA, zmmB, m);
4329
+ cc.evex().vpmaxsw(xmmA, xmmB, m);
4330
+ cc.evex().vpmaxsw(ymmA, ymmB, m);
4331
+ cc.evex().vpmaxsw(zmmA, zmmB, m);
4332
+ cc.evex().vpmaxub(xmmA, xmmB, m);
4333
+ cc.evex().vpmaxub(ymmA, ymmB, m);
4334
+ cc.evex().vpmaxub(zmmA, zmmB, m);
4335
+ cc.evex().vpmaxud(xmmA, xmmB, m);
4336
+ cc.evex().vpmaxud(ymmA, ymmB, m);
4337
+ cc.evex().vpmaxud(zmmA, zmmB, m);
4338
+ cc.evex().vpmaxuq(xmmA, xmmB, m);
4339
+ cc.evex().vpmaxuq(ymmA, ymmB, m);
4340
+ cc.evex().vpmaxuq(zmmA, zmmB, m);
4341
+ cc.evex().vpmaxuw(xmmA, xmmB, m);
4342
+ cc.evex().vpmaxuw(ymmA, ymmB, m);
4343
+ cc.evex().vpmaxuw(zmmA, zmmB, m);
4344
+ cc.evex().vpminsb(xmmA, xmmB, m);
4345
+ cc.evex().vpminsb(ymmA, ymmB, m);
4346
+ cc.evex().vpminsb(zmmA, zmmB, m);
4347
+ cc.evex().vpminsd(xmmA, xmmB, m);
4348
+ cc.evex().vpminsd(ymmA, ymmB, m);
4349
+ cc.evex().vpminsd(zmmA, zmmB, m);
4350
+ cc.evex().vpminsq(xmmA, xmmB, m);
4351
+ cc.evex().vpminsq(ymmA, ymmB, m);
4352
+ cc.evex().vpminsq(zmmA, zmmB, m);
4353
+ cc.evex().vpminsw(xmmA, xmmB, m);
4354
+ cc.evex().vpminsw(ymmA, ymmB, m);
4355
+ cc.evex().vpminsw(zmmA, zmmB, m);
4356
+ cc.evex().vpminub(xmmA, xmmB, m);
4357
+ cc.evex().vpminub(ymmA, ymmB, m);
4358
+ cc.evex().vpminub(zmmA, zmmB, m);
4359
+ cc.evex().vpminud(xmmA, xmmB, m);
4360
+ cc.evex().vpminud(ymmA, ymmB, m);
4361
+ cc.evex().vpminud(zmmA, zmmB, m);
4362
+ cc.evex().vpminuq(xmmA, xmmB, m);
4363
+ cc.evex().vpminuq(ymmA, ymmB, m);
4364
+ cc.evex().vpminuq(zmmA, zmmB, m);
4365
+ cc.evex().vpminuw(xmmA, xmmB, m);
4366
+ cc.evex().vpminuw(ymmA, ymmB, m);
4367
+ cc.evex().vpminuw(zmmA, zmmB, m);
4368
+ cc.evex().vpmovdb(m, xmmB);
4369
+ cc.evex().vpmovdb(m, ymmB);
4370
+ cc.evex().vpmovdb(m, zmmB);
4371
+ cc.evex().vpmovdw(m, xmmB);
4372
+ cc.evex().vpmovdw(m, ymmB);
4373
+ cc.evex().vpmovdw(m, zmmB);
4374
+ cc.evex().vpmovqb(m, xmmB);
4375
+ cc.evex().vpmovqb(m, ymmB);
4376
+ cc.evex().vpmovqb(m, zmmB);
4377
+ cc.evex().vpmovqd(m, xmmB);
4378
+ cc.evex().vpmovqd(m, ymmB);
4379
+ cc.evex().vpmovqd(m, zmmB);
4380
+ cc.evex().vpmovqw(m, xmmB);
4381
+ cc.evex().vpmovqw(m, ymmB);
4382
+ cc.evex().vpmovqw(m, zmmB);
4383
+ cc.evex().vpmovsdb(m, xmmB);
4384
+ cc.evex().vpmovsdb(m, ymmB);
4385
+ cc.evex().vpmovsdb(m, zmmB);
4386
+ cc.evex().vpmovsdw(m, xmmB);
4387
+ cc.evex().vpmovsdw(m, ymmB);
4388
+ cc.evex().vpmovsdw(m, zmmB);
4389
+ cc.evex().vpmovsqb(m, xmmB);
4390
+ cc.evex().vpmovsqb(m, ymmB);
4391
+ cc.evex().vpmovsqb(m, zmmB);
4392
+ cc.evex().vpmovsqd(m, xmmB);
4393
+ cc.evex().vpmovsqd(m, ymmB);
4394
+ cc.evex().vpmovsqd(m, zmmB);
4395
+ cc.evex().vpmovsqw(m, xmmB);
4396
+ cc.evex().vpmovsqw(m, ymmB);
4397
+ cc.evex().vpmovsqw(m, zmmB);
4398
+ cc.evex().vpmovswb(m, xmmB);
4399
+ cc.evex().vpmovswb(m, ymmB);
4400
+ cc.evex().vpmovswb(m, zmmB);
4401
+ cc.evex().vpmovsxbd(xmmA, m);
4402
+ cc.evex().vpmovsxbd(ymmA, m);
4403
+ cc.evex().vpmovsxbd(zmmA, m);
4404
+ cc.evex().vpmovsxbq(xmmA, m);
4405
+ cc.evex().vpmovsxbq(ymmA, m);
4406
+ cc.evex().vpmovsxbq(zmmA, m);
4407
+ cc.evex().vpmovsxbw(xmmA, m);
4408
+ cc.evex().vpmovsxbw(ymmA, m);
4409
+ cc.evex().vpmovsxbw(zmmA, m);
4410
+ cc.evex().vpmovsxdq(xmmA, m);
4411
+ cc.evex().vpmovsxdq(ymmA, m);
4412
+ cc.evex().vpmovsxdq(zmmA, m);
4413
+ cc.evex().vpmovsxwd(xmmA, m);
4414
+ cc.evex().vpmovsxwd(ymmA, m);
4415
+ cc.evex().vpmovsxwd(zmmA, m);
4416
+ cc.evex().vpmovsxwq(xmmA, m);
4417
+ cc.evex().vpmovsxwq(ymmA, m);
4418
+ cc.evex().vpmovsxwq(zmmA, m);
4419
+ cc.evex().vpmovusdb(m, xmmB);
4420
+ cc.evex().vpmovusdb(m, ymmB);
4421
+ cc.evex().vpmovusdb(m, zmmB);
4422
+ cc.evex().vpmovusdw(m, xmmB);
4423
+ cc.evex().vpmovusdw(m, ymmB);
4424
+ cc.evex().vpmovusdw(m, zmmB);
4425
+ cc.evex().vpmovusqb(m, xmmB);
4426
+ cc.evex().vpmovusqb(m, ymmB);
4427
+ cc.evex().vpmovusqb(m, zmmB);
4428
+ cc.evex().vpmovusqd(m, xmmB);
4429
+ cc.evex().vpmovusqd(m, ymmB);
4430
+ cc.evex().vpmovusqd(m, zmmB);
4431
+ cc.evex().vpmovusqw(m, xmmB);
4432
+ cc.evex().vpmovusqw(m, ymmB);
4433
+ cc.evex().vpmovusqw(m, zmmB);
4434
+ cc.evex().vpmovuswb(m, xmmB);
4435
+ cc.evex().vpmovuswb(m, ymmB);
4436
+ cc.evex().vpmovuswb(m, zmmB);
4437
+ cc.evex().vpmovwb(m, xmmB);
4438
+ cc.evex().vpmovwb(m, ymmB);
4439
+ cc.evex().vpmovwb(m, zmmB);
4440
+ cc.evex().vpmovzxbd(xmmA, m);
4441
+ cc.evex().vpmovzxbd(ymmA, m);
4442
+ cc.evex().vpmovzxbd(zmmA, m);
4443
+ cc.evex().vpmovzxbq(xmmA, m);
4444
+ cc.evex().vpmovzxbq(ymmA, m);
4445
+ cc.evex().vpmovzxbq(zmmA, m);
4446
+ cc.evex().vpmovzxbw(xmmA, m);
4447
+ cc.evex().vpmovzxbw(ymmA, m);
4448
+ cc.evex().vpmovzxbw(zmmA, m);
4449
+ cc.evex().vpmovzxdq(xmmA, m);
4450
+ cc.evex().vpmovzxdq(ymmA, m);
4451
+ cc.evex().vpmovzxdq(zmmA, m);
4452
+ cc.evex().vpmovzxwd(xmmA, m);
4453
+ cc.evex().vpmovzxwd(ymmA, m);
4454
+ cc.evex().vpmovzxwd(zmmA, m);
4455
+ cc.evex().vpmovzxwq(xmmA, m);
4456
+ cc.evex().vpmovzxwq(ymmA, m);
4457
+ cc.evex().vpmovzxwq(zmmA, m);
4458
+ cc.evex().vpmuldq(xmmA, xmmB, m);
4459
+ cc.evex().vpmuldq(ymmA, ymmB, m);
4460
+ cc.evex().vpmuldq(zmmA, zmmB, m);
4461
+ cc.evex().vpmulhrsw(xmmA, xmmB, m);
4462
+ cc.evex().vpmulhrsw(ymmA, ymmB, m);
4463
+ cc.evex().vpmulhrsw(zmmA, zmmB, m);
4464
+ cc.evex().vpmulhuw(xmmA, xmmB, m);
4465
+ cc.evex().vpmulhuw(ymmA, ymmB, m);
4466
+ cc.evex().vpmulhuw(zmmA, zmmB, m);
4467
+ cc.evex().vpmulhw(xmmA, xmmB, m);
4468
+ cc.evex().vpmulhw(ymmA, ymmB, m);
4469
+ cc.evex().vpmulhw(zmmA, zmmB, m);
4470
+ cc.evex().vpmulld(xmmA, xmmB, m);
4471
+ cc.evex().vpmulld(ymmA, ymmB, m);
4472
+ cc.evex().vpmulld(zmmA, zmmB, m);
4473
+ cc.evex().vpmullq(xmmA, xmmB, m);
4474
+ cc.evex().vpmullq(ymmA, ymmB, m);
4475
+ cc.evex().vpmullq(zmmA, zmmB, m);
4476
+ cc.evex().vpmullw(xmmA, xmmB, m);
4477
+ cc.evex().vpmullw(ymmA, ymmB, m);
4478
+ cc.evex().vpmullw(zmmA, zmmB, m);
4479
+ cc.evex().vpmultishiftqb(xmmA, xmmB, m);
4480
+ cc.evex().vpmultishiftqb(ymmA, ymmB, m);
4481
+ cc.evex().vpmultishiftqb(zmmA, zmmB, m);
4482
+ cc.evex().vpmuludq(xmmA, xmmB, m);
4483
+ cc.evex().vpmuludq(ymmA, ymmB, m);
4484
+ cc.evex().vpmuludq(zmmA, zmmB, m);
4485
+ cc.evex().vpopcntd(zmmA, m);
4486
+ cc.evex().vpopcntq(zmmA, m);
4487
+ cc.evex().vpord(xmmA, xmmB, m);
4488
+ cc.evex().vpord(ymmA, ymmB, m);
4489
+ cc.evex().vpord(zmmA, zmmB, m);
4490
+ cc.evex().vporq(xmmA, xmmB, m);
4491
+ cc.evex().vporq(ymmA, ymmB, m);
4492
+ cc.evex().vporq(zmmA, zmmB, m);
4493
+ cc.evex().vprold(xmmA, m, 0);
4494
+ cc.evex().vprold(ymmA, m, 0);
4495
+ cc.evex().vprold(zmmA, m, 0);
4496
+ cc.evex().vprolq(xmmA, m, 0);
4497
+ cc.evex().vprolq(ymmA, m, 0);
4498
+ cc.evex().vprolq(zmmA, m, 0);
4499
+ cc.evex().vprolvd(xmmA, xmmB, m);
4500
+ cc.evex().vprolvd(ymmA, ymmB, m);
4501
+ cc.evex().vprolvd(zmmA, zmmB, m);
4502
+ cc.evex().vprolvq(xmmA, xmmB, m);
4503
+ cc.evex().vprolvq(ymmA, ymmB, m);
4504
+ cc.evex().vprolvq(zmmA, zmmB, m);
4505
+ cc.evex().vprord(xmmA, m, 0);
4506
+ cc.evex().vprord(ymmA, m, 0);
4507
+ cc.evex().vprord(zmmA, m, 0);
4508
+ cc.evex().vprorq(xmmA, m, 0);
4509
+ cc.evex().vprorq(ymmA, m, 0);
4510
+ cc.evex().vprorq(zmmA, m, 0);
4511
+ cc.evex().vprorvd(xmmA, xmmB, m);
4512
+ cc.evex().vprorvd(ymmA, ymmB, m);
4513
+ cc.evex().vprorvd(zmmA, zmmB, m);
4514
+ cc.evex().vprorvq(xmmA, xmmB, m);
4515
+ cc.evex().vprorvq(ymmA, ymmB, m);
4516
+ cc.evex().vprorvq(zmmA, zmmB, m);
4517
+ cc.evex().vpsadbw(xmmA, xmmB, m);
4518
+ cc.evex().vpsadbw(ymmA, ymmB, m);
4519
+ cc.evex().vpsadbw(zmmA, zmmB, m);
4520
+ cc.evex().k(kA).vpscatterdd(vx_ptr, xmmB);
4521
+ cc.evex().k(kA).vpscatterdd(vy_ptr, ymmB);
4522
+ cc.evex().k(kA).vpscatterdd(vz_ptr, zmmB);
4523
+ cc.evex().k(kA).vpscatterdq(vx_ptr, xmmB);
4524
+ cc.evex().k(kA).vpscatterdq(vx_ptr, ymmB);
4525
+ cc.evex().k(kA).vpscatterdq(vy_ptr, zmmB);
4526
+ cc.evex().k(kA).vpscatterqd(vx_ptr, xmmB);
4527
+ cc.evex().k(kA).vpscatterqd(vy_ptr, xmmB);
4528
+ cc.evex().k(kA).vpscatterqd(vz_ptr, ymmB);
4529
+ cc.evex().k(kA).vpscatterqq(vx_ptr, xmmB);
4530
+ cc.evex().k(kA).vpscatterqq(vy_ptr, ymmB);
4531
+ cc.evex().k(kA).vpscatterqq(vz_ptr, zmmB);
4532
+ cc.evex().vpshufb(xmmA, xmmB, m);
4533
+ cc.evex().vpshufb(ymmA, ymmB, m);
4534
+ cc.evex().vpshufb(zmmA, zmmB, m);
4535
+ cc.evex().vpshufd(xmmA, m, 0);
4536
+ cc.evex().vpshufd(ymmA, m, 0);
4537
+ cc.evex().vpshufd(zmmA, m, 0);
4538
+ cc.evex().vpshufhw(xmmA, m, 0);
4539
+ cc.evex().vpshufhw(ymmA, m, 0);
4540
+ cc.evex().vpshufhw(zmmA, m, 0);
4541
+ cc.evex().vpshuflw(xmmA, m, 0);
4542
+ cc.evex().vpshuflw(ymmA, m, 0);
4543
+ cc.evex().vpshuflw(zmmA, m, 0);
4544
+ cc.evex().vpslld(xmmA, xmmB, m);
4545
+ cc.evex().vpslld(xmmA, m, 0);
4546
+ cc.evex().vpslld(ymmA, ymmB, m);
4547
+ cc.evex().vpslld(ymmA, m, 0);
4548
+ cc.evex().vpslld(zmmA, zmmB, m);
4549
+ cc.evex().vpslld(zmmA, m, 0);
4550
+ cc.evex().vpslldq(xmmA, m, 0);
4551
+ cc.evex().vpslldq(ymmA, m, 0);
4552
+ cc.evex().vpslldq(zmmA, m, 0);
4553
+ cc.evex().vpsllq(xmmA, xmmB, m);
4554
+ cc.evex().vpsllq(xmmA, m, 0);
4555
+ cc.evex().vpsllq(ymmA, ymmB, m);
4556
+ cc.evex().vpsllq(ymmA, m, 0);
4557
+ cc.evex().vpsllq(zmmA, zmmB, m);
4558
+ cc.evex().vpsllq(zmmA, m, 0);
4559
+ cc.evex().vpsllvd(xmmA, xmmB, m);
4560
+ cc.evex().vpsllvd(ymmA, ymmB, m);
4561
+ cc.evex().vpsllvd(zmmA, zmmB, m);
4562
+ cc.evex().vpsllvq(xmmA, xmmB, m);
4563
+ cc.evex().vpsllvq(ymmA, ymmB, m);
4564
+ cc.evex().vpsllvq(zmmA, zmmB, m);
4565
+ cc.evex().vpsllvw(xmmA, xmmB, m);
4566
+ cc.evex().vpsllvw(ymmA, ymmB, m);
4567
+ cc.evex().vpsllvw(zmmA, zmmB, m);
4568
+ cc.evex().vpsllw(xmmA, xmmB, m);
4569
+ cc.evex().vpsllw(xmmA, m, 0);
4570
+ cc.evex().vpsllw(ymmA, ymmB, m);
4571
+ cc.evex().vpsllw(ymmA, m, 0);
4572
+ cc.evex().vpsllw(zmmA, zmmB, m);
4573
+ cc.evex().vpsllw(zmmA, m, 0);
4574
+ cc.evex().vpsrad(xmmA, xmmB, m);
4575
+ cc.evex().vpsrad(xmmA, m, 0);
4576
+ cc.evex().vpsrad(ymmA, ymmB, m);
4577
+ cc.evex().vpsrad(ymmA, m, 0);
4578
+ cc.evex().vpsrad(zmmA, zmmB, m);
4579
+ cc.evex().vpsrad(zmmA, m, 0);
4580
+ cc.evex().vpsraq(xmmA, xmmB, m);
4581
+ cc.evex().vpsraq(xmmA, m, 0);
4582
+ cc.evex().vpsraq(ymmA, ymmB, m);
4583
+ cc.evex().vpsraq(ymmA, m, 0);
4584
+ cc.evex().vpsraq(zmmA, zmmB, m);
4585
+ cc.evex().vpsraq(zmmA, m, 0);
4586
+ cc.evex().vpsravd(xmmA, xmmB, m);
4587
+ cc.evex().vpsravd(ymmA, ymmB, m);
4588
+ cc.evex().vpsravd(zmmA, zmmB, m);
4589
+ cc.evex().vpsravq(xmmA, xmmB, m);
4590
+ cc.evex().vpsravq(ymmA, ymmB, m);
4591
+ cc.evex().vpsravq(zmmA, zmmB, m);
4592
+ cc.evex().vpsravw(xmmA, xmmB, m);
4593
+ cc.evex().vpsravw(ymmA, ymmB, m);
4594
+ cc.evex().vpsravw(zmmA, zmmB, m);
4595
+ cc.evex().vpsraw(xmmA, xmmB, m);
4596
+ cc.evex().vpsraw(xmmA, m, 0);
4597
+ cc.evex().vpsraw(ymmA, ymmB, m);
4598
+ cc.evex().vpsraw(ymmA, m, 0);
4599
+ cc.evex().vpsraw(zmmA, zmmB, m);
4600
+ cc.evex().vpsraw(zmmA, m, 0);
4601
+ cc.evex().vpsrld(xmmA, xmmB, m);
4602
+ cc.evex().vpsrld(xmmA, m, 0);
4603
+ cc.evex().vpsrld(ymmA, ymmB, m);
4604
+ cc.evex().vpsrld(ymmA, m, 0);
4605
+ cc.evex().vpsrld(zmmA, zmmB, m);
4606
+ cc.evex().vpsrld(zmmA, m, 0);
4607
+ cc.evex().vpsrldq(xmmA, m, 0);
4608
+ cc.evex().vpsrldq(ymmA, m, 0);
4609
+ cc.evex().vpsrldq(zmmA, m, 0);
4610
+ cc.evex().vpsrlq(xmmA, xmmB, m);
4611
+ cc.evex().vpsrlq(xmmA, m, 0);
4612
+ cc.evex().vpsrlq(ymmA, ymmB, m);
4613
+ cc.evex().vpsrlq(ymmA, m, 0);
4614
+ cc.evex().vpsrlq(zmmA, zmmB, m);
4615
+ cc.evex().vpsrlq(zmmA, m, 0);
4616
+ cc.evex().vpsrlvd(xmmA, xmmB, m);
4617
+ cc.evex().vpsrlvd(ymmA, ymmB, m);
4618
+ cc.evex().vpsrlvd(zmmA, zmmB, m);
4619
+ cc.evex().vpsrlvq(xmmA, xmmB, m);
4620
+ cc.evex().vpsrlvq(ymmA, ymmB, m);
4621
+ cc.evex().vpsrlvq(zmmA, zmmB, m);
4622
+ cc.evex().vpsrlvw(xmmA, xmmB, m);
4623
+ cc.evex().vpsrlvw(ymmA, ymmB, m);
4624
+ cc.evex().vpsrlvw(zmmA, zmmB, m);
4625
+ cc.evex().vpsrlw(xmmA, xmmB, m);
4626
+ cc.evex().vpsrlw(xmmA, m, 0);
4627
+ cc.evex().vpsrlw(ymmA, ymmB, m);
4628
+ cc.evex().vpsrlw(ymmA, m, 0);
4629
+ cc.evex().vpsrlw(zmmA, zmmB, m);
4630
+ cc.evex().vpsrlw(zmmA, m, 0);
4631
+ cc.evex().vpsubb(xmmA, xmmB, m);
4632
+ cc.evex().vpsubb(ymmA, ymmB, m);
4633
+ cc.evex().vpsubb(zmmA, zmmB, m);
4634
+ cc.evex().vpsubd(xmmA, xmmB, m);
4635
+ cc.evex().vpsubd(ymmA, ymmB, m);
4636
+ cc.evex().vpsubd(zmmA, zmmB, m);
4637
+ cc.evex().vpsubq(xmmA, xmmB, m);
4638
+ cc.evex().vpsubq(ymmA, ymmB, m);
4639
+ cc.evex().vpsubq(zmmA, zmmB, m);
4640
+ cc.evex().vpsubsb(xmmA, xmmB, m);
4641
+ cc.evex().vpsubsb(ymmA, ymmB, m);
4642
+ cc.evex().vpsubsb(zmmA, zmmB, m);
4643
+ cc.evex().vpsubsw(xmmA, xmmB, m);
4644
+ cc.evex().vpsubsw(ymmA, ymmB, m);
4645
+ cc.evex().vpsubsw(zmmA, zmmB, m);
4646
+ cc.evex().vpsubusb(xmmA, xmmB, m);
4647
+ cc.evex().vpsubusb(ymmA, ymmB, m);
4648
+ cc.evex().vpsubusb(zmmA, zmmB, m);
4649
+ cc.evex().vpsubusw(xmmA, xmmB, m);
4650
+ cc.evex().vpsubusw(ymmA, ymmB, m);
4651
+ cc.evex().vpsubusw(zmmA, zmmB, m);
4652
+ cc.evex().vpsubw(xmmA, xmmB, m);
4653
+ cc.evex().vpsubw(ymmA, ymmB, m);
4654
+ cc.evex().vpsubw(zmmA, zmmB, m);
4655
+ cc.evex().vpternlogd(xmmA, xmmB, m, 0);
4656
+ cc.evex().vpternlogd(ymmA, ymmB, m, 0);
4657
+ cc.evex().vpternlogd(zmmA, zmmB, m, 0);
4658
+ cc.evex().vpternlogq(xmmA, xmmB, m, 0);
4659
+ cc.evex().vpternlogq(ymmA, ymmB, m, 0);
4660
+ cc.evex().vpternlogq(zmmA, zmmB, m, 0);
4661
+ cc.evex().vptestmb(kA, xmmB, m);
4662
+ cc.evex().vptestmb(kA, ymmB, m);
4663
+ cc.evex().vptestmb(kA, zmmB, m);
4664
+ cc.evex().vptestmd(kA, xmmB, m);
4665
+ cc.evex().vptestmd(kA, ymmB, m);
4666
+ cc.evex().vptestmd(kA, zmmB, m);
4667
+ cc.evex().vptestmq(kA, xmmB, m);
4668
+ cc.evex().vptestmq(kA, ymmB, m);
4669
+ cc.evex().vptestmq(kA, zmmB, m);
4670
+ cc.evex().vptestmw(kA, xmmB, m);
4671
+ cc.evex().vptestmw(kA, ymmB, m);
4672
+ cc.evex().vptestmw(kA, zmmB, m);
4673
+ cc.evex().vptestnmb(kA, xmmB, m);
4674
+ cc.evex().vptestnmb(kA, ymmB, m);
4675
+ cc.evex().vptestnmb(kA, zmmB, m);
4676
+ cc.evex().vptestnmd(kA, xmmB, m);
4677
+ cc.evex().vptestnmd(kA, ymmB, m);
4678
+ cc.evex().vptestnmd(kA, zmmB, m);
4679
+ cc.evex().vptestnmq(kA, xmmB, m);
4680
+ cc.evex().vptestnmq(kA, ymmB, m);
4681
+ cc.evex().vptestnmq(kA, zmmB, m);
4682
+ cc.evex().vptestnmw(kA, xmmB, m);
4683
+ cc.evex().vptestnmw(kA, ymmB, m);
4684
+ cc.evex().vptestnmw(kA, zmmB, m);
4685
+ cc.evex().vpunpckhbw(xmmA, xmmB, m);
4686
+ cc.evex().vpunpckhbw(ymmA, ymmB, m);
4687
+ cc.evex().vpunpckhbw(zmmA, zmmB, m);
4688
+ cc.evex().vpunpckhdq(xmmA, xmmB, m);
4689
+ cc.evex().vpunpckhdq(ymmA, ymmB, m);
4690
+ cc.evex().vpunpckhdq(zmmA, zmmB, m);
4691
+ cc.evex().vpunpckhqdq(xmmA, xmmB, m);
4692
+ cc.evex().vpunpckhqdq(ymmA, ymmB, m);
4693
+ cc.evex().vpunpckhqdq(zmmA, zmmB, m);
4694
+ cc.evex().vpunpckhwd(xmmA, xmmB, m);
4695
+ cc.evex().vpunpckhwd(ymmA, ymmB, m);
4696
+ cc.evex().vpunpckhwd(zmmA, zmmB, m);
4697
+ cc.evex().vpunpcklbw(xmmA, xmmB, m);
4698
+ cc.evex().vpunpcklbw(ymmA, ymmB, m);
4699
+ cc.evex().vpunpcklbw(zmmA, zmmB, m);
4700
+ cc.evex().vpunpckldq(xmmA, xmmB, m);
4701
+ cc.evex().vpunpckldq(ymmA, ymmB, m);
4702
+ cc.evex().vpunpckldq(zmmA, zmmB, m);
4703
+ cc.evex().vpunpcklqdq(xmmA, xmmB, m);
4704
+ cc.evex().vpunpcklqdq(ymmA, ymmB, m);
4705
+ cc.evex().vpunpcklqdq(zmmA, zmmB, m);
4706
+ cc.evex().vpunpcklwd(xmmA, xmmB, m);
4707
+ cc.evex().vpunpcklwd(ymmA, ymmB, m);
4708
+ cc.evex().vpunpcklwd(zmmA, zmmB, m);
4709
+ cc.evex().vpxord(xmmA, xmmB, m);
4710
+ cc.evex().vpxord(ymmA, ymmB, m);
4711
+ cc.evex().vpxord(zmmA, zmmB, m);
4712
+ cc.evex().vpxorq(xmmA, xmmB, m);
4713
+ cc.evex().vpxorq(ymmA, ymmB, m);
4714
+ cc.evex().vpxorq(zmmA, zmmB, m);
4715
+ cc.evex().vrangepd(xmmA, xmmB, m, 0);
4716
+ cc.evex().vrangepd(ymmA, ymmB, m, 0);
4717
+ cc.evex().vrangepd(zmmA, zmmB, m, 0);
4718
+ cc.evex().vrangeps(xmmA, xmmB, m, 0);
4719
+ cc.evex().vrangeps(ymmA, ymmB, m, 0);
4720
+ cc.evex().vrangeps(zmmA, zmmB, m, 0);
4721
+ cc.evex().vrangesd(xmmA, xmmB, m, 0);
4722
+ cc.evex().vrangess(xmmA, xmmB, m, 0);
4723
+ cc.evex().vrcp14pd(xmmA, m);
4724
+ cc.evex().vrcp14pd(ymmA, m);
4725
+ cc.evex().vrcp14pd(zmmA, m);
4726
+ cc.evex().vrcp14ps(xmmA, m);
4727
+ cc.evex().vrcp14ps(ymmA, m);
4728
+ cc.evex().vrcp14ps(zmmA, m);
4729
+ cc.evex().vrcp14sd(xmmA, xmmB, m);
4730
+ cc.evex().vrcp14ss(xmmA, xmmB, m);
4731
+ cc.evex().vrcp28pd(zmmA, m);
4732
+ cc.evex().vrcp28ps(zmmA, m);
4733
+ cc.evex().vrcp28sd(xmmA, xmmB, m);
4734
+ cc.evex().vrcp28ss(xmmA, xmmB, m);
4735
+ cc.evex().vreducepd(xmmA, m, 0);
4736
+ cc.evex().vreducepd(ymmA, m, 0);
4737
+ cc.evex().vreducepd(zmmA, m, 0);
4738
+ cc.evex().vreduceps(xmmA, m, 0);
4739
+ cc.evex().vreduceps(ymmA, m, 0);
4740
+ cc.evex().vreduceps(zmmA, m, 0);
4741
+ cc.evex().vreducesd(xmmA, xmmB, m, 0);
4742
+ cc.evex().vreducess(xmmA, xmmB, m, 0);
4743
+ cc.evex().vrndscalepd(xmmA, m, 0);
4744
+ cc.evex().vrndscalepd(ymmA, m, 0);
4745
+ cc.evex().vrndscalepd(zmmA, m, 0);
4746
+ cc.evex().vrndscaleps(xmmA, m, 0);
4747
+ cc.evex().vrndscaleps(ymmA, m, 0);
4748
+ cc.evex().vrndscaleps(zmmA, m, 0);
4749
+ cc.evex().vrndscalesd(xmmA, xmmB, m, 0);
4750
+ cc.evex().vrndscaless(xmmA, xmmB, m, 0);
4751
+ cc.evex().vrsqrt14pd(xmmA, m);
4752
+ cc.evex().vrsqrt14pd(ymmA, m);
4753
+ cc.evex().vrsqrt14pd(zmmA, m);
4754
+ cc.evex().vrsqrt14ps(xmmA, m);
4755
+ cc.evex().vrsqrt14ps(ymmA, m);
4756
+ cc.evex().vrsqrt14ps(zmmA, m);
4757
+ cc.evex().vrsqrt14sd(xmmA, xmmB, m);
4758
+ cc.evex().vrsqrt14ss(xmmA, xmmB, m);
4759
+ cc.evex().vrsqrt28pd(zmmA, m);
4760
+ cc.evex().vrsqrt28ps(zmmA, m);
4761
+ cc.evex().vrsqrt28sd(xmmA, xmmB, m);
4762
+ cc.evex().vrsqrt28ss(xmmA, xmmB, m);
4763
+ cc.evex().vscalefpd(xmmA, xmmB, m);
4764
+ cc.evex().vscalefpd(ymmA, ymmB, m);
4765
+ cc.evex().vscalefpd(zmmA, zmmB, m);
4766
+ cc.evex().vscalefps(xmmA, xmmB, m);
4767
+ cc.evex().vscalefps(ymmA, ymmB, m);
4768
+ cc.evex().vscalefps(zmmA, zmmB, m);
4769
+ cc.evex().vscalefsd(xmmA, xmmB, m);
4770
+ cc.evex().vscalefss(xmmA, xmmB, m);
4771
+ cc.evex().k(kA).vscatterdpd(vx_ptr, xmmB);
4772
+ cc.evex().k(kA).vscatterdpd(vx_ptr, ymmB);
4773
+ cc.evex().k(kA).vscatterdpd(vy_ptr, zmmB);
4774
+ cc.evex().k(kA).vscatterdps(vx_ptr, xmmB);
4775
+ cc.evex().k(kA).vscatterdps(vy_ptr, ymmB);
4776
+ cc.evex().k(kA).vscatterdps(vz_ptr, zmmB);
4777
+ cc.evex().k(kA).vscatterpf0dpd(vy_ptr);
4778
+ cc.evex().k(kA).vscatterpf0dps(vz_ptr);
4779
+ cc.evex().k(kA).vscatterpf0qpd(vz_ptr);
4780
+ cc.evex().k(kA).vscatterpf0qps(vz_ptr);
4781
+ cc.evex().k(kA).vscatterpf1dpd(vy_ptr);
4782
+ cc.evex().k(kA).vscatterpf1dps(vz_ptr);
4783
+ cc.evex().k(kA).vscatterpf1qpd(vz_ptr);
4784
+ cc.evex().k(kA).vscatterpf1qps(vz_ptr);
4785
+ cc.evex().k(kA).vscatterqpd(vx_ptr, xmmB);
4786
+ cc.evex().k(kA).vscatterqpd(vy_ptr, ymmB);
4787
+ cc.evex().k(kA).vscatterqpd(vz_ptr, zmmB);
4788
+ cc.evex().k(kA).vscatterqps(vx_ptr, xmmB);
4789
+ cc.evex().k(kA).vscatterqps(vy_ptr, xmmB);
4790
+ cc.evex().k(kA).vscatterqps(vz_ptr, ymmB);
4791
+ cc.evex().vshuff32x4(ymmA, ymmB, m, 0);
4792
+ cc.evex().vshuff32x4(zmmA, zmmB, m, 0);
4793
+ cc.evex().vshuff64x2(ymmA, ymmB, m, 0);
4794
+ cc.evex().vshuff64x2(zmmA, zmmB, m, 0);
4795
+ cc.evex().vshufi32x4(ymmA, ymmB, m, 0);
4796
+ cc.evex().vshufi32x4(zmmA, zmmB, m, 0);
4797
+ cc.evex().vshufi64x2(ymmA, ymmB, m, 0);
4798
+ cc.evex().vshufi64x2(zmmA, zmmB, m, 0);
4799
+ cc.evex().vshufpd(xmmA, xmmB, m, 0);
4800
+ cc.evex().vshufpd(ymmA, ymmB, m, 0);
4801
+ cc.evex().vshufpd(zmmA, zmmB, m, 0);
4802
+ cc.evex().vshufps(xmmA, xmmB, m, 0);
4803
+ cc.evex().vshufps(ymmA, ymmB, m, 0);
4804
+ cc.evex().vshufps(zmmA, zmmB, m, 0);
4805
+ cc.evex().vsqrtpd(xmmA, m);
4806
+ cc.evex().vsqrtpd(ymmA, m);
4807
+ cc.evex().vsqrtpd(zmmA, m);
4808
+ cc.evex().vsqrtps(xmmA, m);
4809
+ cc.evex().vsqrtps(ymmA, m);
4810
+ cc.evex().vsqrtps(zmmA, m);
4811
+ cc.evex().vsqrtsd(xmmA, xmmB, m);
4812
+ cc.evex().vsqrtss(xmmA, xmmB, m);
4813
+ cc.evex().vsubpd(xmmA, xmmB, m);
4814
+ cc.evex().vsubpd(ymmA, ymmB, m);
4815
+ cc.evex().vsubpd(zmmA, zmmB, m);
4816
+ cc.evex().vsubps(xmmA, xmmB, m);
4817
+ cc.evex().vsubps(ymmA, ymmB, m);
4818
+ cc.evex().vsubps(zmmA, zmmB, m);
4819
+ cc.evex().vsubsd(xmmA, xmmB, m);
4820
+ cc.evex().vsubss(xmmA, xmmB, m);
4821
+ cc.evex().vucomisd(xmmA, m);
4822
+ cc.evex().vucomiss(xmmA, m);
4823
+ cc.evex().vunpckhpd(xmmA, xmmB, m);
4824
+ cc.evex().vunpckhpd(ymmA, ymmB, m);
4825
+ cc.evex().vunpckhpd(zmmA, zmmB, m);
4826
+ cc.evex().vunpckhps(xmmA, xmmB, m);
4827
+ cc.evex().vunpckhps(ymmA, ymmB, m);
4828
+ cc.evex().vunpckhps(zmmA, zmmB, m);
4829
+ cc.evex().vunpcklpd(xmmA, xmmB, m);
4830
+ cc.evex().vunpcklpd(ymmA, ymmB, m);
4831
+ cc.evex().vunpcklpd(zmmA, zmmB, m);
4832
+ cc.evex().vunpcklps(xmmA, xmmB, m);
4833
+ cc.evex().vunpcklps(ymmA, ymmB, m);
4834
+ cc.evex().vunpcklps(zmmA, zmmB, m);
4835
+ cc.evex().vxorpd(xmmA, xmmB, m);
4836
+ cc.evex().vxorpd(ymmA, ymmB, m);
4837
+ cc.evex().vxorpd(zmmA, zmmB, m);
4838
+ cc.evex().vxorps(xmmA, xmmB, m);
4839
+ cc.evex().vxorps(ymmA, ymmB, m);
4840
+ cc.evex().vxorps(zmmA, zmmB, m);
4841
+ }
4842
+ }
4843
+
4844
+ static void generateAvx512Sequence(BaseEmitter& emitter, InstForm form, bool emitPrologEpilog) {
4845
+ using namespace asmjit::x86;
4846
+
4847
+ if (emitter.isAssembler()) {
4848
+ Assembler& cc = *emitter.as<Assembler>();
4849
+
4850
+ if (emitPrologEpilog) {
4851
+ FuncDetail func;
4852
+ func.init(FuncSignatureT<void, void*, const void*, size_t>(CallConvId::kHost), cc.environment());
4853
+
4854
+ FuncFrame frame;
4855
+ frame.init(func);
4856
+ frame.addDirtyRegs(eax, k1, k2, k3, zmm0, zmm1, zmm2, zmm3);
4857
+ frame.finalize();
4858
+
4859
+ cc.emitProlog(frame);
4860
+ generateAvx512SequenceInternal(cc, form, eax, k1, k2, k3, zmm0, zmm1, zmm2, zmm3);
4861
+ cc.emitEpilog(frame);
4862
+ }
4863
+ else {
4864
+ generateAvx512SequenceInternal(cc, form, eax, k1, k2, k3, zmm0, zmm1, zmm2, zmm3);
4865
+ }
4866
+ }
4867
+ #ifndef ASMJIT_NO_BUILDER
4868
+ else if (emitter.isBuilder()) {
4869
+ Builder& cc = *emitter.as<Builder>();
4870
+
4871
+ if (emitPrologEpilog) {
4872
+ FuncDetail func;
4873
+ func.init(FuncSignatureT<void, void*, const void*, size_t>(CallConvId::kHost), cc.environment());
4874
+
4875
+ FuncFrame frame;
4876
+ frame.init(func);
4877
+ frame.addDirtyRegs(eax, k1, k2, k3, zmm0, zmm1, zmm2, zmm3);
4878
+ frame.finalize();
4879
+
4880
+ cc.emitProlog(frame);
4881
+ generateAvx512SequenceInternal(cc, form, eax, k1, k2, k3, zmm0, zmm1, zmm2, zmm3);
4882
+ cc.emitEpilog(frame);
4883
+ }
4884
+ else {
4885
+ generateAvx512SequenceInternal(cc, form, eax, k1, k2, k3, zmm0, zmm1, zmm2, zmm3);
4886
+ }
4887
+ }
4888
+ #endif
4889
+ #ifndef ASMJIT_NO_COMPILER
4890
+ else if (emitter.isCompiler()) {
4891
+ Compiler& cc = *emitter.as<Compiler>();
4892
+
4893
+ Gp gp = cc.newGpz("gp");
4894
+ Zmm vecA = cc.newZmm("vecA");
4895
+ Zmm vecB = cc.newZmm("vecB");
4896
+ Zmm vecC = cc.newZmm("vecC");
4897
+ Zmm vecD = cc.newZmm("vecD");
4898
+
4899
+ KReg kA = cc.newKq("kA");
4900
+ KReg kB = cc.newKq("kB");
4901
+ KReg kC = cc.newKq("kC");
4902
+
4903
+ cc.addFunc(FuncSignatureT<void>(CallConvId::kHost));
4904
+ generateAvx512SequenceInternal(cc, form, gp, kA, kB, kC, vecA, vecB, vecC, vecD);
4905
+ cc.endFunc();
4906
+ }
4907
+ #endif
4908
+ }
4909
+
4910
+ template<typename EmitterFn>
4911
+ static void benchmarkX86Function(Arch arch, uint32_t numIterations, const char* description, const EmitterFn& emitterFn) noexcept {
4912
+ CodeHolder code;
4913
+ printf("%s:\n", description);
4914
+
4915
+ bench<x86::Assembler>(code, arch, numIterations, "[raw]", [&](x86::Assembler& cc) {
4916
+ emitterFn(cc, false);
4917
+ });
4918
+
4919
+ bench<x86::Assembler>(code, arch, numIterations, "[validated]", [&](x86::Assembler& cc) {
4920
+ cc.addDiagnosticOptions(DiagnosticOptions::kValidateAssembler);
4921
+ emitterFn(cc, false);
4922
+ });
4923
+
4924
+ bench<x86::Assembler>(code, arch, numIterations, "[prolog/epilog]", [&](x86::Assembler& cc) {
4925
+ cc.addDiagnosticOptions(DiagnosticOptions::kValidateAssembler);
4926
+ emitterFn(cc, true);
4927
+ });
4928
+
4929
+ #ifndef ASMJIT_NO_BUILDER
4930
+ bench<x86::Builder>(code, arch, numIterations, "[no-asm]", [&](x86::Builder& cc) {
4931
+ emitterFn(cc, false);
4932
+ });
4933
+
4934
+ bench<x86::Builder>(code, arch, numIterations, "[finalized]", [&](x86::Builder& cc) {
4935
+ emitterFn(cc, false);
4936
+ cc.finalize();
4937
+ });
4938
+
4939
+ bench<x86::Builder>(code, arch, numIterations, "[prolog/epilog]", [&](x86::Builder& cc) {
4940
+ emitterFn(cc, true);
4941
+ cc.finalize();
4942
+ });
4943
+ #endif
4944
+
4945
+ #ifndef ASMJIT_NO_COMPILER
4946
+ bench<x86::Compiler>(code, arch, numIterations, "[no-asm]", [&](x86::Compiler& cc) {
4947
+ emitterFn(cc, true);
4948
+ });
4949
+
4950
+ bench<x86::Compiler>(code, arch, numIterations, "[finalized]", [&](x86::Compiler& cc) {
4951
+ emitterFn(cc, true);
4952
+ cc.finalize();
4953
+ });
4954
+ #endif
4955
+
4956
+ printf("\n");
4957
+ }
4958
+
4959
+ void benchmarkX86Emitters(uint32_t numIterations, bool testX86, bool testX64) {
4960
+ uint32_t i = 0;
4961
+ uint32_t n = 0;
4962
+
4963
+ Arch archs[2] {};
4964
+
4965
+ if (testX86) archs[n++] = Arch::kX86;
4966
+ if (testX64) archs[n++] = Arch::kX64;
4967
+
4968
+ for (i = 0; i < n; i++) {
4969
+ static const char description[] = "GpSequence<Reg> (Sequence of GP instructions - reg-only)";
4970
+ benchmarkX86Function(archs[i], numIterations, description, [](BaseEmitter& emitter, bool emitPrologEpilog) {
4971
+ generateGpSequence(emitter, InstForm::kReg, emitPrologEpilog);
4972
+ });
4973
+ }
4974
+
4975
+ for (i = 0; i < n; i++) {
4976
+ static const char description[] = "GpSequence<Mem> (Sequence of GP instructions - reg/mem)";
4977
+ benchmarkX86Function(archs[i], numIterations, description, [](BaseEmitter& emitter, bool emitPrologEpilog) {
4978
+ generateGpSequence(emitter, InstForm::kMem, emitPrologEpilog);
4979
+ });
4980
+ }
4981
+
4982
+ for (i = 0; i < n; i++) {
4983
+ static const char description[] = "SseSequence<Reg> (sequence of SSE+ instructions - reg-only)";
4984
+ benchmarkX86Function(archs[i], numIterations, description, [](BaseEmitter& emitter, bool emitPrologEpilog) {
4985
+ generateSseSequence(emitter, InstForm::kReg, emitPrologEpilog);
4986
+ });
4987
+ }
4988
+
4989
+ for (i = 0; i < n; i++) {
4990
+ static const char description[] = "SseSequence<Mem> (sequence of SSE+ instructions - reg/mem)";
4991
+ benchmarkX86Function(archs[i], numIterations, description, [](BaseEmitter& emitter, bool emitPrologEpilog) {
4992
+ generateSseSequence(emitter, InstForm::kMem, emitPrologEpilog);
4993
+ });
4994
+ }
4995
+
4996
+ for (i = 0; i < n; i++) {
4997
+ static const char description[] = "AvxSequence<Reg> (sequence of AVX+ instructions - reg-only)";
4998
+ benchmarkX86Function(archs[i], numIterations, description, [](BaseEmitter& emitter, bool emitPrologEpilog) {
4999
+ generateAvxSequence(emitter, InstForm::kReg, emitPrologEpilog);
5000
+ });
5001
+ }
5002
+
5003
+ for (i = 0; i < n; i++) {
5004
+ static const char description[] = "AvxSequence<Mem> (sequence of AVX+ instructions - reg/mem)";
5005
+ benchmarkX86Function(archs[i], numIterations, description, [](BaseEmitter& emitter, bool emitPrologEpilog) {
5006
+ generateAvxSequence(emitter, InstForm::kMem, emitPrologEpilog);
5007
+ });
5008
+ }
5009
+
5010
+ for (i = 0; i < n; i++) {
5011
+ static const char description[] = "Avx512Sequence<Reg> (sequence of AVX512+ instructions - reg-only)";
5012
+ benchmarkX86Function(archs[i], numIterations, description, [](BaseEmitter& emitter, bool emitPrologEpilog) {
5013
+ generateAvx512Sequence(emitter, InstForm::kReg, emitPrologEpilog);
5014
+ });
5015
+ }
5016
+
5017
+ for (i = 0; i < n; i++) {
5018
+ static const char description[] = "Avx512Sequence<Mem> (sequence of AVX512+ instructions - reg/mem)";
5019
+ benchmarkX86Function(archs[i], numIterations, description, [](BaseEmitter& emitter, bool emitPrologEpilog) {
5020
+ generateAvx512Sequence(emitter, InstForm::kMem, emitPrologEpilog);
5021
+ });
5022
+ }
5023
+
5024
+ for (i = 0; i < n; i++) {
5025
+ static const char description[] = "SseAlphaBlend (alpha-blend function with labels and jumps)";
5026
+ benchmarkX86Function(archs[i], numIterations, description, [](BaseEmitter& emitter, bool emitPrologEpilog) {
5027
+ asmtest::generateSseAlphaBlend(emitter, emitPrologEpilog);
5028
+ });
5029
+ }
5030
+ }
5031
+
5032
+ #endif // !ASMJIT_NO_X86