sleeping_kangaroo12 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (284) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +127 -0
  3. data/ext/Rakefile +73 -0
  4. data/ext/binding/sleeping_kangaroo12.c +39 -0
  5. data/ext/config/xkcp.build +17 -0
  6. data/ext/xkcp/LICENSE +1 -0
  7. data/ext/xkcp/Makefile +15 -0
  8. data/ext/xkcp/Makefile.build +200 -0
  9. data/ext/xkcp/README.markdown +296 -0
  10. data/ext/xkcp/lib/HighLevel.build +143 -0
  11. data/ext/xkcp/lib/LowLevel.build +757 -0
  12. data/ext/xkcp/lib/common/align.h +33 -0
  13. data/ext/xkcp/lib/common/brg_endian.h +143 -0
  14. data/ext/xkcp/lib/high/KangarooTwelve/KangarooTwelve.c +301 -0
  15. data/ext/xkcp/lib/high/KangarooTwelve/KangarooTwelve.h +97 -0
  16. data/ext/xkcp/lib/high/Keccak/FIPS202/KeccakHash.c +81 -0
  17. data/ext/xkcp/lib/high/Keccak/FIPS202/KeccakHash.h +125 -0
  18. data/ext/xkcp/lib/high/Keccak/FIPS202/SimpleFIPS202.c +48 -0
  19. data/ext/xkcp/lib/high/Keccak/FIPS202/SimpleFIPS202.h +79 -0
  20. data/ext/xkcp/lib/high/Keccak/KeccakDuplex.c +81 -0
  21. data/ext/xkcp/lib/high/Keccak/KeccakDuplex.h +73 -0
  22. data/ext/xkcp/lib/high/Keccak/KeccakDuplex.inc +195 -0
  23. data/ext/xkcp/lib/high/Keccak/KeccakSponge.c +111 -0
  24. data/ext/xkcp/lib/high/Keccak/KeccakSponge.h +76 -0
  25. data/ext/xkcp/lib/high/Keccak/KeccakSponge.inc +314 -0
  26. data/ext/xkcp/lib/high/Keccak/PRG/KeccakPRG.c +61 -0
  27. data/ext/xkcp/lib/high/Keccak/PRG/KeccakPRG.h +67 -0
  28. data/ext/xkcp/lib/high/Keccak/PRG/KeccakPRG.inc +128 -0
  29. data/ext/xkcp/lib/high/Keccak/SP800-185/SP800-185.c +93 -0
  30. data/ext/xkcp/lib/high/Keccak/SP800-185/SP800-185.h +599 -0
  31. data/ext/xkcp/lib/high/Keccak/SP800-185/SP800-185.inc +573 -0
  32. data/ext/xkcp/lib/high/Ketje/Ketjev2.c +87 -0
  33. data/ext/xkcp/lib/high/Ketje/Ketjev2.h +88 -0
  34. data/ext/xkcp/lib/high/Ketje/Ketjev2.inc +274 -0
  35. data/ext/xkcp/lib/high/Keyak/Keyakv2.c +132 -0
  36. data/ext/xkcp/lib/high/Keyak/Keyakv2.h +217 -0
  37. data/ext/xkcp/lib/high/Keyak/Keyakv2.inc +81 -0
  38. data/ext/xkcp/lib/high/Keyak/Motorist.inc +953 -0
  39. data/ext/xkcp/lib/high/Kravatte/Kravatte.c +533 -0
  40. data/ext/xkcp/lib/high/Kravatte/Kravatte.h +115 -0
  41. data/ext/xkcp/lib/high/Kravatte/KravatteModes.c +557 -0
  42. data/ext/xkcp/lib/high/Kravatte/KravatteModes.h +247 -0
  43. data/ext/xkcp/lib/high/Xoodyak/Cyclist.h +66 -0
  44. data/ext/xkcp/lib/high/Xoodyak/Cyclist.inc +336 -0
  45. data/ext/xkcp/lib/high/Xoodyak/Xoodyak-parameters.h +26 -0
  46. data/ext/xkcp/lib/high/Xoodyak/Xoodyak.c +55 -0
  47. data/ext/xkcp/lib/high/Xoodyak/Xoodyak.h +35 -0
  48. data/ext/xkcp/lib/high/Xoofff/Xoofff.c +634 -0
  49. data/ext/xkcp/lib/high/Xoofff/Xoofff.h +147 -0
  50. data/ext/xkcp/lib/high/Xoofff/XoofffModes.c +483 -0
  51. data/ext/xkcp/lib/high/Xoofff/XoofffModes.h +241 -0
  52. data/ext/xkcp/lib/high/common/Phases.h +25 -0
  53. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-SnP.h +41 -0
  54. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-inplace-32bi-armv6m-le-armcc.s +1666 -0
  55. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-inplace-32bi-armv6m-le-gcc.s +1655 -0
  56. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-inplace-32bi-armv7a-le-armcc.s +1268 -0
  57. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-inplace-32bi-armv7a-le-gcc.s +1264 -0
  58. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-inplace-32bi-armv7m-le-armcc.s +1178 -0
  59. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-inplace-32bi-armv7m-le-gcc.s +1175 -0
  60. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-u1-32bi-armv6m-le-armcc.s +1338 -0
  61. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-u1-32bi-armv6m-le-gcc.s +1336 -0
  62. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-u2-32bi-armv6m-le-armcc.s +1343 -0
  63. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-u2-32bi-armv6m-le-gcc.s +1339 -0
  64. data/ext/xkcp/lib/low/KeccakP-1600/ARMv7A-NEON/KeccakP-1600-SnP.h +42 -0
  65. data/ext/xkcp/lib/low/KeccakP-1600/ARMv7A-NEON/KeccakP-1600-armv7a-le-neon-armcc.s +823 -0
  66. data/ext/xkcp/lib/low/KeccakP-1600/ARMv7A-NEON/KeccakP-1600-armv7a-le-neon-gcc.s +831 -0
  67. data/ext/xkcp/lib/low/KeccakP-1600/ARMv8A/KeccakP-1600-SnP.h +31 -0
  68. data/ext/xkcp/lib/low/KeccakP-1600/ARMv8A/KeccakP-1600-armv8a-neon.s +540 -0
  69. data/ext/xkcp/lib/low/KeccakP-1600/AVR8/KeccakP-1600-SnP.h +42 -0
  70. data/ext/xkcp/lib/low/KeccakP-1600/AVR8/KeccakP-1600-avr8-compact.s +733 -0
  71. data/ext/xkcp/lib/low/KeccakP-1600/AVR8/KeccakP-1600-avr8-fast.s +1121 -0
  72. data/ext/xkcp/lib/low/KeccakP-1600/AVX2/KeccakP-1600-AVX2.s +1100 -0
  73. data/ext/xkcp/lib/low/KeccakP-1600/AVX2/KeccakP-1600-SnP.h +52 -0
  74. data/ext/xkcp/lib/low/KeccakP-1600/AVX512/C/KeccakP-1600-AVX512.c +623 -0
  75. data/ext/xkcp/lib/low/KeccakP-1600/AVX512/C/KeccakP-1600-SnP.h +47 -0
  76. data/ext/xkcp/lib/low/KeccakP-1600/AVX512/C/u12/KeccakP-1600-AVX512-config.h +6 -0
  77. data/ext/xkcp/lib/low/KeccakP-1600/AVX512/C/u6/KeccakP-1600-AVX512-config.h +6 -0
  78. data/ext/xkcp/lib/low/KeccakP-1600/AVX512/C/ua/KeccakP-1600-AVX512-config.h +6 -0
  79. data/ext/xkcp/lib/low/KeccakP-1600/AVX512/KeccakP-1600-AVX512.s +1031 -0
  80. data/ext/xkcp/lib/low/KeccakP-1600/AVX512/KeccakP-1600-SnP.h +53 -0
  81. data/ext/xkcp/lib/low/KeccakP-1600/XOP/KeccakP-1600-SnP.h +44 -0
  82. data/ext/xkcp/lib/low/KeccakP-1600/XOP/KeccakP-1600-XOP.c +476 -0
  83. data/ext/xkcp/lib/low/KeccakP-1600/XOP/u6/KeccakP-1600-XOP-config.h +6 -0
  84. data/ext/xkcp/lib/low/KeccakP-1600/XOP/ua/KeccakP-1600-XOP-config.h +6 -0
  85. data/ext/xkcp/lib/low/KeccakP-1600/common/KeccakP-1600-64.macros +748 -0
  86. data/ext/xkcp/lib/low/KeccakP-1600/common/KeccakP-1600-unrolling.macros +305 -0
  87. data/ext/xkcp/lib/low/KeccakP-1600/compact/KeccakP-1600-SnP.h +40 -0
  88. data/ext/xkcp/lib/low/KeccakP-1600/compact/KeccakP-1600-compact64.c +420 -0
  89. data/ext/xkcp/lib/low/KeccakP-1600/plain-32bits-inplace/KeccakP-1600-SnP.h +43 -0
  90. data/ext/xkcp/lib/low/KeccakP-1600/plain-32bits-inplace/KeccakP-1600-inplace32BI.c +1163 -0
  91. data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/KeccakP-1600-SnP.h +54 -0
  92. data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/KeccakP-1600-opt64.c +565 -0
  93. data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/lcu6/KeccakP-1600-opt64-config.h +7 -0
  94. data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/lcua/KeccakP-1600-opt64-config.h +7 -0
  95. data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/lcua-shld/KeccakP-1600-opt64-config.h +8 -0
  96. data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/u6/KeccakP-1600-opt64-config.h +6 -0
  97. data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/ua/KeccakP-1600-opt64-config.h +6 -0
  98. data/ext/xkcp/lib/low/KeccakP-1600/ref-32bits/KeccakP-1600-SnP.h +44 -0
  99. data/ext/xkcp/lib/low/KeccakP-1600/ref-32bits/KeccakP-1600-reference.h +23 -0
  100. data/ext/xkcp/lib/low/KeccakP-1600/ref-32bits/KeccakP-1600-reference32BI.c +625 -0
  101. data/ext/xkcp/lib/low/KeccakP-1600/ref-64bits/KeccakP-1600-SnP.h +44 -0
  102. data/ext/xkcp/lib/low/KeccakP-1600/ref-64bits/KeccakP-1600-reference.c +440 -0
  103. data/ext/xkcp/lib/low/KeccakP-1600/ref-64bits/KeccakP-1600-reference.h +23 -0
  104. data/ext/xkcp/lib/low/KeccakP-1600/x86-64/KeccakP-1600-SnP.h +42 -0
  105. data/ext/xkcp/lib/low/KeccakP-1600/x86-64/KeccakP-1600-x86-64-gas.s +1196 -0
  106. data/ext/xkcp/lib/low/KeccakP-1600/x86-64/KeccakP-1600-x86-64-gas_Apple.s +1124 -0
  107. data/ext/xkcp/lib/low/KeccakP-1600/x86-64/KeccakP-1600-x86-64-shld-gas.s +1196 -0
  108. data/ext/xkcp/lib/low/KeccakP-1600-times2/ARMv7A-NEON/KeccakP-1600-inplace-pl2-armv7a-neon-le-armcc.s +1392 -0
  109. data/ext/xkcp/lib/low/KeccakP-1600-times2/ARMv7A-NEON/KeccakP-1600-inplace-pl2-armv7a-neon-le-gcc.s +1394 -0
  110. data/ext/xkcp/lib/low/KeccakP-1600-times2/ARMv7A-NEON/KeccakP-1600-times2-SnP.h +42 -0
  111. data/ext/xkcp/lib/low/KeccakP-1600-times2/AVX512/AVX512u12/SIMD512-2-config.h +7 -0
  112. data/ext/xkcp/lib/low/KeccakP-1600-times2/AVX512/AVX512u4/SIMD512-2-config.h +7 -0
  113. data/ext/xkcp/lib/low/KeccakP-1600-times2/AVX512/AVX512ufull/SIMD512-2-config.h +7 -0
  114. data/ext/xkcp/lib/low/KeccakP-1600-times2/AVX512/KeccakP-1600-times2-SIMD512.c +850 -0
  115. data/ext/xkcp/lib/low/KeccakP-1600-times2/AVX512/KeccakP-1600-times2-SnP.h +51 -0
  116. data/ext/xkcp/lib/low/KeccakP-1600-times2/SIMD128/KeccakP-1600-times2-SIMD128.c +957 -0
  117. data/ext/xkcp/lib/low/KeccakP-1600-times2/SIMD128/KeccakP-1600-times2-SnP.h +49 -0
  118. data/ext/xkcp/lib/low/KeccakP-1600-times2/SIMD128/SSSE3-u2/SIMD128-config.h +8 -0
  119. data/ext/xkcp/lib/low/KeccakP-1600-times2/SIMD128/SSSE3-ua/SIMD128-config.h +8 -0
  120. data/ext/xkcp/lib/low/KeccakP-1600-times2/SIMD128/XOP-u2/SIMD128-config.h +9 -0
  121. data/ext/xkcp/lib/low/KeccakP-1600-times2/SIMD128/XOP-ua/SIMD128-config.h +9 -0
  122. data/ext/xkcp/lib/low/KeccakP-1600-times2/fallback-on1/KeccakP-1600-times2-SnP.h +45 -0
  123. data/ext/xkcp/lib/low/KeccakP-1600-times2/fallback-on1/KeccakP-1600-times2-on1.c +37 -0
  124. data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX2/KeccakP-1600-times4-SIMD256.c +1321 -0
  125. data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX2/KeccakP-1600-times4-SnP.h +55 -0
  126. data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX2/u12/SIMD256-config.h +7 -0
  127. data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX2/u6/SIMD256-config.h +7 -0
  128. data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX2/ua/SIMD256-config.h +7 -0
  129. data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX512/AVX512u12/SIMD512-4-config.h +7 -0
  130. data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX512/AVX512u4/SIMD512-4-config.h +7 -0
  131. data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX512/AVX512ufull/SIMD512-4-config.h +7 -0
  132. data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX512/KeccakP-1600-times4-SIMD512.c +881 -0
  133. data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX512/KeccakP-1600-times4-SnP.h +51 -0
  134. data/ext/xkcp/lib/low/KeccakP-1600-times4/fallback-on1/KeccakP-1600-times4-SnP.h +45 -0
  135. data/ext/xkcp/lib/low/KeccakP-1600-times4/fallback-on1/KeccakP-1600-times4-on1.c +37 -0
  136. data/ext/xkcp/lib/low/KeccakP-1600-times4/fallback-on2/KeccakP-1600-times4-SnP.h +45 -0
  137. data/ext/xkcp/lib/low/KeccakP-1600-times4/fallback-on2/KeccakP-1600-times4-on2.c +38 -0
  138. data/ext/xkcp/lib/low/KeccakP-1600-times8/AVX512/KeccakP-1600-times8-SIMD512.c +1615 -0
  139. data/ext/xkcp/lib/low/KeccakP-1600-times8/AVX512/KeccakP-1600-times8-SnP.h +57 -0
  140. data/ext/xkcp/lib/low/KeccakP-1600-times8/AVX512/u12/SIMD512-config.h +7 -0
  141. data/ext/xkcp/lib/low/KeccakP-1600-times8/AVX512/u4/SIMD512-config.h +7 -0
  142. data/ext/xkcp/lib/low/KeccakP-1600-times8/AVX512/ua/SIMD512-config.h +7 -0
  143. data/ext/xkcp/lib/low/KeccakP-1600-times8/fallback-on1/KeccakP-1600-times8-SnP.h +45 -0
  144. data/ext/xkcp/lib/low/KeccakP-1600-times8/fallback-on1/KeccakP-1600-times8-on1.c +37 -0
  145. data/ext/xkcp/lib/low/KeccakP-1600-times8/fallback-on2/KeccakP-1600-times8-SnP.h +45 -0
  146. data/ext/xkcp/lib/low/KeccakP-1600-times8/fallback-on2/KeccakP-1600-times8-on2.c +38 -0
  147. data/ext/xkcp/lib/low/KeccakP-1600-times8/fallback-on4/KeccakP-1600-times8-SnP.h +45 -0
  148. data/ext/xkcp/lib/low/KeccakP-1600-times8/fallback-on4/KeccakP-1600-times8-on4.c +38 -0
  149. data/ext/xkcp/lib/low/KeccakP-200/ARM/KeccakP-200-SnP.h +41 -0
  150. data/ext/xkcp/lib/low/KeccakP-200/ARM/KeccakP-200-armv6m-le-armcc.s +442 -0
  151. data/ext/xkcp/lib/low/KeccakP-200/ARM/KeccakP-200-armv6m-le-gcc.s +446 -0
  152. data/ext/xkcp/lib/low/KeccakP-200/ARM/KeccakP-200-armv7m-le-armcc.s +419 -0
  153. data/ext/xkcp/lib/low/KeccakP-200/ARM/KeccakP-200-armv7m-le-gcc.s +427 -0
  154. data/ext/xkcp/lib/low/KeccakP-200/AVR8/KeccakP-200-SnP.h +41 -0
  155. data/ext/xkcp/lib/low/KeccakP-200/AVR8/KeccakP-200-avr8-fast.s +647 -0
  156. data/ext/xkcp/lib/low/KeccakP-200/compact/KeccakP-200-SnP.h +39 -0
  157. data/ext/xkcp/lib/low/KeccakP-200/compact/KeccakP-200-compact.c +190 -0
  158. data/ext/xkcp/lib/low/KeccakP-200/ref/KeccakP-200-SnP.h +43 -0
  159. data/ext/xkcp/lib/low/KeccakP-200/ref/KeccakP-200-reference.c +412 -0
  160. data/ext/xkcp/lib/low/KeccakP-200/ref/KeccakP-200-reference.h +23 -0
  161. data/ext/xkcp/lib/low/KeccakP-400/ARM/KeccakP-400-SnP.h +41 -0
  162. data/ext/xkcp/lib/low/KeccakP-400/ARM/KeccakP-400-armv6m-le-armcc.s +454 -0
  163. data/ext/xkcp/lib/low/KeccakP-400/ARM/KeccakP-400-armv6m-le-gcc.s +458 -0
  164. data/ext/xkcp/lib/low/KeccakP-400/ARM/KeccakP-400-armv7m-le-armcc.s +455 -0
  165. data/ext/xkcp/lib/low/KeccakP-400/ARM/KeccakP-400-armv7m-le-gcc.s +458 -0
  166. data/ext/xkcp/lib/low/KeccakP-400/AVR8/KeccakP-400-SnP.h +41 -0
  167. data/ext/xkcp/lib/low/KeccakP-400/AVR8/KeccakP-400-avr8-fast.s +728 -0
  168. data/ext/xkcp/lib/low/KeccakP-400/ref/KeccakP-400-SnP.h +43 -0
  169. data/ext/xkcp/lib/low/KeccakP-400/ref/KeccakP-400-reference.c +414 -0
  170. data/ext/xkcp/lib/low/KeccakP-400/ref/KeccakP-400-reference.h +23 -0
  171. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-SnP.h +42 -0
  172. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u1-armv6m-le-armcc.s +527 -0
  173. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u1-armv6m-le-gcc.s +533 -0
  174. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u2-armv6m-le-armcc.s +528 -0
  175. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u2-armv6m-le-gcc.s +534 -0
  176. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u2-armv7a-le-armcc.s +521 -0
  177. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u2-armv7a-le-gcc.s +527 -0
  178. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u2-armv7m-le-armcc.s +517 -0
  179. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u2-armv7m-le-gcc.s +523 -0
  180. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-uf-armv7m-le-armcc.s +550 -0
  181. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-uf-armv7m-le-gcc.s +556 -0
  182. data/ext/xkcp/lib/low/KeccakP-800/ARMv8A/KeccakP-800-SnP.h +32 -0
  183. data/ext/xkcp/lib/low/KeccakP-800/ARMv8A/KeccakP-800-armv8a-neon.s +432 -0
  184. data/ext/xkcp/lib/low/KeccakP-800/AVR8/KeccakP-800-SnP.h +42 -0
  185. data/ext/xkcp/lib/low/KeccakP-800/AVR8/KeccakP-800-avr8-fast.s +929 -0
  186. data/ext/xkcp/lib/low/KeccakP-800/compact/KeccakP-800-SnP.h +40 -0
  187. data/ext/xkcp/lib/low/KeccakP-800/compact/KeccakP-800-compact.c +244 -0
  188. data/ext/xkcp/lib/low/KeccakP-800/plain/KeccakP-800-SnP.h +46 -0
  189. data/ext/xkcp/lib/low/KeccakP-800/plain/KeccakP-800-opt32-bis.macros +184 -0
  190. data/ext/xkcp/lib/low/KeccakP-800/plain/KeccakP-800-opt32.c +454 -0
  191. data/ext/xkcp/lib/low/KeccakP-800/plain/KeccakP-800-opt32.macros +459 -0
  192. data/ext/xkcp/lib/low/KeccakP-800/plain/KeccakP-800-unrolling-bis.macros +83 -0
  193. data/ext/xkcp/lib/low/KeccakP-800/plain/KeccakP-800-unrolling.macros +88 -0
  194. data/ext/xkcp/lib/low/KeccakP-800/plain/lcu2/KeccakP-800-opt32-config.h +7 -0
  195. data/ext/xkcp/lib/low/KeccakP-800/plain/lcua/KeccakP-800-opt32-config.h +7 -0
  196. data/ext/xkcp/lib/low/KeccakP-800/plain/u2/KeccakP-800-opt32-config.h +7 -0
  197. data/ext/xkcp/lib/low/KeccakP-800/plain/ua/KeccakP-800-opt32-config.h +7 -0
  198. data/ext/xkcp/lib/low/KeccakP-800/ref/KeccakP-800-SnP.h +44 -0
  199. data/ext/xkcp/lib/low/KeccakP-800/ref/KeccakP-800-reference.c +437 -0
  200. data/ext/xkcp/lib/low/KeccakP-800/ref/KeccakP-800-reference.h +23 -0
  201. data/ext/xkcp/lib/low/Ketje/OptimizedAsmARM/Ket.h +57 -0
  202. data/ext/xkcp/lib/low/Ketje/OptimizedAsmARM/KetjeJr-armv7m-le-armcc.s +475 -0
  203. data/ext/xkcp/lib/low/Ketje/OptimizedAsmARM/KetjeJr-armv7m-le-gcc.s +480 -0
  204. data/ext/xkcp/lib/low/Ketje/OptimizedAsmARM/KetjeSr-armv7m-le-armcc.s +590 -0
  205. data/ext/xkcp/lib/low/Ketje/OptimizedAsmARM/KetjeSr-armv7m-le-gcc.s +590 -0
  206. data/ext/xkcp/lib/low/Ketje/OptimizedLE/Ket.c +126 -0
  207. data/ext/xkcp/lib/low/Ketje/OptimizedLE/Ket.h +68 -0
  208. data/ext/xkcp/lib/low/Ketje/OptimizedLE/Ket.inc +174 -0
  209. data/ext/xkcp/lib/low/Ketje/SnP-compliant/Ket.c +80 -0
  210. data/ext/xkcp/lib/low/Ketje/SnP-compliant/Ket.h +68 -0
  211. data/ext/xkcp/lib/low/Ketje/SnP-compliant/Ket.inc +142 -0
  212. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-SnP.h +55 -0
  213. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-u1-armv6m-le-armcc.s +1086 -0
  214. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-u1-armv6m-le-gcc.s +1092 -0
  215. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-uf-armv6-le-armcc.s +721 -0
  216. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-uf-armv6-le-gcc.s +726 -0
  217. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-uf-armv7m-le-armcc.s +723 -0
  218. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-uf-armv7m-le-gcc.s +729 -0
  219. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodyak-u1-armv6m-le-armcc.s +1164 -0
  220. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodyak-u1-armv6m-le-gcc.s +1165 -0
  221. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodyak-uf-armv6-le-armcc.s +562 -0
  222. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodyak-uf-armv6-le-gcc.s +563 -0
  223. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodyak-uf-armv7m-le-armcc.s +563 -0
  224. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodyak-uf-armv7m-le-gcc.s +565 -0
  225. data/ext/xkcp/lib/low/Xoodoo/ARMv7A-NEON/Xoodoo-SnP.h +55 -0
  226. data/ext/xkcp/lib/low/Xoodoo/ARMv7A-NEON/Xoodoo-uf-armv7a-neon-le-armcc.s +476 -0
  227. data/ext/xkcp/lib/low/Xoodoo/ARMv7A-NEON/Xoodoo-uf-armv7a-neon-le-gcc.s +485 -0
  228. data/ext/xkcp/lib/low/Xoodoo/ARMv7A-NEON/Xoodyak-uf-armv7a-neon-le-armcc.s +362 -0
  229. data/ext/xkcp/lib/low/Xoodoo/ARMv7A-NEON/Xoodyak-uf-armv7a-neon-le-gcc.s +367 -0
  230. data/ext/xkcp/lib/low/Xoodoo/AVR8/Xoodoo-SnP.h +43 -0
  231. data/ext/xkcp/lib/low/Xoodoo/AVR8/Xoodoo-avr8-u1.s +1341 -0
  232. data/ext/xkcp/lib/low/Xoodoo/AVX512/Xoodoo-SIMD512.c +581 -0
  233. data/ext/xkcp/lib/low/Xoodoo/AVX512/Xoodoo-SnP.h +58 -0
  234. data/ext/xkcp/lib/low/Xoodoo/AVX512/Xoodyak-full-block-SIMD512.c +332 -0
  235. data/ext/xkcp/lib/low/Xoodoo/SSE2/Xoodoo-SIMD128.c +329 -0
  236. data/ext/xkcp/lib/low/Xoodoo/SSE2/Xoodoo-SnP.h +53 -0
  237. data/ext/xkcp/lib/low/Xoodoo/SSE2/Xoodyak-full-block-SIMD128.c +355 -0
  238. data/ext/xkcp/lib/low/Xoodoo/Xoodoo.h +79 -0
  239. data/ext/xkcp/lib/low/Xoodoo/plain/Xoodoo-SnP.h +56 -0
  240. data/ext/xkcp/lib/low/Xoodoo/plain/Xoodoo-optimized.c +399 -0
  241. data/ext/xkcp/lib/low/Xoodoo/plain/Xoodyak-full-blocks.c +127 -0
  242. data/ext/xkcp/lib/low/Xoodoo/ref/Xoodoo-SnP.h +43 -0
  243. data/ext/xkcp/lib/low/Xoodoo/ref/Xoodoo-reference.c +253 -0
  244. data/ext/xkcp/lib/low/Xoodoo-times16/AVX512/Xoodoo-times16-SIMD512.c +1044 -0
  245. data/ext/xkcp/lib/low/Xoodoo-times16/AVX512/Xoodoo-times16-SnP.h +49 -0
  246. data/ext/xkcp/lib/low/Xoodoo-times16/fallback-on1/Xoodoo-times16-SnP.h +45 -0
  247. data/ext/xkcp/lib/low/Xoodoo-times16/fallback-on1/Xoodoo-times16-on1.c +37 -0
  248. data/ext/xkcp/lib/low/Xoodoo-times4/ARMv7A-NEON/Xoodoo-times4-ARMv7A.s +1587 -0
  249. data/ext/xkcp/lib/low/Xoodoo-times4/ARMv7A-NEON/Xoodoo-times4-SnP.h +48 -0
  250. data/ext/xkcp/lib/low/Xoodoo-times4/AVX512/Xoodoo-times4-SIMD512.c +1202 -0
  251. data/ext/xkcp/lib/low/Xoodoo-times4/AVX512/Xoodoo-times4-SnP.h +48 -0
  252. data/ext/xkcp/lib/low/Xoodoo-times4/SSSE3/Xoodoo-times4-SIMD128.c +484 -0
  253. data/ext/xkcp/lib/low/Xoodoo-times4/SSSE3/Xoodoo-times4-SnP.h +44 -0
  254. data/ext/xkcp/lib/low/Xoodoo-times4/fallback-on1/Xoodoo-times4-SnP.h +45 -0
  255. data/ext/xkcp/lib/low/Xoodoo-times4/fallback-on1/Xoodoo-times4-on1.c +37 -0
  256. data/ext/xkcp/lib/low/Xoodoo-times8/AVX2/Xoodoo-times8-SIMD256.c +939 -0
  257. data/ext/xkcp/lib/low/Xoodoo-times8/AVX2/Xoodoo-times8-SnP.h +49 -0
  258. data/ext/xkcp/lib/low/Xoodoo-times8/AVX512/Xoodoo-times8-SIMD512.c +1216 -0
  259. data/ext/xkcp/lib/low/Xoodoo-times8/AVX512/Xoodoo-times8-SnP.h +48 -0
  260. data/ext/xkcp/lib/low/Xoodoo-times8/fallback-on1/Xoodoo-times8-SnP.h +45 -0
  261. data/ext/xkcp/lib/low/Xoodoo-times8/fallback-on1/Xoodoo-times8-on1.c +37 -0
  262. data/ext/xkcp/lib/low/common/PlSnP-Fallback.inc +290 -0
  263. data/ext/xkcp/lib/low/common/SnP-Relaned.h +141 -0
  264. data/ext/xkcp/support/Build/ExpandProducts.xsl +79 -0
  265. data/ext/xkcp/support/Build/ToGlobalMakefile.xsl +206 -0
  266. data/ext/xkcp/support/Build/ToOneTarget.xsl +89 -0
  267. data/ext/xkcp/support/Build/ToTargetConfigFile.xsl +37 -0
  268. data/ext/xkcp/support/Build/ToTargetMakefile.xsl +298 -0
  269. data/ext/xkcp/support/Build/ToVCXProj.xsl +198 -0
  270. data/ext/xkcp/support/Kernel-PMU/Kernel-pmu.md +133 -0
  271. data/ext/xkcp/support/Kernel-PMU/Makefile +8 -0
  272. data/ext/xkcp/support/Kernel-PMU/enable_arm_pmu.c +129 -0
  273. data/ext/xkcp/support/Kernel-PMU/load-module +1 -0
  274. data/ext/xkcp/util/KeccakSum/KeccakSum.c +394 -0
  275. data/ext/xkcp/util/KeccakSum/base64.c +86 -0
  276. data/ext/xkcp/util/KeccakSum/base64.h +12 -0
  277. data/lib/sleeping_kangaroo12/binding.rb +15 -0
  278. data/lib/sleeping_kangaroo12/build/loader.rb +40 -0
  279. data/lib/sleeping_kangaroo12/build/platform.rb +37 -0
  280. data/lib/sleeping_kangaroo12/build.rb +4 -0
  281. data/lib/sleeping_kangaroo12/digest.rb +103 -0
  282. data/lib/sleeping_kangaroo12/version.rb +5 -0
  283. data/lib/sleeping_kangaroo12.rb +7 -0
  284. metadata +372 -0
@@ -0,0 +1,1202 @@
1
+ /*
2
+ The eXtended Keccak Code Package (XKCP)
3
+ https://github.com/XKCP/XKCP
4
+
5
+ The Xoodoo permutation, designed by Joan Daemen, Seth Hoffert, Gilles Van Assche and Ronny Van Keer.
6
+
7
+ Implementation by Ronny Van Keer, hereby denoted as "the implementer".
8
+
9
+ For more information, feedback or questions, please refer to the Keccak Team website:
10
+ https://keccak.team/
11
+
12
+ To the extent possible under law, the implementer has waived all copyright
13
+ and related or neighboring rights to the source code in this file.
14
+ http://creativecommons.org/publicdomain/zero/1.0/
15
+ */
16
+
17
+ #include <stdio.h>
18
+ #include <string.h>
19
+ #include <smmintrin.h>
20
+ #include <wmmintrin.h>
21
+ #include <immintrin.h>
22
+ #include <emmintrin.h>
23
+ #include "align.h"
24
+ #include "brg_endian.h"
25
+ #include "Xoodoo.h"
26
+ #include "Xoodoo-times4-SnP.h"
27
+
28
+ #if (PLATFORM_BYTE_ORDER != IS_LITTLE_ENDIAN)
29
+ #error Expecting a little-endian platform
30
+ #endif
31
+
32
+ /* #define SIMULATE_AVX512 */
33
+
34
+ #define VERBOSE 0
35
+
36
+ #if defined(SIMULATE_AVX512)
37
+
38
/*
 * Stand-in for the 512-bit vector type, compiled only when SIMULATE_AVX512
 * is defined. The register is modelled as sixteen 32-bit lanes.
 */
typedef struct
{
    uint32_t x[16];
} __m512i;
42
+
43
+ static void _mm512_mask_store_epi64(void *mem_addr, uint8_t k, __m512i a)
44
+ {
45
+ uint64_t *p64 = (uint64_t *)mem_addr;
46
+ unsigned int i;
47
+
48
+ for ( i = 0; i < 8; ++i ) {
49
+ if ((k & (1 << i)) != 0)
50
+ p64[i] = (uint64_t)a.x[2*i] | ((uint64_t)a.x[2*i+1] << 32);
51
+ }
52
+ }
53
+
54
+ static __m512i _mm512_maskz_load_epi64(uint8_t k, const void *mem_addr)
55
+ {
56
+ __m512i r;
57
+ const uint64_t *p64 = (const uint64_t *)mem_addr;
58
+ unsigned int i;
59
+
60
+ for ( i = 0; i < 8; ++i ) {
61
+ if ((k & (1 << i)) != 0) {
62
+ r.x[2*i] = (uint32_t)p64[i];
63
+ r.x[2*i+1] = (uint32_t)(p64[i] >> 32);
64
+ }
65
+ else {
66
+ r.x[2*i] = 0;
67
+ r.x[2*i+1] = 0;
68
+ }
69
+ }
70
+ return(r);
71
+ }
72
+
73
+ static void _mm512_storeu_si512(__m512i * mem_addr, __m512i a)
74
+ {
75
+ uint32_t *p32 = (uint32_t *)mem_addr;
76
+ unsigned int i;
77
+
78
+ for ( i = 0; i < 16; ++i )
79
+ p32[i] = a.x[i];
80
+ }
81
+
82
+ #define _mm512_store_si512 _mm512_storeu_si512
83
+
84
/*
 * Overlay giving the simulation code per-lane access to a 128-bit vector:
 * x views the same storage as s as four 32-bit lanes.
 */
typedef union
{
    uint32_t x[4];
    __m128i  s;
} s__m128i;
89
+
90
/*
 * Overlay giving the simulation code per-lane access to a 256-bit vector:
 * x views the same storage as s as eight 32-bit lanes.
 */
typedef union
{
    uint32_t x[8];
    __m256i  s;
} s__m256i;
95
+
96
+
97
+ static void _mm256_storeu_si256(__m256i * mem_addr, __m256i aa)
98
+ {
99
+ uint32_t *p32 = (uint32_t *)mem_addr;
100
+ s__m256i a;
101
+ unsigned int i;
102
+
103
+ a.s = aa;
104
+ for ( i = 0; i < 8; ++i )
105
+ p32[i] = a.x[i];
106
+ }
107
+
108
+ #define _mm256_store_si256 _mm256_storeu_si256
109
+
110
+ static __m512i _mm512_loadu_si512(const __m512i * mem_addr)
111
+ {
112
+ __m512i r;
113
+ const uint32_t *p32 = (const uint32_t *)mem_addr;
114
+ unsigned int i;
115
+
116
+ for ( i = 0; i < 16; ++i )
117
+ r.x[i] = p32[i];
118
+ return(r);
119
+ }
120
+
121
+ #define _mm512_load_si512 _mm512_loadu_si512
122
+
123
+ static __m256i _mm256_loadu_si256(const __m256i * mem_addr)
124
+ {
125
+ s__m256i r;
126
+ const uint32_t *p32 = (const uint32_t *)mem_addr;
127
+ unsigned int i;
128
+
129
+ for ( i = 0; i < 8; ++i )
130
+ r.x[i] = p32[i];
131
+ return(r.s);
132
+ }
133
+
134
+ #define _mm256_load_si256 _mm256_loadu_si256
135
+
136
+ static __m512i _mm512_setzero_si512(void)
137
+ {
138
+ __m512i r;
139
+ unsigned int i;
140
+
141
+ for ( i = 0; i < 16; ++i )
142
+ r.x[i] = 0;
143
+ return(r);
144
+ }
145
+
146
+ static __m128i _mm128_setzero_si128(void)
147
+ {
148
+ s__m128i r;
149
+ unsigned int i;
150
+
151
+ for ( i = 0; i < 4; ++i )
152
+ r.x[i] = 0;
153
+ return(r.s);
154
+ }
155
+
156
+ static __m512i _mm512_xor_si512( __m512i a, __m512i b)
157
+ {
158
+ __m512i r;
159
+ unsigned int i;
160
+
161
+ for ( i = 0; i < 16; ++i )
162
+ r.x[i] = a.x[i] ^ b.x[i];
163
+ return(r);
164
+ }
165
+
166
+ static __m512i _mm512_and_si512( __m512i a, __m512i b)
167
+ {
168
+ __m512i r;
169
+ unsigned int i;
170
+
171
+ for ( i = 0; i < 16; ++i )
172
+ r.x[i] = a.x[i] & b.x[i];
173
+ return(r);
174
+ }
175
+
176
+ static __m512i _mm512_ternarylogic_epi32(__m512i a, __m512i b, __m512i c, int imm)
177
+ {
178
+
179
+ if (imm == 0x96)
180
+ return ( _mm512_xor_si512( _mm512_xor_si512( a, b ), c ) );
181
+ if (imm == 0xD2) {
182
+ __m512i t;
183
+ unsigned int i;
184
+
185
+ for ( i = 0; i < 16; ++i )
186
+ t.x[i] = ~b.x[i] & c.x[i];
187
+ return ( _mm512_xor_si512( a, t ) );
188
+ }
189
+ printf( "_mm512_ternarylogic_epi32( a, b, c, %02X) not implemented!\n", imm );
190
+ exit(1);
191
+
192
+ }
193
+
194
+ static __m128i _mm_ternarylogic_epi32(__m128i a, __m128i b, __m128i c, int imm)
195
+ {
196
+
197
+ if (imm == 0x96)
198
+ return ( _mm_xor_si128( _mm_xor_si128( a, b ), c ) );
199
+ if (imm == 0xD2) {
200
+ s__m128i t;
201
+ s__m128i bb;
202
+ s__m128i cc;
203
+ unsigned int i;
204
+
205
+ bb.s = b;
206
+ cc.s = c;
207
+ for ( i = 0; i < 4; ++i )
208
+ t.x[i] = ~bb.x[i] & cc.x[i];
209
+ return ( _mm_xor_si128( a, t.s ) );
210
+ }
211
+ printf( "_mm_ternarylogic_epi32( a, b, c, %02X) not implemented!\n", imm );
212
+ exit(1);
213
+
214
+ }
215
+
216
+ static __m512i _mm512_rol_epi32(__m512i a, int offset)
217
+ {
218
+ __m512i r;
219
+ unsigned int i;
220
+
221
+ for ( i = 0; i < 16; ++i )
222
+ r.x[i] = (a.x[i] << offset) | (a.x[i] >> (32-offset));
223
+ return(r);
224
+ }
225
+
226
+ static __m128i _mm_rol_epi32(__m128i a, int offset)
227
+ {
228
+ s__m128i r;
229
+ s__m128i aa;
230
+ unsigned int i;
231
+
232
+ aa.s = a;
233
+ for ( i = 0; i < 4; ++i )
234
+ r.x[i] = (aa.x[i] << offset) | (aa.x[i] >> (32-offset));
235
+ return(r.s);
236
+ }
237
+
238
+ static __m512i _mm512_slli_epi32(__m512i a, int offset)
239
+ {
240
+ __m512i r;
241
+ unsigned int i;
242
+
243
+ for ( i = 0; i < 16; ++i )
244
+ r.x[i] = (a.x[i] << offset);
245
+ return(r);
246
+ }
247
+
248
+ static __m512i _mm512_set1_epi32(uint32_t a)
249
+ {
250
+ unsigned int i;
251
+ __m512i r;
252
+
253
+ for ( i = 0; i < 16; ++i )
254
+ r.x[i] = a;
255
+ return(r);
256
+ }
257
+
258
+ static __m512i _mm512_i32gather_epi32(__m512i idx, const void *p, int scale)
259
+ {
260
+ __m512i r;
261
+ unsigned int i;
262
+ for ( i = 0; i < 16; ++i )
263
+ r.x[i] = *(const uint32_t*)((const char*)p + idx.x[i] * scale);
264
+ return(r);
265
+ }
266
+
267
+ static void _mm512_i32scatter_epi32( void *p, __m512i idx, __m512i value, int scale)
268
+ {
269
+ unsigned int i;
270
+
271
+ for ( i = 0; i < 16; ++i )
272
+ *(uint32_t*)((char*)p + idx.x[i] * scale) = value.x[i];
273
+ }
274
+
275
+ static void _mm_i32scatter_epi32( void *p, __m128i idx, __m128i value, int scale)
276
+ {
277
+ s__m128i iidx, vvalue;
278
+ unsigned int i;
279
+
280
+ iidx.s = idx;
281
+ vvalue.s = value;
282
+ for ( i = 0; i < 4; ++i )
283
+ *(uint32_t*)((char*)p + iidx.x[i] * scale) = vvalue.x[i];
284
+ }
285
+
286
+ static void _mm512_mask_i32scatter_epi32( void *p, uint16_t k, __m512i idx, __m512i value, int scale)
287
+ {
288
+ unsigned int i;
289
+ for ( i = 0; i < 16; ++i ) {
290
+ if ((k & (1 << i)) != 0)
291
+ *(uint32_t*)((char*)p + idx.x[i] * scale) = value.x[i];
292
+ }
293
+ }
294
+
295
+ static void _mm_mask_i32scatter_epi32( void *p, uint16_t k, __m128i idx, __m128i value, int scale)
296
+ {
297
+ s__m128i iidx, vvalue;
298
+ unsigned int i;
299
+
300
+ iidx.s = idx;
301
+ vvalue.s = value;
302
+ for ( i = 0; i < 4; ++i ) {
303
+ if ((k & (1 << i)) != 0)
304
+ *(uint32_t*)((char*)p + iidx.x[i] * scale) = vvalue.x[i];
305
+ }
306
+ }
307
+
308
+ static __m512i _mm512_setr_epi32( int e15, int e14, int e13, int e12, int e11, int e10, int e9, int e8,
309
+ int e7, int e6, int e5, int e4, int e3, int e2, int ee1, int ee0)
310
+ {
311
+ __m512i r;
312
+
313
+ r.x[ 0] = e15;
314
+ r.x[ 1] = e14;
315
+ r.x[ 2] = e13;
316
+ r.x[ 3] = e12;
317
+ r.x[ 4] = e11;
318
+ r.x[ 5] = e10;
319
+ r.x[ 6] = e9;
320
+ r.x[ 7] = e8;
321
+ r.x[ 8] = e7;
322
+ r.x[ 9] = e6;
323
+ r.x[10] = e5;
324
+ r.x[11] = e4;
325
+ r.x[12] = e3;
326
+ r.x[13] = e2;
327
+ r.x[14] = ee1;
328
+ r.x[15] = ee0;
329
+ return(r);
330
+ }
331
+
332
+ static __m128i _mm_setr_epi32(int e3, int e2, int ee1, int ee0)
333
+ {
334
+ s__m128i r;
335
+
336
+ r.x[0] = e3;
337
+ r.x[1] = e2;
338
+ r.x[2] = ee1;
339
+ r.x[3] = ee0;
340
+ return(r.s);
341
+ }
342
+
343
+ static __m512i _mm512_inserti64x4 (__m512i a, __m256i b, int imm8)
344
+ {
345
+ __m512i r;
346
+ s__m256i bb;
347
+ unsigned int i;
348
+
349
+ r = a;
350
+ bb.s = b;
351
+ if (imm8 == 0) {
352
+ for ( i = 0; i < 8; ++i )
353
+ r.x[i] = bb.x[i];
354
+ } else {
355
+ for ( i = 0; i < 8; ++i )
356
+ r.x[i+8] = bb.x[i];
357
+ }
358
+ return(r);
359
+ }
360
+
361
+ static __m512i _mm512_permutex2var_epi32(__m512i a, __m512i idx, __m512i b)
362
+ {
363
+ __m512i r;
364
+ unsigned int i;
365
+ for ( i = 0; i < 16; ++i )
366
+ r.x[i] = (idx.x[i] & 0x10) ? b.x[idx.x[i] & 0x0F] : a.x[idx.x[i] & 0x0F];
367
+ return(r);
368
+ }
369
+
370
+ static __m128i _mm_permutex2var_epi32(__m128i a, __m128i idx, __m128i b)
371
+ {
372
+ s__m128i r;
373
+ s__m128i iidx, aa, bb;
374
+ unsigned int i;
375
+
376
+ iidx.s = idx;
377
+ aa.s = a;
378
+ bb.s = b;
379
+ for ( i = 0; i < 4; ++i )
380
+ r.x[i] = (iidx.x[i] & 4) ? bb.x[iidx.x[i] & 3] : aa.x[iidx.x[i] & 3];
381
+ return(r.s);
382
+ }
383
+
384
+ static __m512i _mm512_permutexvar_epi32(__m512i idx, __m512i a)
385
+ {
386
+ __m512i r;
387
+ unsigned int i;
388
+ for ( i = 0; i < 16; ++i )
389
+ r.x[i] = a.x[idx.x[i]];
390
+ return(r);
391
+ }
392
+
393
+ static __m128i _mm_permutexvar_epi32(__m128i idx, __m128i a)
394
+ {
395
+ s__m128i r;
396
+ s__m128i iidx, aa;
397
+ unsigned int i;
398
+
399
+ iidx.s = idx;
400
+ aa.s = a;
401
+ for ( i = 0; i < 4; ++i )
402
+ r.x[i] = aa.x[iidx.x[i]];
403
+ return(r.s);
404
+ }
405
+
406
+ static __m512i _mm512_castsi256_si512(__m256i a)
407
+ {
408
+ __m512i r;
409
+ s__m256i aa;
410
+ unsigned int i;
411
+
412
+ r = _mm512_setzero_si512();
413
+ aa.s = a;
414
+ for ( i = 0; i < 8; ++i )
415
+ r.x[i] = aa.x[i];
416
+ return(r);
417
+ }
418
+
419
+ #endif
420
+
421
+ typedef __m128i V128;
422
+ typedef __m256i V256;
423
+ typedef __m512i V512;
424
+
425
+ #define SnP_laneLengthInBytes 4
426
/*
 * Index of a lane inside the interleaved 4-instance state: the four
 * instances' copies of lane lanePosition are stored next to each other.
 * Both arguments are parenthesized so that expressions containing operators
 * of lower precedence than + (e.g. a conditional) expand correctly.
 */
#define laneIndex(instanceIndex, lanePosition) ((lanePosition)*4 + (instanceIndex))
427
+
428
+ #define Chi(a,b,c) _mm_ternarylogic_epi32(a,b,c,0xD2)
429
+
430
+ #define CONST4_32(a) _mm_set1_epi32(a)
431
+ #define LOAD256u(a) _mm256_loadu_si256((const V256 *)&(a))
432
+
433
+ #define LOAD512(a) _mm512_load_si512((const V512 *)&(a))
434
+ #define LOAD512u(a) _mm512_loadu_si512((const V512 *)&(a))
435
+
436
+ #define LOAD_GATHER4_32(idx,p) _mm_i32gather_epi32((const void*)(p), idx, 4)
437
+ #define STORE_SCATTER4_32(idx,a,p) _mm_i32scatter_epi32((void*)(p), idx, a, 4)
438
+ #define LOAD4_32(a,b,c,d) _mm_setr_epi32(a,b,c,d)
439
+
440
+
441
+ #define SHUFFLE_LANES_RIGHT(idx, a) _mm_permutexvar_epi32(idx, a)
442
+
443
+ #define ROL32(a, o) _mm_rol_epi32(a, o)
444
+ #define SHL32(a, o) _mm_slli_epi32(a, o)
445
+
446
+ #define SET4_32 _mm_setr_epi32
447
+
448
+ #define STORE128(a, b) _mm_store_si128((V128 *)&(a), b)
449
+ #define STORE128u(a, b) _mm_storeu_si128((V128 *)&(a), b)
450
+ #define STORE256u(a, b) _mm256_storeu_si256((V256 *)&(a), b)
451
+ #define STORE256(a, b) _mm256_store_si256((V256 *)&(a), b)
452
+ #define STORE512(a, b) _mm512_store_si512((V512 *)&(a), b)
453
+ #define STORE512u(a, b) _mm512_storeu_si512((V512 *)&(a), b)
454
+
455
+ #define AND(a, b) _mm_and_si128(a, b)
456
+ #define XOR(a, b) _mm_xor_si128(a, b)
457
+ #define XOR256(a, b) _mm256_xor_si256(a, b)
458
+ #define XOR512(a, b) _mm512_xor_si512(a, b)
459
+ #define XOR3(a,b,c) _mm_ternarylogic_epi32(a,b,c,0x96)
460
+
461
/*
 * Debug tracing of the 12 state vectors, compiled in only when VERBOSE > 0.
 *
 * DumpOne(b, v, i): stores vector v##i into buffer b and prints its four
 * 32-bit words prefixed by i. It uses the buffer it is given (not a
 * hard-coded `buf`), stores with the unaligned variant because a uint32_t
 * array is not guaranteed to be 16-byte aligned, and is wrapped in
 * do/while(0) so that it behaves as a single statement.
 *
 * Dump(t, v): prints title t followed by v00..v03, v10..v13, v20..v23.
 * Dump1/Dump2/Dump3 forward to Dump when VERBOSE is at least 1/2/3 and
 * expand to nothing otherwise.
 */
#if (VERBOSE > 0)
    #define DumpOne(b_, v_, i_) do { \
                                    STORE128u(b_, v_##i_); \
                                    printf("%02u %08x %08x %08x %08x\n", (unsigned int)(i_), \
                                           (unsigned int)(b_)[0], (unsigned int)(b_)[1], \
                                           (unsigned int)(b_)[2], (unsigned int)(b_)[3]); \
                                } while (0)

    #define Dump(t_, v_)    {                       \
                                uint32_t    buf[8]; \
                                printf("%s\n", t_); \
                                DumpOne(buf, v_, 00); \
                                DumpOne(buf, v_, 01); \
                                DumpOne(buf, v_, 02); \
                                DumpOne(buf, v_, 03); \
                                DumpOne(buf, v_, 10); \
                                DumpOne(buf, v_, 11); \
                                DumpOne(buf, v_, 12); \
                                DumpOne(buf, v_, 13); \
                                DumpOne(buf, v_, 20); \
                                DumpOne(buf, v_, 21); \
                                DumpOne(buf, v_, 22); \
                                DumpOne(buf, v_, 23); \
                            }
#else
    #define Dump(t_, v_)
#endif

#if (VERBOSE >= 1)
    #define Dump1(t_, v_)   Dump(t_, v_)
#else
    #define Dump1(t_, v_)
#endif

#if (VERBOSE >= 2)
    #define Dump2(t_, v_)   Dump(t_, v_)
#else
    #define Dump2(t_, v_)
#endif

#if (VERBOSE >= 3)
    #define Dump3(t_, v_)   Dump(t_, v_)
#else
    #define Dump3(t_, v_)
#endif
502
+
503
/*
 * Debug printing of raw 32-bit word buffers, compiled in only when
 * VERBOSE > 0.
 *
 * DUMP32(tt, buf):    prints tag tt and buf[0..7].
 * DUMP32_12(tt, buf): prints tag tt and buf[0..11].
 * DumpLane(t, v):     prints tag t and the four words of one 128-bit vector.
 *
 * The buffer argument is parenthesized so that pointer expressions can be
 * passed, and DumpLane uses the unaligned store because its local uint32_t
 * array is not guaranteed to be 16-byte aligned.
 */
#if (VERBOSE > 0)
    #define DUMP32(tt, buf)     printf("%s %08x %08x %08x %08x %08x %08x %08x %08x\n", tt, (buf)[0], (buf)[1], (buf)[2], (buf)[3], (buf)[4], (buf)[5], (buf)[6], (buf)[7])

    #define DUMP32_12(tt, buf)  printf("%s %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n", tt, (buf)[0], (buf)[1], (buf)[2], (buf)[3], (buf)[4], (buf)[5], (buf)[6], (buf)[7], (buf)[8], (buf)[9], (buf)[10], (buf)[11])

    #define DumpLane(t_, v_)    { uint32_t buf[8]; \
                                  STORE128u(buf[0], v_); \
                                  printf("%s %08x %08x %08x %08x\n", t_, buf[0], buf[1], buf[2], buf[3]); }

#else
    #define DUMP32(tt_, buf)
    #define DUMP32_12(tt_, buf)
    #define DumpLane(t_, v_)
#endif
517
+
518
+ ALIGN(32) static const uint32_t oAllFrom1_0[] = { 1, 2, 3, 4+0 };
519
+ ALIGN(32) static const uint32_t oAllFrom2_0[] = { 2, 3, 4+0, 4+1 };
520
+ ALIGN(32) static const uint32_t oAllFrom3_0[] = { 3, 4+0, 4+1, 4+2 };
521
+
522
+ ALIGN(32) static const uint32_t oLow64[] = { 0, 1, 4+0, 4+1 };
523
+ ALIGN(32) static const uint32_t oHigh64[] = { 2, 3, 4+2, 4+3 };
524
+
525
+ ALIGN(32) static const uint32_t oLow32[] = { 0, 4+0, 2, 4+2 };
526
+ ALIGN(32) static const uint32_t oHigh32[] = { 1, 4+1, 3, 4+3 };
527
+
528
+ ALIGN(32) static const uint32_t oGatherScatterOffsets[] = { 0*12, 1*12, 2*12, 3*12 };
529
+
530
+ void Xoodootimes4_InitializeAll(void *states)
531
+ {
532
+ memset(states, 0, Xoodootimes4_statesSizeInBytes);
533
+ }
534
+
535
+ void Xoodootimes4_AddBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length)
536
+ {
537
+ unsigned int sizeLeft = length;
538
+ unsigned int lanePosition = offset/SnP_laneLengthInBytes;
539
+ unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
540
+ const unsigned char *curData = data;
541
+ uint32_t *statesAsLanes = (uint32_t *)states;
542
+
543
+ if ((sizeLeft > 0) && (offsetInLane != 0)) {
544
+ unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
545
+ uint32_t lane = 0;
546
+ if (bytesInLane > sizeLeft)
547
+ bytesInLane = sizeLeft;
548
+ memcpy((unsigned char*)&lane + offsetInLane, curData, bytesInLane);
549
+ statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane;
550
+ sizeLeft -= bytesInLane;
551
+ lanePosition++;
552
+ curData += bytesInLane;
553
+ }
554
+
555
+ while(sizeLeft >= SnP_laneLengthInBytes) {
556
+ uint32_t lane = *((const uint32_t*)curData);
557
+ statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane;
558
+ sizeLeft -= SnP_laneLengthInBytes;
559
+ lanePosition++;
560
+ curData += SnP_laneLengthInBytes;
561
+ }
562
+
563
+ if (sizeLeft > 0) {
564
+ uint32_t lane = 0;
565
+ memcpy(&lane, curData, sizeLeft);
566
+ statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane;
567
+ }
568
+ }
569
+
570
+ void Xoodootimes4_AddLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
571
+ {
572
+ V128 *stateAsLanes = (V128 *)states;
573
+ unsigned int i;
574
+ const uint32_t *data32 = (const uint32_t *)data;
575
+ V128 offsets = SET4_32(0*laneOffset, 1*laneOffset, 2*laneOffset, 3*laneOffset);
576
+
577
+ #define Xor_In( argIndex ) stateAsLanes[argIndex] = XOR(stateAsLanes[argIndex], LOAD_GATHER4_32(offsets, &data32[argIndex]))
578
+
579
+ if ( laneCount == 12 ) {
580
+ Xor_In( 0 );
581
+ Xor_In( 1 );
582
+ Xor_In( 2 );
583
+ Xor_In( 3 );
584
+ Xor_In( 4 );
585
+ Xor_In( 5 );
586
+ Xor_In( 6 );
587
+ Xor_In( 7 );
588
+ Xor_In( 8 );
589
+ Xor_In( 9 );
590
+ Xor_In( 10 );
591
+ Xor_In( 11 );
592
+ }
593
+ else {
594
+ for(i=0; i<laneCount; i++)
595
+ Xor_In( i );
596
+ }
597
+ #undef Xor_In
598
+ }
599
+
600
+ void Xoodootimes4_OverwriteBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length)
601
+ {
602
+ unsigned int sizeLeft = length;
603
+ unsigned int lanePosition = offset/SnP_laneLengthInBytes;
604
+ unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
605
+ const unsigned char *curData = data;
606
+ uint32_t *statesAsLanes = (uint32_t *)states;
607
+
608
+ if ((sizeLeft > 0) && (offsetInLane != 0)) {
609
+ unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
610
+ if (bytesInLane > sizeLeft)
611
+ bytesInLane = sizeLeft;
612
+ memcpy( ((unsigned char *)&statesAsLanes[laneIndex(instanceIndex, lanePosition)]) + offsetInLane, curData, bytesInLane);
613
+ sizeLeft -= bytesInLane;
614
+ lanePosition++;
615
+ curData += bytesInLane;
616
+ }
617
+
618
+ while(sizeLeft >= SnP_laneLengthInBytes) {
619
+ uint32_t lane = *((const uint32_t*)curData);
620
+ statesAsLanes[laneIndex(instanceIndex, lanePosition)] = lane;
621
+ sizeLeft -= SnP_laneLengthInBytes;
622
+ lanePosition++;
623
+ curData += SnP_laneLengthInBytes;
624
+ }
625
+
626
+ if (sizeLeft > 0) {
627
+ memcpy(&statesAsLanes[laneIndex(instanceIndex, lanePosition)], curData, sizeLeft);
628
+ }
629
+ }
630
+
631
+ void Xoodootimes4_OverwriteLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
632
+ {
633
+ V128 *stateAsLanes = (V128 *)states;
634
+ unsigned int i;
635
+ const uint32_t *data32 = (const uint32_t *)data;
636
+ V128 offsets = SET4_32(0*laneOffset, 1*laneOffset, 2*laneOffset, 3*laneOffset);
637
+
638
+ #define OverWr( argIndex ) stateAsLanes[argIndex] = LOAD_GATHER4_32(offsets, &data32[argIndex])
639
+
640
+ if ( laneCount == 12 ) {
641
+ OverWr( 0 );
642
+ OverWr( 1 );
643
+ OverWr( 2 );
644
+ OverWr( 3 );
645
+ OverWr( 4 );
646
+ OverWr( 5 );
647
+ OverWr( 6 );
648
+ OverWr( 7 );
649
+ OverWr( 8 );
650
+ OverWr( 9 );
651
+ OverWr( 10 );
652
+ OverWr( 11 );
653
+ }
654
+ else {
655
+ for(i=0; i<laneCount; i++)
656
+ OverWr( i );
657
+ }
658
+ #undef OverWr
659
+ }
660
+
661
+ void Xoodootimes4_OverwriteWithZeroes(void *states, unsigned int instanceIndex, unsigned int byteCount)
662
+ {
663
+ unsigned int sizeLeft = byteCount;
664
+ unsigned int lanePosition = 0;
665
+ uint32_t *statesAsLanes = (uint32_t *)states;
666
+
667
+ while(sizeLeft >= SnP_laneLengthInBytes) {
668
+ statesAsLanes[laneIndex(instanceIndex, lanePosition)] = 0;
669
+ sizeLeft -= SnP_laneLengthInBytes;
670
+ lanePosition++;
671
+ }
672
+
673
+ if (sizeLeft > 0) {
674
+ memset(&statesAsLanes[laneIndex(instanceIndex, lanePosition)], 0, sizeLeft);
675
+ }
676
+ }
677
+
678
+ void Xoodootimes4_ExtractBytes(const void *states, unsigned int instanceIndex, unsigned char *data, unsigned int offset, unsigned int length)
679
+ {
680
+ unsigned int sizeLeft = length;
681
+ unsigned int lanePosition = offset/SnP_laneLengthInBytes;
682
+ unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
683
+ unsigned char *curData = data;
684
+ const uint32_t *statesAsLanes = (const uint32_t *)states;
685
+
686
+ if ((sizeLeft > 0) && (offsetInLane != 0)) {
687
+ unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
688
+ if (bytesInLane > sizeLeft)
689
+ bytesInLane = sizeLeft;
690
+ memcpy( curData, ((unsigned char *)&statesAsLanes[laneIndex(instanceIndex, lanePosition)]) + offsetInLane, bytesInLane);
691
+ sizeLeft -= bytesInLane;
692
+ lanePosition++;
693
+ curData += bytesInLane;
694
+ }
695
+
696
+ while(sizeLeft >= SnP_laneLengthInBytes) {
697
+ *(uint32_t*)curData = statesAsLanes[laneIndex(instanceIndex, lanePosition)];
698
+ sizeLeft -= SnP_laneLengthInBytes;
699
+ lanePosition++;
700
+ curData += SnP_laneLengthInBytes;
701
+ }
702
+
703
+ if (sizeLeft > 0) {
704
+ memcpy( curData, &statesAsLanes[laneIndex(instanceIndex, lanePosition)], sizeLeft);
705
+ }
706
+ }
707
+
708
+ void Xoodootimes4_ExtractLanesAll(const void *states, unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
709
+ {
710
+ const V128 *stateAsLanes = (const V128 *)states;
711
+ unsigned int i;
712
+ uint32_t *data32 = (uint32_t *)data;
713
+ V128 offsets = SET4_32(0*laneOffset, 1*laneOffset, 2*laneOffset, 3*laneOffset);
714
+
715
+ #define Extr( argIndex ) STORE_SCATTER4_32(offsets, stateAsLanes[argIndex], &data32[argIndex])
716
+
717
+ if ( laneCount == 12 ) {
718
+ Extr( 0 );
719
+ Extr( 1 );
720
+ Extr( 2 );
721
+ Extr( 3 );
722
+ Extr( 4 );
723
+ Extr( 5 );
724
+ Extr( 6 );
725
+ Extr( 7 );
726
+ Extr( 8 );
727
+ Extr( 9 );
728
+ Extr( 10 );
729
+ Extr( 11 );
730
+ }
731
+ else {
732
+ for(i=0; i<laneCount; i++)
733
+ Extr( i );
734
+ }
735
+ #undef Extr
736
+ }
737
+
738
+ void Xoodootimes4_ExtractAndAddBytes(const void *states, unsigned int instanceIndex, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length)
739
+ {
740
+ unsigned int sizeLeft = length;
741
+ unsigned int lanePosition = offset/SnP_laneLengthInBytes;
742
+ unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
743
+ const unsigned char *curInput = input;
744
+ unsigned char *curOutput = output;
745
+ const uint32_t *statesAsLanes = (const uint32_t *)states;
746
+
747
+ if ((sizeLeft > 0) && (offsetInLane != 0)) {
748
+ unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
749
+ uint32_t lane = statesAsLanes[laneIndex(instanceIndex, lanePosition)] >> (8 * offsetInLane);
750
+ if (bytesInLane > sizeLeft)
751
+ bytesInLane = sizeLeft;
752
+ sizeLeft -= bytesInLane;
753
+ do {
754
+ *(curOutput++) = *(curInput++) ^ (unsigned char)lane;
755
+ lane >>= 8;
756
+ } while ( --bytesInLane != 0);
757
+ lanePosition++;
758
+ }
759
+
760
+ while(sizeLeft >= SnP_laneLengthInBytes) {
761
+ *((uint32_t*)curOutput) = *((uint32_t*)curInput) ^ statesAsLanes[laneIndex(instanceIndex, lanePosition)];
762
+ sizeLeft -= SnP_laneLengthInBytes;
763
+ lanePosition++;
764
+ curInput += SnP_laneLengthInBytes;
765
+ curOutput += SnP_laneLengthInBytes;
766
+ }
767
+
768
+ if (sizeLeft != 0) {
769
+ uint32_t lane = statesAsLanes[laneIndex(instanceIndex, lanePosition)];
770
+ do {
771
+ *(curOutput++) = *(curInput++) ^ (unsigned char)lane;
772
+ lane >>= 8;
773
+ } while ( --sizeLeft != 0);
774
+ }
775
+ }
776
+
777
+ void Xoodootimes4_ExtractAndAddLanesAll(const void *states, const unsigned char *input, unsigned char *output, unsigned int laneCount, unsigned int laneOffset)
778
+ {
779
+ const V128 *stateAsLanes = (const V128 *)states;
780
+ unsigned int i;
781
+ const uint32_t *datai32 = (const uint32_t *)input;
782
+ uint32_t *datao32 = (uint32_t *)output;
783
+ V128 offsets = SET4_32(0*laneOffset, 1*laneOffset, 2*laneOffset, 3*laneOffset);
784
+
785
+ #define ExtrXor( argIndex ) STORE_SCATTER4_32(offsets, XOR( stateAsLanes[argIndex], LOAD_GATHER4_32(offsets, &datai32[argIndex])), &datao32[argIndex])
786
+
787
+ if ( laneCount == 12 ) {
788
+ ExtrXor( 0 );
789
+ ExtrXor( 1 );
790
+ ExtrXor( 2 );
791
+ ExtrXor( 3 );
792
+ ExtrXor( 4 );
793
+ ExtrXor( 5 );
794
+ ExtrXor( 6 );
795
+ ExtrXor( 7 );
796
+ ExtrXor( 8 );
797
+ ExtrXor( 9 );
798
+ ExtrXor( 10 );
799
+ ExtrXor( 11 );
800
+ }
801
+ else {
802
+ for(i=0; i<laneCount; i++) {
803
+ ExtrXor( i );
804
+ }
805
+ }
806
+ #undef ExtrXor
807
+ }
808
+
809
/*
 * Working registers for the round function: twelve state vectors aXY
 * (X = plane 0..2, Y = column 0..3) plus two scratch vectors.
 * The expansion already ends with ';'.
 */
#define DeclareVars     V128    a00, a01, a02, a03, \
                                a10, a11, a12, a13, \
                                a20, a21, a22, a23; \
                        V128    v1, v2;

/*
 * Load the state into the working registers with plane 1 pre-rotated by two
 * columns (states[4..7] go to a12, a13, a10, a11). Used as the entry naming
 * of the 6-round permutation.
 */
#define State2Vars2     a00 = states[ 0]; a01 = states[ 1]; a02 = states[ 2]; a03 = states[ 3]; \
                        a12 = states[ 4]; a13 = states[ 5]; a10 = states[ 6]; a11 = states[ 7]; \
                        a20 = states[ 8]; a21 = states[ 9]; a22 = states[10]; a23 = states[11]

/* Load the state into the working registers in natural order. */
#define State2Vars      a00 = states[ 0]; a01 = states[ 1]; a02 = states[ 2]; a03 = states[ 3]; \
                        a10 = states[ 4]; a11 = states[ 5]; a12 = states[ 6]; a13 = states[ 7]; \
                        a20 = states[ 8]; a21 = states[ 9]; a22 = states[10]; a23 = states[11]

/* Write the working registers back to the state in natural order. */
#define Vars2State      states[ 0] = a00; states[ 1] = a01; states[ 2] = a02; states[ 3] = a03; \
                        states[ 4] = a10; states[ 5] = a11; states[ 6] = a12; states[ 7] = a13; \
                        states[ 8] = a20; states[ 9] = a21; states[10] = a22; states[11] = a23
825
+
826
/*
 * One round on the working registers declared by DeclareVars.
 *
 * Plane 0 is always a00..a03. The other planes are passed by name so that
 * lane shifts cost nothing: a1Xi / a2Xi are the registers currently holding
 * columns 0..3 of planes 1 and 2, and a1Xw names the same plane-1 registers
 * in the order used after the rho-west column shift. The caller rotates the
 * argument lists from round to round instead of moving data.
 * roundConstant is XORed into a00 in all four instances.
 * v1 and v2 are scratch and alternate so that consecutive column parities
 * can overlap.
 */
#define Round(a10i, a11i, a12i, a13i, a10w, a11w, a12w, a13w, a20i, a21i, a22i, a23i, roundConstant) \
                                                            \
    /* Theta (column parity mixing), with Iota folded in */ \
    v1 = XOR3( a03, a13i, a23i );                           \
    v2 = XOR3( a00, a10i, a20i );                           \
    v1 = XOR( ROL32(v1, 5), ROL32(v1, 14) );                \
    a00 = XOR3( a00, v1, CONST4_32(roundConstant) ); /* Iota: round constant */ \
    a10i = XOR( a10i, v1 );                                 \
    a20i = XOR( a20i, v1 );                                 \
    v1 = XOR3( a01, a11i, a21i );                           \
    v2 = XOR( ROL32(v2, 5), ROL32(v2, 14) );                \
    a01 = XOR( a01, v2 );                                   \
    a11i = XOR( a11i, v2 );                                 \
    a21i = XOR( a21i, v2 );                                 \
    v2 = XOR3( a02, a12i, a22i );                           \
    v1 = XOR( ROL32(v1, 5), ROL32(v1, 14) );                \
    a02 = XOR( a02, v1 );                                   \
    a12i = XOR( a12i, v1 );                                 \
    a22i = XOR( a22i, v1 );                                 \
    v2 = XOR( ROL32(v2, 5), ROL32(v2, 14) );                \
    a03 = XOR( a03, v2 );                                   \
    a13i = XOR( a13i, v2 );                                 \
    a23i = XOR( a23i, v2 );                                 \
    Dump3("Theta",a);                                       \
                                                            \
    /* Rho-west: rotate plane 2 by 11 bits (plane 1 is shifted via the a1Xw names) */ \
    a20i = ROL32(a20i, 11);                                 \
    a21i = ROL32(a21i, 11);                                 \
    a22i = ROL32(a22i, 11);                                 \
    a23i = ROL32(a23i, 11);                                 \
    Dump3("Rho-west",a);                                    \
                                                            \
    /* Chi: non-linear step on each column, plane by plane */ \
    a00 = Chi(a00, a10w, a20i);                             \
    a01 = Chi(a01, a11w, a21i);                             \
    a02 = Chi(a02, a12w, a22i);                             \
    a03 = Chi(a03, a13w, a23i);                             \
    a10w = Chi(a10w, a20i, a00);                            \
    a11w = Chi(a11w, a21i, a01);                            \
    a12w = Chi(a12w, a22i, a02);                            \
    a13w = Chi(a13w, a23i, a03);                            \
    a20i = Chi(a20i, a00, a10w);                            \
    a21i = Chi(a21i, a01, a11w);                            \
    a22i = Chi(a22i, a02, a12w);                            \
    a23i = Chi(a23i, a03, a13w);                            \
    Dump3("Chi",a);                                         \
                                                            \
    /* Rho-east: rotate plane 1 by 1 bit and plane 2 by 8 bits */ \
    a10w = ROL32(a10w, 1);                                  \
    a11w = ROL32(a11w, 1);                                  \
    a12w = ROL32(a12w, 1);                                  \
    a13w = ROL32(a13w, 1);                                  \
    a20i = ROL32(a20i, 8);                                  \
    a21i = ROL32(a21i, 8);                                  \
    a22i = ROL32(a22i, 8);                                  \
    a23i = ROL32(a23i, 8);                                  \
    Dump3("Rho-east",a)
884
+
885
+ void Xoodootimes4_PermuteAll_6rounds(void *argstates)
886
+ {
887
+ V128 * states = (V128 *)argstates;
888
+ DeclareVars;
889
+
890
+ State2Vars2;
891
+ Round( a12, a13, a10, a11, a11, a12, a13, a10, a20, a21, a22, a23, _rc6 );
892
+ Round( a11, a12, a13, a10, a10, a11, a12, a13, a22, a23, a20, a21, _rc5 );
893
+ Round( a10, a11, a12, a13, a13, a10, a11, a12, a20, a21, a22, a23, _rc4 );
894
+ Round( a13, a10, a11, a12, a12, a13, a10, a11, a22, a23, a20, a21, _rc3 );
895
+ Round( a12, a13, a10, a11, a11, a12, a13, a10, a20, a21, a22, a23, _rc2 );
896
+ Round( a11, a12, a13, a10, a10, a11, a12, a13, a22, a23, a20, a21, _rc1 );
897
+ Dump2("Permutation\n", a);
898
+ Vars2State;
899
+ }
900
+
901
+ void Xoodootimes4_PermuteAll_12rounds(void *argstates)
902
+ {
903
+ V128 * states = (V128 *)argstates;
904
+ DeclareVars;
905
+
906
+ State2Vars;
907
+ Round( a10, a11, a12, a13, a13, a10, a11, a12, a20, a21, a22, a23, _rc12 );
908
+ Round( a13, a10, a11, a12, a12, a13, a10, a11, a22, a23, a20, a21, _rc11 );
909
+ Round( a12, a13, a10, a11, a11, a12, a13, a10, a20, a21, a22, a23, _rc10 );
910
+ Round( a11, a12, a13, a10, a10, a11, a12, a13, a22, a23, a20, a21, _rc9 );
911
+ Round( a10, a11, a12, a13, a13, a10, a11, a12, a20, a21, a22, a23, _rc8 );
912
+ Round( a13, a10, a11, a12, a12, a13, a10, a11, a22, a23, a20, a21, _rc7 );
913
+ Round( a12, a13, a10, a11, a11, a12, a13, a10, a20, a21, a22, a23, _rc6 );
914
+ Round( a11, a12, a13, a10, a10, a11, a12, a13, a22, a23, a20, a21, _rc5 );
915
+ Round( a10, a11, a12, a13, a13, a10, a11, a12, a20, a21, a22, a23, _rc4 );
916
+ Round( a13, a10, a11, a12, a12, a13, a10, a11, a22, a23, a20, a21, _rc3 );
917
+ Round( a12, a13, a10, a11, a11, a12, a13, a10, a20, a21, a22, a23, _rc2 );
918
+ Round( a11, a12, a13, a10, a10, a11, a12, a13, a22, a23, a20, a21, _rc1 );
919
+ Dump2("Permutation\n", a);
920
+ Vars2State;
921
+ }
922
+
923
+ void Xooffftimes4_AddIs(unsigned char *output, const unsigned char *input, size_t bitLen)
924
+ {
925
+ size_t byteLen = bitLen / 8;
926
+ V512 lanes1, lanes2, lanes3, lanes4;
927
+ V256 lanesA, lanesB;
928
+
929
+ while ( byteLen >= 128 ) {
930
+ lanes1 = LOAD512u(input[ 0]);
931
+ lanes2 = LOAD512u(input[64]);
932
+ lanes3 = LOAD512u(output[ 0]);
933
+ lanes4 = LOAD512u(output[64]);
934
+ lanes1 = XOR512(lanes1, lanes3);
935
+ lanes2 = XOR512(lanes2, lanes4);
936
+ STORE512u(output[ 0], lanes1);
937
+ STORE512u(output[64], lanes2);
938
+ input += 128;
939
+ output += 128;
940
+ byteLen -= 128;
941
+ }
942
+ while ( byteLen >= 32 ) {
943
+ lanesA = LOAD256u(input[0]);
944
+ lanesB = LOAD256u(output[0]);
945
+ input += 32;
946
+ lanesA = XOR256(lanesA, lanesB);
947
+ byteLen -= 32;
948
+ STORE256u(output[0], lanesA);
949
+ output += 32;
950
+ }
951
+ while ( byteLen >= 8 ) {
952
+ *((uint64_t*)output) ^= *((uint64_t*)input);
953
+ input += 8;
954
+ output += 8;
955
+ byteLen -= 8;
956
+ }
957
+ while ( byteLen-- != 0 ) {
958
+ *output++ ^= *input++;
959
+ }
960
+
961
+ bitLen &= 7;
962
+ if (bitLen != 0)
963
+ {
964
+ *output ^= *input;
965
+ *output &= (1 << bitLen) - 1;
966
+ }
967
+ }
968
+
969
+ size_t Xooffftimes4_CompressFastLoop(unsigned char *k, unsigned char *x, const unsigned char *input, size_t length)
970
+ {
971
+ DeclareVars;
972
+ uint32_t *k32 = (uint32_t*)k;
973
+ uint32_t *x32 = (uint32_t*)x;
974
+ uint32_t *i32 = (uint32_t*)input;
975
+ size_t initialLength;
976
+ V128 r0481;
977
+ V128 r5926;
978
+ V128 ra37b;
979
+ V128 offsets;
980
+ V128 x00, x01, x02, x03, x10, x11, x12, x13, x20, x21, x22, x23;
981
+ V256 x256;
982
+ V512 x512;
983
+
984
+ DUMP32("k32",k32);
985
+ r0481 = LOAD_GATHER4_32(LOAD4_32( 0, 4, 8, 1), k32);
986
+ r5926 = LOAD_GATHER4_32(LOAD4_32( 5, 9, 2, 6), k32);
987
+ ra37b = LOAD_GATHER4_32(LOAD4_32( 10, 3, 7, 11), k32);
988
+
989
+ offsets = *(V128*)oGatherScatterOffsets;
990
+
991
+ x00 = _mm_setzero_si128();
992
+ x01 = _mm_setzero_si128();
993
+ x02 = _mm_setzero_si128();
994
+ x03 = _mm_setzero_si128();
995
+ x10 = _mm_setzero_si128();
996
+ x11 = _mm_setzero_si128();
997
+ x12 = _mm_setzero_si128();
998
+ x13 = _mm_setzero_si128();
999
+ x20 = _mm_setzero_si128();
1000
+ x21 = _mm_setzero_si128();
1001
+ x22 = _mm_setzero_si128();
1002
+ x23 = _mm_setzero_si128();
1003
+ initialLength = length;
1004
+ do {
1005
+ #define rCGKD ra37b
1006
+
1007
+ /* Note that a10-a12 and a11-a13 are swapped */
1008
+ a00 = r0481;
1009
+ a13 = r5926;
1010
+ a22 = ra37b;
1011
+
1012
+ a12 = _mm_permutex2var_epi32(a00, *(const V128*)oAllFrom1_0, r5926); /* 481 5 */
1013
+
1014
+ r0481 = r5926;
1015
+ r5926 = ra37b;
1016
+ rCGKD = XOR3(a00, SHL32(a00, 13), ROL32(a12, 3));
1017
+
1018
+ a01 = _mm_permutex2var_epi32(a00, *(const V128*)oAllFrom3_0, a13); /* 1 592 */
1019
+ a02 = _mm_permutex2var_epi32(a13, *(const V128*)oAllFrom2_0, a22); /* 26 a3 */
1020
+ a03 = _mm_permutex2var_epi32(a22, *(const V128*)oAllFrom1_0, rCGKD); /* 37b c */
1021
+
1022
+ a10 = _mm_permutex2var_epi32(a13, *(const V128*)oAllFrom3_0, a22); /* 6 a37 */
1023
+ a11 = _mm_permutex2var_epi32(a22, *(const V128*)oAllFrom2_0, rCGKD); /* 7b cg */
1024
+
1025
+ a20 = _mm_permutex2var_epi32(a00, *(const V128*)oAllFrom2_0, a13); /* 81 59 */
1026
+ a21 = _mm_permutex2var_epi32(a13, *(const V128*)oAllFrom1_0, a22); /* 926 a */
1027
+ a23 = _mm_permutex2var_epi32(a22, *(const V128*)oAllFrom3_0, rCGKD); /* b cgk */
1028
+ Dump("Roll-c", a);
1029
+
1030
+ a00 = XOR( a00, LOAD_GATHER4_32(offsets, i32+0));
1031
+ a01 = XOR( a01, LOAD_GATHER4_32(offsets, i32+1));
1032
+ a02 = XOR( a02, LOAD_GATHER4_32(offsets, i32+2));
1033
+ a03 = XOR( a03, LOAD_GATHER4_32(offsets, i32+3));
1034
+ a12 = XOR( a12, LOAD_GATHER4_32(offsets, i32+4));
1035
+ a13 = XOR( a13, LOAD_GATHER4_32(offsets, i32+5));
1036
+ a10 = XOR( a10, LOAD_GATHER4_32(offsets, i32+6));
1037
+ a11 = XOR( a11, LOAD_GATHER4_32(offsets, i32+7));
1038
+ a20 = XOR( a20, LOAD_GATHER4_32(offsets, i32+8));
1039
+ a21 = XOR( a21, LOAD_GATHER4_32(offsets, i32+9));
1040
+ a22 = XOR( a22, LOAD_GATHER4_32(offsets, i32+10));
1041
+ a23 = XOR( a23, LOAD_GATHER4_32(offsets, i32+11));
1042
+ Dump("Input Xoodoo (after add)", a);
1043
+
1044
+ Round( a12, a13, a10, a11, a11, a12, a13, a10, a20, a21, a22, a23, _rc6 );
1045
+ Round( a11, a12, a13, a10, a10, a11, a12, a13, a22, a23, a20, a21, _rc5 );
1046
+ Round( a10, a11, a12, a13, a13, a10, a11, a12, a20, a21, a22, a23, _rc4 );
1047
+ Round( a13, a10, a11, a12, a12, a13, a10, a11, a22, a23, a20, a21, _rc3 );
1048
+ Round( a12, a13, a10, a11, a11, a12, a13, a10, a20, a21, a22, a23, _rc2 );
1049
+ Round( a11, a12, a13, a10, a10, a11, a12, a13, a22, a23, a20, a21, _rc1 );
1050
+ Dump("Output Xoodoo", a);
1051
+
1052
+ x00 = XOR(x00, a00);
1053
+ x01 = XOR(x01, a01);
1054
+ x02 = XOR(x02, a02);
1055
+ x03 = XOR(x03, a03);
1056
+ x10 = XOR(x10, a10);
1057
+ x11 = XOR(x11, a11);
1058
+ x12 = XOR(x12, a12);
1059
+ x13 = XOR(x13, a13);
1060
+ x20 = XOR(x20, a20);
1061
+ x21 = XOR(x21, a21);
1062
+ x22 = XOR(x22, a22);
1063
+ x23 = XOR(x23, a23);
1064
+ Dump("Accu x", x);
1065
+
1066
+ i32 += NLANES*4;
1067
+ length -= NLANES*4*4;
1068
+ }
1069
+ while (length >= (NLANES*4*4));
1070
+
1071
+ /* Reduce from 4 lanes to 2 */
1072
+ v1 = *(V128*)oLow64;
1073
+ v2 = *(V128*)oHigh64;
1074
+ x00 = XOR(_mm_permutex2var_epi32(x00, v1, x02), _mm_permutex2var_epi32(x00, v2, x02));
1075
+ x01 = XOR(_mm_permutex2var_epi32(x01, v1, x03), _mm_permutex2var_epi32(x01, v2, x03));
1076
+ x10 = XOR(_mm_permutex2var_epi32(x10, v1, x12), _mm_permutex2var_epi32(x10, v2, x12));
1077
+ x11 = XOR(_mm_permutex2var_epi32(x11, v1, x13), _mm_permutex2var_epi32(x11, v2, x13));
1078
+ x20 = XOR(_mm_permutex2var_epi32(x20, v1, x22), _mm_permutex2var_epi32(x20, v2, x22));
1079
+ x21 = XOR(_mm_permutex2var_epi32(x21, v1, x23), _mm_permutex2var_epi32(x21, v2, x23));
1080
+
1081
+ /* Reduce from 2 lanes to 1 */
1082
+ v1 = *( V128*)oLow32;
1083
+ v2 = *( V128*)oHigh32;
1084
+ x00 = XOR(_mm_permutex2var_epi32(x00, v1, x01), _mm_permutex2var_epi32(x00, v2, x01));
1085
+ x10 = XOR(_mm_permutex2var_epi32(x10, v1, x11), _mm_permutex2var_epi32(x10, v2, x11));
1086
+ x20 = XOR(_mm_permutex2var_epi32(x20, v1, x21), _mm_permutex2var_epi32(x20, v2, x21));
1087
+
1088
+ /* Combine x00 and x20 */
1089
+ x256 = _mm256_inserti128_si256 (_mm256_castsi128_si256(x00), x10, 1);
1090
+
1091
+ /* Combine (x00,x01) and x20 */
1092
+ x512 = _mm512_inserti64x4 (_mm512_castsi256_si512(x256), _mm256_castsi128_si256(x20), 1);
1093
+
1094
+ /* load xAccu, xor and store 12 lanes */
1095
+ x512 = XOR512(x512, _mm512_maskz_load_epi64(0x3F, x32));
1096
+ _mm512_mask_store_epi64(x32, 0x3F, x512);
1097
+ DUMP32_12("x32",x32);
1098
+
1099
+ /* Save new k */
1100
+ _mm_i32scatter_epi32(k32, LOAD4_32( 0, 4, 8, 1), r0481, 4);
1101
+ _mm_i32scatter_epi32(k32, LOAD4_32( 5, 9, 2, 6), r5926, 4);
1102
+ _mm_i32scatter_epi32(k32, LOAD4_32(10, 3, 7, 11), ra37b, 4);
1103
+ DUMP32_12( "k32", k32);
1104
+
1105
+ return initialLength - length;
1106
+ }
1107
+
1108
+ size_t Xooffftimes4_ExpandFastLoop(unsigned char *yAccu, const unsigned char *kRoll, unsigned char *output, size_t length)
1109
+ {
1110
+ DeclareVars;
1111
+ uint32_t *k32 = (uint32_t*)kRoll;
1112
+ uint32_t *y32 = (uint32_t*)yAccu;
1113
+ uint32_t *o32 = (uint32_t*)output;
1114
+ size_t initialLength;
1115
+ V128 r0481;
1116
+ V128 r5926;
1117
+ V128 ra37b;
1118
+ V128 offsets;
1119
+
1120
+ r0481 = LOAD_GATHER4_32(LOAD4_32( 0, 4, 8, 1), y32);
1121
+ r5926 = LOAD_GATHER4_32(LOAD4_32( 5, 9, 2, 6), y32);
1122
+ ra37b = LOAD_GATHER4_32(LOAD4_32( 10, 3, 7, 11), y32);
1123
+
1124
+ offsets = *(V128*)oGatherScatterOffsets;
1125
+
1126
+ initialLength = length;
1127
+ do {
1128
+ #define rCGKD ra37b
1129
+
1130
+ /* Note that a10-a12 and a11-a13 are swapped */
1131
+ a00 = r0481;
1132
+ a13 = r5926;
1133
+ a22 = ra37b;
1134
+
1135
+ a12 = _mm_permutex2var_epi32(a00, *(const V128*)oAllFrom1_0, r5926); /* 481 5 */
1136
+ a20 = _mm_permutex2var_epi32(a00, *(const V128*)oAllFrom2_0, a13); /* 81 59 */
1137
+
1138
+ r0481 = r5926;
1139
+ r5926 = ra37b;
1140
+ rCGKD = XOR3(ROL32(a00, 5), ROL32(a12, 13), AND(a20, a12));
1141
+ rCGKD = XOR(rCGKD, CONST4_32(7));
1142
+
1143
+ a01 = _mm_permutex2var_epi32(a00, *(const V128*)oAllFrom3_0, a13); /* 1 592 */
1144
+ a02 = _mm_permutex2var_epi32(a13, *(const V128*)oAllFrom2_0, a22); /* 26 a3 */
1145
+ a03 = _mm_permutex2var_epi32(a22, *(const V128*)oAllFrom1_0, rCGKD); /* 37b c */
1146
+
1147
+ a10 = _mm_permutex2var_epi32(a13, *(const V128*)oAllFrom3_0, a22); /* 6 a37 */
1148
+ a11 = _mm_permutex2var_epi32(a22, *(const V128*)oAllFrom2_0, rCGKD); /* 7b cg */
1149
+
1150
+ a21 = _mm_permutex2var_epi32(a13, *(const V128*)oAllFrom1_0, a22); /* 926 a */
1151
+ a23 = _mm_permutex2var_epi32(a22, *(const V128*)oAllFrom3_0, rCGKD); /* b cgk */
1152
+ Dump("Roll-e", a);
1153
+
1154
+ Round( a12, a13, a10, a11, a11, a12, a13, a10, a20, a21, a22, a23, _rc6 );
1155
+ Round( a11, a12, a13, a10, a10, a11, a12, a13, a22, a23, a20, a21, _rc5 );
1156
+ Round( a10, a11, a12, a13, a13, a10, a11, a12, a20, a21, a22, a23, _rc4 );
1157
+ Round( a13, a10, a11, a12, a12, a13, a10, a11, a22, a23, a20, a21, _rc3 );
1158
+ Round( a12, a13, a10, a11, a11, a12, a13, a10, a20, a21, a22, a23, _rc2 );
1159
+ Round( a11, a12, a13, a10, a10, a11, a12, a13, a22, a23, a20, a21, _rc1 );
1160
+ Dump("Xoodoo(y)", a);
1161
+
1162
+ a00 = XOR(a00, CONST4_32(k32[0]));
1163
+ a01 = XOR(a01, CONST4_32(k32[1]));
1164
+ a02 = XOR(a02, CONST4_32(k32[2]));
1165
+ a03 = XOR(a03, CONST4_32(k32[3]));
1166
+ a10 = XOR(a10, CONST4_32(k32[4]));
1167
+ a11 = XOR(a11, CONST4_32(k32[5]));
1168
+ a12 = XOR(a12, CONST4_32(k32[6]));
1169
+ a13 = XOR(a13, CONST4_32(k32[7]));
1170
+ a20 = XOR(a20, CONST4_32(k32[8]));
1171
+ a21 = XOR(a21, CONST4_32(k32[9]));
1172
+ a22 = XOR(a22, CONST4_32(k32[10]));
1173
+ a23 = XOR(a23, CONST4_32(k32[11]));
1174
+ Dump("Xoodoo(y) + kRoll", a);
1175
+
1176
+ /* Extract */
1177
+ STORE_SCATTER4_32(offsets, a00, o32+0);
1178
+ STORE_SCATTER4_32(offsets, a01, o32+1);
1179
+ STORE_SCATTER4_32(offsets, a02, o32+2);
1180
+ STORE_SCATTER4_32(offsets, a03, o32+3);
1181
+ STORE_SCATTER4_32(offsets, a10, o32+4);
1182
+ STORE_SCATTER4_32(offsets, a11, o32+5);
1183
+ STORE_SCATTER4_32(offsets, a12, o32+6);
1184
+ STORE_SCATTER4_32(offsets, a13, o32+7);
1185
+ STORE_SCATTER4_32(offsets, a20, o32+8);
1186
+ STORE_SCATTER4_32(offsets, a21, o32+9);
1187
+ STORE_SCATTER4_32(offsets, a22, o32+10);
1188
+ STORE_SCATTER4_32(offsets, a23, o32+11);
1189
+
1190
+ o32 += NLANES*4;
1191
+ length -= NLANES*4*4;
1192
+ }
1193
+ while (length >= (NLANES*4*4));
1194
+
1195
+ /* Save new y */
1196
+ _mm_i32scatter_epi32(y32, LOAD4_32( 0, 4, 8, 1), r0481, 4);
1197
+ _mm_i32scatter_epi32(y32, LOAD4_32( 5, 9, 2, 6), r5926, 4);
1198
+ _mm_i32scatter_epi32(y32, LOAD4_32(10, 3, 7, 11), ra37b, 4);
1199
+ DUMP32_12( "y32", y32);
1200
+
1201
+ return initialLength - length;
1202
+ }