sleeping_kangaroo12 0.0.3 → 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (291) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +34 -67
  3. data/ext/Rakefile +12 -37
  4. data/ext/binding/sleeping_kangaroo12.c +1 -16
  5. data/ext/{xkcp → k12}/Makefile +0 -0
  6. data/ext/k12/Makefile.build +118 -0
  7. data/ext/k12/README.markdown +86 -0
  8. data/ext/k12/lib/ARMv8Asha3/KeccakP-1600-ARMv8Asha3.S +623 -0
  9. data/ext/k12/lib/ARMv8Asha3/KeccakP-1600-SnP.h +65 -0
  10. data/ext/k12/lib/ARMv8Asha3/KeccakP-1600-opt64.c +227 -0
  11. data/ext/{xkcp/lib/low/KeccakP-1600/compact → k12/lib/Inplace32BI}/KeccakP-1600-SnP.h +4 -9
  12. data/ext/{xkcp/lib/low/KeccakP-1600/plain-32bits-inplace → k12/lib/Inplace32BI}/KeccakP-1600-inplace32BI.c +65 -160
  13. data/ext/k12/lib/KangarooTwelve.c +332 -0
  14. data/ext/{xkcp/lib/high/KangarooTwelve → k12/lib}/KangarooTwelve.h +53 -16
  15. data/ext/{xkcp/lib/low/KeccakP-1600/AVX2 → k12/lib/Optimized64}/KeccakP-1600-AVX2.s +122 -558
  16. data/ext/k12/lib/Optimized64/KeccakP-1600-AVX512-plainC.c +241 -0
  17. data/ext/k12/lib/Optimized64/KeccakP-1600-AVX512.s +551 -0
  18. data/ext/k12/lib/Optimized64/KeccakP-1600-SnP.h +74 -0
  19. data/ext/{xkcp/lib/low/KeccakP-1600/common/KeccakP-1600-64.macros → k12/lib/Optimized64/KeccakP-1600-opt64.c} +447 -169
  20. data/ext/k12/lib/Optimized64/KeccakP-1600-runtimeDispatch.c +406 -0
  21. data/ext/k12/lib/Optimized64/KeccakP-1600-timesN-AVX2.c +419 -0
  22. data/ext/k12/lib/Optimized64/KeccakP-1600-timesN-AVX512.c +458 -0
  23. data/ext/k12/lib/Optimized64/KeccakP-1600-timesN-SSSE3.c +438 -0
  24. data/ext/{xkcp/lib/low/KeccakP-1600/plain-64bits → k12/lib/Plain64}/KeccakP-1600-SnP.h +14 -20
  25. data/ext/{xkcp/lib/low/KeccakP-1600/ref-64bits/KeccakP-1600-reference.h → k12/lib/Plain64/KeccakP-1600-plain64.c} +9 -8
  26. data/ext/{xkcp/lib/common → k12/lib}/align.h +3 -2
  27. data/ext/{xkcp/lib/common → k12/lib}/brg_endian.h +0 -0
  28. data/ext/{xkcp → k12}/support/Build/ExpandProducts.xsl +0 -0
  29. data/ext/{xkcp → k12}/support/Build/ToGlobalMakefile.xsl +0 -0
  30. data/ext/{xkcp → k12}/support/Build/ToOneTarget.xsl +0 -0
  31. data/ext/{xkcp → k12}/support/Build/ToTargetConfigFile.xsl +0 -0
  32. data/ext/{xkcp → k12}/support/Build/ToTargetMakefile.xsl +10 -16
  33. data/ext/{xkcp → k12}/support/Build/ToVCXProj.xsl +0 -0
  34. data/lib/sleeping_kangaroo12/version.rb +1 -1
  35. metadata +33 -276
  36. data/ext/config/xkcp.build +0 -17
  37. data/ext/xkcp/LICENSE +0 -1
  38. data/ext/xkcp/Makefile.build +0 -200
  39. data/ext/xkcp/README.markdown +0 -296
  40. data/ext/xkcp/lib/HighLevel.build +0 -143
  41. data/ext/xkcp/lib/LowLevel.build +0 -757
  42. data/ext/xkcp/lib/high/KangarooTwelve/KangarooTwelve.c +0 -301
  43. data/ext/xkcp/lib/high/Keccak/FIPS202/KeccakHash.c +0 -81
  44. data/ext/xkcp/lib/high/Keccak/FIPS202/KeccakHash.h +0 -125
  45. data/ext/xkcp/lib/high/Keccak/FIPS202/SimpleFIPS202.c +0 -48
  46. data/ext/xkcp/lib/high/Keccak/FIPS202/SimpleFIPS202.h +0 -79
  47. data/ext/xkcp/lib/high/Keccak/KeccakDuplex.c +0 -81
  48. data/ext/xkcp/lib/high/Keccak/KeccakDuplex.h +0 -73
  49. data/ext/xkcp/lib/high/Keccak/KeccakDuplex.inc +0 -195
  50. data/ext/xkcp/lib/high/Keccak/KeccakSponge.c +0 -111
  51. data/ext/xkcp/lib/high/Keccak/KeccakSponge.h +0 -76
  52. data/ext/xkcp/lib/high/Keccak/KeccakSponge.inc +0 -314
  53. data/ext/xkcp/lib/high/Keccak/PRG/KeccakPRG.c +0 -61
  54. data/ext/xkcp/lib/high/Keccak/PRG/KeccakPRG.h +0 -67
  55. data/ext/xkcp/lib/high/Keccak/PRG/KeccakPRG.inc +0 -128
  56. data/ext/xkcp/lib/high/Keccak/SP800-185/SP800-185.c +0 -93
  57. data/ext/xkcp/lib/high/Keccak/SP800-185/SP800-185.h +0 -599
  58. data/ext/xkcp/lib/high/Keccak/SP800-185/SP800-185.inc +0 -573
  59. data/ext/xkcp/lib/high/Ketje/Ketjev2.c +0 -87
  60. data/ext/xkcp/lib/high/Ketje/Ketjev2.h +0 -88
  61. data/ext/xkcp/lib/high/Ketje/Ketjev2.inc +0 -274
  62. data/ext/xkcp/lib/high/Keyak/Keyakv2.c +0 -132
  63. data/ext/xkcp/lib/high/Keyak/Keyakv2.h +0 -217
  64. data/ext/xkcp/lib/high/Keyak/Keyakv2.inc +0 -81
  65. data/ext/xkcp/lib/high/Keyak/Motorist.inc +0 -953
  66. data/ext/xkcp/lib/high/Kravatte/Kravatte.c +0 -533
  67. data/ext/xkcp/lib/high/Kravatte/Kravatte.h +0 -115
  68. data/ext/xkcp/lib/high/Kravatte/KravatteModes.c +0 -557
  69. data/ext/xkcp/lib/high/Kravatte/KravatteModes.h +0 -247
  70. data/ext/xkcp/lib/high/Xoodyak/Cyclist.h +0 -66
  71. data/ext/xkcp/lib/high/Xoodyak/Cyclist.inc +0 -336
  72. data/ext/xkcp/lib/high/Xoodyak/Xoodyak-parameters.h +0 -26
  73. data/ext/xkcp/lib/high/Xoodyak/Xoodyak.c +0 -55
  74. data/ext/xkcp/lib/high/Xoodyak/Xoodyak.h +0 -35
  75. data/ext/xkcp/lib/high/Xoofff/Xoofff.c +0 -634
  76. data/ext/xkcp/lib/high/Xoofff/Xoofff.h +0 -147
  77. data/ext/xkcp/lib/high/Xoofff/XoofffModes.c +0 -483
  78. data/ext/xkcp/lib/high/Xoofff/XoofffModes.h +0 -241
  79. data/ext/xkcp/lib/high/common/Phases.h +0 -25
  80. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-SnP.h +0 -41
  81. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-inplace-32bi-armv6m-le-armcc.s +0 -1666
  82. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-inplace-32bi-armv6m-le-gcc.s +0 -1655
  83. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-inplace-32bi-armv7a-le-armcc.s +0 -1268
  84. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-inplace-32bi-armv7a-le-gcc.s +0 -1264
  85. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-inplace-32bi-armv7m-le-armcc.s +0 -1178
  86. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-inplace-32bi-armv7m-le-gcc.s +0 -1175
  87. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-u1-32bi-armv6m-le-armcc.s +0 -1338
  88. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-u1-32bi-armv6m-le-gcc.s +0 -1336
  89. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-u2-32bi-armv6m-le-armcc.s +0 -1343
  90. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-u2-32bi-armv6m-le-gcc.s +0 -1339
  91. data/ext/xkcp/lib/low/KeccakP-1600/ARMv7A-NEON/KeccakP-1600-SnP.h +0 -42
  92. data/ext/xkcp/lib/low/KeccakP-1600/ARMv7A-NEON/KeccakP-1600-armv7a-le-neon-armcc.s +0 -823
  93. data/ext/xkcp/lib/low/KeccakP-1600/ARMv7A-NEON/KeccakP-1600-armv7a-le-neon-gcc.s +0 -831
  94. data/ext/xkcp/lib/low/KeccakP-1600/ARMv8A/KeccakP-1600-SnP.h +0 -31
  95. data/ext/xkcp/lib/low/KeccakP-1600/ARMv8A/KeccakP-1600-armv8a-neon.s +0 -540
  96. data/ext/xkcp/lib/low/KeccakP-1600/AVR8/KeccakP-1600-SnP.h +0 -42
  97. data/ext/xkcp/lib/low/KeccakP-1600/AVR8/KeccakP-1600-avr8-compact.s +0 -733
  98. data/ext/xkcp/lib/low/KeccakP-1600/AVR8/KeccakP-1600-avr8-fast.s +0 -1121
  99. data/ext/xkcp/lib/low/KeccakP-1600/AVX2/KeccakP-1600-SnP.h +0 -52
  100. data/ext/xkcp/lib/low/KeccakP-1600/AVX512/C/KeccakP-1600-AVX512.c +0 -623
  101. data/ext/xkcp/lib/low/KeccakP-1600/AVX512/C/KeccakP-1600-SnP.h +0 -47
  102. data/ext/xkcp/lib/low/KeccakP-1600/AVX512/C/u12/KeccakP-1600-AVX512-config.h +0 -6
  103. data/ext/xkcp/lib/low/KeccakP-1600/AVX512/C/u6/KeccakP-1600-AVX512-config.h +0 -6
  104. data/ext/xkcp/lib/low/KeccakP-1600/AVX512/C/ua/KeccakP-1600-AVX512-config.h +0 -6
  105. data/ext/xkcp/lib/low/KeccakP-1600/AVX512/KeccakP-1600-AVX512.s +0 -1031
  106. data/ext/xkcp/lib/low/KeccakP-1600/AVX512/KeccakP-1600-SnP.h +0 -53
  107. data/ext/xkcp/lib/low/KeccakP-1600/XOP/KeccakP-1600-SnP.h +0 -44
  108. data/ext/xkcp/lib/low/KeccakP-1600/XOP/KeccakP-1600-XOP.c +0 -476
  109. data/ext/xkcp/lib/low/KeccakP-1600/XOP/u6/KeccakP-1600-XOP-config.h +0 -6
  110. data/ext/xkcp/lib/low/KeccakP-1600/XOP/ua/KeccakP-1600-XOP-config.h +0 -6
  111. data/ext/xkcp/lib/low/KeccakP-1600/common/KeccakP-1600-unrolling.macros +0 -305
  112. data/ext/xkcp/lib/low/KeccakP-1600/compact/KeccakP-1600-compact64.c +0 -420
  113. data/ext/xkcp/lib/low/KeccakP-1600/plain-32bits-inplace/KeccakP-1600-SnP.h +0 -43
  114. data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/KeccakP-1600-opt64.c +0 -565
  115. data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/lcu6/KeccakP-1600-opt64-config.h +0 -7
  116. data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/lcua/KeccakP-1600-opt64-config.h +0 -7
  117. data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/lcua-shld/KeccakP-1600-opt64-config.h +0 -8
  118. data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/u6/KeccakP-1600-opt64-config.h +0 -6
  119. data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/ua/KeccakP-1600-opt64-config.h +0 -6
  120. data/ext/xkcp/lib/low/KeccakP-1600/ref-32bits/KeccakP-1600-SnP.h +0 -44
  121. data/ext/xkcp/lib/low/KeccakP-1600/ref-32bits/KeccakP-1600-reference.h +0 -23
  122. data/ext/xkcp/lib/low/KeccakP-1600/ref-32bits/KeccakP-1600-reference32BI.c +0 -625
  123. data/ext/xkcp/lib/low/KeccakP-1600/ref-64bits/KeccakP-1600-SnP.h +0 -44
  124. data/ext/xkcp/lib/low/KeccakP-1600/ref-64bits/KeccakP-1600-reference.c +0 -440
  125. data/ext/xkcp/lib/low/KeccakP-1600/x86-64/KeccakP-1600-SnP.h +0 -42
  126. data/ext/xkcp/lib/low/KeccakP-1600/x86-64/KeccakP-1600-x86-64-gas.s +0 -1196
  127. data/ext/xkcp/lib/low/KeccakP-1600/x86-64/KeccakP-1600-x86-64-gas_Apple.s +0 -1124
  128. data/ext/xkcp/lib/low/KeccakP-1600/x86-64/KeccakP-1600-x86-64-shld-gas.s +0 -1196
  129. data/ext/xkcp/lib/low/KeccakP-1600-times2/ARMv7A-NEON/KeccakP-1600-inplace-pl2-armv7a-neon-le-armcc.s +0 -1392
  130. data/ext/xkcp/lib/low/KeccakP-1600-times2/ARMv7A-NEON/KeccakP-1600-inplace-pl2-armv7a-neon-le-gcc.s +0 -1394
  131. data/ext/xkcp/lib/low/KeccakP-1600-times2/ARMv7A-NEON/KeccakP-1600-times2-SnP.h +0 -42
  132. data/ext/xkcp/lib/low/KeccakP-1600-times2/AVX512/AVX512u12/SIMD512-2-config.h +0 -7
  133. data/ext/xkcp/lib/low/KeccakP-1600-times2/AVX512/AVX512u4/SIMD512-2-config.h +0 -7
  134. data/ext/xkcp/lib/low/KeccakP-1600-times2/AVX512/AVX512ufull/SIMD512-2-config.h +0 -7
  135. data/ext/xkcp/lib/low/KeccakP-1600-times2/AVX512/KeccakP-1600-times2-SIMD512.c +0 -850
  136. data/ext/xkcp/lib/low/KeccakP-1600-times2/AVX512/KeccakP-1600-times2-SnP.h +0 -51
  137. data/ext/xkcp/lib/low/KeccakP-1600-times2/SIMD128/KeccakP-1600-times2-SIMD128.c +0 -957
  138. data/ext/xkcp/lib/low/KeccakP-1600-times2/SIMD128/KeccakP-1600-times2-SnP.h +0 -49
  139. data/ext/xkcp/lib/low/KeccakP-1600-times2/SIMD128/SSSE3-u2/SIMD128-config.h +0 -8
  140. data/ext/xkcp/lib/low/KeccakP-1600-times2/SIMD128/SSSE3-ua/SIMD128-config.h +0 -8
  141. data/ext/xkcp/lib/low/KeccakP-1600-times2/SIMD128/XOP-u2/SIMD128-config.h +0 -9
  142. data/ext/xkcp/lib/low/KeccakP-1600-times2/SIMD128/XOP-ua/SIMD128-config.h +0 -9
  143. data/ext/xkcp/lib/low/KeccakP-1600-times2/fallback-on1/KeccakP-1600-times2-SnP.h +0 -45
  144. data/ext/xkcp/lib/low/KeccakP-1600-times2/fallback-on1/KeccakP-1600-times2-on1.c +0 -37
  145. data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX2/KeccakP-1600-times4-SIMD256.c +0 -1321
  146. data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX2/KeccakP-1600-times4-SnP.h +0 -55
  147. data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX2/u12/SIMD256-config.h +0 -7
  148. data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX2/u6/SIMD256-config.h +0 -7
  149. data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX2/ua/SIMD256-config.h +0 -7
  150. data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX512/AVX512u12/SIMD512-4-config.h +0 -7
  151. data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX512/AVX512u4/SIMD512-4-config.h +0 -7
  152. data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX512/AVX512ufull/SIMD512-4-config.h +0 -7
  153. data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX512/KeccakP-1600-times4-SIMD512.c +0 -881
  154. data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX512/KeccakP-1600-times4-SnP.h +0 -51
  155. data/ext/xkcp/lib/low/KeccakP-1600-times4/fallback-on1/KeccakP-1600-times4-SnP.h +0 -45
  156. data/ext/xkcp/lib/low/KeccakP-1600-times4/fallback-on1/KeccakP-1600-times4-on1.c +0 -37
  157. data/ext/xkcp/lib/low/KeccakP-1600-times4/fallback-on2/KeccakP-1600-times4-SnP.h +0 -45
  158. data/ext/xkcp/lib/low/KeccakP-1600-times4/fallback-on2/KeccakP-1600-times4-on2.c +0 -38
  159. data/ext/xkcp/lib/low/KeccakP-1600-times8/AVX512/KeccakP-1600-times8-SIMD512.c +0 -1615
  160. data/ext/xkcp/lib/low/KeccakP-1600-times8/AVX512/KeccakP-1600-times8-SnP.h +0 -57
  161. data/ext/xkcp/lib/low/KeccakP-1600-times8/AVX512/u12/SIMD512-config.h +0 -7
  162. data/ext/xkcp/lib/low/KeccakP-1600-times8/AVX512/u4/SIMD512-config.h +0 -7
  163. data/ext/xkcp/lib/low/KeccakP-1600-times8/AVX512/ua/SIMD512-config.h +0 -7
  164. data/ext/xkcp/lib/low/KeccakP-1600-times8/fallback-on1/KeccakP-1600-times8-SnP.h +0 -45
  165. data/ext/xkcp/lib/low/KeccakP-1600-times8/fallback-on1/KeccakP-1600-times8-on1.c +0 -37
  166. data/ext/xkcp/lib/low/KeccakP-1600-times8/fallback-on2/KeccakP-1600-times8-SnP.h +0 -45
  167. data/ext/xkcp/lib/low/KeccakP-1600-times8/fallback-on2/KeccakP-1600-times8-on2.c +0 -38
  168. data/ext/xkcp/lib/low/KeccakP-1600-times8/fallback-on4/KeccakP-1600-times8-SnP.h +0 -45
  169. data/ext/xkcp/lib/low/KeccakP-1600-times8/fallback-on4/KeccakP-1600-times8-on4.c +0 -38
  170. data/ext/xkcp/lib/low/KeccakP-200/ARM/KeccakP-200-SnP.h +0 -41
  171. data/ext/xkcp/lib/low/KeccakP-200/ARM/KeccakP-200-armv6m-le-armcc.s +0 -442
  172. data/ext/xkcp/lib/low/KeccakP-200/ARM/KeccakP-200-armv6m-le-gcc.s +0 -446
  173. data/ext/xkcp/lib/low/KeccakP-200/ARM/KeccakP-200-armv7m-le-armcc.s +0 -419
  174. data/ext/xkcp/lib/low/KeccakP-200/ARM/KeccakP-200-armv7m-le-gcc.s +0 -427
  175. data/ext/xkcp/lib/low/KeccakP-200/AVR8/KeccakP-200-SnP.h +0 -41
  176. data/ext/xkcp/lib/low/KeccakP-200/AVR8/KeccakP-200-avr8-fast.s +0 -647
  177. data/ext/xkcp/lib/low/KeccakP-200/compact/KeccakP-200-SnP.h +0 -39
  178. data/ext/xkcp/lib/low/KeccakP-200/compact/KeccakP-200-compact.c +0 -190
  179. data/ext/xkcp/lib/low/KeccakP-200/ref/KeccakP-200-SnP.h +0 -43
  180. data/ext/xkcp/lib/low/KeccakP-200/ref/KeccakP-200-reference.c +0 -412
  181. data/ext/xkcp/lib/low/KeccakP-200/ref/KeccakP-200-reference.h +0 -23
  182. data/ext/xkcp/lib/low/KeccakP-400/ARM/KeccakP-400-SnP.h +0 -41
  183. data/ext/xkcp/lib/low/KeccakP-400/ARM/KeccakP-400-armv6m-le-armcc.s +0 -454
  184. data/ext/xkcp/lib/low/KeccakP-400/ARM/KeccakP-400-armv6m-le-gcc.s +0 -458
  185. data/ext/xkcp/lib/low/KeccakP-400/ARM/KeccakP-400-armv7m-le-armcc.s +0 -455
  186. data/ext/xkcp/lib/low/KeccakP-400/ARM/KeccakP-400-armv7m-le-gcc.s +0 -458
  187. data/ext/xkcp/lib/low/KeccakP-400/AVR8/KeccakP-400-SnP.h +0 -41
  188. data/ext/xkcp/lib/low/KeccakP-400/AVR8/KeccakP-400-avr8-fast.s +0 -728
  189. data/ext/xkcp/lib/low/KeccakP-400/ref/KeccakP-400-SnP.h +0 -43
  190. data/ext/xkcp/lib/low/KeccakP-400/ref/KeccakP-400-reference.c +0 -414
  191. data/ext/xkcp/lib/low/KeccakP-400/ref/KeccakP-400-reference.h +0 -23
  192. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-SnP.h +0 -42
  193. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u1-armv6m-le-armcc.s +0 -527
  194. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u1-armv6m-le-gcc.s +0 -533
  195. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u2-armv6m-le-armcc.s +0 -528
  196. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u2-armv6m-le-gcc.s +0 -534
  197. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u2-armv7a-le-armcc.s +0 -521
  198. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u2-armv7a-le-gcc.s +0 -527
  199. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u2-armv7m-le-armcc.s +0 -517
  200. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u2-armv7m-le-gcc.s +0 -523
  201. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-uf-armv7m-le-armcc.s +0 -550
  202. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-uf-armv7m-le-gcc.s +0 -556
  203. data/ext/xkcp/lib/low/KeccakP-800/ARMv8A/KeccakP-800-SnP.h +0 -32
  204. data/ext/xkcp/lib/low/KeccakP-800/ARMv8A/KeccakP-800-armv8a-neon.s +0 -432
  205. data/ext/xkcp/lib/low/KeccakP-800/AVR8/KeccakP-800-SnP.h +0 -42
  206. data/ext/xkcp/lib/low/KeccakP-800/AVR8/KeccakP-800-avr8-fast.s +0 -929
  207. data/ext/xkcp/lib/low/KeccakP-800/compact/KeccakP-800-SnP.h +0 -40
  208. data/ext/xkcp/lib/low/KeccakP-800/compact/KeccakP-800-compact.c +0 -244
  209. data/ext/xkcp/lib/low/KeccakP-800/plain/KeccakP-800-SnP.h +0 -46
  210. data/ext/xkcp/lib/low/KeccakP-800/plain/KeccakP-800-opt32-bis.macros +0 -184
  211. data/ext/xkcp/lib/low/KeccakP-800/plain/KeccakP-800-opt32.c +0 -454
  212. data/ext/xkcp/lib/low/KeccakP-800/plain/KeccakP-800-opt32.macros +0 -459
  213. data/ext/xkcp/lib/low/KeccakP-800/plain/KeccakP-800-unrolling-bis.macros +0 -83
  214. data/ext/xkcp/lib/low/KeccakP-800/plain/KeccakP-800-unrolling.macros +0 -88
  215. data/ext/xkcp/lib/low/KeccakP-800/plain/lcu2/KeccakP-800-opt32-config.h +0 -7
  216. data/ext/xkcp/lib/low/KeccakP-800/plain/lcua/KeccakP-800-opt32-config.h +0 -7
  217. data/ext/xkcp/lib/low/KeccakP-800/plain/u2/KeccakP-800-opt32-config.h +0 -7
  218. data/ext/xkcp/lib/low/KeccakP-800/plain/ua/KeccakP-800-opt32-config.h +0 -7
  219. data/ext/xkcp/lib/low/KeccakP-800/ref/KeccakP-800-SnP.h +0 -44
  220. data/ext/xkcp/lib/low/KeccakP-800/ref/KeccakP-800-reference.c +0 -437
  221. data/ext/xkcp/lib/low/KeccakP-800/ref/KeccakP-800-reference.h +0 -23
  222. data/ext/xkcp/lib/low/Ketje/OptimizedAsmARM/Ket.h +0 -57
  223. data/ext/xkcp/lib/low/Ketje/OptimizedAsmARM/KetjeJr-armv7m-le-armcc.s +0 -475
  224. data/ext/xkcp/lib/low/Ketje/OptimizedAsmARM/KetjeJr-armv7m-le-gcc.s +0 -480
  225. data/ext/xkcp/lib/low/Ketje/OptimizedAsmARM/KetjeSr-armv7m-le-armcc.s +0 -590
  226. data/ext/xkcp/lib/low/Ketje/OptimizedAsmARM/KetjeSr-armv7m-le-gcc.s +0 -590
  227. data/ext/xkcp/lib/low/Ketje/OptimizedLE/Ket.c +0 -126
  228. data/ext/xkcp/lib/low/Ketje/OptimizedLE/Ket.h +0 -68
  229. data/ext/xkcp/lib/low/Ketje/OptimizedLE/Ket.inc +0 -174
  230. data/ext/xkcp/lib/low/Ketje/SnP-compliant/Ket.c +0 -80
  231. data/ext/xkcp/lib/low/Ketje/SnP-compliant/Ket.h +0 -68
  232. data/ext/xkcp/lib/low/Ketje/SnP-compliant/Ket.inc +0 -142
  233. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-SnP.h +0 -55
  234. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-u1-armv6m-le-armcc.s +0 -1086
  235. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-u1-armv6m-le-gcc.s +0 -1092
  236. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-uf-armv6-le-armcc.s +0 -721
  237. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-uf-armv6-le-gcc.s +0 -726
  238. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-uf-armv7m-le-armcc.s +0 -723
  239. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-uf-armv7m-le-gcc.s +0 -729
  240. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodyak-u1-armv6m-le-armcc.s +0 -1164
  241. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodyak-u1-armv6m-le-gcc.s +0 -1165
  242. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodyak-uf-armv6-le-armcc.s +0 -562
  243. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodyak-uf-armv6-le-gcc.s +0 -563
  244. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodyak-uf-armv7m-le-armcc.s +0 -563
  245. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodyak-uf-armv7m-le-gcc.s +0 -565
  246. data/ext/xkcp/lib/low/Xoodoo/ARMv7A-NEON/Xoodoo-SnP.h +0 -55
  247. data/ext/xkcp/lib/low/Xoodoo/ARMv7A-NEON/Xoodoo-uf-armv7a-neon-le-armcc.s +0 -476
  248. data/ext/xkcp/lib/low/Xoodoo/ARMv7A-NEON/Xoodoo-uf-armv7a-neon-le-gcc.s +0 -485
  249. data/ext/xkcp/lib/low/Xoodoo/ARMv7A-NEON/Xoodyak-uf-armv7a-neon-le-armcc.s +0 -362
  250. data/ext/xkcp/lib/low/Xoodoo/ARMv7A-NEON/Xoodyak-uf-armv7a-neon-le-gcc.s +0 -367
  251. data/ext/xkcp/lib/low/Xoodoo/AVR8/Xoodoo-SnP.h +0 -43
  252. data/ext/xkcp/lib/low/Xoodoo/AVR8/Xoodoo-avr8-u1.s +0 -1341
  253. data/ext/xkcp/lib/low/Xoodoo/AVX512/Xoodoo-SIMD512.c +0 -581
  254. data/ext/xkcp/lib/low/Xoodoo/AVX512/Xoodoo-SnP.h +0 -58
  255. data/ext/xkcp/lib/low/Xoodoo/AVX512/Xoodyak-full-block-SIMD512.c +0 -332
  256. data/ext/xkcp/lib/low/Xoodoo/SSE2/Xoodoo-SIMD128.c +0 -329
  257. data/ext/xkcp/lib/low/Xoodoo/SSE2/Xoodoo-SnP.h +0 -53
  258. data/ext/xkcp/lib/low/Xoodoo/SSE2/Xoodyak-full-block-SIMD128.c +0 -355
  259. data/ext/xkcp/lib/low/Xoodoo/Xoodoo.h +0 -79
  260. data/ext/xkcp/lib/low/Xoodoo/plain/Xoodoo-SnP.h +0 -56
  261. data/ext/xkcp/lib/low/Xoodoo/plain/Xoodoo-optimized.c +0 -399
  262. data/ext/xkcp/lib/low/Xoodoo/plain/Xoodyak-full-blocks.c +0 -127
  263. data/ext/xkcp/lib/low/Xoodoo/ref/Xoodoo-SnP.h +0 -43
  264. data/ext/xkcp/lib/low/Xoodoo/ref/Xoodoo-reference.c +0 -253
  265. data/ext/xkcp/lib/low/Xoodoo-times16/AVX512/Xoodoo-times16-SIMD512.c +0 -1044
  266. data/ext/xkcp/lib/low/Xoodoo-times16/AVX512/Xoodoo-times16-SnP.h +0 -49
  267. data/ext/xkcp/lib/low/Xoodoo-times16/fallback-on1/Xoodoo-times16-SnP.h +0 -45
  268. data/ext/xkcp/lib/low/Xoodoo-times16/fallback-on1/Xoodoo-times16-on1.c +0 -37
  269. data/ext/xkcp/lib/low/Xoodoo-times4/ARMv7A-NEON/Xoodoo-times4-ARMv7A.s +0 -1587
  270. data/ext/xkcp/lib/low/Xoodoo-times4/ARMv7A-NEON/Xoodoo-times4-SnP.h +0 -48
  271. data/ext/xkcp/lib/low/Xoodoo-times4/AVX512/Xoodoo-times4-SIMD512.c +0 -1202
  272. data/ext/xkcp/lib/low/Xoodoo-times4/AVX512/Xoodoo-times4-SnP.h +0 -48
  273. data/ext/xkcp/lib/low/Xoodoo-times4/SSSE3/Xoodoo-times4-SIMD128.c +0 -484
  274. data/ext/xkcp/lib/low/Xoodoo-times4/SSSE3/Xoodoo-times4-SnP.h +0 -44
  275. data/ext/xkcp/lib/low/Xoodoo-times4/fallback-on1/Xoodoo-times4-SnP.h +0 -45
  276. data/ext/xkcp/lib/low/Xoodoo-times4/fallback-on1/Xoodoo-times4-on1.c +0 -37
  277. data/ext/xkcp/lib/low/Xoodoo-times8/AVX2/Xoodoo-times8-SIMD256.c +0 -939
  278. data/ext/xkcp/lib/low/Xoodoo-times8/AVX2/Xoodoo-times8-SnP.h +0 -49
  279. data/ext/xkcp/lib/low/Xoodoo-times8/AVX512/Xoodoo-times8-SIMD512.c +0 -1216
  280. data/ext/xkcp/lib/low/Xoodoo-times8/AVX512/Xoodoo-times8-SnP.h +0 -48
  281. data/ext/xkcp/lib/low/Xoodoo-times8/fallback-on1/Xoodoo-times8-SnP.h +0 -45
  282. data/ext/xkcp/lib/low/Xoodoo-times8/fallback-on1/Xoodoo-times8-on1.c +0 -37
  283. data/ext/xkcp/lib/low/common/PlSnP-Fallback.inc +0 -290
  284. data/ext/xkcp/lib/low/common/SnP-Relaned.h +0 -141
  285. data/ext/xkcp/support/Kernel-PMU/Kernel-pmu.md +0 -133
  286. data/ext/xkcp/support/Kernel-PMU/Makefile +0 -8
  287. data/ext/xkcp/support/Kernel-PMU/enable_arm_pmu.c +0 -129
  288. data/ext/xkcp/support/Kernel-PMU/load-module +0 -1
  289. data/ext/xkcp/util/KeccakSum/KeccakSum.c +0 -394
  290. data/ext/xkcp/util/KeccakSum/base64.c +0 -86
  291. data/ext/xkcp/util/KeccakSum/base64.h +0 -12
@@ -1,1615 +0,0 @@
1
- /*
2
- The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche.
3
-
4
- Implementation by Ronny Van Keer, hereby denoted as "the implementer".
5
-
6
- For more information, feedback or questions, please refer to the Keccak Team website:
7
- https://keccak.team/
8
-
9
- To the extent possible under law, the implementer has waived all copyright
10
- and related or neighboring rights to the source code in this file.
11
- http://creativecommons.org/publicdomain/zero/1.0/
12
-
13
- ---
14
-
15
- This file implements Keccak-p[1600]×8 in a PlSnP-compatible way.
16
- Please refer to PlSnP-documentation.h for more details.
17
-
18
- This implementation comes with KeccakP-1600-times8-SnP.h in the same folder.
19
- Please refer to LowLevel.build for the exact list of other files it must be combined with.
20
- */
21
-
22
- #include <stdio.h>
23
- #include <stdlib.h>
24
- #include <string.h>
25
- #include <stdint.h>
26
- #include <smmintrin.h>
27
- #include <wmmintrin.h>
28
- #include <immintrin.h>
29
- #include <emmintrin.h>
30
- #include "align.h"
31
- #include "KeccakP-1600-times8-SnP.h"
32
- #include "SIMD512-config.h"
33
-
34
- #include "brg_endian.h"
35
- #if (PLATFORM_BYTE_ORDER != IS_LITTLE_ENDIAN)
36
- #error Expecting a little-endian platform
37
- #endif
38
-
39
- /*
40
- ** Uncomment the define hereunder when compiling for a CPU without AVX-512 SIMD.
41
- #define SIMULATE_AVX512
42
- */
43
-
44
- #define VERBOSE 0
45
-
46
- #if defined(SIMULATE_AVX512)
47
-
48
/* Software stand-in for a 512-bit vector register, used only when
 * SIMULATE_AVX512 is defined: eight 64-bit lanes, lane 0 first. */
typedef struct { uint64_t x[8]; } __m512i;
52
-
53
- static __m512i _mm512_and_si512( __m512i a, __m512i b)
54
- {
55
- __m512i r;
56
- unsigned int i;
57
-
58
- for ( i = 0; i < 8; ++i )
59
- r.x[i] = a.x[i] & b.x[i];
60
- return(r);
61
- }
62
-
63
- static __m512i _mm512_xor_si512( __m512i a, __m512i b)
64
- {
65
- __m512i r;
66
- unsigned int i;
67
-
68
- for ( i = 0; i < 8; ++i )
69
- r.x[i] = a.x[i] ^ b.x[i];
70
- return(r);
71
- }
72
-
73
- static __m512i _mm512_ternarylogic_epi64(__m512i a, __m512i b, __m512i c, int imm)
74
- {
75
-
76
- if (imm == 0x96)
77
- return ( _mm512_xor_si512( _mm512_xor_si512( a, b ), c ) );
78
- if (imm == 0xD2) {
79
- __m512i t;
80
- unsigned int i;
81
-
82
- for ( i = 0; i < 8; ++i )
83
- t.x[i] = ~b.x[i] & c.x[i];
84
- return ( _mm512_xor_si512( a, t ) );
85
- }
86
- printf( "_mm512_ternarylogic_epi64( a, b, c, %02X) not implemented!\n", imm );
87
- exit(1);
88
-
89
- }
90
-
91
- static __m512i _mm512_rol_epi64(__m512i a, int offset)
92
- {
93
- __m512i r;
94
- unsigned int i;
95
-
96
- for ( i = 0; i < 8; ++i )
97
- r.x[i] = (a.x[i] << offset) | (a.x[i] >> (64-offset));
98
- return(r);
99
- }
100
-
101
- static __m512i _mm512_srli_epi64(__m512i a, int offset)
102
- {
103
- __m512i r;
104
- unsigned int i;
105
-
106
- for ( i = 0; i < 8; ++i )
107
- r.x[i] = (a.x[i] >> offset);
108
- return(r);
109
- }
110
-
111
-
112
- static __m512i _mm512_broadcast_f64x4(__m256d a)
113
- {
114
- __m512i r;
115
- unsigned int i;
116
- uint64_t t[4];
117
-
118
- _mm256_store_si256( (__m256i*)t, (__m256i)a );
119
- for ( i = 0; i < 4; ++i )
120
- r.x[i+4] = r.x[i] = t[i];
121
- return(r);
122
- }
123
-
124
- static __m512i _mm512_set_epi64(uint64_t a, uint64_t b, uint64_t c, uint64_t d, uint64_t e, uint64_t f, uint64_t g, uint64_t h)
125
- {
126
- __m512i r;
127
-
128
- r.x[0] = h;
129
- r.x[1] = g;
130
- r.x[2] = f;
131
- r.x[3] = e;
132
- r.x[4] = d;
133
- r.x[5] = c;
134
- r.x[6] = b;
135
- r.x[7] = a;
136
- return(r);
137
- }
138
-
139
- static __m512i _mm512_i32gather_epi64(__m256i idx, const void *p, int scale)
140
- {
141
- __m512i r;
142
- unsigned int i;
143
- uint32_t offset[8];
144
-
145
- _mm256_store_si256( (__m256i*)offset, idx );
146
- for ( i = 0; i < 8; ++i )
147
- r.x[i] = *(const uint64_t*)((const char*)p + offset[i] * scale);
148
- return(r);
149
- }
150
-
151
- static void _mm512_i32scatter_epi64( void *p, __m256i idx, __m512i value, int scale)
152
- {
153
- unsigned int i;
154
- uint32_t offset[8];
155
-
156
- _mm256_store_si256( (__m256i*)offset, idx );
157
- for ( i = 0; i < 8; ++i )
158
- *(uint64_t*)((char*)p + offset[i] * scale) = value.x[i];
159
- }
160
-
161
- static __m512i _mm512_permutex2var_epi64(__m512i a, __m512i idx, __m512i b)
162
- {
163
- __m512i r;
164
- unsigned int i;
165
- for ( i = 0; i < 8; ++i )
166
- r.x[i] = (idx.x[i] & 8) ? b.x[idx.x[i] & 7] : a.x[idx.x[i] & 7];
167
- return(r);
168
- }
169
-
170
- static __m512i _mm512_maskz_loadu_epi64(uint8_t k, const void *mem_addr)
171
- {
172
- __m512i r;
173
- const uint64_t *p64 = (const uint64_t *)mem_addr;
174
- unsigned int i;
175
-
176
- for ( i = 0; i < 8; ++i ) {
177
- if ((k & (1 << i)) != 0) {
178
- r.x[i] = p64[i];
179
- }
180
- else {
181
- r.x[i] = 0;
182
- }
183
- }
184
- return(r);
185
- }
186
-
187
- #define _mm512_maskz_load_epi64 _mm512_maskz_loadu_epi64
188
-
189
- static void _mm512_storeu_si512(__m512i * mem_addr, __m512i a)
190
- {
191
- uint64_t *p64 = (uint64_t *)mem_addr;
192
- unsigned int i;
193
-
194
- for ( i = 0; i < 8; ++i )
195
- p64[i] = a.x[i];
196
- }
197
-
198
- #define _mm512_store_si512 _mm512_storeu_si512
199
-
200
- static __m512i _mm512_loadu_si512(const __m512i * mem_addr)
201
- {
202
- __m512i r;
203
- const uint64_t *p64 = (const uint64_t *)mem_addr;
204
- unsigned int i;
205
-
206
- for ( i = 0; i < 8; ++i )
207
- r.x[i] = p64[i];
208
- return(r);
209
- }
210
-
211
- #define _mm512_load_si512 _mm512_loadu_si512
212
-
213
- static void _mm512_mask_storeu_epi64(void *mem_addr, uint8_t k, __m512i a)
214
- {
215
- uint64_t *p64 = (uint64_t *)mem_addr;
216
- unsigned int i;
217
-
218
- for ( i = 0; i < 8; ++i ) {
219
- if ((k & (1 << i)) != 0)
220
- p64[i] = a.x[i];
221
- }
222
- }
223
-
224
- #define _mm512_mask_store_epi64 _mm512_mask_storeu_epi64
225
-
226
- static __m512i _mm512_setzero_si512(void)
227
- {
228
- __m512i r;
229
- unsigned int i;
230
-
231
- for ( i = 0; i < 8; ++i )
232
- r.x[i] = 0;
233
- return(r);
234
- }
235
-
236
- static __m256i _mm512_extracti64x4_epi64(__m512i a, int imm8)
237
- {
238
- uint64_t buf[8];
239
- __m256i r;
240
-
241
- _mm512_storeu_si512((__m512i*)buf, a);
242
- r = *(__m256i*)&buf[((imm8 == 0) ? 0 : 4)];
243
- return(r);
244
- }
245
-
246
- #endif
247
-
248
/* Short aliases for the 128-, 256- and 512-bit integer vector types. */
typedef __m128i     V128;
typedef __m256i     V256;
typedef __m512i     V512;

#if defined(KeccakP1600times8_useAVX512)

/* Bitwise XOR of two, three or five 512-bit values. */
#define XOR(a,b)                    _mm512_xor_si512(a,b)
#define XOR3(a,b,c)                 _mm512_ternarylogic_epi64(a,b,c,0x96)
#define XOR5(a,b,c,d,e)             XOR3(XOR3(a,b,c),d,e)
/* In-place XOR: a ^= b. Fully parenthesized so it is safe in any expression. */
#define XOReq512(a, b)              ((a) = XOR((a),(b)))

/* Per-lane 64-bit left rotation by a constant. */
#define ROL(a,offset)               _mm512_rol_epi64(a,offset)
/* Truth table 0xD2: a ^ (~b & c). */
#define Chi(a,b,c)                  _mm512_ternarylogic_epi64(a,b,c,0xD2)

/* Broadcast one 64-bit constant to all eight lanes. */
#define CONST8_64(a)                _mm512_set1_epi64(a)

/* Aligned / unaligned 512-bit loads from the object 'a'. */
#define LOAD512(a)                  _mm512_load_si512((const V512 *)&(a))
#define LOAD512u(a)                 _mm512_loadu_si512((const V512 *)&(a))
/* Eight 32-bit values, first argument in the highest element.
 * All arguments are narrowed the same way (uint32_t). */
#define LOAD8_32(a,b,c,d,e,f,g,h)   _mm256_set_epi32((uint32_t)(a), (uint32_t)(b), (uint32_t)(c), (uint32_t)(d), (uint32_t)(e), (uint32_t)(f), (uint32_t)(g), (uint32_t)(h))
/* Eight 64-bit values, first argument in the highest lane. */
#define LOAD8_64(a,b,c,d,e,f,g,h)   _mm512_set_epi64((uint64_t)(a), (uint64_t)(b), (uint64_t)(c), (uint64_t)(d), (uint64_t)(e), (uint64_t)(f), (uint64_t)(g), (uint64_t)(h))
/* Gather eight 64-bit lanes from p + 8*idx[i]. */
#define LOAD_GATHER8_64(idx,p)      _mm512_i32gather_epi64( idx, (const void*)(p), 8)

/* Scatter the eight 64-bit lanes of v to p + 8*idx[i]. */
#define STORE_SCATTER8_64(p,idx, v) _mm512_i32scatter_epi64( (void*)(p), idx, v, 8)

#endif
273
-
274
/*
 * Debug printing helpers, active only when VERBOSE > 0; otherwise every
 * macro expands to nothing.
 *
 * 64-bit values are printed with %llx after a cast to unsigned long long,
 * which is correct whether long is 32 or 64 bits wide.
 */
#if (VERBOSE > 0)
/* Print label __t followed by __n 64-bit words of buf in hexadecimal. */
#define    DumpMem(__t, buf, __n)  { \
                                    uint32_t i; \
                                    printf("%s ", __t); \
                                    for (i = 0; i < (__n); ++i) { \
                                        printf("%016llx ", (unsigned long long)(buf)[i]); \
                                    } \
                                    printf("\n"); \
                                }

/* Print the eight lanes of the 512-bit variable named <__v><__i>. */
#define    DumpOne(__v,__i)  { \
                                    uint64_t    buf[8]; \
                                    _mm512_storeu_si512((V512*)buf, __v##__i); \
                                    printf("%016llx %016llx %016llx %016llx %016llx %016llx %016llx %016llx\n", \
                                        (unsigned long long)buf[0], (unsigned long long)buf[1], \
                                        (unsigned long long)buf[2], (unsigned long long)buf[3], \
                                        (unsigned long long)buf[4], (unsigned long long)buf[5], \
                                        (unsigned long long)buf[6], (unsigned long long)buf[7]); \
                                }

/* Print label __t, then all 25 state variables <__v>ba .. <__v>su. */
#define    Dump(__t,__v)    { \
                            printf("%s\n", __t); \
                            DumpOne(__v, ba); \
                            DumpOne(__v, be); \
                            DumpOne(__v, bi); \
                            DumpOne(__v, bo); \
                            DumpOne(__v, bu); \
                            DumpOne(__v, ga); \
                            DumpOne(__v, ge); \
                            DumpOne(__v, gi); \
                            DumpOne(__v, go); \
                            DumpOne(__v, gu); \
                            DumpOne(__v, ka); \
                            DumpOne(__v, ke); \
                            DumpOne(__v, ki); \
                            DumpOne(__v, ko); \
                            DumpOne(__v, ku); \
                            DumpOne(__v, ma); \
                            DumpOne(__v, me); \
                            DumpOne(__v, mi); \
                            DumpOne(__v, mo); \
                            DumpOne(__v, mu); \
                            DumpOne(__v, sa); \
                            DumpOne(__v, se); \
                            DumpOne(__v, si); \
                            DumpOne(__v, so); \
                            DumpOne(__v, su); \
                            printf("\n"); \
                        }

/* Print label __t and one variable; braced so it acts as a single statement. */
#define    DumpReg(__t,__v,__i) { printf("%s ", __t); DumpOne(__v,__i) }

#else
#define    DumpMem(__t, buf,len)
#define    DumpOne(__v,__i)
#define    Dump(__t,__v)
#define    DumpReg(__t,__v,__i)
#endif
330
-
331
-
332
/* Index of a 64-bit lane in the interleaved state array: the eight
 * instances' copies of a given lane position are stored consecutively.
 * Both arguments are parenthesized so that expression arguments expand
 * with the intended precedence. */
#define laneIndex(instanceIndex, lanePosition)  ((lanePosition)*8 + (instanceIndex))
/* Size of one Keccak-p[1600] lane in bytes. */
#define SnP_laneLengthInBytes 8
334
-
335
- void KeccakP1600times8_InitializeAll(void *states)
336
- {
337
- memset(states, 0, KeccakP1600times8_statesSizeInBytes);
338
- }
339
-
340
- void KeccakP1600times8_AddBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length)
341
- {
342
- unsigned int sizeLeft = length;
343
- unsigned int lanePosition = offset/SnP_laneLengthInBytes;
344
- unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
345
- const unsigned char *curData = data;
346
- uint64_t *statesAsLanes = states;
347
-
348
- if ((sizeLeft > 0) && (offsetInLane != 0)) {
349
- unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
350
- uint64_t lane = 0;
351
- if (bytesInLane > sizeLeft)
352
- bytesInLane = sizeLeft;
353
- memcpy((unsigned char*)&lane + offsetInLane, curData, bytesInLane);
354
- statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane;
355
- sizeLeft -= bytesInLane;
356
- lanePosition++;
357
- curData += bytesInLane;
358
- }
359
-
360
- while(sizeLeft >= SnP_laneLengthInBytes) {
361
- uint64_t lane = *((const uint64_t*)curData);
362
- statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane;
363
- sizeLeft -= SnP_laneLengthInBytes;
364
- lanePosition++;
365
- curData += SnP_laneLengthInBytes;
366
- }
367
-
368
- if (sizeLeft > 0) {
369
- uint64_t lane = 0;
370
- memcpy(&lane, curData, sizeLeft);
371
- statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane;
372
- }
373
- }
374
-
375
- void KeccakP1600times8_AddLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
376
- {
377
- V512 *stateAsLanes = states;
378
- const uint64_t *dataAsLanes = (const uint64_t *)data;
379
- unsigned int i;
380
- V256 index;
381
-
382
- #define Add_In( argIndex ) stateAsLanes[argIndex] = XOR(stateAsLanes[argIndex], LOAD_GATHER8_64(index, dataAsLanes+argIndex))
383
- index = LOAD8_32(7*laneOffset, 6*laneOffset, 5*laneOffset, 4*laneOffset, 3*laneOffset, 2*laneOffset, 1*laneOffset, 0*laneOffset);
384
- if ( laneCount >= 16 ) {
385
- Add_In( 0 );
386
- Add_In( 1 );
387
- Add_In( 2 );
388
- Add_In( 3 );
389
- Add_In( 4 );
390
- Add_In( 5 );
391
- Add_In( 6 );
392
- Add_In( 7 );
393
- Add_In( 8 );
394
- Add_In( 9 );
395
- Add_In( 10 );
396
- Add_In( 11 );
397
- Add_In( 12 );
398
- Add_In( 13 );
399
- Add_In( 14 );
400
- Add_In( 15 );
401
- if ( laneCount >= 20 ) {
402
- Add_In( 16 );
403
- Add_In( 17 );
404
- Add_In( 18 );
405
- Add_In( 19 );
406
- for(i=20; i<laneCount; i++)
407
- Add_In( i );
408
- }
409
- else {
410
- for(i=16; i<laneCount; i++)
411
- Add_In( i );
412
- }
413
- }
414
- else {
415
- for(i=0; i<laneCount; i++)
416
- Add_In( i );
417
- }
418
- #undef Add_In
419
- }
420
-
421
- void KeccakP1600times8_OverwriteBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length)
422
- {
423
- unsigned int sizeLeft = length;
424
- unsigned int lanePosition = offset/SnP_laneLengthInBytes;
425
- unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
426
- const unsigned char *curData = data;
427
- uint64_t *statesAsLanes = states;
428
-
429
- if ((sizeLeft > 0) && (offsetInLane != 0)) {
430
- unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
431
- if (bytesInLane > sizeLeft)
432
- bytesInLane = sizeLeft;
433
- memcpy( ((unsigned char *)&statesAsLanes[laneIndex(instanceIndex, lanePosition)]) + offsetInLane, curData, bytesInLane);
434
- sizeLeft -= bytesInLane;
435
- lanePosition++;
436
- curData += bytesInLane;
437
- }
438
-
439
- while(sizeLeft >= SnP_laneLengthInBytes) {
440
- uint64_t lane = *((const uint64_t*)curData);
441
- statesAsLanes[laneIndex(instanceIndex, lanePosition)] = lane;
442
- sizeLeft -= SnP_laneLengthInBytes;
443
- lanePosition++;
444
- curData += SnP_laneLengthInBytes;
445
- }
446
-
447
- if (sizeLeft > 0) {
448
- memcpy(&statesAsLanes[laneIndex(instanceIndex, lanePosition)], curData, sizeLeft);
449
- }
450
- }
451
-
452
- void KeccakP1600times8_OverwriteLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
453
- {
454
- V512 *stateAsLanes = states;
455
- const uint64_t *dataAsLanes = (const uint64_t *)data;
456
- unsigned int i;
457
- V256 index;
458
-
459
- #define OverWr( argIndex ) stateAsLanes[argIndex] = LOAD_GATHER8_64(index, dataAsLanes+argIndex)
460
- index = LOAD8_32(7*laneOffset, 6*laneOffset, 5*laneOffset, 4*laneOffset, 3*laneOffset, 2*laneOffset, 1*laneOffset, 0*laneOffset);
461
- if ( laneCount >= 16 ) {
462
- OverWr( 0 );
463
- OverWr( 1 );
464
- OverWr( 2 );
465
- OverWr( 3 );
466
- OverWr( 4 );
467
- OverWr( 5 );
468
- OverWr( 6 );
469
- OverWr( 7 );
470
- OverWr( 8 );
471
- OverWr( 9 );
472
- OverWr( 10 );
473
- OverWr( 11 );
474
- OverWr( 12 );
475
- OverWr( 13 );
476
- OverWr( 14 );
477
- OverWr( 15 );
478
- if ( laneCount >= 20 ) {
479
- OverWr( 16 );
480
- OverWr( 17 );
481
- OverWr( 18 );
482
- OverWr( 19 );
483
- for(i=20; i<laneCount; i++)
484
- OverWr( i );
485
- }
486
- else {
487
- for(i=16; i<laneCount; i++)
488
- OverWr( i );
489
- }
490
- }
491
- else {
492
- for(i=0; i<laneCount; i++)
493
- OverWr( i );
494
- }
495
- #undef OverWr
496
- }
497
-
498
- void KeccakP1600times8_OverwriteWithZeroes(void *states, unsigned int instanceIndex, unsigned int byteCount)
499
- {
500
- unsigned int sizeLeft = byteCount;
501
- unsigned int lanePosition = 0;
502
- uint64_t *statesAsLanes = states;
503
-
504
- while(sizeLeft >= SnP_laneLengthInBytes) {
505
- statesAsLanes[laneIndex(instanceIndex, lanePosition)] = 0;
506
- sizeLeft -= SnP_laneLengthInBytes;
507
- lanePosition++;
508
- }
509
-
510
- if (sizeLeft > 0) {
511
- memset(&statesAsLanes[laneIndex(instanceIndex, lanePosition)], 0, sizeLeft);
512
- }
513
- }
514
-
515
- void KeccakP1600times8_ExtractBytes(const void *states, unsigned int instanceIndex, unsigned char *data, unsigned int offset, unsigned int length)
516
- {
517
- unsigned int sizeLeft = length;
518
- unsigned int lanePosition = offset/SnP_laneLengthInBytes;
519
- unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
520
- unsigned char *curData = data;
521
- const uint64_t *statesAsLanes = states;
522
-
523
- if ((sizeLeft > 0) && (offsetInLane != 0)) {
524
- unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
525
- if (bytesInLane > sizeLeft)
526
- bytesInLane = sizeLeft;
527
- memcpy( curData, ((unsigned char *)&statesAsLanes[laneIndex(instanceIndex, lanePosition)]) + offsetInLane, bytesInLane);
528
- sizeLeft -= bytesInLane;
529
- lanePosition++;
530
- curData += bytesInLane;
531
- }
532
-
533
- while(sizeLeft >= SnP_laneLengthInBytes) {
534
- *(uint64_t*)curData = statesAsLanes[laneIndex(instanceIndex, lanePosition)];
535
- sizeLeft -= SnP_laneLengthInBytes;
536
- lanePosition++;
537
- curData += SnP_laneLengthInBytes;
538
- }
539
-
540
- if (sizeLeft > 0) {
541
- memcpy( curData, &statesAsLanes[laneIndex(instanceIndex, lanePosition)], sizeLeft);
542
- }
543
- }
544
-
545
- void KeccakP1600times8_ExtractLanesAll(const void *states, unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
546
- {
547
- const V512 *stateAsLanes = states;
548
- uint64_t *dataAsLanes = (uint64_t *)data;
549
- unsigned int i;
550
- V256 index;
551
-
552
- #define Extr( argIndex ) STORE_SCATTER8_64(dataAsLanes+argIndex, index, stateAsLanes[argIndex])
553
- index = LOAD8_32(7*laneOffset, 6*laneOffset, 5*laneOffset, 4*laneOffset, 3*laneOffset, 2*laneOffset, 1*laneOffset, 0*laneOffset);
554
- if ( laneCount >= 16 ) {
555
- Extr( 0 );
556
- Extr( 1 );
557
- Extr( 2 );
558
- Extr( 3 );
559
- Extr( 4 );
560
- Extr( 5 );
561
- Extr( 6 );
562
- Extr( 7 );
563
- Extr( 8 );
564
- Extr( 9 );
565
- Extr( 10 );
566
- Extr( 11 );
567
- Extr( 12 );
568
- Extr( 13 );
569
- Extr( 14 );
570
- Extr( 15 );
571
- if ( laneCount >= 20 ) {
572
- Extr( 16 );
573
- Extr( 17 );
574
- Extr( 18 );
575
- Extr( 19 );
576
- for(i=20; i<laneCount; i++)
577
- Extr( i );
578
- }
579
- else {
580
- for(i=16; i<laneCount; i++)
581
- Extr( i );
582
- }
583
- }
584
- else {
585
- for(i=0; i<laneCount; i++)
586
- Extr( i );
587
- }
588
- #undef Extr
589
- }
590
-
591
- void KeccakP1600times8_ExtractAndAddBytes(const void *states, unsigned int instanceIndex, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length)
592
- {
593
- unsigned int sizeLeft = length;
594
- unsigned int lanePosition = offset/SnP_laneLengthInBytes;
595
- unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
596
- const unsigned char *curInput = input;
597
- unsigned char *curOutput = output;
598
- const uint64_t *statesAsLanes = states;
599
-
600
- if ((sizeLeft > 0) && (offsetInLane != 0)) {
601
- unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
602
- uint64_t lane = statesAsLanes[laneIndex(instanceIndex, lanePosition)] >> (8 * offsetInLane);
603
- if (bytesInLane > sizeLeft)
604
- bytesInLane = sizeLeft;
605
- sizeLeft -= bytesInLane;
606
- do {
607
- *(curOutput++) = *(curInput++) ^ (unsigned char)lane;
608
- lane >>= 8;
609
- } while ( --bytesInLane != 0);
610
- lanePosition++;
611
- }
612
-
613
- while(sizeLeft >= SnP_laneLengthInBytes) {
614
- *((uint64_t*)curOutput) = *((uint64_t*)curInput) ^ statesAsLanes[laneIndex(instanceIndex, lanePosition)];
615
- sizeLeft -= SnP_laneLengthInBytes;
616
- lanePosition++;
617
- curInput += SnP_laneLengthInBytes;
618
- curOutput += SnP_laneLengthInBytes;
619
- }
620
-
621
- if (sizeLeft != 0) {
622
- uint64_t lane = statesAsLanes[laneIndex(instanceIndex, lanePosition)];
623
- do {
624
- *(curOutput++) = *(curInput++) ^ (unsigned char)lane;
625
- lane >>= 8;
626
- } while ( --sizeLeft != 0);
627
- }
628
- }
629
-
630
- void KeccakP1600times8_ExtractAndAddLanesAll(const void *states, const unsigned char *input, unsigned char *output, unsigned int laneCount, unsigned int laneOffset)
631
- {
632
- const V512 *stateAsLanes = states;
633
- const uint64_t *inAsLanes = (const uint64_t *)input;
634
- uint64_t *outAsLanes = (uint64_t *)output;
635
- unsigned int i;
636
- V256 index;
637
-
638
- #define ExtrAdd( argIndex ) STORE_SCATTER8_64(outAsLanes+argIndex, index, XOR(stateAsLanes[argIndex], LOAD_GATHER8_64(index, inAsLanes+argIndex)))
639
- index = LOAD8_32(7*laneOffset, 6*laneOffset, 5*laneOffset, 4*laneOffset, 3*laneOffset, 2*laneOffset, 1*laneOffset, 0*laneOffset);
640
- if ( laneCount >= 16 ) {
641
- ExtrAdd( 0 );
642
- ExtrAdd( 1 );
643
- ExtrAdd( 2 );
644
- ExtrAdd( 3 );
645
- ExtrAdd( 4 );
646
- ExtrAdd( 5 );
647
- ExtrAdd( 6 );
648
- ExtrAdd( 7 );
649
- ExtrAdd( 8 );
650
- ExtrAdd( 9 );
651
- ExtrAdd( 10 );
652
- ExtrAdd( 11 );
653
- ExtrAdd( 12 );
654
- ExtrAdd( 13 );
655
- ExtrAdd( 14 );
656
- ExtrAdd( 15 );
657
- if ( laneCount >= 20 ) {
658
- ExtrAdd( 16 );
659
- ExtrAdd( 17 );
660
- ExtrAdd( 18 );
661
- ExtrAdd( 19 );
662
- for(i=20; i<laneCount; i++)
663
- ExtrAdd( i );
664
- }
665
- else {
666
- for(i=16; i<laneCount; i++)
667
- ExtrAdd( i );
668
- }
669
- }
670
- else {
671
- for(i=0; i<laneCount; i++)
672
- ExtrAdd( i );
673
- }
674
- #undef ExtrAdd
675
-
676
- }
677
-
678
- static ALIGN(KeccakP1600times8_statesAlignment) const uint64_t KeccakP1600RoundConstants[24] = {
679
- 0x0000000000000001ULL,
680
- 0x0000000000008082ULL,
681
- 0x800000000000808aULL,
682
- 0x8000000080008000ULL,
683
- 0x000000000000808bULL,
684
- 0x0000000080000001ULL,
685
- 0x8000000080008081ULL,
686
- 0x8000000000008009ULL,
687
- 0x000000000000008aULL,
688
- 0x0000000000000088ULL,
689
- 0x0000000080008009ULL,
690
- 0x000000008000000aULL,
691
- 0x000000008000808bULL,
692
- 0x800000000000008bULL,
693
- 0x8000000000008089ULL,
694
- 0x8000000000008003ULL,
695
- 0x8000000000008002ULL,
696
- 0x8000000000000080ULL,
697
- 0x000000000000800aULL,
698
- 0x800000008000000aULL,
699
- 0x8000000080008081ULL,
700
- 0x8000000000008080ULL,
701
- 0x0000000080000001ULL,
702
- 0x8000000080008008ULL};
703
-
704
- #define KeccakP_DeclareVars \
705
- V512 _Ba, _Be, _Bi, _Bo, _Bu; \
706
- V512 _Da, _De, _Di, _Do, _Du; \
707
- V512 _ba, _be, _bi, _bo, _bu; \
708
- V512 _ga, _ge, _gi, _go, _gu; \
709
- V512 _ka, _ke, _ki, _ko, _ku; \
710
- V512 _ma, _me, _mi, _mo, _mu; \
711
- V512 _sa, _se, _si, _so, _su
712
-
713
- #define KeccakP_ThetaRhoPiChi( _L1, _L2, _L3, _L4, _L5, _Bb1, _Bb2, _Bb3, _Bb4, _Bb5, _Rr1, _Rr2, _Rr3, _Rr4, _Rr5 ) \
714
- _Bb1 = XOR(_L1, _Da); \
715
- _Bb2 = XOR(_L2, _De); \
716
- _Bb3 = XOR(_L3, _Di); \
717
- _Bb4 = XOR(_L4, _Do); \
718
- _Bb5 = XOR(_L5, _Du); \
719
- if (_Rr1 != 0) _Bb1 = ROL(_Bb1, _Rr1); \
720
- _Bb2 = ROL(_Bb2, _Rr2); \
721
- _Bb3 = ROL(_Bb3, _Rr3); \
722
- _Bb4 = ROL(_Bb4, _Rr4); \
723
- _Bb5 = ROL(_Bb5, _Rr5); \
724
- _L1 = Chi( _Ba, _Be, _Bi); \
725
- _L2 = Chi( _Be, _Bi, _Bo); \
726
- _L3 = Chi( _Bi, _Bo, _Bu); \
727
- _L4 = Chi( _Bo, _Bu, _Ba); \
728
- _L5 = Chi( _Bu, _Ba, _Be);
729
-
730
- #define KeccakP_ThetaRhoPiChiIota0( _L1, _L2, _L3, _L4, _L5, _rc ) \
731
- _Ba = XOR5( _ba, _ga, _ka, _ma, _sa ); /* Theta effect */ \
732
- _Be = XOR5( _be, _ge, _ke, _me, _se ); \
733
- _Bi = XOR5( _bi, _gi, _ki, _mi, _si ); \
734
- _Bo = XOR5( _bo, _go, _ko, _mo, _so ); \
735
- _Bu = XOR5( _bu, _gu, _ku, _mu, _su ); \
736
- _Da = ROL( _Be, 1 ); \
737
- _De = ROL( _Bi, 1 ); \
738
- _Di = ROL( _Bo, 1 ); \
739
- _Do = ROL( _Bu, 1 ); \
740
- _Du = ROL( _Ba, 1 ); \
741
- _Da = XOR( _Da, _Bu ); \
742
- _De = XOR( _De, _Ba ); \
743
- _Di = XOR( _Di, _Be ); \
744
- _Do = XOR( _Do, _Bi ); \
745
- _Du = XOR( _Du, _Bo ); \
746
- KeccakP_ThetaRhoPiChi( _L1, _L2, _L3, _L4, _L5, _Ba, _Be, _Bi, _Bo, _Bu, 0, 44, 43, 21, 14 ); \
747
- _L1 = XOR(_L1, _rc) /* Iota */
748
-
749
- #define KeccakP_ThetaRhoPiChi1( _L1, _L2, _L3, _L4, _L5 ) \
750
- KeccakP_ThetaRhoPiChi( _L1, _L2, _L3, _L4, _L5, _Bi, _Bo, _Bu, _Ba, _Be, 3, 45, 61, 28, 20 )
751
-
752
- #define KeccakP_ThetaRhoPiChi2( _L1, _L2, _L3, _L4, _L5 ) \
753
- KeccakP_ThetaRhoPiChi( _L1, _L2, _L3, _L4, _L5, _Bu, _Ba, _Be, _Bi, _Bo, 18, 1, 6, 25, 8 )
754
-
755
- #define KeccakP_ThetaRhoPiChi3( _L1, _L2, _L3, _L4, _L5 ) \
756
- KeccakP_ThetaRhoPiChi( _L1, _L2, _L3, _L4, _L5, _Be, _Bi, _Bo, _Bu, _Ba, 36, 10, 15, 56, 27 )
757
-
758
- #define KeccakP_ThetaRhoPiChi4( _L1, _L2, _L3, _L4, _L5 ) \
759
- KeccakP_ThetaRhoPiChi( _L1, _L2, _L3, _L4, _L5, _Bo, _Bu, _Ba, _Be, _Bi, 41, 2, 62, 55, 39 )
760
-
761
- #define KeccakP_4rounds( i ) \
762
- KeccakP_ThetaRhoPiChiIota0(_ba, _ge, _ki, _mo, _su, CONST8_64(KeccakP1600RoundConstants[i]) ); \
763
- KeccakP_ThetaRhoPiChi1( _ka, _me, _si, _bo, _gu ); \
764
- KeccakP_ThetaRhoPiChi2( _sa, _be, _gi, _ko, _mu ); \
765
- KeccakP_ThetaRhoPiChi3( _ga, _ke, _mi, _so, _bu ); \
766
- KeccakP_ThetaRhoPiChi4( _ma, _se, _bi, _go, _ku ); \
767
- \
768
- KeccakP_ThetaRhoPiChiIota0(_ba, _me, _gi, _so, _ku, CONST8_64(KeccakP1600RoundConstants[i+1]) ); \
769
- KeccakP_ThetaRhoPiChi1( _sa, _ke, _bi, _mo, _gu ); \
770
- KeccakP_ThetaRhoPiChi2( _ma, _ge, _si, _ko, _bu ); \
771
- KeccakP_ThetaRhoPiChi3( _ka, _be, _mi, _go, _su ); \
772
- KeccakP_ThetaRhoPiChi4( _ga, _se, _ki, _bo, _mu ); \
773
- \
774
- KeccakP_ThetaRhoPiChiIota0(_ba, _ke, _si, _go, _mu, CONST8_64(KeccakP1600RoundConstants[i+2]) ); \
775
- KeccakP_ThetaRhoPiChi1( _ma, _be, _ki, _so, _gu ); \
776
- KeccakP_ThetaRhoPiChi2( _ga, _me, _bi, _ko, _su ); \
777
- KeccakP_ThetaRhoPiChi3( _sa, _ge, _mi, _bo, _ku ); \
778
- KeccakP_ThetaRhoPiChi4( _ka, _se, _gi, _mo, _bu ); \
779
- \
780
- KeccakP_ThetaRhoPiChiIota0(_ba, _be, _bi, _bo, _bu, CONST8_64(KeccakP1600RoundConstants[i+3]) ); \
781
- KeccakP_ThetaRhoPiChi1( _ga, _ge, _gi, _go, _gu ); \
782
- KeccakP_ThetaRhoPiChi2( _ka, _ke, _ki, _ko, _ku ); \
783
- KeccakP_ThetaRhoPiChi3( _ma, _me, _mi, _mo, _mu ); \
784
- KeccakP_ThetaRhoPiChi4( _sa, _se, _si, _so, _su )
785
-
786
- #define KeccakP_2rounds( i ) \
787
- KeccakP_ThetaRhoPiChiIota0(_ba, _ke, _si, _go, _mu, CONST8_64(KeccakP1600RoundConstants[i]) ); \
788
- KeccakP_ThetaRhoPiChi1( _ma, _be, _ki, _so, _gu ); \
789
- KeccakP_ThetaRhoPiChi2( _ga, _me, _bi, _ko, _su ); \
790
- KeccakP_ThetaRhoPiChi3( _sa, _ge, _mi, _bo, _ku ); \
791
- KeccakP_ThetaRhoPiChi4( _ka, _se, _gi, _mo, _bu ); \
792
- \
793
- KeccakP_ThetaRhoPiChiIota0(_ba, _be, _bi, _bo, _bu, CONST8_64(KeccakP1600RoundConstants[i+1]) ); \
794
- KeccakP_ThetaRhoPiChi1( _ga, _ge, _gi, _go, _gu ); \
795
- KeccakP_ThetaRhoPiChi2( _ka, _ke, _ki, _ko, _ku ); \
796
- KeccakP_ThetaRhoPiChi3( _ma, _me, _mi, _mo, _mu ); \
797
- KeccakP_ThetaRhoPiChi4( _sa, _se, _si, _so, _su )
798
-
799
- #ifdef KeccakP1600times8_fullUnrolling
800
-
801
- #define rounds12 \
802
- KeccakP_4rounds( 12 ); \
803
- KeccakP_4rounds( 16 ); \
804
- KeccakP_4rounds( 20 )
805
-
806
- #define rounds24 \
807
- KeccakP_4rounds( 0 ); \
808
- KeccakP_4rounds( 4 ); \
809
- KeccakP_4rounds( 8 ); \
810
- KeccakP_4rounds( 12 ); \
811
- KeccakP_4rounds( 16 ); \
812
- KeccakP_4rounds( 20 )
813
-
814
- #elif (KeccakP1600times8_unrolling == 4)
815
-
816
- #define rounds12 \
817
- i = 12; \
818
- do { \
819
- KeccakP_4rounds( i ); \
820
- } while( (i += 4) < 24 )
821
-
822
- #define rounds24 \
823
- i = 0; \
824
- do { \
825
- KeccakP_4rounds( i ); \
826
- } while( (i += 4) < 24 )
827
-
828
- #elif (KeccakP1600times8_unrolling == 12)
829
-
830
- #define rounds12 \
831
- KeccakP_4rounds( 12 ); \
832
- KeccakP_4rounds( 16 ); \
833
- KeccakP_4rounds( 20 )
834
-
835
- #define rounds24 \
836
- i = 0; \
837
- do { \
838
- KeccakP_4rounds( i ); \
839
- KeccakP_4rounds( i+4 ); \
840
- KeccakP_4rounds( i+8 ); \
841
- } while( (i += 12) < 24 )
842
-
843
- #else
844
- #error "Unrolling is not correctly specified!"
845
- #endif
846
-
847
- #define rounds6 \
848
- KeccakP_2rounds( 18 ); \
849
- KeccakP_4rounds( 20 )
850
-
851
- #define rounds4 \
852
- KeccakP_4rounds( 20 )
853
-
854
- #define copyFromState(pState) \
855
- _ba = pState[ 0]; \
856
- _be = pState[ 1]; \
857
- _bi = pState[ 2]; \
858
- _bo = pState[ 3]; \
859
- _bu = pState[ 4]; \
860
- _ga = pState[ 5]; \
861
- _ge = pState[ 6]; \
862
- _gi = pState[ 7]; \
863
- _go = pState[ 8]; \
864
- _gu = pState[ 9]; \
865
- _ka = pState[10]; \
866
- _ke = pState[11]; \
867
- _ki = pState[12]; \
868
- _ko = pState[13]; \
869
- _ku = pState[14]; \
870
- _ma = pState[15]; \
871
- _me = pState[16]; \
872
- _mi = pState[17]; \
873
- _mo = pState[18]; \
874
- _mu = pState[19]; \
875
- _sa = pState[20]; \
876
- _se = pState[21]; \
877
- _si = pState[22]; \
878
- _so = pState[23]; \
879
- _su = pState[24]
880
-
881
- #define copyFromState2rounds(pState) \
882
- _ba = pState[ 0]; \
883
- _be = pState[16]; /* me */ \
884
- _bi = pState[ 7]; /* gi */ \
885
- _bo = pState[23]; /* so */ \
886
- _bu = pState[14]; /* ku */ \
887
- _ga = pState[20]; /* sa */ \
888
- _ge = pState[11]; /* ke */ \
889
- _gi = pState[ 2]; /* bi */ \
890
- _go = pState[18]; /* mo */ \
891
- _gu = pState[ 9]; \
892
- _ka = pState[15]; /* ma */ \
893
- _ke = pState[ 6]; /* ge */ \
894
- _ki = pState[22]; /* si */ \
895
- _ko = pState[13]; \
896
- _ku = pState[ 4]; /* bu */ \
897
- _ma = pState[10]; /* ka */ \
898
- _me = pState[ 1]; /* be */ \
899
- _mi = pState[17]; \
900
- _mo = pState[ 8]; /* go */ \
901
- _mu = pState[24]; /* su */ \
902
- _sa = pState[ 5]; /* ga */ \
903
- _se = pState[21]; \
904
- _si = pState[12]; /* ki */ \
905
- _so = pState[ 3]; /* bo */ \
906
- _su = pState[19] /* mu */
907
-
908
- #define copyToState(pState) \
909
- pState[ 0] = _ba; \
910
- pState[ 1] = _be; \
911
- pState[ 2] = _bi; \
912
- pState[ 3] = _bo; \
913
- pState[ 4] = _bu; \
914
- pState[ 5] = _ga; \
915
- pState[ 6] = _ge; \
916
- pState[ 7] = _gi; \
917
- pState[ 8] = _go; \
918
- pState[ 9] = _gu; \
919
- pState[10] = _ka; \
920
- pState[11] = _ke; \
921
- pState[12] = _ki; \
922
- pState[13] = _ko; \
923
- pState[14] = _ku; \
924
- pState[15] = _ma; \
925
- pState[16] = _me; \
926
- pState[17] = _mi; \
927
- pState[18] = _mo; \
928
- pState[19] = _mu; \
929
- pState[20] = _sa; \
930
- pState[21] = _se; \
931
- pState[22] = _si; \
932
- pState[23] = _so; \
933
- pState[24] = _su
934
-
935
- void KeccakP1600times8_PermuteAll_24rounds(void *states)
936
- {
937
- V512 *statesAsLanes = states;
938
- KeccakP_DeclareVars;
939
- #ifndef KeccakP1600times8_fullUnrolling
940
- unsigned int i;
941
- #endif
942
-
943
- copyFromState(statesAsLanes);
944
- rounds24;
945
- copyToState(statesAsLanes);
946
- }
947
-
948
- void KeccakP1600times8_PermuteAll_12rounds(void *states)
949
- {
950
- V512 *statesAsLanes = states;
951
- KeccakP_DeclareVars;
952
- #if (KeccakP1600times8_unrolling < 12)
953
- unsigned int i;
954
- #endif
955
-
956
- copyFromState(statesAsLanes);
957
- rounds12;
958
- copyToState(statesAsLanes);
959
- }
960
-
961
- void KeccakP1600times8_PermuteAll_6rounds(void *states)
962
- {
963
- V512 *statesAsLanes = states;
964
- KeccakP_DeclareVars;
965
-
966
- copyFromState2rounds(statesAsLanes);
967
- rounds6;
968
- copyToState(statesAsLanes);
969
- }
970
-
971
- void KeccakP1600times8_PermuteAll_4rounds(void *states)
972
- {
973
- V512 *statesAsLanes = states;
974
- KeccakP_DeclareVars;
975
-
976
- copyFromState(statesAsLanes);
977
- rounds4;
978
- copyToState(statesAsLanes);
979
- }
980
-
981
- size_t KeccakF1600times8_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, const unsigned char *data, size_t dataByteLen)
982
- {
983
- size_t dataMinimumSize = (laneOffsetParallel*7 + laneCount)*8;
984
-
985
- if (laneCount == 21) {
986
- #ifndef KeccakP1600times8_fullUnrolling
987
- unsigned int i;
988
- #endif
989
- const unsigned char *dataStart = data;
990
- V512 *statesAsLanes = states;
991
- const uint64_t *dataAsLanes = (const uint64_t *)data;
992
- KeccakP_DeclareVars;
993
- V256 index;
994
-
995
- copyFromState(statesAsLanes);
996
- index = LOAD8_32(7*laneOffsetParallel, 6*laneOffsetParallel, 5*laneOffsetParallel, 4*laneOffsetParallel, 3*laneOffsetParallel, 2*laneOffsetParallel, 1*laneOffsetParallel, 0*laneOffsetParallel);
997
- while(dataByteLen >= dataMinimumSize) {
998
- #define Add_In( argLane, argIndex ) argLane = XOR(argLane, LOAD_GATHER8_64(index, dataAsLanes+argIndex))
999
- Add_In( _ba, 0 );
1000
- Add_In( _be, 1 );
1001
- Add_In( _bi, 2 );
1002
- Add_In( _bo, 3 );
1003
- Add_In( _bu, 4 );
1004
- Add_In( _ga, 5 );
1005
- Add_In( _ge, 6 );
1006
- Add_In( _gi, 7 );
1007
- Add_In( _go, 8 );
1008
- Add_In( _gu, 9 );
1009
- Add_In( _ka, 10 );
1010
- Add_In( _ke, 11 );
1011
- Add_In( _ki, 12 );
1012
- Add_In( _ko, 13 );
1013
- Add_In( _ku, 14 );
1014
- Add_In( _ma, 15 );
1015
- Add_In( _me, 16 );
1016
- Add_In( _mi, 17 );
1017
- Add_In( _mo, 18 );
1018
- Add_In( _mu, 19 );
1019
- Add_In( _sa, 20 );
1020
- #undef Add_In
1021
- rounds24;
1022
- dataAsLanes += laneOffsetSerial;
1023
- dataByteLen -= laneOffsetSerial*8;
1024
- }
1025
- copyToState(statesAsLanes);
1026
- return (const unsigned char *)dataAsLanes - dataStart;
1027
- }
1028
- else {
1029
- const unsigned char *dataStart = data;
1030
-
1031
- while(dataByteLen >= dataMinimumSize) {
1032
- KeccakP1600times8_AddLanesAll(states, data, laneCount, laneOffsetParallel);
1033
- KeccakP1600times8_PermuteAll_24rounds(states);
1034
- data += laneOffsetSerial*8;
1035
- dataByteLen -= laneOffsetSerial*8;
1036
- }
1037
- return data - dataStart;
1038
- }
1039
- }
1040
-
1041
- size_t KeccakP1600times8_12rounds_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, const unsigned char *data, size_t dataByteLen)
1042
- {
1043
- size_t dataMinimumSize = (laneOffsetParallel*7 + laneCount)*8;
1044
-
1045
- if (laneCount == 21) {
1046
- #if (KeccakP1600times8_unrolling < 12)
1047
- unsigned int i;
1048
- #endif
1049
- const unsigned char *dataStart = data;
1050
- V512 *statesAsLanes = states;
1051
- const uint64_t *dataAsLanes = (const uint64_t *)data;
1052
- KeccakP_DeclareVars;
1053
- V256 index;
1054
-
1055
- copyFromState(statesAsLanes);
1056
- index = LOAD8_32(7*laneOffsetParallel, 6*laneOffsetParallel, 5*laneOffsetParallel, 4*laneOffsetParallel, 3*laneOffsetParallel, 2*laneOffsetParallel, 1*laneOffsetParallel, 0*laneOffsetParallel);
1057
- while(dataByteLen >= dataMinimumSize) {
1058
- #define Add_In( argLane, argIndex ) argLane = XOR(argLane, LOAD_GATHER8_64(index, dataAsLanes+argIndex))
1059
- Add_In( _ba, 0 );
1060
- Add_In( _be, 1 );
1061
- Add_In( _bi, 2 );
1062
- Add_In( _bo, 3 );
1063
- Add_In( _bu, 4 );
1064
- Add_In( _ga, 5 );
1065
- Add_In( _ge, 6 );
1066
- Add_In( _gi, 7 );
1067
- Add_In( _go, 8 );
1068
- Add_In( _gu, 9 );
1069
- Add_In( _ka, 10 );
1070
- Add_In( _ke, 11 );
1071
- Add_In( _ki, 12 );
1072
- Add_In( _ko, 13 );
1073
- Add_In( _ku, 14 );
1074
- Add_In( _ma, 15 );
1075
- Add_In( _me, 16 );
1076
- Add_In( _mi, 17 );
1077
- Add_In( _mo, 18 );
1078
- Add_In( _mu, 19 );
1079
- Add_In( _sa, 20 );
1080
- #undef Add_In
1081
- rounds12;
1082
- dataAsLanes += laneOffsetSerial;
1083
- dataByteLen -= laneOffsetSerial*8;
1084
- }
1085
- copyToState(statesAsLanes);
1086
- return (const unsigned char *)dataAsLanes - dataStart;
1087
- }
1088
- else {
1089
- const unsigned char *dataStart = data;
1090
-
1091
- while(dataByteLen >= dataMinimumSize) {
1092
- KeccakP1600times8_AddLanesAll(states, data, laneCount, laneOffsetParallel);
1093
- KeccakP1600times8_PermuteAll_12rounds(states);
1094
- data += laneOffsetSerial*8;
1095
- dataByteLen -= laneOffsetSerial*8;
1096
- }
1097
- return data - dataStart;
1098
- }
1099
- }
1100
-
1101
- /* ------------------------------------------------------------------------- */
1102
-
1103
- #define LOAD(p) _mm512_loadu_si512(p)
1104
- #define XOReq(a,b) a = _mm512_xor_si512(a,b)
1105
- #define ZERO() _mm512_setzero_si512()
1106
- #define CONST_64(a) _mm512_set1_epi64(a)
1107
-
1108
- #define chunkSize 8192
1109
- #define rateInBytes 168
1110
-
1111
- #define initializeState(X) \
1112
- X##ba = ZERO(); \
1113
- X##be = ZERO(); \
1114
- X##bi = ZERO(); \
1115
- X##bo = ZERO(); \
1116
- X##bu = ZERO(); \
1117
- X##ga = ZERO(); \
1118
- X##ge = ZERO(); \
1119
- X##gi = ZERO(); \
1120
- X##go = ZERO(); \
1121
- X##gu = ZERO(); \
1122
- X##ka = ZERO(); \
1123
- X##ke = ZERO(); \
1124
- X##ki = ZERO(); \
1125
- X##ko = ZERO(); \
1126
- X##ku = ZERO(); \
1127
- X##ma = ZERO(); \
1128
- X##me = ZERO(); \
1129
- X##mi = ZERO(); \
1130
- X##mo = ZERO(); \
1131
- X##mu = ZERO(); \
1132
- X##sa = ZERO(); \
1133
- X##se = ZERO(); \
1134
- X##si = ZERO(); \
1135
- X##so = ZERO(); \
1136
- X##su = ZERO(); \
1137
-
1138
- #define LoadAndTranspose8(dataAsLanes, offset) \
1139
- t0 = LOAD((dataAsLanes) + (offset) + 0*chunkSize/8); \
1140
- t1 = LOAD((dataAsLanes) + (offset) + 1*chunkSize/8); \
1141
- t2 = LOAD((dataAsLanes) + (offset) + 2*chunkSize/8); \
1142
- t3 = LOAD((dataAsLanes) + (offset) + 3*chunkSize/8); \
1143
- t4 = LOAD((dataAsLanes) + (offset) + 4*chunkSize/8); \
1144
- t5 = LOAD((dataAsLanes) + (offset) + 5*chunkSize/8); \
1145
- t6 = LOAD((dataAsLanes) + (offset) + 6*chunkSize/8); \
1146
- t7 = LOAD((dataAsLanes) + (offset) + 7*chunkSize/8); \
1147
- r0 = _mm512_unpacklo_epi64(t0, t1); \
1148
- r1 = _mm512_unpackhi_epi64(t0, t1); \
1149
- r2 = _mm512_unpacklo_epi64(t2, t3); \
1150
- r3 = _mm512_unpackhi_epi64(t2, t3); \
1151
- r4 = _mm512_unpacklo_epi64(t4, t5); \
1152
- r5 = _mm512_unpackhi_epi64(t4, t5); \
1153
- r6 = _mm512_unpacklo_epi64(t6, t7); \
1154
- r7 = _mm512_unpackhi_epi64(t6, t7); \
1155
- t0 = _mm512_shuffle_i32x4(r0, r2, 0x88); \
1156
- t1 = _mm512_shuffle_i32x4(r1, r3, 0x88); \
1157
- t2 = _mm512_shuffle_i32x4(r0, r2, 0xdd); \
1158
- t3 = _mm512_shuffle_i32x4(r1, r3, 0xdd); \
1159
- t4 = _mm512_shuffle_i32x4(r4, r6, 0x88); \
1160
- t5 = _mm512_shuffle_i32x4(r5, r7, 0x88); \
1161
- t6 = _mm512_shuffle_i32x4(r4, r6, 0xdd); \
1162
- t7 = _mm512_shuffle_i32x4(r5, r7, 0xdd); \
1163
- r0 = _mm512_shuffle_i32x4(t0, t4, 0x88); \
1164
- r1 = _mm512_shuffle_i32x4(t1, t5, 0x88); \
1165
- r2 = _mm512_shuffle_i32x4(t2, t6, 0x88); \
1166
- r3 = _mm512_shuffle_i32x4(t3, t7, 0x88); \
1167
- r4 = _mm512_shuffle_i32x4(t0, t4, 0xdd); \
1168
- r5 = _mm512_shuffle_i32x4(t1, t5, 0xdd); \
1169
- r6 = _mm512_shuffle_i32x4(t2, t6, 0xdd); \
1170
- r7 = _mm512_shuffle_i32x4(t3, t7, 0xdd); \
1171
-
1172
- #define XORdata16(X, index, dataAsLanes) \
1173
- LoadAndTranspose8(dataAsLanes, 0) \
1174
- XOReq(X##ba, r0); \
1175
- XOReq(X##be, r1); \
1176
- XOReq(X##bi, r2); \
1177
- XOReq(X##bo, r3); \
1178
- XOReq(X##bu, r4); \
1179
- XOReq(X##ga, r5); \
1180
- XOReq(X##ge, r6); \
1181
- XOReq(X##gi, r7); \
1182
- LoadAndTranspose8(dataAsLanes, 8) \
1183
- XOReq(X##go, r0); \
1184
- XOReq(X##gu, r1); \
1185
- XOReq(X##ka, r2); \
1186
- XOReq(X##ke, r3); \
1187
- XOReq(X##ki, r4); \
1188
- XOReq(X##ko, r5); \
1189
- XOReq(X##ku, r6); \
1190
- XOReq(X##ma, r7); \
1191
-
1192
- #define XORdata21(X, index, dataAsLanes) \
1193
- XORdata16(X, index, dataAsLanes) \
1194
- XOReq(X##me, LOAD_GATHER8_64(index, (dataAsLanes) + 16)); \
1195
- XOReq(X##mi, LOAD_GATHER8_64(index, (dataAsLanes) + 17)); \
1196
- XOReq(X##mo, LOAD_GATHER8_64(index, (dataAsLanes) + 18)); \
1197
- XOReq(X##mu, LOAD_GATHER8_64(index, (dataAsLanes) + 19)); \
1198
- XOReq(X##sa, LOAD_GATHER8_64(index, (dataAsLanes) + 20)); \
1199
-
1200
- void KeccakP1600times8_K12ProcessLeaves(const unsigned char *input, unsigned char *output)
1201
- {
1202
- KeccakP_DeclareVars;
1203
- unsigned int j;
1204
- const uint64_t *outputAsLanes = (const uint64_t *)output;
1205
- __m256i index;
1206
- __m512i t0, t1, t2, t3, t4, t5, t6, t7;
1207
- __m512i r0, r1, r2, r3, r4, r5, r6, r7;
1208
-
1209
- initializeState(_);
1210
-
1211
- index = LOAD8_32(7*(chunkSize / 8), 6*(chunkSize / 8), 5*(chunkSize / 8), 4*(chunkSize / 8), 3*(chunkSize / 8), 2*(chunkSize / 8), 1*(chunkSize / 8), 0*(chunkSize / 8));
1212
- for(j = 0; j < (chunkSize - rateInBytes); j += rateInBytes) {
1213
- XORdata21(_, index, (const uint64_t *)input);
1214
- rounds12
1215
- input += rateInBytes;
1216
- }
1217
-
1218
- XORdata16(_, index, (const uint64_t *)input);
1219
- XOReq(_me, CONST_64(0x0BULL));
1220
- XOReq(_sa, CONST_64(0x8000000000000000ULL));
1221
- rounds12
1222
-
1223
- index = LOAD8_32(7*4, 6*4, 5*4, 4*4, 3*4, 2*4, 1*4, 0*4);
1224
- STORE_SCATTER8_64(outputAsLanes+0, index, _ba);
1225
- STORE_SCATTER8_64(outputAsLanes+1, index, _be);
1226
- STORE_SCATTER8_64(outputAsLanes+2, index, _bi);
1227
- STORE_SCATTER8_64(outputAsLanes+3, index, _bo);
1228
- }
1229
-
1230
- #undef LOAD
1231
- #undef XOReq
1232
- #undef ZERO
1233
- #undef CONST_64
1234
- #undef chunkSize
1235
- #undef rateInBytes
1236
-
1237
- /* ------------------------------------------------------------------------- */
1238
-
1239
- /* Remap lanes to start after two rounds */
1240
- #define Iba _ba
1241
- #define Ibe _me
1242
- #define Ibi _gi
1243
- #define Ibo _so
1244
- #define Ibu _ku
1245
- #define Iga _sa
1246
- #define Ige _ke
1247
- #define Igi _bi
1248
- #define Igo _mo
1249
- #define Igu _gu
1250
- #define Ika _ma
1251
- #define Ike _ge
1252
- #define Iki _si
1253
- #define Iko _ko
1254
- #define Iku _bu
1255
- #define Ima _ka
1256
- #define Ime _be
1257
- #define Imi _mi
1258
- #define Imo _go
1259
- #define Imu _su
1260
- #define Isa _ga
1261
- #define Ise _se
1262
- #define Isi _ki
1263
- #define Iso _bo
1264
- #define Isu _mu
1265
-
1266
- #define LoadInput(argIndex) _mm512_i32gather_epi64(gather, (const long long int *)&in64[argIndex], 8)
1267
- #define AddInput(argIndex) XOR( LoadInput(argIndex), CONST8_64(kRoll[argIndex]))
1268
-
1269
-
1270
- ALIGN(64) static const uint64_t oLow256[] = { 0, 1, 2, 3, 8+0, 8+1, 8+2, 8+3 };
1271
- ALIGN(64) static const uint64_t oHigh256[] = { 4, 5, 6, 7, 8+4, 8+5, 8+6, 8+7 };
1272
-
1273
- ALIGN(64) static const uint64_t oLow128[] = { 0, 1, 8+0, 8+1, 4, 5, 8+4, 8+5 };
1274
- ALIGN(64) static const uint64_t oHigh128[] = { 2, 3, 8+2, 8+3, 6, 7, 8+6, 8+7 };
1275
-
1276
- ALIGN(64) static const uint64_t oLow64[] = { 0, 8+0, 2, 8+2, 4, 8+4, 6, 8+6 };
1277
- ALIGN(64) static const uint64_t oHigh64[] = { 1, 8+1, 3, 8+3, 5, 8+5, 7, 8+7 };
1278
-
1279
- ALIGN(64) static const uint64_t o01234_012[] = { 0, 1, 2, 3, 4, 8+0, 8+1, 8+2 };
1280
- ALIGN(64) static const uint64_t o1234_0123[] = { 1, 2, 3, 4, 8+0, 8+1, 8+2, 8+3 };
1281
- ALIGN(64) static const uint64_t o1234567_0[] = { 1, 2, 3, 4, 5, 6, 7, 8+0 };
1282
- ALIGN(64) static const uint64_t o1234567_3[] = { 1, 2, 3, 4, 5, 6, 7, 8+3 };
1283
- ALIGN(64) static const uint64_t o1234567_4[] = { 1, 2, 3, 4, 5, 6, 7, 8+4 };
1284
- ALIGN(64) static const uint64_t o234567_45[] = { 2, 3, 4, 5, 6, 7, 8+4, 8+5 };
1285
- ALIGN(64) static const uint64_t o34567_456[] = { 3, 4, 5, 6, 7, 8+4, 8+5, 8+6 };
1286
-
1287
- ALIGN(32) static const uint32_t oGatherScatter[]= {0*25, 1*25, 2*25, 3*25, 4*25, 5*25, 6*25, 7*25};
1288
-
1289
#if defined(__i386__) || defined(_M_IX86)
/*
 * 32-bit x86 fallback for _mm256_extract_epi64: rebuild 64-bit element
 * `index` of `a` from its two 32-bit halves.
 *
 * - The halves must be combined with bitwise OR; a logical OR would collapse
 *   the whole expression to 0 or 1.
 * - _mm256_extract_epi32 returns a signed int, so each half is first
 *   converted to uint32_t to avoid sign-extending the low half into the
 *   upper 32 bits.
 */
#define _mm256_extract_epi64(a, index) \
    ((uint64_t)(uint32_t)_mm256_extract_epi32((a), (index)*2) \
     | ((uint64_t)(uint32_t)_mm256_extract_epi32((a), (index)*2+1) << 32))
#endif
1293
-
1294
- size_t KeccakP1600times8_KravatteCompress(uint64_t *xAccu, uint64_t *kRoll, const unsigned char *input, size_t inputByteLen)
1295
- {
1296
- #if !defined(KeccakP1600times4_fullUnrolling)
1297
- unsigned int i;
1298
- #endif
1299
- uint64_t *in64 = (uint64_t *)input;
1300
- size_t nBlocks = inputByteLen / (8 * 200);
1301
- KeccakP_DeclareVars;
1302
- V512 x01234567, x12345678;
1303
- V512 Xba, Xbe, Xbi, Xbo, Xbu;
1304
- V512 Xga, Xge, Xgi, Xgo, Xgu;
1305
- V512 Xka, Xke, Xki, Xko, Xku;
1306
- V512 Xma, Xme, Xmi, Xmo, Xmu;
1307
- V512 Xsa, Xse, Xsi, Xso, Xsu;
1308
- V256 v1;
1309
- V512 p1, p2;
1310
- V256 gather = *(V256*)oGatherScatter;
1311
-
1312
- /* Clear internal X accu */
1313
- Xba = _mm512_setzero_si512();
1314
- Xbe = _mm512_setzero_si512();
1315
- Xbi = _mm512_setzero_si512();
1316
- Xbo = _mm512_setzero_si512();
1317
- Xbu = _mm512_setzero_si512();
1318
- Xga = _mm512_setzero_si512();
1319
- Xge = _mm512_setzero_si512();
1320
- Xgi = _mm512_setzero_si512();
1321
- Xgo = _mm512_setzero_si512();
1322
- Xgu = _mm512_setzero_si512();
1323
- Xka = _mm512_setzero_si512();
1324
- Xke = _mm512_setzero_si512();
1325
- Xki = _mm512_setzero_si512();
1326
- Xko = _mm512_setzero_si512();
1327
- Xku = _mm512_setzero_si512();
1328
- Xma = _mm512_setzero_si512();
1329
- Xme = _mm512_setzero_si512();
1330
- Xmi = _mm512_setzero_si512();
1331
- Xmo = _mm512_setzero_si512();
1332
- Xmu = _mm512_setzero_si512();
1333
- Xsa = _mm512_setzero_si512();
1334
- Xse = _mm512_setzero_si512();
1335
- Xsi = _mm512_setzero_si512();
1336
- Xso = _mm512_setzero_si512();
1337
- Xsu = _mm512_setzero_si512();
1338
-
1339
- /* prepare 8 lanes for roll-c */
1340
- x01234567 = _mm512_maskz_loadu_epi64(0x1F, &kRoll[20]); /* 5 lanes ok */
1341
- _ba = _mm512_maskz_loadu_epi64(0x0F, &kRoll[21]); /* 4 lanes ok */
1342
- _be = XOR3(ROL(x01234567, 7), _ba, _mm512_srli_epi64(_ba, 3));
1343
- x01234567 = _mm512_permutex2var_epi64(x01234567, *(V512*)o01234_012, _be);
1344
- x12345678 = _mm512_permutex2var_epi64(x01234567, *(V512*)o1234_0123, _be);
1345
-
1346
- do {
1347
- Iba = AddInput( 0);
1348
- Ibe = AddInput( 1);
1349
- Ibi = AddInput( 2);
1350
- Ibo = AddInput( 3);
1351
- Ibu = AddInput( 4);
1352
- Iga = AddInput( 5);
1353
- Ige = AddInput( 6);
1354
- Igi = AddInput( 7);
1355
- Igo = AddInput( 8);
1356
- Igu = AddInput( 9);
1357
- Ika = AddInput(10);
1358
- Ike = AddInput(11);
1359
- Iki = AddInput(12);
1360
- Iko = AddInput(13);
1361
- Iku = AddInput(14);
1362
- Ima = AddInput(15);
1363
- Ime = AddInput(16);
1364
- Imi = AddInput(17);
1365
- Imo = AddInput(18);
1366
- Imu = AddInput(19);
1367
-
1368
- /* Roll-c */
1369
- Isa = x01234567;
1370
- Ise = x12345678;
1371
- Isu = XOR3(ROL(x01234567, 7), x12345678, _mm512_srli_epi64(x12345678, 3));
1372
- Ise = _mm512_permutex2var_epi64(x01234567, *(V512*)o1234567_3, Isu);
1373
- Isi = _mm512_permutex2var_epi64(Ise, *(V512*)o1234567_4, Isu);
1374
- Iso = _mm512_permutex2var_epi64(Ise, *(V512*)o234567_45, Isu);
1375
- Isu = _mm512_permutex2var_epi64(Ise, *(V512*)o34567_456, Isu);
1376
-
1377
- x01234567 = XOR3(ROL(Iso, 7), Isu, _mm512_srli_epi64(Isu, 3));
1378
- x12345678 = _mm512_permutex2var_epi64(x01234567, *(V512*)o1234567_4, x01234567);
1379
-
1380
- XOReq512(Isa, LoadInput(20));
1381
- XOReq512(Ise, LoadInput(21));
1382
- XOReq512(Isi, LoadInput(22));
1383
- XOReq512(Iso, LoadInput(23));
1384
- XOReq512(Isu, LoadInput(24));
1385
-
1386
- rounds6
1387
- Dump( "P-out", _);
1388
-
1389
- /* Accumulate in X */
1390
- XOReq512(Xba, _ba);
1391
- XOReq512(Xbe, _be);
1392
- XOReq512(Xbi, _bi);
1393
- XOReq512(Xbo, _bo);
1394
- XOReq512(Xbu, _bu);
1395
- XOReq512(Xga, _ga);
1396
- XOReq512(Xge, _ge);
1397
- XOReq512(Xgi, _gi);
1398
- XOReq512(Xgo, _go);
1399
- XOReq512(Xgu, _gu);
1400
- XOReq512(Xka, _ka);
1401
- XOReq512(Xke, _ke);
1402
- XOReq512(Xki, _ki);
1403
- XOReq512(Xko, _ko);
1404
- XOReq512(Xku, _ku);
1405
- XOReq512(Xma, _ma);
1406
- XOReq512(Xme, _me);
1407
- XOReq512(Xmi, _mi);
1408
- XOReq512(Xmo, _mo);
1409
- XOReq512(Xmu, _mu);
1410
- XOReq512(Xsa, _sa);
1411
- XOReq512(Xse, _se);
1412
- XOReq512(Xsi, _si);
1413
- XOReq512(Xso, _so);
1414
- XOReq512(Xsu, _su);
1415
- Dump( "X", X);
1416
-
1417
- in64 += 8 * 25;
1418
- }
1419
- while(--nBlocks != 0);
1420
-
1421
- /* Add horizontally Xba ... Xgi Reduce from lanes 8 to 4 */
1422
- p1 = *(V512*)oLow256;
1423
- p2 = *(V512*)oHigh256;
1424
- Xba = XOR(_mm512_permutex2var_epi64(Xba, p1, Xbu), _mm512_permutex2var_epi64(Xba, p2, Xbu));
1425
- Xbe = XOR(_mm512_permutex2var_epi64(Xbe, p1, Xga), _mm512_permutex2var_epi64(Xbe, p2, Xga));
1426
- Xbi = XOR(_mm512_permutex2var_epi64(Xbi, p1, Xge), _mm512_permutex2var_epi64(Xbi, p2, Xge));
1427
- Xbo = XOR(_mm512_permutex2var_epi64(Xbo, p1, Xgi), _mm512_permutex2var_epi64(Xbo, p2, Xgi));
1428
-
1429
- /* Add horizontally Xgo ... Xma Reduce from lanes 8 to 4 */
1430
- Xgo = XOR(_mm512_permutex2var_epi64(Xgo, p1, Xki), _mm512_permutex2var_epi64(Xgo, p2, Xki));
1431
- Xgu = XOR(_mm512_permutex2var_epi64(Xgu, p1, Xko), _mm512_permutex2var_epi64(Xgu, p2, Xko));
1432
- Xka = XOR(_mm512_permutex2var_epi64(Xka, p1, Xku), _mm512_permutex2var_epi64(Xka, p2, Xku));
1433
- Xke = XOR(_mm512_permutex2var_epi64(Xke, p1, Xma), _mm512_permutex2var_epi64(Xke, p2, Xma));
1434
-
1435
- /* Add horizontally Xme ... Xso Reduce from lanes 8 to 4 */
1436
- Xme = XOR(_mm512_permutex2var_epi64(Xme, p1, Xsa), _mm512_permutex2var_epi64(Xme, p2, Xsa));
1437
- Xmi = XOR(_mm512_permutex2var_epi64(Xmi, p1, Xse), _mm512_permutex2var_epi64(Xmi, p2, Xse));
1438
- Xmo = XOR(_mm512_permutex2var_epi64(Xmo, p1, Xsi), _mm512_permutex2var_epi64(Xmo, p2, Xsi));
1439
- Xmu = XOR(_mm512_permutex2var_epi64(Xmu, p1, Xso), _mm512_permutex2var_epi64(Xmu, p2, Xso));
1440
-
1441
- /* Add horizontally Xba ... Xbo Reduce from lanes 4 to 2 */
1442
- p1 = *(V512*)oLow128;
1443
- p2 = *(V512*)oHigh128;
1444
- Xba = XOR(_mm512_permutex2var_epi64(Xba, p1, Xbi), _mm512_permutex2var_epi64(Xba, p2, Xbi));
1445
- Xbe = XOR(_mm512_permutex2var_epi64(Xbe, p1, Xbo), _mm512_permutex2var_epi64(Xbe, p2, Xbo));
1446
-
1447
- /* Add horizontally Xgo ... Xke Reduce from lanes 4 to 2 */
1448
- Xgo = XOR(_mm512_permutex2var_epi64(Xgo, p1, Xka), _mm512_permutex2var_epi64(Xgo, p2, Xka));
1449
- Xgu = XOR(_mm512_permutex2var_epi64(Xgu, p1, Xke), _mm512_permutex2var_epi64(Xgu, p2, Xke));
1450
-
1451
- /* Add horizontally Xme ... Xmu Reduce from lanes 4 to 2 */
1452
- Xme = XOR(_mm512_permutex2var_epi64(Xme, p1, Xmo), _mm512_permutex2var_epi64(Xme, p2, Xmo));
1453
- Xmi = XOR(_mm512_permutex2var_epi64(Xmi, p1, Xmu), _mm512_permutex2var_epi64(Xmi, p2, Xmu));
1454
-
1455
- /* Add horizontally Xba ... Xbe Reduce from lanes 2 to 1 */
1456
- p1 = *(V512*)oLow64;
1457
- p2 = *(V512*)oHigh64;
1458
- Xba = XOR(_mm512_permutex2var_epi64(Xba, p1, Xbe), _mm512_permutex2var_epi64(Xba, p2, Xbe));
1459
-
1460
- /* Add horizontally Xgo ... Xgu Reduce from lanes 2 to 1 */
1461
- Xgo = XOR(_mm512_permutex2var_epi64(Xgo, p1, Xgu), _mm512_permutex2var_epi64(Xgo, p2, Xgu));
1462
-
1463
- /* Add horizontally Xme ... Xmi Reduce from lanes 2 to 1 */
1464
- Xme = XOR(_mm512_permutex2var_epi64(Xme, p1, Xmi), _mm512_permutex2var_epi64(Xme, p2, Xmi));
1465
-
1466
- /* Add and store in xAccu */
1467
- Xba = XOR( Xba, LOAD512u(xAccu[0]));
1468
- Xgo = XOR( Xgo, LOAD512u(xAccu[8]));
1469
- Xme = XOR( Xme, LOAD512u(xAccu[16]));
1470
- _mm512_storeu_si512((V512*)&xAccu[0], Xba);
1471
- _mm512_storeu_si512((V512*)&xAccu[8], Xgo);
1472
- _mm512_storeu_si512((V512*)&xAccu[16], Xme);
1473
-
1474
- /* Add horizontally Xsu */
1475
- v1 = _mm256_xor_si256( _mm512_extracti64x4_epi64(Xsu, 0), _mm512_extracti64x4_epi64(Xsu, 1));
1476
- v1 = _mm256_xor_si256( v1, _mm256_permute4x64_epi64(v1, 0xEE));
1477
- xAccu[24] ^= _mm256_extract_epi64(v1, 0) ^ _mm256_extract_epi64(v1, 1);
1478
- DumpMem("xAccu", xAccu, 5*5);
1479
-
1480
- /* Store new kRoll */
1481
- _mm512_mask_storeu_epi64(&kRoll[20], 0x1F, x01234567);
1482
- DumpMem("Next kRoll", kRoll+20, 5);
1483
-
1484
- return (size_t)in64 - (size_t)input;
1485
- }
1486
-
1487
- #undef LoadInput
1488
- #undef AddInput
1489
-
1490
- ALIGN(64) static const uint64_t o1234567_6[] = { 1, 2, 3, 4, 5, 6, 7, 8+6 };
1491
- ALIGN(64) static const uint64_t o234567_01[] = { 2, 3, 4, 5, 6, 7, 8+0, 8+1 };
1492
- ALIGN(64) static const uint64_t o34567_012[] = { 3, 4, 5, 6, 7, 8+0, 8+1, 8+2 };
1493
- ALIGN(64) static const uint64_t o4567_0123[] = { 4, 5, 6, 7, 8+0, 8+1, 8+2, 8+3 };
1494
- ALIGN(64) static const uint64_t o567_01234[] = { 5, 6, 7, 8+0, 8+1, 8+2, 8+3, 8+4 };
1495
- ALIGN(64) static const uint64_t o67_012345[] = { 6, 7, 8+0, 8+1, 8+2, 8+3, 8+4, 8+5 };
1496
- ALIGN(64) static const uint64_t o7_0123456[] = { 7, 8+0, 8+1, 8+2, 8+3, 8+4, 8+5, 8+6 };
1497
-
1498
- size_t KeccakP1600times8_KravatteExpand(uint64_t *yAccu, const uint64_t *kRoll, unsigned char *output, size_t outputByteLen)
1499
- {
1500
- uint64_t *o64 = (uint64_t *)output;
1501
- size_t nBlocks = outputByteLen / (8 * 200);
1502
- KeccakP_DeclareVars;
1503
- #if !defined(KeccakP1600times4_fullUnrolling)
1504
- unsigned int i;
1505
- #endif
1506
- V512 x01234567, x23456789;
1507
- V256 scatter = *(V256*)oGatherScatter;
1508
-
1509
- x01234567 = LOAD512u(yAccu[15]);
1510
- x23456789 = LOAD512u(yAccu[17]);
1511
-
1512
- do {
1513
- Iba = CONST8_64(yAccu[0]);
1514
- Ibe = CONST8_64(yAccu[1]);
1515
- Ibi = CONST8_64(yAccu[2]);
1516
- Ibo = CONST8_64(yAccu[3]);
1517
- Ibu = CONST8_64(yAccu[4]);
1518
-
1519
- Iga = CONST8_64(yAccu[5]);
1520
- Ige = CONST8_64(yAccu[6]);
1521
- Igi = CONST8_64(yAccu[7]);
1522
- Igo = CONST8_64(yAccu[8]);
1523
- Igu = CONST8_64(yAccu[9]);
1524
-
1525
- Ika = CONST8_64(yAccu[10]);
1526
- Ike = CONST8_64(yAccu[11]);
1527
- Iki = CONST8_64(yAccu[12]);
1528
- Iko = CONST8_64(yAccu[13]);
1529
- Iku = CONST8_64(yAccu[14]);
1530
-
1531
- /* roll-e */
1532
- Ima = x01234567;
1533
- Ime = _mm512_permutex2var_epi64(x01234567, *(V512*)o1234567_6, x23456789);
1534
- Imi = x23456789;
1535
-
1536
- x23456789 = XOR3(ROL(Ima, 7), ROL(Ime, 18), _mm512_and_si512(Imi, _mm512_srli_epi64(Ime, 1)));
1537
- Imo = _mm512_permutex2var_epi64(Imi, *(V512*)o1234567_0, x23456789);
1538
- Imu = _mm512_permutex2var_epi64(Imi, *(V512*)o234567_01, x23456789);
1539
- Isa = _mm512_permutex2var_epi64(Imi, *(V512*)o34567_012, x23456789);
1540
- Ise = _mm512_permutex2var_epi64(Imi, *(V512*)o4567_0123, x23456789);
1541
- Isi = _mm512_permutex2var_epi64(Imi, *(V512*)o567_01234, x23456789);
1542
- Iso = _mm512_permutex2var_epi64(Imi, *(V512*)o67_012345, x23456789);
1543
- Isu = _mm512_permutex2var_epi64(Imi, *(V512*)o7_0123456, x23456789);
1544
- x01234567 = Iso;
1545
- Dump( "After roll-e", I);
1546
-
1547
- rounds6
1548
-
1549
- /* Add kRoll */
1550
- _ba = XOR(_ba, CONST8_64(kRoll[0]));
1551
- _be = XOR(_be, CONST8_64(kRoll[1]));
1552
- _bi = XOR(_bi, CONST8_64(kRoll[2]));
1553
- _bo = XOR(_bo, CONST8_64(kRoll[3]));
1554
- _bu = XOR(_bu, CONST8_64(kRoll[4]));
1555
- _ga = XOR(_ga, CONST8_64(kRoll[5]));
1556
- _ge = XOR(_ge, CONST8_64(kRoll[6]));
1557
- _gi = XOR(_gi, CONST8_64(kRoll[7]));
1558
- _go = XOR(_go, CONST8_64(kRoll[8]));
1559
- _gu = XOR(_gu, CONST8_64(kRoll[9]));
1560
- _ka = XOR(_ka, CONST8_64(kRoll[10]));
1561
- _ke = XOR(_ke, CONST8_64(kRoll[11]));
1562
- _ki = XOR(_ki, CONST8_64(kRoll[12]));
1563
- _ko = XOR(_ko, CONST8_64(kRoll[13]));
1564
- _ku = XOR(_ku, CONST8_64(kRoll[14]));
1565
- _ma = XOR(_ma, CONST8_64(kRoll[15]));
1566
- _me = XOR(_me, CONST8_64(kRoll[16]));
1567
- _mi = XOR(_mi, CONST8_64(kRoll[17]));
1568
- _mo = XOR(_mo, CONST8_64(kRoll[18]));
1569
- _mu = XOR(_mu, CONST8_64(kRoll[19]));
1570
- _sa = XOR(_sa, CONST8_64(kRoll[20]));
1571
- _se = XOR(_se, CONST8_64(kRoll[21]));
1572
- _si = XOR(_si, CONST8_64(kRoll[22]));
1573
- _so = XOR(_so, CONST8_64(kRoll[23]));
1574
- _su = XOR(_su, CONST8_64(kRoll[24]));
1575
- Dump( "After add kRoll", _);
1576
-
1577
- /* Extract */
1578
- STORE_SCATTER8_64(o64+0, scatter, _ba);
1579
- STORE_SCATTER8_64(o64+1, scatter, _be);
1580
- STORE_SCATTER8_64(o64+2, scatter, _bi);
1581
- STORE_SCATTER8_64(o64+3, scatter, _bo);
1582
- STORE_SCATTER8_64(o64+4, scatter, _bu);
1583
- STORE_SCATTER8_64(o64+5, scatter, _ga);
1584
- STORE_SCATTER8_64(o64+6, scatter, _ge);
1585
- STORE_SCATTER8_64(o64+7, scatter, _gi);
1586
- STORE_SCATTER8_64(o64+8, scatter, _go);
1587
- STORE_SCATTER8_64(o64+9, scatter, _gu);
1588
- STORE_SCATTER8_64(o64+10, scatter, _ka);
1589
- STORE_SCATTER8_64(o64+11, scatter, _ke);
1590
- STORE_SCATTER8_64(o64+12, scatter, _ki);
1591
- STORE_SCATTER8_64(o64+13, scatter, _ko);
1592
- STORE_SCATTER8_64(o64+14, scatter, _ku);
1593
- STORE_SCATTER8_64(o64+15, scatter, _ma);
1594
- STORE_SCATTER8_64(o64+16, scatter, _me);
1595
- STORE_SCATTER8_64(o64+17, scatter, _mi);
1596
- STORE_SCATTER8_64(o64+18, scatter, _mo);
1597
- STORE_SCATTER8_64(o64+19, scatter, _mu);
1598
- STORE_SCATTER8_64(o64+20, scatter, _sa);
1599
- STORE_SCATTER8_64(o64+21, scatter, _se);
1600
- STORE_SCATTER8_64(o64+22, scatter, _si);
1601
- STORE_SCATTER8_64(o64+23, scatter, _so);
1602
- STORE_SCATTER8_64(o64+24, scatter, _su);
1603
- DumpMem("Output", o64, 8*25);
1604
-
1605
- o64 += 8 * 25;
1606
- }
1607
- while(--nBlocks != 0);
1608
-
1609
- /* Store new yAccu */
1610
- _mm512_mask_storeu_epi64(&yAccu[15], 0xFF, x01234567);
1611
- _mm512_mask_storeu_epi64(&yAccu[17], 0xC0, x23456789);
1612
- DumpMem("yAccu", yAccu, 25);
1613
-
1614
- return (size_t)o64 - (size_t)output;
1615
- }