sleeping_kangaroo12 0.0.3 → 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (291) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +34 -67
  3. data/ext/Rakefile +12 -37
  4. data/ext/binding/sleeping_kangaroo12.c +1 -16
  5. data/ext/{xkcp → k12}/Makefile +0 -0
  6. data/ext/k12/Makefile.build +118 -0
  7. data/ext/k12/README.markdown +86 -0
  8. data/ext/k12/lib/ARMv8Asha3/KeccakP-1600-ARMv8Asha3.S +623 -0
  9. data/ext/k12/lib/ARMv8Asha3/KeccakP-1600-SnP.h +65 -0
  10. data/ext/k12/lib/ARMv8Asha3/KeccakP-1600-opt64.c +227 -0
  11. data/ext/{xkcp/lib/low/KeccakP-1600/compact → k12/lib/Inplace32BI}/KeccakP-1600-SnP.h +4 -9
  12. data/ext/{xkcp/lib/low/KeccakP-1600/plain-32bits-inplace → k12/lib/Inplace32BI}/KeccakP-1600-inplace32BI.c +65 -160
  13. data/ext/k12/lib/KangarooTwelve.c +332 -0
  14. data/ext/{xkcp/lib/high/KangarooTwelve → k12/lib}/KangarooTwelve.h +53 -16
  15. data/ext/{xkcp/lib/low/KeccakP-1600/AVX2 → k12/lib/Optimized64}/KeccakP-1600-AVX2.s +122 -558
  16. data/ext/k12/lib/Optimized64/KeccakP-1600-AVX512-plainC.c +241 -0
  17. data/ext/k12/lib/Optimized64/KeccakP-1600-AVX512.s +551 -0
  18. data/ext/k12/lib/Optimized64/KeccakP-1600-SnP.h +74 -0
  19. data/ext/{xkcp/lib/low/KeccakP-1600/common/KeccakP-1600-64.macros → k12/lib/Optimized64/KeccakP-1600-opt64.c} +447 -169
  20. data/ext/k12/lib/Optimized64/KeccakP-1600-runtimeDispatch.c +406 -0
  21. data/ext/k12/lib/Optimized64/KeccakP-1600-timesN-AVX2.c +419 -0
  22. data/ext/k12/lib/Optimized64/KeccakP-1600-timesN-AVX512.c +458 -0
  23. data/ext/k12/lib/Optimized64/KeccakP-1600-timesN-SSSE3.c +438 -0
  24. data/ext/{xkcp/lib/low/KeccakP-1600/plain-64bits → k12/lib/Plain64}/KeccakP-1600-SnP.h +14 -20
  25. data/ext/{xkcp/lib/low/KeccakP-1600/ref-64bits/KeccakP-1600-reference.h → k12/lib/Plain64/KeccakP-1600-plain64.c} +9 -8
  26. data/ext/{xkcp/lib/common → k12/lib}/align.h +3 -2
  27. data/ext/{xkcp/lib/common → k12/lib}/brg_endian.h +0 -0
  28. data/ext/{xkcp → k12}/support/Build/ExpandProducts.xsl +0 -0
  29. data/ext/{xkcp → k12}/support/Build/ToGlobalMakefile.xsl +0 -0
  30. data/ext/{xkcp → k12}/support/Build/ToOneTarget.xsl +0 -0
  31. data/ext/{xkcp → k12}/support/Build/ToTargetConfigFile.xsl +0 -0
  32. data/ext/{xkcp → k12}/support/Build/ToTargetMakefile.xsl +10 -16
  33. data/ext/{xkcp → k12}/support/Build/ToVCXProj.xsl +0 -0
  34. data/lib/sleeping_kangaroo12/version.rb +1 -1
  35. metadata +33 -276
  36. data/ext/config/xkcp.build +0 -17
  37. data/ext/xkcp/LICENSE +0 -1
  38. data/ext/xkcp/Makefile.build +0 -200
  39. data/ext/xkcp/README.markdown +0 -296
  40. data/ext/xkcp/lib/HighLevel.build +0 -143
  41. data/ext/xkcp/lib/LowLevel.build +0 -757
  42. data/ext/xkcp/lib/high/KangarooTwelve/KangarooTwelve.c +0 -301
  43. data/ext/xkcp/lib/high/Keccak/FIPS202/KeccakHash.c +0 -81
  44. data/ext/xkcp/lib/high/Keccak/FIPS202/KeccakHash.h +0 -125
  45. data/ext/xkcp/lib/high/Keccak/FIPS202/SimpleFIPS202.c +0 -48
  46. data/ext/xkcp/lib/high/Keccak/FIPS202/SimpleFIPS202.h +0 -79
  47. data/ext/xkcp/lib/high/Keccak/KeccakDuplex.c +0 -81
  48. data/ext/xkcp/lib/high/Keccak/KeccakDuplex.h +0 -73
  49. data/ext/xkcp/lib/high/Keccak/KeccakDuplex.inc +0 -195
  50. data/ext/xkcp/lib/high/Keccak/KeccakSponge.c +0 -111
  51. data/ext/xkcp/lib/high/Keccak/KeccakSponge.h +0 -76
  52. data/ext/xkcp/lib/high/Keccak/KeccakSponge.inc +0 -314
  53. data/ext/xkcp/lib/high/Keccak/PRG/KeccakPRG.c +0 -61
  54. data/ext/xkcp/lib/high/Keccak/PRG/KeccakPRG.h +0 -67
  55. data/ext/xkcp/lib/high/Keccak/PRG/KeccakPRG.inc +0 -128
  56. data/ext/xkcp/lib/high/Keccak/SP800-185/SP800-185.c +0 -93
  57. data/ext/xkcp/lib/high/Keccak/SP800-185/SP800-185.h +0 -599
  58. data/ext/xkcp/lib/high/Keccak/SP800-185/SP800-185.inc +0 -573
  59. data/ext/xkcp/lib/high/Ketje/Ketjev2.c +0 -87
  60. data/ext/xkcp/lib/high/Ketje/Ketjev2.h +0 -88
  61. data/ext/xkcp/lib/high/Ketje/Ketjev2.inc +0 -274
  62. data/ext/xkcp/lib/high/Keyak/Keyakv2.c +0 -132
  63. data/ext/xkcp/lib/high/Keyak/Keyakv2.h +0 -217
  64. data/ext/xkcp/lib/high/Keyak/Keyakv2.inc +0 -81
  65. data/ext/xkcp/lib/high/Keyak/Motorist.inc +0 -953
  66. data/ext/xkcp/lib/high/Kravatte/Kravatte.c +0 -533
  67. data/ext/xkcp/lib/high/Kravatte/Kravatte.h +0 -115
  68. data/ext/xkcp/lib/high/Kravatte/KravatteModes.c +0 -557
  69. data/ext/xkcp/lib/high/Kravatte/KravatteModes.h +0 -247
  70. data/ext/xkcp/lib/high/Xoodyak/Cyclist.h +0 -66
  71. data/ext/xkcp/lib/high/Xoodyak/Cyclist.inc +0 -336
  72. data/ext/xkcp/lib/high/Xoodyak/Xoodyak-parameters.h +0 -26
  73. data/ext/xkcp/lib/high/Xoodyak/Xoodyak.c +0 -55
  74. data/ext/xkcp/lib/high/Xoodyak/Xoodyak.h +0 -35
  75. data/ext/xkcp/lib/high/Xoofff/Xoofff.c +0 -634
  76. data/ext/xkcp/lib/high/Xoofff/Xoofff.h +0 -147
  77. data/ext/xkcp/lib/high/Xoofff/XoofffModes.c +0 -483
  78. data/ext/xkcp/lib/high/Xoofff/XoofffModes.h +0 -241
  79. data/ext/xkcp/lib/high/common/Phases.h +0 -25
  80. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-SnP.h +0 -41
  81. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-inplace-32bi-armv6m-le-armcc.s +0 -1666
  82. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-inplace-32bi-armv6m-le-gcc.s +0 -1655
  83. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-inplace-32bi-armv7a-le-armcc.s +0 -1268
  84. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-inplace-32bi-armv7a-le-gcc.s +0 -1264
  85. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-inplace-32bi-armv7m-le-armcc.s +0 -1178
  86. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-inplace-32bi-armv7m-le-gcc.s +0 -1175
  87. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-u1-32bi-armv6m-le-armcc.s +0 -1338
  88. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-u1-32bi-armv6m-le-gcc.s +0 -1336
  89. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-u2-32bi-armv6m-le-armcc.s +0 -1343
  90. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-u2-32bi-armv6m-le-gcc.s +0 -1339
  91. data/ext/xkcp/lib/low/KeccakP-1600/ARMv7A-NEON/KeccakP-1600-SnP.h +0 -42
  92. data/ext/xkcp/lib/low/KeccakP-1600/ARMv7A-NEON/KeccakP-1600-armv7a-le-neon-armcc.s +0 -823
  93. data/ext/xkcp/lib/low/KeccakP-1600/ARMv7A-NEON/KeccakP-1600-armv7a-le-neon-gcc.s +0 -831
  94. data/ext/xkcp/lib/low/KeccakP-1600/ARMv8A/KeccakP-1600-SnP.h +0 -31
  95. data/ext/xkcp/lib/low/KeccakP-1600/ARMv8A/KeccakP-1600-armv8a-neon.s +0 -540
  96. data/ext/xkcp/lib/low/KeccakP-1600/AVR8/KeccakP-1600-SnP.h +0 -42
  97. data/ext/xkcp/lib/low/KeccakP-1600/AVR8/KeccakP-1600-avr8-compact.s +0 -733
  98. data/ext/xkcp/lib/low/KeccakP-1600/AVR8/KeccakP-1600-avr8-fast.s +0 -1121
  99. data/ext/xkcp/lib/low/KeccakP-1600/AVX2/KeccakP-1600-SnP.h +0 -52
  100. data/ext/xkcp/lib/low/KeccakP-1600/AVX512/C/KeccakP-1600-AVX512.c +0 -623
  101. data/ext/xkcp/lib/low/KeccakP-1600/AVX512/C/KeccakP-1600-SnP.h +0 -47
  102. data/ext/xkcp/lib/low/KeccakP-1600/AVX512/C/u12/KeccakP-1600-AVX512-config.h +0 -6
  103. data/ext/xkcp/lib/low/KeccakP-1600/AVX512/C/u6/KeccakP-1600-AVX512-config.h +0 -6
  104. data/ext/xkcp/lib/low/KeccakP-1600/AVX512/C/ua/KeccakP-1600-AVX512-config.h +0 -6
  105. data/ext/xkcp/lib/low/KeccakP-1600/AVX512/KeccakP-1600-AVX512.s +0 -1031
  106. data/ext/xkcp/lib/low/KeccakP-1600/AVX512/KeccakP-1600-SnP.h +0 -53
  107. data/ext/xkcp/lib/low/KeccakP-1600/XOP/KeccakP-1600-SnP.h +0 -44
  108. data/ext/xkcp/lib/low/KeccakP-1600/XOP/KeccakP-1600-XOP.c +0 -476
  109. data/ext/xkcp/lib/low/KeccakP-1600/XOP/u6/KeccakP-1600-XOP-config.h +0 -6
  110. data/ext/xkcp/lib/low/KeccakP-1600/XOP/ua/KeccakP-1600-XOP-config.h +0 -6
  111. data/ext/xkcp/lib/low/KeccakP-1600/common/KeccakP-1600-unrolling.macros +0 -305
  112. data/ext/xkcp/lib/low/KeccakP-1600/compact/KeccakP-1600-compact64.c +0 -420
  113. data/ext/xkcp/lib/low/KeccakP-1600/plain-32bits-inplace/KeccakP-1600-SnP.h +0 -43
  114. data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/KeccakP-1600-opt64.c +0 -565
  115. data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/lcu6/KeccakP-1600-opt64-config.h +0 -7
  116. data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/lcua/KeccakP-1600-opt64-config.h +0 -7
  117. data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/lcua-shld/KeccakP-1600-opt64-config.h +0 -8
  118. data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/u6/KeccakP-1600-opt64-config.h +0 -6
  119. data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/ua/KeccakP-1600-opt64-config.h +0 -6
  120. data/ext/xkcp/lib/low/KeccakP-1600/ref-32bits/KeccakP-1600-SnP.h +0 -44
  121. data/ext/xkcp/lib/low/KeccakP-1600/ref-32bits/KeccakP-1600-reference.h +0 -23
  122. data/ext/xkcp/lib/low/KeccakP-1600/ref-32bits/KeccakP-1600-reference32BI.c +0 -625
  123. data/ext/xkcp/lib/low/KeccakP-1600/ref-64bits/KeccakP-1600-SnP.h +0 -44
  124. data/ext/xkcp/lib/low/KeccakP-1600/ref-64bits/KeccakP-1600-reference.c +0 -440
  125. data/ext/xkcp/lib/low/KeccakP-1600/x86-64/KeccakP-1600-SnP.h +0 -42
  126. data/ext/xkcp/lib/low/KeccakP-1600/x86-64/KeccakP-1600-x86-64-gas.s +0 -1196
  127. data/ext/xkcp/lib/low/KeccakP-1600/x86-64/KeccakP-1600-x86-64-gas_Apple.s +0 -1124
  128. data/ext/xkcp/lib/low/KeccakP-1600/x86-64/KeccakP-1600-x86-64-shld-gas.s +0 -1196
  129. data/ext/xkcp/lib/low/KeccakP-1600-times2/ARMv7A-NEON/KeccakP-1600-inplace-pl2-armv7a-neon-le-armcc.s +0 -1392
  130. data/ext/xkcp/lib/low/KeccakP-1600-times2/ARMv7A-NEON/KeccakP-1600-inplace-pl2-armv7a-neon-le-gcc.s +0 -1394
  131. data/ext/xkcp/lib/low/KeccakP-1600-times2/ARMv7A-NEON/KeccakP-1600-times2-SnP.h +0 -42
  132. data/ext/xkcp/lib/low/KeccakP-1600-times2/AVX512/AVX512u12/SIMD512-2-config.h +0 -7
  133. data/ext/xkcp/lib/low/KeccakP-1600-times2/AVX512/AVX512u4/SIMD512-2-config.h +0 -7
  134. data/ext/xkcp/lib/low/KeccakP-1600-times2/AVX512/AVX512ufull/SIMD512-2-config.h +0 -7
  135. data/ext/xkcp/lib/low/KeccakP-1600-times2/AVX512/KeccakP-1600-times2-SIMD512.c +0 -850
  136. data/ext/xkcp/lib/low/KeccakP-1600-times2/AVX512/KeccakP-1600-times2-SnP.h +0 -51
  137. data/ext/xkcp/lib/low/KeccakP-1600-times2/SIMD128/KeccakP-1600-times2-SIMD128.c +0 -957
  138. data/ext/xkcp/lib/low/KeccakP-1600-times2/SIMD128/KeccakP-1600-times2-SnP.h +0 -49
  139. data/ext/xkcp/lib/low/KeccakP-1600-times2/SIMD128/SSSE3-u2/SIMD128-config.h +0 -8
  140. data/ext/xkcp/lib/low/KeccakP-1600-times2/SIMD128/SSSE3-ua/SIMD128-config.h +0 -8
  141. data/ext/xkcp/lib/low/KeccakP-1600-times2/SIMD128/XOP-u2/SIMD128-config.h +0 -9
  142. data/ext/xkcp/lib/low/KeccakP-1600-times2/SIMD128/XOP-ua/SIMD128-config.h +0 -9
  143. data/ext/xkcp/lib/low/KeccakP-1600-times2/fallback-on1/KeccakP-1600-times2-SnP.h +0 -45
  144. data/ext/xkcp/lib/low/KeccakP-1600-times2/fallback-on1/KeccakP-1600-times2-on1.c +0 -37
  145. data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX2/KeccakP-1600-times4-SIMD256.c +0 -1321
  146. data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX2/KeccakP-1600-times4-SnP.h +0 -55
  147. data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX2/u12/SIMD256-config.h +0 -7
  148. data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX2/u6/SIMD256-config.h +0 -7
  149. data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX2/ua/SIMD256-config.h +0 -7
  150. data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX512/AVX512u12/SIMD512-4-config.h +0 -7
  151. data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX512/AVX512u4/SIMD512-4-config.h +0 -7
  152. data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX512/AVX512ufull/SIMD512-4-config.h +0 -7
  153. data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX512/KeccakP-1600-times4-SIMD512.c +0 -881
  154. data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX512/KeccakP-1600-times4-SnP.h +0 -51
  155. data/ext/xkcp/lib/low/KeccakP-1600-times4/fallback-on1/KeccakP-1600-times4-SnP.h +0 -45
  156. data/ext/xkcp/lib/low/KeccakP-1600-times4/fallback-on1/KeccakP-1600-times4-on1.c +0 -37
  157. data/ext/xkcp/lib/low/KeccakP-1600-times4/fallback-on2/KeccakP-1600-times4-SnP.h +0 -45
  158. data/ext/xkcp/lib/low/KeccakP-1600-times4/fallback-on2/KeccakP-1600-times4-on2.c +0 -38
  159. data/ext/xkcp/lib/low/KeccakP-1600-times8/AVX512/KeccakP-1600-times8-SIMD512.c +0 -1615
  160. data/ext/xkcp/lib/low/KeccakP-1600-times8/AVX512/KeccakP-1600-times8-SnP.h +0 -57
  161. data/ext/xkcp/lib/low/KeccakP-1600-times8/AVX512/u12/SIMD512-config.h +0 -7
  162. data/ext/xkcp/lib/low/KeccakP-1600-times8/AVX512/u4/SIMD512-config.h +0 -7
  163. data/ext/xkcp/lib/low/KeccakP-1600-times8/AVX512/ua/SIMD512-config.h +0 -7
  164. data/ext/xkcp/lib/low/KeccakP-1600-times8/fallback-on1/KeccakP-1600-times8-SnP.h +0 -45
  165. data/ext/xkcp/lib/low/KeccakP-1600-times8/fallback-on1/KeccakP-1600-times8-on1.c +0 -37
  166. data/ext/xkcp/lib/low/KeccakP-1600-times8/fallback-on2/KeccakP-1600-times8-SnP.h +0 -45
  167. data/ext/xkcp/lib/low/KeccakP-1600-times8/fallback-on2/KeccakP-1600-times8-on2.c +0 -38
  168. data/ext/xkcp/lib/low/KeccakP-1600-times8/fallback-on4/KeccakP-1600-times8-SnP.h +0 -45
  169. data/ext/xkcp/lib/low/KeccakP-1600-times8/fallback-on4/KeccakP-1600-times8-on4.c +0 -38
  170. data/ext/xkcp/lib/low/KeccakP-200/ARM/KeccakP-200-SnP.h +0 -41
  171. data/ext/xkcp/lib/low/KeccakP-200/ARM/KeccakP-200-armv6m-le-armcc.s +0 -442
  172. data/ext/xkcp/lib/low/KeccakP-200/ARM/KeccakP-200-armv6m-le-gcc.s +0 -446
  173. data/ext/xkcp/lib/low/KeccakP-200/ARM/KeccakP-200-armv7m-le-armcc.s +0 -419
  174. data/ext/xkcp/lib/low/KeccakP-200/ARM/KeccakP-200-armv7m-le-gcc.s +0 -427
  175. data/ext/xkcp/lib/low/KeccakP-200/AVR8/KeccakP-200-SnP.h +0 -41
  176. data/ext/xkcp/lib/low/KeccakP-200/AVR8/KeccakP-200-avr8-fast.s +0 -647
  177. data/ext/xkcp/lib/low/KeccakP-200/compact/KeccakP-200-SnP.h +0 -39
  178. data/ext/xkcp/lib/low/KeccakP-200/compact/KeccakP-200-compact.c +0 -190
  179. data/ext/xkcp/lib/low/KeccakP-200/ref/KeccakP-200-SnP.h +0 -43
  180. data/ext/xkcp/lib/low/KeccakP-200/ref/KeccakP-200-reference.c +0 -412
  181. data/ext/xkcp/lib/low/KeccakP-200/ref/KeccakP-200-reference.h +0 -23
  182. data/ext/xkcp/lib/low/KeccakP-400/ARM/KeccakP-400-SnP.h +0 -41
  183. data/ext/xkcp/lib/low/KeccakP-400/ARM/KeccakP-400-armv6m-le-armcc.s +0 -454
  184. data/ext/xkcp/lib/low/KeccakP-400/ARM/KeccakP-400-armv6m-le-gcc.s +0 -458
  185. data/ext/xkcp/lib/low/KeccakP-400/ARM/KeccakP-400-armv7m-le-armcc.s +0 -455
  186. data/ext/xkcp/lib/low/KeccakP-400/ARM/KeccakP-400-armv7m-le-gcc.s +0 -458
  187. data/ext/xkcp/lib/low/KeccakP-400/AVR8/KeccakP-400-SnP.h +0 -41
  188. data/ext/xkcp/lib/low/KeccakP-400/AVR8/KeccakP-400-avr8-fast.s +0 -728
  189. data/ext/xkcp/lib/low/KeccakP-400/ref/KeccakP-400-SnP.h +0 -43
  190. data/ext/xkcp/lib/low/KeccakP-400/ref/KeccakP-400-reference.c +0 -414
  191. data/ext/xkcp/lib/low/KeccakP-400/ref/KeccakP-400-reference.h +0 -23
  192. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-SnP.h +0 -42
  193. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u1-armv6m-le-armcc.s +0 -527
  194. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u1-armv6m-le-gcc.s +0 -533
  195. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u2-armv6m-le-armcc.s +0 -528
  196. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u2-armv6m-le-gcc.s +0 -534
  197. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u2-armv7a-le-armcc.s +0 -521
  198. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u2-armv7a-le-gcc.s +0 -527
  199. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u2-armv7m-le-armcc.s +0 -517
  200. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u2-armv7m-le-gcc.s +0 -523
  201. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-uf-armv7m-le-armcc.s +0 -550
  202. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-uf-armv7m-le-gcc.s +0 -556
  203. data/ext/xkcp/lib/low/KeccakP-800/ARMv8A/KeccakP-800-SnP.h +0 -32
  204. data/ext/xkcp/lib/low/KeccakP-800/ARMv8A/KeccakP-800-armv8a-neon.s +0 -432
  205. data/ext/xkcp/lib/low/KeccakP-800/AVR8/KeccakP-800-SnP.h +0 -42
  206. data/ext/xkcp/lib/low/KeccakP-800/AVR8/KeccakP-800-avr8-fast.s +0 -929
  207. data/ext/xkcp/lib/low/KeccakP-800/compact/KeccakP-800-SnP.h +0 -40
  208. data/ext/xkcp/lib/low/KeccakP-800/compact/KeccakP-800-compact.c +0 -244
  209. data/ext/xkcp/lib/low/KeccakP-800/plain/KeccakP-800-SnP.h +0 -46
  210. data/ext/xkcp/lib/low/KeccakP-800/plain/KeccakP-800-opt32-bis.macros +0 -184
  211. data/ext/xkcp/lib/low/KeccakP-800/plain/KeccakP-800-opt32.c +0 -454
  212. data/ext/xkcp/lib/low/KeccakP-800/plain/KeccakP-800-opt32.macros +0 -459
  213. data/ext/xkcp/lib/low/KeccakP-800/plain/KeccakP-800-unrolling-bis.macros +0 -83
  214. data/ext/xkcp/lib/low/KeccakP-800/plain/KeccakP-800-unrolling.macros +0 -88
  215. data/ext/xkcp/lib/low/KeccakP-800/plain/lcu2/KeccakP-800-opt32-config.h +0 -7
  216. data/ext/xkcp/lib/low/KeccakP-800/plain/lcua/KeccakP-800-opt32-config.h +0 -7
  217. data/ext/xkcp/lib/low/KeccakP-800/plain/u2/KeccakP-800-opt32-config.h +0 -7
  218. data/ext/xkcp/lib/low/KeccakP-800/plain/ua/KeccakP-800-opt32-config.h +0 -7
  219. data/ext/xkcp/lib/low/KeccakP-800/ref/KeccakP-800-SnP.h +0 -44
  220. data/ext/xkcp/lib/low/KeccakP-800/ref/KeccakP-800-reference.c +0 -437
  221. data/ext/xkcp/lib/low/KeccakP-800/ref/KeccakP-800-reference.h +0 -23
  222. data/ext/xkcp/lib/low/Ketje/OptimizedAsmARM/Ket.h +0 -57
  223. data/ext/xkcp/lib/low/Ketje/OptimizedAsmARM/KetjeJr-armv7m-le-armcc.s +0 -475
  224. data/ext/xkcp/lib/low/Ketje/OptimizedAsmARM/KetjeJr-armv7m-le-gcc.s +0 -480
  225. data/ext/xkcp/lib/low/Ketje/OptimizedAsmARM/KetjeSr-armv7m-le-armcc.s +0 -590
  226. data/ext/xkcp/lib/low/Ketje/OptimizedAsmARM/KetjeSr-armv7m-le-gcc.s +0 -590
  227. data/ext/xkcp/lib/low/Ketje/OptimizedLE/Ket.c +0 -126
  228. data/ext/xkcp/lib/low/Ketje/OptimizedLE/Ket.h +0 -68
  229. data/ext/xkcp/lib/low/Ketje/OptimizedLE/Ket.inc +0 -174
  230. data/ext/xkcp/lib/low/Ketje/SnP-compliant/Ket.c +0 -80
  231. data/ext/xkcp/lib/low/Ketje/SnP-compliant/Ket.h +0 -68
  232. data/ext/xkcp/lib/low/Ketje/SnP-compliant/Ket.inc +0 -142
  233. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-SnP.h +0 -55
  234. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-u1-armv6m-le-armcc.s +0 -1086
  235. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-u1-armv6m-le-gcc.s +0 -1092
  236. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-uf-armv6-le-armcc.s +0 -721
  237. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-uf-armv6-le-gcc.s +0 -726
  238. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-uf-armv7m-le-armcc.s +0 -723
  239. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-uf-armv7m-le-gcc.s +0 -729
  240. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodyak-u1-armv6m-le-armcc.s +0 -1164
  241. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodyak-u1-armv6m-le-gcc.s +0 -1165
  242. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodyak-uf-armv6-le-armcc.s +0 -562
  243. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodyak-uf-armv6-le-gcc.s +0 -563
  244. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodyak-uf-armv7m-le-armcc.s +0 -563
  245. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodyak-uf-armv7m-le-gcc.s +0 -565
  246. data/ext/xkcp/lib/low/Xoodoo/ARMv7A-NEON/Xoodoo-SnP.h +0 -55
  247. data/ext/xkcp/lib/low/Xoodoo/ARMv7A-NEON/Xoodoo-uf-armv7a-neon-le-armcc.s +0 -476
  248. data/ext/xkcp/lib/low/Xoodoo/ARMv7A-NEON/Xoodoo-uf-armv7a-neon-le-gcc.s +0 -485
  249. data/ext/xkcp/lib/low/Xoodoo/ARMv7A-NEON/Xoodyak-uf-armv7a-neon-le-armcc.s +0 -362
  250. data/ext/xkcp/lib/low/Xoodoo/ARMv7A-NEON/Xoodyak-uf-armv7a-neon-le-gcc.s +0 -367
  251. data/ext/xkcp/lib/low/Xoodoo/AVR8/Xoodoo-SnP.h +0 -43
  252. data/ext/xkcp/lib/low/Xoodoo/AVR8/Xoodoo-avr8-u1.s +0 -1341
  253. data/ext/xkcp/lib/low/Xoodoo/AVX512/Xoodoo-SIMD512.c +0 -581
  254. data/ext/xkcp/lib/low/Xoodoo/AVX512/Xoodoo-SnP.h +0 -58
  255. data/ext/xkcp/lib/low/Xoodoo/AVX512/Xoodyak-full-block-SIMD512.c +0 -332
  256. data/ext/xkcp/lib/low/Xoodoo/SSE2/Xoodoo-SIMD128.c +0 -329
  257. data/ext/xkcp/lib/low/Xoodoo/SSE2/Xoodoo-SnP.h +0 -53
  258. data/ext/xkcp/lib/low/Xoodoo/SSE2/Xoodyak-full-block-SIMD128.c +0 -355
  259. data/ext/xkcp/lib/low/Xoodoo/Xoodoo.h +0 -79
  260. data/ext/xkcp/lib/low/Xoodoo/plain/Xoodoo-SnP.h +0 -56
  261. data/ext/xkcp/lib/low/Xoodoo/plain/Xoodoo-optimized.c +0 -399
  262. data/ext/xkcp/lib/low/Xoodoo/plain/Xoodyak-full-blocks.c +0 -127
  263. data/ext/xkcp/lib/low/Xoodoo/ref/Xoodoo-SnP.h +0 -43
  264. data/ext/xkcp/lib/low/Xoodoo/ref/Xoodoo-reference.c +0 -253
  265. data/ext/xkcp/lib/low/Xoodoo-times16/AVX512/Xoodoo-times16-SIMD512.c +0 -1044
  266. data/ext/xkcp/lib/low/Xoodoo-times16/AVX512/Xoodoo-times16-SnP.h +0 -49
  267. data/ext/xkcp/lib/low/Xoodoo-times16/fallback-on1/Xoodoo-times16-SnP.h +0 -45
  268. data/ext/xkcp/lib/low/Xoodoo-times16/fallback-on1/Xoodoo-times16-on1.c +0 -37
  269. data/ext/xkcp/lib/low/Xoodoo-times4/ARMv7A-NEON/Xoodoo-times4-ARMv7A.s +0 -1587
  270. data/ext/xkcp/lib/low/Xoodoo-times4/ARMv7A-NEON/Xoodoo-times4-SnP.h +0 -48
  271. data/ext/xkcp/lib/low/Xoodoo-times4/AVX512/Xoodoo-times4-SIMD512.c +0 -1202
  272. data/ext/xkcp/lib/low/Xoodoo-times4/AVX512/Xoodoo-times4-SnP.h +0 -48
  273. data/ext/xkcp/lib/low/Xoodoo-times4/SSSE3/Xoodoo-times4-SIMD128.c +0 -484
  274. data/ext/xkcp/lib/low/Xoodoo-times4/SSSE3/Xoodoo-times4-SnP.h +0 -44
  275. data/ext/xkcp/lib/low/Xoodoo-times4/fallback-on1/Xoodoo-times4-SnP.h +0 -45
  276. data/ext/xkcp/lib/low/Xoodoo-times4/fallback-on1/Xoodoo-times4-on1.c +0 -37
  277. data/ext/xkcp/lib/low/Xoodoo-times8/AVX2/Xoodoo-times8-SIMD256.c +0 -939
  278. data/ext/xkcp/lib/low/Xoodoo-times8/AVX2/Xoodoo-times8-SnP.h +0 -49
  279. data/ext/xkcp/lib/low/Xoodoo-times8/AVX512/Xoodoo-times8-SIMD512.c +0 -1216
  280. data/ext/xkcp/lib/low/Xoodoo-times8/AVX512/Xoodoo-times8-SnP.h +0 -48
  281. data/ext/xkcp/lib/low/Xoodoo-times8/fallback-on1/Xoodoo-times8-SnP.h +0 -45
  282. data/ext/xkcp/lib/low/Xoodoo-times8/fallback-on1/Xoodoo-times8-on1.c +0 -37
  283. data/ext/xkcp/lib/low/common/PlSnP-Fallback.inc +0 -290
  284. data/ext/xkcp/lib/low/common/SnP-Relaned.h +0 -141
  285. data/ext/xkcp/support/Kernel-PMU/Kernel-pmu.md +0 -133
  286. data/ext/xkcp/support/Kernel-PMU/Makefile +0 -8
  287. data/ext/xkcp/support/Kernel-PMU/enable_arm_pmu.c +0 -129
  288. data/ext/xkcp/support/Kernel-PMU/load-module +0 -1
  289. data/ext/xkcp/util/KeccakSum/KeccakSum.c +0 -394
  290. data/ext/xkcp/util/KeccakSum/base64.c +0 -86
  291. data/ext/xkcp/util/KeccakSum/base64.h +0 -12
@@ -1,1321 +0,0 @@
1
- /*
2
- The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche.
3
-
4
- Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer".
5
-
6
- For more information, feedback or questions, please refer to the Keccak Team website:
7
- https://keccak.team/
8
-
9
- To the extent possible under law, the implementer has waived all copyright
10
- and related or neighboring rights to the source code in this file.
11
- http://creativecommons.org/publicdomain/zero/1.0/
12
-
13
- ---
14
-
15
- This file implements Keccak-p[1600]×4 in a PlSnP-compatible way.
16
- Please refer to PlSnP-documentation.h for more details.
17
-
18
- This implementation comes with KeccakP-1600-times4-SnP.h in the same folder.
19
- Please refer to LowLevel.build for the exact list of other files it must be combined with.
20
- */
21
-
22
- #include <stdint.h>
23
- #include <stdio.h>
24
- #include <stdlib.h>
25
- #include <string.h>
26
- #include <smmintrin.h>
27
- #include <wmmintrin.h>
28
- #include <immintrin.h>
29
- #include <emmintrin.h>
30
- #include "align.h"
31
- #include "KeccakP-1600-times4-SnP.h"
32
- #include "SIMD256-config.h"
33
-
34
- #include "brg_endian.h"
35
- #if (PLATFORM_BYTE_ORDER != IS_LITTLE_ENDIAN)
36
- #error Expecting a little-endian platform
37
- #endif
38
-
39
- typedef __m128i V128;
40
- typedef __m256i V256;
41
-
42
- //#define UseGatherScatter
43
-
44
- #define laneIndex(instanceIndex, lanePosition) ((lanePosition)*4 + instanceIndex)
45
-
46
- #if defined(KeccakP1600times4_useAVX2)
47
- #define ANDnu256(a, b) _mm256_andnot_si256(a, b)
48
- #define CONST256(a) _mm256_load_si256((const V256 *)&(a))
49
- #define CONST256_64(a) _mm256_set1_epi64x(a)
50
- #define LOAD256(a) _mm256_load_si256((const V256 *)&(a))
51
- #define LOAD256u(a) _mm256_loadu_si256((const V256 *)&(a))
52
- #define LOAD4_64(a, b, c, d) _mm256_set_epi64x((uint64_t)(a), (uint64_t)(b), (uint64_t)(c), (uint64_t)(d))
53
- #define ROL64in256(d, a, o) d = _mm256_or_si256(_mm256_slli_epi64(a, o), _mm256_srli_epi64(a, 64-(o)))
54
- #define ROL64in256_8(d, a) d = _mm256_shuffle_epi8(a, CONST256(rho8))
55
- #define ROL64in256_56(d, a) d = _mm256_shuffle_epi8(a, CONST256(rho56))
56
- static const uint64_t rho8[4] = {0x0605040302010007, 0x0E0D0C0B0A09080F, 0x1615141312111017, 0x1E1D1C1B1A19181F};
57
- static const uint64_t rho56[4] = {0x0007060504030201, 0x080F0E0D0C0B0A09, 0x1017161514131211, 0x181F1E1D1C1B1A19};
58
- #define STORE256(a, b) _mm256_store_si256((V256 *)&(a), b)
59
- #define STORE256u(a, b) _mm256_storeu_si256((V256 *)&(a), b)
60
- #define STORE2_128(ah, al, v) _mm256_storeu2_m128i(&(ah), &(al), v)
61
- #define XOR256(a, b) _mm256_xor_si256(a, b)
62
- #define XOReq256(a, b) a = _mm256_xor_si256(a, b)
63
- #define UNPACKL( a, b ) _mm256_unpacklo_epi64((a), (b))
64
- #define UNPACKH( a, b ) _mm256_unpackhi_epi64((a), (b))
65
- #define PERM128( a, b, c ) _mm256_permute2f128_si256((a), (b), c)
66
- #define SHUFFLE64( a, b, c ) _mm256_castpd_si256(_mm256_shuffle_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b), c))
67
-
68
- #define UNINTLEAVE() lanesL01 = UNPACKL( lanes0, lanes1 ), \
69
- lanesH01 = UNPACKH( lanes0, lanes1 ), \
70
- lanesL23 = UNPACKL( lanes2, lanes3 ), \
71
- lanesH23 = UNPACKH( lanes2, lanes3 ), \
72
- lanes0 = PERM128( lanesL01, lanesL23, 0x20 ), \
73
- lanes2 = PERM128( lanesL01, lanesL23, 0x31 ), \
74
- lanes1 = PERM128( lanesH01, lanesH23, 0x20 ), \
75
- lanes3 = PERM128( lanesH01, lanesH23, 0x31 )
76
-
77
- #define INTLEAVE() lanesL01 = PERM128( lanes0, lanes2, 0x20 ), \
78
- lanesH01 = PERM128( lanes1, lanes3, 0x20 ), \
79
- lanesL23 = PERM128( lanes0, lanes2, 0x31 ), \
80
- lanesH23 = PERM128( lanes1, lanes3, 0x31 ), \
81
- lanes0 = SHUFFLE64( lanesL01, lanesH01, 0x00 ), \
82
- lanes1 = SHUFFLE64( lanesL01, lanesH01, 0x0F ), \
83
- lanes2 = SHUFFLE64( lanesL23, lanesH23, 0x00 ), \
84
- lanes3 = SHUFFLE64( lanesL23, lanesH23, 0x0F )
85
-
86
- #endif
87
-
88
- #define SnP_laneLengthInBytes 8
89
-
90
- void KeccakP1600times4_InitializeAll(void *states)
91
- {
92
- memset(states, 0, KeccakP1600times4_statesSizeInBytes);
93
- }
94
-
95
- void KeccakP1600times4_AddBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length)
96
- {
97
- unsigned int sizeLeft = length;
98
- unsigned int lanePosition = offset/SnP_laneLengthInBytes;
99
- unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
100
- const unsigned char *curData = data;
101
- uint64_t *statesAsLanes = (uint64_t *)states;
102
-
103
- if ((sizeLeft > 0) && (offsetInLane != 0)) {
104
- unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
105
- uint64_t lane = 0;
106
- if (bytesInLane > sizeLeft)
107
- bytesInLane = sizeLeft;
108
- memcpy((unsigned char*)&lane + offsetInLane, curData, bytesInLane);
109
- statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane;
110
- sizeLeft -= bytesInLane;
111
- lanePosition++;
112
- curData += bytesInLane;
113
- }
114
-
115
- while(sizeLeft >= SnP_laneLengthInBytes) {
116
- uint64_t lane = *((const uint64_t*)curData);
117
- statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane;
118
- sizeLeft -= SnP_laneLengthInBytes;
119
- lanePosition++;
120
- curData += SnP_laneLengthInBytes;
121
- }
122
-
123
- if (sizeLeft > 0) {
124
- uint64_t lane = 0;
125
- memcpy(&lane, curData, sizeLeft);
126
- statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane;
127
- }
128
- }
129
-
130
- void KeccakP1600times4_AddLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
131
- {
132
- V256 *stateAsLanes = (V256 *)states;
133
- unsigned int i;
134
- const uint64_t *curData0 = (const uint64_t *)data;
135
- const uint64_t *curData1 = (const uint64_t *)(data+laneOffset*SnP_laneLengthInBytes);
136
- const uint64_t *curData2 = (const uint64_t *)(data+laneOffset*2*SnP_laneLengthInBytes);
137
- const uint64_t *curData3 = (const uint64_t *)(data+laneOffset*3*SnP_laneLengthInBytes);
138
- V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;
139
-
140
- #define Xor_In( argIndex ) XOReq256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))
141
-
142
- #define Xor_In4( argIndex ) lanes0 = LOAD256u( curData0[argIndex]),\
143
- lanes1 = LOAD256u( curData1[argIndex]),\
144
- lanes2 = LOAD256u( curData2[argIndex]),\
145
- lanes3 = LOAD256u( curData3[argIndex]),\
146
- INTLEAVE(),\
147
- XOReq256( stateAsLanes[argIndex+0], lanes0 ),\
148
- XOReq256( stateAsLanes[argIndex+1], lanes1 ),\
149
- XOReq256( stateAsLanes[argIndex+2], lanes2 ),\
150
- XOReq256( stateAsLanes[argIndex+3], lanes3 )
151
-
152
- if ( laneCount >= 16 ) {
153
- Xor_In4( 0 );
154
- Xor_In4( 4 );
155
- Xor_In4( 8 );
156
- Xor_In4( 12 );
157
- if ( laneCount >= 20 ) {
158
- Xor_In4( 16 );
159
- for(i=20; i<laneCount; i++)
160
- Xor_In( i );
161
- }
162
- else {
163
- for(i=16; i<laneCount; i++)
164
- Xor_In( i );
165
- }
166
- }
167
- else {
168
- for(i=0; i<laneCount; i++)
169
- Xor_In( i );
170
- }
171
- #undef Xor_In
172
- #undef Xor_In4
173
- }
174
-
175
- void KeccakP1600times4_OverwriteBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length)
176
- {
177
- unsigned int sizeLeft = length;
178
- unsigned int lanePosition = offset/SnP_laneLengthInBytes;
179
- unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
180
- const unsigned char *curData = data;
181
- uint64_t *statesAsLanes = (uint64_t *)states;
182
-
183
- if ((sizeLeft > 0) && (offsetInLane != 0)) {
184
- unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
185
- if (bytesInLane > sizeLeft)
186
- bytesInLane = sizeLeft;
187
- memcpy( ((unsigned char *)&statesAsLanes[laneIndex(instanceIndex, lanePosition)]) + offsetInLane, curData, bytesInLane);
188
- sizeLeft -= bytesInLane;
189
- lanePosition++;
190
- curData += bytesInLane;
191
- }
192
-
193
- while(sizeLeft >= SnP_laneLengthInBytes) {
194
- uint64_t lane = *((const uint64_t*)curData);
195
- statesAsLanes[laneIndex(instanceIndex, lanePosition)] = lane;
196
- sizeLeft -= SnP_laneLengthInBytes;
197
- lanePosition++;
198
- curData += SnP_laneLengthInBytes;
199
- }
200
-
201
- if (sizeLeft > 0) {
202
- memcpy(&statesAsLanes[laneIndex(instanceIndex, lanePosition)], curData, sizeLeft);
203
- }
204
- }
205
-
206
- void KeccakP1600times4_OverwriteLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
207
- {
208
- V256 *stateAsLanes = (V256 *)states;
209
- unsigned int i;
210
- const uint64_t *curData0 = (const uint64_t *)data;
211
- const uint64_t *curData1 = (const uint64_t *)(data+laneOffset*SnP_laneLengthInBytes);
212
- const uint64_t *curData2 = (const uint64_t *)(data+laneOffset*2*SnP_laneLengthInBytes);
213
- const uint64_t *curData3 = (const uint64_t *)(data+laneOffset*3*SnP_laneLengthInBytes);
214
- V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;
215
-
216
- #define OverWr( argIndex ) STORE256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))
217
-
218
- #define OverWr4( argIndex ) lanes0 = LOAD256u( curData0[argIndex]),\
219
- lanes1 = LOAD256u( curData1[argIndex]),\
220
- lanes2 = LOAD256u( curData2[argIndex]),\
221
- lanes3 = LOAD256u( curData3[argIndex]),\
222
- INTLEAVE(),\
223
- STORE256( stateAsLanes[argIndex+0], lanes0 ),\
224
- STORE256( stateAsLanes[argIndex+1], lanes1 ),\
225
- STORE256( stateAsLanes[argIndex+2], lanes2 ),\
226
- STORE256( stateAsLanes[argIndex+3], lanes3 )
227
-
228
- if ( laneCount >= 16 ) {
229
- OverWr4( 0 );
230
- OverWr4( 4 );
231
- OverWr4( 8 );
232
- OverWr4( 12 );
233
- if ( laneCount >= 20 ) {
234
- OverWr4( 16 );
235
- for(i=20; i<laneCount; i++)
236
- OverWr( i );
237
- }
238
- else {
239
- for(i=16; i<laneCount; i++)
240
- OverWr( i );
241
- }
242
- }
243
- else {
244
- for(i=0; i<laneCount; i++)
245
- OverWr( i );
246
- }
247
- #undef OverWr
248
- #undef OverWr4
249
- }
250
-
251
- void KeccakP1600times4_OverwriteWithZeroes(void *states, unsigned int instanceIndex, unsigned int byteCount)
252
- {
253
- unsigned int sizeLeft = byteCount;
254
- unsigned int lanePosition = 0;
255
- uint64_t *statesAsLanes = (uint64_t *)states;
256
-
257
- while(sizeLeft >= SnP_laneLengthInBytes) {
258
- statesAsLanes[laneIndex(instanceIndex, lanePosition)] = 0;
259
- sizeLeft -= SnP_laneLengthInBytes;
260
- lanePosition++;
261
- }
262
-
263
- if (sizeLeft > 0) {
264
- memset(&statesAsLanes[laneIndex(instanceIndex, lanePosition)], 0, sizeLeft);
265
- }
266
- }
267
-
268
- void KeccakP1600times4_ExtractBytes(const void *states, unsigned int instanceIndex, unsigned char *data, unsigned int offset, unsigned int length)
269
- {
270
- unsigned int sizeLeft = length;
271
- unsigned int lanePosition = offset/SnP_laneLengthInBytes;
272
- unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
273
- unsigned char *curData = data;
274
- const uint64_t *statesAsLanes = (const uint64_t *)states;
275
-
276
- if ((sizeLeft > 0) && (offsetInLane != 0)) {
277
- unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
278
- if (bytesInLane > sizeLeft)
279
- bytesInLane = sizeLeft;
280
- memcpy( curData, ((unsigned char *)&statesAsLanes[laneIndex(instanceIndex, lanePosition)]) + offsetInLane, bytesInLane);
281
- sizeLeft -= bytesInLane;
282
- lanePosition++;
283
- curData += bytesInLane;
284
- }
285
-
286
- while(sizeLeft >= SnP_laneLengthInBytes) {
287
- *(uint64_t*)curData = statesAsLanes[laneIndex(instanceIndex, lanePosition)];
288
- sizeLeft -= SnP_laneLengthInBytes;
289
- lanePosition++;
290
- curData += SnP_laneLengthInBytes;
291
- }
292
-
293
- if (sizeLeft > 0) {
294
- memcpy( curData, &statesAsLanes[laneIndex(instanceIndex, lanePosition)], sizeLeft);
295
- }
296
- }
297
-
298
- void KeccakP1600times4_ExtractLanesAll(const void *states, unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
299
- {
300
- uint64_t *curData0 = (uint64_t *)data;
301
- uint64_t *curData1 = (uint64_t *)(data+laneOffset*1*SnP_laneLengthInBytes);
302
- uint64_t *curData2 = (uint64_t *)(data+laneOffset*2*SnP_laneLengthInBytes);
303
- uint64_t *curData3 = (uint64_t *)(data+laneOffset*3*SnP_laneLengthInBytes);
304
-
305
- const V256 *stateAsLanes = (const V256 *)states;
306
- const uint64_t *stateAsLanes64 = (const uint64_t*)states;
307
- V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;
308
- unsigned int i;
309
-
310
- #define Extr( argIndex ) curData0[argIndex] = stateAsLanes64[4*(argIndex)], \
311
- curData1[argIndex] = stateAsLanes64[4*(argIndex)+1], \
312
- curData2[argIndex] = stateAsLanes64[4*(argIndex)+2], \
313
- curData3[argIndex] = stateAsLanes64[4*(argIndex)+3]
314
-
315
- #define Extr4( argIndex ) lanes0 = LOAD256( stateAsLanes[argIndex+0] ), \
316
- lanes1 = LOAD256( stateAsLanes[argIndex+1] ), \
317
- lanes2 = LOAD256( stateAsLanes[argIndex+2] ), \
318
- lanes3 = LOAD256( stateAsLanes[argIndex+3] ), \
319
- UNINTLEAVE(), \
320
- STORE256u( curData0[argIndex], lanes0 ), \
321
- STORE256u( curData1[argIndex], lanes1 ), \
322
- STORE256u( curData2[argIndex], lanes2 ), \
323
- STORE256u( curData3[argIndex], lanes3 )
324
-
325
- if ( laneCount >= 16 ) {
326
- Extr4( 0 );
327
- Extr4( 4 );
328
- Extr4( 8 );
329
- Extr4( 12 );
330
- if ( laneCount >= 20 ) {
331
- Extr4( 16 );
332
- for(i=20; i<laneCount; i++)
333
- Extr( i );
334
- }
335
- else {
336
- for(i=16; i<laneCount; i++)
337
- Extr( i );
338
- }
339
- }
340
- else {
341
- for(i=0; i<laneCount; i++)
342
- Extr( i );
343
- }
344
- #undef Extr
345
- #undef Extr4
346
- }
347
-
348
- void KeccakP1600times4_ExtractAndAddBytes(const void *states, unsigned int instanceIndex, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length)
349
- {
350
- unsigned int sizeLeft = length;
351
- unsigned int lanePosition = offset/SnP_laneLengthInBytes;
352
- unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
353
- const unsigned char *curInput = input;
354
- unsigned char *curOutput = output;
355
- const uint64_t *statesAsLanes = (const uint64_t *)states;
356
-
357
- if ((sizeLeft > 0) && (offsetInLane != 0)) {
358
- unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
359
- uint64_t lane = statesAsLanes[laneIndex(instanceIndex, lanePosition)] >> (8 * offsetInLane);
360
- if (bytesInLane > sizeLeft)
361
- bytesInLane = sizeLeft;
362
- sizeLeft -= bytesInLane;
363
- do {
364
- *(curOutput++) = *(curInput++) ^ (unsigned char)lane;
365
- lane >>= 8;
366
- } while ( --bytesInLane != 0);
367
- lanePosition++;
368
- }
369
-
370
- while(sizeLeft >= SnP_laneLengthInBytes) {
371
- *((uint64_t*)curOutput) = *((uint64_t*)curInput) ^ statesAsLanes[laneIndex(instanceIndex, lanePosition)];
372
- sizeLeft -= SnP_laneLengthInBytes;
373
- lanePosition++;
374
- curInput += SnP_laneLengthInBytes;
375
- curOutput += SnP_laneLengthInBytes;
376
- }
377
-
378
- if (sizeLeft != 0) {
379
- uint64_t lane = statesAsLanes[laneIndex(instanceIndex, lanePosition)];
380
- do {
381
- *(curOutput++) = *(curInput++) ^ (unsigned char)lane;
382
- lane >>= 8;
383
- } while ( --sizeLeft != 0);
384
- }
385
- }
386
-
387
- void KeccakP1600times4_ExtractAndAddLanesAll(const void *states, const unsigned char *input, unsigned char *output, unsigned int laneCount, unsigned int laneOffset)
388
- {
389
- const uint64_t *curInput0 = (uint64_t *)input;
390
- const uint64_t *curInput1 = (uint64_t *)(input+laneOffset*1*SnP_laneLengthInBytes);
391
- const uint64_t *curInput2 = (uint64_t *)(input+laneOffset*2*SnP_laneLengthInBytes);
392
- const uint64_t *curInput3 = (uint64_t *)(input+laneOffset*3*SnP_laneLengthInBytes);
393
- uint64_t *curOutput0 = (uint64_t *)output;
394
- uint64_t *curOutput1 = (uint64_t *)(output+laneOffset*1*SnP_laneLengthInBytes);
395
- uint64_t *curOutput2 = (uint64_t *)(output+laneOffset*2*SnP_laneLengthInBytes);
396
- uint64_t *curOutput3 = (uint64_t *)(output+laneOffset*3*SnP_laneLengthInBytes);
397
-
398
- const V256 *stateAsLanes = (const V256 *)states;
399
- const uint64_t *stateAsLanes64 = (const uint64_t*)states;
400
- V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;
401
- unsigned int i;
402
-
403
- #define ExtrXor( argIndex ) \
404
- curOutput0[argIndex] = curInput0[argIndex] ^ stateAsLanes64[4*(argIndex)],\
405
- curOutput1[argIndex] = curInput1[argIndex] ^ stateAsLanes64[4*(argIndex)+1],\
406
- curOutput2[argIndex] = curInput2[argIndex] ^ stateAsLanes64[4*(argIndex)+2],\
407
- curOutput3[argIndex] = curInput3[argIndex] ^ stateAsLanes64[4*(argIndex)+3]
408
-
409
- #define ExtrXor4( argIndex ) \
410
- lanes0 = LOAD256( stateAsLanes[argIndex+0] ),\
411
- lanes1 = LOAD256( stateAsLanes[argIndex+1] ),\
412
- lanes2 = LOAD256( stateAsLanes[argIndex+2] ),\
413
- lanes3 = LOAD256( stateAsLanes[argIndex+3] ),\
414
- UNINTLEAVE(),\
415
- lanesL01 = LOAD256u( curInput0[argIndex]),\
416
- lanesH01 = LOAD256u( curInput1[argIndex]),\
417
- lanesL23 = LOAD256u( curInput2[argIndex]),\
418
- lanesH23 = LOAD256u( curInput3[argIndex]),\
419
- XOReq256( lanes0, lanesL01 ),\
420
- XOReq256( lanes1, lanesH01 ),\
421
- XOReq256( lanes2, lanesL23 ),\
422
- XOReq256( lanes3, lanesH23 ),\
423
- STORE256u( curOutput0[argIndex], lanes0 ),\
424
- STORE256u( curOutput1[argIndex], lanes1 ),\
425
- STORE256u( curOutput2[argIndex], lanes2 ),\
426
- STORE256u( curOutput3[argIndex], lanes3 )
427
-
428
- if ( laneCount >= 16 ) {
429
- ExtrXor4( 0 );
430
- ExtrXor4( 4 );
431
- ExtrXor4( 8 );
432
- ExtrXor4( 12 );
433
- if ( laneCount >= 20 ) {
434
- ExtrXor4( 16 );
435
- for(i=20; i<laneCount; i++)
436
- ExtrXor( i );
437
- }
438
- else {
439
- for(i=16; i<laneCount; i++)
440
- ExtrXor( i );
441
- }
442
- }
443
- else {
444
- for(i=0; i<laneCount; i++)
445
- ExtrXor( i );
446
- }
447
- #undef ExtrXor
448
- #undef ExtrXor4
449
- }
450
-
451
/*
 * Declares every V256 working variable used by the round macros:
 *   A??  - 25 state lanes,          B??  - 25 temporaries,
 *   C?   - 5 column values,         C?1  - 5 rotated column values,
 *   D?   - 5 theta offsets,         E??  - 25 lanes of the second state copy.
 */
#define declareABCDE \
    V256 Aba, Abe, Abi, Abo, Abu, \
         Aga, Age, Agi, Ago, Agu, \
         Aka, Ake, Aki, Ako, Aku, \
         Ama, Ame, Ami, Amo, Amu, \
         Asa, Ase, Asi, Aso, Asu; \
    V256 Bba, Bbe, Bbi, Bbo, Bbu, \
         Bga, Bge, Bgi, Bgo, Bgu, \
         Bka, Bke, Bki, Bko, Bku, \
         Bma, Bme, Bmi, Bmo, Bmu, \
         Bsa, Bse, Bsi, Bso, Bsu; \
    V256 Ca, Ce, Ci, Co, Cu, \
         Ca1, Ce1, Ci1, Co1, Cu1; \
    V256 Da, De, Di, Do, Du; \
    V256 Eba, Ebe, Ebi, Ebo, Ebu, \
         Ega, Ege, Egi, Ego, Egu, \
         Eka, Eke, Eki, Eko, Eku, \
         Ema, Eme, Emi, Emo, Emu, \
         Esa, Ese, Esi, Eso, Esu;
470
-
471
/*
 * Computes the five column values Ca..Cu: each one is the XOR of the five
 * A lanes whose name ends in the same letter (a, e, i, o or u).
 */
#define prepareTheta \
    Ca = XOR256(XOR256(XOR256(XOR256(Aba, Aga), Aka), Ama), Asa); \
    Ce = XOR256(XOR256(XOR256(XOR256(Abe, Age), Ake), Ame), Ase); \
    Ci = XOR256(XOR256(XOR256(XOR256(Abi, Agi), Aki), Ami), Asi); \
    Co = XOR256(XOR256(XOR256(XOR256(Abo, Ago), Ako), Amo), Aso); \
    Cu = XOR256(XOR256(XOR256(XOR256(Abu, Agu), Aku), Amu), Asu);
477
-
478
- /* --- Theta Rho Pi Chi Iota Prepare-theta
-  * One round over the V256 working variables: reads the lanes A##xy, writes the
-  * lanes E##xy, and leaves Ca..Cu ready for the following round.
-  *   i : index into KeccakF1600RoundConstants.
-  *   A : prefix of the input lanes (they are modified: each is XORed with a D value).
-  *   E : prefix of the output lanes.
-  * Steps, as written below:
-  *   1. Da..Du are built from the current Ca..Cu; each D is one C value XORed with
-  *      another C value that went through ROL64in256(..., 1).
-  *   2. Five groups follow. Each group XORs a D value into five A lanes, puts the
-  *      rotated (or, for A##ba, unrotated) result in five B temporaries, and sets
-  *      each E lane to XOR256(Bx, ANDnu256(By, Bz)) over cyclically consecutive
-  *      temporaries of that group.
-  *   3. KeccakF1600RoundConstants[i] is XORed into E##ba only.
-  *   4. Ca..Cu are rebuilt as the XOR of the E lanes ending in a, e, i, o, u.
-  * ROL64in256, ROL64in256_8, ROL64in256_56, ANDnu256 and CONST256_64 are defined
-  * elsewhere; by their names these are per-64-bit rotations, an and-not and a
-  * broadcast -- confirm against their definitions.
-  */
478
- /* --- 64-bit lanes mapped to 64-bit words */
479
- #define thetaRhoPiChiIotaPrepareTheta(i, A, E) \
480
- ROL64in256(Ce1, Ce, 1); \
481
- Da = XOR256(Cu, Ce1); \
482
- ROL64in256(Ci1, Ci, 1); \
483
- De = XOR256(Ca, Ci1); \
484
- ROL64in256(Co1, Co, 1); \
485
- Di = XOR256(Ce, Co1); \
486
- ROL64in256(Cu1, Cu, 1); \
487
- Do = XOR256(Ci, Cu1); \
488
- ROL64in256(Ca1, Ca, 1); \
489
- Du = XOR256(Co, Ca1); \
490
- /* group b: E##ba..E##bu from A##ba, A##ge, A##ki, A##mo, A##su; Ca..Cu are (re)initialised here */ \
491
- XOReq256(A##ba, Da); \
492
- Bba = A##ba; \
493
- XOReq256(A##ge, De); \
494
- ROL64in256(Bbe, A##ge, 44); \
495
- XOReq256(A##ki, Di); \
496
- ROL64in256(Bbi, A##ki, 43); \
497
- E##ba = XOR256(Bba, ANDnu256(Bbe, Bbi)); \
498
- XOReq256(E##ba, CONST256_64(KeccakF1600RoundConstants[i])); \
499
- Ca = E##ba; \
500
- XOReq256(A##mo, Do); \
501
- ROL64in256(Bbo, A##mo, 21); \
502
- E##be = XOR256(Bbe, ANDnu256(Bbi, Bbo)); \
503
- Ce = E##be; \
504
- XOReq256(A##su, Du); \
505
- ROL64in256(Bbu, A##su, 14); \
506
- E##bi = XOR256(Bbi, ANDnu256(Bbo, Bbu)); \
507
- Ci = E##bi; \
508
- E##bo = XOR256(Bbo, ANDnu256(Bbu, Bba)); \
509
- Co = E##bo; \
510
- E##bu = XOR256(Bbu, ANDnu256(Bba, Bbe)); \
511
- Cu = E##bu; \
512
- /* group g: E##ga..E##gu from A##bo, A##gu, A##ka, A##me, A##si; accumulated into Ca..Cu */ \
513
- XOReq256(A##bo, Do); \
514
- ROL64in256(Bga, A##bo, 28); \
515
- XOReq256(A##gu, Du); \
516
- ROL64in256(Bge, A##gu, 20); \
517
- XOReq256(A##ka, Da); \
518
- ROL64in256(Bgi, A##ka, 3); \
519
- E##ga = XOR256(Bga, ANDnu256(Bge, Bgi)); \
520
- XOReq256(Ca, E##ga); \
521
- XOReq256(A##me, De); \
522
- ROL64in256(Bgo, A##me, 45); \
523
- E##ge = XOR256(Bge, ANDnu256(Bgi, Bgo)); \
524
- XOReq256(Ce, E##ge); \
525
- XOReq256(A##si, Di); \
526
- ROL64in256(Bgu, A##si, 61); \
527
- E##gi = XOR256(Bgi, ANDnu256(Bgo, Bgu)); \
528
- XOReq256(Ci, E##gi); \
529
- E##go = XOR256(Bgo, ANDnu256(Bgu, Bga)); \
530
- XOReq256(Co, E##go); \
531
- E##gu = XOR256(Bgu, ANDnu256(Bga, Bge)); \
532
- XOReq256(Cu, E##gu); \
533
- /* group k: E##ka..E##ku from A##be, A##gi, A##ko, A##mu, A##sa; accumulated into Ca..Cu */ \
534
- XOReq256(A##be, De); \
535
- ROL64in256(Bka, A##be, 1); \
536
- XOReq256(A##gi, Di); \
537
- ROL64in256(Bke, A##gi, 6); \
538
- XOReq256(A##ko, Do); \
539
- ROL64in256(Bki, A##ko, 25); \
540
- E##ka = XOR256(Bka, ANDnu256(Bke, Bki)); \
541
- XOReq256(Ca, E##ka); \
542
- XOReq256(A##mu, Du); \
543
- ROL64in256_8(Bko, A##mu); \
544
- E##ke = XOR256(Bke, ANDnu256(Bki, Bko)); \
545
- XOReq256(Ce, E##ke); \
546
- XOReq256(A##sa, Da); \
547
- ROL64in256(Bku, A##sa, 18); \
548
- E##ki = XOR256(Bki, ANDnu256(Bko, Bku)); \
549
- XOReq256(Ci, E##ki); \
550
- E##ko = XOR256(Bko, ANDnu256(Bku, Bka)); \
551
- XOReq256(Co, E##ko); \
552
- E##ku = XOR256(Bku, ANDnu256(Bka, Bke)); \
553
- XOReq256(Cu, E##ku); \
554
- /* group m: E##ma..E##mu from A##bu, A##ga, A##ke, A##mi, A##so; accumulated into Ca..Cu */ \
555
- XOReq256(A##bu, Du); \
556
- ROL64in256(Bma, A##bu, 27); \
557
- XOReq256(A##ga, Da); \
558
- ROL64in256(Bme, A##ga, 36); \
559
- XOReq256(A##ke, De); \
560
- ROL64in256(Bmi, A##ke, 10); \
561
- E##ma = XOR256(Bma, ANDnu256(Bme, Bmi)); \
562
- XOReq256(Ca, E##ma); \
563
- XOReq256(A##mi, Di); \
564
- ROL64in256(Bmo, A##mi, 15); \
565
- E##me = XOR256(Bme, ANDnu256(Bmi, Bmo)); \
566
- XOReq256(Ce, E##me); \
567
- XOReq256(A##so, Do); \
568
- ROL64in256_56(Bmu, A##so); \
569
- E##mi = XOR256(Bmi, ANDnu256(Bmo, Bmu)); \
570
- XOReq256(Ci, E##mi); \
571
- E##mo = XOR256(Bmo, ANDnu256(Bmu, Bma)); \
572
- XOReq256(Co, E##mo); \
573
- E##mu = XOR256(Bmu, ANDnu256(Bma, Bme)); \
574
- XOReq256(Cu, E##mu); \
575
- /* group s: E##sa..E##su from A##bi, A##go, A##ku, A##ma, A##se; accumulated into Ca..Cu */ \
576
- XOReq256(A##bi, Di); \
577
- ROL64in256(Bsa, A##bi, 62); \
578
- XOReq256(A##go, Do); \
579
- ROL64in256(Bse, A##go, 55); \
580
- XOReq256(A##ku, Du); \
581
- ROL64in256(Bsi, A##ku, 39); \
582
- E##sa = XOR256(Bsa, ANDnu256(Bse, Bsi)); \
583
- XOReq256(Ca, E##sa); \
584
- XOReq256(A##ma, Da); \
585
- ROL64in256(Bso, A##ma, 41); \
586
- E##se = XOR256(Bse, ANDnu256(Bsi, Bso)); \
587
- XOReq256(Ce, E##se); \
588
- XOReq256(A##se, De); \
589
- ROL64in256(Bsu, A##se, 2); \
590
- E##si = XOR256(Bsi, ANDnu256(Bso, Bsu)); \
591
- XOReq256(Ci, E##si); \
592
- E##so = XOR256(Bso, ANDnu256(Bsu, Bsa)); \
593
- XOReq256(Co, E##so); \
594
- E##su = XOR256(Bsu, ANDnu256(Bsa, Bse)); \
595
- XOReq256(Cu, E##su); \
596
- \
598
-
599
- /* --- Theta Rho Pi Chi Iota
-  * One round over the V256 working variables: reads the lanes A##xy and writes
-  * the lanes E##xy. Unlike thetaRhoPiChiIotaPrepareTheta, Ca..Cu are only read
-  * here and are not recomputed from the E lanes.
-  *   i : index into KeccakF1600RoundConstants.
-  *   A : prefix of the input lanes (they are modified: each is XORed with a D value).
-  *   E : prefix of the output lanes.
-  * Steps, as written below:
-  *   1. Da..Du are built from the current Ca..Cu; each D is one C value XORed with
-  *      another C value that went through ROL64in256(..., 1).
-  *   2. Five groups follow. Each group XORs a D value into five A lanes, puts the
-  *      rotated (or, for A##ba, unrotated) result in five B temporaries, and sets
-  *      each E lane to XOR256(Bx, ANDnu256(By, Bz)) over cyclically consecutive
-  *      temporaries of that group.
-  *   3. KeccakF1600RoundConstants[i] is XORed into E##ba only.
-  * ROL64in256, ROL64in256_8, ROL64in256_56, ANDnu256 and CONST256_64 are defined
-  * elsewhere; by their names these are per-64-bit rotations, an and-not and a
-  * broadcast -- confirm against their definitions.
-  */
600
- /* --- 64-bit lanes mapped to 64-bit words */
601
- #define thetaRhoPiChiIota(i, A, E) \
602
- ROL64in256(Ce1, Ce, 1); \
603
- Da = XOR256(Cu, Ce1); \
604
- ROL64in256(Ci1, Ci, 1); \
605
- De = XOR256(Ca, Ci1); \
606
- ROL64in256(Co1, Co, 1); \
607
- Di = XOR256(Ce, Co1); \
608
- ROL64in256(Cu1, Cu, 1); \
609
- Do = XOR256(Ci, Cu1); \
610
- ROL64in256(Ca1, Ca, 1); \
611
- Du = XOR256(Co, Ca1); \
612
- /* group b: E##ba..E##bu from A##ba, A##ge, A##ki, A##mo, A##su */ \
613
- XOReq256(A##ba, Da); \
614
- Bba = A##ba; \
615
- XOReq256(A##ge, De); \
616
- ROL64in256(Bbe, A##ge, 44); \
617
- XOReq256(A##ki, Di); \
618
- ROL64in256(Bbi, A##ki, 43); \
619
- E##ba = XOR256(Bba, ANDnu256(Bbe, Bbi)); \
620
- XOReq256(E##ba, CONST256_64(KeccakF1600RoundConstants[i])); \
621
- XOReq256(A##mo, Do); \
622
- ROL64in256(Bbo, A##mo, 21); \
623
- E##be = XOR256(Bbe, ANDnu256(Bbi, Bbo)); \
624
- XOReq256(A##su, Du); \
625
- ROL64in256(Bbu, A##su, 14); \
626
- E##bi = XOR256(Bbi, ANDnu256(Bbo, Bbu)); \
627
- E##bo = XOR256(Bbo, ANDnu256(Bbu, Bba)); \
628
- E##bu = XOR256(Bbu, ANDnu256(Bba, Bbe)); \
629
- /* group g: E##ga..E##gu from A##bo, A##gu, A##ka, A##me, A##si */ \
630
- XOReq256(A##bo, Do); \
631
- ROL64in256(Bga, A##bo, 28); \
632
- XOReq256(A##gu, Du); \
633
- ROL64in256(Bge, A##gu, 20); \
634
- XOReq256(A##ka, Da); \
635
- ROL64in256(Bgi, A##ka, 3); \
636
- E##ga = XOR256(Bga, ANDnu256(Bge, Bgi)); \
637
- XOReq256(A##me, De); \
638
- ROL64in256(Bgo, A##me, 45); \
639
- E##ge = XOR256(Bge, ANDnu256(Bgi, Bgo)); \
640
- XOReq256(A##si, Di); \
641
- ROL64in256(Bgu, A##si, 61); \
642
- E##gi = XOR256(Bgi, ANDnu256(Bgo, Bgu)); \
643
- E##go = XOR256(Bgo, ANDnu256(Bgu, Bga)); \
644
- E##gu = XOR256(Bgu, ANDnu256(Bga, Bge)); \
645
- /* group k: E##ka..E##ku from A##be, A##gi, A##ko, A##mu, A##sa */ \
646
- XOReq256(A##be, De); \
647
- ROL64in256(Bka, A##be, 1); \
648
- XOReq256(A##gi, Di); \
649
- ROL64in256(Bke, A##gi, 6); \
650
- XOReq256(A##ko, Do); \
651
- ROL64in256(Bki, A##ko, 25); \
652
- E##ka = XOR256(Bka, ANDnu256(Bke, Bki)); \
653
- XOReq256(A##mu, Du); \
654
- ROL64in256_8(Bko, A##mu); \
655
- E##ke = XOR256(Bke, ANDnu256(Bki, Bko)); \
656
- XOReq256(A##sa, Da); \
657
- ROL64in256(Bku, A##sa, 18); \
658
- E##ki = XOR256(Bki, ANDnu256(Bko, Bku)); \
659
- E##ko = XOR256(Bko, ANDnu256(Bku, Bka)); \
660
- E##ku = XOR256(Bku, ANDnu256(Bka, Bke)); \
661
- /* group m: E##ma..E##mu from A##bu, A##ga, A##ke, A##mi, A##so */ \
662
- XOReq256(A##bu, Du); \
663
- ROL64in256(Bma, A##bu, 27); \
664
- XOReq256(A##ga, Da); \
665
- ROL64in256(Bme, A##ga, 36); \
666
- XOReq256(A##ke, De); \
667
- ROL64in256(Bmi, A##ke, 10); \
668
- E##ma = XOR256(Bma, ANDnu256(Bme, Bmi)); \
669
- XOReq256(A##mi, Di); \
670
- ROL64in256(Bmo, A##mi, 15); \
671
- E##me = XOR256(Bme, ANDnu256(Bmi, Bmo)); \
672
- XOReq256(A##so, Do); \
673
- ROL64in256_56(Bmu, A##so); \
674
- E##mi = XOR256(Bmi, ANDnu256(Bmo, Bmu)); \
675
- E##mo = XOR256(Bmo, ANDnu256(Bmu, Bma)); \
676
- E##mu = XOR256(Bmu, ANDnu256(Bma, Bme)); \
677
- /* group s: E##sa..E##su from A##bi, A##go, A##ku, A##ma, A##se */ \
678
- XOReq256(A##bi, Di); \
679
- ROL64in256(Bsa, A##bi, 62); \
680
- XOReq256(A##go, Do); \
681
- ROL64in256(Bse, A##go, 55); \
682
- XOReq256(A##ku, Du); \
683
- ROL64in256(Bsi, A##ku, 39); \
684
- E##sa = XOR256(Bsa, ANDnu256(Bse, Bsi)); \
685
- XOReq256(A##ma, Da); \
686
- ROL64in256(Bso, A##ma, 41); \
687
- E##se = XOR256(Bse, ANDnu256(Bsi, Bso)); \
688
- XOReq256(A##se, De); \
689
- ROL64in256(Bsu, A##se, 2); \
690
- E##si = XOR256(Bsi, ANDnu256(Bso, Bsu)); \
691
- E##so = XOR256(Bso, ANDnu256(Bsu, Bsa)); \
692
- E##su = XOR256(Bsu, ANDnu256(Bsa, Bse)); \
693
- \
694
-
695
- static ALIGN(KeccakP1600times4_statesAlignment) const uint64_t KeccakF1600RoundConstants[24] = {
696
- 0x0000000000000001ULL,
697
- 0x0000000000008082ULL,
698
- 0x800000000000808aULL,
699
- 0x8000000080008000ULL,
700
- 0x000000000000808bULL,
701
- 0x0000000080000001ULL,
702
- 0x8000000080008081ULL,
703
- 0x8000000000008009ULL,
704
- 0x000000000000008aULL,
705
- 0x0000000000000088ULL,
706
- 0x0000000080008009ULL,
707
- 0x000000008000000aULL,
708
- 0x000000008000808bULL,
709
- 0x800000000000008bULL,
710
- 0x8000000000008089ULL,
711
- 0x8000000000008003ULL,
712
- 0x8000000000008002ULL,
713
- 0x8000000000000080ULL,
714
- 0x000000000000800aULL,
715
- 0x800000008000000aULL,
716
- 0x8000000080008081ULL,
717
- 0x8000000000008080ULL,
718
- 0x0000000080000001ULL,
719
- 0x8000000080008008ULL};
720
-
721
/*
 * Loads state[0..24] into the 25 working variables X##ba .. X##su.
 * Each row below covers five consecutive lanes (b, g, k, m, s).
 */
#define copyFromState(X, state) \
    X##ba = LOAD256(state[ 0]); X##be = LOAD256(state[ 1]); X##bi = LOAD256(state[ 2]); X##bo = LOAD256(state[ 3]); X##bu = LOAD256(state[ 4]); \
    X##ga = LOAD256(state[ 5]); X##ge = LOAD256(state[ 6]); X##gi = LOAD256(state[ 7]); X##go = LOAD256(state[ 8]); X##gu = LOAD256(state[ 9]); \
    X##ka = LOAD256(state[10]); X##ke = LOAD256(state[11]); X##ki = LOAD256(state[12]); X##ko = LOAD256(state[13]); X##ku = LOAD256(state[14]); \
    X##ma = LOAD256(state[15]); X##me = LOAD256(state[16]); X##mi = LOAD256(state[17]); X##mo = LOAD256(state[18]); X##mu = LOAD256(state[19]); \
    X##sa = LOAD256(state[20]); X##se = LOAD256(state[21]); X##si = LOAD256(state[22]); X##so = LOAD256(state[23]); X##su = LOAD256(state[24]);
747
-
748
/*
 * Stores the 25 working variables X##ba .. X##su into state[0..24].
 * Each row below covers five consecutive lanes (b, g, k, m, s).
 */
#define copyToState(state, X) \
    STORE256(state[ 0], X##ba); STORE256(state[ 1], X##be); STORE256(state[ 2], X##bi); STORE256(state[ 3], X##bo); STORE256(state[ 4], X##bu); \
    STORE256(state[ 5], X##ga); STORE256(state[ 6], X##ge); STORE256(state[ 7], X##gi); STORE256(state[ 8], X##go); STORE256(state[ 9], X##gu); \
    STORE256(state[10], X##ka); STORE256(state[11], X##ke); STORE256(state[12], X##ki); STORE256(state[13], X##ko); STORE256(state[14], X##ku); \
    STORE256(state[15], X##ma); STORE256(state[16], X##me); STORE256(state[17], X##mi); STORE256(state[18], X##mo); STORE256(state[19], X##mu); \
    STORE256(state[20], X##sa); STORE256(state[21], X##se); STORE256(state[22], X##si); STORE256(state[23], X##so); STORE256(state[24], X##su);
774
-
775
/*
 * Copies the 25 working variables Y##ba .. Y##su into X##ba .. X##su.
 * Each row below covers five consecutive lanes (b, g, k, m, s).
 */
#define copyStateVariables(X, Y) \
    X##ba = Y##ba; X##be = Y##be; X##bi = Y##bi; X##bo = Y##bo; X##bu = Y##bu; \
    X##ga = Y##ga; X##ge = Y##ge; X##gi = Y##gi; X##go = Y##go; X##gu = Y##gu; \
    X##ka = Y##ka; X##ke = Y##ke; X##ki = Y##ki; X##ko = Y##ko; X##ku = Y##ku; \
    X##ma = Y##ma; X##me = Y##me; X##mi = Y##mi; X##mo = Y##mo; X##mu = Y##mu; \
    X##sa = Y##sa; X##se = Y##se; X##si = Y##si; X##so = Y##so; X##su = Y##su;
801
- /* Configure the unrolling macros before including them: FullUnrolling is
-  * defined when KeccakP1600times4_fullUnrolling is set; otherwise Unrolling
-  * takes the value of KeccakP1600times4_unrolling. The rounds24 / rounds12 /
-  * rounds6 / rounds4 macros used by the functions below are presumably
-  * provided by this include -- it is not visible here, confirm. */
802
- #ifdef KeccakP1600times4_fullUnrolling
803
- #define FullUnrolling
804
- #else
805
- #define Unrolling KeccakP1600times4_unrolling
806
- #endif
807
- #include "KeccakP-1600-unrolling.macros"
808
-
809
- void KeccakP1600times4_PermuteAll_24rounds(void *states)
810
- {
811
- V256 *statesAsLanes = (V256 *)states;
812
- declareABCDE
813
- #ifndef KeccakP1600times4_fullUnrolling
814
- unsigned int i;
815
- #endif
816
-
817
- copyFromState(A, statesAsLanes)
818
- rounds24
819
- copyToState(statesAsLanes, A)
820
- }
821
-
822
- void KeccakP1600times4_PermuteAll_12rounds(void *states)
823
- {
824
- V256 *statesAsLanes = (V256 *)states;
825
- declareABCDE
826
- #ifndef KeccakP1600times4_fullUnrolling
827
- unsigned int i;
828
- #endif
829
-
830
- copyFromState(A, statesAsLanes)
831
- rounds12
832
- copyToState(statesAsLanes, A)
833
- }
834
-
835
- void KeccakP1600times4_PermuteAll_6rounds(void *states)
836
- {
837
- V256 *statesAsLanes = (V256 *)states;
838
- declareABCDE
839
- #ifndef KeccakP1600times4_fullUnrolling
840
- unsigned int i;
841
- #endif
842
-
843
- copyFromState(A, statesAsLanes)
844
- rounds6
845
- copyToState(statesAsLanes, A)
846
- }
847
-
848
- void KeccakP1600times4_PermuteAll_4rounds(void *states)
849
- {
850
- V256 *statesAsLanes = (V256 *)states;
851
- declareABCDE
852
- #ifndef KeccakP1600times4_fullUnrolling
853
- unsigned int i;
854
- #endif
855
-
856
- copyFromState(A, statesAsLanes)
857
- rounds4
858
- copyToState(statesAsLanes, A)
859
- }
860
-
861
- size_t KeccakF1600times4_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, const unsigned char *data, size_t dataByteLen)
862
- {
863
- if (laneCount == 21) {
864
- #if 0
865
- const unsigned char *dataStart = data;
866
- const uint64_t *curData0 = (const uint64_t *)data;
867
- const uint64_t *curData1 = (const uint64_t *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes);
868
- const uint64_t *curData2 = (const uint64_t *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes);
869
- const uint64_t *curData3 = (const uint64_t *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes);
870
-
871
- while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
872
- V256 *stateAsLanes = (V256 *)states;
873
- V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;
874
- #define Xor_In( argIndex ) \
875
- XOReq256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))
876
- #define Xor_In4( argIndex ) \
877
- lanes0 = LOAD256u( curData0[argIndex]),\
878
- lanes1 = LOAD256u( curData1[argIndex]),\
879
- lanes2 = LOAD256u( curData2[argIndex]),\
880
- lanes3 = LOAD256u( curData3[argIndex]),\
881
- INTLEAVE(),\
882
- XOReq256( stateAsLanes[argIndex+0], lanes0 ),\
883
- XOReq256( stateAsLanes[argIndex+1], lanes1 ),\
884
- XOReq256( stateAsLanes[argIndex+2], lanes2 ),\
885
- XOReq256( stateAsLanes[argIndex+3], lanes3 )
886
- Xor_In4( 0 );
887
- Xor_In4( 4 );
888
- Xor_In4( 8 );
889
- Xor_In4( 12 );
890
- Xor_In4( 16 );
891
- Xor_In( 20 );
892
- #undef Xor_In
893
- #undef Xor_In4
894
- KeccakP1600times4_PermuteAll_24rounds(states);
895
- curData0 += laneOffsetSerial;
896
- curData1 += laneOffsetSerial;
897
- curData2 += laneOffsetSerial;
898
- curData3 += laneOffsetSerial;
899
- dataByteLen -= laneOffsetSerial*8;
900
- }
901
- return (const unsigned char *)curData0 - dataStart;
902
- #else
903
- unsigned int i;
904
- const unsigned char *dataStart = data;
905
- const uint64_t *curData0 = (const uint64_t *)data;
906
- const uint64_t *curData1 = (const uint64_t *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes);
907
- const uint64_t *curData2 = (const uint64_t *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes);
908
- const uint64_t *curData3 = (const uint64_t *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes);
909
- V256 *statesAsLanes = (V256 *)states;
910
- declareABCDE
911
-
912
- copyFromState(A, statesAsLanes)
913
- while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
914
- #define XOR_In( Xxx, argIndex ) \
915
- XOReq256(Xxx, LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))
916
- XOR_In( Aba, 0 );
917
- XOR_In( Abe, 1 );
918
- XOR_In( Abi, 2 );
919
- XOR_In( Abo, 3 );
920
- XOR_In( Abu, 4 );
921
- XOR_In( Aga, 5 );
922
- XOR_In( Age, 6 );
923
- XOR_In( Agi, 7 );
924
- XOR_In( Ago, 8 );
925
- XOR_In( Agu, 9 );
926
- XOR_In( Aka, 10 );
927
- XOR_In( Ake, 11 );
928
- XOR_In( Aki, 12 );
929
- XOR_In( Ako, 13 );
930
- XOR_In( Aku, 14 );
931
- XOR_In( Ama, 15 );
932
- XOR_In( Ame, 16 );
933
- XOR_In( Ami, 17 );
934
- XOR_In( Amo, 18 );
935
- XOR_In( Amu, 19 );
936
- XOR_In( Asa, 20 );
937
- #undef XOR_In
938
- rounds24
939
- curData0 += laneOffsetSerial;
940
- curData1 += laneOffsetSerial;
941
- curData2 += laneOffsetSerial;
942
- curData3 += laneOffsetSerial;
943
- dataByteLen -= laneOffsetSerial*8;
944
- }
945
- copyToState(statesAsLanes, A)
946
- return (const unsigned char *)curData0 - dataStart;
947
- #endif
948
- }
949
- else {
950
- unsigned int i;
951
- const unsigned char *dataStart = data;
952
-
953
- while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
954
- KeccakP1600times4_AddLanesAll(states, data, laneCount, laneOffsetParallel);
955
- KeccakP1600times4_PermuteAll_24rounds(states);
956
- data += laneOffsetSerial*8;
957
- dataByteLen -= laneOffsetSerial*8;
958
- }
959
- return data - dataStart;
960
- }
961
- }
962
-
963
- size_t KeccakP1600times4_12rounds_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, const unsigned char *data, size_t dataByteLen)
964
- {
965
- if (laneCount == 21) {
966
- #if 0
967
- const unsigned char *dataStart = data;
968
- const uint64_t *curData0 = (const uint64_t *)data;
969
- const uint64_t *curData1 = (const uint64_t *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes);
970
- const uint64_t *curData2 = (const uint64_t *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes);
971
- const uint64_t *curData3 = (const uint64_t *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes);
972
-
973
- while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
974
- V256 *stateAsLanes = states;
975
- V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;
976
- #define Xor_In( argIndex ) \
977
- XOReq256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))
978
- #define Xor_In4( argIndex ) \
979
- lanes0 = LOAD256u( curData0[argIndex]),\
980
- lanes1 = LOAD256u( curData1[argIndex]),\
981
- lanes2 = LOAD256u( curData2[argIndex]),\
982
- lanes3 = LOAD256u( curData3[argIndex]),\
983
- INTLEAVE(),\
984
- XOReq256( stateAsLanes[argIndex+0], lanes0 ),\
985
- XOReq256( stateAsLanes[argIndex+1], lanes1 ),\
986
- XOReq256( stateAsLanes[argIndex+2], lanes2 ),\
987
- XOReq256( stateAsLanes[argIndex+3], lanes3 )
988
- Xor_In4( 0 );
989
- Xor_In4( 4 );
990
- Xor_In4( 8 );
991
- Xor_In4( 12 );
992
- Xor_In4( 16 );
993
- Xor_In( 20 );
994
- #undef Xor_In
995
- #undef Xor_In4
996
- KeccakP1600times4_PermuteAll_12rounds(states);
997
- curData0 += laneOffsetSerial;
998
- curData1 += laneOffsetSerial;
999
- curData2 += laneOffsetSerial;
1000
- curData3 += laneOffsetSerial;
1001
- dataByteLen -= laneOffsetSerial*8;
1002
- }
1003
- return (const unsigned char *)curData0 - dataStart;
1004
- #else
1005
- unsigned int i;
1006
- const unsigned char *dataStart = data;
1007
- const uint64_t *curData0 = (const uint64_t *)data;
1008
- const uint64_t *curData1 = (const uint64_t *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes);
1009
- const uint64_t *curData2 = (const uint64_t *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes);
1010
- const uint64_t *curData3 = (const uint64_t *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes);
1011
- V256 *statesAsLanes = states;
1012
- declareABCDE
1013
-
1014
- copyFromState(A, statesAsLanes)
1015
- while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
1016
- #define XOR_In( Xxx, argIndex ) \
1017
- XOReq256(Xxx, LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))
1018
- XOR_In( Aba, 0 );
1019
- XOR_In( Abe, 1 );
1020
- XOR_In( Abi, 2 );
1021
- XOR_In( Abo, 3 );
1022
- XOR_In( Abu, 4 );
1023
- XOR_In( Aga, 5 );
1024
- XOR_In( Age, 6 );
1025
- XOR_In( Agi, 7 );
1026
- XOR_In( Ago, 8 );
1027
- XOR_In( Agu, 9 );
1028
- XOR_In( Aka, 10 );
1029
- XOR_In( Ake, 11 );
1030
- XOR_In( Aki, 12 );
1031
- XOR_In( Ako, 13 );
1032
- XOR_In( Aku, 14 );
1033
- XOR_In( Ama, 15 );
1034
- XOR_In( Ame, 16 );
1035
- XOR_In( Ami, 17 );
1036
- XOR_In( Amo, 18 );
1037
- XOR_In( Amu, 19 );
1038
- XOR_In( Asa, 20 );
1039
- #undef XOR_In
1040
- rounds12
1041
- curData0 += laneOffsetSerial;
1042
- curData1 += laneOffsetSerial;
1043
- curData2 += laneOffsetSerial;
1044
- curData3 += laneOffsetSerial;
1045
- dataByteLen -= laneOffsetSerial*8;
1046
- }
1047
- copyToState(statesAsLanes, A)
1048
- return (const unsigned char *)curData0 - dataStart;
1049
- #endif
1050
- }
1051
- else {
1052
- unsigned int i;
1053
- const unsigned char *dataStart = data;
1054
-
1055
- while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
1056
- KeccakP1600times4_AddLanesAll(states, data, laneCount, laneOffsetParallel);
1057
- KeccakP1600times4_PermuteAll_12rounds(states);
1058
- data += laneOffsetSerial*8;
1059
- dataByteLen -= laneOffsetSerial*8;
1060
- }
1061
- return data - dataStart;
1062
- }
1063
- }
1064
-
1065
- /* ------------------------------------------------------------------------- */
1066
-
1067
/*
 * Rearranges four vectors in place using UNPACKL/UNPACKH followed by PERM128.
 * v0..v3 must be distinct lvalues; the temporaries lanesL01, lanesH01,
 * lanesL23 and lanesH23 must be declared by the caller.
 * Expands to a single comma expression.
 */
#define UNINTLEAVEa(v0, v1, v2, v3) \
    lanesL01 = UNPACKL( v0, v1 ), \
    lanesH01 = UNPACKH( v0, v1 ), \
    lanesL23 = UNPACKL( v2, v3 ), \
    lanesH23 = UNPACKH( v2, v3 ), \
    v0 = PERM128( lanesL01, lanesL23, 0x20 ), \
    v1 = PERM128( lanesH01, lanesH23, 0x20 ), \
    v2 = PERM128( lanesL01, lanesL23, 0x31 ), \
    v3 = PERM128( lanesH01, lanesH23, 0x31 )
1076
-
1077
/*
 * Rearranges four vectors in place using PERM128 followed by SHUFFLE64.
 * v0..v3 must be distinct lvalues; the temporaries lanesL01, lanesH01,
 * lanesL23 and lanesH23 must be declared by the caller.
 * Expands to a single comma expression.
 */
#define INTLEAVEa(v0, v1, v2, v3) \
    lanesL01 = PERM128( v0, v2, 0x20 ), \
    lanesL23 = PERM128( v0, v2, 0x31 ), \
    lanesH01 = PERM128( v1, v3, 0x20 ), \
    lanesH23 = PERM128( v1, v3, 0x31 ), \
    v0 = SHUFFLE64( lanesL01, lanesH01, 0x00 ), \
    v1 = SHUFFLE64( lanesL01, lanesH01, 0x0F ), \
    v2 = SHUFFLE64( lanesL23, lanesH23, 0x00 ), \
    v3 = SHUFFLE64( lanesL23, lanesH23, 0x0F )
1086
-
1087
-
1088
/*
 * XORs into `acc` the vector built from word `idx` of four consecutive
 * 25-word blocks of `src` (block 3 is the first LOAD4_64 argument, block 0
 * the last).
 */
#define LoadXOReq256( acc, src, idx) \
    XOReq256( acc, \
              LOAD4_64(src[3*25+idx], \
                       src[2*25+idx], \
                       src[1*25+idx], \
                       src[0*25+idx]) )
1089
-
1090
- /* ------------------------------------------------------------------------- */
1091
-
1092
/*
 * AddOverWr4: fills r0..r3 from `in` and XORs each with one broadcast key
 * word (k[idx+0] .. k[idx+3]). Expands to a single comma expression.
 *
 * With UseGatherScatter the four vectors are gathered directly, using the
 * index vector `gather` that must exist in the caller's scope. Otherwise four
 * unaligned loads, spaced 25 words apart, are rearranged with INTLEAVEa.
 */
#if defined(UseGatherScatter)

#define AddOverWr4( r0, r1, r2, r3, k, in, idx ) \
    r0 = _mm256_i32gather_epi64((const long long int *)&in[idx+0], gather, 1), \
    r1 = _mm256_i32gather_epi64((const long long int *)&in[idx+1], gather, 1), \
    r2 = _mm256_i32gather_epi64((const long long int *)&in[idx+2], gather, 1), \
    r3 = _mm256_i32gather_epi64((const long long int *)&in[idx+3], gather, 1), \
    XOReq256( r0, CONST256_64( k[idx+0])), \
    XOReq256( r1, CONST256_64( k[idx+1])), \
    XOReq256( r2, CONST256_64( k[idx+2])), \
    XOReq256( r3, CONST256_64( k[idx+3]))

#else

#define AddOverWr4( r0, r1, r2, r3, k, in, idx ) \
    r0 = LOAD256u( in[idx+0*25]), \
    r1 = LOAD256u( in[idx+1*25]), \
    r2 = LOAD256u( in[idx+2*25]), \
    r3 = LOAD256u( in[idx+3*25]), \
    INTLEAVEa(r0, r1, r2, r3), \
    XOReq256( r0, CONST256_64( k[idx+0])), \
    XOReq256( r1, CONST256_64( k[idx+1])), \
    XOReq256( r2, CONST256_64( k[idx+2])), \
    XOReq256( r3, CONST256_64( k[idx+3]))

#endif
1118
-
1119
#if defined(__i386__) || defined(_M_IX86)
/*
 * Fallback for 32-bit x86 builds: assemble the 64-bit element `index` of
 * the 256-bit vector `a` from its two 32-bit halves (element 2*index is the
 * low half, element 2*index+1 the high half).
 *
 * - The halves are combined with bitwise OR.  A logical OR would collapse
 *   the whole expression to 0 or 1.
 * - _mm256_extract_epi32 returns a signed int, so each half goes through
 *   uint32_t before widening; otherwise a low half with bit 31 set would be
 *   sign-extended and corrupt the high half.
 *
 * `index` must be a compile-time constant, as the underlying intrinsic
 * requires.
 */
#define _mm256_extract_epi64(a, index) \
    ((uint64_t)(uint32_t)_mm256_extract_epi32((a), (index)*2) \
        | ((uint64_t)(uint32_t)_mm256_extract_epi32((a), (index)*2+1) << 32))
#endif
1123
-
1124
/*
 * ExtrAccu(lanes, p, argIndex)
 *
 * XORs the four 64-bit elements of `lanes` together and accumulates the
 * result into p[argIndex].
 */
#define ExtrAccu( lanes, p, argIndex )                          \
    p[argIndex] ^= _mm256_extract_epi64(lanes, 0)               \
                 ^ _mm256_extract_epi64(lanes, 1)               \
                 ^ _mm256_extract_epi64(lanes, 2)               \
                 ^ _mm256_extract_epi64(lanes, 3)
1126
-
1127
/*
 * ExtrAccu4(lanes0, lanes1, lanes2, lanes3, p, argIndex)
 *
 * Four-lane counterpart of ExtrAccu: after UNINTLEAVEa regroups the four
 * vectors, they are XORed together and the result is XORed into the four
 * consecutive words p[argIndex .. argIndex+3] with one unaligned load and
 * one unaligned store.
 *
 * All four arguments are clobbered; lanes1 is reused as the temporary that
 * holds the previous contents of p[argIndex .. argIndex+3].
 */
#define ExtrAccu4( lanes0, lanes1, lanes2, lanes3, p, argIndex )    \
    UNINTLEAVEa(lanes0, lanes1, lanes2, lanes3),                    \
    XOReq256( lanes0, lanes1 ),                                     \
    XOReq256( lanes2, lanes3 ),                                     \
    lanes1 = LOAD256u( p[argIndex]),                                \
    XOReq256( lanes0, lanes2 ),                                     \
    XOReq256( lanes0, lanes1 ),                                     \
    STORE256u( p[argIndex], lanes0 )
1135
-
1136
/*
 * Kravatte_Rollc()
 *
 * Advances the rolling words kept in the locals x0x1x2x3 and x1x2x3x4
 * (KeccakP1600times4_KravatteCompress loads them from kRoll[20] and
 * kRoll[21]) and fills the lane vectors Asa, Ase, Asi, Aso and Asu:
 *
 *   Asa      <- x0x1x2x3 (value on entry)
 *   Ase      <- x1x2x3x4 (value on entry)
 *   x1x2x3x4 <- ROL64in256 of x0x1x2x3 by 7, XOR Ase, XOR (Ase >> 3)
 *   Asi, Aso, Asu <- permutes/blends of Ase and the new x1x2x3x4
 *   x0x1x2x3 <- Asu
 *
 * NOTE(review): ROL64in256(dst, src, n) is defined outside this view; it is
 * presumably a per-word rotate-left of src by n bits written to dst.
 *
 * The expansion is one comma expression and every step depends on the
 * previous ones, so the order must not be changed.
 */
#define Kravatte_Rollc()                                                        \
    Asa = x0x1x2x3,                                                             \
    Ase = x1x2x3x4,                                                             \
    ROL64in256(x1x2x3x4, x0x1x2x3, 7),                                          \
    XOReq256(x1x2x3x4, Ase),                                                    \
    XOReq256(x1x2x3x4, _mm256_srli_epi64(Ase, 3)),                              \
    Asi = _mm256_blend_epi32(_mm256_permute4x64_epi64(Ase, 0x39),               \
                             _mm256_permute4x64_epi64(x1x2x3x4, 0x39), 0xC0),   \
    Aso = PERM128(Ase, x1x2x3x4, 0x21),                                         \
    Asu = _mm256_blend_epi32(_mm256_permute4x64_epi64(Ase, 0xFF),               \
                             _mm256_permute4x64_epi64(x1x2x3x4, 0x90), 0xFC),   \
    x0x1x2x3 = Asu
1146
-
1147
- size_t KeccakP1600times4_KravatteCompress(uint64_t *xAccu, uint64_t *kRoll, const unsigned char *input, size_t inputByteLen)
1148
- {
1149
- uint64_t *in64 = (uint64_t *)input;
1150
- size_t nBlocks = inputByteLen / (4 * 200);
1151
- declareABCDE
1152
- #if !defined(KeccakP1600times4_fullUnrolling)
1153
- unsigned int i;
1154
- #endif
1155
- V256 lanesL01, lanesL23, lanesH01, lanesH23;
1156
- V256 x0x1x2x3, x1x2x3x4;
1157
- #if defined(UseGatherScatter)
1158
- V128 gather = _mm_setr_epi32(0*25*8, 1*25*8, 2*25*8, 3*25*8);
1159
- #endif
1160
-
1161
- x0x1x2x3 = LOAD256u(kRoll[20]);
1162
- x1x2x3x4 = LOAD256u(kRoll[21]);
1163
- do {
1164
- AddOverWr4( Aba, Abe, Abi, Abo, kRoll, in64, 0 );
1165
- AddOverWr4( Abu, Aga, Age, Agi, kRoll, in64, 4 );
1166
- AddOverWr4( Ago, Agu, Aka, Ake, kRoll, in64, 8 );
1167
- AddOverWr4( Aki, Ako, Aku, Ama, kRoll, in64, 12 );
1168
- AddOverWr4( Ame, Ami, Amo, Amu, kRoll, in64, 16 );
1169
- Kravatte_Rollc();
1170
- LoadXOReq256(Asa, in64, 20);
1171
- LoadXOReq256(Ase, in64, 21);
1172
- LoadXOReq256(Asi, in64, 22);
1173
- LoadXOReq256(Aso, in64, 23);
1174
- LoadXOReq256(Asu, in64, 24);
1175
- rounds6
1176
- ExtrAccu4(Aba, Abe, Abi, Abo, xAccu, 0 );
1177
- ExtrAccu4(Abu, Aga, Age, Agi, xAccu, 4 );
1178
- ExtrAccu4(Ago, Agu, Aka, Ake, xAccu, 8 );
1179
- ExtrAccu4(Aki, Ako, Aku, Ama, xAccu, 12 );
1180
- ExtrAccu4(Ame, Ami, Amo, Amu, xAccu, 16 );
1181
- ExtrAccu4(Asa, Ase, Asi, Aso, xAccu, 20 );
1182
- ExtrAccu( Asu, xAccu, 24 );
1183
- in64 += 4 * 25;
1184
- }
1185
- while(--nBlocks != 0);
1186
- STORE256u(kRoll[20], x0x1x2x3);
1187
- kRoll[24] = _mm256_extract_epi64(x1x2x3x4, 3);
1188
-
1189
- return (size_t)in64 - (size_t)input;
1190
- }
1191
-
1192
/* The compress-side helper macros are not needed past this point. */
#undef LoadXOReq256
#undef AddOverWr4
#undef ExtrAccu
#undef ExtrAccu4
1196
-
1197
- /* ------------------------------------------------------------------------- */
1198
-
1199
/*
 * ExtrAddKey(lanes, p, argIndex)
 *
 * XORs `lanes` with kRoll[argIndex] (broadcast through CONST256_64), then
 * writes its four 64-bit elements to word `argIndex` of four consecutive
 * 25-word groups of `p`.
 *
 * `kRoll` is not a macro parameter; it is taken from the enclosing scope.
 */
#define ExtrAddKey( lanes, p, argIndex )                            \
    XOReq256(lanes, CONST256_64(kRoll[argIndex])),                  \
    p[argIndex+0*25] = _mm256_extract_epi64(lanes, 0),              \
    p[argIndex+1*25] = _mm256_extract_epi64(lanes, 1),              \
    p[argIndex+2*25] = _mm256_extract_epi64(lanes, 2),              \
    p[argIndex+3*25] = _mm256_extract_epi64(lanes, 3)
1205
-
1206
/*
 * ExtrAddKey4(lanes0, lanes1, lanes2, lanes3, p, argIndex)
 *
 * Four-lane counterpart of ExtrAddKey: XORs each vector with its key word
 * kRoll[argIndex+k] (kRoll comes from the enclosing scope) and writes words
 * argIndex .. argIndex+3 of four consecutive 25-word groups of `p`.
 *
 * The scatter-based variant is compiled out with `#if 0`.
 * NOTE(review): presumably because _mm256_i32scatter_epi64 is not part of
 * AVX2 -- confirm before re-enabling it.  The active variant regroups the
 * vectors with UNINTLEAVEa and issues four unaligned 256-bit stores.
 */
#if 0 /* defined(UseGatherScatter) */

#define ExtrAddKey4( lanes0, lanes1, lanes2, lanes3, p, argIndex )                          \
    XOReq256(lanes0, CONST256_64(kRoll[argIndex+0])),                                       \
    XOReq256(lanes1, CONST256_64(kRoll[argIndex+1])),                                       \
    XOReq256(lanes2, CONST256_64(kRoll[argIndex+2])),                                       \
    XOReq256(lanes3, CONST256_64(kRoll[argIndex+3])),                                       \
    _mm256_i32scatter_epi64((long long int *)&p[argIndex+0], scatter, lanes0, 1),           \
    _mm256_i32scatter_epi64((long long int *)&p[argIndex+1], scatter, lanes1, 1),           \
    _mm256_i32scatter_epi64((long long int *)&p[argIndex+2], scatter, lanes2, 1),           \
    _mm256_i32scatter_epi64((long long int *)&p[argIndex+3], scatter, lanes3, 1)

#else

#define ExtrAddKey4( lanes0, lanes1, lanes2, lanes3, p, argIndex )                          \
    XOReq256(lanes0, CONST256_64(kRoll[argIndex+0])),                                       \
    XOReq256(lanes1, CONST256_64(kRoll[argIndex+1])),                                       \
    XOReq256(lanes2, CONST256_64(kRoll[argIndex+2])),                                       \
    XOReq256(lanes3, CONST256_64(kRoll[argIndex+3])),                                       \
    UNINTLEAVEa(lanes0, lanes1, lanes2, lanes3),                                            \
    STORE256u( p[argIndex+0*25], lanes0 ),                                                  \
    STORE256u( p[argIndex+1*25], lanes1 ),                                                  \
    STORE256u( p[argIndex+2*25], lanes2 ),                                                  \
    STORE256u( p[argIndex+3*25], lanes3 )

#endif
1232
-
1233
- size_t KeccakP1600times4_KravatteExpand(uint64_t *yAccu, const uint64_t *kRoll, unsigned char *output, size_t outputByteLen)
1234
- {
1235
- uint64_t *out64 = (uint64_t *)output;
1236
- size_t nBlocks = outputByteLen / (4 * 200);
1237
- declareABCDE
1238
- #if !defined(KeccakP1600times4_fullUnrolling)
1239
- unsigned int i;
1240
- #endif
1241
- V256 lanesL01, lanesL23, lanesH01, lanesH23;
1242
- #if defined(UseGatherScatter)
1243
- V128 scatter = _mm_setr_epi32(0*25*8, 1*25*8, 2*25*8, 3*25*8);
1244
- #endif
1245
-
1246
- do {
1247
- Aba = CONST256_64(yAccu[0]);
1248
- Abe = CONST256_64(yAccu[1]);
1249
- Abi = CONST256_64(yAccu[2]);
1250
- Abo = CONST256_64(yAccu[3]);
1251
- Abu = CONST256_64(yAccu[4]);
1252
-
1253
- Aga = CONST256_64(yAccu[5]);
1254
- Age = CONST256_64(yAccu[6]);
1255
- Agi = CONST256_64(yAccu[7]);
1256
- Ago = CONST256_64(yAccu[8]);
1257
- Agu = CONST256_64(yAccu[9]);
1258
-
1259
- Aka = CONST256_64(yAccu[10]);
1260
- Ake = CONST256_64(yAccu[11]);
1261
- Aki = CONST256_64(yAccu[12]);
1262
- Ako = CONST256_64(yAccu[13]);
1263
- Aku = CONST256_64(yAccu[14]);
1264
-
1265
- Ama = LOAD256u(yAccu[15]);
1266
- Ame = LOAD256u(yAccu[16]);
1267
- Ami = LOAD256u(yAccu[17]);
1268
- Amo = LOAD256u(yAccu[18]);
1269
- Amu = LOAD256u(yAccu[19]);
1270
-
1271
- ROL64in256(lanesL01, Ama, 7);
1272
- ROL64in256(lanesH01, Ame, 18);
1273
- lanesL01 = XOR256(lanesL01, lanesH01);
1274
- lanesH01 = _mm256_and_si256(Ami, _mm256_srli_epi64(Ame, 1));
1275
- lanesL01 = XOR256(lanesL01, lanesH01);
1276
-
1277
- Asa = LOAD256u(yAccu[20]);
1278
- Ase = LOAD256u(yAccu[21]);
1279
- #if defined(__i386__) || defined(_M_IX86)
1280
- Asi = _mm256_permute4x64_epi64(Ase, 0x39);
1281
- Asi = _mm256_insert_epi32(Asi, _mm256_extract_epi32(lanesL01, 0), 6);
1282
- Asi = _mm256_insert_epi32(Asi, _mm256_extract_epi32(lanesL01, 1), 7);
1283
- #else
1284
- Asi = _mm256_insert_epi64(_mm256_permute4x64_epi64(Ase, 0x39), _mm256_extract_epi64(lanesL01, 0), 3);
1285
- #endif
1286
- Aso = _mm256_permute2x128_si256(Ase, lanesL01, 0x21);
1287
- #if defined(__i386__) || defined(_M_IX86)
1288
- Asu = _mm256_permute4x64_epi64(lanesL01, 0x93);
1289
- Asu = _mm256_insert_epi32(Asu, _mm256_extract_epi32(Ase, 6), 0);
1290
- Asu = _mm256_insert_epi32(Asu, _mm256_extract_epi32(Ase, 7), 1);
1291
- #else
1292
- Asu = _mm256_insert_epi64(_mm256_permute4x64_epi64(lanesL01, 0x93), _mm256_extract_epi64(Ase, 3), 0);
1293
- #endif
1294
-
1295
- STORE256u(yAccu[15], Amu);
1296
- yAccu[19] = _mm256_extract_epi64(Aso, 0);
1297
- yAccu[20] = _mm256_extract_epi64(Aso, 1);
1298
- STORE256u(yAccu[21], lanesL01);
1299
-
1300
- rounds6
1301
- ExtrAddKey4(Aba, Abe, Abi, Abo, out64, 0 );
1302
- ExtrAddKey4(Abu, Aga, Age, Agi, out64, 4 );
1303
- ExtrAddKey4(Ago, Agu, Aka, Ake, out64, 8 );
1304
- ExtrAddKey4(Aki, Ako, Aku, Ama, out64, 12 );
1305
- ExtrAddKey4(Ame, Ami, Amo, Amu, out64, 16 );
1306
- ExtrAddKey4(Asa, Ase, Asi, Aso, out64, 20 );
1307
- ExtrAddKey( Asu, out64, 24 );
1308
- out64 += 4 * 25;
1309
- }
1310
- while(--nBlocks != 0);
1311
-
1312
- return (size_t)out64 - (size_t)output;
1313
- }
1314
-
1315
/*
 * Drop the Kravatte helper macros.  #undef of a name that is not currently
 * defined is a no-op, so the historical names OverWr4 and Kravatte_Roll are
 * kept for safety.  Kravatte_Rollc is the roll macro actually defined for
 * the compress function, so it is undefined here as well to stop it leaking
 * into the rest of the translation unit.
 */
#undef OverWr4
#undef ExtrAddKey
#undef ExtrAddKey4

#undef Kravatte_Roll
#undef Kravatte_Rollc
#undef UNINTLEAVEa
#undef INTLEAVEa