digest-kangarootwelve 0.0.2 → 0.3.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (307) hide show
  1. checksums.yaml +5 -5
  2. data/README.md +71 -37
  3. data/Rakefile +7 -9
  4. data/digest-kangarootwelve.gemspec +323 -14
  5. data/ext/digest/kangarootwelve/ext.c +228 -177
  6. data/ext/digest/kangarootwelve/extconf.rb +15 -1
  7. data/ext/digest/kangarootwelve/keccak/armv6m/KangarooTwelve.link.c +1 -0
  8. data/ext/digest/kangarootwelve/keccak/armv6m/KeccakDuplexWidth1600.link.c +1 -0
  9. data/ext/digest/kangarootwelve/keccak/armv6m/KeccakP-1600-SnP.h +36 -0
  10. data/ext/digest/kangarootwelve/{KeccakP-1600-times2-SnP.h → keccak/armv6m/KeccakP-1600-times2-SnP.h} +10 -10
  11. data/ext/digest/kangarootwelve/{KeccakP-1600-times2-on1.c → keccak/armv6m/KeccakP-1600-times2-on1.c} +13 -7
  12. data/ext/digest/kangarootwelve/{KeccakP-1600-times4-SnP.h → keccak/armv6m/KeccakP-1600-times4-SnP.h} +10 -10
  13. data/ext/digest/kangarootwelve/{KeccakP-1600-times4-on1.c → keccak/armv6m/KeccakP-1600-times4-on1.c} +13 -7
  14. data/ext/digest/kangarootwelve/{KeccakP-1600-times8-SnP.h → keccak/armv6m/KeccakP-1600-times8-SnP.h} +10 -10
  15. data/ext/digest/kangarootwelve/{KeccakP-1600-times8-on1.c → keccak/armv6m/KeccakP-1600-times8-on1.c} +13 -7
  16. data/ext/digest/kangarootwelve/keccak/armv6m/KeccakP-1600-u2-32bi-armv6m-le-gcc.s +1334 -0
  17. data/ext/digest/kangarootwelve/keccak/armv6m/KeccakSpongeWidth1600.link.c +1 -0
  18. data/ext/digest/kangarootwelve/{PlSnP-Fallback.inc → keccak/armv6m/PlSnP-Fallback.inc} +11 -7
  19. data/ext/digest/kangarootwelve/keccak/armv6m/ext.link.c +1 -0
  20. data/ext/digest/kangarootwelve/keccak/armv7a/KangarooTwelve.link.c +1 -0
  21. data/ext/digest/kangarootwelve/keccak/armv7a/KeccakDuplexWidth1600.link.c +1 -0
  22. data/ext/digest/kangarootwelve/keccak/armv7a/KeccakP-1600-SnP.h +37 -0
  23. data/ext/digest/kangarootwelve/keccak/armv7a/KeccakP-1600-armv7a-le-neon-gcc.s +826 -0
  24. data/ext/digest/kangarootwelve/keccak/armv7a/KeccakP-1600-inplace-pl2-armv7a-neon-le-gcc.s +1245 -0
  25. data/ext/digest/kangarootwelve/keccak/armv7a/KeccakP-1600-times2-SnP.h +38 -0
  26. data/ext/digest/kangarootwelve/keccak/armv7a/KeccakP-1600-times4-SnP.h +45 -0
  27. data/ext/digest/kangarootwelve/keccak/armv7a/KeccakP-1600-times4-on2.c +38 -0
  28. data/ext/digest/kangarootwelve/keccak/armv7a/KeccakP-1600-times8-SnP.h +45 -0
  29. data/ext/digest/kangarootwelve/keccak/armv7a/KeccakP-1600-times8-on2.c +38 -0
  30. data/ext/digest/kangarootwelve/keccak/armv7a/KeccakSpongeWidth1600.link.c +1 -0
  31. data/ext/digest/kangarootwelve/keccak/armv7a/PlSnP-Fallback.inc +287 -0
  32. data/ext/digest/kangarootwelve/keccak/armv7a/ext.link.c +1 -0
  33. data/ext/digest/kangarootwelve/keccak/armv7m/KangarooTwelve.link.c +1 -0
  34. data/ext/digest/kangarootwelve/keccak/armv7m/KeccakDuplexWidth1600.link.c +1 -0
  35. data/ext/digest/kangarootwelve/keccak/armv7m/KeccakP-1600-SnP.h +36 -0
  36. data/ext/digest/kangarootwelve/keccak/armv7m/KeccakP-1600-inplace-32bi-armv7m-le-gcc.s +1170 -0
  37. data/ext/digest/kangarootwelve/keccak/armv7m/KeccakP-1600-times2-SnP.h +45 -0
  38. data/ext/digest/kangarootwelve/keccak/armv7m/KeccakP-1600-times2-on1.c +37 -0
  39. data/ext/digest/kangarootwelve/keccak/armv7m/KeccakP-1600-times4-SnP.h +45 -0
  40. data/ext/digest/kangarootwelve/keccak/armv7m/KeccakP-1600-times4-on1.c +37 -0
  41. data/ext/digest/kangarootwelve/keccak/armv7m/KeccakP-1600-times8-SnP.h +45 -0
  42. data/ext/digest/kangarootwelve/keccak/armv7m/KeccakP-1600-times8-on1.c +37 -0
  43. data/ext/digest/kangarootwelve/keccak/armv7m/KeccakSpongeWidth1600.link.c +1 -0
  44. data/ext/digest/kangarootwelve/keccak/armv7m/PlSnP-Fallback.inc +287 -0
  45. data/ext/digest/kangarootwelve/keccak/armv7m/ext.link.c +1 -0
  46. data/ext/digest/kangarootwelve/keccak/armv8a/KangarooTwelve.link.c +1 -0
  47. data/ext/digest/kangarootwelve/keccak/armv8a/KeccakDuplexWidth1600.link.c +1 -0
  48. data/ext/digest/kangarootwelve/keccak/armv8a/KeccakP-1600-SnP.h +28 -0
  49. data/ext/digest/kangarootwelve/keccak/armv8a/KeccakP-1600-armv8a-neon.s +537 -0
  50. data/ext/digest/kangarootwelve/keccak/armv8a/KeccakP-1600-times2-SnP.h +45 -0
  51. data/ext/digest/kangarootwelve/keccak/armv8a/KeccakP-1600-times2-on1.c +37 -0
  52. data/ext/digest/kangarootwelve/keccak/armv8a/KeccakP-1600-times4-SnP.h +45 -0
  53. data/ext/digest/kangarootwelve/keccak/armv8a/KeccakP-1600-times4-on1.c +37 -0
  54. data/ext/digest/kangarootwelve/keccak/armv8a/KeccakP-1600-times8-SnP.h +45 -0
  55. data/ext/digest/kangarootwelve/keccak/armv8a/KeccakP-1600-times8-on1.c +37 -0
  56. data/ext/digest/kangarootwelve/keccak/armv8a/KeccakSpongeWidth1600.link.c +1 -0
  57. data/ext/digest/kangarootwelve/keccak/armv8a/PlSnP-Fallback.inc +287 -0
  58. data/ext/digest/kangarootwelve/keccak/armv8a/ext.link.c +1 -0
  59. data/ext/digest/kangarootwelve/keccak/asmx86-64/KangarooTwelve.link.c +1 -0
  60. data/ext/digest/kangarootwelve/keccak/asmx86-64/KeccakDuplexWidth1600.link.c +1 -0
  61. data/ext/digest/kangarootwelve/keccak/asmx86-64/KeccakP-1600-SnP.h +37 -0
  62. data/ext/digest/kangarootwelve/keccak/asmx86-64/KeccakP-1600-times2-SnP.h +45 -0
  63. data/ext/digest/kangarootwelve/keccak/asmx86-64/KeccakP-1600-times2-on1.c +37 -0
  64. data/ext/digest/kangarootwelve/keccak/asmx86-64/KeccakP-1600-times4-SnP.h +45 -0
  65. data/ext/digest/kangarootwelve/keccak/asmx86-64/KeccakP-1600-times4-on1.c +37 -0
  66. data/ext/digest/kangarootwelve/keccak/asmx86-64/KeccakP-1600-times8-SnP.h +45 -0
  67. data/ext/digest/kangarootwelve/keccak/asmx86-64/KeccakP-1600-times8-on1.c +37 -0
  68. data/ext/digest/kangarootwelve/keccak/asmx86-64/KeccakP-1600-x86-64-gas.s +1190 -0
  69. data/ext/digest/kangarootwelve/keccak/asmx86-64/KeccakSpongeWidth1600.link.c +1 -0
  70. data/ext/digest/kangarootwelve/keccak/asmx86-64/PlSnP-Fallback.inc +287 -0
  71. data/ext/digest/kangarootwelve/keccak/asmx86-64/ext.link.c +1 -0
  72. data/ext/digest/kangarootwelve/keccak/asmx86-64shld/KangarooTwelve.link.c +1 -0
  73. data/ext/digest/kangarootwelve/keccak/asmx86-64shld/KeccakDuplexWidth1600.link.c +1 -0
  74. data/ext/digest/kangarootwelve/keccak/asmx86-64shld/KeccakP-1600-SnP.h +37 -0
  75. data/ext/digest/kangarootwelve/keccak/asmx86-64shld/KeccakP-1600-times2-SnP.h +45 -0
  76. data/ext/digest/kangarootwelve/keccak/asmx86-64shld/KeccakP-1600-times2-on1.c +37 -0
  77. data/ext/digest/kangarootwelve/keccak/asmx86-64shld/KeccakP-1600-times4-SnP.h +45 -0
  78. data/ext/digest/kangarootwelve/keccak/asmx86-64shld/KeccakP-1600-times4-on1.c +37 -0
  79. data/ext/digest/kangarootwelve/keccak/asmx86-64shld/KeccakP-1600-times8-SnP.h +45 -0
  80. data/ext/digest/kangarootwelve/keccak/asmx86-64shld/KeccakP-1600-times8-on1.c +37 -0
  81. data/ext/digest/kangarootwelve/keccak/asmx86-64shld/KeccakP-1600-x86-64-shld-gas.s +1190 -0
  82. data/ext/digest/kangarootwelve/keccak/asmx86-64shld/KeccakSpongeWidth1600.link.c +1 -0
  83. data/ext/digest/kangarootwelve/keccak/asmx86-64shld/PlSnP-Fallback.inc +287 -0
  84. data/ext/digest/kangarootwelve/keccak/asmx86-64shld/ext.link.c +1 -0
  85. data/ext/digest/kangarootwelve/keccak/avr8/KangarooTwelve.link.c +1 -0
  86. data/ext/digest/kangarootwelve/keccak/avr8/KeccakDuplexWidth1600.link.c +1 -0
  87. data/ext/digest/kangarootwelve/keccak/avr8/KeccakP-1600-SnP.h +37 -0
  88. data/ext/digest/kangarootwelve/keccak/avr8/KeccakP-1600-avr8-fast.s +1116 -0
  89. data/ext/digest/kangarootwelve/keccak/avr8/KeccakP-1600-times2-SnP.h +45 -0
  90. data/ext/digest/kangarootwelve/keccak/avr8/KeccakP-1600-times2-on1.c +37 -0
  91. data/ext/digest/kangarootwelve/keccak/avr8/KeccakP-1600-times4-SnP.h +45 -0
  92. data/ext/digest/kangarootwelve/keccak/avr8/KeccakP-1600-times4-on1.c +37 -0
  93. data/ext/digest/kangarootwelve/keccak/avr8/KeccakP-1600-times8-SnP.h +45 -0
  94. data/ext/digest/kangarootwelve/keccak/avr8/KeccakP-1600-times8-on1.c +37 -0
  95. data/ext/digest/kangarootwelve/keccak/avr8/KeccakSpongeWidth1600.link.c +1 -0
  96. data/ext/digest/kangarootwelve/keccak/avr8/PlSnP-Fallback.inc +287 -0
  97. data/ext/digest/kangarootwelve/keccak/avr8/ext.link.c +1 -0
  98. data/ext/digest/kangarootwelve/keccak/bulldozer/KangarooTwelve.link.c +1 -0
  99. data/ext/digest/kangarootwelve/keccak/bulldozer/KeccakDuplexWidth1600.link.c +1 -0
  100. data/ext/digest/kangarootwelve/keccak/bulldozer/KeccakP-1600-SnP.h +39 -0
  101. data/ext/digest/kangarootwelve/keccak/bulldozer/KeccakP-1600-XOP-config.h +6 -0
  102. data/ext/digest/kangarootwelve/keccak/bulldozer/KeccakP-1600-XOP.c +473 -0
  103. data/ext/digest/kangarootwelve/keccak/bulldozer/KeccakP-1600-times2-SIMD128.c +954 -0
  104. data/ext/digest/kangarootwelve/keccak/bulldozer/KeccakP-1600-times2-SnP.h +47 -0
  105. data/ext/digest/kangarootwelve/keccak/bulldozer/KeccakP-1600-times4-SnP.h +45 -0
  106. data/ext/digest/kangarootwelve/keccak/bulldozer/KeccakP-1600-times4-on2.c +38 -0
  107. data/ext/digest/kangarootwelve/keccak/bulldozer/KeccakP-1600-times8-SnP.h +45 -0
  108. data/ext/digest/kangarootwelve/keccak/bulldozer/KeccakP-1600-times8-on2.c +38 -0
  109. data/ext/digest/kangarootwelve/keccak/bulldozer/KeccakP-1600-unrolling.macros +302 -0
  110. data/ext/digest/kangarootwelve/keccak/bulldozer/KeccakSpongeWidth1600.link.c +1 -0
  111. data/ext/digest/kangarootwelve/keccak/bulldozer/PlSnP-Fallback.inc +287 -0
  112. data/ext/digest/kangarootwelve/keccak/bulldozer/SIMD128-config.h +9 -0
  113. data/ext/digest/kangarootwelve/{SnP-Relaned.h → keccak/bulldozer/SnP-Relaned.h} +13 -7
  114. data/ext/digest/kangarootwelve/keccak/bulldozer/ext.link.c +1 -0
  115. data/ext/digest/kangarootwelve/{KangarooTwelve.c → keccak/common/KangarooTwelve.c} +6 -10
  116. data/ext/digest/kangarootwelve/{KangarooTwelve.h → keccak/common/KangarooTwelve.h} +3 -7
  117. data/ext/digest/kangarootwelve/keccak/common/KeccakDuplex-common.h +37 -0
  118. data/ext/digest/kangarootwelve/keccak/common/KeccakDuplex.inc +192 -0
  119. data/ext/digest/kangarootwelve/keccak/common/KeccakDuplexWidth1600.c +34 -0
  120. data/ext/digest/kangarootwelve/keccak/common/KeccakDuplexWidth1600.h +25 -0
  121. data/ext/digest/kangarootwelve/{KeccakSponge-common.h → keccak/common/KeccakSponge-common.h} +5 -7
  122. data/ext/digest/kangarootwelve/{KeccakSponge.inc → keccak/common/KeccakSponge.inc} +6 -8
  123. data/ext/digest/kangarootwelve/{KeccakSpongeWidth1600.c → keccak/common/KeccakSpongeWidth1600.c} +6 -8
  124. data/ext/digest/kangarootwelve/{KeccakSpongeWidth1600.h → keccak/common/KeccakSpongeWidth1600.h} +5 -7
  125. data/ext/digest/kangarootwelve/{Phases.h → keccak/common/Phases.h} +3 -7
  126. data/ext/digest/kangarootwelve/{align.h → keccak/common/align.h} +5 -7
  127. data/ext/digest/kangarootwelve/{brg_endian.h → keccak/common/brg_endian.h} +0 -0
  128. data/ext/digest/kangarootwelve/keccak/compact/KangarooTwelve.link.c +1 -0
  129. data/ext/digest/kangarootwelve/keccak/compact/KeccakDuplexWidth1600.link.c +1 -0
  130. data/ext/digest/kangarootwelve/{KeccakP-1600-SnP.h → keccak/compact/KeccakP-1600-SnP.h} +7 -10
  131. data/ext/digest/kangarootwelve/{KeccakP-1600-compact64.c → keccak/compact/KeccakP-1600-compact64.c} +11 -7
  132. data/ext/digest/kangarootwelve/keccak/compact/KeccakP-1600-times2-SnP.h +45 -0
  133. data/ext/digest/kangarootwelve/keccak/compact/KeccakP-1600-times2-on1.c +37 -0
  134. data/ext/digest/kangarootwelve/keccak/compact/KeccakP-1600-times4-SnP.h +45 -0
  135. data/ext/digest/kangarootwelve/keccak/compact/KeccakP-1600-times4-on1.c +37 -0
  136. data/ext/digest/kangarootwelve/keccak/compact/KeccakP-1600-times8-SnP.h +45 -0
  137. data/ext/digest/kangarootwelve/keccak/compact/KeccakP-1600-times8-on1.c +37 -0
  138. data/ext/digest/kangarootwelve/keccak/compact/KeccakSpongeWidth1600.link.c +1 -0
  139. data/ext/digest/kangarootwelve/keccak/compact/PlSnP-Fallback.inc +287 -0
  140. data/ext/digest/kangarootwelve/keccak/compact/SnP-Relaned.h +140 -0
  141. data/ext/digest/kangarootwelve/keccak/compact/ext.link.c +1 -0
  142. data/ext/digest/kangarootwelve/keccak/generic32/KangarooTwelve.link.c +1 -0
  143. data/ext/digest/kangarootwelve/keccak/generic32/KeccakDuplexWidth1600.link.c +1 -0
  144. data/ext/digest/kangarootwelve/keccak/generic32/KeccakP-1600-SnP.h +38 -0
  145. data/ext/digest/kangarootwelve/keccak/generic32/KeccakP-1600-inplace32BI.c +1162 -0
  146. data/ext/digest/kangarootwelve/keccak/generic32/KeccakP-1600-times2-SnP.h +45 -0
  147. data/ext/digest/kangarootwelve/keccak/generic32/KeccakP-1600-times2-on1.c +37 -0
  148. data/ext/digest/kangarootwelve/keccak/generic32/KeccakP-1600-times4-SnP.h +45 -0
  149. data/ext/digest/kangarootwelve/keccak/generic32/KeccakP-1600-times4-on1.c +37 -0
  150. data/ext/digest/kangarootwelve/keccak/generic32/KeccakP-1600-times8-SnP.h +45 -0
  151. data/ext/digest/kangarootwelve/keccak/generic32/KeccakP-1600-times8-on1.c +37 -0
  152. data/ext/digest/kangarootwelve/keccak/generic32/KeccakSpongeWidth1600.link.c +1 -0
  153. data/ext/digest/kangarootwelve/keccak/generic32/PlSnP-Fallback.inc +287 -0
  154. data/ext/digest/kangarootwelve/keccak/generic32/SnP-Relaned.h +140 -0
  155. data/ext/digest/kangarootwelve/keccak/generic32/ext.link.c +1 -0
  156. data/ext/digest/kangarootwelve/keccak/generic32lc/KangarooTwelve.link.c +1 -0
  157. data/ext/digest/kangarootwelve/keccak/generic32lc/KeccakDuplexWidth1600.link.c +1 -0
  158. data/ext/digest/kangarootwelve/keccak/generic32lc/KeccakP-1600-SnP.h +38 -0
  159. data/ext/digest/kangarootwelve/keccak/generic32lc/KeccakP-1600-inplace32BI.c +1162 -0
  160. data/ext/digest/kangarootwelve/keccak/generic32lc/KeccakP-1600-times2-SnP.h +45 -0
  161. data/ext/digest/kangarootwelve/keccak/generic32lc/KeccakP-1600-times2-on1.c +37 -0
  162. data/ext/digest/kangarootwelve/keccak/generic32lc/KeccakP-1600-times4-SnP.h +45 -0
  163. data/ext/digest/kangarootwelve/keccak/generic32lc/KeccakP-1600-times4-on1.c +37 -0
  164. data/ext/digest/kangarootwelve/keccak/generic32lc/KeccakP-1600-times8-SnP.h +45 -0
  165. data/ext/digest/kangarootwelve/keccak/generic32lc/KeccakP-1600-times8-on1.c +37 -0
  166. data/ext/digest/kangarootwelve/keccak/generic32lc/KeccakSpongeWidth1600.link.c +1 -0
  167. data/ext/digest/kangarootwelve/keccak/generic32lc/PlSnP-Fallback.inc +287 -0
  168. data/ext/digest/kangarootwelve/keccak/generic32lc/SnP-Relaned.h +140 -0
  169. data/ext/digest/kangarootwelve/keccak/generic32lc/ext.link.c +1 -0
  170. data/ext/digest/kangarootwelve/keccak/generic64/KangarooTwelve.link.c +1 -0
  171. data/ext/digest/kangarootwelve/keccak/generic64/KeccakDuplexWidth1600.link.c +1 -0
  172. data/ext/digest/kangarootwelve/keccak/generic64/KeccakP-1600-64.macros +2195 -0
  173. data/ext/digest/kangarootwelve/keccak/generic64/KeccakP-1600-SnP.h +49 -0
  174. data/ext/digest/kangarootwelve/keccak/generic64/KeccakP-1600-opt64-config.h +6 -0
  175. data/ext/digest/kangarootwelve/keccak/generic64/KeccakP-1600-opt64.c +541 -0
  176. data/ext/digest/kangarootwelve/keccak/generic64/KeccakP-1600-times2-SnP.h +45 -0
  177. data/ext/digest/kangarootwelve/keccak/generic64/KeccakP-1600-times2-on1.c +37 -0
  178. data/ext/digest/kangarootwelve/keccak/generic64/KeccakP-1600-times4-SnP.h +45 -0
  179. data/ext/digest/kangarootwelve/keccak/generic64/KeccakP-1600-times4-on1.c +37 -0
  180. data/ext/digest/kangarootwelve/keccak/generic64/KeccakP-1600-times8-SnP.h +45 -0
  181. data/ext/digest/kangarootwelve/keccak/generic64/KeccakP-1600-times8-on1.c +37 -0
  182. data/ext/digest/kangarootwelve/keccak/generic64/KeccakP-1600-unrolling.macros +302 -0
  183. data/ext/digest/kangarootwelve/keccak/generic64/KeccakSpongeWidth1600.link.c +1 -0
  184. data/ext/digest/kangarootwelve/keccak/generic64/PlSnP-Fallback.inc +287 -0
  185. data/ext/digest/kangarootwelve/keccak/generic64/SnP-Relaned.h +140 -0
  186. data/ext/digest/kangarootwelve/keccak/generic64/ext.link.c +1 -0
  187. data/ext/digest/kangarootwelve/keccak/generic64lc/KangarooTwelve.link.c +1 -0
  188. data/ext/digest/kangarootwelve/keccak/generic64lc/KeccakDuplexWidth1600.link.c +1 -0
  189. data/ext/digest/kangarootwelve/keccak/generic64lc/KeccakP-1600-64.macros +2195 -0
  190. data/ext/digest/kangarootwelve/keccak/generic64lc/KeccakP-1600-SnP.h +49 -0
  191. data/ext/digest/kangarootwelve/keccak/generic64lc/KeccakP-1600-opt64-config.h +7 -0
  192. data/ext/digest/kangarootwelve/keccak/generic64lc/KeccakP-1600-opt64.c +541 -0
  193. data/ext/digest/kangarootwelve/keccak/generic64lc/KeccakP-1600-times2-SnP.h +45 -0
  194. data/ext/digest/kangarootwelve/keccak/generic64lc/KeccakP-1600-times2-on1.c +37 -0
  195. data/ext/digest/kangarootwelve/keccak/generic64lc/KeccakP-1600-times4-SnP.h +45 -0
  196. data/ext/digest/kangarootwelve/keccak/generic64lc/KeccakP-1600-times4-on1.c +37 -0
  197. data/ext/digest/kangarootwelve/keccak/generic64lc/KeccakP-1600-times8-SnP.h +45 -0
  198. data/ext/digest/kangarootwelve/keccak/generic64lc/KeccakP-1600-times8-on1.c +37 -0
  199. data/ext/digest/kangarootwelve/keccak/generic64lc/KeccakP-1600-unrolling.macros +302 -0
  200. data/ext/digest/kangarootwelve/keccak/generic64lc/KeccakSpongeWidth1600.link.c +1 -0
  201. data/ext/digest/kangarootwelve/keccak/generic64lc/PlSnP-Fallback.inc +287 -0
  202. data/ext/digest/kangarootwelve/keccak/generic64lc/SnP-Relaned.h +140 -0
  203. data/ext/digest/kangarootwelve/keccak/generic64lc/ext.link.c +1 -0
  204. data/ext/digest/kangarootwelve/keccak/haswell/KangarooTwelve.link.c +1 -0
  205. data/ext/digest/kangarootwelve/keccak/haswell/KeccakDuplexWidth1600.link.c +1 -0
  206. data/ext/digest/kangarootwelve/keccak/haswell/KeccakP-1600-AVX2.s +993 -0
  207. data/ext/digest/kangarootwelve/keccak/haswell/KeccakP-1600-SnP.h +41 -0
  208. data/ext/digest/kangarootwelve/keccak/haswell/KeccakP-1600-times2-SIMD128.c +954 -0
  209. data/ext/digest/kangarootwelve/keccak/haswell/KeccakP-1600-times2-SnP.h +47 -0
  210. data/ext/digest/kangarootwelve/keccak/haswell/KeccakP-1600-times4-SIMD256.c +1303 -0
  211. data/ext/digest/kangarootwelve/keccak/haswell/KeccakP-1600-times4-SnP.h +53 -0
  212. data/ext/digest/kangarootwelve/keccak/haswell/KeccakP-1600-times8-SnP.h +45 -0
  213. data/ext/digest/kangarootwelve/keccak/haswell/KeccakP-1600-times8-on4.c +38 -0
  214. data/ext/digest/kangarootwelve/keccak/haswell/KeccakP-1600-unrolling.macros +302 -0
  215. data/ext/digest/kangarootwelve/keccak/haswell/KeccakSpongeWidth1600.link.c +1 -0
  216. data/ext/digest/kangarootwelve/keccak/haswell/PlSnP-Fallback.inc +287 -0
  217. data/ext/digest/kangarootwelve/keccak/haswell/SIMD128-config.h +8 -0
  218. data/ext/digest/kangarootwelve/keccak/haswell/SIMD256-config.h +7 -0
  219. data/ext/digest/kangarootwelve/keccak/haswell/ext.link.c +1 -0
  220. data/ext/digest/kangarootwelve/keccak/nehalem/KangarooTwelve.link.c +1 -0
  221. data/ext/digest/kangarootwelve/keccak/nehalem/KeccakDuplexWidth1600.link.c +1 -0
  222. data/ext/digest/kangarootwelve/keccak/nehalem/KeccakP-1600-64.macros +2195 -0
  223. data/ext/digest/kangarootwelve/keccak/nehalem/KeccakP-1600-SnP.h +49 -0
  224. data/ext/digest/kangarootwelve/keccak/nehalem/KeccakP-1600-opt64-config.h +7 -0
  225. data/ext/digest/kangarootwelve/keccak/nehalem/KeccakP-1600-opt64.c +541 -0
  226. data/ext/digest/kangarootwelve/keccak/nehalem/KeccakP-1600-times2-SIMD128.c +954 -0
  227. data/ext/digest/kangarootwelve/keccak/nehalem/KeccakP-1600-times2-SnP.h +47 -0
  228. data/ext/digest/kangarootwelve/keccak/nehalem/KeccakP-1600-times4-SnP.h +45 -0
  229. data/ext/digest/kangarootwelve/keccak/nehalem/KeccakP-1600-times4-on2.c +38 -0
  230. data/ext/digest/kangarootwelve/keccak/nehalem/KeccakP-1600-times8-SnP.h +45 -0
  231. data/ext/digest/kangarootwelve/keccak/nehalem/KeccakP-1600-times8-on2.c +38 -0
  232. data/ext/digest/kangarootwelve/keccak/nehalem/KeccakP-1600-unrolling.macros +302 -0
  233. data/ext/digest/kangarootwelve/keccak/nehalem/KeccakSpongeWidth1600.link.c +1 -0
  234. data/ext/digest/kangarootwelve/keccak/nehalem/PlSnP-Fallback.inc +287 -0
  235. data/ext/digest/kangarootwelve/keccak/nehalem/SIMD128-config.h +8 -0
  236. data/ext/digest/kangarootwelve/keccak/nehalem/SnP-Relaned.h +140 -0
  237. data/ext/digest/kangarootwelve/keccak/nehalem/ext.link.c +1 -0
  238. data/ext/digest/kangarootwelve/keccak/reference/KangarooTwelve.link.c +1 -0
  239. data/ext/digest/kangarootwelve/keccak/reference/KeccakDuplexWidth1600.link.c +1 -0
  240. data/ext/digest/kangarootwelve/keccak/reference/KeccakP-1600-SnP.h +41 -0
  241. data/ext/digest/kangarootwelve/keccak/reference/KeccakP-1600-reference.c +424 -0
  242. data/ext/digest/kangarootwelve/keccak/reference/KeccakP-1600-reference.h +20 -0
  243. data/ext/digest/kangarootwelve/keccak/reference/KeccakP-1600-times2-SnP.h +45 -0
  244. data/ext/digest/kangarootwelve/keccak/reference/KeccakP-1600-times2-on1.c +37 -0
  245. data/ext/digest/kangarootwelve/keccak/reference/KeccakP-1600-times4-SnP.h +45 -0
  246. data/ext/digest/kangarootwelve/keccak/reference/KeccakP-1600-times4-on1.c +37 -0
  247. data/ext/digest/kangarootwelve/keccak/reference/KeccakP-1600-times8-SnP.h +45 -0
  248. data/ext/digest/kangarootwelve/keccak/reference/KeccakP-1600-times8-on1.c +37 -0
  249. data/ext/digest/kangarootwelve/keccak/reference/KeccakSpongeWidth1600.link.c +1 -0
  250. data/ext/digest/kangarootwelve/keccak/reference/PlSnP-Fallback.inc +287 -0
  251. data/ext/digest/kangarootwelve/keccak/reference/displayIntermediateValues.c +176 -0
  252. data/ext/digest/kangarootwelve/keccak/reference/displayIntermediateValues.h +29 -0
  253. data/ext/digest/kangarootwelve/keccak/reference/ext.link.c +1 -0
  254. data/ext/digest/kangarootwelve/keccak/reference32bits/KangarooTwelve.link.c +1 -0
  255. data/ext/digest/kangarootwelve/keccak/reference32bits/KeccakDuplexWidth1600.link.c +1 -0
  256. data/ext/digest/kangarootwelve/keccak/reference32bits/KeccakP-1600-SnP.h +41 -0
  257. data/ext/digest/kangarootwelve/keccak/reference32bits/KeccakP-1600-reference.h +20 -0
  258. data/ext/digest/kangarootwelve/keccak/reference32bits/KeccakP-1600-reference32BI.c +612 -0
  259. data/ext/digest/kangarootwelve/keccak/reference32bits/KeccakP-1600-times2-SnP.h +45 -0
  260. data/ext/digest/kangarootwelve/keccak/reference32bits/KeccakP-1600-times2-on1.c +37 -0
  261. data/ext/digest/kangarootwelve/keccak/reference32bits/KeccakP-1600-times4-SnP.h +45 -0
  262. data/ext/digest/kangarootwelve/keccak/reference32bits/KeccakP-1600-times4-on1.c +37 -0
  263. data/ext/digest/kangarootwelve/keccak/reference32bits/KeccakP-1600-times8-SnP.h +45 -0
  264. data/ext/digest/kangarootwelve/keccak/reference32bits/KeccakP-1600-times8-on1.c +37 -0
  265. data/ext/digest/kangarootwelve/keccak/reference32bits/KeccakSpongeWidth1600.link.c +1 -0
  266. data/ext/digest/kangarootwelve/keccak/reference32bits/PlSnP-Fallback.inc +287 -0
  267. data/ext/digest/kangarootwelve/keccak/reference32bits/displayIntermediateValues.c +176 -0
  268. data/ext/digest/kangarootwelve/keccak/reference32bits/displayIntermediateValues.h +29 -0
  269. data/ext/digest/kangarootwelve/keccak/reference32bits/ext.link.c +1 -0
  270. data/ext/digest/kangarootwelve/keccak/sandybridge/KangarooTwelve.link.c +1 -0
  271. data/ext/digest/kangarootwelve/keccak/sandybridge/KeccakDuplexWidth1600.link.c +1 -0
  272. data/ext/digest/kangarootwelve/keccak/sandybridge/KeccakP-1600-64.macros +2195 -0
  273. data/ext/digest/kangarootwelve/keccak/sandybridge/KeccakP-1600-SnP.h +49 -0
  274. data/ext/digest/kangarootwelve/keccak/sandybridge/KeccakP-1600-opt64-config.h +8 -0
  275. data/ext/digest/kangarootwelve/keccak/sandybridge/KeccakP-1600-opt64.c +541 -0
  276. data/ext/digest/kangarootwelve/keccak/sandybridge/KeccakP-1600-times2-SIMD128.c +954 -0
  277. data/ext/digest/kangarootwelve/keccak/sandybridge/KeccakP-1600-times2-SnP.h +47 -0
  278. data/ext/digest/kangarootwelve/keccak/sandybridge/KeccakP-1600-times4-SnP.h +45 -0
  279. data/ext/digest/kangarootwelve/keccak/sandybridge/KeccakP-1600-times4-on2.c +38 -0
  280. data/ext/digest/kangarootwelve/keccak/sandybridge/KeccakP-1600-times8-SnP.h +45 -0
  281. data/ext/digest/kangarootwelve/keccak/sandybridge/KeccakP-1600-times8-on2.c +38 -0
  282. data/ext/digest/kangarootwelve/keccak/sandybridge/KeccakP-1600-unrolling.macros +302 -0
  283. data/ext/digest/kangarootwelve/keccak/sandybridge/KeccakSpongeWidth1600.link.c +1 -0
  284. data/ext/digest/kangarootwelve/keccak/sandybridge/PlSnP-Fallback.inc +287 -0
  285. data/ext/digest/kangarootwelve/keccak/sandybridge/SIMD128-config.h +8 -0
  286. data/ext/digest/kangarootwelve/keccak/sandybridge/SnP-Relaned.h +140 -0
  287. data/ext/digest/kangarootwelve/keccak/sandybridge/ext.link.c +1 -0
  288. data/ext/digest/kangarootwelve/keccak/skylakex/KangarooTwelve.link.c +1 -0
  289. data/ext/digest/kangarootwelve/keccak/skylakex/KeccakDuplexWidth1600.link.c +1 -0
  290. data/ext/digest/kangarootwelve/keccak/skylakex/KeccakP-1600-AVX512-config.h +6 -0
  291. data/ext/digest/kangarootwelve/keccak/skylakex/KeccakP-1600-AVX512.c +621 -0
  292. data/ext/digest/kangarootwelve/keccak/skylakex/KeccakP-1600-SnP.h +42 -0
  293. data/ext/digest/kangarootwelve/keccak/skylakex/KeccakP-1600-times2-SIMD512.c +852 -0
  294. data/ext/digest/kangarootwelve/keccak/skylakex/KeccakP-1600-times2-SnP.h +49 -0
  295. data/ext/digest/kangarootwelve/keccak/skylakex/KeccakP-1600-times4-SIMD512.c +883 -0
  296. data/ext/digest/kangarootwelve/keccak/skylakex/KeccakP-1600-times4-SnP.h +49 -0
  297. data/ext/digest/kangarootwelve/keccak/skylakex/KeccakP-1600-times8-SIMD512.c +1473 -0
  298. data/ext/digest/kangarootwelve/keccak/skylakex/KeccakP-1600-times8-SnP.h +53 -0
  299. data/ext/digest/kangarootwelve/keccak/skylakex/KeccakSpongeWidth1600.link.c +1 -0
  300. data/ext/digest/kangarootwelve/keccak/skylakex/SIMD512-2-config.h +7 -0
  301. data/ext/digest/kangarootwelve/keccak/skylakex/SIMD512-4-config.h +7 -0
  302. data/ext/digest/kangarootwelve/keccak/skylakex/SIMD512-config.h +7 -0
  303. data/ext/digest/kangarootwelve/keccak/skylakex/ext.link.c +1 -0
  304. data/ext/digest/kangarootwelve/utils.h +101 -0
  305. data/lib/digest/kangarootwelve/version.rb +2 -2
  306. data/test/test.rb +68 -31
  307. metadata +305 -27
@@ -0,0 +1,49 @@
1
+ /*
2
+ Implementation by Ronny Van Keer, hereby denoted as "the implementer".
3
+
4
+ For more information, feedback or questions, please refer to our website:
5
+ https://keccak.team/
6
+
7
+ To the extent possible under law, the implementer has waived all copyright
8
+ and related or neighboring rights to the source code in this file.
9
+ http://creativecommons.org/publicdomain/zero/1.0/
10
+
11
+ ---
12
+
13
+ Please refer to PlSnP-documentation.h for more details.
14
+ */
15
+
16
+ #ifndef _KeccakP_1600_times4_SnP_h_
17
+ #define _KeccakP_1600_times4_SnP_h_ /* NOTE(review): leading underscore + uppercase is a reserved identifier in C; kept as-is to match the other SnP headers. */
18
+ /* Interface of the 4-way parallel Keccak-p[1600] implementation; function semantics are specified in PlSnP-documentation.h. */
19
+ #include "SIMD512-4-config.h"
20
+
21
+ #define KeccakP1600times4_implementation "512-bit SIMD implementation (" KeccakP1600times4_implementation_config ")" /* human-readable name; the _config suffix string is expected from SIMD512-4-config.h (not visible here) */
22
+ #define KeccakP1600times4_statesSizeInBytes 800 /* 800 = 4 x 200 bytes, i.e. four 1600-bit states */
23
+ #define KeccakP1600times4_statesAlignment 64 /* presumably the required byte alignment of the states buffer (64 bytes = 512 bits) -- confirm in PlSnP-documentation.h */
24
+ #define KeccakF1600times4_FastLoop_supported /* advertises KeccakF1600times4_FastLoop_Absorb(), declared below */
25
+ #define KeccakP1600times4_12rounds_FastLoop_supported /* advertises KeccakP1600times4_12rounds_FastLoop_Absorb(), declared below */
26
+
27
+ #include <stddef.h>
28
+ /* <stddef.h> supplies size_t, used by the FastLoop_Absorb prototypes. */
29
+ #define KeccakP1600times4_StaticInitialize() /* expands to nothing: no static initialization step in this implementation */
30
+ void KeccakP1600times4_InitializeAll(void *states);
31
+ #define KeccakP1600times4_AddByte(states, instanceIndex, byte, offset) \
32
+ ((unsigned char*)(states))[(instanceIndex)*8 + ((offset)/8)*4*8 + (offset)%8] ^= (byte) /* XORs `byte` into byte `offset` of instance `instanceIndex`. The index shows the layout: 8-byte lanes of the 4 instances are interleaved, lane k of instance i starting at byte (k*4 + i)*8. `offset` is evaluated twice, so pass no side-effecting expressions. */
33
+ void KeccakP1600times4_AddBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length);
34
+ void KeccakP1600times4_AddLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset);
35
+ void KeccakP1600times4_OverwriteBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length);
36
+ void KeccakP1600times4_OverwriteLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset);
37
+ void KeccakP1600times4_OverwriteWithZeroes(void *states, unsigned int instanceIndex, unsigned int byteCount);
38
+ void KeccakP1600times4_PermuteAll_4rounds(void *states); /* the PermuteAll_* variants differ by the round count in their name */
39
+ void KeccakP1600times4_PermuteAll_6rounds(void *states);
40
+ void KeccakP1600times4_PermuteAll_12rounds(void *states);
41
+ void KeccakP1600times4_PermuteAll_24rounds(void *states);
42
+ void KeccakP1600times4_ExtractBytes(const void *states, unsigned int instanceIndex, unsigned char *data, unsigned int offset, unsigned int length);
43
+ void KeccakP1600times4_ExtractLanesAll(const void *states, unsigned char *data, unsigned int laneCount, unsigned int laneOffset);
44
+ void KeccakP1600times4_ExtractAndAddBytes(const void *states, unsigned int instanceIndex, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length);
45
+ void KeccakP1600times4_ExtractAndAddLanesAll(const void *states, const unsigned char *input, unsigned char *output, unsigned int laneCount, unsigned int laneOffset);
46
+ size_t KeccakF1600times4_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, const unsigned char *data, size_t dataByteLen); /* returns a size_t; presumably the number of bytes consumed -- confirm in PlSnP-documentation.h */
47
+ size_t KeccakP1600times4_12rounds_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, const unsigned char *data, size_t dataByteLen);
48
+
49
+ #endif
@@ -0,0 +1,1473 @@
1
+ /*
2
+ Implementation by Ronny Van Keer, hereby denoted as "the implementer".
3
+
4
+ For more information, feedback or questions, please refer to our website:
5
+ https://keccak.team/
6
+
7
+ To the extent possible under law, the implementer has waived all copyright
8
+ and related or neighboring rights to the source code in this file.
9
+ http://creativecommons.org/publicdomain/zero/1.0/
10
+
11
+ ---
12
+
13
+ This file implements Keccak-p[1600]×8 in a PlSnP-compatible way.
14
+ Please refer to PlSnP-documentation.h for more details.
15
+
16
+ This implementation comes with KeccakP-1600-times8-SnP.h in the same folder.
17
+ Please refer to LowLevel.build for the exact list of other files it must be combined with.
18
+ */
19
+
20
+ #include <stdio.h>
21
+ #include <stdlib.h>
22
+ #include <string.h>
23
+ #include <stdint.h>
24
+ #include <assert.h>
25
+ #include <smmintrin.h>
26
+ #include <wmmintrin.h>
27
+ #include <immintrin.h>
28
+ #include <emmintrin.h>
29
+ #include "align.h"
30
+ #include "KeccakP-1600-times8-SnP.h"
31
+ #include "SIMD512-config.h"
32
+
33
+ #include "brg_endian.h"
34
+ #if (PLATFORM_BYTE_ORDER != IS_LITTLE_ENDIAN)
35
+ #error Expecting a little-endian platform
36
+ #endif
37
+
38
+ /*
39
+ ** Uncomment the define hereunder when compiling for a CPU without AVX-512 SIMD.
40
+ #define SIMULATE_AVX512
41
+ */
42
+
43
+ #define VERBOSE 0
44
+
45
+ #if defined(SIMULATE_AVX512)
46
+
47
/* Software stand-in for the 512-bit vector type: eight 64-bit elements,
   element 0 being the least significant one. */
typedef struct { uint64_t x[8]; } __m512i;
51
+
52
+ static __m512i _mm512_and_si512( __m512i a, __m512i b)
53
+ {
54
+ __m512i r;
55
+ unsigned int i;
56
+
57
+ for ( i = 0; i < 8; ++i )
58
+ r.x[i] = a.x[i] & b.x[i];
59
+ return(r);
60
+ }
61
+
62
+ static __m512i _mm512_xor_si512( __m512i a, __m512i b)
63
+ {
64
+ __m512i r;
65
+ unsigned int i;
66
+
67
+ for ( i = 0; i < 8; ++i )
68
+ r.x[i] = a.x[i] ^ b.x[i];
69
+ return(r);
70
+ }
71
+
72
+ static __m512i _mm512_ternarylogic_epi64(__m512i a, __m512i b, __m512i c, int imm)
73
+ {
74
+
75
+ if (imm == 0x96)
76
+ return ( _mm512_xor_si512( _mm512_xor_si512( a, b ), c ) );
77
+ if (imm == 0xD2) {
78
+ __m512i t;
79
+ unsigned int i;
80
+
81
+ for ( i = 0; i < 8; ++i )
82
+ t.x[i] = ~b.x[i] & c.x[i];
83
+ return ( _mm512_xor_si512( a, t ) );
84
+ }
85
+ printf( "_mm512_ternarylogic_epi64( a, b, c, %02X) not implemented!\n", imm );
86
+ exit(1);
87
+
88
+ }
89
+
90
/*
 * Emulation of _mm512_rol_epi64: rotates each 64-bit element of a to the
 * left by offset bits.
 *
 * The rotation count is reduced modulo 64, and a count of zero returns the
 * input unchanged; this avoids the undefined 64-bit shift by 64 that
 * "x >> (64 - offset)" would perform for offset == 0.
 */
static __m512i _mm512_rol_epi64(__m512i a, int offset)
{
    const unsigned int count = (unsigned int)offset & 63u;
    __m512i r;
    unsigned int i;

    if (count == 0)
        return(a);
    for ( i = 0; i < 8; ++i )
        r.x[i] = (a.x[i] << count) | (a.x[i] >> (64u - count));
    return(r);
}
99
+
100
/*
 * Emulation of _mm512_srli_epi64: logical right shift of each 64-bit element
 * of a by offset bits, shifting in zeroes.
 *
 * A count outside 0..63 produces all-zero elements (as the hardware
 * instruction does for counts above 63) instead of executing an undefined
 * C shift.
 */
static __m512i _mm512_srli_epi64(__m512i a, int offset)
{
    const unsigned int count = (unsigned int)offset;
    __m512i r;
    unsigned int i;

    for ( i = 0; i < 8; ++i )
        r.x[i] = (count > 63u) ? 0 : (a.x[i] >> count);
    return(r);
}
109
+
110
+
111
+ static __m512i _mm512_broadcast_f64x4(__m256d a)
112
+ {
113
+ __m512i r;
114
+ unsigned int i;
115
+ uint64_t t[4];
116
+
117
+ _mm256_store_si256( (__m256i*)t, (__m256i)a );
118
+ for ( i = 0; i < 4; ++i )
119
+ r.x[i+4] = r.x[i] = t[i];
120
+ return(r);
121
+ }
122
+
123
+ static __m512i _mm512_set_epi64(uint64_t a, uint64_t b, uint64_t c, uint64_t d, uint64_t e, uint64_t f, uint64_t g, uint64_t h)
124
+ {
125
+ __m512i r;
126
+
127
+ r.x[0] = h;
128
+ r.x[1] = g;
129
+ r.x[2] = f;
130
+ r.x[3] = e;
131
+ r.x[4] = d;
132
+ r.x[5] = c;
133
+ r.x[6] = b;
134
+ r.x[7] = a;
135
+ return(r);
136
+ }
137
+
138
+ static __m512i _mm512_i32gather_epi64(__m256i idx, const void *p, int scale)
139
+ {
140
+ __m512i r;
141
+ unsigned int i;
142
+ uint32_t offset[8];
143
+
144
+ _mm256_store_si256( (__m256i*)offset, idx );
145
+ for ( i = 0; i < 8; ++i )
146
+ r.x[i] = *(const uint64_t*)((const char*)p + offset[i] * scale);
147
+ return(r);
148
+ }
149
+
150
+ static void _mm512_i32scatter_epi64( void *p, __m256i idx, __m512i value, int scale)
151
+ {
152
+ unsigned int i;
153
+ uint32_t offset[8];
154
+
155
+ _mm256_store_si256( (__m256i*)offset, idx );
156
+ for ( i = 0; i < 8; ++i )
157
+ *(uint64_t*)((char*)p + offset[i] * scale) = value.x[i];
158
+ }
159
+
160
+ static __m512i _mm512_permutex2var_epi64(__m512i a, __m512i idx, __m512i b)
161
+ {
162
+ __m512i r;
163
+ unsigned int i;
164
+ for ( i = 0; i < 8; ++i )
165
+ r.x[i] = (idx.x[i] & 8) ? b.x[idx.x[i] & 7] : a.x[idx.x[i] & 7];
166
+ return(r);
167
+ }
168
+
169
/*
 * Emulation of _mm512_maskz_loadu_epi64: element i is loaded from
 * mem_addr + 8*i when bit i of k is set and zeroed otherwise. Memory of
 * masked-out elements is not accessed.
 *
 * Elements are read with memcpy so that mem_addr needs no particular
 * alignment, as expected from the unaligned variant.
 */
static __m512i _mm512_maskz_loadu_epi64(uint8_t k, const void *mem_addr)
{
    __m512i r;
    const unsigned char *bytes = (const unsigned char *)mem_addr;
    unsigned int i;

    for ( i = 0; i < 8; ++i ) {
        if (((k >> i) & 1u) != 0)
            memcpy(&r.x[i], bytes + i * sizeof r.x[i], sizeof r.x[i]);
        else
            r.x[i] = 0;
    }
    return(r);
}
185
+
186
+ #define _mm512_maskz_load_epi64 _mm512_maskz_loadu_epi64
187
+
188
/*
 * Emulation of _mm512_storeu_si512: writes the 64 bytes of a to mem_addr.
 * memcpy is used so that the destination needs no particular alignment.
 */
static void _mm512_storeu_si512(__m512i * mem_addr, __m512i a)
{
    memcpy(mem_addr, a.x, sizeof a.x);
}
196
+
197
+ #define _mm512_store_si512 _mm512_storeu_si512
198
+
199
/*
 * Emulation of _mm512_loadu_si512: reads 64 bytes from mem_addr.
 * memcpy is used so that the source needs no particular alignment.
 */
static __m512i _mm512_loadu_si512(const __m512i * mem_addr)
{
    __m512i r;

    memcpy(r.x, mem_addr, sizeof r.x);
    return(r);
}
209
+
210
+ #define _mm512_load_si512 _mm512_loadu_si512
211
+
212
/*
 * Emulation of _mm512_mask_storeu_epi64: element i of a is written to
 * mem_addr + 8*i when bit i of k is set; memory of masked-out elements is
 * left untouched.
 *
 * Elements are written with memcpy so that mem_addr needs no particular
 * alignment, as expected from the unaligned variant.
 */
static void _mm512_mask_storeu_epi64(void *mem_addr, uint8_t k, __m512i a)
{
    unsigned char *bytes = (unsigned char *)mem_addr;
    unsigned int i;

    for ( i = 0; i < 8; ++i ) {
        if (((k >> i) & 1u) != 0)
            memcpy(bytes + i * sizeof a.x[i], &a.x[i], sizeof a.x[i]);
    }
}
222
+
223
+ #define _mm512_mask_store_epi64 _mm512_mask_storeu_epi64
224
+
225
+ static __m512i _mm512_setzero_si512(void)
226
+ {
227
+ __m512i r;
228
+ unsigned int i;
229
+
230
+ for ( i = 0; i < 8; ++i )
231
+ r.x[i] = 0;
232
+ return(r);
233
+ }
234
+
235
+ static __m256i _mm512_extracti64x4_epi64(__m512i a, int imm8)
236
+ {
237
+ uint64_t buf[8];
238
+ __m256i r;
239
+
240
+ _mm512_storeu_si512((__m512i*)buf, a);
241
+ r = *(__m256i*)&buf[((imm8 == 0) ? 0 : 4)];
242
+ return(r);
243
+ }
244
+
245
+ #endif
246
+
247
+ typedef __m128i V128;
248
+ typedef __m256i V256;
249
+ typedef __m512i V512;
250
+
251
+ #if defined(KeccakP1600times8_useAVX512)
252
+
253
+ #define XOR(a,b) _mm512_xor_si512(a,b)
254
+ #define XOR3(a,b,c) _mm512_ternarylogic_epi64(a,b,c,0x96)
255
+ #define XOR5(a,b,c,d,e) XOR3(XOR3(a,b,c),d,e)
256
+ #define XOReq512(a, b) a = XOR(a,b)
257
+
258
+ #define ROL(a,offset) _mm512_rol_epi64(a,offset)
259
+ #define Chi(a,b,c) _mm512_ternarylogic_epi64(a,b,c,0xD2)
260
+
261
+ #define CONST8_64(a) (V512)_mm512_broadcast_f64x4(_mm256_broadcast_sd((const double*)(&a)))
262
+
263
+ #define LOAD512(a) _mm512_load_si512((const V512 *)&(a))
264
+ #define LOAD512u(a) _mm512_loadu_si512((const V512 *)&(a))
265
+ #define LOAD8_32(a,b,c,d,e,f,g,h) _mm256_set_epi32((uint64_t)(a), (uint32_t)(b), (uint32_t)(c), (uint32_t)(d), (uint32_t)(e), (uint32_t)(f), (uint32_t)(g), (uint32_t)(h))
266
+ #define LOAD8_64(a,b,c,d,e,f,g,h) _mm512_set_epi64((uint64_t)(a), (uint64_t)(b), (uint64_t)(c), (uint64_t)(d), (uint64_t)(e), (uint64_t)(f), (uint64_t)(g), (uint64_t)(h))
267
+ #define LOAD_GATHER8_64(idx,p) _mm512_i32gather_epi64( idx, (const void*)(p), 8)
268
+
269
+ #define STORE_SCATTER8_64(p,idx, v) _mm512_i32scatter_epi64( (void*)(p), idx, v, 8)
270
+
271
+ #endif
272
+
273
/*
 * Debug helpers, compiled in only when VERBOSE > 0; otherwise they expand
 * to nothing.
 *
 * 64-bit values are converted to unsigned long long and printed with %llx,
 * which is correct whatever the underlying type of uint64_t is.
 */
#if (VERBOSE > 0)
/* Prints label followed by the first count 64-bit words of buf in hex. */
#define DumpMem(label, buf, count) { \
    uint32_t i; \
    printf("%s ", label); \
    for (i = 0; i < (count); ++i) { \
        printf("%016llx ", (unsigned long long)(buf)[i]); \
        /*if ((i%5) == 4) printf("\n"); */\
    } \
    printf("\n"); \
    }

/* Prints the eight 64-bit elements of the vector variable named prefix##suffix. */
#define DumpOne(prefix,suffix) { \
    uint64_t buf[8]; \
    _mm512_storeu_si512((V512*)buf, prefix##suffix); \
    printf("%016llx %016llx %016llx %016llx %016llx %016llx %016llx %016llx\n", \
        (unsigned long long)buf[0], (unsigned long long)buf[1], \
        (unsigned long long)buf[2], (unsigned long long)buf[3], \
        (unsigned long long)buf[4], (unsigned long long)buf[5], \
        (unsigned long long)buf[6], (unsigned long long)buf[7]); \
    }

/* Prints label, then all 25 lane variables prefix##ba .. prefix##su. */
#define Dump(label,prefix) { \
    printf("%s\n", label); \
    DumpOne(prefix, ba); \
    DumpOne(prefix, be); \
    DumpOne(prefix, bi); \
    DumpOne(prefix, bo); \
    DumpOne(prefix, bu); \
    DumpOne(prefix, ga); \
    DumpOne(prefix, ge); \
    DumpOne(prefix, gi); \
    DumpOne(prefix, go); \
    DumpOne(prefix, gu); \
    DumpOne(prefix, ka); \
    DumpOne(prefix, ke); \
    DumpOne(prefix, ki); \
    DumpOne(prefix, ko); \
    DumpOne(prefix, ku); \
    DumpOne(prefix, ma); \
    DumpOne(prefix, me); \
    DumpOne(prefix, mi); \
    DumpOne(prefix, mo); \
    DumpOne(prefix, mu); \
    DumpOne(prefix, sa); \
    DumpOne(prefix, se); \
    DumpOne(prefix, si); \
    DumpOne(prefix, so); \
    DumpOne(prefix, su); \
    printf("\n"); \
    }

/* Prints label and one lane variable; braced so that it behaves as a single statement. */
#define DumpReg(label,prefix,suffix) { printf("%s ", label); DumpOne(prefix,suffix) }

#else
#define DumpMem(label, buf, count)
#define DumpOne(prefix,suffix)
#define Dump(label,prefix)
#define DumpReg(label,prefix,suffix)
#endif
329
+
330
+
331
/* Index, in 64-bit words, of lane lanePosition of instance instanceIndex:
   the eight instances of a given lane are stored next to each other.
   Both arguments are parenthesized so that any expression can be passed. */
#define laneIndex(instanceIndex, lanePosition) ((lanePosition)*8 + (instanceIndex))
/* Size of one Keccak-p[1600] lane in bytes. */
#define SnP_laneLengthInBytes 8
333
+
334
+ void KeccakP1600times8_InitializeAll(void *states)
335
+ {
336
+ memset(states, 0, KeccakP1600times8_statesSizeInBytes);
337
+ }
338
+
339
/*
 * XORs length bytes from data into the state of instance instanceIndex,
 * starting at byte position offset of that state.
 *
 * The input is consumed one lane at a time: the first and last pieces may
 * cover only part of a lane. Every piece is copied with memcpy into a zeroed
 * temporary lane before being XORed in, so data needs no particular
 * alignment.
 */
void KeccakP1600times8_AddBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length)
{
    unsigned int sizeLeft = length;
    unsigned int lanePosition = offset/SnP_laneLengthInBytes;
    unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
    const unsigned char *curData = data;
    uint64_t *statesAsLanes = states;

    while (sizeLeft > 0) {
        unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
        uint64_t lane = 0;

        if (bytesInLane > sizeLeft)
            bytesInLane = sizeLeft;
        memcpy((unsigned char*)&lane + offsetInLane, curData, bytesInLane);
        statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane;
        sizeLeft -= bytesInLane;
        curData += bytesInLane;
        lanePosition++;
        offsetInLane = 0;   /* only the first piece can start inside a lane */
    }
}
373
+
374
+ void KeccakP1600times8_AddLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
375
+ {
376
+ V512 *stateAsLanes = states;
377
+ const uint64_t *dataAsLanes = (const uint64_t *)data;
378
+ unsigned int i;
379
+ V256 index;
380
+
381
+ #define Add_In( argIndex ) stateAsLanes[argIndex] = XOR(stateAsLanes[argIndex], LOAD_GATHER8_64(index, dataAsLanes+argIndex))
382
+ index = LOAD8_32(7*laneOffset, 6*laneOffset, 5*laneOffset, 4*laneOffset, 3*laneOffset, 2*laneOffset, 1*laneOffset, 0*laneOffset);
383
+ if ( laneCount >= 16 ) {
384
+ Add_In( 0 );
385
+ Add_In( 1 );
386
+ Add_In( 2 );
387
+ Add_In( 3 );
388
+ Add_In( 4 );
389
+ Add_In( 5 );
390
+ Add_In( 6 );
391
+ Add_In( 7 );
392
+ Add_In( 8 );
393
+ Add_In( 9 );
394
+ Add_In( 10 );
395
+ Add_In( 11 );
396
+ Add_In( 12 );
397
+ Add_In( 13 );
398
+ Add_In( 14 );
399
+ Add_In( 15 );
400
+ if ( laneCount >= 20 ) {
401
+ Add_In( 16 );
402
+ Add_In( 17 );
403
+ Add_In( 18 );
404
+ Add_In( 19 );
405
+ for(i=20; i<laneCount; i++)
406
+ Add_In( i );
407
+ }
408
+ else {
409
+ for(i=16; i<laneCount; i++)
410
+ Add_In( i );
411
+ }
412
+ }
413
+ else {
414
+ for(i=0; i<laneCount; i++)
415
+ Add_In( i );
416
+ }
417
+ #undef Add_In
418
+ }
419
+
420
/*
 * Replaces length bytes of the state of instance instanceIndex, starting at
 * byte position offset, with the bytes from data.
 *
 * The input is consumed one lane at a time (the first and last pieces may
 * cover only part of a lane) and copied with memcpy, so data needs no
 * particular alignment.
 */
void KeccakP1600times8_OverwriteBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length)
{
    unsigned int sizeLeft = length;
    unsigned int lanePosition = offset/SnP_laneLengthInBytes;
    unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
    const unsigned char *curData = data;
    uint64_t *statesAsLanes = states;

    while (sizeLeft > 0) {
        unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
        unsigned char *laneBytes = (unsigned char *)&statesAsLanes[laneIndex(instanceIndex, lanePosition)];

        if (bytesInLane > sizeLeft)
            bytesInLane = sizeLeft;
        memcpy(laneBytes + offsetInLane, curData, bytesInLane);
        sizeLeft -= bytesInLane;
        curData += bytesInLane;
        lanePosition++;
        offsetInLane = 0;   /* only the first piece can start inside a lane */
    }
}
450
+
451
+ void KeccakP1600times8_OverwriteLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
452
+ {
453
+ V512 *stateAsLanes = states;
454
+ const uint64_t *dataAsLanes = (const uint64_t *)data;
455
+ unsigned int i;
456
+ V256 index;
457
+
458
+ #define OverWr( argIndex ) stateAsLanes[argIndex] = LOAD_GATHER8_64(index, dataAsLanes+argIndex)
459
+ index = LOAD8_32(7*laneOffset, 6*laneOffset, 5*laneOffset, 4*laneOffset, 3*laneOffset, 2*laneOffset, 1*laneOffset, 0*laneOffset);
460
+ if ( laneCount >= 16 ) {
461
+ OverWr( 0 );
462
+ OverWr( 1 );
463
+ OverWr( 2 );
464
+ OverWr( 3 );
465
+ OverWr( 4 );
466
+ OverWr( 5 );
467
+ OverWr( 6 );
468
+ OverWr( 7 );
469
+ OverWr( 8 );
470
+ OverWr( 9 );
471
+ OverWr( 10 );
472
+ OverWr( 11 );
473
+ OverWr( 12 );
474
+ OverWr( 13 );
475
+ OverWr( 14 );
476
+ OverWr( 15 );
477
+ if ( laneCount >= 20 ) {
478
+ OverWr( 16 );
479
+ OverWr( 17 );
480
+ OverWr( 18 );
481
+ OverWr( 19 );
482
+ for(i=20; i<laneCount; i++)
483
+ OverWr( i );
484
+ }
485
+ else {
486
+ for(i=16; i<laneCount; i++)
487
+ OverWr( i );
488
+ }
489
+ }
490
+ else {
491
+ for(i=0; i<laneCount; i++)
492
+ OverWr( i );
493
+ }
494
+ #undef OverWr
495
+ }
496
+
497
/*
 * Clears the first byteCount bytes of the state of instance instanceIndex:
 * whole lanes are set to zero, then the leading bytes of the following lane
 * if byteCount is not a multiple of the lane size.
 */
void KeccakP1600times8_OverwriteWithZeroes(void *states, unsigned int instanceIndex, unsigned int byteCount)
{
    uint64_t *lanes = states;
    const unsigned int wholeLanes = byteCount / SnP_laneLengthInBytes;
    const unsigned int tailBytes = byteCount % SnP_laneLengthInBytes;
    unsigned int position;

    for (position = 0; position < wholeLanes; position++)
        lanes[laneIndex(instanceIndex, position)] = 0;

    if (tailBytes != 0)
        memset(&lanes[laneIndex(instanceIndex, wholeLanes)], 0, tailBytes);
}
513
+
514
/*
 * Copies length bytes of the state of instance instanceIndex, starting at
 * byte position offset, into data.
 *
 * The state is read one lane at a time (the first and last pieces may cover
 * only part of a lane) and copied with memcpy, so data needs no particular
 * alignment.
 */
void KeccakP1600times8_ExtractBytes(const void *states, unsigned int instanceIndex, unsigned char *data, unsigned int offset, unsigned int length)
{
    unsigned int sizeLeft = length;
    unsigned int lanePosition = offset/SnP_laneLengthInBytes;
    unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
    unsigned char *curData = data;
    const uint64_t *statesAsLanes = states;

    while (sizeLeft > 0) {
        unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
        const unsigned char *laneBytes = (const unsigned char *)&statesAsLanes[laneIndex(instanceIndex, lanePosition)];

        if (bytesInLane > sizeLeft)
            bytesInLane = sizeLeft;
        memcpy(curData, laneBytes + offsetInLane, bytesInLane);
        sizeLeft -= bytesInLane;
        curData += bytesInLane;
        lanePosition++;
        offsetInLane = 0;   /* only the first piece can start inside a lane */
    }
}
543
+
544
+ void KeccakP1600times8_ExtractLanesAll(const void *states, unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
545
+ {
546
+ const V512 *stateAsLanes = states;
547
+ uint64_t *dataAsLanes = (uint64_t *)data;
548
+ unsigned int i;
549
+ V256 index;
550
+
551
+ #define Extr( argIndex ) STORE_SCATTER8_64(dataAsLanes+argIndex, index, stateAsLanes[argIndex])
552
+ index = LOAD8_32(7*laneOffset, 6*laneOffset, 5*laneOffset, 4*laneOffset, 3*laneOffset, 2*laneOffset, 1*laneOffset, 0*laneOffset);
553
+ if ( laneCount >= 16 ) {
554
+ Extr( 0 );
555
+ Extr( 1 );
556
+ Extr( 2 );
557
+ Extr( 3 );
558
+ Extr( 4 );
559
+ Extr( 5 );
560
+ Extr( 6 );
561
+ Extr( 7 );
562
+ Extr( 8 );
563
+ Extr( 9 );
564
+ Extr( 10 );
565
+ Extr( 11 );
566
+ Extr( 12 );
567
+ Extr( 13 );
568
+ Extr( 14 );
569
+ Extr( 15 );
570
+ if ( laneCount >= 20 ) {
571
+ Extr( 16 );
572
+ Extr( 17 );
573
+ Extr( 18 );
574
+ Extr( 19 );
575
+ for(i=20; i<laneCount; i++)
576
+ Extr( i );
577
+ }
578
+ else {
579
+ for(i=16; i<laneCount; i++)
580
+ Extr( i );
581
+ }
582
+ }
583
+ else {
584
+ for(i=0; i<laneCount; i++)
585
+ Extr( i );
586
+ }
587
+ #undef Extr
588
+ }
589
+
590
/*
 * Computes output[i] = input[i] ^ stateByte[offset + i] for i in
 * [0, length), where stateByte is the byte view of the state of instance
 * instanceIndex.
 *
 * Whole lanes are moved with memcpy through a local 64-bit temporary, so
 * neither buffer needs any particular alignment and input is never accessed
 * through a non-const pointer. Partial lanes at either end are processed
 * byte by byte, least significant byte of the lane first.
 */
void KeccakP1600times8_ExtractAndAddBytes(const void *states, unsigned int instanceIndex, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length)
{
    unsigned int sizeLeft = length;
    unsigned int lanePosition = offset/SnP_laneLengthInBytes;
    unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
    const unsigned char *curInput = input;
    unsigned char *curOutput = output;
    const uint64_t *statesAsLanes = states;

    /* Leading piece that starts inside a lane. */
    if ((sizeLeft > 0) && (offsetInLane != 0)) {
        unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
        uint64_t lane = statesAsLanes[laneIndex(instanceIndex, lanePosition)] >> (8 * offsetInLane);

        if (bytesInLane > sizeLeft)
            bytesInLane = sizeLeft;
        sizeLeft -= bytesInLane;
        do {
            *(curOutput++) = *(curInput++) ^ (unsigned char)lane;
            lane >>= 8;
        } while ( --bytesInLane != 0);
        lanePosition++;
    }

    /* Whole lanes. */
    while(sizeLeft >= SnP_laneLengthInBytes) {
        uint64_t lane;

        memcpy(&lane, curInput, sizeof lane);
        lane ^= statesAsLanes[laneIndex(instanceIndex, lanePosition)];
        memcpy(curOutput, &lane, sizeof lane);
        sizeLeft -= SnP_laneLengthInBytes;
        lanePosition++;
        curInput += SnP_laneLengthInBytes;
        curOutput += SnP_laneLengthInBytes;
    }

    /* Trailing piece shorter than a lane. */
    if (sizeLeft != 0) {
        uint64_t lane = statesAsLanes[laneIndex(instanceIndex, lanePosition)];

        do {
            *(curOutput++) = *(curInput++) ^ (unsigned char)lane;
            lane >>= 8;
        } while ( --sizeLeft != 0);
    }
}
628
+
629
+ void KeccakP1600times8_ExtractAndAddLanesAll(const void *states, const unsigned char *input, unsigned char *output, unsigned int laneCount, unsigned int laneOffset)
630
+ {
631
+ const V512 *stateAsLanes = states;
632
+ const uint64_t *inAsLanes = (const uint64_t *)input;
633
+ uint64_t *outAsLanes = (uint64_t *)output;
634
+ unsigned int i;
635
+ V256 index;
636
+
637
+ #define ExtrAdd( argIndex ) STORE_SCATTER8_64(outAsLanes+argIndex, index, XOR(stateAsLanes[argIndex], LOAD_GATHER8_64(index, inAsLanes+argIndex)))
638
+ index = LOAD8_32(7*laneOffset, 6*laneOffset, 5*laneOffset, 4*laneOffset, 3*laneOffset, 2*laneOffset, 1*laneOffset, 0*laneOffset);
639
+ if ( laneCount >= 16 ) {
640
+ ExtrAdd( 0 );
641
+ ExtrAdd( 1 );
642
+ ExtrAdd( 2 );
643
+ ExtrAdd( 3 );
644
+ ExtrAdd( 4 );
645
+ ExtrAdd( 5 );
646
+ ExtrAdd( 6 );
647
+ ExtrAdd( 7 );
648
+ ExtrAdd( 8 );
649
+ ExtrAdd( 9 );
650
+ ExtrAdd( 10 );
651
+ ExtrAdd( 11 );
652
+ ExtrAdd( 12 );
653
+ ExtrAdd( 13 );
654
+ ExtrAdd( 14 );
655
+ ExtrAdd( 15 );
656
+ if ( laneCount >= 20 ) {
657
+ ExtrAdd( 16 );
658
+ ExtrAdd( 17 );
659
+ ExtrAdd( 18 );
660
+ ExtrAdd( 19 );
661
+ for(i=20; i<laneCount; i++)
662
+ ExtrAdd( i );
663
+ }
664
+ else {
665
+ for(i=16; i<laneCount; i++)
666
+ ExtrAdd( i );
667
+ }
668
+ }
669
+ else {
670
+ for(i=0; i<laneCount; i++)
671
+ ExtrAdd( i );
672
+ }
673
+ #undef ExtrAdd
674
+
675
+ }
676
+
677
+ static ALIGN(KeccakP1600times8_statesAlignment) const uint64_t KeccakP1600RoundConstants[24] = {
678
+ 0x0000000000000001ULL,
679
+ 0x0000000000008082ULL,
680
+ 0x800000000000808aULL,
681
+ 0x8000000080008000ULL,
682
+ 0x000000000000808bULL,
683
+ 0x0000000080000001ULL,
684
+ 0x8000000080008081ULL,
685
+ 0x8000000000008009ULL,
686
+ 0x000000000000008aULL,
687
+ 0x0000000000000088ULL,
688
+ 0x0000000080008009ULL,
689
+ 0x000000008000000aULL,
690
+ 0x000000008000808bULL,
691
+ 0x800000000000008bULL,
692
+ 0x8000000000008089ULL,
693
+ 0x8000000000008003ULL,
694
+ 0x8000000000008002ULL,
695
+ 0x8000000000000080ULL,
696
+ 0x000000000000800aULL,
697
+ 0x800000008000000aULL,
698
+ 0x8000000080008081ULL,
699
+ 0x8000000000008080ULL,
700
+ 0x0000000080000001ULL,
701
+ 0x8000000080008008ULL};
702
+
703
/*
 * Local variables shared by the round macros below:
 *   _ba .. _su : the 25 state lanes (each holding the lane of all 8 instances)
 *   _Ba .. _Bu : column parities, then the five lanes of the row being processed
 *   _Da .. _Du : per-column theta correction terms
 */
#define KeccakP_DeclareVars \
    V512    _ba, _be, _bi, _bo, _bu; \
    V512    _ga, _ge, _gi, _go, _gu; \
    V512    _ka, _ke, _ki, _ko, _ku; \
    V512    _ma, _me, _mi, _mo, _mu; \
    V512    _sa, _se, _si, _so, _su; \
    V512    _Ba, _Be, _Bi, _Bo, _Bu; \
    V512    _Da, _De, _Di, _Do, _Du

/*
 * Processes one output row: XORs the theta terms into the five source lanes,
 * rotates them by the given amounts into row1..row5 (a permutation of
 * _Ba.._Bu chosen by the caller), then writes the chi combination of
 * _Ba.._Bu back into lane1..lane5. A zero rot1 skips the first rotation.
 */
#define KeccakP_ThetaRhoPiChi( lane1, lane2, lane3, lane4, lane5, row1, row2, row3, row4, row5, rot1, rot2, rot3, rot4, rot5 ) \
    row1 = XOR(lane1, _Da); \
    if (rot1 != 0) row1 = ROL(row1, rot1); \
    row2 = ROL(XOR(lane2, _De), rot2); \
    row3 = ROL(XOR(lane3, _Di), rot3); \
    row4 = ROL(XOR(lane4, _Do), rot4); \
    row5 = ROL(XOR(lane5, _Du), rot5); \
    lane1 = Chi( _Ba, _Be, _Bi); \
    lane2 = Chi( _Be, _Bi, _Bo); \
    lane3 = Chi( _Bi, _Bo, _Bu); \
    lane4 = Chi( _Bo, _Bu, _Ba); \
    lane5 = Chi( _Bu, _Ba, _Be);

/*
 * Start of a round: computes the five column parities from the 25 lanes,
 * derives the theta terms _Da.._Du, processes the first row and XORs the
 * round constant into its first lane (iota).
 */
#define KeccakP_ThetaRhoPiChiIota0( lane1, lane2, lane3, lane4, lane5, roundConst ) \
    _Ba = XOR5( _ba, _ga, _ka, _ma, _sa ); /* Theta effect */ \
    _Be = XOR5( _be, _ge, _ke, _me, _se ); \
    _Bi = XOR5( _bi, _gi, _ki, _mi, _si ); \
    _Bo = XOR5( _bo, _go, _ko, _mo, _so ); \
    _Bu = XOR5( _bu, _gu, _ku, _mu, _su ); \
    _Da = XOR( ROL( _Be, 1 ), _Bu ); \
    _De = XOR( ROL( _Bi, 1 ), _Ba ); \
    _Di = XOR( ROL( _Bo, 1 ), _Be ); \
    _Do = XOR( ROL( _Bu, 1 ), _Bi ); \
    _Du = XOR( ROL( _Ba, 1 ), _Bo ); \
    KeccakP_ThetaRhoPiChi( lane1, lane2, lane3, lane4, lane5, _Ba, _Be, _Bi, _Bo, _Bu,  0, 44, 43, 21, 14 ); \
    lane1 = XOR(lane1, roundConst) /* Iota */

/* Rows 1 to 4 of a round; they differ in rotation amounts and in how the
   rotated lanes are assigned to _Ba.._Bu. */
#define KeccakP_ThetaRhoPiChi1( lane1, lane2, lane3, lane4, lane5 ) \
    KeccakP_ThetaRhoPiChi( lane1, lane2, lane3, lane4, lane5, _Bi, _Bo, _Bu, _Ba, _Be,  3, 45, 61, 28, 20 )

#define KeccakP_ThetaRhoPiChi2( lane1, lane2, lane3, lane4, lane5 ) \
    KeccakP_ThetaRhoPiChi( lane1, lane2, lane3, lane4, lane5, _Bu, _Ba, _Be, _Bi, _Bo, 18,  1,  6, 25,  8 )

#define KeccakP_ThetaRhoPiChi3( lane1, lane2, lane3, lane4, lane5 ) \
    KeccakP_ThetaRhoPiChi( lane1, lane2, lane3, lane4, lane5, _Be, _Bi, _Bo, _Bu, _Ba, 36, 10, 15, 56, 27 )

#define KeccakP_ThetaRhoPiChi4( lane1, lane2, lane3, lane4, lane5 ) \
    KeccakP_ThetaRhoPiChi( lane1, lane2, lane3, lane4, lane5, _Bo, _Bu, _Ba, _Be, _Bi, 41,  2, 62, 55, 39 )
759
+
760
/*
 * Four consecutive rounds using round constants (i) .. (i)+3.
 * The lane arguments change from round to round, and the last round uses
 * the natural order (_ba, _be, ...) again.
 * The parameter is parenthesized so that any index expression can be passed.
 */
#define KeccakP_4rounds( i ) \
    KeccakP_ThetaRhoPiChiIota0(_ba, _ge, _ki, _mo, _su, CONST8_64(KeccakP1600RoundConstants[(i)]) ); \
    KeccakP_ThetaRhoPiChi1(    _ka, _me, _si, _bo, _gu ); \
    KeccakP_ThetaRhoPiChi2(    _sa, _be, _gi, _ko, _mu ); \
    KeccakP_ThetaRhoPiChi3(    _ga, _ke, _mi, _so, _bu ); \
    KeccakP_ThetaRhoPiChi4(    _ma, _se, _bi, _go, _ku ); \
\
    KeccakP_ThetaRhoPiChiIota0(_ba, _me, _gi, _so, _ku, CONST8_64(KeccakP1600RoundConstants[(i)+1]) ); \
    KeccakP_ThetaRhoPiChi1(    _sa, _ke, _bi, _mo, _gu ); \
    KeccakP_ThetaRhoPiChi2(    _ma, _ge, _si, _ko, _bu ); \
    KeccakP_ThetaRhoPiChi3(    _ka, _be, _mi, _go, _su ); \
    KeccakP_ThetaRhoPiChi4(    _ga, _se, _ki, _bo, _mu ); \
\
    KeccakP_ThetaRhoPiChiIota0(_ba, _ke, _si, _go, _mu, CONST8_64(KeccakP1600RoundConstants[(i)+2]) ); \
    KeccakP_ThetaRhoPiChi1(    _ma, _be, _ki, _so, _gu ); \
    KeccakP_ThetaRhoPiChi2(    _ga, _me, _bi, _ko, _su ); \
    KeccakP_ThetaRhoPiChi3(    _sa, _ge, _mi, _bo, _ku ); \
    KeccakP_ThetaRhoPiChi4(    _ka, _se, _gi, _mo, _bu ); \
\
    KeccakP_ThetaRhoPiChiIota0(_ba, _be, _bi, _bo, _bu, CONST8_64(KeccakP1600RoundConstants[(i)+3]) ); \
    KeccakP_ThetaRhoPiChi1(    _ga, _ge, _gi, _go, _gu ); \
    KeccakP_ThetaRhoPiChi2(    _ka, _ke, _ki, _ko, _ku ); \
    KeccakP_ThetaRhoPiChi3(    _ma, _me, _mi, _mo, _mu ); \
    KeccakP_ThetaRhoPiChi4(    _sa, _se, _si, _so, _su )

/*
 * Two consecutive rounds using round constants (i) and (i)+1, with the same
 * lane arguments as the last two rounds of KeccakP_4rounds. Used after
 * copyFromState2rounds for the 6-round permutation.
 */
#define KeccakP_2rounds( i ) \
    KeccakP_ThetaRhoPiChiIota0(_ba, _ke, _si, _go, _mu, CONST8_64(KeccakP1600RoundConstants[(i)]) ); \
    KeccakP_ThetaRhoPiChi1(    _ma, _be, _ki, _so, _gu ); \
    KeccakP_ThetaRhoPiChi2(    _ga, _me, _bi, _ko, _su ); \
    KeccakP_ThetaRhoPiChi3(    _sa, _ge, _mi, _bo, _ku ); \
    KeccakP_ThetaRhoPiChi4(    _ka, _se, _gi, _mo, _bu ); \
\
    KeccakP_ThetaRhoPiChiIota0(_ba, _be, _bi, _bo, _bu, CONST8_64(KeccakP1600RoundConstants[(i)+1]) ); \
    KeccakP_ThetaRhoPiChi1(    _ga, _ge, _gi, _go, _gu ); \
    KeccakP_ThetaRhoPiChi2(    _ka, _ke, _ki, _ko, _ku ); \
    KeccakP_ThetaRhoPiChi3(    _ma, _me, _mi, _mo, _mu ); \
    KeccakP_ThetaRhoPiChi4(    _sa, _se, _si, _so, _su )
797
+
798
+ #ifdef KeccakP1600times8_fullUnrolling
799
+
800
+ #define rounds12 \
801
+ KeccakP_4rounds( 12 ); \
802
+ KeccakP_4rounds( 16 ); \
803
+ KeccakP_4rounds( 20 )
804
+
805
+ #define rounds24 \
806
+ KeccakP_4rounds( 0 ); \
807
+ KeccakP_4rounds( 4 ); \
808
+ KeccakP_4rounds( 8 ); \
809
+ KeccakP_4rounds( 12 ); \
810
+ KeccakP_4rounds( 16 ); \
811
+ KeccakP_4rounds( 20 )
812
+
813
+ #elif (KeccakP1600times8_unrolling == 4)
814
+
815
+ #define rounds12 \
816
+ i = 12; \
817
+ do { \
818
+ KeccakP_4rounds( i ); \
819
+ } while( (i += 4) < 24 )
820
+
821
+ #define rounds24 \
822
+ i = 0; \
823
+ do { \
824
+ KeccakP_4rounds( i ); \
825
+ } while( (i += 4) < 24 )
826
+
827
+ #elif (KeccakP1600times8_unrolling == 12)
828
+
829
+ #define rounds12 \
830
+ KeccakP_4rounds( 12 ); \
831
+ KeccakP_4rounds( 16 ); \
832
+ KeccakP_4rounds( 20 )
833
+
834
+ #define rounds24 \
835
+ i = 0; \
836
+ do { \
837
+ KeccakP_4rounds( i ); \
838
+ KeccakP_4rounds( i+4 ); \
839
+ KeccakP_4rounds( i+8 ); \
840
+ } while( (i += 12) < 24 )
841
+
842
+ #else
843
+ #error "Unrolling is not correctly specified!"
844
+ #endif
845
+
846
+ #define rounds6 \
847
+ KeccakP_2rounds( 18 ); \
848
+ KeccakP_4rounds( 20 )
849
+
850
+ #define rounds4 \
851
+ KeccakP_4rounds( 20 )
852
+
853
/*
 * Load/store of the 25 lane variables declared by KeccakP_DeclareVars.
 * pState is indexed as an array of 25 V512 values; the argument is
 * parenthesized so that any pointer expression can be passed.
 */

/* Loads the lanes in natural order: _ba <- [0], _be <- [1], ..., _su <- [24]. */
#define copyFromState(pState) \
    _ba = (pState)[ 0]; _be = (pState)[ 1]; _bi = (pState)[ 2]; _bo = (pState)[ 3]; _bu = (pState)[ 4]; \
    _ga = (pState)[ 5]; _ge = (pState)[ 6]; _gi = (pState)[ 7]; _go = (pState)[ 8]; _gu = (pState)[ 9]; \
    _ka = (pState)[10]; _ke = (pState)[11]; _ki = (pState)[12]; _ko = (pState)[13]; _ku = (pState)[14]; \
    _ma = (pState)[15]; _me = (pState)[16]; _mi = (pState)[17]; _mo = (pState)[18]; _mu = (pState)[19]; \
    _sa = (pState)[20]; _se = (pState)[21]; _si = (pState)[22]; _so = (pState)[23]; _su = (pState)[24]

/* Loads the lanes in the permuted order expected by KeccakP_2rounds;
   the trailing comment on each row names the lanes being read. */
#define copyFromState2rounds(pState) \
    _ba = (pState)[ 0]; _be = (pState)[16]; _bi = (pState)[ 7]; _bo = (pState)[23]; _bu = (pState)[14]; /* ba me gi so ku */ \
    _ga = (pState)[20]; _ge = (pState)[11]; _gi = (pState)[ 2]; _go = (pState)[18]; _gu = (pState)[ 9]; /* sa ke bi mo gu */ \
    _ka = (pState)[15]; _ke = (pState)[ 6]; _ki = (pState)[22]; _ko = (pState)[13]; _ku = (pState)[ 4]; /* ma ge si ko bu */ \
    _ma = (pState)[10]; _me = (pState)[ 1]; _mi = (pState)[17]; _mo = (pState)[ 8]; _mu = (pState)[24]; /* ka be mi go su */ \
    _sa = (pState)[ 5]; _se = (pState)[21]; _si = (pState)[12]; _so = (pState)[ 3]; _su = (pState)[19]  /* ga se ki bo mu */

/* Stores the lanes back in natural order: [0] <- _ba, ..., [24] <- _su. */
#define copyToState(pState) \
    (pState)[ 0] = _ba; (pState)[ 1] = _be; (pState)[ 2] = _bi; (pState)[ 3] = _bo; (pState)[ 4] = _bu; \
    (pState)[ 5] = _ga; (pState)[ 6] = _ge; (pState)[ 7] = _gi; (pState)[ 8] = _go; (pState)[ 9] = _gu; \
    (pState)[10] = _ka; (pState)[11] = _ke; (pState)[12] = _ki; (pState)[13] = _ko; (pState)[14] = _ku; \
    (pState)[15] = _ma; (pState)[16] = _me; (pState)[17] = _mi; (pState)[18] = _mo; (pState)[19] = _mu; \
    (pState)[20] = _sa; (pState)[21] = _se; (pState)[22] = _si; (pState)[23] = _so; (pState)[24] = _su
933
+
934
+ void KeccakP1600times8_PermuteAll_24rounds(void *states)
935
+ {
936
+ V512 *statesAsLanes = states;
937
+ KeccakP_DeclareVars;
938
+ #ifndef KeccakP1600times8_fullUnrolling
939
+ unsigned int i;
940
+ #endif
941
+
942
+ copyFromState(statesAsLanes);
943
+ rounds24;
944
+ copyToState(statesAsLanes);
945
+ }
946
+
947
+ void KeccakP1600times8_PermuteAll_12rounds(void *states)
948
+ {
949
+ V512 *statesAsLanes = states;
950
+ KeccakP_DeclareVars;
951
+ #if (KeccakP1600times8_unrolling < 12)
952
+ unsigned int i;
953
+ #endif
954
+
955
+ copyFromState(statesAsLanes);
956
+ rounds12;
957
+ copyToState(statesAsLanes);
958
+ }
959
+
960
+ void KeccakP1600times8_PermuteAll_6rounds(void *states)
961
+ {
962
+ V512 *statesAsLanes = states;
963
+ KeccakP_DeclareVars;
964
+
965
+ copyFromState2rounds(statesAsLanes);
966
+ rounds6;
967
+ copyToState(statesAsLanes);
968
+ }
969
+
970
+ void KeccakP1600times8_PermuteAll_4rounds(void *states)
971
+ {
972
+ V512 *statesAsLanes = states;
973
+ KeccakP_DeclareVars;
974
+
975
+ copyFromState(statesAsLanes);
976
+ rounds4;
977
+ copyToState(statesAsLanes);
978
+ }
979
+
980
+ size_t KeccakF1600times8_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, const unsigned char *data, size_t dataByteLen)
981
+ {
982
+ size_t dataMinimumSize = (laneOffsetParallel*7 + laneCount)*8;
983
+
984
+ if (laneCount == 21) {
985
+ #ifndef KeccakP1600times8_fullUnrolling
986
+ unsigned int i;
987
+ #endif
988
+ const unsigned char *dataStart = data;
989
+ V512 *statesAsLanes = states;
990
+ const uint64_t *dataAsLanes = (const uint64_t *)data;
991
+ KeccakP_DeclareVars;
992
+ V256 index;
993
+
994
+ copyFromState(statesAsLanes);
995
+ index = LOAD8_32(7*laneOffsetParallel, 6*laneOffsetParallel, 5*laneOffsetParallel, 4*laneOffsetParallel, 3*laneOffsetParallel, 2*laneOffsetParallel, 1*laneOffsetParallel, 0*laneOffsetParallel);
996
+ while(dataByteLen >= dataMinimumSize) {
997
+ #define Add_In( argLane, argIndex ) argLane = XOR(argLane, LOAD_GATHER8_64(index, dataAsLanes+argIndex))
998
+ Add_In( _ba, 0 );
999
+ Add_In( _be, 1 );
1000
+ Add_In( _bi, 2 );
1001
+ Add_In( _bo, 3 );
1002
+ Add_In( _bu, 4 );
1003
+ Add_In( _ga, 5 );
1004
+ Add_In( _ge, 6 );
1005
+ Add_In( _gi, 7 );
1006
+ Add_In( _go, 8 );
1007
+ Add_In( _gu, 9 );
1008
+ Add_In( _ka, 10 );
1009
+ Add_In( _ke, 11 );
1010
+ Add_In( _ki, 12 );
1011
+ Add_In( _ko, 13 );
1012
+ Add_In( _ku, 14 );
1013
+ Add_In( _ma, 15 );
1014
+ Add_In( _me, 16 );
1015
+ Add_In( _mi, 17 );
1016
+ Add_In( _mo, 18 );
1017
+ Add_In( _mu, 19 );
1018
+ Add_In( _sa, 20 );
1019
+ #undef Add_In
1020
+ rounds24;
1021
+ dataAsLanes += laneOffsetSerial;
1022
+ dataByteLen -= laneOffsetSerial*8;
1023
+ }
1024
+ copyToState(statesAsLanes);
1025
+ return (const unsigned char *)dataAsLanes - dataStart;
1026
+ }
1027
+ else {
1028
+ const unsigned char *dataStart = data;
1029
+
1030
+ while(dataByteLen >= dataMinimumSize) {
1031
+ KeccakP1600times8_AddLanesAll(states, data, laneCount, laneOffsetParallel);
1032
+ KeccakP1600times8_PermuteAll_24rounds(states);
1033
+ data += laneOffsetSerial*8;
1034
+ dataByteLen -= laneOffsetSerial*8;
1035
+ }
1036
+ return data - dataStart;
1037
+ }
1038
+ }
1039
+
1040
+ size_t KeccakP1600times8_12rounds_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, const unsigned char *data, size_t dataByteLen)
1041
+ {
1042
+ size_t dataMinimumSize = (laneOffsetParallel*7 + laneCount)*8;
1043
+
1044
+ if (laneCount == 21) {
1045
+ #if (KeccakP1600times8_unrolling < 12)
1046
+ unsigned int i;
1047
+ #endif
1048
+ const unsigned char *dataStart = data;
1049
+ V512 *statesAsLanes = states;
1050
+ const uint64_t *dataAsLanes = (const uint64_t *)data;
1051
+ KeccakP_DeclareVars;
1052
+ V256 index;
1053
+
1054
+ copyFromState(statesAsLanes);
1055
+ index = LOAD8_32(7*laneOffsetParallel, 6*laneOffsetParallel, 5*laneOffsetParallel, 4*laneOffsetParallel, 3*laneOffsetParallel, 2*laneOffsetParallel, 1*laneOffsetParallel, 0*laneOffsetParallel);
1056
+ while(dataByteLen >= dataMinimumSize) {
1057
+ #define Add_In( argLane, argIndex ) argLane = XOR(argLane, LOAD_GATHER8_64(index, dataAsLanes+argIndex))
1058
+ Add_In( _ba, 0 );
1059
+ Add_In( _be, 1 );
1060
+ Add_In( _bi, 2 );
1061
+ Add_In( _bo, 3 );
1062
+ Add_In( _bu, 4 );
1063
+ Add_In( _ga, 5 );
1064
+ Add_In( _ge, 6 );
1065
+ Add_In( _gi, 7 );
1066
+ Add_In( _go, 8 );
1067
+ Add_In( _gu, 9 );
1068
+ Add_In( _ka, 10 );
1069
+ Add_In( _ke, 11 );
1070
+ Add_In( _ki, 12 );
1071
+ Add_In( _ko, 13 );
1072
+ Add_In( _ku, 14 );
1073
+ Add_In( _ma, 15 );
1074
+ Add_In( _me, 16 );
1075
+ Add_In( _mi, 17 );
1076
+ Add_In( _mo, 18 );
1077
+ Add_In( _mu, 19 );
1078
+ Add_In( _sa, 20 );
1079
+ #undef Add_In
1080
+ rounds12;
1081
+ dataAsLanes += laneOffsetSerial;
1082
+ dataByteLen -= laneOffsetSerial*8;
1083
+ }
1084
+ copyToState(statesAsLanes);
1085
+ return (const unsigned char *)dataAsLanes - dataStart;
1086
+ }
1087
+ else {
1088
+ const unsigned char *dataStart = data;
1089
+
1090
+ while(dataByteLen >= dataMinimumSize) {
1091
+ KeccakP1600times8_AddLanesAll(states, data, laneCount, laneOffsetParallel);
1092
+ KeccakP1600times8_PermuteAll_12rounds(states);
1093
+ data += laneOffsetSerial*8;
1094
+ dataByteLen -= laneOffsetSerial*8;
1095
+ }
1096
+ return data - dataStart;
1097
+ }
1098
+ }
1099
+
1100
+ /* ------------------------------------------------------------------------- */
1101
+
1102
+ /* Remap lanes to start after two rounds */
1103
+ #define Iba _ba
1104
+ #define Ibe _me
1105
+ #define Ibi _gi
1106
+ #define Ibo _so
1107
+ #define Ibu _ku
1108
+ #define Iga _sa
1109
+ #define Ige _ke
1110
+ #define Igi _bi
1111
+ #define Igo _mo
1112
+ #define Igu _gu
1113
+ #define Ika _ma
1114
+ #define Ike _ge
1115
+ #define Iki _si
1116
+ #define Iko _ko
1117
+ #define Iku _bu
1118
+ #define Ima _ka
1119
+ #define Ime _be
1120
+ #define Imi _mi
1121
+ #define Imo _go
1122
+ #define Imu _su
1123
+ #define Isa _ga
1124
+ #define Ise _se
1125
+ #define Isi _ki
1126
+ #define Iso _bo
1127
+ #define Isu _mu
1128
+
1129
+ #define LoadInput(argIndex) _mm512_i32gather_epi64(gather, (const long long int *)&in64[argIndex], 8)
1130
+ #define AddInput(argIndex) XOR( LoadInput(argIndex), CONST8_64(kRoll[argIndex]))
1131
+
1132
+
1133
+ ALIGN(64) static const uint64_t oLow256[] = { 0, 1, 2, 3, 8+0, 8+1, 8+2, 8+3 };
1134
+ ALIGN(64) static const uint64_t oHigh256[] = { 4, 5, 6, 7, 8+4, 8+5, 8+6, 8+7 };
1135
+
1136
+ ALIGN(64) static const uint64_t oLow128[] = { 0, 1, 8+0, 8+1, 4, 5, 8+4, 8+5 };
1137
+ ALIGN(64) static const uint64_t oHigh128[] = { 2, 3, 8+2, 8+3, 6, 7, 8+6, 8+7 };
1138
+
1139
+ ALIGN(64) static const uint64_t oLow64[] = { 0, 8+0, 2, 8+2, 4, 8+4, 6, 8+6 };
1140
+ ALIGN(64) static const uint64_t oHigh64[] = { 1, 8+1, 3, 8+3, 5, 8+5, 7, 8+7 };
1141
+
1142
+ ALIGN(64) static const uint64_t o01234_012[] = { 0, 1, 2, 3, 4, 8+0, 8+1, 8+2 };
1143
+ ALIGN(64) static const uint64_t o1234_0123[] = { 1, 2, 3, 4, 8+0, 8+1, 8+2, 8+3 };
1144
+ ALIGN(64) static const uint64_t o1234567_0[] = { 1, 2, 3, 4, 5, 6, 7, 8+0 };
1145
+ ALIGN(64) static const uint64_t o1234567_3[] = { 1, 2, 3, 4, 5, 6, 7, 8+3 };
1146
+ ALIGN(64) static const uint64_t o1234567_4[] = { 1, 2, 3, 4, 5, 6, 7, 8+4 };
1147
+ ALIGN(64) static const uint64_t o234567_45[] = { 2, 3, 4, 5, 6, 7, 8+4, 8+5 };
1148
+ ALIGN(64) static const uint64_t o34567_456[] = { 3, 4, 5, 6, 7, 8+4, 8+5, 8+6 };
1149
+
1150
+ ALIGN(32) static const uint32_t oGatherScatter[]= {0*25, 1*25, 2*25, 3*25, 4*25, 5*25, 6*25, 7*25};
1151
+
1152
+ size_t KeccakP1600times8_KravatteCompress(uint64_t *xAccu, uint64_t *kRoll, const unsigned char *input, size_t inputByteLen)
1153
+ {
1154
+ #if !defined(KeccakP1600times4_fullUnrolling)
1155
+ unsigned int i;
1156
+ #endif
1157
+ uint64_t *in64 = (uint64_t *)input;
1158
+ size_t nBlocks = inputByteLen / (8 * 200);
1159
+ KeccakP_DeclareVars;
1160
+ V512 x01234567, x12345678;
1161
+ V512 Xba, Xbe, Xbi, Xbo, Xbu;
1162
+ V512 Xga, Xge, Xgi, Xgo, Xgu;
1163
+ V512 Xka, Xke, Xki, Xko, Xku;
1164
+ V512 Xma, Xme, Xmi, Xmo, Xmu;
1165
+ V512 Xsa, Xse, Xsi, Xso, Xsu;
1166
+ V256 v1, v2;
1167
+ V512 p1, p2;
1168
+ V256 gather = *(V256*)oGatherScatter;
1169
+
1170
+ /* Clear internal X accu */
1171
+ Xba = _mm512_setzero_si512();
1172
+ Xbe = _mm512_setzero_si512();
1173
+ Xbi = _mm512_setzero_si512();
1174
+ Xbo = _mm512_setzero_si512();
1175
+ Xbu = _mm512_setzero_si512();
1176
+ Xga = _mm512_setzero_si512();
1177
+ Xge = _mm512_setzero_si512();
1178
+ Xgi = _mm512_setzero_si512();
1179
+ Xgo = _mm512_setzero_si512();
1180
+ Xgu = _mm512_setzero_si512();
1181
+ Xka = _mm512_setzero_si512();
1182
+ Xke = _mm512_setzero_si512();
1183
+ Xki = _mm512_setzero_si512();
1184
+ Xko = _mm512_setzero_si512();
1185
+ Xku = _mm512_setzero_si512();
1186
+ Xma = _mm512_setzero_si512();
1187
+ Xme = _mm512_setzero_si512();
1188
+ Xmi = _mm512_setzero_si512();
1189
+ Xmo = _mm512_setzero_si512();
1190
+ Xmu = _mm512_setzero_si512();
1191
+ Xsa = _mm512_setzero_si512();
1192
+ Xse = _mm512_setzero_si512();
1193
+ Xsi = _mm512_setzero_si512();
1194
+ Xso = _mm512_setzero_si512();
1195
+ Xsu = _mm512_setzero_si512();
1196
+
1197
+ /* prepare 8 lanes for roll-c */
1198
+ x01234567 = _mm512_maskz_loadu_epi64(0x1F, &kRoll[20]); /* 5 lanes ok */
1199
+ _ba = _mm512_maskz_loadu_epi64(0x0F, &kRoll[21]); /* 4 lanes ok */
1200
+ _be = XOR3(ROL(x01234567, 7), _ba, _mm512_srli_epi64(_ba, 3));
1201
+ x01234567 = _mm512_permutex2var_epi64(x01234567, *(V512*)o01234_012, _be);
1202
+ x12345678 = _mm512_permutex2var_epi64(x01234567, *(V512*)o1234_0123, _be);
1203
+
1204
+ do {
1205
+ Iba = AddInput( 0);
1206
+ Ibe = AddInput( 1);
1207
+ Ibi = AddInput( 2);
1208
+ Ibo = AddInput( 3);
1209
+ Ibu = AddInput( 4);
1210
+ Iga = AddInput( 5);
1211
+ Ige = AddInput( 6);
1212
+ Igi = AddInput( 7);
1213
+ Igo = AddInput( 8);
1214
+ Igu = AddInput( 9);
1215
+ Ika = AddInput(10);
1216
+ Ike = AddInput(11);
1217
+ Iki = AddInput(12);
1218
+ Iko = AddInput(13);
1219
+ Iku = AddInput(14);
1220
+ Ima = AddInput(15);
1221
+ Ime = AddInput(16);
1222
+ Imi = AddInput(17);
1223
+ Imo = AddInput(18);
1224
+ Imu = AddInput(19);
1225
+
1226
+ /* Roll-c */
1227
+ Isa = x01234567;
1228
+ Ise = x12345678;
1229
+ Isu = XOR3(ROL(x01234567, 7), x12345678, _mm512_srli_epi64(x12345678, 3));
1230
+ Ise = _mm512_permutex2var_epi64(x01234567, *(V512*)o1234567_3, Isu);
1231
+ Isi = _mm512_permutex2var_epi64(Ise, *(V512*)o1234567_4, Isu);
1232
+ Iso = _mm512_permutex2var_epi64(Ise, *(V512*)o234567_45, Isu);
1233
+ Isu = _mm512_permutex2var_epi64(Ise, *(V512*)o34567_456, Isu);
1234
+
1235
+ x01234567 = XOR3(ROL(Iso, 7), Isu, _mm512_srli_epi64(Isu, 3));
1236
+ x12345678 = _mm512_permutex2var_epi64(x01234567, *(V512*)o1234567_4, x01234567);
1237
+
1238
+ XOReq512(Isa, LoadInput(20));
1239
+ XOReq512(Ise, LoadInput(21));
1240
+ XOReq512(Isi, LoadInput(22));
1241
+ XOReq512(Iso, LoadInput(23));
1242
+ XOReq512(Isu, LoadInput(24));
1243
+
1244
+ rounds6
1245
+ Dump( "P-out", _);
1246
+
1247
+ /* Accumulate in X */
1248
+ XOReq512(Xba, _ba);
1249
+ XOReq512(Xbe, _be);
1250
+ XOReq512(Xbi, _bi);
1251
+ XOReq512(Xbo, _bo);
1252
+ XOReq512(Xbu, _bu);
1253
+ XOReq512(Xga, _ga);
1254
+ XOReq512(Xge, _ge);
1255
+ XOReq512(Xgi, _gi);
1256
+ XOReq512(Xgo, _go);
1257
+ XOReq512(Xgu, _gu);
1258
+ XOReq512(Xka, _ka);
1259
+ XOReq512(Xke, _ke);
1260
+ XOReq512(Xki, _ki);
1261
+ XOReq512(Xko, _ko);
1262
+ XOReq512(Xku, _ku);
1263
+ XOReq512(Xma, _ma);
1264
+ XOReq512(Xme, _me);
1265
+ XOReq512(Xmi, _mi);
1266
+ XOReq512(Xmo, _mo);
1267
+ XOReq512(Xmu, _mu);
1268
+ XOReq512(Xsa, _sa);
1269
+ XOReq512(Xse, _se);
1270
+ XOReq512(Xsi, _si);
1271
+ XOReq512(Xso, _so);
1272
+ XOReq512(Xsu, _su);
1273
+ Dump( "X", X);
1274
+
1275
+ in64 += 8 * 25;
1276
+ }
1277
+ while(--nBlocks != 0);
1278
+
1279
+ /* Add horizontally Xba ... Xgi Reduce from lanes 8 to 4 */
1280
+ p1 = *(V512*)oLow256;
1281
+ p2 = *(V512*)oHigh256;
1282
+ Xba = XOR(_mm512_permutex2var_epi64(Xba, p1, Xbu), _mm512_permutex2var_epi64(Xba, p2, Xbu));
1283
+ Xbe = XOR(_mm512_permutex2var_epi64(Xbe, p1, Xga), _mm512_permutex2var_epi64(Xbe, p2, Xga));
1284
+ Xbi = XOR(_mm512_permutex2var_epi64(Xbi, p1, Xge), _mm512_permutex2var_epi64(Xbi, p2, Xge));
1285
+ Xbo = XOR(_mm512_permutex2var_epi64(Xbo, p1, Xgi), _mm512_permutex2var_epi64(Xbo, p2, Xgi));
1286
+
1287
+ /* Add horizontally Xgo ... Xma Reduce from lanes 8 to 4 */
1288
+ Xgo = XOR(_mm512_permutex2var_epi64(Xgo, p1, Xki), _mm512_permutex2var_epi64(Xgo, p2, Xki));
1289
+ Xgu = XOR(_mm512_permutex2var_epi64(Xgu, p1, Xko), _mm512_permutex2var_epi64(Xgu, p2, Xko));
1290
+ Xka = XOR(_mm512_permutex2var_epi64(Xka, p1, Xku), _mm512_permutex2var_epi64(Xka, p2, Xku));
1291
+ Xke = XOR(_mm512_permutex2var_epi64(Xke, p1, Xma), _mm512_permutex2var_epi64(Xke, p2, Xma));
1292
+
1293
+ /* Add horizontally Xme ... Xso Reduce from lanes 8 to 4 */
1294
+ Xme = XOR(_mm512_permutex2var_epi64(Xme, p1, Xsa), _mm512_permutex2var_epi64(Xme, p2, Xsa));
1295
+ Xmi = XOR(_mm512_permutex2var_epi64(Xmi, p1, Xse), _mm512_permutex2var_epi64(Xmi, p2, Xse));
1296
+ Xmo = XOR(_mm512_permutex2var_epi64(Xmo, p1, Xsi), _mm512_permutex2var_epi64(Xmo, p2, Xsi));
1297
+ Xmu = XOR(_mm512_permutex2var_epi64(Xmu, p1, Xso), _mm512_permutex2var_epi64(Xmu, p2, Xso));
1298
+
1299
+ /* Add horizontally Xba ... Xbo Reduce from lanes 4 to 2 */
1300
+ p1 = *(V512*)oLow128;
1301
+ p2 = *(V512*)oHigh128;
1302
+ Xba = XOR(_mm512_permutex2var_epi64(Xba, p1, Xbi), _mm512_permutex2var_epi64(Xba, p2, Xbi));
1303
+ Xbe = XOR(_mm512_permutex2var_epi64(Xbe, p1, Xbo), _mm512_permutex2var_epi64(Xbe, p2, Xbo));
1304
+
1305
+ /* Add horizontally Xgo ... Xke Reduce from lanes 4 to 2 */
1306
+ Xgo = XOR(_mm512_permutex2var_epi64(Xgo, p1, Xka), _mm512_permutex2var_epi64(Xgo, p2, Xka));
1307
+ Xgu = XOR(_mm512_permutex2var_epi64(Xgu, p1, Xke), _mm512_permutex2var_epi64(Xgu, p2, Xke));
1308
+
1309
+ /* Add horizontally Xme ... Xmu Reduce from lanes 4 to 2 */
1310
+ Xme = XOR(_mm512_permutex2var_epi64(Xme, p1, Xmo), _mm512_permutex2var_epi64(Xme, p2, Xmo));
1311
+ Xmi = XOR(_mm512_permutex2var_epi64(Xmi, p1, Xmu), _mm512_permutex2var_epi64(Xmi, p2, Xmu));
1312
+
1313
+ /* Add horizontally Xba ... Xbe Reduce from lanes 2 to 1 */
1314
+ p1 = *(V512*)oLow64;
1315
+ p2 = *(V512*)oHigh64;
1316
+ Xba = XOR(_mm512_permutex2var_epi64(Xba, p1, Xbe), _mm512_permutex2var_epi64(Xba, p2, Xbe));
1317
+
1318
+ /* Add horizontally Xgo ... Xgu Reduce from lanes 2 to 1 */
1319
+ Xgo = XOR(_mm512_permutex2var_epi64(Xgo, p1, Xgu), _mm512_permutex2var_epi64(Xgo, p2, Xgu));
1320
+
1321
+ /* Add horizontally Xme ... Xmi Reduce from lanes 2 to 1 */
1322
+ Xme = XOR(_mm512_permutex2var_epi64(Xme, p1, Xmi), _mm512_permutex2var_epi64(Xme, p2, Xmi));
1323
+
1324
+ /* Add and store in xAccu */
1325
+ Xba = XOR( Xba, *(V512*)&xAccu[0]);
1326
+ Xgo = XOR( Xgo, *(V512*)&xAccu[8]);
1327
+ Xme = XOR( Xme, *(V512*)&xAccu[16]);
1328
+ _mm512_store_si512((V512*)&xAccu[0], Xba);
1329
+ _mm512_store_si512((V512*)&xAccu[8], Xgo);
1330
+ _mm512_store_si512((V512*)&xAccu[16], Xme);
1331
+
1332
+ /* Add horizontally Xsu */
1333
+ v1 = _mm256_xor_si256( _mm512_extracti64x4_epi64(Xsu, 0), _mm512_extracti64x4_epi64(Xsu, 1));
1334
+ v1 = _mm256_xor_si256( v1, _mm256_permute4x64_epi64(v1, 0xEE));
1335
+ xAccu[24] ^= _mm256_extract_epi64(v1, 0) ^ _mm256_extract_epi64(v1, 1);
1336
+ DumpMem("xAccu", xAccu, 5*5);
1337
+
1338
+ /* Store new kRoll */
1339
+ _mm512_mask_storeu_epi64(&kRoll[20], 0x1F, x01234567);
1340
+ DumpMem("Next kRoll", kRoll+20, 5);
1341
+
1342
+ return (size_t)in64 - (size_t)input;
1343
+ }
1344
+
1345
+ #undef LoadInput
1346
+ #undef AddInput
1347
+
1348
+ ALIGN(64) static const uint64_t o1234567_6[] = { 1, 2, 3, 4, 5, 6, 7, 8+6 };
1349
+ ALIGN(64) static const uint64_t o234567_01[] = { 2, 3, 4, 5, 6, 7, 8+0, 8+1 };
1350
+ ALIGN(64) static const uint64_t o34567_012[] = { 3, 4, 5, 6, 7, 8+0, 8+1, 8+2 };
1351
+ ALIGN(64) static const uint64_t o4567_0123[] = { 4, 5, 6, 7, 8+0, 8+1, 8+2, 8+3 };
1352
+ ALIGN(64) static const uint64_t o567_01234[] = { 5, 6, 7, 8+0, 8+1, 8+2, 8+3, 8+4 };
1353
+ ALIGN(64) static const uint64_t o67_012345[] = { 6, 7, 8+0, 8+1, 8+2, 8+3, 8+4, 8+5 };
1354
+ ALIGN(64) static const uint64_t o7_0123456[] = { 7, 8+0, 8+1, 8+2, 8+3, 8+4, 8+5, 8+6 };
1355
+
1356
+ size_t KeccakP1600times8_KravatteExpand(uint64_t *yAccu, const uint64_t *kRoll, unsigned char *output, size_t outputByteLen)
1357
+ {
1358
+ uint64_t *o64 = (uint64_t *)output;
1359
+ size_t nBlocks = outputByteLen / (8 * 200);
1360
+ KeccakP_DeclareVars;
1361
+ #if !defined(KeccakP1600times4_fullUnrolling)
1362
+ unsigned int i;
1363
+ #endif
1364
+ V512 x01234567, x23456789;
1365
+ V256 scatter = *(V256*)oGatherScatter;
1366
+
1367
+ x01234567 = LOAD512u(yAccu[15]);
1368
+ x23456789 = LOAD512u(yAccu[17]);
1369
+
1370
+ do {
1371
+ Iba = CONST8_64(yAccu[0]);
1372
+ Ibe = CONST8_64(yAccu[1]);
1373
+ Ibi = CONST8_64(yAccu[2]);
1374
+ Ibo = CONST8_64(yAccu[3]);
1375
+ Ibu = CONST8_64(yAccu[4]);
1376
+
1377
+ Iga = CONST8_64(yAccu[5]);
1378
+ Ige = CONST8_64(yAccu[6]);
1379
+ Igi = CONST8_64(yAccu[7]);
1380
+ Igo = CONST8_64(yAccu[8]);
1381
+ Igu = CONST8_64(yAccu[9]);
1382
+
1383
+ Ika = CONST8_64(yAccu[10]);
1384
+ Ike = CONST8_64(yAccu[11]);
1385
+ Iki = CONST8_64(yAccu[12]);
1386
+ Iko = CONST8_64(yAccu[13]);
1387
+ Iku = CONST8_64(yAccu[14]);
1388
+
1389
+ /* roll-e */
1390
+ Ima = x01234567;
1391
+ Ime = _mm512_permutex2var_epi64(x01234567, *(V512*)o1234567_6, x23456789);
1392
+ Imi = x23456789;
1393
+
1394
+ x23456789 = XOR3(ROL(Ima, 7), ROL(Ime, 18), _mm512_and_si512(Imi, _mm512_srli_epi64(Ime, 1)));
1395
+ Imo = _mm512_permutex2var_epi64(Imi, *(V512*)o1234567_0, x23456789);
1396
+ Imu = _mm512_permutex2var_epi64(Imi, *(V512*)o234567_01, x23456789);
1397
+ Isa = _mm512_permutex2var_epi64(Imi, *(V512*)o34567_012, x23456789);
1398
+ Ise = _mm512_permutex2var_epi64(Imi, *(V512*)o4567_0123, x23456789);
1399
+ Isi = _mm512_permutex2var_epi64(Imi, *(V512*)o567_01234, x23456789);
1400
+ Iso = _mm512_permutex2var_epi64(Imi, *(V512*)o67_012345, x23456789);
1401
+ Isu = _mm512_permutex2var_epi64(Imi, *(V512*)o7_0123456, x23456789);
1402
+ x01234567 = Iso;
1403
+ Dump( "After roll-e", I);
1404
+
1405
+ rounds6
1406
+
1407
+ /* Add kRoll */
1408
+ _ba = XOR(_ba, CONST8_64(kRoll[0]));
1409
+ _be = XOR(_be, CONST8_64(kRoll[1]));
1410
+ _bi = XOR(_bi, CONST8_64(kRoll[2]));
1411
+ _bo = XOR(_bo, CONST8_64(kRoll[3]));
1412
+ _bu = XOR(_bu, CONST8_64(kRoll[4]));
1413
+ _ga = XOR(_ga, CONST8_64(kRoll[5]));
1414
+ _ge = XOR(_ge, CONST8_64(kRoll[6]));
1415
+ _gi = XOR(_gi, CONST8_64(kRoll[7]));
1416
+ _go = XOR(_go, CONST8_64(kRoll[8]));
1417
+ _gu = XOR(_gu, CONST8_64(kRoll[9]));
1418
+ _ka = XOR(_ka, CONST8_64(kRoll[10]));
1419
+ _ke = XOR(_ke, CONST8_64(kRoll[11]));
1420
+ _ki = XOR(_ki, CONST8_64(kRoll[12]));
1421
+ _ko = XOR(_ko, CONST8_64(kRoll[13]));
1422
+ _ku = XOR(_ku, CONST8_64(kRoll[14]));
1423
+ _ma = XOR(_ma, CONST8_64(kRoll[15]));
1424
+ _me = XOR(_me, CONST8_64(kRoll[16]));
1425
+ _mi = XOR(_mi, CONST8_64(kRoll[17]));
1426
+ _mo = XOR(_mo, CONST8_64(kRoll[18]));
1427
+ _mu = XOR(_mu, CONST8_64(kRoll[19]));
1428
+ _sa = XOR(_sa, CONST8_64(kRoll[20]));
1429
+ _se = XOR(_se, CONST8_64(kRoll[21]));
1430
+ _si = XOR(_si, CONST8_64(kRoll[22]));
1431
+ _so = XOR(_so, CONST8_64(kRoll[23]));
1432
+ _su = XOR(_su, CONST8_64(kRoll[24]));
1433
+ Dump( "After add kRoll", _);
1434
+
1435
+ /* Extract */
1436
+ STORE_SCATTER8_64(o64+0, scatter, _ba);
1437
+ STORE_SCATTER8_64(o64+1, scatter, _be);
1438
+ STORE_SCATTER8_64(o64+2, scatter, _bi);
1439
+ STORE_SCATTER8_64(o64+3, scatter, _bo);
1440
+ STORE_SCATTER8_64(o64+4, scatter, _bu);
1441
+ STORE_SCATTER8_64(o64+5, scatter, _ga);
1442
+ STORE_SCATTER8_64(o64+6, scatter, _ge);
1443
+ STORE_SCATTER8_64(o64+7, scatter, _gi);
1444
+ STORE_SCATTER8_64(o64+8, scatter, _go);
1445
+ STORE_SCATTER8_64(o64+9, scatter, _gu);
1446
+ STORE_SCATTER8_64(o64+10, scatter, _ka);
1447
+ STORE_SCATTER8_64(o64+11, scatter, _ke);
1448
+ STORE_SCATTER8_64(o64+12, scatter, _ki);
1449
+ STORE_SCATTER8_64(o64+13, scatter, _ko);
1450
+ STORE_SCATTER8_64(o64+14, scatter, _ku);
1451
+ STORE_SCATTER8_64(o64+15, scatter, _ma);
1452
+ STORE_SCATTER8_64(o64+16, scatter, _me);
1453
+ STORE_SCATTER8_64(o64+17, scatter, _mi);
1454
+ STORE_SCATTER8_64(o64+18, scatter, _mo);
1455
+ STORE_SCATTER8_64(o64+19, scatter, _mu);
1456
+ STORE_SCATTER8_64(o64+20, scatter, _sa);
1457
+ STORE_SCATTER8_64(o64+21, scatter, _se);
1458
+ STORE_SCATTER8_64(o64+22, scatter, _si);
1459
+ STORE_SCATTER8_64(o64+23, scatter, _so);
1460
+ STORE_SCATTER8_64(o64+24, scatter, _su);
1461
+ DumpMem("Output", o64, 8*25);
1462
+
1463
+ o64 += 8 * 25;
1464
+ }
1465
+ while(--nBlocks != 0);
1466
+
1467
+ /* Store new yAccu */
1468
+ _mm512_mask_storeu_epi64(&yAccu[15], 0xFF, x01234567);
1469
+ _mm512_mask_storeu_epi64(&yAccu[17], 0xC0, x23456789);
1470
+ DumpMem("yAccu", yAccu, 25);
1471
+
1472
+ return (size_t)o64 - (size_t)output;
1473
+ }