digest-kangarootwelve 0.0.2 → 0.3.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (307) hide show
  1. checksums.yaml +5 -5
  2. data/README.md +71 -37
  3. data/Rakefile +7 -9
  4. data/digest-kangarootwelve.gemspec +323 -14
  5. data/ext/digest/kangarootwelve/ext.c +228 -177
  6. data/ext/digest/kangarootwelve/extconf.rb +15 -1
  7. data/ext/digest/kangarootwelve/keccak/armv6m/KangarooTwelve.link.c +1 -0
  8. data/ext/digest/kangarootwelve/keccak/armv6m/KeccakDuplexWidth1600.link.c +1 -0
  9. data/ext/digest/kangarootwelve/keccak/armv6m/KeccakP-1600-SnP.h +36 -0
  10. data/ext/digest/kangarootwelve/{KeccakP-1600-times2-SnP.h → keccak/armv6m/KeccakP-1600-times2-SnP.h} +10 -10
  11. data/ext/digest/kangarootwelve/{KeccakP-1600-times2-on1.c → keccak/armv6m/KeccakP-1600-times2-on1.c} +13 -7
  12. data/ext/digest/kangarootwelve/{KeccakP-1600-times4-SnP.h → keccak/armv6m/KeccakP-1600-times4-SnP.h} +10 -10
  13. data/ext/digest/kangarootwelve/{KeccakP-1600-times4-on1.c → keccak/armv6m/KeccakP-1600-times4-on1.c} +13 -7
  14. data/ext/digest/kangarootwelve/{KeccakP-1600-times8-SnP.h → keccak/armv6m/KeccakP-1600-times8-SnP.h} +10 -10
  15. data/ext/digest/kangarootwelve/{KeccakP-1600-times8-on1.c → keccak/armv6m/KeccakP-1600-times8-on1.c} +13 -7
  16. data/ext/digest/kangarootwelve/keccak/armv6m/KeccakP-1600-u2-32bi-armv6m-le-gcc.s +1334 -0
  17. data/ext/digest/kangarootwelve/keccak/armv6m/KeccakSpongeWidth1600.link.c +1 -0
  18. data/ext/digest/kangarootwelve/{PlSnP-Fallback.inc → keccak/armv6m/PlSnP-Fallback.inc} +11 -7
  19. data/ext/digest/kangarootwelve/keccak/armv6m/ext.link.c +1 -0
  20. data/ext/digest/kangarootwelve/keccak/armv7a/KangarooTwelve.link.c +1 -0
  21. data/ext/digest/kangarootwelve/keccak/armv7a/KeccakDuplexWidth1600.link.c +1 -0
  22. data/ext/digest/kangarootwelve/keccak/armv7a/KeccakP-1600-SnP.h +37 -0
  23. data/ext/digest/kangarootwelve/keccak/armv7a/KeccakP-1600-armv7a-le-neon-gcc.s +826 -0
  24. data/ext/digest/kangarootwelve/keccak/armv7a/KeccakP-1600-inplace-pl2-armv7a-neon-le-gcc.s +1245 -0
  25. data/ext/digest/kangarootwelve/keccak/armv7a/KeccakP-1600-times2-SnP.h +38 -0
  26. data/ext/digest/kangarootwelve/keccak/armv7a/KeccakP-1600-times4-SnP.h +45 -0
  27. data/ext/digest/kangarootwelve/keccak/armv7a/KeccakP-1600-times4-on2.c +38 -0
  28. data/ext/digest/kangarootwelve/keccak/armv7a/KeccakP-1600-times8-SnP.h +45 -0
  29. data/ext/digest/kangarootwelve/keccak/armv7a/KeccakP-1600-times8-on2.c +38 -0
  30. data/ext/digest/kangarootwelve/keccak/armv7a/KeccakSpongeWidth1600.link.c +1 -0
  31. data/ext/digest/kangarootwelve/keccak/armv7a/PlSnP-Fallback.inc +287 -0
  32. data/ext/digest/kangarootwelve/keccak/armv7a/ext.link.c +1 -0
  33. data/ext/digest/kangarootwelve/keccak/armv7m/KangarooTwelve.link.c +1 -0
  34. data/ext/digest/kangarootwelve/keccak/armv7m/KeccakDuplexWidth1600.link.c +1 -0
  35. data/ext/digest/kangarootwelve/keccak/armv7m/KeccakP-1600-SnP.h +36 -0
  36. data/ext/digest/kangarootwelve/keccak/armv7m/KeccakP-1600-inplace-32bi-armv7m-le-gcc.s +1170 -0
  37. data/ext/digest/kangarootwelve/keccak/armv7m/KeccakP-1600-times2-SnP.h +45 -0
  38. data/ext/digest/kangarootwelve/keccak/armv7m/KeccakP-1600-times2-on1.c +37 -0
  39. data/ext/digest/kangarootwelve/keccak/armv7m/KeccakP-1600-times4-SnP.h +45 -0
  40. data/ext/digest/kangarootwelve/keccak/armv7m/KeccakP-1600-times4-on1.c +37 -0
  41. data/ext/digest/kangarootwelve/keccak/armv7m/KeccakP-1600-times8-SnP.h +45 -0
  42. data/ext/digest/kangarootwelve/keccak/armv7m/KeccakP-1600-times8-on1.c +37 -0
  43. data/ext/digest/kangarootwelve/keccak/armv7m/KeccakSpongeWidth1600.link.c +1 -0
  44. data/ext/digest/kangarootwelve/keccak/armv7m/PlSnP-Fallback.inc +287 -0
  45. data/ext/digest/kangarootwelve/keccak/armv7m/ext.link.c +1 -0
  46. data/ext/digest/kangarootwelve/keccak/armv8a/KangarooTwelve.link.c +1 -0
  47. data/ext/digest/kangarootwelve/keccak/armv8a/KeccakDuplexWidth1600.link.c +1 -0
  48. data/ext/digest/kangarootwelve/keccak/armv8a/KeccakP-1600-SnP.h +28 -0
  49. data/ext/digest/kangarootwelve/keccak/armv8a/KeccakP-1600-armv8a-neon.s +537 -0
  50. data/ext/digest/kangarootwelve/keccak/armv8a/KeccakP-1600-times2-SnP.h +45 -0
  51. data/ext/digest/kangarootwelve/keccak/armv8a/KeccakP-1600-times2-on1.c +37 -0
  52. data/ext/digest/kangarootwelve/keccak/armv8a/KeccakP-1600-times4-SnP.h +45 -0
  53. data/ext/digest/kangarootwelve/keccak/armv8a/KeccakP-1600-times4-on1.c +37 -0
  54. data/ext/digest/kangarootwelve/keccak/armv8a/KeccakP-1600-times8-SnP.h +45 -0
  55. data/ext/digest/kangarootwelve/keccak/armv8a/KeccakP-1600-times8-on1.c +37 -0
  56. data/ext/digest/kangarootwelve/keccak/armv8a/KeccakSpongeWidth1600.link.c +1 -0
  57. data/ext/digest/kangarootwelve/keccak/armv8a/PlSnP-Fallback.inc +287 -0
  58. data/ext/digest/kangarootwelve/keccak/armv8a/ext.link.c +1 -0
  59. data/ext/digest/kangarootwelve/keccak/asmx86-64/KangarooTwelve.link.c +1 -0
  60. data/ext/digest/kangarootwelve/keccak/asmx86-64/KeccakDuplexWidth1600.link.c +1 -0
  61. data/ext/digest/kangarootwelve/keccak/asmx86-64/KeccakP-1600-SnP.h +37 -0
  62. data/ext/digest/kangarootwelve/keccak/asmx86-64/KeccakP-1600-times2-SnP.h +45 -0
  63. data/ext/digest/kangarootwelve/keccak/asmx86-64/KeccakP-1600-times2-on1.c +37 -0
  64. data/ext/digest/kangarootwelve/keccak/asmx86-64/KeccakP-1600-times4-SnP.h +45 -0
  65. data/ext/digest/kangarootwelve/keccak/asmx86-64/KeccakP-1600-times4-on1.c +37 -0
  66. data/ext/digest/kangarootwelve/keccak/asmx86-64/KeccakP-1600-times8-SnP.h +45 -0
  67. data/ext/digest/kangarootwelve/keccak/asmx86-64/KeccakP-1600-times8-on1.c +37 -0
  68. data/ext/digest/kangarootwelve/keccak/asmx86-64/KeccakP-1600-x86-64-gas.s +1190 -0
  69. data/ext/digest/kangarootwelve/keccak/asmx86-64/KeccakSpongeWidth1600.link.c +1 -0
  70. data/ext/digest/kangarootwelve/keccak/asmx86-64/PlSnP-Fallback.inc +287 -0
  71. data/ext/digest/kangarootwelve/keccak/asmx86-64/ext.link.c +1 -0
  72. data/ext/digest/kangarootwelve/keccak/asmx86-64shld/KangarooTwelve.link.c +1 -0
  73. data/ext/digest/kangarootwelve/keccak/asmx86-64shld/KeccakDuplexWidth1600.link.c +1 -0
  74. data/ext/digest/kangarootwelve/keccak/asmx86-64shld/KeccakP-1600-SnP.h +37 -0
  75. data/ext/digest/kangarootwelve/keccak/asmx86-64shld/KeccakP-1600-times2-SnP.h +45 -0
  76. data/ext/digest/kangarootwelve/keccak/asmx86-64shld/KeccakP-1600-times2-on1.c +37 -0
  77. data/ext/digest/kangarootwelve/keccak/asmx86-64shld/KeccakP-1600-times4-SnP.h +45 -0
  78. data/ext/digest/kangarootwelve/keccak/asmx86-64shld/KeccakP-1600-times4-on1.c +37 -0
  79. data/ext/digest/kangarootwelve/keccak/asmx86-64shld/KeccakP-1600-times8-SnP.h +45 -0
  80. data/ext/digest/kangarootwelve/keccak/asmx86-64shld/KeccakP-1600-times8-on1.c +37 -0
  81. data/ext/digest/kangarootwelve/keccak/asmx86-64shld/KeccakP-1600-x86-64-shld-gas.s +1190 -0
  82. data/ext/digest/kangarootwelve/keccak/asmx86-64shld/KeccakSpongeWidth1600.link.c +1 -0
  83. data/ext/digest/kangarootwelve/keccak/asmx86-64shld/PlSnP-Fallback.inc +287 -0
  84. data/ext/digest/kangarootwelve/keccak/asmx86-64shld/ext.link.c +1 -0
  85. data/ext/digest/kangarootwelve/keccak/avr8/KangarooTwelve.link.c +1 -0
  86. data/ext/digest/kangarootwelve/keccak/avr8/KeccakDuplexWidth1600.link.c +1 -0
  87. data/ext/digest/kangarootwelve/keccak/avr8/KeccakP-1600-SnP.h +37 -0
  88. data/ext/digest/kangarootwelve/keccak/avr8/KeccakP-1600-avr8-fast.s +1116 -0
  89. data/ext/digest/kangarootwelve/keccak/avr8/KeccakP-1600-times2-SnP.h +45 -0
  90. data/ext/digest/kangarootwelve/keccak/avr8/KeccakP-1600-times2-on1.c +37 -0
  91. data/ext/digest/kangarootwelve/keccak/avr8/KeccakP-1600-times4-SnP.h +45 -0
  92. data/ext/digest/kangarootwelve/keccak/avr8/KeccakP-1600-times4-on1.c +37 -0
  93. data/ext/digest/kangarootwelve/keccak/avr8/KeccakP-1600-times8-SnP.h +45 -0
  94. data/ext/digest/kangarootwelve/keccak/avr8/KeccakP-1600-times8-on1.c +37 -0
  95. data/ext/digest/kangarootwelve/keccak/avr8/KeccakSpongeWidth1600.link.c +1 -0
  96. data/ext/digest/kangarootwelve/keccak/avr8/PlSnP-Fallback.inc +287 -0
  97. data/ext/digest/kangarootwelve/keccak/avr8/ext.link.c +1 -0
  98. data/ext/digest/kangarootwelve/keccak/bulldozer/KangarooTwelve.link.c +1 -0
  99. data/ext/digest/kangarootwelve/keccak/bulldozer/KeccakDuplexWidth1600.link.c +1 -0
  100. data/ext/digest/kangarootwelve/keccak/bulldozer/KeccakP-1600-SnP.h +39 -0
  101. data/ext/digest/kangarootwelve/keccak/bulldozer/KeccakP-1600-XOP-config.h +6 -0
  102. data/ext/digest/kangarootwelve/keccak/bulldozer/KeccakP-1600-XOP.c +473 -0
  103. data/ext/digest/kangarootwelve/keccak/bulldozer/KeccakP-1600-times2-SIMD128.c +954 -0
  104. data/ext/digest/kangarootwelve/keccak/bulldozer/KeccakP-1600-times2-SnP.h +47 -0
  105. data/ext/digest/kangarootwelve/keccak/bulldozer/KeccakP-1600-times4-SnP.h +45 -0
  106. data/ext/digest/kangarootwelve/keccak/bulldozer/KeccakP-1600-times4-on2.c +38 -0
  107. data/ext/digest/kangarootwelve/keccak/bulldozer/KeccakP-1600-times8-SnP.h +45 -0
  108. data/ext/digest/kangarootwelve/keccak/bulldozer/KeccakP-1600-times8-on2.c +38 -0
  109. data/ext/digest/kangarootwelve/keccak/bulldozer/KeccakP-1600-unrolling.macros +302 -0
  110. data/ext/digest/kangarootwelve/keccak/bulldozer/KeccakSpongeWidth1600.link.c +1 -0
  111. data/ext/digest/kangarootwelve/keccak/bulldozer/PlSnP-Fallback.inc +287 -0
  112. data/ext/digest/kangarootwelve/keccak/bulldozer/SIMD128-config.h +9 -0
  113. data/ext/digest/kangarootwelve/{SnP-Relaned.h → keccak/bulldozer/SnP-Relaned.h} +13 -7
  114. data/ext/digest/kangarootwelve/keccak/bulldozer/ext.link.c +1 -0
  115. data/ext/digest/kangarootwelve/{KangarooTwelve.c → keccak/common/KangarooTwelve.c} +6 -10
  116. data/ext/digest/kangarootwelve/{KangarooTwelve.h → keccak/common/KangarooTwelve.h} +3 -7
  117. data/ext/digest/kangarootwelve/keccak/common/KeccakDuplex-common.h +37 -0
  118. data/ext/digest/kangarootwelve/keccak/common/KeccakDuplex.inc +192 -0
  119. data/ext/digest/kangarootwelve/keccak/common/KeccakDuplexWidth1600.c +34 -0
  120. data/ext/digest/kangarootwelve/keccak/common/KeccakDuplexWidth1600.h +25 -0
  121. data/ext/digest/kangarootwelve/{KeccakSponge-common.h → keccak/common/KeccakSponge-common.h} +5 -7
  122. data/ext/digest/kangarootwelve/{KeccakSponge.inc → keccak/common/KeccakSponge.inc} +6 -8
  123. data/ext/digest/kangarootwelve/{KeccakSpongeWidth1600.c → keccak/common/KeccakSpongeWidth1600.c} +6 -8
  124. data/ext/digest/kangarootwelve/{KeccakSpongeWidth1600.h → keccak/common/KeccakSpongeWidth1600.h} +5 -7
  125. data/ext/digest/kangarootwelve/{Phases.h → keccak/common/Phases.h} +3 -7
  126. data/ext/digest/kangarootwelve/{align.h → keccak/common/align.h} +5 -7
  127. data/ext/digest/kangarootwelve/{brg_endian.h → keccak/common/brg_endian.h} +0 -0
  128. data/ext/digest/kangarootwelve/keccak/compact/KangarooTwelve.link.c +1 -0
  129. data/ext/digest/kangarootwelve/keccak/compact/KeccakDuplexWidth1600.link.c +1 -0
  130. data/ext/digest/kangarootwelve/{KeccakP-1600-SnP.h → keccak/compact/KeccakP-1600-SnP.h} +7 -10
  131. data/ext/digest/kangarootwelve/{KeccakP-1600-compact64.c → keccak/compact/KeccakP-1600-compact64.c} +11 -7
  132. data/ext/digest/kangarootwelve/keccak/compact/KeccakP-1600-times2-SnP.h +45 -0
  133. data/ext/digest/kangarootwelve/keccak/compact/KeccakP-1600-times2-on1.c +37 -0
  134. data/ext/digest/kangarootwelve/keccak/compact/KeccakP-1600-times4-SnP.h +45 -0
  135. data/ext/digest/kangarootwelve/keccak/compact/KeccakP-1600-times4-on1.c +37 -0
  136. data/ext/digest/kangarootwelve/keccak/compact/KeccakP-1600-times8-SnP.h +45 -0
  137. data/ext/digest/kangarootwelve/keccak/compact/KeccakP-1600-times8-on1.c +37 -0
  138. data/ext/digest/kangarootwelve/keccak/compact/KeccakSpongeWidth1600.link.c +1 -0
  139. data/ext/digest/kangarootwelve/keccak/compact/PlSnP-Fallback.inc +287 -0
  140. data/ext/digest/kangarootwelve/keccak/compact/SnP-Relaned.h +140 -0
  141. data/ext/digest/kangarootwelve/keccak/compact/ext.link.c +1 -0
  142. data/ext/digest/kangarootwelve/keccak/generic32/KangarooTwelve.link.c +1 -0
  143. data/ext/digest/kangarootwelve/keccak/generic32/KeccakDuplexWidth1600.link.c +1 -0
  144. data/ext/digest/kangarootwelve/keccak/generic32/KeccakP-1600-SnP.h +38 -0
  145. data/ext/digest/kangarootwelve/keccak/generic32/KeccakP-1600-inplace32BI.c +1162 -0
  146. data/ext/digest/kangarootwelve/keccak/generic32/KeccakP-1600-times2-SnP.h +45 -0
  147. data/ext/digest/kangarootwelve/keccak/generic32/KeccakP-1600-times2-on1.c +37 -0
  148. data/ext/digest/kangarootwelve/keccak/generic32/KeccakP-1600-times4-SnP.h +45 -0
  149. data/ext/digest/kangarootwelve/keccak/generic32/KeccakP-1600-times4-on1.c +37 -0
  150. data/ext/digest/kangarootwelve/keccak/generic32/KeccakP-1600-times8-SnP.h +45 -0
  151. data/ext/digest/kangarootwelve/keccak/generic32/KeccakP-1600-times8-on1.c +37 -0
  152. data/ext/digest/kangarootwelve/keccak/generic32/KeccakSpongeWidth1600.link.c +1 -0
  153. data/ext/digest/kangarootwelve/keccak/generic32/PlSnP-Fallback.inc +287 -0
  154. data/ext/digest/kangarootwelve/keccak/generic32/SnP-Relaned.h +140 -0
  155. data/ext/digest/kangarootwelve/keccak/generic32/ext.link.c +1 -0
  156. data/ext/digest/kangarootwelve/keccak/generic32lc/KangarooTwelve.link.c +1 -0
  157. data/ext/digest/kangarootwelve/keccak/generic32lc/KeccakDuplexWidth1600.link.c +1 -0
  158. data/ext/digest/kangarootwelve/keccak/generic32lc/KeccakP-1600-SnP.h +38 -0
  159. data/ext/digest/kangarootwelve/keccak/generic32lc/KeccakP-1600-inplace32BI.c +1162 -0
  160. data/ext/digest/kangarootwelve/keccak/generic32lc/KeccakP-1600-times2-SnP.h +45 -0
  161. data/ext/digest/kangarootwelve/keccak/generic32lc/KeccakP-1600-times2-on1.c +37 -0
  162. data/ext/digest/kangarootwelve/keccak/generic32lc/KeccakP-1600-times4-SnP.h +45 -0
  163. data/ext/digest/kangarootwelve/keccak/generic32lc/KeccakP-1600-times4-on1.c +37 -0
  164. data/ext/digest/kangarootwelve/keccak/generic32lc/KeccakP-1600-times8-SnP.h +45 -0
  165. data/ext/digest/kangarootwelve/keccak/generic32lc/KeccakP-1600-times8-on1.c +37 -0
  166. data/ext/digest/kangarootwelve/keccak/generic32lc/KeccakSpongeWidth1600.link.c +1 -0
  167. data/ext/digest/kangarootwelve/keccak/generic32lc/PlSnP-Fallback.inc +287 -0
  168. data/ext/digest/kangarootwelve/keccak/generic32lc/SnP-Relaned.h +140 -0
  169. data/ext/digest/kangarootwelve/keccak/generic32lc/ext.link.c +1 -0
  170. data/ext/digest/kangarootwelve/keccak/generic64/KangarooTwelve.link.c +1 -0
  171. data/ext/digest/kangarootwelve/keccak/generic64/KeccakDuplexWidth1600.link.c +1 -0
  172. data/ext/digest/kangarootwelve/keccak/generic64/KeccakP-1600-64.macros +2195 -0
  173. data/ext/digest/kangarootwelve/keccak/generic64/KeccakP-1600-SnP.h +49 -0
  174. data/ext/digest/kangarootwelve/keccak/generic64/KeccakP-1600-opt64-config.h +6 -0
  175. data/ext/digest/kangarootwelve/keccak/generic64/KeccakP-1600-opt64.c +541 -0
  176. data/ext/digest/kangarootwelve/keccak/generic64/KeccakP-1600-times2-SnP.h +45 -0
  177. data/ext/digest/kangarootwelve/keccak/generic64/KeccakP-1600-times2-on1.c +37 -0
  178. data/ext/digest/kangarootwelve/keccak/generic64/KeccakP-1600-times4-SnP.h +45 -0
  179. data/ext/digest/kangarootwelve/keccak/generic64/KeccakP-1600-times4-on1.c +37 -0
  180. data/ext/digest/kangarootwelve/keccak/generic64/KeccakP-1600-times8-SnP.h +45 -0
  181. data/ext/digest/kangarootwelve/keccak/generic64/KeccakP-1600-times8-on1.c +37 -0
  182. data/ext/digest/kangarootwelve/keccak/generic64/KeccakP-1600-unrolling.macros +302 -0
  183. data/ext/digest/kangarootwelve/keccak/generic64/KeccakSpongeWidth1600.link.c +1 -0
  184. data/ext/digest/kangarootwelve/keccak/generic64/PlSnP-Fallback.inc +287 -0
  185. data/ext/digest/kangarootwelve/keccak/generic64/SnP-Relaned.h +140 -0
  186. data/ext/digest/kangarootwelve/keccak/generic64/ext.link.c +1 -0
  187. data/ext/digest/kangarootwelve/keccak/generic64lc/KangarooTwelve.link.c +1 -0
  188. data/ext/digest/kangarootwelve/keccak/generic64lc/KeccakDuplexWidth1600.link.c +1 -0
  189. data/ext/digest/kangarootwelve/keccak/generic64lc/KeccakP-1600-64.macros +2195 -0
  190. data/ext/digest/kangarootwelve/keccak/generic64lc/KeccakP-1600-SnP.h +49 -0
  191. data/ext/digest/kangarootwelve/keccak/generic64lc/KeccakP-1600-opt64-config.h +7 -0
  192. data/ext/digest/kangarootwelve/keccak/generic64lc/KeccakP-1600-opt64.c +541 -0
  193. data/ext/digest/kangarootwelve/keccak/generic64lc/KeccakP-1600-times2-SnP.h +45 -0
  194. data/ext/digest/kangarootwelve/keccak/generic64lc/KeccakP-1600-times2-on1.c +37 -0
  195. data/ext/digest/kangarootwelve/keccak/generic64lc/KeccakP-1600-times4-SnP.h +45 -0
  196. data/ext/digest/kangarootwelve/keccak/generic64lc/KeccakP-1600-times4-on1.c +37 -0
  197. data/ext/digest/kangarootwelve/keccak/generic64lc/KeccakP-1600-times8-SnP.h +45 -0
  198. data/ext/digest/kangarootwelve/keccak/generic64lc/KeccakP-1600-times8-on1.c +37 -0
  199. data/ext/digest/kangarootwelve/keccak/generic64lc/KeccakP-1600-unrolling.macros +302 -0
  200. data/ext/digest/kangarootwelve/keccak/generic64lc/KeccakSpongeWidth1600.link.c +1 -0
  201. data/ext/digest/kangarootwelve/keccak/generic64lc/PlSnP-Fallback.inc +287 -0
  202. data/ext/digest/kangarootwelve/keccak/generic64lc/SnP-Relaned.h +140 -0
  203. data/ext/digest/kangarootwelve/keccak/generic64lc/ext.link.c +1 -0
  204. data/ext/digest/kangarootwelve/keccak/haswell/KangarooTwelve.link.c +1 -0
  205. data/ext/digest/kangarootwelve/keccak/haswell/KeccakDuplexWidth1600.link.c +1 -0
  206. data/ext/digest/kangarootwelve/keccak/haswell/KeccakP-1600-AVX2.s +993 -0
  207. data/ext/digest/kangarootwelve/keccak/haswell/KeccakP-1600-SnP.h +41 -0
  208. data/ext/digest/kangarootwelve/keccak/haswell/KeccakP-1600-times2-SIMD128.c +954 -0
  209. data/ext/digest/kangarootwelve/keccak/haswell/KeccakP-1600-times2-SnP.h +47 -0
  210. data/ext/digest/kangarootwelve/keccak/haswell/KeccakP-1600-times4-SIMD256.c +1303 -0
  211. data/ext/digest/kangarootwelve/keccak/haswell/KeccakP-1600-times4-SnP.h +53 -0
  212. data/ext/digest/kangarootwelve/keccak/haswell/KeccakP-1600-times8-SnP.h +45 -0
  213. data/ext/digest/kangarootwelve/keccak/haswell/KeccakP-1600-times8-on4.c +38 -0
  214. data/ext/digest/kangarootwelve/keccak/haswell/KeccakP-1600-unrolling.macros +302 -0
  215. data/ext/digest/kangarootwelve/keccak/haswell/KeccakSpongeWidth1600.link.c +1 -0
  216. data/ext/digest/kangarootwelve/keccak/haswell/PlSnP-Fallback.inc +287 -0
  217. data/ext/digest/kangarootwelve/keccak/haswell/SIMD128-config.h +8 -0
  218. data/ext/digest/kangarootwelve/keccak/haswell/SIMD256-config.h +7 -0
  219. data/ext/digest/kangarootwelve/keccak/haswell/ext.link.c +1 -0
  220. data/ext/digest/kangarootwelve/keccak/nehalem/KangarooTwelve.link.c +1 -0
  221. data/ext/digest/kangarootwelve/keccak/nehalem/KeccakDuplexWidth1600.link.c +1 -0
  222. data/ext/digest/kangarootwelve/keccak/nehalem/KeccakP-1600-64.macros +2195 -0
  223. data/ext/digest/kangarootwelve/keccak/nehalem/KeccakP-1600-SnP.h +49 -0
  224. data/ext/digest/kangarootwelve/keccak/nehalem/KeccakP-1600-opt64-config.h +7 -0
  225. data/ext/digest/kangarootwelve/keccak/nehalem/KeccakP-1600-opt64.c +541 -0
  226. data/ext/digest/kangarootwelve/keccak/nehalem/KeccakP-1600-times2-SIMD128.c +954 -0
  227. data/ext/digest/kangarootwelve/keccak/nehalem/KeccakP-1600-times2-SnP.h +47 -0
  228. data/ext/digest/kangarootwelve/keccak/nehalem/KeccakP-1600-times4-SnP.h +45 -0
  229. data/ext/digest/kangarootwelve/keccak/nehalem/KeccakP-1600-times4-on2.c +38 -0
  230. data/ext/digest/kangarootwelve/keccak/nehalem/KeccakP-1600-times8-SnP.h +45 -0
  231. data/ext/digest/kangarootwelve/keccak/nehalem/KeccakP-1600-times8-on2.c +38 -0
  232. data/ext/digest/kangarootwelve/keccak/nehalem/KeccakP-1600-unrolling.macros +302 -0
  233. data/ext/digest/kangarootwelve/keccak/nehalem/KeccakSpongeWidth1600.link.c +1 -0
  234. data/ext/digest/kangarootwelve/keccak/nehalem/PlSnP-Fallback.inc +287 -0
  235. data/ext/digest/kangarootwelve/keccak/nehalem/SIMD128-config.h +8 -0
  236. data/ext/digest/kangarootwelve/keccak/nehalem/SnP-Relaned.h +140 -0
  237. data/ext/digest/kangarootwelve/keccak/nehalem/ext.link.c +1 -0
  238. data/ext/digest/kangarootwelve/keccak/reference/KangarooTwelve.link.c +1 -0
  239. data/ext/digest/kangarootwelve/keccak/reference/KeccakDuplexWidth1600.link.c +1 -0
  240. data/ext/digest/kangarootwelve/keccak/reference/KeccakP-1600-SnP.h +41 -0
  241. data/ext/digest/kangarootwelve/keccak/reference/KeccakP-1600-reference.c +424 -0
  242. data/ext/digest/kangarootwelve/keccak/reference/KeccakP-1600-reference.h +20 -0
  243. data/ext/digest/kangarootwelve/keccak/reference/KeccakP-1600-times2-SnP.h +45 -0
  244. data/ext/digest/kangarootwelve/keccak/reference/KeccakP-1600-times2-on1.c +37 -0
  245. data/ext/digest/kangarootwelve/keccak/reference/KeccakP-1600-times4-SnP.h +45 -0
  246. data/ext/digest/kangarootwelve/keccak/reference/KeccakP-1600-times4-on1.c +37 -0
  247. data/ext/digest/kangarootwelve/keccak/reference/KeccakP-1600-times8-SnP.h +45 -0
  248. data/ext/digest/kangarootwelve/keccak/reference/KeccakP-1600-times8-on1.c +37 -0
  249. data/ext/digest/kangarootwelve/keccak/reference/KeccakSpongeWidth1600.link.c +1 -0
  250. data/ext/digest/kangarootwelve/keccak/reference/PlSnP-Fallback.inc +287 -0
  251. data/ext/digest/kangarootwelve/keccak/reference/displayIntermediateValues.c +176 -0
  252. data/ext/digest/kangarootwelve/keccak/reference/displayIntermediateValues.h +29 -0
  253. data/ext/digest/kangarootwelve/keccak/reference/ext.link.c +1 -0
  254. data/ext/digest/kangarootwelve/keccak/reference32bits/KangarooTwelve.link.c +1 -0
  255. data/ext/digest/kangarootwelve/keccak/reference32bits/KeccakDuplexWidth1600.link.c +1 -0
  256. data/ext/digest/kangarootwelve/keccak/reference32bits/KeccakP-1600-SnP.h +41 -0
  257. data/ext/digest/kangarootwelve/keccak/reference32bits/KeccakP-1600-reference.h +20 -0
  258. data/ext/digest/kangarootwelve/keccak/reference32bits/KeccakP-1600-reference32BI.c +612 -0
  259. data/ext/digest/kangarootwelve/keccak/reference32bits/KeccakP-1600-times2-SnP.h +45 -0
  260. data/ext/digest/kangarootwelve/keccak/reference32bits/KeccakP-1600-times2-on1.c +37 -0
  261. data/ext/digest/kangarootwelve/keccak/reference32bits/KeccakP-1600-times4-SnP.h +45 -0
  262. data/ext/digest/kangarootwelve/keccak/reference32bits/KeccakP-1600-times4-on1.c +37 -0
  263. data/ext/digest/kangarootwelve/keccak/reference32bits/KeccakP-1600-times8-SnP.h +45 -0
  264. data/ext/digest/kangarootwelve/keccak/reference32bits/KeccakP-1600-times8-on1.c +37 -0
  265. data/ext/digest/kangarootwelve/keccak/reference32bits/KeccakSpongeWidth1600.link.c +1 -0
  266. data/ext/digest/kangarootwelve/keccak/reference32bits/PlSnP-Fallback.inc +287 -0
  267. data/ext/digest/kangarootwelve/keccak/reference32bits/displayIntermediateValues.c +176 -0
  268. data/ext/digest/kangarootwelve/keccak/reference32bits/displayIntermediateValues.h +29 -0
  269. data/ext/digest/kangarootwelve/keccak/reference32bits/ext.link.c +1 -0
  270. data/ext/digest/kangarootwelve/keccak/sandybridge/KangarooTwelve.link.c +1 -0
  271. data/ext/digest/kangarootwelve/keccak/sandybridge/KeccakDuplexWidth1600.link.c +1 -0
  272. data/ext/digest/kangarootwelve/keccak/sandybridge/KeccakP-1600-64.macros +2195 -0
  273. data/ext/digest/kangarootwelve/keccak/sandybridge/KeccakP-1600-SnP.h +49 -0
  274. data/ext/digest/kangarootwelve/keccak/sandybridge/KeccakP-1600-opt64-config.h +8 -0
  275. data/ext/digest/kangarootwelve/keccak/sandybridge/KeccakP-1600-opt64.c +541 -0
  276. data/ext/digest/kangarootwelve/keccak/sandybridge/KeccakP-1600-times2-SIMD128.c +954 -0
  277. data/ext/digest/kangarootwelve/keccak/sandybridge/KeccakP-1600-times2-SnP.h +47 -0
  278. data/ext/digest/kangarootwelve/keccak/sandybridge/KeccakP-1600-times4-SnP.h +45 -0
  279. data/ext/digest/kangarootwelve/keccak/sandybridge/KeccakP-1600-times4-on2.c +38 -0
  280. data/ext/digest/kangarootwelve/keccak/sandybridge/KeccakP-1600-times8-SnP.h +45 -0
  281. data/ext/digest/kangarootwelve/keccak/sandybridge/KeccakP-1600-times8-on2.c +38 -0
  282. data/ext/digest/kangarootwelve/keccak/sandybridge/KeccakP-1600-unrolling.macros +302 -0
  283. data/ext/digest/kangarootwelve/keccak/sandybridge/KeccakSpongeWidth1600.link.c +1 -0
  284. data/ext/digest/kangarootwelve/keccak/sandybridge/PlSnP-Fallback.inc +287 -0
  285. data/ext/digest/kangarootwelve/keccak/sandybridge/SIMD128-config.h +8 -0
  286. data/ext/digest/kangarootwelve/keccak/sandybridge/SnP-Relaned.h +140 -0
  287. data/ext/digest/kangarootwelve/keccak/sandybridge/ext.link.c +1 -0
  288. data/ext/digest/kangarootwelve/keccak/skylakex/KangarooTwelve.link.c +1 -0
  289. data/ext/digest/kangarootwelve/keccak/skylakex/KeccakDuplexWidth1600.link.c +1 -0
  290. data/ext/digest/kangarootwelve/keccak/skylakex/KeccakP-1600-AVX512-config.h +6 -0
  291. data/ext/digest/kangarootwelve/keccak/skylakex/KeccakP-1600-AVX512.c +621 -0
  292. data/ext/digest/kangarootwelve/keccak/skylakex/KeccakP-1600-SnP.h +42 -0
  293. data/ext/digest/kangarootwelve/keccak/skylakex/KeccakP-1600-times2-SIMD512.c +852 -0
  294. data/ext/digest/kangarootwelve/keccak/skylakex/KeccakP-1600-times2-SnP.h +49 -0
  295. data/ext/digest/kangarootwelve/keccak/skylakex/KeccakP-1600-times4-SIMD512.c +883 -0
  296. data/ext/digest/kangarootwelve/keccak/skylakex/KeccakP-1600-times4-SnP.h +49 -0
  297. data/ext/digest/kangarootwelve/keccak/skylakex/KeccakP-1600-times8-SIMD512.c +1473 -0
  298. data/ext/digest/kangarootwelve/keccak/skylakex/KeccakP-1600-times8-SnP.h +53 -0
  299. data/ext/digest/kangarootwelve/keccak/skylakex/KeccakSpongeWidth1600.link.c +1 -0
  300. data/ext/digest/kangarootwelve/keccak/skylakex/SIMD512-2-config.h +7 -0
  301. data/ext/digest/kangarootwelve/keccak/skylakex/SIMD512-4-config.h +7 -0
  302. data/ext/digest/kangarootwelve/keccak/skylakex/SIMD512-config.h +7 -0
  303. data/ext/digest/kangarootwelve/keccak/skylakex/ext.link.c +1 -0
  304. data/ext/digest/kangarootwelve/utils.h +101 -0
  305. data/lib/digest/kangarootwelve/version.rb +2 -2
  306. data/test/test.rb +68 -31
  307. metadata +305 -27
@@ -0,0 +1,47 @@
1
+ /*
2
+ Implementation by Gilles Van Assche, hereby denoted as "the implementer".
3
+
4
+ For more information, feedback or questions, please refer to our website:
5
+ https://keccak.team/
6
+
7
+ To the extent possible under law, the implementer has waived all copyright
8
+ and related or neighboring rights to the source code in this file.
9
+ http://creativecommons.org/publicdomain/zero/1.0/
10
+
11
+ ---
12
+
13
+ Please refer to PlSnP-documentation.h for more details.
14
+ */
15
+
16
+ #ifndef _KeccakP_1600_times2_SnP_h_
17
+ #define _KeccakP_1600_times2_SnP_h_
18
+
19
+ #include "SIMD128-config.h"
20
+
21
+ #define KeccakP1600times2_implementation "128-bit SIMD implementation (" KeccakP1600times2_implementation_config ")"
22
+ #define KeccakP1600times2_statesSizeInBytes 400
23
+ #define KeccakP1600times2_statesAlignment 16
24
+ #define KeccakF1600times2_FastLoop_supported
25
+
26
+ #include <stddef.h>
27
+
28
+ #define KeccakP1600times2_StaticInitialize()
29
+ void KeccakP1600times2_InitializeAll(void *states);
30
+ #define KeccakP1600times2_AddByte(states, instanceIndex, byte, offset) \
31
+ ((unsigned char*)(states))[(instanceIndex)*8 + ((offset)/8)*2*8 + (offset)%8] ^= (byte)
32
+ void KeccakP1600times2_AddBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length);
33
+ void KeccakP1600times2_AddLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset);
34
+ void KeccakP1600times2_OverwriteBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length);
35
+ void KeccakP1600times2_OverwriteLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset);
36
+ void KeccakP1600times2_OverwriteWithZeroes(void *states, unsigned int instanceIndex, unsigned int byteCount);
37
+ void KeccakP1600times2_PermuteAll_4rounds(void *states);
38
+ void KeccakP1600times2_PermuteAll_6rounds(void *states);
39
+ void KeccakP1600times2_PermuteAll_12rounds(void *states);
40
+ void KeccakP1600times2_PermuteAll_24rounds(void *states);
41
+ void KeccakP1600times2_ExtractBytes(const void *states, unsigned int instanceIndex, unsigned char *data, unsigned int offset, unsigned int length);
42
+ void KeccakP1600times2_ExtractLanesAll(const void *states, unsigned char *data, unsigned int laneCount, unsigned int laneOffset);
43
+ void KeccakP1600times2_ExtractAndAddBytes(const void *states, unsigned int instanceIndex, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length);
44
+ void KeccakP1600times2_ExtractAndAddLanesAll(const void *states, const unsigned char *input, unsigned char *output, unsigned int laneCount, unsigned int laneOffset);
45
+ size_t KeccakF1600times2_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, const unsigned char *data, size_t dataByteLen);
46
+
47
+ #endif
@@ -0,0 +1,1303 @@
1
+ /*
2
+ Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer".
3
+
4
+ For more information, feedback or questions, please refer to our website:
5
+ https://keccak.team/
6
+
7
+ To the extent possible under law, the implementer has waived all copyright
8
+ and related or neighboring rights to the source code in this file.
9
+ http://creativecommons.org/publicdomain/zero/1.0/
10
+
11
+ ---
12
+
13
+ This file implements Keccak-p[1600]×4 in a PlSnP-compatible way.
14
+ Please refer to PlSnP-documentation.h for more details.
15
+
16
+ This implementation comes with KeccakP-1600-times4-SnP.h in the same folder.
17
+ Please refer to LowLevel.build for the exact list of other files it must be combined with.
18
+ */
19
+
20
+ #include <stdio.h>
21
+ #include <stdlib.h>
22
+ #include <string.h>
23
+ #include <smmintrin.h>
24
+ #include <wmmintrin.h>
25
+ #include <immintrin.h>
26
+ #include <emmintrin.h>
27
+ #include "align.h"
28
+ #include "KeccakP-1600-times4-SnP.h"
29
+ #include "SIMD256-config.h"
30
+
31
+ #include "brg_endian.h"
32
+ #if (PLATFORM_BYTE_ORDER != IS_LITTLE_ENDIAN)
33
+ #error Expecting a little-endian platform
34
+ #endif
35
+
36
+ typedef unsigned char UINT8;
37
+ typedef unsigned long long int UINT64;
38
+ typedef __m128i V128;
39
+ typedef __m256i V256;
40
+
41
+ //#define UseGatherScatter
42
+
43
/* Index, in 64-bit words, of lane `lanePosition` of state `instanceIndex`
 * inside the interleaved 4-state buffer (4 consecutive words per lane
 * position, one per instance). Both arguments are parenthesized so that
 * expression arguments such as `i & 3` keep their meaning. */
#define laneIndex(instanceIndex, lanePosition) ((lanePosition)*4 + (instanceIndex))
44
+
45
#if defined(KeccakP1600times4_useAVX2)
    /* Thin wrappers around the AVX/AVX2 intrinsics used by this file.
     * LOAD256/STORE256/CONST256 use the aligned forms and therefore need
     * 32-byte aligned operands; the *u variants accept any address. */
    #define ANDnu256(a, b)          _mm256_andnot_si256(a, b)
    #define CONST256(a)             _mm256_load_si256((const V256 *)&(a))
    /* Broadcasts one 64-bit constant to all four 64-bit elements. */
    #define CONST256_64(a)          ((V256)_mm256_broadcast_sd((const double*)&(a)))
    #define LOAD256(a)              _mm256_load_si256((const V256 *)&(a))
    #define LOAD256u(a)             _mm256_loadu_si256((const V256 *)&(a))
    /* Note the argument order of _mm256_set_epi64x: `d` ends up in element 0. */
    #define LOAD4_64(a, b, c, d)    _mm256_set_epi64x((UINT64)(a), (UINT64)(b), (UINT64)(c), (UINT64)(d))
    /* Rotate every 64-bit element of `a` left by `o` bits, result in `d`. */
    #define ROL64in256(d, a, o)     d = _mm256_or_si256(_mm256_slli_epi64(a, o), _mm256_srli_epi64(a, 64-(o)))
    /* Rotations by 8 and 56 bits are whole-byte moves, done as byte shuffles. */
    #define ROL64in256_8(d, a)      d = _mm256_shuffle_epi8(a, CONST256(rho8))
    #define ROL64in256_56(d, a)     d = _mm256_shuffle_epi8(a, CONST256(rho56))
/* Shuffle masks read through CONST256 (an aligned 256-bit load), so they are
 * declared with the same alignment macro as the state and the round-constant
 * table (presumably 32 bytes -- confirm in KeccakP-1600-times4-SnP.h). */
static ALIGN(KeccakP1600times4_statesAlignment) const UINT64 rho8[4] = {0x0605040302010007, 0x0E0D0C0B0A09080F, 0x1615141312111017, 0x1E1D1C1B1A19181F};
static ALIGN(KeccakP1600times4_statesAlignment) const UINT64 rho56[4] = {0x0007060504030201, 0x080F0E0D0C0B0A09, 0x1017161514131211, 0x181F1E1D1C1B1A19};
    #define STORE256(a, b)          _mm256_store_si256((V256 *)&(a), b)
    #define STORE256u(a, b)         _mm256_storeu_si256((V256 *)&(a), b)
    /* Integer variant of the split store: operands are V128 pointers and a V256. */
    #define STORE2_128(ah, al, v)   _mm256_storeu2_m128i((V128*)&(ah), (V128*)&(al), v)
    #define XOR256(a, b)            _mm256_xor_si256(a, b)
    #define XOReq256(a, b)          a = _mm256_xor_si256(a, b)
    #define UNPACKL( a, b )         _mm256_unpacklo_epi64((a), (b))
    #define UNPACKH( a, b )         _mm256_unpackhi_epi64((a), (b))
    #define PERM128( a, b, c )      (V256)_mm256_permute2f128_ps((__m256)(a), (__m256)(b), c)
    #define SHUFFLE64( a, b, c )    (V256)_mm256_shuffle_pd((__m256d)(a), (__m256d)(b), c)

    /* Both macros transpose the 4x4 matrix of 64-bit words held in
     * lanes0..lanes3, using lanesL01/lanesH01/lanesL23/lanesH23 as scratch.
     * All eight names must be V256 variables in the enclosing scope.
     * UNINTLEAVE: interleaved state vectors -> 4 consecutive lanes per instance.
     * INTLEAVE:   4 consecutive lanes per instance -> interleaved state vectors. */
    #define UNINTLEAVE()            lanesL01 = UNPACKL( lanes0, lanes1 ),                   \
                                    lanesH01 = UNPACKH( lanes0, lanes1 ),                   \
                                    lanesL23 = UNPACKL( lanes2, lanes3 ),                   \
                                    lanesH23 = UNPACKH( lanes2, lanes3 ),                   \
                                    lanes0 = PERM128( lanesL01, lanesL23, 0x20 ),           \
                                    lanes2 = PERM128( lanesL01, lanesL23, 0x31 ),           \
                                    lanes1 = PERM128( lanesH01, lanesH23, 0x20 ),           \
                                    lanes3 = PERM128( lanesH01, lanesH23, 0x31 )

    #define INTLEAVE()              lanesL01 = PERM128( lanes0, lanes2, 0x20 ),             \
                                    lanesH01 = PERM128( lanes1, lanes3, 0x20 ),             \
                                    lanesL23 = PERM128( lanes0, lanes2, 0x31 ),             \
                                    lanesH23 = PERM128( lanes1, lanes3, 0x31 ),             \
                                    lanes0 = SHUFFLE64( lanesL01, lanesH01, 0x00 ),         \
                                    lanes1 = SHUFFLE64( lanesL01, lanesH01, 0x0F ),         \
                                    lanes2 = SHUFFLE64( lanesL23, lanesH23, 0x00 ),         \
                                    lanes3 = SHUFFLE64( lanesL23, lanesH23, 0x0F )

#endif
86
+
87
+ #define SnP_laneLengthInBytes 8
88
+
89
+ void KeccakP1600times4_InitializeAll(void *states)
90
+ {
91
+ memset(states, 0, KeccakP1600times4_statesSizeInBytes);
92
+ }
93
+
94
+ void KeccakP1600times4_AddBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length)
95
+ {
96
+ unsigned int sizeLeft = length;
97
+ unsigned int lanePosition = offset/SnP_laneLengthInBytes;
98
+ unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
99
+ const unsigned char *curData = data;
100
+ UINT64 *statesAsLanes = (UINT64 *)states;
101
+
102
+ if ((sizeLeft > 0) && (offsetInLane != 0)) {
103
+ unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
104
+ UINT64 lane = 0;
105
+ if (bytesInLane > sizeLeft)
106
+ bytesInLane = sizeLeft;
107
+ memcpy((unsigned char*)&lane + offsetInLane, curData, bytesInLane);
108
+ statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane;
109
+ sizeLeft -= bytesInLane;
110
+ lanePosition++;
111
+ curData += bytesInLane;
112
+ }
113
+
114
+ while(sizeLeft >= SnP_laneLengthInBytes) {
115
+ UINT64 lane = *((const UINT64*)curData);
116
+ statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane;
117
+ sizeLeft -= SnP_laneLengthInBytes;
118
+ lanePosition++;
119
+ curData += SnP_laneLengthInBytes;
120
+ }
121
+
122
+ if (sizeLeft > 0) {
123
+ UINT64 lane = 0;
124
+ memcpy(&lane, curData, sizeLeft);
125
+ statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane;
126
+ }
127
+ }
128
+
129
+ void KeccakP1600times4_AddLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
130
+ {
131
+ V256 *stateAsLanes = (V256 *)states;
132
+ unsigned int i;
133
+ const UINT64 *curData0 = (const UINT64 *)data;
134
+ const UINT64 *curData1 = (const UINT64 *)(data+laneOffset*SnP_laneLengthInBytes);
135
+ const UINT64 *curData2 = (const UINT64 *)(data+laneOffset*2*SnP_laneLengthInBytes);
136
+ const UINT64 *curData3 = (const UINT64 *)(data+laneOffset*3*SnP_laneLengthInBytes);
137
+ V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;
138
+
139
+ #define Xor_In( argIndex ) XOReq256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))
140
+
141
+ #define Xor_In4( argIndex ) lanes0 = LOAD256u( curData0[argIndex]),\
142
+ lanes1 = LOAD256u( curData1[argIndex]),\
143
+ lanes2 = LOAD256u( curData2[argIndex]),\
144
+ lanes3 = LOAD256u( curData3[argIndex]),\
145
+ INTLEAVE(),\
146
+ XOReq256( stateAsLanes[argIndex+0], lanes0 ),\
147
+ XOReq256( stateAsLanes[argIndex+1], lanes1 ),\
148
+ XOReq256( stateAsLanes[argIndex+2], lanes2 ),\
149
+ XOReq256( stateAsLanes[argIndex+3], lanes3 )
150
+
151
+ if ( laneCount >= 16 ) {
152
+ Xor_In4( 0 );
153
+ Xor_In4( 4 );
154
+ Xor_In4( 8 );
155
+ Xor_In4( 12 );
156
+ if ( laneCount >= 20 ) {
157
+ Xor_In4( 16 );
158
+ for(i=20; i<laneCount; i++)
159
+ Xor_In( i );
160
+ }
161
+ else {
162
+ for(i=16; i<laneCount; i++)
163
+ Xor_In( i );
164
+ }
165
+ }
166
+ else {
167
+ for(i=0; i<laneCount; i++)
168
+ Xor_In( i );
169
+ }
170
+ #undef Xor_In
171
+ #undef Xor_In4
172
+ }
173
+
174
+ void KeccakP1600times4_OverwriteBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length)
175
+ {
176
+ unsigned int sizeLeft = length;
177
+ unsigned int lanePosition = offset/SnP_laneLengthInBytes;
178
+ unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
179
+ const unsigned char *curData = data;
180
+ UINT64 *statesAsLanes = (UINT64 *)states;
181
+
182
+ if ((sizeLeft > 0) && (offsetInLane != 0)) {
183
+ unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
184
+ if (bytesInLane > sizeLeft)
185
+ bytesInLane = sizeLeft;
186
+ memcpy( ((unsigned char *)&statesAsLanes[laneIndex(instanceIndex, lanePosition)]) + offsetInLane, curData, bytesInLane);
187
+ sizeLeft -= bytesInLane;
188
+ lanePosition++;
189
+ curData += bytesInLane;
190
+ }
191
+
192
+ while(sizeLeft >= SnP_laneLengthInBytes) {
193
+ UINT64 lane = *((const UINT64*)curData);
194
+ statesAsLanes[laneIndex(instanceIndex, lanePosition)] = lane;
195
+ sizeLeft -= SnP_laneLengthInBytes;
196
+ lanePosition++;
197
+ curData += SnP_laneLengthInBytes;
198
+ }
199
+
200
+ if (sizeLeft > 0) {
201
+ memcpy(&statesAsLanes[laneIndex(instanceIndex, lanePosition)], curData, sizeLeft);
202
+ }
203
+ }
204
+
205
+ void KeccakP1600times4_OverwriteLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
206
+ {
207
+ V256 *stateAsLanes = (V256 *)states;
208
+ unsigned int i;
209
+ const UINT64 *curData0 = (const UINT64 *)data;
210
+ const UINT64 *curData1 = (const UINT64 *)(data+laneOffset*SnP_laneLengthInBytes);
211
+ const UINT64 *curData2 = (const UINT64 *)(data+laneOffset*2*SnP_laneLengthInBytes);
212
+ const UINT64 *curData3 = (const UINT64 *)(data+laneOffset*3*SnP_laneLengthInBytes);
213
+ V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;
214
+
215
+ #define OverWr( argIndex ) STORE256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))
216
+
217
+ #define OverWr4( argIndex ) lanes0 = LOAD256u( curData0[argIndex]),\
218
+ lanes1 = LOAD256u( curData1[argIndex]),\
219
+ lanes2 = LOAD256u( curData2[argIndex]),\
220
+ lanes3 = LOAD256u( curData3[argIndex]),\
221
+ INTLEAVE(),\
222
+ STORE256( stateAsLanes[argIndex+0], lanes0 ),\
223
+ STORE256( stateAsLanes[argIndex+1], lanes1 ),\
224
+ STORE256( stateAsLanes[argIndex+2], lanes2 ),\
225
+ STORE256( stateAsLanes[argIndex+3], lanes3 )
226
+
227
+ if ( laneCount >= 16 ) {
228
+ OverWr4( 0 );
229
+ OverWr4( 4 );
230
+ OverWr4( 8 );
231
+ OverWr4( 12 );
232
+ if ( laneCount >= 20 ) {
233
+ OverWr4( 16 );
234
+ for(i=20; i<laneCount; i++)
235
+ OverWr( i );
236
+ }
237
+ else {
238
+ for(i=16; i<laneCount; i++)
239
+ OverWr( i );
240
+ }
241
+ }
242
+ else {
243
+ for(i=0; i<laneCount; i++)
244
+ OverWr( i );
245
+ }
246
+ #undef OverWr
247
+ #undef OverWr4
248
+ }
249
+
250
+ void KeccakP1600times4_OverwriteWithZeroes(void *states, unsigned int instanceIndex, unsigned int byteCount)
251
+ {
252
+ unsigned int sizeLeft = byteCount;
253
+ unsigned int lanePosition = 0;
254
+ UINT64 *statesAsLanes = (UINT64 *)states;
255
+
256
+ while(sizeLeft >= SnP_laneLengthInBytes) {
257
+ statesAsLanes[laneIndex(instanceIndex, lanePosition)] = 0;
258
+ sizeLeft -= SnP_laneLengthInBytes;
259
+ lanePosition++;
260
+ }
261
+
262
+ if (sizeLeft > 0) {
263
+ memset(&statesAsLanes[laneIndex(instanceIndex, lanePosition)], 0, sizeLeft);
264
+ }
265
+ }
266
+
267
+ void KeccakP1600times4_ExtractBytes(const void *states, unsigned int instanceIndex, unsigned char *data, unsigned int offset, unsigned int length)
268
+ {
269
+ unsigned int sizeLeft = length;
270
+ unsigned int lanePosition = offset/SnP_laneLengthInBytes;
271
+ unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
272
+ unsigned char *curData = data;
273
+ const UINT64 *statesAsLanes = (const UINT64 *)states;
274
+
275
+ if ((sizeLeft > 0) && (offsetInLane != 0)) {
276
+ unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
277
+ if (bytesInLane > sizeLeft)
278
+ bytesInLane = sizeLeft;
279
+ memcpy( curData, ((unsigned char *)&statesAsLanes[laneIndex(instanceIndex, lanePosition)]) + offsetInLane, bytesInLane);
280
+ sizeLeft -= bytesInLane;
281
+ lanePosition++;
282
+ curData += bytesInLane;
283
+ }
284
+
285
+ while(sizeLeft >= SnP_laneLengthInBytes) {
286
+ *(UINT64*)curData = statesAsLanes[laneIndex(instanceIndex, lanePosition)];
287
+ sizeLeft -= SnP_laneLengthInBytes;
288
+ lanePosition++;
289
+ curData += SnP_laneLengthInBytes;
290
+ }
291
+
292
+ if (sizeLeft > 0) {
293
+ memcpy( curData, &statesAsLanes[laneIndex(instanceIndex, lanePosition)], sizeLeft);
294
+ }
295
+ }
296
+
297
+ void KeccakP1600times4_ExtractLanesAll(const void *states, unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
298
+ {
299
+ UINT64 *curData0 = (UINT64 *)data;
300
+ UINT64 *curData1 = (UINT64 *)(data+laneOffset*1*SnP_laneLengthInBytes);
301
+ UINT64 *curData2 = (UINT64 *)(data+laneOffset*2*SnP_laneLengthInBytes);
302
+ UINT64 *curData3 = (UINT64 *)(data+laneOffset*3*SnP_laneLengthInBytes);
303
+
304
+ const V256 *stateAsLanes = (const V256 *)states;
305
+ const UINT64 *stateAsLanes64 = (const UINT64*)states;
306
+ V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;
307
+ unsigned int i;
308
+
309
+ #define Extr( argIndex ) curData0[argIndex] = stateAsLanes64[4*(argIndex)], \
310
+ curData1[argIndex] = stateAsLanes64[4*(argIndex)+1], \
311
+ curData2[argIndex] = stateAsLanes64[4*(argIndex)+2], \
312
+ curData3[argIndex] = stateAsLanes64[4*(argIndex)+3]
313
+
314
+ #define Extr4( argIndex ) lanes0 = LOAD256( stateAsLanes[argIndex+0] ), \
315
+ lanes1 = LOAD256( stateAsLanes[argIndex+1] ), \
316
+ lanes2 = LOAD256( stateAsLanes[argIndex+2] ), \
317
+ lanes3 = LOAD256( stateAsLanes[argIndex+3] ), \
318
+ UNINTLEAVE(), \
319
+ STORE256u( curData0[argIndex], lanes0 ), \
320
+ STORE256u( curData1[argIndex], lanes1 ), \
321
+ STORE256u( curData2[argIndex], lanes2 ), \
322
+ STORE256u( curData3[argIndex], lanes3 )
323
+
324
+ if ( laneCount >= 16 ) {
325
+ Extr4( 0 );
326
+ Extr4( 4 );
327
+ Extr4( 8 );
328
+ Extr4( 12 );
329
+ if ( laneCount >= 20 ) {
330
+ Extr4( 16 );
331
+ for(i=20; i<laneCount; i++)
332
+ Extr( i );
333
+ }
334
+ else {
335
+ for(i=16; i<laneCount; i++)
336
+ Extr( i );
337
+ }
338
+ }
339
+ else {
340
+ for(i=0; i<laneCount; i++)
341
+ Extr( i );
342
+ }
343
+ #undef Extr
344
+ #undef Extr4
345
+ }
346
+
347
+ void KeccakP1600times4_ExtractAndAddBytes(const void *states, unsigned int instanceIndex, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length)
348
+ {
349
+ unsigned int sizeLeft = length;
350
+ unsigned int lanePosition = offset/SnP_laneLengthInBytes;
351
+ unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
352
+ const unsigned char *curInput = input;
353
+ unsigned char *curOutput = output;
354
+ const UINT64 *statesAsLanes = (const UINT64 *)states;
355
+
356
+ if ((sizeLeft > 0) && (offsetInLane != 0)) {
357
+ unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
358
+ UINT64 lane = statesAsLanes[laneIndex(instanceIndex, lanePosition)] >> (8 * offsetInLane);
359
+ if (bytesInLane > sizeLeft)
360
+ bytesInLane = sizeLeft;
361
+ sizeLeft -= bytesInLane;
362
+ do {
363
+ *(curOutput++) = *(curInput++) ^ (unsigned char)lane;
364
+ lane >>= 8;
365
+ } while ( --bytesInLane != 0);
366
+ lanePosition++;
367
+ }
368
+
369
+ while(sizeLeft >= SnP_laneLengthInBytes) {
370
+ *((UINT64*)curOutput) = *((UINT64*)curInput) ^ statesAsLanes[laneIndex(instanceIndex, lanePosition)];
371
+ sizeLeft -= SnP_laneLengthInBytes;
372
+ lanePosition++;
373
+ curInput += SnP_laneLengthInBytes;
374
+ curOutput += SnP_laneLengthInBytes;
375
+ }
376
+
377
+ if (sizeLeft != 0) {
378
+ UINT64 lane = statesAsLanes[laneIndex(instanceIndex, lanePosition)];
379
+ do {
380
+ *(curOutput++) = *(curInput++) ^ (unsigned char)lane;
381
+ lane >>= 8;
382
+ } while ( --sizeLeft != 0);
383
+ }
384
+ }
385
+
386
+ void KeccakP1600times4_ExtractAndAddLanesAll(const void *states, const unsigned char *input, unsigned char *output, unsigned int laneCount, unsigned int laneOffset)
387
+ {
388
+ const UINT64 *curInput0 = (UINT64 *)input;
389
+ const UINT64 *curInput1 = (UINT64 *)(input+laneOffset*1*SnP_laneLengthInBytes);
390
+ const UINT64 *curInput2 = (UINT64 *)(input+laneOffset*2*SnP_laneLengthInBytes);
391
+ const UINT64 *curInput3 = (UINT64 *)(input+laneOffset*3*SnP_laneLengthInBytes);
392
+ UINT64 *curOutput0 = (UINT64 *)output;
393
+ UINT64 *curOutput1 = (UINT64 *)(output+laneOffset*1*SnP_laneLengthInBytes);
394
+ UINT64 *curOutput2 = (UINT64 *)(output+laneOffset*2*SnP_laneLengthInBytes);
395
+ UINT64 *curOutput3 = (UINT64 *)(output+laneOffset*3*SnP_laneLengthInBytes);
396
+
397
+ const V256 *stateAsLanes = (const V256 *)states;
398
+ const UINT64 *stateAsLanes64 = (const UINT64*)states;
399
+ V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;
400
+ unsigned int i;
401
+
402
+ #define ExtrXor( argIndex ) \
403
+ curOutput0[argIndex] = curInput0[argIndex] ^ stateAsLanes64[4*(argIndex)],\
404
+ curOutput1[argIndex] = curInput1[argIndex] ^ stateAsLanes64[4*(argIndex)+1],\
405
+ curOutput2[argIndex] = curInput2[argIndex] ^ stateAsLanes64[4*(argIndex)+2],\
406
+ curOutput3[argIndex] = curInput3[argIndex] ^ stateAsLanes64[4*(argIndex)+3]
407
+
408
+ #define ExtrXor4( argIndex ) \
409
+ lanes0 = LOAD256( stateAsLanes[argIndex+0] ),\
410
+ lanes1 = LOAD256( stateAsLanes[argIndex+1] ),\
411
+ lanes2 = LOAD256( stateAsLanes[argIndex+2] ),\
412
+ lanes3 = LOAD256( stateAsLanes[argIndex+3] ),\
413
+ UNINTLEAVE(),\
414
+ lanesL01 = LOAD256u( curInput0[argIndex]),\
415
+ lanesH01 = LOAD256u( curInput1[argIndex]),\
416
+ lanesL23 = LOAD256u( curInput2[argIndex]),\
417
+ lanesH23 = LOAD256u( curInput3[argIndex]),\
418
+ XOReq256( lanes0, lanesL01 ),\
419
+ XOReq256( lanes1, lanesH01 ),\
420
+ XOReq256( lanes2, lanesL23 ),\
421
+ XOReq256( lanes3, lanesH23 ),\
422
+ STORE256u( curOutput0[argIndex], lanes0 ),\
423
+ STORE256u( curOutput1[argIndex], lanes1 ),\
424
+ STORE256u( curOutput2[argIndex], lanes2 ),\
425
+ STORE256u( curOutput3[argIndex], lanes3 )
426
+
427
+ if ( laneCount >= 16 ) {
428
+ ExtrXor4( 0 );
429
+ ExtrXor4( 4 );
430
+ ExtrXor4( 8 );
431
+ ExtrXor4( 12 );
432
+ if ( laneCount >= 20 ) {
433
+ ExtrXor4( 16 );
434
+ for(i=20; i<laneCount; i++)
435
+ ExtrXor( i );
436
+ }
437
+ else {
438
+ for(i=16; i<laneCount; i++)
439
+ ExtrXor( i );
440
+ }
441
+ }
442
+ else {
443
+ for(i=0; i<laneCount; i++)
444
+ ExtrXor( i );
445
+ }
446
+ #undef ExtrXor
447
+ #undef ExtrXor4
448
+ }
449
+
450
/* Declares every V256 local the round macros work on:
 *   A.. and E..  the two alternating copies of the 25 state lanes,
 *   B..          the 25 rotated lanes of the current round,
 *   C., C.1      the 5 column parities and their 1-bit rotations,
 *   D.           the 5 theta correction terms.
 * The last line deliberately has no line-continuation, so the macro does not
 * depend on the line that follows it. */
#define declareABCDE \
    V256 Aba, Abe, Abi, Abo, Abu; \
    V256 Aga, Age, Agi, Ago, Agu; \
    V256 Aka, Ake, Aki, Ako, Aku; \
    V256 Ama, Ame, Ami, Amo, Amu; \
    V256 Asa, Ase, Asi, Aso, Asu; \
    V256 Bba, Bbe, Bbi, Bbo, Bbu; \
    V256 Bga, Bge, Bgi, Bgo, Bgu; \
    V256 Bka, Bke, Bki, Bko, Bku; \
    V256 Bma, Bme, Bmi, Bmo, Bmu; \
    V256 Bsa, Bse, Bsi, Bso, Bsu; \
    V256 Ca, Ce, Ci, Co, Cu; \
    V256 Ca1, Ce1, Ci1, Co1, Cu1; \
    V256 Da, De, Di, Do, Du; \
    V256 Eba, Ebe, Ebi, Ebo, Ebu; \
    V256 Ega, Ege, Egi, Ego, Egu; \
    V256 Eka, Eke, Eki, Eko, Eku; \
    V256 Ema, Eme, Emi, Emo, Emu; \
    V256 Esa, Ese, Esi, Eso, Esu;
469
+
470
/* Computes the five column parities Ca..Cu as the XOR of the five A lanes in
 * each column (b, g, k, m, s rows). Needed once before the first round; the
 * thetaRhoPiChiIotaPrepareTheta macro refreshes them for following rounds.
 * The last line has no line-continuation, so the macro ends with it. */
#define prepareTheta \
    Ca = XOR256(Aba, XOR256(Aga, XOR256(Aka, XOR256(Ama, Asa)))); \
    Ce = XOR256(Abe, XOR256(Age, XOR256(Ake, XOR256(Ame, Ase)))); \
    Ci = XOR256(Abi, XOR256(Agi, XOR256(Aki, XOR256(Ami, Asi)))); \
    Co = XOR256(Abo, XOR256(Ago, XOR256(Ako, XOR256(Amo, Aso)))); \
    Cu = XOR256(Abu, XOR256(Agu, XOR256(Aku, XOR256(Amu, Asu))));
476
+
477
+ /* --- Theta Rho Pi Chi Iota Prepare-theta */
478
+ /* --- each V256 variable holds the same 64-bit lane of the 4 interleaved states */
479
+ #define thetaRhoPiChiIotaPrepareTheta(i, A, E) \
480
+ ROL64in256(Ce1, Ce, 1); \
481
+ Da = XOR256(Cu, Ce1); \
482
+ ROL64in256(Ci1, Ci, 1); \
483
+ De = XOR256(Ca, Ci1); \
484
+ ROL64in256(Co1, Co, 1); \
485
+ Di = XOR256(Ce, Co1); \
486
+ ROL64in256(Cu1, Cu, 1); \
487
+ Do = XOR256(Ci, Cu1); \
488
+ ROL64in256(Ca1, Ca, 1); \
489
+ Du = XOR256(Co, Ca1); \
490
+ \
491
+ XOReq256(A##ba, Da); \
492
+ Bba = A##ba; \
493
+ XOReq256(A##ge, De); \
494
+ ROL64in256(Bbe, A##ge, 44); \
495
+ XOReq256(A##ki, Di); \
496
+ ROL64in256(Bbi, A##ki, 43); \
497
+ E##ba = XOR256(Bba, ANDnu256(Bbe, Bbi)); \
498
+ XOReq256(E##ba, CONST256_64(KeccakF1600RoundConstants[i])); \
499
+ Ca = E##ba; \
500
+ XOReq256(A##mo, Do); \
501
+ ROL64in256(Bbo, A##mo, 21); \
502
+ E##be = XOR256(Bbe, ANDnu256(Bbi, Bbo)); \
503
+ Ce = E##be; \
504
+ XOReq256(A##su, Du); \
505
+ ROL64in256(Bbu, A##su, 14); \
506
+ E##bi = XOR256(Bbi, ANDnu256(Bbo, Bbu)); \
507
+ Ci = E##bi; \
508
+ E##bo = XOR256(Bbo, ANDnu256(Bbu, Bba)); \
509
+ Co = E##bo; \
510
+ E##bu = XOR256(Bbu, ANDnu256(Bba, Bbe)); \
511
+ Cu = E##bu; \
512
+ \
513
+ XOReq256(A##bo, Do); \
514
+ ROL64in256(Bga, A##bo, 28); \
515
+ XOReq256(A##gu, Du); \
516
+ ROL64in256(Bge, A##gu, 20); \
517
+ XOReq256(A##ka, Da); \
518
+ ROL64in256(Bgi, A##ka, 3); \
519
+ E##ga = XOR256(Bga, ANDnu256(Bge, Bgi)); \
520
+ XOReq256(Ca, E##ga); \
521
+ XOReq256(A##me, De); \
522
+ ROL64in256(Bgo, A##me, 45); \
523
+ E##ge = XOR256(Bge, ANDnu256(Bgi, Bgo)); \
524
+ XOReq256(Ce, E##ge); \
525
+ XOReq256(A##si, Di); \
526
+ ROL64in256(Bgu, A##si, 61); \
527
+ E##gi = XOR256(Bgi, ANDnu256(Bgo, Bgu)); \
528
+ XOReq256(Ci, E##gi); \
529
+ E##go = XOR256(Bgo, ANDnu256(Bgu, Bga)); \
530
+ XOReq256(Co, E##go); \
531
+ E##gu = XOR256(Bgu, ANDnu256(Bga, Bge)); \
532
+ XOReq256(Cu, E##gu); \
533
+ \
534
+ XOReq256(A##be, De); \
535
+ ROL64in256(Bka, A##be, 1); \
536
+ XOReq256(A##gi, Di); \
537
+ ROL64in256(Bke, A##gi, 6); \
538
+ XOReq256(A##ko, Do); \
539
+ ROL64in256(Bki, A##ko, 25); \
540
+ E##ka = XOR256(Bka, ANDnu256(Bke, Bki)); \
541
+ XOReq256(Ca, E##ka); \
542
+ XOReq256(A##mu, Du); \
543
+ ROL64in256_8(Bko, A##mu); \
544
+ E##ke = XOR256(Bke, ANDnu256(Bki, Bko)); \
545
+ XOReq256(Ce, E##ke); \
546
+ XOReq256(A##sa, Da); \
547
+ ROL64in256(Bku, A##sa, 18); \
548
+ E##ki = XOR256(Bki, ANDnu256(Bko, Bku)); \
549
+ XOReq256(Ci, E##ki); \
550
+ E##ko = XOR256(Bko, ANDnu256(Bku, Bka)); \
551
+ XOReq256(Co, E##ko); \
552
+ E##ku = XOR256(Bku, ANDnu256(Bka, Bke)); \
553
+ XOReq256(Cu, E##ku); \
554
+ \
555
+ XOReq256(A##bu, Du); \
556
+ ROL64in256(Bma, A##bu, 27); \
557
+ XOReq256(A##ga, Da); \
558
+ ROL64in256(Bme, A##ga, 36); \
559
+ XOReq256(A##ke, De); \
560
+ ROL64in256(Bmi, A##ke, 10); \
561
+ E##ma = XOR256(Bma, ANDnu256(Bme, Bmi)); \
562
+ XOReq256(Ca, E##ma); \
563
+ XOReq256(A##mi, Di); \
564
+ ROL64in256(Bmo, A##mi, 15); \
565
+ E##me = XOR256(Bme, ANDnu256(Bmi, Bmo)); \
566
+ XOReq256(Ce, E##me); \
567
+ XOReq256(A##so, Do); \
568
+ ROL64in256_56(Bmu, A##so); \
569
+ E##mi = XOR256(Bmi, ANDnu256(Bmo, Bmu)); \
570
+ XOReq256(Ci, E##mi); \
571
+ E##mo = XOR256(Bmo, ANDnu256(Bmu, Bma)); \
572
+ XOReq256(Co, E##mo); \
573
+ E##mu = XOR256(Bmu, ANDnu256(Bma, Bme)); \
574
+ XOReq256(Cu, E##mu); \
575
+ \
576
+ XOReq256(A##bi, Di); \
577
+ ROL64in256(Bsa, A##bi, 62); \
578
+ XOReq256(A##go, Do); \
579
+ ROL64in256(Bse, A##go, 55); \
580
+ XOReq256(A##ku, Du); \
581
+ ROL64in256(Bsi, A##ku, 39); \
582
+ E##sa = XOR256(Bsa, ANDnu256(Bse, Bsi)); \
583
+ XOReq256(Ca, E##sa); \
584
+ XOReq256(A##ma, Da); \
585
+ ROL64in256(Bso, A##ma, 41); \
586
+ E##se = XOR256(Bse, ANDnu256(Bsi, Bso)); \
587
+ XOReq256(Ce, E##se); \
588
+ XOReq256(A##se, De); \
589
+ ROL64in256(Bsu, A##se, 2); \
590
+ E##si = XOR256(Bsi, ANDnu256(Bso, Bsu)); \
591
+ XOReq256(Ci, E##si); \
592
+ E##so = XOR256(Bso, ANDnu256(Bsu, Bsa)); \
593
+ XOReq256(Co, E##so); \
594
+ E##su = XOR256(Bsu, ANDnu256(Bsa, Bse)); \
595
+ XOReq256(Cu, E##su); \
596
+ \
597
+
598
+ /* --- Theta Rho Pi Chi Iota */
599
+ /* --- each V256 variable holds the same 64-bit lane of the 4 interleaved states */
600
+ #define thetaRhoPiChiIota(i, A, E) \
601
+ ROL64in256(Ce1, Ce, 1); \
602
+ Da = XOR256(Cu, Ce1); \
603
+ ROL64in256(Ci1, Ci, 1); \
604
+ De = XOR256(Ca, Ci1); \
605
+ ROL64in256(Co1, Co, 1); \
606
+ Di = XOR256(Ce, Co1); \
607
+ ROL64in256(Cu1, Cu, 1); \
608
+ Do = XOR256(Ci, Cu1); \
609
+ ROL64in256(Ca1, Ca, 1); \
610
+ Du = XOR256(Co, Ca1); \
611
+ \
612
+ XOReq256(A##ba, Da); \
613
+ Bba = A##ba; \
614
+ XOReq256(A##ge, De); \
615
+ ROL64in256(Bbe, A##ge, 44); \
616
+ XOReq256(A##ki, Di); \
617
+ ROL64in256(Bbi, A##ki, 43); \
618
+ E##ba = XOR256(Bba, ANDnu256(Bbe, Bbi)); \
619
+ XOReq256(E##ba, CONST256_64(KeccakF1600RoundConstants[i])); \
620
+ XOReq256(A##mo, Do); \
621
+ ROL64in256(Bbo, A##mo, 21); \
622
+ E##be = XOR256(Bbe, ANDnu256(Bbi, Bbo)); \
623
+ XOReq256(A##su, Du); \
624
+ ROL64in256(Bbu, A##su, 14); \
625
+ E##bi = XOR256(Bbi, ANDnu256(Bbo, Bbu)); \
626
+ E##bo = XOR256(Bbo, ANDnu256(Bbu, Bba)); \
627
+ E##bu = XOR256(Bbu, ANDnu256(Bba, Bbe)); \
628
+ \
629
+ XOReq256(A##bo, Do); \
630
+ ROL64in256(Bga, A##bo, 28); \
631
+ XOReq256(A##gu, Du); \
632
+ ROL64in256(Bge, A##gu, 20); \
633
+ XOReq256(A##ka, Da); \
634
+ ROL64in256(Bgi, A##ka, 3); \
635
+ E##ga = XOR256(Bga, ANDnu256(Bge, Bgi)); \
636
+ XOReq256(A##me, De); \
637
+ ROL64in256(Bgo, A##me, 45); \
638
+ E##ge = XOR256(Bge, ANDnu256(Bgi, Bgo)); \
639
+ XOReq256(A##si, Di); \
640
+ ROL64in256(Bgu, A##si, 61); \
641
+ E##gi = XOR256(Bgi, ANDnu256(Bgo, Bgu)); \
642
+ E##go = XOR256(Bgo, ANDnu256(Bgu, Bga)); \
643
+ E##gu = XOR256(Bgu, ANDnu256(Bga, Bge)); \
644
+ \
645
+ XOReq256(A##be, De); \
646
+ ROL64in256(Bka, A##be, 1); \
647
+ XOReq256(A##gi, Di); \
648
+ ROL64in256(Bke, A##gi, 6); \
649
+ XOReq256(A##ko, Do); \
650
+ ROL64in256(Bki, A##ko, 25); \
651
+ E##ka = XOR256(Bka, ANDnu256(Bke, Bki)); \
652
+ XOReq256(A##mu, Du); \
653
+ ROL64in256_8(Bko, A##mu); \
654
+ E##ke = XOR256(Bke, ANDnu256(Bki, Bko)); \
655
+ XOReq256(A##sa, Da); \
656
+ ROL64in256(Bku, A##sa, 18); \
657
+ E##ki = XOR256(Bki, ANDnu256(Bko, Bku)); \
658
+ E##ko = XOR256(Bko, ANDnu256(Bku, Bka)); \
659
+ E##ku = XOR256(Bku, ANDnu256(Bka, Bke)); \
660
+ \
661
+ XOReq256(A##bu, Du); \
662
+ ROL64in256(Bma, A##bu, 27); \
663
+ XOReq256(A##ga, Da); \
664
+ ROL64in256(Bme, A##ga, 36); \
665
+ XOReq256(A##ke, De); \
666
+ ROL64in256(Bmi, A##ke, 10); \
667
+ E##ma = XOR256(Bma, ANDnu256(Bme, Bmi)); \
668
+ XOReq256(A##mi, Di); \
669
+ ROL64in256(Bmo, A##mi, 15); \
670
+ E##me = XOR256(Bme, ANDnu256(Bmi, Bmo)); \
671
+ XOReq256(A##so, Do); \
672
+ ROL64in256_56(Bmu, A##so); \
673
+ E##mi = XOR256(Bmi, ANDnu256(Bmo, Bmu)); \
674
+ E##mo = XOR256(Bmo, ANDnu256(Bmu, Bma)); \
675
+ E##mu = XOR256(Bmu, ANDnu256(Bma, Bme)); \
676
+ \
677
+ XOReq256(A##bi, Di); \
678
+ ROL64in256(Bsa, A##bi, 62); \
679
+ XOReq256(A##go, Do); \
680
+ ROL64in256(Bse, A##go, 55); \
681
+ XOReq256(A##ku, Du); \
682
+ ROL64in256(Bsi, A##ku, 39); \
683
+ E##sa = XOR256(Bsa, ANDnu256(Bse, Bsi)); \
684
+ XOReq256(A##ma, Da); \
685
+ ROL64in256(Bso, A##ma, 41); \
686
+ E##se = XOR256(Bse, ANDnu256(Bsi, Bso)); \
687
+ XOReq256(A##se, De); \
688
+ ROL64in256(Bsu, A##se, 2); \
689
+ E##si = XOR256(Bsi, ANDnu256(Bso, Bsu)); \
690
+ E##so = XOR256(Bso, ANDnu256(Bsu, Bsa)); \
691
+ E##su = XOR256(Bsu, ANDnu256(Bsa, Bse)); \
692
+ \
693
+
694
/* The 24 round constants, indexed by round number. The round macros
 * broadcast entry [i] to all four 64-bit elements (CONST256_64) and XOR it
 * into lane `ba` as the last step of the first row's computation. */
static ALIGN(KeccakP1600times4_statesAlignment) const UINT64 KeccakF1600RoundConstants[24] = {
    0x0000000000000001ULL,
    0x0000000000008082ULL,
    0x800000000000808aULL,
    0x8000000080008000ULL,
    0x000000000000808bULL,
    0x0000000080000001ULL,
    0x8000000080008081ULL,
    0x8000000000008009ULL,
    0x000000000000008aULL,
    0x0000000000000088ULL,
    0x0000000080008009ULL,
    0x000000008000000aULL,
    0x000000008000808bULL,
    0x800000000000008bULL,
    0x8000000000008089ULL,
    0x8000000000008003ULL,
    0x8000000000008002ULL,
    0x8000000000000080ULL,
    0x000000000000800aULL,
    0x800000008000000aULL,
    0x8000000080008081ULL,
    0x8000000000008080ULL,
    0x0000000080000001ULL,
    0x8000000080008008ULL};
719
+
720
/* Loads the 25 lane vectors state[0..24] into the locals X##ba .. X##su
 * (aligned 256-bit loads). `state` is parenthesized so any pointer
 * expression can be passed, and the last line carries no line-continuation. */
#define copyFromState(X, state) \
    X##ba = LOAD256((state)[ 0]); \
    X##be = LOAD256((state)[ 1]); \
    X##bi = LOAD256((state)[ 2]); \
    X##bo = LOAD256((state)[ 3]); \
    X##bu = LOAD256((state)[ 4]); \
    X##ga = LOAD256((state)[ 5]); \
    X##ge = LOAD256((state)[ 6]); \
    X##gi = LOAD256((state)[ 7]); \
    X##go = LOAD256((state)[ 8]); \
    X##gu = LOAD256((state)[ 9]); \
    X##ka = LOAD256((state)[10]); \
    X##ke = LOAD256((state)[11]); \
    X##ki = LOAD256((state)[12]); \
    X##ko = LOAD256((state)[13]); \
    X##ku = LOAD256((state)[14]); \
    X##ma = LOAD256((state)[15]); \
    X##me = LOAD256((state)[16]); \
    X##mi = LOAD256((state)[17]); \
    X##mo = LOAD256((state)[18]); \
    X##mu = LOAD256((state)[19]); \
    X##sa = LOAD256((state)[20]); \
    X##se = LOAD256((state)[21]); \
    X##si = LOAD256((state)[22]); \
    X##so = LOAD256((state)[23]); \
    X##su = LOAD256((state)[24]);
746
+
747
/* Stores the locals X##ba .. X##su back to state[0..24] (aligned 256-bit
 * stores). `state` is parenthesized so any pointer expression can be passed,
 * and the last line carries no line-continuation. */
#define copyToState(state, X) \
    STORE256((state)[ 0], X##ba); \
    STORE256((state)[ 1], X##be); \
    STORE256((state)[ 2], X##bi); \
    STORE256((state)[ 3], X##bo); \
    STORE256((state)[ 4], X##bu); \
    STORE256((state)[ 5], X##ga); \
    STORE256((state)[ 6], X##ge); \
    STORE256((state)[ 7], X##gi); \
    STORE256((state)[ 8], X##go); \
    STORE256((state)[ 9], X##gu); \
    STORE256((state)[10], X##ka); \
    STORE256((state)[11], X##ke); \
    STORE256((state)[12], X##ki); \
    STORE256((state)[13], X##ko); \
    STORE256((state)[14], X##ku); \
    STORE256((state)[15], X##ma); \
    STORE256((state)[16], X##me); \
    STORE256((state)[17], X##mi); \
    STORE256((state)[18], X##mo); \
    STORE256((state)[19], X##mu); \
    STORE256((state)[20], X##sa); \
    STORE256((state)[21], X##se); \
    STORE256((state)[22], X##si); \
    STORE256((state)[23], X##so); \
    STORE256((state)[24], X##su);
773
+
774
/* Copies the 25 lane variables Y##ba .. Y##su into X##ba .. X##su.
 * The last line carries no line-continuation, so the macro ends with it. */
#define copyStateVariables(X, Y) \
    X##ba = Y##ba; \
    X##be = Y##be; \
    X##bi = Y##bi; \
    X##bo = Y##bo; \
    X##bu = Y##bu; \
    X##ga = Y##ga; \
    X##ge = Y##ge; \
    X##gi = Y##gi; \
    X##go = Y##go; \
    X##gu = Y##gu; \
    X##ka = Y##ka; \
    X##ke = Y##ke; \
    X##ki = Y##ki; \
    X##ko = Y##ko; \
    X##ku = Y##ku; \
    X##ma = Y##ma; \
    X##me = Y##me; \
    X##mi = Y##mi; \
    X##mo = Y##mo; \
    X##mu = Y##mu; \
    X##sa = Y##sa; \
    X##se = Y##se; \
    X##si = Y##si; \
    X##so = Y##so; \
    X##su = Y##su;
800
+
801
+ #ifdef KeccakP1600times4_fullUnrolling
802
+ #define FullUnrolling
803
+ #else
804
+ #define Unrolling KeccakP1600times4_unrolling
805
+ #endif
806
+ #include "KeccakP-1600-unrolling.macros"
807
+
808
/* Runs the `rounds24` sequence on all four interleaved states at once:
 * the 25 lane vectors are loaded into the A.. locals, transformed, and
 * stored back to `states`.
 * NOTE(review): `rounds24` is not defined in this part of the file --
 * presumably it comes from KeccakP-1600-unrolling.macros; confirm. */
void KeccakP1600times4_PermuteAll_24rounds(void *states)
{
    V256 *statesAsLanes = (V256 *)states;
    declareABCDE
    #ifndef KeccakP1600times4_fullUnrolling
    /* Only declared when not fully unrolled; presumably the round counter
     * used inside the rounds macro. */
    unsigned int i;
    #endif

    copyFromState(A, statesAsLanes)
    rounds24
    copyToState(statesAsLanes, A)
}
820
+
821
/* Runs the `rounds12` sequence on all four interleaved states at once:
 * the 25 lane vectors are loaded into the A.. locals, transformed, and
 * stored back to `states`.
 * NOTE(review): `rounds12` is not defined in this part of the file --
 * presumably it comes from KeccakP-1600-unrolling.macros; confirm. */
void KeccakP1600times4_PermuteAll_12rounds(void *states)
{
    V256 *statesAsLanes = (V256 *)states;
    declareABCDE
    #ifndef KeccakP1600times4_fullUnrolling
    /* Only declared when not fully unrolled; presumably the round counter
     * used inside the rounds macro. */
    unsigned int i;
    #endif

    copyFromState(A, statesAsLanes)
    rounds12
    copyToState(statesAsLanes, A)
}
833
+
834
/* Runs the `rounds6` sequence on all four interleaved states at once:
 * the 25 lane vectors are loaded into the A.. locals, transformed, and
 * stored back to `states`.
 * NOTE(review): `rounds6` is not defined in this part of the file --
 * presumably it comes from KeccakP-1600-unrolling.macros; confirm. */
void KeccakP1600times4_PermuteAll_6rounds(void *states)
{
    V256 *statesAsLanes = (V256 *)states;
    declareABCDE
    #ifndef KeccakP1600times4_fullUnrolling
    /* Only declared when not fully unrolled; presumably the round counter
     * used inside the rounds macro. */
    unsigned int i;
    #endif

    copyFromState(A, statesAsLanes)
    rounds6
    copyToState(statesAsLanes, A)
}
846
+
847
/* Runs the `rounds4` sequence on all four interleaved states at once:
 * the 25 lane vectors are loaded into the A.. locals, transformed, and
 * stored back to `states`.
 * NOTE(review): `rounds4` is not defined in this part of the file --
 * presumably it comes from KeccakP-1600-unrolling.macros; confirm. */
void KeccakP1600times4_PermuteAll_4rounds(void *states)
{
    V256 *statesAsLanes = (V256 *)states;
    declareABCDE
    #ifndef KeccakP1600times4_fullUnrolling
    /* Only declared when not fully unrolled; presumably the round counter
     * used inside the rounds macro. */
    unsigned int i;
    #endif

    copyFromState(A, statesAsLanes)
    rounds4
    copyToState(statesAsLanes, A)
}
859
+
860
+ size_t KeccakF1600times4_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, const unsigned char *data, size_t dataByteLen)
861
+ {
862
+ if (laneCount == 21) {
863
+ #if 0
864
+ const unsigned char *dataStart = data;
865
+ const UINT64 *curData0 = (const UINT64 *)data;
866
+ const UINT64 *curData1 = (const UINT64 *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes);
867
+ const UINT64 *curData2 = (const UINT64 *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes);
868
+ const UINT64 *curData3 = (const UINT64 *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes);
869
+
870
+ while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
871
+ V256 *stateAsLanes = (V256 *)states;
872
+ V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;
873
+ #define Xor_In( argIndex ) \
874
+ XOReq256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))
875
+ #define Xor_In4( argIndex ) \
876
+ lanes0 = LOAD256u( curData0[argIndex]),\
877
+ lanes1 = LOAD256u( curData1[argIndex]),\
878
+ lanes2 = LOAD256u( curData2[argIndex]),\
879
+ lanes3 = LOAD256u( curData3[argIndex]),\
880
+ INTLEAVE(),\
881
+ XOReq256( stateAsLanes[argIndex+0], lanes0 ),\
882
+ XOReq256( stateAsLanes[argIndex+1], lanes1 ),\
883
+ XOReq256( stateAsLanes[argIndex+2], lanes2 ),\
884
+ XOReq256( stateAsLanes[argIndex+3], lanes3 )
885
+ Xor_In4( 0 );
886
+ Xor_In4( 4 );
887
+ Xor_In4( 8 );
888
+ Xor_In4( 12 );
889
+ Xor_In4( 16 );
890
+ Xor_In( 20 );
891
+ #undef Xor_In
892
+ #undef Xor_In4
893
+ KeccakP1600times4_PermuteAll_24rounds(states);
894
+ curData0 += laneOffsetSerial;
895
+ curData1 += laneOffsetSerial;
896
+ curData2 += laneOffsetSerial;
897
+ curData3 += laneOffsetSerial;
898
+ dataByteLen -= laneOffsetSerial*8;
899
+ }
900
+ return (const unsigned char *)curData0 - dataStart;
901
+ #else
902
+ unsigned int i;
903
+ const unsigned char *dataStart = data;
904
+ const UINT64 *curData0 = (const UINT64 *)data;
905
+ const UINT64 *curData1 = (const UINT64 *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes);
906
+ const UINT64 *curData2 = (const UINT64 *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes);
907
+ const UINT64 *curData3 = (const UINT64 *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes);
908
+ V256 *statesAsLanes = (V256 *)states;
909
+ declareABCDE
910
+
911
+ copyFromState(A, statesAsLanes)
912
+ while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
913
+ #define XOR_In( Xxx, argIndex ) \
914
+ XOReq256(Xxx, LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))
915
+ XOR_In( Aba, 0 );
916
+ XOR_In( Abe, 1 );
917
+ XOR_In( Abi, 2 );
918
+ XOR_In( Abo, 3 );
919
+ XOR_In( Abu, 4 );
920
+ XOR_In( Aga, 5 );
921
+ XOR_In( Age, 6 );
922
+ XOR_In( Agi, 7 );
923
+ XOR_In( Ago, 8 );
924
+ XOR_In( Agu, 9 );
925
+ XOR_In( Aka, 10 );
926
+ XOR_In( Ake, 11 );
927
+ XOR_In( Aki, 12 );
928
+ XOR_In( Ako, 13 );
929
+ XOR_In( Aku, 14 );
930
+ XOR_In( Ama, 15 );
931
+ XOR_In( Ame, 16 );
932
+ XOR_In( Ami, 17 );
933
+ XOR_In( Amo, 18 );
934
+ XOR_In( Amu, 19 );
935
+ XOR_In( Asa, 20 );
936
+ #undef XOR_In
937
+ rounds24
938
+ curData0 += laneOffsetSerial;
939
+ curData1 += laneOffsetSerial;
940
+ curData2 += laneOffsetSerial;
941
+ curData3 += laneOffsetSerial;
942
+ dataByteLen -= laneOffsetSerial*8;
943
+ }
944
+ copyToState(statesAsLanes, A)
945
+ return (const unsigned char *)curData0 - dataStart;
946
+ #endif
947
+ }
948
+ else {
949
+ unsigned int i;
950
+ const unsigned char *dataStart = data;
951
+
952
+ while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
953
+ KeccakP1600times4_AddLanesAll(states, data, laneCount, laneOffsetParallel);
954
+ KeccakP1600times4_PermuteAll_24rounds(states);
955
+ data += laneOffsetSerial*8;
956
+ dataByteLen -= laneOffsetSerial*8;
957
+ }
958
+ return data - dataStart;
959
+ }
960
+ }
961
+
962
+ size_t KeccakP1600times4_12rounds_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, const unsigned char *data, size_t dataByteLen)
963
+ {
964
+ if (laneCount == 21) {
965
+ #if 0
966
+ const unsigned char *dataStart = data;
967
+ const UINT64 *curData0 = (const UINT64 *)data;
968
+ const UINT64 *curData1 = (const UINT64 *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes);
969
+ const UINT64 *curData2 = (const UINT64 *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes);
970
+ const UINT64 *curData3 = (const UINT64 *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes);
971
+
972
+ while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
973
+ V256 *stateAsLanes = states;
974
+ V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;
975
+ #define Xor_In( argIndex ) \
976
+ XOReq256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))
977
+ #define Xor_In4( argIndex ) \
978
+ lanes0 = LOAD256u( curData0[argIndex]),\
979
+ lanes1 = LOAD256u( curData1[argIndex]),\
980
+ lanes2 = LOAD256u( curData2[argIndex]),\
981
+ lanes3 = LOAD256u( curData3[argIndex]),\
982
+ INTLEAVE(),\
983
+ XOReq256( stateAsLanes[argIndex+0], lanes0 ),\
984
+ XOReq256( stateAsLanes[argIndex+1], lanes1 ),\
985
+ XOReq256( stateAsLanes[argIndex+2], lanes2 ),\
986
+ XOReq256( stateAsLanes[argIndex+3], lanes3 )
987
+ Xor_In4( 0 );
988
+ Xor_In4( 4 );
989
+ Xor_In4( 8 );
990
+ Xor_In4( 12 );
991
+ Xor_In4( 16 );
992
+ Xor_In( 20 );
993
+ #undef Xor_In
994
+ #undef Xor_In4
995
+ KeccakP1600times4_PermuteAll_12rounds(states);
996
+ curData0 += laneOffsetSerial;
997
+ curData1 += laneOffsetSerial;
998
+ curData2 += laneOffsetSerial;
999
+ curData3 += laneOffsetSerial;
1000
+ dataByteLen -= laneOffsetSerial*8;
1001
+ }
1002
+ return (const unsigned char *)curData0 - dataStart;
1003
+ #else
1004
+ unsigned int i;
1005
+ const unsigned char *dataStart = data;
1006
+ const UINT64 *curData0 = (const UINT64 *)data;
1007
+ const UINT64 *curData1 = (const UINT64 *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes);
1008
+ const UINT64 *curData2 = (const UINT64 *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes);
1009
+ const UINT64 *curData3 = (const UINT64 *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes);
1010
+ V256 *statesAsLanes = states;
1011
+ declareABCDE
1012
+
1013
+ copyFromState(A, statesAsLanes)
1014
+ while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
1015
+ #define XOR_In( Xxx, argIndex ) \
1016
+ XOReq256(Xxx, LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))
1017
+ XOR_In( Aba, 0 );
1018
+ XOR_In( Abe, 1 );
1019
+ XOR_In( Abi, 2 );
1020
+ XOR_In( Abo, 3 );
1021
+ XOR_In( Abu, 4 );
1022
+ XOR_In( Aga, 5 );
1023
+ XOR_In( Age, 6 );
1024
+ XOR_In( Agi, 7 );
1025
+ XOR_In( Ago, 8 );
1026
+ XOR_In( Agu, 9 );
1027
+ XOR_In( Aka, 10 );
1028
+ XOR_In( Ake, 11 );
1029
+ XOR_In( Aki, 12 );
1030
+ XOR_In( Ako, 13 );
1031
+ XOR_In( Aku, 14 );
1032
+ XOR_In( Ama, 15 );
1033
+ XOR_In( Ame, 16 );
1034
+ XOR_In( Ami, 17 );
1035
+ XOR_In( Amo, 18 );
1036
+ XOR_In( Amu, 19 );
1037
+ XOR_In( Asa, 20 );
1038
+ #undef XOR_In
1039
+ rounds12
1040
+ curData0 += laneOffsetSerial;
1041
+ curData1 += laneOffsetSerial;
1042
+ curData2 += laneOffsetSerial;
1043
+ curData3 += laneOffsetSerial;
1044
+ dataByteLen -= laneOffsetSerial*8;
1045
+ }
1046
+ copyToState(statesAsLanes, A)
1047
+ return (const unsigned char *)curData0 - dataStart;
1048
+ #endif
1049
+ }
1050
+ else {
1051
+ unsigned int i;
1052
+ const unsigned char *dataStart = data;
1053
+
1054
+ while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
1055
+ KeccakP1600times4_AddLanesAll(states, data, laneCount, laneOffsetParallel);
1056
+ KeccakP1600times4_PermuteAll_12rounds(states);
1057
+ data += laneOffsetSerial*8;
1058
+ dataByteLen -= laneOffsetSerial*8;
1059
+ }
1060
+ return data - dataStart;
1061
+ }
1062
+ }
1063
+
1064
+ /* ------------------------------------------------------------------------- */
1065
+
1066
/*
 * Lane-shuffling helpers for the Kravatte routines.
 *
 * UNINTLEAVEa and INTLEAVEa rewrite their four V256 arguments in place.
 * They use lanesL01, lanesH01, lanesL23 and lanesH23 as scratch, so those
 * four V256 variables must exist in the scope where the macro is expanded.
 */

/*
 * UNPACKL/UNPACKH on the pairs (0,1) and (2,3), then PERM128 to recombine.
 * NOTE(review): presumably a 4x4 transpose of 64-bit words from the
 * interleaved layout to one vector per state -- confirm against the
 * UNPACKL/UNPACKH/PERM128 definitions.
 */
#define UNINTLEAVEa(lanes0, lanes1, lanes2, lanes3) \
    lanesL01 = UNPACKL( lanes0, lanes1 ), \
    lanesH01 = UNPACKH( lanes0, lanes1 ), \
    lanesL23 = UNPACKL( lanes2, lanes3 ), \
    lanesH23 = UNPACKH( lanes2, lanes3 ), \
    lanes0 = PERM128( lanesL01, lanesL23, 0x20 ), \
    lanes1 = PERM128( lanesH01, lanesH23, 0x20 ), \
    lanes2 = PERM128( lanesL01, lanesL23, 0x31 ), \
    lanes3 = PERM128( lanesH01, lanesH23, 0x31 )

/*
 * PERM128 on the pairs (0,2) and (1,3), then SHUFFLE64 to recombine.
 * NOTE(review): presumably the inverse rearrangement of UNINTLEAVEa --
 * confirm against the PERM128/SHUFFLE64 definitions.
 */
#define INTLEAVEa(lanes0, lanes1, lanes2, lanes3) \
    lanesL01 = PERM128( lanes0, lanes2, 0x20 ), \
    lanesH01 = PERM128( lanes1, lanes3, 0x20 ), \
    lanesL23 = PERM128( lanes0, lanes2, 0x31 ), \
    lanesH23 = PERM128( lanes1, lanes3, 0x31 ), \
    lanes0 = SHUFFLE64( lanesL01, lanesH01, 0x00 ), \
    lanes1 = SHUFFLE64( lanesL01, lanesH01, 0x0F ), \
    lanes2 = SHUFFLE64( lanesL23, lanesH23, 0x00 ), \
    lanes3 = SHUFFLE64( lanesL23, lanesH23, 0x0F )

/*
 * XORs into `lanes` the vector that LOAD4_64 builds from word `argIndex` of
 * four consecutive 25-word blocks of `inp` (block 3 is passed first, block 0
 * last).
 */
#define LoadXOReq256( lanes, inp, argIndex) \
    XOReq256( lanes, LOAD4_64(inp[3*25+argIndex], inp[2*25+argIndex], inp[1*25+argIndex], inp[0*25+argIndex]) )
1088
+
1089
+ /* ------------------------------------------------------------------------- */
1090
+
1091
/*
 * AddOverWr4 overwrites four lane vectors with input words
 * argIndex .. argIndex+3 taken from four 25-word input blocks, then XORs
 * each vector with the matching word of `key`, replicated by CONST256_64.
 *
 * Two variants exist, selected by UseGatherScatter:
 *  - default: four unaligned loads followed by INTLEAVEa, which needs the
 *    scratch variables lanesL01/lanesH01/lanesL23/lanesH23 in scope;
 *  - UseGatherScatter: AVX2 gathers that use a `gather` index vector from
 *    the expanding scope, with scale 1 (indices are byte offsets).
 */
#if !defined(UseGatherScatter)

#define AddOverWr4( lanes0, lanes1, lanes2, lanes3, key, inp, argIndex ) \
    lanes0 = LOAD256u( inp[argIndex+0*25]), \
    lanes1 = LOAD256u( inp[argIndex+1*25]), \
    lanes2 = LOAD256u( inp[argIndex+2*25]), \
    lanes3 = LOAD256u( inp[argIndex+3*25]), \
    INTLEAVEa(lanes0, lanes1, lanes2, lanes3), \
    XOReq256( lanes0, CONST256_64( key[argIndex+0])), \
    XOReq256( lanes1, CONST256_64( key[argIndex+1])), \
    XOReq256( lanes2, CONST256_64( key[argIndex+2])), \
    XOReq256( lanes3, CONST256_64( key[argIndex+3]))

#else

#define AddOverWr4( lanes0, lanes1, lanes2, lanes3, key, inp, argIndex ) \
    lanes0 = _mm256_i32gather_epi64((const long long int *)&inp[argIndex+0], gather, 1), \
    lanes1 = _mm256_i32gather_epi64((const long long int *)&inp[argIndex+1], gather, 1), \
    lanes2 = _mm256_i32gather_epi64((const long long int *)&inp[argIndex+2], gather, 1), \
    lanes3 = _mm256_i32gather_epi64((const long long int *)&inp[argIndex+3], gather, 1), \
    XOReq256( lanes0, CONST256_64( key[argIndex+0])), \
    XOReq256( lanes1, CONST256_64( key[argIndex+1])), \
    XOReq256( lanes2, CONST256_64( key[argIndex+2])), \
    XOReq256( lanes3, CONST256_64( key[argIndex+3]))

#endif
1117
+
1118
/* XORs the four 64-bit elements of `lanes` together and accumulates the result into p[argIndex]. */
#define ExtrAccu( lanes, p, argIndex ) \
    p[argIndex] ^= _mm256_extract_epi64(lanes, 0) ^ _mm256_extract_epi64(lanes, 1) \
                 ^ _mm256_extract_epi64(lanes, 2) ^ _mm256_extract_epi64(lanes, 3)

/*
 * Four-lane version of ExtrAccu: rearranges the four vectors with
 * UNINTLEAVEa, XORs them into one vector, and accumulates that into
 * p[argIndex .. argIndex+3] through LOAD256/STORE256.
 * All four lane arguments are clobbered. The scratch variables of
 * UNINTLEAVEa must be in scope.
 * NOTE(review): LOAD256/STORE256 are presumably the aligned variants, which
 * would require &p[argIndex] to be 32-byte aligned -- confirm.
 */
#define ExtrAccu4( lanes0, lanes1, lanes2, lanes3, p, argIndex ) \
    UNINTLEAVEa(lanes0, lanes1, lanes2, lanes3), \
    XOReq256( lanes0, lanes1 ), \
    XOReq256( lanes2, lanes3 ), \
    lanes1 = LOAD256( p[argIndex]), \
    XOReq256( lanes0, lanes2 ), \
    XOReq256( lanes0, lanes1 ), \
    STORE256( p[argIndex], lanes0 )

/*
 * Derives Asa, Ase, Asi, Aso and Asu from the rolling vectors x0x1x2x3 and
 * x1x2x3x4 and advances both vectors for the next call. All seven names are
 * taken from the expanding scope.
 * NOTE(review): presumably the Kravatte compression roll, producing the
 * rolled key lanes for four consecutive blocks -- confirm against the
 * Kravatte specification.
 */
#define Kravatte_Rollc() \
    Asa = x0x1x2x3, \
    Ase = x1x2x3x4, \
    ROL64in256(x1x2x3x4, x0x1x2x3, 7), \
    XOReq256(x1x2x3x4, Ase), \
    XOReq256(x1x2x3x4, _mm256_srli_epi64(Ase, 3)), \
    Asi = _mm256_blend_epi32(_mm256_permute4x64_epi64(Ase, 0x39), _mm256_permute4x64_epi64(x1x2x3x4, 0x39), 0xC0), \
    Aso = PERM128(Ase, x1x2x3x4, 0x21), \
    Asu = _mm256_blend_epi32(_mm256_permute4x64_epi64(Ase, 0xFF), _mm256_permute4x64_epi64(x1x2x3x4, 0x90), 0xFC), \
    x0x1x2x3 = Asu
1140
+
1141
+ size_t KeccakP1600times4_KravatteCompress(uint64_t *xAccu, uint64_t *kRoll, const unsigned char *input, size_t inputByteLen)
1142
+ {
1143
+ uint64_t *in64 = (uint64_t *)input;
1144
+ size_t nBlocks = inputByteLen / (4 * 200);
1145
+ declareABCDE
1146
+ #if !defined(KeccakP1600times4_fullUnrolling)
1147
+ unsigned int i;
1148
+ #endif
1149
+ V256 lanesL01, lanesL23, lanesH01, lanesH23;
1150
+ V256 x0x1x2x3, x1x2x3x4;
1151
+ #if defined(UseGatherScatter)
1152
+ V128 gather = _mm_setr_epi32(0*25*8, 1*25*8, 2*25*8, 3*25*8);
1153
+ #endif
1154
+
1155
+ x0x1x2x3 = LOAD256(kRoll[20]);
1156
+ x1x2x3x4 = LOAD256u(kRoll[21]);
1157
+ do {
1158
+ AddOverWr4( Aba, Abe, Abi, Abo, kRoll, in64, 0 );
1159
+ AddOverWr4( Abu, Aga, Age, Agi, kRoll, in64, 4 );
1160
+ AddOverWr4( Ago, Agu, Aka, Ake, kRoll, in64, 8 );
1161
+ AddOverWr4( Aki, Ako, Aku, Ama, kRoll, in64, 12 );
1162
+ AddOverWr4( Ame, Ami, Amo, Amu, kRoll, in64, 16 );
1163
+ Kravatte_Rollc();
1164
+ LoadXOReq256(Asa, in64, 20);
1165
+ LoadXOReq256(Ase, in64, 21);
1166
+ LoadXOReq256(Asi, in64, 22);
1167
+ LoadXOReq256(Aso, in64, 23);
1168
+ LoadXOReq256(Asu, in64, 24);
1169
+ rounds6
1170
+ ExtrAccu4(Aba, Abe, Abi, Abo, xAccu, 0 );
1171
+ ExtrAccu4(Abu, Aga, Age, Agi, xAccu, 4 );
1172
+ ExtrAccu4(Ago, Agu, Aka, Ake, xAccu, 8 );
1173
+ ExtrAccu4(Aki, Ako, Aku, Ama, xAccu, 12 );
1174
+ ExtrAccu4(Ame, Ami, Amo, Amu, xAccu, 16 );
1175
+ ExtrAccu4(Asa, Ase, Asi, Aso, xAccu, 20 );
1176
+ ExtrAccu( Asu, xAccu, 24 );
1177
+ in64 += 4 * 25;
1178
+ }
1179
+ while(--nBlocks != 0);
1180
+ STORE256(kRoll[20], x0x1x2x3);
1181
+ kRoll[24] = _mm256_extract_epi64(x1x2x3x4, 3);
1182
+
1183
+ return (size_t)in64 - (size_t)input;
1184
+ }
1185
+
1186
+ #undef LoadXOReq256
1187
+ #undef AddOverWr4
1188
+ #undef ExtrAccu
1189
+ #undef ExtrAccu4
1190
+
1191
+ /* ------------------------------------------------------------------------- */
1192
+
1193
/*
 * Output helpers for KeccakP1600times4_KravatteExpand. Both read `kRoll`
 * from the expanding scope rather than taking it as a parameter.
 */

/*
 * XORs `lanes` with kRoll[argIndex] replicated by CONST256_64, then stores
 * elements 0..3 to word `argIndex` of four consecutive 25-word blocks of `p`.
 */
#define ExtrAddKey( lanes, p, argIndex ) \
    XOReq256(lanes, CONST256_64(kRoll[argIndex])), \
    p[argIndex+0*25] = _mm256_extract_epi64(lanes, 0), \
    p[argIndex+1*25] = _mm256_extract_epi64(lanes, 1), \
    p[argIndex+2*25] = _mm256_extract_epi64(lanes, 2), \
    p[argIndex+3*25] = _mm256_extract_epi64(lanes, 3)

/*
 * Four-lane version of ExtrAddKey.
 * The active variant rearranges the lanes with UNINTLEAVEa, whose scratch
 * variables must be in scope, and writes them with unaligned stores.
 * The scatter-based variant is kept but compiled out, even when
 * UseGatherScatter is defined.
 */
#if 1 /* scatter variant in the #else branch is disabled; it would need defined(UseGatherScatter) */

#define ExtrAddKey4( lanes0, lanes1, lanes2, lanes3, p, argIndex ) \
    XOReq256(lanes0, CONST256_64(kRoll[argIndex+0])), \
    XOReq256(lanes1, CONST256_64(kRoll[argIndex+1])), \
    XOReq256(lanes2, CONST256_64(kRoll[argIndex+2])), \
    XOReq256(lanes3, CONST256_64(kRoll[argIndex+3])), \
    UNINTLEAVEa(lanes0, lanes1, lanes2, lanes3), \
    STORE256u( p[argIndex+0*25], lanes0 ), \
    STORE256u( p[argIndex+1*25], lanes1 ), \
    STORE256u( p[argIndex+2*25], lanes2 ), \
    STORE256u( p[argIndex+3*25], lanes3 )

#else

#define ExtrAddKey4( lanes0, lanes1, lanes2, lanes3, p, argIndex ) \
    XOReq256(lanes0, CONST256_64(kRoll[argIndex+0])), \
    XOReq256(lanes1, CONST256_64(kRoll[argIndex+1])), \
    XOReq256(lanes2, CONST256_64(kRoll[argIndex+2])), \
    XOReq256(lanes3, CONST256_64(kRoll[argIndex+3])), \
    _mm256_i32scatter_epi64((long long int *)&p[argIndex+0], scatter, lanes0, 1), \
    _mm256_i32scatter_epi64((long long int *)&p[argIndex+1], scatter, lanes1, 1), \
    _mm256_i32scatter_epi64((long long int *)&p[argIndex+2], scatter, lanes2, 1), \
    _mm256_i32scatter_epi64((long long int *)&p[argIndex+3], scatter, lanes3, 1)

#endif
1226
+
1227
+ size_t KeccakP1600times4_KravatteExpand(uint64_t *yAccu, const uint64_t *kRoll, unsigned char *output, size_t outputByteLen)
1228
+ {
1229
+ uint64_t *out64 = (uint64_t *)output;
1230
+ size_t nBlocks = outputByteLen / (4 * 200);
1231
+ declareABCDE
1232
+ #if !defined(KeccakP1600times4_fullUnrolling)
1233
+ unsigned int i;
1234
+ #endif
1235
+ V256 lanesL01, lanesL23, lanesH01, lanesH23;
1236
+ #if defined(UseGatherScatter)
1237
+ V128 scatter = _mm_setr_epi32(0*25*8, 1*25*8, 2*25*8, 3*25*8);
1238
+ #endif
1239
+
1240
+ do {
1241
+ Aba = CONST256_64(yAccu[0]);
1242
+ Abe = CONST256_64(yAccu[1]);
1243
+ Abi = CONST256_64(yAccu[2]);
1244
+ Abo = CONST256_64(yAccu[3]);
1245
+ Abu = CONST256_64(yAccu[4]);
1246
+
1247
+ Aga = CONST256_64(yAccu[5]);
1248
+ Age = CONST256_64(yAccu[6]);
1249
+ Agi = CONST256_64(yAccu[7]);
1250
+ Ago = CONST256_64(yAccu[8]);
1251
+ Agu = CONST256_64(yAccu[9]);
1252
+
1253
+ Aka = CONST256_64(yAccu[10]);
1254
+ Ake = CONST256_64(yAccu[11]);
1255
+ Aki = CONST256_64(yAccu[12]);
1256
+ Ako = CONST256_64(yAccu[13]);
1257
+ Aku = CONST256_64(yAccu[14]);
1258
+
1259
+ Ama = LOAD256u(yAccu[15]);
1260
+ Ame = LOAD256 (yAccu[16]);
1261
+ Ami = LOAD256u(yAccu[17]);
1262
+ Amo = LOAD256u(yAccu[18]);
1263
+ Amu = LOAD256u(yAccu[19]);
1264
+
1265
+ ROL64in256(lanesL01, Ama, 7);
1266
+ ROL64in256(lanesH01, Ame, 18);
1267
+ lanesL01 = XOR256(lanesL01, lanesH01);
1268
+ lanesH01 = _mm256_and_si256(Ami, _mm256_srli_epi64(Ame, 1));
1269
+ lanesL01 = XOR256(lanesL01, lanesH01);
1270
+
1271
+ Asa = LOAD256 (yAccu[20]);
1272
+ Ase = LOAD256u(yAccu[21]);
1273
+ Asi = _mm256_insert_epi64(_mm256_permute4x64_epi64(Ase, 0x39), _mm256_extract_epi64(lanesL01, 0), 3);
1274
+ Aso = _mm256_permute2x128_si256(Ase, lanesL01, 0x21);
1275
+ Asu = _mm256_insert_epi64(_mm256_permute4x64_epi64(lanesL01, 0x93), _mm256_extract_epi64(Ase, 3), 0);
1276
+
1277
+ STORE256u(yAccu[15], Amu);
1278
+ yAccu[19] = _mm256_extract_epi64(Aso, 0);
1279
+ yAccu[20] = _mm256_extract_epi64(Aso, 1);
1280
+ STORE256u(yAccu[21], lanesL01);
1281
+
1282
+ rounds6
1283
+ ExtrAddKey4(Aba, Abe, Abi, Abo, out64, 0 );
1284
+ ExtrAddKey4(Abu, Aga, Age, Agi, out64, 4 );
1285
+ ExtrAddKey4(Ago, Agu, Aka, Ake, out64, 8 );
1286
+ ExtrAddKey4(Aki, Ako, Aku, Ama, out64, 12 );
1287
+ ExtrAddKey4(Ame, Ami, Amo, Amu, out64, 16 );
1288
+ ExtrAddKey4(Asa, Ase, Asi, Aso, out64, 20 );
1289
+ ExtrAddKey( Asu, out64, 24 );
1290
+ out64 += 4 * 25;
1291
+ }
1292
+ while(--nBlocks != 0);
1293
+
1294
+ return (size_t)out64 - (size_t)output;
1295
+ }
1296
+
1297
/*
 * Drop the Kravatte helper macros so they do not leak past this point.
 * OverWr4 and Kravatte_Roll are not defined in the visible part of this
 * file; undefining them is harmless, so the lines are kept.
 * Kravatte_Rollc is the macro actually defined for the compress routine.
 */
#undef OverWr4
#undef ExtrAddKey
#undef ExtrAddKey4

#undef Kravatte_Roll
#undef Kravatte_Rollc
#undef UNINTLEAVEa
#undef INTLEAVEa