digest-kangarootwelve 0.2.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (305)
  1. checksums.yaml +4 -4
  2. data/README.md +51 -11
  3. data/Rakefile +2 -2
  4. data/digest-kangarootwelve.gemspec +322 -42
  5. data/ext/digest/kangarootwelve/ext.c +1 -1
  6. data/ext/digest/kangarootwelve/extconf.rb +13 -1
  7. data/ext/digest/kangarootwelve/keccak/armv6m/KangarooTwelve.link.c +1 -0
  8. data/ext/digest/kangarootwelve/keccak/armv6m/KeccakDuplexWidth1600.link.c +1 -0
  9. data/ext/digest/kangarootwelve/keccak/armv6m/KeccakP-1600-SnP.h +36 -0
  10. data/ext/digest/kangarootwelve/{KeccakP-1600-times2-SnP.h → keccak/armv6m/KeccakP-1600-times2-SnP.h} +10 -10
  11. data/ext/digest/kangarootwelve/{KeccakP-1600-times2-on1.c → keccak/armv6m/KeccakP-1600-times2-on1.c} +13 -7
  12. data/ext/digest/kangarootwelve/{KeccakP-1600-times4-SnP.h → keccak/armv6m/KeccakP-1600-times4-SnP.h} +10 -10
  13. data/ext/digest/kangarootwelve/{KeccakP-1600-times4-on1.c → keccak/armv6m/KeccakP-1600-times4-on1.c} +13 -7
  14. data/ext/digest/kangarootwelve/{KeccakP-1600-times8-SnP.h → keccak/armv6m/KeccakP-1600-times8-SnP.h} +10 -10
  15. data/ext/digest/kangarootwelve/{KeccakP-1600-times8-on1.c → keccak/armv6m/KeccakP-1600-times8-on1.c} +13 -7
  16. data/ext/digest/kangarootwelve/keccak/armv6m/KeccakP-1600-u2-32bi-armv6m-le-gcc.s +1334 -0
  17. data/ext/digest/kangarootwelve/keccak/armv6m/KeccakSpongeWidth1600.link.c +1 -0
  18. data/ext/digest/kangarootwelve/{PlSnP-Fallback.inc → keccak/armv6m/PlSnP-Fallback.inc} +11 -7
  19. data/ext/digest/kangarootwelve/keccak/armv6m/ext.link.c +1 -0
  20. data/ext/digest/kangarootwelve/keccak/armv7a/KangarooTwelve.link.c +1 -0
  21. data/ext/digest/kangarootwelve/keccak/armv7a/KeccakDuplexWidth1600.link.c +1 -0
  22. data/ext/digest/kangarootwelve/keccak/armv7a/KeccakP-1600-SnP.h +37 -0
  23. data/ext/digest/kangarootwelve/keccak/armv7a/KeccakP-1600-armv7a-le-neon-gcc.s +826 -0
  24. data/ext/digest/kangarootwelve/keccak/armv7a/KeccakP-1600-inplace-pl2-armv7a-neon-le-gcc.s +1245 -0
  25. data/ext/digest/kangarootwelve/keccak/armv7a/KeccakP-1600-times2-SnP.h +38 -0
  26. data/ext/digest/kangarootwelve/keccak/armv7a/KeccakP-1600-times4-SnP.h +45 -0
  27. data/ext/digest/kangarootwelve/keccak/armv7a/KeccakP-1600-times4-on2.c +38 -0
  28. data/ext/digest/kangarootwelve/keccak/armv7a/KeccakP-1600-times8-SnP.h +45 -0
  29. data/ext/digest/kangarootwelve/keccak/armv7a/KeccakP-1600-times8-on2.c +38 -0
  30. data/ext/digest/kangarootwelve/keccak/armv7a/KeccakSpongeWidth1600.link.c +1 -0
  31. data/ext/digest/kangarootwelve/keccak/armv7a/PlSnP-Fallback.inc +287 -0
  32. data/ext/digest/kangarootwelve/keccak/armv7a/ext.link.c +1 -0
  33. data/ext/digest/kangarootwelve/keccak/armv7m/KangarooTwelve.link.c +1 -0
  34. data/ext/digest/kangarootwelve/keccak/armv7m/KeccakDuplexWidth1600.link.c +1 -0
  35. data/ext/digest/kangarootwelve/keccak/armv7m/KeccakP-1600-SnP.h +36 -0
  36. data/ext/digest/kangarootwelve/keccak/armv7m/KeccakP-1600-inplace-32bi-armv7m-le-gcc.s +1170 -0
  37. data/ext/digest/kangarootwelve/keccak/armv7m/KeccakP-1600-times2-SnP.h +45 -0
  38. data/ext/digest/kangarootwelve/keccak/armv7m/KeccakP-1600-times2-on1.c +37 -0
  39. data/ext/digest/kangarootwelve/keccak/armv7m/KeccakP-1600-times4-SnP.h +45 -0
  40. data/ext/digest/kangarootwelve/keccak/armv7m/KeccakP-1600-times4-on1.c +37 -0
  41. data/ext/digest/kangarootwelve/keccak/armv7m/KeccakP-1600-times8-SnP.h +45 -0
  42. data/ext/digest/kangarootwelve/keccak/armv7m/KeccakP-1600-times8-on1.c +37 -0
  43. data/ext/digest/kangarootwelve/keccak/armv7m/KeccakSpongeWidth1600.link.c +1 -0
  44. data/ext/digest/kangarootwelve/keccak/armv7m/PlSnP-Fallback.inc +287 -0
  45. data/ext/digest/kangarootwelve/keccak/armv7m/ext.link.c +1 -0
  46. data/ext/digest/kangarootwelve/keccak/armv8a/KangarooTwelve.link.c +1 -0
  47. data/ext/digest/kangarootwelve/keccak/armv8a/KeccakDuplexWidth1600.link.c +1 -0
  48. data/ext/digest/kangarootwelve/keccak/armv8a/KeccakP-1600-SnP.h +28 -0
  49. data/ext/digest/kangarootwelve/keccak/armv8a/KeccakP-1600-armv8a-neon.s +537 -0
  50. data/ext/digest/kangarootwelve/keccak/armv8a/KeccakP-1600-times2-SnP.h +45 -0
  51. data/ext/digest/kangarootwelve/keccak/armv8a/KeccakP-1600-times2-on1.c +37 -0
  52. data/ext/digest/kangarootwelve/keccak/armv8a/KeccakP-1600-times4-SnP.h +45 -0
  53. data/ext/digest/kangarootwelve/keccak/armv8a/KeccakP-1600-times4-on1.c +37 -0
  54. data/ext/digest/kangarootwelve/keccak/armv8a/KeccakP-1600-times8-SnP.h +45 -0
  55. data/ext/digest/kangarootwelve/keccak/armv8a/KeccakP-1600-times8-on1.c +37 -0
  56. data/ext/digest/kangarootwelve/keccak/armv8a/KeccakSpongeWidth1600.link.c +1 -0
  57. data/ext/digest/kangarootwelve/keccak/armv8a/PlSnP-Fallback.inc +287 -0
  58. data/ext/digest/kangarootwelve/keccak/armv8a/ext.link.c +1 -0
  59. data/ext/digest/kangarootwelve/keccak/asmx86-64/KangarooTwelve.link.c +1 -0
  60. data/ext/digest/kangarootwelve/keccak/asmx86-64/KeccakDuplexWidth1600.link.c +1 -0
  61. data/ext/digest/kangarootwelve/keccak/asmx86-64/KeccakP-1600-SnP.h +37 -0
  62. data/ext/digest/kangarootwelve/keccak/asmx86-64/KeccakP-1600-times2-SnP.h +45 -0
  63. data/ext/digest/kangarootwelve/keccak/asmx86-64/KeccakP-1600-times2-on1.c +37 -0
  64. data/ext/digest/kangarootwelve/keccak/asmx86-64/KeccakP-1600-times4-SnP.h +45 -0
  65. data/ext/digest/kangarootwelve/keccak/asmx86-64/KeccakP-1600-times4-on1.c +37 -0
  66. data/ext/digest/kangarootwelve/keccak/asmx86-64/KeccakP-1600-times8-SnP.h +45 -0
  67. data/ext/digest/kangarootwelve/keccak/asmx86-64/KeccakP-1600-times8-on1.c +37 -0
  68. data/ext/digest/kangarootwelve/keccak/asmx86-64/KeccakP-1600-x86-64-gas.s +1190 -0
  69. data/ext/digest/kangarootwelve/keccak/asmx86-64/KeccakSpongeWidth1600.link.c +1 -0
  70. data/ext/digest/kangarootwelve/keccak/asmx86-64/PlSnP-Fallback.inc +287 -0
  71. data/ext/digest/kangarootwelve/keccak/asmx86-64/ext.link.c +1 -0
  72. data/ext/digest/kangarootwelve/keccak/asmx86-64shld/KangarooTwelve.link.c +1 -0
  73. data/ext/digest/kangarootwelve/keccak/asmx86-64shld/KeccakDuplexWidth1600.link.c +1 -0
  74. data/ext/digest/kangarootwelve/keccak/asmx86-64shld/KeccakP-1600-SnP.h +37 -0
  75. data/ext/digest/kangarootwelve/keccak/asmx86-64shld/KeccakP-1600-times2-SnP.h +45 -0
  76. data/ext/digest/kangarootwelve/keccak/asmx86-64shld/KeccakP-1600-times2-on1.c +37 -0
  77. data/ext/digest/kangarootwelve/keccak/asmx86-64shld/KeccakP-1600-times4-SnP.h +45 -0
  78. data/ext/digest/kangarootwelve/keccak/asmx86-64shld/KeccakP-1600-times4-on1.c +37 -0
  79. data/ext/digest/kangarootwelve/keccak/asmx86-64shld/KeccakP-1600-times8-SnP.h +45 -0
  80. data/ext/digest/kangarootwelve/keccak/asmx86-64shld/KeccakP-1600-times8-on1.c +37 -0
  81. data/ext/digest/kangarootwelve/keccak/asmx86-64shld/KeccakP-1600-x86-64-shld-gas.s +1190 -0
  82. data/ext/digest/kangarootwelve/keccak/asmx86-64shld/KeccakSpongeWidth1600.link.c +1 -0
  83. data/ext/digest/kangarootwelve/keccak/asmx86-64shld/PlSnP-Fallback.inc +287 -0
  84. data/ext/digest/kangarootwelve/keccak/asmx86-64shld/ext.link.c +1 -0
  85. data/ext/digest/kangarootwelve/keccak/avr8/KangarooTwelve.link.c +1 -0
  86. data/ext/digest/kangarootwelve/keccak/avr8/KeccakDuplexWidth1600.link.c +1 -0
  87. data/ext/digest/kangarootwelve/keccak/avr8/KeccakP-1600-SnP.h +37 -0
  88. data/ext/digest/kangarootwelve/keccak/avr8/KeccakP-1600-avr8-fast.s +1116 -0
  89. data/ext/digest/kangarootwelve/keccak/avr8/KeccakP-1600-times2-SnP.h +45 -0
  90. data/ext/digest/kangarootwelve/keccak/avr8/KeccakP-1600-times2-on1.c +37 -0
  91. data/ext/digest/kangarootwelve/keccak/avr8/KeccakP-1600-times4-SnP.h +45 -0
  92. data/ext/digest/kangarootwelve/keccak/avr8/KeccakP-1600-times4-on1.c +37 -0
  93. data/ext/digest/kangarootwelve/keccak/avr8/KeccakP-1600-times8-SnP.h +45 -0
  94. data/ext/digest/kangarootwelve/keccak/avr8/KeccakP-1600-times8-on1.c +37 -0
  95. data/ext/digest/kangarootwelve/keccak/avr8/KeccakSpongeWidth1600.link.c +1 -0
  96. data/ext/digest/kangarootwelve/keccak/avr8/PlSnP-Fallback.inc +287 -0
  97. data/ext/digest/kangarootwelve/keccak/avr8/ext.link.c +1 -0
  98. data/ext/digest/kangarootwelve/keccak/bulldozer/KangarooTwelve.link.c +1 -0
  99. data/ext/digest/kangarootwelve/keccak/bulldozer/KeccakDuplexWidth1600.link.c +1 -0
  100. data/ext/digest/kangarootwelve/keccak/bulldozer/KeccakP-1600-SnP.h +39 -0
  101. data/ext/digest/kangarootwelve/keccak/bulldozer/KeccakP-1600-XOP-config.h +6 -0
  102. data/ext/digest/kangarootwelve/keccak/bulldozer/KeccakP-1600-XOP.c +473 -0
  103. data/ext/digest/kangarootwelve/keccak/bulldozer/KeccakP-1600-times2-SIMD128.c +954 -0
  104. data/ext/digest/kangarootwelve/keccak/bulldozer/KeccakP-1600-times2-SnP.h +47 -0
  105. data/ext/digest/kangarootwelve/keccak/bulldozer/KeccakP-1600-times4-SnP.h +45 -0
  106. data/ext/digest/kangarootwelve/keccak/bulldozer/KeccakP-1600-times4-on2.c +38 -0
  107. data/ext/digest/kangarootwelve/keccak/bulldozer/KeccakP-1600-times8-SnP.h +45 -0
  108. data/ext/digest/kangarootwelve/keccak/bulldozer/KeccakP-1600-times8-on2.c +38 -0
  109. data/ext/digest/kangarootwelve/keccak/bulldozer/KeccakP-1600-unrolling.macros +302 -0
  110. data/ext/digest/kangarootwelve/keccak/bulldozer/KeccakSpongeWidth1600.link.c +1 -0
  111. data/ext/digest/kangarootwelve/keccak/bulldozer/PlSnP-Fallback.inc +287 -0
  112. data/ext/digest/kangarootwelve/keccak/bulldozer/SIMD128-config.h +9 -0
  113. data/ext/digest/kangarootwelve/{SnP-Relaned.h → keccak/bulldozer/SnP-Relaned.h} +13 -7
  114. data/ext/digest/kangarootwelve/keccak/bulldozer/ext.link.c +1 -0
  115. data/ext/digest/kangarootwelve/{KangarooTwelve.c → keccak/common/KangarooTwelve.c} +6 -10
  116. data/ext/digest/kangarootwelve/{KangarooTwelve.h → keccak/common/KangarooTwelve.h} +3 -7
  117. data/ext/digest/kangarootwelve/keccak/common/KeccakDuplex-common.h +37 -0
  118. data/ext/digest/kangarootwelve/keccak/common/KeccakDuplex.inc +192 -0
  119. data/ext/digest/kangarootwelve/keccak/common/KeccakDuplexWidth1600.c +34 -0
  120. data/ext/digest/kangarootwelve/keccak/common/KeccakDuplexWidth1600.h +25 -0
  121. data/ext/digest/kangarootwelve/{KeccakSponge-common.h → keccak/common/KeccakSponge-common.h} +5 -7
  122. data/ext/digest/kangarootwelve/{KeccakSponge.inc → keccak/common/KeccakSponge.inc} +6 -8
  123. data/ext/digest/kangarootwelve/{KeccakSpongeWidth1600.c → keccak/common/KeccakSpongeWidth1600.c} +6 -8
  124. data/ext/digest/kangarootwelve/{KeccakSpongeWidth1600.h → keccak/common/KeccakSpongeWidth1600.h} +5 -7
  125. data/ext/digest/kangarootwelve/{Phases.h → keccak/common/Phases.h} +3 -7
  126. data/ext/digest/kangarootwelve/{align.h → keccak/common/align.h} +5 -7
  127. data/ext/digest/kangarootwelve/{brg_endian.h → keccak/common/brg_endian.h} +0 -0
  128. data/ext/digest/kangarootwelve/keccak/compact/KangarooTwelve.link.c +1 -0
  129. data/ext/digest/kangarootwelve/keccak/compact/KeccakDuplexWidth1600.link.c +1 -0
  130. data/ext/digest/kangarootwelve/{KeccakP-1600-SnP.h → keccak/compact/KeccakP-1600-SnP.h} +7 -10
  131. data/ext/digest/kangarootwelve/{KeccakP-1600-compact64.c → keccak/compact/KeccakP-1600-compact64.c} +11 -7
  132. data/ext/digest/kangarootwelve/keccak/compact/KeccakP-1600-times2-SnP.h +45 -0
  133. data/ext/digest/kangarootwelve/keccak/compact/KeccakP-1600-times2-on1.c +37 -0
  134. data/ext/digest/kangarootwelve/keccak/compact/KeccakP-1600-times4-SnP.h +45 -0
  135. data/ext/digest/kangarootwelve/keccak/compact/KeccakP-1600-times4-on1.c +37 -0
  136. data/ext/digest/kangarootwelve/keccak/compact/KeccakP-1600-times8-SnP.h +45 -0
  137. data/ext/digest/kangarootwelve/keccak/compact/KeccakP-1600-times8-on1.c +37 -0
  138. data/ext/digest/kangarootwelve/keccak/compact/KeccakSpongeWidth1600.link.c +1 -0
  139. data/ext/digest/kangarootwelve/keccak/compact/PlSnP-Fallback.inc +287 -0
  140. data/ext/digest/kangarootwelve/keccak/compact/SnP-Relaned.h +140 -0
  141. data/ext/digest/kangarootwelve/keccak/compact/ext.link.c +1 -0
  142. data/ext/digest/kangarootwelve/keccak/generic32/KangarooTwelve.link.c +1 -0
  143. data/ext/digest/kangarootwelve/keccak/generic32/KeccakDuplexWidth1600.link.c +1 -0
  144. data/ext/digest/kangarootwelve/keccak/generic32/KeccakP-1600-SnP.h +38 -0
  145. data/ext/digest/kangarootwelve/keccak/generic32/KeccakP-1600-inplace32BI.c +1162 -0
  146. data/ext/digest/kangarootwelve/keccak/generic32/KeccakP-1600-times2-SnP.h +45 -0
  147. data/ext/digest/kangarootwelve/keccak/generic32/KeccakP-1600-times2-on1.c +37 -0
  148. data/ext/digest/kangarootwelve/keccak/generic32/KeccakP-1600-times4-SnP.h +45 -0
  149. data/ext/digest/kangarootwelve/keccak/generic32/KeccakP-1600-times4-on1.c +37 -0
  150. data/ext/digest/kangarootwelve/keccak/generic32/KeccakP-1600-times8-SnP.h +45 -0
  151. data/ext/digest/kangarootwelve/keccak/generic32/KeccakP-1600-times8-on1.c +37 -0
  152. data/ext/digest/kangarootwelve/keccak/generic32/KeccakSpongeWidth1600.link.c +1 -0
  153. data/ext/digest/kangarootwelve/keccak/generic32/PlSnP-Fallback.inc +287 -0
  154. data/ext/digest/kangarootwelve/keccak/generic32/SnP-Relaned.h +140 -0
  155. data/ext/digest/kangarootwelve/keccak/generic32/ext.link.c +1 -0
  156. data/ext/digest/kangarootwelve/keccak/generic32lc/KangarooTwelve.link.c +1 -0
  157. data/ext/digest/kangarootwelve/keccak/generic32lc/KeccakDuplexWidth1600.link.c +1 -0
  158. data/ext/digest/kangarootwelve/keccak/generic32lc/KeccakP-1600-SnP.h +38 -0
  159. data/ext/digest/kangarootwelve/keccak/generic32lc/KeccakP-1600-inplace32BI.c +1162 -0
  160. data/ext/digest/kangarootwelve/keccak/generic32lc/KeccakP-1600-times2-SnP.h +45 -0
  161. data/ext/digest/kangarootwelve/keccak/generic32lc/KeccakP-1600-times2-on1.c +37 -0
  162. data/ext/digest/kangarootwelve/keccak/generic32lc/KeccakP-1600-times4-SnP.h +45 -0
  163. data/ext/digest/kangarootwelve/keccak/generic32lc/KeccakP-1600-times4-on1.c +37 -0
  164. data/ext/digest/kangarootwelve/keccak/generic32lc/KeccakP-1600-times8-SnP.h +45 -0
  165. data/ext/digest/kangarootwelve/keccak/generic32lc/KeccakP-1600-times8-on1.c +37 -0
  166. data/ext/digest/kangarootwelve/keccak/generic32lc/KeccakSpongeWidth1600.link.c +1 -0
  167. data/ext/digest/kangarootwelve/keccak/generic32lc/PlSnP-Fallback.inc +287 -0
  168. data/ext/digest/kangarootwelve/keccak/generic32lc/SnP-Relaned.h +140 -0
  169. data/ext/digest/kangarootwelve/keccak/generic32lc/ext.link.c +1 -0
  170. data/ext/digest/kangarootwelve/keccak/generic64/KangarooTwelve.link.c +1 -0
  171. data/ext/digest/kangarootwelve/keccak/generic64/KeccakDuplexWidth1600.link.c +1 -0
  172. data/ext/digest/kangarootwelve/keccak/generic64/KeccakP-1600-64.macros +2195 -0
  173. data/ext/digest/kangarootwelve/keccak/generic64/KeccakP-1600-SnP.h +49 -0
  174. data/ext/digest/kangarootwelve/keccak/generic64/KeccakP-1600-opt64-config.h +6 -0
  175. data/ext/digest/kangarootwelve/keccak/generic64/KeccakP-1600-opt64.c +541 -0
  176. data/ext/digest/kangarootwelve/keccak/generic64/KeccakP-1600-times2-SnP.h +45 -0
  177. data/ext/digest/kangarootwelve/keccak/generic64/KeccakP-1600-times2-on1.c +37 -0
  178. data/ext/digest/kangarootwelve/keccak/generic64/KeccakP-1600-times4-SnP.h +45 -0
  179. data/ext/digest/kangarootwelve/keccak/generic64/KeccakP-1600-times4-on1.c +37 -0
  180. data/ext/digest/kangarootwelve/keccak/generic64/KeccakP-1600-times8-SnP.h +45 -0
  181. data/ext/digest/kangarootwelve/keccak/generic64/KeccakP-1600-times8-on1.c +37 -0
  182. data/ext/digest/kangarootwelve/keccak/generic64/KeccakP-1600-unrolling.macros +302 -0
  183. data/ext/digest/kangarootwelve/keccak/generic64/KeccakSpongeWidth1600.link.c +1 -0
  184. data/ext/digest/kangarootwelve/keccak/generic64/PlSnP-Fallback.inc +287 -0
  185. data/ext/digest/kangarootwelve/keccak/generic64/SnP-Relaned.h +140 -0
  186. data/ext/digest/kangarootwelve/keccak/generic64/ext.link.c +1 -0
  187. data/ext/digest/kangarootwelve/keccak/generic64lc/KangarooTwelve.link.c +1 -0
  188. data/ext/digest/kangarootwelve/keccak/generic64lc/KeccakDuplexWidth1600.link.c +1 -0
  189. data/ext/digest/kangarootwelve/keccak/generic64lc/KeccakP-1600-64.macros +2195 -0
  190. data/ext/digest/kangarootwelve/keccak/generic64lc/KeccakP-1600-SnP.h +49 -0
  191. data/ext/digest/kangarootwelve/keccak/generic64lc/KeccakP-1600-opt64-config.h +7 -0
  192. data/ext/digest/kangarootwelve/keccak/generic64lc/KeccakP-1600-opt64.c +541 -0
  193. data/ext/digest/kangarootwelve/keccak/generic64lc/KeccakP-1600-times2-SnP.h +45 -0
  194. data/ext/digest/kangarootwelve/keccak/generic64lc/KeccakP-1600-times2-on1.c +37 -0
  195. data/ext/digest/kangarootwelve/keccak/generic64lc/KeccakP-1600-times4-SnP.h +45 -0
  196. data/ext/digest/kangarootwelve/keccak/generic64lc/KeccakP-1600-times4-on1.c +37 -0
  197. data/ext/digest/kangarootwelve/keccak/generic64lc/KeccakP-1600-times8-SnP.h +45 -0
  198. data/ext/digest/kangarootwelve/keccak/generic64lc/KeccakP-1600-times8-on1.c +37 -0
  199. data/ext/digest/kangarootwelve/keccak/generic64lc/KeccakP-1600-unrolling.macros +302 -0
  200. data/ext/digest/kangarootwelve/keccak/generic64lc/KeccakSpongeWidth1600.link.c +1 -0
  201. data/ext/digest/kangarootwelve/keccak/generic64lc/PlSnP-Fallback.inc +287 -0
  202. data/ext/digest/kangarootwelve/keccak/generic64lc/SnP-Relaned.h +140 -0
  203. data/ext/digest/kangarootwelve/keccak/generic64lc/ext.link.c +1 -0
  204. data/ext/digest/kangarootwelve/keccak/haswell/KangarooTwelve.link.c +1 -0
  205. data/ext/digest/kangarootwelve/keccak/haswell/KeccakDuplexWidth1600.link.c +1 -0
  206. data/ext/digest/kangarootwelve/keccak/haswell/KeccakP-1600-AVX2.s +993 -0
  207. data/ext/digest/kangarootwelve/keccak/haswell/KeccakP-1600-SnP.h +41 -0
  208. data/ext/digest/kangarootwelve/keccak/haswell/KeccakP-1600-times2-SIMD128.c +954 -0
  209. data/ext/digest/kangarootwelve/keccak/haswell/KeccakP-1600-times2-SnP.h +47 -0
  210. data/ext/digest/kangarootwelve/keccak/haswell/KeccakP-1600-times4-SIMD256.c +1303 -0
  211. data/ext/digest/kangarootwelve/keccak/haswell/KeccakP-1600-times4-SnP.h +53 -0
  212. data/ext/digest/kangarootwelve/keccak/haswell/KeccakP-1600-times8-SnP.h +45 -0
  213. data/ext/digest/kangarootwelve/keccak/haswell/KeccakP-1600-times8-on4.c +38 -0
  214. data/ext/digest/kangarootwelve/keccak/haswell/KeccakP-1600-unrolling.macros +302 -0
  215. data/ext/digest/kangarootwelve/keccak/haswell/KeccakSpongeWidth1600.link.c +1 -0
  216. data/ext/digest/kangarootwelve/keccak/haswell/PlSnP-Fallback.inc +287 -0
  217. data/ext/digest/kangarootwelve/keccak/haswell/SIMD128-config.h +8 -0
  218. data/ext/digest/kangarootwelve/keccak/haswell/SIMD256-config.h +7 -0
  219. data/ext/digest/kangarootwelve/keccak/haswell/ext.link.c +1 -0
  220. data/ext/digest/kangarootwelve/keccak/nehalem/KangarooTwelve.link.c +1 -0
  221. data/ext/digest/kangarootwelve/keccak/nehalem/KeccakDuplexWidth1600.link.c +1 -0
  222. data/ext/digest/kangarootwelve/keccak/nehalem/KeccakP-1600-64.macros +2195 -0
  223. data/ext/digest/kangarootwelve/keccak/nehalem/KeccakP-1600-SnP.h +49 -0
  224. data/ext/digest/kangarootwelve/keccak/nehalem/KeccakP-1600-opt64-config.h +7 -0
  225. data/ext/digest/kangarootwelve/keccak/nehalem/KeccakP-1600-opt64.c +541 -0
  226. data/ext/digest/kangarootwelve/keccak/nehalem/KeccakP-1600-times2-SIMD128.c +954 -0
  227. data/ext/digest/kangarootwelve/keccak/nehalem/KeccakP-1600-times2-SnP.h +47 -0
  228. data/ext/digest/kangarootwelve/keccak/nehalem/KeccakP-1600-times4-SnP.h +45 -0
  229. data/ext/digest/kangarootwelve/keccak/nehalem/KeccakP-1600-times4-on2.c +38 -0
  230. data/ext/digest/kangarootwelve/keccak/nehalem/KeccakP-1600-times8-SnP.h +45 -0
  231. data/ext/digest/kangarootwelve/keccak/nehalem/KeccakP-1600-times8-on2.c +38 -0
  232. data/ext/digest/kangarootwelve/keccak/nehalem/KeccakP-1600-unrolling.macros +302 -0
  233. data/ext/digest/kangarootwelve/keccak/nehalem/KeccakSpongeWidth1600.link.c +1 -0
  234. data/ext/digest/kangarootwelve/keccak/nehalem/PlSnP-Fallback.inc +287 -0
  235. data/ext/digest/kangarootwelve/keccak/nehalem/SIMD128-config.h +8 -0
  236. data/ext/digest/kangarootwelve/keccak/nehalem/SnP-Relaned.h +140 -0
  237. data/ext/digest/kangarootwelve/keccak/nehalem/ext.link.c +1 -0
  238. data/ext/digest/kangarootwelve/keccak/reference/KangarooTwelve.link.c +1 -0
  239. data/ext/digest/kangarootwelve/keccak/reference/KeccakDuplexWidth1600.link.c +1 -0
  240. data/ext/digest/kangarootwelve/keccak/reference/KeccakP-1600-SnP.h +41 -0
  241. data/ext/digest/kangarootwelve/keccak/reference/KeccakP-1600-reference.c +424 -0
  242. data/ext/digest/kangarootwelve/keccak/reference/KeccakP-1600-reference.h +20 -0
  243. data/ext/digest/kangarootwelve/keccak/reference/KeccakP-1600-times2-SnP.h +45 -0
  244. data/ext/digest/kangarootwelve/keccak/reference/KeccakP-1600-times2-on1.c +37 -0
  245. data/ext/digest/kangarootwelve/keccak/reference/KeccakP-1600-times4-SnP.h +45 -0
  246. data/ext/digest/kangarootwelve/keccak/reference/KeccakP-1600-times4-on1.c +37 -0
  247. data/ext/digest/kangarootwelve/keccak/reference/KeccakP-1600-times8-SnP.h +45 -0
  248. data/ext/digest/kangarootwelve/keccak/reference/KeccakP-1600-times8-on1.c +37 -0
  249. data/ext/digest/kangarootwelve/keccak/reference/KeccakSpongeWidth1600.link.c +1 -0
  250. data/ext/digest/kangarootwelve/keccak/reference/PlSnP-Fallback.inc +287 -0
  251. data/ext/digest/kangarootwelve/keccak/reference/displayIntermediateValues.c +176 -0
  252. data/ext/digest/kangarootwelve/keccak/reference/displayIntermediateValues.h +29 -0
  253. data/ext/digest/kangarootwelve/keccak/reference/ext.link.c +1 -0
  254. data/ext/digest/kangarootwelve/keccak/reference32bits/KangarooTwelve.link.c +1 -0
  255. data/ext/digest/kangarootwelve/keccak/reference32bits/KeccakDuplexWidth1600.link.c +1 -0
  256. data/ext/digest/kangarootwelve/keccak/reference32bits/KeccakP-1600-SnP.h +41 -0
  257. data/ext/digest/kangarootwelve/keccak/reference32bits/KeccakP-1600-reference.h +20 -0
  258. data/ext/digest/kangarootwelve/keccak/reference32bits/KeccakP-1600-reference32BI.c +612 -0
  259. data/ext/digest/kangarootwelve/keccak/reference32bits/KeccakP-1600-times2-SnP.h +45 -0
  260. data/ext/digest/kangarootwelve/keccak/reference32bits/KeccakP-1600-times2-on1.c +37 -0
  261. data/ext/digest/kangarootwelve/keccak/reference32bits/KeccakP-1600-times4-SnP.h +45 -0
  262. data/ext/digest/kangarootwelve/keccak/reference32bits/KeccakP-1600-times4-on1.c +37 -0
  263. data/ext/digest/kangarootwelve/keccak/reference32bits/KeccakP-1600-times8-SnP.h +45 -0
  264. data/ext/digest/kangarootwelve/keccak/reference32bits/KeccakP-1600-times8-on1.c +37 -0
  265. data/ext/digest/kangarootwelve/keccak/reference32bits/KeccakSpongeWidth1600.link.c +1 -0
  266. data/ext/digest/kangarootwelve/keccak/reference32bits/PlSnP-Fallback.inc +287 -0
  267. data/ext/digest/kangarootwelve/keccak/reference32bits/displayIntermediateValues.c +176 -0
  268. data/ext/digest/kangarootwelve/keccak/reference32bits/displayIntermediateValues.h +29 -0
  269. data/ext/digest/kangarootwelve/keccak/reference32bits/ext.link.c +1 -0
  270. data/ext/digest/kangarootwelve/keccak/sandybridge/KangarooTwelve.link.c +1 -0
  271. data/ext/digest/kangarootwelve/keccak/sandybridge/KeccakDuplexWidth1600.link.c +1 -0
  272. data/ext/digest/kangarootwelve/keccak/sandybridge/KeccakP-1600-64.macros +2195 -0
  273. data/ext/digest/kangarootwelve/keccak/sandybridge/KeccakP-1600-SnP.h +49 -0
  274. data/ext/digest/kangarootwelve/keccak/sandybridge/KeccakP-1600-opt64-config.h +8 -0
  275. data/ext/digest/kangarootwelve/keccak/sandybridge/KeccakP-1600-opt64.c +541 -0
  276. data/ext/digest/kangarootwelve/keccak/sandybridge/KeccakP-1600-times2-SIMD128.c +954 -0
  277. data/ext/digest/kangarootwelve/keccak/sandybridge/KeccakP-1600-times2-SnP.h +47 -0
  278. data/ext/digest/kangarootwelve/keccak/sandybridge/KeccakP-1600-times4-SnP.h +45 -0
  279. data/ext/digest/kangarootwelve/keccak/sandybridge/KeccakP-1600-times4-on2.c +38 -0
  280. data/ext/digest/kangarootwelve/keccak/sandybridge/KeccakP-1600-times8-SnP.h +45 -0
  281. data/ext/digest/kangarootwelve/keccak/sandybridge/KeccakP-1600-times8-on2.c +38 -0
  282. data/ext/digest/kangarootwelve/keccak/sandybridge/KeccakP-1600-unrolling.macros +302 -0
  283. data/ext/digest/kangarootwelve/keccak/sandybridge/KeccakSpongeWidth1600.link.c +1 -0
  284. data/ext/digest/kangarootwelve/keccak/sandybridge/PlSnP-Fallback.inc +287 -0
  285. data/ext/digest/kangarootwelve/keccak/sandybridge/SIMD128-config.h +8 -0
  286. data/ext/digest/kangarootwelve/keccak/sandybridge/SnP-Relaned.h +140 -0
  287. data/ext/digest/kangarootwelve/keccak/sandybridge/ext.link.c +1 -0
  288. data/ext/digest/kangarootwelve/keccak/skylakex/KangarooTwelve.link.c +1 -0
  289. data/ext/digest/kangarootwelve/keccak/skylakex/KeccakDuplexWidth1600.link.c +1 -0
  290. data/ext/digest/kangarootwelve/keccak/skylakex/KeccakP-1600-AVX512-config.h +6 -0
  291. data/ext/digest/kangarootwelve/keccak/skylakex/KeccakP-1600-AVX512.c +621 -0
  292. data/ext/digest/kangarootwelve/keccak/skylakex/KeccakP-1600-SnP.h +42 -0
  293. data/ext/digest/kangarootwelve/keccak/skylakex/KeccakP-1600-times2-SIMD512.c +852 -0
  294. data/ext/digest/kangarootwelve/keccak/skylakex/KeccakP-1600-times2-SnP.h +49 -0
  295. data/ext/digest/kangarootwelve/keccak/skylakex/KeccakP-1600-times4-SIMD512.c +883 -0
  296. data/ext/digest/kangarootwelve/keccak/skylakex/KeccakP-1600-times4-SnP.h +49 -0
  297. data/ext/digest/kangarootwelve/keccak/skylakex/KeccakP-1600-times8-SIMD512.c +1473 -0
  298. data/ext/digest/kangarootwelve/keccak/skylakex/KeccakP-1600-times8-SnP.h +53 -0
  299. data/ext/digest/kangarootwelve/keccak/skylakex/KeccakSpongeWidth1600.link.c +1 -0
  300. data/ext/digest/kangarootwelve/keccak/skylakex/SIMD512-2-config.h +7 -0
  301. data/ext/digest/kangarootwelve/keccak/skylakex/SIMD512-4-config.h +7 -0
  302. data/ext/digest/kangarootwelve/keccak/skylakex/SIMD512-config.h +7 -0
  303. data/ext/digest/kangarootwelve/keccak/skylakex/ext.link.c +1 -0
  304. data/lib/digest/kangarootwelve/version.rb +1 -1
  305. metadata +299 -21
@@ -0,0 +1,47 @@
1
+ /*
2
+ Implementation by Gilles Van Assche, hereby denoted as "the implementer".
3
+
4
+ For more information, feedback or questions, please refer to our website:
5
+ https://keccak.team/
6
+
7
+ To the extent possible under law, the implementer has waived all copyright
8
+ and related or neighboring rights to the source code in this file.
9
+ http://creativecommons.org/publicdomain/zero/1.0/
10
+
11
+ ---
12
+
13
+ Please refer to PlSnP-documentation.h for more details.
14
+ */
15
+
16
+ #ifndef _KeccakP_1600_times2_SnP_h_
17
+ #define _KeccakP_1600_times2_SnP_h_
18
+
19
+ #include "SIMD128-config.h"
20
+
21
+ #define KeccakP1600times2_implementation "128-bit SIMD implementation (" KeccakP1600times2_implementation_config ")"
22
+ #define KeccakP1600times2_statesSizeInBytes 400
23
+ #define KeccakP1600times2_statesAlignment 16
24
+ #define KeccakF1600times2_FastLoop_supported
25
+
26
+ #include <stddef.h>
27
+
28
+ #define KeccakP1600times2_StaticInitialize()
29
+ void KeccakP1600times2_InitializeAll(void *states);
30
+ #define KeccakP1600times2_AddByte(states, instanceIndex, byte, offset) \
31
+ ((unsigned char*)(states))[(instanceIndex)*8 + ((offset)/8)*2*8 + (offset)%8] ^= (byte)
32
+ void KeccakP1600times2_AddBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length);
33
+ void KeccakP1600times2_AddLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset);
34
+ void KeccakP1600times2_OverwriteBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length);
35
+ void KeccakP1600times2_OverwriteLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset);
36
+ void KeccakP1600times2_OverwriteWithZeroes(void *states, unsigned int instanceIndex, unsigned int byteCount);
37
+ void KeccakP1600times2_PermuteAll_4rounds(void *states);
38
+ void KeccakP1600times2_PermuteAll_6rounds(void *states);
39
+ void KeccakP1600times2_PermuteAll_12rounds(void *states);
40
+ void KeccakP1600times2_PermuteAll_24rounds(void *states);
41
+ void KeccakP1600times2_ExtractBytes(const void *states, unsigned int instanceIndex, unsigned char *data, unsigned int offset, unsigned int length);
42
+ void KeccakP1600times2_ExtractLanesAll(const void *states, unsigned char *data, unsigned int laneCount, unsigned int laneOffset);
43
+ void KeccakP1600times2_ExtractAndAddBytes(const void *states, unsigned int instanceIndex, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length);
44
+ void KeccakP1600times2_ExtractAndAddLanesAll(const void *states, const unsigned char *input, unsigned char *output, unsigned int laneCount, unsigned int laneOffset);
45
+ size_t KeccakF1600times2_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, const unsigned char *data, size_t dataByteLen);
46
+
47
+ #endif
@@ -0,0 +1,1303 @@
1
+ /*
2
+ Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer".
3
+
4
+ For more information, feedback or questions, please refer to our website:
5
+ https://keccak.team/
6
+
7
+ To the extent possible under law, the implementer has waived all copyright
8
+ and related or neighboring rights to the source code in this file.
9
+ http://creativecommons.org/publicdomain/zero/1.0/
10
+
11
+ ---
12
+
13
+ This file implements Keccak-p[1600]×4 in a PlSnP-compatible way.
14
+ Please refer to PlSnP-documentation.h for more details.
15
+
16
+ This implementation comes with KeccakP-1600-times4-SnP.h in the same folder.
17
+ Please refer to LowLevel.build for the exact list of other files it must be combined with.
18
+ */
19
+
20
+ #include <stdio.h>
21
+ #include <stdlib.h>
22
+ #include <string.h>
23
+ #include <smmintrin.h>
24
+ #include <wmmintrin.h>
25
+ #include <immintrin.h>
26
+ #include <emmintrin.h>
27
+ #include "align.h"
28
+ #include "KeccakP-1600-times4-SnP.h"
29
+ #include "SIMD256-config.h"
30
+
31
+ #include "brg_endian.h"
32
+ #if (PLATFORM_BYTE_ORDER != IS_LITTLE_ENDIAN)
33
+ #error Expecting a little-endian platform
34
+ #endif
35
+
36
/* Scalar and vector types used throughout this file. */
typedef unsigned char       UINT8;   /* one byte */
typedef unsigned long long  UINT64;  /* one 64-bit lane */
typedef __m128i             V128;    /* 128-bit integer vector */
typedef __m256i             V256;    /* 256-bit integer vector: one lane of each of the 4 instances */
40
+
41
+ //#define UseGatherScatter
42
+
43
/*
 * Index, counted in 64-bit words, of lane `lanePosition` of instance
 * `instanceIndex` inside the interleaved state: the four instances' copies
 * of a lane are stored next to each other.
 * Both arguments are parenthesized so that expressions such as `k << 1`
 * or `c ? a : b` can be passed safely.
 */
#define laneIndex(instanceIndex, lanePosition) ((lanePosition)*4 + (instanceIndex))
44
+
45
#if defined(KeccakP1600times4_useAVX2)
    /*
     * Thin wrappers around the AVX2 intrinsics used by this file.
     * Macros taking `a` by name (CONST256, LOAD256, LOAD256u, STORE256, ...)
     * take the address of their argument, so they must be given an lvalue.
     */
    #define ANDnu256(a, b)          _mm256_andnot_si256(a, b)
    /* Aligned 256-bit load: the operand must be 32-byte aligned. */
    #define CONST256(a)             _mm256_load_si256((const V256 *)&(a))
    /* Broadcasts one 64-bit value to all four 64-bit elements. */
    #define CONST256_64(a)          (V256)_mm256_broadcast_sd((const double*)(&(a)))
    #define LOAD256(a)              _mm256_load_si256((const V256 *)&(a))
    #define LOAD256u(a)             _mm256_loadu_si256((const V256 *)&(a))
    /* Note the argument order of _mm256_set_epi64x: `d` ends up in element 0. */
    #define LOAD4_64(a, b, c, d)    _mm256_set_epi64x((UINT64)(a), (UINT64)(b), (UINT64)(c), (UINT64)(d))
    /* Rotates every 64-bit element of `a` left by `o` bits and assigns the result to `d`. */
    #define ROL64in256(d, a, o)     d = _mm256_or_si256(_mm256_slli_epi64(a, o), _mm256_srli_epi64(a, 64-(o)))
    /* Rotations by 8 and 56 bits are whole-byte moves, done with a byte shuffle. */
    #define ROL64in256_8(d, a)      d = _mm256_shuffle_epi8(a, CONST256(rho8))
    #define ROL64in256_56(d, a)     d = _mm256_shuffle_epi8(a, CONST256(rho56))
/*
 * Byte-shuffle masks for the two rotations above. They are read through
 * CONST256, i.e. with an aligned load, so they are explicitly aligned to
 * 32 bytes instead of relying on the compiler's default placement.
 */
static ALIGN(32) const UINT64 rho8[4] = {0x0605040302010007, 0x0E0D0C0B0A09080F, 0x1615141312111017, 0x1E1D1C1B1A19181F};
static ALIGN(32) const UINT64 rho56[4] = {0x0007060504030201, 0x080F0E0D0C0B0A09, 0x1017161514131211, 0x181F1E1D1C1B1A19};
    #define STORE256(a, b)          _mm256_store_si256((V256 *)&(a), b)
    #define STORE256u(a, b)         _mm256_storeu_si256((V256 *)&(a), b)
    /*
     * Stores the high and low 128-bit halves of `v` to two unaligned
     * locations. Uses the integer variant of the intrinsic so that the
     * V128* / V256 operands match its prototype.
     */
    #define STORE2_128(ah, al, v)   _mm256_storeu2_m128i((V128*)&(ah), (V128*)&(al), v)
    #define XOR256(a, b)            _mm256_xor_si256(a, b)
    #define XOReq256(a, b)          a = _mm256_xor_si256(a, b)
    #define UNPACKL( a, b )         _mm256_unpacklo_epi64((a), (b))
    #define UNPACKH( a, b )         _mm256_unpackhi_epi64((a), (b))
    #define PERM128( a, b, c )      (V256)_mm256_permute2f128_ps((__m256)(a), (__m256)(b), c)
    #define SHUFFLE64( a, b, c )    (V256)_mm256_shuffle_pd((__m256d)(a), (__m256d)(b), c)

    /*
     * UNINTLEAVE() and INTLEAVE() are comma expressions operating on the
     * caller's local variables lanes0..lanes3 (in/out) and lanesL01,
     * lanesH01, lanesL23, lanesH23 (scratch). Each performs a 4x4 transpose
     * of 64-bit words across the four vectors:
     *   UNINTLEAVE: four interleaved state lanes -> four consecutive lanes per instance
     *   INTLEAVE:   four consecutive lanes per instance -> four interleaved state lanes
     */
    #define UNINTLEAVE()            lanesL01 = UNPACKL( lanes0, lanes1 ),                   \
                                    lanesH01 = UNPACKH( lanes0, lanes1 ),                   \
                                    lanesL23 = UNPACKL( lanes2, lanes3 ),                   \
                                    lanesH23 = UNPACKH( lanes2, lanes3 ),                   \
                                    lanes0 = PERM128( lanesL01, lanesL23, 0x20 ),           \
                                    lanes2 = PERM128( lanesL01, lanesL23, 0x31 ),           \
                                    lanes1 = PERM128( lanesH01, lanesH23, 0x20 ),           \
                                    lanes3 = PERM128( lanesH01, lanesH23, 0x31 )

    #define INTLEAVE()              lanesL01 = PERM128( lanes0, lanes2, 0x20 ),             \
                                    lanesH01 = PERM128( lanes1, lanes3, 0x20 ),             \
                                    lanesL23 = PERM128( lanes0, lanes2, 0x31 ),             \
                                    lanesH23 = PERM128( lanes1, lanes3, 0x31 ),             \
                                    lanes0 = SHUFFLE64( lanesL01, lanesH01, 0x00 ),         \
                                    lanes1 = SHUFFLE64( lanesL01, lanesH01, 0x0F ),         \
                                    lanes2 = SHUFFLE64( lanesL23, lanesH23, 0x00 ),         \
                                    lanes3 = SHUFFLE64( lanesL23, lanesH23, 0x0F )

#endif
86
+
87
/* Size in bytes of one lane; lanes are handled as UINT64 values in this file. */
#define SnP_laneLengthInBytes 8
88
+
89
+ void KeccakP1600times4_InitializeAll(void *states)
90
+ {
91
+ memset(states, 0, KeccakP1600times4_statesSizeInBytes);
92
+ }
93
+
94
+ void KeccakP1600times4_AddBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length)
95
+ {
96
+ unsigned int sizeLeft = length;
97
+ unsigned int lanePosition = offset/SnP_laneLengthInBytes;
98
+ unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
99
+ const unsigned char *curData = data;
100
+ UINT64 *statesAsLanes = (UINT64 *)states;
101
+
102
+ if ((sizeLeft > 0) && (offsetInLane != 0)) {
103
+ unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
104
+ UINT64 lane = 0;
105
+ if (bytesInLane > sizeLeft)
106
+ bytesInLane = sizeLeft;
107
+ memcpy((unsigned char*)&lane + offsetInLane, curData, bytesInLane);
108
+ statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane;
109
+ sizeLeft -= bytesInLane;
110
+ lanePosition++;
111
+ curData += bytesInLane;
112
+ }
113
+
114
+ while(sizeLeft >= SnP_laneLengthInBytes) {
115
+ UINT64 lane = *((const UINT64*)curData);
116
+ statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane;
117
+ sizeLeft -= SnP_laneLengthInBytes;
118
+ lanePosition++;
119
+ curData += SnP_laneLengthInBytes;
120
+ }
121
+
122
+ if (sizeLeft > 0) {
123
+ UINT64 lane = 0;
124
+ memcpy(&lane, curData, sizeLeft);
125
+ statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane;
126
+ }
127
+ }
128
+
129
+ void KeccakP1600times4_AddLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
130
+ {
131
+ V256 *stateAsLanes = (V256 *)states;
132
+ unsigned int i;
133
+ const UINT64 *curData0 = (const UINT64 *)data;
134
+ const UINT64 *curData1 = (const UINT64 *)(data+laneOffset*SnP_laneLengthInBytes);
135
+ const UINT64 *curData2 = (const UINT64 *)(data+laneOffset*2*SnP_laneLengthInBytes);
136
+ const UINT64 *curData3 = (const UINT64 *)(data+laneOffset*3*SnP_laneLengthInBytes);
137
+ V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;
138
+
139
+ #define Xor_In( argIndex ) XOReq256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))
140
+
141
+ #define Xor_In4( argIndex ) lanes0 = LOAD256u( curData0[argIndex]),\
142
+ lanes1 = LOAD256u( curData1[argIndex]),\
143
+ lanes2 = LOAD256u( curData2[argIndex]),\
144
+ lanes3 = LOAD256u( curData3[argIndex]),\
145
+ INTLEAVE(),\
146
+ XOReq256( stateAsLanes[argIndex+0], lanes0 ),\
147
+ XOReq256( stateAsLanes[argIndex+1], lanes1 ),\
148
+ XOReq256( stateAsLanes[argIndex+2], lanes2 ),\
149
+ XOReq256( stateAsLanes[argIndex+3], lanes3 )
150
+
151
+ if ( laneCount >= 16 ) {
152
+ Xor_In4( 0 );
153
+ Xor_In4( 4 );
154
+ Xor_In4( 8 );
155
+ Xor_In4( 12 );
156
+ if ( laneCount >= 20 ) {
157
+ Xor_In4( 16 );
158
+ for(i=20; i<laneCount; i++)
159
+ Xor_In( i );
160
+ }
161
+ else {
162
+ for(i=16; i<laneCount; i++)
163
+ Xor_In( i );
164
+ }
165
+ }
166
+ else {
167
+ for(i=0; i<laneCount; i++)
168
+ Xor_In( i );
169
+ }
170
+ #undef Xor_In
171
+ #undef Xor_In4
172
+ }
173
+
174
+ void KeccakP1600times4_OverwriteBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length)
175
+ {
176
+ unsigned int sizeLeft = length;
177
+ unsigned int lanePosition = offset/SnP_laneLengthInBytes;
178
+ unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
179
+ const unsigned char *curData = data;
180
+ UINT64 *statesAsLanes = (UINT64 *)states;
181
+
182
+ if ((sizeLeft > 0) && (offsetInLane != 0)) {
183
+ unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
184
+ if (bytesInLane > sizeLeft)
185
+ bytesInLane = sizeLeft;
186
+ memcpy( ((unsigned char *)&statesAsLanes[laneIndex(instanceIndex, lanePosition)]) + offsetInLane, curData, bytesInLane);
187
+ sizeLeft -= bytesInLane;
188
+ lanePosition++;
189
+ curData += bytesInLane;
190
+ }
191
+
192
+ while(sizeLeft >= SnP_laneLengthInBytes) {
193
+ UINT64 lane = *((const UINT64*)curData);
194
+ statesAsLanes[laneIndex(instanceIndex, lanePosition)] = lane;
195
+ sizeLeft -= SnP_laneLengthInBytes;
196
+ lanePosition++;
197
+ curData += SnP_laneLengthInBytes;
198
+ }
199
+
200
+ if (sizeLeft > 0) {
201
+ memcpy(&statesAsLanes[laneIndex(instanceIndex, lanePosition)], curData, sizeLeft);
202
+ }
203
+ }
204
+
205
+ void KeccakP1600times4_OverwriteLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
206
+ {
207
+ V256 *stateAsLanes = (V256 *)states;
208
+ unsigned int i;
209
+ const UINT64 *curData0 = (const UINT64 *)data;
210
+ const UINT64 *curData1 = (const UINT64 *)(data+laneOffset*SnP_laneLengthInBytes);
211
+ const UINT64 *curData2 = (const UINT64 *)(data+laneOffset*2*SnP_laneLengthInBytes);
212
+ const UINT64 *curData3 = (const UINT64 *)(data+laneOffset*3*SnP_laneLengthInBytes);
213
+ V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;
214
+
215
+ #define OverWr( argIndex ) STORE256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))
216
+
217
+ #define OverWr4( argIndex ) lanes0 = LOAD256u( curData0[argIndex]),\
218
+ lanes1 = LOAD256u( curData1[argIndex]),\
219
+ lanes2 = LOAD256u( curData2[argIndex]),\
220
+ lanes3 = LOAD256u( curData3[argIndex]),\
221
+ INTLEAVE(),\
222
+ STORE256( stateAsLanes[argIndex+0], lanes0 ),\
223
+ STORE256( stateAsLanes[argIndex+1], lanes1 ),\
224
+ STORE256( stateAsLanes[argIndex+2], lanes2 ),\
225
+ STORE256( stateAsLanes[argIndex+3], lanes3 )
226
+
227
+ if ( laneCount >= 16 ) {
228
+ OverWr4( 0 );
229
+ OverWr4( 4 );
230
+ OverWr4( 8 );
231
+ OverWr4( 12 );
232
+ if ( laneCount >= 20 ) {
233
+ OverWr4( 16 );
234
+ for(i=20; i<laneCount; i++)
235
+ OverWr( i );
236
+ }
237
+ else {
238
+ for(i=16; i<laneCount; i++)
239
+ OverWr( i );
240
+ }
241
+ }
242
+ else {
243
+ for(i=0; i<laneCount; i++)
244
+ OverWr( i );
245
+ }
246
+ #undef OverWr
247
+ #undef OverWr4
248
+ }
249
+
250
+ void KeccakP1600times4_OverwriteWithZeroes(void *states, unsigned int instanceIndex, unsigned int byteCount)
251
+ {
252
+ unsigned int sizeLeft = byteCount;
253
+ unsigned int lanePosition = 0;
254
+ UINT64 *statesAsLanes = (UINT64 *)states;
255
+
256
+ while(sizeLeft >= SnP_laneLengthInBytes) {
257
+ statesAsLanes[laneIndex(instanceIndex, lanePosition)] = 0;
258
+ sizeLeft -= SnP_laneLengthInBytes;
259
+ lanePosition++;
260
+ }
261
+
262
+ if (sizeLeft > 0) {
263
+ memset(&statesAsLanes[laneIndex(instanceIndex, lanePosition)], 0, sizeLeft);
264
+ }
265
+ }
266
+
267
+ void KeccakP1600times4_ExtractBytes(const void *states, unsigned int instanceIndex, unsigned char *data, unsigned int offset, unsigned int length)
268
+ {
269
+ unsigned int sizeLeft = length;
270
+ unsigned int lanePosition = offset/SnP_laneLengthInBytes;
271
+ unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
272
+ unsigned char *curData = data;
273
+ const UINT64 *statesAsLanes = (const UINT64 *)states;
274
+
275
+ if ((sizeLeft > 0) && (offsetInLane != 0)) {
276
+ unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
277
+ if (bytesInLane > sizeLeft)
278
+ bytesInLane = sizeLeft;
279
+ memcpy( curData, ((unsigned char *)&statesAsLanes[laneIndex(instanceIndex, lanePosition)]) + offsetInLane, bytesInLane);
280
+ sizeLeft -= bytesInLane;
281
+ lanePosition++;
282
+ curData += bytesInLane;
283
+ }
284
+
285
+ while(sizeLeft >= SnP_laneLengthInBytes) {
286
+ *(UINT64*)curData = statesAsLanes[laneIndex(instanceIndex, lanePosition)];
287
+ sizeLeft -= SnP_laneLengthInBytes;
288
+ lanePosition++;
289
+ curData += SnP_laneLengthInBytes;
290
+ }
291
+
292
+ if (sizeLeft > 0) {
293
+ memcpy( curData, &statesAsLanes[laneIndex(instanceIndex, lanePosition)], sizeLeft);
294
+ }
295
+ }
296
+
297
+ void KeccakP1600times4_ExtractLanesAll(const void *states, unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
298
+ {
299
+ UINT64 *curData0 = (UINT64 *)data;
300
+ UINT64 *curData1 = (UINT64 *)(data+laneOffset*1*SnP_laneLengthInBytes);
301
+ UINT64 *curData2 = (UINT64 *)(data+laneOffset*2*SnP_laneLengthInBytes);
302
+ UINT64 *curData3 = (UINT64 *)(data+laneOffset*3*SnP_laneLengthInBytes);
303
+
304
+ const V256 *stateAsLanes = (const V256 *)states;
305
+ const UINT64 *stateAsLanes64 = (const UINT64*)states;
306
+ V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;
307
+ unsigned int i;
308
+
309
+ #define Extr( argIndex ) curData0[argIndex] = stateAsLanes64[4*(argIndex)], \
310
+ curData1[argIndex] = stateAsLanes64[4*(argIndex)+1], \
311
+ curData2[argIndex] = stateAsLanes64[4*(argIndex)+2], \
312
+ curData3[argIndex] = stateAsLanes64[4*(argIndex)+3]
313
+
314
+ #define Extr4( argIndex ) lanes0 = LOAD256( stateAsLanes[argIndex+0] ), \
315
+ lanes1 = LOAD256( stateAsLanes[argIndex+1] ), \
316
+ lanes2 = LOAD256( stateAsLanes[argIndex+2] ), \
317
+ lanes3 = LOAD256( stateAsLanes[argIndex+3] ), \
318
+ UNINTLEAVE(), \
319
+ STORE256u( curData0[argIndex], lanes0 ), \
320
+ STORE256u( curData1[argIndex], lanes1 ), \
321
+ STORE256u( curData2[argIndex], lanes2 ), \
322
+ STORE256u( curData3[argIndex], lanes3 )
323
+
324
+ if ( laneCount >= 16 ) {
325
+ Extr4( 0 );
326
+ Extr4( 4 );
327
+ Extr4( 8 );
328
+ Extr4( 12 );
329
+ if ( laneCount >= 20 ) {
330
+ Extr4( 16 );
331
+ for(i=20; i<laneCount; i++)
332
+ Extr( i );
333
+ }
334
+ else {
335
+ for(i=16; i<laneCount; i++)
336
+ Extr( i );
337
+ }
338
+ }
339
+ else {
340
+ for(i=0; i<laneCount; i++)
341
+ Extr( i );
342
+ }
343
+ #undef Extr
344
+ #undef Extr4
345
+ }
346
+
347
+ void KeccakP1600times4_ExtractAndAddBytes(const void *states, unsigned int instanceIndex, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length)
348
+ {
349
+ unsigned int sizeLeft = length;
350
+ unsigned int lanePosition = offset/SnP_laneLengthInBytes;
351
+ unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
352
+ const unsigned char *curInput = input;
353
+ unsigned char *curOutput = output;
354
+ const UINT64 *statesAsLanes = (const UINT64 *)states;
355
+
356
+ if ((sizeLeft > 0) && (offsetInLane != 0)) {
357
+ unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
358
+ UINT64 lane = statesAsLanes[laneIndex(instanceIndex, lanePosition)] >> (8 * offsetInLane);
359
+ if (bytesInLane > sizeLeft)
360
+ bytesInLane = sizeLeft;
361
+ sizeLeft -= bytesInLane;
362
+ do {
363
+ *(curOutput++) = *(curInput++) ^ (unsigned char)lane;
364
+ lane >>= 8;
365
+ } while ( --bytesInLane != 0);
366
+ lanePosition++;
367
+ }
368
+
369
+ while(sizeLeft >= SnP_laneLengthInBytes) {
370
+ *((UINT64*)curOutput) = *((UINT64*)curInput) ^ statesAsLanes[laneIndex(instanceIndex, lanePosition)];
371
+ sizeLeft -= SnP_laneLengthInBytes;
372
+ lanePosition++;
373
+ curInput += SnP_laneLengthInBytes;
374
+ curOutput += SnP_laneLengthInBytes;
375
+ }
376
+
377
+ if (sizeLeft != 0) {
378
+ UINT64 lane = statesAsLanes[laneIndex(instanceIndex, lanePosition)];
379
+ do {
380
+ *(curOutput++) = *(curInput++) ^ (unsigned char)lane;
381
+ lane >>= 8;
382
+ } while ( --sizeLeft != 0);
383
+ }
384
+ }
385
+
386
+ void KeccakP1600times4_ExtractAndAddLanesAll(const void *states, const unsigned char *input, unsigned char *output, unsigned int laneCount, unsigned int laneOffset)
387
+ {
388
+ const UINT64 *curInput0 = (UINT64 *)input;
389
+ const UINT64 *curInput1 = (UINT64 *)(input+laneOffset*1*SnP_laneLengthInBytes);
390
+ const UINT64 *curInput2 = (UINT64 *)(input+laneOffset*2*SnP_laneLengthInBytes);
391
+ const UINT64 *curInput3 = (UINT64 *)(input+laneOffset*3*SnP_laneLengthInBytes);
392
+ UINT64 *curOutput0 = (UINT64 *)output;
393
+ UINT64 *curOutput1 = (UINT64 *)(output+laneOffset*1*SnP_laneLengthInBytes);
394
+ UINT64 *curOutput2 = (UINT64 *)(output+laneOffset*2*SnP_laneLengthInBytes);
395
+ UINT64 *curOutput3 = (UINT64 *)(output+laneOffset*3*SnP_laneLengthInBytes);
396
+
397
+ const V256 *stateAsLanes = (const V256 *)states;
398
+ const UINT64 *stateAsLanes64 = (const UINT64*)states;
399
+ V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;
400
+ unsigned int i;
401
+
402
+ #define ExtrXor( argIndex ) \
403
+ curOutput0[argIndex] = curInput0[argIndex] ^ stateAsLanes64[4*(argIndex)],\
404
+ curOutput1[argIndex] = curInput1[argIndex] ^ stateAsLanes64[4*(argIndex)+1],\
405
+ curOutput2[argIndex] = curInput2[argIndex] ^ stateAsLanes64[4*(argIndex)+2],\
406
+ curOutput3[argIndex] = curInput3[argIndex] ^ stateAsLanes64[4*(argIndex)+3]
407
+
408
+ #define ExtrXor4( argIndex ) \
409
+ lanes0 = LOAD256( stateAsLanes[argIndex+0] ),\
410
+ lanes1 = LOAD256( stateAsLanes[argIndex+1] ),\
411
+ lanes2 = LOAD256( stateAsLanes[argIndex+2] ),\
412
+ lanes3 = LOAD256( stateAsLanes[argIndex+3] ),\
413
+ UNINTLEAVE(),\
414
+ lanesL01 = LOAD256u( curInput0[argIndex]),\
415
+ lanesH01 = LOAD256u( curInput1[argIndex]),\
416
+ lanesL23 = LOAD256u( curInput2[argIndex]),\
417
+ lanesH23 = LOAD256u( curInput3[argIndex]),\
418
+ XOReq256( lanes0, lanesL01 ),\
419
+ XOReq256( lanes1, lanesH01 ),\
420
+ XOReq256( lanes2, lanesL23 ),\
421
+ XOReq256( lanes3, lanesH23 ),\
422
+ STORE256u( curOutput0[argIndex], lanes0 ),\
423
+ STORE256u( curOutput1[argIndex], lanes1 ),\
424
+ STORE256u( curOutput2[argIndex], lanes2 ),\
425
+ STORE256u( curOutput3[argIndex], lanes3 )
426
+
427
+ if ( laneCount >= 16 ) {
428
+ ExtrXor4( 0 );
429
+ ExtrXor4( 4 );
430
+ ExtrXor4( 8 );
431
+ ExtrXor4( 12 );
432
+ if ( laneCount >= 20 ) {
433
+ ExtrXor4( 16 );
434
+ for(i=20; i<laneCount; i++)
435
+ ExtrXor( i );
436
+ }
437
+ else {
438
+ for(i=16; i<laneCount; i++)
439
+ ExtrXor( i );
440
+ }
441
+ }
442
+ else {
443
+ for(i=0; i<laneCount; i++)
444
+ ExtrXor( i );
445
+ }
446
+ #undef ExtrXor
447
+ #undef ExtrXor4
448
+ }
449
+
450
/*
 * Declares every V256 working variable used by the round macros below:
 *   A.. and E..  two sets of 25 lane variables (the A / E macro arguments)
 *   B..          25 temporaries written by the rotations and read by chi
 *   C., C.1      column parities and their 1-bit rotations
 *   D.           the five values XORed into the lanes by theta
 * Expands to a list of declarations, so it must appear where declarations
 * are allowed.
 */
#define declareABCDE \
    V256 Aba, Abe, Abi, Abo, Abu; \
    V256 Aga, Age, Agi, Ago, Agu; \
    V256 Aka, Ake, Aki, Ako, Aku; \
    V256 Ama, Ame, Ami, Amo, Amu; \
    V256 Asa, Ase, Asi, Aso, Asu; \
    V256 Bba, Bbe, Bbi, Bbo, Bbu; \
    V256 Bga, Bge, Bgi, Bgo, Bgu; \
    V256 Bka, Bke, Bki, Bko, Bku; \
    V256 Bma, Bme, Bmi, Bmo, Bmu; \
    V256 Bsa, Bse, Bsi, Bso, Bsu; \
    V256 Ca, Ce, Ci, Co, Cu; \
    V256 Ca1, Ce1, Ci1, Co1, Cu1; \
    V256 Da, De, Di, Do, Du; \
    V256 Eba, Ebe, Ebi, Ebo, Ebu; \
    V256 Ega, Ege, Egi, Ego, Egu; \
    V256 Eka, Eke, Eki, Eko, Eku; \
    V256 Ema, Eme, Emi, Emo, Emu; \
    V256 Esa, Ese, Esi, Eso, Esu;
469
+
470
/*
 * Computes the five column parities Ca..Cu as the XOR of the five A lanes
 * sharing the same last letter (a, e, i, o, u). The round macros expect
 * these to be valid on entry.
 */
#define prepareTheta \
    Ca = XOR256(Aba, XOR256(Aga, XOR256(Aka, XOR256(Ama, Asa)))); \
    Ce = XOR256(Abe, XOR256(Age, XOR256(Ake, XOR256(Ame, Ase)))); \
    Ci = XOR256(Abi, XOR256(Agi, XOR256(Aki, XOR256(Ami, Asi)))); \
    Co = XOR256(Abo, XOR256(Ago, XOR256(Ako, XOR256(Amo, Aso)))); \
    Cu = XOR256(Abu, XOR256(Agu, XOR256(Aku, XOR256(Amu, Asu))));
476
+
477
/* --- Theta Rho Pi Chi Iota Prepare-theta */
/* --- 64-bit lanes mapped to 64-bit words */
/*
 * One round, reading lanes A##xx and writing lanes E##xx, for all four
 * instances at once (every variable is a V256).
 *
 *   i - index into KeccakF1600RoundConstants of this round's constant
 *   A - prefix of the input lane variables; they are modified (theta is
 *       XORed into them in place)
 *   E - prefix of the output lane variables
 *
 * On entry Ca..Cu must hold the column parities of the A lanes. On exit
 * Ca..Cu hold the column parities of the E lanes, accumulated while the E
 * lanes are produced, so the next round can start without prepareTheta.
 */
#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \
    /* theta: D values from the column parities */ \
    ROL64in256(Ce1, Ce, 1);  Da = XOR256(Cu, Ce1); \
    ROL64in256(Ci1, Ci, 1);  De = XOR256(Ca, Ci1); \
    ROL64in256(Co1, Co, 1);  Di = XOR256(Ce, Co1); \
    ROL64in256(Cu1, Cu, 1);  Do = XOR256(Ci, Cu1); \
    ROL64in256(Ca1, Ca, 1);  Du = XOR256(Co, Ca1); \
\
    /* output lanes E##b*: these start the new parities; E##ba gets the round constant */ \
    XOReq256(A##ba, Da);  Bba = A##ba; \
    XOReq256(A##ge, De);  ROL64in256(Bbe, A##ge, 44); \
    XOReq256(A##ki, Di);  ROL64in256(Bbi, A##ki, 43); \
    E##ba = XOR256(Bba, ANDnu256(Bbe, Bbi)); \
    XOReq256(E##ba, CONST256_64(KeccakF1600RoundConstants[i])); \
    Ca = E##ba; \
    XOReq256(A##mo, Do);  ROL64in256(Bbo, A##mo, 21); \
    E##be = XOR256(Bbe, ANDnu256(Bbi, Bbo));  Ce = E##be; \
    XOReq256(A##su, Du);  ROL64in256(Bbu, A##su, 14); \
    E##bi = XOR256(Bbi, ANDnu256(Bbo, Bbu));  Ci = E##bi; \
    E##bo = XOR256(Bbo, ANDnu256(Bbu, Bba));  Co = E##bo; \
    E##bu = XOR256(Bbu, ANDnu256(Bba, Bbe));  Cu = E##bu; \
\
    /* output lanes E##g* */ \
    XOReq256(A##bo, Do);  ROL64in256(Bga, A##bo, 28); \
    XOReq256(A##gu, Du);  ROL64in256(Bge, A##gu, 20); \
    XOReq256(A##ka, Da);  ROL64in256(Bgi, A##ka, 3); \
    E##ga = XOR256(Bga, ANDnu256(Bge, Bgi));  XOReq256(Ca, E##ga); \
    XOReq256(A##me, De);  ROL64in256(Bgo, A##me, 45); \
    E##ge = XOR256(Bge, ANDnu256(Bgi, Bgo));  XOReq256(Ce, E##ge); \
    XOReq256(A##si, Di);  ROL64in256(Bgu, A##si, 61); \
    E##gi = XOR256(Bgi, ANDnu256(Bgo, Bgu));  XOReq256(Ci, E##gi); \
    E##go = XOR256(Bgo, ANDnu256(Bgu, Bga));  XOReq256(Co, E##go); \
    E##gu = XOR256(Bgu, ANDnu256(Bga, Bge));  XOReq256(Cu, E##gu); \
\
    /* output lanes E##k* */ \
    XOReq256(A##be, De);  ROL64in256(Bka, A##be, 1); \
    XOReq256(A##gi, Di);  ROL64in256(Bke, A##gi, 6); \
    XOReq256(A##ko, Do);  ROL64in256(Bki, A##ko, 25); \
    E##ka = XOR256(Bka, ANDnu256(Bke, Bki));  XOReq256(Ca, E##ka); \
    XOReq256(A##mu, Du);  ROL64in256_8(Bko, A##mu); \
    E##ke = XOR256(Bke, ANDnu256(Bki, Bko));  XOReq256(Ce, E##ke); \
    XOReq256(A##sa, Da);  ROL64in256(Bku, A##sa, 18); \
    E##ki = XOR256(Bki, ANDnu256(Bko, Bku));  XOReq256(Ci, E##ki); \
    E##ko = XOR256(Bko, ANDnu256(Bku, Bka));  XOReq256(Co, E##ko); \
    E##ku = XOR256(Bku, ANDnu256(Bka, Bke));  XOReq256(Cu, E##ku); \
\
    /* output lanes E##m* */ \
    XOReq256(A##bu, Du);  ROL64in256(Bma, A##bu, 27); \
    XOReq256(A##ga, Da);  ROL64in256(Bme, A##ga, 36); \
    XOReq256(A##ke, De);  ROL64in256(Bmi, A##ke, 10); \
    E##ma = XOR256(Bma, ANDnu256(Bme, Bmi));  XOReq256(Ca, E##ma); \
    XOReq256(A##mi, Di);  ROL64in256(Bmo, A##mi, 15); \
    E##me = XOR256(Bme, ANDnu256(Bmi, Bmo));  XOReq256(Ce, E##me); \
    XOReq256(A##so, Do);  ROL64in256_56(Bmu, A##so); \
    E##mi = XOR256(Bmi, ANDnu256(Bmo, Bmu));  XOReq256(Ci, E##mi); \
    E##mo = XOR256(Bmo, ANDnu256(Bmu, Bma));  XOReq256(Co, E##mo); \
    E##mu = XOR256(Bmu, ANDnu256(Bma, Bme));  XOReq256(Cu, E##mu); \
\
    /* output lanes E##s* */ \
    XOReq256(A##bi, Di);  ROL64in256(Bsa, A##bi, 62); \
    XOReq256(A##go, Do);  ROL64in256(Bse, A##go, 55); \
    XOReq256(A##ku, Du);  ROL64in256(Bsi, A##ku, 39); \
    E##sa = XOR256(Bsa, ANDnu256(Bse, Bsi));  XOReq256(Ca, E##sa); \
    XOReq256(A##ma, Da);  ROL64in256(Bso, A##ma, 41); \
    E##se = XOR256(Bse, ANDnu256(Bsi, Bso));  XOReq256(Ce, E##se); \
    XOReq256(A##se, De);  ROL64in256(Bsu, A##se, 2); \
    E##si = XOR256(Bsi, ANDnu256(Bso, Bsu));  XOReq256(Ci, E##si); \
    E##so = XOR256(Bso, ANDnu256(Bsu, Bsa));  XOReq256(Co, E##so); \
    E##su = XOR256(Bsu, ANDnu256(Bsa, Bse));  XOReq256(Cu, E##su);
597
+
598
/* --- Theta Rho Pi Chi Iota */
/* --- 64-bit lanes mapped to 64-bit words */
/*
 * One round, reading lanes A##xx and writing lanes E##xx, for all four
 * instances at once (every variable is a V256).
 *
 *   i - index into KeccakF1600RoundConstants of this round's constant
 *   A - prefix of the input lane variables; they are modified (theta is
 *       XORed into them in place)
 *   E - prefix of the output lane variables
 *
 * On entry Ca..Cu must hold the column parities of the A lanes. Unlike
 * thetaRhoPiChiIotaPrepareTheta, this variant does not update Ca..Cu.
 */
#define thetaRhoPiChiIota(i, A, E) \
    /* theta: D values from the column parities */ \
    ROL64in256(Ce1, Ce, 1);  Da = XOR256(Cu, Ce1); \
    ROL64in256(Ci1, Ci, 1);  De = XOR256(Ca, Ci1); \
    ROL64in256(Co1, Co, 1);  Di = XOR256(Ce, Co1); \
    ROL64in256(Cu1, Cu, 1);  Do = XOR256(Ci, Cu1); \
    ROL64in256(Ca1, Ca, 1);  Du = XOR256(Co, Ca1); \
\
    /* output lanes E##b*: E##ba gets the round constant */ \
    XOReq256(A##ba, Da);  Bba = A##ba; \
    XOReq256(A##ge, De);  ROL64in256(Bbe, A##ge, 44); \
    XOReq256(A##ki, Di);  ROL64in256(Bbi, A##ki, 43); \
    E##ba = XOR256(Bba, ANDnu256(Bbe, Bbi)); \
    XOReq256(E##ba, CONST256_64(KeccakF1600RoundConstants[i])); \
    XOReq256(A##mo, Do);  ROL64in256(Bbo, A##mo, 21); \
    E##be = XOR256(Bbe, ANDnu256(Bbi, Bbo)); \
    XOReq256(A##su, Du);  ROL64in256(Bbu, A##su, 14); \
    E##bi = XOR256(Bbi, ANDnu256(Bbo, Bbu)); \
    E##bo = XOR256(Bbo, ANDnu256(Bbu, Bba)); \
    E##bu = XOR256(Bbu, ANDnu256(Bba, Bbe)); \
\
    /* output lanes E##g* */ \
    XOReq256(A##bo, Do);  ROL64in256(Bga, A##bo, 28); \
    XOReq256(A##gu, Du);  ROL64in256(Bge, A##gu, 20); \
    XOReq256(A##ka, Da);  ROL64in256(Bgi, A##ka, 3); \
    E##ga = XOR256(Bga, ANDnu256(Bge, Bgi)); \
    XOReq256(A##me, De);  ROL64in256(Bgo, A##me, 45); \
    E##ge = XOR256(Bge, ANDnu256(Bgi, Bgo)); \
    XOReq256(A##si, Di);  ROL64in256(Bgu, A##si, 61); \
    E##gi = XOR256(Bgi, ANDnu256(Bgo, Bgu)); \
    E##go = XOR256(Bgo, ANDnu256(Bgu, Bga)); \
    E##gu = XOR256(Bgu, ANDnu256(Bga, Bge)); \
\
    /* output lanes E##k* */ \
    XOReq256(A##be, De);  ROL64in256(Bka, A##be, 1); \
    XOReq256(A##gi, Di);  ROL64in256(Bke, A##gi, 6); \
    XOReq256(A##ko, Do);  ROL64in256(Bki, A##ko, 25); \
    E##ka = XOR256(Bka, ANDnu256(Bke, Bki)); \
    XOReq256(A##mu, Du);  ROL64in256_8(Bko, A##mu); \
    E##ke = XOR256(Bke, ANDnu256(Bki, Bko)); \
    XOReq256(A##sa, Da);  ROL64in256(Bku, A##sa, 18); \
    E##ki = XOR256(Bki, ANDnu256(Bko, Bku)); \
    E##ko = XOR256(Bko, ANDnu256(Bku, Bka)); \
    E##ku = XOR256(Bku, ANDnu256(Bka, Bke)); \
\
    /* output lanes E##m* */ \
    XOReq256(A##bu, Du);  ROL64in256(Bma, A##bu, 27); \
    XOReq256(A##ga, Da);  ROL64in256(Bme, A##ga, 36); \
    XOReq256(A##ke, De);  ROL64in256(Bmi, A##ke, 10); \
    E##ma = XOR256(Bma, ANDnu256(Bme, Bmi)); \
    XOReq256(A##mi, Di);  ROL64in256(Bmo, A##mi, 15); \
    E##me = XOR256(Bme, ANDnu256(Bmi, Bmo)); \
    XOReq256(A##so, Do);  ROL64in256_56(Bmu, A##so); \
    E##mi = XOR256(Bmi, ANDnu256(Bmo, Bmu)); \
    E##mo = XOR256(Bmo, ANDnu256(Bmu, Bma)); \
    E##mu = XOR256(Bmu, ANDnu256(Bma, Bme)); \
\
    /* output lanes E##s* */ \
    XOReq256(A##bi, Di);  ROL64in256(Bsa, A##bi, 62); \
    XOReq256(A##go, Do);  ROL64in256(Bse, A##go, 55); \
    XOReq256(A##ku, Du);  ROL64in256(Bsi, A##ku, 39); \
    E##sa = XOR256(Bsa, ANDnu256(Bse, Bsi)); \
    XOReq256(A##ma, Da);  ROL64in256(Bso, A##ma, 41); \
    E##se = XOR256(Bse, ANDnu256(Bsi, Bso)); \
    XOReq256(A##se, De);  ROL64in256(Bsu, A##se, 2); \
    E##si = XOR256(Bsi, ANDnu256(Bso, Bsu)); \
    E##so = XOR256(Bso, ANDnu256(Bsu, Bsa)); \
    E##su = XOR256(Bsu, ANDnu256(Bsa, Bse));
693
+
694
+ static ALIGN(KeccakP1600times4_statesAlignment) const UINT64 KeccakF1600RoundConstants[24] = {
695
+ 0x0000000000000001ULL,
696
+ 0x0000000000008082ULL,
697
+ 0x800000000000808aULL,
698
+ 0x8000000080008000ULL,
699
+ 0x000000000000808bULL,
700
+ 0x0000000080000001ULL,
701
+ 0x8000000080008081ULL,
702
+ 0x8000000000008009ULL,
703
+ 0x000000000000008aULL,
704
+ 0x0000000000000088ULL,
705
+ 0x0000000080008009ULL,
706
+ 0x000000008000000aULL,
707
+ 0x000000008000808bULL,
708
+ 0x800000000000008bULL,
709
+ 0x8000000000008089ULL,
710
+ 0x8000000000008003ULL,
711
+ 0x8000000000008002ULL,
712
+ 0x8000000000000080ULL,
713
+ 0x000000000000800aULL,
714
+ 0x800000008000000aULL,
715
+ 0x8000000080008081ULL,
716
+ 0x8000000000008080ULL,
717
+ 0x0000000080000001ULL,
718
+ 0x8000000080008008ULL};
719
+
720
/*
 * Loads the 25 vectors state[0..24] into the lane variables X##ba .. X##su
 * with aligned loads. `state` must be indexable and yield V256 lvalues.
 */
#define copyFromState(X, state) \
    X##ba = LOAD256(state[ 0]); \
    X##be = LOAD256(state[ 1]); \
    X##bi = LOAD256(state[ 2]); \
    X##bo = LOAD256(state[ 3]); \
    X##bu = LOAD256(state[ 4]); \
\
    X##ga = LOAD256(state[ 5]); \
    X##ge = LOAD256(state[ 6]); \
    X##gi = LOAD256(state[ 7]); \
    X##go = LOAD256(state[ 8]); \
    X##gu = LOAD256(state[ 9]); \
\
    X##ka = LOAD256(state[10]); \
    X##ke = LOAD256(state[11]); \
    X##ki = LOAD256(state[12]); \
    X##ko = LOAD256(state[13]); \
    X##ku = LOAD256(state[14]); \
\
    X##ma = LOAD256(state[15]); \
    X##me = LOAD256(state[16]); \
    X##mi = LOAD256(state[17]); \
    X##mo = LOAD256(state[18]); \
    X##mu = LOAD256(state[19]); \
\
    X##sa = LOAD256(state[20]); \
    X##se = LOAD256(state[21]); \
    X##si = LOAD256(state[22]); \
    X##so = LOAD256(state[23]); \
    X##su = LOAD256(state[24]);
746
+
747
/*
 * Stores the lane variables X##ba .. X##su into the 25 vectors
 * state[0..24] with aligned stores; the inverse of copyFromState.
 */
#define copyToState(state, X) \
    STORE256(state[ 0], X##ba); \
    STORE256(state[ 1], X##be); \
    STORE256(state[ 2], X##bi); \
    STORE256(state[ 3], X##bo); \
    STORE256(state[ 4], X##bu); \
\
    STORE256(state[ 5], X##ga); \
    STORE256(state[ 6], X##ge); \
    STORE256(state[ 7], X##gi); \
    STORE256(state[ 8], X##go); \
    STORE256(state[ 9], X##gu); \
\
    STORE256(state[10], X##ka); \
    STORE256(state[11], X##ke); \
    STORE256(state[12], X##ki); \
    STORE256(state[13], X##ko); \
    STORE256(state[14], X##ku); \
\
    STORE256(state[15], X##ma); \
    STORE256(state[16], X##me); \
    STORE256(state[17], X##mi); \
    STORE256(state[18], X##mo); \
    STORE256(state[19], X##mu); \
\
    STORE256(state[20], X##sa); \
    STORE256(state[21], X##se); \
    STORE256(state[22], X##si); \
    STORE256(state[23], X##so); \
    STORE256(state[24], X##su);
773
+
774
/*
 * Assigns each of the 25 lane variables Y##ba .. Y##su to the variable of
 * the same suffix with prefix X.
 */
#define copyStateVariables(X, Y) \
    X##ba = Y##ba; \
    X##be = Y##be; \
    X##bi = Y##bi; \
    X##bo = Y##bo; \
    X##bu = Y##bu; \
\
    X##ga = Y##ga; \
    X##ge = Y##ge; \
    X##gi = Y##gi; \
    X##go = Y##go; \
    X##gu = Y##gu; \
\
    X##ka = Y##ka; \
    X##ke = Y##ke; \
    X##ki = Y##ki; \
    X##ko = Y##ko; \
    X##ku = Y##ku; \
\
    X##ma = Y##ma; \
    X##me = Y##me; \
    X##mi = Y##mi; \
    X##mo = Y##mo; \
    X##mu = Y##mu; \
\
    X##sa = Y##sa; \
    X##se = Y##se; \
    X##si = Y##si; \
    X##so = Y##so; \
    X##su = Y##su;
800
+
801
/*
 * Maps this implementation's configuration onto the generic names
 * FullUnrolling / Unrolling.
 * NOTE(review): these are presumably the names tested by the unrolling
 * macros file included right after this block - confirm there.
 */
#if defined(KeccakP1600times4_fullUnrolling)
    #define FullUnrolling
#else
    #define Unrolling KeccakP1600times4_unrolling
#endif
806
+ #include "KeccakP-1600-unrolling.macros"
807
+
808
+ void KeccakP1600times4_PermuteAll_24rounds(void *states)
809
+ {
810
+ V256 *statesAsLanes = (V256 *)states;
811
+ declareABCDE
812
+ #ifndef KeccakP1600times4_fullUnrolling
813
+ unsigned int i;
814
+ #endif
815
+
816
+ copyFromState(A, statesAsLanes)
817
+ rounds24
818
+ copyToState(statesAsLanes, A)
819
+ }
820
+
821
+ void KeccakP1600times4_PermuteAll_12rounds(void *states)
822
+ {
823
+ V256 *statesAsLanes = (V256 *)states;
824
+ declareABCDE
825
+ #ifndef KeccakP1600times4_fullUnrolling
826
+ unsigned int i;
827
+ #endif
828
+
829
+ copyFromState(A, statesAsLanes)
830
+ rounds12
831
+ copyToState(statesAsLanes, A)
832
+ }
833
+
834
+ void KeccakP1600times4_PermuteAll_6rounds(void *states)
835
+ {
836
+ V256 *statesAsLanes = (V256 *)states;
837
+ declareABCDE
838
+ #ifndef KeccakP1600times4_fullUnrolling
839
+ unsigned int i;
840
+ #endif
841
+
842
+ copyFromState(A, statesAsLanes)
843
+ rounds6
844
+ copyToState(statesAsLanes, A)
845
+ }
846
+
847
+ void KeccakP1600times4_PermuteAll_4rounds(void *states)
848
+ {
849
+ V256 *statesAsLanes = (V256 *)states;
850
+ declareABCDE
851
+ #ifndef KeccakP1600times4_fullUnrolling
852
+ unsigned int i;
853
+ #endif
854
+
855
+ copyFromState(A, statesAsLanes)
856
+ rounds4
857
+ copyToState(statesAsLanes, A)
858
+ }
859
+
860
+ size_t KeccakF1600times4_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, const unsigned char *data, size_t dataByteLen)
861
+ {
862
+ if (laneCount == 21) {
863
+ #if 0
864
+ const unsigned char *dataStart = data;
865
+ const UINT64 *curData0 = (const UINT64 *)data;
866
+ const UINT64 *curData1 = (const UINT64 *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes);
867
+ const UINT64 *curData2 = (const UINT64 *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes);
868
+ const UINT64 *curData3 = (const UINT64 *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes);
869
+
870
+ while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
871
+ V256 *stateAsLanes = (V256 *)states;
872
+ V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;
873
+ #define Xor_In( argIndex ) \
874
+ XOReq256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))
875
+ #define Xor_In4( argIndex ) \
876
+ lanes0 = LOAD256u( curData0[argIndex]),\
877
+ lanes1 = LOAD256u( curData1[argIndex]),\
878
+ lanes2 = LOAD256u( curData2[argIndex]),\
879
+ lanes3 = LOAD256u( curData3[argIndex]),\
880
+ INTLEAVE(),\
881
+ XOReq256( stateAsLanes[argIndex+0], lanes0 ),\
882
+ XOReq256( stateAsLanes[argIndex+1], lanes1 ),\
883
+ XOReq256( stateAsLanes[argIndex+2], lanes2 ),\
884
+ XOReq256( stateAsLanes[argIndex+3], lanes3 )
885
+ Xor_In4( 0 );
886
+ Xor_In4( 4 );
887
+ Xor_In4( 8 );
888
+ Xor_In4( 12 );
889
+ Xor_In4( 16 );
890
+ Xor_In( 20 );
891
+ #undef Xor_In
892
+ #undef Xor_In4
893
+ KeccakP1600times4_PermuteAll_24rounds(states);
894
+ curData0 += laneOffsetSerial;
895
+ curData1 += laneOffsetSerial;
896
+ curData2 += laneOffsetSerial;
897
+ curData3 += laneOffsetSerial;
898
+ dataByteLen -= laneOffsetSerial*8;
899
+ }
900
+ return (const unsigned char *)curData0 - dataStart;
901
+ #else
902
+ unsigned int i;
903
+ const unsigned char *dataStart = data;
904
+ const UINT64 *curData0 = (const UINT64 *)data;
905
+ const UINT64 *curData1 = (const UINT64 *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes);
906
+ const UINT64 *curData2 = (const UINT64 *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes);
907
+ const UINT64 *curData3 = (const UINT64 *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes);
908
+ V256 *statesAsLanes = (V256 *)states;
909
+ declareABCDE
910
+
911
+ copyFromState(A, statesAsLanes)
912
+ while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
913
+ #define XOR_In( Xxx, argIndex ) \
914
+ XOReq256(Xxx, LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))
915
+ XOR_In( Aba, 0 );
916
+ XOR_In( Abe, 1 );
917
+ XOR_In( Abi, 2 );
918
+ XOR_In( Abo, 3 );
919
+ XOR_In( Abu, 4 );
920
+ XOR_In( Aga, 5 );
921
+ XOR_In( Age, 6 );
922
+ XOR_In( Agi, 7 );
923
+ XOR_In( Ago, 8 );
924
+ XOR_In( Agu, 9 );
925
+ XOR_In( Aka, 10 );
926
+ XOR_In( Ake, 11 );
927
+ XOR_In( Aki, 12 );
928
+ XOR_In( Ako, 13 );
929
+ XOR_In( Aku, 14 );
930
+ XOR_In( Ama, 15 );
931
+ XOR_In( Ame, 16 );
932
+ XOR_In( Ami, 17 );
933
+ XOR_In( Amo, 18 );
934
+ XOR_In( Amu, 19 );
935
+ XOR_In( Asa, 20 );
936
+ #undef XOR_In
937
+ rounds24
938
+ curData0 += laneOffsetSerial;
939
+ curData1 += laneOffsetSerial;
940
+ curData2 += laneOffsetSerial;
941
+ curData3 += laneOffsetSerial;
942
+ dataByteLen -= laneOffsetSerial*8;
943
+ }
944
+ copyToState(statesAsLanes, A)
945
+ return (const unsigned char *)curData0 - dataStart;
946
+ #endif
947
+ }
948
+ else {
949
+ unsigned int i;
950
+ const unsigned char *dataStart = data;
951
+
952
+ while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
953
+ KeccakP1600times4_AddLanesAll(states, data, laneCount, laneOffsetParallel);
954
+ KeccakP1600times4_PermuteAll_24rounds(states);
955
+ data += laneOffsetSerial*8;
956
+ dataByteLen -= laneOffsetSerial*8;
957
+ }
958
+ return data - dataStart;
959
+ }
960
+ }
961
+
962
+ size_t KeccakP1600times4_12rounds_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, const unsigned char *data, size_t dataByteLen)
963
+ {
964
+ if (laneCount == 21) {
965
+ #if 0
966
+ const unsigned char *dataStart = data;
967
+ const UINT64 *curData0 = (const UINT64 *)data;
968
+ const UINT64 *curData1 = (const UINT64 *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes);
969
+ const UINT64 *curData2 = (const UINT64 *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes);
970
+ const UINT64 *curData3 = (const UINT64 *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes);
971
+
972
+ while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
973
+ V256 *stateAsLanes = states;
974
+ V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;
975
+ #define Xor_In( argIndex ) \
976
+ XOReq256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))
977
+ #define Xor_In4( argIndex ) \
978
+ lanes0 = LOAD256u( curData0[argIndex]),\
979
+ lanes1 = LOAD256u( curData1[argIndex]),\
980
+ lanes2 = LOAD256u( curData2[argIndex]),\
981
+ lanes3 = LOAD256u( curData3[argIndex]),\
982
+ INTLEAVE(),\
983
+ XOReq256( stateAsLanes[argIndex+0], lanes0 ),\
984
+ XOReq256( stateAsLanes[argIndex+1], lanes1 ),\
985
+ XOReq256( stateAsLanes[argIndex+2], lanes2 ),\
986
+ XOReq256( stateAsLanes[argIndex+3], lanes3 )
987
+ Xor_In4( 0 );
988
+ Xor_In4( 4 );
989
+ Xor_In4( 8 );
990
+ Xor_In4( 12 );
991
+ Xor_In4( 16 );
992
+ Xor_In( 20 );
993
+ #undef Xor_In
994
+ #undef Xor_In4
995
+ KeccakP1600times4_PermuteAll_12rounds(states);
996
+ curData0 += laneOffsetSerial;
997
+ curData1 += laneOffsetSerial;
998
+ curData2 += laneOffsetSerial;
999
+ curData3 += laneOffsetSerial;
1000
+ dataByteLen -= laneOffsetSerial*8;
1001
+ }
1002
+ return (const unsigned char *)curData0 - dataStart;
1003
+ #else
1004
+ unsigned int i;
1005
+ const unsigned char *dataStart = data;
1006
+ const UINT64 *curData0 = (const UINT64 *)data;
1007
+ const UINT64 *curData1 = (const UINT64 *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes);
1008
+ const UINT64 *curData2 = (const UINT64 *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes);
1009
+ const UINT64 *curData3 = (const UINT64 *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes);
1010
+ V256 *statesAsLanes = states;
1011
+ declareABCDE
1012
+
1013
+ copyFromState(A, statesAsLanes)
1014
+ while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
1015
+ #define XOR_In( Xxx, argIndex ) \
1016
+ XOReq256(Xxx, LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))
1017
+ XOR_In( Aba, 0 );
1018
+ XOR_In( Abe, 1 );
1019
+ XOR_In( Abi, 2 );
1020
+ XOR_In( Abo, 3 );
1021
+ XOR_In( Abu, 4 );
1022
+ XOR_In( Aga, 5 );
1023
+ XOR_In( Age, 6 );
1024
+ XOR_In( Agi, 7 );
1025
+ XOR_In( Ago, 8 );
1026
+ XOR_In( Agu, 9 );
1027
+ XOR_In( Aka, 10 );
1028
+ XOR_In( Ake, 11 );
1029
+ XOR_In( Aki, 12 );
1030
+ XOR_In( Ako, 13 );
1031
+ XOR_In( Aku, 14 );
1032
+ XOR_In( Ama, 15 );
1033
+ XOR_In( Ame, 16 );
1034
+ XOR_In( Ami, 17 );
1035
+ XOR_In( Amo, 18 );
1036
+ XOR_In( Amu, 19 );
1037
+ XOR_In( Asa, 20 );
1038
+ #undef XOR_In
1039
+ rounds12
1040
+ curData0 += laneOffsetSerial;
1041
+ curData1 += laneOffsetSerial;
1042
+ curData2 += laneOffsetSerial;
1043
+ curData3 += laneOffsetSerial;
1044
+ dataByteLen -= laneOffsetSerial*8;
1045
+ }
1046
+ copyToState(statesAsLanes, A)
1047
+ return (const unsigned char *)curData0 - dataStart;
1048
+ #endif
1049
+ }
1050
+ else {
1051
+ unsigned int i;
1052
+ const unsigned char *dataStart = data;
1053
+
1054
+ while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
1055
+ KeccakP1600times4_AddLanesAll(states, data, laneCount, laneOffsetParallel);
1056
+ KeccakP1600times4_PermuteAll_12rounds(states);
1057
+ data += laneOffsetSerial*8;
1058
+ dataByteLen -= laneOffsetSerial*8;
1059
+ }
1060
+ return data - dataStart;
1061
+ }
1062
+ }
1063
+
1064
+ /* ------------------------------------------------------------------------- */
1065
+
1066
/* UNINTLEAVEa(lanes0..lanes3): rearranges the four V256 arguments in place.
 * The expansion is one comma expression and writes to lanesL01, lanesH01,
 * lanesL23 and lanesH23, which are NOT parameters: the call site must declare
 * them as V256 scratch variables.
 * NOTE(review): UNPACKL/UNPACKH/PERM128 are defined outside this chunk; the
 * pattern looks like a 4x4 transpose of 64-bit elements (interleaved layout
 * to one-state-per-vector layout) -- confirm against those definitions. */
+ #define UNINTLEAVEa(lanes0, lanes1, lanes2, lanes3) \
1067
+ lanesL01 = UNPACKL( lanes0, lanes1 ), \
1068
+ lanesH01 = UNPACKH( lanes0, lanes1 ), \
1069
+ lanesL23 = UNPACKL( lanes2, lanes3 ), \
1070
+ lanesH23 = UNPACKH( lanes2, lanes3 ), \
1071
+ lanes0 = PERM128( lanesL01, lanesL23, 0x20 ), \
1072
+ lanes2 = PERM128( lanesL01, lanesL23, 0x31 ), \
1073
+ lanes1 = PERM128( lanesH01, lanesH23, 0x20 ), \
1074
+ lanes3 = PERM128( lanesH01, lanesH23, 0x31 )
1075
+
1076
/* INTLEAVEa(lanes0..lanes3): counterpart of UNINTLEAVEa; rearranges the four
 * V256 arguments in place through the same call-site scratch variables
 * (lanesL01, lanesH01, lanesL23, lanesH23).
 * NOTE(review): presumably the inverse rearrangement (one-state-per-vector to
 * interleaved layout); PERM128/SHUFFLE64 are defined outside this chunk. */
+ #define INTLEAVEa(lanes0, lanes1, lanes2, lanes3) \
1077
+ lanesL01 = PERM128( lanes0, lanes2, 0x20 ), \
1078
+ lanesH01 = PERM128( lanes1, lanes3, 0x20 ), \
1079
+ lanesL23 = PERM128( lanes0, lanes2, 0x31 ), \
1080
+ lanesH23 = PERM128( lanes1, lanes3, 0x31 ), \
1081
+ lanes0 = SHUFFLE64( lanesL01, lanesH01, 0x00 ), \
1082
+ lanes1 = SHUFFLE64( lanesL01, lanesH01, 0x0F ), \
1083
+ lanes2 = SHUFFLE64( lanesL23, lanesH23, 0x00 ), \
1084
+ lanes3 = SHUFFLE64( lanesL23, lanesH23, 0x0F )
1085
+
1086
+
1087
/* LoadXOReq256(lanes, inp, argIndex): XORs into `lanes` the vector built by
 * LOAD4_64 from element argIndex of four consecutive 25-element groups of
 * inp (offsets 3*25, 2*25, 1*25 and 0*25, in that argument order). */
+ #define LoadXOReq256( lanes, inp, argIndex) XOReq256( lanes, LOAD4_64(inp[3*25+argIndex], inp[2*25+argIndex], inp[1*25+argIndex], inp[0*25+argIndex]) )
1088
+
1089
+ /* ------------------------------------------------------------------------- */
1090
+
1091
/* AddOverWr4(lanes0..lanes3, key, inp, argIndex): overwrites the four V256
 * arguments with input words and XORs key[argIndex+0..3] into them, one key
 * word per vector via CONST256_64. Two variants, selected by UseGatherScatter.
 * NOTE(review): CONST256_64 is defined outside this chunk; it presumably
 * broadcasts one 64-bit value to all four elements -- confirm. */
+ #if defined(UseGatherScatter)
1092
+
1093
/* Gather variant: each vector is fetched with _mm256_i32gather_epi64 from
 * &inp[argIndex+k] using the call-site index vector `gather` (not a macro
 * parameter) with scale 1, i.e. the indices are byte offsets. */
+ #define AddOverWr4( lanes0, lanes1, lanes2, lanes3, key, inp, argIndex ) \
1094
+ lanes0 = _mm256_i32gather_epi64((const long long int *)&inp[argIndex+0], gather, 1), \
1095
+ lanes1 = _mm256_i32gather_epi64((const long long int *)&inp[argIndex+1], gather, 1), \
1096
+ lanes2 = _mm256_i32gather_epi64((const long long int *)&inp[argIndex+2], gather, 1), \
1097
+ lanes3 = _mm256_i32gather_epi64((const long long int *)&inp[argIndex+3], gather, 1), \
1098
+ XOReq256( lanes0, CONST256_64( key[argIndex+0])), \
1099
+ XOReq256( lanes1, CONST256_64( key[argIndex+1])), \
1100
+ XOReq256( lanes2, CONST256_64( key[argIndex+2])), \
1101
+ XOReq256( lanes3, CONST256_64( key[argIndex+3]))
1102
+
1103
+ #else
1104
+
1105
/* Load-and-shuffle variant: one LOAD256u per 25-element group starting at
 * inp[argIndex + k*25], followed by INTLEAVEa, which needs the call-site
 * scratch variables lanesL01, lanesH01, lanesL23 and lanesH23. */
+ #define AddOverWr4( lanes0, lanes1, lanes2, lanes3, key, inp, argIndex ) \
1106
+ lanes0 = LOAD256u( inp[argIndex+0*25]), \
1107
+ lanes1 = LOAD256u( inp[argIndex+1*25]), \
1108
+ lanes2 = LOAD256u( inp[argIndex+2*25]), \
1109
+ lanes3 = LOAD256u( inp[argIndex+3*25]), \
1110
+ INTLEAVEa(lanes0, lanes1, lanes2, lanes3), \
1111
+ XOReq256( lanes0, CONST256_64( key[argIndex+0])), \
1112
+ XOReq256( lanes1, CONST256_64( key[argIndex+1])), \
1113
+ XOReq256( lanes2, CONST256_64( key[argIndex+2])), \
1114
+ XOReq256( lanes3, CONST256_64( key[argIndex+3]))
1115
+
1116
+ #endif
1117
+
1118
/* ExtrAccu(lanes, p, argIndex): XORs the four 64-bit elements of `lanes`
 * together and accumulates the result into p[argIndex]. */
+ #define ExtrAccu( lanes, p, argIndex ) p[argIndex] ^= _mm256_extract_epi64(lanes, 0) ^ _mm256_extract_epi64(lanes, 1) \
1119
+ ^ _mm256_extract_epi64(lanes, 2) ^ _mm256_extract_epi64(lanes, 3)
1120
+
1121
/* ExtrAccu4(lanes0..lanes3, p, argIndex): applies UNINTLEAVEa, XORs the four
 * vectors together with the vector loaded from p[argIndex], and stores the
 * result back at p[argIndex]. All four vector arguments are clobbered
 * (lanes1 is reused to hold the loaded accumulator), and the call site must
 * provide the scratch variables UNINTLEAVEa needs.
 * NOTE(review): LOAD256/STORE256 (as opposed to the ...u forms used
 * elsewhere) presumably require 32-byte alignment of &p[argIndex] --
 * confirm against their definitions. */
+ #define ExtrAccu4( lanes0, lanes1, lanes2, lanes3, p, argIndex ) \
1122
+ UNINTLEAVEa(lanes0, lanes1, lanes2, lanes3), \
1123
+ XOReq256( lanes0, lanes1 ), \
1124
+ XOReq256( lanes2, lanes3 ), \
1125
+ lanes1 = LOAD256( p[argIndex]), \
1126
+ XOReq256( lanes0, lanes2 ), \
1127
+ XOReq256( lanes0, lanes1 ), \
1128
+ STORE256( p[argIndex], lanes0 )
1129
+
1130
/* Kravatte_Rollc(): takes no arguments and works entirely on call-site
 * variables. It reads x0x1x2x3 and x1x2x3x4, assigns Asa, Ase, Asi, Aso and
 * Asu, and leaves updated values in x1x2x3x4 and x0x1x2x3 (the latter becomes
 * the new Asu) for the next invocation. The expansion is a single comma
 * expression, so the assignment order below matters: Ase must capture the old
 * x1x2x3x4 before it is overwritten.
 * NOTE(review): ROL64in256 is defined outside this chunk; it presumably
 * stores the 7-bit left rotation of its second argument into its first. By
 * its name the macro advances the Kravatte rolling key by four steps at once
 * -- confirm against the specification. */
+ #define Kravatte_Rollc() \
1131
+ Asa = x0x1x2x3, \
1132
+ Ase = x1x2x3x4, \
1133
+ ROL64in256(x1x2x3x4, x0x1x2x3, 7), \
1134
+ XOReq256(x1x2x3x4, Ase), \
1135
+ XOReq256(x1x2x3x4, _mm256_srli_epi64(Ase, 3)), \
1136
+ Asi = _mm256_blend_epi32(_mm256_permute4x64_epi64(Ase, 0x39), _mm256_permute4x64_epi64(x1x2x3x4, 0x39), 0xC0), \
1137
+ Aso = PERM128(Ase, x1x2x3x4, 0x21), \
1138
+ Asu = _mm256_blend_epi32(_mm256_permute4x64_epi64(Ase, 0xFF), _mm256_permute4x64_epi64(x1x2x3x4, 0x90), 0xFC), \
1139
+ x0x1x2x3 = Asu
1140
+
1141
+ size_t KeccakP1600times4_KravatteCompress(uint64_t *xAccu, uint64_t *kRoll, const unsigned char *input, size_t inputByteLen)
1142
+ {
1143
+ uint64_t *in64 = (uint64_t *)input;
1144
+ size_t nBlocks = inputByteLen / (4 * 200);
1145
+ declareABCDE
1146
+ #if !defined(KeccakP1600times4_fullUnrolling)
1147
+ unsigned int i;
1148
+ #endif
1149
+ V256 lanesL01, lanesL23, lanesH01, lanesH23;
1150
+ V256 x0x1x2x3, x1x2x3x4;
1151
+ #if defined(UseGatherScatter)
1152
+ V128 gather = _mm_setr_epi32(0*25*8, 1*25*8, 2*25*8, 3*25*8);
1153
+ #endif
1154
+
1155
+ x0x1x2x3 = LOAD256(kRoll[20]);
1156
+ x1x2x3x4 = LOAD256u(kRoll[21]);
1157
+ do {
1158
+ AddOverWr4( Aba, Abe, Abi, Abo, kRoll, in64, 0 );
1159
+ AddOverWr4( Abu, Aga, Age, Agi, kRoll, in64, 4 );
1160
+ AddOverWr4( Ago, Agu, Aka, Ake, kRoll, in64, 8 );
1161
+ AddOverWr4( Aki, Ako, Aku, Ama, kRoll, in64, 12 );
1162
+ AddOverWr4( Ame, Ami, Amo, Amu, kRoll, in64, 16 );
1163
+ Kravatte_Rollc();
1164
+ LoadXOReq256(Asa, in64, 20);
1165
+ LoadXOReq256(Ase, in64, 21);
1166
+ LoadXOReq256(Asi, in64, 22);
1167
+ LoadXOReq256(Aso, in64, 23);
1168
+ LoadXOReq256(Asu, in64, 24);
1169
+ rounds6
1170
+ ExtrAccu4(Aba, Abe, Abi, Abo, xAccu, 0 );
1171
+ ExtrAccu4(Abu, Aga, Age, Agi, xAccu, 4 );
1172
+ ExtrAccu4(Ago, Agu, Aka, Ake, xAccu, 8 );
1173
+ ExtrAccu4(Aki, Ako, Aku, Ama, xAccu, 12 );
1174
+ ExtrAccu4(Ame, Ami, Amo, Amu, xAccu, 16 );
1175
+ ExtrAccu4(Asa, Ase, Asi, Aso, xAccu, 20 );
1176
+ ExtrAccu( Asu, xAccu, 24 );
1177
+ in64 += 4 * 25;
1178
+ }
1179
+ while(--nBlocks != 0);
1180
+ STORE256(kRoll[20], x0x1x2x3);
1181
+ kRoll[24] = _mm256_extract_epi64(x1x2x3x4, 3);
1182
+
1183
+ return (size_t)in64 - (size_t)input;
1184
+ }
1185
+
1186
/* The helper macros used by KeccakP1600times4_KravatteCompress are removed
 * here so that they cannot leak into the code that follows. */
+ #undef LoadXOReq256
1187
+ #undef AddOverWr4
1188
+ #undef ExtrAccu
1189
+ #undef ExtrAccu4
1190
+
1191
+ /* ------------------------------------------------------------------------- */
1192
+
1193
/* ExtrAddKey(lanes, p, argIndex): XORs kRoll[argIndex] (via CONST256_64) into
 * `lanes`, then writes its four 64-bit elements to p[argIndex + k*25] for
 * k = 0..3, i.e. to the same position in four consecutive 25-element groups.
 * kRoll is NOT a parameter: it is taken from the call site. */
+ #define ExtrAddKey( lanes, p, argIndex ) \
1194
+ XOReq256(lanes, CONST256_64(kRoll[argIndex])), \
1195
+ p[argIndex+0*25] = _mm256_extract_epi64(lanes, 0), \
1196
+ p[argIndex+1*25] = _mm256_extract_epi64(lanes, 1), \
1197
+ p[argIndex+2*25] = _mm256_extract_epi64(lanes, 2), \
1198
+ p[argIndex+3*25] = _mm256_extract_epi64(lanes, 3)
1199
+
1200
/* The scatter-based variant below is compiled out unconditionally (#if 0);
 * the UseGatherScatter test is commented out on the same line, so the
 * #else variant is always the one in effect. */
+ #if 0//defined(UseGatherScatter)
1201
+
1202
/* Disabled variant: would store each vector with _mm256_i32scatter_epi64
 * through a call-site index vector `scatter` with scale 1 (byte offsets). */
+ #define ExtrAddKey4( lanes0, lanes1, lanes2, lanes3, p, argIndex ) \
1203
+ XOReq256(lanes0, CONST256_64(kRoll[argIndex+0])), \
1204
+ XOReq256(lanes1, CONST256_64(kRoll[argIndex+1])), \
1205
+ XOReq256(lanes2, CONST256_64(kRoll[argIndex+2])), \
1206
+ XOReq256(lanes3, CONST256_64(kRoll[argIndex+3])), \
1207
+ _mm256_i32scatter_epi64((long long int *)&p[argIndex+0], scatter, lanes0, 1), \
1208
+ _mm256_i32scatter_epi64((long long int *)&p[argIndex+1], scatter, lanes1, 1), \
1209
+ _mm256_i32scatter_epi64((long long int *)&p[argIndex+2], scatter, lanes2, 1), \
1210
+ _mm256_i32scatter_epi64((long long int *)&p[argIndex+3], scatter, lanes3, 1)
1211
+
1212
+ #else
1213
+
1214
/* Active variant: XORs kRoll[argIndex+0..3] into the four vectors, applies
 * UNINTLEAVEa (which needs the call-site scratch variables lanesL01,
 * lanesH01, lanesL23, lanesH23) and stores one vector per 25-element group
 * at p[argIndex + k*25] with STORE256u. The vector arguments are clobbered. */
+ #define ExtrAddKey4( lanes0, lanes1, lanes2, lanes3, p, argIndex ) \
1215
+ XOReq256(lanes0, CONST256_64(kRoll[argIndex+0])), \
1216
+ XOReq256(lanes1, CONST256_64(kRoll[argIndex+1])), \
1217
+ XOReq256(lanes2, CONST256_64(kRoll[argIndex+2])), \
1218
+ XOReq256(lanes3, CONST256_64(kRoll[argIndex+3])), \
1219
+ UNINTLEAVEa(lanes0, lanes1, lanes2, lanes3), \
1220
+ STORE256u( p[argIndex+0*25], lanes0 ), \
1221
+ STORE256u( p[argIndex+1*25], lanes1 ), \
1222
+ STORE256u( p[argIndex+2*25], lanes2 ), \
1223
+ STORE256u( p[argIndex+3*25], lanes3 )
1224
+
1225
+ #endif
1226
+
1227
+ size_t KeccakP1600times4_KravatteExpand(uint64_t *yAccu, const uint64_t *kRoll, unsigned char *output, size_t outputByteLen)
1228
+ {
1229
+ uint64_t *out64 = (uint64_t *)output;
1230
+ size_t nBlocks = outputByteLen / (4 * 200);
1231
+ declareABCDE
1232
+ #if !defined(KeccakP1600times4_fullUnrolling)
1233
+ unsigned int i;
1234
+ #endif
1235
+ V256 lanesL01, lanesL23, lanesH01, lanesH23;
1236
+ #if defined(UseGatherScatter)
1237
+ V128 scatter = _mm_setr_epi32(0*25*8, 1*25*8, 2*25*8, 3*25*8);
1238
+ #endif
1239
+
1240
+ do {
1241
+ Aba = CONST256_64(yAccu[0]);
1242
+ Abe = CONST256_64(yAccu[1]);
1243
+ Abi = CONST256_64(yAccu[2]);
1244
+ Abo = CONST256_64(yAccu[3]);
1245
+ Abu = CONST256_64(yAccu[4]);
1246
+
1247
+ Aga = CONST256_64(yAccu[5]);
1248
+ Age = CONST256_64(yAccu[6]);
1249
+ Agi = CONST256_64(yAccu[7]);
1250
+ Ago = CONST256_64(yAccu[8]);
1251
+ Agu = CONST256_64(yAccu[9]);
1252
+
1253
+ Aka = CONST256_64(yAccu[10]);
1254
+ Ake = CONST256_64(yAccu[11]);
1255
+ Aki = CONST256_64(yAccu[12]);
1256
+ Ako = CONST256_64(yAccu[13]);
1257
+ Aku = CONST256_64(yAccu[14]);
1258
+
1259
+ Ama = LOAD256u(yAccu[15]);
1260
+ Ame = LOAD256 (yAccu[16]);
1261
+ Ami = LOAD256u(yAccu[17]);
1262
+ Amo = LOAD256u(yAccu[18]);
1263
+ Amu = LOAD256u(yAccu[19]);
1264
+
1265
+ ROL64in256(lanesL01, Ama, 7);
1266
+ ROL64in256(lanesH01, Ame, 18);
1267
+ lanesL01 = XOR256(lanesL01, lanesH01);
1268
+ lanesH01 = _mm256_and_si256(Ami, _mm256_srli_epi64(Ame, 1));
1269
+ lanesL01 = XOR256(lanesL01, lanesH01);
1270
+
1271
+ Asa = LOAD256 (yAccu[20]);
1272
+ Ase = LOAD256u(yAccu[21]);
1273
+ Asi = _mm256_insert_epi64(_mm256_permute4x64_epi64(Ase, 0x39), _mm256_extract_epi64(lanesL01, 0), 3);
1274
+ Aso = _mm256_permute2x128_si256(Ase, lanesL01, 0x21);
1275
+ Asu = _mm256_insert_epi64(_mm256_permute4x64_epi64(lanesL01, 0x93), _mm256_extract_epi64(Ase, 3), 0);
1276
+
1277
+ STORE256u(yAccu[15], Amu);
1278
+ yAccu[19] = _mm256_extract_epi64(Aso, 0);
1279
+ yAccu[20] = _mm256_extract_epi64(Aso, 1);
1280
+ STORE256u(yAccu[21], lanesL01);
1281
+
1282
+ rounds6
1283
+ ExtrAddKey4(Aba, Abe, Abi, Abo, out64, 0 );
1284
+ ExtrAddKey4(Abu, Aga, Age, Agi, out64, 4 );
1285
+ ExtrAddKey4(Ago, Agu, Aka, Ake, out64, 8 );
1286
+ ExtrAddKey4(Aki, Ako, Aku, Ama, out64, 12 );
1287
+ ExtrAddKey4(Ame, Ami, Amo, Amu, out64, 16 );
1288
+ ExtrAddKey4(Asa, Ase, Asi, Aso, out64, 20 );
1289
+ ExtrAddKey( Asu, out64, 24 );
1290
+ out64 += 4 * 25;
1291
+ }
1292
+ while(--nBlocks != 0);
1293
+
1294
+ return (size_t)out64 - (size_t)output;
1295
+ }
1296
+
1297
/* Remove the helper macros private to the Kravatte routines.
 * OverWr4 and Kravatte_Roll are not defined anywhere in the visible part of
 * this file (undefining an unknown name is a no-op), so they are kept only
 * for safety; the rolling macro actually defined is Kravatte_Rollc, which
 * was previously left defined. */
#undef OverWr4
#undef ExtrAddKey
#undef ExtrAddKey4

#undef Kravatte_Roll
#undef Kravatte_Rollc
#undef UNINTLEAVEa
#undef INTLEAVEa