digest-kangarootwelve 0.0.2 → 0.3.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (307) hide show
  1. checksums.yaml +5 -5
  2. data/README.md +71 -37
  3. data/Rakefile +7 -9
  4. data/digest-kangarootwelve.gemspec +323 -14
  5. data/ext/digest/kangarootwelve/ext.c +228 -177
  6. data/ext/digest/kangarootwelve/extconf.rb +15 -1
  7. data/ext/digest/kangarootwelve/keccak/armv6m/KangarooTwelve.link.c +1 -0
  8. data/ext/digest/kangarootwelve/keccak/armv6m/KeccakDuplexWidth1600.link.c +1 -0
  9. data/ext/digest/kangarootwelve/keccak/armv6m/KeccakP-1600-SnP.h +36 -0
  10. data/ext/digest/kangarootwelve/{KeccakP-1600-times2-SnP.h → keccak/armv6m/KeccakP-1600-times2-SnP.h} +10 -10
  11. data/ext/digest/kangarootwelve/{KeccakP-1600-times2-on1.c → keccak/armv6m/KeccakP-1600-times2-on1.c} +13 -7
  12. data/ext/digest/kangarootwelve/{KeccakP-1600-times4-SnP.h → keccak/armv6m/KeccakP-1600-times4-SnP.h} +10 -10
  13. data/ext/digest/kangarootwelve/{KeccakP-1600-times4-on1.c → keccak/armv6m/KeccakP-1600-times4-on1.c} +13 -7
  14. data/ext/digest/kangarootwelve/{KeccakP-1600-times8-SnP.h → keccak/armv6m/KeccakP-1600-times8-SnP.h} +10 -10
  15. data/ext/digest/kangarootwelve/{KeccakP-1600-times8-on1.c → keccak/armv6m/KeccakP-1600-times8-on1.c} +13 -7
  16. data/ext/digest/kangarootwelve/keccak/armv6m/KeccakP-1600-u2-32bi-armv6m-le-gcc.s +1334 -0
  17. data/ext/digest/kangarootwelve/keccak/armv6m/KeccakSpongeWidth1600.link.c +1 -0
  18. data/ext/digest/kangarootwelve/{PlSnP-Fallback.inc → keccak/armv6m/PlSnP-Fallback.inc} +11 -7
  19. data/ext/digest/kangarootwelve/keccak/armv6m/ext.link.c +1 -0
  20. data/ext/digest/kangarootwelve/keccak/armv7a/KangarooTwelve.link.c +1 -0
  21. data/ext/digest/kangarootwelve/keccak/armv7a/KeccakDuplexWidth1600.link.c +1 -0
  22. data/ext/digest/kangarootwelve/keccak/armv7a/KeccakP-1600-SnP.h +37 -0
  23. data/ext/digest/kangarootwelve/keccak/armv7a/KeccakP-1600-armv7a-le-neon-gcc.s +826 -0
  24. data/ext/digest/kangarootwelve/keccak/armv7a/KeccakP-1600-inplace-pl2-armv7a-neon-le-gcc.s +1245 -0
  25. data/ext/digest/kangarootwelve/keccak/armv7a/KeccakP-1600-times2-SnP.h +38 -0
  26. data/ext/digest/kangarootwelve/keccak/armv7a/KeccakP-1600-times4-SnP.h +45 -0
  27. data/ext/digest/kangarootwelve/keccak/armv7a/KeccakP-1600-times4-on2.c +38 -0
  28. data/ext/digest/kangarootwelve/keccak/armv7a/KeccakP-1600-times8-SnP.h +45 -0
  29. data/ext/digest/kangarootwelve/keccak/armv7a/KeccakP-1600-times8-on2.c +38 -0
  30. data/ext/digest/kangarootwelve/keccak/armv7a/KeccakSpongeWidth1600.link.c +1 -0
  31. data/ext/digest/kangarootwelve/keccak/armv7a/PlSnP-Fallback.inc +287 -0
  32. data/ext/digest/kangarootwelve/keccak/armv7a/ext.link.c +1 -0
  33. data/ext/digest/kangarootwelve/keccak/armv7m/KangarooTwelve.link.c +1 -0
  34. data/ext/digest/kangarootwelve/keccak/armv7m/KeccakDuplexWidth1600.link.c +1 -0
  35. data/ext/digest/kangarootwelve/keccak/armv7m/KeccakP-1600-SnP.h +36 -0
  36. data/ext/digest/kangarootwelve/keccak/armv7m/KeccakP-1600-inplace-32bi-armv7m-le-gcc.s +1170 -0
  37. data/ext/digest/kangarootwelve/keccak/armv7m/KeccakP-1600-times2-SnP.h +45 -0
  38. data/ext/digest/kangarootwelve/keccak/armv7m/KeccakP-1600-times2-on1.c +37 -0
  39. data/ext/digest/kangarootwelve/keccak/armv7m/KeccakP-1600-times4-SnP.h +45 -0
  40. data/ext/digest/kangarootwelve/keccak/armv7m/KeccakP-1600-times4-on1.c +37 -0
  41. data/ext/digest/kangarootwelve/keccak/armv7m/KeccakP-1600-times8-SnP.h +45 -0
  42. data/ext/digest/kangarootwelve/keccak/armv7m/KeccakP-1600-times8-on1.c +37 -0
  43. data/ext/digest/kangarootwelve/keccak/armv7m/KeccakSpongeWidth1600.link.c +1 -0
  44. data/ext/digest/kangarootwelve/keccak/armv7m/PlSnP-Fallback.inc +287 -0
  45. data/ext/digest/kangarootwelve/keccak/armv7m/ext.link.c +1 -0
  46. data/ext/digest/kangarootwelve/keccak/armv8a/KangarooTwelve.link.c +1 -0
  47. data/ext/digest/kangarootwelve/keccak/armv8a/KeccakDuplexWidth1600.link.c +1 -0
  48. data/ext/digest/kangarootwelve/keccak/armv8a/KeccakP-1600-SnP.h +28 -0
  49. data/ext/digest/kangarootwelve/keccak/armv8a/KeccakP-1600-armv8a-neon.s +537 -0
  50. data/ext/digest/kangarootwelve/keccak/armv8a/KeccakP-1600-times2-SnP.h +45 -0
  51. data/ext/digest/kangarootwelve/keccak/armv8a/KeccakP-1600-times2-on1.c +37 -0
  52. data/ext/digest/kangarootwelve/keccak/armv8a/KeccakP-1600-times4-SnP.h +45 -0
  53. data/ext/digest/kangarootwelve/keccak/armv8a/KeccakP-1600-times4-on1.c +37 -0
  54. data/ext/digest/kangarootwelve/keccak/armv8a/KeccakP-1600-times8-SnP.h +45 -0
  55. data/ext/digest/kangarootwelve/keccak/armv8a/KeccakP-1600-times8-on1.c +37 -0
  56. data/ext/digest/kangarootwelve/keccak/armv8a/KeccakSpongeWidth1600.link.c +1 -0
  57. data/ext/digest/kangarootwelve/keccak/armv8a/PlSnP-Fallback.inc +287 -0
  58. data/ext/digest/kangarootwelve/keccak/armv8a/ext.link.c +1 -0
  59. data/ext/digest/kangarootwelve/keccak/asmx86-64/KangarooTwelve.link.c +1 -0
  60. data/ext/digest/kangarootwelve/keccak/asmx86-64/KeccakDuplexWidth1600.link.c +1 -0
  61. data/ext/digest/kangarootwelve/keccak/asmx86-64/KeccakP-1600-SnP.h +37 -0
  62. data/ext/digest/kangarootwelve/keccak/asmx86-64/KeccakP-1600-times2-SnP.h +45 -0
  63. data/ext/digest/kangarootwelve/keccak/asmx86-64/KeccakP-1600-times2-on1.c +37 -0
  64. data/ext/digest/kangarootwelve/keccak/asmx86-64/KeccakP-1600-times4-SnP.h +45 -0
  65. data/ext/digest/kangarootwelve/keccak/asmx86-64/KeccakP-1600-times4-on1.c +37 -0
  66. data/ext/digest/kangarootwelve/keccak/asmx86-64/KeccakP-1600-times8-SnP.h +45 -0
  67. data/ext/digest/kangarootwelve/keccak/asmx86-64/KeccakP-1600-times8-on1.c +37 -0
  68. data/ext/digest/kangarootwelve/keccak/asmx86-64/KeccakP-1600-x86-64-gas.s +1190 -0
  69. data/ext/digest/kangarootwelve/keccak/asmx86-64/KeccakSpongeWidth1600.link.c +1 -0
  70. data/ext/digest/kangarootwelve/keccak/asmx86-64/PlSnP-Fallback.inc +287 -0
  71. data/ext/digest/kangarootwelve/keccak/asmx86-64/ext.link.c +1 -0
  72. data/ext/digest/kangarootwelve/keccak/asmx86-64shld/KangarooTwelve.link.c +1 -0
  73. data/ext/digest/kangarootwelve/keccak/asmx86-64shld/KeccakDuplexWidth1600.link.c +1 -0
  74. data/ext/digest/kangarootwelve/keccak/asmx86-64shld/KeccakP-1600-SnP.h +37 -0
  75. data/ext/digest/kangarootwelve/keccak/asmx86-64shld/KeccakP-1600-times2-SnP.h +45 -0
  76. data/ext/digest/kangarootwelve/keccak/asmx86-64shld/KeccakP-1600-times2-on1.c +37 -0
  77. data/ext/digest/kangarootwelve/keccak/asmx86-64shld/KeccakP-1600-times4-SnP.h +45 -0
  78. data/ext/digest/kangarootwelve/keccak/asmx86-64shld/KeccakP-1600-times4-on1.c +37 -0
  79. data/ext/digest/kangarootwelve/keccak/asmx86-64shld/KeccakP-1600-times8-SnP.h +45 -0
  80. data/ext/digest/kangarootwelve/keccak/asmx86-64shld/KeccakP-1600-times8-on1.c +37 -0
  81. data/ext/digest/kangarootwelve/keccak/asmx86-64shld/KeccakP-1600-x86-64-shld-gas.s +1190 -0
  82. data/ext/digest/kangarootwelve/keccak/asmx86-64shld/KeccakSpongeWidth1600.link.c +1 -0
  83. data/ext/digest/kangarootwelve/keccak/asmx86-64shld/PlSnP-Fallback.inc +287 -0
  84. data/ext/digest/kangarootwelve/keccak/asmx86-64shld/ext.link.c +1 -0
  85. data/ext/digest/kangarootwelve/keccak/avr8/KangarooTwelve.link.c +1 -0
  86. data/ext/digest/kangarootwelve/keccak/avr8/KeccakDuplexWidth1600.link.c +1 -0
  87. data/ext/digest/kangarootwelve/keccak/avr8/KeccakP-1600-SnP.h +37 -0
  88. data/ext/digest/kangarootwelve/keccak/avr8/KeccakP-1600-avr8-fast.s +1116 -0
  89. data/ext/digest/kangarootwelve/keccak/avr8/KeccakP-1600-times2-SnP.h +45 -0
  90. data/ext/digest/kangarootwelve/keccak/avr8/KeccakP-1600-times2-on1.c +37 -0
  91. data/ext/digest/kangarootwelve/keccak/avr8/KeccakP-1600-times4-SnP.h +45 -0
  92. data/ext/digest/kangarootwelve/keccak/avr8/KeccakP-1600-times4-on1.c +37 -0
  93. data/ext/digest/kangarootwelve/keccak/avr8/KeccakP-1600-times8-SnP.h +45 -0
  94. data/ext/digest/kangarootwelve/keccak/avr8/KeccakP-1600-times8-on1.c +37 -0
  95. data/ext/digest/kangarootwelve/keccak/avr8/KeccakSpongeWidth1600.link.c +1 -0
  96. data/ext/digest/kangarootwelve/keccak/avr8/PlSnP-Fallback.inc +287 -0
  97. data/ext/digest/kangarootwelve/keccak/avr8/ext.link.c +1 -0
  98. data/ext/digest/kangarootwelve/keccak/bulldozer/KangarooTwelve.link.c +1 -0
  99. data/ext/digest/kangarootwelve/keccak/bulldozer/KeccakDuplexWidth1600.link.c +1 -0
  100. data/ext/digest/kangarootwelve/keccak/bulldozer/KeccakP-1600-SnP.h +39 -0
  101. data/ext/digest/kangarootwelve/keccak/bulldozer/KeccakP-1600-XOP-config.h +6 -0
  102. data/ext/digest/kangarootwelve/keccak/bulldozer/KeccakP-1600-XOP.c +473 -0
  103. data/ext/digest/kangarootwelve/keccak/bulldozer/KeccakP-1600-times2-SIMD128.c +954 -0
  104. data/ext/digest/kangarootwelve/keccak/bulldozer/KeccakP-1600-times2-SnP.h +47 -0
  105. data/ext/digest/kangarootwelve/keccak/bulldozer/KeccakP-1600-times4-SnP.h +45 -0
  106. data/ext/digest/kangarootwelve/keccak/bulldozer/KeccakP-1600-times4-on2.c +38 -0
  107. data/ext/digest/kangarootwelve/keccak/bulldozer/KeccakP-1600-times8-SnP.h +45 -0
  108. data/ext/digest/kangarootwelve/keccak/bulldozer/KeccakP-1600-times8-on2.c +38 -0
  109. data/ext/digest/kangarootwelve/keccak/bulldozer/KeccakP-1600-unrolling.macros +302 -0
  110. data/ext/digest/kangarootwelve/keccak/bulldozer/KeccakSpongeWidth1600.link.c +1 -0
  111. data/ext/digest/kangarootwelve/keccak/bulldozer/PlSnP-Fallback.inc +287 -0
  112. data/ext/digest/kangarootwelve/keccak/bulldozer/SIMD128-config.h +9 -0
  113. data/ext/digest/kangarootwelve/{SnP-Relaned.h → keccak/bulldozer/SnP-Relaned.h} +13 -7
  114. data/ext/digest/kangarootwelve/keccak/bulldozer/ext.link.c +1 -0
  115. data/ext/digest/kangarootwelve/{KangarooTwelve.c → keccak/common/KangarooTwelve.c} +6 -10
  116. data/ext/digest/kangarootwelve/{KangarooTwelve.h → keccak/common/KangarooTwelve.h} +3 -7
  117. data/ext/digest/kangarootwelve/keccak/common/KeccakDuplex-common.h +37 -0
  118. data/ext/digest/kangarootwelve/keccak/common/KeccakDuplex.inc +192 -0
  119. data/ext/digest/kangarootwelve/keccak/common/KeccakDuplexWidth1600.c +34 -0
  120. data/ext/digest/kangarootwelve/keccak/common/KeccakDuplexWidth1600.h +25 -0
  121. data/ext/digest/kangarootwelve/{KeccakSponge-common.h → keccak/common/KeccakSponge-common.h} +5 -7
  122. data/ext/digest/kangarootwelve/{KeccakSponge.inc → keccak/common/KeccakSponge.inc} +6 -8
  123. data/ext/digest/kangarootwelve/{KeccakSpongeWidth1600.c → keccak/common/KeccakSpongeWidth1600.c} +6 -8
  124. data/ext/digest/kangarootwelve/{KeccakSpongeWidth1600.h → keccak/common/KeccakSpongeWidth1600.h} +5 -7
  125. data/ext/digest/kangarootwelve/{Phases.h → keccak/common/Phases.h} +3 -7
  126. data/ext/digest/kangarootwelve/{align.h → keccak/common/align.h} +5 -7
  127. data/ext/digest/kangarootwelve/{brg_endian.h → keccak/common/brg_endian.h} +0 -0
  128. data/ext/digest/kangarootwelve/keccak/compact/KangarooTwelve.link.c +1 -0
  129. data/ext/digest/kangarootwelve/keccak/compact/KeccakDuplexWidth1600.link.c +1 -0
  130. data/ext/digest/kangarootwelve/{KeccakP-1600-SnP.h → keccak/compact/KeccakP-1600-SnP.h} +7 -10
  131. data/ext/digest/kangarootwelve/{KeccakP-1600-compact64.c → keccak/compact/KeccakP-1600-compact64.c} +11 -7
  132. data/ext/digest/kangarootwelve/keccak/compact/KeccakP-1600-times2-SnP.h +45 -0
  133. data/ext/digest/kangarootwelve/keccak/compact/KeccakP-1600-times2-on1.c +37 -0
  134. data/ext/digest/kangarootwelve/keccak/compact/KeccakP-1600-times4-SnP.h +45 -0
  135. data/ext/digest/kangarootwelve/keccak/compact/KeccakP-1600-times4-on1.c +37 -0
  136. data/ext/digest/kangarootwelve/keccak/compact/KeccakP-1600-times8-SnP.h +45 -0
  137. data/ext/digest/kangarootwelve/keccak/compact/KeccakP-1600-times8-on1.c +37 -0
  138. data/ext/digest/kangarootwelve/keccak/compact/KeccakSpongeWidth1600.link.c +1 -0
  139. data/ext/digest/kangarootwelve/keccak/compact/PlSnP-Fallback.inc +287 -0
  140. data/ext/digest/kangarootwelve/keccak/compact/SnP-Relaned.h +140 -0
  141. data/ext/digest/kangarootwelve/keccak/compact/ext.link.c +1 -0
  142. data/ext/digest/kangarootwelve/keccak/generic32/KangarooTwelve.link.c +1 -0
  143. data/ext/digest/kangarootwelve/keccak/generic32/KeccakDuplexWidth1600.link.c +1 -0
  144. data/ext/digest/kangarootwelve/keccak/generic32/KeccakP-1600-SnP.h +38 -0
  145. data/ext/digest/kangarootwelve/keccak/generic32/KeccakP-1600-inplace32BI.c +1162 -0
  146. data/ext/digest/kangarootwelve/keccak/generic32/KeccakP-1600-times2-SnP.h +45 -0
  147. data/ext/digest/kangarootwelve/keccak/generic32/KeccakP-1600-times2-on1.c +37 -0
  148. data/ext/digest/kangarootwelve/keccak/generic32/KeccakP-1600-times4-SnP.h +45 -0
  149. data/ext/digest/kangarootwelve/keccak/generic32/KeccakP-1600-times4-on1.c +37 -0
  150. data/ext/digest/kangarootwelve/keccak/generic32/KeccakP-1600-times8-SnP.h +45 -0
  151. data/ext/digest/kangarootwelve/keccak/generic32/KeccakP-1600-times8-on1.c +37 -0
  152. data/ext/digest/kangarootwelve/keccak/generic32/KeccakSpongeWidth1600.link.c +1 -0
  153. data/ext/digest/kangarootwelve/keccak/generic32/PlSnP-Fallback.inc +287 -0
  154. data/ext/digest/kangarootwelve/keccak/generic32/SnP-Relaned.h +140 -0
  155. data/ext/digest/kangarootwelve/keccak/generic32/ext.link.c +1 -0
  156. data/ext/digest/kangarootwelve/keccak/generic32lc/KangarooTwelve.link.c +1 -0
  157. data/ext/digest/kangarootwelve/keccak/generic32lc/KeccakDuplexWidth1600.link.c +1 -0
  158. data/ext/digest/kangarootwelve/keccak/generic32lc/KeccakP-1600-SnP.h +38 -0
  159. data/ext/digest/kangarootwelve/keccak/generic32lc/KeccakP-1600-inplace32BI.c +1162 -0
  160. data/ext/digest/kangarootwelve/keccak/generic32lc/KeccakP-1600-times2-SnP.h +45 -0
  161. data/ext/digest/kangarootwelve/keccak/generic32lc/KeccakP-1600-times2-on1.c +37 -0
  162. data/ext/digest/kangarootwelve/keccak/generic32lc/KeccakP-1600-times4-SnP.h +45 -0
  163. data/ext/digest/kangarootwelve/keccak/generic32lc/KeccakP-1600-times4-on1.c +37 -0
  164. data/ext/digest/kangarootwelve/keccak/generic32lc/KeccakP-1600-times8-SnP.h +45 -0
  165. data/ext/digest/kangarootwelve/keccak/generic32lc/KeccakP-1600-times8-on1.c +37 -0
  166. data/ext/digest/kangarootwelve/keccak/generic32lc/KeccakSpongeWidth1600.link.c +1 -0
  167. data/ext/digest/kangarootwelve/keccak/generic32lc/PlSnP-Fallback.inc +287 -0
  168. data/ext/digest/kangarootwelve/keccak/generic32lc/SnP-Relaned.h +140 -0
  169. data/ext/digest/kangarootwelve/keccak/generic32lc/ext.link.c +1 -0
  170. data/ext/digest/kangarootwelve/keccak/generic64/KangarooTwelve.link.c +1 -0
  171. data/ext/digest/kangarootwelve/keccak/generic64/KeccakDuplexWidth1600.link.c +1 -0
  172. data/ext/digest/kangarootwelve/keccak/generic64/KeccakP-1600-64.macros +2195 -0
  173. data/ext/digest/kangarootwelve/keccak/generic64/KeccakP-1600-SnP.h +49 -0
  174. data/ext/digest/kangarootwelve/keccak/generic64/KeccakP-1600-opt64-config.h +6 -0
  175. data/ext/digest/kangarootwelve/keccak/generic64/KeccakP-1600-opt64.c +541 -0
  176. data/ext/digest/kangarootwelve/keccak/generic64/KeccakP-1600-times2-SnP.h +45 -0
  177. data/ext/digest/kangarootwelve/keccak/generic64/KeccakP-1600-times2-on1.c +37 -0
  178. data/ext/digest/kangarootwelve/keccak/generic64/KeccakP-1600-times4-SnP.h +45 -0
  179. data/ext/digest/kangarootwelve/keccak/generic64/KeccakP-1600-times4-on1.c +37 -0
  180. data/ext/digest/kangarootwelve/keccak/generic64/KeccakP-1600-times8-SnP.h +45 -0
  181. data/ext/digest/kangarootwelve/keccak/generic64/KeccakP-1600-times8-on1.c +37 -0
  182. data/ext/digest/kangarootwelve/keccak/generic64/KeccakP-1600-unrolling.macros +302 -0
  183. data/ext/digest/kangarootwelve/keccak/generic64/KeccakSpongeWidth1600.link.c +1 -0
  184. data/ext/digest/kangarootwelve/keccak/generic64/PlSnP-Fallback.inc +287 -0
  185. data/ext/digest/kangarootwelve/keccak/generic64/SnP-Relaned.h +140 -0
  186. data/ext/digest/kangarootwelve/keccak/generic64/ext.link.c +1 -0
  187. data/ext/digest/kangarootwelve/keccak/generic64lc/KangarooTwelve.link.c +1 -0
  188. data/ext/digest/kangarootwelve/keccak/generic64lc/KeccakDuplexWidth1600.link.c +1 -0
  189. data/ext/digest/kangarootwelve/keccak/generic64lc/KeccakP-1600-64.macros +2195 -0
  190. data/ext/digest/kangarootwelve/keccak/generic64lc/KeccakP-1600-SnP.h +49 -0
  191. data/ext/digest/kangarootwelve/keccak/generic64lc/KeccakP-1600-opt64-config.h +7 -0
  192. data/ext/digest/kangarootwelve/keccak/generic64lc/KeccakP-1600-opt64.c +541 -0
  193. data/ext/digest/kangarootwelve/keccak/generic64lc/KeccakP-1600-times2-SnP.h +45 -0
  194. data/ext/digest/kangarootwelve/keccak/generic64lc/KeccakP-1600-times2-on1.c +37 -0
  195. data/ext/digest/kangarootwelve/keccak/generic64lc/KeccakP-1600-times4-SnP.h +45 -0
  196. data/ext/digest/kangarootwelve/keccak/generic64lc/KeccakP-1600-times4-on1.c +37 -0
  197. data/ext/digest/kangarootwelve/keccak/generic64lc/KeccakP-1600-times8-SnP.h +45 -0
  198. data/ext/digest/kangarootwelve/keccak/generic64lc/KeccakP-1600-times8-on1.c +37 -0
  199. data/ext/digest/kangarootwelve/keccak/generic64lc/KeccakP-1600-unrolling.macros +302 -0
  200. data/ext/digest/kangarootwelve/keccak/generic64lc/KeccakSpongeWidth1600.link.c +1 -0
  201. data/ext/digest/kangarootwelve/keccak/generic64lc/PlSnP-Fallback.inc +287 -0
  202. data/ext/digest/kangarootwelve/keccak/generic64lc/SnP-Relaned.h +140 -0
  203. data/ext/digest/kangarootwelve/keccak/generic64lc/ext.link.c +1 -0
  204. data/ext/digest/kangarootwelve/keccak/haswell/KangarooTwelve.link.c +1 -0
  205. data/ext/digest/kangarootwelve/keccak/haswell/KeccakDuplexWidth1600.link.c +1 -0
  206. data/ext/digest/kangarootwelve/keccak/haswell/KeccakP-1600-AVX2.s +993 -0
  207. data/ext/digest/kangarootwelve/keccak/haswell/KeccakP-1600-SnP.h +41 -0
  208. data/ext/digest/kangarootwelve/keccak/haswell/KeccakP-1600-times2-SIMD128.c +954 -0
  209. data/ext/digest/kangarootwelve/keccak/haswell/KeccakP-1600-times2-SnP.h +47 -0
  210. data/ext/digest/kangarootwelve/keccak/haswell/KeccakP-1600-times4-SIMD256.c +1303 -0
  211. data/ext/digest/kangarootwelve/keccak/haswell/KeccakP-1600-times4-SnP.h +53 -0
  212. data/ext/digest/kangarootwelve/keccak/haswell/KeccakP-1600-times8-SnP.h +45 -0
  213. data/ext/digest/kangarootwelve/keccak/haswell/KeccakP-1600-times8-on4.c +38 -0
  214. data/ext/digest/kangarootwelve/keccak/haswell/KeccakP-1600-unrolling.macros +302 -0
  215. data/ext/digest/kangarootwelve/keccak/haswell/KeccakSpongeWidth1600.link.c +1 -0
  216. data/ext/digest/kangarootwelve/keccak/haswell/PlSnP-Fallback.inc +287 -0
  217. data/ext/digest/kangarootwelve/keccak/haswell/SIMD128-config.h +8 -0
  218. data/ext/digest/kangarootwelve/keccak/haswell/SIMD256-config.h +7 -0
  219. data/ext/digest/kangarootwelve/keccak/haswell/ext.link.c +1 -0
  220. data/ext/digest/kangarootwelve/keccak/nehalem/KangarooTwelve.link.c +1 -0
  221. data/ext/digest/kangarootwelve/keccak/nehalem/KeccakDuplexWidth1600.link.c +1 -0
  222. data/ext/digest/kangarootwelve/keccak/nehalem/KeccakP-1600-64.macros +2195 -0
  223. data/ext/digest/kangarootwelve/keccak/nehalem/KeccakP-1600-SnP.h +49 -0
  224. data/ext/digest/kangarootwelve/keccak/nehalem/KeccakP-1600-opt64-config.h +7 -0
  225. data/ext/digest/kangarootwelve/keccak/nehalem/KeccakP-1600-opt64.c +541 -0
  226. data/ext/digest/kangarootwelve/keccak/nehalem/KeccakP-1600-times2-SIMD128.c +954 -0
  227. data/ext/digest/kangarootwelve/keccak/nehalem/KeccakP-1600-times2-SnP.h +47 -0
  228. data/ext/digest/kangarootwelve/keccak/nehalem/KeccakP-1600-times4-SnP.h +45 -0
  229. data/ext/digest/kangarootwelve/keccak/nehalem/KeccakP-1600-times4-on2.c +38 -0
  230. data/ext/digest/kangarootwelve/keccak/nehalem/KeccakP-1600-times8-SnP.h +45 -0
  231. data/ext/digest/kangarootwelve/keccak/nehalem/KeccakP-1600-times8-on2.c +38 -0
  232. data/ext/digest/kangarootwelve/keccak/nehalem/KeccakP-1600-unrolling.macros +302 -0
  233. data/ext/digest/kangarootwelve/keccak/nehalem/KeccakSpongeWidth1600.link.c +1 -0
  234. data/ext/digest/kangarootwelve/keccak/nehalem/PlSnP-Fallback.inc +287 -0
  235. data/ext/digest/kangarootwelve/keccak/nehalem/SIMD128-config.h +8 -0
  236. data/ext/digest/kangarootwelve/keccak/nehalem/SnP-Relaned.h +140 -0
  237. data/ext/digest/kangarootwelve/keccak/nehalem/ext.link.c +1 -0
  238. data/ext/digest/kangarootwelve/keccak/reference/KangarooTwelve.link.c +1 -0
  239. data/ext/digest/kangarootwelve/keccak/reference/KeccakDuplexWidth1600.link.c +1 -0
  240. data/ext/digest/kangarootwelve/keccak/reference/KeccakP-1600-SnP.h +41 -0
  241. data/ext/digest/kangarootwelve/keccak/reference/KeccakP-1600-reference.c +424 -0
  242. data/ext/digest/kangarootwelve/keccak/reference/KeccakP-1600-reference.h +20 -0
  243. data/ext/digest/kangarootwelve/keccak/reference/KeccakP-1600-times2-SnP.h +45 -0
  244. data/ext/digest/kangarootwelve/keccak/reference/KeccakP-1600-times2-on1.c +37 -0
  245. data/ext/digest/kangarootwelve/keccak/reference/KeccakP-1600-times4-SnP.h +45 -0
  246. data/ext/digest/kangarootwelve/keccak/reference/KeccakP-1600-times4-on1.c +37 -0
  247. data/ext/digest/kangarootwelve/keccak/reference/KeccakP-1600-times8-SnP.h +45 -0
  248. data/ext/digest/kangarootwelve/keccak/reference/KeccakP-1600-times8-on1.c +37 -0
  249. data/ext/digest/kangarootwelve/keccak/reference/KeccakSpongeWidth1600.link.c +1 -0
  250. data/ext/digest/kangarootwelve/keccak/reference/PlSnP-Fallback.inc +287 -0
  251. data/ext/digest/kangarootwelve/keccak/reference/displayIntermediateValues.c +176 -0
  252. data/ext/digest/kangarootwelve/keccak/reference/displayIntermediateValues.h +29 -0
  253. data/ext/digest/kangarootwelve/keccak/reference/ext.link.c +1 -0
  254. data/ext/digest/kangarootwelve/keccak/reference32bits/KangarooTwelve.link.c +1 -0
  255. data/ext/digest/kangarootwelve/keccak/reference32bits/KeccakDuplexWidth1600.link.c +1 -0
  256. data/ext/digest/kangarootwelve/keccak/reference32bits/KeccakP-1600-SnP.h +41 -0
  257. data/ext/digest/kangarootwelve/keccak/reference32bits/KeccakP-1600-reference.h +20 -0
  258. data/ext/digest/kangarootwelve/keccak/reference32bits/KeccakP-1600-reference32BI.c +612 -0
  259. data/ext/digest/kangarootwelve/keccak/reference32bits/KeccakP-1600-times2-SnP.h +45 -0
  260. data/ext/digest/kangarootwelve/keccak/reference32bits/KeccakP-1600-times2-on1.c +37 -0
  261. data/ext/digest/kangarootwelve/keccak/reference32bits/KeccakP-1600-times4-SnP.h +45 -0
  262. data/ext/digest/kangarootwelve/keccak/reference32bits/KeccakP-1600-times4-on1.c +37 -0
  263. data/ext/digest/kangarootwelve/keccak/reference32bits/KeccakP-1600-times8-SnP.h +45 -0
  264. data/ext/digest/kangarootwelve/keccak/reference32bits/KeccakP-1600-times8-on1.c +37 -0
  265. data/ext/digest/kangarootwelve/keccak/reference32bits/KeccakSpongeWidth1600.link.c +1 -0
  266. data/ext/digest/kangarootwelve/keccak/reference32bits/PlSnP-Fallback.inc +287 -0
  267. data/ext/digest/kangarootwelve/keccak/reference32bits/displayIntermediateValues.c +176 -0
  268. data/ext/digest/kangarootwelve/keccak/reference32bits/displayIntermediateValues.h +29 -0
  269. data/ext/digest/kangarootwelve/keccak/reference32bits/ext.link.c +1 -0
  270. data/ext/digest/kangarootwelve/keccak/sandybridge/KangarooTwelve.link.c +1 -0
  271. data/ext/digest/kangarootwelve/keccak/sandybridge/KeccakDuplexWidth1600.link.c +1 -0
  272. data/ext/digest/kangarootwelve/keccak/sandybridge/KeccakP-1600-64.macros +2195 -0
  273. data/ext/digest/kangarootwelve/keccak/sandybridge/KeccakP-1600-SnP.h +49 -0
  274. data/ext/digest/kangarootwelve/keccak/sandybridge/KeccakP-1600-opt64-config.h +8 -0
  275. data/ext/digest/kangarootwelve/keccak/sandybridge/KeccakP-1600-opt64.c +541 -0
  276. data/ext/digest/kangarootwelve/keccak/sandybridge/KeccakP-1600-times2-SIMD128.c +954 -0
  277. data/ext/digest/kangarootwelve/keccak/sandybridge/KeccakP-1600-times2-SnP.h +47 -0
  278. data/ext/digest/kangarootwelve/keccak/sandybridge/KeccakP-1600-times4-SnP.h +45 -0
  279. data/ext/digest/kangarootwelve/keccak/sandybridge/KeccakP-1600-times4-on2.c +38 -0
  280. data/ext/digest/kangarootwelve/keccak/sandybridge/KeccakP-1600-times8-SnP.h +45 -0
  281. data/ext/digest/kangarootwelve/keccak/sandybridge/KeccakP-1600-times8-on2.c +38 -0
  282. data/ext/digest/kangarootwelve/keccak/sandybridge/KeccakP-1600-unrolling.macros +302 -0
  283. data/ext/digest/kangarootwelve/keccak/sandybridge/KeccakSpongeWidth1600.link.c +1 -0
  284. data/ext/digest/kangarootwelve/keccak/sandybridge/PlSnP-Fallback.inc +287 -0
  285. data/ext/digest/kangarootwelve/keccak/sandybridge/SIMD128-config.h +8 -0
  286. data/ext/digest/kangarootwelve/keccak/sandybridge/SnP-Relaned.h +140 -0
  287. data/ext/digest/kangarootwelve/keccak/sandybridge/ext.link.c +1 -0
  288. data/ext/digest/kangarootwelve/keccak/skylakex/KangarooTwelve.link.c +1 -0
  289. data/ext/digest/kangarootwelve/keccak/skylakex/KeccakDuplexWidth1600.link.c +1 -0
  290. data/ext/digest/kangarootwelve/keccak/skylakex/KeccakP-1600-AVX512-config.h +6 -0
  291. data/ext/digest/kangarootwelve/keccak/skylakex/KeccakP-1600-AVX512.c +621 -0
  292. data/ext/digest/kangarootwelve/keccak/skylakex/KeccakP-1600-SnP.h +42 -0
  293. data/ext/digest/kangarootwelve/keccak/skylakex/KeccakP-1600-times2-SIMD512.c +852 -0
  294. data/ext/digest/kangarootwelve/keccak/skylakex/KeccakP-1600-times2-SnP.h +49 -0
  295. data/ext/digest/kangarootwelve/keccak/skylakex/KeccakP-1600-times4-SIMD512.c +883 -0
  296. data/ext/digest/kangarootwelve/keccak/skylakex/KeccakP-1600-times4-SnP.h +49 -0
  297. data/ext/digest/kangarootwelve/keccak/skylakex/KeccakP-1600-times8-SIMD512.c +1473 -0
  298. data/ext/digest/kangarootwelve/keccak/skylakex/KeccakP-1600-times8-SnP.h +53 -0
  299. data/ext/digest/kangarootwelve/keccak/skylakex/KeccakSpongeWidth1600.link.c +1 -0
  300. data/ext/digest/kangarootwelve/keccak/skylakex/SIMD512-2-config.h +7 -0
  301. data/ext/digest/kangarootwelve/keccak/skylakex/SIMD512-4-config.h +7 -0
  302. data/ext/digest/kangarootwelve/keccak/skylakex/SIMD512-config.h +7 -0
  303. data/ext/digest/kangarootwelve/keccak/skylakex/ext.link.c +1 -0
  304. data/ext/digest/kangarootwelve/utils.h +101 -0
  305. data/lib/digest/kangarootwelve/version.rb +2 -2
  306. data/test/test.rb +68 -31
  307. metadata +305 -27
@@ -0,0 +1 @@
1
+ #include "../common/KeccakSpongeWidth1600.c"
@@ -0,0 +1,287 @@
1
+ /*
2
+ Implementation by Gilles Van Assche, hereby denoted as "the implementer".
3
+
4
+ For more information, feedback or questions, please refer to our website:
5
+ https://keccak.team/
6
+
7
+ To the extent possible under law, the implementer has waived all copyright
8
+ and related or neighboring rights to the source code in this file.
9
+ http://creativecommons.org/publicdomain/zero/1.0/
10
+
11
+ ---
12
+
13
+ This file contains macros that help make a PlSnP-compatible implementation by
14
+ serially falling back on a SnP-compatible implementation or on a PlSnP-compatible
15
+ implementation of lower parallelism degree.
16
+
17
+ Please refer to PlSnP-documentation.h for more details.
18
+ */
19
+
20
+ /* expect PlSnP_baseParallelism, PlSnP_targetParallelism */
21
+ /* expect SnP_stateSizeInBytes, SnP_stateAlignment */
22
+ /* expect prefix */
23
+ /* expect SnP_* */
24
+
25
+ #define JOIN0(a, b) a ## b
26
+ #define JOIN(a, b) JOIN0(a, b)
27
+
28
+ #define PlSnP_StaticInitialize JOIN(prefix, _StaticInitialize)
29
+ #define PlSnP_InitializeAll JOIN(prefix, _InitializeAll)
30
+ #define PlSnP_AddByte JOIN(prefix, _AddByte)
31
+ #define PlSnP_AddBytes JOIN(prefix, _AddBytes)
32
+ #define PlSnP_AddLanesAll JOIN(prefix, _AddLanesAll)
33
+ #define PlSnP_OverwriteBytes JOIN(prefix, _OverwriteBytes)
34
+ #define PlSnP_OverwriteLanesAll JOIN(prefix, _OverwriteLanesAll)
35
+ #define PlSnP_OverwriteWithZeroes JOIN(prefix, _OverwriteWithZeroes)
36
+ #define PlSnP_ExtractBytes JOIN(prefix, _ExtractBytes)
37
+ #define PlSnP_ExtractLanesAll JOIN(prefix, _ExtractLanesAll)
38
+ #define PlSnP_ExtractAndAddBytes JOIN(prefix, _ExtractAndAddBytes)
39
+ #define PlSnP_ExtractAndAddLanesAll JOIN(prefix, _ExtractAndAddLanesAll)
40
+
41
+ #if (PlSnP_baseParallelism == 1)
42
+ #define SnP_stateSizeInBytes JOIN(SnP, _stateSizeInBytes)
43
+ #define SnP_stateAlignment JOIN(SnP, _stateAlignment)
44
+ #else
45
+ #define SnP_stateSizeInBytes JOIN(SnP, _statesSizeInBytes)
46
+ #define SnP_stateAlignment JOIN(SnP, _statesAlignment)
47
+ #endif
48
+ #define PlSnP_factor ((PlSnP_targetParallelism)/(PlSnP_baseParallelism))
49
+ #define SnP_stateOffset (((SnP_stateSizeInBytes+(SnP_stateAlignment-1))/SnP_stateAlignment)*SnP_stateAlignment)
50
+ #define stateWithIndex(i) ((unsigned char *)states+((i)*SnP_stateOffset))
51
+
52
+ #define SnP_StaticInitialize JOIN(SnP, _StaticInitialize)
53
+ #define SnP_Initialize JOIN(SnP, _Initialize)
54
+ #define SnP_InitializeAll JOIN(SnP, _InitializeAll)
55
+ #define SnP_AddByte JOIN(SnP, _AddByte)
56
+ #define SnP_AddBytes JOIN(SnP, _AddBytes)
57
+ #define SnP_AddLanesAll JOIN(SnP, _AddLanesAll)
58
+ #define SnP_OverwriteBytes JOIN(SnP, _OverwriteBytes)
59
+ #define SnP_OverwriteLanesAll JOIN(SnP, _OverwriteLanesAll)
60
+ #define SnP_OverwriteWithZeroes JOIN(SnP, _OverwriteWithZeroes)
61
+ #define SnP_ExtractBytes JOIN(SnP, _ExtractBytes)
62
+ #define SnP_ExtractLanesAll JOIN(SnP, _ExtractLanesAll)
63
+ #define SnP_ExtractAndAddBytes JOIN(SnP, _ExtractAndAddBytes)
64
+ #define SnP_ExtractAndAddLanesAll JOIN(SnP, _ExtractAndAddLanesAll)
65
+
66
+ void PlSnP_StaticInitialize( void )
67
+ {
68
+ SnP_StaticInitialize();
69
+ }
70
+
71
+ void PlSnP_InitializeAll(void *states)
72
+ {
73
+ unsigned int i;
74
+
75
+ for(i=0; i<PlSnP_factor; i++)
76
+ #if (PlSnP_baseParallelism == 1)
77
+ SnP_Initialize(stateWithIndex(i));
78
+ #else
79
+ SnP_InitializeAll(stateWithIndex(i));
80
+ #endif
81
+ }
82
+
83
+ void PlSnP_AddByte(void *states, unsigned int instanceIndex, unsigned char byte, unsigned int offset)
84
+ {
85
+ #if (PlSnP_baseParallelism == 1)
86
+ SnP_AddByte(stateWithIndex(instanceIndex), byte, offset);
87
+ #else
88
+ SnP_AddByte(stateWithIndex(instanceIndex/PlSnP_baseParallelism), instanceIndex%PlSnP_baseParallelism, byte, offset);
89
+ #endif
90
+ }
91
+
92
+ void PlSnP_AddBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length)
93
+ {
94
+ #if (PlSnP_baseParallelism == 1)
95
+ SnP_AddBytes(stateWithIndex(instanceIndex), data, offset, length);
96
+ #else
97
+ SnP_AddBytes(stateWithIndex(instanceIndex/PlSnP_baseParallelism), instanceIndex%PlSnP_baseParallelism, data, offset, length);
98
+ #endif
99
+ }
100
+
101
+ void PlSnP_AddLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
102
+ {
103
+ unsigned int i;
104
+
105
+ for(i=0; i<PlSnP_factor; i++) {
106
+ #if (PlSnP_baseParallelism == 1)
107
+ SnP_AddBytes(stateWithIndex(i), data, 0, laneCount*SnP_laneLengthInBytes);
108
+ #else
109
+ SnP_AddLanesAll(stateWithIndex(i), data, laneCount, laneOffset);
110
+ #endif
111
+ data += PlSnP_baseParallelism*laneOffset*SnP_laneLengthInBytes;
112
+ }
113
+ }
114
+
115
+ void PlSnP_OverwriteBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length)
116
+ {
117
+ #if (PlSnP_baseParallelism == 1)
118
+ SnP_OverwriteBytes(stateWithIndex(instanceIndex), data, offset, length);
119
+ #else
120
+ SnP_OverwriteBytes(stateWithIndex(instanceIndex/PlSnP_baseParallelism), instanceIndex%PlSnP_baseParallelism, data, offset, length);
121
+ #endif
122
+ }
123
+
124
+ void PlSnP_OverwriteLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
125
+ {
126
+ unsigned int i;
127
+
128
+ for(i=0; i<PlSnP_factor; i++) {
129
+ #if (PlSnP_baseParallelism == 1)
130
+ SnP_OverwriteBytes(stateWithIndex(i), data, 0, laneCount*SnP_laneLengthInBytes);
131
+ #else
132
+ SnP_OverwriteLanesAll(stateWithIndex(i), data, laneCount, laneOffset);
133
+ #endif
134
+ data += PlSnP_baseParallelism*laneOffset*SnP_laneLengthInBytes;
135
+ }
136
+ }
137
+
138
+ void PlSnP_OverwriteWithZeroes(void *states, unsigned int instanceIndex, unsigned int byteCount)
139
+ {
140
+ #if (PlSnP_baseParallelism == 1)
141
+ SnP_OverwriteWithZeroes(stateWithIndex(instanceIndex), byteCount);
142
+ #else
143
+ SnP_OverwriteWithZeroes(stateWithIndex(instanceIndex/PlSnP_baseParallelism), instanceIndex%PlSnP_baseParallelism, byteCount);
144
+ #endif
145
+ }
146
+
147
+ void PlSnP_PermuteAll(void *states)
148
+ {
149
+ unsigned int i;
150
+
151
+ for(i=0; i<PlSnP_factor; i++) {
152
+ #if (PlSnP_baseParallelism == 1)
153
+ SnP_Permute(stateWithIndex(i));
154
+ #else
155
+ SnP_PermuteAll(stateWithIndex(i));
156
+ #endif
157
+ }
158
+ }
159
+
160
+ #if (defined(SnP_Permute_12rounds) || defined(SnP_PermuteAll_12rounds))
161
+ void PlSnP_PermuteAll_12rounds(void *states)
162
+ {
163
+ unsigned int i;
164
+
165
+ for(i=0; i<PlSnP_factor; i++) {
166
+ #if (PlSnP_baseParallelism == 1)
167
+ SnP_Permute_12rounds(stateWithIndex(i));
168
+ #else
169
+ SnP_PermuteAll_12rounds(stateWithIndex(i));
170
+ #endif
171
+ }
172
+ }
173
+ #endif
174
+
175
+ #if (defined(SnP_Permute_Nrounds) || defined(SnP_PermuteAll_6rounds))
176
+ void PlSnP_PermuteAll_6rounds(void *states)
177
+ {
178
+ unsigned int i;
179
+
180
+ for(i=0; i<PlSnP_factor; i++) {
181
+ #if (PlSnP_baseParallelism == 1)
182
+ SnP_Permute_Nrounds(stateWithIndex(i), 6);
183
+ #else
184
+ SnP_PermuteAll_6rounds(stateWithIndex(i));
185
+ #endif
186
+ }
187
+ }
188
+ #endif
189
+
190
+ #if (defined(SnP_Permute_Nrounds) || defined(SnP_PermuteAll_4rounds))
191
+ void PlSnP_PermuteAll_4rounds(void *states)
192
+ {
193
+ unsigned int i;
194
+
195
+ for(i=0; i<PlSnP_factor; i++) {
196
+ #if (PlSnP_baseParallelism == 1)
197
+ SnP_Permute_Nrounds(stateWithIndex(i), 4);
198
+ #else
199
+ SnP_PermuteAll_4rounds(stateWithIndex(i));
200
+ #endif
201
+ }
202
+ }
203
+ #endif
204
+
205
+ void PlSnP_ExtractBytes(void *states, unsigned int instanceIndex, unsigned char *data, unsigned int offset, unsigned int length)
206
+ {
207
+ #if (PlSnP_baseParallelism == 1)
208
+ SnP_ExtractBytes(stateWithIndex(instanceIndex), data, offset, length);
209
+ #else
210
+ SnP_ExtractBytes(stateWithIndex(instanceIndex/PlSnP_baseParallelism), instanceIndex%PlSnP_baseParallelism, data, offset, length);
211
+ #endif
212
+ }
213
+
214
+ void PlSnP_ExtractLanesAll(const void *states, unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
215
+ {
216
+ unsigned int i;
217
+
218
+ for(i=0; i<PlSnP_factor; i++) {
219
+ #if (PlSnP_baseParallelism == 1)
220
+ SnP_ExtractBytes(stateWithIndex(i), data, 0, laneCount*SnP_laneLengthInBytes);
221
+ #else
222
+ SnP_ExtractLanesAll(stateWithIndex(i), data, laneCount, laneOffset);
223
+ #endif
224
+ data += laneOffset*SnP_laneLengthInBytes*PlSnP_baseParallelism;
225
+ }
226
+ }
227
+
228
+ void PlSnP_ExtractAndAddBytes(void *states, unsigned int instanceIndex, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length)
229
+ {
230
+ #if (PlSnP_baseParallelism == 1)
231
+ SnP_ExtractAndAddBytes(stateWithIndex(instanceIndex), input, output, offset, length);
232
+ #else
233
+ SnP_ExtractAndAddBytes(stateWithIndex(instanceIndex/PlSnP_baseParallelism), instanceIndex%PlSnP_baseParallelism, input, output, offset, length);
234
+ #endif
235
+ }
236
+
237
+ void PlSnP_ExtractAndAddLanesAll(const void *states, const unsigned char *input, unsigned char *output, unsigned int laneCount, unsigned int laneOffset)
238
+ {
239
+ unsigned int i;
240
+
241
+ for(i=0; i<PlSnP_factor; i++) {
242
+ #if (PlSnP_baseParallelism == 1)
243
+ SnP_ExtractAndAddBytes(stateWithIndex(i), input, output, 0, laneCount*SnP_laneLengthInBytes);
244
+ #else
245
+ SnP_ExtractAndAddLanesAll(stateWithIndex(i), input, output, laneCount, laneOffset);
246
+ #endif
247
+ input += laneOffset*SnP_laneLengthInBytes*PlSnP_baseParallelism;
248
+ output += laneOffset*SnP_laneLengthInBytes*PlSnP_baseParallelism;
249
+ }
250
+ }
251
+
252
+ #undef PlSnP_factor
253
+ #undef SnP_stateOffset
254
+ #undef stateWithIndex
255
+ #undef JOIN0
256
+ #undef JOIN
257
+ #undef PlSnP_StaticInitialize
258
+ #undef PlSnP_InitializeAll
259
+ #undef PlSnP_AddByte
260
+ #undef PlSnP_AddBytes
261
+ #undef PlSnP_AddLanesAll
262
+ #undef PlSnP_OverwriteBytes
263
+ #undef PlSnP_OverwriteLanesAll
264
+ #undef PlSnP_OverwriteWithZeroes
265
+ #undef PlSnP_PermuteAll
266
+ #undef PlSnP_ExtractBytes
267
+ #undef PlSnP_ExtractLanesAll
268
+ #undef PlSnP_ExtractAndAddBytes
269
+ #undef PlSnP_ExtractAndAddLanesAll
270
+ #undef SnP_stateAlignment
271
+ #undef SnP_stateSizeInBytes
272
+ #undef PlSnP_factor
273
+ #undef SnP_stateOffset
274
+ #undef stateWithIndex
275
+ #undef SnP_StaticInitialize
276
+ #undef SnP_Initialize
277
+ #undef SnP_InitializeAll
278
+ #undef SnP_AddByte
279
+ #undef SnP_AddBytes
280
+ #undef SnP_AddLanesAll
281
+ #undef SnP_OverwriteBytes
282
+ #undef SnP_OverwriteWithZeroes
283
+ #undef SnP_OverwriteLanesAll
284
+ #undef SnP_ExtractBytes
285
+ #undef SnP_ExtractLanesAll
286
+ #undef SnP_ExtractAndAddBytes
287
+ #undef SnP_ExtractAndAddLanesAll
@@ -0,0 +1,140 @@
1
+ /*
2
+ Implementation by the Keccak Team, namely, Guido Bertoni, Joan Daemen,
3
+ Michaël Peeters, Gilles Van Assche and Ronny Van Keer,
4
+ hereby denoted as "the implementer".
5
+
6
+ For more information, feedback or questions, please refer to our website:
7
+ https://keccak.team/
8
+
9
+ To the extent possible under law, the implementer has waived all copyright
10
+ and related or neighboring rights to the source code in this file.
11
+ http://creativecommons.org/publicdomain/zero/1.0/
12
+
13
+ ---
14
+
15
+ This file contains macros that help implement a permutation in a SnP-compatible way.
16
+ It converts an implementation that implements state input/output functions
17
+ in a lane-oriented fashion (i.e., using SnP_AddLanes() and SnP_AddBytesInLane(),
18
+ and similarly for Overwrite, Extract and ExtractAndAdd) to the byte-oriented SnP.
19
+ Please refer to SnP-documentation.h for more details.
20
+ */
21
+
22
+ #ifndef _SnP_Relaned_h_
23
+ #define _SnP_Relaned_h_
24
+
25
+ #define SnP_AddBytes(state, data, offset, length, SnP_AddLanes, SnP_AddBytesInLane, SnP_laneLengthInBytes) \
26
+ { \
27
+ if ((offset) == 0) { \
28
+ SnP_AddLanes(state, data, (length)/SnP_laneLengthInBytes); \
29
+ SnP_AddBytesInLane(state, \
30
+ (length)/SnP_laneLengthInBytes, \
31
+ (data)+((length)/SnP_laneLengthInBytes)*SnP_laneLengthInBytes, \
32
+ 0, \
33
+ (length)%SnP_laneLengthInBytes); \
34
+ } \
35
+ else { \
36
+ unsigned int _sizeLeft = (length); \
37
+ unsigned int _lanePosition = (offset)/SnP_laneLengthInBytes; \
38
+ unsigned int _offsetInLane = (offset)%SnP_laneLengthInBytes; \
39
+ const unsigned char *_curData = (data); \
40
+ while(_sizeLeft > 0) { \
41
+ unsigned int _bytesInLane = SnP_laneLengthInBytes - _offsetInLane; \
42
+ if (_bytesInLane > _sizeLeft) \
43
+ _bytesInLane = _sizeLeft; \
44
+ SnP_AddBytesInLane(state, _lanePosition, _curData, _offsetInLane, _bytesInLane); \
45
+ _sizeLeft -= _bytesInLane; \
46
+ _lanePosition++; \
47
+ _offsetInLane = 0; \
48
+ _curData += _bytesInLane; \
49
+ } \
50
+ } \
51
+ }
52
+
53
+ #define SnP_OverwriteBytes(state, data, offset, length, SnP_OverwriteLanes, SnP_OverwriteBytesInLane, SnP_laneLengthInBytes) \
54
+ { \
55
+ if ((offset) == 0) { \
56
+ SnP_OverwriteLanes(state, data, (length)/SnP_laneLengthInBytes); \
57
+ SnP_OverwriteBytesInLane(state, \
58
+ (length)/SnP_laneLengthInBytes, \
59
+ (data)+((length)/SnP_laneLengthInBytes)*SnP_laneLengthInBytes, \
60
+ 0, \
61
+ (length)%SnP_laneLengthInBytes); \
62
+ } \
63
+ else { \
64
+ unsigned int _sizeLeft = (length); \
65
+ unsigned int _lanePosition = (offset)/SnP_laneLengthInBytes; \
66
+ unsigned int _offsetInLane = (offset)%SnP_laneLengthInBytes; \
67
+ const unsigned char *_curData = (data); \
68
+ while(_sizeLeft > 0) { \
69
+ unsigned int _bytesInLane = SnP_laneLengthInBytes - _offsetInLane; \
70
+ if (_bytesInLane > _sizeLeft) \
71
+ _bytesInLane = _sizeLeft; \
72
+ SnP_OverwriteBytesInLane(state, _lanePosition, _curData, _offsetInLane, _bytesInLane); \
73
+ _sizeLeft -= _bytesInLane; \
74
+ _lanePosition++; \
75
+ _offsetInLane = 0; \
76
+ _curData += _bytesInLane; \
77
+ } \
78
+ } \
79
+ }
80
+
81
+ #define SnP_ExtractBytes(state, data, offset, length, SnP_ExtractLanes, SnP_ExtractBytesInLane, SnP_laneLengthInBytes) \
82
+ { \
83
+ if ((offset) == 0) { \
84
+ SnP_ExtractLanes(state, data, (length)/SnP_laneLengthInBytes); \
85
+ SnP_ExtractBytesInLane(state, \
86
+ (length)/SnP_laneLengthInBytes, \
87
+ (data)+((length)/SnP_laneLengthInBytes)*SnP_laneLengthInBytes, \
88
+ 0, \
89
+ (length)%SnP_laneLengthInBytes); \
90
+ } \
91
+ else { \
92
+ unsigned int _sizeLeft = (length); \
93
+ unsigned int _lanePosition = (offset)/SnP_laneLengthInBytes; \
94
+ unsigned int _offsetInLane = (offset)%SnP_laneLengthInBytes; \
95
+ unsigned char *_curData = (data); \
96
+ while(_sizeLeft > 0) { \
97
+ unsigned int _bytesInLane = SnP_laneLengthInBytes - _offsetInLane; \
98
+ if (_bytesInLane > _sizeLeft) \
99
+ _bytesInLane = _sizeLeft; \
100
+ SnP_ExtractBytesInLane(state, _lanePosition, _curData, _offsetInLane, _bytesInLane); \
101
+ _sizeLeft -= _bytesInLane; \
102
+ _lanePosition++; \
103
+ _offsetInLane = 0; \
104
+ _curData += _bytesInLane; \
105
+ } \
106
+ } \
107
+ }
108
+
109
+ #define SnP_ExtractAndAddBytes(state, input, output, offset, length, SnP_ExtractAndAddLanes, SnP_ExtractAndAddBytesInLane, SnP_laneLengthInBytes) \
110
+ { \
111
+ if ((offset) == 0) { \
112
+ SnP_ExtractAndAddLanes(state, input, output, (length)/SnP_laneLengthInBytes); \
113
+ SnP_ExtractAndAddBytesInLane(state, \
114
+ (length)/SnP_laneLengthInBytes, \
115
+ (input)+((length)/SnP_laneLengthInBytes)*SnP_laneLengthInBytes, \
116
+ (output)+((length)/SnP_laneLengthInBytes)*SnP_laneLengthInBytes, \
117
+ 0, \
118
+ (length)%SnP_laneLengthInBytes); \
119
+ } \
120
+ else { \
121
+ unsigned int _sizeLeft = (length); \
122
+ unsigned int _lanePosition = (offset)/SnP_laneLengthInBytes; \
123
+ unsigned int _offsetInLane = (offset)%SnP_laneLengthInBytes; \
124
+ const unsigned char *_curInput = (input); \
125
+ unsigned char *_curOutput = (output); \
126
+ while(_sizeLeft > 0) { \
127
+ unsigned int _bytesInLane = SnP_laneLengthInBytes - _offsetInLane; \
128
+ if (_bytesInLane > _sizeLeft) \
129
+ _bytesInLane = _sizeLeft; \
130
+ SnP_ExtractAndAddBytesInLane(state, _lanePosition, _curInput, _curOutput, _offsetInLane, _bytesInLane); \
131
+ _sizeLeft -= _bytesInLane; \
132
+ _lanePosition++; \
133
+ _offsetInLane = 0; \
134
+ _curInput += _bytesInLane; \
135
+ _curOutput += _bytesInLane; \
136
+ } \
137
+ } \
138
+ }
139
+
140
+ #endif
@@ -0,0 +1 @@
1
+ #include "../../ext.c"
@@ -0,0 +1 @@
1
+ #include "../common/KangarooTwelve.c"
@@ -0,0 +1 @@
1
+ #include "../common/KeccakDuplexWidth1600.c"
@@ -0,0 +1,993 @@
1
+ # Copyright (c) 2006-2017, CRYPTOGAMS by <appro@openssl.org>
2
+ # Copyright (c) 2017 Ronny Van Keer
3
+ # All rights reserved.
4
+ #
5
+ # The source code in this file is licensed under the CRYPTOGAMS license.
6
+ # For further details see http://www.openssl.org/~appro/cryptogams/.
7
+ #
8
+ # Notes:
9
+ # The code for the permutation (__KeccakF1600) was generated with
10
+ # Andy Polyakov's keccak1600-avx2.pl from the CRYPTOGAMS project
11
+ # (https://github.com/dot-asm/cryptogams/blob/master/x86_64/keccak1600-avx2.pl).
12
+ # The rest of the code was written by Ronny Van Keer.
13
+
14
+ .text
15
+
16
+ # -----------------------------------------------------------------------------
17
+ #
18
+ # void KeccakP1600_Initialize(void *state);
19
+ #
20
+ .globl KeccakP1600_Initialize
21
+ .type KeccakP1600_Initialize,@function
22
+ .align 32
23
+ KeccakP1600_Initialize:
24
+ vpxor %ymm0,%ymm0,%ymm0
25
+ vmovdqa %ymm0,0*32(%rdi)
26
+ vmovdqa %ymm0,1*32(%rdi)
27
+ vmovdqa %ymm0,2*32(%rdi)
28
+ vmovdqa %ymm0,3*32(%rdi)
29
+ vmovdqa %ymm0,4*32(%rdi)
30
+ vmovdqa %ymm0,5*32(%rdi)
31
+ movq $0,6*32(%rdi)
32
+ ret
33
+ .size KeccakP1600_Initialize,.-KeccakP1600_Initialize
34
+
35
+ # -----------------------------------------------------------------------------
36
+ #
37
+ # void KeccakP1600_AddByte(void *state, unsigned char data, unsigned int offset);
38
+ # %rdi %rsi %rdx
39
+ #
40
+ .globl KeccakP1600_AddByte
41
+ .type KeccakP1600_AddByte,@function
42
+ .align 32
43
+ KeccakP1600_AddByte:
44
+ mov %rdx, %rax
45
+ and $7, %rax
46
+ and $0xFFFFFFF8, %edx
47
+ mov mapState(%rdx), %rdx
48
+ add %rdx, %rdi
49
+ add %rax, %rdi
50
+ xorb %sil, (%rdi)
51
+ ret
52
+ .size KeccakP1600_AddByte,.-KeccakP1600_AddByte
53
+
54
+ # -----------------------------------------------------------------------------
55
+ #
56
+ # void KeccakP1600_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length);
57
+ # %rdi %rsi %rdx %rcx
58
+ #
59
+ .globl KeccakP1600_AddBytes
60
+ .type KeccakP1600_AddBytes,@function
61
+ .align 32
62
+ KeccakP1600_AddBytes:
63
+ cmp $0, %rcx
64
+ jz KeccakP1600_AddBytes_Exit
65
+ mov %rdx, %rax # rax offset in lane
66
+ and $0xFFFFFFF8, %edx # rdx pointer into state index mapper
67
+ lea mapState(%rdx), %rdx
68
+ and $7, %rax
69
+ jz KeccakP1600_AddBytes_LaneAlignedCheck
70
+ mov $8, %r9 # r9 is (max) length of incomplete lane
71
+ sub %rax, %r9
72
+ cmp %rcx, %r9
73
+ cmovae %rcx, %r9
74
+ sub %r9, %rcx # length -= length of incomplete lane
75
+ add (%rdx), %rax # rax = pointer to state lane
76
+ add $8, %rdx
77
+ add %rdi, %rax
78
+ KeccakP1600_AddBytes_NotAlignedLoop:
79
+ mov (%rsi), %r8b
80
+ inc %rsi
81
+ xorb %r8b, (%rax)
82
+ inc %rax
83
+ dec %r9
84
+ jnz KeccakP1600_AddBytes_NotAlignedLoop
85
+ jmp KeccakP1600_AddBytes_LaneAlignedCheck
86
+ KeccakP1600_AddBytes_LaneAlignedLoop:
87
+ mov (%rsi), %r8
88
+ add $8, %rsi
89
+ mov (%rdx), %rax
90
+ add $8, %rdx
91
+ add %rdi, %rax
92
+ xor %r8, (%rax)
93
+ KeccakP1600_AddBytes_LaneAlignedCheck:
94
+ sub $8, %rcx
95
+ jnc KeccakP1600_AddBytes_LaneAlignedLoop
96
+ KeccakP1600_AddBytes_LastIncompleteLane:
97
+ add $8, %rcx
98
+ jz KeccakP1600_AddBytes_Exit
99
+ mov (%rdx), %rax
100
+ add %rdi, %rax
101
+ KeccakP1600_AddBytes_LastIncompleteLaneLoop:
102
+ mov (%rsi), %r8b
103
+ inc %rsi
104
+ xor %r8b, (%rax)
105
+ inc %rax
106
+ dec %rcx
107
+ jnz KeccakP1600_AddBytes_LastIncompleteLaneLoop
108
+ KeccakP1600_AddBytes_Exit:
109
+ ret
110
+ .size KeccakP1600_AddBytes,.-KeccakP1600_AddBytes
111
+
112
+ # -----------------------------------------------------------------------------
113
+ #
114
+ # void KeccakP1600_OverwriteBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length);
115
+ # %rdi %rsi %rdx %rcx
116
+ #
117
+ .globl KeccakP1600_OverwriteBytes
118
+ .type KeccakP1600_OverwriteBytes,@function
119
+ .align 32
120
+ KeccakP1600_OverwriteBytes:
121
+ cmp $0, %rcx
122
+ jz KeccakP1600_OverwriteBytes_Exit
123
+ mov %rdx, %rax # rax offset in lane
124
+ and $0xFFFFFFF8, %edx # rdx pointer into state index mapper
125
+ lea mapState(%rdx), %rdx
126
+ and $7, %rax
127
+ jz KeccakP1600_OverwriteBytes_LaneAlignedCheck
128
+ mov $8, %r9 # r9 is (max) length of incomplete lane
129
+ sub %rax, %r9
130
+ cmp %rcx, %r9
131
+ cmovae %rcx, %r9
132
+ sub %r9, %rcx # length -= length of incomplete lane
133
+ add (%rdx), %rax # rax = pointer to state lane
134
+ add $8, %rdx
135
+ add %rdi, %rax
136
+ KeccakP1600_OverwriteBytes_NotAlignedLoop:
137
+ mov (%rsi), %r8b
138
+ inc %rsi
139
+ mov %r8b, (%rax)
140
+ inc %rax
141
+ dec %r9
142
+ jnz KeccakP1600_OverwriteBytes_NotAlignedLoop
143
+ jmp KeccakP1600_OverwriteBytes_LaneAlignedCheck
144
+ KeccakP1600_OverwriteBytes_LaneAlignedLoop:
145
+ mov (%rsi), %r8
146
+ add $8, %rsi
147
+ mov (%rdx), %rax
148
+ add $8, %rdx
149
+ add %rdi, %rax
150
+ mov %r8, (%rax)
151
+ KeccakP1600_OverwriteBytes_LaneAlignedCheck:
152
+ sub $8, %rcx
153
+ jnc KeccakP1600_OverwriteBytes_LaneAlignedLoop
154
+ KeccakP1600_OverwriteBytes_LastIncompleteLane:
155
+ add $8, %rcx
156
+ jz KeccakP1600_OverwriteBytes_Exit
157
+ mov (%rdx), %rax
158
+ add %rdi, %rax
159
+ KeccakP1600_OverwriteBytes_LastIncompleteLaneLoop:
160
+ mov (%rsi), %r8b
161
+ inc %rsi
162
+ mov %r8b, (%rax)
163
+ inc %rax
164
+ dec %rcx
165
+ jnz KeccakP1600_OverwriteBytes_LastIncompleteLaneLoop
166
+ KeccakP1600_OverwriteBytes_Exit:
167
+ ret
168
+ .size KeccakP1600_OverwriteBytes,.-KeccakP1600_OverwriteBytes
169
+
170
+ # -----------------------------------------------------------------------------
171
+ #
172
+ # void KeccakP1600_OverwriteWithZeroes(void *state, unsigned int byteCount);
173
+ # %rdi %rsi
174
+ #
175
+ .globl KeccakP1600_OverwriteWithZeroes
176
+ .type KeccakP1600_OverwriteWithZeroes,@function
177
+ .align 32
178
+ KeccakP1600_OverwriteWithZeroes:
179
+ cmp $0, %rsi
180
+ jz KeccakP1600_OverwriteWithZeroes_Exit
181
+ lea mapState, %rdx # rdx pointer into state index mapper
182
+ jmp KeccakP1600_OverwriteWithZeroes_LaneAlignedCheck
183
+ KeccakP1600_OverwriteWithZeroes_LaneAlignedLoop:
184
+ mov (%rdx), %rax
185
+ add $8, %rdx
186
+ add %rdi, %rax
187
+ movq $0, (%rax)
188
+ KeccakP1600_OverwriteWithZeroes_LaneAlignedCheck:
189
+ sub $8, %rsi
190
+ jnc KeccakP1600_OverwriteWithZeroes_LaneAlignedLoop
191
+ KeccakP1600_OverwriteWithZeroes_LastIncompleteLane:
192
+ add $8, %rsi
193
+ jz KeccakP1600_OverwriteWithZeroes_Exit
194
+ mov (%rdx), %rax
195
+ add %rdi, %rax
196
+ KeccakP1600_OverwriteWithZeroes_LastIncompleteLaneLoop:
197
+ movb $0, (%rax)
198
+ inc %rax
199
+ dec %rsi
200
+ jnz KeccakP1600_OverwriteWithZeroes_LastIncompleteLaneLoop
201
+ KeccakP1600_OverwriteWithZeroes_Exit:
202
+ ret
203
+ .size KeccakP1600_OverwriteWithZeroes,.-KeccakP1600_OverwriteWithZeroes
204
+
205
+ # -----------------------------------------------------------------------------
206
+ #
207
+ # void KeccakP1600_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length);
208
+ # %rdi %rsi %rdx %rcx
209
+ #
210
+ .globl KeccakP1600_ExtractBytes
211
+ .type KeccakP1600_ExtractBytes,@function
212
+ .align 32
213
+ KeccakP1600_ExtractBytes:
214
+ push %rbx
215
+ cmp $0, %rcx
216
+ jz KeccakP1600_ExtractBytes_Exit
217
+ mov %rdx, %rax # rax offset in lane
218
+ and $0xFFFFFFF8, %edx # rdx pointer into state index mapper
219
+ lea mapState(%rdx), %rdx
220
+ and $7, %rax
221
+ jz KeccakP1600_ExtractBytes_LaneAlignedCheck
222
+ mov $8, %rbx # rbx is (max) length of incomplete lane
223
+ sub %rax, %rbx
224
+ cmp %rcx, %rbx
225
+ cmovae %rcx, %rbx
226
+ sub %rbx, %rcx # length -= length of incomplete lane
227
+ mov (%rdx), %r9
228
+ add $8, %rdx
229
+ add %rdi, %r9
230
+ add %rax, %r9
231
+ KeccakP1600_ExtractBytes_NotAlignedLoop:
232
+ mov (%r9), %r8b
233
+ inc %r9
234
+ mov %r8b, (%rsi)
235
+ inc %rsi
236
+ dec %rbx
237
+ jnz KeccakP1600_ExtractBytes_NotAlignedLoop
238
+ jmp KeccakP1600_ExtractBytes_LaneAlignedCheck
239
+ KeccakP1600_ExtractBytes_LaneAlignedLoop:
240
+ mov (%rdx), %rax
241
+ add $8, %rdx
242
+ add %rdi, %rax
243
+ mov (%rax), %r8
244
+ mov %r8, (%rsi)
245
+ add $8, %rsi
246
+ KeccakP1600_ExtractBytes_LaneAlignedCheck:
247
+ sub $8, %rcx
248
+ jnc KeccakP1600_ExtractBytes_LaneAlignedLoop
249
+ KeccakP1600_ExtractBytes_LastIncompleteLane:
250
+ add $8, %rcx
251
+ jz KeccakP1600_ExtractBytes_Exit
252
+ mov (%rdx), %rax
253
+ add %rdi, %rax
254
+ mov (%rax), %r8
255
+ KeccakP1600_ExtractBytes_LastIncompleteLaneLoop:
256
+ mov %r8b, (%rsi)
257
+ shr $8, %r8
258
+ inc %rsi
259
+ dec %rcx
260
+ jnz KeccakP1600_ExtractBytes_LastIncompleteLaneLoop
261
+ KeccakP1600_ExtractBytes_Exit:
262
+ pop %rbx
263
+ ret
264
+ .size KeccakP1600_ExtractBytes,.-KeccakP1600_ExtractBytes
265
+
266
+ # -----------------------------------------------------------------------------
267
+ #
268
+ # void KeccakP1600_ExtractAndAddBytes(const void *state, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length);
269
+ # %rdi %rsi %rdx %rcx %r8
270
+ #
271
+ .globl KeccakP1600_ExtractAndAddBytes
272
+ .type KeccakP1600_ExtractAndAddBytes,@function
273
+ .align 32
274
+ KeccakP1600_ExtractAndAddBytes:
275
+ push %rbx
276
+ push %r10
277
+ cmp $0, %r8
278
+ jz KeccakP1600_ExtractAndAddBytes_Exit
279
+ mov %rcx, %rax # rax offset in lane
280
+ and $0xFFFFFFF8, %ecx # rcx pointer into state index mapper
281
+ lea mapState(%rcx), %rcx
282
+ and $7, %rax
283
+ jz KeccakP1600_ExtractAndAddBytes_LaneAlignedCheck
284
+ mov $8, %rbx # rbx is (max) length of incomplete lane
285
+ sub %rax, %rbx
286
+ cmp %r8, %rbx
287
+ cmovae %r8, %rbx
288
+ sub %rbx, %r8 # length -= length of incomplete lane
289
+ mov (%rcx), %r9
290
+ add $8, %rcx
291
+ add %rdi, %r9
292
+ add %rax, %r9
293
+ KeccakP1600_ExtractAndAddBytes_NotAlignedLoop:
294
+ mov (%r9), %r10b
295
+ inc %r9
296
+ xor (%rsi), %r10b
297
+ inc %rsi
298
+ mov %r10b, (%rdx)
299
+ inc %rdx
300
+ dec %rbx
301
+ jnz KeccakP1600_ExtractAndAddBytes_NotAlignedLoop
302
+ jmp KeccakP1600_ExtractAndAddBytes_LaneAlignedCheck
303
+ KeccakP1600_ExtractAndAddBytes_LaneAlignedLoop:
304
+ mov (%rcx), %rax
305
+ add $8, %rcx
306
+ add %rdi, %rax
307
+ mov (%rax), %r10
308
+ xor (%rsi), %r10
309
+ add $8, %rsi
310
+ mov %r10, (%rdx)
311
+ add $8, %rdx
312
+ KeccakP1600_ExtractAndAddBytes_LaneAlignedCheck:
313
+ sub $8, %r8
314
+ jnc KeccakP1600_ExtractAndAddBytes_LaneAlignedLoop
315
+ KeccakP1600_ExtractAndAddBytes_LastIncompleteLane:
316
+ add $8, %r8
317
+ jz KeccakP1600_ExtractAndAddBytes_Exit
318
+ mov (%rcx), %rax
319
+ add %rdi, %rax
320
+ mov (%rax), %r10
321
+ KeccakP1600_ExtractAndAddBytes_LastIncompleteLaneLoop:
322
+ xor (%rsi), %r10b
323
+ inc %rsi
324
+ mov %r10b, (%rdx)
325
+ inc %rdx
326
+ shr $8, %r10
327
+ dec %r8
328
+ jnz KeccakP1600_ExtractAndAddBytes_LastIncompleteLaneLoop
329
+ KeccakP1600_ExtractAndAddBytes_Exit:
330
+ pop %r10
331
+ pop %rbx
332
+ ret
333
+ .size KeccakP1600_ExtractAndAddBytes,.-KeccakP1600_ExtractAndAddBytes
334
+
335
+ # -----------------------------------------------------------------------------
336
+ #
337
+ # internal
338
+ #
339
+ .type __KeccakF1600,@function
340
+ .align 32
341
+ __KeccakF1600:
342
+ .Loop_avx2:
343
+ ######################################### Theta
344
+ vpshufd $0b01001110,%ymm2,%ymm13
345
+ vpxor %ymm3,%ymm5,%ymm12
346
+ vpxor %ymm6,%ymm4,%ymm9
347
+ vpxor %ymm1,%ymm12,%ymm12
348
+ vpxor %ymm9,%ymm12,%ymm12 # C[1..4]
349
+
350
+ vpermq $0b10010011,%ymm12,%ymm11
351
+ vpxor %ymm2,%ymm13,%ymm13
352
+ vpermq $0b01001110,%ymm13,%ymm7
353
+
354
+ vpsrlq $63,%ymm12,%ymm8
355
+ vpaddq %ymm12,%ymm12,%ymm9
356
+ vpor %ymm9,%ymm8,%ymm8 # ROL64(C[1..4],1)
357
+
358
+ vpermq $0b00111001,%ymm8,%ymm15
359
+ vpxor %ymm11,%ymm8,%ymm14
360
+ vpermq $0b00000000,%ymm14,%ymm14 # D[0..0] = ROL64(C[1],1) ^ C[4]
361
+
362
+ vpxor %ymm0,%ymm13,%ymm13
363
+ vpxor %ymm7,%ymm13,%ymm13 # C[0..0]
364
+
365
+ vpsrlq $63,%ymm13,%ymm7
366
+ vpaddq %ymm13,%ymm13,%ymm8
367
+ vpor %ymm7,%ymm8,%ymm8 # ROL64(C[0..0],1)
368
+
369
+ vpxor %ymm14,%ymm2,%ymm2 # ^= D[0..0]
370
+ vpxor %ymm14,%ymm0,%ymm0 # ^= D[0..0]
371
+
372
+ vpblendd $0b11000000,%ymm8,%ymm15,%ymm15
373
+ vpblendd $0b00000011,%ymm13,%ymm11,%ymm11
374
+ vpxor %ymm11,%ymm15,%ymm15 # D[1..4] = ROL64(C[2..4,0),1) ^ C[0..3]
375
+
376
+ ######################################### Rho + Pi + pre-Chi shuffle
377
+ vpsllvq 0*32-96(%r8),%ymm2,%ymm10
378
+ vpsrlvq 0*32-96(%r9),%ymm2,%ymm2
379
+ vpor %ymm10,%ymm2,%ymm2
380
+
381
+ vpxor %ymm15,%ymm3,%ymm3 # ^= D[1..4] from Theta
382
+ vpsllvq 2*32-96(%r8),%ymm3,%ymm11
383
+ vpsrlvq 2*32-96(%r9),%ymm3,%ymm3
384
+ vpor %ymm11,%ymm3,%ymm3
385
+
386
+ vpxor %ymm15,%ymm4,%ymm4 # ^= D[1..4] from Theta
387
+ vpsllvq 3*32-96(%r8),%ymm4,%ymm12
388
+ vpsrlvq 3*32-96(%r9),%ymm4,%ymm4
389
+ vpor %ymm12,%ymm4,%ymm4
390
+
391
+ vpxor %ymm15,%ymm5,%ymm5 # ^= D[1..4] from Theta
392
+ vpsllvq 4*32-96(%r8),%ymm5,%ymm13
393
+ vpsrlvq 4*32-96(%r9),%ymm5,%ymm5
394
+ vpor %ymm13,%ymm5,%ymm5
395
+
396
+ vpxor %ymm15,%ymm6,%ymm6 # ^= D[1..4] from Theta
397
+ vpermq $0b10001101,%ymm2,%ymm10 # %ymm2 -> future %ymm3
398
+ vpermq $0b10001101,%ymm3,%ymm11 # %ymm3 -> future %ymm4
399
+ vpsllvq 5*32-96(%r8),%ymm6,%ymm14
400
+ vpsrlvq 5*32-96(%r9),%ymm6,%ymm8
401
+ vpor %ymm14,%ymm8,%ymm8 # %ymm6 -> future %ymm1
402
+
403
+ vpxor %ymm15,%ymm1,%ymm1 # ^= D[1..4] from Theta
404
+ vpermq $0b00011011,%ymm4,%ymm12 # %ymm4 -> future %ymm5
405
+ vpermq $0b01110010,%ymm5,%ymm13 # %ymm5 -> future %ymm6
406
+ vpsllvq 1*32-96(%r8),%ymm1,%ymm15
407
+ vpsrlvq 1*32-96(%r9),%ymm1,%ymm9
408
+ vpor %ymm15,%ymm9,%ymm9 # %ymm1 -> future %ymm2
409
+
410
+ ######################################### Chi
411
+ vpsrldq $8,%ymm8,%ymm14
412
+ vpandn %ymm14,%ymm8,%ymm7 # tgting [0][0] [0][0] [0][0] [0][0]
413
+
414
+ vpblendd $0b00001100,%ymm13,%ymm9,%ymm3 # [4][4] [2][0]
415
+ vpblendd $0b00001100,%ymm9,%ymm11,%ymm15 # [4][0] [2][1]
416
+ vpblendd $0b00001100,%ymm11,%ymm10,%ymm5 # [4][2] [2][4]
417
+ vpblendd $0b00001100,%ymm10,%ymm9,%ymm14 # [4][3] [2][0]
418
+ vpblendd $0b00110000,%ymm11,%ymm3,%ymm3 # [1][3] [4][4] [2][0]
419
+ vpblendd $0b00110000,%ymm12,%ymm15,%ymm15 # [1][4] [4][0] [2][1]
420
+ vpblendd $0b00110000,%ymm9,%ymm5,%ymm5 # [1][0] [4][2] [2][4]
421
+ vpblendd $0b00110000,%ymm13,%ymm14,%ymm14 # [1][1] [4][3] [2][0]
422
+ vpblendd $0b11000000,%ymm12,%ymm3,%ymm3 # [3][2] [1][3] [4][4] [2][0]
423
+ vpblendd $0b11000000,%ymm13,%ymm15,%ymm15 # [3][3] [1][4] [4][0] [2][1]
424
+ vpblendd $0b11000000,%ymm13,%ymm5,%ymm5 # [3][3] [1][0] [4][2] [2][4]
425
+ vpblendd $0b11000000,%ymm11,%ymm14,%ymm14 # [3][4] [1][1] [4][3] [2][0]
426
+ vpandn %ymm15,%ymm3,%ymm3 # tgting [3][1] [1][2] [4][3] [2][4]
427
+ vpandn %ymm14,%ymm5,%ymm5 # tgting [3][2] [1][4] [4][1] [2][3]
428
+
429
+ vpblendd $0b00001100,%ymm9,%ymm12,%ymm6 # [4][0] [2][3]
430
+ vpblendd $0b00001100,%ymm12,%ymm10,%ymm15 # [4][1] [2][4]
431
+ vpxor %ymm10,%ymm3,%ymm3
432
+ vpblendd $0b00110000,%ymm10,%ymm6,%ymm6 # [1][2] [4][0] [2][3]
433
+ vpblendd $0b00110000,%ymm11,%ymm15,%ymm15 # [1][3] [4][1] [2][4]
434
+ vpxor %ymm12,%ymm5,%ymm5
435
+ vpblendd $0b11000000,%ymm11,%ymm6,%ymm6 # [3][4] [1][2] [4][0] [2][3]
436
+ vpblendd $0b11000000,%ymm9,%ymm15,%ymm15 # [3][0] [1][3] [4][1] [2][4]
437
+ vpandn %ymm15,%ymm6,%ymm6 # tgting [3][3] [1][1] [4][4] [2][2]
438
+ vpxor %ymm13,%ymm6,%ymm6
439
+
440
+ vpermq $0b00011110,%ymm8,%ymm4 # [0][1] [0][2] [0][4] [0][3]
441
+ vpblendd $0b00110000,%ymm0,%ymm4,%ymm15 # [0][1] [0][0] [0][4] [0][3]
442
+ vpermq $0b00111001,%ymm8,%ymm1 # [0][1] [0][4] [0][3] [0][2]
443
+ vpblendd $0b11000000,%ymm0,%ymm1,%ymm1 # [0][0] [0][4] [0][3] [0][2]
444
+ vpandn %ymm15,%ymm1,%ymm1 # tgting [0][4] [0][3] [0][2] [0][1]
445
+
446
+ vpblendd $0b00001100,%ymm12,%ymm11,%ymm2 # [4][1] [2][1]
447
+ vpblendd $0b00001100,%ymm11,%ymm13,%ymm14 # [4][2] [2][2]
448
+ vpblendd $0b00110000,%ymm13,%ymm2,%ymm2 # [1][1] [4][1] [2][1]
449
+ vpblendd $0b00110000,%ymm10,%ymm14,%ymm14 # [1][2] [4][2] [2][2]
450
+ vpblendd $0b11000000,%ymm10,%ymm2,%ymm2 # [3][1] [1][1] [4][1] [2][1]
451
+ vpblendd $0b11000000,%ymm12,%ymm14,%ymm14 # [3][2] [1][2] [4][2] [2][2]
452
+ vpandn %ymm14,%ymm2,%ymm2 # tgting [3][0] [1][0] [4][0] [2][0]
453
+ vpxor %ymm9,%ymm2,%ymm2
454
+
455
+ vpermq $0b00000000,%ymm7,%ymm7 # [0][0] [0][0] [0][0] [0][0]
456
+ vpermq $0b00011011,%ymm3,%ymm3 # post-Chi shuffle
457
+ vpermq $0b10001101,%ymm5,%ymm5
458
+ vpermq $0b01110010,%ymm6,%ymm6
459
+
460
+ vpblendd $0b00001100,%ymm10,%ymm13,%ymm4 # [4][3] [2][2]
461
+ vpblendd $0b00001100,%ymm13,%ymm12,%ymm14 # [4][4] [2][3]
462
+ vpblendd $0b00110000,%ymm12,%ymm4,%ymm4 # [1][4] [4][3] [2][2]
463
+ vpblendd $0b00110000,%ymm9,%ymm14,%ymm14 # [1][0] [4][4] [2][3]
464
+ vpblendd $0b11000000,%ymm9,%ymm4,%ymm4 # [3][0] [1][4] [4][3] [2][2]
465
+ vpblendd $0b11000000,%ymm10,%ymm14,%ymm14 # [3][1] [1][0] [4][4] [2][3]
466
+ vpandn %ymm14,%ymm4,%ymm4 # tgting [3][4] [1][3] [4][2] [2][1]
467
+
468
+ vpxor %ymm7,%ymm0,%ymm0
469
+ vpxor %ymm8,%ymm1,%ymm1
470
+ vpxor %ymm11,%ymm4,%ymm4
471
+
472
+ ######################################### Iota
473
+ vpxor (%r10),%ymm0,%ymm0
474
+ lea 32(%r10),%r10
475
+
476
+ dec %eax
477
+ jnz .Loop_avx2
478
+ ret
479
+ .size __KeccakF1600,.-__KeccakF1600
480
+
481
+
482
+
483
+ .globl KeccakP1600_Permute_24rounds
484
+ .type KeccakP1600_Permute_24rounds,@function
485
+ .align 32
486
+ KeccakP1600_Permute_24rounds:
487
+ lea rhotates_left+96(%rip),%r8
488
+ lea rhotates_right+96(%rip),%r9
489
+ lea iotas(%rip),%r10
490
+ mov $24,%eax
491
+ lea 96(%rdi),%rdi
492
+ vzeroupper
493
+ vpbroadcastq -96(%rdi),%ymm0 # load A[5][5]
494
+ vmovdqu 8+32*0-96(%rdi),%ymm1
495
+ vmovdqu 8+32*1-96(%rdi),%ymm2
496
+ vmovdqu 8+32*2-96(%rdi),%ymm3
497
+ vmovdqu 8+32*3-96(%rdi),%ymm4
498
+ vmovdqu 8+32*4-96(%rdi),%ymm5
499
+ vmovdqu 8+32*5-96(%rdi),%ymm6
500
+ call __KeccakF1600
501
+ vmovq %xmm0,-96(%rdi)
502
+ vmovdqu %ymm1,8+32*0-96(%rdi)
503
+ vmovdqu %ymm2,8+32*1-96(%rdi)
504
+ vmovdqu %ymm3,8+32*2-96(%rdi)
505
+ vmovdqu %ymm4,8+32*3-96(%rdi)
506
+ vmovdqu %ymm5,8+32*4-96(%rdi)
507
+ vmovdqu %ymm6,8+32*5-96(%rdi)
508
+ vzeroupper
509
+ ret
510
+ .size KeccakP1600_Permute_24rounds,.-KeccakP1600_Permute_24rounds
511
+
512
+ .globl KeccakP1600_Permute_12rounds
513
+ .type KeccakP1600_Permute_12rounds,@function
514
+ .align 32
515
+ KeccakP1600_Permute_12rounds:
516
+ lea rhotates_left+96(%rip),%r8
517
+ lea rhotates_right+96(%rip),%r9
518
+ lea iotas+12*4*8(%rip),%r10
519
+ mov $12,%eax
520
+ lea 96(%rdi),%rdi
521
+ vzeroupper
522
+ vpbroadcastq -96(%rdi),%ymm0 # load A[5][5]
523
+ vmovdqu 8+32*0-96(%rdi),%ymm1
524
+ vmovdqu 8+32*1-96(%rdi),%ymm2
525
+ vmovdqu 8+32*2-96(%rdi),%ymm3
526
+ vmovdqu 8+32*3-96(%rdi),%ymm4
527
+ vmovdqu 8+32*4-96(%rdi),%ymm5
528
+ vmovdqu 8+32*5-96(%rdi),%ymm6
529
+ call __KeccakF1600
530
+ vmovq %xmm0,-96(%rdi)
531
+ vmovdqu %ymm1,8+32*0-96(%rdi)
532
+ vmovdqu %ymm2,8+32*1-96(%rdi)
533
+ vmovdqu %ymm3,8+32*2-96(%rdi)
534
+ vmovdqu %ymm4,8+32*3-96(%rdi)
535
+ vmovdqu %ymm5,8+32*4-96(%rdi)
536
+ vmovdqu %ymm6,8+32*5-96(%rdi)
537
+ vzeroupper
538
+ ret
539
+ .size KeccakP1600_Permute_12rounds,.-KeccakP1600_Permute_12rounds
540
+
541
+ .globl KeccakP1600_Permute_Nrounds
542
+ .type KeccakP1600_Permute_Nrounds,@function
543
+ .align 32
544
+ KeccakP1600_Permute_Nrounds:
545
+ lea rhotates_left+96(%rip),%r8
546
+ lea rhotates_right+96(%rip),%r9
547
+ lea iotas+24*4*8(%rip),%r10
548
+ mov %rsi,%rax
549
+ shl $2+3,%rsi
550
+ sub %rsi, %r10
551
+ lea 96(%rdi),%rdi
552
+ vzeroupper
553
+ vpbroadcastq -96(%rdi),%ymm0 # load A[5][5]
554
+ vmovdqu 8+32*0-96(%rdi),%ymm1
555
+ vmovdqu 8+32*1-96(%rdi),%ymm2
556
+ vmovdqu 8+32*2-96(%rdi),%ymm3
557
+ vmovdqu 8+32*3-96(%rdi),%ymm4
558
+ vmovdqu 8+32*4-96(%rdi),%ymm5
559
+ vmovdqu 8+32*5-96(%rdi),%ymm6
560
+ call __KeccakF1600
561
+ vmovq %xmm0,-96(%rdi)
562
+ vmovdqu %ymm1,8+32*0-96(%rdi)
563
+ vmovdqu %ymm2,8+32*1-96(%rdi)
564
+ vmovdqu %ymm3,8+32*2-96(%rdi)
565
+ vmovdqu %ymm4,8+32*3-96(%rdi)
566
+ vmovdqu %ymm5,8+32*4-96(%rdi)
567
+ vmovdqu %ymm6,8+32*5-96(%rdi)
568
+ vzeroupper
569
+ ret
570
+ .size KeccakP1600_Permute_Nrounds,.-KeccakP1600_Permute_Nrounds
571
+
572
+ # -----------------------------------------------------------------------------
573
+ #
574
+ # size_t KeccakF1600_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen);
575
+ # %rdi %rsi %rdx %rcx
576
+ #
+ # While at least laneCount whole 8-byte lanes of input remain, XORs one block
+ # of laneCount lanes into the state and permutes it (via __KeccakF1600 with
+ # %eax = 24 and %r10 = iotas, or via KeccakP1600_Permute_24rounds).
+ # Returns the number of input bytes consumed (final %rdx minus initial %rdx).
+ # laneCount 21 and 17 use AVX2 paths that keep the state in %ymm0-%ymm6 for the
+ # whole loop; any other laneCount takes the scalar path driven by mapState.
+ # laneCount is assumed to be in 1..25 (mapState has 25 entries) - not checked here.
+ #
577
+ .globl KeccakF1600_FastLoop_Absorb
578
+ .type KeccakF1600_FastLoop_Absorb,@function
579
+ .align 32
580
+ KeccakF1600_FastLoop_Absorb:
581
+ push %rbx
582
+ push %r10
+ mov %esi, %esi # laneCount is a 32-bit unsigned int: the upper half of %rsi is unspecified on entry (SysV x86-64 ABI), so zero-extend before the 64-bit uses below
583
+ shr $3, %rcx # rcx = data length in lanes
584
+ mov %rdx, %rbx # rbx = initial data pointer
585
+ cmp %rsi, %rcx
586
+ jb KeccakF1600_FastLoop_Absorb_Exit # less than one full block: return 0
587
+ vzeroupper
588
+ cmp $21, %rsi
589
+ jnz KeccakF1600_FastLoop_Absorb_Not21Lanes
590
+ sub $21, %rcx # account for the first block (rcx >= 21 was checked above)
591
+ lea rhotates_left+96(%rip),%r8
592
+ lea rhotates_right+96(%rip),%r9
593
+ lea 96(%rdi),%rdi # bias rdi by +96; state accesses below are -96(%rdi)-relative
594
+ vpbroadcastq -96(%rdi),%ymm0 # load A[5][5]
595
+ vmovdqu 8+32*0-96(%rdi),%ymm1
596
+ vmovdqu 8+32*1-96(%rdi),%ymm2
597
+ vmovdqu 8+32*2-96(%rdi),%ymm3
598
+ vmovdqu 8+32*3-96(%rdi),%ymm4
599
+ vmovdqu 8+32*4-96(%rdi),%ymm5
600
+ vmovdqu 8+32*5-96(%rdi),%ymm6
601
+ KeccakF1600_FastLoop_Absorb_Loop21Lanes:
602
+ vpbroadcastq (%rdx),%ymm7 # data lane 0, replicated like ymm0
603
+ vmovdqu 8(%rdx),%ymm8 # data lanes 1..4
604
+
605
+ vmovdqa map2(%rip), %xmm15
606
+ vpcmpeqq %ymm14, %ymm14, %ymm14 # all-ones mask: every map2 lane index is below 21
607
+ vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm9
608
+
609
+ vmovdqa mask3_21(%rip), %ymm14 # the gather clears its mask register, so it is reloaded each time
610
+ vpxor %ymm10, %ymm10, %ymm10 # masked-off elements must be zero so the XOR below leaves the state unchanged
611
+ vmovdqa map3(%rip), %xmm15
612
+ vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm10
613
+
614
+ vmovdqa mask4_21(%rip), %ymm14
615
+ vpxor %ymm11, %ymm11, %ymm11
616
+ vmovdqa map4(%rip), %xmm15
617
+ vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm11
618
+
619
+ vmovdqa mask5_21(%rip), %ymm14
620
+ vpxor %ymm12, %ymm12, %ymm12
621
+ vmovdqa map5(%rip), %xmm15
622
+ vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm12
623
+
624
+ vmovdqa mask6_21(%rip), %ymm14
625
+ vpxor %ymm13, %ymm13, %ymm13
626
+ vmovdqa map6(%rip), %xmm15
627
+ vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm13
628
+
629
+ vpxor %ymm7,%ymm0,%ymm0 # XOR the gathered block into the in-register state
630
+ vpxor %ymm8,%ymm1,%ymm1
631
+ vpxor %ymm9,%ymm2,%ymm2
632
+ vpxor %ymm10,%ymm3,%ymm3
633
+ vpxor %ymm11,%ymm4,%ymm4
634
+ vpxor %ymm12,%ymm5,%ymm5
635
+ vpxor %ymm13,%ymm6,%ymm6
636
+ add $21*8, %rdx # advance the data pointer past this block
637
+ lea iotas(%rip),%r10 # start at the first iotas row
638
+ mov $24,%eax
639
+ call __KeccakF1600
640
+ sub $21, %rcx
641
+ jnc KeccakF1600_FastLoop_Absorb_Loop21Lanes # no borrow: another full block is available
642
+ KeccakF1600_FastLoop_Absorb_SaveAndExit:
643
+ vmovq %xmm0,-96(%rdi) # write the in-register state back to memory
644
+ vmovdqu %ymm1,8+32*0-96(%rdi)
645
+ vmovdqu %ymm2,8+32*1-96(%rdi)
646
+ vmovdqu %ymm3,8+32*2-96(%rdi)
647
+ vmovdqu %ymm4,8+32*3-96(%rdi)
648
+ vmovdqu %ymm5,8+32*4-96(%rdi)
649
+ vmovdqu %ymm6,8+32*5-96(%rdi)
650
+ KeccakF1600_FastLoop_Absorb_Exit:
651
+ vzeroupper
652
+ mov %rdx, %rax # return number of bytes processed
653
+ sub %rbx, %rax
654
+ pop %r10
655
+ pop %rbx
656
+ ret
657
+ KeccakF1600_FastLoop_Absorb_Not21Lanes:
658
+ cmp $17, %rsi
659
+ jnz KeccakF1600_FastLoop_Absorb_Not17Lanes
660
+ sub $17, %rcx # account for the first block (rcx >= 17 was checked above)
661
+ lea rhotates_left+96(%rip),%r8
662
+ lea rhotates_right+96(%rip),%r9
663
+ lea 96(%rdi),%rdi
664
+ vpbroadcastq -96(%rdi),%ymm0 # load A[5][5]
665
+ vmovdqu 8+32*0-96(%rdi),%ymm1
666
+ vmovdqu 8+32*1-96(%rdi),%ymm2
667
+ vmovdqu 8+32*2-96(%rdi),%ymm3
668
+ vmovdqu 8+32*3-96(%rdi),%ymm4
669
+ vmovdqu 8+32*4-96(%rdi),%ymm5
670
+ vmovdqu 8+32*5-96(%rdi),%ymm6
671
+ KeccakF1600_FastLoop_Absorb_Loop17Lanes:
672
+ vpbroadcastq (%rdx),%ymm7 # data lane 0, replicated like ymm0
673
+ vmovdqu 8(%rdx),%ymm8 # data lanes 1..4
674
+
675
+ vmovdqa mask2_17(%rip), %ymm14 # with 17 lanes even the map2 gather needs a mask
676
+ vpxor %ymm9, %ymm9, %ymm9
677
+ vmovdqa map2(%rip), %xmm15
678
+ vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm9
679
+
680
+ vmovdqa mask3_17(%rip), %ymm14
681
+ vpxor %ymm10, %ymm10, %ymm10
682
+ vmovdqa map3(%rip), %xmm15
683
+ vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm10
684
+
685
+ vmovdqa mask4_17(%rip), %ymm14
686
+ vpxor %ymm11, %ymm11, %ymm11
687
+ vmovdqa map4(%rip), %xmm15
688
+ vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm11
689
+
690
+ vmovdqa mask5_17(%rip), %ymm14
691
+ vpxor %ymm12, %ymm12, %ymm12
692
+ vmovdqa map5(%rip), %xmm15
693
+ vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm12
694
+
695
+ vmovdqa mask6_17(%rip), %ymm14
696
+ vpxor %ymm13, %ymm13, %ymm13
697
+ vmovdqa map6(%rip), %xmm15
698
+ vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm13
699
+
700
+ vpxor %ymm7,%ymm0,%ymm0
701
+ vpxor %ymm8,%ymm1,%ymm1
702
+ vpxor %ymm9,%ymm2,%ymm2
703
+ vpxor %ymm10,%ymm3,%ymm3
704
+ vpxor %ymm11,%ymm4,%ymm4
705
+ vpxor %ymm12,%ymm5,%ymm5
706
+ vpxor %ymm13,%ymm6,%ymm6
707
+ add $17*8, %rdx
708
+ lea iotas(%rip),%r10
709
+ mov $24,%eax
710
+ call __KeccakF1600
711
+ sub $17, %rcx
712
+ jnc KeccakF1600_FastLoop_Absorb_Loop17Lanes
713
+ jmp KeccakF1600_FastLoop_Absorb_SaveAndExit
714
+ KeccakF1600_FastLoop_Absorb_Not17Lanes: # scalar path; also the head of its per-block loop
715
+ lea mapState(%rip), %r9 # r9 = table of per-lane byte offsets into the state
716
+ mov %rsi, %rax # rax = lanes left to XOR in this block
717
+ KeccakF1600_FastLoop_Absorb_LanesAddLoop:
718
+ mov (%rdx), %r8 # r8 = next data lane
719
+ add $8, %rdx
720
+ mov (%r9), %r10 # r10 = state byte offset for this lane
721
+ add $8, %r9
722
+ add %rdi, %r10
723
+ xor %r8, (%r10) # XOR the data lane into the state in memory
724
+ sub $1, %rax
725
+ jnz KeccakF1600_FastLoop_Absorb_LanesAddLoop
726
+ sub %rsi, %rcx # rcx = lanes remaining after this block
727
+ push %rdi # preserve the live registers across the call
728
+ push %rsi
729
+ push %rdx
730
+ push %rcx
731
+ call KeccakP1600_Permute_24rounds
732
+ pop %rcx
733
+ pop %rdx
734
+ pop %rsi
735
+ pop %rdi
736
+ cmp %rsi, %rcx
737
+ jae KeccakF1600_FastLoop_Absorb_Not17Lanes # another full block is available
738
+ jmp KeccakF1600_FastLoop_Absorb_Exit
739
+ .size KeccakF1600_FastLoop_Absorb,.-KeccakF1600_FastLoop_Absorb
740
+
741
+ # -----------------------------------------------------------------------------
742
+ #
743
+ # size_t KeccakP1600_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen);
744
+ # %rdi %rsi %rdx %rcx
745
+ #
+ # While at least laneCount whole 8-byte lanes of input remain, XORs one block
+ # of laneCount lanes into the state and permutes it (via __KeccakF1600 with
+ # %eax = 12 and %r10 = iotas+12*4*8, or via KeccakP1600_Permute_12rounds).
+ # Returns the number of input bytes consumed (final %rdx minus initial %rdx).
+ # laneCount 21 and 17 use AVX2 paths that keep the state in %ymm0-%ymm6 for the
+ # whole loop; any other laneCount takes the scalar path driven by mapState.
+ # laneCount is assumed to be in 1..25 (mapState has 25 entries) - not checked here.
+ #
746
+ .globl KeccakP1600_12rounds_FastLoop_Absorb
747
+ .type KeccakP1600_12rounds_FastLoop_Absorb,@function
748
+ .align 32
749
+ KeccakP1600_12rounds_FastLoop_Absorb:
750
+ push %rbx
751
+ push %r10
+ mov %esi, %esi # laneCount is a 32-bit unsigned int: the upper half of %rsi is unspecified on entry (SysV x86-64 ABI), so zero-extend before the 64-bit uses below
752
+ shr $3, %rcx # rcx = data length in lanes
753
+ mov %rdx, %rbx # rbx = initial data pointer
754
+ cmp %rsi, %rcx
755
+ jb KeccakP1600_12rounds_FastLoop_Absorb_Exit # less than one full block: return 0
756
+ vzeroupper
757
+ cmp $21, %rsi
758
+ jnz KeccakP1600_12rounds_FastLoop_Absorb_Not21Lanes
759
+ sub $21, %rcx # account for the first block (rcx >= 21 was checked above)
760
+ lea rhotates_left+96(%rip),%r8
761
+ lea rhotates_right+96(%rip),%r9
762
+ lea 96(%rdi),%rdi # bias rdi by +96; state accesses below are -96(%rdi)-relative
763
+ vpbroadcastq -96(%rdi),%ymm0 # load A[5][5]
764
+ vmovdqu 8+32*0-96(%rdi),%ymm1
765
+ vmovdqu 8+32*1-96(%rdi),%ymm2
766
+ vmovdqu 8+32*2-96(%rdi),%ymm3
767
+ vmovdqu 8+32*3-96(%rdi),%ymm4
768
+ vmovdqu 8+32*4-96(%rdi),%ymm5
769
+ vmovdqu 8+32*5-96(%rdi),%ymm6
770
+ KeccakP1600_12rounds_FastLoop_Absorb_Loop21Lanes:
771
+ vpbroadcastq (%rdx),%ymm7 # data lane 0, replicated like ymm0
772
+ vmovdqu 8(%rdx),%ymm8 # data lanes 1..4
773
+
774
+ vmovdqa map2(%rip), %xmm15
775
+ vpcmpeqq %ymm14, %ymm14, %ymm14 # all-ones mask: every map2 lane index is below 21
776
+ vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm9
777
+
778
+ vmovdqa mask3_21(%rip), %ymm14 # the gather clears its mask register, so it is reloaded each time
779
+ vpxor %ymm10, %ymm10, %ymm10 # masked-off elements must be zero so the XOR below leaves the state unchanged
780
+ vmovdqa map3(%rip), %xmm15
781
+ vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm10
782
+
783
+ vmovdqa mask4_21(%rip), %ymm14
784
+ vpxor %ymm11, %ymm11, %ymm11
785
+ vmovdqa map4(%rip), %xmm15
786
+ vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm11
787
+
788
+ vmovdqa mask5_21(%rip), %ymm14
789
+ vpxor %ymm12, %ymm12, %ymm12
790
+ vmovdqa map5(%rip), %xmm15
791
+ vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm12
792
+
793
+ vmovdqa mask6_21(%rip), %ymm14
794
+ vpxor %ymm13, %ymm13, %ymm13
795
+ vmovdqa map6(%rip), %xmm15
796
+ vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm13
797
+
798
+ vpxor %ymm7,%ymm0,%ymm0 # XOR the gathered block into the in-register state
799
+ vpxor %ymm8,%ymm1,%ymm1
800
+ vpxor %ymm9,%ymm2,%ymm2
801
+ vpxor %ymm10,%ymm3,%ymm3
802
+ vpxor %ymm11,%ymm4,%ymm4
803
+ vpxor %ymm12,%ymm5,%ymm5
804
+ vpxor %ymm13,%ymm6,%ymm6
805
+ add $21*8, %rdx # advance the data pointer past this block
806
+ lea iotas+12*4*8(%rip),%r10 # skip the first 12 iotas rows: only the last 12 are used
807
+ mov $12,%eax
808
+ call __KeccakF1600
809
+ sub $21, %rcx
810
+ jnc KeccakP1600_12rounds_FastLoop_Absorb_Loop21Lanes # no borrow: another full block is available
811
+ KeccakP1600_12rounds_FastLoop_Absorb_SaveAndExit:
812
+ vmovq %xmm0,-96(%rdi) # write the in-register state back to memory
813
+ vmovdqu %ymm1,8+32*0-96(%rdi)
814
+ vmovdqu %ymm2,8+32*1-96(%rdi)
815
+ vmovdqu %ymm3,8+32*2-96(%rdi)
816
+ vmovdqu %ymm4,8+32*3-96(%rdi)
817
+ vmovdqu %ymm5,8+32*4-96(%rdi)
818
+ vmovdqu %ymm6,8+32*5-96(%rdi)
819
+ KeccakP1600_12rounds_FastLoop_Absorb_Exit:
820
+ vzeroupper
821
+ mov %rdx, %rax # return number of bytes processed
822
+ sub %rbx, %rax
823
+ pop %r10
824
+ pop %rbx
825
+ ret
826
+ KeccakP1600_12rounds_FastLoop_Absorb_Not21Lanes:
827
+ cmp $17, %rsi
828
+ jnz KeccakP1600_12rounds_FastLoop_Absorb_Not17Lanes
829
+ sub $17, %rcx # account for the first block (rcx >= 17 was checked above)
830
+ lea rhotates_left+96(%rip),%r8
831
+ lea rhotates_right+96(%rip),%r9
832
+ lea 96(%rdi),%rdi
833
+ vpbroadcastq -96(%rdi),%ymm0 # load A[5][5]
834
+ vmovdqu 8+32*0-96(%rdi),%ymm1
835
+ vmovdqu 8+32*1-96(%rdi),%ymm2
836
+ vmovdqu 8+32*2-96(%rdi),%ymm3
837
+ vmovdqu 8+32*3-96(%rdi),%ymm4
838
+ vmovdqu 8+32*4-96(%rdi),%ymm5
839
+ vmovdqu 8+32*5-96(%rdi),%ymm6
840
+ KeccakP1600_12rounds_FastLoop_Absorb_Loop17Lanes:
841
+ vpbroadcastq (%rdx),%ymm7 # data lane 0, replicated like ymm0
842
+ vmovdqu 8(%rdx),%ymm8 # data lanes 1..4
843
+
844
+ vmovdqa mask2_17(%rip), %ymm14 # with 17 lanes even the map2 gather needs a mask
845
+ vpxor %ymm9, %ymm9, %ymm9
846
+ vmovdqa map2(%rip), %xmm15
847
+ vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm9
848
+
849
+ vmovdqa mask3_17(%rip), %ymm14
850
+ vpxor %ymm10, %ymm10, %ymm10
851
+ vmovdqa map3(%rip), %xmm15
852
+ vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm10
853
+
854
+ vmovdqa mask4_17(%rip), %ymm14
855
+ vpxor %ymm11, %ymm11, %ymm11
856
+ vmovdqa map4(%rip), %xmm15
857
+ vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm11
858
+
859
+ vmovdqa mask5_17(%rip), %ymm14
860
+ vpxor %ymm12, %ymm12, %ymm12
861
+ vmovdqa map5(%rip), %xmm15
862
+ vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm12
863
+
864
+ vmovdqa mask6_17(%rip), %ymm14
865
+ vpxor %ymm13, %ymm13, %ymm13
866
+ vmovdqa map6(%rip), %xmm15
867
+ vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm13
868
+
869
+ vpxor %ymm7,%ymm0,%ymm0
870
+ vpxor %ymm8,%ymm1,%ymm1
871
+ vpxor %ymm9,%ymm2,%ymm2
872
+ vpxor %ymm10,%ymm3,%ymm3
873
+ vpxor %ymm11,%ymm4,%ymm4
874
+ vpxor %ymm12,%ymm5,%ymm5
875
+ vpxor %ymm13,%ymm6,%ymm6
876
+ add $17*8, %rdx
877
+ lea iotas+12*4*8(%rip),%r10
878
+ mov $12,%eax
879
+ call __KeccakF1600
880
+ sub $17, %rcx
881
+ jnc KeccakP1600_12rounds_FastLoop_Absorb_Loop17Lanes
882
+ jmp KeccakP1600_12rounds_FastLoop_Absorb_SaveAndExit
883
+ KeccakP1600_12rounds_FastLoop_Absorb_Not17Lanes: # scalar path; also the head of its per-block loop
884
+ lea mapState(%rip), %r9 # r9 = table of per-lane byte offsets into the state
885
+ mov %rsi, %rax # rax = lanes left to XOR in this block
886
+ KeccakP1600_12rounds_FastLoop_Absorb_LanesAddLoop:
887
+ mov (%rdx), %r8 # r8 = next data lane
888
+ add $8, %rdx
889
+ mov (%r9), %r10 # r10 = state byte offset for this lane
890
+ add $8, %r9
891
+ add %rdi, %r10
892
+ xor %r8, (%r10) # XOR the data lane into the state in memory
893
+ sub $1, %rax
894
+ jnz KeccakP1600_12rounds_FastLoop_Absorb_LanesAddLoop
895
+ sub %rsi, %rcx # rcx = lanes remaining after this block
896
+ push %rdi # preserve the live registers across the call
897
+ push %rsi
898
+ push %rdx
899
+ push %rcx
900
+ call KeccakP1600_Permute_12rounds
901
+ pop %rcx
902
+ pop %rdx
903
+ pop %rsi
904
+ pop %rdi
905
+ cmp %rsi, %rcx
906
+ jae KeccakP1600_12rounds_FastLoop_Absorb_Not17Lanes # another full block is available
907
+ jmp KeccakP1600_12rounds_FastLoop_Absorb_Exit
908
+ .size KeccakP1600_12rounds_FastLoop_Absorb,.-KeccakP1600_12rounds_FastLoop_Absorb
909
+
910
+ .equ ALLON, 0xFFFFFFFFFFFFFFFF # all-ones quad: marks an enabled element in the mask*_21 / mask*_17 gather masks
911
+
912
+ .align 64
913
+ rhotates_left: # six rows of four per-lane left-rotation counts; code addresses this table as rhotates_left+96
914
+ .quad 3, 18, 36, 41 # [2][0] [4][0] [1][0] [3][0]
915
+ .quad 1, 62, 28, 27 # [0][1] [0][2] [0][3] [0][4]
916
+ .quad 45, 6, 56, 39 # [3][1] [1][2] [4][3] [2][4]
917
+ .quad 10, 61, 55, 8 # [2][1] [4][2] [1][3] [3][4]
918
+ .quad 2, 15, 25, 20 # [4][1] [3][2] [2][3] [1][4]
919
+ .quad 44, 43, 21, 14 # [1][1] [2][2] [3][3] [4][4]
920
+ rhotates_right: # same layout as rhotates_left; every entry is 64 minus the matching left count
921
+ .quad 64-3, 64-18, 64-36, 64-41
922
+ .quad 64-1, 64-62, 64-28, 64-27
923
+ .quad 64-45, 64-6, 64-56, 64-39
924
+ .quad 64-10, 64-61, 64-55, 64-8
925
+ .quad 64-2, 64-15, 64-25, 64-20
926
+ .quad 64-44, 64-43, 64-21, 64-14
927
+ iotas: # the 24 Keccak-f[1600] round constants, one 32-byte row per round, each constant repeated in all 4 quads; reduced-round callers start at a later row (e.g. iotas+12*4*8 for 12 rounds)
928
+ .quad 0x0000000000000001, 0x0000000000000001, 0x0000000000000001, 0x0000000000000001
929
+ .quad 0x0000000000008082, 0x0000000000008082, 0x0000000000008082, 0x0000000000008082
930
+ .quad 0x800000000000808a, 0x800000000000808a, 0x800000000000808a, 0x800000000000808a
931
+ .quad 0x8000000080008000, 0x8000000080008000, 0x8000000080008000, 0x8000000080008000
932
+ .quad 0x000000000000808b, 0x000000000000808b, 0x000000000000808b, 0x000000000000808b
933
+ .quad 0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
934
+ .quad 0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
935
+ .quad 0x8000000000008009, 0x8000000000008009, 0x8000000000008009, 0x8000000000008009
936
+ .quad 0x000000000000008a, 0x000000000000008a, 0x000000000000008a, 0x000000000000008a
937
+ .quad 0x0000000000000088, 0x0000000000000088, 0x0000000000000088, 0x0000000000000088
938
+ .quad 0x0000000080008009, 0x0000000080008009, 0x0000000080008009, 0x0000000080008009
939
+ .quad 0x000000008000000a, 0x000000008000000a, 0x000000008000000a, 0x000000008000000a
940
+ .quad 0x000000008000808b, 0x000000008000808b, 0x000000008000808b, 0x000000008000808b
941
+ .quad 0x800000000000008b, 0x800000000000008b, 0x800000000000008b, 0x800000000000008b
942
+ .quad 0x8000000000008089, 0x8000000000008089, 0x8000000000008089, 0x8000000000008089
943
+ .quad 0x8000000000008003, 0x8000000000008003, 0x8000000000008003, 0x8000000000008003
944
+ .quad 0x8000000000008002, 0x8000000000008002, 0x8000000000008002, 0x8000000000008002
945
+ .quad 0x8000000000000080, 0x8000000000000080, 0x8000000000000080, 0x8000000000000080
946
+ .quad 0x000000000000800a, 0x000000000000800a, 0x000000000000800a, 0x000000000000800a
947
+ .quad 0x800000008000000a, 0x800000008000000a, 0x800000008000000a, 0x800000008000000a
948
+ .quad 0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
949
+ .quad 0x8000000000008080, 0x8000000000008080, 0x8000000000008080, 0x8000000000008080
950
+ .quad 0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
951
+ .quad 0x8000000080008008, 0x8000000080008008, 0x8000000080008008, 0x8000000080008008
952
+
953
+ mapState: # 25 quads: entry i is the byte offset within the state at which the scalar absorb path XORs input lane i
954
+ .quad 0*8, 1*8, 2*8, 3*8, 4*8 # input lanes 0..4 keep their position
955
+ .quad 7*8, 21*8, 10*8, 15*8, 20*8 # input lanes 5..9
956
+ .quad 5*8, 13*8, 22*8, 19*8, 12*8 # input lanes 10..14
957
+ .quad 8*8, 9*8, 18*8, 23*8, 16*8 # input lanes 15..19
958
+ .quad 6*8, 17*8, 14*8, 11*8, 24*8 # input lanes 20..24
959
+
960
+ .align 16
961
+ map2: # map2..map6: four dword byte offsets into the input block, used as vpgatherdq indices; map2 feeds the vector XORed into %ymm2
962
+ .long 10*8, 20*8, 5*8, 15*8
963
+ map3: # gather indices for the vector XORed into %ymm3
964
+ .long 16*8, 7*8, 23*8, 14*8
965
+ map4: # gather indices for the vector XORed into %ymm4
966
+ .long 11*8, 22*8, 8*8, 19*8
967
+ map5: # gather indices for the vector XORed into %ymm5
968
+ .long 21*8, 17*8, 13*8, 9*8
969
+ map6: # gather indices for the vector XORed into %ymm6
970
+ .long 6*8, 12*8, 18*8, 24*8
971
+
972
+ .align 32
973
+ mask3_21: # gather masks for 21-lane blocks: a 0 element disables the load of an input lane with index >= 21 (map2 needs no mask here)
974
+ .quad ALLON, ALLON, 0, ALLON # map3 element 2 is lane 23
975
+ mask4_21:
976
+ .quad ALLON, 0, ALLON, ALLON # map4 element 1 is lane 22
977
+ mask5_21:
978
+ .quad 0, ALLON, ALLON, ALLON # map5 element 0 is lane 21
979
+ mask6_21:
980
+ .quad ALLON, ALLON, ALLON, 0 # map6 element 3 is lane 24
981
+
982
+ mask2_17: # gather masks for 17-lane blocks: a 0 element disables the load of an input lane with index >= 17
983
+ .quad ALLON, 0, ALLON, ALLON # map2 element 1 is lane 20
984
+ mask3_17:
985
+ .quad ALLON, ALLON, 0, ALLON # map3 element 2 is lane 23
986
+ mask4_17:
987
+ .quad ALLON, 0, ALLON, 0 # map4 elements 1 and 3 are lanes 22 and 19
988
+ mask5_17:
989
+ .quad 0, 0, ALLON, ALLON # map5 elements 0 and 1 are lanes 21 and 17
990
+ mask6_17:
991
+ .quad ALLON, ALLON, 0, 0 # map6 elements 2 and 3 are lanes 18 and 24
992
+
993
+ .asciz "Keccak-1600 for AVX2, CRYPTOGAMS by <appro@openssl.org>"