sleeping_kangaroo12 0.0.1

Files changed (284)
  1. checksums.yaml +7 -0
  2. data/README.md +127 -0
  3. data/ext/Rakefile +73 -0
  4. data/ext/binding/sleeping_kangaroo12.c +39 -0
  5. data/ext/config/xkcp.build +17 -0
  6. data/ext/xkcp/LICENSE +1 -0
  7. data/ext/xkcp/Makefile +15 -0
  8. data/ext/xkcp/Makefile.build +200 -0
  9. data/ext/xkcp/README.markdown +296 -0
  10. data/ext/xkcp/lib/HighLevel.build +143 -0
  11. data/ext/xkcp/lib/LowLevel.build +757 -0
  12. data/ext/xkcp/lib/common/align.h +33 -0
  13. data/ext/xkcp/lib/common/brg_endian.h +143 -0
  14. data/ext/xkcp/lib/high/KangarooTwelve/KangarooTwelve.c +301 -0
  15. data/ext/xkcp/lib/high/KangarooTwelve/KangarooTwelve.h +97 -0
  16. data/ext/xkcp/lib/high/Keccak/FIPS202/KeccakHash.c +81 -0
  17. data/ext/xkcp/lib/high/Keccak/FIPS202/KeccakHash.h +125 -0
  18. data/ext/xkcp/lib/high/Keccak/FIPS202/SimpleFIPS202.c +48 -0
  19. data/ext/xkcp/lib/high/Keccak/FIPS202/SimpleFIPS202.h +79 -0
  20. data/ext/xkcp/lib/high/Keccak/KeccakDuplex.c +81 -0
  21. data/ext/xkcp/lib/high/Keccak/KeccakDuplex.h +73 -0
  22. data/ext/xkcp/lib/high/Keccak/KeccakDuplex.inc +195 -0
  23. data/ext/xkcp/lib/high/Keccak/KeccakSponge.c +111 -0
  24. data/ext/xkcp/lib/high/Keccak/KeccakSponge.h +76 -0
  25. data/ext/xkcp/lib/high/Keccak/KeccakSponge.inc +314 -0
  26. data/ext/xkcp/lib/high/Keccak/PRG/KeccakPRG.c +61 -0
  27. data/ext/xkcp/lib/high/Keccak/PRG/KeccakPRG.h +67 -0
  28. data/ext/xkcp/lib/high/Keccak/PRG/KeccakPRG.inc +128 -0
  29. data/ext/xkcp/lib/high/Keccak/SP800-185/SP800-185.c +93 -0
  30. data/ext/xkcp/lib/high/Keccak/SP800-185/SP800-185.h +599 -0
  31. data/ext/xkcp/lib/high/Keccak/SP800-185/SP800-185.inc +573 -0
  32. data/ext/xkcp/lib/high/Ketje/Ketjev2.c +87 -0
  33. data/ext/xkcp/lib/high/Ketje/Ketjev2.h +88 -0
  34. data/ext/xkcp/lib/high/Ketje/Ketjev2.inc +274 -0
  35. data/ext/xkcp/lib/high/Keyak/Keyakv2.c +132 -0
  36. data/ext/xkcp/lib/high/Keyak/Keyakv2.h +217 -0
  37. data/ext/xkcp/lib/high/Keyak/Keyakv2.inc +81 -0
  38. data/ext/xkcp/lib/high/Keyak/Motorist.inc +953 -0
  39. data/ext/xkcp/lib/high/Kravatte/Kravatte.c +533 -0
  40. data/ext/xkcp/lib/high/Kravatte/Kravatte.h +115 -0
  41. data/ext/xkcp/lib/high/Kravatte/KravatteModes.c +557 -0
  42. data/ext/xkcp/lib/high/Kravatte/KravatteModes.h +247 -0
  43. data/ext/xkcp/lib/high/Xoodyak/Cyclist.h +66 -0
  44. data/ext/xkcp/lib/high/Xoodyak/Cyclist.inc +336 -0
  45. data/ext/xkcp/lib/high/Xoodyak/Xoodyak-parameters.h +26 -0
  46. data/ext/xkcp/lib/high/Xoodyak/Xoodyak.c +55 -0
  47. data/ext/xkcp/lib/high/Xoodyak/Xoodyak.h +35 -0
  48. data/ext/xkcp/lib/high/Xoofff/Xoofff.c +634 -0
  49. data/ext/xkcp/lib/high/Xoofff/Xoofff.h +147 -0
  50. data/ext/xkcp/lib/high/Xoofff/XoofffModes.c +483 -0
  51. data/ext/xkcp/lib/high/Xoofff/XoofffModes.h +241 -0
  52. data/ext/xkcp/lib/high/common/Phases.h +25 -0
  53. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-SnP.h +41 -0
  54. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-inplace-32bi-armv6m-le-armcc.s +1666 -0
  55. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-inplace-32bi-armv6m-le-gcc.s +1655 -0
  56. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-inplace-32bi-armv7a-le-armcc.s +1268 -0
  57. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-inplace-32bi-armv7a-le-gcc.s +1264 -0
  58. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-inplace-32bi-armv7m-le-armcc.s +1178 -0
  59. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-inplace-32bi-armv7m-le-gcc.s +1175 -0
  60. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-u1-32bi-armv6m-le-armcc.s +1338 -0
  61. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-u1-32bi-armv6m-le-gcc.s +1336 -0
  62. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-u2-32bi-armv6m-le-armcc.s +1343 -0
  63. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-u2-32bi-armv6m-le-gcc.s +1339 -0
  64. data/ext/xkcp/lib/low/KeccakP-1600/ARMv7A-NEON/KeccakP-1600-SnP.h +42 -0
  65. data/ext/xkcp/lib/low/KeccakP-1600/ARMv7A-NEON/KeccakP-1600-armv7a-le-neon-armcc.s +823 -0
  66. data/ext/xkcp/lib/low/KeccakP-1600/ARMv7A-NEON/KeccakP-1600-armv7a-le-neon-gcc.s +831 -0
  67. data/ext/xkcp/lib/low/KeccakP-1600/ARMv8A/KeccakP-1600-SnP.h +31 -0
  68. data/ext/xkcp/lib/low/KeccakP-1600/ARMv8A/KeccakP-1600-armv8a-neon.s +540 -0
  69. data/ext/xkcp/lib/low/KeccakP-1600/AVR8/KeccakP-1600-SnP.h +42 -0
  70. data/ext/xkcp/lib/low/KeccakP-1600/AVR8/KeccakP-1600-avr8-compact.s +733 -0
  71. data/ext/xkcp/lib/low/KeccakP-1600/AVR8/KeccakP-1600-avr8-fast.s +1121 -0
  72. data/ext/xkcp/lib/low/KeccakP-1600/AVX2/KeccakP-1600-AVX2.s +1100 -0
  73. data/ext/xkcp/lib/low/KeccakP-1600/AVX2/KeccakP-1600-SnP.h +52 -0
  74. data/ext/xkcp/lib/low/KeccakP-1600/AVX512/C/KeccakP-1600-AVX512.c +623 -0
  75. data/ext/xkcp/lib/low/KeccakP-1600/AVX512/C/KeccakP-1600-SnP.h +47 -0
  76. data/ext/xkcp/lib/low/KeccakP-1600/AVX512/C/u12/KeccakP-1600-AVX512-config.h +6 -0
  77. data/ext/xkcp/lib/low/KeccakP-1600/AVX512/C/u6/KeccakP-1600-AVX512-config.h +6 -0
  78. data/ext/xkcp/lib/low/KeccakP-1600/AVX512/C/ua/KeccakP-1600-AVX512-config.h +6 -0
  79. data/ext/xkcp/lib/low/KeccakP-1600/AVX512/KeccakP-1600-AVX512.s +1031 -0
  80. data/ext/xkcp/lib/low/KeccakP-1600/AVX512/KeccakP-1600-SnP.h +53 -0
  81. data/ext/xkcp/lib/low/KeccakP-1600/XOP/KeccakP-1600-SnP.h +44 -0
  82. data/ext/xkcp/lib/low/KeccakP-1600/XOP/KeccakP-1600-XOP.c +476 -0
  83. data/ext/xkcp/lib/low/KeccakP-1600/XOP/u6/KeccakP-1600-XOP-config.h +6 -0
  84. data/ext/xkcp/lib/low/KeccakP-1600/XOP/ua/KeccakP-1600-XOP-config.h +6 -0
  85. data/ext/xkcp/lib/low/KeccakP-1600/common/KeccakP-1600-64.macros +748 -0
  86. data/ext/xkcp/lib/low/KeccakP-1600/common/KeccakP-1600-unrolling.macros +305 -0
  87. data/ext/xkcp/lib/low/KeccakP-1600/compact/KeccakP-1600-SnP.h +40 -0
  88. data/ext/xkcp/lib/low/KeccakP-1600/compact/KeccakP-1600-compact64.c +420 -0
  89. data/ext/xkcp/lib/low/KeccakP-1600/plain-32bits-inplace/KeccakP-1600-SnP.h +43 -0
  90. data/ext/xkcp/lib/low/KeccakP-1600/plain-32bits-inplace/KeccakP-1600-inplace32BI.c +1163 -0
  91. data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/KeccakP-1600-SnP.h +54 -0
  92. data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/KeccakP-1600-opt64.c +565 -0
  93. data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/lcu6/KeccakP-1600-opt64-config.h +7 -0
  94. data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/lcua/KeccakP-1600-opt64-config.h +7 -0
  95. data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/lcua-shld/KeccakP-1600-opt64-config.h +8 -0
  96. data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/u6/KeccakP-1600-opt64-config.h +6 -0
  97. data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/ua/KeccakP-1600-opt64-config.h +6 -0
  98. data/ext/xkcp/lib/low/KeccakP-1600/ref-32bits/KeccakP-1600-SnP.h +44 -0
  99. data/ext/xkcp/lib/low/KeccakP-1600/ref-32bits/KeccakP-1600-reference.h +23 -0
  100. data/ext/xkcp/lib/low/KeccakP-1600/ref-32bits/KeccakP-1600-reference32BI.c +625 -0
  101. data/ext/xkcp/lib/low/KeccakP-1600/ref-64bits/KeccakP-1600-SnP.h +44 -0
  102. data/ext/xkcp/lib/low/KeccakP-1600/ref-64bits/KeccakP-1600-reference.c +440 -0
  103. data/ext/xkcp/lib/low/KeccakP-1600/ref-64bits/KeccakP-1600-reference.h +23 -0
  104. data/ext/xkcp/lib/low/KeccakP-1600/x86-64/KeccakP-1600-SnP.h +42 -0
  105. data/ext/xkcp/lib/low/KeccakP-1600/x86-64/KeccakP-1600-x86-64-gas.s +1196 -0
  106. data/ext/xkcp/lib/low/KeccakP-1600/x86-64/KeccakP-1600-x86-64-gas_Apple.s +1124 -0
  107. data/ext/xkcp/lib/low/KeccakP-1600/x86-64/KeccakP-1600-x86-64-shld-gas.s +1196 -0
  108. data/ext/xkcp/lib/low/KeccakP-1600-times2/ARMv7A-NEON/KeccakP-1600-inplace-pl2-armv7a-neon-le-armcc.s +1392 -0
  109. data/ext/xkcp/lib/low/KeccakP-1600-times2/ARMv7A-NEON/KeccakP-1600-inplace-pl2-armv7a-neon-le-gcc.s +1394 -0
  110. data/ext/xkcp/lib/low/KeccakP-1600-times2/ARMv7A-NEON/KeccakP-1600-times2-SnP.h +42 -0
  111. data/ext/xkcp/lib/low/KeccakP-1600-times2/AVX512/AVX512u12/SIMD512-2-config.h +7 -0
  112. data/ext/xkcp/lib/low/KeccakP-1600-times2/AVX512/AVX512u4/SIMD512-2-config.h +7 -0
  113. data/ext/xkcp/lib/low/KeccakP-1600-times2/AVX512/AVX512ufull/SIMD512-2-config.h +7 -0
  114. data/ext/xkcp/lib/low/KeccakP-1600-times2/AVX512/KeccakP-1600-times2-SIMD512.c +850 -0
  115. data/ext/xkcp/lib/low/KeccakP-1600-times2/AVX512/KeccakP-1600-times2-SnP.h +51 -0
  116. data/ext/xkcp/lib/low/KeccakP-1600-times2/SIMD128/KeccakP-1600-times2-SIMD128.c +957 -0
  117. data/ext/xkcp/lib/low/KeccakP-1600-times2/SIMD128/KeccakP-1600-times2-SnP.h +49 -0
  118. data/ext/xkcp/lib/low/KeccakP-1600-times2/SIMD128/SSSE3-u2/SIMD128-config.h +8 -0
  119. data/ext/xkcp/lib/low/KeccakP-1600-times2/SIMD128/SSSE3-ua/SIMD128-config.h +8 -0
  120. data/ext/xkcp/lib/low/KeccakP-1600-times2/SIMD128/XOP-u2/SIMD128-config.h +9 -0
  121. data/ext/xkcp/lib/low/KeccakP-1600-times2/SIMD128/XOP-ua/SIMD128-config.h +9 -0
  122. data/ext/xkcp/lib/low/KeccakP-1600-times2/fallback-on1/KeccakP-1600-times2-SnP.h +45 -0
  123. data/ext/xkcp/lib/low/KeccakP-1600-times2/fallback-on1/KeccakP-1600-times2-on1.c +37 -0
  124. data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX2/KeccakP-1600-times4-SIMD256.c +1321 -0
  125. data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX2/KeccakP-1600-times4-SnP.h +55 -0
  126. data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX2/u12/SIMD256-config.h +7 -0
  127. data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX2/u6/SIMD256-config.h +7 -0
  128. data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX2/ua/SIMD256-config.h +7 -0
  129. data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX512/AVX512u12/SIMD512-4-config.h +7 -0
  130. data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX512/AVX512u4/SIMD512-4-config.h +7 -0
  131. data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX512/AVX512ufull/SIMD512-4-config.h +7 -0
  132. data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX512/KeccakP-1600-times4-SIMD512.c +881 -0
  133. data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX512/KeccakP-1600-times4-SnP.h +51 -0
  134. data/ext/xkcp/lib/low/KeccakP-1600-times4/fallback-on1/KeccakP-1600-times4-SnP.h +45 -0
  135. data/ext/xkcp/lib/low/KeccakP-1600-times4/fallback-on1/KeccakP-1600-times4-on1.c +37 -0
  136. data/ext/xkcp/lib/low/KeccakP-1600-times4/fallback-on2/KeccakP-1600-times4-SnP.h +45 -0
  137. data/ext/xkcp/lib/low/KeccakP-1600-times4/fallback-on2/KeccakP-1600-times4-on2.c +38 -0
  138. data/ext/xkcp/lib/low/KeccakP-1600-times8/AVX512/KeccakP-1600-times8-SIMD512.c +1615 -0
  139. data/ext/xkcp/lib/low/KeccakP-1600-times8/AVX512/KeccakP-1600-times8-SnP.h +57 -0
  140. data/ext/xkcp/lib/low/KeccakP-1600-times8/AVX512/u12/SIMD512-config.h +7 -0
  141. data/ext/xkcp/lib/low/KeccakP-1600-times8/AVX512/u4/SIMD512-config.h +7 -0
  142. data/ext/xkcp/lib/low/KeccakP-1600-times8/AVX512/ua/SIMD512-config.h +7 -0
  143. data/ext/xkcp/lib/low/KeccakP-1600-times8/fallback-on1/KeccakP-1600-times8-SnP.h +45 -0
  144. data/ext/xkcp/lib/low/KeccakP-1600-times8/fallback-on1/KeccakP-1600-times8-on1.c +37 -0
  145. data/ext/xkcp/lib/low/KeccakP-1600-times8/fallback-on2/KeccakP-1600-times8-SnP.h +45 -0
  146. data/ext/xkcp/lib/low/KeccakP-1600-times8/fallback-on2/KeccakP-1600-times8-on2.c +38 -0
  147. data/ext/xkcp/lib/low/KeccakP-1600-times8/fallback-on4/KeccakP-1600-times8-SnP.h +45 -0
  148. data/ext/xkcp/lib/low/KeccakP-1600-times8/fallback-on4/KeccakP-1600-times8-on4.c +38 -0
  149. data/ext/xkcp/lib/low/KeccakP-200/ARM/KeccakP-200-SnP.h +41 -0
  150. data/ext/xkcp/lib/low/KeccakP-200/ARM/KeccakP-200-armv6m-le-armcc.s +442 -0
  151. data/ext/xkcp/lib/low/KeccakP-200/ARM/KeccakP-200-armv6m-le-gcc.s +446 -0
  152. data/ext/xkcp/lib/low/KeccakP-200/ARM/KeccakP-200-armv7m-le-armcc.s +419 -0
  153. data/ext/xkcp/lib/low/KeccakP-200/ARM/KeccakP-200-armv7m-le-gcc.s +427 -0
  154. data/ext/xkcp/lib/low/KeccakP-200/AVR8/KeccakP-200-SnP.h +41 -0
  155. data/ext/xkcp/lib/low/KeccakP-200/AVR8/KeccakP-200-avr8-fast.s +647 -0
  156. data/ext/xkcp/lib/low/KeccakP-200/compact/KeccakP-200-SnP.h +39 -0
  157. data/ext/xkcp/lib/low/KeccakP-200/compact/KeccakP-200-compact.c +190 -0
  158. data/ext/xkcp/lib/low/KeccakP-200/ref/KeccakP-200-SnP.h +43 -0
  159. data/ext/xkcp/lib/low/KeccakP-200/ref/KeccakP-200-reference.c +412 -0
  160. data/ext/xkcp/lib/low/KeccakP-200/ref/KeccakP-200-reference.h +23 -0
  161. data/ext/xkcp/lib/low/KeccakP-400/ARM/KeccakP-400-SnP.h +41 -0
  162. data/ext/xkcp/lib/low/KeccakP-400/ARM/KeccakP-400-armv6m-le-armcc.s +454 -0
  163. data/ext/xkcp/lib/low/KeccakP-400/ARM/KeccakP-400-armv6m-le-gcc.s +458 -0
  164. data/ext/xkcp/lib/low/KeccakP-400/ARM/KeccakP-400-armv7m-le-armcc.s +455 -0
  165. data/ext/xkcp/lib/low/KeccakP-400/ARM/KeccakP-400-armv7m-le-gcc.s +458 -0
  166. data/ext/xkcp/lib/low/KeccakP-400/AVR8/KeccakP-400-SnP.h +41 -0
  167. data/ext/xkcp/lib/low/KeccakP-400/AVR8/KeccakP-400-avr8-fast.s +728 -0
  168. data/ext/xkcp/lib/low/KeccakP-400/ref/KeccakP-400-SnP.h +43 -0
  169. data/ext/xkcp/lib/low/KeccakP-400/ref/KeccakP-400-reference.c +414 -0
  170. data/ext/xkcp/lib/low/KeccakP-400/ref/KeccakP-400-reference.h +23 -0
  171. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-SnP.h +42 -0
  172. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u1-armv6m-le-armcc.s +527 -0
  173. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u1-armv6m-le-gcc.s +533 -0
  174. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u2-armv6m-le-armcc.s +528 -0
  175. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u2-armv6m-le-gcc.s +534 -0
  176. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u2-armv7a-le-armcc.s +521 -0
  177. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u2-armv7a-le-gcc.s +527 -0
  178. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u2-armv7m-le-armcc.s +517 -0
  179. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u2-armv7m-le-gcc.s +523 -0
  180. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-uf-armv7m-le-armcc.s +550 -0
  181. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-uf-armv7m-le-gcc.s +556 -0
  182. data/ext/xkcp/lib/low/KeccakP-800/ARMv8A/KeccakP-800-SnP.h +32 -0
  183. data/ext/xkcp/lib/low/KeccakP-800/ARMv8A/KeccakP-800-armv8a-neon.s +432 -0
  184. data/ext/xkcp/lib/low/KeccakP-800/AVR8/KeccakP-800-SnP.h +42 -0
  185. data/ext/xkcp/lib/low/KeccakP-800/AVR8/KeccakP-800-avr8-fast.s +929 -0
  186. data/ext/xkcp/lib/low/KeccakP-800/compact/KeccakP-800-SnP.h +40 -0
  187. data/ext/xkcp/lib/low/KeccakP-800/compact/KeccakP-800-compact.c +244 -0
  188. data/ext/xkcp/lib/low/KeccakP-800/plain/KeccakP-800-SnP.h +46 -0
  189. data/ext/xkcp/lib/low/KeccakP-800/plain/KeccakP-800-opt32-bis.macros +184 -0
  190. data/ext/xkcp/lib/low/KeccakP-800/plain/KeccakP-800-opt32.c +454 -0
  191. data/ext/xkcp/lib/low/KeccakP-800/plain/KeccakP-800-opt32.macros +459 -0
  192. data/ext/xkcp/lib/low/KeccakP-800/plain/KeccakP-800-unrolling-bis.macros +83 -0
  193. data/ext/xkcp/lib/low/KeccakP-800/plain/KeccakP-800-unrolling.macros +88 -0
  194. data/ext/xkcp/lib/low/KeccakP-800/plain/lcu2/KeccakP-800-opt32-config.h +7 -0
  195. data/ext/xkcp/lib/low/KeccakP-800/plain/lcua/KeccakP-800-opt32-config.h +7 -0
  196. data/ext/xkcp/lib/low/KeccakP-800/plain/u2/KeccakP-800-opt32-config.h +7 -0
  197. data/ext/xkcp/lib/low/KeccakP-800/plain/ua/KeccakP-800-opt32-config.h +7 -0
  198. data/ext/xkcp/lib/low/KeccakP-800/ref/KeccakP-800-SnP.h +44 -0
  199. data/ext/xkcp/lib/low/KeccakP-800/ref/KeccakP-800-reference.c +437 -0
  200. data/ext/xkcp/lib/low/KeccakP-800/ref/KeccakP-800-reference.h +23 -0
  201. data/ext/xkcp/lib/low/Ketje/OptimizedAsmARM/Ket.h +57 -0
  202. data/ext/xkcp/lib/low/Ketje/OptimizedAsmARM/KetjeJr-armv7m-le-armcc.s +475 -0
  203. data/ext/xkcp/lib/low/Ketje/OptimizedAsmARM/KetjeJr-armv7m-le-gcc.s +480 -0
  204. data/ext/xkcp/lib/low/Ketje/OptimizedAsmARM/KetjeSr-armv7m-le-armcc.s +590 -0
  205. data/ext/xkcp/lib/low/Ketje/OptimizedAsmARM/KetjeSr-armv7m-le-gcc.s +590 -0
  206. data/ext/xkcp/lib/low/Ketje/OptimizedLE/Ket.c +126 -0
  207. data/ext/xkcp/lib/low/Ketje/OptimizedLE/Ket.h +68 -0
  208. data/ext/xkcp/lib/low/Ketje/OptimizedLE/Ket.inc +174 -0
  209. data/ext/xkcp/lib/low/Ketje/SnP-compliant/Ket.c +80 -0
  210. data/ext/xkcp/lib/low/Ketje/SnP-compliant/Ket.h +68 -0
  211. data/ext/xkcp/lib/low/Ketje/SnP-compliant/Ket.inc +142 -0
  212. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-SnP.h +55 -0
  213. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-u1-armv6m-le-armcc.s +1086 -0
  214. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-u1-armv6m-le-gcc.s +1092 -0
  215. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-uf-armv6-le-armcc.s +721 -0
  216. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-uf-armv6-le-gcc.s +726 -0
  217. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-uf-armv7m-le-armcc.s +723 -0
  218. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-uf-armv7m-le-gcc.s +729 -0
  219. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodyak-u1-armv6m-le-armcc.s +1164 -0
  220. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodyak-u1-armv6m-le-gcc.s +1165 -0
  221. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodyak-uf-armv6-le-armcc.s +562 -0
  222. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodyak-uf-armv6-le-gcc.s +563 -0
  223. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodyak-uf-armv7m-le-armcc.s +563 -0
  224. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodyak-uf-armv7m-le-gcc.s +565 -0
  225. data/ext/xkcp/lib/low/Xoodoo/ARMv7A-NEON/Xoodoo-SnP.h +55 -0
  226. data/ext/xkcp/lib/low/Xoodoo/ARMv7A-NEON/Xoodoo-uf-armv7a-neon-le-armcc.s +476 -0
  227. data/ext/xkcp/lib/low/Xoodoo/ARMv7A-NEON/Xoodoo-uf-armv7a-neon-le-gcc.s +485 -0
  228. data/ext/xkcp/lib/low/Xoodoo/ARMv7A-NEON/Xoodyak-uf-armv7a-neon-le-armcc.s +362 -0
  229. data/ext/xkcp/lib/low/Xoodoo/ARMv7A-NEON/Xoodyak-uf-armv7a-neon-le-gcc.s +367 -0
  230. data/ext/xkcp/lib/low/Xoodoo/AVR8/Xoodoo-SnP.h +43 -0
  231. data/ext/xkcp/lib/low/Xoodoo/AVR8/Xoodoo-avr8-u1.s +1341 -0
  232. data/ext/xkcp/lib/low/Xoodoo/AVX512/Xoodoo-SIMD512.c +581 -0
  233. data/ext/xkcp/lib/low/Xoodoo/AVX512/Xoodoo-SnP.h +58 -0
  234. data/ext/xkcp/lib/low/Xoodoo/AVX512/Xoodyak-full-block-SIMD512.c +332 -0
  235. data/ext/xkcp/lib/low/Xoodoo/SSE2/Xoodoo-SIMD128.c +329 -0
  236. data/ext/xkcp/lib/low/Xoodoo/SSE2/Xoodoo-SnP.h +53 -0
  237. data/ext/xkcp/lib/low/Xoodoo/SSE2/Xoodyak-full-block-SIMD128.c +355 -0
  238. data/ext/xkcp/lib/low/Xoodoo/Xoodoo.h +79 -0
  239. data/ext/xkcp/lib/low/Xoodoo/plain/Xoodoo-SnP.h +56 -0
  240. data/ext/xkcp/lib/low/Xoodoo/plain/Xoodoo-optimized.c +399 -0
  241. data/ext/xkcp/lib/low/Xoodoo/plain/Xoodyak-full-blocks.c +127 -0
  242. data/ext/xkcp/lib/low/Xoodoo/ref/Xoodoo-SnP.h +43 -0
  243. data/ext/xkcp/lib/low/Xoodoo/ref/Xoodoo-reference.c +253 -0
  244. data/ext/xkcp/lib/low/Xoodoo-times16/AVX512/Xoodoo-times16-SIMD512.c +1044 -0
  245. data/ext/xkcp/lib/low/Xoodoo-times16/AVX512/Xoodoo-times16-SnP.h +49 -0
  246. data/ext/xkcp/lib/low/Xoodoo-times16/fallback-on1/Xoodoo-times16-SnP.h +45 -0
  247. data/ext/xkcp/lib/low/Xoodoo-times16/fallback-on1/Xoodoo-times16-on1.c +37 -0
  248. data/ext/xkcp/lib/low/Xoodoo-times4/ARMv7A-NEON/Xoodoo-times4-ARMv7A.s +1587 -0
  249. data/ext/xkcp/lib/low/Xoodoo-times4/ARMv7A-NEON/Xoodoo-times4-SnP.h +48 -0
  250. data/ext/xkcp/lib/low/Xoodoo-times4/AVX512/Xoodoo-times4-SIMD512.c +1202 -0
  251. data/ext/xkcp/lib/low/Xoodoo-times4/AVX512/Xoodoo-times4-SnP.h +48 -0
  252. data/ext/xkcp/lib/low/Xoodoo-times4/SSSE3/Xoodoo-times4-SIMD128.c +484 -0
  253. data/ext/xkcp/lib/low/Xoodoo-times4/SSSE3/Xoodoo-times4-SnP.h +44 -0
  254. data/ext/xkcp/lib/low/Xoodoo-times4/fallback-on1/Xoodoo-times4-SnP.h +45 -0
  255. data/ext/xkcp/lib/low/Xoodoo-times4/fallback-on1/Xoodoo-times4-on1.c +37 -0
  256. data/ext/xkcp/lib/low/Xoodoo-times8/AVX2/Xoodoo-times8-SIMD256.c +939 -0
  257. data/ext/xkcp/lib/low/Xoodoo-times8/AVX2/Xoodoo-times8-SnP.h +49 -0
  258. data/ext/xkcp/lib/low/Xoodoo-times8/AVX512/Xoodoo-times8-SIMD512.c +1216 -0
  259. data/ext/xkcp/lib/low/Xoodoo-times8/AVX512/Xoodoo-times8-SnP.h +48 -0
  260. data/ext/xkcp/lib/low/Xoodoo-times8/fallback-on1/Xoodoo-times8-SnP.h +45 -0
  261. data/ext/xkcp/lib/low/Xoodoo-times8/fallback-on1/Xoodoo-times8-on1.c +37 -0
  262. data/ext/xkcp/lib/low/common/PlSnP-Fallback.inc +290 -0
  263. data/ext/xkcp/lib/low/common/SnP-Relaned.h +141 -0
  264. data/ext/xkcp/support/Build/ExpandProducts.xsl +79 -0
  265. data/ext/xkcp/support/Build/ToGlobalMakefile.xsl +206 -0
  266. data/ext/xkcp/support/Build/ToOneTarget.xsl +89 -0
  267. data/ext/xkcp/support/Build/ToTargetConfigFile.xsl +37 -0
  268. data/ext/xkcp/support/Build/ToTargetMakefile.xsl +298 -0
  269. data/ext/xkcp/support/Build/ToVCXProj.xsl +198 -0
  270. data/ext/xkcp/support/Kernel-PMU/Kernel-pmu.md +133 -0
  271. data/ext/xkcp/support/Kernel-PMU/Makefile +8 -0
  272. data/ext/xkcp/support/Kernel-PMU/enable_arm_pmu.c +129 -0
  273. data/ext/xkcp/support/Kernel-PMU/load-module +1 -0
  274. data/ext/xkcp/util/KeccakSum/KeccakSum.c +394 -0
  275. data/ext/xkcp/util/KeccakSum/base64.c +86 -0
  276. data/ext/xkcp/util/KeccakSum/base64.h +12 -0
  277. data/lib/sleeping_kangaroo12/binding.rb +15 -0
  278. data/lib/sleeping_kangaroo12/build/loader.rb +40 -0
  279. data/lib/sleeping_kangaroo12/build/platform.rb +37 -0
  280. data/lib/sleeping_kangaroo12/build.rb +4 -0
  281. data/lib/sleeping_kangaroo12/digest.rb +103 -0
  282. data/lib/sleeping_kangaroo12/version.rb +5 -0
  283. data/lib/sleeping_kangaroo12.rb +7 -0
  284. metadata +372 -0
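Most of the tree above is the vendored XKCP sources; the gem-specific pieces are the native binding in data/ext/binding/sleeping_kangaroo12.c and the Ruby wrapper under data/lib/sleeping_kangaroo12/. The binding ultimately calls into XKCP's high-level KangarooTwelve API from data/ext/xkcp/lib/high/KangarooTwelve/KangarooTwelve.h. As a rough sketch of the kind of call such a binding makes, the C snippet below uses the one-shot entry point; the exact prototype is an assumption taken from the upstream XKCP API and should be checked against the bundled header.

/* Sketch only: assumes KangarooTwelve.h declares
 *   int KangarooTwelve(const unsigned char *input, size_t inputByteLen,
 *                      unsigned char *output, size_t outputByteLen,
 *                      const unsigned char *customization, size_t customByteLen);
 * as in upstream XKCP. */
#include <stdio.h>
#include "KangarooTwelve.h"

int main(void)
{
    const unsigned char message[] = "hello";
    unsigned char digest[32];
    size_t i;

    /* Hash 5 bytes of input into a 32-byte output, with no customization string. */
    if (KangarooTwelve(message, sizeof message - 1, digest, sizeof digest,
                       (const unsigned char *)"", 0) != 0)
        return 1;

    for (i = 0; i < sizeof digest; i++)
        printf("%02x", digest[i]);
    printf("\n");
    return 0;
}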
data/ext/xkcp/lib/low/Xoodoo-times8/AVX2/Xoodoo-times8-SIMD256.c
@@ -0,0 +1,939 @@
+ /*
+ The eXtended Keccak Code Package (XKCP)
+ https://github.com/XKCP/XKCP
+
+ The Xoodoo permutation, designed by Joan Daemen, Seth Hoffert, Gilles Van Assche and Ronny Van Keer.
+
+ Implementation by Ronny Van Keer, hereby denoted as "the implementer".
+
+ For more information, feedback or questions, please refer to the Keccak Team website:
+ https://keccak.team/
+
+ To the extent possible under law, the implementer has waived all copyright
+ and related or neighboring rights to the source code in this file.
+ http://creativecommons.org/publicdomain/zero/1.0/
+ */
+
+ #include <stdio.h>
+ #include <string.h>
+ #include <smmintrin.h>
+ #include <wmmintrin.h>
+ #include <immintrin.h>
+ #include <emmintrin.h>
+ #include "align.h"
+ #include "brg_endian.h"
+ #include "Xoodoo.h"
+ #include "Xoodoo-times8-SnP.h"
+
+ #if (PLATFORM_BYTE_ORDER != IS_LITTLE_ENDIAN)
+ #error Expecting a little-endian platform
+ #endif
+
+ typedef __m128i V128;
+ typedef __m256i V256;
+
+ #define SnP_laneLengthInBytes 4
+ /* The 8 instances are interleaved lane by lane: one 256-bit word holds the same 32-bit lane of all 8 states. */
+ #define laneIndex(instanceIndex, lanePosition) ((lanePosition)*8 + instanceIndex)
+
+ #define AND256(a, b) _mm256_and_si256(a, b)
+ #define ANDnu256(a, b) _mm256_andnot_si256(a, b)
+ #define CONST8_32(a) _mm256_set1_epi32(a)
+ #define LOAD256(a) _mm256_load_si256((const V256 *)&(a))
+ #define LOAD256u(a) _mm256_loadu_si256((const V256 *)&(a))
+ #define LOAD8_32(a,b,c,d,e,f,g,h) _mm256_setr_epi32(a,b,c,d,e,f,g,h)
+ #define LOAD_GATHER8_32(idx,p) _mm256_i32gather_epi32((const void*)(p), idx, 4)
+
+ #define SHUFFLE_LANES_RIGHT(a, n) _mm256_permutevar8x32_epi32(a, shuffleR_##n)
+ #define SHUFFLE_LANES_RIGHT_2(a) _mm256_permute4x64_epi64(a, 0x39)
+ #define INSERT_LANE( a, val, n) _mm256_insert_epi32(a, val, n)
+ #define EXTRACT_LANE( a, n) _mm256_extract_epi32(a, n)
+ #define INSERT_2LANES( a, val, n) _mm256_insert_epi64(a, val, (n)/2)
+ #define EXTRACT_2LANES( a, n) _mm256_extract_epi64(a, (n)/2)
+
+
+ #define ROL32in256(a, o) _mm256_or_si256(_mm256_slli_epi32(a, o), _mm256_srli_epi32(a, 32-(o)))
+ #define ROL32in256_8(a) _mm256_shuffle_epi8(a, rho8)
+ #define SHL32in256(a, o) _mm256_slli_epi32(a, o)
+
+ #define STORE128(a, b) _mm_store_si128((V128 *)&(a), b)
+ #define STORE128u(a, b) _mm_storeu_si128((V128 *)&(a), b)
+ #define STORE256(a, b) _mm256_store_si256((V256 *)&(a), b)
+ #define STORE256u(a, b) _mm256_storeu_si256((V256 *)&(a), b)
+
+ #define XOR256(a, b) _mm256_xor_si256(a, b)
+ #define XOReq256(a, b) a = XOR256(a, b)
+ #define XOR128(a, b) _mm_xor_si128(a, b)
+ #define XOReq128(a, b) a = XOR128(a, b)
+
+ #ifndef _mm256_storeu2_m128i
+ #define _mm256_storeu2_m128i(hi, lo, a) _mm_storeu_si128((V128*)(lo), _mm256_castsi256_si128(a)), _mm_storeu_si128((V128*)(hi), _mm256_extracti128_si256(a, 1))
+ #endif
+
+ #define VERBOSE 0
+
+ #if (VERBOSE > 0)
+ #define Dump(__t,__v) { \
+     uint32_t buf[8]; \
+     printf("%s\n", __t); \
+     STORE256(buf, __v##00); printf("00 %08x %08x %08x %08x %08x %08x %08x %08x\n", buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7]); \
+     STORE256(buf, __v##01); printf("01 %08x %08x %08x %08x %08x %08x %08x %08x\n", buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7]); \
+     STORE256(buf, __v##02); printf("02 %08x %08x %08x %08x %08x %08x %08x %08x\n", buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7]); \
+     STORE256(buf, __v##03); printf("03 %08x %08x %08x %08x %08x %08x %08x %08x\n", buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7]); \
+     STORE256(buf, __v##10); printf("10 %08x %08x %08x %08x %08x %08x %08x %08x\n", buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7]); \
+     STORE256(buf, __v##11); printf("11 %08x %08x %08x %08x %08x %08x %08x %08x\n", buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7]); \
+     STORE256(buf, __v##12); printf("12 %08x %08x %08x %08x %08x %08x %08x %08x\n", buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7]); \
+     STORE256(buf, __v##13); printf("13 %08x %08x %08x %08x %08x %08x %08x %08x\n", buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7]); \
+     STORE256(buf, __v##20); printf("20 %08x %08x %08x %08x %08x %08x %08x %08x\n", buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7]); \
+     STORE256(buf, __v##21); printf("21 %08x %08x %08x %08x %08x %08x %08x %08x\n", buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7]); \
+     STORE256(buf, __v##22); printf("22 %08x %08x %08x %08x %08x %08x %08x %08x\n", buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7]); \
+     STORE256(buf, __v##23); printf("23 %08x %08x %08x %08x %08x %08x %08x %08x\n", buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7]); \
+ }
+ #else
+ #define Dump(__t,__v)
+ #endif
+
+ #if (VERBOSE >= 1)
+ #define Dump1(__t,__v) Dump(__t,__v)
+ #else
+ #define Dump1(__t,__v)
+ #endif
+
+ #if (VERBOSE >= 2)
+ #define Dump2(__t,__v) Dump(__t,__v)
+ #else
+ #define Dump2(__t,__v)
+ #endif
+
+ #if (VERBOSE >= 3)
+ #define Dump3(__t,__v) Dump(__t,__v)
+ #else
+ #define Dump3(__t,__v)
+ #endif
+
+ ALIGN(32) static const uint32_t oshuffleR_1[] = {1, 2, 3, 4, 5, 6, 7, 0};
+ ALIGN(32) static const uint32_t oshuffleR_3[] = {3, 4, 5, 6, 7, 0, 1, 2};
+ ALIGN(32) static const uint32_t oshuffleR_5[] = {5, 6, 7, 0, 1, 2, 3, 4};
+ ALIGN(32) static const uint32_t oshuffleR_7[] = {7, 0, 1, 2, 3, 4, 5, 6};
+ ALIGN(32) static const uint32_t shufflePack[] = {0, 2, 4, 6, 1, 3, 5, 7};
+
+
+ void Xoodootimes8_InitializeAll(void *states)
+ {
+     memset(states, 0, Xoodootimes8_statesSizeInBytes);
+ }
+
+ void Xoodootimes8_AddBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length)
+ {
+     unsigned int sizeLeft = length;
+     unsigned int lanePosition = offset/SnP_laneLengthInBytes;
+     unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
+     const unsigned char *curData = data;
+     uint32_t *statesAsLanes = (uint32_t *)states;
+
+     if ((sizeLeft > 0) && (offsetInLane != 0)) {
+         unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
+         uint32_t lane = 0;
+         if (bytesInLane > sizeLeft)
+             bytesInLane = sizeLeft;
+         memcpy((unsigned char*)&lane + offsetInLane, curData, bytesInLane);
+         statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane;
+         sizeLeft -= bytesInLane;
+         lanePosition++;
+         curData += bytesInLane;
+     }
+
+     while(sizeLeft >= SnP_laneLengthInBytes) {
+         uint32_t lane = *((const uint32_t*)curData);
+         statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane;
+         sizeLeft -= SnP_laneLengthInBytes;
+         lanePosition++;
+         curData += SnP_laneLengthInBytes;
+     }
+
+     if (sizeLeft > 0) {
+         uint32_t lane = 0;
+         memcpy(&lane, curData, sizeLeft);
+         statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane;
+     }
+ }
+
+ void Xoodootimes8_AddLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
+ {
+     V256 *stateAsLanes = (V256 *)states;
+     unsigned int i;
+     const uint32_t *curData0 = (const uint32_t *)(data+laneOffset*0*SnP_laneLengthInBytes);
+     const uint32_t *curData1 = (const uint32_t *)(data+laneOffset*1*SnP_laneLengthInBytes);
+     const uint32_t *curData2 = (const uint32_t *)(data+laneOffset*2*SnP_laneLengthInBytes);
+     const uint32_t *curData3 = (const uint32_t *)(data+laneOffset*3*SnP_laneLengthInBytes);
+     const uint32_t *curData4 = (const uint32_t *)(data+laneOffset*4*SnP_laneLengthInBytes);
+     const uint32_t *curData5 = (const uint32_t *)(data+laneOffset*5*SnP_laneLengthInBytes);
+     const uint32_t *curData6 = (const uint32_t *)(data+laneOffset*6*SnP_laneLengthInBytes);
+     const uint32_t *curData7 = (const uint32_t *)(data+laneOffset*7*SnP_laneLengthInBytes);
+
+     #define Xor_In( argIndex ) XOReq256(stateAsLanes[argIndex], LOAD8_32(curData0[argIndex], curData1[argIndex], curData2[argIndex], curData3[argIndex], curData4[argIndex], curData5[argIndex], curData6[argIndex], curData7[argIndex]))
+
+     if ( laneCount == 12 ) {
+         Xor_In( 0 );
+         Xor_In( 1 );
+         Xor_In( 2 );
+         Xor_In( 3 );
+         Xor_In( 4 );
+         Xor_In( 5 );
+         Xor_In( 6 );
+         Xor_In( 7 );
+         Xor_In( 8 );
+         Xor_In( 9 );
+         Xor_In( 10 );
+         Xor_In( 11 );
+     }
+     else {
+         for(i=0; i<laneCount; i++)
+             Xor_In( i );
+     }
+     #undef Xor_In
+ }
+
+ void Xoodootimes8_OverwriteBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length)
+ {
+     unsigned int sizeLeft = length;
+     unsigned int lanePosition = offset/SnP_laneLengthInBytes;
+     unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
+     const unsigned char *curData = data;
+     uint32_t *statesAsLanes = (uint32_t *)states;
+
+     if ((sizeLeft > 0) && (offsetInLane != 0)) {
+         unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
+         if (bytesInLane > sizeLeft)
+             bytesInLane = sizeLeft;
+         memcpy( ((unsigned char *)&statesAsLanes[laneIndex(instanceIndex, lanePosition)]) + offsetInLane, curData, bytesInLane);
+         sizeLeft -= bytesInLane;
+         lanePosition++;
+         curData += bytesInLane;
+     }
+
+     while(sizeLeft >= SnP_laneLengthInBytes) {
+         uint32_t lane = *((const uint32_t*)curData);
+         statesAsLanes[laneIndex(instanceIndex, lanePosition)] = lane;
+         sizeLeft -= SnP_laneLengthInBytes;
+         lanePosition++;
+         curData += SnP_laneLengthInBytes;
+     }
+
+     if (sizeLeft > 0) {
+         memcpy(&statesAsLanes[laneIndex(instanceIndex, lanePosition)], curData, sizeLeft);
+     }
+ }
+
+ void Xoodootimes8_OverwriteLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
+ {
+     V256 *stateAsLanes = (V256 *)states;
+     unsigned int i;
+     const uint32_t *curData0 = (const uint32_t *)(data+laneOffset*0*SnP_laneLengthInBytes);
+     const uint32_t *curData1 = (const uint32_t *)(data+laneOffset*1*SnP_laneLengthInBytes);
+     const uint32_t *curData2 = (const uint32_t *)(data+laneOffset*2*SnP_laneLengthInBytes);
+     const uint32_t *curData3 = (const uint32_t *)(data+laneOffset*3*SnP_laneLengthInBytes);
+     const uint32_t *curData4 = (const uint32_t *)(data+laneOffset*4*SnP_laneLengthInBytes);
+     const uint32_t *curData5 = (const uint32_t *)(data+laneOffset*5*SnP_laneLengthInBytes);
+     const uint32_t *curData6 = (const uint32_t *)(data+laneOffset*6*SnP_laneLengthInBytes);
+     const uint32_t *curData7 = (const uint32_t *)(data+laneOffset*7*SnP_laneLengthInBytes);
+
+     #define OverWr( argIndex ) STORE256(stateAsLanes[argIndex], LOAD8_32(curData0[argIndex], curData1[argIndex], curData2[argIndex], curData3[argIndex], curData4[argIndex], curData5[argIndex], curData6[argIndex], curData7[argIndex]))
+
+     if ( laneCount == 12 ) {
+         OverWr( 0 );
+         OverWr( 1 );
+         OverWr( 2 );
+         OverWr( 3 );
+         OverWr( 4 );
+         OverWr( 5 );
+         OverWr( 6 );
+         OverWr( 7 );
+         OverWr( 8 );
+         OverWr( 9 );
+         OverWr( 10 );
+         OverWr( 11 );
+     }
+     else {
+         for(i=0; i<laneCount; i++)
+             OverWr( i );
+     }
+     #undef OverWr
+ }
+
+ void Xoodootimes8_OverwriteWithZeroes(void *states, unsigned int instanceIndex, unsigned int byteCount)
+ {
+     unsigned int sizeLeft = byteCount;
+     unsigned int lanePosition = 0;
+     uint32_t *statesAsLanes = (uint32_t *)states;
+
+     while(sizeLeft >= SnP_laneLengthInBytes) {
+         statesAsLanes[laneIndex(instanceIndex, lanePosition)] = 0;
+         sizeLeft -= SnP_laneLengthInBytes;
+         lanePosition++;
+     }
+
+     if (sizeLeft > 0) {
+         memset(&statesAsLanes[laneIndex(instanceIndex, lanePosition)], 0, sizeLeft);
+     }
+ }
+
+ void Xoodootimes8_ExtractBytes(const void *states, unsigned int instanceIndex, unsigned char *data, unsigned int offset, unsigned int length)
+ {
+     unsigned int sizeLeft = length;
+     unsigned int lanePosition = offset/SnP_laneLengthInBytes;
+     unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
+     unsigned char *curData = data;
+     const uint32_t *statesAsLanes = (const uint32_t *)states;
+
+     if ((sizeLeft > 0) && (offsetInLane != 0)) {
+         unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
+         if (bytesInLane > sizeLeft)
+             bytesInLane = sizeLeft;
+         memcpy( curData, ((unsigned char *)&statesAsLanes[laneIndex(instanceIndex, lanePosition)]) + offsetInLane, bytesInLane);
+         sizeLeft -= bytesInLane;
+         lanePosition++;
+         curData += bytesInLane;
+     }
+
+     while(sizeLeft >= SnP_laneLengthInBytes) {
+         *(uint32_t*)curData = statesAsLanes[laneIndex(instanceIndex, lanePosition)];
+         sizeLeft -= SnP_laneLengthInBytes;
+         lanePosition++;
+         curData += SnP_laneLengthInBytes;
+     }
+
+     if (sizeLeft > 0) {
+         memcpy( curData, &statesAsLanes[laneIndex(instanceIndex, lanePosition)], sizeLeft);
+     }
+ }
+
+ void Xoodootimes8_ExtractLanesAll(const void *states, unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
+ {
+     uint32_t *curData0 = (uint32_t *)(data+laneOffset*0*SnP_laneLengthInBytes);
+     uint32_t *curData1 = (uint32_t *)(data+laneOffset*1*SnP_laneLengthInBytes);
+     uint32_t *curData2 = (uint32_t *)(data+laneOffset*2*SnP_laneLengthInBytes);
+     uint32_t *curData3 = (uint32_t *)(data+laneOffset*3*SnP_laneLengthInBytes);
+     uint32_t *curData4 = (uint32_t *)(data+laneOffset*4*SnP_laneLengthInBytes);
+     uint32_t *curData5 = (uint32_t *)(data+laneOffset*5*SnP_laneLengthInBytes);
+     uint32_t *curData6 = (uint32_t *)(data+laneOffset*6*SnP_laneLengthInBytes);
+     uint32_t *curData7 = (uint32_t *)(data+laneOffset*7*SnP_laneLengthInBytes);
+     const V256 *stateAsLanes = (const V256 *)states;
+     const uint32_t *stateAsLanes32 = (const uint32_t*)states;
+     unsigned int i;
+
+     #define Extr( argIndex ) curData0[argIndex] = stateAsLanes32[8*(argIndex)], \
+         curData1[argIndex] = stateAsLanes32[8*(argIndex)+1], \
+         curData2[argIndex] = stateAsLanes32[8*(argIndex)+2], \
+         curData3[argIndex] = stateAsLanes32[8*(argIndex)+3], \
+         curData4[argIndex] = stateAsLanes32[8*(argIndex)+4], \
+         curData5[argIndex] = stateAsLanes32[8*(argIndex)+5], \
+         curData6[argIndex] = stateAsLanes32[8*(argIndex)+6], \
+         curData7[argIndex] = stateAsLanes32[8*(argIndex)+7]
+
+     if ( laneCount == 12 ) {
+         Extr( 0 );
+         Extr( 1 );
+         Extr( 2 );
+         Extr( 3 );
+         Extr( 4 );
+         Extr( 5 );
+         Extr( 6 );
+         Extr( 7 );
+         Extr( 8 );
+         Extr( 9 );
+         Extr( 10 );
+         Extr( 11 );
+     }
+     else {
+         for(i=0; i<laneCount; i++)
+             Extr( i );
+     }
+     #undef Extr
+ }
+
+ void Xoodootimes8_ExtractAndAddBytes(const void *states, unsigned int instanceIndex, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length)
+ {
+     unsigned int sizeLeft = length;
+     unsigned int lanePosition = offset/SnP_laneLengthInBytes;
+     unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
+     const unsigned char *curInput = input;
+     unsigned char *curOutput = output;
+     const uint32_t *statesAsLanes = (const uint32_t *)states;
+
+     if ((sizeLeft > 0) && (offsetInLane != 0)) {
+         unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
+         uint32_t lane = statesAsLanes[laneIndex(instanceIndex, lanePosition)] >> (8 * offsetInLane);
+         if (bytesInLane > sizeLeft)
+             bytesInLane = sizeLeft;
+         sizeLeft -= bytesInLane;
+         do {
+             *(curOutput++) = *(curInput++) ^ (unsigned char)lane;
+             lane >>= 8;
+         } while ( --bytesInLane != 0);
+         lanePosition++;
+     }
+
+     while(sizeLeft >= SnP_laneLengthInBytes) {
+         *((uint32_t*)curOutput) = *((uint32_t*)curInput) ^ statesAsLanes[laneIndex(instanceIndex, lanePosition)];
+         sizeLeft -= SnP_laneLengthInBytes;
+         lanePosition++;
+         curInput += SnP_laneLengthInBytes;
+         curOutput += SnP_laneLengthInBytes;
+     }
+
+     if (sizeLeft != 0) {
+         uint32_t lane = statesAsLanes[laneIndex(instanceIndex, lanePosition)];
+         do {
+             *(curOutput++) = *(curInput++) ^ (unsigned char)lane;
+             lane >>= 8;
+         } while ( --sizeLeft != 0);
+     }
+ }
+
+ void Xoodootimes8_ExtractAndAddLanesAll(const void *states, const unsigned char *input, unsigned char *output, unsigned int laneCount, unsigned int laneOffset)
+ {
+     const uint32_t *curInput0 = (uint32_t *)(input+laneOffset*0*SnP_laneLengthInBytes);
+     const uint32_t *curInput1 = (uint32_t *)(input+laneOffset*1*SnP_laneLengthInBytes);
+     const uint32_t *curInput2 = (uint32_t *)(input+laneOffset*2*SnP_laneLengthInBytes);
+     const uint32_t *curInput3 = (uint32_t *)(input+laneOffset*3*SnP_laneLengthInBytes);
+     const uint32_t *curInput4 = (uint32_t *)(input+laneOffset*4*SnP_laneLengthInBytes);
+     const uint32_t *curInput5 = (uint32_t *)(input+laneOffset*5*SnP_laneLengthInBytes);
+     const uint32_t *curInput6 = (uint32_t *)(input+laneOffset*6*SnP_laneLengthInBytes);
+     const uint32_t *curInput7 = (uint32_t *)(input+laneOffset*7*SnP_laneLengthInBytes);
+     uint32_t *curOutput0 = (uint32_t *)(output+laneOffset*0*SnP_laneLengthInBytes);
+     uint32_t *curOutput1 = (uint32_t *)(output+laneOffset*1*SnP_laneLengthInBytes);
+     uint32_t *curOutput2 = (uint32_t *)(output+laneOffset*2*SnP_laneLengthInBytes);
+     uint32_t *curOutput3 = (uint32_t *)(output+laneOffset*3*SnP_laneLengthInBytes);
+     uint32_t *curOutput4 = (uint32_t *)(output+laneOffset*4*SnP_laneLengthInBytes);
+     uint32_t *curOutput5 = (uint32_t *)(output+laneOffset*5*SnP_laneLengthInBytes);
+     uint32_t *curOutput6 = (uint32_t *)(output+laneOffset*6*SnP_laneLengthInBytes);
+     uint32_t *curOutput7 = (uint32_t *)(output+laneOffset*7*SnP_laneLengthInBytes);
+
+     const V256 *stateAsLanes = (const V256 *)states;
+     const uint32_t *stateAsLanes32 = (const uint32_t*)states;
+     unsigned int i;
+
+     #define ExtrXor( argIndex ) \
+         curOutput0[argIndex] = curInput0[argIndex] ^ stateAsLanes32[8*(argIndex)+0],\
+         curOutput1[argIndex] = curInput1[argIndex] ^ stateAsLanes32[8*(argIndex)+1],\
+         curOutput2[argIndex] = curInput2[argIndex] ^ stateAsLanes32[8*(argIndex)+2],\
+         curOutput3[argIndex] = curInput3[argIndex] ^ stateAsLanes32[8*(argIndex)+3],\
+         curOutput4[argIndex] = curInput4[argIndex] ^ stateAsLanes32[8*(argIndex)+4],\
+         curOutput5[argIndex] = curInput5[argIndex] ^ stateAsLanes32[8*(argIndex)+5],\
+         curOutput6[argIndex] = curInput6[argIndex] ^ stateAsLanes32[8*(argIndex)+6],\
+         curOutput7[argIndex] = curInput7[argIndex] ^ stateAsLanes32[8*(argIndex)+7]
+
+     if ( laneCount == 12 ) {
+         ExtrXor( 0 );
+         ExtrXor( 1 );
+         ExtrXor( 2 );
+         ExtrXor( 3 );
+         ExtrXor( 4 );
+         ExtrXor( 5 );
+         ExtrXor( 6 );
+         ExtrXor( 7 );
+         ExtrXor( 8 );
+         ExtrXor( 9 );
+         ExtrXor( 10 );
+         ExtrXor( 11 );
+     }
+     else {
+         for(i=0; i<laneCount; i++) {
+             ExtrXor( i );
+         }
+     }
+     #undef ExtrXor
+ }
+
+ #define DeclareVars V256 a00, a01, a02, a03; \
+     V256 a10, a11, a12, a13; \
+     V256 a20, a21, a22, a23; \
+     V256 v1, v2; \
+     V256 rho8 = LOAD8_32(0x02010003, 0x06050407, 0x0A09080B, 0x0E0D0C0F, 0x12111013, 0x16151417, 0x1A19181B, 0x1E1D1C1F)
+
+ #define State2Vars2 a00 = LOAD256(states[8*(0+0)]), a01 = LOAD256(states[8*(0+1)]), a02 = LOAD256(states[8*(0+2)]), a03 = LOAD256(states[8*(0+3)]); \
+     a12 = LOAD256(states[8*(4+0)]), a13 = LOAD256(states[8*(4+1)]), a10 = LOAD256(states[8*(4+2)]), a11 = LOAD256(states[8*(4+3)]); \
+     a20 = LOAD256(states[8*(8+0)]), a21 = LOAD256(states[8*(8+1)]), a22 = LOAD256(states[8*(8+2)]), a23 = LOAD256(states[8*(8+3)])
+
+ #define State2Vars a00 = LOAD256(states[8*(0+0)]), a01 = LOAD256(states[8*(0+1)]), a02 = LOAD256(states[8*(0+2)]), a03 = LOAD256(states[8*(0+3)]); \
+     a10 = LOAD256(states[8*(4+0)]), a11 = LOAD256(states[8*(4+1)]), a12 = LOAD256(states[8*(4+2)]), a13 = LOAD256(states[8*(4+3)]); \
+     a20 = LOAD256(states[8*(8+0)]), a21 = LOAD256(states[8*(8+1)]), a22 = LOAD256(states[8*(8+2)]), a23 = LOAD256(states[8*(8+3)])
+
+ #define Vars2State STORE256(states[8*(0+0)], a00), STORE256(states[8*(0+1)], a01), STORE256(states[8*(0+2)], a02), STORE256(states[8*(0+3)], a03); \
+     STORE256(states[8*(4+0)], a10), STORE256(states[8*(4+1)], a11), STORE256(states[8*(4+2)], a12), STORE256(states[8*(4+3)], a13); \
+     STORE256(states[8*(8+0)], a20), STORE256(states[8*(8+1)], a21), STORE256(states[8*(8+2)], a22), STORE256(states[8*(8+3)], a23)
+
+ #define Round(a10i, a11i, a12i, a13i, a10w, a11w, a12w, a13w, a20i, a21i, a22i, a23i, __rc) \
+     \
+     /* Theta: Column Parity Mixer */ \
+     v1 = XOR256( a03, XOR256( a13i, a23i ) ); \
+     v2 = XOR256( a00, XOR256( a10i, a20i ) ); \
+     v1 = XOR256( ROL32in256(v1, 5), ROL32in256(v1, 14) ); \
+     a00 = XOR256( a00, v1 ); \
+     a10i = XOR256( a10i, v1 ); \
+     a20i = XOR256( a20i, v1 ); \
+     v1 = XOR256( a01, XOR256( a11i, a21i ) ); \
+     v2 = XOR256( ROL32in256(v2, 5), ROL32in256(v2, 14) ); \
+     a01 = XOR256( a01, v2 ); \
+     a11i = XOR256( a11i, v2 ); \
+     a21i = XOR256( a21i, v2 ); \
+     v2 = XOR256( a02, XOR256( a12i, a22i ) ); \
+     v1 = XOR256( ROL32in256(v1, 5), ROL32in256(v1, 14) ); \
+     a02 = XOR256( a02, v1 ); \
+     a12i = XOR256( a12i, v1 ); \
+     a22i = XOR256( a22i, v1 ); \
+     v2 = XOR256( ROL32in256(v2, 5), ROL32in256(v2, 14) ); \
+     a03 = XOR256( a03, v2 ); \
+     a13i = XOR256( a13i, v2 ); \
+     a23i = XOR256( a23i, v2 ); \
+     Dump3("Theta",a); \
+     \
+     /* Rho-west: Plane shift */ \
+     a20i = ROL32in256(a20i, 11); \
+     a21i = ROL32in256(a21i, 11); \
+     a22i = ROL32in256(a22i, 11); \
+     a23i = ROL32in256(a23i, 11); \
+     Dump3("Rho-west",a); \
+     \
+     /* Iota: round constants */ \
+     a00 = XOR256( a00, CONST8_32(__rc)); \
+     Dump3("Iota",a); \
+     \
+     /* Chi: non-linear step, on columns */ \
+     a00 = XOR256( a00, ANDnu256( a10w, a20i ) ); \
+     a01 = XOR256( a01, ANDnu256( a11w, a21i ) ); \
+     a02 = XOR256( a02, ANDnu256( a12w, a22i ) ); \
+     a03 = XOR256( a03, ANDnu256( a13w, a23i ) ); \
+     a10w = XOR256( a10w, ANDnu256( a20i, a00 ) ); \
+     a11w = XOR256( a11w, ANDnu256( a21i, a01 ) ); \
+     a12w = XOR256( a12w, ANDnu256( a22i, a02 ) ); \
+     a13w = XOR256( a13w, ANDnu256( a23i, a03 ) ); \
+     a20i = XOR256( a20i, ANDnu256( a00, a10w ) ); \
+     a21i = XOR256( a21i, ANDnu256( a01, a11w ) ); \
+     a22i = XOR256( a22i, ANDnu256( a02, a12w ) ); \
+     a23i = XOR256( a23i, ANDnu256( a03, a13w ) ); \
+     Dump3("Chi",a); \
+     \
+     /* Rho-east: Plane shift */ \
+     a10w = ROL32in256(a10w, 1); \
+     a11w = ROL32in256(a11w, 1); \
+     a12w = ROL32in256(a12w, 1); \
+     a13w = ROL32in256(a13w, 1); \
+     a20i = ROL32in256_8(a20i); \
+     a21i = ROL32in256_8(a21i); \
+     a22i = ROL32in256_8(a22i); \
+     a23i = ROL32in256_8(a23i); \
+     Dump3("Rho-east",a)
+
+ void Xoodootimes8_PermuteAll_6rounds(void *argstates)
+ {
+     uint32_t * states = (uint32_t *)argstates;
+     DeclareVars;
+
+     State2Vars2;
+     Round( a12, a13, a10, a11, a11, a12, a13, a10, a20, a21, a22, a23, _rc6 );
+     Round( a11, a12, a13, a10, a10, a11, a12, a13, a22, a23, a20, a21, _rc5 );
+     Round( a10, a11, a12, a13, a13, a10, a11, a12, a20, a21, a22, a23, _rc4 );
+     Round( a13, a10, a11, a12, a12, a13, a10, a11, a22, a23, a20, a21, _rc3 );
+     Round( a12, a13, a10, a11, a11, a12, a13, a10, a20, a21, a22, a23, _rc2 );
+     Round( a11, a12, a13, a10, a10, a11, a12, a13, a22, a23, a20, a21, _rc1 );
+     //Dump1("Permutation\n", a);
+     Vars2State;
+ }
+
+ void Xoodootimes8_PermuteAll_12rounds(void *argstates)
+ {
+     uint32_t * states = (uint32_t *)argstates;
+     DeclareVars;
+
+     State2Vars;
+     Round( a10, a11, a12, a13, a13, a10, a11, a12, a20, a21, a22, a23, _rc12 );
+     Round( a13, a10, a11, a12, a12, a13, a10, a11, a22, a23, a20, a21, _rc11 );
+     Round( a12, a13, a10, a11, a11, a12, a13, a10, a20, a21, a22, a23, _rc10 );
+     Round( a11, a12, a13, a10, a10, a11, a12, a13, a22, a23, a20, a21, _rc9 );
+     Round( a10, a11, a12, a13, a13, a10, a11, a12, a20, a21, a22, a23, _rc8 );
+     Round( a13, a10, a11, a12, a12, a13, a10, a11, a22, a23, a20, a21, _rc7 );
+     Round( a12, a13, a10, a11, a11, a12, a13, a10, a20, a21, a22, a23, _rc6 );
+     Round( a11, a12, a13, a10, a10, a11, a12, a13, a22, a23, a20, a21, _rc5 );
+     Round( a10, a11, a12, a13, a13, a10, a11, a12, a20, a21, a22, a23, _rc4 );
+     Round( a13, a10, a11, a12, a12, a13, a10, a11, a22, a23, a20, a21, _rc3 );
+     Round( a12, a13, a10, a11, a11, a12, a13, a10, a20, a21, a22, a23, _rc2 );
+     Round( a11, a12, a13, a10, a10, a11, a12, a13, a22, a23, a20, a21, _rc1 );
+     //Dump1("Permutation\n", a);
+     Vars2State;
+ }
+
+ void Xooffftimes8_AddIs(unsigned char *output, const unsigned char *input, size_t bitLen)
+ {
+     size_t byteLen = bitLen / 8;
+     V256 lanes1, lanes2, lanes3, lanes4, lanes5, lanes6, lanes7, lanes8;
+
+     while ( byteLen >= 128 ) {
+         lanes1 = LOAD256u(input[ 0]);
+         lanes2 = LOAD256u(input[32]);
+         lanes3 = LOAD256u(input[64]);
+         lanes4 = LOAD256u(input[96]);
+         lanes5 = LOAD256u(output[ 0]);
+         lanes6 = LOAD256u(output[32]);
+         lanes7 = LOAD256u(output[64]);
+         lanes8 = LOAD256u(output[96]);
+         lanes1 = XOR256(lanes1, lanes5);
+         lanes2 = XOR256(lanes2, lanes6);
+         lanes3 = XOR256(lanes3, lanes7);
+         lanes4 = XOR256(lanes4, lanes8);
+         STORE256u(output[ 0], lanes1);
+         STORE256u(output[32], lanes2);
+         STORE256u(output[64], lanes3);
+         STORE256u(output[96], lanes4);
+         input += 128;
+         output += 128;
+         byteLen -= 128;
+     }
+     while ( byteLen >= 32 ) {
+         lanes1 = LOAD256u(input[0]);
+         lanes2 = LOAD256u(output[0]);
+         input += 32;
+         lanes1 = XOR256(lanes1, lanes2);
+         byteLen -= 32;
+         STORE256u(output[0], lanes1);
+         output += 32;
+     }
+     while ( byteLen >= 8 ) {
+         *((uint64_t*)output) ^= *((uint64_t*)input);
+         input += 8;
+         output += 8;
+         byteLen -= 8;
+     }
+     while ( byteLen-- != 0 ) {
+         *output++ ^= *input++;
+     }
+
+     bitLen &= 7;
+     if (bitLen != 0)
+     {
+         *output ^= *input;
+         *output &= (1 << bitLen) - 1;
+     }
+ }
+
+ size_t Xooffftimes8_CompressFastLoop(unsigned char *k, unsigned char *x, const unsigned char *input, size_t length)
+ {
+     DeclareVars;
+     uint32_t *k32 = (uint32_t*)k;
+     uint32_t *x32 = (uint32_t*)x;
+     uint32_t *i32 = (uint32_t*)input;
+     size_t initialLength;
+     V256 r04815926;
+     V256 r5926a37b;
+     V256 t;
+     V256 x00, x01, x02, x03, x10, x11, x12, x13, x20, x21, x22, x23;
+     V128 x4;
+     V256 shuffleR_1 = *(const V256*)oshuffleR_1;
+     V256 shuffleR_3 = *(const V256*)oshuffleR_3;
+     V256 shuffleR_5 = *(const V256*)oshuffleR_5;
+     V256 shuffleR_7 = *(const V256*)oshuffleR_7;
+
+     r04815926 = LOAD_GATHER8_32(LOAD8_32( 0, 4, 8, 1, 5, 9, 2, 6), k32);
+     r5926a37b = LOAD_GATHER8_32(LOAD8_32( 5, 9, 2, 6, 10, 3, 7, 11), k32);
+     t = LOAD8_32( 0*12, 1*12, 2*12, 3*12, 4*12, 5*12, 6*12, 7*12);
+
+     initialLength = length;
+
+     /* Clear x accumulator */
+     x00 = _mm256_setzero_si256();
+     x01 = _mm256_setzero_si256();
+     x02 = _mm256_setzero_si256();
+     x03 = _mm256_setzero_si256();
+     x10 = _mm256_setzero_si256();
+     x11 = _mm256_setzero_si256();
+     x12 = _mm256_setzero_si256();
+     x13 = _mm256_setzero_si256();
+     x20 = _mm256_setzero_si256();
+     x21 = _mm256_setzero_si256();
+     x22 = _mm256_setzero_si256();
+     x23 = _mm256_setzero_si256();
+
+     #define rCGKDHLEI r5926a37b
+     #define aCGKDHLEI ((uint32_t*)&rCGKDHLEI)
+     do {
+         /* Note that a10-a12 and a11-a13 are swapped */
+         a00 = r04815926;
+         a01 = _mm256_blend_epi32(SHUFFLE_LANES_RIGHT(r04815926, 3), SHUFFLE_LANES_RIGHT(r5926a37b, 7), 0xE0); /* 15926 */
+         a02 = SHUFFLE_LANES_RIGHT_2(r5926a37b); /* 26a37b */
+
+         a12 = INSERT_LANE(SHUFFLE_LANES_RIGHT(a00, 1), EXTRACT_LANE(a01, 5), 7); /* 4815926 A */
+
+         rCGKDHLEI = XOR256(a00, XOR256(SHL32in256(a00, 13), ROL32in256(a12, 3)));
+
+         a02 = _mm256_blend_epi32(a02, SHUFFLE_LANES_RIGHT_2(rCGKDHLEI), 0xC0);
+         a03 = _mm256_blend_epi32(SHUFFLE_LANES_RIGHT(a02, 3), SHUFFLE_LANES_RIGHT(rCGKDHLEI, 5), 0xF8);
+
+         a13 = INSERT_LANE(SHUFFLE_LANES_RIGHT(a01, 1), EXTRACT_LANE(a02, 5), 7); /* B */
+         a10 = INSERT_LANE(SHUFFLE_LANES_RIGHT(a02, 1), aCGKDHLEI[2], 7); /* K */
+         a11 = INSERT_LANE(SHUFFLE_LANES_RIGHT(a03, 1), aCGKDHLEI[5], 7); /* L */
+
+         a20 = INSERT_LANE(SHUFFLE_LANES_RIGHT(a12, 1), EXTRACT_LANE(a01, 6), 7); /* 815926A+3 */
+         a21 = INSERT_LANE(SHUFFLE_LANES_RIGHT(a13, 1), aCGKDHLEI[0], 7); /* C */
+         a22 = INSERT_LANE(SHUFFLE_LANES_RIGHT(a10, 1), aCGKDHLEI[3], 7); /* D */
+         a23 = INSERT_LANE(SHUFFLE_LANES_RIGHT(a11, 1), aCGKDHLEI[6], 7); /* E */
+         r04815926 = a22;
+         Dump("Roll-c", a);
+
+         a00 = XOR256( a00, LOAD_GATHER8_32(t, i32+0));
+         a01 = XOR256( a01, LOAD_GATHER8_32(t, i32+1));
+         a02 = XOR256( a02, LOAD_GATHER8_32(t, i32+2));
+         a03 = XOR256( a03, LOAD_GATHER8_32(t, i32+3));
+
+         a12 = XOR256( a12, LOAD_GATHER8_32(t, i32+4));
+         a13 = XOR256( a13, LOAD_GATHER8_32(t, i32+5));
+         a10 = XOR256( a10, LOAD_GATHER8_32(t, i32+6));
+         a11 = XOR256( a11, LOAD_GATHER8_32(t, i32+7));
+
+         a20 = XOR256( a20, LOAD_GATHER8_32(t, i32+8));
+         a21 = XOR256( a21, LOAD_GATHER8_32(t, i32+9));
+         a22 = XOR256( a22, LOAD_GATHER8_32(t, i32+10));
+         a23 = XOR256( a23, LOAD_GATHER8_32(t, i32+11));
+         Dump("Add input", a);
+
+         Round( a12, a13, a10, a11, a11, a12, a13, a10, a20, a21, a22, a23, _rc6 );
+         Round( a11, a12, a13, a10, a10, a11, a12, a13, a22, a23, a20, a21, _rc5 );
+         Round( a10, a11, a12, a13, a13, a10, a11, a12, a20, a21, a22, a23, _rc4 );
+         Round( a13, a10, a11, a12, a12, a13, a10, a11, a22, a23, a20, a21, _rc3 );
+         Round( a12, a13, a10, a11, a11, a12, a13, a10, a20, a21, a22, a23, _rc2 );
+         Round( a11, a12, a13, a10, a10, a11, a12, a13, a22, a23, a20, a21, _rc1 );
+         Dump("Xoodoo", a);
+
+         x00 = XOR256(x00, a00);
+         x01 = XOR256(x01, a01);
+         x02 = XOR256(x02, a02);
+         x03 = XOR256(x03, a03);
+         x10 = XOR256(x10, a10);
+         x11 = XOR256(x11, a11);
+         x12 = XOR256(x12, a12);
+         x13 = XOR256(x13, a13);
+         x20 = XOR256(x20, a20);
+         x21 = XOR256(x21, a21);
+         x22 = XOR256(x22, a22);
+         x23 = XOR256(x23, a23);
+         Dump("Accu x", x);
+
+         i32 += NLANES*8;
+         length -= NLANES*4*8;
+     }
+     while (length >= (NLANES*4*8));
+
+     /* Reduce from 8 to 4 lanes (x00 - x13), reduce from 4 to 2 lanes (x20 - x23) */
+     x00 = XOR256(x00, _mm256_permute4x64_epi64(x00, 0x4e));
+     x01 = XOR256(x01, _mm256_permute4x64_epi64(x01, 0x4e));
+     x02 = XOR256(x02, _mm256_permute4x64_epi64(x02, 0x4e));
+     x03 = XOR256(x03, _mm256_permute4x64_epi64(x03, 0x4e));
+     x10 = XOR256(x10, _mm256_permute4x64_epi64(x10, 0x4e));
+     x11 = XOR256(x11, _mm256_permute4x64_epi64(x11, 0x4e));
+     x12 = XOR256(x12, _mm256_permute4x64_epi64(x12, 0x4e));
+     x13 = XOR256(x13, _mm256_permute4x64_epi64(x13, 0x4e));
+     x20 = XOR256(x20, _mm256_permute4x64_epi64(x20, 0x4e));
+     x21 = XOR256(x21, _mm256_permute4x64_epi64(x21, 0x4e));
+     x22 = XOR256(x22, _mm256_permute4x64_epi64(x22, 0x4e));
+     x23 = XOR256(x23, _mm256_permute4x64_epi64(x23, 0x4e));
+     x00 = _mm256_permute2x128_si256( x00, x10, 0x20);
+     x01 = _mm256_permute2x128_si256( x01, x11, 0x20);
+     x02 = _mm256_permute2x128_si256( x02, x12, 0x20);
+     x03 = _mm256_permute2x128_si256( x03, x13, 0x20);
+     x20 = _mm256_permute2x128_si256( x20, x22, 0x20);
+     x21 = _mm256_permute2x128_si256( x21, x23, 0x20);
+
+     /* Reduce from 4 to 2 lanes (x00 - x03), reduce from 2 to 1 lane (x20 - x21) */
+     x00 = XOR256(x00, _mm256_permute4x64_epi64(x00, 0xB1));
+     x01 = XOR256(x01, _mm256_permute4x64_epi64(x01, 0xB1));
+     x02 = XOR256(x02, _mm256_permute4x64_epi64(x02, 0xB1));
+     x03 = XOR256(x03, _mm256_permute4x64_epi64(x03, 0xB1));
+     x20 = XOR256(x20, _mm256_permute4x64_epi64(x20, 0xB1));
+     x21 = XOR256(x21, _mm256_permute4x64_epi64(x21, 0xB1));
+     x00 = _mm256_blend_epi32( x00, x02, 0xCC);
+     x01 = _mm256_blend_epi32( x01, x03, 0xCC);
+     x20 = _mm256_blend_epi32( x20, x21, 0xCC);
+
+     /* Reduce from 2 to 1 lane (x00 - x01), 1 to half lane (x20) */
+     x00 = XOR256(x00, SHUFFLE_LANES_RIGHT(x00, 1));
+     x01 = XOR256(x01, SHUFFLE_LANES_RIGHT(x01, 1));
+     x20 = XOR256(x20, SHUFFLE_LANES_RIGHT(x20, 1));
+     x00 = _mm256_blend_epi32( x00, SHUFFLE_LANES_RIGHT(x01, 7), 0xAA);
+     x20 = _mm256_permutevar8x32_epi32( x20, *(V256*)shufflePack);
+
+     x00 = XOR256(x00, *(V256*)&x32[0]);
+     x4 = XOR128(_mm256_castsi256_si128(x20), *(V128*)&x32[8]);
+
+     STORE256u( *(V256*)&x32[0], x00);
+     STORE128u( *(V128*)&x32[8], x4);
+
+     /* Save new k from r04815926 and rCGKDHLEI */
+     k32[ 0] = _mm256_extract_epi32(r04815926, 0);
+     k32[ 1] = _mm256_extract_epi32(r04815926, 3);
+     k32[ 2] = _mm256_extract_epi32(rCGKDHLEI, 2); /* K */
+     k32[ 3] = _mm256_extract_epi32(rCGKDHLEI, 5); /* L */
+     k32[ 4] = _mm256_extract_epi32(r04815926, 1);
+     k32[ 5] = _mm256_extract_epi32(rCGKDHLEI, 0); /* C */
+     k32[ 6] = _mm256_extract_epi32(rCGKDHLEI, 3); /* D */
+     k32[ 7] = _mm256_extract_epi32(rCGKDHLEI, 6); /* E */
+     k32[ 8] = _mm256_extract_epi32(r04815926, 2);
+     k32[ 9] = _mm256_extract_epi32(rCGKDHLEI, 1); /* G */
+     k32[10] = _mm256_extract_epi32(rCGKDHLEI, 4); /* H */
+     k32[11] = _mm256_extract_epi32(rCGKDHLEI, 7); /* I */
+     #undef rCGKDHLEI
+
+     return initialLength - length;
+ }
+
+ size_t Xooffftimes8_ExpandFastLoop(unsigned char *yAccu, const unsigned char *kRoll, unsigned char *output, size_t length)
+ {
+     DeclareVars;
+     const uint32_t *k32 = (uint32_t*)kRoll;
+     uint32_t *y32 = (uint32_t*)yAccu;
+     uint32_t *o32 = (uint32_t*)output;
+     size_t initialLength;
+     V256 r04815926;
+     V256 r5926a37b;
+     V256 v3, v4;
+     V256 shuffleR_1 = *(const V256*)oshuffleR_1;
+     V256 shuffleR_3 = *(const V256*)oshuffleR_3;
+     V256 shuffleR_5 = *(const V256*)oshuffleR_5;
+     V256 shuffleR_7 = *(const V256*)oshuffleR_7;
+
+     r04815926 = LOAD_GATHER8_32(LOAD8_32( 0, 4, 8, 1, 5, 9, 2, 6), y32);
+     r5926a37b = LOAD_GATHER8_32(LOAD8_32( 5, 9, 2, 6, 10, 3, 7, 11), y32);
+
+     initialLength = length;
+
+     #define rCGKDHLEI r5926a37b
+     #define aCGKDHLEI ((uint32_t*)&rCGKDHLEI)
+     do {
+         a00 = r04815926;
+         a01 = _mm256_blend_epi32(SHUFFLE_LANES_RIGHT(r04815926, 3), SHUFFLE_LANES_RIGHT(r5926a37b, 7), 0xE0); /* 15926+A37 */
+         a02 = SHUFFLE_LANES_RIGHT_2(r5926a37b); /* 26a37b+-- */
+
+         a12 = INSERT_LANE(SHUFFLE_LANES_RIGHT(a00, 1), EXTRACT_LANE(a01, 5), 7); /* 4815926+A */
+         a20 = INSERT_LANE(SHUFFLE_LANES_RIGHT(a12, 1), EXTRACT_LANE(a01, 6), 7); /* 815926A+3 */
+
+         rCGKDHLEI = XOR256(ROL32in256(a00, 5), ROL32in256(a12, 13));
+         rCGKDHLEI = XOR256(rCGKDHLEI, AND256(a20, a12));
+         rCGKDHLEI = XOR256(rCGKDHLEI, CONST8_32(7));
+
+         a02 = _mm256_blend_epi32(a02, SHUFFLE_LANES_RIGHT_2(rCGKDHLEI), 0xC0);
+         a03 = _mm256_blend_epi32(SHUFFLE_LANES_RIGHT(a02, 3), SHUFFLE_LANES_RIGHT(rCGKDHLEI, 5), 0xF8);
+
+         a13 = INSERT_LANE(SHUFFLE_LANES_RIGHT(a01, 1), EXTRACT_LANE(a02, 5), 7); /* B */
+         a10 = INSERT_LANE(SHUFFLE_LANES_RIGHT(a02, 1), aCGKDHLEI[2], 7); /* K */
+         a11 = INSERT_LANE(SHUFFLE_LANES_RIGHT(a03, 1), aCGKDHLEI[5], 7); /* L */
+
+         a21 = INSERT_LANE(SHUFFLE_LANES_RIGHT(a13, 1), aCGKDHLEI[0], 7); /* C */
+         a22 = INSERT_LANE(SHUFFLE_LANES_RIGHT(a10, 1), aCGKDHLEI[3], 7); /* D */
+         a23 = INSERT_LANE(SHUFFLE_LANES_RIGHT(a11, 1), aCGKDHLEI[6], 7); /* E */
+         r04815926 = a22;
+         Dump("Roll-e", a);
+
+         Round( a12, a13, a10, a11, a11, a12, a13, a10, a20, a21, a22, a23, _rc6 );
+         Round( a11, a12, a13, a10, a10, a11, a12, a13, a22, a23, a20, a21, _rc5 );
+         Round( a10, a11, a12, a13, a13, a10, a11, a12, a20, a21, a22, a23, _rc4 );
+         Round( a13, a10, a11, a12, a12, a13, a10, a11, a22, a23, a20, a21, _rc3 );
+         Round( a12, a13, a10, a11, a11, a12, a13, a10, a20, a21, a22, a23, _rc2 );
+         Round( a11, a12, a13, a10, a10, a11, a12, a13, a22, a23, a20, a21, _rc1 );
+         Dump("Xoodoo(y)", a);
+
+         a00 = XOR256(a00, CONST8_32(k32[0]));
+         a01 = XOR256(a01, CONST8_32(k32[1]));
+         a02 = XOR256(a02, CONST8_32(k32[2]));
+         a03 = XOR256(a03, CONST8_32(k32[3]));
+         a10 = XOR256(a10, CONST8_32(k32[4]));
+         a11 = XOR256(a11, CONST8_32(k32[5]));
+         a12 = XOR256(a12, CONST8_32(k32[6]));
+         a13 = XOR256(a13, CONST8_32(k32[7]));
+         a20 = XOR256(a20, CONST8_32(k32[8]));
+         a21 = XOR256(a21, CONST8_32(k32[9]));
+         a22 = XOR256(a22, CONST8_32(k32[10]));
+         a23 = XOR256(a23, CONST8_32(k32[11]));
+         Dump("Xoodoo(y) + kRoll", a);
+
+         /* Extract */
+         #define UNPACKL32(a, b) _mm256_unpacklo_epi32(a, b)
+         #define UNPACKH32(a, b) _mm256_unpackhi_epi32(a, b)
+         #define UNPACKL64(a, b) _mm256_unpacklo_epi64(a, b)
+         #define UNPACKH64(a, b) _mm256_unpackhi_epi64(a, b)
+         #define UNPACKL128(a, b) _mm256_permute2x128_si256(a, b, 0x20)
+         #define UNPACKH128(a, b) _mm256_permute2x128_si256(a, b, 0x31)
+         #define lanesL01 v1
+         #define lanesH01 v2
+         #define lanesL23 v3
+         #define lanesH23 v4
+
+         lanesL01 = UNPACKL32( a00, a01 );
+         lanesH01 = UNPACKH32( a00, a01 );
+         lanesL23 = UNPACKL32( a02, a03 );
+         lanesH23 = UNPACKH32( a02, a03 );
+         a00 = UNPACKL64( lanesL01, lanesL23 );
+         a01 = UNPACKH64( lanesL01, lanesL23 );
+         a02 = UNPACKL64( lanesH01, lanesH23 );
+         a03 = UNPACKH64( lanesH01, lanesH23 );
+
+         lanesL01 = UNPACKL32( a10, a11 );
+         lanesH01 = UNPACKH32( a10, a11 );
+         lanesL23 = UNPACKL32( a12, a13 );
+         lanesH23 = UNPACKH32( a12, a13 );
+         a10 = UNPACKL64( lanesL01, lanesL23 );
+         a11 = UNPACKH64( lanesL01, lanesL23 );
+         a12 = UNPACKL64( lanesH01, lanesH23 );
+         a13 = UNPACKH64( lanesH01, lanesH23 );
+
+         lanesL01 = UNPACKL128( a00, a10 );
+         lanesH01 = UNPACKH128( a00, a10 );
+         lanesL23 = UNPACKL128( a01, a11 );
+         lanesH23 = UNPACKH128( a01, a11 );
+         STORE256u(o32[0*12+0], lanesL01);
+         STORE256u(o32[4*12+0], lanesH01);
+         STORE256u(o32[1*12+0], lanesL23);
+         STORE256u(o32[5*12+0], lanesH23);
+
+         lanesL01 = UNPACKL128( a02, a12 );
+         lanesH01 = UNPACKH128( a02, a12 );
+         lanesL23 = UNPACKL128( a03, a13 );
+         lanesH23 = UNPACKH128( a03, a13 );
+         STORE256u(o32[2*12+0], lanesL01);
+         STORE256u(o32[6*12+0], lanesH01);
+         STORE256u(o32[3*12+0], lanesL23);
+         STORE256u(o32[7*12+0], lanesH23);
+
+         lanesL01 = UNPACKL32( a20, a21 );
+         lanesH01 = UNPACKH32( a20, a21 );
+         lanesL23 = UNPACKL32( a22, a23 );
+         lanesH23 = UNPACKH32( a22, a23 );
+         a20 = UNPACKL64( lanesL01, lanesL23 );
+         a21 = UNPACKH64( lanesL01, lanesL23 );
+         a22 = UNPACKL64( lanesH01, lanesH23 );
+         a23 = UNPACKH64( lanesH01, lanesH23 );
+         _mm256_storeu2_m128i((__m128i*)(o32+4*12+8), (__m128i*)(o32+0*12+8), a20);
+         _mm256_storeu2_m128i((__m128i*)(o32+5*12+8), (__m128i*)(o32+1*12+8), a21);
+         _mm256_storeu2_m128i((__m128i*)(o32+6*12+8), (__m128i*)(o32+2*12+8), a22);
+         _mm256_storeu2_m128i((__m128i*)(o32+7*12+8), (__m128i*)(o32+3*12+8), a23);
+         Dump("shuffle", a);
+
+         o32 += NLANES*8;
+         length -= NLANES*4*8;
+     }
+     while (length >= (NLANES*4*8));
+
+     /* Save new y from r04815926 and rCGKDHLEI */
+     y32[ 0] = _mm256_extract_epi32(r04815926, 0);
+     y32[ 1] = _mm256_extract_epi32(r04815926, 3);
+     y32[ 2] = _mm256_extract_epi32(rCGKDHLEI, 2); /* K */
+     y32[ 3] = _mm256_extract_epi32(rCGKDHLEI, 5); /* L */
+     y32[ 4] = _mm256_extract_epi32(r04815926, 1);
+     y32[ 5] = _mm256_extract_epi32(rCGKDHLEI, 0); /* C */
+     y32[ 6] = _mm256_extract_epi32(rCGKDHLEI, 3); /* D */
+     y32[ 7] = _mm256_extract_epi32(rCGKDHLEI, 6); /* E */
+     y32[ 8] = _mm256_extract_epi32(r04815926, 2);
+     y32[ 9] = _mm256_extract_epi32(rCGKDHLEI, 1); /* G */
+     y32[10] = _mm256_extract_epi32(rCGKDHLEI, 4); /* H */
+     y32[11] = _mm256_extract_epi32(rCGKDHLEI, 7); /* I */
+     #undef rCGKDHLEI
+
+     return initialLength - length;
+ }
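The Xoodootimes8_* functions above form the 8-way parallel SnP interface for Xoodoo; the Xooffftimes8_* fast loops at the end build on the same state layout for parallel Xoofff. A minimal driver sketch follows, assuming only what this file already uses, namely that Xoodoo-times8-SnP.h defines Xoodootimes8_statesSizeInBytes as a compile-time constant and that 32-byte alignment satisfies the aligned 256-bit loads.

/* Sketch only: feeds one 48-byte (12-lane) block into each of the 8 interleaved
 * Xoodoo states, runs the 12-round permutation on all of them, and reads the
 * permuted states back out. */
#include "align.h"
#include "Xoodoo-times8-SnP.h"

void xoodoo_times8_example(const unsigned char in[8 * 48], unsigned char out[8 * 48])
{
    ALIGN(32) unsigned char states[Xoodootimes8_statesSizeInBytes];

    Xoodootimes8_InitializeAll(states);
    /* laneCount = 12 four-byte lanes per instance; laneOffset = 12 lanes between
     * consecutive instances, i.e. the eight 48-byte blocks are packed back to back. */
    Xoodootimes8_AddLanesAll(states, in, 12, 12);
    Xoodootimes8_PermuteAll_12rounds(states);
    Xoodootimes8_ExtractLanesAll(states, out, 12, 12);
}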