sleeping_kangaroo12 0.0.3 → 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (291) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +34 -67
  3. data/ext/Rakefile +12 -37
  4. data/ext/binding/sleeping_kangaroo12.c +1 -16
  5. data/ext/{xkcp → k12}/Makefile +0 -0
  6. data/ext/k12/Makefile.build +118 -0
  7. data/ext/k12/README.markdown +86 -0
  8. data/ext/k12/lib/ARMv8Asha3/KeccakP-1600-ARMv8Asha3.S +623 -0
  9. data/ext/k12/lib/ARMv8Asha3/KeccakP-1600-SnP.h +65 -0
  10. data/ext/k12/lib/ARMv8Asha3/KeccakP-1600-opt64.c +227 -0
  11. data/ext/{xkcp/lib/low/KeccakP-1600/compact → k12/lib/Inplace32BI}/KeccakP-1600-SnP.h +4 -9
  12. data/ext/{xkcp/lib/low/KeccakP-1600/plain-32bits-inplace → k12/lib/Inplace32BI}/KeccakP-1600-inplace32BI.c +65 -160
  13. data/ext/k12/lib/KangarooTwelve.c +332 -0
  14. data/ext/{xkcp/lib/high/KangarooTwelve → k12/lib}/KangarooTwelve.h +53 -16
  15. data/ext/{xkcp/lib/low/KeccakP-1600/AVX2 → k12/lib/Optimized64}/KeccakP-1600-AVX2.s +122 -558
  16. data/ext/k12/lib/Optimized64/KeccakP-1600-AVX512-plainC.c +241 -0
  17. data/ext/k12/lib/Optimized64/KeccakP-1600-AVX512.s +551 -0
  18. data/ext/k12/lib/Optimized64/KeccakP-1600-SnP.h +74 -0
  19. data/ext/{xkcp/lib/low/KeccakP-1600/common/KeccakP-1600-64.macros → k12/lib/Optimized64/KeccakP-1600-opt64.c} +447 -169
  20. data/ext/k12/lib/Optimized64/KeccakP-1600-runtimeDispatch.c +406 -0
  21. data/ext/k12/lib/Optimized64/KeccakP-1600-timesN-AVX2.c +419 -0
  22. data/ext/k12/lib/Optimized64/KeccakP-1600-timesN-AVX512.c +458 -0
  23. data/ext/k12/lib/Optimized64/KeccakP-1600-timesN-SSSE3.c +438 -0
  24. data/ext/{xkcp/lib/low/KeccakP-1600/plain-64bits → k12/lib/Plain64}/KeccakP-1600-SnP.h +14 -20
  25. data/ext/{xkcp/lib/low/KeccakP-1600/ref-64bits/KeccakP-1600-reference.h → k12/lib/Plain64/KeccakP-1600-plain64.c} +9 -8
  26. data/ext/{xkcp/lib/common → k12/lib}/align.h +3 -2
  27. data/ext/{xkcp/lib/common → k12/lib}/brg_endian.h +0 -0
  28. data/ext/{xkcp → k12}/support/Build/ExpandProducts.xsl +0 -0
  29. data/ext/{xkcp → k12}/support/Build/ToGlobalMakefile.xsl +0 -0
  30. data/ext/{xkcp → k12}/support/Build/ToOneTarget.xsl +0 -0
  31. data/ext/{xkcp → k12}/support/Build/ToTargetConfigFile.xsl +0 -0
  32. data/ext/{xkcp → k12}/support/Build/ToTargetMakefile.xsl +10 -16
  33. data/ext/{xkcp → k12}/support/Build/ToVCXProj.xsl +0 -0
  34. data/lib/sleeping_kangaroo12/version.rb +1 -1
  35. metadata +33 -276
  36. data/ext/config/xkcp.build +0 -17
  37. data/ext/xkcp/LICENSE +0 -1
  38. data/ext/xkcp/Makefile.build +0 -200
  39. data/ext/xkcp/README.markdown +0 -296
  40. data/ext/xkcp/lib/HighLevel.build +0 -143
  41. data/ext/xkcp/lib/LowLevel.build +0 -757
  42. data/ext/xkcp/lib/high/KangarooTwelve/KangarooTwelve.c +0 -301
  43. data/ext/xkcp/lib/high/Keccak/FIPS202/KeccakHash.c +0 -81
  44. data/ext/xkcp/lib/high/Keccak/FIPS202/KeccakHash.h +0 -125
  45. data/ext/xkcp/lib/high/Keccak/FIPS202/SimpleFIPS202.c +0 -48
  46. data/ext/xkcp/lib/high/Keccak/FIPS202/SimpleFIPS202.h +0 -79
  47. data/ext/xkcp/lib/high/Keccak/KeccakDuplex.c +0 -81
  48. data/ext/xkcp/lib/high/Keccak/KeccakDuplex.h +0 -73
  49. data/ext/xkcp/lib/high/Keccak/KeccakDuplex.inc +0 -195
  50. data/ext/xkcp/lib/high/Keccak/KeccakSponge.c +0 -111
  51. data/ext/xkcp/lib/high/Keccak/KeccakSponge.h +0 -76
  52. data/ext/xkcp/lib/high/Keccak/KeccakSponge.inc +0 -314
  53. data/ext/xkcp/lib/high/Keccak/PRG/KeccakPRG.c +0 -61
  54. data/ext/xkcp/lib/high/Keccak/PRG/KeccakPRG.h +0 -67
  55. data/ext/xkcp/lib/high/Keccak/PRG/KeccakPRG.inc +0 -128
  56. data/ext/xkcp/lib/high/Keccak/SP800-185/SP800-185.c +0 -93
  57. data/ext/xkcp/lib/high/Keccak/SP800-185/SP800-185.h +0 -599
  58. data/ext/xkcp/lib/high/Keccak/SP800-185/SP800-185.inc +0 -573
  59. data/ext/xkcp/lib/high/Ketje/Ketjev2.c +0 -87
  60. data/ext/xkcp/lib/high/Ketje/Ketjev2.h +0 -88
  61. data/ext/xkcp/lib/high/Ketje/Ketjev2.inc +0 -274
  62. data/ext/xkcp/lib/high/Keyak/Keyakv2.c +0 -132
  63. data/ext/xkcp/lib/high/Keyak/Keyakv2.h +0 -217
  64. data/ext/xkcp/lib/high/Keyak/Keyakv2.inc +0 -81
  65. data/ext/xkcp/lib/high/Keyak/Motorist.inc +0 -953
  66. data/ext/xkcp/lib/high/Kravatte/Kravatte.c +0 -533
  67. data/ext/xkcp/lib/high/Kravatte/Kravatte.h +0 -115
  68. data/ext/xkcp/lib/high/Kravatte/KravatteModes.c +0 -557
  69. data/ext/xkcp/lib/high/Kravatte/KravatteModes.h +0 -247
  70. data/ext/xkcp/lib/high/Xoodyak/Cyclist.h +0 -66
  71. data/ext/xkcp/lib/high/Xoodyak/Cyclist.inc +0 -336
  72. data/ext/xkcp/lib/high/Xoodyak/Xoodyak-parameters.h +0 -26
  73. data/ext/xkcp/lib/high/Xoodyak/Xoodyak.c +0 -55
  74. data/ext/xkcp/lib/high/Xoodyak/Xoodyak.h +0 -35
  75. data/ext/xkcp/lib/high/Xoofff/Xoofff.c +0 -634
  76. data/ext/xkcp/lib/high/Xoofff/Xoofff.h +0 -147
  77. data/ext/xkcp/lib/high/Xoofff/XoofffModes.c +0 -483
  78. data/ext/xkcp/lib/high/Xoofff/XoofffModes.h +0 -241
  79. data/ext/xkcp/lib/high/common/Phases.h +0 -25
  80. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-SnP.h +0 -41
  81. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-inplace-32bi-armv6m-le-armcc.s +0 -1666
  82. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-inplace-32bi-armv6m-le-gcc.s +0 -1655
  83. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-inplace-32bi-armv7a-le-armcc.s +0 -1268
  84. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-inplace-32bi-armv7a-le-gcc.s +0 -1264
  85. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-inplace-32bi-armv7m-le-armcc.s +0 -1178
  86. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-inplace-32bi-armv7m-le-gcc.s +0 -1175
  87. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-u1-32bi-armv6m-le-armcc.s +0 -1338
  88. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-u1-32bi-armv6m-le-gcc.s +0 -1336
  89. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-u2-32bi-armv6m-le-armcc.s +0 -1343
  90. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-u2-32bi-armv6m-le-gcc.s +0 -1339
  91. data/ext/xkcp/lib/low/KeccakP-1600/ARMv7A-NEON/KeccakP-1600-SnP.h +0 -42
  92. data/ext/xkcp/lib/low/KeccakP-1600/ARMv7A-NEON/KeccakP-1600-armv7a-le-neon-armcc.s +0 -823
  93. data/ext/xkcp/lib/low/KeccakP-1600/ARMv7A-NEON/KeccakP-1600-armv7a-le-neon-gcc.s +0 -831
  94. data/ext/xkcp/lib/low/KeccakP-1600/ARMv8A/KeccakP-1600-SnP.h +0 -31
  95. data/ext/xkcp/lib/low/KeccakP-1600/ARMv8A/KeccakP-1600-armv8a-neon.s +0 -540
  96. data/ext/xkcp/lib/low/KeccakP-1600/AVR8/KeccakP-1600-SnP.h +0 -42
  97. data/ext/xkcp/lib/low/KeccakP-1600/AVR8/KeccakP-1600-avr8-compact.s +0 -733
  98. data/ext/xkcp/lib/low/KeccakP-1600/AVR8/KeccakP-1600-avr8-fast.s +0 -1121
  99. data/ext/xkcp/lib/low/KeccakP-1600/AVX2/KeccakP-1600-SnP.h +0 -52
  100. data/ext/xkcp/lib/low/KeccakP-1600/AVX512/C/KeccakP-1600-AVX512.c +0 -623
  101. data/ext/xkcp/lib/low/KeccakP-1600/AVX512/C/KeccakP-1600-SnP.h +0 -47
  102. data/ext/xkcp/lib/low/KeccakP-1600/AVX512/C/u12/KeccakP-1600-AVX512-config.h +0 -6
  103. data/ext/xkcp/lib/low/KeccakP-1600/AVX512/C/u6/KeccakP-1600-AVX512-config.h +0 -6
  104. data/ext/xkcp/lib/low/KeccakP-1600/AVX512/C/ua/KeccakP-1600-AVX512-config.h +0 -6
  105. data/ext/xkcp/lib/low/KeccakP-1600/AVX512/KeccakP-1600-AVX512.s +0 -1031
  106. data/ext/xkcp/lib/low/KeccakP-1600/AVX512/KeccakP-1600-SnP.h +0 -53
  107. data/ext/xkcp/lib/low/KeccakP-1600/XOP/KeccakP-1600-SnP.h +0 -44
  108. data/ext/xkcp/lib/low/KeccakP-1600/XOP/KeccakP-1600-XOP.c +0 -476
  109. data/ext/xkcp/lib/low/KeccakP-1600/XOP/u6/KeccakP-1600-XOP-config.h +0 -6
  110. data/ext/xkcp/lib/low/KeccakP-1600/XOP/ua/KeccakP-1600-XOP-config.h +0 -6
  111. data/ext/xkcp/lib/low/KeccakP-1600/common/KeccakP-1600-unrolling.macros +0 -305
  112. data/ext/xkcp/lib/low/KeccakP-1600/compact/KeccakP-1600-compact64.c +0 -420
  113. data/ext/xkcp/lib/low/KeccakP-1600/plain-32bits-inplace/KeccakP-1600-SnP.h +0 -43
  114. data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/KeccakP-1600-opt64.c +0 -565
  115. data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/lcu6/KeccakP-1600-opt64-config.h +0 -7
  116. data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/lcua/KeccakP-1600-opt64-config.h +0 -7
  117. data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/lcua-shld/KeccakP-1600-opt64-config.h +0 -8
  118. data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/u6/KeccakP-1600-opt64-config.h +0 -6
  119. data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/ua/KeccakP-1600-opt64-config.h +0 -6
  120. data/ext/xkcp/lib/low/KeccakP-1600/ref-32bits/KeccakP-1600-SnP.h +0 -44
  121. data/ext/xkcp/lib/low/KeccakP-1600/ref-32bits/KeccakP-1600-reference.h +0 -23
  122. data/ext/xkcp/lib/low/KeccakP-1600/ref-32bits/KeccakP-1600-reference32BI.c +0 -625
  123. data/ext/xkcp/lib/low/KeccakP-1600/ref-64bits/KeccakP-1600-SnP.h +0 -44
  124. data/ext/xkcp/lib/low/KeccakP-1600/ref-64bits/KeccakP-1600-reference.c +0 -440
  125. data/ext/xkcp/lib/low/KeccakP-1600/x86-64/KeccakP-1600-SnP.h +0 -42
  126. data/ext/xkcp/lib/low/KeccakP-1600/x86-64/KeccakP-1600-x86-64-gas.s +0 -1196
  127. data/ext/xkcp/lib/low/KeccakP-1600/x86-64/KeccakP-1600-x86-64-gas_Apple.s +0 -1124
  128. data/ext/xkcp/lib/low/KeccakP-1600/x86-64/KeccakP-1600-x86-64-shld-gas.s +0 -1196
  129. data/ext/xkcp/lib/low/KeccakP-1600-times2/ARMv7A-NEON/KeccakP-1600-inplace-pl2-armv7a-neon-le-armcc.s +0 -1392
  130. data/ext/xkcp/lib/low/KeccakP-1600-times2/ARMv7A-NEON/KeccakP-1600-inplace-pl2-armv7a-neon-le-gcc.s +0 -1394
  131. data/ext/xkcp/lib/low/KeccakP-1600-times2/ARMv7A-NEON/KeccakP-1600-times2-SnP.h +0 -42
  132. data/ext/xkcp/lib/low/KeccakP-1600-times2/AVX512/AVX512u12/SIMD512-2-config.h +0 -7
  133. data/ext/xkcp/lib/low/KeccakP-1600-times2/AVX512/AVX512u4/SIMD512-2-config.h +0 -7
  134. data/ext/xkcp/lib/low/KeccakP-1600-times2/AVX512/AVX512ufull/SIMD512-2-config.h +0 -7
  135. data/ext/xkcp/lib/low/KeccakP-1600-times2/AVX512/KeccakP-1600-times2-SIMD512.c +0 -850
  136. data/ext/xkcp/lib/low/KeccakP-1600-times2/AVX512/KeccakP-1600-times2-SnP.h +0 -51
  137. data/ext/xkcp/lib/low/KeccakP-1600-times2/SIMD128/KeccakP-1600-times2-SIMD128.c +0 -957
  138. data/ext/xkcp/lib/low/KeccakP-1600-times2/SIMD128/KeccakP-1600-times2-SnP.h +0 -49
  139. data/ext/xkcp/lib/low/KeccakP-1600-times2/SIMD128/SSSE3-u2/SIMD128-config.h +0 -8
  140. data/ext/xkcp/lib/low/KeccakP-1600-times2/SIMD128/SSSE3-ua/SIMD128-config.h +0 -8
  141. data/ext/xkcp/lib/low/KeccakP-1600-times2/SIMD128/XOP-u2/SIMD128-config.h +0 -9
  142. data/ext/xkcp/lib/low/KeccakP-1600-times2/SIMD128/XOP-ua/SIMD128-config.h +0 -9
  143. data/ext/xkcp/lib/low/KeccakP-1600-times2/fallback-on1/KeccakP-1600-times2-SnP.h +0 -45
  144. data/ext/xkcp/lib/low/KeccakP-1600-times2/fallback-on1/KeccakP-1600-times2-on1.c +0 -37
  145. data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX2/KeccakP-1600-times4-SIMD256.c +0 -1321
  146. data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX2/KeccakP-1600-times4-SnP.h +0 -55
  147. data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX2/u12/SIMD256-config.h +0 -7
  148. data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX2/u6/SIMD256-config.h +0 -7
  149. data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX2/ua/SIMD256-config.h +0 -7
  150. data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX512/AVX512u12/SIMD512-4-config.h +0 -7
  151. data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX512/AVX512u4/SIMD512-4-config.h +0 -7
  152. data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX512/AVX512ufull/SIMD512-4-config.h +0 -7
  153. data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX512/KeccakP-1600-times4-SIMD512.c +0 -881
  154. data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX512/KeccakP-1600-times4-SnP.h +0 -51
  155. data/ext/xkcp/lib/low/KeccakP-1600-times4/fallback-on1/KeccakP-1600-times4-SnP.h +0 -45
  156. data/ext/xkcp/lib/low/KeccakP-1600-times4/fallback-on1/KeccakP-1600-times4-on1.c +0 -37
  157. data/ext/xkcp/lib/low/KeccakP-1600-times4/fallback-on2/KeccakP-1600-times4-SnP.h +0 -45
  158. data/ext/xkcp/lib/low/KeccakP-1600-times4/fallback-on2/KeccakP-1600-times4-on2.c +0 -38
  159. data/ext/xkcp/lib/low/KeccakP-1600-times8/AVX512/KeccakP-1600-times8-SIMD512.c +0 -1615
  160. data/ext/xkcp/lib/low/KeccakP-1600-times8/AVX512/KeccakP-1600-times8-SnP.h +0 -57
  161. data/ext/xkcp/lib/low/KeccakP-1600-times8/AVX512/u12/SIMD512-config.h +0 -7
  162. data/ext/xkcp/lib/low/KeccakP-1600-times8/AVX512/u4/SIMD512-config.h +0 -7
  163. data/ext/xkcp/lib/low/KeccakP-1600-times8/AVX512/ua/SIMD512-config.h +0 -7
  164. data/ext/xkcp/lib/low/KeccakP-1600-times8/fallback-on1/KeccakP-1600-times8-SnP.h +0 -45
  165. data/ext/xkcp/lib/low/KeccakP-1600-times8/fallback-on1/KeccakP-1600-times8-on1.c +0 -37
  166. data/ext/xkcp/lib/low/KeccakP-1600-times8/fallback-on2/KeccakP-1600-times8-SnP.h +0 -45
  167. data/ext/xkcp/lib/low/KeccakP-1600-times8/fallback-on2/KeccakP-1600-times8-on2.c +0 -38
  168. data/ext/xkcp/lib/low/KeccakP-1600-times8/fallback-on4/KeccakP-1600-times8-SnP.h +0 -45
  169. data/ext/xkcp/lib/low/KeccakP-1600-times8/fallback-on4/KeccakP-1600-times8-on4.c +0 -38
  170. data/ext/xkcp/lib/low/KeccakP-200/ARM/KeccakP-200-SnP.h +0 -41
  171. data/ext/xkcp/lib/low/KeccakP-200/ARM/KeccakP-200-armv6m-le-armcc.s +0 -442
  172. data/ext/xkcp/lib/low/KeccakP-200/ARM/KeccakP-200-armv6m-le-gcc.s +0 -446
  173. data/ext/xkcp/lib/low/KeccakP-200/ARM/KeccakP-200-armv7m-le-armcc.s +0 -419
  174. data/ext/xkcp/lib/low/KeccakP-200/ARM/KeccakP-200-armv7m-le-gcc.s +0 -427
  175. data/ext/xkcp/lib/low/KeccakP-200/AVR8/KeccakP-200-SnP.h +0 -41
  176. data/ext/xkcp/lib/low/KeccakP-200/AVR8/KeccakP-200-avr8-fast.s +0 -647
  177. data/ext/xkcp/lib/low/KeccakP-200/compact/KeccakP-200-SnP.h +0 -39
  178. data/ext/xkcp/lib/low/KeccakP-200/compact/KeccakP-200-compact.c +0 -190
  179. data/ext/xkcp/lib/low/KeccakP-200/ref/KeccakP-200-SnP.h +0 -43
  180. data/ext/xkcp/lib/low/KeccakP-200/ref/KeccakP-200-reference.c +0 -412
  181. data/ext/xkcp/lib/low/KeccakP-200/ref/KeccakP-200-reference.h +0 -23
  182. data/ext/xkcp/lib/low/KeccakP-400/ARM/KeccakP-400-SnP.h +0 -41
  183. data/ext/xkcp/lib/low/KeccakP-400/ARM/KeccakP-400-armv6m-le-armcc.s +0 -454
  184. data/ext/xkcp/lib/low/KeccakP-400/ARM/KeccakP-400-armv6m-le-gcc.s +0 -458
  185. data/ext/xkcp/lib/low/KeccakP-400/ARM/KeccakP-400-armv7m-le-armcc.s +0 -455
  186. data/ext/xkcp/lib/low/KeccakP-400/ARM/KeccakP-400-armv7m-le-gcc.s +0 -458
  187. data/ext/xkcp/lib/low/KeccakP-400/AVR8/KeccakP-400-SnP.h +0 -41
  188. data/ext/xkcp/lib/low/KeccakP-400/AVR8/KeccakP-400-avr8-fast.s +0 -728
  189. data/ext/xkcp/lib/low/KeccakP-400/ref/KeccakP-400-SnP.h +0 -43
  190. data/ext/xkcp/lib/low/KeccakP-400/ref/KeccakP-400-reference.c +0 -414
  191. data/ext/xkcp/lib/low/KeccakP-400/ref/KeccakP-400-reference.h +0 -23
  192. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-SnP.h +0 -42
  193. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u1-armv6m-le-armcc.s +0 -527
  194. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u1-armv6m-le-gcc.s +0 -533
  195. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u2-armv6m-le-armcc.s +0 -528
  196. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u2-armv6m-le-gcc.s +0 -534
  197. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u2-armv7a-le-armcc.s +0 -521
  198. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u2-armv7a-le-gcc.s +0 -527
  199. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u2-armv7m-le-armcc.s +0 -517
  200. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u2-armv7m-le-gcc.s +0 -523
  201. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-uf-armv7m-le-armcc.s +0 -550
  202. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-uf-armv7m-le-gcc.s +0 -556
  203. data/ext/xkcp/lib/low/KeccakP-800/ARMv8A/KeccakP-800-SnP.h +0 -32
  204. data/ext/xkcp/lib/low/KeccakP-800/ARMv8A/KeccakP-800-armv8a-neon.s +0 -432
  205. data/ext/xkcp/lib/low/KeccakP-800/AVR8/KeccakP-800-SnP.h +0 -42
  206. data/ext/xkcp/lib/low/KeccakP-800/AVR8/KeccakP-800-avr8-fast.s +0 -929
  207. data/ext/xkcp/lib/low/KeccakP-800/compact/KeccakP-800-SnP.h +0 -40
  208. data/ext/xkcp/lib/low/KeccakP-800/compact/KeccakP-800-compact.c +0 -244
  209. data/ext/xkcp/lib/low/KeccakP-800/plain/KeccakP-800-SnP.h +0 -46
  210. data/ext/xkcp/lib/low/KeccakP-800/plain/KeccakP-800-opt32-bis.macros +0 -184
  211. data/ext/xkcp/lib/low/KeccakP-800/plain/KeccakP-800-opt32.c +0 -454
  212. data/ext/xkcp/lib/low/KeccakP-800/plain/KeccakP-800-opt32.macros +0 -459
  213. data/ext/xkcp/lib/low/KeccakP-800/plain/KeccakP-800-unrolling-bis.macros +0 -83
  214. data/ext/xkcp/lib/low/KeccakP-800/plain/KeccakP-800-unrolling.macros +0 -88
  215. data/ext/xkcp/lib/low/KeccakP-800/plain/lcu2/KeccakP-800-opt32-config.h +0 -7
  216. data/ext/xkcp/lib/low/KeccakP-800/plain/lcua/KeccakP-800-opt32-config.h +0 -7
  217. data/ext/xkcp/lib/low/KeccakP-800/plain/u2/KeccakP-800-opt32-config.h +0 -7
  218. data/ext/xkcp/lib/low/KeccakP-800/plain/ua/KeccakP-800-opt32-config.h +0 -7
  219. data/ext/xkcp/lib/low/KeccakP-800/ref/KeccakP-800-SnP.h +0 -44
  220. data/ext/xkcp/lib/low/KeccakP-800/ref/KeccakP-800-reference.c +0 -437
  221. data/ext/xkcp/lib/low/KeccakP-800/ref/KeccakP-800-reference.h +0 -23
  222. data/ext/xkcp/lib/low/Ketje/OptimizedAsmARM/Ket.h +0 -57
  223. data/ext/xkcp/lib/low/Ketje/OptimizedAsmARM/KetjeJr-armv7m-le-armcc.s +0 -475
  224. data/ext/xkcp/lib/low/Ketje/OptimizedAsmARM/KetjeJr-armv7m-le-gcc.s +0 -480
  225. data/ext/xkcp/lib/low/Ketje/OptimizedAsmARM/KetjeSr-armv7m-le-armcc.s +0 -590
  226. data/ext/xkcp/lib/low/Ketje/OptimizedAsmARM/KetjeSr-armv7m-le-gcc.s +0 -590
  227. data/ext/xkcp/lib/low/Ketje/OptimizedLE/Ket.c +0 -126
  228. data/ext/xkcp/lib/low/Ketje/OptimizedLE/Ket.h +0 -68
  229. data/ext/xkcp/lib/low/Ketje/OptimizedLE/Ket.inc +0 -174
  230. data/ext/xkcp/lib/low/Ketje/SnP-compliant/Ket.c +0 -80
  231. data/ext/xkcp/lib/low/Ketje/SnP-compliant/Ket.h +0 -68
  232. data/ext/xkcp/lib/low/Ketje/SnP-compliant/Ket.inc +0 -142
  233. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-SnP.h +0 -55
  234. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-u1-armv6m-le-armcc.s +0 -1086
  235. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-u1-armv6m-le-gcc.s +0 -1092
  236. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-uf-armv6-le-armcc.s +0 -721
  237. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-uf-armv6-le-gcc.s +0 -726
  238. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-uf-armv7m-le-armcc.s +0 -723
  239. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-uf-armv7m-le-gcc.s +0 -729
  240. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodyak-u1-armv6m-le-armcc.s +0 -1164
  241. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodyak-u1-armv6m-le-gcc.s +0 -1165
  242. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodyak-uf-armv6-le-armcc.s +0 -562
  243. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodyak-uf-armv6-le-gcc.s +0 -563
  244. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodyak-uf-armv7m-le-armcc.s +0 -563
  245. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodyak-uf-armv7m-le-gcc.s +0 -565
  246. data/ext/xkcp/lib/low/Xoodoo/ARMv7A-NEON/Xoodoo-SnP.h +0 -55
  247. data/ext/xkcp/lib/low/Xoodoo/ARMv7A-NEON/Xoodoo-uf-armv7a-neon-le-armcc.s +0 -476
  248. data/ext/xkcp/lib/low/Xoodoo/ARMv7A-NEON/Xoodoo-uf-armv7a-neon-le-gcc.s +0 -485
  249. data/ext/xkcp/lib/low/Xoodoo/ARMv7A-NEON/Xoodyak-uf-armv7a-neon-le-armcc.s +0 -362
  250. data/ext/xkcp/lib/low/Xoodoo/ARMv7A-NEON/Xoodyak-uf-armv7a-neon-le-gcc.s +0 -367
  251. data/ext/xkcp/lib/low/Xoodoo/AVR8/Xoodoo-SnP.h +0 -43
  252. data/ext/xkcp/lib/low/Xoodoo/AVR8/Xoodoo-avr8-u1.s +0 -1341
  253. data/ext/xkcp/lib/low/Xoodoo/AVX512/Xoodoo-SIMD512.c +0 -581
  254. data/ext/xkcp/lib/low/Xoodoo/AVX512/Xoodoo-SnP.h +0 -58
  255. data/ext/xkcp/lib/low/Xoodoo/AVX512/Xoodyak-full-block-SIMD512.c +0 -332
  256. data/ext/xkcp/lib/low/Xoodoo/SSE2/Xoodoo-SIMD128.c +0 -329
  257. data/ext/xkcp/lib/low/Xoodoo/SSE2/Xoodoo-SnP.h +0 -53
  258. data/ext/xkcp/lib/low/Xoodoo/SSE2/Xoodyak-full-block-SIMD128.c +0 -355
  259. data/ext/xkcp/lib/low/Xoodoo/Xoodoo.h +0 -79
  260. data/ext/xkcp/lib/low/Xoodoo/plain/Xoodoo-SnP.h +0 -56
  261. data/ext/xkcp/lib/low/Xoodoo/plain/Xoodoo-optimized.c +0 -399
  262. data/ext/xkcp/lib/low/Xoodoo/plain/Xoodyak-full-blocks.c +0 -127
  263. data/ext/xkcp/lib/low/Xoodoo/ref/Xoodoo-SnP.h +0 -43
  264. data/ext/xkcp/lib/low/Xoodoo/ref/Xoodoo-reference.c +0 -253
  265. data/ext/xkcp/lib/low/Xoodoo-times16/AVX512/Xoodoo-times16-SIMD512.c +0 -1044
  266. data/ext/xkcp/lib/low/Xoodoo-times16/AVX512/Xoodoo-times16-SnP.h +0 -49
  267. data/ext/xkcp/lib/low/Xoodoo-times16/fallback-on1/Xoodoo-times16-SnP.h +0 -45
  268. data/ext/xkcp/lib/low/Xoodoo-times16/fallback-on1/Xoodoo-times16-on1.c +0 -37
  269. data/ext/xkcp/lib/low/Xoodoo-times4/ARMv7A-NEON/Xoodoo-times4-ARMv7A.s +0 -1587
  270. data/ext/xkcp/lib/low/Xoodoo-times4/ARMv7A-NEON/Xoodoo-times4-SnP.h +0 -48
  271. data/ext/xkcp/lib/low/Xoodoo-times4/AVX512/Xoodoo-times4-SIMD512.c +0 -1202
  272. data/ext/xkcp/lib/low/Xoodoo-times4/AVX512/Xoodoo-times4-SnP.h +0 -48
  273. data/ext/xkcp/lib/low/Xoodoo-times4/SSSE3/Xoodoo-times4-SIMD128.c +0 -484
  274. data/ext/xkcp/lib/low/Xoodoo-times4/SSSE3/Xoodoo-times4-SnP.h +0 -44
  275. data/ext/xkcp/lib/low/Xoodoo-times4/fallback-on1/Xoodoo-times4-SnP.h +0 -45
  276. data/ext/xkcp/lib/low/Xoodoo-times4/fallback-on1/Xoodoo-times4-on1.c +0 -37
  277. data/ext/xkcp/lib/low/Xoodoo-times8/AVX2/Xoodoo-times8-SIMD256.c +0 -939
  278. data/ext/xkcp/lib/low/Xoodoo-times8/AVX2/Xoodoo-times8-SnP.h +0 -49
  279. data/ext/xkcp/lib/low/Xoodoo-times8/AVX512/Xoodoo-times8-SIMD512.c +0 -1216
  280. data/ext/xkcp/lib/low/Xoodoo-times8/AVX512/Xoodoo-times8-SnP.h +0 -48
  281. data/ext/xkcp/lib/low/Xoodoo-times8/fallback-on1/Xoodoo-times8-SnP.h +0 -45
  282. data/ext/xkcp/lib/low/Xoodoo-times8/fallback-on1/Xoodoo-times8-on1.c +0 -37
  283. data/ext/xkcp/lib/low/common/PlSnP-Fallback.inc +0 -290
  284. data/ext/xkcp/lib/low/common/SnP-Relaned.h +0 -141
  285. data/ext/xkcp/support/Kernel-PMU/Kernel-pmu.md +0 -133
  286. data/ext/xkcp/support/Kernel-PMU/Makefile +0 -8
  287. data/ext/xkcp/support/Kernel-PMU/enable_arm_pmu.c +0 -129
  288. data/ext/xkcp/support/Kernel-PMU/load-module +0 -1
  289. data/ext/xkcp/util/KeccakSum/KeccakSum.c +0 -394
  290. data/ext/xkcp/util/KeccakSum/base64.c +0 -86
  291. data/ext/xkcp/util/KeccakSum/base64.h +0 -12
@@ -1,939 +0,0 @@
1
- /*
2
- The eXtended Keccak Code Package (XKCP)
3
- https://github.com/XKCP/XKCP
4
-
5
- The Xoodoo permutation, designed by Joan Daemen, Seth Hoffert, Gilles Van Assche and Ronny Van Keer.
6
-
7
- Implementation by Ronny Van Keer, hereby denoted as "the implementer".
8
-
9
- For more information, feedback or questions, please refer to the Keccak Team website:
10
- https://keccak.team/
11
-
12
- To the extent possible under law, the implementer has waived all copyright
13
- and related or neighboring rights to the source code in this file.
14
- http://creativecommons.org/publicdomain/zero/1.0/
15
- */
16
-
17
- #include <stdio.h>
18
- #include <string.h>
19
- #include <smmintrin.h>
20
- #include <wmmintrin.h>
21
- #include <immintrin.h>
22
- #include <emmintrin.h>
23
- #include "align.h"
24
- #include "brg_endian.h"
25
- #include "Xoodoo.h"
26
- #include "Xoodoo-times8-SnP.h"
27
-
28
- #if (PLATFORM_BYTE_ORDER != IS_LITTLE_ENDIAN)
29
- #error Expecting a little-endian platform
30
- #endif
31
-
32
- typedef __m128i V128;
33
- typedef __m256i V256;
34
-
35
- #define SnP_laneLengthInBytes 4
36
- #define laneIndex(instanceIndex, lanePosition) ((lanePosition)*8 + instanceIndex)
37
-
38
- #define AND256(a, b) _mm256_and_si256(a, b)
39
- #define ANDnu256(a, b) _mm256_andnot_si256(a, b)
40
- #define CONST8_32(a) _mm256_set1_epi32(a)
41
- #define LOAD256(a) _mm256_load_si256((const V256 *)&(a))
42
- #define LOAD256u(a) _mm256_loadu_si256((const V256 *)&(a))
43
- #define LOAD8_32(a,b,c,d,e,f,g,h) _mm256_setr_epi32(a,b,c,d,e,f,g,h)
44
- #define LOAD_GATHER8_32(idx,p) _mm256_i32gather_epi32((const void*)(p), idx, 4)
45
-
46
- #define SHUFFLE_LANES_RIGHT(a, n) _mm256_permutevar8x32_epi32(a, shuffleR_##n)
47
- #define SHUFFLE_LANES_RIGHT_2(a) _mm256_permute4x64_epi64(a, 0x39)
48
- #define INSERT_LANE( a, val, n) _mm256_insert_epi32(a, val, n)
49
- #define EXTRACT_LANE( a, n) _mm256_extract_epi32(a, n)
50
- #define INSERT_2LANES( a, val, n) _mm256_insert_epi64(a, val, (n)/2)
51
- #define EXTRACT_2LANES( a, n) _mm256_extract_epi64(a, (n)/2)
52
-
53
-
54
- #define ROL32in256(a, o) _mm256_or_si256(_mm256_slli_epi32(a, o), _mm256_srli_epi32(a, 32-(o)))
55
- #define ROL32in256_8(a) _mm256_shuffle_epi8(a, rho8)
56
- #define SHL32in256(a, o) _mm256_slli_epi32(a, o)
57
-
58
- #define STORE128(a, b) _mm_store_si128((V128 *)&(a), b)
59
- #define STORE128u(a, b) _mm_storeu_si128((V128 *)&(a), b)
60
- #define STORE256(a, b) _mm256_store_si256((V256 *)&(a), b)
61
- #define STORE256u(a, b) _mm256_storeu_si256((V256 *)&(a), b)
62
-
63
- #define XOR256(a, b) _mm256_xor_si256(a, b)
64
- #define XOReq256(a, b) a = XOR256(a, b)
65
- #define XOR128(a, b) _mm_xor_si128(a, b)
66
- #define XOReq128(a, b) a = XOR128(a, b)
67
-
68
- #ifndef _mm256_storeu2_m128i
69
- #define _mm256_storeu2_m128i(hi, lo, a) _mm_storeu_si128((V128*)(lo), _mm256_castsi256_si128(a)), _mm_storeu_si128((V128*)(hi), _mm256_extracti128_si256(a, 1))
70
- #endif
71
-
72
- #define VERBOSE 0
73
-
74
- #if (VERBOSE > 0)
75
- #define Dump(__t,__v) { \
76
- uint32_t buf[8]; \
77
- printf("%s\n", __t); \
78
- STORE256(buf, __v##00); printf("00 %08x %08x %08x %08x %08x %08x %08x %08x\n", buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7]); \
79
- STORE256(buf, __v##01); printf("01 %08x %08x %08x %08x %08x %08x %08x %08x\n", buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7]); \
80
- STORE256(buf, __v##02); printf("02 %08x %08x %08x %08x %08x %08x %08x %08x\n", buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7]); \
81
- STORE256(buf, __v##03); printf("03 %08x %08x %08x %08x %08x %08x %08x %08x\n", buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7]); \
82
- STORE256(buf, __v##10); printf("10 %08x %08x %08x %08x %08x %08x %08x %08x\n", buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7]); \
83
- STORE256(buf, __v##11); printf("11 %08x %08x %08x %08x %08x %08x %08x %08x\n", buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7]); \
84
- STORE256(buf, __v##12); printf("12 %08x %08x %08x %08x %08x %08x %08x %08x\n", buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7]); \
85
- STORE256(buf, __v##13); printf("13 %08x %08x %08x %08x %08x %08x %08x %08x\n", buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7]); \
86
- STORE256(buf, __v##20); printf("20 %08x %08x %08x %08x %08x %08x %08x %08x\n", buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7]); \
87
- STORE256(buf, __v##21); printf("21 %08x %08x %08x %08x %08x %08x %08x %08x\n", buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7]); \
88
- STORE256(buf, __v##22); printf("22 %08x %08x %08x %08x %08x %08x %08x %08x\n", buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7]); \
89
- STORE256(buf, __v##23); printf("23 %08x %08x %08x %08x %08x %08x %08x %08x\n", buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7]); \
90
- }
91
- #else
92
- #define Dump(__t,__v)
93
- #endif
94
-
95
- #if (VERBOSE >= 1)
96
- #define Dump1(__t,__v) Dump(__t,__v)
97
- #else
98
- #define Dump1(__t,__v)
99
- #endif
100
-
101
- #if (VERBOSE >= 2)
102
- #define Dump2(__t,__v) Dump(__t,__v)
103
- #else
104
- #define Dump2(__t,__v)
105
- #endif
106
-
107
- #if (VERBOSE >= 3)
108
- #define Dump3(__t,__v) Dump(__t,__v)
109
- #else
110
- #define Dump3(__t,__v)
111
- #endif
112
-
113
- ALIGN(32) static const uint32_t oshuffleR_1[] = {1, 2, 3, 4, 5, 6, 7, 0};
114
- ALIGN(32) static const uint32_t oshuffleR_3[] = {3, 4, 5, 6, 7, 0, 1, 2};
115
- ALIGN(32) static const uint32_t oshuffleR_5[] = {5, 6, 7, 0, 1, 2, 3, 4};
116
- ALIGN(32) static const uint32_t oshuffleR_7[] = {7, 0, 1, 2, 3, 4, 5, 6};
117
- ALIGN(32) static const uint32_t shufflePack[] = {0, 2, 4, 6, 1, 3, 5, 7};
118
-
119
-
120
- void Xoodootimes8_InitializeAll(void *states)
121
- {
122
- memset(states, 0, Xoodootimes8_statesSizeInBytes);
123
- }
124
-
125
- void Xoodootimes8_AddBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length)
126
- {
127
- unsigned int sizeLeft = length;
128
- unsigned int lanePosition = offset/SnP_laneLengthInBytes;
129
- unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
130
- const unsigned char *curData = data;
131
- uint32_t *statesAsLanes = (uint32_t *)states;
132
-
133
- if ((sizeLeft > 0) && (offsetInLane != 0)) {
134
- unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
135
- uint32_t lane = 0;
136
- if (bytesInLane > sizeLeft)
137
- bytesInLane = sizeLeft;
138
- memcpy((unsigned char*)&lane + offsetInLane, curData, bytesInLane);
139
- statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane;
140
- sizeLeft -= bytesInLane;
141
- lanePosition++;
142
- curData += bytesInLane;
143
- }
144
-
145
- while(sizeLeft >= SnP_laneLengthInBytes) {
146
- uint32_t lane = *((const uint32_t*)curData);
147
- statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane;
148
- sizeLeft -= SnP_laneLengthInBytes;
149
- lanePosition++;
150
- curData += SnP_laneLengthInBytes;
151
- }
152
-
153
- if (sizeLeft > 0) {
154
- uint32_t lane = 0;
155
- memcpy(&lane, curData, sizeLeft);
156
- statesAsLanes[laneIndex(instanceIndex, lanePosition)] ^= lane;
157
- }
158
- }
159
-
160
- void Xoodootimes8_AddLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
161
- {
162
- V256 *stateAsLanes = (V256 *)states;
163
- unsigned int i;
164
- const uint32_t *curData0 = (const uint32_t *)(data+laneOffset*0*SnP_laneLengthInBytes);
165
- const uint32_t *curData1 = (const uint32_t *)(data+laneOffset*1*SnP_laneLengthInBytes);
166
- const uint32_t *curData2 = (const uint32_t *)(data+laneOffset*2*SnP_laneLengthInBytes);
167
- const uint32_t *curData3 = (const uint32_t *)(data+laneOffset*3*SnP_laneLengthInBytes);
168
- const uint32_t *curData4 = (const uint32_t *)(data+laneOffset*4*SnP_laneLengthInBytes);
169
- const uint32_t *curData5 = (const uint32_t *)(data+laneOffset*5*SnP_laneLengthInBytes);
170
- const uint32_t *curData6 = (const uint32_t *)(data+laneOffset*6*SnP_laneLengthInBytes);
171
- const uint32_t *curData7 = (const uint32_t *)(data+laneOffset*7*SnP_laneLengthInBytes);
172
-
173
- #define Xor_In( argIndex ) XOReq256(stateAsLanes[argIndex], LOAD8_32(curData0[argIndex], curData1[argIndex], curData2[argIndex], curData3[argIndex], curData4[argIndex], curData5[argIndex], curData6[argIndex], curData7[argIndex]))
174
-
175
- if ( laneCount == 12 ) {
176
- Xor_In( 0 );
177
- Xor_In( 1 );
178
- Xor_In( 2 );
179
- Xor_In( 3 );
180
- Xor_In( 4 );
181
- Xor_In( 5 );
182
- Xor_In( 6 );
183
- Xor_In( 7 );
184
- Xor_In( 8 );
185
- Xor_In( 9 );
186
- Xor_In( 10 );
187
- Xor_In( 11 );
188
- }
189
- else {
190
- for(i=0; i<laneCount; i++)
191
- Xor_In( i );
192
- }
193
- #undef Xor_In
194
- }
195
-
196
- void Xoodootimes8_OverwriteBytes(void *states, unsigned int instanceIndex, const unsigned char *data, unsigned int offset, unsigned int length)
197
- {
198
- unsigned int sizeLeft = length;
199
- unsigned int lanePosition = offset/SnP_laneLengthInBytes;
200
- unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
201
- const unsigned char *curData = data;
202
- uint32_t *statesAsLanes = (uint32_t *)states;
203
-
204
- if ((sizeLeft > 0) && (offsetInLane != 0)) {
205
- unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
206
- if (bytesInLane > sizeLeft)
207
- bytesInLane = sizeLeft;
208
- memcpy( ((unsigned char *)&statesAsLanes[laneIndex(instanceIndex, lanePosition)]) + offsetInLane, curData, bytesInLane);
209
- sizeLeft -= bytesInLane;
210
- lanePosition++;
211
- curData += bytesInLane;
212
- }
213
-
214
- while(sizeLeft >= SnP_laneLengthInBytes) {
215
- uint32_t lane = *((const uint32_t*)curData);
216
- statesAsLanes[laneIndex(instanceIndex, lanePosition)] = lane;
217
- sizeLeft -= SnP_laneLengthInBytes;
218
- lanePosition++;
219
- curData += SnP_laneLengthInBytes;
220
- }
221
-
222
- if (sizeLeft > 0) {
223
- memcpy(&statesAsLanes[laneIndex(instanceIndex, lanePosition)], curData, sizeLeft);
224
- }
225
- }
226
-
227
- void Xoodootimes8_OverwriteLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
228
- {
229
- V256 *stateAsLanes = (V256 *)states;
230
- unsigned int i;
231
- const uint32_t *curData0 = (const uint32_t *)(data+laneOffset*0*SnP_laneLengthInBytes);
232
- const uint32_t *curData1 = (const uint32_t *)(data+laneOffset*1*SnP_laneLengthInBytes);
233
- const uint32_t *curData2 = (const uint32_t *)(data+laneOffset*2*SnP_laneLengthInBytes);
234
- const uint32_t *curData3 = (const uint32_t *)(data+laneOffset*3*SnP_laneLengthInBytes);
235
- const uint32_t *curData4 = (const uint32_t *)(data+laneOffset*4*SnP_laneLengthInBytes);
236
- const uint32_t *curData5 = (const uint32_t *)(data+laneOffset*5*SnP_laneLengthInBytes);
237
- const uint32_t *curData6 = (const uint32_t *)(data+laneOffset*6*SnP_laneLengthInBytes);
238
- const uint32_t *curData7 = (const uint32_t *)(data+laneOffset*7*SnP_laneLengthInBytes);
239
-
240
- #define OverWr( argIndex ) STORE256(stateAsLanes[argIndex], LOAD8_32(curData0[argIndex], curData1[argIndex], curData2[argIndex], curData3[argIndex], curData4[argIndex], curData5[argIndex], curData6[argIndex], curData7[argIndex]))
241
-
242
- if ( laneCount == 12 ) {
243
- OverWr( 0 );
244
- OverWr( 1 );
245
- OverWr( 2 );
246
- OverWr( 3 );
247
- OverWr( 4 );
248
- OverWr( 5 );
249
- OverWr( 6 );
250
- OverWr( 7 );
251
- OverWr( 8 );
252
- OverWr( 9 );
253
- OverWr( 10 );
254
- OverWr( 11 );
255
- }
256
- else {
257
- for(i=0; i<laneCount; i++)
258
- OverWr( i );
259
- }
260
- #undef OverWr
261
- }
262
-
263
- void Xoodootimes8_OverwriteWithZeroes(void *states, unsigned int instanceIndex, unsigned int byteCount)
264
- {
265
- unsigned int sizeLeft = byteCount;
266
- unsigned int lanePosition = 0;
267
- uint32_t *statesAsLanes = (uint32_t *)states;
268
-
269
- while(sizeLeft >= SnP_laneLengthInBytes) {
270
- statesAsLanes[laneIndex(instanceIndex, lanePosition)] = 0;
271
- sizeLeft -= SnP_laneLengthInBytes;
272
- lanePosition++;
273
- }
274
-
275
- if (sizeLeft > 0) {
276
- memset(&statesAsLanes[laneIndex(instanceIndex, lanePosition)], 0, sizeLeft);
277
- }
278
- }
279
-
280
- void Xoodootimes8_ExtractBytes(const void *states, unsigned int instanceIndex, unsigned char *data, unsigned int offset, unsigned int length)
281
- {
282
- unsigned int sizeLeft = length;
283
- unsigned int lanePosition = offset/SnP_laneLengthInBytes;
284
- unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
285
- unsigned char *curData = data;
286
- const uint32_t *statesAsLanes = (const uint32_t *)states;
287
-
288
- if ((sizeLeft > 0) && (offsetInLane != 0)) {
289
- unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
290
- if (bytesInLane > sizeLeft)
291
- bytesInLane = sizeLeft;
292
- memcpy( curData, ((unsigned char *)&statesAsLanes[laneIndex(instanceIndex, lanePosition)]) + offsetInLane, bytesInLane);
293
- sizeLeft -= bytesInLane;
294
- lanePosition++;
295
- curData += bytesInLane;
296
- }
297
-
298
- while(sizeLeft >= SnP_laneLengthInBytes) {
299
- *(uint32_t*)curData = statesAsLanes[laneIndex(instanceIndex, lanePosition)];
300
- sizeLeft -= SnP_laneLengthInBytes;
301
- lanePosition++;
302
- curData += SnP_laneLengthInBytes;
303
- }
304
-
305
- if (sizeLeft > 0) {
306
- memcpy( curData, &statesAsLanes[laneIndex(instanceIndex, lanePosition)], sizeLeft);
307
- }
308
- }
309
-
310
- void Xoodootimes8_ExtractLanesAll(const void *states, unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
311
- {
312
- uint32_t *curData0 = (uint32_t *)(data+laneOffset*0*SnP_laneLengthInBytes);
313
- uint32_t *curData1 = (uint32_t *)(data+laneOffset*1*SnP_laneLengthInBytes);
314
- uint32_t *curData2 = (uint32_t *)(data+laneOffset*2*SnP_laneLengthInBytes);
315
- uint32_t *curData3 = (uint32_t *)(data+laneOffset*3*SnP_laneLengthInBytes);
316
- uint32_t *curData4 = (uint32_t *)(data+laneOffset*4*SnP_laneLengthInBytes);
317
- uint32_t *curData5 = (uint32_t *)(data+laneOffset*5*SnP_laneLengthInBytes);
318
- uint32_t *curData6 = (uint32_t *)(data+laneOffset*6*SnP_laneLengthInBytes);
319
- uint32_t *curData7 = (uint32_t *)(data+laneOffset*7*SnP_laneLengthInBytes);
320
- const V256 *stateAsLanes = (const V256 *)states;
321
- const uint32_t *stateAsLanes32 = (const uint32_t*)states;
322
- unsigned int i;
323
-
324
- #define Extr( argIndex ) curData0[argIndex] = stateAsLanes32[8*(argIndex)], \
325
- curData1[argIndex] = stateAsLanes32[8*(argIndex)+1], \
326
- curData2[argIndex] = stateAsLanes32[8*(argIndex)+2], \
327
- curData3[argIndex] = stateAsLanes32[8*(argIndex)+3], \
328
- curData4[argIndex] = stateAsLanes32[8*(argIndex)+4], \
329
- curData5[argIndex] = stateAsLanes32[8*(argIndex)+5], \
330
- curData6[argIndex] = stateAsLanes32[8*(argIndex)+6], \
331
- curData7[argIndex] = stateAsLanes32[8*(argIndex)+7]
332
-
333
- if ( laneCount == 12 ) {
334
- Extr( 0 );
335
- Extr( 1 );
336
- Extr( 2 );
337
- Extr( 3 );
338
- Extr( 4 );
339
- Extr( 5 );
340
- Extr( 6 );
341
- Extr( 7 );
342
- Extr( 8 );
343
- Extr( 9 );
344
- Extr( 10 );
345
- Extr( 11 );
346
- }
347
- else {
348
- for(i=0; i<laneCount; i++)
349
- Extr( i );
350
- }
351
- #undef Extr
352
- }
353
-
354
- void Xoodootimes8_ExtractAndAddBytes(const void *states, unsigned int instanceIndex, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length)
355
- {
356
- unsigned int sizeLeft = length;
357
- unsigned int lanePosition = offset/SnP_laneLengthInBytes;
358
- unsigned int offsetInLane = offset%SnP_laneLengthInBytes;
359
- const unsigned char *curInput = input;
360
- unsigned char *curOutput = output;
361
- const uint32_t *statesAsLanes = (const uint32_t *)states;
362
-
363
- if ((sizeLeft > 0) && (offsetInLane != 0)) {
364
- unsigned int bytesInLane = SnP_laneLengthInBytes - offsetInLane;
365
- uint32_t lane = statesAsLanes[laneIndex(instanceIndex, lanePosition)] >> (8 * offsetInLane);
366
- if (bytesInLane > sizeLeft)
367
- bytesInLane = sizeLeft;
368
- sizeLeft -= bytesInLane;
369
- do {
370
- *(curOutput++) = *(curInput++) ^ (unsigned char)lane;
371
- lane >>= 8;
372
- } while ( --bytesInLane != 0);
373
- lanePosition++;
374
- }
375
-
376
- while(sizeLeft >= SnP_laneLengthInBytes) {
377
- *((uint32_t*)curOutput) = *((uint32_t*)curInput) ^ statesAsLanes[laneIndex(instanceIndex, lanePosition)];
378
- sizeLeft -= SnP_laneLengthInBytes;
379
- lanePosition++;
380
- curInput += SnP_laneLengthInBytes;
381
- curOutput += SnP_laneLengthInBytes;
382
- }
383
-
384
- if (sizeLeft != 0) {
385
- uint32_t lane = statesAsLanes[laneIndex(instanceIndex, lanePosition)];
386
- do {
387
- *(curOutput++) = *(curInput++) ^ (unsigned char)lane;
388
- lane >>= 8;
389
- } while ( --sizeLeft != 0);
390
- }
391
- }
392
-
393
- void Xoodootimes8_ExtractAndAddLanesAll(const void *states, const unsigned char *input, unsigned char *output, unsigned int laneCount, unsigned int laneOffset)
394
- {
395
- const uint32_t *curInput0 = (uint32_t *)(input+laneOffset*0*SnP_laneLengthInBytes);
396
- const uint32_t *curInput1 = (uint32_t *)(input+laneOffset*1*SnP_laneLengthInBytes);
397
- const uint32_t *curInput2 = (uint32_t *)(input+laneOffset*2*SnP_laneLengthInBytes);
398
- const uint32_t *curInput3 = (uint32_t *)(input+laneOffset*3*SnP_laneLengthInBytes);
399
- const uint32_t *curInput4 = (uint32_t *)(input+laneOffset*4*SnP_laneLengthInBytes);
400
- const uint32_t *curInput5 = (uint32_t *)(input+laneOffset*5*SnP_laneLengthInBytes);
401
- const uint32_t *curInput6 = (uint32_t *)(input+laneOffset*6*SnP_laneLengthInBytes);
402
- const uint32_t *curInput7 = (uint32_t *)(input+laneOffset*7*SnP_laneLengthInBytes);
403
- uint32_t *curOutput0 = (uint32_t *)(output+laneOffset*0*SnP_laneLengthInBytes);
404
- uint32_t *curOutput1 = (uint32_t *)(output+laneOffset*1*SnP_laneLengthInBytes);
405
- uint32_t *curOutput2 = (uint32_t *)(output+laneOffset*2*SnP_laneLengthInBytes);
406
- uint32_t *curOutput3 = (uint32_t *)(output+laneOffset*3*SnP_laneLengthInBytes);
407
- uint32_t *curOutput4 = (uint32_t *)(output+laneOffset*4*SnP_laneLengthInBytes);
408
- uint32_t *curOutput5 = (uint32_t *)(output+laneOffset*5*SnP_laneLengthInBytes);
409
- uint32_t *curOutput6 = (uint32_t *)(output+laneOffset*6*SnP_laneLengthInBytes);
410
- uint32_t *curOutput7 = (uint32_t *)(output+laneOffset*7*SnP_laneLengthInBytes);
411
-
412
- const V256 *stateAsLanes = (const V256 *)states;
413
- const uint32_t *stateAsLanes32 = (const uint32_t*)states;
414
- unsigned int i;
415
-
416
- #define ExtrXor( argIndex ) \
417
- curOutput0[argIndex] = curInput0[argIndex] ^ stateAsLanes32[8*(argIndex)+0],\
418
- curOutput1[argIndex] = curInput1[argIndex] ^ stateAsLanes32[8*(argIndex)+1],\
419
- curOutput2[argIndex] = curInput2[argIndex] ^ stateAsLanes32[8*(argIndex)+2],\
420
- curOutput3[argIndex] = curInput3[argIndex] ^ stateAsLanes32[8*(argIndex)+3],\
421
- curOutput4[argIndex] = curInput4[argIndex] ^ stateAsLanes32[8*(argIndex)+4],\
422
- curOutput5[argIndex] = curInput5[argIndex] ^ stateAsLanes32[8*(argIndex)+5],\
423
- curOutput6[argIndex] = curInput6[argIndex] ^ stateAsLanes32[8*(argIndex)+6],\
424
- curOutput7[argIndex] = curInput7[argIndex] ^ stateAsLanes32[8*(argIndex)+7]
425
-
426
- if ( laneCount == 12 ) {
427
- ExtrXor( 0 );
428
- ExtrXor( 1 );
429
- ExtrXor( 2 );
430
- ExtrXor( 3 );
431
- ExtrXor( 4 );
432
- ExtrXor( 5 );
433
- ExtrXor( 6 );
434
- ExtrXor( 7 );
435
- ExtrXor( 8 );
436
- ExtrXor( 9 );
437
- ExtrXor( 10 );
438
- ExtrXor( 11 );
439
- }
440
- else {
441
- for(i=0; i<laneCount; i++) {
442
- ExtrXor( i );
443
- }
444
- }
445
- #undef ExtrXor
446
- }
447
-
448
- #define DeclareVars V256 a00, a01, a02, a03; \
449
- V256 a10, a11, a12, a13; \
450
- V256 a20, a21, a22, a23; \
451
- V256 v1, v2; \
452
- V256 rho8 = LOAD8_32(0x02010003, 0x06050407, 0x0A09080B, 0x0E0D0C0F, 0x12111013, 0x16151417, 0x1A19181B, 0x1E1D1C1F)
453
-
454
- #define State2Vars2 a00 = LOAD256(states[8*(0+0)]), a01 = LOAD256(states[8*(0+1)]), a02 = LOAD256(states[8*(0+2)]), a03 = LOAD256(states[8*(0+3)]); \
455
- a12 = LOAD256(states[8*(4+0)]), a13 = LOAD256(states[8*(4+1)]), a10 = LOAD256(states[8*(4+2)]), a11 = LOAD256(states[8*(4+3)]); \
456
- a20 = LOAD256(states[8*(8+0)]), a21 = LOAD256(states[8*(8+1)]), a22 = LOAD256(states[8*(8+2)]), a23 = LOAD256(states[8*(8+3)])
457
-
458
- #define State2Vars a00 = LOAD256(states[8*(0+0)]), a01 = LOAD256(states[8*(0+1)]), a02 = LOAD256(states[8*(0+2)]), a03 = LOAD256(states[8*(0+3)]); \
459
- a10 = LOAD256(states[8*(4+0)]), a11 = LOAD256(states[8*(4+1)]), a12 = LOAD256(states[8*(4+2)]), a13 = LOAD256(states[8*(4+3)]); \
460
- a20 = LOAD256(states[8*(8+0)]), a21 = LOAD256(states[8*(8+1)]), a22 = LOAD256(states[8*(8+2)]), a23 = LOAD256(states[8*(8+3)])
461
-
462
- #define Vars2State STORE256(states[8*(0+0)], a00), STORE256(states[8*(0+1)], a01), STORE256(states[8*(0+2)], a02), STORE256(states[8*(0+3)], a03); \
463
- STORE256(states[8*(4+0)], a10), STORE256(states[8*(4+1)], a11), STORE256(states[8*(4+2)], a12), STORE256(states[8*(4+3)], a13); \
464
- STORE256(states[8*(8+0)], a20), STORE256(states[8*(8+1)], a21), STORE256(states[8*(8+2)], a22), STORE256(states[8*(8+3)], a23)
465
-
466
- #define Round(a10i, a11i, a12i, a13i, a10w, a11w, a12w, a13w, a20i, a21i, a22i, a23i, __rc) \
467
- \
468
- /* Theta: Column Parity Mixer */ \
469
- v1 = XOR256( a03, XOR256( a13i, a23i ) ); \
470
- v2 = XOR256( a00, XOR256( a10i, a20i ) ); \
471
- v1 = XOR256( ROL32in256(v1, 5), ROL32in256(v1, 14) ); \
472
- a00 = XOR256( a00, v1 ); \
473
- a10i = XOR256( a10i, v1 ); \
474
- a20i = XOR256( a20i, v1 ); \
475
- v1 = XOR256( a01, XOR256( a11i, a21i ) ); \
476
- v2 = XOR256( ROL32in256(v2, 5), ROL32in256(v2, 14) ); \
477
- a01 = XOR256( a01, v2 ); \
478
- a11i = XOR256( a11i, v2 ); \
479
- a21i = XOR256( a21i, v2 ); \
480
- v2 = XOR256( a02, XOR256( a12i, a22i ) ); \
481
- v1 = XOR256( ROL32in256(v1, 5), ROL32in256(v1, 14) ); \
482
- a02 = XOR256( a02, v1 ); \
483
- a12i = XOR256( a12i, v1 ); \
484
- a22i = XOR256( a22i, v1 ); \
485
- v2 = XOR256( ROL32in256(v2, 5), ROL32in256(v2, 14) ); \
486
- a03 = XOR256( a03, v2 ); \
487
- a13i = XOR256( a13i, v2 ); \
488
- a23i = XOR256( a23i, v2 ); \
489
- Dump3("Theta",a); \
490
- \
491
- /* Rho-west: Plane shift */ \
492
- a20i = ROL32in256(a20i, 11); \
493
- a21i = ROL32in256(a21i, 11); \
494
- a22i = ROL32in256(a22i, 11); \
495
- a23i = ROL32in256(a23i, 11); \
496
- Dump3("Rho-west",a); \
497
- \
498
- /* Iota: round constants */ \
499
- a00 = XOR256( a00, CONST8_32(__rc)); \
500
- Dump3("Iota",a); \
501
- \
502
- /* Chi: non linear step, on colums */ \
503
- a00 = XOR256( a00, ANDnu256( a10w, a20i ) ); \
504
- a01 = XOR256( a01, ANDnu256( a11w, a21i ) ); \
505
- a02 = XOR256( a02, ANDnu256( a12w, a22i ) ); \
506
- a03 = XOR256( a03, ANDnu256( a13w, a23i ) ); \
507
- a10w = XOR256( a10w, ANDnu256( a20i, a00 ) ); \
508
- a11w = XOR256( a11w, ANDnu256( a21i, a01 ) ); \
509
- a12w = XOR256( a12w, ANDnu256( a22i, a02 ) ); \
510
- a13w = XOR256( a13w, ANDnu256( a23i, a03 ) ); \
511
- a20i = XOR256( a20i, ANDnu256( a00, a10w ) ); \
512
- a21i = XOR256( a21i, ANDnu256( a01, a11w ) ); \
513
- a22i = XOR256( a22i, ANDnu256( a02, a12w ) ); \
514
- a23i = XOR256( a23i, ANDnu256( a03, a13w ) ); \
515
- Dump3("Chi",a); \
516
- \
517
- /* Rho-east: Plane shift */ \
518
- a10w = ROL32in256(a10w, 1); \
519
- a11w = ROL32in256(a11w, 1); \
520
- a12w = ROL32in256(a12w, 1); \
521
- a13w = ROL32in256(a13w, 1); \
522
- a20i = ROL32in256_8(a20i); \
523
- a21i = ROL32in256_8(a21i); \
524
- a22i = ROL32in256_8(a22i); \
525
- a23i = ROL32in256_8(a23i); \
526
- Dump3("Rho-east",a)
527
-
528
- void Xoodootimes8_PermuteAll_6rounds(void *argstates)
529
- {
530
- uint32_t * states = (uint32_t *)argstates;
531
- DeclareVars;
532
-
533
- State2Vars2;
534
- Round( a12, a13, a10, a11, a11, a12, a13, a10, a20, a21, a22, a23, _rc6 );
535
- Round( a11, a12, a13, a10, a10, a11, a12, a13, a22, a23, a20, a21, _rc5 );
536
- Round( a10, a11, a12, a13, a13, a10, a11, a12, a20, a21, a22, a23, _rc4 );
537
- Round( a13, a10, a11, a12, a12, a13, a10, a11, a22, a23, a20, a21, _rc3 );
538
- Round( a12, a13, a10, a11, a11, a12, a13, a10, a20, a21, a22, a23, _rc2 );
539
- Round( a11, a12, a13, a10, a10, a11, a12, a13, a22, a23, a20, a21, _rc1 );
540
- //Dump1("Permutation\n", a);
541
- Vars2State;
542
- }
543
-
544
- void Xoodootimes8_PermuteAll_12rounds(void *argstates)
545
- {
546
- uint32_t * states = (uint32_t *)argstates;
547
- DeclareVars;
548
-
549
- State2Vars;
550
- Round( a10, a11, a12, a13, a13, a10, a11, a12, a20, a21, a22, a23, _rc12 );
551
- Round( a13, a10, a11, a12, a12, a13, a10, a11, a22, a23, a20, a21, _rc11 );
552
- Round( a12, a13, a10, a11, a11, a12, a13, a10, a20, a21, a22, a23, _rc10 );
553
- Round( a11, a12, a13, a10, a10, a11, a12, a13, a22, a23, a20, a21, _rc9 );
554
- Round( a10, a11, a12, a13, a13, a10, a11, a12, a20, a21, a22, a23, _rc8 );
555
- Round( a13, a10, a11, a12, a12, a13, a10, a11, a22, a23, a20, a21, _rc7 );
556
- Round( a12, a13, a10, a11, a11, a12, a13, a10, a20, a21, a22, a23, _rc6 );
557
- Round( a11, a12, a13, a10, a10, a11, a12, a13, a22, a23, a20, a21, _rc5 );
558
- Round( a10, a11, a12, a13, a13, a10, a11, a12, a20, a21, a22, a23, _rc4 );
559
- Round( a13, a10, a11, a12, a12, a13, a10, a11, a22, a23, a20, a21, _rc3 );
560
- Round( a12, a13, a10, a11, a11, a12, a13, a10, a20, a21, a22, a23, _rc2 );
561
- Round( a11, a12, a13, a10, a10, a11, a12, a13, a22, a23, a20, a21, _rc1 );
562
- //Dump1("Permutation\n", a);
563
- Vars2State;
564
- }
565
-
566
- void Xooffftimes8_AddIs(unsigned char *output, const unsigned char *input, size_t bitLen)
567
- {
568
- size_t byteLen = bitLen / 8;
569
- V256 lanes1, lanes2, lanes3, lanes4, lanes5, lanes6, lanes7, lanes8;
570
-
571
- while ( byteLen >= 128 ) {
572
- lanes1 = LOAD256u(input[ 0]);
573
- lanes2 = LOAD256u(input[32]);
574
- lanes3 = LOAD256u(input[64]);
575
- lanes4 = LOAD256u(input[96]);
576
- lanes5 = LOAD256u(output[ 0]);
577
- lanes6 = LOAD256u(output[32]);
578
- lanes7 = LOAD256u(output[64]);
579
- lanes8 = LOAD256u(output[96]);
580
- lanes1 = XOR256(lanes1, lanes5);
581
- lanes2 = XOR256(lanes2, lanes6);
582
- lanes3 = XOR256(lanes3, lanes7);
583
- lanes4 = XOR256(lanes4, lanes8);
584
- STORE256u(output[ 0], lanes1);
585
- STORE256u(output[32], lanes2);
586
- STORE256u(output[64], lanes3);
587
- STORE256u(output[96], lanes4);
588
- input += 128;
589
- output += 128;
590
- byteLen -= 128;
591
- }
592
- while ( byteLen >= 32 ) {
593
- lanes1 = LOAD256u(input[0]);
594
- lanes2 = LOAD256u(output[0]);
595
- input += 32;
596
- lanes1 = XOR256(lanes1, lanes2);
597
- byteLen -= 32;
598
- STORE256u(output[0], lanes1);
599
- output += 32;
600
- }
601
- while ( byteLen >= 8 ) {
602
- *((uint64_t*)output) ^= *((uint64_t*)input);
603
- input += 8;
604
- output += 8;
605
- byteLen -= 8;
606
- }
607
- while ( byteLen-- != 0 ) {
608
- *output++ ^= *input++;
609
- }
610
-
611
- bitLen &= 7;
612
- if (bitLen != 0)
613
- {
614
- *output ^= *input;
615
- *output &= (1 << bitLen) - 1;
616
- }
617
- }
618
-
619
- size_t Xooffftimes8_CompressFastLoop(unsigned char *k, unsigned char *x, const unsigned char *input, size_t length)
620
- {
621
- DeclareVars;
622
- uint32_t *k32 = (uint32_t*)k;
623
- uint32_t *x32 = (uint32_t*)x;
624
- uint32_t *i32 = (uint32_t*)input;
625
- size_t initialLength;
626
- V256 r04815926;
627
- V256 r5926a37b;
628
- V256 t;
629
- V256 x00, x01, x02, x03, x10, x11, x12, x13, x20, x21, x22, x23;
630
- V128 x4;
631
- V256 shuffleR_1 = *(const V256*)oshuffleR_1;
632
- V256 shuffleR_3 = *(const V256*)oshuffleR_3;
633
- V256 shuffleR_5 = *(const V256*)oshuffleR_5;
634
- V256 shuffleR_7 = *(const V256*)oshuffleR_7;
635
-
636
- r04815926 = LOAD_GATHER8_32(LOAD8_32( 0, 4, 8, 1, 5, 9, 2, 6), k32);
637
- r5926a37b = LOAD_GATHER8_32(LOAD8_32( 5, 9, 2, 6, 10, 3, 7, 11), k32);
638
- t = LOAD8_32( 0*12, 1*12, 2*12, 3*12, 4*12, 5*12, 6*12, 7*12);
639
-
640
- initialLength = length;
641
-
642
- /* Clear x accumulator */
643
- x00 = _mm256_setzero_si256();
644
- x01 = _mm256_setzero_si256();
645
- x02 = _mm256_setzero_si256();
646
- x03 = _mm256_setzero_si256();
647
- x10 = _mm256_setzero_si256();
648
- x11 = _mm256_setzero_si256();
649
- x12 = _mm256_setzero_si256();
650
- x13 = _mm256_setzero_si256();
651
- x20 = _mm256_setzero_si256();
652
- x21 = _mm256_setzero_si256();
653
- x22 = _mm256_setzero_si256();
654
- x23 = _mm256_setzero_si256();
655
-
656
- #define rCGKDHLEI r5926a37b
657
- #define aCGKDHLEI ((uint32_t*)&rCGKDHLEI)
658
- do {
659
- /* Note that a10-a12 and a11-a13 are swapped */
660
- a00 = r04815926;
661
- a01 = _mm256_blend_epi32(SHUFFLE_LANES_RIGHT(r04815926, 3), SHUFFLE_LANES_RIGHT(r5926a37b, 7), 0xE0); /* 15926 */
662
- a02 = SHUFFLE_LANES_RIGHT_2(r5926a37b); /* 26a37b */
663
-
664
- a12 = INSERT_LANE(SHUFFLE_LANES_RIGHT(a00, 1), EXTRACT_LANE(a01, 5), 7); /* 4815926 A */
665
-
666
- rCGKDHLEI = XOR256(a00, XOR256(SHL32in256(a00, 13), ROL32in256(a12, 3)));
667
-
668
- a02 = _mm256_blend_epi32(a02, SHUFFLE_LANES_RIGHT_2(rCGKDHLEI), 0xC0);
669
- a03 = _mm256_blend_epi32(SHUFFLE_LANES_RIGHT(a02, 3), SHUFFLE_LANES_RIGHT(rCGKDHLEI, 5), 0xF8);
670
-
671
- a13 = INSERT_LANE(SHUFFLE_LANES_RIGHT(a01, 1), EXTRACT_LANE(a02, 5), 7); /* B */
672
- a10 = INSERT_LANE(SHUFFLE_LANES_RIGHT(a02, 1), aCGKDHLEI[2], 7); /* K */
673
- a11 = INSERT_LANE(SHUFFLE_LANES_RIGHT(a03, 1), aCGKDHLEI[5], 7); /* L */
674
-
675
- a20 = INSERT_LANE(SHUFFLE_LANES_RIGHT(a12, 1), EXTRACT_LANE(a01, 6), 7); /* 815926A+3 */
676
- a21 = INSERT_LANE(SHUFFLE_LANES_RIGHT(a13, 1), aCGKDHLEI[0], 7); /* C */
677
- a22 = INSERT_LANE(SHUFFLE_LANES_RIGHT(a10, 1), aCGKDHLEI[3], 7); /* D */
678
- a23 = INSERT_LANE(SHUFFLE_LANES_RIGHT(a11, 1), aCGKDHLEI[6], 7); /* E */
679
- r04815926 = a22;
680
- Dump("Roll-c", a);
681
-
682
- a00 = XOR256( a00, LOAD_GATHER8_32(t, i32+0));
683
- a01 = XOR256( a01, LOAD_GATHER8_32(t, i32+1));
684
- a02 = XOR256( a02, LOAD_GATHER8_32(t, i32+2));
685
- a03 = XOR256( a03, LOAD_GATHER8_32(t, i32+3));
686
-
687
- a12 = XOR256( a12, LOAD_GATHER8_32(t, i32+4));
688
- a13 = XOR256( a13, LOAD_GATHER8_32(t, i32+5));
689
- a10 = XOR256( a10, LOAD_GATHER8_32(t, i32+6));
690
- a11 = XOR256( a11, LOAD_GATHER8_32(t, i32+7));
691
-
692
- a20 = XOR256( a20, LOAD_GATHER8_32(t, i32+8));
693
- a21 = XOR256( a21, LOAD_GATHER8_32(t, i32+9));
694
- a22 = XOR256( a22, LOAD_GATHER8_32(t, i32+10));
695
- a23 = XOR256( a23, LOAD_GATHER8_32(t, i32+11));
696
- Dump("Add input", a);
697
-
698
- Round( a12, a13, a10, a11, a11, a12, a13, a10, a20, a21, a22, a23, _rc6 );
699
- Round( a11, a12, a13, a10, a10, a11, a12, a13, a22, a23, a20, a21, _rc5 );
700
- Round( a10, a11, a12, a13, a13, a10, a11, a12, a20, a21, a22, a23, _rc4 );
701
- Round( a13, a10, a11, a12, a12, a13, a10, a11, a22, a23, a20, a21, _rc3 );
702
- Round( a12, a13, a10, a11, a11, a12, a13, a10, a20, a21, a22, a23, _rc2 );
703
- Round( a11, a12, a13, a10, a10, a11, a12, a13, a22, a23, a20, a21, _rc1 );
704
- Dump("Xoodoo", a);
705
-
706
- x00 = XOR256(x00, a00);
707
- x01 = XOR256(x01, a01);
708
- x02 = XOR256(x02, a02);
709
- x03 = XOR256(x03, a03);
710
- x10 = XOR256(x10, a10);
711
- x11 = XOR256(x11, a11);
712
- x12 = XOR256(x12, a12);
713
- x13 = XOR256(x13, a13);
714
- x20 = XOR256(x20, a20);
715
- x21 = XOR256(x21, a21);
716
- x22 = XOR256(x22, a22);
717
- x23 = XOR256(x23, a23);
718
- Dump("Accu x", x);
719
-
720
- i32 += NLANES*8;
721
- length -= NLANES*4*8;
722
- }
723
- while (length >= (NLANES*4*8));
724
-
725
- /* Reduce from 8 to 4 lanes (x00 - x13), reduce from 4 to 2 lanes (x20 - x23) */
726
- x00 = XOR256(x00, _mm256_permute4x64_epi64(x00, 0x4e));
727
- x01 = XOR256(x01, _mm256_permute4x64_epi64(x01, 0x4e));
728
- x02 = XOR256(x02, _mm256_permute4x64_epi64(x02, 0x4e));
729
- x03 = XOR256(x03, _mm256_permute4x64_epi64(x03, 0x4e));
730
- x10 = XOR256(x10, _mm256_permute4x64_epi64(x10, 0x4e));
731
- x11 = XOR256(x11, _mm256_permute4x64_epi64(x11, 0x4e));
732
- x12 = XOR256(x12, _mm256_permute4x64_epi64(x12, 0x4e));
733
- x13 = XOR256(x13, _mm256_permute4x64_epi64(x13, 0x4e));
734
- x20 = XOR256(x20, _mm256_permute4x64_epi64(x20, 0x4e));
735
- x21 = XOR256(x21, _mm256_permute4x64_epi64(x21, 0x4e));
736
- x22 = XOR256(x22, _mm256_permute4x64_epi64(x22, 0x4e));
737
- x23 = XOR256(x23, _mm256_permute4x64_epi64(x23, 0x4e));
738
- x00 = _mm256_permute2x128_si256( x00, x10, 0x20);
739
- x01 = _mm256_permute2x128_si256( x01, x11, 0x20);
740
- x02 = _mm256_permute2x128_si256( x02, x12, 0x20);
741
- x03 = _mm256_permute2x128_si256( x03, x13, 0x20);
742
- x20 = _mm256_permute2x128_si256( x20, x22, 0x20);
743
- x21 = _mm256_permute2x128_si256( x21, x23, 0x20);
744
-
745
- /* Reduce from 4 to 2 lanes (x00 - x03), reduce from 2 to 1 lane (x20 - x21) */
746
- x00 = XOR256(x00, _mm256_permute4x64_epi64(x00, 0xB1));
747
- x01 = XOR256(x01, _mm256_permute4x64_epi64(x01, 0xB1));
748
- x02 = XOR256(x02, _mm256_permute4x64_epi64(x02, 0xB1));
749
- x03 = XOR256(x03, _mm256_permute4x64_epi64(x03, 0xB1));
750
- x20 = XOR256(x20, _mm256_permute4x64_epi64(x20, 0xB1));
751
- x21 = XOR256(x21, _mm256_permute4x64_epi64(x21, 0xB1));
752
- x00 = _mm256_blend_epi32( x00, x02, 0xCC);
753
- x01 = _mm256_blend_epi32( x01, x03, 0xCC);
754
- x20 = _mm256_blend_epi32( x20, x21, 0xCC);
755
-
756
- /* Reduce from 2 to 1 lane (x00 - x01), 1 to half lane (x20) */
757
- x00 = XOR256(x00, SHUFFLE_LANES_RIGHT(x00, 1));
758
- x01 = XOR256(x01, SHUFFLE_LANES_RIGHT(x01, 1));
759
- x20 = XOR256(x20, SHUFFLE_LANES_RIGHT(x20, 1));
760
- x00 = _mm256_blend_epi32( x00, SHUFFLE_LANES_RIGHT(x01, 7), 0xAA);
761
- x20 = _mm256_permutevar8x32_epi32( x20, *(V256*)shufflePack);
762
-
763
- x00 = XOR256(x00, *(V256*)&x32[0]);
764
- x4 = XOR128(_mm256_castsi256_si128(x20), *(V128*)&x32[8]);
765
-
766
- STORE256u( *(V256*)&x32[0], x00);
767
- STORE128u( *(V128*)&x32[8], x4);
768
-
769
- /* Save new k from r04815926 and rCGKDHLEI */
770
- k32[ 0] = _mm256_extract_epi32(r04815926, 0);
771
- k32[ 1] = _mm256_extract_epi32(r04815926, 3);
772
- k32[ 2] = _mm256_extract_epi32(rCGKDHLEI, 2); /* K */
773
- k32[ 3] = _mm256_extract_epi32(rCGKDHLEI, 5); /* L */
774
- k32[ 4] = _mm256_extract_epi32(r04815926, 1);
775
- k32[ 5] = _mm256_extract_epi32(rCGKDHLEI, 0); /* C */
776
- k32[ 6] = _mm256_extract_epi32(rCGKDHLEI, 3); /* D */
777
- k32[ 7] = _mm256_extract_epi32(rCGKDHLEI, 6); /* E */
778
- k32[ 8] = _mm256_extract_epi32(r04815926, 2);
779
- k32[ 9] = _mm256_extract_epi32(rCGKDHLEI, 1); /* G */
780
- k32[10] = _mm256_extract_epi32(rCGKDHLEI, 4); /* H */
781
- k32[11] = _mm256_extract_epi32(rCGKDHLEI, 7); /* I */
782
- #undef rCGKDHLEI
783
-
784
- return initialLength - length;
785
- }
786
-
787
- size_t Xooffftimes8_ExpandFastLoop(unsigned char *yAccu, const unsigned char *kRoll, unsigned char *output, size_t length)
788
- {
789
- DeclareVars;
790
- const uint32_t *k32 = (uint32_t*)kRoll;
791
- uint32_t *y32 = (uint32_t*)yAccu;
792
- uint32_t *o32 = (uint32_t*)output;
793
- size_t initialLength;
794
- V256 r04815926;
795
- V256 r5926a37b;
796
- V256 v3, v4;
797
- V256 shuffleR_1 = *(const V256*)oshuffleR_1;
798
- V256 shuffleR_3 = *(const V256*)oshuffleR_3;
799
- V256 shuffleR_5 = *(const V256*)oshuffleR_5;
800
- V256 shuffleR_7 = *(const V256*)oshuffleR_7;
801
-
802
- r04815926 = LOAD_GATHER8_32(LOAD8_32( 0, 4, 8, 1, 5, 9, 2, 6), y32);
803
- r5926a37b = LOAD_GATHER8_32(LOAD8_32( 5, 9, 2, 6, 10, 3, 7, 11), y32);
804
-
805
- initialLength = length;
806
-
807
- #define rCGKDHLEI r5926a37b
808
- #define aCGKDHLEI ((uint32_t*)&rCGKDHLEI)
809
- do {
810
- a00 = r04815926;
811
- a01 = _mm256_blend_epi32(SHUFFLE_LANES_RIGHT(r04815926, 3), SHUFFLE_LANES_RIGHT(r5926a37b, 7), 0xE0); /* 15926+A37 */
812
- a02 = SHUFFLE_LANES_RIGHT_2(r5926a37b); /* 26a37b+-- */
813
-
814
- a12 = INSERT_LANE(SHUFFLE_LANES_RIGHT(a00, 1), EXTRACT_LANE(a01, 5), 7); /* 4815926+A */
815
- a20 = INSERT_LANE(SHUFFLE_LANES_RIGHT(a12, 1), EXTRACT_LANE(a01, 6), 7); /* 815926A+3 */
816
-
817
- rCGKDHLEI = XOR256(ROL32in256(a00, 5), ROL32in256(a12, 13));
818
- rCGKDHLEI = XOR256(rCGKDHLEI, AND256(a20, a12));
819
- rCGKDHLEI = XOR256(rCGKDHLEI, CONST8_32(7));
820
-
821
- a02 = _mm256_blend_epi32(a02, SHUFFLE_LANES_RIGHT_2(rCGKDHLEI), 0xC0);
822
- a03 = _mm256_blend_epi32(SHUFFLE_LANES_RIGHT(a02, 3), SHUFFLE_LANES_RIGHT(rCGKDHLEI, 5), 0xF8);
823
-
824
- a13 = INSERT_LANE(SHUFFLE_LANES_RIGHT(a01, 1), EXTRACT_LANE(a02, 5), 7); /* B */
825
- a10 = INSERT_LANE(SHUFFLE_LANES_RIGHT(a02, 1), aCGKDHLEI[2], 7); /* K */
826
- a11 = INSERT_LANE(SHUFFLE_LANES_RIGHT(a03, 1), aCGKDHLEI[5], 7); /* L */
827
-
828
- a21 = INSERT_LANE(SHUFFLE_LANES_RIGHT(a13, 1), aCGKDHLEI[0], 7); /* C */
829
- a22 = INSERT_LANE(SHUFFLE_LANES_RIGHT(a10, 1), aCGKDHLEI[3], 7); /* D */
830
- a23 = INSERT_LANE(SHUFFLE_LANES_RIGHT(a11, 1), aCGKDHLEI[6], 7); /* E */
831
- r04815926 = a22;
832
- Dump("Roll-e", a);
833
-
834
- Round( a12, a13, a10, a11, a11, a12, a13, a10, a20, a21, a22, a23, _rc6 );
835
- Round( a11, a12, a13, a10, a10, a11, a12, a13, a22, a23, a20, a21, _rc5 );
836
- Round( a10, a11, a12, a13, a13, a10, a11, a12, a20, a21, a22, a23, _rc4 );
837
- Round( a13, a10, a11, a12, a12, a13, a10, a11, a22, a23, a20, a21, _rc3 );
838
- Round( a12, a13, a10, a11, a11, a12, a13, a10, a20, a21, a22, a23, _rc2 );
839
- Round( a11, a12, a13, a10, a10, a11, a12, a13, a22, a23, a20, a21, _rc1 );
840
- Dump("Xoodoo(y)", a);
841
-
842
- a00 = XOR256(a00, CONST8_32(k32[0]));
843
- a01 = XOR256(a01, CONST8_32(k32[1]));
844
- a02 = XOR256(a02, CONST8_32(k32[2]));
845
- a03 = XOR256(a03, CONST8_32(k32[3]));
846
- a10 = XOR256(a10, CONST8_32(k32[4]));
847
- a11 = XOR256(a11, CONST8_32(k32[5]));
848
- a12 = XOR256(a12, CONST8_32(k32[6]));
849
- a13 = XOR256(a13, CONST8_32(k32[7]));
850
- a20 = XOR256(a20, CONST8_32(k32[8]));
851
- a21 = XOR256(a21, CONST8_32(k32[9]));
852
- a22 = XOR256(a22, CONST8_32(k32[10]));
853
- a23 = XOR256(a23, CONST8_32(k32[11]));
854
- Dump("Xoodoo(y) + kRoll", a);
855
-
856
- /* Extract */
857
- #define UNPACKL32(a, b) _mm256_unpacklo_epi32(a, b)
858
- #define UNPACKH32(a, b) _mm256_unpackhi_epi32(a, b)
859
- #define UNPACKL64(a, b) _mm256_unpacklo_epi64(a, b)
860
- #define UNPACKH64(a, b) _mm256_unpackhi_epi64(a, b)
861
- #define UNPACKL128(a, b) _mm256_permute2x128_si256(a, b, 0x20)
862
- #define UNPACKH128(a, b) _mm256_permute2x128_si256(a, b, 0x31)
863
- #define lanesL01 v1
864
- #define lanesH01 v2
865
- #define lanesL23 v3
866
- #define lanesH23 v4
867
-
868
- lanesL01 = UNPACKL32( a00, a01 );
869
- lanesH01 = UNPACKH32( a00, a01 );
870
- lanesL23 = UNPACKL32( a02, a03 );
871
- lanesH23 = UNPACKH32( a02, a03 );
872
- a00 = UNPACKL64( lanesL01, lanesL23 );
873
- a01 = UNPACKH64( lanesL01, lanesL23 );
874
- a02 = UNPACKL64( lanesH01, lanesH23 );
875
- a03 = UNPACKH64( lanesH01, lanesH23 );
876
-
877
- lanesL01 = UNPACKL32( a10, a11 );
878
- lanesH01 = UNPACKH32( a10, a11 );
879
- lanesL23 = UNPACKL32( a12, a13 );
880
- lanesH23 = UNPACKH32( a12, a13 );
881
- a10 = UNPACKL64( lanesL01, lanesL23 );
882
- a11 = UNPACKH64( lanesL01, lanesL23 );
883
- a12 = UNPACKL64( lanesH01, lanesH23 );
884
- a13 = UNPACKH64( lanesH01, lanesH23 );
885
-
886
- lanesL01 = UNPACKL128( a00, a10 );
887
- lanesH01 = UNPACKH128( a00, a10 );
888
- lanesL23 = UNPACKL128( a01, a11 );
889
- lanesH23 = UNPACKH128( a01, a11 );
890
- STORE256u(o32[0*12+0], lanesL01);
891
- STORE256u(o32[4*12+0], lanesH01);
892
- STORE256u(o32[1*12+0], lanesL23);
893
- STORE256u(o32[5*12+0], lanesH23);
894
-
895
- lanesL01 = UNPACKL128( a02, a12 );
896
- lanesH01 = UNPACKH128( a02, a12 );
897
- lanesL23 = UNPACKL128( a03, a13 );
898
- lanesH23 = UNPACKH128( a03, a13 );
899
- STORE256u(o32[2*12+0], lanesL01);
900
- STORE256u(o32[6*12+0], lanesH01);
901
- STORE256u(o32[3*12+0], lanesL23);
902
- STORE256u(o32[7*12+0], lanesH23);
903
-
904
- lanesL01 = UNPACKL32( a20, a21 );
905
- lanesH01 = UNPACKH32( a20, a21 );
906
- lanesL23 = UNPACKL32( a22, a23 );
907
- lanesH23 = UNPACKH32( a22, a23 );
908
- a20 = UNPACKL64( lanesL01, lanesL23 );
909
- a21 = UNPACKH64( lanesL01, lanesL23 );
910
- a22 = UNPACKL64( lanesH01, lanesH23 );
911
- a23 = UNPACKH64( lanesH01, lanesH23 );
912
- _mm256_storeu2_m128i((__m128i*)(o32+4*12+8), (__m128i*)(o32+0*12+8), a20);
913
- _mm256_storeu2_m128i((__m128i*)(o32+5*12+8), (__m128i*)(o32+1*12+8), a21);
914
- _mm256_storeu2_m128i((__m128i*)(o32+6*12+8), (__m128i*)(o32+2*12+8), a22);
915
- _mm256_storeu2_m128i((__m128i*)(o32+7*12+8), (__m128i*)(o32+3*12+8), a23);
916
- Dump("shuffle", a);
917
-
918
- o32 += NLANES*8;
919
- length -= NLANES*4*8;
920
- }
921
- while (length >= (NLANES*4*8));
922
-
923
- /* Save new y from r04815926 and rCGKDHLEI */
924
- y32[ 0] = _mm256_extract_epi32(r04815926, 0);
925
- y32[ 1] = _mm256_extract_epi32(r04815926, 3);
926
- y32[ 2] = _mm256_extract_epi32(rCGKDHLEI, 2); /* K */
927
- y32[ 3] = _mm256_extract_epi32(rCGKDHLEI, 5); /* L */
928
- y32[ 4] = _mm256_extract_epi32(r04815926, 1);
929
- y32[ 5] = _mm256_extract_epi32(rCGKDHLEI, 0); /* C */
930
- y32[ 6] = _mm256_extract_epi32(rCGKDHLEI, 3); /* D */
931
- y32[ 7] = _mm256_extract_epi32(rCGKDHLEI, 6); /* E */
932
- y32[ 8] = _mm256_extract_epi32(r04815926, 2);
933
- y32[ 9] = _mm256_extract_epi32(rCGKDHLEI, 1); /* G */
934
- y32[10] = _mm256_extract_epi32(rCGKDHLEI, 4); /* H */
935
- y32[11] = _mm256_extract_epi32(rCGKDHLEI, 7); /* I */
936
- #undef rCGKDHLEI
937
-
938
- return initialLength - length;
939
- }