sleeping_kangaroo12 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (284) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +127 -0
  3. data/ext/Rakefile +73 -0
  4. data/ext/binding/sleeping_kangaroo12.c +39 -0
  5. data/ext/config/xkcp.build +17 -0
  6. data/ext/xkcp/LICENSE +1 -0
  7. data/ext/xkcp/Makefile +15 -0
  8. data/ext/xkcp/Makefile.build +200 -0
  9. data/ext/xkcp/README.markdown +296 -0
  10. data/ext/xkcp/lib/HighLevel.build +143 -0
  11. data/ext/xkcp/lib/LowLevel.build +757 -0
  12. data/ext/xkcp/lib/common/align.h +33 -0
  13. data/ext/xkcp/lib/common/brg_endian.h +143 -0
  14. data/ext/xkcp/lib/high/KangarooTwelve/KangarooTwelve.c +301 -0
  15. data/ext/xkcp/lib/high/KangarooTwelve/KangarooTwelve.h +97 -0
  16. data/ext/xkcp/lib/high/Keccak/FIPS202/KeccakHash.c +81 -0
  17. data/ext/xkcp/lib/high/Keccak/FIPS202/KeccakHash.h +125 -0
  18. data/ext/xkcp/lib/high/Keccak/FIPS202/SimpleFIPS202.c +48 -0
  19. data/ext/xkcp/lib/high/Keccak/FIPS202/SimpleFIPS202.h +79 -0
  20. data/ext/xkcp/lib/high/Keccak/KeccakDuplex.c +81 -0
  21. data/ext/xkcp/lib/high/Keccak/KeccakDuplex.h +73 -0
  22. data/ext/xkcp/lib/high/Keccak/KeccakDuplex.inc +195 -0
  23. data/ext/xkcp/lib/high/Keccak/KeccakSponge.c +111 -0
  24. data/ext/xkcp/lib/high/Keccak/KeccakSponge.h +76 -0
  25. data/ext/xkcp/lib/high/Keccak/KeccakSponge.inc +314 -0
  26. data/ext/xkcp/lib/high/Keccak/PRG/KeccakPRG.c +61 -0
  27. data/ext/xkcp/lib/high/Keccak/PRG/KeccakPRG.h +67 -0
  28. data/ext/xkcp/lib/high/Keccak/PRG/KeccakPRG.inc +128 -0
  29. data/ext/xkcp/lib/high/Keccak/SP800-185/SP800-185.c +93 -0
  30. data/ext/xkcp/lib/high/Keccak/SP800-185/SP800-185.h +599 -0
  31. data/ext/xkcp/lib/high/Keccak/SP800-185/SP800-185.inc +573 -0
  32. data/ext/xkcp/lib/high/Ketje/Ketjev2.c +87 -0
  33. data/ext/xkcp/lib/high/Ketje/Ketjev2.h +88 -0
  34. data/ext/xkcp/lib/high/Ketje/Ketjev2.inc +274 -0
  35. data/ext/xkcp/lib/high/Keyak/Keyakv2.c +132 -0
  36. data/ext/xkcp/lib/high/Keyak/Keyakv2.h +217 -0
  37. data/ext/xkcp/lib/high/Keyak/Keyakv2.inc +81 -0
  38. data/ext/xkcp/lib/high/Keyak/Motorist.inc +953 -0
  39. data/ext/xkcp/lib/high/Kravatte/Kravatte.c +533 -0
  40. data/ext/xkcp/lib/high/Kravatte/Kravatte.h +115 -0
  41. data/ext/xkcp/lib/high/Kravatte/KravatteModes.c +557 -0
  42. data/ext/xkcp/lib/high/Kravatte/KravatteModes.h +247 -0
  43. data/ext/xkcp/lib/high/Xoodyak/Cyclist.h +66 -0
  44. data/ext/xkcp/lib/high/Xoodyak/Cyclist.inc +336 -0
  45. data/ext/xkcp/lib/high/Xoodyak/Xoodyak-parameters.h +26 -0
  46. data/ext/xkcp/lib/high/Xoodyak/Xoodyak.c +55 -0
  47. data/ext/xkcp/lib/high/Xoodyak/Xoodyak.h +35 -0
  48. data/ext/xkcp/lib/high/Xoofff/Xoofff.c +634 -0
  49. data/ext/xkcp/lib/high/Xoofff/Xoofff.h +147 -0
  50. data/ext/xkcp/lib/high/Xoofff/XoofffModes.c +483 -0
  51. data/ext/xkcp/lib/high/Xoofff/XoofffModes.h +241 -0
  52. data/ext/xkcp/lib/high/common/Phases.h +25 -0
  53. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-SnP.h +41 -0
  54. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-inplace-32bi-armv6m-le-armcc.s +1666 -0
  55. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-inplace-32bi-armv6m-le-gcc.s +1655 -0
  56. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-inplace-32bi-armv7a-le-armcc.s +1268 -0
  57. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-inplace-32bi-armv7a-le-gcc.s +1264 -0
  58. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-inplace-32bi-armv7m-le-armcc.s +1178 -0
  59. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-inplace-32bi-armv7m-le-gcc.s +1175 -0
  60. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-u1-32bi-armv6m-le-armcc.s +1338 -0
  61. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-u1-32bi-armv6m-le-gcc.s +1336 -0
  62. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-u2-32bi-armv6m-le-armcc.s +1343 -0
  63. data/ext/xkcp/lib/low/KeccakP-1600/ARM/KeccakP-1600-u2-32bi-armv6m-le-gcc.s +1339 -0
  64. data/ext/xkcp/lib/low/KeccakP-1600/ARMv7A-NEON/KeccakP-1600-SnP.h +42 -0
  65. data/ext/xkcp/lib/low/KeccakP-1600/ARMv7A-NEON/KeccakP-1600-armv7a-le-neon-armcc.s +823 -0
  66. data/ext/xkcp/lib/low/KeccakP-1600/ARMv7A-NEON/KeccakP-1600-armv7a-le-neon-gcc.s +831 -0
  67. data/ext/xkcp/lib/low/KeccakP-1600/ARMv8A/KeccakP-1600-SnP.h +31 -0
  68. data/ext/xkcp/lib/low/KeccakP-1600/ARMv8A/KeccakP-1600-armv8a-neon.s +540 -0
  69. data/ext/xkcp/lib/low/KeccakP-1600/AVR8/KeccakP-1600-SnP.h +42 -0
  70. data/ext/xkcp/lib/low/KeccakP-1600/AVR8/KeccakP-1600-avr8-compact.s +733 -0
  71. data/ext/xkcp/lib/low/KeccakP-1600/AVR8/KeccakP-1600-avr8-fast.s +1121 -0
  72. data/ext/xkcp/lib/low/KeccakP-1600/AVX2/KeccakP-1600-AVX2.s +1100 -0
  73. data/ext/xkcp/lib/low/KeccakP-1600/AVX2/KeccakP-1600-SnP.h +52 -0
  74. data/ext/xkcp/lib/low/KeccakP-1600/AVX512/C/KeccakP-1600-AVX512.c +623 -0
  75. data/ext/xkcp/lib/low/KeccakP-1600/AVX512/C/KeccakP-1600-SnP.h +47 -0
  76. data/ext/xkcp/lib/low/KeccakP-1600/AVX512/C/u12/KeccakP-1600-AVX512-config.h +6 -0
  77. data/ext/xkcp/lib/low/KeccakP-1600/AVX512/C/u6/KeccakP-1600-AVX512-config.h +6 -0
  78. data/ext/xkcp/lib/low/KeccakP-1600/AVX512/C/ua/KeccakP-1600-AVX512-config.h +6 -0
  79. data/ext/xkcp/lib/low/KeccakP-1600/AVX512/KeccakP-1600-AVX512.s +1031 -0
  80. data/ext/xkcp/lib/low/KeccakP-1600/AVX512/KeccakP-1600-SnP.h +53 -0
  81. data/ext/xkcp/lib/low/KeccakP-1600/XOP/KeccakP-1600-SnP.h +44 -0
  82. data/ext/xkcp/lib/low/KeccakP-1600/XOP/KeccakP-1600-XOP.c +476 -0
  83. data/ext/xkcp/lib/low/KeccakP-1600/XOP/u6/KeccakP-1600-XOP-config.h +6 -0
  84. data/ext/xkcp/lib/low/KeccakP-1600/XOP/ua/KeccakP-1600-XOP-config.h +6 -0
  85. data/ext/xkcp/lib/low/KeccakP-1600/common/KeccakP-1600-64.macros +748 -0
  86. data/ext/xkcp/lib/low/KeccakP-1600/common/KeccakP-1600-unrolling.macros +305 -0
  87. data/ext/xkcp/lib/low/KeccakP-1600/compact/KeccakP-1600-SnP.h +40 -0
  88. data/ext/xkcp/lib/low/KeccakP-1600/compact/KeccakP-1600-compact64.c +420 -0
  89. data/ext/xkcp/lib/low/KeccakP-1600/plain-32bits-inplace/KeccakP-1600-SnP.h +43 -0
  90. data/ext/xkcp/lib/low/KeccakP-1600/plain-32bits-inplace/KeccakP-1600-inplace32BI.c +1163 -0
  91. data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/KeccakP-1600-SnP.h +54 -0
  92. data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/KeccakP-1600-opt64.c +565 -0
  93. data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/lcu6/KeccakP-1600-opt64-config.h +7 -0
  94. data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/lcua/KeccakP-1600-opt64-config.h +7 -0
  95. data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/lcua-shld/KeccakP-1600-opt64-config.h +8 -0
  96. data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/u6/KeccakP-1600-opt64-config.h +6 -0
  97. data/ext/xkcp/lib/low/KeccakP-1600/plain-64bits/ua/KeccakP-1600-opt64-config.h +6 -0
  98. data/ext/xkcp/lib/low/KeccakP-1600/ref-32bits/KeccakP-1600-SnP.h +44 -0
  99. data/ext/xkcp/lib/low/KeccakP-1600/ref-32bits/KeccakP-1600-reference.h +23 -0
  100. data/ext/xkcp/lib/low/KeccakP-1600/ref-32bits/KeccakP-1600-reference32BI.c +625 -0
  101. data/ext/xkcp/lib/low/KeccakP-1600/ref-64bits/KeccakP-1600-SnP.h +44 -0
  102. data/ext/xkcp/lib/low/KeccakP-1600/ref-64bits/KeccakP-1600-reference.c +440 -0
  103. data/ext/xkcp/lib/low/KeccakP-1600/ref-64bits/KeccakP-1600-reference.h +23 -0
  104. data/ext/xkcp/lib/low/KeccakP-1600/x86-64/KeccakP-1600-SnP.h +42 -0
  105. data/ext/xkcp/lib/low/KeccakP-1600/x86-64/KeccakP-1600-x86-64-gas.s +1196 -0
  106. data/ext/xkcp/lib/low/KeccakP-1600/x86-64/KeccakP-1600-x86-64-gas_Apple.s +1124 -0
  107. data/ext/xkcp/lib/low/KeccakP-1600/x86-64/KeccakP-1600-x86-64-shld-gas.s +1196 -0
  108. data/ext/xkcp/lib/low/KeccakP-1600-times2/ARMv7A-NEON/KeccakP-1600-inplace-pl2-armv7a-neon-le-armcc.s +1392 -0
  109. data/ext/xkcp/lib/low/KeccakP-1600-times2/ARMv7A-NEON/KeccakP-1600-inplace-pl2-armv7a-neon-le-gcc.s +1394 -0
  110. data/ext/xkcp/lib/low/KeccakP-1600-times2/ARMv7A-NEON/KeccakP-1600-times2-SnP.h +42 -0
  111. data/ext/xkcp/lib/low/KeccakP-1600-times2/AVX512/AVX512u12/SIMD512-2-config.h +7 -0
  112. data/ext/xkcp/lib/low/KeccakP-1600-times2/AVX512/AVX512u4/SIMD512-2-config.h +7 -0
  113. data/ext/xkcp/lib/low/KeccakP-1600-times2/AVX512/AVX512ufull/SIMD512-2-config.h +7 -0
  114. data/ext/xkcp/lib/low/KeccakP-1600-times2/AVX512/KeccakP-1600-times2-SIMD512.c +850 -0
  115. data/ext/xkcp/lib/low/KeccakP-1600-times2/AVX512/KeccakP-1600-times2-SnP.h +51 -0
  116. data/ext/xkcp/lib/low/KeccakP-1600-times2/SIMD128/KeccakP-1600-times2-SIMD128.c +957 -0
  117. data/ext/xkcp/lib/low/KeccakP-1600-times2/SIMD128/KeccakP-1600-times2-SnP.h +49 -0
  118. data/ext/xkcp/lib/low/KeccakP-1600-times2/SIMD128/SSSE3-u2/SIMD128-config.h +8 -0
  119. data/ext/xkcp/lib/low/KeccakP-1600-times2/SIMD128/SSSE3-ua/SIMD128-config.h +8 -0
  120. data/ext/xkcp/lib/low/KeccakP-1600-times2/SIMD128/XOP-u2/SIMD128-config.h +9 -0
  121. data/ext/xkcp/lib/low/KeccakP-1600-times2/SIMD128/XOP-ua/SIMD128-config.h +9 -0
  122. data/ext/xkcp/lib/low/KeccakP-1600-times2/fallback-on1/KeccakP-1600-times2-SnP.h +45 -0
  123. data/ext/xkcp/lib/low/KeccakP-1600-times2/fallback-on1/KeccakP-1600-times2-on1.c +37 -0
  124. data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX2/KeccakP-1600-times4-SIMD256.c +1321 -0
  125. data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX2/KeccakP-1600-times4-SnP.h +55 -0
  126. data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX2/u12/SIMD256-config.h +7 -0
  127. data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX2/u6/SIMD256-config.h +7 -0
  128. data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX2/ua/SIMD256-config.h +7 -0
  129. data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX512/AVX512u12/SIMD512-4-config.h +7 -0
  130. data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX512/AVX512u4/SIMD512-4-config.h +7 -0
  131. data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX512/AVX512ufull/SIMD512-4-config.h +7 -0
  132. data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX512/KeccakP-1600-times4-SIMD512.c +881 -0
  133. data/ext/xkcp/lib/low/KeccakP-1600-times4/AVX512/KeccakP-1600-times4-SnP.h +51 -0
  134. data/ext/xkcp/lib/low/KeccakP-1600-times4/fallback-on1/KeccakP-1600-times4-SnP.h +45 -0
  135. data/ext/xkcp/lib/low/KeccakP-1600-times4/fallback-on1/KeccakP-1600-times4-on1.c +37 -0
  136. data/ext/xkcp/lib/low/KeccakP-1600-times4/fallback-on2/KeccakP-1600-times4-SnP.h +45 -0
  137. data/ext/xkcp/lib/low/KeccakP-1600-times4/fallback-on2/KeccakP-1600-times4-on2.c +38 -0
  138. data/ext/xkcp/lib/low/KeccakP-1600-times8/AVX512/KeccakP-1600-times8-SIMD512.c +1615 -0
  139. data/ext/xkcp/lib/low/KeccakP-1600-times8/AVX512/KeccakP-1600-times8-SnP.h +57 -0
  140. data/ext/xkcp/lib/low/KeccakP-1600-times8/AVX512/u12/SIMD512-config.h +7 -0
  141. data/ext/xkcp/lib/low/KeccakP-1600-times8/AVX512/u4/SIMD512-config.h +7 -0
  142. data/ext/xkcp/lib/low/KeccakP-1600-times8/AVX512/ua/SIMD512-config.h +7 -0
  143. data/ext/xkcp/lib/low/KeccakP-1600-times8/fallback-on1/KeccakP-1600-times8-SnP.h +45 -0
  144. data/ext/xkcp/lib/low/KeccakP-1600-times8/fallback-on1/KeccakP-1600-times8-on1.c +37 -0
  145. data/ext/xkcp/lib/low/KeccakP-1600-times8/fallback-on2/KeccakP-1600-times8-SnP.h +45 -0
  146. data/ext/xkcp/lib/low/KeccakP-1600-times8/fallback-on2/KeccakP-1600-times8-on2.c +38 -0
  147. data/ext/xkcp/lib/low/KeccakP-1600-times8/fallback-on4/KeccakP-1600-times8-SnP.h +45 -0
  148. data/ext/xkcp/lib/low/KeccakP-1600-times8/fallback-on4/KeccakP-1600-times8-on4.c +38 -0
  149. data/ext/xkcp/lib/low/KeccakP-200/ARM/KeccakP-200-SnP.h +41 -0
  150. data/ext/xkcp/lib/low/KeccakP-200/ARM/KeccakP-200-armv6m-le-armcc.s +442 -0
  151. data/ext/xkcp/lib/low/KeccakP-200/ARM/KeccakP-200-armv6m-le-gcc.s +446 -0
  152. data/ext/xkcp/lib/low/KeccakP-200/ARM/KeccakP-200-armv7m-le-armcc.s +419 -0
  153. data/ext/xkcp/lib/low/KeccakP-200/ARM/KeccakP-200-armv7m-le-gcc.s +427 -0
  154. data/ext/xkcp/lib/low/KeccakP-200/AVR8/KeccakP-200-SnP.h +41 -0
  155. data/ext/xkcp/lib/low/KeccakP-200/AVR8/KeccakP-200-avr8-fast.s +647 -0
  156. data/ext/xkcp/lib/low/KeccakP-200/compact/KeccakP-200-SnP.h +39 -0
  157. data/ext/xkcp/lib/low/KeccakP-200/compact/KeccakP-200-compact.c +190 -0
  158. data/ext/xkcp/lib/low/KeccakP-200/ref/KeccakP-200-SnP.h +43 -0
  159. data/ext/xkcp/lib/low/KeccakP-200/ref/KeccakP-200-reference.c +412 -0
  160. data/ext/xkcp/lib/low/KeccakP-200/ref/KeccakP-200-reference.h +23 -0
  161. data/ext/xkcp/lib/low/KeccakP-400/ARM/KeccakP-400-SnP.h +41 -0
  162. data/ext/xkcp/lib/low/KeccakP-400/ARM/KeccakP-400-armv6m-le-armcc.s +454 -0
  163. data/ext/xkcp/lib/low/KeccakP-400/ARM/KeccakP-400-armv6m-le-gcc.s +458 -0
  164. data/ext/xkcp/lib/low/KeccakP-400/ARM/KeccakP-400-armv7m-le-armcc.s +455 -0
  165. data/ext/xkcp/lib/low/KeccakP-400/ARM/KeccakP-400-armv7m-le-gcc.s +458 -0
  166. data/ext/xkcp/lib/low/KeccakP-400/AVR8/KeccakP-400-SnP.h +41 -0
  167. data/ext/xkcp/lib/low/KeccakP-400/AVR8/KeccakP-400-avr8-fast.s +728 -0
  168. data/ext/xkcp/lib/low/KeccakP-400/ref/KeccakP-400-SnP.h +43 -0
  169. data/ext/xkcp/lib/low/KeccakP-400/ref/KeccakP-400-reference.c +414 -0
  170. data/ext/xkcp/lib/low/KeccakP-400/ref/KeccakP-400-reference.h +23 -0
  171. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-SnP.h +42 -0
  172. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u1-armv6m-le-armcc.s +527 -0
  173. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u1-armv6m-le-gcc.s +533 -0
  174. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u2-armv6m-le-armcc.s +528 -0
  175. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u2-armv6m-le-gcc.s +534 -0
  176. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u2-armv7a-le-armcc.s +521 -0
  177. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u2-armv7a-le-gcc.s +527 -0
  178. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u2-armv7m-le-armcc.s +517 -0
  179. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-u2-armv7m-le-gcc.s +523 -0
  180. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-uf-armv7m-le-armcc.s +550 -0
  181. data/ext/xkcp/lib/low/KeccakP-800/ARM/KeccakP-800-uf-armv7m-le-gcc.s +556 -0
  182. data/ext/xkcp/lib/low/KeccakP-800/ARMv8A/KeccakP-800-SnP.h +32 -0
  183. data/ext/xkcp/lib/low/KeccakP-800/ARMv8A/KeccakP-800-armv8a-neon.s +432 -0
  184. data/ext/xkcp/lib/low/KeccakP-800/AVR8/KeccakP-800-SnP.h +42 -0
  185. data/ext/xkcp/lib/low/KeccakP-800/AVR8/KeccakP-800-avr8-fast.s +929 -0
  186. data/ext/xkcp/lib/low/KeccakP-800/compact/KeccakP-800-SnP.h +40 -0
  187. data/ext/xkcp/lib/low/KeccakP-800/compact/KeccakP-800-compact.c +244 -0
  188. data/ext/xkcp/lib/low/KeccakP-800/plain/KeccakP-800-SnP.h +46 -0
  189. data/ext/xkcp/lib/low/KeccakP-800/plain/KeccakP-800-opt32-bis.macros +184 -0
  190. data/ext/xkcp/lib/low/KeccakP-800/plain/KeccakP-800-opt32.c +454 -0
  191. data/ext/xkcp/lib/low/KeccakP-800/plain/KeccakP-800-opt32.macros +459 -0
  192. data/ext/xkcp/lib/low/KeccakP-800/plain/KeccakP-800-unrolling-bis.macros +83 -0
  193. data/ext/xkcp/lib/low/KeccakP-800/plain/KeccakP-800-unrolling.macros +88 -0
  194. data/ext/xkcp/lib/low/KeccakP-800/plain/lcu2/KeccakP-800-opt32-config.h +7 -0
  195. data/ext/xkcp/lib/low/KeccakP-800/plain/lcua/KeccakP-800-opt32-config.h +7 -0
  196. data/ext/xkcp/lib/low/KeccakP-800/plain/u2/KeccakP-800-opt32-config.h +7 -0
  197. data/ext/xkcp/lib/low/KeccakP-800/plain/ua/KeccakP-800-opt32-config.h +7 -0
  198. data/ext/xkcp/lib/low/KeccakP-800/ref/KeccakP-800-SnP.h +44 -0
  199. data/ext/xkcp/lib/low/KeccakP-800/ref/KeccakP-800-reference.c +437 -0
  200. data/ext/xkcp/lib/low/KeccakP-800/ref/KeccakP-800-reference.h +23 -0
  201. data/ext/xkcp/lib/low/Ketje/OptimizedAsmARM/Ket.h +57 -0
  202. data/ext/xkcp/lib/low/Ketje/OptimizedAsmARM/KetjeJr-armv7m-le-armcc.s +475 -0
  203. data/ext/xkcp/lib/low/Ketje/OptimizedAsmARM/KetjeJr-armv7m-le-gcc.s +480 -0
  204. data/ext/xkcp/lib/low/Ketje/OptimizedAsmARM/KetjeSr-armv7m-le-armcc.s +590 -0
  205. data/ext/xkcp/lib/low/Ketje/OptimizedAsmARM/KetjeSr-armv7m-le-gcc.s +590 -0
  206. data/ext/xkcp/lib/low/Ketje/OptimizedLE/Ket.c +126 -0
  207. data/ext/xkcp/lib/low/Ketje/OptimizedLE/Ket.h +68 -0
  208. data/ext/xkcp/lib/low/Ketje/OptimizedLE/Ket.inc +174 -0
  209. data/ext/xkcp/lib/low/Ketje/SnP-compliant/Ket.c +80 -0
  210. data/ext/xkcp/lib/low/Ketje/SnP-compliant/Ket.h +68 -0
  211. data/ext/xkcp/lib/low/Ketje/SnP-compliant/Ket.inc +142 -0
  212. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-SnP.h +55 -0
  213. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-u1-armv6m-le-armcc.s +1086 -0
  214. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-u1-armv6m-le-gcc.s +1092 -0
  215. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-uf-armv6-le-armcc.s +721 -0
  216. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-uf-armv6-le-gcc.s +726 -0
  217. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-uf-armv7m-le-armcc.s +723 -0
  218. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodoo-uf-armv7m-le-gcc.s +729 -0
  219. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodyak-u1-armv6m-le-armcc.s +1164 -0
  220. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodyak-u1-armv6m-le-gcc.s +1165 -0
  221. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodyak-uf-armv6-le-armcc.s +562 -0
  222. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodyak-uf-armv6-le-gcc.s +563 -0
  223. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodyak-uf-armv7m-le-armcc.s +563 -0
  224. data/ext/xkcp/lib/low/Xoodoo/ARM/Xoodyak-uf-armv7m-le-gcc.s +565 -0
  225. data/ext/xkcp/lib/low/Xoodoo/ARMv7A-NEON/Xoodoo-SnP.h +55 -0
  226. data/ext/xkcp/lib/low/Xoodoo/ARMv7A-NEON/Xoodoo-uf-armv7a-neon-le-armcc.s +476 -0
  227. data/ext/xkcp/lib/low/Xoodoo/ARMv7A-NEON/Xoodoo-uf-armv7a-neon-le-gcc.s +485 -0
  228. data/ext/xkcp/lib/low/Xoodoo/ARMv7A-NEON/Xoodyak-uf-armv7a-neon-le-armcc.s +362 -0
  229. data/ext/xkcp/lib/low/Xoodoo/ARMv7A-NEON/Xoodyak-uf-armv7a-neon-le-gcc.s +367 -0
  230. data/ext/xkcp/lib/low/Xoodoo/AVR8/Xoodoo-SnP.h +43 -0
  231. data/ext/xkcp/lib/low/Xoodoo/AVR8/Xoodoo-avr8-u1.s +1341 -0
  232. data/ext/xkcp/lib/low/Xoodoo/AVX512/Xoodoo-SIMD512.c +581 -0
  233. data/ext/xkcp/lib/low/Xoodoo/AVX512/Xoodoo-SnP.h +58 -0
  234. data/ext/xkcp/lib/low/Xoodoo/AVX512/Xoodyak-full-block-SIMD512.c +332 -0
  235. data/ext/xkcp/lib/low/Xoodoo/SSE2/Xoodoo-SIMD128.c +329 -0
  236. data/ext/xkcp/lib/low/Xoodoo/SSE2/Xoodoo-SnP.h +53 -0
  237. data/ext/xkcp/lib/low/Xoodoo/SSE2/Xoodyak-full-block-SIMD128.c +355 -0
  238. data/ext/xkcp/lib/low/Xoodoo/Xoodoo.h +79 -0
  239. data/ext/xkcp/lib/low/Xoodoo/plain/Xoodoo-SnP.h +56 -0
  240. data/ext/xkcp/lib/low/Xoodoo/plain/Xoodoo-optimized.c +399 -0
  241. data/ext/xkcp/lib/low/Xoodoo/plain/Xoodyak-full-blocks.c +127 -0
  242. data/ext/xkcp/lib/low/Xoodoo/ref/Xoodoo-SnP.h +43 -0
  243. data/ext/xkcp/lib/low/Xoodoo/ref/Xoodoo-reference.c +253 -0
  244. data/ext/xkcp/lib/low/Xoodoo-times16/AVX512/Xoodoo-times16-SIMD512.c +1044 -0
  245. data/ext/xkcp/lib/low/Xoodoo-times16/AVX512/Xoodoo-times16-SnP.h +49 -0
  246. data/ext/xkcp/lib/low/Xoodoo-times16/fallback-on1/Xoodoo-times16-SnP.h +45 -0
  247. data/ext/xkcp/lib/low/Xoodoo-times16/fallback-on1/Xoodoo-times16-on1.c +37 -0
  248. data/ext/xkcp/lib/low/Xoodoo-times4/ARMv7A-NEON/Xoodoo-times4-ARMv7A.s +1587 -0
  249. data/ext/xkcp/lib/low/Xoodoo-times4/ARMv7A-NEON/Xoodoo-times4-SnP.h +48 -0
  250. data/ext/xkcp/lib/low/Xoodoo-times4/AVX512/Xoodoo-times4-SIMD512.c +1202 -0
  251. data/ext/xkcp/lib/low/Xoodoo-times4/AVX512/Xoodoo-times4-SnP.h +48 -0
  252. data/ext/xkcp/lib/low/Xoodoo-times4/SSSE3/Xoodoo-times4-SIMD128.c +484 -0
  253. data/ext/xkcp/lib/low/Xoodoo-times4/SSSE3/Xoodoo-times4-SnP.h +44 -0
  254. data/ext/xkcp/lib/low/Xoodoo-times4/fallback-on1/Xoodoo-times4-SnP.h +45 -0
  255. data/ext/xkcp/lib/low/Xoodoo-times4/fallback-on1/Xoodoo-times4-on1.c +37 -0
  256. data/ext/xkcp/lib/low/Xoodoo-times8/AVX2/Xoodoo-times8-SIMD256.c +939 -0
  257. data/ext/xkcp/lib/low/Xoodoo-times8/AVX2/Xoodoo-times8-SnP.h +49 -0
  258. data/ext/xkcp/lib/low/Xoodoo-times8/AVX512/Xoodoo-times8-SIMD512.c +1216 -0
  259. data/ext/xkcp/lib/low/Xoodoo-times8/AVX512/Xoodoo-times8-SnP.h +48 -0
  260. data/ext/xkcp/lib/low/Xoodoo-times8/fallback-on1/Xoodoo-times8-SnP.h +45 -0
  261. data/ext/xkcp/lib/low/Xoodoo-times8/fallback-on1/Xoodoo-times8-on1.c +37 -0
  262. data/ext/xkcp/lib/low/common/PlSnP-Fallback.inc +290 -0
  263. data/ext/xkcp/lib/low/common/SnP-Relaned.h +141 -0
  264. data/ext/xkcp/support/Build/ExpandProducts.xsl +79 -0
  265. data/ext/xkcp/support/Build/ToGlobalMakefile.xsl +206 -0
  266. data/ext/xkcp/support/Build/ToOneTarget.xsl +89 -0
  267. data/ext/xkcp/support/Build/ToTargetConfigFile.xsl +37 -0
  268. data/ext/xkcp/support/Build/ToTargetMakefile.xsl +298 -0
  269. data/ext/xkcp/support/Build/ToVCXProj.xsl +198 -0
  270. data/ext/xkcp/support/Kernel-PMU/Kernel-pmu.md +133 -0
  271. data/ext/xkcp/support/Kernel-PMU/Makefile +8 -0
  272. data/ext/xkcp/support/Kernel-PMU/enable_arm_pmu.c +129 -0
  273. data/ext/xkcp/support/Kernel-PMU/load-module +1 -0
  274. data/ext/xkcp/util/KeccakSum/KeccakSum.c +394 -0
  275. data/ext/xkcp/util/KeccakSum/base64.c +86 -0
  276. data/ext/xkcp/util/KeccakSum/base64.h +12 -0
  277. data/lib/sleeping_kangaroo12/binding.rb +15 -0
  278. data/lib/sleeping_kangaroo12/build/loader.rb +40 -0
  279. data/lib/sleeping_kangaroo12/build/platform.rb +37 -0
  280. data/lib/sleeping_kangaroo12/build.rb +4 -0
  281. data/lib/sleeping_kangaroo12/digest.rb +103 -0
  282. data/lib/sleeping_kangaroo12/version.rb +5 -0
  283. data/lib/sleeping_kangaroo12.rb +7 -0
  284. metadata +372 -0
@@ -0,0 +1,1587 @@
1
+ @
2
+ @ The eXtended Keccak Code Package (XKCP)
3
+ @ https://github.com/XKCP/XKCP
4
+ @
5
+ @ The Xoodoo permutation, designed by Joan Daemen, Seth Hoffert, Gilles Van Assche and Ronny Van Keer.
6
+ @
7
+ @ Implementation by Conno Boel, hereby denoted as "the implementer".
8
+ @
9
+ @ For more information, feedback or questions, please refer to the Keccak Team website:
10
+ @ https://keccak.team/
11
+ @
12
+ @ To the extent possible under law, the implementer has waived all copyright
13
+ @ and related or neighboring rights to the source code in this file.
14
+ @ http://creativecommons.org/publicdomain/zero/1.0/
15
+ @
16
+
17
+ @ WARNING: These functions work only on little endian CPU with ARMv7A + NEON architecture (Cortex-A8, ...).
18
+
19
+ .text
20
+
21
+ @ Xoodootimes4_InitializeAll: void * states -> void  (zero-fills 12 x 16 = 192 bytes starting at r0 = states)
22
+ .align 8
23
+ .global Xoodootimes4_InitializeAll
24
+ .type Xoodootimes4_InitializeAll, %function
25
+ Xoodootimes4_InitializeAll:
26
+ vmov.i32 q0, #0 @ q0 (d0-d1) = 0, the 16-byte pattern stored below
27
+ vstm r0!, {d0-d1} @ each vstm writes 16 zero bytes and advances r0 by 16
28
+ vstm r0!, {d0-d1}
29
+ vstm r0!, {d0-d1}
30
+
31
+ vstm r0!, {d0-d1}
32
+ vstm r0!, {d0-d1}
33
+ vstm r0!, {d0-d1}
34
+
35
+ vstm r0!, {d0-d1}
36
+ vstm r0!, {d0-d1}
37
+ vstm r0!, {d0-d1}
38
+
39
+ vstm r0!, {d0-d1}
40
+ vstm r0!, {d0-d1}
41
+ vstm r0!, {d0-d1}
42
+ bx lr @ return; r0 now points 192 bytes past the start
43
+
44
+
45
+ @ Xoodootimes4_AddByte: void * states -> uint instanceIndex -> const uchar byte -> uint offset -> void  (XORs one byte into the state; r0=states, r1=instanceIndex, r2=byte, r3=offset)
46
+ .align 8
47
+ .global Xoodootimes4_AddByte
48
+ .type Xoodootimes4_AddByte, %function
49
+ Xoodootimes4_AddByte:
50
+ add r1, r1, r3 @ r1 = instanceIndex + offset
51
+ and r3, r3, #3 @ r3 = offset mod 4 (byte position inside a 32-bit word)
52
+ sub r1, r1, r3 @ r1 = instanceIndex + (offset rounded down to a multiple of 4)
53
+ add r0, r0, r1, lsl #2 @ states+(WORD instanceIndex)
54
+ add r0, r0, r3 @ r0 = address of the target byte
55
+ ldrb r3, [r0] @ read the current state byte
56
+ eor r3, r3, r2 @ XOR in the caller's byte
57
+ strb r3, [r0] @ write it back
58
+ bx lr
59
+
60
+ @ Xoodootimes4_AddBytes: void * states -> uint instanceIndex -> const uchar * data -> uint offset -> uint length -> void  (XORs length bytes of data into the state; length is read from the stack)
61
+ .align 8
62
+ .global Xoodootimes4_AddBytes
63
+ .type Xoodootimes4_AddBytes, %function
64
+ Xoodootimes4_AddBytes:
65
+ add r1, r1, r3 @ r1 = instanceIndex + offset
66
+ and r3, r3, #3 @ r3 = offset mod 4
67
+ sub r1, r1, r3 @ r1 = instanceIndex + (offset rounded down to a multiple of 4)
68
+ add r0, r0, r1, lsl #2 @ states+(WORD instanceIndex)
69
+ add r0, r0, r3 @ r0 = address of the first state byte to touch
70
+
71
+ ldr r1, [sp] @ r1 = length (fifth argument, passed on the stack)
72
+ subs r1, r1, #1 @ r1 = length - 1; carry is clear only when length was 0
73
+ bxcc lr @ nothing to do for length == 0
74
+
75
+ @ r0 start
76
+ @ r1 length - 1 (the original length is > 0 here)
77
+ @ r2 data
78
+ @ r3 byte offset {0,1,2,3}
79
+
80
+ push {r4, r5, lr} @ r4/r5 are used as scratch in the loop
81
+ Xt4_AddBytes_Loop:
82
+ ldrb r4, [r0] @ state byte
83
+ ldrb r5, [r2], #1 @ next input byte, advance data pointer
84
+ eor r4, r4, r5
85
+ strb r4, [r0], #1 @ store the XOR and advance the state pointer
86
+ and r5, r0, #3 @ did the state pointer just reach a 4-byte boundary?
87
+ cmp r5, #0
88
+ addeq r0, r0, #12 @ Skip state
89
+ subs r1, r1, #1 @ one byte done; carry clears once the count drops below 0
90
+ bcs Xt4_AddBytes_Loop
91
+ pop {r4, r5, pc} @ restore scratch registers and return
92
+
93
+ @ Xoodootimes4_AddLanesAll: void * states -> const uchar * data -> uint laneCount -> uint laneOffset -> void  (XORs input lanes into the state; r0=states, r1=data, r2=laneCount, r3=laneOffset)
94
+ .align 8
95
+ .global Xoodootimes4_AddLanesAll
96
+ .type Xoodootimes4_AddLanesAll, %function
97
+ Xoodootimes4_AddLanesAll:
98
+ cmp r2, r3 @ fast path needs laneCount == laneOffset ...
99
+ cmpeq r2, #12 @ ... == 12 ...
100
+ tsteq r1, #3 @ ... and data 4-byte aligned
101
+ moveq r3, lr @ fast path returns through r3
102
+ beq Xt4_AddLanesAll_Full
103
+ @ Generic path: four input streams spaced laneOffset words apart, one 16-byte state row per iteration.
104
+ push {r4-r7,lr}
105
+
106
+ add r4, r1, r3, lsl #2 @ r4 = data + 4*laneOffset   (second stream)
107
+ add r5, r4, r3, lsl #2 @ r5 = data + 8*laneOffset   (third stream)
108
+ add r6, r5, r3, lsl #2 @ r6 = data + 12*laneOffset  (fourth stream)
109
+
110
+ subs r2, r2, #1 @ r2 = laneCount - 1; carry is clear only when laneCount was 0
111
+ popcc {r4-r7,pc} @ nothing to do for laneCount == 0
112
+
113
+ and r3, r1, #3 @ is data 4-byte aligned?
114
+ cmp r3, #0
115
+
116
+ bne Xt4_AddLanesAll_Unaligned_Loop
117
+ Xt4_AddLanesAll_Aligned_Loop:
118
+ vldm r0, {d0, d1} @ q0 = current 16-byte state row
119
+ ldr r3, [r1], #4 @ one word from each stream goes into s4..s7 (= q1)
120
+ vmov s4, r3
121
+ ldr r3, [r4], #4
122
+ vmov s5, r3
123
+ ldr r3, [r5], #4
124
+ vmov s6, r3
125
+ ldr r3, [r6], #4
126
+ vmov s7, r3
127
+ veor q0, q0, q1 @ state row ^= input words
128
+ vstm r0!, {d0, d1} @ write back and advance to the next row
129
+ subs r2, r2, #1
130
+ bcs Xt4_AddLanesAll_Aligned_Loop
131
+ pop {r4-r7,pc}
132
+ Xt4_AddLanesAll_Unaligned_Loop:
133
+ vldm r0, {d0, d1} @ q0 = current 16-byte state row
134
+ @ Each input word is assembled from four byte loads, lowest byte first; r7 is scratch.
135
+ ldrb r3, [r1], #1
136
+ ldrb r7, [r1], #1
137
+ eor r3, r3, r7, lsl #8
138
+ ldrb r7, [r1], #1
139
+ eor r3, r3, r7, lsl #16
140
+ ldrb r7, [r1], #1
141
+ eor r3, r3, r7, lsl #24
142
+ vmov s4, r3
143
+
144
+ ldrb r3, [r4], #1
145
+ ldrb r7, [r4], #1
146
+ eor r3, r3, r7, lsl #8
147
+ ldrb r7, [r4], #1
148
+ eor r3, r3, r7, lsl #16
149
+ ldrb r7, [r4], #1
150
+ eor r3, r3, r7, lsl #24
151
+ vmov s5, r3
152
+
153
+ ldrb r3, [r5], #1
154
+ ldrb r7, [r5], #1
155
+ eor r3, r3, r7, lsl #8
156
+ ldrb r7, [r5], #1
157
+ eor r3, r3, r7, lsl #16
158
+ ldrb r7, [r5], #1
159
+ eor r3, r3, r7, lsl #24
160
+ vmov s6, r3
161
+
162
+ ldrb r3, [r6], #1
163
+ ldrb r7, [r6], #1
164
+ eor r3, r3, r7, lsl #8
165
+ ldrb r7, [r6], #1
166
+ eor r3, r3, r7, lsl #16
167
+ ldrb r7, [r6], #1
168
+ eor r3, r3, r7, lsl #24
169
+ vmov s7, r3
170
+
171
+ veor q0, q0, q1 @ state row ^= input words
172
+ vstm r0!, {d0, d1}
173
+ subs r2, r2, #1
174
+ bcs Xt4_AddLanesAll_Unaligned_Loop
175
+ pop {r4-r7,pc}
176
+ Xt4_AddLanesAll_Full: @ fast path: 192 contiguous input bytes, return address held in r3
177
+ vpush {d8-d15} @ q4-q7 are callee-saved under AAPCS and are used as scratch below
178
+ vldm r1!, {d0-d15} @ first 128 input bytes
179
+ vuzp.32 q0, q6 @ vuzp/vtrn regroup the loaded words before the XOR
180
+ vldm r1, {d16-d23} @ remaining 64 input bytes
181
+ vuzp.32 q3, q9
182
+ vtrn.32 q0, q3
183
+ vtrn.32 q6, q9
184
+ vuzp.32 q1, q7
185
+ vuzp.32 q4, q10
186
+ vtrn.32 q1, q4
187
+ vtrn.32 q7, q10
188
+
189
+ vuzp.32 q2, q8
190
+ vuzp.32 q5, q11
191
+ vtrn.32 q2, q5
192
+ vtrn.32 q8, q11
193
+ vldm r0, {d24-d31} @ state bytes 0..63
194
+ veor q12, q0, q12
195
+ veor q13, q6, q13
196
+ veor q14, q3, q14
197
+ veor q15, q9, q15
198
+ vstm r0!, {d24-d31}
199
+ vldm r0, {d24-d31} @ state bytes 64..127
200
+ veor q12, q1, q12
201
+ veor q13, q7, q13
202
+ veor q14, q4, q14
203
+ veor q15, q10, q15
204
+ vstm r0!, {d24-d31}
205
+ vldm r0, {d24-d31} @ state bytes 128..191
206
+ veor q12, q2, q12
207
+ veor q13, q8, q13
208
+ veor q14, q5, q14
209
+ veor q15, q11, q15
210
+ vstm r0, {d24-d31}
211
+ vpop {d8-d15} @ restore the caller's q4-q7
212
+ mov pc, r3 @ return (lr was copied to r3 at entry)
213
+
214
+ @ Xoodootimes4_OverwriteBytes: void * states -> uint instanceIndex -> const uchar * data -> uint offset -> uint length -> void  (copies length bytes of data over the state; length is read from the stack)
215
+ .align 8
216
+ .global Xoodootimes4_OverwriteBytes
217
+ .type Xoodootimes4_OverwriteBytes, %function
218
+ Xoodootimes4_OverwriteBytes:
219
+ push {r4, lr}
220
+ ldr r4, [sp, #8] @ r4 = length (fifth argument; +8 skips the two registers just pushed)
221
+ cmp r4, #48 @ fast path needs length == 48 ...
222
+ tsteq r2, #3 @ ... and data 4-byte aligned
223
+ beq Xt4_OverwriteBytes_Full
224
+ @ Generic byte-by-byte path.
225
+ add r1, r1, r3 @ r1 = instanceIndex + offset
226
+ and r3, r3, #3 @ r3 = offset mod 4
227
+ sub r1, r1, r3 @ r1 = instanceIndex + (offset rounded down to a multiple of 4)
228
+ add r0, r0, r1, lsl #2 @ states+(WORD instanceIndex)
229
+ add r0, r0, r3 @ r0 = address of the first state byte to overwrite
230
+
231
+ subs r1, r4, #1 @ r1 = length - 1; carry is clear only when length was 0
232
+ popcc {r4, pc} @ nothing to do for length == 0
233
+
234
+ @ r0 start
235
+ @ r1 length - 1 (the original length is > 0 here)
236
+ @ r2 data
237
+ @ r3 byte offset {0,1,2,3}
238
+
239
+ Xt4_OverwriteBytes_Loop:
240
+ ldrb r4, [r2], #1 @ next input byte
241
+ strb r4, [r0], #1 @ overwrite the state byte and advance
242
+ and r4, r0, #3 @ did the state pointer just reach a 4-byte boundary?
243
+ cmp r4, #0
244
+ addeq r0, r0, #12 @ Skip state
245
+ subs r1, r1, #1
246
+ bcs Xt4_OverwriteBytes_Loop
247
+ pop {r4, pc}
248
+ Xt4_OverwriteBytes_Full: @ NOTE(review): this path ignores offset (r3); presumably offset is 0 whenever length is 48 - confirm with callers
249
+ add r0, r0, r1, lsl #2 @ r0 = states + 4*instanceIndex
250
+ ldmia r2!, {r1, r3, r4, r14} @ four input words at a time; lr (r14) is scratch since the return address is on the stack
251
+ str r1, [r0], #16 @ each word is stored 16 bytes after the previous one
252
+ str r3, [r0], #16
253
+ str r4, [r0], #16
254
+ str r14, [r0], #16
255
+ ldmia r2!, {r1, r3, r4, r14}
256
+ str r1, [r0], #16
257
+ str r3, [r0], #16
258
+ str r4, [r0], #16
259
+ str r14, [r0], #16
260
+ ldmia r2, {r1, r3, r4, r14}
261
+ str r1, [r0], #16
262
+ str r3, [r0], #16
263
+ str r4, [r0], #16
264
+ str r14, [r0], #16
265
+ pop {r4, pc} @ restore r4 and return via the saved lr
266
+
267
+ @ Xoodootimes4_OverwriteLanesAll: void * states -> uchar * data -> uint lanecount -> uint laneOffset -> void  (overwrites state rows with input lanes; r0=states, r1=data, r2=lanecount, r3=laneOffset)
268
+ .align 8
269
+ .global Xoodootimes4_OverwriteLanesAll
270
+ .type Xoodootimes4_OverwriteLanesAll, %function
271
+ Xoodootimes4_OverwriteLanesAll:
272
+ push {r4-r6,lr}
273
+
274
+ add r4, r1, r3, lsl #2 @ r4 = data + 4*laneOffset   (second stream)
275
+ add r5, r4, r3, lsl #2 @ r5 = data + 8*laneOffset   (third stream)
276
+ add r6, r5, r3, lsl #2 @ r6 = data + 12*laneOffset  (fourth stream)
277
+
278
+ subs r2, r2, #1 @ r2 = lanecount - 1; carry is clear only when lanecount was 0
279
+ popcc {r4-r6,pc} @ nothing to do for lanecount == 0
280
+
281
+ and r3, r1, #3 @ is data 4-byte aligned?
282
+ cmp r3, #0
283
+ bne Xt4_OverwriteLanesAll_Unaligned_Loop
284
+
285
+ Xt4_OverwriteLanesAll_Aligned_Loop:
286
+ ldr r3, [r1], #4 @ one word from each stream goes into s0..s3 (= q0)
287
+ vmov s0, r3
288
+ ldr r3, [r4], #4
289
+ vmov s1, r3
290
+ ldr r3, [r5], #4
291
+ vmov s2, r3
292
+ ldr r3, [r6], #4
293
+ vmov s3, r3
294
+ vstm r0!, {d0-d1} @ store the 16-byte row and advance to the next one
295
+ subs r2, r2, #1
296
+ bcs Xt4_OverwriteLanesAll_Aligned_Loop
297
+ pop {r4-r6,pc}
298
+ Xt4_OverwriteLanesAll_Unaligned_Loop:
299
+ push {r7} @ r7 is scratch; it is saved and restored on every iteration
300
+ @ Each input word is assembled from four byte loads, lowest byte first.
301
+ ldrb r3, [r1], #1
302
+ ldrb r7, [r1], #1
303
+ eor r3, r3, r7, lsl #8
304
+ ldrb r7, [r1], #1
305
+ eor r3, r3, r7, lsl #16
306
+ ldrb r7, [r1], #1
307
+ eor r3, r3, r7, lsl #24
308
+ vmov s0, r3
309
+
310
+ ldrb r3, [r4], #1
311
+ ldrb r7, [r4], #1
312
+ eor r3, r3, r7, lsl #8
313
+ ldrb r7, [r4], #1
314
+ eor r3, r3, r7, lsl #16
315
+ ldrb r7, [r4], #1
316
+ eor r3, r3, r7, lsl #24
317
+ vmov s1, r3
318
+
319
+ ldrb r3, [r5], #1
320
+ ldrb r7, [r5], #1
321
+ eor r3, r3, r7, lsl #8
322
+ ldrb r7, [r5], #1
323
+ eor r3, r3, r7, lsl #16
324
+ ldrb r7, [r5], #1
325
+ eor r3, r3, r7, lsl #24
326
+ vmov s2, r3
327
+
328
+ ldrb r3, [r6], #1
329
+ ldrb r7, [r6], #1
330
+ eor r3, r3, r7, lsl #8
331
+ ldrb r7, [r6], #1
332
+ eor r3, r3, r7, lsl #16
333
+ ldrb r7, [r6], #1
334
+ eor r3, r3, r7, lsl #24
335
+ vmov s3, r3
336
+
337
+ vstm r0!, {d0-d1} @ store the 16-byte row and advance to the next one
338
+ pop {r7}
339
+ subs r2, r2, #1
340
+ bcs Xt4_OverwriteLanesAll_Unaligned_Loop
341
+ pop {r4-r6,pc}
342
+
343
+
344
+ @ Xoodootimes4_OverwriteWithZeroes: void * states -> uint instanceIndex -> uint byteCount -> void  (zeroes the first byteCount bytes of one instance; r0=states, r1=instanceIndex, r2=byteCount)
345
+ .align 8
346
+ .global Xoodootimes4_OverwriteWithZeroes
347
+ .type Xoodootimes4_OverwriteWithZeroes, %function
348
+ Xoodootimes4_OverwriteWithZeroes:
349
+ add r0, r0, r1, lsl #2 @ states + 4*instance = state start
350
+ mov r1, #0 @ r1 = the zero value to store
351
+ mov r3, lr @ the return address is kept in r3; every exit is "mov pc, r3"
352
+ Xt4_OverwriteWithZeroes_Aligned:
353
+ subs r2, r2, #4 @ try to consume a whole 4-byte word
354
+ strcs r1, [r0], #16 @ no borrow: zero one word and step 16 bytes ahead
355
+ bhi Xt4_OverwriteWithZeroes_Aligned @ loop while bytes remain after this word
356
+ moveq pc, r3 @ remaining count is exactly 0: done
357
+ add r2, r2, #4 @ undo the last subtraction; r2 = leftover byte count (0..3)
358
+ Xt4_OverwriteWithZeroes_Leftovers:
359
+ subs r2, r2, #1
360
+ movcc pc, r3 @ no bytes left: return
361
+ strb r1, [r0], #1 @ zero one byte (strb leaves the flags from subs intact)
362
+ bhi Xt4_OverwriteWithZeroes_Leftovers @ loop while bytes remain
363
+ mov pc, r3
364
+
365
+
366
+ @ Xoodootimes4_ExtractBytes: void * states -> uint instanceIndex -> uchar * data -> uint offset -> uint length -> void  (copies length state bytes into data, which is written; length is read from the stack)
367
+ .align 8
368
+ .global Xoodootimes4_ExtractBytes
369
+ .type Xoodootimes4_ExtractBytes, %function
370
+ Xoodootimes4_ExtractBytes:
371
+ add r1, r1, r3 @ r1 = instanceIndex + offset
372
+ and r3, r3, #3 @ r3 = offset mod 4
373
+ sub r1, r1, r3 @ r1 = instanceIndex + (offset rounded down to a multiple of 4)
374
+ add r0, r0, r1, lsl #2 @ states+(WORD instanceIndex)
375
+ add r0, r0, r3 @ r0 = address of the first state byte to read
376
+
377
+ ldr r1, [sp] @ r1 = length (fifth argument, passed on the stack)
378
+ subs r1, r1, #1 @ r1 = length - 1; carry is clear only when length was 0
379
+ bxcc lr @ nothing to do for length == 0
380
+
381
+ push {r4, lr} @ r4 is used as scratch in the loop
382
+ Xt4_ExtractBytes_Loop:
383
+ ldrb r4, [r0], #1 @ read a state byte and advance
384
+ strb r4, [r2], #1 @ write it to the output buffer and advance
385
+ and r4, r0, #3 @ did the state pointer just reach a 4-byte boundary?
386
+ cmp r4, #0
387
+ addeq r0, r0, #12 @ Skip state
388
+ subs r1, r1, #1
389
+ bcs Xt4_ExtractBytes_Loop
390
+ pop {r4, pc}
391
+
392
+ @ Xoodootimes4_ExtractLanesAll: void * states -> uchar * data -> uint lanecount -> uint laneoffset -> void  (copies state rows out to four output streams; r0=states, r1=data, r2=lanecount, r3=laneoffset)
393
+ .align 8
394
+ .global Xoodootimes4_ExtractLanesAll
395
+ .type Xoodootimes4_ExtractLanesAll, %function
396
+ Xoodootimes4_ExtractLanesAll:
397
+ push {r4-r6,lr}
398
+
399
+ add r4, r1, r3, lsl #2 @ r4 = data + 4*laneoffset   (second stream)
400
+ add r5, r4, r3, lsl #2 @ r5 = data + 8*laneoffset   (third stream)
401
+ add r6, r5, r3, lsl #2 @ r6 = data + 12*laneoffset  (fourth stream)
402
+
403
+ subs r2, r2, #1 @ r2 = lanecount - 1; carry is clear only when lanecount was 0
404
+ popcc {r4-r6,pc} @ nothing to do for lanecount == 0
405
+
406
+ and r3, r1, #3 @ is data 4-byte aligned?
407
+ cmp r3, #0
408
+ bne Xt4_ExtractLanesAll_Unaligned_Loop
409
+ Xt4_ExtractLanesAll_Aligned_Loop:
410
+ vldm r0!, {d0-d1} @ q0 = next 16-byte state row; advance r0
411
+ vmov r3, s0 @ word 0 of the row goes to the first stream
412
+ str r3, [r1], #4
413
+ vmov r3, s1 @ word 1 goes to the second stream
414
+ str r3, [r4], #4
415
+ vmov r3, s2 @ word 2 goes to the third stream
416
+ str r3, [r5], #4
417
+ vmov r3, s3 @ word 3 goes to the fourth stream
418
+ str r3, [r6], #4
419
+ subs r2, r2, #1
420
+ bcs Xt4_ExtractLanesAll_Aligned_Loop
421
+ pop {r4-r6,pc}
422
+ Xt4_ExtractLanesAll_Unaligned_Loop:
423
+ push {r7} @ r7 is saved/restored each iteration although this loop does not reference it
424
+ vldm r0!, {d0-d1} @ q0 = next 16-byte state row; advance r0
425
+ vmov r3, s0 @ each word is written out one byte at a time, lowest byte first
426
+ strb r3, [r1], #1
427
+ lsr r3, r3, #8
428
+ strb r3, [r1], #1
429
+ lsr r3, r3, #8
430
+ strb r3, [r1], #1
431
+ lsr r3, r3, #8
432
+ strb r3, [r1], #1
433
+
434
+ vmov r3, s1
435
+ strb r3, [r4], #1
436
+ lsr r3, r3, #8
437
+ strb r3, [r4], #1
438
+ lsr r3, r3, #8
439
+ strb r3, [r4], #1
440
+ lsr r3, r3, #8
441
+ strb r3, [r4], #1
442
+
443
+ vmov r3, s2
444
+ strb r3, [r5], #1
445
+ lsr r3, r3, #8
446
+ strb r3, [r5], #1
447
+ lsr r3, r3, #8
448
+ strb r3, [r5], #1
449
+ lsr r3, r3, #8
450
+ strb r3, [r5], #1
451
+
452
+ vmov r3, s3
453
+ strb r3, [r6], #1
454
+ lsr r3, r3, #8
455
+ strb r3, [r6], #1
456
+ lsr r3, r3, #8
457
+ strb r3, [r6], #1
458
+ lsr r3, r3, #8
459
+ strb r3, [r6], #1
460
+
461
+ pop {r7}
462
+ subs r2, r2, #1
463
+ bcs Xt4_ExtractLanesAll_Unaligned_Loop
464
+ pop {r4-r6,pc}
465
+
466
@ Xoodootimes4_ExtractAndAddBytes: void * states -> uint instanceIndex -> uchar * input -> uchar * output -> uint offset -> uint length -> void
@
@ output[i] = input[i] ^ (byte offset + i of the selected instance),
@ for i in 0 .. length-1.
@   In:      r0 = states, r1 = instanceIndex, r2 = input, r3 = output,
@            [sp] = offset, [sp, #4] = length
@   Scratch: r0-r3, flags (r4, r5 and lr are saved and restored)
@ When length == 48 and both buffers are word aligned, a word-wise path
@ handles all 12 lanes. That path does not look at offset (presumably
@ offset is always 0 when the length is 48 - confirm against the caller).
    .align  8
    .global Xoodootimes4_ExtractAndAddBytes
    .type   Xoodootimes4_ExtractAndAddBytes, %function
Xoodootimes4_ExtractAndAddBytes:
    push    {r4, r5, lr}
    ldr     r4, [sp, #12]               @ r4 = offset
    ldr     r5, [sp, #16]               @ r5 = length
    cmp     r5, #48
    tsteq   r2, #3                      @ only evaluated when length == 48
    tsteq   r3, #3                      @ only evaluated when input is aligned
    beq     Xt4_ExtractAndAddBytes_Full

    add     r1, r1, r4
    and     r4, r4, #3                  @ r4 = byte position inside the word
    sub     r1, r1, r4                  @ r1 = instanceIndex + (offset & ~3)
    add     r0, r0, r1, lsl #2
    add     r0, r0, r4                  @ r0 = address of the first state byte

    subs    r1, r5, #1                  @ r1 = bytes remaining after the next one
    popcc   {r4, r5, pc}                @ length == 0

Xt4_ExtractAndAddBytes_Loop:
    ldrb    r4, [r0], #1                @ state byte
    ldrb    r5, [r2], #1                @ input byte
    eor     r4, r4, r5
    strb    r4, [r3], #1
    tst     r0, #3                      @ crossed into the next 4-byte slot?
    addeq   r0, r0, #12                 @ yes: skip the other three instances
    subs    r1, r1, #1
    bcs     Xt4_ExtractAndAddBytes_Loop
    pop     {r4, r5, pc}

Xt4_ExtractAndAddBytes_Full:
    add     r0, r0, r1, lsl #2          @ r0 = lane 0 of this instance
    @ Lanes 0-8: three input words per group, state lanes are 16 bytes apart.
    .rept   3
    ldmia   r2!, {r1, r4, r5}
    ldr     lr, [r0], #16
    eor     r1, r1, lr
    ldr     lr, [r0], #16
    eor     r4, r4, lr
    ldr     lr, [r0], #16
    eor     r5, r5, lr
    stmia   r3!, {r1, r4, r5}
    .endr
    @ Lanes 9-11: same, without writing the buffer pointers back.
    ldmia   r2, {r1, r4, r5}
    ldr     lr, [r0], #16
    eor     r1, r1, lr
    ldr     lr, [r0], #16
    eor     r4, r4, lr
    ldr     lr, [r0], #16
    eor     r5, r5, lr
    stmia   r3, {r1, r4, r5}
    pop     {r4, r5, pc}
534
+
535
+
536
@ Xoodootimes4_ExtractAndAddLanesAll: void * states -> uchar * input -> uchar * output -> uint laneCount -> uint laneOffset
@
@ For each of the first laneCount lanes, XORs the lane of all four
@ interleaved instances with one 32-bit input word per instance and writes
@ the results to output. Instance i uses input + 4 * i * laneOffset and
@ output + 4 * i * laneOffset. All buffer accesses are byte-wise.
@   In:      r0 = states, r1 = input, r2 = output, r3 = laneCount,
@            [sp] = laneOffset
@   Scratch: r0-r3, r12, q0, q1, flags (r4-r11 are saved and restored)

    @ Assemble four bytes at [\src] (least significant first) into NEON
    @ scalar \lane. \src advances by 4; r10 and r11 are scratch.
    .macro  Xt4_ExtractAndAddLanesAll_Load src, lane
    ldrb    r11, [\src], #1
    ldrb    r10, [\src], #1
    eor     r11, r11, r10, lsl #8
    ldrb    r10, [\src], #1
    eor     r11, r11, r10, lsl #16
    ldrb    r10, [\src], #1
    eor     r11, r11, r10, lsl #24
    vmov    \lane, r11
    .endm

    @ Write NEON scalar \lane to [\dst] byte by byte, least significant
    @ first. \dst advances by 4; r10 is scratch.
    .macro  Xt4_ExtractAndAddLanesAll_Store lane, dst
    vmov    r10, \lane
    strb    r10, [\dst], #1
    lsr     r10, r10, #8
    strb    r10, [\dst], #1
    lsr     r10, r10, #8
    strb    r10, [\dst], #1
    lsr     r10, r10, #8
    strb    r10, [\dst], #1
    .endm

    .align  8
    .global Xoodootimes4_ExtractAndAddLanesAll
    .type   Xoodootimes4_ExtractAndAddLanesAll, %function
Xoodootimes4_ExtractAndAddLanesAll:
    subs    r3, r3, #1                  @ r3 = lanes remaining after the next one
    bxcc    lr                          @ laneCount == 0

    push    {r4-r11,lr}                 @ 9 registers = 36 bytes
    ldr     r12, [sp, #36]              @ r12 = laneOffset (first stack argument)

    add     r4, r1, r12, lsl #2         @ r4 = input of instance 1
    add     r5, r4, r12, lsl #2         @ r5 = input of instance 2
    add     r6, r5, r12, lsl #2         @ r6 = input of instance 3

    add     r7, r2, r12, lsl #2         @ r7 = output of instance 1
    add     r8, r7, r12, lsl #2         @ r8 = output of instance 2
    add     r9, r8, r12, lsl #2         @ r9 = output of instance 3

Xt4_ExtractAndAddLanesAll_Unaligned_Loop:
    vldm    r0!, {d2-d3}                @ q1 = one lane of all four instances

    Xt4_ExtractAndAddLanesAll_Load r1, s0
    Xt4_ExtractAndAddLanesAll_Load r4, s1
    Xt4_ExtractAndAddLanesAll_Load r5, s2
    Xt4_ExtractAndAddLanesAll_Load r6, s3
    veor    q0, q0, q1                  @ input ^ state

    Xt4_ExtractAndAddLanesAll_Store s0, r2
    Xt4_ExtractAndAddLanesAll_Store s1, r7
    Xt4_ExtractAndAddLanesAll_Store s2, r8
    Xt4_ExtractAndAddLanesAll_Store s3, r9

    subs    r3, r3, #1
    bcs     Xt4_ExtractAndAddLanesAll_Unaligned_Loop
    pop     {r4-r11,pc}
634
+
635
+ @ q0: a00 -> q1: a01 -> q2: a02 -> q3: a03 ->
636
+ @ q4: a10 -> q5: a11 -> q6: a12 -> q7: a13 ->
637
+ @ q8: a20 -> q9: a21 -> q10: a22 -> q11: a23
638
+ @ theta: for each column c (registers qc, q(c+4), q(c+8)) take P = XOR of the three registers of the previous column (column 3 for column 0), compute E = ror(P, 27) ^ ror(P, 18) on every 32-bit word, and XOR E into all three registers of column c. Every P is formed from the values the registers had on entry. Clobbers r1-r4, q14 and q15.
639
+ .macro theta
640
+ veor q15, q3, q7
641
+ veor q15, q15, q11 @ q15 = P of column 3
642
+ @ E for column 0: the four words of q15 are rotated in core registers; meanwhile q14 = P of column 0 is formed on the NEON side.
643
+ vmov r3, r4, d30
644
+ vmov r1, r2, d31
645
+ ror r3, r3, #27
646
+ veor q14, q0, q4
647
+ ror r4, r4, #27
648
+ veor q14, q14, q8 @ q14 = P of column 0 (q0 is not yet modified)
649
+ ror r1, r1, #27
650
+ ror r2, r2, #27
651
+ eor r3, r3, r3, ror #23 @ ror(P,27) ^ ror(P,27+23) = ror(P,27) ^ ror(P,18)
652
+ eor r4, r4, r4, ror #23
653
+ eor r1, r1, r1, ror #23
654
+ vmov d30, r3, r4
655
+ eor r2, r2, r2, ror #23
656
+ vmov d31, r1, r2 @ q15 = E for column 0
657
+ @ E for column 1 is computed from q14 while column 0 is updated and q15 is reloaded with P of column 1.
658
+ vmov r3, r4, d28
659
+ vmov r1, r2, d29
660
+ ror r3, r3, #27
661
+ veor q0, q0, q15
662
+ ror r4, r4, #27
663
+ veor q4, q4, q15
664
+ ror r1, r1, #27
665
+ veor q8, q8, q15
666
+ ror r2, r2, #27
667
+ veor q15, q1, q5
668
+ eor r3, r3, r3, ror #23
669
+ veor q15, q15, q9 @ q15 = P of column 1 (q1 is not yet modified)
670
+ eor r4, r4, r4, ror #23
671
+ eor r1, r1, r1, ror #23
672
+ vmov d28, r3, r4
673
+ eor r2, r2, r2, ror #23
674
+ vmov d29, r1, r2 @ q14 = E for column 1
675
+ @ E for column 2 is computed from q15 while column 1 is updated and q14 is reloaded with P of column 2.
676
+ vmov r3, r4, d30
677
+ vmov r1, r2, d31
678
+ ror r3, r3, #27
679
+ veor q1, q1, q14
680
+ ror r4, r4, #27
681
+ veor q5, q5, q14
682
+ ror r1, r1, #27
683
+ veor q9, q9, q14
684
+ ror r2, r2, #27
685
+ veor q14, q2, q6
686
+ eor r3, r3, r3, ror #23
687
+ veor q14, q14, q10 @ q14 = P of column 2 (q2 is not yet modified)
688
+ eor r4, r4, r4, ror #23
689
+ eor r1, r1, r1, ror #23
690
+ vmov d30, r3, r4
691
+ eor r2, r2, r2, ror #23
692
+ vmov d31, r1, r2 @ q15 = E for column 2
693
+ @ E for column 3 is computed from q14 while column 2 is updated; column 3 is updated last.
694
+ vmov r3, r4, d28
695
+ vmov r1, r2, d29
696
+ ror r3, r3, #27
697
+ veor q2, q2, q15
698
+ ror r4, r4, #27
699
+ veor q6, q6, q15
700
+ ror r1, r1, #27
701
+ veor q10, q10, q15
702
+ ror r2, r2, #27
703
+ eor r3, r3, r3, ror #23
704
+ eor r4, r4, r4, ror #23
705
+ eor r1, r1, r1, ror #23
706
+ vmov d28, r3, r4
707
+ eor r2, r2, r2, ror #23
708
+ vmov d29, r1, r2 @ q14 = E for column 3
709
+ veor q3, q3, q14
710
+ veor q7, q7, q14
711
+ veor q11, q11, q14
712
+ .endm
713
+ @ rho_w: rotates every 32-bit word of q8, q9, q10, q11 left by 11 bits and writes the results to q12, q13, q14, q15; q8-q11 themselves are not modified. Clobbers r1-r4.
714
+ .macro rho_w
715
+ @ vshl.U32 q12, q8, #11
716
+ @ vsri.U32 q12, q8, #21
717
+ vmov r1, r2, d16 @ q8 is rotated in core registers instead of with the vshl/vsri pair commented out above
718
+ vshl.U32 q13, q9, #11
719
+ vmov r3, r4, d17
720
+ vsri.U32 q13, q9, #21 @ q13 = q9 rotated left by 11 (shift left 11, insert the top 11 bits)
721
+ ror r1, r1, #21 @ rotate right by 21 == rotate left by 11
722
+ vshl.U32 q14, q10, #11
723
+ ror r2, r2, #21
724
+ vsri.U32 q14, q10, #21
725
+ ror r3, r3, #21
726
+ vshl.U32 q15, q11, #11
727
+ ror r4, r4, #21
728
+ vsri.U32 q15, q11, #21
729
+ vmov d24, r1, r2
730
+ vmov d25, r3, r4 @ q12 = q8 rotated left by 11
731
+ @ NOTE: Here we are hiding in the shadows. What happens is that the ROR action is interleaved with the vector actions so that they get executed for free instead of a NOP .
732
+ .endm
733
+ @ chi: XORs the constant in r3 into every word of q0, then for each triple (a, b, c) = (q0,q7,q12), (q1,q4,q13), (q2,q5,q14), (q3,q6,q15) computes a' = a ^ (c & ~b), b' = b ^ (a & ~c), c' = c ^ (b & ~a). On exit a' is in q0-q3, b' is in q12-q15 and c' is in q8, q9, q10, q4. The b inputs are q4-q7 taken one column late (q7 pairs with q0). Overwrites q4-q15.
734
+ .macro chi
735
+ @ NOTE: Iota
736
+ vdup.32 q8, r3 @ broadcast the value the caller placed in r3
737
+ veor q0, q0, q8
738
+ @ Triple (q0, q7, q12): c' -> q8, b' -> q12, a' -> q0
739
+ @ NOTE: Probably this is optimal. (Prove?)
740
+ vbic q11, q12, q7 @ q11 = q12 & ~q7
741
+ vbic q9, q0, q12 @ q9 = q0 & ~q12
742
+ vbic q10, q7, q0 @ q10 = q7 & ~q0
743
+ veor q8, q10, q12
744
+ veor q12, q7, q9
745
+ veor q0, q0, q11
746
+ @ Triple (q1, q4, q13): c' -> q9, b' -> q13, a' -> q1
747
+ vbic q7, q13, q4
748
+ vbic q10, q1, q13
749
+ vbic q11, q4, q1
750
+ veor q9, q11, q13
751
+ veor q13, q4, q10
752
+ veor q1, q1, q7
753
+ @ Triple (q2, q5, q14): c' -> q10, b' -> q14, a' -> q2
754
+ vbic q4, q14, q5
755
+ vbic q11, q2, q14
756
+ vbic q7, q5, q2
757
+ veor q10, q7, q14
758
+ veor q14, q5, q11
759
+ veor q2, q2, q4
760
+ @ Triple (q3, q6, q15): c' -> q4, b' -> q15, a' -> q3
761
+ vbic q5, q15, q6
762
+ vbic q7, q3, q15
763
+ vbic q4, q6, q3
764
+ veor q4, q4, q15
765
+ veor q15, q6, q7
766
+ veor q3, q3, q5
767
+ .endm
768
+ @ rho_e: takes the register layout left by chi. New q8, q9, q10, q11 = old q10, q4, q8, q9, each word rotated left by 8 bits; new q4, q5, q6, q7 = q12, q13, q14, q15, each word rotated left by 1 bit. q0-q3 are untouched. Clobbers r1-r4.
769
+ .macro rho_e
770
+ vshl.U32 q11, q9, #8
771
+ vsri.U32 q11, q9, #24 @ q11 = old q9 rotated left by 8
772
+
773
+ vshl.U32 q9, q4, #8
774
+ vsri.U32 q9, q4, #24 @ q9 = old q4 rotated left by 8
775
+ @ q8 and q10 swap places: old q8 goes through core registers so that q8 can be overwritten before q10 is.
776
+ vmov r1, r2, d16
777
+ vmov r3, r4, d17
778
+ ror r1, r1, #24 @ rotate right by 24 == rotate left by 8
779
+ vshl.U32 q8, q10, #8
780
+ ror r2, r2, #24
781
+ vsri.U32 q8, q10, #24 @ q8 = old q10 rotated left by 8 (last read of old q10)
782
+ ror r3, r3, #24
783
+ vmov d20, r1, r2
784
+ ror r4, r4, #24
785
+ vmov d21, r3, r4 @ q10 = old q8 rotated left by 8
786
+
787
+ vshl.U32 q4, q12, #1
788
+ vsri.U32 q4, q12, #31 @ q4 = q12 rotated left by 1
789
+
790
+ vshl.U32 q5, q13, #1
791
+ vsri.U32 q5, q13, #31
792
+
793
+ vshl.U32 q6, q14, #1
794
+ vsri.U32 q6, q14, #31
795
+
796
+ vshl.U32 q7, q15, #1
797
+ vsri.U32 q7, q15, #31
798
+ .endm
799
+
800
+ @ NOTE: We considered partially merging rho_e into theta. However, P depends on the very registers it is later XORed into, so moving that work to the core registers saves no cycles: at no point could the barrel shifter be used there, and the barrel shifter is the only reason to prefer core registers over vector registers.
801
+
802
@ Xoodootimes4_PermuteAll_6rounds: void * argStates -> void
@
@ Loads the 192-byte interleaved state block at r0 into q0-q11, applies six
@ rounds (theta, rho_w, chi with the round constant in r3, rho_e) and
@ stores the result back in place.
@   In:      r0 = argStates
@   Scratch: r1-r3, q0-q3, q8-q15, flags (r4 and d8-d15 are saved/restored)
    .align  8
    .global Xoodootimes4_PermuteAll_6rounds
    .type   Xoodootimes4_PermuteAll_6rounds, %function
Xoodootimes4_PermuteAll_6rounds:
    vpush   {d8-d15}
    push    {r4}
    vldm    r0!, {d0-d15}               @ first 128 bytes -> q0-q7
    vldm    r0, {d16-d23}               @ last 64 bytes   -> q8-q11
    sub     r0, r0, #128                @ rewind to the start of the block

    @ One expansion per round; \rc is that round's constant.
    .irp    rc, 0x00000060, 0x0000002C, 0x00000380, 0x000000F0, 0x000001A0, 0x00000012
    theta
    rho_w
    mov     r3, #\rc
    chi
    rho_e
    .endr

    vstm    r0!, {d0-d15}
    vstm    r0, {d16-d23}
    pop     {r4}
    vpop    {d8-d15}
    bx      lr
854
+
855
@ Xoodootimes4_PermuteAll_12rounds: void * argStates -> void
@
@ Loads the 192-byte interleaved state block at r0 into q0-q11, applies
@ twelve rounds (theta, rho_w, chi with the round constant in r3, rho_e)
@ and stores the result back in place.
@   In:      r0 = argStates
@   Scratch: r1-r3, q0-q3, q8-q15, flags (r4, r5 and d8-d15 are
@            saved/restored)
    .align  8
    .global Xoodootimes4_PermuteAll_12rounds
    .type   Xoodootimes4_PermuteAll_12rounds, %function
Xoodootimes4_PermuteAll_12rounds:
    vpush   {d8-d15}
    push    {r4-r5}
    vldm    r0!, {d0-d15}               @ first 128 bytes -> q0-q7
    vldm    r0, {d16-d23}               @ last 64 bytes   -> q8-q11
    sub     r0, r0, #128                @ rewind to the start of the block

    @ One expansion per round; \rc is that round's constant.
    .irp    rc, 0x00000058, 0x00000038, 0x000003C0, 0x000000D0, 0x00000120, 0x00000014, 0x00000060, 0x0000002C, 0x00000380, 0x000000F0, 0x000001A0, 0x00000012
    theta
    rho_w
    mov     r3, #\rc
    chi
    rho_e
    .endr

    vstm    r0!, {d0-d15}
    vstm    r0, {d16-d23}
    pop     {r4-r5}
    vpop    {d8-d15}
    bx      lr
942
+
943
+ @
944
+ @ FASTLOOP SUPPORT
945
+ @
946
+
947
@ Xooffftimes4_AddIs: uchar * output -> uchar * input -> size_t bitLen -> void
@
@ output ^= input over bitLen bits. Whole bytes are XORed in place. If
@ 1..7 bits remain, the final output byte becomes
@ (output ^ input) & ((1 << bits) - 1), i.e. its upper bits are cleared.
@   In:      r0 = output, r1 = input, r2 = bitLen
@   Scratch: r0-r3, q0-q2, q8-q10, flags (r4-r12 are saved and restored)
@ The NEON paths are used only when both pointers are 4-byte aligned.
@ Otherwise processing starts at the word loop, which issues ldr/str on the
@ possibly unaligned pointers (presumably the target permits unaligned
@ word accesses - confirm).
    .align  8
    .global Xooffftimes4_AddIs
    .type   Xooffftimes4_AddIs, %function
Xooffftimes4_AddIs:
    push    {r4-r12,lr}
    orr     r3, r0, r1                  @ low bits set if either pointer is unaligned
    tst     r3, #3
    bne     Xft4_AddIs_32
Xft4_AddIs_384:                         @ 48 bytes per iteration
    cmp     r2, #384
    bcc     Xft4_AddIs_128
    vldm    r0, {d0-d5}
    vldm    r1!, {d16-d19}
    veor    q0, q0, q8
    vldm    r1!, {d20-d21}
    veor    q1, q1, q9
    veor    q2, q2, q10
    vstm    r0!, {d0-d5}
    subs    r2, r2, #384
    bne     Xft4_AddIs_384
    b       Xft4_AddIs_0
Xft4_AddIs_128:                         @ 16 bytes per iteration
    cmp     r2, #128
    bcc     Xft4_AddIs_32
    vldm    r0, {d0-d1}
    vldm    r1!, {d2-d3}
    veor    q0, q0, q1
    vstm    r0!, {d0-d1}
    subs    r2, r2, #128
    bne     Xft4_AddIs_128
    b       Xft4_AddIs_0
Xft4_AddIs_32:                          @ one word per iteration
    cmp     r2, #32
    bcc     Xft4_AddIs_8
    ldr     r4, [r0]
    ldr     r5, [r1], #4
    eor     r4, r4, r5
    str     r4, [r0], #4
    sub     r2, r2, #32
    b       Xft4_AddIs_32
Xft4_AddIs_8:                           @ one byte per iteration
    cmp     r2, #8
    bcc     Xft4_AddIs_7
    ldrb    r4, [r0]
    ldrb    r5, [r1], #1
    eor     r4, r4, r5
    strb    r4, [r0], #1
    sub     r2, r2, #8
    b       Xft4_AddIs_8
Xft4_AddIs_7:                           @ 0..7 trailing bits
    cmp     r2, #0
    beq     Xft4_AddIs_0
    mvn     r3, #0
    mvn     r3, r3, lsl r2              @ r3 = ~(~0 << bits) = (1 << bits) - 1
    ldrb    r4, [r0]
    ldrb    r5, [r1]
    eor     r4, r4, r5
    and     r4, r4, r3                  @ keep only the low `bits` bits
    strb    r4, [r0]
Xft4_AddIs_0:
    pop     {r4-r12,pc}
1012
+ @ theta_star: same instruction sequence as the theta macro with core scratch registers r5-r8 in place of r1-r4. For each column c (registers qc, q(c+4), q(c+8)) take P = XOR of the three registers of the previous column (column 3 for column 0), compute E = ror(P, 27) ^ ror(P, 18) on every 32-bit word, and XOR E into all three registers of column c. Every P is formed from the values the registers had on entry. Clobbers r5-r8, q14 and q15.
1012
+ .macro theta_star
1013
+ veor q15, q3, q7
1014
+ veor q15, q15, q11 @ q15 = P of column 3
1015
+ @ E for column 0: the four words of q15 are rotated in core registers; meanwhile q14 = P of column 0 is formed on the NEON side.
1016
+ vmov r7, r8, d30
1017
+ vmov r5, r6, d31
1018
+ ror r7, r7, #27
1019
+ veor q14, q0, q4
1020
+ ror r8, r8, #27
1021
+ veor q14, q14, q8 @ q14 = P of column 0 (q0 is not yet modified)
1022
+ ror r5, r5, #27
1023
+ ror r6, r6, #27
1024
+ eor r7, r7, r7, ror #23 @ ror(P,27) ^ ror(P,27+23) = ror(P,27) ^ ror(P,18)
1025
+ eor r8, r8, r8, ror #23
1026
+ eor r5, r5, r5, ror #23
1027
+ vmov d30, r7, r8
1028
+ eor r6, r6, r6, ror #23
1029
+ vmov d31, r5, r6 @ q15 = E for column 0
1030
+ @ E for column 1 is computed from q14 while column 0 is updated and q15 is reloaded with P of column 1.
1031
+ vmov r7, r8, d28
1032
+ vmov r5, r6, d29
1033
+ ror r7, r7, #27
1034
+ veor q0, q0, q15
1035
+ ror r8, r8, #27
1036
+ veor q4, q4, q15
1037
+ ror r5, r5, #27
1038
+ veor q8, q8, q15
1039
+ ror r6, r6, #27
1040
+ veor q15, q1, q5
1041
+ eor r7, r7, r7, ror #23
1042
+ veor q15, q15, q9 @ q15 = P of column 1 (q1 is not yet modified)
1043
+ eor r8, r8, r8, ror #23
1044
+ eor r5, r5, r5, ror #23
1045
+ vmov d28, r7, r8
1046
+ eor r6, r6, r6, ror #23
1047
+ vmov d29, r5, r6 @ q14 = E for column 1
1048
+ @ E for column 2 is computed from q15 while column 1 is updated and q14 is reloaded with P of column 2.
1049
+ vmov r7, r8, d30
1050
+ vmov r5, r6, d31
1051
+ ror r7, r7, #27
1052
+ veor q1, q1, q14
1053
+ ror r8, r8, #27
1054
+ veor q5, q5, q14
1055
+ ror r5, r5, #27
1056
+ veor q9, q9, q14
1057
+ ror r6, r6, #27
1058
+ veor q14, q2, q6
1059
+ eor r7, r7, r7, ror #23
1060
+ veor q14, q14, q10 @ q14 = P of column 2 (q2 is not yet modified)
1061
+ eor r8, r8, r8, ror #23
1062
+ eor r5, r5, r5, ror #23
1063
+ vmov d30, r7, r8
1064
+ eor r6, r6, r6, ror #23
1065
+ vmov d31, r5, r6 @ q15 = E for column 2
1066
+ @ E for column 3 is computed from q14 while column 2 is updated; column 3 is updated last.
1067
+ vmov r7, r8, d28
1068
+ vmov r5, r6, d29
1069
+ ror r7, r7, #27
1070
+ veor q2, q2, q15
1071
+ ror r8, r8, #27
1072
+ veor q6, q6, q15
1073
+ ror r5, r5, #27
1074
+ veor q10, q10, q15
1075
+ ror r6, r6, #27
1076
+ eor r7, r7, r7, ror #23
1077
+ eor r8, r8, r8, ror #23
1078
+ eor r5, r5, r5, ror #23
1079
+ vmov d28, r7, r8
1080
+ eor r6, r6, r6, ror #23
1081
+ vmov d29, r5, r6 @ q14 = E for column 3
1082
+ veor q3, q3, q14
1083
+ veor q7, q7, q14
1084
+ veor q11, q11, q14
1085
+ .endm
1087
+ @ rho_w_star: same as rho_w with core scratch registers r5-r8. Rotates every 32-bit word of q8, q9, q10, q11 left by 11 bits and writes the results to q12, q13, q14, q15; q8-q11 themselves are not modified. Clobbers r5-r8.
1087
+ .macro rho_w_star
1088
+ vmov r5, r6, d16 @ q8 is rotated in core registers, the other three with vshl/vsri
1089
+ vshl.U32 q13, q9, #11
1090
+ vmov r7, r8, d17
1091
+ vsri.U32 q13, q9, #21 @ q13 = q9 rotated left by 11
1092
+ ror r5, r5, #21 @ rotate right by 21 == rotate left by 11
1093
+ vshl.U32 q14, q10, #11
1094
+ ror r6, r6, #21
1095
+ vsri.U32 q14, q10, #21
1096
+ ror r7, r7, #21
1097
+ vshl.U32 q15, q11, #11
1098
+ ror r8, r8, #21
1099
+ vsri.U32 q15, q11, #21
1100
+ vmov d24, r5, r6
1101
+ vmov d25, r7, r8 @ q12 = q8 rotated left by 11
1102
+ .endm
1104
+ @ chi_star: same as chi but the constant is taken from r7. XORs the constant into every word of q0, then for each triple (a, b, c) = (q0,q7,q12), (q1,q4,q13), (q2,q5,q14), (q3,q6,q15) computes a' = a ^ (c & ~b), b' = b ^ (a & ~c), c' = c ^ (b & ~a). On exit a' is in q0-q3, b' is in q12-q15 and c' is in q8, q9, q10, q4. Overwrites q4-q15.
1104
+ .macro chi_star
1105
+ @ NOTE: Iota
1106
+ vdup.32 q8, r7 @ broadcast the value the caller placed in r7
1107
+ veor q0, q0, q8
1108
+ @ Triple (q0, q7, q12): c' -> q8, b' -> q12, a' -> q0
1109
+ vbic q11, q12, q7 @ q11 = q12 & ~q7
1110
+ vbic q9, q0, q12 @ q9 = q0 & ~q12
1111
+ vbic q10, q7, q0 @ q10 = q7 & ~q0
1112
+ veor q8, q10, q12
1113
+ veor q12, q7, q9
1114
+ veor q0, q0, q11
1115
+ @ Triple (q1, q4, q13): c' -> q9, b' -> q13, a' -> q1
1116
+ vbic q7, q13, q4
1117
+ vbic q10, q1, q13
1118
+ vbic q11, q4, q1
1119
+ veor q9, q11, q13
1120
+ veor q13, q4, q10
1121
+ veor q1, q1, q7
1122
+ @ Triple (q2, q5, q14): c' -> q10, b' -> q14, a' -> q2
1123
+ vbic q4, q14, q5
1124
+ vbic q11, q2, q14
1125
+ vbic q7, q5, q2
1126
+ veor q10, q7, q14
1127
+ veor q14, q5, q11
1128
+ veor q2, q2, q4
1129
+ @ Triple (q3, q6, q15): c' -> q4, b' -> q15, a' -> q3
1130
+ vbic q5, q15, q6
1131
+ vbic q7, q3, q15
1132
+ vbic q4, q6, q3
1133
+ veor q4, q4, q15
1134
+ veor q15, q6, q7
1135
+ veor q3, q3, q5
1136
+ .endm
1138
+ @ rho_e_star: same as rho_e with core scratch registers r5-r8. Takes the register layout left by chi_star. New q8, q9, q10, q11 = old q10, q4, q8, q9, each word rotated left by 8 bits; new q4, q5, q6, q7 = q12, q13, q14, q15, each word rotated left by 1 bit. q0-q3 are untouched. Clobbers r5-r8.
1138
+ .macro rho_e_star
1139
+ vshl.U32 q11, q9, #8
1140
+ vsri.U32 q11, q9, #24 @ q11 = old q9 rotated left by 8
1141
+
1142
+ vshl.U32 q9, q4, #8
1143
+ vsri.U32 q9, q4, #24 @ q9 = old q4 rotated left by 8
1144
+ @ q8 and q10 swap places: old q8 goes through core registers so that q8 can be overwritten before q10 is.
1145
+ vmov r5, r6, d16
1146
+ vmov r7, r8, d17
1147
+ ror r5, r5, #24 @ rotate right by 24 == rotate left by 8
1148
+ vshl.U32 q8, q10, #8
1149
+ ror r6, r6, #24
1150
+ vsri.U32 q8, q10, #24 @ q8 = old q10 rotated left by 8 (last read of old q10)
1151
+ ror r7, r7, #24
1152
+ vmov d20, r5, r6
1153
+ ror r8, r8, #24
1154
+ vmov d21, r7, r8 @ q10 = old q8 rotated left by 8
1155
+
1156
+ vshl.U32 q4, q12, #1
1157
+ vsri.U32 q4, q12, #31 @ q4 = q12 rotated left by 1
1158
+
1159
+ vshl.U32 q5, q13, #1
1160
+ vsri.U32 q5, q13, #31
1161
+
1162
+ vshl.U32 q6, q14, #1
1163
+ vsri.U32 q6, q14, #31
1164
+
1165
+ vshl.U32 q7, q15, #1
1166
+ vsri.U32 q7, q15, #31
1167
+ .endm
1169
+
1170
+ .macro xoodoo_6_star
1171
+ theta_star
1172
+ rho_w_star
1173
+ mov r7, #0x00000060
1174
+ chi_star
1175
+ rho_e_star
1176
+
1177
+ theta_star
1178
+ rho_w_star
1179
+ mov r7, #0x0000002C
1180
+ chi_star
1181
+ rho_e_star
1182
+
1183
+ theta_star
1184
+ rho_w_star
1185
+ mov r7, #0x00000380
1186
+ chi_star
1187
+ rho_e_star
1188
+
1189
+ theta_star
1190
+ rho_w_star
1191
+ mov r7, #0x000000F0
1192
+ chi_star
1193
+ rho_e_star
1194
+
1195
+ theta_star
1196
+ rho_w_star
1197
+ mov r7, #0x000001A0
1198
+ chi_star
1199
+ rho_e_star
1200
+
1201
+ theta_star
1202
+ rho_w_star
1203
+ mov r7, #0x00000012
1204
+ chi_star
1205
+ rho_e_star
1206
+ .endm
1207
+
1208
+ .macro focus_c
1209
+ tst r2, #3
1210
+ beq focused
1211
+ unfocused:
1212
+ ldmia r2!, {r4-r9}
1213
+ vmov d8, r4, r5
1214
+ vmov d9, r6, r7
1215
+ ldmia r2!, {r4-r5}
1216
+ vmov d10, r8, r9
1217
+ ldmia r2!, {r6-r7}
1218
+ vmov d11, r4, r5
1219
+ ldmia r2!, {r8-r9}
1220
+ vmov d12, r6, r7
1221
+ ldmia r2!, {r4-r5}
1222
+ vmov d13, r8, r9
1223
+ ldmia r2!, {r6-r7}
1224
+ vmov d14, r4, r5
1225
+ ldmia r2!, {r8-r9}
1226
+ vmov d15, r6, r7
1227
+ ldmia r2!, {r4-r5}
1228
+ vmov d16, r8, r9
1229
+ ldmia r2!, {r6-r7}
1230
+ vmov d17, r4, r5
1231
+ ldmia r2!, {r8-r9}
1232
+ vmov d18, r6, r7
1233
+ ldmia r2!, {r4-r5}
1234
+ vmov d19, r8, r9
1235
+ ldmia r2!, {r6-r7}
1236
+ vmov d20, r4, r5
1237
+ ldmia r2!, {r8-r9}
1238
+ vmov d21, r6, r7
1239
+ ldmia r2!, {r4-r5}
1240
+ vmov d22, r8, r9
1241
+ ldmia r2!, {r6-r7}
1242
+ vmov d23, r4, r5
1243
+ ldmia r2!, {r8-r9}
1244
+ vmov d24, r6, r7
1245
+ ldmia r2!, {r4-r5}
1246
+ vmov d25, r8, r9
1247
+ ldmia r2!, {r6-r7}
1248
+ vmov d26, r4, r5
1249
+ ldmia r2!, {r8-r9}
1250
+ vmov d27, r6, r7
1251
+ ldmia r2!, {r4-r5}
1252
+ vmov d28, r8, r9
1253
+ ldmia r2!, {r6-r7}
1254
+ vmov d29, r4, r5
1255
+ ldmia r2!, {r8-r9}
1256
+ vmov d30, r6, r7
1257
+ vmov d31, r8, r9
1258
+ b snapped
1259
+ focused:
1260
+ vldm r2!, {d8-d23}
1261
+ vldm r2!, {d24-d31}
1262
+ snapped:
1263
+ .endm
1264
+ @ zip_x: on entry q4-q15 hold four consecutive 12-word blocks (q4-q6, q7-q9, q10-q12, q13-q15). Three 4x4 word transposes followed by register moves leave word N of every block in qN (N = 0..11), with the block that started at q4, q7, q10, q13 in vector lane 0, 1, 2, 3 respectively. Overwrites q0-q15.
1264
+ .macro zip_x
1265
+ @ Shatter
1266
+ vuzp.32 q4, q10 @ transpose the 4x4 word matrix with rows q4, q7, q10, q13 (words 0-3 of each block)
1267
+ vuzp.32 q7, q13
1268
+ vtrn.32 q4, q7
1269
+ vtrn.32 q10, q13
1270
+ @ q4, q10, q7, q13
1271
+ @ Same transpose for rows q5, q8, q11, q14 (words 4-7 of each block).
1272
+ vuzp.32 q5, q11
1273
+ vuzp.32 q8, q14
1274
+ vtrn.32 q5, q8
1275
+ vtrn.32 q11, q14
1276
+ @ q5, q11, q8, q14
1277
+ @ Same transpose for rows q6, q9, q12, q15 (words 8-11 of each block).
1278
+ vuzp.32 q6, q12
1279
+ vuzp.32 q9, q15
1280
+ vtrn.32 q6, q9
1281
+ vtrn.32 q12, q15
1282
+ @ q6, q12, q9, q15
1283
+ @ The move order below matters: each source register is read before a later move overwrites it.
1284
+ @ Reordering (merge later, this is for convenience) (try merge up first!)
1285
+ vmov q0, q4 @ word 0
1286
+ vmov q4, q5 @ word 4
1287
+ vmov q5, q11 @ word 5
1288
+ vmov q11, q15 @ word 11
1289
+
1290
+ vmov q1, q10 @ word 1
1291
+ vmov q10, q9 @ word 10
1292
+ vmov q9, q12 @ word 9
1293
+
1294
+ vmov q2, q7 @ word 2
1295
+ vmov q7, q14 @ word 7
1296
+
1297
+ vmov q3, q13 @ word 3
1298
+
1299
+ vswp q6, q8 @ q6 = word 6, q8 = word 8
1300
+ .endm
1302
+ @ roll_zip_c: loads twelve words k0..k11 from [r0], derives four new words n12..n15, XORs key material into the twelve message registers q4-q15, stores {k5,k6,k7,n13, k9,k10,k11,n14, k2,k3,n12,n15} back to [r0] and finally invokes zip_x. The numeric comments below name word indices (12..15 are the new words). Clobbers r4-r8 and q0-q3; expects the message in q4-q15.
1302
+ .macro roll_zip_c
1303
+ @ Key seed bytes
1304
+ vldm r0, {d0-d5}
1305
+
1306
+ @ Get keystream generation inputs
1307
+ vmov r4, r5, d0 @ 0,1
1308
+ vmov r6, r7, d2 @ 4,5
1309
+ vmov r8, s8 @ 8
1310
+ @ Each new word is x ^ (x << 13) ^ ror(y, 29) for the pairs (k0,k4), (k4,k8), (k8,k1), (k1,k5).
1311
+ eor r4, r4, r4, lsl #13
1312
+ eor r4, r4, r6, ror #29
1313
+ @ r4 = 12
1314
+ eor r6, r6, r6, lsl #13
1315
+ eor r6, r6, r8, ror #29
1316
+ @ r6 = 13
1317
+ eor r8, r8, r8, lsl #13
1318
+ eor r8, r8, r5, ror #29 @ r5 still holds k1 here
1319
+ @ r8 = 14
1320
+ eor r5, r5, r5, lsl #13
1321
+ eor r5, r5, r7, ror #29
1322
+ @ r5 = 15
1323
+ @ XOR the unshifted key words into the message registers that need them before q0-q2 are rearranged.
1324
+ @ 0,1,2,3
1325
+ veor q4, q0, q4
1326
+ @ 4,5,6,7
1327
+ veor q5, q1, q5
1328
+ veor q7, q1, q7
1329
+ @ 8,9,10,11
1330
+ veor q6, q2, q6
1331
+ veor q8, q2, q8
1332
+ veor q10, q2, q10
1333
+ @ q3 = {k1, k2, k3, n12}
1334
+ vmov s12, s1
1335
+ vmov s13, s2
1336
+ vmov s14, s3
1337
+ vmov s15, r4
1338
+ @ q0 = {k5, k6, k7, n13}
1339
+ vmov s0, s5
1340
+ vmov s1, s6
1341
+ vmov s2, s7
1342
+ vmov s3, r6
1343
+ @ q1 = {k9, k10, k11, n14}
1344
+ vmov s4, s9
1345
+ vmov s5, s10
1346
+ vmov s6, s11
1347
+ vmov s7, r8
1348
+ @ q2 = {k2, k3, n12, n15}
1349
+ vmov s8, s13
1350
+ vmov s9, s14
1351
+ vmov d5, r4, r5
1352
+ vstm r0, {d0-d5} @ write the updated twelve words back to [r0]
1353
+
1354
+ @ 1,2,3,12
1355
+ veor q9, q3, q9
1356
+ veor q11, q3, q11
1357
+ veor q13, q3, q13
1358
+
1359
+ @ 5,6,7,13
1360
+ veor q12, q0, q12
1361
+ veor q14, q0, q14
1362
+
1363
+ @ 9,10,11,14
1364
+ veor q15, q1, q15
1365
+
1366
+ zip_x
1367
+ .endm
1369
+ @ accumulate: XORs the four 12-word states held word-sliced in q0-q11 (word N in qN, one state per lane) into the 12-word accumulator at [r1]. Each group of four registers is first transposed back so that every register holds four consecutive words of a single state. Overwrites q0-q14; r1 is not changed.
1369
+ .macro accumulate
1370
+ vldm r1, {d24-d29} @ q12-q14 = current accumulator (12 words)
1371
+ @ Words 0-3: after vtrn/vzip q0, q1, q2, q3 each hold words 0-3 of one state.
1372
+ vtrn.32 q0, q2
1373
+ vtrn.32 q1, q3
1374
+ vzip.32 q0, q1
1375
+ vzip.32 q2, q3
1376
+
1377
+ veor q0, q0, q1
1378
+ veor q2, q2, q3
1379
+ veor q12, q12, q0
1380
+ veor q12, q12, q2 @ accumulator words 0-3 ^= all four states
1381
+ @ Words 4-7
1382
+ vtrn.32 q4, q6
1383
+ vtrn.32 q5, q7
1384
+ vzip.32 q4, q5
1385
+ vzip.32 q6, q7
1386
+
1387
+ veor q4, q4, q5
1388
+ veor q6, q6, q7
1389
+ veor q13, q13, q4
1390
+ veor q13, q13, q6
1391
+ @ Words 8-11
1392
+ vtrn.32 q8, q10
1393
+ vtrn.32 q9, q11
1394
+ vzip.32 q8, q9
1395
+ vzip.32 q10, q11
1396
+
1397
+ veor q8, q8, q9
1398
+ veor q10, q10, q11
1399
+ veor q14, q14, q8
1400
+ veor q14, q14, q10
1401
+
1402
+ vstm r1, {d24-d29}
1403
+ .endm
1405
+
1406
@ Xooffftimes4_CompressFastLoop: uchar * k -> uchar * x -> uchar * input -> size_t length -> size_t
@
@ Processes input in chunks of 192 bytes (four 48-byte blocks per
@ iteration): focus_c loads the chunk, roll_zip_c mixes in and updates the
@ key state at k, xoodoo_6_star permutes, accumulate folds the result into
@ the accumulator at x.
@   In:      r0 = k, r1 = x, r2 = input, r3 = length in bytes
@   Out:     r0 = number of input bytes consumed (a multiple of 192)
@   Scratch: r1-r3, q0-q3, q8-q15, flags (r4-r10 and d8-d15 are
@            saved/restored)
@ Returns 0 without touching memory when input is not word aligned or when
@ length < 192. The original had no length check: with length < 192 the
@ initial subtraction wrapped around, so the loop read past the end of the
@ input and kept iterating.
    .align  8
    .global Xooffftimes4_CompressFastLoop
    .type   Xooffftimes4_CompressFastLoop, %function
Xooffftimes4_CompressFastLoop:
    @ Do not use this function for unaligned access (for now).
    tst     r2, #3
    movne   r0, #0
    bxne    lr
    cmp     r3, #192                    @ need at least one full chunk
    movcc   r0, #0
    bxcc    lr

    push    {r4-r10, lr}                @ Save LR, macros might branch.
    vpush   {d8-d15}
    mov     r10, #0                     @ r10 = bytes consumed so far
    sub     r3, r3, #192                @ r3 = bytes left after the next chunk
Xft4_CompressFast:
    focus_c                             @ Load 192 input bytes into q4-q15
    roll_zip_c                          @ Roll_c with message addition (XOR)
    xoodoo_6_star                       @ Same as Xoodoo_6; different registers
    accumulate                          @ Add up the four states we processed
    add     r10, r10, #192
    subs    r3, r3, #192                @ another full chunk available?
    bcs     Xft4_CompressFast
    mov     r0, r10
    vpop    {d8-d15}
    pop     {r4-r10, pc}
1431
+ @ roll_zip_e: loads twelve words k0..k11 from [r0], derives four new words n12..n15, fills q4-q15 with the four successive 12-word states, stores {k5,k6,k7,n13, k9,k10,k11,n14, k2,k3,n12,n15} back to [r0] and finally invokes zip_x. The numeric comments below name word indices (12..15 are the new words). Clobbers r4-r10 and q0-q15.
1431
+ .macro roll_zip_e
1432
+ vldm r0, {d0-d5}
1433
+
1434
+ @ Get keystream generation inputs
1435
+ vmov r4, r5, d0 @ 0,1
1436
+ vmov r6, r7, d2 @ 4,5
1437
+ vmov r8, r9, d4 @ 8,9
1438
+ @ Each new word is (y & z) ^ ror(x, 27) ^ ror(y, 19) ^ 7 for (x,y,z) = (k0,k4,k8), (k4,k8,k1), (k8,k1,k5), (k1,k5,k9).
1439
+ and r10, r6, r8
1440
+ eor r4, r10, r4, ror #27
1441
+ eor r4, r4, r6, ror #19
1442
+ eor r4, r4, #7
1443
+ @ r4 = 12
1444
+
1445
+ and r10, r8, r5
1446
+ eor r6, r10, r6, ror #27
1447
+ eor r6, r6, r8, ror #19
1448
+ eor r6, r6, #7
1449
+ @r6 = 13
1450
+
1451
+ and r10, r5, r7
1452
+ eor r8, r10, r8, ror #27
1453
+ eor r8, r8, r5, ror #19 @ r5 still holds k1 here
1454
+ eor r8, r8, #7
1455
+ @r8 = 14
1456
+
1457
+ and r10, r7, r9
1458
+ eor r5, r10, r5, ror #27
1459
+ eor r5, r5, r7, ror #19
1460
+ eor r5, r5, #7
1461
+ @r5 = 15
1462
+ @ Copy the unshifted key words into the state registers that need them before q0-q2 are rearranged.
1463
+ @ 0,1,2,3
1464
+ vmov q4, q0
1465
+ @ 4,5,6,7
1466
+ vmov q5, q1
1467
+ vmov q7, q1
1468
+ @ 8,9,10,11
1469
+ vmov q6, q2
1470
+ vmov q8, q2
1471
+ vmov q10, q2
1472
+ @ q3 = {k1, k2, k3, n12}
1473
+ @ Optimize movement here. Merge into zip_x or VLDM.
1474
+ vmov s12, s1
1475
+ vmov s13, s2
1476
+ vmov s14, s3
1477
+ vmov s15, r4
1478
+ @ q0 = {k5, k6, k7, n13}
1479
+ vmov s0, s5
1480
+ vmov s1, s6
1481
+ vmov s2, s7
1482
+ vmov s3, r6
1483
+ @ q1 = {k9, k10, k11, n14}
1484
+ vmov s4, s9
1485
+ vmov s5, s10
1486
+ vmov s6, s11
1487
+ vmov s7, r8
1488
+ @ q2 = {k2, k3, n12, n15}
1489
+ vmov s8, s13
1490
+ vmov s9, s14
1491
+ vmov d5, r4, r5
1492
+ vstm r0, {d0-d5} @ write the updated twelve words back to [r0]
1493
+
1494
+ @ 1,2,3,12
1495
+ vmov q9, q3
1496
+ vmov q11, q3
1497
+ vmov q13, q3
1498
+
1499
+ @ 5,6,7,13
1500
+ vmov q12, q0
1501
+ vmov q14, q0
1502
+
1503
+ @ 9,10,11,14
1504
+ vmov q15, q1
1505
+
1506
+ zip_x
1507
+ .endm
1509
+ @ sequentiate: converts the four word-sliced states in q0-q11 back to four consecutive 48-byte blocks, XORs each block with the twelve words at [r1] and writes the blocks to [r2], [r2+48], [r2+96], [r2+144]. On exit r2 has advanced by 192. Clobbers r4-r6 and q0-q14.
1509
+ .macro sequentiate
1510
+ @ Roll_e_n -> Pe + kRoll = Zn
1511
+ vldm r1, {d24-d29} @ q12-q14 = the twelve words at [r1]
1512
+
1513
+ add r4, r2, #48 @ r4, r5, r6 = output of the second, third and fourth block
1514
+ add r5, r4, #48
1515
+ add r6, r5, #48
1516
+
1517
+ vtrn.32 q0, q2
1518
+ vtrn.32 q1, q3
1519
+ vzip.32 q0, q1
1520
+ vzip.32 q2, q3
1521
+ @ 0 1 2 3 for A C B D
1522
+
1523
+ veor q0, q0, q12
1524
+ veor q1, q1, q12
1525
+ veor q2, q2, q12
1526
+ veor q3, q3, q12
1527
+ @ Store words 0-3 of each block; q1 and q2 are swapped with respect to block order, hence r4 <- q2 and r5 <- q1.
1528
+ vstm r2!, {d0-d1}
1529
+ vstm r4!, {d4-d5}
1530
+ vstm r5!, {d2-d3}
1531
+ vstm r6!, {d6-d7}
1532
+
1533
+ vtrn.32 q4, q6
1534
+ vtrn.32 q5, q7
1535
+ vzip.32 q4, q5
1536
+ vzip.32 q6, q7
1537
+ @ 4 5 6 7 for A C B D
1538
+ @ Results go to the even registers q0, q2, q4, q6 so that each block's words 4-11 end up in an adjacent register pair.
1539
+ veor q0, q4, q13
1540
+ veor q2, q5, q13
1541
+ veor q4, q6, q13
1542
+ veor q6, q7, q13
1543
+
1544
+ vtrn.32 q8, q10
1545
+ vtrn.32 q9, q11
1546
+ vzip.32 q8, q9
1547
+ vzip.32 q10, q11
1548
+ @ 8 9 10 11 for A C B D
1549
+
1550
+ veor q1, q8, q14
1551
+ veor q3, q9, q14
1552
+ veor q5, q10, q14
1553
+ veor q7, q11, q14
1554
+ @ Store words 4-11 of each block (32 bytes per block).
1555
+ vstm r2, {d0-d3}
1556
+ vstm r4, {d8-d11}
1557
+ vstm r5, {d4-d7}
1558
+ vstm r6!, {d12-d15} @ r6 now points just past the fourth block
1559
+
1560
+ mov r2, r6 @ r2 = start of the next 192-byte output chunk
1561
+ .endm
1563
+
1564
@ Xooffftimes4_ExpandFastLoop: uchar * yAccu -> uchar * kRoll -> uchar * output -> size_t length -> size_t
@
@ Produces output in chunks of 192 bytes (four 48-byte blocks per
@ iteration): roll_zip_e derives four states from, and updates, the state
@ at yAccu; xoodoo_6_star permutes them; sequentiate XORs each with the
@ twelve words at kRoll and writes them to output.
@   In:      r0 = yAccu, r1 = kRoll, r2 = output, r3 = length in bytes
@   Out:     r0 = number of output bytes written (a multiple of 192)
@   Scratch: r1-r3, q0-q3, q8-q15, flags (r4-r11 and d8-d15 are
@            saved/restored)
@ Returns 0 without touching memory when output is not word aligned or when
@ length < 192. The original had no length check: with length < 192 the
@ initial subtraction wrapped around, so the loop wrote past the end of the
@ output and kept iterating.
    .align  8
    .global Xooffftimes4_ExpandFastLoop
    .type   Xooffftimes4_ExpandFastLoop, %function
Xooffftimes4_ExpandFastLoop:
    @ Do not use this function for unaligned access (for now).
    tst     r2, #3
    movne   r0, #0
    bxne    lr
    cmp     r3, #192                    @ need room for at least one full chunk
    movcc   r0, #0
    bxcc    lr

    push    {r4-r11, lr}                @ Save LR, macros might branch.
    vpush   {d8-d15}
    mov     r11, #0                     @ r11 = bytes written so far
    sub     r3, r3, #192                @ r3 = bytes left after the next chunk
Xft4_ExpandFast:                        @ Original author's note: "The second loop breaks something."
    roll_zip_e
    xoodoo_6_star
    sequentiate
    add     r11, r11, #192
    subs    r3, r3, #192                @ another full chunk requested?
    bcs     Xft4_ExpandFast
    mov     r0, r11
    vpop    {d8-d15}
    pop     {r4-r11, pc}