ring-native 0.0.0 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (267) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/CHANGES.md +7 -0
  4. data/Makefile +5 -0
  5. data/README.md +12 -5
  6. data/Rakefile +4 -0
  7. data/ext/ring/extconf.rb +4 -5
  8. data/lib/ring/native.rb +3 -1
  9. data/lib/ring/native/version.rb +5 -1
  10. data/ring-native.gemspec +6 -6
  11. data/vendor/ring-ffi/Cargo.lock +26 -0
  12. data/vendor/ring-ffi/Cargo.toml +45 -0
  13. data/vendor/ring-ffi/LICENSE +16 -0
  14. data/vendor/ring-ffi/README.md +59 -0
  15. data/vendor/ring-ffi/src/lib.rs +79 -0
  16. metadata +10 -255
  17. data/vendor/ring/BUILDING.md +0 -40
  18. data/vendor/ring/Cargo.toml +0 -43
  19. data/vendor/ring/LICENSE +0 -185
  20. data/vendor/ring/Makefile +0 -35
  21. data/vendor/ring/PORTING.md +0 -163
  22. data/vendor/ring/README.md +0 -113
  23. data/vendor/ring/STYLE.md +0 -197
  24. data/vendor/ring/appveyor.yml +0 -27
  25. data/vendor/ring/build.rs +0 -108
  26. data/vendor/ring/crypto/aes/aes.c +0 -1142
  27. data/vendor/ring/crypto/aes/aes_test.Windows.vcxproj +0 -25
  28. data/vendor/ring/crypto/aes/aes_test.cc +0 -93
  29. data/vendor/ring/crypto/aes/asm/aes-586.pl +0 -2368
  30. data/vendor/ring/crypto/aes/asm/aes-armv4.pl +0 -1249
  31. data/vendor/ring/crypto/aes/asm/aes-x86_64.pl +0 -2246
  32. data/vendor/ring/crypto/aes/asm/aesni-x86.pl +0 -1318
  33. data/vendor/ring/crypto/aes/asm/aesni-x86_64.pl +0 -2084
  34. data/vendor/ring/crypto/aes/asm/aesv8-armx.pl +0 -675
  35. data/vendor/ring/crypto/aes/asm/bsaes-armv7.pl +0 -1364
  36. data/vendor/ring/crypto/aes/asm/bsaes-x86_64.pl +0 -1565
  37. data/vendor/ring/crypto/aes/asm/vpaes-x86.pl +0 -841
  38. data/vendor/ring/crypto/aes/asm/vpaes-x86_64.pl +0 -1116
  39. data/vendor/ring/crypto/aes/internal.h +0 -87
  40. data/vendor/ring/crypto/aes/mode_wrappers.c +0 -61
  41. data/vendor/ring/crypto/bn/add.c +0 -394
  42. data/vendor/ring/crypto/bn/asm/armv4-mont.pl +0 -694
  43. data/vendor/ring/crypto/bn/asm/armv8-mont.pl +0 -1503
  44. data/vendor/ring/crypto/bn/asm/bn-586.pl +0 -774
  45. data/vendor/ring/crypto/bn/asm/co-586.pl +0 -287
  46. data/vendor/ring/crypto/bn/asm/rsaz-avx2.pl +0 -1882
  47. data/vendor/ring/crypto/bn/asm/x86-mont.pl +0 -592
  48. data/vendor/ring/crypto/bn/asm/x86_64-gcc.c +0 -599
  49. data/vendor/ring/crypto/bn/asm/x86_64-mont.pl +0 -1393
  50. data/vendor/ring/crypto/bn/asm/x86_64-mont5.pl +0 -3507
  51. data/vendor/ring/crypto/bn/bn.c +0 -352
  52. data/vendor/ring/crypto/bn/bn_asn1.c +0 -74
  53. data/vendor/ring/crypto/bn/bn_test.Windows.vcxproj +0 -25
  54. data/vendor/ring/crypto/bn/bn_test.cc +0 -1696
  55. data/vendor/ring/crypto/bn/cmp.c +0 -200
  56. data/vendor/ring/crypto/bn/convert.c +0 -433
  57. data/vendor/ring/crypto/bn/ctx.c +0 -311
  58. data/vendor/ring/crypto/bn/div.c +0 -594
  59. data/vendor/ring/crypto/bn/exponentiation.c +0 -1335
  60. data/vendor/ring/crypto/bn/gcd.c +0 -711
  61. data/vendor/ring/crypto/bn/generic.c +0 -1019
  62. data/vendor/ring/crypto/bn/internal.h +0 -316
  63. data/vendor/ring/crypto/bn/montgomery.c +0 -516
  64. data/vendor/ring/crypto/bn/mul.c +0 -888
  65. data/vendor/ring/crypto/bn/prime.c +0 -829
  66. data/vendor/ring/crypto/bn/random.c +0 -334
  67. data/vendor/ring/crypto/bn/rsaz_exp.c +0 -262
  68. data/vendor/ring/crypto/bn/rsaz_exp.h +0 -53
  69. data/vendor/ring/crypto/bn/shift.c +0 -276
  70. data/vendor/ring/crypto/bytestring/bytestring_test.Windows.vcxproj +0 -25
  71. data/vendor/ring/crypto/bytestring/bytestring_test.cc +0 -421
  72. data/vendor/ring/crypto/bytestring/cbb.c +0 -399
  73. data/vendor/ring/crypto/bytestring/cbs.c +0 -227
  74. data/vendor/ring/crypto/bytestring/internal.h +0 -46
  75. data/vendor/ring/crypto/chacha/chacha_generic.c +0 -140
  76. data/vendor/ring/crypto/chacha/chacha_vec.c +0 -323
  77. data/vendor/ring/crypto/chacha/chacha_vec_arm.S +0 -1447
  78. data/vendor/ring/crypto/chacha/chacha_vec_arm_generate.go +0 -153
  79. data/vendor/ring/crypto/cipher/cipher_test.Windows.vcxproj +0 -25
  80. data/vendor/ring/crypto/cipher/e_aes.c +0 -390
  81. data/vendor/ring/crypto/cipher/e_chacha20poly1305.c +0 -208
  82. data/vendor/ring/crypto/cipher/internal.h +0 -173
  83. data/vendor/ring/crypto/cipher/test/aes_128_gcm_tests.txt +0 -543
  84. data/vendor/ring/crypto/cipher/test/aes_128_key_wrap_tests.txt +0 -9
  85. data/vendor/ring/crypto/cipher/test/aes_256_gcm_tests.txt +0 -475
  86. data/vendor/ring/crypto/cipher/test/aes_256_key_wrap_tests.txt +0 -23
  87. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_old_tests.txt +0 -422
  88. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_tests.txt +0 -484
  89. data/vendor/ring/crypto/cipher/test/cipher_test.txt +0 -100
  90. data/vendor/ring/crypto/constant_time_test.Windows.vcxproj +0 -25
  91. data/vendor/ring/crypto/constant_time_test.c +0 -304
  92. data/vendor/ring/crypto/cpu-arm-asm.S +0 -32
  93. data/vendor/ring/crypto/cpu-arm.c +0 -199
  94. data/vendor/ring/crypto/cpu-intel.c +0 -261
  95. data/vendor/ring/crypto/crypto.c +0 -151
  96. data/vendor/ring/crypto/curve25519/asm/x25519-arm.S +0 -2118
  97. data/vendor/ring/crypto/curve25519/curve25519.c +0 -4888
  98. data/vendor/ring/crypto/curve25519/x25519_test.cc +0 -128
  99. data/vendor/ring/crypto/digest/md32_common.h +0 -181
  100. data/vendor/ring/crypto/ec/asm/p256-x86_64-asm.pl +0 -2725
  101. data/vendor/ring/crypto/ec/ec.c +0 -193
  102. data/vendor/ring/crypto/ec/ec_curves.c +0 -61
  103. data/vendor/ring/crypto/ec/ec_key.c +0 -228
  104. data/vendor/ring/crypto/ec/ec_montgomery.c +0 -114
  105. data/vendor/ring/crypto/ec/example_mul.Windows.vcxproj +0 -25
  106. data/vendor/ring/crypto/ec/internal.h +0 -243
  107. data/vendor/ring/crypto/ec/oct.c +0 -253
  108. data/vendor/ring/crypto/ec/p256-64.c +0 -1794
  109. data/vendor/ring/crypto/ec/p256-x86_64-table.h +0 -9548
  110. data/vendor/ring/crypto/ec/p256-x86_64.c +0 -509
  111. data/vendor/ring/crypto/ec/simple.c +0 -1007
  112. data/vendor/ring/crypto/ec/util-64.c +0 -183
  113. data/vendor/ring/crypto/ec/wnaf.c +0 -508
  114. data/vendor/ring/crypto/ecdh/ecdh.c +0 -155
  115. data/vendor/ring/crypto/ecdsa/ecdsa.c +0 -304
  116. data/vendor/ring/crypto/ecdsa/ecdsa_asn1.c +0 -193
  117. data/vendor/ring/crypto/ecdsa/ecdsa_test.Windows.vcxproj +0 -25
  118. data/vendor/ring/crypto/ecdsa/ecdsa_test.cc +0 -327
  119. data/vendor/ring/crypto/header_removed.h +0 -17
  120. data/vendor/ring/crypto/internal.h +0 -495
  121. data/vendor/ring/crypto/libring.Windows.vcxproj +0 -101
  122. data/vendor/ring/crypto/mem.c +0 -98
  123. data/vendor/ring/crypto/modes/asm/aesni-gcm-x86_64.pl +0 -1045
  124. data/vendor/ring/crypto/modes/asm/ghash-armv4.pl +0 -517
  125. data/vendor/ring/crypto/modes/asm/ghash-x86.pl +0 -1393
  126. data/vendor/ring/crypto/modes/asm/ghash-x86_64.pl +0 -1741
  127. data/vendor/ring/crypto/modes/asm/ghashv8-armx.pl +0 -422
  128. data/vendor/ring/crypto/modes/ctr.c +0 -226
  129. data/vendor/ring/crypto/modes/gcm.c +0 -1206
  130. data/vendor/ring/crypto/modes/gcm_test.Windows.vcxproj +0 -25
  131. data/vendor/ring/crypto/modes/gcm_test.c +0 -348
  132. data/vendor/ring/crypto/modes/internal.h +0 -299
  133. data/vendor/ring/crypto/perlasm/arm-xlate.pl +0 -170
  134. data/vendor/ring/crypto/perlasm/readme +0 -100
  135. data/vendor/ring/crypto/perlasm/x86_64-xlate.pl +0 -1164
  136. data/vendor/ring/crypto/perlasm/x86asm.pl +0 -292
  137. data/vendor/ring/crypto/perlasm/x86gas.pl +0 -263
  138. data/vendor/ring/crypto/perlasm/x86masm.pl +0 -200
  139. data/vendor/ring/crypto/perlasm/x86nasm.pl +0 -187
  140. data/vendor/ring/crypto/poly1305/poly1305.c +0 -331
  141. data/vendor/ring/crypto/poly1305/poly1305_arm.c +0 -301
  142. data/vendor/ring/crypto/poly1305/poly1305_arm_asm.S +0 -2015
  143. data/vendor/ring/crypto/poly1305/poly1305_test.Windows.vcxproj +0 -25
  144. data/vendor/ring/crypto/poly1305/poly1305_test.cc +0 -80
  145. data/vendor/ring/crypto/poly1305/poly1305_test.txt +0 -52
  146. data/vendor/ring/crypto/poly1305/poly1305_vec.c +0 -892
  147. data/vendor/ring/crypto/rand/asm/rdrand-x86_64.pl +0 -75
  148. data/vendor/ring/crypto/rand/internal.h +0 -32
  149. data/vendor/ring/crypto/rand/rand.c +0 -189
  150. data/vendor/ring/crypto/rand/urandom.c +0 -219
  151. data/vendor/ring/crypto/rand/windows.c +0 -56
  152. data/vendor/ring/crypto/refcount_c11.c +0 -66
  153. data/vendor/ring/crypto/refcount_lock.c +0 -53
  154. data/vendor/ring/crypto/refcount_test.Windows.vcxproj +0 -25
  155. data/vendor/ring/crypto/refcount_test.c +0 -58
  156. data/vendor/ring/crypto/rsa/blinding.c +0 -462
  157. data/vendor/ring/crypto/rsa/internal.h +0 -108
  158. data/vendor/ring/crypto/rsa/padding.c +0 -300
  159. data/vendor/ring/crypto/rsa/rsa.c +0 -450
  160. data/vendor/ring/crypto/rsa/rsa_asn1.c +0 -261
  161. data/vendor/ring/crypto/rsa/rsa_impl.c +0 -944
  162. data/vendor/ring/crypto/rsa/rsa_test.Windows.vcxproj +0 -25
  163. data/vendor/ring/crypto/rsa/rsa_test.cc +0 -437
  164. data/vendor/ring/crypto/sha/asm/sha-armv8.pl +0 -436
  165. data/vendor/ring/crypto/sha/asm/sha-x86_64.pl +0 -2390
  166. data/vendor/ring/crypto/sha/asm/sha256-586.pl +0 -1275
  167. data/vendor/ring/crypto/sha/asm/sha256-armv4.pl +0 -735
  168. data/vendor/ring/crypto/sha/asm/sha256-armv8.pl +0 -14
  169. data/vendor/ring/crypto/sha/asm/sha256-x86_64.pl +0 -14
  170. data/vendor/ring/crypto/sha/asm/sha512-586.pl +0 -911
  171. data/vendor/ring/crypto/sha/asm/sha512-armv4.pl +0 -666
  172. data/vendor/ring/crypto/sha/asm/sha512-armv8.pl +0 -14
  173. data/vendor/ring/crypto/sha/asm/sha512-x86_64.pl +0 -14
  174. data/vendor/ring/crypto/sha/sha1.c +0 -271
  175. data/vendor/ring/crypto/sha/sha256.c +0 -204
  176. data/vendor/ring/crypto/sha/sha512.c +0 -355
  177. data/vendor/ring/crypto/test/file_test.cc +0 -326
  178. data/vendor/ring/crypto/test/file_test.h +0 -181
  179. data/vendor/ring/crypto/test/malloc.cc +0 -150
  180. data/vendor/ring/crypto/test/scoped_types.h +0 -95
  181. data/vendor/ring/crypto/test/test.Windows.vcxproj +0 -35
  182. data/vendor/ring/crypto/test/test_util.cc +0 -46
  183. data/vendor/ring/crypto/test/test_util.h +0 -41
  184. data/vendor/ring/crypto/thread_none.c +0 -55
  185. data/vendor/ring/crypto/thread_pthread.c +0 -165
  186. data/vendor/ring/crypto/thread_test.Windows.vcxproj +0 -25
  187. data/vendor/ring/crypto/thread_test.c +0 -200
  188. data/vendor/ring/crypto/thread_win.c +0 -282
  189. data/vendor/ring/examples/checkdigest.rs +0 -103
  190. data/vendor/ring/include/openssl/aes.h +0 -121
  191. data/vendor/ring/include/openssl/arm_arch.h +0 -129
  192. data/vendor/ring/include/openssl/base.h +0 -156
  193. data/vendor/ring/include/openssl/bn.h +0 -794
  194. data/vendor/ring/include/openssl/buffer.h +0 -18
  195. data/vendor/ring/include/openssl/bytestring.h +0 -235
  196. data/vendor/ring/include/openssl/chacha.h +0 -37
  197. data/vendor/ring/include/openssl/cmac.h +0 -76
  198. data/vendor/ring/include/openssl/cpu.h +0 -184
  199. data/vendor/ring/include/openssl/crypto.h +0 -43
  200. data/vendor/ring/include/openssl/curve25519.h +0 -88
  201. data/vendor/ring/include/openssl/ec.h +0 -225
  202. data/vendor/ring/include/openssl/ec_key.h +0 -129
  203. data/vendor/ring/include/openssl/ecdh.h +0 -110
  204. data/vendor/ring/include/openssl/ecdsa.h +0 -156
  205. data/vendor/ring/include/openssl/err.h +0 -201
  206. data/vendor/ring/include/openssl/mem.h +0 -101
  207. data/vendor/ring/include/openssl/obj_mac.h +0 -71
  208. data/vendor/ring/include/openssl/opensslfeatures.h +0 -68
  209. data/vendor/ring/include/openssl/opensslv.h +0 -18
  210. data/vendor/ring/include/openssl/ossl_typ.h +0 -18
  211. data/vendor/ring/include/openssl/poly1305.h +0 -51
  212. data/vendor/ring/include/openssl/rand.h +0 -70
  213. data/vendor/ring/include/openssl/rsa.h +0 -399
  214. data/vendor/ring/include/openssl/thread.h +0 -133
  215. data/vendor/ring/include/openssl/type_check.h +0 -71
  216. data/vendor/ring/mk/Common.props +0 -63
  217. data/vendor/ring/mk/Windows.props +0 -42
  218. data/vendor/ring/mk/WindowsTest.props +0 -18
  219. data/vendor/ring/mk/appveyor.bat +0 -62
  220. data/vendor/ring/mk/bottom_of_makefile.mk +0 -54
  221. data/vendor/ring/mk/ring.mk +0 -266
  222. data/vendor/ring/mk/top_of_makefile.mk +0 -214
  223. data/vendor/ring/mk/travis.sh +0 -40
  224. data/vendor/ring/mk/update-travis-yml.py +0 -229
  225. data/vendor/ring/ring.sln +0 -153
  226. data/vendor/ring/src/aead.rs +0 -682
  227. data/vendor/ring/src/agreement.rs +0 -248
  228. data/vendor/ring/src/c.rs +0 -129
  229. data/vendor/ring/src/constant_time.rs +0 -37
  230. data/vendor/ring/src/der.rs +0 -96
  231. data/vendor/ring/src/digest.rs +0 -690
  232. data/vendor/ring/src/digest_tests.txt +0 -57
  233. data/vendor/ring/src/ecc.rs +0 -28
  234. data/vendor/ring/src/ecc_build.rs +0 -279
  235. data/vendor/ring/src/ecc_curves.rs +0 -117
  236. data/vendor/ring/src/ed25519_tests.txt +0 -2579
  237. data/vendor/ring/src/exe_tests.rs +0 -46
  238. data/vendor/ring/src/ffi.rs +0 -29
  239. data/vendor/ring/src/file_test.rs +0 -187
  240. data/vendor/ring/src/hkdf.rs +0 -153
  241. data/vendor/ring/src/hkdf_tests.txt +0 -59
  242. data/vendor/ring/src/hmac.rs +0 -414
  243. data/vendor/ring/src/hmac_tests.txt +0 -97
  244. data/vendor/ring/src/input.rs +0 -312
  245. data/vendor/ring/src/lib.rs +0 -41
  246. data/vendor/ring/src/pbkdf2.rs +0 -265
  247. data/vendor/ring/src/pbkdf2_tests.txt +0 -113
  248. data/vendor/ring/src/polyfill.rs +0 -57
  249. data/vendor/ring/src/rand.rs +0 -28
  250. data/vendor/ring/src/signature.rs +0 -314
  251. data/vendor/ring/third-party/NIST/README.md +0 -9
  252. data/vendor/ring/third-party/NIST/SHAVS/SHA1LongMsg.rsp +0 -263
  253. data/vendor/ring/third-party/NIST/SHAVS/SHA1Monte.rsp +0 -309
  254. data/vendor/ring/third-party/NIST/SHAVS/SHA1ShortMsg.rsp +0 -267
  255. data/vendor/ring/third-party/NIST/SHAVS/SHA224LongMsg.rsp +0 -263
  256. data/vendor/ring/third-party/NIST/SHAVS/SHA224Monte.rsp +0 -309
  257. data/vendor/ring/third-party/NIST/SHAVS/SHA224ShortMsg.rsp +0 -267
  258. data/vendor/ring/third-party/NIST/SHAVS/SHA256LongMsg.rsp +0 -263
  259. data/vendor/ring/third-party/NIST/SHAVS/SHA256Monte.rsp +0 -309
  260. data/vendor/ring/third-party/NIST/SHAVS/SHA256ShortMsg.rsp +0 -267
  261. data/vendor/ring/third-party/NIST/SHAVS/SHA384LongMsg.rsp +0 -519
  262. data/vendor/ring/third-party/NIST/SHAVS/SHA384Monte.rsp +0 -309
  263. data/vendor/ring/third-party/NIST/SHAVS/SHA384ShortMsg.rsp +0 -523
  264. data/vendor/ring/third-party/NIST/SHAVS/SHA512LongMsg.rsp +0 -519
  265. data/vendor/ring/third-party/NIST/SHAVS/SHA512Monte.rsp +0 -309
  266. data/vendor/ring/third-party/NIST/SHAVS/SHA512ShortMsg.rsp +0 -523
  267. data/vendor/ring/third-party/NIST/sha256sums.txt +0 -1
@@ -1,1393 +0,0 @@
1
- #!/usr/bin/env perl
2
- #
3
- # ====================================================================
4
- # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5
- # project. The module is, however, dual licensed under OpenSSL and
6
- # CRYPTOGAMS licenses depending on where you obtain it. For further
7
- # details see http://www.openssl.org/~appro/cryptogams/.
8
- # ====================================================================
9
- #
10
- # March, May, June 2010
11
- #
12
- # The module implements "4-bit" GCM GHASH function and underlying
13
- # single multiplication operation in GF(2^128). "4-bit" means that it
14
- # uses 256 bytes per-key table [+64/128 bytes fixed table]. It has two
15
- # code paths: vanilla x86 and vanilla SSE. Former will be executed on
16
- # 486 and Pentium, latter on all others. SSE GHASH features so called
17
- # "528B" variant of "4-bit" method utilizing additional 256+16 bytes
18
- # of per-key storage [+512 bytes shared table]. Performance results
19
- # are for streamed GHASH subroutine and are expressed in cycles per
20
- # processed byte, less is better:
21
- #
22
- # gcc 2.95.3(*) SSE assembler x86 assembler
23
- #
24
- # Pentium 105/111(**) - 50
25
- # PIII 68 /75 12.2 24
26
- # P4 125/125 17.8 84(***)
27
- # Opteron 66 /70 10.1 30
28
- # Core2 54 /67 8.4 18
29
- # Atom 105/105 16.8 53
30
- # VIA Nano 69 /71 13.0 27
31
- #
32
- # (*) gcc 3.4.x was observed to generate few percent slower code,
33
- # which is one of reasons why 2.95.3 results were chosen,
34
- # another reason is lack of 3.4.x results for older CPUs;
35
- # comparison with SSE results is not completely fair, because C
36
- # results are for vanilla "256B" implementation, while
37
- # assembler results are for "528B";-)
38
- # (**) second number is result for code compiled with -fPIC flag,
39
- # which is actually more relevant, because assembler code is
40
- # position-independent;
41
- # (***) see comment in non-MMX routine for further details;
42
- #
43
- # To summarize, it's >2-5 times faster than gcc-generated code. To
44
- # anchor it to something else SHA1 assembler processes one byte in
45
- # ~7 cycles on contemporary x86 cores. As for choice of MMX/SSE
46
- # in particular, see comment at the end of the file...
47
-
48
- # May 2010
49
- #
50
- # Add PCLMULQDQ version performing at 2.10 cycles per processed byte.
51
- # The question is how close is it to theoretical limit? The pclmulqdq
52
- # instruction latency appears to be 14 cycles and there can't be more
53
- # than 2 of them executing at any given time. This means that single
54
- # Karatsuba multiplication would take 28 cycles *plus* few cycles for
55
- # pre- and post-processing. Then multiplication has to be followed by
56
- # modulo-reduction. Given that aggregated reduction method [see
57
- # "Carry-less Multiplication and Its Usage for Computing the GCM Mode"
58
- # white paper by Intel] allows you to perform reduction only once in
59
- # a while we can assume that asymptotic performance can be estimated
60
- # as (28+Tmod/Naggr)/16, where Tmod is time to perform reduction
61
- # and Naggr is the aggregation factor.
62
- #
63
- # Before we proceed to this implementation let's have closer look at
64
- # the best-performing code suggested by Intel in their white paper.
65
- # By tracing inter-register dependencies Tmod is estimated as ~19
66
- # cycles and Naggr chosen by Intel is 4, resulting in 2.05 cycles per
67
- # processed byte. As implied, this is quite optimistic estimate,
68
- # because it does not account for Karatsuba pre- and post-processing,
69
- # which for a single multiplication is ~5 cycles. Unfortunately Intel
70
- # does not provide performance data for GHASH alone. But benchmarking
71
- # AES_GCM_encrypt ripped out of Fig. 15 of the white paper with aadt
72
- # alone resulted in 2.46 cycles per byte of out 16KB buffer. Note that
73
- # the result accounts even for pre-computing of degrees of the hash
74
- # key H, but its portion is negligible at 16KB buffer size.
75
- #
76
- # Moving on to the implementation in question. Tmod is estimated as
77
- # ~13 cycles and Naggr is 2, giving asymptotic performance of ...
78
- # 2.16. How is it possible that measured performance is better than
79
- # optimistic theoretical estimate? There is one thing Intel failed
80
- # to recognize. By serializing GHASH with CTR in same subroutine
81
- # former's performance is really limited to above (Tmul + Tmod/Naggr)
82
- # equation. But if GHASH procedure is detached, the modulo-reduction
83
- # can be interleaved with Naggr-1 multiplications at instruction level
84
- # and under ideal conditions even disappear from the equation. So that
85
- # optimistic theoretical estimate for this implementation is ...
86
- # 28/16=1.75, and not 2.16. Well, it's probably way too optimistic,
87
- # at least for such small Naggr. I'd argue that (28+Tproc/Naggr),
88
- # where Tproc is time required for Karatsuba pre- and post-processing,
89
- # is more realistic estimate. In this case it gives ... 1.91 cycles.
90
- # Or in other words, depending on how well we can interleave reduction
91
- # and one of the two multiplications the performance should be betwen
92
- # 1.91 and 2.16. As already mentioned, this implementation processes
93
- # one byte out of 8KB buffer in 2.10 cycles, while x86_64 counterpart
94
- # - in 2.02. x86_64 performance is better, because larger register
95
- # bank allows to interleave reduction and multiplication better.
96
- #
97
- # Does it make sense to increase Naggr? To start with it's virtually
98
- # impossible in 32-bit mode, because of limited register bank
99
- # capacity. Otherwise improvement has to be weighed agiainst slower
100
- # setup, as well as code size and complexity increase. As even
101
- # optimistic estimate doesn't promise 30% performance improvement,
102
- # there are currently no plans to increase Naggr.
103
- #
104
- # Special thanks to David Woodhouse <dwmw2@infradead.org> for
105
- # providing access to a Westmere-based system on behalf of Intel
106
- # Open Source Technology Centre.
107
-
108
- # January 2010
109
- #
110
- # Tweaked to optimize transitions between integer and FP operations
111
- # on same XMM register, PCLMULQDQ subroutine was measured to process
112
- # one byte in 2.07 cycles on Sandy Bridge, and in 2.12 - on Westmere.
113
- # The minor regression on Westmere is outweighed by ~15% improvement
114
- # on Sandy Bridge. Strangely enough attempt to modify 64-bit code in
115
- # similar manner resulted in almost 20% degradation on Sandy Bridge,
116
- # where original 64-bit code processes one byte in 1.95 cycles.
117
-
118
- #####################################################################
119
- # For reference, AMD Bulldozer processes one byte in 1.98 cycles in
120
- # 32-bit mode and 1.89 in 64-bit.
121
-
122
- # February 2013
123
- #
124
- # Overhaul: aggregate Karatsuba post-processing, improve ILP in
125
- # reduction_alg9. Resulting performance is 1.96 cycles per byte on
126
- # Westmere, 1.95 - on Sandy/Ivy Bridge, 1.76 - on Bulldozer.
127
-
128
- $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
129
- push(@INC,"${dir}","${dir}../../perlasm");
130
- require "x86asm.pl";
131
-
132
- &asm_init($ARGV[0],"ghash-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");
133
-
134
- $sse2=0;
135
- for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
136
-
137
- ($Zhh,$Zhl,$Zlh,$Zll) = ("ebp","edx","ecx","ebx");
138
- $inp = "edi";
139
- $Htbl = "esi";
140
-
141
- $unroll = 0; # Affects x86 loop. Folded loop performs ~7% worse
142
- # than unrolled, which has to be weighted against
143
- # 2.5x x86-specific code size reduction.
144
-
145
- sub x86_loop {
146
- my $off = shift;
147
- my $rem = "eax";
148
-
149
- &mov ($Zhh,&DWP(4,$Htbl,$Zll));
150
- &mov ($Zhl,&DWP(0,$Htbl,$Zll));
151
- &mov ($Zlh,&DWP(12,$Htbl,$Zll));
152
- &mov ($Zll,&DWP(8,$Htbl,$Zll));
153
- &xor ($rem,$rem); # avoid partial register stalls on PIII
154
-
155
- # shrd practically kills P4, 2.5x deterioration, but P4 has
156
- # MMX code-path to execute. shrd runs tad faster [than twice
157
- # the shifts, move's and or's] on pre-MMX Pentium (as well as
158
- # PIII and Core2), *but* minimizes code size, spares register
159
- # and thus allows to fold the loop...
160
- if (!$unroll) {
161
- my $cnt = $inp;
162
- &mov ($cnt,15);
163
- &jmp (&label("x86_loop"));
164
- &set_label("x86_loop",16);
165
- for($i=1;$i<=2;$i++) {
166
- &mov (&LB($rem),&LB($Zll));
167
- &shrd ($Zll,$Zlh,4);
168
- &and (&LB($rem),0xf);
169
- &shrd ($Zlh,$Zhl,4);
170
- &shrd ($Zhl,$Zhh,4);
171
- &shr ($Zhh,4);
172
- &xor ($Zhh,&DWP($off+16,"esp",$rem,4));
173
-
174
- &mov (&LB($rem),&BP($off,"esp",$cnt));
175
- if ($i&1) {
176
- &and (&LB($rem),0xf0);
177
- } else {
178
- &shl (&LB($rem),4);
179
- }
180
-
181
- &xor ($Zll,&DWP(8,$Htbl,$rem));
182
- &xor ($Zlh,&DWP(12,$Htbl,$rem));
183
- &xor ($Zhl,&DWP(0,$Htbl,$rem));
184
- &xor ($Zhh,&DWP(4,$Htbl,$rem));
185
-
186
- if ($i&1) {
187
- &dec ($cnt);
188
- &js (&label("x86_break"));
189
- } else {
190
- &jmp (&label("x86_loop"));
191
- }
192
- }
193
- &set_label("x86_break",16);
194
- } else {
195
- for($i=1;$i<32;$i++) {
196
- &comment($i);
197
- &mov (&LB($rem),&LB($Zll));
198
- &shrd ($Zll,$Zlh,4);
199
- &and (&LB($rem),0xf);
200
- &shrd ($Zlh,$Zhl,4);
201
- &shrd ($Zhl,$Zhh,4);
202
- &shr ($Zhh,4);
203
- &xor ($Zhh,&DWP($off+16,"esp",$rem,4));
204
-
205
- if ($i&1) {
206
- &mov (&LB($rem),&BP($off+15-($i>>1),"esp"));
207
- &and (&LB($rem),0xf0);
208
- } else {
209
- &mov (&LB($rem),&BP($off+15-($i>>1),"esp"));
210
- &shl (&LB($rem),4);
211
- }
212
-
213
- &xor ($Zll,&DWP(8,$Htbl,$rem));
214
- &xor ($Zlh,&DWP(12,$Htbl,$rem));
215
- &xor ($Zhl,&DWP(0,$Htbl,$rem));
216
- &xor ($Zhh,&DWP(4,$Htbl,$rem));
217
- }
218
- }
219
- &bswap ($Zll);
220
- &bswap ($Zlh);
221
- &bswap ($Zhl);
222
- if (!$x86only) {
223
- &bswap ($Zhh);
224
- } else {
225
- &mov ("eax",$Zhh);
226
- &bswap ("eax");
227
- &mov ($Zhh,"eax");
228
- }
229
- }
230
-
231
- if ($unroll) {
232
- &function_begin_B("_x86_gmult_4bit_inner");
233
- &x86_loop(4);
234
- &ret ();
235
- &function_end_B("_x86_gmult_4bit_inner");
236
- }
237
-
238
- sub deposit_rem_4bit {
239
- my $bias = shift;
240
-
241
- &mov (&DWP($bias+0, "esp"),0x0000<<16);
242
- &mov (&DWP($bias+4, "esp"),0x1C20<<16);
243
- &mov (&DWP($bias+8, "esp"),0x3840<<16);
244
- &mov (&DWP($bias+12,"esp"),0x2460<<16);
245
- &mov (&DWP($bias+16,"esp"),0x7080<<16);
246
- &mov (&DWP($bias+20,"esp"),0x6CA0<<16);
247
- &mov (&DWP($bias+24,"esp"),0x48C0<<16);
248
- &mov (&DWP($bias+28,"esp"),0x54E0<<16);
249
- &mov (&DWP($bias+32,"esp"),0xE100<<16);
250
- &mov (&DWP($bias+36,"esp"),0xFD20<<16);
251
- &mov (&DWP($bias+40,"esp"),0xD940<<16);
252
- &mov (&DWP($bias+44,"esp"),0xC560<<16);
253
- &mov (&DWP($bias+48,"esp"),0x9180<<16);
254
- &mov (&DWP($bias+52,"esp"),0x8DA0<<16);
255
- &mov (&DWP($bias+56,"esp"),0xA9C0<<16);
256
- &mov (&DWP($bias+60,"esp"),0xB5E0<<16);
257
- }
258
-
259
- $suffix = $x86only ? "" : "_x86";
260
-
261
- &function_begin("gcm_gmult_4bit".$suffix);
262
- &stack_push(16+4+1); # +1 for stack alignment
263
- &mov ($inp,&wparam(0)); # load Xi
264
- &mov ($Htbl,&wparam(1)); # load Htable
265
-
266
- &mov ($Zhh,&DWP(0,$inp)); # load Xi[16]
267
- &mov ($Zhl,&DWP(4,$inp));
268
- &mov ($Zlh,&DWP(8,$inp));
269
- &mov ($Zll,&DWP(12,$inp));
270
-
271
- &deposit_rem_4bit(16);
272
-
273
- &mov (&DWP(0,"esp"),$Zhh); # copy Xi[16] on stack
274
- &mov (&DWP(4,"esp"),$Zhl);
275
- &mov (&DWP(8,"esp"),$Zlh);
276
- &mov (&DWP(12,"esp"),$Zll);
277
- &shr ($Zll,20);
278
- &and ($Zll,0xf0);
279
-
280
- if ($unroll) {
281
- &call ("_x86_gmult_4bit_inner");
282
- } else {
283
- &x86_loop(0);
284
- &mov ($inp,&wparam(0));
285
- }
286
-
287
- &mov (&DWP(12,$inp),$Zll);
288
- &mov (&DWP(8,$inp),$Zlh);
289
- &mov (&DWP(4,$inp),$Zhl);
290
- &mov (&DWP(0,$inp),$Zhh);
291
- &stack_pop(16+4+1);
292
- &function_end("gcm_gmult_4bit".$suffix);
293
-
294
- &function_begin("gcm_ghash_4bit".$suffix);
295
- &stack_push(16+4+1); # +1 for 64-bit alignment
296
- &mov ($Zll,&wparam(0)); # load Xi
297
- &mov ($Htbl,&wparam(1)); # load Htable
298
- &mov ($inp,&wparam(2)); # load in
299
- &mov ("ecx",&wparam(3)); # load len
300
- &add ("ecx",$inp);
301
- &mov (&wparam(3),"ecx");
302
-
303
- &mov ($Zhh,&DWP(0,$Zll)); # load Xi[16]
304
- &mov ($Zhl,&DWP(4,$Zll));
305
- &mov ($Zlh,&DWP(8,$Zll));
306
- &mov ($Zll,&DWP(12,$Zll));
307
-
308
- &deposit_rem_4bit(16);
309
-
310
- &set_label("x86_outer_loop",16);
311
- &xor ($Zll,&DWP(12,$inp)); # xor with input
312
- &xor ($Zlh,&DWP(8,$inp));
313
- &xor ($Zhl,&DWP(4,$inp));
314
- &xor ($Zhh,&DWP(0,$inp));
315
- &mov (&DWP(12,"esp"),$Zll); # dump it on stack
316
- &mov (&DWP(8,"esp"),$Zlh);
317
- &mov (&DWP(4,"esp"),$Zhl);
318
- &mov (&DWP(0,"esp"),$Zhh);
319
-
320
- &shr ($Zll,20);
321
- &and ($Zll,0xf0);
322
-
323
- if ($unroll) {
324
- &call ("_x86_gmult_4bit_inner");
325
- } else {
326
- &x86_loop(0);
327
- &mov ($inp,&wparam(2));
328
- }
329
- &lea ($inp,&DWP(16,$inp));
330
- &cmp ($inp,&wparam(3));
331
- &mov (&wparam(2),$inp) if (!$unroll);
332
- &jb (&label("x86_outer_loop"));
333
-
334
- &mov ($inp,&wparam(0)); # load Xi
335
- &mov (&DWP(12,$inp),$Zll);
336
- &mov (&DWP(8,$inp),$Zlh);
337
- &mov (&DWP(4,$inp),$Zhl);
338
- &mov (&DWP(0,$inp),$Zhh);
339
- &stack_pop(16+4+1);
340
- &function_end("gcm_ghash_4bit".$suffix);
341
-
342
- if (!$x86only) {{{
343
-
344
- &static_label("rem_4bit");
345
-
346
- if (!$sse2) {{ # pure-MMX "May" version...
347
-
348
- $S=12; # shift factor for rem_4bit
349
-
350
- &function_begin_B("_mmx_gmult_4bit_inner");
351
- # MMX version performs 3.5 times better on P4 (see comment in non-MMX
352
- # routine for further details), 100% better on Opteron, ~70% better
353
- # on Core2 and PIII... In other words effort is considered to be well
354
- # spent... Since initial release the loop was unrolled in order to
355
- # "liberate" register previously used as loop counter. Instead it's
356
- # used to optimize critical path in 'Z.hi ^= rem_4bit[Z.lo&0xf]'.
357
- # The path involves move of Z.lo from MMX to integer register,
358
- # effective address calculation and finally merge of value to Z.hi.
359
- # Reference to rem_4bit is scheduled so late that I had to >>4
360
- # rem_4bit elements. This resulted in 20-45% procent improvement
361
- # on contemporary µ-archs.
362
- {
363
- my $cnt;
364
- my $rem_4bit = "eax";
365
- my @rem = ($Zhh,$Zll);
366
- my $nhi = $Zhl;
367
- my $nlo = $Zlh;
368
-
369
- my ($Zlo,$Zhi) = ("mm0","mm1");
370
- my $tmp = "mm2";
371
-
372
- &xor ($nlo,$nlo); # avoid partial register stalls on PIII
373
- &mov ($nhi,$Zll);
374
- &mov (&LB($nlo),&LB($nhi));
375
- &shl (&LB($nlo),4);
376
- &and ($nhi,0xf0);
377
- &movq ($Zlo,&QWP(8,$Htbl,$nlo));
378
- &movq ($Zhi,&QWP(0,$Htbl,$nlo));
379
- &movd ($rem[0],$Zlo);
380
-
381
- for ($cnt=28;$cnt>=-2;$cnt--) {
382
- my $odd = $cnt&1;
383
- my $nix = $odd ? $nlo : $nhi;
384
-
385
- &shl (&LB($nlo),4) if ($odd);
386
- &psrlq ($Zlo,4);
387
- &movq ($tmp,$Zhi);
388
- &psrlq ($Zhi,4);
389
- &pxor ($Zlo,&QWP(8,$Htbl,$nix));
390
- &mov (&LB($nlo),&BP($cnt/2,$inp)) if (!$odd && $cnt>=0);
391
- &psllq ($tmp,60);
392
- &and ($nhi,0xf0) if ($odd);
393
- &pxor ($Zhi,&QWP(0,$rem_4bit,$rem[1],8)) if ($cnt<28);
394
- &and ($rem[0],0xf);
395
- &pxor ($Zhi,&QWP(0,$Htbl,$nix));
396
- &mov ($nhi,$nlo) if (!$odd && $cnt>=0);
397
- &movd ($rem[1],$Zlo);
398
- &pxor ($Zlo,$tmp);
399
-
400
- push (@rem,shift(@rem)); # "rotate" registers
401
- }
402
-
403
- &mov ($inp,&DWP(4,$rem_4bit,$rem[1],8)); # last rem_4bit[rem]
404
-
405
- &psrlq ($Zlo,32); # lower part of Zlo is already there
406
- &movd ($Zhl,$Zhi);
407
- &psrlq ($Zhi,32);
408
- &movd ($Zlh,$Zlo);
409
- &movd ($Zhh,$Zhi);
410
- &shl ($inp,4); # compensate for rem_4bit[i] being >>4
411
-
412
- &bswap ($Zll);
413
- &bswap ($Zhl);
414
- &bswap ($Zlh);
415
- &xor ($Zhh,$inp);
416
- &bswap ($Zhh);
417
-
418
- &ret ();
419
- }
420
- &function_end_B("_mmx_gmult_4bit_inner");
421
-
422
- &function_begin("gcm_gmult_4bit_mmx");
423
- &mov ($inp,&wparam(0)); # load Xi
424
- &mov ($Htbl,&wparam(1)); # load Htable
425
-
426
- &call (&label("pic_point"));
427
- &set_label("pic_point");
428
- &blindpop("eax");
429
- &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
430
-
431
- &movz ($Zll,&BP(15,$inp));
432
-
433
- &call ("_mmx_gmult_4bit_inner");
434
-
435
- &mov ($inp,&wparam(0)); # load Xi
436
- &emms ();
437
- &mov (&DWP(12,$inp),$Zll);
438
- &mov (&DWP(4,$inp),$Zhl);
439
- &mov (&DWP(8,$inp),$Zlh);
440
- &mov (&DWP(0,$inp),$Zhh);
441
- &function_end("gcm_gmult_4bit_mmx");
442
-
443
- # Streamed version performs 20% better on P4, 7% on Opteron,
444
- # 10% on Core2 and PIII...
445
- &function_begin("gcm_ghash_4bit_mmx");
446
- &mov ($Zhh,&wparam(0)); # load Xi
447
- &mov ($Htbl,&wparam(1)); # load Htable
448
- &mov ($inp,&wparam(2)); # load in
449
- &mov ($Zlh,&wparam(3)); # load len
450
-
451
- &call (&label("pic_point"));
452
- &set_label("pic_point");
453
- &blindpop("eax");
454
- &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
455
-
456
- &add ($Zlh,$inp);
457
- &mov (&wparam(3),$Zlh); # len to point at the end of input
458
- &stack_push(4+1); # +1 for stack alignment
459
-
460
- &mov ($Zll,&DWP(12,$Zhh)); # load Xi[16]
461
- &mov ($Zhl,&DWP(4,$Zhh));
462
- &mov ($Zlh,&DWP(8,$Zhh));
463
- &mov ($Zhh,&DWP(0,$Zhh));
464
- &jmp (&label("mmx_outer_loop"));
465
-
466
- &set_label("mmx_outer_loop",16);
467
- &xor ($Zll,&DWP(12,$inp));
468
- &xor ($Zhl,&DWP(4,$inp));
469
- &xor ($Zlh,&DWP(8,$inp));
470
- &xor ($Zhh,&DWP(0,$inp));
471
- &mov (&wparam(2),$inp);
472
- &mov (&DWP(12,"esp"),$Zll);
473
- &mov (&DWP(4,"esp"),$Zhl);
474
- &mov (&DWP(8,"esp"),$Zlh);
475
- &mov (&DWP(0,"esp"),$Zhh);
476
-
477
- &mov ($inp,"esp");
478
- &shr ($Zll,24);
479
-
480
- &call ("_mmx_gmult_4bit_inner");
481
-
482
- &mov ($inp,&wparam(2));
483
- &lea ($inp,&DWP(16,$inp));
484
- &cmp ($inp,&wparam(3));
485
- &jb (&label("mmx_outer_loop"));
486
-
487
- &mov ($inp,&wparam(0)); # load Xi
488
- &emms ();
489
- &mov (&DWP(12,$inp),$Zll);
490
- &mov (&DWP(4,$inp),$Zhl);
491
- &mov (&DWP(8,$inp),$Zlh);
492
- &mov (&DWP(0,$inp),$Zhh);
493
-
494
- &stack_pop(4+1);
495
- &function_end("gcm_ghash_4bit_mmx");
496
-
497
- }} else {{ # "June" MMX version...
498
- # ... has slower "April" gcm_gmult_4bit_mmx with folded
499
- # loop. This is done to conserve code size...
500
- $S=16; # shift factor for rem_4bit
501
-
502
- sub mmx_loop() {
503
- # MMX version performs 2.8 times better on P4 (see comment in non-MMX
504
- # routine for further details), 40% better on Opteron and Core2, 50%
505
- # better on PIII... In other words effort is considered to be well
506
- # spent...
507
- my $inp = shift;
508
- my $rem_4bit = shift;
509
- my $cnt = $Zhh;
510
- my $nhi = $Zhl;
511
- my $nlo = $Zlh;
512
- my $rem = $Zll;
513
-
514
- my ($Zlo,$Zhi) = ("mm0","mm1");
515
- my $tmp = "mm2";
516
-
517
- &xor ($nlo,$nlo); # avoid partial register stalls on PIII
518
- &mov ($nhi,$Zll);
519
- &mov (&LB($nlo),&LB($nhi));
520
- &mov ($cnt,14);
521
- &shl (&LB($nlo),4);
522
- &and ($nhi,0xf0);
523
- &movq ($Zlo,&QWP(8,$Htbl,$nlo));
524
- &movq ($Zhi,&QWP(0,$Htbl,$nlo));
525
- &movd ($rem,$Zlo);
526
- &jmp (&label("mmx_loop"));
527
-
528
- &set_label("mmx_loop",16);
529
- &psrlq ($Zlo,4);
530
- &and ($rem,0xf);
531
- &movq ($tmp,$Zhi);
532
- &psrlq ($Zhi,4);
533
- &pxor ($Zlo,&QWP(8,$Htbl,$nhi));
534
- &mov (&LB($nlo),&BP(0,$inp,$cnt));
535
- &psllq ($tmp,60);
536
- &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
537
- &dec ($cnt);
538
- &movd ($rem,$Zlo);
539
- &pxor ($Zhi,&QWP(0,$Htbl,$nhi));
540
- &mov ($nhi,$nlo);
541
- &pxor ($Zlo,$tmp);
542
- &js (&label("mmx_break"));
543
-
544
- &shl (&LB($nlo),4);
545
- &and ($rem,0xf);
546
- &psrlq ($Zlo,4);
547
- &and ($nhi,0xf0);
548
- &movq ($tmp,$Zhi);
549
- &psrlq ($Zhi,4);
550
- &pxor ($Zlo,&QWP(8,$Htbl,$nlo));
551
- &psllq ($tmp,60);
552
- &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
553
- &movd ($rem,$Zlo);
554
- &pxor ($Zhi,&QWP(0,$Htbl,$nlo));
555
- &pxor ($Zlo,$tmp);
556
- &jmp (&label("mmx_loop"));
557
-
558
- &set_label("mmx_break",16);
559
- &shl (&LB($nlo),4);
560
- &and ($rem,0xf);
561
- &psrlq ($Zlo,4);
562
- &and ($nhi,0xf0);
563
- &movq ($tmp,$Zhi);
564
- &psrlq ($Zhi,4);
565
- &pxor ($Zlo,&QWP(8,$Htbl,$nlo));
566
- &psllq ($tmp,60);
567
- &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
568
- &movd ($rem,$Zlo);
569
- &pxor ($Zhi,&QWP(0,$Htbl,$nlo));
570
- &pxor ($Zlo,$tmp);
571
-
572
- &psrlq ($Zlo,4);
573
- &and ($rem,0xf);
574
- &movq ($tmp,$Zhi);
575
- &psrlq ($Zhi,4);
576
- &pxor ($Zlo,&QWP(8,$Htbl,$nhi));
577
- &psllq ($tmp,60);
578
- &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
579
- &movd ($rem,$Zlo);
580
- &pxor ($Zhi,&QWP(0,$Htbl,$nhi));
581
- &pxor ($Zlo,$tmp);
582
-
583
- &psrlq ($Zlo,32); # lower part of Zlo is already there
584
- &movd ($Zhl,$Zhi);
585
- &psrlq ($Zhi,32);
586
- &movd ($Zlh,$Zlo);
587
- &movd ($Zhh,$Zhi);
588
-
589
- &bswap ($Zll);
590
- &bswap ($Zhl);
591
- &bswap ($Zlh);
592
- &bswap ($Zhh);
593
- }
594
-
595
- &function_begin("gcm_gmult_4bit_mmx");
596
- &mov ($inp,&wparam(0)); # load Xi
597
- &mov ($Htbl,&wparam(1)); # load Htable
598
-
599
- &call (&label("pic_point"));
600
- &set_label("pic_point");
601
- &blindpop("eax");
602
- &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
603
-
604
- &movz ($Zll,&BP(15,$inp));
605
-
606
- &mmx_loop($inp,"eax");
607
-
608
- &emms ();
609
- &mov (&DWP(12,$inp),$Zll);
610
- &mov (&DWP(4,$inp),$Zhl);
611
- &mov (&DWP(8,$inp),$Zlh);
612
- &mov (&DWP(0,$inp),$Zhh);
613
- &function_end("gcm_gmult_4bit_mmx");
614
-
615
- ######################################################################
616
- # Below subroutine is "528B" variant of "4-bit" GCM GHASH function
617
- # (see gcm128.c for details). It provides further 20-40% performance
618
- # improvement over above mentioned "May" version.
619
-
620
- &static_label("rem_8bit");
621
-
622
- &function_begin("gcm_ghash_4bit_mmx");
623
- { my ($Zlo,$Zhi) = ("mm7","mm6");
624
- my $rem_8bit = "esi";
625
- my $Htbl = "ebx";
626
-
627
- # parameter block
628
- &mov ("eax",&wparam(0)); # Xi
629
- &mov ("ebx",&wparam(1)); # Htable
630
- &mov ("ecx",&wparam(2)); # inp
631
- &mov ("edx",&wparam(3)); # len
632
- &mov ("ebp","esp"); # original %esp
633
- &call (&label("pic_point"));
634
- &set_label ("pic_point");
635
- &blindpop ($rem_8bit);
636
- &lea ($rem_8bit,&DWP(&label("rem_8bit")."-".&label("pic_point"),$rem_8bit));
637
-
638
- &sub ("esp",512+16+16); # allocate stack frame...
639
- &and ("esp",-64); # ...and align it
640
- &sub ("esp",16); # place for (u8)(H[]<<4)
641
-
642
- &add ("edx","ecx"); # pointer to the end of input
643
- &mov (&DWP(528+16+0,"esp"),"eax"); # save Xi
644
- &mov (&DWP(528+16+8,"esp"),"edx"); # save inp+len
645
- &mov (&DWP(528+16+12,"esp"),"ebp"); # save original %esp
646
-
647
- { my @lo = ("mm0","mm1","mm2");
648
- my @hi = ("mm3","mm4","mm5");
649
- my @tmp = ("mm6","mm7");
650
- my ($off1,$off2,$i) = (0,0,);
651
-
652
- &add ($Htbl,128); # optimize for size
653
- &lea ("edi",&DWP(16+128,"esp"));
654
- &lea ("ebp",&DWP(16+256+128,"esp"));
655
-
656
- # decompose Htable (low and high parts are kept separately),
657
- # generate Htable[]>>4, (u8)(Htable[]<<4), save to stack...
658
- for ($i=0;$i<18;$i++) {
659
-
660
- &mov ("edx",&DWP(16*$i+8-128,$Htbl)) if ($i<16);
661
- &movq ($lo[0],&QWP(16*$i+8-128,$Htbl)) if ($i<16);
662
- &psllq ($tmp[1],60) if ($i>1);
663
- &movq ($hi[0],&QWP(16*$i+0-128,$Htbl)) if ($i<16);
664
- &por ($lo[2],$tmp[1]) if ($i>1);
665
- &movq (&QWP($off1-128,"edi"),$lo[1]) if ($i>0 && $i<17);
666
- &psrlq ($lo[1],4) if ($i>0 && $i<17);
667
- &movq (&QWP($off1,"edi"),$hi[1]) if ($i>0 && $i<17);
668
- &movq ($tmp[0],$hi[1]) if ($i>0 && $i<17);
669
- &movq (&QWP($off2-128,"ebp"),$lo[2]) if ($i>1);
670
- &psrlq ($hi[1],4) if ($i>0 && $i<17);
671
- &movq (&QWP($off2,"ebp"),$hi[2]) if ($i>1);
672
- &shl ("edx",4) if ($i<16);
673
- &mov (&BP($i,"esp"),&LB("edx")) if ($i<16);
674
-
675
- unshift (@lo,pop(@lo)); # "rotate" registers
676
- unshift (@hi,pop(@hi));
677
- unshift (@tmp,pop(@tmp));
678
- $off1 += 8 if ($i>0);
679
- $off2 += 8 if ($i>1);
680
- }
681
- }
682
-
683
- &movq ($Zhi,&QWP(0,"eax"));
684
- &mov ("ebx",&DWP(8,"eax"));
685
- &mov ("edx",&DWP(12,"eax")); # load Xi
686
-
687
- &set_label("outer",16);
688
- { my $nlo = "eax";
689
- my $dat = "edx";
690
- my @nhi = ("edi","ebp");
691
- my @rem = ("ebx","ecx");
692
- my @red = ("mm0","mm1","mm2");
693
- my $tmp = "mm3";
694
-
695
- &xor ($dat,&DWP(12,"ecx")); # merge input data
696
- &xor ("ebx",&DWP(8,"ecx"));
697
- &pxor ($Zhi,&QWP(0,"ecx"));
698
- &lea ("ecx",&DWP(16,"ecx")); # inp+=16
699
- #&mov (&DWP(528+12,"esp"),$dat); # save inp^Xi
700
- &mov (&DWP(528+8,"esp"),"ebx");
701
- &movq (&QWP(528+0,"esp"),$Zhi);
702
- &mov (&DWP(528+16+4,"esp"),"ecx"); # save inp
703
-
704
- &xor ($nlo,$nlo);
705
- &rol ($dat,8);
706
- &mov (&LB($nlo),&LB($dat));
707
- &mov ($nhi[1],$nlo);
708
- &and (&LB($nlo),0x0f);
709
- &shr ($nhi[1],4);
710
- &pxor ($red[0],$red[0]);
711
- &rol ($dat,8); # next byte
712
- &pxor ($red[1],$red[1]);
713
- &pxor ($red[2],$red[2]);
714
-
715
- # Just like in "May" verson modulo-schedule for critical path in
716
- # 'Z.hi ^= rem_8bit[Z.lo&0xff^((u8)H[nhi]<<4)]<<48'. Final 'pxor'
717
- # is scheduled so late that rem_8bit[] has to be shifted *right*
718
- # by 16, which is why last argument to pinsrw is 2, which
719
- # corresponds to <<32=<<48>>16...
720
- for ($j=11,$i=0;$i<15;$i++) {
721
-
722
- if ($i>0) {
723
- &pxor ($Zlo,&QWP(16,"esp",$nlo,8)); # Z^=H[nlo]
724
- &rol ($dat,8); # next byte
725
- &pxor ($Zhi,&QWP(16+128,"esp",$nlo,8));
726
-
727
- &pxor ($Zlo,$tmp);
728
- &pxor ($Zhi,&QWP(16+256+128,"esp",$nhi[0],8));
729
- &xor (&LB($rem[1]),&BP(0,"esp",$nhi[0])); # rem^(H[nhi]<<4)
730
- } else {
731
- &movq ($Zlo,&QWP(16,"esp",$nlo,8));
732
- &movq ($Zhi,&QWP(16+128,"esp",$nlo,8));
733
- }
734
-
735
- &mov (&LB($nlo),&LB($dat));
736
- &mov ($dat,&DWP(528+$j,"esp")) if (--$j%4==0);
737
-
738
- &movd ($rem[0],$Zlo);
739
- &movz ($rem[1],&LB($rem[1])) if ($i>0);
740
- &psrlq ($Zlo,8); # Z>>=8
741
-
742
- &movq ($tmp,$Zhi);
743
- &mov ($nhi[0],$nlo);
744
- &psrlq ($Zhi,8);
745
-
746
- &pxor ($Zlo,&QWP(16+256+0,"esp",$nhi[1],8)); # Z^=H[nhi]>>4
747
- &and (&LB($nlo),0x0f);
748
- &psllq ($tmp,56);
749
-
750
- &pxor ($Zhi,$red[1]) if ($i>1);
751
- &shr ($nhi[0],4);
752
- &pinsrw ($red[0],&WP(0,$rem_8bit,$rem[1],2),2) if ($i>0);
753
-
754
- unshift (@red,pop(@red)); # "rotate" registers
755
- unshift (@rem,pop(@rem));
756
- unshift (@nhi,pop(@nhi));
757
- }
758
-
759
- &pxor ($Zlo,&QWP(16,"esp",$nlo,8)); # Z^=H[nlo]
760
- &pxor ($Zhi,&QWP(16+128,"esp",$nlo,8));
761
- &xor (&LB($rem[1]),&BP(0,"esp",$nhi[0])); # rem^(H[nhi]<<4)
762
-
763
- &pxor ($Zlo,$tmp);
764
- &pxor ($Zhi,&QWP(16+256+128,"esp",$nhi[0],8));
765
- &movz ($rem[1],&LB($rem[1]));
766
-
767
- &pxor ($red[2],$red[2]); # clear 2nd word
768
- &psllq ($red[1],4);
769
-
770
- &movd ($rem[0],$Zlo);
771
- &psrlq ($Zlo,4); # Z>>=4
772
-
773
- &movq ($tmp,$Zhi);
774
- &psrlq ($Zhi,4);
775
- &shl ($rem[0],4); # rem<<4
776
-
777
- &pxor ($Zlo,&QWP(16,"esp",$nhi[1],8)); # Z^=H[nhi]
778
- &psllq ($tmp,60);
779
- &movz ($rem[0],&LB($rem[0]));
780
-
781
- &pxor ($Zlo,$tmp);
782
- &pxor ($Zhi,&QWP(16+128,"esp",$nhi[1],8));
783
-
784
- &pinsrw ($red[0],&WP(0,$rem_8bit,$rem[1],2),2);
785
- &pxor ($Zhi,$red[1]);
786
-
787
- &movd ($dat,$Zlo);
788
- &pinsrw ($red[2],&WP(0,$rem_8bit,$rem[0],2),3); # last is <<48
789
-
790
- &psllq ($red[0],12); # correct by <<16>>4
791
- &pxor ($Zhi,$red[0]);
792
- &psrlq ($Zlo,32);
793
- &pxor ($Zhi,$red[2]);
794
-
795
- &mov ("ecx",&DWP(528+16+4,"esp")); # restore inp
796
- &movd ("ebx",$Zlo);
797
- &movq ($tmp,$Zhi); # 01234567
798
- &psllw ($Zhi,8); # 1.3.5.7.
799
- &psrlw ($tmp,8); # .0.2.4.6
800
- &por ($Zhi,$tmp); # 10325476
801
- &bswap ($dat);
802
- &pshufw ($Zhi,$Zhi,0b00011011); # 76543210
803
- &bswap ("ebx");
804
-
805
- &cmp ("ecx",&DWP(528+16+8,"esp")); # are we done?
806
- &jne (&label("outer"));
807
- }
808
-
809
- &mov ("eax",&DWP(528+16+0,"esp")); # restore Xi
810
- &mov (&DWP(12,"eax"),"edx");
811
- &mov (&DWP(8,"eax"),"ebx");
812
- &movq (&QWP(0,"eax"),$Zhi);
813
-
814
- &mov ("esp",&DWP(528+16+12,"esp")); # restore original %esp
815
- &emms ();
816
- }
817
- &function_end("gcm_ghash_4bit_mmx");
818
- }}
819
-
820
- if ($sse2) {{
821
- ######################################################################
822
- # PCLMULQDQ version.
823
-
824
- $Xip="eax";
825
- $Htbl="edx";
826
- $const="ecx";
827
- $inp="esi";
828
- $len="ebx";
829
-
830
- ($Xi,$Xhi)=("xmm0","xmm1"); $Hkey="xmm2";
831
- ($T1,$T2,$T3)=("xmm3","xmm4","xmm5");
832
- ($Xn,$Xhn)=("xmm6","xmm7");
833
-
834
- &static_label("bswap");
835
-
836
- sub clmul64x64_T2 { # minimal "register" pressure
837
- my ($Xhi,$Xi,$Hkey,$HK)=@_;
838
-
839
- &movdqa ($Xhi,$Xi); #
840
- &pshufd ($T1,$Xi,0b01001110);
841
- &pshufd ($T2,$Hkey,0b01001110) if (!defined($HK));
842
- &pxor ($T1,$Xi); #
843
- &pxor ($T2,$Hkey) if (!defined($HK));
844
- $HK=$T2 if (!defined($HK));
845
-
846
- &pclmulqdq ($Xi,$Hkey,0x00); #######
847
- &pclmulqdq ($Xhi,$Hkey,0x11); #######
848
- &pclmulqdq ($T1,$HK,0x00); #######
849
- &xorps ($T1,$Xi); #
850
- &xorps ($T1,$Xhi); #
851
-
852
- &movdqa ($T2,$T1); #
853
- &psrldq ($T1,8);
854
- &pslldq ($T2,8); #
855
- &pxor ($Xhi,$T1);
856
- &pxor ($Xi,$T2); #
857
- }
858
-
859
- sub clmul64x64_T3 {
860
- # Even though this subroutine offers visually better ILP, it
861
- # was empirically found to be a tad slower than above version.
862
- # At least in gcm_ghash_clmul context. But it's just as well,
863
- # because loop modulo-scheduling is possible only thanks to
864
- # minimized "register" pressure...
865
- my ($Xhi,$Xi,$Hkey)=@_;
866
-
867
- &movdqa ($T1,$Xi); #
868
- &movdqa ($Xhi,$Xi);
869
- &pclmulqdq ($Xi,$Hkey,0x00); #######
870
- &pclmulqdq ($Xhi,$Hkey,0x11); #######
871
- &pshufd ($T2,$T1,0b01001110); #
872
- &pshufd ($T3,$Hkey,0b01001110);
873
- &pxor ($T2,$T1); #
874
- &pxor ($T3,$Hkey);
875
- &pclmulqdq ($T2,$T3,0x00); #######
876
- &pxor ($T2,$Xi); #
877
- &pxor ($T2,$Xhi); #
878
-
879
- &movdqa ($T3,$T2); #
880
- &psrldq ($T2,8);
881
- &pslldq ($T3,8); #
882
- &pxor ($Xhi,$T2);
883
- &pxor ($Xi,$T3); #
884
- }
885
-
886
- if (1) { # Algorithm 9 with <<1 twist.
887
- # Reduction is shorter and uses only two
888
- # temporary registers, which makes it better
889
- # candidate for interleaving with 64x64
890
- # multiplication. Pre-modulo-scheduled loop
891
- # was found to be ~20% faster than Algorithm 5
892
- # below. Algorithm 9 was therefore chosen for
893
- # further optimization...
894
-
895
- sub reduction_alg9 { # 17/11 times faster than Intel version
896
- my ($Xhi,$Xi) = @_;
897
-
898
- # 1st phase
899
- &movdqa ($T2,$Xi); #
900
- &movdqa ($T1,$Xi);
901
- &psllq ($Xi,5);
902
- &pxor ($T1,$Xi); #
903
- &psllq ($Xi,1);
904
- &pxor ($Xi,$T1); #
905
- &psllq ($Xi,57); #
906
- &movdqa ($T1,$Xi); #
907
- &pslldq ($Xi,8);
908
- &psrldq ($T1,8); #
909
- &pxor ($Xi,$T2);
910
- &pxor ($Xhi,$T1); #
911
-
912
- # 2nd phase
913
- &movdqa ($T2,$Xi);
914
- &psrlq ($Xi,1);
915
- &pxor ($Xhi,$T2); #
916
- &pxor ($T2,$Xi);
917
- &psrlq ($Xi,5);
918
- &pxor ($Xi,$T2); #
919
- &psrlq ($Xi,1); #
920
- &pxor ($Xi,$Xhi) #
921
- }
922
-
923
- &function_begin_B("gcm_init_clmul");
924
- &mov ($Htbl,&wparam(0));
925
- &mov ($Xip,&wparam(1));
926
-
927
- &call (&label("pic"));
928
- &set_label("pic");
929
- &blindpop ($const);
930
- &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
931
-
932
- &movdqu ($Hkey,&QWP(0,$Xip));
933
- &pshufd ($Hkey,$Hkey,0b01001110);# dword swap
934
-
935
- # <<1 twist
936
- &pshufd ($T2,$Hkey,0b11111111); # broadcast uppermost dword
937
- &movdqa ($T1,$Hkey);
938
- &psllq ($Hkey,1);
939
- &pxor ($T3,$T3); #
940
- &psrlq ($T1,63);
941
- &pcmpgtd ($T3,$T2); # broadcast carry bit
942
- &pslldq ($T1,8);
943
- &por ($Hkey,$T1); # H<<=1
944
-
945
- # magic reduction
946
- &pand ($T3,&QWP(16,$const)); # 0x1c2_polynomial
947
- &pxor ($Hkey,$T3); # if(carry) H^=0x1c2_polynomial
948
-
949
- # calculate H^2
950
- &movdqa ($Xi,$Hkey);
951
- &clmul64x64_T2 ($Xhi,$Xi,$Hkey);
952
- &reduction_alg9 ($Xhi,$Xi);
953
-
954
- &pshufd ($T1,$Hkey,0b01001110);
955
- &pshufd ($T2,$Xi,0b01001110);
956
- &pxor ($T1,$Hkey); # Karatsuba pre-processing
957
- &movdqu (&QWP(0,$Htbl),$Hkey); # save H
958
- &pxor ($T2,$Xi); # Karatsuba pre-processing
959
- &movdqu (&QWP(16,$Htbl),$Xi); # save H^2
960
- &palignr ($T2,$T1,8); # low part is H.lo^H.hi
961
- &movdqu (&QWP(32,$Htbl),$T2); # save Karatsuba "salt"
962
-
963
- &ret ();
964
- &function_end_B("gcm_init_clmul");
965
-
966
- &function_begin_B("gcm_gmult_clmul");
967
- &mov ($Xip,&wparam(0));
968
- &mov ($Htbl,&wparam(1));
969
-
970
- &call (&label("pic"));
971
- &set_label("pic");
972
- &blindpop ($const);
973
- &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
974
-
975
- &movdqu ($Xi,&QWP(0,$Xip));
976
- &movdqa ($T3,&QWP(0,$const));
977
- &movups ($Hkey,&QWP(0,$Htbl));
978
- &pshufb ($Xi,$T3);
979
- &movups ($T2,&QWP(32,$Htbl));
980
-
981
- &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$T2);
982
- &reduction_alg9 ($Xhi,$Xi);
983
-
984
- &pshufb ($Xi,$T3);
985
- &movdqu (&QWP(0,$Xip),$Xi);
986
-
987
- &ret ();
988
- &function_end_B("gcm_gmult_clmul");
989
-
990
- &function_begin("gcm_ghash_clmul");
991
- &mov ($Xip,&wparam(0));
992
- &mov ($Htbl,&wparam(1));
993
- &mov ($inp,&wparam(2));
994
- &mov ($len,&wparam(3));
995
-
996
- &call (&label("pic"));
997
- &set_label("pic");
998
- &blindpop ($const);
999
- &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
1000
-
1001
- &movdqu ($Xi,&QWP(0,$Xip));
1002
- &movdqa ($T3,&QWP(0,$const));
1003
- &movdqu ($Hkey,&QWP(0,$Htbl));
1004
- &pshufb ($Xi,$T3);
1005
-
1006
- &sub ($len,0x10);
1007
- &jz (&label("odd_tail"));
1008
-
1009
- #######
1010
- # Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
1011
- # [(H*Ii+1) + (H*Xi+1)] mod P =
1012
- # [(H*Ii+1) + H^2*(Ii+Xi)] mod P
1013
- #
1014
- &movdqu ($T1,&QWP(0,$inp)); # Ii
1015
- &movdqu ($Xn,&QWP(16,$inp)); # Ii+1
1016
- &pshufb ($T1,$T3);
1017
- &pshufb ($Xn,$T3);
1018
- &movdqu ($T3,&QWP(32,$Htbl));
1019
- &pxor ($Xi,$T1); # Ii+Xi
1020
-
1021
- &pshufd ($T1,$Xn,0b01001110); # H*Ii+1
1022
- &movdqa ($Xhn,$Xn);
1023
- &pxor ($T1,$Xn); #
1024
- &lea ($inp,&DWP(32,$inp)); # i+=2
1025
-
1026
- &pclmulqdq ($Xn,$Hkey,0x00); #######
1027
- &pclmulqdq ($Xhn,$Hkey,0x11); #######
1028
- &pclmulqdq ($T1,$T3,0x00); #######
1029
- &movups ($Hkey,&QWP(16,$Htbl)); # load H^2
1030
- &nop ();
1031
-
1032
- &sub ($len,0x20);
1033
- &jbe (&label("even_tail"));
1034
- &jmp (&label("mod_loop"));
1035
-
1036
- &set_label("mod_loop",32);
1037
- &pshufd ($T2,$Xi,0b01001110); # H^2*(Ii+Xi)
1038
- &movdqa ($Xhi,$Xi);
1039
- &pxor ($T2,$Xi); #
1040
- &nop ();
1041
-
1042
- &pclmulqdq ($Xi,$Hkey,0x00); #######
1043
- &pclmulqdq ($Xhi,$Hkey,0x11); #######
1044
- &pclmulqdq ($T2,$T3,0x10); #######
1045
- &movups ($Hkey,&QWP(0,$Htbl)); # load H
1046
-
1047
- &xorps ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
1048
- &movdqa ($T3,&QWP(0,$const));
1049
- &xorps ($Xhi,$Xhn);
1050
- &movdqu ($Xhn,&QWP(0,$inp)); # Ii
1051
- &pxor ($T1,$Xi); # aggregated Karatsuba post-processing
1052
- &movdqu ($Xn,&QWP(16,$inp)); # Ii+1
1053
- &pxor ($T1,$Xhi); #
1054
-
1055
- &pshufb ($Xhn,$T3);
1056
- &pxor ($T2,$T1); #
1057
-
1058
- &movdqa ($T1,$T2); #
1059
- &psrldq ($T2,8);
1060
- &pslldq ($T1,8); #
1061
- &pxor ($Xhi,$T2);
1062
- &pxor ($Xi,$T1); #
1063
- &pshufb ($Xn,$T3);
1064
- &pxor ($Xhi,$Xhn); # "Ii+Xi", consume early
1065
-
1066
- &movdqa ($Xhn,$Xn); #&clmul64x64_TX ($Xhn,$Xn,$Hkey); H*Ii+1
1067
- &movdqa ($T2,$Xi); #&reduction_alg9($Xhi,$Xi); 1st phase
1068
- &movdqa ($T1,$Xi);
1069
- &psllq ($Xi,5);
1070
- &pxor ($T1,$Xi); #
1071
- &psllq ($Xi,1);
1072
- &pxor ($Xi,$T1); #
1073
- &pclmulqdq ($Xn,$Hkey,0x00); #######
1074
- &movups ($T3,&QWP(32,$Htbl));
1075
- &psllq ($Xi,57); #
1076
- &movdqa ($T1,$Xi); #
1077
- &pslldq ($Xi,8);
1078
- &psrldq ($T1,8); #
1079
- &pxor ($Xi,$T2);
1080
- &pxor ($Xhi,$T1); #
1081
- &pshufd ($T1,$Xhn,0b01001110);
1082
- &movdqa ($T2,$Xi); # 2nd phase
1083
- &psrlq ($Xi,1);
1084
- &pxor ($T1,$Xhn);
1085
- &pxor ($Xhi,$T2); #
1086
- &pclmulqdq ($Xhn,$Hkey,0x11); #######
1087
- &movups ($Hkey,&QWP(16,$Htbl)); # load H^2
1088
- &pxor ($T2,$Xi);
1089
- &psrlq ($Xi,5);
1090
- &pxor ($Xi,$T2); #
1091
- &psrlq ($Xi,1); #
1092
- &pxor ($Xi,$Xhi) #
1093
- &pclmulqdq ($T1,$T3,0x00); #######
1094
-
1095
- &lea ($inp,&DWP(32,$inp));
1096
- &sub ($len,0x20);
1097
- &ja (&label("mod_loop"));
1098
-
1099
- &set_label("even_tail");
1100
- &pshufd ($T2,$Xi,0b01001110); # H^2*(Ii+Xi)
1101
- &movdqa ($Xhi,$Xi);
1102
- &pxor ($T2,$Xi); #
1103
-
1104
- &pclmulqdq ($Xi,$Hkey,0x00); #######
1105
- &pclmulqdq ($Xhi,$Hkey,0x11); #######
1106
- &pclmulqdq ($T2,$T3,0x10); #######
1107
- &movdqa ($T3,&QWP(0,$const));
1108
-
1109
- &xorps ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
1110
- &xorps ($Xhi,$Xhn);
1111
- &pxor ($T1,$Xi); # aggregated Karatsuba post-processing
1112
- &pxor ($T1,$Xhi); #
1113
-
1114
- &pxor ($T2,$T1); #
1115
-
1116
- &movdqa ($T1,$T2); #
1117
- &psrldq ($T2,8);
1118
- &pslldq ($T1,8); #
1119
- &pxor ($Xhi,$T2);
1120
- &pxor ($Xi,$T1); #
1121
-
1122
- &reduction_alg9 ($Xhi,$Xi);
1123
-
1124
- &test ($len,$len);
1125
- &jnz (&label("done"));
1126
-
1127
- &movups ($Hkey,&QWP(0,$Htbl)); # load H
1128
- &set_label("odd_tail");
1129
- &movdqu ($T1,&QWP(0,$inp)); # Ii
1130
- &pshufb ($T1,$T3);
1131
- &pxor ($Xi,$T1); # Ii+Xi
1132
-
1133
- &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi)
1134
- &reduction_alg9 ($Xhi,$Xi);
1135
-
1136
- &set_label("done");
1137
- &pshufb ($Xi,$T3);
1138
- &movdqu (&QWP(0,$Xip),$Xi);
1139
- &function_end("gcm_ghash_clmul");
1140
-
1141
- } else { # Algorith 5. Kept for reference purposes.
1142
-
1143
- sub reduction_alg5 { # 19/16 times faster than Intel version
1144
- my ($Xhi,$Xi)=@_;
1145
-
1146
- # <<1
1147
- &movdqa ($T1,$Xi); #
1148
- &movdqa ($T2,$Xhi);
1149
- &pslld ($Xi,1);
1150
- &pslld ($Xhi,1); #
1151
- &psrld ($T1,31);
1152
- &psrld ($T2,31); #
1153
- &movdqa ($T3,$T1);
1154
- &pslldq ($T1,4);
1155
- &psrldq ($T3,12); #
1156
- &pslldq ($T2,4);
1157
- &por ($Xhi,$T3); #
1158
- &por ($Xi,$T1);
1159
- &por ($Xhi,$T2); #
1160
-
1161
- # 1st phase
1162
- &movdqa ($T1,$Xi);
1163
- &movdqa ($T2,$Xi);
1164
- &movdqa ($T3,$Xi); #
1165
- &pslld ($T1,31);
1166
- &pslld ($T2,30);
1167
- &pslld ($Xi,25); #
1168
- &pxor ($T1,$T2);
1169
- &pxor ($T1,$Xi); #
1170
- &movdqa ($T2,$T1); #
1171
- &pslldq ($T1,12);
1172
- &psrldq ($T2,4); #
1173
- &pxor ($T3,$T1);
1174
-
1175
- # 2nd phase
1176
- &pxor ($Xhi,$T3); #
1177
- &movdqa ($Xi,$T3);
1178
- &movdqa ($T1,$T3);
1179
- &psrld ($Xi,1); #
1180
- &psrld ($T1,2);
1181
- &psrld ($T3,7); #
1182
- &pxor ($Xi,$T1);
1183
- &pxor ($Xhi,$T2);
1184
- &pxor ($Xi,$T3); #
1185
- &pxor ($Xi,$Xhi); #
1186
- }
1187
-
1188
- &function_begin_B("gcm_init_clmul");
1189
- &mov ($Htbl,&wparam(0));
1190
- &mov ($Xip,&wparam(1));
1191
-
1192
- &call (&label("pic"));
1193
- &set_label("pic");
1194
- &blindpop ($const);
1195
- &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
1196
-
1197
- &movdqu ($Hkey,&QWP(0,$Xip));
1198
- &pshufd ($Hkey,$Hkey,0b01001110);# dword swap
1199
-
1200
- # calculate H^2
1201
- &movdqa ($Xi,$Hkey);
1202
- &clmul64x64_T3 ($Xhi,$Xi,$Hkey);
1203
- &reduction_alg5 ($Xhi,$Xi);
1204
-
1205
- &movdqu (&QWP(0,$Htbl),$Hkey); # save H
1206
- &movdqu (&QWP(16,$Htbl),$Xi); # save H^2
1207
-
1208
- &ret ();
1209
- &function_end_B("gcm_init_clmul");
1210
-
1211
- &function_begin_B("gcm_gmult_clmul");
1212
- &mov ($Xip,&wparam(0));
1213
- &mov ($Htbl,&wparam(1));
1214
-
1215
- &call (&label("pic"));
1216
- &set_label("pic");
1217
- &blindpop ($const);
1218
- &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
1219
-
1220
- &movdqu ($Xi,&QWP(0,$Xip));
1221
- &movdqa ($Xn,&QWP(0,$const));
1222
- &movdqu ($Hkey,&QWP(0,$Htbl));
1223
- &pshufb ($Xi,$Xn);
1224
-
1225
- &clmul64x64_T3 ($Xhi,$Xi,$Hkey);
1226
- &reduction_alg5 ($Xhi,$Xi);
1227
-
1228
- &pshufb ($Xi,$Xn);
1229
- &movdqu (&QWP(0,$Xip),$Xi);
1230
-
1231
- &ret ();
1232
- &function_end_B("gcm_gmult_clmul");
1233
-
1234
- &function_begin("gcm_ghash_clmul");
1235
- &mov ($Xip,&wparam(0));
1236
- &mov ($Htbl,&wparam(1));
1237
- &mov ($inp,&wparam(2));
1238
- &mov ($len,&wparam(3));
1239
-
1240
- &call (&label("pic"));
1241
- &set_label("pic");
1242
- &blindpop ($const);
1243
- &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
1244
-
1245
- &movdqu ($Xi,&QWP(0,$Xip));
1246
- &movdqa ($T3,&QWP(0,$const));
1247
- &movdqu ($Hkey,&QWP(0,$Htbl));
1248
- &pshufb ($Xi,$T3);
1249
-
1250
- &sub ($len,0x10);
1251
- &jz (&label("odd_tail"));
1252
-
1253
- #######
1254
- # Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
1255
- # [(H*Ii+1) + (H*Xi+1)] mod P =
1256
- # [(H*Ii+1) + H^2*(Ii+Xi)] mod P
1257
- #
1258
- &movdqu ($T1,&QWP(0,$inp)); # Ii
1259
- &movdqu ($Xn,&QWP(16,$inp)); # Ii+1
1260
- &pshufb ($T1,$T3);
1261
- &pshufb ($Xn,$T3);
1262
- &pxor ($Xi,$T1); # Ii+Xi
1263
-
1264
- &clmul64x64_T3 ($Xhn,$Xn,$Hkey); # H*Ii+1
1265
- &movdqu ($Hkey,&QWP(16,$Htbl)); # load H^2
1266
-
1267
- &sub ($len,0x20);
1268
- &lea ($inp,&DWP(32,$inp)); # i+=2
1269
- &jbe (&label("even_tail"));
1270
-
1271
- &set_label("mod_loop");
1272
- &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
1273
- &movdqu ($Hkey,&QWP(0,$Htbl)); # load H
1274
-
1275
- &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
1276
- &pxor ($Xhi,$Xhn);
1277
-
1278
- &reduction_alg5 ($Xhi,$Xi);
1279
-
1280
- #######
1281
- &movdqa ($T3,&QWP(0,$const));
1282
- &movdqu ($T1,&QWP(0,$inp)); # Ii
1283
- &movdqu ($Xn,&QWP(16,$inp)); # Ii+1
1284
- &pshufb ($T1,$T3);
1285
- &pshufb ($Xn,$T3);
1286
- &pxor ($Xi,$T1); # Ii+Xi
1287
-
1288
- &clmul64x64_T3 ($Xhn,$Xn,$Hkey); # H*Ii+1
1289
- &movdqu ($Hkey,&QWP(16,$Htbl)); # load H^2
1290
-
1291
- &sub ($len,0x20);
1292
- &lea ($inp,&DWP(32,$inp));
1293
- &ja (&label("mod_loop"));
1294
-
1295
- &set_label("even_tail");
1296
- &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
1297
-
1298
- &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
1299
- &pxor ($Xhi,$Xhn);
1300
-
1301
- &reduction_alg5 ($Xhi,$Xi);
1302
-
1303
- &movdqa ($T3,&QWP(0,$const));
1304
- &test ($len,$len);
1305
- &jnz (&label("done"));
1306
-
1307
- &movdqu ($Hkey,&QWP(0,$Htbl)); # load H
1308
- &set_label("odd_tail");
1309
- &movdqu ($T1,&QWP(0,$inp)); # Ii
1310
- &pshufb ($T1,$T3);
1311
- &pxor ($Xi,$T1); # Ii+Xi
1312
-
1313
- &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi)
1314
- &reduction_alg5 ($Xhi,$Xi);
1315
-
1316
- &movdqa ($T3,&QWP(0,$const));
1317
- &set_label("done");
1318
- &pshufb ($Xi,$T3);
1319
- &movdqu (&QWP(0,$Xip),$Xi);
1320
- &function_end("gcm_ghash_clmul");
1321
-
1322
- }
1323
-
1324
- &set_label("bswap",64);
1325
- &data_byte(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
1326
- &data_byte(1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2); # 0x1c2_polynomial
1327
- &set_label("rem_8bit",64);
1328
- &data_short(0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E);
1329
- &data_short(0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E);
1330
- &data_short(0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E);
1331
- &data_short(0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E);
1332
- &data_short(0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E);
1333
- &data_short(0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E);
1334
- &data_short(0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E);
1335
- &data_short(0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E);
1336
- &data_short(0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE);
1337
- &data_short(0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE);
1338
- &data_short(0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE);
1339
- &data_short(0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE);
1340
- &data_short(0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E);
1341
- &data_short(0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E);
1342
- &data_short(0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE);
1343
- &data_short(0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE);
1344
- &data_short(0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E);
1345
- &data_short(0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E);
1346
- &data_short(0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E);
1347
- &data_short(0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E);
1348
- &data_short(0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E);
1349
- &data_short(0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E);
1350
- &data_short(0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E);
1351
- &data_short(0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E);
1352
- &data_short(0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE);
1353
- &data_short(0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE);
1354
- &data_short(0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE);
1355
- &data_short(0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE);
1356
- &data_short(0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E);
1357
- &data_short(0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E);
1358
- &data_short(0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE);
1359
- &data_short(0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE);
1360
- }} # $sse2
1361
-
1362
- &set_label("rem_4bit",64);
1363
- &data_word(0,0x0000<<$S,0,0x1C20<<$S,0,0x3840<<$S,0,0x2460<<$S);
1364
- &data_word(0,0x7080<<$S,0,0x6CA0<<$S,0,0x48C0<<$S,0,0x54E0<<$S);
1365
- &data_word(0,0xE100<<$S,0,0xFD20<<$S,0,0xD940<<$S,0,0xC560<<$S);
1366
- &data_word(0,0x9180<<$S,0,0x8DA0<<$S,0,0xA9C0<<$S,0,0xB5E0<<$S);
1367
- }}} # !$x86only
1368
-
1369
- &asciz("GHASH for x86, CRYPTOGAMS by <appro\@openssl.org>");
1370
- &asm_finish();
1371
-
1372
- # A question was risen about choice of vanilla MMX. Or rather why wasn't
1373
- # SSE2 chosen instead? In addition to the fact that MMX runs on legacy
1374
- # CPUs such as PIII, "4-bit" MMX version was observed to provide better
1375
- # performance than *corresponding* SSE2 one even on contemporary CPUs.
1376
- # SSE2 results were provided by Peter-Michael Hager. He maintains SSE2
1377
- # implementation featuring full range of lookup-table sizes, but with
1378
- # per-invocation lookup table setup. Latter means that table size is
1379
- # chosen depending on how much data is to be hashed in every given call,
1380
- # more data - larger table. Best reported result for Core2 is ~4 cycles
1381
- # per processed byte out of 64KB block. This number accounts even for
1382
- # 64KB table setup overhead. As discussed in gcm128.c we choose to be
1383
- # more conservative in respect to lookup table sizes, but how do the
1384
- # results compare? Minimalistic "256B" MMX version delivers ~11 cycles
1385
- # on same platform. As also discussed in gcm128.c, next in line "8-bit
1386
- # Shoup's" or "4KB" method should deliver twice the performance of
1387
- # "256B" one, in other words not worse than ~6 cycles per byte. It
1388
- # should be also be noted that in SSE2 case improvement can be "super-
1389
- # linear," i.e. more than twice, mostly because >>8 maps to single
1390
- # instruction on SSE2 register. This is unlike "4-bit" case when >>4
1391
- # maps to same amount of instructions in both MMX and SSE2 cases.
1392
- # Bottom line is that switch to SSE2 is considered to be justifiable
1393
- # only in case we choose to implement "8-bit" method...