ring-native 0.0.0

Files changed (261)
  1. checksums.yaml +7 -0
  2. data/.gitignore +9 -0
  3. data/Gemfile +3 -0
  4. data/README.md +22 -0
  5. data/Rakefile +1 -0
  6. data/ext/ring/extconf.rb +29 -0
  7. data/lib/ring/native.rb +8 -0
  8. data/lib/ring/native/version.rb +5 -0
  9. data/ring-native.gemspec +25 -0
  10. data/vendor/ring/BUILDING.md +40 -0
  11. data/vendor/ring/Cargo.toml +43 -0
  12. data/vendor/ring/LICENSE +185 -0
  13. data/vendor/ring/Makefile +35 -0
  14. data/vendor/ring/PORTING.md +163 -0
  15. data/vendor/ring/README.md +113 -0
  16. data/vendor/ring/STYLE.md +197 -0
  17. data/vendor/ring/appveyor.yml +27 -0
  18. data/vendor/ring/build.rs +108 -0
  19. data/vendor/ring/crypto/aes/aes.c +1142 -0
  20. data/vendor/ring/crypto/aes/aes_test.Windows.vcxproj +25 -0
  21. data/vendor/ring/crypto/aes/aes_test.cc +93 -0
  22. data/vendor/ring/crypto/aes/asm/aes-586.pl +2368 -0
  23. data/vendor/ring/crypto/aes/asm/aes-armv4.pl +1249 -0
  24. data/vendor/ring/crypto/aes/asm/aes-x86_64.pl +2246 -0
  25. data/vendor/ring/crypto/aes/asm/aesni-x86.pl +1318 -0
  26. data/vendor/ring/crypto/aes/asm/aesni-x86_64.pl +2084 -0
  27. data/vendor/ring/crypto/aes/asm/aesv8-armx.pl +675 -0
  28. data/vendor/ring/crypto/aes/asm/bsaes-armv7.pl +1364 -0
  29. data/vendor/ring/crypto/aes/asm/bsaes-x86_64.pl +1565 -0
  30. data/vendor/ring/crypto/aes/asm/vpaes-x86.pl +841 -0
  31. data/vendor/ring/crypto/aes/asm/vpaes-x86_64.pl +1116 -0
  32. data/vendor/ring/crypto/aes/internal.h +87 -0
  33. data/vendor/ring/crypto/aes/mode_wrappers.c +61 -0
  34. data/vendor/ring/crypto/bn/add.c +394 -0
  35. data/vendor/ring/crypto/bn/asm/armv4-mont.pl +694 -0
  36. data/vendor/ring/crypto/bn/asm/armv8-mont.pl +1503 -0
  37. data/vendor/ring/crypto/bn/asm/bn-586.pl +774 -0
  38. data/vendor/ring/crypto/bn/asm/co-586.pl +287 -0
  39. data/vendor/ring/crypto/bn/asm/rsaz-avx2.pl +1882 -0
  40. data/vendor/ring/crypto/bn/asm/x86-mont.pl +592 -0
  41. data/vendor/ring/crypto/bn/asm/x86_64-gcc.c +599 -0
  42. data/vendor/ring/crypto/bn/asm/x86_64-mont.pl +1393 -0
  43. data/vendor/ring/crypto/bn/asm/x86_64-mont5.pl +3507 -0
  44. data/vendor/ring/crypto/bn/bn.c +352 -0
  45. data/vendor/ring/crypto/bn/bn_asn1.c +74 -0
  46. data/vendor/ring/crypto/bn/bn_test.Windows.vcxproj +25 -0
  47. data/vendor/ring/crypto/bn/bn_test.cc +1696 -0
  48. data/vendor/ring/crypto/bn/cmp.c +200 -0
  49. data/vendor/ring/crypto/bn/convert.c +433 -0
  50. data/vendor/ring/crypto/bn/ctx.c +311 -0
  51. data/vendor/ring/crypto/bn/div.c +594 -0
  52. data/vendor/ring/crypto/bn/exponentiation.c +1335 -0
  53. data/vendor/ring/crypto/bn/gcd.c +711 -0
  54. data/vendor/ring/crypto/bn/generic.c +1019 -0
  55. data/vendor/ring/crypto/bn/internal.h +316 -0
  56. data/vendor/ring/crypto/bn/montgomery.c +516 -0
  57. data/vendor/ring/crypto/bn/mul.c +888 -0
  58. data/vendor/ring/crypto/bn/prime.c +829 -0
  59. data/vendor/ring/crypto/bn/random.c +334 -0
  60. data/vendor/ring/crypto/bn/rsaz_exp.c +262 -0
  61. data/vendor/ring/crypto/bn/rsaz_exp.h +53 -0
  62. data/vendor/ring/crypto/bn/shift.c +276 -0
  63. data/vendor/ring/crypto/bytestring/bytestring_test.Windows.vcxproj +25 -0
  64. data/vendor/ring/crypto/bytestring/bytestring_test.cc +421 -0
  65. data/vendor/ring/crypto/bytestring/cbb.c +399 -0
  66. data/vendor/ring/crypto/bytestring/cbs.c +227 -0
  67. data/vendor/ring/crypto/bytestring/internal.h +46 -0
  68. data/vendor/ring/crypto/chacha/chacha_generic.c +140 -0
  69. data/vendor/ring/crypto/chacha/chacha_vec.c +323 -0
  70. data/vendor/ring/crypto/chacha/chacha_vec_arm.S +1447 -0
  71. data/vendor/ring/crypto/chacha/chacha_vec_arm_generate.go +153 -0
  72. data/vendor/ring/crypto/cipher/cipher_test.Windows.vcxproj +25 -0
  73. data/vendor/ring/crypto/cipher/e_aes.c +390 -0
  74. data/vendor/ring/crypto/cipher/e_chacha20poly1305.c +208 -0
  75. data/vendor/ring/crypto/cipher/internal.h +173 -0
  76. data/vendor/ring/crypto/cipher/test/aes_128_gcm_tests.txt +543 -0
  77. data/vendor/ring/crypto/cipher/test/aes_128_key_wrap_tests.txt +9 -0
  78. data/vendor/ring/crypto/cipher/test/aes_256_gcm_tests.txt +475 -0
  79. data/vendor/ring/crypto/cipher/test/aes_256_key_wrap_tests.txt +23 -0
  80. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_old_tests.txt +422 -0
  81. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_tests.txt +484 -0
  82. data/vendor/ring/crypto/cipher/test/cipher_test.txt +100 -0
  83. data/vendor/ring/crypto/constant_time_test.Windows.vcxproj +25 -0
  84. data/vendor/ring/crypto/constant_time_test.c +304 -0
  85. data/vendor/ring/crypto/cpu-arm-asm.S +32 -0
  86. data/vendor/ring/crypto/cpu-arm.c +199 -0
  87. data/vendor/ring/crypto/cpu-intel.c +261 -0
  88. data/vendor/ring/crypto/crypto.c +151 -0
  89. data/vendor/ring/crypto/curve25519/asm/x25519-arm.S +2118 -0
  90. data/vendor/ring/crypto/curve25519/curve25519.c +4888 -0
  91. data/vendor/ring/crypto/curve25519/x25519_test.cc +128 -0
  92. data/vendor/ring/crypto/digest/md32_common.h +181 -0
  93. data/vendor/ring/crypto/ec/asm/p256-x86_64-asm.pl +2725 -0
  94. data/vendor/ring/crypto/ec/ec.c +193 -0
  95. data/vendor/ring/crypto/ec/ec_curves.c +61 -0
  96. data/vendor/ring/crypto/ec/ec_key.c +228 -0
  97. data/vendor/ring/crypto/ec/ec_montgomery.c +114 -0
  98. data/vendor/ring/crypto/ec/example_mul.Windows.vcxproj +25 -0
  99. data/vendor/ring/crypto/ec/internal.h +243 -0
  100. data/vendor/ring/crypto/ec/oct.c +253 -0
  101. data/vendor/ring/crypto/ec/p256-64.c +1794 -0
  102. data/vendor/ring/crypto/ec/p256-x86_64-table.h +9548 -0
  103. data/vendor/ring/crypto/ec/p256-x86_64.c +509 -0
  104. data/vendor/ring/crypto/ec/simple.c +1007 -0
  105. data/vendor/ring/crypto/ec/util-64.c +183 -0
  106. data/vendor/ring/crypto/ec/wnaf.c +508 -0
  107. data/vendor/ring/crypto/ecdh/ecdh.c +155 -0
  108. data/vendor/ring/crypto/ecdsa/ecdsa.c +304 -0
  109. data/vendor/ring/crypto/ecdsa/ecdsa_asn1.c +193 -0
  110. data/vendor/ring/crypto/ecdsa/ecdsa_test.Windows.vcxproj +25 -0
  111. data/vendor/ring/crypto/ecdsa/ecdsa_test.cc +327 -0
  112. data/vendor/ring/crypto/header_removed.h +17 -0
  113. data/vendor/ring/crypto/internal.h +495 -0
  114. data/vendor/ring/crypto/libring.Windows.vcxproj +101 -0
  115. data/vendor/ring/crypto/mem.c +98 -0
  116. data/vendor/ring/crypto/modes/asm/aesni-gcm-x86_64.pl +1045 -0
  117. data/vendor/ring/crypto/modes/asm/ghash-armv4.pl +517 -0
  118. data/vendor/ring/crypto/modes/asm/ghash-x86.pl +1393 -0
  119. data/vendor/ring/crypto/modes/asm/ghash-x86_64.pl +1741 -0
  120. data/vendor/ring/crypto/modes/asm/ghashv8-armx.pl +422 -0
  121. data/vendor/ring/crypto/modes/ctr.c +226 -0
  122. data/vendor/ring/crypto/modes/gcm.c +1206 -0
  123. data/vendor/ring/crypto/modes/gcm_test.Windows.vcxproj +25 -0
  124. data/vendor/ring/crypto/modes/gcm_test.c +348 -0
  125. data/vendor/ring/crypto/modes/internal.h +299 -0
  126. data/vendor/ring/crypto/perlasm/arm-xlate.pl +170 -0
  127. data/vendor/ring/crypto/perlasm/readme +100 -0
  128. data/vendor/ring/crypto/perlasm/x86_64-xlate.pl +1164 -0
  129. data/vendor/ring/crypto/perlasm/x86asm.pl +292 -0
  130. data/vendor/ring/crypto/perlasm/x86gas.pl +263 -0
  131. data/vendor/ring/crypto/perlasm/x86masm.pl +200 -0
  132. data/vendor/ring/crypto/perlasm/x86nasm.pl +187 -0
  133. data/vendor/ring/crypto/poly1305/poly1305.c +331 -0
  134. data/vendor/ring/crypto/poly1305/poly1305_arm.c +301 -0
  135. data/vendor/ring/crypto/poly1305/poly1305_arm_asm.S +2015 -0
  136. data/vendor/ring/crypto/poly1305/poly1305_test.Windows.vcxproj +25 -0
  137. data/vendor/ring/crypto/poly1305/poly1305_test.cc +80 -0
  138. data/vendor/ring/crypto/poly1305/poly1305_test.txt +52 -0
  139. data/vendor/ring/crypto/poly1305/poly1305_vec.c +892 -0
  140. data/vendor/ring/crypto/rand/asm/rdrand-x86_64.pl +75 -0
  141. data/vendor/ring/crypto/rand/internal.h +32 -0
  142. data/vendor/ring/crypto/rand/rand.c +189 -0
  143. data/vendor/ring/crypto/rand/urandom.c +219 -0
  144. data/vendor/ring/crypto/rand/windows.c +56 -0
  145. data/vendor/ring/crypto/refcount_c11.c +66 -0
  146. data/vendor/ring/crypto/refcount_lock.c +53 -0
  147. data/vendor/ring/crypto/refcount_test.Windows.vcxproj +25 -0
  148. data/vendor/ring/crypto/refcount_test.c +58 -0
  149. data/vendor/ring/crypto/rsa/blinding.c +462 -0
  150. data/vendor/ring/crypto/rsa/internal.h +108 -0
  151. data/vendor/ring/crypto/rsa/padding.c +300 -0
  152. data/vendor/ring/crypto/rsa/rsa.c +450 -0
  153. data/vendor/ring/crypto/rsa/rsa_asn1.c +261 -0
  154. data/vendor/ring/crypto/rsa/rsa_impl.c +944 -0
  155. data/vendor/ring/crypto/rsa/rsa_test.Windows.vcxproj +25 -0
  156. data/vendor/ring/crypto/rsa/rsa_test.cc +437 -0
  157. data/vendor/ring/crypto/sha/asm/sha-armv8.pl +436 -0
  158. data/vendor/ring/crypto/sha/asm/sha-x86_64.pl +2390 -0
  159. data/vendor/ring/crypto/sha/asm/sha256-586.pl +1275 -0
  160. data/vendor/ring/crypto/sha/asm/sha256-armv4.pl +735 -0
  161. data/vendor/ring/crypto/sha/asm/sha256-armv8.pl +14 -0
  162. data/vendor/ring/crypto/sha/asm/sha256-x86_64.pl +14 -0
  163. data/vendor/ring/crypto/sha/asm/sha512-586.pl +911 -0
  164. data/vendor/ring/crypto/sha/asm/sha512-armv4.pl +666 -0
  165. data/vendor/ring/crypto/sha/asm/sha512-armv8.pl +14 -0
  166. data/vendor/ring/crypto/sha/asm/sha512-x86_64.pl +14 -0
  167. data/vendor/ring/crypto/sha/sha1.c +271 -0
  168. data/vendor/ring/crypto/sha/sha256.c +204 -0
  169. data/vendor/ring/crypto/sha/sha512.c +355 -0
  170. data/vendor/ring/crypto/test/file_test.cc +326 -0
  171. data/vendor/ring/crypto/test/file_test.h +181 -0
  172. data/vendor/ring/crypto/test/malloc.cc +150 -0
  173. data/vendor/ring/crypto/test/scoped_types.h +95 -0
  174. data/vendor/ring/crypto/test/test.Windows.vcxproj +35 -0
  175. data/vendor/ring/crypto/test/test_util.cc +46 -0
  176. data/vendor/ring/crypto/test/test_util.h +41 -0
  177. data/vendor/ring/crypto/thread_none.c +55 -0
  178. data/vendor/ring/crypto/thread_pthread.c +165 -0
  179. data/vendor/ring/crypto/thread_test.Windows.vcxproj +25 -0
  180. data/vendor/ring/crypto/thread_test.c +200 -0
  181. data/vendor/ring/crypto/thread_win.c +282 -0
  182. data/vendor/ring/examples/checkdigest.rs +103 -0
  183. data/vendor/ring/include/openssl/aes.h +121 -0
  184. data/vendor/ring/include/openssl/arm_arch.h +129 -0
  185. data/vendor/ring/include/openssl/base.h +156 -0
  186. data/vendor/ring/include/openssl/bn.h +794 -0
  187. data/vendor/ring/include/openssl/buffer.h +18 -0
  188. data/vendor/ring/include/openssl/bytestring.h +235 -0
  189. data/vendor/ring/include/openssl/chacha.h +37 -0
  190. data/vendor/ring/include/openssl/cmac.h +76 -0
  191. data/vendor/ring/include/openssl/cpu.h +184 -0
  192. data/vendor/ring/include/openssl/crypto.h +43 -0
  193. data/vendor/ring/include/openssl/curve25519.h +88 -0
  194. data/vendor/ring/include/openssl/ec.h +225 -0
  195. data/vendor/ring/include/openssl/ec_key.h +129 -0
  196. data/vendor/ring/include/openssl/ecdh.h +110 -0
  197. data/vendor/ring/include/openssl/ecdsa.h +156 -0
  198. data/vendor/ring/include/openssl/err.h +201 -0
  199. data/vendor/ring/include/openssl/mem.h +101 -0
  200. data/vendor/ring/include/openssl/obj_mac.h +71 -0
  201. data/vendor/ring/include/openssl/opensslfeatures.h +68 -0
  202. data/vendor/ring/include/openssl/opensslv.h +18 -0
  203. data/vendor/ring/include/openssl/ossl_typ.h +18 -0
  204. data/vendor/ring/include/openssl/poly1305.h +51 -0
  205. data/vendor/ring/include/openssl/rand.h +70 -0
  206. data/vendor/ring/include/openssl/rsa.h +399 -0
  207. data/vendor/ring/include/openssl/thread.h +133 -0
  208. data/vendor/ring/include/openssl/type_check.h +71 -0
  209. data/vendor/ring/mk/Common.props +63 -0
  210. data/vendor/ring/mk/Windows.props +42 -0
  211. data/vendor/ring/mk/WindowsTest.props +18 -0
  212. data/vendor/ring/mk/appveyor.bat +62 -0
  213. data/vendor/ring/mk/bottom_of_makefile.mk +54 -0
  214. data/vendor/ring/mk/ring.mk +266 -0
  215. data/vendor/ring/mk/top_of_makefile.mk +214 -0
  216. data/vendor/ring/mk/travis.sh +40 -0
  217. data/vendor/ring/mk/update-travis-yml.py +229 -0
  218. data/vendor/ring/ring.sln +153 -0
  219. data/vendor/ring/src/aead.rs +682 -0
  220. data/vendor/ring/src/agreement.rs +248 -0
  221. data/vendor/ring/src/c.rs +129 -0
  222. data/vendor/ring/src/constant_time.rs +37 -0
  223. data/vendor/ring/src/der.rs +96 -0
  224. data/vendor/ring/src/digest.rs +690 -0
  225. data/vendor/ring/src/digest_tests.txt +57 -0
  226. data/vendor/ring/src/ecc.rs +28 -0
  227. data/vendor/ring/src/ecc_build.rs +279 -0
  228. data/vendor/ring/src/ecc_curves.rs +117 -0
  229. data/vendor/ring/src/ed25519_tests.txt +2579 -0
  230. data/vendor/ring/src/exe_tests.rs +46 -0
  231. data/vendor/ring/src/ffi.rs +29 -0
  232. data/vendor/ring/src/file_test.rs +187 -0
  233. data/vendor/ring/src/hkdf.rs +153 -0
  234. data/vendor/ring/src/hkdf_tests.txt +59 -0
  235. data/vendor/ring/src/hmac.rs +414 -0
  236. data/vendor/ring/src/hmac_tests.txt +97 -0
  237. data/vendor/ring/src/input.rs +312 -0
  238. data/vendor/ring/src/lib.rs +41 -0
  239. data/vendor/ring/src/pbkdf2.rs +265 -0
  240. data/vendor/ring/src/pbkdf2_tests.txt +113 -0
  241. data/vendor/ring/src/polyfill.rs +57 -0
  242. data/vendor/ring/src/rand.rs +28 -0
  243. data/vendor/ring/src/signature.rs +314 -0
  244. data/vendor/ring/third-party/NIST/README.md +9 -0
  245. data/vendor/ring/third-party/NIST/SHAVS/SHA1LongMsg.rsp +263 -0
  246. data/vendor/ring/third-party/NIST/SHAVS/SHA1Monte.rsp +309 -0
  247. data/vendor/ring/third-party/NIST/SHAVS/SHA1ShortMsg.rsp +267 -0
  248. data/vendor/ring/third-party/NIST/SHAVS/SHA224LongMsg.rsp +263 -0
  249. data/vendor/ring/third-party/NIST/SHAVS/SHA224Monte.rsp +309 -0
  250. data/vendor/ring/third-party/NIST/SHAVS/SHA224ShortMsg.rsp +267 -0
  251. data/vendor/ring/third-party/NIST/SHAVS/SHA256LongMsg.rsp +263 -0
  252. data/vendor/ring/third-party/NIST/SHAVS/SHA256Monte.rsp +309 -0
  253. data/vendor/ring/third-party/NIST/SHAVS/SHA256ShortMsg.rsp +267 -0
  254. data/vendor/ring/third-party/NIST/SHAVS/SHA384LongMsg.rsp +519 -0
  255. data/vendor/ring/third-party/NIST/SHAVS/SHA384Monte.rsp +309 -0
  256. data/vendor/ring/third-party/NIST/SHAVS/SHA384ShortMsg.rsp +523 -0
  257. data/vendor/ring/third-party/NIST/SHAVS/SHA512LongMsg.rsp +519 -0
  258. data/vendor/ring/third-party/NIST/SHAVS/SHA512Monte.rsp +309 -0
  259. data/vendor/ring/third-party/NIST/SHAVS/SHA512ShortMsg.rsp +523 -0
  260. data/vendor/ring/third-party/NIST/sha256sums.txt +1 -0
  261. metadata +333 -0
data/vendor/ring/crypto/modes/asm/ghash-x86.pl
@@ -0,0 +1,1393 @@
+ #!/usr/bin/env perl
+ #
+ # ====================================================================
+ # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+ # project. The module is, however, dual licensed under OpenSSL and
+ # CRYPTOGAMS licenses depending on where you obtain it. For further
+ # details see http://www.openssl.org/~appro/cryptogams/.
+ # ====================================================================
+ #
+ # March, May, June 2010
+ #
+ # The module implements "4-bit" GCM GHASH function and underlying
+ # single multiplication operation in GF(2^128). "4-bit" means that it
+ # uses 256 bytes per-key table [+64/128 bytes fixed table]. It has two
+ # code paths: vanilla x86 and vanilla SSE. Former will be executed on
+ # 486 and Pentium, latter on all others. SSE GHASH features so called
+ # "528B" variant of "4-bit" method utilizing additional 256+16 bytes
+ # of per-key storage [+512 bytes shared table]. Performance results
+ # are for streamed GHASH subroutine and are expressed in cycles per
+ # processed byte, less is better:
+ #
+ #              gcc 2.95.3(*)   SSE assembler   x86 assembler
+ #
+ # Pentium      105/111(**)     -               50
+ # PIII         68 /75          12.2            24
+ # P4           125/125         17.8            84(***)
+ # Opteron      66 /70          10.1            30
+ # Core2        54 /67          8.4             18
+ # Atom         105/105         16.8            53
+ # VIA Nano     69 /71          13.0            27
+ #
+ # (*)   gcc 3.4.x was observed to generate few percent slower code,
+ #       which is one of reasons why 2.95.3 results were chosen,
+ #       another reason is lack of 3.4.x results for older CPUs;
+ #       comparison with SSE results is not completely fair, because C
+ #       results are for vanilla "256B" implementation, while
+ #       assembler results are for "528B";-)
+ # (**)  second number is result for code compiled with -fPIC flag,
+ #       which is actually more relevant, because assembler code is
+ #       position-independent;
+ # (***) see comment in non-MMX routine for further details;
+ #
+ # To summarize, it's >2-5 times faster than gcc-generated code. To
+ # anchor it to something else SHA1 assembler processes one byte in
+ # ~7 cycles on contemporary x86 cores. As for choice of MMX/SSE
+ # in particular, see comment at the end of the file...
+
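For orientation, the operation every code path in this file accelerates is multiplication of the 128-bit digest Xi by the hash key H in GF(2^128) with the GCM reduction polynomial. The sketch below is a minimal, unoptimized bit-at-a-time version of that multiplication following NIST SP 800-38D (Algorithm 1); it is illustrative only, not code from the vendored module, and the `u128` helper type and test value are invented for the example.

```c
#include <assert.h>
#include <stdint.h>

typedef struct { uint64_t hi, lo; } u128;   /* hypothetical helper type */

/* Multiply x by y in GF(2^128) with the GCM polynomial, one bit at a time.
 * Bit 0 is the most significant bit of .hi, as in SP 800-38D Algorithm 1. */
static u128 gf128_mul(u128 x, u128 y) {
    u128 z = { 0, 0 };
    u128 v = y;

    for (int i = 0; i < 128; i++) {
        /* bit i of x, counting from the most significant end */
        uint64_t bit = (i < 64) ? (x.hi >> (63 - i)) & 1
                                : (x.lo >> (127 - i)) & 1;
        if (bit) {              /* Z ^= V */
            z.hi ^= v.hi;
            z.lo ^= v.lo;
        }
        /* V >>= 1; if a 1 fell off the low end, reduce by R = 0xE1 || 0^120 */
        uint64_t lsb = v.lo & 1;
        v.lo = (v.lo >> 1) | (v.hi << 63);
        v.hi >>= 1;
        if (lsb)
            v.hi ^= 0xE100000000000000ULL;
    }
    return z;
}

int main(void) {
    /* sanity check: the field's multiplicative identity is the block whose
     * first (most significant) bit is the only bit set */
    u128 one = { 0x8000000000000000ULL, 0 };
    u128 h   = { 0x66e94bd4ef8a2c3bULL, 0x884cfa59ca342b2eULL }; /* arbitrary */
    u128 p   = gf128_mul(one, h);
    assert(p.hi == h.hi && p.lo == h.lo);
    return 0;
}
```

The "4-bit" table paths below trade this 128-iteration loop for roughly 32 table lookups per block (one per nibble of Xi, plus rem_4bit reduction lookups), and the PCLMULQDQ path trades it for a handful of carry-less multiplies followed by a reduction.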
+ # May 2010
+ #
+ # Add PCLMULQDQ version performing at 2.10 cycles per processed byte.
+ # The question is how close is it to theoretical limit? The pclmulqdq
+ # instruction latency appears to be 14 cycles and there can't be more
+ # than 2 of them executing at any given time. This means that single
+ # Karatsuba multiplication would take 28 cycles *plus* few cycles for
+ # pre- and post-processing. Then multiplication has to be followed by
+ # modulo-reduction. Given that aggregated reduction method [see
+ # "Carry-less Multiplication and Its Usage for Computing the GCM Mode"
+ # white paper by Intel] allows you to perform reduction only once in
+ # a while we can assume that asymptotic performance can be estimated
+ # as (28+Tmod/Naggr)/16, where Tmod is time to perform reduction
+ # and Naggr is the aggregation factor.
+ #
+ # Before we proceed to this implementation let's have closer look at
+ # the best-performing code suggested by Intel in their white paper.
+ # By tracing inter-register dependencies Tmod is estimated as ~19
+ # cycles and Naggr chosen by Intel is 4, resulting in 2.05 cycles per
+ # processed byte. As implied, this is quite optimistic estimate,
+ # because it does not account for Karatsuba pre- and post-processing,
+ # which for a single multiplication is ~5 cycles. Unfortunately Intel
+ # does not provide performance data for GHASH alone. But benchmarking
+ # AES_GCM_encrypt ripped out of Fig. 15 of the white paper with aadt
+ # alone resulted in 2.46 cycles per byte out of a 16KB buffer. Note that
+ # the result accounts even for pre-computing of degrees of the hash
+ # key H, but its portion is negligible at 16KB buffer size.
+ #
+ # Moving on to the implementation in question. Tmod is estimated as
+ # ~13 cycles and Naggr is 2, giving asymptotic performance of ...
+ # 2.16. How is it possible that measured performance is better than
+ # optimistic theoretical estimate? There is one thing Intel failed
+ # to recognize. By serializing GHASH with CTR in same subroutine
+ # former's performance is really limited to above (Tmul + Tmod/Naggr)
+ # equation. But if GHASH procedure is detached, the modulo-reduction
+ # can be interleaved with Naggr-1 multiplications at instruction level
+ # and under ideal conditions even disappear from the equation. So that
+ # optimistic theoretical estimate for this implementation is ...
+ # 28/16=1.75, and not 2.16. Well, it's probably way too optimistic,
+ # at least for such small Naggr. I'd argue that (28+Tproc/Naggr),
+ # where Tproc is time required for Karatsuba pre- and post-processing,
+ # is more realistic estimate. In this case it gives ... 1.91 cycles.
+ # Or in other words, depending on how well we can interleave reduction
+ # and one of the two multiplications the performance should be between
+ # 1.91 and 2.16. As already mentioned, this implementation processes
+ # one byte out of 8KB buffer in 2.10 cycles, while x86_64 counterpart
+ # - in 2.02. x86_64 performance is better, because larger register
+ # bank allows to interleave reduction and multiplication better.
+ #
+ # Does it make sense to increase Naggr? To start with it's virtually
+ # impossible in 32-bit mode, because of limited register bank
+ # capacity. Otherwise improvement has to be weighed against slower
+ # setup, as well as code size and complexity increase. As even
+ # optimistic estimate doesn't promise 30% performance improvement,
+ # there are currently no plans to increase Naggr.
+ #
+ # Special thanks to David Woodhouse <dwmw2@infradead.org> for
+ # providing access to a Westmere-based system on behalf of Intel
+ # Open Source Technology Centre.
+
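Plugging the figures quoted in the comment above into its own formula (cycles per 16-byte block, divided by 16 for the per-byte estimate):

$$\frac{T_{\mathrm{mul}} + T_{\mathrm{mod}}/N_{\mathrm{aggr}}}{16} = \frac{28 + 19/4}{16} \approx 2.05 \quad\text{(Intel's code: } T_{\mathrm{mod}}\approx 19,\ N_{\mathrm{aggr}}=4\text{)}$$

$$\frac{28 + 13/2}{16} \approx 2.16, \qquad \frac{28 + 5/2}{16} \approx 1.91, \qquad \frac{28}{16} = 1.75 \quad\text{(this code: } T_{\mathrm{mod}}\approx 13,\ T_{\mathrm{proc}}\approx 5,\ N_{\mathrm{aggr}}=2\text{)}$$

The measured 2.10 cycles per byte falls inside the 1.91–2.16 window the comment argues for.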
+ # January 2010
+ #
+ # Tweaked to optimize transitions between integer and FP operations
+ # on same XMM register, PCLMULQDQ subroutine was measured to process
+ # one byte in 2.07 cycles on Sandy Bridge, and in 2.12 - on Westmere.
+ # The minor regression on Westmere is outweighed by ~15% improvement
+ # on Sandy Bridge. Strangely enough attempt to modify 64-bit code in
+ # similar manner resulted in almost 20% degradation on Sandy Bridge,
+ # where original 64-bit code processes one byte in 1.95 cycles.
+
+ #####################################################################
+ # For reference, AMD Bulldozer processes one byte in 1.98 cycles in
+ # 32-bit mode and 1.89 in 64-bit.
+
+ # February 2013
+ #
+ # Overhaul: aggregate Karatsuba post-processing, improve ILP in
+ # reduction_alg9. Resulting performance is 1.96 cycles per byte on
+ # Westmere, 1.95 - on Sandy/Ivy Bridge, 1.76 - on Bulldozer.
+
128
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
129
+ push(@INC,"${dir}","${dir}../../perlasm");
130
+ require "x86asm.pl";
131
+
132
+ &asm_init($ARGV[0],"ghash-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");
133
+
134
+ $sse2=0;
135
+ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
136
+
137
+ ($Zhh,$Zhl,$Zlh,$Zll) = ("ebp","edx","ecx","ebx");
138
+ $inp = "edi";
139
+ $Htbl = "esi";
140
+
141
+ $unroll = 0; # Affects x86 loop. Folded loop performs ~7% worse
142
+ # than unrolled, which has to be weighted against
143
+ # 2.5x x86-specific code size reduction.
144
+
145
+ sub x86_loop {
146
+ my $off = shift;
147
+ my $rem = "eax";
148
+
149
+ &mov ($Zhh,&DWP(4,$Htbl,$Zll));
150
+ &mov ($Zhl,&DWP(0,$Htbl,$Zll));
151
+ &mov ($Zlh,&DWP(12,$Htbl,$Zll));
152
+ &mov ($Zll,&DWP(8,$Htbl,$Zll));
153
+ &xor ($rem,$rem); # avoid partial register stalls on PIII
154
+
155
+ # shrd practically kills P4, 2.5x deterioration, but P4 has
156
+ # MMX code-path to execute. shrd runs tad faster [than twice
157
+ # the shifts, move's and or's] on pre-MMX Pentium (as well as
158
+ # PIII and Core2), *but* minimizes code size, spares register
159
+ # and thus allows to fold the loop...
160
+ if (!$unroll) {
161
+ my $cnt = $inp;
162
+ &mov ($cnt,15);
163
+ &jmp (&label("x86_loop"));
164
+ &set_label("x86_loop",16);
165
+ for($i=1;$i<=2;$i++) {
166
+ &mov (&LB($rem),&LB($Zll));
167
+ &shrd ($Zll,$Zlh,4);
168
+ &and (&LB($rem),0xf);
169
+ &shrd ($Zlh,$Zhl,4);
170
+ &shrd ($Zhl,$Zhh,4);
171
+ &shr ($Zhh,4);
172
+ &xor ($Zhh,&DWP($off+16,"esp",$rem,4));
173
+
174
+ &mov (&LB($rem),&BP($off,"esp",$cnt));
175
+ if ($i&1) {
176
+ &and (&LB($rem),0xf0);
177
+ } else {
178
+ &shl (&LB($rem),4);
179
+ }
180
+
181
+ &xor ($Zll,&DWP(8,$Htbl,$rem));
182
+ &xor ($Zlh,&DWP(12,$Htbl,$rem));
183
+ &xor ($Zhl,&DWP(0,$Htbl,$rem));
184
+ &xor ($Zhh,&DWP(4,$Htbl,$rem));
185
+
186
+ if ($i&1) {
187
+ &dec ($cnt);
188
+ &js (&label("x86_break"));
189
+ } else {
190
+ &jmp (&label("x86_loop"));
191
+ }
192
+ }
193
+ &set_label("x86_break",16);
194
+ } else {
195
+ for($i=1;$i<32;$i++) {
196
+ &comment($i);
197
+ &mov (&LB($rem),&LB($Zll));
198
+ &shrd ($Zll,$Zlh,4);
199
+ &and (&LB($rem),0xf);
200
+ &shrd ($Zlh,$Zhl,4);
201
+ &shrd ($Zhl,$Zhh,4);
202
+ &shr ($Zhh,4);
203
+ &xor ($Zhh,&DWP($off+16,"esp",$rem,4));
204
+
205
+ if ($i&1) {
206
+ &mov (&LB($rem),&BP($off+15-($i>>1),"esp"));
207
+ &and (&LB($rem),0xf0);
208
+ } else {
209
+ &mov (&LB($rem),&BP($off+15-($i>>1),"esp"));
210
+ &shl (&LB($rem),4);
211
+ }
212
+
213
+ &xor ($Zll,&DWP(8,$Htbl,$rem));
214
+ &xor ($Zlh,&DWP(12,$Htbl,$rem));
215
+ &xor ($Zhl,&DWP(0,$Htbl,$rem));
216
+ &xor ($Zhh,&DWP(4,$Htbl,$rem));
217
+ }
218
+ }
219
+ &bswap ($Zll);
220
+ &bswap ($Zlh);
221
+ &bswap ($Zhl);
222
+ if (!$x86only) {
223
+ &bswap ($Zhh);
224
+ } else {
225
+ &mov ("eax",$Zhh);
226
+ &bswap ("eax");
227
+ &mov ($Zhh,"eax");
228
+ }
229
+ }
230
+
231
+ if ($unroll) {
232
+ &function_begin_B("_x86_gmult_4bit_inner");
233
+ &x86_loop(4);
234
+ &ret ();
235
+ &function_end_B("_x86_gmult_4bit_inner");
236
+ }
237
+
238
+ sub deposit_rem_4bit {
239
+ my $bias = shift;
240
+
241
+ &mov (&DWP($bias+0, "esp"),0x0000<<16);
242
+ &mov (&DWP($bias+4, "esp"),0x1C20<<16);
243
+ &mov (&DWP($bias+8, "esp"),0x3840<<16);
244
+ &mov (&DWP($bias+12,"esp"),0x2460<<16);
245
+ &mov (&DWP($bias+16,"esp"),0x7080<<16);
246
+ &mov (&DWP($bias+20,"esp"),0x6CA0<<16);
247
+ &mov (&DWP($bias+24,"esp"),0x48C0<<16);
248
+ &mov (&DWP($bias+28,"esp"),0x54E0<<16);
249
+ &mov (&DWP($bias+32,"esp"),0xE100<<16);
250
+ &mov (&DWP($bias+36,"esp"),0xFD20<<16);
251
+ &mov (&DWP($bias+40,"esp"),0xD940<<16);
252
+ &mov (&DWP($bias+44,"esp"),0xC560<<16);
253
+ &mov (&DWP($bias+48,"esp"),0x9180<<16);
254
+ &mov (&DWP($bias+52,"esp"),0x8DA0<<16);
255
+ &mov (&DWP($bias+56,"esp"),0xA9C0<<16);
256
+ &mov (&DWP($bias+60,"esp"),0xB5E0<<16);
257
+ }
258
+
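The constants written by deposit_rem_4bit are not arbitrary: informally, each entry is the reduction contribution of the low nibble that the >>4 step shifts out, i.e. the XOR of 0xE1<<(5+j) over the set bits j of the nibble index (0xE1 being the leading byte of the GCM reduction polynomial), parked in the upper 16 bits of the word — hence the <<16 above and the <<$S applied to the data_word copies at the end of the file. The short, self-checking C sketch below regenerates the sixteen 16-bit patterns; it is an illustration of where the numbers come from under that reading, not code from this module.

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    /* The 16-bit patterns deposit_rem_4bit stores (before the <<16). */
    static const uint16_t expected[16] = {
        0x0000, 0x1C20, 0x3840, 0x2460, 0x7080, 0x6CA0, 0x48C0, 0x54E0,
        0xE100, 0xFD20, 0xD940, 0xC560, 0x9180, 0x8DA0, 0xA9C0, 0xB5E0,
    };
    for (unsigned i = 0; i < 16; i++) {
        uint16_t v = 0;
        for (unsigned j = 0; j < 4; j++)        /* bit j of the dropped nibble */
            if (i & (1u << j))
                v ^= (uint16_t)(0xE1 << (5 + j));
        assert(v == expected[i]);               /* matches the table above */
        printf("rem_4bit[%2u] = 0x%04X\n", i, (unsigned)v);
    }
    return 0;
}
```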
259
+ $suffix = $x86only ? "" : "_x86";
260
+
261
+ &function_begin("gcm_gmult_4bit".$suffix);
262
+ &stack_push(16+4+1); # +1 for stack alignment
263
+ &mov ($inp,&wparam(0)); # load Xi
264
+ &mov ($Htbl,&wparam(1)); # load Htable
265
+
266
+ &mov ($Zhh,&DWP(0,$inp)); # load Xi[16]
267
+ &mov ($Zhl,&DWP(4,$inp));
268
+ &mov ($Zlh,&DWP(8,$inp));
269
+ &mov ($Zll,&DWP(12,$inp));
270
+
271
+ &deposit_rem_4bit(16);
272
+
273
+ &mov (&DWP(0,"esp"),$Zhh); # copy Xi[16] on stack
274
+ &mov (&DWP(4,"esp"),$Zhl);
275
+ &mov (&DWP(8,"esp"),$Zlh);
276
+ &mov (&DWP(12,"esp"),$Zll);
277
+ &shr ($Zll,20);
278
+ &and ($Zll,0xf0);
279
+
280
+ if ($unroll) {
281
+ &call ("_x86_gmult_4bit_inner");
282
+ } else {
283
+ &x86_loop(0);
284
+ &mov ($inp,&wparam(0));
285
+ }
286
+
287
+ &mov (&DWP(12,$inp),$Zll);
288
+ &mov (&DWP(8,$inp),$Zlh);
289
+ &mov (&DWP(4,$inp),$Zhl);
290
+ &mov (&DWP(0,$inp),$Zhh);
291
+ &stack_pop(16+4+1);
292
+ &function_end("gcm_gmult_4bit".$suffix);
293
+
294
+ &function_begin("gcm_ghash_4bit".$suffix);
295
+ &stack_push(16+4+1); # +1 for 64-bit alignment
296
+ &mov ($Zll,&wparam(0)); # load Xi
297
+ &mov ($Htbl,&wparam(1)); # load Htable
298
+ &mov ($inp,&wparam(2)); # load in
299
+ &mov ("ecx",&wparam(3)); # load len
300
+ &add ("ecx",$inp);
301
+ &mov (&wparam(3),"ecx");
302
+
303
+ &mov ($Zhh,&DWP(0,$Zll)); # load Xi[16]
304
+ &mov ($Zhl,&DWP(4,$Zll));
305
+ &mov ($Zlh,&DWP(8,$Zll));
306
+ &mov ($Zll,&DWP(12,$Zll));
307
+
308
+ &deposit_rem_4bit(16);
309
+
310
+ &set_label("x86_outer_loop",16);
311
+ &xor ($Zll,&DWP(12,$inp)); # xor with input
312
+ &xor ($Zlh,&DWP(8,$inp));
313
+ &xor ($Zhl,&DWP(4,$inp));
314
+ &xor ($Zhh,&DWP(0,$inp));
315
+ &mov (&DWP(12,"esp"),$Zll); # dump it on stack
316
+ &mov (&DWP(8,"esp"),$Zlh);
317
+ &mov (&DWP(4,"esp"),$Zhl);
318
+ &mov (&DWP(0,"esp"),$Zhh);
319
+
320
+ &shr ($Zll,20);
321
+ &and ($Zll,0xf0);
322
+
323
+ if ($unroll) {
324
+ &call ("_x86_gmult_4bit_inner");
325
+ } else {
326
+ &x86_loop(0);
327
+ &mov ($inp,&wparam(2));
328
+ }
329
+ &lea ($inp,&DWP(16,$inp));
330
+ &cmp ($inp,&wparam(3));
331
+ &mov (&wparam(2),$inp) if (!$unroll);
332
+ &jb (&label("x86_outer_loop"));
333
+
334
+ &mov ($inp,&wparam(0)); # load Xi
335
+ &mov (&DWP(12,$inp),$Zll);
336
+ &mov (&DWP(8,$inp),$Zlh);
337
+ &mov (&DWP(4,$inp),$Zhl);
338
+ &mov (&DWP(0,$inp),$Zhh);
339
+ &stack_pop(16+4+1);
340
+ &function_end("gcm_ghash_4bit".$suffix);
341
+
342
+ if (!$x86only) {{{
343
+
344
+ &static_label("rem_4bit");
345
+
346
+ if (!$sse2) {{ # pure-MMX "May" version...
347
+
348
+ $S=12; # shift factor for rem_4bit
349
+
350
+ &function_begin_B("_mmx_gmult_4bit_inner");
351
+ # MMX version performs 3.5 times better on P4 (see comment in non-MMX
352
+ # routine for further details), 100% better on Opteron, ~70% better
353
+ # on Core2 and PIII... In other words effort is considered to be well
354
+ # spent... Since initial release the loop was unrolled in order to
355
+ # "liberate" register previously used as loop counter. Instead it's
356
+ # used to optimize critical path in 'Z.hi ^= rem_4bit[Z.lo&0xf]'.
357
+ # The path involves move of Z.lo from MMX to integer register,
358
+ # effective address calculation and finally merge of value to Z.hi.
359
+ # Reference to rem_4bit is scheduled so late that I had to >>4
360
+ # rem_4bit elements. This resulted in 20-45% procent improvement
361
+ # on contemporary µ-archs.
362
+ {
363
+ my $cnt;
364
+ my $rem_4bit = "eax";
365
+ my @rem = ($Zhh,$Zll);
366
+ my $nhi = $Zhl;
367
+ my $nlo = $Zlh;
368
+
369
+ my ($Zlo,$Zhi) = ("mm0","mm1");
370
+ my $tmp = "mm2";
371
+
372
+ &xor ($nlo,$nlo); # avoid partial register stalls on PIII
373
+ &mov ($nhi,$Zll);
374
+ &mov (&LB($nlo),&LB($nhi));
375
+ &shl (&LB($nlo),4);
376
+ &and ($nhi,0xf0);
377
+ &movq ($Zlo,&QWP(8,$Htbl,$nlo));
378
+ &movq ($Zhi,&QWP(0,$Htbl,$nlo));
379
+ &movd ($rem[0],$Zlo);
380
+
381
+ for ($cnt=28;$cnt>=-2;$cnt--) {
382
+ my $odd = $cnt&1;
383
+ my $nix = $odd ? $nlo : $nhi;
384
+
385
+ &shl (&LB($nlo),4) if ($odd);
386
+ &psrlq ($Zlo,4);
387
+ &movq ($tmp,$Zhi);
388
+ &psrlq ($Zhi,4);
389
+ &pxor ($Zlo,&QWP(8,$Htbl,$nix));
390
+ &mov (&LB($nlo),&BP($cnt/2,$inp)) if (!$odd && $cnt>=0);
391
+ &psllq ($tmp,60);
392
+ &and ($nhi,0xf0) if ($odd);
393
+ &pxor ($Zhi,&QWP(0,$rem_4bit,$rem[1],8)) if ($cnt<28);
394
+ &and ($rem[0],0xf);
395
+ &pxor ($Zhi,&QWP(0,$Htbl,$nix));
396
+ &mov ($nhi,$nlo) if (!$odd && $cnt>=0);
397
+ &movd ($rem[1],$Zlo);
398
+ &pxor ($Zlo,$tmp);
399
+
400
+ push (@rem,shift(@rem)); # "rotate" registers
401
+ }
402
+
403
+ &mov ($inp,&DWP(4,$rem_4bit,$rem[1],8)); # last rem_4bit[rem]
404
+
405
+ &psrlq ($Zlo,32); # lower part of Zlo is already there
406
+ &movd ($Zhl,$Zhi);
407
+ &psrlq ($Zhi,32);
408
+ &movd ($Zlh,$Zlo);
409
+ &movd ($Zhh,$Zhi);
410
+ &shl ($inp,4); # compensate for rem_4bit[i] being >>4
411
+
412
+ &bswap ($Zll);
413
+ &bswap ($Zhl);
414
+ &bswap ($Zlh);
415
+ &xor ($Zhh,$inp);
416
+ &bswap ($Zhh);
417
+
418
+ &ret ();
419
+ }
420
+ &function_end_B("_mmx_gmult_4bit_inner");
421
+
422
+ &function_begin("gcm_gmult_4bit_mmx");
423
+ &mov ($inp,&wparam(0)); # load Xi
424
+ &mov ($Htbl,&wparam(1)); # load Htable
425
+
426
+ &call (&label("pic_point"));
427
+ &set_label("pic_point");
428
+ &blindpop("eax");
429
+ &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
430
+
431
+ &movz ($Zll,&BP(15,$inp));
432
+
433
+ &call ("_mmx_gmult_4bit_inner");
434
+
435
+ &mov ($inp,&wparam(0)); # load Xi
436
+ &emms ();
437
+ &mov (&DWP(12,$inp),$Zll);
438
+ &mov (&DWP(4,$inp),$Zhl);
439
+ &mov (&DWP(8,$inp),$Zlh);
440
+ &mov (&DWP(0,$inp),$Zhh);
441
+ &function_end("gcm_gmult_4bit_mmx");
442
+
443
+ # Streamed version performs 20% better on P4, 7% on Opteron,
444
+ # 10% on Core2 and PIII...
445
+ &function_begin("gcm_ghash_4bit_mmx");
446
+ &mov ($Zhh,&wparam(0)); # load Xi
447
+ &mov ($Htbl,&wparam(1)); # load Htable
448
+ &mov ($inp,&wparam(2)); # load in
449
+ &mov ($Zlh,&wparam(3)); # load len
450
+
451
+ &call (&label("pic_point"));
452
+ &set_label("pic_point");
453
+ &blindpop("eax");
454
+ &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
455
+
456
+ &add ($Zlh,$inp);
457
+ &mov (&wparam(3),$Zlh); # len to point at the end of input
458
+ &stack_push(4+1); # +1 for stack alignment
459
+
460
+ &mov ($Zll,&DWP(12,$Zhh)); # load Xi[16]
461
+ &mov ($Zhl,&DWP(4,$Zhh));
462
+ &mov ($Zlh,&DWP(8,$Zhh));
463
+ &mov ($Zhh,&DWP(0,$Zhh));
464
+ &jmp (&label("mmx_outer_loop"));
465
+
466
+ &set_label("mmx_outer_loop",16);
467
+ &xor ($Zll,&DWP(12,$inp));
468
+ &xor ($Zhl,&DWP(4,$inp));
469
+ &xor ($Zlh,&DWP(8,$inp));
470
+ &xor ($Zhh,&DWP(0,$inp));
471
+ &mov (&wparam(2),$inp);
472
+ &mov (&DWP(12,"esp"),$Zll);
473
+ &mov (&DWP(4,"esp"),$Zhl);
474
+ &mov (&DWP(8,"esp"),$Zlh);
475
+ &mov (&DWP(0,"esp"),$Zhh);
476
+
477
+ &mov ($inp,"esp");
478
+ &shr ($Zll,24);
479
+
480
+ &call ("_mmx_gmult_4bit_inner");
481
+
482
+ &mov ($inp,&wparam(2));
483
+ &lea ($inp,&DWP(16,$inp));
484
+ &cmp ($inp,&wparam(3));
485
+ &jb (&label("mmx_outer_loop"));
486
+
487
+ &mov ($inp,&wparam(0)); # load Xi
488
+ &emms ();
489
+ &mov (&DWP(12,$inp),$Zll);
490
+ &mov (&DWP(4,$inp),$Zhl);
491
+ &mov (&DWP(8,$inp),$Zlh);
492
+ &mov (&DWP(0,$inp),$Zhh);
493
+
494
+ &stack_pop(4+1);
495
+ &function_end("gcm_ghash_4bit_mmx");
496
+
497
+ }} else {{ # "June" MMX version...
498
+ # ... has slower "April" gcm_gmult_4bit_mmx with folded
499
+ # loop. This is done to conserve code size...
500
+ $S=16; # shift factor for rem_4bit
501
+
502
+ sub mmx_loop() {
503
+ # MMX version performs 2.8 times better on P4 (see comment in non-MMX
504
+ # routine for further details), 40% better on Opteron and Core2, 50%
505
+ # better on PIII... In other words effort is considered to be well
506
+ # spent...
507
+ my $inp = shift;
508
+ my $rem_4bit = shift;
509
+ my $cnt = $Zhh;
510
+ my $nhi = $Zhl;
511
+ my $nlo = $Zlh;
512
+ my $rem = $Zll;
513
+
514
+ my ($Zlo,$Zhi) = ("mm0","mm1");
515
+ my $tmp = "mm2";
516
+
517
+ &xor ($nlo,$nlo); # avoid partial register stalls on PIII
518
+ &mov ($nhi,$Zll);
519
+ &mov (&LB($nlo),&LB($nhi));
520
+ &mov ($cnt,14);
521
+ &shl (&LB($nlo),4);
522
+ &and ($nhi,0xf0);
523
+ &movq ($Zlo,&QWP(8,$Htbl,$nlo));
524
+ &movq ($Zhi,&QWP(0,$Htbl,$nlo));
525
+ &movd ($rem,$Zlo);
526
+ &jmp (&label("mmx_loop"));
527
+
528
+ &set_label("mmx_loop",16);
529
+ &psrlq ($Zlo,4);
530
+ &and ($rem,0xf);
531
+ &movq ($tmp,$Zhi);
532
+ &psrlq ($Zhi,4);
533
+ &pxor ($Zlo,&QWP(8,$Htbl,$nhi));
534
+ &mov (&LB($nlo),&BP(0,$inp,$cnt));
535
+ &psllq ($tmp,60);
536
+ &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
537
+ &dec ($cnt);
538
+ &movd ($rem,$Zlo);
539
+ &pxor ($Zhi,&QWP(0,$Htbl,$nhi));
540
+ &mov ($nhi,$nlo);
541
+ &pxor ($Zlo,$tmp);
542
+ &js (&label("mmx_break"));
543
+
544
+ &shl (&LB($nlo),4);
545
+ &and ($rem,0xf);
546
+ &psrlq ($Zlo,4);
547
+ &and ($nhi,0xf0);
548
+ &movq ($tmp,$Zhi);
549
+ &psrlq ($Zhi,4);
550
+ &pxor ($Zlo,&QWP(8,$Htbl,$nlo));
551
+ &psllq ($tmp,60);
552
+ &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
553
+ &movd ($rem,$Zlo);
554
+ &pxor ($Zhi,&QWP(0,$Htbl,$nlo));
555
+ &pxor ($Zlo,$tmp);
556
+ &jmp (&label("mmx_loop"));
557
+
558
+ &set_label("mmx_break",16);
559
+ &shl (&LB($nlo),4);
560
+ &and ($rem,0xf);
561
+ &psrlq ($Zlo,4);
562
+ &and ($nhi,0xf0);
563
+ &movq ($tmp,$Zhi);
564
+ &psrlq ($Zhi,4);
565
+ &pxor ($Zlo,&QWP(8,$Htbl,$nlo));
566
+ &psllq ($tmp,60);
567
+ &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
568
+ &movd ($rem,$Zlo);
569
+ &pxor ($Zhi,&QWP(0,$Htbl,$nlo));
570
+ &pxor ($Zlo,$tmp);
571
+
572
+ &psrlq ($Zlo,4);
573
+ &and ($rem,0xf);
574
+ &movq ($tmp,$Zhi);
575
+ &psrlq ($Zhi,4);
576
+ &pxor ($Zlo,&QWP(8,$Htbl,$nhi));
577
+ &psllq ($tmp,60);
578
+ &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
579
+ &movd ($rem,$Zlo);
580
+ &pxor ($Zhi,&QWP(0,$Htbl,$nhi));
581
+ &pxor ($Zlo,$tmp);
582
+
583
+ &psrlq ($Zlo,32); # lower part of Zlo is already there
584
+ &movd ($Zhl,$Zhi);
585
+ &psrlq ($Zhi,32);
586
+ &movd ($Zlh,$Zlo);
587
+ &movd ($Zhh,$Zhi);
588
+
589
+ &bswap ($Zll);
590
+ &bswap ($Zhl);
591
+ &bswap ($Zlh);
592
+ &bswap ($Zhh);
593
+ }
594
+
595
+ &function_begin("gcm_gmult_4bit_mmx");
596
+ &mov ($inp,&wparam(0)); # load Xi
597
+ &mov ($Htbl,&wparam(1)); # load Htable
598
+
599
+ &call (&label("pic_point"));
600
+ &set_label("pic_point");
601
+ &blindpop("eax");
602
+ &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
603
+
604
+ &movz ($Zll,&BP(15,$inp));
605
+
606
+ &mmx_loop($inp,"eax");
607
+
608
+ &emms ();
609
+ &mov (&DWP(12,$inp),$Zll);
610
+ &mov (&DWP(4,$inp),$Zhl);
611
+ &mov (&DWP(8,$inp),$Zlh);
612
+ &mov (&DWP(0,$inp),$Zhh);
613
+ &function_end("gcm_gmult_4bit_mmx");
614
+
615
+ ######################################################################
616
+ # Below subroutine is "528B" variant of "4-bit" GCM GHASH function
617
+ # (see gcm128.c for details). It provides further 20-40% performance
618
+ # improvement over above mentioned "May" version.
619
+
620
+ &static_label("rem_8bit");
621
+
622
+ &function_begin("gcm_ghash_4bit_mmx");
623
+ { my ($Zlo,$Zhi) = ("mm7","mm6");
624
+ my $rem_8bit = "esi";
625
+ my $Htbl = "ebx";
626
+
627
+ # parameter block
628
+ &mov ("eax",&wparam(0)); # Xi
629
+ &mov ("ebx",&wparam(1)); # Htable
630
+ &mov ("ecx",&wparam(2)); # inp
631
+ &mov ("edx",&wparam(3)); # len
632
+ &mov ("ebp","esp"); # original %esp
633
+ &call (&label("pic_point"));
634
+ &set_label ("pic_point");
635
+ &blindpop ($rem_8bit);
636
+ &lea ($rem_8bit,&DWP(&label("rem_8bit")."-".&label("pic_point"),$rem_8bit));
637
+
638
+ &sub ("esp",512+16+16); # allocate stack frame...
639
+ &and ("esp",-64); # ...and align it
640
+ &sub ("esp",16); # place for (u8)(H[]<<4)
641
+
642
+ &add ("edx","ecx"); # pointer to the end of input
643
+ &mov (&DWP(528+16+0,"esp"),"eax"); # save Xi
644
+ &mov (&DWP(528+16+8,"esp"),"edx"); # save inp+len
645
+ &mov (&DWP(528+16+12,"esp"),"ebp"); # save original %esp
646
+
647
+ { my @lo = ("mm0","mm1","mm2");
648
+ my @hi = ("mm3","mm4","mm5");
649
+ my @tmp = ("mm6","mm7");
650
+ my ($off1,$off2,$i) = (0,0,);
651
+
652
+ &add ($Htbl,128); # optimize for size
653
+ &lea ("edi",&DWP(16+128,"esp"));
654
+ &lea ("ebp",&DWP(16+256+128,"esp"));
655
+
656
+ # decompose Htable (low and high parts are kept separately),
657
+ # generate Htable[]>>4, (u8)(Htable[]<<4), save to stack...
658
+ for ($i=0;$i<18;$i++) {
659
+
660
+ &mov ("edx",&DWP(16*$i+8-128,$Htbl)) if ($i<16);
661
+ &movq ($lo[0],&QWP(16*$i+8-128,$Htbl)) if ($i<16);
662
+ &psllq ($tmp[1],60) if ($i>1);
663
+ &movq ($hi[0],&QWP(16*$i+0-128,$Htbl)) if ($i<16);
664
+ &por ($lo[2],$tmp[1]) if ($i>1);
665
+ &movq (&QWP($off1-128,"edi"),$lo[1]) if ($i>0 && $i<17);
666
+ &psrlq ($lo[1],4) if ($i>0 && $i<17);
667
+ &movq (&QWP($off1,"edi"),$hi[1]) if ($i>0 && $i<17);
668
+ &movq ($tmp[0],$hi[1]) if ($i>0 && $i<17);
669
+ &movq (&QWP($off2-128,"ebp"),$lo[2]) if ($i>1);
670
+ &psrlq ($hi[1],4) if ($i>0 && $i<17);
671
+ &movq (&QWP($off2,"ebp"),$hi[2]) if ($i>1);
672
+ &shl ("edx",4) if ($i<16);
673
+ &mov (&BP($i,"esp"),&LB("edx")) if ($i<16);
674
+
675
+ unshift (@lo,pop(@lo)); # "rotate" registers
676
+ unshift (@hi,pop(@hi));
677
+ unshift (@tmp,pop(@tmp));
678
+ $off1 += 8 if ($i>0);
679
+ $off2 += 8 if ($i>1);
680
+ }
681
+ }
682
+
683
+ &movq ($Zhi,&QWP(0,"eax"));
684
+ &mov ("ebx",&DWP(8,"eax"));
685
+ &mov ("edx",&DWP(12,"eax")); # load Xi
686
+
687
+ &set_label("outer",16);
688
+ { my $nlo = "eax";
689
+ my $dat = "edx";
690
+ my @nhi = ("edi","ebp");
691
+ my @rem = ("ebx","ecx");
692
+ my @red = ("mm0","mm1","mm2");
693
+ my $tmp = "mm3";
694
+
695
+ &xor ($dat,&DWP(12,"ecx")); # merge input data
696
+ &xor ("ebx",&DWP(8,"ecx"));
697
+ &pxor ($Zhi,&QWP(0,"ecx"));
698
+ &lea ("ecx",&DWP(16,"ecx")); # inp+=16
699
+ #&mov (&DWP(528+12,"esp"),$dat); # save inp^Xi
700
+ &mov (&DWP(528+8,"esp"),"ebx");
701
+ &movq (&QWP(528+0,"esp"),$Zhi);
702
+ &mov (&DWP(528+16+4,"esp"),"ecx"); # save inp
703
+
704
+ &xor ($nlo,$nlo);
705
+ &rol ($dat,8);
706
+ &mov (&LB($nlo),&LB($dat));
707
+ &mov ($nhi[1],$nlo);
708
+ &and (&LB($nlo),0x0f);
709
+ &shr ($nhi[1],4);
710
+ &pxor ($red[0],$red[0]);
711
+ &rol ($dat,8); # next byte
712
+ &pxor ($red[1],$red[1]);
713
+ &pxor ($red[2],$red[2]);
714
+
715
+ # Just like in "May" verson modulo-schedule for critical path in
716
+ # 'Z.hi ^= rem_8bit[Z.lo&0xff^((u8)H[nhi]<<4)]<<48'. Final 'pxor'
717
+ # is scheduled so late that rem_8bit[] has to be shifted *right*
718
+ # by 16, which is why last argument to pinsrw is 2, which
719
+ # corresponds to <<32=<<48>>16...
720
+ for ($j=11,$i=0;$i<15;$i++) {
721
+
722
+ if ($i>0) {
723
+ &pxor ($Zlo,&QWP(16,"esp",$nlo,8)); # Z^=H[nlo]
724
+ &rol ($dat,8); # next byte
725
+ &pxor ($Zhi,&QWP(16+128,"esp",$nlo,8));
726
+
727
+ &pxor ($Zlo,$tmp);
728
+ &pxor ($Zhi,&QWP(16+256+128,"esp",$nhi[0],8));
729
+ &xor (&LB($rem[1]),&BP(0,"esp",$nhi[0])); # rem^(H[nhi]<<4)
730
+ } else {
731
+ &movq ($Zlo,&QWP(16,"esp",$nlo,8));
732
+ &movq ($Zhi,&QWP(16+128,"esp",$nlo,8));
733
+ }
734
+
735
+ &mov (&LB($nlo),&LB($dat));
736
+ &mov ($dat,&DWP(528+$j,"esp")) if (--$j%4==0);
737
+
738
+ &movd ($rem[0],$Zlo);
739
+ &movz ($rem[1],&LB($rem[1])) if ($i>0);
740
+ &psrlq ($Zlo,8); # Z>>=8
741
+
742
+ &movq ($tmp,$Zhi);
743
+ &mov ($nhi[0],$nlo);
744
+ &psrlq ($Zhi,8);
745
+
746
+ &pxor ($Zlo,&QWP(16+256+0,"esp",$nhi[1],8)); # Z^=H[nhi]>>4
747
+ &and (&LB($nlo),0x0f);
748
+ &psllq ($tmp,56);
749
+
750
+ &pxor ($Zhi,$red[1]) if ($i>1);
751
+ &shr ($nhi[0],4);
752
+ &pinsrw ($red[0],&WP(0,$rem_8bit,$rem[1],2),2) if ($i>0);
753
+
754
+ unshift (@red,pop(@red)); # "rotate" registers
755
+ unshift (@rem,pop(@rem));
756
+ unshift (@nhi,pop(@nhi));
757
+ }
758
+
759
+ &pxor ($Zlo,&QWP(16,"esp",$nlo,8)); # Z^=H[nlo]
760
+ &pxor ($Zhi,&QWP(16+128,"esp",$nlo,8));
761
+ &xor (&LB($rem[1]),&BP(0,"esp",$nhi[0])); # rem^(H[nhi]<<4)
762
+
763
+ &pxor ($Zlo,$tmp);
764
+ &pxor ($Zhi,&QWP(16+256+128,"esp",$nhi[0],8));
765
+ &movz ($rem[1],&LB($rem[1]));
766
+
767
+ &pxor ($red[2],$red[2]); # clear 2nd word
768
+ &psllq ($red[1],4);
769
+
770
+ &movd ($rem[0],$Zlo);
771
+ &psrlq ($Zlo,4); # Z>>=4
772
+
773
+ &movq ($tmp,$Zhi);
774
+ &psrlq ($Zhi,4);
775
+ &shl ($rem[0],4); # rem<<4
776
+
777
+ &pxor ($Zlo,&QWP(16,"esp",$nhi[1],8)); # Z^=H[nhi]
778
+ &psllq ($tmp,60);
779
+ &movz ($rem[0],&LB($rem[0]));
780
+
781
+ &pxor ($Zlo,$tmp);
782
+ &pxor ($Zhi,&QWP(16+128,"esp",$nhi[1],8));
783
+
784
+ &pinsrw ($red[0],&WP(0,$rem_8bit,$rem[1],2),2);
785
+ &pxor ($Zhi,$red[1]);
786
+
787
+ &movd ($dat,$Zlo);
788
+ &pinsrw ($red[2],&WP(0,$rem_8bit,$rem[0],2),3); # last is <<48
789
+
790
+ &psllq ($red[0],12); # correct by <<16>>4
791
+ &pxor ($Zhi,$red[0]);
792
+ &psrlq ($Zlo,32);
793
+ &pxor ($Zhi,$red[2]);
794
+
795
+ &mov ("ecx",&DWP(528+16+4,"esp")); # restore inp
796
+ &movd ("ebx",$Zlo);
797
+ &movq ($tmp,$Zhi); # 01234567
798
+ &psllw ($Zhi,8); # 1.3.5.7.
799
+ &psrlw ($tmp,8); # .0.2.4.6
800
+ &por ($Zhi,$tmp); # 10325476
801
+ &bswap ($dat);
802
+ &pshufw ($Zhi,$Zhi,0b00011011); # 76543210
803
+ &bswap ("ebx");
804
+
805
+ &cmp ("ecx",&DWP(528+16+8,"esp")); # are we done?
806
+ &jne (&label("outer"));
807
+ }
808
+
809
+ &mov ("eax",&DWP(528+16+0,"esp")); # restore Xi
810
+ &mov (&DWP(12,"eax"),"edx");
811
+ &mov (&DWP(8,"eax"),"ebx");
812
+ &movq (&QWP(0,"eax"),$Zhi);
813
+
814
+ &mov ("esp",&DWP(528+16+12,"esp")); # restore original %esp
815
+ &emms ();
816
+ }
817
+ &function_end("gcm_ghash_4bit_mmx");
818
+ }}
819
+
820
+ if ($sse2) {{
821
+ ######################################################################
822
+ # PCLMULQDQ version.
823
+
824
+ $Xip="eax";
825
+ $Htbl="edx";
826
+ $const="ecx";
827
+ $inp="esi";
828
+ $len="ebx";
829
+
830
+ ($Xi,$Xhi)=("xmm0","xmm1"); $Hkey="xmm2";
831
+ ($T1,$T2,$T3)=("xmm3","xmm4","xmm5");
832
+ ($Xn,$Xhn)=("xmm6","xmm7");
833
+
834
+ &static_label("bswap");
835
+
836
+ sub clmul64x64_T2 { # minimal "register" pressure
837
+ my ($Xhi,$Xi,$Hkey,$HK)=@_;
838
+
839
+ &movdqa ($Xhi,$Xi); #
840
+ &pshufd ($T1,$Xi,0b01001110);
841
+ &pshufd ($T2,$Hkey,0b01001110) if (!defined($HK));
842
+ &pxor ($T1,$Xi); #
843
+ &pxor ($T2,$Hkey) if (!defined($HK));
844
+ $HK=$T2 if (!defined($HK));
845
+
846
+ &pclmulqdq ($Xi,$Hkey,0x00); #######
847
+ &pclmulqdq ($Xhi,$Hkey,0x11); #######
848
+ &pclmulqdq ($T1,$HK,0x00); #######
849
+ &xorps ($T1,$Xi); #
850
+ &xorps ($T1,$Xhi); #
851
+
852
+ &movdqa ($T2,$T1); #
853
+ &psrldq ($T1,8);
854
+ &pslldq ($T2,8); #
855
+ &pxor ($Xhi,$T1);
856
+ &pxor ($Xi,$T2); #
857
+ }
858
+
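For reference, the three pclmulqdq instructions in clmul64x64_T2 above implement one Karatsuba step over the 64-bit halves of Xi and the hash key; in carry-less (XOR) arithmetic the identity being exploited is

$$(x_1 z^{64} + x_0)(h_1 z^{64} + h_0) = x_1 h_1\, z^{128} + \bigl[(x_1 \oplus x_0)(h_1 \oplus h_0) \oplus x_1 h_1 \oplus x_0 h_0\bigr] z^{64} + x_0 h_0,$$

so only three 64x64 carry-less multiplies are needed instead of four; the psrldq/pslldq pair then splits the middle term across the two 128-bit halves before it is folded into Xhi:Xi.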
859
+ sub clmul64x64_T3 {
860
+ # Even though this subroutine offers visually better ILP, it
861
+ # was empirically found to be a tad slower than above version.
862
+ # At least in gcm_ghash_clmul context. But it's just as well,
863
+ # because loop modulo-scheduling is possible only thanks to
864
+ # minimized "register" pressure...
865
+ my ($Xhi,$Xi,$Hkey)=@_;
866
+
867
+ &movdqa ($T1,$Xi); #
868
+ &movdqa ($Xhi,$Xi);
869
+ &pclmulqdq ($Xi,$Hkey,0x00); #######
870
+ &pclmulqdq ($Xhi,$Hkey,0x11); #######
871
+ &pshufd ($T2,$T1,0b01001110); #
872
+ &pshufd ($T3,$Hkey,0b01001110);
873
+ &pxor ($T2,$T1); #
874
+ &pxor ($T3,$Hkey);
875
+ &pclmulqdq ($T2,$T3,0x00); #######
876
+ &pxor ($T2,$Xi); #
877
+ &pxor ($T2,$Xhi); #
878
+
879
+ &movdqa ($T3,$T2); #
880
+ &psrldq ($T2,8);
881
+ &pslldq ($T3,8); #
882
+ &pxor ($Xhi,$T2);
883
+ &pxor ($Xi,$T3); #
884
+ }
885
+
886
+ if (1) { # Algorithm 9 with <<1 twist.
887
+ # Reduction is shorter and uses only two
888
+ # temporary registers, which makes it better
889
+ # candidate for interleaving with 64x64
890
+ # multiplication. Pre-modulo-scheduled loop
891
+ # was found to be ~20% faster than Algorithm 5
892
+ # below. Algorithm 9 was therefore chosen for
893
+ # further optimization...
894
+
895
+ sub reduction_alg9 { # 17/11 times faster than Intel version
896
+ my ($Xhi,$Xi) = @_;
897
+
898
+ # 1st phase
899
+ &movdqa ($T2,$Xi); #
900
+ &movdqa ($T1,$Xi);
901
+ &psllq ($Xi,5);
902
+ &pxor ($T1,$Xi); #
903
+ &psllq ($Xi,1);
904
+ &pxor ($Xi,$T1); #
905
+ &psllq ($Xi,57); #
906
+ &movdqa ($T1,$Xi); #
907
+ &pslldq ($Xi,8);
908
+ &psrldq ($T1,8); #
909
+ &pxor ($Xi,$T2);
910
+ &pxor ($Xhi,$T1); #
911
+
912
+ # 2nd phase
913
+ &movdqa ($T2,$Xi);
914
+ &psrlq ($Xi,1);
915
+ &pxor ($Xhi,$T2); #
916
+ &pxor ($T2,$Xi);
917
+ &psrlq ($Xi,5);
918
+ &pxor ($Xi,$T2); #
919
+ &psrlq ($Xi,1); #
920
+ &pxor ($Xi,$Xhi) #
921
+ }
922
+
923
+ &function_begin_B("gcm_init_clmul");
924
+ &mov ($Htbl,&wparam(0));
925
+ &mov ($Xip,&wparam(1));
926
+
927
+ &call (&label("pic"));
928
+ &set_label("pic");
929
+ &blindpop ($const);
930
+ &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
931
+
932
+ &movdqu ($Hkey,&QWP(0,$Xip));
933
+ &pshufd ($Hkey,$Hkey,0b01001110);# dword swap
934
+
935
+ # <<1 twist
936
+ &pshufd ($T2,$Hkey,0b11111111); # broadcast uppermost dword
937
+ &movdqa ($T1,$Hkey);
938
+ &psllq ($Hkey,1);
939
+ &pxor ($T3,$T3); #
940
+ &psrlq ($T1,63);
941
+ &pcmpgtd ($T3,$T2); # broadcast carry bit
942
+ &pslldq ($T1,8);
943
+ &por ($Hkey,$T1); # H<<=1
944
+
945
+ # magic reduction
946
+ &pand ($T3,&QWP(16,$const)); # 0x1c2_polynomial
947
+ &pxor ($Hkey,$T3); # if(carry) H^=0x1c2_polynomial
948
+
949
+ # calculate H^2
950
+ &movdqa ($Xi,$Hkey);
951
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey);
952
+ &reduction_alg9 ($Xhi,$Xi);
953
+
954
+ &pshufd ($T1,$Hkey,0b01001110);
955
+ &pshufd ($T2,$Xi,0b01001110);
956
+ &pxor ($T1,$Hkey); # Karatsuba pre-processing
957
+ &movdqu (&QWP(0,$Htbl),$Hkey); # save H
958
+ &pxor ($T2,$Xi); # Karatsuba pre-processing
959
+ &movdqu (&QWP(16,$Htbl),$Xi); # save H^2
960
+ &palignr ($T2,$T1,8); # low part is H.lo^H.hi
961
+ &movdqu (&QWP(32,$Htbl),$T2); # save Karatsuba "salt"
962
+
963
+ &ret ();
964
+ &function_end_B("gcm_init_clmul");
965
+
966
+ &function_begin_B("gcm_gmult_clmul");
967
+ &mov ($Xip,&wparam(0));
968
+ &mov ($Htbl,&wparam(1));
969
+
970
+ &call (&label("pic"));
971
+ &set_label("pic");
972
+ &blindpop ($const);
973
+ &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
974
+
975
+ &movdqu ($Xi,&QWP(0,$Xip));
976
+ &movdqa ($T3,&QWP(0,$const));
977
+ &movups ($Hkey,&QWP(0,$Htbl));
978
+ &pshufb ($Xi,$T3);
979
+ &movups ($T2,&QWP(32,$Htbl));
980
+
981
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$T2);
982
+ &reduction_alg9 ($Xhi,$Xi);
983
+
984
+ &pshufb ($Xi,$T3);
985
+ &movdqu (&QWP(0,$Xip),$Xi);
986
+
987
+ &ret ();
988
+ &function_end_B("gcm_gmult_clmul");
989
+
990
+ &function_begin("gcm_ghash_clmul");
991
+ &mov ($Xip,&wparam(0));
992
+ &mov ($Htbl,&wparam(1));
993
+ &mov ($inp,&wparam(2));
994
+ &mov ($len,&wparam(3));
995
+
996
+ &call (&label("pic"));
997
+ &set_label("pic");
998
+ &blindpop ($const);
999
+ &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
1000
+
1001
+ &movdqu ($Xi,&QWP(0,$Xip));
1002
+ &movdqa ($T3,&QWP(0,$const));
1003
+ &movdqu ($Hkey,&QWP(0,$Htbl));
1004
+ &pshufb ($Xi,$T3);
1005
+
1006
+ &sub ($len,0x10);
1007
+ &jz (&label("odd_tail"));
1008
+
1009
+ #######
1010
+ # Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
1011
+ # [(H*Ii+1) + (H*Xi+1)] mod P =
1012
+ # [(H*Ii+1) + H^2*(Ii+Xi)] mod P
1013
+ #
1014
+ &movdqu ($T1,&QWP(0,$inp)); # Ii
1015
+ &movdqu ($Xn,&QWP(16,$inp)); # Ii+1
1016
+ &pshufb ($T1,$T3);
1017
+ &pshufb ($Xn,$T3);
1018
+ &movdqu ($T3,&QWP(32,$Htbl));
1019
+ &pxor ($Xi,$T1); # Ii+Xi
1020
+
1021
+ &pshufd ($T1,$Xn,0b01001110); # H*Ii+1
1022
+ &movdqa ($Xhn,$Xn);
1023
+ &pxor ($T1,$Xn); #
1024
+ &lea ($inp,&DWP(32,$inp)); # i+=2
1025
+
1026
+ &pclmulqdq ($Xn,$Hkey,0x00); #######
1027
+ &pclmulqdq ($Xhn,$Hkey,0x11); #######
1028
+ &pclmulqdq ($T1,$T3,0x00); #######
1029
+ &movups ($Hkey,&QWP(16,$Htbl)); # load H^2
1030
+ &nop ();
1031
+
1032
+ &sub ($len,0x20);
1033
+ &jbe (&label("even_tail"));
1034
+ &jmp (&label("mod_loop"));
1035
+
1036
+ &set_label("mod_loop",32);
1037
+ &pshufd ($T2,$Xi,0b01001110); # H^2*(Ii+Xi)
1038
+ &movdqa ($Xhi,$Xi);
1039
+ &pxor ($T2,$Xi); #
1040
+ &nop ();
1041
+
1042
+ &pclmulqdq ($Xi,$Hkey,0x00); #######
1043
+ &pclmulqdq ($Xhi,$Hkey,0x11); #######
1044
+ &pclmulqdq ($T2,$T3,0x10); #######
1045
+ &movups ($Hkey,&QWP(0,$Htbl)); # load H
1046
+
1047
+ &xorps ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
1048
+ &movdqa ($T3,&QWP(0,$const));
1049
+ &xorps ($Xhi,$Xhn);
1050
+ &movdqu ($Xhn,&QWP(0,$inp)); # Ii
1051
+ &pxor ($T1,$Xi); # aggregated Karatsuba post-processing
1052
+ &movdqu ($Xn,&QWP(16,$inp)); # Ii+1
1053
+ &pxor ($T1,$Xhi); #
1054
+
1055
+ &pshufb ($Xhn,$T3);
1056
+ &pxor ($T2,$T1); #
1057
+
1058
+ &movdqa ($T1,$T2); #
1059
+ &psrldq ($T2,8);
1060
+ &pslldq ($T1,8); #
1061
+ &pxor ($Xhi,$T2);
1062
+ &pxor ($Xi,$T1); #
1063
+ &pshufb ($Xn,$T3);
1064
+ &pxor ($Xhi,$Xhn); # "Ii+Xi", consume early
1065
+
1066
+ &movdqa ($Xhn,$Xn); #&clmul64x64_TX ($Xhn,$Xn,$Hkey); H*Ii+1
1067
+ &movdqa ($T2,$Xi); #&reduction_alg9($Xhi,$Xi); 1st phase
1068
+ &movdqa ($T1,$Xi);
1069
+ &psllq ($Xi,5);
1070
+ &pxor ($T1,$Xi); #
1071
+ &psllq ($Xi,1);
1072
+ &pxor ($Xi,$T1); #
1073
+ &pclmulqdq ($Xn,$Hkey,0x00); #######
1074
+ &movups ($T3,&QWP(32,$Htbl));
1075
+ &psllq ($Xi,57); #
1076
+ &movdqa ($T1,$Xi); #
1077
+ &pslldq ($Xi,8);
1078
+ &psrldq ($T1,8); #
1079
+ &pxor ($Xi,$T2);
1080
+ &pxor ($Xhi,$T1); #
1081
+ &pshufd ($T1,$Xhn,0b01001110);
1082
+ &movdqa ($T2,$Xi); # 2nd phase
1083
+ &psrlq ($Xi,1);
1084
+ &pxor ($T1,$Xhn);
1085
+ &pxor ($Xhi,$T2); #
1086
+ &pclmulqdq ($Xhn,$Hkey,0x11); #######
1087
+ &movups ($Hkey,&QWP(16,$Htbl)); # load H^2
1088
+ &pxor ($T2,$Xi);
1089
+ &psrlq ($Xi,5);
1090
+ &pxor ($Xi,$T2); #
1091
+ &psrlq ($Xi,1); #
1092
+ &pxor ($Xi,$Xhi) #
1093
+ &pclmulqdq ($T1,$T3,0x00); #######
1094
+
1095
+ &lea ($inp,&DWP(32,$inp));
1096
+ &sub ($len,0x20);
1097
+ &ja (&label("mod_loop"));
1098
+
1099
+ &set_label("even_tail");
1100
+ &pshufd ($T2,$Xi,0b01001110); # H^2*(Ii+Xi)
1101
+ &movdqa ($Xhi,$Xi);
1102
+ &pxor ($T2,$Xi); #
1103
+
1104
+ &pclmulqdq ($Xi,$Hkey,0x00); #######
1105
+ &pclmulqdq ($Xhi,$Hkey,0x11); #######
1106
+ &pclmulqdq ($T2,$T3,0x10); #######
1107
+ &movdqa ($T3,&QWP(0,$const));
1108
+
1109
+ &xorps ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
1110
+ &xorps ($Xhi,$Xhn);
1111
+ &pxor ($T1,$Xi); # aggregated Karatsuba post-processing
1112
+ &pxor ($T1,$Xhi); #
1113
+
1114
+ &pxor ($T2,$T1); #
1115
+
1116
+ &movdqa ($T1,$T2); #
1117
+ &psrldq ($T2,8);
1118
+ &pslldq ($T1,8); #
1119
+ &pxor ($Xhi,$T2);
1120
+ &pxor ($Xi,$T1); #
1121
+
1122
+ &reduction_alg9 ($Xhi,$Xi);
1123
+
1124
+ &test ($len,$len);
1125
+ &jnz (&label("done"));
1126
+
1127
+ &movups ($Hkey,&QWP(0,$Htbl)); # load H
1128
+ &set_label("odd_tail");
1129
+ &movdqu ($T1,&QWP(0,$inp)); # Ii
1130
+ &pshufb ($T1,$T3);
1131
+ &pxor ($Xi,$T1); # Ii+Xi
1132
+
1133
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi)
1134
+ &reduction_alg9 ($Xhi,$Xi);
1135
+
1136
+ &set_label("done");
1137
+ &pshufb ($Xi,$T3);
1138
+ &movdqu (&QWP(0,$Xip),$Xi);
1139
+ &function_end("gcm_ghash_clmul");
1140
+
1141
+ } else { # Algorithm 5. Kept for reference purposes.
1142
+
1143
+ sub reduction_alg5 { # 19/16 times faster than Intel version
1144
+ my ($Xhi,$Xi)=@_;
1145
+
1146
+ # <<1
1147
+ &movdqa ($T1,$Xi); #
1148
+ &movdqa ($T2,$Xhi);
1149
+ &pslld ($Xi,1);
1150
+ &pslld ($Xhi,1); #
1151
+ &psrld ($T1,31);
1152
+ &psrld ($T2,31); #
1153
+ &movdqa ($T3,$T1);
1154
+ &pslldq ($T1,4);
1155
+ &psrldq ($T3,12); #
1156
+ &pslldq ($T2,4);
1157
+ &por ($Xhi,$T3); #
1158
+ &por ($Xi,$T1);
1159
+ &por ($Xhi,$T2); #
1160
+
1161
+ # 1st phase
1162
+ &movdqa ($T1,$Xi);
1163
+ &movdqa ($T2,$Xi);
1164
+ &movdqa ($T3,$Xi); #
1165
+ &pslld ($T1,31);
1166
+ &pslld ($T2,30);
1167
+ &pslld ($Xi,25); #
1168
+ &pxor ($T1,$T2);
1169
+ &pxor ($T1,$Xi); #
1170
+ &movdqa ($T2,$T1); #
1171
+ &pslldq ($T1,12);
1172
+ &psrldq ($T2,4); #
1173
+ &pxor ($T3,$T1);
1174
+
1175
+ # 2nd phase
1176
+ &pxor ($Xhi,$T3); #
1177
+ &movdqa ($Xi,$T3);
1178
+ &movdqa ($T1,$T3);
1179
+ &psrld ($Xi,1); #
1180
+ &psrld ($T1,2);
1181
+ &psrld ($T3,7); #
1182
+ &pxor ($Xi,$T1);
1183
+ &pxor ($Xhi,$T2);
1184
+ &pxor ($Xi,$T3); #
1185
+ &pxor ($Xi,$Xhi); #
1186
+ }
1187
+
1188
+ &function_begin_B("gcm_init_clmul");
1189
+ &mov ($Htbl,&wparam(0));
1190
+ &mov ($Xip,&wparam(1));
1191
+
1192
+ &call (&label("pic"));
1193
+ &set_label("pic");
1194
+ &blindpop ($const);
1195
+ &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
1196
+
1197
+ &movdqu ($Hkey,&QWP(0,$Xip));
1198
+ &pshufd ($Hkey,$Hkey,0b01001110);# dword swap
1199
+
1200
+ # calculate H^2
1201
+ &movdqa ($Xi,$Hkey);
1202
+ &clmul64x64_T3 ($Xhi,$Xi,$Hkey);
1203
+ &reduction_alg5 ($Xhi,$Xi);
1204
+
1205
+ &movdqu (&QWP(0,$Htbl),$Hkey); # save H
1206
+ &movdqu (&QWP(16,$Htbl),$Xi); # save H^2
1207
+
1208
+ &ret ();
1209
+ &function_end_B("gcm_init_clmul");
1210
+
1211
+ &function_begin_B("gcm_gmult_clmul");
1212
+ &mov ($Xip,&wparam(0));
1213
+ &mov ($Htbl,&wparam(1));
1214
+
1215
+ &call (&label("pic"));
1216
+ &set_label("pic");
1217
+ &blindpop ($const);
1218
+ &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
1219
+
1220
+ &movdqu ($Xi,&QWP(0,$Xip));
1221
+ &movdqa ($Xn,&QWP(0,$const));
1222
+ &movdqu ($Hkey,&QWP(0,$Htbl));
1223
+ &pshufb ($Xi,$Xn);
1224
+
1225
+ &clmul64x64_T3 ($Xhi,$Xi,$Hkey);
1226
+ &reduction_alg5 ($Xhi,$Xi);
1227
+
1228
+ &pshufb ($Xi,$Xn);
1229
+ &movdqu (&QWP(0,$Xip),$Xi);
1230
+
1231
+ &ret ();
1232
+ &function_end_B("gcm_gmult_clmul");
1233
+
1234
+ &function_begin("gcm_ghash_clmul");
1235
+ &mov ($Xip,&wparam(0));
1236
+ &mov ($Htbl,&wparam(1));
1237
+ &mov ($inp,&wparam(2));
1238
+ &mov ($len,&wparam(3));
1239
+
1240
+ &call (&label("pic"));
1241
+ &set_label("pic");
1242
+ &blindpop ($const);
1243
+ &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
1244
+
1245
+ &movdqu ($Xi,&QWP(0,$Xip));
1246
+ &movdqa ($T3,&QWP(0,$const));
1247
+ &movdqu ($Hkey,&QWP(0,$Htbl));
1248
+ &pshufb ($Xi,$T3);
1249
+
1250
+ &sub ($len,0x10);
1251
+ &jz (&label("odd_tail"));
1252
+
1253
+ #######
1254
+ # Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
1255
+ # [(H*Ii+1) + (H*Xi+1)] mod P =
1256
+ # [(H*Ii+1) + H^2*(Ii+Xi)] mod P
1257
+ #
1258
+ &movdqu ($T1,&QWP(0,$inp)); # Ii
1259
+ &movdqu ($Xn,&QWP(16,$inp)); # Ii+1
1260
+ &pshufb ($T1,$T3);
1261
+ &pshufb ($Xn,$T3);
1262
+ &pxor ($Xi,$T1); # Ii+Xi
1263
+
1264
+ &clmul64x64_T3 ($Xhn,$Xn,$Hkey); # H*Ii+1
1265
+ &movdqu ($Hkey,&QWP(16,$Htbl)); # load H^2
1266
+
1267
+ &sub ($len,0x20);
1268
+ &lea ($inp,&DWP(32,$inp)); # i+=2
1269
+ &jbe (&label("even_tail"));
1270
+
1271
+ &set_label("mod_loop");
1272
+ &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
1273
+ &movdqu ($Hkey,&QWP(0,$Htbl)); # load H
1274
+
1275
+ &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
1276
+ &pxor ($Xhi,$Xhn);
1277
+
1278
+ &reduction_alg5 ($Xhi,$Xi);
1279
+
1280
+ #######
1281
+ &movdqa ($T3,&QWP(0,$const));
1282
+ &movdqu ($T1,&QWP(0,$inp)); # Ii
1283
+ &movdqu ($Xn,&QWP(16,$inp)); # Ii+1
1284
+ &pshufb ($T1,$T3);
1285
+ &pshufb ($Xn,$T3);
1286
+ &pxor ($Xi,$T1); # Ii+Xi
1287
+
1288
+ &clmul64x64_T3 ($Xhn,$Xn,$Hkey); # H*Ii+1
1289
+ &movdqu ($Hkey,&QWP(16,$Htbl)); # load H^2
1290
+
1291
+ &sub ($len,0x20);
1292
+ &lea ($inp,&DWP(32,$inp));
1293
+ &ja (&label("mod_loop"));
1294
+
1295
+ &set_label("even_tail");
1296
+ &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
1297
+
1298
+ &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
1299
+ &pxor ($Xhi,$Xhn);
1300
+
1301
+ &reduction_alg5 ($Xhi,$Xi);
1302
+
1303
+ &movdqa ($T3,&QWP(0,$const));
1304
+ &test ($len,$len);
1305
+ &jnz (&label("done"));
1306
+
1307
+ &movdqu ($Hkey,&QWP(0,$Htbl)); # load H
1308
+ &set_label("odd_tail");
1309
+ &movdqu ($T1,&QWP(0,$inp)); # Ii
1310
+ &pshufb ($T1,$T3);
1311
+ &pxor ($Xi,$T1); # Ii+Xi
1312
+
1313
+ &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi)
1314
+ &reduction_alg5 ($Xhi,$Xi);
1315
+
1316
+ &movdqa ($T3,&QWP(0,$const));
1317
+ &set_label("done");
1318
+ &pshufb ($Xi,$T3);
1319
+ &movdqu (&QWP(0,$Xip),$Xi);
1320
+ &function_end("gcm_ghash_clmul");
1321
+
1322
+ }
1323
+
1324
+ &set_label("bswap",64);
1325
+ &data_byte(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
1326
+ &data_byte(1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2); # 0x1c2_polynomial
1327
+ &set_label("rem_8bit",64);
1328
+ &data_short(0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E);
1329
+ &data_short(0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E);
1330
+ &data_short(0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E);
1331
+ &data_short(0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E);
1332
+ &data_short(0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E);
1333
+ &data_short(0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E);
1334
+ &data_short(0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E);
1335
+ &data_short(0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E);
1336
+ &data_short(0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE);
1337
+ &data_short(0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE);
1338
+ &data_short(0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE);
1339
+ &data_short(0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE);
1340
+ &data_short(0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E);
1341
+ &data_short(0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E);
1342
+ &data_short(0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE);
1343
+ &data_short(0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE);
1344
+ &data_short(0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E);
1345
+ &data_short(0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E);
1346
+ &data_short(0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E);
1347
+ &data_short(0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E);
1348
+ &data_short(0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E);
1349
+ &data_short(0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E);
1350
+ &data_short(0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E);
1351
+ &data_short(0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E);
1352
+ &data_short(0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE);
1353
+ &data_short(0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE);
1354
+ &data_short(0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE);
1355
+ &data_short(0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE);
1356
+ &data_short(0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E);
1357
+ &data_short(0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E);
1358
+ &data_short(0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE);
1359
+ &data_short(0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE);
1360
+ }} # $sse2
1361
+
1362
+ &set_label("rem_4bit",64);
1363
+ &data_word(0,0x0000<<$S,0,0x1C20<<$S,0,0x3840<<$S,0,0x2460<<$S);
1364
+ &data_word(0,0x7080<<$S,0,0x6CA0<<$S,0,0x48C0<<$S,0,0x54E0<<$S);
1365
+ &data_word(0,0xE100<<$S,0,0xFD20<<$S,0,0xD940<<$S,0,0xC560<<$S);
1366
+ &data_word(0,0x9180<<$S,0,0x8DA0<<$S,0,0xA9C0<<$S,0,0xB5E0<<$S);
1367
+ }}} # !$x86only
1368
+
1369
+ &asciz("GHASH for x86, CRYPTOGAMS by <appro\@openssl.org>");
1370
+ &asm_finish();
1371
+
+ # A question was raised about the choice of vanilla MMX. Or rather why wasn't
+ # SSE2 chosen instead? In addition to the fact that MMX runs on legacy
+ # CPUs such as PIII, "4-bit" MMX version was observed to provide better
+ # performance than *corresponding* SSE2 one even on contemporary CPUs.
+ # SSE2 results were provided by Peter-Michael Hager. He maintains SSE2
+ # implementation featuring full range of lookup-table sizes, but with
+ # per-invocation lookup table setup. Latter means that table size is
+ # chosen depending on how much data is to be hashed in every given call,
+ # more data - larger table. Best reported result for Core2 is ~4 cycles
+ # per processed byte out of 64KB block. This number accounts even for
+ # 64KB table setup overhead. As discussed in gcm128.c we choose to be
+ # more conservative in respect to lookup table sizes, but how do the
+ # results compare? Minimalistic "256B" MMX version delivers ~11 cycles
+ # on same platform. As also discussed in gcm128.c, next in line "8-bit
+ # Shoup's" or "4KB" method should deliver twice the performance of
+ # "256B" one, in other words not worse than ~6 cycles per byte. It
+ # should also be noted that in SSE2 case improvement can be "super-
+ # linear," i.e. more than twice, mostly because >>8 maps to single
+ # instruction on SSE2 register. This is unlike "4-bit" case when >>4
+ # maps to same amount of instructions in both MMX and SSE2 cases.
+ # Bottom line is that switch to SSE2 is considered to be justifiable
+ # only in case we choose to implement "8-bit" method...