ring-native 0.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (261) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +9 -0
  3. data/Gemfile +3 -0
  4. data/README.md +22 -0
  5. data/Rakefile +1 -0
  6. data/ext/ring/extconf.rb +29 -0
  7. data/lib/ring/native.rb +8 -0
  8. data/lib/ring/native/version.rb +5 -0
  9. data/ring-native.gemspec +25 -0
  10. data/vendor/ring/BUILDING.md +40 -0
  11. data/vendor/ring/Cargo.toml +43 -0
  12. data/vendor/ring/LICENSE +185 -0
  13. data/vendor/ring/Makefile +35 -0
  14. data/vendor/ring/PORTING.md +163 -0
  15. data/vendor/ring/README.md +113 -0
  16. data/vendor/ring/STYLE.md +197 -0
  17. data/vendor/ring/appveyor.yml +27 -0
  18. data/vendor/ring/build.rs +108 -0
  19. data/vendor/ring/crypto/aes/aes.c +1142 -0
  20. data/vendor/ring/crypto/aes/aes_test.Windows.vcxproj +25 -0
  21. data/vendor/ring/crypto/aes/aes_test.cc +93 -0
  22. data/vendor/ring/crypto/aes/asm/aes-586.pl +2368 -0
  23. data/vendor/ring/crypto/aes/asm/aes-armv4.pl +1249 -0
  24. data/vendor/ring/crypto/aes/asm/aes-x86_64.pl +2246 -0
  25. data/vendor/ring/crypto/aes/asm/aesni-x86.pl +1318 -0
  26. data/vendor/ring/crypto/aes/asm/aesni-x86_64.pl +2084 -0
  27. data/vendor/ring/crypto/aes/asm/aesv8-armx.pl +675 -0
  28. data/vendor/ring/crypto/aes/asm/bsaes-armv7.pl +1364 -0
  29. data/vendor/ring/crypto/aes/asm/bsaes-x86_64.pl +1565 -0
  30. data/vendor/ring/crypto/aes/asm/vpaes-x86.pl +841 -0
  31. data/vendor/ring/crypto/aes/asm/vpaes-x86_64.pl +1116 -0
  32. data/vendor/ring/crypto/aes/internal.h +87 -0
  33. data/vendor/ring/crypto/aes/mode_wrappers.c +61 -0
  34. data/vendor/ring/crypto/bn/add.c +394 -0
  35. data/vendor/ring/crypto/bn/asm/armv4-mont.pl +694 -0
  36. data/vendor/ring/crypto/bn/asm/armv8-mont.pl +1503 -0
  37. data/vendor/ring/crypto/bn/asm/bn-586.pl +774 -0
  38. data/vendor/ring/crypto/bn/asm/co-586.pl +287 -0
  39. data/vendor/ring/crypto/bn/asm/rsaz-avx2.pl +1882 -0
  40. data/vendor/ring/crypto/bn/asm/x86-mont.pl +592 -0
  41. data/vendor/ring/crypto/bn/asm/x86_64-gcc.c +599 -0
  42. data/vendor/ring/crypto/bn/asm/x86_64-mont.pl +1393 -0
  43. data/vendor/ring/crypto/bn/asm/x86_64-mont5.pl +3507 -0
  44. data/vendor/ring/crypto/bn/bn.c +352 -0
  45. data/vendor/ring/crypto/bn/bn_asn1.c +74 -0
  46. data/vendor/ring/crypto/bn/bn_test.Windows.vcxproj +25 -0
  47. data/vendor/ring/crypto/bn/bn_test.cc +1696 -0
  48. data/vendor/ring/crypto/bn/cmp.c +200 -0
  49. data/vendor/ring/crypto/bn/convert.c +433 -0
  50. data/vendor/ring/crypto/bn/ctx.c +311 -0
  51. data/vendor/ring/crypto/bn/div.c +594 -0
  52. data/vendor/ring/crypto/bn/exponentiation.c +1335 -0
  53. data/vendor/ring/crypto/bn/gcd.c +711 -0
  54. data/vendor/ring/crypto/bn/generic.c +1019 -0
  55. data/vendor/ring/crypto/bn/internal.h +316 -0
  56. data/vendor/ring/crypto/bn/montgomery.c +516 -0
  57. data/vendor/ring/crypto/bn/mul.c +888 -0
  58. data/vendor/ring/crypto/bn/prime.c +829 -0
  59. data/vendor/ring/crypto/bn/random.c +334 -0
  60. data/vendor/ring/crypto/bn/rsaz_exp.c +262 -0
  61. data/vendor/ring/crypto/bn/rsaz_exp.h +53 -0
  62. data/vendor/ring/crypto/bn/shift.c +276 -0
  63. data/vendor/ring/crypto/bytestring/bytestring_test.Windows.vcxproj +25 -0
  64. data/vendor/ring/crypto/bytestring/bytestring_test.cc +421 -0
  65. data/vendor/ring/crypto/bytestring/cbb.c +399 -0
  66. data/vendor/ring/crypto/bytestring/cbs.c +227 -0
  67. data/vendor/ring/crypto/bytestring/internal.h +46 -0
  68. data/vendor/ring/crypto/chacha/chacha_generic.c +140 -0
  69. data/vendor/ring/crypto/chacha/chacha_vec.c +323 -0
  70. data/vendor/ring/crypto/chacha/chacha_vec_arm.S +1447 -0
  71. data/vendor/ring/crypto/chacha/chacha_vec_arm_generate.go +153 -0
  72. data/vendor/ring/crypto/cipher/cipher_test.Windows.vcxproj +25 -0
  73. data/vendor/ring/crypto/cipher/e_aes.c +390 -0
  74. data/vendor/ring/crypto/cipher/e_chacha20poly1305.c +208 -0
  75. data/vendor/ring/crypto/cipher/internal.h +173 -0
  76. data/vendor/ring/crypto/cipher/test/aes_128_gcm_tests.txt +543 -0
  77. data/vendor/ring/crypto/cipher/test/aes_128_key_wrap_tests.txt +9 -0
  78. data/vendor/ring/crypto/cipher/test/aes_256_gcm_tests.txt +475 -0
  79. data/vendor/ring/crypto/cipher/test/aes_256_key_wrap_tests.txt +23 -0
  80. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_old_tests.txt +422 -0
  81. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_tests.txt +484 -0
  82. data/vendor/ring/crypto/cipher/test/cipher_test.txt +100 -0
  83. data/vendor/ring/crypto/constant_time_test.Windows.vcxproj +25 -0
  84. data/vendor/ring/crypto/constant_time_test.c +304 -0
  85. data/vendor/ring/crypto/cpu-arm-asm.S +32 -0
  86. data/vendor/ring/crypto/cpu-arm.c +199 -0
  87. data/vendor/ring/crypto/cpu-intel.c +261 -0
  88. data/vendor/ring/crypto/crypto.c +151 -0
  89. data/vendor/ring/crypto/curve25519/asm/x25519-arm.S +2118 -0
  90. data/vendor/ring/crypto/curve25519/curve25519.c +4888 -0
  91. data/vendor/ring/crypto/curve25519/x25519_test.cc +128 -0
  92. data/vendor/ring/crypto/digest/md32_common.h +181 -0
  93. data/vendor/ring/crypto/ec/asm/p256-x86_64-asm.pl +2725 -0
  94. data/vendor/ring/crypto/ec/ec.c +193 -0
  95. data/vendor/ring/crypto/ec/ec_curves.c +61 -0
  96. data/vendor/ring/crypto/ec/ec_key.c +228 -0
  97. data/vendor/ring/crypto/ec/ec_montgomery.c +114 -0
  98. data/vendor/ring/crypto/ec/example_mul.Windows.vcxproj +25 -0
  99. data/vendor/ring/crypto/ec/internal.h +243 -0
  100. data/vendor/ring/crypto/ec/oct.c +253 -0
  101. data/vendor/ring/crypto/ec/p256-64.c +1794 -0
  102. data/vendor/ring/crypto/ec/p256-x86_64-table.h +9548 -0
  103. data/vendor/ring/crypto/ec/p256-x86_64.c +509 -0
  104. data/vendor/ring/crypto/ec/simple.c +1007 -0
  105. data/vendor/ring/crypto/ec/util-64.c +183 -0
  106. data/vendor/ring/crypto/ec/wnaf.c +508 -0
  107. data/vendor/ring/crypto/ecdh/ecdh.c +155 -0
  108. data/vendor/ring/crypto/ecdsa/ecdsa.c +304 -0
  109. data/vendor/ring/crypto/ecdsa/ecdsa_asn1.c +193 -0
  110. data/vendor/ring/crypto/ecdsa/ecdsa_test.Windows.vcxproj +25 -0
  111. data/vendor/ring/crypto/ecdsa/ecdsa_test.cc +327 -0
  112. data/vendor/ring/crypto/header_removed.h +17 -0
  113. data/vendor/ring/crypto/internal.h +495 -0
  114. data/vendor/ring/crypto/libring.Windows.vcxproj +101 -0
  115. data/vendor/ring/crypto/mem.c +98 -0
  116. data/vendor/ring/crypto/modes/asm/aesni-gcm-x86_64.pl +1045 -0
  117. data/vendor/ring/crypto/modes/asm/ghash-armv4.pl +517 -0
  118. data/vendor/ring/crypto/modes/asm/ghash-x86.pl +1393 -0
  119. data/vendor/ring/crypto/modes/asm/ghash-x86_64.pl +1741 -0
  120. data/vendor/ring/crypto/modes/asm/ghashv8-armx.pl +422 -0
  121. data/vendor/ring/crypto/modes/ctr.c +226 -0
  122. data/vendor/ring/crypto/modes/gcm.c +1206 -0
  123. data/vendor/ring/crypto/modes/gcm_test.Windows.vcxproj +25 -0
  124. data/vendor/ring/crypto/modes/gcm_test.c +348 -0
  125. data/vendor/ring/crypto/modes/internal.h +299 -0
  126. data/vendor/ring/crypto/perlasm/arm-xlate.pl +170 -0
  127. data/vendor/ring/crypto/perlasm/readme +100 -0
  128. data/vendor/ring/crypto/perlasm/x86_64-xlate.pl +1164 -0
  129. data/vendor/ring/crypto/perlasm/x86asm.pl +292 -0
  130. data/vendor/ring/crypto/perlasm/x86gas.pl +263 -0
  131. data/vendor/ring/crypto/perlasm/x86masm.pl +200 -0
  132. data/vendor/ring/crypto/perlasm/x86nasm.pl +187 -0
  133. data/vendor/ring/crypto/poly1305/poly1305.c +331 -0
  134. data/vendor/ring/crypto/poly1305/poly1305_arm.c +301 -0
  135. data/vendor/ring/crypto/poly1305/poly1305_arm_asm.S +2015 -0
  136. data/vendor/ring/crypto/poly1305/poly1305_test.Windows.vcxproj +25 -0
  137. data/vendor/ring/crypto/poly1305/poly1305_test.cc +80 -0
  138. data/vendor/ring/crypto/poly1305/poly1305_test.txt +52 -0
  139. data/vendor/ring/crypto/poly1305/poly1305_vec.c +892 -0
  140. data/vendor/ring/crypto/rand/asm/rdrand-x86_64.pl +75 -0
  141. data/vendor/ring/crypto/rand/internal.h +32 -0
  142. data/vendor/ring/crypto/rand/rand.c +189 -0
  143. data/vendor/ring/crypto/rand/urandom.c +219 -0
  144. data/vendor/ring/crypto/rand/windows.c +56 -0
  145. data/vendor/ring/crypto/refcount_c11.c +66 -0
  146. data/vendor/ring/crypto/refcount_lock.c +53 -0
  147. data/vendor/ring/crypto/refcount_test.Windows.vcxproj +25 -0
  148. data/vendor/ring/crypto/refcount_test.c +58 -0
  149. data/vendor/ring/crypto/rsa/blinding.c +462 -0
  150. data/vendor/ring/crypto/rsa/internal.h +108 -0
  151. data/vendor/ring/crypto/rsa/padding.c +300 -0
  152. data/vendor/ring/crypto/rsa/rsa.c +450 -0
  153. data/vendor/ring/crypto/rsa/rsa_asn1.c +261 -0
  154. data/vendor/ring/crypto/rsa/rsa_impl.c +944 -0
  155. data/vendor/ring/crypto/rsa/rsa_test.Windows.vcxproj +25 -0
  156. data/vendor/ring/crypto/rsa/rsa_test.cc +437 -0
  157. data/vendor/ring/crypto/sha/asm/sha-armv8.pl +436 -0
  158. data/vendor/ring/crypto/sha/asm/sha-x86_64.pl +2390 -0
  159. data/vendor/ring/crypto/sha/asm/sha256-586.pl +1275 -0
  160. data/vendor/ring/crypto/sha/asm/sha256-armv4.pl +735 -0
  161. data/vendor/ring/crypto/sha/asm/sha256-armv8.pl +14 -0
  162. data/vendor/ring/crypto/sha/asm/sha256-x86_64.pl +14 -0
  163. data/vendor/ring/crypto/sha/asm/sha512-586.pl +911 -0
  164. data/vendor/ring/crypto/sha/asm/sha512-armv4.pl +666 -0
  165. data/vendor/ring/crypto/sha/asm/sha512-armv8.pl +14 -0
  166. data/vendor/ring/crypto/sha/asm/sha512-x86_64.pl +14 -0
  167. data/vendor/ring/crypto/sha/sha1.c +271 -0
  168. data/vendor/ring/crypto/sha/sha256.c +204 -0
  169. data/vendor/ring/crypto/sha/sha512.c +355 -0
  170. data/vendor/ring/crypto/test/file_test.cc +326 -0
  171. data/vendor/ring/crypto/test/file_test.h +181 -0
  172. data/vendor/ring/crypto/test/malloc.cc +150 -0
  173. data/vendor/ring/crypto/test/scoped_types.h +95 -0
  174. data/vendor/ring/crypto/test/test.Windows.vcxproj +35 -0
  175. data/vendor/ring/crypto/test/test_util.cc +46 -0
  176. data/vendor/ring/crypto/test/test_util.h +41 -0
  177. data/vendor/ring/crypto/thread_none.c +55 -0
  178. data/vendor/ring/crypto/thread_pthread.c +165 -0
  179. data/vendor/ring/crypto/thread_test.Windows.vcxproj +25 -0
  180. data/vendor/ring/crypto/thread_test.c +200 -0
  181. data/vendor/ring/crypto/thread_win.c +282 -0
  182. data/vendor/ring/examples/checkdigest.rs +103 -0
  183. data/vendor/ring/include/openssl/aes.h +121 -0
  184. data/vendor/ring/include/openssl/arm_arch.h +129 -0
  185. data/vendor/ring/include/openssl/base.h +156 -0
  186. data/vendor/ring/include/openssl/bn.h +794 -0
  187. data/vendor/ring/include/openssl/buffer.h +18 -0
  188. data/vendor/ring/include/openssl/bytestring.h +235 -0
  189. data/vendor/ring/include/openssl/chacha.h +37 -0
  190. data/vendor/ring/include/openssl/cmac.h +76 -0
  191. data/vendor/ring/include/openssl/cpu.h +184 -0
  192. data/vendor/ring/include/openssl/crypto.h +43 -0
  193. data/vendor/ring/include/openssl/curve25519.h +88 -0
  194. data/vendor/ring/include/openssl/ec.h +225 -0
  195. data/vendor/ring/include/openssl/ec_key.h +129 -0
  196. data/vendor/ring/include/openssl/ecdh.h +110 -0
  197. data/vendor/ring/include/openssl/ecdsa.h +156 -0
  198. data/vendor/ring/include/openssl/err.h +201 -0
  199. data/vendor/ring/include/openssl/mem.h +101 -0
  200. data/vendor/ring/include/openssl/obj_mac.h +71 -0
  201. data/vendor/ring/include/openssl/opensslfeatures.h +68 -0
  202. data/vendor/ring/include/openssl/opensslv.h +18 -0
  203. data/vendor/ring/include/openssl/ossl_typ.h +18 -0
  204. data/vendor/ring/include/openssl/poly1305.h +51 -0
  205. data/vendor/ring/include/openssl/rand.h +70 -0
  206. data/vendor/ring/include/openssl/rsa.h +399 -0
  207. data/vendor/ring/include/openssl/thread.h +133 -0
  208. data/vendor/ring/include/openssl/type_check.h +71 -0
  209. data/vendor/ring/mk/Common.props +63 -0
  210. data/vendor/ring/mk/Windows.props +42 -0
  211. data/vendor/ring/mk/WindowsTest.props +18 -0
  212. data/vendor/ring/mk/appveyor.bat +62 -0
  213. data/vendor/ring/mk/bottom_of_makefile.mk +54 -0
  214. data/vendor/ring/mk/ring.mk +266 -0
  215. data/vendor/ring/mk/top_of_makefile.mk +214 -0
  216. data/vendor/ring/mk/travis.sh +40 -0
  217. data/vendor/ring/mk/update-travis-yml.py +229 -0
  218. data/vendor/ring/ring.sln +153 -0
  219. data/vendor/ring/src/aead.rs +682 -0
  220. data/vendor/ring/src/agreement.rs +248 -0
  221. data/vendor/ring/src/c.rs +129 -0
  222. data/vendor/ring/src/constant_time.rs +37 -0
  223. data/vendor/ring/src/der.rs +96 -0
  224. data/vendor/ring/src/digest.rs +690 -0
  225. data/vendor/ring/src/digest_tests.txt +57 -0
  226. data/vendor/ring/src/ecc.rs +28 -0
  227. data/vendor/ring/src/ecc_build.rs +279 -0
  228. data/vendor/ring/src/ecc_curves.rs +117 -0
  229. data/vendor/ring/src/ed25519_tests.txt +2579 -0
  230. data/vendor/ring/src/exe_tests.rs +46 -0
  231. data/vendor/ring/src/ffi.rs +29 -0
  232. data/vendor/ring/src/file_test.rs +187 -0
  233. data/vendor/ring/src/hkdf.rs +153 -0
  234. data/vendor/ring/src/hkdf_tests.txt +59 -0
  235. data/vendor/ring/src/hmac.rs +414 -0
  236. data/vendor/ring/src/hmac_tests.txt +97 -0
  237. data/vendor/ring/src/input.rs +312 -0
  238. data/vendor/ring/src/lib.rs +41 -0
  239. data/vendor/ring/src/pbkdf2.rs +265 -0
  240. data/vendor/ring/src/pbkdf2_tests.txt +113 -0
  241. data/vendor/ring/src/polyfill.rs +57 -0
  242. data/vendor/ring/src/rand.rs +28 -0
  243. data/vendor/ring/src/signature.rs +314 -0
  244. data/vendor/ring/third-party/NIST/README.md +9 -0
  245. data/vendor/ring/third-party/NIST/SHAVS/SHA1LongMsg.rsp +263 -0
  246. data/vendor/ring/third-party/NIST/SHAVS/SHA1Monte.rsp +309 -0
  247. data/vendor/ring/third-party/NIST/SHAVS/SHA1ShortMsg.rsp +267 -0
  248. data/vendor/ring/third-party/NIST/SHAVS/SHA224LongMsg.rsp +263 -0
  249. data/vendor/ring/third-party/NIST/SHAVS/SHA224Monte.rsp +309 -0
  250. data/vendor/ring/third-party/NIST/SHAVS/SHA224ShortMsg.rsp +267 -0
  251. data/vendor/ring/third-party/NIST/SHAVS/SHA256LongMsg.rsp +263 -0
  252. data/vendor/ring/third-party/NIST/SHAVS/SHA256Monte.rsp +309 -0
  253. data/vendor/ring/third-party/NIST/SHAVS/SHA256ShortMsg.rsp +267 -0
  254. data/vendor/ring/third-party/NIST/SHAVS/SHA384LongMsg.rsp +519 -0
  255. data/vendor/ring/third-party/NIST/SHAVS/SHA384Monte.rsp +309 -0
  256. data/vendor/ring/third-party/NIST/SHAVS/SHA384ShortMsg.rsp +523 -0
  257. data/vendor/ring/third-party/NIST/SHAVS/SHA512LongMsg.rsp +519 -0
  258. data/vendor/ring/third-party/NIST/SHAVS/SHA512Monte.rsp +309 -0
  259. data/vendor/ring/third-party/NIST/SHAVS/SHA512ShortMsg.rsp +523 -0
  260. data/vendor/ring/third-party/NIST/sha256sums.txt +1 -0
  261. metadata +333 -0
@@ -0,0 +1,592 @@
1
+ #!/usr/bin/env perl
2
+
3
+ # ====================================================================
4
+ # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5
+ # project. The module is, however, dual licensed under OpenSSL and
6
+ # CRYPTOGAMS licenses depending on where you obtain it. For further
7
+ # details see http://www.openssl.org/~appro/cryptogams/.
8
+ # ====================================================================
9
+
10
+ # October 2005
11
+ #
12
+ # This is a "teaser" code, as it can be improved in several ways...
13
+ # First of all non-SSE2 path should be implemented (yes, for now it
14
+ # performs Montgomery multiplication/convolution only on SSE2-capable
15
+ # CPUs such as P4, others fall down to original code). Then inner loop
16
+ # can be unrolled and modulo-scheduled to improve ILP and possibly
17
+ # moved to 128-bit XMM register bank (though it would require input
18
+ # rearrangement and/or increase bus bandwidth utilization). Dedicated
19
+ # squaring procedure should give further performance improvement...
20
+ # Yet, for being draft, the code improves rsa512 *sign* benchmark by
21
+ # 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-)
22
+
23
+ # December 2006
24
+ #
25
+ # Modulo-scheduling SSE2 loops results in further 15-20% improvement.
26
+ # Integer-only code [being equipped with dedicated squaring procedure]
27
+ # gives ~40% on rsa512 sign benchmark...
28
+
29
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; # $dir = directory part of this script's path, trailing separator included
30
+ push(@INC,"${dir}","${dir}../../perlasm"); # search the script's own directory and ../../perlasm relative to it
31
+ require "x86asm.pl"; # presumably supplies the &asm_init/&mov/&DWP/... helpers used below (not defined in this file)
32
+
33
+ &asm_init($ARGV[0],$0); # hand the first command-line argument and the script name to the framework
34
+
35
+ $sse2=0; # SSE2 code path is not generated unless requested
36
+ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } # any argument containing -DOPENSSL_IA32_SSE2 enables it
37
+
38
+ &external_label("OPENSSL_ia32cap_P") if ($sse2); # capability vector is only referenced by the SSE2 path
39
+
40
+ &function_begin("bn_mul_mont");
41
+
42
+ $i="edx";
43
+ $j="ecx";
44
+ $ap="esi"; $tp="esi"; # overlapping variables!!!
45
+ $rp="edi"; $bp="edi"; # overlapping variables!!!
46
+ $np="ebp";
47
+ $num="ebx";
48
+
49
+ $_num=&DWP(4*0,"esp"); # stack top layout
50
+ $_rp=&DWP(4*1,"esp");
51
+ $_ap=&DWP(4*2,"esp");
52
+ $_bp=&DWP(4*3,"esp");
53
+ $_np=&DWP(4*4,"esp");
54
+ $_n0=&DWP(4*5,"esp"); $_n0q=&QWP(4*5,"esp");
55
+ $_sp=&DWP(4*6,"esp");
56
+ $_bpend=&DWP(4*7,"esp");
57
+ $frame=32; # size of above frame rounded up to 16n
58
+
59
+ &xor ("eax","eax");
60
+ &mov ("edi",&wparam(5)); # int num
61
+ &cmp ("edi",4);
62
+ &jl (&label("just_leave"));
63
+
64
+ &lea ("esi",&wparam(0)); # put aside pointer to argument block
65
+ &lea ("edx",&wparam(1)); # load ap
66
+ &mov ("ebp","esp"); # saved stack pointer!
67
+ &add ("edi",2); # extra two words on top of tp
68
+ &neg ("edi");
69
+ &lea ("esp",&DWP(-$frame,"esp","edi",4)); # alloca($frame+4*(num+2))
70
+ &neg ("edi");
71
+
72
+ # minimize cache contention by arranging 2K window between stack
73
+ # pointer and ap argument [np is also position sensitive vector,
74
+ # but it's assumed to be near ap, as it's allocated at ~same
75
+ # time].
76
+ &mov ("eax","esp");
77
+ &sub ("eax","edx");
78
+ &and ("eax",2047);
79
+ &sub ("esp","eax"); # this aligns sp and ap modulo 2048
80
+
81
+ &xor ("edx","esp");
82
+ &and ("edx",2048);
83
+ &xor ("edx",2048);
84
+ &sub ("esp","edx"); # this splits them apart modulo 4096
85
+
86
+ &and ("esp",-64); # align to cache line
87
+
88
+ ################################# load argument block...
89
+ &mov ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
90
+ &mov ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
91
+ &mov ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
92
+ &mov ("edx",&DWP(3*4,"esi"));# const BN_ULONG *np
93
+ &mov ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
94
+ #&mov ("edi",&DWP(5*4,"esi"));# int num
95
+
96
+ &mov ("esi",&DWP(0,"esi")); # pull n0[0]
97
+ &mov ($_rp,"eax"); # ... save a copy of argument block
98
+ &mov ($_ap,"ebx");
99
+ &mov ($_bp,"ecx");
100
+ &mov ($_np,"edx");
101
+ &mov ($_n0,"esi");
102
+ &lea ($num,&DWP(-3,"edi")); # num=num-1 to assist modulo-scheduling
103
+ #&mov ($_num,$num); # redundant as $num is not reused
104
+ &mov ($_sp,"ebp"); # saved stack pointer!
105
+
106
+ if($sse2) {
107
+ $acc0="mm0"; # mmx register bank layout
108
+ $acc1="mm1";
109
+ $car0="mm2";
110
+ $car1="mm3";
111
+ $mul0="mm4";
112
+ $mul1="mm5";
113
+ $temp="mm6";
114
+ $mask="mm7";
115
+
116
+ &picmeup("eax","OPENSSL_ia32cap_P");
117
+ &bt (&DWP(0,"eax"),26);
118
+ &jnc (&label("non_sse2"));
119
+
120
+ &mov ("eax",-1);
121
+ &movd ($mask,"eax"); # mask 32 lower bits
122
+
123
+ &mov ($ap,$_ap); # load input pointers
124
+ &mov ($bp,$_bp);
125
+ &mov ($np,$_np);
126
+
127
+ &xor ($i,$i); # i=0
128
+ &xor ($j,$j); # j=0
129
+
130
+ &movd ($mul0,&DWP(0,$bp)); # bp[0]
131
+ &movd ($mul1,&DWP(0,$ap)); # ap[0]
132
+ &movd ($car1,&DWP(0,$np)); # np[0]
133
+
134
+ &pmuludq($mul1,$mul0); # ap[0]*bp[0]
135
+ &movq ($car0,$mul1);
136
+ &movq ($acc0,$mul1); # I wish movd worked for
137
+ &pand ($acc0,$mask); # inter-register transfers
138
+
139
+ &pmuludq($mul1,$_n0q); # *=n0
140
+
141
+ &pmuludq($car1,$mul1); # "t[0]"*np[0]*n0
142
+ &paddq ($car1,$acc0);
143
+
144
+ &movd ($acc1,&DWP(4,$np)); # np[1]
145
+ &movd ($acc0,&DWP(4,$ap)); # ap[1]
146
+
147
+ &psrlq ($car0,32);
148
+ &psrlq ($car1,32);
149
+
150
+ &inc ($j); # j++
151
+ &set_label("1st",16);
152
+ &pmuludq($acc0,$mul0); # ap[j]*bp[0]
153
+ &pmuludq($acc1,$mul1); # np[j]*m1
154
+ &paddq ($car0,$acc0); # +=c0
155
+ &paddq ($car1,$acc1); # +=c1
156
+
157
+ &movq ($acc0,$car0);
158
+ &pand ($acc0,$mask);
159
+ &movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1]
160
+ &paddq ($car1,$acc0); # +=ap[j]*bp[0];
161
+ &movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1]
162
+ &psrlq ($car0,32);
163
+ &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[j-1]=
164
+ &psrlq ($car1,32);
165
+
166
+ &lea ($j,&DWP(1,$j));
167
+ &cmp ($j,$num);
168
+ &jl (&label("1st"));
169
+
170
+ &pmuludq($acc0,$mul0); # ap[num-1]*bp[0]
171
+ &pmuludq($acc1,$mul1); # np[num-1]*m1
172
+ &paddq ($car0,$acc0); # +=c0
173
+ &paddq ($car1,$acc1); # +=c1
174
+
175
+ &movq ($acc0,$car0);
176
+ &pand ($acc0,$mask);
177
+ &paddq ($car1,$acc0); # +=ap[num-1]*bp[0];
178
+ &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]=
179
+
180
+ &psrlq ($car0,32);
181
+ &psrlq ($car1,32);
182
+
183
+ &paddq ($car1,$car0);
184
+ &movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1]
185
+
186
+ &inc ($i); # i++
187
+ &set_label("outer");
188
+ &xor ($j,$j); # j=0
189
+
190
+ &movd ($mul0,&DWP(0,$bp,$i,4)); # bp[i]
191
+ &movd ($mul1,&DWP(0,$ap)); # ap[0]
192
+ &movd ($temp,&DWP($frame,"esp")); # tp[0]
193
+ &movd ($car1,&DWP(0,$np)); # np[0]
194
+ &pmuludq($mul1,$mul0); # ap[0]*bp[i]
195
+
196
+ &paddq ($mul1,$temp); # +=tp[0]
197
+ &movq ($acc0,$mul1);
198
+ &movq ($car0,$mul1);
199
+ &pand ($acc0,$mask);
200
+
201
+ &pmuludq($mul1,$_n0q); # *=n0
202
+
203
+ &pmuludq($car1,$mul1);
204
+ &paddq ($car1,$acc0);
205
+
206
+ &movd ($temp,&DWP($frame+4,"esp")); # tp[1]
207
+ &movd ($acc1,&DWP(4,$np)); # np[1]
208
+ &movd ($acc0,&DWP(4,$ap)); # ap[1]
209
+
210
+ &psrlq ($car0,32);
211
+ &psrlq ($car1,32);
212
+ &paddq ($car0,$temp); # +=tp[1]
213
+
214
+ &inc ($j); # j++
215
+ &dec ($num);
216
+ &set_label("inner");
217
+ &pmuludq($acc0,$mul0); # ap[j]*bp[i]
218
+ &pmuludq($acc1,$mul1); # np[j]*m1
219
+ &paddq ($car0,$acc0); # +=c0
220
+ &paddq ($car1,$acc1); # +=c1
221
+
222
+ &movq ($acc0,$car0);
223
+ &movd ($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
224
+ &pand ($acc0,$mask);
225
+ &movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1]
226
+ &paddq ($car1,$acc0); # +=ap[j]*bp[i]+tp[j]
227
+ &movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1]
228
+ &psrlq ($car0,32);
229
+ &movd (&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
230
+ &psrlq ($car1,32);
231
+ &paddq ($car0,$temp); # +=tp[j+1]
232
+
233
+ &dec ($num);
234
+ &lea ($j,&DWP(1,$j)); # j++
235
+ &jnz (&label("inner"));
236
+
237
+ &mov ($num,$j);
238
+ &pmuludq($acc0,$mul0); # ap[num-1]*bp[i]
239
+ &pmuludq($acc1,$mul1); # np[num-1]*m1
240
+ &paddq ($car0,$acc0); # +=c0
241
+ &paddq ($car1,$acc1); # +=c1
242
+
243
+ &movq ($acc0,$car0);
244
+ &pand ($acc0,$mask);
245
+ &paddq ($car1,$acc0); # +=ap[num-1]*bp[i]+tp[num-1]
246
+ &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]=
247
+ &psrlq ($car0,32);
248
+ &psrlq ($car1,32);
249
+
250
+ &movd ($temp,&DWP($frame+4,"esp",$num,4)); # += tp[num]
251
+ &paddq ($car1,$car0);
252
+ &paddq ($car1,$temp);
253
+ &movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1]
254
+
255
+ &lea ($i,&DWP(1,$i)); # i++
256
+ &cmp ($i,$num);
257
+ &jle (&label("outer"));
258
+
259
+ &emms (); # done with mmx bank
260
+ &jmp (&label("common_tail"));
261
+
262
+ &set_label("non_sse2",16);
263
+ }
264
+
265
+ if (0) {
266
+ &mov ("esp",$_sp);
267
+ &xor ("eax","eax"); # signal "not fast enough [yet]"
268
+ &jmp (&label("just_leave"));
269
+ # While the below code provides competitive performance for
270
+ # all key lengths on modern Intel cores, it's still more
271
+ # than 10% slower for 4096-bit key elsewhere:-( "Competitive"
272
+ # means compared to the original integer-only assembler.
273
+ # 512-bit RSA sign is better by ~40%, but that's about all
274
+ # one can say about all CPUs...
275
+ } else {
276
+ $inp="esi"; # integer path uses these registers differently
277
+ $word="edi";
278
+ $carry="ebp";
279
+
280
+ &mov ($inp,$_ap);
281
+ &lea ($carry,&DWP(1,$num));
282
+ &mov ($word,$_bp);
283
+ &xor ($j,$j); # j=0
284
+ &mov ("edx",$inp);
285
+ &and ($carry,1); # see if num is even
286
+ &sub ("edx",$word); # see if ap==bp
287
+ &lea ("eax",&DWP(4,$word,$num,4)); # &bp[num]
288
+ &or ($carry,"edx");
289
+ &mov ($word,&DWP(0,$word)); # bp[0]
290
+ &jz (&label("bn_sqr_mont"));
291
+ &mov ($_bpend,"eax");
292
+ &mov ("eax",&DWP(0,$inp));
293
+ &xor ("edx","edx");
294
+
295
+ &set_label("mull",16);
296
+ &mov ($carry,"edx");
297
+ &mul ($word); # ap[j]*bp[0]
298
+ &add ($carry,"eax");
299
+ &lea ($j,&DWP(1,$j));
300
+ &adc ("edx",0);
301
+ &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1]
302
+ &cmp ($j,$num);
303
+ &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
304
+ &jl (&label("mull"));
305
+
306
+ &mov ($carry,"edx");
307
+ &mul ($word); # ap[num-1]*bp[0]
308
+ &mov ($word,$_n0);
309
+ &add ("eax",$carry);
310
+ &mov ($inp,$_np);
311
+ &adc ("edx",0);
312
+ &imul ($word,&DWP($frame,"esp")); # n0*tp[0]
313
+
314
+ &mov (&DWP($frame,"esp",$num,4),"eax"); # tp[num-1]=
315
+ &xor ($j,$j);
316
+ &mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]=
317
+ &mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]=
318
+
319
+ &mov ("eax",&DWP(0,$inp)); # np[0]
320
+ &mul ($word); # np[0]*m
321
+ &add ("eax",&DWP($frame,"esp")); # +=tp[0]
322
+ &mov ("eax",&DWP(4,$inp)); # np[1]
323
+ &adc ("edx",0);
324
+ &inc ($j);
325
+
326
+ &jmp (&label("2ndmadd"));
327
+
328
+ &set_label("1stmadd",16);
329
+ &mov ($carry,"edx");
330
+ &mul ($word); # ap[j]*bp[i]
331
+ &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
332
+ &lea ($j,&DWP(1,$j));
333
+ &adc ("edx",0);
334
+ &add ($carry,"eax");
335
+ &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1]
336
+ &adc ("edx",0);
337
+ &cmp ($j,$num);
338
+ &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
339
+ &jl (&label("1stmadd"));
340
+
341
+ &mov ($carry,"edx");
342
+ &mul ($word); # ap[num-1]*bp[i]
343
+ &add ("eax",&DWP($frame,"esp",$num,4)); # +=tp[num-1]
344
+ &mov ($word,$_n0);
345
+ &adc ("edx",0);
346
+ &mov ($inp,$_np);
347
+ &add ($carry,"eax");
348
+ &adc ("edx",0);
349
+ &imul ($word,&DWP($frame,"esp")); # n0*tp[0]
350
+
351
+ &xor ($j,$j);
352
+ &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
353
+ &mov (&DWP($frame,"esp",$num,4),$carry); # tp[num-1]=
354
+ &adc ($j,0);
355
+ &mov ("eax",&DWP(0,$inp)); # np[0]
356
+ &mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]=
357
+ &mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]=
358
+
359
+ &mul ($word); # np[0]*m
360
+ &add ("eax",&DWP($frame,"esp")); # +=tp[0]
361
+ &mov ("eax",&DWP(4,$inp)); # np[1]
362
+ &adc ("edx",0);
363
+ &mov ($j,1);
364
+
365
+ &set_label("2ndmadd",16);
366
+ &mov ($carry,"edx");
367
+ &mul ($word); # np[j]*m
368
+ &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
369
+ &lea ($j,&DWP(1,$j));
370
+ &adc ("edx",0);
371
+ &add ($carry,"eax");
372
+ &mov ("eax",&DWP(0,$inp,$j,4)); # np[j+1]
373
+ &adc ("edx",0);
374
+ &cmp ($j,$num);
375
+ &mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j-1]=
376
+ &jl (&label("2ndmadd"));
377
+
378
+ &mov ($carry,"edx");
379
+ &mul ($word); # np[j]*m
380
+ &add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1]
381
+ &adc ("edx",0);
382
+ &add ($carry,"eax");
383
+ &adc ("edx",0);
384
+ &mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]=
385
+
386
+ &xor ("eax","eax");
387
+ &mov ($j,$_bp); # &bp[i]
388
+ &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
389
+ &adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1]
390
+ &lea ($j,&DWP(4,$j));
391
+ &mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]=
392
+ &cmp ($j,$_bpend);
393
+ &mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]=
394
+ &je (&label("common_tail"));
395
+
396
+ &mov ($word,&DWP(0,$j)); # bp[i+1]
397
+ &mov ($inp,$_ap);
398
+ &mov ($_bp,$j); # &bp[++i]
399
+ &xor ($j,$j);
400
+ &xor ("edx","edx");
401
+ &mov ("eax",&DWP(0,$inp));
402
+ &jmp (&label("1stmadd"));
403
+
404
+ &set_label("bn_sqr_mont",16);
405
+ $sbit=$num;
406
+ &mov ($_num,$num);
407
+ &mov ($_bp,$j); # i=0
408
+
409
+ &mov ("eax",$word); # ap[0]
410
+ &mul ($word); # ap[0]*ap[0]
411
+ &mov (&DWP($frame,"esp"),"eax"); # tp[0]=
412
+ &mov ($sbit,"edx");
413
+ &shr ("edx",1);
414
+ &and ($sbit,1);
415
+ &inc ($j);
416
+ &set_label("sqr",16);
417
+ &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j]
418
+ &mov ($carry,"edx");
419
+ &mul ($word); # ap[j]*ap[0]
420
+ &add ("eax",$carry);
421
+ &lea ($j,&DWP(1,$j));
422
+ &adc ("edx",0);
423
+ &lea ($carry,&DWP(0,$sbit,"eax",2));
424
+ &shr ("eax",31);
425
+ &cmp ($j,$_num);
426
+ &mov ($sbit,"eax");
427
+ &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
428
+ &jl (&label("sqr"));
429
+
430
+ &mov ("eax",&DWP(0,$inp,$j,4)); # ap[num-1]
431
+ &mov ($carry,"edx");
432
+ &mul ($word); # ap[num-1]*ap[0]
433
+ &add ("eax",$carry);
434
+ &mov ($word,$_n0);
435
+ &adc ("edx",0);
436
+ &mov ($inp,$_np);
437
+ &lea ($carry,&DWP(0,$sbit,"eax",2));
438
+ &imul ($word,&DWP($frame,"esp")); # n0*tp[0]
439
+ &shr ("eax",31);
440
+ &mov (&DWP($frame,"esp",$j,4),$carry); # tp[num-1]=
441
+
442
+ &lea ($carry,&DWP(0,"eax","edx",2));
443
+ &mov ("eax",&DWP(0,$inp)); # np[0]
444
+ &shr ("edx",31);
445
+ &mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num]=
446
+ &mov (&DWP($frame+8,"esp",$j,4),"edx"); # tp[num+1]=
447
+
448
+ &mul ($word); # np[0]*m
449
+ &add ("eax",&DWP($frame,"esp")); # +=tp[0]
450
+ &mov ($num,$j);
451
+ &adc ("edx",0);
452
+ &mov ("eax",&DWP(4,$inp)); # np[1]
453
+ &mov ($j,1);
454
+
455
+ &set_label("3rdmadd",16);
456
+ &mov ($carry,"edx");
457
+ &mul ($word); # np[j]*m
458
+ &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
459
+ &adc ("edx",0);
460
+ &add ($carry,"eax");
461
+ &mov ("eax",&DWP(4,$inp,$j,4)); # np[j+1]
462
+ &adc ("edx",0);
463
+ &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j-1]=
464
+
465
+ &mov ($carry,"edx");
466
+ &mul ($word); # np[j+1]*m
467
+ &add ($carry,&DWP($frame+4,"esp",$j,4)); # +=tp[j+1]
468
+ &lea ($j,&DWP(2,$j));
469
+ &adc ("edx",0);
470
+ &add ($carry,"eax");
471
+ &mov ("eax",&DWP(0,$inp,$j,4)); # np[j+2]
472
+ &adc ("edx",0);
473
+ &cmp ($j,$num);
474
+ &mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j]=
475
+ &jl (&label("3rdmadd"));
476
+
477
+ &mov ($carry,"edx");
478
+ &mul ($word); # np[j]*m
479
+ &add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1]
480
+ &adc ("edx",0);
481
+ &add ($carry,"eax");
482
+ &adc ("edx",0);
483
+ &mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]=
484
+
485
+ &mov ($j,$_bp); # i
486
+ &xor ("eax","eax");
487
+ &mov ($inp,$_ap);
488
+ &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
489
+ &adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1]
490
+ &mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]=
491
+ &cmp ($j,$num);
492
+ &mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]=
493
+ &je (&label("common_tail"));
494
+
495
+ &mov ($word,&DWP(4,$inp,$j,4)); # ap[i]
496
+ &lea ($j,&DWP(1,$j));
497
+ &mov ("eax",$word);
498
+ &mov ($_bp,$j); # ++i
499
+ &mul ($word); # ap[i]*ap[i]
500
+ &add ("eax",&DWP($frame,"esp",$j,4)); # +=tp[i]
501
+ &adc ("edx",0);
502
+ &mov (&DWP($frame,"esp",$j,4),"eax"); # tp[i]=
503
+ &xor ($carry,$carry);
504
+ &cmp ($j,$num);
505
+ &lea ($j,&DWP(1,$j));
506
+ &je (&label("sqrlast"));
507
+
508
+ &mov ($sbit,"edx"); # zaps $num
509
+ &shr ("edx",1);
510
+ &and ($sbit,1);
511
+ &set_label("sqradd",16);
512
+ &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j]
513
+ &mov ($carry,"edx");
514
+ &mul ($word); # ap[j]*ap[i]
515
+ &add ("eax",$carry);
516
+ &lea ($carry,&DWP(0,"eax","eax"));
517
+ &adc ("edx",0);
518
+ &shr ("eax",31);
519
+ &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
520
+ &lea ($j,&DWP(1,$j));
521
+ &adc ("eax",0);
522
+ &add ($carry,$sbit);
523
+ &adc ("eax",0);
524
+ &cmp ($j,$_num);
525
+ &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
526
+ &mov ($sbit,"eax");
527
+ &jle (&label("sqradd"));
528
+
529
+ &mov ($carry,"edx");
530
+ &add ("edx","edx");
531
+ &shr ($carry,31);
532
+ &add ("edx",$sbit);
533
+ &adc ($carry,0);
534
+ &set_label("sqrlast");
535
+ &mov ($word,$_n0);
536
+ &mov ($inp,$_np);
537
+ &imul ($word,&DWP($frame,"esp")); # n0*tp[0]
538
+
539
+ &add ("edx",&DWP($frame,"esp",$j,4)); # +=tp[num]
540
+ &mov ("eax",&DWP(0,$inp)); # np[0]
541
+ &adc ($carry,0);
542
+ &mov (&DWP($frame,"esp",$j,4),"edx"); # tp[num]=
543
+ &mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num+1]=
544
+
545
+ &mul ($word); # np[0]*m
546
+ &add ("eax",&DWP($frame,"esp")); # +=tp[0]
547
+ &lea ($num,&DWP(-1,$j));
548
+ &adc ("edx",0);
549
+ &mov ($j,1);
550
+ &mov ("eax",&DWP(4,$inp)); # np[1]
551
+
552
+ &jmp (&label("3rdmadd"));
553
+ }
554
+
555
+ &set_label("common_tail",16); # shared tail of the SSE2 and integer paths: rp=tp-np, then per-word select between tp and rp
556
+ &mov ($np,$_np); # load modulus pointer
557
+ &mov ($rp,$_rp); # load result pointer
558
+ &lea ($tp,&DWP($frame,"esp")); # [$ap and $bp are zapped]
559
+
560
+ &mov ("eax",&DWP(0,$tp)); # tp[0]
561
+ &mov ($j,$num); # j=num-1
562
+ &xor ($i,$i); # i=0 and clear CF!
563
+
564
+ &set_label("sub",16);
565
+ &sbb ("eax",&DWP(0,$np,$i,4)); # eax=tp[i]-np[i]-borrow
566
+ &mov (&DWP(0,$rp,$i,4),"eax"); # rp[i]=tp[i]-np[i]
567
+ &dec ($j); # doesn't affect CF!
568
+ &mov ("eax",&DWP(4,$tp,$i,4)); # tp[i+1]
569
+ &lea ($i,&DWP(1,$i)); # i++
570
+ &jge (&label("sub")); # flags still come from dec: mov/lea leave them untouched
571
+
572
+ &sbb ("eax",0); # handle upmost overflow bit; eax becomes the select mask used below
573
+
574
+ &set_label("copy",16); # copy or in-place refresh
575
+ &mov ("edx",&DWP(0,$tp,$num,4)); # edx=tp[i]
576
+ &mov ($np,&DWP(0,$rp,$num,4)); # $np is reused as scratch: rp[i]
577
+ &xor ("edx",$np); # conditional select
578
+ &and ("edx","eax"); # keep tp[i]^rp[i] only where mask bits are set
579
+ &xor ("edx",$np); # edx=tp[i] where mask is set, rp[i] where clear
580
+ &mov (&DWP(0,$tp,$num,4),$j); # zap temporary vector (terminator added: without it the next &mov parsed as a binary bitwise-and operand)
581
+ &mov (&DWP(0,$rp,$num,4),"edx"); # rp[i]=selected word
582
+ &dec ($num);
583
+ &jge (&label("copy")); # walk i from num-1 down to 0
584
+
585
+ &mov ("esp",$_sp); # pull saved stack pointer
586
+ &mov ("eax",1); # return 1; the early exit to just_leave arrives with eax=0
587
+ &set_label("just_leave");
588
+ &function_end("bn_mul_mont");
589
+
590
+ &asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
591
+
592
+ &asm_finish();