RubyGems - grpc - Versions diffs - 1.24.0 → 1.25.0.pre1 - Mend

grpc 1.24.0 → 1.25.0.pre1

Potentially problematic release.

This version of grpc might be problematic. Click here for more details.

Files changed (504) hide show

checksums.yaml +4 -4
data/Makefile +306 -243
data/etc/roots.pem +0 -100
data/include/grpc/grpc_security.h +44 -18
data/include/grpc/impl/codegen/grpc_types.h +15 -0
data/include/grpc/impl/codegen/port_platform.h +27 -11
data/include/grpc/impl/codegen/sync_generic.h +1 -1
data/src/boringssl/err_data.c +695 -650
data/src/core/ext/filters/client_channel/client_channel.cc +257 -179
data/src/core/ext/filters/client_channel/client_channel.h +24 -0
data/src/core/ext/filters/client_channel/client_channel_channelz.cc +2 -3
data/src/core/ext/filters/client_channel/client_channel_factory.h +1 -5
data/src/core/ext/filters/client_channel/health/health_check_client.cc +18 -45
data/src/core/ext/filters/client_channel/health/health_check_client.h +5 -13
data/src/core/ext/filters/client_channel/http_connect_handshaker.cc +1 -1
data/src/core/ext/filters/client_channel/lb_policy.cc +2 -3
data/src/core/ext/filters/client_channel/lb_policy.h +65 -55
data/src/core/ext/filters/client_channel/lb_policy/grpclb/client_load_reporting_filter.cc +14 -14
data/src/core/ext/filters/client_channel/lb_policy/grpclb/grpclb.cc +113 -36
data/src/core/ext/filters/client_channel/lb_policy/pick_first/pick_first.cc +14 -19
data/src/core/ext/filters/client_channel/lb_policy/round_robin/round_robin.cc +36 -13
data/src/core/ext/filters/client_channel/lb_policy/subchannel_list.h +3 -10
data/src/core/ext/filters/client_channel/lb_policy/xds/xds.cc +814 -1589
data/src/core/ext/filters/client_channel/lb_policy/xds/xds.h +2 -5
data/src/core/ext/filters/client_channel/lb_policy_factory.h +3 -6
data/src/core/ext/filters/client_channel/resolver.cc +1 -2
data/src/core/ext/filters/client_channel/resolver.h +8 -16
data/src/core/ext/filters/client_channel/resolver/dns/c_ares/dns_resolver_ares.cc +25 -8
data/src/core/ext/filters/client_channel/resolver/dns/c_ares/grpc_ares_ev_driver.cc +46 -12
data/src/core/ext/filters/client_channel/resolver/dns/c_ares/grpc_ares_ev_driver.h +10 -17
data/src/core/ext/filters/client_channel/resolver/dns/c_ares/grpc_ares_ev_driver_libuv.cc +7 -8
data/src/core/ext/filters/client_channel/resolver/dns/c_ares/grpc_ares_ev_driver_posix.cc +4 -4
data/src/core/ext/filters/client_channel/resolver/dns/c_ares/grpc_ares_ev_driver_windows.cc +111 -44
data/src/core/ext/filters/client_channel/resolver/dns/c_ares/grpc_ares_wrapper.cc +22 -14
data/src/core/ext/filters/client_channel/resolver/dns/c_ares/grpc_ares_wrapper.h +1 -1
data/src/core/ext/filters/client_channel/resolver/dns/c_ares/grpc_ares_wrapper_fallback.cc +2 -2
data/src/core/ext/filters/client_channel/resolver/dns/native/dns_resolver.cc +29 -10
data/src/core/ext/filters/client_channel/resolver/fake/fake_resolver.cc +27 -36
data/src/core/ext/filters/client_channel/resolver/sockaddr/sockaddr_resolver.cc +7 -10
data/src/core/ext/filters/client_channel/resolver/xds/xds_resolver.cc +60 -16
data/src/core/ext/filters/client_channel/resolver_factory.h +4 -8
data/src/core/ext/filters/client_channel/resolver_registry.cc +1 -1
data/src/core/ext/filters/client_channel/resolver_registry.h +1 -1
data/src/core/ext/filters/client_channel/resolver_result_parsing.cc +7 -10
data/src/core/ext/filters/client_channel/resolving_lb_policy.cc +7 -8
data/src/core/ext/filters/client_channel/resolving_lb_policy.h +1 -1
data/src/core/ext/filters/client_channel/retry_throttle.cc +5 -5
data/src/core/ext/filters/client_channel/retry_throttle.h +1 -4
data/src/core/ext/filters/client_channel/service_config.h +8 -8
data/src/core/ext/filters/client_channel/subchannel.cc +53 -86
data/src/core/ext/filters/client_channel/subchannel.h +7 -9
data/src/core/ext/filters/client_channel/subchannel_interface.h +9 -13
data/src/core/ext/filters/client_channel/subchannel_pool_interface.h +3 -6
data/src/core/ext/filters/client_channel/{lb_policy/xds/xds_load_balancer_api.cc → xds/xds_api.cc} +169 -52
data/src/core/ext/filters/client_channel/xds/xds_api.h +171 -0
data/src/core/ext/filters/client_channel/xds/xds_bootstrap.cc +450 -0
data/src/core/ext/filters/client_channel/xds/xds_bootstrap.h +99 -0
data/src/core/ext/filters/client_channel/{lb_policy/xds → xds}/xds_channel.h +8 -6
data/src/core/ext/filters/client_channel/xds/xds_channel_args.h +26 -0
data/src/core/ext/filters/client_channel/{lb_policy/xds → xds}/xds_channel_secure.cc +28 -11
data/src/core/ext/filters/client_channel/xds/xds_client.cc +1413 -0
data/src/core/ext/filters/client_channel/xds/xds_client.h +221 -0
data/src/core/ext/filters/client_channel/{lb_policy/xds → xds}/xds_client_stats.cc +1 -5
data/src/core/ext/filters/client_channel/{lb_policy/xds → xds}/xds_client_stats.h +3 -4
data/src/core/ext/filters/deadline/deadline_filter.cc +20 -20
data/src/core/ext/filters/http/client/http_client_filter.cc +15 -15
data/src/core/ext/filters/http/client_authority_filter.cc +14 -14
data/src/core/ext/filters/http/message_compress/message_compress_filter.cc +12 -12
data/src/core/ext/filters/max_age/max_age_filter.cc +59 -50
data/src/core/ext/filters/message_size/message_size_filter.cc +18 -18
data/src/core/ext/filters/workarounds/workaround_cronet_compression_filter.cc +15 -14
data/src/core/ext/transport/chttp2/transport/chttp2_transport.cc +233 -175
data/src/core/ext/transport/chttp2/transport/flow_control.h +21 -24
data/src/core/ext/transport/chttp2/transport/hpack_encoder.cc +253 -163
data/src/core/ext/transport/chttp2/transport/hpack_encoder.h +24 -12
data/src/core/ext/transport/chttp2/transport/hpack_parser.cc +2 -3
data/src/core/ext/transport/chttp2/transport/internal.h +13 -15
data/src/core/ext/transport/chttp2/transport/writing.cc +3 -0
data/src/core/ext/transport/inproc/inproc_transport.cc +20 -13
data/src/core/lib/channel/channel_args.cc +16 -0
data/src/core/lib/channel/channel_args.h +22 -0
data/src/core/lib/channel/channelz.cc +5 -6
data/src/core/lib/channel/channelz.h +1 -1
data/src/core/lib/channel/connected_channel.cc +20 -20
data/src/core/lib/channel/handshaker.h +3 -4
data/src/core/lib/channel/handshaker_factory.h +1 -3
data/src/core/lib/debug/trace.h +3 -2
data/src/core/lib/gprpp/arena.cc +3 -3
data/src/core/lib/gprpp/arena.h +2 -3
data/src/core/lib/gprpp/inlined_vector.h +9 -0
data/src/core/lib/gprpp/map.h +3 -501
data/src/core/lib/gprpp/memory.h +45 -41
data/src/core/lib/gprpp/mpscq.cc +108 -0
data/src/core/lib/gprpp/mpscq.h +98 -0
data/src/core/lib/gprpp/orphanable.h +6 -11
data/src/core/lib/gprpp/ref_counted.h +25 -19
data/src/core/lib/gprpp/set.h +33 -0
data/src/core/lib/gprpp/thd.h +2 -4
data/src/core/lib/http/httpcli.cc +1 -1
data/src/core/lib/http/httpcli_security_connector.cc +15 -11
data/src/core/lib/http/parser.cc +1 -1
data/src/core/lib/iomgr/buffer_list.cc +4 -5
data/src/core/lib/iomgr/buffer_list.h +5 -6
data/src/core/lib/iomgr/call_combiner.cc +4 -5
data/src/core/lib/iomgr/call_combiner.h +2 -2
data/src/core/lib/iomgr/cfstream_handle.h +3 -5
data/src/core/lib/iomgr/closure.h +8 -3
data/src/core/lib/iomgr/combiner.cc +45 -82
data/src/core/lib/iomgr/combiner.h +32 -8
data/src/core/lib/iomgr/endpoint_cfstream.cc +5 -3
data/src/core/lib/iomgr/ev_epoll1_linux.cc +19 -15
data/src/core/lib/iomgr/exec_ctx.h +4 -3
data/src/core/lib/iomgr/executor.cc +4 -2
data/src/core/lib/iomgr/executor.h +3 -0
data/src/core/lib/iomgr/executor/mpmcqueue.h +3 -6
data/src/core/lib/iomgr/executor/threadpool.cc +1 -2
data/src/core/lib/iomgr/executor/threadpool.h +7 -11
data/src/core/lib/iomgr/resource_quota.cc +55 -51
data/src/core/lib/iomgr/resource_quota.h +13 -9
data/src/core/lib/iomgr/socket_utils_common_posix.cc +13 -0
data/src/core/lib/iomgr/socket_utils_posix.h +4 -0
data/src/core/lib/iomgr/tcp_client_posix.cc +4 -11
data/src/core/lib/iomgr/tcp_custom.cc +9 -7
data/src/core/lib/iomgr/tcp_posix.cc +20 -16
data/src/core/lib/iomgr/tcp_server.h +1 -4
data/src/core/lib/iomgr/tcp_server_custom.cc +5 -5
data/src/core/lib/iomgr/tcp_server_posix.cc +1 -1
data/src/core/lib/iomgr/tcp_server_utils_posix_common.cc +2 -11
data/src/core/lib/iomgr/timer_custom.cc +2 -2
data/src/core/lib/iomgr/udp_server.cc +3 -2
data/src/core/lib/iomgr/udp_server.h +6 -12
data/src/core/lib/json/json.h +1 -1
data/src/core/lib/json/json_string.cc +2 -2
data/src/core/lib/profiling/basic_timers.cc +2 -2
data/src/core/lib/security/credentials/alts/alts_credentials.cc +2 -2
data/src/core/lib/security/credentials/alts/grpc_alts_credentials_server_options.cc +1 -1
data/src/core/lib/security/credentials/credentials.h +4 -20
data/src/core/lib/security/credentials/fake/fake_credentials.cc +4 -4
data/src/core/lib/security/credentials/oauth2/oauth2_credentials.h +1 -3
data/src/core/lib/security/credentials/tls/grpc_tls_credentials_options.h +64 -0
data/src/core/lib/security/security_connector/alts/alts_security_connector.cc +4 -4
data/src/core/lib/security/security_connector/fake/fake_security_connector.cc +9 -7
data/src/core/lib/security/security_connector/load_system_roots_linux.cc +2 -0
data/src/core/lib/security/security_connector/local/local_security_connector.cc +4 -4
data/src/core/lib/security/security_connector/security_connector.cc +1 -0
data/src/core/lib/security/security_connector/security_connector.h +19 -17
data/src/core/lib/security/security_connector/ssl/ssl_security_connector.cc +8 -5
data/src/core/lib/security/security_connector/ssl_utils.cc +2 -2
data/src/core/lib/security/security_connector/ssl_utils.h +1 -1
data/src/core/lib/security/security_connector/tls/spiffe_security_connector.cc +14 -6
data/src/core/lib/security/security_connector/tls/spiffe_security_connector.h +4 -2
data/src/core/lib/security/transport/client_auth_filter.cc +17 -17
data/src/core/lib/security/transport/security_handshaker.cc +29 -13
data/src/core/lib/security/transport/security_handshaker.h +4 -2
data/src/core/lib/security/transport/server_auth_filter.cc +14 -14
data/src/core/lib/slice/slice.cc +2 -10
data/src/core/lib/slice/slice_hash_table.h +4 -6
data/src/core/lib/slice/slice_intern.cc +42 -39
data/src/core/lib/slice/slice_internal.h +3 -3
data/src/core/lib/slice/slice_utils.h +21 -4
data/src/core/lib/slice/slice_weak_hash_table.h +4 -6
data/src/core/lib/surface/call.cc +3 -3
data/src/core/lib/surface/channel.cc +7 -0
data/src/core/lib/surface/completion_queue.cc +12 -11
data/src/core/lib/surface/completion_queue.h +4 -2
data/src/core/lib/surface/init.cc +1 -0
data/src/core/lib/surface/lame_client.cc +33 -18
data/src/core/lib/surface/server.cc +77 -76
data/src/core/lib/surface/version.cc +1 -1
data/src/core/lib/transport/byte_stream.h +3 -7
data/src/core/lib/transport/connectivity_state.cc +112 -98
data/src/core/lib/transport/connectivity_state.h +100 -50
data/src/core/lib/transport/static_metadata.cc +276 -288
data/src/core/lib/transport/static_metadata.h +73 -76
data/src/core/lib/transport/status_conversion.cc +1 -1
data/src/core/lib/transport/status_metadata.cc +1 -1
data/src/core/lib/transport/transport.cc +2 -2
data/src/core/lib/transport/transport.h +12 -4
data/src/core/lib/transport/transport_op_string.cc +14 -11
data/src/core/tsi/alts/frame_protector/alts_unseal_privacy_integrity_crypter.cc +1 -1
data/src/core/tsi/alts/handshaker/alts_shared_resource.cc +1 -1
data/src/core/tsi/alts/handshaker/alts_tsi_handshaker.cc +5 -5
data/src/core/tsi/alts/zero_copy_frame_protector/alts_zero_copy_grpc_protector.cc +12 -2
data/src/core/tsi/fake_transport_security.cc +7 -5
data/src/core/tsi/grpc_shadow_boringssl.h +2918 -2627
data/src/core/tsi/local_transport_security.cc +8 -6
data/src/core/tsi/ssl/session_cache/ssl_session.h +1 -3
data/src/core/tsi/ssl/session_cache/ssl_session_boringssl.cc +1 -2
data/src/core/tsi/ssl/session_cache/ssl_session_cache.cc +7 -5
data/src/core/tsi/ssl/session_cache/ssl_session_cache.h +4 -6
data/src/core/tsi/ssl/session_cache/ssl_session_openssl.cc +1 -2
data/src/core/tsi/ssl_transport_security.cc +12 -12
data/src/core/tsi/ssl_transport_security.h +2 -2
data/src/core/tsi/transport_security_grpc.cc +7 -0
data/src/core/tsi/transport_security_grpc.h +6 -0
data/src/ruby/ext/grpc/extconf.rb +1 -0
data/src/ruby/ext/grpc/rb_call.c +1 -1
data/src/ruby/ext/grpc/rb_channel.c +1 -1
data/src/ruby/lib/grpc/generic/bidi_call.rb +1 -1
data/src/ruby/lib/grpc/generic/rpc_server.rb +1 -1
data/src/ruby/lib/grpc/version.rb +1 -1
data/src/ruby/spec/google_rpc_status_utils_spec.rb +2 -2
data/third_party/boringssl/crypto/asn1/a_bool.c +18 -5
data/third_party/boringssl/crypto/asn1/a_d2i_fp.c +17 -221
data/third_party/boringssl/crypto/asn1/a_dup.c +0 -24
data/third_party/boringssl/crypto/asn1/a_enum.c +2 -2
data/third_party/boringssl/crypto/asn1/a_i2d_fp.c +10 -72
data/third_party/boringssl/crypto/asn1/a_int.c +12 -71
data/third_party/boringssl/crypto/asn1/a_mbstr.c +110 -216
data/third_party/boringssl/crypto/asn1/a_object.c +16 -5
data/third_party/boringssl/crypto/asn1/a_strnid.c +1 -0
data/third_party/boringssl/crypto/asn1/asn1_lib.c +5 -1
data/third_party/boringssl/crypto/asn1/tasn_enc.c +3 -1
data/third_party/boringssl/crypto/base64/base64.c +2 -2
data/third_party/boringssl/crypto/bio/bio.c +73 -9
data/third_party/boringssl/crypto/bio/connect.c +4 -0
data/third_party/boringssl/crypto/bio/fd.c +4 -0
data/third_party/boringssl/crypto/bio/file.c +5 -2
data/third_party/boringssl/crypto/bio/socket.c +4 -0
data/third_party/boringssl/crypto/bio/socket_helper.c +4 -0
data/third_party/boringssl/crypto/bn_extra/convert.c +11 -7
data/third_party/boringssl/crypto/bytestring/ber.c +8 -4
data/third_party/boringssl/crypto/bytestring/cbb.c +19 -7
data/third_party/boringssl/crypto/bytestring/cbs.c +28 -15
data/third_party/boringssl/crypto/bytestring/internal.h +28 -7
data/third_party/boringssl/crypto/bytestring/unicode.c +155 -0
data/third_party/boringssl/crypto/chacha/chacha.c +36 -19
data/third_party/boringssl/crypto/chacha/internal.h +45 -0
data/third_party/boringssl/crypto/cipher_extra/cipher_extra.c +29 -0
data/third_party/boringssl/crypto/cipher_extra/e_aesccm.c +269 -25
data/third_party/boringssl/crypto/cipher_extra/e_aesctrhmac.c +16 -14
data/third_party/boringssl/crypto/cipher_extra/e_aesgcmsiv.c +54 -38
data/third_party/boringssl/crypto/cipher_extra/e_chacha20poly1305.c +133 -41
data/third_party/boringssl/crypto/cipher_extra/e_tls.c +23 -15
data/third_party/boringssl/crypto/cipher_extra/tls_cbc.c +24 -15
data/third_party/boringssl/crypto/cmac/cmac.c +62 -25
data/third_party/boringssl/crypto/conf/conf.c +7 -0
data/third_party/boringssl/crypto/cpu-arm-linux.c +4 -148
data/third_party/boringssl/crypto/cpu-arm-linux.h +201 -0
data/third_party/boringssl/crypto/cpu-intel.c +45 -51
data/third_party/boringssl/crypto/crypto.c +39 -22
data/third_party/boringssl/crypto/curve25519/spake25519.c +1 -1
data/third_party/boringssl/crypto/dsa/dsa.c +77 -53
data/third_party/boringssl/crypto/ec_extra/ec_asn1.c +20 -8
data/third_party/boringssl/crypto/ec_extra/ec_derive.c +96 -0
data/third_party/boringssl/crypto/{ecdh/ecdh.c → ecdh_extra/ecdh_extra.c} +20 -58
data/third_party/boringssl/crypto/ecdsa_extra/ecdsa_asn1.c +1 -9
data/third_party/boringssl/crypto/engine/engine.c +2 -1
data/third_party/boringssl/crypto/err/err.c +2 -0
data/third_party/boringssl/crypto/err/internal.h +2 -2
data/third_party/boringssl/crypto/evp/evp.c +89 -8
data/third_party/boringssl/crypto/evp/evp_asn1.c +56 -5
data/third_party/boringssl/crypto/evp/evp_ctx.c +52 -14
data/third_party/boringssl/crypto/evp/internal.h +18 -1
data/third_party/boringssl/crypto/evp/p_dsa_asn1.c +5 -0
data/third_party/boringssl/crypto/evp/p_ec.c +51 -3
data/third_party/boringssl/crypto/evp/p_ec_asn1.c +6 -7
data/third_party/boringssl/crypto/evp/p_ed25519.c +36 -3
data/third_party/boringssl/crypto/evp/p_ed25519_asn1.c +76 -45
data/third_party/boringssl/crypto/evp/p_rsa.c +3 -1
data/third_party/boringssl/crypto/evp/p_rsa_asn1.c +5 -0
data/third_party/boringssl/crypto/evp/p_x25519.c +110 -0
data/third_party/boringssl/crypto/evp/p_x25519_asn1.c +249 -0
data/third_party/boringssl/crypto/evp/scrypt.c +6 -2
data/third_party/boringssl/crypto/fipsmodule/aes/aes.c +34 -274
data/third_party/boringssl/crypto/fipsmodule/aes/internal.h +161 -21
data/third_party/boringssl/crypto/fipsmodule/aes/key_wrap.c +111 -13
data/third_party/boringssl/crypto/fipsmodule/aes/mode_wrappers.c +17 -21
data/third_party/boringssl/crypto/fipsmodule/bcm.c +119 -7
data/third_party/boringssl/crypto/fipsmodule/bn/bn.c +19 -2
data/third_party/boringssl/crypto/fipsmodule/bn/cmp.c +2 -2
data/third_party/boringssl/crypto/fipsmodule/bn/ctx.c +93 -160
data/third_party/boringssl/crypto/fipsmodule/bn/div.c +48 -57
data/third_party/boringssl/crypto/fipsmodule/bn/div_extra.c +87 -0
data/third_party/boringssl/crypto/fipsmodule/bn/exponentiation.c +143 -211
data/third_party/boringssl/crypto/fipsmodule/bn/gcd.c +0 -305
data/third_party/boringssl/crypto/fipsmodule/bn/gcd_extra.c +325 -0
data/third_party/boringssl/crypto/fipsmodule/bn/internal.h +168 -50
data/third_party/boringssl/crypto/fipsmodule/bn/montgomery.c +68 -92
data/third_party/boringssl/crypto/fipsmodule/bn/montgomery_inv.c +7 -6
data/third_party/boringssl/crypto/fipsmodule/bn/mul.c +11 -14
data/third_party/boringssl/crypto/fipsmodule/bn/prime.c +358 -443
data/third_party/boringssl/crypto/fipsmodule/bn/random.c +25 -35
data/third_party/boringssl/crypto/fipsmodule/bn/rsaz_exp.c +20 -25
data/third_party/boringssl/crypto/fipsmodule/bn/rsaz_exp.h +76 -5
data/third_party/boringssl/crypto/fipsmodule/bn/shift.c +14 -14
data/third_party/boringssl/crypto/fipsmodule/cipher/cipher.c +7 -2
data/third_party/boringssl/crypto/fipsmodule/cipher/e_aes.c +383 -516
data/third_party/boringssl/crypto/fipsmodule/cipher/e_des.c +4 -0
data/third_party/boringssl/crypto/fipsmodule/cipher/internal.h +3 -4
data/third_party/boringssl/crypto/fipsmodule/delocate.h +3 -2
data/third_party/boringssl/crypto/fipsmodule/digest/digest.c +32 -17
data/third_party/boringssl/crypto/fipsmodule/digest/md32_common.h +3 -3
data/third_party/boringssl/crypto/fipsmodule/ec/ec.c +228 -122
data/third_party/boringssl/crypto/fipsmodule/ec/ec_key.c +34 -8
data/third_party/boringssl/crypto/fipsmodule/ec/ec_montgomery.c +311 -98
data/third_party/boringssl/crypto/fipsmodule/ec/felem.c +82 -0
data/third_party/boringssl/crypto/fipsmodule/ec/internal.h +263 -97
data/third_party/boringssl/crypto/fipsmodule/ec/oct.c +22 -59
data/third_party/boringssl/crypto/fipsmodule/ec/p224-64.c +317 -234
data/third_party/boringssl/crypto/fipsmodule/ec/p256-x86_64-table.h +9473 -9475
data/third_party/boringssl/crypto/fipsmodule/ec/p256-x86_64.c +313 -109
data/third_party/boringssl/crypto/fipsmodule/ec/p256-x86_64.h +36 -0
data/third_party/boringssl/crypto/fipsmodule/ec/scalar.c +96 -0
data/third_party/boringssl/crypto/fipsmodule/ec/simple.c +126 -792
data/third_party/boringssl/crypto/fipsmodule/ec/simple_mul.c +84 -0
data/third_party/boringssl/crypto/fipsmodule/ec/util.c +163 -12
data/third_party/boringssl/crypto/fipsmodule/ec/wnaf.c +84 -211
data/third_party/boringssl/crypto/fipsmodule/ecdh/ecdh.c +122 -0
data/third_party/boringssl/crypto/fipsmodule/ecdsa/ecdsa.c +60 -205
data/third_party/boringssl/crypto/fipsmodule/fips_shared_support.c +32 -0
data/third_party/boringssl/crypto/fipsmodule/is_fips.c +2 -0
data/third_party/boringssl/crypto/fipsmodule/md4/md4.c +3 -1
data/third_party/boringssl/crypto/fipsmodule/md5/internal.h +37 -0
data/third_party/boringssl/crypto/fipsmodule/md5/md5.c +11 -8
data/third_party/boringssl/crypto/fipsmodule/modes/cbc.c +35 -79
data/third_party/boringssl/crypto/fipsmodule/modes/cfb.c +7 -39
data/third_party/boringssl/crypto/fipsmodule/modes/ctr.c +7 -27
data/third_party/boringssl/crypto/fipsmodule/modes/gcm.c +123 -309
data/third_party/boringssl/crypto/fipsmodule/modes/internal.h +189 -126
data/third_party/boringssl/crypto/fipsmodule/modes/ofb.c +3 -2
data/third_party/boringssl/crypto/fipsmodule/rand/ctrdrbg.c +2 -2
data/third_party/boringssl/crypto/fipsmodule/rand/internal.h +35 -0
data/third_party/boringssl/crypto/fipsmodule/rand/rand.c +24 -19
data/third_party/boringssl/crypto/fipsmodule/rand/urandom.c +256 -77
data/third_party/boringssl/crypto/fipsmodule/rsa/padding.c +10 -7
data/third_party/boringssl/crypto/fipsmodule/rsa/rsa.c +5 -1
data/third_party/boringssl/crypto/fipsmodule/rsa/rsa_impl.c +131 -14
data/third_party/boringssl/crypto/fipsmodule/self_check/self_check.c +83 -10
data/third_party/boringssl/crypto/fipsmodule/sha/internal.h +53 -0
data/third_party/boringssl/crypto/fipsmodule/sha/sha1.c +9 -13
data/third_party/boringssl/crypto/fipsmodule/sha/sha256.c +18 -12
data/third_party/boringssl/crypto/fipsmodule/sha/sha512.c +95 -168
data/third_party/boringssl/crypto/hrss/hrss.c +2201 -0
data/third_party/boringssl/crypto/hrss/internal.h +62 -0
data/third_party/boringssl/crypto/internal.h +95 -20
data/third_party/boringssl/crypto/lhash/lhash.c +45 -33
data/third_party/boringssl/crypto/mem.c +39 -2
data/third_party/boringssl/crypto/obj/obj.c +4 -4
data/third_party/boringssl/crypto/obj/obj_dat.h +6181 -875
data/third_party/boringssl/crypto/pem/pem_all.c +2 -3
data/third_party/boringssl/crypto/pem/pem_info.c +144 -162
data/third_party/boringssl/crypto/pem/pem_lib.c +53 -52
data/third_party/boringssl/crypto/pem/pem_pkey.c +13 -21
data/third_party/boringssl/crypto/pkcs7/pkcs7.c +15 -22
data/third_party/boringssl/crypto/pkcs7/pkcs7_x509.c +168 -16
data/third_party/boringssl/crypto/pkcs8/internal.h +11 -0
data/third_party/boringssl/crypto/pkcs8/p5_pbev2.c +24 -15
data/third_party/boringssl/crypto/pkcs8/pkcs8.c +42 -25
data/third_party/boringssl/crypto/pkcs8/pkcs8_x509.c +559 -43
data/third_party/boringssl/crypto/pool/internal.h +1 -1
data/third_party/boringssl/crypto/pool/pool.c +21 -0
data/third_party/boringssl/crypto/rand_extra/deterministic.c +8 -0
data/third_party/boringssl/crypto/rand_extra/fuchsia.c +1 -14
data/third_party/boringssl/crypto/refcount_lock.c +2 -2
data/third_party/boringssl/crypto/rsa_extra/rsa_print.c +22 -0
data/third_party/boringssl/crypto/siphash/siphash.c +80 -0
data/third_party/boringssl/crypto/stack/stack.c +83 -32
data/third_party/boringssl/crypto/thread_none.c +2 -2
data/third_party/boringssl/crypto/thread_pthread.c +2 -2
data/third_party/boringssl/crypto/thread_win.c +38 -19
data/third_party/boringssl/crypto/x509/a_strex.c +22 -2
data/third_party/boringssl/crypto/x509/asn1_gen.c +2 -1
data/third_party/boringssl/crypto/x509/by_dir.c +7 -0
data/third_party/boringssl/crypto/x509/by_file.c +12 -10
data/third_party/boringssl/crypto/x509/t_crl.c +5 -8
data/third_party/boringssl/crypto/x509/t_req.c +1 -3
data/third_party/boringssl/crypto/x509/t_x509.c +5 -8
data/third_party/boringssl/crypto/x509/x509_cmp.c +1 -1
data/third_party/boringssl/crypto/x509/x509_def.c +1 -1
data/third_party/boringssl/crypto/x509/x509_lu.c +114 -5
data/third_party/boringssl/crypto/x509/x509_req.c +20 -0
data/third_party/boringssl/crypto/x509/x509_set.c +5 -0
data/third_party/boringssl/crypto/x509/x509_trs.c +1 -0
data/third_party/boringssl/crypto/x509/x509_txt.c +4 -5
data/third_party/boringssl/crypto/x509/x509_vfy.c +145 -138
data/third_party/boringssl/crypto/x509/x509_vpm.c +2 -0
data/third_party/boringssl/crypto/x509/x509cset.c +40 -0
data/third_party/boringssl/crypto/x509/x509name.c +2 -3
data/third_party/boringssl/crypto/x509/x_all.c +109 -210
data/third_party/boringssl/crypto/x509/x_x509.c +6 -0
data/third_party/boringssl/crypto/x509v3/ext_dat.h +1 -3
data/third_party/boringssl/crypto/x509v3/internal.h +56 -0
data/third_party/boringssl/crypto/x509v3/pcy_cache.c +2 -0
data/third_party/boringssl/crypto/x509v3/pcy_node.c +1 -0
data/third_party/boringssl/crypto/x509v3/pcy_tree.c +4 -2
data/third_party/boringssl/crypto/x509v3/v3_akey.c +5 -2
data/third_party/boringssl/crypto/x509v3/v3_alt.c +19 -13
data/third_party/boringssl/crypto/x509v3/v3_conf.c +2 -1
data/third_party/boringssl/crypto/x509v3/v3_cpols.c +3 -2
data/third_party/boringssl/crypto/x509v3/v3_genn.c +1 -6
data/third_party/boringssl/crypto/x509v3/v3_lib.c +1 -0
data/third_party/boringssl/crypto/x509v3/v3_ocsp.c +68 -0
data/third_party/boringssl/crypto/x509v3/v3_pci.c +2 -1
data/third_party/boringssl/crypto/x509v3/v3_purp.c +47 -69
data/third_party/boringssl/crypto/x509v3/v3_skey.c +5 -2
data/third_party/boringssl/crypto/x509v3/v3_utl.c +69 -25
data/third_party/boringssl/include/openssl/aead.h +45 -19
data/third_party/boringssl/include/openssl/aes.h +32 -7
data/third_party/boringssl/include/openssl/asn1.h +7 -77
data/third_party/boringssl/include/openssl/base.h +120 -6
data/third_party/boringssl/include/openssl/base64.h +4 -1
data/third_party/boringssl/include/openssl/bio.h +112 -81
data/third_party/boringssl/include/openssl/blowfish.h +3 -3
data/third_party/boringssl/include/openssl/bn.h +55 -29
data/third_party/boringssl/include/openssl/buf.h +2 -2
data/third_party/boringssl/include/openssl/bytestring.h +54 -32
data/third_party/boringssl/include/openssl/cast.h +2 -2
data/third_party/boringssl/include/openssl/cipher.h +46 -16
data/third_party/boringssl/include/openssl/cmac.h +6 -2
data/third_party/boringssl/include/openssl/conf.h +3 -6
data/third_party/boringssl/include/openssl/cpu.h +25 -9
data/third_party/boringssl/include/openssl/crypto.h +32 -10
data/third_party/boringssl/include/openssl/curve25519.h +4 -4
data/third_party/boringssl/include/openssl/dh.h +3 -2
data/third_party/boringssl/include/openssl/digest.h +21 -7
data/third_party/boringssl/include/openssl/dsa.h +8 -2
data/third_party/boringssl/include/openssl/e_os2.h +18 -0
data/third_party/boringssl/include/openssl/ec.h +25 -21
data/third_party/boringssl/include/openssl/ec_key.h +36 -8
data/third_party/boringssl/include/openssl/ecdh.h +17 -0
data/third_party/boringssl/include/openssl/ecdsa.h +3 -3
data/third_party/boringssl/include/openssl/engine.h +4 -4
data/third_party/boringssl/include/openssl/err.h +3 -0
data/third_party/boringssl/include/openssl/evp.h +199 -42
data/third_party/boringssl/include/openssl/hmac.h +4 -4
data/third_party/boringssl/include/openssl/hrss.h +100 -0
data/third_party/boringssl/include/openssl/lhash.h +131 -23
data/third_party/boringssl/include/openssl/md4.h +6 -4
data/third_party/boringssl/include/openssl/md5.h +6 -4
data/third_party/boringssl/include/openssl/mem.h +6 -2
data/third_party/boringssl/include/openssl/nid.h +3 -0
data/third_party/boringssl/include/openssl/obj.h +3 -0
data/third_party/boringssl/include/openssl/pem.h +102 -64
data/third_party/boringssl/include/openssl/pkcs7.h +136 -3
data/third_party/boringssl/include/openssl/pkcs8.h +42 -3
data/third_party/boringssl/include/openssl/pool.h +13 -2
data/third_party/boringssl/include/openssl/ripemd.h +5 -4
data/third_party/boringssl/include/openssl/rsa.h +46 -15
data/third_party/boringssl/include/openssl/sha.h +40 -28
data/third_party/boringssl/include/openssl/siphash.h +37 -0
data/third_party/boringssl/include/openssl/span.h +17 -9
data/third_party/boringssl/include/openssl/ssl.h +766 -393
data/third_party/boringssl/include/openssl/ssl3.h +4 -3
data/third_party/boringssl/include/openssl/stack.h +134 -77
data/third_party/boringssl/include/openssl/thread.h +1 -1
data/third_party/boringssl/include/openssl/tls1.h +25 -9
data/third_party/boringssl/include/openssl/type_check.h +14 -15
data/third_party/boringssl/include/openssl/x509.h +28 -3
data/third_party/boringssl/include/openssl/x509_vfy.h +98 -32
data/third_party/boringssl/include/openssl/x509v3.h +17 -13
data/third_party/boringssl/ssl/d1_both.cc +9 -18
data/third_party/boringssl/ssl/d1_lib.cc +4 -3
data/third_party/boringssl/ssl/d1_pkt.cc +4 -4
data/third_party/boringssl/ssl/d1_srtp.cc +15 -15
data/third_party/boringssl/ssl/dtls_method.cc +0 -1
data/third_party/boringssl/ssl/dtls_record.cc +28 -28
data/third_party/boringssl/ssl/handoff.cc +295 -91
data/third_party/boringssl/ssl/handshake.cc +133 -72
data/third_party/boringssl/ssl/handshake_client.cc +218 -189
data/third_party/boringssl/ssl/handshake_server.cc +399 -272
data/third_party/boringssl/ssl/internal.h +1413 -928
data/third_party/boringssl/ssl/s3_both.cc +175 -36
data/third_party/boringssl/ssl/s3_lib.cc +9 -13
data/third_party/boringssl/ssl/s3_pkt.cc +63 -29
data/third_party/boringssl/ssl/ssl_aead_ctx.cc +55 -35
data/third_party/boringssl/ssl/ssl_asn1.cc +57 -73
data/third_party/boringssl/ssl/ssl_buffer.cc +13 -12
data/third_party/boringssl/ssl/ssl_cert.cc +313 -210
data/third_party/boringssl/ssl/ssl_cipher.cc +159 -221
data/third_party/boringssl/ssl/ssl_file.cc +2 -0
data/third_party/boringssl/ssl/ssl_key_share.cc +164 -19
data/third_party/boringssl/ssl/ssl_lib.cc +847 -555
data/third_party/boringssl/ssl/ssl_privkey.cc +441 -111
data/third_party/boringssl/ssl/ssl_session.cc +230 -178
data/third_party/boringssl/ssl/ssl_transcript.cc +21 -142
data/third_party/boringssl/ssl/ssl_versions.cc +88 -93
data/third_party/boringssl/ssl/ssl_x509.cc +279 -218
data/third_party/boringssl/ssl/t1_enc.cc +5 -96
data/third_party/boringssl/ssl/t1_lib.cc +931 -678
data/third_party/boringssl/ssl/tls13_both.cc +251 -121
data/third_party/boringssl/ssl/tls13_client.cc +129 -73
data/third_party/boringssl/ssl/tls13_enc.cc +350 -282
data/third_party/boringssl/ssl/tls13_server.cc +259 -192
data/third_party/boringssl/ssl/tls_method.cc +26 -21
data/third_party/boringssl/ssl/tls_record.cc +42 -47
data/third_party/boringssl/third_party/fiat/curve25519.c +261 -1324
data/third_party/boringssl/third_party/fiat/curve25519_32.h +911 -0
data/third_party/boringssl/third_party/fiat/curve25519_64.h +559 -0
data/third_party/boringssl/third_party/fiat/p256.c +238 -999
data/third_party/boringssl/third_party/fiat/p256_32.h +3226 -0
data/third_party/boringssl/third_party/fiat/p256_64.h +1217 -0
data/third_party/upb/upb/port_def.inc +1 -1
data/third_party/upb/upb/table.c +2 -1
metadata +71 -43
data/src/core/ext/filters/client_channel/lb_policy/xds/xds_load_balancer_api.h +0 -127
data/src/core/lib/gpr/mpscq.cc +0 -117
data/src/core/lib/gpr/mpscq.h +0 -88
data/src/core/lib/gprpp/abstract.h +0 -47
data/src/core/lib/gprpp/pair.h +0 -38
data/third_party/boringssl/crypto/cipher_extra/e_ssl3.c +0 -460
data/third_party/boringssl/crypto/fipsmodule/modes/ccm.c +0 -256
data/third_party/boringssl/include/openssl/lhash_macros.h +0 -174
data/third_party/boringssl/ssl/custom_extensions.cc +0 -265

data/third_party/boringssl/crypto/hrss/hrss.c ADDED

@@ -0,0 +1,2201 @@
+/* Copyright (c) 2018, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+#include <openssl/hrss.h>
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <openssl/bn.h>
+#include <openssl/cpu.h>
+#include <openssl/hmac.h>
+#include <openssl/mem.h>
+#include <openssl/sha.h>
+#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64)
+#include <emmintrin.h>
+#endif
+#if (defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)) && \
+    (defined(__ARM_NEON__) || defined(__ARM_NEON))
+#include <arm_neon.h>
+#endif
+#if defined(_MSC_VER)
+#define RESTRICT
+#else
+#define RESTRICT restrict
+#endif
+#include "../internal.h"
+#include "internal.h"
+// This is an implementation of [HRSS], but with a KEM transformation based on
+// [SXY]. The primary references are:
+// HRSS: https://eprint.iacr.org/2017/667.pdf
+// HRSSNIST:
+// https://csrc.nist.gov/CSRC/media/Projects/Post-Quantum-Cryptography/documents/round-1/submissions/NTRU_HRSS_KEM.zip
+// SXY: https://eprint.iacr.org/2017/1005.pdf
+// NTRUTN14:
+// https://assets.onboardsecurity.com/static/downloads/NTRU/resources/NTRUTech014.pdf
+// NTRUCOMP:
+// https://eprint.iacr.org/2018/1174
+// Vector operations.
+//
+// A couple of functions in this file can use vector operations to meaningful
+// effect. If we're building for a target that has a supported vector unit,
+// |HRSS_HAVE_VECTOR_UNIT| will be defined and |vec_t| will be typedefed to a
+// 128-bit vector. The following functions abstract over the differences between
+// NEON and SSE2 for implementing some vector operations.
+// TODO: MSVC can likely also be made to work with vector operations.
+#if ((defined(__SSE__) && defined(OPENSSL_X86)) || defined(OPENSSL_X86_64)) && \
+    (defined(__clang__) || !defined(_MSC_VER))
+#define HRSS_HAVE_VECTOR_UNIT
+typedef __m128i vec_t;
+// vec_capable returns one iff the current platform supports SSE2.
+static int vec_capable(void) {
+#if defined(__SSE2__)
+  return 1;
+#else
+  int has_sse2 = (OPENSSL_ia32cap_P[0] & (1 << 26)) != 0;
+  return has_sse2;
+#endif
+}
+// vec_add performs a pair-wise addition of four uint16s from |a| and |b|.
+static inline vec_t vec_add(vec_t a, vec_t b) { return _mm_add_epi16(a, b); }
+// vec_sub performs a pair-wise subtraction of four uint16s from |a| and |b|.
+static inline vec_t vec_sub(vec_t a, vec_t b) { return _mm_sub_epi16(a, b); }
+// vec_mul multiplies each uint16_t in |a| by |b| and returns the resulting
+// vector.
+static inline vec_t vec_mul(vec_t a, uint16_t b) {
+  return _mm_mullo_epi16(a, _mm_set1_epi16(b));
+}
+// vec_fma multiplies each uint16_t in |b| by |c|, adds the result to |a|, and
+// returns the resulting vector.
+static inline vec_t vec_fma(vec_t a, vec_t b, uint16_t c) {
+  return _mm_add_epi16(a, _mm_mullo_epi16(b, _mm_set1_epi16(c)));
+}
+// vec3_rshift_word right-shifts the 24 uint16_t's in |v| by one uint16.
+static inline void vec3_rshift_word(vec_t v[3]) {
+  // Intel's left and right shifting is backwards compared to the order in
+  // memory because they're based on little-endian order of words (and not just
+  // bytes). So the shifts in this function will be backwards from what one
+  // might expect.
+  const __m128i carry0 = _mm_srli_si128(v[0], 14);
+  v[0] = _mm_slli_si128(v[0], 2);
+  const __m128i carry1 = _mm_srli_si128(v[1], 14);
+  v[1] = _mm_slli_si128(v[1], 2);
+  v[1] |= carry0;
+  v[2] = _mm_slli_si128(v[2], 2);
+  v[2] |= carry1;
+}
+// vec4_rshift_word right-shifts the 32 uint16_t's in |v| by one uint16.
+static inline void vec4_rshift_word(vec_t v[4]) {
+  // Intel's left and right shifting is backwards compared to the order in
+  // memory because they're based on little-endian order of words (and not just
+  // bytes). So the shifts in this function will be backwards from what one
+  // might expect.
+  const __m128i carry0 = _mm_srli_si128(v[0], 14);
+  v[0] = _mm_slli_si128(v[0], 2);
+  const __m128i carry1 = _mm_srli_si128(v[1], 14);
+  v[1] = _mm_slli_si128(v[1], 2);
+  v[1] |= carry0;
+  const __m128i carry2 = _mm_srli_si128(v[2], 14);
+  v[2] = _mm_slli_si128(v[2], 2);
+  v[2] |= carry1;
+  v[3] = _mm_slli_si128(v[3], 2);
+  v[3] |= carry2;
+}
+// vec_merge_3_5 takes the final three uint16_t's from |left|, appends the first
+// five from |right|, and returns the resulting vector.
+static inline vec_t vec_merge_3_5(vec_t left, vec_t right) {
+  return _mm_srli_si128(left, 10) | _mm_slli_si128(right, 6);
+}
+// poly3_vec_lshift1 left-shifts the 768 bits in |a_s|, and in |a_a|, by one
+// bit.
+static inline void poly3_vec_lshift1(vec_t a_s[6], vec_t a_a[6]) {
+  vec_t carry_s = {0};
+  vec_t carry_a = {0};
+  for (int i = 0; i < 6; i++) {
+    vec_t next_carry_s = _mm_srli_epi64(a_s[i], 63);
+    a_s[i] = _mm_slli_epi64(a_s[i], 1);
+    a_s[i] |= _mm_slli_si128(next_carry_s, 8);
+    a_s[i] |= carry_s;
+    carry_s = _mm_srli_si128(next_carry_s, 8);
+    vec_t next_carry_a = _mm_srli_epi64(a_a[i], 63);
+    a_a[i] = _mm_slli_epi64(a_a[i], 1);
+    a_a[i] |= _mm_slli_si128(next_carry_a, 8);
+    a_a[i] |= carry_a;
+    carry_a = _mm_srli_si128(next_carry_a, 8);
+  }
+}
+// poly3_vec_rshift1 right-shifts the 768 bits in |a_s|, and in |a_a|, by one
+// bit.
+static inline void poly3_vec_rshift1(vec_t a_s[6], vec_t a_a[6]) {
+  vec_t carry_s = {0};
+  vec_t carry_a = {0};
+  for (int i = 5; i >= 0; i--) {
+    const vec_t next_carry_s = _mm_slli_epi64(a_s[i], 63);
+    a_s[i] = _mm_srli_epi64(a_s[i], 1);
+    a_s[i] |= _mm_srli_si128(next_carry_s, 8);
+    a_s[i] |= carry_s;
+    carry_s = _mm_slli_si128(next_carry_s, 8);
+    const vec_t next_carry_a = _mm_slli_epi64(a_a[i], 63);
+    a_a[i] = _mm_srli_epi64(a_a[i], 1);
+    a_a[i] |= _mm_srli_si128(next_carry_a, 8);
+    a_a[i] |= carry_a;
+    carry_a = _mm_slli_si128(next_carry_a, 8);
+  }
+}
+// vec_broadcast_bit duplicates the least-significant bit in |a| to all bits in
+// a vector and returns the result.
+static inline vec_t vec_broadcast_bit(vec_t a) {
+  return _mm_shuffle_epi32(_mm_srai_epi32(_mm_slli_epi64(a, 63), 31),
+                           0b01010101);
+}
+// vec_broadcast_bit15 duplicates the most-significant bit of the first word in
+// |a| to all bits in a vector and returns the result.
+static inline vec_t vec_broadcast_bit15(vec_t a) {
+  return _mm_shuffle_epi32(_mm_srai_epi32(_mm_slli_epi64(a, 63 - 15), 31),
+                           0b01010101);
+}
+// vec_get_word returns the |i|th uint16_t in |v|. (This is a macro because the
+// compiler requires that |i| be a compile-time constant.)
+#define vec_get_word(v, i) _mm_extract_epi16(v, i)
+#elif (defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)) && \
+    (defined(__ARM_NEON__) || defined(__ARM_NEON))
+#define HRSS_HAVE_VECTOR_UNIT
+typedef uint16x8_t vec_t;
+// These functions perform the same actions as the SSE2 function of the same
+// name, above.
+static int vec_capable(void) { return CRYPTO_is_NEON_capable(); }
+static inline vec_t vec_add(vec_t a, vec_t b) { return a + b; }
+static inline vec_t vec_sub(vec_t a, vec_t b) { return a - b; }
+static inline vec_t vec_mul(vec_t a, uint16_t b) { return vmulq_n_u16(a, b); }
+static inline vec_t vec_fma(vec_t a, vec_t b, uint16_t c) {
+  return vmlaq_n_u16(a, b, c);
+}
+static inline void vec3_rshift_word(vec_t v[3]) {
+  const uint16x8_t kZero = {0};
+  v[2] = vextq_u16(v[1], v[2], 7);
+  v[1] = vextq_u16(v[0], v[1], 7);
+  v[0] = vextq_u16(kZero, v[0], 7);
+}
+static inline void vec4_rshift_word(vec_t v[4]) {
+  const uint16x8_t kZero = {0};
+  v[3] = vextq_u16(v[2], v[3], 7);
+  v[2] = vextq_u16(v[1], v[2], 7);
+  v[1] = vextq_u16(v[0], v[1], 7);
+  v[0] = vextq_u16(kZero, v[0], 7);
+}
+static inline vec_t vec_merge_3_5(vec_t left, vec_t right) {
+  return vextq_u16(left, right, 5);
+}
+static inline uint16_t vec_get_word(vec_t v, unsigned i) {
+  return v[i];
+}
+#if !defined(OPENSSL_AARCH64)
+static inline vec_t vec_broadcast_bit(vec_t a) {
+  a = (vec_t)vshrq_n_s16(((int16x8_t)a) << 15, 15);
+  return vdupq_lane_u16(vget_low_u16(a), 0);
+}
+static inline vec_t vec_broadcast_bit15(vec_t a) {
+  a = (vec_t)vshrq_n_s16((int16x8_t)a, 15);
+  return vdupq_lane_u16(vget_low_u16(a), 0);
+}
+static inline void poly3_vec_lshift1(vec_t a_s[6], vec_t a_a[6]) {
+  vec_t carry_s = {0};
+  vec_t carry_a = {0};
+  const vec_t kZero = {0};
+  for (int i = 0; i < 6; i++) {
+    vec_t next_carry_s = a_s[i] >> 15;
+    a_s[i] <<= 1;
+    a_s[i] |= vextq_u16(kZero, next_carry_s, 7);
+    a_s[i] |= carry_s;
+    carry_s = vextq_u16(next_carry_s, kZero, 7);
+    vec_t next_carry_a = a_a[i] >> 15;
+    a_a[i] <<= 1;
+    a_a[i] |= vextq_u16(kZero, next_carry_a, 7);
+    a_a[i] |= carry_a;
+    carry_a = vextq_u16(next_carry_a, kZero, 7);
+  }
+}
+static inline void poly3_vec_rshift1(vec_t a_s[6], vec_t a_a[6]) {
+  vec_t carry_s = {0};
+  vec_t carry_a = {0};
+  const vec_t kZero = {0};
+  for (int i = 5; i >= 0; i--) {
+    vec_t next_carry_s = a_s[i] << 15;
+    a_s[i] >>= 1;
+    a_s[i] |= vextq_u16(next_carry_s, kZero, 1);
+    a_s[i] |= carry_s;
+    carry_s = vextq_u16(kZero, next_carry_s, 1);
+    vec_t next_carry_a = a_a[i] << 15;
+    a_a[i] >>= 1;
+    a_a[i] |= vextq_u16(next_carry_a, kZero, 1);
+    a_a[i] |= carry_a;
+    carry_a = vextq_u16(kZero, next_carry_a, 1);
+  }
+}
+#endif  // !OPENSSL_AARCH64
+#endif  // (ARM || AARCH64) && NEON
+// Polynomials in this scheme have N terms.
+// #define N 701
+// Underlying data types and arithmetic operations.
+// ------------------------------------------------
+// Binary polynomials.
+// poly2 represents a degree-N polynomial over GF(2). The words are in little-
+// endian order, i.e. the coefficient of x^0 is the LSB of the first word. The
+// final word is only partially used since N is not a multiple of the word size.
+// Defined in internal.h:
+// struct poly2 {
+//  crypto_word_t v[WORDS_PER_POLY];
+// };
+OPENSSL_UNUSED static void hexdump(const void *void_in, size_t len) {
+  const uint8_t *in = (const uint8_t *)void_in;
+  for (size_t i = 0; i < len; i++) {
+    printf("%02x", in[i]);
+  }
+  printf("\n");
+}
+static void poly2_zero(struct poly2 *p) {
+  OPENSSL_memset(&p->v[0], 0, sizeof(crypto_word_t) * WORDS_PER_POLY);
+}
+// poly2_cmov sets |out| to |in| iff |mov| is all ones.
+static void poly2_cmov(struct poly2 *out, const struct poly2 *in,
+                       crypto_word_t mov) {
+  for (size_t i = 0; i < WORDS_PER_POLY; i++) {
+    out->v[i] = (out->v[i] & ~mov) | (in->v[i] & mov);
+  }
+}
+// poly2_rotr_words performs a right-rotate on |in|, writing the result to
+// |out|. The shift count, |bits|, must be a non-zero multiple of the word size.
+static void poly2_rotr_words(struct poly2 *out, const struct poly2 *in,
+                             size_t bits) {
+  assert(bits >= BITS_PER_WORD && bits % BITS_PER_WORD == 0);
+  assert(out != in);
+  const size_t start = bits / BITS_PER_WORD;
+  const size_t n = (N - bits) / BITS_PER_WORD;
+  // The rotate is by a whole number of words so the first few words are easy:
+  // just move them down.
+  for (size_t i = 0; i < n; i++) {
+    out->v[i] = in->v[start + i];
+  }
+  // Since the last word is only partially filled, however, the remainder needs
+  // shifting and merging of words to take care of that.
+  crypto_word_t carry = in->v[WORDS_PER_POLY - 1];
+  for (size_t i = 0; i < start; i++) {
+    out->v[n + i] = carry | in->v[i] << BITS_IN_LAST_WORD;
+    carry = in->v[i] >> (BITS_PER_WORD - BITS_IN_LAST_WORD);
+  }
+  out->v[WORDS_PER_POLY - 1] = carry;
+}
+// poly2_rotr_bits performs a right-rotate on |in|, writing the result to |out|.
+// The shift count, |bits|, must be a power of two that is less than
+// |BITS_PER_WORD|.
+static void poly2_rotr_bits(struct poly2 *out, const struct poly2 *in,
+                            size_t bits) {
+  assert(bits <= BITS_PER_WORD / 2);
+  assert(bits != 0);
+  assert((bits & (bits - 1)) == 0);
+  assert(out != in);
+  // BITS_PER_WORD/2 is the greatest legal value of |bits|. If
+  // |BITS_IN_LAST_WORD| is smaller than this then the code below doesn't work
+  // because more than the last word needs to carry down in the previous one and
+  // so on.
+  OPENSSL_STATIC_ASSERT(
+      BITS_IN_LAST_WORD >= BITS_PER_WORD / 2,
+      "there are more carry bits than fit in BITS_IN_LAST_WORD");
+  crypto_word_t carry = in->v[WORDS_PER_POLY - 1] << (BITS_PER_WORD - bits);
+  for (size_t i = WORDS_PER_POLY - 2; i < WORDS_PER_POLY; i--) {
+    out->v[i] = carry | in->v[i] >> bits;
+    carry = in->v[i] << (BITS_PER_WORD - bits);
+  }
+  crypto_word_t last_word = carry >> (BITS_PER_WORD - BITS_IN_LAST_WORD) |
+                            in->v[WORDS_PER_POLY - 1] >> bits;
+  last_word &= (UINT64_C(1) << BITS_IN_LAST_WORD) - 1;
+  out->v[WORDS_PER_POLY - 1] = last_word;
+}
+// HRSS_poly2_rotr_consttime right-rotates |p| by |bits| in constant-time.
+void HRSS_poly2_rotr_consttime(struct poly2 *p, size_t bits) {
+  assert(bits <= N);
+  assert(p->v[WORDS_PER_POLY-1] >> BITS_IN_LAST_WORD == 0);
+  // Constant-time rotation is implemented by calculating the rotations of
+  // powers-of-two bits and throwing away the unneeded values. 2^9 (i.e. 512) is
+  // the largest power-of-two shift that we need to consider because 2^10 > N.
+#define HRSS_POLY2_MAX_SHIFT 9
+  size_t shift = HRSS_POLY2_MAX_SHIFT;
+  OPENSSL_STATIC_ASSERT((1 << (HRSS_POLY2_MAX_SHIFT + 1)) > N,
+                        "maximum shift is too small");
+  OPENSSL_STATIC_ASSERT((1 << HRSS_POLY2_MAX_SHIFT) <= N,
+                        "maximum shift is too large");
+  struct poly2 shifted;
+  for (; (UINT64_C(1) << shift) >= BITS_PER_WORD; shift--) {
+    poly2_rotr_words(&shifted, p, UINT64_C(1) << shift);
+    poly2_cmov(p, &shifted, ~((1 & (bits >> shift)) - 1));
+  }
+  for (; shift < HRSS_POLY2_MAX_SHIFT; shift--) {
+    poly2_rotr_bits(&shifted, p, UINT64_C(1) << shift);
+    poly2_cmov(p, &shifted, ~((1 & (bits >> shift)) - 1));
+  }
+#undef HRSS_POLY2_MAX_SHIFT
+}
+// poly2_cswap exchanges the values of |a| and |b| if |swap| is all ones.
+static void poly2_cswap(struct poly2 *a, struct poly2 *b, crypto_word_t swap) {
+  for (size_t i = 0; i < WORDS_PER_POLY; i++) {
+    const crypto_word_t sum = swap & (a->v[i] ^ b->v[i]);
+    a->v[i] ^= sum;
+    b->v[i] ^= sum;
+  }
+}
+// poly2_fmadd sets |out| to |out| + |in| * m, where m is either
+// |CONSTTIME_TRUE_W| or |CONSTTIME_FALSE_W|.
+static void poly2_fmadd(struct poly2 *out, const struct poly2 *in,
+                        crypto_word_t m) {
+  for (size_t i = 0; i < WORDS_PER_POLY; i++) {
+    out->v[i] ^= in->v[i] & m;
+  }
+}
+// poly2_lshift1 left-shifts |p| by one bit.
+static void poly2_lshift1(struct poly2 *p) {
+  crypto_word_t carry = 0;
+  for (size_t i = 0; i < WORDS_PER_POLY; i++) {
+    const crypto_word_t next_carry = p->v[i] >> (BITS_PER_WORD - 1);
+    p->v[i] <<= 1;
+    p->v[i] |= carry;
+    carry = next_carry;
+  }
+}
+// poly2_rshift1 right-shifts |p| by one bit.
+static void poly2_rshift1(struct poly2 *p) {
+  crypto_word_t carry = 0;
+  for (size_t i = WORDS_PER_POLY - 1; i < WORDS_PER_POLY; i--) {
+    const crypto_word_t next_carry = p->v[i] & 1;
+    p->v[i] >>= 1;
+    p->v[i] |= carry << (BITS_PER_WORD - 1);
+    carry = next_carry;
+  }
+}
+// poly2_clear_top_bits clears the bits in the final word that are only for
+// alignment.
+static void poly2_clear_top_bits(struct poly2 *p) {
+  p->v[WORDS_PER_POLY - 1] &= (UINT64_C(1) << BITS_IN_LAST_WORD) - 1;
+}
+// poly2_top_bits_are_clear returns one iff the extra bits in the final words of
+// |p| are zero.
+static int poly2_top_bits_are_clear(const struct poly2 *p) {
+  return (p->v[WORDS_PER_POLY - 1] &
+          ~((UINT64_C(1) << BITS_IN_LAST_WORD) - 1)) == 0;
+}
+// Ternary polynomials.
+// poly3 represents a degree-N polynomial over GF(3). Each coefficient is
+// bitsliced across the |s| and |a| arrays, like this:
+//
+//   s  |  a  | value
+//  -----------------
+//   0  |  0  | 0
+//   0  |  1  | 1
+//   1  |  1  | -1 (aka 2)
+//   1  |  0  | <invalid>
+//
+// ('s' is for sign, and 'a' is the absolute value.)
+//
+// Once bitsliced as such, the following circuits can be used to implement
+// addition and multiplication mod 3:
+//
+//   (s3, a3) = (s1, a1) × (s2, a2)
+//   a3 = a1 ∧ a2
+//   s3 = (s1 ⊕ s2) ∧ a3
+//
+//   (s3, a3) = (s1, a1) + (s2, a2)
+//   t = s1 ⊕ a2
+//   s3 = t ∧ (s2 ⊕ a1)
+//   a3 = (a1 ⊕ a2) ∨ (t ⊕ s2)
+//
+//   (s3, a3) = (s1, a1) - (s2, a2)
+//   t = a1 ⊕ a2
+//   s3 = (s1 ⊕ a2) ∧ (t ⊕ s2)
+//   a3 = t ∨ (s1 ⊕ s2)
+//
+// Negating a value just involves XORing s by a.
+//
+// struct poly3 {
+//   struct poly2 s, a;
+// };
+OPENSSL_UNUSED static void poly3_print(const struct poly3 *in) {
+  struct poly3 p;
+  OPENSSL_memcpy(&p, in, sizeof(p));
+  p.s.v[WORDS_PER_POLY - 1] &= ((crypto_word_t)1 << BITS_IN_LAST_WORD) - 1;
+  p.a.v[WORDS_PER_POLY - 1] &= ((crypto_word_t)1 << BITS_IN_LAST_WORD) - 1;
+  printf("{[");
+  for (unsigned i = 0; i < WORDS_PER_POLY; i++) {
+    if (i) {
+      printf(" ");
+    }
+    printf(BN_HEX_FMT2, p.s.v[i]);
+  }
+  printf("] [");
+  for (unsigned i = 0; i < WORDS_PER_POLY; i++) {
+    if (i) {
+      printf(" ");
+    }
+    printf(BN_HEX_FMT2, p.a.v[i]);
+  }
+  printf("]}\n");
+}
+static void poly3_zero(struct poly3 *p) {
+  poly2_zero(&p->s);
+  poly2_zero(&p->a);
+}
+// poly3_word_mul sets (|out_s|, |out_a) to (|s1|, |a1|) × (|s2|, |a2|).
+static void poly3_word_mul(crypto_word_t *out_s, crypto_word_t *out_a,
+                           const crypto_word_t s1, const crypto_word_t a1,
+                           const crypto_word_t s2, const crypto_word_t a2) {
+  *out_a = a1 & a2;
+  *out_s = (s1 ^ s2) & *out_a;
+}
+// poly3_word_add sets (|out_s|, |out_a|) to (|s1|, |a1|) + (|s2|, |a2|).
+static void poly3_word_add(crypto_word_t *out_s, crypto_word_t *out_a,
+                           const crypto_word_t s1, const crypto_word_t a1,
+                           const crypto_word_t s2, const crypto_word_t a2) {
+  const crypto_word_t t = s1 ^ a2;
+  *out_s = t & (s2 ^ a1);
+  *out_a = (a1 ^ a2) | (t ^ s2);
+}
+// poly3_word_sub sets (|out_s|, |out_a|) to (|s1|, |a1|) - (|s2|, |a2|).
+static void poly3_word_sub(crypto_word_t *out_s, crypto_word_t *out_a,
+                           const crypto_word_t s1, const crypto_word_t a1,
+                           const crypto_word_t s2, const crypto_word_t a2) {
+  const crypto_word_t t = a1 ^ a2;
+  *out_s = (s1 ^ a2) & (t ^ s2);
+  *out_a = t | (s1 ^ s2);
+}
+// lsb_to_all replicates the least-significant bit of |v| to all bits of the
+// word. This is used in bit-slicing operations to make a vector from a fixed
+// value.
+static crypto_word_t lsb_to_all(crypto_word_t v) { return 0u - (v & 1); }
+// poly3_mul_const sets |p| to |p|×m, where m = (ms, ma).
+static void poly3_mul_const(struct poly3 *p, crypto_word_t ms,
+                            crypto_word_t ma) {
+  ms = lsb_to_all(ms);
+  ma = lsb_to_all(ma);
+  for (size_t i = 0; i < WORDS_PER_POLY; i++) {
+    poly3_word_mul(&p->s.v[i], &p->a.v[i], p->s.v[i], p->a.v[i], ms, ma);
+  }
+}
+// poly3_rotr_consttime right-rotates |p| by |bits| in constant-time.
+static void poly3_rotr_consttime(struct poly3 *p, size_t bits) {
+  assert(bits <= N);
+  HRSS_poly2_rotr_consttime(&p->s, bits);
+  HRSS_poly2_rotr_consttime(&p->a, bits);
+}
+// poly3_fmadd sets |out| to |out| - |in|×m, where m is (ms, ma).
+static void poly3_fmsub(struct poly3 *RESTRICT out,
+                        const struct poly3 *RESTRICT in, crypto_word_t ms,
+                        crypto_word_t ma) {
+  crypto_word_t product_s, product_a;
+  for (size_t i = 0; i < WORDS_PER_POLY; i++) {
+    poly3_word_mul(&product_s, &product_a, in->s.v[i], in->a.v[i], ms, ma);
+    poly3_word_sub(&out->s.v[i], &out->a.v[i], out->s.v[i], out->a.v[i],
+                   product_s, product_a);
+  }
+}
+// final_bit_to_all replicates the bit in the final position of the last word to
+// all the bits in the word.
+static crypto_word_t final_bit_to_all(crypto_word_t v) {
+  return lsb_to_all(v >> (BITS_IN_LAST_WORD - 1));
+}
+// poly3_top_bits_are_clear returns one iff the extra bits in the final words of
+// |p| are zero.
+OPENSSL_UNUSED static int poly3_top_bits_are_clear(const struct poly3 *p) {
+  return poly2_top_bits_are_clear(&p->s) && poly2_top_bits_are_clear(&p->a);
+}
+// poly3_mod_phiN reduces |p| by Φ(N).
+static void poly3_mod_phiN(struct poly3 *p) {
+  // In order to reduce by Φ(N) we subtract by the value of the greatest
+  // coefficient.
+  const crypto_word_t factor_s = final_bit_to_all(p->s.v[WORDS_PER_POLY - 1]);
+  const crypto_word_t factor_a = final_bit_to_all(p->a.v[WORDS_PER_POLY - 1]);
+  for (size_t i = 0; i < WORDS_PER_POLY; i++) {
+    poly3_word_sub(&p->s.v[i], &p->a.v[i], p->s.v[i], p->a.v[i], factor_s,
+                   factor_a);
+  }
+  poly2_clear_top_bits(&p->s);
+  poly2_clear_top_bits(&p->a);
+}
+static void poly3_cswap(struct poly3 *a, struct poly3 *b, crypto_word_t swap) {
+  poly2_cswap(&a->s, &b->s, swap);
+  poly2_cswap(&a->a, &b->a, swap);
+}
+static void poly3_lshift1(struct poly3 *p) {
+  poly2_lshift1(&p->s);
+  poly2_lshift1(&p->a);
+}
+static void poly3_rshift1(struct poly3 *p) {
+  poly2_rshift1(&p->s);
+  poly2_rshift1(&p->a);
+}
+// poly3_span represents a pointer into a poly3.
+struct poly3_span {
+  crypto_word_t *s;
+  crypto_word_t *a;
+};
+// poly3_span_add adds |n| words of values from |a| and |b| and writes the
+// result to |out|.
+static void poly3_span_add(const struct poly3_span *out,
+                           const struct poly3_span *a,
+                           const struct poly3_span *b, size_t n) {
+  for (size_t i = 0; i < n; i++) {
+    poly3_word_add(&out->s[i], &out->a[i], a->s[i], a->a[i], b->s[i], b->a[i]);
+  }
+}
+// poly3_span_sub subtracts |n| words of |b| from |n| words of |a|.
+static void poly3_span_sub(const struct poly3_span *a,
+                           const struct poly3_span *b, size_t n) {
+  for (size_t i = 0; i < n; i++) {
+    poly3_word_sub(&a->s[i], &a->a[i], a->s[i], a->a[i], b->s[i], b->a[i]);
+  }
+}
+// poly3_mul_aux is a recursive function that multiplies |n| words from |a| and
+// |b| and writes 2×|n| words to |out|. Each call uses 2*ceil(n/2) elements of
+// |scratch| and the function recurses, except if |n| == 1, when |scratch| isn't
+// used and the recursion stops. For |n| in {11, 22}, the transitive total
+// amount of |scratch| needed happens to be 2n+2.
+static void poly3_mul_aux(const struct poly3_span *out,
+                          const struct poly3_span *scratch,
+                          const struct poly3_span *a,
+                          const struct poly3_span *b, size_t n) {
+  if (n == 1) {
+    crypto_word_t r_s_low = 0, r_s_high = 0, r_a_low = 0, r_a_high = 0;
+    crypto_word_t b_s = b->s[0], b_a = b->a[0];
+    const crypto_word_t a_s = a->s[0], a_a = a->a[0];
+    for (size_t i = 0; i < BITS_PER_WORD; i++) {
+      // Multiply (s, a) by the next value from (b_s, b_a).
+      crypto_word_t m_s, m_a;
+      poly3_word_mul(&m_s, &m_a, a_s, a_a, lsb_to_all(b_s), lsb_to_all(b_a));
+      b_s >>= 1;
+      b_a >>= 1;
+      if (i == 0) {
+        // Special case otherwise the code tries to shift by BITS_PER_WORD
+        // below, which is undefined.
+        r_s_low = m_s;
+        r_a_low = m_a;
+        continue;
+      }
+      // Shift the multiplication result to the correct position.
+      const crypto_word_t m_s_low = m_s << i;
+      const crypto_word_t m_s_high = m_s >> (BITS_PER_WORD - i);
+      const crypto_word_t m_a_low = m_a << i;
+      const crypto_word_t m_a_high = m_a >> (BITS_PER_WORD - i);
+      // Add into the result.
+      poly3_word_add(&r_s_low, &r_a_low, r_s_low, r_a_low, m_s_low, m_a_low);
+      poly3_word_add(&r_s_high, &r_a_high, r_s_high, r_a_high, m_s_high,
+                     m_a_high);
+    }
+    out->s[0] = r_s_low;
+    out->s[1] = r_s_high;
+    out->a[0] = r_a_low;
+    out->a[1] = r_a_high;
+    return;
+  }
+  // Karatsuba multiplication.
+  // https://en.wikipedia.org/wiki/Karatsuba_algorithm
+  // When |n| is odd, the two "halves" will have different lengths. The first
+  // is always the smaller.
+  const size_t low_len = n / 2;
+  const size_t high_len = n - low_len;
+  const struct poly3_span a_high = {&a->s[low_len], &a->a[low_len]};
+  const struct poly3_span b_high = {&b->s[low_len], &b->a[low_len]};
+  // Store a_1 + a_0 in the first half of |out| and b_1 + b_0 in the second
+  // half.
+  const struct poly3_span a_cross_sum = *out;
+  const struct poly3_span b_cross_sum = {&out->s[high_len], &out->a[high_len]};
+  poly3_span_add(&a_cross_sum, a, &a_high, low_len);
+  poly3_span_add(&b_cross_sum, b, &b_high, low_len);
+  if (high_len != low_len) {
+    a_cross_sum.s[low_len] = a_high.s[low_len];
+    a_cross_sum.a[low_len] = a_high.a[low_len];
+    b_cross_sum.s[low_len] = b_high.s[low_len];
+    b_cross_sum.a[low_len] = b_high.a[low_len];
+  }
+  const struct poly3_span child_scratch = {&scratch->s[2 * high_len],
+                                           &scratch->a[2 * high_len]};
+  const struct poly3_span out_mid = {&out->s[low_len], &out->a[low_len]};
+  const struct poly3_span out_high = {&out->s[2 * low_len],
+                                      &out->a[2 * low_len]};
+  // Calculate (a_1 + a_0) × (b_1 + b_0) and write to scratch buffer.
+  poly3_mul_aux(scratch, &child_scratch, &a_cross_sum, &b_cross_sum, high_len);
+  // Calculate a_1 × b_1.
+  poly3_mul_aux(&out_high, &child_scratch, &a_high, &b_high, high_len);
+  // Calculate a_0 × b_0.
+  poly3_mul_aux(out, &child_scratch, a, b, low_len);
+  // Subtract those last two products from the first.
+  poly3_span_sub(scratch, out, low_len * 2);
+  poly3_span_sub(scratch, &out_high, high_len * 2);
+  // Add the middle product into the output.
+  poly3_span_add(&out_mid, &out_mid, scratch, high_len * 2);
+}
+// HRSS_poly3_mul sets |*out| to |x|×|y| mod Φ(N).
+void HRSS_poly3_mul(struct poly3 *out, const struct poly3 *x,
+                    const struct poly3 *y) {
+  crypto_word_t prod_s[WORDS_PER_POLY * 2];
+  crypto_word_t prod_a[WORDS_PER_POLY * 2];
+  crypto_word_t scratch_s[WORDS_PER_POLY * 2 + 2];
+  crypto_word_t scratch_a[WORDS_PER_POLY * 2 + 2];
+  const struct poly3_span prod_span = {prod_s, prod_a};
+  const struct poly3_span scratch_span = {scratch_s, scratch_a};
+  const struct poly3_span x_span = {(crypto_word_t *)x->s.v,
+                                    (crypto_word_t *)x->a.v};
+  const struct poly3_span y_span = {(crypto_word_t *)y->s.v,
+                                    (crypto_word_t *)y->a.v};
+  poly3_mul_aux(&prod_span, &scratch_span, &x_span, &y_span, WORDS_PER_POLY);
+  // |prod| needs to be reduced mod (𝑥^n - 1), which just involves adding the
+  // upper-half to the lower-half. However, N is 701, which isn't a multiple of
+  // BITS_PER_WORD, so the upper-half vectors all have to be shifted before
+  // being added to the lower-half.
+  for (size_t i = 0; i < WORDS_PER_POLY; i++) {
+    crypto_word_t v_s = prod_s[WORDS_PER_POLY + i - 1] >> BITS_IN_LAST_WORD;
+    v_s |= prod_s[WORDS_PER_POLY + i] << (BITS_PER_WORD - BITS_IN_LAST_WORD);
+    crypto_word_t v_a = prod_a[WORDS_PER_POLY + i - 1] >> BITS_IN_LAST_WORD;
+    v_a |= prod_a[WORDS_PER_POLY + i] << (BITS_PER_WORD - BITS_IN_LAST_WORD);
+    poly3_word_add(&out->s.v[i], &out->a.v[i], prod_s[i], prod_a[i], v_s, v_a);
+  }
+  poly3_mod_phiN(out);
+}
+#if defined(HRSS_HAVE_VECTOR_UNIT) && !defined(OPENSSL_AARCH64)
+// poly3_vec_cswap swaps (|a_s|, |a_a|) and (|b_s|, |b_a|) if |swap| is
+// |0xff..ff|. Otherwise, |swap| must be zero.
+static inline void poly3_vec_cswap(vec_t a_s[6], vec_t a_a[6], vec_t b_s[6],
+                                   vec_t b_a[6], const vec_t swap) {
+  for (int i = 0; i < 6; i++) {
+    const vec_t sum_s = swap & (a_s[i] ^ b_s[i]);
+    a_s[i] ^= sum_s;
+    b_s[i] ^= sum_s;
+    const vec_t sum_a = swap & (a_a[i] ^ b_a[i]);
+    a_a[i] ^= sum_a;
+    b_a[i] ^= sum_a;
+  }
+}
+// poly3_vec_fmsub subtracts (|ms|, |ma|) × (|b_s|, |b_a|) from (|a_s|, |a_a|).
+static inline void poly3_vec_fmsub(vec_t a_s[6], vec_t a_a[6], vec_t b_s[6],
+                                   vec_t b_a[6], const vec_t ms,
+                                   const vec_t ma) {
+  for (int i = 0; i < 6; i++) {
+    // See the bitslice formula, above.
+    const vec_t s = b_s[i];
+    const vec_t a = b_a[i];
+    const vec_t product_a = a & ma;
+    const vec_t product_s = (s ^ ms) & product_a;
+    const vec_t out_s = a_s[i];
+    const vec_t out_a = a_a[i];
+    const vec_t t = out_a ^ product_a;
+    a_s[i] = (out_s ^ product_a) & (t ^ product_s);
+    a_a[i] = t | (out_s ^ product_s);
+  }
+}
+// poly3_invert_vec sets |*out| to |in|^-1, i.e. such that |out|×|in| == 1 mod
+// Φ(N).
+static void poly3_invert_vec(struct poly3 *out, const struct poly3 *in) {
+  // See the comment in |HRSS_poly3_invert| about this algorithm. In addition to
+  // the changes described there, this implementation attempts to use vector
+  // registers to speed up the computation. Even non-poly3 variables are held in
+  // vectors where possible to minimise the amount of data movement between
+  // the vector and general-purpose registers.
+  vec_t b_s[6], b_a[6], c_s[6], c_a[6], f_s[6], f_a[6], g_s[6], g_a[6];
+  const vec_t kZero = {0};
+  const vec_t kOne = {1};
+  static const uint8_t kOneBytes[sizeof(vec_t)] = {1};
+  static const uint8_t kBottomSixtyOne[sizeof(vec_t)] = {
+      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x1f};
+  memset(b_s, 0, sizeof(b_s));
+  memcpy(b_a, kOneBytes, sizeof(kOneBytes));
+  memset(&b_a[1], 0, 5 * sizeof(vec_t));
+  memset(c_s, 0, sizeof(c_s));
+  memset(c_a, 0, sizeof(c_a));
+  f_s[5] = kZero;
+  memcpy(f_s, in->s.v, WORDS_PER_POLY * sizeof(crypto_word_t));
+  f_a[5] = kZero;
+  memcpy(f_a, in->a.v, WORDS_PER_POLY * sizeof(crypto_word_t));
+  // Set g to all ones.
+  memset(g_s, 0, sizeof(g_s));
+  memset(g_a, 0xff, 5 * sizeof(vec_t));
+  memcpy(&g_a[5], kBottomSixtyOne, sizeof(kBottomSixtyOne));
+  vec_t deg_f = {N - 1}, deg_g = {N - 1}, rotation = kZero;
+  vec_t k = kOne;
+  vec_t f0s = {0}, f0a = {0};
+  vec_t still_going;
+  memset(&still_going, 0xff, sizeof(still_going));
+  for (unsigned i = 0; i < 2 * (N - 1) - 1; i++) {
+    const vec_t s_a = vec_broadcast_bit(still_going & (f_a[0] & g_a[0]));
+    const vec_t s_s =
+        vec_broadcast_bit(still_going & ((f_s[0] ^ g_s[0]) & s_a));
+    const vec_t should_swap =
+        (s_s | s_a) & vec_broadcast_bit15(deg_f - deg_g);
+    poly3_vec_cswap(f_s, f_a, g_s, g_a, should_swap);
+    poly3_vec_fmsub(f_s, f_a, g_s, g_a, s_s, s_a);
+    poly3_vec_rshift1(f_s, f_a);
+    poly3_vec_cswap(b_s, b_a, c_s, c_a, should_swap);
+    poly3_vec_fmsub(b_s, b_a, c_s, c_a, s_s, s_a);
+    poly3_vec_lshift1(c_s, c_a);
+    const vec_t deg_sum = should_swap & (deg_f ^ deg_g);
+    deg_f ^= deg_sum;
+    deg_g ^= deg_sum;
+    deg_f -= kOne;
+    still_going &= ~vec_broadcast_bit15(deg_f - kOne);
+    const vec_t f0_is_nonzero = vec_broadcast_bit(f_s[0] | f_a[0]);
+    // |f0_is_nonzero| implies |still_going|.
+    rotation ^= f0_is_nonzero & (k ^ rotation);
+    k += kOne;
+    const vec_t f0s_sum = f0_is_nonzero & (f_s[0] ^ f0s);
+    f0s ^= f0s_sum;
+    const vec_t f0a_sum = f0_is_nonzero & (f_a[0] ^ f0a);
+    f0a ^= f0a_sum;
+  }
+  crypto_word_t rotation_word = vec_get_word(rotation, 0);
+  rotation_word -= N & constant_time_lt_w(N, rotation_word);
+  memcpy(out->s.v, b_s, WORDS_PER_POLY * sizeof(crypto_word_t));
+  memcpy(out->a.v, b_a, WORDS_PER_POLY * sizeof(crypto_word_t));
+  assert(poly3_top_bits_are_clear(out));
+  poly3_rotr_consttime(out, rotation_word);
+  poly3_mul_const(out, vec_get_word(f0s, 0), vec_get_word(f0a, 0));
+  poly3_mod_phiN(out);
+}
+#endif  // HRSS_HAVE_VECTOR_UNIT
+// HRSS_poly3_invert sets |*out| to |in|^-1, i.e. such that |out|×|in| == 1 mod
+// Φ(N).
+void HRSS_poly3_invert(struct poly3 *out, const struct poly3 *in) {
+  // The vector version of this function seems slightly slower on AArch64, but
+  // is useful on ARMv7 and x86-64.
+#if defined(HRSS_HAVE_VECTOR_UNIT) && !defined(OPENSSL_AARCH64)
+  if (vec_capable()) {
+    poly3_invert_vec(out, in);
+    return;
+  }
+#endif
+  // This algorithm mostly follows algorithm 10 in the paper. Some changes:
+  //   1) k should start at zero, not one. In the code below k is omitted and
+  //      the loop counter, |i|, is used instead.
+  //   2) The rotation count is conditionally updated to handle trailing zero
+  //      coefficients.
+  // The best explanation for why it works is in the "Why it works" section of
+  // [NTRUTN14].
+  struct poly3 c, f, g;
+  OPENSSL_memcpy(&f, in, sizeof(f));
+  // Set g to all ones.
+  OPENSSL_memset(&g.s, 0, sizeof(struct poly2));
+  OPENSSL_memset(&g.a, 0xff, sizeof(struct poly2));
+  g.a.v[WORDS_PER_POLY - 1] >>= BITS_PER_WORD - BITS_IN_LAST_WORD;
+  struct poly3 *b = out;
+  poly3_zero(b);
+  poly3_zero(&c);
+  // Set b to one.
+  b->a.v[0] = 1;
+  crypto_word_t deg_f = N - 1, deg_g = N - 1, rotation = 0;
+  crypto_word_t f0s = 0, f0a = 0;
+  crypto_word_t still_going = CONSTTIME_TRUE_W;
+  for (unsigned i = 0; i < 2 * (N - 1) - 1; i++) {
+    const crypto_word_t s_a = lsb_to_all(
+        still_going & (f.a.v[0] & g.a.v[0]));
+    const crypto_word_t s_s = lsb_to_all(
+        still_going & ((f.s.v[0] ^ g.s.v[0]) & s_a));
+    const crypto_word_t should_swap =
+        (s_s | s_a) & constant_time_lt_w(deg_f, deg_g);
+    poly3_cswap(&f, &g, should_swap);
+    poly3_cswap(b, &c, should_swap);
+    const crypto_word_t deg_sum = should_swap & (deg_f ^ deg_g);
+    deg_f ^= deg_sum;
+    deg_g ^= deg_sum;
+    assert(deg_g >= 1);
+    poly3_fmsub(&f, &g, s_s, s_a);
+    poly3_fmsub(b, &c, s_s, s_a);
+    poly3_rshift1(&f);
+    poly3_lshift1(&c);
+    deg_f--;
+    const crypto_word_t f0_is_nonzero =
+        lsb_to_all(f.s.v[0]) | lsb_to_all(f.a.v[0]);
+    // |f0_is_nonzero| implies |still_going|.
+    assert(!(f0_is_nonzero && !still_going));
+    still_going &= ~constant_time_is_zero_w(deg_f);
+    rotation = constant_time_select_w(f0_is_nonzero, i, rotation);
+    f0s = constant_time_select_w(f0_is_nonzero, f.s.v[0], f0s);
+    f0a = constant_time_select_w(f0_is_nonzero, f.a.v[0], f0a);
+  }
+  rotation++;
+  rotation -= N & constant_time_lt_w(N, rotation);
+  assert(poly3_top_bits_are_clear(out));
+  poly3_rotr_consttime(out, rotation);
+  poly3_mul_const(out, f0s, f0a);
+  poly3_mod_phiN(out);
+}
+// Polynomials in Q.
+// Coefficients are reduced mod Q. (Q is clearly not prime, therefore the
+// coefficients do not form a field.)
+#define Q 8192
+// VECS_PER_POLY is the number of 128-bit vectors needed to represent a
+// polynomial.
+#define COEFFICIENTS_PER_VEC (sizeof(vec_t) / sizeof(uint16_t))
+#define VECS_PER_POLY ((N + COEFFICIENTS_PER_VEC - 1) / COEFFICIENTS_PER_VEC)
+// poly represents a polynomial with coefficients mod Q. Note that, while Q is a
+// power of two, this does not operate in GF(Q). That would be a binary field
+// but this is simply mod Q. Thus the coefficients are not a field.
+//
+// Coefficients are ordered little-endian, thus the coefficient of x^0 is the
+// first element of the array.
+struct poly {
+#if defined(HRSS_HAVE_VECTOR_UNIT)
+  union {
+    // N + 3 = 704, which is a multiple of 64 and thus aligns things, esp for
+    // the vector code.
+    uint16_t v[N + 3];
+    vec_t vectors[VECS_PER_POLY];
+  };
+#else
+  // Even if !HRSS_HAVE_VECTOR_UNIT, external assembly may be called that
+  // requires alignment.
+  alignas(16) uint16_t v[N + 3];
+#endif
+};
+OPENSSL_UNUSED static void poly_print(const struct poly *p) {
+  printf("[");
+  for (unsigned i = 0; i < N; i++) {
+    if (i) {
+      printf(" ");
+    }
+    printf("%d", p->v[i]);
+  }
+  printf("]\n");
+}
+#if defined(HRSS_HAVE_VECTOR_UNIT)
+// poly_mul_vec_aux is a recursive function that multiplies |n| words from |a|
+// and |b| and writes 2×|n| words to |out|. Each call uses 2*ceil(n/2) elements
+// of |scratch| and the function recurses, except if |n| < 3, when |scratch|
+// isn't used and the recursion stops. If |n| == |VECS_PER_POLY| then |scratch|
+// needs 172 elements.
+static void poly_mul_vec_aux(vec_t *restrict out, vec_t *restrict scratch,
+                             const vec_t *restrict a, const vec_t *restrict b,
+                             const size_t n) {
+  // In [HRSS], the technique they used for polynomial multiplication is
+  // described: they start with Toom-4 at the top level and then two layers of
+  // Karatsuba. Karatsuba is a specific instance of the general Toom–Cook
+  // decomposition, which splits an input n-ways and produces 2n-1
+  // multiplications of those parts. So, starting with 704 coefficients (rounded
+  // up from 701 to have more factors of two), Toom-4 gives seven
+  // multiplications of degree-174 polynomials. Each round of Karatsuba (which
+  // is Toom-2) increases the number of multiplications by a factor of three
+  // while halving the size of the values being multiplied. So two rounds gives
+  // 63 multiplications of degree-44 polynomials. Then they (I think) form
+  // vectors by gathering all 63 coefficients of each power together, for each
+  // input, and doing more rounds of Karatsuba on the vectors until they bottom-
+  // out somewhere with schoolbook multiplication.
+  //
+  // I tried something like that for NEON. NEON vectors are 128 bits so hold
+  // eight coefficients. I wrote a function that did Karatsuba on eight
+  // multiplications at the same time, using such vectors, and a Go script that
+  // decomposed from degree-704, with Karatsuba in non-transposed form, until it
+  // reached multiplications of degree-44. It batched up those 81
+  // multiplications into lots of eight with a single one left over (which was
+  // handled directly).
+  //
+  // It worked, but it was significantly slower than the dumb algorithm used
+  // below. Potentially that was because I misunderstood how [HRSS] did it, or
+  // because Clang is bad at generating good code from NEON intrinsics on ARMv7.
+  // (Which is true: the code generated by Clang for the below is pretty crap.)
+  //
+  // This algorithm is much simpler. It just does Karatsuba decomposition all
+  // the way down and never transposes. When it gets down to degree-16 or
+  // degree-24 values, they are multiplied using schoolbook multiplication and
+  // vector intrinsics. The vector operations form each of the eight phase-
+  // shifts of one of the inputs, point-wise multiply, and then add into the
+  // result at the correct place. This means that 33% (degree-16) or 25%
+  // (degree-24) of the multiplies and adds are wasted, but it does ok.
+  if (n == 2) {
+    vec_t result[4];
+    vec_t vec_a[3];
+    static const vec_t kZero = {0};
+    vec_a[0] = a[0];
+    vec_a[1] = a[1];
+    vec_a[2] = kZero;
+    result[0] = vec_mul(vec_a[0], vec_get_word(b[0], 0));
+    result[1] = vec_mul(vec_a[1], vec_get_word(b[0], 0));
+    result[1] = vec_fma(result[1], vec_a[0], vec_get_word(b[1], 0));
+    result[2] = vec_mul(vec_a[1], vec_get_word(b[1], 0));
+    result[3] = kZero;
+    vec3_rshift_word(vec_a);
+#define BLOCK(x, y)                                                      \
+  do {                                                                   \
+    result[x + 0] =                                                      \
+        vec_fma(result[x + 0], vec_a[0], vec_get_word(b[y / 8], y % 8)); \
+    result[x + 1] =                                                      \
+        vec_fma(result[x + 1], vec_a[1], vec_get_word(b[y / 8], y % 8)); \
+    result[x + 2] =                                                      \
+        vec_fma(result[x + 2], vec_a[2], vec_get_word(b[y / 8], y % 8)); \
+  } while (0)
+    BLOCK(0, 1);
+    BLOCK(1, 9);
+    vec3_rshift_word(vec_a);
+    BLOCK(0, 2);
+    BLOCK(1, 10);
+    vec3_rshift_word(vec_a);
+    BLOCK(0, 3);
+    BLOCK(1, 11);
+    vec3_rshift_word(vec_a);
+    BLOCK(0, 4);
+    BLOCK(1, 12);
+    vec3_rshift_word(vec_a);
+    BLOCK(0, 5);
+    BLOCK(1, 13);
+    vec3_rshift_word(vec_a);
+    BLOCK(0, 6);
+    BLOCK(1, 14);
+    vec3_rshift_word(vec_a);
+    BLOCK(0, 7);
+    BLOCK(1, 15);
+#undef BLOCK
+    memcpy(out, result, sizeof(result));
+    return;
+  }
+  if (n == 3) {
+    vec_t result[6];
+    vec_t vec_a[4];
+    static const vec_t kZero = {0};
+    vec_a[0] = a[0];
+    vec_a[1] = a[1];
+    vec_a[2] = a[2];
+    vec_a[3] = kZero;
+    result[0] = vec_mul(a[0], vec_get_word(b[0], 0));
+    result[1] = vec_mul(a[1], vec_get_word(b[0], 0));
+    result[2] = vec_mul(a[2], vec_get_word(b[0], 0));
+#define BLOCK_PRE(x, y)                                                  \
+  do {                                                                   \
+    result[x + 0] =                                                      \
+        vec_fma(result[x + 0], vec_a[0], vec_get_word(b[y / 8], y % 8)); \
+    result[x + 1] =                                                      \
+        vec_fma(result[x + 1], vec_a[1], vec_get_word(b[y / 8], y % 8)); \
+    result[x + 2] = vec_mul(vec_a[2], vec_get_word(b[y / 8], y % 8));    \
+  } while (0)
+    BLOCK_PRE(1, 8);
+    BLOCK_PRE(2, 16);
+    result[5] = kZero;
+    vec4_rshift_word(vec_a);
+#define BLOCK(x, y)                                                      \
+  do {                                                                   \
+    result[x + 0] =                                                      \
+        vec_fma(result[x + 0], vec_a[0], vec_get_word(b[y / 8], y % 8)); \
+    result[x + 1] =                                                      \
+        vec_fma(result[x + 1], vec_a[1], vec_get_word(b[y / 8], y % 8)); \
+    result[x + 2] =                                                      \
+        vec_fma(result[x + 2], vec_a[2], vec_get_word(b[y / 8], y % 8)); \
+    result[x + 3] =                                                      \
+        vec_fma(result[x + 3], vec_a[3], vec_get_word(b[y / 8], y % 8)); \
+  } while (0)
+    BLOCK(0, 1);
+    BLOCK(1, 9);
+    BLOCK(2, 17);
+    vec4_rshift_word(vec_a);
+    BLOCK(0, 2);
+    BLOCK(1, 10);
+    BLOCK(2, 18);
+    vec4_rshift_word(vec_a);
+    BLOCK(0, 3);
+    BLOCK(1, 11);
+    BLOCK(2, 19);
+    vec4_rshift_word(vec_a);
+    BLOCK(0, 4);
+    BLOCK(1, 12);
+    BLOCK(2, 20);
+    vec4_rshift_word(vec_a);
+    BLOCK(0, 5);
+    BLOCK(1, 13);
+    BLOCK(2, 21);
+    vec4_rshift_word(vec_a);
+    BLOCK(0, 6);
+    BLOCK(1, 14);
+    BLOCK(2, 22);
+    vec4_rshift_word(vec_a);
+    BLOCK(0, 7);
+    BLOCK(1, 15);
+    BLOCK(2, 23);
+#undef BLOCK
+#undef BLOCK_PRE
+    memcpy(out, result, sizeof(result));
+    return;
+  }
+  // Karatsuba multiplication.
+  // https://en.wikipedia.org/wiki/Karatsuba_algorithm
+  // When |n| is odd, the two "halves" will have different lengths. The first is
+  // always the smaller.
+  const size_t low_len = n / 2;
+  const size_t high_len = n - low_len;
+  const vec_t *a_high = &a[low_len];
+  const vec_t *b_high = &b[low_len];
+  // Store a_1 + a_0 in the first half of |out| and b_1 + b_0 in the second
+  // half.
+  for (size_t i = 0; i < low_len; i++) {
+    out[i] = vec_add(a_high[i], a[i]);
+    out[high_len + i] = vec_add(b_high[i], b[i]);
+  }
+  if (high_len != low_len) {
+    out[low_len] = a_high[low_len];
+    out[high_len + low_len] = b_high[low_len];
+  }
+  vec_t *const child_scratch = &scratch[2 * high_len];
+  // Calculate (a_1 + a_0) × (b_1 + b_0) and write to scratch buffer.
+  poly_mul_vec_aux(scratch, child_scratch, out, &out[high_len], high_len);
+  // Calculate a_1 × b_1.
+  poly_mul_vec_aux(&out[low_len * 2], child_scratch, a_high, b_high, high_len);
+  // Calculate a_0 × b_0.
+  poly_mul_vec_aux(out, child_scratch, a, b, low_len);
+  // Subtract those last two products from the first.
+  for (size_t i = 0; i < low_len * 2; i++) {
+    scratch[i] = vec_sub(scratch[i], vec_add(out[i], out[low_len * 2 + i]));
+  }
+  if (low_len != high_len) {
+    scratch[low_len * 2] = vec_sub(scratch[low_len * 2], out[low_len * 4]);
+    scratch[low_len * 2 + 1] =
+        vec_sub(scratch[low_len * 2 + 1], out[low_len * 4 + 1]);
+  }
+  // Add the middle product into the output.
+  for (size_t i = 0; i < high_len * 2; i++) {
+    out[low_len + i] = vec_add(out[low_len + i], scratch[i]);
+  }
+}
+// poly_mul_vec sets |*out| to |x|×|y| mod (𝑥^n - 1).
+static void poly_mul_vec(struct poly *out, const struct poly *x,
+                         const struct poly *y) {
+  OPENSSL_memset((uint16_t *)&x->v[N], 0, 3 * sizeof(uint16_t));
+  OPENSSL_memset((uint16_t *)&y->v[N], 0, 3 * sizeof(uint16_t));
+  OPENSSL_STATIC_ASSERT(sizeof(out->v) == sizeof(vec_t) * VECS_PER_POLY,
+                        "struct poly is the wrong size");
+  OPENSSL_STATIC_ASSERT(alignof(struct poly) == alignof(vec_t),
+                        "struct poly has incorrect alignment");
+  vec_t prod[VECS_PER_POLY * 2];
+  vec_t scratch[172];
+  poly_mul_vec_aux(prod, scratch, x->vectors, y->vectors, VECS_PER_POLY);
+  // |prod| needs to be reduced mod (𝑥^n - 1), which just involves adding the
+  // upper-half to the lower-half. However, N is 701, which isn't a multiple of
+  // the vector size, so the upper-half vectors all have to be shifted before
+  // being added to the lower-half.
+  vec_t *out_vecs = (vec_t *)out->v;
+  for (size_t i = 0; i < VECS_PER_POLY; i++) {
+    const vec_t prev = prod[VECS_PER_POLY - 1 + i];
+    const vec_t this = prod[VECS_PER_POLY + i];
+    out_vecs[i] = vec_add(prod[i], vec_merge_3_5(prev, this));
+  }
+  OPENSSL_memset(&out->v[N], 0, 3 * sizeof(uint16_t));
+}
+#endif  // HRSS_HAVE_VECTOR_UNIT
+// poly_mul_novec_aux writes the product of |a| and |b| to |out|, using
+// |scratch| as scratch space. It'll use Karatsuba if the inputs are large
+// enough to warrant it. Each call uses 2*ceil(n/2) elements of |scratch| and
+// the function recurses, except if |n| < 64, when |scratch| isn't used and the
+// recursion stops. If |n| == |N| then |scratch| needs 1318 elements.
+static void poly_mul_novec_aux(uint16_t *out, uint16_t *scratch,
+                               const uint16_t *a, const uint16_t *b, size_t n) {
+  static const size_t kSchoolbookLimit = 64;
+  if (n < kSchoolbookLimit) {
+    OPENSSL_memset(out, 0, sizeof(uint16_t) * n * 2);
+    for (size_t i = 0; i < n; i++) {
+      for (size_t j = 0; j < n; j++) {
+        out[i + j] += (unsigned) a[i] * b[j];
+      }
+    }
+    return;
+  }
+  // Karatsuba multiplication.
+  // https://en.wikipedia.org/wiki/Karatsuba_algorithm
+  // When |n| is odd, the two "halves" will have different lengths. The
+  // first is always the smaller.
+  const size_t low_len = n / 2;
+  const size_t high_len = n - low_len;
+  const uint16_t *const a_high = &a[low_len];
+  const uint16_t *const b_high = &b[low_len];
+  for (size_t i = 0; i < low_len; i++) {
+    out[i] = a_high[i] + a[i];
+    out[high_len + i] = b_high[i] + b[i];
+  }
+  if (high_len != low_len) {
+    out[low_len] = a_high[low_len];
+    out[high_len + low_len] = b_high[low_len];
+  }
+  uint16_t *const child_scratch = &scratch[2 * high_len];
+  poly_mul_novec_aux(scratch, child_scratch, out, &out[high_len], high_len);
+  poly_mul_novec_aux(&out[low_len * 2], child_scratch, a_high, b_high,
+                     high_len);
+  poly_mul_novec_aux(out, child_scratch, a, b, low_len);
+  for (size_t i = 0; i < low_len * 2; i++) {
+    scratch[i] -= out[i] + out[low_len * 2 + i];
+  }
+  if (low_len != high_len) {
+    scratch[low_len * 2] -= out[low_len * 4];
+    assert(out[low_len * 4 + 1] == 0);
+  }
+  for (size_t i = 0; i < high_len * 2; i++) {
+    out[low_len + i] += scratch[i];
+  }
+}
+// poly_mul_novec sets |*out| to |x|×|y| mod (𝑥^n - 1).
+static void poly_mul_novec(struct poly *out, const struct poly *x,
+                           const struct poly *y) {
+  uint16_t prod[2 * N];
+  uint16_t scratch[1318];
+  poly_mul_novec_aux(prod, scratch, x->v, y->v, N);
+  for (size_t i = 0; i < N; i++) {
+    out->v[i] = prod[i] + prod[i + N];
+  }
+  OPENSSL_memset(&out->v[N], 0, 3 * sizeof(uint16_t));
+}
+static void poly_mul(struct poly *r, const struct poly *a,
+                     const struct poly *b) {
+#if defined(POLY_RQ_MUL_ASM)
+  const int has_avx2 = (OPENSSL_ia32cap_P[2] & (1 << 5)) != 0;
+  if (has_avx2) {
+    poly_Rq_mul(r->v, a->v, b->v);
+    return;
+  }
+#endif
+#if defined(HRSS_HAVE_VECTOR_UNIT)
+  if (vec_capable()) {
+    poly_mul_vec(r, a, b);
+    return;
+  }
+#endif
+  // Fallback, non-vector case.
+  poly_mul_novec(r, a, b);
+}
+// poly_mul_x_minus_1 sets |p| to |p|×(𝑥 - 1) mod (𝑥^n - 1).
+static void poly_mul_x_minus_1(struct poly *p) {
+  // Multiplying by (𝑥 - 1) means negating each coefficient and adding in
+  // the value of the previous one.
+  const uint16_t orig_final_coefficient = p->v[N - 1];
+  for (size_t i = N - 1; i > 0; i--) {
+    p->v[i] = p->v[i - 1] - p->v[i];
+  }
+  p->v[0] = orig_final_coefficient - p->v[0];
+}
+// poly_mod_phiN sets |p| to |p| mod Φ(N).
+static void poly_mod_phiN(struct poly *p) {
+  const uint16_t coeff700 = p->v[N - 1];
+  for (unsigned i = 0; i < N; i++) {
+    p->v[i] -= coeff700;
+  }
+}
+// poly_clamp reduces each coefficient mod Q.
+static void poly_clamp(struct poly *p) {
+  for (unsigned i = 0; i < N; i++) {
+    p->v[i] &= Q - 1;
+  }
+}
+// Conversion functions
+// --------------------
+// poly2_from_poly sets |*out| to |in| mod 2.
+static void poly2_from_poly(struct poly2 *out, const struct poly *in) {
+  crypto_word_t *words = out->v;
+  unsigned shift = 0;
+  crypto_word_t word = 0;
+  for (unsigned i = 0; i < N; i++) {
+    word >>= 1;
+    word |= (crypto_word_t)(in->v[i] & 1) << (BITS_PER_WORD - 1);
+    shift++;
+    if (shift == BITS_PER_WORD) {
+      *words = word;
+      words++;
+      word = 0;
+      shift = 0;
+    }
+  }
+  word >>= BITS_PER_WORD - shift;
+  *words = word;
+}
+// mod3 treats |a| as a signed number and returns |a| mod 3.
+static uint16_t mod3(int16_t a) {
+  const int16_t q = ((int32_t)a * 21845) >> 16;
+  int16_t ret = a - 3 * q;
+  // At this point, |ret| is in {0, 1, 2, 3} and that needs to be mapped to {0,
+  // 1, 2, 0}.
+  return ret & ((ret & (ret >> 1)) - 1);
+}
+// poly3_from_poly sets |*out| to |in|.
+static void poly3_from_poly(struct poly3 *out, const struct poly *in) {
+  crypto_word_t *words_s = out->s.v;
+  crypto_word_t *words_a = out->a.v;
+  crypto_word_t s = 0;
+  crypto_word_t a = 0;
+  unsigned shift = 0;
+  for (unsigned i = 0; i < N; i++) {
+    // This duplicates the 13th bit upwards to the top of the uint16,
+    // essentially treating it as a sign bit and converting into a signed int16.
+    // The signed value is reduced mod 3, yielding {0, 1, 2}.
+    const uint16_t v = mod3((int16_t)(in->v[i] << 3) >> 3);
+    s >>= 1;
+    const crypto_word_t s_bit = (crypto_word_t)(v & 2) << (BITS_PER_WORD - 2);
+    s |= s_bit;
+    a >>= 1;
+    a |= s_bit | (crypto_word_t)(v & 1) << (BITS_PER_WORD - 1);
+    shift++;
+    if (shift == BITS_PER_WORD) {
+      *words_s = s;
+      words_s++;
+      *words_a = a;
+      words_a++;
+      s = a = 0;
+      shift = 0;
+    }
+  }
+  s >>= BITS_PER_WORD - shift;
+  a >>= BITS_PER_WORD - shift;
+  *words_s = s;
+  *words_a = a;
+}
+// poly3_from_poly_checked sets |*out| to |in|, which has coefficients in {0, 1,
+// Q-1}. It returns a mask indicating whether all coefficients were found to be
+// in that set.
+static crypto_word_t poly3_from_poly_checked(struct poly3 *out,
+                                             const struct poly *in) {
+  crypto_word_t *words_s = out->s.v;
+  crypto_word_t *words_a = out->a.v;
+  crypto_word_t s = 0;
+  crypto_word_t a = 0;
+  unsigned shift = 0;
+  crypto_word_t ok = CONSTTIME_TRUE_W;
+  for (unsigned i = 0; i < N; i++) {
+    const uint16_t v = in->v[i];
+    // Maps {0, 1, Q-1} to {0, 1, 2}.
+    uint16_t mod3 = v & 3;
+    mod3 ^= mod3 >> 1;
+    const uint16_t expected = (uint16_t)((~((mod3 >> 1) - 1)) | mod3) % Q;
+    ok &= constant_time_eq_w(v, expected);
+    s >>= 1;
+    const crypto_word_t s_bit = (crypto_word_t)(mod3 & 2)
+                                << (BITS_PER_WORD - 2);
+    s |= s_bit;
+    a >>= 1;
+    a |= s_bit | (crypto_word_t)(mod3 & 1) << (BITS_PER_WORD - 1);
+    shift++;
+    if (shift == BITS_PER_WORD) {
+      *words_s = s;
+      words_s++;
+      *words_a = a;
+      words_a++;
+      s = a = 0;
+      shift = 0;
+    }
+  }
+  s >>= BITS_PER_WORD - shift;
+  a >>= BITS_PER_WORD - shift;
+  *words_s = s;
+  *words_a = a;
+  return ok;
+}
+static void poly_from_poly2(struct poly *out, const struct poly2 *in) {
+  const crypto_word_t *words = in->v;
+  unsigned shift = 0;
+  crypto_word_t word = *words;
+  for (unsigned i = 0; i < N; i++) {
+    out->v[i] = word & 1;
+    word >>= 1;
+    shift++;
+    if (shift == BITS_PER_WORD) {
+      words++;
+      word = *words;
+      shift = 0;
+    }
+  }
+}
+static void poly_from_poly3(struct poly *out, const struct poly3 *in) {
+  const crypto_word_t *words_s = in->s.v;
+  const crypto_word_t *words_a = in->a.v;
+  crypto_word_t word_s = ~(*words_s);
+  crypto_word_t word_a = *words_a;
+  unsigned shift = 0;
+  for (unsigned i = 0; i < N; i++) {
+    out->v[i] = (uint16_t)(word_s & 1) - 1;
+    out->v[i] |= word_a & 1;
+    word_s >>= 1;
+    word_a >>= 1;
+    shift++;
+    if (shift == BITS_PER_WORD) {
+      words_s++;
+      words_a++;
+      word_s = ~(*words_s);
+      word_a = *words_a;
+      shift = 0;
+    }
+  }
+}
+// Polynomial inversion
+// --------------------
+// poly_invert_mod2 sets |*out| to |in^-1| (i.e. such that |*out|×|in| = 1 mod
+// Φ(N)), all mod 2. This isn't useful in itself, but is part of doing inversion
+// mod Q.
+static void poly_invert_mod2(struct poly *out, const struct poly *in) {
+  // This algorithm follows algorithm 10 in the paper. (Although, in contrast to
+  // the paper, k should start at zero, not one, and the rotation count is needs
+  // to handle trailing zero coefficients.) The best explanation for why it
+  // works is in the "Why it works" section of [NTRUTN14].
+  struct poly2 b, c, f, g;
+  poly2_from_poly(&f, in);
+  OPENSSL_memset(&b, 0, sizeof(b));
+  b.v[0] = 1;
+  OPENSSL_memset(&c, 0, sizeof(c));
+  // Set g to all ones.
+  OPENSSL_memset(&g, 0xff, sizeof(struct poly2));
+  g.v[WORDS_PER_POLY - 1] >>= BITS_PER_WORD - BITS_IN_LAST_WORD;
+  crypto_word_t deg_f = N - 1, deg_g = N - 1, rotation = 0;
+  crypto_word_t still_going = CONSTTIME_TRUE_W;
+  for (unsigned i = 0; i < 2 * (N - 1) - 1; i++) {
+    const crypto_word_t s = still_going & lsb_to_all(f.v[0]);
+    const crypto_word_t should_swap = s & constant_time_lt_w(deg_f, deg_g);
+    poly2_cswap(&f, &g, should_swap);
+    poly2_cswap(&b, &c, should_swap);
+    const crypto_word_t deg_sum = should_swap & (deg_f ^ deg_g);
+    deg_f ^= deg_sum;
+    deg_g ^= deg_sum;
+    assert(deg_g >= 1);
+    poly2_fmadd(&f, &g, s);
+    poly2_fmadd(&b, &c, s);
+    poly2_rshift1(&f);
+    poly2_lshift1(&c);
+    deg_f--;
+    const crypto_word_t f0_is_nonzero = lsb_to_all(f.v[0]);
+    // |f0_is_nonzero| implies |still_going|.
+    assert(!(f0_is_nonzero && !still_going));
+    rotation = constant_time_select_w(f0_is_nonzero, i, rotation);
+    still_going &= ~constant_time_is_zero_w(deg_f);
+  }
+  rotation++;
+  rotation -= N & constant_time_lt_w(N, rotation);
+  assert(poly2_top_bits_are_clear(&b));
+  HRSS_poly2_rotr_consttime(&b, rotation);
+  poly_from_poly2(out, &b);
+}
+// poly_invert sets |*out| to |in^-1| (i.e. such that |*out|×|in| = 1 mod Φ(N)).
+static void poly_invert(struct poly *out, const struct poly *in) {
+  // Inversion mod Q, which is done based on the result of inverting mod
+  // 2. See [NTRUTN14] paper, bottom of page two.
+  struct poly a, *b, tmp;
+  // a = -in.
+  for (unsigned i = 0; i < N; i++) {
+    a.v[i] = -in->v[i];
+  }
+  // b = in^-1 mod 2.
+  b = out;
+  poly_invert_mod2(b, in);
+  // We are working mod Q=2**13 and we need to iterate ceil(log_2(13))
+  // times, which is four.
+  for (unsigned i = 0; i < 4; i++) {
+    poly_mul(&tmp, &a, b);
+    tmp.v[0] += 2;
+    poly_mul(b, b, &tmp);
+  }
+}
+// Marshal and unmarshal functions for various basic types.
+// --------------------------------------------------------
+#define POLY_BYTES 1138
+// poly_marshal serialises all but the final coefficient of |in| to |out|.
+static void poly_marshal(uint8_t out[POLY_BYTES], const struct poly *in) {
+  const uint16_t *p = in->v;
+  for (size_t i = 0; i < N / 8; i++) {
+    out[0] = p[0];
+    out[1] = (0x1f & (p[0] >> 8)) | ((p[1] & 0x07) << 5);
+    out[2] = p[1] >> 3;
+    out[3] = (3 & (p[1] >> 11)) | ((p[2] & 0x3f) << 2);
+    out[4] = (0x7f & (p[2] >> 6)) | ((p[3] & 0x01) << 7);
+    out[5] = p[3] >> 1;
+    out[6] = (0xf & (p[3] >> 9)) | ((p[4] & 0x0f) << 4);
+    out[7] = p[4] >> 4;
+    out[8] = (1 & (p[4] >> 12)) | ((p[5] & 0x7f) << 1);
+    out[9] = (0x3f & (p[5] >> 7)) | ((p[6] & 0x03) << 6);
+    out[10] = p[6] >> 2;
+    out[11] = (7 & (p[6] >> 10)) | ((p[7] & 0x1f) << 3);
+    out[12] = p[7] >> 5;
+    p += 8;
+    out += 13;
+  }
+  // There are four remaining values.
+  out[0] = p[0];
+  out[1] = (0x1f & (p[0] >> 8)) | ((p[1] & 0x07) << 5);
+  out[2] = p[1] >> 3;
+  out[3] = (3 & (p[1] >> 11)) | ((p[2] & 0x3f) << 2);
+  out[4] = (0x7f & (p[2] >> 6)) | ((p[3] & 0x01) << 7);
+  out[5] = p[3] >> 1;
+  out[6] = 0xf & (p[3] >> 9);
+}
+// poly_unmarshal parses the output of |poly_marshal| and sets |out| such that
+// all but the final coefficients match, and the final coefficient is calculated
+// such that evaluating |out| at one results in zero. It returns one on success
+// or zero if |in| is an invalid encoding.
+static int poly_unmarshal(struct poly *out, const uint8_t in[POLY_BYTES]) {
+  uint16_t *p = out->v;
+  for (size_t i = 0; i < N / 8; i++) {
+    p[0] = (uint16_t)(in[0]) | (uint16_t)(in[1] & 0x1f) << 8;
+    p[1] = (uint16_t)(in[1] >> 5) | (uint16_t)(in[2]) << 3 |
+           (uint16_t)(in[3] & 3) << 11;
+    p[2] = (uint16_t)(in[3] >> 2) | (uint16_t)(in[4] & 0x7f) << 6;
+    p[3] = (uint16_t)(in[4] >> 7) | (uint16_t)(in[5]) << 1 |
+           (uint16_t)(in[6] & 0xf) << 9;
+    p[4] = (uint16_t)(in[6] >> 4) | (uint16_t)(in[7]) << 4 |
+           (uint16_t)(in[8] & 1) << 12;
+    p[5] = (uint16_t)(in[8] >> 1) | (uint16_t)(in[9] & 0x3f) << 7;
+    p[6] = (uint16_t)(in[9] >> 6) | (uint16_t)(in[10]) << 2 |
+           (uint16_t)(in[11] & 7) << 10;
+    p[7] = (uint16_t)(in[11] >> 3) | (uint16_t)(in[12]) << 5;
+    p += 8;
+    in += 13;
+  }
+  // There are four coefficients remaining.
+  p[0] = (uint16_t)(in[0]) | (uint16_t)(in[1] & 0x1f) << 8;
+  p[1] = (uint16_t)(in[1] >> 5) | (uint16_t)(in[2]) << 3 |
+         (uint16_t)(in[3] & 3) << 11;
+  p[2] = (uint16_t)(in[3] >> 2) | (uint16_t)(in[4] & 0x7f) << 6;
+  p[3] = (uint16_t)(in[4] >> 7) | (uint16_t)(in[5]) << 1 |
+         (uint16_t)(in[6] & 0xf) << 9;
+  for (unsigned i = 0; i < N - 1; i++) {
+    out->v[i] = (int16_t)(out->v[i] << 3) >> 3;
+  }
+  // There are four unused bits in the last byte. We require them to be zero.
+  if ((in[6] & 0xf0) != 0) {
+    return 0;
+  }
+  // Set the final coefficient as specifed in [HRSSNIST] 1.9.2 step 6.
+  uint32_t sum = 0;
+  for (size_t i = 0; i < N - 1; i++) {
+    sum += out->v[i];
+  }
+  out->v[N - 1] = (uint16_t)(0u - sum);
+  return 1;
+}
+// mod3_from_modQ maps {0, 1, Q-1, 65535} -> {0, 1, 2, 2}. Note that |v| may
+// have an invalid value when processing attacker-controlled inputs.
+static uint16_t mod3_from_modQ(uint16_t v) {
+  v &= 3;
+  return v ^ (v >> 1);
+}
+// poly_marshal_mod3 marshals |in| to |out| where the coefficients of |in| are
+// all in {0, 1, Q-1, 65535} and |in| is mod Φ(N). (Note that coefficients may
+// have invalid values when processing attacker-controlled inputs.)
+static void poly_marshal_mod3(uint8_t out[HRSS_POLY3_BYTES],
+                              const struct poly *in) {
+  const uint16_t *coeffs = in->v;
+  // Only 700 coefficients are marshaled because in[700] must be zero.
+  assert(coeffs[N-1] == 0);
+  for (size_t i = 0; i < HRSS_POLY3_BYTES; i++) {
+    const uint16_t coeffs0 = mod3_from_modQ(coeffs[0]);
+    const uint16_t coeffs1 = mod3_from_modQ(coeffs[1]);
+    const uint16_t coeffs2 = mod3_from_modQ(coeffs[2]);
+    const uint16_t coeffs3 = mod3_from_modQ(coeffs[3]);
+    const uint16_t coeffs4 = mod3_from_modQ(coeffs[4]);
+    out[i] = coeffs0 + coeffs1 * 3 + coeffs2 * 9 + coeffs3 * 27 + coeffs4 * 81;
+    coeffs += 5;
+  }
+}
+// HRSS-specific functions
+// -----------------------
+// poly_short_sample samples a vector of values in {0xffff (i.e. -1), 0, 1}.
+// This is the same action as the algorithm in [HRSSNIST] section 1.8.1, but
+// with HRSS-SXY the sampling algorithm is now a private detail of the
+// implementation (previously it had to match between two parties). This
+// function uses that freedom to implement a flatter distribution of values.
+static void poly_short_sample(struct poly *out,
+                              const uint8_t in[HRSS_SAMPLE_BYTES]) {
+  OPENSSL_STATIC_ASSERT(HRSS_SAMPLE_BYTES == N - 1,
+                        "HRSS_SAMPLE_BYTES incorrect");
+  for (size_t i = 0; i < N - 1; i++) {
+    uint16_t v = mod3(in[i]);
+    // Map {0, 1, 2} -> {0, 1, 0xffff}
+    v |= ((v >> 1) ^ 1) - 1;
+    out->v[i] = v;
+  }
+  out->v[N - 1] = 0;
+}
+// poly_short_sample_plus performs the T+ sample as defined in [HRSSNIST],
+// section 1.8.2.
+static void poly_short_sample_plus(struct poly *out,
+                                   const uint8_t in[HRSS_SAMPLE_BYTES]) {
+  poly_short_sample(out, in);
+  // sum (and the product in the for loop) will overflow. But that's fine
+  // because |sum| is bound by +/- (N-2), and N < 2^15 so it works out.
+  uint16_t sum = 0;
+  for (unsigned i = 0; i < N - 2; i++) {
+    sum += (unsigned) out->v[i] * out->v[i + 1];
+  }
+  // If the sum is negative, flip the sign of even-positioned coefficients. (See
+  // page 8 of [HRSS].)
+  sum = ((int16_t) sum) >> 15;
+  const uint16_t scale = sum | (~sum & 1);
+  for (unsigned i = 0; i < N; i += 2) {
+    out->v[i] = (unsigned) out->v[i] * scale;
+  }
+}
+// poly_lift computes the function discussed in [HRSS], appendix B.
+static void poly_lift(struct poly *out, const struct poly *a) {
+  // We wish to calculate a/(𝑥-1) mod Φ(N) over GF(3), where Φ(N) is the
+  // Nth cyclotomic polynomial, i.e. 1 + 𝑥 + … + 𝑥^700 (since N is prime).
+  // 1/(𝑥-1) has a fairly basic structure that we can exploit to speed this up:
+  //
+  // R.<x> = PolynomialRing(GF(3)…)
+  // inv = R.cyclotomic_polynomial(1).inverse_mod(R.cyclotomic_polynomial(n))
+  // list(inv)[:15]
+  //   [1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2]
+  //
+  // This three-element pattern of coefficients repeats for the whole
+  // polynomial.
+  //
+  // Next define the overbar operator such that z̅ = z[0] +
+  // reverse(z[1:]). (Index zero of a polynomial here is the coefficient
+  // of the constant term. So index one is the coefficient of 𝑥 and so
+  // on.)
+  //
+  // A less odd way to define this is to see that z̅ negates the indexes,
+  // so z̅[0] = z[-0], z̅[1] = z[-1] and so on.
+  //
+  // The use of z̅ is that, when working mod (𝑥^701 - 1), vz[0] = <v,
+  // z̅>, vz[1] = <v, 𝑥z̅>, …. (Where <a, b> is the inner product: the sum
+  // of the point-wise products.) Although we calculated the inverse mod
+  // Φ(N), we can work mod (𝑥^N - 1) and reduce mod Φ(N) at the end.
+  // (That's because (𝑥^N - 1) is a multiple of Φ(N).)
+  //
+  // When working mod (𝑥^N - 1), multiplication by 𝑥 is a right-rotation
+  // of the list of coefficients.
+  //
+  // Thus we can consider what the pattern of z̅, 𝑥z̅, 𝑥^2z̅, … looks like:
+  //
+  // def reverse(xs):
+  //   suffix = list(xs[1:])
+  //   suffix.reverse()
+  //   return [xs[0]] + suffix
+  //
+  // def rotate(xs):
+  //   return [xs[-1]] + xs[:-1]
+  //
+  // zoverbar = reverse(list(inv) + [0])
+  // xzoverbar = rotate(reverse(list(inv) + [0]))
+  // x2zoverbar = rotate(rotate(reverse(list(inv) + [0])))
+  //
+  // zoverbar[:15]
+  //   [1, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1]
+  // xzoverbar[:15]
+  //   [0, 1, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0]
+  // x2zoverbar[:15]
+  //   [2, 0, 1, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2]
+  //
+  // (For a formula for z̅, see lemma two of appendix B.)
+  //
+  // After the first three elements have been taken care of, all then have
+  // a repeating three-element cycle. The next value (𝑥^3z̅) involves
+  // three rotations of the first pattern, thus the three-element cycle
+  // lines up. However, the discontinuity in the first three elements
+  // obviously moves to a different position. Consider the difference
+  // between 𝑥^3z̅ and z̅:
+  //
+  // [x-y for (x,y) in zip(zoverbar, x3zoverbar)][:15]
+  //    [0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+  //
+  // This pattern of differences is the same for all elements, although it
+  // obviously moves right with the rotations.
+  //
+  // From this, we reach algorithm eight of appendix B.
+  // Handle the first three elements of the inner products.
+  out->v[0] = a->v[0] + a->v[2];
+  out->v[1] = a->v[1];
+  out->v[2] = -a->v[0] + a->v[2];
+  // s0, s1, s2 are added into out->v[0], out->v[1], and out->v[2],
+  // respectively. We do not compute s1 because it's just -(s0 + s1).
+  uint16_t s0 = 0, s2 = 0;
+  for (size_t i = 3; i < 699; i += 3) {
+    s0 += -a->v[i] + a->v[i + 2];
+    // s1 += a->v[i] - a->v[i + 1];
+    s2 += a->v[i + 1] - a->v[i + 2];
+  }
+  // Handle the fact that the three-element pattern doesn't fill the
+  // polynomial exactly (since 701 isn't a multiple of three).
+  s0 -= a->v[699];
+  // s1 += a->v[699] - a->v[700];
+  s2 += a->v[700];
+  // Note that s0 + s1 + s2 = 0.
+  out->v[0] += s0;
+  out->v[1] -= (s0 + s2); // = s1
+  out->v[2] += s2;
+  // Calculate the remaining inner products by taking advantage of the
+  // fact that the pattern repeats every three cycles and the pattern of
+  // differences moves with the rotation.
+  for (size_t i = 3; i < N; i++) {
+    out->v[i] = (out->v[i - 3] - (a->v[i - 2] + a->v[i - 1] + a->v[i]));
+  }
+  // Reduce mod Φ(N) by subtracting a multiple of out[700] from every
+  // element and convert to mod Q. (See above about adding twice as
+  // subtraction.)
+  const crypto_word_t v = out->v[700];
+  for (unsigned i = 0; i < N; i++) {
+    const uint16_t vi_mod3 = mod3(out->v[i] - v);
+    // Map {0, 1, 2} to {0, 1, 0xffff}.
+    out->v[i] = (~((vi_mod3 >> 1) - 1)) | vi_mod3;
+  }
+  poly_mul_x_minus_1(out);
+}
+struct public_key {
+  struct poly ph;
+};
+struct private_key {
+  struct poly3 f, f_inverse;
+  struct poly ph_inverse;
+  uint8_t hmac_key[32];
+};
+// public_key_from_external converts an external public key pointer into an
+// internal one. Externally the alignment is only specified to be eight bytes
+// but we need 16-byte alignment. We could annotate the external struct with
+// that alignment but we can only assume that malloced pointers are 8-byte
+// aligned in any case. (Even if the underlying malloc returns values with
+// 16-byte alignment, |OPENSSL_malloc| will store an 8-byte size prefix and mess
+// that up.)
+static struct public_key *public_key_from_external(
+    struct HRSS_public_key *ext) {
+  OPENSSL_STATIC_ASSERT(
+      sizeof(struct HRSS_public_key) >= sizeof(struct public_key) + 15,
+      "HRSS public key too small");
+  uintptr_t p = (uintptr_t)ext;
+  p = (p + 15) & ~15;
+  return (struct public_key *)p;
+}
+// private_key_from_external does the same thing as |public_key_from_external|,
+// but for private keys. See the comment on that function about alignment
+// issues.
+static struct private_key *private_key_from_external(
+    struct HRSS_private_key *ext) {
+  OPENSSL_STATIC_ASSERT(
+      sizeof(struct HRSS_private_key) >= sizeof(struct private_key) + 15,
+      "HRSS private key too small");
+  uintptr_t p = (uintptr_t)ext;
+  p = (p + 15) & ~15;
+  return (struct private_key *)p;
+}
+void HRSS_generate_key(
+    struct HRSS_public_key *out_pub, struct HRSS_private_key *out_priv,
+    const uint8_t in[HRSS_SAMPLE_BYTES + HRSS_SAMPLE_BYTES + 32]) {
+  struct public_key *pub = public_key_from_external(out_pub);
+  struct private_key *priv = private_key_from_external(out_priv);
+  OPENSSL_memcpy(priv->hmac_key, in + 2 * HRSS_SAMPLE_BYTES,
+                 sizeof(priv->hmac_key));
+  struct poly f;
+  poly_short_sample_plus(&f, in);
+  poly3_from_poly(&priv->f, &f);
+  HRSS_poly3_invert(&priv->f_inverse, &priv->f);
+  // pg_phi1 is p (i.e. 3) × g × Φ(1) (i.e. 𝑥-1).
+  struct poly pg_phi1;
+  poly_short_sample_plus(&pg_phi1, in + HRSS_SAMPLE_BYTES);
+  for (unsigned i = 0; i < N; i++) {
+    pg_phi1.v[i] *= 3;
+  }
+  poly_mul_x_minus_1(&pg_phi1);
+  struct poly pfg_phi1;
+  poly_mul(&pfg_phi1, &f, &pg_phi1);
+  struct poly pfg_phi1_inverse;
+  poly_invert(&pfg_phi1_inverse, &pfg_phi1);
+  poly_mul(&pub->ph, &pfg_phi1_inverse, &pg_phi1);
+  poly_mul(&pub->ph, &pub->ph, &pg_phi1);
+  poly_clamp(&pub->ph);
+  poly_mul(&priv->ph_inverse, &pfg_phi1_inverse, &f);
+  poly_mul(&priv->ph_inverse, &priv->ph_inverse, &f);
+  poly_clamp(&priv->ph_inverse);
+}
+static const char kSharedKey[] = "shared key";
+void HRSS_encap(uint8_t out_ciphertext[POLY_BYTES],
+                uint8_t out_shared_key[32],
+                const struct HRSS_public_key *in_pub,
+                const uint8_t in[HRSS_SAMPLE_BYTES + HRSS_SAMPLE_BYTES]) {
+  const struct public_key *pub =
+      public_key_from_external((struct HRSS_public_key *)in_pub);
+  struct poly m, r, m_lifted;
+  poly_short_sample(&m, in);
+  poly_short_sample(&r, in + HRSS_SAMPLE_BYTES);
+  poly_lift(&m_lifted, &m);
+  struct poly prh_plus_m;
+  poly_mul(&prh_plus_m, &r, &pub->ph);
+  for (unsigned i = 0; i < N; i++) {
+    prh_plus_m.v[i] += m_lifted.v[i];
+  }
+  poly_marshal(out_ciphertext, &prh_plus_m);
+  uint8_t m_bytes[HRSS_POLY3_BYTES], r_bytes[HRSS_POLY3_BYTES];
+  poly_marshal_mod3(m_bytes, &m);
+  poly_marshal_mod3(r_bytes, &r);
+  SHA256_CTX hash_ctx;
+  SHA256_Init(&hash_ctx);
+  SHA256_Update(&hash_ctx, kSharedKey, sizeof(kSharedKey));
+  SHA256_Update(&hash_ctx, m_bytes, sizeof(m_bytes));
+  SHA256_Update(&hash_ctx, r_bytes, sizeof(r_bytes));
+  SHA256_Update(&hash_ctx, out_ciphertext, POLY_BYTES);
+  SHA256_Final(out_shared_key, &hash_ctx);
+}
+void HRSS_decap(uint8_t out_shared_key[HRSS_KEY_BYTES],
+                const struct HRSS_private_key *in_priv,
+                const uint8_t *ciphertext, size_t ciphertext_len) {
+  const struct private_key *priv =
+      private_key_from_external((struct HRSS_private_key *)in_priv);
+  // This is HMAC, expanded inline rather than using the |HMAC| function so that
+  // we can avoid dealing with possible allocation failures and so keep this
+  // function infallible.
+  uint8_t masked_key[SHA256_CBLOCK];
+  OPENSSL_STATIC_ASSERT(sizeof(priv->hmac_key) <= sizeof(masked_key),
+                        "HRSS HMAC key larger than SHA-256 block size");
+  for (size_t i = 0; i < sizeof(priv->hmac_key); i++) {
+    masked_key[i] = priv->hmac_key[i] ^ 0x36;
+  }
+  OPENSSL_memset(masked_key + sizeof(priv->hmac_key), 0x36,
+                 sizeof(masked_key) - sizeof(priv->hmac_key));
+  SHA256_CTX hash_ctx;
+  SHA256_Init(&hash_ctx);
+  SHA256_Update(&hash_ctx, masked_key, sizeof(masked_key));
+  SHA256_Update(&hash_ctx, ciphertext, ciphertext_len);
+  uint8_t inner_digest[SHA256_DIGEST_LENGTH];
+  SHA256_Final(inner_digest, &hash_ctx);
+  for (size_t i = 0; i < sizeof(priv->hmac_key); i++) {
+    masked_key[i] ^= (0x5c ^ 0x36);
+  }
+  OPENSSL_memset(masked_key + sizeof(priv->hmac_key), 0x5c,
+                 sizeof(masked_key) - sizeof(priv->hmac_key));
+  SHA256_Init(&hash_ctx);
+  SHA256_Update(&hash_ctx, masked_key, sizeof(masked_key));
+  SHA256_Update(&hash_ctx, inner_digest, sizeof(inner_digest));
+  OPENSSL_STATIC_ASSERT(HRSS_KEY_BYTES == SHA256_DIGEST_LENGTH,
+                        "HRSS shared key length incorrect");
+  SHA256_Final(out_shared_key, &hash_ctx);
+  struct poly c;
+  // If the ciphertext is publicly invalid then a random shared key is still
+  // returned to simply the logic of the caller, but this path is not constant
+  // time.
+  if (ciphertext_len != HRSS_CIPHERTEXT_BYTES ||
+      !poly_unmarshal(&c, ciphertext)) {
+    return;
+  }
+  struct poly f, cf;
+  struct poly3 cf3, m3;
+  poly_from_poly3(&f, &priv->f);
+  poly_mul(&cf, &c, &f);
+  poly3_from_poly(&cf3, &cf);
+  // Note that cf3 is not reduced mod Φ(N). That reduction is deferred.
+  HRSS_poly3_mul(&m3, &cf3, &priv->f_inverse);
+  struct poly m, m_lifted;
+  poly_from_poly3(&m, &m3);
+  poly_lift(&m_lifted, &m);
+  struct poly r;
+  for (unsigned i = 0; i < N; i++) {
+    r.v[i] = c.v[i] - m_lifted.v[i];
+  }
+  poly_mul(&r, &r, &priv->ph_inverse);
+  poly_mod_phiN(&r);
+  poly_clamp(&r);
+  struct poly3 r3;
+  crypto_word_t ok = poly3_from_poly_checked(&r3, &r);
+  // [NTRUCOMP] section 5.1 includes ReEnc2 and a proof that it's valid. Rather
+  // than do an expensive |poly_mul|, it rebuilds |c'| from |c - lift(m)|
+  // (called |b|) with:
+  //   t = (−b(1)/N) mod Q
+  //   c' = b + tΦ(N) + lift(m) mod Q
+  //
+  // When polynomials are transmitted, the final coefficient is omitted and
+  // |poly_unmarshal| sets it such that f(1) == 0. Thus c(1) == 0. Also,
+  // |poly_lift| multiplies the result by (x-1) and therefore evaluating a
+  // lifted polynomial at 1 is also zero. Thus lift(m)(1) == 0 and so
+  // (c - lift(m))(1) == 0.
+  //
+  // Although we defer the reduction above, |b| is conceptually reduced mod
+  // Φ(N). In order to do that reduction one subtracts |c[N-1]| from every
+  // coefficient. Therefore b(1) = -c[N-1]×N. The value of |t|, above, then is
+  // just recovering |c[N-1]|, and adding tΦ(N) is simply undoing the reduction.
+  // Therefore b + tΦ(N) + lift(m) = c by construction and we don't need to
+  // recover |c| at all so long as we do the checks in
+  // |poly3_from_poly_checked|.
+  //
+  // The |poly_marshal| here then is just confirming that |poly_unmarshal| is
+  // strict and could be omitted.
+  uint8_t expected_ciphertext[HRSS_CIPHERTEXT_BYTES];
+  OPENSSL_STATIC_ASSERT(HRSS_CIPHERTEXT_BYTES == POLY_BYTES,
+                        "ciphertext is the wrong size");
+  assert(ciphertext_len == sizeof(expected_ciphertext));
+  poly_marshal(expected_ciphertext, &c);
+  uint8_t m_bytes[HRSS_POLY3_BYTES];
+  uint8_t r_bytes[HRSS_POLY3_BYTES];
+  poly_marshal_mod3(m_bytes, &m);
+  poly_marshal_mod3(r_bytes, &r);
+  ok &= constant_time_is_zero_w(CRYPTO_memcmp(ciphertext, expected_ciphertext,
+                                              sizeof(expected_ciphertext)));
+  uint8_t shared_key[32];
+  SHA256_Init(&hash_ctx);
+  SHA256_Update(&hash_ctx, kSharedKey, sizeof(kSharedKey));
+  SHA256_Update(&hash_ctx, m_bytes, sizeof(m_bytes));
+  SHA256_Update(&hash_ctx, r_bytes, sizeof(r_bytes));
+  SHA256_Update(&hash_ctx, expected_ciphertext, sizeof(expected_ciphertext));
+  SHA256_Final(shared_key, &hash_ctx);
+  for (unsigned i = 0; i < sizeof(shared_key); i++) {
+    out_shared_key[i] =
+        constant_time_select_8(ok, shared_key[i], out_shared_key[i]);
+  }
+}
+void HRSS_marshal_public_key(uint8_t out[HRSS_PUBLIC_KEY_BYTES],
+                             const struct HRSS_public_key *in_pub) {
+  const struct public_key *pub =
+      public_key_from_external((struct HRSS_public_key *)in_pub);
+  poly_marshal(out, &pub->ph);
+}
+int HRSS_parse_public_key(struct HRSS_public_key *out,
+                          const uint8_t in[HRSS_PUBLIC_KEY_BYTES]) {
+  struct public_key *pub = public_key_from_external(out);
+  if (!poly_unmarshal(&pub->ph, in)) {
+    return 0;
+  }
+  OPENSSL_memset(&pub->ph.v[N], 0, 3 * sizeof(uint16_t));
+  return 1;
+}