grpc 1.38.0 → 1.40.0

Potentially problematic release: this version of grpc has been flagged as possibly problematic.
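To pick up this release with Bundler, a minimal Gemfile sketch (the "~> 1.40" bound is illustrative, not a recommendation made by this changelog):

    # Gemfile -- track the 1.40.x line that this diff covers.
    source 'https://rubygems.org'

    gem 'grpc', '~> 1.40', '>= 1.40.0'

Then `bundle update grpc` applies the upgrade; given the warning above, re-run your test suite before shipping.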

Files changed (340)
  1. checksums.yaml +4 -4
  2. data/Makefile +68 -21
  3. data/include/grpc/event_engine/endpoint_config.h +48 -0
  4. data/include/grpc/event_engine/event_engine.h +23 -29
  5. data/include/grpc/event_engine/port.h +2 -0
  6. data/include/grpc/event_engine/slice_allocator.h +21 -36
  7. data/include/grpc/grpc.h +9 -2
  8. data/include/grpc/grpc_security.h +32 -0
  9. data/include/grpc/grpc_security_constants.h +1 -0
  10. data/include/grpc/impl/codegen/grpc_types.h +33 -19
  11. data/include/grpc/impl/codegen/port_platform.h +41 -0
  12. data/src/core/ext/filters/client_channel/client_channel.cc +415 -249
  13. data/src/core/ext/filters/client_channel/client_channel.h +42 -18
  14. data/src/core/ext/filters/client_channel/config_selector.h +19 -6
  15. data/src/core/ext/filters/client_channel/health/health_check_client.cc +2 -0
  16. data/src/core/ext/filters/client_channel/health/health_check_client.h +3 -3
  17. data/src/core/ext/filters/client_channel/http_proxy.cc +16 -1
  18. data/src/core/ext/filters/client_channel/lb_policy/grpclb/grpclb.cc +7 -8
  19. data/src/core/ext/filters/client_channel/lb_policy/pick_first/pick_first.cc +12 -21
  20. data/src/core/ext/filters/client_channel/lb_policy/priority/priority.cc +3 -5
  21. data/src/core/ext/filters/client_channel/lb_policy/ring_hash/ring_hash.cc +734 -0
  22. data/src/core/ext/filters/client_channel/lb_policy/ring_hash/ring_hash.h +10 -0
  23. data/src/core/ext/filters/client_channel/lb_policy/round_robin/round_robin.cc +8 -15
  24. data/src/core/ext/filters/client_channel/lb_policy/weighted_target/weighted_target.cc +3 -6
  25. data/src/core/ext/filters/client_channel/lb_policy/xds/cds.cc +18 -36
  26. data/src/core/ext/filters/client_channel/lb_policy/xds/xds_cluster_impl.cc +14 -22
  27. data/src/core/ext/filters/client_channel/lb_policy/xds/xds_cluster_manager.cc +2 -9
  28. data/src/core/ext/filters/client_channel/lb_policy/xds/xds_cluster_resolver.cc +68 -103
  29. data/src/core/ext/filters/client_channel/lb_policy.cc +1 -15
  30. data/src/core/ext/filters/client_channel/lb_policy.h +70 -46
  31. data/src/core/ext/filters/client_channel/resolver/dns/c_ares/dns_resolver_ares.cc +1 -3
  32. data/src/core/ext/filters/client_channel/resolver/dns/c_ares/grpc_ares_ev_driver_event_engine.cc +31 -0
  33. data/src/core/ext/filters/client_channel/resolver/dns/c_ares/grpc_ares_wrapper_event_engine.cc +28 -0
  34. data/src/core/ext/filters/client_channel/resolver/dns/native/dns_resolver.cc +1 -3
  35. data/src/core/ext/filters/client_channel/resolver/google_c2p/google_c2p_resolver.cc +7 -2
  36. data/src/core/ext/filters/client_channel/resolver/xds/xds_resolver.cc +116 -76
  37. data/src/core/ext/filters/client_channel/retry_filter.cc +967 -544
  38. data/src/core/ext/filters/client_channel/retry_service_config.cc +57 -28
  39. data/src/core/ext/filters/client_channel/retry_service_config.h +9 -3
  40. data/src/core/ext/filters/client_channel/service_config_call_data.h +45 -5
  41. data/src/core/ext/filters/client_idle/client_idle_filter.cc +1 -1
  42. data/src/core/ext/filters/http/client/http_client_filter.cc +5 -2
  43. data/src/core/ext/transport/chttp2/client/insecure/channel_create_posix.cc +2 -1
  44. data/src/core/ext/transport/chttp2/server/chttp2_server.cc +5 -1
  45. data/src/core/ext/transport/chttp2/server/insecure/server_chttp2_posix.cc +3 -2
  46. data/src/core/ext/transport/chttp2/transport/bin_decoder.cc +1 -1
  47. data/src/core/{lib/event_engine/slice_allocator.cc → ext/transport/chttp2/transport/chttp2_slice_allocator.cc} +23 -16
  48. data/src/core/ext/transport/chttp2/transport/chttp2_slice_allocator.h +74 -0
  49. data/src/core/ext/transport/chttp2/transport/chttp2_transport.cc +12 -10
  50. data/src/core/ext/transport/chttp2/transport/flow_control.h +1 -1
  51. data/src/core/ext/transport/chttp2/transport/frame_data.cc +4 -4
  52. data/src/core/ext/transport/chttp2/transport/frame_goaway.cc +8 -8
  53. data/src/core/ext/transport/chttp2/transport/frame_settings.cc +5 -5
  54. data/src/core/ext/transport/chttp2/transport/hpack_parser.cc +639 -752
  55. data/src/core/ext/transport/chttp2/transport/hpack_parser.h +190 -69
  56. data/src/core/ext/transport/chttp2/transport/internal.h +2 -1
  57. data/src/core/ext/transport/chttp2/transport/parsing.cc +72 -56
  58. data/src/core/ext/transport/chttp2/transport/varint.cc +6 -4
  59. data/src/core/ext/transport/inproc/inproc_transport.cc +42 -31
  60. data/src/core/ext/upb-generated/envoy/config/bootstrap/v3/bootstrap.upb.c +56 -35
  61. data/src/core/ext/upb-generated/envoy/config/bootstrap/v3/bootstrap.upb.h +180 -76
  62. data/src/core/ext/upb-generated/envoy/config/cluster/v3/cluster.upb.c +35 -27
  63. data/src/core/ext/upb-generated/envoy/config/cluster/v3/cluster.upb.h +97 -48
  64. data/src/core/ext/upb-generated/envoy/config/core/v3/base.upb.c +45 -9
  65. data/src/core/ext/upb-generated/envoy/config/core/v3/base.upb.h +67 -7
  66. data/src/core/ext/upb-generated/envoy/config/core/v3/protocol.upb.c +66 -9
  67. data/src/core/ext/upb-generated/envoy/config/core/v3/protocol.upb.h +227 -0
  68. data/src/core/ext/upb-generated/envoy/config/core/v3/resolver.upb.c +46 -0
  69. data/src/core/ext/upb-generated/envoy/config/core/v3/resolver.upb.h +121 -0
  70. data/src/core/ext/upb-generated/envoy/config/core/v3/substitution_format_string.upb.c +1 -0
  71. data/src/core/ext/upb-generated/envoy/config/core/v3/udp_socket_config.upb.c +35 -0
  72. data/src/core/ext/upb-generated/envoy/config/core/v3/udp_socket_config.upb.h +90 -0
  73. data/src/core/ext/upb-generated/envoy/config/listener/v3/listener.upb.c +32 -24
  74. data/src/core/ext/upb-generated/envoy/config/listener/v3/listener.upb.h +120 -73
  75. data/src/core/ext/upb-generated/envoy/config/listener/v3/listener_components.upb.c +4 -2
  76. data/src/core/ext/upb-generated/envoy/config/listener/v3/listener_components.upb.h +15 -0
  77. data/src/core/ext/upb-generated/envoy/config/listener/v3/quic_config.upb.c +48 -0
  78. data/src/core/ext/upb-generated/envoy/config/listener/v3/quic_config.upb.h +171 -0
  79. data/src/core/ext/upb-generated/envoy/config/listener/v3/udp_listener_config.upb.c +8 -6
  80. data/src/core/ext/upb-generated/envoy/config/listener/v3/udp_listener_config.upb.h +27 -19
  81. data/src/core/ext/upb-generated/envoy/config/rbac/v3/rbac.upb.c +1 -0
  82. data/src/core/ext/upb-generated/envoy/config/route/v3/route.upb.c +24 -7
  83. data/src/core/ext/upb-generated/envoy/config/route/v3/route.upb.h +57 -0
  84. data/src/core/ext/upb-generated/envoy/config/route/v3/route_components.upb.c +29 -17
  85. data/src/core/ext/upb-generated/envoy/config/route/v3/route_components.upb.h +72 -0
  86. data/src/core/ext/upb-generated/envoy/extensions/filters/http/fault/v3/fault.upb.c +3 -2
  87. data/src/core/ext/upb-generated/envoy/extensions/filters/http/fault/v3/fault.upb.h +4 -0
  88. data/src/core/ext/upb-generated/envoy/extensions/filters/http/router/v3/router.upb.c +6 -5
  89. data/src/core/ext/upb-generated/envoy/extensions/filters/http/router/v3/router.upb.h +15 -11
  90. data/src/core/ext/upb-generated/envoy/extensions/filters/network/http_connection_manager/v3/http_connection_manager.upb.c +85 -43
  91. data/src/core/ext/upb-generated/envoy/extensions/filters/network/http_connection_manager/v3/http_connection_manager.upb.h +274 -91
  92. data/src/core/ext/upb-generated/envoy/extensions/transport_sockets/tls/v3/common.upb.c +11 -8
  93. data/src/core/ext/upb-generated/envoy/extensions/transport_sockets/tls/v3/common.upb.h +30 -13
  94. data/src/core/ext/upb-generated/envoy/service/status/v3/csds.upb.c +33 -5
  95. data/src/core/ext/upb-generated/envoy/service/status/v3/csds.upb.h +115 -0
  96. data/src/core/ext/upb-generated/envoy/type/http/v3/path_transformation.upb.c +60 -0
  97. data/src/core/ext/upb-generated/envoy/type/http/v3/path_transformation.upb.h +181 -0
  98. data/src/core/ext/upb-generated/envoy/type/matcher/v3/regex.upb.c +1 -0
  99. data/src/core/ext/upb-generated/validate/validate.upb.c +82 -66
  100. data/src/core/ext/upb-generated/validate/validate.upb.h +220 -124
  101. data/src/core/ext/upbdefs-generated/envoy/annotations/deprecation.upbdefs.c +15 -7
  102. data/src/core/ext/upbdefs-generated/envoy/config/accesslog/v3/accesslog.upbdefs.c +53 -52
  103. data/src/core/ext/upbdefs-generated/envoy/config/bootstrap/v3/bootstrap.upbdefs.c +318 -277
  104. data/src/core/ext/upbdefs-generated/envoy/config/bootstrap/v3/bootstrap.upbdefs.h +5 -0
  105. data/src/core/ext/upbdefs-generated/envoy/config/cluster/v3/cluster.upbdefs.c +437 -410
  106. data/src/core/ext/upbdefs-generated/envoy/config/core/v3/base.upbdefs.c +198 -170
  107. data/src/core/ext/upbdefs-generated/envoy/config/core/v3/base.upbdefs.h +10 -0
  108. data/src/core/ext/upbdefs-generated/envoy/config/core/v3/config_source.upbdefs.c +9 -8
  109. data/src/core/ext/upbdefs-generated/envoy/config/core/v3/protocol.upbdefs.c +219 -163
  110. data/src/core/ext/upbdefs-generated/envoy/config/core/v3/protocol.upbdefs.h +15 -0
  111. data/src/core/ext/upbdefs-generated/envoy/config/core/v3/resolver.upbdefs.c +59 -0
  112. data/src/core/ext/upbdefs-generated/envoy/config/core/v3/resolver.upbdefs.h +40 -0
  113. data/src/core/ext/upbdefs-generated/envoy/config/core/v3/substitution_format_string.upbdefs.c +29 -25
  114. data/src/core/ext/upbdefs-generated/envoy/config/core/v3/udp_socket_config.upbdefs.c +52 -0
  115. data/src/core/ext/upbdefs-generated/envoy/config/core/v3/udp_socket_config.upbdefs.h +35 -0
  116. data/src/core/ext/upbdefs-generated/envoy/config/listener/v3/listener.upbdefs.c +135 -125
  117. data/src/core/ext/upbdefs-generated/envoy/config/listener/v3/listener.upbdefs.h +5 -0
  118. data/src/core/ext/upbdefs-generated/envoy/config/listener/v3/listener_components.upbdefs.c +131 -123
  119. data/src/core/ext/upbdefs-generated/envoy/config/listener/v3/quic_config.upbdefs.c +90 -0
  120. data/src/core/ext/upbdefs-generated/envoy/config/listener/v3/quic_config.upbdefs.h +35 -0
  121. data/src/core/ext/upbdefs-generated/envoy/config/listener/v3/udp_listener_config.upbdefs.c +32 -24
  122. data/src/core/ext/upbdefs-generated/envoy/config/route/v3/route.upbdefs.c +69 -55
  123. data/src/core/ext/upbdefs-generated/envoy/config/route/v3/route.upbdefs.h +5 -0
  124. data/src/core/ext/upbdefs-generated/envoy/config/route/v3/route_components.upbdefs.c +684 -664
  125. data/src/core/ext/upbdefs-generated/envoy/config/route/v3/route_components.upbdefs.h +5 -0
  126. data/src/core/ext/upbdefs-generated/envoy/extensions/filters/http/fault/v3/fault.upbdefs.c +13 -10
  127. data/src/core/ext/upbdefs-generated/envoy/extensions/filters/http/router/v3/router.upbdefs.c +13 -10
  128. data/src/core/ext/upbdefs-generated/envoy/extensions/filters/network/http_connection_manager/v3/http_connection_manager.upbdefs.c +441 -375
  129. data/src/core/ext/upbdefs-generated/envoy/extensions/filters/network/http_connection_manager/v3/http_connection_manager.upbdefs.h +10 -0
  130. data/src/core/ext/upbdefs-generated/envoy/extensions/transport_sockets/tls/v3/common.upbdefs.c +122 -114
  131. data/src/core/ext/upbdefs-generated/envoy/extensions/transport_sockets/tls/v3/tls.upbdefs.c +1 -1
  132. data/src/core/ext/upbdefs-generated/envoy/service/status/v3/csds.upbdefs.c +112 -79
  133. data/src/core/ext/upbdefs-generated/envoy/service/status/v3/csds.upbdefs.h +5 -0
  134. data/src/core/ext/upbdefs-generated/envoy/type/http/v3/path_transformation.upbdefs.c +64 -0
  135. data/src/core/ext/upbdefs-generated/envoy/type/http/v3/path_transformation.upbdefs.h +50 -0
  136. data/src/core/ext/upbdefs-generated/envoy/type/matcher/v3/regex.upbdefs.c +35 -32
  137. data/src/core/ext/upbdefs-generated/google/rpc/status.upbdefs.c +4 -4
  138. data/src/core/ext/upbdefs-generated/validate/validate.upbdefs.c +182 -160
  139. data/src/core/ext/xds/certificate_provider_store.h +1 -1
  140. data/src/core/ext/xds/xds_api.cc +582 -257
  141. data/src/core/ext/xds/xds_api.h +46 -8
  142. data/src/core/ext/xds/xds_bootstrap.cc +4 -1
  143. data/src/core/ext/xds/xds_client.cc +66 -43
  144. data/src/core/ext/xds/xds_client.h +0 -4
  145. data/src/core/ext/xds/xds_http_filters.cc +3 -2
  146. data/src/core/ext/xds/xds_http_filters.h +3 -0
  147. data/src/core/lib/address_utils/sockaddr_utils.cc +13 -0
  148. data/src/core/lib/address_utils/sockaddr_utils.h +10 -0
  149. data/src/core/lib/channel/call_tracer.h +85 -0
  150. data/src/core/lib/channel/channel_stack.h +1 -1
  151. data/src/core/lib/channel/channelz.h +3 -0
  152. data/src/core/lib/channel/context.h +3 -0
  153. data/src/core/lib/channel/status_util.h +4 -0
  154. data/src/core/lib/compression/stream_compression.h +1 -1
  155. data/src/core/lib/compression/stream_compression_gzip.h +1 -1
  156. data/src/core/lib/compression/stream_compression_identity.h +1 -1
  157. data/src/core/lib/debug/stats.h +1 -1
  158. data/src/core/lib/event_engine/endpoint_config.cc +46 -0
  159. data/src/core/lib/event_engine/endpoint_config_internal.h +42 -0
  160. data/src/core/lib/event_engine/event_engine.cc +50 -0
  161. data/src/core/lib/event_engine/sockaddr.cc +14 -12
  162. data/src/core/lib/event_engine/sockaddr.h +44 -0
  163. data/src/core/lib/gpr/murmur_hash.cc +4 -2
  164. data/src/core/lib/gpr/wrap_memcpy.cc +2 -1
  165. data/src/core/lib/gprpp/manual_constructor.h +1 -1
  166. data/src/core/lib/gprpp/orphanable.h +3 -3
  167. data/src/core/lib/gprpp/status_helper.h +3 -0
  168. data/src/core/lib/gprpp/sync.h +2 -30
  169. data/src/core/lib/iomgr/buffer_list.cc +1 -1
  170. data/src/core/lib/iomgr/endpoint_pair_event_engine.cc +33 -0
  171. data/src/core/lib/iomgr/error.cc +5 -4
  172. data/src/core/lib/iomgr/error.h +1 -1
  173. data/src/core/lib/iomgr/ev_apple.h +1 -1
  174. data/src/core/lib/iomgr/event_engine/closure.cc +54 -0
  175. data/src/core/lib/iomgr/event_engine/closure.h +33 -0
  176. data/src/core/lib/iomgr/event_engine/endpoint.cc +192 -0
  177. data/src/core/lib/iomgr/event_engine/endpoint.h +53 -0
  178. data/src/core/lib/iomgr/event_engine/iomgr.cc +105 -0
  179. data/src/core/lib/iomgr/event_engine/iomgr.h +24 -0
  180. data/src/core/lib/iomgr/event_engine/pollset.cc +87 -0
  181. data/{include/grpc/event_engine/channel_args.h → src/core/lib/iomgr/event_engine/pollset.h} +7 -10
  182. data/src/core/lib/iomgr/event_engine/promise.h +51 -0
  183. data/src/core/lib/iomgr/event_engine/resolved_address_internal.cc +41 -0
  184. data/src/core/lib/iomgr/event_engine/resolved_address_internal.h +35 -0
  185. data/src/core/lib/iomgr/event_engine/resolver.cc +110 -0
  186. data/src/core/lib/iomgr/event_engine/tcp.cc +263 -0
  187. data/src/core/lib/iomgr/event_engine/timer.cc +57 -0
  188. data/src/core/lib/iomgr/exec_ctx.cc +8 -0
  189. data/src/core/lib/iomgr/exec_ctx.h +3 -4
  190. data/src/core/lib/iomgr/executor/threadpool.cc +2 -3
  191. data/src/core/lib/iomgr/executor/threadpool.h +2 -2
  192. data/src/core/lib/iomgr/iomgr.cc +1 -1
  193. data/src/core/lib/iomgr/iomgr_posix.cc +2 -0
  194. data/src/core/lib/iomgr/iomgr_posix_cfstream.cc +40 -10
  195. data/src/core/lib/iomgr/pollset_custom.cc +2 -2
  196. data/src/core/lib/iomgr/pollset_custom.h +3 -1
  197. data/src/core/lib/iomgr/pollset_uv.cc +3 -1
  198. data/src/core/lib/iomgr/pollset_uv.h +5 -1
  199. data/src/core/lib/iomgr/port.h +7 -5
  200. data/src/core/lib/iomgr/python_util.h +1 -1
  201. data/src/core/lib/iomgr/resolve_address.cc +5 -1
  202. data/src/core/lib/iomgr/resolve_address.h +6 -0
  203. data/src/core/lib/iomgr/resource_quota.cc +2 -0
  204. data/src/core/lib/iomgr/sockaddr.h +1 -0
  205. data/src/core/lib/iomgr/socket_mutator.cc +15 -2
  206. data/src/core/lib/iomgr/socket_mutator.h +26 -2
  207. data/src/core/lib/iomgr/socket_utils_common_posix.cc +4 -4
  208. data/src/core/lib/iomgr/socket_utils_posix.h +2 -2
  209. data/src/core/lib/iomgr/tcp_client_posix.cc +7 -2
  210. data/src/core/lib/iomgr/tcp_client_windows.cc +2 -0
  211. data/src/core/lib/iomgr/tcp_posix.cc +42 -39
  212. data/src/core/lib/iomgr/tcp_posix.h +8 -0
  213. data/src/core/lib/iomgr/tcp_server_custom.cc +3 -4
  214. data/src/core/lib/iomgr/tcp_server_posix.cc +7 -0
  215. data/src/core/lib/iomgr/tcp_server_utils_posix_common.cc +2 -1
  216. data/src/core/lib/iomgr/timer.h +6 -1
  217. data/src/core/lib/iomgr/timer_manager.cc +1 -1
  218. data/src/core/lib/json/json_reader.cc +1 -2
  219. data/src/core/lib/matchers/matchers.cc +8 -20
  220. data/src/core/lib/matchers/matchers.h +2 -1
  221. data/src/core/lib/security/authorization/authorization_engine.h +44 -0
  222. data/src/core/lib/security/authorization/authorization_policy_provider.h +32 -0
  223. data/src/core/lib/security/authorization/authorization_policy_provider_vtable.cc +46 -0
  224. data/src/core/lib/security/authorization/evaluate_args.cc +209 -0
  225. data/src/core/lib/security/authorization/evaluate_args.h +91 -0
  226. data/src/core/lib/security/credentials/google_default/google_default_credentials.cc +3 -1
  227. data/src/core/lib/security/credentials/tls/grpc_tls_certificate_provider.cc +49 -0
  228. data/src/core/lib/security/credentials/tls/grpc_tls_certificate_provider.h +7 -0
  229. data/src/core/lib/security/credentials/tls/tls_utils.cc +32 -0
  230. data/src/core/lib/security/credentials/tls/tls_utils.h +13 -0
  231. data/src/core/lib/security/security_connector/local/local_security_connector.cc +9 -6
  232. data/src/core/lib/security/security_connector/ssl_utils.cc +5 -0
  233. data/src/core/lib/security/security_connector/tls/tls_security_connector.cc +6 -18
  234. data/src/core/lib/security/transport/security_handshaker.cc +12 -4
  235. data/src/core/lib/security/transport/server_auth_filter.cc +0 -7
  236. data/src/core/lib/slice/slice.cc +12 -2
  237. data/src/core/lib/slice/slice_internal.h +1 -0
  238. data/src/core/lib/surface/call.cc +26 -7
  239. data/src/core/lib/surface/call.h +11 -0
  240. data/src/core/lib/surface/completion_queue.cc +22 -22
  241. data/src/core/lib/surface/completion_queue.h +1 -1
  242. data/src/core/lib/surface/completion_queue_factory.cc +1 -2
  243. data/src/core/lib/surface/init.cc +1 -3
  244. data/src/core/lib/surface/init.h +10 -1
  245. data/src/core/lib/surface/server.cc +3 -1
  246. data/src/core/lib/surface/server.h +3 -3
  247. data/src/core/lib/surface/version.cc +2 -4
  248. data/src/core/lib/transport/error_utils.cc +2 -2
  249. data/src/core/lib/transport/metadata_batch.cc +13 -2
  250. data/src/core/lib/transport/metadata_batch.h +7 -0
  251. data/src/core/lib/transport/transport.h +2 -0
  252. data/src/core/lib/transport/transport_op_string.cc +1 -1
  253. data/src/core/plugin_registry/grpc_plugin_registry.cc +4 -0
  254. data/src/core/tsi/alts/crypt/gsec.h +2 -0
  255. data/src/ruby/ext/grpc/extconf.rb +2 -0
  256. data/src/ruby/ext/grpc/rb_grpc_imports.generated.c +6 -0
  257. data/src/ruby/ext/grpc/rb_grpc_imports.generated.h +10 -1
  258. data/src/ruby/lib/grpc/version.rb +1 -1
  259. data/third_party/boringssl-with-bazel/err_data.c +269 -263
  260. data/third_party/boringssl-with-bazel/src/crypto/asn1/a_object.c +8 -6
  261. data/third_party/boringssl-with-bazel/src/crypto/cipher_extra/cipher_extra.c +4 -0
  262. data/third_party/boringssl-with-bazel/src/crypto/curve25519/curve25519.c +1 -1
  263. data/third_party/boringssl-with-bazel/src/crypto/curve25519/internal.h +1 -1
  264. data/third_party/boringssl-with-bazel/src/crypto/evp/evp.c +9 -0
  265. data/third_party/boringssl-with-bazel/src/crypto/fipsmodule/bn/prime.c +0 -4
  266. data/third_party/boringssl-with-bazel/src/crypto/fipsmodule/digest/digest.c +7 -0
  267. data/third_party/boringssl-with-bazel/src/crypto/fipsmodule/digest/md32_common.h +87 -121
  268. data/third_party/boringssl-with-bazel/src/crypto/fipsmodule/md4/md4.c +20 -30
  269. data/third_party/boringssl-with-bazel/src/crypto/fipsmodule/md5/md5.c +19 -30
  270. data/third_party/boringssl-with-bazel/src/crypto/fipsmodule/rand/internal.h +1 -4
  271. data/third_party/boringssl-with-bazel/src/crypto/fipsmodule/rand/rand.c +0 -13
  272. data/third_party/boringssl-with-bazel/src/crypto/fipsmodule/rsa/rsa.c +26 -24
  273. data/third_party/boringssl-with-bazel/src/crypto/fipsmodule/rsa/rsa_impl.c +10 -7
  274. data/third_party/boringssl-with-bazel/src/crypto/fipsmodule/sha/sha1.c +28 -39
  275. data/third_party/boringssl-with-bazel/src/crypto/fipsmodule/sha/sha256.c +48 -66
  276. data/third_party/boringssl-with-bazel/src/crypto/fipsmodule/sha/sha512.c +4 -5
  277. data/third_party/boringssl-with-bazel/src/crypto/hpke/hpke.c +362 -371
  278. data/third_party/boringssl-with-bazel/src/crypto/pkcs7/pkcs7_x509.c +4 -2
  279. data/third_party/boringssl-with-bazel/src/crypto/rand_extra/passive.c +2 -2
  280. data/third_party/boringssl-with-bazel/src/crypto/rsa_extra/rsa_asn1.c +1 -2
  281. data/third_party/boringssl-with-bazel/src/crypto/x509/internal.h +101 -11
  282. data/third_party/boringssl-with-bazel/src/crypto/x509/t_x509a.c +3 -0
  283. data/third_party/boringssl-with-bazel/src/crypto/x509/x509_cmp.c +2 -2
  284. data/third_party/boringssl-with-bazel/src/crypto/x509/x509_req.c +3 -0
  285. data/third_party/boringssl-with-bazel/src/crypto/x509/x509_set.c +1 -1
  286. data/third_party/boringssl-with-bazel/src/crypto/x509/x509_trs.c +2 -0
  287. data/third_party/boringssl-with-bazel/src/crypto/x509/x509_vfy.c +14 -15
  288. data/third_party/boringssl-with-bazel/src/crypto/x509/x509_vpm.c +53 -73
  289. data/third_party/boringssl-with-bazel/src/crypto/x509/x509cset.c +31 -0
  290. data/third_party/boringssl-with-bazel/src/crypto/x509/x509rset.c +3 -0
  291. data/third_party/boringssl-with-bazel/src/crypto/x509/x_all.c +3 -0
  292. data/third_party/boringssl-with-bazel/src/crypto/x509/x_req.c +5 -8
  293. data/third_party/boringssl-with-bazel/src/crypto/x509/x_sig.c +5 -0
  294. data/third_party/boringssl-with-bazel/src/crypto/x509/x_x509a.c +3 -0
  295. data/third_party/boringssl-with-bazel/src/crypto/x509v3/internal.h +7 -0
  296. data/third_party/boringssl-with-bazel/src/crypto/x509v3/v3_purp.c +1 -1
  297. data/third_party/boringssl-with-bazel/src/crypto/x509v3/v3_utl.c +5 -8
  298. data/third_party/boringssl-with-bazel/src/include/openssl/aead.h +1 -1
  299. data/third_party/boringssl-with-bazel/src/include/openssl/arm_arch.h +66 -1
  300. data/third_party/boringssl-with-bazel/src/include/openssl/base.h +40 -9
  301. data/third_party/boringssl-with-bazel/src/include/openssl/bytestring.h +1 -0
  302. data/third_party/boringssl-with-bazel/src/include/openssl/chacha.h +1 -1
  303. data/third_party/boringssl-with-bazel/src/include/openssl/digest.h +6 -2
  304. data/third_party/boringssl-with-bazel/src/include/openssl/ecdsa.h +14 -0
  305. data/third_party/boringssl-with-bazel/src/include/openssl/evp.h +19 -11
  306. data/third_party/boringssl-with-bazel/src/include/openssl/hpke.h +325 -0
  307. data/third_party/boringssl-with-bazel/src/include/openssl/pkcs7.h +23 -7
  308. data/third_party/boringssl-with-bazel/src/include/openssl/rsa.h +99 -63
  309. data/third_party/boringssl-with-bazel/src/include/openssl/ssl.h +139 -109
  310. data/third_party/boringssl-with-bazel/src/include/openssl/tls1.h +12 -19
  311. data/third_party/boringssl-with-bazel/src/include/openssl/x509.h +48 -50
  312. data/third_party/boringssl-with-bazel/src/include/openssl/x509_vfy.h +451 -435
  313. data/third_party/boringssl-with-bazel/src/include/openssl/x509v3.h +0 -1
  314. data/third_party/boringssl-with-bazel/src/ssl/d1_both.cc +2 -2
  315. data/third_party/boringssl-with-bazel/src/ssl/d1_srtp.cc +1 -1
  316. data/third_party/boringssl-with-bazel/src/ssl/encrypted_client_hello.cc +773 -84
  317. data/third_party/boringssl-with-bazel/src/ssl/handoff.cc +80 -47
  318. data/third_party/boringssl-with-bazel/src/ssl/handshake.cc +24 -19
  319. data/third_party/boringssl-with-bazel/src/ssl/handshake_client.cc +189 -86
  320. data/third_party/boringssl-with-bazel/src/ssl/handshake_server.cc +45 -56
  321. data/third_party/boringssl-with-bazel/src/ssl/internal.h +272 -167
  322. data/third_party/boringssl-with-bazel/src/ssl/s3_both.cc +2 -2
  323. data/third_party/boringssl-with-bazel/src/ssl/s3_lib.cc +2 -2
  324. data/third_party/boringssl-with-bazel/src/ssl/s3_pkt.cc +14 -19
  325. data/third_party/boringssl-with-bazel/src/ssl/ssl_lib.cc +34 -102
  326. data/third_party/boringssl-with-bazel/src/ssl/ssl_privkey.cc +2 -0
  327. data/third_party/boringssl-with-bazel/src/ssl/ssl_session.cc +8 -31
  328. data/third_party/boringssl-with-bazel/src/ssl/ssl_stat.cc +3 -0
  329. data/third_party/boringssl-with-bazel/src/ssl/ssl_transcript.cc +4 -3
  330. data/third_party/boringssl-with-bazel/src/ssl/ssl_versions.cc +7 -3
  331. data/third_party/boringssl-with-bazel/src/ssl/t1_lib.cc +576 -648
  332. data/third_party/boringssl-with-bazel/src/ssl/tls13_both.cc +31 -3
  333. data/third_party/boringssl-with-bazel/src/ssl/tls13_client.cc +98 -39
  334. data/third_party/boringssl-with-bazel/src/ssl/tls13_enc.cc +141 -94
  335. data/third_party/boringssl-with-bazel/src/ssl/tls13_server.cc +58 -68
  336. data/third_party/xxhash/xxhash.h +77 -195
  337. metadata +81 -39
  338. data/src/core/lib/gpr/arena.h +0 -47
  339. data/third_party/boringssl-with-bazel/src/crypto/hpke/internal.h +0 -267
  340. data/third_party/boringssl-with-bazel/src/crypto/x509/vpm_int.h +0 -71
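The largest functional change in this release is the rewrite of the client-channel retry filter (retry_filter.cc, +967/-544), shown below: it reworks call-attempt lifetime and cancellation, adds a per-attempt receive timer (perAttemptRecvTimeout), and lays groundwork for hedging. For orientation, here is a sketch of the retry policy this filter consumes, supplied from the Ruby client via the service-config channel arg; the service name and generated stub class are hypothetical, and since the default for enabling retries has varied across gRPC versions, 'grpc.enable_retries' is set explicitly:

    require 'grpc'
    require 'json'

    # Hypothetical example.Echo service; the JSON shape follows gRPC's
    # documented service-config retryPolicy.
    service_config = {
      methodConfig: [{
        name: [{ service: 'example.Echo' }],
        retryPolicy: {
          maxAttempts: 4,
          initialBackoff: '0.1s',
          maxBackoff: '1s',
          backoffMultiplier: 2.0,
          retryableStatusCodes: ['UNAVAILABLE']
        }
      }]
    }.to_json

    stub = Example::Echo::Stub.new(
      'localhost:50051', :this_channel_is_insecure,
      channel_args: {
        'grpc.service_config' => service_config,  # GRPC_ARG_SERVICE_CONFIG
        'grpc.enable_retries' => 1                # GRPC_ARG_ENABLE_RETRIES
      }
    )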
@@ -88,9 +88,7 @@
88
88
 
89
89
  // TODO(roth): In subsequent PRs:
90
90
  // - add support for transparent retries (including initial metadata)
91
- // - figure out how to record stats in census for retries
92
- // (census filter is on top of this one)
93
- // - add census stats for retries
91
+ // - implement hedging
94
92
 
95
93
  // By default, we buffer 256 KiB per RPC for retries.
96
94
  // TODO(roth): Do we have any data to suggest a better value?
@@ -200,7 +198,6 @@ class RetryFilter::CallData {
200
198
  static void SetPollent(grpc_call_element* elem, grpc_polling_entity* pollent);
201
199
 
202
200
  private:
203
- class Canceller;
204
201
  class CallStackDestructionBarrier;
205
202
 
206
203
  // Pending batches stored in call data.
@@ -212,13 +209,12 @@ class RetryFilter::CallData {
212
209
  };
213
210
 
214
211
  // State associated with each call attempt.
215
- // Allocated on the arena.
216
- class CallAttempt
217
- : public RefCounted<CallAttempt, PolymorphicRefCount, kUnrefCallDtor> {
212
+ class CallAttempt : public RefCounted<CallAttempt> {
218
213
  public:
219
214
  explicit CallAttempt(CallData* calld);
215
+ ~CallAttempt() override;
220
216
 
221
- ClientChannel::LoadBalancedCall* lb_call() const { return lb_call_.get(); }
217
+ bool lb_call_committed() const { return lb_call_committed_; }
222
218
 
223
219
  // Constructs and starts whatever batches are needed on this call
224
220
  // attempt.
@@ -228,6 +224,9 @@ class RetryFilter::CallData {
228
224
  // committing the call.
229
225
  void FreeCachedSendOpDataAfterCommit();
230
226
 
227
+ // Cancels the call attempt.
228
+ void CancelFromSurface(grpc_transport_stream_op_batch* cancel_batch);
229
+
231
230
  private:
232
231
  // State used for starting a retryable batch on the call attempt's LB call.
233
232
  // This provides its own grpc_transport_stream_op_batch and other data
@@ -235,7 +234,7 @@ class RetryFilter::CallData {
235
234
  // We allocate one struct on the arena for each attempt at starting a
236
235
  // batch on a given LB call.
237
236
  class BatchData
238
- : public RefCounted<CallAttempt, PolymorphicRefCount, kUnrefCallDtor> {
237
+ : public RefCounted<BatchData, PolymorphicRefCount, kUnrefCallDtor> {
239
238
  public:
240
239
  BatchData(RefCountedPtr<CallAttempt> call_attempt, int refcount,
241
240
  bool set_on_complete);
@@ -243,48 +242,50 @@ class RetryFilter::CallData {
243
242
 
244
243
  grpc_transport_stream_op_batch* batch() { return &batch_; }
245
244
 
246
- // Adds retriable send_initial_metadata op to batch_data.
245
+ // Adds retriable send_initial_metadata op.
247
246
  void AddRetriableSendInitialMetadataOp();
248
- // Adds retriable send_message op to batch_data.
247
+ // Adds retriable send_message op.
249
248
  void AddRetriableSendMessageOp();
250
- // Adds retriable send_trailing_metadata op to batch_data.
249
+ // Adds retriable send_trailing_metadata op.
251
250
  void AddRetriableSendTrailingMetadataOp();
252
- // Adds retriable recv_initial_metadata op to batch_data.
251
+ // Adds retriable recv_initial_metadata op.
253
252
  void AddRetriableRecvInitialMetadataOp();
254
- // Adds retriable recv_message op to batch_data.
253
+ // Adds retriable recv_message op.
255
254
  void AddRetriableRecvMessageOp();
256
- // Adds retriable recv_trailing_metadata op to batch_data.
255
+ // Adds retriable recv_trailing_metadata op.
257
256
  void AddRetriableRecvTrailingMetadataOp();
257
+ // Adds cancel_stream op.
258
+ void AddCancelStreamOp(grpc_error_handle error);
258
259
 
259
260
  private:
260
- // Returns true if the call is being retried.
261
- bool MaybeRetry(grpc_status_code status, grpc_mdelem* server_pushback_md,
262
- bool is_lb_drop);
263
-
264
261
  // Frees cached send ops that were completed by the completed batch in
265
262
  // batch_data. Used when batches are completed after the call is
266
263
  // committed.
267
264
  void FreeCachedSendOpDataForCompletedBatch();
268
265
 
269
- // Invokes recv_initial_metadata_ready for a batch.
270
- static void InvokeRecvInitialMetadataCallback(void* arg,
271
- grpc_error_handle error);
266
+ // If there is a pending recv_initial_metadata op, adds a closure
267
+ // to closures for recv_initial_metadata_ready.
268
+ void MaybeAddClosureForRecvInitialMetadataCallback(
269
+ grpc_error_handle error, CallCombinerClosureList* closures);
272
270
  // Intercepts recv_initial_metadata_ready callback for retries.
273
271
  // Commits the call and returns the initial metadata up the stack.
274
272
  static void RecvInitialMetadataReady(void* arg, grpc_error_handle error);
275
273
 
276
- // Invokes recv_message_ready for a batch.
277
- static void InvokeRecvMessageCallback(void* arg, grpc_error_handle error);
274
+ // If there is a pending recv_message op, adds a closure to closures
275
+ // for recv_message_ready.
276
+ void MaybeAddClosureForRecvMessageCallback(
277
+ grpc_error_handle error, CallCombinerClosureList* closures);
278
278
  // Intercepts recv_message_ready callback for retries.
279
279
  // Commits the call and returns the message up the stack.
280
280
  static void RecvMessageReady(void* arg, grpc_error_handle error);
281
281
 
282
- // Adds recv_trailing_metadata_ready closure to closures.
283
- void AddClosureForRecvTrailingMetadataReady(
282
+ // If there is a pending recv_trailing_metadata op, adds a closure to
283
+ // closures for recv_trailing_metadata_ready.
284
+ void MaybeAddClosureForRecvTrailingMetadataReady(
284
285
  grpc_error_handle error, CallCombinerClosureList* closures);
285
- // Adds any necessary closures for deferred recv_initial_metadata and
286
- // recv_message callbacks to closures.
287
- void AddClosuresForDeferredRecvCallbacks(
286
+ // Adds any necessary closures for deferred batch completion
287
+ // callbacks to closures.
288
+ void AddClosuresForDeferredCompletionCallbacks(
288
289
  CallCombinerClosureList* closures);
289
290
  // For any pending batch containing an op that has not yet been started,
290
291
  // adds the pending batch's completion closures to closures.
@@ -309,6 +310,10 @@ class RetryFilter::CallData {
309
310
  // Callback used to intercept on_complete from LB calls.
310
311
  static void OnComplete(void* arg, grpc_error_handle error);
311
312
 
313
+ // Callback used to handle on_complete for internally generated
314
+ // cancel_stream op.
315
+ static void OnCompleteForCancelOp(void* arg, grpc_error_handle error);
316
+
312
317
  RefCountedPtr<CallAttempt> call_attempt_;
313
318
  // The batch to use in the LB call.
314
319
  // Its payload field points to CallAttempt::batch_payload_.
@@ -317,12 +322,37 @@ class RetryFilter::CallData {
317
322
  grpc_closure on_complete_;
318
323
  };
319
324
 
325
+ class AttemptDispatchController
326
+ : public ConfigSelector::CallDispatchController {
327
+ public:
328
+ explicit AttemptDispatchController(CallAttempt* call_attempt)
329
+ : call_attempt_(call_attempt) {}
330
+
331
+ // Will never be called.
332
+ bool ShouldRetry() override { return false; }
333
+
334
+ void Commit() override {
335
+ call_attempt_->lb_call_committed_ = true;
336
+ auto* calld = call_attempt_->calld_;
337
+ if (calld->retry_committed_) {
338
+ auto* service_config_call_data = static_cast<ServiceConfigCallData*>(
339
+ calld->call_context_[GRPC_CONTEXT_SERVICE_CONFIG_CALL_DATA]
340
+ .value);
341
+ service_config_call_data->call_dispatch_controller()->Commit();
342
+ }
343
+ }
344
+
345
+ private:
346
+ CallAttempt* call_attempt_;
347
+ };
348
+
320
349
  // Creates a BatchData object on the call's arena with the
321
350
  // specified refcount. If set_on_complete is true, the batch's
322
351
  // on_complete callback will be set to point to on_complete();
323
352
  // otherwise, the batch's on_complete callback will be null.
324
353
  BatchData* CreateBatch(int refcount, bool set_on_complete) {
325
- return calld_->arena_->New<BatchData>(Ref(), refcount, set_on_complete);
354
+ return calld_->arena_->New<BatchData>(Ref(DEBUG_LOCATION, "CreateBatch"),
355
+ refcount, set_on_complete);
326
356
  }
327
357
 
328
358
  // If there are any cached send ops that need to be replayed on this
@@ -330,23 +360,61 @@ class RetryFilter::CallData {
330
360
  // Otherwise, returns nullptr.
331
361
  BatchData* MaybeCreateBatchForReplay();
332
362
 
363
+ // Adds a closure to closures that will execute batch in the call combiner.
364
+ void AddClosureForBatch(grpc_transport_stream_op_batch* batch,
365
+ const char* reason,
366
+ CallCombinerClosureList* closures);
367
+
368
+ // Helper function used to start a recv_trailing_metadata batch. This
369
+ // is used in the case where a recv_initial_metadata or recv_message
370
+ // op fails in a way that we know the call is over but when the application
371
+ // has not yet started its own recv_trailing_metadata op.
372
+ void AddBatchForInternalRecvTrailingMetadata(
373
+ CallCombinerClosureList* closures);
374
+
375
+ // Adds a batch to closures to cancel this call attempt.
376
+ void AddBatchForCancelOp(grpc_error_handle error,
377
+ CallCombinerClosureList* closures);
378
+
333
379
  // Adds batches for pending batches to closures.
334
380
  void AddBatchesForPendingBatches(CallCombinerClosureList* closures);
335
381
 
336
382
  // Adds whatever batches are needed on this attempt to closures.
337
383
  void AddRetriableBatches(CallCombinerClosureList* closures);
338
384
 
339
- // Returns true if any op in the batch was not yet started on this attempt.
340
- bool PendingBatchIsUnstarted(PendingBatch* pending);
385
+ // Returns true if any send op in the batch was not yet started on this
386
+ // attempt.
387
+ bool PendingBatchContainsUnstartedSendOps(PendingBatch* pending);
341
388
 
342
- // Helper function used to start a recv_trailing_metadata batch. This
343
- // is used in the case where a recv_initial_metadata or recv_message
344
- // op fails in a way that we know the call is over but when the application
345
- // has not yet started its own recv_trailing_metadata op.
346
- void StartInternalRecvTrailingMetadata();
389
+ // Returns true if there are cached send ops to replay.
390
+ bool HaveSendOpsToReplay();
391
+
392
+ // If our retry state is no longer needed, switch to fast path by moving
393
+ // our LB call into calld_->committed_call_ and having calld_ drop
394
+ // its ref to us.
395
+ void MaybeSwitchToFastPath();
396
+
397
+ // Returns true if the call should be retried.
398
+ // If server_pushback_md is non-null, sets *server_pushback_ms.
399
+ bool ShouldRetry(absl::optional<grpc_status_code> status, bool is_lb_drop,
400
+ grpc_mdelem* server_pushback_md,
401
+ grpc_millis* server_pushback_ms);
402
+
403
+ // Abandons the call attempt. Unrefs any deferred batches.
404
+ void Abandon();
405
+
406
+ static void OnPerAttemptRecvTimer(void* arg, grpc_error_handle error);
407
+ static void OnPerAttemptRecvTimerLocked(void* arg, grpc_error_handle error);
408
+ void MaybeCancelPerAttemptRecvTimer();
347
409
 
348
410
  CallData* calld_;
349
- RefCountedPtr<ClientChannel::LoadBalancedCall> lb_call_;
411
+ AttemptDispatchController attempt_dispatch_controller_;
412
+ OrphanablePtr<ClientChannel::LoadBalancedCall> lb_call_;
413
+ bool lb_call_committed_ = false;
414
+
415
+ grpc_timer per_attempt_recv_timer_;
416
+ grpc_closure on_per_attempt_recv_timer_;
417
+ bool per_attempt_recv_timer_pending_ = false;
350
418
 
351
419
  // BatchData.batch.payload points to this.
352
420
  grpc_transport_stream_op_batch_payload batch_payload_;
@@ -389,16 +457,28 @@ class RetryFilter::CallData {
389
457
  bool started_recv_trailing_metadata_ : 1;
390
458
  bool completed_recv_trailing_metadata_ : 1;
391
459
  // State for callback processing.
392
- BatchData* recv_initial_metadata_ready_deferred_batch_ = nullptr;
460
+ RefCountedPtr<BatchData> recv_initial_metadata_ready_deferred_batch_;
393
461
  grpc_error_handle recv_initial_metadata_error_ = GRPC_ERROR_NONE;
394
- BatchData* recv_message_ready_deferred_batch_ = nullptr;
462
+ RefCountedPtr<BatchData> recv_message_ready_deferred_batch_;
395
463
  grpc_error_handle recv_message_error_ = GRPC_ERROR_NONE;
396
- BatchData* recv_trailing_metadata_internal_batch_ = nullptr;
464
+ struct OnCompleteDeferredBatch {
465
+ OnCompleteDeferredBatch(RefCountedPtr<BatchData> batch,
466
+ grpc_error_handle error)
467
+ : batch(std::move(batch)), error(error) {}
468
+ RefCountedPtr<BatchData> batch;
469
+ grpc_error_handle error;
470
+ };
471
+ // There cannot be more than 3 pending send op batches at a time.
472
+ absl::InlinedVector<OnCompleteDeferredBatch, 3>
473
+ on_complete_deferred_batches_;
474
+ RefCountedPtr<BatchData> recv_trailing_metadata_internal_batch_;
475
+ grpc_error_handle recv_trailing_metadata_error_ = GRPC_ERROR_NONE;
476
+ bool seen_recv_trailing_metadata_from_surface_ : 1;
397
477
  // NOTE: Do not move this next to the metadata bitfields above. That would
398
478
  // save space but will also result in a data race because compiler
399
479
  // will generate a 2 byte store which overwrites the meta-data
400
480
  // fields upon setting this field.
401
- bool retry_dispatched_ : 1;
481
+ bool abandoned_ : 1;
402
482
  };
403
483
 
404
484
  CallData(RetryFilter* chand, const grpc_call_element_args& args);
@@ -432,18 +512,18 @@ class RetryFilter::CallData {
432
512
  // Commits the call so that no further retry attempts will be performed.
433
513
  void RetryCommit(CallAttempt* call_attempt);
434
514
 
435
- // Starts a retry after appropriate back-off.
436
- void DoRetry(grpc_millis server_pushback_ms);
515
+ // Starts a timer to retry after appropriate back-off.
516
+ // If server_pushback_ms is -1, retry_backoff_ is used.
517
+ void StartRetryTimer(grpc_millis server_pushback_ms);
518
+
437
519
  static void OnRetryTimer(void* arg, grpc_error_handle error);
520
+ static void OnRetryTimerLocked(void* arg, grpc_error_handle error);
438
521
 
439
- RefCountedPtr<ClientChannel::LoadBalancedCall> CreateLoadBalancedCall();
522
+ OrphanablePtr<ClientChannel::LoadBalancedCall> CreateLoadBalancedCall(
523
+ ConfigSelector::CallDispatchController* call_dispatch_controller);
440
524
 
441
525
  void CreateCallAttempt();
442
526
 
443
- // Adds a closure to closures that will execute batch in the call combiner.
444
- void AddClosureForBatch(grpc_transport_stream_op_batch* batch,
445
- CallCombinerClosureList* closures);
446
-
447
527
  RetryFilter* chand_;
448
528
  grpc_polling_entity* pollent_;
449
529
  RefCountedPtr<ServerRetryThrottleData> retry_throttle_data_;
@@ -451,13 +531,14 @@ class RetryFilter::CallData {
451
531
  BackOff retry_backoff_;
452
532
 
453
533
  grpc_slice path_; // Request path.
454
- gpr_cycle_counter call_start_time_;
455
534
  grpc_millis deadline_;
456
535
  Arena* arena_;
457
536
  grpc_call_stack* owning_call_;
458
537
  CallCombiner* call_combiner_;
459
538
  grpc_call_context_element* call_context_;
460
539
 
540
+ grpc_error_handle cancelled_from_surface_ = GRPC_ERROR_NONE;
541
+
461
542
  RefCountedPtr<CallStackDestructionBarrier> call_stack_destruction_barrier_;
462
543
 
463
544
  // TODO(roth): As part of implementing hedging, we will need to maintain a
@@ -465,13 +546,10 @@ class RetryFilter::CallData {
465
546
  // gets cancelled.
466
547
  RefCountedPtr<CallAttempt> call_attempt_;
467
548
 
468
- // LB call used when the call is commited before any CallAttempt is
469
- // created.
470
- // TODO(roth): Change CallAttempt logic such that once we've committed
471
- // and all cached send ops have been replayed, we move the LB call
472
- // from the CallAttempt here, thus creating a fast path for the
473
- // remainder of the streaming call.
474
- RefCountedPtr<ClientChannel::LoadBalancedCall> committed_call_;
549
+ // LB call used when we've committed to a call attempt and the retry
550
+ // state for that attempt is no longer needed. This provides a fast
551
+ // path for long-running streaming calls that minimizes overhead.
552
+ OrphanablePtr<ClientChannel::LoadBalancedCall> committed_call_;
475
553
 
476
554
  // When are are not yet fully committed to a particular call (i.e.,
477
555
  // either we might still retry or we have committed to the call but
@@ -486,23 +564,11 @@ class RetryFilter::CallData {
486
564
 
487
565
  // Retry state.
488
566
  bool retry_committed_ : 1;
489
- bool last_attempt_got_server_pushback_ : 1;
567
+ bool retry_timer_pending_ : 1;
490
568
  int num_attempts_completed_ = 0;
491
- Mutex timer_mu_;
492
- Canceller* canceller_ ABSL_GUARDED_BY(timer_mu_);
493
- grpc_timer retry_timer_ ABSL_GUARDED_BY(timer_mu_);
569
+ grpc_timer retry_timer_;
494
570
  grpc_closure retry_closure_;
495
571
 
496
- // The number of batches containing send ops that are currently in-flight
497
- // on any call attempt.
498
- // We hold a ref to the call stack while this is non-zero, since replay
499
- // batches may not complete until after all callbacks have been returned
500
- // to the surface, and we need to make sure that the call is not destroyed
501
- // until all of these batches have completed.
502
- // Note that we actually only need to track replay batches, but it's
503
- // easier to track all batches with send ops.
504
- int num_in_flight_call_attempt_send_batches_ = 0;
505
-
506
572
  // Cached data for retrying send ops.
507
573
  // send_initial_metadata
508
574
  bool seen_send_initial_metadata_ = false;
@@ -513,7 +579,10 @@ class RetryFilter::CallData {
513
579
  // have the LB call set a value in CallAttempt and then propagate it
514
580
  // from CallAttempt to the parent call when we commit. Otherwise, we
515
581
  // may leave this with a value for a peer other than the one we
516
- // actually commit to.
582
+ // actually commit to. Alternatively, maybe see if there's a way to
583
+ // change the surface API such that the peer isn't available until
584
+ // after initial metadata is received? (Could even change the
585
+ // transport API to return this with the recv_initial_metadata op.)
517
586
  gpr_atm* peer_string_;
518
587
  // send_message
519
588
  // When we get a send_message op, we replace the original byte stream
@@ -522,6 +591,10 @@ class RetryFilter::CallData {
522
591
  // Note: We inline the cache for the first 3 send_message ops and use
523
592
  // dynamic allocation after that. This number was essentially picked
524
593
  // at random; it could be changed in the future to tune performance.
594
+ // TODO(roth): As part of implementing hedging, we may need some
595
+ // synchronization here, since ByteStreamCache does not provide any
596
+ // synchronization, so it's not safe to have multiple
597
+ // CachingByteStreams read from the same ByteStreamCache concurrently.
525
598
  absl::InlinedVector<ByteStreamCache*, 3> send_messages_;
526
599
  // send_trailing_metadata
527
600
  bool seen_send_trailing_metadata_ = false;
@@ -582,52 +655,15 @@ class RetryFilter::CallData::CallStackDestructionBarrier
582
655
  grpc_closure* on_call_stack_destruction_ = nullptr;
583
656
  };
584
657
 
585
- //
586
- // RetryFilter::CallData::Canceller
587
- //
588
-
589
- class RetryFilter::CallData::Canceller {
590
- public:
591
- explicit Canceller(CallData* calld) : calld_(calld) {
592
- GRPC_CALL_STACK_REF(calld_->owning_call_, "RetryCanceller");
593
- GRPC_CLOSURE_INIT(&closure_, &Cancel, this, nullptr);
594
- calld_->call_combiner_->SetNotifyOnCancel(&closure_);
595
- }
596
-
597
- private:
598
- static void Cancel(void* arg, grpc_error_handle error) {
599
- auto* self = static_cast<Canceller*>(arg);
600
- auto* calld = self->calld_;
601
- {
602
- MutexLock lock(&calld->timer_mu_);
603
- if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
604
- gpr_log(GPR_INFO,
605
- "calld=%p: cancelling retry timer: error=%s self=%p "
606
- "calld->canceller_=%p",
607
- calld, grpc_error_std_string(error).c_str(), self,
608
- calld->canceller_);
609
- }
610
- if (calld->canceller_ == self && error != GRPC_ERROR_NONE) {
611
- calld->canceller_ = nullptr; // Checked by OnRetryTimer().
612
- grpc_timer_cancel(&calld->retry_timer_);
613
- calld->FreeAllCachedSendOpData();
614
- GRPC_CALL_COMBINER_STOP(calld->call_combiner_, "Canceller");
615
- }
616
- }
617
- GRPC_CALL_STACK_UNREF(calld->owning_call_, "RetryCanceller");
618
- delete self;
619
- }
620
-
621
- CallData* calld_;
622
- grpc_closure closure_;
623
- };
624
-
625
658
  //
626
659
  // RetryFilter::CallData::CallAttempt
627
660
  //
628
661
 
629
662
  RetryFilter::CallData::CallAttempt::CallAttempt(CallData* calld)
630
- : calld_(calld),
663
+ : RefCounted(GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace) ? "CallAttempt"
664
+ : nullptr),
665
+ calld_(calld),
666
+ attempt_dispatch_controller_(this),
631
667
  batch_payload_(calld->call_context_),
632
668
  started_send_initial_metadata_(false),
633
669
  completed_send_initial_metadata_(false),
@@ -637,12 +673,42 @@ RetryFilter::CallData::CallAttempt::CallAttempt(CallData* calld)
637
673
  completed_recv_initial_metadata_(false),
638
674
  started_recv_trailing_metadata_(false),
639
675
  completed_recv_trailing_metadata_(false),
640
- retry_dispatched_(false) {
641
- lb_call_ = calld->CreateLoadBalancedCall();
676
+ seen_recv_trailing_metadata_from_surface_(false),
677
+ abandoned_(false) {
678
+ lb_call_ = calld->CreateLoadBalancedCall(&attempt_dispatch_controller_);
642
679
  if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
643
- gpr_log(GPR_INFO, "chand=%p calld=%p: attempt=%p: create lb_call=%p",
680
+ gpr_log(GPR_INFO, "chand=%p calld=%p attempt=%p: create lb_call=%p",
644
681
  calld->chand_, calld, this, lb_call_.get());
645
682
  }
683
+ // If per_attempt_recv_timeout is set, start a timer.
684
+ if (calld->retry_policy_ != nullptr &&
685
+ calld->retry_policy_->per_attempt_recv_timeout().has_value()) {
686
+ grpc_millis per_attempt_recv_deadline =
687
+ ExecCtx::Get()->Now() +
688
+ *calld->retry_policy_->per_attempt_recv_timeout();
689
+ if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
690
+ gpr_log(GPR_INFO,
691
+ "chand=%p calld=%p attempt=%p: per-attempt timeout in %" PRId64
692
+ " ms",
693
+ calld->chand_, calld, this,
694
+ *calld->retry_policy_->per_attempt_recv_timeout());
695
+ }
696
+ // Schedule retry after computed delay.
697
+ GRPC_CLOSURE_INIT(&on_per_attempt_recv_timer_, OnPerAttemptRecvTimer, this,
698
+ nullptr);
699
+ GRPC_CALL_STACK_REF(calld->owning_call_, "OnPerAttemptRecvTimer");
700
+ Ref(DEBUG_LOCATION, "OnPerAttemptRecvTimer").release();
701
+ per_attempt_recv_timer_pending_ = true;
702
+ grpc_timer_init(&per_attempt_recv_timer_, per_attempt_recv_deadline,
703
+ &on_per_attempt_recv_timer_);
704
+ }
705
+ }
706
+
707
+ RetryFilter::CallData::CallAttempt::~CallAttempt() {
708
+ if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
709
+ gpr_log(GPR_INFO, "chand=%p calld=%p attempt=%p: destroying call attempt",
710
+ calld_->chand_, calld_, this);
711
+ }
646
712
  }
647
713
 
648
714
  void RetryFilter::CallData::CallAttempt::FreeCachedSendOpDataAfterCommit() {
@@ -661,13 +727,9 @@ void RetryFilter::CallData::CallAttempt::FreeCachedSendOpDataAfterCommit() {
661
727
  }
662
728
  }
663
729
 
664
- bool RetryFilter::CallData::CallAttempt::PendingBatchIsUnstarted(
730
+ bool RetryFilter::CallData::CallAttempt::PendingBatchContainsUnstartedSendOps(
665
731
  PendingBatch* pending) {
666
- // Only look at batches containing send ops, since batches containing
667
- // only recv ops are always started immediately.
668
- if (pending->batch == nullptr || pending->batch->on_complete == nullptr) {
669
- return false;
670
- }
732
+ if (pending->batch->on_complete == nullptr) return false;
671
733
  if (pending->batch->send_initial_metadata &&
672
734
  !started_send_initial_metadata_) {
673
735
  return true;
@@ -683,22 +745,40 @@ bool RetryFilter::CallData::CallAttempt::PendingBatchIsUnstarted(
683
745
  return false;
684
746
  }
685
747
 
686
- void RetryFilter::CallData::CallAttempt::StartInternalRecvTrailingMetadata() {
748
+ bool RetryFilter::CallData::CallAttempt::HaveSendOpsToReplay() {
749
+ // We don't check send_initial_metadata here, because that op will always
750
+ // be started as soon as it is received from the surface, so it will
751
+ // never need to be started at this point.
752
+ return started_send_message_count_ < calld_->send_messages_.size() ||
753
+ (calld_->seen_send_trailing_metadata_ &&
754
+ !started_send_trailing_metadata_);
755
+ }
756
+
757
+ void RetryFilter::CallData::CallAttempt::MaybeSwitchToFastPath() {
758
+ // If we're not yet committed, we can't switch yet.
759
+ // TODO(roth): As part of implementing hedging, this logic needs to
760
+ // check that *this* call attempt is the one that we've committed to.
761
+ // Might need to replace abandoned_ with an enum indicating whether we're
762
+ // in flight, abandoned, or the winning call attempt.
763
+ if (!calld_->retry_committed_) return;
764
+ // If we've already switched to fast path, there's nothing to do here.
765
+ if (calld_->committed_call_ != nullptr) return;
766
+ // If the perAttemptRecvTimeout timer is pending, we can't switch yet.
767
+ if (per_attempt_recv_timer_pending_) return;
768
+ // If there are still send ops to replay, we can't switch yet.
769
+ if (HaveSendOpsToReplay()) return;
770
+ // If we started an internal batch for recv_trailing_metadata but have not
771
+ // yet seen that op from the surface, we can't switch yet.
772
+ if (recv_trailing_metadata_internal_batch_ != nullptr) return;
773
+ // Switch to fast path.
687
774
  if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
688
775
  gpr_log(GPR_INFO,
689
- "chand=%p calld=%p: call failed but recv_trailing_metadata not "
690
- "started; starting it internally",
691
- calld_->chand_, calld_);
776
+ "chand=%p calld=%p attempt=%p: retry state no longer needed; "
777
+ "moving LB call to parent and unreffing the call attempt",
778
+ calld_->chand_, calld_, this);
692
779
  }
693
- // Create batch_data with 2 refs, since this batch will be unreffed twice:
694
- // once for the recv_trailing_metadata_ready callback when the batch
695
- // completes, and again when we actually get a recv_trailing_metadata
696
- // op from the surface.
697
- BatchData* batch_data = CreateBatch(2, false /* set_on_complete */);
698
- batch_data->AddRetriableRecvTrailingMetadataOp();
699
- recv_trailing_metadata_internal_batch_ = batch_data;
700
- // Note: This will release the call combiner.
701
- lb_call_->StartTransportStreamOpBatch(batch_data->batch());
780
+ calld_->committed_call_ = std::move(lb_call_);
781
+ calld_->call_attempt_.reset(DEBUG_LOCATION, "MaybeSwitchToFastPath");
702
782
  }
703
783
 
704
784
  // If there are any cached send ops that need to be replayed on the
@@ -712,9 +792,9 @@ RetryFilter::CallData::CallAttempt::MaybeCreateBatchForReplay() {
712
792
  !calld_->pending_send_initial_metadata_) {
713
793
  if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
714
794
  gpr_log(GPR_INFO,
715
- "chand=%p calld=%p: replaying previously completed "
795
+ "chand=%p calld=%p attempt=%p: replaying previously completed "
716
796
  "send_initial_metadata op",
717
- calld_->chand_, calld_);
797
+ calld_->chand_, calld_, this);
718
798
  }
719
799
  replay_batch_data = CreateBatch(1, true /* set_on_complete */);
720
800
  replay_batch_data->AddRetriableSendInitialMetadataOp();
@@ -726,9 +806,9 @@ RetryFilter::CallData::CallAttempt::MaybeCreateBatchForReplay() {
726
806
  !calld_->pending_send_message_) {
727
807
  if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
728
808
  gpr_log(GPR_INFO,
729
- "chand=%p calld=%p: replaying previously completed "
809
+ "chand=%p calld=%p attempt=%p: replaying previously completed "
730
810
  "send_message op",
731
- calld_->chand_, calld_);
811
+ calld_->chand_, calld_, this);
732
812
  }
733
813
  if (replay_batch_data == nullptr) {
734
814
  replay_batch_data = CreateBatch(1, true /* set_on_complete */);
@@ -745,9 +825,9 @@ RetryFilter::CallData::CallAttempt::MaybeCreateBatchForReplay() {
745
825
  !calld_->pending_send_trailing_metadata_) {
746
826
  if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
747
827
  gpr_log(GPR_INFO,
748
- "chand=%p calld=%p: replaying previously completed "
828
+ "chand=%p calld=%p attempt=%p: replaying previously completed "
749
829
  "send_trailing_metadata op",
750
- calld_->chand_, calld_);
830
+ calld_->chand_, calld_, this);
751
831
  }
752
832
  if (replay_batch_data == nullptr) {
753
833
  replay_batch_data = CreateBatch(1, true /* set_on_complete */);
@@ -757,12 +837,67 @@ RetryFilter::CallData::CallAttempt::MaybeCreateBatchForReplay() {
757
837
  return replay_batch_data;
758
838
  }
759
839
 
840
+ namespace {
841
+
842
+ void StartBatchInCallCombiner(void* arg, grpc_error_handle /*ignored*/) {
843
+ grpc_transport_stream_op_batch* batch =
844
+ static_cast<grpc_transport_stream_op_batch*>(arg);
845
+ auto* lb_call = static_cast<ClientChannel::LoadBalancedCall*>(
846
+ batch->handler_private.extra_arg);
847
+ // Note: This will release the call combiner.
848
+ lb_call->StartTransportStreamOpBatch(batch);
849
+ }
850
+
851
+ } // namespace
852
+
853
+ void RetryFilter::CallData::CallAttempt::AddClosureForBatch(
854
+ grpc_transport_stream_op_batch* batch, const char* reason,
855
+ CallCombinerClosureList* closures) {
856
+ if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
857
+ gpr_log(GPR_INFO, "chand=%p calld=%p attempt=%p: adding batch (%s): %s",
858
+ calld_->chand_, calld_, this, reason,
859
+ grpc_transport_stream_op_batch_string(batch).c_str());
860
+ }
861
+ batch->handler_private.extra_arg = lb_call_.get();
862
+ GRPC_CLOSURE_INIT(&batch->handler_private.closure, StartBatchInCallCombiner,
863
+ batch, grpc_schedule_on_exec_ctx);
864
+ closures->Add(&batch->handler_private.closure, GRPC_ERROR_NONE, reason);
865
+ }
+
+ void RetryFilter::CallData::CallAttempt::
+ AddBatchForInternalRecvTrailingMetadata(CallCombinerClosureList* closures) {
+ if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
+ gpr_log(GPR_INFO,
+ "chand=%p calld=%p attempt=%p: call failed but "
+ "recv_trailing_metadata not started; starting it internally",
+ calld_->chand_, calld_, this);
+ }
+ // Create batch_data with 2 refs, since this batch will be unreffed twice:
+ // once for the recv_trailing_metadata_ready callback when the batch
+ // completes, and again when we actually get a recv_trailing_metadata
+ // op from the surface.
+ BatchData* batch_data = CreateBatch(2, false /* set_on_complete */);
+ batch_data->AddRetriableRecvTrailingMetadataOp();
+ recv_trailing_metadata_internal_batch_.reset(batch_data);
+ AddClosureForBatch(batch_data->batch(),
+ "starting internal recv_trailing_metadata", closures);
+ }
+
+ void RetryFilter::CallData::CallAttempt::AddBatchForCancelOp(
+ grpc_error_handle error, CallCombinerClosureList* closures) {
+ BatchData* cancel_batch_data = CreateBatch(1, /*set_on_complete=*/true);
+ cancel_batch_data->AddCancelStreamOp(error);
+ AddClosureForBatch(cancel_batch_data->batch(),
+ "start cancellation batch on call attempt", closures);
+ }
+
  void RetryFilter::CallData::CallAttempt::AddBatchesForPendingBatches(
  CallCombinerClosureList* closures) {
  for (size_t i = 0; i < GPR_ARRAY_SIZE(calld_->pending_batches_); ++i) {
  PendingBatch* pending = &calld_->pending_batches_[i];
  grpc_transport_stream_op_batch* batch = pending->batch;
  if (batch == nullptr) continue;
+ bool has_send_ops = false;
  // Skip any batch that either (a) has already been started on this
  // call attempt or (b) we can't start yet because we're still
  // replaying send ops that need to be completed first.
@@ -773,65 +908,93 @@ void RetryFilter::CallData::CallAttempt::AddBatchesForPendingBatches(
  // starting a recv op due to it being in the same batch with a send
  // op. If/when we revamp the callback protocol in
  // transport_stream_op_batch, we may be able to fix this.
- if (batch->send_initial_metadata && started_send_initial_metadata_) {
- continue;
+ if (batch->send_initial_metadata) {
+ if (started_send_initial_metadata_) continue;
+ has_send_ops = true;
  }
- if (batch->send_message &&
- completed_send_message_count_ < started_send_message_count_) {
- continue;
+ if (batch->send_message) {
+ if (completed_send_message_count_ < started_send_message_count_) {
+ continue;
+ }
+ has_send_ops = true;
  }
  // Note that we only start send_trailing_metadata if we have no more
  // send_message ops to start, since we can't send down any more
  // send_message ops after send_trailing_metadata.
- if (batch->send_trailing_metadata &&
- (started_send_message_count_ + batch->send_message <
- calld_->send_messages_.size() ||
- started_send_trailing_metadata_)) {
- continue;
+ if (batch->send_trailing_metadata) {
+ if (started_send_message_count_ + batch->send_message <
+ calld_->send_messages_.size() ||
+ started_send_trailing_metadata_) {
+ continue;
+ }
+ has_send_ops = true;
  }
- if (batch->recv_initial_metadata && started_recv_initial_metadata_) {
- continue;
+ int num_callbacks = has_send_ops; // All send ops share one callback.
+ if (batch->recv_initial_metadata) {
+ if (started_recv_initial_metadata_) continue;
+ ++num_callbacks;
  }
- if (batch->recv_message &&
- completed_recv_message_count_ < started_recv_message_count_) {
- continue;
+ if (batch->recv_message) {
+ if (completed_recv_message_count_ < started_recv_message_count_) {
+ continue;
+ }
+ ++num_callbacks;
  }
- if (batch->recv_trailing_metadata && started_recv_trailing_metadata_) {
- // If we previously completed a recv_trailing_metadata op
- // initiated by StartInternalRecvTrailingMetadata(), use the
- // result of that instead of trying to re-start this op.
- if (GPR_UNLIKELY(recv_trailing_metadata_internal_batch_ != nullptr)) {
- // If the batch completed, then trigger the completion callback
- // directly, so that we return the previously returned results to
- // the application. Otherwise, just unref the internally started
- // batch, since we'll propagate the completion when it completes.
- if (completed_recv_trailing_metadata_) {
- // Batches containing recv_trailing_metadata always succeed.
- closures->Add(
- &recv_trailing_metadata_ready_, GRPC_ERROR_NONE,
- "re-executing recv_trailing_metadata_ready to propagate "
- "internally triggered result");
- } else {
- recv_trailing_metadata_internal_batch_->Unref();
+ if (batch->recv_trailing_metadata) {
+ if (started_recv_trailing_metadata_) {
+ seen_recv_trailing_metadata_from_surface_ = true;
+ // If we previously completed a recv_trailing_metadata op
+ // initiated by AddBatchForInternalRecvTrailingMetadata(), use the
+ // result of that instead of trying to re-start this op.
+ if (GPR_UNLIKELY(recv_trailing_metadata_internal_batch_ != nullptr)) {
+ // If the batch completed, then trigger the completion callback
+ // directly, so that we return the previously returned results to
+ // the application. Otherwise, just unref the internally started
+ // batch, since we'll propagate the completion when it completes.
+ if (completed_recv_trailing_metadata_) {
+ closures->Add(
+ &recv_trailing_metadata_ready_, recv_trailing_metadata_error_,
+ "re-executing recv_trailing_metadata_ready to propagate "
+ "internally triggered result");
+ // Ref will be released by callback.
+ recv_trailing_metadata_internal_batch_.release();
+ } else {
+ recv_trailing_metadata_internal_batch_.reset(
+ DEBUG_LOCATION,
+ "internally started recv_trailing_metadata batch pending and "
+ "recv_trailing_metadata started from surface");
+ GRPC_ERROR_UNREF(recv_trailing_metadata_error_);
+ }
+ recv_trailing_metadata_error_ = GRPC_ERROR_NONE;
  }
- recv_trailing_metadata_internal_batch_ = nullptr;
+ // We don't want the fact that we've already started this op internally
+ // to prevent us from adding a batch that may contain other ops.
+ // Instead, we'll just skip adding this op below.
+ if (num_callbacks == 0) continue;
+ } else {
+ ++num_callbacks;
  }
- continue;
  }
- // If we're already committed, just send the batch as-is.
- if (calld_->retry_committed_) {
- calld_->AddClosureForBatch(batch, closures);
+ // If we're already committed and the following conditions are met,
+ // just send the batch down as-is:
+ // - The batch contains no cached send ops. (If it does, we need
+ // the logic below to use the cached payloads.)
+ // - The batch does not contain recv_trailing_metadata when we have
+ // already started an internal recv_trailing_metadata batch. (If
+ // we've already started an internal recv_trailing_metadata batch,
+ // then we need the logic below to send all ops in the batch
+ // *except* the recv_trailing_metadata op.)
+ if (calld_->retry_committed_ && !pending->send_ops_cached &&
+ (!batch->recv_trailing_metadata || !started_recv_trailing_metadata_)) {
+ AddClosureForBatch(
+ batch,
+ "start non-replayable pending batch on call attempt after commit",
+ closures);
  calld_->PendingBatchClear(pending);
  continue;
  }
  // Create batch with the right number of callbacks.
- const bool has_send_ops = batch->send_initial_metadata ||
- batch->send_message ||
- batch->send_trailing_metadata;
- const int num_callbacks = has_send_ops + batch->recv_initial_metadata +
- batch->recv_message +
- batch->recv_trailing_metadata;
- CallAttempt::BatchData* batch_data =
+ BatchData* batch_data =
  CreateBatch(num_callbacks, has_send_ops /* set_on_complete */);
  // Cache send ops if needed.
  calld_->MaybeCacheSendOpsForBatch(pending);
@@ -858,19 +1021,12 @@ void RetryFilter::CallData::CallAttempt::AddBatchesForPendingBatches(
  batch_data->AddRetriableRecvMessageOp();
  }
  // recv_trailing_metadata.
- if (batch->recv_trailing_metadata) {
+ if (batch->recv_trailing_metadata && !started_recv_trailing_metadata_) {
  batch_data->AddRetriableRecvTrailingMetadataOp();
  }
- calld_->AddClosureForBatch(batch_data->batch(), closures);
- // Track number of in-flight send batches.
- // If this is the first one, take a ref to the call stack.
- if (batch->send_initial_metadata || batch->send_message ||
- batch->send_trailing_metadata) {
- if (calld_->num_in_flight_call_attempt_send_batches_ == 0) {
- GRPC_CALL_STACK_REF(calld_->owning_call_, "retriable_send_batches");
- }
- ++calld_->num_in_flight_call_attempt_send_batches_;
- }
+ AddClosureForBatch(batch_data->batch(),
+ "start replayable pending batch on call attempt",
+ closures);
  }
  }
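
The rewritten loop above folds the skip logic and the callback count together: all send ops in a batch share the single on_complete callback, each recv op completes through its own *_ready callback, and an op already started internally (recv_trailing_metadata) no longer contributes a callback. A self-contained sketch of the same counting rule (the Batch struct is a hypothetical stand-in for grpc_transport_stream_op_batch's op bits):

    // Hypothetical stand-in for grpc_transport_stream_op_batch's op flags.
    struct Batch {
      bool send_initial_metadata = false;
      bool send_message = false;
      bool send_trailing_metadata = false;
      bool recv_initial_metadata = false;
      bool recv_message = false;
      bool recv_trailing_metadata = false;
    };

    // All send ops share one on_complete callback; each recv op has its own
    // ready-callback, so it counts separately. The result is the refcount a
    // new BatchData is created with.
    int NumCallbacks(const Batch& b) {
      const bool has_send_ops =
          b.send_initial_metadata || b.send_message || b.send_trailing_metadata;
      return static_cast<int>(has_send_ops) + b.recv_initial_metadata +
             b.recv_message + b.recv_trailing_metadata;
    }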

@@ -879,13 +1035,8 @@ void RetryFilter::CallData::CallAttempt::AddRetriableBatches(
  // Replay previously-returned send_* ops if needed.
  BatchData* replay_batch_data = MaybeCreateBatchForReplay();
  if (replay_batch_data != nullptr) {
- calld_->AddClosureForBatch(replay_batch_data->batch(), closures);
- // Track number of pending send batches.
- // If this is the first one, take a ref to the call stack.
- if (calld_->num_in_flight_call_attempt_send_batches_ == 0) {
- GRPC_CALL_STACK_REF(calld_->owning_call_, "retriable_send_batches");
- }
- ++calld_->num_in_flight_call_attempt_send_batches_;
+ AddClosureForBatch(replay_batch_data->batch(),
+ "start replay batch on call attempt", closures);
  }
  // Now add pending batches.
  AddBatchesForPendingBatches(closures);
@@ -893,8 +1044,9 @@ void RetryFilter::CallData::CallAttempt::AddRetriableBatches(

  void RetryFilter::CallData::CallAttempt::StartRetriableBatches() {
  if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
- gpr_log(GPR_INFO, "chand=%p calld=%p: constructing retriable batches",
- calld_->chand_, calld_);
+ gpr_log(GPR_INFO,
+ "chand=%p calld=%p attempt=%p: constructing retriable batches",
+ calld_->chand_, calld_, this);
  }
  // Construct list of closures to execute, one for each pending batch.
  CallCombinerClosureList closures;
@@ -903,107 +1055,51 @@ void RetryFilter::CallData::CallAttempt::StartRetriableBatches() {
  // Start batches on LB call.
  if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
  gpr_log(GPR_INFO,
- "chand=%p calld=%p: starting %" PRIuPTR
+ "chand=%p calld=%p attempt=%p: starting %" PRIuPTR
  " retriable batches on lb_call=%p",
- calld_->chand_, calld_, closures.size(), lb_call());
+ calld_->chand_, calld_, this, closures.size(), lb_call_.get());
  }
  closures.RunClosures(calld_->call_combiner_);
  }

- //
- // RetryFilter::CallData::CallAttempt::BatchData
- //
-
- RetryFilter::CallData::CallAttempt::BatchData::BatchData(
- RefCountedPtr<CallAttempt> attempt, int refcount, bool set_on_complete)
- : RefCounted(nullptr, refcount), call_attempt_(std::move(attempt)) {
- // TODO(roth): Consider holding this ref on the call stack in
- // CallAttempt instead of here in BatchData. This would eliminate the
- // need for CallData::num_in_flight_call_attempt_send_batches_.
- // But it would require having a way to unref CallAttempt when it is
- // no longer needed (i.e., when the call is committed and all cached
- // send ops have been replayed and the LB call is moved into
- // CallData::committed_call_).
- GRPC_CALL_STACK_REF(call_attempt_->calld_->owning_call_, "CallAttempt");
- batch_.payload = &call_attempt_->batch_payload_;
- if (set_on_complete) {
- GRPC_CLOSURE_INIT(&on_complete_, OnComplete, this,
- grpc_schedule_on_exec_ctx);
- batch_.on_complete = &on_complete_;
- }
+ void RetryFilter::CallData::CallAttempt::CancelFromSurface(
+ grpc_transport_stream_op_batch* cancel_batch) {
+ MaybeCancelPerAttemptRecvTimer();
+ // Propagate cancellation to LB call.
+ lb_call_->StartTransportStreamOpBatch(cancel_batch);
  }

- RetryFilter::CallData::CallAttempt::BatchData::~BatchData() {
- if (batch_.send_initial_metadata) {
- grpc_metadata_batch_destroy(&call_attempt_->send_initial_metadata_);
- }
- if (batch_.send_trailing_metadata) {
- grpc_metadata_batch_destroy(&call_attempt_->send_trailing_metadata_);
- }
- if (batch_.recv_initial_metadata) {
- grpc_metadata_batch_destroy(&call_attempt_->recv_initial_metadata_);
- }
- if (batch_.recv_trailing_metadata) {
- grpc_metadata_batch_destroy(&call_attempt_->recv_trailing_metadata_);
- }
- GRPC_CALL_STACK_UNREF(call_attempt_->calld_->owning_call_, "CallAttempt");
- }
-
- void RetryFilter::CallData::CallAttempt::BatchData::
- FreeCachedSendOpDataForCompletedBatch() {
- auto* calld = call_attempt_->calld_;
- // TODO(roth): When we implement hedging, this logic will need to get
- // a bit more complex, because there may be other (now abandoned) call
- // attempts still using this data. We may need to do some sort of
- // ref-counting instead.
- if (batch_.send_initial_metadata) {
- calld->FreeCachedSendInitialMetadata();
- }
- if (batch_.send_message) {
- calld->FreeCachedSendMessage(call_attempt_->completed_send_message_count_ -
- 1);
- }
- if (batch_.send_trailing_metadata) {
- calld->FreeCachedSendTrailingMetadata();
- }
- }
-
- bool RetryFilter::CallData::CallAttempt::BatchData::MaybeRetry(
- grpc_status_code status, grpc_mdelem* server_pushback_md, bool is_lb_drop) {
- auto* calld = call_attempt_->calld_;
+ bool RetryFilter::CallData::CallAttempt::ShouldRetry(
+ absl::optional<grpc_status_code> status, bool is_lb_drop,
+ grpc_mdelem* server_pushback_md, grpc_millis* server_pushback_ms) {
  // LB drops always inhibit retries.
  if (is_lb_drop) return false;
- // Get retry policy.
- if (calld->retry_policy_ == nullptr) return false;
- // If we've already dispatched a retry from this call, return true.
- // This catches the case where the batch has multiple callbacks
- // (i.e., it includes either recv_message or recv_initial_metadata).
- if (call_attempt_->retry_dispatched_) {
- if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
- gpr_log(GPR_INFO, "chand=%p calld=%p: retry already dispatched",
- calld->chand_, calld);
- }
- return true;
- }
+ // TODO(roth): Handle transparent retries here.
+ // If no retry policy, don't retry.
+ if (calld_->retry_policy_ == nullptr) return false;
  // Check status.
- if (GPR_LIKELY(status == GRPC_STATUS_OK)) {
- if (calld->retry_throttle_data_ != nullptr) {
- calld->retry_throttle_data_->RecordSuccess();
- }
- if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
- gpr_log(GPR_INFO, "chand=%p calld=%p: call succeeded", calld->chand_,
- calld);
+ if (status.has_value()) {
+ if (GPR_LIKELY(*status == GRPC_STATUS_OK)) {
+ if (calld_->retry_throttle_data_ != nullptr) {
+ calld_->retry_throttle_data_->RecordSuccess();
+ }
+ if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
+ gpr_log(GPR_INFO, "chand=%p calld=%p attempt=%p: call succeeded",
+ calld_->chand_, calld_, this);
+ }
+ return false;
  }
- return false;
- }
- // Status is not OK. Check whether the status is retryable.
- if (!calld->retry_policy_->retryable_status_codes().Contains(status)) {
- if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
- gpr_log(GPR_INFO,
- "chand=%p calld=%p: status %s not configured as retryable",
- calld->chand_, calld, grpc_status_code_to_string(status));
+ // Status is not OK. Check whether the status is retryable.
+ if (!calld_->retry_policy_->retryable_status_codes().Contains(*status)) {
+ if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
+ gpr_log(GPR_INFO,
+ "chand=%p calld=%p attempt=%p: status %s not configured as "
+ "retryable",
+ calld_->chand_, calld_, this,
+ grpc_status_code_to_string(*status));
+ }
+ return false;
  }
- return false;
  }
  // Record the failure and check whether retries are throttled.
  // Note that it's important for this check to come after the status
@@ -1012,78 +1108,267 @@ bool RetryFilter::CallData::CallAttempt::BatchData::MaybeRetry(
  // things like failures due to malformed requests (INVALID_ARGUMENT).
  // Conversely, it's important for this to come before the remaining
  // checks, so that we don't fail to record failures due to other factors.
- if (calld->retry_throttle_data_ != nullptr &&
- !calld->retry_throttle_data_->RecordFailure()) {
+ if (calld_->retry_throttle_data_ != nullptr &&
+ !calld_->retry_throttle_data_->RecordFailure()) {
  if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
- gpr_log(GPR_INFO, "chand=%p calld=%p: retries throttled", calld->chand_,
- calld);
+ gpr_log(GPR_INFO, "chand=%p calld=%p attempt=%p: retries throttled",
+ calld_->chand_, calld_, this);
  }
  return false;
  }
  // Check whether the call is committed.
- if (calld->retry_committed_) {
+ if (calld_->retry_committed_) {
  if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
- gpr_log(GPR_INFO, "chand=%p calld=%p: retries already committed",
- calld->chand_, calld);
+ gpr_log(GPR_INFO,
+ "chand=%p calld=%p attempt=%p: retries already committed",
+ calld_->chand_, calld_, this);
  }
  return false;
  }
  // Check whether we have retries remaining.
- ++calld->num_attempts_completed_;
- if (calld->num_attempts_completed_ >= calld->retry_policy_->max_attempts()) {
+ ++calld_->num_attempts_completed_;
+ if (calld_->num_attempts_completed_ >=
+ calld_->retry_policy_->max_attempts()) {
  if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
- gpr_log(GPR_INFO, "chand=%p calld=%p: exceeded %d retry attempts",
- calld->chand_, calld, calld->retry_policy_->max_attempts());
+ gpr_log(
+ GPR_INFO, "chand=%p calld=%p attempt=%p: exceeded %d retry attempts",
+ calld_->chand_, calld_, this, calld_->retry_policy_->max_attempts());
  }
  return false;
  }
  // Check server push-back.
- grpc_millis server_pushback_ms = -1;
  if (server_pushback_md != nullptr) {
  // If the value is "-1" or any other unparseable string, we do not retry.
  uint32_t ms;
  if (!grpc_parse_slice_to_uint32(GRPC_MDVALUE(*server_pushback_md), &ms)) {
  if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
  gpr_log(GPR_INFO,
- "chand=%p calld=%p: not retrying due to server push-back",
- calld->chand_, calld);
+ "chand=%p calld=%p attempt=%p: not retrying due to server "
+ "push-back",
+ calld_->chand_, calld_, this);
  }
  return false;
  } else {
  if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
- gpr_log(GPR_INFO, "chand=%p calld=%p: server push-back: retry in %u ms",
- calld->chand_, calld, ms);
+ gpr_log(
+ GPR_INFO,
+ "chand=%p calld=%p attempt=%p: server push-back: retry in %u ms",
+ calld_->chand_, calld_, this, ms);
  }
- server_pushback_ms = static_cast<grpc_millis>(ms);
+ *server_pushback_ms = static_cast<grpc_millis>(ms);
  }
  }
- // Do retry.
- call_attempt_->retry_dispatched_ = true;
- calld->DoRetry(server_pushback_ms);
+ // Check with call dispatch controller.
+ auto* service_config_call_data = static_cast<ServiceConfigCallData*>(
+ calld_->call_context_[GRPC_CONTEXT_SERVICE_CONFIG_CALL_DATA].value);
+ if (!service_config_call_data->call_dispatch_controller()->ShouldRetry()) {
+ if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
+ gpr_log(
+ GPR_INFO,
+ "chand=%p calld=%p attempt=%p: call dispatch controller denied retry",
+ calld_->chand_, calld_, this);
+ }
+ return false;
+ }
+ // We should retry.
  return true;
  }
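
ShouldRetry() now owns the whole retry predicate: LB drops, missing policy, status and throttling checks, attempt limits, server push-back, and the new call dispatch controller veto. The policy fields it consults come from the method's service config. A minimal illustrative config follows (the service and method names are hypothetical; the field names follow gRPC's retry policy schema):

    // Illustrative service config carrying a retry policy. retryableStatusCodes
    // feeds the retryable_status_codes().Contains(*status) check above, and
    // maxAttempts feeds the num_attempts_completed_ check.
    constexpr char kRetryServiceConfig[] = R"json({
      "methodConfig": [{
        "name": [{ "service": "example.EchoService", "method": "Echo" }],
        "retryPolicy": {
          "maxAttempts": 4,
          "initialBackoff": "0.1s",
          "maxBackoff": "1s",
          "backoffMultiplier": 2,
          "retryableStatusCodes": ["UNAVAILABLE"]
        }
      }]
    })json";

Server push-back arrives as trailing metadata (the grpc-retry-pushback-ms header); as the parsing branch above shows, an unparseable value such as "-1" disables the retry rather than being treated as zero.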

+ void RetryFilter::CallData::CallAttempt::Abandon() {
+ abandoned_ = true;
+ // Unref batches for deferred completion callbacks that will now never
+ // be invoked.
+ if (started_recv_trailing_metadata_ &&
+ !seen_recv_trailing_metadata_from_surface_) {
+ recv_trailing_metadata_internal_batch_.reset(
+ DEBUG_LOCATION,
+ "internal recv_trailing_metadata completed before that op was "
+ "started from the surface");
+ }
+ GRPC_ERROR_UNREF(recv_trailing_metadata_error_);
+ recv_trailing_metadata_error_ = GRPC_ERROR_NONE;
+ recv_initial_metadata_ready_deferred_batch_.reset(
+ DEBUG_LOCATION,
+ "unref deferred recv_initial_metadata_ready batch due to retry");
+ GRPC_ERROR_UNREF(recv_initial_metadata_error_);
+ recv_initial_metadata_error_ = GRPC_ERROR_NONE;
+ recv_message_ready_deferred_batch_.reset(
+ DEBUG_LOCATION, "unref deferred recv_message_ready batch due to retry");
+ GRPC_ERROR_UNREF(recv_message_error_);
+ recv_message_error_ = GRPC_ERROR_NONE;
+ for (auto& on_complete_deferred_batch : on_complete_deferred_batches_) {
+ on_complete_deferred_batch.batch.reset(
+ DEBUG_LOCATION, "unref deferred on_complete batch due to retry");
+ GRPC_ERROR_UNREF(on_complete_deferred_batch.error);
+ }
+ on_complete_deferred_batches_.clear();
+ }
+
+ void RetryFilter::CallData::CallAttempt::OnPerAttemptRecvTimer(
+ void* arg, grpc_error_handle error) {
+ auto* call_attempt = static_cast<CallAttempt*>(arg);
+ GRPC_CLOSURE_INIT(&call_attempt->on_per_attempt_recv_timer_,
+ OnPerAttemptRecvTimerLocked, call_attempt, nullptr);
+ GRPC_CALL_COMBINER_START(call_attempt->calld_->call_combiner_,
+ &call_attempt->on_per_attempt_recv_timer_,
+ GRPC_ERROR_REF(error), "per-attempt timer fired");
+ }
+
+ void RetryFilter::CallData::CallAttempt::OnPerAttemptRecvTimerLocked(
+ void* arg, grpc_error_handle error) {
+ auto* call_attempt = static_cast<CallAttempt*>(arg);
+ auto* calld = call_attempt->calld_;
+ if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
+ gpr_log(GPR_INFO,
+ "chand=%p calld=%p attempt=%p: perAttemptRecvTimeout timer fired: "
+ "error=%s, per_attempt_recv_timer_pending_=%d",
+ calld->chand_, calld, call_attempt,
+ grpc_error_std_string(error).c_str(),
+ call_attempt->per_attempt_recv_timer_pending_);
+ }
+ CallCombinerClosureList closures;
+ if (error == GRPC_ERROR_NONE &&
+ call_attempt->per_attempt_recv_timer_pending_) {
+ call_attempt->per_attempt_recv_timer_pending_ = false;
+ // Cancel this attempt.
+ // TODO(roth): When implementing hedging, we should not cancel the
+ // current attempt.
+ call_attempt->AddBatchForCancelOp(
+ grpc_error_set_int(GRPC_ERROR_CREATE_FROM_STATIC_STRING(
+ "retry perAttemptRecvTimeout exceeded"),
+ GRPC_ERROR_INT_GRPC_STATUS, GRPC_STATUS_CANCELLED),
+ &closures);
+ // Check whether we should retry.
+ if (call_attempt->ShouldRetry(
+ /*status=*/absl::nullopt, /*is_lb_drop=*/false,
+ /*server_pushback_md=*/nullptr, /*server_pushback_ms=*/nullptr)) {
+ // Mark current attempt as abandoned.
+ call_attempt->Abandon();
+ // We are retrying. Start backoff timer.
+ calld->StartRetryTimer(/*server_pushback_ms=*/-1);
+ } else {
+ // Not retrying, so commit the call.
+ calld->RetryCommit(call_attempt);
+ // If retry state is no longer needed, switch to fast path for
+ // subsequent batches.
+ call_attempt->MaybeSwitchToFastPath();
+ }
+ }
+ closures.RunClosures(calld->call_combiner_);
+ call_attempt->Unref(DEBUG_LOCATION, "OnPerAttemptRecvTimer");
+ GRPC_CALL_STACK_UNREF(calld->owning_call_, "OnPerAttemptRecvTimer");
+ }
+
+ void RetryFilter::CallData::CallAttempt::MaybeCancelPerAttemptRecvTimer() {
+ if (per_attempt_recv_timer_pending_) {
+ if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
+ gpr_log(GPR_INFO,
+ "chand=%p calld=%p attempt=%p: cancelling "
+ "perAttemptRecvTimeout timer",
+ calld_->chand_, calld_, this);
+ }
+ per_attempt_recv_timer_pending_ = false;
+ grpc_timer_cancel(&per_attempt_recv_timer_);
+ }
+ }
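
Both the timer callback and MaybeCancelPerAttemptRecvTimer() gate on per_attempt_recv_timer_pending_, and both run under the call combiner, so whichever side clears the flag first wins and the loser becomes a no-op; grpc_timer_cancel() alone could not give that guarantee, since the timer may already be in flight when cancellation is requested. A standalone sketch of that check-and-clear guard (a simplified stand-in with no gRPC types; a plain bool suffices because, as in the real code, both paths are serialized):

    #include <functional>
    #include <utility>

    class OneShotGuard {
     public:
      explicit OneShotGuard(std::function<void()> on_fire)
          : on_fire_(std::move(on_fire)) {}
      void Fire() {             // timer-fired path
        if (!pending_) return;  // already cancelled: no-op
        pending_ = false;
        on_fire_();
      }
      void Cancel() {           // cancel path
        pending_ = false;       // a late Fire() will now do nothing
      }
     private:
      bool pending_ = true;
      std::function<void()> on_fire_;
    };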
+
+ //
+ // RetryFilter::CallData::CallAttempt::BatchData
+ //
+
+ RetryFilter::CallData::CallAttempt::BatchData::BatchData(
+ RefCountedPtr<CallAttempt> attempt, int refcount, bool set_on_complete)
+ : RefCounted(
+ GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace) ? "BatchData" : nullptr,
+ refcount),
+ call_attempt_(std::move(attempt)) {
+ if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
+ gpr_log(GPR_INFO, "chand=%p calld=%p attempt=%p: creating batch %p",
+ call_attempt_->calld_->chand_, call_attempt_->calld_,
+ call_attempt_.get(), this);
+ }
+ // We hold a ref to the call stack for every batch sent on a call attempt.
+ // This is because some batches on the call attempt may not complete
+ // until after all of the batches are completed at the surface (because
+ // each batch that is pending at the surface holds a ref). This
+ // can happen for replayed send ops, and it can happen for
+ // recv_initial_metadata and recv_message ops on a call attempt that has
+ // been abandoned.
+ GRPC_CALL_STACK_REF(call_attempt_->calld_->owning_call_, "Retry BatchData");
+ batch_.payload = &call_attempt_->batch_payload_;
+ if (set_on_complete) {
+ GRPC_CLOSURE_INIT(&on_complete_, OnComplete, this, nullptr);
+ batch_.on_complete = &on_complete_;
+ }
+ }
+
+ RetryFilter::CallData::CallAttempt::BatchData::~BatchData() {
+ if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
+ gpr_log(GPR_INFO, "chand=%p calld=%p attempt=%p: destroying batch %p",
+ call_attempt_->calld_->chand_, call_attempt_->calld_,
+ call_attempt_.get(), this);
+ }
+ if (batch_.send_initial_metadata) {
+ grpc_metadata_batch_destroy(&call_attempt_->send_initial_metadata_);
+ }
+ if (batch_.send_trailing_metadata) {
+ grpc_metadata_batch_destroy(&call_attempt_->send_trailing_metadata_);
+ }
+ if (batch_.recv_initial_metadata) {
+ grpc_metadata_batch_destroy(&call_attempt_->recv_initial_metadata_);
+ }
+ if (batch_.recv_trailing_metadata) {
+ grpc_metadata_batch_destroy(&call_attempt_->recv_trailing_metadata_);
+ }
+ GRPC_CALL_STACK_UNREF(call_attempt_->calld_->owning_call_, "Retry BatchData");
+ call_attempt_.reset(DEBUG_LOCATION, "~BatchData");
+ }
+
+ void RetryFilter::CallData::CallAttempt::BatchData::
+ FreeCachedSendOpDataForCompletedBatch() {
+ auto* calld = call_attempt_->calld_;
+ // TODO(roth): When we implement hedging, this logic will need to get
+ // a bit more complex, because there may be other (now abandoned) call
+ // attempts still using this data. We may need to do some sort of
+ // ref-counting instead.
+ if (batch_.send_initial_metadata) {
+ calld->FreeCachedSendInitialMetadata();
+ }
+ if (batch_.send_message) {
+ calld->FreeCachedSendMessage(call_attempt_->completed_send_message_count_ -
+ 1);
+ }
+ if (batch_.send_trailing_metadata) {
+ calld->FreeCachedSendTrailingMetadata();
+ }
+ }
+
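
The constructor/destructor pair above pins two things for the life of a batch: the owning CallAttempt (via the RefCountedPtr member) and the whole call stack (via GRPC_CALL_STACK_REF), so a transport callback arriving late, even on an abandoned attempt, never touches freed memory. A minimal sketch of that keep-alive chain using generic shared_ptr stand-ins (not gRPC's RefCounted machinery):

    #include <memory>
    #include <utility>

    struct CallStack { /* owns call-wide resources */ };
    struct Attempt {
      std::shared_ptr<CallStack> owning_call;  // attempt keeps the call alive
    };

    // While a batch is in flight it holds both the attempt and the call
    // stack, mirroring BatchData's ctor; members are destroyed in reverse
    // declaration order, so the call-stack ref is dropped before the attempt
    // ref, mirroring ~BatchData().
    struct Batch {
      explicit Batch(std::shared_ptr<Attempt> a)
          : attempt(std::move(a)), call_keepalive(attempt->owning_call) {}
      std::shared_ptr<Attempt> attempt;
      std::shared_ptr<CallStack> call_keepalive;
    };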
  //
  // recv_initial_metadata callback handling
  //

  void RetryFilter::CallData::CallAttempt::BatchData::
- InvokeRecvInitialMetadataCallback(void* arg, grpc_error_handle error) {
- auto* batch_data = static_cast<CallAttempt::BatchData*>(arg);
- auto* call_attempt = batch_data->call_attempt_.get();
+ MaybeAddClosureForRecvInitialMetadataCallback(
+ grpc_error_handle error, CallCombinerClosureList* closures) {
  // Find pending batch.
- PendingBatch* pending = call_attempt->calld_->PendingBatchFind(
+ PendingBatch* pending = call_attempt_->calld_->PendingBatchFind(
  "invoking recv_initial_metadata_ready for",
  [](grpc_transport_stream_op_batch* batch) {
  return batch->recv_initial_metadata &&
  batch->payload->recv_initial_metadata
  .recv_initial_metadata_ready != nullptr;
  });
- GPR_ASSERT(pending != nullptr);
+ if (pending == nullptr) {
+ GRPC_ERROR_UNREF(error);
+ return;
+ }
  // Return metadata.
  grpc_metadata_batch_move(
- &call_attempt->recv_initial_metadata_,
+ &call_attempt_->recv_initial_metadata_,
  pending->batch->payload->recv_initial_metadata.recv_initial_metadata);
+ // Propagate trailing_metadata_available.
+ *pending->batch->payload->recv_initial_metadata.trailing_metadata_available =
+ call_attempt_->trailing_metadata_available_;
  // Update bookkeeping.
  // Note: Need to do this before invoking the callback, since invoking
  // the callback will result in yielding the call combiner.
@@ -1092,33 +1377,36 @@ void RetryFilter::CallData::CallAttempt::BatchData::
  .recv_initial_metadata_ready;
  pending->batch->payload->recv_initial_metadata.recv_initial_metadata_ready =
  nullptr;
- call_attempt->calld_->MaybeClearPendingBatch(pending);
- batch_data->Unref();
- // Invoke callback.
- Closure::Run(DEBUG_LOCATION, recv_initial_metadata_ready,
- GRPC_ERROR_REF(error));
+ call_attempt_->calld_->MaybeClearPendingBatch(pending);
+ // Add callback to closures.
+ closures->Add(recv_initial_metadata_ready, error,
+ "recv_initial_metadata_ready for pending batch");
  }

  void RetryFilter::CallData::CallAttempt::BatchData::RecvInitialMetadataReady(
  void* arg, grpc_error_handle error) {
- CallAttempt::BatchData* batch_data =
- static_cast<CallAttempt::BatchData*>(arg);
+ RefCountedPtr<BatchData> batch_data(static_cast<BatchData*>(arg));
  CallAttempt* call_attempt = batch_data->call_attempt_.get();
  CallData* calld = call_attempt->calld_;
  if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
  gpr_log(GPR_INFO,
- "chand=%p calld=%p: got recv_initial_metadata_ready, error=%s",
- calld->chand_, calld, grpc_error_std_string(error).c_str());
+ "chand=%p calld=%p attempt=%p batch_data=%p: "
+ "got recv_initial_metadata_ready, error=%s",
+ calld->chand_, calld, call_attempt, batch_data.get(),
+ grpc_error_std_string(error).c_str());
  }
  call_attempt->completed_recv_initial_metadata_ = true;
- // If a retry was already dispatched, then we're not going to use the
+ // If this attempt has been abandoned, then we're not going to use the
  // result of this recv_initial_metadata op, so do nothing.
- if (call_attempt->retry_dispatched_) {
+ if (call_attempt->abandoned_) {
  GRPC_CALL_COMBINER_STOP(
  calld->call_combiner_,
- "recv_initial_metadata_ready after retry dispatched");
+ "recv_initial_metadata_ready for abandoned attempt");
  return;
  }
+ // Cancel per-attempt recv timer, if any.
+ call_attempt->MaybeCancelPerAttemptRecvTimer();
+ // If we're not committed, check the response to see if we need to commit.
  if (!calld->retry_committed_) {
  // If we got an error or a Trailers-Only response and have not yet gotten
  // the recv_trailing_metadata_ready callback, then defer propagating this
@@ -1129,82 +1417,94 @@ void RetryFilter::CallData::CallAttempt::BatchData::RecvInitialMetadataReady(
  !call_attempt->completed_recv_trailing_metadata_)) {
  if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
  gpr_log(GPR_INFO,
- "chand=%p calld=%p: deferring recv_initial_metadata_ready "
- "(Trailers-Only)",
- calld->chand_, calld);
+ "chand=%p calld=%p attempt=%p: deferring "
+ "recv_initial_metadata_ready (Trailers-Only)",
+ calld->chand_, calld, call_attempt);
  }
- call_attempt->recv_initial_metadata_ready_deferred_batch_ = batch_data;
+ call_attempt->recv_initial_metadata_ready_deferred_batch_ =
+ std::move(batch_data);
  call_attempt->recv_initial_metadata_error_ = GRPC_ERROR_REF(error);
+ CallCombinerClosureList closures;
+ if (error != GRPC_ERROR_NONE) {
+ call_attempt->AddBatchForCancelOp(GRPC_ERROR_REF(error), &closures);
+ }
  if (!call_attempt->started_recv_trailing_metadata_) {
  // recv_trailing_metadata not yet started by application; start it
  // ourselves to get status.
- call_attempt->StartInternalRecvTrailingMetadata();
- } else {
- GRPC_CALL_COMBINER_STOP(
- calld->call_combiner_,
- "recv_initial_metadata_ready trailers-only or error");
+ call_attempt->AddBatchForInternalRecvTrailingMetadata(&closures);
  }
+ closures.RunClosures(calld->call_combiner_);
  return;
  }
  // Received valid initial metadata, so commit the call.
  calld->RetryCommit(call_attempt);
+ // If retry state is no longer needed, switch to fast path for
+ // subsequent batches.
+ call_attempt->MaybeSwitchToFastPath();
  }
  // Invoke the callback to return the result to the surface.
- // Manually invoking a callback function; it does not take ownership of error.
- InvokeRecvInitialMetadataCallback(batch_data, error);
+ CallCombinerClosureList closures;
+ batch_data->MaybeAddClosureForRecvInitialMetadataCallback(
+ GRPC_ERROR_REF(error), &closures);
+ closures.RunClosures(calld->call_combiner_);
  }
1451
 
1157
1452
  //
1158
1453
  // recv_message callback handling
1159
1454
  //
1160
1455
 
1161
- void RetryFilter::CallData::CallAttempt::BatchData::InvokeRecvMessageCallback(
1162
- void* arg, grpc_error_handle error) {
1163
- CallAttempt::BatchData* batch_data =
1164
- static_cast<CallAttempt::BatchData*>(arg);
1165
- CallAttempt* call_attempt = batch_data->call_attempt_.get();
1166
- CallData* calld = call_attempt->calld_;
1456
+ void RetryFilter::CallData::CallAttempt::BatchData::
1457
+ MaybeAddClosureForRecvMessageCallback(grpc_error_handle error,
1458
+ CallCombinerClosureList* closures) {
1167
1459
  // Find pending op.
1168
- PendingBatch* pending = calld->PendingBatchFind(
1460
+ PendingBatch* pending = call_attempt_->calld_->PendingBatchFind(
1169
1461
  "invoking recv_message_ready for",
1170
1462
  [](grpc_transport_stream_op_batch* batch) {
1171
1463
  return batch->recv_message &&
1172
1464
  batch->payload->recv_message.recv_message_ready != nullptr;
1173
1465
  });
1174
- GPR_ASSERT(pending != nullptr);
1466
+ if (pending == nullptr) {
1467
+ GRPC_ERROR_UNREF(error);
1468
+ return;
1469
+ }
1175
1470
  // Return payload.
1176
1471
  *pending->batch->payload->recv_message.recv_message =
1177
- std::move(call_attempt->recv_message_);
1472
+ std::move(call_attempt_->recv_message_);
1178
1473
  // Update bookkeeping.
1179
1474
  // Note: Need to do this before invoking the callback, since invoking
1180
1475
  // the callback will result in yielding the call combiner.
1181
1476
  grpc_closure* recv_message_ready =
1182
1477
  pending->batch->payload->recv_message.recv_message_ready;
1183
1478
  pending->batch->payload->recv_message.recv_message_ready = nullptr;
1184
- calld->MaybeClearPendingBatch(pending);
1185
- batch_data->Unref();
1186
- // Invoke callback.
1187
- Closure::Run(DEBUG_LOCATION, recv_message_ready, GRPC_ERROR_REF(error));
1479
+ call_attempt_->calld_->MaybeClearPendingBatch(pending);
1480
+ // Add callback to closures.
1481
+ closures->Add(recv_message_ready, error,
1482
+ "recv_message_ready for pending batch");
1188
1483
  }
1189
1484
 
1190
1485
  void RetryFilter::CallData::CallAttempt::BatchData::RecvMessageReady(
1191
1486
  void* arg, grpc_error_handle error) {
1192
- CallAttempt::BatchData* batch_data =
1193
- static_cast<CallAttempt::BatchData*>(arg);
1487
+ RefCountedPtr<BatchData> batch_data(static_cast<BatchData*>(arg));
1194
1488
  CallAttempt* call_attempt = batch_data->call_attempt_.get();
1195
1489
  CallData* calld = call_attempt->calld_;
1196
1490
  if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
1197
- gpr_log(GPR_INFO, "chand=%p calld=%p: got recv_message_ready, error=%s",
1198
- calld->chand_, calld, grpc_error_std_string(error).c_str());
1491
+ gpr_log(GPR_INFO,
1492
+ "chand=%p calld=%p attempt=%p batch_data=%p: "
1493
+ "got recv_message_ready, error=%s",
1494
+ calld->chand_, calld, call_attempt, batch_data.get(),
1495
+ grpc_error_std_string(error).c_str());
1199
1496
  }
1200
1497
  ++call_attempt->completed_recv_message_count_;
1201
- // If a retry was already dispatched, then we're not going to use the
1498
+ // If this attempt has been abandoned, then we're not going to use the
1202
1499
  // result of this recv_message op, so do nothing.
1203
- if (call_attempt->retry_dispatched_) {
1500
+ if (call_attempt->abandoned_) {
1204
1501
  GRPC_CALL_COMBINER_STOP(calld->call_combiner_,
1205
- "recv_message_ready after retry dispatched");
1502
+ "recv_message_ready for abandoned attempt");
1206
1503
  return;
1207
1504
  }
1505
+ // Cancel per-attempt recv timer, if any.
1506
+ call_attempt->MaybeCancelPerAttemptRecvTimer();
1507
+ // If we're not committed, check the response to see if we need to commit.
1208
1508
  if (!calld->retry_committed_) {
1209
1509
  // If we got an error or the payload was nullptr and we have not yet gotten
1210
1510
  // the recv_trailing_metadata_ready callback, then defer propagating this
@@ -1215,28 +1515,35 @@ void RetryFilter::CallData::CallAttempt::BatchData::RecvMessageReady(
1215
1515
  !call_attempt->completed_recv_trailing_metadata_)) {
1216
1516
  if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
1217
1517
  gpr_log(GPR_INFO,
1218
- "chand=%p calld=%p: deferring recv_message_ready (nullptr "
1219
- "message and recv_trailing_metadata pending)",
1220
- calld->chand_, calld);
1518
+ "chand=%p calld=%p attempt=%p: deferring recv_message_ready "
1519
+ "(nullptr message and recv_trailing_metadata pending)",
1520
+ calld->chand_, calld, call_attempt);
1221
1521
  }
1222
- call_attempt->recv_message_ready_deferred_batch_ = batch_data;
1522
+ call_attempt->recv_message_ready_deferred_batch_ = std::move(batch_data);
1223
1523
  call_attempt->recv_message_error_ = GRPC_ERROR_REF(error);
1524
+ CallCombinerClosureList closures;
1525
+ if (error != GRPC_ERROR_NONE) {
1526
+ call_attempt->AddBatchForCancelOp(GRPC_ERROR_REF(error), &closures);
1527
+ }
1224
1528
  if (!call_attempt->started_recv_trailing_metadata_) {
1225
1529
  // recv_trailing_metadata not yet started by application; start it
1226
1530
  // ourselves to get status.
1227
- call_attempt->StartInternalRecvTrailingMetadata();
1228
- } else {
1229
- GRPC_CALL_COMBINER_STOP(calld->call_combiner_,
1230
- "recv_message_ready null");
1531
+ call_attempt->AddBatchForInternalRecvTrailingMetadata(&closures);
1231
1532
  }
1533
+ closures.RunClosures(calld->call_combiner_);
1232
1534
  return;
1233
1535
  }
1234
1536
  // Received a valid message, so commit the call.
1235
1537
  calld->RetryCommit(call_attempt);
1538
+ // If retry state is no longer needed, switch to fast path for
1539
+ // subsequent batches.
1540
+ call_attempt->MaybeSwitchToFastPath();
1236
1541
  }
1237
1542
  // Invoke the callback to return the result to the surface.
1238
- // Manually invoking a callback function; it does not take ownership of error.
1239
- InvokeRecvMessageCallback(batch_data, error);
1543
+ CallCombinerClosureList closures;
1544
+ batch_data->MaybeAddClosureForRecvMessageCallback(GRPC_ERROR_REF(error),
1545
+ &closures);
1546
+ closures.RunClosures(calld->call_combiner_);
1240
1547
  }
1241
1548
 
1242
1549
  //
@@ -1271,23 +1578,28 @@ void GetCallStatus(grpc_millis deadline, grpc_metadata_batch* md_batch,
1271
1578
  } // namespace
1272
1579
 
1273
1580
  void RetryFilter::CallData::CallAttempt::BatchData::
1274
- AddClosureForRecvTrailingMetadataReady(grpc_error_handle error,
1275
- CallCombinerClosureList* closures) {
1581
+ MaybeAddClosureForRecvTrailingMetadataReady(
1582
+ grpc_error_handle error, CallCombinerClosureList* closures) {
1276
1583
  auto* calld = call_attempt_->calld_;
1277
1584
  // Find pending batch.
1278
1585
  PendingBatch* pending = calld->PendingBatchFind(
1279
- "invoking recv_trailing_metadata for",
1586
+ "invoking recv_trailing_metadata_ready for",
1280
1587
  [](grpc_transport_stream_op_batch* batch) {
1281
1588
  return batch->recv_trailing_metadata &&
1282
1589
  batch->payload->recv_trailing_metadata
1283
1590
  .recv_trailing_metadata_ready != nullptr;
1284
1591
  });
1285
1592
  // If we generated the recv_trailing_metadata op internally via
1286
- // StartInternalRecvTrailingMetadata(), then there will be no pending batch.
1593
+ // AddBatchForInternalRecvTrailingMetadata(), then there will be no
1594
+ // pending batch.
1287
1595
  if (pending == nullptr) {
1288
- GRPC_ERROR_UNREF(error);
1596
+ call_attempt_->recv_trailing_metadata_error_ = error;
1289
1597
  return;
1290
1598
  }
1599
+ // Copy transport stats to be delivered up to the surface.
1600
+ grpc_transport_move_stats(
1601
+ &call_attempt_->collect_stats_,
1602
+ pending->batch->payload->recv_trailing_metadata.collect_stats);
1291
1603
  // Return metadata.
1292
1604
  grpc_metadata_batch_move(
1293
1605
  &call_attempt_->recv_trailing_metadata_,
@@ -1303,35 +1615,34 @@ void RetryFilter::CallData::CallAttempt::BatchData::
1303
1615
  }
1304
1616
 
1305
1617
  void RetryFilter::CallData::CallAttempt::BatchData::
1306
- AddClosuresForDeferredRecvCallbacks(CallCombinerClosureList* closures) {
1307
- if (batch_.recv_trailing_metadata) {
1308
- // Add closure for deferred recv_initial_metadata_ready.
1309
- if (GPR_UNLIKELY(
1310
- call_attempt_->recv_initial_metadata_ready_deferred_batch_ !=
1311
- nullptr)) {
1312
- GRPC_CLOSURE_INIT(
1313
- &call_attempt_->recv_initial_metadata_ready_,
1314
- InvokeRecvInitialMetadataCallback,
1315
- call_attempt_->recv_initial_metadata_ready_deferred_batch_,
1316
- grpc_schedule_on_exec_ctx);
1317
- closures->Add(&call_attempt_->recv_initial_metadata_ready_,
1318
- call_attempt_->recv_initial_metadata_error_,
1319
- "resuming recv_initial_metadata_ready");
1320
- call_attempt_->recv_initial_metadata_ready_deferred_batch_ = nullptr;
1321
- }
1322
- // Add closure for deferred recv_message_ready.
1323
- if (GPR_UNLIKELY(call_attempt_->recv_message_ready_deferred_batch_ !=
1324
- nullptr)) {
1325
- GRPC_CLOSURE_INIT(&call_attempt_->recv_message_ready_,
1326
- InvokeRecvMessageCallback,
1327
- call_attempt_->recv_message_ready_deferred_batch_,
1328
- grpc_schedule_on_exec_ctx);
1329
- closures->Add(&call_attempt_->recv_message_ready_,
1330
- call_attempt_->recv_message_error_,
1331
- "resuming recv_message_ready");
1332
- call_attempt_->recv_message_ready_deferred_batch_ = nullptr;
1333
- }
1334
- }
1618
+ AddClosuresForDeferredCompletionCallbacks(
1619
+ CallCombinerClosureList* closures) {
1620
+ // Add closure for deferred recv_initial_metadata_ready.
1621
+ if (GPR_UNLIKELY(call_attempt_->recv_initial_metadata_ready_deferred_batch_ !=
1622
+ nullptr)) {
1623
+ MaybeAddClosureForRecvInitialMetadataCallback(
1624
+ call_attempt_->recv_initial_metadata_error_, closures);
1625
+ call_attempt_->recv_initial_metadata_ready_deferred_batch_.reset(
1626
+ DEBUG_LOCATION, "resuming deferred recv_initial_metadata_ready");
1627
+ call_attempt_->recv_initial_metadata_error_ = GRPC_ERROR_NONE;
1628
+ }
1629
+ // Add closure for deferred recv_message_ready.
1630
+ if (GPR_UNLIKELY(call_attempt_->recv_message_ready_deferred_batch_ !=
1631
+ nullptr)) {
1632
+ MaybeAddClosureForRecvMessageCallback(call_attempt_->recv_message_error_,
1633
+ closures);
1634
+ call_attempt_->recv_message_ready_deferred_batch_.reset(
1635
+ DEBUG_LOCATION, "resuming deferred recv_message_ready");
1636
+ call_attempt_->recv_message_error_ = GRPC_ERROR_NONE;
1637
+ }
1638
+ // Add closures for deferred on_complete callbacks.
1639
+ for (auto& on_complete_deferred_batch :
1640
+ call_attempt_->on_complete_deferred_batches_) {
1641
+ closures->Add(&on_complete_deferred_batch.batch->on_complete_,
1642
+ on_complete_deferred_batch.error, "resuming on_complete");
1643
+ on_complete_deferred_batch.batch.release();
1644
+ }
1645
+ call_attempt_->on_complete_deferred_batches_.clear();
1335
1646
  }
1336
1647
 
1337
1648
  void RetryFilter::CallData::CallAttempt::BatchData::
@@ -1340,13 +1651,8 @@ void RetryFilter::CallData::CallAttempt::BatchData::
1340
1651
  auto* calld = call_attempt_->calld_;
1341
1652
  for (size_t i = 0; i < GPR_ARRAY_SIZE(calld->pending_batches_); ++i) {
1342
1653
  PendingBatch* pending = &calld->pending_batches_[i];
1343
- if (call_attempt_->PendingBatchIsUnstarted(pending)) {
1344
- if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
1345
- gpr_log(GPR_INFO,
1346
- "chand=%p calld=%p: failing unstarted pending batch at "
1347
- "index %" PRIuPTR,
1348
- calld->chand_, calld, i);
1349
- }
1654
+ if (pending->batch == nullptr) continue;
1655
+ if (call_attempt_->PendingBatchContainsUnstartedSendOps(pending)) {
1350
1656
  closures->Add(pending->batch->on_complete, GRPC_ERROR_REF(error),
1351
1657
  "failing on_complete for pending batch");
1352
1658
  pending->batch->on_complete = nullptr;
@@ -1361,32 +1667,40 @@ void RetryFilter::CallData::CallAttempt::BatchData::RunClosuresForCompletedCall(
1361
1667
  // Construct list of closures to execute.
1362
1668
  CallCombinerClosureList closures;
1363
1669
  // First, add closure for recv_trailing_metadata_ready.
1364
- AddClosureForRecvTrailingMetadataReady(GRPC_ERROR_REF(error), &closures);
1365
- // If there are deferred recv_initial_metadata_ready or recv_message_ready
1366
- // callbacks, add them to closures.
1367
- AddClosuresForDeferredRecvCallbacks(&closures);
1670
+ MaybeAddClosureForRecvTrailingMetadataReady(GRPC_ERROR_REF(error), &closures);
1671
+ // If there are deferred batch completion callbacks, add them to closures.
1672
+ AddClosuresForDeferredCompletionCallbacks(&closures);
1368
1673
  // Add closures to fail any pending batches that have not yet been started.
1369
1674
  AddClosuresToFailUnstartedPendingBatches(GRPC_ERROR_REF(error), &closures);
1370
1675
  // Schedule all of the closures identified above.
1371
1676
  // Note: This will release the call combiner.
1372
1677
  closures.RunClosures(call_attempt_->calld_->call_combiner_);
1373
- // Don't need batch_data anymore.
1374
- Unref();
1375
1678
  GRPC_ERROR_UNREF(error);
1376
1679
  }
1377
1680
 
1378
1681
  void RetryFilter::CallData::CallAttempt::BatchData::RecvTrailingMetadataReady(
1379
1682
  void* arg, grpc_error_handle error) {
1380
- CallAttempt::BatchData* batch_data =
1381
- static_cast<CallAttempt::BatchData*>(arg);
1683
+ RefCountedPtr<BatchData> batch_data(static_cast<BatchData*>(arg));
1382
1684
  CallAttempt* call_attempt = batch_data->call_attempt_.get();
1383
1685
  CallData* calld = call_attempt->calld_;
1384
1686
  if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
1385
1687
  gpr_log(GPR_INFO,
1386
- "chand=%p calld=%p: got recv_trailing_metadata_ready, error=%s",
1387
- calld->chand_, calld, grpc_error_std_string(error).c_str());
1688
+ "chand=%p calld=%p attempt=%p batch_data=%p: "
1689
+ "got recv_trailing_metadata_ready, error=%s",
1690
+ calld->chand_, calld, call_attempt, batch_data.get(),
1691
+ grpc_error_std_string(error).c_str());
1388
1692
  }
1389
1693
  call_attempt->completed_recv_trailing_metadata_ = true;
1694
+ // If this attempt has been abandoned, then we're not going to use the
1695
+ // result of this recv_trailing_metadata op, so do nothing.
1696
+ if (call_attempt->abandoned_) {
1697
+ GRPC_CALL_COMBINER_STOP(
1698
+ calld->call_combiner_,
1699
+ "recv_trailing_metadata_ready for abandoned attempt");
1700
+ return;
1701
+ }
1702
+ // Cancel per-attempt recv timer, if any.
1703
+ call_attempt->MaybeCancelPerAttemptRecvTimer();
1390
1704
  // Get the call's status and check for server pushback metadata.
1391
1705
  grpc_status_code status = GRPC_STATUS_OK;
1392
1706
  grpc_mdelem* server_pushback_md = nullptr;
@@ -1397,26 +1711,37 @@ void RetryFilter::CallData::CallAttempt::BatchData::RecvTrailingMetadataReady(
1397
1711
  &server_pushback_md, &is_lb_drop);
1398
1712
  if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
1399
1713
  gpr_log(
1400
- GPR_INFO, "chand=%p calld=%p: call finished, status=%s is_lb_drop=%d",
1401
- calld->chand_, calld, grpc_status_code_to_string(status), is_lb_drop);
1714
+ GPR_INFO,
1715
+ "chand=%p calld=%p attempt=%p: call finished, status=%s is_lb_drop=%d",
1716
+ calld->chand_, calld, call_attempt, grpc_status_code_to_string(status),
1717
+ is_lb_drop);
1402
1718
  }
1403
1719
  // Check if we should retry.
1404
- if (batch_data->MaybeRetry(status, server_pushback_md, is_lb_drop)) {
1405
- // Unref batch_data for deferred recv_initial_metadata_ready or
1406
- // recv_message_ready callbacks, if any.
1407
- if (call_attempt->recv_initial_metadata_ready_deferred_batch_ != nullptr) {
1408
- GRPC_ERROR_UNREF(call_attempt->recv_initial_metadata_error_);
1409
- batch_data->Unref();
1410
- }
1411
- if (call_attempt->recv_message_ready_deferred_batch_ != nullptr) {
1412
- GRPC_ERROR_UNREF(call_attempt->recv_message_error_);
1413
- batch_data->Unref();
1414
- }
1415
- batch_data->Unref();
1720
+ grpc_millis server_pushback_ms = -1;
1721
+ if (call_attempt->ShouldRetry(status, is_lb_drop, server_pushback_md,
1722
+ &server_pushback_ms)) {
1723
+ // Start retry timer.
1724
+ calld->StartRetryTimer(server_pushback_ms);
1725
+ // Cancel call attempt.
1726
+ CallCombinerClosureList closures;
1727
+ call_attempt->AddBatchForCancelOp(
1728
+ error == GRPC_ERROR_NONE
1729
+ ? grpc_error_set_int(
1730
+ GRPC_ERROR_CREATE_FROM_STATIC_STRING("call attempt failed"),
1731
+ GRPC_ERROR_INT_GRPC_STATUS, GRPC_STATUS_CANCELLED)
1732
+ : GRPC_ERROR_REF(error),
1733
+ &closures);
1734
+ // Record that this attempt has been abandoned.
1735
+ call_attempt->Abandon();
1736
+ // Yields call combiner.
1737
+ closures.RunClosures(calld->call_combiner_);
1416
1738
  return;
1417
1739
  }
1418
1740
  // Not retrying, so commit the call.
1419
1741
  calld->RetryCommit(call_attempt);
1742
+ // If retry state is no longer needed, switch to fast path for
1743
+ // subsequent batches.
1744
+ call_attempt->MaybeSwitchToFastPath();
1420
1745
  // Run any necessary closures.
1421
1746
  batch_data->RunClosuresForCompletedCall(GRPC_ERROR_REF(error));
1422
1747
  }
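
When ShouldRetry() says yes, StartRetryTimer() (a CallData method outside this hunk) schedules the next attempt: an explicit server push-back delay wins, otherwise the policy's exponential backoff applies. A sketch of that delay selection under the gRPC retry design, where the n-th retry backs off by initialBackoff * backoffMultiplier^(n-1) capped at maxBackoff and the actual delay is drawn uniformly from [0, backoff) (an illustrative helper, not the filter's code):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <random>

    int64_t NextRetryDelayMs(int attempt /*1-based retry number*/,
                             int64_t initial_backoff_ms, int64_t max_backoff_ms,
                             double multiplier,
                             int64_t server_pushback_ms /* -1 if absent */) {
      // A parseable push-back value overrides the computed backoff entirely.
      if (server_pushback_ms >= 0) return server_pushback_ms;
      double backoff = initial_backoff_ms * std::pow(multiplier, attempt - 1);
      backoff = std::min(backoff, static_cast<double>(max_backoff_ms));
      // Full jitter: the delay is sampled from [0, backoff).
      static thread_local std::mt19937_64 rng{std::random_device{}()};
      return static_cast<int64_t>(
          std::uniform_real_distribution<double>(0.0, backoff)(rng));
    }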
@@ -1444,6 +1769,11 @@ void RetryFilter::CallData::CallAttempt::BatchData::
1444
1769
  GRPC_ERROR_UNREF(error);
1445
1770
  return;
1446
1771
  }
1772
+ // Propagate payload.
1773
+ if (batch_.send_message) {
1774
+ pending->batch->payload->send_message.stream_write_closed =
1775
+ batch_.payload->send_message.stream_write_closed;
1776
+ }
1447
1777
  // Add closure.
1448
1778
  closures->Add(pending->batch->on_complete, error,
1449
1779
  "on_complete for pending batch");
@@ -1454,31 +1784,27 @@ void RetryFilter::CallData::CallAttempt::BatchData::
1454
1784
  void RetryFilter::CallData::CallAttempt::BatchData::
1455
1785
  AddClosuresForReplayOrPendingSendOps(CallCombinerClosureList* closures) {
1456
1786
  auto* calld = call_attempt_->calld_;
1787
+ bool have_pending_send_ops = call_attempt_->HaveSendOpsToReplay();
1457
1788
  // We don't check send_initial_metadata here, because that op will always
1458
1789
  // be started as soon as it is received from the surface, so it will
1459
1790
  // never need to be started at this point.
1460
- bool have_pending_send_message_ops =
1461
- call_attempt_->started_send_message_count_ < calld->send_messages_.size();
1462
- bool have_pending_send_trailing_metadata_op =
1463
- calld->seen_send_trailing_metadata_ &&
1464
- !call_attempt_->started_send_trailing_metadata_;
1465
- if (!have_pending_send_message_ops &&
1466
- !have_pending_send_trailing_metadata_op) {
1791
+ if (!have_pending_send_ops) {
1467
1792
  for (size_t i = 0; i < GPR_ARRAY_SIZE(calld->pending_batches_); ++i) {
1468
1793
  PendingBatch* pending = &calld->pending_batches_[i];
1469
1794
  grpc_transport_stream_op_batch* batch = pending->batch;
1470
1795
  if (batch == nullptr || pending->send_ops_cached) continue;
1471
- if (batch->send_message) have_pending_send_message_ops = true;
1472
- if (batch->send_trailing_metadata) {
1473
- have_pending_send_trailing_metadata_op = true;
1796
+ if (batch->send_message || batch->send_trailing_metadata) {
1797
+ have_pending_send_ops = true;
1798
+ break;
1474
1799
  }
1475
1800
  }
1476
1801
  }
1477
- if (have_pending_send_message_ops || have_pending_send_trailing_metadata_op) {
1802
+ if (have_pending_send_ops) {
1478
1803
  if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
1479
1804
  gpr_log(GPR_INFO,
1480
- "chand=%p calld=%p: starting next batch for pending send op(s)",
1481
- calld->chand_, calld);
1805
+ "chand=%p calld=%p attempt=%p: starting next batch for pending "
1806
+ "send op(s)",
1807
+ calld->chand_, calld, call_attempt_.get());
1482
1808
  }
1483
1809
  call_attempt_->AddRetriableBatches(closures);
1484
1810
  }
@@ -1486,15 +1812,46 @@ void RetryFilter::CallData::CallAttempt::BatchData::
1486
1812
 
1487
1813
  void RetryFilter::CallData::CallAttempt::BatchData::OnComplete(
1488
1814
  void* arg, grpc_error_handle error) {
1489
- CallAttempt::BatchData* batch_data =
1490
- static_cast<CallAttempt::BatchData*>(arg);
1815
+ RefCountedPtr<BatchData> batch_data(static_cast<BatchData*>(arg));
1491
1816
  CallAttempt* call_attempt = batch_data->call_attempt_.get();
1492
1817
  CallData* calld = call_attempt->calld_;
1493
1818
  if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
1494
- gpr_log(GPR_INFO, "chand=%p calld=%p: got on_complete, error=%s, batch=%s",
1495
- calld->chand_, calld, grpc_error_std_string(error).c_str(),
1819
+ gpr_log(GPR_INFO,
1820
+ "chand=%p calld=%p attempt=%p batch_data=%p: "
1821
+ "got on_complete, error=%s, batch=%s",
1822
+ calld->chand_, calld, call_attempt, batch_data.get(),
1823
+ grpc_error_std_string(error).c_str(),
1496
1824
  grpc_transport_stream_op_batch_string(&batch_data->batch_).c_str());
1497
1825
  }
1826
+ // If this attempt has been abandoned, then we're not going to propagate
1827
+ // the completion of this batch, so do nothing.
1828
+ if (call_attempt->abandoned_) {
1829
+ GRPC_CALL_COMBINER_STOP(calld->call_combiner_,
1830
+ "on_complete for abandoned attempt");
1831
+ return;
1832
+ }
1833
+ // If we got an error and have not yet gotten the
1834
+ // recv_trailing_metadata_ready callback, then defer propagating this
1835
+ // callback back to the surface. We can evaluate whether to retry when
1836
+ // recv_trailing_metadata comes back.
1837
+ if (GPR_UNLIKELY(!calld->retry_committed_ && error != GRPC_ERROR_NONE &&
1838
+ !call_attempt->completed_recv_trailing_metadata_)) {
1839
+ if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
1840
+ gpr_log(GPR_INFO, "chand=%p calld=%p attempt=%p: deferring on_complete",
1841
+ calld->chand_, calld, call_attempt);
1842
+ }
1843
+ call_attempt->on_complete_deferred_batches_.emplace_back(
1844
+ std::move(batch_data), GRPC_ERROR_REF(error));
1845
+ CallCombinerClosureList closures;
1846
+ call_attempt->AddBatchForCancelOp(GRPC_ERROR_REF(error), &closures);
1847
+ if (!call_attempt->started_recv_trailing_metadata_) {
1848
+ // recv_trailing_metadata not yet started by application; start it
1849
+ // ourselves to get status.
1850
+ call_attempt->AddBatchForInternalRecvTrailingMetadata(&closures);
1851
+ }
1852
+ closures.RunClosures(calld->call_combiner_);
1853
+ return;
1854
+ }
1498
1855
  // Update bookkeeping in call_attempt.
1499
1856
  if (batch_data->batch_.send_initial_metadata) {
1500
1857
  call_attempt->completed_send_initial_metadata_ = true;
@@ -1512,33 +1869,39 @@ void RetryFilter::CallData::CallAttempt::BatchData::OnComplete(
  }
  // Construct list of closures to execute.
  CallCombinerClosureList closures;
- // If a retry was already dispatched, that means we saw
- // recv_trailing_metadata before this, so we do nothing here.
- // Otherwise, invoke the callback to return the result to the surface.
- if (!call_attempt->retry_dispatched_) {
- // Add closure for the completed pending batch, if any.
- batch_data->AddClosuresForCompletedPendingBatch(GRPC_ERROR_REF(error),
- &closures);
- // If needed, add a callback to start any replay or pending send ops on
- // the LB call.
- if (!call_attempt->completed_recv_trailing_metadata_) {
- batch_data->AddClosuresForReplayOrPendingSendOps(&closures);
- }
- }
- // Track number of in-flight send batches and determine if this was the
- // last one.
- --calld->num_in_flight_call_attempt_send_batches_;
- const bool last_send_batch_complete =
- calld->num_in_flight_call_attempt_send_batches_ == 0;
- // Don't need batch_data anymore.
- batch_data->Unref();
+ // Add closure for the completed pending batch, if any.
+ batch_data->AddClosuresForCompletedPendingBatch(GRPC_ERROR_REF(error),
+ &closures);
+ // If needed, add a callback to start any replay or pending send ops on
+ // the LB call.
+ if (!call_attempt->completed_recv_trailing_metadata_) {
+ batch_data->AddClosuresForReplayOrPendingSendOps(&closures);
+ }
+ // If retry state is no longer needed (i.e., we're committed and there
+ // are no more send ops to replay), switch to fast path for subsequent
+ // batches.
+ call_attempt->MaybeSwitchToFastPath();
  // Schedule all of the closures identified above.
  // Note: This yields the call combiner.
  closures.RunClosures(calld->call_combiner_);
- // If this was the last in-flight send batch, unref the call stack.
- if (last_send_batch_complete) {
- GRPC_CALL_STACK_UNREF(calld->owning_call_, "retriable_send_batches");
+ }
+
+ void RetryFilter::CallData::CallAttempt::BatchData::OnCompleteForCancelOp(
+ void* arg, grpc_error_handle error) {
+ RefCountedPtr<BatchData> batch_data(static_cast<BatchData*>(arg));
+ CallAttempt* call_attempt = batch_data->call_attempt_.get();
+ CallData* calld = call_attempt->calld_;
+ if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
+ gpr_log(GPR_INFO,
+ "chand=%p calld=%p attempt=%p batch_data=%p: "
+ "got on_complete for cancel_stream batch, error=%s, batch=%s",
+ calld->chand_, calld, call_attempt, batch_data.get(),
+ grpc_error_std_string(error).c_str(),
+ grpc_transport_stream_op_batch_string(&batch_data->batch_).c_str());
  }
+ GRPC_CALL_COMBINER_STOP(
+ calld->call_combiner_,
+ "on_complete for internally generated cancel_stream op");
  }
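
Both completion callbacks now construct a RefCountedPtr<BatchData> directly from the void* closure argument; in grpc_core that constructor adopts the ref taken when the closure was scheduled, so every early-return path releases it exactly once. A toy model of that adopt-don't-add-ref convention (BatchData and BatchDataPtr below are hand-rolled stand-ins, not grpc_core's templates):

#include <cstdio>

// Hand-rolled intrusive refcount standing in for grpc_core::RefCounted.
struct BatchData {
  int refs = 1;  // the ref held on behalf of the scheduled closure
  void Unref() {
    if (--refs == 0) delete this;
  }
};

// Adopts an existing ref: no increment on construction, Unref() on exit.
class BatchDataPtr {
 public:
  explicit BatchDataPtr(BatchData* p) : p_(p) {}
  ~BatchDataPtr() {
    if (p_ != nullptr) p_->Unref();
  }
  BatchDataPtr(const BatchDataPtr&) = delete;
  BatchDataPtr& operator=(const BatchDataPtr&) = delete;
  BatchData* get() const { return p_; }

 private:
  BatchData* p_;
};

void OnComplete(void* arg, bool abandoned) {
  // Adopt the ref added when the closure was scheduled; any return path
  // from here on releases it exactly once.
  BatchDataPtr batch_data(static_cast<BatchData*>(arg));
  if (abandoned) return;  // the ref is still released
  std::printf("batch=%p completed\n", static_cast<void*>(batch_data.get()));
}

int main() {
  OnComplete(new BatchData, /*abandoned=*/false);
  OnComplete(new BatchData, /*abandoned=*/true);  // no leak either way
}

This is what lets the rewritten OnComplete return early for abandoned attempts without the manual Unref() bookkeeping the old code needed.
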
 
  //
@@ -1598,9 +1961,12 @@ void RetryFilter::CallData::CallAttempt::BatchData::
  AddRetriableSendMessageOp() {
  auto* calld = call_attempt_->calld_;
  if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
- gpr_log(GPR_INFO,
- "chand=%p calld=%p: starting calld->send_messages[%" PRIuPTR "]",
- calld->chand_, calld, call_attempt_->started_send_message_count_);
+ gpr_log(
+ GPR_INFO,
+ "chand=%p calld=%p attempt=%p: starting calld->send_messages[%" PRIuPTR
+ "]",
+ calld->chand_, calld, call_attempt_.get(),
+ call_attempt_->started_send_message_count_);
  }
  ByteStreamCache* cache =
  calld->send_messages_[call_attempt_->started_send_message_count_];
@@ -1650,6 +2016,7 @@ void RetryFilter::CallData::CallAttempt::BatchData::
  ++call_attempt_->started_recv_message_count_;
  batch_.recv_message = true;
  batch_.payload->recv_message.recv_message = &call_attempt_->recv_message_;
+ batch_.payload->recv_message.call_failed_before_recv_message = nullptr;
  GRPC_CLOSURE_INIT(&call_attempt_->recv_message_ready_, RecvMessageReady, this,
  grpc_schedule_on_exec_ctx);
  batch_.payload->recv_message.recv_message_ready =
@@ -1671,6 +2038,14 @@ void RetryFilter::CallData::CallAttempt::BatchData::
  &call_attempt_->recv_trailing_metadata_ready_;
  }
 
+ void RetryFilter::CallData::CallAttempt::BatchData::AddCancelStreamOp(
+ grpc_error_handle error) {
+ batch_.cancel_stream = true;
+ batch_.payload->cancel_stream.cancel_error = error;
+ // Override on_complete callback.
+ GRPC_CLOSURE_INIT(&on_complete_, OnCompleteForCancelOp, this, nullptr);
+ }
+
  //
  // CallData vtable functions
  //
@@ -1680,7 +2055,8 @@ grpc_error_handle RetryFilter::CallData::Init(
  auto* chand = static_cast<RetryFilter*>(elem->channel_data);
  new (elem->call_data) CallData(chand, *args);
  if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
- gpr_log(GPR_INFO, "chand=%p: created call=%p", chand, elem->call_data);
+ gpr_log(GPR_INFO, "chand=%p calld=%p: created call", chand,
+ elem->call_data);
  }
  return GRPC_ERROR_NONE;
  }
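
Init() above constructs the CallData with placement new into storage the channel stack pre-allocated at elem->call_data, so teardown must run the destructor explicitly instead of calling delete. A compact sketch of that lifecycle under an assumed FakeElem type (grpc_call_element itself carries more state than this):

#include <new>

struct CallData {
  explicit CallData(int id) : id_(id) {}
  int id_;
};

struct FakeElem {
  // The stack pre-allocates this storage; the filter constructs into it.
  alignas(CallData) unsigned char call_data[sizeof(CallData)];
};

void InitCallElem(FakeElem* elem) {
  new (elem->call_data) CallData(/*id=*/42);  // placement new: no allocation
}

void DestroyCallElem(FakeElem* elem) {
  // The memory belongs to the stack, so run only the destructor, never delete.
  reinterpret_cast<CallData*>(elem->call_data)->~CallData();
}

int main() {
  FakeElem elem;
  InitCallElem(&elem);
  DestroyCallElem(&elem);
}

Pairing a placement-new Init with a destructor-only Destroy is the usual pattern for C-core call filters.
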
@@ -1746,7 +2122,6 @@ RetryFilter::CallData::CallData(RetryFilter* chand,
  .set_max_backoff(
  retry_policy_ == nullptr ? 0 : retry_policy_->max_backoff())),
  path_(grpc_slice_ref_internal(args.path)),
- call_start_time_(args.start_time),
  deadline_(args.deadline),
  arena_(args.arena),
  owning_call_(args.call_stack),
@@ -1758,7 +2133,7 @@ RetryFilter::CallData::CallData(RetryFilter* chand,
  pending_send_message_(false),
  pending_send_trailing_metadata_(false),
  retry_committed_(false),
- last_attempt_got_server_pushback_(false) {}
+ retry_timer_pending_(false) {}
 
  RetryFilter::CallData::~CallData() {
  grpc_slice_unref_internal(path_);
@@ -1766,6 +2141,7 @@ RetryFilter::CallData::~CallData() {
  for (size_t i = 0; i < GPR_ARRAY_SIZE(pending_batches_); ++i) {
  GPR_ASSERT(pending_batches_[i].batch == nullptr);
  }
+ GRPC_ERROR_UNREF(cancelled_from_surface_);
  }
 
  void RetryFilter::CallData::StartTransportStreamOpBatch(
@@ -1788,10 +2164,29 @@ void RetryFilter::CallData::StartTransportStreamOpBatch(
  // will not be retried, because we have committed it here.
  if (call_attempt_ != nullptr) {
  RetryCommit(call_attempt_.get());
+ // TODO(roth): When implementing hedging, this will get more
+ // complex, because instead of just passing the batch down to a
+ // single call attempt, we'll need to cancel multiple call
+ // attempts and wait for the cancellation on_complete from each call
+ // attempt before we propagate the on_complete from this batch
+ // back to the surface.
  // Note: This will release the call combiner.
- call_attempt_->lb_call()->StartTransportStreamOpBatch(batch);
+ call_attempt_->CancelFromSurface(batch);
  return;
  }
+ // Save cancel_error in case subsequent batches are started.
+ GRPC_ERROR_UNREF(cancelled_from_surface_);
+ cancelled_from_surface_ = GRPC_ERROR_REF(cancel_error);
+ // Cancel retry timer.
+ if (retry_timer_pending_) {
+ if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
+ gpr_log(GPR_INFO, "chand=%p calld=%p: cancelling retry timer", chand_,
+ this);
+ }
+ retry_timer_pending_ = false; // Lame timer callback.
+ grpc_timer_cancel(&retry_timer_);
+ FreeAllCachedSendOpData();
+ }
  // Fail pending batches.
  PendingBatchesFail(GRPC_ERROR_REF(cancel_error));
  // Note: This will release the call combiner.
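
The "Lame timer callback" comment above marks a deliberate ordering: a timer cancellation cannot guarantee that a callback already in flight won't still run, so the flag is cleared first and the callback treats a cleared flag as a no-op. A standalone model of that flag-then-cancel idiom (the Timer type here is illustrative, not gRPC's):

#include <functional>

// Illustrative timer whose callback may already be scheduled when
// Cancel() is called, mirroring grpc_timer_cancel() semantics.
struct Timer {
  std::function<void(bool cancelled)> cb;
  void Fire() { if (cb) cb(/*cancelled=*/false); }
  void Cancel() { if (cb) cb(/*cancelled=*/true); }
};

struct CallData {
  bool retry_timer_pending = false;
  Timer retry_timer;

  void StartRetryTimer() {
    retry_timer_pending = true;
    retry_timer.cb = [this](bool cancelled) { OnRetryTimer(cancelled); };
  }

  void CancelFromSurface() {
    // Lame the callback *before* cancelling: if the callback was already
    // queued, it sees the flag down and does nothing.
    retry_timer_pending = false;
    retry_timer.Cancel();
  }

  void OnRetryTimer(bool cancelled) {
    if (cancelled || !retry_timer_pending) return;  // lamed: no-op
    retry_timer_pending = false;
    // ...create the next call attempt...
  }
};

In the real filter this needs no atomics because both paths run under the call combiner, which serializes them (see the OnRetryTimerLocked rework at the end of this diff).
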
@@ -1801,13 +2196,47 @@ void RetryFilter::CallData::StartTransportStreamOpBatch(
  }
  // Add the batch to the pending list.
  PendingBatch* pending = PendingBatchesAdd(batch);
+ // If the timer is pending, yield the call combiner and wait for it to
+ // run, since we don't want to start another call attempt until it does.
+ if (retry_timer_pending_) {
+ GRPC_CALL_COMBINER_STOP(call_combiner_,
+ "added pending batch while retry timer pending");
+ return;
+ }
+ // If we do not yet have a call attempt, create one.
  if (call_attempt_ == nullptr) {
+ // If we were previously cancelled from the surface, cancel this
+ // batch instead of creating a call attempt.
+ if (cancelled_from_surface_ != GRPC_ERROR_NONE) {
+ PendingBatchClear(pending);
+ // Note: This will release the call combiner.
+ grpc_transport_stream_op_batch_finish_with_failure(
+ batch, GRPC_ERROR_REF(cancelled_from_surface_), call_combiner_);
+ return;
+ }
+ // If there is no retry policy, then commit retries immediately.
+ // This ensures that the code below will always jump to the fast path.
+ // TODO(roth): Remove this special case when we implement
+ // transparent retries.
+ if (retry_policy_ == nullptr) retry_committed_ = true;
  // If this is the first batch and retries are already committed
  // (e.g., if this batch put the call above the buffer size limit), then
  // immediately create an LB call and delegate the batch to it. This
  // avoids the overhead of unnecessarily allocating a CallAttempt
  // object or caching any of the send op data.
- if (num_attempts_completed_ == 0 && retry_committed_) {
+ // Note that we would ideally like to do this also on subsequent
+ // attempts (e.g., if a batch puts the call above the buffer size
+ // limit since the last attempt was complete), but in practice that's
+ // not really worthwhile, because we will almost always have cached and
+ // completed at least the send_initial_metadata op on the previous
+ // attempt, which means that we'd need special logic to replay the
+ // batch anyway, which is exactly what the CallAttempt object provides.
+ // We also skip this optimization if perAttemptRecvTimeout is set in the
+ // retry policy, because we need the code in CallAttempt to handle
+ // the associated timer.
+ if (num_attempts_completed_ == 0 && retry_committed_ &&
+ (retry_policy_ == nullptr ||
+ !retry_policy_->per_attempt_recv_timeout().has_value())) {
  if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
  gpr_log(GPR_INFO,
  "chand=%p calld=%p: retry committed before first attempt; "
@@ -1815,11 +2244,16 @@ void RetryFilter::CallData::StartTransportStreamOpBatch(
  chand_, this);
  }
  PendingBatchClear(pending);
- committed_call_ = CreateLoadBalancedCall();
+ auto* service_config_call_data = static_cast<ServiceConfigCallData*>(
+ call_context_[GRPC_CONTEXT_SERVICE_CONFIG_CALL_DATA].value);
+ committed_call_ = CreateLoadBalancedCall(
+ service_config_call_data->call_dispatch_controller());
  committed_call_->StartTransportStreamOpBatch(batch);
  return;
  }
- // We do not yet have a call attempt, so create one.
+ // Otherwise, create a call attempt.
+ // The attempt will automatically start any necessary replays or
+ // pending batches.
  if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
  gpr_log(GPR_INFO, "chand=%p calld=%p: creating call attempt", chand_,
  this);
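
The fast-path condition above keys off the method's retry policy from the service config, including the per-attempt receive timeout it mentions. For orientation, a hedged example of what such a config can look like, embedded as C-core tests commonly do; the field values are illustrative and the exact field set should be checked against the service config schema in grpc-proto, since perAttemptRecvTimeout was an experimental addition around this release:

// Illustrative service config with a retry policy; values are made up.
constexpr char kServiceConfigJson[] = R"json({
  "methodConfig": [{
    "name": [{ "service": "my.pkg.EchoService" }],
    "retryPolicy": {
      "maxAttempts": 3,
      "initialBackoff": "0.1s",
      "maxBackoff": "1s",
      "backoffMultiplier": 2,
      "retryableStatusCodes": ["UNAVAILABLE"],
      "perAttemptRecvTimeout": "0.5s"
    }
  }]
})json";

When perAttemptRecvTimeout is present, the filter must always build a CallAttempt so the per-attempt timer has somewhere to live, which is exactly why the condition above excludes it from the committed-before-first-attempt shortcut.
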
@@ -1829,56 +2263,31 @@ void RetryFilter::CallData::StartTransportStreamOpBatch(
  }
  // Send batches to call attempt.
  if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
- gpr_log(GPR_INFO,
- "chand=%p calld=%p: starting batch on attempt=%p lb_call=%p",
- chand_, this, call_attempt_.get(), call_attempt_->lb_call());
+ gpr_log(GPR_INFO, "chand=%p calld=%p: starting batch on attempt=%p", chand_,
+ this, call_attempt_.get());
  }
  call_attempt_->StartRetriableBatches();
  }
 
- RefCountedPtr<ClientChannel::LoadBalancedCall>
- RetryFilter::CallData::CreateLoadBalancedCall() {
+ OrphanablePtr<ClientChannel::LoadBalancedCall>
+ RetryFilter::CallData::CreateLoadBalancedCall(
+ ConfigSelector::CallDispatchController* call_dispatch_controller) {
  grpc_call_element_args args = {owning_call_, nullptr, call_context_,
- path_, call_start_time_, deadline_,
+ path_, /*start_time=*/0, deadline_,
  arena_, call_combiner_};
  return chand_->client_channel_->CreateLoadBalancedCall(
  args, pollent_,
  // This callback holds a ref to the CallStackDestructionBarrier
  // object until the LB call is destroyed.
- call_stack_destruction_barrier_->MakeLbCallDestructionClosure(this));
+ call_stack_destruction_barrier_->MakeLbCallDestructionClosure(this),
+ call_dispatch_controller,
+ // TODO(roth): Change this when we support transparent retries.
+ /*is_transparent_retry=*/false);
  }
 
  void RetryFilter::CallData::CreateCallAttempt() {
- call_attempt_.reset(arena_->New<CallAttempt>(this));
+ call_attempt_ = MakeRefCounted<CallAttempt>(this);
  call_attempt_->StartRetriableBatches();
- // TODO(roth): When implementing hedging, change this to start a timer
- // for the next hedging attempt.
- }
-
- namespace {
-
- void StartBatchInCallCombiner(void* arg, grpc_error_handle /*ignored*/) {
- grpc_transport_stream_op_batch* batch =
- static_cast<grpc_transport_stream_op_batch*>(arg);
- auto* lb_call = static_cast<ClientChannel::LoadBalancedCall*>(
- batch->handler_private.extra_arg);
- // Note: This will release the call combiner.
- lb_call->StartTransportStreamOpBatch(batch);
- }
-
- } // namespace
-
- void RetryFilter::CallData::AddClosureForBatch(
- grpc_transport_stream_op_batch* batch, CallCombinerClosureList* closures) {
- batch->handler_private.extra_arg = call_attempt_->lb_call();
- GRPC_CLOSURE_INIT(&batch->handler_private.closure, StartBatchInCallCombiner,
- batch, grpc_schedule_on_exec_ctx);
- if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
- gpr_log(GPR_INFO, "chand=%p calld=%p: starting batch on LB call: %s",
- chand_, this, grpc_transport_stream_op_batch_string(batch).c_str());
- }
- closures->Add(&batch->handler_private.closure, GRPC_ERROR_NONE,
- "start_batch_on_lb_call");
  }
 
  //
@@ -1943,7 +2352,7 @@ void RetryFilter::CallData::FreeCachedSendMessage(size_t idx) {
 
  void RetryFilter::CallData::FreeCachedSendTrailingMetadata() {
  if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
- gpr_log(GPR_INFO, "chand_=%p calld=%p: destroying send_trailing_metadata",
+ gpr_log(GPR_INFO, "chand=%p calld=%p: destroying send_trailing_metadata",
  chand_, this);
  }
  grpc_metadata_batch_destroy(&send_trailing_metadata_);
@@ -1982,7 +2391,7 @@ RetryFilter::CallData::PendingBatch* RetryFilter::CallData::PendingBatchesAdd(
  const size_t idx = GetBatchIndex(batch);
  if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
  gpr_log(GPR_INFO,
- "chand_=%p calld=%p: adding pending batch at index %" PRIuPTR,
+ "chand=%p calld=%p: adding pending batch at index %" PRIuPTR,
  chand_, this, idx);
  }
  PendingBatch* pending = &pending_batches_[idx];
@@ -2006,6 +2415,9 @@ RetryFilter::CallData::PendingBatch* RetryFilter::CallData::PendingBatchesAdd(
  if (batch->send_trailing_metadata) {
  pending_send_trailing_metadata_ = true;
  }
+ // TODO(roth): When we implement hedging, if there are currently attempts
+ // in flight, we will need to pick the one on which the max number of send
+ // ops have already been sent, and we commit to that attempt.
  if (GPR_UNLIKELY(bytes_buffered_for_retry_ >
  chand_->per_rpc_retry_buffer_size_)) {
  if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
@@ -2122,22 +2534,31 @@ void RetryFilter::CallData::RetryCommit(CallAttempt* call_attempt) {
  gpr_log(GPR_INFO, "chand=%p calld=%p: committing retries", chand_, this);
  }
  if (call_attempt != nullptr) {
+ // If the call attempt's LB call has been committed, inform the call
+ // dispatch controller that the call has been committed.
+ // Note: If call_attempt is null, this is happening before the first
+ // retry attempt is started, in which case we'll just pass the real
+ // call dispatch controller down into the LB call, and it won't be
+ // our problem anymore.
+ if (call_attempt->lb_call_committed()) {
+ auto* service_config_call_data = static_cast<ServiceConfigCallData*>(
+ call_context_[GRPC_CONTEXT_SERVICE_CONFIG_CALL_DATA].value);
+ service_config_call_data->call_dispatch_controller()->Commit();
+ }
+ // Free cached send ops.
  call_attempt->FreeCachedSendOpDataAfterCommit();
  }
  }
 
- void RetryFilter::CallData::DoRetry(grpc_millis server_pushback_ms) {
+ void RetryFilter::CallData::StartRetryTimer(grpc_millis server_pushback_ms) {
  // Reset call attempt.
- call_attempt_.reset();
+ call_attempt_.reset(DEBUG_LOCATION, "StartRetryTimer");
  // Compute backoff delay.
  grpc_millis next_attempt_time;
  if (server_pushback_ms >= 0) {
  next_attempt_time = ExecCtx::Get()->Now() + server_pushback_ms;
- last_attempt_got_server_pushback_ = true;
+ retry_backoff_.Reset();
  } else {
- if (num_attempts_completed_ == 1 || last_attempt_got_server_pushback_) {
- last_attempt_got_server_pushback_ = false;
- }
  next_attempt_time = retry_backoff_.NextAttemptTime();
  }
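
The rewritten branch above replaces the old last_attempt_got_server_pushback_ bookkeeping with a simpler rule: if the server sent pushback, honor its delay and reset the backoff; otherwise take the next exponential step. A self-contained sketch of that delay computation (Backoff below is a simplified stand-in for grpc_core::BackOff, without jitter):

#include <algorithm>
#include <cstdint>

using Millis = int64_t;

// Simplified stand-in for grpc_core::BackOff (no jitter).
class Backoff {
 public:
  Backoff(Millis initial, double multiplier, Millis max)
      : initial_(initial), multiplier_(multiplier), max_(max), next_(initial) {}
  Millis NextAttemptDelay() {
    Millis delay = next_;
    next_ = std::min<Millis>(static_cast<Millis>(next_ * multiplier_), max_);
    return delay;
  }
  void Reset() { next_ = initial_; }

 private:
  Millis initial_;
  double multiplier_;
  Millis max_;
  Millis next_;
};

// server_pushback_ms < 0 means the server sent no pushback.
Millis NextAttemptTime(Millis now, Millis server_pushback_ms, Backoff& backoff) {
  if (server_pushback_ms >= 0) {
    backoff.Reset();  // restart exponential growth after explicit pushback
    return now + server_pushback_ms;
  }
  return now + backoff.NextAttemptDelay();
}
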
  if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
@@ -2148,23 +2569,25 @@ void RetryFilter::CallData::DoRetry(grpc_millis server_pushback_ms) {
  // Schedule retry after computed delay.
  GRPC_CLOSURE_INIT(&retry_closure_, OnRetryTimer, this, nullptr);
  GRPC_CALL_STACK_REF(owning_call_, "OnRetryTimer");
- MutexLock lock(&timer_mu_);
- canceller_ = new Canceller(this);
+ retry_timer_pending_ = true;
  grpc_timer_init(&retry_timer_, next_attempt_time, &retry_closure_);
  }
 
  void RetryFilter::CallData::OnRetryTimer(void* arg, grpc_error_handle error) {
  auto* calld = static_cast<CallData*>(arg);
- if (error == GRPC_ERROR_NONE) {
- bool start_attempt = false;
- {
- MutexLock lock(&calld->timer_mu_);
- if (calld->canceller_ != nullptr) {
- calld->canceller_ = nullptr;
- start_attempt = true;
- }
- }
- if (start_attempt) calld->CreateCallAttempt();
+ GRPC_CLOSURE_INIT(&calld->retry_closure_, OnRetryTimerLocked, calld, nullptr);
+ GRPC_CALL_COMBINER_START(calld->call_combiner_, &calld->retry_closure_,
+ GRPC_ERROR_REF(error), "retry timer fired");
+ }
+
+ void RetryFilter::CallData::OnRetryTimerLocked(void* arg,
+ grpc_error_handle error) {
+ auto* calld = static_cast<CallData*>(arg);
+ if (error == GRPC_ERROR_NONE && calld->retry_timer_pending_) {
+ calld->retry_timer_pending_ = false;
+ calld->CreateCallAttempt();
+ } else {
+ GRPC_CALL_COMBINER_STOP(calld->call_combiner_, "retry timer cancelled");
  }
  GRPC_CALL_STACK_UNREF(calld->owning_call_, "OnRetryTimer");
  }
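
OnRetryTimer no longer takes the dedicated timer_mu_: the raw timer callback only re-schedules itself into the call combiner, and OnRetryTimerLocked inspects retry_timer_pending_ once it owns the combiner, so the flag never needs a lock of its own. A toy serializer showing that two-stage hop (Combiner below is a minimal single-threaded stand-in for gRPC's call combiner):

#include <deque>
#include <functional>

// Minimal stand-in for the call combiner: runs closures one at a time.
class Combiner {
 public:
  void Start(std::function<void()> closure) {
    queue_.push_back(std::move(closure));
  }
  void Drain() {
    while (!queue_.empty()) {
      auto closure = std::move(queue_.front());
      queue_.pop_front();
      closure();  // serialized with all other work on this call
    }
  }

 private:
  std::deque<std::function<void()>> queue_;
};

struct CallData {
  Combiner combiner;
  bool retry_timer_pending = true;

  // Stage 1: runs on the timer thread; touches no call state.
  void OnRetryTimer() {
    combiner.Start([this] { OnRetryTimerLocked(); });
  }

  // Stage 2: runs under the combiner, so the flag is read safely.
  void OnRetryTimerLocked() {
    if (retry_timer_pending) {
      retry_timer_pending = false;
      // ...CreateCallAttempt()...
    }
  }
};

int main() {
  CallData calld;
  calld.OnRetryTimer();
  calld.combiner.Drain();
}

If cancellation lamed the timer first (clearing the flag under the same combiner), stage 2 sees the flag down and becomes a no-op, matching the GRPC_CALL_COMBINER_STOP branch above.
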