grpc 1.38.0 → 1.39.0.pre1

Potentially problematic release: this version of grpc has been flagged as possibly problematic.

Files changed (199)
  1. checksums.yaml +4 -4
  2. data/Makefile +50 -19
  3. data/include/grpc/event_engine/endpoint_config.h +48 -0
  4. data/include/grpc/event_engine/event_engine.h +13 -15
  5. data/include/grpc/event_engine/port.h +2 -0
  6. data/include/grpc/event_engine/slice_allocator.h +17 -7
  7. data/include/grpc/grpc.h +9 -2
  8. data/include/grpc/grpc_security.h +32 -0
  9. data/include/grpc/grpc_security_constants.h +1 -0
  10. data/include/grpc/impl/codegen/grpc_types.h +17 -13
  11. data/include/grpc/impl/codegen/port_platform.h +17 -0
  12. data/src/core/ext/filters/client_channel/client_channel.cc +2 -2
  13. data/src/core/ext/filters/client_channel/health/health_check_client.cc +2 -0
  14. data/src/core/ext/filters/client_channel/health/health_check_client.h +3 -3
  15. data/src/core/ext/filters/client_channel/http_proxy.cc +16 -1
  16. data/src/core/ext/filters/client_channel/lb_policy/ring_hash/ring_hash.cc +755 -0
  17. data/src/core/ext/filters/client_channel/lb_policy/ring_hash/ring_hash.h +10 -0
  18. data/src/core/ext/filters/client_channel/lb_policy/xds/cds.cc +10 -24
  19. data/src/core/ext/filters/client_channel/lb_policy/xds/xds_cluster_resolver.cc +63 -95
  20. data/src/core/ext/filters/client_channel/resolver/dns/c_ares/dns_resolver_ares.cc +1 -3
  21. data/src/core/ext/filters/client_channel/resolver/dns/c_ares/grpc_ares_ev_driver_event_engine.cc +31 -0
  22. data/src/core/ext/filters/client_channel/resolver/dns/c_ares/grpc_ares_wrapper_event_engine.cc +28 -0
  23. data/src/core/ext/filters/client_channel/resolver/dns/native/dns_resolver.cc +1 -3
  24. data/src/core/ext/filters/client_channel/resolver/google_c2p/google_c2p_resolver.cc +7 -2
  25. data/src/core/ext/filters/client_channel/resolver/xds/xds_resolver.cc +15 -3
  26. data/src/core/ext/filters/client_channel/retry_filter.cc +665 -404
  27. data/src/core/ext/filters/client_channel/retry_service_config.cc +43 -24
  28. data/src/core/ext/filters/client_channel/retry_service_config.h +8 -2
  29. data/src/core/ext/filters/client_idle/client_idle_filter.cc +1 -1
  30. data/src/core/ext/filters/fault_injection/fault_injection_filter.cc +6 -0
  31. data/src/core/ext/transport/chttp2/client/insecure/channel_create_posix.cc +2 -1
  32. data/src/core/ext/transport/chttp2/server/insecure/server_chttp2_posix.cc +3 -2
  33. data/src/core/ext/transport/chttp2/transport/chttp2_transport.cc +10 -4
  34. data/src/core/ext/transport/chttp2/transport/internal.h +1 -0
  35. data/src/core/ext/transport/chttp2/transport/parsing.cc +2 -2
  36. data/src/core/ext/transport/inproc/inproc_transport.cc +42 -31
  37. data/src/core/ext/xds/xds_api.cc +247 -106
  38. data/src/core/ext/xds/xds_api.h +15 -6
  39. data/src/core/lib/address_utils/sockaddr_utils.cc +13 -0
  40. data/src/core/lib/address_utils/sockaddr_utils.h +10 -0
  41. data/src/core/lib/channel/channelz.h +3 -0
  42. data/src/core/lib/event_engine/endpoint_config.cc +46 -0
  43. data/src/core/lib/event_engine/endpoint_config_internal.h +42 -0
  44. data/src/core/lib/event_engine/event_engine.cc +50 -0
  45. data/src/core/lib/event_engine/slice_allocator.cc +33 -3
  46. data/src/core/lib/event_engine/sockaddr.cc +14 -12
  47. data/src/core/lib/event_engine/sockaddr.h +44 -0
  48. data/src/core/lib/gpr/wrap_memcpy.cc +2 -1
  49. data/src/core/lib/gprpp/status_helper.h +3 -0
  50. data/src/core/lib/iomgr/endpoint_pair_event_engine.cc +33 -0
  51. data/src/core/lib/iomgr/error.cc +5 -4
  52. data/src/core/lib/iomgr/error.h +1 -1
  53. data/src/core/lib/iomgr/event_engine/closure.cc +54 -0
  54. data/src/core/lib/iomgr/event_engine/closure.h +33 -0
  55. data/src/core/lib/iomgr/event_engine/endpoint.cc +194 -0
  56. data/src/core/lib/iomgr/event_engine/endpoint.h +53 -0
  57. data/src/core/lib/iomgr/event_engine/iomgr.cc +105 -0
  58. data/src/core/lib/iomgr/event_engine/iomgr.h +24 -0
  59. data/src/core/lib/iomgr/event_engine/pollset.cc +87 -0
  60. data/{include/grpc/event_engine/channel_args.h → src/core/lib/iomgr/event_engine/pollset.h} +7 -10
  61. data/src/core/lib/iomgr/event_engine/promise.h +51 -0
  62. data/src/core/lib/iomgr/event_engine/resolved_address_internal.cc +41 -0
  63. data/src/core/lib/iomgr/event_engine/resolved_address_internal.h +35 -0
  64. data/src/core/lib/iomgr/event_engine/resolver.cc +110 -0
  65. data/src/core/lib/iomgr/event_engine/tcp.cc +243 -0
  66. data/src/core/lib/iomgr/event_engine/timer.cc +57 -0
  67. data/src/core/lib/iomgr/exec_ctx.cc +8 -0
  68. data/src/core/lib/iomgr/exec_ctx.h +3 -4
  69. data/src/core/lib/iomgr/executor/threadpool.cc +2 -3
  70. data/src/core/lib/iomgr/executor/threadpool.h +2 -2
  71. data/src/core/lib/iomgr/iomgr.cc +1 -1
  72. data/src/core/lib/iomgr/iomgr_posix.cc +2 -0
  73. data/src/core/lib/iomgr/iomgr_posix_cfstream.cc +40 -10
  74. data/src/core/lib/iomgr/pollset_custom.cc +2 -2
  75. data/src/core/lib/iomgr/pollset_custom.h +3 -1
  76. data/src/core/lib/iomgr/pollset_uv.cc +3 -1
  77. data/src/core/lib/iomgr/pollset_uv.h +5 -1
  78. data/src/core/lib/iomgr/port.h +7 -5
  79. data/src/core/lib/iomgr/resolve_address.cc +5 -1
  80. data/src/core/lib/iomgr/resolve_address.h +6 -0
  81. data/src/core/lib/iomgr/sockaddr.h +1 -0
  82. data/src/core/lib/iomgr/socket_mutator.cc +15 -2
  83. data/src/core/lib/iomgr/socket_mutator.h +26 -2
  84. data/src/core/lib/iomgr/socket_utils_common_posix.cc +4 -4
  85. data/src/core/lib/iomgr/socket_utils_posix.h +2 -2
  86. data/src/core/lib/iomgr/tcp_client_posix.cc +7 -2
  87. data/src/core/lib/iomgr/tcp_posix.cc +42 -39
  88. data/src/core/lib/iomgr/tcp_posix.h +8 -0
  89. data/src/core/lib/iomgr/tcp_server_custom.cc +3 -4
  90. data/src/core/lib/iomgr/tcp_server_posix.cc +6 -0
  91. data/src/core/lib/iomgr/tcp_server_utils_posix_common.cc +2 -1
  92. data/src/core/lib/iomgr/timer.h +6 -1
  93. data/src/core/lib/security/authorization/authorization_engine.h +44 -0
  94. data/src/core/lib/security/authorization/authorization_policy_provider.h +32 -0
  95. data/src/core/lib/security/authorization/authorization_policy_provider_vtable.cc +46 -0
  96. data/src/core/lib/security/authorization/evaluate_args.cc +209 -0
  97. data/src/core/lib/security/authorization/evaluate_args.h +91 -0
  98. data/src/core/lib/security/credentials/google_default/google_default_credentials.cc +3 -1
  99. data/src/core/lib/security/credentials/tls/tls_utils.cc +32 -0
  100. data/src/core/lib/security/credentials/tls/tls_utils.h +13 -0
  101. data/src/core/lib/security/security_connector/local/local_security_connector.cc +9 -6
  102. data/src/core/lib/security/security_connector/ssl_utils.cc +5 -0
  103. data/src/core/lib/surface/call.cc +21 -1
  104. data/src/core/lib/surface/call.h +11 -0
  105. data/src/core/lib/surface/completion_queue.cc +22 -22
  106. data/src/core/lib/surface/completion_queue.h +1 -1
  107. data/src/core/lib/surface/completion_queue_factory.cc +1 -2
  108. data/src/core/lib/surface/init.cc +1 -3
  109. data/src/core/lib/surface/init.h +10 -1
  110. data/src/core/lib/surface/version.cc +1 -1
  111. data/src/core/lib/transport/error_utils.cc +2 -2
  112. data/src/core/lib/transport/transport.h +2 -0
  113. data/src/core/lib/transport/transport_op_string.cc +1 -1
  114. data/src/core/plugin_registry/grpc_plugin_registry.cc +4 -0
  115. data/src/core/tsi/alts/crypt/gsec.h +2 -0
  116. data/src/ruby/ext/grpc/extconf.rb +2 -0
  117. data/src/ruby/ext/grpc/rb_grpc_imports.generated.c +6 -0
  118. data/src/ruby/ext/grpc/rb_grpc_imports.generated.h +10 -1
  119. data/src/ruby/lib/grpc/version.rb +1 -1
  120. data/third_party/boringssl-with-bazel/err_data.c +269 -263
  121. data/third_party/boringssl-with-bazel/src/crypto/asn1/a_object.c +8 -6
  122. data/third_party/boringssl-with-bazel/src/crypto/cipher_extra/cipher_extra.c +4 -0
  123. data/third_party/boringssl-with-bazel/src/crypto/curve25519/curve25519.c +1 -1
  124. data/third_party/boringssl-with-bazel/src/crypto/curve25519/internal.h +1 -1
  125. data/third_party/boringssl-with-bazel/src/crypto/evp/evp.c +9 -0
  126. data/third_party/boringssl-with-bazel/src/crypto/fipsmodule/bn/prime.c +0 -4
  127. data/third_party/boringssl-with-bazel/src/crypto/fipsmodule/digest/digest.c +7 -0
  128. data/third_party/boringssl-with-bazel/src/crypto/fipsmodule/digest/md32_common.h +87 -121
  129. data/third_party/boringssl-with-bazel/src/crypto/fipsmodule/md4/md4.c +20 -30
  130. data/third_party/boringssl-with-bazel/src/crypto/fipsmodule/md5/md5.c +19 -30
  131. data/third_party/boringssl-with-bazel/src/crypto/fipsmodule/rand/internal.h +1 -4
  132. data/third_party/boringssl-with-bazel/src/crypto/fipsmodule/rand/rand.c +0 -13
  133. data/third_party/boringssl-with-bazel/src/crypto/fipsmodule/rsa/rsa.c +26 -24
  134. data/third_party/boringssl-with-bazel/src/crypto/fipsmodule/rsa/rsa_impl.c +10 -7
  135. data/third_party/boringssl-with-bazel/src/crypto/fipsmodule/sha/sha1.c +28 -39
  136. data/third_party/boringssl-with-bazel/src/crypto/fipsmodule/sha/sha256.c +48 -66
  137. data/third_party/boringssl-with-bazel/src/crypto/fipsmodule/sha/sha512.c +4 -5
  138. data/third_party/boringssl-with-bazel/src/crypto/hpke/hpke.c +362 -371
  139. data/third_party/boringssl-with-bazel/src/crypto/pkcs7/pkcs7_x509.c +4 -2
  140. data/third_party/boringssl-with-bazel/src/crypto/rand_extra/passive.c +2 -2
  141. data/third_party/boringssl-with-bazel/src/crypto/rsa_extra/rsa_asn1.c +1 -2
  142. data/third_party/boringssl-with-bazel/src/crypto/x509/internal.h +101 -11
  143. data/third_party/boringssl-with-bazel/src/crypto/x509/t_x509a.c +3 -0
  144. data/third_party/boringssl-with-bazel/src/crypto/x509/x509_cmp.c +2 -2
  145. data/third_party/boringssl-with-bazel/src/crypto/x509/x509_req.c +3 -0
  146. data/third_party/boringssl-with-bazel/src/crypto/x509/x509_set.c +1 -1
  147. data/third_party/boringssl-with-bazel/src/crypto/x509/x509_trs.c +2 -0
  148. data/third_party/boringssl-with-bazel/src/crypto/x509/x509_vfy.c +14 -15
  149. data/third_party/boringssl-with-bazel/src/crypto/x509/x509_vpm.c +53 -73
  150. data/third_party/boringssl-with-bazel/src/crypto/x509/x509cset.c +31 -0
  151. data/third_party/boringssl-with-bazel/src/crypto/x509/x509rset.c +3 -0
  152. data/third_party/boringssl-with-bazel/src/crypto/x509/x_all.c +3 -0
  153. data/third_party/boringssl-with-bazel/src/crypto/x509/x_req.c +5 -8
  154. data/third_party/boringssl-with-bazel/src/crypto/x509/x_sig.c +5 -0
  155. data/third_party/boringssl-with-bazel/src/crypto/x509/x_x509a.c +3 -0
  156. data/third_party/boringssl-with-bazel/src/crypto/x509v3/internal.h +7 -0
  157. data/third_party/boringssl-with-bazel/src/crypto/x509v3/v3_purp.c +1 -1
  158. data/third_party/boringssl-with-bazel/src/crypto/x509v3/v3_utl.c +5 -8
  159. data/third_party/boringssl-with-bazel/src/include/openssl/aead.h +1 -1
  160. data/third_party/boringssl-with-bazel/src/include/openssl/arm_arch.h +66 -1
  161. data/third_party/boringssl-with-bazel/src/include/openssl/base.h +40 -9
  162. data/third_party/boringssl-with-bazel/src/include/openssl/bytestring.h +1 -0
  163. data/third_party/boringssl-with-bazel/src/include/openssl/chacha.h +1 -1
  164. data/third_party/boringssl-with-bazel/src/include/openssl/digest.h +6 -2
  165. data/third_party/boringssl-with-bazel/src/include/openssl/ecdsa.h +14 -0
  166. data/third_party/boringssl-with-bazel/src/include/openssl/evp.h +19 -11
  167. data/third_party/boringssl-with-bazel/src/include/openssl/hpke.h +325 -0
  168. data/third_party/boringssl-with-bazel/src/include/openssl/pkcs7.h +23 -7
  169. data/third_party/boringssl-with-bazel/src/include/openssl/rsa.h +99 -63
  170. data/third_party/boringssl-with-bazel/src/include/openssl/ssl.h +139 -109
  171. data/third_party/boringssl-with-bazel/src/include/openssl/tls1.h +12 -19
  172. data/third_party/boringssl-with-bazel/src/include/openssl/x509.h +48 -50
  173. data/third_party/boringssl-with-bazel/src/include/openssl/x509_vfy.h +451 -435
  174. data/third_party/boringssl-with-bazel/src/include/openssl/x509v3.h +0 -1
  175. data/third_party/boringssl-with-bazel/src/ssl/d1_both.cc +2 -2
  176. data/third_party/boringssl-with-bazel/src/ssl/d1_srtp.cc +1 -1
  177. data/third_party/boringssl-with-bazel/src/ssl/encrypted_client_hello.cc +773 -84
  178. data/third_party/boringssl-with-bazel/src/ssl/handoff.cc +80 -47
  179. data/third_party/boringssl-with-bazel/src/ssl/handshake.cc +24 -19
  180. data/third_party/boringssl-with-bazel/src/ssl/handshake_client.cc +189 -86
  181. data/third_party/boringssl-with-bazel/src/ssl/handshake_server.cc +45 -56
  182. data/third_party/boringssl-with-bazel/src/ssl/internal.h +272 -167
  183. data/third_party/boringssl-with-bazel/src/ssl/s3_both.cc +2 -2
  184. data/third_party/boringssl-with-bazel/src/ssl/s3_lib.cc +2 -2
  185. data/third_party/boringssl-with-bazel/src/ssl/s3_pkt.cc +14 -19
  186. data/third_party/boringssl-with-bazel/src/ssl/ssl_lib.cc +34 -102
  187. data/third_party/boringssl-with-bazel/src/ssl/ssl_privkey.cc +2 -0
  188. data/third_party/boringssl-with-bazel/src/ssl/ssl_session.cc +8 -31
  189. data/third_party/boringssl-with-bazel/src/ssl/ssl_stat.cc +3 -0
  190. data/third_party/boringssl-with-bazel/src/ssl/ssl_transcript.cc +4 -3
  191. data/third_party/boringssl-with-bazel/src/ssl/ssl_versions.cc +7 -3
  192. data/third_party/boringssl-with-bazel/src/ssl/t1_lib.cc +576 -648
  193. data/third_party/boringssl-with-bazel/src/ssl/tls13_both.cc +31 -3
  194. data/third_party/boringssl-with-bazel/src/ssl/tls13_client.cc +98 -39
  195. data/third_party/boringssl-with-bazel/src/ssl/tls13_enc.cc +141 -94
  196. data/third_party/boringssl-with-bazel/src/ssl/tls13_server.cc +58 -68
  197. metadata +65 -40
  198. data/third_party/boringssl-with-bazel/src/crypto/hpke/internal.h +0 -267
  199. data/third_party/boringssl-with-bazel/src/crypto/x509/vpm_int.h +0 -71

data/src/core/ext/filters/client_channel/resolver/google_c2p/google_c2p_resolver.cc

@@ -16,6 +16,8 @@
 
 #include <grpc/support/port_platform.h>
 
+#include <random>
+
 #include "src/core/ext/filters/client_channel/resolver_registry.h"
 #include "src/core/ext/xds/xds_client.h"
 #include "src/core/lib/gpr/env.h"
@@ -195,7 +197,7 @@ void GoogleCloud2ProdResolver::ZoneQuery::OnDone(
       gpr_log(GPR_ERROR, "could not parse zone from metadata server: %s",
               std::string(body).c_str());
     } else {
-      zone = std::string(body.substr(i));
+      zone = std::string(body.substr(i + 1));
     }
   }
   resolver->ZoneQueryDone(std::move(zone));
@@ -297,8 +299,11 @@ void GoogleCloud2ProdResolver::IPv6QueryDone(bool ipv6_supported) {
 
 void GoogleCloud2ProdResolver::StartXdsResolver() {
   // Construct bootstrap JSON.
+  std::random_device rd;
+  std::mt19937 mt(rd());
+  std::uniform_int_distribution<uint64_t> dist(1, UINT64_MAX);
   Json::Object node = {
-      {"id", "C2P"},
+      {"id", absl::StrCat("C2P-", dist(mt))},
   };
   if (!zone_->empty()) {
     node["locality"] = Json::Object{

data/src/core/ext/filters/client_channel/resolver/xds/xds_resolver.cc

@@ -568,6 +568,9 @@ absl::optional<uint64_t> HeaderHashHelper(
   std::string value_buffer;
   absl::optional<absl::string_view> header_value =
       GetHeaderValue(initial_metadata, policy.header_name, &value_buffer);
+  if (!header_value.has_value()) {
+    return absl::nullopt;
+  }
   if (policy.regex != nullptr) {
     // If GetHeaderValue() did not already store the value in
     // value_buffer, copy it there now, so we can modify it.
@@ -671,7 +674,12 @@ ConfigSelector::CallConfig XdsResolver::XdsConfigSelector::GetCallConfig(
   }
   if (!hash.has_value()) {
     // If there is no hash, we just choose a random value as a default.
-    hash = rand();
+    // We cannot directly use the result of rand() as the hash value,
+    // since it is a 32-bit number and not a 64-bit number and will
+    // therefore not be evenly distributed.
+    uint32_t upper = rand();
+    uint32_t lower = rand();
+    hash = (static_cast<uint64_t>(upper) << 32) | lower;
   }
   CallConfig call_config;
   if (method_config != nullptr) {
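
As the new comment in the hunk above explains, a single rand() call covers at most the 32-bit range, so the filter composes two draws to span the 64-bit ring-hash space. The same trick in isolation (illustrative only; note that RAND_MAX is often 2^31 - 1, so each half may still leave its top bit unset):

```cpp
#include <cstdint>
#include <cstdlib>

// Combine two 32-bit pseudo-random draws into one 64-bit value by
// shifting the first into the upper half and OR-ing in the second.
uint64_t Random64() {
  uint32_t upper = static_cast<uint32_t>(rand());
  uint32_t lower = static_cast<uint32_t>(rand());
  return (static_cast<uint64_t>(upper) << 32) | lower;
}
```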
@@ -680,8 +688,12 @@ ConfigSelector::CallConfig XdsResolver::XdsConfigSelector::GetCallConfig(
     call_config.service_config = std::move(method_config);
   }
   call_config.call_attributes[kXdsClusterAttribute] = it->first;
-  call_config.call_attributes[kRequestRingHashAttribute] =
-      absl::StrFormat("%" PRIu64, hash.value());
+  std::string hash_string = absl::StrCat(hash.value());
+  char* hash_value =
+      static_cast<char*>(args.arena->Alloc(hash_string.size() + 1));
+  memcpy(hash_value, hash_string.c_str(), hash_string.size());
+  hash_value[hash_string.size()] = '\0';
+  call_config.call_attributes[kRequestRingHashAttribute] = hash_value;
   call_config.on_call_committed = [resolver, cluster_state]() {
     cluster_state->Unref();
     ExecCtx::Run(
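
The replacement code above copies the formatted hash into call-arena memory and NUL-terminates it, so the attribute can outlive the local std::string that produced it. A stand-alone sketch of the idiom, with a generic allocator callback standing in for the arena's `args.arena->Alloc` (an assumption for illustration, not the gRPC API):

```cpp
#include <cstddef>
#include <cstring>
#include <string>

// Copy a formatted value into caller-provided storage and NUL-terminate
// it explicitly (the allocator is not assumed to zero memory), so the
// resulting char* can safely outlive the local std::string.
const char* StoreAttribute(const std::string& value,
                           void* (*alloc)(size_t bytes)) {
  char* out = static_cast<char*>(alloc(value.size() + 1));
  std::memcpy(out, value.c_str(), value.size());
  out[value.size()] = '\0';
  return out;
}
```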

data/src/core/ext/filters/client_channel/retry_filter.cc

@@ -200,7 +200,6 @@ class RetryFilter::CallData {
   static void SetPollent(grpc_call_element* elem, grpc_polling_entity* pollent);
 
  private:
-  class Canceller;
   class CallStackDestructionBarrier;
 
   // Pending batches stored in call data.
@@ -212,13 +211,10 @@ class RetryFilter::CallData {
   };
 
   // State associated with each call attempt.
-  // Allocated on the arena.
-  class CallAttempt
-      : public RefCounted<CallAttempt, PolymorphicRefCount, kUnrefCallDtor> {
+  class CallAttempt : public RefCounted<CallAttempt> {
    public:
     explicit CallAttempt(CallData* calld);
-
-    ClientChannel::LoadBalancedCall* lb_call() const { return lb_call_.get(); }
+    ~CallAttempt() override;
 
     // Constructs and starts whatever batches are needed on this call
     // attempt.
@@ -228,6 +224,9 @@ class RetryFilter::CallData {
     // committing the call.
     void FreeCachedSendOpDataAfterCommit();
 
+    // Cancels the call attempt.
+    void CancelFromSurface(grpc_transport_stream_op_batch* cancel_batch);
+
    private:
     // State used for starting a retryable batch on the call attempt's LB call.
     // This provides its own grpc_transport_stream_op_batch and other data
@@ -235,7 +234,7 @@ class RetryFilter::CallData {
     // We allocate one struct on the arena for each attempt at starting a
     // batch on a given LB call.
     class BatchData
-        : public RefCounted<CallAttempt, PolymorphicRefCount, kUnrefCallDtor> {
+        : public RefCounted<BatchData, PolymorphicRefCount, kUnrefCallDtor> {
      public:
      BatchData(RefCountedPtr<CallAttempt> call_attempt, int refcount,
                bool set_on_complete);
@@ -243,24 +242,22 @@ class RetryFilter::CallData {
 
      grpc_transport_stream_op_batch* batch() { return &batch_; }
 
-     // Adds retriable send_initial_metadata op to batch_data.
+     // Adds retriable send_initial_metadata op.
      void AddRetriableSendInitialMetadataOp();
-     // Adds retriable send_message op to batch_data.
+     // Adds retriable send_message op.
      void AddRetriableSendMessageOp();
-     // Adds retriable send_trailing_metadata op to batch_data.
+     // Adds retriable send_trailing_metadata op.
      void AddRetriableSendTrailingMetadataOp();
-     // Adds retriable recv_initial_metadata op to batch_data.
+     // Adds retriable recv_initial_metadata op.
      void AddRetriableRecvInitialMetadataOp();
-     // Adds retriable recv_message op to batch_data.
+     // Adds retriable recv_message op.
      void AddRetriableRecvMessageOp();
-     // Adds retriable recv_trailing_metadata op to batch_data.
+     // Adds retriable recv_trailing_metadata op.
      void AddRetriableRecvTrailingMetadataOp();
+     // Adds cancel_stream op.
+     void AddCancelStreamOp();
 
     private:
-     // Returns true if the call is being retried.
-     bool MaybeRetry(grpc_status_code status, grpc_mdelem* server_pushback_md,
-                     bool is_lb_drop);
-
      // Frees cached send ops that were completed by the completed batch in
      // batch_data. Used when batches are completed after the call is
      // committed.
@@ -282,9 +279,9 @@ class RetryFilter::CallData {
      // Adds recv_trailing_metadata_ready closure to closures.
      void AddClosureForRecvTrailingMetadataReady(
          grpc_error_handle error, CallCombinerClosureList* closures);
-     // Adds any necessary closures for deferred recv_initial_metadata and
-     // recv_message callbacks to closures.
-     void AddClosuresForDeferredRecvCallbacks(
+     // Adds any necessary closures for deferred batch completion
+     // callbacks to closures.
+     void AddClosuresForDeferredCompletionCallbacks(
          CallCombinerClosureList* closures);
      // For any pending batch containing an op that has not yet been started,
      // adds the pending batch's completion closures to closures.
@@ -322,7 +319,8 @@ class RetryFilter::CallData {
     // on_complete callback will be set to point to on_complete();
     // otherwise, the batch's on_complete callback will be null.
     BatchData* CreateBatch(int refcount, bool set_on_complete) {
-      return calld_->arena_->New<BatchData>(Ref(), refcount, set_on_complete);
+      return calld_->arena_->New<BatchData>(Ref(DEBUG_LOCATION, "CreateBatch"),
+                                            refcount, set_on_complete);
     }
 
     // If there are any cached send ops that need to be replayed on this
@@ -330,6 +328,11 @@ class RetryFilter::CallData {
     // Otherwise, returns nullptr.
     BatchData* MaybeCreateBatchForReplay();
 
+    // Adds a closure to closures that will execute batch in the call combiner.
+    void AddClosureForBatch(grpc_transport_stream_op_batch* batch,
+                            const char* reason,
+                            CallCombinerClosureList* closures);
+
     // Adds batches for pending batches to closures.
     void AddBatchesForPendingBatches(CallCombinerClosureList* closures);
 
@@ -339,15 +342,41 @@ class RetryFilter::CallData {
     // Returns true if any op in the batch was not yet started on this attempt.
     bool PendingBatchIsUnstarted(PendingBatch* pending);
 
+    // Returns true if there are cached send ops to replay.
+    bool HaveSendOpsToReplay();
+
+    // If our retry state is no longer needed, switch to fast path by moving
+    // our LB call into calld_->committed_call_ and having calld_ drop
+    // its ref to us.
+    void MaybeSwitchToFastPath();
+
     // Helper function used to start a recv_trailing_metadata batch. This
     // is used in the case where a recv_initial_metadata or recv_message
     // op fails in a way that we know the call is over but when the application
     // has not yet started its own recv_trailing_metadata op.
     void StartInternalRecvTrailingMetadata();
 
+    // Returns true if the call should be retried.
+    // If server_pushback_md is non-null, sets *server_pushback_ms.
+    bool ShouldRetry(absl::optional<grpc_status_code> status, bool is_lb_drop,
+                     grpc_mdelem* server_pushback_md,
+                     grpc_millis* server_pushback_ms);
+
+    // Cancels the call attempt. Unrefs any deferred batches.
+    // Adds a batch to closures to cancel this call attempt.
+    void Cancel(CallCombinerClosureList* closures);
+
+    static void OnPerAttemptRecvTimer(void* arg, grpc_error_handle error);
+    static void OnPerAttemptRecvTimerLocked(void* arg, grpc_error_handle error);
+    void MaybeCancelPerAttemptRecvTimer();
+
     CallData* calld_;
     RefCountedPtr<ClientChannel::LoadBalancedCall> lb_call_;
 
+    grpc_timer per_attempt_recv_timer_;
+    grpc_closure on_per_attempt_recv_timer_;
+    bool per_attempt_recv_timer_pending_ = false;
+
     // BatchData.batch.payload points to this.
     grpc_transport_stream_op_batch_payload batch_payload_;
     // For send_initial_metadata.
@@ -389,16 +418,20 @@ class RetryFilter::CallData {
     bool started_recv_trailing_metadata_ : 1;
     bool completed_recv_trailing_metadata_ : 1;
     // State for callback processing.
-    BatchData* recv_initial_metadata_ready_deferred_batch_ = nullptr;
+    RefCountedPtr<BatchData> recv_initial_metadata_ready_deferred_batch_;
     grpc_error_handle recv_initial_metadata_error_ = GRPC_ERROR_NONE;
-    BatchData* recv_message_ready_deferred_batch_ = nullptr;
+    RefCountedPtr<BatchData> recv_message_ready_deferred_batch_;
     grpc_error_handle recv_message_error_ = GRPC_ERROR_NONE;
-    BatchData* recv_trailing_metadata_internal_batch_ = nullptr;
+    RefCountedPtr<BatchData> on_complete_deferred_batch_;
+    grpc_error_handle on_complete_error_ = GRPC_ERROR_NONE;
+    RefCountedPtr<BatchData> recv_trailing_metadata_internal_batch_;
+    grpc_error_handle recv_trailing_metadata_error_ = GRPC_ERROR_NONE;
+    bool seen_recv_trailing_metadata_from_surface_ : 1;
     // NOTE: Do not move this next to the metadata bitfields above. That would
     // save space but will also result in a data race because compiler
     // will generate a 2 byte store which overwrites the meta-data
     // fields upon setting this field.
-    bool retry_dispatched_ : 1;
+    bool cancelled_ : 1;
   };
 
   CallData(RetryFilter* chand, const grpc_call_element_args& args);
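
The NOTE retained in the hunk above reflects the C++ memory model: adjacent bitfields form a single memory location, so an unsynchronized store to one of them is effectively a read-modify-write of its neighbors. A compilable illustration of the layout concern (names are illustrative, not the filter's):

```cpp
// flag_a and flag_b share one memory location: storing flag_b from one
// thread while another thread stores flag_a is a data race. A plain bool
// declared separately occupies its own memory location, which is why
// cancelled_ is kept away from the metadata bitfields above.
struct BitfieldLayout {
  bool flag_a : 1;
  bool flag_b : 1;
  bool separate_flag;  // own byte => own memory location
};
static_assert(sizeof(BitfieldLayout) >= 2,
              "the standalone bool cannot share the bitfields' byte");
```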
@@ -432,18 +465,17 @@ class RetryFilter::CallData {
   // Commits the call so that no further retry attempts will be performed.
   void RetryCommit(CallAttempt* call_attempt);
 
-  // Starts a retry after appropriate back-off.
-  void DoRetry(grpc_millis server_pushback_ms);
+  // Starts a timer to retry after appropriate back-off.
+  // If server_pushback_ms is -1, retry_backoff_ is used.
+  void StartRetryTimer(grpc_millis server_pushback_ms);
+
   static void OnRetryTimer(void* arg, grpc_error_handle error);
+  static void OnRetryTimerLocked(void* arg, grpc_error_handle error);
 
   RefCountedPtr<ClientChannel::LoadBalancedCall> CreateLoadBalancedCall();
 
   void CreateCallAttempt();
 
-  // Adds a closure to closures that will execute batch in the call combiner.
-  void AddClosureForBatch(grpc_transport_stream_op_batch* batch,
-                          CallCombinerClosureList* closures);
-
   RetryFilter* chand_;
   grpc_polling_entity* pollent_;
   RefCountedPtr<ServerRetryThrottleData> retry_throttle_data_;
@@ -465,12 +497,9 @@ class RetryFilter::CallData {
   // gets cancelled.
   RefCountedPtr<CallAttempt> call_attempt_;
 
-  // LB call used when the call is commited before any CallAttempt is
-  // created.
-  // TODO(roth): Change CallAttempt logic such that once we've committed
-  // and all cached send ops have been replayed, we move the LB call
-  // from the CallAttempt here, thus creating a fast path for the
-  // remainder of the streaming call.
+  // LB call used when we've committed to a call attempt and the retry
+  // state for that attempt is no longer needed. This provides a fast
+  // path for long-running streaming calls that minimizes overhead.
   RefCountedPtr<ClientChannel::LoadBalancedCall> committed_call_;
 
   // When are are not yet fully committed to a particular call (i.e.,
@@ -486,23 +515,11 @@ class RetryFilter::CallData {
 
   // Retry state.
   bool retry_committed_ : 1;
-  bool last_attempt_got_server_pushback_ : 1;
+  bool retry_timer_pending_ : 1;
   int num_attempts_completed_ = 0;
-  Mutex timer_mu_;
-  Canceller* canceller_ ABSL_GUARDED_BY(timer_mu_);
-  grpc_timer retry_timer_ ABSL_GUARDED_BY(timer_mu_);
+  grpc_timer retry_timer_;
   grpc_closure retry_closure_;
 
-  // The number of batches containing send ops that are currently in-flight
-  // on any call attempt.
-  // We hold a ref to the call stack while this is non-zero, since replay
-  // batches may not complete until after all callbacks have been returned
-  // to the surface, and we need to make sure that the call is not destroyed
-  // until all of these batches have completed.
-  // Note that we actually only need to track replay batches, but it's
-  // easier to track all batches with send ops.
-  int num_in_flight_call_attempt_send_batches_ = 0;
-
   // Cached data for retrying send ops.
   // send_initial_metadata
   bool seen_send_initial_metadata_ = false;
@@ -513,7 +530,10 @@ class RetryFilter::CallData {
   // have the LB call set a value in CallAttempt and then propagate it
   // from CallAttempt to the parent call when we commit. Otherwise, we
   // may leave this with a value for a peer other than the one we
-  // actually commit to.
+  // actually commit to. Alternatively, maybe see if there's a way to
+  // change the surface API such that the peer isn't available until
+  // after initial metadata is received? (Could even change the
+  // transport API to return this with the recv_initial_metadata op.)
   gpr_atm* peer_string_;
   // send_message
   // When we get a send_message op, we replace the original byte stream
@@ -522,6 +542,10 @@ class RetryFilter::CallData {
   // Note: We inline the cache for the first 3 send_message ops and use
   // dynamic allocation after that. This number was essentially picked
   // at random; it could be changed in the future to tune performance.
+  // TODO(roth): As part of implementing hedging, we may need some
+  // synchronization here, since ByteStreamCache does not provide any
+  // synchronization, so it's not safe to have multiple
+  // CachingByteStreams read from the same ByteStreamCache concurrently.
   absl::InlinedVector<ByteStreamCache*, 3> send_messages_;
   // send_trailing_metadata
   bool seen_send_trailing_metadata_ = false;
@@ -582,52 +606,14 @@ class RetryFilter::CallData::CallStackDestructionBarrier
   grpc_closure* on_call_stack_destruction_ = nullptr;
 };
 
-//
-// RetryFilter::CallData::Canceller
-//
-
-class RetryFilter::CallData::Canceller {
- public:
-  explicit Canceller(CallData* calld) : calld_(calld) {
-    GRPC_CALL_STACK_REF(calld_->owning_call_, "RetryCanceller");
-    GRPC_CLOSURE_INIT(&closure_, &Cancel, this, nullptr);
-    calld_->call_combiner_->SetNotifyOnCancel(&closure_);
-  }
-
- private:
-  static void Cancel(void* arg, grpc_error_handle error) {
-    auto* self = static_cast<Canceller*>(arg);
-    auto* calld = self->calld_;
-    {
-      MutexLock lock(&calld->timer_mu_);
-      if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
-        gpr_log(GPR_INFO,
-                "calld=%p: cancelling retry timer: error=%s self=%p "
-                "calld->canceller_=%p",
-                calld, grpc_error_std_string(error).c_str(), self,
-                calld->canceller_);
-      }
-      if (calld->canceller_ == self && error != GRPC_ERROR_NONE) {
-        calld->canceller_ = nullptr;  // Checked by OnRetryTimer().
-        grpc_timer_cancel(&calld->retry_timer_);
-        calld->FreeAllCachedSendOpData();
-        GRPC_CALL_COMBINER_STOP(calld->call_combiner_, "Canceller");
-      }
-    }
-    GRPC_CALL_STACK_UNREF(calld->owning_call_, "RetryCanceller");
-    delete self;
-  }
-
-  CallData* calld_;
-  grpc_closure closure_;
-};
-
 //
 // RetryFilter::CallData::CallAttempt
 //
 
 RetryFilter::CallData::CallAttempt::CallAttempt(CallData* calld)
-    : calld_(calld),
+    : RefCounted(GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace) ? "CallAttempt"
+                                                           : nullptr),
+      calld_(calld),
       batch_payload_(calld->call_context_),
       started_send_initial_metadata_(false),
       completed_send_initial_metadata_(false),
@@ -637,12 +623,42 @@ RetryFilter::CallData::CallAttempt::CallAttempt(CallData* calld)
       completed_recv_initial_metadata_(false),
       started_recv_trailing_metadata_(false),
       completed_recv_trailing_metadata_(false),
-      retry_dispatched_(false) {
+      seen_recv_trailing_metadata_from_surface_(false),
+      cancelled_(false) {
   lb_call_ = calld->CreateLoadBalancedCall();
   if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
-    gpr_log(GPR_INFO, "chand=%p calld=%p: attempt=%p: create lb_call=%p",
+    gpr_log(GPR_INFO, "chand=%p calld=%p attempt=%p: create lb_call=%p",
            calld->chand_, calld, this, lb_call_.get());
   }
+  // If per_attempt_recv_timeout is set, start a timer.
+  if (calld->retry_policy_ != nullptr &&
+      calld->retry_policy_->per_attempt_recv_timeout().has_value()) {
+    grpc_millis per_attempt_recv_deadline =
+        ExecCtx::Get()->Now() +
+        *calld->retry_policy_->per_attempt_recv_timeout();
+    if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
+      gpr_log(GPR_INFO,
+              "chand=%p calld=%p attempt=%p: per-attempt timeout in %" PRId64
+              " ms",
+              calld->chand_, calld, this,
+              *calld->retry_policy_->per_attempt_recv_timeout());
+    }
+    // Schedule retry after computed delay.
+    GRPC_CLOSURE_INIT(&on_per_attempt_recv_timer_, OnPerAttemptRecvTimer, this,
+                      nullptr);
+    GRPC_CALL_STACK_REF(calld->owning_call_, "OnPerAttemptRecvTimer");
+    Ref(DEBUG_LOCATION, "OnPerAttemptRecvTimer").release();
+    per_attempt_recv_timer_pending_ = true;
+    grpc_timer_init(&per_attempt_recv_timer_, per_attempt_recv_deadline,
+                    &on_per_attempt_recv_timer_);
+  }
+}
+
+RetryFilter::CallData::CallAttempt::~CallAttempt() {
+  if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
+    gpr_log(GPR_INFO, "chand=%p calld=%p attempt=%p: destroying call attempt",
+            calld_->chand_, calld_, this);
+  }
 }
 
 void RetryFilter::CallData::CallAttempt::FreeCachedSendOpDataAfterCommit() {
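
The constructor above takes both a call-stack ref and a CallAttempt ref before arming the per-attempt timer, so nothing the timer callback touches can be destroyed while the timer is pending; OnPerAttemptRecvTimer later releases both. A standard-library analogue of that ownership pattern (shared_ptr standing in for gRPC's manual ref-counting, a detached thread for the timer):

```cpp
#include <chrono>
#include <memory>
#include <thread>

// Hold a strong reference for the lifetime of an asynchronous timer so
// the target outlives its own callback; the reference is dropped when
// the callback returns (the analogue of Unref() in the timer handler).
struct Attempt : std::enable_shared_from_this<Attempt> {
  void ArmTimer(std::chrono::milliseconds delay) {
    std::thread([self = shared_from_this(), delay] {
      std::this_thread::sleep_for(delay);
      self->OnTimer();  // 'self' keeps the Attempt alive until here
    }).detach();
  }
  void OnTimer() { /* cancel the attempt and decide whether to retry */ }
};
// Usage: std::make_shared<Attempt>()->ArmTimer(std::chrono::milliseconds(100));
```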
@@ -683,12 +699,48 @@ bool RetryFilter::CallData::CallAttempt::PendingBatchIsUnstarted(
   return false;
 }
 
+bool RetryFilter::CallData::CallAttempt::HaveSendOpsToReplay() {
+  // We don't check send_initial_metadata here, because that op will always
+  // be started as soon as it is received from the surface, so it will
+  // never need to be started at this point.
+  return started_send_message_count_ < calld_->send_messages_.size() ||
+         (calld_->seen_send_trailing_metadata_ &&
+          !started_send_trailing_metadata_);
+}
+
+void RetryFilter::CallData::CallAttempt::MaybeSwitchToFastPath() {
+  // If we're not yet committed, we can't switch yet.
+  // TODO(roth): As part of implementing hedging, this logic needs to
+  // check that *this* call attempt is the one that we've committed to.
+  // Might need to replace cancelled_ with an enum indicating whether we're
+  // in flight, cancelled, or the winning call attempt.
+  if (!calld_->retry_committed_) return;
+  // If we've already switched to fast path, there's nothing to do here.
+  if (calld_->committed_call_ != nullptr) return;
+  // If the perAttemptRecvTimeout timer is pending, we can't switch yet.
+  if (per_attempt_recv_timer_pending_) return;
+  // If there are still send ops to replay, we can't switch yet.
+  if (HaveSendOpsToReplay()) return;
+  // If we started an internal batch for recv_trailing_metadata but have not
+  // yet seen that op from the surface, we can't switch yet.
+  if (recv_trailing_metadata_internal_batch_ != nullptr) return;
+  // Switch to fast path.
+  if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
+    gpr_log(GPR_INFO,
+            "chand=%p calld=%p attempt=%p: retry state no longer needed; "
+            "moving LB call to parent and unreffing the call attempt",
+            calld_->chand_, calld_, this);
+  }
+  calld_->committed_call_ = std::move(lb_call_);
+  calld_->call_attempt_.reset(DEBUG_LOCATION, "MaybeSwitchToFastPath");
+}
+
 void RetryFilter::CallData::CallAttempt::StartInternalRecvTrailingMetadata() {
   if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
     gpr_log(GPR_INFO,
-            "chand=%p calld=%p: call failed but recv_trailing_metadata not "
-            "started; starting it internally",
-            calld_->chand_, calld_);
+            "chand=%p calld=%p attempt=%p: call failed but "
+            "recv_trailing_metadata not started; starting it internally",
+            calld_->chand_, calld_, this);
   }
   // Create batch_data with 2 refs, since this batch will be unreffed twice:
   // once for the recv_trailing_metadata_ready callback when the batch
@@ -696,7 +748,7 @@ void RetryFilter::CallData::CallAttempt::StartInternalRecvTrailingMetadata() {
   // op from the surface.
   BatchData* batch_data = CreateBatch(2, false /* set_on_complete */);
   batch_data->AddRetriableRecvTrailingMetadataOp();
-  recv_trailing_metadata_internal_batch_ = batch_data;
+  recv_trailing_metadata_internal_batch_.reset(batch_data);
   // Note: This will release the call combiner.
   lb_call_->StartTransportStreamOpBatch(batch_data->batch());
 }
@@ -712,9 +764,9 @@ RetryFilter::CallData::CallAttempt::MaybeCreateBatchForReplay() {
       !calld_->pending_send_initial_metadata_) {
     if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
       gpr_log(GPR_INFO,
-              "chand=%p calld=%p: replaying previously completed "
+              "chand=%p calld=%p attempt=%p: replaying previously completed "
               "send_initial_metadata op",
-              calld_->chand_, calld_);
+              calld_->chand_, calld_, this);
     }
     replay_batch_data = CreateBatch(1, true /* set_on_complete */);
     replay_batch_data->AddRetriableSendInitialMetadataOp();
@@ -726,9 +778,9 @@ RetryFilter::CallData::CallAttempt::MaybeCreateBatchForReplay() {
       !calld_->pending_send_message_) {
     if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
       gpr_log(GPR_INFO,
-              "chand=%p calld=%p: replaying previously completed "
+              "chand=%p calld=%p attempt=%p: replaying previously completed "
              "send_message op",
-              calld_->chand_, calld_);
+              calld_->chand_, calld_, this);
     }
     if (replay_batch_data == nullptr) {
       replay_batch_data = CreateBatch(1, true /* set_on_complete */);
@@ -745,9 +797,9 @@ RetryFilter::CallData::CallAttempt::MaybeCreateBatchForReplay() {
       !calld_->pending_send_trailing_metadata_) {
     if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
       gpr_log(GPR_INFO,
-              "chand=%p calld=%p: replaying previously completed "
+              "chand=%p calld=%p attempt=%p: replaying previously completed "
              "send_trailing_metadata op",
-              calld_->chand_, calld_);
+              calld_->chand_, calld_, this);
     }
     if (replay_batch_data == nullptr) {
       replay_batch_data = CreateBatch(1, true /* set_on_complete */);
@@ -757,6 +809,33 @@ RetryFilter::CallData::CallAttempt::MaybeCreateBatchForReplay() {
   return replay_batch_data;
 }
 
+namespace {
+
+void StartBatchInCallCombiner(void* arg, grpc_error_handle /*ignored*/) {
+  grpc_transport_stream_op_batch* batch =
+      static_cast<grpc_transport_stream_op_batch*>(arg);
+  auto* lb_call = static_cast<ClientChannel::LoadBalancedCall*>(
+      batch->handler_private.extra_arg);
+  // Note: This will release the call combiner.
+  lb_call->StartTransportStreamOpBatch(batch);
+}
+
+}  // namespace
+
+void RetryFilter::CallData::CallAttempt::AddClosureForBatch(
+    grpc_transport_stream_op_batch* batch, const char* reason,
+    CallCombinerClosureList* closures) {
+  if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
+    gpr_log(GPR_INFO, "chand=%p calld=%p attempt=%p: adding batch (%s): %s",
+            calld_->chand_, calld_, this, reason,
+            grpc_transport_stream_op_batch_string(batch).c_str());
+  }
+  batch->handler_private.extra_arg = lb_call_.get();
+  GRPC_CLOSURE_INIT(&batch->handler_private.closure, StartBatchInCallCombiner,
+                    batch, grpc_schedule_on_exec_ctx);
+  closures->Add(&batch->handler_private.closure, GRPC_ERROR_NONE, reason);
+}
+
 void RetryFilter::CallData::CallAttempt::AddBatchesForPendingBatches(
     CallCombinerClosureList* closures) {
   for (size_t i = 0; i < GPR_ARRAY_SIZE(calld_->pending_batches_); ++i) {
@@ -797,6 +876,7 @@ void RetryFilter::CallData::CallAttempt::AddBatchesForPendingBatches(
       continue;
     }
     if (batch->recv_trailing_metadata && started_recv_trailing_metadata_) {
+      seen_recv_trailing_metadata_from_surface_ = true;
       // If we previously completed a recv_trailing_metadata op
       // initiated by StartInternalRecvTrailingMetadata(), use the
       // result of that instead of trying to re-start this op.
@@ -806,21 +886,30 @@ void RetryFilter::CallData::CallAttempt::AddBatchesForPendingBatches(
       // the application. Otherwise, just unref the internally started
       // batch, since we'll propagate the completion when it completes.
       if (completed_recv_trailing_metadata_) {
-        // Batches containing recv_trailing_metadata always succeed.
         closures->Add(
-            &recv_trailing_metadata_ready_, GRPC_ERROR_NONE,
+            &recv_trailing_metadata_ready_, recv_trailing_metadata_error_,
             "re-executing recv_trailing_metadata_ready to propagate "
             "internally triggered result");
+        // Ref will be released by callback.
+        recv_trailing_metadata_internal_batch_.release();
       } else {
-        recv_trailing_metadata_internal_batch_->Unref();
+        recv_trailing_metadata_internal_batch_.reset(
+            DEBUG_LOCATION,
+            "internally started recv_trailing_metadata batch pending and "
+            "recv_trailing_metadata started from surface");
+        GRPC_ERROR_UNREF(recv_trailing_metadata_error_);
       }
-      recv_trailing_metadata_internal_batch_ = nullptr;
+      recv_trailing_metadata_error_ = GRPC_ERROR_NONE;
     }
     continue;
   }
-  // If we're already committed, just send the batch as-is.
-  if (calld_->retry_committed_) {
-    calld_->AddClosureForBatch(batch, closures);
+  // If we're already committed and these send ops aren't cached, just send
+  // the batch as-is.
+  if (calld_->retry_committed_ && !pending->send_ops_cached) {
+    AddClosureForBatch(
+        batch,
+        "start non-replayable pending batch on call attempt after commit",
+        closures);
     calld_->PendingBatchClear(pending);
     continue;
   }
@@ -831,7 +920,7 @@ void RetryFilter::CallData::CallAttempt::AddBatchesForPendingBatches(
     const int num_callbacks = has_send_ops + batch->recv_initial_metadata +
                               batch->recv_message +
                               batch->recv_trailing_metadata;
-    CallAttempt::BatchData* batch_data =
+    BatchData* batch_data =
        CreateBatch(num_callbacks, has_send_ops /* set_on_complete */);
     // Cache send ops if needed.
     calld_->MaybeCacheSendOpsForBatch(pending);
@@ -861,16 +950,9 @@ void RetryFilter::CallData::CallAttempt::AddBatchesForPendingBatches(
     if (batch->recv_trailing_metadata) {
       batch_data->AddRetriableRecvTrailingMetadataOp();
     }
-    calld_->AddClosureForBatch(batch_data->batch(), closures);
-    // Track number of in-flight send batches.
-    // If this is the first one, take a ref to the call stack.
-    if (batch->send_initial_metadata || batch->send_message ||
-        batch->send_trailing_metadata) {
-      if (calld_->num_in_flight_call_attempt_send_batches_ == 0) {
-        GRPC_CALL_STACK_REF(calld_->owning_call_, "retriable_send_batches");
-      }
-      ++calld_->num_in_flight_call_attempt_send_batches_;
-    }
+    AddClosureForBatch(batch_data->batch(),
+                       "start replayable pending batch on call attempt",
+                       closures);
   }
 }
 
@@ -879,13 +961,8 @@ void RetryFilter::CallData::CallAttempt::AddRetriableBatches(
   // Replay previously-returned send_* ops if needed.
   BatchData* replay_batch_data = MaybeCreateBatchForReplay();
   if (replay_batch_data != nullptr) {
-    calld_->AddClosureForBatch(replay_batch_data->batch(), closures);
-    // Track number of pending send batches.
-    // If this is the first one, take a ref to the call stack.
-    if (calld_->num_in_flight_call_attempt_send_batches_ == 0) {
-      GRPC_CALL_STACK_REF(calld_->owning_call_, "retriable_send_batches");
-    }
-    ++calld_->num_in_flight_call_attempt_send_batches_;
+    AddClosureForBatch(replay_batch_data->batch(),
+                       "start replay batch on call attempt", closures);
   }
   // Now add pending batches.
   AddBatchesForPendingBatches(closures);
@@ -893,8 +970,9 @@ void RetryFilter::CallData::CallAttempt::AddRetriableBatches(
 
 void RetryFilter::CallData::CallAttempt::StartRetriableBatches() {
   if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
-    gpr_log(GPR_INFO, "chand=%p calld=%p: constructing retriable batches",
-            calld_->chand_, calld_);
+    gpr_log(GPR_INFO,
+            "chand=%p calld=%p attempt=%p: constructing retriable batches",
+            calld_->chand_, calld_, this);
   }
   // Construct list of closures to execute, one for each pending batch.
   CallCombinerClosureList closures;
@@ -903,28 +981,235 @@ void RetryFilter::CallData::CallAttempt::StartRetriableBatches() {
   // Start batches on LB call.
   if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
     gpr_log(GPR_INFO,
-            "chand=%p calld=%p: starting %" PRIuPTR
+            "chand=%p calld=%p attempt=%p: starting %" PRIuPTR
             " retriable batches on lb_call=%p",
-            calld_->chand_, calld_, closures.size(), lb_call());
+            calld_->chand_, calld_, this, closures.size(), lb_call_.get());
   }
   closures.RunClosures(calld_->call_combiner_);
 }
 
+void RetryFilter::CallData::CallAttempt::CancelFromSurface(
+    grpc_transport_stream_op_batch* cancel_batch) {
+  MaybeCancelPerAttemptRecvTimer();
+  // Propagate cancellation to LB call.
+  lb_call_->StartTransportStreamOpBatch(cancel_batch);
+}
+
+bool RetryFilter::CallData::CallAttempt::ShouldRetry(
+    absl::optional<grpc_status_code> status, bool is_lb_drop,
+    grpc_mdelem* server_pushback_md, grpc_millis* server_pushback_ms) {
+  // LB drops always inhibit retries.
+  if (is_lb_drop) return false;
+  // TODO(roth): Handle transparent retries here.
+  // If no retry policy, don't retry.
+  if (calld_->retry_policy_ == nullptr) return false;
+  // Check status.
+  if (status.has_value()) {
+    if (GPR_LIKELY(*status == GRPC_STATUS_OK)) {
+      if (calld_->retry_throttle_data_ != nullptr) {
+        calld_->retry_throttle_data_->RecordSuccess();
+      }
+      if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
+        gpr_log(GPR_INFO, "chand=%p calld=%p attempt=%p: call succeeded",
+                calld_->chand_, calld_, this);
+      }
+      return false;
+    }
+    // Status is not OK. Check whether the status is retryable.
+    if (!calld_->retry_policy_->retryable_status_codes().Contains(*status)) {
+      if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
+        gpr_log(GPR_INFO,
+                "chand=%p calld=%p attempt=%p: status %s not configured as "
+                "retryable",
+                calld_->chand_, calld_, this,
+                grpc_status_code_to_string(*status));
+      }
+      return false;
+    }
+  }
+  // Record the failure and check whether retries are throttled.
+  // Note that it's important for this check to come after the status
+  // code check above, since we should only record failures whose statuses
+  // match the configured retryable status codes, so that we don't count
+  // things like failures due to malformed requests (INVALID_ARGUMENT).
+  // Conversely, it's important for this to come before the remaining
+  // checks, so that we don't fail to record failures due to other factors.
+  if (calld_->retry_throttle_data_ != nullptr &&
+      !calld_->retry_throttle_data_->RecordFailure()) {
+    if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
+      gpr_log(GPR_INFO, "chand=%p calld=%p attempt=%p: retries throttled",
+              calld_->chand_, calld_, this);
+    }
+    return false;
+  }
+  // Check whether the call is committed.
+  if (calld_->retry_committed_) {
+    if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
+      gpr_log(GPR_INFO,
+              "chand=%p calld=%p attempt=%p: retries already committed",
+              calld_->chand_, calld_, this);
+    }
+    return false;
+  }
+  // Check whether we have retries remaining.
+  ++calld_->num_attempts_completed_;
+  if (calld_->num_attempts_completed_ >=
+      calld_->retry_policy_->max_attempts()) {
+    if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
+      gpr_log(
+          GPR_INFO, "chand=%p calld=%p attempt=%p: exceeded %d retry attempts",
+          calld_->chand_, calld_, this, calld_->retry_policy_->max_attempts());
+    }
+    return false;
+  }
+  // Check server push-back.
+  if (server_pushback_md != nullptr) {
+    // If the value is "-1" or any other unparseable string, we do not retry.
+    uint32_t ms;
+    if (!grpc_parse_slice_to_uint32(GRPC_MDVALUE(*server_pushback_md), &ms)) {
+      if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
+        gpr_log(GPR_INFO,
+                "chand=%p calld=%p attempt=%p: not retrying due to server "
+                "push-back",
+                calld_->chand_, calld_, this);
+      }
+      return false;
+    } else {
+      if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
+        gpr_log(
+            GPR_INFO,
+            "chand=%p calld=%p attempt=%p: server push-back: retry in %u ms",
+            calld_->chand_, calld_, this, ms);
+      }
+      *server_pushback_ms = static_cast<grpc_millis>(ms);
+    }
+  }
+  // We should retry.
+  return true;
+}
+
+void RetryFilter::CallData::CallAttempt::Cancel(
+    CallCombinerClosureList* closures) {
+  // Record that this attempt has been cancelled.
+  cancelled_ = true;
+  // Unref batches for deferred completion callbacks that will now never
+  // be invoked.
+  if (started_recv_trailing_metadata_ &&
+      !seen_recv_trailing_metadata_from_surface_) {
+    recv_trailing_metadata_internal_batch_.reset(
+        DEBUG_LOCATION,
+        "internal recv_trailing_metadata completed before that op was "
+        "started from the surface");
+  }
+  GRPC_ERROR_UNREF(recv_trailing_metadata_error_);
+  recv_trailing_metadata_error_ = GRPC_ERROR_NONE;
+  recv_initial_metadata_ready_deferred_batch_.reset(
+      DEBUG_LOCATION,
+      "unref deferred recv_initial_metadata_ready batch due to retry");
+  GRPC_ERROR_UNREF(recv_initial_metadata_error_);
+  recv_initial_metadata_error_ = GRPC_ERROR_NONE;
+  recv_message_ready_deferred_batch_.reset(
+      DEBUG_LOCATION, "unref deferred recv_message_ready batch due to retry");
+  GRPC_ERROR_UNREF(recv_message_error_);
+  recv_message_error_ = GRPC_ERROR_NONE;
+  on_complete_deferred_batch_.reset(
+      DEBUG_LOCATION, "unref deferred on_complete batch due to retry");
+  GRPC_ERROR_UNREF(on_complete_error_);
+  on_complete_error_ = GRPC_ERROR_NONE;
+  // Start a cancellation op on this call attempt to make sure the
+  // transport knows that this call should be cleaned up, even if it
+  // hasn't received any ops.
+  BatchData* cancel_batch_data = CreateBatch(1, /*set_on_complete=*/true);
+  cancel_batch_data->AddCancelStreamOp();
+  AddClosureForBatch(cancel_batch_data->batch(),
+                     "start cancellation batch on call attempt", closures);
+}
+
+void RetryFilter::CallData::CallAttempt::OnPerAttemptRecvTimer(
+    void* arg, grpc_error_handle error) {
+  auto* call_attempt = static_cast<CallAttempt*>(arg);
+  GRPC_CLOSURE_INIT(&call_attempt->on_per_attempt_recv_timer_,
+                    OnPerAttemptRecvTimerLocked, call_attempt, nullptr);
+  GRPC_CALL_COMBINER_START(call_attempt->calld_->call_combiner_,
+                           &call_attempt->on_per_attempt_recv_timer_,
+                           GRPC_ERROR_REF(error), "per-attempt timer fired");
+}
+
+void RetryFilter::CallData::CallAttempt::OnPerAttemptRecvTimerLocked(
+    void* arg, grpc_error_handle error) {
+  auto* call_attempt = static_cast<CallAttempt*>(arg);
+  auto* calld = call_attempt->calld_;
+  if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
+    gpr_log(GPR_INFO,
+            "chand=%p calld=%p attempt=%p: perAttemptRecvTimeout timer fired: "
+            "error=%s, per_attempt_recv_timer_pending_=%d",
+            calld->chand_, calld, call_attempt,
+            grpc_error_std_string(error).c_str(),
+            call_attempt->per_attempt_recv_timer_pending_);
+  }
+  CallCombinerClosureList closures;
+  if (error == GRPC_ERROR_NONE &&
+      call_attempt->per_attempt_recv_timer_pending_) {
+    call_attempt->per_attempt_recv_timer_pending_ = false;
+    // Cancel this attempt.
+    // TODO(roth): When implementing hedging, we should not cancel the
+    // current attempt.
+    call_attempt->Cancel(&closures);
+    // Check whether we should retry.
+    if (call_attempt->ShouldRetry(
+            /*status=*/absl::nullopt, /*is_lb_drop=*/false,
+            /*server_pushback_md=*/nullptr, /*server_pushback_ms=*/nullptr)) {
+      // We are retrying. Start backoff timer.
+      calld->StartRetryTimer(/*server_pushback_ms=*/-1);
+    } else {
+      // Not retrying, so commit the call.
+      calld->RetryCommit(call_attempt);
+      // If retry state is no longer needed, switch to fast path for
+      // subsequent batches.
+      call_attempt->MaybeSwitchToFastPath();
+    }
+  }
+  closures.RunClosures(calld->call_combiner_);
+  call_attempt->Unref(DEBUG_LOCATION, "OnPerAttemptRecvTimer");
+  GRPC_CALL_STACK_UNREF(calld->owning_call_, "OnPerAttemptRecvTimer");
+}
+
+void RetryFilter::CallData::CallAttempt::MaybeCancelPerAttemptRecvTimer() {
+  if (per_attempt_recv_timer_pending_) {
+    if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
+      gpr_log(GPR_INFO,
+              "chand=%p calld=%p attempt=%p: cancelling "
+              "perAttemptRecvTimeout timer",
+              calld_->chand_, calld_, this);
+    }
+    per_attempt_recv_timer_pending_ = false;
+    grpc_timer_cancel(&per_attempt_recv_timer_);
+  }
+}
+
 //
 // RetryFilter::CallData::CallAttempt::BatchData
 //
 
 RetryFilter::CallData::CallAttempt::BatchData::BatchData(
     RefCountedPtr<CallAttempt> attempt, int refcount, bool set_on_complete)
-    : RefCounted(nullptr, refcount), call_attempt_(std::move(attempt)) {
-  // TODO(roth): Consider holding this ref on the call stack in
-  // CallAttempt instead of here in BatchData. This would eliminate the
-  // need for CallData::num_in_flight_call_attempt_send_batches_.
-  // But it would require having a way to unref CallAttempt when it is
-  // no longer needed (i.e., when the call is committed and all cached
-  // send ops have been replayed and the LB call is moved into
-  // CallData::committed_call_).
-  GRPC_CALL_STACK_REF(call_attempt_->calld_->owning_call_, "CallAttempt");
+    : RefCounted(
+          GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace) ? "BatchData" : nullptr,
+          refcount),
+      call_attempt_(std::move(attempt)) {
+  if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
+    gpr_log(GPR_INFO, "chand=%p calld=%p attempt=%p: creating batch %p",
+            call_attempt_->calld_->chand_, call_attempt_->calld_,
+            call_attempt_.get(), this);
+  }
+  // We hold a ref to the call stack for every batch sent on a call attempt.
+  // This is because some batches on the call attempt may not complete
+  // until after all of the batches are completed at the surface (because
+  // each batch that is pending at the surface holds a ref). This
+  // can happen for replayed send ops, and it can happen for
+  // recv_initial_metadata and recv_message ops on a call attempt that has
+  // been abandoned.
+  GRPC_CALL_STACK_REF(call_attempt_->calld_->owning_call_, "Retry BatchData");
   batch_.payload = &call_attempt_->batch_payload_;
   if (set_on_complete) {
     GRPC_CLOSURE_INIT(&on_complete_, OnComplete, this,
@@ -934,6 +1219,11 @@ RetryFilter::CallData::CallAttempt::BatchData::BatchData(
934
1219
  }
935
1220
 
936
1221
  RetryFilter::CallData::CallAttempt::BatchData::~BatchData() {
1222
+ if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
1223
+ gpr_log(GPR_INFO, "chand=%p calld=%p attempt=%p: destroying batch %p",
1224
+ call_attempt_->calld_->chand_, call_attempt_->calld_,
1225
+ call_attempt_.get(), this);
1226
+ }
937
1227
  if (batch_.send_initial_metadata) {
938
1228
  grpc_metadata_batch_destroy(&call_attempt_->send_initial_metadata_);
939
1229
  }
@@ -946,7 +1236,8 @@ RetryFilter::CallData::CallAttempt::BatchData::~BatchData() {
946
1236
  if (batch_.recv_trailing_metadata) {
947
1237
  grpc_metadata_batch_destroy(&call_attempt_->recv_trailing_metadata_);
948
1238
  }
949
- GRPC_CALL_STACK_UNREF(call_attempt_->calld_->owning_call_, "CallAttempt");
1239
+ GRPC_CALL_STACK_UNREF(call_attempt_->calld_->owning_call_, "Retry BatchData");
1240
+ call_attempt_.reset(DEBUG_LOCATION, "~BatchData");
950
1241
  }
 
  void RetryFilter::CallData::CallAttempt::BatchData::
@@ -968,108 +1259,13 @@ void RetryFilter::CallData::CallAttempt::BatchData::
  }
  }
 
- bool RetryFilter::CallData::CallAttempt::BatchData::MaybeRetry(
- grpc_status_code status, grpc_mdelem* server_pushback_md, bool is_lb_drop) {
- auto* calld = call_attempt_->calld_;
- // LB drops always inhibit retries.
- if (is_lb_drop) return false;
- // Get retry policy.
- if (calld->retry_policy_ == nullptr) return false;
- // If we've already dispatched a retry from this call, return true.
- // This catches the case where the batch has multiple callbacks
- // (i.e., it includes either recv_message or recv_initial_metadata).
- if (call_attempt_->retry_dispatched_) {
- if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
- gpr_log(GPR_INFO, "chand=%p calld=%p: retry already dispatched",
- calld->chand_, calld);
- }
- return true;
- }
- // Check status.
- if (GPR_LIKELY(status == GRPC_STATUS_OK)) {
- if (calld->retry_throttle_data_ != nullptr) {
- calld->retry_throttle_data_->RecordSuccess();
- }
- if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
- gpr_log(GPR_INFO, "chand=%p calld=%p: call succeeded", calld->chand_,
- calld);
- }
- return false;
- }
- // Status is not OK. Check whether the status is retryable.
- if (!calld->retry_policy_->retryable_status_codes().Contains(status)) {
- if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
- gpr_log(GPR_INFO,
- "chand=%p calld=%p: status %s not configured as retryable",
- calld->chand_, calld, grpc_status_code_to_string(status));
- }
- return false;
- }
- // Record the failure and check whether retries are throttled.
- // Note that it's important for this check to come after the status
- // code check above, since we should only record failures whose statuses
- // match the configured retryable status codes, so that we don't count
- // things like failures due to malformed requests (INVALID_ARGUMENT).
- // Conversely, it's important for this to come before the remaining
- // checks, so that we don't fail to record failures due to other factors.
- if (calld->retry_throttle_data_ != nullptr &&
- !calld->retry_throttle_data_->RecordFailure()) {
- if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
- gpr_log(GPR_INFO, "chand=%p calld=%p: retries throttled", calld->chand_,
- calld);
- }
- return false;
- }
- // Check whether the call is committed.
- if (calld->retry_committed_) {
- if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
- gpr_log(GPR_INFO, "chand=%p calld=%p: retries already committed",
- calld->chand_, calld);
- }
- return false;
- }
- // Check whether we have retries remaining.
- ++calld->num_attempts_completed_;
- if (calld->num_attempts_completed_ >= calld->retry_policy_->max_attempts()) {
- if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
- gpr_log(GPR_INFO, "chand=%p calld=%p: exceeded %d retry attempts",
- calld->chand_, calld, calld->retry_policy_->max_attempts());
- }
- return false;
- }
- // Check server push-back.
- grpc_millis server_pushback_ms = -1;
- if (server_pushback_md != nullptr) {
- // If the value is "-1" or any other unparseable string, we do not retry.
- uint32_t ms;
- if (!grpc_parse_slice_to_uint32(GRPC_MDVALUE(*server_pushback_md), &ms)) {
- if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
- gpr_log(GPR_INFO,
- "chand=%p calld=%p: not retrying due to server push-back",
- calld->chand_, calld);
- }
- return false;
- } else {
- if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
- gpr_log(GPR_INFO, "chand=%p calld=%p: server push-back: retry in %u ms",
- calld->chand_, calld, ms);
- }
- server_pushback_ms = static_cast<grpc_millis>(ms);
- }
- }
- // Do retry.
- call_attempt_->retry_dispatched_ = true;
- calld->DoRetry(server_pushback_ms);
- return true;
- }
-
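The block removed above is not lost: the decision logic moves from BatchData::MaybeRetry() into CallAttempt::ShouldRetry(), which takes an absl::optional status so the perAttemptRecvTimeout path (see the timer callback earlier in this diff) can reuse it without a status code. A condensed sketch of the ordering of those checks, using std::optional and simplified types rather than the real grpc_core ones:

    #include <cstdint>
    #include <optional>
    #include <set>

    struct RetryPolicy {
      int max_attempts;
      std::set<int> retryable_status_codes;  // grpc_status_code values
    };

    // Order matters: the throttle bookkeeping (elided here) must run after
    // the retryable-status check but before the remaining checks, exactly
    // as the comments in the removed code explain.
    bool ShouldRetry(const RetryPolicy* policy, bool is_lb_drop,
                     std::optional<int> status, bool committed,
                     int num_attempts_completed,
                     std::optional<int64_t> server_pushback_ms) {
      if (is_lb_drop) return false;         // LB drops always inhibit retries.
      if (policy == nullptr) return false;  // No retry policy configured.
      if (status.has_value()) {
        if (*status == 0 /*GRPC_STATUS_OK*/) return false;  // Call succeeded.
        if (policy->retryable_status_codes.count(*status) == 0) return false;
      }
      if (committed) return false;  // Retries already committed.
      if (num_attempts_completed + 1 >= policy->max_attempts) return false;
      // An absent push-back means "use normal backoff"; a negative or
      // unparseable one means "do not retry at all".
      if (server_pushback_ms.has_value() && *server_pushback_ms < 0) return false;
      return true;
    }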
  //
  // recv_initial_metadata callback handling
  //
 
  void RetryFilter::CallData::CallAttempt::BatchData::
  InvokeRecvInitialMetadataCallback(void* arg, grpc_error_handle error) {
- auto* batch_data = static_cast<CallAttempt::BatchData*>(arg);
+ auto* batch_data = static_cast<BatchData*>(arg);
  auto* call_attempt = batch_data->call_attempt_.get();
  // Find pending batch.
  PendingBatch* pending = call_attempt->calld_->PendingBatchFind(
@@ -1101,24 +1297,27 @@ void RetryFilter::CallData::CallAttempt::BatchData::
 
  void RetryFilter::CallData::CallAttempt::BatchData::RecvInitialMetadataReady(
  void* arg, grpc_error_handle error) {
- CallAttempt::BatchData* batch_data =
- static_cast<CallAttempt::BatchData*>(arg);
+ RefCountedPtr<BatchData> batch_data(static_cast<BatchData*>(arg));
  CallAttempt* call_attempt = batch_data->call_attempt_.get();
  CallData* calld = call_attempt->calld_;
  if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
  gpr_log(GPR_INFO,
- "chand=%p calld=%p: got recv_initial_metadata_ready, error=%s",
- calld->chand_, calld, grpc_error_std_string(error).c_str());
+ "chand=%p calld=%p attempt=%p: got recv_initial_metadata_ready, "
+ "error=%s",
+ calld->chand_, calld, call_attempt,
+ grpc_error_std_string(error).c_str());
  }
  call_attempt->completed_recv_initial_metadata_ = true;
- // If a retry was already dispatched, then we're not going to use the
+ // If this attempt has been cancelled, then we're not going to use the
  // result of this recv_initial_metadata op, so do nothing.
- if (call_attempt->retry_dispatched_) {
- GRPC_CALL_COMBINER_STOP(
- calld->call_combiner_,
- "recv_initial_metadata_ready after retry dispatched");
+ if (call_attempt->cancelled_) {
+ GRPC_CALL_COMBINER_STOP(calld->call_combiner_,
+ "recv_initial_metadata_ready after cancellation");
  return;
  }
+ // Cancel per-attempt recv timer, if any.
+ call_attempt->MaybeCancelPerAttemptRecvTimer();
+ // If we're not committed, check the response to see if we need to commit.
  if (!calld->retry_committed_) {
  // If we got an error or a Trailers-Only response and have not yet gotten
  // the recv_trailing_metadata_ready callback, then defer propagating this
@@ -1129,11 +1328,12 @@ void RetryFilter::CallData::CallAttempt::BatchData::RecvInitialMetadataReady(
  !call_attempt->completed_recv_trailing_metadata_)) {
  if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
  gpr_log(GPR_INFO,
- "chand=%p calld=%p: deferring recv_initial_metadata_ready "
- "(Trailers-Only)",
- calld->chand_, calld);
+ "chand=%p calld=%p attempt=%p: deferring "
+ "recv_initial_metadata_ready (Trailers-Only)",
+ calld->chand_, calld, call_attempt);
  }
- call_attempt->recv_initial_metadata_ready_deferred_batch_ = batch_data;
+ call_attempt->recv_initial_metadata_ready_deferred_batch_ =
+ std::move(batch_data);
  call_attempt->recv_initial_metadata_error_ = GRPC_ERROR_REF(error);
  if (!call_attempt->started_recv_trailing_metadata_) {
  // recv_trailing_metadata not yet started by application; start it
@@ -1148,10 +1348,13 @@ void RetryFilter::CallData::CallAttempt::BatchData::RecvInitialMetadataReady(
  }
  // Received valid initial metadata, so commit the call.
  calld->RetryCommit(call_attempt);
+ // If retry state is no longer needed, switch to fast path for
+ // subsequent batches.
+ call_attempt->MaybeSwitchToFastPath();
  }
  // Invoke the callback to return the result to the surface.
  // Manually invoking a callback function; it does not take ownership of error.
- InvokeRecvInitialMetadataCallback(batch_data, error);
+ InvokeRecvInitialMetadataCallback(batch_data.release(), error);
  }
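The switch from a raw CallAttempt::BatchData* to RefCountedPtr<BatchData> at the top of RecvInitialMetadataReady (and in the other callbacks below) is the adopt-then-release idiom: constructing the smart pointer from the void* arg takes ownership of the ref that was held for the in-flight closure, and .release() hands that same ref to the next C-style callback instead of dropping it. A minimal intrusive-refcount model of the idiom; RefCounted and RefCountedPtr here are simplified stand-ins for the grpc_core templates, not their real definitions.

    #include <atomic>
    #include <cstdio>
    #include <utility>

    class RefCounted {
     public:
      void Ref() { refs_.fetch_add(1); }
      void Unref() {
        if (refs_.fetch_sub(1) == 1) delete this;
      }

     protected:
      virtual ~RefCounted() = default;

     private:
      std::atomic<int> refs_{1};  // starts with one ref, like BatchData
    };

    template <typename T>
    class RefCountedPtr {
     public:
      explicit RefCountedPtr(T* p) : p_(p) {}  // adopts the caller's ref
      RefCountedPtr(RefCountedPtr&& o) : p_(std::exchange(o.p_, nullptr)) {}
      ~RefCountedPtr() {
        if (p_ != nullptr) p_->Unref();
      }
      T* release() { return std::exchange(p_, nullptr); }  // hand the ref onward
      T* operator->() const { return p_; }

     private:
      T* p_;
    };

    struct BatchData : RefCounted {
      ~BatchData() override { std::puts("batch destroyed"); }
    };

    void NextCallback(void* arg) {
      RefCountedPtr<BatchData> batch(static_cast<BatchData*>(arg));  // adopt again
    }  // ref dropped here, object freed

    void Callback(void* arg) {
      RefCountedPtr<BatchData> batch(static_cast<BatchData*>(arg));  // adopt
      // ... use batch; if it must outlive us, pass the ref along:
      NextCallback(batch.release());
    }

    int main() { Callback(new BatchData); }

This is why the early-return paths above need no explicit Unref: letting batch_data go out of scope drops the ref, while the deferral paths std::move() it into the attempt and the hand-off path release()s it.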
 
  //
@@ -1160,8 +1363,7 @@ void RetryFilter::CallData::CallAttempt::BatchData::RecvInitialMetadataReady(
 
  void RetryFilter::CallData::CallAttempt::BatchData::InvokeRecvMessageCallback(
  void* arg, grpc_error_handle error) {
- CallAttempt::BatchData* batch_data =
- static_cast<CallAttempt::BatchData*>(arg);
+ auto* batch_data = static_cast<BatchData*>(arg);
  CallAttempt* call_attempt = batch_data->call_attempt_.get();
  CallData* calld = call_attempt->calld_;
  // Find pending op.
@@ -1189,22 +1391,26 @@ void RetryFilter::CallData::CallAttempt::BatchData::InvokeRecvMessageCallback(
 
  void RetryFilter::CallData::CallAttempt::BatchData::RecvMessageReady(
  void* arg, grpc_error_handle error) {
- CallAttempt::BatchData* batch_data =
- static_cast<CallAttempt::BatchData*>(arg);
+ RefCountedPtr<BatchData> batch_data(static_cast<BatchData*>(arg));
  CallAttempt* call_attempt = batch_data->call_attempt_.get();
  CallData* calld = call_attempt->calld_;
  if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
- gpr_log(GPR_INFO, "chand=%p calld=%p: got recv_message_ready, error=%s",
- calld->chand_, calld, grpc_error_std_string(error).c_str());
+ gpr_log(GPR_INFO,
+ "chand=%p calld=%p attempt=%p: got recv_message_ready, error=%s",
+ calld->chand_, calld, call_attempt,
+ grpc_error_std_string(error).c_str());
  }
  ++call_attempt->completed_recv_message_count_;
- // If a retry was already dispatched, then we're not going to use the
+ // If this attempt has been cancelled, then we're not going to use the
  // result of this recv_message op, so do nothing.
- if (call_attempt->retry_dispatched_) {
+ if (call_attempt->cancelled_) {
  GRPC_CALL_COMBINER_STOP(calld->call_combiner_,
- "recv_message_ready after retry dispatched");
+ "recv_message_ready after cancellation");
  return;
  }
+ // Cancel per-attempt recv timer, if any.
+ call_attempt->MaybeCancelPerAttemptRecvTimer();
+ // If we're not committed, check the response to see if we need to commit.
  if (!calld->retry_committed_) {
  // If we got an error or the payload was nullptr and we have not yet gotten
  // the recv_trailing_metadata_ready callback, then defer propagating this
@@ -1215,11 +1421,11 @@ void RetryFilter::CallData::CallAttempt::BatchData::RecvMessageReady(
  !call_attempt->completed_recv_trailing_metadata_)) {
  if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
  gpr_log(GPR_INFO,
- "chand=%p calld=%p: deferring recv_message_ready (nullptr "
- "message and recv_trailing_metadata pending)",
- calld->chand_, calld);
+ "chand=%p calld=%p attempt=%p: deferring recv_message_ready "
+ "(nullptr message and recv_trailing_metadata pending)",
+ calld->chand_, calld, call_attempt);
  }
- call_attempt->recv_message_ready_deferred_batch_ = batch_data;
+ call_attempt->recv_message_ready_deferred_batch_ = std::move(batch_data);
  call_attempt->recv_message_error_ = GRPC_ERROR_REF(error);
  if (!call_attempt->started_recv_trailing_metadata_) {
  // recv_trailing_metadata not yet started by application; start it
@@ -1233,10 +1439,13 @@ void RetryFilter::CallData::CallAttempt::BatchData::RecvMessageReady(
  }
  // Received a valid message, so commit the call.
  calld->RetryCommit(call_attempt);
+ // If retry state is no longer needed, switch to fast path for
+ // subsequent batches.
+ call_attempt->MaybeSwitchToFastPath();
  }
  // Invoke the callback to return the result to the surface.
  // Manually invoking a callback function; it does not take ownership of error.
- InvokeRecvMessageCallback(batch_data, error);
+ InvokeRecvMessageCallback(batch_data.release(), error);
  }
 
  //
@@ -1285,7 +1494,7 @@ void RetryFilter::CallData::CallAttempt::BatchData::
  // If we generated the recv_trailing_metadata op internally via
  // StartInternalRecvTrailingMetadata(), then there will be no pending batch.
  if (pending == nullptr) {
- GRPC_ERROR_UNREF(error);
+ call_attempt_->recv_trailing_metadata_error_ = error;
  return;
  }
  // Return metadata.
@@ -1303,7 +1512,8 @@ void RetryFilter::CallData::CallAttempt::BatchData::
  }
 
  void RetryFilter::CallData::CallAttempt::BatchData::
- AddClosuresForDeferredRecvCallbacks(CallCombinerClosureList* closures) {
+ AddClosuresForDeferredCompletionCallbacks(
+ CallCombinerClosureList* closures) {
  if (batch_.recv_trailing_metadata) {
  // Add closure for deferred recv_initial_metadata_ready.
  if (GPR_UNLIKELY(
@@ -1312,24 +1522,28 @@ void RetryFilter::CallData::CallAttempt::BatchData::
  GRPC_CLOSURE_INIT(
  &call_attempt_->recv_initial_metadata_ready_,
  InvokeRecvInitialMetadataCallback,
- call_attempt_->recv_initial_metadata_ready_deferred_batch_,
+ call_attempt_->recv_initial_metadata_ready_deferred_batch_.release(),
  grpc_schedule_on_exec_ctx);
  closures->Add(&call_attempt_->recv_initial_metadata_ready_,
  call_attempt_->recv_initial_metadata_error_,
  "resuming recv_initial_metadata_ready");
- call_attempt_->recv_initial_metadata_ready_deferred_batch_ = nullptr;
  }
  // Add closure for deferred recv_message_ready.
  if (GPR_UNLIKELY(call_attempt_->recv_message_ready_deferred_batch_ !=
  nullptr)) {
- GRPC_CLOSURE_INIT(&call_attempt_->recv_message_ready_,
- InvokeRecvMessageCallback,
- call_attempt_->recv_message_ready_deferred_batch_,
- grpc_schedule_on_exec_ctx);
+ GRPC_CLOSURE_INIT(
+ &call_attempt_->recv_message_ready_, InvokeRecvMessageCallback,
+ call_attempt_->recv_message_ready_deferred_batch_.release(),
+ grpc_schedule_on_exec_ctx);
  closures->Add(&call_attempt_->recv_message_ready_,
  call_attempt_->recv_message_error_,
  "resuming recv_message_ready");
- call_attempt_->recv_message_ready_deferred_batch_ = nullptr;
+ }
+ // Add closure for deferred on_complete.
+ if (GPR_UNLIKELY(call_attempt_->on_complete_deferred_batch_ != nullptr)) {
+ closures->Add(&call_attempt_->on_complete_deferred_batch_->on_complete_,
+ call_attempt_->on_complete_error_, "resuming on_complete");
+ call_attempt_->on_complete_deferred_batch_.release();
  }
  }
  }
@@ -1343,9 +1557,9 @@ void RetryFilter::CallData::CallAttempt::BatchData::
  if (call_attempt_->PendingBatchIsUnstarted(pending)) {
  if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
  gpr_log(GPR_INFO,
- "chand=%p calld=%p: failing unstarted pending batch at "
- "index %" PRIuPTR,
- calld->chand_, calld, i);
+ "chand=%p calld=%p attempt=%p: failing unstarted pending "
+ "batch at index %" PRIuPTR,
+ calld->chand_, calld, call_attempt_.get(), i);
  }
  closures->Add(pending->batch->on_complete, GRPC_ERROR_REF(error),
  "failing on_complete for pending batch");
@@ -1362,31 +1576,38 @@ void RetryFilter::CallData::CallAttempt::BatchData::RunClosuresForCompletedCall(
  CallCombinerClosureList closures;
  // First, add closure for recv_trailing_metadata_ready.
  AddClosureForRecvTrailingMetadataReady(GRPC_ERROR_REF(error), &closures);
- // If there are deferred recv_initial_metadata_ready or recv_message_ready
- // callbacks, add them to closures.
- AddClosuresForDeferredRecvCallbacks(&closures);
+ // If there are deferred batch completion callbacks, add them to closures.
+ AddClosuresForDeferredCompletionCallbacks(&closures);
  // Add closures to fail any pending batches that have not yet been started.
  AddClosuresToFailUnstartedPendingBatches(GRPC_ERROR_REF(error), &closures);
  // Schedule all of the closures identified above.
  // Note: This will release the call combiner.
  closures.RunClosures(call_attempt_->calld_->call_combiner_);
- // Don't need batch_data anymore.
- Unref();
  GRPC_ERROR_UNREF(error);
  }
 
  void RetryFilter::CallData::CallAttempt::BatchData::RecvTrailingMetadataReady(
  void* arg, grpc_error_handle error) {
- CallAttempt::BatchData* batch_data =
- static_cast<CallAttempt::BatchData*>(arg);
+ RefCountedPtr<BatchData> batch_data(static_cast<BatchData*>(arg));
  CallAttempt* call_attempt = batch_data->call_attempt_.get();
  CallData* calld = call_attempt->calld_;
  if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
  gpr_log(GPR_INFO,
- "chand=%p calld=%p: got recv_trailing_metadata_ready, error=%s",
- calld->chand_, calld, grpc_error_std_string(error).c_str());
+ "chand=%p calld=%p attempt=%p: got recv_trailing_metadata_ready, "
+ "error=%s",
+ calld->chand_, calld, call_attempt,
+ grpc_error_std_string(error).c_str());
  }
  call_attempt->completed_recv_trailing_metadata_ = true;
+ // If this attempt has been cancelled, then we're not going to use the
+ // result of this recv_trailing_metadata op, so do nothing.
+ if (call_attempt->cancelled_) {
+ GRPC_CALL_COMBINER_STOP(calld->call_combiner_,
+ "recv_trailing_metadata_ready after cancellation");
+ return;
+ }
+ // Cancel per-attempt recv timer, if any.
+ call_attempt->MaybeCancelPerAttemptRecvTimer();
  // Get the call's status and check for server pushback metadata.
  grpc_status_code status = GRPC_STATUS_OK;
  grpc_mdelem* server_pushback_md = nullptr;
@@ -1397,26 +1618,29 @@ void RetryFilter::CallData::CallAttempt::BatchData::RecvTrailingMetadataReady(
  &server_pushback_md, &is_lb_drop);
  if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
  gpr_log(
- GPR_INFO, "chand=%p calld=%p: call finished, status=%s is_lb_drop=%d",
- calld->chand_, calld, grpc_status_code_to_string(status), is_lb_drop);
+ GPR_INFO,
+ "chand=%p calld=%p attempt=%p: call finished, status=%s is_lb_drop=%d",
+ calld->chand_, calld, call_attempt, grpc_status_code_to_string(status),
+ is_lb_drop);
  }
  // Check if we should retry.
- if (batch_data->MaybeRetry(status, server_pushback_md, is_lb_drop)) {
- // Unref batch_data for deferred recv_initial_metadata_ready or
- // recv_message_ready callbacks, if any.
- if (call_attempt->recv_initial_metadata_ready_deferred_batch_ != nullptr) {
- GRPC_ERROR_UNREF(call_attempt->recv_initial_metadata_error_);
- batch_data->Unref();
- }
- if (call_attempt->recv_message_ready_deferred_batch_ != nullptr) {
- GRPC_ERROR_UNREF(call_attempt->recv_message_error_);
- batch_data->Unref();
- }
- batch_data->Unref();
+ grpc_millis server_pushback_ms = -1;
+ if (call_attempt->ShouldRetry(status, is_lb_drop, server_pushback_md,
+ &server_pushback_ms)) {
+ // Start retry timer.
+ calld->StartRetryTimer(server_pushback_ms);
+ // Cancel call attempt.
+ CallCombinerClosureList closures;
+ call_attempt->Cancel(&closures);
+ // Yields call combiner.
+ closures.RunClosures(calld->call_combiner_);
  return;
  }
  // Not retrying, so commit the call.
  calld->RetryCommit(call_attempt);
+ // If retry state is no longer needed, switch to fast path for
+ // subsequent batches.
+ call_attempt->MaybeSwitchToFastPath();
  // Run any necessary closures.
  batch_data->RunClosuresForCompletedCall(GRPC_ERROR_REF(error));
  }
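A recurring shape in this hunk: callbacks never invoke surface closures directly, they accumulate them in a CallCombinerClosureList and then run the whole list, yielding the call combiner exactly once. A simplified model of that batch-then-run pattern; Closure, ClosureList, and CallCombiner here are toy stand-ins for the grpc_core types:

    #include <cstdio>
    #include <functional>
    #include <utility>
    #include <vector>

    using Closure = std::function<void()>;

    // Stand-in for grpc's CallCombiner: all closures for one call run
    // serialized, one at a time.
    class CallCombiner {
     public:
      void Run(Closure c) { c(); }  // real code queues and serializes
    };

    // Collect closures while holding the combiner, then schedule them
    // together, mirroring how CallCombinerClosureList::RunClosures releases
    // the combiner exactly once no matter how many closures were queued.
    class ClosureList {
     public:
      void Add(Closure c, const char* reason) {
        std::printf("queued: %s\n", reason);
        closures_.push_back(std::move(c));
      }
      void RunClosures(CallCombiner* combiner) {
        for (auto& c : closures_) combiner->Run(std::move(c));
        closures_.clear();
      }

     private:
      std::vector<Closure> closures_;
    };

    int main() {
      CallCombiner combiner;
      ClosureList closures;
      closures.Add([] { std::puts("recv_trailing_metadata_ready"); },
                   "return trailing metadata");
      closures.Add([] { std::puts("fail unstarted pending batch"); },
                   "failing on_complete for pending batch");
      closures.RunClosures(&combiner);
    }

RecvTrailingMetadataReady above uses exactly this shape on its retry path: StartRetryTimer() first, then Cancel(&closures), then a single RunClosures() that releases the combiner.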
@@ -1454,31 +1678,27 @@ void RetryFilter::CallData::CallAttempt::BatchData::
  void RetryFilter::CallData::CallAttempt::BatchData::
  AddClosuresForReplayOrPendingSendOps(CallCombinerClosureList* closures) {
  auto* calld = call_attempt_->calld_;
+ bool have_pending_send_ops = call_attempt_->HaveSendOpsToReplay();
  // We don't check send_initial_metadata here, because that op will always
  // be started as soon as it is received from the surface, so it will
  // never need to be started at this point.
- bool have_pending_send_message_ops =
- call_attempt_->started_send_message_count_ < calld->send_messages_.size();
- bool have_pending_send_trailing_metadata_op =
- calld->seen_send_trailing_metadata_ &&
- !call_attempt_->started_send_trailing_metadata_;
- if (!have_pending_send_message_ops &&
- !have_pending_send_trailing_metadata_op) {
+ if (!have_pending_send_ops) {
  for (size_t i = 0; i < GPR_ARRAY_SIZE(calld->pending_batches_); ++i) {
  PendingBatch* pending = &calld->pending_batches_[i];
  grpc_transport_stream_op_batch* batch = pending->batch;
  if (batch == nullptr || pending->send_ops_cached) continue;
- if (batch->send_message) have_pending_send_message_ops = true;
- if (batch->send_trailing_metadata) {
- have_pending_send_trailing_metadata_op = true;
+ if (batch->send_message || batch->send_trailing_metadata) {
+ have_pending_send_ops = true;
+ break;
  }
  }
  }
- if (have_pending_send_message_ops || have_pending_send_trailing_metadata_op) {
+ if (have_pending_send_ops) {
  if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
  gpr_log(GPR_INFO,
- "chand=%p calld=%p: starting next batch for pending send op(s)",
- calld->chand_, calld);
+ "chand=%p calld=%p attempt=%p: starting next batch for pending "
+ "send op(s)",
+ calld->chand_, calld, call_attempt_.get());
  }
  call_attempt_->AddRetriableBatches(closures);
  }
@@ -1486,15 +1706,46 @@ void RetryFilter::CallData::CallAttempt::BatchData::
 
  void RetryFilter::CallData::CallAttempt::BatchData::OnComplete(
  void* arg, grpc_error_handle error) {
- CallAttempt::BatchData* batch_data =
- static_cast<CallAttempt::BatchData*>(arg);
+ RefCountedPtr<BatchData> batch_data(static_cast<BatchData*>(arg));
  CallAttempt* call_attempt = batch_data->call_attempt_.get();
  CallData* calld = call_attempt->calld_;
  if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
- gpr_log(GPR_INFO, "chand=%p calld=%p: got on_complete, error=%s, batch=%s",
- calld->chand_, calld, grpc_error_std_string(error).c_str(),
+ gpr_log(GPR_INFO,
+ "chand=%p calld=%p attempt=%p: got on_complete, error=%s, batch=%s",
+ calld->chand_, calld, call_attempt,
+ grpc_error_std_string(error).c_str(),
  grpc_transport_stream_op_batch_string(&batch_data->batch_).c_str());
  }
+ // If this attempt has been cancelled, then we're not going to propagate
+ // the completion of this batch, so do nothing.
+ if (call_attempt->cancelled_) {
+ GRPC_CALL_COMBINER_STOP(calld->call_combiner_,
+ "on_complete after cancellation");
+ return;
+ }
+ // If we got an error and have not yet gotten the
+ // recv_trailing_metadata_ready callback, then defer propagating this
+ // callback back to the surface. We can evaluate whether to retry when
+ // recv_trailing_metadata comes back.
+ if (GPR_UNLIKELY(!calld->retry_committed_ && error != GRPC_ERROR_NONE &&
+ !call_attempt->completed_recv_trailing_metadata_)) {
+ if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
+ gpr_log(GPR_INFO, "chand=%p calld=%p attempt=%p: deferring on_complete",
+ calld->chand_, calld, call_attempt);
+ }
+ call_attempt->on_complete_deferred_batch_ = std::move(batch_data);
+ call_attempt->on_complete_error_ = GRPC_ERROR_REF(error);
+ if (!call_attempt->started_recv_trailing_metadata_) {
+ // recv_trailing_metadata not yet started by application; start it
+ // ourselves to get status.
+ call_attempt->StartInternalRecvTrailingMetadata();
+ } else {
+ GRPC_CALL_COMBINER_STOP(
+ calld->call_combiner_,
+ "on_complete failure before recv_trailing_metadata_ready");
+ }
+ return;
+ }
  // Update bookkeeping in call_attempt.
  if (batch_data->batch_.send_initial_metadata) {
  call_attempt->completed_send_initial_metadata_ = true;
@@ -1512,33 +1763,21 @@ void RetryFilter::CallData::CallAttempt::BatchData::OnComplete(
  }
  // Construct list of closures to execute.
  CallCombinerClosureList closures;
- // If a retry was already dispatched, that means we saw
- // recv_trailing_metadata before this, so we do nothing here.
- // Otherwise, invoke the callback to return the result to the surface.
- if (!call_attempt->retry_dispatched_) {
- // Add closure for the completed pending batch, if any.
- batch_data->AddClosuresForCompletedPendingBatch(GRPC_ERROR_REF(error),
- &closures);
- // If needed, add a callback to start any replay or pending send ops on
- // the LB call.
- if (!call_attempt->completed_recv_trailing_metadata_) {
- batch_data->AddClosuresForReplayOrPendingSendOps(&closures);
- }
- }
- // Track number of in-flight send batches and determine if this was the
- // last one.
- --calld->num_in_flight_call_attempt_send_batches_;
- const bool last_send_batch_complete =
- calld->num_in_flight_call_attempt_send_batches_ == 0;
- // Don't need batch_data anymore.
- batch_data->Unref();
+ // Add closure for the completed pending batch, if any.
+ batch_data->AddClosuresForCompletedPendingBatch(GRPC_ERROR_REF(error),
+ &closures);
+ // If needed, add a callback to start any replay or pending send ops on
+ // the LB call.
+ if (!call_attempt->completed_recv_trailing_metadata_) {
+ batch_data->AddClosuresForReplayOrPendingSendOps(&closures);
+ }
+ // If retry state is no longer needed (i.e., we're committed and there
+ // are no more send ops to replay), switch to fast path for subsequent
+ // batches.
+ call_attempt->MaybeSwitchToFastPath();
  // Schedule all of the closures identified above.
  // Note: This yields the call combiner.
  closures.RunClosures(calld->call_combiner_);
- // If this was the last in-flight send batch, unref the call stack.
- if (last_send_batch_complete) {
- GRPC_CALL_STACK_UNREF(calld->owning_call_, "retriable_send_batches");
- }
  }
 
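The new deferral branch in OnComplete mirrors what RecvInitialMetadataReady and RecvMessageReady already do for failed recv ops: park the completion and its error on the attempt until recv_trailing_metadata_ready decides between retrying (the parked completion dies with the attempt) and committing (AddClosuresForDeferredCompletionCallbacks resumes it). A small self-contained model of that state machine, with std::function standing in for the parked batch and closure:

    #include <cstdio>
    #include <functional>
    #include <optional>
    #include <utility>

    // Models the deferral above: a failed op's completion is parked until
    // recv_trailing_metadata decides between retrying (parked completion is
    // dropped with the attempt) and committing (parked completion resumes).
    struct Attempt {
      std::optional<std::function<void()>> deferred_on_complete;

      void OnComplete(bool ok, bool have_trailing_metadata,
                      std::function<void()> surface_cb) {
        if (!ok && !have_trailing_metadata) {
          std::puts("deferring on_complete");
          deferred_on_complete = std::move(surface_cb);
          return;  // wait for recv_trailing_metadata_ready
        }
        surface_cb();
      }

      void OnTrailingMetadata(bool will_retry) {
        if (will_retry) {
          deferred_on_complete.reset();  // attempt abandoned; never surfaces
        } else if (deferred_on_complete.has_value()) {
          (*deferred_on_complete)();  // "resuming on_complete"
          deferred_on_complete.reset();
        }
      }
    };

    int main() {
      Attempt attempt;
      attempt.OnComplete(/*ok=*/false, /*have_trailing_metadata=*/false,
                         [] { std::puts("surface sees on_complete"); });
      attempt.OnTrailingMetadata(/*will_retry=*/false);  // commit: resume
    }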
  //
@@ -1598,9 +1837,12 @@ void RetryFilter::CallData::CallAttempt::BatchData::
  AddRetriableSendMessageOp() {
  auto* calld = call_attempt_->calld_;
  if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
- gpr_log(GPR_INFO,
- "chand=%p calld=%p: starting calld->send_messages[%" PRIuPTR "]",
- calld->chand_, calld, call_attempt_->started_send_message_count_);
+ gpr_log(
+ GPR_INFO,
+ "chand=%p calld=%p attempt=%p: starting calld->send_messages[%" PRIuPTR
+ "]",
+ calld->chand_, calld, call_attempt_.get(),
+ call_attempt_->started_send_message_count_);
  }
  ByteStreamCache* cache =
  calld->send_messages_[call_attempt_->started_send_message_count_];
@@ -1650,6 +1892,7 @@ void RetryFilter::CallData::CallAttempt::BatchData::
  ++call_attempt_->started_recv_message_count_;
  batch_.recv_message = true;
  batch_.payload->recv_message.recv_message = &call_attempt_->recv_message_;
+ batch_.payload->recv_message.call_failed_before_recv_message = nullptr;
  GRPC_CLOSURE_INIT(&call_attempt_->recv_message_ready_, RecvMessageReady, this,
  grpc_schedule_on_exec_ctx);
  batch_.payload->recv_message.recv_message_ready =
@@ -1671,6 +1914,12 @@ void RetryFilter::CallData::CallAttempt::BatchData::
  &call_attempt_->recv_trailing_metadata_ready_;
  }
 
+ void RetryFilter::CallData::CallAttempt::BatchData::AddCancelStreamOp() {
+ batch_.cancel_stream = true;
+ batch_.payload->cancel_stream.cancel_error =
+ GRPC_ERROR_CREATE_FROM_STATIC_STRING("retry attempt abandoned");
+ }
+
1923
  //
1675
1924
  // CallData vtable functions
1676
1925
  //
@@ -1680,7 +1929,8 @@ grpc_error_handle RetryFilter::CallData::Init(
1680
1929
  auto* chand = static_cast<RetryFilter*>(elem->channel_data);
1681
1930
  new (elem->call_data) CallData(chand, *args);
1682
1931
  if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
1683
- gpr_log(GPR_INFO, "chand=%p: created call=%p", chand, elem->call_data);
1932
+ gpr_log(GPR_INFO, "chand=%p calld=%p: created call", chand,
1933
+ elem->call_data);
1684
1934
  }
1685
1935
  return GRPC_ERROR_NONE;
1686
1936
  }
@@ -1758,7 +2008,7 @@ RetryFilter::CallData::CallData(RetryFilter* chand,
1758
2008
  pending_send_message_(false),
1759
2009
  pending_send_trailing_metadata_(false),
1760
2010
  retry_committed_(false),
1761
- last_attempt_got_server_pushback_(false) {}
2011
+ retry_timer_pending_(false) {}
1762
2012
 
1763
2013
  RetryFilter::CallData::~CallData() {
1764
2014
  grpc_slice_unref_internal(path_);
@@ -1788,10 +2038,26 @@ void RetryFilter::CallData::StartTransportStreamOpBatch(
1788
2038
  // will not be retried, because we have committed it here.
1789
2039
  if (call_attempt_ != nullptr) {
1790
2040
  RetryCommit(call_attempt_.get());
2041
+ // TODO(roth): When implementing hedging, this will get more
2042
+ // complex, because instead of just passing the batch down to a
2043
+ // single call attempt, we'll need to cancel multiple call
2044
+ // attempts and wait for the cancellation on_complete from each call
2045
+ // attempt before we propagate the on_complete from this batch
2046
+ // back to the surface.
1791
2047
  // Note: This will release the call combiner.
1792
- call_attempt_->lb_call()->StartTransportStreamOpBatch(batch);
2048
+ call_attempt_->CancelFromSurface(batch);
1793
2049
  return;
1794
2050
  }
2051
+ // Cancel retry timer.
2052
+ if (retry_timer_pending_) {
2053
+ if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
2054
+ gpr_log(GPR_INFO, "chand=%p calld=%p: cancelling retry timer", chand_,
2055
+ this);
2056
+ }
2057
+ retry_timer_pending_ = false; // Lame timer callback.
2058
+ grpc_timer_cancel(&retry_timer_);
2059
+ FreeAllCachedSendOpData();
2060
+ }
1795
2061
  // Fail pending batches.
1796
2062
  PendingBatchesFail(GRPC_ERROR_REF(cancel_error));
1797
2063
  // Note: This will release the call combiner.
@@ -1801,13 +2067,33 @@ void RetryFilter::CallData::StartTransportStreamOpBatch(
1801
2067
  }
1802
2068
  // Add the batch to the pending list.
1803
2069
  PendingBatch* pending = PendingBatchesAdd(batch);
2070
+ // If the timer is pending, yield the call combiner and wait for it to
2071
+ // run, since we don't want to start another call attempt until it does.
2072
+ if (retry_timer_pending_) {
2073
+ GRPC_CALL_COMBINER_STOP(call_combiner_,
2074
+ "added pending batch while retry timer pending");
2075
+ return;
2076
+ }
2077
+ // If we do not yet have a call attempt, create one.
1804
2078
  if (call_attempt_ == nullptr) {
1805
2079
  // If this is the first batch and retries are already committed
1806
2080
  // (e.g., if this batch put the call above the buffer size limit), then
1807
2081
  // immediately create an LB call and delegate the batch to it. This
1808
2082
  // avoids the overhead of unnecessarily allocating a CallAttempt
1809
2083
  // object or caching any of the send op data.
1810
- if (num_attempts_completed_ == 0 && retry_committed_) {
2084
+ // Note that we would ideally like to do this also on subsequent
2085
+ // attempts (e.g., if a batch puts the call above the buffer size
2086
+ // limit since the last attempt was complete), but in practice that's
2087
+ // not really worthwhile, because we will almost always have cached and
2088
+ // completed at least the send_initial_metadata op on the previous
2089
+ // attempt, which means that we'd need special logic to replay the
2090
+ // batch anyway, which is exactly what the CallAttempt object provides.
2091
+ // We also skip this optimization if perAttemptRecvTimeout is set in the
2092
+ // retry policy, because we need the code in CallAttempt to handle
2093
+ // the associated timer.
2094
+ if (num_attempts_completed_ == 0 && retry_committed_ &&
2095
+ (retry_policy_ == nullptr ||
2096
+ !retry_policy_->per_attempt_recv_timeout().has_value())) {
1811
2097
  if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
1812
2098
  gpr_log(GPR_INFO,
1813
2099
  "chand=%p calld=%p: retry committed before first attempt; "
@@ -1819,7 +2105,9 @@ void RetryFilter::CallData::StartTransportStreamOpBatch(
1819
2105
  committed_call_->StartTransportStreamOpBatch(batch);
1820
2106
  return;
1821
2107
  }
1822
- // We do not yet have a call attempt, so create one.
2108
+ // Otherwise, create a call attempt.
2109
+ // The attempt will automatically start any necessary replays or
2110
+ // pending batches.
1823
2111
  if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
1824
2112
  gpr_log(GPR_INFO, "chand=%p calld=%p: creating call attempt", chand_,
1825
2113
  this);
@@ -1829,9 +2117,8 @@ void RetryFilter::CallData::StartTransportStreamOpBatch(
1829
2117
  }
1830
2118
  // Send batches to call attempt.
1831
2119
  if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
1832
- gpr_log(GPR_INFO,
1833
- "chand=%p calld=%p: starting batch on attempt=%p lb_call=%p",
1834
- chand_, this, call_attempt_.get(), call_attempt_->lb_call());
2120
+ gpr_log(GPR_INFO, "chand=%p calld=%p: starting batch on attempt=%p", chand_,
2121
+ this, call_attempt_.get());
1835
2122
  }
1836
2123
  call_attempt_->StartRetriableBatches();
1837
2124
  }
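Condensing the fast-path condition above into one predicate makes its three requirements easier to see; this is just a restatement of the code with illustrative names, not the grpc_core types:

    #include <optional>

    struct RetryPolicy {
      std::optional<int> per_attempt_recv_timeout_ms;
    };

    // Delegate straight to the committed LB call only when no retry state
    // could ever be needed for this call.
    bool CanSkipCallAttempt(int num_attempts_completed, bool retry_committed,
                            const RetryPolicy* policy) {
      return num_attempts_completed == 0 &&  // nothing to replay from earlier tries
             retry_committed &&              // no future retries possible
             (policy == nullptr ||           // and no per-attempt timer to manage
              !policy->per_attempt_recv_timeout_ms.has_value());
    }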
@@ -1849,36 +2136,8 @@ RetryFilter::CallData::CreateLoadBalancedCall() {
  }
 
  void RetryFilter::CallData::CreateCallAttempt() {
- call_attempt_.reset(arena_->New<CallAttempt>(this));
+ call_attempt_ = MakeRefCounted<CallAttempt>(this);
  call_attempt_->StartRetriableBatches();
- // TODO(roth): When implementing hedging, change this to start a timer
- // for the next hedging attempt.
- }
-
- namespace {
-
- void StartBatchInCallCombiner(void* arg, grpc_error_handle /*ignored*/) {
- grpc_transport_stream_op_batch* batch =
- static_cast<grpc_transport_stream_op_batch*>(arg);
- auto* lb_call = static_cast<ClientChannel::LoadBalancedCall*>(
- batch->handler_private.extra_arg);
- // Note: This will release the call combiner.
- lb_call->StartTransportStreamOpBatch(batch);
- }
-
- } // namespace
-
- void RetryFilter::CallData::AddClosureForBatch(
- grpc_transport_stream_op_batch* batch, CallCombinerClosureList* closures) {
- batch->handler_private.extra_arg = call_attempt_->lb_call();
- GRPC_CLOSURE_INIT(&batch->handler_private.closure, StartBatchInCallCombiner,
- batch, grpc_schedule_on_exec_ctx);
- if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
- gpr_log(GPR_INFO, "chand=%p calld=%p: starting batch on LB call: %s",
- chand_, this, grpc_transport_stream_op_batch_string(batch).c_str());
- }
- closures->Add(&batch->handler_private.closure, GRPC_ERROR_NONE,
- "start_batch_on_lb_call");
  }
 
  //
@@ -1943,7 +2202,7 @@ void RetryFilter::CallData::FreeCachedSendMessage(size_t idx) {
 
  void RetryFilter::CallData::FreeCachedSendTrailingMetadata() {
  if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
- gpr_log(GPR_INFO, "chand_=%p calld=%p: destroying send_trailing_metadata",
+ gpr_log(GPR_INFO, "chand=%p calld=%p: destroying send_trailing_metadata",
  chand_, this);
  }
  grpc_metadata_batch_destroy(&send_trailing_metadata_);
@@ -1982,7 +2241,7 @@ RetryFilter::CallData::PendingBatch* RetryFilter::CallData::PendingBatchesAdd(
  const size_t idx = GetBatchIndex(batch);
  if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
  gpr_log(GPR_INFO,
- "chand_=%p calld=%p: adding pending batch at index %" PRIuPTR,
+ "chand=%p calld=%p: adding pending batch at index %" PRIuPTR,
  chand_, this, idx);
  }
  PendingBatch* pending = &pending_batches_[idx];
@@ -2006,6 +2265,9 @@ RetryFilter::CallData::PendingBatch* RetryFilter::CallData::PendingBatchesAdd(
  if (batch->send_trailing_metadata) {
  pending_send_trailing_metadata_ = true;
  }
+ // TODO(roth): When we implement hedging, if there are currently attempts
+ // in flight, we will need to pick the one on which the max number of send
+ // ops have already been sent, and we commit to that attempt.
  if (GPR_UNLIKELY(bytes_buffered_for_retry_ >
  chand_->per_rpc_retry_buffer_size_)) {
  if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
@@ -2126,18 +2388,15 @@ void RetryFilter::CallData::RetryCommit(CallAttempt* call_attempt) {
  }
  }
 
- void RetryFilter::CallData::DoRetry(grpc_millis server_pushback_ms) {
+ void RetryFilter::CallData::StartRetryTimer(grpc_millis server_pushback_ms) {
  // Reset call attempt.
- call_attempt_.reset();
+ call_attempt_.reset(DEBUG_LOCATION, "StartRetryTimer");
  // Compute backoff delay.
  grpc_millis next_attempt_time;
  if (server_pushback_ms >= 0) {
  next_attempt_time = ExecCtx::Get()->Now() + server_pushback_ms;
- last_attempt_got_server_pushback_ = true;
+ retry_backoff_.Reset();
  } else {
- if (num_attempts_completed_ == 1 || last_attempt_got_server_pushback_) {
- last_attempt_got_server_pushback_ = false;
- }
  next_attempt_time = retry_backoff_.NextAttemptTime();
  }
  if (GRPC_TRACE_FLAG_ENABLED(grpc_retry_trace)) {
@@ -2148,23 +2407,25 @@ void RetryFilter::CallData::DoRetry(grpc_millis server_pushback_ms) {
  // Schedule retry after computed delay.
  GRPC_CLOSURE_INIT(&retry_closure_, OnRetryTimer, this, nullptr);
  GRPC_CALL_STACK_REF(owning_call_, "OnRetryTimer");
- MutexLock lock(&timer_mu_);
- canceller_ = new Canceller(this);
+ retry_timer_pending_ = true;
  grpc_timer_init(&retry_timer_, next_attempt_time, &retry_closure_);
  }
 
  void RetryFilter::CallData::OnRetryTimer(void* arg, grpc_error_handle error) {
  auto* calld = static_cast<CallData*>(arg);
- if (error == GRPC_ERROR_NONE) {
- bool start_attempt = false;
- {
- MutexLock lock(&calld->timer_mu_);
- if (calld->canceller_ != nullptr) {
- calld->canceller_ = nullptr;
- start_attempt = true;
- }
- }
- if (start_attempt) calld->CreateCallAttempt();
+ GRPC_CLOSURE_INIT(&calld->retry_closure_, OnRetryTimerLocked, calld, nullptr);
+ GRPC_CALL_COMBINER_START(calld->call_combiner_, &calld->retry_closure_,
+ GRPC_ERROR_REF(error), "retry timer fired");
+ }
+
+ void RetryFilter::CallData::OnRetryTimerLocked(void* arg,
+ grpc_error_handle error) {
+ auto* calld = static_cast<CallData*>(arg);
+ if (error == GRPC_ERROR_NONE && calld->retry_timer_pending_) {
+ calld->retry_timer_pending_ = false;
+ calld->CreateCallAttempt();
+ } else {
+ GRPC_CALL_COMBINER_STOP(calld->call_combiner_, "retry timer cancelled");
  }
  GRPC_CALL_STACK_UNREF(calld->owning_call_, "OnRetryTimer");
  }
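The old mutex-plus-Canceller dance is gone: OnRetryTimer now does no work on the timer thread, it just re-enters the call combiner, and OnRetryTimerLocked can then read and clear retry_timer_pending_ without any lock, because everything that touches the flag (including the cancellation path in StartTransportStreamOpBatch) runs under the combiner. A toy model of that hop, with a drained queue standing in for the combiner's serialization (names illustrative, not gRPC APIs):

    #include <cstdio>
    #include <functional>
    #include <queue>

    // Stand-in for the call combiner: closures execute one at a time, in order.
    class CallCombiner {
     public:
      void Start(std::function<void(bool)> cb, bool ok, const char* reason) {
        std::printf("combiner: %s\n", reason);
        queue_.push([cb, ok] { cb(ok); });
      }
      void Drain() {
        while (!queue_.empty()) {
          queue_.front()();
          queue_.pop();
        }
      }

     private:
      std::queue<std::function<void()>> queue_;
    };

    struct CallData {
      CallCombiner combiner;
      bool retry_timer_pending = true;

      // Timer thread: don't touch state here; hop into the combiner first.
      void OnRetryTimer(bool ok) {
        combiner.Start([this](bool cb_ok) { OnRetryTimerLocked(cb_ok); }, ok,
                       "retry timer fired");
      }

      // Runs serialized with everything else on the call, so the flag test
      // cannot race with the cancellation path clearing it.
      void OnRetryTimerLocked(bool ok) {
        if (ok && retry_timer_pending) {
          retry_timer_pending = false;
          std::puts("creating next call attempt");
        } else {
          std::puts("retry timer cancelled");
        }
      }
    };

    int main() {
      CallData calld;
      calld.OnRetryTimer(/*ok=*/true);
      calld.combiner.Drain();
    }

The "Lame timer callback." comment in StartTransportStreamOpBatch is the other half of this design: cancellation just clears the flag under the combiner, and whichever way the timer closure eventually runs, it sees pending == false and becomes a no-op.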