grpc 1.26.0 → 1.27.0.pre1

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of grpc might be problematic. Click here for more details.

Files changed (209) hide show
  1. checksums.yaml +4 -4
  2. data/Makefile +1654 -1519
  3. data/etc/roots.pem +44 -0
  4. data/include/grpc/grpc_security.h +37 -15
  5. data/include/grpc/grpc_security_constants.h +27 -0
  6. data/include/grpc/impl/codegen/grpc_types.h +14 -0
  7. data/include/grpc/impl/codegen/port_platform.h +1 -1
  8. data/src/core/ext/filters/client_channel/client_channel.cc +0 -20
  9. data/src/core/ext/filters/client_channel/http_proxy.cc +4 -4
  10. data/src/core/ext/filters/client_channel/lb_policy.cc +4 -3
  11. data/src/core/ext/filters/client_channel/lb_policy/grpclb/grpclb.cc +191 -201
  12. data/src/core/ext/filters/client_channel/lb_policy/grpclb/grpclb_balancer_addresses.cc +89 -0
  13. data/src/core/ext/filters/client_channel/lb_policy/grpclb/grpclb_balancer_addresses.h +40 -0
  14. data/src/core/ext/filters/client_channel/lb_policy/grpclb/grpclb_channel_secure.cc +3 -2
  15. data/src/core/ext/filters/client_channel/lb_policy/grpclb/load_balancer_api.cc +88 -121
  16. data/src/core/ext/filters/client_channel/lb_policy/grpclb/load_balancer_api.h +28 -57
  17. data/src/core/ext/filters/client_channel/lb_policy/subchannel_list.h +0 -7
  18. data/src/core/ext/filters/client_channel/lb_policy/xds/cds.cc +8 -9
  19. data/src/core/ext/filters/client_channel/lb_policy/xds/xds.cc +53 -34
  20. data/src/core/ext/filters/client_channel/resolver/dns/c_ares/dns_resolver_ares.cc +18 -5
  21. data/src/core/ext/filters/client_channel/resolver/dns/c_ares/grpc_ares_wrapper.cc +24 -19
  22. data/src/core/ext/filters/client_channel/resolver/dns/c_ares/grpc_ares_wrapper.h +2 -1
  23. data/src/core/ext/filters/client_channel/resolver/dns/c_ares/grpc_ares_wrapper_fallback.cc +4 -2
  24. data/src/core/ext/filters/client_channel/server_address.cc +6 -9
  25. data/src/core/ext/filters/client_channel/server_address.h +3 -10
  26. data/src/core/ext/filters/client_channel/xds/xds_api.cc +394 -150
  27. data/src/core/ext/filters/client_channel/xds/xds_api.h +75 -35
  28. data/src/core/ext/filters/client_channel/xds/xds_bootstrap.cc +59 -22
  29. data/src/core/ext/filters/client_channel/xds/xds_bootstrap.h +13 -9
  30. data/src/core/ext/filters/client_channel/xds/xds_channel_secure.cc +8 -6
  31. data/src/core/ext/filters/client_channel/xds/xds_client.cc +456 -175
  32. data/src/core/ext/filters/client_channel/xds/xds_client.h +33 -21
  33. data/src/core/ext/filters/client_channel/xds/xds_client_stats.cc +5 -8
  34. data/src/core/ext/filters/client_channel/xds/xds_client_stats.h +18 -24
  35. data/src/core/ext/transport/chttp2/transport/chttp2_transport.cc +2 -2
  36. data/src/core/ext/upb-generated/src/proto/grpc/lb/v1/load_balancer.upb.c +13 -5
  37. data/src/core/ext/upb-generated/src/proto/grpc/lb/v1/load_balancer.upb.h +34 -0
  38. data/src/core/lib/channel/channelz.h +11 -1
  39. data/src/core/lib/gpr/time_precise.cc +1 -1
  40. data/src/core/lib/gprpp/optional.h +26 -0
  41. data/src/core/lib/gprpp/string_view.h +14 -10
  42. data/src/core/lib/iomgr/executor.cc +1 -1
  43. data/src/core/lib/iomgr/fork_posix.cc +4 -0
  44. data/src/core/lib/iomgr/poller/eventmanager_libuv.cc +87 -0
  45. data/src/core/lib/iomgr/poller/eventmanager_libuv.h +88 -0
  46. data/src/core/lib/iomgr/socket_utils_common_posix.cc +14 -0
  47. data/src/core/lib/iomgr/socket_utils_posix.h +12 -0
  48. data/src/core/lib/iomgr/tcp_custom.h +3 -0
  49. data/src/core/lib/iomgr/tcp_posix.cc +607 -56
  50. data/src/core/lib/iomgr/tcp_server_custom.cc +15 -2
  51. data/src/core/lib/iomgr/tcp_server_utils_posix_common.cc +8 -0
  52. data/src/core/lib/json/json.h +11 -1
  53. data/src/core/lib/json/json_reader.cc +206 -28
  54. data/src/core/lib/json/json_writer.cc +111 -24
  55. data/src/core/lib/security/credentials/composite/composite_credentials.cc +7 -0
  56. data/src/core/lib/security/credentials/composite/composite_credentials.h +5 -1
  57. data/src/core/lib/security/credentials/credentials.h +10 -1
  58. data/src/core/lib/security/credentials/fake/fake_credentials.h +2 -1
  59. data/src/core/lib/security/credentials/oauth2/oauth2_credentials.cc +1 -1
  60. data/src/core/lib/security/credentials/plugin/plugin_credentials.cc +6 -4
  61. data/src/core/lib/security/credentials/plugin/plugin_credentials.h +2 -1
  62. data/src/core/lib/security/credentials/tls/grpc_tls_credentials_options.cc +20 -0
  63. data/src/core/lib/security/credentials/tls/grpc_tls_credentials_options.h +8 -0
  64. data/src/core/lib/security/credentials/tls/{spiffe_credentials.cc → tls_credentials.cc} +23 -24
  65. data/src/core/lib/security/credentials/tls/{spiffe_credentials.h → tls_credentials.h} +9 -9
  66. data/src/core/lib/security/security_connector/alts/alts_security_connector.cc +13 -0
  67. data/src/core/lib/security/security_connector/fake/fake_security_connector.cc +22 -2
  68. data/src/core/lib/security/security_connector/load_system_roots_fallback.cc +2 -2
  69. data/src/core/lib/security/security_connector/load_system_roots_linux.cc +2 -2
  70. data/src/core/lib/security/security_connector/local/local_security_connector.cc +30 -3
  71. data/src/core/lib/security/security_connector/ssl_utils.cc +45 -3
  72. data/src/core/lib/security/security_connector/ssl_utils.h +12 -0
  73. data/src/core/lib/security/security_connector/tls/{spiffe_security_connector.cc → tls_security_connector.cc} +82 -69
  74. data/src/core/lib/security/security_connector/tls/{spiffe_security_connector.h → tls_security_connector.h} +17 -18
  75. data/src/core/lib/security/transport/client_auth_filter.cc +33 -0
  76. data/src/core/lib/surface/completion_queue.cc +22 -1
  77. data/src/core/lib/surface/version.cc +1 -1
  78. data/src/core/tsi/alts/handshaker/alts_tsi_handshaker.cc +11 -1
  79. data/src/core/tsi/alts/handshaker/alts_tsi_handshaker.h +1 -1
  80. data/src/core/tsi/alts/zero_copy_frame_protector/alts_zero_copy_grpc_protector.cc +3 -3
  81. data/src/core/tsi/fake_transport_security.cc +7 -3
  82. data/src/core/tsi/fake_transport_security.h +2 -0
  83. data/src/core/tsi/ssl_transport_security.cc +144 -8
  84. data/src/core/tsi/ssl_transport_security.h +15 -1
  85. data/src/core/tsi/transport_security.cc +13 -0
  86. data/src/core/tsi/transport_security_grpc.cc +2 -2
  87. data/src/core/tsi/transport_security_grpc.h +2 -2
  88. data/src/core/tsi/transport_security_interface.h +12 -0
  89. data/src/ruby/bin/math_pb.rb +5 -5
  90. data/src/ruby/ext/grpc/rb_call_credentials.c +4 -1
  91. data/src/ruby/ext/grpc/rb_grpc_imports.generated.c +2 -0
  92. data/src/ruby/ext/grpc/rb_grpc_imports.generated.h +4 -1
  93. data/src/ruby/lib/grpc/version.rb +1 -1
  94. data/src/ruby/pb/grpc/health/v1/health_pb.rb +3 -3
  95. data/src/ruby/pb/src/proto/grpc/testing/empty_pb.rb +1 -1
  96. data/src/ruby/pb/src/proto/grpc/testing/messages_pb.rb +23 -13
  97. data/third_party/abseil-cpp/absl/algorithm/algorithm.h +159 -0
  98. data/third_party/abseil-cpp/absl/base/attributes.h +609 -0
  99. data/third_party/abseil-cpp/absl/base/call_once.h +226 -0
  100. data/third_party/abseil-cpp/absl/base/casts.h +184 -0
  101. data/third_party/abseil-cpp/absl/base/config.h +622 -0
  102. data/third_party/abseil-cpp/absl/base/const_init.h +76 -0
  103. data/third_party/abseil-cpp/absl/base/dynamic_annotations.cc +129 -0
  104. data/third_party/abseil-cpp/absl/base/dynamic_annotations.h +389 -0
  105. data/third_party/abseil-cpp/absl/base/internal/atomic_hook.h +179 -0
  106. data/third_party/abseil-cpp/absl/base/internal/bits.h +218 -0
  107. data/third_party/abseil-cpp/absl/base/internal/cycleclock.cc +107 -0
  108. data/third_party/abseil-cpp/absl/base/internal/cycleclock.h +94 -0
  109. data/third_party/abseil-cpp/absl/base/internal/endian.h +266 -0
  110. data/third_party/abseil-cpp/absl/base/internal/hide_ptr.h +51 -0
  111. data/third_party/abseil-cpp/absl/base/internal/identity.h +37 -0
  112. data/third_party/abseil-cpp/absl/base/internal/inline_variable.h +107 -0
  113. data/third_party/abseil-cpp/absl/base/internal/invoke.h +187 -0
  114. data/third_party/abseil-cpp/absl/base/internal/low_level_scheduling.h +107 -0
  115. data/third_party/abseil-cpp/absl/base/internal/per_thread_tls.h +52 -0
  116. data/third_party/abseil-cpp/absl/base/internal/raw_logging.cc +237 -0
  117. data/third_party/abseil-cpp/absl/base/internal/raw_logging.h +179 -0
  118. data/third_party/abseil-cpp/absl/base/internal/scheduling_mode.h +58 -0
  119. data/third_party/abseil-cpp/absl/base/internal/spinlock.cc +233 -0
  120. data/third_party/abseil-cpp/absl/base/internal/spinlock.h +243 -0
  121. data/third_party/abseil-cpp/absl/base/internal/spinlock_akaros.inc +35 -0
  122. data/third_party/abseil-cpp/absl/base/internal/spinlock_linux.inc +67 -0
  123. data/third_party/abseil-cpp/absl/base/internal/spinlock_posix.inc +46 -0
  124. data/third_party/abseil-cpp/absl/base/internal/spinlock_wait.cc +81 -0
  125. data/third_party/abseil-cpp/absl/base/internal/spinlock_wait.h +93 -0
  126. data/third_party/abseil-cpp/absl/base/internal/spinlock_win32.inc +37 -0
  127. data/third_party/abseil-cpp/absl/base/internal/sysinfo.cc +414 -0
  128. data/third_party/abseil-cpp/absl/base/internal/sysinfo.h +66 -0
  129. data/third_party/abseil-cpp/absl/base/internal/thread_annotations.h +271 -0
  130. data/third_party/abseil-cpp/absl/base/internal/thread_identity.cc +140 -0
  131. data/third_party/abseil-cpp/absl/base/internal/thread_identity.h +250 -0
  132. data/third_party/abseil-cpp/absl/base/internal/throw_delegate.cc +108 -0
  133. data/third_party/abseil-cpp/absl/base/internal/throw_delegate.h +75 -0
  134. data/third_party/abseil-cpp/absl/base/internal/tsan_mutex_interface.h +66 -0
  135. data/third_party/abseil-cpp/absl/base/internal/unaligned_access.h +158 -0
  136. data/third_party/abseil-cpp/absl/base/internal/unscaledcycleclock.cc +103 -0
  137. data/third_party/abseil-cpp/absl/base/internal/unscaledcycleclock.h +124 -0
  138. data/third_party/abseil-cpp/absl/base/log_severity.cc +27 -0
  139. data/third_party/abseil-cpp/absl/base/log_severity.h +121 -0
  140. data/third_party/abseil-cpp/absl/base/macros.h +220 -0
  141. data/third_party/abseil-cpp/absl/base/optimization.h +181 -0
  142. data/third_party/abseil-cpp/absl/base/options.h +214 -0
  143. data/third_party/abseil-cpp/absl/base/policy_checks.h +111 -0
  144. data/third_party/abseil-cpp/absl/base/port.h +26 -0
  145. data/third_party/abseil-cpp/absl/base/thread_annotations.h +280 -0
  146. data/third_party/abseil-cpp/absl/container/inlined_vector.h +848 -0
  147. data/third_party/abseil-cpp/absl/container/internal/compressed_tuple.h +265 -0
  148. data/third_party/abseil-cpp/absl/container/internal/inlined_vector.h +892 -0
  149. data/third_party/abseil-cpp/absl/memory/memory.h +695 -0
  150. data/third_party/abseil-cpp/absl/meta/type_traits.h +759 -0
  151. data/third_party/abseil-cpp/absl/numeric/int128.cc +404 -0
  152. data/third_party/abseil-cpp/absl/numeric/int128.h +1091 -0
  153. data/third_party/abseil-cpp/absl/numeric/int128_have_intrinsic.inc +302 -0
  154. data/third_party/abseil-cpp/absl/numeric/int128_no_intrinsic.inc +308 -0
  155. data/third_party/abseil-cpp/absl/strings/ascii.cc +200 -0
  156. data/third_party/abseil-cpp/absl/strings/ascii.h +241 -0
  157. data/third_party/abseil-cpp/absl/strings/charconv.cc +985 -0
  158. data/third_party/abseil-cpp/absl/strings/charconv.h +119 -0
  159. data/third_party/abseil-cpp/absl/strings/escaping.cc +949 -0
  160. data/third_party/abseil-cpp/absl/strings/escaping.h +164 -0
  161. data/third_party/abseil-cpp/absl/strings/internal/char_map.h +156 -0
  162. data/third_party/abseil-cpp/absl/strings/internal/charconv_bigint.cc +359 -0
  163. data/third_party/abseil-cpp/absl/strings/internal/charconv_bigint.h +421 -0
  164. data/third_party/abseil-cpp/absl/strings/internal/charconv_parse.cc +504 -0
  165. data/third_party/abseil-cpp/absl/strings/internal/charconv_parse.h +99 -0
  166. data/third_party/abseil-cpp/absl/strings/internal/escaping.cc +180 -0
  167. data/third_party/abseil-cpp/absl/strings/internal/escaping.h +58 -0
  168. data/third_party/abseil-cpp/absl/strings/internal/memutil.cc +112 -0
  169. data/third_party/abseil-cpp/absl/strings/internal/memutil.h +148 -0
  170. data/third_party/abseil-cpp/absl/strings/internal/ostringstream.cc +36 -0
  171. data/third_party/abseil-cpp/absl/strings/internal/ostringstream.h +89 -0
  172. data/third_party/abseil-cpp/absl/strings/internal/resize_uninitialized.h +73 -0
  173. data/third_party/abseil-cpp/absl/strings/internal/stl_type_traits.h +248 -0
  174. data/third_party/abseil-cpp/absl/strings/internal/str_join_internal.h +314 -0
  175. data/third_party/abseil-cpp/absl/strings/internal/str_split_internal.h +455 -0
  176. data/third_party/abseil-cpp/absl/strings/internal/utf8.cc +53 -0
  177. data/third_party/abseil-cpp/absl/strings/internal/utf8.h +50 -0
  178. data/third_party/abseil-cpp/absl/strings/match.cc +40 -0
  179. data/third_party/abseil-cpp/absl/strings/match.h +90 -0
  180. data/third_party/abseil-cpp/absl/strings/numbers.cc +916 -0
  181. data/third_party/abseil-cpp/absl/strings/numbers.h +263 -0
  182. data/third_party/abseil-cpp/absl/strings/str_cat.cc +246 -0
  183. data/third_party/abseil-cpp/absl/strings/str_cat.h +408 -0
  184. data/third_party/abseil-cpp/absl/strings/str_join.h +293 -0
  185. data/third_party/abseil-cpp/absl/strings/str_replace.cc +82 -0
  186. data/third_party/abseil-cpp/absl/strings/str_replace.h +219 -0
  187. data/third_party/abseil-cpp/absl/strings/str_split.cc +139 -0
  188. data/third_party/abseil-cpp/absl/strings/str_split.h +513 -0
  189. data/third_party/abseil-cpp/absl/strings/string_view.cc +235 -0
  190. data/third_party/abseil-cpp/absl/strings/string_view.h +615 -0
  191. data/third_party/abseil-cpp/absl/strings/strip.h +91 -0
  192. data/third_party/abseil-cpp/absl/strings/substitute.cc +171 -0
  193. data/third_party/abseil-cpp/absl/strings/substitute.h +693 -0
  194. data/third_party/abseil-cpp/absl/types/bad_optional_access.cc +48 -0
  195. data/third_party/abseil-cpp/absl/types/bad_optional_access.h +78 -0
  196. data/third_party/abseil-cpp/absl/types/internal/optional.h +396 -0
  197. data/third_party/abseil-cpp/absl/types/internal/span.h +128 -0
  198. data/third_party/abseil-cpp/absl/types/optional.h +776 -0
  199. data/third_party/abseil-cpp/absl/types/span.h +713 -0
  200. data/third_party/abseil-cpp/absl/utility/utility.h +350 -0
  201. data/third_party/upb/upb/decode.c +4 -0
  202. data/third_party/upb/upb/port.c +0 -1
  203. data/third_party/upb/upb/port_def.inc +1 -3
  204. data/third_party/upb/upb/table.c +2 -1
  205. metadata +147 -43
  206. data/src/core/lib/json/json_common.h +0 -34
  207. data/src/core/lib/json/json_reader.h +0 -146
  208. data/src/core/lib/json/json_string.cc +0 -367
  209. data/src/core/lib/json/json_writer.h +0 -84
@@ -121,6 +121,16 @@ class StringView final {
121
121
  size());
122
122
  }
123
123
 
124
+ // Compares with other.
125
+ inline int compare(StringView other) {
126
+ const size_t len = GPR_MIN(size(), other.size());
127
+ const int ret = strncmp(data(), other.data(), len);
128
+ if (ret != 0) return ret;
129
+ if (size() == other.size()) return 0;
130
+ if (size() < other.size()) return -1;
131
+ return 1;
132
+ }
133
+
124
134
  private:
125
135
  const char* ptr_;
126
136
  size_t size_;
@@ -133,6 +143,10 @@ inline bool operator==(StringView lhs, StringView rhs) {
133
143
 
134
144
  inline bool operator!=(StringView lhs, StringView rhs) { return !(lhs == rhs); }
135
145
 
146
+ inline bool operator<(StringView lhs, StringView rhs) {
147
+ return lhs.compare(rhs) < 0;
148
+ }
149
+
136
150
  #endif // GRPC_USE_ABSL
137
151
 
138
152
  // Converts grpc_slice to StringView.
@@ -150,16 +164,6 @@ inline grpc_core::UniquePtr<char> StringViewToCString(const StringView sv) {
150
164
  return grpc_core::UniquePtr<char>(str);
151
165
  }
152
166
 
153
- // Compares lhs and rhs.
154
- inline int StringViewCmp(const StringView lhs, const StringView rhs) {
155
- const size_t len = GPR_MIN(lhs.size(), rhs.size());
156
- const int ret = strncmp(lhs.data(), rhs.data(), len);
157
- if (ret != 0) return ret;
158
- if (lhs.size() == rhs.size()) return 0;
159
- if (lhs.size() < rhs.size()) return -1;
160
- return 1;
161
- }
162
-
163
167
  } // namespace grpc_core
164
168
 
165
169
  #endif /* GRPC_CORE_LIB_GPRPP_STRING_VIEW_H */
@@ -143,7 +143,7 @@ void Executor::SetThreading(bool threading) {
143
143
 
144
144
  if (threading) {
145
145
  if (curr_num_threads > 0) {
146
- EXECUTOR_TRACE("(%s) SetThreading(true). curr_num_threads == 0", name_);
146
+ EXECUTOR_TRACE("(%s) SetThreading(true). curr_num_threads > 0", name_);
147
147
  return;
148
148
  }
149
149
 
@@ -22,6 +22,10 @@
22
22
 
23
23
  #ifdef GRPC_POSIX_FORK
24
24
 
25
+ #ifdef GRPC_POSIX_FORK_ALLOW_PTHREAD_ATFORK
26
+ #include <pthread.h>
27
+ #endif
28
+
25
29
  #include <string.h>
26
30
 
27
31
  #include <grpc/fork.h>
@@ -0,0 +1,87 @@
1
+ /*
2
+ *
3
+ * Copyright 2019 gRPC authors.
4
+ *
5
+ * Licensed under the Apache License, Version 2.0 (the "License");
6
+ * you may not use this file except in compliance with the License.
7
+ * You may obtain a copy of the License at
8
+ *
9
+ * http://www.apache.org/licenses/LICENSE-2.0
10
+ *
11
+ * Unless required by applicable law or agreed to in writing, software
12
+ * distributed under the License is distributed on an "AS IS" BASIS,
13
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ * See the License for the specific language governing permissions and
15
+ * limitations under the License.
16
+ *
17
+ */
18
+
19
+ #include <grpc/support/port_platform.h>
20
+
21
+ #include "src/core/lib/iomgr/poller/eventmanager_libuv.h"
22
+
23
+ #include <grpc/support/time.h>
24
+
25
+ grpc::experimental::LibuvEventManager::Options::Options() : num_workers_(-1) {}
26
+ grpc::experimental::LibuvEventManager::Options::Options(int num_workers)
27
+ : num_workers_(num_workers) {}
28
+
29
+ grpc::experimental::LibuvEventManager::LibuvEventManager(const Options& options)
30
+ : options_(options) {
31
+ int num_workers = options_.num_workers();
32
+ // Number of workers can't be 0 if we do not accept thread donation.
33
+ // TODO(guantaol): replaces the hard-coded number with a flag.
34
+ if (num_workers <= 0) num_workers = 32;
35
+
36
+ for (int i = 0; i < num_workers; i++) {
37
+ workers_.emplace_back(
38
+ options_.thread_name_prefix().c_str(),
39
+ [](void* em) { static_cast<LibuvEventManager*>(em)->RunWorkerLoop(); },
40
+ this);
41
+ workers_.back().Start();
42
+ }
43
+ }
44
+
45
+ grpc::experimental::LibuvEventManager::~LibuvEventManager() {
46
+ Shutdown();
47
+ for (auto& th : workers_) {
48
+ th.Join();
49
+ }
50
+ }
51
+
52
+ void grpc::experimental::LibuvEventManager::RunWorkerLoop() {
53
+ while (true) {
54
+ // TODO(guantaol): extend the worker loop with real work.
55
+ if (ShouldStop()) return;
56
+ gpr_sleep_until(gpr_time_add(gpr_now(GPR_CLOCK_MONOTONIC),
57
+ gpr_time_from_micros(10, GPR_TIMESPAN)));
58
+ }
59
+ }
60
+
61
+ bool grpc::experimental::LibuvEventManager::ShouldStop() {
62
+ return should_stop_.Load(grpc_core::MemoryOrder::ACQUIRE) != 0;
63
+ }
64
+
65
+ void grpc::experimental::LibuvEventManager::Shutdown() {
66
+ if (should_stop_.Load(grpc_core::MemoryOrder::ACQUIRE))
67
+ return; // Already shut down.
68
+
69
+ {
70
+ grpc_core::MutexLock lock(&shutdown_mu_);
71
+ while (shutdown_refcount_.Load(grpc_core::MemoryOrder::ACQUIRE) > 0) {
72
+ shutdown_cv_.Wait(&shutdown_mu_);
73
+ }
74
+ }
75
+ should_stop_.Store(true, grpc_core::MemoryOrder::RELEASE);
76
+ }
77
+
78
+ void grpc::experimental::LibuvEventManager::ShutdownRef() {
79
+ shutdown_refcount_.FetchAdd(1, grpc_core::MemoryOrder::RELAXED);
80
+ }
81
+
82
+ void grpc::experimental::LibuvEventManager::ShutdownUnref() {
83
+ if (shutdown_refcount_.FetchSub(1, grpc_core::MemoryOrder::ACQ_REL) == 1) {
84
+ grpc_core::MutexLock lock(&shutdown_mu_);
85
+ shutdown_cv_.Signal();
86
+ }
87
+ }
@@ -0,0 +1,88 @@
1
+ /*
2
+ *
3
+ * Copyright 2019 gRPC authors.
4
+ *
5
+ * Licensed under the Apache License, Version 2.0 (the "License");
6
+ * you may not use this file except in compliance with the License.
7
+ * You may obtain a copy of the License at
8
+ *
9
+ * http://www.apache.org/licenses/LICENSE-2.0
10
+ *
11
+ * Unless required by applicable law or agreed to in writing, software
12
+ * distributed under the License is distributed on an "AS IS" BASIS,
13
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ * See the License for the specific language governing permissions and
15
+ * limitations under the License.
16
+ *
17
+ */
18
+
19
+ #ifndef GRPC_CORE_LIB_IOMGR_POLLER_EVENTMANAGER_LIBUV_H
20
+ #define GRPC_CORE_LIB_IOMGR_POLLER_EVENTMANAGER_LIBUV_H
21
+
22
+ #include <grpc/support/port_platform.h>
23
+
24
+ #include <string>
25
+ #include <vector>
26
+
27
+ #include "src/core/lib/gprpp/atomic.h"
28
+ #include "src/core/lib/gprpp/sync.h"
29
+ #include "src/core/lib/gprpp/thd.h"
30
+
31
+ namespace grpc {
32
+ namespace experimental {
33
+
34
+ class LibuvEventManager {
35
+ public:
36
+ class Options {
37
+ public:
38
+ Options();
39
+ Options(int num_workers);
40
+
41
+ int num_workers() const { return num_workers_; }
42
+ void set_num_workers(int num) { num_workers_ = num; }
43
+
44
+ const std::string& thread_name_prefix() const {
45
+ return thread_name_prefix_;
46
+ }
47
+ void set_thread_name_prefix(const std::string& name) {
48
+ thread_name_prefix_ = name;
49
+ }
50
+
51
+ private:
52
+ // Number of worker threads to create at startup. If less than 0, uses the
53
+ // default value of 32.
54
+ int num_workers_;
55
+ // Name prefix used for worker.
56
+ std::string thread_name_prefix_;
57
+ };
58
+
59
+ explicit LibuvEventManager(const Options& options);
60
+ virtual ~LibuvEventManager();
61
+
62
+ void Shutdown();
63
+ void ShutdownRef();
64
+ void ShutdownUnref();
65
+
66
+ private:
67
+ // Function run by the worker threads.
68
+ void RunWorkerLoop();
69
+
70
+ // Whether the EventManager has been shut down.
71
+ bool ShouldStop();
72
+
73
+ const Options options_;
74
+ // Whether the EventManager workers should be stopped.
75
+ grpc_core::Atomic<bool> should_stop_{false};
76
+ // A refcount preventing the EventManager from shutdown.
77
+ grpc_core::Atomic<int> shutdown_refcount_{0};
78
+ // Worker threads of the EventManager.
79
+ std::vector<grpc_core::Thread> workers_;
80
+ // Mutex and condition variable used for shutdown.
81
+ grpc_core::Mutex shutdown_mu_;
82
+ grpc_core::CondVar shutdown_cv_;
83
+ };
84
+
85
+ } // namespace experimental
86
+ } // namespace grpc
87
+
88
+ #endif /* GRPC_CORE_LIB_IOMGR_POLLER_EVENTMANAGER_LIBUV_H */
@@ -50,6 +50,20 @@
50
50
  #include "src/core/lib/iomgr/sockaddr.h"
51
51
  #include "src/core/lib/iomgr/sockaddr_utils.h"
52
52
 
53
+ /* set a socket to use zerocopy */
54
+ grpc_error* grpc_set_socket_zerocopy(int fd) {
55
+ #ifdef GRPC_LINUX_ERRQUEUE
56
+ const int enable = 1;
57
+ auto err = setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, &enable, sizeof(enable));
58
+ if (err != 0) {
59
+ return GRPC_OS_ERROR(errno, "setsockopt(SO_ZEROCOPY)");
60
+ }
61
+ return GRPC_ERROR_NONE;
62
+ #else
63
+ return GRPC_OS_ERROR(ENOSYS, "setsockopt(SO_ZEROCOPY)");
64
+ #endif
65
+ }
66
+
53
67
  /* set a socket to non blocking mode */
54
68
  grpc_error* grpc_set_socket_nonblocking(int fd, int non_blocking) {
55
69
  int oldflags = fcntl(fd, F_GETFL, 0);
@@ -31,10 +31,22 @@
31
31
  #include "src/core/lib/iomgr/socket_factory_posix.h"
32
32
  #include "src/core/lib/iomgr/socket_mutator.h"
33
33
 
34
+ #ifdef GRPC_LINUX_ERRQUEUE
35
+ #ifndef SO_ZEROCOPY
36
+ #define SO_ZEROCOPY 60
37
+ #endif
38
+ #ifndef SO_EE_ORIGIN_ZEROCOPY
39
+ #define SO_EE_ORIGIN_ZEROCOPY 5
40
+ #endif
41
+ #endif /* ifdef GRPC_LINUX_ERRQUEUE */
42
+
34
43
  /* a wrapper for accept or accept4 */
35
44
  int grpc_accept4(int sockfd, grpc_resolved_address* resolved_addr, int nonblock,
36
45
  int cloexec);
37
46
 
47
+ /* set a socket to use zerocopy */
48
+ grpc_error* grpc_set_socket_zerocopy(int fd);
49
+
38
50
  /* set a socket to non blocking mode */
39
51
  grpc_error* grpc_set_socket_nonblocking(int fd, int non_blocking);
40
52
 
@@ -24,6 +24,9 @@
24
24
  #include "src/core/lib/iomgr/endpoint.h"
25
25
  #include "src/core/lib/iomgr/sockaddr.h"
26
26
 
27
+ // Same number as the micro of SO_REUSEPORT in kernel
28
+ #define GRPC_CUSTOM_SOCKET_OPT_SO_REUSEPORT (0x00000200u)
29
+
27
30
  typedef struct grpc_tcp_listener grpc_tcp_listener;
28
31
  typedef struct grpc_custom_tcp_connect grpc_custom_tcp_connect;
29
32
 
@@ -36,6 +36,7 @@
36
36
  #include <sys/types.h>
37
37
  #include <unistd.h>
38
38
  #include <algorithm>
39
+ #include <unordered_map>
39
40
 
40
41
  #include <grpc/slice.h>
41
42
  #include <grpc/support/alloc.h>
@@ -49,9 +50,11 @@
49
50
  #include "src/core/lib/debug/trace.h"
50
51
  #include "src/core/lib/gpr/string.h"
51
52
  #include "src/core/lib/gpr/useful.h"
53
+ #include "src/core/lib/gprpp/sync.h"
52
54
  #include "src/core/lib/iomgr/buffer_list.h"
53
55
  #include "src/core/lib/iomgr/ev_posix.h"
54
56
  #include "src/core/lib/iomgr/executor.h"
57
+ #include "src/core/lib/iomgr/socket_utils_posix.h"
55
58
  #include "src/core/lib/profiling/timers.h"
56
59
  #include "src/core/lib/slice/slice_internal.h"
57
60
  #include "src/core/lib/slice/slice_string_helpers.h"
@@ -71,6 +74,15 @@
71
74
  #define SENDMSG_FLAGS 0
72
75
  #endif
73
76
 
77
+ // TCP zero copy sendmsg flag.
78
+ // NB: We define this here as a fallback in case we're using an older set of
79
+ // library headers that has not defined MSG_ZEROCOPY. Since this constant is
80
+ // part of the kernel, we are guaranteed it will never change/disagree so
81
+ // defining it here is safe.
82
+ #ifndef MSG_ZEROCOPY
83
+ #define MSG_ZEROCOPY 0x4000000
84
+ #endif
85
+
74
86
  #ifdef GRPC_MSG_IOVLEN_TYPE
75
87
  typedef GRPC_MSG_IOVLEN_TYPE msg_iovlen_type;
76
88
  #else
@@ -79,6 +91,264 @@ typedef size_t msg_iovlen_type;
79
91
 
80
92
  extern grpc_core::TraceFlag grpc_tcp_trace;
81
93
 
94
+ namespace grpc_core {
95
+
96
+ class TcpZerocopySendRecord {
97
+ public:
98
+ TcpZerocopySendRecord() { grpc_slice_buffer_init(&buf_); }
99
+
100
+ ~TcpZerocopySendRecord() {
101
+ AssertEmpty();
102
+ grpc_slice_buffer_destroy_internal(&buf_);
103
+ }
104
+
105
+ // Given the slices that we wish to send, and the current offset into the
106
+ // slice buffer (indicating which have already been sent), populate an iovec
107
+ // array that will be used for a zerocopy enabled sendmsg().
108
+ msg_iovlen_type PopulateIovs(size_t* unwind_slice_idx,
109
+ size_t* unwind_byte_idx, size_t* sending_length,
110
+ iovec* iov);
111
+
112
+ // A sendmsg() may not be able to send the bytes that we requested at this
113
+ // time, returning EAGAIN (possibly due to backpressure). In this case,
114
+ // unwind the offset into the slice buffer so we retry sending these bytes.
115
+ void UnwindIfThrottled(size_t unwind_slice_idx, size_t unwind_byte_idx) {
116
+ out_offset_.byte_idx = unwind_byte_idx;
117
+ out_offset_.slice_idx = unwind_slice_idx;
118
+ }
119
+
120
+ // Update the offset into the slice buffer based on how much we wanted to sent
121
+ // vs. what sendmsg() actually sent (which may be lower, possibly due to
122
+ // backpressure).
123
+ void UpdateOffsetForBytesSent(size_t sending_length, size_t actually_sent);
124
+
125
+ // Indicates whether all underlying data has been sent or not.
126
+ bool AllSlicesSent() { return out_offset_.slice_idx == buf_.count; }
127
+
128
+ // Reset this structure for a new tcp_write() with zerocopy.
129
+ void PrepareForSends(grpc_slice_buffer* slices_to_send) {
130
+ AssertEmpty();
131
+ out_offset_.slice_idx = 0;
132
+ out_offset_.byte_idx = 0;
133
+ grpc_slice_buffer_swap(slices_to_send, &buf_);
134
+ Ref();
135
+ }
136
+
137
+ // References: 1 reference per sendmsg(), and 1 for the tcp_write().
138
+ void Ref() { ref_.FetchAdd(1, MemoryOrder::RELAXED); }
139
+
140
+ // Unref: called when we get an error queue notification for a sendmsg(), if a
141
+ // sendmsg() failed or when tcp_write() is done.
142
+ bool Unref() {
143
+ const intptr_t prior = ref_.FetchSub(1, MemoryOrder::ACQ_REL);
144
+ GPR_DEBUG_ASSERT(prior > 0);
145
+ if (prior == 1) {
146
+ AllSendsComplete();
147
+ return true;
148
+ }
149
+ return false;
150
+ }
151
+
152
+ private:
153
+ struct OutgoingOffset {
154
+ size_t slice_idx = 0;
155
+ size_t byte_idx = 0;
156
+ };
157
+
158
+ void AssertEmpty() {
159
+ GPR_DEBUG_ASSERT(buf_.count == 0);
160
+ GPR_DEBUG_ASSERT(buf_.length == 0);
161
+ GPR_DEBUG_ASSERT(ref_.Load(MemoryOrder::RELAXED) == 0);
162
+ }
163
+
164
+ // When all sendmsg() calls associated with this tcp_write() have been
165
+ // completed (ie. we have received the notifications for each sequence number
166
+ // for each sendmsg()) and all reference counts have been dropped, drop our
167
+ // reference to the underlying data since we no longer need it.
168
+ void AllSendsComplete() {
169
+ GPR_DEBUG_ASSERT(ref_.Load(MemoryOrder::RELAXED) == 0);
170
+ grpc_slice_buffer_reset_and_unref_internal(&buf_);
171
+ }
172
+
173
+ grpc_slice_buffer buf_;
174
+ Atomic<intptr_t> ref_;
175
+ OutgoingOffset out_offset_;
176
+ };
177
+
178
+ class TcpZerocopySendCtx {
179
+ public:
180
+ static constexpr int kDefaultMaxSends = 4;
181
+ static constexpr size_t kDefaultSendBytesThreshold = 16 * 1024; // 16KB
182
+
183
+ TcpZerocopySendCtx(int max_sends = kDefaultMaxSends,
184
+ size_t send_bytes_threshold = kDefaultSendBytesThreshold)
185
+ : max_sends_(max_sends),
186
+ free_send_records_size_(max_sends),
187
+ threshold_bytes_(send_bytes_threshold) {
188
+ send_records_ = static_cast<TcpZerocopySendRecord*>(
189
+ gpr_malloc(max_sends * sizeof(*send_records_)));
190
+ free_send_records_ = static_cast<TcpZerocopySendRecord**>(
191
+ gpr_malloc(max_sends * sizeof(*free_send_records_)));
192
+ if (send_records_ == nullptr || free_send_records_ == nullptr) {
193
+ gpr_free(send_records_);
194
+ gpr_free(free_send_records_);
195
+ gpr_log(GPR_INFO, "Disabling TCP TX zerocopy due to memory pressure.\n");
196
+ memory_limited_ = true;
197
+ } else {
198
+ for (int idx = 0; idx < max_sends_; ++idx) {
199
+ new (send_records_ + idx) TcpZerocopySendRecord();
200
+ free_send_records_[idx] = send_records_ + idx;
201
+ }
202
+ }
203
+ }
204
+
205
+ ~TcpZerocopySendCtx() {
206
+ if (send_records_ != nullptr) {
207
+ for (int idx = 0; idx < max_sends_; ++idx) {
208
+ send_records_[idx].~TcpZerocopySendRecord();
209
+ }
210
+ }
211
+ gpr_free(send_records_);
212
+ gpr_free(free_send_records_);
213
+ }
214
+
215
+ // True if we were unable to allocate the various bookkeeping structures at
216
+ // transport initialization time. If memory limited, we do not zerocopy.
217
+ bool memory_limited() const { return memory_limited_; }
218
+
219
+ // TCP send zerocopy maintains an implicit sequence number for every
220
+ // successful sendmsg() with zerocopy enabled; the kernel later gives us an
221
+ // error queue notification with this sequence number indicating that the
222
+ // underlying data buffers that we sent can now be released. Once that
223
+ // notification is received, we can release the buffers associated with this
224
+ // zerocopy send record. Here, we associate the sequence number with the data
225
+ // buffers that were sent with the corresponding call to sendmsg().
226
+ void NoteSend(TcpZerocopySendRecord* record) {
227
+ record->Ref();
228
+ AssociateSeqWithSendRecord(last_send_, record);
229
+ ++last_send_;
230
+ }
231
+
232
+ // If sendmsg() actually failed, though, we need to revert the sequence number
233
+ // that we speculatively bumped before calling sendmsg(). Note that we bump
234
+ // this sequence number and perform relevant bookkeeping (see: NoteSend())
235
+ // *before* calling sendmsg() since, if we called it *after* sendmsg(), then
236
+ // there is a possible race with the release notification which could occur on
237
+ // another thread before we do the necessary bookkeeping. Hence, calling
238
+ // NoteSend() *before* sendmsg() and implementing an undo function is needed.
239
+ void UndoSend() {
240
+ --last_send_;
241
+ if (ReleaseSendRecord(last_send_)->Unref()) {
242
+ // We should still be holding the ref taken by tcp_write().
243
+ GPR_DEBUG_ASSERT(0);
244
+ }
245
+ }
246
+
247
  // Simply associate this send record (and the underlying sent data buffers)
  // with the implicit sequence number for this zerocopy sendmsg().
  // Entries are removed again by ReleaseSendRecord()/ReleaseSendRecordLocked().
  void AssociateSeqWithSendRecord(uint32_t seq, TcpZerocopySendRecord* record) {
    MutexLock guard(&lock_);
    ctx_lookup_.emplace(seq, record);
  }
253
+
254
  // Get a send record for a send that we wish to do with zerocopy.
  // Returns nullptr when the context has been shut down or the free pool is
  // currently exhausted (see TryGetSendRecordLocked()).
  TcpZerocopySendRecord* GetSendRecord() {
    MutexLock guard(&lock_);
    return TryGetSendRecordLocked();
  }
259
+
260
  // A given send record corresponds to a single tcp_write() with zerocopy
  // enabled. This can result in several sendmsg() calls to flush all of the
  // data to wire. Each sendmsg() takes a reference on the
  // TcpZerocopySendRecord, and corresponds to a single sequence number.
  // ReleaseSendRecord releases a reference on TcpZerocopySendRecord for a
  // single sequence number. This is called either when we receive the relevant
  // error queue notification (saying that we can discard the underlying
  // buffers for this sendmsg()) is received from the kernel - or, in case
  // sendmsg() was unsuccessful to begin with.
  TcpZerocopySendRecord* ReleaseSendRecord(uint32_t seq) {
    MutexLock guard(&lock_);
    return ReleaseSendRecordLocked(seq);
  }
273
+
274
  // After all the references to a TcpZerocopySendRecord are released, we can
  // add it back to the pool (of size max_sends_). Note that we can only have
  // max_sends_ tcp_write() instances with zerocopy enabled in flight at the
  // same time.
  void PutSendRecord(TcpZerocopySendRecord* record) {
    // The record must point into the pool allocated by the constructor.
    GPR_DEBUG_ASSERT(record >= send_records_ &&
                     record < send_records_ + max_sends_);
    MutexLock guard(&lock_);
    PutSendRecordLocked(record);
  }
284
+
285
  // Indicate that we are disposing of this zerocopy context. This indicator
  // will prevent new zerocopy writes from being issued.
  // RELEASE store pairs with the ACQUIRE load in TryGetSendRecordLocked().
  void Shutdown() { shutdown_.Store(true, MemoryOrder::RELEASE); }
288
+
289
  // Indicates that there are no inflight tcp_write() instances with zerocopy
  // enabled: every record has been returned to the free pool.
  bool AllSendRecordsEmpty() {
    MutexLock guard(&lock_);
    return free_send_records_size_ == max_sends_;
  }
295
+
296
  // Whether zerocopy sends are enabled for this endpoint (set via
  // set_enabled() after SO_ZEROCOPY was successfully applied to the socket).
  bool enabled() const { return enabled_; }
297
+
298
  void set_enabled(bool enabled) {
    // Zerocopy can only be enabled if the bookkeeping pool was allocated.
    GPR_DEBUG_ASSERT(!enabled || !memory_limited());
    enabled_ = enabled;
  }
302
+
303
  // Only use zerocopy if we are sending at least this many bytes. The
  // additional overhead of reading the error queue for notifications means that
  // zerocopy is not useful for small transfers.
  size_t threshold_bytes() const { return threshold_bytes_; }
307
+
308
+ private:
309
+ TcpZerocopySendRecord* ReleaseSendRecordLocked(uint32_t seq) {
310
+ auto iter = ctx_lookup_.find(seq);
311
+ GPR_DEBUG_ASSERT(iter != ctx_lookup_.end());
312
+ TcpZerocopySendRecord* record = iter->second;
313
+ ctx_lookup_.erase(iter);
314
+ return record;
315
+ }
316
+
317
+ TcpZerocopySendRecord* TryGetSendRecordLocked() {
318
+ if (shutdown_.Load(MemoryOrder::ACQUIRE)) {
319
+ return nullptr;
320
+ }
321
+ if (free_send_records_size_ == 0) {
322
+ return nullptr;
323
+ }
324
+ free_send_records_size_--;
325
+ return free_send_records_[free_send_records_size_];
326
+ }
327
+
328
+ void PutSendRecordLocked(TcpZerocopySendRecord* record) {
329
+ GPR_DEBUG_ASSERT(free_send_records_size_ < max_sends_);
330
+ free_send_records_[free_send_records_size_] = record;
331
+ free_send_records_size_++;
332
+ }
333
+
334
+ TcpZerocopySendRecord* send_records_;
335
+ TcpZerocopySendRecord** free_send_records_;
336
+ int max_sends_;
337
+ int free_send_records_size_;
338
+ Mutex lock_;
339
+ uint32_t last_send_ = 0;
340
+ Atomic<bool> shutdown_;
341
+ bool enabled_ = false;
342
+ size_t threshold_bytes_ = kDefaultSendBytesThreshold;
343
+ std::unordered_map<uint32_t, TcpZerocopySendRecord*> ctx_lookup_;
344
+ bool memory_limited_ = false;
345
+ };
346
+
347
+ } // namespace grpc_core
348
+
349
+ using grpc_core::TcpZerocopySendCtx;
350
+ using grpc_core::TcpZerocopySendRecord;
351
+
82
352
  namespace {
83
353
  struct grpc_tcp {
84
354
  grpc_endpoint base;
@@ -142,6 +412,8 @@ struct grpc_tcp {
142
412
  bool ts_capable; /* Cache whether we can set timestamping options */
143
413
  gpr_atm stop_error_notification; /* Set to 1 if we do not want to be notified
144
414
  on errors anymore */
415
+ TcpZerocopySendCtx tcp_zerocopy_send_ctx;
416
+ TcpZerocopySendRecord* current_zerocopy_send = nullptr;
145
417
  };
146
418
 
147
419
  struct backup_poller {
@@ -151,6 +423,8 @@ struct backup_poller {
151
423
 
152
424
  } // namespace
153
425
 
426
+ static void ZerocopyDisableAndWaitForRemaining(grpc_tcp* tcp);
427
+
154
428
  #define BACKUP_POLLER_POLLSET(b) ((grpc_pollset*)((b) + 1))
155
429
 
156
430
  static gpr_atm g_uncovered_notifications_pending;
@@ -339,6 +613,7 @@ static void tcp_handle_write(void* arg /* grpc_tcp */, grpc_error* error);
339
613
 
340
614
  static void tcp_shutdown(grpc_endpoint* ep, grpc_error* why) {
341
615
  grpc_tcp* tcp = reinterpret_cast<grpc_tcp*>(ep);
616
+ ZerocopyDisableAndWaitForRemaining(tcp);
342
617
  grpc_fd_shutdown(tcp->em_fd, why);
343
618
  grpc_resource_user_shutdown(tcp->resource_user);
344
619
  }
@@ -357,6 +632,7 @@ static void tcp_free(grpc_tcp* tcp) {
357
632
  gpr_mu_unlock(&tcp->tb_mu);
358
633
  tcp->outgoing_buffer_arg = nullptr;
359
634
  gpr_mu_destroy(&tcp->tb_mu);
635
+ tcp->tcp_zerocopy_send_ctx.~TcpZerocopySendCtx();
360
636
  gpr_free(tcp);
361
637
  }
362
638
 
@@ -390,6 +666,7 @@ static void tcp_destroy(grpc_endpoint* ep) {
390
666
  grpc_tcp* tcp = reinterpret_cast<grpc_tcp*>(ep);
391
667
  grpc_slice_buffer_reset_and_unref_internal(&tcp->last_read_buffer);
392
668
  if (grpc_event_engine_can_track_errors()) {
669
+ ZerocopyDisableAndWaitForRemaining(tcp);
393
670
  gpr_atm_no_barrier_store(&tcp->stop_error_notification, true);
394
671
  grpc_fd_set_error(tcp->em_fd);
395
672
  }
@@ -652,13 +929,13 @@ static void tcp_read(grpc_endpoint* ep, grpc_slice_buffer* incoming_buffer,
652
929
 
653
930
  /* A wrapper around sendmsg. It sends \a msg over \a fd and returns the number
654
931
  * of bytes sent. */
655
- ssize_t tcp_send(int fd, const struct msghdr* msg) {
932
+ ssize_t tcp_send(int fd, const struct msghdr* msg, int additional_flags = 0) {
656
933
  GPR_TIMER_SCOPE("sendmsg", 1);
657
934
  ssize_t sent_length;
658
935
  do {
659
936
  /* TODO(klempner): Cork if this is a partial write */
660
937
  GRPC_STATS_INC_SYSCALL_WRITE();
661
- sent_length = sendmsg(fd, msg, SENDMSG_FLAGS);
938
+ sent_length = sendmsg(fd, msg, SENDMSG_FLAGS | additional_flags);
662
939
  } while (sent_length < 0 && errno == EINTR);
663
940
  return sent_length;
664
941
  }
@@ -671,16 +948,52 @@ ssize_t tcp_send(int fd, const struct msghdr* msg) {
671
948
  */
672
949
  static bool tcp_write_with_timestamps(grpc_tcp* tcp, struct msghdr* msg,
673
950
  size_t sending_length,
674
- ssize_t* sent_length);
951
+ ssize_t* sent_length,
952
+ int additional_flags = 0);
675
953
 
676
954
  /** The callback function to be invoked when we get an error on the socket. */
677
955
  static void tcp_handle_error(void* arg /* grpc_tcp */, grpc_error* error);
678
956
 
957
+ static TcpZerocopySendRecord* tcp_get_send_zerocopy_record(
958
+ grpc_tcp* tcp, grpc_slice_buffer* buf);
959
+
679
960
  #ifdef GRPC_LINUX_ERRQUEUE
961
+ static bool process_errors(grpc_tcp* tcp);
962
+
963
+ static TcpZerocopySendRecord* tcp_get_send_zerocopy_record(
964
+ grpc_tcp* tcp, grpc_slice_buffer* buf) {
965
+ TcpZerocopySendRecord* zerocopy_send_record = nullptr;
966
+ const bool use_zerocopy =
967
+ tcp->tcp_zerocopy_send_ctx.enabled() &&
968
+ tcp->tcp_zerocopy_send_ctx.threshold_bytes() < buf->length;
969
+ if (use_zerocopy) {
970
+ zerocopy_send_record = tcp->tcp_zerocopy_send_ctx.GetSendRecord();
971
+ if (zerocopy_send_record == nullptr) {
972
+ process_errors(tcp);
973
+ zerocopy_send_record = tcp->tcp_zerocopy_send_ctx.GetSendRecord();
974
+ }
975
+ if (zerocopy_send_record != nullptr) {
976
+ zerocopy_send_record->PrepareForSends(buf);
977
+ GPR_DEBUG_ASSERT(buf->count == 0);
978
+ GPR_DEBUG_ASSERT(buf->length == 0);
979
+ tcp->outgoing_byte_idx = 0;
980
+ tcp->outgoing_buffer = nullptr;
981
+ }
982
+ }
983
+ return zerocopy_send_record;
984
+ }
985
+
986
+ static void ZerocopyDisableAndWaitForRemaining(grpc_tcp* tcp) {
987
+ tcp->tcp_zerocopy_send_ctx.Shutdown();
988
+ while (!tcp->tcp_zerocopy_send_ctx.AllSendRecordsEmpty()) {
989
+ process_errors(tcp);
990
+ }
991
+ }
680
992
 
681
993
  static bool tcp_write_with_timestamps(grpc_tcp* tcp, struct msghdr* msg,
682
994
  size_t sending_length,
683
- ssize_t* sent_length) {
995
+ ssize_t* sent_length,
996
+ int additional_flags) {
684
997
  if (!tcp->socket_ts_enabled) {
685
998
  uint32_t opt = grpc_core::kTimestampingSocketOptions;
686
999
  if (setsockopt(tcp->fd, SOL_SOCKET, SO_TIMESTAMPING,
@@ -708,7 +1021,7 @@ static bool tcp_write_with_timestamps(grpc_tcp* tcp, struct msghdr* msg,
708
1021
  msg->msg_controllen = CMSG_SPACE(sizeof(uint32_t));
709
1022
 
710
1023
  /* If there was an error on sendmsg the logic in tcp_flush will handle it. */
711
- ssize_t length = tcp_send(tcp->fd, msg);
1024
+ ssize_t length = tcp_send(tcp->fd, msg, additional_flags);
712
1025
  *sent_length = length;
713
1026
  /* Only save timestamps if all the bytes were taken by sendmsg. */
714
1027
  if (sending_length == static_cast<size_t>(length)) {
@@ -722,6 +1035,43 @@ static bool tcp_write_with_timestamps(grpc_tcp* tcp, struct msghdr* msg,
722
1035
  return true;
723
1036
  }
724
1037
 
1038
+ static void UnrefMaybePutZerocopySendRecord(grpc_tcp* tcp,
1039
+ TcpZerocopySendRecord* record,
1040
+ uint32_t seq, const char* tag);
1041
+ // Reads \a cmsg to process zerocopy control messages.
1042
+ static void process_zerocopy(grpc_tcp* tcp, struct cmsghdr* cmsg) {
1043
+ GPR_DEBUG_ASSERT(cmsg);
1044
+ auto serr = reinterpret_cast<struct sock_extended_err*>(CMSG_DATA(cmsg));
1045
+ GPR_DEBUG_ASSERT(serr->ee_errno == 0);
1046
+ GPR_DEBUG_ASSERT(serr->ee_origin == SO_EE_ORIGIN_ZEROCOPY);
1047
+ const uint32_t lo = serr->ee_info;
1048
+ const uint32_t hi = serr->ee_data;
1049
+ for (uint32_t seq = lo; seq <= hi; ++seq) {
1050
+ // TODO(arjunroy): It's likely that lo and hi refer to zerocopy sequence
1051
+ // numbers that are generated by a single call to grpc_endpoint_write; ie.
1052
+ // we can batch the unref operation. So, check if record is the same for
1053
+ // both; if so, batch the unref/put.
1054
+ TcpZerocopySendRecord* record =
1055
+ tcp->tcp_zerocopy_send_ctx.ReleaseSendRecord(seq);
1056
+ GPR_DEBUG_ASSERT(record);
1057
+ UnrefMaybePutZerocopySendRecord(tcp, record, seq, "CALLBACK RCVD");
1058
+ }
1059
+ }
1060
+
1061
// Whether the cmsg received from error queue is of the IPv4 or IPv6 levels.
static bool CmsgIsIpLevel(const cmsghdr& cmsg) {
  const bool is_v6_err =
      cmsg.cmsg_level == SOL_IPV6 && cmsg.cmsg_type == IPV6_RECVERR;
  const bool is_v4_err =
      cmsg.cmsg_level == SOL_IP && cmsg.cmsg_type == IP_RECVERR;
  return is_v6_err || is_v4_err;
}
1066
+
1067
+ static bool CmsgIsZeroCopy(const cmsghdr& cmsg) {
1068
+ if (!CmsgIsIpLevel(cmsg)) {
1069
+ return false;
1070
+ }
1071
+ auto serr = reinterpret_cast<const sock_extended_err*> CMSG_DATA(&cmsg);
1072
+ return serr->ee_errno == 0 && serr->ee_origin == SO_EE_ORIGIN_ZEROCOPY;
1073
+ }
1074
+
725
1075
  /** Reads \a cmsg to derive timestamps from the control messages. If a valid
726
1076
  * timestamp is found, the traced buffer list is updated with this timestamp.
727
1077
  * The caller of this function should be looping on the control messages found
@@ -783,73 +1133,76 @@ struct cmsghdr* process_timestamp(grpc_tcp* tcp, msghdr* msg,
783
1133
  /** For linux platforms, reads the socket's error queue and processes error
784
1134
  * messages from the queue.
785
1135
  */
786
- static void process_errors(grpc_tcp* tcp) {
1136
+ static bool process_errors(grpc_tcp* tcp) {
1137
+ bool processed_err = false;
1138
+ struct iovec iov;
1139
+ iov.iov_base = nullptr;
1140
+ iov.iov_len = 0;
1141
+ struct msghdr msg;
1142
+ msg.msg_name = nullptr;
1143
+ msg.msg_namelen = 0;
1144
+ msg.msg_iov = &iov;
1145
+ msg.msg_iovlen = 0;
1146
+ msg.msg_flags = 0;
1147
+ /* Allocate enough space so we don't need to keep increasing this as size
1148
+ * of OPT_STATS increase */
1149
+ constexpr size_t cmsg_alloc_space =
1150
+ CMSG_SPACE(sizeof(grpc_core::scm_timestamping)) +
1151
+ CMSG_SPACE(sizeof(sock_extended_err) + sizeof(sockaddr_in)) +
1152
+ CMSG_SPACE(32 * NLA_ALIGN(NLA_HDRLEN + sizeof(uint64_t)));
1153
+ /* Allocate aligned space for cmsgs received along with timestamps */
1154
+ union {
1155
+ char rbuf[cmsg_alloc_space];
1156
+ struct cmsghdr align;
1157
+ } aligned_buf;
1158
+ msg.msg_control = aligned_buf.rbuf;
1159
+ msg.msg_controllen = sizeof(aligned_buf.rbuf);
1160
+ int r, saved_errno;
787
1161
  while (true) {
788
- struct iovec iov;
789
- iov.iov_base = nullptr;
790
- iov.iov_len = 0;
791
- struct msghdr msg;
792
- msg.msg_name = nullptr;
793
- msg.msg_namelen = 0;
794
- msg.msg_iov = &iov;
795
- msg.msg_iovlen = 0;
796
- msg.msg_flags = 0;
797
-
798
- /* Allocate enough space so we don't need to keep increasing this as size
799
- * of OPT_STATS increase */
800
- constexpr size_t cmsg_alloc_space =
801
- CMSG_SPACE(sizeof(grpc_core::scm_timestamping)) +
802
- CMSG_SPACE(sizeof(sock_extended_err) + sizeof(sockaddr_in)) +
803
- CMSG_SPACE(32 * NLA_ALIGN(NLA_HDRLEN + sizeof(uint64_t)));
804
- /* Allocate aligned space for cmsgs received along with timestamps */
805
- union {
806
- char rbuf[cmsg_alloc_space];
807
- struct cmsghdr align;
808
- } aligned_buf;
809
- memset(&aligned_buf, 0, sizeof(aligned_buf));
810
-
811
- msg.msg_control = aligned_buf.rbuf;
812
- msg.msg_controllen = sizeof(aligned_buf.rbuf);
813
-
814
- int r, saved_errno;
815
1162
  do {
816
1163
  r = recvmsg(tcp->fd, &msg, MSG_ERRQUEUE);
817
1164
  saved_errno = errno;
818
1165
  } while (r < 0 && saved_errno == EINTR);
819
1166
 
820
1167
  if (r == -1 && saved_errno == EAGAIN) {
821
- return; /* No more errors to process */
1168
+ return processed_err; /* No more errors to process */
822
1169
  }
823
1170
  if (r == -1) {
824
- return;
1171
+ return processed_err;
825
1172
  }
826
- if ((msg.msg_flags & MSG_CTRUNC) != 0) {
1173
+ if (GPR_UNLIKELY((msg.msg_flags & MSG_CTRUNC) != 0)) {
827
1174
  gpr_log(GPR_ERROR, "Error message was truncated.");
828
1175
  }
829
1176
 
830
1177
  if (msg.msg_controllen == 0) {
831
1178
  /* There was no control message found. It was probably spurious. */
832
- return;
1179
+ return processed_err;
833
1180
  }
834
1181
  bool seen = false;
835
1182
  for (auto cmsg = CMSG_FIRSTHDR(&msg); cmsg && cmsg->cmsg_len;
836
1183
  cmsg = CMSG_NXTHDR(&msg, cmsg)) {
837
- if (cmsg->cmsg_level != SOL_SOCKET ||
838
- cmsg->cmsg_type != SCM_TIMESTAMPING) {
839
- /* Got a control message that is not a timestamp. Don't know how to
840
- * handle this. */
1184
+ if (CmsgIsZeroCopy(*cmsg)) {
1185
+ process_zerocopy(tcp, cmsg);
1186
+ seen = true;
1187
+ processed_err = true;
1188
+ } else if (cmsg->cmsg_level == SOL_SOCKET &&
1189
+ cmsg->cmsg_type == SCM_TIMESTAMPING) {
1190
+ cmsg = process_timestamp(tcp, &msg, cmsg);
1191
+ seen = true;
1192
+ processed_err = true;
1193
+ } else {
1194
+ /* Got a control message that is not a timestamp or zerocopy. Don't know
1195
+ * how to handle this. */
841
1196
  if (GRPC_TRACE_FLAG_ENABLED(grpc_tcp_trace)) {
842
1197
  gpr_log(GPR_INFO,
843
1198
  "unknown control message cmsg_level:%d cmsg_type:%d",
844
1199
  cmsg->cmsg_level, cmsg->cmsg_type);
845
1200
  }
846
- return;
1201
+ return processed_err;
847
1202
  }
848
- cmsg = process_timestamp(tcp, &msg, cmsg);
849
- seen = true;
850
1203
  }
851
1204
  if (!seen) {
852
- return;
1205
+ return processed_err;
853
1206
  }
854
1207
  }
855
1208
  }
@@ -870,18 +1223,28 @@ static void tcp_handle_error(void* arg /* grpc_tcp */, grpc_error* error) {
870
1223
 
871
1224
  /* We are still interested in collecting timestamps, so let's try reading
872
1225
  * them. */
873
- process_errors(tcp);
1226
+ bool processed = process_errors(tcp);
874
1227
  /* This might not a timestamps error. Set the read and write closures to be
875
1228
  * ready. */
876
- grpc_fd_set_readable(tcp->em_fd);
877
- grpc_fd_set_writable(tcp->em_fd);
1229
+ if (!processed) {
1230
+ grpc_fd_set_readable(tcp->em_fd);
1231
+ grpc_fd_set_writable(tcp->em_fd);
1232
+ }
878
1233
  grpc_fd_notify_on_error(tcp->em_fd, &tcp->error_closure);
879
1234
  }
880
1235
 
881
1236
  #else /* GRPC_LINUX_ERRQUEUE */
1237
+ static TcpZerocopySendRecord* tcp_get_send_zerocopy_record(
1238
+ grpc_tcp* tcp, grpc_slice_buffer* buf) {
1239
+ return nullptr;
1240
+ }
1241
+
1242
+ static void ZerocopyDisableAndWaitForRemaining(grpc_tcp* tcp) {}
1243
+
882
1244
  static bool tcp_write_with_timestamps(grpc_tcp* /*tcp*/, struct msghdr* /*msg*/,
883
1245
  size_t /*sending_length*/,
884
- ssize_t* /*sent_length*/) {
1246
+ ssize_t* /*sent_length*/,
1247
+ int /*additional_flags*/) {
885
1248
  gpr_log(GPR_ERROR, "Write with timestamps not supported for this platform");
886
1249
  GPR_ASSERT(0);
887
1250
  return false;
@@ -907,12 +1270,138 @@ void tcp_shutdown_buffer_list(grpc_tcp* tcp) {
907
1270
  }
908
1271
  }
909
1272
 
910
- /* returns true if done, false if pending; if returning true, *error is set */
911
1273
  #if defined(IOV_MAX) && IOV_MAX < 1000
912
1274
  #define MAX_WRITE_IOVEC IOV_MAX
913
1275
  #else
914
1276
  #define MAX_WRITE_IOVEC 1000
915
1277
  #endif
1278
+ msg_iovlen_type TcpZerocopySendRecord::PopulateIovs(size_t* unwind_slice_idx,
1279
+ size_t* unwind_byte_idx,
1280
+ size_t* sending_length,
1281
+ iovec* iov) {
1282
+ msg_iovlen_type iov_size;
1283
+ *unwind_slice_idx = out_offset_.slice_idx;
1284
+ *unwind_byte_idx = out_offset_.byte_idx;
1285
+ for (iov_size = 0;
1286
+ out_offset_.slice_idx != buf_.count && iov_size != MAX_WRITE_IOVEC;
1287
+ iov_size++) {
1288
+ iov[iov_size].iov_base =
1289
+ GRPC_SLICE_START_PTR(buf_.slices[out_offset_.slice_idx]) +
1290
+ out_offset_.byte_idx;
1291
+ iov[iov_size].iov_len =
1292
+ GRPC_SLICE_LENGTH(buf_.slices[out_offset_.slice_idx]) -
1293
+ out_offset_.byte_idx;
1294
+ *sending_length += iov[iov_size].iov_len;
1295
+ ++(out_offset_.slice_idx);
1296
+ out_offset_.byte_idx = 0;
1297
+ }
1298
+ GPR_DEBUG_ASSERT(iov_size > 0);
1299
+ return iov_size;
1300
+ }
1301
+
1302
+ void TcpZerocopySendRecord::UpdateOffsetForBytesSent(size_t sending_length,
1303
+ size_t actually_sent) {
1304
+ size_t trailing = sending_length - actually_sent;
1305
+ while (trailing > 0) {
1306
+ size_t slice_length;
1307
+ out_offset_.slice_idx--;
1308
+ slice_length = GRPC_SLICE_LENGTH(buf_.slices[out_offset_.slice_idx]);
1309
+ if (slice_length > trailing) {
1310
+ out_offset_.byte_idx = slice_length - trailing;
1311
+ break;
1312
+ } else {
1313
+ trailing -= slice_length;
1314
+ }
1315
+ }
1316
+ }
1317
+
1318
+ // returns true if done, false if pending; if returning true, *error is set
1319
+ static bool do_tcp_flush_zerocopy(grpc_tcp* tcp, TcpZerocopySendRecord* record,
1320
+ grpc_error** error) {
1321
+ struct msghdr msg;
1322
+ struct iovec iov[MAX_WRITE_IOVEC];
1323
+ msg_iovlen_type iov_size;
1324
+ ssize_t sent_length = 0;
1325
+ size_t sending_length;
1326
+ size_t unwind_slice_idx;
1327
+ size_t unwind_byte_idx;
1328
+ while (true) {
1329
+ sending_length = 0;
1330
+ iov_size = record->PopulateIovs(&unwind_slice_idx, &unwind_byte_idx,
1331
+ &sending_length, iov);
1332
+ msg.msg_name = nullptr;
1333
+ msg.msg_namelen = 0;
1334
+ msg.msg_iov = iov;
1335
+ msg.msg_iovlen = iov_size;
1336
+ msg.msg_flags = 0;
1337
+ bool tried_sending_message = false;
1338
+ // Before calling sendmsg (with or without timestamps): we
1339
+ // take a single ref on the zerocopy send record.
1340
+ tcp->tcp_zerocopy_send_ctx.NoteSend(record);
1341
+ if (tcp->outgoing_buffer_arg != nullptr) {
1342
+ if (!tcp->ts_capable ||
1343
+ !tcp_write_with_timestamps(tcp, &msg, sending_length, &sent_length,
1344
+ MSG_ZEROCOPY)) {
1345
+ /* We could not set socket options to collect Fathom timestamps.
1346
+ * Fallback on writing without timestamps. */
1347
+ tcp->ts_capable = false;
1348
+ tcp_shutdown_buffer_list(tcp);
1349
+ } else {
1350
+ tried_sending_message = true;
1351
+ }
1352
+ }
1353
+ if (!tried_sending_message) {
1354
+ msg.msg_control = nullptr;
1355
+ msg.msg_controllen = 0;
1356
+ GRPC_STATS_INC_TCP_WRITE_SIZE(sending_length);
1357
+ GRPC_STATS_INC_TCP_WRITE_IOV_SIZE(iov_size);
1358
+ sent_length = tcp_send(tcp->fd, &msg, MSG_ZEROCOPY);
1359
+ }
1360
+ if (sent_length < 0) {
1361
+ // If this particular send failed, drop ref taken earlier in this method.
1362
+ tcp->tcp_zerocopy_send_ctx.UndoSend();
1363
+ if (errno == EAGAIN) {
1364
+ record->UnwindIfThrottled(unwind_slice_idx, unwind_byte_idx);
1365
+ return false;
1366
+ } else if (errno == EPIPE) {
1367
+ *error = tcp_annotate_error(GRPC_OS_ERROR(errno, "sendmsg"), tcp);
1368
+ tcp_shutdown_buffer_list(tcp);
1369
+ return true;
1370
+ } else {
1371
+ *error = tcp_annotate_error(GRPC_OS_ERROR(errno, "sendmsg"), tcp);
1372
+ tcp_shutdown_buffer_list(tcp);
1373
+ return true;
1374
+ }
1375
+ }
1376
+ tcp->bytes_counter += sent_length;
1377
+ record->UpdateOffsetForBytesSent(sending_length,
1378
+ static_cast<size_t>(sent_length));
1379
+ if (record->AllSlicesSent()) {
1380
+ *error = GRPC_ERROR_NONE;
1381
+ return true;
1382
+ }
1383
+ }
1384
+ }
1385
+
1386
+ static void UnrefMaybePutZerocopySendRecord(grpc_tcp* tcp,
1387
+ TcpZerocopySendRecord* record,
1388
+ uint32_t seq, const char* tag) {
1389
+ if (record->Unref()) {
1390
+ tcp->tcp_zerocopy_send_ctx.PutSendRecord(record);
1391
+ }
1392
+ }
1393
+
1394
+ static bool tcp_flush_zerocopy(grpc_tcp* tcp, TcpZerocopySendRecord* record,
1395
+ grpc_error** error) {
1396
+ bool done = do_tcp_flush_zerocopy(tcp, record, error);
1397
+ if (done) {
1398
+ // Either we encountered an error, or we successfully sent all the bytes.
1399
+ // In either case, we're done with this record.
1400
+ UnrefMaybePutZerocopySendRecord(tcp, record, 0, "flush_done");
1401
+ }
1402
+ return done;
1403
+ }
1404
+
916
1405
  static bool tcp_flush(grpc_tcp* tcp, grpc_error** error) {
917
1406
  struct msghdr msg;
918
1407
  struct iovec iov[MAX_WRITE_IOVEC];
@@ -927,7 +1416,7 @@ static bool tcp_flush(grpc_tcp* tcp, grpc_error** error) {
927
1416
  // buffer as we write
928
1417
  size_t outgoing_slice_idx = 0;
929
1418
 
930
- for (;;) {
1419
+ while (true) {
931
1420
  sending_length = 0;
932
1421
  unwind_slice_idx = outgoing_slice_idx;
933
1422
  unwind_byte_idx = tcp->outgoing_byte_idx;
@@ -1027,12 +1516,21 @@ static void tcp_handle_write(void* arg /* grpc_tcp */, grpc_error* error) {
1027
1516
  if (error != GRPC_ERROR_NONE) {
1028
1517
  cb = tcp->write_cb;
1029
1518
  tcp->write_cb = nullptr;
1519
+ if (tcp->current_zerocopy_send != nullptr) {
1520
+ UnrefMaybePutZerocopySendRecord(tcp, tcp->current_zerocopy_send, 0,
1521
+ "handle_write_err");
1522
+ tcp->current_zerocopy_send = nullptr;
1523
+ }
1030
1524
  grpc_core::Closure::Run(DEBUG_LOCATION, cb, GRPC_ERROR_REF(error));
1031
1525
  TCP_UNREF(tcp, "write");
1032
1526
  return;
1033
1527
  }
1034
1528
 
1035
- if (!tcp_flush(tcp, &error)) {
1529
+ bool flush_result =
1530
+ tcp->current_zerocopy_send != nullptr
1531
+ ? tcp_flush_zerocopy(tcp, tcp->current_zerocopy_send, &error)
1532
+ : tcp_flush(tcp, &error);
1533
+ if (!flush_result) {
1036
1534
  if (GRPC_TRACE_FLAG_ENABLED(grpc_tcp_trace)) {
1037
1535
  gpr_log(GPR_INFO, "write: delayed");
1038
1536
  }
@@ -1042,6 +1540,7 @@ static void tcp_handle_write(void* arg /* grpc_tcp */, grpc_error* error) {
1042
1540
  } else {
1043
1541
  cb = tcp->write_cb;
1044
1542
  tcp->write_cb = nullptr;
1543
+ tcp->current_zerocopy_send = nullptr;
1045
1544
  if (GRPC_TRACE_FLAG_ENABLED(grpc_tcp_trace)) {
1046
1545
  const char* str = grpc_error_string(error);
1047
1546
  gpr_log(GPR_INFO, "write: %s", str);
@@ -1057,6 +1556,7 @@ static void tcp_write(grpc_endpoint* ep, grpc_slice_buffer* buf,
1057
1556
  GPR_TIMER_SCOPE("tcp_write", 0);
1058
1557
  grpc_tcp* tcp = reinterpret_cast<grpc_tcp*>(ep);
1059
1558
  grpc_error* error = GRPC_ERROR_NONE;
1559
+ TcpZerocopySendRecord* zerocopy_send_record = nullptr;
1060
1560
 
1061
1561
  if (GRPC_TRACE_FLAG_ENABLED(grpc_tcp_trace)) {
1062
1562
  size_t i;
@@ -1073,8 +1573,8 @@ static void tcp_write(grpc_endpoint* ep, grpc_slice_buffer* buf,
1073
1573
  }
1074
1574
 
1075
1575
  GPR_ASSERT(tcp->write_cb == nullptr);
1576
+ GPR_DEBUG_ASSERT(tcp->current_zerocopy_send == nullptr);
1076
1577
 
1077
- tcp->outgoing_buffer_arg = arg;
1078
1578
  if (buf->length == 0) {
1079
1579
  grpc_core::Closure::Run(
1080
1580
  DEBUG_LOCATION, cb,
@@ -1085,15 +1585,26 @@ static void tcp_write(grpc_endpoint* ep, grpc_slice_buffer* buf,
1085
1585
  tcp_shutdown_buffer_list(tcp);
1086
1586
  return;
1087
1587
  }
1088
- tcp->outgoing_buffer = buf;
1089
- tcp->outgoing_byte_idx = 0;
1588
+
1589
+ zerocopy_send_record = tcp_get_send_zerocopy_record(tcp, buf);
1590
+ if (zerocopy_send_record == nullptr) {
1591
+ // Either not enough bytes, or couldn't allocate a zerocopy context.
1592
+ tcp->outgoing_buffer = buf;
1593
+ tcp->outgoing_byte_idx = 0;
1594
+ }
1595
+ tcp->outgoing_buffer_arg = arg;
1090
1596
  if (arg) {
1091
1597
  GPR_ASSERT(grpc_event_engine_can_track_errors());
1092
1598
  }
1093
1599
 
1094
- if (!tcp_flush(tcp, &error)) {
1600
+ bool flush_result =
1601
+ zerocopy_send_record != nullptr
1602
+ ? tcp_flush_zerocopy(tcp, zerocopy_send_record, &error)
1603
+ : tcp_flush(tcp, &error);
1604
+ if (!flush_result) {
1095
1605
  TCP_REF(tcp, "write");
1096
1606
  tcp->write_cb = cb;
1607
+ tcp->current_zerocopy_send = zerocopy_send_record;
1097
1608
  if (GRPC_TRACE_FLAG_ENABLED(grpc_tcp_trace)) {
1098
1609
  gpr_log(GPR_INFO, "write: delayed");
1099
1610
  }
@@ -1121,6 +1632,7 @@ static void tcp_add_to_pollset_set(grpc_endpoint* ep,
1121
1632
  static void tcp_delete_from_pollset_set(grpc_endpoint* ep,
1122
1633
  grpc_pollset_set* pollset_set) {
1123
1634
  grpc_tcp* tcp = reinterpret_cast<grpc_tcp*>(ep);
1635
+ ZerocopyDisableAndWaitForRemaining(tcp);
1124
1636
  grpc_pollset_set_del_fd(pollset_set, tcp->em_fd);
1125
1637
  }
1126
1638
 
@@ -1172,9 +1684,15 @@ static const grpc_endpoint_vtable vtable = {tcp_read,
1172
1684
  grpc_endpoint* grpc_tcp_create(grpc_fd* em_fd,
1173
1685
  const grpc_channel_args* channel_args,
1174
1686
  const char* peer_string) {
1687
+ static constexpr bool kZerocpTxEnabledDefault = false;
1175
1688
  int tcp_read_chunk_size = GRPC_TCP_DEFAULT_READ_SLICE_SIZE;
1176
1689
  int tcp_max_read_chunk_size = 4 * 1024 * 1024;
1177
1690
  int tcp_min_read_chunk_size = 256;
1691
+ bool tcp_tx_zerocopy_enabled = kZerocpTxEnabledDefault;
1692
+ int tcp_tx_zerocopy_send_bytes_thresh =
1693
+ grpc_core::TcpZerocopySendCtx::kDefaultSendBytesThreshold;
1694
+ int tcp_tx_zerocopy_max_simult_sends =
1695
+ grpc_core::TcpZerocopySendCtx::kDefaultMaxSends;
1178
1696
  grpc_resource_quota* resource_quota = grpc_resource_quota_create(nullptr);
1179
1697
  if (channel_args != nullptr) {
1180
1698
  for (size_t i = 0; i < channel_args->num_args; i++) {
@@ -1199,6 +1717,23 @@ grpc_endpoint* grpc_tcp_create(grpc_fd* em_fd,
1199
1717
  resource_quota =
1200
1718
  grpc_resource_quota_ref_internal(static_cast<grpc_resource_quota*>(
1201
1719
  channel_args->args[i].value.pointer.p));
1720
+ } else if (0 == strcmp(channel_args->args[i].key,
1721
+ GRPC_ARG_TCP_TX_ZEROCOPY_ENABLED)) {
1722
+ tcp_tx_zerocopy_enabled = grpc_channel_arg_get_bool(
1723
+ &channel_args->args[i], kZerocpTxEnabledDefault);
1724
+ } else if (0 == strcmp(channel_args->args[i].key,
1725
+ GRPC_ARG_TCP_TX_ZEROCOPY_SEND_BYTES_THRESHOLD)) {
1726
+ grpc_integer_options options = {
1727
+ grpc_core::TcpZerocopySendCtx::kDefaultSendBytesThreshold, 0,
1728
+ INT_MAX};
1729
+ tcp_tx_zerocopy_send_bytes_thresh =
1730
+ grpc_channel_arg_get_integer(&channel_args->args[i], options);
1731
+ } else if (0 == strcmp(channel_args->args[i].key,
1732
+ GRPC_ARG_TCP_TX_ZEROCOPY_MAX_SIMULT_SENDS)) {
1733
+ grpc_integer_options options = {
1734
+ grpc_core::TcpZerocopySendCtx::kDefaultMaxSends, 0, INT_MAX};
1735
+ tcp_tx_zerocopy_max_simult_sends =
1736
+ grpc_channel_arg_get_integer(&channel_args->args[i], options);
1202
1737
  }
1203
1738
  }
1204
1739
  }
@@ -1215,6 +1750,7 @@ grpc_endpoint* grpc_tcp_create(grpc_fd* em_fd,
1215
1750
  tcp->fd = grpc_fd_wrapped_fd(em_fd);
1216
1751
  tcp->read_cb = nullptr;
1217
1752
  tcp->write_cb = nullptr;
1753
+ tcp->current_zerocopy_send = nullptr;
1218
1754
  tcp->release_fd_cb = nullptr;
1219
1755
  tcp->release_fd = nullptr;
1220
1756
  tcp->incoming_buffer = nullptr;
@@ -1228,6 +1764,20 @@ grpc_endpoint* grpc_tcp_create(grpc_fd* em_fd,
1228
1764
  tcp->socket_ts_enabled = false;
1229
1765
  tcp->ts_capable = true;
1230
1766
  tcp->outgoing_buffer_arg = nullptr;
1767
+ new (&tcp->tcp_zerocopy_send_ctx) TcpZerocopySendCtx(
1768
+ tcp_tx_zerocopy_max_simult_sends, tcp_tx_zerocopy_send_bytes_thresh);
1769
+ if (tcp_tx_zerocopy_enabled && !tcp->tcp_zerocopy_send_ctx.memory_limited()) {
1770
+ #ifdef GRPC_LINUX_ERRQUEUE
1771
+ const int enable = 1;
1772
+ auto err =
1773
+ setsockopt(tcp->fd, SOL_SOCKET, SO_ZEROCOPY, &enable, sizeof(enable));
1774
+ if (err == 0) {
1775
+ tcp->tcp_zerocopy_send_ctx.set_enabled(true);
1776
+ } else {
1777
+ gpr_log(GPR_ERROR, "Failed to set zerocopy options on the socket.");
1778
+ }
1779
+ #endif
1780
+ }
1231
1781
  /* paired with unref in grpc_tcp_destroy */
1232
1782
  new (&tcp->refcount) grpc_core::RefCount(1, &grpc_tcp_trace);
1233
1783
  gpr_atm_no_barrier_store(&tcp->shutdown_count, 0);
@@ -1294,6 +1844,7 @@ void grpc_tcp_destroy_and_release_fd(grpc_endpoint* ep, int* fd,
1294
1844
  grpc_slice_buffer_reset_and_unref_internal(&tcp->last_read_buffer);
1295
1845
  if (grpc_event_engine_can_track_errors()) {
1296
1846
  /* Stop errors notification. */
1847
+ ZerocopyDisableAndWaitForRemaining(tcp);
1297
1848
  gpr_atm_no_barrier_store(&tcp->stop_error_notification, true);
1298
1849
  grpc_fd_set_error(tcp->em_fd);
1299
1850
  }