@temporalio/core-bridge 1.5.2 → 1.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (194) hide show
  1. package/Cargo.lock +304 -112
  2. package/lib/index.d.ts +8 -6
  3. package/lib/index.js.map +1 -1
  4. package/package.json +9 -4
  5. package/releases/aarch64-apple-darwin/index.node +0 -0
  6. package/releases/aarch64-unknown-linux-gnu/index.node +0 -0
  7. package/releases/x86_64-apple-darwin/index.node +0 -0
  8. package/releases/x86_64-pc-windows-msvc/index.node +0 -0
  9. package/releases/x86_64-unknown-linux-gnu/index.node +0 -0
  10. package/sdk-core/.buildkite/docker/Dockerfile +2 -2
  11. package/sdk-core/.buildkite/docker/docker-compose.yaml +1 -1
  12. package/sdk-core/.buildkite/pipeline.yml +2 -4
  13. package/sdk-core/.cargo/config.toml +5 -2
  14. package/sdk-core/.github/workflows/heavy.yml +29 -0
  15. package/sdk-core/Cargo.toml +1 -1
  16. package/sdk-core/README.md +20 -10
  17. package/sdk-core/client/src/lib.rs +215 -39
  18. package/sdk-core/client/src/metrics.rs +17 -8
  19. package/sdk-core/client/src/raw.rs +4 -4
  20. package/sdk-core/client/src/retry.rs +32 -20
  21. package/sdk-core/core/Cargo.toml +25 -12
  22. package/sdk-core/core/src/abstractions/take_cell.rs +28 -0
  23. package/sdk-core/core/src/abstractions.rs +204 -14
  24. package/sdk-core/core/src/core_tests/activity_tasks.rs +143 -50
  25. package/sdk-core/core/src/core_tests/child_workflows.rs +6 -5
  26. package/sdk-core/core/src/core_tests/determinism.rs +165 -2
  27. package/sdk-core/core/src/core_tests/local_activities.rs +431 -43
  28. package/sdk-core/core/src/core_tests/queries.rs +34 -16
  29. package/sdk-core/core/src/core_tests/workers.rs +8 -5
  30. package/sdk-core/core/src/core_tests/workflow_tasks.rs +588 -55
  31. package/sdk-core/core/src/ephemeral_server/mod.rs +113 -12
  32. package/sdk-core/core/src/internal_flags.rs +155 -0
  33. package/sdk-core/core/src/lib.rs +16 -9
  34. package/sdk-core/core/src/protosext/mod.rs +1 -1
  35. package/sdk-core/core/src/replay/mod.rs +16 -27
  36. package/sdk-core/core/src/telemetry/log_export.rs +1 -1
  37. package/sdk-core/core/src/telemetry/metrics.rs +69 -35
  38. package/sdk-core/core/src/telemetry/mod.rs +60 -21
  39. package/sdk-core/core/src/telemetry/prometheus_server.rs +19 -13
  40. package/sdk-core/core/src/test_help/mod.rs +73 -14
  41. package/sdk-core/core/src/worker/activities/activity_heartbeat_manager.rs +119 -160
  42. package/sdk-core/core/src/worker/activities/activity_task_poller_stream.rs +89 -0
  43. package/sdk-core/core/src/worker/activities/local_activities.rs +379 -129
  44. package/sdk-core/core/src/worker/activities.rs +350 -175
  45. package/sdk-core/core/src/worker/client/mocks.rs +22 -2
  46. package/sdk-core/core/src/worker/client.rs +18 -2
  47. package/sdk-core/core/src/worker/mod.rs +183 -64
  48. package/sdk-core/core/src/worker/workflow/bridge.rs +1 -3
  49. package/sdk-core/core/src/worker/workflow/driven_workflow.rs +3 -5
  50. package/sdk-core/core/src/worker/workflow/history_update.rs +916 -277
  51. package/sdk-core/core/src/worker/workflow/machines/activity_state_machine.rs +216 -183
  52. package/sdk-core/core/src/worker/workflow/machines/cancel_external_state_machine.rs +9 -12
  53. package/sdk-core/core/src/worker/workflow/machines/cancel_workflow_state_machine.rs +7 -9
  54. package/sdk-core/core/src/worker/workflow/machines/child_workflow_state_machine.rs +160 -87
  55. package/sdk-core/core/src/worker/workflow/machines/complete_workflow_state_machine.rs +13 -14
  56. package/sdk-core/core/src/worker/workflow/machines/continue_as_new_workflow_state_machine.rs +7 -9
  57. package/sdk-core/core/src/worker/workflow/machines/fail_workflow_state_machine.rs +14 -17
  58. package/sdk-core/core/src/worker/workflow/machines/local_activity_state_machine.rs +242 -110
  59. package/sdk-core/core/src/worker/workflow/machines/mod.rs +27 -19
  60. package/sdk-core/core/src/worker/workflow/machines/modify_workflow_properties_state_machine.rs +9 -11
  61. package/sdk-core/core/src/worker/workflow/machines/patch_state_machine.rs +321 -206
  62. package/sdk-core/core/src/worker/workflow/machines/signal_external_state_machine.rs +13 -18
  63. package/sdk-core/core/src/worker/workflow/machines/timer_state_machine.rs +20 -29
  64. package/sdk-core/core/src/worker/workflow/machines/transition_coverage.rs +2 -2
  65. package/sdk-core/core/src/worker/workflow/machines/upsert_search_attributes_state_machine.rs +257 -51
  66. package/sdk-core/core/src/worker/workflow/machines/workflow_machines/local_acts.rs +6 -17
  67. package/sdk-core/core/src/worker/workflow/machines/workflow_machines.rs +310 -150
  68. package/sdk-core/core/src/worker/workflow/machines/workflow_task_state_machine.rs +17 -20
  69. package/sdk-core/core/src/worker/workflow/managed_run/managed_wf_test.rs +31 -15
  70. package/sdk-core/core/src/worker/workflow/managed_run.rs +1052 -380
  71. package/sdk-core/core/src/worker/workflow/mod.rs +598 -390
  72. package/sdk-core/core/src/worker/workflow/run_cache.rs +40 -57
  73. package/sdk-core/core/src/worker/workflow/wft_extraction.rs +137 -0
  74. package/sdk-core/core/src/worker/workflow/wft_poller.rs +1 -4
  75. package/sdk-core/core/src/worker/workflow/workflow_stream/saved_wf_inputs.rs +117 -0
  76. package/sdk-core/core/src/worker/workflow/workflow_stream/tonic_status_serde.rs +24 -0
  77. package/sdk-core/core/src/worker/workflow/workflow_stream.rs +469 -718
  78. package/sdk-core/core-api/Cargo.toml +2 -1
  79. package/sdk-core/core-api/src/errors.rs +1 -34
  80. package/sdk-core/core-api/src/lib.rs +19 -9
  81. package/sdk-core/core-api/src/telemetry.rs +4 -6
  82. package/sdk-core/core-api/src/worker.rs +19 -1
  83. package/sdk-core/etc/deps.svg +115 -140
  84. package/sdk-core/etc/regen-depgraph.sh +5 -0
  85. package/sdk-core/fsm/rustfsm_procmacro/src/lib.rs +86 -61
  86. package/sdk-core/fsm/rustfsm_trait/src/lib.rs +29 -71
  87. package/sdk-core/histories/ends_empty_wft_complete.bin +0 -0
  88. package/sdk-core/histories/evict_while_la_running_no_interference-16_history.bin +0 -0
  89. package/sdk-core/histories/old_change_marker_format.bin +0 -0
  90. package/sdk-core/protos/api_upstream/.github/CODEOWNERS +2 -1
  91. package/sdk-core/protos/api_upstream/Makefile +6 -6
  92. package/sdk-core/protos/api_upstream/build/go.mod +7 -0
  93. package/sdk-core/protos/api_upstream/build/go.sum +5 -0
  94. package/sdk-core/protos/api_upstream/build/tools.go +29 -0
  95. package/sdk-core/protos/api_upstream/go.mod +6 -0
  96. package/sdk-core/protos/api_upstream/temporal/api/batch/v1/message.proto +9 -2
  97. package/sdk-core/protos/api_upstream/temporal/api/command/v1/message.proto +7 -26
  98. package/sdk-core/protos/api_upstream/temporal/api/common/v1/message.proto +13 -2
  99. package/sdk-core/protos/api_upstream/temporal/api/enums/v1/batch_operation.proto +3 -2
  100. package/sdk-core/protos/api_upstream/temporal/api/enums/v1/command_type.proto +3 -7
  101. package/sdk-core/protos/api_upstream/temporal/api/enums/v1/common.proto +3 -2
  102. package/sdk-core/protos/api_upstream/temporal/api/enums/v1/event_type.proto +8 -8
  103. package/sdk-core/protos/api_upstream/temporal/api/enums/v1/failed_cause.proto +25 -2
  104. package/sdk-core/protos/api_upstream/temporal/api/enums/v1/namespace.proto +2 -2
  105. package/sdk-core/protos/api_upstream/temporal/api/enums/v1/query.proto +2 -2
  106. package/sdk-core/protos/api_upstream/temporal/api/enums/v1/reset.proto +2 -2
  107. package/sdk-core/protos/api_upstream/temporal/api/enums/v1/schedule.proto +2 -2
  108. package/sdk-core/protos/api_upstream/temporal/api/enums/v1/task_queue.proto +2 -2
  109. package/sdk-core/protos/api_upstream/temporal/api/enums/v1/update.proto +24 -19
  110. package/sdk-core/protos/api_upstream/temporal/api/enums/v1/workflow.proto +2 -2
  111. package/sdk-core/protos/api_upstream/temporal/api/errordetails/v1/message.proto +2 -2
  112. package/sdk-core/protos/api_upstream/temporal/api/failure/v1/message.proto +2 -2
  113. package/sdk-core/protos/api_upstream/temporal/api/filter/v1/message.proto +2 -2
  114. package/sdk-core/protos/api_upstream/temporal/api/history/v1/message.proto +49 -26
  115. package/sdk-core/protos/api_upstream/temporal/api/namespace/v1/message.proto +4 -2
  116. package/sdk-core/protos/api_upstream/temporal/api/operatorservice/v1/request_response.proto +5 -2
  117. package/sdk-core/protos/api_upstream/temporal/api/operatorservice/v1/service.proto +2 -2
  118. package/sdk-core/protos/api_upstream/temporal/api/protocol/v1/message.proto +57 -0
  119. package/sdk-core/protos/api_upstream/temporal/api/query/v1/message.proto +2 -2
  120. package/sdk-core/protos/api_upstream/temporal/api/replication/v1/message.proto +2 -2
  121. package/sdk-core/protos/api_upstream/temporal/api/schedule/v1/message.proto +2 -2
  122. package/sdk-core/protos/api_upstream/temporal/api/sdk/v1/task_complete_metadata.proto +63 -0
  123. package/sdk-core/protos/api_upstream/temporal/api/taskqueue/v1/message.proto +2 -2
  124. package/sdk-core/protos/api_upstream/temporal/api/update/v1/message.proto +71 -6
  125. package/sdk-core/protos/api_upstream/temporal/api/version/v1/message.proto +2 -2
  126. package/sdk-core/protos/api_upstream/temporal/api/workflow/v1/message.proto +2 -2
  127. package/sdk-core/protos/api_upstream/temporal/api/workflowservice/v1/request_response.proto +64 -28
  128. package/sdk-core/protos/api_upstream/temporal/api/workflowservice/v1/service.proto +4 -4
  129. package/sdk-core/protos/local/temporal/sdk/core/activity_result/activity_result.proto +7 -8
  130. package/sdk-core/protos/local/temporal/sdk/core/activity_task/activity_task.proto +10 -7
  131. package/sdk-core/protos/local/temporal/sdk/core/child_workflow/child_workflow.proto +19 -30
  132. package/sdk-core/protos/local/temporal/sdk/core/common/common.proto +1 -0
  133. package/sdk-core/protos/local/temporal/sdk/core/core_interface.proto +1 -0
  134. package/sdk-core/protos/local/temporal/sdk/core/external_data/external_data.proto +8 -0
  135. package/sdk-core/protos/local/temporal/sdk/core/workflow_activation/workflow_activation.proto +67 -60
  136. package/sdk-core/protos/local/temporal/sdk/core/workflow_commands/workflow_commands.proto +85 -84
  137. package/sdk-core/protos/local/temporal/sdk/core/workflow_completion/workflow_completion.proto +9 -3
  138. package/sdk-core/protos/testsrv_upstream/temporal/api/testservice/v1/request_response.proto +2 -2
  139. package/sdk-core/protos/testsrv_upstream/temporal/api/testservice/v1/service.proto +2 -2
  140. package/sdk-core/sdk/Cargo.toml +5 -4
  141. package/sdk-core/sdk/src/lib.rs +108 -26
  142. package/sdk-core/sdk/src/workflow_context/options.rs +7 -1
  143. package/sdk-core/sdk/src/workflow_context.rs +24 -17
  144. package/sdk-core/sdk/src/workflow_future.rs +16 -15
  145. package/sdk-core/sdk-core-protos/Cargo.toml +5 -2
  146. package/sdk-core/sdk-core-protos/build.rs +36 -2
  147. package/sdk-core/sdk-core-protos/src/history_builder.rs +138 -106
  148. package/sdk-core/sdk-core-protos/src/history_info.rs +10 -1
  149. package/sdk-core/sdk-core-protos/src/lib.rs +272 -87
  150. package/sdk-core/sdk-core-protos/src/task_token.rs +12 -2
  151. package/sdk-core/test-utils/Cargo.toml +3 -1
  152. package/sdk-core/test-utils/src/canned_histories.rs +106 -296
  153. package/sdk-core/test-utils/src/histfetch.rs +1 -1
  154. package/sdk-core/test-utils/src/lib.rs +82 -23
  155. package/sdk-core/test-utils/src/wf_input_saver.rs +50 -0
  156. package/sdk-core/test-utils/src/workflows.rs +29 -0
  157. package/sdk-core/tests/fuzzy_workflow.rs +130 -0
  158. package/sdk-core/tests/{load_tests.rs → heavy_tests.rs} +125 -51
  159. package/sdk-core/tests/integ_tests/ephemeral_server_tests.rs +25 -3
  160. package/sdk-core/tests/integ_tests/heartbeat_tests.rs +10 -5
  161. package/sdk-core/tests/integ_tests/metrics_tests.rs +218 -16
  162. package/sdk-core/tests/integ_tests/polling_tests.rs +4 -47
  163. package/sdk-core/tests/integ_tests/queries_tests.rs +5 -128
  164. package/sdk-core/tests/integ_tests/visibility_tests.rs +83 -25
  165. package/sdk-core/tests/integ_tests/workflow_tests/activities.rs +161 -72
  166. package/sdk-core/tests/integ_tests/workflow_tests/cancel_external.rs +1 -0
  167. package/sdk-core/tests/integ_tests/workflow_tests/cancel_wf.rs +6 -13
  168. package/sdk-core/tests/integ_tests/workflow_tests/child_workflows.rs +80 -3
  169. package/sdk-core/tests/integ_tests/workflow_tests/continue_as_new.rs +6 -2
  170. package/sdk-core/tests/integ_tests/workflow_tests/determinism.rs +3 -10
  171. package/sdk-core/tests/integ_tests/workflow_tests/local_activities.rs +94 -200
  172. package/sdk-core/tests/integ_tests/workflow_tests/modify_wf_properties.rs +2 -4
  173. package/sdk-core/tests/integ_tests/workflow_tests/patches.rs +34 -28
  174. package/sdk-core/tests/integ_tests/workflow_tests/replay.rs +76 -7
  175. package/sdk-core/tests/integ_tests/workflow_tests/resets.rs +1 -0
  176. package/sdk-core/tests/integ_tests/workflow_tests/signals.rs +18 -14
  177. package/sdk-core/tests/integ_tests/workflow_tests/stickyness.rs +6 -20
  178. package/sdk-core/tests/integ_tests/workflow_tests/timers.rs +10 -21
  179. package/sdk-core/tests/integ_tests/workflow_tests/upsert_search_attrs.rs +7 -8
  180. package/sdk-core/tests/integ_tests/workflow_tests.rs +13 -14
  181. package/sdk-core/tests/main.rs +3 -13
  182. package/sdk-core/tests/runner.rs +75 -36
  183. package/sdk-core/tests/wf_input_replay.rs +32 -0
  184. package/src/conversions.rs +14 -8
  185. package/src/runtime.rs +9 -8
  186. package/ts/index.ts +8 -6
  187. package/sdk-core/bridge-ffi/Cargo.toml +0 -24
  188. package/sdk-core/bridge-ffi/LICENSE.txt +0 -23
  189. package/sdk-core/bridge-ffi/build.rs +0 -25
  190. package/sdk-core/bridge-ffi/include/sdk-core-bridge.h +0 -224
  191. package/sdk-core/bridge-ffi/src/lib.rs +0 -746
  192. package/sdk-core/bridge-ffi/src/wrappers.rs +0 -221
  193. package/sdk-core/protos/local/temporal/sdk/core/bridge/bridge.proto +0 -210
  194. package/sdk-core/sdk/src/conversions.rs +0 -8
@@ -1,4 +1,5 @@
1
1
  mod activity_heartbeat_manager;
2
+ mod activity_task_poller_stream;
2
3
  mod local_activities;
3
4
 
4
5
  pub(crate) use local_activities::{
@@ -7,26 +8,35 @@ pub(crate) use local_activities::{
7
8
  LocalInFlightActInfo, NewLocalAct,
8
9
  };
9
10
 
10
- use crate::telemetry::metrics::eager;
11
11
  use crate::{
12
- abstractions::{MeteredSemaphore, OwnedMeteredSemPermit},
12
+ abstractions::{
13
+ ClosableMeteredSemaphore, MeteredSemaphore, OwnedMeteredSemPermit,
14
+ TrackedOwnedMeteredSemPermit, UsedMeteredSemPermit,
15
+ },
13
16
  pollers::BoxedActPoller,
14
- telemetry::metrics::{activity_type, activity_worker_type, workflow_type, MetricsContext},
17
+ telemetry::metrics::{
18
+ activity_type, activity_worker_type, eager, workflow_type, MetricsContext,
19
+ },
15
20
  worker::{
16
- activities::activity_heartbeat_manager::ActivityHeartbeatError, client::WorkerClient,
21
+ activities::{
22
+ activity_heartbeat_manager::ActivityHeartbeatError,
23
+ activity_task_poller_stream::new_activity_task_poller,
24
+ },
25
+ client::WorkerClient,
17
26
  },
18
27
  PollActivityError, TaskToken,
19
28
  };
20
29
  use activity_heartbeat_manager::ActivityHeartbeatManager;
21
30
  use dashmap::DashMap;
22
- use governor::{
23
- clock::DefaultClock,
24
- middleware::NoOpMiddleware,
25
- state::{InMemoryState, NotKeyed},
26
- Quota, RateLimiter,
31
+ use futures::{
32
+ stream,
33
+ stream::{BoxStream, PollNext},
34
+ Stream, StreamExt,
27
35
  };
36
+ use governor::{Quota, RateLimiter};
28
37
  use std::{
29
38
  convert::TryInto,
39
+ future,
30
40
  sync::Arc,
31
41
  time::{Duration, Instant},
32
42
  };
@@ -37,13 +47,23 @@ use temporal_sdk_core_protos::{
37
47
  ActivityHeartbeat,
38
48
  },
39
49
  temporal::api::{
40
- failure::v1::{failure::FailureInfo, CanceledFailureInfo, Failure},
50
+ failure::v1::{failure::FailureInfo, ApplicationFailureInfo, CanceledFailureInfo, Failure},
41
51
  workflowservice::v1::PollActivityTaskQueueResponse,
42
52
  },
43
53
  };
44
- use tokio::sync::Notify;
54
+ use tokio::{
55
+ join,
56
+ sync::{
57
+ mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender},
58
+ Mutex, Notify,
59
+ },
60
+ };
61
+ use tokio_stream::wrappers::UnboundedReceiverStream;
62
+ use tokio_util::sync::CancellationToken;
45
63
  use tracing::Span;
46
64
 
65
+ type OutstandingActMap = Arc<DashMap<TaskToken, RemoteInFlightActInfo>>;
66
+
47
67
  #[derive(Debug, derive_more::Constructor)]
48
68
  struct PendingActivityCancel {
49
69
  task_token: TaskToken,
@@ -67,17 +87,18 @@ struct RemoteInFlightActInfo {
67
87
  pub base: InFlightActInfo,
68
88
  /// Used to calculate aggregation delay between activity heartbeats.
69
89
  pub heartbeat_timeout: Option<prost_types::Duration>,
70
- /// Set to true if we have already issued a cancellation activation to lang for this activity
71
- pub issued_cancel_to_lang: bool,
90
+ /// Set if we have already issued a cancellation activation to lang for this activity, with
91
+ /// the original reason we issued the cancel.
92
+ pub issued_cancel_to_lang: Option<ActivityCancelReason>,
72
93
  /// Set to true if we have already learned from the server this activity doesn't exist. EX:
73
94
  /// we have learned from heartbeating and issued a cancel task, in which case we may simply
74
95
  /// discard the reply.
75
96
  pub known_not_found: bool,
76
97
  /// The permit from the max concurrent semaphore
77
- _permit: OwnedMeteredSemPermit,
98
+ _permit: UsedMeteredSemPermit,
78
99
  }
79
100
  impl RemoteInFlightActInfo {
80
- fn new(poll_resp: &PollActivityTaskQueueResponse, permit: OwnedMeteredSemPermit) -> Self {
101
+ fn new(poll_resp: &PollActivityTaskQueueResponse, permit: UsedMeteredSemPermit) -> Self {
81
102
  let wec = poll_resp.workflow_execution.clone().unwrap_or_default();
82
103
  Self {
83
104
  base: InFlightActInfo {
@@ -88,53 +109,51 @@ impl RemoteInFlightActInfo {
88
109
  start_time: Instant::now(),
89
110
  },
90
111
  heartbeat_timeout: poll_resp.heartbeat_timeout.clone(),
91
- issued_cancel_to_lang: false,
112
+ issued_cancel_to_lang: None,
92
113
  known_not_found: false,
93
114
  _permit: permit,
94
115
  }
95
116
  }
96
117
  }
97
118
 
98
- struct NonPollActBuffer {
99
- tx: async_channel::Sender<PermittedTqResp>,
100
- rx: async_channel::Receiver<PermittedTqResp>,
101
- }
102
- impl NonPollActBuffer {
103
- pub fn new() -> Self {
104
- let (tx, rx) = async_channel::unbounded();
105
- Self { tx, rx }
106
- }
107
-
108
- pub async fn next(&self) -> PermittedTqResp {
109
- self.rx.recv().await.expect("Send half cannot be dropped")
110
- }
111
- }
112
-
113
119
  pub(crate) struct WorkerActivityTasks {
120
+ /// Token used to signal the server task poller that shutdown is beginning
121
+ poller_shutdown_token: CancellationToken,
114
122
  /// Centralizes management of heartbeat issuing / throttling
115
123
  heartbeat_manager: ActivityHeartbeatManager,
124
+ /// Combined stream for any ActivityTask producing source (polls, eager activities,
125
+ /// cancellations)
126
+ activity_task_stream: Mutex<BoxStream<'static, Result<ActivityTask, PollActivityError>>>,
116
127
  /// Activities that have been issued to lang but not yet completed
117
- outstanding_activity_tasks: DashMap<TaskToken, RemoteInFlightActInfo>,
118
- /// Buffers activity task polling in the event we need to return a cancellation while a poll is
119
- /// ongoing.
120
- poller: BoxedActPoller,
121
- /// Holds activity tasks we have received by non-polling means. EX: In direct response to
122
- /// workflow task completion.
123
- non_poll_tasks: NonPollActBuffer,
124
- /// Ensures we stay at or below this worker's maximum concurrent activity limit
125
- activities_semaphore: Arc<MeteredSemaphore>,
126
- /// Enables per-worker rate-limiting of activity tasks
127
- ratelimiter: Option<RateLimiter<NotKeyed, InMemoryState, DefaultClock, NoOpMiddleware>>,
128
- /// Wakes every time an activity is removed from the outstanding map
129
- complete_notify: Notify,
128
+ outstanding_activity_tasks: OutstandingActMap,
129
+ /// Ensures we don't exceed this worker's maximum concurrent activity limit. This
130
+ /// semaphore is used to limit eager activities but shares the same underlying
131
+ /// [MeteredSemaphore] that is used to limit the concurrency for non-eager activities.
132
+ eager_activities_semaphore: Arc<ClosableMeteredSemaphore>,
133
+ /// Holds activity tasks we have received in direct response to workflow task completion (a.k.a.
134
+ /// eager activities). Tasks received in this stream hold a "tracked" permit that is issued by
135
+ /// the `eager_activities_semaphore`.
136
+ eager_activities_tx: UnboundedSender<TrackedPermittedTqResp>,
130
137
 
131
138
  metrics: MetricsContext,
132
139
 
133
140
  max_heartbeat_throttle_interval: Duration,
134
141
  default_heartbeat_throttle_interval: Duration,
142
+
143
+ /// Wakes every time an activity is removed from the outstanding map
144
+ complete_notify: Arc<Notify>,
145
+ /// Token to notify when poll returned a shutdown error
146
+ poll_returned_shutdown_token: CancellationToken,
147
+ }
148
+
149
+ #[derive(derive_more::From)]
150
+ enum ActivityTaskSource {
151
+ PendingCancel(PendingActivityCancel),
152
+ PendingStart(Result<(PermittedTqResp, bool), PollActivityError>),
135
153
  }
136
154
 
137
155
  impl WorkerActivityTasks {
156
+ #[allow(clippy::too_many_arguments)]
138
157
  pub(crate) fn new(
139
158
  max_activity_tasks: usize,
140
159
  max_worker_act_per_sec: Option<f64>,
@@ -143,91 +162,230 @@ impl WorkerActivityTasks {
143
162
  metrics: MetricsContext,
144
163
  max_heartbeat_throttle_interval: Duration,
145
164
  default_heartbeat_throttle_interval: Duration,
165
+ graceful_shutdown: Option<Duration>,
146
166
  ) -> Self {
147
- Self {
148
- heartbeat_manager: ActivityHeartbeatManager::new(client),
149
- outstanding_activity_tasks: Default::default(),
167
+ let semaphore = Arc::new(MeteredSemaphore::new(
168
+ max_activity_tasks,
169
+ metrics.with_new_attrs([activity_worker_type()]),
170
+ MetricsContext::available_task_slots,
171
+ ));
172
+ let poller_shutdown_token = CancellationToken::new();
173
+ let rate_limiter = max_worker_act_per_sec.and_then(|ps| {
174
+ Quota::with_period(Duration::from_secs_f64(ps.recip())).map(RateLimiter::direct)
175
+ });
176
+ let outstanding_activity_tasks = Arc::new(DashMap::new());
177
+ let server_poller_stream = new_activity_task_poller(
150
178
  poller,
151
- non_poll_tasks: NonPollActBuffer::new(),
152
- activities_semaphore: Arc::new(MeteredSemaphore::new(
153
- max_activity_tasks,
154
- metrics.with_new_attrs([activity_worker_type()]),
155
- MetricsContext::available_task_slots,
156
- )),
157
- ratelimiter: max_worker_act_per_sec.and_then(|ps| {
158
- Quota::with_period(Duration::from_secs_f64(ps.recip())).map(RateLimiter::direct)
159
- }),
160
- complete_notify: Notify::new(),
179
+ semaphore.clone(),
180
+ rate_limiter,
181
+ metrics.clone(),
182
+ poller_shutdown_token.clone(),
183
+ );
184
+ let (eager_activities_tx, eager_activities_rx) = unbounded_channel();
185
+ let eager_activities_semaphore = ClosableMeteredSemaphore::new_arc(semaphore);
186
+
187
+ let start_tasks_stream_complete = CancellationToken::new();
188
+ let starts_stream = Self::merge_start_task_sources(
189
+ eager_activities_rx,
190
+ server_poller_stream,
191
+ eager_activities_semaphore.clone(),
192
+ start_tasks_stream_complete.clone(),
193
+ );
194
+ let (cancels_tx, cancels_rx) = unbounded_channel();
195
+ let heartbeat_manager = ActivityHeartbeatManager::new(client, cancels_tx.clone());
196
+ let complete_notify = Arc::new(Notify::new());
197
+ let source_stream = stream::select_with_strategy(
198
+ UnboundedReceiverStream::new(cancels_rx).map(ActivityTaskSource::from),
199
+ starts_stream.map(ActivityTaskSource::from),
200
+ |_: &mut ()| PollNext::Left,
201
+ );
202
+ // Create a task stream composed of (in poll preference order):
203
+ // cancels_stream ------------------------------+--- activity_task_stream
204
+ // eager_activities_rx ---+--- starts_stream ---|
205
+ // server_poll_stream ---|
206
+ let activity_task_stream = Self::merge_source_streams(
207
+ source_stream,
208
+ outstanding_activity_tasks.clone(),
209
+ start_tasks_stream_complete,
210
+ complete_notify.clone(),
211
+ graceful_shutdown,
212
+ cancels_tx,
213
+ metrics.clone(),
214
+ );
215
+
216
+ Self {
217
+ poller_shutdown_token,
218
+ eager_activities_tx,
219
+ heartbeat_manager,
220
+ activity_task_stream: Mutex::new(activity_task_stream.boxed()),
221
+ eager_activities_semaphore,
222
+ complete_notify,
161
223
  metrics,
162
224
  max_heartbeat_throttle_interval,
163
225
  default_heartbeat_throttle_interval,
226
+ poll_returned_shutdown_token: CancellationToken::new(),
227
+ outstanding_activity_tasks,
164
228
  }
165
229
  }
166
230
 
167
- pub(crate) fn notify_shutdown(&self) {
168
- self.poller.notify_shutdown();
231
+ /// Merges the server poll and eager [ActivityTask] sources
232
+ fn merge_start_task_sources(
233
+ non_poll_tasks_rx: UnboundedReceiver<TrackedPermittedTqResp>,
234
+ poller_stream: impl Stream<Item = Result<PermittedTqResp, tonic::Status>>,
235
+ eager_activities_semaphore: Arc<ClosableMeteredSemaphore>,
236
+ on_complete_token: CancellationToken,
237
+ ) -> impl Stream<Item = Result<(PermittedTqResp, bool), PollActivityError>> {
238
+ let non_poll_stream = stream::unfold(
239
+ (non_poll_tasks_rx, eager_activities_semaphore),
240
+ |(mut non_poll_tasks_rx, eager_activities_semaphore)| async move {
241
+ loop {
242
+ tokio::select! {
243
+ biased;
244
+
245
+ task_opt = non_poll_tasks_rx.recv() => {
246
+ // Add is_eager true and wrap in Result
247
+ return task_opt.map(|task| (
248
+ Ok((PermittedTqResp{ permit: task.permit.into(), resp: task.resp },
249
+ true)),
250
+ (non_poll_tasks_rx, eager_activities_semaphore)));
251
+ }
252
+ _ = eager_activities_semaphore.close_complete() => {
253
+ // Once shutting down, we stop accepting eager activities
254
+ non_poll_tasks_rx.close();
255
+ continue;
256
+ }
257
+ }
258
+ }
259
+ },
260
+ );
261
+ // Add is_eager false
262
+ let poller_stream = poller_stream.map(|res| res.map(|task| (task, false)));
263
+
264
+ // Prefer eager activities over polling the server
265
+ stream::select_with_strategy(non_poll_stream, poller_stream, |_: &mut ()| PollNext::Left)
266
+ .map(|res| res.map_err(|err| err.into()))
267
+ // This map, chain, filter_map sequence is here to cancel the token when this stream ends.
268
+ .map(Some)
269
+ .chain(futures::stream::once(async move {
270
+ on_complete_token.cancel();
271
+ None
272
+ }))
273
+ .filter_map(future::ready)
169
274
  }
170
275
 
171
- /// Wait for all outstanding activity tasks to finish
172
- pub(crate) async fn wait_all_finished(&self) {
173
- while !self.outstanding_activity_tasks.is_empty() {
174
- self.complete_notify.notified().await
175
- }
276
+ /// Builds an [ActivityTask] stream for both cancellation tasks from cancels delivered from
277
+ /// heartbeats as well as new activity starts
278
+ fn merge_source_streams(
279
+ source_stream: impl Stream<Item = ActivityTaskSource>,
280
+ outstanding_tasks: Arc<DashMap<TaskToken, RemoteInFlightActInfo>>,
281
+ start_tasks_stream_complete: CancellationToken,
282
+ complete_notify: Arc<Notify>,
283
+ grace_period: Option<Duration>,
284
+ cancels_tx: UnboundedSender<PendingActivityCancel>,
285
+ metrics: MetricsContext,
286
+ ) -> impl Stream<Item = Result<ActivityTask, PollActivityError>> {
287
+ let outstanding_tasks_clone = outstanding_tasks.clone();
288
+ source_stream
289
+ .filter_map(move |source| {
290
+ let outstanding_tasks = outstanding_tasks.clone();
291
+ let metrics = metrics.clone();
292
+ async move {
293
+ match source {
294
+ ActivityTaskSource::PendingCancel(next_pc) => {
295
+ // It's possible that activity has been completed and we no longer have
296
+ // an outstanding activity task. This is fine because it means that we
297
+ // no longer need to cancel this activity, so we'll just ignore such
298
+ // orphaned cancellations.
299
+ if let Some(mut details) =
300
+ outstanding_tasks.get_mut(&next_pc.task_token)
301
+ {
302
+ if details.issued_cancel_to_lang.is_some() {
303
+ // Don't double-issue cancellations
304
+ return None;
305
+ }
306
+
307
+ details.issued_cancel_to_lang = Some(next_pc.reason);
308
+ if next_pc.reason == ActivityCancelReason::NotFound {
309
+ details.known_not_found = true;
310
+ }
311
+ Some(Ok(ActivityTask::cancel_from_ids(
312
+ next_pc.task_token.0,
313
+ next_pc.reason,
314
+ )))
315
+ } else {
316
+ debug!(task_token = ?next_pc.task_token,
317
+ "Unknown activity task when issuing cancel");
318
+ // If we can't find the activity here, it's already been completed,
319
+ // in which case issuing a cancel again is pointless.
320
+ None
321
+ }
322
+ }
323
+ ActivityTaskSource::PendingStart(res) => {
324
+ Some(res.map(|(task, is_eager)| {
325
+ Self::about_to_issue_task(
326
+ outstanding_tasks,
327
+ task,
328
+ is_eager,
329
+ metrics,
330
+ )
331
+ }))
332
+ }
333
+ }
334
+ }
335
+ })
336
+ .take_until(async move {
337
+ start_tasks_stream_complete.cancelled().await;
338
+ // Issue cancels for any still-living act tasks after the grace period
339
+ let (grace_killer, stop_grace) = futures_util::future::abortable(async {
340
+ if let Some(gp) = grace_period {
341
+ // Make sure we've waited at least the grace period. Note the sleep only begins once
342
+ // the starts stream has finished, so time spent waiting for that is not subtracted.
343
+ tokio::time::sleep(gp).await;
344
+ for mapref in outstanding_tasks_clone.iter() {
345
+ let _ = cancels_tx.send(PendingActivityCancel::new(
346
+ mapref.key().clone(),
347
+ ActivityCancelReason::WorkerShutdown,
348
+ ));
349
+ }
350
+ }
351
+ });
352
+ join!(
353
+ async {
354
+ while !outstanding_tasks_clone.is_empty() {
355
+ complete_notify.notified().await
356
+ }
357
+ // If we were waiting for the grace period but everything already finished,
358
+ // we don't need to keep waiting.
359
+ stop_grace.abort();
360
+ },
361
+ grace_killer
362
+ )
363
+ })
176
364
  }
177
365
 
178
- pub(crate) async fn shutdown(self) {
179
- self.poller.shutdown_box().await;
366
+ pub(crate) fn initiate_shutdown(&self) {
367
+ self.poller_shutdown_token.cancel();
368
+ self.eager_activities_semaphore.close();
369
+ }
370
+
371
+ pub(crate) async fn shutdown(&self) {
372
+ self.initiate_shutdown();
373
+ self.poll_returned_shutdown_token.cancelled().await;
180
374
  self.heartbeat_manager.shutdown().await;
181
375
  }
182
376
 
183
- /// Wait until not at the outstanding activity limit, and then poll for an activity task.
377
+ /// Exclusive poll for activity tasks
184
378
  ///
185
- /// Returns `Ok(None)` if no activity is ready and the overall polling loop should be retried.
186
- pub(crate) async fn poll(&self) -> Result<Option<ActivityTask>, PollActivityError> {
187
- let poll_with_semaphore = async {
188
- // Acquire and subsequently forget a permit for an outstanding activity. When they are
189
- // completed, we must add a new permit to the semaphore, since holding the permit the
190
- // entire time lang does work would be a challenge.
191
- let perm = self
192
- .activities_semaphore
193
- .acquire_owned()
194
- .await
195
- .expect("outstanding activity semaphore not closed");
196
- if let Some(ref rl) = self.ratelimiter {
197
- rl.until_ready().await;
198
- }
199
- (self.poller.poll().await, perm)
200
- };
201
-
202
- tokio::select! {
203
- biased;
204
-
205
- cancel_task = self.next_pending_cancel_task() => {
206
- cancel_task
207
- }
208
- task = self.non_poll_tasks.next() => {
209
- Ok(Some(self.about_to_issue_task(task, true)))
210
- }
211
- (work, permit) = poll_with_semaphore => {
212
- match work {
213
- Some(Ok(work)) => {
214
- if work == PollActivityTaskQueueResponse::default() {
215
- // Timeout
216
- self.metrics.act_poll_timeout();
217
- return Ok(None)
218
- }
219
- let work = self.about_to_issue_task(PermittedTqResp {
220
- resp: work, permit
221
- }, false);
222
- Ok(Some(work))
223
- }
224
- None => {
225
- Err(PollActivityError::ShutDown)
226
- }
227
- Some(Err(e)) => Err(e.into())
228
- }
229
- }
230
- }
379
+ /// Polls the various task sources (server polls, eager activities, cancellations) while
380
+ /// respecting the provided rate limits and allowed concurrency. Returns
381
+ /// [PollActivityError::ShutDown] after shutdown is completed and all tasks sources are
382
+ /// depleted.
383
+ pub(crate) async fn poll(&self) -> Result<ActivityTask, PollActivityError> {
384
+ let mut poller_stream = self.activity_task_stream.lock().await;
385
+ poller_stream.next().await.unwrap_or_else(|| {
386
+ self.poll_returned_shutdown_token.cancel();
387
+ Err(PollActivityError::ShutDown)
388
+ })
231
389
  }
232
390
 
233
391
  pub(crate) async fn complete(
@@ -265,22 +423,40 @@ impl WorkerActivityTasks {
265
423
  .err()
266
424
  }
267
425
  aer::Status::Cancelled(ar::Cancellation { failure }) => {
268
- let details = if let Some(Failure {
269
- failure_info:
270
- Some(FailureInfo::CanceledFailureInfo(CanceledFailureInfo { details })),
271
- ..
272
- }) = failure
273
- {
274
- details
426
+ if matches!(
427
+ act_info.issued_cancel_to_lang,
428
+ Some(ActivityCancelReason::WorkerShutdown),
429
+ ) {
430
+ // We don't report cancels for graceful shutdown as failures, so we
431
+ // don't wait for the whole timeout to elapse, which is what would
432
+ // happen anyway.
433
+ client
434
+ .fail_activity_task(
435
+ task_token.clone(),
436
+ Some(worker_shutdown_failure()),
437
+ )
438
+ .await
439
+ .err()
275
440
  } else {
276
- warn!(task_token = ? task_token,
441
+ let details = if let Some(Failure {
442
+ failure_info:
443
+ Some(FailureInfo::CanceledFailureInfo(CanceledFailureInfo {
444
+ details,
445
+ })),
446
+ ..
447
+ }) = failure
448
+ {
449
+ details
450
+ } else {
451
+ warn!(task_token = ? task_token,
277
452
  "Expected activity cancelled status with CanceledFailureInfo");
278
- None
279
- };
280
- client
281
- .cancel_activity_task(task_token.clone(), details.map(Into::into))
282
- .await
283
- .err()
453
+ None
454
+ };
455
+ client
456
+ .cancel_activity_task(task_token.clone(), details.map(Into::into))
457
+ .await
458
+ .err()
459
+ }
284
460
  }
285
461
  };
286
462
 
@@ -337,48 +513,21 @@ impl WorkerActivityTasks {
337
513
  /// Returns a handle that the workflows management side can use to interact with this manager
338
514
  pub(crate) fn get_handle_for_workflows(&self) -> ActivitiesFromWFTsHandle {
339
515
  ActivitiesFromWFTsHandle {
340
- sem: self.activities_semaphore.clone(),
341
- tx: self.non_poll_tasks.tx.clone(),
516
+ sem: self.eager_activities_semaphore.clone(),
517
+ tx: self.eager_activities_tx.clone(),
342
518
  }
343
519
  }
344
520
 
345
- async fn next_pending_cancel_task(&self) -> Result<Option<ActivityTask>, PollActivityError> {
346
- let next_pc = self.heartbeat_manager.next_pending_cancel().await;
347
- // Issue cancellations for anything we noticed was cancelled during heartbeating
348
- if let Some(PendingActivityCancel { task_token, reason }) = next_pc {
349
- // It's possible that activity has been completed and we no longer have an
350
- // outstanding activity task. This is fine because it means that we no
351
- // longer need to cancel this activity, so we'll just ignore such orphaned
352
- // cancellations.
353
- if let Some(mut details) = self.outstanding_activity_tasks.get_mut(&task_token) {
354
- if details.issued_cancel_to_lang {
355
- // Don't double-issue cancellations
356
- return Ok(None);
357
- }
358
-
359
- details.issued_cancel_to_lang = true;
360
- if reason == ActivityCancelReason::NotFound {
361
- details.known_not_found = true;
362
- }
363
- Ok(Some(ActivityTask::cancel_from_ids(task_token.0, reason)))
364
- } else {
365
- debug!(task_token = ?task_token, "Unknown activity task when issuing cancel");
366
- // If we can't find the activity here, it's already been completed,
367
- // in which case issuing a cancel again is pointless.
368
- Ok(None)
369
- }
370
- } else {
371
- // The only situation where the next cancel would return none is if the manager
372
- // was dropped, which can only happen on shutdown.
373
- Err(PollActivityError::ShutDown)
374
- }
375
- }
376
-
377
- /// Called when there is a new act task about to be bubbled up out of the manager
378
- fn about_to_issue_task(&self, task: PermittedTqResp, is_eager: bool) -> ActivityTask {
521
+ /// Called when there is a new [ActivityTask] about to be bubbled up out of the poller
522
+ fn about_to_issue_task(
523
+ outstanding_tasks: Arc<DashMap<TaskToken, RemoteInFlightActInfo>>,
524
+ task: PermittedTqResp,
525
+ is_eager: bool,
526
+ metrics: MetricsContext,
527
+ ) -> ActivityTask {
379
528
  if let Some(ref act_type) = task.resp.activity_type {
380
529
  if let Some(ref wf_type) = task.resp.workflow_type {
381
- self.metrics
530
+ metrics
382
531
  .with_new_attrs([
383
532
  activity_type(act_type.name.clone()),
384
533
  workflow_type(wf_type.name.clone()),
@@ -391,12 +540,12 @@ impl WorkerActivityTasks {
391
540
  // activity_type and workflow_type, we won't bother.
392
541
 
393
542
  if let Some(dur) = task.resp.sched_to_start() {
394
- self.metrics.act_sched_to_start_latency(dur);
543
+ metrics.act_sched_to_start_latency(dur);
395
544
  };
396
545
 
397
- self.outstanding_activity_tasks.insert(
546
+ outstanding_tasks.insert(
398
547
  task.resp.task_token.clone().into(),
399
- RemoteInFlightActInfo::new(&task.resp, task.permit),
548
+ RemoteInFlightActInfo::new(&task.resp, task.permit.into_used()),
400
549
  );
401
550
 
402
551
  ActivityTask::start_from_poll_resp(task.resp)
@@ -404,40 +553,65 @@ impl WorkerActivityTasks {
404
553
 
405
554
  #[cfg(test)]
406
555
  pub(crate) fn remaining_activity_capacity(&self) -> usize {
407
- self.activities_semaphore.available_permits()
556
+ self.eager_activities_semaphore.available_permits()
408
557
  }
409
558
  }
410
559
 
411
560
  /// Provides facilities for the workflow side of things to interact with the activity manager.
412
561
  /// Allows for the handling of activities returned by WFT completions.
413
562
  pub(crate) struct ActivitiesFromWFTsHandle {
414
- sem: Arc<MeteredSemaphore>,
415
- tx: async_channel::Sender<PermittedTqResp>,
563
+ sem: Arc<ClosableMeteredSemaphore>,
564
+ tx: UnboundedSender<TrackedPermittedTqResp>,
416
565
  }
417
566
 
418
567
  impl ActivitiesFromWFTsHandle {
419
568
  /// Returns a handle that can be used to reserve an activity slot. EX: When requesting eager
420
569
  /// dispatch of an activity to this worker upon workflow task completion
421
- pub(crate) fn reserve_slot(&self) -> Option<OwnedMeteredSemPermit> {
570
+ pub(crate) fn reserve_slot(&self) -> Option<TrackedOwnedMeteredSemPermit> {
571
+ // TODO: check if rate limit is not exceeded and count this reservation towards the rate limit
422
572
  self.sem.try_acquire_owned().ok()
423
573
  }
424
574
 
425
575
  /// Queue new activity tasks for dispatch received from non-polling sources (ex: eager returns
426
576
  /// from WFT completion)
427
- pub(crate) fn add_tasks(&self, tasks: impl IntoIterator<Item = PermittedTqResp>) {
577
+ pub(crate) fn add_tasks(&self, tasks: impl IntoIterator<Item = TrackedPermittedTqResp>) {
428
578
  for t in tasks.into_iter() {
429
579
  // Technically we should be reporting `activity_task_received` here, but for simplicity
430
580
  // and time insensitivity, that metric is tracked in `about_to_issue_task`.
431
- self.tx.try_send(t).expect("Receive half cannot be dropped");
581
+ self.tx.send(t).expect("Receive half cannot be dropped");
432
582
  }
433
583
  }
434
584
  }
435
585
 
586
+ #[derive(Debug)]
436
587
  pub(crate) struct PermittedTqResp {
437
588
  pub permit: OwnedMeteredSemPermit,
438
589
  pub resp: PollActivityTaskQueueResponse,
439
590
  }
440
591
 
592
+ #[derive(Debug)]
593
+ pub(crate) struct TrackedPermittedTqResp {
594
+ pub permit: TrackedOwnedMeteredSemPermit,
595
+ pub resp: PollActivityTaskQueueResponse,
596
+ }
597
+
598
+ fn worker_shutdown_failure() -> Failure {
599
+ Failure {
600
+ message: "Worker is shutting down and this activity did not complete in time".to_string(),
601
+ source: "".to_string(),
602
+ stack_trace: "".to_string(),
603
+ encoded_attributes: None,
604
+ cause: None,
605
+ failure_info: Some(FailureInfo::ApplicationFailureInfo(
606
+ ApplicationFailureInfo {
607
+ r#type: "WorkerShutdown".to_string(),
608
+ non_retryable: false,
609
+ details: None,
610
+ },
611
+ )),
612
+ }
613
+ }
614
+
441
615
  #[cfg(test)]
442
616
  mod tests {
443
617
  use super::*;
@@ -469,10 +643,11 @@ mod tests {
469
643
  MetricsContext::no_op(),
470
644
  Duration::from_secs(1),
471
645
  Duration::from_secs(1),
646
+ None,
472
647
  );
473
648
  let start = Instant::now();
474
- atm.poll().await.unwrap().unwrap();
475
- atm.poll().await.unwrap().unwrap();
649
+ atm.poll().await.unwrap();
650
+ atm.poll().await.unwrap();
476
651
  // At least half a second will have elapsed since we only allow 2 tasks per second.
477
652
  // With no ratelimit, even on a slow CI server with lots of load, this would typically take
478
653
  // low single digit ms or less.