temporalio 0.0.2 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (202)
  1. checksums.yaml +4 -4
  2. data/README.md +25 -23
  3. data/bridge/Cargo.lock +168 -59
  4. data/bridge/Cargo.toml +4 -2
  5. data/bridge/sdk-core/README.md +19 -6
  6. data/bridge/sdk-core/client/src/lib.rs +215 -39
  7. data/bridge/sdk-core/client/src/metrics.rs +17 -8
  8. data/bridge/sdk-core/client/src/raw.rs +4 -4
  9. data/bridge/sdk-core/client/src/retry.rs +32 -20
  10. data/bridge/sdk-core/core/Cargo.toml +22 -9
  11. data/bridge/sdk-core/core/src/abstractions.rs +203 -14
  12. data/bridge/sdk-core/core/src/core_tests/activity_tasks.rs +76 -41
  13. data/bridge/sdk-core/core/src/core_tests/determinism.rs +165 -2
  14. data/bridge/sdk-core/core/src/core_tests/local_activities.rs +204 -83
  15. data/bridge/sdk-core/core/src/core_tests/queries.rs +3 -4
  16. data/bridge/sdk-core/core/src/core_tests/workers.rs +1 -3
  17. data/bridge/sdk-core/core/src/core_tests/workflow_tasks.rs +397 -54
  18. data/bridge/sdk-core/core/src/ephemeral_server/mod.rs +106 -12
  19. data/bridge/sdk-core/core/src/internal_flags.rs +136 -0
  20. data/bridge/sdk-core/core/src/lib.rs +16 -9
  21. data/bridge/sdk-core/core/src/telemetry/log_export.rs +1 -1
  22. data/bridge/sdk-core/core/src/telemetry/metrics.rs +69 -35
  23. data/bridge/sdk-core/core/src/telemetry/mod.rs +29 -13
  24. data/bridge/sdk-core/core/src/telemetry/prometheus_server.rs +17 -12
  25. data/bridge/sdk-core/core/src/test_help/mod.rs +62 -12
  26. data/bridge/sdk-core/core/src/worker/activities/activity_heartbeat_manager.rs +112 -156
  27. data/bridge/sdk-core/core/src/worker/activities/activity_task_poller_stream.rs +89 -0
  28. data/bridge/sdk-core/core/src/worker/activities/local_activities.rs +352 -122
  29. data/bridge/sdk-core/core/src/worker/activities.rs +233 -157
  30. data/bridge/sdk-core/core/src/worker/client/mocks.rs +22 -2
  31. data/bridge/sdk-core/core/src/worker/client.rs +18 -2
  32. data/bridge/sdk-core/core/src/worker/mod.rs +165 -58
  33. data/bridge/sdk-core/core/src/worker/workflow/bridge.rs +1 -3
  34. data/bridge/sdk-core/core/src/worker/workflow/driven_workflow.rs +3 -5
  35. data/bridge/sdk-core/core/src/worker/workflow/history_update.rs +856 -277
  36. data/bridge/sdk-core/core/src/worker/workflow/machines/activity_state_machine.rs +100 -43
  37. data/bridge/sdk-core/core/src/worker/workflow/machines/cancel_external_state_machine.rs +7 -7
  38. data/bridge/sdk-core/core/src/worker/workflow/machines/cancel_workflow_state_machine.rs +5 -4
  39. data/bridge/sdk-core/core/src/worker/workflow/machines/child_workflow_state_machine.rs +87 -27
  40. data/bridge/sdk-core/core/src/worker/workflow/machines/complete_workflow_state_machine.rs +5 -4
  41. data/bridge/sdk-core/core/src/worker/workflow/machines/continue_as_new_workflow_state_machine.rs +5 -4
  42. data/bridge/sdk-core/core/src/worker/workflow/machines/fail_workflow_state_machine.rs +5 -4
  43. data/bridge/sdk-core/core/src/worker/workflow/machines/local_activity_state_machine.rs +137 -62
  44. data/bridge/sdk-core/core/src/worker/workflow/machines/mod.rs +25 -17
  45. data/bridge/sdk-core/core/src/worker/workflow/machines/modify_workflow_properties_state_machine.rs +7 -6
  46. data/bridge/sdk-core/core/src/worker/workflow/machines/patch_state_machine.rs +103 -152
  47. data/bridge/sdk-core/core/src/worker/workflow/machines/signal_external_state_machine.rs +7 -7
  48. data/bridge/sdk-core/core/src/worker/workflow/machines/timer_state_machine.rs +9 -9
  49. data/bridge/sdk-core/core/src/worker/workflow/machines/transition_coverage.rs +2 -2
  50. data/bridge/sdk-core/core/src/worker/workflow/machines/upsert_search_attributes_state_machine.rs +14 -7
  51. data/bridge/sdk-core/core/src/worker/workflow/machines/workflow_machines/local_acts.rs +5 -16
  52. data/bridge/sdk-core/core/src/worker/workflow/machines/workflow_machines.rs +201 -121
  53. data/bridge/sdk-core/core/src/worker/workflow/machines/workflow_task_state_machine.rs +11 -14
  54. data/bridge/sdk-core/core/src/worker/workflow/managed_run/managed_wf_test.rs +30 -15
  55. data/bridge/sdk-core/core/src/worker/workflow/managed_run.rs +1026 -376
  56. data/bridge/sdk-core/core/src/worker/workflow/mod.rs +460 -384
  57. data/bridge/sdk-core/core/src/worker/workflow/run_cache.rs +40 -57
  58. data/bridge/sdk-core/core/src/worker/workflow/wft_extraction.rs +125 -0
  59. data/bridge/sdk-core/core/src/worker/workflow/wft_poller.rs +1 -4
  60. data/bridge/sdk-core/core/src/worker/workflow/workflow_stream/saved_wf_inputs.rs +117 -0
  61. data/bridge/sdk-core/core/src/worker/workflow/workflow_stream/tonic_status_serde.rs +24 -0
  62. data/bridge/sdk-core/core/src/worker/workflow/workflow_stream.rs +448 -718
  63. data/bridge/sdk-core/core-api/Cargo.toml +2 -1
  64. data/bridge/sdk-core/core-api/src/errors.rs +1 -34
  65. data/bridge/sdk-core/core-api/src/lib.rs +6 -2
  66. data/bridge/sdk-core/core-api/src/telemetry.rs +0 -6
  67. data/bridge/sdk-core/core-api/src/worker.rs +14 -1
  68. data/bridge/sdk-core/fsm/rustfsm_procmacro/src/lib.rs +18 -15
  69. data/bridge/sdk-core/fsm/rustfsm_trait/src/lib.rs +8 -3
  70. data/bridge/sdk-core/histories/evict_while_la_running_no_interference-16_history.bin +0 -0
  71. data/bridge/sdk-core/protos/api_upstream/temporal/api/command/v1/message.proto +5 -17
  72. data/bridge/sdk-core/protos/api_upstream/temporal/api/common/v1/message.proto +11 -0
  73. data/bridge/sdk-core/protos/api_upstream/temporal/api/enums/v1/command_type.proto +1 -6
  74. data/bridge/sdk-core/protos/api_upstream/temporal/api/enums/v1/event_type.proto +6 -6
  75. data/bridge/sdk-core/protos/api_upstream/temporal/api/enums/v1/failed_cause.proto +5 -0
  76. data/bridge/sdk-core/protos/api_upstream/temporal/api/enums/v1/update.proto +22 -6
  77. data/bridge/sdk-core/protos/api_upstream/temporal/api/history/v1/message.proto +48 -19
  78. data/bridge/sdk-core/protos/api_upstream/temporal/api/namespace/v1/message.proto +2 -0
  79. data/bridge/sdk-core/protos/api_upstream/temporal/api/operatorservice/v1/request_response.proto +3 -0
  80. data/bridge/sdk-core/protos/api_upstream/temporal/api/{enums/v1/interaction_type.proto → protocol/v1/message.proto} +29 -11
  81. data/bridge/sdk-core/protos/api_upstream/temporal/api/sdk/v1/task_complete_metadata.proto +63 -0
  82. data/bridge/sdk-core/protos/api_upstream/temporal/api/update/v1/message.proto +111 -0
  83. data/bridge/sdk-core/protos/api_upstream/temporal/api/workflowservice/v1/request_response.proto +59 -28
  84. data/bridge/sdk-core/protos/api_upstream/temporal/api/workflowservice/v1/service.proto +2 -2
  85. data/bridge/sdk-core/protos/local/temporal/sdk/core/activity_result/activity_result.proto +1 -0
  86. data/bridge/sdk-core/protos/local/temporal/sdk/core/activity_task/activity_task.proto +1 -0
  87. data/bridge/sdk-core/protos/local/temporal/sdk/core/child_workflow/child_workflow.proto +1 -0
  88. data/bridge/sdk-core/protos/local/temporal/sdk/core/common/common.proto +1 -0
  89. data/bridge/sdk-core/protos/local/temporal/sdk/core/core_interface.proto +1 -0
  90. data/bridge/sdk-core/protos/local/temporal/sdk/core/external_data/external_data.proto +1 -0
  91. data/bridge/sdk-core/protos/local/temporal/sdk/core/workflow_activation/workflow_activation.proto +7 -0
  92. data/bridge/sdk-core/protos/local/temporal/sdk/core/workflow_commands/workflow_commands.proto +1 -0
  93. data/bridge/sdk-core/protos/local/temporal/sdk/core/workflow_completion/workflow_completion.proto +6 -0
  94. data/bridge/sdk-core/sdk/Cargo.toml +3 -2
  95. data/bridge/sdk-core/sdk/src/lib.rs +87 -20
  96. data/bridge/sdk-core/sdk/src/workflow_future.rs +9 -8
  97. data/bridge/sdk-core/sdk-core-protos/Cargo.toml +5 -2
  98. data/bridge/sdk-core/sdk-core-protos/build.rs +36 -1
  99. data/bridge/sdk-core/sdk-core-protos/src/history_builder.rs +100 -87
  100. data/bridge/sdk-core/sdk-core-protos/src/history_info.rs +5 -1
  101. data/bridge/sdk-core/sdk-core-protos/src/lib.rs +175 -57
  102. data/bridge/sdk-core/sdk-core-protos/src/task_token.rs +12 -2
  103. data/bridge/sdk-core/test-utils/Cargo.toml +3 -1
  104. data/bridge/sdk-core/test-utils/src/canned_histories.rs +106 -296
  105. data/bridge/sdk-core/test-utils/src/histfetch.rs +1 -1
  106. data/bridge/sdk-core/test-utils/src/lib.rs +82 -23
  107. data/bridge/sdk-core/test-utils/src/wf_input_saver.rs +50 -0
  108. data/bridge/sdk-core/test-utils/src/workflows.rs +29 -0
  109. data/bridge/sdk-core/tests/fuzzy_workflow.rs +130 -0
  110. data/bridge/sdk-core/tests/{load_tests.rs → heavy_tests.rs} +125 -51
  111. data/bridge/sdk-core/tests/integ_tests/ephemeral_server_tests.rs +25 -3
  112. data/bridge/sdk-core/tests/integ_tests/heartbeat_tests.rs +5 -3
  113. data/bridge/sdk-core/tests/integ_tests/metrics_tests.rs +218 -16
  114. data/bridge/sdk-core/tests/integ_tests/polling_tests.rs +4 -47
  115. data/bridge/sdk-core/tests/integ_tests/queries_tests.rs +5 -128
  116. data/bridge/sdk-core/tests/integ_tests/visibility_tests.rs +83 -25
  117. data/bridge/sdk-core/tests/integ_tests/workflow_tests/activities.rs +93 -69
  118. data/bridge/sdk-core/tests/integ_tests/workflow_tests/cancel_external.rs +1 -0
  119. data/bridge/sdk-core/tests/integ_tests/workflow_tests/cancel_wf.rs +6 -13
  120. data/bridge/sdk-core/tests/integ_tests/workflow_tests/child_workflows.rs +1 -0
  121. data/bridge/sdk-core/tests/integ_tests/workflow_tests/continue_as_new.rs +6 -2
  122. data/bridge/sdk-core/tests/integ_tests/workflow_tests/determinism.rs +3 -10
  123. data/bridge/sdk-core/tests/integ_tests/workflow_tests/local_activities.rs +72 -191
  124. data/bridge/sdk-core/tests/integ_tests/workflow_tests/modify_wf_properties.rs +1 -0
  125. data/bridge/sdk-core/tests/integ_tests/workflow_tests/patches.rs +7 -28
  126. data/bridge/sdk-core/tests/integ_tests/workflow_tests/replay.rs +12 -7
  127. data/bridge/sdk-core/tests/integ_tests/workflow_tests/resets.rs +1 -0
  128. data/bridge/sdk-core/tests/integ_tests/workflow_tests/signals.rs +18 -14
  129. data/bridge/sdk-core/tests/integ_tests/workflow_tests/stickyness.rs +6 -20
  130. data/bridge/sdk-core/tests/integ_tests/workflow_tests/timers.rs +10 -21
  131. data/bridge/sdk-core/tests/integ_tests/workflow_tests/upsert_search_attrs.rs +6 -4
  132. data/bridge/sdk-core/tests/integ_tests/workflow_tests.rs +10 -11
  133. data/bridge/sdk-core/tests/main.rs +3 -13
  134. data/bridge/sdk-core/tests/runner.rs +75 -36
  135. data/bridge/sdk-core/tests/wf_input_replay.rs +32 -0
  136. data/bridge/src/connection.rs +41 -25
  137. data/bridge/src/lib.rs +269 -14
  138. data/bridge/src/runtime.rs +1 -1
  139. data/bridge/src/test_server.rs +153 -0
  140. data/bridge/src/worker.rs +89 -16
  141. data/lib/gen/temporal/api/command/v1/message_pb.rb +4 -18
  142. data/lib/gen/temporal/api/common/v1/message_pb.rb +4 -0
  143. data/lib/gen/temporal/api/enums/v1/command_type_pb.rb +1 -3
  144. data/lib/gen/temporal/api/enums/v1/event_type_pb.rb +3 -3
  145. data/lib/gen/temporal/api/enums/v1/failed_cause_pb.rb +2 -0
  146. data/lib/gen/temporal/api/enums/v1/update_pb.rb +6 -4
  147. data/lib/gen/temporal/api/history/v1/message_pb.rb +27 -19
  148. data/lib/gen/temporal/api/namespace/v1/message_pb.rb +1 -0
  149. data/lib/gen/temporal/api/operatorservice/v1/request_response_pb.rb +3 -0
  150. data/lib/gen/temporal/api/protocol/v1/message_pb.rb +30 -0
  151. data/lib/gen/temporal/api/sdk/v1/task_complete_metadata_pb.rb +23 -0
  152. data/lib/gen/temporal/api/testservice/v1/request_response_pb.rb +49 -0
  153. data/lib/gen/temporal/api/testservice/v1/service_pb.rb +21 -0
  154. data/lib/gen/temporal/api/update/v1/message_pb.rb +72 -0
  155. data/lib/gen/temporal/api/workflowservice/v1/request_response_pb.rb +26 -16
  156. data/lib/gen/temporal/sdk/core/activity_result/activity_result_pb.rb +13 -9
  157. data/lib/gen/temporal/sdk/core/activity_task/activity_task_pb.rb +10 -6
  158. data/lib/gen/temporal/sdk/core/child_workflow/child_workflow_pb.rb +13 -9
  159. data/lib/gen/temporal/sdk/core/common/common_pb.rb +7 -3
  160. data/lib/gen/temporal/sdk/core/core_interface_pb.rb +9 -3
  161. data/lib/gen/temporal/sdk/core/external_data/external_data_pb.rb +7 -3
  162. data/lib/gen/temporal/sdk/core/workflow_activation/workflow_activation_pb.rb +27 -21
  163. data/lib/gen/temporal/sdk/core/workflow_commands/workflow_commands_pb.rb +28 -24
  164. data/lib/gen/temporal/sdk/core/workflow_completion/workflow_completion_pb.rb +12 -5
  165. data/lib/temporalio/activity/context.rb +13 -8
  166. data/lib/temporalio/activity/info.rb +1 -1
  167. data/lib/temporalio/bridge/connect_options.rb +15 -0
  168. data/lib/temporalio/bridge/retry_config.rb +24 -0
  169. data/lib/temporalio/bridge/tls_options.rb +19 -0
  170. data/lib/temporalio/client/implementation.rb +8 -8
  171. data/lib/temporalio/connection/retry_config.rb +44 -0
  172. data/lib/temporalio/connection/service.rb +20 -0
  173. data/lib/temporalio/connection/test_service.rb +92 -0
  174. data/lib/temporalio/connection/tls_options.rb +51 -0
  175. data/lib/temporalio/connection/workflow_service.rb +731 -0
  176. data/lib/temporalio/connection.rb +55 -720
  177. data/lib/temporalio/interceptor/activity_inbound.rb +22 -0
  178. data/lib/temporalio/interceptor/activity_outbound.rb +24 -0
  179. data/lib/temporalio/interceptor/chain.rb +5 -5
  180. data/lib/temporalio/interceptor/client.rb +8 -4
  181. data/lib/temporalio/interceptor.rb +22 -0
  182. data/lib/temporalio/retry_policy.rb +13 -3
  183. data/lib/temporalio/testing/time_skipping_handle.rb +32 -0
  184. data/lib/temporalio/testing/time_skipping_interceptor.rb +23 -0
  185. data/lib/temporalio/testing/workflow_environment.rb +112 -0
  186. data/lib/temporalio/testing.rb +175 -0
  187. data/lib/temporalio/version.rb +1 -1
  188. data/lib/temporalio/worker/activity_runner.rb +26 -4
  189. data/lib/temporalio/worker/activity_worker.rb +44 -18
  190. data/lib/temporalio/worker/sync_worker.rb +47 -11
  191. data/lib/temporalio/worker.rb +27 -21
  192. data/lib/temporalio/workflow/async.rb +46 -0
  193. data/lib/temporalio/workflow/future.rb +138 -0
  194. data/lib/temporalio/workflow/info.rb +76 -0
  195. data/temporalio.gemspec +4 -3
  196. metadata +67 -17
  197. data/bridge/sdk-core/Cargo.lock +0 -2606
  198. data/bridge/sdk-core/protos/api_upstream/temporal/api/interaction/v1/message.proto +0 -87
  199. data/lib/bridge.so +0 -0
  200. data/lib/gen/temporal/api/enums/v1/interaction_type_pb.rb +0 -25
  201. data/lib/gen/temporal/api/interaction/v1/message_pb.rb +0 -49
  202. data/lib/gen/temporal/sdk/core/bridge/bridge_pb.rb +0 -222
@@ -8,40 +8,58 @@ mod history_update;
8
8
  mod machines;
9
9
  mod managed_run;
10
10
  mod run_cache;
11
+ mod wft_extraction;
11
12
  pub(crate) mod wft_poller;
12
13
  mod workflow_stream;
13
14
 
15
+ #[cfg(feature = "save_wf_inputs")]
16
+ pub use workflow_stream::replay_wf_state_inputs;
17
+
14
18
  pub(crate) use bridge::WorkflowBridge;
15
19
  pub(crate) use driven_workflow::{DrivenWorkflow, WorkflowFetcher};
16
- pub(crate) use history_update::{HistoryPaginator, HistoryUpdate};
17
- pub(crate) use machines::WFMachinesError;
20
+ pub(crate) use history_update::HistoryUpdate;
18
21
  #[cfg(test)]
19
22
  pub(crate) use managed_run::ManagedWFFunc;
20
23
 
24
+ use crate::worker::activities::TrackedPermittedTqResp;
21
25
  use crate::{
22
- abstractions::OwnedMeteredSemPermit,
23
- protosext::{legacy_query_failure, ValidPollWFTQResponse, WorkflowActivationExt},
24
- telemetry::VecDisplayer,
26
+ abstractions::{
27
+ stream_when_allowed, MeteredSemaphore, TrackedOwnedMeteredSemPermit, UsedMeteredSemPermit,
28
+ },
29
+ internal_flags::InternalFlags,
30
+ protosext::{legacy_query_failure, ValidPollWFTQResponse},
31
+ telemetry::{
32
+ metrics::workflow_worker_type, set_trace_subscriber_for_current_thread, TelemetryInstance,
33
+ VecDisplayer,
34
+ },
25
35
  worker::{
26
- activities::{ActivitiesFromWFTsHandle, PermittedTqResp},
36
+ activities::{ActivitiesFromWFTsHandle, LocalActivityManager},
27
37
  client::{WorkerClient, WorkflowTaskCompletion},
28
38
  workflow::{
29
- managed_run::{ManagedRun, WorkflowManager},
39
+ history_update::HistoryPaginator,
40
+ managed_run::RunUpdateAct,
41
+ wft_extraction::{HistoryFetchReq, WFTExtractor},
30
42
  wft_poller::validate_wft,
31
43
  workflow_stream::{LocalInput, LocalInputs, WFStream},
32
44
  },
33
- LocalActRequest, LocalActivityResolution,
45
+ LocalActRequest, LocalActivityExecutionResult, LocalActivityResolution,
34
46
  },
35
47
  MetricsContext,
36
48
  };
49
+ use anyhow::anyhow;
37
50
  use futures::{stream::BoxStream, Stream, StreamExt};
51
+ use futures_util::{future::abortable, stream};
52
+ use prost_types::TimestampError;
38
53
  use std::{
39
- collections::HashSet,
40
- fmt::{Debug, Display, Formatter},
54
+ cell::RefCell,
55
+ collections::VecDeque,
56
+ fmt::Debug,
41
57
  future::Future,
42
58
  ops::DerefMut,
59
+ rc::Rc,
43
60
  result,
44
61
  sync::Arc,
62
+ thread,
45
63
  time::{Duration, Instant},
46
64
  };
47
65
  use temporal_sdk_core_api::errors::{CompleteWfError, PollWfError};
@@ -59,36 +77,41 @@ use temporal_sdk_core_protos::{
59
77
  },
60
78
  temporal::api::{
61
79
  command::v1::{command::Attributes, Command as ProtoCommand, Command},
62
- common::v1::{Memo, RetryPolicy, SearchAttributes},
80
+ common::v1::{Memo, MeteringMetadata, RetryPolicy, SearchAttributes, WorkflowExecution},
63
81
  enums::v1::WorkflowTaskFailedCause,
82
+ query::v1::WorkflowQuery,
83
+ sdk::v1::WorkflowTaskCompletedMetadata,
64
84
  taskqueue::v1::StickyExecutionAttributes,
65
- workflowservice::v1::PollActivityTaskQueueResponse,
85
+ workflowservice::v1::{get_system_info_response, PollActivityTaskQueueResponse},
66
86
  },
67
87
  TaskToken,
68
88
  };
69
89
  use tokio::{
70
90
  sync::{
71
- mpsc::{unbounded_channel, UnboundedSender},
91
+ mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender},
72
92
  oneshot,
73
93
  },
74
- task,
75
- task::{JoinError, JoinHandle},
94
+ task::{spawn_blocking, LocalSet},
76
95
  };
77
96
  use tokio_stream::wrappers::UnboundedReceiverStream;
78
97
  use tokio_util::sync::CancellationToken;
79
98
  use tracing::Span;
80
99
 
81
100
  pub(crate) const LEGACY_QUERY_ID: &str = "legacy_query";
101
+ /// What fraction of a WFT timeout we are willing to wait before sending a WFT heartbeat when
102
+ /// necessary.
103
+ const WFT_HEARTBEAT_TIMEOUT_FRACTION: f32 = 0.8;
82
104
  const MAX_EAGER_ACTIVITY_RESERVATIONS_PER_WORKFLOW_TASK: usize = 3;
83
105
 
84
106
  type Result<T, E = WFMachinesError> = result::Result<T, E>;
85
107
  type BoxedActivationStream = BoxStream<'static, Result<ActivationOrAuto, PollWfError>>;
108
+ type InternalFlagsRef = Rc<RefCell<InternalFlags>>;
86
109
 
87
110
  /// Centralizes all state related to workflows and workflow tasks
88
111
  pub(crate) struct Workflows {
89
112
  task_queue: String,
90
113
  local_tx: UnboundedSender<LocalInput>,
91
- processing_task: tokio::sync::Mutex<Option<JoinHandle<()>>>,
114
+ processing_task: tokio::sync::Mutex<Option<thread::JoinHandle<()>>>,
92
115
  activation_stream: tokio::sync::Mutex<(
93
116
  BoxedActivationStream,
94
117
  // Used to indicate polling may begin
@@ -100,9 +123,12 @@ pub(crate) struct Workflows {
100
123
  sticky_attrs: Option<StickyExecutionAttributes>,
101
124
  /// If set, can be used to reserve activity task slots for eager-return of new activity tasks.
102
125
  activity_tasks_handle: Option<ActivitiesFromWFTsHandle>,
126
+ /// Ensures we stay at or below this worker's maximum concurrent workflow task limit
127
+ wft_semaphore: MeteredSemaphore,
128
+ local_act_mgr: Arc<LocalActivityManager>,
103
129
  }
104
130
 
105
- pub(super) struct WorkflowBasics {
131
+ pub(crate) struct WorkflowBasics {
106
132
  pub max_cached_workflows: usize,
107
133
  pub max_outstanding_wfts: usize,
108
134
  pub shutdown_token: CancellationToken,
@@ -110,53 +136,115 @@ pub(super) struct WorkflowBasics {
110
136
  pub namespace: String,
111
137
  pub task_queue: String,
112
138
  pub ignore_evicts_on_shutdown: bool,
139
+ pub fetching_concurrency: usize,
140
+ pub server_capabilities: get_system_info_response::Capabilities,
141
+ #[cfg(feature = "save_wf_inputs")]
142
+ pub wf_state_inputs: Option<UnboundedSender<Vec<u8>>>,
143
+ }
144
+
145
+ pub(crate) struct RunBasics<'a> {
146
+ pub namespace: String,
147
+ pub workflow_id: String,
148
+ pub workflow_type: String,
149
+ pub run_id: String,
150
+ pub history: HistoryUpdate,
151
+ pub metrics: MetricsContext,
152
+ pub capabilities: &'a get_system_info_response::Capabilities,
113
153
  }
114
154
 
115
155
  impl Workflows {
156
+ #[allow(clippy::too_many_arguments)] // Not much worth combining here
116
157
  pub(super) fn new(
117
158
  basics: WorkflowBasics,
118
159
  sticky_attrs: Option<StickyExecutionAttributes>,
119
160
  client: Arc<dyn WorkerClient>,
120
161
  wft_stream: impl Stream<Item = Result<ValidPollWFTQResponse, tonic::Status>> + Send + 'static,
121
- local_activity_request_sink: impl Fn(Vec<LocalActRequest>) -> Vec<LocalActivityResolution>
122
- + Send
123
- + Sync
124
- + 'static,
162
+ local_activity_request_sink: impl LocalActivityRequestSink,
163
+ local_act_mgr: Arc<LocalActivityManager>,
164
+ heartbeat_timeout_rx: UnboundedReceiver<HeartbeatTimeoutMsg>,
125
165
  activity_tasks_handle: Option<ActivitiesFromWFTsHandle>,
166
+ telem_instance: Option<&TelemetryInstance>,
126
167
  ) -> Self {
127
168
  let (local_tx, local_rx) = unbounded_channel();
169
+ let (fetch_tx, fetch_rx) = unbounded_channel();
128
170
  let shutdown_tok = basics.shutdown_token.clone();
129
171
  let task_queue = basics.task_queue.clone();
130
- let mut stream = WFStream::build(
131
- basics,
172
+ let wft_semaphore = MeteredSemaphore::new(
173
+ basics.max_outstanding_wfts,
174
+ basics.metrics.with_new_attrs([workflow_worker_type()]),
175
+ MetricsContext::available_task_slots,
176
+ );
177
+ // Only allow polling of the new WFT stream if there are available task slots
178
+ let proceeder = stream::unfold(wft_semaphore.clone(), |sem| async move {
179
+ Some((sem.acquire_owned().await.unwrap(), sem))
180
+ });
181
+ let wft_stream = stream_when_allowed(wft_stream, proceeder);
182
+ let extracted_wft_stream = WFTExtractor::build(
183
+ client.clone(),
184
+ basics.fetching_concurrency,
132
185
  wft_stream,
186
+ UnboundedReceiverStream::new(fetch_rx),
187
+ );
188
+ let locals_stream = stream::select(
133
189
  UnboundedReceiverStream::new(local_rx),
134
- client.clone(),
135
- local_activity_request_sink,
190
+ UnboundedReceiverStream::new(heartbeat_timeout_rx).map(Into::into),
136
191
  );
137
192
  let (activation_tx, activation_rx) = unbounded_channel();
138
193
  let (start_polling_tx, start_polling_rx) = oneshot::channel();
139
194
  // We must spawn a task to constantly poll the activation stream, because otherwise
140
195
  // activation completions would not cause anything to happen until the next poll.
141
- let processing_task = task::spawn(async move {
142
- // However, we want to avoid plowing ahead until we've been asked to poll at least once.
143
- // This supports activity-only workers.
144
- let do_poll = tokio::select! {
145
- sp = start_polling_rx => {
146
- sp.is_ok()
196
+ let tracing_sub = telem_instance.map(|ti| ti.trace_subscriber());
197
+ let processing_task = thread::spawn(move || {
198
+ if let Some(ts) = tracing_sub {
199
+ set_trace_subscriber_for_current_thread(ts);
200
+ }
201
+ let rt = tokio::runtime::Builder::new_current_thread()
202
+ .enable_all()
203
+ .thread_name("workflow-processing")
204
+ .build()
205
+ .unwrap();
206
+ let local = LocalSet::new();
207
+ local.block_on(&rt, async move {
208
+ let mut stream = WFStream::build(
209
+ basics,
210
+ extracted_wft_stream,
211
+ locals_stream,
212
+ local_activity_request_sink,
213
+ );
214
+
215
+ // However, we want to avoid plowing ahead until we've been asked to poll at least
216
+ // once. This supports activity-only workers.
217
+ let do_poll = tokio::select! {
218
+ sp = start_polling_rx => {
219
+ sp.is_ok()
220
+ }
221
+ _ = shutdown_tok.cancelled() => {
222
+ false
223
+ }
224
+ };
225
+ if !do_poll {
226
+ return;
147
227
  }
148
- _ = shutdown_tok.cancelled() => {
149
- false
228
+ while let Some(output) = stream.next().await {
229
+ match output {
230
+ Ok(o) => {
231
+ for fetchreq in o.fetch_histories {
232
+ fetch_tx
233
+ .send(fetchreq)
234
+ .expect("Fetch channel must not be dropped");
235
+ }
236
+ for act in o.activations {
237
+ activation_tx
238
+ .send(Ok(act))
239
+ .expect("Activation processor channel not dropped");
240
+ }
241
+ }
242
+ Err(e) => activation_tx
243
+ .send(Err(e))
244
+ .expect("Activation processor channel not dropped"),
245
+ }
150
246
  }
151
- };
152
- if !do_poll {
153
- return;
154
- }
155
- while let Some(act) = stream.next().await {
156
- activation_tx
157
- .send(act)
158
- .expect("Activation processor channel not dropped");
159
- }
247
+ });
160
248
  });
161
249
  Self {
162
250
  task_queue,
@@ -169,12 +257,14 @@ impl Workflows {
169
257
  client,
170
258
  sticky_attrs,
171
259
  activity_tasks_handle,
260
+ wft_semaphore,
261
+ local_act_mgr,
172
262
  }
173
263
  }
174
264
 
175
- pub async fn next_workflow_activation(&self) -> Result<WorkflowActivation, PollWfError> {
265
+ pub(super) async fn next_workflow_activation(&self) -> Result<WorkflowActivation, PollWfError> {
176
266
  loop {
177
- let r = {
267
+ let al = {
178
268
  let mut lock = self.activation_stream.lock().await;
179
269
  let (ref mut stream, ref mut beginner) = lock.deref_mut();
180
270
  if let Some(beginner) = beginner.take() {
@@ -182,17 +272,37 @@ impl Workflows {
182
272
  }
183
273
  stream.next().await.unwrap_or(Err(PollWfError::ShutDown))?
184
274
  };
185
- Span::current().record("run_id", r.run_id());
186
- match r {
275
+ Span::current().record("run_id", al.run_id());
276
+ match al {
187
277
  ActivationOrAuto::LangActivation(act) | ActivationOrAuto::ReadyForQueries(act) => {
188
278
  debug!(activation=%act, "Sending activation to lang");
189
279
  break Ok(act);
190
280
  }
191
281
  ActivationOrAuto::Autocomplete { run_id } => {
192
- self.activation_completed(WorkflowActivationCompletion {
193
- run_id,
194
- status: Some(workflow_completion::Success::from_variants(vec![]).into()),
195
- })
282
+ self.activation_completed(
283
+ WorkflowActivationCompletion {
284
+ run_id,
285
+ status: Some(
286
+ workflow_completion::Success::from_variants(vec![]).into(),
287
+ ),
288
+ },
289
+ // We need to say a type, but the type is irrelevant, so imagine some
290
+ // boxed function we'll never call.
291
+ Option::<Box<dyn Fn(&str, usize) + Send>>::None,
292
+ )
293
+ .await?;
294
+ }
295
+ ActivationOrAuto::AutoFail {
296
+ run_id,
297
+ machines_err,
298
+ } => {
299
+ self.activation_completed(
300
+ WorkflowActivationCompletion {
301
+ run_id,
302
+ status: Some(auto_fail_to_complete_status(machines_err)),
303
+ },
304
+ Option::<Box<dyn Fn(&str, usize) + Send>>::None,
305
+ )
196
306
  .await?;
197
307
  }
198
308
  }
@@ -202,10 +312,11 @@ impl Workflows {
202
312
  /// Queue an activation completion for processing, returning a future that will resolve with
203
313
  /// the outcome of that completion. See [ActivationCompletedOutcome].
204
314
  ///
205
- /// Returns the most-recently-processed event number for the run
206
- pub async fn activation_completed(
315
+ /// Returns the most-recently-processed event number for the run.
316
+ pub(super) async fn activation_completed(
207
317
  &self,
208
318
  completion: WorkflowActivationCompletion,
319
+ post_activate_hook: Option<impl Fn(&str, usize)>,
209
320
  ) -> Result<usize, CompleteWfError> {
210
321
  let is_empty_completion = completion.is_empty();
211
322
  let completion = validate_completion(completion)?;
@@ -213,7 +324,7 @@ impl Workflows {
213
324
  let (tx, rx) = oneshot::channel();
214
325
  let was_sent = self.send_local(WFActCompleteMsg {
215
326
  completion,
216
- response_tx: tx,
327
+ response_tx: Some(tx),
217
328
  });
218
329
  if !was_sent {
219
330
  if is_empty_completion {
@@ -230,7 +341,7 @@ impl Workflows {
230
341
  .await
231
342
  .expect("Send half of activation complete response not dropped");
232
343
  let mut wft_from_complete = None;
233
- let reported_wft_to_server = match completion_outcome.outcome {
344
+ let wft_report_status = match completion_outcome.outcome {
234
345
  ActivationCompleteOutcome::ReportWFTSuccess(report) => match report {
235
346
  ServerCommandsWithWorkflowInfo {
236
347
  task_token,
@@ -239,6 +350,7 @@ impl Workflows {
239
350
  mut commands,
240
351
  query_responses,
241
352
  force_new_wft,
353
+ sdk_metadata,
242
354
  },
243
355
  } => {
244
356
  let reserved_act_permits =
@@ -252,6 +364,13 @@ impl Workflows {
252
364
  sticky_attributes: None,
253
365
  return_new_workflow_task: true,
254
366
  force_create_new_workflow_task: force_new_wft,
367
+ sdk_metadata,
368
+ metering_metadata: MeteringMetadata {
369
+ nonfirst_local_activity_execution_attempts: self
370
+ .local_act_mgr
371
+ .get_nonfirst_attempt_count(&run_id)
372
+ as u32,
373
+ },
255
374
  };
256
375
  let sticky_attrs = self.sticky_attrs.clone();
257
376
  // Do not return new WFT if we would not cache, because returned new WFTs are
@@ -273,14 +392,14 @@ impl Workflows {
273
392
  Ok(())
274
393
  })
275
394
  .await;
276
- true
395
+ WFTReportStatus::Reported
277
396
  }
278
397
  ServerCommandsWithWorkflowInfo {
279
398
  task_token,
280
399
  action: ActivationAction::RespondLegacyQuery { result },
281
400
  } => {
282
401
  self.respond_legacy_query(task_token, *result).await;
283
- true
402
+ WFTReportStatus::Reported
284
403
  }
285
404
  },
286
405
  ActivationCompleteOutcome::ReportWFTFail(outcome) => match outcome {
@@ -292,29 +411,54 @@ impl Workflows {
292
411
  .await
293
412
  })
294
413
  .await;
295
- true
414
+ WFTReportStatus::Reported
296
415
  }
297
416
  FailedActivationWFTReport::ReportLegacyQueryFailure(task_token, failure) => {
298
417
  warn!(run_id=%run_id, failure=?failure, "Failing legacy query request");
299
418
  self.respond_legacy_query(task_token, legacy_query_failure(failure))
300
419
  .await;
301
- true
420
+ WFTReportStatus::Reported
302
421
  }
303
422
  },
304
- ActivationCompleteOutcome::DoNothing => false,
423
+ ActivationCompleteOutcome::WFTFailedDontReport => WFTReportStatus::DropWft,
424
+ ActivationCompleteOutcome::DoNothing => WFTReportStatus::NotReported,
425
+ };
426
+
427
+ let maybe_pwft = if let Some(wft) = wft_from_complete {
428
+ match HistoryPaginator::from_poll(wft, self.client.clone()).await {
429
+ Ok((paginator, pwft)) => Some((pwft, paginator)),
430
+ Err(e) => {
431
+ self.request_eviction(
432
+ &run_id,
433
+ format!("Failed to paginate workflow task from completion: {e:?}"),
434
+ EvictionReason::Fatal,
435
+ );
436
+ None
437
+ }
438
+ }
439
+ } else {
440
+ None
305
441
  };
306
442
 
443
+ if let Some(h) = post_activate_hook {
444
+ h(&run_id, completion_outcome.most_recently_processed_event);
445
+ }
446
+
307
447
  self.post_activation(PostActivationMsg {
308
448
  run_id,
309
- reported_wft_to_server,
310
- wft_from_complete,
449
+ wft_report_status,
450
+ wft_from_complete: maybe_pwft,
311
451
  });
312
452
 
313
453
  Ok(completion_outcome.most_recently_processed_event)
314
454
  }
315
455
 
316
456
  /// Tell workflow that a local activity has finished with the provided result
317
- pub fn notify_of_local_result(&self, run_id: impl Into<String>, resolved: LocalResolution) {
457
+ pub(super) fn notify_of_local_result(
458
+ &self,
459
+ run_id: impl Into<String>,
460
+ resolved: LocalResolution,
461
+ ) {
318
462
  self.send_local(LocalResolutionMsg {
319
463
  run_id: run_id.into(),
320
464
  res: resolved,
@@ -322,7 +466,7 @@ impl Workflows {
322
466
  }
323
467
 
324
468
  /// Request eviction of a workflow
325
- pub fn request_eviction(
469
+ pub(super) fn request_eviction(
326
470
  &self,
327
471
  run_id: impl Into<String>,
328
472
  message: impl Into<String>,
@@ -336,22 +480,39 @@ impl Workflows {
336
480
  }
337
481
 
338
482
  /// Query the state of workflow management. Can return `None` if workflow state is shut down.
339
- pub fn get_state_info(&self) -> impl Future<Output = Option<WorkflowStateInfo>> {
483
+ pub(super) fn get_state_info(&self) -> impl Future<Output = Option<WorkflowStateInfo>> {
340
484
  let (tx, rx) = oneshot::channel();
341
485
  self.send_local(GetStateInfoMsg { response_tx: tx });
342
486
  async move { rx.await.ok() }
343
487
  }
344
488
 
345
- pub async fn shutdown(&self) -> Result<(), JoinError> {
489
+ pub(super) fn available_wft_permits(&self) -> usize {
490
+ self.wft_semaphore.available_permits()
491
+ }
492
+
493
+ pub(super) async fn shutdown(&self) -> Result<(), anyhow::Error> {
346
494
  let maybe_jh = self.processing_task.lock().await.take();
347
495
  if let Some(jh) = maybe_jh {
348
- // This acts as a final wake up in case the stream is still alive and wouldn't otherwise
349
- // receive another message. It allows it to shut itself down.
350
- let _ = self.get_state_info();
351
- jh.await
352
- } else {
353
- Ok(())
496
+ // This serves to drive the stream if it is still alive and wouldn't otherwise receive
497
+ // another message. It allows it to shut itself down.
498
+ let (waker, stop_waker) = abortable(async {
499
+ let mut interval = tokio::time::interval(Duration::from_millis(10));
500
+ loop {
501
+ interval.tick().await;
502
+ let _ = self.get_state_info().await;
503
+ }
504
+ });
505
+ let (_, jh_res) = tokio::join!(
506
+ waker,
507
+ spawn_blocking(move || {
508
+ let r = jh.join();
509
+ stop_waker.abort();
510
+ r
511
+ })
512
+ );
513
+ jh_res?.map_err(|e| anyhow!("Error joining workflow processing thread: {e:?}"))?;
354
514
  }
515
+ Ok(())
355
516
  }
356
517
 
357
518
  /// Must be called after every activation completion has finished
@@ -393,7 +554,11 @@ impl Workflows {
393
554
  /// successfully.
394
555
  fn send_local(&self, msg: impl Into<LocalInputs>) -> bool {
395
556
  let msg = msg.into();
396
- let print_err = !matches!(msg, LocalInputs::GetStateInfo(_));
557
+ let print_err = match &msg {
558
+ LocalInputs::GetStateInfo(_) => false,
559
+ LocalInputs::LocalResolution(lr) if lr.res.is_la_cancel_confirmation() => false,
560
+ _ => true,
561
+ };
397
562
  if let Err(e) = self.local_tx.send(LocalInput {
398
563
  input: msg,
399
564
  span: Span::current(),
@@ -414,7 +579,7 @@ impl Workflows {
414
579
  /// Process eagerly returned activities from WFT completion
415
580
  fn handle_eager_activities(
416
581
  &self,
417
- reserved_act_permits: Vec<OwnedMeteredSemPermit>,
582
+ reserved_act_permits: Vec<TrackedOwnedMeteredSemPermit>,
418
583
  eager_acts: Vec<PollActivityTaskQueueResponse>,
419
584
  ) {
420
585
  if let Some(at_handle) = self.activity_tasks_handle.as_ref() {
@@ -435,7 +600,7 @@ impl Workflows {
435
600
  let with_permits = reserved_act_permits
436
601
  .into_iter()
437
602
  .zip(eager_acts.into_iter())
438
- .map(|(permit, resp)| PermittedTqResp { permit, resp });
603
+ .map(|(permit, resp)| TrackedPermittedTqResp { permit, resp });
439
604
  if with_permits.len() > 0 {
440
605
  debug!(
441
606
  "Adding {} activity tasks received from WFT complete",
@@ -458,7 +623,7 @@ impl Workflows {
458
623
  fn reserve_activity_slots_for_outgoing_commands(
459
624
  &self,
460
625
  commands: &mut [Command],
461
- ) -> Vec<OwnedMeteredSemPermit> {
626
+ ) -> Vec<TrackedOwnedMeteredSemPermit> {
462
627
  let mut reserved = vec![];
463
628
  for cmd in commands {
464
629
  if let Some(Attributes::ScheduleActivityTaskCommandAttributes(attrs)) =
@@ -509,186 +674,30 @@ impl Workflows {
509
674
  }
510
675
  }
511
676
 
512
- /// Manages access to a specific workflow run, and contains various bookkeeping information that the
513
- /// [WFStream] may need to access quickly.
514
- #[derive(derive_more::DebugCustom)]
515
- #[debug(
516
- fmt = "ManagedRunHandle {{ wft: {:?}, activation: {:?}, buffered_resp: {:?} \
517
- have_seen_terminal_event: {}, most_recently_processed_event: {}, more_pending_work: {}, \
518
- trying_to_evict: {}, last_action_acked: {} }}",
519
- wft,
520
- activation,
521
- buffered_resp,
522
- have_seen_terminal_event,
523
- most_recently_processed_event_number,
524
- more_pending_work,
525
- "trying_to_evict.is_some()",
526
- last_action_acked
677
+ /// Returned when a cache miss happens and we need to fetch history from the beginning to
678
+ /// replay a run
679
+ #[derive(Debug, derive_more::Display)]
680
+ #[display(
681
+ fmt = "CacheMissFetchReq(run_id: {})",
682
+ "original_wft.work.execution.run_id"
527
683
  )]
528
- struct ManagedRunHandle {
529
- /// If set, the WFT this run is currently/will be processing.
530
- wft: Option<OutstandingTask>,
531
- /// An outstanding activation to lang
532
- activation: Option<OutstandingActivation>,
533
- /// If set, it indicates there is a buffered poll response from the server that applies to this
534
- /// run. This can happen when lang takes too long to complete a task and the task times out, for
535
- /// example. Upon next completion, the buffered response will be removed and can be made ready
536
- /// to be returned from polling
537
- buffered_resp: Option<PermittedWFT>,
538
- /// True if this machine has seen an event which ends the execution
539
- have_seen_terminal_event: bool,
540
- /// The most recently processed event id this machine has seen. 0 means it has seen nothing.
541
- most_recently_processed_event_number: usize,
542
- /// Is set true when the machines indicate that there is additional known work to be processed
543
- more_pending_work: bool,
544
- /// Is set if an eviction has been requested for this run
545
- trying_to_evict: Option<RequestEvictMsg>,
546
- /// Set to true if the last action we tried to take to this run has been processed (ie: the
547
- /// [RunUpdateResponse] for it has been seen.
548
- last_action_acked: bool,
549
- /// For sending work to the machines
550
- run_actions_tx: UnboundedSender<RunAction>,
551
- /// Handle to the task where the actual machines live
552
- handle: JoinHandle<()>,
553
-
554
- /// We track if we have recorded useful debugging values onto a certain span yet, to overcome
555
- /// duplicating field values. Remove this once https://github.com/tokio-rs/tracing/issues/2334
556
- /// is fixed.
557
- recorded_span_ids: HashSet<tracing::Id>,
558
- metrics: MetricsContext,
684
+ #[must_use]
685
+ struct CacheMissFetchReq {
686
+ original_wft: PermittedWFT,
687
+ }
688
+ /// Bubbled up from inside workflow state if we're trying to apply the next workflow task but it
689
+ /// isn't in memory
690
+ #[derive(Debug)]
691
+ #[must_use]
692
+ struct NextPageReq {
693
+ paginator: HistoryPaginator,
694
+ span: Span,
559
695
  }
560
- impl ManagedRunHandle {
561
- fn new(
562
- wfm: WorkflowManager,
563
- activations_tx: UnboundedSender<RunUpdateResponse>,
564
- local_activity_request_sink: LocalActivityRequestSink,
565
- metrics: MetricsContext,
566
- ) -> Self {
567
- let (run_actions_tx, run_actions_rx) = unbounded_channel();
568
- let managed = ManagedRun::new(wfm, activations_tx, local_activity_request_sink);
569
- let handle = tokio::task::spawn(managed.run(run_actions_rx));
570
- Self {
571
- wft: None,
572
- activation: None,
573
- buffered_resp: None,
574
- have_seen_terminal_event: false,
575
- most_recently_processed_event_number: 0,
576
- more_pending_work: false,
577
- trying_to_evict: None,
578
- last_action_acked: true,
579
- run_actions_tx,
580
- handle,
581
- recorded_span_ids: Default::default(),
582
- metrics,
583
- }
584
- }
585
-
586
- fn incoming_wft(&mut self, wft: NewIncomingWFT) {
587
- if self.wft.is_some() {
588
- error!("Trying to send a new WFT for a run which already has one!");
589
- }
590
- self.send_run_action(RunActions::NewIncomingWFT(wft));
591
- }
592
- fn check_more_activations(&mut self) {
593
- // No point in checking for more activations if we have not acked the last update, or
594
- // if there's already an outstanding activation.
595
- if self.last_action_acked && self.activation.is_none() {
596
- self.send_run_action(RunActions::CheckMoreWork {
597
- want_to_evict: self.trying_to_evict.clone(),
598
- has_pending_queries: self
599
- .wft
600
- .as_ref()
601
- .map(|wft| !wft.pending_queries.is_empty())
602
- .unwrap_or_default(),
603
- has_wft: self.wft.is_some(),
604
- });
605
- }
606
- }
607
- fn send_completion(&mut self, c: RunActivationCompletion) {
608
- self.send_run_action(RunActions::ActivationCompletion(c));
609
- }
610
- fn send_local_resolution(&mut self, r: LocalResolution) {
611
- self.send_run_action(RunActions::LocalResolution(r));
612
- }
613
-
614
- fn insert_outstanding_activation(&mut self, act: &ActivationOrAuto) {
615
- let act_type = match &act {
616
- ActivationOrAuto::LangActivation(act) | ActivationOrAuto::ReadyForQueries(act) => {
617
- if act.is_legacy_query() {
618
- OutstandingActivation::LegacyQuery
619
- } else {
620
- OutstandingActivation::Normal {
621
- contains_eviction: act.eviction_index().is_some(),
622
- num_jobs: act.jobs.len(),
623
- }
624
- }
625
- }
626
- ActivationOrAuto::Autocomplete { .. } => OutstandingActivation::Autocomplete,
627
- };
628
- if let Some(old_act) = self.activation {
629
- // This is a panic because we have screwed up core logic if this is violated. It must be
630
- // upheld.
631
- panic!(
632
- "Attempted to insert a new outstanding activation {:?}, but there already was \
633
- one outstanding: {:?}",
634
- act, old_act
635
- );
636
- }
637
- self.activation = Some(act_type);
638
- }
639
-
640
- fn send_run_action(&mut self, action: RunActions) {
641
- self.last_action_acked = false;
642
- self.run_actions_tx
643
- .send(RunAction {
644
- action,
645
- trace_span: Span::current(),
646
- })
647
- .expect("Receive half of run actions not dropped");
648
- }
649
-
650
- /// Returns true if the managed run has any form of pending work
651
- /// If `ignore_evicts` is true, pending evictions do not count as pending work.
652
- /// If `ignore_buffered` is true, buffered workflow tasks do not count as pending work.
653
- fn has_any_pending_work(&self, ignore_evicts: bool, ignore_buffered: bool) -> bool {
654
- let evict_work = if ignore_evicts {
655
- false
656
- } else {
657
- self.trying_to_evict.is_some()
658
- };
659
- let act_work = if ignore_evicts {
660
- if let Some(ref act) = self.activation {
661
- !act.has_only_eviction()
662
- } else {
663
- false
664
- }
665
- } else {
666
- self.activation.is_some()
667
- };
668
- let buffered = if ignore_buffered {
669
- false
670
- } else {
671
- self.buffered_resp.is_some()
672
- };
673
- self.wft.is_some()
674
- || buffered
675
- || !self.last_action_acked
676
- || self.more_pending_work
677
- || act_work
678
- || evict_work
679
- }
680
696
 
681
- /// Returns true if the handle is currently processing a WFT which contains a legacy query.
682
- fn pending_work_is_legacy_query(&self) -> bool {
683
- // Either we know because there is a pending legacy query, or it's already been drained and
684
- // sent as an activation.
685
- matches!(self.activation, Some(OutstandingActivation::LegacyQuery))
686
- || self
687
- .wft
688
- .as_ref()
689
- .map(|t| t.has_pending_legacy_query())
690
- .unwrap_or_default()
691
- }
697
+ #[derive(Debug)]
698
+ struct WFStreamOutput {
699
+ activations: VecDeque<ActivationOrAuto>,
700
+ fetch_histories: VecDeque<HistoryFetchReq>,
692
701
  }
693
702
 
694
703
  #[derive(Debug, derive_more::Display)]
@@ -697,9 +706,15 @@ enum ActivationOrAuto {
697
706
  /// This type should only be filled with an empty activation which is ready to have queries
698
707
  /// inserted into the joblist
699
708
  ReadyForQueries(WorkflowActivation),
709
+ #[display(fmt = "Autocomplete(run_id={run_id})")]
700
710
  Autocomplete {
701
711
  run_id: String,
702
712
  },
713
+ #[display(fmt = "AutoFail(run_id={run_id})")]
714
+ AutoFail {
715
+ run_id: String,
716
+ machines_err: WFMachinesError,
717
+ },
703
718
  }
704
719
  impl ActivationOrAuto {
705
720
  pub fn run_id(&self) -> &str {
@@ -707,15 +722,53 @@ impl ActivationOrAuto {
707
722
  ActivationOrAuto::LangActivation(act) => &act.run_id,
708
723
  ActivationOrAuto::Autocomplete { run_id, .. } => run_id,
709
724
  ActivationOrAuto::ReadyForQueries(act) => &act.run_id,
725
+ ActivationOrAuto::AutoFail { run_id, .. } => run_id,
710
726
  }
711
727
  }
712
728
  }
713
729
 
730
+ /// A processed WFT which has been validated and had a history update extracted from it
714
731
  #[derive(derive_more::DebugCustom)]
715
- #[debug(fmt = "PermittedWft {{ {:?} }}", wft)]
732
+ #[cfg_attr(
733
+ feature = "save_wf_inputs",
734
+ derive(serde::Serialize, serde::Deserialize)
735
+ )]
736
+ #[debug(fmt = "PermittedWft({work:?})")]
716
737
  pub(crate) struct PermittedWFT {
717
- wft: ValidPollWFTQResponse,
718
- permit: OwnedMeteredSemPermit,
738
+ work: PreparedWFT,
739
+ #[cfg_attr(
740
+ feature = "save_wf_inputs",
741
+ serde(skip, default = "UsedMeteredSemPermit::fake_deserialized")
742
+ )]
743
+ permit: UsedMeteredSemPermit,
744
+ #[cfg_attr(
745
+ feature = "save_wf_inputs",
746
+ serde(skip, default = "HistoryPaginator::fake_deserialized")
747
+ )]
748
+ paginator: HistoryPaginator,
749
+ }
750
+ #[derive(Debug)]
751
+ #[cfg_attr(
752
+ feature = "save_wf_inputs",
753
+ derive(serde::Serialize, serde::Deserialize)
754
+ )]
755
+ struct PreparedWFT {
756
+ task_token: TaskToken,
757
+ attempt: u32,
758
+ execution: WorkflowExecution,
759
+ workflow_type: String,
760
+ legacy_query: Option<WorkflowQuery>,
761
+ query_requests: Vec<QueryWorkflow>,
762
+ update: HistoryUpdate,
763
+ }
764
+ impl PreparedWFT {
765
+ /// Returns true if the contained history update is incremental (IE: expects to hit a cached
766
+ /// workflow)
767
+ pub fn is_incremental(&self) -> bool {
768
+ let start_event_id = self.update.first_event_id();
769
+ let poll_resp_is_incremental = start_event_id.map(|eid| eid > 1).unwrap_or_default();
770
+ poll_resp_is_incremental || start_event_id.is_none()
771
+ }
719
772
  }
720
773
 
721
774
  #[derive(Debug)]
@@ -727,7 +780,7 @@ pub(crate) struct OutstandingTask {
727
780
  pub start_time: Instant,
728
781
  /// The WFT permit owned by this task, ensures we don't exceed max concurrent WFT, and makes
729
782
  /// sure the permit is automatically freed when we delete the task.
730
- pub permit: OwnedMeteredSemPermit,
783
+ pub permit: UsedMeteredSemPermit,
731
784
  }
732
785
 
733
786
  impl OutstandingTask {
@@ -806,49 +859,80 @@ pub(crate) enum ActivationAction {
806
859
  commands: Vec<ProtoCommand>,
807
860
  query_responses: Vec<QueryResult>,
808
861
  force_new_wft: bool,
862
+ sdk_metadata: WorkflowTaskCompletedMetadata,
809
863
  },
810
864
  /// We should respond to a legacy query request
811
865
  RespondLegacyQuery { result: Box<QueryResult> },
812
866
  }
813
867
 
814
- #[derive(Debug, Eq, PartialEq, Hash)]
815
- pub(crate) enum EvictionRequestResult {
816
- EvictionRequested(Option<u32>),
868
+ #[derive(Debug)]
869
+ enum EvictionRequestResult {
870
+ EvictionRequested(Option<u32>, RunUpdateAct),
817
871
  NotFound,
818
872
  EvictionAlreadyRequested(Option<u32>),
819
873
  }
874
+ impl EvictionRequestResult {
875
+ fn into_run_update_resp(self) -> RunUpdateAct {
876
+ match self {
877
+ EvictionRequestResult::EvictionRequested(_, resp) => resp,
878
+ EvictionRequestResult::NotFound
879
+ | EvictionRequestResult::EvictionAlreadyRequested(_) => None,
880
+ }
881
+ }
882
+ }
820
883
 
821
884
  #[derive(Debug)]
822
885
  #[allow(dead_code)] // Not always used in non-test
823
886
  pub(crate) struct WorkflowStateInfo {
824
887
  pub cached_workflows: usize,
825
888
  pub outstanding_wft: usize,
826
- pub available_wft_permits: usize,
827
889
  }
828
890
 
829
891
  #[derive(Debug)]
892
+ #[cfg_attr(
893
+ feature = "save_wf_inputs",
894
+ derive(serde::Serialize, serde::Deserialize)
895
+ )]
830
896
  struct WFActCompleteMsg {
831
897
  completion: ValidatedCompletion,
832
- response_tx: oneshot::Sender<ActivationCompleteResult>,
898
+ #[cfg_attr(feature = "save_wf_inputs", serde(skip))]
899
+ response_tx: Option<oneshot::Sender<ActivationCompleteResult>>,
833
900
  }
834
901
  #[derive(Debug)]
902
+ #[cfg_attr(
903
+ feature = "save_wf_inputs",
904
+ derive(serde::Serialize, serde::Deserialize)
905
+ )]
835
906
  struct LocalResolutionMsg {
836
907
  run_id: String,
837
908
  res: LocalResolution,
838
909
  }
839
910
  #[derive(Debug)]
911
+ #[cfg_attr(
912
+ feature = "save_wf_inputs",
913
+ derive(serde::Serialize, serde::Deserialize)
914
+ )]
840
915
  struct PostActivationMsg {
841
916
  run_id: String,
842
- reported_wft_to_server: bool,
843
- wft_from_complete: Option<ValidPollWFTQResponse>,
917
+ wft_report_status: WFTReportStatus,
918
+ wft_from_complete: Option<(PreparedWFT, HistoryPaginator)>,
844
919
  }
845
920
  #[derive(Debug, Clone)]
921
+ #[cfg_attr(
922
+ feature = "save_wf_inputs",
923
+ derive(serde::Serialize, serde::Deserialize)
924
+ )]
846
925
  struct RequestEvictMsg {
847
926
  run_id: String,
848
927
  message: String,
849
928
  reason: EvictionReason,
850
929
  }
851
930
  #[derive(Debug)]
931
+ pub(crate) struct HeartbeatTimeoutMsg {
932
+ pub(crate) run_id: String,
933
+ pub(crate) span: Span,
934
+ }
935
+ #[derive(Debug)]
852
936
  struct GetStateInfoMsg {
853
937
  response_tx: oneshot::Sender<WorkflowStateInfo>,
854
938
  }
@@ -869,16 +953,24 @@ enum ActivationCompleteOutcome {
869
953
  ReportWFTFail(FailedActivationWFTReport),
870
954
  /// There's nothing to do right now. EX: The workflow needs to keep replaying.
871
955
  DoNothing,
956
+ /// The workflow task failed, but we shouldn't report it. EX: We have failed 2 or more attempts
957
+ /// in a row.
958
+ WFTFailedDontReport,
872
959
  }
873
- #[derive(Debug)]
874
- struct FulfillableActivationComplete {
875
- result: ActivationCompleteResult,
876
- resp_chan: oneshot::Sender<ActivationCompleteResult>,
877
- }
878
- impl FulfillableActivationComplete {
879
- fn fulfill(self) {
880
- let _ = self.resp_chan.send(self.result);
881
- }
960
+ /// Did we report, or not, completion of a WFT to server?
961
+ #[derive(Debug, Copy, Clone)]
962
+ #[cfg_attr(
963
+ feature = "save_wf_inputs",
964
+ derive(serde::Serialize, serde::Deserialize)
965
+ )]
966
+ enum WFTReportStatus {
967
+ Reported,
968
+ /// The WFT completion was not reported when finishing the activation, because there's still
969
+ /// work to be done. EX: Running LAs.
970
+ NotReported,
971
+ /// We didn't report, but we want to clear the outstanding workflow task anyway. See
972
+ /// [ActivationCompleteOutcome::WFTFailedDontReport]
973
+ DropWft,
882
974
  }
883
975
 
884
976
  fn validate_completion(
@@ -908,8 +1000,7 @@ fn validate_completion(
908
1000
  reason: format!(
909
1001
  "Workflow completion had a legacy query response along with other \
910
1002
  commands. This is not allowed and constitutes an error in the \
911
- lang SDK. Commands: {:?}",
912
- commands
1003
+ lang SDK. Commands: {commands:?}"
913
1004
  ),
914
1005
  run_id: completion.run_id,
915
1006
  });
@@ -918,6 +1009,7 @@ fn validate_completion(
918
1009
  Ok(ValidatedCompletion::Success {
919
1010
  run_id: completion.run_id,
920
1011
  commands,
1012
+ used_flags: success.used_internal_flags,
921
1013
  })
922
1014
  }
923
1015
  Some(workflow_activation_completion::Status::Failed(failure)) => {
@@ -934,11 +1026,16 @@ fn validate_completion(
934
1026
  }
935
1027
 
936
1028
  #[derive(Debug)]
1029
+ #[cfg_attr(
1030
+ feature = "save_wf_inputs",
1031
+ derive(serde::Serialize, serde::Deserialize)
1032
+ )]
937
1033
  #[allow(clippy::large_enum_variant)]
938
1034
  enum ValidatedCompletion {
939
1035
  Success {
940
1036
  run_id: String,
941
1037
  commands: Vec<WFCommand>,
1038
+ used_flags: Vec<u32>,
942
1039
  },
943
1040
  Fail {
944
1041
  run_id: String,
@@ -955,112 +1052,6 @@ impl ValidatedCompletion {
955
1052
  }
956
1053
  }
957
1054
 
958
- /// Input to run tasks, sent to [ManagedRun]s via [ManagedRunHandle]s
959
- #[derive(Debug)]
960
- struct RunAction {
961
- action: RunActions,
962
- trace_span: Span,
963
- }
964
- #[derive(Debug)]
965
- #[allow(clippy::large_enum_variant)]
966
- enum RunActions {
967
- NewIncomingWFT(NewIncomingWFT),
968
- ActivationCompletion(RunActivationCompletion),
969
- CheckMoreWork {
970
- want_to_evict: Option<RequestEvictMsg>,
971
- has_pending_queries: bool,
972
- has_wft: bool,
973
- },
974
- LocalResolution(LocalResolution),
975
- HeartbeatTimeout,
976
- }
977
- #[derive(Debug)]
978
- struct NewIncomingWFT {
979
- /// This field is only populated if the machines already exist. Otherwise the machines
980
- /// are instantiated with the workflow history.
981
- history_update: Option<HistoryUpdate>,
982
- /// Wft start time
983
- start_time: Instant,
984
- }
985
- #[derive(Debug)]
986
- struct RunActivationCompletion {
987
- task_token: TaskToken,
988
- start_time: Instant,
989
- commands: Vec<WFCommand>,
990
- activation_was_eviction: bool,
991
- activation_was_only_eviction: bool,
992
- has_pending_query: bool,
993
- query_responses: Vec<QueryResult>,
994
- /// Used to notify the worker when the completion is done processing and the completion can
995
- /// unblock. Must always be `Some` when initialized.
996
- resp_chan: Option<oneshot::Sender<ActivationCompleteResult>>,
997
- }
998
-
999
- /// A response from a [ManagedRun] held by a [ManagedRunHandle]
1000
- #[derive(Debug)]
1001
- struct RunUpdateResponse {
1002
- kind: RunUpdateResponseKind,
1003
- span: Span,
1004
- }
1005
- #[derive(Debug, derive_more::Display)]
1006
- #[allow(clippy::large_enum_variant)]
1007
- enum RunUpdateResponseKind {
1008
- Good(GoodRunUpdate),
1009
- Fail(FailRunUpdate),
1010
- }
1011
- impl RunUpdateResponseKind {
1012
- pub(crate) fn run_id(&self) -> &str {
1013
- match self {
1014
- RunUpdateResponseKind::Good(g) => &g.run_id,
1015
- RunUpdateResponseKind::Fail(f) => &f.run_id,
1016
- }
1017
- }
1018
- }
1019
-
1020
- #[derive(Debug)]
1021
- struct GoodRunUpdate {
1022
- run_id: String,
1023
- outgoing_activation: Option<ActivationOrAuto>,
1024
- fulfillable_complete: Option<FulfillableActivationComplete>,
1025
- have_seen_terminal_event: bool,
1026
- /// Is true if there are more jobs that need to be sent to lang
1027
- more_pending_work: bool,
1028
- most_recently_processed_event_number: usize,
1029
- /// Is true if this update was in response to a new WFT
1030
- in_response_to_wft: bool,
1031
- }
1032
- impl Display for GoodRunUpdate {
1033
- fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
1034
- write!(
1035
- f,
1036
- "GoodRunUpdate(run_id: {}, outgoing_activation: {}, more_pending_work: {})",
1037
- self.run_id,
1038
- if let Some(og) = self.outgoing_activation.as_ref() {
1039
- format!("{}", og)
1040
- } else {
1041
- "None".to_string()
1042
- },
1043
- self.more_pending_work
1044
- )
1045
- }
1046
- }
1047
- #[derive(Debug)]
1048
- pub(crate) struct FailRunUpdate {
1049
- run_id: String,
1050
- err: WFMachinesError,
1051
- /// This is populated if the run update failed while processing a completion - and thus we
1052
- /// must respond down it when handling the failure.
1053
- completion_resp: Option<oneshot::Sender<ActivationCompleteResult>>,
1054
- }
1055
- impl Display for FailRunUpdate {
1056
- fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
1057
- write!(
1058
- f,
1059
- "FailRunUpdate(run_id: {}, error: {:?})",
1060
- self.run_id, self.err
1061
- )
1062
- }
1063
- }
1064
1055
  #[derive(Debug)]
1065
1056
  pub struct OutgoingServerCommands {
1066
1057
  pub commands: Vec<ProtoCommand>,
@@ -1068,9 +1059,22 @@ pub struct OutgoingServerCommands {
1068
1059
  }
1069
1060
 
1070
1061
  #[derive(Debug)]
1062
+ #[cfg_attr(
1063
+ feature = "save_wf_inputs",
1064
+ derive(serde::Serialize, serde::Deserialize)
1065
+ )]
1071
1066
  pub(crate) enum LocalResolution {
1072
1067
  LocalActivity(LocalActivityResolution),
1073
1068
  }
1069
+ impl LocalResolution {
1070
+ pub fn is_la_cancel_confirmation(&self) -> bool {
1071
+ match self {
1072
+ LocalResolution::LocalActivity(lar) => {
1073
+ matches!(lar.result, LocalActivityExecutionResult::Cancelled(_))
1074
+ }
1075
+ }
1076
+ }
1077
+ }
1074
1078
 
1075
1079
  #[derive(thiserror::Error, Debug, derive_more::From)]
1076
1080
  #[error("Lang provided workflow command with empty variant")]
@@ -1079,6 +1083,10 @@ pub struct EmptyWorkflowCommandErr;
1079
1083
  /// [DrivenWorkflow]s respond with these when called, to indicate what they want to do next.
1080
1084
  /// EX: Create a new timer, complete the workflow, etc.
1081
1085
  #[derive(Debug, derive_more::From, derive_more::Display)]
1086
+ #[cfg_attr(
1087
+ feature = "save_wf_inputs",
1088
+ derive(serde::Serialize, serde::Deserialize)
1089
+ )]
1082
1090
  #[allow(clippy::large_enum_variant)]
1083
1091
  pub enum WFCommand {
1084
1092
  /// Returned when we need to wait for the lang sdk to send us something
@@ -1171,12 +1179,9 @@ pub struct WorkflowStartedInfo {
1171
1179
  retry_policy: Option<RetryPolicy>,
1172
1180
  }
1173
1181
 
1174
- type LocalActivityRequestSink =
1175
- Arc<dyn Fn(Vec<LocalActRequest>) -> Vec<LocalActivityResolution> + Send + Sync>;
1176
-
1177
1182
  /// Wraps outgoing activation job protos with some internal details core might care about
1178
1183
  #[derive(Debug, derive_more::Display)]
1179
- #[display(fmt = "{}", variant)]
1184
+ #[display(fmt = "{variant}")]
1180
1185
  struct OutgoingJob {
1181
1186
  variant: workflow_activation_job::Variant,
1182
1187
  /// Since LA resolutions are not distinguished from non-LA resolutions as far as lang is
@@ -1198,3 +1203,74 @@ impl From<OutgoingJob> for WorkflowActivationJob {
1198
1203
  }
1199
1204
  }
1200
1205
  }
1206
+
1207
+ /// Errors thrown inside of workflow machines
1208
+ #[derive(thiserror::Error, Debug)]
1209
+ pub(crate) enum WFMachinesError {
1210
+ #[error("Nondeterminism error: {0}")]
1211
+ Nondeterminism(String),
1212
+ #[error("Fatal error in workflow machines: {0}")]
1213
+ Fatal(String),
1214
+ }
1215
+
1216
+ impl WFMachinesError {
1217
+ pub fn evict_reason(&self) -> EvictionReason {
1218
+ match self {
1219
+ WFMachinesError::Nondeterminism(_) => EvictionReason::Nondeterminism,
1220
+ WFMachinesError::Fatal(_) => EvictionReason::Fatal,
1221
+ }
1222
+ }
1223
+ }
1224
+
1225
+ impl From<TimestampError> for WFMachinesError {
1226
+ fn from(_: TimestampError) -> Self {
1227
+ Self::Fatal("Could not decode timestamp".to_string())
1228
+ }
1229
+ }
1230
+
1231
+ fn auto_fail_to_complete_status(err: WFMachinesError) -> workflow_activation_completion::Status {
1232
+ workflow_activation_completion::Status::Failed(Failure {
1233
+ failure: Some(
1234
+ temporal_sdk_core_protos::temporal::api::failure::v1::Failure {
1235
+ message: "Error while processing workflow task".to_string(),
1236
+ source: err.to_string(),
1237
+ stack_trace: "".to_string(),
1238
+ encoded_attributes: None,
1239
+ cause: None,
1240
+ failure_info: None,
1241
+ },
1242
+ ),
1243
+ force_cause: WorkflowTaskFailedCause::from(err.evict_reason()) as i32,
1244
+ })
1245
+ }
1246
+
1247
+ pub(crate) trait LocalActivityRequestSink: Send + Sync + 'static {
1248
+ fn sink_reqs(&self, reqs: Vec<LocalActRequest>) -> Vec<LocalActivityResolution>;
1249
+ }
1250
+
1251
+ #[derive(derive_more::Constructor)]
1252
+ pub(super) struct LAReqSink {
1253
+ lam: Arc<LocalActivityManager>,
1254
+ /// If we're recording WF inputs, we also need to store immediate resolutions so they're
1255
+ /// available on replay.
1256
+ #[allow(dead_code)] // sometimes appears unused due to feature flagging
1257
+ recorder: Option<UnboundedSender<Vec<u8>>>,
1258
+ }
1259
+
1260
+ impl LocalActivityRequestSink for LAReqSink {
1261
+ fn sink_reqs(&self, reqs: Vec<LocalActRequest>) -> Vec<LocalActivityResolution> {
1262
+ if reqs.is_empty() {
1263
+ return vec![];
1264
+ }
1265
+
1266
+ #[allow(clippy::let_and_return)] // When feature is off clippy doesn't like this
1267
+ let res = self.lam.enqueue(reqs);
1268
+
1269
+ // We always save when there are any reqs, even if the response might be empty, so that
1270
+ // calls/responses are 1:1
1271
+ #[cfg(feature = "save_wf_inputs")]
1272
+ self.write_req(&res);
1273
+
1274
+ res
1275
+ }
1276
+ }