temporalio 0.0.2 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (202) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +25 -23
  3. data/bridge/Cargo.lock +168 -59
  4. data/bridge/Cargo.toml +4 -2
  5. data/bridge/sdk-core/README.md +19 -6
  6. data/bridge/sdk-core/client/src/lib.rs +215 -39
  7. data/bridge/sdk-core/client/src/metrics.rs +17 -8
  8. data/bridge/sdk-core/client/src/raw.rs +4 -4
  9. data/bridge/sdk-core/client/src/retry.rs +32 -20
  10. data/bridge/sdk-core/core/Cargo.toml +22 -9
  11. data/bridge/sdk-core/core/src/abstractions.rs +203 -14
  12. data/bridge/sdk-core/core/src/core_tests/activity_tasks.rs +76 -41
  13. data/bridge/sdk-core/core/src/core_tests/determinism.rs +165 -2
  14. data/bridge/sdk-core/core/src/core_tests/local_activities.rs +204 -83
  15. data/bridge/sdk-core/core/src/core_tests/queries.rs +3 -4
  16. data/bridge/sdk-core/core/src/core_tests/workers.rs +1 -3
  17. data/bridge/sdk-core/core/src/core_tests/workflow_tasks.rs +397 -54
  18. data/bridge/sdk-core/core/src/ephemeral_server/mod.rs +106 -12
  19. data/bridge/sdk-core/core/src/internal_flags.rs +136 -0
  20. data/bridge/sdk-core/core/src/lib.rs +16 -9
  21. data/bridge/sdk-core/core/src/telemetry/log_export.rs +1 -1
  22. data/bridge/sdk-core/core/src/telemetry/metrics.rs +69 -35
  23. data/bridge/sdk-core/core/src/telemetry/mod.rs +29 -13
  24. data/bridge/sdk-core/core/src/telemetry/prometheus_server.rs +17 -12
  25. data/bridge/sdk-core/core/src/test_help/mod.rs +62 -12
  26. data/bridge/sdk-core/core/src/worker/activities/activity_heartbeat_manager.rs +112 -156
  27. data/bridge/sdk-core/core/src/worker/activities/activity_task_poller_stream.rs +89 -0
  28. data/bridge/sdk-core/core/src/worker/activities/local_activities.rs +352 -122
  29. data/bridge/sdk-core/core/src/worker/activities.rs +233 -157
  30. data/bridge/sdk-core/core/src/worker/client/mocks.rs +22 -2
  31. data/bridge/sdk-core/core/src/worker/client.rs +18 -2
  32. data/bridge/sdk-core/core/src/worker/mod.rs +165 -58
  33. data/bridge/sdk-core/core/src/worker/workflow/bridge.rs +1 -3
  34. data/bridge/sdk-core/core/src/worker/workflow/driven_workflow.rs +3 -5
  35. data/bridge/sdk-core/core/src/worker/workflow/history_update.rs +856 -277
  36. data/bridge/sdk-core/core/src/worker/workflow/machines/activity_state_machine.rs +100 -43
  37. data/bridge/sdk-core/core/src/worker/workflow/machines/cancel_external_state_machine.rs +7 -7
  38. data/bridge/sdk-core/core/src/worker/workflow/machines/cancel_workflow_state_machine.rs +5 -4
  39. data/bridge/sdk-core/core/src/worker/workflow/machines/child_workflow_state_machine.rs +87 -27
  40. data/bridge/sdk-core/core/src/worker/workflow/machines/complete_workflow_state_machine.rs +5 -4
  41. data/bridge/sdk-core/core/src/worker/workflow/machines/continue_as_new_workflow_state_machine.rs +5 -4
  42. data/bridge/sdk-core/core/src/worker/workflow/machines/fail_workflow_state_machine.rs +5 -4
  43. data/bridge/sdk-core/core/src/worker/workflow/machines/local_activity_state_machine.rs +137 -62
  44. data/bridge/sdk-core/core/src/worker/workflow/machines/mod.rs +25 -17
  45. data/bridge/sdk-core/core/src/worker/workflow/machines/modify_workflow_properties_state_machine.rs +7 -6
  46. data/bridge/sdk-core/core/src/worker/workflow/machines/patch_state_machine.rs +103 -152
  47. data/bridge/sdk-core/core/src/worker/workflow/machines/signal_external_state_machine.rs +7 -7
  48. data/bridge/sdk-core/core/src/worker/workflow/machines/timer_state_machine.rs +9 -9
  49. data/bridge/sdk-core/core/src/worker/workflow/machines/transition_coverage.rs +2 -2
  50. data/bridge/sdk-core/core/src/worker/workflow/machines/upsert_search_attributes_state_machine.rs +14 -7
  51. data/bridge/sdk-core/core/src/worker/workflow/machines/workflow_machines/local_acts.rs +5 -16
  52. data/bridge/sdk-core/core/src/worker/workflow/machines/workflow_machines.rs +201 -121
  53. data/bridge/sdk-core/core/src/worker/workflow/machines/workflow_task_state_machine.rs +11 -14
  54. data/bridge/sdk-core/core/src/worker/workflow/managed_run/managed_wf_test.rs +30 -15
  55. data/bridge/sdk-core/core/src/worker/workflow/managed_run.rs +1026 -376
  56. data/bridge/sdk-core/core/src/worker/workflow/mod.rs +460 -384
  57. data/bridge/sdk-core/core/src/worker/workflow/run_cache.rs +40 -57
  58. data/bridge/sdk-core/core/src/worker/workflow/wft_extraction.rs +125 -0
  59. data/bridge/sdk-core/core/src/worker/workflow/wft_poller.rs +1 -4
  60. data/bridge/sdk-core/core/src/worker/workflow/workflow_stream/saved_wf_inputs.rs +117 -0
  61. data/bridge/sdk-core/core/src/worker/workflow/workflow_stream/tonic_status_serde.rs +24 -0
  62. data/bridge/sdk-core/core/src/worker/workflow/workflow_stream.rs +448 -718
  63. data/bridge/sdk-core/core-api/Cargo.toml +2 -1
  64. data/bridge/sdk-core/core-api/src/errors.rs +1 -34
  65. data/bridge/sdk-core/core-api/src/lib.rs +6 -2
  66. data/bridge/sdk-core/core-api/src/telemetry.rs +0 -6
  67. data/bridge/sdk-core/core-api/src/worker.rs +14 -1
  68. data/bridge/sdk-core/fsm/rustfsm_procmacro/src/lib.rs +18 -15
  69. data/bridge/sdk-core/fsm/rustfsm_trait/src/lib.rs +8 -3
  70. data/bridge/sdk-core/histories/evict_while_la_running_no_interference-16_history.bin +0 -0
  71. data/bridge/sdk-core/protos/api_upstream/temporal/api/command/v1/message.proto +5 -17
  72. data/bridge/sdk-core/protos/api_upstream/temporal/api/common/v1/message.proto +11 -0
  73. data/bridge/sdk-core/protos/api_upstream/temporal/api/enums/v1/command_type.proto +1 -6
  74. data/bridge/sdk-core/protos/api_upstream/temporal/api/enums/v1/event_type.proto +6 -6
  75. data/bridge/sdk-core/protos/api_upstream/temporal/api/enums/v1/failed_cause.proto +5 -0
  76. data/bridge/sdk-core/protos/api_upstream/temporal/api/enums/v1/update.proto +22 -6
  77. data/bridge/sdk-core/protos/api_upstream/temporal/api/history/v1/message.proto +48 -19
  78. data/bridge/sdk-core/protos/api_upstream/temporal/api/namespace/v1/message.proto +2 -0
  79. data/bridge/sdk-core/protos/api_upstream/temporal/api/operatorservice/v1/request_response.proto +3 -0
  80. data/bridge/sdk-core/protos/api_upstream/temporal/api/{enums/v1/interaction_type.proto → protocol/v1/message.proto} +29 -11
  81. data/bridge/sdk-core/protos/api_upstream/temporal/api/sdk/v1/task_complete_metadata.proto +63 -0
  82. data/bridge/sdk-core/protos/api_upstream/temporal/api/update/v1/message.proto +111 -0
  83. data/bridge/sdk-core/protos/api_upstream/temporal/api/workflowservice/v1/request_response.proto +59 -28
  84. data/bridge/sdk-core/protos/api_upstream/temporal/api/workflowservice/v1/service.proto +2 -2
  85. data/bridge/sdk-core/protos/local/temporal/sdk/core/activity_result/activity_result.proto +1 -0
  86. data/bridge/sdk-core/protos/local/temporal/sdk/core/activity_task/activity_task.proto +1 -0
  87. data/bridge/sdk-core/protos/local/temporal/sdk/core/child_workflow/child_workflow.proto +1 -0
  88. data/bridge/sdk-core/protos/local/temporal/sdk/core/common/common.proto +1 -0
  89. data/bridge/sdk-core/protos/local/temporal/sdk/core/core_interface.proto +1 -0
  90. data/bridge/sdk-core/protos/local/temporal/sdk/core/external_data/external_data.proto +1 -0
  91. data/bridge/sdk-core/protos/local/temporal/sdk/core/workflow_activation/workflow_activation.proto +7 -0
  92. data/bridge/sdk-core/protos/local/temporal/sdk/core/workflow_commands/workflow_commands.proto +1 -0
  93. data/bridge/sdk-core/protos/local/temporal/sdk/core/workflow_completion/workflow_completion.proto +6 -0
  94. data/bridge/sdk-core/sdk/Cargo.toml +3 -2
  95. data/bridge/sdk-core/sdk/src/lib.rs +87 -20
  96. data/bridge/sdk-core/sdk/src/workflow_future.rs +9 -8
  97. data/bridge/sdk-core/sdk-core-protos/Cargo.toml +5 -2
  98. data/bridge/sdk-core/sdk-core-protos/build.rs +36 -1
  99. data/bridge/sdk-core/sdk-core-protos/src/history_builder.rs +100 -87
  100. data/bridge/sdk-core/sdk-core-protos/src/history_info.rs +5 -1
  101. data/bridge/sdk-core/sdk-core-protos/src/lib.rs +175 -57
  102. data/bridge/sdk-core/sdk-core-protos/src/task_token.rs +12 -2
  103. data/bridge/sdk-core/test-utils/Cargo.toml +3 -1
  104. data/bridge/sdk-core/test-utils/src/canned_histories.rs +106 -296
  105. data/bridge/sdk-core/test-utils/src/histfetch.rs +1 -1
  106. data/bridge/sdk-core/test-utils/src/lib.rs +82 -23
  107. data/bridge/sdk-core/test-utils/src/wf_input_saver.rs +50 -0
  108. data/bridge/sdk-core/test-utils/src/workflows.rs +29 -0
  109. data/bridge/sdk-core/tests/fuzzy_workflow.rs +130 -0
  110. data/bridge/sdk-core/tests/{load_tests.rs → heavy_tests.rs} +125 -51
  111. data/bridge/sdk-core/tests/integ_tests/ephemeral_server_tests.rs +25 -3
  112. data/bridge/sdk-core/tests/integ_tests/heartbeat_tests.rs +5 -3
  113. data/bridge/sdk-core/tests/integ_tests/metrics_tests.rs +218 -16
  114. data/bridge/sdk-core/tests/integ_tests/polling_tests.rs +4 -47
  115. data/bridge/sdk-core/tests/integ_tests/queries_tests.rs +5 -128
  116. data/bridge/sdk-core/tests/integ_tests/visibility_tests.rs +83 -25
  117. data/bridge/sdk-core/tests/integ_tests/workflow_tests/activities.rs +93 -69
  118. data/bridge/sdk-core/tests/integ_tests/workflow_tests/cancel_external.rs +1 -0
  119. data/bridge/sdk-core/tests/integ_tests/workflow_tests/cancel_wf.rs +6 -13
  120. data/bridge/sdk-core/tests/integ_tests/workflow_tests/child_workflows.rs +1 -0
  121. data/bridge/sdk-core/tests/integ_tests/workflow_tests/continue_as_new.rs +6 -2
  122. data/bridge/sdk-core/tests/integ_tests/workflow_tests/determinism.rs +3 -10
  123. data/bridge/sdk-core/tests/integ_tests/workflow_tests/local_activities.rs +72 -191
  124. data/bridge/sdk-core/tests/integ_tests/workflow_tests/modify_wf_properties.rs +1 -0
  125. data/bridge/sdk-core/tests/integ_tests/workflow_tests/patches.rs +7 -28
  126. data/bridge/sdk-core/tests/integ_tests/workflow_tests/replay.rs +12 -7
  127. data/bridge/sdk-core/tests/integ_tests/workflow_tests/resets.rs +1 -0
  128. data/bridge/sdk-core/tests/integ_tests/workflow_tests/signals.rs +18 -14
  129. data/bridge/sdk-core/tests/integ_tests/workflow_tests/stickyness.rs +6 -20
  130. data/bridge/sdk-core/tests/integ_tests/workflow_tests/timers.rs +10 -21
  131. data/bridge/sdk-core/tests/integ_tests/workflow_tests/upsert_search_attrs.rs +6 -4
  132. data/bridge/sdk-core/tests/integ_tests/workflow_tests.rs +10 -11
  133. data/bridge/sdk-core/tests/main.rs +3 -13
  134. data/bridge/sdk-core/tests/runner.rs +75 -36
  135. data/bridge/sdk-core/tests/wf_input_replay.rs +32 -0
  136. data/bridge/src/connection.rs +41 -25
  137. data/bridge/src/lib.rs +269 -14
  138. data/bridge/src/runtime.rs +1 -1
  139. data/bridge/src/test_server.rs +153 -0
  140. data/bridge/src/worker.rs +89 -16
  141. data/lib/gen/temporal/api/command/v1/message_pb.rb +4 -18
  142. data/lib/gen/temporal/api/common/v1/message_pb.rb +4 -0
  143. data/lib/gen/temporal/api/enums/v1/command_type_pb.rb +1 -3
  144. data/lib/gen/temporal/api/enums/v1/event_type_pb.rb +3 -3
  145. data/lib/gen/temporal/api/enums/v1/failed_cause_pb.rb +2 -0
  146. data/lib/gen/temporal/api/enums/v1/update_pb.rb +6 -4
  147. data/lib/gen/temporal/api/history/v1/message_pb.rb +27 -19
  148. data/lib/gen/temporal/api/namespace/v1/message_pb.rb +1 -0
  149. data/lib/gen/temporal/api/operatorservice/v1/request_response_pb.rb +3 -0
  150. data/lib/gen/temporal/api/protocol/v1/message_pb.rb +30 -0
  151. data/lib/gen/temporal/api/sdk/v1/task_complete_metadata_pb.rb +23 -0
  152. data/lib/gen/temporal/api/testservice/v1/request_response_pb.rb +49 -0
  153. data/lib/gen/temporal/api/testservice/v1/service_pb.rb +21 -0
  154. data/lib/gen/temporal/api/update/v1/message_pb.rb +72 -0
  155. data/lib/gen/temporal/api/workflowservice/v1/request_response_pb.rb +26 -16
  156. data/lib/gen/temporal/sdk/core/activity_result/activity_result_pb.rb +13 -9
  157. data/lib/gen/temporal/sdk/core/activity_task/activity_task_pb.rb +10 -6
  158. data/lib/gen/temporal/sdk/core/child_workflow/child_workflow_pb.rb +13 -9
  159. data/lib/gen/temporal/sdk/core/common/common_pb.rb +7 -3
  160. data/lib/gen/temporal/sdk/core/core_interface_pb.rb +9 -3
  161. data/lib/gen/temporal/sdk/core/external_data/external_data_pb.rb +7 -3
  162. data/lib/gen/temporal/sdk/core/workflow_activation/workflow_activation_pb.rb +27 -21
  163. data/lib/gen/temporal/sdk/core/workflow_commands/workflow_commands_pb.rb +28 -24
  164. data/lib/gen/temporal/sdk/core/workflow_completion/workflow_completion_pb.rb +12 -5
  165. data/lib/temporalio/activity/context.rb +13 -8
  166. data/lib/temporalio/activity/info.rb +1 -1
  167. data/lib/temporalio/bridge/connect_options.rb +15 -0
  168. data/lib/temporalio/bridge/retry_config.rb +24 -0
  169. data/lib/temporalio/bridge/tls_options.rb +19 -0
  170. data/lib/temporalio/client/implementation.rb +8 -8
  171. data/lib/temporalio/connection/retry_config.rb +44 -0
  172. data/lib/temporalio/connection/service.rb +20 -0
  173. data/lib/temporalio/connection/test_service.rb +92 -0
  174. data/lib/temporalio/connection/tls_options.rb +51 -0
  175. data/lib/temporalio/connection/workflow_service.rb +731 -0
  176. data/lib/temporalio/connection.rb +55 -720
  177. data/lib/temporalio/interceptor/activity_inbound.rb +22 -0
  178. data/lib/temporalio/interceptor/activity_outbound.rb +24 -0
  179. data/lib/temporalio/interceptor/chain.rb +5 -5
  180. data/lib/temporalio/interceptor/client.rb +8 -4
  181. data/lib/temporalio/interceptor.rb +22 -0
  182. data/lib/temporalio/retry_policy.rb +13 -3
  183. data/lib/temporalio/testing/time_skipping_handle.rb +32 -0
  184. data/lib/temporalio/testing/time_skipping_interceptor.rb +23 -0
  185. data/lib/temporalio/testing/workflow_environment.rb +112 -0
  186. data/lib/temporalio/testing.rb +175 -0
  187. data/lib/temporalio/version.rb +1 -1
  188. data/lib/temporalio/worker/activity_runner.rb +26 -4
  189. data/lib/temporalio/worker/activity_worker.rb +44 -18
  190. data/lib/temporalio/worker/sync_worker.rb +47 -11
  191. data/lib/temporalio/worker.rb +27 -21
  192. data/lib/temporalio/workflow/async.rb +46 -0
  193. data/lib/temporalio/workflow/future.rb +138 -0
  194. data/lib/temporalio/workflow/info.rb +76 -0
  195. data/temporalio.gemspec +4 -3
  196. metadata +67 -17
  197. data/bridge/sdk-core/Cargo.lock +0 -2606
  198. data/bridge/sdk-core/protos/api_upstream/temporal/api/interaction/v1/message.proto +0 -87
  199. data/lib/bridge.so +0 -0
  200. data/lib/gen/temporal/api/enums/v1/interaction_type_pb.rb +0 -25
  201. data/lib/gen/temporal/api/interaction/v1/message_pb.rb +0 -49
  202. data/lib/gen/temporal/sdk/core/bridge/bridge_pb.rb +0 -222
@@ -8,40 +8,58 @@ mod history_update;
8
8
  mod machines;
9
9
  mod managed_run;
10
10
  mod run_cache;
11
+ mod wft_extraction;
11
12
  pub(crate) mod wft_poller;
12
13
  mod workflow_stream;
13
14
 
15
+ #[cfg(feature = "save_wf_inputs")]
16
+ pub use workflow_stream::replay_wf_state_inputs;
17
+
14
18
  pub(crate) use bridge::WorkflowBridge;
15
19
  pub(crate) use driven_workflow::{DrivenWorkflow, WorkflowFetcher};
16
- pub(crate) use history_update::{HistoryPaginator, HistoryUpdate};
17
- pub(crate) use machines::WFMachinesError;
20
+ pub(crate) use history_update::HistoryUpdate;
18
21
  #[cfg(test)]
19
22
  pub(crate) use managed_run::ManagedWFFunc;
20
23
 
24
+ use crate::worker::activities::TrackedPermittedTqResp;
21
25
  use crate::{
22
- abstractions::OwnedMeteredSemPermit,
23
- protosext::{legacy_query_failure, ValidPollWFTQResponse, WorkflowActivationExt},
24
- telemetry::VecDisplayer,
26
+ abstractions::{
27
+ stream_when_allowed, MeteredSemaphore, TrackedOwnedMeteredSemPermit, UsedMeteredSemPermit,
28
+ },
29
+ internal_flags::InternalFlags,
30
+ protosext::{legacy_query_failure, ValidPollWFTQResponse},
31
+ telemetry::{
32
+ metrics::workflow_worker_type, set_trace_subscriber_for_current_thread, TelemetryInstance,
33
+ VecDisplayer,
34
+ },
25
35
  worker::{
26
- activities::{ActivitiesFromWFTsHandle, PermittedTqResp},
36
+ activities::{ActivitiesFromWFTsHandle, LocalActivityManager},
27
37
  client::{WorkerClient, WorkflowTaskCompletion},
28
38
  workflow::{
29
- managed_run::{ManagedRun, WorkflowManager},
39
+ history_update::HistoryPaginator,
40
+ managed_run::RunUpdateAct,
41
+ wft_extraction::{HistoryFetchReq, WFTExtractor},
30
42
  wft_poller::validate_wft,
31
43
  workflow_stream::{LocalInput, LocalInputs, WFStream},
32
44
  },
33
- LocalActRequest, LocalActivityResolution,
45
+ LocalActRequest, LocalActivityExecutionResult, LocalActivityResolution,
34
46
  },
35
47
  MetricsContext,
36
48
  };
49
+ use anyhow::anyhow;
37
50
  use futures::{stream::BoxStream, Stream, StreamExt};
51
+ use futures_util::{future::abortable, stream};
52
+ use prost_types::TimestampError;
38
53
  use std::{
39
- collections::HashSet,
40
- fmt::{Debug, Display, Formatter},
54
+ cell::RefCell,
55
+ collections::VecDeque,
56
+ fmt::Debug,
41
57
  future::Future,
42
58
  ops::DerefMut,
59
+ rc::Rc,
43
60
  result,
44
61
  sync::Arc,
62
+ thread,
45
63
  time::{Duration, Instant},
46
64
  };
47
65
  use temporal_sdk_core_api::errors::{CompleteWfError, PollWfError};
@@ -59,36 +77,41 @@ use temporal_sdk_core_protos::{
59
77
  },
60
78
  temporal::api::{
61
79
  command::v1::{command::Attributes, Command as ProtoCommand, Command},
62
- common::v1::{Memo, RetryPolicy, SearchAttributes},
80
+ common::v1::{Memo, MeteringMetadata, RetryPolicy, SearchAttributes, WorkflowExecution},
63
81
  enums::v1::WorkflowTaskFailedCause,
82
+ query::v1::WorkflowQuery,
83
+ sdk::v1::WorkflowTaskCompletedMetadata,
64
84
  taskqueue::v1::StickyExecutionAttributes,
65
- workflowservice::v1::PollActivityTaskQueueResponse,
85
+ workflowservice::v1::{get_system_info_response, PollActivityTaskQueueResponse},
66
86
  },
67
87
  TaskToken,
68
88
  };
69
89
  use tokio::{
70
90
  sync::{
71
- mpsc::{unbounded_channel, UnboundedSender},
91
+ mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender},
72
92
  oneshot,
73
93
  },
74
- task,
75
- task::{JoinError, JoinHandle},
94
+ task::{spawn_blocking, LocalSet},
76
95
  };
77
96
  use tokio_stream::wrappers::UnboundedReceiverStream;
78
97
  use tokio_util::sync::CancellationToken;
79
98
  use tracing::Span;
80
99
 
81
100
  pub(crate) const LEGACY_QUERY_ID: &str = "legacy_query";
101
+ /// What percentage of a WFT timeout we are willing to wait before sending a WFT heartbeat when
102
+ /// necessary.
103
+ const WFT_HEARTBEAT_TIMEOUT_FRACTION: f32 = 0.8;
82
104
  const MAX_EAGER_ACTIVITY_RESERVATIONS_PER_WORKFLOW_TASK: usize = 3;
83
105
 
84
106
  type Result<T, E = WFMachinesError> = result::Result<T, E>;
85
107
  type BoxedActivationStream = BoxStream<'static, Result<ActivationOrAuto, PollWfError>>;
108
+ type InternalFlagsRef = Rc<RefCell<InternalFlags>>;
86
109
 
87
110
  /// Centralizes all state related to workflows and workflow tasks
88
111
  pub(crate) struct Workflows {
89
112
  task_queue: String,
90
113
  local_tx: UnboundedSender<LocalInput>,
91
- processing_task: tokio::sync::Mutex<Option<JoinHandle<()>>>,
114
+ processing_task: tokio::sync::Mutex<Option<thread::JoinHandle<()>>>,
92
115
  activation_stream: tokio::sync::Mutex<(
93
116
  BoxedActivationStream,
94
117
  // Used to indicate polling may begin
@@ -100,9 +123,12 @@ pub(crate) struct Workflows {
100
123
  sticky_attrs: Option<StickyExecutionAttributes>,
101
124
  /// If set, can be used to reserve activity task slots for eager-return of new activity tasks.
102
125
  activity_tasks_handle: Option<ActivitiesFromWFTsHandle>,
126
+ /// Ensures we stay at or below this worker's maximum concurrent workflow task limit
127
+ wft_semaphore: MeteredSemaphore,
128
+ local_act_mgr: Arc<LocalActivityManager>,
103
129
  }
104
130
 
105
- pub(super) struct WorkflowBasics {
131
+ pub(crate) struct WorkflowBasics {
106
132
  pub max_cached_workflows: usize,
107
133
  pub max_outstanding_wfts: usize,
108
134
  pub shutdown_token: CancellationToken,
@@ -110,53 +136,115 @@ pub(super) struct WorkflowBasics {
110
136
  pub namespace: String,
111
137
  pub task_queue: String,
112
138
  pub ignore_evicts_on_shutdown: bool,
139
+ pub fetching_concurrency: usize,
140
+ pub server_capabilities: get_system_info_response::Capabilities,
141
+ #[cfg(feature = "save_wf_inputs")]
142
+ pub wf_state_inputs: Option<UnboundedSender<Vec<u8>>>,
143
+ }
144
+
145
+ pub(crate) struct RunBasics<'a> {
146
+ pub namespace: String,
147
+ pub workflow_id: String,
148
+ pub workflow_type: String,
149
+ pub run_id: String,
150
+ pub history: HistoryUpdate,
151
+ pub metrics: MetricsContext,
152
+ pub capabilities: &'a get_system_info_response::Capabilities,
113
153
  }
114
154
 
115
155
  impl Workflows {
156
+ #[allow(clippy::too_many_arguments)] // Not much worth combining here
116
157
  pub(super) fn new(
117
158
  basics: WorkflowBasics,
118
159
  sticky_attrs: Option<StickyExecutionAttributes>,
119
160
  client: Arc<dyn WorkerClient>,
120
161
  wft_stream: impl Stream<Item = Result<ValidPollWFTQResponse, tonic::Status>> + Send + 'static,
121
- local_activity_request_sink: impl Fn(Vec<LocalActRequest>) -> Vec<LocalActivityResolution>
122
- + Send
123
- + Sync
124
- + 'static,
162
+ local_activity_request_sink: impl LocalActivityRequestSink,
163
+ local_act_mgr: Arc<LocalActivityManager>,
164
+ heartbeat_timeout_rx: UnboundedReceiver<HeartbeatTimeoutMsg>,
125
165
  activity_tasks_handle: Option<ActivitiesFromWFTsHandle>,
166
+ telem_instance: Option<&TelemetryInstance>,
126
167
  ) -> Self {
127
168
  let (local_tx, local_rx) = unbounded_channel();
169
+ let (fetch_tx, fetch_rx) = unbounded_channel();
128
170
  let shutdown_tok = basics.shutdown_token.clone();
129
171
  let task_queue = basics.task_queue.clone();
130
- let mut stream = WFStream::build(
131
- basics,
172
+ let wft_semaphore = MeteredSemaphore::new(
173
+ basics.max_outstanding_wfts,
174
+ basics.metrics.with_new_attrs([workflow_worker_type()]),
175
+ MetricsContext::available_task_slots,
176
+ );
177
+ // Only allow polling of the new WFT stream if there are available task slots
178
+ let proceeder = stream::unfold(wft_semaphore.clone(), |sem| async move {
179
+ Some((sem.acquire_owned().await.unwrap(), sem))
180
+ });
181
+ let wft_stream = stream_when_allowed(wft_stream, proceeder);
182
+ let extracted_wft_stream = WFTExtractor::build(
183
+ client.clone(),
184
+ basics.fetching_concurrency,
132
185
  wft_stream,
186
+ UnboundedReceiverStream::new(fetch_rx),
187
+ );
188
+ let locals_stream = stream::select(
133
189
  UnboundedReceiverStream::new(local_rx),
134
- client.clone(),
135
- local_activity_request_sink,
190
+ UnboundedReceiverStream::new(heartbeat_timeout_rx).map(Into::into),
136
191
  );
137
192
  let (activation_tx, activation_rx) = unbounded_channel();
138
193
  let (start_polling_tx, start_polling_rx) = oneshot::channel();
139
194
  // We must spawn a task to constantly poll the activation stream, because otherwise
140
195
  // activation completions would not cause anything to happen until the next poll.
141
- let processing_task = task::spawn(async move {
142
- // However, we want to avoid plowing ahead until we've been asked to poll at least once.
143
- // This supports activity-only workers.
144
- let do_poll = tokio::select! {
145
- sp = start_polling_rx => {
146
- sp.is_ok()
196
+ let tracing_sub = telem_instance.map(|ti| ti.trace_subscriber());
197
+ let processing_task = thread::spawn(move || {
198
+ if let Some(ts) = tracing_sub {
199
+ set_trace_subscriber_for_current_thread(ts);
200
+ }
201
+ let rt = tokio::runtime::Builder::new_current_thread()
202
+ .enable_all()
203
+ .thread_name("workflow-processing")
204
+ .build()
205
+ .unwrap();
206
+ let local = LocalSet::new();
207
+ local.block_on(&rt, async move {
208
+ let mut stream = WFStream::build(
209
+ basics,
210
+ extracted_wft_stream,
211
+ locals_stream,
212
+ local_activity_request_sink,
213
+ );
214
+
215
+ // However, we want to avoid plowing ahead until we've been asked to poll at least
216
+ // once. This supports activity-only workers.
217
+ let do_poll = tokio::select! {
218
+ sp = start_polling_rx => {
219
+ sp.is_ok()
220
+ }
221
+ _ = shutdown_tok.cancelled() => {
222
+ false
223
+ }
224
+ };
225
+ if !do_poll {
226
+ return;
147
227
  }
148
- _ = shutdown_tok.cancelled() => {
149
- false
228
+ while let Some(output) = stream.next().await {
229
+ match output {
230
+ Ok(o) => {
231
+ for fetchreq in o.fetch_histories {
232
+ fetch_tx
233
+ .send(fetchreq)
234
+ .expect("Fetch channel must not be dropped");
235
+ }
236
+ for act in o.activations {
237
+ activation_tx
238
+ .send(Ok(act))
239
+ .expect("Activation processor channel not dropped");
240
+ }
241
+ }
242
+ Err(e) => activation_tx
243
+ .send(Err(e))
244
+ .expect("Activation processor channel not dropped"),
245
+ }
150
246
  }
151
- };
152
- if !do_poll {
153
- return;
154
- }
155
- while let Some(act) = stream.next().await {
156
- activation_tx
157
- .send(act)
158
- .expect("Activation processor channel not dropped");
159
- }
247
+ });
160
248
  });
161
249
  Self {
162
250
  task_queue,
@@ -169,12 +257,14 @@ impl Workflows {
169
257
  client,
170
258
  sticky_attrs,
171
259
  activity_tasks_handle,
260
+ wft_semaphore,
261
+ local_act_mgr,
172
262
  }
173
263
  }
174
264
 
175
- pub async fn next_workflow_activation(&self) -> Result<WorkflowActivation, PollWfError> {
265
+ pub(super) async fn next_workflow_activation(&self) -> Result<WorkflowActivation, PollWfError> {
176
266
  loop {
177
- let r = {
267
+ let al = {
178
268
  let mut lock = self.activation_stream.lock().await;
179
269
  let (ref mut stream, ref mut beginner) = lock.deref_mut();
180
270
  if let Some(beginner) = beginner.take() {
@@ -182,17 +272,37 @@ impl Workflows {
182
272
  }
183
273
  stream.next().await.unwrap_or(Err(PollWfError::ShutDown))?
184
274
  };
185
- Span::current().record("run_id", r.run_id());
186
- match r {
275
+ Span::current().record("run_id", al.run_id());
276
+ match al {
187
277
  ActivationOrAuto::LangActivation(act) | ActivationOrAuto::ReadyForQueries(act) => {
188
278
  debug!(activation=%act, "Sending activation to lang");
189
279
  break Ok(act);
190
280
  }
191
281
  ActivationOrAuto::Autocomplete { run_id } => {
192
- self.activation_completed(WorkflowActivationCompletion {
193
- run_id,
194
- status: Some(workflow_completion::Success::from_variants(vec![]).into()),
195
- })
282
+ self.activation_completed(
283
+ WorkflowActivationCompletion {
284
+ run_id,
285
+ status: Some(
286
+ workflow_completion::Success::from_variants(vec![]).into(),
287
+ ),
288
+ },
289
+ // We need to say a type, but the type is irrelevant, so imagine some
290
+ // boxed function we'll never call.
291
+ Option::<Box<dyn Fn(&str, usize) + Send>>::None,
292
+ )
293
+ .await?;
294
+ }
295
+ ActivationOrAuto::AutoFail {
296
+ run_id,
297
+ machines_err,
298
+ } => {
299
+ self.activation_completed(
300
+ WorkflowActivationCompletion {
301
+ run_id,
302
+ status: Some(auto_fail_to_complete_status(machines_err)),
303
+ },
304
+ Option::<Box<dyn Fn(&str, usize) + Send>>::None,
305
+ )
196
306
  .await?;
197
307
  }
198
308
  }
@@ -202,10 +312,11 @@ impl Workflows {
202
312
  /// Queue an activation completion for processing, returning a future that will resolve with
203
313
  /// the outcome of that completion. See [ActivationCompletedOutcome].
204
314
  ///
205
- /// Returns the most-recently-processed event number for the run
206
- pub async fn activation_completed(
315
+ /// Returns the most-recently-processed event number for the run.
316
+ pub(super) async fn activation_completed(
207
317
  &self,
208
318
  completion: WorkflowActivationCompletion,
319
+ post_activate_hook: Option<impl Fn(&str, usize)>,
209
320
  ) -> Result<usize, CompleteWfError> {
210
321
  let is_empty_completion = completion.is_empty();
211
322
  let completion = validate_completion(completion)?;
@@ -213,7 +324,7 @@ impl Workflows {
213
324
  let (tx, rx) = oneshot::channel();
214
325
  let was_sent = self.send_local(WFActCompleteMsg {
215
326
  completion,
216
- response_tx: tx,
327
+ response_tx: Some(tx),
217
328
  });
218
329
  if !was_sent {
219
330
  if is_empty_completion {
@@ -230,7 +341,7 @@ impl Workflows {
230
341
  .await
231
342
  .expect("Send half of activation complete response not dropped");
232
343
  let mut wft_from_complete = None;
233
- let reported_wft_to_server = match completion_outcome.outcome {
344
+ let wft_report_status = match completion_outcome.outcome {
234
345
  ActivationCompleteOutcome::ReportWFTSuccess(report) => match report {
235
346
  ServerCommandsWithWorkflowInfo {
236
347
  task_token,
@@ -239,6 +350,7 @@ impl Workflows {
239
350
  mut commands,
240
351
  query_responses,
241
352
  force_new_wft,
353
+ sdk_metadata,
242
354
  },
243
355
  } => {
244
356
  let reserved_act_permits =
@@ -252,6 +364,13 @@ impl Workflows {
252
364
  sticky_attributes: None,
253
365
  return_new_workflow_task: true,
254
366
  force_create_new_workflow_task: force_new_wft,
367
+ sdk_metadata,
368
+ metering_metadata: MeteringMetadata {
369
+ nonfirst_local_activity_execution_attempts: self
370
+ .local_act_mgr
371
+ .get_nonfirst_attempt_count(&run_id)
372
+ as u32,
373
+ },
255
374
  };
256
375
  let sticky_attrs = self.sticky_attrs.clone();
257
376
  // Do not return new WFT if we would not cache, because returned new WFTs are
@@ -273,14 +392,14 @@ impl Workflows {
273
392
  Ok(())
274
393
  })
275
394
  .await;
276
- true
395
+ WFTReportStatus::Reported
277
396
  }
278
397
  ServerCommandsWithWorkflowInfo {
279
398
  task_token,
280
399
  action: ActivationAction::RespondLegacyQuery { result },
281
400
  } => {
282
401
  self.respond_legacy_query(task_token, *result).await;
283
- true
402
+ WFTReportStatus::Reported
284
403
  }
285
404
  },
286
405
  ActivationCompleteOutcome::ReportWFTFail(outcome) => match outcome {
@@ -292,29 +411,54 @@ impl Workflows {
292
411
  .await
293
412
  })
294
413
  .await;
295
- true
414
+ WFTReportStatus::Reported
296
415
  }
297
416
  FailedActivationWFTReport::ReportLegacyQueryFailure(task_token, failure) => {
298
417
  warn!(run_id=%run_id, failure=?failure, "Failing legacy query request");
299
418
  self.respond_legacy_query(task_token, legacy_query_failure(failure))
300
419
  .await;
301
- true
420
+ WFTReportStatus::Reported
302
421
  }
303
422
  },
304
- ActivationCompleteOutcome::DoNothing => false,
423
+ ActivationCompleteOutcome::WFTFailedDontReport => WFTReportStatus::DropWft,
424
+ ActivationCompleteOutcome::DoNothing => WFTReportStatus::NotReported,
425
+ };
426
+
427
+ let maybe_pwft = if let Some(wft) = wft_from_complete {
428
+ match HistoryPaginator::from_poll(wft, self.client.clone()).await {
429
+ Ok((paginator, pwft)) => Some((pwft, paginator)),
430
+ Err(e) => {
431
+ self.request_eviction(
432
+ &run_id,
433
+ format!("Failed to paginate workflow task from completion: {e:?}"),
434
+ EvictionReason::Fatal,
435
+ );
436
+ None
437
+ }
438
+ }
439
+ } else {
440
+ None
305
441
  };
306
442
 
443
+ if let Some(h) = post_activate_hook {
444
+ h(&run_id, completion_outcome.most_recently_processed_event);
445
+ }
446
+
307
447
  self.post_activation(PostActivationMsg {
308
448
  run_id,
309
- reported_wft_to_server,
310
- wft_from_complete,
449
+ wft_report_status,
450
+ wft_from_complete: maybe_pwft,
311
451
  });
312
452
 
313
453
  Ok(completion_outcome.most_recently_processed_event)
314
454
  }
315
455
 
316
456
  /// Tell workflow that a local activity has finished with the provided result
317
- pub fn notify_of_local_result(&self, run_id: impl Into<String>, resolved: LocalResolution) {
457
+ pub(super) fn notify_of_local_result(
458
+ &self,
459
+ run_id: impl Into<String>,
460
+ resolved: LocalResolution,
461
+ ) {
318
462
  self.send_local(LocalResolutionMsg {
319
463
  run_id: run_id.into(),
320
464
  res: resolved,
@@ -322,7 +466,7 @@ impl Workflows {
322
466
  }
323
467
 
324
468
  /// Request eviction of a workflow
325
- pub fn request_eviction(
469
+ pub(super) fn request_eviction(
326
470
  &self,
327
471
  run_id: impl Into<String>,
328
472
  message: impl Into<String>,
@@ -336,22 +480,39 @@ impl Workflows {
336
480
  }
337
481
 
338
482
  /// Query the state of workflow management. Can return `None` if workflow state is shut down.
339
- pub fn get_state_info(&self) -> impl Future<Output = Option<WorkflowStateInfo>> {
483
+ pub(super) fn get_state_info(&self) -> impl Future<Output = Option<WorkflowStateInfo>> {
340
484
  let (tx, rx) = oneshot::channel();
341
485
  self.send_local(GetStateInfoMsg { response_tx: tx });
342
486
  async move { rx.await.ok() }
343
487
  }
344
488
 
345
- pub async fn shutdown(&self) -> Result<(), JoinError> {
489
+ pub(super) fn available_wft_permits(&self) -> usize {
490
+ self.wft_semaphore.available_permits()
491
+ }
492
+
493
+ pub(super) async fn shutdown(&self) -> Result<(), anyhow::Error> {
346
494
  let maybe_jh = self.processing_task.lock().await.take();
347
495
  if let Some(jh) = maybe_jh {
348
- // This acts as a final wake up in case the stream is still alive and wouldn't otherwise
349
- // receive another message. It allows it to shut itself down.
350
- let _ = self.get_state_info();
351
- jh.await
352
- } else {
353
- Ok(())
496
+ // This serves to drive the stream if it is still alive and wouldn't otherwise receive
497
+ // another message. It allows it to shut itself down.
498
+ let (waker, stop_waker) = abortable(async {
499
+ let mut interval = tokio::time::interval(Duration::from_millis(10));
500
+ loop {
501
+ interval.tick().await;
502
+ let _ = self.get_state_info().await;
503
+ }
504
+ });
505
+ let (_, jh_res) = tokio::join!(
506
+ waker,
507
+ spawn_blocking(move || {
508
+ let r = jh.join();
509
+ stop_waker.abort();
510
+ r
511
+ })
512
+ );
513
+ jh_res?.map_err(|e| anyhow!("Error joining workflow processing thread: {e:?}"))?;
354
514
  }
515
+ Ok(())
355
516
  }
356
517
 
357
518
  /// Must be called after every activation completion has finished
@@ -393,7 +554,11 @@ impl Workflows {
393
554
  /// successfully.
394
555
  fn send_local(&self, msg: impl Into<LocalInputs>) -> bool {
395
556
  let msg = msg.into();
396
- let print_err = !matches!(msg, LocalInputs::GetStateInfo(_));
557
+ let print_err = match &msg {
558
+ LocalInputs::GetStateInfo(_) => false,
559
+ LocalInputs::LocalResolution(lr) if lr.res.is_la_cancel_confirmation() => false,
560
+ _ => true,
561
+ };
397
562
  if let Err(e) = self.local_tx.send(LocalInput {
398
563
  input: msg,
399
564
  span: Span::current(),
@@ -414,7 +579,7 @@ impl Workflows {
414
579
  /// Process eagerly returned activities from WFT completion
415
580
  fn handle_eager_activities(
416
581
  &self,
417
- reserved_act_permits: Vec<OwnedMeteredSemPermit>,
582
+ reserved_act_permits: Vec<TrackedOwnedMeteredSemPermit>,
418
583
  eager_acts: Vec<PollActivityTaskQueueResponse>,
419
584
  ) {
420
585
  if let Some(at_handle) = self.activity_tasks_handle.as_ref() {
@@ -435,7 +600,7 @@ impl Workflows {
435
600
  let with_permits = reserved_act_permits
436
601
  .into_iter()
437
602
  .zip(eager_acts.into_iter())
438
- .map(|(permit, resp)| PermittedTqResp { permit, resp });
603
+ .map(|(permit, resp)| TrackedPermittedTqResp { permit, resp });
439
604
  if with_permits.len() > 0 {
440
605
  debug!(
441
606
  "Adding {} activity tasks received from WFT complete",
@@ -458,7 +623,7 @@ impl Workflows {
458
623
  fn reserve_activity_slots_for_outgoing_commands(
459
624
  &self,
460
625
  commands: &mut [Command],
461
- ) -> Vec<OwnedMeteredSemPermit> {
626
+ ) -> Vec<TrackedOwnedMeteredSemPermit> {
462
627
  let mut reserved = vec![];
463
628
  for cmd in commands {
464
629
  if let Some(Attributes::ScheduleActivityTaskCommandAttributes(attrs)) =
@@ -509,186 +674,30 @@ impl Workflows {
509
674
  }
510
675
  }
511
676
 
512
- /// Manages access to a specific workflow run, and contains various bookkeeping information that the
513
- /// [WFStream] may need to access quickly.
514
- #[derive(derive_more::DebugCustom)]
515
- #[debug(
516
- fmt = "ManagedRunHandle {{ wft: {:?}, activation: {:?}, buffered_resp: {:?} \
517
- have_seen_terminal_event: {}, most_recently_processed_event: {}, more_pending_work: {}, \
518
- trying_to_evict: {}, last_action_acked: {} }}",
519
- wft,
520
- activation,
521
- buffered_resp,
522
- have_seen_terminal_event,
523
- most_recently_processed_event_number,
524
- more_pending_work,
525
- "trying_to_evict.is_some()",
526
- last_action_acked
677
+ /// Returned when a cache miss happens and we need to fetch history from the beginning to
678
+ /// replay a run
679
+ #[derive(Debug, derive_more::Display)]
680
+ #[display(
681
+ fmt = "CacheMissFetchReq(run_id: {})",
682
+ "original_wft.work.execution.run_id"
527
683
  )]
528
- struct ManagedRunHandle {
529
- /// If set, the WFT this run is currently/will be processing.
530
- wft: Option<OutstandingTask>,
531
- /// An outstanding activation to lang
532
- activation: Option<OutstandingActivation>,
533
- /// If set, it indicates there is a buffered poll response from the server that applies to this
534
- /// run. This can happen when lang takes too long to complete a task and the task times out, for
535
- /// example. Upon next completion, the buffered response will be removed and can be made ready
536
- /// to be returned from polling
537
- buffered_resp: Option<PermittedWFT>,
538
- /// True if this machine has seen an event which ends the execution
539
- have_seen_terminal_event: bool,
540
- /// The most recently processed event id this machine has seen. 0 means it has seen nothing.
541
- most_recently_processed_event_number: usize,
542
- /// Is set true when the machines indicate that there is additional known work to be processed
543
- more_pending_work: bool,
544
- /// Is set if an eviction has been requested for this run
545
- trying_to_evict: Option<RequestEvictMsg>,
546
- /// Set to true if the last action we tried to take to this run has been processed (ie: the
547
- /// [RunUpdateResponse] for it has been seen.
548
- last_action_acked: bool,
549
- /// For sending work to the machines
550
- run_actions_tx: UnboundedSender<RunAction>,
551
- /// Handle to the task where the actual machines live
552
- handle: JoinHandle<()>,
553
-
554
- /// We track if we have recorded useful debugging values onto a certain span yet, to overcome
555
- /// duplicating field values. Remove this once https://github.com/tokio-rs/tracing/issues/2334
556
- /// is fixed.
557
- recorded_span_ids: HashSet<tracing::Id>,
558
- metrics: MetricsContext,
684
+ #[must_use]
685
+ struct CacheMissFetchReq {
686
+ original_wft: PermittedWFT,
687
+ }
688
+ /// Bubbled up from inside workflow state if we're trying to apply the next workflow task but it
689
+ /// isn't in memory
690
+ #[derive(Debug)]
691
+ #[must_use]
692
+ struct NextPageReq {
693
+ paginator: HistoryPaginator,
694
+ span: Span,
559
695
  }
560
- impl ManagedRunHandle {
561
- fn new(
562
- wfm: WorkflowManager,
563
- activations_tx: UnboundedSender<RunUpdateResponse>,
564
- local_activity_request_sink: LocalActivityRequestSink,
565
- metrics: MetricsContext,
566
- ) -> Self {
567
- let (run_actions_tx, run_actions_rx) = unbounded_channel();
568
- let managed = ManagedRun::new(wfm, activations_tx, local_activity_request_sink);
569
- let handle = tokio::task::spawn(managed.run(run_actions_rx));
570
- Self {
571
- wft: None,
572
- activation: None,
573
- buffered_resp: None,
574
- have_seen_terminal_event: false,
575
- most_recently_processed_event_number: 0,
576
- more_pending_work: false,
577
- trying_to_evict: None,
578
- last_action_acked: true,
579
- run_actions_tx,
580
- handle,
581
- recorded_span_ids: Default::default(),
582
- metrics,
583
- }
584
- }
585
-
586
- fn incoming_wft(&mut self, wft: NewIncomingWFT) {
587
- if self.wft.is_some() {
588
- error!("Trying to send a new WFT for a run which already has one!");
589
- }
590
- self.send_run_action(RunActions::NewIncomingWFT(wft));
591
- }
592
- fn check_more_activations(&mut self) {
593
- // No point in checking for more activations if we have not acked the last update, or
594
- // if there's already an outstanding activation.
595
- if self.last_action_acked && self.activation.is_none() {
596
- self.send_run_action(RunActions::CheckMoreWork {
597
- want_to_evict: self.trying_to_evict.clone(),
598
- has_pending_queries: self
599
- .wft
600
- .as_ref()
601
- .map(|wft| !wft.pending_queries.is_empty())
602
- .unwrap_or_default(),
603
- has_wft: self.wft.is_some(),
604
- });
605
- }
606
- }
607
- fn send_completion(&mut self, c: RunActivationCompletion) {
608
- self.send_run_action(RunActions::ActivationCompletion(c));
609
- }
610
- fn send_local_resolution(&mut self, r: LocalResolution) {
611
- self.send_run_action(RunActions::LocalResolution(r));
612
- }
613
-
614
- fn insert_outstanding_activation(&mut self, act: &ActivationOrAuto) {
615
- let act_type = match &act {
616
- ActivationOrAuto::LangActivation(act) | ActivationOrAuto::ReadyForQueries(act) => {
617
- if act.is_legacy_query() {
618
- OutstandingActivation::LegacyQuery
619
- } else {
620
- OutstandingActivation::Normal {
621
- contains_eviction: act.eviction_index().is_some(),
622
- num_jobs: act.jobs.len(),
623
- }
624
- }
625
- }
626
- ActivationOrAuto::Autocomplete { .. } => OutstandingActivation::Autocomplete,
627
- };
628
- if let Some(old_act) = self.activation {
629
- // This is a panic because we have screwed up core logic if this is violated. It must be
630
- // upheld.
631
- panic!(
632
- "Attempted to insert a new outstanding activation {:?}, but there already was \
633
- one outstanding: {:?}",
634
- act, old_act
635
- );
636
- }
637
- self.activation = Some(act_type);
638
- }
639
-
640
- fn send_run_action(&mut self, action: RunActions) {
641
- self.last_action_acked = false;
642
- self.run_actions_tx
643
- .send(RunAction {
644
- action,
645
- trace_span: Span::current(),
646
- })
647
- .expect("Receive half of run actions not dropped");
648
- }
649
-
650
- /// Returns true if the managed run has any form of pending work
651
- /// If `ignore_evicts` is true, pending evictions do not count as pending work.
652
- /// If `ignore_buffered` is true, buffered workflow tasks do not count as pending work.
653
- fn has_any_pending_work(&self, ignore_evicts: bool, ignore_buffered: bool) -> bool {
654
- let evict_work = if ignore_evicts {
655
- false
656
- } else {
657
- self.trying_to_evict.is_some()
658
- };
659
- let act_work = if ignore_evicts {
660
- if let Some(ref act) = self.activation {
661
- !act.has_only_eviction()
662
- } else {
663
- false
664
- }
665
- } else {
666
- self.activation.is_some()
667
- };
668
- let buffered = if ignore_buffered {
669
- false
670
- } else {
671
- self.buffered_resp.is_some()
672
- };
673
- self.wft.is_some()
674
- || buffered
675
- || !self.last_action_acked
676
- || self.more_pending_work
677
- || act_work
678
- || evict_work
679
- }
680
696
 
681
- /// Returns true if the handle is currently processing a WFT which contains a legacy query.
682
- fn pending_work_is_legacy_query(&self) -> bool {
683
- // Either we know because there is a pending legacy query, or it's already been drained and
684
- // sent as an activation.
685
- matches!(self.activation, Some(OutstandingActivation::LegacyQuery))
686
- || self
687
- .wft
688
- .as_ref()
689
- .map(|t| t.has_pending_legacy_query())
690
- .unwrap_or_default()
691
- }
697
+ #[derive(Debug)]
698
+ struct WFStreamOutput {
699
+ activations: VecDeque<ActivationOrAuto>,
700
+ fetch_histories: VecDeque<HistoryFetchReq>,
692
701
  }
693
702
 
694
703
  #[derive(Debug, derive_more::Display)]
@@ -697,9 +706,15 @@ enum ActivationOrAuto {
697
706
  /// This type should only be filled with an empty activation which is ready to have queries
698
707
  /// inserted into the joblist
699
708
  ReadyForQueries(WorkflowActivation),
709
+ #[display(fmt = "Autocomplete(run_id={run_id})")]
700
710
  Autocomplete {
701
711
  run_id: String,
702
712
  },
713
+ #[display(fmt = "AutoFail(run_id={run_id})")]
714
+ AutoFail {
715
+ run_id: String,
716
+ machines_err: WFMachinesError,
717
+ },
703
718
  }
704
719
  impl ActivationOrAuto {
705
720
  pub fn run_id(&self) -> &str {
@@ -707,15 +722,53 @@ impl ActivationOrAuto {
707
722
  ActivationOrAuto::LangActivation(act) => &act.run_id,
708
723
  ActivationOrAuto::Autocomplete { run_id, .. } => run_id,
709
724
  ActivationOrAuto::ReadyForQueries(act) => &act.run_id,
725
+ ActivationOrAuto::AutoFail { run_id, .. } => run_id,
710
726
  }
711
727
  }
712
728
  }
713
729
 
730
+ /// A processed WFT which has been validated and had a history update extracted from it
714
731
  #[derive(derive_more::DebugCustom)]
715
- #[debug(fmt = "PermittedWft {{ {:?} }}", wft)]
732
+ #[cfg_attr(
733
+ feature = "save_wf_inputs",
734
+ derive(serde::Serialize, serde::Deserialize)
735
+ )]
736
+ #[debug(fmt = "PermittedWft({work:?})")]
716
737
  pub(crate) struct PermittedWFT {
717
- wft: ValidPollWFTQResponse,
718
- permit: OwnedMeteredSemPermit,
738
+ work: PreparedWFT,
739
+ #[cfg_attr(
740
+ feature = "save_wf_inputs",
741
+ serde(skip, default = "UsedMeteredSemPermit::fake_deserialized")
742
+ )]
743
+ permit: UsedMeteredSemPermit,
744
+ #[cfg_attr(
745
+ feature = "save_wf_inputs",
746
+ serde(skip, default = "HistoryPaginator::fake_deserialized")
747
+ )]
748
+ paginator: HistoryPaginator,
749
+ }
750
+ #[derive(Debug)]
751
+ #[cfg_attr(
752
+ feature = "save_wf_inputs",
753
+ derive(serde::Serialize, serde::Deserialize)
754
+ )]
755
+ struct PreparedWFT {
756
+ task_token: TaskToken,
757
+ attempt: u32,
758
+ execution: WorkflowExecution,
759
+ workflow_type: String,
760
+ legacy_query: Option<WorkflowQuery>,
761
+ query_requests: Vec<QueryWorkflow>,
762
+ update: HistoryUpdate,
763
+ }
764
+ impl PreparedWFT {
765
+ /// Returns true if the contained history update is incremental (IE: expects to hit a cached
766
+ /// workflow)
767
+ pub fn is_incremental(&self) -> bool {
768
+ let start_event_id = self.update.first_event_id();
769
+ let poll_resp_is_incremental = start_event_id.map(|eid| eid > 1).unwrap_or_default();
770
+ poll_resp_is_incremental || start_event_id.is_none()
771
+ }
719
772
  }
720
773
 
721
774
  #[derive(Debug)]
@@ -727,7 +780,7 @@ pub(crate) struct OutstandingTask {
727
780
  pub start_time: Instant,
728
781
  /// The WFT permit owned by this task, ensures we don't exceed max concurrent WFT, and makes
729
782
  /// sure the permit is automatically freed when we delete the task.
730
- pub permit: OwnedMeteredSemPermit,
783
+ pub permit: UsedMeteredSemPermit,
731
784
  }
732
785
 
733
786
  impl OutstandingTask {
@@ -806,49 +859,80 @@ pub(crate) enum ActivationAction {
806
859
  commands: Vec<ProtoCommand>,
807
860
  query_responses: Vec<QueryResult>,
808
861
  force_new_wft: bool,
862
+ sdk_metadata: WorkflowTaskCompletedMetadata,
809
863
  },
810
864
  /// We should respond to a legacy query request
811
865
  RespondLegacyQuery { result: Box<QueryResult> },
812
866
  }
813
867
 
814
- #[derive(Debug, Eq, PartialEq, Hash)]
815
- pub(crate) enum EvictionRequestResult {
816
- EvictionRequested(Option<u32>),
868
+ #[derive(Debug)]
869
+ enum EvictionRequestResult {
870
+ EvictionRequested(Option<u32>, RunUpdateAct),
817
871
  NotFound,
818
872
  EvictionAlreadyRequested(Option<u32>),
819
873
  }
874
+ impl EvictionRequestResult {
875
+ fn into_run_update_resp(self) -> RunUpdateAct {
876
+ match self {
877
+ EvictionRequestResult::EvictionRequested(_, resp) => resp,
878
+ EvictionRequestResult::NotFound
879
+ | EvictionRequestResult::EvictionAlreadyRequested(_) => None,
880
+ }
881
+ }
882
+ }
820
883
 
821
884
  #[derive(Debug)]
822
885
  #[allow(dead_code)] // Not always used in non-test
823
886
  pub(crate) struct WorkflowStateInfo {
824
887
  pub cached_workflows: usize,
825
888
  pub outstanding_wft: usize,
826
- pub available_wft_permits: usize,
827
889
  }
828
890
 
829
891
  #[derive(Debug)]
892
+ #[cfg_attr(
893
+ feature = "save_wf_inputs",
894
+ derive(serde::Serialize, serde::Deserialize)
895
+ )]
830
896
  struct WFActCompleteMsg {
831
897
  completion: ValidatedCompletion,
832
- response_tx: oneshot::Sender<ActivationCompleteResult>,
898
+ #[cfg_attr(feature = "save_wf_inputs", serde(skip))]
899
+ response_tx: Option<oneshot::Sender<ActivationCompleteResult>>,
833
900
  }
834
901
  #[derive(Debug)]
902
+ #[cfg_attr(
903
+ feature = "save_wf_inputs",
904
+ derive(serde::Serialize, serde::Deserialize)
905
+ )]
835
906
  struct LocalResolutionMsg {
836
907
  run_id: String,
837
908
  res: LocalResolution,
838
909
  }
839
910
  #[derive(Debug)]
911
+ #[cfg_attr(
912
+ feature = "save_wf_inputs",
913
+ derive(serde::Serialize, serde::Deserialize)
914
+ )]
840
915
  struct PostActivationMsg {
841
916
  run_id: String,
842
- reported_wft_to_server: bool,
843
- wft_from_complete: Option<ValidPollWFTQResponse>,
917
+ wft_report_status: WFTReportStatus,
918
+ wft_from_complete: Option<(PreparedWFT, HistoryPaginator)>,
844
919
  }
845
920
  #[derive(Debug, Clone)]
921
+ #[cfg_attr(
922
+ feature = "save_wf_inputs",
923
+ derive(serde::Serialize, serde::Deserialize)
924
+ )]
846
925
  struct RequestEvictMsg {
847
926
  run_id: String,
848
927
  message: String,
849
928
  reason: EvictionReason,
850
929
  }
851
930
  #[derive(Debug)]
931
+ pub(crate) struct HeartbeatTimeoutMsg {
932
+ pub(crate) run_id: String,
933
+ pub(crate) span: Span,
934
+ }
935
+ #[derive(Debug)]
852
936
  struct GetStateInfoMsg {
853
937
  response_tx: oneshot::Sender<WorkflowStateInfo>,
854
938
  }
@@ -869,16 +953,24 @@ enum ActivationCompleteOutcome {
869
953
  ReportWFTFail(FailedActivationWFTReport),
870
954
  /// There's nothing to do right now. EX: The workflow needs to keep replaying.
871
955
  DoNothing,
956
+ /// The workflow task failed, but we shouldn't report it. EX: We have failed 2 or more attempts
957
+ /// in a row.
958
+ WFTFailedDontReport,
872
959
  }
873
- #[derive(Debug)]
874
- struct FulfillableActivationComplete {
875
- result: ActivationCompleteResult,
876
- resp_chan: oneshot::Sender<ActivationCompleteResult>,
877
- }
878
- impl FulfillableActivationComplete {
879
- fn fulfill(self) {
880
- let _ = self.resp_chan.send(self.result);
881
- }
960
+ /// Did we report, or not, completion of a WFT to server?
961
+ #[derive(Debug, Copy, Clone)]
962
+ #[cfg_attr(
963
+ feature = "save_wf_inputs",
964
+ derive(serde::Serialize, serde::Deserialize)
965
+ )]
966
+ enum WFTReportStatus {
967
+ Reported,
968
+ /// The WFT completion was not reported when finishing the activation, because there's still
969
+ /// work to be done. EX: Running LAs.
970
+ NotReported,
971
+ /// We didn't report, but we want to clear the outstanding workflow task anyway. See
972
+ /// [ActivationCompleteOutcome::WFTFailedDontReport]
973
+ DropWft,
882
974
  }
883
975
 
884
976
  fn validate_completion(
@@ -908,8 +1000,7 @@ fn validate_completion(
908
1000
  reason: format!(
909
1001
  "Workflow completion had a legacy query response along with other \
910
1002
  commands. This is not allowed and constitutes an error in the \
911
- lang SDK. Commands: {:?}",
912
- commands
1003
+ lang SDK. Commands: {commands:?}"
913
1004
  ),
914
1005
  run_id: completion.run_id,
915
1006
  });
@@ -918,6 +1009,7 @@ fn validate_completion(
918
1009
  Ok(ValidatedCompletion::Success {
919
1010
  run_id: completion.run_id,
920
1011
  commands,
1012
+ used_flags: success.used_internal_flags,
921
1013
  })
922
1014
  }
923
1015
  Some(workflow_activation_completion::Status::Failed(failure)) => {
@@ -934,11 +1026,16 @@ fn validate_completion(
934
1026
  }
935
1027
 
936
1028
  #[derive(Debug)]
1029
+ #[cfg_attr(
1030
+ feature = "save_wf_inputs",
1031
+ derive(serde::Serialize, serde::Deserialize)
1032
+ )]
937
1033
  #[allow(clippy::large_enum_variant)]
938
1034
  enum ValidatedCompletion {
939
1035
  Success {
940
1036
  run_id: String,
941
1037
  commands: Vec<WFCommand>,
1038
+ used_flags: Vec<u32>,
942
1039
  },
943
1040
  Fail {
944
1041
  run_id: String,
@@ -955,112 +1052,6 @@ impl ValidatedCompletion {
955
1052
  }
956
1053
  }
957
1054
 
958
- /// Input to run tasks, sent to [ManagedRun]s via [ManagedRunHandle]s
959
- #[derive(Debug)]
960
- struct RunAction {
961
- action: RunActions,
962
- trace_span: Span,
963
- }
964
- #[derive(Debug)]
965
- #[allow(clippy::large_enum_variant)]
966
- enum RunActions {
967
- NewIncomingWFT(NewIncomingWFT),
968
- ActivationCompletion(RunActivationCompletion),
969
- CheckMoreWork {
970
- want_to_evict: Option<RequestEvictMsg>,
971
- has_pending_queries: bool,
972
- has_wft: bool,
973
- },
974
- LocalResolution(LocalResolution),
975
- HeartbeatTimeout,
976
- }
977
- #[derive(Debug)]
978
- struct NewIncomingWFT {
979
- /// This field is only populated if the machines already exist. Otherwise the machines
980
- /// are instantiated with the workflow history.
981
- history_update: Option<HistoryUpdate>,
982
- /// Wft start time
983
- start_time: Instant,
984
- }
985
- #[derive(Debug)]
986
- struct RunActivationCompletion {
987
- task_token: TaskToken,
988
- start_time: Instant,
989
- commands: Vec<WFCommand>,
990
- activation_was_eviction: bool,
991
- activation_was_only_eviction: bool,
992
- has_pending_query: bool,
993
- query_responses: Vec<QueryResult>,
994
- /// Used to notify the worker when the completion is done processing and the completion can
995
- /// unblock. Must always be `Some` when initialized.
996
- resp_chan: Option<oneshot::Sender<ActivationCompleteResult>>,
997
- }
998
-
999
- /// A response from a [ManagedRun] held by a [ManagedRunHandle]
1000
- #[derive(Debug)]
1001
- struct RunUpdateResponse {
1002
- kind: RunUpdateResponseKind,
1003
- span: Span,
1004
- }
1005
- #[derive(Debug, derive_more::Display)]
1006
- #[allow(clippy::large_enum_variant)]
1007
- enum RunUpdateResponseKind {
1008
- Good(GoodRunUpdate),
1009
- Fail(FailRunUpdate),
1010
- }
1011
- impl RunUpdateResponseKind {
1012
- pub(crate) fn run_id(&self) -> &str {
1013
- match self {
1014
- RunUpdateResponseKind::Good(g) => &g.run_id,
1015
- RunUpdateResponseKind::Fail(f) => &f.run_id,
1016
- }
1017
- }
1018
- }
1019
-
1020
- #[derive(Debug)]
1021
- struct GoodRunUpdate {
1022
- run_id: String,
1023
- outgoing_activation: Option<ActivationOrAuto>,
1024
- fulfillable_complete: Option<FulfillableActivationComplete>,
1025
- have_seen_terminal_event: bool,
1026
- /// Is true if there are more jobs that need to be sent to lang
1027
- more_pending_work: bool,
1028
- most_recently_processed_event_number: usize,
1029
- /// Is true if this update was in response to a new WFT
1030
- in_response_to_wft: bool,
1031
- }
1032
- impl Display for GoodRunUpdate {
1033
- fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
1034
- write!(
1035
- f,
1036
- "GoodRunUpdate(run_id: {}, outgoing_activation: {}, more_pending_work: {})",
1037
- self.run_id,
1038
- if let Some(og) = self.outgoing_activation.as_ref() {
1039
- format!("{}", og)
1040
- } else {
1041
- "None".to_string()
1042
- },
1043
- self.more_pending_work
1044
- )
1045
- }
1046
- }
1047
- #[derive(Debug)]
1048
- pub(crate) struct FailRunUpdate {
1049
- run_id: String,
1050
- err: WFMachinesError,
1051
- /// This is populated if the run update failed while processing a completion - and thus we
1052
- /// must respond down it when handling the failure.
1053
- completion_resp: Option<oneshot::Sender<ActivationCompleteResult>>,
1054
- }
1055
- impl Display for FailRunUpdate {
1056
- fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
1057
- write!(
1058
- f,
1059
- "FailRunUpdate(run_id: {}, error: {:?})",
1060
- self.run_id, self.err
1061
- )
1062
- }
1063
- }
1064
1055
  #[derive(Debug)]
1065
1056
  pub struct OutgoingServerCommands {
1066
1057
  pub commands: Vec<ProtoCommand>,
@@ -1068,9 +1059,22 @@ pub struct OutgoingServerCommands {
1068
1059
  }
1069
1060
 
1070
1061
  #[derive(Debug)]
1062
+ #[cfg_attr(
1063
+ feature = "save_wf_inputs",
1064
+ derive(serde::Serialize, serde::Deserialize)
1065
+ )]
1071
1066
  pub(crate) enum LocalResolution {
1072
1067
  LocalActivity(LocalActivityResolution),
1073
1068
  }
1069
+ impl LocalResolution {
1070
+ pub fn is_la_cancel_confirmation(&self) -> bool {
1071
+ match self {
1072
+ LocalResolution::LocalActivity(lar) => {
1073
+ matches!(lar.result, LocalActivityExecutionResult::Cancelled(_))
1074
+ }
1075
+ }
1076
+ }
1077
+ }
1074
1078
 
1075
1079
  #[derive(thiserror::Error, Debug, derive_more::From)]
1076
1080
  #[error("Lang provided workflow command with empty variant")]
@@ -1079,6 +1083,10 @@ pub struct EmptyWorkflowCommandErr;
1079
1083
  /// [DrivenWorkflow]s respond with these when called, to indicate what they want to do next.
1080
1084
  /// EX: Create a new timer, complete the workflow, etc.
1081
1085
  #[derive(Debug, derive_more::From, derive_more::Display)]
1086
+ #[cfg_attr(
1087
+ feature = "save_wf_inputs",
1088
+ derive(serde::Serialize, serde::Deserialize)
1089
+ )]
1082
1090
  #[allow(clippy::large_enum_variant)]
1083
1091
  pub enum WFCommand {
1084
1092
  /// Returned when we need to wait for the lang sdk to send us something
@@ -1171,12 +1179,9 @@ pub struct WorkflowStartedInfo {
1171
1179
  retry_policy: Option<RetryPolicy>,
1172
1180
  }
1173
1181
 
1174
- type LocalActivityRequestSink =
1175
- Arc<dyn Fn(Vec<LocalActRequest>) -> Vec<LocalActivityResolution> + Send + Sync>;
1176
-
1177
1182
  /// Wraps outgoing activation job protos with some internal details core might care about
1178
1183
  #[derive(Debug, derive_more::Display)]
1179
- #[display(fmt = "{}", variant)]
1184
+ #[display(fmt = "{variant}")]
1180
1185
  struct OutgoingJob {
1181
1186
  variant: workflow_activation_job::Variant,
1182
1187
  /// Since LA resolutions are not distinguished from non-LA resolutions as far as lang is
@@ -1198,3 +1203,74 @@ impl From<OutgoingJob> for WorkflowActivationJob {
1198
1203
  }
1199
1204
  }
1200
1205
  }
1206
+
1207
+ /// Errors thrown inside of workflow machines
1208
+ #[derive(thiserror::Error, Debug)]
1209
+ pub(crate) enum WFMachinesError {
1210
+ #[error("Nondeterminism error: {0}")]
1211
+ Nondeterminism(String),
1212
+ #[error("Fatal error in workflow machines: {0}")]
1213
+ Fatal(String),
1214
+ }
1215
+
1216
+ impl WFMachinesError {
1217
+ pub fn evict_reason(&self) -> EvictionReason {
1218
+ match self {
1219
+ WFMachinesError::Nondeterminism(_) => EvictionReason::Nondeterminism,
1220
+ WFMachinesError::Fatal(_) => EvictionReason::Fatal,
1221
+ }
1222
+ }
1223
+ }
1224
+
1225
+ impl From<TimestampError> for WFMachinesError {
1226
+ fn from(_: TimestampError) -> Self {
1227
+ Self::Fatal("Could not decode timestamp".to_string())
1228
+ }
1229
+ }
1230
+
1231
+ fn auto_fail_to_complete_status(err: WFMachinesError) -> workflow_activation_completion::Status {
1232
+ workflow_activation_completion::Status::Failed(Failure {
1233
+ failure: Some(
1234
+ temporal_sdk_core_protos::temporal::api::failure::v1::Failure {
1235
+ message: "Error while processing workflow task".to_string(),
1236
+ source: err.to_string(),
1237
+ stack_trace: "".to_string(),
1238
+ encoded_attributes: None,
1239
+ cause: None,
1240
+ failure_info: None,
1241
+ },
1242
+ ),
1243
+ force_cause: WorkflowTaskFailedCause::from(err.evict_reason()) as i32,
1244
+ })
1245
+ }
1246
+
1247
+ pub(crate) trait LocalActivityRequestSink: Send + Sync + 'static {
1248
+ fn sink_reqs(&self, reqs: Vec<LocalActRequest>) -> Vec<LocalActivityResolution>;
1249
+ }
1250
+
1251
+ #[derive(derive_more::Constructor)]
1252
+ pub(super) struct LAReqSink {
1253
+ lam: Arc<LocalActivityManager>,
1254
+ /// If we're recording WF inputs, we also need to store immediate resolutions so they're
1255
+ /// available on replay.
1256
+ #[allow(dead_code)] // sometimes appears unused due to feature flagging
1257
+ recorder: Option<UnboundedSender<Vec<u8>>>,
1258
+ }
1259
+
1260
+ impl LocalActivityRequestSink for LAReqSink {
1261
+ fn sink_reqs(&self, reqs: Vec<LocalActRequest>) -> Vec<LocalActivityResolution> {
1262
+ if reqs.is_empty() {
1263
+ return vec![];
1264
+ }
1265
+
1266
+ #[allow(clippy::let_and_return)] // When feature is off clippy doesn't like this
1267
+ let res = self.lam.enqueue(reqs);
1268
+
1269
+ // We always save when there are any reqs, even if the response might be empty, so that
1270
+ // calls/responses are 1:1
1271
+ #[cfg(feature = "save_wf_inputs")]
1272
+ self.write_req(&res);
1273
+
1274
+ res
1275
+ }
1276
+ }