google-cloud-pipeline-components 2.14.1__py3-none-any.whl → 2.16.0__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.


Files changed (88)
  1. google_cloud_pipeline_components/_implementation/llm/generated/refined_image_versions.py +1 -1
  2. google_cloud_pipeline_components/_implementation/model_evaluation/llm_evaluation_preprocessor/component.py +24 -0
  3. google_cloud_pipeline_components/_implementation/starry_net/__init__.py +41 -0
  4. google_cloud_pipeline_components/_implementation/{model_evaluation/import_evaluation → starry_net/dataprep}/__init__.py +1 -2
  5. google_cloud_pipeline_components/_implementation/starry_net/dataprep/component.py +173 -0
  6. google_cloud_pipeline_components/_implementation/starry_net/evaluation/__init__.py +13 -0
  7. google_cloud_pipeline_components/_implementation/starry_net/evaluation/component.py +23 -0
  8. google_cloud_pipeline_components/_implementation/starry_net/evaluation/evaluation.yaml +197 -0
  9. google_cloud_pipeline_components/_implementation/starry_net/get_training_artifacts/__init__.py +13 -0
  10. google_cloud_pipeline_components/_implementation/starry_net/get_training_artifacts/component.py +62 -0
  11. google_cloud_pipeline_components/_implementation/starry_net/maybe_set_tfrecord_args/__init__.py +13 -0
  12. google_cloud_pipeline_components/_implementation/starry_net/maybe_set_tfrecord_args/component.py +77 -0
  13. google_cloud_pipeline_components/_implementation/starry_net/set_dataprep_args/__init__.py +13 -0
  14. google_cloud_pipeline_components/_implementation/starry_net/set_dataprep_args/component.py +97 -0
  15. google_cloud_pipeline_components/_implementation/starry_net/set_eval_args/__init__.py +13 -0
  16. google_cloud_pipeline_components/_implementation/starry_net/set_eval_args/component.py +76 -0
  17. google_cloud_pipeline_components/_implementation/starry_net/set_test_set/__init__.py +13 -0
  18. google_cloud_pipeline_components/_implementation/starry_net/set_test_set/component.py +48 -0
  19. google_cloud_pipeline_components/_implementation/starry_net/set_tfrecord_args/__init__.py +13 -0
  20. google_cloud_pipeline_components/_implementation/starry_net/set_tfrecord_args/component.py +70 -0
  21. google_cloud_pipeline_components/_implementation/starry_net/set_train_args/__init__.py +13 -0
  22. google_cloud_pipeline_components/_implementation/starry_net/set_train_args/component.py +90 -0
  23. google_cloud_pipeline_components/_implementation/starry_net/train/__init__.py +13 -0
  24. google_cloud_pipeline_components/_implementation/starry_net/train/component.py +220 -0
  25. google_cloud_pipeline_components/_implementation/starry_net/upload_decomposition_plots/__init__.py +13 -0
  26. google_cloud_pipeline_components/_implementation/starry_net/upload_decomposition_plots/component.py +64 -0
  27. google_cloud_pipeline_components/_implementation/starry_net/upload_model/__init__.py +13 -0
  28. google_cloud_pipeline_components/_implementation/starry_net/upload_model/component.py +23 -0
  29. google_cloud_pipeline_components/_implementation/starry_net/upload_model/upload_model.yaml +37 -0
  30. google_cloud_pipeline_components/_implementation/starry_net/version.py +18 -0
  31. google_cloud_pipeline_components/container/preview/custom_job/remote_runner.py +22 -0
  32. google_cloud_pipeline_components/container/utils/error_surfacing.py +45 -0
  33. google_cloud_pipeline_components/container/v1/model/get_model/remote_runner.py +36 -7
  34. google_cloud_pipeline_components/preview/automl/forecasting/forecasting_ensemble.py +1 -1
  35. google_cloud_pipeline_components/preview/automl/forecasting/forecasting_stage_1_tuner.py +2 -2
  36. google_cloud_pipeline_components/preview/automl/forecasting/forecasting_stage_2_tuner.py +2 -2
  37. google_cloud_pipeline_components/preview/automl/forecasting/learn_to_learn_forecasting_pipeline.yaml +38 -34
  38. google_cloud_pipeline_components/preview/automl/forecasting/sequence_to_sequence_forecasting_pipeline.yaml +38 -34
  39. google_cloud_pipeline_components/preview/automl/forecasting/temporal_fusion_transformer_forecasting_pipeline.yaml +38 -34
  40. google_cloud_pipeline_components/preview/automl/forecasting/time_series_dense_encoder_forecasting_pipeline.yaml +38 -34
  41. google_cloud_pipeline_components/preview/automl/forecasting/utils.py +49 -7
  42. google_cloud_pipeline_components/preview/automl/tabular/auto_feature_engineering.py +1 -1
  43. google_cloud_pipeline_components/preview/automl/tabular/automl_tabular_feature_selection_pipeline.yaml +39 -39
  44. google_cloud_pipeline_components/preview/automl/tabular/automl_tabular_v2_pipeline.yaml +41 -41
  45. google_cloud_pipeline_components/preview/automl/tabular/distillation_stage_feature_transform_engine.py +2 -2
  46. google_cloud_pipeline_components/preview/automl/tabular/feature_selection.py +2 -2
  47. google_cloud_pipeline_components/preview/automl/tabular/feature_selection_pipeline.yaml +4 -4
  48. google_cloud_pipeline_components/preview/automl/tabular/feature_transform_engine.py +3 -3
  49. google_cloud_pipeline_components/preview/automl/tabular/tabnet_hyperparameter_tuning_job.py +2 -2
  50. google_cloud_pipeline_components/preview/automl/tabular/tabnet_hyperparameter_tuning_job_pipeline.yaml +15 -15
  51. google_cloud_pipeline_components/preview/automl/tabular/tabnet_trainer.py +2 -2
  52. google_cloud_pipeline_components/preview/automl/tabular/tabnet_trainer_pipeline.yaml +13 -13
  53. google_cloud_pipeline_components/preview/automl/tabular/wide_and_deep_hyperparameter_tuning_job.py +2 -2
  54. google_cloud_pipeline_components/preview/automl/tabular/wide_and_deep_hyperparameter_tuning_job_pipeline.yaml +14 -14
  55. google_cloud_pipeline_components/preview/automl/tabular/wide_and_deep_trainer.py +2 -2
  56. google_cloud_pipeline_components/preview/automl/tabular/wide_and_deep_trainer_pipeline.yaml +13 -13
  57. google_cloud_pipeline_components/preview/automl/tabular/xgboost_hyperparameter_tuning_job_pipeline.yaml +14 -14
  58. google_cloud_pipeline_components/preview/automl/tabular/xgboost_trainer_pipeline.yaml +13 -13
  59. google_cloud_pipeline_components/preview/custom_job/utils.py +45 -6
  60. google_cloud_pipeline_components/preview/llm/rlhf/component.py +3 -6
  61. google_cloud_pipeline_components/preview/starry_net/__init__.py +19 -0
  62. google_cloud_pipeline_components/preview/starry_net/component.py +469 -0
  63. google_cloud_pipeline_components/proto/task_error_pb2.py +0 -1
  64. google_cloud_pipeline_components/v1/automl/forecasting/bqml_arima_predict_pipeline.yaml +10 -10
  65. google_cloud_pipeline_components/v1/automl/forecasting/bqml_arima_train_pipeline.yaml +31 -31
  66. google_cloud_pipeline_components/v1/automl/forecasting/prophet_predict_pipeline.yaml +13 -13
  67. google_cloud_pipeline_components/v1/automl/forecasting/prophet_trainer.py +3 -3
  68. google_cloud_pipeline_components/v1/automl/forecasting/prophet_trainer_pipeline.yaml +14 -14
  69. google_cloud_pipeline_components/v1/automl/tabular/automl_tabular_pipeline.yaml +37 -37
  70. google_cloud_pipeline_components/v1/automl/tabular/cv_trainer.py +2 -2
  71. google_cloud_pipeline_components/v1/automl/tabular/ensemble.py +2 -2
  72. google_cloud_pipeline_components/v1/automl/tabular/finalizer.py +1 -1
  73. google_cloud_pipeline_components/v1/automl/tabular/infra_validator.py +1 -1
  74. google_cloud_pipeline_components/v1/automl/tabular/split_materialized_data.py +1 -1
  75. google_cloud_pipeline_components/v1/automl/tabular/stage_1_tuner.py +2 -2
  76. google_cloud_pipeline_components/v1/automl/tabular/stats_and_example_gen.py +2 -2
  77. google_cloud_pipeline_components/v1/automl/tabular/training_configurator_and_validator.py +1 -1
  78. google_cloud_pipeline_components/v1/automl/tabular/transform.py +2 -2
  79. google_cloud_pipeline_components/v1/custom_job/component.py +3 -0
  80. google_cloud_pipeline_components/v1/custom_job/utils.py +4 -0
  81. google_cloud_pipeline_components/v1/model_evaluation/evaluation_llm_text_generation_pipeline.py +21 -0
  82. google_cloud_pipeline_components/version.py +1 -1
  83. {google_cloud_pipeline_components-2.14.1.dist-info → google_cloud_pipeline_components-2.16.0.dist-info}/METADATA +17 -20
  84. {google_cloud_pipeline_components-2.14.1.dist-info → google_cloud_pipeline_components-2.16.0.dist-info}/RECORD +87 -58
  85. {google_cloud_pipeline_components-2.14.1.dist-info → google_cloud_pipeline_components-2.16.0.dist-info}/WHEEL +1 -1
  86. google_cloud_pipeline_components/_implementation/model_evaluation/import_evaluation/component.py +0 -208
  87. {google_cloud_pipeline_components-2.14.1.dist-info → google_cloud_pipeline_components-2.16.0.dist-info}/LICENSE +0 -0
  88. {google_cloud_pipeline_components-2.14.1.dist-info → google_cloud_pipeline_components-2.16.0.dist-info}/top_level.txt +0 -0
@@ -54,7 +54,7 @@ def create_custom_training_job_from_component(
  display_name: str = '',
  replica_count: int = 1,
  machine_type: str = 'n1-standard-4',
- accelerator_type: str = '',
+ accelerator_type: str = 'ACCELERATOR_TYPE_UNSPECIFIED',
  accelerator_count: int = 1,
  boot_disk_type: str = 'pd-ssd',
  boot_disk_size_gb: int = 100,
@@ -83,7 +83,7 @@ def create_custom_training_job_from_component(
  replica_count: The count of instances in the cluster. One replica always counts towards the master in worker_pool_spec[0] and the remaining replicas will be allocated in worker_pool_spec[1]. See [more information.](https://cloud.google.com/vertex-ai/docs/training/distributed-training#configure_a_distributed_training_job)
  machine_type: The type of the machine to run the CustomJob. The default value is "n1-standard-4". See [more information](https://cloud.google.com/vertex-ai/docs/training/configure-compute#machine-types).
  accelerator_type: The type of accelerator(s) that may be attached to the machine per `accelerator_count`. See [more information](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/MachineSpec#acceleratortype).
- accelerator_count: The number of accelerators to attach to the machine. Defaults to 1 if `accelerator_type` is set.
+ accelerator_count: The number of accelerators to attach to the machine. Defaults to 1 if `accelerator_type` is set statically.
  boot_disk_type: Type of the boot disk (default is "pd-ssd"). Valid values: "pd-ssd" (Persistent Disk Solid State Drive) or "pd-standard" (Persistent Disk Hard Disk Drive). boot_disk_type is set as a static value and cannot be changed as a pipeline parameter.
  boot_disk_size_gb: Size in GB of the boot disk (default is 100GB). `boot_disk_size_gb` is set as a static value and cannot be changed as a pipeline parameter.
  timeout: The maximum job running time. The default is 7 days. A duration in seconds with up to nine fractional digits, terminated by 's', for example: "3.5s".
@@ -148,7 +148,11 @@ def create_custom_training_job_from_component(
  )[0]['container']

  worker_pool_spec = {
- 'machine_spec': {'machine_type': machine_type},
+ 'machine_spec': {
+ 'machine_type': "{{$.inputs.parameters['machine_type']}}",
+ 'accelerator_type': "{{$.inputs.parameters['accelerator_type']}}",
+ 'accelerator_count': "{{$.inputs.parameters['accelerator_count']}}",
+ },
  'replica_count': 1,
  'container_spec': {
  'image_uri': user_component_container['image'],
@@ -161,9 +165,6 @@ def create_custom_training_job_from_component(
  'env': env or [],
  },
  }
- if accelerator_type:
- worker_pool_spec['machine_spec']['accelerator_type'] = accelerator_type
- worker_pool_spec['machine_spec']['accelerator_count'] = accelerator_count
  if boot_disk_type:
  worker_pool_spec['disk_spec'] = {
  'boot_disk_type': boot_disk_type,
@@ -210,6 +211,43 @@ def create_custom_training_job_from_component(
  'defaultValue'
  ] = default_value

+ # add machine parameters into the customjob component
+ if accelerator_type == 'ACCELERATOR_TYPE_UNSPECIFIED':
+ accelerator_count = 0
+
+ cj_component_spec['inputDefinitions']['parameters']['machine_type'] = {
+ 'parameterType': 'STRING',
+ 'defaultValue': machine_type,
+ 'isOptional': True,
+ }
+ cj_component_spec['inputDefinitions']['parameters']['accelerator_type'] = {
+ 'parameterType': 'STRING',
+ 'defaultValue': accelerator_type,
+ 'isOptional': True,
+ }
+ cj_component_spec['inputDefinitions']['parameters']['accelerator_count'] = {
+ 'parameterType': 'NUMBER_INTEGER',
+ 'defaultValue': accelerator_count,
+ 'isOptional': True,
+ }
+
+ # check if user component has any input parameters that already exist in the
+ # custom job component
+ for param_name in user_component_spec.get('inputDefinitions', {}).get(
+ 'parameters', {}
+ ):
+ if param_name in cj_component_spec['inputDefinitions']['parameters']:
+ raise ValueError(
+ f'Input parameter {param_name} already exists in the CustomJob component.' # pylint: disable=line-too-long
+ )
+ for param_name in user_component_spec.get('outputDefinitions', {}).get(
+ 'parameters', {}
+ ):
+ if param_name in cj_component_spec['outputDefinitions']['parameters']:
+ raise ValueError(
+ f'Output parameter {param_name} already exists in the CustomJob component.' # pylint: disable=line-too-long
+ )
+
  # merge parameters from user component into the customjob component
  cj_component_spec['inputDefinitions']['parameters'].update(
  user_component_spec.get('inputDefinitions', {}).get('parameters', {})
@@ -217,6 +255,7 @@ def create_custom_training_job_from_component(
  cj_component_spec['outputDefinitions']['parameters'].update(
  user_component_spec.get('outputDefinitions', {}).get('parameters', {})
  )
+
  # use artifacts from user component
  ## assign artifacts, not update, since customjob has no artifact outputs
  cj_component_spec['inputDefinitions']['artifacts'] = user_component_spec.get(
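
With this change the wrapper no longer bakes the machine spec into the component at build time: machine_type, accelerator_type, and accelerator_count become optional inputs of the generated CustomJob component and are resolved from placeholders at runtime. A minimal sketch of how a caller might now override them per run, assuming a hypothetical train_step component and illustrative values (the wrapper function itself is from the package):

from kfp import dsl
from google_cloud_pipeline_components.preview.custom_job import utils

@dsl.component
def train_step(learning_rate: float) -> str:
    # Hypothetical user training logic.
    return f'trained with lr={learning_rate}'

# Wrap the user component; machine parameters are now inputs of the wrapped component.
train_job_op = utils.create_custom_training_job_from_component(
    train_step,
    display_name='train-step-custom-job',
)

@dsl.pipeline(name='train-demo')
def train_pipeline(accelerator_type: str = 'ACCELERATOR_TYPE_UNSPECIFIED',
                   accelerator_count: int = 0):
    # The machine spec can be supplied per pipeline run instead of being fixed at build time.
    train_job_op(
        learning_rate=0.01,
        machine_type='n1-standard-8',
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
    )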
@@ -206,12 +206,9 @@ def rlhf_pipeline(
  has_inference_dataset == True, # pylint: disable=singleton-comparison
  name='Perform Inference',
  ):
- has_model_checkpoint = function_based.value_exists(
- value=rl_model_pipeline.outputs['output_model_path']
- ).set_display_name('Resolve Model Checkpoint')
- with kfp.dsl.Condition(
- has_model_checkpoint.output == True, # pylint: disable=singleton-comparison
- name='Test Model Checkpoint Exists',
+ with kfp.dsl.If(
+ rl_model_pipeline.outputs['output_model_path'] != '',
+ name='CheckModel Checkpoint Exists',
  ):
  component.infer_pipeline(
  project=project,
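
The hunk above drops the function_based.value_exists helper plus kfp.dsl.Condition in favor of kfp.dsl.If, which can compare a task output against a literal directly. A minimal sketch of the same pattern outside this package, using hypothetical components:

from kfp import dsl

@dsl.component
def produce_checkpoint_path() -> str:
    # Hypothetical producer; may return '' when no checkpoint was written.
    return 'gs://bucket/checkpoints/step-1000'

@dsl.component
def consume_checkpoint(path: str):
    print('using checkpoint', path)

@dsl.pipeline(name='conditional-demo')
def conditional_pipeline():
    producer = produce_checkpoint_path()
    # dsl.If evaluates the comparison at runtime, so no extra "value_exists" component is needed.
    with dsl.If(producer.output != '', name='Check Checkpoint Exists'):
        consume_checkpoint(path=producer.output)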
@@ -0,0 +1,19 @@
+ # Copyright 2024 The Kubeflow Authors. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """Starry Net Forecasting Pipeline."""
+
+ from google_cloud_pipeline_components.preview.starry_net.component import starry_net # pylint: disable=g-importing-member
+
+ __all__ = ['starry_net']
@@ -0,0 +1,469 @@
+ # Copyright 2024 The Kubeflow Authors. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Defines the pipeline for Starry Net."""
+
+ from typing import List
+
+ # pylint: disable=g-importing-member
+ from google_cloud_pipeline_components import _placeholders
+ from google_cloud_pipeline_components._implementation.starry_net import DataprepOp
+ from google_cloud_pipeline_components._implementation.starry_net import EvaluationOp
+ from google_cloud_pipeline_components._implementation.starry_net import GetTrainingArtifactsOp
+ from google_cloud_pipeline_components._implementation.starry_net import MaybeSetTfrecordArgsOp
+ from google_cloud_pipeline_components._implementation.starry_net import SetDataprepArgsOp
+ from google_cloud_pipeline_components._implementation.starry_net import SetEvalArgsOp
+ from google_cloud_pipeline_components._implementation.starry_net import SetTestSetOp
+ from google_cloud_pipeline_components._implementation.starry_net import SetTfrecordArgsOp
+ from google_cloud_pipeline_components._implementation.starry_net import SetTrainArgsOp
+ from google_cloud_pipeline_components._implementation.starry_net import TrainOp
+ from google_cloud_pipeline_components._implementation.starry_net import UploadDecompositionPlotsOp
+ from google_cloud_pipeline_components._implementation.starry_net import UploadModelOp
+ from google_cloud_pipeline_components.preview.model_evaluation import model_evaluation_import_component
+ from google_cloud_pipeline_components.types import artifact_types
+ from google_cloud_pipeline_components.v1 import batch_predict_job
+ from kfp import dsl
+
+
+ @dsl.pipeline
+ def starry_net( # pylint: disable=dangerous-default-value
+ tensorboard_instance_id: str,
+ dataprep_backcast_length: int,
+ dataprep_forecast_length: int,
+ dataprep_train_end_date: str,
+ dataprep_n_val_windows: int,
+ dataprep_n_test_windows: int,
+ dataprep_test_set_stride: int,
+ dataprep_test_set_bigquery_dataset: str,
+ dataflow_machine_type: str = 'n1-standard-16',
+ dataflow_max_replica_count: int = 50,
+ dataflow_starting_replica_count: int = 1,
+ dataflow_disk_size_gb: int = 50,
+ dataprep_csv_data_path: str = '',
+ dataprep_csv_static_covariates_path: str = '',
+ dataprep_bigquery_data_path: str = '',
+ dataprep_ts_identifier_columns: List[str] = [],
+ dataprep_time_column: str = '',
+ dataprep_target_column: str = '',
+ dataprep_static_covariate_columns: List[str] = [],
+ dataprep_previous_run_dir: str = '',
+ dataprep_nan_threshold: float = 0.2,
+ dataprep_zero_threshold: float = 0.2,
+ trainer_machine_type: str = 'n1-standard-4',
+ trainer_accelerator_type: str = 'NVIDIA_TESLA_V100',
+ trainer_num_epochs: int = 50,
+ trainer_cleaning_activation_regularizer_coeff: float = 1e3,
+ trainer_change_point_activation_regularizer_coeff: float = 1e3,
+ trainer_change_point_output_regularizer_coeff: float = 1e3,
+ trainer_trend_alpha_upper_bound: float = 0.5,
+ trainer_trend_beta_upper_bound: float = 0.2,
+ trainer_trend_phi_lower_bound: float = 0.99,
+ trainer_trend_b_fixed_val: int = -1,
+ trainer_trend_b0_fixed_val: int = -1,
+ trainer_trend_phi_fixed_val: int = -1,
+ trainer_quantiles: List[float] = [],
+ trainer_model_blocks: List[str] = [
+ 'cleaning',
+ 'change_point',
+ 'trend',
+ 'day_of_week',
+ 'week_of_year',
+ 'residual',
+ ],
+ tensorboard_n_decomposition_plots: int = 25,
+ encryption_spec_key_name: str = '',
+ location: str = _placeholders.LOCATION_PLACEHOLDER,
+ project: str = _placeholders.PROJECT_ID_PLACEHOLDER,
+ ):
+ # fmt: off
+ """Starry Net is a state-of-the-art forecaster used internally by Google.
+
+ Starry Net is a glass-box neural network inspired by statistical time series
+ models, capable of cleaning step changes and spikes, modeling seasonality and
+ events, forecasting trend, and providing both point and prediction interval
+ forecasts in a single, lightweight model. Starry Net stands out among neural
+ network based forecasting models by providing the explainability,
+ interpretability and tunability of traditional statistical forecasters.
+ For example, it features time series feature decomposition and damped local
+ linear exponential smoothing model as the trend structure.
+
+ Args:
+ tensorboard_instance_id: The tensorboard instance ID. This must be in same
+ location as the pipeline job.
+ dataprep_backcast_length: The length of the context window to feed into the
+ model.
+ dataprep_forecast_length: The length of the forecast horizon used in the
+ loss function during training and during evaluation, so that the model is
+ optimized to produce forecasts from 0 to H.
+ dataprep_train_end_date: The last date of data to use in the training and
+ validation set. All dates after a train_end_date are part of the test set.
+ If last_forecasted_date is equal to the final day forecasted in the test
+ set, then last_forecasted_date =
+ train_end_date + forecast_length + (n_test_windows * test_set_stride).
+ last_forecasted_date must be included in the dataset.
+ dataprep_n_val_windows: The number of windows to use for the val set. If 0,
+ no validation set is used.
+ dataprep_n_test_windows: The number of windows to use for the test set. Must
+ be >= 1. See note in dataprep_train_end_date.
+ dataprep_test_set_stride: The number of timestamps to roll forward
+ when constructing each window of the val and test sets. See note in
+ dataprep_train_end_date.
+ dataprep_test_set_bigquery_dataset: The bigquery dataset where the test set
+ is saved in the format bq://project.dataset. This must be in the same
+ region or multi-region as the output or staging bucket of the pipeline and
+ the dataprep_bigquery_data_path, if using a Big Query data source.
+ dataflow_machine_type: The type of machine to use for dataprep,
+ batch prediction, and evaluation jobs..
+ dataflow_max_replica_count: The maximum number of replicas to scale the
+ dataprep, batch prediction, and evaluation jobs.
+ dataflow_starting_replica_count: The number of replicas to start the
+ dataprep, batch prediction, and evaluation jobs.
+ dataflow_disk_size_gb: The disk size of dataflow workers in GB for the
+ dataprep, batch prediction, and evaluation jobs.
+ dataprep_csv_data_path: The path to the training data csv in the format
+ gs://bucket_name/sub_dir/blob_name.csv. Each row of the csv represents
+ a time series, where the column names are the dates, and the index is the
+ unique time series names.
+ dataprep_csv_static_covariates_path: The path to the static covariates csv.
+ Each row of the csv represents the static covariate values for the series,
+ where the column names are the static covariate names, and the
+ index is the unique time series names. The index values must match the
+ index values of dataprep_csv_data_path. The column values must match
+ dataprep_static_covariate_columns.
+ dataprep_bigquery_data_path: The path to the training data on BigQuery in
+ the format bq://project.dataset.table_id. You should only set this or
+ csv_data_path. This must be in the same region or multi-region as the
+ output or staging bucket of the pipeline and the
+ dataprep_test_set_bigquery_dataset.
+ dataprep_ts_identifier_columns: The list of ts_identifier columns from the
+ BigQuery data source. These columns are used to distinguish the different
+ time series, so that if multiple rows have identical ts_identifier
+ columns, the series is generated by summing the target columns for each
+ timestamp. This is only used if dataprep_bigquery_data_path is set.
+ dataprep_time_column: The time column from the BigQuery data source. This is
+ only used if dataprep_bigquery_data_path is set.
+ dataprep_target_column: The column to be forecasted from the BigQuery data
+ source. This is only used if dataprep_bigquery_data_path is set.
+ dataprep_static_covariate_columns: The list of strings of static covariate
+ names. This needs to be set if training with static covariates regardless
+ of whether you're using bigquery_data_path or csv_static_covariates_path.
+ dataprep_previous_run_dir: The dataprep dir from a previous run. Use this
+ to save time if you've already created TFRecords from your BigQuery
+ dataset with the same dataprep parameters as this run.
+ dataprep_nan_threshold: Series having more nan / missing values than
+ nan_threshold (inclusive) in percentage for either backtest or forecast
+ will not be sampled in the training set (including missing due to
+ train_start and train_end). All existing nans are replaced by zeros.
+ dataprep_zero_threshold: Series having more 0.0 values than zero_threshold
+ (inclusive) in percentage for either backtest or forecast will not be
+ sampled in the training set.
+ trainer_machine_type: The machine type for training. Must be compatible with
+ trainer_accelerator_type.
+ trainer_accelerator_type: The accelerator type for training.
+ trainer_num_epochs: The number of epochs to train for.
+ trainer_cleaning_activation_regularizer_coeff: The L1 regularization
+ coefficient for the anomaly detection activation in the cleaning block.
+ The larger the value, the less aggressive the cleaning, so fewer and only
+ the most extreme anomalies are detected. A rule of thumb is that this
+ value should be about the same scale of your series.
+ trainer_change_point_activation_regularizer_coeff: The L1 regularization
+ coefficient for the change point detection activation in the change point
+ block. The larger the value, the less aggressive the cleaning, so fewer
+ and only the most extreme change points are detected. A rule of thumb is
+ that this value should be a ratio of the
+ trainer_change_point_output_regularizer_coeff to determine the sparsity
+ of the changes. If you want the model to detect many small step changes
+ this number should be smaller than the
+ trainer_change_point_output_regularizer_coeff. To detect fewer large step
+ changes, this number should be about equal to or larger than the
+ trainer_change_point_output_regularizer_coeff.
+ trainer_change_point_output_regularizer_coeff: The L2 regularization
+ penalty applied to the mean lag-one difference of the cleaned output of
+ the change point block. Intutively,
+ trainer_change_point_activation_regularizer_coeff determines how many
+ steps to detect in the series, while this parameter determines how
+ aggressively to clean the detected steps. The higher this value, the more
+ aggressive the cleaning. A rule of thumb is that this value should be
+ about the same scale of your series.
+ trainer_trend_alpha_upper_bound: The upper bound for data smooth parameter
+ alpha in the trend block.
+ trainer_trend_beta_upper_bound: The upper bound for trend smooth parameter
+ beta in the trend block.
+ trainer_trend_phi_lower_bound: The lower bound for damping param phi in the
+ trend block.
+ trainer_trend_b_fixed_val: The fixed value for long term trend parameter b
+ in the trend block. If set to anything other than -1, the trend block will
+ not learn to provide estimates but use the fixed value directly.
+ trainer_trend_b0_fixed_val: The fixed value for starting short-term trend
+ parameter b0 in the trend block. If set to anything other than -1, the
+ trend block will not learn to provide estimates but use the fixed value
+ directly.
+ trainer_trend_phi_fixed_val: The fixed value for the damping parameter phi
+ in the trend block. If set to anything other than -1, the trend block will
+ not learn to provide estimates but use the fixed value directly.
+ trainer_quantiles: The list of floats representing quantiles. Leave blank if
+ only training to produce point forecasts.
+ trainer_model_blocks: The list of model blocks to use in the order they will
+ appear in the model. Possible values are `cleaning`, `change_point`,
+ `trend`, `hour_of_week`, `day_of_week`, `day_of_year`, `week_of_year`,
+ `month_of_year`, `residual`.
+ tensorboard_n_decomposition_plots: How many decomposition plots from the
+ test set to save to tensorboard.
+ encryption_spec_key_name: Customer-managed encryption key options for the
+ CustomJob. If this is set, then all resources created by the CustomJob
+ will be encrypted with the provided encryption key.
+ location: The location where the pipeline components are run.
+ project: The project where the pipeline is run. Defaults to current project.
+ """
+ job_id = dsl.PIPELINE_JOB_NAME_PLACEHOLDER
+ create_dataprep_args_task = SetDataprepArgsOp(
+ model_blocks=trainer_model_blocks,
+ ts_identifier_columns=dataprep_ts_identifier_columns,
+ static_covariate_columns=dataprep_static_covariate_columns,
+ csv_data_path=dataprep_csv_data_path,
+ previous_run_dir=dataprep_previous_run_dir,
+ location=location,
+ )
+ create_trainer_args_task = SetTrainArgsOp(
+ quantiles=trainer_quantiles,
+ model_blocks=trainer_model_blocks,
+ static_covariates=dataprep_static_covariate_columns,
+ )
+ with dsl.If(create_dataprep_args_task.outputs['create_tf_records'] == True, # pylint: disable=singleton-comparison
+ 'create-tf-records'):
+ create_tf_records_task = DataprepOp(
+ backcast_length=dataprep_backcast_length,
+ forecast_length=dataprep_forecast_length,
+ train_end_date=dataprep_train_end_date,
+ n_val_windows=dataprep_n_val_windows,
+ n_test_windows=dataprep_n_test_windows,
+ test_set_stride=dataprep_test_set_stride,
+ model_blocks=create_dataprep_args_task.outputs['model_blocks'],
+ bigquery_source=dataprep_bigquery_data_path,
+ ts_identifier_columns=create_dataprep_args_task.outputs[
+ 'ts_identifier_columns'],
+ time_column=dataprep_time_column,
+ static_covariate_columns=create_dataprep_args_task.outputs[
+ 'static_covariate_columns'],
+ static_covariates_vocab_path='',
+ target_column=dataprep_target_column,
+ machine_type=dataflow_machine_type,
+ docker_region=create_dataprep_args_task.outputs['docker_region'],
+ location=location,
+ project=project,
+ job_id=job_id,
+ job_name_prefix='tf-records',
+ num_workers=dataflow_starting_replica_count,
+ max_num_workers=dataflow_max_replica_count,
+ disk_size_gb=dataflow_disk_size_gb,
+ test_set_only=False,
+ bigquery_output=dataprep_test_set_bigquery_dataset,
+ nan_threshold=dataprep_nan_threshold,
+ zero_threshold=dataprep_zero_threshold,
+ gcs_source=dataprep_csv_data_path,
+ gcs_static_covariate_source=dataprep_csv_static_covariates_path,
+ encryption_spec_key_name=encryption_spec_key_name
+ )
+ create_tf_records_task.set_display_name('create-tf-records')
+ set_tfrecord_args_this_run_task = (
+ SetTfrecordArgsOp(
+ dataprep_dir=create_tf_records_task.outputs['dataprep_dir'],
+ static_covariates=dataprep_static_covariate_columns))
+ with dsl.Else('skip-tf-record-generation'):
+ set_tfrecord_args_previous_run_task = (
+ MaybeSetTfrecordArgsOp(
+ dataprep_previous_run_dir=dataprep_previous_run_dir,
+ static_covariates=dataprep_static_covariate_columns))
+ set_tfrecord_args_previous_run_task.set_display_name(
+ 'set_tfrecord_args_previous_run')
+ static_covariates_vocab_path = dsl.OneOf(
+ set_tfrecord_args_previous_run_task.outputs[
+ 'static_covariates_vocab_path'],
+ set_tfrecord_args_this_run_task.outputs['static_covariates_vocab_path']
+ )
+ test_set_task = DataprepOp(
+ backcast_length=dataprep_backcast_length,
+ forecast_length=dataprep_forecast_length,
+ train_end_date=dataprep_train_end_date,
+ n_val_windows=dataprep_n_val_windows,
+ n_test_windows=dataprep_n_test_windows,
+ test_set_stride=dataprep_test_set_stride,
+ model_blocks=create_dataprep_args_task.outputs['model_blocks'],
+ bigquery_source=dataprep_bigquery_data_path,
+ ts_identifier_columns=create_dataprep_args_task.outputs[
+ 'ts_identifier_columns'],
+ time_column=dataprep_time_column,
+ static_covariate_columns=create_dataprep_args_task.outputs[
+ 'static_covariate_columns'],
+ static_covariates_vocab_path=static_covariates_vocab_path,
+ target_column=dataprep_target_column,
+ machine_type=dataflow_machine_type,
+ docker_region=create_dataprep_args_task.outputs['docker_region'],
+ location=location,
+ project=project,
+ job_id=job_id,
+ job_name_prefix='test-set',
+ num_workers=dataflow_starting_replica_count,
+ max_num_workers=dataflow_max_replica_count,
+ disk_size_gb=dataflow_disk_size_gb,
+ test_set_only=True,
+ bigquery_output=dataprep_test_set_bigquery_dataset,
+ nan_threshold=dataprep_nan_threshold,
+ zero_threshold=dataprep_zero_threshold,
+ gcs_source=dataprep_csv_data_path,
+ gcs_static_covariate_source=dataprep_csv_static_covariates_path,
+ encryption_spec_key_name=encryption_spec_key_name
+ )
+ test_set_task.set_display_name('create-test-set')
+ set_test_set_task = SetTestSetOp(
+ dataprep_dir=test_set_task.outputs['dataprep_dir'])
+ train_tf_record_patterns = dsl.OneOf(
+ set_tfrecord_args_previous_run_task.outputs['train_tf_record_patterns'],
+ set_tfrecord_args_this_run_task.outputs['train_tf_record_patterns']
+ )
+ val_tf_record_patterns = dsl.OneOf(
+ set_tfrecord_args_previous_run_task.outputs['val_tf_record_patterns'],
+ set_tfrecord_args_this_run_task.outputs['val_tf_record_patterns']
+ )
+ test_tf_record_patterns = dsl.OneOf(
+ set_tfrecord_args_previous_run_task.outputs['test_tf_record_patterns'],
+ set_tfrecord_args_this_run_task.outputs['test_tf_record_patterns']
+ )
+ trainer_task = TrainOp(
+ num_epochs=trainer_num_epochs,
+ backcast_length=dataprep_backcast_length,
+ forecast_length=dataprep_forecast_length,
+ train_end_date=dataprep_train_end_date,
+ csv_data_path=dataprep_csv_data_path,
+ csv_static_covariates_path=dataprep_csv_static_covariates_path,
+ static_covariates_vocab_path=static_covariates_vocab_path,
+ train_tf_record_patterns=train_tf_record_patterns,
+ val_tf_record_patterns=val_tf_record_patterns,
+ test_tf_record_patterns=test_tf_record_patterns,
+ n_decomposition_plots=tensorboard_n_decomposition_plots,
+ n_val_windows=dataprep_n_val_windows,
+ n_test_windows=dataprep_n_test_windows,
+ test_set_stride=dataprep_test_set_stride,
+ nan_threshold=dataprep_nan_threshold,
+ zero_threshold=dataprep_zero_threshold,
+ cleaning_activation_regularizer_coeff=trainer_cleaning_activation_regularizer_coeff,
+ change_point_activation_regularizer_coeff=trainer_change_point_activation_regularizer_coeff,
+ change_point_output_regularizer_coeff=trainer_change_point_output_regularizer_coeff,
+ alpha_upper_bound=trainer_trend_alpha_upper_bound,
+ beta_upper_bound=trainer_trend_beta_upper_bound,
+ phi_lower_bound=trainer_trend_phi_lower_bound,
+ b_fixed_val=trainer_trend_b_fixed_val,
+ b0_fixed_val=trainer_trend_b0_fixed_val,
+ phi_fixed_val=trainer_trend_phi_fixed_val,
+ quantiles=create_trainer_args_task.outputs['quantiles'],
+ use_static_covariates=create_trainer_args_task.outputs[
+ 'use_static_covariates'],
+ static_covariate_names=create_trainer_args_task.outputs[
+ 'static_covariate_names'],
+ model_blocks=create_trainer_args_task.outputs['model_blocks'],
+ freeze_point_forecasts=create_trainer_args_task.outputs[
+ 'freeze_point_forecasts'],
+ machine_type=trainer_machine_type,
+ accelerator_type=trainer_accelerator_type,
+ docker_region=create_dataprep_args_task.outputs['docker_region'],
+ location=location,
+ job_id=job_id,
+ project=project,
+ encryption_spec_key_name=encryption_spec_key_name
+ )
+ _ = UploadDecompositionPlotsOp(
+ project=project,
+ location=location,
+ tensorboard_id=tensorboard_instance_id,
+ display_name=job_id,
+ trainer_dir=trainer_task.outputs['trainer_dir'])
+ training_artifacts_task = GetTrainingArtifactsOp(
+ docker_region=create_dataprep_args_task.outputs['docker_region'],
+ trainer_dir=trainer_task.outputs['trainer_dir'])
+ model = dsl.importer(
+ artifact_uri=training_artifacts_task.outputs['artifact_uri'],
+ artifact_class=artifact_types.UnmanagedContainerModel,
+ metadata={
+ 'predictSchemata': {
+ 'instanceSchemaUri': training_artifacts_task.outputs[
+ 'instance_schema_uri'],
+ 'predictionSchemaUri': training_artifacts_task.outputs[
+ 'prediction_schema_uri'],
+ },
+ 'containerSpec': {
+ 'imageUri': training_artifacts_task.outputs['image_uri'],
+ 'healthRoute': '/health',
+ 'predictRoute': '/predict',
+ }
+ },
+ )
+ model.set_display_name('set-model')
+ upload_model_task = UploadModelOp(
+ project=project,
+ location=location,
+ display_name=job_id,
+ unmanaged_container_model=model.output,
+ encryption_spec_key_name=encryption_spec_key_name,
+ )
+ upload_model_task.set_display_name('upload-model')
+ batch_predict_task = batch_predict_job.ModelBatchPredictOp(
+ project=project,
+ location=location,
+ unmanaged_container_model=model.output,
+ job_display_name=f'batch-predict-{job_id}',
+ instances_format='bigquery',
+ predictions_format='bigquery',
+ bigquery_source_input_uri=set_test_set_task.outputs['uri'],
+ bigquery_destination_output_uri=dataprep_test_set_bigquery_dataset,
+ machine_type=dataflow_machine_type,
+ starting_replica_count=dataflow_starting_replica_count,
+ max_replica_count=dataflow_max_replica_count,
+ encryption_spec_key_name=encryption_spec_key_name,
+ generate_explanation=False,
+ )
+ batch_predict_task.set_display_name('run-batch-prediction')
+ set_eval_args_task = SetEvalArgsOp(
+ big_query_source=batch_predict_task.outputs['bigquery_output_table'],
+ quantiles=trainer_quantiles)
+ eval_task = EvaluationOp(
+ project=project,
+ location=location,
+ root_dir=test_set_task.outputs['dataprep_dir'],
+ target_field_name='HORIZON__x',
+ predictions_format='bigquery',
+ ground_truth_format='bigquery',
+ predictions_bigquery_source=batch_predict_task.outputs[
+ 'bigquery_output_table'],
+ ground_truth_bigquery_source=set_eval_args_task.outputs[
+ 'big_query_source'],
+ ground_truth_gcs_source=[],
+ forecasting_type=set_eval_args_task.outputs['forecasting_type'],
+ forecasting_quantiles=set_eval_args_task.outputs['quantiles'],
+ prediction_score_column=set_eval_args_task.outputs[
+ 'prediction_score_column'],
+ dataflow_service_account=_placeholders.SERVICE_ACCOUNT_PLACEHOLDER,
+ dataflow_machine_type=dataflow_machine_type,
+ dataflow_max_workers_num=dataflow_max_replica_count,
+ dataflow_workers_num=dataflow_starting_replica_count,
+ dataflow_disk_size=dataflow_disk_size_gb,
+ dataflow_use_public_ips=True,
+ encryption_spec_key_name=encryption_spec_key_name,
+ )
+ model_evaluation_import_component.model_evaluation_import(
+ forecasting_metrics=eval_task.outputs['evaluation_metrics'],
+ model=upload_model_task.outputs['model'],
+ dataset_type='bigquery',
+ dataset_path=set_test_set_task.outputs['uri'],
+ display_name=job_id,
+ problem_type='forecasting',
+ )
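
A minimal sketch of compiling and submitting this new pipeline, assuming hypothetical project, bucket, TensorBoard, and BigQuery values (the parameter names come from the pipeline signature above; everything else is illustrative):

from kfp import compiler
from google.cloud import aiplatform
from google_cloud_pipeline_components.preview.starry_net import starry_net

# Compile the @dsl.pipeline function to a pipeline spec.
compiler.Compiler().compile(starry_net, 'starry_net_pipeline.yaml')

aiplatform.init(project='my-project', location='us-central1',  # hypothetical values
                staging_bucket='gs://my-bucket/staging')
job = aiplatform.PipelineJob(
    display_name='starry-net-forecast',
    template_path='starry_net_pipeline.yaml',
    parameter_values={
        'tensorboard_instance_id': '1234567890',                        # hypothetical instance
        'dataprep_backcast_length': 56,
        'dataprep_forecast_length': 28,
        'dataprep_train_end_date': '2024-01-31',
        'dataprep_n_val_windows': 2,
        'dataprep_n_test_windows': 2,
        'dataprep_test_set_stride': 7,
        'dataprep_test_set_bigquery_dataset': 'bq://my-project.my_dataset',  # hypothetical
        'dataprep_csv_data_path': 'gs://my-bucket/series.csv',               # hypothetical
    },
)
job.run()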
@@ -5,7 +5,6 @@
  """Generated protocol buffer code."""
  from google.protobuf import descriptor as _descriptor
  from google.protobuf import descriptor_pool as _descriptor_pool
- from google.protobuf import runtime_version as _runtime_version
  from google.protobuf import symbol_database as _symbol_database
  from google.protobuf.internal import builder as _builder
  # @@protoc_insertion_point(imports)