google-cloud-pipeline-components 2.14.1__py3-none-any.whl → 2.15.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of google-cloud-pipeline-components might be problematic.
- google_cloud_pipeline_components/_implementation/llm/generated/refined_image_versions.py +1 -1
- google_cloud_pipeline_components/_implementation/model_evaluation/llm_evaluation_preprocessor/component.py +14 -0
- google_cloud_pipeline_components/_implementation/starry_net/__init__.py +41 -0
- google_cloud_pipeline_components/_implementation/{model_evaluation/import_evaluation → starry_net/dataprep}/__init__.py +1 -2
- google_cloud_pipeline_components/_implementation/starry_net/dataprep/component.py +159 -0
- google_cloud_pipeline_components/_implementation/starry_net/evaluation/__init__.py +13 -0
- google_cloud_pipeline_components/_implementation/starry_net/evaluation/component.py +23 -0
- google_cloud_pipeline_components/_implementation/starry_net/evaluation/evaluation.yaml +197 -0
- google_cloud_pipeline_components/_implementation/starry_net/get_training_artifacts/__init__.py +13 -0
- google_cloud_pipeline_components/_implementation/starry_net/get_training_artifacts/component.py +62 -0
- google_cloud_pipeline_components/_implementation/starry_net/maybe_set_tfrecord_args/__init__.py +13 -0
- google_cloud_pipeline_components/_implementation/starry_net/maybe_set_tfrecord_args/component.py +77 -0
- google_cloud_pipeline_components/_implementation/starry_net/set_dataprep_args/__init__.py +13 -0
- google_cloud_pipeline_components/_implementation/starry_net/set_dataprep_args/component.py +97 -0
- google_cloud_pipeline_components/_implementation/starry_net/set_eval_args/__init__.py +13 -0
- google_cloud_pipeline_components/_implementation/starry_net/set_eval_args/component.py +76 -0
- google_cloud_pipeline_components/_implementation/starry_net/set_test_set/__init__.py +13 -0
- google_cloud_pipeline_components/_implementation/starry_net/set_test_set/component.py +48 -0
- google_cloud_pipeline_components/_implementation/starry_net/set_tfrecord_args/__init__.py +13 -0
- google_cloud_pipeline_components/_implementation/starry_net/set_tfrecord_args/component.py +70 -0
- google_cloud_pipeline_components/_implementation/starry_net/set_train_args/__init__.py +13 -0
- google_cloud_pipeline_components/_implementation/starry_net/set_train_args/component.py +90 -0
- google_cloud_pipeline_components/_implementation/starry_net/train/__init__.py +13 -0
- google_cloud_pipeline_components/_implementation/starry_net/train/component.py +209 -0
- google_cloud_pipeline_components/_implementation/starry_net/upload_decomposition_plots/__init__.py +13 -0
- google_cloud_pipeline_components/_implementation/starry_net/upload_decomposition_plots/component.py +59 -0
- google_cloud_pipeline_components/_implementation/starry_net/upload_model/__init__.py +13 -0
- google_cloud_pipeline_components/_implementation/starry_net/upload_model/component.py +23 -0
- google_cloud_pipeline_components/_implementation/starry_net/upload_model/upload_model.yaml +37 -0
- google_cloud_pipeline_components/_implementation/starry_net/version.py +18 -0
- google_cloud_pipeline_components/container/utils/error_surfacing.py +45 -0
- google_cloud_pipeline_components/container/v1/model/get_model/remote_runner.py +36 -7
- google_cloud_pipeline_components/preview/llm/rlhf/component.py +3 -6
- google_cloud_pipeline_components/preview/starry_net/__init__.py +19 -0
- google_cloud_pipeline_components/preview/starry_net/component.py +443 -0
- google_cloud_pipeline_components/proto/task_error_pb2.py +0 -1
- google_cloud_pipeline_components/v1/model_evaluation/evaluation_llm_text_generation_pipeline.py +4 -0
- google_cloud_pipeline_components/version.py +1 -1
- {google_cloud_pipeline_components-2.14.1.dist-info → google_cloud_pipeline_components-2.15.0.dist-info}/METADATA +17 -20
- {google_cloud_pipeline_components-2.14.1.dist-info → google_cloud_pipeline_components-2.15.0.dist-info}/RECORD +43 -14
- {google_cloud_pipeline_components-2.14.1.dist-info → google_cloud_pipeline_components-2.15.0.dist-info}/WHEEL +1 -1
- google_cloud_pipeline_components/_implementation/model_evaluation/import_evaluation/component.py +0 -208
- {google_cloud_pipeline_components-2.14.1.dist-info → google_cloud_pipeline_components-2.15.0.dist-info}/LICENSE +0 -0
- {google_cloud_pipeline_components-2.14.1.dist-info → google_cloud_pipeline_components-2.15.0.dist-info}/top_level.txt +0 -0
google_cloud_pipeline_components/preview/starry_net/component.py
ADDED
@@ -0,0 +1,443 @@
+# Copyright 2024 The Kubeflow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Defines the pipeline for Starry Net."""
+
+from typing import List
+
+# pylint: disable=g-importing-member
+from google_cloud_pipeline_components import _placeholders
+from google_cloud_pipeline_components._implementation.starry_net import DataprepOp
+from google_cloud_pipeline_components._implementation.starry_net import EvaluationOp
+from google_cloud_pipeline_components._implementation.starry_net import GetTrainingArtifactsOp
+from google_cloud_pipeline_components._implementation.starry_net import MaybeSetTfrecordArgsOp
+from google_cloud_pipeline_components._implementation.starry_net import SetDataprepArgsOp
+from google_cloud_pipeline_components._implementation.starry_net import SetEvalArgsOp
+from google_cloud_pipeline_components._implementation.starry_net import SetTestSetOp
+from google_cloud_pipeline_components._implementation.starry_net import SetTfrecordArgsOp
+from google_cloud_pipeline_components._implementation.starry_net import SetTrainArgsOp
+from google_cloud_pipeline_components._implementation.starry_net import TrainOp
+from google_cloud_pipeline_components._implementation.starry_net import UploadDecompositionPlotsOp
+from google_cloud_pipeline_components._implementation.starry_net import UploadModelOp
+from google_cloud_pipeline_components.preview.model_evaluation import model_evaluation_import_component
+from google_cloud_pipeline_components.types import artifact_types
+from google_cloud_pipeline_components.v1 import batch_predict_job
+from kfp import dsl
+
+
+@dsl.pipeline
+def starry_net(  # pylint: disable=dangerous-default-value
+    tensorboard_instance_id: str,
+    dataprep_backcast_length: int,
+    dataprep_forecast_length: int,
+    dataprep_train_end_date: str,
+    dataprep_n_val_windows: int,
+    dataprep_n_test_windows: int,
+    dataprep_test_set_stride: int,
+    dataprep_test_set_bigquery_dataset: str,
+    dataflow_machine_type: str = 'n1-standard-16',
+    dataflow_max_replica_count: int = 50,
+    dataflow_starting_replica_count: int = 1,
+    dataflow_disk_size_gb: int = 50,
+    dataprep_csv_data_path: str = '',
+    dataprep_csv_static_covariates_path: str = '',
+    dataprep_bigquery_data_path: str = '',
+    dataprep_ts_identifier_columns: List[str] = [],
+    dataprep_time_column: str = '',
+    dataprep_target_column: str = '',
+    dataprep_static_covariate_columns: List[str] = [],
+    dataprep_previous_run_dir: str = '',
+    trainer_machine_type: str = 'n1-standard-4',
+    trainer_accelerator_type: str = 'NVIDIA_TESLA_V100',
+    trainer_num_epochs: int = 50,
+    trainer_cleaning_activation_regularizer_coeff: float = 1e3,
+    trainer_change_point_activation_regularizer_coeff: float = 1e3,
+    trainer_change_point_output_regularizer_coeff: float = 1e3,
+    trainer_trend_alpha_upper_bound: float = 0.5,
+    trainer_trend_beta_upper_bound: float = 0.2,
+    trainer_trend_phi_lower_bound: float = 0.99,
+    trainer_trend_b_fixed_val: int = -1,
+    trainer_trend_b0_fixed_val: int = -1,
+    trainer_trend_phi_fixed_val: int = -1,
+    trainer_quantiles: List[float] = [],
+    trainer_model_blocks: List[str] = [
+        'cleaning',
+        'change_point',
+        'trend',
+        'day_of_week',
+        'week_of_year',
+        'residual',
+    ],
+    tensorboard_n_decomposition_plots: int = 25,
+    encryption_spec_key_name: str = '',
+    location: str = _placeholders.LOCATION_PLACEHOLDER,
+    project: str = _placeholders.PROJECT_ID_PLACEHOLDER,
+):
+  # fmt: off
+  """Trains a STARRY-Net model.
+
+  Args:
+    tensorboard_instance_id: The tensorboard instance ID. This must be in same
+      location as the pipeline job.
+    dataprep_backcast_length: The length of the context window to feed into the
+      model.
+    dataprep_forecast_length: The length of the forecast horizon used in the
+      loss function during training and during evaluation, so that the model is
+      optimized to produce forecasts from 0 to H.
+    dataprep_train_end_date: The last date of data to use in the training and
+      validation set. All dates after a train_end_date are part of the test set.
+      If last_forecasted_date is equal to the final day forecasted in the test
+      set, then last_forecasted_date =
+      train_end_date + forecast_length + (n_test_windows * test_set_stride).
+      last_forecasted_date must be included in the dataset.
+    dataprep_n_val_windows: The number of windows to use for the val set. If 0,
+      no validation set is used.
+    dataprep_n_test_windows: The number of windows to use for the test set. Must
+      be >= 1. See note in dataprep_train_end_date.
+    dataprep_test_set_stride: The number of timestamps to roll forward
+      when constructing each window of the val and test sets. See note in
+      dataprep_train_end_date.
+    dataprep_test_set_bigquery_dataset: The bigquery dataset where the test set
+      is saved in the format bq://project.dataset. This must be in the same
+      region or multi-region as the output or staging bucket of the pipeline and
+      the dataprep_bigquery_data_path, if using a Big Query data source.
+    dataflow_machine_type: The type of machine to use for dataprep,
+      batch prediction, and evaluation jobs..
+    dataflow_max_replica_count: The maximum number of replicas to scale the
+      dataprep, batch prediction, and evaluation jobs.
+    dataflow_starting_replica_count: The number of replicas to start the
+      dataprep, batch prediction, and evaluation jobs.
+    dataflow_disk_size_gb: The disk size of dataflow workers in GB for the
+      dataprep, batch prediction, and evaluation jobs.
+    dataprep_csv_data_path: The path to the training data csv in the format
+      gs://bucket_name/sub_dir/blob_name.csv. Each row of the csv represents
+      a time series, where the column names are the dates, and the index is the
+      unique time series names.
+    dataprep_csv_static_covariates_path: The path to the static covariates csv.
+      Each row of the csv represents the static covariate values for the series,
+      where the column names are the static covariate names, and the
+      index is the unique time series names. The index values must match the
+      index values of dataprep_csv_data_path. The column values must match
+      dataprep_static_covariate_columns.
+    dataprep_bigquery_data_path: The path to the training data on BigQuery in
+      the format bq://project.dataset.table_id. You should only set this or
+      csv_data_path. This must be in the same region or multi-region as the
+      output or staging bucket of the pipeline and the
+      dataprep_test_set_bigquery_dataset.
+    dataprep_ts_identifier_columns: The list of ts_identifier columns from the
+      BigQuery data source. These columns are used to distinguish the different
+      time series, so that if multiple rows have identical ts_identifier
+      columns, the series is generated by summing the target columns for each
+      timestamp. This is only used if dataprep_bigquery_data_path is set.
+    dataprep_time_column: The time column from the BigQuery data source. This is
+      only used if dataprep_bigquery_data_path is set.
+    dataprep_target_column: The column to be forecasted from the BigQuery data
+      source. This is only used if dataprep_bigquery_data_path is set.
+    dataprep_static_covariate_columns: The list of strings of static covariate
+      names. This needs to be set if training with static covariates regardless
+      of whether you're using bigquery_data_path or csv_static_covariates_path.
+    dataprep_previous_run_dir: The dataprep dir from a previous run. Use this
+      to save time if you've already created TFRecords from your BigQuery
+      dataset with the same dataprep parameters as this run.
+    trainer_machine_type: The machine type for training. Must be compatible with
+      trainer_accelerator_type.
+    trainer_accelerator_type: The accelerator type for training.
+    trainer_num_epochs: The number of epochs to train for.
+    trainer_cleaning_activation_regularizer_coeff: The L1 regularization
+      coefficient for the anomaly detection activation in the cleaning block.
+      The larger the value, the less aggressive the cleaning, so fewer and only
+      the most extreme anomalies are detected. A rule of thumb is that this
+      value should be about the same scale of your series.
+    trainer_change_point_activation_regularizer_coeff: The L1 regularization
+      coefficient for the change point detection activation in the change point
+      block. The larger the value, the less aggressive the cleaning, so fewer
+      and only the most extreme change points are detected. A rule of thumb is
+      that this value should be a ratio of the
+      trainer_change_point_output_regularizer_coeff to determine the sparsity
+      of the changes. If you want the model to detect many small step changes
+      this number should be smaller than the
+      trainer_change_point_output_regularizer_coeff. To detect fewer large step
+      changes, this number should be about equal to or larger than the
+      trainer_change_point_output_regularizer_coeff.
+    trainer_change_point_output_regularizer_coeff: The L2 regularization
+      penalty applied to the mean lag-one difference of the cleaned output of
+      the change point block. Intutively,
+      trainer_change_point_activation_regularizer_coeff determines how many
+      steps to detect in the series, while this parameter determines how
+      aggressively to clean the detected steps. The higher this value, the more
+      aggressive the cleaning. A rule of thumb is that this value should be
+      about the same scale of your series.
+    trainer_trend_alpha_upper_bound: The upper bound for data smooth parameter
+      alpha in the trend block.
+    trainer_trend_beta_upper_bound: The upper bound for trend smooth parameter
+      beta in the trend block.
+    trainer_trend_phi_lower_bound: The lower bound for damping param phi in the
+      trend block.
+    trainer_trend_b_fixed_val: The fixed value for long term trend parameter b
+      in the trend block. If set to anything other than -1, the trend block will
+      not learn to provide estimates but use the fixed value directly.
+    trainer_trend_b0_fixed_val: The fixed value for starting short-term trend
+      parameter b0 in the trend block. If set to anything other than -1, the
+      trend block will not learn to provide estimates but use the fixed value
+      directly.
+    trainer_trend_phi_fixed_val: The fixed value for the damping parameter phi
+      in the trend block. If set to anything other than -1, the trend block will
+      not learn to provide estimates but use the fixed value directly.
+    trainer_quantiles: The list of floats representing quantiles. Leave blank if
+      only training to produce point forecasts.
+    trainer_model_blocks: The list of model blocks to use in the order they will
+      appear in the model. Possible values are `cleaning`, `change_point`,
+      `trend`, `hour_of_week`, `day_of_week`, `day_of_year`, `week_of_year`,
+      `month_of_year`, `residual`.
+    tensorboard_n_decomposition_plots: How many decomposition plots from the
+      test set to save to tensorboard.
+    encryption_spec_key_name: Customer-managed encryption key options for the
+      CustomJob. If this is set, then all resources created by the CustomJob
+      will be encrypted with the provided encryption key.
+    location: The location where the pipeline components are run.
+    project: The project where the pipeline is run. Defaults to current project.
+  """
+  job_id = dsl.PIPELINE_JOB_NAME_PLACEHOLDER
+  create_dataprep_args_task = SetDataprepArgsOp(
+      model_blocks=trainer_model_blocks,
+      ts_identifier_columns=dataprep_ts_identifier_columns,
+      static_covariate_columns=dataprep_static_covariate_columns,
+      csv_data_path=dataprep_csv_data_path,
+      previous_run_dir=dataprep_previous_run_dir,
+      location=location,
+  )
+  create_trainer_args_task = SetTrainArgsOp(
+      quantiles=trainer_quantiles,
+      model_blocks=trainer_model_blocks,
+      static_covariates=dataprep_static_covariate_columns,
+  )
+  test_set_task = DataprepOp(
+      backcast_length=dataprep_backcast_length,
+      forecast_length=dataprep_forecast_length,
+      train_end_date=dataprep_train_end_date,
+      n_val_windows=dataprep_n_val_windows,
+      n_test_windows=dataprep_n_test_windows,
+      test_set_stride=dataprep_test_set_stride,
+      model_blocks=create_dataprep_args_task.outputs['model_blocks'],
+      bigquery_source=dataprep_bigquery_data_path,
+      ts_identifier_columns=create_dataprep_args_task.outputs[
+          'ts_identifier_columns'],
+      time_column=dataprep_time_column,
+      static_covariate_columns=create_dataprep_args_task.outputs[
+          'static_covariate_columns'],
+      target_column=dataprep_target_column,
+      machine_type=dataflow_machine_type,
+      docker_region=create_dataprep_args_task.outputs['docker_region'],
+      location=location,
+      project=project,
+      job_id=job_id,
+      job_name_prefix='test-set',
+      num_workers=dataflow_starting_replica_count,
+      max_num_workers=dataflow_max_replica_count,
+      disk_size_gb=dataflow_disk_size_gb,
+      test_set_only=True,
+      bigquery_output=dataprep_test_set_bigquery_dataset,
+      gcs_source=dataprep_csv_data_path,
+      gcs_static_covariate_source=dataprep_csv_static_covariates_path,
+      encryption_spec_key_name=encryption_spec_key_name
+  )
+  test_set_task.set_display_name('create-test-set')
+  set_test_set_task = SetTestSetOp(
+      dataprep_dir=test_set_task.outputs['dataprep_dir'])
+  with dsl.If(create_dataprep_args_task.outputs['create_tf_records'] == True,  # pylint: disable=singleton-comparison
+              'create-tf-records'):
+    create_tf_records_task = DataprepOp(
+        backcast_length=dataprep_backcast_length,
+        forecast_length=dataprep_forecast_length,
+        train_end_date=dataprep_train_end_date,
+        n_val_windows=dataprep_n_val_windows,
+        n_test_windows=dataprep_n_test_windows,
+        test_set_stride=dataprep_test_set_stride,
+        model_blocks=create_dataprep_args_task.outputs['model_blocks'],
+        bigquery_source=dataprep_bigquery_data_path,
+        ts_identifier_columns=create_dataprep_args_task.outputs[
+            'ts_identifier_columns'],
+        time_column=dataprep_time_column,
+        static_covariate_columns=create_dataprep_args_task.outputs[
+            'static_covariate_columns'],
+        target_column=dataprep_target_column,
+        machine_type=dataflow_machine_type,
+        docker_region=create_dataprep_args_task.outputs['docker_region'],
+        location=location,
+        project=project,
+        job_id=job_id,
+        job_name_prefix='tf-records',
+        num_workers=dataflow_starting_replica_count,
+        max_num_workers=dataflow_max_replica_count,
+        disk_size_gb=dataflow_disk_size_gb,
+        test_set_only=False,
+        bigquery_output=dataprep_test_set_bigquery_dataset,
+        gcs_source=dataprep_csv_data_path,
+        gcs_static_covariate_source=dataprep_csv_static_covariates_path,
+        encryption_spec_key_name=encryption_spec_key_name
+    )
+    create_tf_records_task.set_display_name('create-tf-records')
+    set_tfrecord_args_this_run_task = (
+        SetTfrecordArgsOp(
+            dataprep_dir=create_tf_records_task.outputs['dataprep_dir'],
+            static_covariates=dataprep_static_covariate_columns))
+  with dsl.Else('skip-tf-record-generation'):
+    set_tfrecord_args_previous_run_task = (
+        MaybeSetTfrecordArgsOp(
+            dataprep_previous_run_dir=dataprep_previous_run_dir,
+            static_covariates=dataprep_static_covariate_columns))
+    set_tfrecord_args_previous_run_task.set_display_name(
+        'set_tfrecord_args_previous_run')
+  static_covariates_vocab_path = dsl.OneOf(
+      set_tfrecord_args_previous_run_task.outputs[
+          'static_covariates_vocab_path'],
+      set_tfrecord_args_this_run_task.outputs['static_covariates_vocab_path']
+  )
+  train_tf_record_patterns = dsl.OneOf(
+      set_tfrecord_args_previous_run_task.outputs['train_tf_record_patterns'],
+      set_tfrecord_args_this_run_task.outputs['train_tf_record_patterns']
+  )
+  val_tf_record_patterns = dsl.OneOf(
+      set_tfrecord_args_previous_run_task.outputs['val_tf_record_patterns'],
+      set_tfrecord_args_this_run_task.outputs['val_tf_record_patterns']
+  )
+  test_tf_record_patterns = dsl.OneOf(
+      set_tfrecord_args_previous_run_task.outputs['test_tf_record_patterns'],
+      set_tfrecord_args_this_run_task.outputs['test_tf_record_patterns']
+  )
+  trainer_task = TrainOp(
+      num_epochs=trainer_num_epochs,
+      backcast_length=dataprep_backcast_length,
+      forecast_length=dataprep_forecast_length,
+      train_end_date=dataprep_train_end_date,
+      csv_data_path=dataprep_csv_data_path,
+      csv_static_covariates_path=dataprep_csv_static_covariates_path,
+      static_covariates_vocab_path=static_covariates_vocab_path,
+      train_tf_record_patterns=train_tf_record_patterns,
+      val_tf_record_patterns=val_tf_record_patterns,
+      test_tf_record_patterns=test_tf_record_patterns,
+      n_decomposition_plots=tensorboard_n_decomposition_plots,
+      n_val_windows=dataprep_n_val_windows,
+      n_test_windows=dataprep_n_test_windows,
+      test_set_stride=dataprep_test_set_stride,
+      cleaning_activation_regularizer_coeff=trainer_cleaning_activation_regularizer_coeff,
+      change_point_activation_regularizer_coeff=trainer_change_point_activation_regularizer_coeff,
+      change_point_output_regularizer_coeff=trainer_change_point_output_regularizer_coeff,
+      alpha_upper_bound=trainer_trend_alpha_upper_bound,
+      beta_upper_bound=trainer_trend_beta_upper_bound,
+      phi_lower_bound=trainer_trend_phi_lower_bound,
+      b_fixed_val=trainer_trend_b_fixed_val,
+      b0_fixed_val=trainer_trend_b0_fixed_val,
+      phi_fixed_val=trainer_trend_phi_fixed_val,
+      quantiles=create_trainer_args_task.outputs['quantiles'],
+      use_static_covariates=create_trainer_args_task.outputs[
+          'use_static_covariates'],
+      static_covariate_names=create_trainer_args_task.outputs[
+          'static_covariate_names'],
+      model_blocks=create_trainer_args_task.outputs['model_blocks'],
+      freeze_point_forecasts=create_trainer_args_task.outputs[
+          'freeze_point_forecasts'],
+      machine_type=trainer_machine_type,
+      accelerator_type=trainer_accelerator_type,
+      docker_region=create_dataprep_args_task.outputs['docker_region'],
+      location=location,
+      job_id=job_id,
+      project=project,
+      encryption_spec_key_name=encryption_spec_key_name
+  )
+  _ = UploadDecompositionPlotsOp(
+      project=project,
+      location=location,
+      tensorboard_id=tensorboard_instance_id,
+      display_name=job_id,
+      trainer_dir=trainer_task.outputs['trainer_dir'])
+  training_artifacts_task = GetTrainingArtifactsOp(
+      docker_region=create_dataprep_args_task.outputs['docker_region'],
+      trainer_dir=trainer_task.outputs['trainer_dir'])
+  model = dsl.importer(
+      artifact_uri=training_artifacts_task.outputs['artifact_uri'],
+      artifact_class=artifact_types.UnmanagedContainerModel,
+      metadata={
+          'predictSchemata': {
+              'instanceSchemaUri': training_artifacts_task.outputs[
+                  'instance_schema_uri'],
+              'predictionSchemaUri': training_artifacts_task.outputs[
+                  'prediction_schema_uri'],
+          },
+          'containerSpec': {
+              'imageUri': training_artifacts_task.outputs['image_uri'],
+              'healthRoute': '/health',
+              'predictRoute': '/predict',
+          }
+      },
+  )
+  model.set_display_name('set-model')
+  upload_model_task = UploadModelOp(
+      project=project,
+      location=location,
+      display_name=job_id,
+      unmanaged_container_model=model.output,
+      encryption_spec_key_name=encryption_spec_key_name,
+  )
+  upload_model_task.set_display_name('upload-model')
+  batch_predict_task = batch_predict_job.ModelBatchPredictOp(
+      project=project,
+      location=location,
+      unmanaged_container_model=model.output,
+      job_display_name=f'batch-predict-{job_id}',
+      instances_format='bigquery',
+      predictions_format='bigquery',
+      bigquery_source_input_uri=set_test_set_task.outputs['uri'],
+      bigquery_destination_output_uri=dataprep_test_set_bigquery_dataset,
+      machine_type=dataflow_machine_type,
+      starting_replica_count=dataflow_starting_replica_count,
+      max_replica_count=dataflow_max_replica_count,
+      encryption_spec_key_name=encryption_spec_key_name,
+      generate_explanation=False,
+  )
+  batch_predict_task.set_display_name('run-batch-prediction')
+  set_eval_args_task = SetEvalArgsOp(
+      big_query_source=batch_predict_task.outputs['bigquery_output_table'],
+      quantiles=trainer_quantiles)
+  eval_task = EvaluationOp(
+      project=project,
+      location=location,
+      root_dir=test_set_task.outputs['dataprep_dir'],
+      target_field_name='HORIZON__x',
+      predictions_format='bigquery',
+      ground_truth_format='bigquery',
+      predictions_bigquery_source=batch_predict_task.outputs[
+          'bigquery_output_table'],
+      ground_truth_bigquery_source=set_eval_args_task.outputs[
+          'big_query_source'],
+      ground_truth_gcs_source=[],
+      forecasting_type=set_eval_args_task.outputs['forecasting_type'],
+      forecasting_quantiles=set_eval_args_task.outputs['quantiles'],
+      prediction_score_column=set_eval_args_task.outputs[
+          'prediction_score_column'],
+      dataflow_service_account=_placeholders.SERVICE_ACCOUNT_PLACEHOLDER,
+      dataflow_machine_type=dataflow_machine_type,
+      dataflow_max_workers_num=dataflow_max_replica_count,
+      dataflow_workers_num=dataflow_starting_replica_count,
+      dataflow_disk_size=dataflow_disk_size_gb,
+      dataflow_use_public_ips=True,
+      encryption_spec_key_name=encryption_spec_key_name,
+  )
+  model_evaluation_import_component.model_evaluation_import(
+      forecasting_metrics=eval_task.outputs['evaluation_metrics'],
+      model=upload_model_task.outputs['model'],
+      dataset_type='bigquery',
+      dataset_path=set_test_set_task.outputs['uri'],
+      display_name=job_id,
+      problem_type='forecasting',
+  )
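For orientation, the sketch below shows one way the new preview pipeline could be compiled and submitted to Vertex AI Pipelines with the KFP SDK. It is a minimal, hedged example: the `starry_net` export from `google_cloud_pipeline_components.preview.starry_net`, the project, bucket, dataset, and every parameter value are illustrative assumptions; only the parameter names come from the pipeline signature shown above.

```python
# Minimal sketch, not the package's documented usage. The import assumes the
# preview __init__ re-exports the starry_net pipeline; project, bucket,
# dataset and parameter values are hypothetical placeholders.
from google.cloud import aiplatform
from kfp import compiler

from google_cloud_pipeline_components.preview.starry_net import starry_net  # assumed export

# Compile the pipeline function to a reusable YAML template.
compiler.Compiler().compile(
    pipeline_func=starry_net,
    package_path='starry_net_pipeline.yaml',
)

aiplatform.init(project='my-project', location='us-central1')  # hypothetical
job = aiplatform.PipelineJob(
    display_name='starry-net-demo',
    template_path='starry_net_pipeline.yaml',
    pipeline_root='gs://my-bucket/pipeline-root',  # hypothetical bucket
    parameter_values={
        # Required parameters from the signature above; values are made up.
        'tensorboard_instance_id': '1234567890',
        'dataprep_backcast_length': 28,
        'dataprep_forecast_length': 7,
        'dataprep_train_end_date': '2024-01-31',
        'dataprep_n_val_windows': 2,
        'dataprep_n_test_windows': 1,
        'dataprep_test_set_stride': 7,
        'dataprep_test_set_bigquery_dataset': 'bq://my-project.starry_net_test',
        # One of the two data sources; the CSV layout is described in the docstring.
        'dataprep_csv_data_path': 'gs://my-bucket/data/series.csv',
    },
)
job.submit()
```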
google_cloud_pipeline_components/proto/task_error_pb2.py
CHANGED
@@ -5,7 +5,6 @@
 """Generated protocol buffer code."""
 from google.protobuf import descriptor as _descriptor
 from google.protobuf import descriptor_pool as _descriptor_pool
-from google.protobuf import runtime_version as _runtime_version
 from google.protobuf import symbol_database as _symbol_database
 from google.protobuf.internal import builder as _builder
 # @@protoc_insertion_point(imports)
google_cloud_pipeline_components/v1/model_evaluation/evaluation_llm_text_generation_pipeline.py
CHANGED
@@ -38,6 +38,7 @@ def evaluation_llm_text_generation_pipeline( # pylint: disable=dangerous-defaul
     batch_predict_gcs_destination_output_uri: str,
     model_name: str = 'publishers/google/models/text-bison@002',
     evaluation_task: str = 'text-generation',
+    role_field_name: str = 'role',
     input_field_name: str = 'input_text',
     target_field_name: str = 'output_text',
     batch_predict_instances_format: str = 'jsonl',
@@ -76,6 +77,7 @@ def evaluation_llm_text_generation_pipeline( # pylint: disable=dangerous-defaul
     batch_predict_gcs_destination_output_uri: Required. The Google Cloud Storage location of the directory where the eval pipeline output is to be written to.
     model_name: The Model name used to run evaluation. Must be a publisher Model or a managed Model sharing the same ancestor location. Starting this job has no impact on any existing deployments of the Model and their resources.
     evaluation_task: The task that the large language model will be evaluated on. The evaluation component computes a set of metrics relevant to that specific task. Currently supported tasks are: `summarization`, `question-answering`, `text-generation`.
+    role_field_name: The field name of the role for input eval dataset instances that contains the input prompts to the LLM.
     input_field_name: The field name of the input eval dataset instances that contains the input prompts to the LLM.
     target_field_name: The field name of the eval dataset instance that contains an example reference text response. Alternatively referred to as the ground truth (or ground_truth_column) field. If not set, defaulted to `output_text`.
     batch_predict_instances_format: The format in which instances are given, must be one of the Model's supportedInputStorageFormats. Only "jsonl" is currently supported. For more details about this input config, see https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.batchPredictionJobs#InputConfig.
@@ -124,6 +126,8 @@ def evaluation_llm_text_generation_pipeline( # pylint: disable=dangerous-defaul
       location=location,
       gcs_source_uris=batch_predict_gcs_source_uris,
      input_field_name=input_field_name,
+      role_field_name=role_field_name,
+      model_name=model_name,
       machine_type=machine_type,
       service_account=service_account,
       network=network,
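The new `role_field_name` parameter names a per-instance role field in the evaluation dataset, read alongside the existing input and target fields. A hedged illustration of a single JSONL instance under the default field names (the field values themselves are made up):

```python
# Hypothetical eval-dataset row; only the field names come from the pipeline
# signature above ('role', 'input_text', 'output_text' are the defaults).
import json

instance = {
    'role': 'user',                                     # read via role_field_name
    'input_text': 'Summarize: the quick brown fox...',  # read via input_field_name
    'output_text': 'A fox jumps over a dog.',           # read via target_field_name
}
print(json.dumps(instance))
```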
{google_cloud_pipeline_components-2.14.1.dist-info → google_cloud_pipeline_components-2.15.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: google-cloud-pipeline-components
-Version: 2.14.1
+Version: 2.15.0
 Summary: This SDK enables a set of First Party (Google owned) pipeline components that allow users to take their experience from Vertex AI SDK and other Google Cloud services and create a corresponding pipeline using KFP or Managed Pipelines.
 Home-page: https://github.com/kubeflow/pipelines/tree/master/components/google-cloud
 Author: The Google Cloud Pipeline Components authors
@@ -10,7 +10,6 @@ Project-URL: User Documentation, https://cloud.google.com/vertex-ai/docs/pipelin
 Project-URL: Reference Documentation, https://google-cloud-pipeline-components.readthedocs.io/
 Project-URL: Source, https://github.com/kubeflow/pipelines/tree/master/components/google-cloud
 Project-URL: Release Notes, https://github.com/kubeflow/pipelines/tree/master/components/google-cloud/RELEASE.md
-Platform: UNKNOWN
 Classifier: Development Status :: 4 - Beta
 Classifier: Operating System :: Unix
 Classifier: Operating System :: MacOS
@@ -31,24 +30,24 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Requires-Python: >=3.8.0,<3.12.0
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist:
-Requires-Dist:
-Requires-Dist: google-cloud-aiplatform
-Requires-Dist:
+Requires-Dist: Jinja2 <4,>=3.1.2
+Requires-Dist: google-api-core !=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5
+Requires-Dist: google-cloud-aiplatform <2,>=1.14.0
+Requires-Dist: kfp <=2.7.0,>=2.6.0
 Provides-Extra: docs
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist: sphinx-immaterial
-Requires-Dist: sphinx-
-Requires-Dist:
-Requires-Dist: sphinx
+Requires-Dist: autodocsumm ==0.2.9 ; extra == 'docs'
+Requires-Dist: commonmark ==0.9.1 ; extra == 'docs'
+Requires-Dist: grpcio-status <=1.47.0 ; extra == 'docs'
+Requires-Dist: m2r2 ==0.3.3.post2 ; extra == 'docs'
+Requires-Dist: protobuf <5,>=4.21.1 ; extra == 'docs'
+Requires-Dist: sphinx-immaterial ==0.9.0 ; extra == 'docs'
+Requires-Dist: sphinx-notfound-page ==0.8.3 ; extra == 'docs'
+Requires-Dist: sphinx-rtd-theme ==2.0.0 ; extra == 'docs'
+Requires-Dist: sphinx <6.0.0,>=5.0.2 ; extra == 'docs'
 Provides-Extra: tests
-Requires-Dist:
-Requires-Dist:
-Requires-Dist: pytest
+Requires-Dist: flake8 >=3.0.0 ; extra == 'tests'
+Requires-Dist: mock >=4.0.0 ; extra == 'tests'
+Requires-Dist: pytest >=6.0.0 ; extra == 'tests'

 # Google Cloud Pipeline Components

@@ -93,5 +92,3 @@ Use the following command to install Google Cloud Pipeline Components from [PyPI
 ```shell
 pip install -U google-cloud-pipeline-components
 ```
-
-