google-cloud-pipeline-components 2.13.1__py3-none-any.whl → 2.14.1__py3-none-any.whl
This diff shows the changes between publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release.
This version of google-cloud-pipeline-components might be problematic.
- google_cloud_pipeline_components/__init__.py +5 -6
- google_cloud_pipeline_components/_implementation/llm/deployment_graph.py +12 -34
- google_cloud_pipeline_components/_implementation/llm/env.py +1 -1
- google_cloud_pipeline_components/_implementation/llm/function_based.py +14 -48
- google_cloud_pipeline_components/_implementation/llm/generated/refined_image_versions.py +1 -1
- google_cloud_pipeline_components/_implementation/llm/infer_preprocessor.py +109 -0
- google_cloud_pipeline_components/_implementation/llm/online_evaluation_pairwise.py +8 -0
- google_cloud_pipeline_components/_implementation/llm/reinforcement_learning_graph.py +27 -36
- google_cloud_pipeline_components/_implementation/llm/reward_model_graph.py +31 -47
- google_cloud_pipeline_components/_implementation/llm/rlhf_preprocessor.py +84 -0
- google_cloud_pipeline_components/_implementation/llm/validate_pipeline.py +11 -0
- google_cloud_pipeline_components/_implementation/model_evaluation/__init__.py +0 -12
- google_cloud_pipeline_components/_implementation/model_evaluation/llm_embedding/evaluation_llm_embedding_pipeline.py +2 -1
- google_cloud_pipeline_components/_placeholders.py +30 -1
- google_cloud_pipeline_components/preview/automl/forecasting/forecasting_ensemble.py +1 -1
- google_cloud_pipeline_components/preview/automl/forecasting/forecasting_stage_1_tuner.py +2 -2
- google_cloud_pipeline_components/preview/automl/forecasting/forecasting_stage_2_tuner.py +2 -2
- google_cloud_pipeline_components/preview/automl/forecasting/learn_to_learn_forecasting_pipeline.yaml +34 -34
- google_cloud_pipeline_components/preview/automl/forecasting/sequence_to_sequence_forecasting_pipeline.yaml +34 -34
- google_cloud_pipeline_components/preview/automl/forecasting/temporal_fusion_transformer_forecasting_pipeline.yaml +34 -34
- google_cloud_pipeline_components/preview/automl/forecasting/time_series_dense_encoder_forecasting_pipeline.yaml +34 -34
- google_cloud_pipeline_components/preview/automl/tabular/auto_feature_engineering.py +1 -1
- google_cloud_pipeline_components/preview/automl/tabular/automl_tabular_feature_selection_pipeline.yaml +39 -39
- google_cloud_pipeline_components/preview/automl/tabular/automl_tabular_v2_pipeline.yaml +41 -41
- google_cloud_pipeline_components/preview/automl/tabular/distillation_stage_feature_transform_engine.py +2 -2
- google_cloud_pipeline_components/preview/automl/tabular/feature_selection.py +2 -2
- google_cloud_pipeline_components/preview/automl/tabular/feature_selection_pipeline.yaml +4 -4
- google_cloud_pipeline_components/preview/automl/tabular/feature_transform_engine.py +3 -3
- google_cloud_pipeline_components/preview/automl/tabular/tabnet_hyperparameter_tuning_job.py +2 -2
- google_cloud_pipeline_components/preview/automl/tabular/tabnet_hyperparameter_tuning_job_pipeline.yaml +17 -17
- google_cloud_pipeline_components/preview/automl/tabular/tabnet_trainer.py +2 -2
- google_cloud_pipeline_components/preview/automl/tabular/tabnet_trainer_pipeline.yaml +15 -15
- google_cloud_pipeline_components/preview/automl/tabular/wide_and_deep_hyperparameter_tuning_job.py +2 -2
- google_cloud_pipeline_components/preview/automl/tabular/wide_and_deep_hyperparameter_tuning_job_pipeline.yaml +16 -16
- google_cloud_pipeline_components/preview/automl/tabular/wide_and_deep_trainer.py +2 -2
- google_cloud_pipeline_components/preview/automl/tabular/wide_and_deep_trainer_pipeline.yaml +15 -15
- google_cloud_pipeline_components/preview/automl/tabular/xgboost_hyperparameter_tuning_job_pipeline.yaml +14 -14
- google_cloud_pipeline_components/preview/automl/tabular/xgboost_trainer_pipeline.yaml +13 -13
- google_cloud_pipeline_components/preview/automl/vision/data_converter.py +3 -1
- google_cloud_pipeline_components/preview/custom_job/component.py +2 -2
- google_cloud_pipeline_components/preview/custom_job/utils.py +3 -2
- google_cloud_pipeline_components/preview/llm/infer/component.py +22 -25
- google_cloud_pipeline_components/preview/llm/rlhf/component.py +72 -10
- google_cloud_pipeline_components/preview/model_evaluation/__init__.py +5 -2
- google_cloud_pipeline_components/preview/model_evaluation/model_evaluation_import_component.py +209 -0
- google_cloud_pipeline_components/proto/task_error_pb2.py +33 -0
- google_cloud_pipeline_components/proto/template_metadata_pb2.py +22 -15
- google_cloud_pipeline_components/v1/automl/forecasting/bqml_arima_predict_pipeline.yaml +10 -10
- google_cloud_pipeline_components/v1/automl/forecasting/bqml_arima_train_pipeline.yaml +31 -31
- google_cloud_pipeline_components/v1/automl/forecasting/prophet_predict_pipeline.yaml +13 -13
- google_cloud_pipeline_components/v1/automl/forecasting/prophet_trainer.py +13 -3
- google_cloud_pipeline_components/v1/automl/forecasting/prophet_trainer_pipeline.yaml +18 -15
- google_cloud_pipeline_components/v1/automl/tabular/automl_tabular_pipeline.yaml +37 -37
- google_cloud_pipeline_components/v1/automl/tabular/cv_trainer.py +2 -2
- google_cloud_pipeline_components/v1/automl/tabular/ensemble.py +2 -2
- google_cloud_pipeline_components/v1/automl/tabular/finalizer.py +1 -1
- google_cloud_pipeline_components/v1/automl/tabular/infra_validator.py +1 -1
- google_cloud_pipeline_components/v1/automl/tabular/split_materialized_data.py +1 -1
- google_cloud_pipeline_components/v1/automl/tabular/stage_1_tuner.py +2 -2
- google_cloud_pipeline_components/v1/automl/tabular/stats_and_example_gen.py +2 -2
- google_cloud_pipeline_components/v1/automl/tabular/training_configurator_and_validator.py +1 -1
- google_cloud_pipeline_components/v1/automl/tabular/transform.py +2 -2
- google_cloud_pipeline_components/v1/model_evaluation/__init__.py +3 -1
- google_cloud_pipeline_components/v1/model_evaluation/classification_component.py +2 -2
- google_cloud_pipeline_components/v1/model_evaluation/error_analysis_pipeline.py +8 -10
- google_cloud_pipeline_components/v1/model_evaluation/evaluated_annotation_pipeline.py +2 -2
- google_cloud_pipeline_components/v1/model_evaluation/evaluation_automl_tabular_feature_attribution_pipeline.py +2 -2
- google_cloud_pipeline_components/v1/model_evaluation/evaluation_automl_tabular_pipeline.py +2 -2
- google_cloud_pipeline_components/v1/model_evaluation/evaluation_automl_unstructure_data_pipeline.py +2 -2
- google_cloud_pipeline_components/v1/model_evaluation/evaluation_feature_attribution_pipeline.py +2 -2
- google_cloud_pipeline_components/v1/model_evaluation/evaluation_llm_classification_pipeline.py +4 -2
- google_cloud_pipeline_components/v1/model_evaluation/evaluation_llm_text_generation_pipeline.py +4 -2
- google_cloud_pipeline_components/{preview → v1}/model_evaluation/model_based_llm_evaluation/__init__.py +2 -2
- google_cloud_pipeline_components/{preview → v1}/model_evaluation/model_based_llm_evaluation/autosxs/autosxs_pipeline.py +1 -0
- google_cloud_pipeline_components/version.py +1 -1
- {google_cloud_pipeline_components-2.13.1.dist-info → google_cloud_pipeline_components-2.14.1.dist-info}/METADATA +18 -19
- {google_cloud_pipeline_components-2.13.1.dist-info → google_cloud_pipeline_components-2.14.1.dist-info}/RECORD +81 -79
- {google_cloud_pipeline_components-2.13.1.dist-info → google_cloud_pipeline_components-2.14.1.dist-info}/WHEEL +1 -1
- google_cloud_pipeline_components/proto/preflight_validations_pb2.py +0 -47
- /google_cloud_pipeline_components/{preview → v1}/model_evaluation/model_based_llm_evaluation/autosxs/__init__.py +0 -0
- {google_cloud_pipeline_components-2.13.1.dist-info → google_cloud_pipeline_components-2.14.1.dist-info}/LICENSE +0 -0
- {google_cloud_pipeline_components-2.13.1.dist-info → google_cloud_pipeline_components-2.14.1.dist-info}/top_level.txt +0 -0
google_cloud_pipeline_components/_implementation/llm/reward_model_graph.py

@@ -21,12 +21,12 @@ from google_cloud_pipeline_components._implementation.llm import function_based
 from google_cloud_pipeline_components._implementation.llm import preprocess_chat_dataset
 from google_cloud_pipeline_components._implementation.llm import private_text_comparison_importer
 from google_cloud_pipeline_components._implementation.llm import reward_model_trainer
+from google_cloud_pipeline_components._implementation.llm import rlhf_preprocessor
 from google_cloud_pipeline_components._implementation.llm import upload_tensorboard_metrics
 import kfp

 PipelineOutput = NamedTuple(
     'Outputs',
-    reward_model_base_path=str,
     reward_model_adapter_path=str,
     reward_dataset_path=str,
 )

@@ -39,6 +39,14 @@ PipelineOutput = NamedTuple(
 def pipeline(
     preference_dataset: str,
     large_model_reference: str,
+    reward_model_reference: str,
+    reward_model_path: str,
+    machine_type: str,
+    tuning_location: str,
+    accelerator_type: str,
+    accelerator_count: int,
+    reward_model_image_uri: str,
+    comma_separated_candidates_field_names: str,
     prompt_sequence_length: int = 512,
     target_sequence_length: int = 64,
     batch_size: int = 64,

@@ -48,10 +56,10 @@ def pipeline(
     eval_dataset: Optional[str] = None,
     instruction: Optional[str] = None,
     project: str = _placeholders.PROJECT_ID_PLACEHOLDER,
-    accelerator_type: str = 'GPU',
     location: str = _placeholders.LOCATION_PLACEHOLDER,
     tensorboard_resource_id: str = '',
     encryption_spec_key_name: str = '',
+    num_microbatches: int = 0,
 ) -> PipelineOutput:
   # fmt: off
   """Trains a reward model.

@@ -59,6 +67,14 @@
   Args:
     preference_dataset: Cloud storage path to a human preference JSONL dataset used to train a reward model. Each example in a preference dataset must contain `candidate_0` and `candidate_1` fields that contain candidate responses, `choice` that specifies the preferred candidate and either `input_text` (if tuning a text model) or `messages` (if tuning a chat model). Chat datasets must contain at least 1 message in a `messages` field. Each message must be valid JSON that contains `author` and `content` fields, where valid `author` values are `user` and `assistant` and `content` must be non-empty. Each row may contain multiple messages, but the first and last author must be the `user`. An optional `context` field may be provided for each example in a chat dataset. If provided, the `context` will preprended to the message `content`. The `instruction` serves as the default context. (Useful if most messages use the same system-level context.) Any context provided in the example will override the default value.
     large_model_reference: Name of the base model. Supported values are `text-bison@001`, `t5-small`, `t5-large`, `t5-xl` and `t5-xxl`. `text-bison@001` and `t5-small` are supported in `us-central1` and `europe-west4`. `t5-large`, `t5-xl` and `t5-xxl` are only supported in `europe-west4`.
+    reward_model_reference: Name of the base model. The name should be in capitalized snake case format.
+    reward_model_path: The model checkpoint path for the reward model.
+    machine_type: The type of the machine to provision for the custom job. Must be a valid GCE instance type and compatible with the accelerator type.
+    tuning_location: The GCP region to run the custom job.
+    accelerator_type: Specific accelerator type for the custom job.
+    accelerator_count: The number of accelerator.
+    reward_model_image_uri: Docker image URI to use for the reward model training job.
+    comma_separated_candidates_field_names: Comma separated list of fields that contain candidate text, e.g. ``'field_1,field_2,field_3'``.
     prompt_sequence_length: Maximum tokenized sequence length for input text. Higher values increase memory overhead. This value should be at most 8192. Default value is 512.
     target_sequence_length: Maximum tokenized sequence length for target text. Higher values increase memory overhead. This value should be at most 1024. Default value is 64.
     batch_size: Number of examples in each finetuning step. Default is 64.

@@ -67,28 +83,18 @@
     reward_model_train_steps: Number of steps to use when training a reward model. Default value is 1000.
     instruction: This field lets the model know what task it needs to perform. Base models have been trained over a large set of varied instructions. You can give a simple and intuitive description of the task and the model will follow it, e.g. "Classify this movie review as positive or negative" or "Translate this sentence to Danish". Do not specify this if your dataset already prepends the instruction to the inputs field.
     project: Project used to run custom jobs. If not specified the project used to run the pipeline will be used.
-    accelerator_type: One of 'TPU' or 'GPU'. If 'TPU' is specified, tuning components run in europe-west4. Otherwise tuning components run in us-central1 on GPUs. Default is 'GPU'.
     location: Location used to run non-tuning components, i.e. components that do not require accelerators. If not specified the location used to run the pipeline will be used.
     tensorboard_resource_id: Optional tensorboard resource id in format `projects/{project_number}/locations/{location}/tensorboards/{tensorboard_id}`. If provided, tensorboard metrics will be uploaded to this location.
     encryption_spec_key_name: Customer-managed encryption key. If this is set, then all resources created by the CustomJob will be encrypted with the provided encryption key. Note that this is not supported for TPU at the moment.
+    num_microbatches: The number of microbatches to break the total batch size into during training.

   Returns:
-    reward_model_base_path: Path to the base model used by the reward model.
     reward_model_adapter_path: Path to the output LoRA adapter.
     reward_dataset_path: Preference dataset use for tuning the reward model.
   """
   # fmt: on
   prompt_column = 'input_text'
-  candidate_columns = ['candidate_0', 'candidate_1']
   choice_column = 'choice'
-  machine_spec = function_based.resolve_machine_spec(
-      accelerator_type=accelerator_type,
-      use_test_spec=env.get_use_test_machine_spec(),
-  ).set_display_name('Resolve Machine Spec')
-
-  reference_model_metadata = function_based.resolve_reference_model_metadata(
-      large_model_reference=large_model_reference,
-  ).set_display_name('Resolve Model Metadata')

   processed_preference_dataset = (
       preprocess_chat_dataset.preprocess_chat_dataset(

@@ -99,9 +105,6 @@
       ).set_display_name('Preprocess Prompt Dataset')
   )

-  comma_separated_candidates_field_names = (
-      function_based.convert_to_delimited_string(items=candidate_columns)
-  )
   preference_dataset_importer = (
       private_text_comparison_importer.private_text_comparison_importer(
           project=project,

@@ -110,12 +113,10 @@
               'processed_dataset_uri'
           ],
           inputs_field_name=prompt_column,
-          comma_separated_candidates_field_names=comma_separated_candidates_field_names.output,
+          comma_separated_candidates_field_names=comma_separated_candidates_field_names,
           choice_field_name=choice_column,
           split=env.TRAIN_SPLIT,
-          large_model_reference=reference_model_metadata.outputs[
-              'reward_model_reference'
-          ],
+          large_model_reference=reward_model_reference,
           instruction=instruction,
           encryption_spec_key_name=encryption_spec_key_name,
       )

@@ -129,12 +130,10 @@
           location=location,
           input_text=eval_dataset,
           inputs_field_name=prompt_column,
-          comma_separated_candidates_field_names=comma_separated_candidates_field_names.output,
+          comma_separated_candidates_field_names=comma_separated_candidates_field_names,
           choice_field_name=choice_column,
           split=env.TRAIN_SPLIT,
-          large_model_reference=reference_model_metadata.outputs[
-              'reward_model_reference'
-          ],
+          large_model_reference=reward_model_reference,
           instruction=instruction,
           encryption_spec_key_name=encryption_spec_key_name,
       )

@@ -142,21 +141,11 @@
       .set_caching_options(False)
   )

-  reward_model_image_uri = function_based.resolve_private_refined_image_uri(
-      accelerator_type=machine_spec.outputs['accelerator_type'],
-  ).set_display_name('Resolve Reward Model Image URI')
-  num_microbatches = function_based.resolve_num_microbatches(
-      large_model_reference=reference_model_metadata.outputs[
-          'reward_model_reference'
-      ]
-  ).set_display_name('Resolve Number of Microbatches')
   reward_model = (
       reward_model_trainer.reward_model_trainer(
           project=project,
-          location=machine_spec.outputs['tuning_location'],
-          input_model_path=reference_model_metadata.outputs[
-              'reward_model_path'
-          ],
+          location=tuning_location,
+          input_model_path=reward_model_path,
           input_dataset_path=preference_dataset_importer.outputs[
               'output_dataset_path'
           ],

@@ -164,19 +153,17 @@
               'output_dataset_path'
           ],
           train_steps=reward_model_train_steps,
-          accelerator_type=machine_spec.outputs['accelerator_type'],
-          accelerator_count=machine_spec.outputs['accelerator_count'],
-          large_model_reference=reference_model_metadata.outputs[
-              'reward_model_reference'
-          ],
-          machine_type=machine_spec.outputs['machine_type'],
-          image_uri=reward_model_image_uri.output,
+          accelerator_type=accelerator_type,
+          accelerator_count=accelerator_count,
+          large_model_reference=reward_model_reference,
+          machine_type=machine_type,
+          image_uri=reward_model_image_uri,
           inputs_sequence_length=prompt_sequence_length,
           targets_sequence_length=target_sequence_length,
           batch_size=batch_size,
           learning_rate_multiplier=reward_model_learning_rate_multiplier,
           lora_dim=lora_dim,
-          num_microbatches=num_microbatches.output,
+          num_microbatches=num_microbatches,
           encryption_spec_key_name=encryption_spec_key_name,
           tensorboard_resource_id=tensorboard_resource_id,
       )

@@ -185,9 +172,6 @@
   )

   return PipelineOutput(
-      reward_model_base_path=reference_model_metadata.outputs[
-          'reward_model_path'
-      ],
      reward_model_adapter_path=reward_model.outputs['output_adapter_path'],
      reward_dataset_path=preference_dataset_importer.outputs[
          'output_dataset_path'
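
Taken together, these hunks invert the reward-model graph's dependency flow: machine specs, reference-model metadata, the training image URI, and the microbatch count are no longer resolved by `function_based` helper tasks inside the graph, but arrive as plain pipeline parameters (resolved upstream by the new `rlhf_preprocessor` component). A minimal sketch of calling the updated graph from an outer pipeline; every literal below is an illustrative assumption, not a shipped default:

    from google_cloud_pipeline_components._implementation.llm import reward_model_graph

    # All literals are hypothetical stand-ins for values that the
    # rlhf_preprocessor component resolves at runtime in 2.14.1.
    reward_model = reward_model_graph.pipeline(
        preference_dataset='gs://my-bucket/preference.jsonl',  # assumed path
        large_model_reference='text-bison@001',
        reward_model_reference='T5_XL',                  # assumed, capitalized snake case
        reward_model_path='gs://my-bucket/reward-ckpt',  # assumed checkpoint path
        machine_type='a2-highgpu-8g',                    # assumed GCE machine type
        tuning_location='europe-west4',
        accelerator_type='NVIDIA_TESLA_A100',            # assumed accelerator
        accelerator_count=8,                             # assumed count
        reward_model_image_uri='us-docker.pkg.dev/my-proj/my-repo/refined:tag',  # assumed
        comma_separated_candidates_field_names='candidate_0,candidate_1',
    )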

google_cloud_pipeline_components/_implementation/llm/rlhf_preprocessor.py

@@ -14,6 +14,7 @@
 """Component that preprocesses inputs for Reinforcement Learning from Human Feedback (RLHF)."""

 import os
+from typing import List

 from google_cloud_pipeline_components import _placeholders
 from google_cloud_pipeline_components import utils as gcpc_utils

@@ -23,24 +24,80 @@ from kfp import dsl

 @dsl.container_component
 def rlhf_preprocessor(
+    large_model_reference: str,
+    accelerator_type: str,
+    use_test_spec: bool,
+    project: str,
+    location: str,
+    artifact_registry: str,
+    tag: str,
     gcp_resources: dsl.OutputPath(str),  # pytype: disable=invalid-annotation
     has_tensorboard_id: dsl.OutputPath(bool),  # pytype: disable=invalid-annotation
     has_inference_dataset: dsl.OutputPath(bool),  # pytype: disable=invalid-annotation
+    metadata_candidate_columns_string: dsl.OutputPath(str),  # pytype: disable=invalid-annotation
+    metadata_large_model_reference: dsl.OutputPath(str),  # pytype: disable=invalid-annotation
+    metadata_reference_model_path: dsl.OutputPath(str),  # pytype: disable=invalid-annotation
+    metadata_reward_model_reference: dsl.OutputPath(str),  # pytype: disable=invalid-annotation
+    metadata_reward_model_path: dsl.OutputPath(str),  # pytype: disable=invalid-annotation
+    metadata_machine_type: dsl.OutputPath(str),  # pytype: disable=invalid-annotation
+    metadata_tuning_location: dsl.OutputPath(str),  # pytype: disable=invalid-annotation
+    metadata_accelerator_type: dsl.OutputPath(str),  # pytype: disable=invalid-annotation
+    metadata_accelerator_count: dsl.OutputPath(int),  # pytype: disable=invalid-annotation
+    metadata_refined_image_uri: dsl.OutputPath(str),  # pytype: disable=invalid-annotation
+    metadata_num_microbatches: dsl.OutputPath(int),  # pytype: disable=invalid-annotation
+    metadata_upload_location: dsl.OutputPath(str),  # pytype: disable=invalid-annotation
+    metadata_deploy_model: dsl.OutputPath(bool),  # pytype: disable=invalid-annotation
+    metadata_model_display_name: dsl.OutputPath(str),  # pytype: disable=invalid-annotation
+    metadata_upload_model: dsl.OutputPath(bool),  # pytype: disable=invalid-annotation
+    use_experimental_image: bool = False,
     evaluation_dataset: str = '',
     tensorboard_resource_id: str = '',
+    input_reference_model_path: str = '',
     image_uri: str = utils.get_default_image_uri('refined_cpu', ''),
+    upload_location: str = '',
+    model_display_name: str = '',
+    deploy_model: bool = True,
 ) -> dsl.ContainerSpec:  # pylint: disable=g-doc-args
+  # fmt: off
   """Preprocess RLHF pipeline inputs.

   Args:
+    large_model_reference: The model for fine tuning.
+    accelerator_type: Specific accelerator type for the job.
+    use_test_spec: Whether to use a lower resource machine for testing.
+    project: Project that contains the artifact registry.
+    location: Region that contains the artifact registry.
+    artifact_registry: Registry that contains Docker images.
+    tag: Image tag.
+    use_experimental_image: Whether to use refined experimental image.
     evaluation_dataset: Path to evaluation data.
     tensorboard_resource_id: TensorBoard resource id.
+    metadata_large_model_reference: The base model for fine tuning. The name should be in capitalized snake case format.
+    metadata_reference_model_path: The model checkpoint path for the reinforcer model
+    metadata_reward_model_reference: The base model for training reward model. The name should be in capitalized snake case format.
+    metadata_reward_model_path: The model checkpoint path for the reward model.
+    image_uri: Docker image URI to use for the custom job.
+    upload_location: Region where the model will be uploaded.
+    model_display_name: Display name of the model.
+    deploy_model: Whether to deploy the model.

   Returns:
     gcp_resources: GCP resources that can be used to track the custom job.
     has_tensorboard_id: Whether a tensorboard id is provided.
     has_inference_dataset: Whether inference data are provided.
+    metadata_machine_type: The type of the machine to provision for the custom job.
+    metadata_tuning_location: The GCP region to run the custom job.
+    metadata_accelerator_type: Specific accelerator type for the custom job.
+    metadata_accelerator_count: The number of accelerator.
+    metadata_refined_image_uri: Docker image URI to use for the custom job.
+    metadata_num_microbatches: Number of microbatches to break the total batch
+      size into during training.
+    metadata_upload_location: Regional endpoint.
+    metadata_deploy_model: Whether to deploy the model.
+    metadata_model_display_name: Display name of the model.
+    metadata_upload_model: Whether to upload the model.
   """
+  # fmt: on
   return gcpc_utils.build_serverless_customjob_container_spec(
       project=_placeholders.PROJECT_ID_PLACEHOLDER,
       location=_placeholders.LOCATION_PLACEHOLDER,

@@ -52,8 +109,35 @@ def rlhf_preprocessor(
             '--app_name=rlhf_preprocessor',
             f'--evaluation_dataset={evaluation_dataset}',
             f'--tensorboard_resource_id={tensorboard_resource_id}',
+            f'--large_model_reference={large_model_reference}',
+            f'--input_reference_model_path={input_reference_model_path}',
+            f'--accelerator_type={accelerator_type}',
+            f'--use_test_spec={use_test_spec}',
+            f'--project={project}',
+            f'--location={location}',
+            f'--artifact_registry={artifact_registry}',
+            f'--tag={tag}',
+            f'--use_experimental_image={use_experimental_image}',
+            f'--upload_location={upload_location}',
+            f'--deploy_model={deploy_model}',
+            f'--model_display_name={model_display_name}',
             f'--has_tensorboard_id_path={has_tensorboard_id}',
             f'--has_inference_dataset_path={has_inference_dataset}',
+            f'--metadata_candidate_columns_string_path={metadata_candidate_columns_string}',
+            f'--metadata_large_model_reference_path={metadata_large_model_reference}',
+            f'--metadata_reference_model_path_path={metadata_reference_model_path}',
+            f'--metadata_reward_model_reference_path={metadata_reward_model_reference}',
+            f'--metadata_reward_model_path_path={metadata_reward_model_path}',
+            f'--metadata_machine_type_path={metadata_machine_type}',
+            f'--metadata_tuning_location_path={metadata_tuning_location}',
+            f'--metadata_accelerator_type_path={metadata_accelerator_type}',
+            f'--metadata_accelerator_count_path={metadata_accelerator_count}',
+            f'--metadata_refined_image_uri_path={metadata_refined_image_uri}',
+            f'--metadata_num_microbatches_path={metadata_num_microbatches}',
+            f'--metadata_upload_location_path={metadata_upload_location}',
+            f'--metadata_deploy_model_path={metadata_deploy_model}',
+            f'--metadata_model_display_name_path={metadata_model_display_name}',
+            f'--metadata_upload_model_path={metadata_upload_model}',
         ],
     ),
     gcp_resources=gcp_resources,
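
Each `dsl.OutputPath` parameter on the component becomes a named task output in KFP, so downstream graph components can consume the resolved values via `task.outputs[...]`. A hedged wiring sketch; the pipeline name and the project/registry values are hypothetical:

    from kfp import dsl
    from google_cloud_pipeline_components._implementation.llm import rlhf_preprocessor

    @dsl.pipeline(name='rlhf-preprocessor-demo')  # hypothetical pipeline
    def demo():
      preprocess_task = rlhf_preprocessor.rlhf_preprocessor(
          large_model_reference='text-bison@001',
          accelerator_type='GPU',
          use_test_spec=False,
          project='my-project',             # assumed
          location='us-central1',           # assumed
          artifact_registry='my-registry',  # assumed
          tag='latest',                     # assumed
      )
      # Downstream components read the resolved metadata, e.g.:
      machine_type = preprocess_task.outputs['metadata_machine_type']
      image_uri = preprocess_task.outputs['metadata_refined_image_uri']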

google_cloud_pipeline_components/_implementation/llm/validate_pipeline.py

@@ -79,8 +79,19 @@ def validate_pipeline(
   # ]
   # [ Check CMEK
   supported_pipeline_regions = {
+      'asia-northeast1',
+      'asia-northeast3',
+      'asia-southeast1',
+      'europe-west1',
+      'europe-west2',
+      'europe-west3',
       'europe-west4',
+      'europe-west9',
+      'northamerica-northeast1',
       'us-central1',
+      'us-east4',
+      'us-west1',
+      'us-west4',
   }
   if location not in supported_pipeline_regions:
     raise ValueError(
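
The CMEK gate itself is unchanged; the allowlist simply grows from two regions to thirteen. A standalone sketch of the same membership check (the real component constructs a more detailed error message):

    # Same region gate as validate_pipeline, with a simplified message.
    SUPPORTED_PIPELINE_REGIONS = {
        'asia-northeast1', 'asia-northeast3', 'asia-southeast1',
        'europe-west1', 'europe-west2', 'europe-west3', 'europe-west4',
        'europe-west9', 'northamerica-northeast1',
        'us-central1', 'us-east4', 'us-west1', 'us-west4',
    }

    def check_region(location: str) -> None:
        if location not in SUPPORTED_PIPELINE_REGIONS:
            raise ValueError(
                f'Unsupported pipeline region {location!r}; expected one of '
                f'{sorted(SUPPORTED_PIPELINE_REGIONS)}.'
            )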

google_cloud_pipeline_components/_implementation/model_evaluation/__init__.py

@@ -25,27 +25,18 @@ from google_cloud_pipeline_components._implementation.model_evaluation.feature_a
 from google_cloud_pipeline_components._implementation.model_evaluation.feature_attribution.feature_attribution_graph_component import feature_attribution_graph_component as FeatureAttributionGraphComponentOp
 from google_cloud_pipeline_components._implementation.model_evaluation.feature_extractor.component import feature_extractor_error_analysis as FeatureExtractorOp
 from google_cloud_pipeline_components._implementation.model_evaluation.import_evaluated_annotation.component import evaluated_annotation_import as ModelImportEvaluatedAnnotationOp
-from google_cloud_pipeline_components._implementation.model_evaluation.import_evaluation.component import model_evaluation_import as ModelImportEvaluationOp
 from google_cloud_pipeline_components._implementation.model_evaluation.llm_classification_postprocessor.component import llm_classification_predictions_postprocessor_graph_component as LLMEvaluationClassificationPredictionsPostprocessorOp
-from google_cloud_pipeline_components._implementation.model_evaluation.llm_embedding.evaluation_llm_embedding_pipeline import evaluation_llm_embedding_pipeline
 from google_cloud_pipeline_components._implementation.model_evaluation.llm_embedding_retrieval.component import llm_embedding_retrieval as LLMEmbeddingRetrievalOp
 from google_cloud_pipeline_components._implementation.model_evaluation.llm_evaluation.component import model_evaluation_text_generation as LLMEvaluationTextGenerationOp
 from google_cloud_pipeline_components._implementation.model_evaluation.llm_evaluation_preprocessor.component import llm_evaluation_dataset_preprocessor_graph_component as LLMEvaluationPreprocessorOp
 from google_cloud_pipeline_components._implementation.model_evaluation.llm_information_retrieval_preprocessor.component import llm_information_retrieval_preprocessor as LLMInformationRetrievalPreprocessorOp
 from google_cloud_pipeline_components._implementation.model_evaluation.llm_retrieval_metrics.component import llm_retrieval_metrics as LLMRetrievalMetricsOp
 from google_cloud_pipeline_components._implementation.model_evaluation.llm_safety_bias.component import llm_safety_bias_metrics as LLMSafetyBiasMetricsOp
-from google_cloud_pipeline_components._implementation.model_evaluation.llm_safety_bias.evaluation_llm_safety_bias_pipeline import evaluation_llm_safety_bias_pipeline
-from google_cloud_pipeline_components._implementation.model_evaluation.model_inference.component import model_inference_and_evaluation_component
-from google_cloud_pipeline_components._implementation.model_evaluation.model_inference.component import model_inference_component
 from google_cloud_pipeline_components._implementation.model_evaluation.model_name_preprocessor.component import model_name_preprocessor as ModelNamePreprocessorOp
 from google_cloud_pipeline_components._implementation.model_evaluation.target_field_data_remover.component import target_field_data_remover as TargetFieldDataRemoverOp
-from google_cloud_pipeline_components._implementation.model_evaluation.text2sql.evaluation_llm_text2sql_pipeline import evaluation_llm_text2sql_pipeline


 __all__ = [
-    'evaluation_llm_safety_bias_pipeline',
-    'evaluation_llm_embedding_pipeline',
-    'evaluation_llm_text2sql_pipeline',
     'evaluation_llm_endpoint_batch_predict_pipeline_graph_component',
     'ChunkingOp',
     'EvaluationDataSamplerOp',

@@ -63,9 +54,6 @@ __all__ = [
     'LLMSafetyBiasMetricsOp',
     'ModelEvaluationFeatureAttributionOp',
     'ModelImportEvaluatedAnnotationOp',
-    'ModelImportEvaluationOp',
     'ModelNamePreprocessorOp',
     'TargetFieldDataRemoverOp',
-    'model_inference_component',
-    'model_inference_and_evaluation_component',
 ]

google_cloud_pipeline_components/_implementation/model_evaluation/llm_embedding/evaluation_llm_embedding_pipeline.py

@@ -14,11 +14,12 @@
 """LLM embedding evaluation pipeline based on information retrieval (IR) task."""

 from typing import Dict, Optional, Union
+
 from google_cloud_pipeline_components._implementation.model_evaluation.endpoint_batch_predict.component import evaluation_llm_endpoint_batch_predict_pipeline_graph_component as LLMEndpointBatchPredictOp
-from google_cloud_pipeline_components._implementation.model_evaluation.import_evaluation.component import model_evaluation_import as ModelImportEvaluationOp
 from google_cloud_pipeline_components._implementation.model_evaluation.llm_embedding_retrieval.component import llm_embedding_retrieval as LLMEmbeddingRetrievalOp
 from google_cloud_pipeline_components._implementation.model_evaluation.llm_information_retrieval_preprocessor.component import llm_information_retrieval_preprocessor as LLMInformationRetrievalPreprocessorOp
 from google_cloud_pipeline_components._implementation.model_evaluation.llm_retrieval_metrics.component import llm_retrieval_metrics as LLMRetrievalMetricsOp
+from google_cloud_pipeline_components.preview.model_evaluation.model_evaluation_import_component import model_evaluation_import as ModelImportEvaluationOp
 from google_cloud_pipeline_components.types.artifact_types import VertexModel
 from google_cloud_pipeline_components.v1.batch_predict_job import ModelBatchPredictOp
 import kfp
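
This swap mirrors the `__init__.py` hunks above: `ModelImportEvaluationOp` has moved out of the private `_implementation` namespace into the new `preview/model_evaluation/model_evaluation_import_component.py`. User code that reached into the private path would need the same one-line migration:

    # Before (2.13.1), private _implementation path:
    # from google_cloud_pipeline_components._implementation.model_evaluation.import_evaluation.component import (
    #     model_evaluation_import as ModelImportEvaluationOp,
    # )

    # After (2.14.1), surfaced under the preview namespace:
    from google_cloud_pipeline_components.preview.model_evaluation.model_evaluation_import_component import (
        model_evaluation_import as ModelImportEvaluationOp,
    )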

google_cloud_pipeline_components/_placeholders.py

@@ -13,9 +13,38 @@
 # limitations under the License.
 """Placeholders for use in component authoring."""

-# prefer not using PIPELINE_TASK_ prefix like KFP does for reduced verbosity
+# prefer not using PIPELINE_TASK_ or PIPELINE_ prefix like KFP does for reduced
+# verbosity
 PROJECT_ID_PLACEHOLDER = "{{$.pipeline_google_cloud_project_id}}"
+"""A placeholder used to obtain Google Cloud project id where the pipeline
+executes. The placeholder value is set at pipeline runtime.
+"""
 LOCATION_PLACEHOLDER = "{{$.pipeline_google_cloud_location}}"
+"""A placeholder used to obtain Google Cloud location where the pipeline
+executes. The placeholder value is set at pipeline runtime.
+"""
+SERVICE_ACCOUNT_PLACEHOLDER = "{{$.pipeline_service_account}}"
+"""A placeholder used to obtain service account that is defined in [PipelineJob](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.pipelineJobs).
+If PipelineJob doesn't have a service account set, this placeholder will be resolved to default service account.
+The placeholder value is set at pipeline runtime.
+"""
+NETWORK_PLACEHOLDER = "{{$.pipeline_network}}"
+"""A placeholder used to obtain network that is defined in [PipelineJob](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.pipelineJobs).
+If PipelineJob doesn't have a network set, this placeholder will be empty. The
+placeholder value is set at pipeline runtime.
+"""
+PERSISTENT_RESOURCE_ID_PLACEHOLDER = "{{$.pipeline_persistent_resource_id}}"
+"""A placeholder used to obtain persistent resource id that is defined in
+PipelineJob [RuntimeConfig](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.pipelineJobs#PipelineJob.RuntimeConfig).
+If PipelineJob doesn't have a persistent resource id, this placeholder will be
+empty. The placeholder value is set at pipeline runtime.
+"""
+ENCRYPTION_SPEC_KMS_KEY_NAME_PLACEHOLDER = "{{$.pipeline_encryption_key_name}}"
+"""A placeholder used to obtain kmsKeyName that is defined in
+PipelineJob's [EncryptionSpec](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/EncryptionSpec).
+If PipelineJob doesn't have a encryption key name, this placeholder will be
+empty. The placeholder value is set at pipeline runtime.
+"""


 # omit placeholder type annotation to avoid dependency on KFP SDK internals
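
These placeholders are ordinary strings that the Vertex AI Pipelines backend substitutes at run time, which is how the package itself consumes them (e.g. `project: str = _placeholders.PROJECT_ID_PLACEHOLDER` in the reward-model graph above). A sketch of the same pattern in user-authored code; the component below is hypothetical:

    from kfp import dsl
    from google_cloud_pipeline_components import _placeholders

    @dsl.component
    def log_job_context(
        project: str = _placeholders.PROJECT_ID_PLACEHOLDER,
        service_account: str = _placeholders.SERVICE_ACCOUNT_PLACEHOLDER,
        kms_key: str = _placeholders.ENCRYPTION_SPEC_KMS_KEY_NAME_PLACEHOLDER,
    ) -> str:
        # At run time each placeholder resolves to the PipelineJob's actual
        # setting (empty string where unset, per the docstrings above).
        return f'project={project} sa={service_account} cmek={kms_key}'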

google_cloud_pipeline_components/preview/automl/forecasting/forecasting_ensemble.py

@@ -72,7 +72,7 @@ def automl_forecasting_ensemble(
   # fmt: on
   job_id = dsl.PIPELINE_JOB_ID_PLACEHOLDER
   task_id = dsl.PIPELINE_TASK_ID_PLACEHOLDER
-  image_uri = 'us-docker.pkg.dev/vertex-ai-restricted/automl-tabular/forecasting-training:
+  image_uri = 'us-docker.pkg.dev/vertex-ai-restricted/automl-tabular/forecasting-training:20240419_0625'
   display_name = f'automl-forecasting-ensemble-{job_id}-{task_id}'

   error_file_path = f'{root_dir}/{job_id}/{task_id}/error.pb'

google_cloud_pipeline_components/preview/automl/forecasting/forecasting_stage_1_tuner.py

@@ -99,14 +99,14 @@
           ' 1, "machine_spec": {"machine_type": "n1-standard-8"},'
           ' "container_spec": {"image_uri":"'
       ),
-      'us-docker.pkg.dev/vertex-ai-restricted/automl-tabular/forecasting-training:
+      'us-docker.pkg.dev/vertex-ai-restricted/automl-tabular/forecasting-training:20240419_0625',
       '", "args": ["forecasting_mp_l2l_stage_1_tuner',
       '", "--region=',
       location,
       '", "--transform_output_path=',
       transform_output.uri,
       '", "--training_docker_uri=',
-      'us-docker.pkg.dev/vertex-ai-restricted/automl-tabular/forecasting-training:
+      'us-docker.pkg.dev/vertex-ai-restricted/automl-tabular/forecasting-training:20240419_0625',
       '", "--reduce_search_space_mode=',
       reduce_search_space_mode,
       f'", "--component_id={dsl.PIPELINE_TASK_ID_PLACEHOLDER}',

google_cloud_pipeline_components/preview/automl/forecasting/forecasting_stage_2_tuner.py

@@ -97,14 +97,14 @@
           ' 1, "machine_spec": {"machine_type": "n1-standard-8"},'
           ' "container_spec": {"image_uri":"'
       ),
-      'us-docker.pkg.dev/vertex-ai-restricted/automl-tabular/forecasting-training:
+      'us-docker.pkg.dev/vertex-ai-restricted/automl-tabular/forecasting-training:20240419_0625',
       '", "args": ["forecasting_mp_l2l_stage_2_tuner',
       '", "--region=',
       location,
       '", "--transform_output_path=',
       transform_output.uri,
       '", "--training_docker_uri=',
-      'us-docker.pkg.dev/vertex-ai-restricted/automl-tabular/forecasting-training:
+      'us-docker.pkg.dev/vertex-ai-restricted/automl-tabular/forecasting-training:20240419_0625',
       f'", "--component_id={dsl.PIPELINE_TASK_ID_PLACEHOLDER}',
       '", "--training_base_dir=',
       root_dir,