PyPI - google-cloud-pipeline-components - Versions diffs - 2.10.0__py3-none-any.whl → 2.12.0__py3-none-any.whl - Mend

google-cloud-pipeline-components 2.10.0__py3-none-any.whl → 2.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of google-cloud-pipeline-components might be problematic. Click here for more details.

Files changed (43)

google_cloud_pipeline_components/_implementation/llm/private_text_importer.py CHANGED Viewed

@@ -37,10 +37,11 @@ def private_text_importer(
     imported_data_path: dsl.OutputPath(str),  # pytype: disable=invalid-annotation
     gcp_resources: dsl.OutputPath(str),  # pytype: disable=invalid-annotation
     instruction: str = '',
-    image_uri: str = utils.get_default_image_uri('text_importer_backup'),
+    image_uri: str = utils.get_default_image_uri('refined_cpu', ''),
     machine_type: str = 'e2-highmem-8',
     output_split_name: str = 'all',
     max_num_input_examples: Optional[int] = None,
+    encryption_spec_key_name: str = '',
 ) -> dsl.ContainerSpec:  # pylint: disable=g-doc-args
   """Import a text dataset.
@@ -59,6 +60,10 @@ def private_text_importer(
     output_split_name: The created seqio task has 1 split, its name is specified
       by this argument.
     max_num_input_examples: Maximum number of examples to import.
+    encryption_spec_key_name: Customer-managed encryption key. If this is set,
+      then all resources created by the CustomJob will be encrypted with the
+      provided encryption key. Note that this is not supported for TPU at the
+      moment.
   Returns:
     imported_data: Artifact representing the imported data and cached Tasks.
@@ -76,6 +81,7 @@ def private_text_importer(
           machine_type=machine_type,
           image_uri=_resolve_image(image_uri),
           args=[
+              '--app_name=text_importer',
               f'--input_text={input_text}',
               f'--inputs_field_name={inputs_field_name}',
               f'--targets_field_name={targets_field_name}',
@@ -88,6 +94,7 @@ def private_text_importer(
               f'--max_num_input_examples={max_num_input_examples}',
               '--executor_input={{$.json_escape[1]}}',
           ],
+          encryption_spec_key_name=encryption_spec_key_name,
       ),
       gcp_resources=gcp_resources,
   )

google_cloud_pipeline_components/_implementation/llm/reinforcement_learning_graph.py CHANGED Viewed

@@ -51,8 +51,10 @@ def pipeline(
     kl_coeff: float = 0.1,
     instruction: Optional[str] = None,
     project: str = _placeholders.PROJECT_ID_PLACEHOLDER,
+    accelerator_type: str = 'GPU',
     location: str = _placeholders.LOCATION_PLACEHOLDER,
-    tensorboard_resource_id: Optional[str] = None,
+    tensorboard_resource_id: str = '',
+    encryption_spec_key_name: str = '',
 ) -> PipelineOutput:
   # fmt: off
   """Trains a reward model.
@@ -72,8 +74,10 @@ def pipeline(
     kl_coeff: Coefficient for KL penalty. This regularizes the policy model and penalizes if it diverges from its initial distribution. If set to 0, the reference language model is not loaded into memory. Default value is 0.1.
     instruction: This field lets the model know what task it needs to perform. Base models have been trained over a large set of varied instructions. You can give a simple and intuitive description of the task and the model will follow it, e.g. "Classify this movie review as positive or negative" or "Translate this sentence to Danish". Do not specify this if your dataset already prepends the instruction to the inputs field.
     project: Project used to run custom jobs. If not specified the project used to run the pipeline will be used.
-    location: Location used to run custom jobs. If not specified the location used to run the pipeline will be used.
+    accelerator_type: One of 'TPU' or 'GPU'. If 'TPU' is specified, tuning components run in europe-west4. Otherwise tuning components run in us-central1 on GPUs. Default is 'GPU'.
+    location: Location used to run non-tuning components, i.e. components that do not require accelerators. If not specified the location used to run the pipeline will be used.
     tensorboard_resource_id: Optional tensorboard resource id in format `projects/{project_number}/locations/{location}/tensorboards/{tensorboard_id}`. If provided, tensorboard metrics will be uploaded to this location.
+    encryption_spec_key_name: Customer-managed encryption key. If this is set, then all resources created by the CustomJob will be encrypted with the provided encryption key. Note that this is not supported for TPU at the moment.
   Returns:
     output_model_path: Path to the trained model checkpoint.
@@ -82,17 +86,14 @@ def pipeline(
   # fmt: on
   prompt_column = 'input_text'
   machine_spec = function_based.resolve_machine_spec(
-      location=location, use_test_spec=env.get_use_test_machine_spec()
+      accelerator_type=accelerator_type,
+      use_test_spec=env.get_use_test_machine_spec(),
   ).set_display_name('Resolve Machine Spec')
   reference_model_metadata = function_based.resolve_reference_model_metadata(
       large_model_reference=large_model_reference,
   ).set_display_name('Resolve Model Metadata')
-  prompt_dataset_image_uri = function_based.resolve_private_image_uri(
-      image_name='text_importer'
-  ).set_display_name('Resolve Prompt Dataset Image URI')
   processed_dataset = preprocess_chat_dataset.preprocess_chat_dataset(
       large_model_reference=large_model_reference,
       input_dataset_uri=prompt_dataset,
@@ -111,16 +112,14 @@ def pipeline(
           large_model_reference=reference_model_metadata.outputs[
               'large_model_reference'
           ],
-          image_uri=prompt_dataset_image_uri.output,
           instruction=instruction,
+          encryption_spec_key_name=encryption_spec_key_name,
       )
       .set_display_name('Import Prompt Dataset')
       .set_caching_options(False)
   )
-  rl_image_uri = function_based.resolve_private_image_uri(
-      image_name='reinforcer',
+  rl_image_uri = function_based.resolve_private_refined_image_uri(
       accelerator_type=machine_spec.outputs['accelerator_type'],
-      accelerator_count=machine_spec.outputs['accelerator_count'],
   ).set_display_name('Resolve Reinforcer Image URI')
   num_microbatches = function_based.resolve_num_microbatches(
       large_model_reference=reference_model_metadata.outputs[
@@ -130,7 +129,7 @@ def pipeline(
   rl_model = (
       reinforcer.reinforcer(
           project=project,
-          location=location,
+          location=machine_spec.outputs['tuning_location'],
           input_reference_model_path=reference_model_metadata.outputs[
               'reference_model_path'
           ],
@@ -159,26 +158,13 @@ def pipeline(
           lora_dim=lora_dim,
           reward_lora_dim=reward_lora_dim,
           num_microbatches=num_microbatches.output,
+          encryption_spec_key_name=encryption_spec_key_name,
+          tensorboard_resource_id=tensorboard_resource_id,
       )
       .set_display_name('Reinforcer')
       .set_caching_options(False)
   )
-  has_tensorboard_id = function_based.value_exists(
-      value=tensorboard_resource_id
-  ).set_display_name('Resolve Tensorboard Resource ID')
-  with kfp.dsl.Condition(  # pytype: disable=wrong-arg-types
-      has_tensorboard_id.output == True,  # pylint: disable=singleton-comparison, g-explicit-bool-comparison
-      name='Upload Reinforcement Learning Tensorboard Metrics',
-  ):
-    _ = upload_tensorboard_metrics.upload_tensorboard_metrics(
-        tensorboard_resource_id=tensorboard_resource_id,
-        metrics_directory=rl_model.outputs['tensorboard_metrics'],
-        experiment_name=(
-            'rl-model-tuner-'
-            f'{kfp.dsl.PIPELINE_JOB_ID_PLACEHOLDER}-'
-            f'{kfp.dsl.PIPELINE_TASK_ID_PLACEHOLDER}'
-        ),
-    ).set_display_name('Reinforcement Learning Tensorboard Metrics Uploader')
   return PipelineOutput(
       output_model_path=rl_model.outputs['output_model_path'],
       output_adapter_path=rl_model.outputs['output_adapter_path'],

google_cloud_pipeline_components/_implementation/llm/reinforcer.py CHANGED Viewed

@@ -47,6 +47,8 @@ def reinforcer(
     lora_dim: int = 0,
     reward_lora_dim: int = 4,
     num_microbatches: int = 0,
+    encryption_spec_key_name: str = '',
+    tensorboard_resource_id: str = '',
 ) -> kfp.dsl.ContainerSpec:  # pylint: disable=g-doc-args
   """Trains a model using reinforcement learning.
@@ -86,6 +88,13 @@ def reinforcer(
     num_microbatches: Number of microbatches to break the total batch size into
       during training. If <= 1, the model is trained on the full batch size
       directly.
+    encryption_spec_key_name: Customer-managed encryption key. If this is set,
+      then all resources created by the CustomJob will be encrypted with the
+      provided encryption key. Note that this is not supported for TPU at the
+      moment.
+    tensorboard_resource_id: Optional tensorboard resource id. Format:
+      `projects/{project_number}/locations/{location}/tensorboards/{tensorboard_id}`.
+      If provided, tensorboard metrics will be uploaded to this location.
   Returns:
     output_model_path: Path to the trained model checkpoint.
@@ -105,6 +114,7 @@ def reinforcer(
           machine_type=machine_type,
           image_uri=image_uri,
           args=[
+              '--app_name=reinforcer',
               f'--input_reference_model_path={input_reference_model_path}',
               f'--input_reward_model_path={input_reward_model_path}',
               f'--input_reward_adapter_path={input_reward_adapter_path}',
@@ -126,6 +136,9 @@ def reinforcer(
               f'--reward_lora_dim={reward_lora_dim}',
               f'--num_microbatches={num_microbatches}',
           ],
+          encryption_spec_key_name=encryption_spec_key_name,
+          base_output_directory=tensorboard_metrics.uri,
+          tensorboard=tensorboard_resource_id,
       ),
       gcp_resources=gcp_resources,
   )

google_cloud_pipeline_components/_implementation/llm/reward_model_graph.py CHANGED Viewed

@@ -45,10 +45,13 @@ def pipeline(
     lora_dim: int = 4,
     reward_model_learning_rate_multiplier: float = 1.0,
     reward_model_train_steps: int = 1000,
+    eval_dataset: Optional[str] = None,
     instruction: Optional[str] = None,
     project: str = _placeholders.PROJECT_ID_PLACEHOLDER,
+    accelerator_type: str = 'GPU',
     location: str = _placeholders.LOCATION_PLACEHOLDER,
-    tensorboard_resource_id: Optional[str] = None,
+    tensorboard_resource_id: str = '',
+    encryption_spec_key_name: str = '',
 ) -> PipelineOutput:
   # fmt: off
   """Trains a reward model.
@@ -64,8 +67,10 @@ def pipeline(
     reward_model_train_steps: Number of steps to use when training a reward model. Default value is 1000.
     instruction: This field lets the model know what task it needs to perform. Base models have been trained over a large set of varied instructions. You can give a simple and intuitive description of the task and the model will follow it, e.g. "Classify this movie review as positive or negative" or "Translate this sentence to Danish". Do not specify this if your dataset already prepends the instruction to the inputs field.
     project: Project used to run custom jobs. If not specified the project used to run the pipeline will be used.
-    location: Location used to run custom jobs. If not specified the location used to run the pipeline will be used.
+    accelerator_type: One of 'TPU' or 'GPU'. If 'TPU' is specified, tuning components run in europe-west4. Otherwise tuning components run in us-central1 on GPUs. Default is 'GPU'.
+    location: Location used to run non-tuning components, i.e. components that do not require accelerators. If not specified the location used to run the pipeline will be used.
     tensorboard_resource_id: Optional tensorboard resource id in format `projects/{project_number}/locations/{location}/tensorboards/{tensorboard_id}`. If provided, tensorboard metrics will be uploaded to this location.
+    encryption_spec_key_name: Customer-managed encryption key. If this is set, then all resources created by the CustomJob will be encrypted with the provided encryption key. Note that this is not supported for TPU at the moment.
   Returns:
     reward_model_base_path: Path to the base model used by the reward model.
@@ -77,7 +82,8 @@ def pipeline(
   candidate_columns = ['candidate_0', 'candidate_1']
   choice_column = 'choice'
   machine_spec = function_based.resolve_machine_spec(
-      location=location, use_test_spec=env.get_use_test_machine_spec()
+      accelerator_type=accelerator_type,
+      use_test_spec=env.get_use_test_machine_spec(),
   ).set_display_name('Resolve Machine Spec')
   reference_model_metadata = function_based.resolve_reference_model_metadata(
@@ -93,9 +99,6 @@ def pipeline(
       ).set_display_name('Preprocess Prompt Dataset')
   )
-  preference_dataset_image_uri = function_based.resolve_private_image_uri(
-      image_name='text_comparison_importer'
-  ).set_display_name('Resolve Preference Dataset Image URI')
   comma_separated_candidates_field_names = (
       function_based.convert_to_delimited_string(items=candidate_columns)
   )
@@ -113,17 +116,34 @@ def pipeline(
           large_model_reference=reference_model_metadata.outputs[
               'reward_model_reference'
           ],
-          image_uri=preference_dataset_image_uri.output,
           instruction=instruction,
+          encryption_spec_key_name=encryption_spec_key_name,
       )
       .set_display_name('Import Preference Dataset')
       .set_caching_options(False)
   )
-  reward_model_image_uri = function_based.resolve_private_image_uri(
-      image_name='reward_model',
+  preference_eval_dataset_importer = (
+      private_text_comparison_importer.private_text_comparison_importer(
+          project=project,
+          location=location,
+          input_text=eval_dataset,
+          inputs_field_name=prompt_column,
+          comma_separated_candidates_field_names=comma_separated_candidates_field_names.output,
+          choice_field_name=choice_column,
+          split=env.TRAIN_SPLIT,
+          large_model_reference=reference_model_metadata.outputs[
+              'reward_model_reference'
+          ],
+          instruction=instruction,
+          encryption_spec_key_name=encryption_spec_key_name,
+      )
+      .set_display_name('Import Preference Eval Dataset')
+      .set_caching_options(False)
+  )
+  reward_model_image_uri = function_based.resolve_private_refined_image_uri(
       accelerator_type=machine_spec.outputs['accelerator_type'],
-      accelerator_count=machine_spec.outputs['accelerator_count'],
   ).set_display_name('Resolve Reward Model Image URI')
   num_microbatches = function_based.resolve_num_microbatches(
       large_model_reference=reference_model_metadata.outputs[
@@ -133,13 +153,16 @@ def pipeline(
   reward_model = (
       reward_model_trainer.reward_model_trainer(
           project=project,
-          location=location,
+          location=machine_spec.outputs['tuning_location'],
           input_model_path=reference_model_metadata.outputs[
               'reward_model_path'
           ],
           input_dataset_path=preference_dataset_importer.outputs[
               'output_dataset_path'
           ],
+          eval_dataset_path=preference_eval_dataset_importer.outputs[
+              'output_dataset_path'
+          ],
           train_steps=reward_model_train_steps,
           accelerator_type=machine_spec.outputs['accelerator_type'],
           accelerator_count=machine_spec.outputs['accelerator_count'],
@@ -154,27 +177,13 @@ def pipeline(
           learning_rate_multiplier=reward_model_learning_rate_multiplier,
           lora_dim=lora_dim,
           num_microbatches=num_microbatches.output,
+          encryption_spec_key_name=encryption_spec_key_name,
+          tensorboard_resource_id=tensorboard_resource_id,
       )
       .set_display_name('Reward Model Trainer')
       .set_caching_options(False)
   )
-  has_tensorboard_id = function_based.value_exists(
-      value=tensorboard_resource_id
-  ).set_display_name('Resolve TensorBoard Resource ID')
-  with kfp.dsl.Condition(  # pytype: disable=wrong-arg-types
-      has_tensorboard_id.output == True,  # pylint: disable=singleton-comparison, g-explicit-bool-comparison
-      name='Upload Reward Model TensorBoard Metrics',
-  ):
-    _ = upload_tensorboard_metrics.upload_tensorboard_metrics(
-        tensorboard_resource_id=tensorboard_resource_id,
-        metrics_directory=reward_model.outputs['tensorboard_metrics'],
-        experiment_name=(
-            'reward-model-tuner-'
-            f'{kfp.dsl.PIPELINE_JOB_ID_PLACEHOLDER}-'
-            f'{kfp.dsl.PIPELINE_TASK_ID_PLACEHOLDER}'
-        ),
-    ).set_display_name('Reward Model TensorBoard Metrics Uploader')
   return PipelineOutput(
       reward_model_base_path=reference_model_metadata.outputs[
           'reward_model_path'

google_cloud_pipeline_components/_implementation/llm/reward_model_trainer.py CHANGED Viewed

@@ -35,11 +35,14 @@ def reward_model_trainer(
     output_adapter_path: kfp.dsl.OutputPath(str),  # pytype: disable=invalid-annotation
     tensorboard_metrics: kfp.dsl.Output[kfp.dsl.Artifact],  # pytype: disable=unsupported-operands
     gcp_resources: kfp.dsl.OutputPath(str),  # pytype: disable=invalid-annotation
+    eval_dataset_path: str = '',
     train_split: str = 'train',
     batch_size: int = 64,
     learning_rate_multiplier: float = 1.0,
     lora_dim: int = 4,
     num_microbatches: int = 0,
+    encryption_spec_key_name: str = '',
+    tensorboard_resource_id: str = '',
 ) -> kfp.dsl.ContainerSpec:  # pylint: disable=g-doc-args
   """Trains a reward model.
@@ -48,6 +51,8 @@ def reward_model_trainer(
     location: Location used to run the job.
     input_model_path: Path to the base model to fine tune.
     input_dataset_path: Path to dataset to use to train a reward model.
+    eval_dataset_path: Path to eval dataset to use during the reward model
+      training.
     train_steps: Number of training steps. These are the number of steps on top
       of any steps used to train the base model.
     accelerator_type: Type of TPU accelerator. Can be either TPU_V2 or TPU_V3.
@@ -68,6 +73,13 @@ def reward_model_trainer(
     num_microbatches: Number of microbatches to break the total batch size into
       during training. If <= 1, the model is trained on the full batch size
       directly.
+    encryption_spec_key_name: Customer-managed encryption key. If this is set,
+      then all resources created by the CustomJob will be encrypted with the
+      provided encryption key. Note that this is not supported for TPU at the
+      moment.
+    tensorboard_resource_id: Optional tensorboard resource id. Format:
+      `projects/{project_number}/locations/{location}/tensorboards/{tensorboard_id}`.
+      If provided, tensorboard metrics will be uploaded to this location.
   Returns:
     output_adapter_path: Trained reward LoRA adapter.
@@ -85,9 +97,11 @@ def reward_model_trainer(
           machine_type=machine_type,
           image_uri=image_uri,
           args=[
+              '--app_name=reward_model_trainer',
               f'--train_steps={train_steps}',
               f'--input_model_path={input_model_path}',
               f'--input_dataset_path={input_dataset_path}',
+              f'--eval_dataset_path={eval_dataset_path}',
               f'--output_adapter_path={output_adapter_path}',
               f'--tensorboard_metrics_path={tensorboard_metrics.path}',
               f'--large_model_reference={large_model_reference}',
@@ -99,6 +113,9 @@ def reward_model_trainer(
               f'--lora_dim={lora_dim}',
               f'--num_microbatches={num_microbatches}',
           ],
+          encryption_spec_key_name=encryption_spec_key_name,
+          base_output_directory=tensorboard_metrics.uri,
+          tensorboard=tensorboard_resource_id,
       ),
       gcp_resources=gcp_resources,
   )

google_cloud_pipeline_components/_implementation/llm/rlhf_preprocessor.py ADDED Viewed

@@ -0,0 +1,60 @@
+# Copyright 2024 The Kubeflow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Component that preprocesses inputs for Reinforcement Learning from Human Feedback (RLHF)."""
+import os
+from google_cloud_pipeline_components import _placeholders
+from google_cloud_pipeline_components import utils as gcpc_utils
+from google_cloud_pipeline_components._implementation.llm import utils
+from kfp import dsl
+@dsl.container_component
+def rlhf_preprocessor(
+    gcp_resources: dsl.OutputPath(str),  # pytype: disable=invalid-annotation
+    has_tensorboard_id: dsl.OutputPath(bool),  # pytype: disable=invalid-annotation
+    has_inference_dataset: dsl.OutputPath(bool),  # pytype: disable=invalid-annotation
+    evaluation_dataset: str = '',
+    tensorboard_resource_id: str = '',
+    image_uri: str = utils.get_default_image_uri('refined_cpu', ''),
+) -> dsl.ContainerSpec:  # pylint: disable=g-doc-args
+  """Preprocess RLHF pipeline inputs.
+  Args:
+    evaluation_dataset: Path to evaluation data.
+    tensorboard_resource_id: TensorBoard resource id.
+  Returns:
+    gcp_resources: GCP resources that can be used to track the custom job.
+    has_tensorboard_id: Whether a tensorboard id is provided.
+    has_inference_dataset: Whether inference data are provided.
+  """
+  return gcpc_utils.build_serverless_customjob_container_spec(
+      project=_placeholders.PROJECT_ID_PLACEHOLDER,
+      location=_placeholders.LOCATION_PLACEHOLDER,
+      custom_job_payload=utils.build_payload(
+          display_name='rlhf_preprocessor',
+          machine_type='n1-standard-4',
+          image_uri=image_uri,
+          args=[
+              '--app_name=rlhf_preprocessor',
+              f'--evaluation_dataset={evaluation_dataset}',
+              f'--tensorboard_resource_id={tensorboard_resource_id}',
+              f'--has_tensorboard_id_path={has_tensorboard_id}',
+              f'--has_inference_dataset_path={has_inference_dataset}',
+          ],
+      ),
+      gcp_resources=gcp_resources,
+  )

google_cloud_pipeline_components/_implementation/llm/supervised_fine_tuner.py CHANGED Viewed

@@ -86,6 +86,7 @@ def supervised_fine_tuner(
           machine_type=machine_type,
           image_uri=image_uri,
           args=[
+              '--app_name=supervised_fine_tuner',
               f'--input_model_path={input_model_path}',
               f'--train_steps={train_steps}',
               f'--inputs_sequence_length={inputs_sequence_length}',

google_cloud_pipeline_components/_implementation/llm/utils.py CHANGED Viewed

@@ -30,6 +30,8 @@ def build_payload(
     encryption_spec_key_name: str = '',
     labels: Optional[Dict[str, str]] = None,
     scheduling: Optional[Dict[str, Any]] = None,
+    base_output_directory: Optional[str] = None,
+    tensorboard: Optional[str] = None,
 ) -> Dict[str, Any]:
   """Generates payload for a custom training job.
@@ -50,6 +52,11 @@ def build_payload(
       moment.
     labels: The labels with user-defined metadata to organize CustomJobs.
     scheduling: Scheduling options for a CustomJob.
+    base_output_directory: Cloud Storage location to store the output of this
+      CustomJob
+    tensorboard: The name of a Vertex AI TensorBoard resource to which this
+      CustomJob will upload TensorBoard logs. Format:
+      ``projects/{project}/locations/{location}/tensorboards/{tensorboard}``
   Returns:
     Custom job payload.
@@ -96,6 +103,14 @@ def build_payload(
   if scheduling:
     payload['job_spec']['scheduling'] = scheduling
+  if base_output_directory:
+    payload['job_spec']['base_output_directory'] = {
+        'output_uri_prefix': base_output_directory
+    }
+  if tensorboard:
+    payload['job_spec']['tensorboard'] = tensorboard
   return payload
@@ -109,7 +124,10 @@ def get_temp_location() -> str:
   )
-def get_default_image_uri(image_name: str) -> str:
+def get_default_image_uri(
+    image_name: str,
+    image_name_prefix: Optional[str] = None,
+) -> str:
   """Gets the default image URI for a given image.
   The URI is resolved using environment variables that define the artifact
@@ -119,6 +137,8 @@ def get_default_image_uri(image_name: str) -> str:
   Args:
     image_name: Name of the image to resolve.
+    image_name_prefix: prefix to add to the image name when constructing the
+      URI. If `None`, `env.PRIVATE_IMAGE_NAME_PREFIX'` is used.
   Returns:
     URI of the image.
@@ -128,9 +148,12 @@ def get_default_image_uri(image_name: str) -> str:
   else:
     image_tag = env.get_private_image_tag()
+  if image_name_prefix is None:
+    image_name_prefix = env.PRIVATE_IMAGE_NAME_PREFIX
   return '/'.join([
       f'{env.PRIVATE_ARTIFACT_REGISTRY_LOCATION}-docker.pkg.dev',
       env.PRIVATE_ARTIFACT_REGISTRY_PROJECT,
       env.PRIVATE_ARTIFACT_REGISTRY,
-      f'{env.PRIVATE_IMAGE_NAME_PREFIX}{image_name}:{image_tag}',
+      f'{image_name_prefix}{image_name}:{image_tag}',
   ])

google_cloud_pipeline_components/_implementation/llm/validate_pipeline.py ADDED Viewed

@@ -0,0 +1,113 @@
+# Copyright 2024 The Kubeflow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""KFP Component for validate_pipeline."""
+from typing import NamedTuple, Optional
+from google_cloud_pipeline_components import _image
+from google_cloud_pipeline_components import _placeholders
+from kfp import dsl
+@dsl.component(base_image=_image.GCPC_IMAGE_TAG, install_kfp_package=False)
+def validate_pipeline(
+    location: str,
+    encryption_spec_key_name: str = '',
+    accelerator_type: str = '',
+    eval_dataset: Optional[str] = None,
+) -> NamedTuple('PreprocessedInputs', reward_model_eval_dataset=str):
+  # fmt: off
+  """Validates and preprocesses RLHF pipeline parameters.
+  Args:
+    location: Location used to run non-tuning components, i.e. components
+      that do not require accelerators. If not specified the location used
+      to run the pipeline will be used.
+    encryption_spec_key_name: If set, CMEK support will be validated.
+    accelerator_type: One of 'TPU' or 'GPU'. If 'TPU' is specified, tuning
+      components run in europe-west4. Otherwise tuning components run in
+      us-central1 on GPUs. Default is 'GPU'.
+    eval_dataset: Optional Cloud storage path to an evaluation dataset. The
+      format should match that of the preference dataset.
+  """
+  # fmt: on
+  # pylint: disable=g-import-not-at-top,import-outside-toplevel
+  import json
+  import logging
+  import re
+  import sys
+  import glob
+  # pylint: enable=g-import-not-at-top,import-outside-toplevel
+  outputs = NamedTuple(
+      'PreprocessedInputs',
+      reward_model_eval_dataset=str,
+  )
+  try:
+    # [ Set eval_dataset
+    eval_dataset = eval_dataset or ''
+    gcs_eval_dataset_uri = re.sub('^gs://', '/gcs/', eval_dataset)
+    files_in_folder = glob.glob(gcs_eval_dataset_uri)
+    if not files_in_folder:
+      eval_dataset = ''
+    else:
+      first_file = files_in_folder[0]
+      required_fields = ('candidate_0', 'candidate_1', 'choice')
+      oneof_fields = {'input_text', 'messages'}
+      max_lines_to_check = 100
+      with open(first_file, 'r') as inputs:
+        for i, line in enumerate(inputs):
+          json_data = json.loads(line)
+          is_valid_preference_data = all(
+              field in json_data for field in required_fields
+          ) and any(oneof_field in json_data for oneof_field in oneof_fields)
+          if not is_valid_preference_data:
+            eval_dataset = ''
+          if not eval_dataset or i >= max_lines_to_check:
+            break
+    # ]
+    # [ Check CMEK
+    supported_pipeline_regions = {
+        'europe-west4',
+        'us-central1',
+    }
+    if location not in supported_pipeline_regions:
+      raise ValueError(
+          f'Unsupported pipeline region: {location}. Must be one of'
+          f' {supported_pipeline_regions}.'
+      )
+    valid_cmek_accelerator_types = {
+        'GPU',
+        'CPU',  # Only used for testing.
+    }
+    valid_cmek_config = (
+        location == 'us-central1'
+        and accelerator_type in valid_cmek_accelerator_types
+    )
+    if encryption_spec_key_name and not valid_cmek_config:
+      raise ValueError(
+          'encryption_spec_key_name (CMEK) is only supported for GPU training'
+          ' in us-central1. Please either unset encryption_spec_key_name or'
+          ' create your pipeline in us-central1 to use GPU instead.'
+      )
+    # CMEK ]
+    return outputs(reward_model_eval_dataset=eval_dataset)
+  except Exception as e:  # pylint: disable=broad-exception-caught
+    if isinstance(e, ValueError):
+      raise
+    logging.exception(str(e))
+    sys.exit(13)

google_cloud_pipeline_components/_implementation/model_evaluation/__init__.py CHANGED Viewed

@@ -37,6 +37,7 @@ from google_cloud_pipeline_components._implementation.model_evaluation.llm_safet
 from google_cloud_pipeline_components._implementation.model_evaluation.llm_safety_bias.evaluation_llm_safety_bias_pipeline import evaluation_llm_safety_bias_pipeline
 from google_cloud_pipeline_components._implementation.model_evaluation.model_inference.component import model_inference_and_evaluation_component
 from google_cloud_pipeline_components._implementation.model_evaluation.model_inference.component import model_inference_component
+from google_cloud_pipeline_components._implementation.model_evaluation.model_name_preprocessor.component import model_name_preprocessor as ModelNamePreprocessorOp
 from google_cloud_pipeline_components._implementation.model_evaluation.target_field_data_remover.component import target_field_data_remover as TargetFieldDataRemoverOp
 from google_cloud_pipeline_components._implementation.model_evaluation.text2sql.evaluation_llm_text2sql_pipeline import evaluation_llm_text2sql_pipeline
@@ -63,6 +64,7 @@ __all__ = [
     'ModelEvaluationFeatureAttributionOp',
     'ModelImportEvaluatedAnnotationOp',
     'ModelImportEvaluationOp',
+    'ModelNamePreprocessorOp',
     'TargetFieldDataRemoverOp',
     'model_inference_component',
     'model_inference_and_evaluation_component',

google_cloud_pipeline_components/_implementation/model_evaluation/endpoint_batch_predict/component.py CHANGED Viewed

@@ -24,7 +24,7 @@ from kfp.dsl import Output
 from kfp.dsl import OutputPath
 from kfp.dsl import PIPELINE_ROOT_PLACEHOLDER
-_IMAGE_URI = 'us-docker.pkg.dev/vertex-evaluation/public/llm:wjess-fishfooding'
+_IMAGE_URI = 'us-docker.pkg.dev/vertex-evaluation/public/llm:v0.5'
 @dsl.component(base_image=version.LLM_EVAL_IMAGE_TAG)

google-cloud-pipeline-components 2.10.0__py3-none-any.whl → 2.12.0__py3-none-any.whl

Potentially problematic release.

google-cloud-pipeline-components 2.10.0__py3-none-any.whl → 2.12.0__py3-none-any.whl