snowflake-ml-python 1.7.4__py3-none-any.whl → 1.7.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. snowflake/ml/_internal/env_utils.py +64 -21
  2. snowflake/ml/_internal/relax_version_strategy.py +16 -0
  3. snowflake/ml/_internal/telemetry.py +21 -0
  4. snowflake/ml/data/_internal/arrow_ingestor.py +1 -1
  5. snowflake/ml/feature_store/feature_store.py +18 -0
  6. snowflake/ml/feature_store/feature_view.py +46 -1
  7. snowflake/ml/jobs/_utils/constants.py +7 -1
  8. snowflake/ml/jobs/_utils/payload_utils.py +139 -53
  9. snowflake/ml/jobs/_utils/spec_utils.py +5 -7
  10. snowflake/ml/jobs/decorators.py +5 -25
  11. snowflake/ml/jobs/job.py +4 -4
  12. snowflake/ml/model/_packager/model_env/model_env.py +45 -28
  13. snowflake/ml/model/_packager/model_handlers/_utils.py +8 -4
  14. snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py +16 -0
  15. snowflake/ml/model/_packager/model_handlers/keras.py +230 -0
  16. snowflake/ml/model/_packager/model_handlers/pytorch.py +1 -0
  17. snowflake/ml/model/_packager/model_handlers/sklearn.py +28 -3
  18. snowflake/ml/model/_packager/model_handlers/snowmlmodel.py +74 -21
  19. snowflake/ml/model/_packager/model_handlers/tensorflow.py +27 -49
  20. snowflake/ml/model/_packager/model_handlers_migrator/tensorflow_migrator_2023_12_01.py +48 -0
  21. snowflake/ml/model/_packager/model_meta/model_meta.py +1 -1
  22. snowflake/ml/model/_packager/model_meta/model_meta_schema.py +3 -0
  23. snowflake/ml/model/_packager/model_runtime/_snowml_inference_alternative_requirements.py +1 -1
  24. snowflake/ml/model/_packager/model_runtime/model_runtime.py +4 -1
  25. snowflake/ml/model/_packager/model_task/model_task_utils.py +5 -1
  26. snowflake/ml/model/_signatures/core.py +2 -2
  27. snowflake/ml/model/_signatures/numpy_handler.py +5 -5
  28. snowflake/ml/model/_signatures/pandas_handler.py +9 -7
  29. snowflake/ml/model/_signatures/pytorch_handler.py +1 -1
  30. snowflake/ml/model/model_signature.py +8 -0
  31. snowflake/ml/model/type_hints.py +15 -0
  32. snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +14 -1
  33. snowflake/ml/modeling/pipeline/pipeline.py +18 -1
  34. snowflake/ml/modeling/preprocessing/polynomial_features.py +2 -2
  35. snowflake/ml/registry/registry.py +34 -4
  36. snowflake/ml/version.py +1 -1
  37. {snowflake_ml_python-1.7.4.dist-info → snowflake_ml_python-1.7.5.dist-info}/METADATA +58 -25
  38. {snowflake_ml_python-1.7.4.dist-info → snowflake_ml_python-1.7.5.dist-info}/RECORD +41 -38
  39. {snowflake_ml_python-1.7.4.dist-info → snowflake_ml_python-1.7.5.dist-info}/WHEEL +1 -1
  40. {snowflake_ml_python-1.7.4.dist-info → snowflake_ml_python-1.7.5.dist-info}/LICENSE.txt +0 -0
  41. {snowflake_ml_python-1.7.4.dist-info → snowflake_ml_python-1.7.5.dist-info}/top_level.txt +0 -0
snowflake/ml/jobs/_utils/payload_utils.py CHANGED
@@ -1,5 +1,8 @@
+ import functools
  import inspect
  import io
+ import itertools
+ import pickle
  import sys
  import textwrap
  from pathlib import Path, PurePath
@@ -19,9 +22,11 @@ import cloudpickle as cp

  from snowflake import snowpark
  from snowflake.ml.jobs._utils import constants, types
+ from snowflake.snowpark import exceptions as sp_exceptions
  from snowflake.snowpark._internal import code_generation

  _SUPPORTED_ARG_TYPES = {str, int, float}
+ _SUPPORTED_ENTRYPOINT_EXTENSIONS = {".py"}
  _STARTUP_SCRIPT_PATH = PurePath("startup.sh")
  _STARTUP_SCRIPT_CODE = textwrap.dedent(
      f"""
@@ -69,12 +74,11 @@ _STARTUP_SCRIPT_CODE = textwrap.dedent(
  shm_size=$(df --output=size --block-size=1 /dev/shm | tail -n 1)

  # Configure IP address and logging directory
- eth0Ip=$(ifconfig eth0 | sed -En -e 's/.*inet ([0-9.]+).*/\1/p')
+ eth0Ip=$(ifconfig eth0 2>/dev/null | sed -En -e 's/.*inet ([0-9.]+).*/\1/p')
  log_dir="/tmp/ray"

- # Check if eth0Ip is empty and set default if necessary
- if [ -z "$eth0Ip" ]; then
- # This should never happen, but just in case ethOIp is not set, we should default to localhost
+ # Check if eth0Ip is a valid IP address and fall back to default if necessary
+ if [[ ! $eth0Ip =~ ^[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+$ ]]; then
  eth0Ip="127.0.0.1"
  fi

@@ -120,6 +124,34 @@ _STARTUP_SCRIPT_CODE = textwrap.dedent(
  ).strip()


+ def _resolve_entrypoint(parent: Path, entrypoint: Optional[Path]) -> Path:
+     parent = parent.absolute()
+     if entrypoint is None:
+         if parent.is_file():
+             # Infer entrypoint from source
+             entrypoint = parent
+         else:
+             raise ValueError("entrypoint must be provided when source is a directory")
+     elif entrypoint.is_absolute():
+         # Absolute path - validate it's a subpath of source dir
+         if not entrypoint.is_relative_to(parent):
+             raise ValueError(f"Entrypoint must be a subpath of {parent}, got: {entrypoint})")
+     else:
+         # Relative path
+         if (abs_entrypoint := entrypoint.absolute()).is_relative_to(parent) and abs_entrypoint.is_file():
+             # Relative to working dir iff path is relative to source dir and exists
+             entrypoint = abs_entrypoint
+         else:
+             # Relative to source dir
+             entrypoint = parent.joinpath(entrypoint)
+     if not entrypoint.is_file():
+         raise FileNotFoundError(
+             "Entrypoint not found. Ensure the entrypoint is a valid file and is under"
+             f" the source directory (source={parent}, entrypoint={entrypoint})"
+         )
+     return entrypoint
+
+
  class JobPayload:
      def __init__(
          self,
@@ -138,23 +170,23 @@ class JobPayload:
              # since we will generate the file from the serialized callable
              pass
          elif isinstance(self.source, Path):
-             # Validate self.source and self.entrypoint for files
-             if not self.source.exists():
-                 raise FileNotFoundError(f"{self.source} does not exist")
-             if self.entrypoint is None:
-                 if self.source.is_file():
-                     self.entrypoint = self.source
-                 else:
-                     raise ValueError("entrypoint must be provided when source is a directory")
-             if not self.entrypoint.is_file():
-                 # Check if self.entrypoint is a valid relative path
-                 self.entrypoint = self.source.joinpath(self.entrypoint)
-                 if not self.entrypoint.is_file():
-                     raise FileNotFoundError(f"File {self.entrypoint} does not exist")
-             if not self.entrypoint.is_relative_to(self.source):
-                 raise ValueError(f"{self.entrypoint} must be a subpath of {self.source}")
-             if self.entrypoint.suffix != ".py":
-                 raise NotImplementedError("Only Python entrypoints are supported currently")
+             # Validate source
+             source = self.source
+             if not source.exists():
+                 raise FileNotFoundError(f"{source} does not exist")
+             source = source.absolute()
+
+             # Validate entrypoint
+             entrypoint = _resolve_entrypoint(source, self.entrypoint)
+             if entrypoint.suffix not in _SUPPORTED_ENTRYPOINT_EXTENSIONS:
+                 raise ValueError(
+                     "Unsupported entrypoint type:"
+                     f" supported={','.join(_SUPPORTED_ENTRYPOINT_EXTENSIONS)} got={entrypoint.suffix}"
+                 )
+
+             # Update fields with normalized values
+             self.source = source
+             self.entrypoint = entrypoint
          else:
              raise ValueError("Unsupported source type. Source must be a file, directory, or callable.")

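The net effect of the new entrypoint handling is that relative entrypoints are resolved against the source directory and anything outside it is rejected. A minimal sketch, assuming the internal helper is importable from snowflake.ml.jobs._utils.payload_utils as shown in the diff above:

import tempfile
from pathlib import Path

from snowflake.ml.jobs._utils.payload_utils import _resolve_entrypoint  # internal helper, per the diff above

with tempfile.TemporaryDirectory() as tmp:
    source = Path(tmp)
    (source / "train.py").write_text("print('hello')\n")

    # A relative entrypoint is resolved against the source directory.
    resolved = _resolve_entrypoint(source, Path("train.py"))
    assert resolved == source.absolute() / "train.py"

    # An absolute entrypoint outside the source directory is rejected.
    try:
        _resolve_entrypoint(source, source.parent / "outside.py")
    except ValueError as exc:
        print(exc)  # Entrypoint must be a subpath of ...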
@@ -168,12 +200,16 @@
          entrypoint = self.entrypoint or Path(constants.DEFAULT_ENTRYPOINT_PATH)

          # Create stage if necessary
-         stage_name = stage_path.parts[0]
-         session.sql(
-             f"create stage if not exists {stage_name.lstrip('@')}"
-             " encryption = ( type = 'SNOWFLAKE_SSE' )"
-             " comment = 'Created by snowflake.ml.jobs Python API'"
-         ).collect()
+         stage_name = stage_path.parts[0].lstrip("@")
+         # Explicitly check if stage exists first since we may not have CREATE STAGE privilege
+         try:
+             session.sql(f"describe stage {stage_name}").collect()
+         except sp_exceptions.SnowparkSQLException:
+             session.sql(
+                 f"create stage if not exists {stage_name}"
+                 " encryption = ( type = 'SNOWFLAKE_SSE' )"
+                 " comment = 'Created by snowflake.ml.jobs Python API'"
+             ).collect()

          # Upload payload to stage
          if not isinstance(source, Path):
@@ -237,7 +273,7 @@
          )


- def get_parameter_type(param: inspect.Parameter) -> Optional[Type[object]]:
+ def _get_parameter_type(param: inspect.Parameter) -> Optional[Type[object]]:
      # Unwrap Optional type annotations
      param_type = param.annotation
      if get_origin(param_type) is Union and len(get_args(param_type)) == 2 and type(None) in get_args(param_type):
@@ -249,7 +285,7 @@ def get_parameter_type(param: inspect.Parameter) -> Optional[Type[object]]:
      return cast(Type[object], param_type)


- def validate_parameter_type(param_type: Type[object], param_name: str) -> None:
+ def _validate_parameter_type(param_type: Type[object], param_name: str) -> None:
      # Validate param_type is a supported type
      if param_type not in _SUPPORTED_ARG_TYPES:
          raise ValueError(
@@ -258,41 +294,60 @@ def validate_parameter_type(param_type: Type[object], param_name: str) -> None:
          )


- def generate_python_code(func: Callable[..., Any], source_code_display: bool = False) -> str:
-     signature = inspect.signature(func)
-     if any(
-         p.kind in {inspect.Parameter.VAR_POSITIONAL, inspect.Parameter.VAR_KEYWORD}
-         for p in signature.parameters.values()
-     ):
-         raise NotImplementedError("Function must not have unpacking arguments (* or **)")
-
-     # Mirrored from Snowpark generate_python_code() function
-     # https://github.com/snowflakedb/snowpark-python/blob/main/src/snowflake/snowpark/_internal/udf_utils.py
+ def _generate_source_code_comment(func: Callable[..., Any]) -> str:
+     """Generate a comment string containing the source code of a function for readability."""
      try:
-         source_code_comment = (
-             code_generation.generate_source_code(func) if source_code_display else "" # type: ignore[arg-type]
-         )
+         if isinstance(func, functools.partial):
+             # Unwrap functools.partial and generate source code comment from the original function
+             comment = code_generation.generate_source_code(func.func) # type: ignore[arg-type]
+             args = itertools.chain((repr(a) for a in func.args), (f"{k}={v!r}" for k, v in func.keywords.items()))
+
+             # Update invocation comment to show arguments passed via functools.partial
+             comment = comment.replace(
+                 f"= {func.func.__name__}",
+                 "= functools.partial({}({}))".format(
+                     func.func.__name__,
+                     ", ".join(args),
+                 ),
+             )
+             return comment
+         else:
+             return code_generation.generate_source_code(func) # type: ignore[arg-type]
      except Exception as exc:
          error_msg = f"Source code comment could not be generated for {func} due to error {exc}."
-         source_code_comment = code_generation.comment_source_code(error_msg)
+         return code_generation.comment_source_code(error_msg)

-     func_name = "func"
-     func_code = f"""
- {source_code_comment}
-
- import pickle
- {func_name} = pickle.loads(bytes.fromhex('{cp.dumps(func).hex()}'))
- """

+ def _serialize_callable(func: Callable[..., Any]) -> bytes:
+     try:
+         func_bytes: bytes = cp.dumps(func)
+         return func_bytes
+     except pickle.PicklingError as e:
+         if isinstance(func, functools.partial):
+             # Try to find which part of the partial isn't serializable for better debuggability
+             objects = [
+                 ("function", func.func),
+                 *((f"positional arg {i}", a) for i, a in enumerate(func.args)),
+                 *((f"keyword arg '{k}'", v) for k, v in func.keywords.items()),
+             ]
+             for name, obj in objects:
+                 try:
+                     cp.dumps(obj)
+                 except pickle.PicklingError:
+                     raise ValueError(f"Unable to serialize {name}: {obj}") from e
+         raise ValueError(f"Unable to serialize function: {func}") from e
+
+
+ def _generate_param_handler_code(signature: inspect.Signature, output_name: str = "kwargs") -> str:
      # Generate argparse logic for argument handling (type coercion, default values, etc)
      argparse_code = ["import argparse", "", "parser = argparse.ArgumentParser()"]
      argparse_postproc = []
      for name, param in signature.parameters.items():
          opts = {}

-         param_type = get_parameter_type(param)
+         param_type = _get_parameter_type(param)
          if param_type is not None:
-             validate_parameter_type(param_type, name)
+             _validate_parameter_type(param_type, name)
              opts["type"] = param_type.__name__

          if param.default != inspect.Parameter.empty:
@@ -324,6 +379,37 @@ import pickle
          )
      argparse_code.append("args = parser.parse_args()")
      param_code = "\n".join(argparse_code + argparse_postproc)
+     param_code += f"\n{output_name} = vars(args)"
+
+     return param_code
+
+
+ def generate_python_code(func: Callable[..., Any], source_code_display: bool = False) -> str:
+     """Generate an entrypoint script from a Python function."""
+     signature = inspect.signature(func)
+     if any(
+         p.kind in {inspect.Parameter.VAR_POSITIONAL, inspect.Parameter.VAR_KEYWORD}
+         for p in signature.parameters.values()
+     ):
+         raise NotImplementedError("Function must not have unpacking arguments (* or **)")
+
+     # Mirrored from Snowpark generate_python_code() function
+     # https://github.com/snowflakedb/snowpark-python/blob/main/src/snowflake/snowpark/_internal/udf_utils.py
+     source_code_comment = _generate_source_code_comment(func) if source_code_display else ""
+
+     func_name = "func"
+     func_code = f"""
+ {source_code_comment}
+
+ import pickle
+ {func_name} = pickle.loads(bytes.fromhex('{_serialize_callable(func).hex()}'))
+ """
+
+     arg_dict_name = "kwargs"
+     if getattr(func, constants.IS_MLJOB_REMOTE_ATTR, None):
+         param_code = f"{arg_dict_name} = {{}}"
+     else:
+         param_code = _generate_param_handler_code(signature, arg_dict_name)

      return f"""
  ### Version guard to check compatibility across Python versions ###
@@ -348,5 +434,5 @@ if sys.version_info.major != {sys.version_info.major} or sys.version_info.minor
  if __name__ == '__main__':
  {textwrap.indent(param_code, ' ')}

- {func_name}(**vars(args))
+ {func_name}(**{arg_dict_name})
  """
snowflake/ml/jobs/_utils/spec_utils.py CHANGED
@@ -141,37 +141,35 @@ def generate_service_spec(
      )

      # Mount 30% of memory limit as a memory-backed volume
-     memory_volume_name = "dshm"
      memory_volume_size = min(
          ceil(image_spec.resource_limits.memory * constants.MEMORY_VOLUME_SIZE),
          image_spec.resource_requests.memory,
      )
      volume_mounts.append(
          {
-             "name": memory_volume_name,
+             "name": constants.MEMORY_VOLUME_NAME,
              "mountPath": "/dev/shm",
          }
      )
      volumes.append(
          {
-             "name": memory_volume_name,
+             "name": constants.MEMORY_VOLUME_NAME,
              "source": "memory",
              "size": f"{memory_volume_size}Gi",
          }
      )

      # Mount payload as volume
-     stage_mount = PurePath("/opt/app")
-     stage_volume_name = "stage-volume"
+     stage_mount = PurePath(constants.STAGE_VOLUME_MOUNT_PATH)
      volume_mounts.append(
          {
-             "name": stage_volume_name,
+             "name": constants.STAGE_VOLUME_NAME,
              "mountPath": stage_mount.as_posix(),
          }
      )
      volumes.append(
          {
-             "name": stage_volume_name,
+             "name": constants.STAGE_VOLUME_NAME,
              "source": payload.stage_path.as_posix(),
          }
      )
snowflake/ml/jobs/decorators.py CHANGED
@@ -1,6 +1,5 @@
  import copy
  import functools
- import inspect
  from typing import Callable, Dict, List, Optional, TypeVar

  from typing_extensions import ParamSpec
@@ -8,7 +7,7 @@ from typing_extensions import ParamSpec
  from snowflake import snowpark
  from snowflake.ml._internal import telemetry
  from snowflake.ml.jobs import job as jb, manager as jm
- from snowflake.ml.jobs._utils import payload_utils
+ from snowflake.ml.jobs._utils import constants

  _PROJECT = "MLJob"

@@ -50,31 +49,12 @@ def remote(
          wrapped_func = copy.copy(func)
          wrapped_func.__code__ = wrapped_func.__code__.replace(co_firstlineno=func.__code__.co_firstlineno + 1)

-         # Validate function arguments based on signature
-         signature = inspect.signature(func)
-         pos_arg_names = []
-         for name, param in signature.parameters.items():
-             param_type = payload_utils.get_parameter_type(param)
-             if param_type is not None:
-                 payload_utils.validate_parameter_type(param_type, name)
-             if param.kind in (param.POSITIONAL_ONLY, param.POSITIONAL_OR_KEYWORD):
-                 pos_arg_names.append(name)
-
          @functools.wraps(func)
          def wrapper(*args: _Args.args, **kwargs: _Args.kwargs) -> jb.MLJob:
-             # Validate positional args
-             for i, arg in enumerate(args):
-                 arg_name = pos_arg_names[i] if i < len(pos_arg_names) else f"args[{i}]"
-                 payload_utils.validate_parameter_type(type(arg), arg_name)
-
-             # Validate keyword args
-             for k, v in kwargs.items():
-                 payload_utils.validate_parameter_type(type(v), k)
-
-             arg_list = [str(v) for v in args] + [x for k, v in kwargs.items() for x in (f"--{k}", str(v))]
+             payload = functools.partial(func, *args, **kwargs)
+             setattr(payload, constants.IS_MLJOB_REMOTE_ATTR, True)
              job = jm._submit_job(
-                 source=wrapped_func,
-                 args=arg_list,
+                 source=payload,
                  stage_name=stage_name,
                  compute_pool=compute_pool,
                  pip_requirements=pip_requirements,
@@ -83,7 +63,7 @@
                  env_vars=env_vars,
                  session=session,
              )
-             assert isinstance(job, jb.MLJob)
+             assert isinstance(job, jb.MLJob), f"Unexpected job type: {type(job)}"
              return job

          return wrapper
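Taken together, the decorator no longer validates or stringifies arguments up front; they are captured in a functools.partial that is tagged and serialized with the payload. A hedged usage sketch (the keyword arguments follow those visible in the diff above; the full public signature of remote() and its import path are assumptions, not shown in this diff):

from snowflake.ml.jobs import remote  # import path assumed; remote() is defined in decorators.py

@remote(compute_pool="MY_COMPUTE_POOL", stage_name="payload_stage")
def train(data_path: str, epochs: int = 10) -> None:
    print(f"Training on {data_path} for {epochs} epochs")

# Calling the wrapped function submits an MLJob; the arguments travel inside the
# pickled functools.partial instead of being re-validated and passed as
# command-line strings.
job = train("@my_stage/data/", epochs=5)
print(job.status)  # MLJob exposes a status property, per job.py below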
snowflake/ml/jobs/job.py CHANGED
@@ -4,7 +4,7 @@ from typing import Any, List, Optional, cast
  from snowflake import snowpark
  from snowflake.ml._internal import telemetry
  from snowflake.ml.jobs._utils import constants, types
- from snowflake.snowpark.context import get_active_session
+ from snowflake.snowpark import context as sp_context

  _PROJECT = "MLJob"
  TERMINAL_JOB_STATUSES = {"FAILED", "DONE", "INTERNAL_ERROR"}
@@ -13,7 +13,7 @@ TERMINAL_JOB_STATUSES = {"FAILED", "DONE", "INTERNAL_ERROR"}
  class MLJob:
      def __init__(self, id: str, session: Optional[snowpark.Session] = None) -> None:
          self._id = id
-         self._session = session or get_active_session()
+         self._session = session or sp_context.get_active_session()
          self._status: types.JOB_STATUS = "PENDING"

      @property
@@ -79,7 +79,7 @@
          return self.status


- @telemetry.send_api_usage_telemetry(project=_PROJECT)
+ @telemetry.send_api_usage_telemetry(project=_PROJECT, func_params_to_log=["job_id"])
  def _get_status(session: snowpark.Session, job_id: str) -> types.JOB_STATUS:
      """Retrieve job execution status."""
      # TODO: snowflake-snowpark-python<1.24.0 shows spurious error messages on
@@ -90,7 +90,7 @@ def _get_status(session: snowpark.Session, job_id: str) -> types.JOB_STATUS:
      return cast(types.JOB_STATUS, row["status"])


- @telemetry.send_api_usage_telemetry(project=_PROJECT)
+ @telemetry.send_api_usage_telemetry(project=_PROJECT, func_params_to_log=["job_id", "limit"])
  def _get_logs(session: snowpark.Session, job_id: str, limit: int = -1) -> str:
      """
      Retrieve the job's execution logs.
snowflake/ml/model/_packager/model_env/model_env.py CHANGED
@@ -113,7 +113,33 @@ class ModelEnv:
          self._snowpark_ml_version = version.parse(snowpark_ml_version)

      def include_if_absent(self, pkgs: List[ModelDependency], check_local_version: bool = False) -> None:
-         """Append requirements into model env if absent.
+         """Append requirements into model env if absent. Depending on the environment, requirements may be added
+         to either the pip requirements or conda dependencies.
+
+         Args:
+             pkgs: A list of ModelDependency namedtuple to be appended.
+             check_local_version: Flag to indicate if it is required to pin to local version. Defaults to False.
+         """
+         if self.pip_requirements and not self.conda_dependencies and pkgs:
+             pip_pkg_reqs: List[str] = []
+             warnings.warn(
+                 (
+                     "Dependencies specified from pip requirements."
+                     " This may prevent model deploying to Snowflake Warehouse."
+                 ),
+                 category=UserWarning,
+                 stacklevel=2,
+             )
+             for conda_req_str, pip_name in pkgs:
+                 _, conda_req = env_utils._validate_conda_dependency_string(conda_req_str)
+                 pip_req = requirements.Requirement(f"{pip_name}{conda_req.specifier}")
+                 pip_pkg_reqs.append(str(pip_req))
+             self._include_if_absent_pip(pip_pkg_reqs, check_local_version)
+         else:
+             self._include_if_absent_conda(pkgs, check_local_version)
+
+     def _include_if_absent_conda(self, pkgs: List[ModelDependency], check_local_version: bool = False) -> None:
+         """Append requirements into model env conda dependencies if absent.

          Args:
              pkgs: A list of ModelDependency namedtuple to be appended.
@@ -134,8 +160,8 @@
              if show_warning_message:
                  warnings.warn(
                      (
-                         f"Basic dependency {req_to_add.name} specified from PIP requirements."
-                         + " This may prevent model deploying to Snowflake Warehouse."
+                         f"Basic dependency {req_to_add.name} specified from pip requirements."
+                         " This may prevent model deploying to Snowflake Warehouse."
                      ),
                      category=UserWarning,
                      stacklevel=2,
@@ -157,11 +183,11 @@
                      stacklevel=2,
                  )

-     def include_if_absent_pip(self, pkgs: List[str], check_local_version: bool = False) -> None:
-         """Append pip requirements into model env if absent.
+     def _include_if_absent_pip(self, pkgs: List[str], check_local_version: bool = False) -> None:
+         """Append pip requirements into model env pip requirements if absent.

          Args:
-             pkgs: A list of string to be appended in pip requirement.
+             pkgs: A list of strings to be appended to pip environment.
              check_local_version: Flag to indicate if it is required to pin to local version. Defaults to False.
          """

@@ -187,25 +213,6 @@
                  self._conda_dependencies[channel].remove(spec)

      def generate_env_for_cuda(self) -> None:
-         if self.cuda_version is None:
-             return
-
-         cuda_spec = env_utils.find_dep_spec(
-             self._conda_dependencies, self._pip_requirements, conda_pkg_name="cuda", remove_spec=False
-         )
-         if cuda_spec and not cuda_spec.specifier.contains(self.cuda_version):
-             raise ValueError(
-                 "The CUDA requirement you specified in your conda dependencies or pip requirements is"
-                 " conflicting with CUDA version required. Please do not specify CUDA dependency using conda"
-                 " dependencies or pip requirements."
-             )
-
-         if not cuda_spec:
-             self.include_if_absent(
-                 [ModelDependency(requirement=f"nvidia::cuda=={self.cuda_version}.*", pip_name="cuda")],
-                 check_local_version=False,
-             )
-
          xgboost_spec = env_utils.find_dep_spec(
              self._conda_dependencies, self._pip_requirements, conda_pkg_name="xgboost", remove_spec=True
          )
@@ -236,7 +243,7 @@
              check_local_version=False,
          )

-         self.include_if_absent_pip(["bitsandbytes>=0.41.0"], check_local_version=False)
+         self._include_if_absent_pip(["bitsandbytes>=0.41.0"], check_local_version=False)

      def relax_version(self) -> None:
          """Relax the version requirements for both conda dependencies and pip requirements.
@@ -252,7 +259,9 @@
          self._pip_requirements = list(map(env_utils.relax_requirement_version, self._pip_requirements))

      def load_from_conda_file(self, conda_env_path: pathlib.Path) -> None:
-         conda_dependencies_dict, pip_requirements_list, python_version = env_utils.load_conda_env_file(conda_env_path)
+         conda_dependencies_dict, pip_requirements_list, python_version, cuda_version = env_utils.load_conda_env_file(
+             conda_env_path
+         )

          for channel, channel_dependencies in conda_dependencies_dict.items():
              if channel != env_utils.DEFAULT_CHANNEL_NAME:
@@ -310,6 +319,9 @@
          if python_version:
              self.python_version = python_version

+         if cuda_version:
+             self.cuda_version = cuda_version
+
      def load_from_pip_file(self, pip_requirements_path: pathlib.Path) -> None:
          pip_requirements_list = env_utils.load_requirements_file(pip_requirements_path)

@@ -342,12 +354,17 @@
          self.snowpark_ml_version = env_dict["snowpark_ml_version"]

      def save_as_dict(
-         self, base_dir: pathlib.Path, default_channel_override: str = env_utils.SNOWFLAKE_CONDA_CHANNEL_URL
+         self,
+         base_dir: pathlib.Path,
+         default_channel_override: str = env_utils.SNOWFLAKE_CONDA_CHANNEL_URL,
+         is_gpu: Optional[bool] = False,
      ) -> model_meta_schema.ModelEnvDict:
+         cuda_version = self.cuda_version if is_gpu else None
          env_utils.save_conda_env_file(
              pathlib.Path(base_dir / self.conda_env_rel_path),
              self._conda_dependencies,
              self.python_version,
+             cuda_version,
              default_channel_override=default_channel_override,
          )
          env_utils.save_requirements_file(
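The behavioral change worth noting in ModelEnv.include_if_absent() is the dispatch: a pip-only environment keeps added dependencies as pip requirements (converted from the conda spec), otherwise they go to conda dependencies. A standalone sketch of that conversion, assuming a simple conda spec with no channel prefix (the real code parses it via env_utils._validate_conda_dependency_string and does not use this helper):

from packaging import requirements

def to_pip_requirement(conda_req_str: str, pip_name: str) -> str:
    # Simplified stand-in for the pip-only branch above: reuse the conda version
    # specifier but emit it under the package's pip name.
    conda_req = requirements.Requirement(conda_req_str)
    return str(requirements.Requirement(f"{pip_name}{conda_req.specifier}"))

print(to_pip_requirement("pytorch==2.1", "torch"))  # torch==2.1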
snowflake/ml/model/_packager/model_handlers/_utils.py CHANGED
@@ -39,7 +39,7 @@ def _is_callable(model: model_types.SupportedModelType, method_name: str) -> boo


  def get_truncated_sample_data(
-     sample_input_data: model_types.SupportedDataType, length: int = 100
+     sample_input_data: model_types.SupportedDataType, length: int = 100, is_for_modeling_model: bool = False
  ) -> model_types.SupportedLocalDataType:
      trunc_sample_input = model_signature._truncate_data(sample_input_data, length=length)
      local_sample_input: model_types.SupportedLocalDataType = None
@@ -47,6 +47,8 @@ def get_truncated_sample_data(
          # Added because of Any from missing stubs.
          trunc_sample_input = cast(SnowparkDataFrame, trunc_sample_input)
          local_sample_input = snowpark_handler.SnowparkDataFrameHandler.convert_to_df(trunc_sample_input)
+         if is_for_modeling_model:
+             local_sample_input.columns = trunc_sample_input.columns
      else:
          local_sample_input = trunc_sample_input
      return local_sample_input
@@ -58,13 +60,15 @@ def validate_signature(
      target_methods: Iterable[str],
      sample_input_data: Optional[model_types.SupportedDataType],
      get_prediction_fn: Callable[[str, model_types.SupportedLocalDataType], model_types.SupportedLocalDataType],
+     is_for_modeling_model: bool = False,
  ) -> model_meta.ModelMetadata:
      if model_meta.signatures:
          validate_target_methods(model, list(model_meta.signatures.keys()))
          if sample_input_data is not None:
-             local_sample_input = get_truncated_sample_data(sample_input_data)
+             local_sample_input = get_truncated_sample_data(
+                 sample_input_data, is_for_modeling_model=is_for_modeling_model
+             )
          for target_method in model_meta.signatures.keys():
-
              model_signature_inst = model_meta.signatures.get(target_method)
              if model_signature_inst is not None:
                  # strict validation the input signature
@@ -77,7 +81,7 @@
          assert (
              sample_input_data is not None
          ), "Model signature and sample input are None at the same time. This should not happen with local model."
-         local_sample_input = get_truncated_sample_data(sample_input_data)
+         local_sample_input = get_truncated_sample_data(sample_input_data, is_for_modeling_model=is_for_modeling_model)
          for target_method in target_methods:
              predictions_df = get_prediction_fn(target_method, local_sample_input)
              sig = model_signature.infer_signature(
snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py CHANGED
@@ -146,6 +146,10 @@ class HuggingFacePipelineHandler(
          framework = getattr(model, "framework", None)
          batch_size = getattr(model, "batch_size", None)

+         has_tokenizer = getattr(model, "tokenizer", None) is not None
+         has_feature_extractor = getattr(model, "feature_extractor", None) is not None
+         has_image_preprocessor = getattr(model, "image_preprocessor", None) is not None
+
          if type_utils.LazyType("transformers.Pipeline").isinstance(model):
              params = {
                  **model._preprocess_params, # type:ignore[attr-defined]
@@ -234,6 +238,9 @@
                  {
                      "task": task,
                      "batch_size": batch_size if batch_size is not None else 1,
+                     "has_tokenizer": has_tokenizer,
+                     "has_feature_extractor": has_feature_extractor,
+                     "has_image_preprocessor": has_image_preprocessor,
                  }
              ),
          )
@@ -308,6 +315,14 @@
          if os.path.isdir(model_blob_file_or_dir_path):
              import transformers

+             additional_pipeline_params = {}
+             if model_blob_options.get("has_tokenizer", False):
+                 additional_pipeline_params["tokenizer"] = model_blob_file_or_dir_path
+             if model_blob_options.get("has_feature_extractor", False):
+                 additional_pipeline_params["feature_extractor"] = model_blob_file_or_dir_path
+             if model_blob_options.get("has_image_preprocessor", False):
+                 additional_pipeline_params["image_preprocessor"] = model_blob_file_or_dir_path
+
              with open(
                  os.path.join(
                      model_blob_file_or_dir_path,
@@ -324,6 +339,7 @@
                  model=model_blob_file_or_dir_path,
                  trust_remote_code=True,
                  torch_dtype="auto",
+                 **additional_pipeline_params,
                  **device_config,
              )