snowflake-ml-python 1.7.2__py3-none-any.whl → 1.7.3__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
Files changed (69)
  1. snowflake/cortex/__init__.py +16 -8
  2. snowflake/cortex/_classify_text.py +12 -1
  3. snowflake/cortex/_complete.py +82 -13
  4. snowflake/cortex/_embed_text_1024.py +9 -2
  5. snowflake/cortex/_embed_text_768.py +9 -2
  6. snowflake/cortex/_extract_answer.py +9 -2
  7. snowflake/cortex/_sentiment.py +9 -2
  8. snowflake/cortex/_summarize.py +9 -2
  9. snowflake/cortex/_translate.py +9 -2
  10. snowflake/ml/_internal/env_utils.py +7 -52
  11. snowflake/ml/_internal/utils/identifier.py +4 -2
  12. snowflake/ml/data/__init__.py +3 -0
  13. snowflake/ml/data/_internal/arrow_ingestor.py +4 -4
  14. snowflake/ml/data/data_connector.py +53 -11
  15. snowflake/ml/data/data_ingestor.py +2 -1
  16. snowflake/ml/data/torch_utils.py +18 -5
  17. snowflake/ml/feature_store/examples/example_helper.py +2 -1
  18. snowflake/ml/fileset/fileset.py +18 -18
  19. snowflake/ml/model/_client/model/model_version_impl.py +5 -3
  20. snowflake/ml/model/_client/ops/model_ops.py +2 -6
  21. snowflake/ml/model/_client/sql/model_version.py +11 -0
  22. snowflake/ml/model/_model_composer/model_composer.py +8 -3
  23. snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +20 -1
  24. snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +1 -0
  25. snowflake/ml/model/_model_composer/model_method/constants.py +1 -0
  26. snowflake/ml/model/_model_composer/model_method/function_generator.py +2 -0
  27. snowflake/ml/model/_model_composer/model_method/infer_function.py_template +1 -1
  28. snowflake/ml/model/_model_composer/model_method/infer_partitioned.py_template +1 -1
  29. snowflake/ml/model/_model_composer/model_method/infer_table_function.py_template +1 -1
  30. snowflake/ml/model/_model_composer/model_method/model_method.py +9 -1
  31. snowflake/ml/model/_model_composer/model_user_file/model_user_file.py +27 -0
  32. snowflake/ml/model/_packager/model_handlers/_utils.py +27 -2
  33. snowflake/ml/model/_packager/model_handlers/catboost.py +3 -3
  34. snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py +5 -1
  35. snowflake/ml/model/_packager/model_handlers/lightgbm.py +5 -3
  36. snowflake/ml/model/_packager/model_handlers/sentence_transformers.py +55 -20
  37. snowflake/ml/model/_packager/model_handlers/sklearn.py +9 -10
  38. snowflake/ml/model/_packager/model_handlers/snowmlmodel.py +66 -28
  39. snowflake/ml/model/_packager/model_handlers/tensorflow.py +70 -17
  40. snowflake/ml/model/_packager/model_handlers/xgboost.py +3 -3
  41. snowflake/ml/model/_packager/model_meta/model_meta.py +3 -0
  42. snowflake/ml/model/_packager/model_meta/model_meta_schema.py +6 -1
  43. snowflake/ml/model/_packager/model_runtime/_snowml_inference_alternative_requirements.py +2 -2
  44. snowflake/ml/model/_packager/model_task/model_task_utils.py +3 -2
  45. snowflake/ml/model/_signatures/pandas_handler.py +1 -1
  46. snowflake/ml/model/_signatures/snowpark_handler.py +8 -2
  47. snowflake/ml/model/type_hints.py +1 -0
  48. snowflake/ml/modeling/_internal/model_trainer_builder.py +0 -8
  49. snowflake/ml/modeling/_internal/model_transformer_builder.py +0 -13
  50. snowflake/ml/modeling/pipeline/pipeline.py +6 -176
  51. snowflake/ml/modeling/xgboost/xgb_classifier.py +161 -88
  52. snowflake/ml/modeling/xgboost/xgb_regressor.py +160 -85
  53. snowflake/ml/modeling/xgboost/xgbrf_classifier.py +160 -85
  54. snowflake/ml/modeling/xgboost/xgbrf_regressor.py +160 -85
  55. snowflake/ml/monitoring/_client/model_monitor_sql_client.py +4 -4
  56. snowflake/ml/registry/_manager/model_manager.py +70 -33
  57. snowflake/ml/registry/registry.py +41 -22
  58. snowflake/ml/version.py +1 -1
  59. {snowflake_ml_python-1.7.2.dist-info → snowflake_ml_python-1.7.3.dist-info}/METADATA +38 -9
  60. {snowflake_ml_python-1.7.2.dist-info → snowflake_ml_python-1.7.3.dist-info}/RECORD +63 -67
  61. {snowflake_ml_python-1.7.2.dist-info → snowflake_ml_python-1.7.3.dist-info}/WHEEL +1 -1
  62. snowflake/ml/_internal/utils/retryable_http.py +0 -39
  63. snowflake/ml/fileset/parquet_parser.py +0 -170
  64. snowflake/ml/fileset/tf_dataset.py +0 -88
  65. snowflake/ml/fileset/torch_datapipe.py +0 -57
  66. snowflake/ml/modeling/_internal/ml_runtime_implementations/ml_runtime_handlers.py +0 -151
  67. snowflake/ml/modeling/_internal/ml_runtime_implementations/ml_runtime_trainer.py +0 -66
  68. {snowflake_ml_python-1.7.2.dist-info → snowflake_ml_python-1.7.3.dist-info}/LICENSE.txt +0 -0
  69. {snowflake_ml_python-1.7.2.dist-info → snowflake_ml_python-1.7.3.dist-info}/top_level.txt +0 -0
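
Note on the removed data-loading helpers: entries 63-65 above delete snowflake/ml/fileset/parquet_parser.py, tf_dataset.py, and torch_datapipe.py (their full contents appear as deletions below), while snowflake/ml/data/data_connector.py and snowflake/ml/data/torch_utils.py gain code in the same release. The sketch below is not taken from this diff; it only illustrates, under the assumption that snowflake.ml.data.DataConnector exposes from_dataframe(), to_tf_dataset(), and to_torch_datapipe() with the keyword arguments shown, how the DataConnector surface covers the same TensorFlow/PyTorch loading use case. The table name and connection parameters are placeholders.

# Sketch only: assumed DataConnector API, verify against the 1.7.3 reference before use.
from snowflake.ml.data import DataConnector
from snowflake.snowpark import Session

connection_parameters = {"account": "...", "user": "...", "password": "..."}  # fill in your own
session = Session.builder.configs(connection_parameters).create()

# Hypothetical training table standing in for the parquet files the removed helpers read.
df = session.table("MY_DB.MY_SCHEMA.TRAINING_DATA")
connector = DataConnector.from_dataframe(df)

# TensorFlow path, analogous to the removed fileset.tf_dataset.read_and_parse_parquet().
tf_ds = connector.to_tf_dataset(batch_size=2, shuffle=True, drop_last_batch=True)

# PyTorch path, analogous to the removed fileset.torch_datapipe.ReadAndParseParquet.
torch_dp = connector.to_torch_datapipe(batch_size=2, shuffle=True, drop_last_batch=True)
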
snowflake/ml/fileset/parquet_parser.py (deleted)
@@ -1,170 +0,0 @@
- import collections
- import logging
- import time
- from typing import Any, Deque, Dict, Iterator, List
-
- import fsspec
- import numpy as np
- import numpy.typing as npt
- import pyarrow as pa
- import pyarrow.dataset as ds
-
- _EMPTY_RECORD_BATCH = pa.RecordBatch.from_arrays([], [])
-
- # The row count for batches read from PyArrow Dataset. This number should be large enough so that
- # dataset.to_batches() would read in a very large portion of, if not entirely, a parquet file.
- _DEFAULT_DATASET_BATCH_SIZE = 1000000
-
-
- class _RecordBatchesBuffer:
-     """A queue that stores record batches and tracks the total num of rows in it."""
-
-     def __init__(self) -> None:
-         self.buffer: Deque[pa.RecordBatch] = collections.deque()
-         self.num_rows = 0
-
-     def append(self, rb: pa.RecordBatch) -> None:
-         self.buffer.append(rb)
-         self.num_rows += rb.num_rows
-
-     def appendleft(self, rb: pa.RecordBatch) -> None:
-         self.buffer.appendleft(rb)
-         self.num_rows += rb.num_rows
-
-     def popleft(self) -> pa.RecordBatch:
-         popped = self.buffer.popleft()
-         self.num_rows -= popped.num_rows
-         return popped
-
-
- class ParquetParser:
-     """Read and parse the given parquet files and yield batched numpy array in dict.
-
-     Args:
-         file_paths: A list of parquet file URIs to read and parse.
-         filesystem: A fsspec/pyarrow file system that is used to open given file URIs.
-         batch_size: Specifies the size of each batch that will be yield
-         shuffle: Whether the data in the file will be shuffled. If set to be true, it will first randomly shuffle
-             the order of files, and then shuflle the order of rows in each file.
-         drop_last_batch: Whether the last batch of data should be dropped. If set to be true, then the last batch will
-             get dropped if its size is smaller than the given batch_size.
-
-     Returns:
-         A PyTorch iterable datapipe that yields batched numpy array in dict. The keys will be the column names in
-         the parquet files, and the value will be the column value as a list.
-     """
-
-     def __init__(
-         self,
-         file_paths: List[str],
-         filesystem: fsspec.AbstractFileSystem,
-         batch_size: int,
-         shuffle: bool = True,
-         drop_last_batch: bool = True,
-     ) -> None:
-         self._file_paths = file_paths
-         self._fs = filesystem
-         self._batch_size = batch_size
-         self._dataset_batch_size = max(_DEFAULT_DATASET_BATCH_SIZE, self._batch_size)
-         self._shuffle = shuffle
-         self._drop_last_batch = drop_last_batch
-
-     def __iter__(self) -> Iterator[Dict[str, npt.NDArray[Any]]]:
-         """Iterate through PyArrow Dataset to generate batches whose length equals to expected batch size.
-
-         As we are generating batches with the exactly same length, the last few rows in each file might get left as they
-         are not long enough to form a batch. These rows will be put into a temporary buffer and combine with the first
-         few rows of the next file to generate a new batch.
-
-         Yields:
-             A dict mapping column names to the corresponding data fetch from that column.
-         """
-         self._rb_buffer = _RecordBatchesBuffer()
-         files = list(self._file_paths)
-         if self._shuffle:
-             np.random.shuffle(files)
-         pa_dataset: ds.Dataset = ds.dataset(files, format="parquet", filesystem=self._fs)
-
-         for rb in _retryable_batches(pa_dataset, batch_size=self._dataset_batch_size):
-             if self._shuffle:
-                 rb = rb.take(np.random.permutation(rb.num_rows))
-             self._rb_buffer.append(rb)
-             while self._rb_buffer.num_rows >= self._batch_size:
-                 yield self._get_batches_from_buffer()
-
-         if self._rb_buffer.num_rows and not self._drop_last_batch:
-             yield self._get_batches_from_buffer()
-
-     def _get_batches_from_buffer(self) -> Dict[str, npt.NDArray[Any]]:
-         """Generate new batches from the existing record batch buffer."""
-         cnt_rbs_num_rows = 0
-         candidates = []
-
-         # Keep popping record batches in buffer until there are enough rows for a batch.
-         while self._rb_buffer.num_rows and cnt_rbs_num_rows < self._batch_size:
-             candidate = self._rb_buffer.popleft()
-             cnt_rbs_num_rows += candidate.num_rows
-             candidates.append(candidate)
-
-         # When there are more rows than needed, slice the last popped batch to fit batch_size.
-         if cnt_rbs_num_rows > self._batch_size:
-             row_diff = cnt_rbs_num_rows - self._batch_size
-             slice_target = candidates[-1]
-             cut_off = slice_target.num_rows - row_diff
-             to_merge = slice_target.slice(length=cut_off)
-             left_over = slice_target.slice(offset=cut_off)
-             candidates[-1] = to_merge
-             self._rb_buffer.appendleft(left_over)
-
-         res = _merge_record_batches(candidates)
-         return _record_batch_to_arrays(res)
-
-
- def _merge_record_batches(record_batches: List[pa.RecordBatch]) -> pa.RecordBatch:
-     """Merge a list of arrow RecordBatches into one. Similar to MergeTables."""
-     if not record_batches:
-         return _EMPTY_RECORD_BATCH
-     if len(record_batches) == 1:
-         return record_batches[0]
-     record_batches = list(filter(lambda rb: rb.num_rows > 0, record_batches))
-     one_chunk_table = pa.Table.from_batches(record_batches).combine_chunks()
-     batches = one_chunk_table.to_batches(max_chunksize=None)
-     return batches[0]
-
-
- def _record_batch_to_arrays(rb: pa.RecordBatch) -> Dict[str, npt.NDArray[Any]]:
-     """Transform the record batch to a (string, numpy array) dict."""
-     batch_dict = {}
-     for column, column_schema in zip(rb, rb.schema):
-         # zero_copy_only=False because of nans. Ideally nans should have been imputed in feature engineering.
-         array = column.to_numpy(zero_copy_only=False)
-         batch_dict[column_schema.name] = array
-     return batch_dict
-
-
- def _retryable_batches(
-     dataset: ds.Dataset, batch_size: int, max_retries: int = 3, delay: int = 0
- ) -> Iterator[pa.RecordBatch]:
-     """Make the Dataset to_batches retryable."""
-     retries = 0
-     current_batch_index = 0
-
-     while True:
-         try:
-             for batch_index, batch in enumerate(dataset.to_batches(batch_size=batch_size)):
-                 if batch_index < current_batch_index:
-                     # Skip batches that have already been processed
-                     continue
-
-                 yield batch
-                 current_batch_index = batch_index + 1
-             # Exit the loop once all batches are processed
-             break
-
-         except Exception as e:
-             if retries < max_retries:
-                 retries += 1
-                 logging.info(f"Error encountered: {e}. Retrying {retries}/{max_retries}...")
-                 time.sleep(delay)
-             else:
-                 raise e
snowflake/ml/fileset/tf_dataset.py (deleted)
@@ -1,88 +0,0 @@
- from typing import Any, Dict, Generator, List
-
- import fsspec
- import numpy.typing as npt
- import pyarrow as pa
- import pyarrow.parquet as pq
- import tensorflow as tf
-
- from snowflake.ml._internal.exceptions import (
-     error_codes,
-     exceptions as snowml_exceptions,
- )
- from snowflake.ml.fileset import parquet_parser
-
-
- def read_and_parse_parquet(
-     files: List[str],
-     filesystem: fsspec.AbstractFileSystem,
-     batch_size: int,
-     shuffle: bool,
-     drop_last_batch: bool,
- ) -> tf.data.Dataset:
-     """Creates a tf.data.Dataset that reads given parquet files into batched Tensors.
-
-     Args:
-         files: A list of input parquet file URIs to read and parse. The parquet files should
-             have the same schema.
-         filesystem: A fsspec/pyarrow file system that is used to open given file URIs.
-         batch_size: Specifies the size of each batch that will be yield. It is preferred to
-             set it to your training batch size, and avoid using dataset.{batch(),rebatch()} later.
-         shuffle: Whether the data in the file will be shuffled. If set to be true, it will first randomly shuffle
-             the order of files, and then shuflle the order of rows in each file. It is preferred
-             to shuffle the data this way than dataset.unbatch().shuffle().rebatch().
-         drop_last_batch: Whether the last batch of data should be dropped. If set to be true, then the last batch will
-             get dropped if its size is smaller than the given batch_size.
-
-     Returns:
-         A tf.data.Dataset generates batched Tensors in a dict. The keys will be the column names in
-         the parquet files.
-
-     Raises:
-         SnowflakeMLException: if `files` is empty.
-
-     Example:
-     >>> from snowflake.ml.fileset import sfcfs, tf_dataset
-     >>> conn = snowflake.connector.connect(**connection_parameters)
-     >>> fs = sfcfs.SFFileSystem(conn)
-     >>> files = fs.ls(dir_path)
-     >>> ds = tf_dataset.parse_and_read_parquet(files, fs, batch_size = 2)
-     >>> for batch in ds:
-     >>> print(batch)
-     ----
-     {'_COL_1': <tf.Tensor: shape=(2,), dtype=float32, numpy=[32.5000, 6.0000]>,
-     '_COL_2': <tf.Tensor: shape=(2,), dtype=float32, numpy=[-73.9542, -73.9875]>}
-     """
-     if not files:
-         raise snowml_exceptions.SnowflakeMLException(
-             error_code=error_codes.SNOWML_READ_FAILED,
-             original_exception=ValueError("At least one file is needed to create a TF dataset."),
-         )
-
-     def generator() -> Generator[Dict[str, npt.NDArray[Any]], None, None]:
-         yield from parquet_parser.ParquetParser(list(files), filesystem, batch_size, shuffle, drop_last_batch)
-
-     return tf.data.Dataset.from_generator(generator, output_signature=_derive_signature(files[0], filesystem))
-
-
- def _arrow_type_to_tensor_spec(field: pa.Field) -> tf.TensorSpec:
-     try:
-         dtype = tf.dtypes.as_dtype(field.type.to_pandas_dtype())
-     except TypeError:
-         raise snowml_exceptions.SnowflakeMLException(
-             error_code=error_codes.INVALID_DATA_TYPE,
-             original_exception=TypeError(f"Column {field.name} has unsupportd type {field.type}."),
-         )
-     # First dimension is batch dimension.
-     return tf.TensorSpec(shape=(None,), dtype=dtype)
-
-
- def _derive_signature(file: str, filesystem: fsspec.AbstractFileSystem) -> Dict[str, tf.TensorSpec]:
-     """Derives the signature of the TF dataset from one parquet file."""
-     # TODO(zpeng): pq.read_schema does not support `filesystem` until pyarrow>=10.
-     # switch to pq.read_schema when we depend on that.
-     schema = pq.read_table(file, filesystem=filesystem).schema
-     # Signature:
-     # The dataset yields dicts. Keys are column names; values are 1-D tensors (
-     # the first dimension is batch dimension).
-     return {field.name: _arrow_type_to_tensor_spec(field) for field in schema}
snowflake/ml/fileset/torch_datapipe.py (deleted)
@@ -1,57 +0,0 @@
- from typing import Any, Dict, Iterator
-
- import fsspec
- import numpy.typing as npt
- from torchdata.datapipes.iter import IterDataPipe
-
- from snowflake.ml.fileset import parquet_parser
-
-
- class ReadAndParseParquet(IterDataPipe):
-     """Read and parse the parquet files yield batched numpy array in dict.
-
-     Args:
-         input_datapipe: A datapipe of input parquet file URIs to read and parse.
-             Note that the datapipe must be finite.
-         filesystem: A fsspec/pyarrow file system that is used to open given file URIs.
-         batch_size: Specifies the size of each batch that will be yield
-         shuffle: Whether the data in the file will be shuffled. If set to be true, it will first randomly shuffle
-             the order of files, and then shuflle the order of rows in each file.
-         drop_last_batch: Whether the last batch of data should be dropped. If set to be true, then the last batch will
-             get dropped if its size is smaller than the given batch_size.
-
-     Returns:
-         A PyTorch iterable datapipe that yields batched numpy array in dict. The keys will be the column names in
-         the parquet files.
-
-     Example:
-     >>> from snowflake.ml.fileset import sfcfs, torch_datapipe
-     >>> from torchdata.datapipes.iter import FSSpecFileLister
-     >>> conn = snowflake.connector.connect(**connection_parameters)
-     >>> fs = sfcfs.SFFileSystem(conn)
-     >>> filedp = FSSpecFileLister(root=dir_path, masks="*.parquet", mode="rb", sf_connection=conn)
-     >>> parquet_dp = torch_datapipe.ReadAndParseParquet(file_dp, fs, batch_size = 2)
-     >>> for batch in parquet_dp:
-     >>> print(batch)
-     ----
-     {'_COL_1': [32.5000, 6.0000], '_COL_2': [-73.9542, -73.9875]}
-     """
-
-     def __init__(
-         self,
-         input_datapipe: IterDataPipe[str],
-         filesystem: fsspec.AbstractFileSystem,
-         batch_size: int,
-         shuffle: bool,
-         drop_last_batch: bool,
-     ) -> None:
-         self._input_datapipe = input_datapipe
-         self._fs = filesystem
-         self._batch_size = batch_size
-         self._shuffle = shuffle
-         self._drop_last_batch = drop_last_batch
-
-     def __iter__(self) -> Iterator[Dict[str, npt.NDArray[Any]]]:
-         yield from parquet_parser.ParquetParser(
-             list(self._input_datapipe), self._fs, self._batch_size, self._shuffle, self._drop_last_batch
-         )
snowflake/ml/modeling/_internal/ml_runtime_implementations/ml_runtime_handlers.py (deleted)
@@ -1,151 +0,0 @@
- from typing import Any, List, Optional
-
- from snowflake.ml.modeling._internal.snowpark_implementations.snowpark_handlers import (
-     SnowparkTransformHandlers,
- )
- from snowflake.snowpark import DataFrame, Session
-
-
- class MLRuntimeTransformHandlers:
-     def __init__(
-         self,
-         dataset: DataFrame,
-         estimator: object,
-         class_name: str,
-         subproject: str,
-         autogenerated: Optional[bool] = False,
-     ) -> None:
-         """
-         Args:
-             dataset: The dataset to run transform functions on.
-             estimator: The estimator used to run transforms.
-             class_name: class name to be used in telemetry.
-             subproject: subproject to be used in telemetry.
-             autogenerated: Whether the class was autogenerated from a template.
-
-         Raises:
-             ModuleNotFoundError: The mlruntimes_client module is not available.
-         """
-         try:
-             from snowflake.ml.runtime import MLRuntimeClient
-         except ModuleNotFoundError as e:
-             # This is an internal exception, not a user-facing one. The snowflake.ml.runtime module should
-             # always be present when this class is instantiated.
-             raise e
-
-         self.client = MLRuntimeClient()
-         self.dataset = dataset
-         self.estimator = estimator
-         self._class_name = class_name
-         self._subproject = subproject
-         self._autogenerated = autogenerated
-
-     def batch_inference(
-         self,
-         inference_method: str,
-         input_cols: List[str],
-         expected_output_cols: List[str],
-         session: Session,
-         dependencies: List[str],
-         drop_input_cols: Optional[bool] = False,
-         expected_output_cols_type: Optional[str] = "",
-         *args: Any,
-         **kwargs: Any,
-     ) -> DataFrame:
-         """Run batch inference on the given dataset.
-         Temporary workaround - pushdown implementation is not currently ready for batch_inference.
-         We use a SnowparkTransformHandlers until we have a way to use the runtime client.
-
-         Args:
-             inference_method: the name of the method used by `estimator` to run inference.
-             input_cols: List of feature columns for inference.
-             session: An active Snowpark Session.
-             dependencies: List of dependencies for the transformer.
-             expected_output_cols: column names (in order) of the output dataset.
-             drop_input_cols: Boolean to determine whether to drop the input columns from the output dataset.
-             expected_output_cols_type: Expected type of the output columns.
-             args: additional positional arguments.
-             kwargs: additional keyword args.
-
-         Returns:
-             A new dataset of the same type as the input dataset.
-
-         """
-
-         mlrs_inference_methods = ["predict", "predict_proba", "predict_log_proba"]
-
-         if inference_method in mlrs_inference_methods:
-             result_df = self.client.inference(
-                 estimator=self.estimator,
-                 dataset=self.dataset,
-                 inference_method=inference_method,
-                 input_cols=input_cols,
-                 output_cols=expected_output_cols,
-                 drop_input_cols=drop_input_cols,
-             )
-
-         else:
-             handler = SnowparkTransformHandlers(
-                 dataset=self.dataset,
-                 estimator=self.estimator,
-                 class_name=self._class_name,
-                 subproject=self._subproject,
-                 autogenerated=self._autogenerated,
-             )
-             result_df = handler.batch_inference(
-                 inference_method,
-                 input_cols,
-                 expected_output_cols,
-                 session,
-                 dependencies,
-                 drop_input_cols,
-                 expected_output_cols_type,
-                 *args,
-                 **kwargs,
-             )
-
-         assert isinstance(result_df, DataFrame)  # mypy - The MLRS return types are annotated as `object`.
-         return result_df
-
-     def score(
-         self,
-         input_cols: List[str],
-         label_cols: List[str],
-         session: Session,
-         dependencies: List[str],
-         score_sproc_imports: List[str],
-         sample_weight_col: Optional[str] = None,
-         *args: Any,
-         **kwargs: Any,
-     ) -> float:
-         """Score the given test dataset.
-
-         Args:
-             session: An active Snowpark Session.
-             dependencies: score function dependencies.
-             score_sproc_imports: imports for score stored procedure.
-             input_cols: List of feature columns for inference.
-             label_cols: List of label columns for scoring.
-             sample_weight_col: A column assigning relative weights to each row for scoring.
-             args: additional positional arguments.
-             kwargs: additional keyword args.
-
-
-         Returns:
-             An accuracy score for the model on the given test data.
-
-         Raises:
-             TypeError: The ML Runtimes client returned a non-float result
-         """
-         output_score = self.client.score(
-             estimator=self.estimator,
-             dataset=self.dataset,
-             input_cols=input_cols,
-             label_cols=label_cols,
-             sample_weight_col=sample_weight_col,
-         )
-         if not isinstance(output_score, float):
-             raise TypeError(
-                 f"The ML Runtimes Client returned a non-float value {output_score} of type {type(output_score)}"
-             )
-         return output_score
snowflake/ml/modeling/_internal/ml_runtime_implementations/ml_runtime_trainer.py (deleted)
@@ -1,66 +0,0 @@
- from typing import List, Optional
-
- from snowflake.snowpark import DataFrame, Session
-
-
- class MLRuntimeModelTrainer:
-     """ML model training using the ml runties client."""
-
-     def __init__(
-         self,
-         estimator: object,
-         dataset: DataFrame,
-         session: Session,
-         input_cols: List[str],
-         label_cols: Optional[List[str]],
-         sample_weight_col: Optional[str],
-         autogenerated: bool = False,
-         subproject: str = "",
-     ) -> None:
-         """
-         Initializes the MLRuntimeModelTrainer with a model, a Snowpark DataFrame, feature, and label column names.
-
-         Args:
-             estimator: SKLearn compatible estimator or transformer object.
-             dataset: The dataset used for training the model.
-             session: Snowflake session object to be used for training.
-             input_cols: The name(s) of one or more columns in a DataFrame containing a feature to be used for training.
-             label_cols: The name(s) of one or more columns in a DataFrame representing the target variable(s) to learn.
-             sample_weight_col: The column name representing the weight of training examples.
-             autogenerated: A boolean denoting if the trainer is being used by autogenerated code or not.
-             subproject: subproject name to be used in telemetry.
-
-         Raises:
-             ModuleNotFoundError: The mlruntimes_client module is not available.
-         """
-
-         try:
-             from snowflake.ml.runtime import MLRuntimeClient
-         except ModuleNotFoundError as e:
-             # This is an internal exception, not a user-facing one. The snowflake.ml.runtime module should
-             # always be present when this class is instantiated.
-             raise e
-
-         self.client = MLRuntimeClient()
-
-         self.estimator = estimator
-         self.dataset = dataset
-         self.session = session
-         self.input_cols = input_cols
-         self.label_cols = label_cols
-         self.sample_weight_col = sample_weight_col
-         self._autogenerated = autogenerated
-         self._subproject = subproject
-         self._class_name = estimator.__class__.__name__
-
-     def train(self) -> object:
-         """
-         Trains the model by pushing down the compute into SPCS ML Runtime
-         """
-         return self.client.train(
-             estimator=self.estimator,
-             dataset=self.dataset,
-             input_cols=self.input_cols,
-             label_cols=self.label_cols,
-             sample_weight_col=self.sample_weight_col,
-         )