snowflake-ml-python 1.5.3__py3-none-any.whl → 1.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/cortex/__init__.py +4 -1
- snowflake/cortex/_classify_text.py +36 -0
- snowflake/cortex/_complete.py +281 -21
- snowflake/cortex/_extract_answer.py +0 -1
- snowflake/cortex/_sentiment.py +0 -1
- snowflake/cortex/_summarize.py +0 -1
- snowflake/cortex/_translate.py +0 -1
- snowflake/cortex/_util.py +12 -85
- snowflake/ml/_internal/container_services/image_registry/http_client.py +10 -3
- snowflake/ml/_internal/container_services/image_registry/imagelib.py +23 -10
- snowflake/ml/_internal/container_services/image_registry/registry_client.py +7 -1
- snowflake/ml/_internal/exceptions/dataset_errors.py +7 -7
- snowflake/ml/_internal/exceptions/fileset_errors.py +3 -3
- snowflake/ml/_internal/exceptions/sql_error_codes.py +6 -0
- snowflake/ml/_internal/lineage/lineage_utils.py +4 -4
- snowflake/ml/_internal/telemetry.py +38 -2
- snowflake/ml/_internal/utils/identifier.py +14 -0
- snowflake/ml/_internal/utils/snowpark_dataframe_utils.py +15 -4
- snowflake/ml/data/_internal/arrow_ingestor.py +228 -0
- snowflake/ml/data/_internal/ingestor_utils.py +58 -0
- snowflake/ml/data/data_connector.py +133 -0
- snowflake/ml/data/data_ingestor.py +28 -0
- snowflake/ml/data/data_source.py +23 -0
- snowflake/ml/dataset/dataset.py +39 -32
- snowflake/ml/dataset/dataset_reader.py +18 -118
- snowflake/ml/feature_store/access_manager.py +7 -1
- snowflake/ml/feature_store/entity.py +19 -2
- snowflake/ml/feature_store/examples/citibike_trip_features/entities.py +20 -0
- snowflake/ml/feature_store/examples/citibike_trip_features/features/station_feature.py +31 -0
- snowflake/ml/feature_store/examples/citibike_trip_features/features/trip_feature.py +24 -0
- snowflake/ml/feature_store/examples/citibike_trip_features/source.yaml +4 -0
- snowflake/ml/feature_store/examples/example_helper.py +240 -0
- snowflake/ml/feature_store/examples/new_york_taxi_features/entities.py +12 -0
- snowflake/ml/feature_store/examples/new_york_taxi_features/features/dropoff_features.py +39 -0
- snowflake/ml/feature_store/examples/new_york_taxi_features/features/pickup_features.py +58 -0
- snowflake/ml/feature_store/examples/new_york_taxi_features/source.yaml +5 -0
- snowflake/ml/feature_store/examples/source_data/citibike_trips.yaml +36 -0
- snowflake/ml/feature_store/examples/source_data/fraud_transactions.yaml +29 -0
- snowflake/ml/feature_store/examples/source_data/nyc_yellow_trips.yaml +4 -0
- snowflake/ml/feature_store/examples/source_data/winequality_red.yaml +32 -0
- snowflake/ml/feature_store/examples/wine_quality_features/entities.py +14 -0
- snowflake/ml/feature_store/examples/wine_quality_features/features/managed_wine_features.py +29 -0
- snowflake/ml/feature_store/examples/wine_quality_features/features/static_wine_features.py +21 -0
- snowflake/ml/feature_store/examples/wine_quality_features/source.yaml +5 -0
- snowflake/ml/feature_store/feature_store.py +987 -264
- snowflake/ml/feature_store/feature_view.py +228 -13
- snowflake/ml/fileset/embedded_stage_fs.py +25 -21
- snowflake/ml/fileset/fileset.py +2 -2
- snowflake/ml/fileset/snowfs.py +4 -15
- snowflake/ml/fileset/stage_fs.py +24 -18
- snowflake/ml/lineage/__init__.py +3 -0
- snowflake/ml/lineage/lineage_node.py +139 -0
- snowflake/ml/model/_client/model/model_impl.py +47 -14
- snowflake/ml/model/_client/model/model_version_impl.py +82 -2
- snowflake/ml/model/_client/ops/model_ops.py +77 -5
- snowflake/ml/model/_client/sql/model.py +1 -0
- snowflake/ml/model/_client/sql/model_version.py +45 -2
- snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +4 -6
- snowflake/ml/model/_model_composer/model_composer.py +15 -17
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +31 -17
- snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +2 -1
- snowflake/ml/model/_model_composer/model_method/function_generator.py +20 -4
- snowflake/ml/model/_model_composer/model_method/infer_function.py_template +3 -32
- snowflake/ml/model/_model_composer/model_method/infer_partitioned.py_template +55 -0
- snowflake/ml/model/_model_composer/model_method/infer_table_function.py_template +5 -34
- snowflake/ml/model/_model_composer/model_method/model_method.py +10 -7
- snowflake/ml/model/_packager/model_handlers/_base.py +13 -3
- snowflake/ml/model/_packager/model_handlers/_utils.py +59 -1
- snowflake/ml/model/_packager/model_handlers/catboost.py +44 -2
- snowflake/ml/model/_packager/model_handlers/custom.py +12 -4
- snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py +18 -15
- snowflake/ml/model/_packager/model_handlers/lightgbm.py +70 -2
- snowflake/ml/model/_packager/model_handlers/llm.py +2 -2
- snowflake/ml/model/_packager/model_handlers/mlflow.py +2 -2
- snowflake/ml/model/_packager/model_handlers/pytorch.py +2 -2
- snowflake/ml/model/_packager/model_handlers/sentence_transformers.py +2 -2
- snowflake/ml/model/_packager/model_handlers/sklearn.py +2 -2
- snowflake/ml/model/_packager/model_handlers/snowmlmodel.py +2 -2
- snowflake/ml/model/_packager/model_handlers/tensorflow.py +2 -2
- snowflake/ml/model/_packager/model_handlers/torchscript.py +2 -2
- snowflake/ml/model/_packager/model_handlers/xgboost.py +61 -2
- snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
- snowflake/ml/model/_packager/model_meta/model_blob_meta.py +2 -0
- snowflake/ml/model/_packager/model_meta/model_meta.py +21 -1
- snowflake/ml/model/_packager/model_meta/model_meta_schema.py +6 -1
- snowflake/ml/model/_packager/model_packager.py +9 -4
- snowflake/ml/model/_packager/model_runtime/_snowml_inference_alternative_requirements.py +1 -1
- snowflake/ml/model/_packager/model_runtime/model_runtime.py +3 -5
- snowflake/ml/model/custom_model.py +22 -2
- snowflake/ml/model/model_signature.py +4 -4
- snowflake/ml/model/type_hints.py +77 -4
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +3 -1
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_search_udf_file.py +13 -1
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +1 -0
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +6 -0
- snowflake/ml/modeling/_internal/snowpark_implementations/xgboost_external_memory_trainer.py +1 -0
- snowflake/ml/modeling/cluster/affinity_propagation.py +4 -2
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +4 -2
- snowflake/ml/modeling/cluster/birch.py +4 -2
- snowflake/ml/modeling/cluster/bisecting_k_means.py +4 -2
- snowflake/ml/modeling/cluster/dbscan.py +4 -2
- snowflake/ml/modeling/cluster/feature_agglomeration.py +4 -2
- snowflake/ml/modeling/cluster/k_means.py +4 -2
- snowflake/ml/modeling/cluster/mean_shift.py +4 -2
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +4 -2
- snowflake/ml/modeling/cluster/optics.py +4 -2
- snowflake/ml/modeling/cluster/spectral_biclustering.py +4 -2
- snowflake/ml/modeling/cluster/spectral_clustering.py +4 -2
- snowflake/ml/modeling/cluster/spectral_coclustering.py +4 -2
- snowflake/ml/modeling/compose/column_transformer.py +4 -2
- snowflake/ml/modeling/covariance/elliptic_envelope.py +4 -2
- snowflake/ml/modeling/covariance/empirical_covariance.py +4 -2
- snowflake/ml/modeling/covariance/graphical_lasso.py +4 -2
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +4 -2
- snowflake/ml/modeling/covariance/ledoit_wolf.py +4 -2
- snowflake/ml/modeling/covariance/min_cov_det.py +4 -2
- snowflake/ml/modeling/covariance/oas.py +4 -2
- snowflake/ml/modeling/covariance/shrunk_covariance.py +4 -2
- snowflake/ml/modeling/decomposition/dictionary_learning.py +4 -2
- snowflake/ml/modeling/decomposition/factor_analysis.py +4 -2
- snowflake/ml/modeling/decomposition/fast_ica.py +4 -2
- snowflake/ml/modeling/decomposition/incremental_pca.py +4 -2
- snowflake/ml/modeling/decomposition/kernel_pca.py +4 -2
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +4 -2
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +4 -2
- snowflake/ml/modeling/decomposition/pca.py +4 -2
- snowflake/ml/modeling/decomposition/sparse_pca.py +4 -2
- snowflake/ml/modeling/decomposition/truncated_svd.py +4 -2
- snowflake/ml/modeling/ensemble/isolation_forest.py +4 -2
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +4 -2
- snowflake/ml/modeling/feature_selection/variance_threshold.py +4 -2
- snowflake/ml/modeling/impute/iterative_imputer.py +4 -2
- snowflake/ml/modeling/impute/knn_imputer.py +4 -2
- snowflake/ml/modeling/impute/missing_indicator.py +4 -2
- snowflake/ml/modeling/impute/simple_imputer.py +26 -0
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +4 -2
- snowflake/ml/modeling/kernel_approximation/nystroem.py +4 -2
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +4 -2
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +4 -2
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +4 -2
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +4 -2
- snowflake/ml/modeling/manifold/isomap.py +4 -2
- snowflake/ml/modeling/manifold/mds.py +4 -2
- snowflake/ml/modeling/manifold/spectral_embedding.py +4 -2
- snowflake/ml/modeling/manifold/tsne.py +4 -2
- snowflake/ml/modeling/metrics/ranking.py +3 -0
- snowflake/ml/modeling/metrics/regression.py +3 -0
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +4 -2
- snowflake/ml/modeling/mixture/gaussian_mixture.py +4 -2
- snowflake/ml/modeling/neighbors/kernel_density.py +4 -2
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +4 -2
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +4 -2
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +4 -2
- snowflake/ml/modeling/pipeline/pipeline.py +5 -4
- snowflake/ml/modeling/preprocessing/one_hot_encoder.py +43 -9
- snowflake/ml/modeling/preprocessing/ordinal_encoder.py +36 -8
- snowflake/ml/modeling/preprocessing/polynomial_features.py +4 -2
- snowflake/ml/registry/_manager/model_manager.py +16 -3
- snowflake/ml/registry/registry.py +100 -13
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.5.3.dist-info → snowflake_ml_python-1.6.0.dist-info}/METADATA +81 -7
- {snowflake_ml_python-1.5.3.dist-info → snowflake_ml_python-1.6.0.dist-info}/RECORD +165 -139
- {snowflake_ml_python-1.5.3.dist-info → snowflake_ml_python-1.6.0.dist-info}/WHEEL +1 -1
- snowflake/ml/_internal/lineage/data_source.py +0 -10
- {snowflake_ml_python-1.5.3.dist-info → snowflake_ml_python-1.6.0.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.5.3.dist-info → snowflake_ml_python-1.6.0.dist-info}/top_level.txt +0 -0
snowflake/ml/_internal/container_services/image_registry/imagelib.py

@@ -13,6 +13,7 @@ This library only supports a limited set of features:
 It's recommended to use this library to copy previously tested images using sha256 to avoid surprises
 with respect to compatibility.
 """
+
 import dataclasses
 import hashlib
 import io
@@ -152,7 +153,8 @@ class BlobTransfer:
     src_image: ImageDescriptor
     dest_image: ImageDescriptor
     manifest: Manifest
-
+    src_image_registry_http_client: image_registry_http_client.ImageRegistryHttpClient
+    dest_image_registry_http_client: image_registry_http_client.ImageRegistryHttpClient
 
     def upload_all_blobs(self) -> None:
         blob_digests = self.manifest.get_blob_digests()
@@ -169,7 +171,7 @@ class BlobTransfer:
         """
         Check if the blob already exists in the destination registry.
         """
-        resp = self.
+        resp = self.dest_image_registry_http_client.head(self.dest_image.blob_link(blob_digest), headers={})
         return resp.status_code != 200
 
     def _fetch_blob(self, blob_digest: str) -> Tuple[io.BytesIO, int]:
@@ -178,7 +180,7 @@ class BlobTransfer:
         """
         src_blob_link = self.src_image.blob_link(blob_digest)
         headers = {_CONTENT_LENGTH_HEADER: "0"}
-        resp = self.
+        resp = self.src_image_registry_http_client.get(src_blob_link, headers=headers)
 
         assert resp.status_code == 200, f"Blob GET failed with code {resp.status_code}"
         assert _CONTENT_LENGTH_HEADER in resp.headers, f"Blob does not contain {_CONTENT_LENGTH_HEADER}"
@@ -189,7 +191,7 @@ class BlobTransfer:
         """
         Obtain the upload URL from the destination registry.
         """
-        response = self.
+        response = self.dest_image_registry_http_client.post(self.dest_image.blob_upload_link())
         assert (
             response.status_code == 202
         ), f"Failed to get the upload URL to destination. Status {response.status_code}. {str(response.content)}"
@@ -216,14 +218,14 @@ class BlobTransfer:
             headers[_CONTENT_RANGE_HEADER] = f"{start_byte}-{end_byte}"
             headers[_CONTENT_LENGTH_HEADER] = str(chunk_length)
 
-            resp = self.
+            resp = self.dest_image_registry_http_client.patch(next_loc, headers=headers, data=chunk)
             assert resp.status_code == 202, f"Blob PATCH failed with code {resp.status_code}"
 
             next_loc = resp.headers[_LOCATION_HEADER]
             start_byte += chunk_length
 
         # Finalize the upload
-        resp = self.
+        resp = self.dest_image_registry_http_client.put(f"{next_loc}&digest={blob_digest}")
         assert resp.status_code == 201, f"Blob PUT failed with code {resp.status_code}"
 
     def _transfer(self, blob_digest: str) -> None:
@@ -340,21 +342,32 @@ def copy_image(
     src_image: ImageDescriptor,
     dest_image: ImageDescriptor,
     arch: _Arch,
-
+    src_retryable_http: image_registry_http_client.ImageRegistryHttpClient,
+    dest_retryable_http: image_registry_http_client.ImageRegistryHttpClient,
 ) -> None:
     logger.debug(f"Pulling image manifest for {src_image}")
 
     # 1. Get the manifest
-    manifest = get_manifest(src_image, arch,
+    manifest = get_manifest(src_image, arch, src_retryable_http)
     logger.debug(f"Manifest pulled for {src_image} with digest {manifest.manifest_digest}")
 
     # 2: Retrieve all blob digests from manifest; fetch blob based on blob digest, then upload blob.
-    blob_transfer = BlobTransfer(
+    blob_transfer = BlobTransfer(
+        src_image,
+        dest_image,
+        manifest,
+        src_image_registry_http_client=src_retryable_http,
+        dest_image_registry_http_client=dest_retryable_http,
+    )
     blob_transfer.upload_all_blobs()
 
     # 3. Upload the manifest
     logger.debug(f"All blobs copied successfully. Copying manifest for {src_image} to {dest_image}")
-    put_manifest(
+    put_manifest(
+        dest_image,
+        manifest,
+        dest_retryable_http,
+    )
 
     logger.debug(f"Image {src_image} copied to {dest_image}")
 
snowflake/ml/_internal/container_services/image_registry/registry_client.py

@@ -201,6 +201,12 @@ class ImageRegistryClient:
         )
         # TODO[shchen]: Remove the imagelib, instead rely on the copy image system function later.
         imagelib.copy_image(
-            src_image=src_image,
+            src_image=src_image,
+            dest_image=dest_image,
+            arch=arch,
+            src_retryable_http=image_registry_http_client.ImageRegistryHttpClient(
+                repo_url=src_image.registry_name, no_cred=True
+            ),
+            dest_retryable_http=self.image_registry_http_client,
         )
         logger.info("Image copy completed successfully")
snowflake/ml/_internal/exceptions/dataset_errors.py

@@ -1,11 +1,11 @@
 # Error code from Snowflake Python Connector.
-ERRNO_OBJECT_ALREADY_EXISTS =
-ERRNO_OBJECT_NOT_EXIST =
-ERRNO_FILES_ALREADY_EXISTING =
-ERRNO_VERSION_ALREADY_EXISTS =
-ERRNO_DATASET_NOT_EXIST =
-ERRNO_DATASET_VERSION_NOT_EXIST =
-ERRNO_DATASET_VERSION_ALREADY_EXISTS =
+ERRNO_OBJECT_ALREADY_EXISTS = 2002
+ERRNO_OBJECT_NOT_EXIST = 2043
+ERRNO_FILES_ALREADY_EXISTING = 1030
+ERRNO_VERSION_ALREADY_EXISTS = 92917
+ERRNO_DATASET_NOT_EXIST = 399019
+ERRNO_DATASET_VERSION_NOT_EXIST = 399012
+ERRNO_DATASET_VERSION_ALREADY_EXISTS = 399020
 
 
 class DatasetError(Exception):
snowflake/ml/_internal/exceptions/fileset_errors.py

@@ -1,7 +1,7 @@
 # Error code from Snowflake Python Connector.
-ERRNO_FILE_EXIST_IN_STAGE =
-ERRNO_DOMAIN_NOT_EXIST =
-ERRNO_STAGE_NOT_EXIST =
+ERRNO_FILE_EXIST_IN_STAGE = 1030
+ERRNO_DOMAIN_NOT_EXIST = 2003
+ERRNO_STAGE_NOT_EXIST = 391707
 
 
 class FileSetError(Exception):
snowflake/ml/_internal/lineage/lineage_utils.py

@@ -1,9 +1,9 @@
 import copy
 import functools
-from typing import Any, Callable, List, Optional
+from typing import Any, Callable, List, Optional, get_args
 
 from snowflake import snowpark
-from snowflake.ml.
+from snowflake.ml.data import data_source
 
 _DATA_SOURCES_ATTR = "_data_sources"
 
@@ -39,7 +39,7 @@ def get_data_sources(*args: Any) -> Optional[List[data_source.DataSource]]:
     result: Optional[List[data_source.DataSource]] = None
     for arg in args:
         srcs = getattr(arg, _DATA_SOURCES_ATTR, None)
-        if isinstance(srcs, list) and all(isinstance(s, data_source.DataSource) for s in srcs):
+        if isinstance(srcs, list) and all(isinstance(s, get_args(data_source.DataSource)) for s in srcs):
             if result is None:
                 result = []
             result += srcs
@@ -49,7 +49,7 @@ def get_data_sources(*args: Any) -> Optional[List[data_source.DataSource]]:
 def set_data_sources(obj: Any, data_sources: Optional[List[data_source.DataSource]]) -> None:
     """Helper method for attaching data sources to an object"""
     if data_sources:
-        assert all(isinstance(ds, data_source.DataSource) for ds in data_sources)
+        assert all(isinstance(ds, get_args(data_source.DataSource)) for ds in data_sources)
         setattr(obj, _DATA_SOURCES_ATTR, data_sources)
 
 
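These two hunks switch the isinstance checks to get_args(data_source.DataSource), which implies DataSource is now a typing alias (a Union of concrete source types, plus plain path strings judging from the arrow_ingestor hunk further below) rather than a single base class; isinstance cannot take a Union alias directly, so its member types are unpacked first. A minimal sketch of the pattern; the alias shape and member classes below are assumptions, not taken from the diff:

from dataclasses import dataclass
from typing import Union, get_args

@dataclass
class DataFrameInfo:
    sql: str

@dataclass
class DatasetInfo:
    url: str

# Assumed shape of data_source.DataSource in 1.6.0: a Union alias, not a base class.
DataSource = Union[DataFrameInfo, DatasetInfo, str]

srcs = [DatasetInfo(url="snow://dataset/MY_DS/versions/v1"), "stage/path/file.parquet"]
# isinstance(s, DataSource) would raise TypeError; unpacking the Union works:
assert all(isinstance(s, get_args(DataSource)) for s in srcs)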
snowflake/ml/_internal/telemetry.py

@@ -10,6 +10,7 @@ from typing import (
     Dict,
     Iterable,
     List,
+    Mapping,
     Optional,
     Tuple,
     TypeVar,
@@ -92,6 +93,31 @@ def get_statement_params(
     )
 
 
+def add_statement_params_custom_tags(
+    statement_params: Optional[Dict[str, Any]], custom_tags: Mapping[str, Any]
+) -> Dict[str, Any]:
+    """
+    Add custom_tags to existing statement_params. Overwrite keys in custom_tags dict that already exist.
+    If existing statement_params are not provided, do nothing as the information cannot be effectively tracked.
+
+    Args:
+        statement_params: Existing statement_params dictionary.
+        custom_tags: Dictionary of existing k/v pairs to add as custom_tags
+
+    Returns:
+        new statement_params dictionary with all keys and an updated custom_tags field.
+    """
+    if not statement_params:
+        return {}
+    existing_custom_tags: Dict[str, Any] = statement_params.pop(TelemetryField.KEY_CUSTOM_TAGS.value, {})
+    existing_custom_tags.update(custom_tags)
+    # NOTE: This can be done with | operator after upgrade from py3.8
+    return {
+        **statement_params,
+        TelemetryField.KEY_CUSTOM_TAGS.value: existing_custom_tags,
+    }
+
+
 # TODO: we can merge this with get_statement_params after code clean up
 def get_statement_params_full_func_name(frame: Optional[types.FrameType], class_name: Optional[str] = None) -> str:
     """
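The new add_statement_params_custom_tags helper merges extra tags into the custom_tags entry of an existing statement_params dict and deliberately returns an empty dict when no statement_params are given. A small behavioral sketch; the literal "custom_tags" key below stands in for TelemetryField.KEY_CUSTOM_TAGS.value so the example stays self-contained, and that literal is an assumption:

from typing import Any, Dict, Mapping, Optional

CUSTOM_TAGS_KEY = "custom_tags"  # assumed value of TelemetryField.KEY_CUSTOM_TAGS.value

def add_statement_params_custom_tags(
    statement_params: Optional[Dict[str, Any]], custom_tags: Mapping[str, Any]
) -> Dict[str, Any]:
    # Mirrors the hunk above: a no-op (empty dict) when there is nothing to extend.
    if not statement_params:
        return {}
    existing = statement_params.pop(CUSTOM_TAGS_KEY, {})
    existing.update(custom_tags)
    return {**statement_params, CUSTOM_TAGS_KEY: existing}

params = {"project": "MLOps", CUSTOM_TAGS_KEY: {"team": "feature_store"}}
merged = add_statement_params_custom_tags(params, {"entity": "trip"})
assert merged[CUSTOM_TAGS_KEY] == {"team": "feature_store", "entity": "trip"}
assert add_statement_params_custom_tags(None, {"entity": "trip"}) == {}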
@@ -251,6 +277,7 @@ def send_api_usage_telemetry(
         ]
     ] = None,
     sfqids_extractor: Optional[Callable[..., List[str]]] = None,
+    subproject_extractor: Optional[Callable[[Any], str]] = None,
     custom_tags: Optional[Dict[str, Union[bool, int, str, float]]] = None,
 ) -> Callable[[Callable[_Args, _ReturnValue]], Callable[_Args, _ReturnValue]]:
     """
@@ -264,6 +291,7 @@ def send_api_usage_telemetry(
         conn_attr_name: Name of the SnowflakeConnection attribute in `self`.
         api_calls_extractor: Extract API calls from `self`.
         sfqids_extractor: Extract sfqids from `self`.
+        subproject_extractor: Extract subproject at runtime from `self`.
         custom_tags: Custom tags.
 
     Returns:
@@ -271,10 +299,14 @@ def send_api_usage_telemetry(
 
     Raises:
         TypeError: If `conn_attr_name` is provided but the conn attribute is not of type SnowflakeConnection.
+        ValueError: If both `subproject` and `subproject_extractor` are provided
 
     # noqa: DAR402
     """
 
+    if subproject is not None and subproject_extractor is not None:
+        raise ValueError("Specifying both subproject and subproject_extractor is not allowed")
+
     def decorator(func: Callable[_Args, _ReturnValue]) -> Callable[_Args, _ReturnValue]:
         @functools.wraps(func)
         def wrap(*args: Any, **kwargs: Any) -> _ReturnValue:
@@ -296,9 +328,13 @@ def send_api_usage_telemetry(
             if sfqids_extractor:
                 sfqids = sfqids_extractor(args[0])
 
+            subproject_name = subproject
+            if subproject_extractor is not None:
+                subproject_name = subproject_extractor(args[0])
+
             statement_params = get_function_usage_statement_params(
                 project=project,
-                subproject=
+                subproject=subproject_name,
                 function_category=TelemetryField.FUNC_CAT_USAGE.value,
                 function_name=_get_full_func_name(func),
                 function_parameters=params,
@@ -355,7 +391,7 @@ def send_api_usage_telemetry(
                 raise e.original_exception from e
 
             # TODO(hayu): [SNOW-750287] Optimize telemetry client to a singleton.
-            telemetry = _SourceTelemetryClient(conn=conn, project=project, subproject=
+            telemetry = _SourceTelemetryClient(conn=conn, project=project, subproject=subproject_name)
             telemetry_args = dict(
                 func_name=_get_full_func_name(func),
                 function_category=TelemetryField.FUNC_CAT_USAGE.value,
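Taken together, these hunks let the telemetry decorator resolve the subproject from the decorated object at call time via subproject_extractor, and passing both subproject and subproject_extractor is now rejected with ValueError. A hedged usage sketch; only the decorator keyword arguments come from the diff, the client class and project name are illustrative:

from snowflake.ml._internal import telemetry

_PROJECT = "ModelRegistry"  # illustrative project name

class _Client:
    def __init__(self, subproject: str) -> None:
        self._subproject = subproject

    @telemetry.send_api_usage_telemetry(
        project=_PROJECT,
        # Resolved per call from `self`, instead of a fixed string:
        subproject_extractor=lambda self: self._subproject,
    )
    def log_model(self, name: str) -> None:
        ...

# Supplying both a static subproject and an extractor raises at decoration time:
#   telemetry.send_api_usage_telemetry(project=_PROJECT, subproject="A",
#                                      subproject_extractor=lambda self: "B")
#   -> ValueError: Specifying both subproject and subproject_extractor is not allowed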
snowflake/ml/_internal/utils/identifier.py

@@ -165,6 +165,20 @@ def parse_schema_level_object_identifier(
     )
 
 
+def is_fully_qualified_name(name: str) -> bool:
+    """
+    Checks if a given name is a fully qualified name, which is in the format '<db>.<schema>.<object_name>'.
+
+    Args:
+        name: The name to be checked.
+
+    Returns:
+        bool: True if the name is fully qualified, False otherwise.
+    """
+    res = parse_schema_level_object_identifier(name)
+    return res[0] is not None and res[1] is not None and res[2] is not None and not res[3]
+
+
 def get_schema_level_object_identifier(
     db: Optional[str],
     schema: Optional[str],
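is_fully_qualified_name simply reuses parse_schema_level_object_identifier and reports whether the database, schema, and object parts are all present with nothing trailing. Expected behavior, assuming the package is installed; the identifiers are arbitrary examples:

from snowflake.ml._internal.utils import identifier

identifier.is_fully_qualified_name("MY_DB.MY_SCHEMA.MY_MODEL")  # expected: True
identifier.is_fully_qualified_name("MY_SCHEMA.MY_MODEL")        # expected: False (no database part)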
snowflake/ml/_internal/utils/snowpark_dataframe_utils.py

@@ -1,22 +1,27 @@
 import logging
 import warnings
+from typing import List, Optional
 
 from snowflake import snowpark
+from snowflake.ml._internal.utils import sql_identifier
 from snowflake.snowpark import functions, types
 
 
-def cast_snowpark_dataframe(df: snowpark.DataFrame) -> snowpark.DataFrame:
+def cast_snowpark_dataframe(df: snowpark.DataFrame, ignore_columns: Optional[List[str]] = None) -> snowpark.DataFrame:
     """Cast columns in the dataframe to types that are compatible with tensor.
 
     It assists FileSet.make() in performing implicit data casting.
 
     Args:
         df: A snowpark dataframe.
+        ignore_columns: Columns to exclude from casting. These columns will be propagated unchanged.
 
     Returns:
         A snowpark dataframe whose data type has been casted.
     """
 
+    ignore_cols_set = {sql_identifier.SqlIdentifier(c).identifier() for c in ignore_columns} if ignore_columns else {}
+
     fields = df.schema.fields
     selected_cols = []
     for field in fields:
@@ -40,7 +45,9 @@ def cast_snowpark_dataframe(df: snowpark.DataFrame) -> snowpark.DataFrame:
             dest = field.datatype
             selected_cols.append(functions.cast(functions.col(src), dest).alias(src))
         else:
-            if field.
+            if field.column_identifier.name in ignore_cols_set:
+                pass
+            elif field.datatype in (types.DateType(), types.TimestampType(), types.TimeType()):
                 logging.warning(
                     "A Column with DATE or TIMESTAMP data type detected. "
                     "It might not be able to get converted to tensors. "
@@ -90,7 +97,9 @@ def cast_snowpark_dataframe_column_types(df: snowpark.DataFrame) -> snowpark.Dat
                 " is being automatically converted to DoubleType in the Snowpark DataFrame. "
                 "This automatic conversion may lead to potential precision loss and rounding errors. "
                 "If you wish to prevent this conversion, you should manually perform "
-                "the necessary data type conversion."
+                "the necessary data type conversion.",
+                UserWarning,
+                stacklevel=2,
             )
         else:
             # IntegerType default as NUMBER(38, 0), but
@@ -102,7 +111,9 @@ def cast_snowpark_dataframe_column_types(df: snowpark.DataFrame) -> snowpark.Dat
                 " is being automatically converted to LongType in the Snowpark DataFrame. "
                 "This automatic conversion may lead to potential precision loss and rounding errors. "
                 "If you wish to prevent this conversion, you should manually perform "
-                "the necessary data type conversion."
+                "the necessary data type conversion.",
+                UserWarning,
+                stacklevel=2,
             )
         selected_cols.append(functions.cast(functions.col(src), dest_dtype).alias(src))
         # TODO: add more type handling or error message
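With the new ignore_columns argument, callers can keep selected columns out of the tensor-compatibility casting, and the conversion warnings now carry an explicit UserWarning category and stacklevel. A hedged usage sketch, assuming an open Snowpark session; the connection parameters, table, and column names are illustrative:

from snowflake.snowpark import Session
from snowflake.ml._internal.utils import snowpark_dataframe_utils

# `connection_parameters` is assumed to be your Snowflake connection dict.
session = Session.builder.configs(connection_parameters).create()

df = session.table("TRIPS")  # illustrative table with a TIMESTAMP column STARTED_AT
casted = snowpark_dataframe_utils.cast_snowpark_dataframe(df, ignore_columns=["STARTED_AT"])
# STARTED_AT is propagated unchanged; the remaining columns are cast as before.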
snowflake/ml/data/_internal/arrow_ingestor.py (new file)

@@ -0,0 +1,228 @@
+import collections
+import logging
+import os
+import time
+from typing import Any, Deque, Dict, Iterator, List, Optional
+
+import numpy as np
+import numpy.typing as npt
+import pandas as pd
+import pyarrow as pa
+import pyarrow.dataset as ds
+
+from snowflake import snowpark
+from snowflake.ml.data import data_ingestor, data_source
+from snowflake.ml.data._internal import ingestor_utils
+
+_EMPTY_RECORD_BATCH = pa.RecordBatch.from_arrays([], [])
+
+# The row count for batches read from PyArrow Dataset. This number should be large enough so that
+# dataset.to_batches() would read in a very large portion of, if not entirely, a parquet file.
+_DEFAULT_DATASET_BATCH_SIZE = 1000000
+
+
+class _RecordBatchesBuffer:
+    """A queue that stores record batches and tracks the total num of rows in it."""
+
+    def __init__(self) -> None:
+        self.buffer: Deque[pa.RecordBatch] = collections.deque()
+        self.num_rows = 0
+
+    def append(self, rb: pa.RecordBatch) -> None:
+        self.buffer.append(rb)
+        self.num_rows += rb.num_rows
+
+    def appendleft(self, rb: pa.RecordBatch) -> None:
+        self.buffer.appendleft(rb)
+        self.num_rows += rb.num_rows
+
+    def popleft(self) -> pa.RecordBatch:
+        popped = self.buffer.popleft()
+        self.num_rows -= popped.num_rows
+        return popped
+
+
+class ArrowIngestor(data_ingestor.DataIngestor):
+    """Read and parse the data sources into an Arrow Dataset and yield batched numpy array in dict."""
+
+    def __init__(
+        self,
+        session: snowpark.Session,
+        data_sources: List[data_source.DataSource],
+        format: Optional[str] = None,
+        **kwargs: Any,
+    ) -> None:
+        """
+        Args:
+            session: The Snowpark Session to use.
+            data_sources: List of data sources to ingest.
+            format: Currently “parquet”, “ipc”/”arrow”/”feather”, “csv”, “json”, and “orc” are supported.
+                Will be inferred if not specified.
+            kwargs: Miscellaneous arguments passed to underlying PyArrow Dataset initializer.
+        """
+        self._session = session
+        self._data_sources = data_sources
+        self._format = format
+        self._kwargs = kwargs
+
+        self._schema: Optional[pa.Schema] = None
+
+    @property
+    def data_sources(self) -> List[data_source.DataSource]:
+        return self._data_sources
+
+    def to_batches(
+        self,
+        batch_size: int,
+        shuffle: bool = True,
+        drop_last_batch: bool = True,
+    ) -> Iterator[Dict[str, npt.NDArray[Any]]]:
+        """Iterate through PyArrow Dataset to generate batches whose length equals to expected batch size.
+
+        As we are generating batches with the exactly same length, the last few rows in each file might get left as they
+        are not long enough to form a batch. These rows will be put into a temporary buffer and combine with the first
+        few rows of the next file to generate a new batch.
+
+        Args:
+            batch_size: Specifies the size of each batch that will be yield
+            shuffle: Whether the data in the file will be shuffled. If set to be true, it will first randomly shuffle
+                the order of files, and then shuflle the order of rows in each file.
+            drop_last_batch: Whether the last batch of data should be dropped. If set to be true, then the last
+                batch will get dropped if its size is smaller than the given batch_size.
+
+        Yields:
+            A dict mapping column names to the corresponding data fetch from that column.
+        """
+        self._rb_buffer = _RecordBatchesBuffer()
+
+        # Extract schema if not already known
+        dataset = self._get_dataset(shuffle)
+        if self._schema is None:
+            self._schema = dataset.schema
+
+        for rb in _retryable_batches(dataset, batch_size=max(_DEFAULT_DATASET_BATCH_SIZE, batch_size)):
+            if shuffle:
+                rb = rb.take(np.random.permutation(rb.num_rows))
+            self._rb_buffer.append(rb)
+            while self._rb_buffer.num_rows >= batch_size:
+                yield self._get_batches_from_buffer(batch_size)
+
+        if self._rb_buffer.num_rows and not drop_last_batch:
+            yield self._get_batches_from_buffer(batch_size)
+
+    def to_pandas(self, limit: Optional[int] = None) -> pd.DataFrame:
+        ds = self._get_dataset(shuffle=False)
+        table = ds.to_table() if limit is None else ds.head(num_rows=limit)
+        return table.to_pandas()
+
+    def _get_dataset(self, shuffle: bool) -> ds.Dataset:
+        format = self._format
+        sources = []
+        source_format = None
+        for source in self._data_sources:
+            if isinstance(source, str):
+                sources.append(source)
+                source_format = format or os.path.splitext(source)[-1]
+            elif isinstance(source, data_source.DatasetInfo):
+                if not self._kwargs.get("filesystem"):
+                    self._kwargs["filesystem"] = ingestor_utils.get_dataset_filesystem(self._session, source)
+                sources.extend(
+                    ingestor_utils.get_dataset_files(self._session, source, filesystem=self._kwargs["filesystem"])
+                )
+                source_format = "parquet"
+            elif isinstance(source, data_source.DataFrameInfo):
+                # FIXME: This currently loads all result batches into memory so that it
+                # can be passed into pyarrow.dataset as a list/tuple of pa.RecordBatches
+                # We may be able to optimize this by splitting the result batches into
+                # in-memory (first batch) and file URLs (subsequent batches) and creating a
+                # union dataset.
+                result_batches = ingestor_utils.get_dataframe_result_batches(self._session, source)
+                sources.extend(b.to_arrow() for b in result_batches)
+                source_format = "arrow"
+            else:
+                raise RuntimeError(f"Unsupported data source type: {type(source)}")
+
+            # Make sure source types not mixed
+            if format and format != source_format:
+                raise RuntimeError(f"Unexpected data source format (expected {format}, found {source_format})")
+            format = source_format
+
+        # Re-shuffle input files on each iteration start
+        if shuffle:
+            np.random.shuffle(sources)
+        pa_dataset: ds.Dataset = ds.dataset(sources, format=format, **self._kwargs)
+        return pa_dataset
+
+    def _get_batches_from_buffer(self, batch_size: int) -> Dict[str, npt.NDArray[Any]]:
+        """Generate new batches from the existing record batch buffer."""
+        cnt_rbs_num_rows = 0
+        candidates = []
+
+        # Keep popping record batches in buffer until there are enough rows for a batch.
+        while self._rb_buffer.num_rows and cnt_rbs_num_rows < batch_size:
+            candidate = self._rb_buffer.popleft()
+            cnt_rbs_num_rows += candidate.num_rows
+            candidates.append(candidate)
+
+        # When there are more rows than needed, slice the last popped batch to fit batch_size.
+        if cnt_rbs_num_rows > batch_size:
+            row_diff = cnt_rbs_num_rows - batch_size
+            slice_target = candidates[-1]
+            cut_off = slice_target.num_rows - row_diff
+            to_merge = slice_target.slice(length=cut_off)
+            left_over = slice_target.slice(offset=cut_off)
+            candidates[-1] = to_merge
+            self._rb_buffer.appendleft(left_over)
+
+        res = _merge_record_batches(candidates)
+        return _record_batch_to_arrays(res)
+
+
+def _merge_record_batches(record_batches: List[pa.RecordBatch]) -> pa.RecordBatch:
+    """Merge a list of arrow RecordBatches into one. Similar to MergeTables."""
+    if not record_batches:
+        return _EMPTY_RECORD_BATCH
+    if len(record_batches) == 1:
+        return record_batches[0]
+    record_batches = list(filter(lambda rb: rb.num_rows > 0, record_batches))
+    one_chunk_table = pa.Table.from_batches(record_batches).combine_chunks()
+    batches = one_chunk_table.to_batches(max_chunksize=None)
+    return batches[0]
+
+
+def _record_batch_to_arrays(rb: pa.RecordBatch) -> Dict[str, npt.NDArray[Any]]:
+    """Transform the record batch to a (string, numpy array) dict."""
+    batch_dict = {}
+    for column, column_schema in zip(rb, rb.schema):
+        # zero_copy_only=False because of nans. Ideally nans should have been imputed in feature engineering.
+        array = column.to_numpy(zero_copy_only=False)
+        batch_dict[column_schema.name] = array
+    return batch_dict
+
+
+def _retryable_batches(
+    dataset: ds.Dataset, batch_size: int, max_retries: int = 3, delay: int = 0
+) -> Iterator[pa.RecordBatch]:
+    """Make the Dataset to_batches retryable."""
+    retries = 0
+    current_batch_index = 0
+
+    while True:
+        try:
+            for batch_index, batch in enumerate(dataset.to_batches(batch_size=batch_size)):
+                if batch_index < current_batch_index:
+                    # Skip batches that have already been processed
+                    continue
+
+                yield batch
+                current_batch_index = batch_index + 1
+            # Exit the loop once all batches are processed
+            break
+
+        except Exception as e:
+            if retries < max_retries:
+                retries += 1
+                logging.info(f"Error encountered: {e}. Retrying {retries}/{max_retries}...")
+                time.sleep(delay)
+            else:
+                raise e
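The new ArrowIngestor turns Snowpark query results or Dataset files into a PyArrow Dataset and yields fixed-size numpy batches, carrying leftover rows across files in a small record-batch buffer and retrying dataset.to_batches() on transient errors. A hedged sketch of driving it directly; in practice it is presumably consumed through the new snowflake/ml/data/data_connector.py, and the DataFrameInfo constructor arguments shown here are assumptions:

from snowflake.snowpark import Session
from snowflake.ml.data import data_source
from snowflake.ml.data._internal import arrow_ingestor

# `connection_parameters` is assumed to be your Snowflake connection dict.
session = Session.builder.configs(connection_parameters).create()

# Assumption: DataFrameInfo wraps the SQL text of the query to ingest.
src = data_source.DataFrameInfo(sql="SELECT * FROM MY_TRAINING_TABLE")
ingestor = arrow_ingestor.ArrowIngestor(session, [src])

for batch in ingestor.to_batches(batch_size=1024, shuffle=True, drop_last_batch=True):
    # Each batch maps column names to numpy arrays of exactly 1024 rows.
    print({name: arr.shape for name, arr in batch.items()})
    break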
snowflake/ml/data/_internal/ingestor_utils.py (new file)

@@ -0,0 +1,58 @@
+from typing import List, Optional
+
+import fsspec
+
+from snowflake import snowpark
+from snowflake.connector import result_batch
+from snowflake.ml.data import data_source
+from snowflake.ml.fileset import snowfs
+
+_TARGET_FILE_SIZE = 32 * 2**20  # The max file size for data loading.
+
+
+def get_dataframe_result_batches(
+    session: snowpark.Session, df_info: data_source.DataFrameInfo
+) -> List[result_batch.ResultBatch]:
+    cursor = session._conn._cursor
+
+    if df_info.query_id:
+        query_id = df_info.query_id
+    else:
+        query_id = session.sql(df_info.sql).collect_nowait().query_id
+
+    # TODO: Check if query result cache is still live
+    cursor.get_results_from_sfqid(sfqid=query_id)
+
+    # Prefetch hook should be set by `get_results_from_sfqid`
+    # This call blocks until the query results are ready
+    if cursor._prefetch_hook is None:
+        raise RuntimeError("Loading data from result query failed unexpectedly. Please contact Snowflake support.")
+    cursor._prefetch_hook()
+    batches = cursor.get_result_batches()
+    if batches is None:
+        raise ValueError(
+            "Failed to retrieve training data. Query status:" f" {session._conn._conn.get_query_status(query_id)}"
+        )
+    return batches
+
+
+def get_dataset_filesystem(
+    session: snowpark.Session, ds_info: Optional[data_source.DatasetInfo] = None
+) -> fsspec.AbstractFileSystem:
+    # We can't directly load the Dataset to avoid a circular dependency
+    # Dataset -> DatasetReader -> DataConnector -> DataIngestor -> (?) ingestor_utils -> Dataset
+    # TODO: Automatically pick appropriate fsspec implementation based on protocol in URL
+    return snowfs.SnowFileSystem(
+        snowpark_session=session,
+        cache_type="bytes",
+        block_size=2 * _TARGET_FILE_SIZE,
+    )
+
+
+def get_dataset_files(
+    session: snowpark.Session, ds_info: data_source.DatasetInfo, filesystem: Optional[fsspec.AbstractFileSystem] = None
+) -> List[str]:
+    if filesystem is None:
+        filesystem = get_dataset_filesystem(session, ds_info)
+    assert bool(ds_info.url)  # Not null or empty
+    return sorted(filesystem.ls(ds_info.url))
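ingestor_utils resolves a DataFrameInfo into connector ResultBatches (blocking on the cursor's prefetch hook until results are ready) and a DatasetInfo into the sorted list of stage files behind it, using the internal SnowFileSystem with 64 MiB byte-range caching. A hedged sketch of the Dataset path; the DatasetInfo instance and the snow:// URL format in the comment are assumptions:

from snowflake.ml.data._internal import ingestor_utils

# `session` is an open snowflake.snowpark.Session; `ds_info` is a
# data_source.DatasetInfo describing a Dataset version (its url field is assumed
# to look like "snow://dataset/MY_DB.MY_SCHEMA.MY_DATASET/versions/v1").
fs = ingestor_utils.get_dataset_filesystem(session, ds_info)
files = ingestor_utils.get_dataset_files(session, ds_info, filesystem=fs)
# `files` lists the parquet files that ArrowIngestor feeds into pyarrow.dataset().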