snowflake-ml-python 1.5.4__py3-none-any.whl → 1.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/cortex/__init__.py +2 -0
- snowflake/cortex/_classify_text.py +36 -0
- snowflake/cortex/_complete.py +67 -10
- snowflake/cortex/_util.py +4 -4
- snowflake/ml/_internal/lineage/lineage_utils.py +4 -4
- snowflake/ml/_internal/telemetry.py +12 -2
- snowflake/ml/data/_internal/arrow_ingestor.py +228 -0
- snowflake/ml/data/_internal/ingestor_utils.py +58 -0
- snowflake/ml/data/data_connector.py +133 -0
- snowflake/ml/data/data_ingestor.py +28 -0
- snowflake/ml/data/data_source.py +23 -0
- snowflake/ml/dataset/dataset.py +1 -13
- snowflake/ml/dataset/dataset_reader.py +18 -118
- snowflake/ml/feature_store/access_manager.py +7 -1
- snowflake/ml/feature_store/entity.py +19 -2
- snowflake/ml/feature_store/examples/citibike_trip_features/entities.py +20 -0
- snowflake/ml/feature_store/examples/citibike_trip_features/features/station_feature.py +31 -0
- snowflake/ml/feature_store/examples/citibike_trip_features/features/trip_feature.py +24 -0
- snowflake/ml/feature_store/examples/citibike_trip_features/source.yaml +4 -0
- snowflake/ml/feature_store/examples/example_helper.py +240 -0
- snowflake/ml/feature_store/examples/new_york_taxi_features/entities.py +12 -0
- snowflake/ml/feature_store/examples/new_york_taxi_features/features/dropoff_features.py +39 -0
- snowflake/ml/feature_store/examples/new_york_taxi_features/features/pickup_features.py +58 -0
- snowflake/ml/feature_store/examples/new_york_taxi_features/source.yaml +5 -0
- snowflake/ml/feature_store/examples/source_data/citibike_trips.yaml +36 -0
- snowflake/ml/feature_store/examples/source_data/fraud_transactions.yaml +29 -0
- snowflake/ml/feature_store/examples/source_data/nyc_yellow_trips.yaml +4 -0
- snowflake/ml/feature_store/examples/source_data/winequality_red.yaml +32 -0
- snowflake/ml/feature_store/examples/wine_quality_features/entities.py +14 -0
- snowflake/ml/feature_store/examples/wine_quality_features/features/managed_wine_features.py +29 -0
- snowflake/ml/feature_store/examples/wine_quality_features/features/static_wine_features.py +21 -0
- snowflake/ml/feature_store/examples/wine_quality_features/source.yaml +5 -0
- snowflake/ml/feature_store/feature_store.py +579 -53
- snowflake/ml/feature_store/feature_view.py +168 -5
- snowflake/ml/fileset/stage_fs.py +18 -10
- snowflake/ml/lineage/lineage_node.py +1 -1
- snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +2 -3
- snowflake/ml/model/_model_composer/model_composer.py +11 -14
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +24 -16
- snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +2 -1
- snowflake/ml/model/_model_composer/model_method/function_generator.py +3 -3
- snowflake/ml/model/_model_composer/model_method/infer_function.py_template +3 -32
- snowflake/ml/model/_model_composer/model_method/infer_partitioned.py_template +3 -27
- snowflake/ml/model/_model_composer/model_method/infer_table_function.py_template +3 -32
- snowflake/ml/model/_model_composer/model_method/model_method.py +5 -2
- snowflake/ml/model/_packager/model_handlers/_base.py +11 -1
- snowflake/ml/model/_packager/model_handlers/_utils.py +58 -1
- snowflake/ml/model/_packager/model_handlers/catboost.py +42 -0
- snowflake/ml/model/_packager/model_handlers/lightgbm.py +68 -0
- snowflake/ml/model/_packager/model_handlers/xgboost.py +59 -0
- snowflake/ml/model/_packager/model_runtime/model_runtime.py +3 -5
- snowflake/ml/model/model_signature.py +4 -4
- snowflake/ml/model/type_hints.py +4 -0
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +1 -1
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_search_udf_file.py +13 -1
- snowflake/ml/modeling/impute/simple_imputer.py +26 -0
- snowflake/ml/modeling/pipeline/pipeline.py +4 -4
- snowflake/ml/registry/registry.py +100 -13
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.5.4.dist-info → snowflake_ml_python-1.6.0.dist-info}/METADATA +48 -2
- {snowflake_ml_python-1.5.4.dist-info → snowflake_ml_python-1.6.0.dist-info}/RECORD +64 -42
- {snowflake_ml_python-1.5.4.dist-info → snowflake_ml_python-1.6.0.dist-info}/WHEEL +1 -1
- snowflake/ml/_internal/lineage/data_source.py +0 -10
- {snowflake_ml_python-1.5.4.dist-info → snowflake_ml_python-1.6.0.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.5.4.dist-info → snowflake_ml_python-1.6.0.dist-info}/top_level.txt +0 -0
snowflake/cortex/__init__.py
CHANGED
@@ -1,3 +1,4 @@
+from snowflake.cortex._classify_text import ClassifyText
 from snowflake.cortex._complete import Complete, CompleteOptions
 from snowflake.cortex._extract_answer import ExtractAnswer
 from snowflake.cortex._sentiment import Sentiment
@@ -5,6 +6,7 @@ from snowflake.cortex._summarize import Summarize
 from snowflake.cortex._translate import Translate
 
 __all__ = [
+    "ClassifyText",
     "Complete",
     "CompleteOptions",
     "ExtractAnswer",
snowflake/cortex/_classify_text.py
ADDED
@@ -0,0 +1,36 @@
+from typing import List, Optional, Union
+
+from snowflake import snowpark
+from snowflake.cortex._util import CORTEX_FUNCTIONS_TELEMETRY_PROJECT, call_sql_function
+from snowflake.ml._internal import telemetry
+
+
+@telemetry.send_api_usage_telemetry(
+    project=CORTEX_FUNCTIONS_TELEMETRY_PROJECT,
+)
+def ClassifyText(
+    str_input: Union[str, snowpark.Column],
+    categories: Union[List[str], snowpark.Column],
+    session: Optional[snowpark.Session] = None,
+) -> Union[str, snowpark.Column]:
+    """Use the LLM inference service to classify the INPUT text into one of the target CATEGORIES.
+
+    Args:
+        str_input: A Column of strings to classify.
+        categories: A list of candidate categories to classify the INPUT text into.
+        session: The snowpark session to use. Will be inferred by context if not specified.
+
+    Returns:
+        A column of classification responses.
+    """
+
+    return _classify_text_impl("snowflake.cortex.classify_text", str_input, categories, session=session)
+
+
+def _classify_text_impl(
+    function: str,
+    str_input: Union[str, snowpark.Column],
+    categories: Union[List[str], snowpark.Column],
+    session: Optional[snowpark.Session] = None,
+) -> Union[str, snowpark.Column]:
+    return call_sql_function(function, session, str_input, categories)
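The new ClassifyText export follows the same pattern as the other Cortex wrappers: called with a plain string it runs eagerly, called with a Column it builds a lazy expression. A minimal usage sketch, assuming an active Snowpark session in context; reviews_df and the column name REVIEW_TEXT are illustrative, not part of the package:

from snowflake.cortex import ClassifyText

# Eager: classify one string (session inferred from context).
response = ClassifyText("The pitch was low and outside", ["baseball", "football"])

# Lazy: build a Column expression over a hypothetical DataFrame column.
labeled_df = reviews_df.with_column(
    "CATEGORY", ClassifyText(reviews_df["REVIEW_TEXT"], ["positive", "negative"])
)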
snowflake/cortex/_complete.py
CHANGED
@@ -1,6 +1,7 @@
 import json
 import logging
-
+import time
+from typing import Any, Callable, Iterator, List, Optional, TypedDict, Union, cast
 from urllib.parse import urlunparse
 
 import requests
@@ -52,6 +53,38 @@ class ResponseParseException(Exception):
     pass
 
 
+_MAX_RETRY_SECONDS = 30
+
+
+def retry(func: Callable[..., requests.Response]) -> Callable[..., requests.Response]:
+    def inner(*args: Any, **kwargs: Any) -> requests.Response:
+        deadline = cast(Optional[float], kwargs["deadline"])
+        kwargs = {key: value for key, value in kwargs.items() if key != "deadline"}
+        expRetrySeconds = 0.5
+        while True:
+            if deadline is not None and time.time() > deadline:
+                raise TimeoutError()
+            response = func(*args, **kwargs)
+            if response.status_code >= 200 and response.status_code < 300:
+                return response
+            retry_status_codes = [429, 503, 504]
+            if response.status_code not in retry_status_codes:
+                response.raise_for_status()
+            logger.debug(f"request failed with status code {response.status_code}, retrying")
+
+            # Formula: delay(i) = max(RetryAfterHeader, min(2^i, _MAX_RETRY_SECONDS)).
+            expRetrySeconds = min(2 * expRetrySeconds, _MAX_RETRY_SECONDS)
+            retrySeconds = expRetrySeconds
+            retryAfterHeader = response.headers.get("retry-after")
+            if retryAfterHeader is not None:
+                retrySeconds = max(retrySeconds, int(retryAfterHeader))
+            logger.debug(f"sleeping for {retrySeconds}s before retrying")
+            time.sleep(retrySeconds)
+
+    return inner
+
+
+@retry
 def _call_complete_rest(
     model: str,
     prompt: Union[str, List[ConversationMessage]],
@@ -78,7 +111,7 @@ def _call_complete_rest(
     scheme = "https"
     if hasattr(session.connection, "scheme"):
         scheme = session.connection.scheme
-    url = urlunparse((scheme, session.connection.host, "api/v2/cortex/inference
+    url = urlunparse((scheme, session.connection.host, "api/v2/cortex/inference:complete", "", "", ""))
 
     headers = {
         "Content-Type": "application/json",
@@ -105,19 +138,21 @@ def _call_complete_rest(
         data["top_p"] = options["top_p"]
 
     logger.debug(f"making POST request to {url} (model={model}, stream={stream})")
-
+    return requests.post(
         url,
         json=data,
         headers=headers,
         stream=stream,
     )
-    response.raise_for_status()
-    return response
 
 
-def _process_rest_response(
+def _process_rest_response(
+    response: requests.Response,
+    stream: bool = False,
+    deadline: Optional[float] = None,
+) -> Union[str, Iterator[str]]:
     if stream:
-        return _return_stream_response(response)
+        return _return_stream_response(response, deadline)
 
     try:
         content = response.json()["choices"][0]["message"]["content"]
@@ -128,9 +163,11 @@ def _process_rest_response(response: requests.Response, stream: bool = False) ->
         raise ResponseParseException("Failed to parse message from response.") from e
 
 
-def _return_stream_response(response: requests.Response) -> Iterator[str]:
+def _return_stream_response(response: requests.Response, deadline: Optional[float]) -> Iterator[str]:
     client = SSEClient(response)
     for event in client.events():
+        if deadline is not None and time.time() > deadline:
+            raise TimeoutError()
         try:
             yield json.loads(event.data)["choices"][0]["delta"]["content"]
         except (json.JSONDecodeError, KeyError, IndexError):
@@ -209,13 +246,20 @@ def _complete_impl(
     use_rest_api_experimental: bool = False,
     stream: bool = False,
     function: str = "snowflake.cortex.complete",
+    timeout: Optional[float] = None,
+    deadline: Optional[float] = None,
 ) -> Union[str, Iterator[str], snowpark.Column]:
+    if timeout is not None and deadline is not None:
+        raise ValueError('only one of "timeout" and "deadline" must be set')
+    if timeout is not None:
+        deadline = time.time() + timeout
     if use_rest_api_experimental:
        if not isinstance(model, str):
            raise ValueError("in REST mode, 'model' must be a string")
        if not isinstance(prompt, str) and not isinstance(prompt, List):
            raise ValueError("in REST mode, 'prompt' must be a string or a list of ConversationMessage")
-        response = _call_complete_rest(model, prompt, options, session=session, stream=stream)
+        response = _call_complete_rest(model, prompt, options, session=session, stream=stream, deadline=deadline)
+        assert response.status_code >= 200 and response.status_code < 300
        return _process_rest_response(response, stream=stream)
    if stream is True:
        raise ValueError("streaming can only be enabled in REST mode, set use_rest_api_experimental=True")
@@ -233,6 +277,8 @@ def Complete(
     session: Optional[snowpark.Session] = None,
     use_rest_api_experimental: bool = False,
     stream: bool = False,
+    timeout: Optional[float] = None,
+    deadline: Optional[float] = None,
 ) -> Union[str, Iterator[str], snowpark.Column]:
     """Complete calls into the LLM inference service to perform completion.
 
@@ -246,6 +292,8 @@ def Complete(
         stream (bool): Enables streaming. When enabled, a generator function is returned that provides the streaming
             output as it is received. Each update is a string containing the new text content since the previous update.
            The use of streaming requires the experimental use_rest_api_experimental flag to be enabled.
+        timeout (float): Timeout in seconds to retry failed REST requests.
+        deadline (float): Time in seconds since the epoch (as returned by time.time()) to retry failed REST requests.
 
     Raises:
         ValueError: If `stream` is set to True and `use_rest_api_experimental` is set to False.
@@ -254,6 +302,15 @@ def Complete(
         A column of string responses.
     """
     try:
-        return _complete_impl(
+        return _complete_impl(
+            model,
+            prompt,
+            options=options,
+            session=session,
+            use_rest_api_experimental=use_rest_api_experimental,
+            stream=stream,
+            timeout=timeout,
+            deadline=deadline,
+        )
     except ValueError as err:
         raise err
snowflake/cortex/_util.py
CHANGED
@@ -1,4 +1,4 @@
-from typing import Dict, Optional, Union, cast
+from typing import Dict, List, Optional, Union, cast
 
 from snowflake import snowpark
 from snowflake.snowpark import context, functions
@@ -23,7 +23,7 @@ class SnowflakeConfigurationException(Exception):
 def call_sql_function(
     function: str,
     session: Optional[snowpark.Session],
-    *args: Union[str, snowpark.Column, Dict[str, Union[int, float]]],
+    *args: Union[str, List[str], snowpark.Column, Dict[str, Union[int, float]]],
 ) -> Union[str, snowpark.Column]:
     handle_as_column = False
 
@@ -40,7 +40,7 @@ def call_sql_function(
 
 
 def _call_sql_function_column(
-    function: str, *args: Union[str, snowpark.Column, Dict[str, Union[int, float]]]
+    function: str, *args: Union[str, List[str], snowpark.Column, Dict[str, Union[int, float]]]
 ) -> snowpark.Column:
     return cast(snowpark.Column, functions.builtin(function)(*args))
 
@@ -48,7 +48,7 @@ def _call_sql_function_column(
 def _call_sql_function_immediate(
     function: str,
     session: Optional[snowpark.Session],
-    *args: Union[str, snowpark.Column, Dict[str, Union[int, float]]],
+    *args: Union[str, List[str], snowpark.Column, Dict[str, Union[int, float]]],
 ) -> str:
     session = session or context.get_active_session()
     if session is None:
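The widened *args annotation is what lets ClassifyText forward its candidate categories as a plain Python list alongside strings and Columns. A rough sketch of the same call _classify_text_impl makes, assuming an active Snowpark session in context (this is an internal helper, shown only to illustrate the new list argument):

from snowflake.cortex._util import call_sql_function

response = call_sql_function(
    "snowflake.cortex.classify_text",
    None,  # session resolved from context
    "free text to classify",
    ["complaint", "praise"],  # List[str] now accepted by the signature
)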
snowflake/ml/_internal/lineage/lineage_utils.py
CHANGED
@@ -1,9 +1,9 @@
 import copy
 import functools
-from typing import Any, Callable, List, Optional
+from typing import Any, Callable, List, Optional, get_args
 
 from snowflake import snowpark
-from snowflake.ml.
+from snowflake.ml.data import data_source
 
 _DATA_SOURCES_ATTR = "_data_sources"
 
@@ -39,7 +39,7 @@ def get_data_sources(*args: Any) -> Optional[List[data_source.DataSource]]:
     result: Optional[List[data_source.DataSource]] = None
     for arg in args:
         srcs = getattr(arg, _DATA_SOURCES_ATTR, None)
-        if isinstance(srcs, list) and all(isinstance(s, data_source.DataSource) for s in srcs):
+        if isinstance(srcs, list) and all(isinstance(s, get_args(data_source.DataSource)) for s in srcs):
             if result is None:
                 result = []
             result += srcs
@@ -49,7 +49,7 @@ def get_data_sources(*args: Any) -> Optional[List[data_source.DataSource]]:
 def set_data_sources(obj: Any, data_sources: Optional[List[data_source.DataSource]]) -> None:
     """Helper method for attaching data sources to an object"""
     if data_sources:
-        assert all(isinstance(ds, data_source.DataSource) for ds in data_sources)
+        assert all(isinstance(ds, get_args(data_source.DataSource)) for ds in data_sources)
         setattr(obj, _DATA_SOURCES_ATTR, data_sources)
 
 
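The switch to isinstance(s, get_args(data_source.DataSource)) suggests DataSource is now a typing alias (a Union over the new data source types) rather than a concrete class: isinstance cannot take a Union, but it does accept the tuple that get_args returns. A generic, self-contained illustration of the pattern; the names below are standalone stand-ins, not the package's actual definitions:

from typing import Union, get_args

class DataFrameInfo: ...
class DatasetInfo: ...

# Hypothetical alias mirroring the package's DataSource union.
DataSource = Union[DataFrameInfo, DatasetInfo, str]

sources = [DatasetInfo(), "stage/path/part-0.parquet"]
# get_args unpacks the union into (DataFrameInfo, DatasetInfo, str), which isinstance accepts.
assert all(isinstance(s, get_args(DataSource)) for s in sources)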
snowflake/ml/_internal/telemetry.py
CHANGED
@@ -277,6 +277,7 @@ def send_api_usage_telemetry(
         ]
     ] = None,
     sfqids_extractor: Optional[Callable[..., List[str]]] = None,
+    subproject_extractor: Optional[Callable[[Any], str]] = None,
     custom_tags: Optional[Dict[str, Union[bool, int, str, float]]] = None,
 ) -> Callable[[Callable[_Args, _ReturnValue]], Callable[_Args, _ReturnValue]]:
     """
@@ -290,6 +291,7 @@ def send_api_usage_telemetry(
         conn_attr_name: Name of the SnowflakeConnection attribute in `self`.
         api_calls_extractor: Extract API calls from `self`.
         sfqids_extractor: Extract sfqids from `self`.
+        subproject_extractor: Extract subproject at runtime from `self`.
         custom_tags: Custom tags.
 
     Returns:
@@ -297,10 +299,14 @@ def send_api_usage_telemetry(
 
     Raises:
         TypeError: If `conn_attr_name` is provided but the conn attribute is not of type SnowflakeConnection.
+        ValueError: If both `subproject` and `subproject_extractor` are provided
 
     # noqa: DAR402
     """
+    if subproject is not None and subproject_extractor is not None:
+        raise ValueError("Specifying both subproject and subproject_extractor is not allowed")
+
     def decorator(func: Callable[_Args, _ReturnValue]) -> Callable[_Args, _ReturnValue]:
         @functools.wraps(func)
         def wrap(*args: Any, **kwargs: Any) -> _ReturnValue:
@@ -322,9 +328,13 @@ def send_api_usage_telemetry(
             if sfqids_extractor:
                 sfqids = sfqids_extractor(args[0])
 
+            subproject_name = subproject
+            if subproject_extractor is not None:
+                subproject_name = subproject_extractor(args[0])
+
             statement_params = get_function_usage_statement_params(
                 project=project,
-                subproject=
+                subproject=subproject_name,
                 function_category=TelemetryField.FUNC_CAT_USAGE.value,
                 function_name=_get_full_func_name(func),
                 function_parameters=params,
@@ -381,7 +391,7 @@ def send_api_usage_telemetry(
                     raise e.original_exception from e
 
             # TODO(hayu): [SNOW-750287] Optimize telemetry client to a singleton.
-            telemetry = _SourceTelemetryClient(conn=conn, project=project, subproject=
+            telemetry = _SourceTelemetryClient(conn=conn, project=project, subproject=subproject_name)
             telemetry_args = dict(
                 func_name=_get_full_func_name(func),
                 function_category=TelemetryField.FUNC_CAT_USAGE.value,
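The new subproject_extractor lets the telemetry subproject be computed from the decorated method's self at call time instead of being fixed when the decorator is applied; supplying both subproject and subproject_extractor raises ValueError. A sketch of the intended decorator usage; the class, attribute, and project names here are illustrative, not from the package:

from snowflake.ml._internal import telemetry

class ModelOperator:
    def __init__(self, subproject: str) -> None:
        self._subproject = subproject

    @telemetry.send_api_usage_telemetry(
        project="MLOps",
        # Called with `self` (args[0]) when `deploy` runs, so each instance can report its own subproject.
        subproject_extractor=lambda self: self._subproject,
    )
    def deploy(self) -> None:
        ...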
snowflake/ml/data/_internal/arrow_ingestor.py
ADDED
@@ -0,0 +1,228 @@
+import collections
+import logging
+import os
+import time
+from typing import Any, Deque, Dict, Iterator, List, Optional
+
+import numpy as np
+import numpy.typing as npt
+import pandas as pd
+import pyarrow as pa
+import pyarrow.dataset as ds
+
+from snowflake import snowpark
+from snowflake.ml.data import data_ingestor, data_source
+from snowflake.ml.data._internal import ingestor_utils
+
+_EMPTY_RECORD_BATCH = pa.RecordBatch.from_arrays([], [])
+
+# The row count for batches read from PyArrow Dataset. This number should be large enough so that
+# dataset.to_batches() would read in a very large portion of, if not entirely, a parquet file.
+_DEFAULT_DATASET_BATCH_SIZE = 1000000
+
+
+class _RecordBatchesBuffer:
+    """A queue that stores record batches and tracks the total num of rows in it."""
+
+    def __init__(self) -> None:
+        self.buffer: Deque[pa.RecordBatch] = collections.deque()
+        self.num_rows = 0
+
+    def append(self, rb: pa.RecordBatch) -> None:
+        self.buffer.append(rb)
+        self.num_rows += rb.num_rows
+
+    def appendleft(self, rb: pa.RecordBatch) -> None:
+        self.buffer.appendleft(rb)
+        self.num_rows += rb.num_rows
+
+    def popleft(self) -> pa.RecordBatch:
+        popped = self.buffer.popleft()
+        self.num_rows -= popped.num_rows
+        return popped
+
+
+class ArrowIngestor(data_ingestor.DataIngestor):
+    """Read and parse the data sources into an Arrow Dataset and yield batched numpy array in dict."""
+
+    def __init__(
+        self,
+        session: snowpark.Session,
+        data_sources: List[data_source.DataSource],
+        format: Optional[str] = None,
+        **kwargs: Any,
+    ) -> None:
+        """
+        Args:
+            session: The Snowpark Session to use.
+            data_sources: List of data sources to ingest.
+            format: Currently "parquet", "ipc"/"arrow"/"feather", "csv", "json", and "orc" are supported.
+                Will be inferred if not specified.
+            kwargs: Miscellaneous arguments passed to underlying PyArrow Dataset initializer.
+        """
+        self._session = session
+        self._data_sources = data_sources
+        self._format = format
+        self._kwargs = kwargs
+
+        self._schema: Optional[pa.Schema] = None
+
+    @property
+    def data_sources(self) -> List[data_source.DataSource]:
+        return self._data_sources
+
+    def to_batches(
+        self,
+        batch_size: int,
+        shuffle: bool = True,
+        drop_last_batch: bool = True,
+    ) -> Iterator[Dict[str, npt.NDArray[Any]]]:
+        """Iterate through PyArrow Dataset to generate batches whose length equals to expected batch size.
+
+        As we are generating batches with the exactly same length, the last few rows in each file might get left as they
+        are not long enough to form a batch. These rows will be put into a temporary buffer and combine with the first
+        few rows of the next file to generate a new batch.
+
+        Args:
+            batch_size: Specifies the size of each batch that will be yield
+            shuffle: Whether the data in the file will be shuffled. If set to be true, it will first randomly shuffle
+                the order of files, and then shuflle the order of rows in each file.
+            drop_last_batch: Whether the last batch of data should be dropped. If set to be true, then the last
+                batch will get dropped if its size is smaller than the given batch_size.
+
+        Yields:
+            A dict mapping column names to the corresponding data fetch from that column.
+        """
+        self._rb_buffer = _RecordBatchesBuffer()
+
+        # Extract schema if not already known
+        dataset = self._get_dataset(shuffle)
+        if self._schema is None:
+            self._schema = dataset.schema
+
+        for rb in _retryable_batches(dataset, batch_size=max(_DEFAULT_DATASET_BATCH_SIZE, batch_size)):
+            if shuffle:
+                rb = rb.take(np.random.permutation(rb.num_rows))
+            self._rb_buffer.append(rb)
+            while self._rb_buffer.num_rows >= batch_size:
+                yield self._get_batches_from_buffer(batch_size)
+
+        if self._rb_buffer.num_rows and not drop_last_batch:
+            yield self._get_batches_from_buffer(batch_size)
+
+    def to_pandas(self, limit: Optional[int] = None) -> pd.DataFrame:
+        ds = self._get_dataset(shuffle=False)
+        table = ds.to_table() if limit is None else ds.head(num_rows=limit)
+        return table.to_pandas()
+
+    def _get_dataset(self, shuffle: bool) -> ds.Dataset:
+        format = self._format
+        sources = []
+        source_format = None
+        for source in self._data_sources:
+            if isinstance(source, str):
+                sources.append(source)
+                source_format = format or os.path.splitext(source)[-1]
+            elif isinstance(source, data_source.DatasetInfo):
+                if not self._kwargs.get("filesystem"):
+                    self._kwargs["filesystem"] = ingestor_utils.get_dataset_filesystem(self._session, source)
+                sources.extend(
+                    ingestor_utils.get_dataset_files(self._session, source, filesystem=self._kwargs["filesystem"])
+                )
+                source_format = "parquet"
+            elif isinstance(source, data_source.DataFrameInfo):
+                # FIXME: This currently loads all result batches into memory so that it
+                # can be passed into pyarrow.dataset as a list/tuple of pa.RecordBatches
+                # We may be able to optimize this by splitting the result batches into
+                # in-memory (first batch) and file URLs (subsequent batches) and creating a
+                # union dataset.
+                result_batches = ingestor_utils.get_dataframe_result_batches(self._session, source)
+                sources.extend(b.to_arrow() for b in result_batches)
+                source_format = "arrow"
+            else:
+                raise RuntimeError(f"Unsupported data source type: {type(source)}")
+
+            # Make sure source types not mixed
+            if format and format != source_format:
+                raise RuntimeError(f"Unexpected data source format (expected {format}, found {source_format})")
+            format = source_format
+
+        # Re-shuffle input files on each iteration start
+        if shuffle:
+            np.random.shuffle(sources)
+        pa_dataset: ds.Dataset = ds.dataset(sources, format=format, **self._kwargs)
+        return pa_dataset
+
+    def _get_batches_from_buffer(self, batch_size: int) -> Dict[str, npt.NDArray[Any]]:
+        """Generate new batches from the existing record batch buffer."""
+        cnt_rbs_num_rows = 0
+        candidates = []
+
+        # Keep popping record batches in buffer until there are enough rows for a batch.
+        while self._rb_buffer.num_rows and cnt_rbs_num_rows < batch_size:
+            candidate = self._rb_buffer.popleft()
+            cnt_rbs_num_rows += candidate.num_rows
+            candidates.append(candidate)
+
+        # When there are more rows than needed, slice the last popped batch to fit batch_size.
+        if cnt_rbs_num_rows > batch_size:
+            row_diff = cnt_rbs_num_rows - batch_size
+            slice_target = candidates[-1]
+            cut_off = slice_target.num_rows - row_diff
+            to_merge = slice_target.slice(length=cut_off)
+            left_over = slice_target.slice(offset=cut_off)
+            candidates[-1] = to_merge
+            self._rb_buffer.appendleft(left_over)
+
+        res = _merge_record_batches(candidates)
+        return _record_batch_to_arrays(res)
+
+
+def _merge_record_batches(record_batches: List[pa.RecordBatch]) -> pa.RecordBatch:
+    """Merge a list of arrow RecordBatches into one. Similar to MergeTables."""
+    if not record_batches:
+        return _EMPTY_RECORD_BATCH
+    if len(record_batches) == 1:
+        return record_batches[0]
+    record_batches = list(filter(lambda rb: rb.num_rows > 0, record_batches))
+    one_chunk_table = pa.Table.from_batches(record_batches).combine_chunks()
+    batches = one_chunk_table.to_batches(max_chunksize=None)
+    return batches[0]
+
+
+def _record_batch_to_arrays(rb: pa.RecordBatch) -> Dict[str, npt.NDArray[Any]]:
+    """Transform the record batch to a (string, numpy array) dict."""
+    batch_dict = {}
+    for column, column_schema in zip(rb, rb.schema):
+        # zero_copy_only=False because of nans. Ideally nans should have been imputed in feature engineering.
+        array = column.to_numpy(zero_copy_only=False)
+        batch_dict[column_schema.name] = array
+    return batch_dict
+
+
+def _retryable_batches(
+    dataset: ds.Dataset, batch_size: int, max_retries: int = 3, delay: int = 0
+) -> Iterator[pa.RecordBatch]:
+    """Make the Dataset to_batches retryable."""
+    retries = 0
+    current_batch_index = 0
+
+    while True:
+        try:
+            for batch_index, batch in enumerate(dataset.to_batches(batch_size=batch_size)):
+                if batch_index < current_batch_index:
+                    # Skip batches that have already been processed
+                    continue
+
+                yield batch
+                current_batch_index = batch_index + 1
+            # Exit the loop once all batches are processed
+            break
+
+        except Exception as e:
+            if retries < max_retries:
+                retries += 1
+                logging.info(f"Error encountered: {e}. Retrying {retries}/{max_retries}...")
+                time.sleep(delay)
+            else:
+                raise e
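ArrowIngestor unifies file paths, Dataset files, and Snowpark DataFrame result batches behind a single PyArrow Dataset, and to_batches yields fixed-size numpy batches, carrying leftover rows over between files. A hedged sketch of direct usage, assuming a Snowpark session and local parquet files; in practice this class is normally driven through the new DataConnector, and the column names below are illustrative:

from snowflake.ml.data._internal.arrow_ingestor import ArrowIngestor

ingestor = ArrowIngestor(session, ["/tmp/train-0.parquet", "/tmp/train-1.parquet"], format="parquet")

for batch in ingestor.to_batches(batch_size=1024, shuffle=True, drop_last_batch=True):
    # Each batch is a dict of column name -> numpy array, each of length 1024.
    features, labels = batch["FEATURE_COL"], batch["LABEL_COL"]

preview_df = ingestor.to_pandas(limit=10)  # small pandas preview of the same sources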
snowflake/ml/data/_internal/ingestor_utils.py
ADDED
@@ -0,0 +1,58 @@
+from typing import List, Optional
+
+import fsspec
+
+from snowflake import snowpark
+from snowflake.connector import result_batch
+from snowflake.ml.data import data_source
+from snowflake.ml.fileset import snowfs
+
+_TARGET_FILE_SIZE = 32 * 2**20  # The max file size for data loading.
+
+
+def get_dataframe_result_batches(
+    session: snowpark.Session, df_info: data_source.DataFrameInfo
+) -> List[result_batch.ResultBatch]:
+    cursor = session._conn._cursor
+
+    if df_info.query_id:
+        query_id = df_info.query_id
+    else:
+        query_id = session.sql(df_info.sql).collect_nowait().query_id
+
+    # TODO: Check if query result cache is still live
+    cursor.get_results_from_sfqid(sfqid=query_id)
+
+    # Prefetch hook should be set by `get_results_from_sfqid`
+    # This call blocks until the query results are ready
+    if cursor._prefetch_hook is None:
+        raise RuntimeError("Loading data from result query failed unexpectedly. Please contact Snowflake support.")
+    cursor._prefetch_hook()
+    batches = cursor.get_result_batches()
+    if batches is None:
+        raise ValueError(
+            "Failed to retrieve training data. Query status:" f" {session._conn._conn.get_query_status(query_id)}"
+        )
+    return batches
+
+
+def get_dataset_filesystem(
+    session: snowpark.Session, ds_info: Optional[data_source.DatasetInfo] = None
+) -> fsspec.AbstractFileSystem:
+    # We can't directly load the Dataset to avoid a circular dependency
+    # Dataset -> DatasetReader -> DataConnector -> DataIngestor -> (?) ingestor_utils -> Dataset
+    # TODO: Automatically pick appropriate fsspec implementation based on protocol in URL
+    return snowfs.SnowFileSystem(
+        snowpark_session=session,
+        cache_type="bytes",
+        block_size=2 * _TARGET_FILE_SIZE,
+    )
+
+
+def get_dataset_files(
+    session: snowpark.Session, ds_info: data_source.DatasetInfo, filesystem: Optional[fsspec.AbstractFileSystem] = None
+) -> List[str]:
+    if filesystem is None:
+        filesystem = get_dataset_filesystem(session, ds_info)
+    assert bool(ds_info.url)  # Not null or empty
+    return sorted(filesystem.ls(ds_info.url))
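These helpers back the DatasetInfo and DataFrameInfo branches of ArrowIngestor._get_dataset: dataset sources resolve to staged file paths via a SnowFileSystem, while DataFrame sources resolve to connector ResultBatch objects fetched by query id. A rough sketch of the dataset path, assuming a session and a DatasetInfo (dataset_info below is a hypothetical instance pointing at a dataset version URL):

from snowflake.ml.data._internal import ingestor_utils

fs = ingestor_utils.get_dataset_filesystem(session)
files = ingestor_utils.get_dataset_files(session, dataset_info, filesystem=fs)

# SnowFileSystem is an fsspec filesystem, so standard open/ls semantics apply.
with fs.open(files[0], "rb") as f:
    head = f.read(1024)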