snowflake-ml-python 1.17.0__py3-none-any.whl → 1.19.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/ml/_internal/telemetry.py +3 -2
- snowflake/ml/experiment/_client/experiment_tracking_sql_client.py +18 -19
- snowflake/ml/experiment/callback/keras.py +3 -0
- snowflake/ml/experiment/callback/lightgbm.py +3 -0
- snowflake/ml/experiment/callback/xgboost.py +3 -0
- snowflake/ml/experiment/experiment_tracking.py +50 -70
- snowflake/ml/feature_store/feature_store.py +299 -69
- snowflake/ml/feature_store/feature_view.py +12 -6
- snowflake/ml/fileset/stage_fs.py +12 -1
- snowflake/ml/jobs/_utils/constants.py +12 -1
- snowflake/ml/jobs/_utils/payload_utils.py +7 -1
- snowflake/ml/jobs/_utils/stage_utils.py +4 -0
- snowflake/ml/jobs/_utils/types.py +5 -0
- snowflake/ml/jobs/job.py +19 -5
- snowflake/ml/jobs/manager.py +18 -7
- snowflake/ml/model/__init__.py +19 -0
- snowflake/ml/model/_client/model/batch_inference_specs.py +63 -0
- snowflake/ml/model/_client/model/inference_engine_utils.py +1 -5
- snowflake/ml/model/_client/model/model_version_impl.py +129 -11
- snowflake/ml/model/_client/ops/model_ops.py +11 -4
- snowflake/ml/model/_client/ops/service_ops.py +3 -0
- snowflake/ml/model/_client/service/model_deployment_spec.py +3 -0
- snowflake/ml/model/_client/service/model_deployment_spec_schema.py +1 -0
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +3 -1
- snowflake/ml/model/_model_composer/model_method/model_method.py +4 -1
- snowflake/ml/model/_packager/model_handlers/_utils.py +70 -0
- snowflake/ml/model/_packager/model_handlers/prophet.py +566 -0
- snowflake/ml/model/_packager/model_meta/model_meta_schema.py +6 -0
- snowflake/ml/model/_packager/model_runtime/_snowml_inference_alternative_requirements.py +1 -1
- snowflake/ml/model/type_hints.py +16 -0
- snowflake/ml/modeling/metrics/metrics_utils.py +9 -2
- snowflake/ml/monitoring/explain_visualize.py +3 -1
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.17.0.dist-info → snowflake_ml_python-1.19.0.dist-info}/METADATA +50 -4
- {snowflake_ml_python-1.17.0.dist-info → snowflake_ml_python-1.19.0.dist-info}/RECORD +38 -37
- {snowflake_ml_python-1.17.0.dist-info → snowflake_ml_python-1.19.0.dist-info}/WHEEL +0 -0
- {snowflake_ml_python-1.17.0.dist-info → snowflake_ml_python-1.19.0.dist-info}/licenses/LICENSE.txt +0 -0
- {snowflake_ml_python-1.17.0.dist-info → snowflake_ml_python-1.19.0.dist-info}/top_level.txt +0 -0
snowflake/ml/feature_store/feature_store.py
CHANGED

@@ -474,8 +474,8 @@ class FeatureStore:
             feature_view: FeatureView instance to materialize.
             version: version of the registered FeatureView.
                 NOTE: Version only accepts letters, numbers and underscore. Also version will be capitalized.
-            block:
-
+            block: Deprecated. To make the initial refresh asynchronous, set the `initialize`
+                argument on the `FeatureView` to `"ON_SCHEDULE"`. Default is true.
             overwrite: Overwrite the existing FeatureView with same version. This is the same as dropping the
                 FeatureView first then recreate. NOTE: there will be backfill cost associated if the FeatureView is
                 being continuously maintained.
@@ -521,6 +521,15 @@ class FeatureStore:
         """
         version = FeatureViewVersion(version)

+        if block is False:
+            raise snowml_exceptions.SnowflakeMLException(
+                error_code=error_codes.INVALID_ARGUMENT,
+                original_exception=ValueError(
+                    'block=False is deprecated. Use FeatureView(..., initialize="ON_SCHEDULE") '
+                    "for async initial refresh."
+                ),
+            )
+
         if feature_view.status != FeatureViewStatus.DRAFT:
             try:
                 return self._get_feature_view_if_exists(feature_view.name, str(version))
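For callers migrating off `block=False`, a minimal sketch (the entity, DataFrame, and names below are illustrative, not from this diff; `initialize="ON_SCHEDULE"` is confirmed by the deprecation message above):

    from snowflake.ml.feature_store import FeatureView

    # Before (now raises INVALID_ARGUMENT):
    #     fs.register_feature_view(fv, version="V1", block=False)
    # After: request an asynchronous initial refresh on the FeatureView itself.
    fv = FeatureView(
        name="MY_FV",
        entities=[my_entity],        # assumed pre-existing Entity
        feature_df=features_df,      # assumed Snowpark DataFrame
        refresh_freq="1 day",
        initialize="ON_SCHEDULE",    # initial refresh runs on the schedule instead of blocking
    )
    registered = fs.register_feature_view(fv, version="V1")  # block defaults to True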
@@ -1199,10 +1208,10 @@ class FeatureStore:
         """Get refresh history for online feature table."""
         online_table_name = FeatureView._get_online_table_name(feature_view.name, feature_view.version)
         select_cols = "*" if verbose else "name, state, refresh_start_time, refresh_end_time, refresh_action"
-
-            f"{self._config.database.
-            f"{self._config.schema.
-            f"{online_table_name.
+        name = (
+            f"{self._config.database.identifier()}."
+            f"{self._config.schema.identifier()}."
+            f"{online_table_name.identifier()}"
         )
         return self._session.sql(
             f"""
@@ -1210,9 +1219,8 @@ class FeatureStore:
                 {select_cols}
             FROM TABLE (
                 {self._config.database}.INFORMATION_SCHEMA.ONLINE_FEATURE_TABLE_REFRESH_HISTORY (
-
+                    NAME => '{name}'
                 )
-
             )
             """
         )
@@ -1591,6 +1599,7 @@ class FeatureStore:
         spine_timestamp_col: Optional[str] = None,
         exclude_columns: Optional[list[str]] = None,
         include_feature_view_timestamp_col: bool = False,
+        join_method: Literal["sequential", "cte"] = "sequential",
     ) -> DataFrame:
         """
         Enrich spine dataframe with feature values. Mainly used to generate inference data input.
@@ -1604,6 +1613,8 @@ class FeatureStore:
             exclude_columns: Column names to exclude from the result dataframe.
             include_feature_view_timestamp_col: Generated dataset will include timestamp column of feature view
                 (if feature view has timestamp column) if set true. Default to false.
+            join_method: Method for feature joins. "sequential" for layer-by-layer joins (default),
+                "cte" for CTE method. (Internal use only - subject to change)

         Returns:
             Snowpark DataFrame containing the joined results.
@@ -1641,6 +1652,7 @@ class FeatureStore:
             cast(list[Union[FeatureView, FeatureViewSlice]], features),
             spine_timestamp_col,
             include_feature_view_timestamp_col,
+            join_method,
         )

         if exclude_columns is not None:
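A usage sketch for the new flag (the spine DataFrame and feature view names are illustrative; the parameters match the signature above):

    enriched_df = fs.retrieve_feature_values(
        spine_df=spine_df,            # Snowpark DataFrame carrying the join keys
        features=[fv_user, fv_item],  # registered FeatureViews (or slices)
        spine_timestamp_col="EVENT_TS",
        join_method="cte",            # internal-only; default remains "sequential"
    )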
@@ -1659,6 +1671,7 @@ class FeatureStore:
         spine_label_cols: Optional[list[str]] = None,
         exclude_columns: Optional[list[str]] = None,
         include_feature_view_timestamp_col: bool = False,
+        join_method: Literal["sequential", "cte"] = "sequential",
     ) -> DataFrame:
         """
         Generate a training set from the specified Spine DataFrame and Feature Views. Result is
@@ -1676,6 +1689,8 @@ class FeatureStore:
             exclude_columns: Name of column(s) to exclude from the resulting training set.
             include_feature_view_timestamp_col: Generated dataset will include timestamp column of feature view
                 (if feature view has timestamp column) if set true. Default to false.
+            join_method: Method for feature joins. "sequential" for layer-by-layer joins (default),
+                "cte" for CTE method. (Internal use only - subject to change)

         Returns:
             Returns a Snowpark DataFrame representing the training set.
@@ -1709,7 +1724,7 @@ class FeatureStore:
         spine_label_cols = to_sql_identifiers(spine_label_cols)  # type: ignore[assignment]

         result_df, join_keys = self._join_features(
-            spine_df, features, spine_timestamp_col, include_feature_view_timestamp_col
+            spine_df, features, spine_timestamp_col, include_feature_view_timestamp_col, join_method
         )

         if exclude_columns is not None:
@@ -1757,6 +1772,7 @@ class FeatureStore:
         include_feature_view_timestamp_col: bool = False,
         desc: str = "",
         output_type: Literal["dataset"] = "dataset",
+        join_method: Literal["sequential", "cte"] = "sequential",
     ) -> dataset.Dataset:
         ...

@@ -1774,6 +1790,7 @@ class FeatureStore:
         exclude_columns: Optional[list[str]] = None,
         include_feature_view_timestamp_col: bool = False,
         desc: str = "",
+        join_method: Literal["sequential", "cte"] = "sequential",
     ) -> DataFrame:
         ...

@@ -1791,6 +1808,7 @@ class FeatureStore:
         include_feature_view_timestamp_col: bool = False,
         desc: str = "",
         output_type: Literal["dataset", "table"] = "dataset",
+        join_method: Literal["sequential", "cte"] = "sequential",
     ) -> Union[dataset.Dataset, DataFrame]:
         """
         Generate dataset by given source table and feature views.
@@ -1811,6 +1829,8 @@ class FeatureStore:
                 (if feature view has timestamp column) if set true. Default to false.
             desc: A description about this dataset.
             output_type: (Deprecated) The type of Snowflake storage to use for the generated training data.
+            join_method: Method for feature joins. "sequential" for layer-by-layer joins (default),
+                "cte" for CTE method. (Internal use only - subject to change)

         Returns:
             If output_type is "dataset" (default), returns a Dataset object.
@@ -1874,6 +1894,7 @@ class FeatureStore:
             exclude_columns=exclude_columns,
             include_feature_view_timestamp_col=include_feature_view_timestamp_col,
             save_as=table_name,
+            join_method=join_method,
         )
         if output_type == "table":
             warnings.warn(
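The same flag threads through training-set and dataset generation; a hedged sketch (names are illustrative, parameters follow the overloads above):

    ds = fs.generate_dataset(
        name="CHURN_TRAINING",
        spine_df=spine_df,
        features=[fv_user, fv_item],
        spine_timestamp_col="EVENT_TS",
        spine_label_cols=["LABEL"],
        join_method="sequential",     # the default; "cte" is internal and subject to change
    )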
@@ -2082,26 +2103,48 @@ class FeatureStore:
     def _plan_online_update(
         self, feature_view: FeatureView, online_config: Optional[fv_mod.OnlineConfig]
     ) -> _OnlineUpdateStrategy:
-        """Plan online update operations based on current state and target config.
+        """Plan online update operations based on current state and target config.
+
+        Handles three cases:
+        - enable is None: Preserve current online state, only update if currently online
+        - enable is True: Enable online storage (create if needed, update if exists)
+        - enable is False: Disable online storage (drop if exists)
+
+        Args:
+            feature_view: The FeatureView object to check current online state.
+            online_config: The OnlineConfig with target enable and lag settings.
+
+        Returns:
+            _OnlineUpdateStrategy containing operations and their rollbacks.
+        """
         if online_config is None:
             return self._OnlineUpdateStrategy([], [], None)

         current_online = feature_view.online
         target_online = online_config.enable

-        #
+        # Case 1: enable is None - preserve current online state, only update if currently online
+        if target_online is None:
+            if current_online and (online_config.target_lag is not None):
+                # Online is currently enabled and user wants to update lag
+                return self._plan_online_update_existing(feature_view, online_config)
+            else:
+                # No online changes needed (either not online, or lag not specified)
+                return self._OnlineUpdateStrategy([], [], None)
+
+        # Case 2: Enable online (create table)
         if target_online and not current_online:
             return self._plan_online_enable(feature_view, online_config)

-        # Disable online (drop table)
+        # Case 3: Disable online (drop table)
         elif not target_online and current_online:
             return self._plan_online_disable(feature_view)

-        # Update existing online table
+        # Case 4: Update existing online table
         elif target_online and current_online:
             return self._plan_online_update_existing(feature_view, online_config)

-        # No change needed
+        # Case 5: No change needed
         else:
             return self._OnlineUpdateStrategy([], [], online_config)

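The three `enable` settings map onto this planner roughly as follows (a sketch assuming `OnlineConfig` is exposed from `snowflake.ml.feature_store`; lag values are illustrative):

    from snowflake.ml.feature_store import OnlineConfig  # import path assumed

    OnlineConfig(enable=True, target_lag="1 hour")    # create the online table, or update it if present
    OnlineConfig(enable=False)                        # drop the online table if it exists
    OnlineConfig(target_lag="30 minutes")             # enable=None: updates lag only if already online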
@@ -2596,91 +2639,278 @@ class FeatureStore:
         found_rows = self._find_object("TAGS", full_entity_tag_name)
         return len(found_rows) == 1

+    def _build_cte_query(
+        self,
+        feature_views: list[FeatureView],
+        feature_columns: list[str],
+        spine_ref: str,
+        spine_timestamp_col: Optional[SqlIdentifier],
+        include_feature_view_timestamp_col: bool = False,
+    ) -> str:
+        """
+        Build a CTE query with the spine query and the feature views.
+
+        This method supports feature views with different join keys by:
+        1. Creating a spine CTE that includes all possible join keys
+        2. For each feature view, creating a deduplicated spine subquery with only that FV's join keys
+        3. Performing ASOF JOINs on the deduplicated spine when timestamp columns exist
+        4. Performing LEFT JOINs on the deduplicated spine when timestamp columns are missing
+        5. Combining results by LEFT JOINing each FV CTE back to the original SPINE
+
+        Args:
+            feature_views: A list of feature views to join.
+            feature_columns: A list of feature column strings for each feature view.
+            spine_ref: The spine query.
+            spine_timestamp_col: The timestamp column from spine. Can be None if spine has no timestamp column.
+            include_feature_view_timestamp_col: Whether to include the timestamp column of
+                the feature view in the result. Default to false.
+
+        Returns:
+            A SQL query string with CTE structure for joining feature views.
+        """
+        if not feature_views:
+            return f"SELECT * FROM ({spine_ref})"
+
+        # Create spine CTE with the spine query for reuse
+        spine_cte = f"""SPINE AS (
+            SELECT * FROM ({spine_ref})
+        )"""
+
+        ctes = [spine_cte]
+        cte_names = []
+        for i, feature_view in enumerate(feature_views):
+            cte_name = f"FV{i:03d}"
+            cte_names.append(cte_name)
+
+            feature_timestamp_col = feature_view.timestamp_col
+
+            # Get the specific join keys for this feature view
+            fv_join_keys = list({k for e in feature_view.entities for k in e.join_keys})
+            join_keys_str = ", ".join(fv_join_keys)
+
+            # Use ASOF JOIN if both spine and feature view have timestamp columns, otherwise use LEFT JOIN
+            if spine_timestamp_col is not None and feature_timestamp_col is not None:
+                # Build the deduplicated spine columns set (join keys + timestamp)
+                spine_dedup_cols_set = set(fv_join_keys)
+                if spine_timestamp_col not in spine_dedup_cols_set:
+                    spine_dedup_cols_set.add(spine_timestamp_col)
+                spine_dedup_cols_str = ", ".join(f'"{col}"' for col in spine_dedup_cols_set)
+
+                # Build the JOIN condition using only this feature view's join keys
+                join_conditions_dedup = [f'SPINE_DEDUP."{col}" = FEATURE."{col}"' for col in fv_join_keys]
+
+                if include_feature_view_timestamp_col:
+                    f_ts_col_alias = identifier.concat_names(
+                        [feature_view.name, "_", str(feature_view.version), "_", feature_timestamp_col]
+                    )
+                    f_ts_col_str = f"FEATURE.{feature_timestamp_col} AS {f_ts_col_alias},"
+                else:
+                    f_ts_col_str = ""
+                ctes.append(
+                    f"""{cte_name} AS (
+                    SELECT
+                        SPINE_DEDUP.*,
+                        {f_ts_col_str}
+                        FEATURE.* EXCLUDE ({join_keys_str}, {feature_timestamp_col})
+                    FROM (
+                        SELECT DISTINCT {spine_dedup_cols_str}
+                        FROM SPINE
+                    ) SPINE_DEDUP
+                    ASOF JOIN (
+                        SELECT {join_keys_str}, {feature_timestamp_col}, {feature_columns[i]}
+                        FROM {feature_view.fully_qualified_name()}
+                    ) FEATURE
+                    MATCH_CONDITION (SPINE_DEDUP."{spine_timestamp_col}" >= FEATURE."{feature_timestamp_col}")
+                    ON {" AND ".join(join_conditions_dedup)}
+                )"""
+                )
+            else:
+                # Build the deduplicated spine columns list (just join keys, no timestamp)
+                spine_dedup_cols_str = ", ".join(f'"{col}"' for col in fv_join_keys)
+
+                # Build the JOIN condition using only this feature view's join keys
+                join_conditions_dedup = [f'SPINE_DEDUP."{col}" = FEATURE."{col}"' for col in fv_join_keys]
+
+                ctes.append(
+                    f"""{cte_name} AS (
+                    SELECT
+                        SPINE_DEDUP.*,
+                        FEATURE.* EXCLUDE ({join_keys_str})
+                    FROM (
+                        SELECT DISTINCT {spine_dedup_cols_str}
+                        FROM SPINE
+                    ) SPINE_DEDUP
+                    LEFT JOIN (
+                        SELECT {join_keys_str}, {feature_columns[i]}
+                        FROM {feature_view.fully_qualified_name()}
+                    ) FEATURE
+                    ON {" AND ".join(join_conditions_dedup)}
+                )"""
+                )

+        # Build final SELECT with LEFT joins to each FV CTE
+        select_columns = []
+        join_clauses = []
+
+        for i, cte_name in enumerate(cte_names):
+            feature_view = feature_views[i]
+            fv_join_keys = list({k for e in feature_view.entities for k in e.join_keys})
+            join_conditions = [f'SPINE."{col}" = {cte_name}."{col}"' for col in fv_join_keys]
+            # Only include spine timestamp in join condition if both spine and FV have timestamps
+            if spine_timestamp_col is not None and feature_view.timestamp_col is not None:
+                join_conditions.append(f'SPINE."{spine_timestamp_col}" = {cte_name}."{spine_timestamp_col}"')
+
+            if include_feature_view_timestamp_col and feature_view.timestamp_col is not None:
+                f_ts_col_alias = identifier.concat_names(
+                    [feature_view.name, "_", str(feature_view.version), "_", feature_view.timestamp_col]
+                )
+                f_ts_col_str = f"{cte_name}.{f_ts_col_alias} AS {f_ts_col_alias}"
+                select_columns.append(f_ts_col_str)
+
+            # Select features from the CTE
+            # feature_columns[i] is already a comma-separated string of column names
+            feature_cols_from_cte = []
+            for col in feature_columns[i].split(", "):
+                col_clean = col.strip()
+                feature_cols_from_cte.append(f"{cte_name}.{col_clean}")
+            select_columns.extend(feature_cols_from_cte)
+
+            # Create join condition using only this feature view's join keys
+            join_clauses.append(
+                f"""
+                LEFT JOIN {cte_name}
+                ON {" AND ".join(join_conditions)}"""
+            )
+
+        query = f"""WITH
+            {', '.join(ctes)}
+            SELECT
+                SPINE.*,
+                {', '.join(select_columns)}
+            FROM SPINE{' '.join(join_clauses)}
+        """
+
+        return query
+
     def _join_features(
         self,
         spine_df: DataFrame,
         features: list[Union[FeatureView, FeatureViewSlice]],
         spine_timestamp_col: Optional[SqlIdentifier],
         include_feature_view_timestamp_col: bool,
+        join_method: Literal["sequential", "cte"] = "sequential",
     ) -> tuple[DataFrame, list[SqlIdentifier]]:
-
-
-
+        # Validate join_method parameter
+        if join_method not in ["sequential", "cte"]:
+            raise ValueError(f"Invalid join_method '{join_method}'. Must be 'sequential' or 'cte'.")
+
+        feature_views: list[FeatureView] = []
+        # Extract column selections for each feature view
+        feature_columns: list[str] = []
+        for feature in features:
+            fv = feature.feature_view_ref if isinstance(feature, FeatureViewSlice) else feature
+            if fv.status == FeatureViewStatus.DRAFT:
                 raise snowml_exceptions.SnowflakeMLException(
                     error_code=error_codes.NOT_FOUND,
-                    original_exception=ValueError(f"FeatureView {
+                    original_exception=ValueError(f"FeatureView {fv.name} has not been registered."),
                 )
-            for e in
+            for e in fv.entities:
                 for k in e.join_keys:
                     if k not in to_sql_identifiers(spine_df.columns):
                         raise snowml_exceptions.SnowflakeMLException(
                             error_code=error_codes.INVALID_ARGUMENT,
                             original_exception=ValueError(
-                                f"join_key {k} from Entity {e.name} in FeatureView {
+                                f"join_key {k} from Entity {e.name} in FeatureView {fv.name} "
+                                "is not found in spine_df."
                             ),
                         )
-
+            feature_views.append(fv)
+            if isinstance(feature, FeatureViewSlice):
+                cols = feature.names
+            else:
+                cols = feature.feature_names
+            feature_columns.append(", ".join(col.resolved() for col in cols))
+        # TODO (SNOW-2396184): remove this check and the non-ASOF join path as ASOF join is enabled by default now.
         if self._asof_join_enabled is None:
             self._asof_join_enabled = self._is_asof_join_enabled()

         # TODO: leverage Snowpark dataframe for more concise syntax once it supports AsOfJoin
         query = spine_df.queries["queries"][-1]
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        join_keys: list[SqlIdentifier] = []
+
+        if join_method == "cte":
+
+            logger.info(f"Using the CTE method with {len(features)} feature views")
+
+            query = self._build_cte_query(
+                feature_views,
+                feature_columns,
+                spine_df.queries["queries"][-1],
+                spine_timestamp_col,
+                include_feature_view_timestamp_col,
+            )
+        else:
+            # Use sequential joins layer by layer
+            logger.info(f"Using the sequential join method with {len(features)} feature views")
+            layer = 0
+            for feature in features:
+                if isinstance(feature, FeatureViewSlice):
+                    cols = feature.names
+                    feature = feature.feature_view_ref
+                else:
+                    cols = feature.feature_names
+
+                join_keys = list({k for e in feature.entities for k in e.join_keys})
+                join_keys_str = ", ".join(join_keys)
+                assert feature.version is not None
+                join_table_name = feature.fully_qualified_name()
+
+                if spine_timestamp_col is not None and feature.timestamp_col is not None:
+                    if self._asof_join_enabled:
+                        if include_feature_view_timestamp_col:
+                            f_ts_col_alias = identifier.concat_names(
+                                [feature.name, "_", feature.version, "_", feature.timestamp_col]
+                            )
+                            f_ts_col_str = f"r_{layer}.{feature.timestamp_col} AS {f_ts_col_alias},"
+                        else:
+                            f_ts_col_str = ""
+                        query = f"""
+                            SELECT
+                                l_{layer}.*,
+                                {f_ts_col_str}
+                                r_{layer}.* EXCLUDE ({join_keys_str}, {feature.timestamp_col})
+                            FROM ({query}) l_{layer}
+                            ASOF JOIN (
+                                SELECT {join_keys_str}, {feature.timestamp_col},
+                                    {', '.join(col.resolved() for col in cols)}
+                                FROM {join_table_name}
+                            ) r_{layer}
+                            MATCH_CONDITION (l_{layer}.{spine_timestamp_col} >= r_{layer}.{feature.timestamp_col})
+                            ON {' AND '.join([f'l_{layer}.{k} = r_{layer}.{k}' for k in join_keys])}
+                        """
                     else:
-
+                        query = self._composed_union_window_join_query(
+                            layer=layer,
+                            s_query=query,
+                            s_ts_col=spine_timestamp_col,
+                            f_df=feature.feature_df,
+                            f_table_name=join_table_name,
+                            f_ts_col=feature.timestamp_col,
+                            join_keys=join_keys,
+                        )
+                else:
                     query = f"""
                         SELECT
                             l_{layer}.*,
-                            {
-                            r_{layer}.* EXCLUDE ({join_keys_str}, {f.timestamp_col})
+                            r_{layer}.* EXCLUDE ({join_keys_str})
                         FROM ({query}) l_{layer}
-
-                            SELECT {join_keys_str}, {
+                        LEFT JOIN (
+                            SELECT {join_keys_str}, {', '.join(col.resolved() for col in cols)}
                             FROM {join_table_name}
                         ) r_{layer}
-                        MATCH_CONDITION (l_{layer}.{spine_timestamp_col} >= r_{layer}.{f.timestamp_col})
                         ON {' AND '.join([f'l_{layer}.{k} = r_{layer}.{k}' for k in join_keys])}
                     """
-
-                query = self._composed_union_window_join_query(
-                    layer=layer,
-                    s_query=query,
-                    s_ts_col=spine_timestamp_col,
-                    f_df=f.feature_df,
-                    f_table_name=join_table_name,
-                    f_ts_col=f.timestamp_col,
-                    join_keys=join_keys,
-                )
-            else:
-                query = f"""
-                    SELECT
-                        l_{layer}.*,
-                        r_{layer}.* EXCLUDE ({join_keys_str})
-                    FROM ({query}) l_{layer}
-                    LEFT JOIN (
-                        SELECT {join_keys_str}, {', '.join(cols)}
-                        FROM {join_table_name}
-                    ) r_{layer}
-                    ON {' AND '.join([f'l_{layer}.{k} = r_{layer}.{k}' for k in join_keys])}
-                """
-            layer += 1
+                layer += 1

         # TODO: construct result dataframe with datframe APIs once ASOF join is supported natively.
         # Below code manually construct result dataframe from private members of spine dataframe, which
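For intuition, the SQL emitted by `_build_cte_query` for one timestamped and one timestamp-less feature view has roughly this shape (column and table names are invented for illustration; the `NAME$VERSION` table naming is assumed):

    WITH
    SPINE AS (
        SELECT * FROM (<spine query>)
    ),
    FV000 AS (
        SELECT SPINE_DEDUP.*, FEATURE.* EXCLUDE (USER_ID, TS)
        FROM (SELECT DISTINCT "USER_ID", "TS" FROM SPINE) SPINE_DEDUP
        ASOF JOIN (SELECT USER_ID, TS, F1 FROM DB.SCHEMA.FV_USER$V1) FEATURE
        MATCH_CONDITION (SPINE_DEDUP."TS" >= FEATURE."TS")
        ON SPINE_DEDUP."USER_ID" = FEATURE."USER_ID"
    ),
    FV001 AS (
        SELECT SPINE_DEDUP.*, FEATURE.* EXCLUDE (ITEM_ID)
        FROM (SELECT DISTINCT "ITEM_ID" FROM SPINE) SPINE_DEDUP
        LEFT JOIN (SELECT ITEM_ID, F2 FROM DB.SCHEMA.FV_ITEM$V1) FEATURE
        ON SPINE_DEDUP."ITEM_ID" = FEATURE."ITEM_ID"
    )
    SELECT SPINE.*, FV000.F1, FV001.F2
    FROM SPINE
    LEFT JOIN FV000 ON SPINE."USER_ID" = FV000."USER_ID" AND SPINE."TS" = FV000."TS"
    LEFT JOIN FV001 ON SPINE."ITEM_ID" = FV001."ITEM_ID"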
@@ -3213,7 +3443,7 @@ class FeatureStore:
         online_table_name = FeatureView._get_online_table_name(feature_view_name)

         fully_qualified_online_name = self._get_fully_qualified_name(online_table_name)
-        source_table_name = feature_view_name
+        source_table_name = self._get_fully_qualified_name(feature_view_name)

         # Extract join keys for PRIMARY KEY (preserve order and ensure unique)
         ordered_join_keys: list[str] = []
snowflake/ml/feature_store/feature_view.py
CHANGED

@@ -1,7 +1,7 @@
+"""Feature view module for Snowflake ML Feature Store."""
 from __future__ import annotations

 import json
-import logging
 import re
 import warnings
 from collections import OrderedDict
@@ -52,7 +52,7 @@ _RESULT_SCAN_QUERY_PATTERN = re.compile(
 class OnlineConfig:
     """Configuration for online feature storage."""

-    enable: bool =
+    enable: Optional[bool] = None
     target_lag: Optional[str] = None

     def __post_init__(self) -> None:
@@ -248,6 +248,7 @@ class FeatureView(lineage_node.LineageNode):
            - If `timestamp_col` is provided, it is added to the default clustering keys.
        online_config: Optional configuration for online storage. If provided with enable=True,
            online storage will be enabled. Defaults to None (no online storage).
+           NOTE: this feature is currently in Public Preview.
        _kwargs: reserved kwargs for system generated args. NOTE: DO NOT USE.

    Example::
@@ -289,8 +290,6 @@ class FeatureView(lineage_node.LineageNode):

    # noqa: DAR401
    """
-        if online_config is not None:
-            logging.warning("'online_config' is in private preview since 1.12.0. Do not use it in production.")

        self._name: SqlIdentifier = SqlIdentifier(name)
        self._entities: list[Entity] = entities
@@ -533,8 +532,15 @@ class FeatureView(lineage_node.LineageNode):
         return self._feature_desc

     @property
-    def online(self) -> bool:
-
+    def online(self) -> bool:  # noqa: DAR101
+        """Check if online storage is enabled for this feature view.
+
+        Returns:
+            True if online storage is enabled, False otherwise.
+        """
+        if self._online_config and self._online_config.enable is True:
+            return True
+        return False

     @property
     def online_config(self) -> Optional[OnlineConfig]:
snowflake/ml/fileset/stage_fs.py
CHANGED
@@ -1,5 +1,6 @@
 import inspect
 import logging
+import re
 import time
 from dataclasses import dataclass
 from typing import Any, Optional, Union, cast
@@ -27,6 +28,8 @@ _PRESIGNED_URL_LIFETIME_SEC = 14400
 # The threshold of when the presigned url should get refreshed before its expiration.
 _PRESIGNED_URL_HEADROOM_SEC = 3600

+# Regex pattern to match cloud storage prefixes (s3://, gcs://, azure://) and bucket/container name at start of string
+_CLOUD_PATH_PREFIX_PATTERN = re.compile(r"^(s3|gcs|azure)://[^/]+/", re.IGNORECASE)

 _PROJECT = "FileSet"

@@ -355,8 +358,16 @@ class SFStageFileSystem(fsspec.AbstractFileSystem):

         Returns:
             A string of the relative stage path.
+
+        Raises:
+            ValueError: If the stage path format is invalid.
         """
-
+        if stage_path.lower().startswith(self._stage.lower()):
+            return stage_path[len(self._stage) + 1 :]
+        elif match := _CLOUD_PATH_PREFIX_PATTERN.match(stage_path):
+            return stage_path[match.end() :]
+
+        raise ValueError(f"Invalid stage path: {stage_path}")

     def _add_file_info_helper(
         self,
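To illustrate the new fallback, a self-contained check of the prefix pattern (the bucket and file names are made up):

    import re

    _CLOUD_PATH_PREFIX_PATTERN = re.compile(r"^(s3|gcs|azure)://[^/]+/", re.IGNORECASE)

    path = "s3://my-bucket/data/part-000.parquet"
    match = _CLOUD_PATH_PREFIX_PATTERN.match(path)
    assert match is not None
    assert path[match.end():] == "data/part-000.parquet"  # scheme and bucket prefix stripped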
snowflake/ml/jobs/_utils/constants.py
CHANGED

@@ -56,8 +56,9 @@ ENABLE_HEALTH_CHECKS_ENV_VAR = "ENABLE_HEALTH_CHECKS"
 ENABLE_HEALTH_CHECKS = "false"

 # Job status polling constants
-JOB_POLL_INITIAL_DELAY_SECONDS =
+JOB_POLL_INITIAL_DELAY_SECONDS = 5
 JOB_POLL_MAX_DELAY_SECONDS = 30
+JOB_SPCS_TIMEOUT_SECONDS = 30

 # Log start and end messages
 LOG_START_MSG = "--------------------------------\nML job started\n--------------------------------"
@@ -73,6 +74,7 @@ COMMON_INSTANCE_FAMILIES = {
     "CPU_X64_XS": ComputeResources(cpu=1, memory=6),
     "CPU_X64_S": ComputeResources(cpu=3, memory=13),
     "CPU_X64_M": ComputeResources(cpu=6, memory=28),
+    "CPU_X64_SL": ComputeResources(cpu=14, memory=54),
     "CPU_X64_L": ComputeResources(cpu=28, memory=116),
     "HIGHMEM_X64_S": ComputeResources(cpu=6, memory=58),
 }
@@ -85,6 +87,7 @@ AWS_INSTANCE_FAMILIES = {
 }
 AZURE_INSTANCE_FAMILIES = {
     "HIGHMEM_X64_M": ComputeResources(cpu=28, memory=244),
+    "HIGHMEM_X64_SL": ComputeResources(cpu=92, memory=654),
     "HIGHMEM_X64_L": ComputeResources(cpu=92, memory=654),
     "GPU_NV_XS": ComputeResources(cpu=3, memory=26, gpu=1, gpu_type="T4"),
     "GPU_NV_SM": ComputeResources(cpu=32, memory=424, gpu=1, gpu_type="A10"),
@@ -92,7 +95,15 @@ AZURE_INSTANCE_FAMILIES = {
     "GPU_NV_3M": ComputeResources(cpu=44, memory=424, gpu=2, gpu_type="A100"),
     "GPU_NV_SL": ComputeResources(cpu=92, memory=858, gpu=4, gpu_type="A100"),
 }
+GCP_INSTANCE_FAMILIES = {
+    "HIGHMEM_X64_M": ComputeResources(cpu=28, memory=244),
+    "HIGHMEM_X64_SL": ComputeResources(cpu=92, memory=654),
+    "GPU_GCP_NV_L4_1_24G": ComputeResources(cpu=6, memory=28, gpu=1, gpu_type="L4"),
+    "GPU_GCP_NV_L4_4_24G": ComputeResources(cpu=44, memory=178, gpu=4, gpu_type="L4"),
+    "GPU_GCP_NV_A100_8_40G": ComputeResources(cpu=92, memory=654, gpu=8, gpu_type="A100"),
+}
 CLOUD_INSTANCE_FAMILIES = {
     SnowflakeCloudType.AWS: AWS_INSTANCE_FAMILIES,
     SnowflakeCloudType.AZURE: AZURE_INSTANCE_FAMILIES,
+    SnowflakeCloudType.GCP: GCP_INSTANCE_FAMILIES,
 }