wedata-feature-engineering 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- feature_store/constants/__init__.py +0 -0
- feature_store/constants/constants.py +28 -0
- feature_store/entities/__init__.py +0 -0
- feature_store/entities/column_info.py +117 -0
- feature_store/entities/data_type.py +92 -0
- feature_store/entities/environment_variables.py +55 -0
- feature_store/entities/feature.py +53 -0
- feature_store/entities/feature_column_info.py +64 -0
- feature_store/entities/feature_function.py +55 -0
- feature_store/entities/feature_lookup.py +179 -0
- feature_store/entities/feature_spec.py +454 -0
- feature_store/entities/feature_spec_constants.py +25 -0
- feature_store/entities/feature_table.py +164 -0
- feature_store/entities/feature_table_info.py +40 -0
- feature_store/entities/function_info.py +184 -0
- feature_store/entities/on_demand_column_info.py +44 -0
- feature_store/entities/source_data_column_info.py +21 -0
- feature_store/entities/training_set.py +134 -0
- feature_store/feature_table_client/__init__.py +0 -0
- feature_store/feature_table_client/feature_table_client.py +313 -0
- feature_store/spark_client/__init__.py +0 -0
- feature_store/spark_client/spark_client.py +286 -0
- feature_store/training_set_client/__init__.py +0 -0
- feature_store/training_set_client/training_set_client.py +196 -0
- {wedata_feature_engineering-0.1.0.dist-info → wedata_feature_engineering-0.1.2.dist-info}/METADATA +1 -1
- wedata_feature_engineering-0.1.2.dist-info/RECORD +30 -0
- wedata_feature_engineering-0.1.0.dist-info/RECORD +0 -6
- {wedata_feature_engineering-0.1.0.dist-info → wedata_feature_engineering-0.1.2.dist-info}/WHEEL +0 -0
- {wedata_feature_engineering-0.1.0.dist-info → wedata_feature_engineering-0.1.2.dist-info}/top_level.txt +0 -0
feature_store/spark_client/spark_client.py
ADDED
@@ -0,0 +1,286 @@

from collections import defaultdict
from typing import List

from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.catalog import Column
from pyspark.sql.functions import when, isnull
from pyspark.sql.types import StructType, StringType, StructField

from feature_store.entities.feature import Feature
from feature_store.entities.feature_table import FeatureTable
from feature_store.entities.function_info import FunctionParameterInfo, FunctionInfo
from feature_store.utils.common_utils import unsanitize_identifier
from feature_store.utils.utils import sanitize_multi_level_name


class SparkClient:
    def __init__(self, spark: SparkSession):
        self._spark = spark

    def createDataFrame(self, data, schema) -> DataFrame:
        return self._spark.createDataFrame(data, schema)

    def read_table(
        self, qualified_table_name, as_of_delta_timestamp=None, streaming=False
    ):
        """
        Reads a Delta table, optionally as of some timestamp.
        """
        if streaming and as_of_delta_timestamp:
            raise ValueError(
                "Internal error: as_of_delta_timestamp cannot be specified when"
                " streaming=True."
            )

        base_reader = (
            # By default, Structured Streaming only handles append operations. Because
            # we have a notion of primary keys, most offline feature store operations
            # are not appends. For example, FeatureStoreClient.write_table(mode=MERGE)
            # will issue a MERGE operation.
            # In order to propagate the non-append operations to the
            # readStream, we set ignoreChanges to "true".
            # For more information,
            # see https://docs.databricks.com/delta/delta-streaming.html#ignore-updates-and-deletes
            self._spark.readStream.format("delta").option("ignoreChanges", "true")
            if streaming
            else self._spark.read.format("delta")
        )

        if as_of_delta_timestamp:
            return base_reader.option("timestampAsOf", as_of_delta_timestamp).table(
                sanitize_multi_level_name(qualified_table_name)
            )
        else:
            return base_reader.table(sanitize_multi_level_name(qualified_table_name))

    def get_current_catalog(self):
        """
        Get the catalog currently set in the Spark context.
        """
        try:
            df = self._spark.sql("SELECT CURRENT_CATALOG()").collect()
            return unsanitize_identifier(df[0][0])
        except Exception:
            return None

    def get_current_database(self):
        """
        Get the database currently set in the Spark context.
        """
        try:
            df = self._spark.sql("SELECT CURRENT_DATABASE()").collect()
            return unsanitize_identifier(df[0][0])
        except Exception:
            return None

    # Note: this second definition of read_table shadows the Delta reader defined above.
    def read_table(self, table_name):
        """Read data from a Spark table.

        Args:
            table_name: Table name. Supported formats: catalog.schema.table,
                schema.table, or table.

        Returns:
            DataFrame: The table data.

        Raises:
            ValueError: If the table does not exist or reading fails.
        """
        try:
            # Parse the table name
            parts = table_name.split('.')
            if len(parts) == 3:
                catalog, schema, table = parts
            elif len(parts) == 2:
                schema, table = parts
            else:
                table = table_name

            # Verify that the table exists
            if not self._spark.catalog.tableExists(table):
                raise ValueError(f"Table does not exist: {table_name}")

            return self._spark.table(table)

        except Exception as e:
            raise ValueError(f"Failed to read table {table_name}: {str(e)}")

    def get_features(self, table_name):
        # Parse the table name
        parts = table_name.split('.')
        if len(parts) == 3:
            # For three-part names (catalog.schema.table), use the schema.table form
            _, schema, table = parts
            full_table_name = f"{schema}.{table}"
        elif len(parts) == 2:
            # For two-part names (schema.table), use the name as-is
            full_table_name = table_name
        else:
            # Bare table name: qualify it with the current database
            current_db = self.get_current_database()
            if not current_db:
                raise ValueError("Unable to determine the current database")
            full_table_name = f"{current_db}.{table_name}"

        # Query column metadata using the dbName.tableName format
        columns = self._spark.catalog.listColumns(tableName=full_table_name)
        return [
            Feature(
                feature_table=table_name,
                feature_id=f"{table_name}_{row.name}",
                name=row.name,
                data_type=row.dataType,
                description=row.description or ""
            ) for row in columns
        ]

    def get_online_stores(self, table_name):
        return None

    def get_feature_table(self, table_name):
        # Fetch the table metadata
        table = self._spark.catalog.getTable(table_name)

        parts = table_name.split('.')
        if len(parts) == 3:
            # For three-part names (catalog.schema.table), use only the table part
            table_to_describe = parts[2]
        elif len(parts) == 2:
            # For two-part names (schema.table), use only the table part
            table_to_describe = parts[1]
        else:
            # Bare table name: use it as-is
            table_to_describe = table_name
        # Fetch extended table details
        table_details = self._spark.sql(f"DESCRIBE TABLE EXTENDED {table_to_describe}").collect()

        table_properties = {}
        for row in table_details:
            if row.col_name == "Table Properties":
                props = row.data_type[1:-1].split(", ")
                table_properties = dict(p.split("=") for p in props if "=" in p)

        # Fetch the feature column metadata
        features = self.get_features(table_name)

        # Build the full FeatureTable object
        return FeatureTable(
            name=table_name,
            table_id=table_properties.get("table_id", table_name),
            description=table.description or table_properties.get("description", table_name),
            primary_keys=table_properties.get("primaryKeys", "").split(",") if table_properties.get("primaryKeys") else [],
            partition_columns=table.partitionColumnNames if hasattr(table, 'partitionColumnNames') else [],
            features=features,
            creation_timestamp=None,  # Spark table metadata does not include a creation timestamp
            online_stores=self.get_online_stores(table_name),
            notebook_producers=None,
            job_producers=None,
            table_data_sources=None,
            path_data_sources=None,
            custom_data_sources=None,
            timestamp_keys=table_properties.get("timestamp_keys"),
            tags=table_properties.get("tags")
        )

    def _get_routines_with_parameters(self, full_routine_names: List[str]) -> DataFrame:
        """
        Retrieve routines with their parameters from information_schema.routines and
        information_schema.parameters. The returned DataFrame only contains routines that
        1. exist and 2. the caller has GetFunction permission on.

        Note: The returned DataFrame contains the cartesian product of routines and parameters.
        For efficiency, routines table columns are only present in the first row for each routine.
        """
        routine_name_schema = StructType(
            [
                StructField("specific_catalog", StringType(), False),
                StructField("specific_schema", StringType(), False),
                StructField("specific_name", StringType(), False),
            ]
        )
        routine_names_df = self.createDataFrame(
            [full_routine_name.split(".") for full_routine_name in full_routine_names],
            routine_name_schema,
        )
        routines_table = self.read_table(
            "system.information_schema.routines"
        )
        parameters_table = self.read_table(
            "system.information_schema.parameters"
        )

        # Inner join routines table to filter out non-existent routines.
        # Left join parameters as routines may have no parameters.
        full_routines_with_parameters_df = routine_names_df.join(
            routines_table, on=routine_names_df.columns, how="inner"
        ).join(parameters_table, on=routine_names_df.columns, how="left")

        # Return only relevant metadata from information_schema, sorted by routine name + parameter order.
        # For efficiency, only preserve routine column values in the first of each routine's result rows.
        # The first row will have parameters.ordinal_position equal to None (no parameters) or 0 (first parameter).
        def select_if_first_row(col: Column) -> Column:
            return when(
                isnull(parameters_table.ordinal_position)
                | (parameters_table.ordinal_position == 0),
                col,
            ).otherwise(None)

        return full_routines_with_parameters_df.select(
            routine_names_df.columns
            + [
                select_if_first_row(routines_table.routine_definition).alias(
                    "routine_definition"
                ),
                select_if_first_row(routines_table.external_language).alias(
                    "external_language"
                ),
                parameters_table.ordinal_position,
                parameters_table.parameter_name,
                parameters_table.full_data_type,
            ]
        ).sort(routine_names_df.columns + [parameters_table.ordinal_position])

    def get_functions(self, full_function_names: List[str]) -> List[FunctionInfo]:
        """
        Retrieves and maps Unity Catalog functions' metadata as FunctionInfos.
        """
        # Avoid unnecessary Spark calls and return if empty.
        if not full_function_names:
            return []

        # Collect dict of routine name -> DataFrame rows describing the routine.
        routines_with_parameters_df = self._get_routines_with_parameters(
            full_routine_names=full_function_names
        )
        routine_infos = defaultdict(list)
        for r in routines_with_parameters_df.collect():
            routine_name = f"{r.specific_catalog}.{r.specific_schema}.{r.specific_name}"
            routine_infos[routine_name].append(r)

        # Mock GetFunction DNE error, since information_schema does not throw.
        for function_name in full_function_names:
            if function_name not in routine_infos:
                raise ValueError(f"Function '{function_name}' does not exist.")

        # Map routine_infos into FunctionInfos.
        function_infos = []
        for function_name in full_function_names:
            routine_info = routine_infos[function_name][0]
            input_params = [
                FunctionParameterInfo(name=p.parameter_name, type_text=p.full_data_type)
                for p in routine_infos[function_name]
                if p.ordinal_position is not None
            ]
            function_infos.append(
                FunctionInfo(
                    full_name=function_name,
                    input_params=input_params,
                    routine_definition=routine_info.routine_definition,
                    external_language=routine_info.external_language,
                )
            )
        return function_infos
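For orientation, a minimal usage sketch of the SparkClient added above. It is not part of the package: the SparkSession setup, the table name default.user_features, and the printed Feature attributes (name, data_type, taken from the Feature constructor arguments) are assumptions for illustration only.

# Hypothetical usage sketch of SparkClient; not shipped with the package.
from pyspark.sql import SparkSession

from feature_store.spark_client.spark_client import SparkClient

spark = SparkSession.builder.appName("feature-store-demo").getOrCreate()
client = SparkClient(spark)

# Read a feature table by name (assumed example table "default.user_features").
df = client.read_table("default.user_features")

# List column-level feature metadata derived from the Spark catalog.
for feature in client.get_features("default.user_features"):
    print(feature.name, feature.data_type)  # attribute names assumed from the Feature constructor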
feature_store/training_set_client/training_set_client.py
ADDED
@@ -0,0 +1,196 @@

import json
import logging
import os
from collections import defaultdict
from types import ModuleType
from typing import Any, Dict, List, Optional, Set, Union

import mlflow
import yaml
from mlflow.models import Model, ModelSignature
from mlflow.utils.file_utils import TempDir, YamlSafeDumper, read_yaml
from pyspark.sql import DataFrame
from pyspark.sql.functions import struct

from feature_store.entities.feature_column_info import FeatureColumnInfo
from feature_store.entities.feature_function import FeatureFunction
from feature_store.entities.feature_lookup import FeatureLookup
from feature_store.entities.feature_spec import FeatureSpec
from feature_store.entities.training_set import TrainingSet
from feature_store.spark_client.spark_client import SparkClient

from feature_store.constants.constants import (
    _NO_RESULT_TYPE_PASSED,
    _PREBUILT_ENV_URI,
    _USE_SPARK_NATIVE_JOIN,
    _WARN,
    MODEL_DATA_PATH_ROOT,
    PREDICTION_COLUMN_NAME,
)

from feature_store.utils import common_utils, training_set_utils
from feature_store.utils.feature_spec_utils import convert_to_yaml_string

_logger = logging.getLogger(__name__)

FEATURE_SPEC_GRAPH_MAX_COLUMN_INFO = 1000


class TrainingSetClient:
    def __init__(
        self,
        spark_client: SparkClient
    ):
        self._spark_client = spark_client

    def create_training_set(
        self,
        feature_spec: FeatureSpec,
        feature_column_infos: List[FeatureColumnInfo],
        label_names: List[str],
        df: DataFrame,
        ft_metadata: training_set_utils._FeatureTableMetadata,
        kwargs,
    ):
        uc_function_infos = training_set_utils.get_uc_function_infos(
            self._spark_client,
            {odci.udf_name for odci in feature_spec.on_demand_column_infos},
        )

        # TODO(divyagupta-db): Move validation from _validate_join_feature_data in feature_lookup_utils.py
        # to a helper function called here and in score_batch.

        # Add consumer of each feature and instrument as final step
        consumer_feature_table_map = defaultdict(list)
        for feature in feature_column_infos:
            consumer_feature_table_map[feature.table_name].append(feature.feature_name)
        consumed_udf_names = [f.udf_name for f in feature_spec.function_infos]

        # Spark query planning is known to crash the Spark driver if there are many feature tables to
        # point-in-time join, so native Spark join is disabled by default.
        # See https://docs.google.com/document/d/1EyA4vvlWikTJMeinsLkxmRAVNlXoF1eqoZElOdqlWyY/edit
        training_set_utils.warn_if_non_photon_for_native_spark(
            kwargs.get(_USE_SPARK_NATIVE_JOIN, False), self._spark_client
        )
        return TrainingSet(
            feature_spec,
            df,
            label_names,
            ft_metadata.feature_table_metadata_map,
            ft_metadata.feature_table_data_map,
            uc_function_infos,
            kwargs.get(_USE_SPARK_NATIVE_JOIN, False),
        )

    def create_training_set_from_feature_lookups(
        self,
        df: DataFrame,
        feature_lookups: List[Union[FeatureLookup, FeatureFunction]],
        label: Union[str, List[str], None],
        exclude_columns: List[str],
        **kwargs,
    ) -> TrainingSet:

        # Split the requested features into FeatureLookups and FeatureFunctions
        features = feature_lookups
        feature_lookups = [f for f in features if isinstance(f, FeatureLookup)]
        feature_functions = [f for f in features if isinstance(f, FeatureFunction)]

        # If no label is provided, initialize label_names as an empty list
        label_names = common_utils.as_list(label, [])
        del label

        # Validate the DataFrame and the labels
        training_set_utils.verify_df_and_labels(df, label_names, exclude_columns)

        # Fetch the feature table metadata
        ft_metadata = training_set_utils.get_table_metadata(
            self._spark_client,
            {fl.table_name for fl in feature_lookups}
        )

        column_infos = training_set_utils.get_column_infos(
            feature_lookups,
            feature_functions,
            ft_metadata,
            df_columns=df.columns,
            label_names=label_names,
        )

        training_set_utils.validate_column_infos(
            self._spark_client,
            ft_metadata,
            column_infos.source_data_column_infos,
            column_infos.feature_column_infos,
            column_infos.on_demand_column_infos,
            label_names,
        )

        # Build feature_spec locally for comparison with the feature spec yaml generated by the
        # FeatureStore backend. This will be removed once the migration is validated.
        feature_spec = training_set_utils.build_feature_spec(
            feature_lookups,
            ft_metadata,
            column_infos,
            exclude_columns
        )

        return self.create_training_set(
            feature_spec,
            column_infos.feature_column_infos,
            label_names,
            df,
            ft_metadata,
            kwargs=kwargs,
        )

    def create_feature_spec(
        self,
        name: str,
        features: List[Union[FeatureLookup, FeatureFunction]],
        sparkClient: SparkClient,
        exclude_columns: List[str] = [],
    ) -> FeatureSpec:

        feature_lookups = [f for f in features if isinstance(f, FeatureLookup)]
        feature_functions = [f for f in features if isinstance(f, FeatureFunction)]

        # A maximum of 100 FeatureFunctions is supported
        if len(feature_functions) > training_set_utils.MAX_FEATURE_FUNCTIONS:
            raise ValueError(
                f"A maximum of {training_set_utils.MAX_FEATURE_FUNCTIONS} FeatureFunctions are supported."
            )

        # Get feature table metadata and column infos
        ft_metadata = training_set_utils.get_table_metadata(
            self._spark_client,
            {fl.table_name for fl in feature_lookups}
        )
        column_infos = training_set_utils.get_column_infos(
            feature_lookups,
            feature_functions,
            ft_metadata,
        )

        column_infos = training_set_utils.add_inferred_source_columns(column_infos)

        training_set_utils.validate_column_infos(
            self._spark_client,
            ft_metadata,
            column_infos.source_data_column_infos,
            column_infos.feature_column_infos,
            column_infos.on_demand_column_infos,
        )

        feature_spec = training_set_utils.build_feature_spec(
            feature_lookups,
            ft_metadata,
            column_infos,
            exclude_columns
        )

        return feature_spec
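A hedged sketch of how TrainingSetClient.create_training_set_from_feature_lookups might be called. The FeatureLookup keyword arguments (table_name, feature_names, lookup_key) follow the Databricks-style API this package mirrors and are assumptions here, as are the example table, feature columns, and label DataFrame.

# Hypothetical usage sketch; FeatureLookup argument names are assumed, not confirmed by this diff.
from pyspark.sql import SparkSession

from feature_store.entities.feature_lookup import FeatureLookup
from feature_store.spark_client.spark_client import SparkClient
from feature_store.training_set_client.training_set_client import TrainingSetClient

spark = SparkSession.builder.getOrCreate()
client = TrainingSetClient(SparkClient(spark))

# Assumed example label DataFrame: one row per user_id plus the label column.
label_df = spark.createDataFrame([(1, 0), (2, 1)], ["user_id", "label"])

lookups = [
    FeatureLookup(
        table_name="default.user_features",       # assumed example feature table
        feature_names=["age", "purchase_count"],  # assumed example feature columns
        lookup_key="user_id",                     # keyword names assumed from the Databricks-style API
    )
]

training_set = client.create_training_set_from_feature_lookups(
    df=label_df,
    feature_lookups=lookups,
    label="label",
    exclude_columns=[],
)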
wedata_feature_engineering-0.1.2.dist-info/RECORD
ADDED
@@ -0,0 +1,30 @@

feature_store/__init__.py,sha256=CP3YAMoy3pSTWRYzTza_CYBnGbTv_KzycVEBMQCeiD8,101
feature_store/client.py,sha256=FG1xK460rD859iSY4VA75XeYhqStJD8Wlr0sRxk25LI,5267
feature_store/constants/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
feature_store/constants/constants.py,sha256=exW3kiFLDyCmU9cYHFjcvIQhPWEpFtkogLXeB9Arfd8,827
feature_store/entities/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
feature_store/entities/column_info.py,sha256=WezowI46YHDym5ZlbhCJDqhKbVcjXjnjt7dQdy3XqYM,4164
feature_store/entities/data_type.py,sha256=VpHS6Fr3TphQQ8NbAcEnDJ-8eOZV6ivYuWxv3pAM2RM,3394
feature_store/entities/environment_variables.py,sha256=ZEFml5H9MQuzBKM074mUrFYu-Sga4Knmxqiwpke2WGc,1679
feature_store/entities/feature.py,sha256=wX8fTBlJq3GYdj9rrBDCY3kFgcVBBAiOOZdxEhnQkNQ,1241
feature_store/entities/feature_column_info.py,sha256=-TGxRafYUaNKe0YzHus2XbfRaVrMv7pcffMdbtTT4nA,2031
feature_store/entities/feature_function.py,sha256=R17INrCE-U_Uj9KLbFz69aYlOkTETTwQHMMo470F4lQ,1865
feature_store/entities/feature_lookup.py,sha256=zUDMdDIboitOffYRZlurf_O_4UeBPmE5YS0PyCS2Fqg,7912
feature_store/entities/feature_spec.py,sha256=F4MiKEyvKZSBh6Uv7V4vVLbamZ9fRClaC3HCrUeynDE,20079
feature_store/entities/feature_spec_constants.py,sha256=YWDBfRiNDe6fUJFUBo3V4WYg2xsljoPAE-ZejfFZCgM,785
feature_store/entities/feature_table.py,sha256=4ghopIvJcoIlyFiSEuTkOcDWn88c1Kt6q5LWM4BYEHI,6073
feature_store/entities/feature_table_info.py,sha256=2vUaVdW_jw1dRAlmJWvBRueuMeuqWu_NYB9SlxLI7Uw,1126
feature_store/entities/function_info.py,sha256=l0kmiq2R_QNfSMJ7y0xZohlMiemgYSr1dN5vzV8ijIs,7314
feature_store/entities/on_demand_column_info.py,sha256=Eh5ieaj1TxC7DG6ipBZzH2ZyY0bwkLrDOkuZjgYr4gY,1297
feature_store/entities/source_data_column_info.py,sha256=a9jQOJvehwDIrKPwsP6W9YRBSPNK2nZYypE6-p80CwA,542
feature_store/entities/training_set.py,sha256=9H2uGnUxTAsk93Om50QxRELbeFCocwGMze2VexPVJWI,5569
feature_store/feature_table_client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
feature_store/feature_table_client/feature_table_client.py,sha256=uir33K7oigrSnjTT6VbNOp0Nb22-X3JHd1_92kWjrow,10754
feature_store/spark_client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
feature_store/spark_client/spark_client.py,sha256=vd-NCE9IGC0Ygqr-QSVY0teuWsQSkq_BFV4Mn6xMMNU,11578
feature_store/training_set_client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
feature_store/training_set_client/training_set_client.py,sha256=Aa80xVXVE1KBdgplL9qqR8ftD5A5r2pfBttAhmySrB0,6696
wedata_feature_engineering-0.1.2.dist-info/METADATA,sha256=IALf_mmflM-eRTOIOqVDJg5OoVVfLXBUHofIdC1T_wI,493
wedata_feature_engineering-0.1.2.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
wedata_feature_engineering-0.1.2.dist-info/top_level.txt,sha256=15761LgVdJ7tJWbdlYk0EZ560G9k6C4TE42dfLx8d0I,14
wedata_feature_engineering-0.1.2.dist-info/RECORD,,
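Per the wheel RECORD format, each line pairs a file path with a urlsafe-Base64-encoded SHA-256 digest (trailing '=' padding stripped) and the file size in bytes; the RECORD file itself omits its own hash and size. A small sketch of how such an entry can be reproduced for verification (the example path assumes an unpacked wheel in the current directory):

import base64
import hashlib
from pathlib import Path

def record_entry(path: str) -> str:
    # Format the entry the way RECORD does: path,sha256=<urlsafe-b64 digest>,<size in bytes>
    data = Path(path).read_bytes()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=")
    return f"{path},sha256={digest.decode()},{len(data)}"

print(record_entry("feature_store/client.py"))  # compare against the corresponding line above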
wedata_feature_engineering-0.1.0.dist-info/RECORD
REMOVED
@@ -1,6 +0,0 @@

feature_store/__init__.py,sha256=CP3YAMoy3pSTWRYzTza_CYBnGbTv_KzycVEBMQCeiD8,101
feature_store/client.py,sha256=FG1xK460rD859iSY4VA75XeYhqStJD8Wlr0sRxk25LI,5267
wedata_feature_engineering-0.1.0.dist-info/METADATA,sha256=gTLzZnQR7SI9-sp0NWq8JYYwF1edzt2iodypP9mkvNk,493
wedata_feature_engineering-0.1.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
wedata_feature_engineering-0.1.0.dist-info/top_level.txt,sha256=15761LgVdJ7tJWbdlYk0EZ560G9k6C4TE42dfLx8d0I,14
wedata_feature_engineering-0.1.0.dist-info/RECORD,,
{wedata_feature_engineering-0.1.0.dist-info → wedata_feature_engineering-0.1.2.dist-info}/WHEEL
RENAMED
File without changes

{wedata_feature_engineering-0.1.0.dist-info → wedata_feature_engineering-0.1.2.dist-info}/top_level.txt
RENAMED
File without changes