tencent-wedata-feature-engineering-dev 0.1.50__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
Potentially problematic release: this version of tencent-wedata-feature-engineering-dev might be problematic.
- {tencent_wedata_feature_engineering_dev-0.1.50.dist-info → tencent_wedata_feature_engineering_dev-0.2.0.dist-info}/METADATA +10 -8
- tencent_wedata_feature_engineering_dev-0.2.0.dist-info/RECORD +46 -0
- {tencent_wedata_feature_engineering_dev-0.1.50.dist-info → tencent_wedata_feature_engineering_dev-0.2.0.dist-info}/WHEEL +1 -1
- wedata/feature_store/client.py +28 -92
- wedata/feature_store/constants/constants.py +2 -5
- wedata/feature_store/entities/feature_lookup.py +0 -17
- wedata/feature_store/entities/feature_spec.py +2 -2
- wedata/feature_store/entities/feature_table.py +1 -5
- wedata/feature_store/entities/function_info.py +4 -1
- wedata/feature_store/feature_table_client/feature_table_client.py +53 -528
- wedata/feature_store/spark_client/spark_client.py +15 -41
- wedata/feature_store/training_set_client/training_set_client.py +10 -9
- wedata/feature_store/utils/common_utils.py +4 -48
- wedata/feature_store/utils/feature_lookup_utils.py +43 -37
- wedata/feature_store/utils/feature_spec_utils.py +1 -1
- wedata/feature_store/utils/uc_utils.py +1 -1
- tencent_wedata_feature_engineering_dev-0.1.50.dist-info/RECORD +0 -66
- wedata/feature_store/cloud_sdk_client/__init__.py +0 -0
- wedata/feature_store/cloud_sdk_client/client.py +0 -108
- wedata/feature_store/cloud_sdk_client/models.py +0 -686
- wedata/feature_store/cloud_sdk_client/utils.py +0 -39
- wedata/feature_store/common/log/__init__.py +0 -0
- wedata/feature_store/common/log/logger.py +0 -40
- wedata/feature_store/common/store_config/__init__.py +0 -0
- wedata/feature_store/common/store_config/redis.py +0 -48
- wedata/feature_store/constants/engine_types.py +0 -34
- wedata/feature_store/feast_client/__init__.py +0 -0
- wedata/feature_store/feast_client/feast_client.py +0 -487
- wedata/feature_store/utils/env_utils.py +0 -108
- wedata/tempo/__init__.py +0 -0
- wedata/tempo/interpol.py +0 -448
- wedata/tempo/intervals.py +0 -1331
- wedata/tempo/io.py +0 -61
- wedata/tempo/ml.py +0 -129
- wedata/tempo/resample.py +0 -318
- wedata/tempo/tsdf.py +0 -1720
- wedata/tempo/utils.py +0 -254
- {tencent_wedata_feature_engineering_dev-0.1.50.dist-info → tencent_wedata_feature_engineering_dev-0.2.0.dist-info}/top_level.txt +0 -0
wedata/feature_store/utils/env_utils.py DELETED
@@ -1,108 +0,0 @@
-import os
-
-
-class EnvironmentError(Exception):
-    pass
-
-
-def get_project_id() -> str:
-    """
-    Get the current project ID.
-
-    Returns:
-        str: the project ID
-
-    Raises:
-        EnvironmentError: if the environment variable WEDATA_PROJECT_ID is not set
-    """
-    project_id = os.environ.get("WEDATA_PROJECT_ID")
-    if project_id:
-        return project_id
-    raise EnvironmentError("environment variable WEDATA_PROJECT_ID is not set, please check environment configuration")
-
-
-def get_cloud_secret() -> (str, str):
-    """
-    Get the cloud temporary credentials.
-
-    Returns:
-        tuple: (secret_id, secret_key)
-    """
-    secret_id = os.environ.get("WEDATA_CLOUD_TEMP_SECRET_ID")
-    secret_key = os.environ.get("WEDATA_CLOUD_TEMP_SECRET_KEY")
-    return secret_id, secret_key
-
-
-def get_region() -> str:
-    """
-    Get the current region.
-    """
-    region_dlc = os.environ.get("DLC_REGION")
-    region_emr = os.environ.get("KERNEL_REGION")
-    region = region_dlc if region_dlc else region_emr
-    if not region:
-        raise EnvironmentError("environment variable DLC_REGION or KERNEL_REGION is not set, "
-                               "please check environment configuration")
-    return region
-
-
-def get_database_name(database_name: str) -> str:
-    """
-    Resolve the feature store database name.
-
-    Args:
-        database_name: database name supplied by the caller
-
-    Returns:
-        str: the resolved database name
-
-    Raises:
-        EnvironmentError: if neither database_name nor WEDATA_DEFAULT_FEATURE_STORE_DATABASE is set
-    """
-    feature_store_database_name = os.environ.get("WEDATA_DEFAULT_FEATURE_STORE_DATABASE")
-    if database_name:
-        return database_name
-    elif feature_store_database_name:
-        return feature_store_database_name
-    raise EnvironmentError("environment variable WEDATA_DEFAULT_FEATURE_STORE_DATABASE is not set, "
-                           "please check environment configuration")
-
-
-def get_engine_name() -> str:
-    """
-    Get the engine name.
-    """
-    # DLC is the special case, so check for DLC first and fall back to EMR otherwise
-    if get_engine_type() == "DLC":
-        return _get_variable("KERNEL_ENGINE")
-    return _get_variable("KERNEL_ENGINE_NAME")
-
-
-def get_engine_id() -> str:
-    """
-    Get the engine ID.
-    """
-    return _get_variable("KERNEL_ENGINE")
-
-
-def get_engine_type() -> str:
-    """
-    Determine the engine type.
-    """
-    return "DLC" if os.environ.get("DLC_REGION") else "EMR"
-
-
-def get_feast_remote_url() -> str:
-    """
-    Get the Feast remote URL.
-    """
-    return _get_variable("KERNEL_FEAST_REMOTE_ADDRESS")
-
-
-def _get_variable(variable_key: str, is_raise: bool = True, default_value: str = None) -> str:
-    val = os.environ.get(variable_key, default_value)
-    if not val:
-        if is_raise:
-            raise EnvironmentError(f"environment variable {variable_key} is not set, "
-                                   f"please check environment configuration")
-    return val
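Version 0.2.0 removes this module entirely; in 0.1.50 it was the single place where runtime configuration was read from environment variables. A minimal usage sketch against the removed 0.1.50 API follows (the variable values and the setdefault calls are illustrative assumptions, not part of the package):

# Sketch: exercising the removed env_utils helpers from 0.1.50.
# The values below are placeholders; real deployments inject these variables.
import os

os.environ.setdefault("WEDATA_PROJECT_ID", "demo-project")
os.environ.setdefault("DLC_REGION", "ap-guangzhou")
os.environ.setdefault("WEDATA_DEFAULT_FEATURE_STORE_DATABASE", "feature_db")

from wedata.feature_store.utils import env_utils

print(env_utils.get_project_id())       # "demo-project"
print(env_utils.get_engine_type())      # "DLC", because DLC_REGION is set
print(env_utils.get_database_name(""))  # falls back to WEDATA_DEFAULT_FEATURE_STORE_DATABASE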
wedata/tempo/__init__.py DELETED
File without changes
wedata/tempo/interpol.py DELETED
@@ -1,448 +0,0 @@
-from __future__ import annotations
-
-from typing import Callable, List, Optional, Union
-
-from pyspark.sql.dataframe import DataFrame
-import pyspark.sql.functions as sfn
-from pyspark.sql.types import NumericType
-from pyspark.sql.window import Window
-
-import wedata.tempo.resample as t_resample
-import wedata.tempo.tsdf as t_tsdf
-import wedata.tempo.utils as t_utils
-
-# Interpolation fill options
-method_options = ["zero", "null", "bfill", "ffill", "linear"]
-
-
-class Interpolation:
-    def __init__(self, is_resampled: bool):
-        self.is_resampled = is_resampled
-
-    def __validate_fill(self, method: str) -> None:
-        """
-        Validate if the fill provided is within the allowed list of values.
-
-        :param method: Fill type e.g. "zero", "null", "bfill", "ffill", "linear"
-        """
-        if method not in method_options:
-            raise ValueError(
-                f"Please select from one of the following fill options: {method_options}"
-            )
-
-    def __validate_col(
-        self,
-        df: DataFrame,
-        partition_cols: Optional[List[str]],
-        target_cols: List[str],
-        ts_col: str,
-        ts_col_dtype: Optional[str] = None,  # NB: added for testing purposes only
-    ) -> None:
-        """
-        Validate that target columns exist and are of numeric type, and that partition columns exist.
-
-        :param df: DataFrame to be validated
-        :param partition_cols: Partition columns to be validated
-        :param target_cols: Target columns to be validated
-        :param ts_col: Timestamp column to be validated
-        """
-
-        if partition_cols is not None:
-            for column in partition_cols:
-                if column not in str(df.columns):
-                    raise ValueError(
-                        f"Partition Column: '{column}' does not exist in DataFrame."
-                    )
-        for column in target_cols:
-            if column not in str(df.columns):
-                raise ValueError(
-                    f"Target Column: '{column}' does not exist in DataFrame."
-                )
-
-        if ts_col not in str(df.columns):
-            raise ValueError(
-                f"Timestamp Column: '{ts_col}' does not exist in DataFrame."
-            )
-
-        if ts_col_dtype is None:
-            ts_col_dtype = df.select(ts_col).dtypes[0][1]
-        if ts_col_dtype != "timestamp":
-            raise ValueError("Timestamp Column needs to be of timestamp type.")
-
-    def __calc_linear_spark(
-        self, df: DataFrame, ts_col: str, target_col: str
-    ) -> DataFrame:
-        """
-        Native Spark function for calculating linear interpolation on a DataFrame.
-
-        :param df: prepared dataframe to be interpolated
-        :param ts_col: timeseries column name
-        :param target_col: column to be interpolated
-        """
-        interpolation_expr = f"""
-        case when is_interpolated_{target_col} = false then {target_col}
-            when {target_col} is null then
-            (next_null_{target_col} - previous_{target_col})
-            /(unix_timestamp(next_timestamp_{target_col})-unix_timestamp(previous_timestamp_{target_col}))
-            *(unix_timestamp({ts_col}) - unix_timestamp(previous_timestamp_{target_col}))
-            + previous_{target_col}
-        else
-            (next_{target_col}-{target_col})
-            /(unix_timestamp(next_timestamp)-unix_timestamp(previous_timestamp))
-            *(unix_timestamp({ts_col}) - unix_timestamp(previous_timestamp))
-            + {target_col}
-        end as {target_col}
-        """
-
-        # remove target column to avoid duplication during interpolation expression
-        cols: List[str] = df.columns
-        cols.remove(target_col)
-        interpolated: DataFrame = df.selectExpr(*cols, interpolation_expr)
-        # Preserve column order
-        return interpolated.select(*df.columns)
-
-    def _is_valid_method_for_column(
-        self, series: DataFrame, method: str, col_name: str
-    ) -> bool:
-        """
-        zero and linear interpolation are only valid for numeric columns
-        """
-        if method in ["linear", "zero"]:
-            return isinstance(series.schema[col_name].dataType, NumericType)
-        else:
-            return True
-
-    def __interpolate_column(
-        self,
-        series: DataFrame,
-        ts_col: str,
-        target_col: str,
-        method: str,
-    ) -> DataFrame:
-        """
-        Apply interpolation to column.
-
-        :param series: input DataFrame
-        :param ts_col: timestamp column name
-        :param target_col: column to interpolate
-        :param method: interpolation function to fill missing values
-        """
-
-        if not self._is_valid_method_for_column(series, method, target_col):
-            raise ValueError(
-                f"Interpolation method '{method}' is not supported for column "
-                f"'{target_col}' of type '{series.schema[target_col].dataType}'. "
-                f"Only NumericType columns are supported."
-            )
-
-        output_df: DataFrame = series
-
-        # create new column for if target column is interpolated
-        flag_expr = f"""
-        CASE WHEN {target_col} is null and is_ts_interpolated = false THEN true
-            WHEN is_ts_interpolated = true THEN true
-            ELSE false
-        END AS is_interpolated_{target_col}
-        """
-        output_df = output_df.withColumn(
-            f"is_interpolated_{target_col}", sfn.expr(flag_expr)
-        )
-
-        # Handle zero fill
-        if method == "zero":
-            output_df = output_df.withColumn(
-                target_col,
-                sfn.when(
-                    sfn.col(f"is_interpolated_{target_col}") == False,  # noqa: E712
-                    sfn.col(target_col),
-                ).otherwise(sfn.lit(0)),
-            )
-
-        # Handle null fill
-        if method == "null":
-            output_df = output_df.withColumn(
-                target_col,
-                sfn.when(
-                    sfn.col(f"is_interpolated_{target_col}") == False,  # noqa: E712
-                    sfn.col(target_col),
-                ).otherwise(None),
-            )
-
-        # Handle forward fill
-        if method == "ffill":
-            output_df = output_df.withColumn(
-                target_col,
-                sfn.when(
-                    sfn.col(f"is_interpolated_{target_col}") == True,  # noqa: E712
-                    sfn.col(f"previous_{target_col}"),
-                ).otherwise(sfn.col(target_col)),
-            )
-        # Handle backwards fill
-        if method == "bfill":
-            output_df = output_df.withColumn(
-                target_col,
-                # Handle case when subsequent value is null
-                sfn.when(
-                    (sfn.col(f"is_interpolated_{target_col}") == True)  # noqa: E712
-                    & (
-                        sfn.col(f"next_{target_col}").isNull()
-                        & (sfn.col(f"{ts_col}_{target_col}").isNull())
-                    ),
-                    sfn.col(f"next_null_{target_col}"),
-                ).otherwise(
-                    # Handle standard backwards fill
-                    sfn.when(
-                        sfn.col(f"is_interpolated_{target_col}") == True,  # noqa: E712
-                        sfn.col(f"next_{target_col}"),
-                    ).otherwise(sfn.col(f"{target_col}"))
-                ),
-            )
-
-        # Handle linear fill
-        if method == "linear":
-            output_df = self.__calc_linear_spark(
-                output_df,
-                ts_col,
-                target_col,
-            )
-
-        return output_df
-
-    def __generate_time_series_fill(
-        self, df: DataFrame, partition_cols: Optional[List[str]], ts_col: str
-    ) -> DataFrame:
-        """
-        Create additional timeseries columns for previous and next timestamps
-
-        :param df: input DataFrame
-        :param partition_cols: partition column names
-        :param ts_col: timestamp column name
-        """
-        return df.withColumn(
-            "previous_timestamp",
-            sfn.col(ts_col),
-        ).withColumn(
-            "next_timestamp",
-            sfn.lead(df[ts_col]).over(
-                Window.partitionBy(*partition_cols).orderBy(ts_col)
-            ),
-        )
-
-    def __generate_column_time_fill(
-        self,
-        df: DataFrame,
-        partition_cols: Optional[List[str]],
-        ts_col: str,
-        target_col: str,
-    ) -> DataFrame:
-        """
-        Create timeseries columns for previous and next timestamps for a specific target column
-
-        :param df: input DataFrame
-        :param partition_cols: partition column names
-        :param ts_col: timestamp column name
-        :param target_col: target column name
-        """
-        window = Window
-        if partition_cols is not None:
-            window = Window.partitionBy(*partition_cols)
-
-        return df.withColumn(
-            f"previous_timestamp_{target_col}",
-            sfn.last(sfn.col(f"{ts_col}_{target_col}"), ignorenulls=True).over(
-                window.orderBy(ts_col).rowsBetween(Window.unboundedPreceding, 0)
-            ),
-        ).withColumn(
-            f"next_timestamp_{target_col}",
-            sfn.last(sfn.col(f"{ts_col}_{target_col}"), ignorenulls=True).over(
-                window.orderBy(sfn.col(ts_col).desc()).rowsBetween(
-                    Window.unboundedPreceding, 0
-                )
-            ),
-        )
-
-    def __generate_target_fill(
-        self,
-        df: DataFrame,
-        partition_cols: Optional[List[str]],
-        ts_col: str,
-        target_col: str,
-    ) -> DataFrame:
-        """
-        Create columns for previous and next value for a specific target column
-
-        :param df: input DataFrame
-        :param partition_cols: partition column names
-        :param ts_col: timestamp column name
-        :param target_col: target column name
-        """
-        window = Window
-
-        if partition_cols is not None:
-            window = Window.partitionBy(*partition_cols)
-        return (
-            df.withColumn(
-                f"previous_{target_col}",
-                sfn.last(df[target_col], ignorenulls=True).over(
-                    window.orderBy(ts_col).rowsBetween(Window.unboundedPreceding, 0)
-                ),
-            )
-            # Handle if subsequent value is null
-            .withColumn(
-                f"next_null_{target_col}",
-                sfn.last(df[target_col], ignorenulls=True).over(
-                    window.orderBy(sfn.col(ts_col).desc()).rowsBetween(
-                        Window.unboundedPreceding, 0
-                    )
-                ),
-            ).withColumn(
-                f"next_{target_col}",
-                sfn.lead(df[target_col]).over(window.orderBy(ts_col)),
-            )
-        )
-
-    def interpolate(
-        self,
-        tsdf: t_tsdf.TSDF,
-        ts_col: str,
-        partition_cols: Optional[List[str]],
-        target_cols: List[str],
-        freq: Optional[str],
-        func: Optional[Union[Callable | str]],
-        method: str,
-        show_interpolated: bool,
-        perform_checks: bool = True,
-    ) -> DataFrame:
-        """
-        Apply interpolation function.
-
-        :param tsdf: input TSDF
-        :param ts_col: timestamp column name
-        :param target_cols: numeric columns to interpolate
-        :param partition_cols: partition column names
-        :param freq: frequency at which to sample
-        :param func: aggregate function used for sampling to the specified interval
-        :param method: interpolation function used to fill missing values
-        :param show_interpolated: show if row is interpolated?
-        :param perform_checks: calculate time horizon and warnings if True (default is True)
-        :return: DataFrame containing interpolated data.
-        """
-        # Validate input parameters
-        self.__validate_fill(method)
-        self.__validate_col(tsdf.df, partition_cols, target_cols, ts_col)
-
-        if freq is None:
-            raise ValueError("freq cannot be None")
-
-        if func is None:
-            raise ValueError("func cannot be None")
-
-        if callable(func):
-            raise ValueError("func must be a string")
-
-        # Convert Frequency using resample dictionary
-        parsed_freq = t_resample.checkAllowableFreq(freq)
-        period, unit = parsed_freq[0], parsed_freq[1]
-        freq = f"{period} {t_resample.freq_dict[unit]}"  # type: ignore[literal-required]
-
-        # Throw warning for user to validate that the expected number of output rows is valid.
-        if perform_checks:
-            t_utils.calculate_time_horizon(tsdf.df, ts_col, freq, partition_cols)
-
-        # Only select required columns for interpolation
-        input_cols: List[str] = [ts_col, *target_cols]
-        if partition_cols is not None:
-            input_cols += [*partition_cols]
-
-        sampled_input: DataFrame = tsdf.df.select(*input_cols)
-
-        if self.is_resampled is False:
-            # Resample and Normalize Input
-            sampled_input = tsdf.resample(
-                freq=freq, func=func, metricCols=target_cols
-            ).df
-
-        # Fill timeseries for nearest values
-        time_series_filled = self.__generate_time_series_fill(
-            sampled_input, partition_cols, ts_col
-        )
-
-        # Generate surrogate timestamps for each target column
-        # This is required if multiple columns are being interpolated and may contain nulls
-        add_column_time: DataFrame = time_series_filled
-        for column in target_cols:
-            add_column_time = add_column_time.withColumn(
-                f"{ts_col}_{column}",
-                sfn.when(sfn.col(column).isNull(), None).otherwise(sfn.col(ts_col)),
-            )
-            add_column_time = self.__generate_column_time_fill(
-                add_column_time, partition_cols, ts_col, column
-            )
-
-        # Handle edge case if last value (latest) is null
-        edge_filled = add_column_time.withColumn(
-            "next_timestamp",
-            sfn.when(
-                sfn.col("next_timestamp").isNull(),
-                sfn.expr(f"{ts_col}+ interval {freq}"),
-            ).otherwise(sfn.col("next_timestamp")),
-        )
-
-        # Fill target column for nearest values
-        target_column_filled = edge_filled
-        for column in target_cols:
-            target_column_filled = self.__generate_target_fill(
-                target_column_filled, partition_cols, ts_col, column
-            )
-
-        # Generate missing timeseries values
-        exploded_series = target_column_filled.withColumn(
-            f"new_{ts_col}",
-            sfn.expr(
-                f"explode(sequence({ts_col}, next_timestamp - interval {freq}, interval {freq} )) as timestamp"
-            ),
-        )
-        # Mark rows that are interpolated if flag is set to True
-        flagged_series: DataFrame = exploded_series
-
-        flagged_series = (
-            exploded_series.withColumn(
-                "is_ts_interpolated",
-                sfn.when(sfn.col(f"new_{ts_col}") != sfn.col(ts_col), True).otherwise(
-                    False
-                ),
-            )
-            .withColumn(ts_col, sfn.col(f"new_{ts_col}"))
-            .drop(sfn.col(f"new_{ts_col}"))
-        )
-
-        # Perform interpolation on each target column
-        interpolated_result: DataFrame = flagged_series
-        for target_col in target_cols:
-            # Interpolate target columns
-            interpolated_result = self.__interpolate_column(
-                interpolated_result, ts_col, target_col, method
-            )
-
-            interpolated_result = interpolated_result.drop(
-                f"previous_timestamp_{target_col}",
-                f"next_timestamp_{target_col}",
-                f"previous_{target_col}",
-                f"next_{target_col}",
-                f"next_null_{target_col}",
-                f"{ts_col}_{target_col}",
-            )
-
-        # Remove non-required columns
-        output: DataFrame = interpolated_result.drop(
-            "previous_timestamp", "next_timestamp"
-        )
-
-        # Hide is_interpolated columns based on flag
-        if show_interpolated is False:
-            interpolated_col_names = ["is_ts_interpolated"]
-            for column in target_cols:
-                interpolated_col_names.append(f"is_interpolated_{column}")
-            output = output.drop(*interpolated_col_names)
-
-        return output
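The interpolate method above resamples the input (when is_resampled=False), builds previous/next timestamp and value columns per partition, explodes the missing timestamps, and then fills each target column; for method="linear" the Spark SQL expression in __calc_linear_spark computes the usual linear interpolation prev + (next - prev) * (t - t_prev) / (t_next - t_prev) over unix timestamps. A hedged usage sketch of this removed 0.1.50 API follows; the TSDF constructor arguments, the "30 min" frequency string, and the "mean" aggregation are assumptions carried over from the upstream tempo conventions this module mirrors:

# Sketch only: linear interpolation of a gappy series with the removed wedata.tempo API.
import pyspark.sql.functions as sfn
from pyspark.sql import SparkSession

from wedata.tempo.interpol import Interpolation
from wedata.tempo.tsdf import TSDF  # constructor signature assumed to match upstream tempo

spark = SparkSession.builder.getOrCreate()

df = spark.createDataFrame(
    [
        ("sensor-1", "2024-01-01 00:00:00", 1.0),
        ("sensor-1", "2024-01-01 00:30:00", None),
        ("sensor-1", "2024-01-01 01:00:00", 3.0),
    ],
    "device string, event_ts string, reading double",
).withColumn("event_ts", sfn.to_timestamp("event_ts"))

tsdf = TSDF(df, ts_col="event_ts", partition_cols=["device"])

filled = Interpolation(is_resampled=False).interpolate(
    tsdf=tsdf,
    ts_col="event_ts",
    partition_cols=["device"],
    target_cols=["reading"],
    freq="30 min",   # assumed to be accepted by t_resample.checkAllowableFreq
    func="mean",     # aggregation used by tsdf.resample before interpolation
    method="linear",
    show_interpolated=True,
)
# Expected: the null 00:30 reading is filled with 2.0 and flagged is_interpolated_reading=true.
filled.show()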