tencent-wedata-feature-engineering-dev 0.1.50__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of tencent-wedata-feature-engineering-dev might be problematic.

Files changed (38)
  1. {tencent_wedata_feature_engineering_dev-0.1.50.dist-info → tencent_wedata_feature_engineering_dev-0.2.0.dist-info}/METADATA +10 -8
  2. tencent_wedata_feature_engineering_dev-0.2.0.dist-info/RECORD +46 -0
  3. {tencent_wedata_feature_engineering_dev-0.1.50.dist-info → tencent_wedata_feature_engineering_dev-0.2.0.dist-info}/WHEEL +1 -1
  4. wedata/feature_store/client.py +28 -92
  5. wedata/feature_store/constants/constants.py +2 -5
  6. wedata/feature_store/entities/feature_lookup.py +0 -17
  7. wedata/feature_store/entities/feature_spec.py +2 -2
  8. wedata/feature_store/entities/feature_table.py +1 -5
  9. wedata/feature_store/entities/function_info.py +4 -1
  10. wedata/feature_store/feature_table_client/feature_table_client.py +53 -528
  11. wedata/feature_store/spark_client/spark_client.py +15 -41
  12. wedata/feature_store/training_set_client/training_set_client.py +10 -9
  13. wedata/feature_store/utils/common_utils.py +4 -48
  14. wedata/feature_store/utils/feature_lookup_utils.py +43 -37
  15. wedata/feature_store/utils/feature_spec_utils.py +1 -1
  16. wedata/feature_store/utils/uc_utils.py +1 -1
  17. tencent_wedata_feature_engineering_dev-0.1.50.dist-info/RECORD +0 -66
  18. wedata/feature_store/cloud_sdk_client/__init__.py +0 -0
  19. wedata/feature_store/cloud_sdk_client/client.py +0 -108
  20. wedata/feature_store/cloud_sdk_client/models.py +0 -686
  21. wedata/feature_store/cloud_sdk_client/utils.py +0 -39
  22. wedata/feature_store/common/log/__init__.py +0 -0
  23. wedata/feature_store/common/log/logger.py +0 -40
  24. wedata/feature_store/common/store_config/__init__.py +0 -0
  25. wedata/feature_store/common/store_config/redis.py +0 -48
  26. wedata/feature_store/constants/engine_types.py +0 -34
  27. wedata/feature_store/feast_client/__init__.py +0 -0
  28. wedata/feature_store/feast_client/feast_client.py +0 -487
  29. wedata/feature_store/utils/env_utils.py +0 -108
  30. wedata/tempo/__init__.py +0 -0
  31. wedata/tempo/interpol.py +0 -448
  32. wedata/tempo/intervals.py +0 -1331
  33. wedata/tempo/io.py +0 -61
  34. wedata/tempo/ml.py +0 -129
  35. wedata/tempo/resample.py +0 -318
  36. wedata/tempo/tsdf.py +0 -1720
  37. wedata/tempo/utils.py +0 -254
  38. {tencent_wedata_feature_engineering_dev-0.1.50.dist-info → tencent_wedata_feature_engineering_dev-0.2.0.dist-info}/top_level.txt +0 -0
wedata/feature_store/utils/env_utils.py DELETED
@@ -1,108 +0,0 @@
- import os
-
-
- class EnvironmentError(Exception):
-     pass
-
-
- def get_project_id() -> str:
-     """
-     Get the current project ID.
-
-     Returns:
-         str: project ID
-
-     Raises:
-         EnvironmentError: if the environment variable WEDATA_PROJECT_ID is not set
-     """
-     project_id = os.environ.get("WEDATA_PROJECT_ID")
-     if project_id:
-         return project_id
-     raise EnvironmentError("environment variable WEDATA_PROJECT_ID is not set, please check environment configuration")
-
-
- def get_cloud_secret() -> (str, str):
-     """
-     Get the temporary cloud credentials.
-
-     Returns:
-         tuple: the (secret_id, secret_key) pair
-     """
-     secret_id = os.environ.get("WEDATA_CLOUD_TEMP_SECRET_ID")
-     secret_key = os.environ.get("WEDATA_CLOUD_TEMP_SECRET_KEY")
-     return secret_id, secret_key
-
-
- def get_region() -> str:
-     """
-     Get the current region.
-     """
-     region_dlc = os.environ.get("DLC_REGION")
-     region_emr = os.environ.get("KERNEL_REGION")
-     region = region_dlc if region_dlc else region_emr
-     if not region:
-         raise EnvironmentError("environment variable DLC_REGION or KERNEL_REGION is not set, "
-                                "please check environment configuration")
-     return region
-
-
- def get_database_name(database_name: str) -> str:
-     """
-     Resolve the database name.
-
-     Args:
-         database_name: database name passed by the caller
-
-     Returns:
-         str: database name
-
-     Raises:
-         EnvironmentError: if the environment variable WEDATA_DEFAULT_FEATURE_STORE_DATABASE is not set
-     """
-     feature_store_database_name = os.environ.get("WEDATA_DEFAULT_FEATURE_STORE_DATABASE")
-     if database_name:
-         return database_name
-     elif feature_store_database_name:
-         return feature_store_database_name
-     raise EnvironmentError("environment variable WEDATA_DEFAULT_FEATURE_STORE_DATABASE is not set, "
-                            "please check environment configuration")
-
-
- def get_engine_name() -> str:
-     """
-     Get the engine name.
-     """
-     # DLC is a special case, so check for DLC first and fall back to EMR
-     if get_engine_type() == "DLC":
-         return _get_variable("KERNEL_ENGINE")
-     return _get_variable("KERNEL_ENGINE_NAME")
-
-
- def get_engine_id() -> str:
-     """
-     Get the engine ID.
-     """
-     return _get_variable("KERNEL_ENGINE")
-
-
- def get_engine_type() -> str:
-     """
-     Determine the engine type.
-     """
-     return "DLC" if os.environ.get("DLC_REGION") else "EMR"
-
-
- def get_feast_remote_url() -> str:
-     """
-     Get the Feast remote URL.
-     """
-     return _get_variable("KERNEL_FEAST_REMOTE_ADDRESS")
-
-
- def _get_variable(variable_key: str, is_raise: bool = True, default_value: str = None) -> str:
-     val = os.environ.get(variable_key, default_value)
-     if not val:
-         if is_raise:
-             raise EnvironmentError(f"environment variable {variable_key} is not set, "
-                                    f"please check environment configuration")
-     return val
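
For context, every helper in the removed env_utils module is a thin wrapper around os.environ. Below is a minimal sketch of how the 0.1.50 helpers behave, assuming hypothetical environment values set by the caller; the call sites are illustrative and not taken from this diff:

import os

# Hypothetical environment, normally provided by the WeData runtime
os.environ["WEDATA_PROJECT_ID"] = "demo-project"
os.environ["DLC_REGION"] = "ap-guangzhou"
os.environ["KERNEL_ENGINE"] = "engine-123"

from wedata.feature_store.utils import env_utils  # module removed in 0.2.0

print(env_utils.get_project_id())   # "demo-project"
print(env_utils.get_region())       # "ap-guangzhou" (DLC_REGION takes precedence)
print(env_utils.get_engine_type())  # "DLC", because DLC_REGION is set
print(env_utils.get_engine_id())    # "engine-123"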
wedata/tempo/__init__.py DELETED
File without changes
wedata/tempo/interpol.py DELETED
@@ -1,448 +0,0 @@
- from __future__ import annotations
-
- from typing import Callable, List, Optional, Union
-
- from pyspark.sql.dataframe import DataFrame
- import pyspark.sql.functions as sfn
- from pyspark.sql.types import NumericType
- from pyspark.sql.window import Window
-
- import wedata.tempo.resample as t_resample
- import wedata.tempo.tsdf as t_tsdf
- import wedata.tempo.utils as t_utils
-
- # Interpolation fill options
- method_options = ["zero", "null", "bfill", "ffill", "linear"]
-
-
- class Interpolation:
-     def __init__(self, is_resampled: bool):
-         self.is_resampled = is_resampled
-
-     def __validate_fill(self, method: str) -> None:
-         """
-         Validate if the fill provided is within the allowed list of values.
-
-         :param method: Fill type e.g. "zero", "null", "bfill", "ffill", "linear"
-         """
-         if method not in method_options:
-             raise ValueError(
-                 f"Please select from one of the following fill options: {method_options}"
-             )
-
-     def __validate_col(
-         self,
-         df: DataFrame,
-         partition_cols: Optional[List[str]],
-         target_cols: List[str],
-         ts_col: str,
-         ts_col_dtype: Optional[str] = None,  # NB: added for testing purposes only
-     ) -> None:
-         """
-         Validate if target columns exist and are of numeric type, and validate if partition columns exist.
-
-         :param df: DataFrame to be validated
-         :param partition_cols: Partition columns to be validated
-         :param target_cols: Target columns to be validated
-         :param ts_col: Timestamp column to be validated
-         """
-
-         if partition_cols is not None:
-             for column in partition_cols:
-                 if column not in str(df.columns):
-                     raise ValueError(
-                         f"Partition Column: '{column}' does not exist in DataFrame."
-                     )
-         for column in target_cols:
-             if column not in str(df.columns):
-                 raise ValueError(
-                     f"Target Column: '{column}' does not exist in DataFrame."
-                 )
-
-         if ts_col not in str(df.columns):
-             raise ValueError(
-                 f"Timestamp Column: '{ts_col}' does not exist in DataFrame."
-             )
-
-         if ts_col_dtype is None:
-             ts_col_dtype = df.select(ts_col).dtypes[0][1]
-         if ts_col_dtype != "timestamp":
-             raise ValueError("Timestamp Column needs to be of timestamp type.")
-
-     def __calc_linear_spark(
-         self, df: DataFrame, ts_col: str, target_col: str
-     ) -> DataFrame:
-         """
-         Native Spark function for calculating linear interpolation on a DataFrame.
-
-         :param df: prepared dataframe to be interpolated
-         :param ts_col: timeseries column name
-         :param target_col: column to be interpolated
-         """
-         interpolation_expr = f"""
-         case when is_interpolated_{target_col} = false then {target_col}
-             when {target_col} is null then
-             (next_null_{target_col} - previous_{target_col})
-             /(unix_timestamp(next_timestamp_{target_col})-unix_timestamp(previous_timestamp_{target_col}))
-             *(unix_timestamp({ts_col}) - unix_timestamp(previous_timestamp_{target_col}))
-             + previous_{target_col}
-         else
-             (next_{target_col}-{target_col})
-             /(unix_timestamp(next_timestamp)-unix_timestamp(previous_timestamp))
-             *(unix_timestamp({ts_col}) - unix_timestamp(previous_timestamp))
-             + {target_col}
-         end as {target_col}
-         """
-
-         # remove target column to avoid duplication during interpolation expression
-         cols: List[str] = df.columns
-         cols.remove(target_col)
-         interpolated: DataFrame = df.selectExpr(*cols, interpolation_expr)
-         # Preserve column order
-         return interpolated.select(*df.columns)
-
-     def _is_valid_method_for_column(
-         self, series: DataFrame, method: str, col_name: str
-     ) -> bool:
-         """
-         zero and linear interpolation are only valid for numeric columns
-         """
-         if method in ["linear", "zero"]:
-             return isinstance(series.schema[col_name].dataType, NumericType)
-         else:
-             return True
-
-     def __interpolate_column(
-         self,
-         series: DataFrame,
-         ts_col: str,
-         target_col: str,
-         method: str,
-     ) -> DataFrame:
-         """
-         Apply interpolation to column.
-
-         :param series: input DataFrame
-         :param ts_col: timestamp column name
-         :param target_col: column to interpolate
-         :param method: interpolation function to fill missing values
-         """
-
-         if not self._is_valid_method_for_column(series, method, target_col):
-             raise ValueError(
-                 f"Interpolation method '{method}' is not supported for column "
-                 f"'{target_col}' of type '{series.schema[target_col].dataType}'. "
-                 f"Only NumericType columns are supported."
-             )
-
-         output_df: DataFrame = series
-
-         # create new column for if target column is interpolated
-         flag_expr = f"""
-         CASE WHEN {target_col} is null and is_ts_interpolated = false THEN true
-             WHEN is_ts_interpolated = true THEN true
-             ELSE false
-         END AS is_interpolated_{target_col}
-         """
-         output_df = output_df.withColumn(
-             f"is_interpolated_{target_col}", sfn.expr(flag_expr)
-         )
-
-         # Handle zero fill
-         if method == "zero":
-             output_df = output_df.withColumn(
-                 target_col,
-                 sfn.when(
-                     sfn.col(f"is_interpolated_{target_col}") == False,  # noqa: E712
-                     sfn.col(target_col),
-                 ).otherwise(sfn.lit(0)),
-             )
-
-         # Handle null fill
-         if method == "null":
-             output_df = output_df.withColumn(
-                 target_col,
-                 sfn.when(
-                     sfn.col(f"is_interpolated_{target_col}") == False,  # noqa: E712
-                     sfn.col(target_col),
-                 ).otherwise(None),
-             )
-
-         # Handle forward fill
-         if method == "ffill":
-             output_df = output_df.withColumn(
-                 target_col,
-                 sfn.when(
-                     sfn.col(f"is_interpolated_{target_col}") == True,  # noqa: E712
-                     sfn.col(f"previous_{target_col}"),
-                 ).otherwise(sfn.col(target_col)),
-             )
-         # Handle backwards fill
-         if method == "bfill":
-             output_df = output_df.withColumn(
-                 target_col,
-                 # Handle case when subsequent value is null
-                 sfn.when(
-                     (sfn.col(f"is_interpolated_{target_col}") == True)  # noqa: E712
-                     & (
-                         sfn.col(f"next_{target_col}").isNull()
-                         & (sfn.col(f"{ts_col}_{target_col}").isNull())
-                     ),
-                     sfn.col(f"next_null_{target_col}"),
-                 ).otherwise(
-                     # Handle standard backwards fill
-                     sfn.when(
-                         sfn.col(f"is_interpolated_{target_col}") == True,  # noqa: E712
-                         sfn.col(f"next_{target_col}"),
-                     ).otherwise(sfn.col(f"{target_col}"))
-                 ),
-             )
-
-         # Handle linear fill
-         if method == "linear":
-             output_df = self.__calc_linear_spark(
-                 output_df,
-                 ts_col,
-                 target_col,
-             )
-
-         return output_df
-
-     def __generate_time_series_fill(
-         self, df: DataFrame, partition_cols: Optional[List[str]], ts_col: str
-     ) -> DataFrame:
-         """
-         Create additional timeseries columns for previous and next timestamps
-
-         :param df: input DataFrame
-         :param partition_cols: partition column names
-         :param ts_col: timestamp column name
-         """
-         return df.withColumn(
-             "previous_timestamp",
-             sfn.col(ts_col),
-         ).withColumn(
-             "next_timestamp",
-             sfn.lead(df[ts_col]).over(
-                 Window.partitionBy(*partition_cols).orderBy(ts_col)
-             ),
-         )
-
-     def __generate_column_time_fill(
-         self,
-         df: DataFrame,
-         partition_cols: Optional[List[str]],
-         ts_col: str,
-         target_col: str,
-     ) -> DataFrame:
-         """
-         Create timeseries columns for previous and next timestamps for a specific target column
-
-         :param df: input DataFrame
-         :param partition_cols: partition column names
-         :param ts_col: timestamp column name
-         :param target_col: target column name
-         """
-         window = Window
-         if partition_cols is not None:
-             window = Window.partitionBy(*partition_cols)
-
-         return df.withColumn(
-             f"previous_timestamp_{target_col}",
-             sfn.last(sfn.col(f"{ts_col}_{target_col}"), ignorenulls=True).over(
-                 window.orderBy(ts_col).rowsBetween(Window.unboundedPreceding, 0)
-             ),
-         ).withColumn(
-             f"next_timestamp_{target_col}",
-             sfn.last(sfn.col(f"{ts_col}_{target_col}"), ignorenulls=True).over(
-                 window.orderBy(sfn.col(ts_col).desc()).rowsBetween(
-                     Window.unboundedPreceding, 0
-                 )
-             ),
-         )
-
-     def __generate_target_fill(
-         self,
-         df: DataFrame,
-         partition_cols: Optional[List[str]],
-         ts_col: str,
-         target_col: str,
-     ) -> DataFrame:
-         """
-         Create columns for previous and next value for a specific target column
-
-         :param df: input DataFrame
-         :param partition_cols: partition column names
-         :param ts_col: timestamp column name
-         :param target_col: target column name
-         """
-         window = Window
-
-         if partition_cols is not None:
-             window = Window.partitionBy(*partition_cols)
-         return (
-             df.withColumn(
-                 f"previous_{target_col}",
-                 sfn.last(df[target_col], ignorenulls=True).over(
-                     window.orderBy(ts_col).rowsBetween(Window.unboundedPreceding, 0)
-                 ),
-             )
-             # Handle if subsequent value is null
-             .withColumn(
-                 f"next_null_{target_col}",
-                 sfn.last(df[target_col], ignorenulls=True).over(
-                     window.orderBy(sfn.col(ts_col).desc()).rowsBetween(
-                         Window.unboundedPreceding, 0
-                     )
-                 ),
-             ).withColumn(
-                 f"next_{target_col}",
-                 sfn.lead(df[target_col]).over(window.orderBy(ts_col)),
-             )
-         )
-
-     def interpolate(
-         self,
-         tsdf: t_tsdf.TSDF,
-         ts_col: str,
-         partition_cols: Optional[List[str]],
-         target_cols: List[str],
-         freq: Optional[str],
-         func: Optional[Union[Callable | str]],
-         method: str,
-         show_interpolated: bool,
-         perform_checks: bool = True,
-     ) -> DataFrame:
-         """
-         Apply interpolation function.
-
-         :param tsdf: input TSDF
-         :param ts_col: timestamp column name
-         :param target_cols: numeric columns to interpolate
-         :param partition_cols: partition column names
-         :param freq: frequency at which to sample
-         :param func: aggregate function used for sampling to the specified interval
-         :param method: interpolation function used to fill missing values
-         :param show_interpolated: show if row is interpolated?
-         :param perform_checks: calculate time horizon and warnings if True (default is True)
-         :return: DataFrame containing interpolated data.
-         """
-         # Validate input parameters
-         self.__validate_fill(method)
-         self.__validate_col(tsdf.df, partition_cols, target_cols, ts_col)
-
-         if freq is None:
-             raise ValueError("freq cannot be None")
-
-         if func is None:
-             raise ValueError("func cannot be None")
-
-         if callable(func):
-             raise ValueError("func must be a string")
-
-         # Convert Frequency using resample dictionary
-         parsed_freq = t_resample.checkAllowableFreq(freq)
-         period, unit = parsed_freq[0], parsed_freq[1]
-         freq = f"{period} {t_resample.freq_dict[unit]}"  # type: ignore[literal-required]
-
-         # Throw warning for user to validate that the expected number of output rows is valid.
-         if perform_checks:
-             t_utils.calculate_time_horizon(tsdf.df, ts_col, freq, partition_cols)
-
-         # Only select required columns for interpolation
-         input_cols: List[str] = [ts_col, *target_cols]
-         if partition_cols is not None:
-             input_cols += [*partition_cols]
-
-         sampled_input: DataFrame = tsdf.df.select(*input_cols)
-
-         if self.is_resampled is False:
-             # Resample and Normalize Input
-             sampled_input = tsdf.resample(
-                 freq=freq, func=func, metricCols=target_cols
-             ).df
-
-         # Fill timeseries for nearest values
-         time_series_filled = self.__generate_time_series_fill(
-             sampled_input, partition_cols, ts_col
-         )
-
-         # Generate surrogate timestamps for each target column
-         # This is required if multiple columns are being interpolated and may contain nulls
-         add_column_time: DataFrame = time_series_filled
-         for column in target_cols:
-             add_column_time = add_column_time.withColumn(
-                 f"{ts_col}_{column}",
-                 sfn.when(sfn.col(column).isNull(), None).otherwise(sfn.col(ts_col)),
-             )
-             add_column_time = self.__generate_column_time_fill(
-                 add_column_time, partition_cols, ts_col, column
-             )
-
-         # Handle edge case if last value (latest) is null
-         edge_filled = add_column_time.withColumn(
-             "next_timestamp",
-             sfn.when(
-                 sfn.col("next_timestamp").isNull(),
-                 sfn.expr(f"{ts_col}+ interval {freq}"),
-             ).otherwise(sfn.col("next_timestamp")),
-         )
-
-         # Fill target column for nearest values
-         target_column_filled = edge_filled
-         for column in target_cols:
-             target_column_filled = self.__generate_target_fill(
-                 target_column_filled, partition_cols, ts_col, column
-             )
-
-         # Generate missing timeseries values
-         exploded_series = target_column_filled.withColumn(
-             f"new_{ts_col}",
-             sfn.expr(
-                 f"explode(sequence({ts_col}, next_timestamp - interval {freq}, interval {freq} )) as timestamp"
-             ),
-         )
-         # Mark rows that are interpolated if flag is set to True
-         flagged_series: DataFrame = exploded_series
-
-         flagged_series = (
-             exploded_series.withColumn(
-                 "is_ts_interpolated",
-                 sfn.when(sfn.col(f"new_{ts_col}") != sfn.col(ts_col), True).otherwise(
-                     False
-                 ),
-             )
-             .withColumn(ts_col, sfn.col(f"new_{ts_col}"))
-             .drop(sfn.col(f"new_{ts_col}"))
-         )
-
-         # Perform interpolation on each target column
-         interpolated_result: DataFrame = flagged_series
-         for target_col in target_cols:
-             # Interpolate target columns
-             interpolated_result = self.__interpolate_column(
-                 interpolated_result, ts_col, target_col, method
-             )
-
-             interpolated_result = interpolated_result.drop(
-                 f"previous_timestamp_{target_col}",
-                 f"next_timestamp_{target_col}",
-                 f"previous_{target_col}",
-                 f"next_{target_col}",
-                 f"next_null_{target_col}",
-                 f"{ts_col}_{target_col}",
-             )
-
-         # Remove non-required columns
-         output: DataFrame = interpolated_result.drop(
-             "previous_timestamp", "next_timestamp"
-         )
-
-         # Hide is_interpolated columns based on flag
-         if show_interpolated is False:
-             interpolated_col_names = ["is_ts_interpolated"]
-             for column in target_cols:
-                 interpolated_col_names.append(f"is_interpolated_{column}")
-             output = output.drop(*interpolated_col_names)
-
-         return output
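
For reference, the removed wedata.tempo.interpol module appears to be a vendored copy of the open-source tempo (dbl-tempo) interpolation engine. Below is a minimal sketch of driving Interpolation.interpolate directly against 0.1.50; the TSDF constructor is assumed to match the tempo-style signature, and the column names, frequency, and aggregation function are hypothetical:

import pyspark.sql.functions as sfn
from pyspark.sql import SparkSession

from wedata.tempo.tsdf import TSDF               # removed in 0.2.0
from wedata.tempo.interpol import Interpolation  # removed in 0.2.0

spark = SparkSession.builder.getOrCreate()

# Sparse series with a gap and a null value to interpolate over
df = spark.createDataFrame(
    [("a", "2024-01-01 00:00:00", 1.0),
     ("a", "2024-01-01 00:30:00", None),
     ("a", "2024-01-01 01:00:00", 3.0)],
    "id string, ts string, value double",
).withColumn("ts", sfn.to_timestamp("ts"))

tsdf = TSDF(df, ts_col="ts", partition_cols=["id"])  # assumed tempo-style constructor

result = Interpolation(is_resampled=False).interpolate(
    tsdf,
    ts_col="ts",
    partition_cols=["id"],
    target_cols=["value"],
    freq="15 minutes",          # hypothetical; valid values follow wedata.tempo.resample
    func="mean",                # aggregation used when resampling before interpolation
    method="linear",            # one of: zero, null, bfill, ffill, linear
    show_interpolated=True,     # keep the is_interpolated_* flag columns in the output
)
result.show()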