tencent-wedata-feature-engineering-dev 0.1.50__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (38)
  1. {tencent_wedata_feature_engineering_dev-0.1.50.dist-info → tencent_wedata_feature_engineering_dev-0.2.0.dist-info}/METADATA +10 -8
  2. tencent_wedata_feature_engineering_dev-0.2.0.dist-info/RECORD +46 -0
  3. {tencent_wedata_feature_engineering_dev-0.1.50.dist-info → tencent_wedata_feature_engineering_dev-0.2.0.dist-info}/WHEEL +1 -1
  4. wedata/feature_store/client.py +28 -92
  5. wedata/feature_store/constants/constants.py +2 -5
  6. wedata/feature_store/entities/feature_lookup.py +0 -17
  7. wedata/feature_store/entities/feature_spec.py +2 -2
  8. wedata/feature_store/entities/feature_table.py +1 -5
  9. wedata/feature_store/entities/function_info.py +4 -1
  10. wedata/feature_store/feature_table_client/feature_table_client.py +53 -528
  11. wedata/feature_store/spark_client/spark_client.py +15 -41
  12. wedata/feature_store/training_set_client/training_set_client.py +10 -9
  13. wedata/feature_store/utils/common_utils.py +4 -48
  14. wedata/feature_store/utils/feature_lookup_utils.py +43 -37
  15. wedata/feature_store/utils/feature_spec_utils.py +1 -1
  16. wedata/feature_store/utils/uc_utils.py +1 -1
  17. tencent_wedata_feature_engineering_dev-0.1.50.dist-info/RECORD +0 -66
  18. wedata/feature_store/cloud_sdk_client/__init__.py +0 -0
  19. wedata/feature_store/cloud_sdk_client/client.py +0 -108
  20. wedata/feature_store/cloud_sdk_client/models.py +0 -686
  21. wedata/feature_store/cloud_sdk_client/utils.py +0 -39
  22. wedata/feature_store/common/log/__init__.py +0 -0
  23. wedata/feature_store/common/log/logger.py +0 -40
  24. wedata/feature_store/common/store_config/__init__.py +0 -0
  25. wedata/feature_store/common/store_config/redis.py +0 -48
  26. wedata/feature_store/constants/engine_types.py +0 -34
  27. wedata/feature_store/feast_client/__init__.py +0 -0
  28. wedata/feature_store/feast_client/feast_client.py +0 -487
  29. wedata/feature_store/utils/env_utils.py +0 -108
  30. wedata/tempo/__init__.py +0 -0
  31. wedata/tempo/interpol.py +0 -448
  32. wedata/tempo/intervals.py +0 -1331
  33. wedata/tempo/io.py +0 -61
  34. wedata/tempo/ml.py +0 -129
  35. wedata/tempo/resample.py +0 -318
  36. wedata/tempo/tsdf.py +0 -1720
  37. wedata/tempo/utils.py +0 -254
  38. {tencent_wedata_feature_engineering_dev-0.1.50.dist-info → tencent_wedata_feature_engineering_dev-0.2.0.dist-info}/top_level.txt +0 -0
wedata/tempo/io.py DELETED
@@ -1,61 +0,0 @@
-from __future__ import annotations
-
-import logging
-from collections import deque
-from typing import Optional
-
-import pyspark.sql.functions as sfn
-import wedata.tempo.tsdf as t_tsdf
-from pyspark.sql import SparkSession
-from pyspark.sql.utils import ParseException
-
-logger = logging.getLogger(__name__)
-
-
-def write(
-    tsdf: t_tsdf.TSDF,
-    spark: SparkSession,
-    tabName: str,
-    optimizationCols: Optional[list[str]] = None,
-) -> None:
-    """
-    param: tsdf: input TSDF object to write
-    param: tabName Delta output table name
-    param: optimizationCols list of columns to optimize on (time)
-    """
-    # hilbert curves more evenly distribute performance for querying multiple columns for Delta tables
-    spark.conf.set("spark.wedata.io.skipping.mdc.curve", "hilbert")
-
-    df = tsdf.df
-    ts_col = tsdf.ts_col
-    partitionCols = tsdf.partitionCols
-
-    view_df = df.withColumn("event_dt", sfn.to_date(sfn.col(ts_col))).withColumn(
-        "event_time",
-        sfn.translate(sfn.split(sfn.col(ts_col).cast("string"), " ")[1], ":", "").cast(
-            "double"
-        ),
-    )
-    view_cols = deque(view_df.columns)
-    view_cols.rotate(1)
-    view_df = view_df.select(*list(view_cols))
-
-    # Use replaceWhere instead of overwrite mode
-    writer = view_df.write.format("delta").partitionBy("event_dt")
-    writer = writer.option("replaceWhere", "true")
-    writer.saveAsTable(tabName)
-
-    if optimizationCols:
-        try:
-            spark.sql(
-                "optimize {} zorder by {}".format(
-                    tabName,
-                    "(" + ",".join(partitionCols + optimizationCols + [ts_col]) + ")",
-                )
-            )
-        except ParseException as e:
-            logger.error(
-                "Delta optimizations attempted, but was not successful.\nError: {}".format(
-                    e
-                )
-            )
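
Note: for context, a minimal sketch of how the removed io.write helper was typically invoked in 0.1.50. The table names, column names, and the exact TSDF constructor keywords below are illustrative assumptions, not taken from the package itself.

    from pyspark.sql import SparkSession
    from wedata.tempo.tsdf import TSDF   # removed in 0.2.0
    from wedata.tempo.io import write    # removed in 0.2.0

    spark = SparkSession.builder.getOrCreate()
    raw_df = spark.read.table("raw_events")  # hypothetical source table
    tsdf = TSDF(raw_df, "event_ts", partition_cols=["device_id"])  # constructor args assumed

    # Writes the TSDF as a Delta table partitioned by the derived event_dt column and,
    # because optimizationCols is given, follows up with OPTIMIZE ... ZORDER BY.
    write(tsdf, spark, tabName="events_delta", optimizationCols=["temperature"])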
wedata/tempo/ml.py DELETED
@@ -1,129 +0,0 @@
-from typing import Any, List, Tuple
-from functools import reduce
-
-from pyspark.sql import DataFrame
-from pyspark.sql.window import Window, WindowSpec
-from pyspark.sql import functions as sfn
-
-from pyspark.ml.param import Param, Params, TypeConverters
-from pyspark.ml.tuning import CrossValidator
-
-
-TMP_SPLIT_COL = "__tmp_split_col"
-TMP_GAP_COL = "__tmp_gap_row"
-
-
-def _get_parent_params():
-    dummy = Params()
-    dummy.uid = "undefined"
-    return dummy
-
-
-class TimeSeriesCrossValidator(CrossValidator):
-    # some additional parameters
-
-    timeSeriesCol: Param[str] = Param(
-        _get_parent_params(),
-        "timeSeriesCol",
-        "The name of the time series column",
-        typeConverter=TypeConverters.toString,
-    )
-    seriesIdCols: Param[List[str]] = Param(
-        _get_parent_params(),
-        "seriesIdCols",
-        "The name of the series id columns",
-        typeConverter=TypeConverters.toListString,
-    )
-    gap: Param[int] = Param(
-        _get_parent_params(),
-        "gap",
-        "The gap between training and test set",
-        typeConverter=TypeConverters.toInt,
-    )
-
-    def __init__(
-        self,
-        timeSeriesCol: str = "event_ts",
-        seriesIdCols: List[str] = [],
-        gap: int = 0,
-        **other_kwargs: Any,
-    ) -> None:
-        super(TimeSeriesCrossValidator, self).__init__(**other_kwargs)
-        self._setDefault(timeSeriesCol="event_ts", seriesIdCols=[], gap=0)
-        self._set(timeSeriesCol=timeSeriesCol, seriesIdCols=seriesIdCols, gap=gap)
-
-    def getTimeSeriesCol(self) -> str:
-        return self.getOrDefault(self.timeSeriesCol)
-
-    def getSeriesIdCols(self) -> List[str]:
-        return self.getOrDefault(self.seriesIdCols)
-
-    def getGap(self) -> int:
-        return self.getOrDefault(self.gap)
-
-    def setTimeSeriesCol(self, value: str) -> "TimeSeriesCrossValidator":
-        return self._set(timeSeriesCol=value)
-
-    def setSeriesIdCols(self, value: List[str]) -> "TimeSeriesCrossValidator":
-        return self._set(seriesIdCols=value)
-
-    def setGap(self, value: int) -> "TimeSeriesCrossValidator":
-        return self._set(gap=value)
-
-    def _get_split_win(self, desc: bool = False) -> WindowSpec:
-        ts_col_expr = sfn.col(self.getTimeSeriesCol())
-        if desc:
-            ts_col_expr = ts_col_expr.desc()
-        win = Window.orderBy(ts_col_expr)
-        series_id_cols = self.getSeriesIdCols()
-        if series_id_cols and len(series_id_cols) > 0:
-            win = win.partitionBy(*series_id_cols)
-        return win
-
-    def _kFold(self, dataset: DataFrame) -> List[Tuple[DataFrame, DataFrame]]:
-        nFolds = self.getOrDefault(self.numFolds)
-        nSplits = nFolds + 1
-
-        # split the data into nSplits subsets by timeseries order
-        split_df = dataset.withColumn(
-            TMP_SPLIT_COL, sfn.ntile(nSplits).over(self._get_split_win())
-        )
-        all_splits = [
-            split_df.filter(sfn.col(TMP_SPLIT_COL) == i).drop(TMP_SPLIT_COL)
-            for i in range(1, nSplits + 1)
-        ]
-        assert len(all_splits) == nSplits
-
-        # compose the k folds by including all previous splits in the training set,
-        # and the next split in the test set
-        kFolds = [
-            (reduce(lambda a, b: a.union(b), all_splits[: i + 1]), all_splits[i + 1])
-            for i in range(nFolds)
-        ]
-        assert len(kFolds) == nFolds
-        for tv in kFolds:
-            assert len(tv) == 2
-
-        # trim out a gap from the training datasets, if specified
-        gap = self.getOrDefault(self.gap)
-        if gap > 0:
-            order_cols = self.getSeriesIdCols() + [self.getTimeSeriesCol()]
-            # trim each training dataset by the specified gap
-            kFolds = [
-                (
-                    (
-                        train_df.withColumn(
-                            TMP_GAP_COL,
-                            sfn.row_number().over(self._get_split_win(desc=True)),
-                        )
-                        .where(sfn.col(TMP_GAP_COL) > gap)
-                        .drop(TMP_GAP_COL)
-                        .orderBy(*order_cols)
-                    ),
-                    test_df,
-                )
-                for (train_df, test_df) in kFolds
-            ]
-
-        # return the k folds (training, test) datasets
-        return kFolds
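
Note: a sketch of how the removed TimeSeriesCrossValidator was presumably wired into a Spark ML tuning run. The estimator, evaluator, and column names are illustrative assumptions; only the timeSeriesCol/seriesIdCols/gap parameters come from the deleted code above.

    from pyspark.ml.evaluation import RegressionEvaluator
    from pyspark.ml.regression import LinearRegression
    from pyspark.ml.tuning import ParamGridBuilder
    from wedata.tempo.ml import TimeSeriesCrossValidator  # removed in 0.2.0

    lr = LinearRegression(featuresCol="features", labelCol="label")
    grid = ParamGridBuilder().addGrid(lr.regParam, [0.01, 0.1]).build()

    # Folds are built chronologically: fold i trains on splits 1..i and validates on
    # split i+1; `gap` rows are trimmed from the end of each training fold to avoid
    # leakage across the train/test boundary.
    cv = TimeSeriesCrossValidator(
        estimator=lr,
        estimatorParamMaps=grid,
        evaluator=RegressionEvaluator(labelCol="label"),
        numFolds=3,
        timeSeriesCol="event_ts",
        seriesIdCols=["device_id"],
        gap=2,
    )
    # cv_model = cv.fit(training_df)  # training_df is a hypothetical feature DataFrame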
wedata/tempo/resample.py DELETED
@@ -1,318 +0,0 @@
-from __future__ import annotations
-
-from typing import (
-    Any,
-    Callable,
-    List,
-    Optional,
-    Tuple,
-    TypedDict,
-    Union,
-    get_type_hints,
-)
-
-import pyspark.sql.functions as sfn
-from pyspark.sql import DataFrame
-from pyspark.sql.window import Window
-
-import wedata.tempo.tsdf as t_tsdf
-
-# define global frequency options
-MUSEC = "microsec"
-MS = "ms"
-SEC = "sec"
-MIN = "min"
-HR = "hr"
-DAY = "day"
-
-# define global aggregate function options for downsampling
-floor = "floor"
-min = "min"
-max = "max"
-average = "mean"
-ceiling = "ceil"
-
-
-class FreqDict(TypedDict):
-    musec: str
-    microsec: str
-    microsecond: str
-    microseconds: str
-    ms: str
-    millisecond: str
-    milliseconds: str
-    sec: str
-    second: str
-    seconds: str
-    min: str
-    minute: str
-    minutes: str
-    hr: str
-    hour: str
-    hours: str
-    day: str
-    days: str
-
-
-freq_dict: FreqDict = {
-    "musec": "microseconds",
-    "microsec": "microseconds",
-    "microsecond": "microseconds",
-    "microseconds": "microseconds",
-    "ms": "milliseconds",
-    "millisecond": "milliseconds",
-    "milliseconds": "milliseconds",
-    "sec": "seconds",
-    "second": "seconds",
-    "seconds": "seconds",
-    "min": "minutes",
-    "minute": "minutes",
-    "minutes": "minutes",
-    "hr": "hours",
-    "hour": "hours",
-    "hours": "hours",
-    "day": "days",
-    "days": "days",
-}
-
-ALLOWED_FREQ_KEYS: List[str] = list(get_type_hints(FreqDict).keys())
-
-
-def is_valid_allowed_freq_keys(val: str, literal_constant: List[str]) -> bool:
-    return val in literal_constant
-
-
-allowableFreqs = [MUSEC, MS, SEC, MIN, HR, DAY]
-allowableFuncs = [floor, min, max, average, ceiling]
-
-
-def _appendAggKey(
-    tsdf: t_tsdf.TSDF, freq: Optional[str] = None
-) -> Tuple[t_tsdf.TSDF, int | str, Any]:
-    """
-    :param tsdf: TSDF object as input
-    :param freq: frequency at which to upsample
-    :return: triple - 1) return a TSDF with a new aggregate key (called agg_key) 2) return the period for use in interpolation, 3) return the time increment (also necessary for interpolation)
-    """
-    df = tsdf.df
-    parsed_freq = checkAllowableFreq(freq)
-    period, unit = parsed_freq[0], parsed_freq[1]
-
-    agg_window = sfn.window(
-        sfn.col(tsdf.ts_col), "{} {}".format(period, freq_dict[unit])  # type: ignore[literal-required]
-    )
-
-    df = df.withColumn("agg_key", agg_window)
-
-    return (
-        t_tsdf.TSDF(df, tsdf.ts_col, partition_cols=tsdf.partitionCols),
-        period,
-        freq_dict[unit],  # type: ignore[literal-required]
-    )
-
-
-def aggregate(
-    tsdf: t_tsdf.TSDF,
-    freq: str,
-    func: Union[Callable, str],
-    metricCols: Optional[List[str]] = None,
-    prefix: Optional[str] = None,
-    fill: Optional[bool] = None,
-) -> DataFrame:
-    """
-    aggregate a data frame by a coarser timestamp than the initial TSDF ts_col
-    :param tsdf: input TSDF object
-    :param func: aggregate function
-    :param metricCols: columns used for aggregates
-    :param prefix: the metric columns with the aggregate named function
-    :param fill: upsample based on the time increment for 0s in numeric columns
-    :return: TSDF object with newly aggregated timestamp as ts_col with aggregated values
-    """
-    tsdf, period, unit = _appendAggKey(tsdf, freq)
-
-    df = tsdf.df
-
-    groupingCols = tsdf.partitionCols + ["agg_key"]
-
-    if metricCols is None:
-        metricCols = list(set(df.columns).difference(set(groupingCols + [tsdf.ts_col])))
-
-    if prefix is None:
-        prefix = ""
-    else:
-        prefix = prefix + "_"
-
-    groupingCols = [sfn.col(column) for column in groupingCols]
-
-    if func == floor:
-        metricCol = sfn.struct([tsdf.ts_col] + metricCols)
-        res = df.withColumn("struct_cols", metricCol).groupBy(groupingCols)
-        res = res.agg(sfn.min("struct_cols").alias("closest_data")).select(
-            *groupingCols, sfn.col("closest_data.*")
-        )
-        new_cols = [sfn.col(tsdf.ts_col)] + [
-            sfn.col(c).alias("{}".format(prefix) + c) for c in metricCols
-        ]
-        res = res.select(*groupingCols, *new_cols)
-    elif func == average:
-        exprs = {x: "avg" for x in metricCols}
-        res = df.groupBy(groupingCols).agg(exprs)
-        agg_metric_cls = list(
-            set(res.columns).difference(
-                set(tsdf.partitionCols + [tsdf.ts_col, "agg_key"])
-            )
-        )
-        new_cols = [
-            sfn.col(c).alias(
-                "{}".format(prefix) + (c.split("avg(")[1]).replace(")", "")
-            )
-            for c in agg_metric_cls
-        ]
-        res = res.select(*groupingCols, *new_cols)
-    elif func == min:
-        exprs = {x: "min" for x in metricCols}
-        res = df.groupBy(groupingCols).agg(exprs)
-        agg_metric_cls = list(
-            set(res.columns).difference(
-                set(tsdf.partitionCols + [tsdf.ts_col, "agg_key"])
-            )
-        )
-        new_cols = [
-            sfn.col(c).alias(
-                "{}".format(prefix) + (c.split("min(")[1]).replace(")", "")
-            )
-            for c in agg_metric_cls
-        ]
-        res = res.select(*groupingCols, *new_cols)
-    elif func == max:
-        exprs = {x: "max" for x in metricCols}
-        res = df.groupBy(groupingCols).agg(exprs)
-        agg_metric_cls = list(
-            set(res.columns).difference(
-                set(tsdf.partitionCols + [tsdf.ts_col, "agg_key"])
-            )
-        )
-        new_cols = [
-            sfn.col(c).alias(
-                "{}".format(prefix) + (c.split("max(")[1]).replace(")", "")
-            )
-            for c in agg_metric_cls
-        ]
-        res = res.select(*groupingCols, *new_cols)
-    elif func == ceiling:
-        metricCol = sfn.struct([tsdf.ts_col] + metricCols)
-        res = df.withColumn("struct_cols", metricCol).groupBy(groupingCols)
-        res = res.agg(sfn.max("struct_cols").alias("ceil_data")).select(
-            *groupingCols, sfn.col("ceil_data.*")
-        )
-        new_cols = [sfn.col(tsdf.ts_col)] + [
-            sfn.col(c).alias("{}".format(prefix) + c) for c in metricCols
-        ]
-        res = res.select(*groupingCols, *new_cols)
-
-    # aggregate by the window and drop the end time (use start time as new ts_col)
-    res = (
-        res.drop(tsdf.ts_col)
-        .withColumnRenamed("agg_key", tsdf.ts_col)
-        .withColumn(tsdf.ts_col, sfn.col(tsdf.ts_col).start)
-    )
-
-    # sort columns so they are consistent
-    non_part_cols = set(set(res.columns) - set(tsdf.partitionCols)) - set([tsdf.ts_col])
-    sel_and_sort = tsdf.partitionCols + [tsdf.ts_col] + sorted(non_part_cols)
-    res = res.select(sel_and_sort)
-
-    fillW = Window.partitionBy(tsdf.partitionCols)
-
-    imputes = (
-        res.select(
-            *tsdf.partitionCols,
-            sfn.min(tsdf.ts_col).over(fillW).alias("from"),
-            sfn.max(tsdf.ts_col).over(fillW).alias("until"),
-        )
-        .distinct()
-        .withColumn(
-            tsdf.ts_col,
-            sfn.explode(
-                sfn.expr("sequence(from, until, interval {} {})".format(period, unit))
-            ),
-        )
-        .drop("from", "until")
-    )
-
-    metrics = []
-    for col in res.dtypes:
-        if col[1] in ["long", "double", "decimal", "integer", "float", "int"]:
-            metrics.append(col[0])
-
-    if fill:
-        res = imputes.join(
-            res, tsdf.partitionCols + [tsdf.ts_col], "leftouter"
-        ).na.fill(0, metrics)
-
-    return res
-
-
-def checkAllowableFreq(freq: Optional[str]) -> Tuple[Union[int | str], str]:
-    """
-    Parses frequency and checks against allowable frequencies
-    :param freq: frequncy at which to upsample/downsample, declared in resample function
-    :return: list of parsed frequency value and time suffix
-    """
-    if not isinstance(freq, str):
-        raise TypeError(f"Invalid type for `freq` argument: {freq}.")
-
-    # TODO - return either int OR str for first argument
-    allowable_freq: Tuple[Union[int | str], str] = (
-        0,
-        "will_always_fail_if_not_overwritten",
-    )
-
-    if is_valid_allowed_freq_keys(
-        freq.lower(),
-        ALLOWED_FREQ_KEYS,
-    ):
-        allowable_freq = 1, freq
-        return allowable_freq
-
-    try:
-        periods = freq.lower().split(" ")[0].strip()
-        units = freq.lower().split(" ")[1].strip()
-    except IndexError:
-        raise ValueError(
-            "Allowable grouping frequencies are microsecond (musec), millisecond (ms), sec (second), min (minute), hr (hour), day. Reformat your frequency as <integer> <day/hour/minute/second>"
-        )
-
-    if is_valid_allowed_freq_keys(
-        units.lower(),
-        ALLOWED_FREQ_KEYS,
-    ):
-        if units.startswith(MUSEC):
-            allowable_freq = periods, MUSEC
-        elif units.startswith(MS) | units.startswith("millis"):
-            allowable_freq = periods, MS
-        elif units.startswith(SEC):
-            allowable_freq = periods, SEC
-        elif units.startswith(MIN):
-            allowable_freq = periods, MIN
-        elif units.startswith("hour") | units.startswith(HR):
-            allowable_freq = periods, "hour"
-        elif units.startswith(DAY):
-            allowable_freq = periods, DAY
-    else:
-        raise ValueError(f"Invalid value for `freq` argument: {freq}.")
-
-    return allowable_freq
-
-
-def validateFuncExists(func: Union[Callable | str]) -> None:
-    if func is None:
-        raise TypeError(
-            "Aggregate function missing. Provide one of the allowable functions: "
-            + ", ".join(allowableFuncs)
-        )
-    elif func not in allowableFuncs:
-        raise ValueError(
-            "Aggregate function is not in the valid list. Provide one of the allowable functions: "
-            + ", ".join(allowableFuncs)
-        )
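
Note: a small sketch of how the removed resample helpers fit together: checkAllowableFreq parses a frequency string into a (period, unit) pair, and aggregate then groups a TSDF into those windows with one of the allowable functions. The input DataFrame and column names are hypothetical; only the function names and parameters come from the deleted module above.

    from wedata.tempo.tsdf import TSDF   # removed in 0.2.0
    from wedata.tempo import resample    # removed in 0.2.0

    resample.checkAllowableFreq("5 minutes")  # -> ("5", "min"); period is returned as a string
    resample.checkAllowableFreq("hr")         # -> (1, "hr"); bare unit keys default to period 1

    tsdf = TSDF(raw_df, "event_ts", partition_cols=["device_id"])  # raw_df is hypothetical

    # Downsample to 15-minute buckets, taking the mean of the metric columns;
    # fill=True left-joins the full time grid per partition and fills missing numerics with 0.
    agg_df = resample.aggregate(
        tsdf,
        freq="15 min",
        func=resample.average,
        metricCols=["temperature"],
        prefix="avg",
        fill=True,
    )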