tencent-wedata-feature-engineering-dev 0.1.49__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff compares the contents of two publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
Potentially problematic release.
This version of tencent-wedata-feature-engineering-dev might be problematic.
- {tencent_wedata_feature_engineering_dev-0.1.49.dist-info → tencent_wedata_feature_engineering_dev-0.2.0.dist-info}/METADATA +10 -8
- tencent_wedata_feature_engineering_dev-0.2.0.dist-info/RECORD +46 -0
- {tencent_wedata_feature_engineering_dev-0.1.49.dist-info → tencent_wedata_feature_engineering_dev-0.2.0.dist-info}/WHEEL +1 -1
- wedata/feature_store/client.py +28 -92
- wedata/feature_store/constants/constants.py +2 -5
- wedata/feature_store/entities/feature_lookup.py +0 -17
- wedata/feature_store/entities/feature_spec.py +2 -2
- wedata/feature_store/entities/feature_table.py +1 -5
- wedata/feature_store/entities/function_info.py +4 -1
- wedata/feature_store/feature_table_client/feature_table_client.py +53 -528
- wedata/feature_store/spark_client/spark_client.py +15 -41
- wedata/feature_store/training_set_client/training_set_client.py +10 -9
- wedata/feature_store/utils/common_utils.py +4 -48
- wedata/feature_store/utils/feature_lookup_utils.py +43 -37
- wedata/feature_store/utils/feature_spec_utils.py +1 -1
- wedata/feature_store/utils/uc_utils.py +1 -1
- tencent_wedata_feature_engineering_dev-0.1.49.dist-info/RECORD +0 -66
- wedata/feature_store/cloud_sdk_client/__init__.py +0 -0
- wedata/feature_store/cloud_sdk_client/client.py +0 -108
- wedata/feature_store/cloud_sdk_client/models.py +0 -686
- wedata/feature_store/cloud_sdk_client/utils.py +0 -39
- wedata/feature_store/common/log/__init__.py +0 -0
- wedata/feature_store/common/log/logger.py +0 -40
- wedata/feature_store/common/store_config/__init__.py +0 -0
- wedata/feature_store/common/store_config/redis.py +0 -48
- wedata/feature_store/constants/engine_types.py +0 -34
- wedata/feature_store/feast_client/__init__.py +0 -0
- wedata/feature_store/feast_client/feast_client.py +0 -487
- wedata/feature_store/utils/env_utils.py +0 -108
- wedata/tempo/__init__.py +0 -0
- wedata/tempo/interpol.py +0 -448
- wedata/tempo/intervals.py +0 -1331
- wedata/tempo/io.py +0 -61
- wedata/tempo/ml.py +0 -129
- wedata/tempo/resample.py +0 -318
- wedata/tempo/tsdf.py +0 -1720
- wedata/tempo/utils.py +0 -254
- {tencent_wedata_feature_engineering_dev-0.1.49.dist-info → tencent_wedata_feature_engineering_dev-0.2.0.dist-info}/top_level.txt +0 -0
wedata/tempo/io.py
DELETED
@@ -1,61 +0,0 @@
```python
from __future__ import annotations

import logging
from collections import deque
from typing import Optional

import pyspark.sql.functions as sfn
import wedata.tempo.tsdf as t_tsdf
from pyspark.sql import SparkSession
from pyspark.sql.utils import ParseException

logger = logging.getLogger(__name__)


def write(
    tsdf: t_tsdf.TSDF,
    spark: SparkSession,
    tabName: str,
    optimizationCols: Optional[list[str]] = None,
) -> None:
    """
    param: tsdf: input TSDF object to write
    param: tabName Delta output table name
    param: optimizationCols list of columns to optimize on (time)
    """
    # hilbert curves more evenly distribute performance for querying multiple columns for Delta tables
    spark.conf.set("spark.wedata.io.skipping.mdc.curve", "hilbert")

    df = tsdf.df
    ts_col = tsdf.ts_col
    partitionCols = tsdf.partitionCols

    view_df = df.withColumn("event_dt", sfn.to_date(sfn.col(ts_col))).withColumn(
        "event_time",
        sfn.translate(sfn.split(sfn.col(ts_col).cast("string"), " ")[1], ":", "").cast(
            "double"
        ),
    )
    view_cols = deque(view_df.columns)
    view_cols.rotate(1)
    view_df = view_df.select(*list(view_cols))

    # Use replaceWhere instead of overwrite mode
    writer = view_df.write.format("delta").partitionBy("event_dt")
    writer = writer.option("replaceWhere", "true")
    writer.saveAsTable(tabName)

    if optimizationCols:
        try:
            spark.sql(
                "optimize {} zorder by {}".format(
                    tabName,
                    "(" + ",".join(partitionCols + optimizationCols + [ts_col]) + ")",
                )
            )
        except ParseException as e:
            logger.error(
                "Delta optimizations attempted, but was not successful.\nError: {}".format(
                    e
                )
            )
```
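For orientation, a minimal usage sketch (not taken from the package) of how the removed `write` helper was typically invoked in 0.1.49. The table and column names below are hypothetical, and the `TSDF` constructor signature is inferred from its use elsewhere in `wedata.tempo`.

```python
from pyspark.sql import SparkSession

from wedata.tempo.tsdf import TSDF
from wedata.tempo.io import write

spark = SparkSession.builder.getOrCreate()
events = spark.read.table("demo.raw_events")  # assumed source table

# assumed TSDF constructor signature, mirroring calls inside wedata.tempo
tsdf = TSDF(events, ts_col="event_ts", partition_cols=["device_id"])

# Saves the TSDF as a Delta table partitioned by event_dt; because
# optimizationCols is given, an OPTIMIZE ... ZORDER BY statement runs afterwards.
write(tsdf, spark, tabName="demo.events_delta", optimizationCols=["reading_value"])
```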
wedata/tempo/ml.py
DELETED
@@ -1,129 +0,0 @@
```python
from typing import Any, List, Tuple
from functools import reduce

from pyspark.sql import DataFrame
from pyspark.sql.window import Window, WindowSpec
from pyspark.sql import functions as sfn

from pyspark.ml.param import Param, Params, TypeConverters
from pyspark.ml.tuning import CrossValidator


TMP_SPLIT_COL = "__tmp_split_col"
TMP_GAP_COL = "__tmp_gap_row"


def _get_parent_params():
    dummy = Params()
    dummy.uid = "undefined"
    return dummy


class TimeSeriesCrossValidator(CrossValidator):
    # some additional parameters

    timeSeriesCol: Param[str] = Param(
        _get_parent_params(),
        "timeSeriesCol",
        "The name of the time series column",
        typeConverter=TypeConverters.toString,
    )
    seriesIdCols: Param[List[str]] = Param(
        _get_parent_params(),
        "seriesIdCols",
        "The name of the series id columns",
        typeConverter=TypeConverters.toListString,
    )
    gap: Param[int] = Param(
        _get_parent_params(),
        "gap",
        "The gap between training and test set",
        typeConverter=TypeConverters.toInt,
    )

    def __init__(
        self,
        timeSeriesCol: str = "event_ts",
        seriesIdCols: List[str] = [],
        gap: int = 0,
        **other_kwargs: Any,
    ) -> None:
        super(TimeSeriesCrossValidator, self).__init__(**other_kwargs)
        self._setDefault(timeSeriesCol="event_ts", seriesIdCols=[], gap=0)
        self._set(timeSeriesCol=timeSeriesCol, seriesIdCols=seriesIdCols, gap=gap)

    def getTimeSeriesCol(self) -> str:
        return self.getOrDefault(self.timeSeriesCol)

    def getSeriesIdCols(self) -> List[str]:
        return self.getOrDefault(self.seriesIdCols)

    def getGap(self) -> int:
        return self.getOrDefault(self.gap)

    def setTimeSeriesCol(self, value: str) -> "TimeSeriesCrossValidator":
        return self._set(timeSeriesCol=value)

    def setSeriesIdCols(self, value: List[str]) -> "TimeSeriesCrossValidator":
        return self._set(seriesIdCols=value)

    def setGap(self, value: int) -> "TimeSeriesCrossValidator":
        return self._set(gap=value)

    def _get_split_win(self, desc: bool = False) -> WindowSpec:
        ts_col_expr = sfn.col(self.getTimeSeriesCol())
        if desc:
            ts_col_expr = ts_col_expr.desc()
        win = Window.orderBy(ts_col_expr)
        series_id_cols = self.getSeriesIdCols()
        if series_id_cols and len(series_id_cols) > 0:
            win = win.partitionBy(*series_id_cols)
        return win

    def _kFold(self, dataset: DataFrame) -> List[Tuple[DataFrame, DataFrame]]:
        nFolds = self.getOrDefault(self.numFolds)
        nSplits = nFolds + 1

        # split the data into nSplits subsets by timeseries order
        split_df = dataset.withColumn(
            TMP_SPLIT_COL, sfn.ntile(nSplits).over(self._get_split_win())
        )
        all_splits = [
            split_df.filter(sfn.col(TMP_SPLIT_COL) == i).drop(TMP_SPLIT_COL)
            for i in range(1, nSplits + 1)
        ]
        assert len(all_splits) == nSplits

        # compose the k folds by including all previous splits in the training set,
        # and the next split in the test set
        kFolds = [
            (reduce(lambda a, b: a.union(b), all_splits[: i + 1]), all_splits[i + 1])
            for i in range(nFolds)
        ]
        assert len(kFolds) == nFolds
        for tv in kFolds:
            assert len(tv) == 2

        # trim out a gap from the training datasets, if specified
        gap = self.getOrDefault(self.gap)
        if gap > 0:
            order_cols = self.getSeriesIdCols() + [self.getTimeSeriesCol()]
            # trim each training dataset by the specified gap
            kFolds = [
                (
                    (
                        train_df.withColumn(
                            TMP_GAP_COL,
                            sfn.row_number().over(self._get_split_win(desc=True)),
                        )
                        .where(sfn.col(TMP_GAP_COL) > gap)
                        .drop(TMP_GAP_COL)
                        .orderBy(*order_cols)
                    ),
                    test_df,
                )
                for (train_df, test_df) in kFolds
            ]

        # return the k folds (training, test) datasets
        return kFolds
```
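For orientation, a minimal sketch (not taken from the package) of how the removed `TimeSeriesCrossValidator` could be wired into a standard Spark ML tuning run; the estimator, evaluator, and column names below are hypothetical.

```python
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import ParamGridBuilder

from wedata.tempo.ml import TimeSeriesCrossValidator

lr = LinearRegression(featuresCol="features", labelCol="label")
grid = ParamGridBuilder().addGrid(lr.regParam, [0.01, 0.1]).build()

cv = TimeSeriesCrossValidator(
    estimator=lr,
    estimatorParamMaps=grid,
    evaluator=RegressionEvaluator(labelCol="label"),
    numFolds=3,
    timeSeriesCol="event_ts",    # column used to order the chronological splits
    seriesIdCols=["device_id"],  # each series is ordered within its own partition
    gap=1,                       # drop the trailing row(s) of each training fold
)
# cv.fit(train_df) would build expanding-window folds via the overridden _kFold.
```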
wedata/tempo/resample.py
DELETED
@@ -1,318 +0,0 @@
```python
from __future__ import annotations

from typing import (
    Any,
    Callable,
    List,
    Optional,
    Tuple,
    TypedDict,
    Union,
    get_type_hints,
)

import pyspark.sql.functions as sfn
from pyspark.sql import DataFrame
from pyspark.sql.window import Window

import wedata.tempo.tsdf as t_tsdf

# define global frequency options
MUSEC = "microsec"
MS = "ms"
SEC = "sec"
MIN = "min"
HR = "hr"
DAY = "day"

# define global aggregate function options for downsampling
floor = "floor"
min = "min"
max = "max"
average = "mean"
ceiling = "ceil"


class FreqDict(TypedDict):
    musec: str
    microsec: str
    microsecond: str
    microseconds: str
    ms: str
    millisecond: str
    milliseconds: str
    sec: str
    second: str
    seconds: str
    min: str
    minute: str
    minutes: str
    hr: str
    hour: str
    hours: str
    day: str
    days: str


freq_dict: FreqDict = {
    "musec": "microseconds",
    "microsec": "microseconds",
    "microsecond": "microseconds",
    "microseconds": "microseconds",
    "ms": "milliseconds",
    "millisecond": "milliseconds",
    "milliseconds": "milliseconds",
    "sec": "seconds",
    "second": "seconds",
    "seconds": "seconds",
    "min": "minutes",
    "minute": "minutes",
    "minutes": "minutes",
    "hr": "hours",
    "hour": "hours",
    "hours": "hours",
    "day": "days",
    "days": "days",
}

ALLOWED_FREQ_KEYS: List[str] = list(get_type_hints(FreqDict).keys())


def is_valid_allowed_freq_keys(val: str, literal_constant: List[str]) -> bool:
    return val in literal_constant


allowableFreqs = [MUSEC, MS, SEC, MIN, HR, DAY]
allowableFuncs = [floor, min, max, average, ceiling]


def _appendAggKey(
    tsdf: t_tsdf.TSDF, freq: Optional[str] = None
) -> Tuple[t_tsdf.TSDF, int | str, Any]:
    """
    :param tsdf: TSDF object as input
    :param freq: frequency at which to upsample
    :return: triple - 1) return a TSDF with a new aggregate key (called agg_key) 2) return the period for use in interpolation, 3) return the time increment (also necessary for interpolation)
    """
    df = tsdf.df
    parsed_freq = checkAllowableFreq(freq)
    period, unit = parsed_freq[0], parsed_freq[1]

    agg_window = sfn.window(
        sfn.col(tsdf.ts_col), "{} {}".format(period, freq_dict[unit])  # type: ignore[literal-required]
    )

    df = df.withColumn("agg_key", agg_window)

    return (
        t_tsdf.TSDF(df, tsdf.ts_col, partition_cols=tsdf.partitionCols),
        period,
        freq_dict[unit],  # type: ignore[literal-required]
    )


def aggregate(
    tsdf: t_tsdf.TSDF,
    freq: str,
    func: Union[Callable, str],
    metricCols: Optional[List[str]] = None,
    prefix: Optional[str] = None,
    fill: Optional[bool] = None,
) -> DataFrame:
    """
    aggregate a data frame by a coarser timestamp than the initial TSDF ts_col
    :param tsdf: input TSDF object
    :param func: aggregate function
    :param metricCols: columns used for aggregates
    :param prefix: the metric columns with the aggregate named function
    :param fill: upsample based on the time increment for 0s in numeric columns
    :return: TSDF object with newly aggregated timestamp as ts_col with aggregated values
    """
    tsdf, period, unit = _appendAggKey(tsdf, freq)

    df = tsdf.df

    groupingCols = tsdf.partitionCols + ["agg_key"]

    if metricCols is None:
        metricCols = list(set(df.columns).difference(set(groupingCols + [tsdf.ts_col])))

    if prefix is None:
        prefix = ""
    else:
        prefix = prefix + "_"

    groupingCols = [sfn.col(column) for column in groupingCols]

    if func == floor:
        metricCol = sfn.struct([tsdf.ts_col] + metricCols)
        res = df.withColumn("struct_cols", metricCol).groupBy(groupingCols)
        res = res.agg(sfn.min("struct_cols").alias("closest_data")).select(
            *groupingCols, sfn.col("closest_data.*")
        )
        new_cols = [sfn.col(tsdf.ts_col)] + [
            sfn.col(c).alias("{}".format(prefix) + c) for c in metricCols
        ]
        res = res.select(*groupingCols, *new_cols)
    elif func == average:
        exprs = {x: "avg" for x in metricCols}
        res = df.groupBy(groupingCols).agg(exprs)
        agg_metric_cls = list(
            set(res.columns).difference(
                set(tsdf.partitionCols + [tsdf.ts_col, "agg_key"])
            )
        )
        new_cols = [
            sfn.col(c).alias(
                "{}".format(prefix) + (c.split("avg(")[1]).replace(")", "")
            )
            for c in agg_metric_cls
        ]
        res = res.select(*groupingCols, *new_cols)
    elif func == min:
        exprs = {x: "min" for x in metricCols}
        res = df.groupBy(groupingCols).agg(exprs)
        agg_metric_cls = list(
            set(res.columns).difference(
                set(tsdf.partitionCols + [tsdf.ts_col, "agg_key"])
            )
        )
        new_cols = [
            sfn.col(c).alias(
                "{}".format(prefix) + (c.split("min(")[1]).replace(")", "")
            )
            for c in agg_metric_cls
        ]
        res = res.select(*groupingCols, *new_cols)
    elif func == max:
        exprs = {x: "max" for x in metricCols}
        res = df.groupBy(groupingCols).agg(exprs)
        agg_metric_cls = list(
            set(res.columns).difference(
                set(tsdf.partitionCols + [tsdf.ts_col, "agg_key"])
            )
        )
        new_cols = [
            sfn.col(c).alias(
                "{}".format(prefix) + (c.split("max(")[1]).replace(")", "")
            )
            for c in agg_metric_cls
        ]
        res = res.select(*groupingCols, *new_cols)
    elif func == ceiling:
        metricCol = sfn.struct([tsdf.ts_col] + metricCols)
        res = df.withColumn("struct_cols", metricCol).groupBy(groupingCols)
        res = res.agg(sfn.max("struct_cols").alias("ceil_data")).select(
            *groupingCols, sfn.col("ceil_data.*")
        )
        new_cols = [sfn.col(tsdf.ts_col)] + [
            sfn.col(c).alias("{}".format(prefix) + c) for c in metricCols
        ]
        res = res.select(*groupingCols, *new_cols)

    # aggregate by the window and drop the end time (use start time as new ts_col)
    res = (
        res.drop(tsdf.ts_col)
        .withColumnRenamed("agg_key", tsdf.ts_col)
        .withColumn(tsdf.ts_col, sfn.col(tsdf.ts_col).start)
    )

    # sort columns so they are consistent
    non_part_cols = set(set(res.columns) - set(tsdf.partitionCols)) - set([tsdf.ts_col])
    sel_and_sort = tsdf.partitionCols + [tsdf.ts_col] + sorted(non_part_cols)
    res = res.select(sel_and_sort)

    fillW = Window.partitionBy(tsdf.partitionCols)

    imputes = (
        res.select(
            *tsdf.partitionCols,
            sfn.min(tsdf.ts_col).over(fillW).alias("from"),
            sfn.max(tsdf.ts_col).over(fillW).alias("until"),
        )
        .distinct()
        .withColumn(
            tsdf.ts_col,
            sfn.explode(
                sfn.expr("sequence(from, until, interval {} {})".format(period, unit))
            ),
        )
        .drop("from", "until")
    )

    metrics = []
    for col in res.dtypes:
        if col[1] in ["long", "double", "decimal", "integer", "float", "int"]:
            metrics.append(col[0])

    if fill:
        res = imputes.join(
            res, tsdf.partitionCols + [tsdf.ts_col], "leftouter"
        ).na.fill(0, metrics)

    return res


def checkAllowableFreq(freq: Optional[str]) -> Tuple[Union[int | str], str]:
    """
    Parses frequency and checks against allowable frequencies
    :param freq: frequncy at which to upsample/downsample, declared in resample function
    :return: list of parsed frequency value and time suffix
    """
    if not isinstance(freq, str):
        raise TypeError(f"Invalid type for `freq` argument: {freq}.")

    # TODO - return either int OR str for first argument
    allowable_freq: Tuple[Union[int | str], str] = (
        0,
        "will_always_fail_if_not_overwritten",
    )

    if is_valid_allowed_freq_keys(
        freq.lower(),
        ALLOWED_FREQ_KEYS,
    ):
        allowable_freq = 1, freq
        return allowable_freq

    try:
        periods = freq.lower().split(" ")[0].strip()
        units = freq.lower().split(" ")[1].strip()
    except IndexError:
        raise ValueError(
            "Allowable grouping frequencies are microsecond (musec), millisecond (ms), sec (second), min (minute), hr (hour), day. Reformat your frequency as <integer> <day/hour/minute/second>"
        )

    if is_valid_allowed_freq_keys(
        units.lower(),
        ALLOWED_FREQ_KEYS,
    ):
        if units.startswith(MUSEC):
            allowable_freq = periods, MUSEC
        elif units.startswith(MS) | units.startswith("millis"):
            allowable_freq = periods, MS
        elif units.startswith(SEC):
            allowable_freq = periods, SEC
        elif units.startswith(MIN):
            allowable_freq = periods, MIN
        elif units.startswith("hour") | units.startswith(HR):
            allowable_freq = periods, "hour"
        elif units.startswith(DAY):
            allowable_freq = periods, DAY
    else:
        raise ValueError(f"Invalid value for `freq` argument: {freq}.")

    return allowable_freq


def validateFuncExists(func: Union[Callable | str]) -> None:
    if func is None:
        raise TypeError(
            "Aggregate function missing. Provide one of the allowable functions: "
            + ", ".join(allowableFuncs)
        )
    elif func not in allowableFuncs:
        raise ValueError(
            "Aggregate function is not in the valid list. Provide one of the allowable functions: "
            + ", ".join(allowableFuncs)
        )
```
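For orientation, a minimal sketch (not taken from the package) of downsampling with the removed resample helpers; the source table, column names, and the `TSDF` constructor signature are assumptions.

```python
from pyspark.sql import SparkSession

from wedata.tempo.tsdf import TSDF
from wedata.tempo import resample

spark = SparkSession.builder.getOrCreate()
readings = spark.read.table("demo.sensor_readings")  # assumed source table

# assumed TSDF constructor signature, mirroring calls in _appendAggKey above
tsdf = TSDF(readings, ts_col="event_ts", partition_cols=["sensor_id"])

# checkAllowableFreq parses "5 minutes" into ('5', 'min');
# bare keys such as "min" come back as (1, "min").
print(resample.checkAllowableFreq("5 minutes"))

resample.validateFuncExists(resample.average)  # the "mean"/avg branch
hourly = resample.aggregate(
    tsdf,
    freq="1 hour",
    func=resample.average,
    metricCols=["temperature"],
    prefix="avg",   # output column becomes avg_temperature
    fill=True,      # left-join against a full hourly grid and zero-fill numeric gaps
)
```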