tencent-wedata-feature-engineering-dev 0.1.50__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of tencent-wedata-feature-engineering-dev might be problematic. Click here for more details.
- {tencent_wedata_feature_engineering_dev-0.1.50.dist-info → tencent_wedata_feature_engineering_dev-0.2.0.dist-info}/METADATA +10 -8
- tencent_wedata_feature_engineering_dev-0.2.0.dist-info/RECORD +46 -0
- {tencent_wedata_feature_engineering_dev-0.1.50.dist-info → tencent_wedata_feature_engineering_dev-0.2.0.dist-info}/WHEEL +1 -1
- wedata/feature_store/client.py +28 -92
- wedata/feature_store/constants/constants.py +2 -5
- wedata/feature_store/entities/feature_lookup.py +0 -17
- wedata/feature_store/entities/feature_spec.py +2 -2
- wedata/feature_store/entities/feature_table.py +1 -5
- wedata/feature_store/entities/function_info.py +4 -1
- wedata/feature_store/feature_table_client/feature_table_client.py +53 -528
- wedata/feature_store/spark_client/spark_client.py +15 -41
- wedata/feature_store/training_set_client/training_set_client.py +10 -9
- wedata/feature_store/utils/common_utils.py +4 -48
- wedata/feature_store/utils/feature_lookup_utils.py +43 -37
- wedata/feature_store/utils/feature_spec_utils.py +1 -1
- wedata/feature_store/utils/uc_utils.py +1 -1
- tencent_wedata_feature_engineering_dev-0.1.50.dist-info/RECORD +0 -66
- wedata/feature_store/cloud_sdk_client/__init__.py +0 -0
- wedata/feature_store/cloud_sdk_client/client.py +0 -108
- wedata/feature_store/cloud_sdk_client/models.py +0 -686
- wedata/feature_store/cloud_sdk_client/utils.py +0 -39
- wedata/feature_store/common/log/__init__.py +0 -0
- wedata/feature_store/common/log/logger.py +0 -40
- wedata/feature_store/common/store_config/__init__.py +0 -0
- wedata/feature_store/common/store_config/redis.py +0 -48
- wedata/feature_store/constants/engine_types.py +0 -34
- wedata/feature_store/feast_client/__init__.py +0 -0
- wedata/feature_store/feast_client/feast_client.py +0 -487
- wedata/feature_store/utils/env_utils.py +0 -108
- wedata/tempo/__init__.py +0 -0
- wedata/tempo/interpol.py +0 -448
- wedata/tempo/intervals.py +0 -1331
- wedata/tempo/io.py +0 -61
- wedata/tempo/ml.py +0 -129
- wedata/tempo/resample.py +0 -318
- wedata/tempo/tsdf.py +0 -1720
- wedata/tempo/utils.py +0 -254
- {tencent_wedata_feature_engineering_dev-0.1.50.dist-info → tencent_wedata_feature_engineering_dev-0.2.0.dist-info}/top_level.txt +0 -0
wedata/tempo/utils.py
DELETED
|
@@ -1,254 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
import logging
|
|
4
|
-
import os
|
|
5
|
-
import warnings
|
|
6
|
-
from typing import List, Optional, Union, overload
|
|
7
|
-
|
|
8
|
-
import pyspark.sql.functions as sfn
|
|
9
|
-
import wedata.tempo.resample as t_resample
|
|
10
|
-
import wedata.tempo.tsdf as t_tsdf
|
|
11
|
-
from IPython import get_ipython # type: ignore
|
|
12
|
-
from IPython.core.display import HTML # type: ignore
|
|
13
|
-
from IPython.display import display as ipydisplay # type: ignore
|
|
14
|
-
from pandas.core.frame import DataFrame as pandasDataFrame
|
|
15
|
-
from pyspark.sql.dataframe import DataFrame
|
|
16
|
-
|
|
17
|
-
logger = logging.getLogger(__name__)

# WD_HOME was chosen as the marker because it is a special environment variable
# that is available in the WeData Runtime.
#
# The flag makes sure the correct behaviour of the show and display methods is
# selected for the platform the code is running on.
IS_WEDATA = "WD_HOME" in os.environ
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
class ResampleWarning(Warning):
    """Warning category raised when the interpolate or resample-with-fill methods are called."""
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
def _is_capable_of_html_rendering() -> bool:
|
|
37
|
-
"""
|
|
38
|
-
This method returns a boolean value signifying whether the environment is a notebook environment
|
|
39
|
-
capable of rendering HTML or not.
|
|
40
|
-
"""
|
|
41
|
-
try:
|
|
42
|
-
shell = get_ipython().__class__.__name__
|
|
43
|
-
if shell == "ZMQInteractiveShell":
|
|
44
|
-
return True # Jupyter notebook or qtconsole
|
|
45
|
-
elif shell == "TerminalInteractiveShell":
|
|
46
|
-
return False # Terminal running IPython
|
|
47
|
-
else:
|
|
48
|
-
return False # Other type (?)
|
|
49
|
-
except NameError:
|
|
50
|
-
return False
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
def calculate_time_horizon(
    df: DataFrame,
    ts_col: str,
    freq: str,
    partition_cols: Optional[List[str]],
    local_freq_dict: Optional[t_resample.FreqDict] = None,
) -> None:
    """
    Emit a ``ResampleWarning`` summarising how many values a resample would produce.

    For every partition the min/max of ``ts_col`` are floored to the resample
    interval, and the number of interval steps between them (plus one) is taken
    as that partition's resampled value count. The min, max, approximate
    quartiles and total of those counts are reported in the warning text.

    :param df: Spark DataFrame holding the time series.
    :param ts_col: name of the timestamp column.
    :param freq: resample frequency string, parsed by
        ``t_resample.checkAllowableFreq``.
    :param partition_cols: columns identifying a series; ``None`` or an empty
        list is treated as one global partition.
    :param local_freq_dict: unit -> Spark interval-unit mapping; defaults to
        ``t_resample.freq_dict``.
    :raises ValueError: if the parsed frequency unit is not an allowed key.

    Note: this sets the process-wide warnings filter for ``ResampleWarning``
    to ``"always"`` so the summary is shown on every call.
    """
    # Convert Frequency using resample dictionary
    if local_freq_dict is None:
        local_freq_dict = t_resample.freq_dict
    parsed_freq = t_resample.checkAllowableFreq(freq)
    period, unit = parsed_freq[0], parsed_freq[1]
    if not t_resample.is_valid_allowed_freq_keys(
        unit,
        t_resample.ALLOWED_FREQ_KEYS,
    ):
        raise ValueError(f"Frequency {unit} not supported")
    freq = f"{period} {local_freq_dict[unit]}"  # type: ignore[literal-required]

    # Get max and min timestamp per partition.
    # partition_cols is Optional: unpacking None would raise TypeError, so fall
    # back to an empty grouping (a single global partition).
    group_cols: List[str] = partition_cols or []
    partitioned_df: DataFrame = df.groupBy(*group_cols).agg(
        sfn.max(ts_col).alias("max_ts"),
        sfn.min(ts_col).alias("min_ts"),
    )

    # Generate upscale metrics
    normalized_time_df: DataFrame = (
        partitioned_df.withColumn("min_epoch_ms", sfn.expr("unix_millis(min_ts)"))
        .withColumn("max_epoch_ms", sfn.expr("unix_millis(max_ts)"))
        .withColumn(
            "interval_ms",
            # Length of one resample interval in milliseconds (epoch + interval).
            sfn.expr(
                f"unix_millis(cast('1970-01-01 00:00:00.000+0000' as TIMESTAMP) + INTERVAL {freq})"
            ),
        )
        .withColumn(
            "rounded_min_epoch",
            sfn.expr("min_epoch_ms - (min_epoch_ms % interval_ms)"),
        )
        .withColumn(
            "rounded_max_epoch",
            sfn.expr("max_epoch_ms - (max_epoch_ms % interval_ms)"),
        )
        .withColumn("diff_ms", sfn.expr("rounded_max_epoch - rounded_min_epoch"))
        .withColumn("num_values", sfn.expr("(diff_ms/interval_ms) +1"))
    )

    # One row per partition, so counting rows in the same aggregation yields
    # the partition count without triggering a second Spark action.
    (
        min_ts,
        max_ts,
        min_value_partition,
        max_value_partition,
        p25_value_partition,
        p50_value_partition,
        p75_value_partition,
        total_values,
        partition_count,
    ) = normalized_time_df.select(
        sfn.min("min_ts"),
        sfn.max("max_ts"),
        sfn.min("num_values"),
        sfn.max("num_values"),
        sfn.percentile_approx("num_values", 0.25),
        sfn.percentile_approx("num_values", 0.5),
        sfn.percentile_approx("num_values", 0.75),
        sfn.sum("num_values"),
        sfn.count(sfn.lit(1)),
    ).first()

    warnings.simplefilter("always", ResampleWarning)
    warnings.warn(
        f"""
            Resample Metrics Warning:
                Earliest Timestamp: {min_ts}
                Latest Timestamp: {max_ts}
                No. of Unique Partitions: {partition_count}
                Resampled Min No. Values in Single a Partition: {min_value_partition}
                Resampled Max No. Values in Single a Partition: {max_value_partition}
                Resampled P25 No. Values in Single a Partition: {p25_value_partition}
                Resampled P50 No. Values in Single a Partition: {p50_value_partition}
                Resampled P75 No. Values in Single a Partition: {p75_value_partition}
                Resampled Total No. Values Across All Partitions: {total_values}
        """,
        ResampleWarning,
    )
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
@overload
def display_html(df: pandasDataFrame) -> None: ...


@overload
def display_html(df: DataFrame) -> None: ...


def display_html(df: Union[pandasDataFrame, DataFrame]) -> None:
    """
    Show a dataframe beneath an HTML style rule that keeps ``<pre>`` whitespace intact.

    A Spark DataFrame is shown untruncated in horizontal layout; for a pandas
    DataFrame the head is printed. Any other object only produces an error log
    entry (the style rule is still emitted first).
    """
    preformatted_css = HTML("<style>pre { white-space: pre !important; }</style>")
    ipydisplay(preformatted_css)

    if isinstance(df, DataFrame):
        df.show(truncate=False, vertical=False)
        return
    if isinstance(df, pandasDataFrame):
        print(df.head())
        return
    logger.error("'display' method not available for this object")
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
def display_unavailable() -> None:
    """
    Fallback used when no display method is available in the environment.

    Takes no arguments and only logs an error pointing the caller to 'show'.
    """
    message = (
        "'display' method not available in this environment. Use 'show' method instead."
    )
    logger.error(message)
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
def get_display_df(tsdf: t_tsdf.TSDF, k: int) -> DataFrame:
    """
    Return the dataframe of ``tsdf.latest(k)`` in a stable display order.

    Rows are ordered by the partition columns, then the timestamp column and,
    when the TSDF has one, the sequence column.
    """
    # Show the k most recent records per series, in order.
    sort_keys = [*tsdf.partitionCols, tsdf.ts_col]
    if tsdf.sequence_col:
        sort_keys.append(tsdf.sequence_col)
    latest_records = tsdf.latest(k).df
    return latest_records.orderBy(sort_keys)
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
@overload
def display_improvised(obj: t_tsdf.TSDF) -> None: ...


@overload
def display_improvised(obj: pandasDataFrame) -> None: ...


@overload
def display_improvised(obj: DataFrame) -> None: ...


def display_improvised(obj: Union[t_tsdf.TSDF, pandasDataFrame, DataFrame]) -> None:
    """
    Forward ``obj`` to the module-level ``method`` callable.

    A TSDF is first reduced to its display dataframe (5 latest records via
    ``get_display_df``); any other object is passed through unchanged.
    """
    if not isinstance(obj, t_tsdf.TSDF):
        method(obj)
        return
    method(get_display_df(obj, k=5))
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
@overload
def display_html_improvised(obj: Optional[t_tsdf.TSDF]) -> None: ...


@overload
def display_html_improvised(obj: Optional[pandasDataFrame]) -> None: ...


@overload
def display_html_improvised(obj: Optional[DataFrame]) -> None: ...


def display_html_improvised(
    obj: Union[t_tsdf.TSDF, pandasDataFrame, DataFrame],
) -> None:
    """
    Render ``obj`` through ``display_html``.

    A TSDF is first reduced to its display dataframe (5 latest records via
    ``get_display_df``); any other object is handed to ``display_html`` as is.
    """
    if not isinstance(obj, t_tsdf.TSDF):
        display_html(obj)
        return
    display_html(get_display_df(obj, k=5))
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
ENV_CAN_RENDER_HTML = _is_capable_of_html_rendering()

# Pick the implementation bound to the public name `display`, in priority order:
# the WeData-provided display, then HTML rendering, then an error-logging stub.
if IS_WEDATA and get_ipython() is not None and "display" in get_ipython().user_ns:
    # Under 'display' key in user_ns the original wedata display method is present
    # to know more refer: /wedata/python_shell/scripts/wd_ipykernel_launcher.py
    method = get_ipython().user_ns["display"]
    display = display_improvised
elif ENV_CAN_RENDER_HTML:
    display = display_html_improvised
else:
    display = display_unavailable  # type: ignore
|
|
238
|
-
|
|
239
|
-
"""
|
|
240
|
-
display method's equivalent for TSDF object
|
|
241
|
-
|
|
242
|
-
Example to show usage
|
|
243
|
-
---------------------
|
|
244
|
-
from pyspark.sql.functions import *
|
|
245
|
-
|
|
246
|
-
phone_accel_df = spark.read.format("csv").option("header", "true").load("dbfs:/home/tempo/Phones_accelerometer").withColumn("event_ts", (col("Arrival_Time").cast("double")/1000).cast("timestamp")).withColumn("x", col("x").cast("double")).withColumn("y", col("y").cast("double")).withColumn("z", col("z").cast("double")).withColumn("event_ts_dbl", col("event_ts").cast("double"))
|
|
247
|
-
|
|
248
|
-
from wedata.tempo.tsdf import TSDF
from wedata.tempo.utils import display
|
|
249
|
-
|
|
250
|
-
phone_accel_tsdf = TSDF(phone_accel_df, ts_col="event_ts", partition_cols = ["User"])
|
|
251
|
-
|
|
252
|
-
# Calling display method here
|
|
253
|
-
display(phone_accel_tsdf)
|
|
254
|
-
"""
|