tencent-wedata-feature-engineering-dev 0.1.49__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of tencent-wedata-feature-engineering-dev might be problematic. (The original page linked to more details here.)

Files changed (38)
  1. {tencent_wedata_feature_engineering_dev-0.1.49.dist-info → tencent_wedata_feature_engineering_dev-0.2.0.dist-info}/METADATA +10 -8
  2. tencent_wedata_feature_engineering_dev-0.2.0.dist-info/RECORD +46 -0
  3. {tencent_wedata_feature_engineering_dev-0.1.49.dist-info → tencent_wedata_feature_engineering_dev-0.2.0.dist-info}/WHEEL +1 -1
  4. wedata/feature_store/client.py +28 -92
  5. wedata/feature_store/constants/constants.py +2 -5
  6. wedata/feature_store/entities/feature_lookup.py +0 -17
  7. wedata/feature_store/entities/feature_spec.py +2 -2
  8. wedata/feature_store/entities/feature_table.py +1 -5
  9. wedata/feature_store/entities/function_info.py +4 -1
  10. wedata/feature_store/feature_table_client/feature_table_client.py +53 -528
  11. wedata/feature_store/spark_client/spark_client.py +15 -41
  12. wedata/feature_store/training_set_client/training_set_client.py +10 -9
  13. wedata/feature_store/utils/common_utils.py +4 -48
  14. wedata/feature_store/utils/feature_lookup_utils.py +43 -37
  15. wedata/feature_store/utils/feature_spec_utils.py +1 -1
  16. wedata/feature_store/utils/uc_utils.py +1 -1
  17. tencent_wedata_feature_engineering_dev-0.1.49.dist-info/RECORD +0 -66
  18. wedata/feature_store/cloud_sdk_client/__init__.py +0 -0
  19. wedata/feature_store/cloud_sdk_client/client.py +0 -108
  20. wedata/feature_store/cloud_sdk_client/models.py +0 -686
  21. wedata/feature_store/cloud_sdk_client/utils.py +0 -39
  22. wedata/feature_store/common/log/__init__.py +0 -0
  23. wedata/feature_store/common/log/logger.py +0 -40
  24. wedata/feature_store/common/store_config/__init__.py +0 -0
  25. wedata/feature_store/common/store_config/redis.py +0 -48
  26. wedata/feature_store/constants/engine_types.py +0 -34
  27. wedata/feature_store/feast_client/__init__.py +0 -0
  28. wedata/feature_store/feast_client/feast_client.py +0 -487
  29. wedata/feature_store/utils/env_utils.py +0 -108
  30. wedata/tempo/__init__.py +0 -0
  31. wedata/tempo/interpol.py +0 -448
  32. wedata/tempo/intervals.py +0 -1331
  33. wedata/tempo/io.py +0 -61
  34. wedata/tempo/ml.py +0 -129
  35. wedata/tempo/resample.py +0 -318
  36. wedata/tempo/tsdf.py +0 -1720
  37. wedata/tempo/utils.py +0 -254
  38. {tencent_wedata_feature_engineering_dev-0.1.49.dist-info → tencent_wedata_feature_engineering_dev-0.2.0.dist-info}/top_level.txt +0 -0
wedata/tempo/utils.py DELETED
@@ -1,254 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import logging
4
- import os
5
- import warnings
6
- from typing import List, Optional, Union, overload
7
-
8
- import pyspark.sql.functions as sfn
9
- import wedata.tempo.resample as t_resample
10
- import wedata.tempo.tsdf as t_tsdf
11
- from IPython import get_ipython # type: ignore
12
- from IPython.core.display import HTML # type: ignore
13
- from IPython.display import display as ipydisplay # type: ignore
14
- from pandas.core.frame import DataFrame as pandasDataFrame
15
- from pyspark.sql.dataframe import DataFrame
16
-
17
- logger = logging.getLogger(__name__)
18
- IS_WEDATA = "WD_HOME" in os.environ.keys()
19
-
20
- """
21
- WD_HOME env variable has been chosen and that's because this variable is a special variable that will be available in WeData Runtime.
22
-
23
- This constant is to ensure the correct behaviour of the show and display methods are called based on the platform
24
- where the code is running from.
25
- """
26
-
27
-
28
- class ResampleWarning(Warning):
29
- """
30
- This class is a warning that is raised when the interpolate or resample with fill methods are called.
31
- """
32
-
33
- pass
34
-
35
-
36
- def _is_capable_of_html_rendering() -> bool:
37
- """
38
- This method returns a boolean value signifying whether the environment is a notebook environment
39
- capable of rendering HTML or not.
40
- """
41
- try:
42
- shell = get_ipython().__class__.__name__
43
- if shell == "ZMQInteractiveShell":
44
- return True # Jupyter notebook or qtconsole
45
- elif shell == "TerminalInteractiveShell":
46
- return False # Terminal running IPython
47
- else:
48
- return False # Other type (?)
49
- except NameError:
50
- return False
51
-
52
-
53
- def calculate_time_horizon(
54
- df: DataFrame,
55
- ts_col: str,
56
- freq: str,
57
- partition_cols: Optional[List[str]],
58
- local_freq_dict: Optional[t_resample.FreqDict] = None,
59
- ) -> None:
60
- # Convert Frequency using resample dictionary
61
- if local_freq_dict is None:
62
- local_freq_dict = t_resample.freq_dict
63
- parsed_freq = t_resample.checkAllowableFreq(freq)
64
- period, unit = parsed_freq[0], parsed_freq[1]
65
- if t_resample.is_valid_allowed_freq_keys(
66
- unit,
67
- t_resample.ALLOWED_FREQ_KEYS,
68
- ):
69
- freq = f"{period} {local_freq_dict[unit]}" # type: ignore[literal-required]
70
- else:
71
- raise ValueError(f"Frequency {unit} not supported")
72
-
73
- # Get max and min timestamp per partition
74
- partitioned_df: DataFrame = df.groupBy(*partition_cols).agg(
75
- sfn.max(ts_col).alias("max_ts"),
76
- sfn.min(ts_col).alias("min_ts"),
77
- )
78
-
79
- # Generate upscale metrics
80
- normalized_time_df: DataFrame = (
81
- partitioned_df.withColumn("min_epoch_ms", sfn.expr("unix_millis(min_ts)"))
82
- .withColumn("max_epoch_ms", sfn.expr("unix_millis(max_ts)"))
83
- .withColumn(
84
- "interval_ms",
85
- sfn.expr(
86
- f"unix_millis(cast('1970-01-01 00:00:00.000+0000' as TIMESTAMP) + INTERVAL {freq})"
87
- ),
88
- )
89
- .withColumn(
90
- "rounded_min_epoch",
91
- sfn.expr("min_epoch_ms - (min_epoch_ms % interval_ms)"),
92
- )
93
- .withColumn(
94
- "rounded_max_epoch",
95
- sfn.expr("max_epoch_ms - (max_epoch_ms % interval_ms)"),
96
- )
97
- .withColumn("diff_ms", sfn.expr("rounded_max_epoch - rounded_min_epoch"))
98
- .withColumn("num_values", sfn.expr("(diff_ms/interval_ms) +1"))
99
- )
100
-
101
- (
102
- min_ts,
103
- max_ts,
104
- min_value_partition,
105
- max_value_partition,
106
- p25_value_partition,
107
- p50_value_partition,
108
- p75_value_partition,
109
- total_values,
110
- ) = normalized_time_df.select(
111
- sfn.min("min_ts"),
112
- sfn.max("max_ts"),
113
- sfn.min("num_values"),
114
- sfn.max("num_values"),
115
- sfn.percentile_approx("num_values", 0.25),
116
- sfn.percentile_approx("num_values", 0.5),
117
- sfn.percentile_approx("num_values", 0.75),
118
- sfn.sum("num_values"),
119
- ).first()
120
-
121
- warnings.simplefilter("always", ResampleWarning)
122
- warnings.warn(
123
- f"""
124
- Resample Metrics Warning:
125
- Earliest Timestamp: {min_ts}
126
- Latest Timestamp: {max_ts}
127
- No. of Unique Partitions: {normalized_time_df.count()}
128
- Resampled Min No. Values in Single a Partition: {min_value_partition}
129
- Resampled Max No. Values in Single a Partition: {max_value_partition}
130
- Resampled P25 No. Values in Single a Partition: {p25_value_partition}
131
- Resampled P50 No. Values in Single a Partition: {p50_value_partition}
132
- Resampled P75 No. Values in Single a Partition: {p75_value_partition}
133
- Resampled Total No. Values Across All Partitions: {total_values}
134
- """,
135
- ResampleWarning,
136
- )
137
-
138
-
139
- @overload
140
- def display_html(df: pandasDataFrame) -> None: ...
141
-
142
-
143
- @overload
144
- def display_html(df: DataFrame) -> None: ...
145
-
146
-
147
- def display_html(df: Union[pandasDataFrame, DataFrame]) -> None:
148
- """
149
- Display method capable of displaying the dataframe in a formatted HTML structured output
150
- """
151
- ipydisplay(HTML("<style>pre { white-space: pre !important; }</style>"))
152
- if isinstance(df, DataFrame):
153
- df.show(truncate=False, vertical=False)
154
- elif isinstance(df, pandasDataFrame):
155
- print(df.head())
156
- else:
157
- logger.error("'display' method not available for this object")
158
-
159
-
160
- def display_unavailable() -> None:
161
- """
162
- This method is called when display method is not available in the environment.
163
- """
164
- logger.error(
165
- "'display' method not available in this environment. Use 'show' method instead."
166
- )
167
-
168
-
169
- def get_display_df(tsdf: t_tsdf.TSDF, k: int) -> DataFrame:
170
- # let's show the n most recent records per series, in order:
171
- orderCols = tsdf.partitionCols.copy()
172
- orderCols.append(tsdf.ts_col)
173
- if tsdf.sequence_col:
174
- orderCols.append(tsdf.sequence_col)
175
- return tsdf.latest(k).df.orderBy(orderCols)
176
-
177
-
178
- @overload
179
- def display_improvised(obj: t_tsdf.TSDF) -> None: ...
180
-
181
-
182
- @overload
183
- def display_improvised(obj: pandasDataFrame) -> None: ...
184
-
185
-
186
- @overload
187
- def display_improvised(obj: DataFrame) -> None: ...
188
-
189
-
190
- def display_improvised(obj: Union[t_tsdf.TSDF, pandasDataFrame, DataFrame]) -> None:
191
- if isinstance(obj, t_tsdf.TSDF):
192
- method(get_display_df(obj, k=5))
193
- else:
194
- method(obj)
195
-
196
-
197
- @overload
198
- def display_html_improvised(obj: Optional[t_tsdf.TSDF]) -> None: ...
199
-
200
-
201
- @overload
202
- def display_html_improvised(obj: Optional[pandasDataFrame]) -> None: ...
203
-
204
-
205
- @overload
206
- def display_html_improvised(obj: Optional[DataFrame]) -> None: ...
207
-
208
-
209
- def display_html_improvised(
210
- obj: Union[t_tsdf.TSDF, pandasDataFrame, DataFrame],
211
- ) -> None:
212
- if isinstance(obj, t_tsdf.TSDF):
213
- display_html(get_display_df(obj, k=5))
214
- else:
215
- display_html(obj)
216
-
217
-
218
- ENV_CAN_RENDER_HTML = _is_capable_of_html_rendering()
219
-
220
- if (
221
- IS_WEDATA
222
- and not (get_ipython() is None)
223
- and ("display" in get_ipython().user_ns.keys())
224
- ):
225
- method = get_ipython().user_ns["display"]
226
-
227
- # Under 'display' key in user_ns the original wedata display method is present
228
- # to know more refer: /wedata/python_shell/scripts/wd_ipykernel_launcher.py
229
-
230
- display = display_improvised
231
-
232
- elif ENV_CAN_RENDER_HTML:
233
-
234
- display = display_html_improvised
235
-
236
- else:
237
- display = display_unavailable # type: ignore
238
-
239
- """
240
- display method's equivalent for TSDF object
241
-
242
- Example to show usage
243
- ---------------------
244
- from pyspark.sql.functions import *
245
-
246
- phone_accel_df = spark.read.format("csv").option("header", "true").load("dbfs:/home/tempo/Phones_accelerometer").withColumn("event_ts", (col("Arrival_Time").cast("double")/1000).cast("timestamp")).withColumn("x", col("x").cast("double")).withColumn("y", col("y").cast("double")).withColumn("z", col("z").cast("double")).withColumn("event_ts_dbl", col("event_ts").cast("double"))
247
-
248
- from tempo import *
249
-
250
- phone_accel_tsdf = TSDF(phone_accel_df, ts_col="event_ts", partition_cols = ["User"])
251
-
252
- # Calling display method here
253
- display(phone_accel_tsdf)
254
- """