tencent-wedata-feature-engineering-dev 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions released to one of the supported registries, and is provided for informational purposes only.

This release of tencent-wedata-feature-engineering-dev has been flagged as potentially problematic.
Files changed (64)
  1. tencent_wedata_feature_engineering_dev-0.1.0.dist-info/METADATA +19 -0
  2. tencent_wedata_feature_engineering_dev-0.1.0.dist-info/RECORD +64 -0
  3. tencent_wedata_feature_engineering_dev-0.1.0.dist-info/WHEEL +5 -0
  4. tencent_wedata_feature_engineering_dev-0.1.0.dist-info/top_level.txt +1 -0
  5. wedata/__init__.py +9 -0
  6. wedata/feature_store/__init__.py +0 -0
  7. wedata/feature_store/client.py +462 -0
  8. wedata/feature_store/cloud_sdk_client/__init__.py +0 -0
  9. wedata/feature_store/cloud_sdk_client/client.py +86 -0
  10. wedata/feature_store/cloud_sdk_client/models.py +686 -0
  11. wedata/feature_store/cloud_sdk_client/utils.py +32 -0
  12. wedata/feature_store/common/__init__.py +0 -0
  13. wedata/feature_store/common/protos/__init__.py +0 -0
  14. wedata/feature_store/common/protos/feature_store_pb2.py +49 -0
  15. wedata/feature_store/common/store_config/__init__.py +0 -0
  16. wedata/feature_store/common/store_config/redis.py +48 -0
  17. wedata/feature_store/constants/__init__.py +0 -0
  18. wedata/feature_store/constants/constants.py +59 -0
  19. wedata/feature_store/constants/engine_types.py +34 -0
  20. wedata/feature_store/entities/__init__.py +0 -0
  21. wedata/feature_store/entities/column_info.py +138 -0
  22. wedata/feature_store/entities/environment_variables.py +55 -0
  23. wedata/feature_store/entities/feature.py +53 -0
  24. wedata/feature_store/entities/feature_column_info.py +72 -0
  25. wedata/feature_store/entities/feature_function.py +55 -0
  26. wedata/feature_store/entities/feature_lookup.py +200 -0
  27. wedata/feature_store/entities/feature_spec.py +489 -0
  28. wedata/feature_store/entities/feature_spec_constants.py +25 -0
  29. wedata/feature_store/entities/feature_table.py +111 -0
  30. wedata/feature_store/entities/feature_table_info.py +49 -0
  31. wedata/feature_store/entities/function_info.py +90 -0
  32. wedata/feature_store/entities/on_demand_column_info.py +57 -0
  33. wedata/feature_store/entities/source_data_column_info.py +24 -0
  34. wedata/feature_store/entities/training_set.py +135 -0
  35. wedata/feature_store/feast_client/__init__.py +0 -0
  36. wedata/feature_store/feast_client/feast_client.py +482 -0
  37. wedata/feature_store/feature_table_client/__init__.py +0 -0
  38. wedata/feature_store/feature_table_client/feature_table_client.py +969 -0
  39. wedata/feature_store/mlflow_model.py +17 -0
  40. wedata/feature_store/spark_client/__init__.py +0 -0
  41. wedata/feature_store/spark_client/spark_client.py +289 -0
  42. wedata/feature_store/training_set_client/__init__.py +0 -0
  43. wedata/feature_store/training_set_client/training_set_client.py +572 -0
  44. wedata/feature_store/utils/__init__.py +0 -0
  45. wedata/feature_store/utils/common_utils.py +352 -0
  46. wedata/feature_store/utils/env_utils.py +86 -0
  47. wedata/feature_store/utils/feature_lookup_utils.py +564 -0
  48. wedata/feature_store/utils/feature_spec_utils.py +286 -0
  49. wedata/feature_store/utils/feature_utils.py +73 -0
  50. wedata/feature_store/utils/on_demand_utils.py +107 -0
  51. wedata/feature_store/utils/schema_utils.py +117 -0
  52. wedata/feature_store/utils/signature_utils.py +202 -0
  53. wedata/feature_store/utils/topological_sort.py +158 -0
  54. wedata/feature_store/utils/training_set_utils.py +579 -0
  55. wedata/feature_store/utils/uc_utils.py +296 -0
  56. wedata/feature_store/utils/validation_utils.py +79 -0
  57. wedata/tempo/__init__.py +0 -0
  58. wedata/tempo/interpol.py +448 -0
  59. wedata/tempo/intervals.py +1331 -0
  60. wedata/tempo/io.py +61 -0
  61. wedata/tempo/ml.py +129 -0
  62. wedata/tempo/resample.py +318 -0
  63. wedata/tempo/tsdf.py +1720 -0
  64. wedata/tempo/utils.py +254 -0
wedata/feature_store/entities/feature_spec_constants.py
@@ -0,0 +1,25 @@
+ # Field names from feature_spec.proto.
+ SOURCE_DATA_COLUMN_INFO = "source_data_column_info"
+ FEATURE_COLUMN_INFO = "feature_column_info"
+ ON_DEMAND_COLUMN_INFO = "on_demand_column_info"
+ INPUT_COLUMNS = "input_columns"
+ NAME = "name"
+ OUTPUT_NAME = "output_name"
+ INPUT_TABLES = "input_tables"
+ TABLE_NAME = "table_name"
+ TABLE_ID = "table_id"
+ SERIALIZATION_VERSION = "serialization_version"
+ INPUT_FUNCTIONS = "input_functions"
+ INCLUDE = "include"
+ DATA_TYPE = "data_type"
+ TOPOLOGICAL_ORDERING = "topological_ordering"
+ UDF_NAME = "udf_name"
+ INPUT_BINDINGS = "input_bindings"
+ PARAMETER = "parameter"
+ BOUND_TO = "bound_to"
+
+ # FeatureSpec YAML source field and allowed values
+ SOURCE = "source"
+ TRAINING_DATA = "training_data"
+ FEATURE_STORE = "feature_store"
+ ON_DEMAND_FEATURE = "on_demand_feature"
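The constants above name the fields used when a FeatureSpec is serialized. A minimal sketch of reading such a document with them; the file name and exact YAML layout are assumptions, not taken from this package:

    import yaml

    from wedata.feature_store.entities import feature_spec_constants as fsc

    with open("feature_spec.yaml") as f:  # hypothetical spec file
        raw = yaml.safe_load(f)

    # Each input column entry is keyed by its column-info type.
    for column in raw.get(fsc.INPUT_COLUMNS, []):
        if fsc.SOURCE_DATA_COLUMN_INFO in column:
            print("source column:", column[fsc.SOURCE_DATA_COLUMN_INFO][fsc.NAME])
        elif fsc.FEATURE_COLUMN_INFO in column:
            print("feature column:", column[fsc.FEATURE_COLUMN_INFO][fsc.OUTPUT_NAME])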
wedata/feature_store/entities/feature_table.py
@@ -0,0 +1,111 @@
+ from typing import Dict
+
+
+ class FeatureTable:
+     """
+     Feature table entity.
+     """
+
+     def __init__(
+         self,
+         name,
+         table_id,
+         description,
+         primary_keys,
+         partition_columns,
+         features,
+         creation_timestamp=None,
+         online_stores=None,
+         notebook_producers=None,
+         job_producers=None,
+         table_data_sources=None,
+         path_data_sources=None,
+         custom_data_sources=None,
+         timestamp_keys=None,
+         tags=None,
+     ):
+         """Initialize a FeatureTable object.
+
+         :param name: Table name
+         :param table_id: Table ID
+         :param description: Description
+         :param primary_keys: List of primary key columns
+         :param partition_columns: List of partition columns
+         :param features: List of feature columns
+         :param creation_timestamp: Creation timestamp (optional)
+         :param online_stores: Online store configuration (optional)
+         :param notebook_producers: List of notebook producers (optional)
+         :param job_producers: List of job producers (optional)
+         :param table_data_sources: List of table data sources (optional)
+         :param path_data_sources: List of path data sources (optional)
+         :param custom_data_sources: List of custom data sources (optional)
+         :param timestamp_keys: List of timestamp key columns (optional)
+         :param tags: Dictionary of tags (optional)
+         """
+         self.name = name
+         self.table_id = table_id
+         self.description = description
+         self.primary_keys = primary_keys
+         self.partition_columns = partition_columns
+         self.features = features
+         self.creation_timestamp = creation_timestamp
+         self.online_stores = online_stores if online_stores is not None else []
+         self.notebook_producers = (
+             notebook_producers if notebook_producers is not None else []
+         )
+         self.job_producers = job_producers if job_producers is not None else []
+         self.table_data_sources = (
+             table_data_sources if table_data_sources is not None else []
+         )
+         self.path_data_sources = (
+             path_data_sources if path_data_sources is not None else []
+         )
+         self.custom_data_sources = (
+             custom_data_sources if custom_data_sources is not None else []
+         )
+         self.timestamp_keys = timestamp_keys if timestamp_keys is not None else []
+         self._tags = tags
+
+     def __str__(self):
+         """
+         Return a string representation of this feature table instance, covering
+         its key attributes: table name, ID, description, primary keys,
+         partition columns, feature count, timestamp keys, creation timestamp,
+         data source counts, and tag count.
+         """
+         if self.description and len(self.description) > 50:
+             desc = self.description[:50] + "..."
+         else:
+             desc = self.description
+         return (
+             f"FeatureTable(\n"
+             f"  name='{self.name}',\n"
+             f"  table_id='{self.table_id}',\n"
+             f"  description='{desc}',\n"
+             f"  primary_keys={self.primary_keys},\n"
+             f"  partition_columns={self.partition_columns},\n"
+             f"  features={len(self.features)},\n"
+             f"  timestamp_keys={self.timestamp_keys},\n"
+             f"  creation_timestamp={self.creation_timestamp},\n"
+             f"  data_sources=[table:{len(self.table_data_sources)} "
+             f"path:{len(self.path_data_sources)} custom:{len(self.custom_data_sources)}],\n"
+             f"  tags={len(self._tags) if self._tags else 0}\n"
+             f")"
+         )
+
+     @property
+     def tags(self) -> Dict[str, str]:
+         """
+         Get the tags associated with the feature table.
+
+         :return: a dictionary of all tags associated with the feature table as key/value pairs
+         """
+         if self._tags is None:
+             # self._tags should be an empty dictionary when no tags are set;
+             # None means tags were never fetched for this instance.
+             raise ValueError(
+                 "Internal error: tags have not been fetched for this FeatureTable instance"
+             )
+         return self._tags
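A minimal construction sketch for the entity above; all field values are illustrative:

    from wedata.feature_store.entities.feature_table import FeatureTable

    table = FeatureTable(
        name="ml_db.user_features",          # illustrative table name
        table_id="tbl-123",
        description="Per-user aggregate features",
        primary_keys=["user_id"],
        partition_columns=["dt"],
        features=["user_id", "dt", "clicks_7d"],
        timestamp_keys=["dt"],
        tags={"owner": "recsys"},
    )
    print(table)        # multi-line summary produced by __str__
    print(table.tags)   # {'owner': 'recsys'}; raises ValueError if tags were never set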
wedata/feature_store/entities/feature_table_info.py
@@ -0,0 +1,49 @@
+ from typing import Optional
+
+ from wedata.feature_store.common.protos import feature_store_pb2
+
+
+ class FeatureTableInfo:
+     def __init__(
+         self, table_name: str, table_id: str, lookback_window: Optional[float] = None
+     ):
+         if not table_name:
+             raise ValueError("table_name must be non-empty.")
+         if not table_id:
+             raise ValueError("table_id must be non-empty.")
+         self._table_name = table_name
+         self._table_id = table_id
+         self._lookback_window = lookback_window
+
+     @property
+     def table_name(self):
+         return self._table_name
+
+     @property
+     def table_id(self):
+         return self._table_id
+
+     @property
+     def lookback_window(self):
+         return self._lookback_window
+
+     @classmethod
+     def from_proto(cls, feature_table_info_proto):
+         # A proto lookback_window of 0 is treated as "no lookback window".
+         lookback_window = feature_table_info_proto.lookback_window or None
+         return cls(
+             table_name=feature_table_info_proto.table_name,
+             table_id=feature_table_info_proto.table_id,
+             lookback_window=lookback_window,
+         )
+
+     def to_proto(self):
+         return feature_store_pb2.FeatureTableInfo(
+             table_name=self.table_name,
+             table_id=self.table_id,
+             lookback_window=self.lookback_window,
+         )
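A quick round-trip sketch, assuming the bundled feature_store_pb2 module imports cleanly in your environment (protobuf treats a None keyword argument as "field not set"):

    from wedata.feature_store.entities.feature_table_info import FeatureTableInfo

    info = FeatureTableInfo(table_name="ml_db.user_features", table_id="tbl-123")
    restored = FeatureTableInfo.from_proto(info.to_proto())
    assert restored.lookback_window is None  # the proto default of 0 maps back to None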
wedata/feature_store/entities/function_info.py
@@ -0,0 +1,90 @@
+ from typing import List, Optional
+
+ from wedata.feature_store.common.protos import feature_store_pb2
+
+
+ class FunctionParameterInfo:
+     def __init__(self, name: str, type_text: str):
+         self._name = name
+         self._type_text = type_text
+
+     @property
+     def name(self) -> str:
+         return self._name
+
+     @property
+     def type_text(self) -> str:
+         return self._type_text
+
+     @classmethod
+     def from_dict(cls, function_parameter_info_json):
+         return FunctionParameterInfo(
+             function_parameter_info_json["name"],
+             function_parameter_info_json["type_text"],
+         )
+
+
+ class FunctionInfo:
+     """
+     Helper entity class that exposes properties in GetFunction's response JSON as attributes.
+     https://docs.databricks.com/api-explorer/workspace/functions/get
+
+     Note: empty fields (e.g. when there are 0 input parameters) are not included in the response JSON.
+     """
+
+     # Python UDFs have external_language = "Python"
+     PYTHON = "Python"
+
+     def __init__(
+         self,
+         full_name: str,
+         input_params: List[FunctionParameterInfo],
+         routine_definition: Optional[str],
+         external_language: Optional[str],
+     ):
+         self._full_name = full_name
+         self._input_params = input_params
+         self._routine_definition = routine_definition
+         self._external_language = external_language
+
+     @property
+     def full_name(self) -> str:
+         return self._full_name
+
+     @property
+     def input_params(self) -> List[FunctionParameterInfo]:
+         return self._input_params
+
+     @property
+     def routine_definition(self) -> Optional[str]:
+         return self._routine_definition
+
+     @property
+     def external_language(self) -> Optional[str]:
+         """
+         Field is None if the language is SQL (not an external language).
+         """
+         return self._external_language
+
+     @classmethod
+     def from_dict(cls, function_info_json):
+         input_params = function_info_json.get("input_params", {}).get("parameters", [])
+         return FunctionInfo(
+             full_name=function_info_json["full_name"],
+             input_params=[FunctionParameterInfo.from_dict(p) for p in input_params],
+             routine_definition=function_info_json.get("routine_definition", None),
+             external_language=function_info_json.get("external_language", None),
+         )
+
+     @classmethod
+     def from_proto(cls, function_info_proto):
+         # The proto only carries the UDF name; the remaining required
+         # constructor arguments are filled with empty defaults.
+         return cls(
+             full_name=function_info_proto.udf_name,
+             input_params=[],
+             routine_definition=None,
+             external_language=None,
+         )
+
+     def to_proto(self):
+         return feature_store_pb2.FunctionInfo(
+             full_name=self.full_name,
+             input_params=self.input_params,
+             routine_definition=self.routine_definition,
+             external_language=self.external_language,
+         )
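A small sketch of building a FunctionInfo from a GetFunction-style JSON payload; the payload values are illustrative:

    from wedata.feature_store.entities.function_info import FunctionInfo

    payload = {
        "full_name": "catalog.schema.compute_age",
        "input_params": {"parameters": [{"name": "birth_date", "type_text": "date"}]},
        "routine_definition": "return (date.today() - birth_date).days // 365",
        "external_language": "Python",
    }
    fn = FunctionInfo.from_dict(payload)
    print(fn.full_name, [p.name for p in fn.input_params])
    # catalog.schema.compute_age ['birth_date']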
wedata/feature_store/entities/on_demand_column_info.py
@@ -0,0 +1,57 @@
+ from typing import Dict
+
+ from wedata.feature_store.common.protos import feature_store_pb2
+
+
+ class OnDemandColumnInfo:
+     def __init__(
+         self,
+         udf_name: str,
+         input_bindings: Dict[str, str],
+         output_name: str,
+     ):
+         if not udf_name:
+             raise ValueError("udf_name must be non-empty.")
+         if not output_name:
+             raise ValueError("output_name must be non-empty.")
+
+         self._udf_name = udf_name
+         self._input_bindings = input_bindings
+         self._output_name = output_name
+
+     @property
+     def udf_name(self) -> str:
+         return self._udf_name
+
+     @property
+     def input_bindings(self) -> Dict[str, str]:
+         """
+         input_bindings is serialized as a repeated InputBinding proto field.
+         """
+         return self._input_bindings
+
+     @property
+     def output_name(self) -> str:
+         return self._output_name
+
+     @classmethod
+     def from_proto(cls, on_demand_column_info_proto):
+         input_bindings_dict = {
+             input_binding.parameter: input_binding.bound_to
+             for input_binding in on_demand_column_info_proto.input_bindings
+         }
+         return OnDemandColumnInfo(
+             udf_name=on_demand_column_info_proto.udf_name,
+             input_bindings=input_bindings_dict,
+             output_name=on_demand_column_info_proto.output_name,
+         )
+
+     def to_proto(self):
+         input_bindings_list = [
+             feature_store_pb2.InputBinding(parameter=k, bound_to=v)
+             for k, v in self.input_bindings.items()
+         ]
+         return feature_store_pb2.OnDemandColumnInfo(
+             udf_name=self.udf_name,
+             input_bindings=input_bindings_list,
+             output_name=self.output_name,
+         )
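A round-trip sketch for the class above, again assuming the bundled feature_store_pb2 module loads; the UDF and column names are illustrative:

    from wedata.feature_store.entities.on_demand_column_info import OnDemandColumnInfo

    odci = OnDemandColumnInfo(
        udf_name="catalog.schema.compute_age",              # UDF parameter -> source column
        input_bindings={"birth_date": "customer_birth_date"},
        output_name="age",
    )
    restored = OnDemandColumnInfo.from_proto(odci.to_proto())
    assert restored.input_bindings == {"birth_date": "customer_birth_date"}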
wedata/feature_store/entities/source_data_column_info.py
@@ -0,0 +1,24 @@
+ from wedata.feature_store.common.protos import feature_store_pb2
+
+
+ class SourceDataColumnInfo:
+     def __init__(self, name: str):
+         if not name:
+             raise ValueError("name must be non-empty.")
+         self._name = name
+
+     @property
+     def name(self):
+         return self._name
+
+     @property
+     def output_name(self) -> str:
+         """
+         This field does not exist in the proto and is provided for convenience.
+         """
+         return self._name
+
+     @classmethod
+     def from_proto(cls, source_data_column_info_proto):
+         return cls(name=source_data_column_info_proto.name)
+
+     def to_proto(self):
+         return feature_store_pb2.SourceDataColumnInfo(name=self._name)
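A one-line usage sketch; the column name is illustrative:

    from wedata.feature_store.entities.source_data_column_info import SourceDataColumnInfo

    col = SourceDataColumnInfo("user_id")
    assert col.output_name == "user_id"  # output_name is a convenience alias for name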
wedata/feature_store/entities/training_set.py
@@ -0,0 +1,135 @@
+ from typing import Dict, List, Optional
+
+ from pyspark.sql import DataFrame
+
+ from wedata.feature_store.entities.feature_table import FeatureTable
+ from wedata.feature_store.entities.function_info import FunctionInfo
+ from wedata.feature_store.utils.feature_lookup_utils import (
+     join_feature_data_if_not_overridden,
+ )
+
+ from wedata.feature_store.entities.feature_spec import FeatureSpec
+ from wedata.feature_store.utils.feature_spec_utils import (
+     COLUMN_INFO_TYPE_FEATURE,
+     COLUMN_INFO_TYPE_ON_DEMAND,
+     COLUMN_INFO_TYPE_SOURCE,
+     get_feature_execution_groups,
+ )
+ from wedata.feature_store.utils.on_demand_utils import apply_functions_if_not_overridden
+
+
+ class TrainingSet:
+     """
+     .. note::
+
+         Aliases: `!databricks.feature_engineering.training_set.TrainingSet`, `!databricks.feature_store.training_set.TrainingSet`
+
+     Class that defines :obj:`TrainingSet` objects.
+
+     .. note::
+
+         The :class:`TrainingSet` constructor should not be called directly. Instead,
+         call :meth:`create_training_set() <databricks.feature_engineering.client.FeatureEngineeringClient.create_training_set>`.
+     """
+
+     def __init__(
+         self,
+         feature_spec: FeatureSpec,
+         df: DataFrame,
+         labels: List[str],
+         feature_table_metadata_map: Dict[str, FeatureTable],
+         feature_table_data_map: Dict[str, DataFrame],
+         uc_function_infos: Dict[str, FunctionInfo],
+         use_spark_native_join: Optional[bool] = False,
+     ):
+         """Initialize a :obj:`TrainingSet` object."""
+         assert isinstance(
+             labels, list
+         ), f"Expected type `list` for argument `labels`. Got '{labels}' with type '{type(labels)}'."
+
+         self._feature_spec = feature_spec
+         self._df = df
+         self._labels = labels
+         self._feature_table_metadata_map = feature_table_metadata_map
+         self._feature_table_data_map = feature_table_data_map
+         self._uc_function_infos = uc_function_infos
+         self._use_spark_native_join = use_spark_native_join
+         # Perform basic validations and resolve FeatureSpec and label column data types.
+         self._validate_and_inject_dtypes()
+         self._label_data_types = {
+             name: data_type for name, data_type in df.dtypes if name in labels
+         }
+
+     @property
+     def feature_spec(self) -> FeatureSpec:
+         """Return the :obj:`FeatureSpec` of this training set."""
+         return self._feature_spec
+
+     def _augment_df(self) -> DataFrame:
+         """
+         Internal helper to augment the DataFrame with the feature lookups and on-demand features specified in the FeatureSpec.
+         Does not drop excluded columns, and does not overwrite columns that already exist.
+         The returned column order is df.columns + feature lookups + on-demand features.
+         """
+         execution_groups = get_feature_execution_groups(
+             self.feature_spec, self._df.columns
+         )
+
+         result_df = self._df
+         # Iterate over all levels and types of DAG nodes in the FeatureSpec and execute them.
+         for execution_group in execution_groups:
+             if execution_group.type == COLUMN_INFO_TYPE_SOURCE:
+                 continue
+             if execution_group.type == COLUMN_INFO_TYPE_FEATURE:
+                 # Apply FeatureLookups
+                 result_df = join_feature_data_if_not_overridden(
+                     feature_spec=self.feature_spec,
+                     df=result_df,
+                     features_to_join=execution_group.features,
+                     feature_table_metadata_map=self._feature_table_metadata_map,
+                     feature_table_data_map=self._feature_table_data_map,
+                     use_spark_native_join=self._use_spark_native_join,
+                 )
+             elif execution_group.type == COLUMN_INFO_TYPE_ON_DEMAND:
+                 # Apply all on-demand UDFs
+                 result_df = apply_functions_if_not_overridden(
+                     df=result_df,
+                     functions_to_apply=execution_group.features,
+                     uc_function_infos=self._uc_function_infos,
+                 )
+             else:
+                 # This should never be reached.
+                 raise ValueError(
+                     f"Unknown feature execution type: {execution_group.type}"
+                 )
+         return result_df
+
+     def _validate_and_inject_dtypes(self):
+         """
+         Performs validations through _augment_df (e.g. the Delta table exists, Delta and feature table dtypes match),
+         then injects the resulting DataFrame dtypes into the FeatureSpec.
+         """
+         augmented_df = self._augment_df()
+         augmented_df_dtypes = {column: dtype for column, dtype in augmented_df.dtypes}
+
+         # Inject the result DataFrame column types into the respective ColumnInfo
+         for ci in self.feature_spec.column_infos:
+             ci._data_type = augmented_df_dtypes[ci.output_name]
+
+     def load_df(self) -> DataFrame:
+         """
+         Load a :class:`DataFrame <pyspark.sql.DataFrame>`.
+
+         Return a :class:`DataFrame <pyspark.sql.DataFrame>` for training.
+
+         The returned :class:`DataFrame <pyspark.sql.DataFrame>` has columns specified
+         in the ``feature_spec`` and ``labels`` parameters provided
+         in :meth:`create_training_set() <databricks.feature_engineering.client.FeatureEngineeringClient.create_training_set>`.
+
+         :return:
+             A :class:`DataFrame <pyspark.sql.DataFrame>` for training
+         """
+         augmented_df = self._augment_df()
+         # Return only the included columns, in the order defined by the FeatureSpec, followed by the labels
+         included_columns = [
+             ci.output_name for ci in self.feature_spec.column_infos if ci.include
+         ] + self._labels
+         return augmented_df.select(included_columns)
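As the docstring notes, a TrainingSet is normally obtained from the feature store client rather than constructed directly. A usage sketch, assuming this package's client exposes a Databricks-style create_training_set method and that FeatureLookup follows the same parameter convention (the client construction, method, and argument names here are assumptions, as are the table and column names):

    from wedata.feature_store.entities.feature_lookup import FeatureLookup

    feature_lookups = [
        FeatureLookup(
            table_name="ml_db.user_features",  # illustrative feature table
            lookup_key="user_id",
        )
    ]
    training_set = client.create_training_set(  # `client` construction omitted
        df=label_df,                            # DataFrame with join keys and the label
        feature_lookups=feature_lookups,
        label="clicked",
    )
    training_df = training_set.load_df()  # joins features; keeps included columns + label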