wedata-feature-engineering 0.1.5__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. wedata/__init__.py +1 -1
  2. wedata/feature_store/client.py +113 -41
  3. wedata/feature_store/constants/constants.py +19 -0
  4. wedata/feature_store/entities/column_info.py +4 -4
  5. wedata/feature_store/entities/feature_lookup.py +5 -1
  6. wedata/feature_store/entities/feature_spec.py +46 -46
  7. wedata/feature_store/entities/feature_table.py +42 -99
  8. wedata/feature_store/entities/training_set.py +13 -12
  9. wedata/feature_store/feature_table_client/feature_table_client.py +85 -30
  10. wedata/feature_store/spark_client/spark_client.py +30 -56
  11. wedata/feature_store/training_set_client/training_set_client.py +209 -38
  12. wedata/feature_store/utils/common_utils.py +213 -3
  13. wedata/feature_store/utils/feature_lookup_utils.py +6 -6
  14. wedata/feature_store/utils/feature_spec_utils.py +6 -6
  15. wedata/feature_store/utils/feature_utils.py +5 -5
  16. wedata/feature_store/utils/on_demand_utils.py +107 -0
  17. wedata/feature_store/utils/schema_utils.py +1 -1
  18. wedata/feature_store/utils/signature_utils.py +205 -0
  19. wedata/feature_store/utils/training_set_utils.py +18 -19
  20. wedata/feature_store/utils/uc_utils.py +1 -1
  21. {wedata_feature_engineering-0.1.5.dist-info → wedata_feature_engineering-0.1.6.dist-info}/METADATA +1 -1
  22. wedata_feature_engineering-0.1.6.dist-info/RECORD +43 -0
  23. feature_store/__init__.py +0 -6
  24. feature_store/client.py +0 -169
  25. feature_store/constants/__init__.py +0 -0
  26. feature_store/constants/constants.py +0 -28
  27. feature_store/entities/__init__.py +0 -0
  28. feature_store/entities/column_info.py +0 -117
  29. feature_store/entities/data_type.py +0 -92
  30. feature_store/entities/environment_variables.py +0 -55
  31. feature_store/entities/feature.py +0 -53
  32. feature_store/entities/feature_column_info.py +0 -64
  33. feature_store/entities/feature_function.py +0 -55
  34. feature_store/entities/feature_lookup.py +0 -179
  35. feature_store/entities/feature_spec.py +0 -454
  36. feature_store/entities/feature_spec_constants.py +0 -25
  37. feature_store/entities/feature_table.py +0 -164
  38. feature_store/entities/feature_table_info.py +0 -40
  39. feature_store/entities/function_info.py +0 -184
  40. feature_store/entities/on_demand_column_info.py +0 -44
  41. feature_store/entities/source_data_column_info.py +0 -21
  42. feature_store/entities/training_set.py +0 -134
  43. feature_store/feature_table_client/__init__.py +0 -0
  44. feature_store/feature_table_client/feature_table_client.py +0 -313
  45. feature_store/spark_client/__init__.py +0 -0
  46. feature_store/spark_client/spark_client.py +0 -286
  47. feature_store/training_set_client/__init__.py +0 -0
  48. feature_store/training_set_client/training_set_client.py +0 -196
  49. feature_store/utils/__init__.py +0 -0
  50. feature_store/utils/common_utils.py +0 -96
  51. feature_store/utils/feature_lookup_utils.py +0 -570
  52. feature_store/utils/feature_spec_utils.py +0 -286
  53. feature_store/utils/feature_utils.py +0 -73
  54. feature_store/utils/schema_utils.py +0 -117
  55. feature_store/utils/topological_sort.py +0 -158
  56. feature_store/utils/training_set_utils.py +0 -580
  57. feature_store/utils/uc_utils.py +0 -281
  58. feature_store/utils/utils.py +0 -252
  59. feature_store/utils/validation_utils.py +0 -55
  60. wedata/feature_store/utils/utils.py +0 -252
  61. wedata_feature_engineering-0.1.5.dist-info/RECORD +0 -79
  62. {wedata_feature_engineering-0.1.5.dist-info → wedata_feature_engineering-0.1.6.dist-info}/WHEEL +0 -0
  63. {wedata_feature_engineering-0.1.5.dist-info → wedata_feature_engineering-0.1.6.dist-info}/top_level.txt +0 -0
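The headline change in 0.1.6 is that the library's modules move from a top-level feature_store package into the wedata namespace: the old top-level package is deleted below, and the new module paths appear in the 0.1.6 RECORD. As an illustration only, assuming the public class names are unchanged, a downstream import would change roughly like this:

    # 0.1.5 (old top-level package, removed in 0.1.6)
    # from feature_store.client import FeatureStoreClient
    # from feature_store.entities.feature_lookup import FeatureLookup

    # 0.1.6 (same modules, now namespaced under wedata)
    from wedata.feature_store.client import FeatureStoreClient
    from wedata.feature_store.entities.feature_lookup import FeatureLookup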
@@ -4,21 +4,19 @@ from typing import Dict, List, Optional, Set

 from pyspark.sql import DataFrame

-from feature_store.entities.column_info import ColumnInfo
-from feature_store.entities.feature import Feature
-from feature_store.entities.feature_column_info import FeatureColumnInfo
-from feature_store.entities.feature_lookup import FeatureLookup
-from feature_store.entities.feature_spec import FeatureSpec
-from feature_store.entities.feature_table import FeatureTable
-from feature_store.entities.feature_table_info import FeatureTableInfo
-from feature_store.entities.function_info import FunctionInfo
-from feature_store.entities.on_demand_column_info import OnDemandColumnInfo
-from feature_store.entities.source_data_column_info import SourceDataColumnInfo
-
-from feature_store.constants.constants import _ERROR, _WARN
-
-from feature_store.utils import common_utils, validation_utils, uc_utils, schema_utils, utils
-from feature_store.utils.feature_spec_utils import assign_topological_ordering
+from wedata.feature_store.entities.column_info import ColumnInfo
+from wedata.feature_store.entities.feature import Feature
+from wedata.feature_store.entities.feature_column_info import FeatureColumnInfo
+from wedata.feature_store.entities.feature_lookup import FeatureLookup
+from wedata.feature_store.entities.feature_spec import FeatureSpec
+from wedata.feature_store.entities.feature_table import FeatureTable
+from wedata.feature_store.entities.feature_table_info import FeatureTableInfo
+from wedata.feature_store.entities.function_info import FunctionInfo
+from wedata.feature_store.entities.on_demand_column_info import OnDemandColumnInfo
+from wedata.feature_store.entities.source_data_column_info import SourceDataColumnInfo
+
+from wedata.feature_store.utils import common_utils, validation_utils, uc_utils, schema_utils
+from wedata.feature_store.utils.feature_spec_utils import assign_topological_ordering

 _logger = logging.getLogger(__name__)

@@ -99,9 +97,9 @@ def _explode_feature_lookup(
         FeatureColumnInfo(
             table_name=feature_lookup.table_name,
             feature_name=feature_name,
-            lookup_key=utils.as_list(feature_lookup.lookup_key),
+            lookup_key=common_utils.as_list(feature_lookup.lookup_key),
             output_name=(feature_lookup._get_output_name(feature_name)),
-            timestamp_lookup_key=utils.as_list(
+            timestamp_lookup_key=common_utils.as_list(
                 feature_lookup.timestamp_lookup_key, default=[]
             ),
         )
@@ -280,13 +278,14 @@ def get_table_metadata(
     feature_table_metadata_map = get_feature_table_metadata_for_tables(
         spark_client, table_names=table_names
     )
+
     feature_table_data_map = load_feature_data_for_tables(
         spark_client, table_names=table_names
     )
     return _FeatureTableMetadata(
         feature_table_features_map,
         feature_table_metadata_map,
-        feature_table_data_map,
+        feature_table_data_map
     )


@@ -515,7 +514,7 @@ def build_feature_spec(
         for table_name in consumed_table_names
     ]
     function_infos = [
-        FunctionInfo(udf_name=udf_name) for udf_name in consumed_udf_names
+        FunctionInfo(full_name=udf_name) for udf_name in consumed_udf_names
     ]

     # Build FeatureSpec
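Besides the import rewrites, this hunk renames the FunctionInfo constructor keyword from udf_name to full_name. A hedged sketch of what a call site outside this module would look like after the rename (the UDF name below is a placeholder):

    from wedata.feature_store.entities.function_info import FunctionInfo

    consumed_udf_names = ["my_catalog.my_schema.my_udf"]  # placeholder name

    # 0.1.5: FunctionInfo(udf_name=udf_name)
    # 0.1.6:
    function_infos = [FunctionInfo(full_name=udf_name) for udf_name in consumed_udf_names]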
@@ -2,7 +2,7 @@ import copy
 import re
 from typing import Optional, Set

-from feature_store.entities.feature_spec import FeatureSpec
+from wedata.feature_store.entities.feature_spec import FeatureSpec

 SINGLE_LEVEL_NAMESPACE_REGEX = r"^[^\. \/\x00-\x1F\x7F]+$"
 TWO_LEVEL_NAMESPACE_REGEX = r"^[^\. \/\x00-\x1F\x7F]+(\.[^\. \/\x00-\x1F\x7F]+)$"
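Only the import path changes in this uc_utils hunk; the two regexes shown as context appear to distinguish single-level names from two-level (schema.table) names. A quick self-contained check of their behavior (the example names are invented):

    import re

    SINGLE_LEVEL_NAMESPACE_REGEX = r"^[^\. \/\x00-\x1F\x7F]+$"
    TWO_LEVEL_NAMESPACE_REGEX = r"^[^\. \/\x00-\x1F\x7F]+(\.[^\. \/\x00-\x1F\x7F]+)$"

    assert re.match(SINGLE_LEVEL_NAMESPACE_REGEX, "user_features")
    assert re.match(TWO_LEVEL_NAMESPACE_REGEX, "my_schema.user_features")
    assert not re.match(TWO_LEVEL_NAMESPACE_REGEX, "user_features")  # no schema part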
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: wedata-feature-engineering
-Version: 0.1.5
+Version: 0.1.6
 Summary: Wedata Feature Engineering Library
 Home-page:
 Author: meahqian
@@ -0,0 +1,43 @@
+wedata/__init__.py,sha256=26GwucASB9KsmU109sN-VKotEKp1WZYQDGP0wgWZrzY,101
+wedata/feature_store/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+wedata/feature_store/client.py,sha256=7a-9C8HIBHnQNQD6I4W3UtBQwkJE8G-Q7N24zydjpkY,8100
+wedata/feature_store/constants/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+wedata/feature_store/constants/constants.py,sha256=b4tgcSt66YIq0Fg7pMbqvbqPOI77Cz8znLVZ4ihUKss,1479
+wedata/feature_store/entities/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+wedata/feature_store/entities/column_info.py,sha256=sU_WD9U0qse0M2speowLY30qSE6j4_57PuvtdPkwiFY,4192
+wedata/feature_store/entities/data_type.py,sha256=VpHS6Fr3TphQQ8NbAcEnDJ-8eOZV6ivYuWxv3pAM2RM,3394
+wedata/feature_store/entities/environment_variables.py,sha256=ZEFml5H9MQuzBKM074mUrFYu-Sga4Knmxqiwpke2WGc,1679
+wedata/feature_store/entities/feature.py,sha256=wX8fTBlJq3GYdj9rrBDCY3kFgcVBBAiOOZdxEhnQkNQ,1241
+wedata/feature_store/entities/feature_column_info.py,sha256=-TGxRafYUaNKe0YzHus2XbfRaVrMv7pcffMdbtTT4nA,2031
+wedata/feature_store/entities/feature_function.py,sha256=R17INrCE-U_Uj9KLbFz69aYlOkTETTwQHMMo470F4lQ,1865
+wedata/feature_store/entities/feature_lookup.py,sha256=YjYz8kLq42doFbgPzpmm1r3GPhPYkLsIss4H71x-KAo,8009
+wedata/feature_store/entities/feature_spec.py,sha256=60RUOOe9y_Xsd1I3xqq4NZYnaox4_jjwSyGRTKXLiIw,20041
+wedata/feature_store/entities/feature_spec_constants.py,sha256=YWDBfRiNDe6fUJFUBo3V4WYg2xsljoPAE-ZejfFZCgM,785
+wedata/feature_store/entities/feature_table.py,sha256=dHZHSDPD4HJ2XanLVIrVTkaCYUeqZ6eWEpA0d3YO71g,4010
+wedata/feature_store/entities/feature_table_info.py,sha256=2vUaVdW_jw1dRAlmJWvBRueuMeuqWu_NYB9SlxLI7Uw,1126
+wedata/feature_store/entities/function_info.py,sha256=l0kmiq2R_QNfSMJ7y0xZohlMiemgYSr1dN5vzV8ijIs,7314
+wedata/feature_store/entities/on_demand_column_info.py,sha256=Eh5ieaj1TxC7DG6ipBZzH2ZyY0bwkLrDOkuZjgYr4gY,1297
+wedata/feature_store/entities/source_data_column_info.py,sha256=a9jQOJvehwDIrKPwsP6W9YRBSPNK2nZYypE6-p80CwA,542
+wedata/feature_store/entities/training_set.py,sha256=ylt1h6Z_xU8hKYvnvd80CeewTGSN68-_kvFpoliwH7s,5679
+wedata/feature_store/feature_table_client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+wedata/feature_store/feature_table_client/feature_table_client.py,sha256=nrnY3FLQnMhW1BzByDjjfU89hirgaKlg2l2tAfcjvyM,12138
+wedata/feature_store/spark_client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+wedata/feature_store/spark_client/spark_client.py,sha256=DBCYjLsFrIVRvLErTNyfLIHRul3v0y9uZIY2JR1N92s,10323
+wedata/feature_store/training_set_client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+wedata/feature_store/training_set_client/training_set_client.py,sha256=gHeZU0rvvUcyNTfroXD3LAinFPdhDpnwTOIWj6z84Tc,15102
+wedata/feature_store/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+wedata/feature_store/utils/common_utils.py,sha256=rmGXfBoEyDMlfqd7lxpEmKJaLoQ-d-ufWpAcE8nSHqA,10009
+wedata/feature_store/utils/feature_lookup_utils.py,sha256=da6ULwf5D-FRVpZoNyag1rroBfq_XPSH4a3uEMB_8io,22372
+wedata/feature_store/utils/feature_spec_utils.py,sha256=GFwc-WT6nN1tnal5F2c0bgVDRhH-YW58S0GaHBPZEsQ,11624
+wedata/feature_store/utils/feature_utils.py,sha256=KKq28bVB_lCuhnR9Hk6JegJBOVgcelWlvrRM-F9onkA,2796
+wedata/feature_store/utils/on_demand_utils.py,sha256=pazZRG5c0Se08MV_inBddIeX4Q9xlVN_H9SC_WK3xzs,4260
+wedata/feature_store/utils/schema_utils.py,sha256=y6EYY1pUxjVg6MP4C7avdW8ZEBBaDo1YTV2CmPF4i8o,4491
+wedata/feature_store/utils/signature_utils.py,sha256=_4_mo1Qlzklp-JrISMS3Jv89MPbaH6rz_cRDvJqFNXM,7957
+wedata/feature_store/utils/topological_sort.py,sha256=ebzKxmxeCLk9seB1zR0ASCGXsZsa-DjxJeTc4KUadtg,6475
+wedata/feature_store/utils/training_set_utils.py,sha256=MYsPZS1d9HKswHgjgxD8K7H9N3dWPyyTTx20Mkp4PVU,22497
+wedata/feature_store/utils/uc_utils.py,sha256=A-W8Cd8yvTmAMEWaHeWmGmcIDMvUtjAfx2G2x_di1QE,10774
+wedata/feature_store/utils/validation_utils.py,sha256=FslvrNs3kstqvM6THScLOluEE6O9RWlDrD9xiihTzlw,1735
+wedata_feature_engineering-0.1.6.dist-info/METADATA,sha256=orxNq_A9F8FcSWYn6wTY1pQ2KtqNVIREvGziUnNa1ys,493
+wedata_feature_engineering-0.1.6.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+wedata_feature_engineering-0.1.6.dist-info/top_level.txt,sha256=Xa0v1rh__RvfVTVDirW5r5UBKg7ZO_iuTeXfp8MNo2A,7
+wedata_feature_engineering-0.1.6.dist-info/RECORD,,
feature_store/__init__.py DELETED
@@ -1,6 +0,0 @@
-"""
-WeData Feature Engineering
-A toolkit for automated feature engineering
-"""
-
-__version__ = "0.1.0"
feature_store/client.py DELETED
@@ -1,169 +0,0 @@
-"""
-Wedata FeatureStoreClient Python implementation
-"""
-
-from __future__ import annotations
-from typing import Union, List, Dict, Optional, Any
-from pyspark.sql import DataFrame, SparkSession
-from pyspark.sql.streaming import StreamingQuery
-from pyspark.sql.types import StructType
-
-from feature_store.constants.constants import APPEND, DEFAULT_WRITE_STREAM_TRIGGER
-from feature_store.entities.feature_function import FeatureFunction
-from feature_store.entities.feature_lookup import FeatureLookup
-from feature_store.entities.training_set import TrainingSet
-from feature_store.feature_table_client.feature_table_client import FeatureTableClient
-from feature_store.spark_client.spark_client import SparkClient
-from feature_store.training_set_client.training_set_client import TrainingSetClient
-from feature_store.utils.feature_utils import format_feature_lookups_and_functions
-
-
-class FeatureStoreClient:
-    """Unified feature store client providing full lifecycle management for features"""
-
-    def __init__(self, spark: SparkSession):
-        """
-        :param spark: an initialized SparkSession object
-        """
-        self._spark = spark
-        self._spark_client = SparkClient(spark)
-        self._feature_table_client = FeatureTableClient(spark)
-
-    def create_table(
-        self,
-        name: str,
-        primary_keys: Union[str, List[str]],
-        df: Optional[DataFrame] = None,
-        *,
-        timestamp_keys: Union[str, List[str], None] = None,
-        partition_columns: Union[str, List[str], None] = None,
-        schema: Optional[StructType] = None,
-        description: Optional[str] = None,
-        tags: Optional[Dict[str, str]] = None
-    ):
-        """
-        Create a feature table (supports batch and streaming writes)
-
-        Args:
-            name: full feature table name (format: <table>)
-            primary_keys: primary key column(s) (composite keys supported)
-            df: initial data (optional, used to infer the schema)
-            timestamp_keys: timestamp key(s) (for temporal features)
-            partition_columns: partition columns (to optimize storage and queries)
-            description: business description
-            tags: business tags
-
-        Returns:
-            a FeatureTable instance
-
-        Raises:
-            ValueError: when the schema does not match the data
-        """
-
-        return self._feature_table_client.create_table(
-            name=name,
-            primary_keys=primary_keys,
-            df=df,
-            timestamp_keys=timestamp_keys,
-            partition_columns=partition_columns,
-            schema=schema,
-            description=description,
-            tags=tags
-        )
-
-
-    def read_table(self, name: str) -> DataFrame:
-        """
-        Read data from a feature table
-
-        Args:
-            name: feature table name
-
-        Returns:
-            DataFrame: a DataFrame containing the feature table data
-        """
-        return self._feature_table_client.read_table(name)
-
-
-    def drop_table(self, name: str) -> None:
-        """
-        Drop a feature table
-
-        Args:
-            name: name of the feature table to drop
-
-        Returns:
-            None
-        """
-        return self._feature_table_client.drop_table(name)
-
-
-    def create_training_set(
-        self,
-        df: DataFrame,
-        feature_lookups: List[Union[FeatureLookup, FeatureFunction]],
-        label: Union[str, List[str], None],
-        exclude_columns: Optional[List[str]] = None,
-        **kwargs,
-    ) -> TrainingSet:
-
-        """
-        Create a training set
-
-        Args:
-            df: base data
-            feature_lookups: list of feature lookups
-            label: label column name(s)
-            exclude_columns: columns to exclude
-
-        Returns:
-            a TrainingSet instance
-        """
-
-        if exclude_columns is None:
-            exclude_columns = []
-
-        features = feature_lookups
-        del feature_lookups
-
-        features = format_feature_lookups_and_functions(self._spark_client, features)
-        # Create a TrainingSetClient instance
-        training_set_client = TrainingSetClient(self._spark_client)
-        return training_set_client.create_training_set_from_feature_lookups(
-            df=df,
-            feature_lookups=features,
-            label=label,
-            exclude_columns=exclude_columns,
-            **kwargs
-        )
-
-    def write_table(
-        self,
-        name: str,
-        df: DataFrame,
-        mode: str = APPEND,
-        checkpoint_location: Optional[str] = None,
-        trigger: Dict[str, Any] = DEFAULT_WRITE_STREAM_TRIGGER,
-    ) -> Optional[StreamingQuery]:
-
-        """
-        Write data to a feature table (supports batch and streaming)
-
-        Args:
-            name: feature table name
-            df: DataFrame with the data to write
-            mode: write mode (append by default)
-            checkpoint_location: checkpoint location for streaming writes (optional)
-            trigger: streaming trigger configuration (uses the library preset by default)
-
-        Returns:
-            a StreamingQuery object for streaming writes, otherwise None
-        """
-
-        return self._feature_table_client.write_table(
-            name=name,
-            df=df,
-            mode=mode,
-            checkpoint_location=checkpoint_location,
-            trigger=trigger,
-        )
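The client module itself is not gone; it re-appears as wedata/feature_store/client.py (8,100 bytes in the 0.1.6 RECORD above). A sketch of the same workflow against the relocated module, assuming FeatureStoreClient keeps the constructor and method signatures shown in the deleted file; table, column, and feature names are placeholders, and the FeatureLookup keyword arguments beyond table_name/lookup_key are assumptions:

    from pyspark.sql import SparkSession
    from wedata.feature_store.client import FeatureStoreClient
    from wedata.feature_store.entities.feature_lookup import FeatureLookup

    spark = SparkSession.builder.getOrCreate()
    fs = FeatureStoreClient(spark)

    # Create a feature table keyed by user_id from an existing DataFrame.
    fs.create_table(
        name="user_features",
        primary_keys="user_id",
        df=spark.table("staging_user_features"),
        description="Example table",
    )

    # Join features onto a label DataFrame to build a training set.
    training_set = fs.create_training_set(
        df=spark.table("raw_events"),
        feature_lookups=[
            FeatureLookup(
                table_name="user_features",
                lookup_key="user_id",
                feature_names=["age", "country"],  # assumed keyword
            )
        ],
        label="label",
    )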
feature_store/constants/__init__.py DELETED
File without changes
feature_store/constants/constants.py DELETED
@@ -1,28 +0,0 @@
-
-OVERWRITE = "overwrite"
-APPEND = "append"
-PATH = "path"
-TABLE = "table"
-CUSTOM = "custom"
-PREDICTION_COLUMN_NAME = "prediction"
-MODEL_DATA_PATH_ROOT = "feature_store"
-UTF8_BYTES_PER_CHAR = 4
-MAX_PRIMARY_KEY_STRING_LENGTH_CHARS = 100
-MAX_PRIMARY_KEY_STRING_LENGTH_BYTES = (
-    MAX_PRIMARY_KEY_STRING_LENGTH_CHARS * UTF8_BYTES_PER_CHAR
-)
-STREAMING_TRIGGER_CONTINUOUS = "continuous"
-STREAMING_TRIGGER_ONCE = "once"
-STREAMING_TRIGGER_PROCESSING_TIME = "processingTime"
-DEFAULT_WRITE_STREAM_TRIGGER = {STREAMING_TRIGGER_PROCESSING_TIME: "5 seconds"}
-_DEFAULT_PUBLISH_STREAM_TRIGGER = {STREAMING_TRIGGER_PROCESSING_TIME: "5 minutes"}
-
-
-_WARN = "WARN"
-_ERROR = "ERROR"
-_SOURCE_FORMAT_DELTA = "delta"
-
-_NO_RESULT_TYPE_PASSED = "NO_RESULT_TYPE"
-_USE_SPARK_NATIVE_JOIN = "use_spark_native_join"
-_PREBUILT_ENV_URI = "prebuilt_env_uri"
-
feature_store/entities/__init__.py DELETED
File without changes
feature_store/entities/column_info.py DELETED
@@ -1,117 +0,0 @@
-import copy
-from typing import Optional, Union
-
-from feature_store.entities.feature_column_info import FeatureColumnInfo
-from feature_store.entities.feature_spec_constants import SOURCE_DATA_COLUMN_INFO, FEATURE_COLUMN_INFO, \
-    ON_DEMAND_COLUMN_INFO
-from feature_store.entities.on_demand_column_info import OnDemandColumnInfo
-from feature_store.entities.source_data_column_info import SourceDataColumnInfo
-
-
-class ColumnInfo:
-    """
-    ColumnInfo's structure and properties are mapped 1:1 to the ColumnInfo proto message, unless specified otherwise.
-    """
-
-    def __init__(
-        self,
-        info: Union[SourceDataColumnInfo, FeatureColumnInfo, OnDemandColumnInfo],
-        include: bool,
-        data_type: Optional[str] = None,
-        topological_ordering: Optional[int] = None,
-    ):
-        if not isinstance(
-            info, (SourceDataColumnInfo, FeatureColumnInfo, OnDemandColumnInfo)
-        ):
-            raise ValueError(
-                "info must be one of SourceDataColumnInfo, FeatureColumnInfo, OnDemandColumnInfo."
-            )
-        self._info = info
-        self._include = include
-        self._data_type = data_type
-        self._topological_ordering = topological_ordering
-
-    @property
-    def info(
-        self,
-    ) -> Union[SourceDataColumnInfo, FeatureColumnInfo, OnDemandColumnInfo]:
-        return self._info
-
-    @property
-    def include(self) -> bool:
-        return self._include
-
-    @property
-    def data_type(self) -> Optional[str]:
-        """
-        FeatureSpecs before v7 are not required to have data types.
-        """
-        return self._data_type
-
-    @property
-    def topological_ordering(self) -> Optional[int]:
-        """
-        FeatureSpecs before v8 are not required to have topological ordering.
-        """
-        return self._topological_ordering
-
-    @property
-    def output_name(self) -> str:
-        """
-        This field does not exist in the proto, and is provided for convenience.
-        """
-        return self.info.output_name
-
-    def with_topological_ordering(self, ordering: int):
-        new_column_info = copy.copy(self)
-        new_column_info._topological_ordering = ordering
-        return new_column_info
-
-    @classmethod
-    def from_proto(cls, column_info_proto):
-        if column_info_proto.HasField(SOURCE_DATA_COLUMN_INFO):
-            info = SourceDataColumnInfo.from_proto(
-                column_info_proto.source_data_column_info
-            )
-        elif column_info_proto.HasField(FEATURE_COLUMN_INFO):
-            info = FeatureColumnInfo.from_proto(column_info_proto.feature_column_info)
-        elif column_info_proto.HasField(ON_DEMAND_COLUMN_INFO):
-            info = OnDemandColumnInfo.from_proto(
-                column_info_proto.on_demand_column_info
-            )
-        else:
-            raise ValueError("Unsupported info type: " + str(column_info_proto))
-
-        data_type = (
-            column_info_proto.data_type
-            if column_info_proto.HasField("data_type")
-            else None
-        )
-        topological_ordering = (
-            column_info_proto.topological_ordering
-            if column_info_proto.HasField("topological_ordering")
-            else None
-        )
-        return ColumnInfo(
-            info=info,
-            include=column_info_proto.include,
-            data_type=data_type,
-            topological_ordering=topological_ordering,
-        )
-
-    # def to_proto(self):
-    #     column_info = ProtoColumnInfo(
-    #         include=self.include,
-    #         data_type=self.data_type,
-    #         topological_ordering=self.topological_ordering,
-    #     )
-    #     if isinstance(self.info, SourceDataColumnInfo):
-    #         column_info.source_data_column_info.CopyFrom(self.info.to_proto())
-    #     elif isinstance(self.info, FeatureColumnInfo):
-    #         column_info.feature_column_info.CopyFrom(self.info.to_proto())
-    #     elif isinstance(self.info, OnDemandColumnInfo):
-    #         column_info.on_demand_column_info.CopyFrom(self.info.to_proto())
-    #     else:
-    #         raise ValueError("Unsupported info type: " + str(self.info))
-    #
-    #     return column_info
feature_store/entities/data_type.py DELETED
@@ -1,92 +0,0 @@
-import json
-import re
-from typing import Any
-
-from pyspark.sql.types import ArrayType, DataType, DecimalType, MapType, StructType
-
-
-
-class DataType(_ProtoEnumEntity):
-    """Online store types."""
-
-    INTEGER = ProtoDataType.Value("INTEGER")
-    FLOAT = ProtoDataType.Value("FLOAT")
-    BOOLEAN = ProtoDataType.Value("BOOLEAN")
-    STRING = ProtoDataType.Value("STRING")
-    DOUBLE = ProtoDataType.Value("DOUBLE")
-    LONG = ProtoDataType.Value("LONG")
-    TIMESTAMP = ProtoDataType.Value("TIMESTAMP")
-    DATE = ProtoDataType.Value("DATE")
-    SHORT = ProtoDataType.Value("SHORT")
-    ARRAY = ProtoDataType.Value("ARRAY")
-    MAP = ProtoDataType.Value("MAP")
-    BINARY = ProtoDataType.Value("BINARY")
-    DECIMAL = ProtoDataType.Value("DECIMAL")
-    STRUCT = ProtoDataType.Value("STRUCT")
-
-    _FIXED_DECIMAL = re.compile("decimal\\(\\s*(\\d+)\\s*,\\s*(\\d+)\\s*\\)")
-
-    @classmethod
-    def _enum_type(cls) -> Any:
-        return ProtoDataType
-
-    @classmethod
-    def from_spark_type(cls, spark_type):
-        return cls.from_string(spark_type.typeName())
-
-    @classmethod
-    def spark_type_to_string(cls, spark_type):
-        return DataType.to_string(DataType.from_spark_type(spark_type))
-
-    @classmethod
-    def top_level_type_supported(cls, spark_type: DataType) -> bool:
-        """
-        Checks whether the provided Spark data type is supported by Feature Store, only considering
-        the top-level type for nested data types.
-
-        Details on nested types:
-          ArrayType: The elementType is not checked. Will return True.
-          MapType: The keyType and valueType are not checked. Will return True.
-          StructType: The struct fieds are not checked. Will return True.
-        """
-        cls.init()
-        return spark_type.typeName().upper() in cls._STRING_TO_ENUM
-
-    @classmethod
-    def to_complex_spark_type(cls, json_value):
-        """
-        Constructs a complex Spark DataType from its compact JSON representation.
-
-        Examples:
-        - Input: '"decimal(1,2)"'
-          Output: DecimalType(1,2)
-        - Input: '{"containsNull":false,"elementType":"integer","type":"array"}'
-          Output: ArrayType(IntegerType,false)
-        - Input: '{"keyType":"integer","type":"map","valueContainsNull":True,"valueType":"integer"}'
-          Output: MapType(IntegerType,IntegerType,true)
-        """
-        if not json_value:
-            raise ValueError("Empty JSON value cannot be converted to Spark DataType")
-
-        json_data = json.loads(json_value)
-        if not isinstance(json_data, dict):
-            # DecimalType does not have fromJson() method
-            if json_value == "decimal":
-                return DecimalType()
-            if cls._FIXED_DECIMAL.match(json_data):
-                m = cls._FIXED_DECIMAL.match(json_data)
-                return DecimalType(int(m.group(1)), int(m.group(2)))
-
-        if json_data["type"].upper() == cls.to_string(cls.ARRAY):
-            return ArrayType.fromJson(json_data)
-
-        if json_data["type"].upper() == cls.to_string(cls.MAP):
-            return MapType.fromJson(json_data)
-
-        if json_data["type"].upper() == cls.to_string(cls.STRUCT):
-            return StructType.fromJson(json_data)
-
-        else:
-            raise ValueError(
-                f"Spark type {json_data['type']} cannot be converted to a complex Spark DataType"
-            )
feature_store/entities/environment_variables.py DELETED
@@ -1,55 +0,0 @@
-import os
-
-
-class _EnvironmentVariable:
-    """
-    Represents an environment variable for the feature store client for custom configurations as needed.
-    """
-
-    def __init__(self, name, type_, default):
-        self.name = name
-        self.type = type_
-        self.default = default
-
-    @property
-    def defined(self):
-        return self.name in os.environ
-
-    def get_raw(self):
-        return os.getenv(self.name)
-
-    def set(self, value):
-        os.environ[self.name] = str(value)
-
-    def unset(self):
-        os.environ.pop(self.name, None)
-
-    def get(self):
-        """
-        Reads the value of the environment variable if it exists and converts it to the desired
-        type. Otherwise, returns the default value.
-        """
-        if (val := self.get_raw()) is not None:
-            try:
-                return self.type(val)
-            except Exception as e:
-                raise ValueError(
-                    f"Failed to convert {val!r} to {self.type} for {self.name}: {e}"
-                )
-        return self.default
-
-    def __str__(self):
-        return f"{self.name} (default: {self.default}, type: {self.type.__name__})"
-
-    def __repr__(self):
-        return repr(self.name)
-
-    def __format__(self, format_spec: str) -> str:
-        return self.name.__format__(format_spec)
-
-
-# The threshold (in MB) where a broadcast join will be performed for the asof join for point in time feature join
-# Default is 20MB as benchmarks show diminishing returns with broadcast past this value. The default spark broadcast join threshold is 10MB
-BROADCAST_JOIN_THRESHOLD = _EnvironmentVariable(
-    "BROADCAST_JOIN_THRESHOLD", int, 20 * 1024 * 1024
-)
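The deleted helper above wraps typed reads of environment variables with a default (the broadcast join threshold for the point-in-time join defaults to 20 MB). A minimal, self-contained re-statement of that pattern, not the library's API:

    import os

    DEFAULT_BROADCAST_JOIN_THRESHOLD = 20 * 1024 * 1024  # 20 MB default, as above

    def get_broadcast_join_threshold() -> int:
        # Read the env var if set, convert to int, otherwise fall back to the default.
        raw = os.getenv("BROADCAST_JOIN_THRESHOLD")
        return int(raw) if raw is not None else DEFAULT_BROADCAST_JOIN_THRESHOLD

    os.environ["BROADCAST_JOIN_THRESHOLD"] = str(50 * 1024 * 1024)
    assert get_broadcast_join_threshold() == 50 * 1024 * 1024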
feature_store/entities/feature.py DELETED
@@ -1,53 +0,0 @@
-
-
-class Feature:
-    def __init__(
-        self,
-        feature_table,
-        feature_id,
-        name,
-        data_type,
-        description,
-        data_type_details=None,
-    ):
-        self._feature_table = feature_table
-        self._name = name
-        self._data_type = data_type
-        self._description = description
-        self._data_type_details = data_type_details
-        self._feature_id = feature_id
-
-    @property
-    def feature_table(self):
-        return self._feature_table
-
-    @property
-    def feature_id(self):
-        return self._feature_id
-
-    @property
-    def name(self):
-        return self._name
-
-    @property
-    def data_type(self):
-        return self._data_type
-
-    @property
-    def data_type_details(self):
-        return self._data_type_details
-
-    @property
-    def description(self):
-        return self._description
-
-    @classmethod
-    def from_proto(cls, feature_proto):
-        return cls(
-            feature_table=feature_proto.table,
-            feature_id=feature_proto.id,
-            name=feature_proto.name,
-            data_type=feature_proto.data_type,
-            data_type_details=feature_proto.data_type_details,
-            description=feature_proto.description,
-        )